From 854cd6916cc070530333d7320f57ed1ffcef2cb4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Constantin=20F=C3=BCrst?= Date: Wed, 29 Nov 2023 22:37:47 +0100 Subject: [PATCH] refrain from using a benchmarking script as this is too inflexible and a huge amount of work to create, manual mode it is ..., also create more concrete plan for benchmarks and the conclusions to draw from them in benchmark-plan.md (new) --- benchmarks/benchmark-plan.md | 34 ++++++++++++++++++++++++++++++++++ benchmarks/benchmarker.py | 11 ----------- 2 files changed, 34 insertions(+), 11 deletions(-) create mode 100644 benchmarks/benchmark-plan.md delete mode 100644 benchmarks/benchmarker.py diff --git a/benchmarks/benchmark-plan.md b/benchmarks/benchmark-plan.md new file mode 100644 index 0000000..26cea13 --- /dev/null +++ b/benchmarks/benchmark-plan.md @@ -0,0 +1,34 @@ +# peak performance +- measure ddr to ddr, intra-node +- measure ddr to hbm, intra-node +- measure ddr to ddr, inter-node +- measure ddr to hbm, inter-node +- measure ddr to ddr, inter-socket +- measure ddr to hbm, inter-socket +All for 1KiB, 4KiB, 1MiB, 1GiB +All for HW and also SW path +--> conclude how much overhead DSA engine has +--> conclude size after which using HW makes sense + this point is reached when submit overhead for + hw execution is smaller than entire copy time + for sw execution +# submit // done +- single submit-and-wait +- multi submit +- batch submit +All with both 1 and 4 engines per WQ +All for 1KiB, 4KiB, 1MiB, 1GiB but only ddr-ddr intra node +--> conclude which work submission strategy is best for which size +--> conclude whether multiple engines significantly improve batch perf +# MT submit +- multiple threads submit to the same WQ +- use 1,2,4,8,12 threads +All for 1KiB, 4KiB, 1MiB, 1GiB but only ddr-ddr intra node +All for 1 vs 4 engines +--> conclude how badly mt submit hurts performance +--> conclude whether multiple engines help mt submit +# cross copy // done +- compare which is faster: xcopy, 
copy from source node, copy from dst node +All for both inter-node and inter-socket copy using DDR and 1MiB on 4E +--> conclude where a copy thread should live + diff --git a/benchmarks/benchmarker.py b/benchmarks/benchmarker.py deleted file mode 100644 index d0741e3..0000000 --- a/benchmarks/benchmarker.py +++ /dev/null @@ -1,11 +0,0 @@ -def main(): - print("This is doing nothing!") - # test all for sizes 1KiB - 64kKiB / 64MiB increasing exponentially - # test ddr->ddr, ddr->hbm, hbm->hbm, hbm->ddr - # test for both-local, one-local, no-local engine - # test for single thread and multi thread on one engine - # test for single thread with other thread(s) working on disjoint set of node but possibly overlapping source/destination memory - - -if __name__ == "__main__": - main()