From a963406f7c0bb6e8c7fecd35605f8f812b8d6850 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Constantin=20F=C3=BCrst?= Date: Tue, 30 Jan 2024 15:04:53 +0100 Subject: [PATCH] move mode selection to Configuration.hpp; adapt CopyMethodPolicy to return only src_node for task sizes under 16 MiB, which is now required to avoid a high submission count that slows down small copies --- qdp_project/src/Benchmark.cpp | 4 ---- qdp_project/src/Configuration.hpp | 6 ++++++ qdp_project/src/utils/BenchmarkHelpers.cpp | 10 ++++++++++ 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/qdp_project/src/Benchmark.cpp b/qdp_project/src/Benchmark.cpp index 98d05b7..de2de99 100644 --- a/qdp_project/src/Benchmark.cpp +++ b/qdp_project/src/Benchmark.cpp @@ -14,10 +14,6 @@ #include "../../offloading-cacher/cache.hpp" -#ifndef MODE_SET_BY_CMAKE -#define MODE_SIMPLE_PREFETCH -#endif - #include "Configuration.hpp" #include "BenchmarkHelpers.cpp" diff --git a/qdp_project/src/Configuration.hpp b/qdp_project/src/Configuration.hpp index 990fccf..7462cf1 100644 --- a/qdp_project/src/Configuration.hpp +++ b/qdp_project/src/Configuration.hpp @@ -1,5 +1,11 @@ #pragma once +#include "utils/memory_literals.h" + +#ifndef MODE_SET_BY_CMAKE +#define MODE_SIMPLE_PREFETCH +#endif + constexpr size_t WL_SIZE_B = 4_GiB; constexpr uint32_t WARMUP_ITERATION_COUNT = 5; constexpr uint32_t ITERATION_COUNT = 5; diff --git a/qdp_project/src/utils/BenchmarkHelpers.cpp b/qdp_project/src/utils/BenchmarkHelpers.cpp index 00727f5..4d000c7 100644 --- a/qdp_project/src/utils/BenchmarkHelpers.cpp +++ b/qdp_project/src/utils/BenchmarkHelpers.cpp @@ -2,6 +2,8 @@ #include "../Configuration.hpp" +#include "memory_literals.h" + constexpr size_t SCANA_TIMING_INDEX = 0; constexpr size_t SCANB_TIMING_INDEX = 1; constexpr size_t AGGRJ_TIMING_INDEX = 2; @@ -38,6 +40,14 @@ int CachePlacementPolicy(const int numa_dst_node, const int numa_src_node, const } std::vector CopyMethodPolicy(const int numa_dst_node, const int 
numa_src_node, const size_t data_size) { + // for small data it is more efficient to run on only one node + // which causes less submissions and therefore completes faster + // as submission cost matters for low transfer size + + if (data_size < 16_MiB) { + return std::vector{ (numa_src_node >= 8 ? numa_src_node - 8 : numa_src_node) }; + } + // for sufficiently large data, smart copy is used which will utilize // all four engines for intra-socket copy operations and cross copy on // the source and destination nodes for inter-socket copy