Browse Source

Move mode selection to Configuration.hpp and adapt the CopyMethodPolicy function to return only src_node for task sizes under 16 MiB. This is now required to avoid a high submission count, which slows down small copies.

master
Constantin Fürst 11 months ago
parent
commit
a963406f7c
  1. 4
      qdp_project/src/Benchmark.cpp
  2. 6
      qdp_project/src/Configuration.hpp
  3. 10
      qdp_project/src/utils/BenchmarkHelpers.cpp

4
qdp_project/src/Benchmark.cpp

@ -14,10 +14,6 @@
#include "../../offloading-cacher/cache.hpp" #include "../../offloading-cacher/cache.hpp"
#ifndef MODE_SET_BY_CMAKE
#define MODE_SIMPLE_PREFETCH
#endif
#include "Configuration.hpp" #include "Configuration.hpp"
#include "BenchmarkHelpers.cpp" #include "BenchmarkHelpers.cpp"

6
qdp_project/src/Configuration.hpp

@ -1,5 +1,11 @@
#pragma once #pragma once
#include "utils/memory_literals.h"
#ifndef MODE_SET_BY_CMAKE
#define MODE_SIMPLE_PREFETCH
#endif
constexpr size_t WL_SIZE_B = 4_GiB; constexpr size_t WL_SIZE_B = 4_GiB;
constexpr uint32_t WARMUP_ITERATION_COUNT = 5; constexpr uint32_t WARMUP_ITERATION_COUNT = 5;
constexpr uint32_t ITERATION_COUNT = 5; constexpr uint32_t ITERATION_COUNT = 5;

10
qdp_project/src/utils/BenchmarkHelpers.cpp

@ -2,6 +2,8 @@
#include "../Configuration.hpp" #include "../Configuration.hpp"
#include "memory_literals.h"
constexpr size_t SCANA_TIMING_INDEX = 0; constexpr size_t SCANA_TIMING_INDEX = 0;
constexpr size_t SCANB_TIMING_INDEX = 1; constexpr size_t SCANB_TIMING_INDEX = 1;
constexpr size_t AGGRJ_TIMING_INDEX = 2; constexpr size_t AGGRJ_TIMING_INDEX = 2;
@ -38,6 +40,14 @@ int CachePlacementPolicy(const int numa_dst_node, const int numa_src_node, const
} }
std::vector<int> CopyMethodPolicy(const int numa_dst_node, const int numa_src_node, const size_t data_size) { std::vector<int> CopyMethodPolicy(const int numa_dst_node, const int numa_src_node, const size_t data_size) {
// for small data it is more efficient to run on only one node,
// which causes fewer submissions and therefore completes faster,
// as submission cost matters for small transfer sizes
if (data_size < 16_MiB) {
return std::vector<int>{ (numa_src_node >= 8 ? numa_src_node - 8 : numa_src_node) };
}
// for sufficiently large data, smart copy is used which will utilize // for sufficiently large data, smart copy is used which will utilize
// all four engines for intra-socket copy operations and cross copy on // all four engines for intra-socket copy operations and cross copy on
// the source and destination nodes for inter-socket copy // the source and destination nodes for inter-socket copy

Loading…
Cancel
Save