@ -2,6 +2,8 @@
# include "../Configuration.hpp"
# include "memory_literals.h"
// Fixed positions 0, 1 and 2 identifying the three timed phases.
// NOTE(review): presumably these index into a per-phase timing container
// (scan A, scan B, aggregate/join) -- confirm against the code that uses them.
constexpr size_t SCANA_TIMING_INDEX{0};
constexpr size_t SCANB_TIMING_INDEX{1};
constexpr size_t AGGRJ_TIMING_INDEX{2};
@ -38,6 +40,14 @@ int CachePlacementPolicy(const int numa_dst_node, const int numa_src_node, const
}
std : : vector < int > CopyMethodPolicy ( const int numa_dst_node , const int numa_src_node , const size_t data_size ) {
// for small data it is more efficient to run on only one node:
// this causes fewer submissions and therefore completes faster,
// as submission cost matters for low transfer sizes
if ( data_size < 16 _MiB ) {
return std : : vector < int > { ( numa_src_node > = 8 ? numa_src_node - 8 : numa_src_node ) } ;
}
// for sufficiently large data, smart copy is used which will utilize
// all four engines for intra-socket copy operations and cross copy on
// the source and destination nodes for inter-socket copy