|
|
@ -32,30 +32,20 @@ int CachePlacementPolicy(const int numa_dst_node, const int numa_src_node, const |
|
|
|
} |
|
|
|
|
|
|
|
std::vector<int> CopyMethodPolicy(const int numa_dst_node, const int numa_src_node, const size_t data_size) { |
|
|
|
// we always run on n0 and can cut the amount of code here therefore
|
|
|
|
// for small data it is more efficient to run on only one node
|
|
|
|
// which causes less submissions and therefore completes faster
|
|
|
|
// as submission cost matters for low transfer size
|
|
|
|
|
|
|
|
if (data_size < 16_MiB) { |
|
|
|
return std::vector<int>{ (numa_src_node >= 8 ? numa_src_node - 8 : numa_src_node) }; |
|
|
|
} |
|
|
|
|
|
|
|
// for sufficiently large data, smart copy is used which will utilize
|
|
|
|
// all four engines for intra-socket copy operations and cross copy on
|
|
|
|
// the source and destination nodes for inter-socket copy
|
|
|
|
|
|
|
|
const bool same_socket = ((numa_dst_node ^ numa_src_node) & 4) == 0; |
|
|
|
|
|
|
|
if (same_socket) { |
|
|
|
const bool socket_number = numa_dst_node >> 2; |
|
|
|
if (socket_number == 0) return std::vector<int>{ 0, 1, 2, 3 }; |
|
|
|
else return std::vector<int>{ 4, 5, 6, 7 }; |
|
|
|
static std::atomic<int> last_node = 0; |
|
|
|
const int node = last_node.fetch_add(1) % 4; |
|
|
|
return std::vector<int>{ node }; |
|
|
|
} |
|
|
|
else { |
|
|
|
return std::vector<int>{ |
|
|
|
(numa_src_node >= 8 ? numa_src_node - 8 : numa_src_node), |
|
|
|
(numa_dst_node >= 8 ? numa_dst_node - 8 : numa_dst_node) |
|
|
|
}; |
|
|
|
static std::atomic<int> last_group = 0; |
|
|
|
const int group = last_group.fetch_add(1) % 2; |
|
|
|
return group == 0 ? std::vector<int>{ 0, 1 } : std::vector<int>{ 2, 3 }; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|