From 5578f06c80eb8db114e77ab965b2dc59ef484639 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Constantin=20F=C3=BCrst?= <c@fuersten.info>
Date: Tue, 16 Jan 2024 22:15:36 +0100
Subject: [PATCH] adapt copy policy function to take data size as well and use
 this to only use destination nodes dsa engine for small data sizes on xeonmax

---
 offloading-cacher/main.cpp | 42 ++++++++++++++++++++++++++++++--------
 1 file changed, 33 insertions(+), 9 deletions(-)
diff --git a/offloading-cacher/main.cpp b/offloading-cacher/main.cpp
index 443b00b..8193f5a 100644
--- a/offloading-cacher/main.cpp
+++ b/offloading-cacher/main.cpp
@@ -7,6 +7,8 @@
 
 #include "cache.hpp"
 
+static constexpr size_t SIZE_64_MIB = 64 * 1024 * 1024;
+
 dsacache::Cache CACHE;
 
 void InitCache(const std::string& device) {
@@ -15,25 +17,47 @@ void InitCache(const std::string& device) {
             return numa_dst_node;
         };
 
-        auto copy_policy = [](const int numa_dst_node, const int numa_src_node) {
-            return std::vector<int>{ numa_src_node, numa_dst_node };
+        auto copy_policy = [](const int numa_dst_node, const int numa_src_node, const size_t data_size) {
+            return std::vector<int>{ numa_dst_node };
         };
 
         CACHE.Init(cache_policy,copy_policy);
     }
     else if (device == "xeonmax") {
         auto cache_policy = [](const int numa_dst_node, const int numa_src_node, const size_t data_size) {
+            // xeon max is configured to have hbm on node ids that are +8
+            
             return numa_dst_node < 8 ? numa_dst_node + 8 : numa_dst_node;
         };
 
-        auto copy_policy = [](const int numa_dst_node, const int numa_src_node) {
-            const bool same_socket = ((numa_dst_node ^ numa_src_node) & 4) == 0;
-            if (same_socket) {
-                const bool socket_number = numa_dst_node >> 2;
-                if (socket_number == 0) return std::vector<int>{ 0, 1, 2, 3 };
-                else return std::vector<int>{ 4, 5, 6, 7 };
+        auto copy_policy = [](const int numa_dst_node, const int numa_src_node, const size_t data_size) {
+            if (data_size < SIZE_64_MIB) {
+                // if the data size is small then the copy will just be carried
+                // out by the destination node which does not require setting numa
+                // thread affinity as the selected dsa engine is already the one
+                // present on the calling thread
+
+                return std::vector<int>{ (numa_dst_node >= 8 ? numa_dst_node - 8 : numa_dst_node) };
+            }
+            else {
+                // for sufficiently large data, smart copy is used which will utilize
+                // all four engines for intra-socket copy operations and cross copy on
+                // the source and destination nodes for inter-socket copy
+
+                const bool same_socket = ((numa_dst_node ^ numa_src_node) & 4) == 0;
+
+                if (same_socket) {
+                    const bool socket_number = numa_dst_node >> 2;
+                    if (socket_number == 0) return std::vector<int>{ 0, 1, 2, 3 };
+                    else return std::vector<int>{ 4, 5, 6, 7 };
+                }
+                else {
+                    return std::vector<int>{
+                        (numa_src_node >= 8 ? numa_src_node - 8 : numa_src_node),
+                        (numa_dst_node >= 8 ? numa_dst_node - 8 : numa_dst_node)
+                    };
+                }
             }
-            else return std::vector<int>{ numa_src_node, numa_dst_node };
         };
 
         CACHE.Init(cache_policy,copy_policy);