From 7e48c968285a67162c325d5e15892f53e4225fc1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Constantin=20F=C3=BCrst?= <c@fuersten.info>
Date: Wed, 17 Jan 2024 21:32:23 +0100
Subject: [PATCH] distribute multiple per-node locks for the cache state and
 allocate them on the respective numa node for fastest possible access

---
 offloading-cacher/cache.hpp | 80 ++++++++++++++++++++++---------------
 1 file changed, 47 insertions(+), 33 deletions(-)
diff --git a/offloading-cacher/cache.hpp b/offloading-cacher/cache.hpp
index 058a1e1..9a198e2 100644
--- a/offloading-cacher/cache.hpp
+++ b/offloading-cacher/cache.hpp
@@ -196,12 +196,16 @@ namespace dsacache {
     private:
         // mutex for accessing the cache state map
 
-        std::shared_mutex cache_mutex_;
 
         // map from [dst-numa-node,map2]
         // map2 from [data-ptr,cache-structure]
 
-        std::unordered_map<uint8_t, std::unordered_map<uint8_t*, CacheData>> cache_state_;
+        struct LockedNodeCacheState {
+            std::shared_mutex cache_mutex_;
+            std::unordered_map<uint8_t*, CacheData> node_cache_state_;
+        };
+
+        std::unordered_map<uint8_t, LockedNodeCacheState*> cache_state_;
 
         CachePolicy* cache_policy_function_ = nullptr;
         CopyPolicy* copy_policy_function_ = nullptr;
@@ -237,6 +241,7 @@ namespace dsacache {
         std::unique_ptr<CacheData> GetFromCache(uint8_t* src, const size_t size, const int dst_node);
 
     public:
+        ~Cache();
         Cache() = default;
         Cache(const Cache& other) = delete;
 
@@ -265,11 +270,10 @@ namespace dsacache {
 }
 
 inline void dsacache::Cache::Clear() {
-    std::unique_lock<std::shared_mutex> lock(cache_mutex_);
-
-    cache_state_.clear();
-
-    Init(cache_policy_function_, copy_policy_function_);
+    for (auto& nc : cache_state_) {
+        std::unique_lock<std::shared_mutex> lock(nc.second->cache_mutex_);
+        nc.second->node_cache_state_.clear();
+    }
 }
 
 inline void dsacache::Cache::Init(CachePolicy* cache_policy_function, CopyPolicy* copy_policy_function) {
@@ -292,7 +296,9 @@ inline void dsacache::Cache::Init(CachePolicy* cache_policy_function, CopyPolicy
 
     for (int node = 0; node < nodes_max; node++) {
         if (numa_bitmask_isbitset(valid_nodes, node)) {
-            cache_state_.insert({node,{}});
+            void* block = numa_alloc_onnode(sizeof(LockedNodeCacheState), node);
+            auto* state = new(block)LockedNodeCacheState;
+            cache_state_.insert({node,state});
         }
     }
 }
@@ -323,9 +329,11 @@ inline std::unique_ptr<dsacache::CacheData> dsacache::Cache::Access(uint8_t* dat
     task = std::make_unique<CacheData>(data, size);
 
     {
-        std::unique_lock<std::shared_mutex> lock(cache_mutex_);
+        LockedNodeCacheState* local_cache_state = cache_state_[dst_node];
+
+        std::unique_lock<std::shared_mutex> lock(local_cache_state->cache_mutex_);
 
-        const auto state = cache_state_[dst_node].emplace(task->src_, *task);
+        const auto state = local_cache_state->node_cache_state_.emplace(task->src_, *task);
 
         // if state.second is false then no insertion took place
         // which means that concurrently whith this thread
@@ -488,22 +496,19 @@ inline void dsacache::Cache::Flush(const int node) {
         }
     };
 
-    {
-        // we require exclusive lock as we modify the cache state
+    // we require exclusive lock as we modify the cache state
+    // node == -1 means that cache on all nodes should be flushed
 
-        std::unique_lock<std::shared_mutex> lock(cache_mutex_);
-
-        // node == -1 means that cache on all nodes should be flushed
-
-        if (node == -1) {
-            for (auto& nc : cache_state_) {
-                FlushNode(nc.second);
-            }
-        }
-        else {
-            FlushNode(cache_state_[node]);
+    if (node == -1) {
+        for (auto& nc : cache_state_) {
+            std::unique_lock<std::shared_mutex> lock(nc.second->cache_mutex_);
+            FlushNode(nc.second->node_cache_state_);
         }
     }
+    else {
+        std::unique_lock<std::shared_mutex> lock(cache_state_[node]->cache_mutex_);
+        FlushNode(cache_state_[node]->node_cache_state_);
+    }
 }
 
 inline std::unique_ptr<dsacache::CacheData> dsacache::Cache::GetFromCache(uint8_t* src, const size_t size, const int dst_node) {
@@ -513,17 +518,19 @@ inline std::unique_ptr<dsacache::CacheData> dsacache::Cache::GetFromCache(uint8_
     // from marking the element we may find as unused and
     // clearing it
 
+    LockedNodeCacheState* local_cache_state = cache_state_[dst_node];
+
     // lock the cache state in shared-mode because we read
 
-    std::shared_lock<std::shared_mutex> lock(cache_mutex_);
+    std::shared_lock<std::shared_mutex> lock(local_cache_state->cache_mutex_);
 
     // search for the data in our cache state structure at the given node
 
-    const auto search = cache_state_[dst_node].find(src);
+    const auto search = local_cache_state->node_cache_state_.find(src);
 
     // if the data is in our structure we continue
 
-    if (search != cache_state_[dst_node].end()) {
+    if (search != local_cache_state->node_cache_state_.end()) {
 
         // now check whether the sizes match
 
@@ -538,7 +545,7 @@ inline std::unique_ptr<dsacache::CacheData> dsacache::Cache::GetFromCache(uint8_
             // which will cause its deletion only after the last possible outside
             // reference is also destroyed
 
-            cache_state_[dst_node].erase(search);
+            local_cache_state->node_cache_state_.erase(search);
         }
     }
 
@@ -547,26 +554,33 @@ inline std::unique_ptr<dsacache::CacheData> dsacache::Cache::GetFromCache(uint8_
 
 void dsacache::Cache::Invalidate(uint8_t* data) {
     // as the cache is modified we must obtain a unique writers lock
-
-    std::unique_lock<std::shared_mutex> lock(cache_mutex_);
-
     // loop through all per-node-caches available
 
     for (auto node : cache_state_) {
+        std::unique_lock<std::shared_mutex> lock(node.second->cache_mutex_);
+
         // search for an entry for the given data pointer
 
-        auto search = node.second.find(data);
+        auto search = node.second->node_cache_state_.find(data);
 
-        if (search != node.second.end()) {
+        if (search != node.second->node_cache_state_.end()) {
             // if the data is represented in-cache
             // then it will be erased to re-trigger
             // caching on next access
 
-            node.second.erase(search);
+            node.second->node_cache_state_.erase(search);
         }
     }
 }
 
+inline dsacache::Cache::~Cache() {
+    for (auto node : cache_state_) {
+        std::unique_lock<std::shared_mutex> lock(node.second->cache_mutex_);
+        node.second->~LockedNodeCacheState();
+        numa_free(reinterpret_cast<void*>(node.second), sizeof(LockedNodeCacheState));
+    }
+}
+
 inline dsacache::CacheData::CacheData(uint8_t* data, const size_t size) {
     src_ = data;
     size_ = size;