From f91cd3202fb28669f9642bc31aef19773c372c9d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Constantin=20F=C3=BCrst?= Date: Tue, 9 Jan 2024 17:28:35 +0100 Subject: [PATCH 01/29] add project 'offloading cacher' and function declarations for the cacher in its header file --- .gitignore | 2 + offloading-cacher/CMakeLists.txt | 19 ++ offloading-cacher/benchmark.hpp | 174 ++++++++++++++++++ .../cmake/modules/FindNUMA.cmake | 43 +++++ offloading-cacher/main.cpp | 42 +++++ offloading-cacher/offloading-cache.hpp | 84 +++++++++ offloading-cacher/util/dml-helper.hpp | 26 +++ 7 files changed, 390 insertions(+) create mode 100755 offloading-cacher/CMakeLists.txt create mode 100644 offloading-cacher/benchmark.hpp create mode 100644 offloading-cacher/cmake/modules/FindNUMA.cmake create mode 100644 offloading-cacher/main.cpp create mode 100644 offloading-cacher/offloading-cache.hpp create mode 100644 offloading-cacher/util/dml-helper.hpp diff --git a/.gitignore b/.gitignore index ab3553e..55c6836 100644 --- a/.gitignore +++ b/.gitignore @@ -13,6 +13,8 @@ *.fls */.vscode/* +*/.idea/* +*/cmake-build-*/* # ---> C++ # Prerequisites diff --git a/offloading-cacher/CMakeLists.txt b/offloading-cacher/CMakeLists.txt new file mode 100755 index 0000000..7b4844a --- /dev/null +++ b/offloading-cacher/CMakeLists.txt @@ -0,0 +1,19 @@ +cmake_minimum_required(VERSION 3.18) + +project(offloading-cacher) + +set(CMAKE_CXX_STANDARD 20) + +list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake/modules") + +find_package(NUMA REQUIRED) + +set(DML_SOURCE_DIR "../../DML/include/") +set(SOURCES main.cpp) + +add_executable(offloading-cacher ${SOURCES}) + +target_include_directories(offloading-cacher PRIVATE ${CMAKE_SOURCE_DIR} ${NUMA_INCLUDE_DIRS} ${DML_SOURCE_DIR}) +target_link_libraries(offloading-cacher PRIVATE libdml.a pthread ${CMAKE_DL_LIBS} ${NUMA_LIBRARY}) + +install(TARGETS offloading-cacher DESTINATION ${CMAKE_INSTALL_PREFIX}) diff --git a/offloading-cacher/benchmark.hpp b/offloading-cacher/benchmark.hpp new file mode 100644 index 0000000..550efc2 --- /dev/null +++ b/offloading-cacher/benchmark.hpp @@ -0,0 +1,174 @@ +#pragma once + +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include "util/barrier.hpp" +#include "util/dml-helper.hpp" +#include "util/task-data.hpp" + +#define LOG_CODE_INFO "Location: " << __FILE__ << "@" << __LINE__ << "::" << __FUNCTION__ << std::endl +#define LOG_ERR { pthread_t t = pthread_self(); std::cerr << "--- BEGIN ERROR MSG ---" << std::endl << "Physical: [Node " << args->numa_node << " | Thread " << t << "]" << std::endl; } std::cerr << LOG_CODE_INFO +#define CHECK_STATUS(status,msg) { if (status != dml::status_code::ok) { LOG_ERR << "Status Code: " << StatusCodeToString(status) << std::endl << msg << std::endl; args->status = status; return nullptr; }} + +#define ADD_TIMING_MESSUREMENT { if (i >= 5) { args->submit_duration.emplace_back(std::chrono::duration_cast(se - st).count()); args->complete_duration.emplace_back(std::chrono::duration_cast(et - se).count()); args->combined_duration.emplace_back(std::chrono::duration_cast(et - st).count());}} + +template +void* thread_function(void* argp) { + TaskData* args = reinterpret_cast(argp); + + // set numa node and core affinity of the current thread + numa_run_on_node(args->numa_node); + + // allocate memory for the move operation on the requested numa nodes + void* src = numa_alloc_onnode(args->size, args->nnode_src); + void* dst = numa_alloc_onnode(args->size, args->nnode_dst); + dml::data_view srcv = 
dml::make_view(reinterpret_cast(src), args->size); + dml::data_view dstv = dml::make_view(reinterpret_cast(dst), args->size); + + std::memset(src, 0, args->size); + std::memset(dst, 0, args->size); + + args->status = dml::status_code::ok; + args->rep_completed = 0; + + std::chrono::time_point tps; + + // we add 5 as the first 5 iterations will not be meassured + // to remove exceptional values encountered during warmup + for (uint32_t i = 0; i < args->rep_count + 5; i++) { + // synchronize the start of each iteration + // using the barrier structure + args->barrier_->wait(); + + if (args->batch_submit) { + const auto st = std::chrono::steady_clock::now(); + + auto sequence = dml::sequence(args->batch_size, std::allocator()); + + for (uint32_t j = 0; j < args->batch_size; j++) { + // block_on_fault() is required to submit the task in a way so that the + // DSA engine can handle page faults itself together with the IOMMU which + // requires the WQ to be configured to allow this too + + const auto status = sequence.add(dml::mem_copy.block_on_fault(), srcv, dstv); + CHECK_STATUS(status, "Adding operation to batch failed!"); + } + + // we use the asynchronous submit-routine even though this is not required + // here, however the project later on will only use async operation and + // therefore this behaviour should be benchmarked + + auto handler = dml::submit(dml::batch, sequence); + + const auto se = std::chrono::steady_clock::now(); + + auto result = handler.get(); + + const auto et = std::chrono::steady_clock::now(); + + const dml::status_code status = result.status; + CHECK_STATUS(status, "Batch completed with an Error!"); + + ADD_TIMING_MESSUREMENT; + } + else if (args->batch_size > 1) { + // implementation for non-batched batch submit follows here + // this means we submit a bunch of work as single descriptors + // but then dont wait for the completion immediately + + std::vector>> handlers; + + const auto st = std::chrono::steady_clock::now(); + + for (uint32_t j = 0; j < args->batch_size; j++) { + // block_on_fault() is required to submit the task in a way so that the + // DSA engine can handle page faults itself together with the IOMMU which + // requires the WQ to be configured to allow this too + + handlers.emplace_back(dml::submit(dml::mem_copy.block_on_fault(), srcv, dstv)); + } + + const auto se = std::chrono::steady_clock::now(); + + for (auto& handler : handlers) { + auto result = handler.get(); + const dml::status_code status = result.status; + CHECK_STATUS(status, "Operation completed with an Error!"); + } + + const auto et = std::chrono::steady_clock::now(); + + ADD_TIMING_MESSUREMENT; + } + else { + const auto st = std::chrono::steady_clock::now(); + + // we use the asynchronous submit-routine even though this is not required + // here, however the project later on will only use async operation and + // therefore this behaviour should be benchmarked + // block_on_fault() is required to submit the task in a way so that the + // DSA engine can handle page faults itself together with the IOMMU which + // requires the WQ to be configured to allow this too + auto handler = dml::submit(dml::mem_copy.block_on_fault(), srcv, dstv); + + const auto se = std::chrono::steady_clock::now(); + + auto result = handler.get(); + + const auto et = std::chrono::steady_clock::now(); + + const dml::status_code status = result.status; + CHECK_STATUS(status, "Operation completed with an Error!"); + + ADD_TIMING_MESSUREMENT; + } + + // again: we do not count the first 5 repetitions + if (i == 5) 
tps = std::chrono::steady_clock::now(); + if (i >= 5) args->rep_completed++; + } + + const auto tpe = std::chrono::steady_clock::now(); + + args->total_time = std::chrono::duration_cast(tpe - tps).count(); + + // free the allocated memory regions on the selected nodes + numa_free(src, args->size); + numa_free(dst, args->size); + + return nullptr; +} + +template +void execute_dml_memcpy(std::vector& args) { + barrier task_barrier(args.size()); + std::vector threads; + + // initialize numa library + numa_available(); + + // for each submitted task we link the semaphore + // and create the thread, passing the argument + for (auto& arg : args) { + arg.barrier_ = &task_barrier; + threads.emplace_back(); + + if (pthread_create(&threads.back(), nullptr, thread_function, &arg) != 0) { + std::cerr << "Error creating thread" << std::endl; + exit(1); + } + } + + for (pthread_t& t : threads) { + pthread_join(t, nullptr); + } +} \ No newline at end of file diff --git a/offloading-cacher/cmake/modules/FindNUMA.cmake b/offloading-cacher/cmake/modules/FindNUMA.cmake new file mode 100644 index 0000000..94b23c8 --- /dev/null +++ b/offloading-cacher/cmake/modules/FindNUMA.cmake @@ -0,0 +1,43 @@ +# Module for locating libnuma +# +# Read-only variables: +# NUMA_FOUND +# Indicates that the library has been found. +# +# NUMA_INCLUDE_DIR +# Points to the libnuma include directory. +# +# NUMA_LIBRARY_DIR +# Points to the directory that contains the libraries. +# The content of this variable can be passed to link_directories. +# +# NUMA_LIBRARY +# Points to the libnuma that can be passed to target_link_libararies. +# +# Copyright (c) 2013-2020 MulticoreWare, Inc + +include(FindPackageHandleStandardArgs) + +find_path(NUMA_ROOT_DIR + NAMES include/numa.h + PATHS ENV NUMA_ROOT + DOC "NUMA root directory") + +find_path(NUMA_INCLUDE_DIR + NAMES numa.h + HINTS ${NUMA_ROOT_DIR} + PATH_SUFFIXES include + DOC "NUMA include directory") + +find_library(NUMA_LIBRARY + NAMES numa + HINTS ${NUMA_ROOT_DIR} + DOC "NUMA library") + +if (NUMA_LIBRARY) + get_filename_component(NUMA_LIBRARY_DIR ${NUMA_LIBRARY} PATH) +endif() + +mark_as_advanced(NUMA_INCLUDE_DIR NUMA_LIBRARY_DIR NUMA_LIBRARY) + +find_package_handle_standard_args(NUMA REQUIRED_VARS NUMA_ROOT_DIR NUMA_INCLUDE_DIR NUMA_LIBRARY) \ No newline at end of file diff --git a/offloading-cacher/main.cpp b/offloading-cacher/main.cpp new file mode 100644 index 0000000..f49f1f1 --- /dev/null +++ b/offloading-cacher/main.cpp @@ -0,0 +1,42 @@ +#include + +#include +#include +#include + +#include "benchmark.hpp" + +int main(int argc, char **argv) { + if (argc < 3) { + std::cout << "Missing input and output file names." << std::endl; + std::cout << "Usage: ./benchmarks [input.json] [output.json]" << std::endl; + return 1; + } + + const std::string input = argv[1]; + const std::string output = argv[2]; + + std::string path; + std::vector args; + + std::ifstream is(input); + ReadWorkDescription(args, path, is); + is.close(); + + if (path == "hw") { + execute_dml_memcpy(args); + } + else if (path == "sw") { + execute_dml_memcpy(args); + } + else if (path == "auto") { + execute_dml_memcpy(args); + } + else { + std::cerr << "Path is neither hw/sw/auto." 
<< std::endl; + } + + std::ofstream os(output); + WriteResultLog(args, path, os); + os.close(); +} diff --git a/offloading-cacher/offloading-cache.hpp b/offloading-cacher/offloading-cache.hpp new file mode 100644 index 0000000..613d498 --- /dev/null +++ b/offloading-cacher/offloading-cache.hpp @@ -0,0 +1,84 @@ +#pragma once + +#include +#include +#include + +#include + +#include + +namespace offcache { + // the cache task structure will be used to submit and + // control a cache element, while providing source pointer + // and size in bytes for submission + // + // then the submitting thread may wait on the atomic "result" + // which will be notified by the cache worker upon processing + // after which the atomic-bool-ptr active will also become valid + // + // the data pointed to by result and the bool-ptr are guaranteed + // to remain valid until the value pointed to by active is changed + // to false, after which the worker may clean up and delete the + // structure - carefull, do not call delete on this, the worker does + struct CacheTask { + uint8_t* data_; + size_t size_; + std::atomic result_ { nullptr }; + std::atomic* active_; + }; + + // worker class, one for each numa node + // discovers its node configuration on startup + // and keeps track of available memory + class CacheWorker { + private: + uint8_t numa_node_ = 0; + + std::unordered_map cache_info_; + + public: + // this is the mailbox of the worker to which a new task + // may be submitted by exchanging nullptr with a valid one + // and notifying on the atomic after which ownership + // of the CacheTask structure is transferred to the worker + std::atomic* task_slot_ = nullptr; + + static void run(CacheWorker* this_, const uint8_t numa_node); + }; + + // singleton which holds the cache workers + // and is the place where work will be submited + class CacheCoordinator { + public: + // cache policy is defined as a type here to allow flexible usage of the cacher + // given a numa destination node (where the data will be needed), the numa source + // node (current location of the data) and the data size, this function should + // return optimal cache placement + // dst node and returned value can differ if the system, for example, has HBM + // attached accessible directly to node n under a different node id m + typedef uint8_t (CachePolicy)(const uint8_t numa_dst_node, const uint8_t numa_src_node, const size_t data_size); + + // copy policy specifies the copy-executing nodes for a given task + // which allows flexibility in assignment for optimizing raw throughput + // or choosing a conservative usage policy + typedef std::vector (CopyPolicy)(const uint8_t numa_dst_node, const uint8_t numa_src_node); + + enum class ExecutionPolicy { + Immediate, Relaxed, NoCache + }; + + private: + CachePolicy* cache_policy_function_ = nullptr; + CopyPolicy* copy_policy_function_ = nullptr; + + public: + void Init(CachePolicy* cache_policy_function, CopyPolicy* copy_policy_function); + + // submits the given task and takes ownership of the pointer + void SubmitTask(CacheTask* task, const ExecutionPolicy policy) const; + + static void WaitOnCompletion(CacheTask* task); + static void SignalDataUnused(CacheTask* task); + }; +} \ No newline at end of file diff --git a/offloading-cacher/util/dml-helper.hpp b/offloading-cacher/util/dml-helper.hpp new file mode 100644 index 0000000..1686fd1 --- /dev/null +++ b/offloading-cacher/util/dml-helper.hpp @@ -0,0 +1,26 @@ +#pragma once + +#include + +inline const std::string StatusCodeToString(const 
dml::status_code code) { + switch(code) { + case dml::status_code::ok: return "ok"; + case dml::status_code::false_predicate: return "false predicate"; + case dml::status_code::partial_completion: return "partial completion"; + case dml::status_code::nullptr_error: return "nullptr error"; + case dml::status_code::bad_size: return "bad size"; + case dml::status_code::bad_length: return "bad length"; + case dml::status_code::inconsistent_size: return "inconsistent size"; + case dml::status_code::dualcast_bad_padding: return "dualcast bad padding"; + case dml::status_code::bad_alignment: return "bad alignment"; + case dml::status_code::buffers_overlapping: return "buffers overlapping"; + case dml::status_code::delta_delta_empty: return "delta delta empty"; + case dml::status_code::batch_overflow: return "batch overflow"; + case dml::status_code::execution_failed: return "execution failed"; + case dml::status_code::unsupported_operation: return "unsupported operation"; + case dml::status_code::queue_busy: return "queue busy"; + case dml::status_code::error: return "unknown error"; + case dml::status_code::config_error: return "config error"; + default: return "unhandled error"; + } +} \ No newline at end of file From 623366433bfca316ab932fddd0c0cfcfff2c6f77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Constantin=20F=C3=BCrst?= Date: Tue, 9 Jan 2024 18:18:11 +0100 Subject: [PATCH 02/29] continue modifying the declarations for the cacher and providing some first definitions --- offloading-cacher/offloading-cache.hpp | 101 ++++++++++++++++++++----- 1 file changed, 82 insertions(+), 19 deletions(-) diff --git a/offloading-cacher/offloading-cache.hpp b/offloading-cacher/offloading-cache.hpp index 613d498..9c2967a 100644 --- a/offloading-cacher/offloading-cache.hpp +++ b/offloading-cacher/offloading-cache.hpp @@ -2,13 +2,39 @@ #include #include +#include #include +#include + #include #include namespace offcache { + // execution policy selects in which way the data is supposed to be cached + // and returned with the following behaviour is guaranteed in addition to the + // returned value being valid: + // Immediate: return as fast as possible + // may return cached data, can return data in RAM + // will trigger caching of the data provided + // ImmediateNoCache: return as fast as possible and never trigger caching + // same as Immediate but will not trigger caching + // Relaxed: no rapid return needed, take time + // will trigger caching and may only return + // once the caching is successful but can still + // provide data in RAM + enum class ExecutionPolicy { + Relaxed, Immediate, ImmediateNoCache + }; + + struct WorkerTask { + uint8_t* src_; + uint8_t* dst_; + size_t size_; + std::atomic completed_ { false }; + }; + // the cache task structure will be used to submit and // control a cache element, while providing source pointer // and size in bytes for submission @@ -16,35 +42,29 @@ namespace offcache { // then the submitting thread may wait on the atomic "result" // which will be notified by the cache worker upon processing // after which the atomic-bool-ptr active will also become valid - // - // the data pointed to by result and the bool-ptr are guaranteed - // to remain valid until the value pointed to by active is changed - // to false, after which the worker may clean up and delete the - // structure - carefull, do not call delete on this, the worker does struct CacheTask { uint8_t* data_; size_t size_; - std::atomic result_ { nullptr }; - std::atomic* active_; + ExecutionPolicy policy_; + 
uint8_t* result_; + std::atomic active_; + std::vector sub_tasks_; }; // worker class, one for each numa node // discovers its node configuration on startup // and keeps track of available memory class CacheWorker { - private: + public: uint8_t numa_node_ = 0; - std::unordered_map cache_info_; - - public: // this is the mailbox of the worker to which a new task // may be submitted by exchanging nullptr with a valid one // and notifying on the atomic after which ownership // of the CacheTask structure is transferred to the worker - std::atomic* task_slot_ = nullptr; + std::atomic* task_slot_ = nullptr; - static void run(CacheWorker* this_, const uint8_t numa_node); + static void run(CacheWorker* this_); }; // singleton which holds the cache workers @@ -64,11 +84,11 @@ namespace offcache { // or choosing a conservative usage policy typedef std::vector (CopyPolicy)(const uint8_t numa_dst_node, const uint8_t numa_src_node); - enum class ExecutionPolicy { - Immediate, Relaxed, NoCache - }; - private: + std::unordered_map workers_; + + std::unordered_map cache_state_; + CachePolicy* cache_policy_function_ = nullptr; CopyPolicy* copy_policy_function_ = nullptr; @@ -78,7 +98,50 @@ namespace offcache { // submits the given task and takes ownership of the pointer void SubmitTask(CacheTask* task, const ExecutionPolicy policy) const; - static void WaitOnCompletion(CacheTask* task); + // waits upon completion of caching + // returns the location of the data + static uint8_t* WaitOnCompletion(CacheTask* task); + + // invalidates the given pointer static void SignalDataUnused(CacheTask* task); }; -} \ No newline at end of file +} + +void offcache::CacheWorker::run(CacheWorker* this_) { + +} + +void offcache::CacheCoordinator::Init(CachePolicy* cache_policy_function, CopyPolicy* copy_policy_function) { + cache_policy_function_ = cache_policy_function; + copy_policy_function_ = copy_policy_function; + + // initialize numa library + numa_available(); + + const uint8_t nodes_max = numa_num_configured_nodes(); + const uint8_t valid_nodes = numa_get_mems_allowed(); + + for (uint8_t node = 0; node < nodes_max; node++) { + if (numa_bitmask_isbitset(valid_nodes, node)) { + workers_.insert({ node, CacheWorker() }); + workers_[node].numa_node_ = node; + std::thread t (CacheWorker::run, &workers_[node]); + t.detach(); + } + } +} + +void offcache::CacheCoordinator::SubmitTask(CacheTask* task, const ExecutionPolicy policy) const { + +} + +uint8_t* offcache::CacheCoordinator::WaitOnCompletion(CacheTask* task) { + while (!task->sub_tasks_.empty()) { + task->sub_tasks_.back().completed_.wait(false); + task->sub_tasks_.pop_back(); + } +} + +void offcache::CacheCoordinator::SignalDataUnused(CacheTask* task) { + task->active_.store(false); +} From d396056230d36275a397c1ad64588f9758bce283 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Constantin=20F=C3=BCrst?= Date: Tue, 9 Jan 2024 21:02:34 +0100 Subject: [PATCH 03/29] provide first draft of implementations for the cachers functionality --- offloading-cacher/offloading-cache.hpp | 298 ++++++++++++++++++++----- 1 file changed, 245 insertions(+), 53 deletions(-) diff --git a/offloading-cacher/offloading-cache.hpp b/offloading-cacher/offloading-cache.hpp index 9c2967a..d937fb8 100644 --- a/offloading-cacher/offloading-cache.hpp +++ b/offloading-cacher/offloading-cache.hpp @@ -4,9 +4,11 @@ #include #include #include +#include #include +#include #include #include @@ -28,13 +30,6 @@ namespace offcache { Relaxed, Immediate, ImmediateNoCache }; - struct WorkerTask { - uint8_t* src_; - 
uint8_t* dst_; - size_t size_; - std::atomic completed_ { false }; - }; - // the cache task structure will be used to submit and // control a cache element, while providing source pointer // and size in bytes for submission @@ -45,26 +40,11 @@ namespace offcache { struct CacheTask { uint8_t* data_; size_t size_; - ExecutionPolicy policy_; - uint8_t* result_; - std::atomic active_; - std::vector sub_tasks_; - }; - - // worker class, one for each numa node - // discovers its node configuration on startup - // and keeps track of available memory - class CacheWorker { - public: - uint8_t numa_node_ = 0; - - // this is the mailbox of the worker to which a new task - // may be submitted by exchanging nullptr with a valid one - // and notifying on the atomic after which ownership - // of the CacheTask structure is transferred to the worker - std::atomic* task_slot_ = nullptr; - - static void run(CacheWorker* this_); + uint8_t* result_ = nullptr; + uint8_t* maybe_result_ = nullptr; + std::atomic active_ { true }; + std::atomic valid_ { false }; + std::vector>> handlers_; }; // singleton which holds the cache workers @@ -77,71 +57,283 @@ namespace offcache { // return optimal cache placement // dst node and returned value can differ if the system, for example, has HBM // attached accessible directly to node n under a different node id m - typedef uint8_t (CachePolicy)(const uint8_t numa_dst_node, const uint8_t numa_src_node, const size_t data_size); + typedef int (CachePolicy)(const int numa_dst_node, const int numa_src_node, const size_t data_size); // copy policy specifies the copy-executing nodes for a given task // which allows flexibility in assignment for optimizing raw throughput // or choosing a conservative usage policy - typedef std::vector (CopyPolicy)(const uint8_t numa_dst_node, const uint8_t numa_src_node); + typedef std::vector (CopyPolicy)(const int numa_dst_node, const int numa_src_node); private: - std::unordered_map workers_; + std::shared_mutex cache_mutex_; std::unordered_map cache_state_; CachePolicy* cache_policy_function_ = nullptr; CopyPolicy* copy_policy_function_ = nullptr; + dml::handler> ExecuteCopy(const uint8_t* src, uint8_t* dst, const size_t size, const int node) const; + + void SubmitTask(CacheTask* task); + + CacheTask* CreateTask(const uint8_t *data, const size_t size) const; + + void DestroyTask(CacheTask* task) const; + public: void Init(CachePolicy* cache_policy_function, CopyPolicy* copy_policy_function); - // submits the given task and takes ownership of the pointer - void SubmitTask(CacheTask* task, const ExecutionPolicy policy) const; + // function to perform data access through the cache + // behaviour depends on the chosen execution policy + // Immediate and ImmediateNoCache return a cache task + // with guaranteed-valid result value where Relaxed + // policy does not come with this guarantee. 
+ CacheTask* Access(uint8_t* data, const size_t size, const ExecutionPolicy policy); // waits upon completion of caching - // returns the location of the data - static uint8_t* WaitOnCompletion(CacheTask* task); + static void WaitOnCompletion(CacheTask* task); // invalidates the given pointer + // afterwards the reference to the + // cache task object may be forgotten static void SignalDataUnused(CacheTask* task); - }; -} -void offcache::CacheWorker::run(CacheWorker* this_) { + // returns the location of the cached data + // which may or may not be valid + static uint8_t* GetDataLocation(CacheTask* task); + void Flush(); + }; } -void offcache::CacheCoordinator::Init(CachePolicy* cache_policy_function, CopyPolicy* copy_policy_function) { +inline void offcache::CacheCoordinator::Init(CachePolicy* cache_policy_function, CopyPolicy* copy_policy_function) { cache_policy_function_ = cache_policy_function; copy_policy_function_ = copy_policy_function; // initialize numa library numa_available(); +} + +inline offcache::CacheTask* offcache::CacheCoordinator::Access(uint8_t* data, const size_t size, const ExecutionPolicy policy) { + // the best situation is if this data is already cached + // which we check in an unnamed block in which the cache + // is locked for reading to prevent another thread + // from marking the element we may find as unused and + // clearing it + { + std::shared_lock lock(cache_mutex_); - const uint8_t nodes_max = numa_num_configured_nodes(); - const uint8_t valid_nodes = numa_get_mems_allowed(); + const auto search = cache_state_.find(data); - for (uint8_t node = 0; node < nodes_max; node++) { - if (numa_bitmask_isbitset(valid_nodes, node)) { - workers_.insert({ node, CacheWorker() }); - workers_[node].numa_node_ = node; - std::thread t (CacheWorker::run, &workers_[node]); - t.detach(); + if (search != cache_state_.end()) { + if (search->second->size_ == size) { + search->second->active_.store(true); + // TODO: check for completed status depending on execution policy + return search->second; + } + else { + DestroyTask(search->second); + cache_state_.erase(search); + } } - } + } + + // at this point the requested data is not present in cache + // and we create a caching task for it + + CacheTask* task = CreateTask(data, size); + + if (policy == ExecutionPolicy::Immediate) { + // in intermediate mode the returned task + // object is guaranteed to be valid and therefore + // its resulting location must be validated + // after which we submit the task + // maybe_result is then set by submit + + task->result_ = data; + SubmitTask(task); + return task; + } + else if (policy == ExecutionPolicy::ImmediateNoCache) { + // for immediatenocache we just validate + // the generated task and return it + // we must also set maybe_result in case + // someone waits on this + + task->result_ = data; + task->maybe_result_ = data; + return task; + } + else if (policy == ExecutionPolicy::Relaxed) { + // for relaxed no valid task must be returned + // and therefore we just submit and then give + // the possible invalid task back with only + // maybe_result set by submission + + SubmitTask(task); + return task; + } + else { + // this should not be reached + } } -void offcache::CacheCoordinator::SubmitTask(CacheTask* task, const ExecutionPolicy policy) const { +inline void offcache::CacheCoordinator::SubmitTask(CacheTask* task) { + // obtain numa node of current thread to determine where the data is needed + + const int current_cpu = sched_getcpu(); + const int current_node = 
numa_node_of_cpu(current_cpu); + + // obtain node that the given data pointer is allocated on + + int data_node = -1; + get_mempolicy(&data_node, NULL, 0, (void*)task->data_, MPOL_F_NODE | MPOL_F_ADDR); + + // querry cache policy function for the destination numa node + + const uint32_t dst_node = cache_policy_function_(current_node, data_node, task->size_); + + // allocate data on this node and flush the unused parts of the + // cache if the operation fails and retry once + // TODO: smarter flush strategy could keep some stuff cached + + uint8_t* dst = numa_alloc_onnode(task->size_, dst_node); + + if (dst == nullptr) { + Flush(); + + dst = numa_alloc_onnode(task->size_, dst_node); + + if (dst == nullptr) { + return; + } + } + task->maybe_result_ = dst; + + // querry copy policy function for the nodes to use for the copy + + const std::vector executing_nodes = copy_policy_function_(dst_node, data_node); + const size_t task_count = executing_nodes.size(); + + // at this point the task may be added to the cache structure + // due to the task being initialized with the valid flag set to false + + { + std::unique_lock lock(cache_mutex_); + + const auto state = cache_state_.insert({task->data_, task}); + + // if state.second is false then no insertion took place + // which means that concurrently whith this thread + // some other thread must have accessed the same + // resource in which case we must perform an abort + // TODO: abort is not the only way to handle this situation + + if (!state.second) { + // abort by doing the following steps + // (1) free the allocated memory, (2) remove the "maybe result" as + // we will not run the caching operation, (3) clear the sub tasks + // for the very same reason, (4) set the result to the RAM-location + + numa_free(dst, task->size_); + task->maybe_result_ = nullptr; + task->result_ = task->data_; + return; + } + } + + // each task will copy one fair part of the total size + // and in case the total size is not a factor of the + // given task count the last node must copy the remainder + + const size_t size = task->size_ / task_count; + const size_t last_size = size + task->size_ % task_count; + + // save the current numa node mask to restore later + // as executing the copy task will place this thread + // on a different node + + const int nodemask = numa_get_run_node_mask(); + + for (uint32_t i = 0; i < task_count; i++) { + const size_t local_size = i + 1 == task_count ? 
size : last_size; + const size_t local_offset = i * size; + const uint8_t* local_src = task->data_ + local_offset; + uint8_t* local_dst = dst + local_offset; + + const auto handler = ExecuteCopy(local_src, local_dst, local_size, executing_nodes[i]); + task->handlers_.emplace_back(handler); + } + + // set the valid flag of the task as all handlers + // required for completion signal are registered + + task->valid_.store(true); + task->valid_.notify_all(); + + // restore the previous nodemask + + numa_run_on_node_mask(nodemask); +} + +inline dml::handler> offcache::CacheCoordinator::ExecuteCopy(const uint8_t* src, uint8_t* dst, const size_t size, const int node) { + dml::data_view srcv = dml::make_view(reinterpret_cast(src), size); + dml::data_view dstv = dml::make_view(reinterpret_cast(dst), size); + + numa_run_on_node(node); + + return dml::submit(dml::mem_copy.block_on_fault(), srcv, dstv); +} + +inline offcache::CacheTask* offcache::CacheCoordinator::CreateTask(const uint8_t* data, const size_t size) const { + CacheTask* task = new CacheTask(); + task->data_ = data; + task->size_ = size; + return task; +} + +inline void offcache::CacheCoordinator::DestroyTask(CacheTask* task) const { + numa_free(task->result_, task->size_); + delete task; } -uint8_t* offcache::CacheCoordinator::WaitOnCompletion(CacheTask* task) { - while (!task->sub_tasks_.empty()) { - task->sub_tasks_.back().completed_.wait(false); - task->sub_tasks_.pop_back(); +inline void offcache::CacheCoordinator::WaitOnCompletion(CacheTask* task) { + task->valid_.wait(false); + + for (auto& handler : task->handlers_) { + auto result = handler.get(); + // TODO: handle the returned status code } + + task->handlers_.clear(); } -void offcache::CacheCoordinator::SignalDataUnused(CacheTask* task) { +inline uint8_t* offcache::CacheCoordinator::GetDataLocation(CacheTask* task) { + return task->result_; +} + +inline void offcache::CacheCoordinator::SignalDataUnused(CacheTask* task) { task->active_.store(false); } + +inline void offcache::CacheCoordinator::Flush() { + // TODO: there probably is a better way to implement this flush + + { + std::unique_lock lock(cache_mutex_); + + auto it = cache_state_.begin(); + + while (it != cache_state_.end()) { + if (it->second->active_.load() == false) { + DestroyTask(it->second); + cache_state_.erase(it); + it = cache_state_.begin(); + } + else { + it++; + } + } + } +} \ No newline at end of file From 5e30a370ceb12cf75d350e7075645ffff6cb8475 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Constantin=20F=C3=BCrst?= Date: Tue, 9 Jan 2024 23:49:22 +0100 Subject: [PATCH 04/29] finish first stage of caching implementation and provide a rudimentary test function in the main --- offloading-cacher/benchmark.hpp | 174 --------------- offloading-cacher/main.cpp | 80 ++++--- offloading-cacher/offloading-cache.hpp | 290 ++++++++++++++++--------- 3 files changed, 236 insertions(+), 308 deletions(-) delete mode 100644 offloading-cacher/benchmark.hpp diff --git a/offloading-cacher/benchmark.hpp b/offloading-cacher/benchmark.hpp deleted file mode 100644 index 550efc2..0000000 --- a/offloading-cacher/benchmark.hpp +++ /dev/null @@ -1,174 +0,0 @@ -#pragma once - -#include -#include -#include -#include - -#include -#include -#include - -#include - -#include "util/barrier.hpp" -#include "util/dml-helper.hpp" -#include "util/task-data.hpp" - -#define LOG_CODE_INFO "Location: " << __FILE__ << "@" << __LINE__ << "::" << __FUNCTION__ << std::endl -#define LOG_ERR { pthread_t t = pthread_self(); std::cerr << "--- BEGIN ERROR MSG 
---" << std::endl << "Physical: [Node " << args->numa_node << " | Thread " << t << "]" << std::endl; } std::cerr << LOG_CODE_INFO -#define CHECK_STATUS(status,msg) { if (status != dml::status_code::ok) { LOG_ERR << "Status Code: " << StatusCodeToString(status) << std::endl << msg << std::endl; args->status = status; return nullptr; }} - -#define ADD_TIMING_MESSUREMENT { if (i >= 5) { args->submit_duration.emplace_back(std::chrono::duration_cast(se - st).count()); args->complete_duration.emplace_back(std::chrono::duration_cast(et - se).count()); args->combined_duration.emplace_back(std::chrono::duration_cast(et - st).count());}} - -template -void* thread_function(void* argp) { - TaskData* args = reinterpret_cast(argp); - - // set numa node and core affinity of the current thread - numa_run_on_node(args->numa_node); - - // allocate memory for the move operation on the requested numa nodes - void* src = numa_alloc_onnode(args->size, args->nnode_src); - void* dst = numa_alloc_onnode(args->size, args->nnode_dst); - dml::data_view srcv = dml::make_view(reinterpret_cast(src), args->size); - dml::data_view dstv = dml::make_view(reinterpret_cast(dst), args->size); - - std::memset(src, 0, args->size); - std::memset(dst, 0, args->size); - - args->status = dml::status_code::ok; - args->rep_completed = 0; - - std::chrono::time_point tps; - - // we add 5 as the first 5 iterations will not be meassured - // to remove exceptional values encountered during warmup - for (uint32_t i = 0; i < args->rep_count + 5; i++) { - // synchronize the start of each iteration - // using the barrier structure - args->barrier_->wait(); - - if (args->batch_submit) { - const auto st = std::chrono::steady_clock::now(); - - auto sequence = dml::sequence(args->batch_size, std::allocator()); - - for (uint32_t j = 0; j < args->batch_size; j++) { - // block_on_fault() is required to submit the task in a way so that the - // DSA engine can handle page faults itself together with the IOMMU which - // requires the WQ to be configured to allow this too - - const auto status = sequence.add(dml::mem_copy.block_on_fault(), srcv, dstv); - CHECK_STATUS(status, "Adding operation to batch failed!"); - } - - // we use the asynchronous submit-routine even though this is not required - // here, however the project later on will only use async operation and - // therefore this behaviour should be benchmarked - - auto handler = dml::submit(dml::batch, sequence); - - const auto se = std::chrono::steady_clock::now(); - - auto result = handler.get(); - - const auto et = std::chrono::steady_clock::now(); - - const dml::status_code status = result.status; - CHECK_STATUS(status, "Batch completed with an Error!"); - - ADD_TIMING_MESSUREMENT; - } - else if (args->batch_size > 1) { - // implementation for non-batched batch submit follows here - // this means we submit a bunch of work as single descriptors - // but then dont wait for the completion immediately - - std::vector>> handlers; - - const auto st = std::chrono::steady_clock::now(); - - for (uint32_t j = 0; j < args->batch_size; j++) { - // block_on_fault() is required to submit the task in a way so that the - // DSA engine can handle page faults itself together with the IOMMU which - // requires the WQ to be configured to allow this too - - handlers.emplace_back(dml::submit(dml::mem_copy.block_on_fault(), srcv, dstv)); - } - - const auto se = std::chrono::steady_clock::now(); - - for (auto& handler : handlers) { - auto result = handler.get(); - const dml::status_code status = result.status; - 
CHECK_STATUS(status, "Operation completed with an Error!"); - } - - const auto et = std::chrono::steady_clock::now(); - - ADD_TIMING_MESSUREMENT; - } - else { - const auto st = std::chrono::steady_clock::now(); - - // we use the asynchronous submit-routine even though this is not required - // here, however the project later on will only use async operation and - // therefore this behaviour should be benchmarked - // block_on_fault() is required to submit the task in a way so that the - // DSA engine can handle page faults itself together with the IOMMU which - // requires the WQ to be configured to allow this too - auto handler = dml::submit(dml::mem_copy.block_on_fault(), srcv, dstv); - - const auto se = std::chrono::steady_clock::now(); - - auto result = handler.get(); - - const auto et = std::chrono::steady_clock::now(); - - const dml::status_code status = result.status; - CHECK_STATUS(status, "Operation completed with an Error!"); - - ADD_TIMING_MESSUREMENT; - } - - // again: we do not count the first 5 repetitions - if (i == 5) tps = std::chrono::steady_clock::now(); - if (i >= 5) args->rep_completed++; - } - - const auto tpe = std::chrono::steady_clock::now(); - - args->total_time = std::chrono::duration_cast(tpe - tps).count(); - - // free the allocated memory regions on the selected nodes - numa_free(src, args->size); - numa_free(dst, args->size); - - return nullptr; -} - -template -void execute_dml_memcpy(std::vector& args) { - barrier task_barrier(args.size()); - std::vector threads; - - // initialize numa library - numa_available(); - - // for each submitted task we link the semaphore - // and create the thread, passing the argument - for (auto& arg : args) { - arg.barrier_ = &task_barrier; - threads.emplace_back(); - - if (pthread_create(&threads.back(), nullptr, thread_function, &arg) != 0) { - std::cerr << "Error creating thread" << std::endl; - exit(1); - } - } - - for (pthread_t& t : threads) { - pthread_join(t, nullptr); - } -} \ No newline at end of file diff --git a/offloading-cacher/main.cpp b/offloading-cacher/main.cpp index f49f1f1..b6c9714 100644 --- a/offloading-cacher/main.cpp +++ b/offloading-cacher/main.cpp @@ -1,42 +1,64 @@ -#include - -#include #include -#include +#include -#include "benchmark.hpp" +#include "offloading-cache.hpp" -int main(int argc, char **argv) { - if (argc < 3) { - std::cout << "Missing input and output file names." << std::endl; - std::cout << "Usage: ./benchmarks [input.json] [output.json]" << std::endl; - return 1; - } +double* GetRandomArray(const size_t size) { + double* array = new double[size]; - const std::string input = argv[1]; - const std::string output = argv[2]; + std::uniform_real_distribution unif(std::numeric_limits::min(), std::numeric_limits::max()); + std::default_random_engine re; - std::string path; - std::vector args; + for (size_t i = 0; i < size; i++) { + array[i] = unif(re); + } - std::ifstream is(input); - ReadWorkDescription(args, path, is); - is.close(); + return array; +} - if (path == "hw") { - execute_dml_memcpy(args); +bool IsEqual(const double* a, const double* b, const size_t size) { + for (size_t i = 0; i < size; i++) { + try { + if (a[i] != b[i]) return false; + } + catch (...) 
{ + return false; + } } - else if (path == "sw") { - execute_dml_memcpy(args); + + return true; +} + +int main(int argc, char **argv) { + offcache::Cache cache; + + auto cache_policy = [](const int numa_dst_node, const int numa_src_node, const size_t data_size) { + return numa_dst_node; + }; + + auto copy_policy = [](const int numa_dst_node, const int numa_src_node) { + return std::vector{ numa_src_node }; + }; + + cache.Init(cache_policy,copy_policy); + + static constexpr size_t data_size = 8192; + double* data = GetRandomArray(data_size); + + std::unique_ptr data_cache = cache.Access(reinterpret_cast(data), data_size * sizeof(double), offcache::ExecutionPolicy::Relaxed); + + data_cache->WaitOnCompletion(); + + double* cached = reinterpret_cast(data_cache->GetDataLocation()); + + if (data == cached) { + std::cout << "Caching did not affect data location." << std::endl; } - else if (path == "auto") { - execute_dml_memcpy(args); + + if (IsEqual(data,cached,data_size)) { + std::cout << "Cached data is correct." << std::endl; } else { - std::cerr << "Path is neither hw/sw/auto." << std::endl; + std::cout << "Cached data is wrong." << std::endl; } - - std::ofstream os(output); - WriteResultLog(args, path, os); - os.close(); } diff --git a/offloading-cacher/offloading-cache.hpp b/offloading-cacher/offloading-cache.hpp index d937fb8..f40ef3d 100644 --- a/offloading-cacher/offloading-cache.hpp +++ b/offloading-cacher/offloading-cache.hpp @@ -1,15 +1,20 @@ #pragma once +#include + #include #include #include #include #include +#include +#include #include #include #include +#include #include @@ -30,6 +35,8 @@ namespace offcache { Relaxed, Immediate, ImmediateNoCache }; + class Cache; + // the cache task structure will be used to submit and // control a cache element, while providing source pointer // and size in bytes for submission @@ -37,19 +44,41 @@ namespace offcache { // then the submitting thread may wait on the atomic "result" // which will be notified by the cache worker upon processing // after which the atomic-bool-ptr active will also become valid - struct CacheTask { - uint8_t* data_; + class CacheData { + public: + using dml_handler = dml::handler>; + + private: + uint8_t* src_; size_t size_; - uint8_t* result_ = nullptr; - uint8_t* maybe_result_ = nullptr; - std::atomic active_ { true }; - std::atomic valid_ { false }; - std::vector>> handlers_; + + std::atomic* active_; + + protected: + std::atomic* cache_; + + uint8_t* incomplete_cache_; + + std::unique_ptr> handlers_; + + friend Cache; + + public: + CacheData(uint8_t* data, const size_t size); + CacheData(const CacheData& other); + ~CacheData(); + + void Deallocate(); + void WaitOnCompletion(); + + uint8_t* GetDataLocation() const; + + bool Active() const; }; // singleton which holds the cache workers // and is the place where work will be submited - class CacheCoordinator { + class Cache { public: // cache policy is defined as a type here to allow flexible usage of the cacher // given a numa destination node (where the data will be needed), the numa source @@ -67,18 +96,14 @@ namespace offcache { private: std::shared_mutex cache_mutex_; - std::unordered_map cache_state_; + std::unordered_map cache_state_; CachePolicy* cache_policy_function_ = nullptr; CopyPolicy* copy_policy_function_ = nullptr; dml::handler> ExecuteCopy(const uint8_t* src, uint8_t* dst, const size_t size, const int node) const; - void SubmitTask(CacheTask* task); - - CacheTask* CreateTask(const uint8_t *data, const size_t size) const; - - void 
DestroyTask(CacheTask* task) const; + void SubmitTask(CacheData* task); public: void Init(CachePolicy* cache_policy_function, CopyPolicy* copy_policy_function); @@ -88,33 +113,23 @@ namespace offcache { // Immediate and ImmediateNoCache return a cache task // with guaranteed-valid result value where Relaxed // policy does not come with this guarantee. - CacheTask* Access(uint8_t* data, const size_t size, const ExecutionPolicy policy); - - // waits upon completion of caching - static void WaitOnCompletion(CacheTask* task); - - // invalidates the given pointer - // afterwards the reference to the - // cache task object may be forgotten - static void SignalDataUnused(CacheTask* task); - - // returns the location of the cached data - // which may or may not be valid - static uint8_t* GetDataLocation(CacheTask* task); + std::unique_ptr Access(uint8_t* data, const size_t size, const ExecutionPolicy policy); void Flush(); }; } -inline void offcache::CacheCoordinator::Init(CachePolicy* cache_policy_function, CopyPolicy* copy_policy_function) { +inline void offcache::Cache::Init(CachePolicy* cache_policy_function, CopyPolicy* copy_policy_function) { cache_policy_function_ = cache_policy_function; copy_policy_function_ = copy_policy_function; // initialize numa library numa_available(); + + std::cout << "[-] Cache Initialized" << std::endl; } -inline offcache::CacheTask* offcache::CacheCoordinator::Access(uint8_t* data, const size_t size, const ExecutionPolicy policy) { +inline std::unique_ptr offcache::Cache::Access(uint8_t* data, const size_t size, const ExecutionPolicy policy) { // the best situation is if this data is already cached // which we check in an unnamed block in which the cache // is locked for reading to prevent another thread @@ -126,13 +141,16 @@ inline offcache::CacheTask* offcache::CacheCoordinator::Access(uint8_t* data, co const auto search = cache_state_.find(data); if (search != cache_state_.end()) { - if (search->second->size_ == size) { - search->second->active_.store(true); - // TODO: check for completed status depending on execution policy - return search->second; + if (search->second.size_ == size) { + search->second.active_->store(true); + + std::cout << "[+] Found Cached version for 0x" << std::hex << (uint64_t)data << std::dec << std::endl; + + return std::move(std::make_unique(search->second)); } else { - DestroyTask(search->second); + std::cout << "[!] 
Found Cached version with size missmatch for 0x" << std::hex << (uint64_t)data << std::dec << std::endl; + cache_state_.erase(search); } } @@ -141,7 +159,7 @@ inline offcache::CacheTask* offcache::CacheCoordinator::Access(uint8_t* data, co // at this point the requested data is not present in cache // and we create a caching task for it - CacheTask* task = CreateTask(data, size); + auto task = std::make_unique(data, size); if (policy == ExecutionPolicy::Immediate) { // in intermediate mode the returned task @@ -150,9 +168,9 @@ inline offcache::CacheTask* offcache::CacheCoordinator::Access(uint8_t* data, co // after which we submit the task // maybe_result is then set by submit - task->result_ = data; - SubmitTask(task); - return task; + task->cache_->store(data); + SubmitTask(task.get()); + return std::move(task); } else if (policy == ExecutionPolicy::ImmediateNoCache) { // for immediatenocache we just validate @@ -160,9 +178,9 @@ inline offcache::CacheTask* offcache::CacheCoordinator::Access(uint8_t* data, co // we must also set maybe_result in case // someone waits on this - task->result_ = data; - task->maybe_result_ = data; - return task; + task->cache_->store(data); + task->incomplete_cache_ = data; + return std::move(task); } else if (policy == ExecutionPolicy::Relaxed) { // for relaxed no valid task must be returned @@ -170,15 +188,15 @@ inline offcache::CacheTask* offcache::CacheCoordinator::Access(uint8_t* data, co // the possible invalid task back with only // maybe_result set by submission - SubmitTask(task); - return task; + SubmitTask(task.get()); + return std::move(task); } else { // this should not be reached } } -inline void offcache::CacheCoordinator::SubmitTask(CacheTask* task) { +inline void offcache::Cache::SubmitTask(CacheData* task) { // obtain numa node of current thread to determine where the data is needed const int current_cpu = sched_getcpu(); @@ -187,42 +205,72 @@ inline void offcache::CacheCoordinator::SubmitTask(CacheTask* task) { // obtain node that the given data pointer is allocated on int data_node = -1; - get_mempolicy(&data_node, NULL, 0, (void*)task->data_, MPOL_F_NODE | MPOL_F_ADDR); + get_mempolicy(&data_node, NULL, 0, (void*)task->src_, MPOL_F_NODE | MPOL_F_ADDR); // querry cache policy function for the destination numa node - const uint32_t dst_node = cache_policy_function_(current_node, data_node, task->size_); + const int dst_node = cache_policy_function_(current_node, data_node, task->size_); + + std::cout << "[+] Allocating " << task->size_ << "B on node " << dst_node << " for " << std::hex << (uint64_t)task->src_ << std::dec << std::endl; // allocate data on this node and flush the unused parts of the // cache if the operation fails and retry once // TODO: smarter flush strategy could keep some stuff cached - uint8_t* dst = numa_alloc_onnode(task->size_, dst_node); + uint8_t* dst = reinterpret_cast(numa_alloc_onnode(task->size_, dst_node)); if (dst == nullptr) { + std::cout << "[!] 
First allocation try failed for " << task->size_ << "B on node " << dst_node << std::endl; + Flush(); - dst = numa_alloc_onnode(task->size_, dst_node); + dst = reinterpret_cast(numa_alloc_onnode(task->size_, dst_node)); if (dst == nullptr) { + std::cout << "[x] Second allocation try failed for " << task->size_ << "B on node " << dst_node << std::endl; return; } } - task->maybe_result_ = dst; + task->incomplete_cache_ = dst; // querry copy policy function for the nodes to use for the copy const std::vector executing_nodes = copy_policy_function_(dst_node, data_node); const size_t task_count = executing_nodes.size(); - // at this point the task may be added to the cache structure - // due to the task being initialized with the valid flag set to false + // each task will copy one fair part of the total size + // and in case the total size is not a factor of the + // given task count the last node must copy the remainder + + const size_t size = task->size_ / task_count; + const size_t last_size = size + task->size_ % task_count; + + // save the current numa node mask to restore later + // as executing the copy task will place this thread + // on a different node + + bitmask* nodemask = numa_get_run_node_mask(); + + for (uint32_t i = 0; i < task_count; i++) { + const size_t local_size = i + 1 == task_count ? size : last_size; + const size_t local_offset = i * size; + const uint8_t* local_src = task->src_ + local_offset; + uint8_t* local_dst = dst + local_offset; + + task->handlers_->emplace_back(ExecuteCopy(local_src, local_dst, local_size, executing_nodes[i])); + } + + // only at this point may the task be added to the control structure + // because adding it earlier could cause it to be returned for an + // access request while the handler-vector is not fully populated + // which could cause the wait-function to return prematurely + // TODO: this can be optimized because the abort is quite expensive { std::unique_lock lock(cache_mutex_); - const auto state = cache_state_.insert({task->data_, task}); + const auto state = cache_state_.insert({task->src_, *task}); // if state.second is false then no insertion took place // which means that concurrently whith this thread @@ -231,94 +279,127 @@ inline void offcache::CacheCoordinator::SubmitTask(CacheTask* task) { // TODO: abort is not the only way to handle this situation if (!state.second) { + std::cout << "[x] Found another cache instance for 0x" << std::hex << (uint64_t)task->src_ << std::dec << std::endl; + + // first wait on all copy operations to be completed + + task->WaitOnCompletion(); + // abort by doing the following steps // (1) free the allocated memory, (2) remove the "maybe result" as // we will not run the caching operation, (3) clear the sub tasks // for the very same reason, (4) set the result to the RAM-location numa_free(dst, task->size_); - task->maybe_result_ = nullptr; - task->result_ = task->data_; + task->incomplete_cache_ = nullptr; + task->cache_->store(task->src_); + + std::cout << "[-] Abort completed for 0x" << std::hex << (uint64_t)task->src_ << std::dec << std::endl; + return; } } - // each task will copy one fair part of the total size - // and in case the total size is not a factor of the - // given task count the last node must copy the remainder + // restore the previous nodemask - const size_t size = task->size_ / task_count; - const size_t last_size = size + task->size_ % task_count; + numa_run_on_node_mask(nodemask); +} - // save the current numa node mask to restore later - // as executing the copy task will 
place this thread - // on a different node +inline dml::handler> offcache::Cache::ExecuteCopy(const uint8_t* src, uint8_t* dst, const size_t size, const int node) const { + dml::const_data_view srcv = dml::make_view(src, size); + dml::data_view dstv = dml::make_view(dst, size); - const int nodemask = numa_get_run_node_mask(); + numa_run_on_node(node); - for (uint32_t i = 0; i < task_count; i++) { - const size_t local_size = i + 1 == task_count ? size : last_size; - const size_t local_offset = i * size; - const uint8_t* local_src = task->data_ + local_offset; - uint8_t* local_dst = dst + local_offset; + return dml::submit(dml::mem_copy.block_on_fault(), srcv, dstv); +} + +inline void offcache::CacheData::WaitOnCompletion() { + if (handlers_ == nullptr) { + std::cout << "[-] Waiting on cache-var-update for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; - const auto handler = ExecuteCopy(local_src, local_dst, local_size, executing_nodes[i]); - task->handlers_.emplace_back(handler); + cache_->wait(nullptr); + + std::cout << "[+] Finished waiting on cache-var-update for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; } + else { + std::cout << "[-] Waiting on handlers for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; - // set the valid flag of the task as all handlers - // required for completion signal are registered + for (auto& handler : *handlers_) { + auto result = handler.get(); + // TODO: handle the returned status code + } - task->valid_.store(true); - task->valid_.notify_all(); + handlers_ = nullptr; - // restore the previous nodemask + std::cout << "[+] Finished waiting on handlers for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; - numa_run_on_node_mask(nodemask); + cache_->store(incomplete_cache_); + cache_->notify_all(); + } } -inline dml::handler> offcache::CacheCoordinator::ExecuteCopy(const uint8_t* src, uint8_t* dst, const size_t size, const int node) { - dml::data_view srcv = dml::make_view(reinterpret_cast(src), size); - dml::data_view dstv = dml::make_view(reinterpret_cast(dst), size); - - numa_run_on_node(node); +offcache::CacheData::CacheData(uint8_t* data, const size_t size) { + std::cout << "[-] New CacheData 0x" << std::hex << (uint64_t)data << std::dec << std::endl; - return dml::submit(dml::mem_copy.block_on_fault(), srcv, dstv); + src_ = data; + size_ = size; + active_ = new std::atomic(); + cache_ = new std::atomic(); + incomplete_cache_ = nullptr; + handlers_ = std::make_unique>(); } -inline offcache::CacheTask* offcache::CacheCoordinator::CreateTask(const uint8_t* data, const size_t size) const { - CacheTask* task = new CacheTask(); - task->data_ = data; - task->size_ = size; - return task; -} +offcache::CacheData::CacheData(const offcache::CacheData& other) { + std::cout << "[-] Copy Created for CacheData 0x" << std::hex << (uint64_t)other.src_ << std::dec << std::endl; -inline void offcache::CacheCoordinator::DestroyTask(CacheTask* task) const { - numa_free(task->result_, task->size_); - delete task; + src_ = other.src_; + size_ = other.size_; + cache_ = other.cache_; + active_ = other.active_; + incomplete_cache_ = nullptr; + handlers_ = nullptr; + active_->fetch_add(1); } -inline void offcache::CacheCoordinator::WaitOnCompletion(CacheTask* task) { - task->valid_.wait(false); +offcache::CacheData::~CacheData() { + std::cout << "[-] Destructor for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; + + const int32_t v = active_->fetch_sub(1); - for 
(auto& handler : task->handlers_) { - auto result = handler.get(); - // TODO: handle the returned status code + // if the returned value is non-positive + // then we must execute proper deletion + // as this was the last reference + + if (v <= 0) { + std::cout << "[!] Full Destructor for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; + + Deallocate(); + delete active_; + delete cache_; } +} - task->handlers_.clear(); +void offcache::CacheData::Deallocate() { + std::cout << "[!] Deallocating for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; + + numa_free(cache_, size_); + cache_ = nullptr; + incomplete_cache_ = nullptr; } -inline uint8_t* offcache::CacheCoordinator::GetDataLocation(CacheTask* task) { - return task->result_; +uint8_t *offcache::CacheData::GetDataLocation() const { + return cache_->load(); } -inline void offcache::CacheCoordinator::SignalDataUnused(CacheTask* task) { - task->active_.store(false); +bool offcache::CacheData::Active() const { + return active_->load() > 0; } -inline void offcache::CacheCoordinator::Flush() { - // TODO: there probably is a better way to implement this flush +inline void offcache::Cache::Flush() { + std::cout << "[-] Flushing Cache" << std::endl; + + // TODO: there is a better way to implement this flush { std::unique_lock lock(cache_mutex_); @@ -326,8 +407,7 @@ inline void offcache::CacheCoordinator::Flush() { auto it = cache_state_.begin(); while (it != cache_state_.end()) { - if (it->second->active_.load() == false) { - DestroyTask(it->second); + if (it->second.Active() == false) { cache_state_.erase(it); it = cache_state_.begin(); } From f19c069b0ffe88ac1fbf1c2dff6cc9fb65972e5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Constantin=20F=C3=BCrst?= Date: Tue, 9 Jan 2024 23:55:23 +0100 Subject: [PATCH 05/29] always perform copy from src and dst node, add another log output about the split --- offloading-cacher/main.cpp | 4 ++-- offloading-cacher/offloading-cache.hpp | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/offloading-cacher/main.cpp b/offloading-cacher/main.cpp index b6c9714..7aa8ea0 100644 --- a/offloading-cacher/main.cpp +++ b/offloading-cacher/main.cpp @@ -37,12 +37,12 @@ int main(int argc, char **argv) { }; auto copy_policy = [](const int numa_dst_node, const int numa_src_node) { - return std::vector{ numa_src_node }; + return std::vector{ numa_src_node, numa_dst_node }; }; cache.Init(cache_policy,copy_policy); - static constexpr size_t data_size = 8192; + static constexpr size_t data_size = 1024 * 1024; double* data = GetRandomArray(data_size); std::unique_ptr data_cache = cache.Access(reinterpret_cast(data), data_size * sizeof(double), offcache::ExecutionPolicy::Relaxed); diff --git a/offloading-cacher/offloading-cache.hpp b/offloading-cacher/offloading-cache.hpp index f40ef3d..ea91fae 100644 --- a/offloading-cacher/offloading-cache.hpp +++ b/offloading-cacher/offloading-cache.hpp @@ -246,6 +246,8 @@ inline void offcache::Cache::SubmitTask(CacheData* task) { const size_t size = task->size_ / task_count; const size_t last_size = size + task->size_ % task_count; + std::cout << "[-] Splitting Copy into " << task_count << " tasks of " << size << "B 0x" << std::hex << (uint64_t)task->src_ << std::dec << std::endl; + // save the current numa node mask to restore later // as executing the copy task will place this thread // on a different node From 395d3073100110fc9c899c82eee2c568730837ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Constantin=20F=C3=BCrst?= Date: Wed, 10 Jan 2024 
00:58:17 +0100 Subject: [PATCH 06/29] fix an issue with the freeing of data in the cacher --- offloading-cacher/main.cpp | 85 +++++++++-- offloading-cacher/offloading-cache.hpp | 189 ++++++++++++++++--------- 2 files changed, 192 insertions(+), 82 deletions(-) diff --git a/offloading-cacher/main.cpp b/offloading-cacher/main.cpp index 7aa8ea0..726033b 100644 --- a/offloading-cacher/main.cpp +++ b/offloading-cacher/main.cpp @@ -3,6 +3,8 @@ #include "offloading-cache.hpp" +offcache::Cache CACHE; + double* GetRandomArray(const size_t size) { double* array = new double[size]; @@ -29,36 +31,91 @@ bool IsEqual(const double* a, const double* b, const size_t size) { return true; } +void PerformAccessAndTest(double* src, const size_t size) { + // this is the function that any cache access will go through + // execution policy picks between three options: + // Relaxed may return an invalid (but not nullptr) CacheData + // which can then be validated with WaitOnCompletion() + // Immediate never returns an invalid CacheData structure + // however it may return just the pointer to source + // WaitOnCompletion() will then ensure that the data + // is actually in cache + // ImmediateNoCache behaves the same as Immediate but does never perform + // caching itself so only returns cached version if + // previously cached is available + + std::unique_ptr data_cache = CACHE.Access( + reinterpret_cast(src), + size * sizeof(double), + offcache::ExecutionPolicy::Immediate + ); + + double* cached_imm = reinterpret_cast(data_cache->GetDataLocation()); + + // check the value immediately just to see if ram or cache was returned + + if (src == cached_imm) { + std::cout << "Caching did not immediately yield different data location." << std::endl; + } + else { + std::cout << "Immediately got different data location." << std::endl; + } + + // waits for the completion of the asynchronous caching operation + + data_cache->WaitOnCompletion(); + + // gets the cache-data-location from the struct + + double* cached = reinterpret_cast(data_cache->GetDataLocation()); + + // tests on the resulting value + + if (src == cached) { + std::cout << "Caching did not affect data location." << std::endl; + } + + if (IsEqual(src,cached,size)) { + std::cout << "Cached data is correct." << std::endl; + } + else { + std::cout << "Cached data is wrong." 
<< std::endl; + } +} + int main(int argc, char **argv) { - offcache::Cache cache; + + // given numa destination and source node and the size of the data + // this function decides on which the data will be placed + // which is used to select the HBM-node for the dst-node if desired auto cache_policy = [](const int numa_dst_node, const int numa_src_node, const size_t data_size) { return numa_dst_node; }; + // this function receives the memory source and destination node + // and then decides, on which nodes the copy operation will be split + auto copy_policy = [](const int numa_dst_node, const int numa_src_node) { return std::vector{ numa_src_node, numa_dst_node }; }; - cache.Init(cache_policy,copy_policy); + // initializes the cache with the two policies + + CACHE.Init(cache_policy,copy_policy); + + // generate the test data static constexpr size_t data_size = 1024 * 1024; double* data = GetRandomArray(data_size); - std::unique_ptr data_cache = cache.Access(reinterpret_cast(data), data_size * sizeof(double), offcache::ExecutionPolicy::Relaxed); + std::cout << "--- first access --- " << std::endl; - data_cache->WaitOnCompletion(); + PerformAccessAndTest(data, data_size); - double* cached = reinterpret_cast(data_cache->GetDataLocation()); + std::cout << "--- second access --- " << std::endl; - if (data == cached) { - std::cout << "Caching did not affect data location." << std::endl; - } + PerformAccessAndTest(data, data_size); - if (IsEqual(data,cached,data_size)) { - std::cout << "Cached data is correct." << std::endl; - } - else { - std::cout << "Cached data is wrong." << std::endl; - } + std::cout << "--- end of application --- " << std::endl; } diff --git a/offloading-cacher/offloading-cache.hpp b/offloading-cacher/offloading-cache.hpp index ea91fae..e265665 100644 --- a/offloading-cacher/offloading-cache.hpp +++ b/offloading-cacher/offloading-cache.hpp @@ -94,9 +94,14 @@ namespace offcache { typedef std::vector (CopyPolicy)(const int numa_dst_node, const int numa_src_node); private: + // mutex for accessing the cache state map + std::shared_mutex cache_mutex_; - std::unordered_map cache_state_; + // map from [dst-numa-node,map2] + // map2 from [data-ptr,cache-structure] + + std::unordered_map> cache_state_; CachePolicy* cache_policy_function_ = nullptr; CopyPolicy* copy_policy_function_ = nullptr; @@ -105,6 +110,12 @@ namespace offcache { void SubmitTask(CacheData* task); + void GetCacheNode(uint8_t* src, const size_t size, int* OUT_DST_NODE, int* OUT_SRC_NODE) const; + + void AbortTask(CacheData* task) const; + + std::unique_ptr GetFromCache(uint8_t* src, const size_t size); + public: void Init(CachePolicy* cache_policy_function, CopyPolicy* copy_policy_function); @@ -126,40 +137,29 @@ inline void offcache::Cache::Init(CachePolicy* cache_policy_function, CopyPolicy // initialize numa library numa_available(); + const int nodes_max = numa_num_configured_nodes(); + const bitmask* valid_nodes = numa_get_mems_allowed(); + + for (int node = 0; node < nodes_max; node++) { + if (numa_bitmask_isbitset(valid_nodes, node)) { + cache_state_.insert({node,{}}); + } + } + std::cout << "[-] Cache Initialized" << std::endl; } inline std::unique_ptr offcache::Cache::Access(uint8_t* data, const size_t size, const ExecutionPolicy policy) { - // the best situation is if this data is already cached - // which we check in an unnamed block in which the cache - // is locked for reading to prevent another thread - // from marking the element we may find as unused and - // clearing it - { - std::shared_lock 
lock(cache_mutex_); - - const auto search = cache_state_.find(data); - - if (search != cache_state_.end()) { - if (search->second.size_ == size) { - search->second.active_->store(true); - - std::cout << "[+] Found Cached version for 0x" << std::hex << (uint64_t)data << std::dec << std::endl; - - return std::move(std::make_unique(search->second)); - } - else { - std::cout << "[!] Found Cached version with size missmatch for 0x" << std::hex << (uint64_t)data << std::dec << std::endl; + std::unique_ptr task = GetFromCache(data, size); - cache_state_.erase(search); - } - } + if (task != nullptr) { + return std::move(task); } // at this point the requested data is not present in cache // and we create a caching task for it - auto task = std::make_unique(data, size); + task = std::make_unique(data, size); if (policy == ExecutionPolicy::Immediate) { // in intermediate mode the returned task @@ -197,19 +197,12 @@ inline std::unique_ptr offcache::Cache::Access(uint8_t* dat } inline void offcache::Cache::SubmitTask(CacheData* task) { - // obtain numa node of current thread to determine where the data is needed - - const int current_cpu = sched_getcpu(); - const int current_node = numa_node_of_cpu(current_cpu); - - // obtain node that the given data pointer is allocated on - - int data_node = -1; - get_mempolicy(&data_node, NULL, 0, (void*)task->src_, MPOL_F_NODE | MPOL_F_ADDR); + // get destination numa node for the cache - // querry cache policy function for the destination numa node + int dst_node = -1; + int src_node = -1; - const int dst_node = cache_policy_function_(current_node, data_node, task->size_); + GetCacheNode(task->src_, task->size_, &dst_node, &src_node); std::cout << "[+] Allocating " << task->size_ << "B on node " << dst_node << " for " << std::hex << (uint64_t)task->src_ << std::dec << std::endl; @@ -236,7 +229,7 @@ inline void offcache::Cache::SubmitTask(CacheData* task) { // querry copy policy function for the nodes to use for the copy - const std::vector executing_nodes = copy_policy_function_(dst_node, data_node); + const std::vector executing_nodes = copy_policy_function_(dst_node, src_node); const size_t task_count = executing_nodes.size(); // each task will copy one fair part of the total size @@ -272,7 +265,7 @@ inline void offcache::Cache::SubmitTask(CacheData* task) { { std::unique_lock lock(cache_mutex_); - const auto state = cache_state_.insert({task->src_, *task}); + const auto state = cache_state_[dst_node].emplace(task->src_, *task); // if state.second is false then no insertion took place // which means that concurrently whith this thread @@ -283,20 +276,7 @@ inline void offcache::Cache::SubmitTask(CacheData* task) { if (!state.second) { std::cout << "[x] Found another cache instance for 0x" << std::hex << (uint64_t)task->src_ << std::dec << std::endl; - // first wait on all copy operations to be completed - - task->WaitOnCompletion(); - - // abort by doing the following steps - // (1) free the allocated memory, (2) remove the "maybe result" as - // we will not run the caching operation, (3) clear the sub tasks - // for the very same reason, (4) set the result to the RAM-location - - numa_free(dst, task->size_); - task->incomplete_cache_ = nullptr; - task->cache_->store(task->src_); - - std::cout << "[-] Abort completed for 0x" << std::hex << (uint64_t)task->src_ << std::dec << std::endl; + AbortTask(task); return; } @@ -346,7 +326,7 @@ offcache::CacheData::CacheData(uint8_t* data, const size_t size) { src_ = data; size_ = size; - active_ = new std::atomic(); + 
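A minimal standalone sketch of the reference-counting scheme introduced here, assuming a plain std::atomic<int32_t> in place of the CacheData member: the initial value of 1 is the reference the cache keeps internally, each copy adds one, and fetch_sub(1) - 1 yields the count remaining after the current release, which is the check the copy constructor and destructor below rely on.

#include <atomic>
#include <cassert>
#include <cstdint>

int main() {
    std::atomic<int32_t> active(1);            // reference held by the cache itself

    active.fetch_add(1);                       // a CacheData copy is handed out

    const int32_t after_copy = active.fetch_sub(1) - 1;   // the copy is destroyed
    assert(after_copy == 1);                   // the cache-internal reference remains

    const int32_t after_last = active.fetch_sub(1) - 1;   // the last owner releases
    assert(after_last <= 0);                   // only now may the buffers be freed

    return 0;
}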
active_ = new std::atomic(1); cache_ = new std::atomic(); incomplete_cache_ = nullptr; handlers_ = std::make_unique>(); @@ -355,21 +335,25 @@ offcache::CacheData::CacheData(uint8_t* data, const size_t size) { offcache::CacheData::CacheData(const offcache::CacheData& other) { std::cout << "[-] Copy Created for CacheData 0x" << std::hex << (uint64_t)other.src_ << std::dec << std::endl; + active_ = other.active_; + const int current_active = active_->fetch_add(1); + src_ = other.src_; size_ = other.size_; cache_ = other.cache_; - active_ = other.active_; incomplete_cache_ = nullptr; handlers_ = nullptr; - active_->fetch_add(1); } offcache::CacheData::~CacheData() { std::cout << "[-] Destructor for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; - const int32_t v = active_->fetch_sub(1); + // due to fetch_sub returning the preivously held value + // we must subtract one locally to get the current value - // if the returned value is non-positive + const int32_t v = active_->fetch_sub(1) - 1; + + // if the returned value is zero or lower // then we must execute proper deletion // as this was the last reference @@ -390,7 +374,23 @@ void offcache::CacheData::Deallocate() { incomplete_cache_ = nullptr; } -uint8_t *offcache::CacheData::GetDataLocation() const { +void offcache::Cache::GetCacheNode(uint8_t* src, const size_t size, int* OUT_DST_NODE, int* OUT_SRC_NODE) const { + // obtain numa node of current thread to determine where the data is needed + + const int current_cpu = sched_getcpu(); + const int current_node = numa_node_of_cpu(current_cpu); + + // obtain node that the given data pointer is allocated on + + *OUT_SRC_NODE = -1; + get_mempolicy(OUT_SRC_NODE, NULL, 0, (void*)src, MPOL_F_NODE | MPOL_F_ADDR); + + // querry cache policy function for the destination numa node + + *OUT_DST_NODE = cache_policy_function_(current_node, *OUT_SRC_NODE, size); +} + +uint8_t* offcache::CacheData::GetDataLocation() const { return cache_->load(); } @@ -405,17 +405,70 @@ inline void offcache::Cache::Flush() { { std::unique_lock lock(cache_mutex_); - - auto it = cache_state_.begin(); - while (it != cache_state_.end()) { - if (it->second.Active() == false) { - cache_state_.erase(it); - it = cache_state_.begin(); - } - else { - it++; + for (auto& nc : cache_state_) { + auto it = nc.second.begin(); + + while (it != nc.second.end()) { + if (it->second.Active() == false) { + nc.second.erase(it); + it = nc.second.begin(); + } + else { + it++; + } } } } -} \ No newline at end of file +} + +void offcache::Cache::AbortTask(offcache::CacheData *task) const { + // first wait on all copy operations to be completed + + task->WaitOnCompletion(); + + // abort by doing the following steps + // (1) free the allocated memory, (2) remove the "maybe result" as + // we will not run the caching operation, (3) clear the sub tasks + // for the very same reason, (4) set the result to the RAM-location + + numa_free(task->incomplete_cache_, task->size_); + task->incomplete_cache_ = nullptr; + task->cache_->store(task->src_); + + std::cout << "[-] Abort completed for 0x" << std::hex << (uint64_t)task->src_ << std::dec << std::endl; +} + +std::unique_ptr offcache::Cache::GetFromCache(uint8_t* src, const size_t size) { + // the best situation is if this data is already cached + // which we check in an unnamed block in which the cache + // is locked for reading to prevent another thread + // from marking the element we may find as unused and + // clearing it + + int dst_node = -1; + int src_node = -1; + + 
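A self-contained sketch of the node lookup that GetCacheNode() above performs, assuming libnuma is available and the program is linked with -lnuma (and, on glibc, that _GNU_SOURCE is defined for sched_getcpu): get_mempolicy() with MPOL_F_NODE | MPOL_F_ADDR reports the node backing an address, while sched_getcpu() and numa_node_of_cpu() identify the node of the calling thread.

#include <cstdio>
#include <numa.h>
#include <numaif.h>
#include <sched.h>

int main() {
    if (numa_available() < 0) return 1;

    // allocate a small buffer on node 0 so the query has a known answer
    void* data = numa_alloc_onnode(4096, 0);
    if (data == nullptr) return 1;

    // node that currently backs the allocation
    int src_node = -1;
    get_mempolicy(&src_node, nullptr, 0, data, MPOL_F_NODE | MPOL_F_ADDR);

    // node of the thread asking for the data
    const int current_node = numa_node_of_cpu(sched_getcpu());

    std::printf("data resides on node %d, requested from node %d\n", src_node, current_node);

    numa_free(data, 4096);
    return 0;
}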
GetCacheNode(src, size, &dst_node, &src_node); + + std::shared_lock lock(cache_mutex_); + + const auto search = cache_state_[dst_node].find(src); + + if (search != cache_state_[dst_node].end()) { + if (search->second.size_ == size) { + search->second.active_->store(true); + + std::cout << "[+] Found Cached version for 0x" << std::hex << (uint64_t)src << std::dec << std::endl; + + return std::move(std::make_unique(search->second)); + } + else { + std::cout << "[!] Found Cached version with size missmatch for 0x" << std::hex << (uint64_t)src << std::dec << std::endl; + + cache_state_[dst_node].erase(search); + } + } + + return nullptr; +} From c01eafedaea03fb70f2c2ae0421e5f2a4b7b2f96 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Constantin=20F=C3=BCrst?= Date: Wed, 10 Jan 2024 11:45:29 +0100 Subject: [PATCH 07/29] refactor the cacher to reduce complexity, removes the access guarantees (relaxed,immediate,...), uses the fact that other tasks will wait on atomic value change for the cache-pointer if it is nullptr to add the entry to cache structure earlier reducing cost of two threads accessing new entry at the same time, splits the offloading-cache.hpp file into two with one containing the data-class (represents a cache entry and task) and the other containing the cacher itself --- offloading-cacher/cache-data.hpp | 139 ++++++++ offloading-cacher/cache.hpp | 280 +++++++++++++++ offloading-cacher/main.cpp | 30 +- offloading-cacher/offloading-cache.hpp | 474 ------------------------- 4 files changed, 432 insertions(+), 491 deletions(-) create mode 100644 offloading-cacher/cache-data.hpp create mode 100644 offloading-cacher/cache.hpp delete mode 100644 offloading-cacher/offloading-cache.hpp diff --git a/offloading-cacher/cache-data.hpp b/offloading-cacher/cache-data.hpp new file mode 100644 index 0000000..4028597 --- /dev/null +++ b/offloading-cacher/cache-data.hpp @@ -0,0 +1,139 @@ +#pragma once + +#include + +#include +#include +#include + +#include + +namespace dsacache { + class Cache; + + // the cache task structure will be used to submit and + // control a cache element, while providing source pointer + // and size in bytes for submission + // + // then the submitting thread may wait on the atomic "result" + // which will be notified by the cache worker upon processing + // after which the atomic-bool-ptr active will also become valid + class CacheData { + public: + using dml_handler = dml::handler>; + + private: + uint8_t* src_; + size_t size_; + + std::atomic* active_; + + protected: + std::atomic* cache_; + + uint8_t* incomplete_cache_; + + std::unique_ptr> handlers_; + + friend Cache; + + public: + CacheData(uint8_t* data, const size_t size); + CacheData(const CacheData& other); + ~CacheData(); + + void Deallocate(); + + void WaitOnCompletion(); + + uint8_t* GetDataLocation() const; + + bool Active() const; + }; +} + +inline void dsacache::CacheData::WaitOnCompletion() { + if (handlers_ == nullptr) { + std::cout << "[-] Waiting on cache-var-update for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; + + cache_->wait(nullptr); + + std::cout << "[+] Finished waiting on cache-var-update for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; + } + else { + std::cout << "[-] Waiting on handlers for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; + + for (auto& handler : *handlers_) { + auto result = handler.get(); + // TODO: handle the returned status code + } + + handlers_ = nullptr; + + std::cout << "[+] Finished waiting on handlers 
for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; + + cache_->store(incomplete_cache_); + cache_->notify_all(); + } +} + +dsacache::CacheData::CacheData(uint8_t* data, const size_t size) { + std::cout << "[-] New CacheData 0x" << std::hex << (uint64_t)data << std::dec << std::endl; + + src_ = data; + size_ = size; + active_ = new std::atomic(1); + cache_ = new std::atomic(); + incomplete_cache_ = nullptr; + handlers_ = std::make_unique>(); +} + +dsacache::CacheData::CacheData(const dsacache::CacheData& other) { + std::cout << "[-] Copy Created for CacheData 0x" << std::hex << (uint64_t)other.src_ << std::dec << std::endl; + + active_ = other.active_; + const int current_active = active_->fetch_add(1); + + src_ = other.src_; + size_ = other.size_; + cache_ = other.cache_; + incomplete_cache_ = nullptr; + handlers_ = nullptr; +} + +dsacache::CacheData::~CacheData() { + std::cout << "[-] Destructor for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; + + // due to fetch_sub returning the preivously held value + // we must subtract one locally to get the current value + + const int32_t v = active_->fetch_sub(1) - 1; + + // if the returned value is zero or lower + // then we must execute proper deletion + // as this was the last reference + + if (v <= 0) { + std::cout << "[!] Full Destructor for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; + + Deallocate(); + delete active_; + delete cache_; + } +} + +void dsacache::CacheData::Deallocate() { + std::cout << "[!] Deallocating for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; + + numa_free(cache_, size_); + cache_ = nullptr; + incomplete_cache_ = nullptr; +} + +uint8_t* dsacache::CacheData::GetDataLocation() const { + return cache_->load(); +} + +bool dsacache::CacheData::Active() const { + return active_->load() > 0; +} \ No newline at end of file diff --git a/offloading-cacher/cache.hpp b/offloading-cacher/cache.hpp new file mode 100644 index 0000000..0081a04 --- /dev/null +++ b/offloading-cacher/cache.hpp @@ -0,0 +1,280 @@ +#pragma once + +#include + +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include "cache-data.hpp" + +namespace dsacache { + // singleton which holds the cache workers + // and is the place where work will be submited + class Cache { + public: + // cache policy is defined as a type here to allow flexible usage of the cacher + // given a numa destination node (where the data will be needed), the numa source + // node (current location of the data) and the data size, this function should + // return optimal cache placement + // dst node and returned value can differ if the system, for example, has HBM + // attached accessible directly to node n under a different node id m + typedef int (CachePolicy)(const int numa_dst_node, const int numa_src_node, const size_t data_size); + + // copy policy specifies the copy-executing nodes for a given task + // which allows flexibility in assignment for optimizing raw throughput + // or choosing a conservative usage policy + typedef std::vector (CopyPolicy)(const int numa_dst_node, const int numa_src_node); + + private: + // mutex for accessing the cache state map + + std::shared_mutex cache_mutex_; + + // map from [dst-numa-node,map2] + // map2 from [data-ptr,cache-structure] + + std::unordered_map> cache_state_; + + CachePolicy* cache_policy_function_ = nullptr; + CopyPolicy* copy_policy_function_ = nullptr; + + dml::handler> ExecuteCopy( + const uint8_t* 
src, uint8_t* dst, const size_t size, const int node + ) const; + + void SubmitTask(CacheData* task, const int dst_node, const int src_node); + + void GetCacheNode(uint8_t* src, const size_t size, int* OUT_DST_NODE, int* OUT_SRC_NODE) const; + + std::unique_ptr GetFromCache(uint8_t* src, const size_t size, const int dst_node); + + public: + void Init(CachePolicy* cache_policy_function, CopyPolicy* copy_policy_function); + + // function to perform data access through the cache + std::unique_ptr Access(uint8_t* data, const size_t size); + + void Flush(const int node = -1); + }; +} + +inline void dsacache::Cache::Init(CachePolicy* cache_policy_function, CopyPolicy* copy_policy_function) { + cache_policy_function_ = cache_policy_function; + copy_policy_function_ = copy_policy_function; + + // initialize numa library + numa_available(); + + const int nodes_max = numa_num_configured_nodes(); + const bitmask* valid_nodes = numa_get_mems_allowed(); + + for (int node = 0; node < nodes_max; node++) { + if (numa_bitmask_isbitset(valid_nodes, node)) { + cache_state_.insert({node,{}}); + } + } + + std::cout << "[-] Cache Initialized" << std::endl; +} + +inline std::unique_ptr dsacache::Cache::Access(uint8_t* data, const size_t size) { + // get destination numa node for the cache + + int dst_node = -1; + int src_node = -1; + + GetCacheNode(data, size, &dst_node, &src_node); + + // check whether the data is already cached + + std::unique_ptr task = GetFromCache(data, size, dst_node); + + if (task != nullptr) { + return std::move(task); + } + + // at this point the requested data is not present in cache + // and we create a caching task for it + + task = std::make_unique(data, size); + + { + std::unique_lock lock(cache_mutex_); + + const auto state = cache_state_[dst_node].emplace(task->src_, *task); + + // if state.second is false then no insertion took place + // which means that concurrently whith this thread + // some other thread must have accessed the same + // resource in which case we return the other + // threads data cache structure + + if (!state.second) { + std::cout << "[!] Found another cache instance for 0x" << std::hex << (uint64_t)task->src_ << std::dec << std::endl; + return std::move(std::make_unique(state.first->second)); + } + } + + SubmitTask(task.get(), dst_node, src_node); + + return std::move(task); +} + +inline void dsacache::Cache::SubmitTask(CacheData* task, const int dst_node, const int src_node) { + std::cout << "[+] Allocating " << task->size_ << "B on node " << dst_node << " for " << std::hex << (uint64_t)task->src_ << std::dec << std::endl; + + // allocate data on this node and flush the unused parts of the + // cache if the operation fails and retry once + // TODO: smarter flush strategy could keep some stuff cached + + uint8_t* dst = reinterpret_cast(numa_alloc_onnode(task->size_, dst_node)); + + if (dst == nullptr) { + std::cout << "[!] 
First allocation try failed for " << task->size_ << "B on node " << dst_node << std::endl; + + // allocation on dst_node failed so we flush the cache for this + // node hoping to free enough currently unused entries to make + // the second allocation attempt successful + + Flush(dst_node); + + dst = reinterpret_cast(numa_alloc_onnode(task->size_, dst_node)); + + if (dst == nullptr) { + std::cout << "[x] Second allocation try failed for " << task->size_ << "B on node " << dst_node << std::endl; + return; + } + } + + task->incomplete_cache_ = dst; + + // querry copy policy function for the nodes to use for the copy + + const std::vector executing_nodes = copy_policy_function_(dst_node, src_node); + const size_t task_count = executing_nodes.size(); + + // each task will copy one fair part of the total size + // and in case the total size is not a factor of the + // given task count the last node must copy the remainder + + const size_t size = task->size_ / task_count; + const size_t last_size = size + task->size_ % task_count; + + std::cout << "[-] Splitting Copy into " << task_count << " tasks of " << size << "B 0x" << std::hex << (uint64_t)task->src_ << std::dec << std::endl; + + // save the current numa node mask to restore later + // as executing the copy task will place this thread + // on a different node + + bitmask* nodemask = numa_get_run_node_mask(); + + for (uint32_t i = 0; i < task_count; i++) { + const size_t local_size = i + 1 == task_count ? size : last_size; + const size_t local_offset = i * size; + const uint8_t* local_src = task->src_ + local_offset; + uint8_t* local_dst = dst + local_offset; + + task->handlers_->emplace_back(ExecuteCopy(local_src, local_dst, local_size, executing_nodes[i])); + } + + // restore the previous nodemask + + numa_run_on_node_mask(nodemask); +} + +inline dml::handler> dsacache::Cache::ExecuteCopy( + const uint8_t* src, uint8_t* dst, const size_t size, const int node +) const { + dml::const_data_view srcv = dml::make_view(src, size); + dml::data_view dstv = dml::make_view(dst, size); + + numa_run_on_node(node); + + return dml::submit(dml::mem_copy.block_on_fault(), srcv, dstv); +} + + +void dsacache::Cache::GetCacheNode(uint8_t* src, const size_t size, int* OUT_DST_NODE, int* OUT_SRC_NODE) const { + // obtain numa node of current thread to determine where the data is needed + + const int current_cpu = sched_getcpu(); + const int current_node = numa_node_of_cpu(current_cpu); + + // obtain node that the given data pointer is allocated on + + *OUT_SRC_NODE = -1; + get_mempolicy(OUT_SRC_NODE, NULL, 0, (void*)src, MPOL_F_NODE | MPOL_F_ADDR); + + // querry cache policy function for the destination numa node + + *OUT_DST_NODE = cache_policy_function_(current_node, *OUT_SRC_NODE, size); +} + + +inline void dsacache::Cache::Flush(const int node) { + std::cout << "[-] Flushing Cache for " << (node == -1 ? 
"all nodes" : "node " + std::to_string(node)) << std::endl; + + const auto FlushNode = [](std::unordered_map& map) { + auto it = map.begin(); + + while (it != map.end()) { + if (it->second.Active() == false) { + map.erase(it); + it = map.begin(); + } + else { + it++; + } + } + }; + + { + std::unique_lock lock(cache_mutex_); + + if (node == -1) { + for (auto& nc : cache_state_) { + FlushNode(nc.second); + } + } + else { + FlushNode(cache_state_[node]); + } + } +} + +std::unique_ptr dsacache::Cache::GetFromCache(uint8_t* src, const size_t size, const int dst_node) { + // the best situation is if this data is already cached + // which we check in an unnamed block in which the cache + // is locked for reading to prevent another thread + // from marking the element we may find as unused and + // clearing it + + std::shared_lock lock(cache_mutex_); + + const auto search = cache_state_[dst_node].find(src); + + if (search != cache_state_[dst_node].end()) { + if (search->second.size_ == size) { + search->second.active_->store(true); + + std::cout << "[+] Found Cached version for 0x" << std::hex << (uint64_t)src << std::dec << std::endl; + + return std::move(std::make_unique(search->second)); + } + else { + std::cout << "[!] Found Cached version with size missmatch for 0x" << std::hex << (uint64_t)src << std::dec << std::endl; + + cache_state_[dst_node].erase(search); + } + } + + return nullptr; +} diff --git a/offloading-cacher/main.cpp b/offloading-cacher/main.cpp index 726033b..e67eb22 100644 --- a/offloading-cacher/main.cpp +++ b/offloading-cacher/main.cpp @@ -1,9 +1,9 @@ #include #include -#include "offloading-cache.hpp" +#include "cache.hpp" -offcache::Cache CACHE; +dsacache::Cache CACHE; double* GetRandomArray(const size_t size) { double* array = new double[size]; @@ -32,22 +32,9 @@ bool IsEqual(const double* a, const double* b, const size_t size) { } void PerformAccessAndTest(double* src, const size_t size) { - // this is the function that any cache access will go through - // execution policy picks between three options: - // Relaxed may return an invalid (but not nullptr) CacheData - // which can then be validated with WaitOnCompletion() - // Immediate never returns an invalid CacheData structure - // however it may return just the pointer to source - // WaitOnCompletion() will then ensure that the data - // is actually in cache - // ImmediateNoCache behaves the same as Immediate but does never perform - // caching itself so only returns cached version if - // previously cached is available - - std::unique_ptr data_cache = CACHE.Access( + std::unique_ptr data_cache = CACHE.Access( reinterpret_cast(src), - size * sizeof(double), - offcache::ExecutionPolicy::Immediate + size * sizeof(double) ); double* cached_imm = reinterpret_cast(data_cache->GetDataLocation()); @@ -57,6 +44,9 @@ void PerformAccessAndTest(double* src, const size_t size) { if (src == cached_imm) { std::cout << "Caching did not immediately yield different data location." << std::endl; } + else if (cached_imm == nullptr) { + std::cout << "Immediately got nullptr." << std::endl; + } else { std::cout << "Immediately got different data location." << std::endl; } @@ -74,6 +64,12 @@ void PerformAccessAndTest(double* src, const size_t size) { if (src == cached) { std::cout << "Caching did not affect data location." << std::endl; } + else if (cached == nullptr) { + std::cout << "Got nullptr from cache." << std::endl; + } + else { + std::cout << "Got different data location from cache." 
<< std::endl; + } if (IsEqual(src,cached,size)) { std::cout << "Cached data is correct." << std::endl; diff --git a/offloading-cacher/offloading-cache.hpp b/offloading-cacher/offloading-cache.hpp deleted file mode 100644 index e265665..0000000 --- a/offloading-cacher/offloading-cache.hpp +++ /dev/null @@ -1,474 +0,0 @@ -#pragma once - -#include - -#include -#include -#include -#include -#include -#include -#include - -#include - -#include -#include -#include - -#include - -namespace offcache { - // execution policy selects in which way the data is supposed to be cached - // and returned with the following behaviour is guaranteed in addition to the - // returned value being valid: - // Immediate: return as fast as possible - // may return cached data, can return data in RAM - // will trigger caching of the data provided - // ImmediateNoCache: return as fast as possible and never trigger caching - // same as Immediate but will not trigger caching - // Relaxed: no rapid return needed, take time - // will trigger caching and may only return - // once the caching is successful but can still - // provide data in RAM - enum class ExecutionPolicy { - Relaxed, Immediate, ImmediateNoCache - }; - - class Cache; - - // the cache task structure will be used to submit and - // control a cache element, while providing source pointer - // and size in bytes for submission - // - // then the submitting thread may wait on the atomic "result" - // which will be notified by the cache worker upon processing - // after which the atomic-bool-ptr active will also become valid - class CacheData { - public: - using dml_handler = dml::handler>; - - private: - uint8_t* src_; - size_t size_; - - std::atomic* active_; - - protected: - std::atomic* cache_; - - uint8_t* incomplete_cache_; - - std::unique_ptr> handlers_; - - friend Cache; - - public: - CacheData(uint8_t* data, const size_t size); - CacheData(const CacheData& other); - ~CacheData(); - - void Deallocate(); - void WaitOnCompletion(); - - uint8_t* GetDataLocation() const; - - bool Active() const; - }; - - // singleton which holds the cache workers - // and is the place where work will be submited - class Cache { - public: - // cache policy is defined as a type here to allow flexible usage of the cacher - // given a numa destination node (where the data will be needed), the numa source - // node (current location of the data) and the data size, this function should - // return optimal cache placement - // dst node and returned value can differ if the system, for example, has HBM - // attached accessible directly to node n under a different node id m - typedef int (CachePolicy)(const int numa_dst_node, const int numa_src_node, const size_t data_size); - - // copy policy specifies the copy-executing nodes for a given task - // which allows flexibility in assignment for optimizing raw throughput - // or choosing a conservative usage policy - typedef std::vector (CopyPolicy)(const int numa_dst_node, const int numa_src_node); - - private: - // mutex for accessing the cache state map - - std::shared_mutex cache_mutex_; - - // map from [dst-numa-node,map2] - // map2 from [data-ptr,cache-structure] - - std::unordered_map> cache_state_; - - CachePolicy* cache_policy_function_ = nullptr; - CopyPolicy* copy_policy_function_ = nullptr; - - dml::handler> ExecuteCopy(const uint8_t* src, uint8_t* dst, const size_t size, const int node) const; - - void SubmitTask(CacheData* task); - - void GetCacheNode(uint8_t* src, const size_t size, int* OUT_DST_NODE, int* OUT_SRC_NODE) 
const; - - void AbortTask(CacheData* task) const; - - std::unique_ptr GetFromCache(uint8_t* src, const size_t size); - - public: - void Init(CachePolicy* cache_policy_function, CopyPolicy* copy_policy_function); - - // function to perform data access through the cache - // behaviour depends on the chosen execution policy - // Immediate and ImmediateNoCache return a cache task - // with guaranteed-valid result value where Relaxed - // policy does not come with this guarantee. - std::unique_ptr Access(uint8_t* data, const size_t size, const ExecutionPolicy policy); - - void Flush(); - }; -} - -inline void offcache::Cache::Init(CachePolicy* cache_policy_function, CopyPolicy* copy_policy_function) { - cache_policy_function_ = cache_policy_function; - copy_policy_function_ = copy_policy_function; - - // initialize numa library - numa_available(); - - const int nodes_max = numa_num_configured_nodes(); - const bitmask* valid_nodes = numa_get_mems_allowed(); - - for (int node = 0; node < nodes_max; node++) { - if (numa_bitmask_isbitset(valid_nodes, node)) { - cache_state_.insert({node,{}}); - } - } - - std::cout << "[-] Cache Initialized" << std::endl; -} - -inline std::unique_ptr offcache::Cache::Access(uint8_t* data, const size_t size, const ExecutionPolicy policy) { - std::unique_ptr task = GetFromCache(data, size); - - if (task != nullptr) { - return std::move(task); - } - - // at this point the requested data is not present in cache - // and we create a caching task for it - - task = std::make_unique(data, size); - - if (policy == ExecutionPolicy::Immediate) { - // in intermediate mode the returned task - // object is guaranteed to be valid and therefore - // its resulting location must be validated - // after which we submit the task - // maybe_result is then set by submit - - task->cache_->store(data); - SubmitTask(task.get()); - return std::move(task); - } - else if (policy == ExecutionPolicy::ImmediateNoCache) { - // for immediatenocache we just validate - // the generated task and return it - // we must also set maybe_result in case - // someone waits on this - - task->cache_->store(data); - task->incomplete_cache_ = data; - return std::move(task); - } - else if (policy == ExecutionPolicy::Relaxed) { - // for relaxed no valid task must be returned - // and therefore we just submit and then give - // the possible invalid task back with only - // maybe_result set by submission - - SubmitTask(task.get()); - return std::move(task); - } - else { - // this should not be reached - } -} - -inline void offcache::Cache::SubmitTask(CacheData* task) { - // get destination numa node for the cache - - int dst_node = -1; - int src_node = -1; - - GetCacheNode(task->src_, task->size_, &dst_node, &src_node); - - std::cout << "[+] Allocating " << task->size_ << "B on node " << dst_node << " for " << std::hex << (uint64_t)task->src_ << std::dec << std::endl; - - // allocate data on this node and flush the unused parts of the - // cache if the operation fails and retry once - // TODO: smarter flush strategy could keep some stuff cached - - uint8_t* dst = reinterpret_cast(numa_alloc_onnode(task->size_, dst_node)); - - if (dst == nullptr) { - std::cout << "[!] 
First allocation try failed for " << task->size_ << "B on node " << dst_node << std::endl; - - Flush(); - - dst = reinterpret_cast(numa_alloc_onnode(task->size_, dst_node)); - - if (dst == nullptr) { - std::cout << "[x] Second allocation try failed for " << task->size_ << "B on node " << dst_node << std::endl; - return; - } - } - - task->incomplete_cache_ = dst; - - // querry copy policy function for the nodes to use for the copy - - const std::vector executing_nodes = copy_policy_function_(dst_node, src_node); - const size_t task_count = executing_nodes.size(); - - // each task will copy one fair part of the total size - // and in case the total size is not a factor of the - // given task count the last node must copy the remainder - - const size_t size = task->size_ / task_count; - const size_t last_size = size + task->size_ % task_count; - - std::cout << "[-] Splitting Copy into " << task_count << " tasks of " << size << "B 0x" << std::hex << (uint64_t)task->src_ << std::dec << std::endl; - - // save the current numa node mask to restore later - // as executing the copy task will place this thread - // on a different node - - bitmask* nodemask = numa_get_run_node_mask(); - - for (uint32_t i = 0; i < task_count; i++) { - const size_t local_size = i + 1 == task_count ? size : last_size; - const size_t local_offset = i * size; - const uint8_t* local_src = task->src_ + local_offset; - uint8_t* local_dst = dst + local_offset; - - task->handlers_->emplace_back(ExecuteCopy(local_src, local_dst, local_size, executing_nodes[i])); - } - - // only at this point may the task be added to the control structure - // because adding it earlier could cause it to be returned for an - // access request while the handler-vector is not fully populated - // which could cause the wait-function to return prematurely - // TODO: this can be optimized because the abort is quite expensive - - { - std::unique_lock lock(cache_mutex_); - - const auto state = cache_state_[dst_node].emplace(task->src_, *task); - - // if state.second is false then no insertion took place - // which means that concurrently whith this thread - // some other thread must have accessed the same - // resource in which case we must perform an abort - // TODO: abort is not the only way to handle this situation - - if (!state.second) { - std::cout << "[x] Found another cache instance for 0x" << std::hex << (uint64_t)task->src_ << std::dec << std::endl; - - AbortTask(task); - - return; - } - } - - // restore the previous nodemask - - numa_run_on_node_mask(nodemask); -} - -inline dml::handler> offcache::Cache::ExecuteCopy(const uint8_t* src, uint8_t* dst, const size_t size, const int node) const { - dml::const_data_view srcv = dml::make_view(src, size); - dml::data_view dstv = dml::make_view(dst, size); - - numa_run_on_node(node); - - return dml::submit(dml::mem_copy.block_on_fault(), srcv, dstv); -} - -inline void offcache::CacheData::WaitOnCompletion() { - if (handlers_ == nullptr) { - std::cout << "[-] Waiting on cache-var-update for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; - - cache_->wait(nullptr); - - std::cout << "[+] Finished waiting on cache-var-update for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; - } - else { - std::cout << "[-] Waiting on handlers for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; - - for (auto& handler : *handlers_) { - auto result = handler.get(); - // TODO: handle the returned status code - } - - handlers_ = nullptr; - - std::cout << "[+] 
Finished waiting on handlers for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; - - cache_->store(incomplete_cache_); - cache_->notify_all(); - } -} - -offcache::CacheData::CacheData(uint8_t* data, const size_t size) { - std::cout << "[-] New CacheData 0x" << std::hex << (uint64_t)data << std::dec << std::endl; - - src_ = data; - size_ = size; - active_ = new std::atomic(1); - cache_ = new std::atomic(); - incomplete_cache_ = nullptr; - handlers_ = std::make_unique>(); -} - -offcache::CacheData::CacheData(const offcache::CacheData& other) { - std::cout << "[-] Copy Created for CacheData 0x" << std::hex << (uint64_t)other.src_ << std::dec << std::endl; - - active_ = other.active_; - const int current_active = active_->fetch_add(1); - - src_ = other.src_; - size_ = other.size_; - cache_ = other.cache_; - incomplete_cache_ = nullptr; - handlers_ = nullptr; -} - -offcache::CacheData::~CacheData() { - std::cout << "[-] Destructor for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; - - // due to fetch_sub returning the preivously held value - // we must subtract one locally to get the current value - - const int32_t v = active_->fetch_sub(1) - 1; - - // if the returned value is zero or lower - // then we must execute proper deletion - // as this was the last reference - - if (v <= 0) { - std::cout << "[!] Full Destructor for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; - - Deallocate(); - delete active_; - delete cache_; - } -} - -void offcache::CacheData::Deallocate() { - std::cout << "[!] Deallocating for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; - - numa_free(cache_, size_); - cache_ = nullptr; - incomplete_cache_ = nullptr; -} - -void offcache::Cache::GetCacheNode(uint8_t* src, const size_t size, int* OUT_DST_NODE, int* OUT_SRC_NODE) const { - // obtain numa node of current thread to determine where the data is needed - - const int current_cpu = sched_getcpu(); - const int current_node = numa_node_of_cpu(current_cpu); - - // obtain node that the given data pointer is allocated on - - *OUT_SRC_NODE = -1; - get_mempolicy(OUT_SRC_NODE, NULL, 0, (void*)src, MPOL_F_NODE | MPOL_F_ADDR); - - // querry cache policy function for the destination numa node - - *OUT_DST_NODE = cache_policy_function_(current_node, *OUT_SRC_NODE, size); -} - -uint8_t* offcache::CacheData::GetDataLocation() const { - return cache_->load(); -} - -bool offcache::CacheData::Active() const { - return active_->load() > 0; -} - -inline void offcache::Cache::Flush() { - std::cout << "[-] Flushing Cache" << std::endl; - - // TODO: there is a better way to implement this flush - - { - std::unique_lock lock(cache_mutex_); - - for (auto& nc : cache_state_) { - auto it = nc.second.begin(); - - while (it != nc.second.end()) { - if (it->second.Active() == false) { - nc.second.erase(it); - it = nc.second.begin(); - } - else { - it++; - } - } - } - } -} - -void offcache::Cache::AbortTask(offcache::CacheData *task) const { - // first wait on all copy operations to be completed - - task->WaitOnCompletion(); - - // abort by doing the following steps - // (1) free the allocated memory, (2) remove the "maybe result" as - // we will not run the caching operation, (3) clear the sub tasks - // for the very same reason, (4) set the result to the RAM-location - - numa_free(task->incomplete_cache_, task->size_); - task->incomplete_cache_ = nullptr; - task->cache_->store(task->src_); - - std::cout << "[-] Abort completed for 0x" << std::hex << 
(uint64_t)task->src_ << std::dec << std::endl; -} - -std::unique_ptr offcache::Cache::GetFromCache(uint8_t* src, const size_t size) { - // the best situation is if this data is already cached - // which we check in an unnamed block in which the cache - // is locked for reading to prevent another thread - // from marking the element we may find as unused and - // clearing it - - int dst_node = -1; - int src_node = -1; - - GetCacheNode(src, size, &dst_node, &src_node); - - std::shared_lock lock(cache_mutex_); - - const auto search = cache_state_[dst_node].find(src); - - if (search != cache_state_[dst_node].end()) { - if (search->second.size_ == size) { - search->second.active_->store(true); - - std::cout << "[+] Found Cached version for 0x" << std::hex << (uint64_t)src << std::dec << std::endl; - - return std::move(std::make_unique(search->second)); - } - else { - std::cout << "[!] Found Cached version with size missmatch for 0x" << std::hex << (uint64_t)src << std::dec << std::endl; - - cache_state_[dst_node].erase(search); - } - } - - return nullptr; -} From 46de3151a2634dbf7eeb75556eb89ee0ff2f669e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Constantin=20F=C3=BCrst?= Date: Wed, 10 Jan 2024 12:29:31 +0100 Subject: [PATCH 08/29] add a lot of comments to the code, also handle errors in the dml handlers gracefully --- offloading-cacher/cache-data.hpp | 131 +++++++++++++++++++++++--- offloading-cacher/cache.hpp | 35 ++++++- offloading-cacher/util/dml-helper.hpp | 60 ++++++++---- 3 files changed, 192 insertions(+), 34 deletions(-) diff --git a/offloading-cacher/cache-data.hpp b/offloading-cacher/cache-data.hpp index 4028597..4de6138 100644 --- a/offloading-cacher/cache-data.hpp +++ b/offloading-cacher/cache-data.hpp @@ -8,6 +8,8 @@ #include +#include "util/dml-helper.hpp" + namespace dsacache { class Cache; @@ -23,57 +25,130 @@ namespace dsacache { using dml_handler = dml::handler>; private: + // data source and size of the block uint8_t* src_; size_t size_; + // global reference counting object std::atomic* active_; - protected: + // global cache-location pointer std::atomic* cache_; + // object-local incomplete cache location pointer + // which is only available in the first instance uint8_t* incomplete_cache_; + // dml handler vector pointer which is only + // available in the first instance std::unique_ptr> handlers_; - friend Cache; + // deallocates the global cache-location + // and invalidates it + void Deallocate(); + + // checks whether there are at least two + // valid references to this object which + // is done as the cache always has one + // internally to any living instance + bool Active() const; + friend Cache; public: CacheData(uint8_t* data, const size_t size); CacheData(const CacheData& other); ~CacheData(); - void Deallocate(); - + // waits on completion of caching operations + // for this task and is safe to be called in + // any state of the object void WaitOnCompletion(); + // returns the cache data location for this + // instance which is valid as long as the + // instance is alive uint8_t* GetDataLocation() const; - - bool Active() const; }; } inline void dsacache::CacheData::WaitOnCompletion() { + // the cache data entry can be in two states + // either it is the original one which has not + // been waited for in which case the handlers + // are non-null or it is not + if (handlers_ == nullptr) { std::cout << "[-] Waiting on cache-var-update for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; + // when no handlers are attached to this cache entry we 
wait on a + // value change for the cache structure from nullptr to non-null + // which will either go through immediately if the cache is valid + // already or wait until the handler-owning thread notifies us + cache_->wait(nullptr); std::cout << "[+] Finished waiting on cache-var-update for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; } else { + // when the handlers are non-null there are some DSA task handlers + // available on which we must wait here + std::cout << "[-] Waiting on handlers for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; + // abort is set if any operation encountered an error + + bool abort = false; + for (auto& handler : *handlers_) { auto result = handler.get(); - // TODO: handle the returned status code + + if (result.status != dml::status_code::ok) { + std::cerr << "[x] Encountered bad status code for operation: " << dml::StatusCodeToString(result.status) << std::endl; + + // if one of the copy tasks failed we abort the whole task + // after all operations are completed on it + + abort = true; + } } + // the handlers are cleared after all have completed + handlers_ = nullptr; - std::cout << "[+] Finished waiting on handlers for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; + // now we act depending on whether an abort has been + // called for which signals operation incomplete + + if (abort) { + // store nullptr in the cache location + + cache_->store(nullptr); + + // then free the now incomplete cache + + // TODO: it would be possible to salvage the + // TODO: operation at this point but this + // TODO: is quite complicated so we just abort + + numa_free(incomplete_cache_, size_); + } + else { + std::cout << "[+] Finished waiting on handlers for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; + + // incomplete cache is now safe to use and therefore we + // swap it with the global cache state of this entry + // and notify potentially waiting threads + + cache_->store(incomplete_cache_); + } + + // as a last step all waiting threads must + // be notified (copies of this will wait on value + // change of the cache) and the incomplete cache + // is cleared to nullptr as it is not incomplete - cache_->store(incomplete_cache_); cache_->notify_all(); + incomplete_cache_ = nullptr; } } @@ -91,12 +166,24 @@ dsacache::CacheData::CacheData(uint8_t* data, const size_t size) { dsacache::CacheData::CacheData(const dsacache::CacheData& other) { std::cout << "[-] Copy Created for CacheData 0x" << std::hex << (uint64_t)other.src_ << std::dec << std::endl; + // we copy the ptr to the global atomic reference counter + // and increase the amount of active references + active_ = other.active_; const int current_active = active_->fetch_add(1); + // source and size will be copied too + // as well as the reference to the global + // atomic cache pointer + src_ = other.src_; size_ = other.size_; cache_ = other.cache_; + + // incomplete cache and handlers will not + // be copied because only the first instance + // will wait on the completion of handlers + incomplete_cache_ = nullptr; handlers_ = nullptr; } @@ -104,6 +191,15 @@ dsacache::CacheData::CacheData(const dsacache::CacheData& other) { dsacache::CacheData::~CacheData() { std::cout << "[-] Destructor for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; + // if this is the first instance of this cache structure + // and it has not been waited on and is now being destroyed + // we must wait on completion here to ensure 
the cache + // remains in a valid state + + if (handlers_ != nullptr) { + WaitOnCompletion(); + } + // due to fetch_sub returning the preivously held value // we must subtract one locally to get the current value @@ -117,6 +213,7 @@ dsacache::CacheData::~CacheData() { std::cout << "[!] Full Destructor for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; Deallocate(); + delete active_; delete cache_; } @@ -125,9 +222,12 @@ dsacache::CacheData::~CacheData() { void dsacache::CacheData::Deallocate() { std::cout << "[!] Deallocating for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; - numa_free(cache_, size_); - cache_ = nullptr; - incomplete_cache_ = nullptr; + // although deallocate should only be called from + // a safe context to do so, it can not hurt to + // defensively perform the operation atomically + + uint8_t* cache_local = cache_->exchange(nullptr); + if (cache_local != nullptr) numa_free(cache_local, size_); } uint8_t* dsacache::CacheData::GetDataLocation() const { @@ -135,5 +235,10 @@ uint8_t* dsacache::CacheData::GetDataLocation() const { } bool dsacache::CacheData::Active() const { - return active_->load() > 0; + // this entry is active if more than one + // reference exists to it, as the Cache + // will always keep one internally until + // the entry is cleared from cache + + return active_->load() > 1; } \ No newline at end of file diff --git a/offloading-cacher/cache.hpp b/offloading-cacher/cache.hpp index 0081a04..f3ef90d 100644 --- a/offloading-cacher/cache.hpp +++ b/offloading-cacher/cache.hpp @@ -46,22 +46,42 @@ namespace dsacache { CachePolicy* cache_policy_function_ = nullptr; CopyPolicy* copy_policy_function_ = nullptr; + // function used to submit a copy task on a specific node to the dml + // engine on that node - will change the current threads node assignment + // to achieve this so take care to restore this dml::handler> ExecuteCopy( const uint8_t* src, uint8_t* dst, const size_t size, const int node ) const; + // allocates the required memory on the destination node + // and then submits task to the dml library for processing + // and attaches the handlers to the cache data structure void SubmitTask(CacheData* task, const int dst_node, const int src_node); + // querries the policy functions for the given data and size + // to obtain destination cache node, also returns the datas + // source node for further usage + // output may depend on the calling threads node assignment + // as this is set as the "optimal placement" node void GetCacheNode(uint8_t* src, const size_t size, int* OUT_DST_NODE, int* OUT_SRC_NODE) const; + // checks whether the cache contains an entry for + // the given data in the given memory node and + // returns it, otherwise returns nullptr std::unique_ptr GetFromCache(uint8_t* src, const size_t size, const int dst_node); public: + // initializes the cache with the two policy functions + // only after this is it safe to use in a threaded environment void Init(CachePolicy* cache_policy_function, CopyPolicy* copy_policy_function); // function to perform data access through the cache std::unique_ptr Access(uint8_t* data, const size_t size); + // flushes the cache of inactive entries + // if node is -1 then the whole cache is + // checked and otherwise the specified + // node - no checks on node validity void Flush(const int node = -1); }; } @@ -71,11 +91,19 @@ inline void dsacache::Cache::Init(CachePolicy* cache_policy_function, CopyPolicy copy_policy_function_ = copy_policy_function; // initialize numa 
library + numa_available(); + // obtain all available nodes + // and those we may allocate + // memory on + const int nodes_max = numa_num_configured_nodes(); const bitmask* valid_nodes = numa_get_mems_allowed(); + // prepare the cache state with entries + // for all given nodes + for (int node = 0; node < nodes_max; node++) { if (numa_bitmask_isbitset(valid_nodes, node)) { cache_state_.insert({node,{}}); @@ -93,6 +121,10 @@ inline std::unique_ptr dsacache::Cache::Access(uint8_t* dat GetCacheNode(data, size, &dst_node, &src_node); + // TODO: at this point it could be beneficial to check whether + // TODO: the given destination node is present as an entry + // TODO: in the cache state to see if it is valid + // check whether the data is already cached std::unique_ptr task = GetFromCache(data, size, dst_node); @@ -149,7 +181,7 @@ inline void dsacache::Cache::SubmitTask(CacheData* task, const int dst_node, con dst = reinterpret_cast(numa_alloc_onnode(task->size_, dst_node)); if (dst == nullptr) { - std::cout << "[x] Second allocation try failed for " << task->size_ << "B on node " << dst_node << std::endl; + std::cerr << "[x] Second allocation try failed for " << task->size_ << "B on node " << dst_node << std::endl; return; } } @@ -188,6 +220,7 @@ inline void dsacache::Cache::SubmitTask(CacheData* task, const int dst_node, con // restore the previous nodemask numa_run_on_node_mask(nodemask); + numa_free_nodemask(nodemask); } inline dml::handler> dsacache::Cache::ExecuteCopy( diff --git a/offloading-cacher/util/dml-helper.hpp b/offloading-cacher/util/dml-helper.hpp index 1686fd1..de92bb7 100644 --- a/offloading-cacher/util/dml-helper.hpp +++ b/offloading-cacher/util/dml-helper.hpp @@ -2,25 +2,45 @@ #include -inline const std::string StatusCodeToString(const dml::status_code code) { - switch(code) { - case dml::status_code::ok: return "ok"; - case dml::status_code::false_predicate: return "false predicate"; - case dml::status_code::partial_completion: return "partial completion"; - case dml::status_code::nullptr_error: return "nullptr error"; - case dml::status_code::bad_size: return "bad size"; - case dml::status_code::bad_length: return "bad length"; - case dml::status_code::inconsistent_size: return "inconsistent size"; - case dml::status_code::dualcast_bad_padding: return "dualcast bad padding"; - case dml::status_code::bad_alignment: return "bad alignment"; - case dml::status_code::buffers_overlapping: return "buffers overlapping"; - case dml::status_code::delta_delta_empty: return "delta delta empty"; - case dml::status_code::batch_overflow: return "batch overflow"; - case dml::status_code::execution_failed: return "execution failed"; - case dml::status_code::unsupported_operation: return "unsupported operation"; - case dml::status_code::queue_busy: return "queue busy"; - case dml::status_code::error: return "unknown error"; - case dml::status_code::config_error: return "config error"; - default: return "unhandled error"; +namespace dml { + inline const std::string StatusCodeToString(const dml::status_code code) { + switch (code) { + case dml::status_code::ok: + return "ok"; + case dml::status_code::false_predicate: + return "false predicate"; + case dml::status_code::partial_completion: + return "partial completion"; + case dml::status_code::nullptr_error: + return "nullptr error"; + case dml::status_code::bad_size: + return "bad size"; + case dml::status_code::bad_length: + return "bad length"; + case dml::status_code::inconsistent_size: + return "inconsistent size"; + case 
dml::status_code::dualcast_bad_padding: + return "dualcast bad padding"; + case dml::status_code::bad_alignment: + return "bad alignment"; + case dml::status_code::buffers_overlapping: + return "buffers overlapping"; + case dml::status_code::delta_delta_empty: + return "delta delta empty"; + case dml::status_code::batch_overflow: + return "batch overflow"; + case dml::status_code::execution_failed: + return "execution failed"; + case dml::status_code::unsupported_operation: + return "unsupported operation"; + case dml::status_code::queue_busy: + return "queue busy"; + case dml::status_code::error: + return "unknown error"; + case dml::status_code::config_error: + return "config error"; + default: + return "unhandled error"; + } } } \ No newline at end of file From 52566fc13b9b1b283b4b3b4187016f8b287904a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Constantin=20F=C3=BCrst?= Date: Wed, 10 Jan 2024 12:31:57 +0100 Subject: [PATCH 09/29] print to cerr for bad states in the test-main --- offloading-cacher/main.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/offloading-cacher/main.cpp b/offloading-cacher/main.cpp index e67eb22..4310d3d 100644 --- a/offloading-cacher/main.cpp +++ b/offloading-cacher/main.cpp @@ -65,7 +65,7 @@ void PerformAccessAndTest(double* src, const size_t size) { std::cout << "Caching did not affect data location." << std::endl; } else if (cached == nullptr) { - std::cout << "Got nullptr from cache." << std::endl; + std::cerr << "Got nullptr from cache." << std::endl; } else { std::cout << "Got different data location from cache." << std::endl; @@ -75,7 +75,7 @@ void PerformAccessAndTest(double* src, const size_t size) { std::cout << "Cached data is correct." << std::endl; } else { - std::cout << "Cached data is wrong." << std::endl; + std::cerr << "Cached data is wrong." << std::endl; } } From 53e05d096c52042274ab92f05c0e4b367b1f6d31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Constantin=20F=C3=BCrst?= Date: Wed, 10 Jan 2024 12:41:32 +0100 Subject: [PATCH 10/29] add even more comments and remove an old code line that modified the reference counter of cache data from the outside --- offloading-cacher/cache-data.hpp | 3 ++- offloading-cacher/cache.hpp | 41 ++++++++++++++++++++++++++++++-- 2 files changed, 41 insertions(+), 3 deletions(-) diff --git a/offloading-cacher/cache-data.hpp b/offloading-cacher/cache-data.hpp index 4de6138..fe02c90 100644 --- a/offloading-cacher/cache-data.hpp +++ b/offloading-cacher/cache-data.hpp @@ -66,7 +66,8 @@ namespace dsacache { // returns the cache data location for this // instance which is valid as long as the - // instance is alive + // instance is alive - !!! this may also + // yield a nullptr !!! uint8_t* GetDataLocation() const; }; } diff --git a/offloading-cacher/cache.hpp b/offloading-cacher/cache.hpp index f3ef90d..8fd8362 100644 --- a/offloading-cacher/cache.hpp +++ b/offloading-cacher/cache.hpp @@ -255,23 +255,45 @@ void dsacache::Cache::GetCacheNode(uint8_t* src, const size_t size, int* OUT_DST inline void dsacache::Cache::Flush(const int node) { std::cout << "[-] Flushing Cache for " << (node == -1 ? 
"all nodes" : "node " + std::to_string(node)) << std::endl; + // this lambda is used because below we have two code paths that + // flush nodes, either one single or all successively + const auto FlushNode = [](std::unordered_map& map) { + // begin at the front of the map + auto it = map.begin(); + // loop until we reach the end of the map + while (it != map.end()) { + // if the iterator points to an inactive element + // then we may erase it + if (it->second.Active() == false) { + // erase the iterator from the map + map.erase(it); + + // as the erasure invalidated out iterator + // we must start at the beginning again + it = map.begin(); } else { + // if element is active just move over to the next one + it++; } } }; { + // we require exclusive lock as we modify the cache state + std::unique_lock lock(cache_mutex_); + // node == -1 means that cache on all nodes should be flushed + if (node == -1) { for (auto& nc : cache_state_) { FlushNode(nc.second); @@ -290,21 +312,36 @@ std::unique_ptr dsacache::Cache::GetFromCache(uint8_t* src, // from marking the element we may find as unused and // clearing it + // lock the cache state in shared-mode because we read + std::shared_lock lock(cache_mutex_); + // search for the data in our cache state structure at the given node + const auto search = cache_state_[dst_node].find(src); + // if the data is in our structure we continue + if (search != cache_state_[dst_node].end()) { - if (search->second.size_ == size) { - search->second.active_->store(true); + // now check whether the sizes match + // TODO: second.size_ >= size would also work + + if (search->second.size_ == size) { std::cout << "[+] Found Cached version for 0x" << std::hex << (uint64_t)src << std::dec << std::endl; + // return a unique copy of the entry which uses the object + // lifetime and destructor to safely handle deallocation + return std::move(std::make_unique(search->second)); } else { std::cout << "[!] 
Found Cached version with size mismatch for 0x" << std::hex << (uint64_t)src << std::dec << std::endl; + // if the sizes mismatch then we clear the current entry from cache + // which will cause its deletion only after the last possible outside + // reference is also destroyed + cache_state_[dst_node].erase(search); } } From 9c06bd4fa90aafec7c438580d6b07929b25792f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Constantin=20F=C3=BCrst?= Date: Wed, 10 Jan 2024 12:53:10 +0100 Subject: [PATCH 11/29] add class-definition comments and clear some double-newlines --- offloading-cacher/cache-data.hpp | 15 ++++++++------- offloading-cacher/cache.hpp | 10 ++++++---- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/offloading-cacher/cache-data.hpp b/offloading-cacher/cache-data.hpp index fe02c90..95865ca 100644 --- a/offloading-cacher/cache-data.hpp +++ b/offloading-cacher/cache-data.hpp @@ -13,13 +13,14 @@ namespace dsacache { class Cache; - // the cache task structure will be used to submit and - // control a cache element, while providing source pointer - // and size in bytes for submission - // - // then the submitting thread may wait on the atomic "result" - // which will be notified by the cache worker upon processing - // after which the atomic-bool-ptr active will also become valid + // cache data holds all required information on + // one cache entry and will both be stored + // internally by the cache and handed out + // as copies to the user + // this class uses its object lifetime and + // a global reference counter to allow + // thread-safe copies and resource management + class CacheData { public: using dml_handler = dml::handler>; diff --git a/offloading-cacher/cache.hpp b/offloading-cacher/cache.hpp index 8fd8362..952dd47 100644 --- a/offloading-cacher/cache.hpp +++ b/offloading-cacher/cache.hpp @@ -16,8 +16,12 @@ #include "cache-data.hpp" namespace dsacache { - // singleton which holds the cache workers - // and is the place where work will be submited + // cache class will handle access to data through the cache + // by managing the cache through work submission, it sticks + // to user-defined caching and copy policies, is thread + // safe after initialization and returns copies of + // cache data class to the user + class Cache { public: // cache policy is defined as a type here to allow flexible usage of the cacher @@ -234,7 +238,6 @@ inline dml::handler> dsacache:: return dml::submit(dml::mem_copy.block_on_fault(), srcv, dstv); } - void dsacache::Cache::GetCacheNode(uint8_t* src, const size_t size, int* OUT_DST_NODE, int* OUT_SRC_NODE) const { // obtain numa node of current thread to determine where the data is needed @@ -251,7 +254,6 @@ void dsacache::Cache::GetCacheNode(uint8_t* src, const size_t size, int* OUT_DST *OUT_DST_NODE = cache_policy_function_(current_node, *OUT_SRC_NODE, size); } - inline void dsacache::Cache::Flush(const int node) { std::cout << "[-] Flushing Cache for " << (node == -1 ?
"all nodes" : "node " + std::to_string(node)) << std::endl; From e5b96727cd65e30358a5fa58c38ccd32cd576d7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Constantin=20F=C3=BCrst?= Date: Wed, 10 Jan 2024 14:24:10 +0100 Subject: [PATCH 12/29] add missing inline specifier to functions as this is header-only code --- offloading-cacher/cache-data.hpp | 12 ++++++------ offloading-cacher/cache.hpp | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/offloading-cacher/cache-data.hpp b/offloading-cacher/cache-data.hpp index 95865ca..1958519 100644 --- a/offloading-cacher/cache-data.hpp +++ b/offloading-cacher/cache-data.hpp @@ -154,7 +154,7 @@ inline void dsacache::CacheData::WaitOnCompletion() { } } -dsacache::CacheData::CacheData(uint8_t* data, const size_t size) { +inline dsacache::CacheData::CacheData(uint8_t* data, const size_t size) { std::cout << "[-] New CacheData 0x" << std::hex << (uint64_t)data << std::dec << std::endl; src_ = data; @@ -165,7 +165,7 @@ dsacache::CacheData::CacheData(uint8_t* data, const size_t size) { handlers_ = std::make_unique>(); } -dsacache::CacheData::CacheData(const dsacache::CacheData& other) { +inline dsacache::CacheData::CacheData(const dsacache::CacheData& other) { std::cout << "[-] Copy Created for CacheData 0x" << std::hex << (uint64_t)other.src_ << std::dec << std::endl; // we copy the ptr to the global atomic reference counter @@ -190,7 +190,7 @@ dsacache::CacheData::CacheData(const dsacache::CacheData& other) { handlers_ = nullptr; } -dsacache::CacheData::~CacheData() { +inline dsacache::CacheData::~CacheData() { std::cout << "[-] Destructor for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; // if this is the first instance of this cache structure @@ -221,7 +221,7 @@ dsacache::CacheData::~CacheData() { } } -void dsacache::CacheData::Deallocate() { +inline void dsacache::CacheData::Deallocate() { std::cout << "[!] 
Deallocating for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; // although deallocate should only be called from @@ -232,11 +232,11 @@ void dsacache::CacheData::Deallocate() { if (cache_local != nullptr) numa_free(cache_local, size_); } -uint8_t* dsacache::CacheData::GetDataLocation() const { +inline uint8_t* dsacache::CacheData::GetDataLocation() const { return cache_->load(); } -bool dsacache::CacheData::Active() const { +inline bool dsacache::CacheData::Active() const { // this entry is active if more than one // reference exists to it, as the Cache // will always keep one internally until diff --git a/offloading-cacher/cache.hpp b/offloading-cacher/cache.hpp index 952dd47..22b23f8 100644 --- a/offloading-cacher/cache.hpp +++ b/offloading-cacher/cache.hpp @@ -238,7 +238,7 @@ inline dml::handler> dsacache:: return dml::submit(dml::mem_copy.block_on_fault(), srcv, dstv); } -void dsacache::Cache::GetCacheNode(uint8_t* src, const size_t size, int* OUT_DST_NODE, int* OUT_SRC_NODE) const { +inline void dsacache::Cache::GetCacheNode(uint8_t* src, const size_t size, int* OUT_DST_NODE, int* OUT_SRC_NODE) const { // obtain numa node of current thread to determine where the data is needed const int current_cpu = sched_getcpu(); @@ -307,7 +307,7 @@ inline void dsacache::Cache::Flush(const int node) { } } -std::unique_ptr dsacache::Cache::GetFromCache(uint8_t* src, const size_t size, const int dst_node) { +inline std::unique_ptr dsacache::Cache::GetFromCache(uint8_t* src, const size_t size, const int dst_node) { // the best situation is if this data is already cached // which we check in an unnamed block in which the cache // is locked for reading to prevent another thread From e3e17cec7b0d6b6cecdbc740faf72f2635eac5e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Constantin=20F=C3=BCrst?= Date: Wed, 10 Jan 2024 17:44:52 +0100 Subject: [PATCH 13/29] extend the main method of the small supplied test program to allow running on multiple threads --- offloading-cacher/CMakeLists.txt | 5 +- offloading-cacher/main.cpp | 138 ++++++++++++++++++++++++------- 2 files changed, 110 insertions(+), 33 deletions(-) diff --git a/offloading-cacher/CMakeLists.txt b/offloading-cacher/CMakeLists.txt index 7b4844a..19ddbdd 100755 --- a/offloading-cacher/CMakeLists.txt +++ b/offloading-cacher/CMakeLists.txt @@ -1,12 +1,13 @@ cmake_minimum_required(VERSION 3.18) -project(offloading-cacher) +project(offloading-cacher LANGUAGES CXX) set(CMAKE_CXX_STANDARD 20) list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake/modules") find_package(NUMA REQUIRED) +find_package(OpenMP REQUIRED) set(DML_SOURCE_DIR "../../DML/include/") set(SOURCES main.cpp) @@ -14,6 +15,6 @@ set(SOURCES main.cpp) add_executable(offloading-cacher ${SOURCES}) target_include_directories(offloading-cacher PRIVATE ${CMAKE_SOURCE_DIR} ${NUMA_INCLUDE_DIRS} ${DML_SOURCE_DIR}) -target_link_libraries(offloading-cacher PRIVATE libdml.a pthread ${CMAKE_DL_LIBS} ${NUMA_LIBRARY}) +target_link_libraries(offloading-cacher PRIVATE libdml.a pthread ${CMAKE_DL_LIBS} ${NUMA_LIBRARY} OpenMP::OpenMP_CXX) install(TARGETS offloading-cacher DESTINATION ${CMAKE_INSTALL_PREFIX}) diff --git a/offloading-cacher/main.cpp b/offloading-cacher/main.cpp index 4310d3d..08640dc 100644 --- a/offloading-cacher/main.cpp +++ b/offloading-cacher/main.cpp @@ -1,10 +1,49 @@ #include #include +#include +#include + +#include #include "cache.hpp" dsacache::Cache CACHE; +void InitCache(const std::string& device) { + if (device == "default") { + auto cache_policy = [](const int 
numa_dst_node, const int numa_src_node, const size_t data_size) { + return numa_dst_node; + }; + + auto copy_policy = [](const int numa_dst_node, const int numa_src_node) { + return std::vector{ numa_src_node, numa_dst_node }; + }; + + CACHE.Init(cache_policy,copy_policy); + } + else if (device == "xeonmax") { + auto cache_policy = [](const int numa_dst_node, const int numa_src_node, const size_t data_size) { + return numa_dst_node < 8 ? numa_dst_node + 8 : numa_dst_node; + }; + + auto copy_policy = [](const int numa_dst_node, const int numa_src_node) { + const bool same_socket = ((numa_dst_node ^ numa_src_node) & 4) == 0; + if (same_socket) { + const bool socket_number = numa_dst_node >> 2; + if (socket_number == 0) return std::vector{ 0, 1, 2, 3 }; + else return std::vector{ 4, 5, 6, 7 }; + } + else return std::vector{ numa_src_node, numa_dst_node }; + }; + + CACHE.Init(cache_policy,copy_policy); + } + else { + std::cerr << "Given device '" << device << "' not supported!" << std::endl; + exit(-1); + } +} + double* GetRandomArray(const size_t size) { double* array = new double[size]; @@ -31,7 +70,7 @@ bool IsEqual(const double* a, const double* b, const size_t size) { return true; } -void PerformAccessAndTest(double* src, const size_t size) { +void PerformAccessAndTest(double* src, const size_t size, const int tid) { std::unique_ptr data_cache = CACHE.Access( reinterpret_cast(src), size * sizeof(double) @@ -42,13 +81,13 @@ void PerformAccessAndTest(double* src, const size_t size) { // check the value immediately just to see if ram or cache was returned if (src == cached_imm) { - std::cout << "Caching did not immediately yield different data location." << std::endl; + std::cout << "[" << tid << "] Caching did not immediately yield different data location." << std::endl; } else if (cached_imm == nullptr) { - std::cout << "Immediately got nullptr." << std::endl; + std::cout << "[" << tid << "] Immediately got nullptr." << std::endl; } else { - std::cout << "Immediately got different data location." << std::endl; + std::cout << "[" << tid << "] Immediately got different data location." << std::endl; } // waits for the completion of the asynchronous caching operation @@ -62,56 +101,93 @@ void PerformAccessAndTest(double* src, const size_t size) { // tests on the resulting value if (src == cached) { - std::cout << "Caching did not affect data location." << std::endl; + std::cout << "[" << tid << "] Caching did not affect data location." << std::endl; } else if (cached == nullptr) { - std::cerr << "Got nullptr from cache." << std::endl; + std::cerr << "[" << tid << "] Got nullptr from cache." << std::endl; } else { - std::cout << "Got different data location from cache." << std::endl; + std::cout << "[" << tid << "] Got different data location from cache." << std::endl; } if (IsEqual(src,cached,size)) { - std::cout << "Cached data is correct." << std::endl; + std::cout << "[" << tid << "] Cached data is correct." << std::endl; } else { - std::cerr << "Cached data is wrong." << std::endl; + std::cerr << "[" << tid << "] Cached data is wrong." 
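// Worked example (editor's sketch) of the "xeonmax" policies defined above.
// The node numbering is an assumption inferred from the lambdas: DDR nodes
// 0-7 are paired with HBM nodes 8-15 and bit 2 of the node id selects the
// socket; the function names are made up for the example.

#include <cassert>
#include <cstddef>
#include <vector>

int XeonMaxCachePolicy(const int numa_dst_node, const int numa_src_node, const size_t data_size) {
    return numa_dst_node < 8 ? numa_dst_node + 8 : numa_dst_node;
}

std::vector<int> XeonMaxCopyPolicy(const int numa_dst_node, const int numa_src_node) {
    const bool same_socket = ((numa_dst_node ^ numa_src_node) & 4) == 0;
    if (same_socket) {
        return (numa_dst_node >> 2) == 0 ? std::vector<int>{ 0, 1, 2, 3 } : std::vector<int>{ 4, 5, 6, 7 };
    }
    return std::vector<int>{ numa_src_node, numa_dst_node };
}

int main() {
    // DDR node 2 is cached on its HBM partner node 10
    assert(XeonMaxCachePolicy(2, 5, 1024) == 10);

    // same socket: the copy is spread over all four DDR nodes of that socket
    assert(XeonMaxCopyPolicy(2, 1) == (std::vector<int>{ 0, 1, 2, 3 }));

    // cross socket: only source and destination node participate
    assert(XeonMaxCopyPolicy(2, 5) == (std::vector<int>{ 5, 2 }));
}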
<< std::endl; } } -int main(int argc, char **argv) { +void RunTestST(const size_t size) { + double* data = GetRandomArray(size); - // given numa destination and source node and the size of the data - // this function decides on which the data will be placed - // which is used to select the HBM-node for the dst-node if desired + static constexpr int tid = 0; - auto cache_policy = [](const int numa_dst_node, const int numa_src_node, const size_t data_size) { - return numa_dst_node; - }; + std::cout << "[" << tid << "] first access --- " << std::endl; - // this function receives the memory source and destination node - // and then decides, on which nodes the copy operation will be split + PerformAccessAndTest(data, size, tid); - auto copy_policy = [](const int numa_dst_node, const int numa_src_node) { - return std::vector{ numa_src_node, numa_dst_node }; - }; + std::cout << "[" << tid << "] second access --- " << std::endl; - // initializes the cache with the two policies + PerformAccessAndTest(data, size, tid); - CACHE.Init(cache_policy,copy_policy); + std::cout << "[" << tid << "] end of application --- " << std::endl; +} - // generate the test data +void RunTestMT(const size_t size) { + double* data = GetRandomArray(size); - static constexpr size_t data_size = 1024 * 1024; - double* data = GetRandomArray(data_size); + #pragma omp parallel + { + const int tid = omp_get_thread_num(); - std::cout << "--- first access --- " << std::endl; + std::cout << "[" << tid << "] first access --- " << std::endl; - PerformAccessAndTest(data, data_size); + PerformAccessAndTest(data, size, tid); - std::cout << "--- second access --- " << std::endl; + std::cout << "[" << tid << "] second access --- " << std::endl; - PerformAccessAndTest(data, data_size); + PerformAccessAndTest(data, size, tid); - std::cout << "--- end of application --- " << std::endl; + std::cout << "[" << tid << "] end of block --- " << std::endl; + } +} + +int main(int argc, char **argv) { + if (argc != 4) { + std::cerr << "This application requires four parameters!" << std::endl; + + std::cout << "Please provide the following positional arguments: [device] [mode] [size]" << std::endl; + std::cout << "[device] from { default, xeonmax } which influences cache and execution placement" << std::endl; + std::cout << "[mode] from { st, mt } or single and multi threaded respectively" << std::endl; + std::cout << "[size] positive integral number, amount of float64 in data array" << std::endl; + + exit(-1); + } + + const std::string device = argv[1]; + const std::string mode = argv[2]; + const std::string size_s = argv[3]; + + uint32_t size = 0; + + try { + size = std::stoul(size_s); + } + catch (...) { + std::cerr << "Given Size '" << size_s << "' caused error during conversion to number!" << std::endl; + } + + InitCache(device); + + if (mode == "st") { + RunTestST(size); + } + else if (mode == "mt") { + RunTestMT(size); + } + else { + std::cerr << "Given Mode '" << mode << "' not supported!" 
<< std::endl; + exit(-1); + } } From 4ddd96adcb76ea711e536767cbbd4129ed2a25b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Constantin=20F=C3=BCrst?= Date: Wed, 10 Jan 2024 17:57:25 +0100 Subject: [PATCH 14/29] remove extra whitespace from output in main function --- offloading-cacher/main.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/offloading-cacher/main.cpp b/offloading-cacher/main.cpp index 08640dc..2302493 100644 --- a/offloading-cacher/main.cpp +++ b/offloading-cacher/main.cpp @@ -123,15 +123,15 @@ void RunTestST(const size_t size) { static constexpr int tid = 0; - std::cout << "[" << tid << "] first access --- " << std::endl; + std::cout << "[" << tid << "] first access --- " << std::endl; PerformAccessAndTest(data, size, tid); - std::cout << "[" << tid << "] second access --- " << std::endl; + std::cout << "[" << tid << "] second access --- " << std::endl; PerformAccessAndTest(data, size, tid); - std::cout << "[" << tid << "] end of application --- " << std::endl; + std::cout << "[" << tid << "] end of application --- " << std::endl; } void RunTestMT(const size_t size) { From 7dfbed68feba4de8fe44aa205fc7437c92d5fd92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Constantin=20F=C3=BCrst?= Date: Wed, 10 Jan 2024 19:04:44 +0100 Subject: [PATCH 15/29] handle allocation slightly different, introduce a separate function for cleaner code that does on-node memory allocation, first querry the available size and do not rely on numa_alloc_onnode to report nullptr if the size is not really available --- offloading-cacher/cache.hpp | 56 +++++++++++++++++++++++++++++-------- 1 file changed, 44 insertions(+), 12 deletions(-) diff --git a/offloading-cacher/cache.hpp b/offloading-cacher/cache.hpp index 22b23f8..cce0439 100644 --- a/offloading-cacher/cache.hpp +++ b/offloading-cacher/cache.hpp @@ -69,6 +69,12 @@ namespace dsacache { // as this is set as the "optimal placement" node void GetCacheNode(uint8_t* src, const size_t size, int* OUT_DST_NODE, int* OUT_SRC_NODE) const; + // allocates memory of size "size" on the numa node "node" + // and returns nullptr if this is not possible, also may + // try to flush the cache of the requested node to + // alleviate encountered shortage + uint8_t* AllocOnNode(const size_t size, const int node); + // checks whether the cache contains an entry for // the given data in the given memory node and // returns it, otherwise returns nullptr @@ -164,32 +170,58 @@ inline std::unique_ptr dsacache::Cache::Access(uint8_t* dat return std::move(task); } -inline void dsacache::Cache::SubmitTask(CacheData* task, const int dst_node, const int src_node) { - std::cout << "[+] Allocating " << task->size_ << "B on node " << dst_node << " for " << std::hex << (uint64_t)task->src_ << std::dec << std::endl; - +inline uint8_t* dsacache::Cache::AllocOnNode(const size_t size, const int node) { // allocate data on this node and flush the unused parts of the // cache if the operation fails and retry once // TODO: smarter flush strategy could keep some stuff cached - uint8_t* dst = reinterpret_cast(numa_alloc_onnode(task->size_, dst_node)); + // check currently free memory to see if the data fits - if (dst == nullptr) { - std::cout << "[!] First allocation try failed for " << task->size_ << "B on node " << dst_node << std::endl; + long long int free_space = 0; + numa_node_size64(node, &free_space); + + if (free_space < size) { + std::cout << "[!] 
Memory shortage when allocating " << size << "B on node " << node << std::endl; - // allocation on dst_node failed so we flush the cache for this + // dst node lacks memory space so we flush the cache for this // node hoping to free enough currently unused entries to make // the second allocation attempt successful - Flush(dst_node); + Flush(node); - dst = reinterpret_cast(numa_alloc_onnode(task->size_, dst_node)); + // re-test by getting the free space and checking again - if (dst == nullptr) { - std::cerr << "[x] Second allocation try failed for " << task->size_ << "B on node " << dst_node << std::endl; - return; + numa_node_size64(node, &free_space); + + if (free_space < size) { + std::cout << "[x] Memory shortage after flush when allocating " << size << "B on node " << node << std::endl; + + return nullptr; } } + uint8_t* dst = reinterpret_cast(numa_alloc_onnode(size, node)); + + if (dst == nullptr) { + std::cout << "[x] Allocation try failed for " << size << "B on node " << node << std::endl; + + return nullptr; + } + + return dst; +} + +inline void dsacache::Cache::SubmitTask(CacheData* task, const int dst_node, const int src_node) { + std::cout << "[+] Allocating " << task->size_ << "B on node " << dst_node << " for " << std::hex << (uint64_t)task->src_ << std::dec << std::endl; + + + uint8_t* dst = AllocOnNode(task->size_, dst_node); + + if (dst == nullptr) { + std::cout << "[x] Allocation failed so we can not cache" << std::endl; + return; + } + task->incomplete_cache_ = dst; // querry copy policy function for the nodes to use for the copy From 6ab88595b7cd0e0f0d910cbb54846d61f7e03688 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Constantin=20F=C3=BCrst?= Date: Wed, 10 Jan 2024 19:05:16 +0100 Subject: [PATCH 16/29] add test for the cache-flush logic which was previously not tested --- offloading-cacher/main.cpp | 61 ++++++++++++++++++++++++++++++-------- 1 file changed, 48 insertions(+), 13 deletions(-) diff --git a/offloading-cacher/main.cpp b/offloading-cacher/main.cpp index 2302493..443b00b 100644 --- a/offloading-cacher/main.cpp +++ b/offloading-cacher/main.cpp @@ -44,10 +44,10 @@ void InitCache(const std::string& device) { } } -double* GetRandomArray(const size_t size) { - double* array = new double[size]; +uint8_t* GetRandomArray(const size_t size) { + uint8_t* array = new uint8_t[size]; - std::uniform_real_distribution unif(std::numeric_limits::min(), std::numeric_limits::max()); + std::uniform_int_distribution unif(std::numeric_limits::min(), std::numeric_limits::max()); std::default_random_engine re; for (size_t i = 0; i < size; i++) { @@ -57,7 +57,7 @@ double* GetRandomArray(const size_t size) { return array; } -bool IsEqual(const double* a, const double* b, const size_t size) { +bool IsEqual(const uint8_t* a, const uint8_t* b, const size_t size) { for (size_t i = 0; i < size; i++) { try { if (a[i] != b[i]) return false; @@ -70,13 +70,13 @@ bool IsEqual(const double* a, const double* b, const size_t size) { return true; } -void PerformAccessAndTest(double* src, const size_t size, const int tid) { +std::unique_ptr PerformAccessAndTest(uint8_t* src, const size_t size, const int tid) { std::unique_ptr data_cache = CACHE.Access( reinterpret_cast(src), - size * sizeof(double) + size * sizeof(uint8_t) ); - double* cached_imm = reinterpret_cast(data_cache->GetDataLocation()); + uint8_t* cached_imm = reinterpret_cast(data_cache->GetDataLocation()); // check the value immediately just to see if ram or cache was returned @@ -96,7 +96,7 @@ void PerformAccessAndTest(double* src, const 
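// Clarifying sketch (editor's note, not part of the patch): the availability
// check above relies on the libnuma call numa_node_size64(), which returns
// the total memory of a node and writes the currently free byte count into
// its second argument.

#include <cstdio>
#include <numa.h>

void PrintNodeMemory(const int node) {
    long long free_bytes = 0;
    const long long total_bytes = numa_node_size64(node, &free_bytes);

    std::printf("node %d: %lld bytes total, %lld bytes free\n", node, total_bytes, free_bytes);
}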
size_t size, const int tid) { // gets the cache-data-location from the struct - double* cached = reinterpret_cast(data_cache->GetDataLocation()); + uint8_t* cached = reinterpret_cast(data_cache->GetDataLocation()); // tests on the resulting value @@ -116,10 +116,12 @@ void PerformAccessAndTest(double* src, const size_t size, const int tid) { else { std::cerr << "[" << tid << "] Cached data is wrong." << std::endl; } + + return std::move(data_cache); } void RunTestST(const size_t size) { - double* data = GetRandomArray(size); + uint8_t* data = GetRandomArray(size); static constexpr int tid = 0; @@ -135,7 +137,7 @@ void RunTestST(const size_t size) { } void RunTestMT(const size_t size) { - double* data = GetRandomArray(size); + uint8_t* data = GetRandomArray(size); #pragma omp parallel { @@ -153,14 +155,44 @@ void RunTestMT(const size_t size) { } } +void RunTestFlush(const size_t size) { + uint8_t* data1 = GetRandomArray(size); + uint8_t* data2 = GetRandomArray(size); + uint8_t* data3 = GetRandomArray(size); + + static constexpr int tid = 0; + + std::cout << "[" << tid << "] first access to data d1 and keepalive --- " << std::endl; + + const auto c1 = PerformAccessAndTest(data1, size, tid); + + std::cout << "[" << tid << "] second access to d2 lets d2 vanish --- " << std::endl; + + PerformAccessAndTest(data2, size, tid); + + std::cout << "[" << tid << "] third access to d3 should clear d2 --- " << std::endl; + + PerformAccessAndTest(data3, size, tid); + + std::cout << "[" << tid << "] end of block and test d1 == cache1 --- " << std::endl; + + if (IsEqual(data1, c1->GetDataLocation(), size)) { + std::cout << "[" << tid << "] Cached d1 is still correct." << std::endl; + } + else { + std::cerr << "[" << tid << "] Cached d1 is bad." << std::endl; + } +} + int main(int argc, char **argv) { if (argc != 4) { - std::cerr << "This application requires four parameters!" << std::endl; + std::cerr << "This application requires three parameters!" << std::endl; std::cout << "Please provide the following positional arguments: [device] [mode] [size]" << std::endl; std::cout << "[device] from { default, xeonmax } which influences cache and execution placement" << std::endl; - std::cout << "[mode] from { st, mt } or single and multi threaded respectively" << std::endl; - std::cout << "[size] positive integral number, amount of float64 in data array" << std::endl; + std::cout << "[mode] from { st, mt, flt } or single and multi threaded and flushtest respectively" << std::endl; + std::cout << "[size] positive integral number, amount of bytes in data array" << std::endl; + std::cout << "for flushtest the given size should be 1/3 of the available cache size" << std::endl; exit(-1); } @@ -186,6 +218,9 @@ int main(int argc, char **argv) { else if (mode == "mt") { RunTestMT(size); } + else if (mode == "flt") { + RunTestFlush(size); + } else { std::cerr << "Given Mode '" << mode << "' not supported!" 
<< std::endl; exit(-1); From 4fa5ef65227294b6e755bca45eae1eb6a82be7cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Constantin=20F=C3=BCrst?= Date: Wed, 10 Jan 2024 19:21:44 +0100 Subject: [PATCH 17/29] accept existing cache if the cached block is larger than the requested view --- offloading-cacher/cache.hpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/offloading-cacher/cache.hpp b/offloading-cacher/cache.hpp index cce0439..50e9c29 100644 --- a/offloading-cacher/cache.hpp +++ b/offloading-cacher/cache.hpp @@ -359,9 +359,8 @@ inline std::unique_ptr dsacache::Cache::GetFromCache(uint8_ if (search != cache_state_[dst_node].end()) { // now check whether the sizes match - // TODO: second.size_ >= size would also work - if (search->second.size_ == size) { + if (search->second.size_ >= size) { std::cout << "[+] Found Cached version for 0x" << std::hex << (uint64_t)src << std::dec << std::endl; // return a unique copy of the entry which uses the object From d7c5c55208b3f65c5ebe621cb69585bfaf751d81 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Constantin=20F=C3=BCrst?= Date: Mon, 15 Jan 2024 13:16:46 +0100 Subject: [PATCH 18/29] turn library into single-header mode --- offloading-cacher/cache-data.hpp | 246 ----------------------- offloading-cacher/cache.hpp | 276 +++++++++++++++++++++++++- offloading-cacher/util/dml-helper.hpp | 46 ----- 3 files changed, 275 insertions(+), 293 deletions(-) delete mode 100644 offloading-cacher/cache-data.hpp delete mode 100644 offloading-cacher/util/dml-helper.hpp diff --git a/offloading-cacher/cache-data.hpp b/offloading-cacher/cache-data.hpp deleted file mode 100644 index 1958519..0000000 --- a/offloading-cacher/cache-data.hpp +++ /dev/null @@ -1,246 +0,0 @@ -#pragma once - -#include - -#include -#include -#include - -#include - -#include "util/dml-helper.hpp" - -namespace dsacache { - class Cache; - - // cache data holds all required information on - // one cache entry and will both be stored - // internally by the cache and handed out - // as copies to the user - // this class uses its object lifetime and - // a global reference counter to allow - // thread-safe copies and resource management - - class CacheData { - public: - using dml_handler = dml::handler>; - - private: - // data source and size of the block - uint8_t* src_; - size_t size_; - - // global reference counting object - std::atomic* active_; - - // global cache-location pointer - std::atomic* cache_; - - // object-local incomplete cache location pointer - // which is only available in the first instance - uint8_t* incomplete_cache_; - - // dml handler vector pointer which is only - // available in the first instance - std::unique_ptr> handlers_; - - // deallocates the global cache-location - // and invalidates it - void Deallocate(); - - // checks whether there are at least two - // valid references to this object which - // is done as the cache always has one - // internally to any living instance - bool Active() const; - - friend Cache; - public: - CacheData(uint8_t* data, const size_t size); - CacheData(const CacheData& other); - ~CacheData(); - - // waits on completion of caching operations - // for this task and is safe to be called in - // any state of the object - void WaitOnCompletion(); - - // returns the cache data location for this - // instance which is valid as long as the - // instance is alive - !!! this may also - // yield a nullptr !!! 
- uint8_t* GetDataLocation() const; - }; -} - -inline void dsacache::CacheData::WaitOnCompletion() { - // the cache data entry can be in two states - // either it is the original one which has not - // been waited for in which case the handlers - // are non-null or it is not - - if (handlers_ == nullptr) { - std::cout << "[-] Waiting on cache-var-update for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; - - // when no handlers are attached to this cache entry we wait on a - // value change for the cache structure from nullptr to non-null - // which will either go through immediately if the cache is valid - // already or wait until the handler-owning thread notifies us - - cache_->wait(nullptr); - - std::cout << "[+] Finished waiting on cache-var-update for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; - } - else { - // when the handlers are non-null there are some DSA task handlers - // available on which we must wait here - - std::cout << "[-] Waiting on handlers for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; - - // abort is set if any operation encountered an error - - bool abort = false; - - for (auto& handler : *handlers_) { - auto result = handler.get(); - - if (result.status != dml::status_code::ok) { - std::cerr << "[x] Encountered bad status code for operation: " << dml::StatusCodeToString(result.status) << std::endl; - - // if one of the copy tasks failed we abort the whole task - // after all operations are completed on it - - abort = true; - } - } - - // the handlers are cleared after all have completed - - handlers_ = nullptr; - - // now we act depending on whether an abort has been - // called for which signals operation incomplete - - if (abort) { - // store nullptr in the cache location - - cache_->store(nullptr); - - // then free the now incomplete cache - - // TODO: it would be possible to salvage the - // TODO: operation at this point but this - // TODO: is quite complicated so we just abort - - numa_free(incomplete_cache_, size_); - } - else { - std::cout << "[+] Finished waiting on handlers for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; - - // incomplete cache is now safe to use and therefore we - // swap it with the global cache state of this entry - // and notify potentially waiting threads - - cache_->store(incomplete_cache_); - } - - // as a last step all waiting threads must - // be notified (copies of this will wait on value - // change of the cache) and the incomplete cache - // is cleared to nullptr as it is not incomplete - - cache_->notify_all(); - incomplete_cache_ = nullptr; - } -} - -inline dsacache::CacheData::CacheData(uint8_t* data, const size_t size) { - std::cout << "[-] New CacheData 0x" << std::hex << (uint64_t)data << std::dec << std::endl; - - src_ = data; - size_ = size; - active_ = new std::atomic(1); - cache_ = new std::atomic(); - incomplete_cache_ = nullptr; - handlers_ = std::make_unique>(); -} - -inline dsacache::CacheData::CacheData(const dsacache::CacheData& other) { - std::cout << "[-] Copy Created for CacheData 0x" << std::hex << (uint64_t)other.src_ << std::dec << std::endl; - - // we copy the ptr to the global atomic reference counter - // and increase the amount of active references - - active_ = other.active_; - const int current_active = active_->fetch_add(1); - - // source and size will be copied too - // as well as the reference to the global - // atomic cache pointer - - src_ = other.src_; - size_ = other.size_; - cache_ = 
other.cache_; - - // incomplete cache and handlers will not - // be copied because only the first instance - // will wait on the completion of handlers - - incomplete_cache_ = nullptr; - handlers_ = nullptr; -} - -inline dsacache::CacheData::~CacheData() { - std::cout << "[-] Destructor for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; - - // if this is the first instance of this cache structure - // and it has not been waited on and is now being destroyed - // we must wait on completion here to ensure the cache - // remains in a valid state - - if (handlers_ != nullptr) { - WaitOnCompletion(); - } - - // due to fetch_sub returning the preivously held value - // we must subtract one locally to get the current value - - const int32_t v = active_->fetch_sub(1) - 1; - - // if the returned value is zero or lower - // then we must execute proper deletion - // as this was the last reference - - if (v <= 0) { - std::cout << "[!] Full Destructor for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; - - Deallocate(); - - delete active_; - delete cache_; - } -} - -inline void dsacache::CacheData::Deallocate() { - std::cout << "[!] Deallocating for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; - - // although deallocate should only be called from - // a safe context to do so, it can not hurt to - // defensively perform the operation atomically - - uint8_t* cache_local = cache_->exchange(nullptr); - if (cache_local != nullptr) numa_free(cache_local, size_); -} - -inline uint8_t* dsacache::CacheData::GetDataLocation() const { - return cache_->load(); -} - -inline bool dsacache::CacheData::Active() const { - // this entry is active if more than one - // reference exists to it, as the Cache - // will always keep one internally until - // the entry is cleared from cache - - return active_->load() > 1; -} \ No newline at end of file diff --git a/offloading-cacher/cache.hpp b/offloading-cacher/cache.hpp index 50e9c29..3fe1e19 100644 --- a/offloading-cacher/cache.hpp +++ b/offloading-cacher/cache.hpp @@ -13,9 +13,111 @@ #include -#include "cache-data.hpp" +namespace dml { + inline const std::string StatusCodeToString(const dml::status_code code) { + switch (code) { + case dml::status_code::ok: + return "ok"; + case dml::status_code::false_predicate: + return "false predicate"; + case dml::status_code::partial_completion: + return "partial completion"; + case dml::status_code::nullptr_error: + return "nullptr error"; + case dml::status_code::bad_size: + return "bad size"; + case dml::status_code::bad_length: + return "bad length"; + case dml::status_code::inconsistent_size: + return "inconsistent size"; + case dml::status_code::dualcast_bad_padding: + return "dualcast bad padding"; + case dml::status_code::bad_alignment: + return "bad alignment"; + case dml::status_code::buffers_overlapping: + return "buffers overlapping"; + case dml::status_code::delta_delta_empty: + return "delta delta empty"; + case dml::status_code::batch_overflow: + return "batch overflow"; + case dml::status_code::execution_failed: + return "execution failed"; + case dml::status_code::unsupported_operation: + return "unsupported operation"; + case dml::status_code::queue_busy: + return "queue busy"; + case dml::status_code::error: + return "unknown error"; + case dml::status_code::config_error: + return "config error"; + default: + return "unhandled error"; + } + } +} namespace dsacache { + class Cache; + + // cache data holds all required information on + // one cache 
entry and will both be stored + // internally by the cache and handed out + // as copies to the user + // this class uses its object lifetime and + // a global reference counter to allow + // thread-safe copies and resource management + + class CacheData { + public: + using dml_handler = dml::handler>; + + private: + // data source and size of the block + uint8_t* src_; + size_t size_; + + // global reference counting object + std::atomic* active_; + + // global cache-location pointer + std::atomic* cache_; + + // object-local incomplete cache location pointer + // which is only available in the first instance + uint8_t* incomplete_cache_; + + // dml handler vector pointer which is only + // available in the first instance + std::unique_ptr> handlers_; + + // deallocates the global cache-location + // and invalidates it + void Deallocate(); + + // checks whether there are at least two + // valid references to this object which + // is done as the cache always has one + // internally to any living instance + bool Active() const; + + friend Cache; + public: + CacheData(uint8_t* data, const size_t size); + CacheData(const CacheData& other); + ~CacheData(); + + // waits on completion of caching operations + // for this task and is safe to be called in + // any state of the object + void WaitOnCompletion(); + + // returns the cache data location for this + // instance which is valid as long as the + // instance is alive - !!! this may also + // yield a nullptr !!! + uint8_t* GetDataLocation() const; + }; + // cache class will handle access to data through the cache // by managing the cache through work submission, it sticks // to user-defined caching and copy policies, is thread @@ -381,3 +483,175 @@ inline std::unique_ptr dsacache::Cache::GetFromCache(uint8_ return nullptr; } + +inline dsacache::CacheData::CacheData(uint8_t* data, const size_t size) { + std::cout << "[-] New CacheData 0x" << std::hex << (uint64_t)data << std::dec << std::endl; + + src_ = data; + size_ = size; + active_ = new std::atomic(1); + cache_ = new std::atomic(); + incomplete_cache_ = nullptr; + handlers_ = std::make_unique>(); +} + +inline dsacache::CacheData::CacheData(const dsacache::CacheData& other) { + std::cout << "[-] Copy Created for CacheData 0x" << std::hex << (uint64_t)other.src_ << std::dec << std::endl; + + // we copy the ptr to the global atomic reference counter + // and increase the amount of active references + + active_ = other.active_; + const int current_active = active_->fetch_add(1); + + // source and size will be copied too + // as well as the reference to the global + // atomic cache pointer + + src_ = other.src_; + size_ = other.size_; + cache_ = other.cache_; + + // incomplete cache and handlers will not + // be copied because only the first instance + // will wait on the completion of handlers + + incomplete_cache_ = nullptr; + handlers_ = nullptr; +} + +inline dsacache::CacheData::~CacheData() { + std::cout << "[-] Destructor for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; + + // if this is the first instance of this cache structure + // and it has not been waited on and is now being destroyed + // we must wait on completion here to ensure the cache + // remains in a valid state + + if (handlers_ != nullptr) { + WaitOnCompletion(); + } + + // due to fetch_sub returning the preivously held value + // we must subtract one locally to get the current value + + const int32_t v = active_->fetch_sub(1) - 1; + + // if the returned value is zero or lower + // then we must 
execute proper deletion + // as this was the last reference + + if (v <= 0) { + std::cout << "[!] Full Destructor for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; + + Deallocate(); + + delete active_; + delete cache_; + } +} + +inline void dsacache::CacheData::Deallocate() { + std::cout << "[!] Deallocating for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; + + // although deallocate should only be called from + // a safe context to do so, it can not hurt to + // defensively perform the operation atomically + + uint8_t* cache_local = cache_->exchange(nullptr); + if (cache_local != nullptr) numa_free(cache_local, size_); +} + +inline uint8_t* dsacache::CacheData::GetDataLocation() const { + return cache_->load(); +} + +inline bool dsacache::CacheData::Active() const { + // this entry is active if more than one + // reference exists to it, as the Cache + // will always keep one internally until + // the entry is cleared from cache + + return active_->load() > 1; +} + +inline void dsacache::CacheData::WaitOnCompletion() { + // the cache data entry can be in two states + // either it is the original one which has not + // been waited for in which case the handlers + // are non-null or it is not + + if (handlers_ == nullptr) { + std::cout << "[-] Waiting on cache-var-update for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; + + // when no handlers are attached to this cache entry we wait on a + // value change for the cache structure from nullptr to non-null + // which will either go through immediately if the cache is valid + // already or wait until the handler-owning thread notifies us + + cache_->wait(nullptr); + + std::cout << "[+] Finished waiting on cache-var-update for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; + } + else { + // when the handlers are non-null there are some DSA task handlers + // available on which we must wait here + + std::cout << "[-] Waiting on handlers for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; + + // abort is set if any operation encountered an error + + bool abort = false; + + for (auto& handler : *handlers_) { + auto result = handler.get(); + + if (result.status != dml::status_code::ok) { + std::cerr << "[x] Encountered bad status code for operation: " << dml::StatusCodeToString(result.status) << std::endl; + + // if one of the copy tasks failed we abort the whole task + // after all operations are completed on it + + abort = true; + } + } + + // the handlers are cleared after all have completed + + handlers_ = nullptr; + + // now we act depending on whether an abort has been + // called for which signals operation incomplete + + if (abort) { + // store nullptr in the cache location + + cache_->store(nullptr); + + // then free the now incomplete cache + + // TODO: it would be possible to salvage the + // TODO: operation at this point but this + // TODO: is quite complicated so we just abort + + numa_free(incomplete_cache_, size_); + } + else { + std::cout << "[+] Finished waiting on handlers for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; + + // incomplete cache is now safe to use and therefore we + // swap it with the global cache state of this entry + // and notify potentially waiting threads + + cache_->store(incomplete_cache_); + } + + // as a last step all waiting threads must + // be notified (copies of this will wait on value + // change of the cache) and the incomplete cache + // is cleared to nullptr 
as it is not incomplete + + cache_->notify_all(); + incomplete_cache_ = nullptr; + } +} diff --git a/offloading-cacher/util/dml-helper.hpp b/offloading-cacher/util/dml-helper.hpp deleted file mode 100644 index de92bb7..0000000 --- a/offloading-cacher/util/dml-helper.hpp +++ /dev/null @@ -1,46 +0,0 @@ -#pragma once - -#include - -namespace dml { - inline const std::string StatusCodeToString(const dml::status_code code) { - switch (code) { - case dml::status_code::ok: - return "ok"; - case dml::status_code::false_predicate: - return "false predicate"; - case dml::status_code::partial_completion: - return "partial completion"; - case dml::status_code::nullptr_error: - return "nullptr error"; - case dml::status_code::bad_size: - return "bad size"; - case dml::status_code::bad_length: - return "bad length"; - case dml::status_code::inconsistent_size: - return "inconsistent size"; - case dml::status_code::dualcast_bad_padding: - return "dualcast bad padding"; - case dml::status_code::bad_alignment: - return "bad alignment"; - case dml::status_code::buffers_overlapping: - return "buffers overlapping"; - case dml::status_code::delta_delta_empty: - return "delta delta empty"; - case dml::status_code::batch_overflow: - return "batch overflow"; - case dml::status_code::execution_failed: - return "execution failed"; - case dml::status_code::unsupported_operation: - return "unsupported operation"; - case dml::status_code::queue_busy: - return "queue busy"; - case dml::status_code::error: - return "unknown error"; - case dml::status_code::config_error: - return "config error"; - default: - return "unhandled error"; - } - } -} \ No newline at end of file From 8ba716353a7634cd7ddd117170f6f769a5f87d15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Constantin=20F=C3=BCrst?= Date: Mon, 15 Jan 2024 13:39:06 +0100 Subject: [PATCH 19/29] add Clear() function which forces flush on the entire Cache and remove non-warning/error status messages from the cacher --- offloading-cacher/cache.hpp | 46 ++++++++++++------------------------- 1 file changed, 15 insertions(+), 31 deletions(-) diff --git a/offloading-cacher/cache.hpp b/offloading-cacher/cache.hpp index 3fe1e19..6b0e712 100644 --- a/offloading-cacher/cache.hpp +++ b/offloading-cacher/cache.hpp @@ -195,9 +195,24 @@ namespace dsacache { // checked and otherwise the specified // node - no checks on node validity void Flush(const int node = -1); + + // forces out all entries from the + // cache and therefore will also "forget" + // still-in-use entries, these will still + // be properly deleted, but the cache + // will be fresh - use for testing + void Clear(); }; } +inline void dsacache::Cache::Clear() { + std::unique_lock lock(cache_mutex_); + + cache_state_.clear(); + + Init(cache_policy_function_, copy_policy_function_); +} + inline void dsacache::Cache::Init(CachePolicy* cache_policy_function, CopyPolicy* copy_policy_function) { cache_policy_function_ = cache_policy_function; copy_policy_function_ = copy_policy_function; @@ -221,8 +236,6 @@ inline void dsacache::Cache::Init(CachePolicy* cache_policy_function, CopyPolicy cache_state_.insert({node,{}}); } } - - std::cout << "[-] Cache Initialized" << std::endl; } inline std::unique_ptr dsacache::Cache::Access(uint8_t* data, const size_t size) { @@ -314,9 +327,6 @@ inline uint8_t* dsacache::Cache::AllocOnNode(const size_t size, const int node) } inline void dsacache::Cache::SubmitTask(CacheData* task, const int dst_node, const int src_node) { - std::cout << "[+] Allocating " << task->size_ << "B on node " << dst_node 
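// Editor's sketch of the C++20 wait/notify handoff that WaitOnCompletion()
// builds on: copies of a CacheData block in std::atomic::wait(nullptr) until
// the instance owning the handlers publishes the finished pointer and
// notifies all waiters.

#include <atomic>
#include <cstdint>
#include <thread>

void WaitNotifyExample() {
    std::atomic<uint8_t*> cache{ nullptr };
    static uint8_t buffer[64];

    std::thread waiter([&cache]() {
        cache.wait(nullptr);            // returns once the stored value differs from nullptr
        uint8_t* ready = cache.load();  // the published pointer is now safe to use
        (void)ready;
    });

    cache.store(buffer);                // publish the completed copy ...
    cache.notify_all();                 // ... and wake every thread waiting on the change

    waiter.join();
}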
<< " for " << std::hex << (uint64_t)task->src_ << std::dec << std::endl; - - uint8_t* dst = AllocOnNode(task->size_, dst_node); if (dst == nullptr) { @@ -338,8 +348,6 @@ inline void dsacache::Cache::SubmitTask(CacheData* task, const int dst_node, con const size_t size = task->size_ / task_count; const size_t last_size = size + task->size_ % task_count; - std::cout << "[-] Splitting Copy into " << task_count << " tasks of " << size << "B 0x" << std::hex << (uint64_t)task->src_ << std::dec << std::endl; - // save the current numa node mask to restore later // as executing the copy task will place this thread // on a different node @@ -389,8 +397,6 @@ inline void dsacache::Cache::GetCacheNode(uint8_t* src, const size_t size, int* } inline void dsacache::Cache::Flush(const int node) { - std::cout << "[-] Flushing Cache for " << (node == -1 ? "all nodes" : "node " + std::to_string(node)) << std::endl; - // this lambda is used because below we have two code paths that // flush nodes, either one single or all successively @@ -463,16 +469,12 @@ inline std::unique_ptr dsacache::Cache::GetFromCache(uint8_ // now check whether the sizes match if (search->second.size_ >= size) { - std::cout << "[+] Found Cached version for 0x" << std::hex << (uint64_t)src << std::dec << std::endl; - // return a unique copy of the entry which uses the object // lifetime and destructor to safely handle deallocation return std::move(std::make_unique(search->second)); } else { - std::cout << "[!] Found Cached version with size missmatch for 0x" << std::hex << (uint64_t)src << std::dec << std::endl; - // if the sizes missmatch then we clear the current entry from cache // which will cause its deletion only after the last possible outside // reference is also destroyed @@ -485,8 +487,6 @@ inline std::unique_ptr dsacache::Cache::GetFromCache(uint8_ } inline dsacache::CacheData::CacheData(uint8_t* data, const size_t size) { - std::cout << "[-] New CacheData 0x" << std::hex << (uint64_t)data << std::dec << std::endl; - src_ = data; size_ = size; active_ = new std::atomic(1); @@ -496,8 +496,6 @@ inline dsacache::CacheData::CacheData(uint8_t* data, const size_t size) { } inline dsacache::CacheData::CacheData(const dsacache::CacheData& other) { - std::cout << "[-] Copy Created for CacheData 0x" << std::hex << (uint64_t)other.src_ << std::dec << std::endl; - // we copy the ptr to the global atomic reference counter // and increase the amount of active references @@ -521,8 +519,6 @@ inline dsacache::CacheData::CacheData(const dsacache::CacheData& other) { } inline dsacache::CacheData::~CacheData() { - std::cout << "[-] Destructor for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; - // if this is the first instance of this cache structure // and it has not been waited on and is now being destroyed // we must wait on completion here to ensure the cache @@ -542,8 +538,6 @@ inline dsacache::CacheData::~CacheData() { // as this was the last reference if (v <= 0) { - std::cout << "[!] Full Destructor for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; - Deallocate(); delete active_; @@ -552,8 +546,6 @@ inline dsacache::CacheData::~CacheData() { } inline void dsacache::CacheData::Deallocate() { - std::cout << "[!] 
Deallocating for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; - // although deallocate should only be called from // a safe context to do so, it can not hurt to // defensively perform the operation atomically @@ -582,23 +574,17 @@ inline void dsacache::CacheData::WaitOnCompletion() { // are non-null or it is not if (handlers_ == nullptr) { - std::cout << "[-] Waiting on cache-var-update for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; - // when no handlers are attached to this cache entry we wait on a // value change for the cache structure from nullptr to non-null // which will either go through immediately if the cache is valid // already or wait until the handler-owning thread notifies us cache_->wait(nullptr); - - std::cout << "[+] Finished waiting on cache-var-update for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; } else { // when the handlers are non-null there are some DSA task handlers // available on which we must wait here - std::cout << "[-] Waiting on handlers for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; - // abort is set if any operation encountered an error bool abort = false; @@ -637,8 +623,6 @@ inline void dsacache::CacheData::WaitOnCompletion() { numa_free(incomplete_cache_, size_); } else { - std::cout << "[+] Finished waiting on handlers for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; - // incomplete cache is now safe to use and therefore we // swap it with the global cache state of this entry // and notify potentially waiting threads From 0fdf650fe4b3fe23c380e67c0f0ff0e927e0a5bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Constantin=20F=C3=BCrst?= Date: Mon, 15 Jan 2024 22:43:38 +0100 Subject: [PATCH 20/29] improve the class-comments for Cache and CacheData, also free incomplete_cache_ if it has not been waited for (see comment on this) --- offloading-cacher/cache.hpp | 125 ++++++++++++++++++++++++++++++++---- 1 file changed, 113 insertions(+), 12 deletions(-) diff --git a/offloading-cacher/cache.hpp b/offloading-cacher/cache.hpp index 6b0e712..d96f02b 100644 --- a/offloading-cacher/cache.hpp +++ b/offloading-cacher/cache.hpp @@ -59,13 +59,33 @@ namespace dml { namespace dsacache { class Cache; - // cache data holds all required information on - // one cache entry and will both be stored - // internally by the cache and handed out - // as copies to the user - // this class uses its object lifetime and - // a global reference counter to allow - // thread-safe copies and resource management + /* + * Class Description: + * Holds all required information on one cache entry and is used + * both internally by the Cache and externally by the user. + * + * Important Usage Notes: + * The pointer is only updated in WaitOnCompletion() which + * therefore must be called by the user at some point in order + * to use the cached data. Using this class as T for + * std::shared_ptr is not recommended as references are + * already counted internally. + * + * Cache Lifetime: + * As long as the instance is referenced, the pointer it stores + * is guaranteed to be either nullptr or pointing to a valid copy. + * + * Implementation Detail: + * Performs self-reference counting with a shared atomic integer. + * Therefore on creating a copy the reference count is increased + * and with the destructor it is deacresed. If the last copy is + * destroyed the actual underlying data is freed and all shared + * variables deleted. 
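// Editor's sketch of the reference-counting scheme described in the comment
// above, reduced to its core: a shared atomic counter that every copy
// increments and every destructor decrements, with cleanup performed by
// whichever instance drops the count to zero (assignment omitted for brevity).

#include <atomic>
#include <cstdint>

class RefCounted {
    std::atomic<int32_t>* active_;

public:
    RefCounted() : active_(new std::atomic<int32_t>(1)) {}

    RefCounted(const RefCounted& other) : active_(other.active_) {
        active_->fetch_add(1);                  // one more live reference
    }

    ~RefCounted() {
        // fetch_sub() returns the previous value, so subtract one locally
        if (active_->fetch_sub(1) - 1 <= 0) {
            delete active_;                     // last reference cleans up shared state
        }
    }
};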
+ * + * Notes on Thread Safety: + * Class is thread safe in any possible state and performs + * reference counting and deallocation itself entirely atomically. + */ class CacheData { public: @@ -101,6 +121,7 @@ namespace dsacache { bool Active() const; friend Cache; + public: CacheData(uint8_t* data, const size_t size); CacheData(const CacheData& other); @@ -118,11 +139,62 @@ namespace dsacache { uint8_t* GetDataLocation() const; }; - // cache class will handle access to data through the cache - // by managing the cache through work submission, it sticks - // to user-defined caching and copy policies, is thread - // safe after initialization and returns copies of - // cache data class to the user + /* + * Class Description: + * Class will handle access to data through internal copies. + * These are obtained via work submission to the Intel DSA which takes + * care of asynchronously duplicating the data. The user will define + * where these copies lie and which system nodes will perform the copy. + * This is done through policy functions set during initialization. + * + * Placement Policy: + * The Placement Policy Function decides on which node a particular + * entry is to be placed, given the current executing node and the + * data source node and data size. This in turn means that for one + * datum, multiple cached copies may exist at one time. + * + * Cache Lifetime: + * When accessing the cache, a CacheData-object will be returned. + * As long as this object lives, the pointer which it holds is + * guaranteed to be either nullptr or a valid copy. When destroyed + * the entry is marked for deletion which is only carried out + * when system memory pressure drives an automated cache flush. + * + * Restrictions: + * - Overlapping Pointers may lead to undefined behaviour during + * manual cache invalidation which should not be used if you + * intend to have these types of pointers + * - Cache Invalidation may only be performed manually and gives + * no ordering guarantees. Therefore, it is the users responsibility + * to ensure that results after invalidation have been generated + * using the latest state of data. The cache is best suited + * to static data. + * + * Notes on Thread Safety: + * - Cache is completely thread-safe after initialization + * - CacheData-class will handle deallocation of data itself by + * performing self-reference-counting atomically and only + * deallocating if the last reference is destroyed + * - The internal cache state has one lock which is either + * acquired shared for reading the state (upon accessing an already + * cached element) or unique (accessing a new element, flushing, invalidating) + * - Waiting on copy completion is done over an atomic-wait in copies + * of the original CacheData-instance + * - Overall this class may experience performance issues due to the use + * of locking (in any configuration), lock contention (worsens with higher + * core count, node count and utilization) and atomics (worse in the same + * situations as lock contention) + * + * Improving Performance: + * When data is never shared between threads or memory size for the cache is + * not an issue you may consider having one Cache-instance per thread and removing + * the lock in Cache and modifying the reference counting and waiting mechanisms + * of CacheData accordingly (although this is high effort and will yield little due + * to the atomics not being shared among cores/nodes). + * Otherwise, one Cache-instance per node could also be considered. 
This will allow + * the placement policy function to be barebones and reduces the lock contention and + * synchronization impact of the atomic variables. + */ class Cache { public: @@ -202,6 +274,8 @@ namespace dsacache { // be properly deleted, but the cache // will be fresh - use for testing void Clear(); + + void Invalidate(uint8_t* data); }; } @@ -486,6 +560,28 @@ inline std::unique_ptr dsacache::Cache::GetFromCache(uint8_ return nullptr; } +void dsacache::Cache::Invalidate(uint8_t* data) { + // as the cache is modified we must obtain a unique writers lock + + std::unique_lock lock(cache_mutex_); + + // loop through all per-node-caches available + + for (auto node : cache_state_) { + // search for an entry for the given data pointer + + auto search = node.second.find(data); + + if (search != node.second.end()) { + // if the data is represented in-cache + // then it will be erased to re-trigger + // caching on next access + + node.second.erase(search); + } + } +} + inline dsacache::CacheData::CacheData(uint8_t* data, const size_t size) { src_ = data; size_ = size; @@ -552,6 +648,11 @@ inline void dsacache::CacheData::Deallocate() { uint8_t* cache_local = cache_->exchange(nullptr); if (cache_local != nullptr) numa_free(cache_local, size_); + + // if the cache was never waited for then incomplete_cache_ + // may still contain a valid pointer which has to be freed + + if (incomplete_cache_ != nullptr) numa_free(incomplete_cache_, size_); } inline uint8_t* dsacache::CacheData::GetDataLocation() const { From e570a6fe696bba485c976c04ba808aa9ce6e182d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Constantin=20F=C3=BCrst?= Date: Mon, 15 Jan 2024 22:48:01 +0100 Subject: [PATCH 21/29] reduce the line-count of the switch statement for dml::StatusToString by inlining return with case statements --- offloading-cacher/cache.hpp | 54 +++++++++++++------------------------ 1 file changed, 18 insertions(+), 36 deletions(-) diff --git a/offloading-cacher/cache.hpp b/offloading-cacher/cache.hpp index d96f02b..6a717ff 100644 --- a/offloading-cacher/cache.hpp +++ b/offloading-cacher/cache.hpp @@ -16,42 +16,24 @@ namespace dml { inline const std::string StatusCodeToString(const dml::status_code code) { switch (code) { - case dml::status_code::ok: - return "ok"; - case dml::status_code::false_predicate: - return "false predicate"; - case dml::status_code::partial_completion: - return "partial completion"; - case dml::status_code::nullptr_error: - return "nullptr error"; - case dml::status_code::bad_size: - return "bad size"; - case dml::status_code::bad_length: - return "bad length"; - case dml::status_code::inconsistent_size: - return "inconsistent size"; - case dml::status_code::dualcast_bad_padding: - return "dualcast bad padding"; - case dml::status_code::bad_alignment: - return "bad alignment"; - case dml::status_code::buffers_overlapping: - return "buffers overlapping"; - case dml::status_code::delta_delta_empty: - return "delta delta empty"; - case dml::status_code::batch_overflow: - return "batch overflow"; - case dml::status_code::execution_failed: - return "execution failed"; - case dml::status_code::unsupported_operation: - return "unsupported operation"; - case dml::status_code::queue_busy: - return "queue busy"; - case dml::status_code::error: - return "unknown error"; - case dml::status_code::config_error: - return "config error"; - default: - return "unhandled error"; + case dml::status_code::ok: return "ok"; + case dml::status_code::false_predicate: return "false predicate"; + case 
dml::status_code::partial_completion: return "partial completion"; + case dml::status_code::nullptr_error: return "nullptr error"; + case dml::status_code::bad_size: return "bad size"; + case dml::status_code::bad_length: return "bad length"; + case dml::status_code::inconsistent_size: return "inconsistent size"; + case dml::status_code::dualcast_bad_padding: return "dualcast bad padding"; + case dml::status_code::bad_alignment: return "bad alignment"; + case dml::status_code::buffers_overlapping: return "buffers overlapping"; + case dml::status_code::delta_delta_empty: return "delta delta empty"; + case dml::status_code::batch_overflow: return "batch overflow"; + case dml::status_code::execution_failed: return "execution failed"; + case dml::status_code::unsupported_operation: return "unsupported operation"; + case dml::status_code::queue_busy: return "queue busy"; + case dml::status_code::error: return "unknown error"; + case dml::status_code::config_error: return "config error"; + default: return "unhandled error"; } } } From e4a681ac1efb2ca5c0d274b3110d903dc1c78c95 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Constantin=20F=C3=BCrst?= Date: Mon, 15 Jan 2024 22:50:34 +0100 Subject: [PATCH 22/29] delete the copy-constructor for cache as copying it is undesired behaviour --- offloading-cacher/cache.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/offloading-cacher/cache.hpp b/offloading-cacher/cache.hpp index 6a717ff..78d16b0 100644 --- a/offloading-cacher/cache.hpp +++ b/offloading-cacher/cache.hpp @@ -237,6 +237,8 @@ namespace dsacache { std::unique_ptr GetFromCache(uint8_t* src, const size_t size, const int dst_node); public: + Cache(const Cache& other) = delete; + // initializes the cache with the two policy functions // only after this is it safe to use in a threaded environment void Init(CachePolicy* cache_policy_function, CopyPolicy* copy_policy_function); From da38c048ca53bf2a1971eeae500e92242c4e235f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Constantin=20F=C3=BCrst?= Date: Tue, 16 Jan 2024 22:14:58 +0100 Subject: [PATCH 23/29] pass data size to copy policy function too --- offloading-cacher/cache.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/offloading-cacher/cache.hpp b/offloading-cacher/cache.hpp index 78d16b0..b84e347 100644 --- a/offloading-cacher/cache.hpp +++ b/offloading-cacher/cache.hpp @@ -191,7 +191,7 @@ namespace dsacache { // copy policy specifies the copy-executing nodes for a given task // which allows flexibility in assignment for optimizing raw throughput // or choosing a conservative usage policy - typedef std::vector (CopyPolicy)(const int numa_dst_node, const int numa_src_node); + typedef std::vector (CopyPolicy)(const int numa_dst_node, const int numa_src_node, const size_t data_size); private: // mutex for accessing the cache state map @@ -396,7 +396,7 @@ inline void dsacache::Cache::SubmitTask(CacheData* task, const int dst_node, con // querry copy policy function for the nodes to use for the copy - const std::vector executing_nodes = copy_policy_function_(dst_node, src_node); + const std::vector executing_nodes = copy_policy_function_(dst_node, src_node, task->size_); const size_t task_count = executing_nodes.size(); // each task will copy one fair part of the total size From 5578f06c80eb8db114e77ab965b2dc59ef484639 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Constantin=20F=C3=BCrst?= Date: Tue, 16 Jan 2024 22:15:36 +0100 Subject: [PATCH 24/29] adapt copy policy function to take data size as well and use this to only use 
destination nodes dsa engine for small data sizes on xeonmax --- offloading-cacher/main.cpp | 42 ++++++++++++++++++++++++++++++-------- 1 file changed, 33 insertions(+), 9 deletions(-) diff --git a/offloading-cacher/main.cpp b/offloading-cacher/main.cpp index 443b00b..8193f5a 100644 --- a/offloading-cacher/main.cpp +++ b/offloading-cacher/main.cpp @@ -7,6 +7,8 @@ #include "cache.hpp" +static constexpr size_t SIZE_64_MIB = 64 * 1024 * 1024; + dsacache::Cache CACHE; void InitCache(const std::string& device) { @@ -15,25 +17,47 @@ void InitCache(const std::string& device) { return numa_dst_node; }; - auto copy_policy = [](const int numa_dst_node, const int numa_src_node) { - return std::vector{ numa_src_node, numa_dst_node }; + auto copy_policy = [](const int numa_dst_node, const int numa_src_node, const size_t data_size) { + return std::vector{ numa_dst_node }; }; CACHE.Init(cache_policy,copy_policy); } else if (device == "xeonmax") { auto cache_policy = [](const int numa_dst_node, const int numa_src_node, const size_t data_size) { + // xeon max is configured to have hbm on node ids that are +8 + return numa_dst_node < 8 ? numa_dst_node + 8 : numa_dst_node; }; - auto copy_policy = [](const int numa_dst_node, const int numa_src_node) { - const bool same_socket = ((numa_dst_node ^ numa_src_node) & 4) == 0; - if (same_socket) { - const bool socket_number = numa_dst_node >> 2; - if (socket_number == 0) return std::vector{ 0, 1, 2, 3 }; - else return std::vector{ 4, 5, 6, 7 }; + auto copy_policy = [](const int numa_dst_node, const int numa_src_node, const size_t data_size) { + if (data_size < SIZE_64_MIB) { + // if the data size is small then the copy will just be carried + // out by the destination node which does not require setting numa + // thread affinity as the selected dsa engine is already the one + // present on the calling thread + + return std::vector{ (numa_dst_node >= 8 ? numa_dst_node - 8 : numa_dst_node) }; + } + else { + // for sufficiently large data, smart copy is used which will utilize + // all four engines for intra-socket copy operations and cross copy on + // the source and destination nodes for inter-socket copy + + const bool same_socket = ((numa_dst_node ^ numa_src_node) & 4) == 0; + + if (same_socket) { + const bool socket_number = numa_dst_node >> 2; + if (socket_number == 0) return std::vector{ 0, 1, 2, 3 }; + else return std::vector{ 4, 5, 6, 7 }; + } + else { + return std::vector{ + (numa_src_node >= 8 ? numa_src_node - 8 : numa_src_node), + (numa_dst_node >= 8 ? 
numa_dst_node - 8 : numa_dst_node) + }; + } } - else return std::vector{ numa_src_node, numa_dst_node }; }; CACHE.Init(cache_policy,copy_policy); From 641a7593feb2d12b72f79f7b1d1583ac052a75fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Constantin=20F=C3=BCrst?= Date: Wed, 17 Jan 2024 11:15:00 +0100 Subject: [PATCH 25/29] add query driven prefetching code repository copy --- qdp_project/.gitignore | 104 +++++ qdp_project/CMakeLists.txt | 104 +++++ qdp_project/README.md | 3 + qdp_project/bench_all_dimes.sh | 10 + qdp_project/bench_max.sh | 15 + qdp_project/cmake_all_dimes.sh | 33 ++ qdp_project/cmake_max.sh | 9 + qdp_project/src/.gitkeep | 0 .../src/algorithm/operators/aggregation.h | 316 ++++++++++++++ qdp_project/src/algorithm/operators/filter.h | 170 ++++++++ qdp_project/src/benchmark/DIMES_benchmark.cpp | 240 +++++++++++ .../src/benchmark/DIMES_cores_benchmark.cpp | 260 ++++++++++++ qdp_project/src/benchmark/MAX_benchmark.cpp | 289 +++++++++++++ qdp_project/src/benchmark/QDP_minimal.h | 147 +++++++ .../src/benchmark/doubly_filtered_agg.cpp | 149 +++++++ .../benchmark/filter_aggregate_pipeline.cpp | 184 ++++++++ qdp_project/src/benchmark/latency.cpp | 188 +++++++++ .../src/benchmark/micro_benchmarks.cpp | 271 ++++++++++++ .../pipelines/DIMES_scan_filter_pipe.h | 391 +++++++++++++++++ .../pipelines/MAX_scan_filter_pipe.h | 395 ++++++++++++++++++ .../benchmark/pipelines/scan_filter_pipe.h | 387 +++++++++++++++++ qdp_project/src/utils/array_utils.h | 80 ++++ qdp_project/src/utils/barrier_utils.h | 73 ++++ qdp_project/src/utils/const.h | 33 ++ qdp_project/src/utils/cpu_set_utils.h | 82 ++++ qdp_project/src/utils/execution_modes.h | 89 ++++ qdp_project/src/utils/file_output.h | 76 ++++ qdp_project/src/utils/iterable_range.h | 208 +++++++++ qdp_project/src/utils/measurement_utils.h | 152 +++++++ qdp_project/src/utils/memory_literals.h | 45 ++ qdp_project/src/utils/pcm.h | 6 + qdp_project/src/utils/timer_utils.h | 80 ++++ qdp_project/src/utils/vector_loader.h | 93 +++++ 33 files changed, 4682 insertions(+) create mode 100644 qdp_project/.gitignore create mode 100644 qdp_project/CMakeLists.txt create mode 100644 qdp_project/README.md create mode 100644 qdp_project/bench_all_dimes.sh create mode 100644 qdp_project/bench_max.sh create mode 100644 qdp_project/cmake_all_dimes.sh create mode 100644 qdp_project/cmake_max.sh create mode 100644 qdp_project/src/.gitkeep create mode 100644 qdp_project/src/algorithm/operators/aggregation.h create mode 100644 qdp_project/src/algorithm/operators/filter.h create mode 100644 qdp_project/src/benchmark/DIMES_benchmark.cpp create mode 100644 qdp_project/src/benchmark/DIMES_cores_benchmark.cpp create mode 100644 qdp_project/src/benchmark/MAX_benchmark.cpp create mode 100644 qdp_project/src/benchmark/QDP_minimal.h create mode 100644 qdp_project/src/benchmark/doubly_filtered_agg.cpp create mode 100644 qdp_project/src/benchmark/filter_aggregate_pipeline.cpp create mode 100644 qdp_project/src/benchmark/latency.cpp create mode 100644 qdp_project/src/benchmark/micro_benchmarks.cpp create mode 100644 qdp_project/src/benchmark/pipelines/DIMES_scan_filter_pipe.h create mode 100644 qdp_project/src/benchmark/pipelines/MAX_scan_filter_pipe.h create mode 100644 qdp_project/src/benchmark/pipelines/scan_filter_pipe.h create mode 100644 qdp_project/src/utils/array_utils.h create mode 100644 qdp_project/src/utils/barrier_utils.h create mode 100644 qdp_project/src/utils/const.h create mode 100644 qdp_project/src/utils/cpu_set_utils.h create mode 100644 
qdp_project/src/utils/execution_modes.h create mode 100644 qdp_project/src/utils/file_output.h create mode 100644 qdp_project/src/utils/iterable_range.h create mode 100644 qdp_project/src/utils/measurement_utils.h create mode 100644 qdp_project/src/utils/memory_literals.h create mode 100644 qdp_project/src/utils/pcm.h create mode 100644 qdp_project/src/utils/timer_utils.h create mode 100644 qdp_project/src/utils/vector_loader.h diff --git a/qdp_project/.gitignore b/qdp_project/.gitignore new file mode 100644 index 0000000..1a8b920 --- /dev/null +++ b/qdp_project/.gitignore @@ -0,0 +1,104 @@ + + +bin/ + + +# CMake building files +CMakeLists.txt.user +CMakeCache.txt +CMakeFiles +CMakeScripts +Testing +Makefile +cmake_install.cmake +install_manifest.txt +compile_commands.json +CTestTestfile.cmake +_deps +.cmake + +# Prerequisites +*.d + +# Object files +*.o +*.ko +*.obj +*.elf + +# Linker output +*.ilk +*.map +*.exp + +# Precompiled Headers +*.gch +*.pch + +# Libraries +*.lib +*.a +*.la +*.lo + +# Shared objects (inc. Windows DLLs) +*.dll +*.so +*.so.* +*.dylib + +# Executables +*.exe +*.out +*.app +*.i*86 +*.x86_64 +*.hex + +# Debug files +*.dSYM/ +*.su +*.idb +*.pdb + +# Kernel Module Compile Results +*.mod* +*.cmd +.tmp_versions/ +modules.order +Module.symvers +Mkfile.old +dkms.conf + +# Prerequisites +*.d + +# Compiled Object files +*.slo +*.lo +*.o +*.obj + +# Precompiled Headers +*.gch +*.pch + +# Compiled Dynamic libraries +*.so +*.dylib +*.dll + +# Fortran module files +*.mod +*.smod + +# Compiled Static libraries +*.lai +*.la +*.a +*.lib + +# Executables +*.exe +*.out +*.app diff --git a/qdp_project/CMakeLists.txt b/qdp_project/CMakeLists.txt new file mode 100644 index 0000000..71c8452 --- /dev/null +++ b/qdp_project/CMakeLists.txt @@ -0,0 +1,104 @@ +cmake_minimum_required(VERSION 3.18) + +# set the project name +project(NUMA_Slow_Fast_Datamigration_Test VERSION 0.1) + +# specify the C standard +set(CMAKE_CXX_STANDARD 20) +set(CMAKE_CXX_STANDARD_REQUIRED True) + +#set flags on need cross compile for sapphirerapids architecture +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=sapphirerapids") +#set flags on need cross compile for skylake micro architecture +#set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=skylake-avx512") +#set flags on need cross compile for knights landing micro architecture (for debugging) +#set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -mavx512cd -mavx512er -mavx512pf") + +#suppress selected! warnigs that are not very important to resolve. 
This is to keep the compileation output clean +set(SUPPRESS_WARNINGS "-Wno-literal-suffix -Wno-volatile") + +set(DEBUG_FLAGS "-g3" "-ggdb") +set(RELEASE_FLAGS "-O3") + +#set pcm location +set(PCM_LOCATION ./thirdParty/pcm) +set(PCM_LINKS -lpcm -L${CMAKE_CURRENT_LIST_DIR}/${PCM_LOCATION}/build/lib) +# pass the in formation about the shared library location to the linker +link_directories(${CMAKE_CURRENT_LIST_DIR}/${PCM_LOCATION}/build/lib) + +#set flags used for Release and Debug build type +add_compile_options( + "$<$:${RELEASE_FLAGS}>" + "$<$:${DEBUG_FLAGS}>" +) + +# evaluate custom variables +function(eval vname vvalid vdefault) + # is variable is set to the below value if its not already defined from the comand line + set(VALID ${vvalid} CACHE INTERNAL "Possible values for ${vname}") + set(${vname} ${vdefault} CACHE STRING "The barrier mode") + # command for GUI shenanigans + set_property(CACHE ${vname} PROPERTY STRINGS VALID) + + if(${vname} IN_LIST VALID) + message(STATUS "Variable ${vname} = ${${vname}}") + else() + message(STATUS "Variable ${vname} has invalid value ${${vname}}") + # set the fallback value for use in parent function + unset(${vname} CACHE) + message(STATUS "Fallback to default: ${vname} = ${vdefault}") + set(${vname} ${vdefault} PARENT_SCOPE) + endif() +endfunction() + +eval(WSUPPRESS "suppress;show" "show") +if($ EQUAL 1) + add_compile_options("${SUPPRESS_WARNINGS}") +endif() + +eval(BARRIER_MODE "global;local" "global") +add_definitions(-DBARRIER_MODE="${BARRIER_MODE}") + +eval(BUFFER_LIMIT "unlimited;limited" "unlimited") +add_definitions(-DBUFFER_LIMIT=$) + +eval(QUERY "simple;complex" "simple") +add_definitions(-DQUERY=$) + +eval(THREAD_FACTOR "1;2;3;4;5;6;7;8;9;10" "4") +add_definitions(-DTHREAD_GROUP_MULTIPLIER=${THREAD_FACTOR}) + +eval(PINNING "cpu;numa" "cpu") +add_definitions(-DPINNING=$) + +eval(PCM_M "true;false" "false") +add_definitions(-DPCM_M=$) +add_definitions(${PCM_LINKS}) + +# build directory +set(CMAKE_BINARY_DIR "../bin") #relative to inside build +set(EXECUTABLE_OUTPUT_PATH ${CMAKE_BINARY_DIR}) + + + +# include directories +include_directories(src/utils) +include_directories(src/algorithm) +include_directories(src/algorithm/operators) +include_directories(thirdParty/pcm/src) + +# link libraries +link_libraries(-lnuma -lpthread) + +# Add targets only below +# specify build targets +add_executable(FilterAggregatePipeline src/benchmark/filter_aggregate_pipeline.cpp) +add_executable(DoublyFiltered src/benchmark/doubly_filtered_agg.cpp) +add_executable(DIMESBench src/benchmark/DIMES_benchmark.cpp) +add_executable(DIMESCoreBench src/benchmark/DIMES_cores_benchmark.cpp) +add_executable(MicroBench src/benchmark/micro_benchmarks.cpp) +add_executable(MAXBench src/benchmark/MAX_benchmark.cpp + src/benchmark/QDP_minimal.h) +target_link_libraries(MAXBench libpcm.so) +add_executable(LatencyBench src/benchmark/latency.cpp) + diff --git a/qdp_project/README.md b/qdp_project/README.md new file mode 100644 index 0000000..afad56b --- /dev/null +++ b/qdp_project/README.md @@ -0,0 +1,3 @@ +This is a copy of the Query Driven Prefetching Repository +https://os.inf.tu-dresden.de/repo/gitbox/andre.berthold/Query-driven_Prefetching/src/branch/qdp_minimal/code +Original Authors: André Berthold and Anna Bartuschka diff --git a/qdp_project/bench_all_dimes.sh b/qdp_project/bench_all_dimes.sh new file mode 100644 index 0000000..9c05e62 --- /dev/null +++ b/qdp_project/bench_all_dimes.sh @@ -0,0 +1,10 @@ +#!bin/bash + +../bin/DIMESBench_gus +../bin/DIMESBench_guc 
+../bin/DIMESBench_gls +../bin/DIMESBench_glc +../bin/DIMESBench_lus +../bin/DIMESBench_luc +../bin/DIMESBench_lls +../bin/DIMESBench_llc \ No newline at end of file diff --git a/qdp_project/bench_max.sh b/qdp_project/bench_max.sh new file mode 100644 index 0000000..fb08bd8 --- /dev/null +++ b/qdp_project/bench_max.sh @@ -0,0 +1,15 @@ +#!bin/bash + +current_date_time=$(date) +echo "Benchmark start at: $current_date_time" + +../bin/MAXBench_gcc + +cp ../results/max_q-complex_bm-global_bl-unlimited_tc-121MiB-2MiB.csv ../results/max_q-complex_bm-global_bl-unlimited_tc-121MiB-2MiB_pin_c_HBM.csv + +../bin/MAXBench_gcn + +cp ../results/max_q-complex_bm-global_bl-unlimited_tc-121MiB-2MiB.csv ../results/max_q-complex_bm-global_bl-unlimited_tc-121MiB-2MiB_pin_n_HBM.csv + +current_date_time=$(date) +echo "Benchmark end at: $current_date_time" \ No newline at end of file diff --git a/qdp_project/cmake_all_dimes.sh b/qdp_project/cmake_all_dimes.sh new file mode 100644 index 0000000..9ce3a96 --- /dev/null +++ b/qdp_project/cmake_all_dimes.sh @@ -0,0 +1,33 @@ +#!bin/bash + +cmake -DCMAKE_BUILD_TYPE=Release -DWSUPPRESS=suppress -DBARRIER_MODE=global -DBUFFER_LIMIT=unlimited -DQUERY=simple .. +cmake --build . --target DIMESBench +mv ../bin/DIMESBench ../bin/DIMESBench_gus + +cmake -DCMAKE_BUILD_TYPE=Release -DWSUPPRESS=suppress -DBARRIER_MODE=global -DBUFFER_LIMIT=unlimited -DQUERY=complex .. +cmake --build . --target DIMESBench +mv ../bin/DIMESBench ../bin/DIMESBench_guc + +cmake -DCMAKE_BUILD_TYPE=Release -DWSUPPRESS=suppress -DBARRIER_MODE=global -DBUFFER_LIMIT=limited -DQUERY=simple .. +cmake --build . --target DIMESBench +mv ../bin/DIMESBench ../bin/DIMESBench_gls + +cmake -DCMAKE_BUILD_TYPE=Release -DWSUPPRESS=suppress -DBARRIER_MODE=global -DBUFFER_LIMIT=limited -DQUERY=complex .. +cmake --build . --target DIMESBench +mv ../bin/DIMESBench ../bin/DIMESBench_glc + +cmake -DCMAKE_BUILD_TYPE=Release -DWSUPPRESS=suppress -DBARRIER_MODE=local -DBUFFER_LIMIT=unlimited -DQUERY=simple .. +cmake --build . --target DIMESBench +mv ../bin/DIMESBench ../bin/DIMESBench_lus + +cmake -DCMAKE_BUILD_TYPE=Release -DWSUPPRESS=suppress -DBARRIER_MODE=local -DBUFFER_LIMIT=unlimited -DQUERY=complex .. +cmake --build . --target DIMESBench +mv ../bin/DIMESBench ../bin/DIMESBench_luc + +cmake -DCMAKE_BUILD_TYPE=Release -DWSUPPRESS=suppress -DBARRIER_MODE=local -DBUFFER_LIMIT=limited -DQUERY=simple .. +cmake --build . --target DIMESBench +mv ../bin/DIMESBench ../bin/DIMESBench_lls + +cmake -DCMAKE_BUILD_TYPE=Release -DWSUPPRESS=suppress -DBARRIER_MODE=local -DBUFFER_LIMIT=limited -DQUERY=complex .. +cmake --build . --target DIMESBench +mv ../bin/DIMESBench ../bin/DIMESBench_llc \ No newline at end of file diff --git a/qdp_project/cmake_max.sh b/qdp_project/cmake_max.sh new file mode 100644 index 0000000..03c137b --- /dev/null +++ b/qdp_project/cmake_max.sh @@ -0,0 +1,9 @@ +#!bin/bash + +cmake -DCMAKE_BUILD_TYPE=Release -DWSUPPRESS=suppress -DBARRIER_MODE=global -DBUFFER_LIMIT=unlimited -DQUERY=complex -DTHREAD_FACTOR=2 -DPINNING=cpu -DPCM_M=false .. +cmake --build . --target MAXBench +mv ../bin/MAXBench ../bin/MAXBench_gcc + +cmake -DCMAKE_BUILD_TYPE=Release -DWSUPPRESS=suppress -DBARRIER_MODE=global -DBUFFER_LIMIT=unlimited -DQUERY=complex -DTHREAD_FACTOR=2 -DPINNING=numa -DPCM_M=false .. +cmake --build . 
--target MAXBench +mv ../bin/MAXBench ../bin/MAXBench_gcn diff --git a/qdp_project/src/.gitkeep b/qdp_project/src/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/qdp_project/src/algorithm/operators/aggregation.h b/qdp_project/src/algorithm/operators/aggregation.h new file mode 100644 index 0000000..119ab14 --- /dev/null +++ b/qdp_project/src/algorithm/operators/aggregation.h @@ -0,0 +1,316 @@ +#pragma once + +#include +#include +#include +#include + +#include "vector_loader.h" +#include "const.h" + + +/** + * @brief Super Class for all Aggregation functions. Guards Sub Classes from having an non integral base type. + * + * @tparam T + */ +template +class AggFunction { + static_assert(std::is_integral::value, "The base type of an AggFunction must be an integral"); +}; + +/** + * @brief Template class that implements methods used for Summation. It wraps the corresponding vector intrinsics + * + * @tparam T base datatype for the implemented methods + */ +template +class Sum : public AggFunction { +public: + static inline __m512i simd_agg(__m512i aggregator, __m512i vector) { + if constexpr (sizeof(T) == 4) return _mm512_add_epi32(aggregator, vector); + else if constexpr (sizeof(T) == 8) return _mm512_add_epi64(aggregator, vector); + static_assert(sizeof(T) == 4 || sizeof(T) == 8, "Sum is only implemented for 32 and 64 wide integers"); + }; + + static inline __m512i simd_agg(__m512i aggregator, __mmask16 mask, __m512i vector) { + if constexpr (sizeof(T) == 4) return _mm512_mask_add_epi32(aggregator, mask, aggregator, vector); + else if constexpr (sizeof(T) == 8) return _mm512_mask_add_epi64(aggregator, mask, aggregator, vector); + static_assert(sizeof(T) == 4 || sizeof(T) == 8, "Sum is only implemented for 32 and 64 wide integers"); + }; + + static inline T simd_reduce(__m512i vector) { + if constexpr (sizeof(T) == 4) return _mm512_reduce_add_epi32(vector); + else if constexpr (sizeof(T) == 8) return _mm512_reduce_add_epi64(vector); + static_assert(sizeof(T) == 4 || sizeof(T) == 8, "Sum is only implemented for 32 and 64 wide integers"); + }; + + static inline T scalar_agg(T aggregator, T scalar) { return aggregator + scalar; }; + + static inline __m512i zero() { return _mm512_set1_epi32(0); }; +}; + + +/** + * @brief Template class that implements methods used for Maximum determination. 
It wraps the corresponding vector intrinsics + * + * @tparam T base datatype for the implemented methods + * + */ +template +class Max : public AggFunction { +public: + static inline __m512i simd_agg(__m512i aggregator, __m512i vector) { + if constexpr (sizeof(T) == 4) return _mm512_max_epi32(aggregator, vector); + else if constexpr (sizeof(T) == 8) return _mm512_max_epi64(aggregator, vector); + static_assert(sizeof(T) == 4 || sizeof(T) == 8, "Max is only implemented for 32 and 64 wide integers"); + } + + static inline __m512i simd_agg(__m512i aggregator, __mmask16 mask, __m512i vector) { + if constexpr (sizeof(T) == 4) return _mm512_mask_max_epi32(aggregator, mask, aggregator, vector); + else if constexpr (sizeof(T) == 8) return _mm512_mask_max_epi64(aggregator, mask, aggregator, vector); + static_assert(sizeof(T) == 4 || sizeof(T) == 8, "Max is only implemented for 32 and 64 wide integers"); + } + + static inline T simd_reduce(__m512i vector) { + if constexpr (sizeof(T) == 4) return _mm512_reduce_max_epi32(vector); + else if constexpr (sizeof(T) == 8) return _mm512_reduce_max_epi64(vector); + static_assert(sizeof(T) == 4 || sizeof(T) == 8, "Max is only implemented for 32 and 64 wide integers"); + } + + static inline T scalar_agg(T aggregator, T scalar) { return std::max(aggregator, scalar); } + + static inline __m512i zero() { + if constexpr (sizeof(T) == 4) { + if constexpr (std::is_signed::value) return _mm512_set1_epi32(0xFFFFFFFF); + else return _mm512_set1_epi32(0x0); + } + else if constexpr (sizeof(T) == 8) { + if constexpr (std::is_signed::value) return _mm512_set1_epi32(0xFFFFFFFFFFFFFFFF); + else return _mm512_set1_epi32(0x0); + } + static_assert(sizeof(T) == 4 || sizeof(T) == 8, "Max is only implemented for 32 and 64 wide integers"); + } +}; + +/** + * @brief Template class that implements methods used for Minimum determination. 
It wraps the corresponding vector intrinsics + * + * @tparam T base datatype for the implemented methods + * + */ +template +class Min : public AggFunction { +public: + static inline __m512i simd_agg(__m512i aggregator, __m512i vector) { + if constexpr (sizeof(T) == 4) return _mm512_min_epi32(aggregator, vector); + else if constexpr (sizeof(T) == 8) return _mm512_min_epi64(aggregator, vector); + static_assert(sizeof(T) == 4 || sizeof(T) == 8, "Min is only implemented for 32 and 64 wide integers"); + } + + static inline __m512i simd_agg(__m512i aggregator, __mmask16 mask, __m512i vector) { + if constexpr (sizeof(T) == 4) return _mm512_mask_min_epi32(aggregator, mask, aggregator, vector); + else if constexpr (sizeof(T) == 8) return _mm512_mask_min_epi64(aggregator, mask, aggregator, vector); + static_assert(sizeof(T) == 4 || sizeof(T) == 8, "Min is only implemented for 32 and 64 wide integers"); + } + + static inline T simd_reduce(__m512i vector) { + if constexpr (sizeof(T) == 4) return _mm512_reduce_min_epi32(vector); + else if constexpr (sizeof(T) == 8) return _mm512_reduce_min_epi64(vector); + static_assert(sizeof(T) == 4 || sizeof(T) == 8, "Min is only implemented for 32 and 64 wide integers"); + } + + static inline T scalar_agg(T aggregator, T scalar) { return std::min(aggregator, scalar); } + + static inline __m512i zero() { + if constexpr (sizeof(T) == 4) { + if constexpr (std::is_signed::value) return _mm512_set1_epi32(0xEFFFFFFF); + else return _mm512_set1_epi32(0xFFFFFFFF); + } + else if constexpr (sizeof(T) == 8) { + if constexpr (std::is_signed::value) return _mm512_set1_epi32(0xEFFFFFFFFFFFFFFF); + else return _mm512_set1_epi32(0xFFFFFFFFFFFFFFFF); + } + static_assert(sizeof(T) == 4 || sizeof(T) == 8, "Min is only implemented for 32 and 64 wide integers"); + } +}; + +/** + * @brief Template Class that implements an aggregation operation. + * + * @tparam base_t Base type of the values for aggregation + * @tparam func + * @tparam load_mode + */ +template class func, load_mode load_mode> +class Aggregation{ +public: + + static_assert(std::is_same_v, "Enforce unsigned 64 bit ints."); + + using OP = func; + /** + * @brief Calculates the memory maximal needed to store a chunk's processing result. + * + * @param chunk_size_b Size of the chunk in byte + * @return size_t Size of the chunk's processing result in byte + */ + static size_t result_bytes_per_chunk(size_t chunk_size_b) { + // aggregation returns a single value of type base_t + return sizeof(base_t); + } + + /** + * @brief Applies the aggregation function on the chunk starting at *src* and spanning *chunk_size_b* bytes. + * The result is written to main memory. + * + * @param dest Pointer to the start of the result chunk + * @param src Pointer to the start of the source chunk + * @param chunk_size_b Size of the source chunk in bytes + * @return true When the aggregation is done + * @return false Never + */ + static bool apply (base_t *dest, base_t *src, size_t chunk_size_b) { + constexpr size_t lanes = VECTOR_SIZE(); + size_t value_count = chunk_size_b / sizeof(base_t); + __m512i agg_vec = func::zero(); + size_t i = 0; + base_t result = 0; + // stop before! running out of space + if(value_count >= lanes) {// keep in mind value_count is unsigned so if it becomes negative, it doesn't. 
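+            // note: value_count and lanes are unsigned, so for value_count < lanes the
+            // expression value_count - lanes below would wrap around to a huge number and
+            // the loop would read far past the chunk; the guard above prevents exactly that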
+ for(; i <= value_count - lanes; i += lanes) { + __m512i vec = Vector_Loader::load(src + i); + + agg_vec = func::simd_agg(agg_vec, vec); + } + result = func::simd_reduce(agg_vec); + } + + for(; i < value_count; ++i) { + result = func::scalar_agg(result, src[i]); + } + *dest = result; + + return true; + } + + /** + * @brief Applies the aggregation function on the chunk starting at *src* and spanning *chunk_size_b* bytes, + * while applying the bit string stored in *masks*. The result is written to main memory. + * + * @param dest Pointer to the start of the result chunk + * @param src Pointer to the start of the source chunk + * @param masks Pointer the bitstring that marks the values that should be aggregated + * @param chunk_size_b Size of the source chunk in bytes + * @return true When the aggregation is done + * @return false Never + */ + static bool apply_masked (base_t *dest, base_t *src, uint16_t* msks, size_t chunk_size_b) { + constexpr size_t lanes = VECTOR_SIZE(); + uint8_t* masks = (uint8_t *)msks; + size_t value_count = chunk_size_b / sizeof(base_t); + __m512i agg_vec = func::zero(); + size_t i = 0; + // stop before! running out of space + if(value_count >= lanes) // keep in mind size_w is unsigned so if it becomes negative, it doesn't. + for(; i <= value_count - lanes; i += lanes) { + __m512i vec = Vector_Loader::load(src + i); + __mmask8 mask = _mm512_int2mask(masks[i / lanes]); + + agg_vec = func::simd_mask_agg(agg_vec, mask, vec); + } + *dest = func::simd_reduce(agg_vec); + + for(; i < value_count; ++i) { + uint8_t mask = masks[i / lanes]; + if(mask & (0b1 << (i % lanes))){ + *dest = func::scalar_agg(*dest, src[i]); + } + } + + return true; + } + + /** + * @brief Applies the aggregation function on the chunk starting at *src* and spanning *chunk_size_b* bytes, + * while applying the bit string stored in *masks*. The values are agggegated in the register *dest* without + * clearing beforehand. + * + * NOTE! This function only works correctly if the the chunk_size_b is a multiple of 64 byte + * + * @param dest Vector register used for storing and passing the result around + * @param src Pointer to the start of the source chunk + * @param masks Pointer the bitstring that marks the values that should be aggregated + * @param chunk_size_b Size of the source chunk in bytes + * @return __m512i Vector register holding the aggregation result + */ + static __m512i apply_masked (__m512i dest, base_t *src, uint16_t* msks, size_t chunk_size_b) { + constexpr size_t lanes = VECTOR_SIZE(); + uint8_t* masks = (uint8_t*) msks; + //TODO this function does not work if value_count % lanes != 0 + size_t value_count = chunk_size_b / sizeof(base_t); + size_t i = 0; + // stop before! running out of space + if(value_count >= lanes) // keep in mind size_w is unsigned so if it becomes negative, it doesn't. + for(; i <= value_count - lanes; i += lanes) { + __m512i vec = Vector_Loader::load(src + i); + __mmask8 mask = _mm512_int2mask(masks[i / lanes]); + dest = func::simd_agg(dest, mask, vec); + } + + return dest; + } + + /** + * @brief Applies the aggregation function on the chunk starting at *src* and spanning *chunk_size_b* bytes, + * while applying two bit strings stored in *masks_0* and *masks_1*. The values are aggregated in the register + * *dest* without clearing beforehand. + * + * NOTE! 
This function only works correctly if the the chunk_size_b is a multiple of 64 byte + * + * @param dest Vector register used for storing and passing the result around + * @param src Pointer to the start of the source chunk + * @param masks_0 Pointer the bitstring that marks the values that should be aggregated + * @param masks_1 Pointer the bitstring that marks the values that should be aggregated + * @param chunk_size_b Size of the source chunk in bytes + * @return __m512i Vector register holding the aggregation result + */ + static __m512i apply_masked (__m512i dest, base_t *src, uint16_t* msks0, uint16_t* msks1, size_t chunk_size_b) { + constexpr size_t lanes = VECTOR_SIZE(); + uint8_t* masks0 = (uint8_t*) msks0; + uint8_t* masks1 = (uint8_t*) msks1; + //TODO this function does not work if value_count % lanes != 0 + size_t value_count = chunk_size_b / sizeof(base_t); + size_t i = 0; + // stop before! running out of space + if(value_count >= lanes) // keep in mind value_count is unsigned so if it becomes negative, it doesn't. + for(; i <= value_count - lanes; i += lanes) { + __m512i vec = Vector_Loader::load(src + i); + __mmask8 mask0 = _mm512_int2mask(masks0[i / lanes]); + __mmask8 mask1 = _mm512_int2mask(masks1[i / lanes]); + + mask0 = _kand_mask8(mask0, mask1); + dest = func::simd_agg(dest, mask0, vec); + } + + return dest; + } + + /** + * @brief Reduces a vector by applying the aggregation function horizontally. + * + * @param dest Result of the horizontal aggregation + * @param src Vector as source for the horizontal aggregation + * @return true When the operation is done + * @return false Never + */ + static bool happly (base_t *dest, __m512i src) { + *dest = func::simd_reduce(src); + + return true; + } + + static __m512i get_zero() { + return func::zero(); + } +}; \ No newline at end of file diff --git a/qdp_project/src/algorithm/operators/filter.h b/qdp_project/src/algorithm/operators/filter.h new file mode 100644 index 0000000..a58a761 --- /dev/null +++ b/qdp_project/src/algorithm/operators/filter.h @@ -0,0 +1,170 @@ +#pragma once + +#include +#include + +#include + +#include "vector_loader.h" + +/** + * @brief Super Class for all Aggregation functions. Guards Sub Classes from having an non integral base type. + * + * @tparam T An integral datatype + */ +template +class FilterFunction { + static_assert(std::is_integral::value, "The base type of a FilterFunction must be an integeral."); +}; + +/** + * @brief Template class that implements methods used for finding values that are not equal to the compare value. + * It wraps the corresponding vector intrinsics. 
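+ *
+ * e.g. for T = uint64_t one call covers eight values at once:
+ *   __mmask16 m = NEQ<uint64_t>::simd_filter(vec, _mm512_set1_epi64(7));
+ *   bit i of m is set exactly when 64-bit lane i of vec differs from 7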
+ * + * @tparam T base datatype for the implemented methods + */ +template +class NEQ : public FilterFunction { +public: + static inline __mmask16 simd_filter(__m512i vector, __m512i comp) { + if constexpr (sizeof(T) == 4) return _mm512_cmpneq_epi32_mask(vector, comp); + else if constexpr (sizeof(T) == 8) return _mm512_cmpneq_epi64_mask(vector, comp); + static_assert(sizeof(T) == 4 || sizeof(T) == 8, "NEQ is only implemented for 32 and 64 wide integers"); + } + + static inline bool scalar_filter(T scalar, T comp) { return scalar != comp; } +}; + +template +class EQ : public FilterFunction { +public: + static inline __mmask16 simd_filter(__m512i vector, __m512i comp) { + if constexpr (sizeof(T) == 4) return _mm512_cmpeq_epi32_mask(vector, comp); + else if constexpr (sizeof(T) == 8) return _mm512_cmpeq_epi64_mask(vector, comp); + static_assert(sizeof(T) == 4 || sizeof(T) == 8, "EQ is only implemented for 32 and 64 wide integers"); + } + + static inline bool scalar_filter(T scalar, T comp) { return scalar == comp; } +}; + +template +class LT : public FilterFunction { +public: + static inline __mmask16 simd_filter(__m512i vector, __m512i comp) { + if constexpr (sizeof(T) == 4) return _mm512_cmplt_epi32_mask(vector, comp); + else if constexpr (sizeof(T) == 8) return _mm512_cmplt_epi64_mask(vector, comp); + static_assert(sizeof(T) == 4 || sizeof(T) == 8, "LT is only implemented for 32 and 64 wide integers"); + } + + static inline bool scalar_filter(T scalar, T comp) { return scalar < comp; } +}; + +template +class LEQ : public FilterFunction { +public: + static inline __mmask16 simd_filter(__m512i vector, __m512i comp) { + if constexpr (sizeof(T) == 4) return _mm512_cmple_epi32_mask(vector, comp); + else if constexpr (sizeof(T) == 8) return _mm512_cmple_epi64_mask(vector, comp); + static_assert(sizeof(T) == 4 || sizeof(T) == 8, "LEQ is only implemented for 32 and 64 wide integers"); + } + + static inline bool scalar_filter(T scalar, T comp) { return scalar <= comp; } +}; + +template +class GT : public FilterFunction { +public: + static inline __mmask16 simd_filter(__m512i vector, __m512i comp) { + if constexpr (sizeof(T) == 4) return _mm512_cmpgt_epi32_mask(vector, comp); + else if constexpr (sizeof(T) == 8) return _mm512_cmpgt_epi64_mask(vector, comp); + static_assert(sizeof(T) == 4 || sizeof(T) == 8, "GT is only implemented for 32 and 64 wide integers"); + } + + static inline bool scalar_filter(T scalar, T comp) { return scalar > comp; } +}; + +template +class GEQ : public FilterFunction { +public: + static inline __mmask16 simd_filter(__m512i vector, __m512i comp) { + if constexpr (sizeof(T) == 4) return _mm512_cmpge_epi32_mask(vector, comp); + else if constexpr (sizeof(T) == 8) return _mm512_cmpge_epi64_mask(vector, comp); + static_assert(sizeof(T) == 4 || sizeof(T) == 8, "GEQ is only implemented for 32 and 64 wide integers"); + } + + static inline bool scalar_filter(T scalar, T comp) { return scalar >= comp; } +}; + + +template class func, load_mode load_mode, bool copy> +class Filter { +public: + + static_assert(std::is_same_v, "We enforce 64 bit integer"); + + /** + * @brief Calculates the memory maximal needed to store a chunk's processing result. 
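+ *
+ * (e.g. a 2 MiB chunk of uint64_t holds 262144 values, so the resulting
+ * bit mask takes (262144 + 7) / 8 = 32768 bytes)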
+ * + * @param chunk_size_b Size of the chunk in byte + * @return size_t Size of the chunk's processing result in byte + */ + static size_t result_bytes_per_chunk(size_t chunk_size_b) { + // + 7 to enshure that we have enougth bytes -> / 8 -> rounds down + // if we had 17 / 8 = 2 but (17 + 7) / 8 = 3 + // if we hat 16 / 8 = 2 is right, as well as, 16 + 7 / 8 = 2 + return (chunk_size_b / sizeof(base_t) + 7) / 8; + } + + + /** + * @brief Applies the filter function on the chunk starting at *src* and spanning *chunk_size_b* bytes, while comparing with he same value every time. + * The resulting bit string is written to main memory. + * + * @param dest Pointer to the start of the result chunk + * @param src Pointer to the start of the source chunk + * @param cmp_value Comparision value to compare the values from source to + * @param chunk_size_b Size of the source chunk in bytes + * @return true When the filter operation is done + * @return false Never + */ + // we only need this impl. yet, as all filter are at the end of a pipeline + static bool apply_same (uint16_t *dst, base_t *buffer, base_t *src, base_t cmp_value, size_t chunk_size_b) { + constexpr uint32_t lanes = VECTOR_SIZE(); + uint8_t* dest = (uint8_t*) dst; + size_t value_count = chunk_size_b / sizeof(base_t); + __m512i cmp_vec = _mm512_set1_epi64(cmp_value); + size_t i = 0; + // this weird implementetion is neccessary, see analogous impl in aggregation for explaination + if(value_count > lanes) { + for(; (i < value_count - lanes); i += lanes) { + __m512i vec = Vector_Loader::load(src + i); + __mmask8 bitmask = func::simd_filter(vec, cmp_vec); + + uint8_t int_mask = (uint8_t) _mm512_mask2int(bitmask); + + dest[i / lanes] = int_mask; + if constexpr(copy){ + Vector_Loader::store(buffer + i, vec); + } + } + } + + auto dest_pos = i / lanes; + uint8_t int_mask = 0; + for(; i < value_count; ++i) { + base_t val = src[i]; + + uint8_t result = func::scalar_filter(val, cmp_value); + + int_mask |= (result << (i % lanes)); + + if constexpr(copy){ + buffer[i] = val; + } + } + dest[dest_pos] = int_mask; + + return true; + } + +}; \ No newline at end of file diff --git a/qdp_project/src/benchmark/DIMES_benchmark.cpp b/qdp_project/src/benchmark/DIMES_benchmark.cpp new file mode 100644 index 0000000..2ca9705 --- /dev/null +++ b/qdp_project/src/benchmark/DIMES_benchmark.cpp @@ -0,0 +1,240 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#ifndef THREAD_GROUP_MULTIPLIER +#define THREAD_GROUP_MULTIPLIER 8 +#endif + +#ifndef QUERY +#define QUERY 1 +#endif + +#ifndef BARRIER_MODE +#define BARRIER_MODE "global" +#endif + +#ifndef BUFFER_LIMIT +#define BUFFER_LIMIT 1 +#endif + +#include "const.h" + +#include "file_output.h" +#include "array_utils.h" +#include "timer_utils.h" +#include "barrier_utils.h" +#include "cpu_set_utils.h" +#include "iterable_range.h" +#include "memory_literals.h" +#include "pipelines/DIMES_scan_filter_pipe.h" + +#include "aggregation.h" +#include "filter.h" + +using base_t = uint64_t; + +base_t sum_check(base_t compare_value, base_t* row_A, base_t* row_B, size_t row_size) { + base_t sum = 0; + for(int i = 0; i < row_size / sizeof(base_t); ++i) { + sum += (row_A[i] < compare_value) * row_B[i]; + } + return sum; +} + +base_t sum_check_complex(base_t compare_value_a, base_t compare_value_b, base_t* row_A, base_t* row_B, size_t row_size) { + base_t sum = 0; + for(int i = 0; i < row_size / sizeof(base_t); ++i) 
{ + sum += (row_A[i] < compare_value_a && row_B[i] < compare_value_b) * row_B[i]; + } + return sum; +} + +int main(int argc, char** argv) { + // set constants + const size_t workload_b = 4_GiB; + const base_t compare_value_a = 50; + const base_t compare_value_b = 42; + constexpr bool simple_query = (QUERY == 1); + + const size_t thread_count = 6; + std::ofstream out_file; + out_file.open("../results/dimes_" + "q-" + (std::string)(simple_query == true ? "simple" : "complex") + + "_bm-" + (std::string) BARRIER_MODE + + "_bl-" + (std::string)(BUFFER_LIMIT == 1 ? "limited" : "unlimited") + + "_tc-" + std::to_string(thread_count * THREAD_GROUP_MULTIPLIER) + ".csv"); + + // set benchmark parameter + Linear_Int_Range run("run"); + Exp_Int_Range chunk_size("chunk_size"); + Range mode("mode"); + + uint32_t remote_node = 3; + uint32_t remote_node_2 = 2; + uint32_t local_node = 10; + + print_to_file(out_file, generateHead(run, chunk_size, mode), "thread_group", "time", + #ifdef THREAD_TIMINGS + "scan_a", "scan_b", "aggr_j", + #endif + #ifdef BARRIER_TIMINGS + "wait_scan_a", "wait_scan_b", "wait_aggr_j", + #endif + "result"); + + + /*** alloc data and buffers ************************************************/ + base_t* data_a = (base_t*) numa_alloc_onnode(workload_b, remote_node); + base_t* data_b = (base_t*) numa_alloc_onnode(workload_b, remote_node_2); + base_t* data_a_hbm = (base_t*) numa_alloc_onnode(workload_b, local_node); + base_t* data_b_hbm = (base_t*) numa_alloc_onnode(workload_b, local_node); + fill_mt(data_a, workload_b, 0, 100, 42); + fill_mt(data_b, workload_b, 0, 100, 420); + std::memcpy(data_a_hbm, data_a, workload_b); + std::memcpy(data_b_hbm, data_b, workload_b); + base_t* results = (base_t*) numa_alloc_onnode(THREAD_GROUP_MULTIPLIER * thread_count * sizeof(base_t), remote_node); + + std::ofstream check_file; + check_file.open("../results/dimes_" + "q-" + (std::string)(simple_query == true ? "simple" : "complex") + + "_bm-" + (std::string) BARRIER_MODE + + "_bl-" + (std::string)(BUFFER_LIMIT == 1 ? "limited" : "unlimited") + + "_tc-" + std::to_string(thread_count * THREAD_GROUP_MULTIPLIER) + ".checksum"); + if constexpr (QUERY == 1) { + //calculate simple checksum if QUERY == 1 -> simple query is applied + check_file << sum_check(compare_value_a, data_a, data_b, workload_b); + } else { + check_file << sum_check_complex(compare_value_a, compare_value_b, data_a, data_b, workload_b); + } + check_file.close(); + + std::string iteration("init"); + Query_Wrapper* qw = nullptr; + while(iteration != "false") { + + std::promise p; + std::shared_future ready_future(p.get_future()); + + if(iteration != "run") { + + if(qw != nullptr) { + delete qw; + } + + std::cout << "Changing to mode " << mode.current << " chunksize " << chunk_size.current << std::endl; + + uint8_t tc_filter = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, SCAN_A); + uint8_t tc_copy = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, SCAN_B); + uint8_t tc_agg = new_mode_manager::thread_count(simple_query ? 
SIMPLE_Q : COMPLEX_Q, mode.current, AGGR_J); + switch(mode.current) { + case NewPMode::DRAM_base: + qw = new Query_Wrapper(&ready_future, workload_b, chunk_size.current, data_a, data_b, results, local_node, remote_node, + tc_filter, tc_copy, tc_agg, mode.current, THREAD_GROUP_MULTIPLIER, (base_t) 50, (base_t) 42, true); + break; + case NewPMode::HBM_base: + qw = new Query_Wrapper(&ready_future, workload_b, chunk_size.current, data_a_hbm, data_b_hbm, results, local_node, remote_node, + tc_filter, tc_copy, tc_agg, mode.current, THREAD_GROUP_MULTIPLIER, (base_t) 50, (base_t) 42, true); + break; + case NewPMode::Mixed_base: + qw = new Query_Wrapper(&ready_future, workload_b, chunk_size.current, data_a, data_b_hbm, results, local_node, remote_node, + tc_filter, tc_copy, tc_agg, mode.current, THREAD_GROUP_MULTIPLIER, (base_t) 50, (base_t) 42, true); + break; + case NewPMode::Prefetch: + qw = new Query_Wrapper(&ready_future, workload_b, chunk_size.current, data_a, data_b, results, local_node, remote_node, + tc_filter, tc_copy, tc_agg, mode.current, THREAD_GROUP_MULTIPLIER, (base_t) 50, (base_t) 42, false); + break; + } + } + + qw->ready_future = &ready_future; + qw->clear_buffers(); + + auto filter_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_a(gid, gcnt, tid); }; + auto copy_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_b(gid, gcnt, tid); }; + auto aggregation_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->aggr_j(gid, gcnt, tid); }; + + std::vector filter_pool; + std::vector copy_pool; + std::vector agg_pool; + + uint8_t tc_filter = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, SCAN_A); + uint8_t tc_copy = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, SCAN_B); + uint8_t tc_agg = new_mode_manager::thread_count(simple_query ? 
SIMPLE_Q : COMPLEX_Q, mode.current, AGGR_J); + + int thread_id = 0; + // std::vector> pinning_ranges {std::make_pair(28, 42), std::make_pair(84, 98)}; // node 2 heacboehm II + //std::vector> pinning_ranges {std::make_pair(32, 48), std::make_pair(96, 112)}; // node 2 heacboehm + //std::vector> pinning_ranges {std::make_pair(24, 36), std::make_pair(120, 132)}; // node 2 sapphire rapids + //std::vector> pinning_ranges {std::make_pair(24, 48)}; // node 2+3 sapphire rapids + std::vector> pinning_ranges {std::make_pair(0, 48)}; // node 0-3 sapphire rapids + + for(uint32_t gid = 0; gid < THREAD_GROUP_MULTIPLIER; ++gid) { + + for(uint32_t tid = 0; tid < tc_filter; ++tid) { + filter_pool.emplace_back(filter_lambda, gid, THREAD_GROUP_MULTIPLIER, tid); + pin_thread_in_range(filter_pool.back(), thread_id++, pinning_ranges); + } + + // if tc_copy == 0 this loop is skipped + for(uint32_t tid = 0; tid < tc_copy; ++tid) { + copy_pool.emplace_back(copy_lambda, gid, THREAD_GROUP_MULTIPLIER, tid); + pin_thread_in_range(copy_pool.back(), thread_id++, pinning_ranges); + } + + for(uint32_t tid = 0; tid < tc_agg; ++tid) { + agg_pool.emplace_back(aggregation_lambda, gid, THREAD_GROUP_MULTIPLIER, tid); + pin_thread_in_range(agg_pool.back(), thread_id++, pinning_ranges); + } + } + + auto start = std::chrono::steady_clock::now(); + p.set_value(); + + for(std::thread& t : filter_pool) { t.join(); } + for(std::thread& t : copy_pool) { t.join(); } + for(std::thread& t : agg_pool) { t.join(); } + + Aggregation::apply(results, results, sizeof(base_t) * tc_agg * THREAD_GROUP_MULTIPLIER); + auto end = std::chrono::steady_clock::now(); + + constexpr double nanos_per_second = ((double)1000) * 1000 * 1000; + uint64_t nanos = std::chrono::duration_cast(end - start).count(); + double seconds = (double)(nanos) / nanos_per_second; + + + print_to_file(out_file, run, chunk_size, new_mode_manager::string(mode.current), THREAD_GROUP_MULTIPLIER, seconds, + #ifdef THREAD_TIMINGS + qw->trt->summarize_time(0), qw->trt->summarize_time(1), qw->trt->summarize_time(2), + #endif + #ifdef BARRIER_TIMINGS + qw->bt->summarize_time(0), qw->bt->summarize_time(1), qw->bt->summarize_time(2), + #endif + results[0]); + + + iteration = IterateOnce(run, chunk_size, mode); + } + + numa_free(data_b_hbm, workload_b); + numa_free(data_a, workload_b); + numa_free(data_b, workload_b); + + numa_free(results, THREAD_GROUP_MULTIPLIER * thread_count * sizeof(base_t)); + +} \ No newline at end of file diff --git a/qdp_project/src/benchmark/DIMES_cores_benchmark.cpp b/qdp_project/src/benchmark/DIMES_cores_benchmark.cpp new file mode 100644 index 0000000..93c6b1b --- /dev/null +++ b/qdp_project/src/benchmark/DIMES_cores_benchmark.cpp @@ -0,0 +1,260 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#ifndef QUERY +#define QUERY 1 +#endif + +#ifndef BARRIER_MODE +#define BARRIER_MODE "global" +#endif + +#define BUFFER_LIMIT 0 + +#include "const.h" + +#include "file_output.h" +#include "array_utils.h" +#include "timer_utils.h" +#include "barrier_utils.h" +#include "cpu_set_utils.h" +#include "iterable_range.h" +#include "memory_literals.h" +#include "pipelines/DIMES_scan_filter_pipe.h" + +#include "aggregation.h" +#include "filter.h" + +using base_t = uint64_t; + +base_t sum_check(base_t compare_value, base_t* row_A, base_t* row_B, size_t row_size) { + base_t sum = 0; + for(int i = 0; i < row_size / sizeof(base_t); ++i) { + sum += (row_A[i] < 
compare_value) * row_B[i]; + } + return sum; +} + +base_t sum_check_complex(base_t compare_value_a, base_t compare_value_b, base_t* row_A, base_t* row_B, size_t row_size) { + base_t sum = 0; + for(int i = 0; i < row_size / sizeof(base_t); ++i) { + sum += (row_A[i] < compare_value_a && row_B[i] < compare_value_b) * row_B[i]; + } + return sum; +} + + +int main(int argc, char** argv) { + // set constants + const size_t workload_b = 4_GiB; + const size_t chunk_size = 2_MiB; + const base_t compare_value_a = 50; + const base_t compare_value_b = 42; + constexpr bool simple_query = (QUERY == 1); + + + std::ofstream out_file; + out_file.open("../results/dimes_cores_" + "q-" + (std::string)(simple_query == true ? "simple" : "complex") + + "_bm-" + (std::string) BARRIER_MODE + + "_bl-" + (std::string)(BUFFER_LIMIT == 1 ? "limited" : "unlimited") + + ".csv"); + + // set benchmark parameter + Linear_Int_Range run("run"); + + Exp_Int_Range scan_a_thread("scan_a_tc"); + Exp_Int_Range scan_b_thread("scan_b_tc"); + Exp_Int_Range aggr_j_thread("aggr_j_tc"); + Linear_Int_Range thread_group_count("thread_group_c"); + Range mode("mode"); + + uint32_t remote_node = 1; + uint32_t remote_node_2 = 0;//on heacboehm II: node 0 is two hops away from node 2 -> prefetching is more beneficial + uint32_t local_node = 2; + + print_to_file(out_file, generateHead(run, thread_group_count, mode, scan_a_thread, scan_b_thread, aggr_j_thread), + "time", + #ifdef THREAD_TIMINGS + "scan_a", "scan_b", "aggr_j", + #endif + #ifdef BARRIER_TIMINGS + "wait_scan_a", "wait_scan_b", "wait_aggr_j", + #endif + "result"); + + + /*** alloc data and buffers ************************************************/ + base_t* data_a = (base_t*) numa_alloc_onnode(workload_b, remote_node); + base_t* data_b = (base_t*) numa_alloc_onnode(workload_b, remote_node_2); + base_t* data_a_hbm = (base_t*) numa_alloc_onnode(workload_b, local_node); + base_t* data_b_hbm = (base_t*) numa_alloc_onnode(workload_b, local_node); + fill_mt(data_a, workload_b, 0, 100, 42); + fill_mt(data_b, workload_b, 0, 100, 420); + std::memcpy(data_a_hbm, data_a, workload_b); + std::memcpy(data_b_hbm, data_b, workload_b); + base_t* results = (base_t*) numa_alloc_onnode(thread_group_count.max * aggr_j_thread.max * sizeof(base_t), remote_node); + + std::ofstream check_file; + check_file.open("../results/dimes_cores_" + "q-" + (std::string)(simple_query == true ? "simple" : "complex") + + "_bm-" + (std::string) BARRIER_MODE + + "_bl-" + (std::string)(BUFFER_LIMIT == 1 ? 
"limited" : "unlimited") + + ".checksum"); + if constexpr (QUERY == 1) { + //calculate simple checksum if QUERY == 1 -> simple query is applied + check_file << sum_check(compare_value_a, data_a, data_b, workload_b); + } else { + check_file << sum_check_complex(compare_value_a, compare_value_b, data_a, data_b, workload_b); + } + check_file.close(); + + std::string iteration("init"); + Query_Wrapper* qw = nullptr; + while(iteration != "false") { + + std::promise p; + std::shared_future ready_future(p.get_future()); + + // skipping iteration through scan_b_thread while not used + while(simple_query && mode.current != NewPMode::Prefetch && scan_b_thread.current != 1) { + iteration = IterateOnce(run, thread_group_count, mode, scan_a_thread, scan_b_thread, aggr_j_thread); + } + + if(iteration != "run") { + std::cout << "Changing to mode " << mode.current + << " thread_group_count " << thread_group_count.current + << " thread_ratio " << scan_a_thread.current <<":"<< scan_b_thread.current <<":"<< aggr_j_thread.current + << std::endl; + + if(qw != nullptr) { + if (iteration == thread_group_count.label) { + + } else { + delete qw; + + uint32_t sat = scan_a_thread.current; + uint32_t sbt = simple_query && mode.current != NewPMode::Prefetch ? 0 : scan_b_thread.current; + uint32_t ajt = aggr_j_thread.current; + + switch(mode.current) { + case NewPMode::DRAM_base: + qw = new Query_Wrapper(&ready_future, workload_b, chunk_size, data_a, data_b, results, local_node, remote_node, + sat, sbt, ajt, mode.current, thread_group_count.current, (base_t) 50, (base_t) 42, true); + break; + case NewPMode::HBM_base: + qw = new Query_Wrapper(&ready_future, workload_b, chunk_size, data_a_hbm, data_b_hbm, results, local_node, remote_node, + sat, sbt, ajt, mode.current, thread_group_count.current, (base_t) 50, (base_t) 42, true); + break; + case NewPMode::Mixed_base: + qw = new Query_Wrapper(&ready_future, workload_b, chunk_size, data_a, data_b_hbm, results, local_node, remote_node, + sat, sbt, ajt, mode.current, thread_group_count.current, (base_t) 50, (base_t) 42, true); + break; + case NewPMode::Prefetch: + qw = new Query_Wrapper(&ready_future, workload_b, chunk_size, data_a, data_b, results, local_node, remote_node, + sat, sbt, ajt, mode.current, thread_group_count.current, (base_t) 50, (base_t) 42, false); + break; + } + } + } + } + + qw->ready_future = &ready_future; + qw->clear_buffers(); + + auto filter_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_a(gid, gcnt, tid); }; + auto copy_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_b(gid, gcnt, tid); }; + auto aggregation_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->aggr_j(gid, gcnt, tid); }; + + std::vector filter_pool; + std::vector copy_pool; + std::vector agg_pool; + + uint8_t tc_filter = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, SCAN_A); + uint8_t tc_copy = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, SCAN_B); + uint8_t tc_agg = new_mode_manager::thread_count(simple_query ? 
SIMPLE_Q : COMPLEX_Q, mode.current, AGGR_J); + + int thread_id = 0; + // std::vector> pinning_ranges {std::make_pair(28, 42), std::make_pair(84, 98)}; // node 2 heacboehm II + std::vector> pinning_ranges {std::make_pair(32, 48), std::make_pair(96, 112)}; // node 2 heacboehm + + for(uint32_t gid = 0; gid < thread_group_count.current; ++gid) { + + for(uint32_t tid = 0; tid < tc_filter; ++tid) { + filter_pool.emplace_back(filter_lambda, gid, thread_group_count.current, tid); + pin_thread_in_range(filter_pool.back(), thread_id++, pinning_ranges); + } + + // if tc_copy == 0 this loop is skipped + for(uint32_t tid = 0; tid < tc_copy; ++tid) { + copy_pool.emplace_back(copy_lambda, gid, thread_group_count.current, tid); + pin_thread_in_range(copy_pool.back(), thread_id++, pinning_ranges); + } + + for(uint32_t tid = 0; tid < tc_agg; ++tid) { + agg_pool.emplace_back(aggregation_lambda, gid, thread_group_count.current, tid); + pin_thread_in_range(agg_pool.back(), thread_id++, pinning_ranges); + } + } + + auto start = std::chrono::steady_clock::now(); + p.set_value(); + + for(std::thread& t : filter_pool) { t.join(); } + for(std::thread& t : copy_pool) { t.join(); } + for(std::thread& t : agg_pool) { t.join(); } + + Aggregation::apply(results, results, sizeof(base_t) * tc_agg * thread_group_count.current); + auto end = std::chrono::steady_clock::now(); + + constexpr double nanos_per_second = ((double)1000) * 1000 * 1000; + uint64_t nanos = std::chrono::duration_cast(end - start).count(); + double seconds = (double)(nanos) / nanos_per_second; + +print_to_file(out_file, generateHead(run, thread_group_count, mode, scan_a_thread, scan_b_thread, aggr_j_thread), + "time", + #ifdef THREAD_TIMINGS + "scan_a", "scan_b", "aggr_j", + #endif + #ifdef BARRIER_TIMINGS + "wait_scan_a", "wait_scan_b", "wait_aggr_j", + #endif + "result"); + + print_to_file(out_file, run, thread_group_count.current, new_mode_manager::string(mode.current), scan_a_thread, + (simple_query && mode.current != NewPMode::Prefetch ? 
0 : scan_b_thread.current), + aggr_j_thread, seconds, + #ifdef THREAD_TIMINGS + qw->trt->summarize_time(0), qw->trt->summarize_time(1), qw->trt->summarize_time(2), + #endif + #ifdef BARRIER_TIMINGS + qw->bt->summarize_time(0), qw->bt->summarize_time(1), qw->bt->summarize_time(2), + #endif + results[0]); + + iteration = IterateOnce(run, thread_group_count, mode, scan_a_thread, scan_b_thread, aggr_j_thread); + } + + numa_free(data_b_hbm, workload_b); + numa_free(data_a, workload_b); + numa_free(data_b, workload_b); + + numa_free(results, thread_group_count.max * aggr_j_thread.max * sizeof(base_t)); + +} \ No newline at end of file diff --git a/qdp_project/src/benchmark/MAX_benchmark.cpp b/qdp_project/src/benchmark/MAX_benchmark.cpp new file mode 100644 index 0000000..fb50f5a --- /dev/null +++ b/qdp_project/src/benchmark/MAX_benchmark.cpp @@ -0,0 +1,289 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#ifndef THREAD_GROUP_MULTIPLIER +#define THREAD_GROUP_MULTIPLIER 2 +#endif + +#ifndef QUERY +#define QUERY 1 +#endif + +#ifndef BARRIER_MODE +#define BARRIER_MODE "global" +#endif + +#ifndef BUFFER_LIMIT +#define BUFFER_LIMIT 1 +#endif + +#ifndef PINNING +#define PINNING 1 +#endif + +#ifndef PCM_M +#define PCM_M 0 +#endif + +#if PCM_M == 1 +#include "pcm.h" +#endif + +#include "const.h" + +#include "file_output.h" +#include "array_utils.h" +#include "timer_utils.h" +#include "barrier_utils.h" +#include "measurement_utils.h" +#include "cpu_set_utils.h" +#include "iterable_range.h" +#include "memory_literals.h" +#include "pipelines/MAX_scan_filter_pipe.h" + +#include "aggregation.h" +#include "filter.h" + +using base_t = uint64_t; + +base_t sum_check(base_t compare_value, base_t* row_A, base_t* row_B, size_t row_size) { + base_t sum = 0; + for(int i = 0; i < row_size / sizeof(base_t); ++i) { + sum += (row_A[i] < compare_value) * row_B[i]; + } + return sum; +} + +base_t sum_check_complex(base_t compare_value_a, base_t compare_value_b, base_t* row_A, base_t* row_B, size_t row_size) { + base_t sum = 0; + for(int i = 0; i < row_size / sizeof(base_t); ++i) { + sum += (row_A[i] < compare_value_a && row_B[i] < compare_value_b) * row_B[i]; + } + return sum; +} + +int main(int argc, char** argv) { +#if PCM == 1 + pcm::PCM *pcm = pcm::PCM::getInstance(); + //and check for errors + auto error_code = pcm->program(); + if(error_code != pcm::PCM::Success) { + std::cerr << "PCM couldn't start" << std::endl; + std::cerr << "Error code: " << error_code << std::endl; + std::cerr << "Try to execute 'sudo modprobe msr' and execute this program with root privigeges."; + return 1; + } +#endif + + // set constants + const size_t workload_b = 2_GiB; + const base_t compare_value_a = 50; + const base_t compare_value_b = 42; + constexpr bool simple_query = (QUERY == 1); + + const size_t thread_count = 6; + std::ofstream out_file; + out_file.open("../results/max_" + "q-" + (std::string)(simple_query == true ? "simple" : "complex") + + "_bm-" + (std::string) BARRIER_MODE + + "_bl-" + (std::string)(BUFFER_LIMIT == 1 ? 
"limited" : "unlimited") + + "_tc-" + std::to_string(thread_count * THREAD_GROUP_MULTIPLIER) + "1MiB-2MiB.csv"); + + // set benchmark parameter + Linear_Int_Range run("run"); + constexpr size_t chunk_min = 1_MiB; constexpr size_t chunk_max = 8_MiB + 1; constexpr size_t chunk_incr = 128_kiB; + Linear_Int_Range chunk_size("chunk_size"); + Range mode("mode"); + + uint32_t remote_node = 2; + uint32_t remote_node_2 = 2; + uint32_t local_node = 10; + + /*uint32_t remote_node = 6; + uint32_t remote_node_2 = 6; + uint32_t local_node = 2;*/ + + print_to_file(out_file, generateHead(run, chunk_size, mode), "thread_group", "time", + #ifdef THREAD_TIMINGS + "scan_a", "scan_b", "aggr_j", + #endif + #ifdef BARRIER_TIMINGS + "wait_scan_a", "wait_scan_b", "wait_aggr_j", + #endif + #if PCM == 1 + pcm_value_collector::getHead("scan_a"), + pcm_value_collector::getHead("scan_b"), + pcm_value_collector::getHead("aggr_j"), + #endif + "result"); + + + /*** alloc data and buffers ************************************************/ + base_t* data_a = (base_t*) numa_alloc_onnode(workload_b, remote_node); + base_t* data_b = (base_t*) numa_alloc_onnode(workload_b, remote_node_2); + base_t* data_a_hbm = (base_t*) numa_alloc_onnode(workload_b, local_node); + base_t* data_b_hbm = (base_t*) numa_alloc_onnode(workload_b, local_node); + fill_mt(data_a, workload_b, 0, 100, 42); + fill_mt(data_b, workload_b, 0, 100, 420); + std::memcpy(data_a_hbm, data_a, workload_b); + std::memcpy(data_b_hbm, data_b, workload_b); + base_t* results = (base_t*) numa_alloc_onnode(THREAD_GROUP_MULTIPLIER * thread_count * sizeof(base_t), remote_node); + + std::ofstream check_file; + check_file.open("../results/max_" + "q-" + (std::string)(simple_query == true ? "simple" : "complex") + + "_bm-" + (std::string) BARRIER_MODE + + "_bl-" + (std::string)(BUFFER_LIMIT == 1 ? "limited" : "unlimited") + + "_tc-" + std::to_string(thread_count * THREAD_GROUP_MULTIPLIER) + ".checksum"); + if constexpr (QUERY == 1) { + //calculate simple checksum if QUERY == 1 -> simple query is applied + check_file << sum_check(compare_value_a, data_a, data_b, workload_b); + } else { + check_file << sum_check_complex(compare_value_a, compare_value_b, data_a, data_b, workload_b); + } + check_file.close(); + + std::string iteration("init"); + Query_Wrapper* qw = nullptr; + while(iteration != "false") { + + std::promise p; + std::shared_future ready_future(p.get_future()); + + if(iteration != "run") { + + if(qw != nullptr) { + delete qw; + } + uint8_t tc_filter = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, SCAN_A); + uint8_t tc_copy = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, SCAN_B); + uint8_t tc_agg = new_mode_manager::thread_count(simple_query ? 
SIMPLE_Q : COMPLEX_Q, mode.current, AGGR_J); + switch(mode.current) { + case NewPMode::DRAM_base: + qw = new Query_Wrapper(&ready_future, workload_b, chunk_size.current, data_a, data_b, results, local_node, remote_node, + tc_filter, tc_copy, tc_agg, mode.current, THREAD_GROUP_MULTIPLIER, (base_t) 50, (base_t) 42, true); + break; + case NewPMode::HBM_base: + qw = new Query_Wrapper(&ready_future, workload_b, chunk_size.current, data_a_hbm, data_b_hbm, results, local_node, remote_node, + tc_filter, tc_copy, tc_agg, mode.current, THREAD_GROUP_MULTIPLIER, (base_t) 50, (base_t) 42, true); + break; + case NewPMode::Mixed_base: + qw = new Query_Wrapper(&ready_future, workload_b, chunk_size.current, data_a, data_b_hbm, results, local_node, remote_node, + tc_filter, tc_copy, tc_agg, mode.current, THREAD_GROUP_MULTIPLIER, (base_t) 50, (base_t) 42, true); + break; + case NewPMode::Prefetch: + qw = new Query_Wrapper(&ready_future, workload_b, chunk_size.current, data_a, data_b, results, local_node, remote_node, + tc_filter, tc_copy, tc_agg, mode.current, THREAD_GROUP_MULTIPLIER, (base_t) 50, (base_t) 42, false); + break; + } + } + + qw->ready_future = &ready_future; + qw->clear_buffers(); + + auto filter_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_a(gid, gcnt, tid); }; + auto copy_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_b(gid, gcnt, tid); }; + auto aggregation_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->aggr_j(gid, gcnt, tid); }; + + std::vector filter_pool; + std::vector copy_pool; + std::vector agg_pool; + + uint8_t tc_filter = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, SCAN_A); + uint8_t tc_copy = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, SCAN_B); + uint8_t tc_agg = new_mode_manager::thread_count(simple_query ? 
SIMPLE_Q : COMPLEX_Q, mode.current, AGGR_J); + + int thread_id = 0; + // std::vector> pinning_ranges {std::make_pair(28, 42), std::make_pair(84, 98)}; // node 2 heacboehm II + //std::vector> pinning_ranges {std::make_pair(32, 48), std::make_pair(96, 112)}; // node 2 heacboehm + std::vector> pinning_ranges {std::make_pair(24, 36), std::make_pair(120, 132)}; // node 2 sapphire rapids + //std::vector> pinning_ranges {std::make_pair(24, 48)}; // node 2+3 sapphire rapids + //std::vector> pinning_ranges {std::make_pair(0, 48)}; // node 0-3 sapphire rapids + + for(uint32_t gid = 0; gid < THREAD_GROUP_MULTIPLIER; ++gid) { + + for(uint32_t tid = 0; tid < tc_filter; ++tid) { + filter_pool.emplace_back(filter_lambda, gid, THREAD_GROUP_MULTIPLIER, tid); +#if PINNING + pin_thread_in_range(filter_pool.back(), thread_id++, pinning_ranges); +#else + pin_thread_in_range(filter_pool.back(), pinning_ranges); +#endif + } + + // if tc_copy == 0 this loop is skipped + for(uint32_t tid = 0; tid < tc_copy; ++tid) { + copy_pool.emplace_back(copy_lambda, gid, THREAD_GROUP_MULTIPLIER, tid); +#if PINNING + pin_thread_in_range(copy_pool.back(), thread_id++, pinning_ranges); +#else + pin_thread_in_range(copy_pool.back(), pinning_ranges); +#endif + } + + for(uint32_t tid = 0; tid < tc_agg; ++tid) { + agg_pool.emplace_back(aggregation_lambda, gid, THREAD_GROUP_MULTIPLIER, tid); +#if PINNING + pin_thread_in_range(agg_pool.back(), thread_id++, pinning_ranges); +#else + pin_thread_in_range(agg_pool.back(), pinning_ranges); +#endif + } + } + + auto start = std::chrono::steady_clock::now(); + p.set_value(); + + for(std::thread& t : filter_pool) { t.join(); } + for(std::thread& t : copy_pool) { t.join(); } + for(std::thread& t : agg_pool) { t.join(); } + + Aggregation::apply(results, results, sizeof(base_t) * tc_agg * THREAD_GROUP_MULTIPLIER); + auto end = std::chrono::steady_clock::now(); + + constexpr double nanos_per_second = ((double)1000) * 1000 * 1000; + uint64_t nanos = std::chrono::duration_cast(end - start).count(); + double seconds = (double)(nanos) / nanos_per_second; + + + + print_to_file(out_file, run, chunk_size, new_mode_manager::string(mode.current), THREAD_GROUP_MULTIPLIER, seconds, + #ifdef THREAD_TIMINGS + qw->trt->summarize_time(0), qw->trt->summarize_time(1), qw->trt->summarize_time(2), + #endif + #ifdef BARRIER_TIMINGS + qw->bt->summarize_time(0), qw->bt->summarize_time(1), qw->bt->summarize_time(2), + #endif + #if PCM == 1 + qw->pvc->summarize_as_string("scan_a"), + qw->pvc->summarize_as_string("scan_b"), + qw->pvc->summarize_as_string("aggr_j"), + #endif + results[0]); + + iteration = IterateOnce(run, chunk_size, mode); + } + + numa_free(data_b_hbm, workload_b); + numa_free(data_a, workload_b); + numa_free(data_b, workload_b); + + numa_free(results, THREAD_GROUP_MULTIPLIER * thread_count * sizeof(base_t)); + +} \ No newline at end of file diff --git a/qdp_project/src/benchmark/QDP_minimal.h b/qdp_project/src/benchmark/QDP_minimal.h new file mode 100644 index 0000000..007d0d9 --- /dev/null +++ b/qdp_project/src/benchmark/QDP_minimal.h @@ -0,0 +1,147 @@ +#include +#include +#include +#include +#include + +#include "const.h" +#include "array_utils.h" +#include "cpu_set_utils.h" +#include "iterable_range.h" +#include "memory_literals.h" +#include "pipelines/MAX_scan_filter_pipe.h" +#include "aggregation.h" + +using base_t = uint64_t; + +// calculate the checksum for the simple query +base_t sum_check(base_t compare_value, base_t* row_A, base_t* row_B, size_t row_size) { + base_t sum = 0; + for(int i = 0; i 
< row_size / sizeof(base_t); ++i) { + sum += (row_A[i] < compare_value) * row_B[i]; + } + return sum; +} + +// calculate the checksum for the complex query +base_t sum_check_complex(base_t compare_value_a, base_t compare_value_b, base_t* row_A, base_t* row_B, size_t row_size) { + base_t sum = 0; + for(int i = 0; i < row_size / sizeof(base_t); ++i) { + sum += (row_A[i] < compare_value_a && row_B[i] < compare_value_b) * row_B[i]; + } + return sum; +} + +class QDP_minimal { +private: + // values used for comparisons in the filter operations + const base_t compare_value_a = 50; + const base_t compare_value_b = 42; + // define, which numa nodes to use + // Xeon Max: node 0-7 DRAM and 8-15 HBM + // if the nodes are changed, the pinning ranges in run should be adjusted accordingly too + uint32_t dram_node = 2; + uint32_t dram_node_2 = 2; + uint32_t hbm_node = 10; + +public: + // results of running qdp, set by run() + base_t result; + base_t checksum; + double exec_time; + + // run qdp + void run(const size_t workload_b, size_t chunk_size, uint8_t tc_filter, uint8_t tc_copy, uint8_t tc_agg){ + // allocate data + base_t* data_a = (base_t*) numa_alloc_onnode(workload_b, dram_node); + base_t* data_b = (base_t*) numa_alloc_onnode(workload_b, dram_node_2); + base_t* results = (base_t*) numa_alloc_onnode(THREAD_GROUP_MULTIPLIER * tc_agg * sizeof(base_t), dram_node); + + // fill the memory with acutal values + fill_mt(data_a, workload_b, 0, 100, 42); + fill_mt(data_b, workload_b, 0, 100, 420); + + // run qdp + run(data_a, data_b, results, workload_b, chunk_size, tc_filter, tc_copy, tc_agg); + + // free the allocated memory + numa_free(data_a, workload_b); + numa_free(data_b, workload_b); + numa_free(results, THREAD_GROUP_MULTIPLIER * tc_agg * sizeof(base_t)); + } + + // run qdp, work on provided memory pointers to enable memory reuse across multiple runs + void run(base_t* data_a, base_t* data_b, base_t* results, const size_t workload_b, size_t chunk_size, uint8_t tc_filter, uint8_t tc_copy, uint8_t tc_agg){ + constexpr bool simple_query = (QUERY == 1); + // sync objects + std::promise p; + std::shared_future ready_future(p.get_future()); + + // create the query wrapper, that is managing the to-be-used threads + Query_Wrapper* qw = new Query_Wrapper(&ready_future, workload_b, chunk_size, data_a, data_b, results, hbm_node, dram_node, + tc_filter, tc_copy, tc_agg, NewPMode::Prefetch, THREAD_GROUP_MULTIPLIER, compare_value_a, compare_value_b, false); + + // clear buffers to make sure, that they have been written and are fully mapped before running qdp + qw->clear_buffers(); + + // creating lambdas for executing filter (scan_a), copy (scan_b), and aggregation tasks on the query wrapper + // passing gid (group id), gcnt (group count) and tid (thread id) + auto filter_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_a(gid, gcnt, tid); }; + auto copy_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_b(gid, gcnt, tid); }; + auto aggregation_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->aggr_j(gid, gcnt, tid); }; + + // creating thread pools, holding all used threads + std::vector filter_pool; + std::vector copy_pool; + std::vector agg_pool; + + int thread_id = 0; + // cpus on node 2 (for sapphire rapids), that the threads should be executed on + std::vector> pinning_ranges {std::make_pair(24, 36), std::make_pair(120, 132)}; + + // create all threads for all thread groups and for every task (copy, filter, aggregation), according their specific theadcount + 
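+        // note: in total THREAD_GROUP_MULTIPLIER * (tc_filter + tc_copy + tc_agg) threads are spawned;
+        // thread_id is incremented once per created thread so that pin_thread_in_range can map each
+        // thread to a distinct index inside the pinning ranges configured above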
for(uint32_t gid = 0; gid < THREAD_GROUP_MULTIPLIER; ++gid) { + for(uint32_t tid = 0; tid < tc_filter; ++tid) { + filter_pool.emplace_back(filter_lambda, gid, THREAD_GROUP_MULTIPLIER, tid); + pin_thread_in_range(filter_pool.back(), thread_id++, pinning_ranges); + } + for(uint32_t tid = 0; tid < tc_copy; ++tid) { + copy_pool.emplace_back(copy_lambda, gid, THREAD_GROUP_MULTIPLIER, tid); + pin_thread_in_range(copy_pool.back(), thread_id++, pinning_ranges); + } + for(uint32_t tid = 0; tid < tc_agg; ++tid) { + agg_pool.emplace_back(aggregation_lambda, gid, THREAD_GROUP_MULTIPLIER, tid); + pin_thread_in_range(agg_pool.back(), thread_id++, pinning_ranges); + } + } + + // start the clock + auto start = std::chrono::steady_clock::now(); + // set value to the promise, to signal the waiting threads, that they can start now + p.set_value(); + + // wait for all thread to be finished + for(std::thread& t : filter_pool) { t.join(); } + for(std::thread& t : copy_pool) { t.join(); } + for(std::thread& t : agg_pool) { t.join(); } + + // sum up the results of all the aggregation threads to get a final result + Aggregation::apply(&result, results, sizeof(base_t) * tc_agg * THREAD_GROUP_MULTIPLIER); + auto end = std::chrono::steady_clock::now(); + + // get the overall execution time in seconds + constexpr double nanos_per_second = ((double)1000) * 1000 * 1000; + uint64_t nanos = std::chrono::duration_cast(end - start).count(); + exec_time = (double)(nanos) / nanos_per_second; + + // calculate the checksum according to the used query + if constexpr (QUERY == 1) { + // QUERY == 1 -> simple query is applied + checksum = sum_check(compare_value_a, data_a, data_b, workload_b); + } else { + checksum = sum_check_complex(compare_value_a, compare_value_b, data_a, data_b, workload_b); + } + + delete qw; + } +}; diff --git a/qdp_project/src/benchmark/doubly_filtered_agg.cpp b/qdp_project/src/benchmark/doubly_filtered_agg.cpp new file mode 100644 index 0000000..eaee93d --- /dev/null +++ b/qdp_project/src/benchmark/doubly_filtered_agg.cpp @@ -0,0 +1,149 @@ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "aggregation.h" +#include "array_utils.h" +#include "cpu_set_utils.h" +#include "file_output.h" +#include "iterable_range.h" +#include "memory_literals.h" +#include "pipelines/scan_filter_pipe.h" + +int main () { + + using base_t = uint64_t; + + + const size_t workload = 2_GiB; + const char filename[256] = "../results/doubly_filtered_results_stronger_affinity_.csv"; + const uint32_t numa_local = 2; + const uint32_t numa_remote = 3; + + + Linear_Int_Range thread_group("thread_groups"); + Exp_Int_Range thread_count_filter("thread_cnt_filter"); + Exp_Int_Range thread_count_filter_copy("thread_cnt_filter_copy"); + Exp_Int_Range thread_count_aggregation("thread_cnt_agg"); + Linear_Int_Range run("run"); + Range mode("mode"); + Exp_Int_Range chunk_size("chunk_size"); + + std::ofstream out_file; + out_file.open(filename); + print_to_file(out_file, generateHead(run, chunk_size, mode, thread_count_filter, thread_count_filter_copy, + thread_count_aggregation, thread_group), "time", "scan_a", "scan_b", "aggr_j", "wait_aggr", "results"); + + base_t* data_a = (base_t*) numa_alloc_onnode(workload, numa_remote); + base_t* data_b = (base_t*) numa_alloc_onnode(workload, numa_remote); + base_t* data_b_hbm = (base_t*) numa_alloc_onnode(workload, numa_local); + fill_mt(data_a, workload, 0, 100, 42); + fill_mt(data_b, workload, 0, 100, 420); + std::memcpy(data_b_hbm, data_b, workload); + base_t* 
result = (base_t*) numa_alloc_onnode(thread_group.max * thread_count_aggregation.max * sizeof(base_t), + numa_remote); + + std::string iteration("init"); + Query_Wrapper* qw = nullptr; + + while(iteration != "false") { + + std::promise p; + std::shared_future ready_future(p.get_future()); + + if(iteration != "run") { + if(qw != nullptr) { + delete qw; + } + + switch(mode.current) { + case PMode::expl_copy: + qw = new Query_Wrapper(&ready_future, workload, chunk_size.current, data_a, data_b, result, numa_local, numa_remote, + thread_count_filter.current, thread_count_filter_copy.current, thread_count_aggregation.current, + mode.current, thread_group.current, (base_t) 50, (base_t) 42, false); + break; + case PMode::no_copy: + qw = new Query_Wrapper(&ready_future, workload, chunk_size.current, data_a, data_b, result, numa_local, numa_remote, + thread_count_filter.current, thread_count_filter_copy.current, thread_count_aggregation.current, + mode.current, thread_group.current, (base_t) 50, (base_t) 42, true); + break; + case PMode::hbm: + qw = new Query_Wrapper(&ready_future, workload, chunk_size.current, data_a, data_b_hbm, result, numa_local, numa_remote, + thread_count_filter.current, thread_count_filter_copy.current, thread_count_aggregation.current, + mode.current, thread_group.current, (base_t) 50, (base_t) 42, true); + break; + } + } + qw->ready_future = &ready_future; + qw->clear_buffers(); + + + // todo create threads depending on mode + std::vector thread_pool; + auto filter_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_a(gid, gcnt, tid); }; + auto filter_copy_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_b(gid, gcnt, tid); }; + auto aggregation_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->aggr_j(gid, gcnt, tid); }; + + + /* Intel Xeon Gold 6130 // todo implement different for 5120 -> fewer cpus + node 0 cpus: 0-15 64- 79 + node 1 cpus: 16-31 80- 95 + node 2 cpus: 32-47 96-111 + node 3 cpus: 48-63 112-127 + */ + int thread_id = 0; + std::vector> range {std::make_pair(0, 16), std::make_pair(64, 80)}; + for(uint32_t gid = 0; gid < thread_group.current; ++gid) { + + + for(uint32_t tid = 0; tid < thread_count_filter.current; ++tid) { + thread_pool.emplace_back(filter_lambda, gid, thread_group.current, tid); + pin_thread_in_range(thread_pool.back(), thread_id++, range); + } + + for(uint32_t tid = 0; tid < thread_count_filter_copy.current; ++tid) { + thread_pool.emplace_back(filter_copy_lambda, gid, thread_group.current, tid); + pin_thread_in_range(thread_pool.back(), thread_id++, range); + } + + for(uint32_t tid = 0; tid < thread_count_aggregation.current; ++tid) { + thread_pool.emplace_back(aggregation_lambda, gid, thread_group.current, tid); + pin_thread_in_range(thread_pool.back(), thread_id++, range); + } + } + + auto start = std::chrono::steady_clock::now(); + p.set_value(); + + // wait for every thread to join + for(std::thread& t : thread_pool) t.join(); + // aggregate all partial results + Aggregation::apply(result, result, + sizeof(base_t) * thread_count_aggregation.current * thread_group.current); + + auto end = std::chrono::steady_clock::now(); + + double duration = std::chrono::duration_cast(end-start).count() / (double)1000000000; + + + //TODO add mode + print_to_file(out_file, run, chunk_size, mode_manager::string(mode.current), thread_count_filter, + thread_count_filter_copy, thread_count_aggregation, thread_group, duration, + qw->trt->summarize_time(0), qw->trt->summarize_time(1), + 
qw->trt->summarize_time(2), qw->trt->summarize_time(3), *result); + iteration = IterateOnce(run, chunk_size, mode, thread_count_filter, thread_count_filter_copy, thread_count_aggregation, thread_group); + } + + auto end = std::chrono::system_clock::now(); + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "finished computation at " << std::ctime(&end_time) << std::endl; + + print_to_file(out_file, std::ctime(&end_time)); +} \ No newline at end of file diff --git a/qdp_project/src/benchmark/filter_aggregate_pipeline.cpp b/qdp_project/src/benchmark/filter_aggregate_pipeline.cpp new file mode 100644 index 0000000..b4a6753 --- /dev/null +++ b/qdp_project/src/benchmark/filter_aggregate_pipeline.cpp @@ -0,0 +1,184 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "const.h" + +#include "file_output.h" +#include "array_utils.h" +#include "timer_utils.h" +#include "barrier_utils.h" +#include "cpu_set_utils.h" +#include "iterable_range.h" +#include "memory_literals.h" +#include "pipelines/scan_filter_pipe.h" + +#include "aggregation.h" +#include "filter.h" + +using base_t = uint64_t; + +base_t sum_check(base_t compare_value, base_t* row_A, base_t* row_B, size_t row_size) { + base_t sum = 0; + for(int i = 0; i < row_size / sizeof(base_t); ++i) { + sum += (row_A[i] < compare_value) * row_B[i]; + } + return sum; +} + + +int main(int argc, char** argv) { + size_t workload_b = 2_GiB; + std::ofstream out_file; + out_file.open("filter_aggreagate_pipe_bm_" + (std::string) BARRIER_MODE + ".csv"); + + Linear_Int_Range thread_group("thread_groups"); + Linear_Int_Range run("run"); + Exp_Int_Range chunk_size("chunk_size"); + Linear_Int_Range thread_count_filter("thread_cnt_filter"); + Linear_Int_Range thread_count_copy("thread_cnt_copy"); + Linear_Int_Range thread_count_aggregation("thread_cnt_agg"); + Range mode("mode"); + + uint32_t remote_node = 2; + uint32_t remote_node_2 = 2; + uint32_t local_node = 10; + + print_to_file(out_file, generateHead(run, chunk_size, mode, thread_count_filter, thread_count_copy, + thread_count_aggregation, thread_group), "time", + #ifdef THREAD_TIMINGS + "scan_a", "scan_b", "aggr_j", + #endif + #ifdef BARRIER_TIMINGS + "wait_scan_a", "wait_scan_b", "wait_aggr_j", + #endif + "result"); + + + /*** alloc data and buffers ************************************************/ + base_t* data_a = (base_t*) numa_alloc_onnode(workload_b, remote_node); + base_t* data_b = (base_t*) numa_alloc_onnode(workload_b, remote_node_2); + base_t* data_b_hbm = (base_t *) numa_alloc_onnode(workload_b, local_node); + fill_mt(data_a, workload_b, 0, 100, 42); + fill_mt(data_b, workload_b, 0, 100, 420); + std::memcpy(data_b_hbm, data_b, workload_b); + base_t* results = (base_t*) numa_alloc_onnode(thread_group.max * thread_count_aggregation.max * sizeof(base_t), remote_node); + + std::string iteration("init"); + const bool simple_query = true; + Query_Wrapper* qw = nullptr; + while(iteration != "false") { + base_t compare_value = 50; + std::promise p; + std::shared_future ready_future(p.get_future()); + + if(iteration != "run") { + + if(qw != nullptr) { + delete qw; + } + + std::cout << "Changing to mode " << mode.current << " chunksize " << chunk_size.current << " thread_group " << thread_group.current << std::endl; + switch(mode.current) { + case PMode::expl_copy: + qw = new Query_Wrapper(&ready_future, workload_b, chunk_size.current, data_a, 
data_b, results, local_node, remote_node, + thread_count_filter.current, thread_count_copy.current, thread_count_aggregation.current, mode.current, thread_group.current, (base_t) 50, (base_t) 42, false); + break; + case PMode::no_copy: + qw = new Query_Wrapper(&ready_future, workload_b, chunk_size.current, data_a, data_b, results, local_node, remote_node, + thread_count_filter.current, thread_count_copy.current, thread_count_aggregation.current, mode.current, thread_group.current, (base_t) 50, (base_t) 42, true); + break; + case PMode::hbm: + qw = new Query_Wrapper(&ready_future, workload_b, chunk_size.current, data_a, data_b_hbm, results, local_node, remote_node, + thread_count_filter.current, thread_count_copy.current, thread_count_aggregation.current, mode.current, thread_group.current, (base_t) 50, (base_t) 42, true); + break; + } + } + + qw->ready_future = &ready_future; + qw->clear_buffers(); + + auto filter_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_a(gid, gcnt, tid); }; + auto copy_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_b(gid, gcnt, tid); }; + auto aggregation_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->aggr_j(gid, gcnt, tid); }; + + std::vector filter_pool; + std::vector copy_pool; + std::vector agg_pool; + + int thread_id = 0; + // std::vector> pinning_ranges {std::make_pair(28, 42), std::make_pair(84, 98)}; // node 2 heacboehm2 + std::vector> pinning_ranges {std::make_pair(32, 48), std::make_pair(96, 112)}; // node 2 heacboehm + + for(uint32_t gid = 0; gid < thread_group.current; ++gid) { + + for(uint32_t tid = 0; tid < thread_count_filter.current; ++tid) { + filter_pool.emplace_back(filter_lambda, gid, thread_group.current, tid); + pin_thread_in_range(filter_pool.back(), thread_id++, pinning_ranges); + } + + if(mode.current == PMode::expl_copy){ + for(uint32_t tid = 0; tid < thread_count_copy.current; ++tid) { + copy_pool.emplace_back(copy_lambda, gid, thread_group.current, tid); + pin_thread_in_range(copy_pool.back(), thread_id++, pinning_ranges); + } + } + + for(uint32_t tid = 0; tid < thread_count_aggregation.current; ++tid) { + agg_pool.emplace_back(aggregation_lambda, gid, thread_group.current, tid); + pin_thread_in_range(agg_pool.back(), thread_id++, pinning_ranges); + } + } + + auto start = std::chrono::steady_clock::now(); + p.set_value(); + + for(std::thread& t : filter_pool) { t.join(); } + for(std::thread& t : copy_pool) { t.join(); } + for(std::thread& t : agg_pool) { t.join(); } + + Aggregation::apply(results, results, sizeof(base_t) * thread_count_aggregation.current * thread_group.current); + auto end = std::chrono::steady_clock::now(); + + constexpr double nanos_per_second = ((double)1000) * 1000 * 1000; + uint64_t nanos = std::chrono::duration_cast(end - start).count(); + double seconds = (double)(nanos) / nanos_per_second; + + + + print_to_file(out_file, run, chunk_size, mode_manager::string(mode.current), thread_count_filter, + thread_count_copy, thread_count_aggregation, thread_group, seconds, + #ifdef THREAD_TIMINGS + qw->trt->summarize_time(0), qw->trt->summarize_time(1), qw->trt->summarize_time(2), + #endif + #ifdef BARRIER_TIMINGS + qw->bt->summarize_time(0), qw->bt->summarize_time(1), qw->bt->summarize_time(2), + #endif + results[0]); + + + iteration = IterateOnce(run, chunk_size, mode, thread_count_filter, thread_count_copy, thread_count_aggregation, thread_group); + + } + + numa_free(data_b_hbm, workload_b); + numa_free(data_a, workload_b); + numa_free(data_b, workload_b); + 
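+    // numa_free expects the same byte size that was passed to numa_alloc_onnode for the respective pointer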
numa_free(results, thread_group.max * sizeof(base_t)); + +} \ No newline at end of file diff --git a/qdp_project/src/benchmark/latency.cpp b/qdp_project/src/benchmark/latency.cpp new file mode 100644 index 0000000..011066a --- /dev/null +++ b/qdp_project/src/benchmark/latency.cpp @@ -0,0 +1,188 @@ +/* + * numa_memory_latency + * Copyright (c) 2017 UMEZAWA Takeshi + * This software is licensed under GNU GPL version 2 or later. + * + * This file has been modified + */ + +#include +#include +#include +#include +#include +#include +#include +#include "file_output.h" +#include +#include +#include +#include + +#ifndef VOLATILE +#define VOLATILE 0 +#endif + +#define cachelinesize 64 +union CACHELINE { + char cacheline[cachelinesize]; + #if VOLATILE + volatile CACHELINE* next; + #else + CACHELINE* next; + #endif /*VOLATILE*/ +}; + +#define REPT4(x) do { x; x; x; x; } while(0) +#define REPT16(x) do { REPT4(x); REPT4(x); REPT4(x); REPT4(x); } while(0); +#define REPT64(x) do { REPT16(x); REPT16(x); REPT16(x); REPT16(x); } while(0); +#define REPT256(x) do { REPT64(x); REPT64(x); REPT64(x); REPT64(x); } while(0); +#define REPT1024(x) do { REPT256(x); REPT256(x); REPT256(x); REPT256(x); } while(0); + +size_t bufsize = 1 * 1024 * 1024 * 1024; +size_t nloop = 128 * 1024; +std::vector offsets; + +#if VOLATILE + +volatile CACHELINE* walk(volatile CACHELINE* start) +{ + volatile CACHELINE* p = start; + for (size_t i = 0; i < nloop; ++i) { + REPT1024(p = p->next); + } + return p; +} + +#else + +CACHELINE* walk(CACHELINE* start, uint64_t* sum) +{ + CACHELINE* p = start; + for (size_t i = 0; i < nloop; ++i) { + REPT1024( + *sum += static_cast(p->cacheline[cachelinesize-1]); + p = p->next; + ); + } + return p; +} + +#endif /*VOLATILE*/ + +void bench(int tasknode, int memnode, std::ofstream* out_file) +{ + struct timespec ts_begin, ts_end, ts_elapsed; + + printf("bench(task=%d, mem=%d)\n", tasknode, memnode); + + if (numa_run_on_node(tasknode) != 0) { + printf("failed to run on node: %s\n", strerror(errno)); + return; + } + + CACHELINE* const buf = (CACHELINE*)numa_alloc_onnode(bufsize, memnode); + if (buf == NULL) { + printf("failed to allocate memory\n"); + return; + } + + for (size_t i = 0; i < offsets.size() - 1; ++i) { + // assuming that next-pointer never overwrites last Byte of the cacheline/union + buf[offsets[i]].cacheline[cachelinesize-1] = offsets[i] % 128; + buf[offsets[i]].next = buf + offsets[i+1]; + } + buf[offsets[offsets.size() - 1]].next = buf; + buf[offsets[offsets.size() - 1]].cacheline[cachelinesize-1] = offsets[offsets.size() - 1] % 128; + + uint64_t value = 0; + uint64_t* sum = &value; + + clock_gettime(CLOCK_MONOTONIC, &ts_begin); + + #if VOLATILE + walk(buf); + #else + walk(buf, sum); + #endif /*VOLATILE*/ + + clock_gettime(CLOCK_MONOTONIC, &ts_end); + + ts_elapsed.tv_nsec = ts_end.tv_nsec - ts_begin.tv_nsec; + ts_elapsed.tv_sec = ts_end.tv_sec - ts_begin.tv_sec; + if (ts_elapsed.tv_nsec < 0) { + --ts_elapsed.tv_sec; + ts_elapsed.tv_nsec += 1000*1000*1000; + } + double elapsed = ts_elapsed.tv_sec + 0.000000001 * ts_elapsed.tv_nsec; + printf("took %fsec. 
%fns/load\n", elapsed, elapsed/(1024*nloop)*(1000*1000*1000)); + print_to_file(*out_file, tasknode, memnode, elapsed/(1024*nloop)*(1000*1000*1000), *sum); + numa_free(buf, bufsize); +} + +struct RND { + std::mt19937 mt; + RND() : mt(time(NULL)) {} + std::mt19937::result_type operator()(std::mt19937::result_type n) { return mt() % n; } +} r; + +void usage(const char* prog) +{ + printf("usage: %s [-h] [bufsize] [nloop]\n", prog); +} + +int main(int argc, char* argv[]) +{ + int ch; + + while ((ch = getopt(argc, argv, "h")) != -1) { + switch (ch) { + case 'h': + default: + usage(argv[0]); + exit(1); + } + } + + argc -= optind; + argv += optind; + + if (argc > 1) { + // 1048576 KiB = 1 GiB + bufsize = atoi(argv[0]) * 1024; // in KiB + nloop = atoi(argv[1]) * 1024; + } + + offsets.resize(bufsize / cachelinesize); + + for (size_t i = 0; i < offsets.size(); ++i) + offsets[i] = i; + std::random_shuffle(offsets.begin() + 1, offsets.end(), r); + + uint64_t expected_checksum = 0; + #if VOLATILE == 0 + for (size_t i = 0; i < nloop * 1024; ++i) { + expected_checksum += offsets[i % offsets.size()] % 128; + } + #endif + + std::ofstream check_file; + check_file.open("../results/micro_bench/latency/micro_bench_latency_" + (std::string)(VOLATILE == 1 ? "volatile" : "sum") + ".checksum"); + check_file << expected_checksum; + check_file.close(); + + + printf("benchmark bufsize=%zuKiB, nloop=%zuKi\n", bufsize/1024, nloop/1024); + + std::ofstream out_file; + out_file.open("../results/micro_bench/latency/micro_bench_latency_"+ (std::string)(VOLATILE == 1 ? "volatile" : "sum") + ".csv"); + print_to_file(out_file, "tasknode", "memnode", "latency", "checksum"); + + for (int tasknode = 0; tasknode < 8; tasknode++) { + for (int memnode = 0; memnode < 16; memnode++) { + bench(tasknode, memnode, &out_file); + } + } + + return 0; +} \ No newline at end of file diff --git a/qdp_project/src/benchmark/micro_benchmarks.cpp b/qdp_project/src/benchmark/micro_benchmarks.cpp new file mode 100644 index 0000000..4e63f82 --- /dev/null +++ b/qdp_project/src/benchmark/micro_benchmarks.cpp @@ -0,0 +1,271 @@ +#include +#include +#include +#include +#include +#include +#include "memory_literals.h" +#include "array_utils.h" +#include "file_output.h" +#include "aggregation.h" + + +using base_t = uint64_t; + +size_t thread_cnt_memcpy = 128; +size_t thread_cnt_read = 128; +size_t runs = 10; + + +base_t sum_up(base_t* data, size_t workload){ + base_t sum = 0; + for(int i = 0; i < workload/sizeof(base_t); i++){ + sum += data[i]; + } + return sum; +} + +int reverse_bits(int number, size_t bit_count) { + int result = 0; + for(int i = 0; i < bit_count; i++) { + result <<= 1; + result |= (number & 1); + number >>= 1; + } + return result; +} + + +double measure_memcpy_bw(base_t* src, base_t* dest, size_t workload, base_t* result){ + std::promise p; + std::shared_future ready_future(p.get_future()); + + auto thread_lambda = [&](base_t* source, base_t* destination, size_t count) { + ready_future.wait(); + memcpy(destination, source, count); + }; + + std::vector thread_pool; + size_t total_elements = workload / sizeof(base_t); + size_t elements_per_thread = total_elements / thread_cnt_memcpy; + size_t remainder = total_elements % thread_cnt_memcpy; + + for(size_t tid = 0; tid < thread_cnt_memcpy; tid++) { + size_t elements_to_process = elements_per_thread + (tid < remainder ? 
1 : 0); + size_t byte_offset = (elements_per_thread * tid + std::min(tid, remainder)) * sizeof(base_t); + + thread_pool.emplace_back(thread_lambda, src + byte_offset / sizeof(base_t), dest + byte_offset / sizeof(base_t), elements_to_process * sizeof(base_t)); + } + + auto start = std::chrono::steady_clock::now(); + p.set_value(); + for(std::thread& t : thread_pool) { t.join(); } + auto stop = std::chrono::steady_clock::now(); + + auto duration = std::chrono::duration_cast(stop - start); + double seconds = duration.count() / 1e9; + double throughput = (workload / seconds) / (1024 * 1024 * 1024); + *result = sum_up(dest, workload); + return throughput; +} + +double measure_read_bw(base_t* data, size_t workload, base_t* results){ + const size_t chunk_size = sizeof(__m512i); + const size_t num_chunks = (workload) / chunk_size; + __m512i* src = reinterpret_cast<__m512i*>(data); + std::promise p; + std::shared_future ready_future(p.get_future()); + size_t num_chunks_per_thread = num_chunks / thread_cnt_read; + size_t num_chunks_remainder = num_chunks % thread_cnt_read; + + auto thread_lambda = [&](__m512i* src, int tid, int num_chunks) { + __m512i accumulator = _mm512_setzero_si512(); + ready_future.wait(); + for (int i = 0; i < num_chunks; i++) { + __m512i chunk = _mm512_load_si512(&src[i]); + accumulator = _mm512_add_epi64(accumulator, chunk); + } + results[tid] = _mm512_reduce_add_epi64(accumulator); + }; + + std::vector thread_pool; + int offset; + for(int tid = 0; tid < thread_cnt_read; tid++){ + if(tid < num_chunks_remainder){ + offset = tid * (num_chunks_per_thread + 1); + thread_pool.emplace_back(thread_lambda, &src[offset], tid, (num_chunks_per_thread + 1)); + } else { + offset = tid*num_chunks_per_thread + num_chunks_remainder; + thread_pool.emplace_back(thread_lambda, &src[offset], tid, num_chunks_per_thread); + } + + } + + auto start = std::chrono::steady_clock::now(); + p.set_value(); + for(std::thread& t : thread_pool) { t.join(); } + auto stop = std::chrono::steady_clock::now(); + + Aggregation::apply(results, results, sizeof(base_t) * thread_cnt_read); + auto duration = std::chrono::duration_cast(stop - start); + double seconds = duration.count() / 1e9; + double throughput = (workload / seconds) / (1024 * 1024 * 1024); + return throughput; +} + +void exec_multiple_runs_memcpy(size_t workload, int exec_node, int src_node, int dest_node, std::ofstream* out_file, std::string iteration_type){ + base_t value; + base_t* result = &value; + base_t* src = (base_t*) numa_alloc_onnode(workload, src_node); + base_t* dest = (base_t*) numa_alloc_onnode(workload, dest_node); + fill_mt(src, workload, 0, 100, 42); + fill_mt(dest, workload, 0, 100, 12); + numa_run_on_node(exec_node); + + if(dest_node == 0 && src_node == 0){ + std::ofstream check_file; + check_file.open("../results/micro_bench/micro_bench_bw_memcpy_execnode_" + std::to_string(exec_node) + + "_threadcnt_" + std::to_string(thread_cnt_memcpy) + "_" + iteration_type + ".checksum"); + check_file << sum_up(src, workload); + check_file.close(); + } + + for(size_t run = 0; run < runs; run++){ + double bw = measure_memcpy_bw(src, dest, workload, result); + std::cout << "Copy throughput executed on node " << exec_node << " form node " << src_node << " to node " + << dest_node << ": " << bw << " GiB/s" << std::endl; + print_to_file(*out_file, run, src_node, dest_node, bw, *result); + std::memset(dest, 0x00, workload); + *result = 0; + } + numa_free(src, workload); + numa_free(dest, workload); +} + +void 
measure_all_memcpy_bw_for_chosen_execnode(int exec_node){ + std::ofstream out_file; + out_file.open("../results/micro_bench/micro_bench_bw_memcpy_execnode_" + std::to_string(exec_node) + + "_threadcnt_" + std::to_string(thread_cnt_memcpy) + ".csv"); + print_to_file(out_file, "run", "src_node", "dest_node", "bw", "result"); + const size_t workload = 4_GiB; + + for(int src_node = 0; src_node < 16; src_node++){ + for(int dest_node = 0; dest_node < 16; dest_node++){ + exec_multiple_runs_memcpy(workload, exec_node, src_node, dest_node, &out_file, ""); + } + } + out_file.close(); +} + +void measure_all_memcpy_bw_for_chosen_execnode_reversed(int exec_node){ + std::ofstream out_file; + out_file.open("../results/micro_bench/micro_bench_bw_memcpy_execnode_" + std::to_string(exec_node) + + "_threadcnt_" + std::to_string(thread_cnt_memcpy) + "_reversed.csv"); + print_to_file(out_file, "run", "src_node", "dest_node", "bw", "result"); + const size_t workload = 4_GiB; + + for(int src_node = 15; src_node >= 0; src_node--){ + for(int dest_node = 15; dest_node >= 0; dest_node--){ + exec_multiple_runs_memcpy(workload, exec_node, src_node, dest_node, &out_file, "reversed"); + } + } + out_file.close(); +} + + + +void measure_all_memcpy_bw_for_chosen_execnode_reversed_bitwise(int exec_node){ + std::ofstream out_file; + out_file.open("../results/micro_bench/micro_bench_bw_memcpy_execnode_" + std::to_string(exec_node) + + "_threadcnt_" + std::to_string(thread_cnt_memcpy) + "_reversed_bitwise.csv"); + print_to_file(out_file, "run", "src_node", "dest_node", "bw", "result"); + const size_t workload = 4_GiB; + + for(int src_node = 0; src_node < 16; src_node++){ + for(int dest_node = 0; dest_node < 16; dest_node++){ + int reversed_src_node = reverse_bits(src_node, 4); + int reversed_dest_node = reverse_bits(dest_node, 4); + exec_multiple_runs_memcpy(workload, exec_node, reversed_src_node, reversed_dest_node, &out_file, "reversed_bitwise"); + } + } + out_file.close(); +} + + +void exec_multiple_runs_read(size_t workload, int mem_node, int exec_node, std::ofstream *out_file, std::string iteration_type){ + base_t* data = (base_t*) numa_alloc_onnode(workload, mem_node); + fill_mt(data, workload, 0, 100, 42); + base_t* results = (base_t*) numa_alloc_onnode(thread_cnt_read * sizeof(base_t), exec_node); + numa_run_on_node(exec_node); + + if(mem_node == 0 && exec_node == 0){ + std::ofstream check_file; + check_file.open("../results/micro_bench/micro_bench_bw_read_threadcnt_" + std::to_string(thread_cnt_read) + "_" + iteration_type + ".checksum"); + check_file << sum_up(data, workload); + check_file.close(); + } + + for(size_t run = 0; run < runs; run++){ + double bw = measure_read_bw(data, workload, results); + std::cout << "Read throughput executed on node " << exec_node << " for node " << mem_node << ": " << bw << " GiB/s" << std::endl; + print_to_file(*out_file, run, exec_node, mem_node, bw, results[0]); + std::memset(results, 0x00, thread_cnt_read * sizeof(base_t)); + } + numa_free(data, workload); + numa_free(results, thread_cnt_read * sizeof(base_t)); +} + +void measure_all_read_bw(){ + std::ofstream out_file; + out_file.open("../results/micro_bench/micro_bench_bw_read_threadcnt_" + std::to_string(thread_cnt_read) + ".csv"); + print_to_file(out_file, "run", "exec_node", "mem_node", "bw", "result"); + const size_t workload = 8_GiB; + + for(int exec_node = 0; exec_node < 8; exec_node++){ + for(int mem_node = 0; mem_node < 16; mem_node++){ + exec_multiple_runs_read(workload, mem_node, exec_node, &out_file, ""); + } + } + 
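+    // one CSV row is written per (run, exec_node, mem_node) combination; the file is closed
+    // once the full 8 x 16 node sweep has finished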
out_file.close(); +} + +void measure_all_read_bw_reversed(){ + std::ofstream out_file; + out_file.open("../results/micro_bench/micro_bench_bw_read_threadcnt_" + std::to_string(thread_cnt_read) + "_reversed.csv"); + print_to_file(out_file, "run", "exec_node", "mem_node", "bw", "result"); + const size_t workload = 8_GiB; + + for(int exec_node = 7; exec_node >= 0; exec_node--){ + for(int mem_node = 15; mem_node >= 0; mem_node--){ + exec_multiple_runs_read(workload, mem_node, exec_node, &out_file, "reversed"); + } + } + out_file.close(); +} + +void measure_all_read_bw_reversed_bitwise(){ + std::ofstream out_file; + out_file.open("../results/micro_bench/micro_bench_bw_read_threadcnt_" + std::to_string(thread_cnt_read) + "_reversed_bitwise.csv"); + print_to_file(out_file, "run", "exec_node", "mem_node", "bw", "result"); + const size_t workload = 8_GiB; + + for(int exec_node0 = 0; exec_node0 < 8; exec_node0++){ + for(int mem_node0 = 0; mem_node0 < 16; mem_node0++){ + int mem_node = reverse_bits(mem_node0, 4); + int exec_node = reverse_bits(exec_node0, 3); + exec_multiple_runs_read(workload, mem_node, exec_node, &out_file, "reversed_bitwise"); + } + } + out_file.close(); +} + + + +int main() { + // nodes 0-7 hold cores and DRAM, nodes 8-15 only HBM + + measure_all_read_bw_reversed_bitwise(); + measure_all_memcpy_bw_for_chosen_execnode_reversed_bitwise(0); + + return 0; +} \ No newline at end of file diff --git a/qdp_project/src/benchmark/pipelines/DIMES_scan_filter_pipe.h b/qdp_project/src/benchmark/pipelines/DIMES_scan_filter_pipe.h new file mode 100644 index 0000000..6dbc652 --- /dev/null +++ b/qdp_project/src/benchmark/pipelines/DIMES_scan_filter_pipe.h @@ -0,0 +1,391 @@ + +#include +#include +#include +#include + +#include + +#include "filter.h" +#include "aggregation.h" +#include "vector_loader.h" +#include "timer_utils.h" +#include "barrier_utils.h" +#include "execution_modes.h" + + +template +class Query_Wrapper { +public: + // sync + std::shared_future* ready_future; + + thread_runtime_timing* trt; + barrier_timing* bt; + +private: + // numa + uint32_t close_mem; + uint32_t far_mem; + + // data + size_t size_b; + size_t chunk_size_b; + size_t chunk_size_w; + size_t chunk_cnt; + base_t* data_a; + base_t* data_b; + base_t* dest; + + // ratios + uint32_t thread_count_fc; + uint32_t thread_count_fi; + uint32_t thread_count_ag; + uint32_t thread_group; + + // done bits + volatile uint8_t* ready_flag_a; + volatile uint8_t* ready_flag_b; + std::mutex ready_a_m; + std::mutex ready_b_m; + + // buffer + uint16_t* mask_a; + uint16_t* mask_b; + base_t** buffer_b; + + // params + base_t cmp_a; + base_t cmp_b; + bool no_copy; + NewPMode mode; + + // sync + std::unique_ptr*>> sync_barrier; + std::string barrier_mode = BARRIER_MODE; + + using filterCopy = Filter; + using filterNoCopy = Filter; + using filter = Filter; + using aggregation = Aggregation; + +public: + + + Query_Wrapper(std::shared_future* rdy_fut, size_t workload_b, size_t chunk_size_b, base_t* data_a, + base_t* data_b, base_t* dest, uint32_t numa_close, uint32_t numa_far, uint32_t tc_fi, uint32_t tc_fc, uint32_t tc_ag, + NewPMode mode, uint32_t thread_group, base_t cmp_a = 50, base_t cmp_b = 42, bool no_copy = false) : + ready_future(rdy_fut), size_b(workload_b), chunk_size_b(chunk_size_b), data_a(data_a), data_b(data_b), + dest(dest), close_mem(numa_close), far_mem(numa_far), mode(mode), thread_group(thread_group), cmp_a(cmp_a), cmp_b(cmp_b), no_copy(no_copy){ + + chunk_size_w = chunk_size_b / sizeof(base_t); + chunk_cnt = size_b / 
chunk_size_b; + thread_count_fi = tc_fi; + thread_count_fc = tc_fc; + thread_count_ag = tc_ag; + + ready_flag_a = (volatile uint8_t *) numa_alloc_onnode( + chunk_cnt * thread_count_fi / 8 + ((chunk_cnt * thread_count_fi % 8) != 0), close_mem); + ready_flag_b = (volatile uint8_t *) numa_alloc_onnode( + chunk_cnt * thread_count_fc / 8 + ((chunk_cnt * thread_count_fc % 8) != 0), close_mem); + + mask_a = (uint16_t *) numa_alloc_onnode(size_b / sizeof(base_t), close_mem); + mask_b = (uint16_t *) numa_alloc_onnode(size_b / sizeof(base_t), close_mem); + + trt = new thread_runtime_timing(4, 16*4*4*4, close_mem); + bt = new barrier_timing(4, 16*4*4*4, close_mem); + reset_barriers(); + + if constexpr(BUFFER_LIMIT==1) { + // TODO size ok like that? + buffer_b = (base_t**) numa_alloc_onnode(size_b * sizeof(base_t*), close_mem); + buffer_b[0] = (base_t*) numa_alloc_onnode(thread_group * chunk_size_b, close_mem); + buffer_b[1] = (base_t*) numa_alloc_onnode(thread_group * chunk_size_b, close_mem); + } else { + buffer_b = (base_t **) numa_alloc_onnode(sizeof(base_t*), close_mem); + base_t* buffer_tmp = (base_t *) numa_alloc_onnode(size_b, close_mem); + *buffer_b = buffer_tmp; + } + }; + + void reset_barriers(){ + if(sync_barrier != nullptr) { + for(auto& barrier : *sync_barrier) { + delete barrier; + } + sync_barrier.reset(); + } + + sync_barrier = std::make_unique*>>(thread_group); + uint32_t thread_count_sum = thread_count_ag + thread_count_fi + thread_count_fc; + uint32_t barrier_count = barrier_mode.compare("global") == 0 ? 1 : thread_group; + uint32_t barrier_thread_count; + + if constexpr(simple){ + barrier_thread_count = (thread_group / barrier_count) * + (mode == NewPMode::Prefetch ? thread_count_sum : (thread_count_ag + thread_count_fi)); + } else { + barrier_thread_count = (thread_group / barrier_count) * thread_count_sum; + } + for(uint32_t i = 0; i < barrier_count; ++i) { + (*sync_barrier)[i] = new std::barrier(barrier_thread_count); + } + } + + void clear_buffers () { + std::memset((void*)ready_flag_a, 0x00, chunk_cnt * thread_count_fi / 8 + ((chunk_cnt * thread_count_fi % 8) != 0)); + std::memset((void*)ready_flag_b, 0x00, chunk_cnt * thread_count_fc / 8 + ((chunk_cnt * thread_count_fc % 8) != 0)); + + std::memset(mask_a, 0x00, size_b / sizeof(base_t)); + std::memset(mask_b, 0x00, size_b / sizeof(base_t)); + if constexpr(BUFFER_LIMIT==1) { + std::memset(buffer_b[0], 0x00, thread_group * chunk_size_b); + std::memset(buffer_b[1], 0x00, thread_group * chunk_size_b); + } else { + std::memset(*buffer_b, 0x00, size_b); + } + + trt->reset_accumulator(); + bt->reset_accumulator(); + reset_barriers(); + }; + + ~Query_Wrapper() { + numa_free((void*)ready_flag_a, + chunk_cnt * thread_count_fi / 8 + ((chunk_cnt * thread_count_fi % 8) != 0)); + numa_free((void*)ready_flag_b, + chunk_cnt * thread_count_fc / 8 + ((chunk_cnt * thread_count_fc % 8) != 0)); + + numa_free(mask_a, size_b / sizeof(base_t)); + numa_free(mask_b, size_b / sizeof(base_t)); + if constexpr(BUFFER_LIMIT==1) { + numa_free(buffer_b[0], thread_group * chunk_size_b); + numa_free(buffer_b[1], thread_group * chunk_size_b); + numa_free(buffer_b, size_b * sizeof(base_t*)); + } else { + numa_free(*buffer_b, size_b); + } + + delete trt; + for(auto& barrier : *sync_barrier) { + delete barrier; + } + delete bt; + + }; + + //this can be set without need to change allocations + void set_thread_group_count(uint32_t value) { + this->thread_group = value; + }; + +private: + static inline base_t* get_sub_chunk_ptr(base_t* base_ptr, size_t chunk_id, 
size_t chunk_size_w, size_t tid, + size_t tcnt) { + base_t* chunk_ptr = base_ptr + chunk_id * chunk_size_w; + return chunk_ptr + tid * (chunk_size_w / tcnt); + } + + static inline uint16_t* get_sub_mask_ptr(uint16_t* base_ptr, size_t chunk_id, size_t chunk_size_w, size_t tid, + size_t tcnt) { + // 16 integer are addressed with one uint16_t in mask buffer + size_t offset = chunk_id * chunk_size_w + tid * (chunk_size_w / tcnt); + return base_ptr + (offset / 16); + } + + static bool bit_at(volatile uint8_t* bitmap, uint32_t bitpos) { + uint8_t value = bitmap[bitpos / 8]; + switch(bitpos % 8) { + case 0: return value & 0b00000001; + case 1: return value & 0b00000010; + case 2: return value & 0b00000100; + case 3: return value & 0b00001000; + case 4: return value & 0b00010000; + case 5: return value & 0b00100000; + case 6: return value & 0b01000000; + case 7: return value & 0b10000000; + default: return false; + } + } + + static void set_bit_at(volatile uint8_t* bitmap, std::mutex& mutex, uint32_t bitpos) { + mutex.lock(); + switch(bitpos % 8) { + case 0: bitmap[bitpos / 8] |= 0b00000001;break; + case 1: bitmap[bitpos / 8] |= 0b00000010;break; + case 2: bitmap[bitpos / 8] |= 0b00000100;break; + case 3: bitmap[bitpos / 8] |= 0b00001000;break; + case 4: bitmap[bitpos / 8] |= 0b00010000;break; + case 5: bitmap[bitpos / 8] |= 0b00100000;break; + case 6: bitmap[bitpos / 8] |= 0b01000000;break; + case 7: bitmap[bitpos / 8] |= 0b10000000;break; + } + mutex.unlock(); + } + +public: + + static base_t checksum(base_t* a, base_t* b, base_t cmp_a, base_t cmp_b, size_t size_b) { + base_t sum = 0; + for(int i = 0; i < size_b / sizeof(base_t); ++i) { + if(a[i] >= cmp_a && b[i] <= cmp_b) { + sum += b[i]; + } + } + return sum; + } + + static void checkmask(uint16_t* mask, base_t cmp, base_t* data, size_t size_b, bool leq) { + uint32_t cnt = 0; + for(int i = 0; i < size_b / sizeof(base_t); ++i) { + if(leq) { + if(((data[i] <= cmp) != bit_at((uint8_t*)mask, i))) { + ++cnt; + } + } else { + if(((data[i] >= cmp) != bit_at((uint8_t*)mask, i))) { + ++cnt; + } + } + } + } + + static void checkmask_16(uint16_t* mask, base_t cmp, base_t* data, size_t size_b, bool leq) { + for(int i = 0; i < size_b / sizeof(base_t) / 16 ; ++i) { + std::bitset<16> m(mask[i]); + uint16_t ch = 0; + for(int j = 0; j < 16; ++j) { + if(data[i*16 + j] <= cmp) { + ch |= 0x1 << j; + } + } + std::bitset<16> c(ch); + + std::cout << "act " << m << std::endl; + std::cout << "rea " << c << std::endl << std::endl; + } + } + + + void scan_b(size_t gid, size_t gcnt, size_t tid) { + size_t tcnt = thread_count_fc; + assert(chunk_size_w % tcnt == 0); + assert(chunk_size_w % 16 == 0); + assert(chunk_size_w % tcnt * 16 == 0); + + // wait till everyone can start + ready_future->wait(); + + // the lower gids run once more if the chunks are not evenly distributable + uint32_t runs = chunk_cnt / gcnt + (chunk_cnt % gcnt > gid); + uint32_t barrier_idx = barrier_mode.compare("global") == 0 ? 
0 : gid; + for(uint32_t i = 0; i < runs; ++i) { + trt->start_timer(1, tid * gcnt + gid); + + // calculate pointers + size_t chunk_id = gid + gcnt * i; + base_t* chunk_ptr = get_sub_chunk_ptr(data_b , chunk_id, chunk_size_w, tid, tcnt); + uint16_t* mask_ptr = get_sub_mask_ptr (mask_b , chunk_id, chunk_size_w, tid, tcnt); + + if constexpr(simple){ + base_t* buffer_ptr; + if constexpr(BUFFER_LIMIT==1) { + buffer_ptr = get_sub_chunk_ptr(buffer_b[i % 2], gid, chunk_size_w, tid, tcnt); + } else { + buffer_ptr = get_sub_chunk_ptr(*buffer_b, chunk_id, chunk_size_w, tid, tcnt); + } + std::memcpy(buffer_ptr, chunk_ptr, chunk_size_b / tcnt); + } else { + if(no_copy) { + filterNoCopy::apply_same(mask_ptr, nullptr, chunk_ptr, cmp_b, chunk_size_b / tcnt); + } else { + base_t* buffer_ptr; + if constexpr(BUFFER_LIMIT==1) { + buffer_ptr = get_sub_chunk_ptr(buffer_b[i % 2], gid, chunk_size_w, tid, tcnt); + } else { + buffer_ptr = get_sub_chunk_ptr(*buffer_b, chunk_id, chunk_size_w, tid, tcnt); + } + filterCopy::apply_same(mask_ptr, buffer_ptr, chunk_ptr, cmp_b, chunk_size_b / tcnt); + } + } + + trt->stop_timer(1, tid * gcnt + gid); + bt->timed_wait(*(*sync_barrier)[barrier_idx], 1, tid * gcnt + gid); + + } + (*(*sync_barrier)[barrier_idx]).arrive_and_drop(); + + } + + void scan_a(size_t gid, size_t gcnt, size_t tid) { + size_t tcnt = thread_count_fi; + assert(chunk_size_w % tcnt == 0); + assert(chunk_size_w % 16 == 0); + assert(chunk_size_w % tcnt * 16 == 0); + + // wait till everyone can start + ready_future->wait(); + + // the lower gids run once more if the chunks are not evenly distributable + uint32_t runs = chunk_cnt / gcnt + (chunk_cnt % gcnt > gid); + uint32_t barrier_idx = barrier_mode.compare("global") == 0 ? 0 : gid; + for(uint32_t i = 0; i < runs; ++i) { + trt->start_timer(0, tid * gcnt + gid); + // calculate pointers + size_t chunk_id = gid + gcnt * i; + base_t* chunk_ptr = get_sub_chunk_ptr(data_a, chunk_id, chunk_size_w, tid, tcnt); + uint16_t* mask_ptr = get_sub_mask_ptr (mask_a, chunk_id, chunk_size_w, tid, tcnt); + + filter::apply_same(mask_ptr, nullptr, chunk_ptr, cmp_a, chunk_size_b / tcnt); + + trt->stop_timer(0, tid * gcnt + gid); + bt->timed_wait(*(*sync_barrier)[barrier_idx], 0, tid * gcnt + gid); + } + (*(*sync_barrier)[barrier_idx]).arrive_and_drop(); + } + + void aggr_j(size_t gid, size_t gcnt, size_t tid) { + size_t tcnt = thread_count_ag; + // wait till everyone can start + ready_future->wait(); + + // calculate values + __m512i aggregator = aggregation::OP::zero(); + // the lower gids run once more if the chunks are not evenly distributable + uint32_t runs = chunk_cnt / gcnt + (chunk_cnt % gcnt > gid); + uint32_t barrier_idx = barrier_mode.compare("global") == 0 ? 
0 : gid; + for(uint32_t i = 0; i < runs; ++i) { + + bt->timed_wait(*(*sync_barrier)[barrier_idx], 2, tid * gcnt + gid); + trt->start_timer(2, tid * gcnt + gid); + + // calculate pointers + size_t chunk_id = gid + gcnt * i; + base_t* chunk_ptr; + if(no_copy) { + chunk_ptr = get_sub_chunk_ptr(data_b, chunk_id, chunk_size_w, tid, tcnt); + } else { + if constexpr(BUFFER_LIMIT==1) { + chunk_ptr = get_sub_chunk_ptr(buffer_b[i % 2], gid, chunk_size_w, tid, tcnt); + } else { + chunk_ptr = get_sub_chunk_ptr(*buffer_b, chunk_id, chunk_size_w, tid, tcnt); + } + } + uint16_t* mask_ptr_a = get_sub_mask_ptr (mask_a, chunk_id, chunk_size_w, tid, tcnt); + uint16_t* mask_ptr_b = get_sub_mask_ptr (mask_b, chunk_id, chunk_size_w, tid, tcnt); + + base_t tmp = _mm512_reduce_add_epi64(aggregator); + if constexpr(simple){ + aggregator = aggregation::apply_masked(aggregator, chunk_ptr, mask_ptr_a, chunk_size_b / tcnt); + } else { + aggregator = aggregation::apply_masked(aggregator, chunk_ptr, mask_ptr_a, mask_ptr_b, chunk_size_b / tcnt); + } + trt->stop_timer(2, tid * gcnt + gid); + } + + // so threads with more runs dont wait for finished threads + (*(*sync_barrier)[barrier_idx]).arrive_and_drop(); + + aggregation::happly(dest + (tid * gcnt + gid), aggregator); + } +}; \ No newline at end of file diff --git a/qdp_project/src/benchmark/pipelines/MAX_scan_filter_pipe.h b/qdp_project/src/benchmark/pipelines/MAX_scan_filter_pipe.h new file mode 100644 index 0000000..3b1d861 --- /dev/null +++ b/qdp_project/src/benchmark/pipelines/MAX_scan_filter_pipe.h @@ -0,0 +1,395 @@ + +#include +#include +#include +#include +#include + +#include + +#include "filter.h" +#include "aggregation.h" +#include "vector_loader.h" +#include "timer_utils.h" +#include "barrier_utils.h" +#include "measurement_utils.h" +#include "execution_modes.h" + +#include "../../../thirdParty/dsa_offload/offloading-cacher/cache.hpp" + +template +class Query_Wrapper { +public: + // sync + std::shared_future* ready_future; + + thread_runtime_timing* trt; + barrier_timing* bt; + pcm_value_collector* pvc; + +private: + dsacache::Cache cache_; + + // numa + uint32_t close_mem; + uint32_t far_mem; + + // data + size_t size_b; + size_t chunk_size_b; + size_t chunk_size_w; + size_t chunk_cnt; + base_t* data_a; + base_t* data_b; + base_t* dest; + + // ratios + uint32_t thread_count_fc; + uint32_t thread_count_fi; + uint32_t thread_count_ag; + uint32_t thread_group; + + // done bits + volatile uint8_t* ready_flag_a; + volatile uint8_t* ready_flag_b; + std::mutex ready_a_m; + std::mutex ready_b_m; + + // buffer + uint16_t* mask_a; + uint16_t* mask_b; + + // params + base_t cmp_a; + base_t cmp_b; + NewPMode mode; + + // sync + std::unique_ptr*>> sync_barrier; + std::string barrier_mode = BARRIER_MODE; + + using filterCopy = Filter; + using filterNoCopy = Filter; + using filter = Filter; + using aggregation = Aggregation; + + void InitCache(const std::string& device) { + if (device == "default") { + static const auto cache_policy = [](const int numa_dst_node, const int numa_src_node, const size_t data_size) { + return numa_dst_node; + }; + + static const auto copy_policy = [](const int numa_dst_node, const int numa_src_node) { + return std::vector{ numa_src_node, numa_dst_node }; + }; + + cache_.Init(cache_policy,copy_policy); + } + else if (device == "xeonmax") { + static const auto cache_policy = [](const int numa_dst_node, const int numa_src_node, const size_t data_size) { + return numa_dst_node < 8 ? 
numa_dst_node + 8 : numa_dst_node; + }; + + static const auto copy_policy = [](const int numa_dst_node, const int numa_src_node) { + const bool same_socket = ((numa_dst_node ^ numa_src_node) & 4) == 0; + if (same_socket) { + const bool socket_number = numa_dst_node >> 2; + if (socket_number == 0) return std::vector{ 0, 1, 2, 3 }; + else return std::vector{ 4, 5, 6, 7 }; + } + else return std::vector{ numa_src_node, numa_dst_node }; + }; + + cache_.Init(cache_policy,copy_policy); + } + else { + std::cerr << "Given device '" << device << "' not supported!" << std::endl; + exit(-1); + } + } + +public: + + + Query_Wrapper(std::shared_future* rdy_fut, size_t workload_b, size_t chunk_size_b, base_t* data_a, + base_t* data_b, base_t* dest, uint32_t numa_close, uint32_t numa_far, uint32_t tc_fi, uint32_t tc_fc, uint32_t tc_ag, + NewPMode mode, uint32_t thread_group, base_t cmp_a = 50, base_t cmp_b = 42) : + ready_future(rdy_fut), size_b(workload_b), chunk_size_b(chunk_size_b), data_a(data_a), data_b(data_b), + dest(dest), close_mem(numa_close), far_mem(numa_far), mode(mode), thread_group(thread_group), cmp_a(cmp_a), cmp_b(cmp_b){ + + chunk_size_w = chunk_size_b / sizeof(base_t); + chunk_cnt = size_b / chunk_size_b; + thread_count_fi = tc_fi; + thread_count_fc = tc_fc; + thread_count_ag = tc_ag; + + ready_flag_a = (volatile uint8_t *) numa_alloc_onnode( + chunk_cnt * thread_count_fi / 8 + ((chunk_cnt * thread_count_fi % 8) != 0), close_mem); + ready_flag_b = (volatile uint8_t *) numa_alloc_onnode( + chunk_cnt * thread_count_fc / 8 + ((chunk_cnt * thread_count_fc % 8) != 0), close_mem); + + mask_a = (uint16_t *) numa_alloc_onnode(size_b / sizeof(base_t), close_mem); + mask_b = (uint16_t *) numa_alloc_onnode(size_b / sizeof(base_t), close_mem); + + InitCache("xeonmax"); + + size_t measurement_space = THREAD_GROUP_MULTIPLIER * std::max(std::max(tc_fi, tc_fc), tc_ag); + trt = new thread_runtime_timing(3, measurement_space, far_mem); + bt = new barrier_timing(3, measurement_space, far_mem); + pvc = new pcm_value_collector({"scan_a", "scan_b", "aggr_j"}, measurement_space, far_mem); + reset_barriers(); + }; + + void reset_barriers(){ + if(sync_barrier != nullptr) { + for(auto& barrier : *sync_barrier) { + delete barrier; + } + sync_barrier.reset(); + } + + sync_barrier = std::make_unique*>>(thread_group); + uint32_t thread_count_sum = thread_count_ag + thread_count_fi + thread_count_fc; + uint32_t barrier_count = barrier_mode.compare("global") == 0 ? 1 : thread_group; + uint32_t barrier_thread_count; + + if constexpr(simple){ + barrier_thread_count = (thread_group / barrier_count) * + (mode == NewPMode::Prefetch ? 
thread_count_sum : (thread_count_ag + thread_count_fi)); + } else { + barrier_thread_count = (thread_group / barrier_count) * thread_count_sum; + } + for(uint32_t i = 0; i < barrier_count; ++i) { + (*sync_barrier)[i] = new std::barrier(barrier_thread_count); + } + } + + void clear_buffers () { + std::memset((void*)ready_flag_a, 0x00, chunk_cnt * thread_count_fi / 8 + ((chunk_cnt * thread_count_fi % 8) != 0)); + std::memset((void*)ready_flag_b, 0x00, chunk_cnt * thread_count_fc / 8 + ((chunk_cnt * thread_count_fc % 8) != 0)); + + std::memset(mask_a, 0x00, size_b / sizeof(base_t)); + std::memset(mask_b, 0x00, size_b / sizeof(base_t)); + + cache_.Clear(); + + trt->reset_accumulator(); + bt->reset_accumulator(); + pvc->reset(); + reset_barriers(); + }; + + ~Query_Wrapper() { + numa_free((void*)ready_flag_a, + chunk_cnt * thread_count_fi / 8 + ((chunk_cnt * thread_count_fi % 8) != 0)); + numa_free((void*)ready_flag_b, + chunk_cnt * thread_count_fc / 8 + ((chunk_cnt * thread_count_fc % 8) != 0)); + + numa_free(mask_a, size_b / sizeof(base_t)); + numa_free(mask_b, size_b / sizeof(base_t)); + + delete trt; + for(auto& barrier : *sync_barrier) { + delete barrier; + } + delete bt; + delete pvc; + }; + + //this can be set without need to change allocations + void set_thread_group_count(uint32_t value) { + this->thread_group = value; + }; + +private: + static inline base_t* get_sub_chunk_ptr(base_t* base_ptr, size_t chunk_id, size_t chunk_size_w, size_t tid, + size_t tcnt) { + base_t* chunk_ptr = base_ptr + chunk_id * chunk_size_w; + return chunk_ptr + tid * (chunk_size_w / tcnt); + } + + static inline uint16_t* get_sub_mask_ptr(uint16_t* base_ptr, size_t chunk_id, size_t chunk_size_w, size_t tid, + size_t tcnt) { + // 16 integer are addressed with one uint16_t in mask buffer + size_t offset = chunk_id * chunk_size_w + tid * (chunk_size_w / tcnt); + return base_ptr + (offset / 16); + } + + static bool bit_at(volatile uint8_t* bitmap, uint32_t bitpos) { + uint8_t value = bitmap[bitpos / 8]; + switch(bitpos % 8) { + case 0: return value & 0b00000001; + case 1: return value & 0b00000010; + case 2: return value & 0b00000100; + case 3: return value & 0b00001000; + case 4: return value & 0b00010000; + case 5: return value & 0b00100000; + case 6: return value & 0b01000000; + case 7: return value & 0b10000000; + default: return false; + } + } + + static void set_bit_at(volatile uint8_t* bitmap, std::mutex& mutex, uint32_t bitpos) { + mutex.lock(); + switch(bitpos % 8) { + case 0: bitmap[bitpos / 8] |= 0b00000001;break; + case 1: bitmap[bitpos / 8] |= 0b00000010;break; + case 2: bitmap[bitpos / 8] |= 0b00000100;break; + case 3: bitmap[bitpos / 8] |= 0b00001000;break; + case 4: bitmap[bitpos / 8] |= 0b00010000;break; + case 5: bitmap[bitpos / 8] |= 0b00100000;break; + case 6: bitmap[bitpos / 8] |= 0b01000000;break; + case 7: bitmap[bitpos / 8] |= 0b10000000;break; + } + mutex.unlock(); + } + +public: + void scan_b(size_t gid, size_t gcnt, size_t tid) { + size_t tcnt = thread_count_fc; + assert(chunk_size_w % tcnt == 0); + assert(chunk_size_w % 16 == 0); + assert(chunk_size_w % tcnt * 16 == 0); + + // wait till everyone can start + ready_future->wait(); + + // the lower gids run once more if the chunks are not evenly distributable + uint32_t runs = chunk_cnt / gcnt + (chunk_cnt % gcnt > gid); + uint32_t barrier_idx = barrier_mode.compare("global") == 0 ? 
0 : gid; + for(uint32_t i = 0; i < runs; ++i) { + trt->start_timer(1, tid * gcnt + gid); + pvc->start("scan_b", tid * gcnt + gid); + + // calculate pointers + size_t chunk_id = gid + gcnt * i; + base_t* chunk_ptr = get_sub_chunk_ptr(data_b, chunk_id, chunk_size_w, tid, tcnt); + uint16_t* mask_ptr = get_sub_mask_ptr(mask_b, chunk_id, chunk_size_w, tid, tcnt); + + if constexpr(simple){ + cache_.Access(chunk_ptr, chunk_size_b / tcnt); + } else { + const auto data = cache_.Access(chunk_ptr, chunk_size_b / tcnt); + + // wait on copy to complete - during this time other threads may + // continue with their calculation which leads to little impact + // and we will be faster if the cache is used + + data->WaitOnCompletion(); + + // obtain the data location from the cache entry + + base_t* data_ptr = data->GetDataLocation(); + + // nullptr is still a legal return value for CacheData::GetLocation() + // even after waiting, so this must be checked + + if (data_ptr == nullptr) { + data_ptr = chunk_ptr; + } + + filterNoCopy::apply_same(mask_ptr, nullptr, data_ptr, cmp_b, chunk_size_b / tcnt); + } + + pvc->stop("scan_b", tid * gcnt + gid); + trt->stop_timer(1, tid * gcnt + gid); + + bt->timed_wait(*(*sync_barrier)[barrier_idx], 1, tid * gcnt + gid); + } + (*(*sync_barrier)[barrier_idx]).arrive_and_drop(); + } + + void scan_a(size_t gid, size_t gcnt, size_t tid) { + size_t tcnt = thread_count_fi; + assert(chunk_size_w % tcnt == 0); + assert(chunk_size_w % 16 == 0); + assert(chunk_size_w % tcnt * 16 == 0); + + // wait till everyone can start + ready_future->wait(); + + // the lower gids run once more if the chunks are not evenly distributable + uint32_t runs = chunk_cnt / gcnt + (chunk_cnt % gcnt > gid); + uint32_t barrier_idx = barrier_mode.compare("global") == 0 ? 0 : gid; + + for(uint32_t i = 0; i < runs; ++i) { + trt->start_timer(0, tid * gcnt + gid); + pvc->start("scan_a", tid * gcnt + gid); + // calculate pointers + size_t chunk_id = gid + gcnt * i; + base_t* chunk_ptr = get_sub_chunk_ptr(data_a, chunk_id, chunk_size_w, tid, tcnt); + uint16_t* mask_ptr = get_sub_mask_ptr (mask_a, chunk_id, chunk_size_w, tid, tcnt); + + filter::apply_same(mask_ptr, nullptr, chunk_ptr, cmp_a, chunk_size_b / tcnt); + + pvc->stop("scan_a", tid * gcnt + gid); + trt->stop_timer(0, tid * gcnt + gid); + bt->timed_wait(*(*sync_barrier)[barrier_idx], 0, tid * gcnt + gid); + } + (*(*sync_barrier)[barrier_idx]).arrive_and_drop(); + } + + void aggr_j(size_t gid, size_t gcnt, size_t tid) { + size_t tcnt = thread_count_ag; + // wait till everyone can start + ready_future->wait(); + + // calculate values + __m512i aggregator = aggregation::OP::zero(); + // the lower gids run once more if the chunks are not evenly distributable + uint32_t runs = chunk_cnt / gcnt + (chunk_cnt % gcnt > gid); + uint32_t barrier_idx = barrier_mode.compare("global") == 0 ? 
0 : gid; + for(uint32_t i = 0; i < runs; ++i) { + + bt->timed_wait(*(*sync_barrier)[barrier_idx], 2, tid * gcnt + gid); + trt->start_timer(2, tid * gcnt + gid); + pvc->start("aggr_j", tid * gcnt + gid); + + // calculate pointers + size_t chunk_id = gid + gcnt * i; + const base_t* chunk_ptr = get_sub_chunk_ptr(data_b, chunk_id, chunk_size_w, tid, tcnt); + + // access the cache for the given chunk which will have been accessed in scan_b + + const auto data = cache_.Access(chunk_ptr, chunk_size_b / tcnt); + + // wait on the caching task to complete, this will give time for other processes + // to make progress here which will therefore not hurt performance + + data->WaitOnCompletion(); + + // after the copy task has finished we obtain the pointer to the cached + // copy of data_b which is then used from now on + + const base_t* data_ptr = data->GetDataLocation(); + + // nullptr is still a legal return value for CacheData::GetLocation() + // even after waiting, so this must be checked + + if (data_ptr == nullptr) { + data_ptr = chunk_ptr; + std::cerr << "Cache Miss" << std::endl; + } + + uint16_t* mask_ptr_a = get_sub_mask_ptr (mask_a, chunk_id, chunk_size_w, tid, tcnt); + uint16_t* mask_ptr_b = get_sub_mask_ptr (mask_b, chunk_id, chunk_size_w, tid, tcnt); + + base_t tmp = _mm512_reduce_add_epi64(aggregator); + + if constexpr(simple){ + aggregator = aggregation::apply_masked(aggregator, data_ptr, mask_ptr_a, chunk_size_b / tcnt); + } else { + aggregator = aggregation::apply_masked(aggregator, data_ptr, mask_ptr_a, mask_ptr_b, chunk_size_b / tcnt); + } + + pvc->stop("aggr_j", tid * gcnt + gid); + trt->stop_timer(2, tid * gcnt + gid); + } + + // so threads with more runs dont wait for alerady finished threads + (*(*sync_barrier)[barrier_idx]).arrive_and_drop(); + + aggregation::happly(dest + (tid * gcnt + gid), aggregator); + } +}; \ No newline at end of file diff --git a/qdp_project/src/benchmark/pipelines/scan_filter_pipe.h b/qdp_project/src/benchmark/pipelines/scan_filter_pipe.h new file mode 100644 index 0000000..2b10b06 --- /dev/null +++ b/qdp_project/src/benchmark/pipelines/scan_filter_pipe.h @@ -0,0 +1,387 @@ + +#include +#include +#include +#include + +#include + +#include "filter.h" +#include "aggregation.h" +#include "vector_loader.h" +#include "timer_utils.h" +#include "barrier_utils.h" +#include "execution_modes.h" + + +template +class Query_Wrapper { +public: + // sync + std::shared_future* ready_future; + + thread_runtime_timing* trt; + barrier_timing* bt; + +private: + // numa + uint32_t close_mem; + uint32_t far_mem; + + // data + size_t size_b; + size_t chunk_size_b; + size_t chunk_size_w; + size_t chunk_cnt; + base_t* data_a; + base_t* data_b; + base_t* dest; + + // ratios + uint32_t thread_count_fc; + uint32_t thread_count_fi; + uint32_t thread_count_ag; + uint32_t thread_group; + + // done bits + volatile uint8_t* ready_flag_a; + volatile uint8_t* ready_flag_b; + std::mutex ready_a_m; + std::mutex ready_b_m; + + // buffer + uint16_t* mask_a; + uint16_t* mask_b; + base_t** buffer_b; + + // params + base_t cmp_a; + base_t cmp_b; + bool no_copy; + PMode mode; + + // sync + std::unique_ptr*>> sync_barrier; + std::string barrier_mode = BARRIER_MODE; + + using filterCopy = Filter; + using filterNoCopy = Filter; + using filter = Filter; + using aggregation = Aggregation; + +public: + + + Query_Wrapper(std::shared_future* rdy_fut, size_t workload_b, size_t chunk_size_b, base_t* data_a, + base_t* data_b, base_t* dest, uint32_t numa_close, uint32_t numa_far, uint32_t tc_fi, uint32_t 
tc_fc, uint32_t tc_ag, + PMode mode, uint32_t thread_group, base_t cmp_a = 50, base_t cmp_b = 42, bool no_copy = false) : + ready_future(rdy_fut), size_b(workload_b), chunk_size_b(chunk_size_b), data_a(data_a), data_b(data_b), + dest(dest), close_mem(numa_close), far_mem(numa_far), mode(mode), thread_group(thread_group), cmp_a(cmp_a), cmp_b(cmp_b), no_copy(no_copy){ + + chunk_size_w = chunk_size_b / sizeof(base_t); + chunk_cnt = size_b / chunk_size_b; + thread_count_fi = tc_fi; + thread_count_fc = tc_fc; + thread_count_ag = tc_ag; + + ready_flag_a = (volatile uint8_t *) numa_alloc_onnode( + chunk_cnt * thread_count_fi / 8 + ((chunk_cnt * thread_count_fi % 8) != 0), close_mem); + ready_flag_b = (volatile uint8_t *) numa_alloc_onnode( + chunk_cnt * thread_count_fc / 8 + ((chunk_cnt * thread_count_fc % 8) != 0), close_mem); + + mask_a = (uint16_t *) numa_alloc_onnode(size_b / sizeof(base_t), close_mem); + mask_b = (uint16_t *) numa_alloc_onnode(size_b / sizeof(base_t), close_mem); + + trt = new thread_runtime_timing(4, 20, close_mem); + bt = new barrier_timing(4, 20, close_mem); + reset_barriers(); + + if constexpr(BUFFER_LIMIT==1) { + // TODO size ok like that? + buffer_b = (base_t**) numa_alloc_onnode(size_b * sizeof(base_t*), close_mem); + buffer_b[0] = (base_t*) numa_alloc_onnode(thread_group * chunk_size_b, close_mem); + buffer_b[1] = (base_t*) numa_alloc_onnode(thread_group * chunk_size_b, close_mem); + } else { + buffer_b = (base_t **) numa_alloc_onnode(sizeof(base_t*), close_mem); + base_t* buffer_tmp = (base_t *) numa_alloc_onnode(size_b, close_mem); + *buffer_b = buffer_tmp; + } + }; + + void reset_barriers(){ + if(sync_barrier != nullptr) { + for(auto& barrier : *sync_barrier) { + delete barrier; + } + sync_barrier.reset(); + } + + sync_barrier = std::make_unique*>>(thread_group); + uint32_t thread_count_sum = thread_count_ag + thread_count_fi + thread_count_fc; + uint32_t barrier_count = barrier_mode.compare("global") == 0 ? 1 : thread_group; + uint32_t barrier_thread_count; + + if constexpr(simple){ + barrier_thread_count = (thread_group / barrier_count) * + (mode == PMode::expl_copy ? 
thread_count_sum : (thread_count_ag + thread_count_fi)); + } else { + barrier_thread_count = (thread_group / barrier_count) * thread_count_sum; + } + for(uint32_t i = 0; i < barrier_count; ++i) { + (*sync_barrier)[i] = new std::barrier(barrier_thread_count); + } + } + + + void clear_buffers () { + std::memset((void*)ready_flag_a, 0x00, chunk_cnt * thread_count_fi / 8 + ((chunk_cnt * thread_count_fi % 8) != 0)); + std::memset((void*)ready_flag_b, 0x00, chunk_cnt * thread_count_fc / 8 + ((chunk_cnt * thread_count_fc % 8) != 0)); + + std::memset(mask_a, 0x00, size_b / sizeof(base_t)); + std::memset(mask_b, 0x00, size_b / sizeof(base_t)); + if constexpr(BUFFER_LIMIT==1) { + std::memset(buffer_b[0], 0x00, thread_group * chunk_size_b); + std::memset(buffer_b[1], 0x00, thread_group * chunk_size_b); + } else { + std::memset(*buffer_b, 0x00, size_b); + } + + trt->reset_accumulator(); + bt->reset_accumulator(); + reset_barriers(); + }; + + ~Query_Wrapper() { + numa_free((void*)ready_flag_a, + chunk_cnt * thread_count_fi / 8 + ((chunk_cnt * thread_count_fi % 8) != 0)); + numa_free((void*)ready_flag_b, + chunk_cnt * thread_count_fc / 8 + ((chunk_cnt * thread_count_fc % 8) != 0)); + + numa_free(mask_a, size_b / sizeof(base_t)); + numa_free(mask_b, size_b / sizeof(base_t)); + if constexpr(BUFFER_LIMIT==1) { + numa_free(buffer_b[0], thread_group * chunk_size_b); + numa_free(buffer_b[1], thread_group * chunk_size_b); + numa_free(buffer_b, size_b * sizeof(base_t*)); + } else { + numa_free(*buffer_b, size_b); + } + + delete trt; + for(auto& barrier : *sync_barrier) { + delete barrier; + } + delete bt; + + }; + +private: + static inline base_t* get_sub_chunk_ptr(base_t* base_ptr, size_t chunk_id, size_t chunk_size_w, size_t tid, + size_t tcnt) { + base_t* chunk_ptr = base_ptr + chunk_id * chunk_size_w; + return chunk_ptr + tid * (chunk_size_w / tcnt); + } + + static inline uint16_t* get_sub_mask_ptr(uint16_t* base_ptr, size_t chunk_id, size_t chunk_size_w, size_t tid, + size_t tcnt) { + // 16 integer are addressed with one uint16_t in mask buffer + size_t offset = chunk_id * chunk_size_w + tid * (chunk_size_w / tcnt); + return base_ptr + (offset / 16); + } + + static bool bit_at(volatile uint8_t* bitmap, uint32_t bitpos) { + uint8_t value = bitmap[bitpos / 8]; + switch(bitpos % 8) { + case 0: return value & 0b00000001; + case 1: return value & 0b00000010; + case 2: return value & 0b00000100; + case 3: return value & 0b00001000; + case 4: return value & 0b00010000; + case 5: return value & 0b00100000; + case 6: return value & 0b01000000; + case 7: return value & 0b10000000; + default: return false; + } + } + + static void set_bit_at(volatile uint8_t* bitmap, std::mutex& mutex, uint32_t bitpos) { + mutex.lock(); + switch(bitpos % 8) { + case 0: bitmap[bitpos / 8] |= 0b00000001;break; + case 1: bitmap[bitpos / 8] |= 0b00000010;break; + case 2: bitmap[bitpos / 8] |= 0b00000100;break; + case 3: bitmap[bitpos / 8] |= 0b00001000;break; + case 4: bitmap[bitpos / 8] |= 0b00010000;break; + case 5: bitmap[bitpos / 8] |= 0b00100000;break; + case 6: bitmap[bitpos / 8] |= 0b01000000;break; + case 7: bitmap[bitpos / 8] |= 0b10000000;break; + } + mutex.unlock(); + } + +public: + + static base_t checksum(base_t* a, base_t* b, base_t cmp_a, base_t cmp_b, size_t size_b) { + base_t sum = 0; + for(int i = 0; i < size_b / sizeof(base_t); ++i) { + if(a[i] >= cmp_a && b[i] <= cmp_b) { + sum += b[i]; + } + } + return sum; + } + + static void checkmask(uint16_t* mask, base_t cmp, base_t* data, size_t size_b, bool leq) { + uint32_t 
cnt = 0; + for(int i = 0; i < size_b / sizeof(base_t); ++i) { + if(leq) { + if(((data[i] <= cmp) != bit_at((uint8_t*)mask, i))) { + ++cnt; + } + } else { + if(((data[i] >= cmp) != bit_at((uint8_t*)mask, i))) { + ++cnt; + } + } + } + } + + static void checkmask_16(uint16_t* mask, base_t cmp, base_t* data, size_t size_b, bool leq) { + for(int i = 0; i < size_b / sizeof(base_t) / 16 ; ++i) { + std::bitset<16> m(mask[i]); + uint16_t ch = 0; + for(int j = 0; j < 16; ++j) { + if(data[i*16 + j] <= cmp) { + ch |= 0x1 << j; + } + } + std::bitset<16> c(ch); + + std::cout << "act " << m << std::endl; + std::cout << "rea " << c << std::endl << std::endl; + } + } + + + void scan_b(size_t gid, size_t gcnt, size_t tid) { + size_t tcnt = thread_count_fc; + assert(chunk_size_w % tcnt == 0); + assert(chunk_size_w % 16 == 0); + assert(chunk_size_w % tcnt * 16 == 0); + + // wait till everyone can start + ready_future->wait(); + + // the lower gids run once more if the chunks are not evenly distributable + uint32_t runs = chunk_cnt / gcnt + (chunk_cnt % gcnt > gid); + uint32_t barrier_idx = barrier_mode.compare("global") == 0 ? 0 : gid; + for(uint32_t i = 0; i < runs; ++i) { + trt->start_timer(1, tid * gcnt + gid); + + // calculate pointers + size_t chunk_id = gid + gcnt * i; + base_t* chunk_ptr = get_sub_chunk_ptr(data_b , chunk_id, chunk_size_w, tid, tcnt); + uint16_t* mask_ptr = get_sub_mask_ptr (mask_b , chunk_id, chunk_size_w, tid, tcnt); + + if constexpr(simple){ + base_t* buffer_ptr; + if constexpr(BUFFER_LIMIT==1) { + buffer_ptr = get_sub_chunk_ptr(buffer_b[i % 2], gid, chunk_size_w, tid, tcnt); + } else { + buffer_ptr = get_sub_chunk_ptr(*buffer_b, chunk_id, chunk_size_w, tid, tcnt); + } + std::memcpy(buffer_ptr, chunk_ptr, chunk_size_b / tcnt); + } else { + if(no_copy) { + filterNoCopy::apply_same(mask_ptr, nullptr, chunk_ptr, cmp_b, chunk_size_b / tcnt); + } else { + base_t* buffer_ptr; + if constexpr(BUFFER_LIMIT==1) { + buffer_ptr = get_sub_chunk_ptr(buffer_b[i % 2], gid, chunk_size_w, tid, tcnt); + } else { + buffer_ptr = get_sub_chunk_ptr(*buffer_b, chunk_id, chunk_size_w, tid, tcnt); + } + filterCopy::apply_same(mask_ptr, buffer_ptr, chunk_ptr, cmp_b, chunk_size_b / tcnt); + } + } + + trt->stop_timer(1, tid * gcnt + gid); + bt->timed_wait(*(*sync_barrier)[barrier_idx], 1, tid * gcnt + gid); + + } + (*(*sync_barrier)[barrier_idx]).arrive_and_drop(); + + } + + void scan_a(size_t gid, size_t gcnt, size_t tid) { + size_t tcnt = thread_count_fi; + assert(chunk_size_w % tcnt == 0); + assert(chunk_size_w % 16 == 0); + assert(chunk_size_w % tcnt * 16 == 0); + + // wait till everyone can start + ready_future->wait(); + + // the lower gids run once more if the chunks are not evenly distributable + uint32_t runs = chunk_cnt / gcnt + (chunk_cnt % gcnt > gid); + uint32_t barrier_idx = barrier_mode.compare("global") == 0 ? 
0 : gid; + for(uint32_t i = 0; i < runs; ++i) { + trt->start_timer(0, tid * gcnt + gid); + // calculate pointers + size_t chunk_id = gid + gcnt * i; + base_t* chunk_ptr = get_sub_chunk_ptr(data_a, chunk_id, chunk_size_w, tid, tcnt); + uint16_t* mask_ptr = get_sub_mask_ptr (mask_a, chunk_id, chunk_size_w, tid, tcnt); + + filter::apply_same(mask_ptr, nullptr, chunk_ptr, cmp_a, chunk_size_b / tcnt); + + trt->stop_timer(0, tid * gcnt + gid); + bt->timed_wait(*(*sync_barrier)[barrier_idx], 0, tid * gcnt + gid); + } + (*(*sync_barrier)[barrier_idx]).arrive_and_drop(); + } + + void aggr_j(size_t gid, size_t gcnt, size_t tid) { + size_t tcnt = thread_count_ag; + // wait till everyone can start + ready_future->wait(); + + // calculate values + __m512i aggregator = aggregation::OP::zero(); + // the lower gids run once more if the chunks are not evenly distributable + uint32_t runs = chunk_cnt / gcnt + (chunk_cnt % gcnt > gid); + uint32_t barrier_idx = barrier_mode.compare("global") == 0 ? 0 : gid; + for(uint32_t i = 0; i < runs; ++i) { + + bt->timed_wait(*(*sync_barrier)[barrier_idx], 2, tid * gcnt + gid); + trt->start_timer(2, tid * gcnt + gid); + + // calculate pointers + size_t chunk_id = gid + gcnt * i; + base_t* chunk_ptr; + if(no_copy) { + chunk_ptr = get_sub_chunk_ptr(data_b, chunk_id, chunk_size_w, tid, tcnt); + } else { + if constexpr(BUFFER_LIMIT==1) { + chunk_ptr = get_sub_chunk_ptr(buffer_b[i%2], gid, chunk_size_w, tid, tcnt); + } else { + chunk_ptr = get_sub_chunk_ptr(*buffer_b, chunk_id, chunk_size_w, tid, tcnt); + } + } + uint16_t* mask_ptr_a = get_sub_mask_ptr (mask_a, chunk_id, chunk_size_w, tid, tcnt); + uint16_t* mask_ptr_b = get_sub_mask_ptr (mask_b, chunk_id, chunk_size_w, tid, tcnt); + + base_t tmp = _mm512_reduce_add_epi64(aggregator); + if constexpr(simple){ + aggregator = aggregation::apply_masked(aggregator, chunk_ptr, mask_ptr_a, chunk_size_b / tcnt); + } else { + aggregator = aggregation::apply_masked(aggregator, chunk_ptr, mask_ptr_a, mask_ptr_b, chunk_size_b / tcnt); + } + trt->stop_timer(2, tid * gcnt + gid); + } + + // so threads with more runs dont wait for finished threads + (*(*sync_barrier)[barrier_idx]).arrive_and_drop(); + + aggregation::happly(dest + (tid * gcnt + gid), aggregator); + } +}; \ No newline at end of file diff --git a/qdp_project/src/utils/array_utils.h b/qdp_project/src/utils/array_utils.h new file mode 100644 index 0000000..52eba76 --- /dev/null +++ b/qdp_project/src/utils/array_utils.h @@ -0,0 +1,80 @@ +#pragma once +#include +#include +#include +#include +#include +#include + +#include + +/// @brief Fills a given array with random generated integers. +/// @tparam base_t Datatype of the array +/// @param dest Pointer to the array +/// @param size Size of the array +/// @param min Minumum value of the generated integers +/// @param max Maximum value of the generated integers +template +void fill(base_t * dest, uint64_t size, base_t min, base_t max) { + std::srand(std::time(nullptr)); + for(uint64_t i = 0; i < size/sizeof(base_t); ++i) { + dest[i] = (std::rand() % (max - min)) + min; + } +} + +/// @brief Fills a given array with random generated integers using the mersenne twister engine (type std::mt19937). 
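+/// A seed argument of 0 (the default) derives the seed from std::random_device combined with the
+/// current system and high-resolution clock values; any other value is used as the seed directly.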
+/// @tparam base_t Datatype of the array +/// @param dest Pointer to the array +/// @param size Size of the array +/// @param min Minumum value of the generated integers +/// @param max Maximum value of the generated integers +template +void fill_mt(T* array, uint64_t size, T min, T max, uint64_t int_seed = 0) { + static_assert(std::is_integral::value, "Data type is not integral."); + + size = size / sizeof(T); + + std::mt19937::result_type seed; + if (int_seed == 0) { + std::random_device rd; + seed = rd() ^ ( + (std::mt19937::result_type) std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()).count() + + (std::mt19937::result_type) std::chrono::duration_cast( + std::chrono::high_resolution_clock::now().time_since_epoch()).count()); + } else seed = int_seed; + + std::mt19937 gen(seed); + std::uniform_int_distribution distrib(min, max); + + for (uint64_t j = 0; j < size; ++j) { + array[j] = distrib(gen); + } + +} + +/** + * @brief Checks if two arrays of the integral type *T* contain the same values + * + * @tparam T Integral type of *array0* and *array1* + * @param array0 Array 0 to check + * @param array1 Array 1 to check + * @param size_b Size of the two arrays in byte + * @param verbose Decides if outputs are verbose of not (print every not matching numbers with their index) + * @return bool Weathor or not the content is equal or not + */ +template +typename std::enable_if::value, bool>::type + check_same(T* array0, T* array1, size_t size_b, bool verbose) { + for(uint64_t i = 0; i <= size_b / sizeof(T); i += 64 / sizeof(T)) { + __m512i vec0 = _mm512_stream_load_si512(array0 + i); + __m512i vec1 = _mm512_stream_load_si512(array1 + i); + + __mmask8 res = _mm512_cmpeq_epi64_mask(vec0, vec1); + } + + //TODO complete function + + return false; +} + diff --git a/qdp_project/src/utils/barrier_utils.h b/qdp_project/src/utils/barrier_utils.h new file mode 100644 index 0000000..a68f801 --- /dev/null +++ b/qdp_project/src/utils/barrier_utils.h @@ -0,0 +1,73 @@ +#pragma once + +#include +#include +#include +#include + +#define BARRIER_TIMINGS 1 + + +struct barrier_completion_function { + inline void operator() () { + return; + } +}; + +struct barrier_timing { + + uint32_t time_points, time_threads; + double** time_accumulator; + + barrier_timing(uint32_t timing_points, uint32_t timing_threads, uint32_t memory_node) { +#ifdef BARRIER_TIMINGS + time_points = timing_points; + time_threads = timing_threads; + time_accumulator = (double**) numa_alloc_onnode(timing_points * sizeof(double*), memory_node); + for(uint32_t i = 0; i < timing_points; ++i) { + time_accumulator[i] = (double*) numa_alloc_onnode(timing_threads * sizeof(double), memory_node); + } +#endif + } + + ~barrier_timing() { +#ifdef BARRIER_TIMINGS + for(uint32_t i = 0; i < time_points; ++i) { + numa_free(time_accumulator[i], time_threads * sizeof(double)); + } + numa_free(time_accumulator, time_points * sizeof(double*)); +#endif + } + + void reset_accumulator() { +#ifdef BARRIER_TIMINGS + for(uint32_t i = 0; i < time_points; ++i){ + for(uint32_t j = 0; j < time_threads; ++j){ + time_accumulator[i][j] = 0.0; + }} +#endif + } + + double summarize_time(uint32_t time_point) { +#ifdef BARRIER_TIMINGS + double sum = 0.0; + for(uint32_t i = 0; i < time_threads; ++i) { + sum += time_accumulator[time_point][i]; + } + return sum; +#endif + } + + void timed_wait(std::barrier& barrier, uint32_t point_id, uint32_t thread_id) { +#ifdef BARRIER_TIMINGS + auto before_barrier = std::chrono::steady_clock::now(); +#endif + 
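+        // the barrier wait below always executes; only the surrounding time measurement is
+        // compiled in when BARRIER_TIMINGS is defined, so disabling the timings does not
+        // change the synchronization behaviour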
barrier.arrive_and_wait(); +#ifdef BARRIER_TIMINGS + auto after_barrier = std::chrono::steady_clock::now(); + uint64_t barrier_wait_time = std::chrono::duration_cast(after_barrier - before_barrier).count(); + double seconds = barrier_wait_time / (1000.0 * 1000.0 * 1000.0); + time_accumulator[point_id][thread_id] += seconds; +#endif + } +}; \ No newline at end of file diff --git a/qdp_project/src/utils/const.h b/qdp_project/src/utils/const.h new file mode 100644 index 0000000..fde4b55 --- /dev/null +++ b/qdp_project/src/utils/const.h @@ -0,0 +1,33 @@ +/** + * @file const.h + * @author André Berthold + * @brief Defines handy constants. + * @version 0.1 + * @date 2023-05-25 + * + * @copyright Copyright (c) 2023 + * + */ + +#pragma once + +#include +#include + +constexpr size_t VECTOR_SIZE_I = 512; +constexpr size_t VECTOR_SIZE_B = VECTOR_SIZE_I / 8; +constexpr size_t VECTOR_SIZE_H = VECTOR_SIZE_B / sizeof(uint32_t); +constexpr size_t VECTOR_SIZE_W = VECTOR_SIZE_B / sizeof(uint64_t); + +template +constexpr size_t VECTOR_SIZE() { + return VECTOR_SIZE_B / sizeof(T); +} + +template +constexpr size_t V_MASK_SIZE() { + return VECTOR_SIZE() / 8; +} + + +const __mmask16 full_m16 = _mm512_int2mask(0xFFFF); \ No newline at end of file diff --git a/qdp_project/src/utils/cpu_set_utils.h b/qdp_project/src/utils/cpu_set_utils.h new file mode 100644 index 0000000..ba82604 --- /dev/null +++ b/qdp_project/src/utils/cpu_set_utils.h @@ -0,0 +1,82 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +/** Sets all bits in a given cpu_set_t between L and H (condition L <= H)*/ +#define CPU_BETWEEN(L, H, SET) assert(L <= H); for(; L < H; ++L) {CPU_SET(L, SET);} + +/** + * Applies the affinity defined in set to the thread, through pthread library + * calls. If it fails it wites the problem to stderr and terminated the program. +*/ +inline void pin_thread(std::thread& thread, cpu_set_t* set) { + int error_code = pthread_setaffinity_np(thread.native_handle(), sizeof(cpu_set_t), set); + if (error_code != 0) { + std::cerr << "Error calling pthread_setaffinity_np in copy_pool assignment: " << error_code << std::endl; + exit(-1); + } +} + +/** + * Returns the cpu id of the thread_id-th cpu in a given (multi)range. Thread_id + * greater than the number of cpus in the (multi)range are valid. In this case + * the (thread_id % #cpus in the range)-th cpu in the range is returned. +*/ +int get_cpu_id(int thread_id, const std::vector>& range) { + int subrange_size = range[0].second - range[0].first; + + int i = 0; + while(subrange_size <= thread_id) { + thread_id -= subrange_size; + i = (i + 1) % range.size(); + subrange_size = range[i].second - range[i].first; + } + return thread_id + range[i].first; +} + +/*inline void cpu_set_between(cpu_set_t* set, uint32_t low, uint32_t high) { + assert(low != high); + if (low > high) std::swap(low, high); + + for(; low < high; ++low) { + CPU_SET(low, set); + } +}*/ + +/** + * Pins the given thread to the thread_id-th cpu in the given range. +*/ +void pin_thread_in_range(std::thread& thread, int thread_id, std::vector>& range) { + cpu_set_t set; + CPU_ZERO(&set); + CPU_SET(get_cpu_id(thread_id, range), &set); + + pin_thread(thread, &set); +} + +/** + * Pins the given thread to all cpus in the given range. 
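+ * Illustrative call (thread name and core ranges are made up; the pair layout follows
+ * get_cpu_id above):
+ *   std::vector<std::pair<int, int>> cores = { {0, 8}, {16, 24} };
+ *   pin_thread_in_range(worker_thread, cores);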
+*/ +void pin_thread_in_range(std::thread& thread, std::vector>& range) { + cpu_set_t set; + CPU_ZERO(&set); + for(auto r : range) { CPU_BETWEEN(r.first, r.second, &set); } + + pin_thread(thread, &set); +} + +/** + * Pins the given thread to all cpu ids between low (incl.) and high (excl.). +*/ +inline void pin_thread_between(std::thread& thread, uint32_t low, uint32_t high) { + cpu_set_t set; + CPU_ZERO(&set); + CPU_BETWEEN(low, high, &set); + + pin_thread(thread, &set); +} \ No newline at end of file diff --git a/qdp_project/src/utils/execution_modes.h b/qdp_project/src/utils/execution_modes.h new file mode 100644 index 0000000..ca04b4f --- /dev/null +++ b/qdp_project/src/utils/execution_modes.h @@ -0,0 +1,89 @@ +#include + +enum PMode{no_copy = 0, hbm = 1, expl_copy = 2}; +struct mode_manager { + static inline PMode inc(PMode value) { + return static_cast(value + 1); + }; + static inline bool pred(PMode value) { + return no_copy <= value && value <= expl_copy; + }; + static std::string string(PMode value) { + switch(value) { + case no_copy: return "no_copy"; + case hbm: return "hbm_pre"; + case expl_copy:return "expl_co"; + } return "no_copy"; + }; +}; + +#define SIMPLE_Q 0 +#define COMPLEX_Q 1 + +#define SCAN_A 0 +#define SCAN_B 1 +#define AGGR_J 2 + +enum NewPMode{DRAM_base = 0, HBM_base = 1, Mixed_base = 2, Prefetch = 3}; +struct new_mode_manager { + /*constexpr static int thread_counts[2][4][3] = { + //simple query + //scan_a, scan_b, aggr_j + {{3, 0, 3}, // DRAM_base + {3, 0, 3}, // HBM_base + {3, 0, 3}, // Mixed_base + {1, 4, 1}},// Prefetching + //complex query + {{1, 4, 1}, // DRAM_base + {1, 4, 1}, // HBM_base + {1, 4, 1}, // Mixed_base + {1, 4, 1}},// Prefetching + };*/ + + /*constexpr static int thread_counts[2][4][3] = { + //simple query + //scan_a, scan_b, aggr_j + {{2, 0, 4}, // DRAM_base + {2, 0, 4}, // HBM_base + {2, 0, 4}, // Mixed_base + {1, 4, 1}},// Prefetching + //complex query + {{1, 4, 1}, // DRAM_base + {1, 4, 1}, // HBM_base + {1, 4, 1}, // Mixed_base + {1, 4, 1}},// Prefetching + };*/ + + constexpr static int thread_counts[2][4][3] = { + //simple query + //scan_a, scan_b, aggr_j + {{4, 0, 2}, // DRAM_base + {4, 0, 2}, // HBM_base + {4, 0, 2}, // Mixed_base + {1, 4, 1}},// Prefetching + //complex query + {{1, 4, 1}, // DRAM_base + {1, 4, 1}, // HBM_base + {1, 4, 1}, // Mixed_base + {1, 4, 1}},// Prefetching + }; + + static inline NewPMode inc(NewPMode value) { + return static_cast(value + 1); + }; + static inline bool pred(NewPMode value) { + return DRAM_base <= value && value <= Prefetch; + }; + static int thread_count(uint8_t query_type, NewPMode mode, uint8_t thread_type){ + if(query_type > 1) query_type = 1; + if(thread_type > 2) thread_type = 2; + return (thread_counts[query_type][mode][thread_type]); + }; + static std::string string(NewPMode value) { + switch(value) { + case DRAM_base: return "DRAM_Baseline"; + case HBM_base: return "HBM_Baseline"; + case Mixed_base: return "DRAM_HBM_Baseline"; + } return "Q-d_Prefetching"; + }; +}; \ No newline at end of file diff --git a/qdp_project/src/utils/file_output.h b/qdp_project/src/utils/file_output.h new file mode 100644 index 0000000..1dd85ba --- /dev/null +++ b/qdp_project/src/utils/file_output.h @@ -0,0 +1,76 @@ +/** + * @file file_output.h + * @author André Berthold + * @brief Implements a template-function that accepts an arbitrary number of parameters that should be printed + * @version 0.1 + * @date 2023-05-25 + * + * @copyright Copyright (c) 2023 + * + */ +#pragma once + +#include +#include +#include + 
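+// the helpers below serialize an arbitrary list of values into a single comma-separated line,
+// which the terminating print_to_file overload ends with std::endl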
+#include "iterable_range.h" + +template +inline constexpr bool is_numeric_v = std::disjunction< + std::is_integral, + std::is_floating_point>::value; + +/** + * @brief Converts a parameter to a string by either using it directly or its member current (if it is of type Labeled) + * as parameter to the std::string-Constructor. + * + * @tparam T Type of the parameter + * @param value Parameter to be converted + * @return std::string The converted parameter + */ +template +inline std::string to_string(T value) { + if constexpr(std::is_base_of::value){ + // integrals cannot be use in the string constructor and must be translated by the std::to_string-function + if constexpr (is_numeric_v) { + return std::to_string(value.current); + } else { + return std::string(value.current); + } + } else { + // integrals cannot be use in the string constructor and must be translated by the std::to_string-function + if constexpr (is_numeric_v) { + return std::to_string(value); + } else { + return std::string(value); + } + } +} + +/** + * @brief This function wites the content of *val* to *file*. Terminates terecursive function definition. + * + * @tparam type Type of the paramter *val* (is usually implicitly defeined) + * @param file File that is written to + * @param val Value that is translated to a char stream and written to the file + */ +template +inline void print_to_file(std::ofstream &file, type val) { + file << to_string(val) << std::endl; +} + +/** + * @brief This function wites the content of *val* and that content if *vals* to *file*. + * + * @tparam type Type of the paramter *val* (is usually implicitly defeined) + * @tparam types Parameter pack that describes the types of *vals* + * @param file File that is written to + * @param val Value that is translated to a char stream and written to the file + * @param vals Paramater pack of values that are gonna be printed to the file + */ +template +inline void print_to_file(std::ofstream &file, type val, types ... vals) { + file << to_string(val) << ","; + print_to_file(file, vals...); +} \ No newline at end of file diff --git a/qdp_project/src/utils/iterable_range.h b/qdp_project/src/utils/iterable_range.h new file mode 100644 index 0000000..95fc57e --- /dev/null +++ b/qdp_project/src/utils/iterable_range.h @@ -0,0 +1,208 @@ + #pragma once + +#include +#include +#include + + +constexpr auto NO_NEXT = "false"; + +/** + * @brief Class that adds an label member-parameter to a sub-class + * + */ +class Labeled { +public: + std::string label; +public: + Labeled(std::string str) : label(str) {}; + Labeled(const char* str) { this->label = std::string(str); }; +}; + +/** + * @brief Converts a parameter to a string by either reading the member label (if it is of type Labeled) or using it + * as parameter to the std::string-Constructor. + * + * @tparam T Type of the parameter + * @param value Parameter to be converted + * @return std::string The converted parameter + */ +template +inline std::string generateHead(T value) { + if constexpr(std::is_base_of::value){ + return value.label; + } else { + return std::string(value); + } +} + +/** + * @brief Converts a parameter-pack to a string calling genarateHead(T) on every parameter and concatenatin the results. 
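+ *
+ * For example, generateHead(some_range, "iteration") yields "chunk_size,iteration" when
+ * some_range is a Labeled object whose label is "chunk_size" (names chosen purely for illustration).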
+ * + * @tparam T Type of the first parameter + * @tparam Ts Parameter pack specifying the preceeding parameters' types + * @param value Parameter to be transformed + * @param values Parameter-pack of the next prameters to be transformed + * @return std::string Comma-separated concatenation of all parameters string representation + */ +template +inline std::string generateHead(T value, Ts... values) { + return generateHead(value) + ',' + generateHead(values...); +} + + +/** + * @brief Takes a single Range object and calls its next function. + * + * @tparam T Specific type of the Range object + * @param t Instance of the Range object + * @return std::string Label of the Range object or "false" if the Range reaced its end and was reset + */ +template +std::string IterateOnce(T& t) { + if(t.next()) return t.label; + else t.reset(); + return std::string(NO_NEXT); //the string signalises that the iteration has to be terminiated. +} + +/** + * @brief Takes a number of Range objects and recusively increments them till the first Range does not reach its end + * upon incrementing. It tarts at the first Range object given. Every Range object that reached its end is reset to + * its start value. + * + * @tparam T Specific type of the first Range object + * @tparam Ts Types to the following Range objects + * @param t First instance of the Range object + * @param ts Parameter pack of the following Range objects + * @return std::string Label of the highest index Range object that was altered, or "false" if the last Range object + * reache its end and was reset + */ +template +std::string IterateOnce(T& t , Ts&... ts) { + if(t.next()) return t.label; + else t.reset(); + return IterateOnce(ts...); +} + + +/** + * @brief Class that provides a convenient interface for iteratin throug a parameter range. It stores a public value + * that can be altered by the classes' methods. + * + * @tparam T Base type of the parameter + * @tparam INIT Initial value of the current pointer + * @tparam PRED Struct providing an apply function testing if the current value is in range or not + * @tparam INC Struct providing an apply function setting the current value to the value following the current value + */ +template +class Range : public Labeled { +public: + /** + * @brief Current value of the parameter + */ + T current = INIT; + + /** + * @brief Resets current to its initial value + */ + void reset() {current = INIT; }; + + /** + * @brief Sets current to its next value (according to INC::inc) and returns if the range Reached its end + * (accordingt to PRED::pred). + * + * @return true The newly assigned value of current is in the range + * @return false Otherwise + */ + bool next() { + current = INC::inc(current); + return PRED::pred(current); + }; + + /** + * @brief Checks if current is in the Range (according to PRED). + * + * @return true PRED returns true + * @return false Otherwise + */ + bool valid() { return PRED::apply(current); }; +}; + +/** + * @brief Class that is in contrast to Range specialized for integral values. 
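+ *
+ * A minimal increment policy for illustration (the struct name is made up and not part of this header):
+ *   struct IncByOne { static uint32_t inc(uint32_t v) { return v + 1; } };
+ *   Int_Range<uint32_t, 0, 8, IncByOne> runs{"runs"}; // current counts up to (excluding) 8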
+ * + * @tparam T Integral base type of the Range + * @tparam INIT Initial value of the parameter + * @tparam MAX Maximal value of the parameter + * @tparam INC Struct providing an apply function setting the current value to the value following the current value + */ +template +class Int_Range : public Labeled { +static_assert(std::is_integral::value, "Int_Range requires an integral base type"); + +public: + const T max = MAX; + T current = INIT; + + void reset() {current = INIT; }; + + bool next() { + current = INC::inc(current); + return current < MAX; + }; + + bool valid() { return current < MAX; }; + +}; + +/** + * @brief Class that is in contrast to Int_Range specialized for integrals that grow linearly. + * + * @tparam T Integral base type of the Range + * @tparam INIT Initial value of the parameter + * @tparam MAX Maximal value of the parameter + * @tparam STEP Increase of the value per next()-call + */ +template +class Linear_Int_Range : public Labeled { +static_assert(std::is_integral::value, "Linear_Int_Range requires an integral base type"); + +public: + const T max = MAX; + T current = INIT; + + void reset() {current = INIT; }; + + bool next() { + current += STEP; + return current < MAX; + }; + + bool valid() { return current < MAX; }; +}; + +/** + * @brief Class that is in contrast to Int_Range specialized for integrals that grow exponetially. + * + * @tparam T Integral base type of the Range + * @tparam INIT Initial value of the parameter + * @tparam MAX Maximal value of the parameter + * @tparam FACTOR Multiplicative Increase of the value per next()-call + */ +template +class Exp_Int_Range : public Labeled { +static_assert(std::is_integral::value, "Exp_Int_Range requires an integral base type"); + +public: + const T max = MAX; + T current = INIT; + + void reset() {current = INIT; }; + + bool next() { + current *= FACTOR; + return current < MAX; + }; + + bool valid() { return current < MAX; }; +}; \ No newline at end of file diff --git a/qdp_project/src/utils/measurement_utils.h b/qdp_project/src/utils/measurement_utils.h new file mode 100644 index 0000000..f403de0 --- /dev/null +++ b/qdp_project/src/utils/measurement_utils.h @@ -0,0 +1,152 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include + + +#if PCM_M == 1 +#define PCM_MEASURE 1 +#include "pcm.h" +#endif + + + +struct pcm_value_collector { + const uint32_t value_count = 6; + + uint32_t threads; + std::vector points; +#ifdef PCM_MEASURE + pcm::SystemCounterState** states; +#endif + uint64_t** collection; + + pcm_value_collector(const std::vector& in_points, uint32_t threads, uint32_t memory_node) : threads(threads) { +#ifdef PCM_MEASURE + points = std::vector(in_points); + + collection = (uint64_t**) numa_alloc_onnode(threads * sizeof(uint64_t*), memory_node); + states = (pcm::SystemCounterState**) numa_alloc_onnode(threads * sizeof(pcm::SystemCounterState*), memory_node); + for(int i = 0; i < threads; ++i) { + collection[i] = (uint64_t*) numa_alloc_onnode(points.size() * value_count * sizeof(uint64_t), memory_node); + states[i] = (pcm::SystemCounterState*) numa_alloc_onnode(points.size() * sizeof(pcm::SystemCounterState), memory_node); + } +#endif + } + + ~pcm_value_collector() { +#ifdef PCM_MEASURE + for(int i = 0; i < threads; ++i) { + numa_free(collection[threads], points.size() * value_count * sizeof(uint64_t)); + } + numa_free(collection, threads * sizeof(uint64_t*)); + numa_free(states, threads * sizeof(pcm::SystemCounterState)); +#endif + } + + void reset() { +#ifdef PCM_MEASURE + 
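+        // zero the accumulated counter deltas for every thread and measurement point; the stored
+        // SystemCounterState snapshots need no reset, start() overwrites them on the next measurement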
for(int i = 0; i < threads; ++i) + for(uint32_t j = 0; j < points.size() * value_count; ++j){ + collection[i][j] = 0; + } +#endif + } + + int64_t point_index(const std::string& value) { + auto it = std::find(points.begin(), points.end(), value); + + if(it == points.end()) return -1; + else return it - points.begin(); + } + + std::vector summarize(const std::string &point) { +#ifdef PCM_MEASURE + std::vector sums(value_count); + int64_t idx = point_index(point); + if(idx < 0) return sums; + + for(uint32_t v = 0; v < value_count; ++v) { + for(uint32_t i = 0; i < threads; ++i) { + sums[v] += collection[i][static_cast(idx) + points.size() * v]; + } + } + return sums; +#endif + return std::vector {0}; + } + + std::string summarize_as_string(const std::string &point) { +#ifdef PCM_MEASURE + auto summary = summarize(point); + auto it = summary.begin(); + auto end = summary.end(); + + if(it >= end) return ""; + + std::string result(""); + result += std::to_string(*it); + ++it; + + while(it < end) { + result += ","; + result += std::to_string(*it); + ++it; + } + return result; +#endif + return ""; + } + + void start(const std::string& point, uint32_t thread) { +#ifdef PCM_MEASURE + int64_t idx = point_index(point); + if(idx < 0) { + std::cerr << "Invalid 'point' given. Ignored!" << std::endl; + return; + } + + states[thread][static_cast(idx)] = pcm::getSystemCounterState(); +#endif + } + + static std::string getHead(const std::string& point) { + return point + "_l2h," + + point + "_l2m," + + point + "_l3h," + + point + "_l3hns," + + point + "_l3m," + + point + "_mc"; + } + +#ifdef PCM_MEASURE + void read_values(uint32_t point_idx, uint32_t thread, pcm::SystemCounterState& start, pcm::SystemCounterState& end) { + collection[thread][point_idx + points.size() * 0] += getL2CacheHits(start, end); + collection[thread][point_idx + points.size() * 1] += getL2CacheMisses(start, end); + collection[thread][point_idx + points.size() * 2] += getL3CacheHits(start, end); + collection[thread][point_idx + points.size() * 3] += getL3CacheHitsNoSnoop(start, end); + collection[thread][point_idx + points.size() * 4] += getL3CacheMisses(start, end); + collection[thread][point_idx + points.size() * 5] += getBytesReadFromMC(start, end); + } +#endif + + void stop(const std::string& point, uint32_t thread) { +#ifdef PCM_MEASURE + auto state = pcm::getSystemCounterState(); + + int64_t idx = point_index(point); + if(idx < 0) { + std::cerr << "Invalid 'point' given. Ignored!" << std::endl; + return; + } + + auto start = states[thread][static_cast(idx)]; + read_values(static_cast(idx), thread, start, state); +#endif + } +}; diff --git a/qdp_project/src/utils/memory_literals.h b/qdp_project/src/utils/memory_literals.h new file mode 100644 index 0000000..bcf6395 --- /dev/null +++ b/qdp_project/src/utils/memory_literals.h @@ -0,0 +1,45 @@ +/** + * @file memory_literals.h + * @author André Berthold + * @brief Defines some operators that ease to define a certain size of memory. + * e.g. 
to alloc 3 Gib (Gibibit = 2^30 bit) of memory one can now simply write: "std::malloc(3_Gib)" + * to alloc 512 MB (Megabyte = 10^2 byte) of memory one can now simply write: "std::malloc(512_MB)" + * @version 0.1 + * @date 2023-05-25 + * + * @copyright Copyright (c) 2023 + * + */ +#pragma once + +#include + +typedef const unsigned long long int ull_int; +//***************************************************************************// +// Bit **********************************************************************// +//***************************************************************************// +constexpr size_t operator ""_b(ull_int value) { + // one byte is 8 bit + one byte if bit is no multiple of 8 + return value / 8 + value % 8; +} +constexpr size_t operator ""_kb (ull_int value) { return value * 1000 / 8; } +constexpr size_t operator ""_kib(ull_int value) { return value * 1024 / 8; } +constexpr size_t operator ""_Mb (ull_int value) { return value * 1000 * 1000 / 8; } +constexpr size_t operator ""_Mib(ull_int value) { return value * 1024 * 1024 / 8; } +constexpr size_t operator ""_Gb (ull_int value) { return value * 1000 * 1000 * 1000 / 8; } +constexpr size_t operator ""_Gib(ull_int value) { return value * 1024 * 1024 * 1024 / 8; } +constexpr size_t operator ""_Tb (ull_int value) { return value * 1000 * 1000 * 1000 * 1000 / 8; } +constexpr size_t operator ""_Tib(ull_int value) { return value * 1024 * 1024 * 1024 * 1024 / 8; } + +//***************************************************************************// +// Byte *********************************************************************// +//***************************************************************************// +constexpr size_t operator ""_B (ull_int value) { return value; } +constexpr size_t operator ""_kB (ull_int value) { return value * 1000; } +constexpr size_t operator ""_kiB(ull_int value) { return value * 1024; } +constexpr size_t operator ""_MB (ull_int value) { return value * 1000 * 1000; } +constexpr size_t operator ""_MiB(ull_int value) { return value * 1024 * 1024; } +constexpr size_t operator ""_GB (ull_int value) { return value * 1000 * 1000 * 1000; } +constexpr size_t operator ""_GiB(ull_int value) { return value * 1024 * 1024 * 1024; } +constexpr size_t operator ""_TB (ull_int value) { return value * 1000 * 1000 * 1000 * 1000; } +constexpr size_t operator ""_TiB(ull_int value) { return value * 1024 * 1024 * 1024 * 1024; } \ No newline at end of file diff --git a/qdp_project/src/utils/pcm.h b/qdp_project/src/utils/pcm.h new file mode 100644 index 0000000..91a19e0 --- /dev/null +++ b/qdp_project/src/utils/pcm.h @@ -0,0 +1,6 @@ +#pragma once +//this file includes all important header from the pcm repository +#include "cpucounters.h" +#include "msr.h" +#include "pci.h" +#include "mutex.h" diff --git a/qdp_project/src/utils/timer_utils.h b/qdp_project/src/utils/timer_utils.h new file mode 100644 index 0000000..b6ec54f --- /dev/null +++ b/qdp_project/src/utils/timer_utils.h @@ -0,0 +1,80 @@ +#pragma once + +#include +#include +#include + +#include + +#define THREAD_TIMINGS 1 + + + +struct thread_runtime_timing { + using time_point_t = std::chrono::time_point; + + uint32_t time_points, time_threads; + time_point_t** start_times; + double** time_accumulator; + + thread_runtime_timing(uint32_t timing_points, uint32_t timing_threads, uint32_t memory_node) { +#ifdef THREAD_TIMINGS + time_points = timing_points; + time_threads = timing_threads; + start_times = (time_point_t**) numa_alloc_onnode(timing_points * 
diff --git a/qdp_project/src/utils/pcm.h b/qdp_project/src/utils/pcm.h
new file mode 100644
index 0000000..91a19e0
--- /dev/null
+++ b/qdp_project/src/utils/pcm.h
@@ -0,0 +1,6 @@
+#pragma once
+//this file includes all important headers from the pcm repository
+#include "cpucounters.h"
+#include "msr.h"
+#include "pci.h"
+#include "mutex.h"
diff --git a/qdp_project/src/utils/timer_utils.h b/qdp_project/src/utils/timer_utils.h
new file mode 100644
index 0000000..b6ec54f
--- /dev/null
+++ b/qdp_project/src/utils/timer_utils.h
@@ -0,0 +1,80 @@
+#pragma once
+
+#include <cstdint>
+#include <cstddef>
+#include <chrono>
+
+#include <numa.h>
+
+#define THREAD_TIMINGS 1
+
+
+
+struct thread_runtime_timing {
+    using time_point_t = std::chrono::time_point<std::chrono::steady_clock>;
+
+    uint32_t time_points, time_threads;
+    time_point_t** start_times;
+    double** time_accumulator;
+
+    thread_runtime_timing(uint32_t timing_points, uint32_t timing_threads, uint32_t memory_node) {
+#ifdef THREAD_TIMINGS
+        time_points = timing_points;
+        time_threads = timing_threads;
+        start_times = (time_point_t**) numa_alloc_onnode(timing_points * sizeof(time_point_t*), memory_node);
+        time_accumulator = (double**) numa_alloc_onnode(timing_points * sizeof(double*), memory_node);
+        for(uint32_t i = 0; i < timing_points; ++i) {
+            start_times[i] = (time_point_t*) numa_alloc_onnode(timing_threads * sizeof(time_point_t), memory_node);
+            time_accumulator[i] = (double*) numa_alloc_onnode(timing_threads * sizeof(double), memory_node);
+        }
+#endif
+    }
+
+    ~thread_runtime_timing() {
+#ifdef THREAD_TIMINGS
+        for(uint32_t i = 0; i < time_points; ++i) {
+            numa_free(start_times[i], time_threads * sizeof(time_point_t));
+            numa_free(time_accumulator[i], time_threads * sizeof(double));
+        }
+        numa_free(start_times, time_points * sizeof(time_point_t*));
+        numa_free(time_accumulator, time_points * sizeof(double*));
+#endif
+    }
+
+    void reset_accumulator() {
+#ifdef THREAD_TIMINGS
+        for(uint32_t i = 0; i < time_points; ++i){
+            for(uint32_t j = 0; j < time_threads; ++j){
+                time_accumulator[i][j] = 0.0;
+        }}
+#endif
+    }
+
+    double summarize_time(uint32_t time_point) {
+#ifdef THREAD_TIMINGS
+        double sum = 0.0;
+        for(uint32_t i = 0; i < time_threads; ++i) {
+            sum += time_accumulator[time_point][i];
+        }
+        return sum;
+#endif
+    }
+
+    void stop_timer(uint32_t point_id, uint32_t thread_id) {
+#ifdef THREAD_TIMINGS
+        auto end_time = std::chrono::steady_clock::now();
+        auto start_time = start_times[point_id][thread_id];
+
+        uint64_t time = std::chrono::duration_cast<std::chrono::nanoseconds>(end_time - start_time).count();
+        double seconds = time / (1000.0 * 1000.0 * 1000.0);
+        time_accumulator[point_id][thread_id] += seconds;
+#endif
+    }
+
+    void start_timer(uint32_t point_id, uint32_t thread_id) {
+#ifdef THREAD_TIMINGS
+        start_times[point_id][thread_id] = std::chrono::steady_clock::now();
+#endif
+    }
+
+};
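A compact usage sketch for thread_runtime_timing (point and thread counts as well as the NUMA node are hypothetical; assumes libnuma is available and timer_utils.h is included):

    #include "timer_utils.h"

    void scan_worker(thread_runtime_timing& trt, uint32_t tid) {
        trt.start_timer(0, tid);   // timing point 0, e.g. the scan phase
        // ... perform the measured work ...
        trt.stop_timer(0, tid);    // adds the elapsed seconds for (point 0, thread tid)
    }

    int main() {
        thread_runtime_timing trt(3, 4, 0); // 3 timing points, 4 threads, buffers on NUMA node 0
        trt.reset_accumulator();
        // ... spawn 4 threads, each calling scan_worker(trt, tid) ...
        const double scan_seconds = trt.summarize_time(0); // sum over all threads for point 0
        (void) scan_seconds;
        return 0;
    }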
diff --git a/qdp_project/src/utils/vector_loader.h b/qdp_project/src/utils/vector_loader.h
new file mode 100644
index 0000000..ceab169
--- /dev/null
+++ b/qdp_project/src/utils/vector_loader.h
@@ -0,0 +1,93 @@
+/**
+ * @file vector_loader.h
+ * @author André Berthold
+ * @brief Provides an interface to easily exchange vector loading strategies
+ * @version 0.1
+ * @date 2023-05-25
+ *
+ * @copyright Copyright (c) 2023
+ *
+ */
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+
+#include <immintrin.h>
+
+enum load_mode {Unaligned = 0, Aligned = 1, Stream = 2};
+
+/**
+ * @brief A class template that provides functions for loading and storing data of type *base_t* into/from vectors using the strategy *mode*.
+ *
+ * @tparam base_t Base type of the data
+ * @tparam mode Strategy for loading the vector
+ */
+template<typename base_t, load_mode mode>
+class Vector_Loader {};
+
+/**
+ * @brief Template specialization for Vector_Loader with base_t = uint32_t.
+ *
+ * @tparam mode Strategy for loading the vector
+ */
+template<load_mode mode>
+class Vector_Loader<uint32_t, mode> {
+    using base_t = uint32_t;
+    using mask_t = __mmask16;
+    using mask_base_t = uint8_t;
+public:
+
+    /**
+     * @brief Loads 512 bit of data into a vector register
+     *
+     * @param src Pointer to the data to load
+     * @return __m512i The vector register with the loaded data
+     */
+    static inline __m512i load(base_t* src) {
+        if constexpr (mode == load_mode::Unaligned) return _mm512_loadu_epi32(src);
+        else if constexpr (mode == load_mode::Aligned) return _mm512_load_epi32 (src);
+        else if constexpr (mode == load_mode::Stream) return _mm512_stream_load_si512(src);
+    };
+
+    /**
+     * @brief Stores data from a given vector register to a destination pointer
+     *
+     * @param dst Pointer to the data destination
+     * @param vector Vector register containing the data to store
+     */
+    static inline void store(base_t* dst, __m512i vector) {
+        if constexpr (mode == load_mode::Unaligned) _mm512_storeu_epi32(dst, vector);
+        else if constexpr (mode == load_mode::Aligned) _mm512_store_epi32 (dst, vector);
+        else if constexpr (mode == load_mode::Stream) _mm512_stream_si512((__m512i*)(dst), vector);
+    };
+};
+
+/**
+ * @brief Template specialization for Vector_Loader with base_t = uint64_t.
+ *
+ * @tparam mode Strategy for loading the vector
+ */
+template<load_mode mode>
+class Vector_Loader<uint64_t, mode> {
+    using base_t = uint64_t;
+    using mask_t = __mmask8;
+    using mask_base_t = uint8_t;
+public:
+
+
+
+    static inline __m512i load(base_t* src) {
+        if constexpr (mode == load_mode::Unaligned) return _mm512_loadu_epi64(src);
+        else if constexpr (mode == load_mode::Aligned) return _mm512_load_epi64 (src);
+        else if constexpr (mode == load_mode::Stream) return _mm512_stream_load_si512(src);
+    };
+
+    static inline void store(base_t* dst, __m512i vector) {
+        if constexpr (mode == load_mode::Unaligned) _mm512_storeu_epi64(dst, vector);
+        else if constexpr (mode == load_mode::Aligned) _mm512_store_epi64 (dst, vector);
+        else if constexpr (mode == load_mode::Stream) _mm512_stream_si512((__m512i*)(dst), vector);
+    };
+
+};
From 2b635d025d12e4853f5f4249156222d9e4fd3336 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Constantin=20F=C3=BCrst?=
Date: Wed, 17 Jan 2024 13:43:16 +0100
Subject: [PATCH 26/29] add defaulted-constructor for cache which got
 implicitly deleted by deleting the copy constructor

---
 offloading-cacher/cache.hpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/offloading-cacher/cache.hpp b/offloading-cacher/cache.hpp
index b84e347..058a1e1 100644
--- a/offloading-cacher/cache.hpp
+++ b/offloading-cacher/cache.hpp
@@ -237,6 +237,7 @@ namespace dsacache {
     std::unique_ptr GetFromCache(uint8_t* src, const size_t size, const int dst_node);
 
   public:
+    Cache() = default;
     Cache(const Cache& other) = delete;
 
     // initializes the cache with the two policy functions
From 6cc49daf893c349d00583c203154f1eb48fa9dc7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Constantin=20F=C3=BCrst?=
Date: Wed, 17 Jan 2024 13:45:24 +0100
Subject: [PATCH 27/29] remove all unused files and benchmark methods, adapt
 the MAX-Benchmark to use the cacher, remove the manually-set numa
 configuration and replace it with dynamic adaptation to the configured
 affinity, add two more template options to the worker that control whether
 column a is cached as well and whether scan_b waits on the caching

---
 qdp_project/CMakeLists.txt                    |  26 +-
 qdp_project/bench_all_dimes.sh                |  10 -
 qdp_project/bench_max.sh                      |   7 +-
 qdp_project/cmake_all_dimes.sh                |  33 --
 qdp_project/src/benchmark/DIMES_benchmark.cpp | 240 -----------
.../src/benchmark/DIMES_cores_benchmark.cpp | 260 ------------ qdp_project/src/benchmark/MAX_benchmark.cpp | 81 ++-- qdp_project/src/benchmark/QDP_minimal.h | 147 ------- .../src/benchmark/doubly_filtered_agg.cpp | 149 ------- .../benchmark/filter_aggregate_pipeline.cpp | 184 --------- qdp_project/src/benchmark/latency.cpp | 188 --------- .../src/benchmark/micro_benchmarks.cpp | 271 ------------ .../pipelines/DIMES_scan_filter_pipe.h | 391 ------------------ .../pipelines/MAX_scan_filter_pipe.h | 199 +++++---- .../benchmark/pipelines/scan_filter_pipe.h | 387 ----------------- qdp_project/src/utils/execution_modes.h | 41 +- 16 files changed, 181 insertions(+), 2433 deletions(-) delete mode 100644 qdp_project/bench_all_dimes.sh delete mode 100644 qdp_project/cmake_all_dimes.sh delete mode 100644 qdp_project/src/benchmark/DIMES_benchmark.cpp delete mode 100644 qdp_project/src/benchmark/DIMES_cores_benchmark.cpp delete mode 100644 qdp_project/src/benchmark/QDP_minimal.h delete mode 100644 qdp_project/src/benchmark/doubly_filtered_agg.cpp delete mode 100644 qdp_project/src/benchmark/filter_aggregate_pipeline.cpp delete mode 100644 qdp_project/src/benchmark/latency.cpp delete mode 100644 qdp_project/src/benchmark/micro_benchmarks.cpp delete mode 100644 qdp_project/src/benchmark/pipelines/DIMES_scan_filter_pipe.h delete mode 100644 qdp_project/src/benchmark/pipelines/scan_filter_pipe.h diff --git a/qdp_project/CMakeLists.txt b/qdp_project/CMakeLists.txt index 71c8452..97c1915 100644 --- a/qdp_project/CMakeLists.txt +++ b/qdp_project/CMakeLists.txt @@ -20,12 +20,6 @@ set(SUPPRESS_WARNINGS "-Wno-literal-suffix -Wno-volatile") set(DEBUG_FLAGS "-g3" "-ggdb") set(RELEASE_FLAGS "-O3") -#set pcm location -set(PCM_LOCATION ./thirdParty/pcm) -set(PCM_LINKS -lpcm -L${CMAKE_CURRENT_LIST_DIR}/${PCM_LOCATION}/build/lib) -# pass the in formation about the shared library location to the linker -link_directories(${CMAKE_CURRENT_LIST_DIR}/${PCM_LOCATION}/build/lib) - #set flags used for Release and Debug build type add_compile_options( "$<$:${RELEASE_FLAGS}>" @@ -71,34 +65,18 @@ add_definitions(-DTHREAD_GROUP_MULTIPLIER=${THREAD_FACTOR}) eval(PINNING "cpu;numa" "cpu") add_definitions(-DPINNING=$) -eval(PCM_M "true;false" "false") -add_definitions(-DPCM_M=$) -add_definitions(${PCM_LINKS}) - # build directory set(CMAKE_BINARY_DIR "../bin") #relative to inside build set(EXECUTABLE_OUTPUT_PATH ${CMAKE_BINARY_DIR}) - - # include directories include_directories(src/utils) include_directories(src/algorithm) include_directories(src/algorithm/operators) -include_directories(thirdParty/pcm/src) # link libraries -link_libraries(-lnuma -lpthread) +link_libraries(-lnuma -lpthread -l:libdml.a) # Add targets only below # specify build targets -add_executable(FilterAggregatePipeline src/benchmark/filter_aggregate_pipeline.cpp) -add_executable(DoublyFiltered src/benchmark/doubly_filtered_agg.cpp) -add_executable(DIMESBench src/benchmark/DIMES_benchmark.cpp) -add_executable(DIMESCoreBench src/benchmark/DIMES_cores_benchmark.cpp) -add_executable(MicroBench src/benchmark/micro_benchmarks.cpp) -add_executable(MAXBench src/benchmark/MAX_benchmark.cpp - src/benchmark/QDP_minimal.h) -target_link_libraries(MAXBench libpcm.so) -add_executable(LatencyBench src/benchmark/latency.cpp) - +add_executable(MAXBench src/benchmark/MAX_benchmark.cpp) \ No newline at end of file diff --git a/qdp_project/bench_all_dimes.sh b/qdp_project/bench_all_dimes.sh deleted file mode 100644 index 9c05e62..0000000 --- a/qdp_project/bench_all_dimes.sh +++ 
/dev/null @@ -1,10 +0,0 @@ -#!bin/bash - -../bin/DIMESBench_gus -../bin/DIMESBench_guc -../bin/DIMESBench_gls -../bin/DIMESBench_glc -../bin/DIMESBench_lus -../bin/DIMESBench_luc -../bin/DIMESBench_lls -../bin/DIMESBench_llc \ No newline at end of file diff --git a/qdp_project/bench_max.sh b/qdp_project/bench_max.sh index fb08bd8..b7e0168 100644 --- a/qdp_project/bench_max.sh +++ b/qdp_project/bench_max.sh @@ -3,13 +3,8 @@ current_date_time=$(date) echo "Benchmark start at: $current_date_time" -../bin/MAXBench_gcc -cp ../results/max_q-complex_bm-global_bl-unlimited_tc-121MiB-2MiB.csv ../results/max_q-complex_bm-global_bl-unlimited_tc-121MiB-2MiB_pin_c_HBM.csv - -../bin/MAXBench_gcn - -cp ../results/max_q-complex_bm-global_bl-unlimited_tc-121MiB-2MiB.csv ../results/max_q-complex_bm-global_bl-unlimited_tc-121MiB-2MiB_pin_n_HBM.csv +../bin/MAXBench current_date_time=$(date) echo "Benchmark end at: $current_date_time" \ No newline at end of file diff --git a/qdp_project/cmake_all_dimes.sh b/qdp_project/cmake_all_dimes.sh deleted file mode 100644 index 9ce3a96..0000000 --- a/qdp_project/cmake_all_dimes.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!bin/bash - -cmake -DCMAKE_BUILD_TYPE=Release -DWSUPPRESS=suppress -DBARRIER_MODE=global -DBUFFER_LIMIT=unlimited -DQUERY=simple .. -cmake --build . --target DIMESBench -mv ../bin/DIMESBench ../bin/DIMESBench_gus - -cmake -DCMAKE_BUILD_TYPE=Release -DWSUPPRESS=suppress -DBARRIER_MODE=global -DBUFFER_LIMIT=unlimited -DQUERY=complex .. -cmake --build . --target DIMESBench -mv ../bin/DIMESBench ../bin/DIMESBench_guc - -cmake -DCMAKE_BUILD_TYPE=Release -DWSUPPRESS=suppress -DBARRIER_MODE=global -DBUFFER_LIMIT=limited -DQUERY=simple .. -cmake --build . --target DIMESBench -mv ../bin/DIMESBench ../bin/DIMESBench_gls - -cmake -DCMAKE_BUILD_TYPE=Release -DWSUPPRESS=suppress -DBARRIER_MODE=global -DBUFFER_LIMIT=limited -DQUERY=complex .. -cmake --build . --target DIMESBench -mv ../bin/DIMESBench ../bin/DIMESBench_glc - -cmake -DCMAKE_BUILD_TYPE=Release -DWSUPPRESS=suppress -DBARRIER_MODE=local -DBUFFER_LIMIT=unlimited -DQUERY=simple .. -cmake --build . --target DIMESBench -mv ../bin/DIMESBench ../bin/DIMESBench_lus - -cmake -DCMAKE_BUILD_TYPE=Release -DWSUPPRESS=suppress -DBARRIER_MODE=local -DBUFFER_LIMIT=unlimited -DQUERY=complex .. -cmake --build . --target DIMESBench -mv ../bin/DIMESBench ../bin/DIMESBench_luc - -cmake -DCMAKE_BUILD_TYPE=Release -DWSUPPRESS=suppress -DBARRIER_MODE=local -DBUFFER_LIMIT=limited -DQUERY=simple .. -cmake --build . --target DIMESBench -mv ../bin/DIMESBench ../bin/DIMESBench_lls - -cmake -DCMAKE_BUILD_TYPE=Release -DWSUPPRESS=suppress -DBARRIER_MODE=local -DBUFFER_LIMIT=limited -DQUERY=complex .. -cmake --build . 
--target DIMESBench -mv ../bin/DIMESBench ../bin/DIMESBench_llc \ No newline at end of file diff --git a/qdp_project/src/benchmark/DIMES_benchmark.cpp b/qdp_project/src/benchmark/DIMES_benchmark.cpp deleted file mode 100644 index 2ca9705..0000000 --- a/qdp_project/src/benchmark/DIMES_benchmark.cpp +++ /dev/null @@ -1,240 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#ifndef THREAD_GROUP_MULTIPLIER -#define THREAD_GROUP_MULTIPLIER 8 -#endif - -#ifndef QUERY -#define QUERY 1 -#endif - -#ifndef BARRIER_MODE -#define BARRIER_MODE "global" -#endif - -#ifndef BUFFER_LIMIT -#define BUFFER_LIMIT 1 -#endif - -#include "const.h" - -#include "file_output.h" -#include "array_utils.h" -#include "timer_utils.h" -#include "barrier_utils.h" -#include "cpu_set_utils.h" -#include "iterable_range.h" -#include "memory_literals.h" -#include "pipelines/DIMES_scan_filter_pipe.h" - -#include "aggregation.h" -#include "filter.h" - -using base_t = uint64_t; - -base_t sum_check(base_t compare_value, base_t* row_A, base_t* row_B, size_t row_size) { - base_t sum = 0; - for(int i = 0; i < row_size / sizeof(base_t); ++i) { - sum += (row_A[i] < compare_value) * row_B[i]; - } - return sum; -} - -base_t sum_check_complex(base_t compare_value_a, base_t compare_value_b, base_t* row_A, base_t* row_B, size_t row_size) { - base_t sum = 0; - for(int i = 0; i < row_size / sizeof(base_t); ++i) { - sum += (row_A[i] < compare_value_a && row_B[i] < compare_value_b) * row_B[i]; - } - return sum; -} - -int main(int argc, char** argv) { - // set constants - const size_t workload_b = 4_GiB; - const base_t compare_value_a = 50; - const base_t compare_value_b = 42; - constexpr bool simple_query = (QUERY == 1); - - const size_t thread_count = 6; - std::ofstream out_file; - out_file.open("../results/dimes_" - "q-" + (std::string)(simple_query == true ? "simple" : "complex") + - "_bm-" + (std::string) BARRIER_MODE + - "_bl-" + (std::string)(BUFFER_LIMIT == 1 ? "limited" : "unlimited") + - "_tc-" + std::to_string(thread_count * THREAD_GROUP_MULTIPLIER) + ".csv"); - - // set benchmark parameter - Linear_Int_Range run("run"); - Exp_Int_Range chunk_size("chunk_size"); - Range mode("mode"); - - uint32_t remote_node = 3; - uint32_t remote_node_2 = 2; - uint32_t local_node = 10; - - print_to_file(out_file, generateHead(run, chunk_size, mode), "thread_group", "time", - #ifdef THREAD_TIMINGS - "scan_a", "scan_b", "aggr_j", - #endif - #ifdef BARRIER_TIMINGS - "wait_scan_a", "wait_scan_b", "wait_aggr_j", - #endif - "result"); - - - /*** alloc data and buffers ************************************************/ - base_t* data_a = (base_t*) numa_alloc_onnode(workload_b, remote_node); - base_t* data_b = (base_t*) numa_alloc_onnode(workload_b, remote_node_2); - base_t* data_a_hbm = (base_t*) numa_alloc_onnode(workload_b, local_node); - base_t* data_b_hbm = (base_t*) numa_alloc_onnode(workload_b, local_node); - fill_mt(data_a, workload_b, 0, 100, 42); - fill_mt(data_b, workload_b, 0, 100, 420); - std::memcpy(data_a_hbm, data_a, workload_b); - std::memcpy(data_b_hbm, data_b, workload_b); - base_t* results = (base_t*) numa_alloc_onnode(THREAD_GROUP_MULTIPLIER * thread_count * sizeof(base_t), remote_node); - - std::ofstream check_file; - check_file.open("../results/dimes_" - "q-" + (std::string)(simple_query == true ? "simple" : "complex") + - "_bm-" + (std::string) BARRIER_MODE + - "_bl-" + (std::string)(BUFFER_LIMIT == 1 ? 
"limited" : "unlimited") + - "_tc-" + std::to_string(thread_count * THREAD_GROUP_MULTIPLIER) + ".checksum"); - if constexpr (QUERY == 1) { - //calculate simple checksum if QUERY == 1 -> simple query is applied - check_file << sum_check(compare_value_a, data_a, data_b, workload_b); - } else { - check_file << sum_check_complex(compare_value_a, compare_value_b, data_a, data_b, workload_b); - } - check_file.close(); - - std::string iteration("init"); - Query_Wrapper* qw = nullptr; - while(iteration != "false") { - - std::promise p; - std::shared_future ready_future(p.get_future()); - - if(iteration != "run") { - - if(qw != nullptr) { - delete qw; - } - - std::cout << "Changing to mode " << mode.current << " chunksize " << chunk_size.current << std::endl; - - uint8_t tc_filter = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, SCAN_A); - uint8_t tc_copy = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, SCAN_B); - uint8_t tc_agg = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, AGGR_J); - switch(mode.current) { - case NewPMode::DRAM_base: - qw = new Query_Wrapper(&ready_future, workload_b, chunk_size.current, data_a, data_b, results, local_node, remote_node, - tc_filter, tc_copy, tc_agg, mode.current, THREAD_GROUP_MULTIPLIER, (base_t) 50, (base_t) 42, true); - break; - case NewPMode::HBM_base: - qw = new Query_Wrapper(&ready_future, workload_b, chunk_size.current, data_a_hbm, data_b_hbm, results, local_node, remote_node, - tc_filter, tc_copy, tc_agg, mode.current, THREAD_GROUP_MULTIPLIER, (base_t) 50, (base_t) 42, true); - break; - case NewPMode::Mixed_base: - qw = new Query_Wrapper(&ready_future, workload_b, chunk_size.current, data_a, data_b_hbm, results, local_node, remote_node, - tc_filter, tc_copy, tc_agg, mode.current, THREAD_GROUP_MULTIPLIER, (base_t) 50, (base_t) 42, true); - break; - case NewPMode::Prefetch: - qw = new Query_Wrapper(&ready_future, workload_b, chunk_size.current, data_a, data_b, results, local_node, remote_node, - tc_filter, tc_copy, tc_agg, mode.current, THREAD_GROUP_MULTIPLIER, (base_t) 50, (base_t) 42, false); - break; - } - } - - qw->ready_future = &ready_future; - qw->clear_buffers(); - - auto filter_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_a(gid, gcnt, tid); }; - auto copy_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_b(gid, gcnt, tid); }; - auto aggregation_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->aggr_j(gid, gcnt, tid); }; - - std::vector filter_pool; - std::vector copy_pool; - std::vector agg_pool; - - uint8_t tc_filter = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, SCAN_A); - uint8_t tc_copy = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, SCAN_B); - uint8_t tc_agg = new_mode_manager::thread_count(simple_query ? 
SIMPLE_Q : COMPLEX_Q, mode.current, AGGR_J); - - int thread_id = 0; - // std::vector> pinning_ranges {std::make_pair(28, 42), std::make_pair(84, 98)}; // node 2 heacboehm II - //std::vector> pinning_ranges {std::make_pair(32, 48), std::make_pair(96, 112)}; // node 2 heacboehm - //std::vector> pinning_ranges {std::make_pair(24, 36), std::make_pair(120, 132)}; // node 2 sapphire rapids - //std::vector> pinning_ranges {std::make_pair(24, 48)}; // node 2+3 sapphire rapids - std::vector> pinning_ranges {std::make_pair(0, 48)}; // node 0-3 sapphire rapids - - for(uint32_t gid = 0; gid < THREAD_GROUP_MULTIPLIER; ++gid) { - - for(uint32_t tid = 0; tid < tc_filter; ++tid) { - filter_pool.emplace_back(filter_lambda, gid, THREAD_GROUP_MULTIPLIER, tid); - pin_thread_in_range(filter_pool.back(), thread_id++, pinning_ranges); - } - - // if tc_copy == 0 this loop is skipped - for(uint32_t tid = 0; tid < tc_copy; ++tid) { - copy_pool.emplace_back(copy_lambda, gid, THREAD_GROUP_MULTIPLIER, tid); - pin_thread_in_range(copy_pool.back(), thread_id++, pinning_ranges); - } - - for(uint32_t tid = 0; tid < tc_agg; ++tid) { - agg_pool.emplace_back(aggregation_lambda, gid, THREAD_GROUP_MULTIPLIER, tid); - pin_thread_in_range(agg_pool.back(), thread_id++, pinning_ranges); - } - } - - auto start = std::chrono::steady_clock::now(); - p.set_value(); - - for(std::thread& t : filter_pool) { t.join(); } - for(std::thread& t : copy_pool) { t.join(); } - for(std::thread& t : agg_pool) { t.join(); } - - Aggregation::apply(results, results, sizeof(base_t) * tc_agg * THREAD_GROUP_MULTIPLIER); - auto end = std::chrono::steady_clock::now(); - - constexpr double nanos_per_second = ((double)1000) * 1000 * 1000; - uint64_t nanos = std::chrono::duration_cast(end - start).count(); - double seconds = (double)(nanos) / nanos_per_second; - - - print_to_file(out_file, run, chunk_size, new_mode_manager::string(mode.current), THREAD_GROUP_MULTIPLIER, seconds, - #ifdef THREAD_TIMINGS - qw->trt->summarize_time(0), qw->trt->summarize_time(1), qw->trt->summarize_time(2), - #endif - #ifdef BARRIER_TIMINGS - qw->bt->summarize_time(0), qw->bt->summarize_time(1), qw->bt->summarize_time(2), - #endif - results[0]); - - - iteration = IterateOnce(run, chunk_size, mode); - } - - numa_free(data_b_hbm, workload_b); - numa_free(data_a, workload_b); - numa_free(data_b, workload_b); - - numa_free(results, THREAD_GROUP_MULTIPLIER * thread_count * sizeof(base_t)); - -} \ No newline at end of file diff --git a/qdp_project/src/benchmark/DIMES_cores_benchmark.cpp b/qdp_project/src/benchmark/DIMES_cores_benchmark.cpp deleted file mode 100644 index 93c6b1b..0000000 --- a/qdp_project/src/benchmark/DIMES_cores_benchmark.cpp +++ /dev/null @@ -1,260 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#ifndef QUERY -#define QUERY 1 -#endif - -#ifndef BARRIER_MODE -#define BARRIER_MODE "global" -#endif - -#define BUFFER_LIMIT 0 - -#include "const.h" - -#include "file_output.h" -#include "array_utils.h" -#include "timer_utils.h" -#include "barrier_utils.h" -#include "cpu_set_utils.h" -#include "iterable_range.h" -#include "memory_literals.h" -#include "pipelines/DIMES_scan_filter_pipe.h" - -#include "aggregation.h" -#include "filter.h" - -using base_t = uint64_t; - -base_t sum_check(base_t compare_value, base_t* row_A, base_t* row_B, size_t row_size) { - base_t sum = 0; - for(int i = 0; i < row_size / sizeof(base_t); ++i) { - sum += (row_A[i] 
< compare_value) * row_B[i]; - } - return sum; -} - -base_t sum_check_complex(base_t compare_value_a, base_t compare_value_b, base_t* row_A, base_t* row_B, size_t row_size) { - base_t sum = 0; - for(int i = 0; i < row_size / sizeof(base_t); ++i) { - sum += (row_A[i] < compare_value_a && row_B[i] < compare_value_b) * row_B[i]; - } - return sum; -} - - -int main(int argc, char** argv) { - // set constants - const size_t workload_b = 4_GiB; - const size_t chunk_size = 2_MiB; - const base_t compare_value_a = 50; - const base_t compare_value_b = 42; - constexpr bool simple_query = (QUERY == 1); - - - std::ofstream out_file; - out_file.open("../results/dimes_cores_" - "q-" + (std::string)(simple_query == true ? "simple" : "complex") + - "_bm-" + (std::string) BARRIER_MODE + - "_bl-" + (std::string)(BUFFER_LIMIT == 1 ? "limited" : "unlimited") + - ".csv"); - - // set benchmark parameter - Linear_Int_Range run("run"); - - Exp_Int_Range scan_a_thread("scan_a_tc"); - Exp_Int_Range scan_b_thread("scan_b_tc"); - Exp_Int_Range aggr_j_thread("aggr_j_tc"); - Linear_Int_Range thread_group_count("thread_group_c"); - Range mode("mode"); - - uint32_t remote_node = 1; - uint32_t remote_node_2 = 0;//on heacboehm II: node 0 is two hops away from node 2 -> prefetching is more beneficial - uint32_t local_node = 2; - - print_to_file(out_file, generateHead(run, thread_group_count, mode, scan_a_thread, scan_b_thread, aggr_j_thread), - "time", - #ifdef THREAD_TIMINGS - "scan_a", "scan_b", "aggr_j", - #endif - #ifdef BARRIER_TIMINGS - "wait_scan_a", "wait_scan_b", "wait_aggr_j", - #endif - "result"); - - - /*** alloc data and buffers ************************************************/ - base_t* data_a = (base_t*) numa_alloc_onnode(workload_b, remote_node); - base_t* data_b = (base_t*) numa_alloc_onnode(workload_b, remote_node_2); - base_t* data_a_hbm = (base_t*) numa_alloc_onnode(workload_b, local_node); - base_t* data_b_hbm = (base_t*) numa_alloc_onnode(workload_b, local_node); - fill_mt(data_a, workload_b, 0, 100, 42); - fill_mt(data_b, workload_b, 0, 100, 420); - std::memcpy(data_a_hbm, data_a, workload_b); - std::memcpy(data_b_hbm, data_b, workload_b); - base_t* results = (base_t*) numa_alloc_onnode(thread_group_count.max * aggr_j_thread.max * sizeof(base_t), remote_node); - - std::ofstream check_file; - check_file.open("../results/dimes_cores_" - "q-" + (std::string)(simple_query == true ? "simple" : "complex") + - "_bm-" + (std::string) BARRIER_MODE + - "_bl-" + (std::string)(BUFFER_LIMIT == 1 ? 
"limited" : "unlimited") + - ".checksum"); - if constexpr (QUERY == 1) { - //calculate simple checksum if QUERY == 1 -> simple query is applied - check_file << sum_check(compare_value_a, data_a, data_b, workload_b); - } else { - check_file << sum_check_complex(compare_value_a, compare_value_b, data_a, data_b, workload_b); - } - check_file.close(); - - std::string iteration("init"); - Query_Wrapper* qw = nullptr; - while(iteration != "false") { - - std::promise p; - std::shared_future ready_future(p.get_future()); - - // skipping iteration through scan_b_thread while not used - while(simple_query && mode.current != NewPMode::Prefetch && scan_b_thread.current != 1) { - iteration = IterateOnce(run, thread_group_count, mode, scan_a_thread, scan_b_thread, aggr_j_thread); - } - - if(iteration != "run") { - std::cout << "Changing to mode " << mode.current - << " thread_group_count " << thread_group_count.current - << " thread_ratio " << scan_a_thread.current <<":"<< scan_b_thread.current <<":"<< aggr_j_thread.current - << std::endl; - - if(qw != nullptr) { - if (iteration == thread_group_count.label) { - - } else { - delete qw; - - uint32_t sat = scan_a_thread.current; - uint32_t sbt = simple_query && mode.current != NewPMode::Prefetch ? 0 : scan_b_thread.current; - uint32_t ajt = aggr_j_thread.current; - - switch(mode.current) { - case NewPMode::DRAM_base: - qw = new Query_Wrapper(&ready_future, workload_b, chunk_size, data_a, data_b, results, local_node, remote_node, - sat, sbt, ajt, mode.current, thread_group_count.current, (base_t) 50, (base_t) 42, true); - break; - case NewPMode::HBM_base: - qw = new Query_Wrapper(&ready_future, workload_b, chunk_size, data_a_hbm, data_b_hbm, results, local_node, remote_node, - sat, sbt, ajt, mode.current, thread_group_count.current, (base_t) 50, (base_t) 42, true); - break; - case NewPMode::Mixed_base: - qw = new Query_Wrapper(&ready_future, workload_b, chunk_size, data_a, data_b_hbm, results, local_node, remote_node, - sat, sbt, ajt, mode.current, thread_group_count.current, (base_t) 50, (base_t) 42, true); - break; - case NewPMode::Prefetch: - qw = new Query_Wrapper(&ready_future, workload_b, chunk_size, data_a, data_b, results, local_node, remote_node, - sat, sbt, ajt, mode.current, thread_group_count.current, (base_t) 50, (base_t) 42, false); - break; - } - } - } - } - - qw->ready_future = &ready_future; - qw->clear_buffers(); - - auto filter_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_a(gid, gcnt, tid); }; - auto copy_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_b(gid, gcnt, tid); }; - auto aggregation_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->aggr_j(gid, gcnt, tid); }; - - std::vector filter_pool; - std::vector copy_pool; - std::vector agg_pool; - - uint8_t tc_filter = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, SCAN_A); - uint8_t tc_copy = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, SCAN_B); - uint8_t tc_agg = new_mode_manager::thread_count(simple_query ? 
SIMPLE_Q : COMPLEX_Q, mode.current, AGGR_J); - - int thread_id = 0; - // std::vector> pinning_ranges {std::make_pair(28, 42), std::make_pair(84, 98)}; // node 2 heacboehm II - std::vector> pinning_ranges {std::make_pair(32, 48), std::make_pair(96, 112)}; // node 2 heacboehm - - for(uint32_t gid = 0; gid < thread_group_count.current; ++gid) { - - for(uint32_t tid = 0; tid < tc_filter; ++tid) { - filter_pool.emplace_back(filter_lambda, gid, thread_group_count.current, tid); - pin_thread_in_range(filter_pool.back(), thread_id++, pinning_ranges); - } - - // if tc_copy == 0 this loop is skipped - for(uint32_t tid = 0; tid < tc_copy; ++tid) { - copy_pool.emplace_back(copy_lambda, gid, thread_group_count.current, tid); - pin_thread_in_range(copy_pool.back(), thread_id++, pinning_ranges); - } - - for(uint32_t tid = 0; tid < tc_agg; ++tid) { - agg_pool.emplace_back(aggregation_lambda, gid, thread_group_count.current, tid); - pin_thread_in_range(agg_pool.back(), thread_id++, pinning_ranges); - } - } - - auto start = std::chrono::steady_clock::now(); - p.set_value(); - - for(std::thread& t : filter_pool) { t.join(); } - for(std::thread& t : copy_pool) { t.join(); } - for(std::thread& t : agg_pool) { t.join(); } - - Aggregation::apply(results, results, sizeof(base_t) * tc_agg * thread_group_count.current); - auto end = std::chrono::steady_clock::now(); - - constexpr double nanos_per_second = ((double)1000) * 1000 * 1000; - uint64_t nanos = std::chrono::duration_cast(end - start).count(); - double seconds = (double)(nanos) / nanos_per_second; - -print_to_file(out_file, generateHead(run, thread_group_count, mode, scan_a_thread, scan_b_thread, aggr_j_thread), - "time", - #ifdef THREAD_TIMINGS - "scan_a", "scan_b", "aggr_j", - #endif - #ifdef BARRIER_TIMINGS - "wait_scan_a", "wait_scan_b", "wait_aggr_j", - #endif - "result"); - - print_to_file(out_file, run, thread_group_count.current, new_mode_manager::string(mode.current), scan_a_thread, - (simple_query && mode.current != NewPMode::Prefetch ? 
0 : scan_b_thread.current), - aggr_j_thread, seconds, - #ifdef THREAD_TIMINGS - qw->trt->summarize_time(0), qw->trt->summarize_time(1), qw->trt->summarize_time(2), - #endif - #ifdef BARRIER_TIMINGS - qw->bt->summarize_time(0), qw->bt->summarize_time(1), qw->bt->summarize_time(2), - #endif - results[0]); - - iteration = IterateOnce(run, thread_group_count, mode, scan_a_thread, scan_b_thread, aggr_j_thread); - } - - numa_free(data_b_hbm, workload_b); - numa_free(data_a, workload_b); - numa_free(data_b, workload_b); - - numa_free(results, thread_group_count.max * aggr_j_thread.max * sizeof(base_t)); - -} \ No newline at end of file diff --git a/qdp_project/src/benchmark/MAX_benchmark.cpp b/qdp_project/src/benchmark/MAX_benchmark.cpp index fb50f5a..0414e29 100644 --- a/qdp_project/src/benchmark/MAX_benchmark.cpp +++ b/qdp_project/src/benchmark/MAX_benchmark.cpp @@ -92,33 +92,36 @@ int main(int argc, char** argv) { #endif // set constants - const size_t workload_b = 2_GiB; - const base_t compare_value_a = 50; - const base_t compare_value_b = 42; + constexpr size_t workload_b = 2_GiB; + constexpr base_t compare_value_a = 50; + constexpr base_t compare_value_b = 42; constexpr bool simple_query = (QUERY == 1); + constexpr bool cache_a = false; + constexpr bool wait_b = false; + + constexpr size_t chunk_min = 1_MiB; + constexpr size_t chunk_max = 8_MiB + 1; + constexpr size_t chunk_incr = 128_kiB; + + // thread count is 12 here but as the default measurement uses 6 + // we must restrict the core assignment of these 12 threads to + // 6 physical cpu cores on the executing node + + constexpr size_t thread_count = 12; - const size_t thread_count = 6; std::ofstream out_file; + out_file.open("../results/max_" "q-" + (std::string)(simple_query == true ? "simple" : "complex") + "_bm-" + (std::string) BARRIER_MODE + "_bl-" + (std::string)(BUFFER_LIMIT == 1 ? 
"limited" : "unlimited") + - "_tc-" + std::to_string(thread_count * THREAD_GROUP_MULTIPLIER) + "1MiB-2MiB.csv"); + "_tc-" + std::to_string(thread_count) + "1MiB-2MiB.csv"); // set benchmark parameter Linear_Int_Range run("run"); - constexpr size_t chunk_min = 1_MiB; constexpr size_t chunk_max = 8_MiB + 1; constexpr size_t chunk_incr = 128_kiB; Linear_Int_Range chunk_size("chunk_size"); Range mode("mode"); - uint32_t remote_node = 2; - uint32_t remote_node_2 = 2; - uint32_t local_node = 10; - - /*uint32_t remote_node = 6; - uint32_t remote_node_2 = 6; - uint32_t local_node = 2;*/ - print_to_file(out_file, generateHead(run, chunk_size, mode), "thread_group", "time", #ifdef THREAD_TIMINGS "scan_a", "scan_b", "aggr_j", @@ -133,24 +136,22 @@ int main(int argc, char** argv) { #endif "result"); - /*** alloc data and buffers ************************************************/ - base_t* data_a = (base_t*) numa_alloc_onnode(workload_b, remote_node); - base_t* data_b = (base_t*) numa_alloc_onnode(workload_b, remote_node_2); - base_t* data_a_hbm = (base_t*) numa_alloc_onnode(workload_b, local_node); - base_t* data_b_hbm = (base_t*) numa_alloc_onnode(workload_b, local_node); + + base_t* data_a = (base_t*) numa_alloc_local(workload_b); + base_t* data_b = (base_t*) numa_alloc_local(workload_b); + base_t* results = (base_t*) numa_alloc_local(thread_count * sizeof(base_t)); + fill_mt(data_a, workload_b, 0, 100, 42); fill_mt(data_b, workload_b, 0, 100, 420); - std::memcpy(data_a_hbm, data_a, workload_b); - std::memcpy(data_b_hbm, data_b, workload_b); - base_t* results = (base_t*) numa_alloc_onnode(THREAD_GROUP_MULTIPLIER * thread_count * sizeof(base_t), remote_node); + std::ofstream check_file; check_file.open("../results/max_" "q-" + (std::string)(simple_query == true ? "simple" : "complex") + "_bm-" + (std::string) BARRIER_MODE + "_bl-" + (std::string)(BUFFER_LIMIT == 1 ? "limited" : "unlimited") + - "_tc-" + std::to_string(thread_count * THREAD_GROUP_MULTIPLIER) + ".checksum"); + "_tc-" + std::to_string(thread_count) + ".checksum"); if constexpr (QUERY == 1) { //calculate simple checksum if QUERY == 1 -> simple query is applied check_file << sum_check(compare_value_a, data_a, data_b, workload_b); @@ -160,37 +161,34 @@ int main(int argc, char** argv) { check_file.close(); std::string iteration("init"); - Query_Wrapper* qw = nullptr; + Query_Wrapper* qw = nullptr; + while(iteration != "false") { std::promise p; std::shared_future ready_future(p.get_future()); if(iteration != "run") { - if(qw != nullptr) { delete qw; } + uint8_t tc_filter = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, SCAN_A); uint8_t tc_copy = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, SCAN_B); uint8_t tc_agg = new_mode_manager::thread_count(simple_query ? 
SIMPLE_Q : COMPLEX_Q, mode.current, AGGR_J); + switch(mode.current) { - case NewPMode::DRAM_base: - qw = new Query_Wrapper(&ready_future, workload_b, chunk_size.current, data_a, data_b, results, local_node, remote_node, - tc_filter, tc_copy, tc_agg, mode.current, THREAD_GROUP_MULTIPLIER, (base_t) 50, (base_t) 42, true); - break; - case NewPMode::HBM_base: - qw = new Query_Wrapper(&ready_future, workload_b, chunk_size.current, data_a_hbm, data_b_hbm, results, local_node, remote_node, - tc_filter, tc_copy, tc_agg, mode.current, THREAD_GROUP_MULTIPLIER, (base_t) 50, (base_t) 42, true); - break; - case NewPMode::Mixed_base: - qw = new Query_Wrapper(&ready_future, workload_b, chunk_size.current, data_a, data_b_hbm, results, local_node, remote_node, - tc_filter, tc_copy, tc_agg, mode.current, THREAD_GROUP_MULTIPLIER, (base_t) 50, (base_t) 42, true); - break; - case NewPMode::Prefetch: - qw = new Query_Wrapper(&ready_future, workload_b, chunk_size.current, data_a, data_b, results, local_node, remote_node, - tc_filter, tc_copy, tc_agg, mode.current, THREAD_GROUP_MULTIPLIER, (base_t) 50, (base_t) 42, false); + case NewPMode::Prefetch: + qw = new Query_Wrapper( + &ready_future, workload_b, chunk_size.current, + data_a, data_b, results, tc_filter, tc_copy, tc_agg, + mode.current, 50, 42 + ); + break; + default: + std::cerr << "[x] Unsupported Execution Mode by this build." << std::endl; + exit(-1); } } @@ -280,10 +278,7 @@ int main(int argc, char** argv) { iteration = IterateOnce(run, chunk_size, mode); } - numa_free(data_b_hbm, workload_b); numa_free(data_a, workload_b); numa_free(data_b, workload_b); - numa_free(results, THREAD_GROUP_MULTIPLIER * thread_count * sizeof(base_t)); - } \ No newline at end of file diff --git a/qdp_project/src/benchmark/QDP_minimal.h b/qdp_project/src/benchmark/QDP_minimal.h deleted file mode 100644 index 007d0d9..0000000 --- a/qdp_project/src/benchmark/QDP_minimal.h +++ /dev/null @@ -1,147 +0,0 @@ -#include -#include -#include -#include -#include - -#include "const.h" -#include "array_utils.h" -#include "cpu_set_utils.h" -#include "iterable_range.h" -#include "memory_literals.h" -#include "pipelines/MAX_scan_filter_pipe.h" -#include "aggregation.h" - -using base_t = uint64_t; - -// calculate the checksum for the simple query -base_t sum_check(base_t compare_value, base_t* row_A, base_t* row_B, size_t row_size) { - base_t sum = 0; - for(int i = 0; i < row_size / sizeof(base_t); ++i) { - sum += (row_A[i] < compare_value) * row_B[i]; - } - return sum; -} - -// calculate the checksum for the complex query -base_t sum_check_complex(base_t compare_value_a, base_t compare_value_b, base_t* row_A, base_t* row_B, size_t row_size) { - base_t sum = 0; - for(int i = 0; i < row_size / sizeof(base_t); ++i) { - sum += (row_A[i] < compare_value_a && row_B[i] < compare_value_b) * row_B[i]; - } - return sum; -} - -class QDP_minimal { -private: - // values used for comparisons in the filter operations - const base_t compare_value_a = 50; - const base_t compare_value_b = 42; - // define, which numa nodes to use - // Xeon Max: node 0-7 DRAM and 8-15 HBM - // if the nodes are changed, the pinning ranges in run should be adjusted accordingly too - uint32_t dram_node = 2; - uint32_t dram_node_2 = 2; - uint32_t hbm_node = 10; - -public: - // results of running qdp, set by run() - base_t result; - base_t checksum; - double exec_time; - - // run qdp - void run(const size_t workload_b, size_t chunk_size, uint8_t tc_filter, uint8_t tc_copy, uint8_t tc_agg){ - // allocate data - base_t* data_a = 
(base_t*) numa_alloc_onnode(workload_b, dram_node); - base_t* data_b = (base_t*) numa_alloc_onnode(workload_b, dram_node_2); - base_t* results = (base_t*) numa_alloc_onnode(THREAD_GROUP_MULTIPLIER * tc_agg * sizeof(base_t), dram_node); - - // fill the memory with acutal values - fill_mt(data_a, workload_b, 0, 100, 42); - fill_mt(data_b, workload_b, 0, 100, 420); - - // run qdp - run(data_a, data_b, results, workload_b, chunk_size, tc_filter, tc_copy, tc_agg); - - // free the allocated memory - numa_free(data_a, workload_b); - numa_free(data_b, workload_b); - numa_free(results, THREAD_GROUP_MULTIPLIER * tc_agg * sizeof(base_t)); - } - - // run qdp, work on provided memory pointers to enable memory reuse across multiple runs - void run(base_t* data_a, base_t* data_b, base_t* results, const size_t workload_b, size_t chunk_size, uint8_t tc_filter, uint8_t tc_copy, uint8_t tc_agg){ - constexpr bool simple_query = (QUERY == 1); - // sync objects - std::promise p; - std::shared_future ready_future(p.get_future()); - - // create the query wrapper, that is managing the to-be-used threads - Query_Wrapper* qw = new Query_Wrapper(&ready_future, workload_b, chunk_size, data_a, data_b, results, hbm_node, dram_node, - tc_filter, tc_copy, tc_agg, NewPMode::Prefetch, THREAD_GROUP_MULTIPLIER, compare_value_a, compare_value_b, false); - - // clear buffers to make sure, that they have been written and are fully mapped before running qdp - qw->clear_buffers(); - - // creating lambdas for executing filter (scan_a), copy (scan_b), and aggregation tasks on the query wrapper - // passing gid (group id), gcnt (group count) and tid (thread id) - auto filter_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_a(gid, gcnt, tid); }; - auto copy_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_b(gid, gcnt, tid); }; - auto aggregation_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->aggr_j(gid, gcnt, tid); }; - - // creating thread pools, holding all used threads - std::vector filter_pool; - std::vector copy_pool; - std::vector agg_pool; - - int thread_id = 0; - // cpus on node 2 (for sapphire rapids), that the threads should be executed on - std::vector> pinning_ranges {std::make_pair(24, 36), std::make_pair(120, 132)}; - - // create all threads for all thread groups and for every task (copy, filter, aggregation), according their specific theadcount - for(uint32_t gid = 0; gid < THREAD_GROUP_MULTIPLIER; ++gid) { - for(uint32_t tid = 0; tid < tc_filter; ++tid) { - filter_pool.emplace_back(filter_lambda, gid, THREAD_GROUP_MULTIPLIER, tid); - pin_thread_in_range(filter_pool.back(), thread_id++, pinning_ranges); - } - for(uint32_t tid = 0; tid < tc_copy; ++tid) { - copy_pool.emplace_back(copy_lambda, gid, THREAD_GROUP_MULTIPLIER, tid); - pin_thread_in_range(copy_pool.back(), thread_id++, pinning_ranges); - } - for(uint32_t tid = 0; tid < tc_agg; ++tid) { - agg_pool.emplace_back(aggregation_lambda, gid, THREAD_GROUP_MULTIPLIER, tid); - pin_thread_in_range(agg_pool.back(), thread_id++, pinning_ranges); - } - } - - // start the clock - auto start = std::chrono::steady_clock::now(); - // set value to the promise, to signal the waiting threads, that they can start now - p.set_value(); - - // wait for all thread to be finished - for(std::thread& t : filter_pool) { t.join(); } - for(std::thread& t : copy_pool) { t.join(); } - for(std::thread& t : agg_pool) { t.join(); } - - // sum up the results of all the aggregation threads to get a final result - Aggregation::apply(&result, 
results, sizeof(base_t) * tc_agg * THREAD_GROUP_MULTIPLIER); - auto end = std::chrono::steady_clock::now(); - - // get the overall execution time in seconds - constexpr double nanos_per_second = ((double)1000) * 1000 * 1000; - uint64_t nanos = std::chrono::duration_cast(end - start).count(); - exec_time = (double)(nanos) / nanos_per_second; - - // calculate the checksum according to the used query - if constexpr (QUERY == 1) { - // QUERY == 1 -> simple query is applied - checksum = sum_check(compare_value_a, data_a, data_b, workload_b); - } else { - checksum = sum_check_complex(compare_value_a, compare_value_b, data_a, data_b, workload_b); - } - - delete qw; - } -}; diff --git a/qdp_project/src/benchmark/doubly_filtered_agg.cpp b/qdp_project/src/benchmark/doubly_filtered_agg.cpp deleted file mode 100644 index eaee93d..0000000 --- a/qdp_project/src/benchmark/doubly_filtered_agg.cpp +++ /dev/null @@ -1,149 +0,0 @@ - -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "aggregation.h" -#include "array_utils.h" -#include "cpu_set_utils.h" -#include "file_output.h" -#include "iterable_range.h" -#include "memory_literals.h" -#include "pipelines/scan_filter_pipe.h" - -int main () { - - using base_t = uint64_t; - - - const size_t workload = 2_GiB; - const char filename[256] = "../results/doubly_filtered_results_stronger_affinity_.csv"; - const uint32_t numa_local = 2; - const uint32_t numa_remote = 3; - - - Linear_Int_Range thread_group("thread_groups"); - Exp_Int_Range thread_count_filter("thread_cnt_filter"); - Exp_Int_Range thread_count_filter_copy("thread_cnt_filter_copy"); - Exp_Int_Range thread_count_aggregation("thread_cnt_agg"); - Linear_Int_Range run("run"); - Range mode("mode"); - Exp_Int_Range chunk_size("chunk_size"); - - std::ofstream out_file; - out_file.open(filename); - print_to_file(out_file, generateHead(run, chunk_size, mode, thread_count_filter, thread_count_filter_copy, - thread_count_aggregation, thread_group), "time", "scan_a", "scan_b", "aggr_j", "wait_aggr", "results"); - - base_t* data_a = (base_t*) numa_alloc_onnode(workload, numa_remote); - base_t* data_b = (base_t*) numa_alloc_onnode(workload, numa_remote); - base_t* data_b_hbm = (base_t*) numa_alloc_onnode(workload, numa_local); - fill_mt(data_a, workload, 0, 100, 42); - fill_mt(data_b, workload, 0, 100, 420); - std::memcpy(data_b_hbm, data_b, workload); - base_t* result = (base_t*) numa_alloc_onnode(thread_group.max * thread_count_aggregation.max * sizeof(base_t), - numa_remote); - - std::string iteration("init"); - Query_Wrapper* qw = nullptr; - - while(iteration != "false") { - - std::promise p; - std::shared_future ready_future(p.get_future()); - - if(iteration != "run") { - if(qw != nullptr) { - delete qw; - } - - switch(mode.current) { - case PMode::expl_copy: - qw = new Query_Wrapper(&ready_future, workload, chunk_size.current, data_a, data_b, result, numa_local, numa_remote, - thread_count_filter.current, thread_count_filter_copy.current, thread_count_aggregation.current, - mode.current, thread_group.current, (base_t) 50, (base_t) 42, false); - break; - case PMode::no_copy: - qw = new Query_Wrapper(&ready_future, workload, chunk_size.current, data_a, data_b, result, numa_local, numa_remote, - thread_count_filter.current, thread_count_filter_copy.current, thread_count_aggregation.current, - mode.current, thread_group.current, (base_t) 50, (base_t) 42, true); - break; - case PMode::hbm: - qw = new Query_Wrapper(&ready_future, workload, chunk_size.current, data_a, 
data_b_hbm, result, numa_local, numa_remote, - thread_count_filter.current, thread_count_filter_copy.current, thread_count_aggregation.current, - mode.current, thread_group.current, (base_t) 50, (base_t) 42, true); - break; - } - } - qw->ready_future = &ready_future; - qw->clear_buffers(); - - - // todo create threads depending on mode - std::vector thread_pool; - auto filter_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_a(gid, gcnt, tid); }; - auto filter_copy_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_b(gid, gcnt, tid); }; - auto aggregation_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->aggr_j(gid, gcnt, tid); }; - - - /* Intel Xeon Gold 6130 // todo implement different for 5120 -> fewer cpus - node 0 cpus: 0-15 64- 79 - node 1 cpus: 16-31 80- 95 - node 2 cpus: 32-47 96-111 - node 3 cpus: 48-63 112-127 - */ - int thread_id = 0; - std::vector> range {std::make_pair(0, 16), std::make_pair(64, 80)}; - for(uint32_t gid = 0; gid < thread_group.current; ++gid) { - - - for(uint32_t tid = 0; tid < thread_count_filter.current; ++tid) { - thread_pool.emplace_back(filter_lambda, gid, thread_group.current, tid); - pin_thread_in_range(thread_pool.back(), thread_id++, range); - } - - for(uint32_t tid = 0; tid < thread_count_filter_copy.current; ++tid) { - thread_pool.emplace_back(filter_copy_lambda, gid, thread_group.current, tid); - pin_thread_in_range(thread_pool.back(), thread_id++, range); - } - - for(uint32_t tid = 0; tid < thread_count_aggregation.current; ++tid) { - thread_pool.emplace_back(aggregation_lambda, gid, thread_group.current, tid); - pin_thread_in_range(thread_pool.back(), thread_id++, range); - } - } - - auto start = std::chrono::steady_clock::now(); - p.set_value(); - - // wait for every thread to join - for(std::thread& t : thread_pool) t.join(); - // aggregate all partial results - Aggregation::apply(result, result, - sizeof(base_t) * thread_count_aggregation.current * thread_group.current); - - auto end = std::chrono::steady_clock::now(); - - double duration = std::chrono::duration_cast(end-start).count() / (double)1000000000; - - - //TODO add mode - print_to_file(out_file, run, chunk_size, mode_manager::string(mode.current), thread_count_filter, - thread_count_filter_copy, thread_count_aggregation, thread_group, duration, - qw->trt->summarize_time(0), qw->trt->summarize_time(1), - qw->trt->summarize_time(2), qw->trt->summarize_time(3), *result); - iteration = IterateOnce(run, chunk_size, mode, thread_count_filter, thread_count_filter_copy, thread_count_aggregation, thread_group); - } - - auto end = std::chrono::system_clock::now(); - std::time_t end_time = std::chrono::system_clock::to_time_t(end); - std::cout << "finished computation at " << std::ctime(&end_time) << std::endl; - - print_to_file(out_file, std::ctime(&end_time)); -} \ No newline at end of file diff --git a/qdp_project/src/benchmark/filter_aggregate_pipeline.cpp b/qdp_project/src/benchmark/filter_aggregate_pipeline.cpp deleted file mode 100644 index b4a6753..0000000 --- a/qdp_project/src/benchmark/filter_aggregate_pipeline.cpp +++ /dev/null @@ -1,184 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "const.h" - -#include "file_output.h" -#include "array_utils.h" -#include "timer_utils.h" -#include "barrier_utils.h" -#include "cpu_set_utils.h" -#include "iterable_range.h" -#include "memory_literals.h" 
-#include "pipelines/scan_filter_pipe.h" - -#include "aggregation.h" -#include "filter.h" - -using base_t = uint64_t; - -base_t sum_check(base_t compare_value, base_t* row_A, base_t* row_B, size_t row_size) { - base_t sum = 0; - for(int i = 0; i < row_size / sizeof(base_t); ++i) { - sum += (row_A[i] < compare_value) * row_B[i]; - } - return sum; -} - - -int main(int argc, char** argv) { - size_t workload_b = 2_GiB; - std::ofstream out_file; - out_file.open("filter_aggreagate_pipe_bm_" + (std::string) BARRIER_MODE + ".csv"); - - Linear_Int_Range thread_group("thread_groups"); - Linear_Int_Range run("run"); - Exp_Int_Range chunk_size("chunk_size"); - Linear_Int_Range thread_count_filter("thread_cnt_filter"); - Linear_Int_Range thread_count_copy("thread_cnt_copy"); - Linear_Int_Range thread_count_aggregation("thread_cnt_agg"); - Range mode("mode"); - - uint32_t remote_node = 2; - uint32_t remote_node_2 = 2; - uint32_t local_node = 10; - - print_to_file(out_file, generateHead(run, chunk_size, mode, thread_count_filter, thread_count_copy, - thread_count_aggregation, thread_group), "time", - #ifdef THREAD_TIMINGS - "scan_a", "scan_b", "aggr_j", - #endif - #ifdef BARRIER_TIMINGS - "wait_scan_a", "wait_scan_b", "wait_aggr_j", - #endif - "result"); - - - /*** alloc data and buffers ************************************************/ - base_t* data_a = (base_t*) numa_alloc_onnode(workload_b, remote_node); - base_t* data_b = (base_t*) numa_alloc_onnode(workload_b, remote_node_2); - base_t* data_b_hbm = (base_t *) numa_alloc_onnode(workload_b, local_node); - fill_mt(data_a, workload_b, 0, 100, 42); - fill_mt(data_b, workload_b, 0, 100, 420); - std::memcpy(data_b_hbm, data_b, workload_b); - base_t* results = (base_t*) numa_alloc_onnode(thread_group.max * thread_count_aggregation.max * sizeof(base_t), remote_node); - - std::string iteration("init"); - const bool simple_query = true; - Query_Wrapper* qw = nullptr; - while(iteration != "false") { - base_t compare_value = 50; - std::promise p; - std::shared_future ready_future(p.get_future()); - - if(iteration != "run") { - - if(qw != nullptr) { - delete qw; - } - - std::cout << "Changing to mode " << mode.current << " chunksize " << chunk_size.current << " thread_group " << thread_group.current << std::endl; - switch(mode.current) { - case PMode::expl_copy: - qw = new Query_Wrapper(&ready_future, workload_b, chunk_size.current, data_a, data_b, results, local_node, remote_node, - thread_count_filter.current, thread_count_copy.current, thread_count_aggregation.current, mode.current, thread_group.current, (base_t) 50, (base_t) 42, false); - break; - case PMode::no_copy: - qw = new Query_Wrapper(&ready_future, workload_b, chunk_size.current, data_a, data_b, results, local_node, remote_node, - thread_count_filter.current, thread_count_copy.current, thread_count_aggregation.current, mode.current, thread_group.current, (base_t) 50, (base_t) 42, true); - break; - case PMode::hbm: - qw = new Query_Wrapper(&ready_future, workload_b, chunk_size.current, data_a, data_b_hbm, results, local_node, remote_node, - thread_count_filter.current, thread_count_copy.current, thread_count_aggregation.current, mode.current, thread_group.current, (base_t) 50, (base_t) 42, true); - break; - } - } - - qw->ready_future = &ready_future; - qw->clear_buffers(); - - auto filter_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_a(gid, gcnt, tid); }; - auto copy_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_b(gid, gcnt, tid); }; - auto 
aggregation_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->aggr_j(gid, gcnt, tid); }; - - std::vector filter_pool; - std::vector copy_pool; - std::vector agg_pool; - - int thread_id = 0; - // std::vector> pinning_ranges {std::make_pair(28, 42), std::make_pair(84, 98)}; // node 2 heacboehm2 - std::vector> pinning_ranges {std::make_pair(32, 48), std::make_pair(96, 112)}; // node 2 heacboehm - - for(uint32_t gid = 0; gid < thread_group.current; ++gid) { - - for(uint32_t tid = 0; tid < thread_count_filter.current; ++tid) { - filter_pool.emplace_back(filter_lambda, gid, thread_group.current, tid); - pin_thread_in_range(filter_pool.back(), thread_id++, pinning_ranges); - } - - if(mode.current == PMode::expl_copy){ - for(uint32_t tid = 0; tid < thread_count_copy.current; ++tid) { - copy_pool.emplace_back(copy_lambda, gid, thread_group.current, tid); - pin_thread_in_range(copy_pool.back(), thread_id++, pinning_ranges); - } - } - - for(uint32_t tid = 0; tid < thread_count_aggregation.current; ++tid) { - agg_pool.emplace_back(aggregation_lambda, gid, thread_group.current, tid); - pin_thread_in_range(agg_pool.back(), thread_id++, pinning_ranges); - } - } - - auto start = std::chrono::steady_clock::now(); - p.set_value(); - - for(std::thread& t : filter_pool) { t.join(); } - for(std::thread& t : copy_pool) { t.join(); } - for(std::thread& t : agg_pool) { t.join(); } - - Aggregation::apply(results, results, sizeof(base_t) * thread_count_aggregation.current * thread_group.current); - auto end = std::chrono::steady_clock::now(); - - constexpr double nanos_per_second = ((double)1000) * 1000 * 1000; - uint64_t nanos = std::chrono::duration_cast(end - start).count(); - double seconds = (double)(nanos) / nanos_per_second; - - - - print_to_file(out_file, run, chunk_size, mode_manager::string(mode.current), thread_count_filter, - thread_count_copy, thread_count_aggregation, thread_group, seconds, - #ifdef THREAD_TIMINGS - qw->trt->summarize_time(0), qw->trt->summarize_time(1), qw->trt->summarize_time(2), - #endif - #ifdef BARRIER_TIMINGS - qw->bt->summarize_time(0), qw->bt->summarize_time(1), qw->bt->summarize_time(2), - #endif - results[0]); - - - iteration = IterateOnce(run, chunk_size, mode, thread_count_filter, thread_count_copy, thread_count_aggregation, thread_group); - - } - - numa_free(data_b_hbm, workload_b); - numa_free(data_a, workload_b); - numa_free(data_b, workload_b); - numa_free(results, thread_group.max * sizeof(base_t)); - -} \ No newline at end of file diff --git a/qdp_project/src/benchmark/latency.cpp b/qdp_project/src/benchmark/latency.cpp deleted file mode 100644 index 011066a..0000000 --- a/qdp_project/src/benchmark/latency.cpp +++ /dev/null @@ -1,188 +0,0 @@ -/* - * numa_memory_latency - * Copyright (c) 2017 UMEZAWA Takeshi - * This software is licensed under GNU GPL version 2 or later. 
- * - * This file has been modified - */ - -#include -#include -#include -#include -#include -#include -#include -#include "file_output.h" -#include -#include -#include -#include - -#ifndef VOLATILE -#define VOLATILE 0 -#endif - -#define cachelinesize 64 -union CACHELINE { - char cacheline[cachelinesize]; - #if VOLATILE - volatile CACHELINE* next; - #else - CACHELINE* next; - #endif /*VOLATILE*/ -}; - -#define REPT4(x) do { x; x; x; x; } while(0) -#define REPT16(x) do { REPT4(x); REPT4(x); REPT4(x); REPT4(x); } while(0); -#define REPT64(x) do { REPT16(x); REPT16(x); REPT16(x); REPT16(x); } while(0); -#define REPT256(x) do { REPT64(x); REPT64(x); REPT64(x); REPT64(x); } while(0); -#define REPT1024(x) do { REPT256(x); REPT256(x); REPT256(x); REPT256(x); } while(0); - -size_t bufsize = 1 * 1024 * 1024 * 1024; -size_t nloop = 128 * 1024; -std::vector offsets; - -#if VOLATILE - -volatile CACHELINE* walk(volatile CACHELINE* start) -{ - volatile CACHELINE* p = start; - for (size_t i = 0; i < nloop; ++i) { - REPT1024(p = p->next); - } - return p; -} - -#else - -CACHELINE* walk(CACHELINE* start, uint64_t* sum) -{ - CACHELINE* p = start; - for (size_t i = 0; i < nloop; ++i) { - REPT1024( - *sum += static_cast(p->cacheline[cachelinesize-1]); - p = p->next; - ); - } - return p; -} - -#endif /*VOLATILE*/ - -void bench(int tasknode, int memnode, std::ofstream* out_file) -{ - struct timespec ts_begin, ts_end, ts_elapsed; - - printf("bench(task=%d, mem=%d)\n", tasknode, memnode); - - if (numa_run_on_node(tasknode) != 0) { - printf("failed to run on node: %s\n", strerror(errno)); - return; - } - - CACHELINE* const buf = (CACHELINE*)numa_alloc_onnode(bufsize, memnode); - if (buf == NULL) { - printf("failed to allocate memory\n"); - return; - } - - for (size_t i = 0; i < offsets.size() - 1; ++i) { - // assuming that next-pointer never overwrites last Byte of the cacheline/union - buf[offsets[i]].cacheline[cachelinesize-1] = offsets[i] % 128; - buf[offsets[i]].next = buf + offsets[i+1]; - } - buf[offsets[offsets.size() - 1]].next = buf; - buf[offsets[offsets.size() - 1]].cacheline[cachelinesize-1] = offsets[offsets.size() - 1] % 128; - - uint64_t value = 0; - uint64_t* sum = &value; - - clock_gettime(CLOCK_MONOTONIC, &ts_begin); - - #if VOLATILE - walk(buf); - #else - walk(buf, sum); - #endif /*VOLATILE*/ - - clock_gettime(CLOCK_MONOTONIC, &ts_end); - - ts_elapsed.tv_nsec = ts_end.tv_nsec - ts_begin.tv_nsec; - ts_elapsed.tv_sec = ts_end.tv_sec - ts_begin.tv_sec; - if (ts_elapsed.tv_nsec < 0) { - --ts_elapsed.tv_sec; - ts_elapsed.tv_nsec += 1000*1000*1000; - } - double elapsed = ts_elapsed.tv_sec + 0.000000001 * ts_elapsed.tv_nsec; - printf("took %fsec. 
%fns/load\n", elapsed, elapsed/(1024*nloop)*(1000*1000*1000)); - print_to_file(*out_file, tasknode, memnode, elapsed/(1024*nloop)*(1000*1000*1000), *sum); - numa_free(buf, bufsize); -} - -struct RND { - std::mt19937 mt; - RND() : mt(time(NULL)) {} - std::mt19937::result_type operator()(std::mt19937::result_type n) { return mt() % n; } -} r; - -void usage(const char* prog) -{ - printf("usage: %s [-h] [bufsize] [nloop]\n", prog); -} - -int main(int argc, char* argv[]) -{ - int ch; - - while ((ch = getopt(argc, argv, "h")) != -1) { - switch (ch) { - case 'h': - default: - usage(argv[0]); - exit(1); - } - } - - argc -= optind; - argv += optind; - - if (argc > 1) { - // 1048576 KiB = 1 GiB - bufsize = atoi(argv[0]) * 1024; // in KiB - nloop = atoi(argv[1]) * 1024; - } - - offsets.resize(bufsize / cachelinesize); - - for (size_t i = 0; i < offsets.size(); ++i) - offsets[i] = i; - std::random_shuffle(offsets.begin() + 1, offsets.end(), r); - - uint64_t expected_checksum = 0; - #if VOLATILE == 0 - for (size_t i = 0; i < nloop * 1024; ++i) { - expected_checksum += offsets[i % offsets.size()] % 128; - } - #endif - - std::ofstream check_file; - check_file.open("../results/micro_bench/latency/micro_bench_latency_" + (std::string)(VOLATILE == 1 ? "volatile" : "sum") + ".checksum"); - check_file << expected_checksum; - check_file.close(); - - - printf("benchmark bufsize=%zuKiB, nloop=%zuKi\n", bufsize/1024, nloop/1024); - - std::ofstream out_file; - out_file.open("../results/micro_bench/latency/micro_bench_latency_"+ (std::string)(VOLATILE == 1 ? "volatile" : "sum") + ".csv"); - print_to_file(out_file, "tasknode", "memnode", "latency", "checksum"); - - for (int tasknode = 0; tasknode < 8; tasknode++) { - for (int memnode = 0; memnode < 16; memnode++) { - bench(tasknode, memnode, &out_file); - } - } - - return 0; -} \ No newline at end of file diff --git a/qdp_project/src/benchmark/micro_benchmarks.cpp b/qdp_project/src/benchmark/micro_benchmarks.cpp deleted file mode 100644 index 4e63f82..0000000 --- a/qdp_project/src/benchmark/micro_benchmarks.cpp +++ /dev/null @@ -1,271 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include "memory_literals.h" -#include "array_utils.h" -#include "file_output.h" -#include "aggregation.h" - - -using base_t = uint64_t; - -size_t thread_cnt_memcpy = 128; -size_t thread_cnt_read = 128; -size_t runs = 10; - - -base_t sum_up(base_t* data, size_t workload){ - base_t sum = 0; - for(int i = 0; i < workload/sizeof(base_t); i++){ - sum += data[i]; - } - return sum; -} - -int reverse_bits(int number, size_t bit_count) { - int result = 0; - for(int i = 0; i < bit_count; i++) { - result <<= 1; - result |= (number & 1); - number >>= 1; - } - return result; -} - - -double measure_memcpy_bw(base_t* src, base_t* dest, size_t workload, base_t* result){ - std::promise p; - std::shared_future ready_future(p.get_future()); - - auto thread_lambda = [&](base_t* source, base_t* destination, size_t count) { - ready_future.wait(); - memcpy(destination, source, count); - }; - - std::vector thread_pool; - size_t total_elements = workload / sizeof(base_t); - size_t elements_per_thread = total_elements / thread_cnt_memcpy; - size_t remainder = total_elements % thread_cnt_memcpy; - - for(size_t tid = 0; tid < thread_cnt_memcpy; tid++) { - size_t elements_to_process = elements_per_thread + (tid < remainder ? 
1 : 0); - size_t byte_offset = (elements_per_thread * tid + std::min(tid, remainder)) * sizeof(base_t); - - thread_pool.emplace_back(thread_lambda, src + byte_offset / sizeof(base_t), dest + byte_offset / sizeof(base_t), elements_to_process * sizeof(base_t)); - } - - auto start = std::chrono::steady_clock::now(); - p.set_value(); - for(std::thread& t : thread_pool) { t.join(); } - auto stop = std::chrono::steady_clock::now(); - - auto duration = std::chrono::duration_cast(stop - start); - double seconds = duration.count() / 1e9; - double throughput = (workload / seconds) / (1024 * 1024 * 1024); - *result = sum_up(dest, workload); - return throughput; -} - -double measure_read_bw(base_t* data, size_t workload, base_t* results){ - const size_t chunk_size = sizeof(__m512i); - const size_t num_chunks = (workload) / chunk_size; - __m512i* src = reinterpret_cast<__m512i*>(data); - std::promise p; - std::shared_future ready_future(p.get_future()); - size_t num_chunks_per_thread = num_chunks / thread_cnt_read; - size_t num_chunks_remainder = num_chunks % thread_cnt_read; - - auto thread_lambda = [&](__m512i* src, int tid, int num_chunks) { - __m512i accumulator = _mm512_setzero_si512(); - ready_future.wait(); - for (int i = 0; i < num_chunks; i++) { - __m512i chunk = _mm512_load_si512(&src[i]); - accumulator = _mm512_add_epi64(accumulator, chunk); - } - results[tid] = _mm512_reduce_add_epi64(accumulator); - }; - - std::vector thread_pool; - int offset; - for(int tid = 0; tid < thread_cnt_read; tid++){ - if(tid < num_chunks_remainder){ - offset = tid * (num_chunks_per_thread + 1); - thread_pool.emplace_back(thread_lambda, &src[offset], tid, (num_chunks_per_thread + 1)); - } else { - offset = tid*num_chunks_per_thread + num_chunks_remainder; - thread_pool.emplace_back(thread_lambda, &src[offset], tid, num_chunks_per_thread); - } - - } - - auto start = std::chrono::steady_clock::now(); - p.set_value(); - for(std::thread& t : thread_pool) { t.join(); } - auto stop = std::chrono::steady_clock::now(); - - Aggregation::apply(results, results, sizeof(base_t) * thread_cnt_read); - auto duration = std::chrono::duration_cast(stop - start); - double seconds = duration.count() / 1e9; - double throughput = (workload / seconds) / (1024 * 1024 * 1024); - return throughput; -} - -void exec_multiple_runs_memcpy(size_t workload, int exec_node, int src_node, int dest_node, std::ofstream* out_file, std::string iteration_type){ - base_t value; - base_t* result = &value; - base_t* src = (base_t*) numa_alloc_onnode(workload, src_node); - base_t* dest = (base_t*) numa_alloc_onnode(workload, dest_node); - fill_mt(src, workload, 0, 100, 42); - fill_mt(dest, workload, 0, 100, 12); - numa_run_on_node(exec_node); - - if(dest_node == 0 && src_node == 0){ - std::ofstream check_file; - check_file.open("../results/micro_bench/micro_bench_bw_memcpy_execnode_" + std::to_string(exec_node) - + "_threadcnt_" + std::to_string(thread_cnt_memcpy) + "_" + iteration_type + ".checksum"); - check_file << sum_up(src, workload); - check_file.close(); - } - - for(size_t run = 0; run < runs; run++){ - double bw = measure_memcpy_bw(src, dest, workload, result); - std::cout << "Copy throughput executed on node " << exec_node << " form node " << src_node << " to node " - << dest_node << ": " << bw << " GiB/s" << std::endl; - print_to_file(*out_file, run, src_node, dest_node, bw, *result); - std::memset(dest, 0x00, workload); - *result = 0; - } - numa_free(src, workload); - numa_free(dest, workload); -} - -void 
measure_all_memcpy_bw_for_chosen_execnode(int exec_node){ - std::ofstream out_file; - out_file.open("../results/micro_bench/micro_bench_bw_memcpy_execnode_" + std::to_string(exec_node) - + "_threadcnt_" + std::to_string(thread_cnt_memcpy) + ".csv"); - print_to_file(out_file, "run", "src_node", "dest_node", "bw", "result"); - const size_t workload = 4_GiB; - - for(int src_node = 0; src_node < 16; src_node++){ - for(int dest_node = 0; dest_node < 16; dest_node++){ - exec_multiple_runs_memcpy(workload, exec_node, src_node, dest_node, &out_file, ""); - } - } - out_file.close(); -} - -void measure_all_memcpy_bw_for_chosen_execnode_reversed(int exec_node){ - std::ofstream out_file; - out_file.open("../results/micro_bench/micro_bench_bw_memcpy_execnode_" + std::to_string(exec_node) - + "_threadcnt_" + std::to_string(thread_cnt_memcpy) + "_reversed.csv"); - print_to_file(out_file, "run", "src_node", "dest_node", "bw", "result"); - const size_t workload = 4_GiB; - - for(int src_node = 15; src_node >= 0; src_node--){ - for(int dest_node = 15; dest_node >= 0; dest_node--){ - exec_multiple_runs_memcpy(workload, exec_node, src_node, dest_node, &out_file, "reversed"); - } - } - out_file.close(); -} - - - -void measure_all_memcpy_bw_for_chosen_execnode_reversed_bitwise(int exec_node){ - std::ofstream out_file; - out_file.open("../results/micro_bench/micro_bench_bw_memcpy_execnode_" + std::to_string(exec_node) - + "_threadcnt_" + std::to_string(thread_cnt_memcpy) + "_reversed_bitwise.csv"); - print_to_file(out_file, "run", "src_node", "dest_node", "bw", "result"); - const size_t workload = 4_GiB; - - for(int src_node = 0; src_node < 16; src_node++){ - for(int dest_node = 0; dest_node < 16; dest_node++){ - int reversed_src_node = reverse_bits(src_node, 4); - int reversed_dest_node = reverse_bits(dest_node, 4); - exec_multiple_runs_memcpy(workload, exec_node, reversed_src_node, reversed_dest_node, &out_file, "reversed_bitwise"); - } - } - out_file.close(); -} - - -void exec_multiple_runs_read(size_t workload, int mem_node, int exec_node, std::ofstream *out_file, std::string iteration_type){ - base_t* data = (base_t*) numa_alloc_onnode(workload, mem_node); - fill_mt(data, workload, 0, 100, 42); - base_t* results = (base_t*) numa_alloc_onnode(thread_cnt_read * sizeof(base_t), exec_node); - numa_run_on_node(exec_node); - - if(mem_node == 0 && exec_node == 0){ - std::ofstream check_file; - check_file.open("../results/micro_bench/micro_bench_bw_read_threadcnt_" + std::to_string(thread_cnt_read) + "_" + iteration_type + ".checksum"); - check_file << sum_up(data, workload); - check_file.close(); - } - - for(size_t run = 0; run < runs; run++){ - double bw = measure_read_bw(data, workload, results); - std::cout << "Read throughput executed on node " << exec_node << " for node " << mem_node << ": " << bw << " GiB/s" << std::endl; - print_to_file(*out_file, run, exec_node, mem_node, bw, results[0]); - std::memset(results, 0x00, thread_cnt_read * sizeof(base_t)); - } - numa_free(data, workload); - numa_free(results, thread_cnt_read * sizeof(base_t)); -} - -void measure_all_read_bw(){ - std::ofstream out_file; - out_file.open("../results/micro_bench/micro_bench_bw_read_threadcnt_" + std::to_string(thread_cnt_read) + ".csv"); - print_to_file(out_file, "run", "exec_node", "mem_node", "bw", "result"); - const size_t workload = 8_GiB; - - for(int exec_node = 0; exec_node < 8; exec_node++){ - for(int mem_node = 0; mem_node < 16; mem_node++){ - exec_multiple_runs_read(workload, mem_node, exec_node, &out_file, ""); - } - } - 
out_file.close(); -} - -void measure_all_read_bw_reversed(){ - std::ofstream out_file; - out_file.open("../results/micro_bench/micro_bench_bw_read_threadcnt_" + std::to_string(thread_cnt_read) + "_reversed.csv"); - print_to_file(out_file, "run", "exec_node", "mem_node", "bw", "result"); - const size_t workload = 8_GiB; - - for(int exec_node = 7; exec_node >= 0; exec_node--){ - for(int mem_node = 15; mem_node >= 0; mem_node--){ - exec_multiple_runs_read(workload, mem_node, exec_node, &out_file, "reversed"); - } - } - out_file.close(); -} - -void measure_all_read_bw_reversed_bitwise(){ - std::ofstream out_file; - out_file.open("../results/micro_bench/micro_bench_bw_read_threadcnt_" + std::to_string(thread_cnt_read) + "_reversed_bitwise.csv"); - print_to_file(out_file, "run", "exec_node", "mem_node", "bw", "result"); - const size_t workload = 8_GiB; - - for(int exec_node0 = 0; exec_node0 < 8; exec_node0++){ - for(int mem_node0 = 0; mem_node0 < 16; mem_node0++){ - int mem_node = reverse_bits(mem_node0, 4); - int exec_node = reverse_bits(exec_node0, 3); - exec_multiple_runs_read(workload, mem_node, exec_node, &out_file, "reversed_bitwise"); - } - } - out_file.close(); -} - - - -int main() { - // nodes 0-7 hold cores and DRAM, nodes 8-15 only HBM - - measure_all_read_bw_reversed_bitwise(); - measure_all_memcpy_bw_for_chosen_execnode_reversed_bitwise(0); - - return 0; -} \ No newline at end of file diff --git a/qdp_project/src/benchmark/pipelines/DIMES_scan_filter_pipe.h b/qdp_project/src/benchmark/pipelines/DIMES_scan_filter_pipe.h deleted file mode 100644 index 6dbc652..0000000 --- a/qdp_project/src/benchmark/pipelines/DIMES_scan_filter_pipe.h +++ /dev/null @@ -1,391 +0,0 @@ - -#include -#include -#include -#include - -#include - -#include "filter.h" -#include "aggregation.h" -#include "vector_loader.h" -#include "timer_utils.h" -#include "barrier_utils.h" -#include "execution_modes.h" - - -template -class Query_Wrapper { -public: - // sync - std::shared_future* ready_future; - - thread_runtime_timing* trt; - barrier_timing* bt; - -private: - // numa - uint32_t close_mem; - uint32_t far_mem; - - // data - size_t size_b; - size_t chunk_size_b; - size_t chunk_size_w; - size_t chunk_cnt; - base_t* data_a; - base_t* data_b; - base_t* dest; - - // ratios - uint32_t thread_count_fc; - uint32_t thread_count_fi; - uint32_t thread_count_ag; - uint32_t thread_group; - - // done bits - volatile uint8_t* ready_flag_a; - volatile uint8_t* ready_flag_b; - std::mutex ready_a_m; - std::mutex ready_b_m; - - // buffer - uint16_t* mask_a; - uint16_t* mask_b; - base_t** buffer_b; - - // params - base_t cmp_a; - base_t cmp_b; - bool no_copy; - NewPMode mode; - - // sync - std::unique_ptr*>> sync_barrier; - std::string barrier_mode = BARRIER_MODE; - - using filterCopy = Filter; - using filterNoCopy = Filter; - using filter = Filter; - using aggregation = Aggregation; - -public: - - - Query_Wrapper(std::shared_future* rdy_fut, size_t workload_b, size_t chunk_size_b, base_t* data_a, - base_t* data_b, base_t* dest, uint32_t numa_close, uint32_t numa_far, uint32_t tc_fi, uint32_t tc_fc, uint32_t tc_ag, - NewPMode mode, uint32_t thread_group, base_t cmp_a = 50, base_t cmp_b = 42, bool no_copy = false) : - ready_future(rdy_fut), size_b(workload_b), chunk_size_b(chunk_size_b), data_a(data_a), data_b(data_b), - dest(dest), close_mem(numa_close), far_mem(numa_far), mode(mode), thread_group(thread_group), cmp_a(cmp_a), cmp_b(cmp_b), no_copy(no_copy){ - - chunk_size_w = chunk_size_b / sizeof(base_t); - chunk_cnt = size_b / 
chunk_size_b; - thread_count_fi = tc_fi; - thread_count_fc = tc_fc; - thread_count_ag = tc_ag; - - ready_flag_a = (volatile uint8_t *) numa_alloc_onnode( - chunk_cnt * thread_count_fi / 8 + ((chunk_cnt * thread_count_fi % 8) != 0), close_mem); - ready_flag_b = (volatile uint8_t *) numa_alloc_onnode( - chunk_cnt * thread_count_fc / 8 + ((chunk_cnt * thread_count_fc % 8) != 0), close_mem); - - mask_a = (uint16_t *) numa_alloc_onnode(size_b / sizeof(base_t), close_mem); - mask_b = (uint16_t *) numa_alloc_onnode(size_b / sizeof(base_t), close_mem); - - trt = new thread_runtime_timing(4, 16*4*4*4, close_mem); - bt = new barrier_timing(4, 16*4*4*4, close_mem); - reset_barriers(); - - if constexpr(BUFFER_LIMIT==1) { - // TODO size ok like that? - buffer_b = (base_t**) numa_alloc_onnode(size_b * sizeof(base_t*), close_mem); - buffer_b[0] = (base_t*) numa_alloc_onnode(thread_group * chunk_size_b, close_mem); - buffer_b[1] = (base_t*) numa_alloc_onnode(thread_group * chunk_size_b, close_mem); - } else { - buffer_b = (base_t **) numa_alloc_onnode(sizeof(base_t*), close_mem); - base_t* buffer_tmp = (base_t *) numa_alloc_onnode(size_b, close_mem); - *buffer_b = buffer_tmp; - } - }; - - void reset_barriers(){ - if(sync_barrier != nullptr) { - for(auto& barrier : *sync_barrier) { - delete barrier; - } - sync_barrier.reset(); - } - - sync_barrier = std::make_unique*>>(thread_group); - uint32_t thread_count_sum = thread_count_ag + thread_count_fi + thread_count_fc; - uint32_t barrier_count = barrier_mode.compare("global") == 0 ? 1 : thread_group; - uint32_t barrier_thread_count; - - if constexpr(simple){ - barrier_thread_count = (thread_group / barrier_count) * - (mode == NewPMode::Prefetch ? thread_count_sum : (thread_count_ag + thread_count_fi)); - } else { - barrier_thread_count = (thread_group / barrier_count) * thread_count_sum; - } - for(uint32_t i = 0; i < barrier_count; ++i) { - (*sync_barrier)[i] = new std::barrier(barrier_thread_count); - } - } - - void clear_buffers () { - std::memset((void*)ready_flag_a, 0x00, chunk_cnt * thread_count_fi / 8 + ((chunk_cnt * thread_count_fi % 8) != 0)); - std::memset((void*)ready_flag_b, 0x00, chunk_cnt * thread_count_fc / 8 + ((chunk_cnt * thread_count_fc % 8) != 0)); - - std::memset(mask_a, 0x00, size_b / sizeof(base_t)); - std::memset(mask_b, 0x00, size_b / sizeof(base_t)); - if constexpr(BUFFER_LIMIT==1) { - std::memset(buffer_b[0], 0x00, thread_group * chunk_size_b); - std::memset(buffer_b[1], 0x00, thread_group * chunk_size_b); - } else { - std::memset(*buffer_b, 0x00, size_b); - } - - trt->reset_accumulator(); - bt->reset_accumulator(); - reset_barriers(); - }; - - ~Query_Wrapper() { - numa_free((void*)ready_flag_a, - chunk_cnt * thread_count_fi / 8 + ((chunk_cnt * thread_count_fi % 8) != 0)); - numa_free((void*)ready_flag_b, - chunk_cnt * thread_count_fc / 8 + ((chunk_cnt * thread_count_fc % 8) != 0)); - - numa_free(mask_a, size_b / sizeof(base_t)); - numa_free(mask_b, size_b / sizeof(base_t)); - if constexpr(BUFFER_LIMIT==1) { - numa_free(buffer_b[0], thread_group * chunk_size_b); - numa_free(buffer_b[1], thread_group * chunk_size_b); - numa_free(buffer_b, size_b * sizeof(base_t*)); - } else { - numa_free(*buffer_b, size_b); - } - - delete trt; - for(auto& barrier : *sync_barrier) { - delete barrier; - } - delete bt; - - }; - - //this can be set without need to change allocations - void set_thread_group_count(uint32_t value) { - this->thread_group = value; - }; - -private: - static inline base_t* get_sub_chunk_ptr(base_t* base_ptr, size_t chunk_id, 
size_t chunk_size_w, size_t tid, - size_t tcnt) { - base_t* chunk_ptr = base_ptr + chunk_id * chunk_size_w; - return chunk_ptr + tid * (chunk_size_w / tcnt); - } - - static inline uint16_t* get_sub_mask_ptr(uint16_t* base_ptr, size_t chunk_id, size_t chunk_size_w, size_t tid, - size_t tcnt) { - // 16 integer are addressed with one uint16_t in mask buffer - size_t offset = chunk_id * chunk_size_w + tid * (chunk_size_w / tcnt); - return base_ptr + (offset / 16); - } - - static bool bit_at(volatile uint8_t* bitmap, uint32_t bitpos) { - uint8_t value = bitmap[bitpos / 8]; - switch(bitpos % 8) { - case 0: return value & 0b00000001; - case 1: return value & 0b00000010; - case 2: return value & 0b00000100; - case 3: return value & 0b00001000; - case 4: return value & 0b00010000; - case 5: return value & 0b00100000; - case 6: return value & 0b01000000; - case 7: return value & 0b10000000; - default: return false; - } - } - - static void set_bit_at(volatile uint8_t* bitmap, std::mutex& mutex, uint32_t bitpos) { - mutex.lock(); - switch(bitpos % 8) { - case 0: bitmap[bitpos / 8] |= 0b00000001;break; - case 1: bitmap[bitpos / 8] |= 0b00000010;break; - case 2: bitmap[bitpos / 8] |= 0b00000100;break; - case 3: bitmap[bitpos / 8] |= 0b00001000;break; - case 4: bitmap[bitpos / 8] |= 0b00010000;break; - case 5: bitmap[bitpos / 8] |= 0b00100000;break; - case 6: bitmap[bitpos / 8] |= 0b01000000;break; - case 7: bitmap[bitpos / 8] |= 0b10000000;break; - } - mutex.unlock(); - } - -public: - - static base_t checksum(base_t* a, base_t* b, base_t cmp_a, base_t cmp_b, size_t size_b) { - base_t sum = 0; - for(int i = 0; i < size_b / sizeof(base_t); ++i) { - if(a[i] >= cmp_a && b[i] <= cmp_b) { - sum += b[i]; - } - } - return sum; - } - - static void checkmask(uint16_t* mask, base_t cmp, base_t* data, size_t size_b, bool leq) { - uint32_t cnt = 0; - for(int i = 0; i < size_b / sizeof(base_t); ++i) { - if(leq) { - if(((data[i] <= cmp) != bit_at((uint8_t*)mask, i))) { - ++cnt; - } - } else { - if(((data[i] >= cmp) != bit_at((uint8_t*)mask, i))) { - ++cnt; - } - } - } - } - - static void checkmask_16(uint16_t* mask, base_t cmp, base_t* data, size_t size_b, bool leq) { - for(int i = 0; i < size_b / sizeof(base_t) / 16 ; ++i) { - std::bitset<16> m(mask[i]); - uint16_t ch = 0; - for(int j = 0; j < 16; ++j) { - if(data[i*16 + j] <= cmp) { - ch |= 0x1 << j; - } - } - std::bitset<16> c(ch); - - std::cout << "act " << m << std::endl; - std::cout << "rea " << c << std::endl << std::endl; - } - } - - - void scan_b(size_t gid, size_t gcnt, size_t tid) { - size_t tcnt = thread_count_fc; - assert(chunk_size_w % tcnt == 0); - assert(chunk_size_w % 16 == 0); - assert(chunk_size_w % tcnt * 16 == 0); - - // wait till everyone can start - ready_future->wait(); - - // the lower gids run once more if the chunks are not evenly distributable - uint32_t runs = chunk_cnt / gcnt + (chunk_cnt % gcnt > gid); - uint32_t barrier_idx = barrier_mode.compare("global") == 0 ? 
0 : gid; - for(uint32_t i = 0; i < runs; ++i) { - trt->start_timer(1, tid * gcnt + gid); - - // calculate pointers - size_t chunk_id = gid + gcnt * i; - base_t* chunk_ptr = get_sub_chunk_ptr(data_b , chunk_id, chunk_size_w, tid, tcnt); - uint16_t* mask_ptr = get_sub_mask_ptr (mask_b , chunk_id, chunk_size_w, tid, tcnt); - - if constexpr(simple){ - base_t* buffer_ptr; - if constexpr(BUFFER_LIMIT==1) { - buffer_ptr = get_sub_chunk_ptr(buffer_b[i % 2], gid, chunk_size_w, tid, tcnt); - } else { - buffer_ptr = get_sub_chunk_ptr(*buffer_b, chunk_id, chunk_size_w, tid, tcnt); - } - std::memcpy(buffer_ptr, chunk_ptr, chunk_size_b / tcnt); - } else { - if(no_copy) { - filterNoCopy::apply_same(mask_ptr, nullptr, chunk_ptr, cmp_b, chunk_size_b / tcnt); - } else { - base_t* buffer_ptr; - if constexpr(BUFFER_LIMIT==1) { - buffer_ptr = get_sub_chunk_ptr(buffer_b[i % 2], gid, chunk_size_w, tid, tcnt); - } else { - buffer_ptr = get_sub_chunk_ptr(*buffer_b, chunk_id, chunk_size_w, tid, tcnt); - } - filterCopy::apply_same(mask_ptr, buffer_ptr, chunk_ptr, cmp_b, chunk_size_b / tcnt); - } - } - - trt->stop_timer(1, tid * gcnt + gid); - bt->timed_wait(*(*sync_barrier)[barrier_idx], 1, tid * gcnt + gid); - - } - (*(*sync_barrier)[barrier_idx]).arrive_and_drop(); - - } - - void scan_a(size_t gid, size_t gcnt, size_t tid) { - size_t tcnt = thread_count_fi; - assert(chunk_size_w % tcnt == 0); - assert(chunk_size_w % 16 == 0); - assert(chunk_size_w % tcnt * 16 == 0); - - // wait till everyone can start - ready_future->wait(); - - // the lower gids run once more if the chunks are not evenly distributable - uint32_t runs = chunk_cnt / gcnt + (chunk_cnt % gcnt > gid); - uint32_t barrier_idx = barrier_mode.compare("global") == 0 ? 0 : gid; - for(uint32_t i = 0; i < runs; ++i) { - trt->start_timer(0, tid * gcnt + gid); - // calculate pointers - size_t chunk_id = gid + gcnt * i; - base_t* chunk_ptr = get_sub_chunk_ptr(data_a, chunk_id, chunk_size_w, tid, tcnt); - uint16_t* mask_ptr = get_sub_mask_ptr (mask_a, chunk_id, chunk_size_w, tid, tcnt); - - filter::apply_same(mask_ptr, nullptr, chunk_ptr, cmp_a, chunk_size_b / tcnt); - - trt->stop_timer(0, tid * gcnt + gid); - bt->timed_wait(*(*sync_barrier)[barrier_idx], 0, tid * gcnt + gid); - } - (*(*sync_barrier)[barrier_idx]).arrive_and_drop(); - } - - void aggr_j(size_t gid, size_t gcnt, size_t tid) { - size_t tcnt = thread_count_ag; - // wait till everyone can start - ready_future->wait(); - - // calculate values - __m512i aggregator = aggregation::OP::zero(); - // the lower gids run once more if the chunks are not evenly distributable - uint32_t runs = chunk_cnt / gcnt + (chunk_cnt % gcnt > gid); - uint32_t barrier_idx = barrier_mode.compare("global") == 0 ? 
0 : gid; - for(uint32_t i = 0; i < runs; ++i) { - - bt->timed_wait(*(*sync_barrier)[barrier_idx], 2, tid * gcnt + gid); - trt->start_timer(2, tid * gcnt + gid); - - // calculate pointers - size_t chunk_id = gid + gcnt * i; - base_t* chunk_ptr; - if(no_copy) { - chunk_ptr = get_sub_chunk_ptr(data_b, chunk_id, chunk_size_w, tid, tcnt); - } else { - if constexpr(BUFFER_LIMIT==1) { - chunk_ptr = get_sub_chunk_ptr(buffer_b[i % 2], gid, chunk_size_w, tid, tcnt); - } else { - chunk_ptr = get_sub_chunk_ptr(*buffer_b, chunk_id, chunk_size_w, tid, tcnt); - } - } - uint16_t* mask_ptr_a = get_sub_mask_ptr (mask_a, chunk_id, chunk_size_w, tid, tcnt); - uint16_t* mask_ptr_b = get_sub_mask_ptr (mask_b, chunk_id, chunk_size_w, tid, tcnt); - - base_t tmp = _mm512_reduce_add_epi64(aggregator); - if constexpr(simple){ - aggregator = aggregation::apply_masked(aggregator, chunk_ptr, mask_ptr_a, chunk_size_b / tcnt); - } else { - aggregator = aggregation::apply_masked(aggregator, chunk_ptr, mask_ptr_a, mask_ptr_b, chunk_size_b / tcnt); - } - trt->stop_timer(2, tid * gcnt + gid); - } - - // so threads with more runs dont wait for finished threads - (*(*sync_barrier)[barrier_idx]).arrive_and_drop(); - - aggregation::happly(dest + (tid * gcnt + gid), aggregator); - } -}; \ No newline at end of file diff --git a/qdp_project/src/benchmark/pipelines/MAX_scan_filter_pipe.h b/qdp_project/src/benchmark/pipelines/MAX_scan_filter_pipe.h index 3b1d861..e224391 100644 --- a/qdp_project/src/benchmark/pipelines/MAX_scan_filter_pipe.h +++ b/qdp_project/src/benchmark/pipelines/MAX_scan_filter_pipe.h @@ -15,9 +15,9 @@ #include "measurement_utils.h" #include "execution_modes.h" -#include "../../../thirdParty/dsa_offload/offloading-cacher/cache.hpp" +#include "../../../../offloading-cacher/cache.hpp" -template +template class Query_Wrapper { public: // sync @@ -28,11 +28,9 @@ public: pcm_value_collector* pvc; private: - dsacache::Cache cache_; + static constexpr size_t COPY_POLICY_MIN_SIZE = 64 * 1024 * 1024; - // numa - uint32_t close_mem; - uint32_t far_mem; + dsacache::Cache cache_; // data size_t size_b; @@ -47,13 +45,11 @@ private: uint32_t thread_count_fc; uint32_t thread_count_fi; uint32_t thread_count_ag; - uint32_t thread_group; + uint32_t thread_count; // done bits volatile uint8_t* ready_flag_a; volatile uint8_t* ready_flag_b; - std::mutex ready_a_m; - std::mutex ready_b_m; // buffer uint16_t* mask_a; @@ -73,70 +69,72 @@ private: using filter = Filter; using aggregation = Aggregation; - void InitCache(const std::string& device) { - if (device == "default") { - static const auto cache_policy = [](const int numa_dst_node, const int numa_src_node, const size_t data_size) { - return numa_dst_node; - }; + static int CachePlacementPolicy(const int numa_dst_node, const int numa_src_node, const size_t data_size) { + return numa_dst_node < 8 ? 
numa_dst_node + 8 : numa_dst_node; + } - static const auto copy_policy = [](const int numa_dst_node, const int numa_src_node) { - return std::vector{ numa_src_node, numa_dst_node }; - }; + static std::vector CopyMethodPolicy(const int numa_dst_node, const int numa_src_node, const size_t data_size) { + if (data_size < COPY_POLICY_MIN_SIZE) { + // if the data size is small then the copy will just be carried + // out by the destination node which does not require setting numa + // thread affinity as the selected dsa engine is already the one + // present on the calling thread - cache_.Init(cache_policy,copy_policy); - } - else if (device == "xeonmax") { - static const auto cache_policy = [](const int numa_dst_node, const int numa_src_node, const size_t data_size) { - return numa_dst_node < 8 ? numa_dst_node + 8 : numa_dst_node; - }; - - static const auto copy_policy = [](const int numa_dst_node, const int numa_src_node) { - const bool same_socket = ((numa_dst_node ^ numa_src_node) & 4) == 0; - if (same_socket) { - const bool socket_number = numa_dst_node >> 2; - if (socket_number == 0) return std::vector{ 0, 1, 2, 3 }; - else return std::vector{ 4, 5, 6, 7 }; - } - else return std::vector{ numa_src_node, numa_dst_node }; - }; - - cache_.Init(cache_policy,copy_policy); + return std::vector{ (numa_dst_node >= 8 ? numa_dst_node - 8 : numa_dst_node) }; } else { - std::cerr << "Given device '" << device << "' not supported!" << std::endl; - exit(-1); + // for sufficiently large data, smart copy is used which will utilize + // all four engines for intra-socket copy operations and cross copy on + // the source and destination nodes for inter-socket copy + + const bool same_socket = ((numa_dst_node ^ numa_src_node) & 4) == 0; + + if (same_socket) { + const bool socket_number = numa_dst_node >> 2; + if (socket_number == 0) return std::vector{ 0, 1, 2, 3 }; + else return std::vector{ 4, 5, 6, 7 }; + } + else { + return std::vector{ + (numa_src_node >= 8 ? numa_src_node - 8 : numa_src_node), + (numa_dst_node >= 8 ? 
numa_dst_node - 8 : numa_dst_node) + }; + } } } public: - - - Query_Wrapper(std::shared_future* rdy_fut, size_t workload_b, size_t chunk_size_b, base_t* data_a, - base_t* data_b, base_t* dest, uint32_t numa_close, uint32_t numa_far, uint32_t tc_fi, uint32_t tc_fc, uint32_t tc_ag, - NewPMode mode, uint32_t thread_group, base_t cmp_a = 50, base_t cmp_b = 42) : + Query_Wrapper(std::shared_future* rdy_fut, size_t workload_b, size_t chunk_size_b, base_t* data_a, + base_t* data_b, base_t* dest, uint32_t tc_fi, uint32_t tc_fc, uint32_t tc_ag, + NewPMode mode, base_t cmp_a = 50, base_t cmp_b = 42) : ready_future(rdy_fut), size_b(workload_b), chunk_size_b(chunk_size_b), data_a(data_a), data_b(data_b), - dest(dest), close_mem(numa_close), far_mem(numa_far), mode(mode), thread_group(thread_group), cmp_a(cmp_a), cmp_b(cmp_b){ - + dest(dest), mode(mode), cmp_a(cmp_a), cmp_b(cmp_b) { + + const int current_cpu = sched_getcpu(); + const int current_node = numa_node_of_cpu(current_cpu); + const int cache_node = CachePlacementPolicy(current_node, current_node, 0); + chunk_size_w = chunk_size_b / sizeof(base_t); chunk_cnt = size_b / chunk_size_b; + thread_count_fi = tc_fi; thread_count_fc = tc_fc; thread_count_ag = tc_ag; - ready_flag_a = (volatile uint8_t *) numa_alloc_onnode( - chunk_cnt * thread_count_fi / 8 + ((chunk_cnt * thread_count_fi % 8) != 0), close_mem); - ready_flag_b = (volatile uint8_t *) numa_alloc_onnode( - chunk_cnt * thread_count_fc / 8 + ((chunk_cnt * thread_count_fc % 8) != 0), close_mem); + thread_count = tc_fi + tc_fc + tc_ag; + + ready_flag_a = (volatile uint8_t *) numa_alloc_onnode( chunk_cnt * thread_count_fi / 8 + ((chunk_cnt * thread_count_fi % 8) != 0), cache_node); + ready_flag_b = (volatile uint8_t *) numa_alloc_onnode( chunk_cnt * thread_count_fc / 8 + ((chunk_cnt * thread_count_fc % 8) != 0), cache_node); - mask_a = (uint16_t *) numa_alloc_onnode(size_b / sizeof(base_t), close_mem); - mask_b = (uint16_t *) numa_alloc_onnode(size_b / sizeof(base_t), close_mem); + mask_a = (uint16_t *) numa_alloc_onnode(size_b / sizeof(base_t), cache_node); + mask_b = (uint16_t *) numa_alloc_onnode(size_b / sizeof(base_t), cache_node); - InitCache("xeonmax"); + cache_.Init(CachePlacementPolicy, CopyMethodPolicy); - size_t measurement_space = THREAD_GROUP_MULTIPLIER * std::max(std::max(tc_fi, tc_fc), tc_ag); - trt = new thread_runtime_timing(3, measurement_space, far_mem); - bt = new barrier_timing(3, measurement_space, far_mem); - pvc = new pcm_value_collector({"scan_a", "scan_b", "aggr_j"}, measurement_space, far_mem); + size_t measurement_space = std::max(std::max(tc_fi, tc_fc), tc_ag); + trt = new thread_runtime_timing(3, measurement_space, current_node); + bt = new barrier_timing(3, measurement_space, current_node); + pvc = new pcm_value_collector({"scan_a", "scan_b", "aggr_j"}, measurement_space, current_node); reset_barriers(); }; @@ -148,16 +146,15 @@ public: sync_barrier.reset(); } - sync_barrier = std::make_unique*>>(thread_group); + sync_barrier = std::make_unique*>>(thread_count); uint32_t thread_count_sum = thread_count_ag + thread_count_fi + thread_count_fc; - uint32_t barrier_count = barrier_mode.compare("global") == 0 ? 1 : thread_group; + uint32_t barrier_count = barrier_mode.compare("global") == 0 ? 1 : thread_count; uint32_t barrier_thread_count; if constexpr(simple){ - barrier_thread_count = (thread_group / barrier_count) * - (mode == NewPMode::Prefetch ? 
thread_count_sum : (thread_count_ag + thread_count_fi)); + barrier_thread_count = (thread_count / barrier_count) * (mode == NewPMode::Prefetch ? thread_count_sum : (thread_count_ag + thread_count_fi)); } else { - barrier_thread_count = (thread_group / barrier_count) * thread_count_sum; + barrier_thread_count = (thread_count / barrier_count) * thread_count_sum; } for(uint32_t i = 0; i < barrier_count; ++i) { (*sync_barrier)[i] = new std::barrier(barrier_thread_count); @@ -180,10 +177,8 @@ public: }; ~Query_Wrapper() { - numa_free((void*)ready_flag_a, - chunk_cnt * thread_count_fi / 8 + ((chunk_cnt * thread_count_fi % 8) != 0)); - numa_free((void*)ready_flag_b, - chunk_cnt * thread_count_fc / 8 + ((chunk_cnt * thread_count_fc % 8) != 0)); + numa_free((void*)ready_flag_a, chunk_cnt * thread_count_fi / 8 + ((chunk_cnt * thread_count_fi % 8) != 0)); + numa_free((void*)ready_flag_b, chunk_cnt * thread_count_fc / 8 + ((chunk_cnt * thread_count_fc % 8) != 0)); numa_free(mask_a, size_b / sizeof(base_t)); numa_free(mask_b, size_b / sizeof(base_t)); @@ -202,14 +197,12 @@ public: }; private: - static inline base_t* get_sub_chunk_ptr(base_t* base_ptr, size_t chunk_id, size_t chunk_size_w, size_t tid, - size_t tcnt) { + static inline base_t* get_sub_chunk_ptr(base_t* base_ptr, size_t chunk_id, size_t chunk_size_w, size_t tid, size_t tcnt) { base_t* chunk_ptr = base_ptr + chunk_id * chunk_size_w; return chunk_ptr + tid * (chunk_size_w / tcnt); } - static inline uint16_t* get_sub_mask_ptr(uint16_t* base_ptr, size_t chunk_id, size_t chunk_size_w, size_t tid, - size_t tcnt) { + static inline uint16_t* get_sub_mask_ptr(uint16_t* base_ptr, size_t chunk_id, size_t chunk_size_w, size_t tid, size_t tcnt) { // 16 integer are addressed with one uint16_t in mask buffer size_t offset = chunk_id * chunk_size_w + tid * (chunk_size_w / tcnt); return base_ptr + (offset / 16); @@ -258,6 +251,7 @@ public: // the lower gids run once more if the chunks are not evenly distributable uint32_t runs = chunk_cnt / gcnt + (chunk_cnt % gcnt > gid); uint32_t barrier_idx = barrier_mode.compare("global") == 0 ? 
0 : gid; + for(uint32_t i = 0; i < runs; ++i) { trt->start_timer(1, tid * gcnt + gid); pvc->start("scan_b", tid * gcnt + gid); @@ -268,28 +262,45 @@ public: uint16_t* mask_ptr = get_sub_mask_ptr(mask_b, chunk_id, chunk_size_w, tid, tcnt); if constexpr(simple){ - cache_.Access(chunk_ptr, chunk_size_b / tcnt); + cache_.Access(reinterpret_cast(chunk_ptr), chunk_size_b / tcnt); } else { - const auto data = cache_.Access(chunk_ptr, chunk_size_b / tcnt); + const auto data = cache_.Access(reinterpret_cast(chunk_ptr), chunk_size_b / tcnt); - // wait on copy to complete - during this time other threads may - // continue with their calculation which leads to little impact - // and we will be faster if the cache is used + if constexpr(wait_b) { + // wait on copy to complete - during this time other threads may + // continue with their calculation which leads to little impact + // and we will be faster if the cache is used - data->WaitOnCompletion(); + data->WaitOnCompletion(); - // obtain the data location from the cache entry + // obtain the data location from the cache entry - base_t* data_ptr = data->GetDataLocation(); + base_t* data_ptr = reinterpret_cast(data->GetDataLocation()); - // nullptr is still a legal return value for CacheData::GetLocation() - // even after waiting, so this must be checked + // nullptr is still a legal return value for CacheData::GetLocation() + // even after waiting, so this must be checked - if (data_ptr == nullptr) { - data_ptr = chunk_ptr; + if (data_ptr == nullptr) { + std::cerr << "[!] Cache Miss in ScanB" << std::endl; + data_ptr = chunk_ptr; + } + + filterNoCopy::apply_same(mask_ptr, nullptr, data_ptr, cmp_b, chunk_size_b / tcnt); } + else { + // obtain the data location from the cache entry + + base_t* data_ptr = reinterpret_cast(data->GetDataLocation()); + + // nullptr is still a legal return value for CacheData::GetLocation() + // even after waiting, so this must be checked - filterNoCopy::apply_same(mask_ptr, nullptr, data_ptr, cmp_b, chunk_size_b / tcnt); + if (data_ptr == nullptr) { + data_ptr = chunk_ptr; + } + + filterNoCopy::apply_same(mask_ptr, nullptr, data_ptr, cmp_b, chunk_size_b / tcnt); + } } pvc->stop("scan_b", tid * gcnt + gid); @@ -321,7 +332,21 @@ public: base_t* chunk_ptr = get_sub_chunk_ptr(data_a, chunk_id, chunk_size_w, tid, tcnt); uint16_t* mask_ptr = get_sub_mask_ptr (mask_a, chunk_id, chunk_size_w, tid, tcnt); - filter::apply_same(mask_ptr, nullptr, chunk_ptr, cmp_a, chunk_size_b / tcnt); + if constexpr (cache_a) { + const auto data = cache_.Access(reinterpret_cast(chunk_ptr), chunk_size_b / tcnt); + data->WaitOnCompletion(); + base_t* data_ptr = reinterpret_cast(data->GetDataLocation()); + + if (data_ptr == nullptr) { + std::cerr << "[!] Cache Miss in ScanA" << std::endl; + data_ptr = chunk_ptr; + } + + filter::apply_same(mask_ptr, nullptr, data_ptr, cmp_a, chunk_size_b / tcnt); + } + else { + filter::apply_same(mask_ptr, nullptr, chunk_ptr, cmp_a, chunk_size_b / tcnt); + } pvc->stop("scan_a", tid * gcnt + gid); trt->stop_timer(0, tid * gcnt + gid); @@ -340,19 +365,19 @@ public: // the lower gids run once more if the chunks are not evenly distributable uint32_t runs = chunk_cnt / gcnt + (chunk_cnt % gcnt > gid); uint32_t barrier_idx = barrier_mode.compare("global") == 0 ? 
0 : gid; + for(uint32_t i = 0; i < runs; ++i) { - bt->timed_wait(*(*sync_barrier)[barrier_idx], 2, tid * gcnt + gid); trt->start_timer(2, tid * gcnt + gid); pvc->start("aggr_j", tid * gcnt + gid); // calculate pointers size_t chunk_id = gid + gcnt * i; - const base_t* chunk_ptr = get_sub_chunk_ptr(data_b, chunk_id, chunk_size_w, tid, tcnt); + base_t* chunk_ptr = get_sub_chunk_ptr(data_b, chunk_id, chunk_size_w, tid, tcnt); // access the cache for the given chunk which will have been accessed in scan_b - const auto data = cache_.Access(chunk_ptr, chunk_size_b / tcnt); + const auto data = cache_.Access(reinterpret_cast(chunk_ptr), chunk_size_b / tcnt); // wait on the caching task to complete, this will give time for other processes // to make progress here which will therefore not hurt performance @@ -362,14 +387,14 @@ public: // after the copy task has finished we obtain the pointer to the cached // copy of data_b which is then used from now on - const base_t* data_ptr = data->GetDataLocation(); + base_t* data_ptr = reinterpret_cast(data->GetDataLocation()); // nullptr is still a legal return value for CacheData::GetLocation() // even after waiting, so this must be checked if (data_ptr == nullptr) { data_ptr = chunk_ptr; - std::cerr << "Cache Miss" << std::endl; + std::cerr << "[!] Cache Miss in AggrJ" << std::endl; } uint16_t* mask_ptr_a = get_sub_mask_ptr (mask_a, chunk_id, chunk_size_w, tid, tcnt); diff --git a/qdp_project/src/benchmark/pipelines/scan_filter_pipe.h b/qdp_project/src/benchmark/pipelines/scan_filter_pipe.h deleted file mode 100644 index 2b10b06..0000000 --- a/qdp_project/src/benchmark/pipelines/scan_filter_pipe.h +++ /dev/null @@ -1,387 +0,0 @@ - -#include -#include -#include -#include - -#include - -#include "filter.h" -#include "aggregation.h" -#include "vector_loader.h" -#include "timer_utils.h" -#include "barrier_utils.h" -#include "execution_modes.h" - - -template -class Query_Wrapper { -public: - // sync - std::shared_future* ready_future; - - thread_runtime_timing* trt; - barrier_timing* bt; - -private: - // numa - uint32_t close_mem; - uint32_t far_mem; - - // data - size_t size_b; - size_t chunk_size_b; - size_t chunk_size_w; - size_t chunk_cnt; - base_t* data_a; - base_t* data_b; - base_t* dest; - - // ratios - uint32_t thread_count_fc; - uint32_t thread_count_fi; - uint32_t thread_count_ag; - uint32_t thread_group; - - // done bits - volatile uint8_t* ready_flag_a; - volatile uint8_t* ready_flag_b; - std::mutex ready_a_m; - std::mutex ready_b_m; - - // buffer - uint16_t* mask_a; - uint16_t* mask_b; - base_t** buffer_b; - - // params - base_t cmp_a; - base_t cmp_b; - bool no_copy; - PMode mode; - - // sync - std::unique_ptr*>> sync_barrier; - std::string barrier_mode = BARRIER_MODE; - - using filterCopy = Filter; - using filterNoCopy = Filter; - using filter = Filter; - using aggregation = Aggregation; - -public: - - - Query_Wrapper(std::shared_future* rdy_fut, size_t workload_b, size_t chunk_size_b, base_t* data_a, - base_t* data_b, base_t* dest, uint32_t numa_close, uint32_t numa_far, uint32_t tc_fi, uint32_t tc_fc, uint32_t tc_ag, - PMode mode, uint32_t thread_group, base_t cmp_a = 50, base_t cmp_b = 42, bool no_copy = false) : - ready_future(rdy_fut), size_b(workload_b), chunk_size_b(chunk_size_b), data_a(data_a), data_b(data_b), - dest(dest), close_mem(numa_close), far_mem(numa_far), mode(mode), thread_group(thread_group), cmp_a(cmp_a), cmp_b(cmp_b), no_copy(no_copy){ - - chunk_size_w = chunk_size_b / sizeof(base_t); - chunk_cnt = size_b / chunk_size_b; - 
thread_count_fi = tc_fi; - thread_count_fc = tc_fc; - thread_count_ag = tc_ag; - - ready_flag_a = (volatile uint8_t *) numa_alloc_onnode( - chunk_cnt * thread_count_fi / 8 + ((chunk_cnt * thread_count_fi % 8) != 0), close_mem); - ready_flag_b = (volatile uint8_t *) numa_alloc_onnode( - chunk_cnt * thread_count_fc / 8 + ((chunk_cnt * thread_count_fc % 8) != 0), close_mem); - - mask_a = (uint16_t *) numa_alloc_onnode(size_b / sizeof(base_t), close_mem); - mask_b = (uint16_t *) numa_alloc_onnode(size_b / sizeof(base_t), close_mem); - - trt = new thread_runtime_timing(4, 20, close_mem); - bt = new barrier_timing(4, 20, close_mem); - reset_barriers(); - - if constexpr(BUFFER_LIMIT==1) { - // TODO size ok like that? - buffer_b = (base_t**) numa_alloc_onnode(size_b * sizeof(base_t*), close_mem); - buffer_b[0] = (base_t*) numa_alloc_onnode(thread_group * chunk_size_b, close_mem); - buffer_b[1] = (base_t*) numa_alloc_onnode(thread_group * chunk_size_b, close_mem); - } else { - buffer_b = (base_t **) numa_alloc_onnode(sizeof(base_t*), close_mem); - base_t* buffer_tmp = (base_t *) numa_alloc_onnode(size_b, close_mem); - *buffer_b = buffer_tmp; - } - }; - - void reset_barriers(){ - if(sync_barrier != nullptr) { - for(auto& barrier : *sync_barrier) { - delete barrier; - } - sync_barrier.reset(); - } - - sync_barrier = std::make_unique*>>(thread_group); - uint32_t thread_count_sum = thread_count_ag + thread_count_fi + thread_count_fc; - uint32_t barrier_count = barrier_mode.compare("global") == 0 ? 1 : thread_group; - uint32_t barrier_thread_count; - - if constexpr(simple){ - barrier_thread_count = (thread_group / barrier_count) * - (mode == PMode::expl_copy ? thread_count_sum : (thread_count_ag + thread_count_fi)); - } else { - barrier_thread_count = (thread_group / barrier_count) * thread_count_sum; - } - for(uint32_t i = 0; i < barrier_count; ++i) { - (*sync_barrier)[i] = new std::barrier(barrier_thread_count); - } - } - - - void clear_buffers () { - std::memset((void*)ready_flag_a, 0x00, chunk_cnt * thread_count_fi / 8 + ((chunk_cnt * thread_count_fi % 8) != 0)); - std::memset((void*)ready_flag_b, 0x00, chunk_cnt * thread_count_fc / 8 + ((chunk_cnt * thread_count_fc % 8) != 0)); - - std::memset(mask_a, 0x00, size_b / sizeof(base_t)); - std::memset(mask_b, 0x00, size_b / sizeof(base_t)); - if constexpr(BUFFER_LIMIT==1) { - std::memset(buffer_b[0], 0x00, thread_group * chunk_size_b); - std::memset(buffer_b[1], 0x00, thread_group * chunk_size_b); - } else { - std::memset(*buffer_b, 0x00, size_b); - } - - trt->reset_accumulator(); - bt->reset_accumulator(); - reset_barriers(); - }; - - ~Query_Wrapper() { - numa_free((void*)ready_flag_a, - chunk_cnt * thread_count_fi / 8 + ((chunk_cnt * thread_count_fi % 8) != 0)); - numa_free((void*)ready_flag_b, - chunk_cnt * thread_count_fc / 8 + ((chunk_cnt * thread_count_fc % 8) != 0)); - - numa_free(mask_a, size_b / sizeof(base_t)); - numa_free(mask_b, size_b / sizeof(base_t)); - if constexpr(BUFFER_LIMIT==1) { - numa_free(buffer_b[0], thread_group * chunk_size_b); - numa_free(buffer_b[1], thread_group * chunk_size_b); - numa_free(buffer_b, size_b * sizeof(base_t*)); - } else { - numa_free(*buffer_b, size_b); - } - - delete trt; - for(auto& barrier : *sync_barrier) { - delete barrier; - } - delete bt; - - }; - -private: - static inline base_t* get_sub_chunk_ptr(base_t* base_ptr, size_t chunk_id, size_t chunk_size_w, size_t tid, - size_t tcnt) { - base_t* chunk_ptr = base_ptr + chunk_id * chunk_size_w; - return chunk_ptr + tid * (chunk_size_w / tcnt); - } - - static 
inline uint16_t* get_sub_mask_ptr(uint16_t* base_ptr, size_t chunk_id, size_t chunk_size_w, size_t tid, - size_t tcnt) { - // 16 integer are addressed with one uint16_t in mask buffer - size_t offset = chunk_id * chunk_size_w + tid * (chunk_size_w / tcnt); - return base_ptr + (offset / 16); - } - - static bool bit_at(volatile uint8_t* bitmap, uint32_t bitpos) { - uint8_t value = bitmap[bitpos / 8]; - switch(bitpos % 8) { - case 0: return value & 0b00000001; - case 1: return value & 0b00000010; - case 2: return value & 0b00000100; - case 3: return value & 0b00001000; - case 4: return value & 0b00010000; - case 5: return value & 0b00100000; - case 6: return value & 0b01000000; - case 7: return value & 0b10000000; - default: return false; - } - } - - static void set_bit_at(volatile uint8_t* bitmap, std::mutex& mutex, uint32_t bitpos) { - mutex.lock(); - switch(bitpos % 8) { - case 0: bitmap[bitpos / 8] |= 0b00000001;break; - case 1: bitmap[bitpos / 8] |= 0b00000010;break; - case 2: bitmap[bitpos / 8] |= 0b00000100;break; - case 3: bitmap[bitpos / 8] |= 0b00001000;break; - case 4: bitmap[bitpos / 8] |= 0b00010000;break; - case 5: bitmap[bitpos / 8] |= 0b00100000;break; - case 6: bitmap[bitpos / 8] |= 0b01000000;break; - case 7: bitmap[bitpos / 8] |= 0b10000000;break; - } - mutex.unlock(); - } - -public: - - static base_t checksum(base_t* a, base_t* b, base_t cmp_a, base_t cmp_b, size_t size_b) { - base_t sum = 0; - for(int i = 0; i < size_b / sizeof(base_t); ++i) { - if(a[i] >= cmp_a && b[i] <= cmp_b) { - sum += b[i]; - } - } - return sum; - } - - static void checkmask(uint16_t* mask, base_t cmp, base_t* data, size_t size_b, bool leq) { - uint32_t cnt = 0; - for(int i = 0; i < size_b / sizeof(base_t); ++i) { - if(leq) { - if(((data[i] <= cmp) != bit_at((uint8_t*)mask, i))) { - ++cnt; - } - } else { - if(((data[i] >= cmp) != bit_at((uint8_t*)mask, i))) { - ++cnt; - } - } - } - } - - static void checkmask_16(uint16_t* mask, base_t cmp, base_t* data, size_t size_b, bool leq) { - for(int i = 0; i < size_b / sizeof(base_t) / 16 ; ++i) { - std::bitset<16> m(mask[i]); - uint16_t ch = 0; - for(int j = 0; j < 16; ++j) { - if(data[i*16 + j] <= cmp) { - ch |= 0x1 << j; - } - } - std::bitset<16> c(ch); - - std::cout << "act " << m << std::endl; - std::cout << "rea " << c << std::endl << std::endl; - } - } - - - void scan_b(size_t gid, size_t gcnt, size_t tid) { - size_t tcnt = thread_count_fc; - assert(chunk_size_w % tcnt == 0); - assert(chunk_size_w % 16 == 0); - assert(chunk_size_w % tcnt * 16 == 0); - - // wait till everyone can start - ready_future->wait(); - - // the lower gids run once more if the chunks are not evenly distributable - uint32_t runs = chunk_cnt / gcnt + (chunk_cnt % gcnt > gid); - uint32_t barrier_idx = barrier_mode.compare("global") == 0 ? 
0 : gid; - for(uint32_t i = 0; i < runs; ++i) { - trt->start_timer(1, tid * gcnt + gid); - - // calculate pointers - size_t chunk_id = gid + gcnt * i; - base_t* chunk_ptr = get_sub_chunk_ptr(data_b , chunk_id, chunk_size_w, tid, tcnt); - uint16_t* mask_ptr = get_sub_mask_ptr (mask_b , chunk_id, chunk_size_w, tid, tcnt); - - if constexpr(simple){ - base_t* buffer_ptr; - if constexpr(BUFFER_LIMIT==1) { - buffer_ptr = get_sub_chunk_ptr(buffer_b[i % 2], gid, chunk_size_w, tid, tcnt); - } else { - buffer_ptr = get_sub_chunk_ptr(*buffer_b, chunk_id, chunk_size_w, tid, tcnt); - } - std::memcpy(buffer_ptr, chunk_ptr, chunk_size_b / tcnt); - } else { - if(no_copy) { - filterNoCopy::apply_same(mask_ptr, nullptr, chunk_ptr, cmp_b, chunk_size_b / tcnt); - } else { - base_t* buffer_ptr; - if constexpr(BUFFER_LIMIT==1) { - buffer_ptr = get_sub_chunk_ptr(buffer_b[i % 2], gid, chunk_size_w, tid, tcnt); - } else { - buffer_ptr = get_sub_chunk_ptr(*buffer_b, chunk_id, chunk_size_w, tid, tcnt); - } - filterCopy::apply_same(mask_ptr, buffer_ptr, chunk_ptr, cmp_b, chunk_size_b / tcnt); - } - } - - trt->stop_timer(1, tid * gcnt + gid); - bt->timed_wait(*(*sync_barrier)[barrier_idx], 1, tid * gcnt + gid); - - } - (*(*sync_barrier)[barrier_idx]).arrive_and_drop(); - - } - - void scan_a(size_t gid, size_t gcnt, size_t tid) { - size_t tcnt = thread_count_fi; - assert(chunk_size_w % tcnt == 0); - assert(chunk_size_w % 16 == 0); - assert(chunk_size_w % tcnt * 16 == 0); - - // wait till everyone can start - ready_future->wait(); - - // the lower gids run once more if the chunks are not evenly distributable - uint32_t runs = chunk_cnt / gcnt + (chunk_cnt % gcnt > gid); - uint32_t barrier_idx = barrier_mode.compare("global") == 0 ? 0 : gid; - for(uint32_t i = 0; i < runs; ++i) { - trt->start_timer(0, tid * gcnt + gid); - // calculate pointers - size_t chunk_id = gid + gcnt * i; - base_t* chunk_ptr = get_sub_chunk_ptr(data_a, chunk_id, chunk_size_w, tid, tcnt); - uint16_t* mask_ptr = get_sub_mask_ptr (mask_a, chunk_id, chunk_size_w, tid, tcnt); - - filter::apply_same(mask_ptr, nullptr, chunk_ptr, cmp_a, chunk_size_b / tcnt); - - trt->stop_timer(0, tid * gcnt + gid); - bt->timed_wait(*(*sync_barrier)[barrier_idx], 0, tid * gcnt + gid); - } - (*(*sync_barrier)[barrier_idx]).arrive_and_drop(); - } - - void aggr_j(size_t gid, size_t gcnt, size_t tid) { - size_t tcnt = thread_count_ag; - // wait till everyone can start - ready_future->wait(); - - // calculate values - __m512i aggregator = aggregation::OP::zero(); - // the lower gids run once more if the chunks are not evenly distributable - uint32_t runs = chunk_cnt / gcnt + (chunk_cnt % gcnt > gid); - uint32_t barrier_idx = barrier_mode.compare("global") == 0 ? 
0 : gid; - for(uint32_t i = 0; i < runs; ++i) { - - bt->timed_wait(*(*sync_barrier)[barrier_idx], 2, tid * gcnt + gid); - trt->start_timer(2, tid * gcnt + gid); - - // calculate pointers - size_t chunk_id = gid + gcnt * i; - base_t* chunk_ptr; - if(no_copy) { - chunk_ptr = get_sub_chunk_ptr(data_b, chunk_id, chunk_size_w, tid, tcnt); - } else { - if constexpr(BUFFER_LIMIT==1) { - chunk_ptr = get_sub_chunk_ptr(buffer_b[i%2], gid, chunk_size_w, tid, tcnt); - } else { - chunk_ptr = get_sub_chunk_ptr(*buffer_b, chunk_id, chunk_size_w, tid, tcnt); - } - } - uint16_t* mask_ptr_a = get_sub_mask_ptr (mask_a, chunk_id, chunk_size_w, tid, tcnt); - uint16_t* mask_ptr_b = get_sub_mask_ptr (mask_b, chunk_id, chunk_size_w, tid, tcnt); - - base_t tmp = _mm512_reduce_add_epi64(aggregator); - if constexpr(simple){ - aggregator = aggregation::apply_masked(aggregator, chunk_ptr, mask_ptr_a, chunk_size_b / tcnt); - } else { - aggregator = aggregation::apply_masked(aggregator, chunk_ptr, mask_ptr_a, mask_ptr_b, chunk_size_b / tcnt); - } - trt->stop_timer(2, tid * gcnt + gid); - } - - // so threads with more runs dont wait for finished threads - (*(*sync_barrier)[barrier_idx]).arrive_and_drop(); - - aggregation::happly(dest + (tid * gcnt + gid), aggregator); - } -}; \ No newline at end of file diff --git a/qdp_project/src/utils/execution_modes.h b/qdp_project/src/utils/execution_modes.h index ca04b4f..b494fab 100644 --- a/qdp_project/src/utils/execution_modes.h +++ b/qdp_project/src/utils/execution_modes.h @@ -55,17 +55,24 @@ struct new_mode_manager { };*/ constexpr static int thread_counts[2][4][3] = { + // thread counts for both simple and complex querry + // inner layout: { scan_a, scan_b, aggr_j } + //simple query - //scan_a, scan_b, aggr_j - {{4, 0, 2}, // DRAM_base - {4, 0, 2}, // HBM_base - {4, 0, 2}, // Mixed_base - {1, 4, 1}},// Prefetching + { + {4, 0, 2}, // DRAM_base + {4, 0, 2}, // HBM_base + {4, 0, 2}, // Mixed_base + {4, 4, 4} // Prefetching + }, + //complex query - {{1, 4, 1}, // DRAM_base - {1, 4, 1}, // HBM_base - {1, 4, 1}, // Mixed_base - {1, 4, 1}},// Prefetching + { + {1, 4, 1}, // DRAM_base + {1, 4, 1}, // HBM_base + {1, 4, 1}, // Mixed_base + {4, 4, 4} // Prefetching + } }; static inline NewPMode inc(NewPMode value) { @@ -81,9 +88,17 @@ struct new_mode_manager { }; static std::string string(NewPMode value) { switch(value) { - case DRAM_base: return "DRAM_Baseline"; - case HBM_base: return "HBM_Baseline"; - case Mixed_base: return "DRAM_HBM_Baseline"; - } return "Q-d_Prefetching"; + case DRAM_base: + return "DRAM_Baseline"; + case HBM_base: + return "HBM_Baseline"; + case Mixed_base: + return "DRAM_HBM_Baseline"; + case Prefetch: + return "Q-d_Prefetching"; + default: + std::cerr << "[x] Unknown Processing Mode" << std::endl; + exit(-1); + } }; }; \ No newline at end of file From 0ad0e4af042dcc9bb30da22f26c486f68317f319 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Constantin=20F=C3=BCrst?= Date: Wed, 17 Jan 2024 13:48:09 +0100 Subject: [PATCH 28/29] remove the manual build script and add numa and cpu asignment to the execution script --- qdp_project/bench_max.sh | 5 ++--- qdp_project/cmake_max.sh | 9 --------- 2 files changed, 2 insertions(+), 12 deletions(-) delete mode 100644 qdp_project/cmake_max.sh diff --git a/qdp_project/bench_max.sh b/qdp_project/bench_max.sh index b7e0168..e49275b 100644 --- a/qdp_project/bench_max.sh +++ b/qdp_project/bench_max.sh @@ -1,10 +1,9 @@ -#!bin/bash +#!/bin/bash current_date_time=$(date) echo "Benchmark start at: $current_date_time" - -../bin/MAXBench +sudo 
numactl --cpunodebind=2 -- taskset -c 0,1,2,3,4,5 ../bin/MAXBench current_date_time=$(date) echo "Benchmark end at: $current_date_time" \ No newline at end of file diff --git a/qdp_project/cmake_max.sh b/qdp_project/cmake_max.sh deleted file mode 100644 index 03c137b..0000000 --- a/qdp_project/cmake_max.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!bin/bash - -cmake -DCMAKE_BUILD_TYPE=Release -DWSUPPRESS=suppress -DBARRIER_MODE=global -DBUFFER_LIMIT=unlimited -DQUERY=complex -DTHREAD_FACTOR=2 -DPINNING=cpu -DPCM_M=false .. -cmake --build . --target MAXBench -mv ../bin/MAXBench ../bin/MAXBench_gcc - -cmake -DCMAKE_BUILD_TYPE=Release -DWSUPPRESS=suppress -DBARRIER_MODE=global -DBUFFER_LIMIT=unlimited -DQUERY=complex -DTHREAD_FACTOR=2 -DPINNING=numa -DPCM_M=false .. -cmake --build . --target MAXBench -mv ../bin/MAXBench ../bin/MAXBench_gcn From 1a3cb6dada1c9d64461d1a3141cc2b64348dc59a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Constantin=20F=C3=BCrst?= Date: Wed, 17 Jan 2024 13:49:21 +0100 Subject: [PATCH 29/29] prettify credit in the readme --- qdp_project/README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/qdp_project/README.md b/qdp_project/README.md index afad56b..7b774b4 100644 --- a/qdp_project/README.md +++ b/qdp_project/README.md @@ -1,3 +1,5 @@ This is a copy of the Query Driven Prefetching Repository + https://os.inf.tu-dresden.de/repo/gitbox/andre.berthold/Query-driven_Prefetching/src/branch/qdp_minimal/code -Original Authors: André Berthold and Anna Bartuschka + +Original Authors: André Berthold and Anna Bartuschka \ No newline at end of file
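
Note on the pinning introduced for bench_max.sh in PATCH 28/29 above: combining numactl --cpunodebind with taskset fixes both the NUMA node and the exact CPU list the benchmark runs on. The following is only a minimal verification sketch, not part of the patch series; it assumes numactl and the util-linux taskset are installed, and the node/CPU ids are simply the ones the script above happens to use, which will differ on other machines.

#!/bin/bash
# print the NUMA topology so the --cpunodebind and -c values used in
# bench_max.sh can be checked against the machine's node-to-CPU mapping
numactl --hardware
# show the node and CPU binding the current shell would pass on to children
numactl --show
# show the CPU affinity of the current shell (PID $$) as a CPU list
taskset -cp $$

Checking the reported node-to-CPU mapping this way makes it easy to confirm that the CPU list handed to taskset actually belongs to the node given to --cpunodebind before starting a long benchmark run.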