add query driven prefetching code repository copy

11 months ago · 641a7593fe
33 changed files with 4682 additions and 0 deletions
--- a/qdp_project/.gitignore
+++ b/qdp_project/.gitignore
@ -0,0 +1,104 @@
 bin/
 # CMake building files
 CMakeLists.txt.user
 CMakeCache.txt
 CMakeFiles
 CMakeScripts
 Testing
 Makefile
 cmake_install.cmake
 install_manifest.txt
 compile_commands.json
 CTestTestfile.cmake
 _deps
 .cmake
 # Prerequisites
 *.d
 # Object files
 *.o
 *.ko
 *.obj
 *.elf
 # Linker output
 *.ilk
 *.map
 *.exp
 # Precompiled Headers
 *.gch
 *.pch
 # Libraries
 *.lib
 *.a
 *.la
 *.lo
 # Shared objects (inc. Windows DLLs)
 *.dll
 *.so
 *.so.*
 *.dylib
 # Executables
 *.exe
 *.out
 *.app
 *.i*86
 *.x86_64
 *.hex
 # Debug files
 *.dSYM/
 *.su
 *.idb
 *.pdb
 # Kernel Module Compile Results
 *.mod*
 *.cmd
 .tmp_versions/
 modules.order
 Module.symvers
 Mkfile.old
 dkms.conf
 # Prerequisites
 *.d
 # Compiled Object files
 *.slo
 *.lo
 *.o
 *.obj
 # Precompiled Headers
 *.gch
 *.pch
 # Compiled Dynamic libraries
 *.so
 *.dylib
 *.dll
 # Fortran module files
 *.mod
 *.smod
 # Compiled Static libraries
 *.lai
 *.la
 *.a
 *.lib
 # Executables
 *.exe
 *.out
 *.app
--- a/qdp_project/CMakeLists.txt
+++ b/qdp_project/CMakeLists.txt
@ -0,0 +1,104 @@
 cmake_minimum_required(VERSION 3.18)
 # set the project name
 project(NUMA_Slow_Fast_Datamigration_Test VERSION 0.1)
 # specify the C standard
 set(CMAKE_CXX_STANDARD 20)
 set(CMAKE_CXX_STANDARD_REQUIRED True)
 #set flags on need                       cross compile for sapphirerapids architecture
 set(CMAKE_CXX_FLAGS  "${CMAKE_CXX_FLAGS} -march=sapphirerapids")
 #set flags on need                       cross compile for skylake micro architecture
 #set(CMAKE_CXX_FLAGS  "${CMAKE_CXX_FLAGS} -march=skylake-avx512")
 #set flags on need                       cross compile for knights landing micro architecture (for debugging)
 #set(CMAKE_CXX_FLAGS  "${CMAKE_CXX_FLAGS} -mavx512f -mavx512cd -mavx512er -mavx512pf")
 #suppress selected! warnigs that are not very important to resolve. This is to keep the compileation output clean
 set(SUPPRESS_WARNINGS "-Wno-literal-suffix -Wno-volatile")
 set(DEBUG_FLAGS "-g3" "-ggdb")
 set(RELEASE_FLAGS "-O3")
 #set pcm location
 set(PCM_LOCATION ./thirdParty/pcm)
 set(PCM_LINKS -lpcm -L${CMAKE_CURRENT_LIST_DIR}/${PCM_LOCATION}/build/lib)
 # pass the in formation about the shared library location to the linker
 link_directories(${CMAKE_CURRENT_LIST_DIR}/${PCM_LOCATION}/build/lib)
 #set flags used for Release and Debug build type
 add_compile_options(
    "$<$<CONFIG:Release>:${RELEASE_FLAGS}>"
    "$<$<CONFIG:Debug>:${DEBUG_FLAGS}>"
 )
 # evaluate custom variables
 function(eval vname vvalid vdefault)
  # is variable is set to the below value if its not already defined from the comand line 
  set(VALID ${vvalid} CACHE INTERNAL "Possible values for ${vname}")
  set(${vname} ${vdefault} CACHE STRING "The barrier mode")
  # command for GUI shenanigans
  set_property(CACHE ${vname} PROPERTY STRINGS VALID)
  if(${vname} IN_LIST VALID)
    message(STATUS "Variable ${vname} = ${${vname}}")
  else()
    message(STATUS "Variable ${vname} has invalid value ${${vname}}")
    # set the fallback value for use in parent function
    unset(${vname} CACHE)
    message(STATUS "Fallback to default: ${vname} = ${vdefault}")
    set(${vname} ${vdefault} PARENT_SCOPE)
  endif()
 endfunction()
 eval(WSUPPRESS "suppress;show" "show")
 if($<STREQUAL:${BUFFER_LIMIT},suppress> EQUAL 1)
  add_compile_options("${SUPPRESS_WARNINGS}")
 endif()
 eval(BARRIER_MODE "global;local" "global")
 add_definitions(-DBARRIER_MODE="${BARRIER_MODE}")
 eval(BUFFER_LIMIT "unlimited;limited" "unlimited")
 add_definitions(-DBUFFER_LIMIT=$<STREQUAL:${BUFFER_LIMIT},limited>)
 eval(QUERY "simple;complex" "simple")
 add_definitions(-DQUERY=$<STREQUAL:${QUERY},simple>)
 eval(THREAD_FACTOR "1;2;3;4;5;6;7;8;9;10" "4")
 add_definitions(-DTHREAD_GROUP_MULTIPLIER=${THREAD_FACTOR})
 eval(PINNING "cpu;numa" "cpu")
 add_definitions(-DPINNING=$<STREQUAL:${PINNING},cpu>)
 eval(PCM_M "true;false" "false")
 add_definitions(-DPCM_M=$<STREQUAL:${PCM_M},true>)
 add_definitions(${PCM_LINKS})
 # build directory
 set(CMAKE_BINARY_DIR "../bin") #relative to inside build 
 set(EXECUTABLE_OUTPUT_PATH ${CMAKE_BINARY_DIR})
 # include directories
 include_directories(src/utils)
 include_directories(src/algorithm)
 include_directories(src/algorithm/operators)
 include_directories(thirdParty/pcm/src)
 # link libraries 
 link_libraries(-lnuma -lpthread)
 # Add targets only below   
 # specify build targets
 add_executable(FilterAggregatePipeline src/benchmark/filter_aggregate_pipeline.cpp)
 add_executable(DoublyFiltered src/benchmark/doubly_filtered_agg.cpp)
 add_executable(DIMESBench src/benchmark/DIMES_benchmark.cpp)
 add_executable(DIMESCoreBench src/benchmark/DIMES_cores_benchmark.cpp)
 add_executable(MicroBench src/benchmark/micro_benchmarks.cpp)
 add_executable(MAXBench src/benchmark/MAX_benchmark.cpp
        src/benchmark/QDP_minimal.h)
 target_link_libraries(MAXBench libpcm.so)
 add_executable(LatencyBench src/benchmark/latency.cpp)
--- a/qdp_project/README.md
+++ b/qdp_project/README.md
@ -0,0 +1,3 @@
 This is a copy of the Query Driven Prefetching Repository
 https://os.inf.tu-dresden.de/repo/gitbox/andre.berthold/Query-driven_Prefetching/src/branch/qdp_minimal/code
 Original Authors: André Berthold and Anna Bartuschka
--- a/qdp_project/bench_all_dimes.sh
+++ b/qdp_project/bench_all_dimes.sh
@ -0,0 +1,10 @@
 #!bin/bash
 ../bin/DIMESBench_gus
 ../bin/DIMESBench_guc
 ../bin/DIMESBench_gls
 ../bin/DIMESBench_glc
 ../bin/DIMESBench_lus
 ../bin/DIMESBench_luc
 ../bin/DIMESBench_lls
 ../bin/DIMESBench_llc
--- a/qdp_project/bench_max.sh
+++ b/qdp_project/bench_max.sh
@ -0,0 +1,15 @@
 #!bin/bash
 current_date_time=$(date)
 echo "Benchmark start at: $current_date_time"
 ../bin/MAXBench_gcc
 cp ../results/max_q-complex_bm-global_bl-unlimited_tc-121MiB-2MiB.csv ../results/max_q-complex_bm-global_bl-unlimited_tc-121MiB-2MiB_pin_c_HBM.csv
 ../bin/MAXBench_gcn
 cp ../results/max_q-complex_bm-global_bl-unlimited_tc-121MiB-2MiB.csv ../results/max_q-complex_bm-global_bl-unlimited_tc-121MiB-2MiB_pin_n_HBM.csv
 current_date_time=$(date)
 echo "Benchmark end at: $current_date_time"
--- a/qdp_project/cmake_all_dimes.sh
+++ b/qdp_project/cmake_all_dimes.sh
@ -0,0 +1,33 @@
 #!bin/bash
 cmake -DCMAKE_BUILD_TYPE=Release -DWSUPPRESS=suppress -DBARRIER_MODE=global -DBUFFER_LIMIT=unlimited -DQUERY=simple  ..
 cmake --build . --target DIMESBench
 mv ../bin/DIMESBench ../bin/DIMESBench_gus
 cmake -DCMAKE_BUILD_TYPE=Release -DWSUPPRESS=suppress -DBARRIER_MODE=global -DBUFFER_LIMIT=unlimited -DQUERY=complex ..
 cmake --build . --target DIMESBench
 mv ../bin/DIMESBench ../bin/DIMESBench_guc
 cmake -DCMAKE_BUILD_TYPE=Release -DWSUPPRESS=suppress -DBARRIER_MODE=global -DBUFFER_LIMIT=limited   -DQUERY=simple  ..
 cmake --build . --target DIMESBench
 mv ../bin/DIMESBench ../bin/DIMESBench_gls
 cmake -DCMAKE_BUILD_TYPE=Release -DWSUPPRESS=suppress -DBARRIER_MODE=global -DBUFFER_LIMIT=limited   -DQUERY=complex ..
 cmake --build . --target DIMESBench
 mv ../bin/DIMESBench ../bin/DIMESBench_glc
 cmake -DCMAKE_BUILD_TYPE=Release -DWSUPPRESS=suppress -DBARRIER_MODE=local  -DBUFFER_LIMIT=unlimited -DQUERY=simple  ..
 cmake --build . --target DIMESBench
 mv ../bin/DIMESBench ../bin/DIMESBench_lus
 cmake -DCMAKE_BUILD_TYPE=Release -DWSUPPRESS=suppress -DBARRIER_MODE=local  -DBUFFER_LIMIT=unlimited -DQUERY=complex ..
 cmake --build . --target DIMESBench
 mv ../bin/DIMESBench ../bin/DIMESBench_luc
 cmake -DCMAKE_BUILD_TYPE=Release -DWSUPPRESS=suppress -DBARRIER_MODE=local  -DBUFFER_LIMIT=limited   -DQUERY=simple  ..
 cmake --build . --target DIMESBench
 mv ../bin/DIMESBench ../bin/DIMESBench_lls
 cmake -DCMAKE_BUILD_TYPE=Release -DWSUPPRESS=suppress -DBARRIER_MODE=local  -DBUFFER_LIMIT=limited   -DQUERY=complex ..
 cmake --build . --target DIMESBench
 mv ../bin/DIMESBench ../bin/DIMESBench_llc
--- a/qdp_project/cmake_max.sh
+++ b/qdp_project/cmake_max.sh
@ -0,0 +1,9 @@
 #!bin/bash
 cmake -DCMAKE_BUILD_TYPE=Release -DWSUPPRESS=suppress -DBARRIER_MODE=global -DBUFFER_LIMIT=unlimited -DQUERY=complex -DTHREAD_FACTOR=2 -DPINNING=cpu -DPCM_M=false ..
 cmake --build . --target MAXBench
 mv ../bin/MAXBench ../bin/MAXBench_gcc
 cmake -DCMAKE_BUILD_TYPE=Release -DWSUPPRESS=suppress -DBARRIER_MODE=global -DBUFFER_LIMIT=unlimited -DQUERY=complex -DTHREAD_FACTOR=2 -DPINNING=numa -DPCM_M=false ..
 cmake --build . --target MAXBench
 mv ../bin/MAXBench ../bin/MAXBench_gcn
--- a/qdp_project/src/.gitkeep
+++ b/qdp_project/src/.gitkeep
--- a/qdp_project/src/algorithm/operators/aggregation.h
+++ b/qdp_project/src/algorithm/operators/aggregation.h
@ -0,0 +1,316 @@
 #pragma once
 #include <cstdint>
 #include <algorithm>
 #include <immintrin.h>
 #include <type_traits>
 #include "vector_loader.h"
 #include "const.h"
 /**
 * @brief Super Class for all Aggregation functions. Guards Sub Classes from having an non integral base type.
 * 
 * @tparam T 
 */
 template <typename T>
 class AggFunction {
    static_assert(std::is_integral<T>::value, "The base type of an AggFunction must be an integral");
 };
 /**
 * @brief Template class that implements methods used for Summation. It wraps the corresponding vector intrinsics 
 * 
 * @tparam T base datatype for the implemented methods
 */
 template<typename T>
 class Sum : public AggFunction<T> {
 public:
    static inline __m512i simd_agg(__m512i aggregator, __m512i vector) {
        if      constexpr (sizeof(T) == 4) return _mm512_add_epi32(aggregator, vector);
        else if constexpr (sizeof(T) == 8) return _mm512_add_epi64(aggregator, vector);
        static_assert(sizeof(T) == 4 || sizeof(T) == 8, "Sum is only implemented for 32 and 64 wide integers");
    };
    static inline __m512i simd_agg(__m512i aggregator, __mmask16 mask, __m512i vector) {
        if      constexpr (sizeof(T) == 4) return _mm512_mask_add_epi32(aggregator, mask, aggregator, vector);
        else if constexpr (sizeof(T) == 8) return _mm512_mask_add_epi64(aggregator, mask, aggregator, vector);
        static_assert(sizeof(T) == 4 || sizeof(T) == 8, "Sum is only implemented for 32 and 64 wide integers");
    };
    static inline T simd_reduce(__m512i vector) {
        if      constexpr (sizeof(T) == 4) return _mm512_reduce_add_epi32(vector);
        else if constexpr (sizeof(T) == 8) return _mm512_reduce_add_epi64(vector);
        static_assert(sizeof(T) == 4 || sizeof(T) == 8, "Sum is only implemented for 32 and 64 wide integers");
    };
    static inline T scalar_agg(T aggregator, T scalar) { return aggregator + scalar; };
    static inline __m512i zero() { return _mm512_set1_epi32(0); };
 };
 /**
 * @brief Template class that implements methods used for Maximum determination. It wraps the corresponding vector intrinsics 
 * 
 * @tparam T base datatype for the implemented methods
 *
 */
 template<typename T>
 class Max : public AggFunction<T> {
 public:
    static inline __m512i simd_agg(__m512i aggregator, __m512i vector) {
        if      constexpr (sizeof(T) == 4) return _mm512_max_epi32(aggregator, vector);
        else if constexpr (sizeof(T) == 8) return _mm512_max_epi64(aggregator, vector);
        static_assert(sizeof(T) == 4 || sizeof(T) == 8, "Max is only implemented for 32 and 64 wide integers");
    }
    static inline __m512i simd_agg(__m512i aggregator, __mmask16 mask, __m512i vector) {
        if      constexpr (sizeof(T) == 4) return _mm512_mask_max_epi32(aggregator, mask, aggregator, vector);
        else if constexpr (sizeof(T) == 8) return _mm512_mask_max_epi64(aggregator, mask, aggregator, vector);
        static_assert(sizeof(T) == 4 || sizeof(T) == 8, "Max is only implemented for 32 and 64 wide integers"); 
    }
    static inline T simd_reduce(__m512i vector) {
        if      constexpr (sizeof(T) == 4) return _mm512_reduce_max_epi32(vector);
        else if constexpr (sizeof(T) == 8) return _mm512_reduce_max_epi64(vector);
        static_assert(sizeof(T) == 4 || sizeof(T) == 8, "Max is only implemented for 32 and 64 wide integers");
    }
    static inline T scalar_agg(T aggregator, T scalar) { return std::max(aggregator, scalar); }
    static inline __m512i zero() { 
        if constexpr (sizeof(T) == 4) {
            if constexpr (std::is_signed<T>::value) return _mm512_set1_epi32(0xFFFFFFFF);
            else                                    return _mm512_set1_epi32(0x0);
        }
        else if constexpr (sizeof(T) == 8) {
            if constexpr (std::is_signed<T>::value) return _mm512_set1_epi32(0xFFFFFFFFFFFFFFFF);
            else                                    return _mm512_set1_epi32(0x0);
        }
        static_assert(sizeof(T) == 4 || sizeof(T) == 8, "Max is only implemented for 32 and 64 wide integers");
    }
 };
 /**
 * @brief Template class that implements methods used for Minimum determination. It wraps the corresponding vector intrinsics 
 * 
 * @tparam T base datatype for the implemented methods
 *
 */
 template<typename T>
 class Min : public AggFunction<T> {
 public:
    static inline __m512i simd_agg(__m512i aggregator, __m512i vector) {
        if      constexpr (sizeof(T) == 4) return _mm512_min_epi32(aggregator, vector);
        else if constexpr (sizeof(T) == 8) return _mm512_min_epi64(aggregator, vector);
        static_assert(sizeof(T) == 4 || sizeof(T) == 8, "Min is only implemented for 32 and 64 wide integers");
    }
    static inline __m512i simd_agg(__m512i aggregator, __mmask16 mask, __m512i vector) {
        if      constexpr (sizeof(T) == 4) return _mm512_mask_min_epi32(aggregator, mask, aggregator, vector);
        else if constexpr (sizeof(T) == 8) return _mm512_mask_min_epi64(aggregator, mask, aggregator, vector);
        static_assert(sizeof(T) == 4 || sizeof(T) == 8, "Min is only implemented for 32 and 64 wide integers");
    }
    static inline T simd_reduce(__m512i vector) {
        if      constexpr (sizeof(T) == 4) return _mm512_reduce_min_epi32(vector);
        else if constexpr (sizeof(T) == 8) return _mm512_reduce_min_epi64(vector);
        static_assert(sizeof(T) == 4 || sizeof(T) == 8, "Min is only implemented for 32 and 64 wide integers");
    }
    static inline T scalar_agg(T aggregator, T scalar) { return std::min(aggregator, scalar); }
    static inline __m512i zero() { 
        if constexpr (sizeof(T) == 4) {
            if constexpr (std::is_signed<T>::value) return _mm512_set1_epi32(0xEFFFFFFF);
            else                                    return _mm512_set1_epi32(0xFFFFFFFF);
        }
        else if constexpr (sizeof(T) == 8) {
            if constexpr (std::is_signed<T>::value) return _mm512_set1_epi32(0xEFFFFFFFFFFFFFFF);
            else                                    return _mm512_set1_epi32(0xFFFFFFFFFFFFFFFF);
        }
        static_assert(sizeof(T) == 4 || sizeof(T) == 8, "Min is only implemented for 32 and 64 wide integers");
    }
 };
 /**
 * @brief Template Class that implements an aggregation operation.
 * 
 * @tparam base_t Base type of the values for aggregation
 * @tparam func 
 * @tparam load_mode 
 */
 template<typename base_t, template<typename _base_t> class func, load_mode load_mode> 
 class Aggregation{
 public:
    static_assert(std::is_same_v<base_t, uint64_t>, "Enforce unsigned 64 bit ints.");
    using OP = func<base_t>;
    /**
     * @brief Calculates the memory maximal needed to store a chunk's processing result.
     * 
     * @param chunk_size_b Size of the chunk in byte
     * @return size_t Size of the chunk's processing result in byte
     */
    static size_t result_bytes_per_chunk(size_t chunk_size_b) {
        // aggregation returns a single value of type base_t
        return sizeof(base_t);
    }
    /**
     * @brief Applies the aggregation function on the chunk starting at *src* and spanning *chunk_size_b* bytes. 
     * The result is written to main memory.
     * 
     * @param dest Pointer to the start of the result chunk
     * @param src Pointer to the start of the source chunk
     * @param chunk_size_b Size of the source chunk in bytes
     * @return true When the aggregation is done
     * @return false Never
     */
    static bool apply (base_t *dest, base_t *src, size_t chunk_size_b) {
        constexpr size_t lanes = VECTOR_SIZE<base_t>();
        size_t value_count = chunk_size_b / sizeof(base_t);
        __m512i agg_vec = func<base_t>::zero();
        size_t i = 0;
        base_t result = 0;
        // stop before! running out of space
        if(value_count >= lanes) {// keep in mind value_count is unsigned so if it becomes negative, it doesn't.
            for(; i <= value_count - lanes; i += lanes) {
                __m512i vec = Vector_Loader<base_t, load_mode>::load(src + i);
                agg_vec = func<base_t>::simd_agg(agg_vec, vec);
            }
            result = func<base_t>::simd_reduce(agg_vec);
        }
        for(; i < value_count; ++i) {
            result = func<base_t>::scalar_agg(result, src[i]);
        }
        *dest = result;
        return true;
    }
    /**
     * @brief Applies the aggregation function on the chunk starting at *src* and spanning *chunk_size_b* bytes, 
     * while applying the bit string stored in *masks*. The result is written to main memory.
     * 
     * @param dest Pointer to the start of the result chunk
     * @param src Pointer to the start of the source chunk
     * @param masks Pointer the bitstring that marks the values that should be aggregated
     * @param chunk_size_b Size of the source chunk in bytes
     * @return true When the aggregation is done
     * @return false Never
     */
    static bool apply_masked (base_t *dest, base_t *src, uint16_t* msks, size_t chunk_size_b) {
        constexpr size_t lanes = VECTOR_SIZE<base_t>();
        uint8_t* masks = (uint8_t *)msks;
        size_t value_count = chunk_size_b / sizeof(base_t);
        __m512i agg_vec = func<base_t>::zero();
        size_t i = 0;
        // stop before! running out of space
        if(value_count >= lanes) // keep in mind size_w is unsigned so if it becomes negative, it doesn't.
        for(; i <= value_count - lanes; i += lanes) {
            __m512i vec = Vector_Loader<base_t, load_mode>::load(src + i);
            __mmask8 mask = _mm512_int2mask(masks[i / lanes]);
            agg_vec = func<base_t>::simd_mask_agg(agg_vec, mask, vec);        
        }
        *dest = func<base_t>::simd_reduce(agg_vec);
        for(; i < value_count; ++i) {
            uint8_t mask = masks[i / lanes];
            if(mask & (0b1 << (i % lanes))){
                *dest = func<base_t>::scalar_agg(*dest, src[i]);
            }
        }
        return true;
    }
    /**
     * @brief Applies the aggregation function on the chunk starting at *src* and spanning *chunk_size_b* bytes, 
     * while applying the bit string stored in *masks*. The values are agggegated in the register *dest* without 
     * clearing beforehand. 
     *
     * NOTE! This function only works correctly if the the chunk_size_b is a multiple of 64 byte
     * 
     * @param dest Vector register used for storing and passing the result around
     * @param src Pointer to the start of the source chunk
     * @param masks Pointer the bitstring that marks the values that should be aggregated
     * @param chunk_size_b Size of the source chunk in bytes
     * @return __m512i Vector register holding the aggregation result
     */
    static __m512i apply_masked (__m512i dest, base_t *src, uint16_t* msks, size_t chunk_size_b) {
        constexpr size_t lanes = VECTOR_SIZE<base_t>();
        uint8_t* masks = (uint8_t*) msks;
        //TODO this function does not work if value_count % lanes != 0
        size_t value_count = chunk_size_b / sizeof(base_t);
        size_t i = 0;
        // stop before! running out of space
        if(value_count >= lanes) // keep in mind size_w is unsigned so if it becomes negative, it doesn't.
        for(; i <= value_count - lanes; i += lanes) {
            __m512i vec = Vector_Loader<base_t, load_mode>::load(src + i);
            __mmask8 mask = _mm512_int2mask(masks[i / lanes]);
            dest = func<base_t>::simd_agg(dest, mask, vec);
        }
        return dest;
    }
    /**
     * @brief Applies the aggregation function on the chunk starting at *src* and spanning *chunk_size_b* bytes, 
     * while applying two bit strings stored in *masks_0* and *masks_1*. The values are aggregated in the register 
     * *dest* without clearing beforehand. 
     *
     * NOTE! This function only works correctly if the the chunk_size_b is a multiple of 64 byte
     * 
     * @param dest Vector register used for storing and passing the result around
     * @param src Pointer to the start of the source chunk
     * @param masks_0 Pointer the bitstring that marks the values that should be aggregated
     * @param masks_1 Pointer the bitstring that marks the values that should be aggregated
     * @param chunk_size_b Size of the source chunk in bytes
     * @return __m512i Vector register holding the aggregation result
     */
    static __m512i apply_masked (__m512i dest, base_t *src, uint16_t* msks0, uint16_t* msks1, size_t chunk_size_b) {
        constexpr size_t lanes = VECTOR_SIZE<base_t>();
        uint8_t* masks0 = (uint8_t*) msks0; 
        uint8_t* masks1 = (uint8_t*) msks1; 
        //TODO this function does not work if value_count % lanes != 0
        size_t value_count = chunk_size_b / sizeof(base_t);
        size_t i = 0;
        // stop before! running out of space
        if(value_count >= lanes) // keep in mind value_count is unsigned so if it becomes negative, it doesn't.
        for(; i <= value_count - lanes; i += lanes) {
            __m512i vec = Vector_Loader<base_t, load_mode>::load(src + i);
            __mmask8 mask0 = _mm512_int2mask(masks0[i / lanes]);
            __mmask8 mask1 = _mm512_int2mask(masks1[i / lanes]);
            mask0 = _kand_mask8(mask0, mask1);
            dest = func<base_t>::simd_agg(dest, mask0, vec);
        }
        return dest;
    }
    /**
     * @brief Reduces a vector by applying the aggregation function horizontally.
     * 
     * @param dest Result of the horizontal aggregation
     * @param src Vector as source for the horizontal aggregation
     * @return true When the operation is done
     * @return false Never
     */
    static bool happly (base_t *dest, __m512i src) {
        *dest = func<base_t>::simd_reduce(src);
        return true;
    }
    static __m512i get_zero() {
        return func<base_t>::zero();
    }
 };
--- a/qdp_project/src/algorithm/operators/filter.h
+++ b/qdp_project/src/algorithm/operators/filter.h
@ -0,0 +1,170 @@
 #pragma once
 #include<cstdint>
 #include<type_traits>
 #include <immintrin.h>
 #include "vector_loader.h"
 /**
 * @brief Super Class for all Aggregation functions. Guards Sub Classes from having an non integral base type.
 * 
 * @tparam T An integral datatype
 */
 template<typename T>
 class FilterFunction {
    static_assert(std::is_integral<T>::value, "The base type of a FilterFunction must be an integeral.");
 };
 /**
 * @brief Template class that implements methods used for finding values that are not equal to the compare value. 
 * It wraps the corresponding vector intrinsics.
 * 
 * @tparam T base datatype for the implemented methods
 */
 template<typename T>
 class NEQ : public FilterFunction<T> {
 public:
    static inline __mmask16 simd_filter(__m512i vector, __m512i comp) {
        if      constexpr (sizeof(T) == 4) return _mm512_cmpneq_epi32_mask(vector, comp);
        else if constexpr (sizeof(T) == 8) return _mm512_cmpneq_epi64_mask(vector, comp);
        static_assert(sizeof(T) == 4 || sizeof(T) == 8, "NEQ is only implemented for 32 and 64 wide integers");
    }
    static inline bool scalar_filter(T scalar, T comp) { return scalar != comp; }
 };
 template<typename T>
 class EQ : public FilterFunction<T> {
 public:
    static inline __mmask16 simd_filter(__m512i vector, __m512i comp) {
        if      constexpr (sizeof(T) == 4) return _mm512_cmpeq_epi32_mask(vector, comp);
        else if constexpr (sizeof(T) == 8) return _mm512_cmpeq_epi64_mask(vector, comp);
        static_assert(sizeof(T) == 4 || sizeof(T) == 8, "EQ is only implemented for 32 and 64 wide integers");
    }
    static inline bool scalar_filter(T scalar, T comp) { return scalar == comp; }
 };
 template<typename T>
 class LT : public FilterFunction<T> {
 public:
    static inline __mmask16 simd_filter(__m512i vector, __m512i comp) {
        if      constexpr (sizeof(T) == 4) return _mm512_cmplt_epi32_mask(vector, comp);
        else if constexpr (sizeof(T) == 8) return _mm512_cmplt_epi64_mask(vector, comp);
        static_assert(sizeof(T) == 4 || sizeof(T) == 8, "LT is only implemented for 32 and 64 wide integers");
    }
    static inline bool scalar_filter(T scalar, T comp) { return scalar < comp; }
 };
 template<typename T>
 class LEQ : public FilterFunction<T> {
 public:
    static inline __mmask16 simd_filter(__m512i vector, __m512i comp) {
        if      constexpr (sizeof(T) == 4) return _mm512_cmple_epi32_mask(vector, comp);
        else if constexpr (sizeof(T) == 8) return _mm512_cmple_epi64_mask(vector, comp);
        static_assert(sizeof(T) == 4 || sizeof(T) == 8, "LEQ is only implemented for 32 and 64 wide integers");
    }
    static inline bool scalar_filter(T scalar, T comp) { return scalar <= comp; }
 };
 template<typename T>
 class GT : public FilterFunction<T> {
 public:
    static inline __mmask16 simd_filter(__m512i vector, __m512i comp) {
        if      constexpr (sizeof(T) == 4) return _mm512_cmpgt_epi32_mask(vector, comp);
        else if constexpr (sizeof(T) == 8) return _mm512_cmpgt_epi64_mask(vector, comp);
        static_assert(sizeof(T) == 4 || sizeof(T) == 8, "GT is only implemented for 32 and 64 wide integers");
    }
    static inline bool scalar_filter(T scalar, T comp) { return scalar > comp; }
 };
 template<typename T>
 class GEQ : public FilterFunction<T> {
 public:
    static inline __mmask16 simd_filter(__m512i vector, __m512i comp) {
        if      constexpr (sizeof(T) == 4) return _mm512_cmpge_epi32_mask(vector, comp);
        else if constexpr (sizeof(T) == 8) return _mm512_cmpge_epi64_mask(vector, comp);
        static_assert(sizeof(T) == 4 || sizeof(T) == 8, "GEQ is only implemented for 32 and 64 wide integers");
    }
    static inline bool scalar_filter(T scalar, T comp) { return scalar >= comp; }
 };
 template<typename base_t, template<typename _base_t> class func, load_mode load_mode, bool copy>
 class Filter {
 public:
    static_assert(std::is_same_v<base_t, uint64_t>, "We enforce 64 bit integer");
    /**
     * @brief Calculates the memory maximal needed to store a chunk's processing result.
     * 
     * @param chunk_size_b Size of the chunk in byte
     * @return size_t Size of the chunk's processing result in byte
     */
    static size_t result_bytes_per_chunk(size_t chunk_size_b) {
        // + 7 to enshure that we have enougth bytes -> / 8 -> rounds down 
        // if we had 17 / 8 = 2  but (17 + 7) / 8 = 3
        // if we hat 16 / 8 = 2 is right, as well as, 16 + 7 / 8 = 2
        return (chunk_size_b / sizeof(base_t) + 7) / 8;
    }
    /**
     * @brief Applies the filter function on the chunk starting at *src* and spanning *chunk_size_b* bytes, while comparing with he same value every time. 
     * The resulting bit string is written to main memory.
     * 
     * @param dest Pointer to the start of the result chunk
     * @param src Pointer to the start of the source chunk
     * @param cmp_value Comparision value to compare the values from source to
     * @param chunk_size_b Size of the source chunk in bytes
     * @return true When the filter operation is done
     * @return false Never
     */
    // we only need this impl. yet, as all filter are at the end of a pipeline
    static bool apply_same (uint16_t *dst, base_t *buffer, base_t *src, base_t cmp_value, size_t chunk_size_b) {
        constexpr uint32_t lanes = VECTOR_SIZE<base_t>();
        uint8_t* dest = (uint8_t*) dst;
        size_t value_count = chunk_size_b / sizeof(base_t);
        __m512i cmp_vec = _mm512_set1_epi64(cmp_value);
        size_t i = 0;
        // this weird implementetion is neccessary, see analogous impl in aggregation for explaination
        if(value_count > lanes) {
            for(; (i < value_count - lanes); i += lanes) {
                __m512i vec = Vector_Loader<base_t, load_mode>::load(src + i);
                __mmask8 bitmask = func<base_t>::simd_filter(vec, cmp_vec);
                uint8_t int_mask = (uint8_t) _mm512_mask2int(bitmask);
                dest[i / lanes] = int_mask;
                if constexpr(copy){
                    Vector_Loader<base_t, load_mode>::store(buffer + i, vec);
                }
            }
        }
        auto dest_pos = i / lanes;
        uint8_t int_mask = 0;
        for(; i < value_count; ++i) {
            base_t val = src[i];
            uint8_t result = func<base_t>::scalar_filter(val, cmp_value);
            int_mask |= (result << (i % lanes));
            if constexpr(copy){
                buffer[i] = val;
            }
        }
        dest[dest_pos] = int_mask;
        return true;
    }
 };
--- a/qdp_project/src/benchmark/DIMES_benchmark.cpp
+++ b/qdp_project/src/benchmark/DIMES_benchmark.cpp
@ -0,0 +1,240 @@
 #include <atomic>
 #include <barrier>
 #include <chrono>
 #include <condition_variable>
 #include <cstdlib>
 #include <cstring>
 #include <fstream>
 #include <future>
 #include <iostream>
 #include <limits>
 #include <list>
 #include <mutex>
 #include <queue>
 #include <thread>
 #include <tuple>
 #include <utility>
 #include <numa.h>
 #ifndef THREAD_GROUP_MULTIPLIER
 #define THREAD_GROUP_MULTIPLIER 8
 #endif
 #ifndef QUERY
 #define QUERY 1
 #endif
 #ifndef BARRIER_MODE
 #define BARRIER_MODE "global"
 #endif
 #ifndef BUFFER_LIMIT
 #define BUFFER_LIMIT 1
 #endif
 #include "const.h"
 #include "file_output.h"
 #include "array_utils.h"
 #include "timer_utils.h"
 #include "barrier_utils.h"
 #include "cpu_set_utils.h"
 #include "iterable_range.h"
 #include "memory_literals.h"
 #include "pipelines/DIMES_scan_filter_pipe.h"
 #include "aggregation.h"
 #include "filter.h"
 using base_t = uint64_t;
 base_t sum_check(base_t compare_value, base_t* row_A, base_t* row_B, size_t row_size) {
    base_t sum = 0;
    for(int i = 0; i < row_size / sizeof(base_t); ++i) {
        sum += (row_A[i] < compare_value) * row_B[i];
    }
    return sum;
 }
 base_t sum_check_complex(base_t compare_value_a, base_t compare_value_b, base_t* row_A, base_t* row_B, size_t row_size) {
    base_t sum = 0;
    for(int i = 0; i < row_size / sizeof(base_t); ++i) {
        sum += (row_A[i] < compare_value_a && row_B[i] < compare_value_b) * row_B[i];
    }
    return sum;
 }
 int main(int argc, char** argv) {
    // set constants
    const size_t workload_b = 4_GiB;
    const base_t compare_value_a = 50;
    const base_t compare_value_b = 42;
    constexpr bool simple_query = (QUERY == 1);
    const size_t thread_count = 6;
    std::ofstream out_file;
    out_file.open("../results/dimes_" 
            "q-" + (std::string)(simple_query == true ? "simple" : "complex") + 
            "_bm-" + (std::string) BARRIER_MODE + 
            "_bl-" + (std::string)(BUFFER_LIMIT == 1 ? "limited" : "unlimited") + 
            "_tc-" + std::to_string(thread_count * THREAD_GROUP_MULTIPLIER) + ".csv");
    // set benchmark parameter 
    Linear_Int_Range<uint32_t, 0, 10, 1> run("run");
    Exp_Int_Range<size_t, 1_MiB, 8_MiB + 1, 2> chunk_size("chunk_size");
    Range<NewPMode, DRAM_base, new_mode_manager, new_mode_manager> mode("mode");
    uint32_t remote_node   = 3;
    uint32_t remote_node_2 = 2;
    uint32_t local_node    = 10; 
    print_to_file(out_file, generateHead(run, chunk_size, mode), "thread_group", "time",
    #ifdef THREAD_TIMINGS
        "scan_a", "scan_b", "aggr_j",
    #endif
    #ifdef BARRIER_TIMINGS
        "wait_scan_a", "wait_scan_b", "wait_aggr_j",
    #endif
        "result");
    /*** alloc data and buffers ************************************************/
    base_t* data_a     = (base_t*) numa_alloc_onnode(workload_b, remote_node);
    base_t* data_b     = (base_t*) numa_alloc_onnode(workload_b, remote_node_2);
    base_t* data_a_hbm = (base_t*) numa_alloc_onnode(workload_b, local_node);
    base_t* data_b_hbm = (base_t*) numa_alloc_onnode(workload_b, local_node);
    fill_mt<base_t>(data_a, workload_b, 0, 100, 42);
    fill_mt<base_t>(data_b, workload_b, 0, 100, 420);
    std::memcpy(data_a_hbm, data_a, workload_b);
    std::memcpy(data_b_hbm, data_b, workload_b);
    base_t* results = (base_t*) numa_alloc_onnode(THREAD_GROUP_MULTIPLIER * thread_count * sizeof(base_t), remote_node);
    std::ofstream check_file;
    check_file.open("../results/dimes_" 
            "q-" + (std::string)(simple_query == true ? "simple" : "complex") + 
            "_bm-" + (std::string) BARRIER_MODE + 
            "_bl-" + (std::string)(BUFFER_LIMIT == 1 ? "limited" : "unlimited") + 
            "_tc-" + std::to_string(thread_count * THREAD_GROUP_MULTIPLIER) + ".checksum");
    if constexpr (QUERY == 1) {
        //calculate simple checksum if QUERY == 1 -> simple query is applied
        check_file << sum_check(compare_value_a, data_a, data_b, workload_b);
    } else {
        check_file << sum_check_complex(compare_value_a, compare_value_b, data_a, data_b, workload_b);
    }
    check_file.close();
    std::string iteration("init");
    Query_Wrapper<base_t, simple_query>* qw = nullptr;
    while(iteration != "false") {
        std::promise<void> p;
        std::shared_future<void> ready_future(p.get_future());
        if(iteration != "run") {
            if(qw != nullptr) {
                delete qw;
            }
            std::cout << "Changing to mode " << mode.current << " chunksize " << chunk_size.current << std::endl;
            uint8_t tc_filter = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, SCAN_A);
            uint8_t tc_copy   = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, SCAN_B);
            uint8_t tc_agg    = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, AGGR_J);
            switch(mode.current) {
            case NewPMode::DRAM_base:
                qw = new Query_Wrapper<base_t, simple_query>(&ready_future, workload_b, chunk_size.current, data_a, data_b, results, local_node, remote_node, 
                    tc_filter, tc_copy, tc_agg, mode.current, THREAD_GROUP_MULTIPLIER, (base_t) 50, (base_t) 42, true);
                    break;
            case NewPMode::HBM_base:
                qw = new Query_Wrapper<base_t, simple_query>(&ready_future, workload_b, chunk_size.current, data_a_hbm, data_b_hbm, results, local_node, remote_node, 
                    tc_filter, tc_copy, tc_agg, mode.current, THREAD_GROUP_MULTIPLIER, (base_t) 50, (base_t) 42, true);
                    break;
            case NewPMode::Mixed_base:
                qw = new Query_Wrapper<base_t, simple_query>(&ready_future, workload_b, chunk_size.current, data_a, data_b_hbm, results, local_node, remote_node, 
                    tc_filter, tc_copy, tc_agg, mode.current, THREAD_GROUP_MULTIPLIER, (base_t) 50, (base_t) 42, true);
                    break;
            case NewPMode::Prefetch: 
                qw = new Query_Wrapper<base_t, simple_query>(&ready_future, workload_b, chunk_size.current, data_a, data_b, results, local_node, remote_node, 
                    tc_filter, tc_copy, tc_agg, mode.current, THREAD_GROUP_MULTIPLIER, (base_t) 50, (base_t) 42, false);
                    break;
            }             
        }
        qw->ready_future = &ready_future;
        qw->clear_buffers();
        auto filter_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_a(gid, gcnt, tid); };
        auto copy_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_b(gid, gcnt, tid); };
        auto aggregation_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->aggr_j(gid, gcnt, tid); };            
        std::vector<std::thread> filter_pool;
        std::vector<std::thread>   copy_pool;
        std::vector<std::thread>    agg_pool;
        uint8_t tc_filter = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, SCAN_A);
        uint8_t tc_copy   = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, SCAN_B);
        uint8_t tc_agg    = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, AGGR_J);
        int thread_id = 0;
        // std::vector<std::pair<int, int>> pinning_ranges {std::make_pair(28, 42), std::make_pair(84, 98)}; // node 2 heacboehm II
        //std::vector<std::pair<int, int>> pinning_ranges {std::make_pair(32, 48), std::make_pair(96, 112)}; // node 2 heacboehm
        //std::vector<std::pair<int, int>> pinning_ranges {std::make_pair(24, 36), std::make_pair(120, 132)}; // node 2 sapphire rapids
        //std::vector<std::pair<int, int>> pinning_ranges {std::make_pair(24, 48)}; // node 2+3 sapphire rapids
        std::vector<std::pair<int, int>> pinning_ranges {std::make_pair(0, 48)}; // node 0-3 sapphire rapids
        for(uint32_t gid = 0; gid < THREAD_GROUP_MULTIPLIER; ++gid) {
            for(uint32_t tid = 0; tid < tc_filter; ++tid) {
                filter_pool.emplace_back(filter_lambda, gid, THREAD_GROUP_MULTIPLIER, tid);
                pin_thread_in_range(filter_pool.back(), thread_id++, pinning_ranges);
            }
            // if tc_copy == 0 this loop is skipped
            for(uint32_t tid = 0; tid < tc_copy; ++tid) {
                copy_pool.emplace_back(copy_lambda, gid, THREAD_GROUP_MULTIPLIER, tid);
                pin_thread_in_range(copy_pool.back(), thread_id++, pinning_ranges);
            }
            for(uint32_t tid = 0; tid < tc_agg; ++tid) {
                agg_pool.emplace_back(aggregation_lambda, gid, THREAD_GROUP_MULTIPLIER, tid);
                pin_thread_in_range(agg_pool.back(), thread_id++, pinning_ranges);
            }
        }
        auto start = std::chrono::steady_clock::now();
        p.set_value();
        for(std::thread& t : filter_pool) { t.join(); }
        for(std::thread& t :   copy_pool) { t.join(); }
        for(std::thread& t :    agg_pool) { t.join(); }
        Aggregation<base_t, Sum, load_mode::Aligned>::apply(results, results, sizeof(base_t) * tc_agg * THREAD_GROUP_MULTIPLIER);
        auto end = std::chrono::steady_clock::now();
        constexpr double nanos_per_second = ((double)1000) * 1000 * 1000;
        uint64_t nanos   = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count();
        double seconds   = (double)(nanos) / nanos_per_second;
        print_to_file(out_file, run, chunk_size, new_mode_manager::string(mode.current), THREAD_GROUP_MULTIPLIER, seconds,
        #ifdef THREAD_TIMINGS
                qw->trt->summarize_time(0),  qw->trt->summarize_time(1),  qw->trt->summarize_time(2), 
        #endif
        #ifdef BARRIER_TIMINGS
                qw->bt->summarize_time(0),  qw->bt->summarize_time(1),  qw->bt->summarize_time(2),
        #endif
                results[0]);
        iteration = IterateOnce(run, chunk_size, mode);
    }
    numa_free(data_b_hbm, workload_b);
    numa_free(data_a, workload_b);
    numa_free(data_b, workload_b);
    numa_free(results, THREAD_GROUP_MULTIPLIER * thread_count * sizeof(base_t));
 }
--- a/qdp_project/src/benchmark/DIMES_cores_benchmark.cpp
+++ b/qdp_project/src/benchmark/DIMES_cores_benchmark.cpp
@ -0,0 +1,260 @@
 #include <atomic>
 #include <barrier>
 #include <chrono>
 #include <condition_variable>
 #include <cstdlib>
 #include <cstring>
 #include <fstream>
 #include <future>
 #include <iostream>
 #include <limits>
 #include <list>
 #include <mutex>
 #include <queue>
 #include <thread>
 #include <tuple>
 #include <utility>
 #include <numa.h>
 #ifndef QUERY
 #define QUERY 1
 #endif
 #ifndef BARRIER_MODE
 #define BARRIER_MODE "global"
 #endif
 #define BUFFER_LIMIT 0
 #include "const.h"
 #include "file_output.h"
 #include "array_utils.h"
 #include "timer_utils.h"
 #include "barrier_utils.h"
 #include "cpu_set_utils.h"
 #include "iterable_range.h"
 #include "memory_literals.h"
 #include "pipelines/DIMES_scan_filter_pipe.h"
 #include "aggregation.h"
 #include "filter.h"
 using base_t = uint64_t;
 base_t sum_check(base_t compare_value, base_t* row_A, base_t* row_B, size_t row_size) {
    base_t sum = 0;
    for(int i = 0; i < row_size / sizeof(base_t); ++i) {
        sum += (row_A[i] < compare_value) * row_B[i];
    }
    return sum;
 }
 base_t sum_check_complex(base_t compare_value_a, base_t compare_value_b, base_t* row_A, base_t* row_B, size_t row_size) {
    base_t sum = 0;
    for(int i = 0; i < row_size / sizeof(base_t); ++i) {
        sum += (row_A[i] < compare_value_a && row_B[i] < compare_value_b) * row_B[i];
    }
    return sum;
 }
 int main(int argc, char** argv) {
    // set constants
    const size_t workload_b = 4_GiB;
    const size_t chunk_size = 2_MiB;
    const base_t compare_value_a = 50;
    const base_t compare_value_b = 42;
    constexpr bool simple_query = (QUERY == 1);
    std::ofstream out_file;
    out_file.open("../results/dimes_cores_" 
            "q-" + (std::string)(simple_query == true ? "simple" : "complex") + 
            "_bm-" + (std::string) BARRIER_MODE + 
            "_bl-" + (std::string)(BUFFER_LIMIT == 1 ? "limited" : "unlimited") + 
            ".csv");
    // set benchmark parameter 
    Linear_Int_Range<uint32_t, 0, 3, 1> run("run");
    Exp_Int_Range<uint32_t, 1, 4+1, 2> scan_a_thread("scan_a_tc");
    Exp_Int_Range<uint32_t, 1, 4+1, 2> scan_b_thread("scan_b_tc");
    Exp_Int_Range<uint32_t, 1, 4+1, 2> aggr_j_thread("aggr_j_tc");
    Linear_Int_Range<uint32_t, 1, 16+1, 1> thread_group_count("thread_group_c");
    Range<NewPMode, DRAM_base, new_mode_manager, new_mode_manager> mode("mode");
    uint32_t remote_node   = 1;
    uint32_t remote_node_2 = 0;//on heacboehm II: node 0 is two hops away from node 2 -> prefetching is more beneficial
    uint32_t local_node    = 2; 
    print_to_file(out_file, generateHead(run, thread_group_count, mode, scan_a_thread, scan_b_thread, aggr_j_thread), 
        "time",
    #ifdef THREAD_TIMINGS
        "scan_a", "scan_b", "aggr_j",
    #endif
    #ifdef BARRIER_TIMINGS
        "wait_scan_a", "wait_scan_b", "wait_aggr_j",
    #endif
        "result");
    /*** alloc data and buffers ************************************************/
    base_t* data_a     = (base_t*) numa_alloc_onnode(workload_b, remote_node);
    base_t* data_b     = (base_t*) numa_alloc_onnode(workload_b, remote_node_2);
    base_t* data_a_hbm = (base_t*) numa_alloc_onnode(workload_b, local_node);
    base_t* data_b_hbm = (base_t*) numa_alloc_onnode(workload_b, local_node);
    fill_mt<base_t>(data_a, workload_b, 0, 100, 42);
    fill_mt<base_t>(data_b, workload_b, 0, 100, 420);
    std::memcpy(data_a_hbm, data_a, workload_b);
    std::memcpy(data_b_hbm, data_b, workload_b);
    base_t* results = (base_t*) numa_alloc_onnode(thread_group_count.max * aggr_j_thread.max * sizeof(base_t), remote_node);
    std::ofstream check_file;
    check_file.open("../results/dimes_cores_" 
            "q-" + (std::string)(simple_query == true ? "simple" : "complex") + 
            "_bm-" + (std::string) BARRIER_MODE + 
            "_bl-" + (std::string)(BUFFER_LIMIT == 1 ? "limited" : "unlimited") + 
            ".checksum");
    if constexpr (QUERY == 1) {
        //calculate simple checksum if QUERY == 1 -> simple query is applied
        check_file << sum_check(compare_value_a, data_a, data_b, workload_b);
    } else {
        check_file << sum_check_complex(compare_value_a, compare_value_b, data_a, data_b, workload_b);
    }
    check_file.close();
    std::string iteration("init");
    Query_Wrapper<base_t, simple_query>* qw = nullptr;
    while(iteration != "false") {
        std::promise<void> p;
        std::shared_future<void> ready_future(p.get_future());
        // skipping iteration through scan_b_thread while not used
        while(simple_query && mode.current != NewPMode::Prefetch && scan_b_thread.current != 1) {
                iteration = IterateOnce(run, thread_group_count, mode, scan_a_thread, scan_b_thread, aggr_j_thread);
        }
        if(iteration != "run") {
            std::cout << "Changing to mode " << mode.current
                    << " thread_group_count " << thread_group_count.current 
                    << " thread_ratio " << scan_a_thread.current <<":"<< scan_b_thread.current <<":"<< aggr_j_thread.current
                    << std::endl;
            if(qw != nullptr) {
                if (iteration == thread_group_count.label) {
                } else {
                    delete qw;
                    uint32_t sat = scan_a_thread.current;
                    uint32_t sbt = simple_query && mode.current != NewPMode::Prefetch ?  0 : scan_b_thread.current;
                    uint32_t ajt = aggr_j_thread.current;
                    switch(mode.current) {
                    case NewPMode::DRAM_base:
                        qw = new Query_Wrapper<base_t, simple_query>(&ready_future, workload_b, chunk_size, data_a, data_b, results, local_node, remote_node, 
                            sat, sbt, ajt, mode.current, thread_group_count.current, (base_t) 50, (base_t) 42, true);
                            break;
                    case NewPMode::HBM_base:
                        qw = new Query_Wrapper<base_t, simple_query>(&ready_future, workload_b, chunk_size, data_a_hbm, data_b_hbm, results, local_node, remote_node, 
                            sat, sbt, ajt, mode.current, thread_group_count.current, (base_t) 50, (base_t) 42, true);
                            break;
                    case NewPMode::Mixed_base:
                        qw = new Query_Wrapper<base_t, simple_query>(&ready_future, workload_b, chunk_size, data_a, data_b_hbm, results, local_node, remote_node, 
                            sat, sbt, ajt, mode.current, thread_group_count.current, (base_t) 50, (base_t) 42, true);
                            break;
                    case NewPMode::Prefetch: 
                        qw = new Query_Wrapper<base_t, simple_query>(&ready_future, workload_b, chunk_size, data_a, data_b, results, local_node, remote_node, 
                            sat, sbt, ajt, mode.current, thread_group_count.current, (base_t) 50, (base_t) 42, false);
                            break;
                    }
                }
            }
        }
        qw->ready_future = &ready_future;
        qw->clear_buffers();
        auto filter_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_a(gid, gcnt, tid); };
        auto copy_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_b(gid, gcnt, tid); };
        auto aggregation_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->aggr_j(gid, gcnt, tid); };            
        std::vector<std::thread> filter_pool;
        std::vector<std::thread>   copy_pool;
        std::vector<std::thread>    agg_pool;
        uint8_t tc_filter = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, SCAN_A);
        uint8_t tc_copy   = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, SCAN_B);
        uint8_t tc_agg    = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, AGGR_J);
        int thread_id = 0;
        // std::vector<std::pair<int, int>> pinning_ranges {std::make_pair(28, 42), std::make_pair(84, 98)}; // node 2 heacboehm II
        std::vector<std::pair<int, int>> pinning_ranges {std::make_pair(32, 48), std::make_pair(96, 112)}; // node 2 heacboehm
        for(uint32_t gid = 0; gid < thread_group_count.current; ++gid) {
            for(uint32_t tid = 0; tid < tc_filter; ++tid) {
                filter_pool.emplace_back(filter_lambda, gid, thread_group_count.current, tid);
                pin_thread_in_range(filter_pool.back(), thread_id++, pinning_ranges);
            }
            // if tc_copy == 0 this loop is skipped
            for(uint32_t tid = 0; tid < tc_copy; ++tid) {
                copy_pool.emplace_back(copy_lambda, gid, thread_group_count.current, tid);
                pin_thread_in_range(copy_pool.back(), thread_id++, pinning_ranges);
            }
            for(uint32_t tid = 0; tid < tc_agg; ++tid) {
                agg_pool.emplace_back(aggregation_lambda, gid, thread_group_count.current, tid);
                pin_thread_in_range(agg_pool.back(), thread_id++, pinning_ranges);
            }
        }
        auto start = std::chrono::steady_clock::now();
        p.set_value();
        for(std::thread& t : filter_pool) { t.join(); }
        for(std::thread& t :   copy_pool) { t.join(); }
        for(std::thread& t :    agg_pool) { t.join(); }
        Aggregation<base_t, Sum, load_mode::Aligned>::apply(results, results, sizeof(base_t) * tc_agg * thread_group_count.current);
        auto end = std::chrono::steady_clock::now();
        constexpr double nanos_per_second = ((double)1000) * 1000 * 1000;
        uint64_t nanos   = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count();
        double seconds   = (double)(nanos) / nanos_per_second;
 print_to_file(out_file, generateHead(run, thread_group_count, mode, scan_a_thread, scan_b_thread, aggr_j_thread), 
        "time",
    #ifdef THREAD_TIMINGS
        "scan_a", "scan_b", "aggr_j",
    #endif
    #ifdef BARRIER_TIMINGS
        "wait_scan_a", "wait_scan_b", "wait_aggr_j",
    #endif
        "result");
        print_to_file(out_file, run, thread_group_count.current, new_mode_manager::string(mode.current), scan_a_thread, 
                (simple_query && mode.current != NewPMode::Prefetch ?  0 : scan_b_thread.current), 
                aggr_j_thread, seconds,
        #ifdef THREAD_TIMINGS
                qw->trt->summarize_time(0),  qw->trt->summarize_time(1),  qw->trt->summarize_time(2), 
        #endif
        #ifdef BARRIER_TIMINGS
                qw->bt->summarize_time(0),  qw->bt->summarize_time(1),  qw->bt->summarize_time(2),
        #endif
                results[0]);
        iteration = IterateOnce(run, thread_group_count, mode, scan_a_thread, scan_b_thread, aggr_j_thread);
    }
    numa_free(data_b_hbm, workload_b);
    numa_free(data_a, workload_b);
    numa_free(data_b, workload_b);
    numa_free(results, thread_group_count.max * aggr_j_thread.max * sizeof(base_t));
 }
--- a/qdp_project/src/benchmark/MAX_benchmark.cpp
+++ b/qdp_project/src/benchmark/MAX_benchmark.cpp
@ -0,0 +1,289 @@
 #include <atomic>
 #include <barrier>
 #include <chrono>
 #include <condition_variable>
 #include <cstdlib>
 #include <cstring>
 #include <fstream>
 #include <future>
 #include <iostream>
 #include <limits>
 #include <list>
 #include <mutex>
 #include <queue>
 #include <thread>
 #include <tuple>
 #include <utility>
 #include <numa.h>
 #ifndef THREAD_GROUP_MULTIPLIER
 #define THREAD_GROUP_MULTIPLIER 2
 #endif
 #ifndef QUERY
 #define QUERY 1
 #endif
 #ifndef BARRIER_MODE
 #define BARRIER_MODE "global"
 #endif
 #ifndef BUFFER_LIMIT
 #define BUFFER_LIMIT 1
 #endif
 #ifndef PINNING
 #define PINNING 1
 #endif
 #ifndef PCM_M
 #define PCM_M 0
 #endif
 #if PCM_M == 1
 #include "pcm.h"
 #endif
 #include "const.h"
 #include "file_output.h"
 #include "array_utils.h"
 #include "timer_utils.h"
 #include "barrier_utils.h"
 #include "measurement_utils.h"
 #include "cpu_set_utils.h"
 #include "iterable_range.h"
 #include "memory_literals.h"
 #include "pipelines/MAX_scan_filter_pipe.h"
 #include "aggregation.h"
 #include "filter.h"
 using base_t = uint64_t;
 base_t sum_check(base_t compare_value, base_t* row_A, base_t* row_B, size_t row_size) {
    base_t sum = 0;
    for(int i = 0; i < row_size / sizeof(base_t); ++i) {
        sum += (row_A[i] < compare_value) * row_B[i];
    }
    return sum;
 }
 base_t sum_check_complex(base_t compare_value_a, base_t compare_value_b, base_t* row_A, base_t* row_B, size_t row_size) {
    base_t sum = 0;
    for(int i = 0; i < row_size / sizeof(base_t); ++i) {
        sum += (row_A[i] < compare_value_a && row_B[i] < compare_value_b) * row_B[i];
    }
    return sum;
 }
 int main(int argc, char** argv) {
 #if PCM == 1
    pcm::PCM *pcm = pcm::PCM::getInstance();
 	//and check for errors
 	auto error_code = pcm->program();
 	if(error_code != pcm::PCM::Success) {
 		std::cerr << "PCM couldn't start" << std::endl;
        std::cerr << "Error code: " << error_code << std::endl;
        std::cerr << "Try to execute 'sudo modprobe msr' and execute this program with root privigeges.";
        return 1;
 	}
 #endif
    // set constants
    const size_t workload_b = 2_GiB;
    const base_t compare_value_a = 50;
    const base_t compare_value_b = 42;
    constexpr bool simple_query = (QUERY == 1);
    const size_t thread_count = 6;
    std::ofstream out_file;
    out_file.open("../results/max_" 
            "q-" + (std::string)(simple_query == true ? "simple" : "complex") + 
            "_bm-" + (std::string) BARRIER_MODE + 
            "_bl-" + (std::string)(BUFFER_LIMIT == 1 ? "limited" : "unlimited") + 
            "_tc-" + std::to_string(thread_count * THREAD_GROUP_MULTIPLIER) + "1MiB-2MiB.csv");
    // set benchmark parameter 
    Linear_Int_Range<uint32_t, 0, 30, 1> run("run");
    constexpr size_t chunk_min = 1_MiB; constexpr size_t chunk_max = 8_MiB + 1; constexpr size_t chunk_incr = 128_kiB;
    Linear_Int_Range<size_t, chunk_min, chunk_max, chunk_incr> chunk_size("chunk_size");
    Range<NewPMode, DRAM_base, new_mode_manager, new_mode_manager> mode("mode");
    uint32_t remote_node   = 2;
    uint32_t remote_node_2 = 2;
    uint32_t local_node    = 10;
    /*uint32_t remote_node   = 6;
    uint32_t remote_node_2 = 6;
    uint32_t local_node    = 2;*/
    print_to_file(out_file, generateHead(run, chunk_size, mode), "thread_group", "time",
    #ifdef THREAD_TIMINGS
        "scan_a", "scan_b", "aggr_j",
    #endif
    #ifdef BARRIER_TIMINGS
        "wait_scan_a", "wait_scan_b", "wait_aggr_j",
    #endif
    #if PCM == 1
        pcm_value_collector::getHead("scan_a"), 
        pcm_value_collector::getHead("scan_b"), 
        pcm_value_collector::getHead("aggr_j"),
    #endif
        "result");
    /*** alloc data and buffers ************************************************/
    base_t* data_a     = (base_t*) numa_alloc_onnode(workload_b, remote_node);
    base_t* data_b     = (base_t*) numa_alloc_onnode(workload_b, remote_node_2);
    base_t* data_a_hbm = (base_t*) numa_alloc_onnode(workload_b, local_node);
    base_t* data_b_hbm = (base_t*) numa_alloc_onnode(workload_b, local_node);
    fill_mt<base_t>(data_a, workload_b, 0, 100, 42);
    fill_mt<base_t>(data_b, workload_b, 0, 100, 420);
    std::memcpy(data_a_hbm, data_a, workload_b);
    std::memcpy(data_b_hbm, data_b, workload_b);
    base_t* results = (base_t*) numa_alloc_onnode(THREAD_GROUP_MULTIPLIER * thread_count * sizeof(base_t), remote_node);
    std::ofstream check_file;
    check_file.open("../results/max_" 
            "q-" + (std::string)(simple_query == true ? "simple" : "complex") + 
            "_bm-" + (std::string) BARRIER_MODE + 
            "_bl-" + (std::string)(BUFFER_LIMIT == 1 ? "limited" : "unlimited") + 
            "_tc-" + std::to_string(thread_count * THREAD_GROUP_MULTIPLIER) + ".checksum");
    if constexpr (QUERY == 1) {
        //calculate simple checksum if QUERY == 1 -> simple query is applied
        check_file << sum_check(compare_value_a, data_a, data_b, workload_b);
    } else {
        check_file << sum_check_complex(compare_value_a, compare_value_b, data_a, data_b, workload_b);
    }
    check_file.close();
    std::string iteration("init");
    Query_Wrapper<base_t, simple_query>* qw = nullptr;
    while(iteration != "false") {
        std::promise<void> p;
        std::shared_future<void> ready_future(p.get_future());
        if(iteration != "run") {
            if(qw != nullptr) {
                delete qw; 
            }
            uint8_t tc_filter = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, SCAN_A);
            uint8_t tc_copy   = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, SCAN_B);
            uint8_t tc_agg    = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, AGGR_J);
            switch(mode.current) {
            case NewPMode::DRAM_base:
                qw = new Query_Wrapper<base_t, simple_query>(&ready_future, workload_b, chunk_size.current, data_a, data_b, results, local_node, remote_node, 
                    tc_filter, tc_copy, tc_agg, mode.current, THREAD_GROUP_MULTIPLIER, (base_t) 50, (base_t) 42, true);
                    break;
            case NewPMode::HBM_base:
                qw = new Query_Wrapper<base_t, simple_query>(&ready_future, workload_b, chunk_size.current, data_a_hbm, data_b_hbm, results, local_node, remote_node, 
                    tc_filter, tc_copy, tc_agg, mode.current, THREAD_GROUP_MULTIPLIER, (base_t) 50, (base_t) 42, true);
                    break;
            case NewPMode::Mixed_base:
                qw = new Query_Wrapper<base_t, simple_query>(&ready_future, workload_b, chunk_size.current, data_a, data_b_hbm, results, local_node, remote_node, 
                    tc_filter, tc_copy, tc_agg, mode.current, THREAD_GROUP_MULTIPLIER, (base_t) 50, (base_t) 42, true);
                    break;
            case NewPMode::Prefetch: 
                qw = new Query_Wrapper<base_t, simple_query>(&ready_future, workload_b, chunk_size.current, data_a, data_b, results, local_node, remote_node, 
                    tc_filter, tc_copy, tc_agg, mode.current, THREAD_GROUP_MULTIPLIER, (base_t) 50, (base_t) 42, false);
                    break;
            }             
        }
        qw->ready_future = &ready_future;
        qw->clear_buffers();
        auto filter_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_a(gid, gcnt, tid); };
        auto copy_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_b(gid, gcnt, tid); };
        auto aggregation_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->aggr_j(gid, gcnt, tid); };            
        std::vector<std::thread> filter_pool;
        std::vector<std::thread>   copy_pool;
        std::vector<std::thread>    agg_pool;
        uint8_t tc_filter = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, SCAN_A);
        uint8_t tc_copy   = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, SCAN_B);
        uint8_t tc_agg    = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, AGGR_J);
        int thread_id = 0;
        // std::vector<std::pair<int, int>> pinning_ranges {std::make_pair(28, 42), std::make_pair(84, 98)}; // node 2 heacboehm II
        //std::vector<std::pair<int, int>> pinning_ranges {std::make_pair(32, 48), std::make_pair(96, 112)}; // node 2 heacboehm
        std::vector<std::pair<int, int>> pinning_ranges {std::make_pair(24, 36), std::make_pair(120, 132)}; // node 2 sapphire rapids
        //std::vector<std::pair<int, int>> pinning_ranges {std::make_pair(24, 48)}; // node 2+3 sapphire rapids
        //std::vector<std::pair<int, int>> pinning_ranges {std::make_pair(0, 48)}; // node 0-3 sapphire rapids
        for(uint32_t gid = 0; gid < THREAD_GROUP_MULTIPLIER; ++gid) {
            for(uint32_t tid = 0; tid < tc_filter; ++tid) {
                filter_pool.emplace_back(filter_lambda, gid, THREAD_GROUP_MULTIPLIER, tid);
 #if PINNING
                pin_thread_in_range(filter_pool.back(), thread_id++, pinning_ranges);      
 #else
                pin_thread_in_range(filter_pool.back(), pinning_ranges);
 #endif
            }
            // if tc_copy == 0 this loop is skipped
            for(uint32_t tid = 0; tid < tc_copy; ++tid) {
                copy_pool.emplace_back(copy_lambda, gid, THREAD_GROUP_MULTIPLIER, tid);
 #if PINNING
                pin_thread_in_range(copy_pool.back(), thread_id++, pinning_ranges);
 #else
                pin_thread_in_range(copy_pool.back(), pinning_ranges);
 #endif
            }
            for(uint32_t tid = 0; tid < tc_agg; ++tid) {
                agg_pool.emplace_back(aggregation_lambda, gid, THREAD_GROUP_MULTIPLIER, tid);
 #if PINNING
                pin_thread_in_range(agg_pool.back(), thread_id++, pinning_ranges);
 #else
                pin_thread_in_range(agg_pool.back(), pinning_ranges);
 #endif
            }
        }
        auto start = std::chrono::steady_clock::now();
        p.set_value();
        for(std::thread& t : filter_pool) { t.join(); }
        for(std::thread& t :   copy_pool) { t.join(); }
        for(std::thread& t :    agg_pool) { t.join(); }
        Aggregation<base_t, Sum, load_mode::Aligned>::apply(results, results, sizeof(base_t) * tc_agg * THREAD_GROUP_MULTIPLIER);
        auto end = std::chrono::steady_clock::now();
        constexpr double nanos_per_second = ((double)1000) * 1000 * 1000;
        uint64_t nanos   = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count();
        double seconds   = (double)(nanos) / nanos_per_second;
        print_to_file(out_file, run, chunk_size, new_mode_manager::string(mode.current), THREAD_GROUP_MULTIPLIER, seconds,
        #ifdef THREAD_TIMINGS
                qw->trt->summarize_time(0),  qw->trt->summarize_time(1),  qw->trt->summarize_time(2), 
        #endif
        #ifdef BARRIER_TIMINGS
                qw->bt->summarize_time(0),  qw->bt->summarize_time(1),  qw->bt->summarize_time(2),
        #endif
        #if PCM == 1
                qw->pvc->summarize_as_string("scan_a"),
                qw->pvc->summarize_as_string("scan_b"), 
                qw->pvc->summarize_as_string("aggr_j"),
        #endif
                results[0]);
        iteration = IterateOnce(run, chunk_size, mode);
    }
    numa_free(data_b_hbm, workload_b);
    numa_free(data_a, workload_b);
    numa_free(data_b, workload_b);
    numa_free(results, THREAD_GROUP_MULTIPLIER * thread_count * sizeof(base_t));
 }
--- a/qdp_project/src/benchmark/QDP_minimal.h
+++ b/qdp_project/src/benchmark/QDP_minimal.h
@ -0,0 +1,147 @@
 #include <chrono>
 #include <iostream>
 #include <thread>
 #include <future>
 #include <numa.h>
 #include "const.h"
 #include "array_utils.h"
 #include "cpu_set_utils.h"
 #include "iterable_range.h"
 #include "memory_literals.h"
 #include "pipelines/MAX_scan_filter_pipe.h"
 #include "aggregation.h"
 using base_t = uint64_t;
 // calculate the checksum for the simple query
 base_t sum_check(base_t compare_value, base_t* row_A, base_t* row_B, size_t row_size) {
    base_t sum = 0;
    for(int i = 0; i < row_size / sizeof(base_t); ++i) {
        sum += (row_A[i] < compare_value) * row_B[i];
    }
    return sum;
 }
 // calculate the checksum for the complex query
 base_t sum_check_complex(base_t compare_value_a, base_t compare_value_b, base_t* row_A, base_t* row_B, size_t row_size) {
    base_t sum = 0;
    for(int i = 0; i < row_size / sizeof(base_t); ++i) {
        sum += (row_A[i] < compare_value_a && row_B[i] < compare_value_b) * row_B[i];
    }
    return sum;
 }
 class QDP_minimal {
 private:
    // values used for comparisons in the filter operations
    const base_t compare_value_a = 50;
    const base_t compare_value_b = 42;
    // define, which numa nodes to use
    // Xeon Max: node 0-7 DRAM and 8-15 HBM
    // if the nodes are changed, the pinning ranges in run should be adjusted accordingly too
    uint32_t dram_node   = 2; 
    uint32_t dram_node_2 = 2; 
    uint32_t hbm_node    = 10;
 public:
    // results of running qdp, set by run()
    base_t result;
    base_t checksum;
    double exec_time;
    // run qdp
    void run(const size_t workload_b, size_t chunk_size, uint8_t tc_filter, uint8_t tc_copy, uint8_t tc_agg){
        // allocate data
        base_t* data_a     = (base_t*) numa_alloc_onnode(workload_b, dram_node);
        base_t* data_b     = (base_t*) numa_alloc_onnode(workload_b, dram_node_2);
        base_t* results = (base_t*) numa_alloc_onnode(THREAD_GROUP_MULTIPLIER * tc_agg * sizeof(base_t), dram_node);
        // fill the memory with acutal values
        fill_mt<base_t>(data_a, workload_b, 0, 100, 42);
        fill_mt<base_t>(data_b, workload_b, 0, 100, 420);
        // run qdp
        run(data_a, data_b, results, workload_b, chunk_size, tc_filter, tc_copy, tc_agg);
        // free the allocated memory
        numa_free(data_a, workload_b);
        numa_free(data_b, workload_b);
        numa_free(results, THREAD_GROUP_MULTIPLIER * tc_agg * sizeof(base_t));
    }
    // run qdp, work on provided memory pointers to enable memory reuse across multiple runs
    void run(base_t* data_a, base_t* data_b, base_t* results, const size_t workload_b, size_t chunk_size, uint8_t tc_filter, uint8_t tc_copy, uint8_t tc_agg){
        constexpr bool simple_query = (QUERY == 1);
        // sync objects
        std::promise<void> p;
        std::shared_future<void> ready_future(p.get_future());
        // create the query wrapper, that is managing the to-be-used threads
        Query_Wrapper<base_t, simple_query>* qw = new Query_Wrapper<base_t, simple_query>(&ready_future, workload_b, chunk_size, data_a, data_b, results, hbm_node, dram_node, 
            tc_filter, tc_copy, tc_agg, NewPMode::Prefetch, THREAD_GROUP_MULTIPLIER, compare_value_a, compare_value_b, false);             
        // clear buffers to make sure, that they have been written and are fully mapped before running qdp
        qw->clear_buffers();
        // creating lambdas for executing filter (scan_a), copy (scan_b), and aggregation tasks on the query wrapper
        // passing gid (group id), gcnt (group count) and tid (thread id)
        auto filter_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_a(gid, gcnt, tid); };
        auto copy_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_b(gid, gcnt, tid); };
        auto aggregation_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->aggr_j(gid, gcnt, tid); };            
        // creating thread pools, holding all used threads
        std::vector<std::thread> filter_pool;
        std::vector<std::thread>   copy_pool;
        std::vector<std::thread>    agg_pool;
        int thread_id = 0;
        // cpus on node 2 (for sapphire rapids), that the threads should be executed on
        std::vector<std::pair<int, int>> pinning_ranges {std::make_pair(24, 36), std::make_pair(120, 132)}; 
        // create all threads for all thread groups and for every task (copy, filter, aggregation), according their specific theadcount
        for(uint32_t gid = 0; gid < THREAD_GROUP_MULTIPLIER; ++gid) {
            for(uint32_t tid = 0; tid < tc_filter; ++tid) {
                filter_pool.emplace_back(filter_lambda, gid, THREAD_GROUP_MULTIPLIER, tid);
                pin_thread_in_range(filter_pool.back(), thread_id++, pinning_ranges);      
            }
            for(uint32_t tid = 0; tid < tc_copy; ++tid) {
                copy_pool.emplace_back(copy_lambda, gid, THREAD_GROUP_MULTIPLIER, tid);
                pin_thread_in_range(copy_pool.back(), thread_id++, pinning_ranges);
            }
            for(uint32_t tid = 0; tid < tc_agg; ++tid) {
                agg_pool.emplace_back(aggregation_lambda, gid, THREAD_GROUP_MULTIPLIER, tid);
                pin_thread_in_range(agg_pool.back(), thread_id++, pinning_ranges);
            }
        }
        // start the clock 
        auto start = std::chrono::steady_clock::now();
        // set value to the promise, to signal the waiting threads, that they can start now
        p.set_value();
        // wait for all thread to be finished
        for(std::thread& t : filter_pool) { t.join(); }
        for(std::thread& t :   copy_pool) { t.join(); }
        for(std::thread& t :    agg_pool) { t.join(); }
        // sum up the results of all the aggregation threads to get a final result
        Aggregation<base_t, Sum, load_mode::Aligned>::apply(&result, results, sizeof(base_t) * tc_agg * THREAD_GROUP_MULTIPLIER);
        auto end = std::chrono::steady_clock::now();
        // get the overall execution time in seconds
        constexpr double nanos_per_second = ((double)1000) * 1000 * 1000;
        uint64_t nanos   = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count();
        exec_time   = (double)(nanos) / nanos_per_second;
        // calculate the checksum according to the used query
        if constexpr (QUERY == 1) {
            // QUERY == 1 -> simple query is applied
            checksum = sum_check(compare_value_a, data_a, data_b, workload_b);
        } else {
            checksum = sum_check_complex(compare_value_a, compare_value_b, data_a, data_b, workload_b);
        }
        delete qw;
    }
 };
--- a/qdp_project/src/benchmark/doubly_filtered_agg.cpp
+++ b/qdp_project/src/benchmark/doubly_filtered_agg.cpp
@ -0,0 +1,149 @@
 #include <cstring>
 #include <fstream>
 #include <future>
 #include <iostream>
 #include <string>
 #include <thread>
 #include <vector>
 #include <numa.h>
 #include "aggregation.h"
 #include "array_utils.h"
 #include "cpu_set_utils.h"
 #include "file_output.h"
 #include "iterable_range.h"
 #include "memory_literals.h"
 #include "pipelines/scan_filter_pipe.h"
 int main () {
    using base_t = uint64_t;
    const size_t workload = 2_GiB;
    const char filename[256] = "../results/doubly_filtered_results_stronger_affinity_.csv";
    const uint32_t numa_local = 2;
    const uint32_t numa_remote = 3;
    Linear_Int_Range<uint32_t, 1,  6, 1> thread_group("thread_groups");
    Exp_Int_Range<uint32_t, 1,  5, 2> thread_count_filter("thread_cnt_filter");
    Exp_Int_Range<uint32_t, 1,  5, 2> thread_count_filter_copy("thread_cnt_filter_copy");
    Exp_Int_Range<uint32_t, 1,  5, 2> thread_count_aggregation("thread_cnt_agg");
    Linear_Int_Range<uint32_t, 0, 30, 1> run("run");
    Range<PMode, no_copy, mode_manager, mode_manager> mode("mode");
    Exp_Int_Range<size_t, 1_MiB, 8_MiB + 1, 2> chunk_size("chunk_size");
    std::ofstream out_file;
    out_file.open(filename);
    print_to_file(out_file, generateHead(run, chunk_size, mode, thread_count_filter, thread_count_filter_copy, 
                  thread_count_aggregation, thread_group), "time", "scan_a", "scan_b", "aggr_j", "wait_aggr", "results");
    base_t* data_a     = (base_t*) numa_alloc_onnode(workload, numa_remote);
    base_t* data_b     = (base_t*) numa_alloc_onnode(workload, numa_remote);
    base_t* data_b_hbm = (base_t*) numa_alloc_onnode(workload, numa_local);
    fill_mt<base_t>(data_a, workload, 0, 100, 42);
    fill_mt<base_t>(data_b, workload, 0, 100, 420);
    std::memcpy(data_b_hbm, data_b, workload); 
    base_t* result = (base_t*) numa_alloc_onnode(thread_group.max * thread_count_aggregation.max * sizeof(base_t), 
                                                 numa_remote);
    std::string iteration("init");
    Query_Wrapper<base_t, false>* qw = nullptr;
    while(iteration != "false") {
        std::promise<void> p;
        std::shared_future<void> ready_future(p.get_future());
        if(iteration != "run") {
            if(qw != nullptr) {
                delete qw;
            }
            switch(mode.current) {
            case PMode::expl_copy:
                qw = new Query_Wrapper<base_t, false>(&ready_future, workload, chunk_size.current, data_a, data_b, result, numa_local, numa_remote, 
                    thread_count_filter.current, thread_count_filter_copy.current, thread_count_aggregation.current, 
                    mode.current, thread_group.current, (base_t) 50, (base_t) 42, false);
                    break;
            case PMode::no_copy:
                qw = new Query_Wrapper<base_t, false>(&ready_future, workload, chunk_size.current, data_a, data_b, result, numa_local, numa_remote, 
                    thread_count_filter.current, thread_count_filter_copy.current, thread_count_aggregation.current, 
                    mode.current, thread_group.current, (base_t) 50, (base_t) 42, true);
                    break;
            case PMode::hbm:
                qw = new Query_Wrapper<base_t, false>(&ready_future, workload, chunk_size.current, data_a, data_b_hbm, result, numa_local, numa_remote, 
                    thread_count_filter.current, thread_count_filter_copy.current, thread_count_aggregation.current,
                    mode.current, thread_group.current, (base_t) 50, (base_t) 42, true);
                    break;
            }
        }
        qw->ready_future = &ready_future;
        qw->clear_buffers();
        // todo create threads depending on mode
        std::vector<std::thread> thread_pool;
        auto filter_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_a(gid, gcnt, tid); };
        auto filter_copy_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_b(gid, gcnt, tid); };
        auto aggregation_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->aggr_j(gid, gcnt, tid); };
        /* Intel Xeon Gold 6130 // todo implement different for 5120 -> fewer cpus
        node 0 cpus:  0-15  64- 79
        node 1 cpus: 16-31  80- 95
        node 2 cpus: 32-47  96-111
        node 3 cpus: 48-63 112-127
        */
        int thread_id = 0;
        std::vector<std::pair<int, int>> range {std::make_pair(0, 16), std::make_pair(64, 80)};
        for(uint32_t gid = 0; gid < thread_group.current; ++gid) {
            for(uint32_t tid = 0; tid < thread_count_filter.current; ++tid) {
                thread_pool.emplace_back(filter_lambda, gid, thread_group.current, tid);
                pin_thread_in_range(thread_pool.back(), thread_id++, range);
            }
            for(uint32_t tid = 0; tid < thread_count_filter_copy.current; ++tid) {
                thread_pool.emplace_back(filter_copy_lambda, gid, thread_group.current, tid);
                pin_thread_in_range(thread_pool.back(), thread_id++, range);
            }
            for(uint32_t tid = 0; tid < thread_count_aggregation.current; ++tid) {
                thread_pool.emplace_back(aggregation_lambda, gid, thread_group.current, tid);
                pin_thread_in_range(thread_pool.back(), thread_id++, range);
            }
        }
        auto start = std::chrono::steady_clock::now();
        p.set_value();
        // wait for every thread to join
        for(std::thread& t : thread_pool) t.join();
        // aggregate all partial results
        Aggregation<base_t, Sum, load_mode::Aligned>::apply(result, result, 
                                                            sizeof(base_t) * thread_count_aggregation.current * thread_group.current);
        auto end = std::chrono::steady_clock::now();
        double duration = std::chrono::duration_cast<std::chrono::nanoseconds>(end-start).count() / (double)1000000000;
        //TODO add mode
        print_to_file(out_file, run, chunk_size, mode_manager::string(mode.current), thread_count_filter, 
                thread_count_filter_copy, thread_count_aggregation, thread_group, duration, 
                qw->trt->summarize_time(0), qw->trt->summarize_time(1),
                qw->trt->summarize_time(2), qw->trt->summarize_time(3), *result);
        iteration = IterateOnce(run, chunk_size, mode, thread_count_filter, thread_count_filter_copy, thread_count_aggregation, thread_group);
    }
    auto end = std::chrono::system_clock::now();
    std::time_t end_time = std::chrono::system_clock::to_time_t(end);
    std::cout << "finished computation at " << std::ctime(&end_time) << std::endl;
    print_to_file(out_file, std::ctime(&end_time));
 }
--- a/qdp_project/src/benchmark/filter_aggregate_pipeline.cpp
+++ b/qdp_project/src/benchmark/filter_aggregate_pipeline.cpp
@ -0,0 +1,184 @@
 #include <atomic>
 #include <barrier>
 #include <chrono>
 #include <condition_variable>
 #include <cstdlib>
 #include <cstring>
 #include <fstream>
 #include <future>
 #include <iostream>
 #include <limits>
 #include <list>
 #include <mutex>
 #include <queue>
 #include <thread>
 #include <tuple>
 #include <utility>
 #include <numa.h>
 #include "const.h"
 #include "file_output.h"
 #include "array_utils.h"
 #include "timer_utils.h"
 #include "barrier_utils.h"
 #include "cpu_set_utils.h"
 #include "iterable_range.h"
 #include "memory_literals.h"
 #include "pipelines/scan_filter_pipe.h"
 #include "aggregation.h"
 #include "filter.h"
 using base_t = uint64_t;
 base_t sum_check(base_t compare_value, base_t* row_A, base_t* row_B, size_t row_size) {
    base_t sum = 0;
    for(int i = 0; i < row_size / sizeof(base_t); ++i) {
        sum += (row_A[i] < compare_value) * row_B[i];
    }
    return sum;
 }
 int main(int argc, char** argv) {
    size_t workload_b = 2_GiB;
    std::ofstream out_file;
    out_file.open("filter_aggreagate_pipe_bm_" + (std::string) BARRIER_MODE + ".csv");
    Linear_Int_Range<uint32_t, 1,  7, 1> thread_group("thread_groups");
    Linear_Int_Range<uint32_t, 0, 10, 1> run("run");
    Exp_Int_Range<size_t, 1_MiB, 8_MiB + 1, 2> chunk_size("chunk_size");
    Linear_Int_Range<uint32_t, 1,  2, 1> thread_count_filter("thread_cnt_filter");
    Linear_Int_Range<uint32_t, 2,  3, 1> thread_count_copy("thread_cnt_copy");
    Linear_Int_Range<uint32_t, 1,  2, 1> thread_count_aggregation("thread_cnt_agg");
    Range<PMode, no_copy, mode_manager, mode_manager> mode("mode");
    uint32_t remote_node  = 2;
    uint32_t remote_node_2 = 2;
    uint32_t local_node = 10; 
    print_to_file(out_file, generateHead(run, chunk_size, mode, thread_count_filter, thread_count_copy, 
                  thread_count_aggregation, thread_group), "time",
    #ifdef THREAD_TIMINGS
        "scan_a", "scan_b", "aggr_j",
    #endif
    #ifdef BARRIER_TIMINGS
        "wait_scan_a", "wait_scan_b", "wait_aggr_j",
    #endif
        "result");
    /*** alloc data and buffers ************************************************/
    base_t* data_a = (base_t*) numa_alloc_onnode(workload_b, remote_node);
    base_t* data_b = (base_t*) numa_alloc_onnode(workload_b, remote_node_2);
    base_t* data_b_hbm = (base_t *) numa_alloc_onnode(workload_b, local_node);
    fill_mt<base_t>(data_a, workload_b, 0, 100, 42);
    fill_mt<base_t>(data_b, workload_b, 0, 100, 420);
    std::memcpy(data_b_hbm, data_b, workload_b);
    base_t* results = (base_t*) numa_alloc_onnode(thread_group.max * thread_count_aggregation.max * sizeof(base_t), remote_node);
    std::string iteration("init");
    const bool simple_query = true;
    Query_Wrapper<base_t, simple_query>* qw = nullptr;
    while(iteration != "false") {
        base_t compare_value = 50;
        std::promise<void> p;
        std::shared_future<void> ready_future(p.get_future());
        if(iteration != "run") {
            if(qw != nullptr) {
                delete qw;
            }
            std::cout << "Changing to mode " << mode.current << " chunksize " << chunk_size.current << " thread_group " << thread_group.current << std::endl;
            switch(mode.current) {
            case PMode::expl_copy:
                qw = new Query_Wrapper<base_t, simple_query>(&ready_future, workload_b, chunk_size.current, data_a, data_b, results, local_node, remote_node, 
                    thread_count_filter.current, thread_count_copy.current, thread_count_aggregation.current, mode.current, thread_group.current, (base_t) 50, (base_t) 42, false);
                    break;
            case PMode::no_copy:
                qw = new Query_Wrapper<base_t, simple_query>(&ready_future, workload_b, chunk_size.current, data_a, data_b, results, local_node, remote_node, 
                    thread_count_filter.current, thread_count_copy.current, thread_count_aggregation.current, mode.current, thread_group.current, (base_t) 50, (base_t) 42, true);
                    break;
            case PMode::hbm:
                qw = new Query_Wrapper<base_t, simple_query>(&ready_future, workload_b, chunk_size.current, data_a, data_b_hbm, results, local_node, remote_node, 
                    thread_count_filter.current, thread_count_copy.current, thread_count_aggregation.current, mode.current, thread_group.current, (base_t) 50, (base_t) 42, true);
                    break;
            }             
        }
        qw->ready_future = &ready_future;
        qw->clear_buffers();
        auto filter_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_a(gid, gcnt, tid); };
        auto copy_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_b(gid, gcnt, tid); };
        auto aggregation_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->aggr_j(gid, gcnt, tid); };            
        std::vector<std::thread> filter_pool;
        std::vector<std::thread>   copy_pool;
        std::vector<std::thread>    agg_pool;
        int thread_id = 0;
        // std::vector<std::pair<int, int>> pinning_ranges {std::make_pair(28, 42), std::make_pair(84, 98)}; // node 2 heacboehm2
        std::vector<std::pair<int, int>> pinning_ranges {std::make_pair(32, 48), std::make_pair(96, 112)}; // node 2 heacboehm
        for(uint32_t gid = 0; gid < thread_group.current; ++gid) {
        for(uint32_t tid = 0; tid < thread_count_filter.current; ++tid) {
            filter_pool.emplace_back(filter_lambda, gid, thread_group.current, tid);
            pin_thread_in_range(filter_pool.back(), thread_id++, pinning_ranges);
        }
        if(mode.current == PMode::expl_copy){
            for(uint32_t tid = 0; tid < thread_count_copy.current; ++tid) {
                copy_pool.emplace_back(copy_lambda, gid, thread_group.current, tid);
                pin_thread_in_range(copy_pool.back(), thread_id++, pinning_ranges);
            }
        }
        for(uint32_t tid = 0; tid < thread_count_aggregation.current; ++tid) {
            agg_pool.emplace_back(aggregation_lambda, gid, thread_group.current, tid);
            pin_thread_in_range(agg_pool.back(), thread_id++, pinning_ranges);
            }
        }
        auto start = std::chrono::steady_clock::now();
        p.set_value();
        for(std::thread& t : filter_pool) { t.join(); }
        for(std::thread& t :   copy_pool) { t.join(); }
        for(std::thread& t :    agg_pool) { t.join(); }
        Aggregation<base_t, Sum, load_mode::Aligned>::apply(results, results, sizeof(base_t) * thread_count_aggregation.current * thread_group.current);
        auto end = std::chrono::steady_clock::now();
        constexpr double nanos_per_second = ((double)1000) * 1000 * 1000;
        uint64_t nanos   = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count();
        double seconds   = (double)(nanos) / nanos_per_second;
        print_to_file(out_file, run, chunk_size, mode_manager::string(mode.current), thread_count_filter, 
            thread_count_copy, thread_count_aggregation, thread_group, seconds,
        #ifdef THREAD_TIMINGS
                qw->trt->summarize_time(0),  qw->trt->summarize_time(1),  qw->trt->summarize_time(2), 
        #endif
        #ifdef BARRIER_TIMINGS
                qw->bt->summarize_time(0),  qw->bt->summarize_time(1),  qw->bt->summarize_time(2),
        #endif
                results[0]);
        iteration = IterateOnce(run, chunk_size, mode, thread_count_filter, thread_count_copy, thread_count_aggregation, thread_group);
    }
    numa_free(data_b_hbm, workload_b);
    numa_free(data_a, workload_b);
    numa_free(data_b, workload_b);
    numa_free(results, thread_group.max * sizeof(base_t));
 }
--- a/qdp_project/src/benchmark/latency.cpp
+++ b/qdp_project/src/benchmark/latency.cpp
@ -0,0 +1,188 @@
 /*
 * numa_memory_latency
 * Copyright (c) 2017 UMEZAWA Takeshi
 * This software is licensed under GNU GPL version 2 or later.
 *
 * This file has been modified
 */
 #include <algorithm>
 #include <stdio.h>
 #include <stdint.h>
 #include <stdlib.h>
 #include <iostream>
 #include <unistd.h>
 #include <ctime>
 #include "file_output.h"
 #include <vector>
 #include <random>
 #include <algorithm>
 #include <numa.h>
 #ifndef VOLATILE
 #define VOLATILE 0
 #endif
 #define cachelinesize 64
 union CACHELINE {
 	char cacheline[cachelinesize];
 	#if VOLATILE
 	volatile CACHELINE* next;
 	#else 
 	CACHELINE* next;
 	#endif /*VOLATILE*/
 };
 #define REPT4(x)    do { x; x; x; x; } while(0)
 #define REPT16(x)   do { REPT4(x);   REPT4(x);   REPT4(x);   REPT4(x);   } while(0);
 #define REPT64(x)   do { REPT16(x);  REPT16(x);  REPT16(x);  REPT16(x);  } while(0);
 #define REPT256(x)  do { REPT64(x);  REPT64(x);  REPT64(x);  REPT64(x);  } while(0);
 #define REPT1024(x) do { REPT256(x); REPT256(x); REPT256(x); REPT256(x); } while(0);
 size_t bufsize = 1 * 1024 * 1024 * 1024;
 size_t nloop = 128 * 1024;
 std::vector<size_t> offsets;
 #if VOLATILE
 volatile CACHELINE* walk(volatile CACHELINE* start)
 {
 	volatile CACHELINE* p = start;
 	for (size_t i = 0; i < nloop; ++i) {
 		REPT1024(p = p->next);
 	}
 	return p;
 }
 #else
 CACHELINE* walk(CACHELINE* start, uint64_t* sum)
 {
 	CACHELINE* p = start;
 	for (size_t i = 0; i < nloop; ++i) {
        REPT1024(
 			*sum += static_cast<uint64_t>(p->cacheline[cachelinesize-1]);
            p = p->next;
        );
    }
 	return p;
 }
 #endif /*VOLATILE*/
 void bench(int tasknode, int memnode, std::ofstream* out_file)
 {
 	struct timespec ts_begin, ts_end, ts_elapsed;
 	printf("bench(task=%d, mem=%d)\n", tasknode, memnode);
 	if (numa_run_on_node(tasknode) != 0) {
 		printf("failed to run on node: %s\n", strerror(errno));
 		return;
 	}
 	CACHELINE* const buf = (CACHELINE*)numa_alloc_onnode(bufsize, memnode);
 	if (buf == NULL) {
 		printf("failed to allocate memory\n");
 		return;
 	}
 	for (size_t i = 0; i < offsets.size() - 1; ++i) {
 		// assuming that next-pointer never overwrites last Byte of the cacheline/union
 		buf[offsets[i]].cacheline[cachelinesize-1] = offsets[i] % 128;
 		buf[offsets[i]].next = buf + offsets[i+1];
 	}
 	buf[offsets[offsets.size() - 1]].next = buf;
 	buf[offsets[offsets.size() - 1]].cacheline[cachelinesize-1] = offsets[offsets.size() - 1] % 128;
 	uint64_t value = 0;
 	uint64_t* sum = &value;
 	clock_gettime(CLOCK_MONOTONIC, &ts_begin);
 	#if VOLATILE 
 	walk(buf);
 	#else
 	walk(buf, sum);
 	#endif /*VOLATILE*/
 	clock_gettime(CLOCK_MONOTONIC, &ts_end);
 	ts_elapsed.tv_nsec = ts_end.tv_nsec - ts_begin.tv_nsec;
 	ts_elapsed.tv_sec = ts_end.tv_sec - ts_begin.tv_sec;
 	if (ts_elapsed.tv_nsec < 0) {
 		--ts_elapsed.tv_sec;
 		ts_elapsed.tv_nsec += 1000*1000*1000;
 	}
 	double elapsed = ts_elapsed.tv_sec + 0.000000001 * ts_elapsed.tv_nsec;
 	printf("took %fsec. %fns/load\n", elapsed, elapsed/(1024*nloop)*(1000*1000*1000));
 	print_to_file(*out_file, tasknode, memnode, elapsed/(1024*nloop)*(1000*1000*1000), *sum);
 	numa_free(buf, bufsize);
 }
 struct RND {
 	std::mt19937 mt;
 	RND() : mt(time(NULL)) {}
 	std::mt19937::result_type operator()(std::mt19937::result_type n) { return mt() % n; }
 } r;
 void usage(const char* prog)
 {
 	printf("usage: %s [-h] [bufsize] [nloop]\n", prog);
 }
 int main(int argc, char* argv[])
 {
 	int ch;
 	while ((ch = getopt(argc, argv, "h")) != -1) {
 		switch (ch) {
 		case 'h':
 		default:
 			usage(argv[0]);
 			exit(1);
 		}
 	}
 	argc -= optind;
 	argv += optind;
 	if (argc > 1) {
 		// 1048576 KiB = 1 GiB
 		bufsize = atoi(argv[0]) * 1024; // in KiB
 		nloop = atoi(argv[1]) * 1024;
    } 
 	offsets.resize(bufsize / cachelinesize);
 	for (size_t i = 0; i < offsets.size(); ++i)
 		offsets[i] = i;
 	std::random_shuffle(offsets.begin() + 1, offsets.end(), r);
 	uint64_t expected_checksum = 0;
 	#if VOLATILE == 0
    for (size_t i = 0; i < nloop * 1024; ++i) {
        expected_checksum += offsets[i % offsets.size()] % 128;
    }
 	#endif
 	std::ofstream check_file;
 	check_file.open("../results/micro_bench/latency/micro_bench_latency_" + (std::string)(VOLATILE == 1 ? "volatile" : "sum") + ".checksum");
 	check_file << expected_checksum;
 	check_file.close();
 	printf("benchmark bufsize=%zuKiB, nloop=%zuKi\n", bufsize/1024, nloop/1024);
    std::ofstream out_file;
    out_file.open("../results/micro_bench/latency/micro_bench_latency_"+ (std::string)(VOLATILE == 1 ? "volatile" : "sum") + ".csv");
    print_to_file(out_file, "tasknode", "memnode", "latency", "checksum");
 	for (int tasknode = 0; tasknode < 8; tasknode++) {
 		for (int memnode = 0; memnode < 16; memnode++) {
 			bench(tasknode, memnode, &out_file);
 		}
 	}
 	return 0;
 }
--- a/qdp_project/src/benchmark/micro_benchmarks.cpp
+++ b/qdp_project/src/benchmark/micro_benchmarks.cpp
@ -0,0 +1,271 @@
 #include <iostream>
 #include <chrono>
 #include <future>
 #include <numa.h>
 #include <algorithm>
 #include <cstring>
 #include "memory_literals.h"
 #include "array_utils.h"
 #include "file_output.h"
 #include "aggregation.h"
 using base_t = uint64_t;
 size_t thread_cnt_memcpy = 128;
 size_t thread_cnt_read = 128;
 size_t runs = 10;
 base_t sum_up(base_t* data, size_t workload){
    base_t sum = 0;
    for(int i = 0; i < workload/sizeof(base_t); i++){
        sum += data[i];
    }
    return sum;
 }
 int reverse_bits(int number, size_t bit_count) {
    int result = 0;
    for(int i = 0; i < bit_count; i++) {
        result <<= 1; 
        result |= (number & 1);
        number >>= 1; 
    }
    return result;
 }
 double measure_memcpy_bw(base_t* src, base_t* dest, size_t workload, base_t* result){
    std::promise<void> p;
    std::shared_future<void> ready_future(p.get_future());
    auto thread_lambda =          [&](base_t* source, base_t* destination, size_t count) {
        ready_future.wait();
        memcpy(destination, source, count);
    };
    std::vector<std::thread> thread_pool;
    size_t total_elements = workload / sizeof(base_t); 
    size_t elements_per_thread = total_elements / thread_cnt_memcpy;
    size_t remainder = total_elements % thread_cnt_memcpy; 
    for(size_t tid = 0; tid < thread_cnt_memcpy; tid++) {
        size_t elements_to_process = elements_per_thread + (tid < remainder ? 1 : 0);
        size_t byte_offset = (elements_per_thread * tid + std::min(tid, remainder)) * sizeof(base_t);
        thread_pool.emplace_back(thread_lambda, src + byte_offset / sizeof(base_t), dest + byte_offset / sizeof(base_t), elements_to_process * sizeof(base_t));
    }
    auto start = std::chrono::steady_clock::now();
    p.set_value();
    for(std::thread& t : thread_pool) { t.join(); }
    auto stop = std::chrono::steady_clock::now();
    auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(stop - start);
    double seconds = duration.count() / 1e9;
    double throughput = (workload / seconds) / (1024 * 1024 * 1024);
    *result = sum_up(dest, workload);
    return throughput;
 }
 double measure_read_bw(base_t* data, size_t workload, base_t* results){
    const size_t chunk_size = sizeof(__m512i);
    const size_t num_chunks = (workload) / chunk_size;
    __m512i* src = reinterpret_cast<__m512i*>(data);
    std::promise<void> p;
    std::shared_future<void> ready_future(p.get_future());
    size_t num_chunks_per_thread = num_chunks / thread_cnt_read;
    size_t num_chunks_remainder = num_chunks % thread_cnt_read;
    auto thread_lambda =          [&](__m512i* src, int tid, int num_chunks) {
        __m512i accumulator = _mm512_setzero_si512();
        ready_future.wait();
        for (int i = 0; i < num_chunks; i++) {
            __m512i chunk = _mm512_load_si512(&src[i]);
            accumulator = _mm512_add_epi64(accumulator, chunk); 
        }
        results[tid] = _mm512_reduce_add_epi64(accumulator);
    };
    std::vector<std::thread> thread_pool;
    int offset;
    for(int tid = 0; tid < thread_cnt_read; tid++){
        if(tid < num_chunks_remainder){
            offset = tid * (num_chunks_per_thread + 1);
            thread_pool.emplace_back(thread_lambda, &src[offset], tid, (num_chunks_per_thread + 1));
        } else {
            offset = tid*num_chunks_per_thread + num_chunks_remainder;
            thread_pool.emplace_back(thread_lambda, &src[offset], tid, num_chunks_per_thread);
        }
    }
    auto start = std::chrono::steady_clock::now();
    p.set_value();
    for(std::thread& t : thread_pool) { t.join(); }
    auto stop = std::chrono::steady_clock::now();
    Aggregation<base_t, Sum, load_mode::Aligned>::apply(results, results, sizeof(base_t) * thread_cnt_read);
    auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(stop - start);
    double seconds = duration.count() / 1e9;
    double throughput = (workload / seconds) / (1024 * 1024 * 1024);
    return throughput;
 }
 void exec_multiple_runs_memcpy(size_t workload, int exec_node, int src_node, int dest_node, std::ofstream* out_file, std::string iteration_type){
    base_t value;
    base_t* result = &value;
    base_t* src = (base_t*) numa_alloc_onnode(workload, src_node);
    base_t* dest = (base_t*) numa_alloc_onnode(workload, dest_node);
    fill_mt<base_t>(src, workload, 0, 100, 42);
    fill_mt<base_t>(dest, workload, 0, 100, 12);
    numa_run_on_node(exec_node);
    if(dest_node == 0 && src_node == 0){
        std::ofstream check_file;
        check_file.open("../results/micro_bench/micro_bench_bw_memcpy_execnode_" + std::to_string(exec_node) 
            + "_threadcnt_" + std::to_string(thread_cnt_memcpy) + "_" + iteration_type + ".checksum");
        check_file << sum_up(src, workload);
        check_file.close();
    }
    for(size_t run = 0; run < runs; run++){
        double bw = measure_memcpy_bw(src, dest, workload, result);
        std::cout << "Copy throughput executed on node " << exec_node << " form node " << src_node << " to node " 
            << dest_node << ": " << bw << " GiB/s" << std::endl; 
        print_to_file(*out_file, run, src_node, dest_node, bw, *result);
        std::memset(dest, 0x00, workload);
        *result = 0;
    }
    numa_free(src, workload);
    numa_free(dest, workload);
 }
 void measure_all_memcpy_bw_for_chosen_execnode(int exec_node){
    std::ofstream out_file;
    out_file.open("../results/micro_bench/micro_bench_bw_memcpy_execnode_" + std::to_string(exec_node) 
        + "_threadcnt_" + std::to_string(thread_cnt_memcpy) + ".csv");
    print_to_file(out_file, "run", "src_node", "dest_node", "bw", "result");
    const size_t workload = 4_GiB;
    for(int src_node = 0; src_node < 16; src_node++){
        for(int dest_node = 0; dest_node < 16; dest_node++){
            exec_multiple_runs_memcpy(workload, exec_node, src_node, dest_node, &out_file, "");
        }
    }
    out_file.close();
 }
 void measure_all_memcpy_bw_for_chosen_execnode_reversed(int exec_node){
    std::ofstream out_file;
    out_file.open("../results/micro_bench/micro_bench_bw_memcpy_execnode_" + std::to_string(exec_node) 
        + "_threadcnt_" + std::to_string(thread_cnt_memcpy) + "_reversed.csv");
    print_to_file(out_file, "run", "src_node", "dest_node", "bw", "result");
    const size_t workload = 4_GiB;
    for(int src_node = 15; src_node >= 0; src_node--){
        for(int dest_node = 15; dest_node >= 0; dest_node--){
            exec_multiple_runs_memcpy(workload, exec_node, src_node, dest_node, &out_file, "reversed");
        }
    }
    out_file.close();
 }
 void measure_all_memcpy_bw_for_chosen_execnode_reversed_bitwise(int exec_node){
    std::ofstream out_file;
    out_file.open("../results/micro_bench/micro_bench_bw_memcpy_execnode_" + std::to_string(exec_node) 
        + "_threadcnt_" + std::to_string(thread_cnt_memcpy) + "_reversed_bitwise.csv");
    print_to_file(out_file, "run", "src_node", "dest_node", "bw", "result");
    const size_t workload = 4_GiB;
    for(int src_node = 0; src_node < 16; src_node++){
        for(int dest_node = 0; dest_node < 16; dest_node++){
            int reversed_src_node = reverse_bits(src_node, 4);
            int reversed_dest_node = reverse_bits(dest_node, 4);
            exec_multiple_runs_memcpy(workload, exec_node, reversed_src_node, reversed_dest_node, &out_file, "reversed_bitwise");
        }
    }
    out_file.close();
 }
 void exec_multiple_runs_read(size_t workload, int mem_node, int exec_node, std::ofstream *out_file, std::string iteration_type){
    base_t* data = (base_t*) numa_alloc_onnode(workload, mem_node);
    fill_mt<base_t>(data, workload, 0, 100, 42);
    base_t* results = (base_t*) numa_alloc_onnode(thread_cnt_read * sizeof(base_t), exec_node);
    numa_run_on_node(exec_node);
    if(mem_node == 0 && exec_node == 0){
        std::ofstream check_file;
        check_file.open("../results/micro_bench/micro_bench_bw_read_threadcnt_" + std::to_string(thread_cnt_read) + "_" + iteration_type + ".checksum");
        check_file << sum_up(data, workload);
        check_file.close();
    }
    for(size_t run = 0; run < runs; run++){
        double bw =  measure_read_bw(data, workload, results);
        std::cout << "Read throughput executed on node " << exec_node << " for node " << mem_node << ": " << bw << " GiB/s" << std::endl;
        print_to_file(*out_file, run, exec_node, mem_node, bw, results[0]);
        std::memset(results, 0x00, thread_cnt_read * sizeof(base_t));
    }
    numa_free(data, workload);
    numa_free(results, thread_cnt_read * sizeof(base_t));
 }
 void measure_all_read_bw(){
    std::ofstream out_file;
    out_file.open("../results/micro_bench/micro_bench_bw_read_threadcnt_" + std::to_string(thread_cnt_read) + ".csv");
    print_to_file(out_file, "run", "exec_node", "mem_node", "bw", "result");
    const size_t workload = 8_GiB;
    for(int exec_node = 0; exec_node < 8; exec_node++){
        for(int mem_node = 0; mem_node < 16; mem_node++){
            exec_multiple_runs_read(workload, mem_node, exec_node, &out_file, "");
        }
    }
    out_file.close();
 }
 void measure_all_read_bw_reversed(){
    std::ofstream out_file;
    out_file.open("../results/micro_bench/micro_bench_bw_read_threadcnt_" + std::to_string(thread_cnt_read) + "_reversed.csv");
    print_to_file(out_file, "run", "exec_node", "mem_node", "bw", "result");
    const size_t workload = 8_GiB;
    for(int exec_node = 7; exec_node >= 0; exec_node--){
        for(int mem_node = 15; mem_node >= 0; mem_node--){
            exec_multiple_runs_read(workload, mem_node, exec_node, &out_file, "reversed");
        }
    }
    out_file.close();
 }
 void measure_all_read_bw_reversed_bitwise(){
    std::ofstream out_file;
    out_file.open("../results/micro_bench/micro_bench_bw_read_threadcnt_" + std::to_string(thread_cnt_read) + "_reversed_bitwise.csv");
    print_to_file(out_file, "run", "exec_node", "mem_node", "bw", "result");
    const size_t workload = 8_GiB;
    for(int exec_node0 = 0; exec_node0 < 8; exec_node0++){
        for(int mem_node0 = 0; mem_node0 < 16; mem_node0++){
            int mem_node = reverse_bits(mem_node0, 4);
            int exec_node = reverse_bits(exec_node0, 3);
            exec_multiple_runs_read(workload, mem_node, exec_node, &out_file, "reversed_bitwise");
        }
    }
    out_file.close();
 }
 int main() {
    // nodes 0-7 hold cores and DRAM, nodes 8-15 only HBM
    measure_all_read_bw_reversed_bitwise();
    measure_all_memcpy_bw_for_chosen_execnode_reversed_bitwise(0);
    return 0;
 }
--- a/qdp_project/src/benchmark/pipelines/DIMES_scan_filter_pipe.h
+++ b/qdp_project/src/benchmark/pipelines/DIMES_scan_filter_pipe.h
@ -0,0 +1,391 @@
 #include <cassert>
 #include <mutex>
 #include <cstring>
 #include <bitset>
 #include <numa.h>
 #include "filter.h"
 #include "aggregation.h"
 #include "vector_loader.h"
 #include "timer_utils.h"
 #include "barrier_utils.h"
 #include "execution_modes.h"
 template<typename base_t, bool simple>
 class Query_Wrapper {
 public:
    // sync
    std::shared_future<void>* ready_future;
    thread_runtime_timing* trt;
    barrier_timing* bt;
 private:
    // numa
    uint32_t close_mem;
    uint32_t far_mem;
    // data
    size_t size_b;
    size_t chunk_size_b;
    size_t chunk_size_w;
    size_t chunk_cnt;
    base_t* data_a;
    base_t* data_b;
    base_t* dest;
    // ratios
    uint32_t thread_count_fc;
    uint32_t thread_count_fi;
    uint32_t thread_count_ag;
    uint32_t thread_group;
    // done bits
    volatile uint8_t* ready_flag_a;
    volatile uint8_t* ready_flag_b;
    std::mutex ready_a_m;
    std::mutex ready_b_m;
    // buffer
    uint16_t* mask_a;
    uint16_t* mask_b;
    base_t** buffer_b;
    // params
    base_t cmp_a;
    base_t cmp_b;
    bool no_copy;
    NewPMode mode;
    // sync
    std::unique_ptr<std::vector<std::barrier<barrier_completion_function>*>> sync_barrier;
    std::string barrier_mode = BARRIER_MODE;
    using filterCopy   = Filter<base_t, LT, load_mode::Stream, true>;
    using filterNoCopy = Filter<base_t, LT, load_mode::Stream, false>;
    using filter       = Filter<base_t, LT, load_mode::Stream, false>;
    using aggregation  = Aggregation<base_t, Sum, load_mode::Stream>;
 public: 
    Query_Wrapper(std::shared_future<void>* rdy_fut, size_t workload_b, size_t chunk_size_b, base_t* data_a, 
                  base_t* data_b, base_t* dest, uint32_t numa_close, uint32_t numa_far, uint32_t tc_fi, uint32_t tc_fc, uint32_t tc_ag, 
                  NewPMode mode, uint32_t thread_group, base_t cmp_a = 50, base_t cmp_b = 42, bool no_copy = false) :
                  ready_future(rdy_fut), size_b(workload_b), chunk_size_b(chunk_size_b), data_a(data_a), data_b(data_b), 
                  dest(dest), close_mem(numa_close), far_mem(numa_far), mode(mode), thread_group(thread_group), cmp_a(cmp_a), cmp_b(cmp_b), no_copy(no_copy){
        chunk_size_w = chunk_size_b / sizeof(base_t);
        chunk_cnt = size_b / chunk_size_b;
        thread_count_fi = tc_fi;
        thread_count_fc = tc_fc;
        thread_count_ag = tc_ag;
        ready_flag_a = (volatile uint8_t *) numa_alloc_onnode(
            chunk_cnt * thread_count_fi / 8 + ((chunk_cnt * thread_count_fi % 8) != 0), close_mem);
        ready_flag_b = (volatile uint8_t *) numa_alloc_onnode(
            chunk_cnt * thread_count_fc / 8 + ((chunk_cnt * thread_count_fc % 8) != 0), close_mem);
        mask_a = (uint16_t *) numa_alloc_onnode(size_b / sizeof(base_t), close_mem);
        mask_b = (uint16_t *) numa_alloc_onnode(size_b / sizeof(base_t), close_mem);
        trt = new thread_runtime_timing(4, 16*4*4*4, close_mem);
        bt = new barrier_timing(4, 16*4*4*4, close_mem);
        reset_barriers();
        if constexpr(BUFFER_LIMIT==1) {
            // TODO size ok like that?
            buffer_b = (base_t**) numa_alloc_onnode(size_b * sizeof(base_t*), close_mem);
            buffer_b[0] = (base_t*) numa_alloc_onnode(thread_group * chunk_size_b, close_mem);
            buffer_b[1] = (base_t*) numa_alloc_onnode(thread_group * chunk_size_b, close_mem);
        } else {
            buffer_b = (base_t **) numa_alloc_onnode(sizeof(base_t*), close_mem);
            base_t* buffer_tmp = (base_t *) numa_alloc_onnode(size_b, close_mem);
            *buffer_b = buffer_tmp;
        }
    };
    void reset_barriers(){
        if(sync_barrier != nullptr) {
            for(auto& barrier : *sync_barrier) {
                delete barrier;
            }
            sync_barrier.reset();
        }
        sync_barrier = std::make_unique<std::vector<std::barrier<barrier_completion_function>*>>(thread_group);
        uint32_t thread_count_sum = thread_count_ag + thread_count_fi + thread_count_fc;
        uint32_t barrier_count = barrier_mode.compare("global") == 0 ? 1 : thread_group;
        uint32_t barrier_thread_count;
        if constexpr(simple){
            barrier_thread_count = (thread_group / barrier_count) *
                                        (mode == NewPMode::Prefetch ? thread_count_sum : (thread_count_ag + thread_count_fi));
        } else {
            barrier_thread_count = (thread_group / barrier_count) * thread_count_sum;
        }
        for(uint32_t i = 0; i < barrier_count; ++i) {
            (*sync_barrier)[i] = new std::barrier<barrier_completion_function>(barrier_thread_count);
        }
    }
    void clear_buffers () {
        std::memset((void*)ready_flag_a, 0x00, chunk_cnt * thread_count_fi / 8 + ((chunk_cnt * thread_count_fi % 8) != 0));
        std::memset((void*)ready_flag_b, 0x00, chunk_cnt * thread_count_fc / 8 + ((chunk_cnt * thread_count_fc % 8) != 0));
        std::memset(mask_a, 0x00, size_b / sizeof(base_t));
        std::memset(mask_b, 0x00, size_b / sizeof(base_t));
        if constexpr(BUFFER_LIMIT==1) {
            std::memset(buffer_b[0], 0x00, thread_group * chunk_size_b);
            std::memset(buffer_b[1], 0x00, thread_group * chunk_size_b);
        } else {
            std::memset(*buffer_b, 0x00, size_b);
        }
        trt->reset_accumulator();
        bt->reset_accumulator();
        reset_barriers();
    };
    ~Query_Wrapper() {
        numa_free((void*)ready_flag_a,
            chunk_cnt * thread_count_fi / 8 + ((chunk_cnt * thread_count_fi % 8) != 0));
        numa_free((void*)ready_flag_b, 
            chunk_cnt * thread_count_fc / 8 + ((chunk_cnt * thread_count_fc % 8) != 0));
        numa_free(mask_a, size_b / sizeof(base_t));
        numa_free(mask_b, size_b / sizeof(base_t));
        if constexpr(BUFFER_LIMIT==1) {
            numa_free(buffer_b[0], thread_group * chunk_size_b);
            numa_free(buffer_b[1], thread_group * chunk_size_b);
            numa_free(buffer_b, size_b * sizeof(base_t*));
        } else {
            numa_free(*buffer_b, size_b);
        }
        delete trt;
        for(auto& barrier : *sync_barrier) {
            delete barrier;
        }
        delete bt;
    };
    //this can be set without need to change allocations
    void set_thread_group_count(uint32_t value) {
        this->thread_group = value;
    };
 private:
    static inline base_t* get_sub_chunk_ptr(base_t* base_ptr, size_t chunk_id, size_t chunk_size_w, size_t tid, 
                                            size_t tcnt) {
        base_t* chunk_ptr = base_ptr + chunk_id * chunk_size_w;
        return chunk_ptr + tid * (chunk_size_w / tcnt);
    }
    static inline uint16_t* get_sub_mask_ptr(uint16_t* base_ptr, size_t chunk_id, size_t chunk_size_w, size_t tid, 
                                            size_t tcnt) {
        // 16 integer are addressed with one uint16_t in mask buffer
        size_t offset = chunk_id * chunk_size_w + tid * (chunk_size_w / tcnt);
        return base_ptr + (offset / 16);
    }
    static bool bit_at(volatile uint8_t* bitmap, uint32_t bitpos) {
        uint8_t value = bitmap[bitpos / 8];
        switch(bitpos % 8) {
        case 0: return value & 0b00000001;
        case 1: return value & 0b00000010;
        case 2: return value & 0b00000100;
        case 3: return value & 0b00001000;
        case 4: return value & 0b00010000;
        case 5: return value & 0b00100000;
        case 6: return value & 0b01000000;
        case 7: return value & 0b10000000;
        default: return false;
        }
    }
    static void set_bit_at(volatile uint8_t* bitmap, std::mutex& mutex, uint32_t bitpos) {
        mutex.lock();
        switch(bitpos % 8) {
        case 0: bitmap[bitpos / 8] |= 0b00000001;break;
        case 1: bitmap[bitpos / 8] |= 0b00000010;break;
        case 2: bitmap[bitpos / 8] |= 0b00000100;break;
        case 3: bitmap[bitpos / 8] |= 0b00001000;break;
        case 4: bitmap[bitpos / 8] |= 0b00010000;break;
        case 5: bitmap[bitpos / 8] |= 0b00100000;break;
        case 6: bitmap[bitpos / 8] |= 0b01000000;break;
        case 7: bitmap[bitpos / 8] |= 0b10000000;break;
        }
        mutex.unlock();
    }
 public:
    static base_t checksum(base_t* a, base_t* b, base_t cmp_a, base_t cmp_b, size_t size_b) {
        base_t sum = 0;
        for(int i = 0; i < size_b / sizeof(base_t); ++i) {
            if(a[i] >= cmp_a && b[i] <= cmp_b) {
                sum += b[i];
            }
        }
        return sum;
    }
    static void checkmask(uint16_t* mask, base_t cmp, base_t* data, size_t size_b, bool leq) {
        uint32_t cnt = 0;
        for(int i = 0; i < size_b / sizeof(base_t); ++i) {
            if(leq) {
                if(((data[i] <= cmp) != bit_at((uint8_t*)mask, i))) {
                    ++cnt;
                } 
            } else {
                if(((data[i] >= cmp) != bit_at((uint8_t*)mask, i))) {
                    ++cnt;
                } 
            }
        }
    }
    static void checkmask_16(uint16_t* mask, base_t cmp, base_t* data, size_t size_b, bool leq) {
        for(int i = 0; i < size_b / sizeof(base_t) / 16 ; ++i) {
            std::bitset<16> m(mask[i]);
            uint16_t ch = 0;
            for(int j = 0; j < 16; ++j) {
                if(data[i*16 + j] <= cmp) {
                    ch |=  0x1 << j;
                }
            }
            std::bitset<16> c(ch);
            std::cout << "act " << m << std::endl;
            std::cout << "rea " << c << std::endl << std::endl;
        }
    }
    void scan_b(size_t gid, size_t gcnt, size_t tid) {
        size_t tcnt = thread_count_fc;
        assert(chunk_size_w % tcnt == 0);
        assert(chunk_size_w % 16   == 0);
        assert(chunk_size_w % tcnt * 16 == 0);
        // wait till everyone can start
        ready_future->wait();
        // the lower gids run once more if the chunks are not evenly distributable
        uint32_t runs = chunk_cnt / gcnt + (chunk_cnt % gcnt > gid);
        uint32_t barrier_idx = barrier_mode.compare("global") == 0 ? 0 : gid;
        for(uint32_t i = 0; i < runs; ++i) {
            trt->start_timer(1, tid * gcnt + gid);
            // calculate pointers
            size_t chunk_id = gid + gcnt * i;
            base_t*  chunk_ptr = get_sub_chunk_ptr(data_b  , chunk_id, chunk_size_w, tid, tcnt);
            uint16_t* mask_ptr = get_sub_mask_ptr (mask_b  , chunk_id, chunk_size_w, tid, tcnt);
            if constexpr(simple){
                base_t* buffer_ptr;
                if constexpr(BUFFER_LIMIT==1) {
                    buffer_ptr = get_sub_chunk_ptr(buffer_b[i % 2], gid, chunk_size_w, tid, tcnt);
                } else {
                    buffer_ptr = get_sub_chunk_ptr(*buffer_b, chunk_id, chunk_size_w, tid, tcnt);
                }
                std::memcpy(buffer_ptr, chunk_ptr, chunk_size_b / tcnt);
            } else {
                if(no_copy) {
                    filterNoCopy::apply_same(mask_ptr, nullptr, chunk_ptr, cmp_b, chunk_size_b / tcnt);
                } else {
                    base_t* buffer_ptr;
                    if constexpr(BUFFER_LIMIT==1) {
                        buffer_ptr = get_sub_chunk_ptr(buffer_b[i % 2], gid, chunk_size_w, tid, tcnt);
                    } else {
                        buffer_ptr = get_sub_chunk_ptr(*buffer_b, chunk_id, chunk_size_w, tid, tcnt);
                    }
                    filterCopy::apply_same(mask_ptr, buffer_ptr, chunk_ptr, cmp_b, chunk_size_b / tcnt);
                }
            }
            trt->stop_timer(1, tid * gcnt + gid);
            bt->timed_wait(*(*sync_barrier)[barrier_idx], 1, tid * gcnt + gid);
        }
        (*(*sync_barrier)[barrier_idx]).arrive_and_drop();
    }
    void scan_a(size_t gid, size_t gcnt, size_t tid) {
        size_t tcnt = thread_count_fi;
        assert(chunk_size_w % tcnt == 0);
        assert(chunk_size_w % 16   == 0);
        assert(chunk_size_w % tcnt * 16 == 0);
        // wait till everyone can start
        ready_future->wait();
        // the lower gids run once more if the chunks are not evenly distributable
        uint32_t runs = chunk_cnt / gcnt + (chunk_cnt % gcnt > gid);
        uint32_t barrier_idx = barrier_mode.compare("global") == 0 ? 0 : gid;
        for(uint32_t i = 0; i < runs; ++i) {
            trt->start_timer(0, tid * gcnt + gid);
            // calculate pointers
            size_t chunk_id = gid + gcnt * i;
            base_t*  chunk_ptr = get_sub_chunk_ptr(data_a, chunk_id, chunk_size_w, tid, tcnt);
            uint16_t* mask_ptr = get_sub_mask_ptr (mask_a, chunk_id, chunk_size_w, tid, tcnt);
            filter::apply_same(mask_ptr, nullptr, chunk_ptr, cmp_a, chunk_size_b / tcnt);
            trt->stop_timer(0, tid * gcnt + gid);
            bt->timed_wait(*(*sync_barrier)[barrier_idx], 0, tid * gcnt + gid);
        }
        (*(*sync_barrier)[barrier_idx]).arrive_and_drop();
    }
    void aggr_j(size_t gid, size_t gcnt, size_t tid) {
        size_t tcnt = thread_count_ag;
        // wait till everyone can start
        ready_future->wait();
        // calculate values
        __m512i aggregator = aggregation::OP::zero();
        // the lower gids run once more if the chunks are not evenly distributable
        uint32_t runs = chunk_cnt / gcnt + (chunk_cnt % gcnt > gid);
        uint32_t barrier_idx = barrier_mode.compare("global") == 0 ? 0 : gid;
        for(uint32_t i = 0; i < runs; ++i) {
            bt->timed_wait(*(*sync_barrier)[barrier_idx], 2, tid * gcnt + gid);
            trt->start_timer(2, tid * gcnt + gid);
            // calculate pointers
            size_t chunk_id = gid + gcnt * i;
            base_t* chunk_ptr;
            if(no_copy) {
                chunk_ptr = get_sub_chunk_ptr(data_b, chunk_id, chunk_size_w, tid, tcnt);
            } else {
                if constexpr(BUFFER_LIMIT==1) {   
                    chunk_ptr = get_sub_chunk_ptr(buffer_b[i % 2], gid, chunk_size_w, tid, tcnt);
                } else {
                    chunk_ptr = get_sub_chunk_ptr(*buffer_b, chunk_id, chunk_size_w, tid, tcnt);
                }
            }
            uint16_t* mask_ptr_a = get_sub_mask_ptr (mask_a, chunk_id, chunk_size_w, tid, tcnt);
            uint16_t* mask_ptr_b = get_sub_mask_ptr (mask_b, chunk_id, chunk_size_w, tid, tcnt);
            base_t tmp = _mm512_reduce_add_epi64(aggregator);
            if constexpr(simple){
                aggregator = aggregation::apply_masked(aggregator, chunk_ptr, mask_ptr_a, chunk_size_b / tcnt);
            } else {
                aggregator = aggregation::apply_masked(aggregator, chunk_ptr, mask_ptr_a, mask_ptr_b, chunk_size_b / tcnt);
            }
            trt->stop_timer(2, tid * gcnt + gid);
        }
        // so threads with more runs dont wait for finished threads
        (*(*sync_barrier)[barrier_idx]).arrive_and_drop();
        aggregation::happly(dest + (tid * gcnt + gid), aggregator);
    }
 };
--- a/qdp_project/src/benchmark/pipelines/MAX_scan_filter_pipe.h
+++ b/qdp_project/src/benchmark/pipelines/MAX_scan_filter_pipe.h
@ -0,0 +1,395 @@
 #include <cassert>
 #include <mutex>
 #include <cstring>
 #include <bitset>
 #include <algorithm>
 #include <numa.h>
 #include "filter.h"
 #include "aggregation.h"
 #include "vector_loader.h"
 #include "timer_utils.h"
 #include "barrier_utils.h"
 #include "measurement_utils.h"
 #include "execution_modes.h"
 #include "../../../thirdParty/dsa_offload/offloading-cacher/cache.hpp"
 template<typename base_t, bool simple>
 class Query_Wrapper {
 public:
    // sync
    std::shared_future<void>* ready_future;
    thread_runtime_timing* trt;
    barrier_timing* bt;
    pcm_value_collector* pvc;
 private:
    dsacache::Cache cache_;
    // numa
    uint32_t close_mem;
    uint32_t far_mem;
    // data
    size_t size_b;
    size_t chunk_size_b;
    size_t chunk_size_w;
    size_t chunk_cnt;
    base_t* data_a;
    base_t* data_b;
    base_t* dest;
    // ratios
    uint32_t thread_count_fc;
    uint32_t thread_count_fi;
    uint32_t thread_count_ag;
    uint32_t thread_group;
    // done bits
    volatile uint8_t* ready_flag_a;
    volatile uint8_t* ready_flag_b;
    std::mutex ready_a_m;
    std::mutex ready_b_m;
    // buffer
    uint16_t* mask_a;
    uint16_t* mask_b;
    // params
    base_t cmp_a;
    base_t cmp_b;
    NewPMode mode;
    // sync
    std::unique_ptr<std::vector<std::barrier<barrier_completion_function>*>> sync_barrier;
    std::string barrier_mode = BARRIER_MODE;
    using filterCopy   = Filter<base_t, LT, load_mode::Stream, true>;
    using filterNoCopy = Filter<base_t, LT, load_mode::Stream, false>;
    using filter       = Filter<base_t, LT, load_mode::Stream, false>;
    using aggregation  = Aggregation<base_t, Sum, load_mode::Stream>;
    void InitCache(const std::string& device) {
        if (device == "default") {
            static const auto cache_policy = [](const int numa_dst_node, const int numa_src_node, const size_t data_size) {
                return numa_dst_node;
            };
            static const auto copy_policy = [](const int numa_dst_node, const int numa_src_node) {
                return std::vector<int>{ numa_src_node, numa_dst_node };
            };
            cache_.Init(cache_policy,copy_policy);
        }
        else if (device == "xeonmax") {
            static const auto cache_policy = [](const int numa_dst_node, const int numa_src_node, const size_t data_size) {
                return numa_dst_node < 8 ? numa_dst_node + 8 : numa_dst_node;
            };
            static const auto copy_policy = [](const int numa_dst_node, const int numa_src_node) {
                const bool same_socket = ((numa_dst_node ^ numa_src_node) & 4) == 0;
                if (same_socket) {
                    const bool socket_number = numa_dst_node >> 2;
                    if (socket_number == 0) return std::vector<int>{ 0, 1, 2, 3 };
                    else return std::vector<int>{ 4, 5, 6, 7 };
                }
                else return std::vector<int>{ numa_src_node, numa_dst_node };
            };
            cache_.Init(cache_policy,copy_policy);
        }
        else {
            std::cerr << "Given device '" << device << "' not supported!" << std::endl;
            exit(-1);
        }
    }
 public: 
    Query_Wrapper(std::shared_future<void>* rdy_fut, size_t workload_b, size_t chunk_size_b, base_t* data_a, 
                  base_t* data_b, base_t* dest, uint32_t numa_close, uint32_t numa_far, uint32_t tc_fi, uint32_t tc_fc, uint32_t tc_ag, 
                  NewPMode mode, uint32_t thread_group, base_t cmp_a = 50, base_t cmp_b = 42) :
                  ready_future(rdy_fut), size_b(workload_b), chunk_size_b(chunk_size_b), data_a(data_a), data_b(data_b), 
                  dest(dest), close_mem(numa_close), far_mem(numa_far), mode(mode), thread_group(thread_group), cmp_a(cmp_a), cmp_b(cmp_b){
        chunk_size_w = chunk_size_b / sizeof(base_t);
        chunk_cnt = size_b / chunk_size_b;
        thread_count_fi = tc_fi;
        thread_count_fc = tc_fc;
        thread_count_ag = tc_ag;
        ready_flag_a = (volatile uint8_t *) numa_alloc_onnode(
            chunk_cnt * thread_count_fi / 8 + ((chunk_cnt * thread_count_fi % 8) != 0), close_mem);
        ready_flag_b = (volatile uint8_t *) numa_alloc_onnode(
            chunk_cnt * thread_count_fc / 8 + ((chunk_cnt * thread_count_fc % 8) != 0), close_mem);
        mask_a = (uint16_t *) numa_alloc_onnode(size_b / sizeof(base_t), close_mem);
        mask_b = (uint16_t *) numa_alloc_onnode(size_b / sizeof(base_t), close_mem);
        InitCache("xeonmax");
        size_t measurement_space = THREAD_GROUP_MULTIPLIER * std::max(std::max(tc_fi, tc_fc), tc_ag);
        trt = new thread_runtime_timing(3, measurement_space, far_mem);
        bt = new barrier_timing(3, measurement_space, far_mem);
        pvc = new pcm_value_collector({"scan_a", "scan_b", "aggr_j"}, measurement_space, far_mem);
        reset_barriers();
    };
    void reset_barriers(){
        if(sync_barrier != nullptr) {
            for(auto& barrier : *sync_barrier) {
                delete barrier;
            }
            sync_barrier.reset();
        }
        sync_barrier = std::make_unique<std::vector<std::barrier<barrier_completion_function>*>>(thread_group);
        uint32_t thread_count_sum = thread_count_ag + thread_count_fi + thread_count_fc;
        uint32_t barrier_count = barrier_mode.compare("global") == 0 ? 1 : thread_group;
        uint32_t barrier_thread_count;
        if constexpr(simple){
            barrier_thread_count = (thread_group / barrier_count) *
                                        (mode == NewPMode::Prefetch ? thread_count_sum : (thread_count_ag + thread_count_fi));
        } else {
            barrier_thread_count = (thread_group / barrier_count) * thread_count_sum;
        }
        for(uint32_t i = 0; i < barrier_count; ++i) {
            (*sync_barrier)[i] = new std::barrier<barrier_completion_function>(barrier_thread_count);
        }
    }
    void clear_buffers () {
        std::memset((void*)ready_flag_a, 0x00, chunk_cnt * thread_count_fi / 8 + ((chunk_cnt * thread_count_fi % 8) != 0));
        std::memset((void*)ready_flag_b, 0x00, chunk_cnt * thread_count_fc / 8 + ((chunk_cnt * thread_count_fc % 8) != 0));
        std::memset(mask_a, 0x00, size_b / sizeof(base_t));
        std::memset(mask_b, 0x00, size_b / sizeof(base_t));
        cache_.Clear();
        trt->reset_accumulator();
        bt->reset_accumulator();
        pvc->reset();
        reset_barriers();
    };
    ~Query_Wrapper() {
        numa_free((void*)ready_flag_a,
            chunk_cnt * thread_count_fi / 8 + ((chunk_cnt * thread_count_fi % 8) != 0));
        numa_free((void*)ready_flag_b, 
            chunk_cnt * thread_count_fc / 8 + ((chunk_cnt * thread_count_fc % 8) != 0));
        numa_free(mask_a, size_b / sizeof(base_t));
        numa_free(mask_b, size_b / sizeof(base_t));
        delete trt;
        for(auto& barrier : *sync_barrier) {
            delete barrier;
        }
        delete bt;
        delete pvc;
    };
    //this can be set without need to change allocations
    void set_thread_group_count(uint32_t value) {
        this->thread_group = value;
    };
 private:
    static inline base_t* get_sub_chunk_ptr(base_t* base_ptr, size_t chunk_id, size_t chunk_size_w, size_t tid, 
                                            size_t tcnt) {
        base_t* chunk_ptr = base_ptr + chunk_id * chunk_size_w;
        return chunk_ptr + tid * (chunk_size_w / tcnt);
    }
    static inline uint16_t* get_sub_mask_ptr(uint16_t* base_ptr, size_t chunk_id, size_t chunk_size_w, size_t tid, 
                                            size_t tcnt) {
        // 16 integer are addressed with one uint16_t in mask buffer
        size_t offset = chunk_id * chunk_size_w + tid * (chunk_size_w / tcnt);
        return base_ptr + (offset / 16);
    }
    static bool bit_at(volatile uint8_t* bitmap, uint32_t bitpos) {
        uint8_t value = bitmap[bitpos / 8];
        switch(bitpos % 8) {
        case 0: return value & 0b00000001;
        case 1: return value & 0b00000010;
        case 2: return value & 0b00000100;
        case 3: return value & 0b00001000;
        case 4: return value & 0b00010000;
        case 5: return value & 0b00100000;
        case 6: return value & 0b01000000;
        case 7: return value & 0b10000000;
        default: return false;
        }
    }
    static void set_bit_at(volatile uint8_t* bitmap, std::mutex& mutex, uint32_t bitpos) {
        mutex.lock();
        switch(bitpos % 8) {
        case 0: bitmap[bitpos / 8] |= 0b00000001;break;
        case 1: bitmap[bitpos / 8] |= 0b00000010;break;
        case 2: bitmap[bitpos / 8] |= 0b00000100;break;
        case 3: bitmap[bitpos / 8] |= 0b00001000;break;
        case 4: bitmap[bitpos / 8] |= 0b00010000;break;
        case 5: bitmap[bitpos / 8] |= 0b00100000;break;
        case 6: bitmap[bitpos / 8] |= 0b01000000;break;
        case 7: bitmap[bitpos / 8] |= 0b10000000;break;
        }
        mutex.unlock();
    }
 public:
    void scan_b(size_t gid, size_t gcnt, size_t tid) {
        size_t tcnt = thread_count_fc;
        assert(chunk_size_w % tcnt == 0);
        assert(chunk_size_w % 16   == 0);
        assert(chunk_size_w % tcnt * 16 == 0);
        // wait till everyone can start
        ready_future->wait();
        // the lower gids run once more if the chunks are not evenly distributable
        uint32_t runs = chunk_cnt / gcnt + (chunk_cnt % gcnt > gid);
        uint32_t barrier_idx = barrier_mode.compare("global") == 0 ? 0 : gid;
        for(uint32_t i = 0; i < runs; ++i) {
            trt->start_timer(1, tid * gcnt + gid);
            pvc->start("scan_b", tid * gcnt + gid);
            // calculate pointers
            size_t chunk_id = gid + gcnt * i;
            base_t*  chunk_ptr = get_sub_chunk_ptr(data_b, chunk_id, chunk_size_w, tid, tcnt);
            uint16_t* mask_ptr = get_sub_mask_ptr(mask_b, chunk_id, chunk_size_w, tid, tcnt);
            if constexpr(simple){
                cache_.Access(chunk_ptr, chunk_size_b / tcnt);
            } else {
                const auto data = cache_.Access(chunk_ptr, chunk_size_b / tcnt);
                // wait on copy to complete - during this time other threads may
                // continue with their calculation which leads to little impact
                // and we will be faster if the cache is used
                data->WaitOnCompletion();
                // obtain the data location from the cache entry
                base_t* data_ptr = data->GetDataLocation();
                // nullptr is still a legal return value for CacheData::GetLocation()
                // even after waiting, so this must be checked
                if (data_ptr == nullptr) {
                    data_ptr = chunk_ptr;
                }
                filterNoCopy::apply_same(mask_ptr, nullptr, data_ptr, cmp_b, chunk_size_b / tcnt);
            }
            pvc->stop("scan_b", tid * gcnt + gid);
            trt->stop_timer(1, tid * gcnt + gid);
            bt->timed_wait(*(*sync_barrier)[barrier_idx], 1, tid * gcnt + gid);
        }
        (*(*sync_barrier)[barrier_idx]).arrive_and_drop();
    }
    void scan_a(size_t gid, size_t gcnt, size_t tid) {
        size_t tcnt = thread_count_fi;
        assert(chunk_size_w % tcnt == 0);
        assert(chunk_size_w % 16   == 0);
        assert(chunk_size_w % tcnt * 16 == 0);
        // wait till everyone can start
        ready_future->wait();
        // the lower gids run once more if the chunks are not evenly distributable
        uint32_t runs = chunk_cnt / gcnt + (chunk_cnt % gcnt > gid);
        uint32_t barrier_idx = barrier_mode.compare("global") == 0 ? 0 : gid;
        for(uint32_t i = 0; i < runs; ++i) {
            trt->start_timer(0, tid * gcnt + gid);
            pvc->start("scan_a", tid * gcnt + gid);
            // calculate pointers
            size_t chunk_id = gid + gcnt * i;
            base_t*  chunk_ptr = get_sub_chunk_ptr(data_a, chunk_id, chunk_size_w, tid, tcnt);
            uint16_t* mask_ptr = get_sub_mask_ptr (mask_a, chunk_id, chunk_size_w, tid, tcnt);
            filter::apply_same(mask_ptr, nullptr, chunk_ptr, cmp_a, chunk_size_b / tcnt);
            pvc->stop("scan_a", tid * gcnt + gid);
            trt->stop_timer(0, tid * gcnt + gid);
            bt->timed_wait(*(*sync_barrier)[barrier_idx], 0, tid * gcnt + gid);
        }
        (*(*sync_barrier)[barrier_idx]).arrive_and_drop();
    }
    void aggr_j(size_t gid, size_t gcnt, size_t tid) {
        size_t tcnt = thread_count_ag;
        // wait till everyone can start
        ready_future->wait();
        // calculate values
        __m512i aggregator = aggregation::OP::zero();
        // the lower gids run once more if the chunks are not evenly distributable
        uint32_t runs = chunk_cnt / gcnt + (chunk_cnt % gcnt > gid);
        uint32_t barrier_idx = barrier_mode.compare("global") == 0 ? 0 : gid;
        for(uint32_t i = 0; i < runs; ++i) {
            bt->timed_wait(*(*sync_barrier)[barrier_idx], 2, tid * gcnt + gid);
            trt->start_timer(2, tid * gcnt + gid);
            pvc->start("aggr_j", tid * gcnt + gid);
            // calculate pointers
            size_t chunk_id = gid + gcnt * i;
            const base_t* chunk_ptr = get_sub_chunk_ptr(data_b, chunk_id, chunk_size_w, tid, tcnt);
            // access the cache for the given chunk which will have been accessed in scan_b
            const auto data = cache_.Access(chunk_ptr, chunk_size_b / tcnt);
            // wait on the caching task to complete, this will give time for other processes
            // to make progress here which will therefore not hurt performance
            data->WaitOnCompletion();
            // after the copy task has finished we obtain the pointer to the cached
            // copy of data_b which is then used from now on
            const base_t* data_ptr = data->GetDataLocation();
            // nullptr is still a legal return value for CacheData::GetLocation()
            // even after waiting, so this must be checked
            if (data_ptr == nullptr) {
                data_ptr = chunk_ptr;
                std::cerr << "Cache Miss" << std::endl;
            }
            uint16_t* mask_ptr_a = get_sub_mask_ptr (mask_a, chunk_id, chunk_size_w, tid, tcnt);
            uint16_t* mask_ptr_b = get_sub_mask_ptr (mask_b, chunk_id, chunk_size_w, tid, tcnt);
            base_t tmp = _mm512_reduce_add_epi64(aggregator);
            if constexpr(simple){
                aggregator = aggregation::apply_masked(aggregator, data_ptr, mask_ptr_a, chunk_size_b / tcnt);
            } else {
                aggregator = aggregation::apply_masked(aggregator, data_ptr, mask_ptr_a, mask_ptr_b, chunk_size_b / tcnt);
            }
            pvc->stop("aggr_j", tid * gcnt + gid);
            trt->stop_timer(2, tid * gcnt + gid);
        }
        // so threads with more runs dont wait for alerady finished threads
        (*(*sync_barrier)[barrier_idx]).arrive_and_drop();
        aggregation::happly(dest + (tid * gcnt + gid), aggregator);
    }
 };
--- a/qdp_project/src/benchmark/pipelines/scan_filter_pipe.h
+++ b/qdp_project/src/benchmark/pipelines/scan_filter_pipe.h
@ -0,0 +1,387 @@
 #include <cassert>
 #include <mutex>
 #include <cstring>
 #include <bitset>
 #include <numa.h>
 #include "filter.h"
 #include "aggregation.h"
 #include "vector_loader.h"
 #include "timer_utils.h"
 #include "barrier_utils.h"
 #include "execution_modes.h"
 template<typename base_t, bool simple>
 class Query_Wrapper {
 public:
    // sync
    std::shared_future<void>* ready_future;
    thread_runtime_timing* trt;
    barrier_timing* bt;
 private:
    // numa
    uint32_t close_mem;
    uint32_t far_mem;
    // data
    size_t size_b;
    size_t chunk_size_b;
    size_t chunk_size_w;
    size_t chunk_cnt;
    base_t* data_a;
    base_t* data_b;
    base_t* dest;
    // ratios
    uint32_t thread_count_fc;
    uint32_t thread_count_fi;
    uint32_t thread_count_ag;
    uint32_t thread_group;
    // done bits
    volatile uint8_t* ready_flag_a;
    volatile uint8_t* ready_flag_b;
    std::mutex ready_a_m;
    std::mutex ready_b_m;
    // buffer
    uint16_t* mask_a;
    uint16_t* mask_b;
    base_t** buffer_b;
    // params
    base_t cmp_a;
    base_t cmp_b;
    bool no_copy;
    PMode mode;
    // sync
    std::unique_ptr<std::vector<std::barrier<barrier_completion_function>*>> sync_barrier;
    std::string barrier_mode = BARRIER_MODE;
    using filterCopy   = Filter<base_t, LEQ, load_mode::Aligned, true>;
    using filterNoCopy = Filter<base_t, LEQ, load_mode::Aligned, false>;
    using filter       = Filter<base_t, GEQ, load_mode::Aligned, false>;
    using aggregation  = Aggregation<base_t, Sum, load_mode::Aligned>;
 public: 
    Query_Wrapper(std::shared_future<void>* rdy_fut, size_t workload_b, size_t chunk_size_b, base_t* data_a, 
                  base_t* data_b, base_t* dest, uint32_t numa_close, uint32_t numa_far, uint32_t tc_fi, uint32_t tc_fc, uint32_t tc_ag, 
                  PMode mode, uint32_t thread_group, base_t cmp_a = 50, base_t cmp_b = 42, bool no_copy = false) :
                  ready_future(rdy_fut), size_b(workload_b), chunk_size_b(chunk_size_b), data_a(data_a), data_b(data_b), 
                  dest(dest), close_mem(numa_close), far_mem(numa_far), mode(mode), thread_group(thread_group), cmp_a(cmp_a), cmp_b(cmp_b), no_copy(no_copy){
        chunk_size_w = chunk_size_b / sizeof(base_t);
        chunk_cnt = size_b / chunk_size_b;
        thread_count_fi = tc_fi;
        thread_count_fc = tc_fc;
        thread_count_ag = tc_ag;
        ready_flag_a = (volatile uint8_t *) numa_alloc_onnode(
            chunk_cnt * thread_count_fi / 8 + ((chunk_cnt * thread_count_fi % 8) != 0), close_mem);
        ready_flag_b = (volatile uint8_t *) numa_alloc_onnode(
            chunk_cnt * thread_count_fc / 8 + ((chunk_cnt * thread_count_fc % 8) != 0), close_mem);
        mask_a = (uint16_t *) numa_alloc_onnode(size_b / sizeof(base_t), close_mem);
        mask_b = (uint16_t *) numa_alloc_onnode(size_b / sizeof(base_t), close_mem);
        trt = new thread_runtime_timing(4, 20, close_mem);
        bt = new barrier_timing(4, 20, close_mem);
        reset_barriers();
        if constexpr(BUFFER_LIMIT==1) {
            // TODO size ok like that?
            buffer_b = (base_t**) numa_alloc_onnode(size_b * sizeof(base_t*), close_mem);
            buffer_b[0] = (base_t*) numa_alloc_onnode(thread_group * chunk_size_b, close_mem);
            buffer_b[1] = (base_t*) numa_alloc_onnode(thread_group * chunk_size_b, close_mem);
        } else {
            buffer_b = (base_t **) numa_alloc_onnode(sizeof(base_t*), close_mem);
            base_t* buffer_tmp = (base_t *) numa_alloc_onnode(size_b, close_mem);
            *buffer_b = buffer_tmp;
        }
    };
    void reset_barriers(){
        if(sync_barrier != nullptr) {
            for(auto& barrier : *sync_barrier) {
                delete barrier;
            }
            sync_barrier.reset();
        }
        sync_barrier = std::make_unique<std::vector<std::barrier<barrier_completion_function>*>>(thread_group);
        uint32_t thread_count_sum = thread_count_ag + thread_count_fi + thread_count_fc;
        uint32_t barrier_count = barrier_mode.compare("global") == 0 ? 1 : thread_group;
        uint32_t barrier_thread_count;
        if constexpr(simple){
            barrier_thread_count = (thread_group / barrier_count) *
                                        (mode == PMode::expl_copy ? thread_count_sum : (thread_count_ag + thread_count_fi));
        } else {
            barrier_thread_count = (thread_group / barrier_count) * thread_count_sum;
        }
        for(uint32_t i = 0; i < barrier_count; ++i) {
            (*sync_barrier)[i] = new std::barrier<barrier_completion_function>(barrier_thread_count);
        }
    }
    void clear_buffers () {
        std::memset((void*)ready_flag_a, 0x00, chunk_cnt * thread_count_fi / 8 + ((chunk_cnt * thread_count_fi % 8) != 0));
        std::memset((void*)ready_flag_b, 0x00, chunk_cnt * thread_count_fc / 8 + ((chunk_cnt * thread_count_fc % 8) != 0));
        std::memset(mask_a, 0x00, size_b / sizeof(base_t));
        std::memset(mask_b, 0x00, size_b / sizeof(base_t));
        if constexpr(BUFFER_LIMIT==1) {
            std::memset(buffer_b[0], 0x00, thread_group * chunk_size_b);
            std::memset(buffer_b[1], 0x00, thread_group * chunk_size_b);
        } else {
            std::memset(*buffer_b, 0x00, size_b);
        }
        trt->reset_accumulator();
        bt->reset_accumulator();
        reset_barriers();
    };
    ~Query_Wrapper() {
        numa_free((void*)ready_flag_a,
            chunk_cnt * thread_count_fi / 8 + ((chunk_cnt * thread_count_fi % 8) != 0));
        numa_free((void*)ready_flag_b, 
            chunk_cnt * thread_count_fc / 8 + ((chunk_cnt * thread_count_fc % 8) != 0));
        numa_free(mask_a, size_b / sizeof(base_t));
        numa_free(mask_b, size_b / sizeof(base_t));
        if constexpr(BUFFER_LIMIT==1) {
            numa_free(buffer_b[0], thread_group * chunk_size_b);
            numa_free(buffer_b[1], thread_group * chunk_size_b);
            numa_free(buffer_b, size_b * sizeof(base_t*));
        } else {
            numa_free(*buffer_b, size_b);
        }
        delete trt;
        for(auto& barrier : *sync_barrier) {
            delete barrier;
        }
        delete bt;
    };
 private:
    static inline base_t* get_sub_chunk_ptr(base_t* base_ptr, size_t chunk_id, size_t chunk_size_w, size_t tid, 
                                            size_t tcnt) {
        base_t* chunk_ptr = base_ptr + chunk_id * chunk_size_w;
        return chunk_ptr + tid * (chunk_size_w / tcnt);
    }
    static inline uint16_t* get_sub_mask_ptr(uint16_t* base_ptr, size_t chunk_id, size_t chunk_size_w, size_t tid, 
                                            size_t tcnt) {
        // 16 integer are addressed with one uint16_t in mask buffer
        size_t offset = chunk_id * chunk_size_w + tid * (chunk_size_w / tcnt);
        return base_ptr + (offset / 16);
    }
    static bool bit_at(volatile uint8_t* bitmap, uint32_t bitpos) {
        uint8_t value = bitmap[bitpos / 8];
        switch(bitpos % 8) {
        case 0: return value & 0b00000001;
        case 1: return value & 0b00000010;
        case 2: return value & 0b00000100;
        case 3: return value & 0b00001000;
        case 4: return value & 0b00010000;
        case 5: return value & 0b00100000;
        case 6: return value & 0b01000000;
        case 7: return value & 0b10000000;
        default: return false;
        }
    }
    static void set_bit_at(volatile uint8_t* bitmap, std::mutex& mutex, uint32_t bitpos) {
        mutex.lock();
        switch(bitpos % 8) {
        case 0: bitmap[bitpos / 8] |= 0b00000001;break;
        case 1: bitmap[bitpos / 8] |= 0b00000010;break;
        case 2: bitmap[bitpos / 8] |= 0b00000100;break;
        case 3: bitmap[bitpos / 8] |= 0b00001000;break;
        case 4: bitmap[bitpos / 8] |= 0b00010000;break;
        case 5: bitmap[bitpos / 8] |= 0b00100000;break;
        case 6: bitmap[bitpos / 8] |= 0b01000000;break;
        case 7: bitmap[bitpos / 8] |= 0b10000000;break;
        }
        mutex.unlock();
    }
 public:
    static base_t checksum(base_t* a, base_t* b, base_t cmp_a, base_t cmp_b, size_t size_b) {
        base_t sum = 0;
        for(int i = 0; i < size_b / sizeof(base_t); ++i) {
            if(a[i] >= cmp_a && b[i] <= cmp_b) {
                sum += b[i];
            }
        }
        return sum;
    }
    static void checkmask(uint16_t* mask, base_t cmp, base_t* data, size_t size_b, bool leq) {
        uint32_t cnt = 0;
        for(int i = 0; i < size_b / sizeof(base_t); ++i) {
            if(leq) {
                if(((data[i] <= cmp) != bit_at((uint8_t*)mask, i))) {
                    ++cnt;
                } 
            } else {
                if(((data[i] >= cmp) != bit_at((uint8_t*)mask, i))) {
                    ++cnt;
                } 
            }
        }
    }
    static void checkmask_16(uint16_t* mask, base_t cmp, base_t* data, size_t size_b, bool leq) {
        for(int i = 0; i < size_b / sizeof(base_t) / 16 ; ++i) {
            std::bitset<16> m(mask[i]);
            uint16_t ch = 0;
            for(int j = 0; j < 16; ++j) {
                if(data[i*16 + j] <= cmp) {
                    ch |=  0x1 << j;
                }
            }
            std::bitset<16> c(ch);
            std::cout << "act " << m << std::endl;
            std::cout << "rea " << c << std::endl << std::endl;
        }
    }
    void scan_b(size_t gid, size_t gcnt, size_t tid) {
        size_t tcnt = thread_count_fc;
        assert(chunk_size_w % tcnt == 0);
        assert(chunk_size_w % 16   == 0);
        assert(chunk_size_w % tcnt * 16 == 0);
        // wait till everyone can start
        ready_future->wait();
        // the lower gids run once more if the chunks are not evenly distributable
        uint32_t runs = chunk_cnt / gcnt + (chunk_cnt % gcnt > gid);
        uint32_t barrier_idx = barrier_mode.compare("global") == 0 ? 0 : gid;
        for(uint32_t i = 0; i < runs; ++i) {
            trt->start_timer(1, tid * gcnt + gid);
            // calculate pointers
            size_t chunk_id = gid + gcnt * i;
            base_t*  chunk_ptr = get_sub_chunk_ptr(data_b  , chunk_id, chunk_size_w, tid, tcnt);
            uint16_t* mask_ptr = get_sub_mask_ptr (mask_b  , chunk_id, chunk_size_w, tid, tcnt);
            if constexpr(simple){
                base_t* buffer_ptr;
                if constexpr(BUFFER_LIMIT==1) {
                    buffer_ptr = get_sub_chunk_ptr(buffer_b[i % 2], gid, chunk_size_w, tid, tcnt);
                } else {
                    buffer_ptr = get_sub_chunk_ptr(*buffer_b, chunk_id, chunk_size_w, tid, tcnt);
                }
                std::memcpy(buffer_ptr, chunk_ptr, chunk_size_b / tcnt);
            } else {
                if(no_copy) {
                    filterNoCopy::apply_same(mask_ptr, nullptr, chunk_ptr, cmp_b, chunk_size_b / tcnt);
                } else {
                    base_t* buffer_ptr;
                    if constexpr(BUFFER_LIMIT==1) {
                        buffer_ptr = get_sub_chunk_ptr(buffer_b[i % 2], gid, chunk_size_w, tid, tcnt);
                    } else {
                        buffer_ptr = get_sub_chunk_ptr(*buffer_b, chunk_id, chunk_size_w, tid, tcnt);
                    }
                    filterCopy::apply_same(mask_ptr, buffer_ptr, chunk_ptr, cmp_b, chunk_size_b / tcnt);
                }
            }
            trt->stop_timer(1, tid * gcnt + gid);
            bt->timed_wait(*(*sync_barrier)[barrier_idx], 1, tid * gcnt + gid);
        }
        (*(*sync_barrier)[barrier_idx]).arrive_and_drop();
    }
    void scan_a(size_t gid, size_t gcnt, size_t tid) {
        size_t tcnt = thread_count_fi;
        assert(chunk_size_w % tcnt == 0);
        assert(chunk_size_w % 16   == 0);
        assert(chunk_size_w % tcnt * 16 == 0);
        // wait till everyone can start
        ready_future->wait();
        // the lower gids run once more if the chunks are not evenly distributable
        uint32_t runs = chunk_cnt / gcnt + (chunk_cnt % gcnt > gid);
        uint32_t barrier_idx = barrier_mode.compare("global") == 0 ? 0 : gid;
        for(uint32_t i = 0; i < runs; ++i) {
            trt->start_timer(0, tid * gcnt + gid);
            // calculate pointers
            size_t chunk_id = gid + gcnt * i;
            base_t*  chunk_ptr = get_sub_chunk_ptr(data_a, chunk_id, chunk_size_w, tid, tcnt);
            uint16_t* mask_ptr = get_sub_mask_ptr (mask_a, chunk_id, chunk_size_w, tid, tcnt);
            filter::apply_same(mask_ptr, nullptr, chunk_ptr, cmp_a, chunk_size_b / tcnt);
            trt->stop_timer(0, tid * gcnt + gid);
            bt->timed_wait(*(*sync_barrier)[barrier_idx], 0, tid * gcnt + gid);
        }
        (*(*sync_barrier)[barrier_idx]).arrive_and_drop();
    }
    void aggr_j(size_t gid, size_t gcnt, size_t tid) {
        size_t tcnt = thread_count_ag;
        // wait till everyone can start
        ready_future->wait();
        // calculate values
        __m512i aggregator = aggregation::OP::zero();
        // the lower gids run once more if the chunks are not evenly distributable
        uint32_t runs = chunk_cnt / gcnt + (chunk_cnt % gcnt > gid);
        uint32_t barrier_idx = barrier_mode.compare("global") == 0 ? 0 : gid;
        for(uint32_t i = 0; i < runs; ++i) {
            bt->timed_wait(*(*sync_barrier)[barrier_idx], 2, tid * gcnt + gid);
            trt->start_timer(2, tid * gcnt + gid);
            // calculate pointers
            size_t chunk_id = gid + gcnt * i;
            base_t* chunk_ptr;
            if(no_copy) {
                chunk_ptr = get_sub_chunk_ptr(data_b, chunk_id, chunk_size_w, tid, tcnt);
            } else {
                if constexpr(BUFFER_LIMIT==1) {
                    chunk_ptr = get_sub_chunk_ptr(buffer_b[i%2], gid, chunk_size_w, tid, tcnt);
                } else {
                    chunk_ptr = get_sub_chunk_ptr(*buffer_b, chunk_id, chunk_size_w, tid, tcnt);
                }
            }
            uint16_t* mask_ptr_a = get_sub_mask_ptr (mask_a, chunk_id, chunk_size_w, tid, tcnt);
            uint16_t* mask_ptr_b = get_sub_mask_ptr (mask_b, chunk_id, chunk_size_w, tid, tcnt);
            base_t tmp = _mm512_reduce_add_epi64(aggregator);
            if constexpr(simple){
                aggregator = aggregation::apply_masked(aggregator, chunk_ptr, mask_ptr_a, chunk_size_b / tcnt);
            } else {
                aggregator = aggregation::apply_masked(aggregator, chunk_ptr, mask_ptr_a, mask_ptr_b, chunk_size_b / tcnt);
            }
            trt->stop_timer(2, tid * gcnt + gid);
        }
        // so threads with more runs dont wait for finished threads
        (*(*sync_barrier)[barrier_idx]).arrive_and_drop();
        aggregation::happly(dest + (tid * gcnt + gid), aggregator);
    }
 };
--- a/qdp_project/src/utils/array_utils.h
+++ b/qdp_project/src/utils/array_utils.h
@ -0,0 +1,80 @@
 #pragma once
 #include <cstdlib>
 #include <ctime>
 #include <cstdint>
 #include <type_traits>
 #include <random>
 #include <chrono>
 #include <immintrin.h>
 /// @brief Fills a given array with random generated integers.
 /// @tparam base_t Datatype of the array
 /// @param dest Pointer to the array
 /// @param size Size of the array
 /// @param min Minumum value of the generated integers
 /// @param max Maximum value of the generated integers
 template<typename base_t>
 void fill(base_t * dest, uint64_t size, base_t min, base_t max) {
    std::srand(std::time(nullptr));
    for(uint64_t i = 0; i < size/sizeof(base_t); ++i) {
        dest[i] = (std::rand() % (max - min)) + min;
    }
 }
 /// @brief Fills a given array with random generated integers using the mersenne twister engine (type std::mt19937).
 /// @tparam base_t Datatype of the array
 /// @param dest Pointer to the array
 /// @param size Size of the array
 /// @param min Minumum value of the generated integers
 /// @param max Maximum value of the generated integers
 template <typename T>
 void fill_mt(T* array, uint64_t size, T min, T max, uint64_t int_seed = 0) {
 	static_assert(std::is_integral<T>::value, "Data type is not integral.");
    size = size / sizeof(T);
    std::mt19937::result_type seed;
    if (int_seed == 0) {
        std::random_device rd;
        seed = rd() ^ (
            (std::mt19937::result_type) std::chrono::duration_cast<std::chrono::seconds>(
                std::chrono::system_clock::now().time_since_epoch()).count() + 
            (std::mt19937::result_type) std::chrono::duration_cast<std::chrono::microseconds>(
                std::chrono::high_resolution_clock::now().time_since_epoch()).count());
    } else seed = int_seed;
    std::mt19937 gen(seed);
    std::uniform_int_distribution<T> distrib(min, max);
    for (uint64_t j = 0; j < size; ++j) {
        array[j] = distrib(gen);
    }
 }
 /**
 * @brief Checks if two arrays of the integral type *T* contain the same values
 * 
 * @tparam T Integral type of *array0* and *array1*
 * @param array0 Array 0 to check
 * @param array1 Array 1 to check
 * @param size_b Size of the two arrays in byte
 * @param verbose Decides if outputs are verbose of not (print every not matching numbers with their index)
 * @return bool Weathor or not the content is equal or not
 */
 template <typename T>
 typename std::enable_if<std::is_integral<T>::value, bool>::type
        check_same(T* array0, T* array1, size_t size_b, bool verbose) {
    for(uint64_t i = 0; i <= size_b / sizeof(T); i += 64 / sizeof(T)) {
        __m512i vec0 = _mm512_stream_load_si512(array0 + i);
        __m512i vec1 = _mm512_stream_load_si512(array1 + i);
        __mmask8 res = _mm512_cmpeq_epi64_mask(vec0, vec1);
    }
    //TODO complete function
    return false;
 }
--- a/qdp_project/src/utils/barrier_utils.h
+++ b/qdp_project/src/utils/barrier_utils.h
@ -0,0 +1,73 @@
 #pragma once
 #include <cstdint>
 #include <numa.h>
 #include <barrier>
 #include <chrono>
 #define BARRIER_TIMINGS 1
 struct barrier_completion_function {
    inline void operator() () {
        return;
    }
 };
 struct barrier_timing {
    uint32_t time_points, time_threads;
    double** time_accumulator;
    barrier_timing(uint32_t timing_points, uint32_t timing_threads, uint32_t memory_node) {
 #ifdef BARRIER_TIMINGS
        time_points = timing_points;
        time_threads = timing_threads;
        time_accumulator =  (double**) numa_alloc_onnode(timing_points * sizeof(double*), memory_node);
        for(uint32_t i = 0; i < timing_points; ++i) {
            time_accumulator[i] = (double*) numa_alloc_onnode(timing_threads * sizeof(double), memory_node);
        }
 #endif
    }
    ~barrier_timing() {
 #ifdef BARRIER_TIMINGS
        for(uint32_t i = 0; i < time_points; ++i) {
            numa_free(time_accumulator[i], time_threads * sizeof(double));
        }
        numa_free(time_accumulator, time_points * sizeof(double*));
 #endif
    }
    void reset_accumulator() {
 #ifdef BARRIER_TIMINGS        
        for(uint32_t i = 0; i < time_points;  ++i){
        for(uint32_t j = 0; j < time_threads; ++j){
            time_accumulator[i][j] = 0.0;
        }}
 #endif
    }
    double summarize_time(uint32_t time_point) {
 #ifdef BARRIER_TIMINGS
        double sum = 0.0;
        for(uint32_t i = 0; i < time_threads; ++i) {
            sum += time_accumulator[time_point][i];
        }
        return sum;
 #endif
    }
    void timed_wait(std::barrier<struct barrier_completion_function>& barrier, uint32_t point_id, uint32_t thread_id) {
 #ifdef BARRIER_TIMINGS
        auto before_barrier = std::chrono::steady_clock::now();
 #endif
        barrier.arrive_and_wait();
 #ifdef BARRIER_TIMINGS
        auto after_barrier = std::chrono::steady_clock::now();
        uint64_t barrier_wait_time = std::chrono::duration_cast<std::chrono::nanoseconds>(after_barrier - before_barrier).count();
        double seconds = barrier_wait_time / (1000.0 * 1000.0 * 1000.0);
        time_accumulator[point_id][thread_id] += seconds;
 #endif    
    }
 };
--- a/qdp_project/src/utils/const.h
+++ b/qdp_project/src/utils/const.h
@ -0,0 +1,33 @@
 /**
 * @file const.h
 * @author André Berthold
 * @brief Defines handy constants.
 * @version 0.1
 * @date 2023-05-25
 * 
 * @copyright Copyright (c) 2023
 * 
 */
 #pragma once
 #include <cstdint>
 #include <immintrin.h>
 constexpr size_t VECTOR_SIZE_I = 512;
 constexpr size_t VECTOR_SIZE_B = VECTOR_SIZE_I / 8;
 constexpr size_t VECTOR_SIZE_H = VECTOR_SIZE_B / sizeof(uint32_t);
 constexpr size_t VECTOR_SIZE_W = VECTOR_SIZE_B / sizeof(uint64_t);
 template<typename T>
 constexpr size_t VECTOR_SIZE() {
    return VECTOR_SIZE_B / sizeof(T);
 }
 template<typename T>
 constexpr size_t V_MASK_SIZE() {
    return VECTOR_SIZE<T>() / 8;
 }
 const __mmask16 full_m16 = _mm512_int2mask(0xFFFF); 
--- a/qdp_project/src/utils/cpu_set_utils.h
+++ b/qdp_project/src/utils/cpu_set_utils.h
@ -0,0 +1,82 @@
 #pragma once
 #include <cstdint>
 #include <thread>
 #include <cassert>
 #include <iostream>
 #include <vector>
 #include <utility>
 /** Sets all bits in a given cpu_set_t between L and H (condition L <= H)*/
 #define CPU_BETWEEN(L, H, SET) assert(L <= H); for(; L < H; ++L) {CPU_SET(L, SET);}
 /**
 * Applies the affinity defined in set to the thread, through pthread library 
 * calls. If it fails it wites the problem to stderr and terminated the program.
 */
 inline void pin_thread(std::thread& thread, cpu_set_t* set) {
    int error_code = pthread_setaffinity_np(thread.native_handle(), sizeof(cpu_set_t), set);
    if (error_code != 0) {
        std::cerr << "Error calling pthread_setaffinity_np in copy_pool assignment: " << error_code << std::endl;
        exit(-1);
    }
 }
 /**
 * Returns the cpu id of the thread_id-th cpu in a given (multi)range. Thread_id
 * greater than the number of cpus in the (multi)range are valid. In this case
 * the (thread_id % #cpus in the range)-th cpu in the range is returned.
 */
 int get_cpu_id(int thread_id, const std::vector<std::pair<int, int>>& range) {
    int subrange_size = range[0].second - range[0].first;
    int i = 0;
    while(subrange_size <= thread_id) {
        thread_id -= subrange_size;
        i = (i + 1) % range.size();
        subrange_size = range[i].second - range[i].first;
    }
    return thread_id + range[i].first;
 }
 /*inline void cpu_set_between(cpu_set_t* set, uint32_t low, uint32_t high) {
    assert(low != high);
    if (low > high) std::swap(low, high);
    for(; low < high; ++low) {
        CPU_SET(low, set);
    }
 }*/
 /**
 * Pins the given thread to the thread_id-th cpu in the given range. 
 */
 void pin_thread_in_range(std::thread& thread, int thread_id, std::vector<std::pair<int, int>>& range) {
    cpu_set_t set; 
    CPU_ZERO(&set);
    CPU_SET(get_cpu_id(thread_id, range), &set);
    pin_thread(thread, &set);
 }
 /**
 * Pins the given thread to all cpus in the given range. 
 */
 void pin_thread_in_range(std::thread& thread, std::vector<std::pair<int, int>>& range) {
    cpu_set_t set;
    CPU_ZERO(&set);
    for(auto r : range) { CPU_BETWEEN(r.first, r.second, &set); }
    pin_thread(thread, &set);
 }
 /**
 * Pins the given thread to all cpu ids between low (incl.) and high (excl.).
 */
 inline void pin_thread_between(std::thread& thread, uint32_t low, uint32_t high) {
    cpu_set_t set;
    CPU_ZERO(&set);
    CPU_BETWEEN(low, high, &set);
    pin_thread(thread, &set);
 }
--- a/qdp_project/src/utils/execution_modes.h
+++ b/qdp_project/src/utils/execution_modes.h
@ -0,0 +1,89 @@
 #include <string> 
 enum PMode{no_copy = 0, hbm = 1, expl_copy = 2};
 struct mode_manager {
    static inline PMode inc(PMode value) {
        return static_cast<PMode>(value + 1);
    };
    static inline bool pred(PMode value) {
        return no_copy <= value && value <= expl_copy;
    };
    static std::string string(PMode value) {
        switch(value) {
            case no_copy:  return "no_copy";
            case hbm:      return "hbm_pre";
            case expl_copy:return "expl_co";
        } return "no_copy";
    };
 };
 #define SIMPLE_Q 0
 #define COMPLEX_Q 1
 #define SCAN_A 0
 #define SCAN_B 1
 #define AGGR_J 2
 enum NewPMode{DRAM_base = 0, HBM_base = 1, Mixed_base = 2, Prefetch = 3};
 struct new_mode_manager {
    /*constexpr static int thread_counts[2][4][3] = {
        //simple query
        //scan_a, scan_b, aggr_j
        {{3,      0,      3}, // DRAM_base
         {3,      0,      3}, // HBM_base
         {3,      0,      3}, // Mixed_base
         {1,      4,      1}},// Prefetching
        //complex query
        {{1,      4,      1}, // DRAM_base
         {1,      4,      1}, // HBM_base
         {1,      4,      1}, // Mixed_base
         {1,      4,      1}},// Prefetching
    };*/
    /*constexpr static int thread_counts[2][4][3] = {
        //simple query
        //scan_a, scan_b, aggr_j
        {{2,      0,      4}, // DRAM_base
         {2,      0,      4}, // HBM_base
         {2,      0,      4}, // Mixed_base
         {1,      4,      1}},// Prefetching
        //complex query
        {{1,      4,      1}, // DRAM_base
         {1,      4,      1}, // HBM_base
         {1,      4,      1}, // Mixed_base
         {1,      4,      1}},// Prefetching
    };*/
    constexpr static int thread_counts[2][4][3] = {
        //simple query
        //scan_a, scan_b, aggr_j
        {{4,      0,      2}, // DRAM_base
         {4,      0,      2}, // HBM_base
         {4,      0,      2}, // Mixed_base
         {1,      4,      1}},// Prefetching
        //complex query
        {{1,      4,      1}, // DRAM_base
         {1,      4,      1}, // HBM_base
         {1,      4,      1}, // Mixed_base
         {1,      4,      1}},// Prefetching
    };
    static inline NewPMode inc(NewPMode value) {
        return static_cast<NewPMode>(value + 1);
    };
    static inline bool pred(NewPMode value) {
        return DRAM_base <= value && value <= Prefetch;
    };
    static int thread_count(uint8_t query_type, NewPMode mode, uint8_t thread_type){
        if(query_type > 1) query_type = 1;
        if(thread_type > 2) thread_type = 2;
        return (thread_counts[query_type][mode][thread_type]);
    };
    static std::string string(NewPMode value) {
        switch(value) {
            case  DRAM_base: return "DRAM_Baseline";
            case   HBM_base: return "HBM_Baseline";
            case Mixed_base: return "DRAM_HBM_Baseline";
        }                    return "Q-d_Prefetching";
    };
 };
--- a/qdp_project/src/utils/file_output.h
+++ b/qdp_project/src/utils/file_output.h
@ -0,0 +1,76 @@
 /**
 * @file file_output.h
 * @author André Berthold
 * @brief Implements a template-function that accepts an arbitrary number of parameters that should be printed
 * @version 0.1
 * @date 2023-05-25
 * 
 * @copyright Copyright (c) 2023
 * 
 */
 #pragma once
 #include <fstream>
 #include <string>
 #include <type_traits>
 #include "iterable_range.h"
 template<class T>
 inline constexpr bool is_numeric_v = std::disjunction<
            std::is_integral<T>, 
            std::is_floating_point<T>>::value;
 /**
 * @brief Converts a parameter to a string by either  using it directly or its member current (if it is of type Labeled) 
 * as parameter to the std::string-Constructor.
 * 
 * @tparam T Type of the parameter
 * @param value Parameter to be converted
 * @return std::string The converted parameter
 */
 template<typename T>
 inline std::string to_string(T value) {
    if constexpr(std::is_base_of<Labeled, T>::value){
        // integrals cannot be use in the string constructor and must be translated by the std::to_string-function
        if constexpr (is_numeric_v<decltype(value.current)>) {
            return std::to_string(value.current);
        } else {
            return std::string(value.current);
        }
    } else {
        // integrals cannot be use in the string constructor and must be translated by the std::to_string-function
        if constexpr (is_numeric_v<decltype(value)>) {
            return std::to_string(value);
        } else {
            return std::string(value);
        }
    }
 }
 /**
 * @brief This function wites the content of *val* to *file*. Terminates terecursive function definition.
 * 
 * @tparam type Type of the paramter *val* (is usually implicitly defeined)
 * @param file File that is written to
 * @param val Value that is translated to a char stream and written to the file
 */
 template<typename type>
 inline void print_to_file(std::ofstream &file, type val) {
    file << to_string(val) << std::endl;
 }
 /**
 * @brief This function wites the content of *val* and that content if *vals* to *file*.
 * 
 * @tparam type Type of the paramter *val* (is usually implicitly defeined)
 * @tparam types Parameter pack that describes the types of *vals*
 * @param file File that is written to
 * @param val Value that is translated to a char stream and written to the file
 * @param vals Paramater pack of values that are gonna be printed to the file
 */
 template<typename type, typename... types>
 inline void print_to_file(std::ofstream &file, type val, types ... vals) {
    file << to_string(val) << ",";
    print_to_file(file, vals...);
 }
--- a/qdp_project/src/utils/iterable_range.h
+++ b/qdp_project/src/utils/iterable_range.h
@ -0,0 +1,208 @@
 #pragma once
 #include <cstdint>
 #include <type_traits>
 #include <string>
 constexpr auto NO_NEXT = "false";
 /**
 * @brief Class that adds an label member-parameter to a sub-class
 * 
 */
 class Labeled {
 public:
    std::string label;
 public:
    Labeled(std::string str) : label(str) {};
    Labeled(const char* str) { this->label = std::string(str); };
 };
 /**
 * @brief Converts a parameter to a string by either reading the member label (if it is of type Labeled) or using it 
 * as parameter to the std::string-Constructor.
 * 
 * @tparam T Type of the parameter
 * @param value Parameter to be converted
 * @return std::string The converted parameter
 */
 template<typename T>
 inline std::string generateHead(T value) {
    if constexpr(std::is_base_of<Labeled, T>::value){
        return value.label;
    } else {
        return std::string(value);
    }
 }
 /**
 * @brief Converts a parameter-pack to a string calling genarateHead(T) on every parameter and concatenatin the results.
 * 
 * @tparam T Type of the first parameter
 * @tparam Ts Parameter pack specifying the preceeding parameters' types
 * @param value Parameter to be transformed
 * @param values Parameter-pack of the next prameters to be transformed
 * @return std::string Comma-separated concatenation of all parameters string representation
 */
 template<typename T, typename... Ts>
 inline std::string generateHead(T value, Ts... values) {
    return generateHead(value) + ',' + generateHead(values...);
 }
 /**
 * @brief Takes a single Range object and calls its next function.
 * 
 * @tparam T Specific type of the Range object
 * @param t Instance of the Range object
 * @return std::string Label of the Range object or "false" if the Range reaced its end and was reset
 */
 template<typename T>
 std::string IterateOnce(T& t) {
    if(t.next()) return t.label;
    else t.reset();
    return std::string(NO_NEXT); //the string signalises that the iteration has to be terminiated.
 }
 /**
 * @brief Takes a number of Range objects and recusively increments them till the first Range does not reach its end
 * upon incrementing. It tarts at the first Range object given. Every Range object that reached its end is reset to 
 * its start value. 
 * 
 * @tparam T Specific type of the first Range object
 * @tparam Ts Types to the following Range objects
 * @param t First instance of the Range object
 * @param ts Parameter pack of the following Range objects
 * @return std::string Label of the highest index Range object that was altered, or "false" if the last Range object 
 * reache its end and was reset
 */
 template<typename T, typename... Ts>
 std::string IterateOnce(T& t , Ts&... ts) {
    if(t.next()) return t.label;
    else t.reset();
    return IterateOnce<Ts...>(ts...);
 }
 /**
 * @brief Class that provides a convenient interface for iteratin throug a parameter range. It stores a public value 
 * that can be altered by the classes' methods.
 * 
 * @tparam T Base type of the parameter
 * @tparam INIT Initial value of the current pointer
 * @tparam PRED Struct providing an apply function testing if the current value is in range or not
 * @tparam INC Struct providing an apply function setting the current value to the value following the current value
 */
 template<typename T, T INIT, typename PRED, typename INC>
 class Range : public Labeled {
 public:
    /**
     * @brief Current value of the parameter
     */
    T current = INIT;
    /**
     * @brief Resets current to its initial value
     */
    void reset() {current = INIT; };
    /**
     * @brief Sets current to its next value (according to INC::inc) and returns if the range Reached its end 
     * (accordingt to PRED::pred).
     * 
     * @return true The newly assigned value of current is in the range
     * @return false Otherwise
     */
    bool next() {
        current = INC::inc(current);
        return PRED::pred(current);
    };
    /**
     * @brief Checks if current is in the Range (according to PRED).
     * 
     * @return true PRED returns true
     * @return false Otherwise
     */
    bool valid() { return PRED::apply(current); };
 };
 /**
 * @brief Class that is in contrast to Range specialized for integral values.
 * 
 * @tparam T Integral base type of the Range
 * @tparam INIT Initial value of the parameter
 * @tparam MAX Maximal value of the parameter
 * @tparam INC Struct providing an apply function setting the current value to the value following the current value
 */
 template<typename T, T INIT, T MAX, typename INC>
 class Int_Range : public Labeled {
 static_assert(std::is_integral<T>::value, "Int_Range requires an integral base type");
 public:
    const T max = MAX;
    T current = INIT;
    void reset() {current = INIT; };
    bool next() {
        current = INC::inc(current);
        return current < MAX;
    };
    bool valid() { return current < MAX; };
 };
 /**
 * @brief Class that is in contrast to Int_Range specialized for integrals that grow linearly.
 * 
 * @tparam T Integral base type of the Range
 * @tparam INIT Initial value of the parameter
 * @tparam MAX Maximal value of the parameter
 * @tparam STEP Increase of the value per next()-call
 */
 template<typename T, T INIT, T MAX, T STEP = 1>
 class Linear_Int_Range : public Labeled {
 static_assert(std::is_integral<T>::value, "Linear_Int_Range requires an integral base type");
 public:
    const T max = MAX;
    T current = INIT;
    void reset() {current = INIT; };
    bool next() {
        current += STEP;
        return current < MAX;
    };
    bool valid() { return current < MAX; };
 };
 /**
 * @brief Class that is in contrast to Int_Range specialized for integrals that grow exponetially.
 * 
 * @tparam T Integral base type of the Range
 * @tparam INIT Initial value of the parameter
 * @tparam MAX Maximal value of the parameter
 * @tparam FACTOR Multiplicative Increase of the value per next()-call
 */
 template<typename T, T INIT, T MAX, T FACTOR = 2>
 class Exp_Int_Range : public Labeled {
 static_assert(std::is_integral<T>::value, "Exp_Int_Range requires an integral base type");
 public:
    const T max = MAX;
    T current = INIT;
    void reset() {current = INIT; };
    bool next() {
        current *= FACTOR;
        return current < MAX;
    };
    bool valid() { return current < MAX; };
 };
--- a/qdp_project/src/utils/measurement_utils.h
+++ b/qdp_project/src/utils/measurement_utils.h
@ -0,0 +1,152 @@
 #pragma once
 #include <cstdint>
 #include <chrono>
 #include <vector>
 #include <string>
 #include <algorithm>
 #include <numa.h>
 #if PCM_M == 1
 #define PCM_MEASURE 1
 #include "pcm.h"
 #endif
 struct pcm_value_collector {
    const uint32_t value_count = 6;
    uint32_t threads;
    std::vector<std::string> points;
 #ifdef PCM_MEASURE
    pcm::SystemCounterState** states;
 #endif
    uint64_t** collection;
    pcm_value_collector(const std::vector<std::string>& in_points, uint32_t threads, uint32_t memory_node) : threads(threads) {
 #ifdef PCM_MEASURE
        points = std::vector(in_points);
        collection = (uint64_t**) numa_alloc_onnode(threads * sizeof(uint64_t*), memory_node);
        states = (pcm::SystemCounterState**) numa_alloc_onnode(threads * sizeof(pcm::SystemCounterState*), memory_node);
        for(int i = 0; i < threads; ++i) {
            collection[i] = (uint64_t*) numa_alloc_onnode(points.size() * value_count * sizeof(uint64_t), memory_node);
            states[i] = (pcm::SystemCounterState*) numa_alloc_onnode(points.size() * sizeof(pcm::SystemCounterState), memory_node);
        }
 #endif
    }
    ~pcm_value_collector() {
 #ifdef PCM_MEASURE
        for(int i = 0; i < threads; ++i) {
            numa_free(collection[threads], points.size() * value_count * sizeof(uint64_t));
        }
        numa_free(collection, threads * sizeof(uint64_t*));
        numa_free(states, threads * sizeof(pcm::SystemCounterState));
 #endif
    }
    void reset() {
 #ifdef PCM_MEASURE
        for(int i = 0; i < threads; ++i)
            for(uint32_t j = 0; j < points.size() * value_count;  ++j){
                collection[i][j] = 0;
            }
 #endif
    }
    int64_t point_index(const std::string& value) {
        auto it = std::find(points.begin(), points.end(), value);
        if(it == points.end()) return -1;
        else return it - points.begin();
    }
    std::vector<uint64_t> summarize(const std::string &point) {
 #ifdef PCM_MEASURE
        std::vector<uint64_t> sums(value_count);
        int64_t idx = point_index(point);
        if(idx < 0) return sums;
        for(uint32_t v = 0; v < value_count; ++v) {
            for(uint32_t i = 0; i < threads; ++i) {
                sums[v] += collection[i][static_cast<uint32_t>(idx) + points.size() * v];
            }
        }
        return sums;
 #endif
        return std::vector<uint64_t> {0};
    }
    std::string summarize_as_string(const std::string &point) {
 #ifdef PCM_MEASURE
        auto summary = summarize(point);
        auto it = summary.begin();
        auto end = summary.end();
        if(it >= end) return "";
        std::string result(""); 
        result += std::to_string(*it);
        ++it;
        while(it < end) {
            result += ",";
            result += std::to_string(*it);
            ++it;
        }
        return result;
 #endif
        return "";
    }
    void start(const std::string& point, uint32_t thread) {
 #ifdef PCM_MEASURE
        int64_t idx = point_index(point);
        if(idx < 0) {
            std::cerr << "Invalid 'point' given. Ignored!" << std::endl;
            return;
        }
        states[thread][static_cast<uint32_t>(idx)] = pcm::getSystemCounterState();
 #endif
    }
    static std::string getHead(const std::string& point) {
        return point + "_l2h," + 
               point + "_l2m," + 
               point + "_l3h," + 
               point + "_l3hns," + 
               point + "_l3m," + 
               point + "_mc";  
    }
 #ifdef PCM_MEASURE
    void read_values(uint32_t point_idx, uint32_t thread, pcm::SystemCounterState& start, pcm::SystemCounterState& end) {
        collection[thread][point_idx + points.size() * 0] += getL2CacheHits(start, end);
        collection[thread][point_idx + points.size() * 1] += getL2CacheMisses(start, end);
        collection[thread][point_idx + points.size() * 2] += getL3CacheHits(start, end);
        collection[thread][point_idx + points.size() * 3] += getL3CacheHitsNoSnoop(start, end);
        collection[thread][point_idx + points.size() * 4] += getL3CacheMisses(start, end);
        collection[thread][point_idx + points.size() * 5] += getBytesReadFromMC(start, end);
    }
 #endif
    void stop(const std::string& point, uint32_t thread) {
 #ifdef PCM_MEASURE
        auto state = pcm::getSystemCounterState();
        int64_t idx = point_index(point);
        if(idx < 0) {
            std::cerr << "Invalid 'point' given. Ignored!" << std::endl;
            return;
        }
        auto start = states[thread][static_cast<uint32_t>(idx)];
        read_values(static_cast<uint32_t>(idx), thread, start, state);
 #endif
    }
 };
--- a/qdp_project/src/utils/memory_literals.h
+++ b/qdp_project/src/utils/memory_literals.h
@ -0,0 +1,45 @@
 /**
 * @file memory_literals.h
 * @author André Berthold
 * @brief Defines some operators that ease to define a certain size of memory.
 * e.g. to alloc 3 Gib (Gibibit = 2^30 bit) of memory one can now simply write: "std::malloc(3_Gib)"
 *      to alloc 512 MB (Megabyte = 10^2 byte) of memory one can now simply write: "std::malloc(512_MB)"
 * @version 0.1
 * @date 2023-05-25
 * 
 * @copyright Copyright (c) 2023
 * 
 */
 #pragma once
 #include <cstdint>
 typedef const unsigned long long int ull_int;
 //***************************************************************************//
 // Bit **********************************************************************//
 //***************************************************************************//
 constexpr size_t operator ""_b(ull_int value) {
    // one byte is 8 bit + one byte if bit is no multiple of 8
    return value / 8 + value % 8;
 }
 constexpr size_t operator ""_kb (ull_int value) { return value * 1000 / 8; }
 constexpr size_t operator ""_kib(ull_int value) { return value * 1024 / 8; }
 constexpr size_t operator ""_Mb (ull_int value) { return value * 1000 * 1000 / 8; }
 constexpr size_t operator ""_Mib(ull_int value) { return value * 1024 * 1024 / 8; }
 constexpr size_t operator ""_Gb (ull_int value) { return value * 1000 * 1000 * 1000 / 8; }
 constexpr size_t operator ""_Gib(ull_int value) { return value * 1024 * 1024 * 1024 / 8; }
 constexpr size_t operator ""_Tb (ull_int value) { return value * 1000 * 1000 * 1000 * 1000 / 8; }
 constexpr size_t operator ""_Tib(ull_int value) { return value * 1024 * 1024 * 1024 * 1024 / 8; }
 //***************************************************************************//
 // Byte *********************************************************************//
 //***************************************************************************//
 constexpr size_t operator ""_B  (ull_int value) { return value; }
 constexpr size_t operator ""_kB (ull_int value) { return value * 1000; }
 constexpr size_t operator ""_kiB(ull_int value) { return value * 1024; }
 constexpr size_t operator ""_MB (ull_int value) { return value * 1000 * 1000; }
 constexpr size_t operator ""_MiB(ull_int value) { return value * 1024 * 1024; }
 constexpr size_t operator ""_GB (ull_int value) { return value * 1000 * 1000 * 1000; }
 constexpr size_t operator ""_GiB(ull_int value) { return value * 1024 * 1024 * 1024; }
 constexpr size_t operator ""_TB (ull_int value) { return value * 1000 * 1000 * 1000 * 1000; }
 constexpr size_t operator ""_TiB(ull_int value) { return value * 1024 * 1024 * 1024 * 1024; }
--- a/qdp_project/src/utils/pcm.h
+++ b/qdp_project/src/utils/pcm.h
@ -0,0 +1,6 @@
 #pragma once
 //this file includes all important header from the pcm repository
 #include "cpucounters.h"
 #include "msr.h"
 #include "pci.h"
 #include "mutex.h"
--- a/qdp_project/src/utils/timer_utils.h
+++ b/qdp_project/src/utils/timer_utils.h
@ -0,0 +1,80 @@
 #pragma once
 #include <cstdint>
 #include <chrono>
 #include <barrier>
 #include <numa.h>
 #define THREAD_TIMINGS 1
 struct thread_runtime_timing {
    using time_point_t = std::chrono::time_point<std::chrono::steady_clock>;
    uint32_t time_points, time_threads;
    time_point_t** start_times;
    double** time_accumulator;
    thread_runtime_timing(uint32_t timing_points, uint32_t timing_threads, uint32_t memory_node) {
 #ifdef THREAD_TIMINGS
        time_points = timing_points;
        time_threads = timing_threads;
        start_times = (time_point_t**) numa_alloc_onnode(timing_points * sizeof(time_point_t*), memory_node);
        time_accumulator =  (double**) numa_alloc_onnode(timing_points * sizeof(double*), memory_node);
        for(uint32_t i = 0; i < timing_points; ++i) {
            start_times[i] = (time_point_t*) numa_alloc_onnode(timing_threads * sizeof(time_point_t), memory_node);
            time_accumulator[i] = (double*) numa_alloc_onnode(timing_threads * sizeof(double), memory_node);
        }
 #endif
    }
    ~thread_runtime_timing() {
 #ifdef THREAD_TIMINGS
        for(uint32_t i = 0; i < time_points; ++i) {
            numa_free(start_times[i], time_threads * sizeof(time_point_t));
            numa_free(time_accumulator[i], time_threads * sizeof(double));
        }
        numa_free(start_times, time_points * sizeof(time_point_t*));
        numa_free(time_accumulator, time_points * sizeof(double*));
 #endif
    }
    void reset_accumulator() {
 #ifdef THREAD_TIMINGS        
        for(uint32_t i = 0; i < time_points;  ++i){
        for(uint32_t j = 0; j < time_threads; ++j){
            time_accumulator[i][j] = 0.0;
        }}
 #endif
    }
    double summarize_time(uint32_t time_point) {
 #ifdef THREAD_TIMINGS
        double sum = 0.0;
        for(uint32_t i = 0; i < time_threads; ++i) {
            sum += time_accumulator[time_point][i];
        }
        return sum;
 #endif
    }
    void stop_timer(uint32_t point_id, uint32_t thread_id) {
 #ifdef THREAD_TIMINGS
        auto end_time = std::chrono::steady_clock::now();
        auto start_time = start_times[point_id][thread_id];
        uint64_t time = std::chrono::duration_cast<std::chrono::nanoseconds>(end_time - start_time).count();
        double seconds = time / (1000.0 * 1000.0 * 1000.0);
        time_accumulator[point_id][thread_id] += seconds;
 #endif 
    }
    void start_timer(uint32_t point_id, uint32_t thread_id) {
 #ifdef THREAD_TIMINGS
        start_times[point_id][thread_id] = std::chrono::steady_clock::now();
 #endif
    }
 };
--- a/qdp_project/src/utils/vector_loader.h
+++ b/qdp_project/src/utils/vector_loader.h
@ -0,0 +1,93 @@
 /**
 * @file vector_loader.h
 * @author André Berthold
 * @brief Provides an interface to easily excange vector loading strategies
 * @version 0.1
 * @date 2023-05-25
 * 
 * @copyright Copyright (c) 2023
 * 
 */
 #pragma once
 #include <cstdint>
 #include <type_traits>
 #include <immintrin.h>
 enum load_mode {Unaligned = 0, Aligned = 1, Stream = 2};
 /**
 * @brief A class template that provides functions for loading and storing data of type *base_t* into/from vectors using the stretegy *mode*.
 * 
 * @tparam base_t Base type of the data
 * @tparam mode Strategy for loading the vector 
 */
 template<typename base_t, load_mode mode> 
 class Vector_Loader {};
 /**
 * @brief Template specialization for Vector_Loader with base_t = uint32_t.
 * 
 * @tparam mode Strategy for loading the vector
 */
 template<load_mode mode> 
 class Vector_Loader<uint32_t, mode> {
    using base_t = uint32_t;
    using mask_t = __mmask16;
    using mask_base_t = uint8_t;
 public:
    /**
     * @brief Loads 512 bit of data into a vector register
     * 
     * @param src Pointer to the data to load
     * @return __m512i The vector register with the loaded data
     */
    static inline __m512i load(base_t* src) {
        if      constexpr (mode == load_mode::Unaligned) return _mm512_loadu_epi32(src);
        else if constexpr (mode == load_mode::Aligned)   return _mm512_load_epi32 (src);
        else if constexpr (mode == load_mode::Stream)    return _mm512_stream_load_si512(src);        
    };
    /**
     * @brief Stroes data from a given vector register to a destination pointer
     * 
     * @param dst Pointer to the data destination
     * @param vector Vector register containing the data to store
     */
    static inline void store(base_t* dst, __m512i vector) {
        if      constexpr (mode == load_mode::Unaligned) _mm512_storeu_epi32(dst, vector);
        else if constexpr (mode == load_mode::Aligned)   _mm512_store_epi32 (dst, vector);
        else if constexpr (mode == load_mode::Stream)    _mm512_stream_si512((__m512i*)(dst), vector);
    };
 };
 /**
 * @brief Template specialization for Vector_Loader with base_t = uint64_t.
 * 
 * @tparam mode Strategy for loading the vector
 */
 template<load_mode mode> 
 class Vector_Loader<uint64_t, mode> {
    using base_t = uint64_t;
    using mask_t = __mmask8;
    using mask_base_t = uint8_t;
 public:
    static inline __m512i load(base_t* src) {
        if      constexpr (mode == load_mode::Unaligned) return _mm512_loadu_epi64(src);
        else if constexpr (mode == load_mode::Aligned)   return _mm512_load_epi64 (src);
        else if constexpr (mode == load_mode::Stream)    return _mm512_stream_load_si512(src);        
    };
    static inline void store(base_t* dst, __m512i vector) {
        if      constexpr (mode == load_mode::Unaligned) _mm512_storeu_epi64(dst, vector);
        else if constexpr (mode == load_mode::Aligned)   _mm512_store_epi64 (dst, vector);
        else if constexpr (mode == load_mode::Stream)    _mm512_stream_si512((__m512i*)(dst), vector);
    };
 };