From f91cd3202fb28669f9642bc31aef19773c372c9d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Constantin=20F=C3=BCrst?= Date: Tue, 9 Jan 2024 17:28:35 +0100 Subject: [PATCH 01/29] add project 'offloading cacher' and function declarations for the cacher in its header file --- .gitignore | 2 + offloading-cacher/CMakeLists.txt | 19 ++ offloading-cacher/benchmark.hpp | 174 ++++++++++++++++++ .../cmake/modules/FindNUMA.cmake | 43 +++++ offloading-cacher/main.cpp | 42 +++++ offloading-cacher/offloading-cache.hpp | 84 +++++++++ offloading-cacher/util/dml-helper.hpp | 26 +++ 7 files changed, 390 insertions(+) create mode 100755 offloading-cacher/CMakeLists.txt create mode 100644 offloading-cacher/benchmark.hpp create mode 100644 offloading-cacher/cmake/modules/FindNUMA.cmake create mode 100644 offloading-cacher/main.cpp create mode 100644 offloading-cacher/offloading-cache.hpp create mode 100644 offloading-cacher/util/dml-helper.hpp diff --git a/.gitignore b/.gitignore index ab3553e..55c6836 100644 --- a/.gitignore +++ b/.gitignore @@ -13,6 +13,8 @@ *.fls */.vscode/* +*/.idea/* +*/cmake-build-*/* # ---> C++ # Prerequisites diff --git a/offloading-cacher/CMakeLists.txt b/offloading-cacher/CMakeLists.txt new file mode 100755 index 0000000..7b4844a --- /dev/null +++ b/offloading-cacher/CMakeLists.txt @@ -0,0 +1,19 @@ +cmake_minimum_required(VERSION 3.18) + +project(offloading-cacher) + +set(CMAKE_CXX_STANDARD 20) + +list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake/modules") + +find_package(NUMA REQUIRED) + +set(DML_SOURCE_DIR "../../DML/include/") +set(SOURCES main.cpp) + +add_executable(offloading-cacher ${SOURCES}) + +target_include_directories(offloading-cacher PRIVATE ${CMAKE_SOURCE_DIR} ${NUMA_INCLUDE_DIRS} ${DML_SOURCE_DIR}) +target_link_libraries(offloading-cacher PRIVATE libdml.a pthread ${CMAKE_DL_LIBS} ${NUMA_LIBRARY}) + +install(TARGETS offloading-cacher DESTINATION ${CMAKE_INSTALL_PREFIX}) diff --git a/offloading-cacher/benchmark.hpp b/offloading-cacher/benchmark.hpp new file mode 100644 index 0000000..550efc2 --- /dev/null +++ b/offloading-cacher/benchmark.hpp @@ -0,0 +1,174 @@ +#pragma once + +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include "util/barrier.hpp" +#include "util/dml-helper.hpp" +#include "util/task-data.hpp" + +#define LOG_CODE_INFO "Location: " << __FILE__ << "@" << __LINE__ << "::" << __FUNCTION__ << std::endl +#define LOG_ERR { pthread_t t = pthread_self(); std::cerr << "--- BEGIN ERROR MSG ---" << std::endl << "Physical: [Node " << args->numa_node << " | Thread " << t << "]" << std::endl; } std::cerr << LOG_CODE_INFO +#define CHECK_STATUS(status,msg) { if (status != dml::status_code::ok) { LOG_ERR << "Status Code: " << StatusCodeToString(status) << std::endl << msg << std::endl; args->status = status; return nullptr; }} + +#define ADD_TIMING_MESSUREMENT { if (i >= 5) { args->submit_duration.emplace_back(std::chrono::duration_cast(se - st).count()); args->complete_duration.emplace_back(std::chrono::duration_cast(et - se).count()); args->combined_duration.emplace_back(std::chrono::duration_cast(et - st).count());}} + +template +void* thread_function(void* argp) { + TaskData* args = reinterpret_cast(argp); + + // set numa node and core affinity of the current thread + numa_run_on_node(args->numa_node); + + // allocate memory for the move operation on the requested numa nodes + void* src = numa_alloc_onnode(args->size, args->nnode_src); + void* dst = numa_alloc_onnode(args->size, args->nnode_dst); + dml::data_view srcv = 
dml::make_view(reinterpret_cast(src), args->size); + dml::data_view dstv = dml::make_view(reinterpret_cast(dst), args->size); + + std::memset(src, 0, args->size); + std::memset(dst, 0, args->size); + + args->status = dml::status_code::ok; + args->rep_completed = 0; + + std::chrono::time_point tps; + + // we add 5 as the first 5 iterations will not be meassured + // to remove exceptional values encountered during warmup + for (uint32_t i = 0; i < args->rep_count + 5; i++) { + // synchronize the start of each iteration + // using the barrier structure + args->barrier_->wait(); + + if (args->batch_submit) { + const auto st = std::chrono::steady_clock::now(); + + auto sequence = dml::sequence(args->batch_size, std::allocator()); + + for (uint32_t j = 0; j < args->batch_size; j++) { + // block_on_fault() is required to submit the task in a way so that the + // DSA engine can handle page faults itself together with the IOMMU which + // requires the WQ to be configured to allow this too + + const auto status = sequence.add(dml::mem_copy.block_on_fault(), srcv, dstv); + CHECK_STATUS(status, "Adding operation to batch failed!"); + } + + // we use the asynchronous submit-routine even though this is not required + // here, however the project later on will only use async operation and + // therefore this behaviour should be benchmarked + + auto handler = dml::submit(dml::batch, sequence); + + const auto se = std::chrono::steady_clock::now(); + + auto result = handler.get(); + + const auto et = std::chrono::steady_clock::now(); + + const dml::status_code status = result.status; + CHECK_STATUS(status, "Batch completed with an Error!"); + + ADD_TIMING_MESSUREMENT; + } + else if (args->batch_size > 1) { + // implementation for non-batched batch submit follows here + // this means we submit a bunch of work as single descriptors + // but then dont wait for the completion immediately + + std::vector>> handlers; + + const auto st = std::chrono::steady_clock::now(); + + for (uint32_t j = 0; j < args->batch_size; j++) { + // block_on_fault() is required to submit the task in a way so that the + // DSA engine can handle page faults itself together with the IOMMU which + // requires the WQ to be configured to allow this too + + handlers.emplace_back(dml::submit(dml::mem_copy.block_on_fault(), srcv, dstv)); + } + + const auto se = std::chrono::steady_clock::now(); + + for (auto& handler : handlers) { + auto result = handler.get(); + const dml::status_code status = result.status; + CHECK_STATUS(status, "Operation completed with an Error!"); + } + + const auto et = std::chrono::steady_clock::now(); + + ADD_TIMING_MESSUREMENT; + } + else { + const auto st = std::chrono::steady_clock::now(); + + // we use the asynchronous submit-routine even though this is not required + // here, however the project later on will only use async operation and + // therefore this behaviour should be benchmarked + // block_on_fault() is required to submit the task in a way so that the + // DSA engine can handle page faults itself together with the IOMMU which + // requires the WQ to be configured to allow this too + auto handler = dml::submit(dml::mem_copy.block_on_fault(), srcv, dstv); + + const auto se = std::chrono::steady_clock::now(); + + auto result = handler.get(); + + const auto et = std::chrono::steady_clock::now(); + + const dml::status_code status = result.status; + CHECK_STATUS(status, "Operation completed with an Error!"); + + ADD_TIMING_MESSUREMENT; + } + + // again: we do not count the first 5 repetitions + if (i == 5) 
tps = std::chrono::steady_clock::now(); + if (i >= 5) args->rep_completed++; + } + + const auto tpe = std::chrono::steady_clock::now(); + + args->total_time = std::chrono::duration_cast(tpe - tps).count(); + + // free the allocated memory regions on the selected nodes + numa_free(src, args->size); + numa_free(dst, args->size); + + return nullptr; +} + +template +void execute_dml_memcpy(std::vector& args) { + barrier task_barrier(args.size()); + std::vector threads; + + // initialize numa library + numa_available(); + + // for each submitted task we link the semaphore + // and create the thread, passing the argument + for (auto& arg : args) { + arg.barrier_ = &task_barrier; + threads.emplace_back(); + + if (pthread_create(&threads.back(), nullptr, thread_function, &arg) != 0) { + std::cerr << "Error creating thread" << std::endl; + exit(1); + } + } + + for (pthread_t& t : threads) { + pthread_join(t, nullptr); + } +} \ No newline at end of file diff --git a/offloading-cacher/cmake/modules/FindNUMA.cmake b/offloading-cacher/cmake/modules/FindNUMA.cmake new file mode 100644 index 0000000..94b23c8 --- /dev/null +++ b/offloading-cacher/cmake/modules/FindNUMA.cmake @@ -0,0 +1,43 @@ +# Module for locating libnuma +# +# Read-only variables: +# NUMA_FOUND +# Indicates that the library has been found. +# +# NUMA_INCLUDE_DIR +# Points to the libnuma include directory. +# +# NUMA_LIBRARY_DIR +# Points to the directory that contains the libraries. +# The content of this variable can be passed to link_directories. +# +# NUMA_LIBRARY +# Points to the libnuma that can be passed to target_link_libararies. +# +# Copyright (c) 2013-2020 MulticoreWare, Inc + +include(FindPackageHandleStandardArgs) + +find_path(NUMA_ROOT_DIR + NAMES include/numa.h + PATHS ENV NUMA_ROOT + DOC "NUMA root directory") + +find_path(NUMA_INCLUDE_DIR + NAMES numa.h + HINTS ${NUMA_ROOT_DIR} + PATH_SUFFIXES include + DOC "NUMA include directory") + +find_library(NUMA_LIBRARY + NAMES numa + HINTS ${NUMA_ROOT_DIR} + DOC "NUMA library") + +if (NUMA_LIBRARY) + get_filename_component(NUMA_LIBRARY_DIR ${NUMA_LIBRARY} PATH) +endif() + +mark_as_advanced(NUMA_INCLUDE_DIR NUMA_LIBRARY_DIR NUMA_LIBRARY) + +find_package_handle_standard_args(NUMA REQUIRED_VARS NUMA_ROOT_DIR NUMA_INCLUDE_DIR NUMA_LIBRARY) \ No newline at end of file diff --git a/offloading-cacher/main.cpp b/offloading-cacher/main.cpp new file mode 100644 index 0000000..f49f1f1 --- /dev/null +++ b/offloading-cacher/main.cpp @@ -0,0 +1,42 @@ +#include + +#include +#include +#include + +#include "benchmark.hpp" + +int main(int argc, char **argv) { + if (argc < 3) { + std::cout << "Missing input and output file names." << std::endl; + std::cout << "Usage: ./benchmarks [input.json] [output.json]" << std::endl; + return 1; + } + + const std::string input = argv[1]; + const std::string output = argv[2]; + + std::string path; + std::vector args; + + std::ifstream is(input); + ReadWorkDescription(args, path, is); + is.close(); + + if (path == "hw") { + execute_dml_memcpy(args); + } + else if (path == "sw") { + execute_dml_memcpy(args); + } + else if (path == "auto") { + execute_dml_memcpy(args); + } + else { + std::cerr << "Path is neither hw/sw/auto." 
<< std::endl; + } + + std::ofstream os(output); + WriteResultLog(args, path, os); + os.close(); +} diff --git a/offloading-cacher/offloading-cache.hpp b/offloading-cacher/offloading-cache.hpp new file mode 100644 index 0000000..613d498 --- /dev/null +++ b/offloading-cacher/offloading-cache.hpp @@ -0,0 +1,84 @@ +#pragma once + +#include +#include +#include + +#include + +#include + +namespace offcache { + // the cache task structure will be used to submit and + // control a cache element, while providing source pointer + // and size in bytes for submission + // + // then the submitting thread may wait on the atomic "result" + // which will be notified by the cache worker upon processing + // after which the atomic-bool-ptr active will also become valid + // + // the data pointed to by result and the bool-ptr are guaranteed + // to remain valid until the value pointed to by active is changed + // to false, after which the worker may clean up and delete the + // structure - carefull, do not call delete on this, the worker does + struct CacheTask { + uint8_t* data_; + size_t size_; + std::atomic result_ { nullptr }; + std::atomic* active_; + }; + + // worker class, one for each numa node + // discovers its node configuration on startup + // and keeps track of available memory + class CacheWorker { + private: + uint8_t numa_node_ = 0; + + std::unordered_map cache_info_; + + public: + // this is the mailbox of the worker to which a new task + // may be submitted by exchanging nullptr with a valid one + // and notifying on the atomic after which ownership + // of the CacheTask structure is transferred to the worker + std::atomic* task_slot_ = nullptr; + + static void run(CacheWorker* this_, const uint8_t numa_node); + }; + + // singleton which holds the cache workers + // and is the place where work will be submited + class CacheCoordinator { + public: + // cache policy is defined as a type here to allow flexible usage of the cacher + // given a numa destination node (where the data will be needed), the numa source + // node (current location of the data) and the data size, this function should + // return optimal cache placement + // dst node and returned value can differ if the system, for example, has HBM + // attached accessible directly to node n under a different node id m + typedef uint8_t (CachePolicy)(const uint8_t numa_dst_node, const uint8_t numa_src_node, const size_t data_size); + + // copy policy specifies the copy-executing nodes for a given task + // which allows flexibility in assignment for optimizing raw throughput + // or choosing a conservative usage policy + typedef std::vector (CopyPolicy)(const uint8_t numa_dst_node, const uint8_t numa_src_node); + + enum class ExecutionPolicy { + Immediate, Relaxed, NoCache + }; + + private: + CachePolicy* cache_policy_function_ = nullptr; + CopyPolicy* copy_policy_function_ = nullptr; + + public: + void Init(CachePolicy* cache_policy_function, CopyPolicy* copy_policy_function); + + // submits the given task and takes ownership of the pointer + void SubmitTask(CacheTask* task, const ExecutionPolicy policy) const; + + static void WaitOnCompletion(CacheTask* task); + static void SignalDataUnused(CacheTask* task); + }; +} \ No newline at end of file diff --git a/offloading-cacher/util/dml-helper.hpp b/offloading-cacher/util/dml-helper.hpp new file mode 100644 index 0000000..1686fd1 --- /dev/null +++ b/offloading-cacher/util/dml-helper.hpp @@ -0,0 +1,26 @@ +#pragma once + +#include + +inline const std::string StatusCodeToString(const 
dml::status_code code) { + switch(code) { + case dml::status_code::ok: return "ok"; + case dml::status_code::false_predicate: return "false predicate"; + case dml::status_code::partial_completion: return "partial completion"; + case dml::status_code::nullptr_error: return "nullptr error"; + case dml::status_code::bad_size: return "bad size"; + case dml::status_code::bad_length: return "bad length"; + case dml::status_code::inconsistent_size: return "inconsistent size"; + case dml::status_code::dualcast_bad_padding: return "dualcast bad padding"; + case dml::status_code::bad_alignment: return "bad alignment"; + case dml::status_code::buffers_overlapping: return "buffers overlapping"; + case dml::status_code::delta_delta_empty: return "delta delta empty"; + case dml::status_code::batch_overflow: return "batch overflow"; + case dml::status_code::execution_failed: return "execution failed"; + case dml::status_code::unsupported_operation: return "unsupported operation"; + case dml::status_code::queue_busy: return "queue busy"; + case dml::status_code::error: return "unknown error"; + case dml::status_code::config_error: return "config error"; + default: return "unhandled error"; + } +} \ No newline at end of file From 623366433bfca316ab932fddd0c0cfcfff2c6f77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Constantin=20F=C3=BCrst?= Date: Tue, 9 Jan 2024 18:18:11 +0100 Subject: [PATCH 02/29] continue modifying the declarations for the cacher and providing some first definitions --- offloading-cacher/offloading-cache.hpp | 101 ++++++++++++++++++++----- 1 file changed, 82 insertions(+), 19 deletions(-) diff --git a/offloading-cacher/offloading-cache.hpp b/offloading-cacher/offloading-cache.hpp index 613d498..9c2967a 100644 --- a/offloading-cacher/offloading-cache.hpp +++ b/offloading-cacher/offloading-cache.hpp @@ -2,13 +2,39 @@ #include #include +#include #include +#include + #include #include namespace offcache { + // execution policy selects in which way the data is supposed to be cached + // and returned with the following behaviour is guaranteed in addition to the + // returned value being valid: + // Immediate: return as fast as possible + // may return cached data, can return data in RAM + // will trigger caching of the data provided + // ImmediateNoCache: return as fast as possible and never trigger caching + // same as Immediate but will not trigger caching + // Relaxed: no rapid return needed, take time + // will trigger caching and may only return + // once the caching is successful but can still + // provide data in RAM + enum class ExecutionPolicy { + Relaxed, Immediate, ImmediateNoCache + }; + + struct WorkerTask { + uint8_t* src_; + uint8_t* dst_; + size_t size_; + std::atomic completed_ { false }; + }; + // the cache task structure will be used to submit and // control a cache element, while providing source pointer // and size in bytes for submission @@ -16,35 +42,29 @@ namespace offcache { // then the submitting thread may wait on the atomic "result" // which will be notified by the cache worker upon processing // after which the atomic-bool-ptr active will also become valid - // - // the data pointed to by result and the bool-ptr are guaranteed - // to remain valid until the value pointed to by active is changed - // to false, after which the worker may clean up and delete the - // structure - carefull, do not call delete on this, the worker does struct CacheTask { uint8_t* data_; size_t size_; - std::atomic result_ { nullptr }; - std::atomic* active_; + ExecutionPolicy policy_; + 
uint8_t* result_; + std::atomic active_; + std::vector sub_tasks_; }; // worker class, one for each numa node // discovers its node configuration on startup // and keeps track of available memory class CacheWorker { - private: + public: uint8_t numa_node_ = 0; - std::unordered_map cache_info_; - - public: // this is the mailbox of the worker to which a new task // may be submitted by exchanging nullptr with a valid one // and notifying on the atomic after which ownership // of the CacheTask structure is transferred to the worker - std::atomic* task_slot_ = nullptr; + std::atomic* task_slot_ = nullptr; - static void run(CacheWorker* this_, const uint8_t numa_node); + static void run(CacheWorker* this_); }; // singleton which holds the cache workers @@ -64,11 +84,11 @@ namespace offcache { // or choosing a conservative usage policy typedef std::vector (CopyPolicy)(const uint8_t numa_dst_node, const uint8_t numa_src_node); - enum class ExecutionPolicy { - Immediate, Relaxed, NoCache - }; - private: + std::unordered_map workers_; + + std::unordered_map cache_state_; + CachePolicy* cache_policy_function_ = nullptr; CopyPolicy* copy_policy_function_ = nullptr; @@ -78,7 +98,50 @@ namespace offcache { // submits the given task and takes ownership of the pointer void SubmitTask(CacheTask* task, const ExecutionPolicy policy) const; - static void WaitOnCompletion(CacheTask* task); + // waits upon completion of caching + // returns the location of the data + static uint8_t* WaitOnCompletion(CacheTask* task); + + // invalidates the given pointer static void SignalDataUnused(CacheTask* task); }; -} \ No newline at end of file +} + +void offcache::CacheWorker::run(CacheWorker* this_) { + +} + +void offcache::CacheCoordinator::Init(CachePolicy* cache_policy_function, CopyPolicy* copy_policy_function) { + cache_policy_function_ = cache_policy_function; + copy_policy_function_ = copy_policy_function; + + // initialize numa library + numa_available(); + + const uint8_t nodes_max = numa_num_configured_nodes(); + const uint8_t valid_nodes = numa_get_mems_allowed(); + + for (uint8_t node = 0; node < nodes_max; node++) { + if (numa_bitmask_isbitset(valid_nodes, node)) { + workers_.insert({ node, CacheWorker() }); + workers_[node].numa_node_ = node; + std::thread t (CacheWorker::run, &workers_[node]); + t.detach(); + } + } +} + +void offcache::CacheCoordinator::SubmitTask(CacheTask* task, const ExecutionPolicy policy) const { + +} + +uint8_t* offcache::CacheCoordinator::WaitOnCompletion(CacheTask* task) { + while (!task->sub_tasks_.empty()) { + task->sub_tasks_.back().completed_.wait(false); + task->sub_tasks_.pop_back(); + } +} + +void offcache::CacheCoordinator::SignalDataUnused(CacheTask* task) { + task->active_.store(false); +} From d396056230d36275a397c1ad64588f9758bce283 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Constantin=20F=C3=BCrst?= Date: Tue, 9 Jan 2024 21:02:34 +0100 Subject: [PATCH 03/29] provide first draft of implementations for the cachers functionality --- offloading-cacher/offloading-cache.hpp | 298 ++++++++++++++++++++----- 1 file changed, 245 insertions(+), 53 deletions(-) diff --git a/offloading-cacher/offloading-cache.hpp b/offloading-cacher/offloading-cache.hpp index 9c2967a..d937fb8 100644 --- a/offloading-cacher/offloading-cache.hpp +++ b/offloading-cacher/offloading-cache.hpp @@ -4,9 +4,11 @@ #include #include #include +#include #include +#include #include #include @@ -28,13 +30,6 @@ namespace offcache { Relaxed, Immediate, ImmediateNoCache }; - struct WorkerTask { - uint8_t* src_; - 
uint8_t* dst_; - size_t size_; - std::atomic completed_ { false }; - }; - // the cache task structure will be used to submit and // control a cache element, while providing source pointer // and size in bytes for submission @@ -45,26 +40,11 @@ namespace offcache { struct CacheTask { uint8_t* data_; size_t size_; - ExecutionPolicy policy_; - uint8_t* result_; - std::atomic active_; - std::vector sub_tasks_; - }; - - // worker class, one for each numa node - // discovers its node configuration on startup - // and keeps track of available memory - class CacheWorker { - public: - uint8_t numa_node_ = 0; - - // this is the mailbox of the worker to which a new task - // may be submitted by exchanging nullptr with a valid one - // and notifying on the atomic after which ownership - // of the CacheTask structure is transferred to the worker - std::atomic* task_slot_ = nullptr; - - static void run(CacheWorker* this_); + uint8_t* result_ = nullptr; + uint8_t* maybe_result_ = nullptr; + std::atomic active_ { true }; + std::atomic valid_ { false }; + std::vector>> handlers_; }; // singleton which holds the cache workers @@ -77,71 +57,283 @@ namespace offcache { // return optimal cache placement // dst node and returned value can differ if the system, for example, has HBM // attached accessible directly to node n under a different node id m - typedef uint8_t (CachePolicy)(const uint8_t numa_dst_node, const uint8_t numa_src_node, const size_t data_size); + typedef int (CachePolicy)(const int numa_dst_node, const int numa_src_node, const size_t data_size); // copy policy specifies the copy-executing nodes for a given task // which allows flexibility in assignment for optimizing raw throughput // or choosing a conservative usage policy - typedef std::vector (CopyPolicy)(const uint8_t numa_dst_node, const uint8_t numa_src_node); + typedef std::vector (CopyPolicy)(const int numa_dst_node, const int numa_src_node); private: - std::unordered_map workers_; + std::shared_mutex cache_mutex_; std::unordered_map cache_state_; CachePolicy* cache_policy_function_ = nullptr; CopyPolicy* copy_policy_function_ = nullptr; + dml::handler> ExecuteCopy(const uint8_t* src, uint8_t* dst, const size_t size, const int node) const; + + void SubmitTask(CacheTask* task); + + CacheTask* CreateTask(const uint8_t *data, const size_t size) const; + + void DestroyTask(CacheTask* task) const; + public: void Init(CachePolicy* cache_policy_function, CopyPolicy* copy_policy_function); - // submits the given task and takes ownership of the pointer - void SubmitTask(CacheTask* task, const ExecutionPolicy policy) const; + // function to perform data access through the cache + // behaviour depends on the chosen execution policy + // Immediate and ImmediateNoCache return a cache task + // with guaranteed-valid result value where Relaxed + // policy does not come with this guarantee. 
+ CacheTask* Access(uint8_t* data, const size_t size, const ExecutionPolicy policy); // waits upon completion of caching - // returns the location of the data - static uint8_t* WaitOnCompletion(CacheTask* task); + static void WaitOnCompletion(CacheTask* task); // invalidates the given pointer + // afterwards the reference to the + // cache task object may be forgotten static void SignalDataUnused(CacheTask* task); - }; -} -void offcache::CacheWorker::run(CacheWorker* this_) { + // returns the location of the cached data + // which may or may not be valid + static uint8_t* GetDataLocation(CacheTask* task); + void Flush(); + }; } -void offcache::CacheCoordinator::Init(CachePolicy* cache_policy_function, CopyPolicy* copy_policy_function) { +inline void offcache::CacheCoordinator::Init(CachePolicy* cache_policy_function, CopyPolicy* copy_policy_function) { cache_policy_function_ = cache_policy_function; copy_policy_function_ = copy_policy_function; // initialize numa library numa_available(); +} + +inline offcache::CacheTask* offcache::CacheCoordinator::Access(uint8_t* data, const size_t size, const ExecutionPolicy policy) { + // the best situation is if this data is already cached + // which we check in an unnamed block in which the cache + // is locked for reading to prevent another thread + // from marking the element we may find as unused and + // clearing it + { + std::shared_lock lock(cache_mutex_); - const uint8_t nodes_max = numa_num_configured_nodes(); - const uint8_t valid_nodes = numa_get_mems_allowed(); + const auto search = cache_state_.find(data); - for (uint8_t node = 0; node < nodes_max; node++) { - if (numa_bitmask_isbitset(valid_nodes, node)) { - workers_.insert({ node, CacheWorker() }); - workers_[node].numa_node_ = node; - std::thread t (CacheWorker::run, &workers_[node]); - t.detach(); + if (search != cache_state_.end()) { + if (search->second->size_ == size) { + search->second->active_.store(true); + // TODO: check for completed status depending on execution policy + return search->second; + } + else { + DestroyTask(search->second); + cache_state_.erase(search); + } } - } + } + + // at this point the requested data is not present in cache + // and we create a caching task for it + + CacheTask* task = CreateTask(data, size); + + if (policy == ExecutionPolicy::Immediate) { + // in intermediate mode the returned task + // object is guaranteed to be valid and therefore + // its resulting location must be validated + // after which we submit the task + // maybe_result is then set by submit + + task->result_ = data; + SubmitTask(task); + return task; + } + else if (policy == ExecutionPolicy::ImmediateNoCache) { + // for immediatenocache we just validate + // the generated task and return it + // we must also set maybe_result in case + // someone waits on this + + task->result_ = data; + task->maybe_result_ = data; + return task; + } + else if (policy == ExecutionPolicy::Relaxed) { + // for relaxed no valid task must be returned + // and therefore we just submit and then give + // the possible invalid task back with only + // maybe_result set by submission + + SubmitTask(task); + return task; + } + else { + // this should not be reached + } } -void offcache::CacheCoordinator::SubmitTask(CacheTask* task, const ExecutionPolicy policy) const { +inline void offcache::CacheCoordinator::SubmitTask(CacheTask* task) { + // obtain numa node of current thread to determine where the data is needed + + const int current_cpu = sched_getcpu(); + const int current_node = 
numa_node_of_cpu(current_cpu); + + // obtain node that the given data pointer is allocated on + + int data_node = -1; + get_mempolicy(&data_node, NULL, 0, (void*)task->data_, MPOL_F_NODE | MPOL_F_ADDR); + + // querry cache policy function for the destination numa node + + const uint32_t dst_node = cache_policy_function_(current_node, data_node, task->size_); + + // allocate data on this node and flush the unused parts of the + // cache if the operation fails and retry once + // TODO: smarter flush strategy could keep some stuff cached + + uint8_t* dst = numa_alloc_onnode(task->size_, dst_node); + + if (dst == nullptr) { + Flush(); + + dst = numa_alloc_onnode(task->size_, dst_node); + + if (dst == nullptr) { + return; + } + } + task->maybe_result_ = dst; + + // querry copy policy function for the nodes to use for the copy + + const std::vector executing_nodes = copy_policy_function_(dst_node, data_node); + const size_t task_count = executing_nodes.size(); + + // at this point the task may be added to the cache structure + // due to the task being initialized with the valid flag set to false + + { + std::unique_lock lock(cache_mutex_); + + const auto state = cache_state_.insert({task->data_, task}); + + // if state.second is false then no insertion took place + // which means that concurrently whith this thread + // some other thread must have accessed the same + // resource in which case we must perform an abort + // TODO: abort is not the only way to handle this situation + + if (!state.second) { + // abort by doing the following steps + // (1) free the allocated memory, (2) remove the "maybe result" as + // we will not run the caching operation, (3) clear the sub tasks + // for the very same reason, (4) set the result to the RAM-location + + numa_free(dst, task->size_); + task->maybe_result_ = nullptr; + task->result_ = task->data_; + return; + } + } + + // each task will copy one fair part of the total size + // and in case the total size is not a factor of the + // given task count the last node must copy the remainder + + const size_t size = task->size_ / task_count; + const size_t last_size = size + task->size_ % task_count; + + // save the current numa node mask to restore later + // as executing the copy task will place this thread + // on a different node + + const int nodemask = numa_get_run_node_mask(); + + for (uint32_t i = 0; i < task_count; i++) { + const size_t local_size = i + 1 == task_count ? 
size : last_size; + const size_t local_offset = i * size; + const uint8_t* local_src = task->data_ + local_offset; + uint8_t* local_dst = dst + local_offset; + + const auto handler = ExecuteCopy(local_src, local_dst, local_size, executing_nodes[i]); + task->handlers_.emplace_back(handler); + } + + // set the valid flag of the task as all handlers + // required for completion signal are registered + + task->valid_.store(true); + task->valid_.notify_all(); + + // restore the previous nodemask + + numa_run_on_node_mask(nodemask); +} + +inline dml::handler> offcache::CacheCoordinator::ExecuteCopy(const uint8_t* src, uint8_t* dst, const size_t size, const int node) { + dml::data_view srcv = dml::make_view(reinterpret_cast(src), size); + dml::data_view dstv = dml::make_view(reinterpret_cast(dst), size); + + numa_run_on_node(node); + + return dml::submit(dml::mem_copy.block_on_fault(), srcv, dstv); +} + +inline offcache::CacheTask* offcache::CacheCoordinator::CreateTask(const uint8_t* data, const size_t size) const { + CacheTask* task = new CacheTask(); + task->data_ = data; + task->size_ = size; + return task; +} + +inline void offcache::CacheCoordinator::DestroyTask(CacheTask* task) const { + numa_free(task->result_, task->size_); + delete task; } -uint8_t* offcache::CacheCoordinator::WaitOnCompletion(CacheTask* task) { - while (!task->sub_tasks_.empty()) { - task->sub_tasks_.back().completed_.wait(false); - task->sub_tasks_.pop_back(); +inline void offcache::CacheCoordinator::WaitOnCompletion(CacheTask* task) { + task->valid_.wait(false); + + for (auto& handler : task->handlers_) { + auto result = handler.get(); + // TODO: handle the returned status code } + + task->handlers_.clear(); } -void offcache::CacheCoordinator::SignalDataUnused(CacheTask* task) { +inline uint8_t* offcache::CacheCoordinator::GetDataLocation(CacheTask* task) { + return task->result_; +} + +inline void offcache::CacheCoordinator::SignalDataUnused(CacheTask* task) { task->active_.store(false); } + +inline void offcache::CacheCoordinator::Flush() { + // TODO: there probably is a better way to implement this flush + + { + std::unique_lock lock(cache_mutex_); + + auto it = cache_state_.begin(); + + while (it != cache_state_.end()) { + if (it->second->active_.load() == false) { + DestroyTask(it->second); + cache_state_.erase(it); + it = cache_state_.begin(); + } + else { + it++; + } + } + } +} \ No newline at end of file From 5e30a370ceb12cf75d350e7075645ffff6cb8475 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Constantin=20F=C3=BCrst?= Date: Tue, 9 Jan 2024 23:49:22 +0100 Subject: [PATCH 04/29] finish first stage of caching implementation and provide a rudimentary test function in the main --- offloading-cacher/benchmark.hpp | 174 --------------- offloading-cacher/main.cpp | 80 ++++--- offloading-cacher/offloading-cache.hpp | 290 ++++++++++++++++--------- 3 files changed, 236 insertions(+), 308 deletions(-) delete mode 100644 offloading-cacher/benchmark.hpp diff --git a/offloading-cacher/benchmark.hpp b/offloading-cacher/benchmark.hpp deleted file mode 100644 index 550efc2..0000000 --- a/offloading-cacher/benchmark.hpp +++ /dev/null @@ -1,174 +0,0 @@ -#pragma once - -#include -#include -#include -#include - -#include -#include -#include - -#include - -#include "util/barrier.hpp" -#include "util/dml-helper.hpp" -#include "util/task-data.hpp" - -#define LOG_CODE_INFO "Location: " << __FILE__ << "@" << __LINE__ << "::" << __FUNCTION__ << std::endl -#define LOG_ERR { pthread_t t = pthread_self(); std::cerr << "--- BEGIN ERROR MSG 
---" << std::endl << "Physical: [Node " << args->numa_node << " | Thread " << t << "]" << std::endl; } std::cerr << LOG_CODE_INFO -#define CHECK_STATUS(status,msg) { if (status != dml::status_code::ok) { LOG_ERR << "Status Code: " << StatusCodeToString(status) << std::endl << msg << std::endl; args->status = status; return nullptr; }} - -#define ADD_TIMING_MESSUREMENT { if (i >= 5) { args->submit_duration.emplace_back(std::chrono::duration_cast(se - st).count()); args->complete_duration.emplace_back(std::chrono::duration_cast(et - se).count()); args->combined_duration.emplace_back(std::chrono::duration_cast(et - st).count());}} - -template -void* thread_function(void* argp) { - TaskData* args = reinterpret_cast(argp); - - // set numa node and core affinity of the current thread - numa_run_on_node(args->numa_node); - - // allocate memory for the move operation on the requested numa nodes - void* src = numa_alloc_onnode(args->size, args->nnode_src); - void* dst = numa_alloc_onnode(args->size, args->nnode_dst); - dml::data_view srcv = dml::make_view(reinterpret_cast(src), args->size); - dml::data_view dstv = dml::make_view(reinterpret_cast(dst), args->size); - - std::memset(src, 0, args->size); - std::memset(dst, 0, args->size); - - args->status = dml::status_code::ok; - args->rep_completed = 0; - - std::chrono::time_point tps; - - // we add 5 as the first 5 iterations will not be meassured - // to remove exceptional values encountered during warmup - for (uint32_t i = 0; i < args->rep_count + 5; i++) { - // synchronize the start of each iteration - // using the barrier structure - args->barrier_->wait(); - - if (args->batch_submit) { - const auto st = std::chrono::steady_clock::now(); - - auto sequence = dml::sequence(args->batch_size, std::allocator()); - - for (uint32_t j = 0; j < args->batch_size; j++) { - // block_on_fault() is required to submit the task in a way so that the - // DSA engine can handle page faults itself together with the IOMMU which - // requires the WQ to be configured to allow this too - - const auto status = sequence.add(dml::mem_copy.block_on_fault(), srcv, dstv); - CHECK_STATUS(status, "Adding operation to batch failed!"); - } - - // we use the asynchronous submit-routine even though this is not required - // here, however the project later on will only use async operation and - // therefore this behaviour should be benchmarked - - auto handler = dml::submit(dml::batch, sequence); - - const auto se = std::chrono::steady_clock::now(); - - auto result = handler.get(); - - const auto et = std::chrono::steady_clock::now(); - - const dml::status_code status = result.status; - CHECK_STATUS(status, "Batch completed with an Error!"); - - ADD_TIMING_MESSUREMENT; - } - else if (args->batch_size > 1) { - // implementation for non-batched batch submit follows here - // this means we submit a bunch of work as single descriptors - // but then dont wait for the completion immediately - - std::vector>> handlers; - - const auto st = std::chrono::steady_clock::now(); - - for (uint32_t j = 0; j < args->batch_size; j++) { - // block_on_fault() is required to submit the task in a way so that the - // DSA engine can handle page faults itself together with the IOMMU which - // requires the WQ to be configured to allow this too - - handlers.emplace_back(dml::submit(dml::mem_copy.block_on_fault(), srcv, dstv)); - } - - const auto se = std::chrono::steady_clock::now(); - - for (auto& handler : handlers) { - auto result = handler.get(); - const dml::status_code status = result.status; - 
CHECK_STATUS(status, "Operation completed with an Error!"); - } - - const auto et = std::chrono::steady_clock::now(); - - ADD_TIMING_MESSUREMENT; - } - else { - const auto st = std::chrono::steady_clock::now(); - - // we use the asynchronous submit-routine even though this is not required - // here, however the project later on will only use async operation and - // therefore this behaviour should be benchmarked - // block_on_fault() is required to submit the task in a way so that the - // DSA engine can handle page faults itself together with the IOMMU which - // requires the WQ to be configured to allow this too - auto handler = dml::submit(dml::mem_copy.block_on_fault(), srcv, dstv); - - const auto se = std::chrono::steady_clock::now(); - - auto result = handler.get(); - - const auto et = std::chrono::steady_clock::now(); - - const dml::status_code status = result.status; - CHECK_STATUS(status, "Operation completed with an Error!"); - - ADD_TIMING_MESSUREMENT; - } - - // again: we do not count the first 5 repetitions - if (i == 5) tps = std::chrono::steady_clock::now(); - if (i >= 5) args->rep_completed++; - } - - const auto tpe = std::chrono::steady_clock::now(); - - args->total_time = std::chrono::duration_cast(tpe - tps).count(); - - // free the allocated memory regions on the selected nodes - numa_free(src, args->size); - numa_free(dst, args->size); - - return nullptr; -} - -template -void execute_dml_memcpy(std::vector& args) { - barrier task_barrier(args.size()); - std::vector threads; - - // initialize numa library - numa_available(); - - // for each submitted task we link the semaphore - // and create the thread, passing the argument - for (auto& arg : args) { - arg.barrier_ = &task_barrier; - threads.emplace_back(); - - if (pthread_create(&threads.back(), nullptr, thread_function, &arg) != 0) { - std::cerr << "Error creating thread" << std::endl; - exit(1); - } - } - - for (pthread_t& t : threads) { - pthread_join(t, nullptr); - } -} \ No newline at end of file diff --git a/offloading-cacher/main.cpp b/offloading-cacher/main.cpp index f49f1f1..b6c9714 100644 --- a/offloading-cacher/main.cpp +++ b/offloading-cacher/main.cpp @@ -1,42 +1,64 @@ -#include - -#include #include -#include +#include -#include "benchmark.hpp" +#include "offloading-cache.hpp" -int main(int argc, char **argv) { - if (argc < 3) { - std::cout << "Missing input and output file names." << std::endl; - std::cout << "Usage: ./benchmarks [input.json] [output.json]" << std::endl; - return 1; - } +double* GetRandomArray(const size_t size) { + double* array = new double[size]; - const std::string input = argv[1]; - const std::string output = argv[2]; + std::uniform_real_distribution unif(std::numeric_limits::min(), std::numeric_limits::max()); + std::default_random_engine re; - std::string path; - std::vector args; + for (size_t i = 0; i < size; i++) { + array[i] = unif(re); + } - std::ifstream is(input); - ReadWorkDescription(args, path, is); - is.close(); + return array; +} - if (path == "hw") { - execute_dml_memcpy(args); +bool IsEqual(const double* a, const double* b, const size_t size) { + for (size_t i = 0; i < size; i++) { + try { + if (a[i] != b[i]) return false; + } + catch (...) 
{ + return false; + } } - else if (path == "sw") { - execute_dml_memcpy(args); + + return true; +} + +int main(int argc, char **argv) { + offcache::Cache cache; + + auto cache_policy = [](const int numa_dst_node, const int numa_src_node, const size_t data_size) { + return numa_dst_node; + }; + + auto copy_policy = [](const int numa_dst_node, const int numa_src_node) { + return std::vector{ numa_src_node }; + }; + + cache.Init(cache_policy,copy_policy); + + static constexpr size_t data_size = 8192; + double* data = GetRandomArray(data_size); + + std::unique_ptr data_cache = cache.Access(reinterpret_cast(data), data_size * sizeof(double), offcache::ExecutionPolicy::Relaxed); + + data_cache->WaitOnCompletion(); + + double* cached = reinterpret_cast(data_cache->GetDataLocation()); + + if (data == cached) { + std::cout << "Caching did not affect data location." << std::endl; } - else if (path == "auto") { - execute_dml_memcpy(args); + + if (IsEqual(data,cached,data_size)) { + std::cout << "Cached data is correct." << std::endl; } else { - std::cerr << "Path is neither hw/sw/auto." << std::endl; + std::cout << "Cached data is wrong." << std::endl; } - - std::ofstream os(output); - WriteResultLog(args, path, os); - os.close(); } diff --git a/offloading-cacher/offloading-cache.hpp b/offloading-cacher/offloading-cache.hpp index d937fb8..f40ef3d 100644 --- a/offloading-cacher/offloading-cache.hpp +++ b/offloading-cacher/offloading-cache.hpp @@ -1,15 +1,20 @@ #pragma once +#include + #include #include #include #include #include +#include +#include #include #include #include +#include #include @@ -30,6 +35,8 @@ namespace offcache { Relaxed, Immediate, ImmediateNoCache }; + class Cache; + // the cache task structure will be used to submit and // control a cache element, while providing source pointer // and size in bytes for submission @@ -37,19 +44,41 @@ namespace offcache { // then the submitting thread may wait on the atomic "result" // which will be notified by the cache worker upon processing // after which the atomic-bool-ptr active will also become valid - struct CacheTask { - uint8_t* data_; + class CacheData { + public: + using dml_handler = dml::handler>; + + private: + uint8_t* src_; size_t size_; - uint8_t* result_ = nullptr; - uint8_t* maybe_result_ = nullptr; - std::atomic active_ { true }; - std::atomic valid_ { false }; - std::vector>> handlers_; + + std::atomic* active_; + + protected: + std::atomic* cache_; + + uint8_t* incomplete_cache_; + + std::unique_ptr> handlers_; + + friend Cache; + + public: + CacheData(uint8_t* data, const size_t size); + CacheData(const CacheData& other); + ~CacheData(); + + void Deallocate(); + void WaitOnCompletion(); + + uint8_t* GetDataLocation() const; + + bool Active() const; }; // singleton which holds the cache workers // and is the place where work will be submited - class CacheCoordinator { + class Cache { public: // cache policy is defined as a type here to allow flexible usage of the cacher // given a numa destination node (where the data will be needed), the numa source @@ -67,18 +96,14 @@ namespace offcache { private: std::shared_mutex cache_mutex_; - std::unordered_map cache_state_; + std::unordered_map cache_state_; CachePolicy* cache_policy_function_ = nullptr; CopyPolicy* copy_policy_function_ = nullptr; dml::handler> ExecuteCopy(const uint8_t* src, uint8_t* dst, const size_t size, const int node) const; - void SubmitTask(CacheTask* task); - - CacheTask* CreateTask(const uint8_t *data, const size_t size) const; - - void 
DestroyTask(CacheTask* task) const; + void SubmitTask(CacheData* task); public: void Init(CachePolicy* cache_policy_function, CopyPolicy* copy_policy_function); @@ -88,33 +113,23 @@ namespace offcache { // Immediate and ImmediateNoCache return a cache task // with guaranteed-valid result value where Relaxed // policy does not come with this guarantee. - CacheTask* Access(uint8_t* data, const size_t size, const ExecutionPolicy policy); - - // waits upon completion of caching - static void WaitOnCompletion(CacheTask* task); - - // invalidates the given pointer - // afterwards the reference to the - // cache task object may be forgotten - static void SignalDataUnused(CacheTask* task); - - // returns the location of the cached data - // which may or may not be valid - static uint8_t* GetDataLocation(CacheTask* task); + std::unique_ptr Access(uint8_t* data, const size_t size, const ExecutionPolicy policy); void Flush(); }; } -inline void offcache::CacheCoordinator::Init(CachePolicy* cache_policy_function, CopyPolicy* copy_policy_function) { +inline void offcache::Cache::Init(CachePolicy* cache_policy_function, CopyPolicy* copy_policy_function) { cache_policy_function_ = cache_policy_function; copy_policy_function_ = copy_policy_function; // initialize numa library numa_available(); + + std::cout << "[-] Cache Initialized" << std::endl; } -inline offcache::CacheTask* offcache::CacheCoordinator::Access(uint8_t* data, const size_t size, const ExecutionPolicy policy) { +inline std::unique_ptr offcache::Cache::Access(uint8_t* data, const size_t size, const ExecutionPolicy policy) { // the best situation is if this data is already cached // which we check in an unnamed block in which the cache // is locked for reading to prevent another thread @@ -126,13 +141,16 @@ inline offcache::CacheTask* offcache::CacheCoordinator::Access(uint8_t* data, co const auto search = cache_state_.find(data); if (search != cache_state_.end()) { - if (search->second->size_ == size) { - search->second->active_.store(true); - // TODO: check for completed status depending on execution policy - return search->second; + if (search->second.size_ == size) { + search->second.active_->store(true); + + std::cout << "[+] Found Cached version for 0x" << std::hex << (uint64_t)data << std::dec << std::endl; + + return std::move(std::make_unique(search->second)); } else { - DestroyTask(search->second); + std::cout << "[!] 
Found Cached version with size missmatch for 0x" << std::hex << (uint64_t)data << std::dec << std::endl; + cache_state_.erase(search); } } @@ -141,7 +159,7 @@ inline offcache::CacheTask* offcache::CacheCoordinator::Access(uint8_t* data, co // at this point the requested data is not present in cache // and we create a caching task for it - CacheTask* task = CreateTask(data, size); + auto task = std::make_unique(data, size); if (policy == ExecutionPolicy::Immediate) { // in intermediate mode the returned task @@ -150,9 +168,9 @@ inline offcache::CacheTask* offcache::CacheCoordinator::Access(uint8_t* data, co // after which we submit the task // maybe_result is then set by submit - task->result_ = data; - SubmitTask(task); - return task; + task->cache_->store(data); + SubmitTask(task.get()); + return std::move(task); } else if (policy == ExecutionPolicy::ImmediateNoCache) { // for immediatenocache we just validate @@ -160,9 +178,9 @@ inline offcache::CacheTask* offcache::CacheCoordinator::Access(uint8_t* data, co // we must also set maybe_result in case // someone waits on this - task->result_ = data; - task->maybe_result_ = data; - return task; + task->cache_->store(data); + task->incomplete_cache_ = data; + return std::move(task); } else if (policy == ExecutionPolicy::Relaxed) { // for relaxed no valid task must be returned @@ -170,15 +188,15 @@ inline offcache::CacheTask* offcache::CacheCoordinator::Access(uint8_t* data, co // the possible invalid task back with only // maybe_result set by submission - SubmitTask(task); - return task; + SubmitTask(task.get()); + return std::move(task); } else { // this should not be reached } } -inline void offcache::CacheCoordinator::SubmitTask(CacheTask* task) { +inline void offcache::Cache::SubmitTask(CacheData* task) { // obtain numa node of current thread to determine where the data is needed const int current_cpu = sched_getcpu(); @@ -187,42 +205,72 @@ inline void offcache::CacheCoordinator::SubmitTask(CacheTask* task) { // obtain node that the given data pointer is allocated on int data_node = -1; - get_mempolicy(&data_node, NULL, 0, (void*)task->data_, MPOL_F_NODE | MPOL_F_ADDR); + get_mempolicy(&data_node, NULL, 0, (void*)task->src_, MPOL_F_NODE | MPOL_F_ADDR); // querry cache policy function for the destination numa node - const uint32_t dst_node = cache_policy_function_(current_node, data_node, task->size_); + const int dst_node = cache_policy_function_(current_node, data_node, task->size_); + + std::cout << "[+] Allocating " << task->size_ << "B on node " << dst_node << " for " << std::hex << (uint64_t)task->src_ << std::dec << std::endl; // allocate data on this node and flush the unused parts of the // cache if the operation fails and retry once // TODO: smarter flush strategy could keep some stuff cached - uint8_t* dst = numa_alloc_onnode(task->size_, dst_node); + uint8_t* dst = reinterpret_cast(numa_alloc_onnode(task->size_, dst_node)); if (dst == nullptr) { + std::cout << "[!] 
First allocation try failed for " << task->size_ << "B on node " << dst_node << std::endl; + Flush(); - dst = numa_alloc_onnode(task->size_, dst_node); + dst = reinterpret_cast(numa_alloc_onnode(task->size_, dst_node)); if (dst == nullptr) { + std::cout << "[x] Second allocation try failed for " << task->size_ << "B on node " << dst_node << std::endl; return; } } - task->maybe_result_ = dst; + task->incomplete_cache_ = dst; // querry copy policy function for the nodes to use for the copy const std::vector executing_nodes = copy_policy_function_(dst_node, data_node); const size_t task_count = executing_nodes.size(); - // at this point the task may be added to the cache structure - // due to the task being initialized with the valid flag set to false + // each task will copy one fair part of the total size + // and in case the total size is not a factor of the + // given task count the last node must copy the remainder + + const size_t size = task->size_ / task_count; + const size_t last_size = size + task->size_ % task_count; + + // save the current numa node mask to restore later + // as executing the copy task will place this thread + // on a different node + + bitmask* nodemask = numa_get_run_node_mask(); + + for (uint32_t i = 0; i < task_count; i++) { + const size_t local_size = i + 1 == task_count ? size : last_size; + const size_t local_offset = i * size; + const uint8_t* local_src = task->src_ + local_offset; + uint8_t* local_dst = dst + local_offset; + + task->handlers_->emplace_back(ExecuteCopy(local_src, local_dst, local_size, executing_nodes[i])); + } + + // only at this point may the task be added to the control structure + // because adding it earlier could cause it to be returned for an + // access request while the handler-vector is not fully populated + // which could cause the wait-function to return prematurely + // TODO: this can be optimized because the abort is quite expensive { std::unique_lock lock(cache_mutex_); - const auto state = cache_state_.insert({task->data_, task}); + const auto state = cache_state_.insert({task->src_, *task}); // if state.second is false then no insertion took place // which means that concurrently whith this thread @@ -231,94 +279,127 @@ inline void offcache::CacheCoordinator::SubmitTask(CacheTask* task) { // TODO: abort is not the only way to handle this situation if (!state.second) { + std::cout << "[x] Found another cache instance for 0x" << std::hex << (uint64_t)task->src_ << std::dec << std::endl; + + // first wait on all copy operations to be completed + + task->WaitOnCompletion(); + // abort by doing the following steps // (1) free the allocated memory, (2) remove the "maybe result" as // we will not run the caching operation, (3) clear the sub tasks // for the very same reason, (4) set the result to the RAM-location numa_free(dst, task->size_); - task->maybe_result_ = nullptr; - task->result_ = task->data_; + task->incomplete_cache_ = nullptr; + task->cache_->store(task->src_); + + std::cout << "[-] Abort completed for 0x" << std::hex << (uint64_t)task->src_ << std::dec << std::endl; + return; } } - // each task will copy one fair part of the total size - // and in case the total size is not a factor of the - // given task count the last node must copy the remainder + // restore the previous nodemask - const size_t size = task->size_ / task_count; - const size_t last_size = size + task->size_ % task_count; + numa_run_on_node_mask(nodemask); +} - // save the current numa node mask to restore later - // as executing the copy task will 
place this thread - // on a different node +inline dml::handler> offcache::Cache::ExecuteCopy(const uint8_t* src, uint8_t* dst, const size_t size, const int node) const { + dml::const_data_view srcv = dml::make_view(src, size); + dml::data_view dstv = dml::make_view(dst, size); - const int nodemask = numa_get_run_node_mask(); + numa_run_on_node(node); - for (uint32_t i = 0; i < task_count; i++) { - const size_t local_size = i + 1 == task_count ? size : last_size; - const size_t local_offset = i * size; - const uint8_t* local_src = task->data_ + local_offset; - uint8_t* local_dst = dst + local_offset; + return dml::submit(dml::mem_copy.block_on_fault(), srcv, dstv); +} + +inline void offcache::CacheData::WaitOnCompletion() { + if (handlers_ == nullptr) { + std::cout << "[-] Waiting on cache-var-update for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; - const auto handler = ExecuteCopy(local_src, local_dst, local_size, executing_nodes[i]); - task->handlers_.emplace_back(handler); + cache_->wait(nullptr); + + std::cout << "[+] Finished waiting on cache-var-update for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; } + else { + std::cout << "[-] Waiting on handlers for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; - // set the valid flag of the task as all handlers - // required for completion signal are registered + for (auto& handler : *handlers_) { + auto result = handler.get(); + // TODO: handle the returned status code + } - task->valid_.store(true); - task->valid_.notify_all(); + handlers_ = nullptr; - // restore the previous nodemask + std::cout << "[+] Finished waiting on handlers for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; - numa_run_on_node_mask(nodemask); + cache_->store(incomplete_cache_); + cache_->notify_all(); + } } -inline dml::handler> offcache::CacheCoordinator::ExecuteCopy(const uint8_t* src, uint8_t* dst, const size_t size, const int node) { - dml::data_view srcv = dml::make_view(reinterpret_cast(src), size); - dml::data_view dstv = dml::make_view(reinterpret_cast(dst), size); - - numa_run_on_node(node); +offcache::CacheData::CacheData(uint8_t* data, const size_t size) { + std::cout << "[-] New CacheData 0x" << std::hex << (uint64_t)data << std::dec << std::endl; - return dml::submit(dml::mem_copy.block_on_fault(), srcv, dstv); + src_ = data; + size_ = size; + active_ = new std::atomic(); + cache_ = new std::atomic(); + incomplete_cache_ = nullptr; + handlers_ = std::make_unique>(); } -inline offcache::CacheTask* offcache::CacheCoordinator::CreateTask(const uint8_t* data, const size_t size) const { - CacheTask* task = new CacheTask(); - task->data_ = data; - task->size_ = size; - return task; -} +offcache::CacheData::CacheData(const offcache::CacheData& other) { + std::cout << "[-] Copy Created for CacheData 0x" << std::hex << (uint64_t)other.src_ << std::dec << std::endl; -inline void offcache::CacheCoordinator::DestroyTask(CacheTask* task) const { - numa_free(task->result_, task->size_); - delete task; + src_ = other.src_; + size_ = other.size_; + cache_ = other.cache_; + active_ = other.active_; + incomplete_cache_ = nullptr; + handlers_ = nullptr; + active_->fetch_add(1); } -inline void offcache::CacheCoordinator::WaitOnCompletion(CacheTask* task) { - task->valid_.wait(false); +offcache::CacheData::~CacheData() { + std::cout << "[-] Destructor for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; + + const int32_t v = active_->fetch_sub(1); - for 
(auto& handler : task->handlers_) { - auto result = handler.get(); - // TODO: handle the returned status code + // if the returned value is non-positive + // then we must execute proper deletion + // as this was the last reference + + if (v <= 0) { + std::cout << "[!] Full Destructor for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; + + Deallocate(); + delete active_; + delete cache_; } +} - task->handlers_.clear(); +void offcache::CacheData::Deallocate() { + std::cout << "[!] Deallocating for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; + + numa_free(cache_, size_); + cache_ = nullptr; + incomplete_cache_ = nullptr; } -inline uint8_t* offcache::CacheCoordinator::GetDataLocation(CacheTask* task) { - return task->result_; +uint8_t *offcache::CacheData::GetDataLocation() const { + return cache_->load(); } -inline void offcache::CacheCoordinator::SignalDataUnused(CacheTask* task) { - task->active_.store(false); +bool offcache::CacheData::Active() const { + return active_->load() > 0; } -inline void offcache::CacheCoordinator::Flush() { - // TODO: there probably is a better way to implement this flush +inline void offcache::Cache::Flush() { + std::cout << "[-] Flushing Cache" << std::endl; + + // TODO: there is a better way to implement this flush { std::unique_lock lock(cache_mutex_); @@ -326,8 +407,7 @@ inline void offcache::CacheCoordinator::Flush() { auto it = cache_state_.begin(); while (it != cache_state_.end()) { - if (it->second->active_.load() == false) { - DestroyTask(it->second); + if (it->second.Active() == false) { cache_state_.erase(it); it = cache_state_.begin(); } From f19c069b0ffe88ac1fbf1c2dff6cc9fb65972e5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Constantin=20F=C3=BCrst?= Date: Tue, 9 Jan 2024 23:55:23 +0100 Subject: [PATCH 05/29] always perform copy from src and dst node, add another log output about the split --- offloading-cacher/main.cpp | 4 ++-- offloading-cacher/offloading-cache.hpp | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/offloading-cacher/main.cpp b/offloading-cacher/main.cpp index b6c9714..7aa8ea0 100644 --- a/offloading-cacher/main.cpp +++ b/offloading-cacher/main.cpp @@ -37,12 +37,12 @@ int main(int argc, char **argv) { }; auto copy_policy = [](const int numa_dst_node, const int numa_src_node) { - return std::vector{ numa_src_node }; + return std::vector{ numa_src_node, numa_dst_node }; }; cache.Init(cache_policy,copy_policy); - static constexpr size_t data_size = 8192; + static constexpr size_t data_size = 1024 * 1024; double* data = GetRandomArray(data_size); std::unique_ptr data_cache = cache.Access(reinterpret_cast(data), data_size * sizeof(double), offcache::ExecutionPolicy::Relaxed); diff --git a/offloading-cacher/offloading-cache.hpp b/offloading-cacher/offloading-cache.hpp index f40ef3d..ea91fae 100644 --- a/offloading-cacher/offloading-cache.hpp +++ b/offloading-cacher/offloading-cache.hpp @@ -246,6 +246,8 @@ inline void offcache::Cache::SubmitTask(CacheData* task) { const size_t size = task->size_ / task_count; const size_t last_size = size + task->size_ % task_count; + std::cout << "[-] Splitting Copy into " << task_count << " tasks of " << size << "B 0x" << std::hex << (uint64_t)task->src_ << std::dec << std::endl; + // save the current numa node mask to restore later // as executing the copy task will place this thread // on a different node From 395d3073100110fc9c899c82eee2c568730837ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Constantin=20F=C3=BCrst?= Date: Wed, 10 Jan 2024 
00:58:17 +0100 Subject: [PATCH 06/29] fix an issue with the freeing of data in the cacher --- offloading-cacher/main.cpp | 85 +++++++++-- offloading-cacher/offloading-cache.hpp | 189 ++++++++++++++++--------- 2 files changed, 192 insertions(+), 82 deletions(-) diff --git a/offloading-cacher/main.cpp b/offloading-cacher/main.cpp index 7aa8ea0..726033b 100644 --- a/offloading-cacher/main.cpp +++ b/offloading-cacher/main.cpp @@ -3,6 +3,8 @@ #include "offloading-cache.hpp" +offcache::Cache CACHE; + double* GetRandomArray(const size_t size) { double* array = new double[size]; @@ -29,36 +31,91 @@ bool IsEqual(const double* a, const double* b, const size_t size) { return true; } +void PerformAccessAndTest(double* src, const size_t size) { + // this is the function that any cache access will go through + // execution policy picks between three options: + // Relaxed may return an invalid (but not nullptr) CacheData + // which can then be validated with WaitOnCompletion() + // Immediate never returns an invalid CacheData structure + // however it may return just the pointer to source + // WaitOnCompletion() will then ensure that the data + // is actually in cache + // ImmediateNoCache behaves the same as Immediate but does never perform + // caching itself so only returns cached version if + // previously cached is available + + std::unique_ptr data_cache = CACHE.Access( + reinterpret_cast(src), + size * sizeof(double), + offcache::ExecutionPolicy::Immediate + ); + + double* cached_imm = reinterpret_cast(data_cache->GetDataLocation()); + + // check the value immediately just to see if ram or cache was returned + + if (src == cached_imm) { + std::cout << "Caching did not immediately yield different data location." << std::endl; + } + else { + std::cout << "Immediately got different data location." << std::endl; + } + + // waits for the completion of the asynchronous caching operation + + data_cache->WaitOnCompletion(); + + // gets the cache-data-location from the struct + + double* cached = reinterpret_cast(data_cache->GetDataLocation()); + + // tests on the resulting value + + if (src == cached) { + std::cout << "Caching did not affect data location." << std::endl; + } + + if (IsEqual(src,cached,size)) { + std::cout << "Cached data is correct." << std::endl; + } + else { + std::cout << "Cached data is wrong." 
<< std::endl; + } +} + int main(int argc, char **argv) { - offcache::Cache cache; + + // given numa destination and source node and the size of the data + // this function decides on which the data will be placed + // which is used to select the HBM-node for the dst-node if desired auto cache_policy = [](const int numa_dst_node, const int numa_src_node, const size_t data_size) { return numa_dst_node; }; + // this function receives the memory source and destination node + // and then decides, on which nodes the copy operation will be split + auto copy_policy = [](const int numa_dst_node, const int numa_src_node) { return std::vector{ numa_src_node, numa_dst_node }; }; - cache.Init(cache_policy,copy_policy); + // initializes the cache with the two policies + + CACHE.Init(cache_policy,copy_policy); + + // generate the test data static constexpr size_t data_size = 1024 * 1024; double* data = GetRandomArray(data_size); - std::unique_ptr data_cache = cache.Access(reinterpret_cast(data), data_size * sizeof(double), offcache::ExecutionPolicy::Relaxed); + std::cout << "--- first access --- " << std::endl; - data_cache->WaitOnCompletion(); + PerformAccessAndTest(data, data_size); - double* cached = reinterpret_cast(data_cache->GetDataLocation()); + std::cout << "--- second access --- " << std::endl; - if (data == cached) { - std::cout << "Caching did not affect data location." << std::endl; - } + PerformAccessAndTest(data, data_size); - if (IsEqual(data,cached,data_size)) { - std::cout << "Cached data is correct." << std::endl; - } - else { - std::cout << "Cached data is wrong." << std::endl; - } + std::cout << "--- end of application --- " << std::endl; } diff --git a/offloading-cacher/offloading-cache.hpp b/offloading-cacher/offloading-cache.hpp index ea91fae..e265665 100644 --- a/offloading-cacher/offloading-cache.hpp +++ b/offloading-cacher/offloading-cache.hpp @@ -94,9 +94,14 @@ namespace offcache { typedef std::vector (CopyPolicy)(const int numa_dst_node, const int numa_src_node); private: + // mutex for accessing the cache state map + std::shared_mutex cache_mutex_; - std::unordered_map cache_state_; + // map from [dst-numa-node,map2] + // map2 from [data-ptr,cache-structure] + + std::unordered_map> cache_state_; CachePolicy* cache_policy_function_ = nullptr; CopyPolicy* copy_policy_function_ = nullptr; @@ -105,6 +110,12 @@ namespace offcache { void SubmitTask(CacheData* task); + void GetCacheNode(uint8_t* src, const size_t size, int* OUT_DST_NODE, int* OUT_SRC_NODE) const; + + void AbortTask(CacheData* task) const; + + std::unique_ptr GetFromCache(uint8_t* src, const size_t size); + public: void Init(CachePolicy* cache_policy_function, CopyPolicy* copy_policy_function); @@ -126,40 +137,29 @@ inline void offcache::Cache::Init(CachePolicy* cache_policy_function, CopyPolicy // initialize numa library numa_available(); + const int nodes_max = numa_num_configured_nodes(); + const bitmask* valid_nodes = numa_get_mems_allowed(); + + for (int node = 0; node < nodes_max; node++) { + if (numa_bitmask_isbitset(valid_nodes, node)) { + cache_state_.insert({node,{}}); + } + } + std::cout << "[-] Cache Initialized" << std::endl; } inline std::unique_ptr offcache::Cache::Access(uint8_t* data, const size_t size, const ExecutionPolicy policy) { - // the best situation is if this data is already cached - // which we check in an unnamed block in which the cache - // is locked for reading to prevent another thread - // from marking the element we may find as unused and - // clearing it - { - std::shared_lock 
lock(cache_mutex_); - - const auto search = cache_state_.find(data); - - if (search != cache_state_.end()) { - if (search->second.size_ == size) { - search->second.active_->store(true); - - std::cout << "[+] Found Cached version for 0x" << std::hex << (uint64_t)data << std::dec << std::endl; - - return std::move(std::make_unique(search->second)); - } - else { - std::cout << "[!] Found Cached version with size missmatch for 0x" << std::hex << (uint64_t)data << std::dec << std::endl; + std::unique_ptr task = GetFromCache(data, size); - cache_state_.erase(search); - } - } + if (task != nullptr) { + return std::move(task); } // at this point the requested data is not present in cache // and we create a caching task for it - auto task = std::make_unique(data, size); + task = std::make_unique(data, size); if (policy == ExecutionPolicy::Immediate) { // in intermediate mode the returned task @@ -197,19 +197,12 @@ inline std::unique_ptr offcache::Cache::Access(uint8_t* dat } inline void offcache::Cache::SubmitTask(CacheData* task) { - // obtain numa node of current thread to determine where the data is needed - - const int current_cpu = sched_getcpu(); - const int current_node = numa_node_of_cpu(current_cpu); - - // obtain node that the given data pointer is allocated on - - int data_node = -1; - get_mempolicy(&data_node, NULL, 0, (void*)task->src_, MPOL_F_NODE | MPOL_F_ADDR); + // get destination numa node for the cache - // querry cache policy function for the destination numa node + int dst_node = -1; + int src_node = -1; - const int dst_node = cache_policy_function_(current_node, data_node, task->size_); + GetCacheNode(task->src_, task->size_, &dst_node, &src_node); std::cout << "[+] Allocating " << task->size_ << "B on node " << dst_node << " for " << std::hex << (uint64_t)task->src_ << std::dec << std::endl; @@ -236,7 +229,7 @@ inline void offcache::Cache::SubmitTask(CacheData* task) { // querry copy policy function for the nodes to use for the copy - const std::vector executing_nodes = copy_policy_function_(dst_node, data_node); + const std::vector executing_nodes = copy_policy_function_(dst_node, src_node); const size_t task_count = executing_nodes.size(); // each task will copy one fair part of the total size @@ -272,7 +265,7 @@ inline void offcache::Cache::SubmitTask(CacheData* task) { { std::unique_lock lock(cache_mutex_); - const auto state = cache_state_.insert({task->src_, *task}); + const auto state = cache_state_[dst_node].emplace(task->src_, *task); // if state.second is false then no insertion took place // which means that concurrently whith this thread @@ -283,20 +276,7 @@ inline void offcache::Cache::SubmitTask(CacheData* task) { if (!state.second) { std::cout << "[x] Found another cache instance for 0x" << std::hex << (uint64_t)task->src_ << std::dec << std::endl; - // first wait on all copy operations to be completed - - task->WaitOnCompletion(); - - // abort by doing the following steps - // (1) free the allocated memory, (2) remove the "maybe result" as - // we will not run the caching operation, (3) clear the sub tasks - // for the very same reason, (4) set the result to the RAM-location - - numa_free(dst, task->size_); - task->incomplete_cache_ = nullptr; - task->cache_->store(task->src_); - - std::cout << "[-] Abort completed for 0x" << std::hex << (uint64_t)task->src_ << std::dec << std::endl; + AbortTask(task); return; } @@ -346,7 +326,7 @@ offcache::CacheData::CacheData(uint8_t* data, const size_t size) { src_ = data; size_ = size; - active_ = new std::atomic(); + 
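A minimal standalone sketch of the reference-counting scheme introduced here, assuming a plain std::atomic<int32_t> in place of the CacheData member: the initial value of 1 is the reference the cache keeps internally, each copy adds one, and fetch_sub(1) - 1 yields the count remaining after the current release, which is the check the copy constructor and destructor below rely on.

#include <atomic>
#include <cassert>
#include <cstdint>

int main() {
    std::atomic<int32_t> active(1);            // reference held by the cache itself

    active.fetch_add(1);                       // a CacheData copy is handed out

    const int32_t after_copy = active.fetch_sub(1) - 1;   // the copy is destroyed
    assert(after_copy == 1);                   // the cache-internal reference remains

    const int32_t after_last = active.fetch_sub(1) - 1;   // the last owner releases
    assert(after_last <= 0);                   // only now may the buffers be freed

    return 0;
}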
active_ = new std::atomic(1); cache_ = new std::atomic(); incomplete_cache_ = nullptr; handlers_ = std::make_unique>(); @@ -355,21 +335,25 @@ offcache::CacheData::CacheData(uint8_t* data, const size_t size) { offcache::CacheData::CacheData(const offcache::CacheData& other) { std::cout << "[-] Copy Created for CacheData 0x" << std::hex << (uint64_t)other.src_ << std::dec << std::endl; + active_ = other.active_; + const int current_active = active_->fetch_add(1); + src_ = other.src_; size_ = other.size_; cache_ = other.cache_; - active_ = other.active_; incomplete_cache_ = nullptr; handlers_ = nullptr; - active_->fetch_add(1); } offcache::CacheData::~CacheData() { std::cout << "[-] Destructor for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; - const int32_t v = active_->fetch_sub(1); + // due to fetch_sub returning the preivously held value + // we must subtract one locally to get the current value - // if the returned value is non-positive + const int32_t v = active_->fetch_sub(1) - 1; + + // if the returned value is zero or lower // then we must execute proper deletion // as this was the last reference @@ -390,7 +374,23 @@ void offcache::CacheData::Deallocate() { incomplete_cache_ = nullptr; } -uint8_t *offcache::CacheData::GetDataLocation() const { +void offcache::Cache::GetCacheNode(uint8_t* src, const size_t size, int* OUT_DST_NODE, int* OUT_SRC_NODE) const { + // obtain numa node of current thread to determine where the data is needed + + const int current_cpu = sched_getcpu(); + const int current_node = numa_node_of_cpu(current_cpu); + + // obtain node that the given data pointer is allocated on + + *OUT_SRC_NODE = -1; + get_mempolicy(OUT_SRC_NODE, NULL, 0, (void*)src, MPOL_F_NODE | MPOL_F_ADDR); + + // querry cache policy function for the destination numa node + + *OUT_DST_NODE = cache_policy_function_(current_node, *OUT_SRC_NODE, size); +} + +uint8_t* offcache::CacheData::GetDataLocation() const { return cache_->load(); } @@ -405,17 +405,70 @@ inline void offcache::Cache::Flush() { { std::unique_lock lock(cache_mutex_); - - auto it = cache_state_.begin(); - while (it != cache_state_.end()) { - if (it->second.Active() == false) { - cache_state_.erase(it); - it = cache_state_.begin(); - } - else { - it++; + for (auto& nc : cache_state_) { + auto it = nc.second.begin(); + + while (it != nc.second.end()) { + if (it->second.Active() == false) { + nc.second.erase(it); + it = nc.second.begin(); + } + else { + it++; + } } } } -} \ No newline at end of file +} + +void offcache::Cache::AbortTask(offcache::CacheData *task) const { + // first wait on all copy operations to be completed + + task->WaitOnCompletion(); + + // abort by doing the following steps + // (1) free the allocated memory, (2) remove the "maybe result" as + // we will not run the caching operation, (3) clear the sub tasks + // for the very same reason, (4) set the result to the RAM-location + + numa_free(task->incomplete_cache_, task->size_); + task->incomplete_cache_ = nullptr; + task->cache_->store(task->src_); + + std::cout << "[-] Abort completed for 0x" << std::hex << (uint64_t)task->src_ << std::dec << std::endl; +} + +std::unique_ptr offcache::Cache::GetFromCache(uint8_t* src, const size_t size) { + // the best situation is if this data is already cached + // which we check in an unnamed block in which the cache + // is locked for reading to prevent another thread + // from marking the element we may find as unused and + // clearing it + + int dst_node = -1; + int src_node = -1; + + 
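A self-contained sketch of the node lookup that GetCacheNode() above performs, assuming libnuma is available and the program is linked with -lnuma (and, on glibc, that _GNU_SOURCE is defined for sched_getcpu): get_mempolicy() with MPOL_F_NODE | MPOL_F_ADDR reports the node backing an address, while sched_getcpu() and numa_node_of_cpu() identify the node of the calling thread.

#include <cstdio>
#include <numa.h>
#include <numaif.h>
#include <sched.h>

int main() {
    if (numa_available() < 0) return 1;

    // allocate a small buffer on node 0 so the query has a known answer
    void* data = numa_alloc_onnode(4096, 0);
    if (data == nullptr) return 1;

    // node that currently backs the allocation
    int src_node = -1;
    get_mempolicy(&src_node, nullptr, 0, data, MPOL_F_NODE | MPOL_F_ADDR);

    // node of the thread asking for the data
    const int current_node = numa_node_of_cpu(sched_getcpu());

    std::printf("data resides on node %d, requested from node %d\n", src_node, current_node);

    numa_free(data, 4096);
    return 0;
}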
GetCacheNode(src, size, &dst_node, &src_node); + + std::shared_lock lock(cache_mutex_); + + const auto search = cache_state_[dst_node].find(src); + + if (search != cache_state_[dst_node].end()) { + if (search->second.size_ == size) { + search->second.active_->store(true); + + std::cout << "[+] Found Cached version for 0x" << std::hex << (uint64_t)src << std::dec << std::endl; + + return std::move(std::make_unique(search->second)); + } + else { + std::cout << "[!] Found Cached version with size missmatch for 0x" << std::hex << (uint64_t)src << std::dec << std::endl; + + cache_state_[dst_node].erase(search); + } + } + + return nullptr; +} From c01eafedaea03fb70f2c2ae0421e5f2a4b7b2f96 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Constantin=20F=C3=BCrst?= Date: Wed, 10 Jan 2024 11:45:29 +0100 Subject: [PATCH 07/29] refactor the cacher to reduce complexity, removes the access guarantees (relaxed,immediate,...), uses the fact that other tasks will wait on atomic value change for the cache-pointer if it is nullptr to add the entry to cache structure earlier reducing cost of two threads accessing new entry at the same time, splits the offloading-cache.hpp file into two with one containing the data-class (represents a cache entry and task) and the other containing the cacher itself --- offloading-cacher/cache-data.hpp | 139 ++++++++ offloading-cacher/cache.hpp | 280 +++++++++++++++ offloading-cacher/main.cpp | 30 +- offloading-cacher/offloading-cache.hpp | 474 ------------------------- 4 files changed, 432 insertions(+), 491 deletions(-) create mode 100644 offloading-cacher/cache-data.hpp create mode 100644 offloading-cacher/cache.hpp delete mode 100644 offloading-cacher/offloading-cache.hpp diff --git a/offloading-cacher/cache-data.hpp b/offloading-cacher/cache-data.hpp new file mode 100644 index 0000000..4028597 --- /dev/null +++ b/offloading-cacher/cache-data.hpp @@ -0,0 +1,139 @@ +#pragma once + +#include + +#include +#include +#include + +#include + +namespace dsacache { + class Cache; + + // the cache task structure will be used to submit and + // control a cache element, while providing source pointer + // and size in bytes for submission + // + // then the submitting thread may wait on the atomic "result" + // which will be notified by the cache worker upon processing + // after which the atomic-bool-ptr active will also become valid + class CacheData { + public: + using dml_handler = dml::handler>; + + private: + uint8_t* src_; + size_t size_; + + std::atomic* active_; + + protected: + std::atomic* cache_; + + uint8_t* incomplete_cache_; + + std::unique_ptr> handlers_; + + friend Cache; + + public: + CacheData(uint8_t* data, const size_t size); + CacheData(const CacheData& other); + ~CacheData(); + + void Deallocate(); + + void WaitOnCompletion(); + + uint8_t* GetDataLocation() const; + + bool Active() const; + }; +} + +inline void dsacache::CacheData::WaitOnCompletion() { + if (handlers_ == nullptr) { + std::cout << "[-] Waiting on cache-var-update for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; + + cache_->wait(nullptr); + + std::cout << "[+] Finished waiting on cache-var-update for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; + } + else { + std::cout << "[-] Waiting on handlers for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; + + for (auto& handler : *handlers_) { + auto result = handler.get(); + // TODO: handle the returned status code + } + + handlers_ = nullptr; + + std::cout << "[+] Finished waiting on handlers 
for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; + + cache_->store(incomplete_cache_); + cache_->notify_all(); + } +} + +dsacache::CacheData::CacheData(uint8_t* data, const size_t size) { + std::cout << "[-] New CacheData 0x" << std::hex << (uint64_t)data << std::dec << std::endl; + + src_ = data; + size_ = size; + active_ = new std::atomic(1); + cache_ = new std::atomic(); + incomplete_cache_ = nullptr; + handlers_ = std::make_unique>(); +} + +dsacache::CacheData::CacheData(const dsacache::CacheData& other) { + std::cout << "[-] Copy Created for CacheData 0x" << std::hex << (uint64_t)other.src_ << std::dec << std::endl; + + active_ = other.active_; + const int current_active = active_->fetch_add(1); + + src_ = other.src_; + size_ = other.size_; + cache_ = other.cache_; + incomplete_cache_ = nullptr; + handlers_ = nullptr; +} + +dsacache::CacheData::~CacheData() { + std::cout << "[-] Destructor for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; + + // due to fetch_sub returning the preivously held value + // we must subtract one locally to get the current value + + const int32_t v = active_->fetch_sub(1) - 1; + + // if the returned value is zero or lower + // then we must execute proper deletion + // as this was the last reference + + if (v <= 0) { + std::cout << "[!] Full Destructor for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; + + Deallocate(); + delete active_; + delete cache_; + } +} + +void dsacache::CacheData::Deallocate() { + std::cout << "[!] Deallocating for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; + + numa_free(cache_, size_); + cache_ = nullptr; + incomplete_cache_ = nullptr; +} + +uint8_t* dsacache::CacheData::GetDataLocation() const { + return cache_->load(); +} + +bool dsacache::CacheData::Active() const { + return active_->load() > 0; +} \ No newline at end of file diff --git a/offloading-cacher/cache.hpp b/offloading-cacher/cache.hpp new file mode 100644 index 0000000..0081a04 --- /dev/null +++ b/offloading-cacher/cache.hpp @@ -0,0 +1,280 @@ +#pragma once + +#include + +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include "cache-data.hpp" + +namespace dsacache { + // singleton which holds the cache workers + // and is the place where work will be submited + class Cache { + public: + // cache policy is defined as a type here to allow flexible usage of the cacher + // given a numa destination node (where the data will be needed), the numa source + // node (current location of the data) and the data size, this function should + // return optimal cache placement + // dst node and returned value can differ if the system, for example, has HBM + // attached accessible directly to node n under a different node id m + typedef int (CachePolicy)(const int numa_dst_node, const int numa_src_node, const size_t data_size); + + // copy policy specifies the copy-executing nodes for a given task + // which allows flexibility in assignment for optimizing raw throughput + // or choosing a conservative usage policy + typedef std::vector (CopyPolicy)(const int numa_dst_node, const int numa_src_node); + + private: + // mutex for accessing the cache state map + + std::shared_mutex cache_mutex_; + + // map from [dst-numa-node,map2] + // map2 from [data-ptr,cache-structure] + + std::unordered_map> cache_state_; + + CachePolicy* cache_policy_function_ = nullptr; + CopyPolicy* copy_policy_function_ = nullptr; + + dml::handler> ExecuteCopy( + const uint8_t* 
src, uint8_t* dst, const size_t size, const int node + ) const; + + void SubmitTask(CacheData* task, const int dst_node, const int src_node); + + void GetCacheNode(uint8_t* src, const size_t size, int* OUT_DST_NODE, int* OUT_SRC_NODE) const; + + std::unique_ptr GetFromCache(uint8_t* src, const size_t size, const int dst_node); + + public: + void Init(CachePolicy* cache_policy_function, CopyPolicy* copy_policy_function); + + // function to perform data access through the cache + std::unique_ptr Access(uint8_t* data, const size_t size); + + void Flush(const int node = -1); + }; +} + +inline void dsacache::Cache::Init(CachePolicy* cache_policy_function, CopyPolicy* copy_policy_function) { + cache_policy_function_ = cache_policy_function; + copy_policy_function_ = copy_policy_function; + + // initialize numa library + numa_available(); + + const int nodes_max = numa_num_configured_nodes(); + const bitmask* valid_nodes = numa_get_mems_allowed(); + + for (int node = 0; node < nodes_max; node++) { + if (numa_bitmask_isbitset(valid_nodes, node)) { + cache_state_.insert({node,{}}); + } + } + + std::cout << "[-] Cache Initialized" << std::endl; +} + +inline std::unique_ptr dsacache::Cache::Access(uint8_t* data, const size_t size) { + // get destination numa node for the cache + + int dst_node = -1; + int src_node = -1; + + GetCacheNode(data, size, &dst_node, &src_node); + + // check whether the data is already cached + + std::unique_ptr task = GetFromCache(data, size, dst_node); + + if (task != nullptr) { + return std::move(task); + } + + // at this point the requested data is not present in cache + // and we create a caching task for it + + task = std::make_unique(data, size); + + { + std::unique_lock lock(cache_mutex_); + + const auto state = cache_state_[dst_node].emplace(task->src_, *task); + + // if state.second is false then no insertion took place + // which means that concurrently whith this thread + // some other thread must have accessed the same + // resource in which case we return the other + // threads data cache structure + + if (!state.second) { + std::cout << "[!] Found another cache instance for 0x" << std::hex << (uint64_t)task->src_ << std::dec << std::endl; + return std::move(std::make_unique(state.first->second)); + } + } + + SubmitTask(task.get(), dst_node, src_node); + + return std::move(task); +} + +inline void dsacache::Cache::SubmitTask(CacheData* task, const int dst_node, const int src_node) { + std::cout << "[+] Allocating " << task->size_ << "B on node " << dst_node << " for " << std::hex << (uint64_t)task->src_ << std::dec << std::endl; + + // allocate data on this node and flush the unused parts of the + // cache if the operation fails and retry once + // TODO: smarter flush strategy could keep some stuff cached + + uint8_t* dst = reinterpret_cast(numa_alloc_onnode(task->size_, dst_node)); + + if (dst == nullptr) { + std::cout << "[!] 
First allocation try failed for " << task->size_ << "B on node " << dst_node << std::endl; + + // allocation on dst_node failed so we flush the cache for this + // node hoping to free enough currently unused entries to make + // the second allocation attempt successful + + Flush(dst_node); + + dst = reinterpret_cast(numa_alloc_onnode(task->size_, dst_node)); + + if (dst == nullptr) { + std::cout << "[x] Second allocation try failed for " << task->size_ << "B on node " << dst_node << std::endl; + return; + } + } + + task->incomplete_cache_ = dst; + + // querry copy policy function for the nodes to use for the copy + + const std::vector executing_nodes = copy_policy_function_(dst_node, src_node); + const size_t task_count = executing_nodes.size(); + + // each task will copy one fair part of the total size + // and in case the total size is not a factor of the + // given task count the last node must copy the remainder + + const size_t size = task->size_ / task_count; + const size_t last_size = size + task->size_ % task_count; + + std::cout << "[-] Splitting Copy into " << task_count << " tasks of " << size << "B 0x" << std::hex << (uint64_t)task->src_ << std::dec << std::endl; + + // save the current numa node mask to restore later + // as executing the copy task will place this thread + // on a different node + + bitmask* nodemask = numa_get_run_node_mask(); + + for (uint32_t i = 0; i < task_count; i++) { + const size_t local_size = i + 1 == task_count ? size : last_size; + const size_t local_offset = i * size; + const uint8_t* local_src = task->src_ + local_offset; + uint8_t* local_dst = dst + local_offset; + + task->handlers_->emplace_back(ExecuteCopy(local_src, local_dst, local_size, executing_nodes[i])); + } + + // restore the previous nodemask + + numa_run_on_node_mask(nodemask); +} + +inline dml::handler> dsacache::Cache::ExecuteCopy( + const uint8_t* src, uint8_t* dst, const size_t size, const int node +) const { + dml::const_data_view srcv = dml::make_view(src, size); + dml::data_view dstv = dml::make_view(dst, size); + + numa_run_on_node(node); + + return dml::submit(dml::mem_copy.block_on_fault(), srcv, dstv); +} + + +void dsacache::Cache::GetCacheNode(uint8_t* src, const size_t size, int* OUT_DST_NODE, int* OUT_SRC_NODE) const { + // obtain numa node of current thread to determine where the data is needed + + const int current_cpu = sched_getcpu(); + const int current_node = numa_node_of_cpu(current_cpu); + + // obtain node that the given data pointer is allocated on + + *OUT_SRC_NODE = -1; + get_mempolicy(OUT_SRC_NODE, NULL, 0, (void*)src, MPOL_F_NODE | MPOL_F_ADDR); + + // querry cache policy function for the destination numa node + + *OUT_DST_NODE = cache_policy_function_(current_node, *OUT_SRC_NODE, size); +} + + +inline void dsacache::Cache::Flush(const int node) { + std::cout << "[-] Flushing Cache for " << (node == -1 ? 
"all nodes" : "node " + std::to_string(node)) << std::endl; + + const auto FlushNode = [](std::unordered_map& map) { + auto it = map.begin(); + + while (it != map.end()) { + if (it->second.Active() == false) { + map.erase(it); + it = map.begin(); + } + else { + it++; + } + } + }; + + { + std::unique_lock lock(cache_mutex_); + + if (node == -1) { + for (auto& nc : cache_state_) { + FlushNode(nc.second); + } + } + else { + FlushNode(cache_state_[node]); + } + } +} + +std::unique_ptr dsacache::Cache::GetFromCache(uint8_t* src, const size_t size, const int dst_node) { + // the best situation is if this data is already cached + // which we check in an unnamed block in which the cache + // is locked for reading to prevent another thread + // from marking the element we may find as unused and + // clearing it + + std::shared_lock lock(cache_mutex_); + + const auto search = cache_state_[dst_node].find(src); + + if (search != cache_state_[dst_node].end()) { + if (search->second.size_ == size) { + search->second.active_->store(true); + + std::cout << "[+] Found Cached version for 0x" << std::hex << (uint64_t)src << std::dec << std::endl; + + return std::move(std::make_unique(search->second)); + } + else { + std::cout << "[!] Found Cached version with size missmatch for 0x" << std::hex << (uint64_t)src << std::dec << std::endl; + + cache_state_[dst_node].erase(search); + } + } + + return nullptr; +} diff --git a/offloading-cacher/main.cpp b/offloading-cacher/main.cpp index 726033b..e67eb22 100644 --- a/offloading-cacher/main.cpp +++ b/offloading-cacher/main.cpp @@ -1,9 +1,9 @@ #include #include -#include "offloading-cache.hpp" +#include "cache.hpp" -offcache::Cache CACHE; +dsacache::Cache CACHE; double* GetRandomArray(const size_t size) { double* array = new double[size]; @@ -32,22 +32,9 @@ bool IsEqual(const double* a, const double* b, const size_t size) { } void PerformAccessAndTest(double* src, const size_t size) { - // this is the function that any cache access will go through - // execution policy picks between three options: - // Relaxed may return an invalid (but not nullptr) CacheData - // which can then be validated with WaitOnCompletion() - // Immediate never returns an invalid CacheData structure - // however it may return just the pointer to source - // WaitOnCompletion() will then ensure that the data - // is actually in cache - // ImmediateNoCache behaves the same as Immediate but does never perform - // caching itself so only returns cached version if - // previously cached is available - - std::unique_ptr data_cache = CACHE.Access( + std::unique_ptr data_cache = CACHE.Access( reinterpret_cast(src), - size * sizeof(double), - offcache::ExecutionPolicy::Immediate + size * sizeof(double) ); double* cached_imm = reinterpret_cast(data_cache->GetDataLocation()); @@ -57,6 +44,9 @@ void PerformAccessAndTest(double* src, const size_t size) { if (src == cached_imm) { std::cout << "Caching did not immediately yield different data location." << std::endl; } + else if (cached_imm == nullptr) { + std::cout << "Immediately got nullptr." << std::endl; + } else { std::cout << "Immediately got different data location." << std::endl; } @@ -74,6 +64,12 @@ void PerformAccessAndTest(double* src, const size_t size) { if (src == cached) { std::cout << "Caching did not affect data location." << std::endl; } + else if (cached == nullptr) { + std::cout << "Got nullptr from cache." << std::endl; + } + else { + std::cout << "Got different data location from cache." 
<< std::endl; + } if (IsEqual(src,cached,size)) { std::cout << "Cached data is correct." << std::endl; diff --git a/offloading-cacher/offloading-cache.hpp b/offloading-cacher/offloading-cache.hpp deleted file mode 100644 index e265665..0000000 --- a/offloading-cacher/offloading-cache.hpp +++ /dev/null @@ -1,474 +0,0 @@ -#pragma once - -#include - -#include -#include -#include -#include -#include -#include -#include - -#include - -#include -#include -#include - -#include - -namespace offcache { - // execution policy selects in which way the data is supposed to be cached - // and returned with the following behaviour is guaranteed in addition to the - // returned value being valid: - // Immediate: return as fast as possible - // may return cached data, can return data in RAM - // will trigger caching of the data provided - // ImmediateNoCache: return as fast as possible and never trigger caching - // same as Immediate but will not trigger caching - // Relaxed: no rapid return needed, take time - // will trigger caching and may only return - // once the caching is successful but can still - // provide data in RAM - enum class ExecutionPolicy { - Relaxed, Immediate, ImmediateNoCache - }; - - class Cache; - - // the cache task structure will be used to submit and - // control a cache element, while providing source pointer - // and size in bytes for submission - // - // then the submitting thread may wait on the atomic "result" - // which will be notified by the cache worker upon processing - // after which the atomic-bool-ptr active will also become valid - class CacheData { - public: - using dml_handler = dml::handler>; - - private: - uint8_t* src_; - size_t size_; - - std::atomic* active_; - - protected: - std::atomic* cache_; - - uint8_t* incomplete_cache_; - - std::unique_ptr> handlers_; - - friend Cache; - - public: - CacheData(uint8_t* data, const size_t size); - CacheData(const CacheData& other); - ~CacheData(); - - void Deallocate(); - void WaitOnCompletion(); - - uint8_t* GetDataLocation() const; - - bool Active() const; - }; - - // singleton which holds the cache workers - // and is the place where work will be submited - class Cache { - public: - // cache policy is defined as a type here to allow flexible usage of the cacher - // given a numa destination node (where the data will be needed), the numa source - // node (current location of the data) and the data size, this function should - // return optimal cache placement - // dst node and returned value can differ if the system, for example, has HBM - // attached accessible directly to node n under a different node id m - typedef int (CachePolicy)(const int numa_dst_node, const int numa_src_node, const size_t data_size); - - // copy policy specifies the copy-executing nodes for a given task - // which allows flexibility in assignment for optimizing raw throughput - // or choosing a conservative usage policy - typedef std::vector (CopyPolicy)(const int numa_dst_node, const int numa_src_node); - - private: - // mutex for accessing the cache state map - - std::shared_mutex cache_mutex_; - - // map from [dst-numa-node,map2] - // map2 from [data-ptr,cache-structure] - - std::unordered_map> cache_state_; - - CachePolicy* cache_policy_function_ = nullptr; - CopyPolicy* copy_policy_function_ = nullptr; - - dml::handler> ExecuteCopy(const uint8_t* src, uint8_t* dst, const size_t size, const int node) const; - - void SubmitTask(CacheData* task); - - void GetCacheNode(uint8_t* src, const size_t size, int* OUT_DST_NODE, int* OUT_SRC_NODE) 
const; - - void AbortTask(CacheData* task) const; - - std::unique_ptr GetFromCache(uint8_t* src, const size_t size); - - public: - void Init(CachePolicy* cache_policy_function, CopyPolicy* copy_policy_function); - - // function to perform data access through the cache - // behaviour depends on the chosen execution policy - // Immediate and ImmediateNoCache return a cache task - // with guaranteed-valid result value where Relaxed - // policy does not come with this guarantee. - std::unique_ptr Access(uint8_t* data, const size_t size, const ExecutionPolicy policy); - - void Flush(); - }; -} - -inline void offcache::Cache::Init(CachePolicy* cache_policy_function, CopyPolicy* copy_policy_function) { - cache_policy_function_ = cache_policy_function; - copy_policy_function_ = copy_policy_function; - - // initialize numa library - numa_available(); - - const int nodes_max = numa_num_configured_nodes(); - const bitmask* valid_nodes = numa_get_mems_allowed(); - - for (int node = 0; node < nodes_max; node++) { - if (numa_bitmask_isbitset(valid_nodes, node)) { - cache_state_.insert({node,{}}); - } - } - - std::cout << "[-] Cache Initialized" << std::endl; -} - -inline std::unique_ptr offcache::Cache::Access(uint8_t* data, const size_t size, const ExecutionPolicy policy) { - std::unique_ptr task = GetFromCache(data, size); - - if (task != nullptr) { - return std::move(task); - } - - // at this point the requested data is not present in cache - // and we create a caching task for it - - task = std::make_unique(data, size); - - if (policy == ExecutionPolicy::Immediate) { - // in intermediate mode the returned task - // object is guaranteed to be valid and therefore - // its resulting location must be validated - // after which we submit the task - // maybe_result is then set by submit - - task->cache_->store(data); - SubmitTask(task.get()); - return std::move(task); - } - else if (policy == ExecutionPolicy::ImmediateNoCache) { - // for immediatenocache we just validate - // the generated task and return it - // we must also set maybe_result in case - // someone waits on this - - task->cache_->store(data); - task->incomplete_cache_ = data; - return std::move(task); - } - else if (policy == ExecutionPolicy::Relaxed) { - // for relaxed no valid task must be returned - // and therefore we just submit and then give - // the possible invalid task back with only - // maybe_result set by submission - - SubmitTask(task.get()); - return std::move(task); - } - else { - // this should not be reached - } -} - -inline void offcache::Cache::SubmitTask(CacheData* task) { - // get destination numa node for the cache - - int dst_node = -1; - int src_node = -1; - - GetCacheNode(task->src_, task->size_, &dst_node, &src_node); - - std::cout << "[+] Allocating " << task->size_ << "B on node " << dst_node << " for " << std::hex << (uint64_t)task->src_ << std::dec << std::endl; - - // allocate data on this node and flush the unused parts of the - // cache if the operation fails and retry once - // TODO: smarter flush strategy could keep some stuff cached - - uint8_t* dst = reinterpret_cast(numa_alloc_onnode(task->size_, dst_node)); - - if (dst == nullptr) { - std::cout << "[!] 
First allocation try failed for " << task->size_ << "B on node " << dst_node << std::endl; - - Flush(); - - dst = reinterpret_cast(numa_alloc_onnode(task->size_, dst_node)); - - if (dst == nullptr) { - std::cout << "[x] Second allocation try failed for " << task->size_ << "B on node " << dst_node << std::endl; - return; - } - } - - task->incomplete_cache_ = dst; - - // querry copy policy function for the nodes to use for the copy - - const std::vector executing_nodes = copy_policy_function_(dst_node, src_node); - const size_t task_count = executing_nodes.size(); - - // each task will copy one fair part of the total size - // and in case the total size is not a factor of the - // given task count the last node must copy the remainder - - const size_t size = task->size_ / task_count; - const size_t last_size = size + task->size_ % task_count; - - std::cout << "[-] Splitting Copy into " << task_count << " tasks of " << size << "B 0x" << std::hex << (uint64_t)task->src_ << std::dec << std::endl; - - // save the current numa node mask to restore later - // as executing the copy task will place this thread - // on a different node - - bitmask* nodemask = numa_get_run_node_mask(); - - for (uint32_t i = 0; i < task_count; i++) { - const size_t local_size = i + 1 == task_count ? size : last_size; - const size_t local_offset = i * size; - const uint8_t* local_src = task->src_ + local_offset; - uint8_t* local_dst = dst + local_offset; - - task->handlers_->emplace_back(ExecuteCopy(local_src, local_dst, local_size, executing_nodes[i])); - } - - // only at this point may the task be added to the control structure - // because adding it earlier could cause it to be returned for an - // access request while the handler-vector is not fully populated - // which could cause the wait-function to return prematurely - // TODO: this can be optimized because the abort is quite expensive - - { - std::unique_lock lock(cache_mutex_); - - const auto state = cache_state_[dst_node].emplace(task->src_, *task); - - // if state.second is false then no insertion took place - // which means that concurrently whith this thread - // some other thread must have accessed the same - // resource in which case we must perform an abort - // TODO: abort is not the only way to handle this situation - - if (!state.second) { - std::cout << "[x] Found another cache instance for 0x" << std::hex << (uint64_t)task->src_ << std::dec << std::endl; - - AbortTask(task); - - return; - } - } - - // restore the previous nodemask - - numa_run_on_node_mask(nodemask); -} - -inline dml::handler> offcache::Cache::ExecuteCopy(const uint8_t* src, uint8_t* dst, const size_t size, const int node) const { - dml::const_data_view srcv = dml::make_view(src, size); - dml::data_view dstv = dml::make_view(dst, size); - - numa_run_on_node(node); - - return dml::submit(dml::mem_copy.block_on_fault(), srcv, dstv); -} - -inline void offcache::CacheData::WaitOnCompletion() { - if (handlers_ == nullptr) { - std::cout << "[-] Waiting on cache-var-update for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; - - cache_->wait(nullptr); - - std::cout << "[+] Finished waiting on cache-var-update for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; - } - else { - std::cout << "[-] Waiting on handlers for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; - - for (auto& handler : *handlers_) { - auto result = handler.get(); - // TODO: handle the returned status code - } - - handlers_ = nullptr; - - std::cout << "[+] 
Finished waiting on handlers for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; - - cache_->store(incomplete_cache_); - cache_->notify_all(); - } -} - -offcache::CacheData::CacheData(uint8_t* data, const size_t size) { - std::cout << "[-] New CacheData 0x" << std::hex << (uint64_t)data << std::dec << std::endl; - - src_ = data; - size_ = size; - active_ = new std::atomic(1); - cache_ = new std::atomic(); - incomplete_cache_ = nullptr; - handlers_ = std::make_unique>(); -} - -offcache::CacheData::CacheData(const offcache::CacheData& other) { - std::cout << "[-] Copy Created for CacheData 0x" << std::hex << (uint64_t)other.src_ << std::dec << std::endl; - - active_ = other.active_; - const int current_active = active_->fetch_add(1); - - src_ = other.src_; - size_ = other.size_; - cache_ = other.cache_; - incomplete_cache_ = nullptr; - handlers_ = nullptr; -} - -offcache::CacheData::~CacheData() { - std::cout << "[-] Destructor for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; - - // due to fetch_sub returning the preivously held value - // we must subtract one locally to get the current value - - const int32_t v = active_->fetch_sub(1) - 1; - - // if the returned value is zero or lower - // then we must execute proper deletion - // as this was the last reference - - if (v <= 0) { - std::cout << "[!] Full Destructor for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; - - Deallocate(); - delete active_; - delete cache_; - } -} - -void offcache::CacheData::Deallocate() { - std::cout << "[!] Deallocating for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; - - numa_free(cache_, size_); - cache_ = nullptr; - incomplete_cache_ = nullptr; -} - -void offcache::Cache::GetCacheNode(uint8_t* src, const size_t size, int* OUT_DST_NODE, int* OUT_SRC_NODE) const { - // obtain numa node of current thread to determine where the data is needed - - const int current_cpu = sched_getcpu(); - const int current_node = numa_node_of_cpu(current_cpu); - - // obtain node that the given data pointer is allocated on - - *OUT_SRC_NODE = -1; - get_mempolicy(OUT_SRC_NODE, NULL, 0, (void*)src, MPOL_F_NODE | MPOL_F_ADDR); - - // querry cache policy function for the destination numa node - - *OUT_DST_NODE = cache_policy_function_(current_node, *OUT_SRC_NODE, size); -} - -uint8_t* offcache::CacheData::GetDataLocation() const { - return cache_->load(); -} - -bool offcache::CacheData::Active() const { - return active_->load() > 0; -} - -inline void offcache::Cache::Flush() { - std::cout << "[-] Flushing Cache" << std::endl; - - // TODO: there is a better way to implement this flush - - { - std::unique_lock lock(cache_mutex_); - - for (auto& nc : cache_state_) { - auto it = nc.second.begin(); - - while (it != nc.second.end()) { - if (it->second.Active() == false) { - nc.second.erase(it); - it = nc.second.begin(); - } - else { - it++; - } - } - } - } -} - -void offcache::Cache::AbortTask(offcache::CacheData *task) const { - // first wait on all copy operations to be completed - - task->WaitOnCompletion(); - - // abort by doing the following steps - // (1) free the allocated memory, (2) remove the "maybe result" as - // we will not run the caching operation, (3) clear the sub tasks - // for the very same reason, (4) set the result to the RAM-location - - numa_free(task->incomplete_cache_, task->size_); - task->incomplete_cache_ = nullptr; - task->cache_->store(task->src_); - - std::cout << "[-] Abort completed for 0x" << std::hex << 
(uint64_t)task->src_ << std::dec << std::endl; -} - -std::unique_ptr offcache::Cache::GetFromCache(uint8_t* src, const size_t size) { - // the best situation is if this data is already cached - // which we check in an unnamed block in which the cache - // is locked for reading to prevent another thread - // from marking the element we may find as unused and - // clearing it - - int dst_node = -1; - int src_node = -1; - - GetCacheNode(src, size, &dst_node, &src_node); - - std::shared_lock lock(cache_mutex_); - - const auto search = cache_state_[dst_node].find(src); - - if (search != cache_state_[dst_node].end()) { - if (search->second.size_ == size) { - search->second.active_->store(true); - - std::cout << "[+] Found Cached version for 0x" << std::hex << (uint64_t)src << std::dec << std::endl; - - return std::move(std::make_unique(search->second)); - } - else { - std::cout << "[!] Found Cached version with size missmatch for 0x" << std::hex << (uint64_t)src << std::dec << std::endl; - - cache_state_[dst_node].erase(search); - } - } - - return nullptr; -} From 46de3151a2634dbf7eeb75556eb89ee0ff2f669e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Constantin=20F=C3=BCrst?= Date: Wed, 10 Jan 2024 12:29:31 +0100 Subject: [PATCH 08/29] add a lot of comments to the code, also handle errors in the dml handlers gracefully --- offloading-cacher/cache-data.hpp | 131 +++++++++++++++++++++++--- offloading-cacher/cache.hpp | 35 ++++++- offloading-cacher/util/dml-helper.hpp | 60 ++++++++---- 3 files changed, 192 insertions(+), 34 deletions(-) diff --git a/offloading-cacher/cache-data.hpp b/offloading-cacher/cache-data.hpp index 4028597..4de6138 100644 --- a/offloading-cacher/cache-data.hpp +++ b/offloading-cacher/cache-data.hpp @@ -8,6 +8,8 @@ #include +#include "util/dml-helper.hpp" + namespace dsacache { class Cache; @@ -23,57 +25,130 @@ namespace dsacache { using dml_handler = dml::handler>; private: + // data source and size of the block uint8_t* src_; size_t size_; + // global reference counting object std::atomic* active_; - protected: + // global cache-location pointer std::atomic* cache_; + // object-local incomplete cache location pointer + // which is only available in the first instance uint8_t* incomplete_cache_; + // dml handler vector pointer which is only + // available in the first instance std::unique_ptr> handlers_; - friend Cache; + // deallocates the global cache-location + // and invalidates it + void Deallocate(); + + // checks whether there are at least two + // valid references to this object which + // is done as the cache always has one + // internally to any living instance + bool Active() const; + friend Cache; public: CacheData(uint8_t* data, const size_t size); CacheData(const CacheData& other); ~CacheData(); - void Deallocate(); - + // waits on completion of caching operations + // for this task and is safe to be called in + // any state of the object void WaitOnCompletion(); + // returns the cache data location for this + // instance which is valid as long as the + // instance is alive uint8_t* GetDataLocation() const; - - bool Active() const; }; } inline void dsacache::CacheData::WaitOnCompletion() { + // the cache data entry can be in two states + // either it is the original one which has not + // been waited for in which case the handlers + // are non-null or it is not + if (handlers_ == nullptr) { std::cout << "[-] Waiting on cache-var-update for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; + // when no handlers are attached to this cache entry we 
wait on a + // value change for the cache structure from nullptr to non-null + // which will either go through immediately if the cache is valid + // already or wait until the handler-owning thread notifies us + cache_->wait(nullptr); std::cout << "[+] Finished waiting on cache-var-update for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; } else { + // when the handlers are non-null there are some DSA task handlers + // available on which we must wait here + std::cout << "[-] Waiting on handlers for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; + // abort is set if any operation encountered an error + + bool abort = false; + for (auto& handler : *handlers_) { auto result = handler.get(); - // TODO: handle the returned status code + + if (result.status != dml::status_code::ok) { + std::cerr << "[x] Encountered bad status code for operation: " << dml::StatusCodeToString(result.status) << std::endl; + + // if one of the copy tasks failed we abort the whole task + // after all operations are completed on it + + abort = true; + } } + // the handlers are cleared after all have completed + handlers_ = nullptr; - std::cout << "[+] Finished waiting on handlers for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; + // now we act depending on whether an abort has been + // called for which signals operation incomplete + + if (abort) { + // store nullptr in the cache location + + cache_->store(nullptr); + + // then free the now incomplete cache + + // TODO: it would be possible to salvage the + // TODO: operation at this point but this + // TODO: is quite complicated so we just abort + + numa_free(incomplete_cache_, size_); + } + else { + std::cout << "[+] Finished waiting on handlers for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; + + // incomplete cache is now safe to use and therefore we + // swap it with the global cache state of this entry + // and notify potentially waiting threads + + cache_->store(incomplete_cache_); + } + + // as a last step all waiting threads must + // be notified (copies of this will wait on value + // change of the cache) and the incomplete cache + // is cleared to nullptr as it is not incomplete - cache_->store(incomplete_cache_); cache_->notify_all(); + incomplete_cache_ = nullptr; } } @@ -91,12 +166,24 @@ dsacache::CacheData::CacheData(uint8_t* data, const size_t size) { dsacache::CacheData::CacheData(const dsacache::CacheData& other) { std::cout << "[-] Copy Created for CacheData 0x" << std::hex << (uint64_t)other.src_ << std::dec << std::endl; + // we copy the ptr to the global atomic reference counter + // and increase the amount of active references + active_ = other.active_; const int current_active = active_->fetch_add(1); + // source and size will be copied too + // as well as the reference to the global + // atomic cache pointer + src_ = other.src_; size_ = other.size_; cache_ = other.cache_; + + // incomplete cache and handlers will not + // be copied because only the first instance + // will wait on the completion of handlers + incomplete_cache_ = nullptr; handlers_ = nullptr; } @@ -104,6 +191,15 @@ dsacache::CacheData::CacheData(const dsacache::CacheData& other) { dsacache::CacheData::~CacheData() { std::cout << "[-] Destructor for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; + // if this is the first instance of this cache structure + // and it has not been waited on and is now being destroyed + // we must wait on completion here to ensure 
the cache + // remains in a valid state + + if (handlers_ != nullptr) { + WaitOnCompletion(); + } + // due to fetch_sub returning the preivously held value // we must subtract one locally to get the current value @@ -117,6 +213,7 @@ dsacache::CacheData::~CacheData() { std::cout << "[!] Full Destructor for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; Deallocate(); + delete active_; delete cache_; } @@ -125,9 +222,12 @@ dsacache::CacheData::~CacheData() { void dsacache::CacheData::Deallocate() { std::cout << "[!] Deallocating for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; - numa_free(cache_, size_); - cache_ = nullptr; - incomplete_cache_ = nullptr; + // although deallocate should only be called from + // a safe context to do so, it can not hurt to + // defensively perform the operation atomically + + uint8_t* cache_local = cache_->exchange(nullptr); + if (cache_local != nullptr) numa_free(cache_local, size_); } uint8_t* dsacache::CacheData::GetDataLocation() const { @@ -135,5 +235,10 @@ uint8_t* dsacache::CacheData::GetDataLocation() const { } bool dsacache::CacheData::Active() const { - return active_->load() > 0; + // this entry is active if more than one + // reference exists to it, as the Cache + // will always keep one internally until + // the entry is cleared from cache + + return active_->load() > 1; } \ No newline at end of file diff --git a/offloading-cacher/cache.hpp b/offloading-cacher/cache.hpp index 0081a04..f3ef90d 100644 --- a/offloading-cacher/cache.hpp +++ b/offloading-cacher/cache.hpp @@ -46,22 +46,42 @@ namespace dsacache { CachePolicy* cache_policy_function_ = nullptr; CopyPolicy* copy_policy_function_ = nullptr; + // function used to submit a copy task on a specific node to the dml + // engine on that node - will change the current threads node assignment + // to achieve this so take care to restore this dml::handler> ExecuteCopy( const uint8_t* src, uint8_t* dst, const size_t size, const int node ) const; + // allocates the required memory on the destination node + // and then submits task to the dml library for processing + // and attaches the handlers to the cache data structure void SubmitTask(CacheData* task, const int dst_node, const int src_node); + // querries the policy functions for the given data and size + // to obtain destination cache node, also returns the datas + // source node for further usage + // output may depend on the calling threads node assignment + // as this is set as the "optimal placement" node void GetCacheNode(uint8_t* src, const size_t size, int* OUT_DST_NODE, int* OUT_SRC_NODE) const; + // checks whether the cache contains an entry for + // the given data in the given memory node and + // returns it, otherwise returns nullptr std::unique_ptr GetFromCache(uint8_t* src, const size_t size, const int dst_node); public: + // initializes the cache with the two policy functions + // only after this is it safe to use in a threaded environment void Init(CachePolicy* cache_policy_function, CopyPolicy* copy_policy_function); // function to perform data access through the cache std::unique_ptr Access(uint8_t* data, const size_t size); + // flushes the cache of inactive entries + // if node is -1 then the whole cache is + // checked and otherwise the specified + // node - no checks on node validity void Flush(const int node = -1); }; } @@ -71,11 +91,19 @@ inline void dsacache::Cache::Init(CachePolicy* cache_policy_function, CopyPolicy copy_policy_function_ = copy_policy_function; // initialize numa 
library + numa_available(); + // obtain all available nodes + // and those we may allocate + // memory on + const int nodes_max = numa_num_configured_nodes(); const bitmask* valid_nodes = numa_get_mems_allowed(); + // prepare the cache state with entries + // for all given nodes + for (int node = 0; node < nodes_max; node++) { if (numa_bitmask_isbitset(valid_nodes, node)) { cache_state_.insert({node,{}}); @@ -93,6 +121,10 @@ inline std::unique_ptr dsacache::Cache::Access(uint8_t* dat GetCacheNode(data, size, &dst_node, &src_node); + // TODO: at this point it could be beneficial to check whether + // TODO: the given destination node is present as an entry + // TODO: in the cache state to see if it is valid + // check whether the data is already cached std::unique_ptr task = GetFromCache(data, size, dst_node); @@ -149,7 +181,7 @@ inline void dsacache::Cache::SubmitTask(CacheData* task, const int dst_node, con dst = reinterpret_cast(numa_alloc_onnode(task->size_, dst_node)); if (dst == nullptr) { - std::cout << "[x] Second allocation try failed for " << task->size_ << "B on node " << dst_node << std::endl; + std::cerr << "[x] Second allocation try failed for " << task->size_ << "B on node " << dst_node << std::endl; return; } } @@ -188,6 +220,7 @@ inline void dsacache::Cache::SubmitTask(CacheData* task, const int dst_node, con // restore the previous nodemask numa_run_on_node_mask(nodemask); + numa_free_nodemask(nodemask); } inline dml::handler> dsacache::Cache::ExecuteCopy( diff --git a/offloading-cacher/util/dml-helper.hpp b/offloading-cacher/util/dml-helper.hpp index 1686fd1..de92bb7 100644 --- a/offloading-cacher/util/dml-helper.hpp +++ b/offloading-cacher/util/dml-helper.hpp @@ -2,25 +2,45 @@ #include -inline const std::string StatusCodeToString(const dml::status_code code) { - switch(code) { - case dml::status_code::ok: return "ok"; - case dml::status_code::false_predicate: return "false predicate"; - case dml::status_code::partial_completion: return "partial completion"; - case dml::status_code::nullptr_error: return "nullptr error"; - case dml::status_code::bad_size: return "bad size"; - case dml::status_code::bad_length: return "bad length"; - case dml::status_code::inconsistent_size: return "inconsistent size"; - case dml::status_code::dualcast_bad_padding: return "dualcast bad padding"; - case dml::status_code::bad_alignment: return "bad alignment"; - case dml::status_code::buffers_overlapping: return "buffers overlapping"; - case dml::status_code::delta_delta_empty: return "delta delta empty"; - case dml::status_code::batch_overflow: return "batch overflow"; - case dml::status_code::execution_failed: return "execution failed"; - case dml::status_code::unsupported_operation: return "unsupported operation"; - case dml::status_code::queue_busy: return "queue busy"; - case dml::status_code::error: return "unknown error"; - case dml::status_code::config_error: return "config error"; - default: return "unhandled error"; +namespace dml { + inline const std::string StatusCodeToString(const dml::status_code code) { + switch (code) { + case dml::status_code::ok: + return "ok"; + case dml::status_code::false_predicate: + return "false predicate"; + case dml::status_code::partial_completion: + return "partial completion"; + case dml::status_code::nullptr_error: + return "nullptr error"; + case dml::status_code::bad_size: + return "bad size"; + case dml::status_code::bad_length: + return "bad length"; + case dml::status_code::inconsistent_size: + return "inconsistent size"; + case 
dml::status_code::dualcast_bad_padding: + return "dualcast bad padding"; + case dml::status_code::bad_alignment: + return "bad alignment"; + case dml::status_code::buffers_overlapping: + return "buffers overlapping"; + case dml::status_code::delta_delta_empty: + return "delta delta empty"; + case dml::status_code::batch_overflow: + return "batch overflow"; + case dml::status_code::execution_failed: + return "execution failed"; + case dml::status_code::unsupported_operation: + return "unsupported operation"; + case dml::status_code::queue_busy: + return "queue busy"; + case dml::status_code::error: + return "unknown error"; + case dml::status_code::config_error: + return "config error"; + default: + return "unhandled error"; + } } } \ No newline at end of file From 52566fc13b9b1b283b4b3b4187016f8b287904a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Constantin=20F=C3=BCrst?= Date: Wed, 10 Jan 2024 12:31:57 +0100 Subject: [PATCH 09/29] print to cerr for bad states in the test-main --- offloading-cacher/main.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/offloading-cacher/main.cpp b/offloading-cacher/main.cpp index e67eb22..4310d3d 100644 --- a/offloading-cacher/main.cpp +++ b/offloading-cacher/main.cpp @@ -65,7 +65,7 @@ void PerformAccessAndTest(double* src, const size_t size) { std::cout << "Caching did not affect data location." << std::endl; } else if (cached == nullptr) { - std::cout << "Got nullptr from cache." << std::endl; + std::cerr << "Got nullptr from cache." << std::endl; } else { std::cout << "Got different data location from cache." << std::endl; @@ -75,7 +75,7 @@ void PerformAccessAndTest(double* src, const size_t size) { std::cout << "Cached data is correct." << std::endl; } else { - std::cout << "Cached data is wrong." << std::endl; + std::cerr << "Cached data is wrong." << std::endl; } } From 53e05d096c52042274ab92f05c0e4b367b1f6d31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Constantin=20F=C3=BCrst?= Date: Wed, 10 Jan 2024 12:41:32 +0100 Subject: [PATCH 10/29] add even more comments and remove an old code line that modified the reference counter of cache data from the outside --- offloading-cacher/cache-data.hpp | 3 ++- offloading-cacher/cache.hpp | 41 ++++++++++++++++++++++++++++++-- 2 files changed, 41 insertions(+), 3 deletions(-) diff --git a/offloading-cacher/cache-data.hpp b/offloading-cacher/cache-data.hpp index 4de6138..fe02c90 100644 --- a/offloading-cacher/cache-data.hpp +++ b/offloading-cacher/cache-data.hpp @@ -66,7 +66,8 @@ namespace dsacache { // returns the cache data location for this // instance which is valid as long as the - // instance is alive + // instance is alive - !!! this may also + // yield a nullptr !!! uint8_t* GetDataLocation() const; }; } diff --git a/offloading-cacher/cache.hpp b/offloading-cacher/cache.hpp index f3ef90d..8fd8362 100644 --- a/offloading-cacher/cache.hpp +++ b/offloading-cacher/cache.hpp @@ -255,23 +255,45 @@ void dsacache::Cache::GetCacheNode(uint8_t* src, const size_t size, int* OUT_DST inline void dsacache::Cache::Flush(const int node) { std::cout << "[-] Flushing Cache for " << (node == -1 ? 
"all nodes" : "node " + std::to_string(node)) << std::endl; + // this lambda is used because below we have two code paths that + // flush nodes, either one single or all successively + const auto FlushNode = [](std::unordered_map& map) { + // begin at the front of the map + auto it = map.begin(); + // loop until we reach the end of the map + while (it != map.end()) { + // if the iterator points to an inactive element + // then we may erase it + if (it->second.Active() == false) { + // erase the iterator from the map + map.erase(it); + + // as the erasure invalidated out iterator + // we must start at the beginning again + it = map.begin(); } else { + // if element is active just move over to the next one + it++; } } }; { + // we require exclusive lock as we modify the cache state + std::unique_lock lock(cache_mutex_); + // node == -1 means that cache on all nodes should be flushed + if (node == -1) { for (auto& nc : cache_state_) { FlushNode(nc.second); @@ -290,21 +312,36 @@ std::unique_ptr dsacache::Cache::GetFromCache(uint8_t* src, // from marking the element we may find as unused and // clearing it + // lock the cache state in shared-mode because we read + std::shared_lock lock(cache_mutex_); + // search for the data in our cache state structure at the given node + const auto search = cache_state_[dst_node].find(src); + // if the data is in our structure we continue + if (search != cache_state_[dst_node].end()) { - if (search->second.size_ == size) { - search->second.active_->store(true); + // now check whether the sizes match + // TODO: second.size_ >= size would also work + + if (search->second.size_ == size) { std::cout << "[+] Found Cached version for 0x" << std::hex << (uint64_t)src << std::dec << std::endl; + // return a unique copy of the entry which uses the object + // lifetime and destructor to safely handle deallocation + return std::move(std::make_unique(search->second)); } else { std::cout << "[!] 
Found Cached version with size mismatch for 0x" << std::hex << (uint64_t)src << std::dec << std::endl; + // if the sizes mismatch then we clear the current entry from cache + // which will cause its deletion only after the last possible outside + // reference is also destroyed + cache_state_[dst_node].erase(search); } } From 9c06bd4fa90aafec7c438580d6b07929b25792f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Constantin=20F=C3=BCrst?= Date: Wed, 10 Jan 2024 12:53:10 +0100 Subject: [PATCH 11/29] add class-definition comments and clear some double-newlines --- offloading-cacher/cache-data.hpp | 15 ++++++++------- offloading-cacher/cache.hpp | 10 ++++++---- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/offloading-cacher/cache-data.hpp b/offloading-cacher/cache-data.hpp index fe02c90..95865ca 100644 --- a/offloading-cacher/cache-data.hpp +++ b/offloading-cacher/cache-data.hpp @@ -13,13 +13,14 @@ namespace dsacache { class Cache; - // the cache task structure will be used to submit and - // control a cache element, while providing source pointer - // and size in bytes for submission - // - // then the submitting thread may wait on the atomic "result" - // which will be notified by the cache worker upon processing - // after which the atomic-bool-ptr active will also become valid + // cache data holds all required information on + // one cache entry and will both be stored + // internally by the cache and handed out + // as copies to the user + // this class uses its object lifetime and + // a global reference counter to allow + // thread-safe copies and resource management + class CacheData { public: using dml_handler = dml::handler>; diff --git a/offloading-cacher/cache.hpp b/offloading-cacher/cache.hpp index 8fd8362..952dd47 100644 --- a/offloading-cacher/cache.hpp +++ b/offloading-cacher/cache.hpp @@ -16,8 +16,12 @@ #include "cache-data.hpp" namespace dsacache { - // singleton which holds the cache workers - // and is the place where work will be submited + // cache class will handle access to data through the cache + // by managing the cache through work submission, it sticks + // to user-defined caching and copy policies, is thread + // safe after initialization and returns copies of + // cache data class to the user + class Cache { public: // cache policy is defined as a type here to allow flexible usage of the cacher @@ -234,7 +238,6 @@ inline dml::handler> dsacache:: return dml::submit(dml::mem_copy.block_on_fault(), srcv, dstv); } - void dsacache::Cache::GetCacheNode(uint8_t* src, const size_t size, int* OUT_DST_NODE, int* OUT_SRC_NODE) const { // obtain numa node of current thread to determine where the data is needed @@ -251,7 +254,6 @@ void dsacache::Cache::GetCacheNode(uint8_t* src, const size_t size, int* OUT_DST *OUT_DST_NODE = cache_policy_function_(current_node, *OUT_SRC_NODE, size); } - inline void dsacache::Cache::Flush(const int node) { std::cout << "[-] Flushing Cache for " << (node == -1 ?
"all nodes" : "node " + std::to_string(node)) << std::endl; From e5b96727cd65e30358a5fa58c38ccd32cd576d7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Constantin=20F=C3=BCrst?= Date: Wed, 10 Jan 2024 14:24:10 +0100 Subject: [PATCH 12/29] add missing inline specifier to functions as this is header-only code --- offloading-cacher/cache-data.hpp | 12 ++++++------ offloading-cacher/cache.hpp | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/offloading-cacher/cache-data.hpp b/offloading-cacher/cache-data.hpp index 95865ca..1958519 100644 --- a/offloading-cacher/cache-data.hpp +++ b/offloading-cacher/cache-data.hpp @@ -154,7 +154,7 @@ inline void dsacache::CacheData::WaitOnCompletion() { } } -dsacache::CacheData::CacheData(uint8_t* data, const size_t size) { +inline dsacache::CacheData::CacheData(uint8_t* data, const size_t size) { std::cout << "[-] New CacheData 0x" << std::hex << (uint64_t)data << std::dec << std::endl; src_ = data; @@ -165,7 +165,7 @@ dsacache::CacheData::CacheData(uint8_t* data, const size_t size) { handlers_ = std::make_unique>(); } -dsacache::CacheData::CacheData(const dsacache::CacheData& other) { +inline dsacache::CacheData::CacheData(const dsacache::CacheData& other) { std::cout << "[-] Copy Created for CacheData 0x" << std::hex << (uint64_t)other.src_ << std::dec << std::endl; // we copy the ptr to the global atomic reference counter @@ -190,7 +190,7 @@ dsacache::CacheData::CacheData(const dsacache::CacheData& other) { handlers_ = nullptr; } -dsacache::CacheData::~CacheData() { +inline dsacache::CacheData::~CacheData() { std::cout << "[-] Destructor for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; // if this is the first instance of this cache structure @@ -221,7 +221,7 @@ dsacache::CacheData::~CacheData() { } } -void dsacache::CacheData::Deallocate() { +inline void dsacache::CacheData::Deallocate() { std::cout << "[!] 
Deallocating for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; // although deallocate should only be called from @@ -232,11 +232,11 @@ void dsacache::CacheData::Deallocate() { if (cache_local != nullptr) numa_free(cache_local, size_); } -uint8_t* dsacache::CacheData::GetDataLocation() const { +inline uint8_t* dsacache::CacheData::GetDataLocation() const { return cache_->load(); } -bool dsacache::CacheData::Active() const { +inline bool dsacache::CacheData::Active() const { // this entry is active if more than one // reference exists to it, as the Cache // will always keep one internally until diff --git a/offloading-cacher/cache.hpp b/offloading-cacher/cache.hpp index 952dd47..22b23f8 100644 --- a/offloading-cacher/cache.hpp +++ b/offloading-cacher/cache.hpp @@ -238,7 +238,7 @@ inline dml::handler> dsacache:: return dml::submit(dml::mem_copy.block_on_fault(), srcv, dstv); } -void dsacache::Cache::GetCacheNode(uint8_t* src, const size_t size, int* OUT_DST_NODE, int* OUT_SRC_NODE) const { +inline void dsacache::Cache::GetCacheNode(uint8_t* src, const size_t size, int* OUT_DST_NODE, int* OUT_SRC_NODE) const { // obtain numa node of current thread to determine where the data is needed const int current_cpu = sched_getcpu(); @@ -307,7 +307,7 @@ inline void dsacache::Cache::Flush(const int node) { } } -std::unique_ptr dsacache::Cache::GetFromCache(uint8_t* src, const size_t size, const int dst_node) { +inline std::unique_ptr dsacache::Cache::GetFromCache(uint8_t* src, const size_t size, const int dst_node) { // the best situation is if this data is already cached // which we check in an unnamed block in which the cache // is locked for reading to prevent another thread From e3e17cec7b0d6b6cecdbc740faf72f2635eac5e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Constantin=20F=C3=BCrst?= Date: Wed, 10 Jan 2024 17:44:52 +0100 Subject: [PATCH 13/29] extend the main method of the small supplied test program to allow running on multiple threads --- offloading-cacher/CMakeLists.txt | 5 +- offloading-cacher/main.cpp | 138 ++++++++++++++++++++++++------- 2 files changed, 110 insertions(+), 33 deletions(-) diff --git a/offloading-cacher/CMakeLists.txt b/offloading-cacher/CMakeLists.txt index 7b4844a..19ddbdd 100755 --- a/offloading-cacher/CMakeLists.txt +++ b/offloading-cacher/CMakeLists.txt @@ -1,12 +1,13 @@ cmake_minimum_required(VERSION 3.18) -project(offloading-cacher) +project(offloading-cacher LANGUAGES CXX) set(CMAKE_CXX_STANDARD 20) list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake/modules") find_package(NUMA REQUIRED) +find_package(OpenMP REQUIRED) set(DML_SOURCE_DIR "../../DML/include/") set(SOURCES main.cpp) @@ -14,6 +15,6 @@ set(SOURCES main.cpp) add_executable(offloading-cacher ${SOURCES}) target_include_directories(offloading-cacher PRIVATE ${CMAKE_SOURCE_DIR} ${NUMA_INCLUDE_DIRS} ${DML_SOURCE_DIR}) -target_link_libraries(offloading-cacher PRIVATE libdml.a pthread ${CMAKE_DL_LIBS} ${NUMA_LIBRARY}) +target_link_libraries(offloading-cacher PRIVATE libdml.a pthread ${CMAKE_DL_LIBS} ${NUMA_LIBRARY} OpenMP::OpenMP_CXX) install(TARGETS offloading-cacher DESTINATION ${CMAKE_INSTALL_PREFIX}) diff --git a/offloading-cacher/main.cpp b/offloading-cacher/main.cpp index 4310d3d..08640dc 100644 --- a/offloading-cacher/main.cpp +++ b/offloading-cacher/main.cpp @@ -1,10 +1,49 @@ #include #include +#include +#include + +#include #include "cache.hpp" dsacache::Cache CACHE; +void InitCache(const std::string& device) { + if (device == "default") { + auto cache_policy = [](const int 
numa_dst_node, const int numa_src_node, const size_t data_size) { + return numa_dst_node; + }; + + auto copy_policy = [](const int numa_dst_node, const int numa_src_node) { + return std::vector{ numa_src_node, numa_dst_node }; + }; + + CACHE.Init(cache_policy,copy_policy); + } + else if (device == "xeonmax") { + auto cache_policy = [](const int numa_dst_node, const int numa_src_node, const size_t data_size) { + return numa_dst_node < 8 ? numa_dst_node + 8 : numa_dst_node; + }; + + auto copy_policy = [](const int numa_dst_node, const int numa_src_node) { + const bool same_socket = ((numa_dst_node ^ numa_src_node) & 4) == 0; + if (same_socket) { + const bool socket_number = numa_dst_node >> 2; + if (socket_number == 0) return std::vector{ 0, 1, 2, 3 }; + else return std::vector{ 4, 5, 6, 7 }; + } + else return std::vector{ numa_src_node, numa_dst_node }; + }; + + CACHE.Init(cache_policy,copy_policy); + } + else { + std::cerr << "Given device '" << device << "' not supported!" << std::endl; + exit(-1); + } +} + double* GetRandomArray(const size_t size) { double* array = new double[size]; @@ -31,7 +70,7 @@ bool IsEqual(const double* a, const double* b, const size_t size) { return true; } -void PerformAccessAndTest(double* src, const size_t size) { +void PerformAccessAndTest(double* src, const size_t size, const int tid) { std::unique_ptr data_cache = CACHE.Access( reinterpret_cast(src), size * sizeof(double) @@ -42,13 +81,13 @@ void PerformAccessAndTest(double* src, const size_t size) { // check the value immediately just to see if ram or cache was returned if (src == cached_imm) { - std::cout << "Caching did not immediately yield different data location." << std::endl; + std::cout << "[" << tid << "] Caching did not immediately yield different data location." << std::endl; } else if (cached_imm == nullptr) { - std::cout << "Immediately got nullptr." << std::endl; + std::cout << "[" << tid << "] Immediately got nullptr." << std::endl; } else { - std::cout << "Immediately got different data location." << std::endl; + std::cout << "[" << tid << "] Immediately got different data location." << std::endl; } // waits for the completion of the asynchronous caching operation @@ -62,56 +101,93 @@ void PerformAccessAndTest(double* src, const size_t size) { // tests on the resulting value if (src == cached) { - std::cout << "Caching did not affect data location." << std::endl; + std::cout << "[" << tid << "] Caching did not affect data location." << std::endl; } else if (cached == nullptr) { - std::cerr << "Got nullptr from cache." << std::endl; + std::cerr << "[" << tid << "] Got nullptr from cache." << std::endl; } else { - std::cout << "Got different data location from cache." << std::endl; + std::cout << "[" << tid << "] Got different data location from cache." << std::endl; } if (IsEqual(src,cached,size)) { - std::cout << "Cached data is correct." << std::endl; + std::cout << "[" << tid << "] Cached data is correct." << std::endl; } else { - std::cerr << "Cached data is wrong." << std::endl; + std::cerr << "[" << tid << "] Cached data is wrong." 
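// Worked example (editor's sketch) of the "xeonmax" policies defined above.
// The node numbering is an assumption inferred from the lambdas: DDR nodes
// 0-7 are paired with HBM nodes 8-15 and bit 2 of the node id selects the
// socket; the function names are made up for the example.

#include <cassert>
#include <cstddef>
#include <vector>

int XeonMaxCachePolicy(const int numa_dst_node, const int numa_src_node, const size_t data_size) {
    return numa_dst_node < 8 ? numa_dst_node + 8 : numa_dst_node;
}

std::vector<int> XeonMaxCopyPolicy(const int numa_dst_node, const int numa_src_node) {
    const bool same_socket = ((numa_dst_node ^ numa_src_node) & 4) == 0;
    if (same_socket) {
        return (numa_dst_node >> 2) == 0 ? std::vector<int>{ 0, 1, 2, 3 } : std::vector<int>{ 4, 5, 6, 7 };
    }
    return std::vector<int>{ numa_src_node, numa_dst_node };
}

int main() {
    // DDR node 2 is cached on its HBM partner node 10
    assert(XeonMaxCachePolicy(2, 5, 1024) == 10);

    // same socket: the copy is spread over all four DDR nodes of that socket
    assert(XeonMaxCopyPolicy(2, 1) == (std::vector<int>{ 0, 1, 2, 3 }));

    // cross socket: only source and destination node participate
    assert(XeonMaxCopyPolicy(2, 5) == (std::vector<int>{ 5, 2 }));
}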
<< std::endl; } } -int main(int argc, char **argv) { +void RunTestST(const size_t size) { + double* data = GetRandomArray(size); - // given numa destination and source node and the size of the data - // this function decides on which the data will be placed - // which is used to select the HBM-node for the dst-node if desired + static constexpr int tid = 0; - auto cache_policy = [](const int numa_dst_node, const int numa_src_node, const size_t data_size) { - return numa_dst_node; - }; + std::cout << "[" << tid << "] first access --- " << std::endl; - // this function receives the memory source and destination node - // and then decides, on which nodes the copy operation will be split + PerformAccessAndTest(data, size, tid); - auto copy_policy = [](const int numa_dst_node, const int numa_src_node) { - return std::vector{ numa_src_node, numa_dst_node }; - }; + std::cout << "[" << tid << "] second access --- " << std::endl; - // initializes the cache with the two policies + PerformAccessAndTest(data, size, tid); - CACHE.Init(cache_policy,copy_policy); + std::cout << "[" << tid << "] end of application --- " << std::endl; +} - // generate the test data +void RunTestMT(const size_t size) { + double* data = GetRandomArray(size); - static constexpr size_t data_size = 1024 * 1024; - double* data = GetRandomArray(data_size); + #pragma omp parallel + { + const int tid = omp_get_thread_num(); - std::cout << "--- first access --- " << std::endl; + std::cout << "[" << tid << "] first access --- " << std::endl; - PerformAccessAndTest(data, data_size); + PerformAccessAndTest(data, size, tid); - std::cout << "--- second access --- " << std::endl; + std::cout << "[" << tid << "] second access --- " << std::endl; - PerformAccessAndTest(data, data_size); + PerformAccessAndTest(data, size, tid); - std::cout << "--- end of application --- " << std::endl; + std::cout << "[" << tid << "] end of block --- " << std::endl; + } +} + +int main(int argc, char **argv) { + if (argc != 4) { + std::cerr << "This application requires four parameters!" << std::endl; + + std::cout << "Please provide the following positional arguments: [device] [mode] [size]" << std::endl; + std::cout << "[device] from { default, xeonmax } which influences cache and execution placement" << std::endl; + std::cout << "[mode] from { st, mt } or single and multi threaded respectively" << std::endl; + std::cout << "[size] positive integral number, amount of float64 in data array" << std::endl; + + exit(-1); + } + + const std::string device = argv[1]; + const std::string mode = argv[2]; + const std::string size_s = argv[3]; + + uint32_t size = 0; + + try { + size = std::stoul(size_s); + } + catch (...) { + std::cerr << "Given Size '" << size_s << "' caused error during conversion to number!" << std::endl; + } + + InitCache(device); + + if (mode == "st") { + RunTestST(size); + } + else if (mode == "mt") { + RunTestMT(size); + } + else { + std::cerr << "Given Mode '" << mode << "' not supported!" 
<< std::endl; + exit(-1); + } } From 4ddd96adcb76ea711e536767cbbd4129ed2a25b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Constantin=20F=C3=BCrst?= Date: Wed, 10 Jan 2024 17:57:25 +0100 Subject: [PATCH 14/29] remove extra whitespace from output in main function --- offloading-cacher/main.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/offloading-cacher/main.cpp b/offloading-cacher/main.cpp index 08640dc..2302493 100644 --- a/offloading-cacher/main.cpp +++ b/offloading-cacher/main.cpp @@ -123,15 +123,15 @@ void RunTestST(const size_t size) { static constexpr int tid = 0; - std::cout << "[" << tid << "] first access --- " << std::endl; + std::cout << "[" << tid << "] first access --- " << std::endl; PerformAccessAndTest(data, size, tid); - std::cout << "[" << tid << "] second access --- " << std::endl; + std::cout << "[" << tid << "] second access --- " << std::endl; PerformAccessAndTest(data, size, tid); - std::cout << "[" << tid << "] end of application --- " << std::endl; + std::cout << "[" << tid << "] end of application --- " << std::endl; } void RunTestMT(const size_t size) { From 7dfbed68feba4de8fe44aa205fc7437c92d5fd92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Constantin=20F=C3=BCrst?= Date: Wed, 10 Jan 2024 19:04:44 +0100 Subject: [PATCH 15/29] handle allocation slightly different, introduce a separate function for cleaner code that does on-node memory allocation, first querry the available size and do not rely on numa_alloc_onnode to report nullptr if the size is not really available --- offloading-cacher/cache.hpp | 56 +++++++++++++++++++++++++++++-------- 1 file changed, 44 insertions(+), 12 deletions(-) diff --git a/offloading-cacher/cache.hpp b/offloading-cacher/cache.hpp index 22b23f8..cce0439 100644 --- a/offloading-cacher/cache.hpp +++ b/offloading-cacher/cache.hpp @@ -69,6 +69,12 @@ namespace dsacache { // as this is set as the "optimal placement" node void GetCacheNode(uint8_t* src, const size_t size, int* OUT_DST_NODE, int* OUT_SRC_NODE) const; + // allocates memory of size "size" on the numa node "node" + // and returns nullptr if this is not possible, also may + // try to flush the cache of the requested node to + // alleviate encountered shortage + uint8_t* AllocOnNode(const size_t size, const int node); + // checks whether the cache contains an entry for // the given data in the given memory node and // returns it, otherwise returns nullptr @@ -164,32 +170,58 @@ inline std::unique_ptr dsacache::Cache::Access(uint8_t* dat return std::move(task); } -inline void dsacache::Cache::SubmitTask(CacheData* task, const int dst_node, const int src_node) { - std::cout << "[+] Allocating " << task->size_ << "B on node " << dst_node << " for " << std::hex << (uint64_t)task->src_ << std::dec << std::endl; - +inline uint8_t* dsacache::Cache::AllocOnNode(const size_t size, const int node) { // allocate data on this node and flush the unused parts of the // cache if the operation fails and retry once // TODO: smarter flush strategy could keep some stuff cached - uint8_t* dst = reinterpret_cast(numa_alloc_onnode(task->size_, dst_node)); + // check currently free memory to see if the data fits - if (dst == nullptr) { - std::cout << "[!] First allocation try failed for " << task->size_ << "B on node " << dst_node << std::endl; + long long int free_space = 0; + numa_node_size64(node, &free_space); + + if (free_space < size) { + std::cout << "[!] 
Memory shortage when allocating " << size << "B on node " << node << std::endl; - // allocation on dst_node failed so we flush the cache for this + // dst node lacks memory space so we flush the cache for this // node hoping to free enough currently unused entries to make // the second allocation attempt successful - Flush(dst_node); + Flush(node); - dst = reinterpret_cast(numa_alloc_onnode(task->size_, dst_node)); + // re-test by getting the free space and checking again - if (dst == nullptr) { - std::cerr << "[x] Second allocation try failed for " << task->size_ << "B on node " << dst_node << std::endl; - return; + numa_node_size64(node, &free_space); + + if (free_space < size) { + std::cout << "[x] Memory shortage after flush when allocating " << size << "B on node " << node << std::endl; + + return nullptr; } } + uint8_t* dst = reinterpret_cast(numa_alloc_onnode(size, node)); + + if (dst == nullptr) { + std::cout << "[x] Allocation try failed for " << size << "B on node " << node << std::endl; + + return nullptr; + } + + return dst; +} + +inline void dsacache::Cache::SubmitTask(CacheData* task, const int dst_node, const int src_node) { + std::cout << "[+] Allocating " << task->size_ << "B on node " << dst_node << " for " << std::hex << (uint64_t)task->src_ << std::dec << std::endl; + + + uint8_t* dst = AllocOnNode(task->size_, dst_node); + + if (dst == nullptr) { + std::cout << "[x] Allocation failed so we can not cache" << std::endl; + return; + } + task->incomplete_cache_ = dst; // querry copy policy function for the nodes to use for the copy From 6ab88595b7cd0e0f0d910cbb54846d61f7e03688 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Constantin=20F=C3=BCrst?= Date: Wed, 10 Jan 2024 19:05:16 +0100 Subject: [PATCH 16/29] add test for the cache-flush logic which was previously not tested --- offloading-cacher/main.cpp | 61 ++++++++++++++++++++++++++++++-------- 1 file changed, 48 insertions(+), 13 deletions(-) diff --git a/offloading-cacher/main.cpp b/offloading-cacher/main.cpp index 2302493..443b00b 100644 --- a/offloading-cacher/main.cpp +++ b/offloading-cacher/main.cpp @@ -44,10 +44,10 @@ void InitCache(const std::string& device) { } } -double* GetRandomArray(const size_t size) { - double* array = new double[size]; +uint8_t* GetRandomArray(const size_t size) { + uint8_t* array = new uint8_t[size]; - std::uniform_real_distribution unif(std::numeric_limits::min(), std::numeric_limits::max()); + std::uniform_int_distribution unif(std::numeric_limits::min(), std::numeric_limits::max()); std::default_random_engine re; for (size_t i = 0; i < size; i++) { @@ -57,7 +57,7 @@ double* GetRandomArray(const size_t size) { return array; } -bool IsEqual(const double* a, const double* b, const size_t size) { +bool IsEqual(const uint8_t* a, const uint8_t* b, const size_t size) { for (size_t i = 0; i < size; i++) { try { if (a[i] != b[i]) return false; @@ -70,13 +70,13 @@ bool IsEqual(const double* a, const double* b, const size_t size) { return true; } -void PerformAccessAndTest(double* src, const size_t size, const int tid) { +std::unique_ptr PerformAccessAndTest(uint8_t* src, const size_t size, const int tid) { std::unique_ptr data_cache = CACHE.Access( reinterpret_cast(src), - size * sizeof(double) + size * sizeof(uint8_t) ); - double* cached_imm = reinterpret_cast(data_cache->GetDataLocation()); + uint8_t* cached_imm = reinterpret_cast(data_cache->GetDataLocation()); // check the value immediately just to see if ram or cache was returned @@ -96,7 +96,7 @@ void PerformAccessAndTest(double* src, const 
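// Clarifying sketch (editor's note, not part of the patch): the availability
// check above relies on the libnuma call numa_node_size64(), which returns
// the total memory of a node and writes the currently free byte count into
// its second argument.

#include <cstdio>
#include <numa.h>

void PrintNodeMemory(const int node) {
    long long free_bytes = 0;
    const long long total_bytes = numa_node_size64(node, &free_bytes);

    std::printf("node %d: %lld bytes total, %lld bytes free\n", node, total_bytes, free_bytes);
}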
size_t size, const int tid) { // gets the cache-data-location from the struct - double* cached = reinterpret_cast(data_cache->GetDataLocation()); + uint8_t* cached = reinterpret_cast(data_cache->GetDataLocation()); // tests on the resulting value @@ -116,10 +116,12 @@ void PerformAccessAndTest(double* src, const size_t size, const int tid) { else { std::cerr << "[" << tid << "] Cached data is wrong." << std::endl; } + + return std::move(data_cache); } void RunTestST(const size_t size) { - double* data = GetRandomArray(size); + uint8_t* data = GetRandomArray(size); static constexpr int tid = 0; @@ -135,7 +137,7 @@ void RunTestST(const size_t size) { } void RunTestMT(const size_t size) { - double* data = GetRandomArray(size); + uint8_t* data = GetRandomArray(size); #pragma omp parallel { @@ -153,14 +155,44 @@ void RunTestMT(const size_t size) { } } +void RunTestFlush(const size_t size) { + uint8_t* data1 = GetRandomArray(size); + uint8_t* data2 = GetRandomArray(size); + uint8_t* data3 = GetRandomArray(size); + + static constexpr int tid = 0; + + std::cout << "[" << tid << "] first access to data d1 and keepalive --- " << std::endl; + + const auto c1 = PerformAccessAndTest(data1, size, tid); + + std::cout << "[" << tid << "] second access to d2 lets d2 vanish --- " << std::endl; + + PerformAccessAndTest(data2, size, tid); + + std::cout << "[" << tid << "] third access to d3 should clear d2 --- " << std::endl; + + PerformAccessAndTest(data3, size, tid); + + std::cout << "[" << tid << "] end of block and test d1 == cache1 --- " << std::endl; + + if (IsEqual(data1, c1->GetDataLocation(), size)) { + std::cout << "[" << tid << "] Cached d1 is still correct." << std::endl; + } + else { + std::cerr << "[" << tid << "] Cached d1 is bad." << std::endl; + } +} + int main(int argc, char **argv) { if (argc != 4) { - std::cerr << "This application requires four parameters!" << std::endl; + std::cerr << "This application requires three parameters!" << std::endl; std::cout << "Please provide the following positional arguments: [device] [mode] [size]" << std::endl; std::cout << "[device] from { default, xeonmax } which influences cache and execution placement" << std::endl; - std::cout << "[mode] from { st, mt } or single and multi threaded respectively" << std::endl; - std::cout << "[size] positive integral number, amount of float64 in data array" << std::endl; + std::cout << "[mode] from { st, mt, flt } or single and multi threaded and flushtest respectively" << std::endl; + std::cout << "[size] positive integral number, amount of bytes in data array" << std::endl; + std::cout << "for flushtest the given size should be 1/3 of the available cache size" << std::endl; exit(-1); } @@ -186,6 +218,9 @@ int main(int argc, char **argv) { else if (mode == "mt") { RunTestMT(size); } + else if (mode == "flt") { + RunTestFlush(size); + } else { std::cerr << "Given Mode '" << mode << "' not supported!" 
<< std::endl; exit(-1); From 4fa5ef65227294b6e755bca45eae1eb6a82be7cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Constantin=20F=C3=BCrst?= Date: Wed, 10 Jan 2024 19:21:44 +0100 Subject: [PATCH 17/29] accept existing cache if the cached block is larger than the requested view --- offloading-cacher/cache.hpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/offloading-cacher/cache.hpp b/offloading-cacher/cache.hpp index cce0439..50e9c29 100644 --- a/offloading-cacher/cache.hpp +++ b/offloading-cacher/cache.hpp @@ -359,9 +359,8 @@ inline std::unique_ptr dsacache::Cache::GetFromCache(uint8_ if (search != cache_state_[dst_node].end()) { // now check whether the sizes match - // TODO: second.size_ >= size would also work - if (search->second.size_ == size) { + if (search->second.size_ >= size) { std::cout << "[+] Found Cached version for 0x" << std::hex << (uint64_t)src << std::dec << std::endl; // return a unique copy of the entry which uses the object From d7c5c55208b3f65c5ebe621cb69585bfaf751d81 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Constantin=20F=C3=BCrst?= Date: Mon, 15 Jan 2024 13:16:46 +0100 Subject: [PATCH 18/29] turn library into single-header mode --- offloading-cacher/cache-data.hpp | 246 ----------------------- offloading-cacher/cache.hpp | 276 +++++++++++++++++++++++++- offloading-cacher/util/dml-helper.hpp | 46 ----- 3 files changed, 275 insertions(+), 293 deletions(-) delete mode 100644 offloading-cacher/cache-data.hpp delete mode 100644 offloading-cacher/util/dml-helper.hpp diff --git a/offloading-cacher/cache-data.hpp b/offloading-cacher/cache-data.hpp deleted file mode 100644 index 1958519..0000000 --- a/offloading-cacher/cache-data.hpp +++ /dev/null @@ -1,246 +0,0 @@ -#pragma once - -#include - -#include -#include -#include - -#include - -#include "util/dml-helper.hpp" - -namespace dsacache { - class Cache; - - // cache data holds all required information on - // one cache entry and will both be stored - // internally by the cache and handed out - // as copies to the user - // this class uses its object lifetime and - // a global reference counter to allow - // thread-safe copies and resource management - - class CacheData { - public: - using dml_handler = dml::handler>; - - private: - // data source and size of the block - uint8_t* src_; - size_t size_; - - // global reference counting object - std::atomic* active_; - - // global cache-location pointer - std::atomic* cache_; - - // object-local incomplete cache location pointer - // which is only available in the first instance - uint8_t* incomplete_cache_; - - // dml handler vector pointer which is only - // available in the first instance - std::unique_ptr> handlers_; - - // deallocates the global cache-location - // and invalidates it - void Deallocate(); - - // checks whether there are at least two - // valid references to this object which - // is done as the cache always has one - // internally to any living instance - bool Active() const; - - friend Cache; - public: - CacheData(uint8_t* data, const size_t size); - CacheData(const CacheData& other); - ~CacheData(); - - // waits on completion of caching operations - // for this task and is safe to be called in - // any state of the object - void WaitOnCompletion(); - - // returns the cache data location for this - // instance which is valid as long as the - // instance is alive - !!! this may also - // yield a nullptr !!! 
- uint8_t* GetDataLocation() const; - }; -} - -inline void dsacache::CacheData::WaitOnCompletion() { - // the cache data entry can be in two states - // either it is the original one which has not - // been waited for in which case the handlers - // are non-null or it is not - - if (handlers_ == nullptr) { - std::cout << "[-] Waiting on cache-var-update for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; - - // when no handlers are attached to this cache entry we wait on a - // value change for the cache structure from nullptr to non-null - // which will either go through immediately if the cache is valid - // already or wait until the handler-owning thread notifies us - - cache_->wait(nullptr); - - std::cout << "[+] Finished waiting on cache-var-update for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; - } - else { - // when the handlers are non-null there are some DSA task handlers - // available on which we must wait here - - std::cout << "[-] Waiting on handlers for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; - - // abort is set if any operation encountered an error - - bool abort = false; - - for (auto& handler : *handlers_) { - auto result = handler.get(); - - if (result.status != dml::status_code::ok) { - std::cerr << "[x] Encountered bad status code for operation: " << dml::StatusCodeToString(result.status) << std::endl; - - // if one of the copy tasks failed we abort the whole task - // after all operations are completed on it - - abort = true; - } - } - - // the handlers are cleared after all have completed - - handlers_ = nullptr; - - // now we act depending on whether an abort has been - // called for which signals operation incomplete - - if (abort) { - // store nullptr in the cache location - - cache_->store(nullptr); - - // then free the now incomplete cache - - // TODO: it would be possible to salvage the - // TODO: operation at this point but this - // TODO: is quite complicated so we just abort - - numa_free(incomplete_cache_, size_); - } - else { - std::cout << "[+] Finished waiting on handlers for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; - - // incomplete cache is now safe to use and therefore we - // swap it with the global cache state of this entry - // and notify potentially waiting threads - - cache_->store(incomplete_cache_); - } - - // as a last step all waiting threads must - // be notified (copies of this will wait on value - // change of the cache) and the incomplete cache - // is cleared to nullptr as it is not incomplete - - cache_->notify_all(); - incomplete_cache_ = nullptr; - } -} - -inline dsacache::CacheData::CacheData(uint8_t* data, const size_t size) { - std::cout << "[-] New CacheData 0x" << std::hex << (uint64_t)data << std::dec << std::endl; - - src_ = data; - size_ = size; - active_ = new std::atomic(1); - cache_ = new std::atomic(); - incomplete_cache_ = nullptr; - handlers_ = std::make_unique>(); -} - -inline dsacache::CacheData::CacheData(const dsacache::CacheData& other) { - std::cout << "[-] Copy Created for CacheData 0x" << std::hex << (uint64_t)other.src_ << std::dec << std::endl; - - // we copy the ptr to the global atomic reference counter - // and increase the amount of active references - - active_ = other.active_; - const int current_active = active_->fetch_add(1); - - // source and size will be copied too - // as well as the reference to the global - // atomic cache pointer - - src_ = other.src_; - size_ = other.size_; - cache_ = 
other.cache_; - - // incomplete cache and handlers will not - // be copied because only the first instance - // will wait on the completion of handlers - - incomplete_cache_ = nullptr; - handlers_ = nullptr; -} - -inline dsacache::CacheData::~CacheData() { - std::cout << "[-] Destructor for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; - - // if this is the first instance of this cache structure - // and it has not been waited on and is now being destroyed - // we must wait on completion here to ensure the cache - // remains in a valid state - - if (handlers_ != nullptr) { - WaitOnCompletion(); - } - - // due to fetch_sub returning the preivously held value - // we must subtract one locally to get the current value - - const int32_t v = active_->fetch_sub(1) - 1; - - // if the returned value is zero or lower - // then we must execute proper deletion - // as this was the last reference - - if (v <= 0) { - std::cout << "[!] Full Destructor for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; - - Deallocate(); - - delete active_; - delete cache_; - } -} - -inline void dsacache::CacheData::Deallocate() { - std::cout << "[!] Deallocating for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; - - // although deallocate should only be called from - // a safe context to do so, it can not hurt to - // defensively perform the operation atomically - - uint8_t* cache_local = cache_->exchange(nullptr); - if (cache_local != nullptr) numa_free(cache_local, size_); -} - -inline uint8_t* dsacache::CacheData::GetDataLocation() const { - return cache_->load(); -} - -inline bool dsacache::CacheData::Active() const { - // this entry is active if more than one - // reference exists to it, as the Cache - // will always keep one internally until - // the entry is cleared from cache - - return active_->load() > 1; -} \ No newline at end of file diff --git a/offloading-cacher/cache.hpp b/offloading-cacher/cache.hpp index 50e9c29..3fe1e19 100644 --- a/offloading-cacher/cache.hpp +++ b/offloading-cacher/cache.hpp @@ -13,9 +13,111 @@ #include -#include "cache-data.hpp" +namespace dml { + inline const std::string StatusCodeToString(const dml::status_code code) { + switch (code) { + case dml::status_code::ok: + return "ok"; + case dml::status_code::false_predicate: + return "false predicate"; + case dml::status_code::partial_completion: + return "partial completion"; + case dml::status_code::nullptr_error: + return "nullptr error"; + case dml::status_code::bad_size: + return "bad size"; + case dml::status_code::bad_length: + return "bad length"; + case dml::status_code::inconsistent_size: + return "inconsistent size"; + case dml::status_code::dualcast_bad_padding: + return "dualcast bad padding"; + case dml::status_code::bad_alignment: + return "bad alignment"; + case dml::status_code::buffers_overlapping: + return "buffers overlapping"; + case dml::status_code::delta_delta_empty: + return "delta delta empty"; + case dml::status_code::batch_overflow: + return "batch overflow"; + case dml::status_code::execution_failed: + return "execution failed"; + case dml::status_code::unsupported_operation: + return "unsupported operation"; + case dml::status_code::queue_busy: + return "queue busy"; + case dml::status_code::error: + return "unknown error"; + case dml::status_code::config_error: + return "config error"; + default: + return "unhandled error"; + } + } +} namespace dsacache { + class Cache; + + // cache data holds all required information on + // one cache 
entry and will both be stored + // internally by the cache and handed out + // as copies to the user + // this class uses its object lifetime and + // a global reference counter to allow + // thread-safe copies and resource management + + class CacheData { + public: + using dml_handler = dml::handler>; + + private: + // data source and size of the block + uint8_t* src_; + size_t size_; + + // global reference counting object + std::atomic* active_; + + // global cache-location pointer + std::atomic* cache_; + + // object-local incomplete cache location pointer + // which is only available in the first instance + uint8_t* incomplete_cache_; + + // dml handler vector pointer which is only + // available in the first instance + std::unique_ptr> handlers_; + + // deallocates the global cache-location + // and invalidates it + void Deallocate(); + + // checks whether there are at least two + // valid references to this object which + // is done as the cache always has one + // internally to any living instance + bool Active() const; + + friend Cache; + public: + CacheData(uint8_t* data, const size_t size); + CacheData(const CacheData& other); + ~CacheData(); + + // waits on completion of caching operations + // for this task and is safe to be called in + // any state of the object + void WaitOnCompletion(); + + // returns the cache data location for this + // instance which is valid as long as the + // instance is alive - !!! this may also + // yield a nullptr !!! + uint8_t* GetDataLocation() const; + }; + // cache class will handle access to data through the cache // by managing the cache through work submission, it sticks // to user-defined caching and copy policies, is thread @@ -381,3 +483,175 @@ inline std::unique_ptr dsacache::Cache::GetFromCache(uint8_ return nullptr; } + +inline dsacache::CacheData::CacheData(uint8_t* data, const size_t size) { + std::cout << "[-] New CacheData 0x" << std::hex << (uint64_t)data << std::dec << std::endl; + + src_ = data; + size_ = size; + active_ = new std::atomic(1); + cache_ = new std::atomic(); + incomplete_cache_ = nullptr; + handlers_ = std::make_unique>(); +} + +inline dsacache::CacheData::CacheData(const dsacache::CacheData& other) { + std::cout << "[-] Copy Created for CacheData 0x" << std::hex << (uint64_t)other.src_ << std::dec << std::endl; + + // we copy the ptr to the global atomic reference counter + // and increase the amount of active references + + active_ = other.active_; + const int current_active = active_->fetch_add(1); + + // source and size will be copied too + // as well as the reference to the global + // atomic cache pointer + + src_ = other.src_; + size_ = other.size_; + cache_ = other.cache_; + + // incomplete cache and handlers will not + // be copied because only the first instance + // will wait on the completion of handlers + + incomplete_cache_ = nullptr; + handlers_ = nullptr; +} + +inline dsacache::CacheData::~CacheData() { + std::cout << "[-] Destructor for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; + + // if this is the first instance of this cache structure + // and it has not been waited on and is now being destroyed + // we must wait on completion here to ensure the cache + // remains in a valid state + + if (handlers_ != nullptr) { + WaitOnCompletion(); + } + + // due to fetch_sub returning the preivously held value + // we must subtract one locally to get the current value + + const int32_t v = active_->fetch_sub(1) - 1; + + // if the returned value is zero or lower + // then we must 
execute proper deletion + // as this was the last reference + + if (v <= 0) { + std::cout << "[!] Full Destructor for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; + + Deallocate(); + + delete active_; + delete cache_; + } +} + +inline void dsacache::CacheData::Deallocate() { + std::cout << "[!] Deallocating for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; + + // although deallocate should only be called from + // a safe context to do so, it can not hurt to + // defensively perform the operation atomically + + uint8_t* cache_local = cache_->exchange(nullptr); + if (cache_local != nullptr) numa_free(cache_local, size_); +} + +inline uint8_t* dsacache::CacheData::GetDataLocation() const { + return cache_->load(); +} + +inline bool dsacache::CacheData::Active() const { + // this entry is active if more than one + // reference exists to it, as the Cache + // will always keep one internally until + // the entry is cleared from cache + + return active_->load() > 1; +} + +inline void dsacache::CacheData::WaitOnCompletion() { + // the cache data entry can be in two states + // either it is the original one which has not + // been waited for in which case the handlers + // are non-null or it is not + + if (handlers_ == nullptr) { + std::cout << "[-] Waiting on cache-var-update for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; + + // when no handlers are attached to this cache entry we wait on a + // value change for the cache structure from nullptr to non-null + // which will either go through immediately if the cache is valid + // already or wait until the handler-owning thread notifies us + + cache_->wait(nullptr); + + std::cout << "[+] Finished waiting on cache-var-update for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; + } + else { + // when the handlers are non-null there are some DSA task handlers + // available on which we must wait here + + std::cout << "[-] Waiting on handlers for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; + + // abort is set if any operation encountered an error + + bool abort = false; + + for (auto& handler : *handlers_) { + auto result = handler.get(); + + if (result.status != dml::status_code::ok) { + std::cerr << "[x] Encountered bad status code for operation: " << dml::StatusCodeToString(result.status) << std::endl; + + // if one of the copy tasks failed we abort the whole task + // after all operations are completed on it + + abort = true; + } + } + + // the handlers are cleared after all have completed + + handlers_ = nullptr; + + // now we act depending on whether an abort has been + // called for which signals operation incomplete + + if (abort) { + // store nullptr in the cache location + + cache_->store(nullptr); + + // then free the now incomplete cache + + // TODO: it would be possible to salvage the + // TODO: operation at this point but this + // TODO: is quite complicated so we just abort + + numa_free(incomplete_cache_, size_); + } + else { + std::cout << "[+] Finished waiting on handlers for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; + + // incomplete cache is now safe to use and therefore we + // swap it with the global cache state of this entry + // and notify potentially waiting threads + + cache_->store(incomplete_cache_); + } + + // as a last step all waiting threads must + // be notified (copies of this will wait on value + // change of the cache) and the incomplete cache + // is cleared to nullptr 
as it is not incomplete + + cache_->notify_all(); + incomplete_cache_ = nullptr; + } +} diff --git a/offloading-cacher/util/dml-helper.hpp b/offloading-cacher/util/dml-helper.hpp deleted file mode 100644 index de92bb7..0000000 --- a/offloading-cacher/util/dml-helper.hpp +++ /dev/null @@ -1,46 +0,0 @@ -#pragma once - -#include - -namespace dml { - inline const std::string StatusCodeToString(const dml::status_code code) { - switch (code) { - case dml::status_code::ok: - return "ok"; - case dml::status_code::false_predicate: - return "false predicate"; - case dml::status_code::partial_completion: - return "partial completion"; - case dml::status_code::nullptr_error: - return "nullptr error"; - case dml::status_code::bad_size: - return "bad size"; - case dml::status_code::bad_length: - return "bad length"; - case dml::status_code::inconsistent_size: - return "inconsistent size"; - case dml::status_code::dualcast_bad_padding: - return "dualcast bad padding"; - case dml::status_code::bad_alignment: - return "bad alignment"; - case dml::status_code::buffers_overlapping: - return "buffers overlapping"; - case dml::status_code::delta_delta_empty: - return "delta delta empty"; - case dml::status_code::batch_overflow: - return "batch overflow"; - case dml::status_code::execution_failed: - return "execution failed"; - case dml::status_code::unsupported_operation: - return "unsupported operation"; - case dml::status_code::queue_busy: - return "queue busy"; - case dml::status_code::error: - return "unknown error"; - case dml::status_code::config_error: - return "config error"; - default: - return "unhandled error"; - } - } -} \ No newline at end of file From 8ba716353a7634cd7ddd117170f6f769a5f87d15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Constantin=20F=C3=BCrst?= Date: Mon, 15 Jan 2024 13:39:06 +0100 Subject: [PATCH 19/29] add Clear() function which forces flush on the entire Cache and remove non-warning/error status messages from the cacher --- offloading-cacher/cache.hpp | 46 ++++++++++++------------------------- 1 file changed, 15 insertions(+), 31 deletions(-) diff --git a/offloading-cacher/cache.hpp b/offloading-cacher/cache.hpp index 3fe1e19..6b0e712 100644 --- a/offloading-cacher/cache.hpp +++ b/offloading-cacher/cache.hpp @@ -195,9 +195,24 @@ namespace dsacache { // checked and otherwise the specified // node - no checks on node validity void Flush(const int node = -1); + + // forces out all entries from the + // cache and therefore will also "forget" + // still-in-use entries, these will still + // be properly deleted, but the cache + // will be fresh - use for testing + void Clear(); }; } +inline void dsacache::Cache::Clear() { + std::unique_lock lock(cache_mutex_); + + cache_state_.clear(); + + Init(cache_policy_function_, copy_policy_function_); +} + inline void dsacache::Cache::Init(CachePolicy* cache_policy_function, CopyPolicy* copy_policy_function) { cache_policy_function_ = cache_policy_function; copy_policy_function_ = copy_policy_function; @@ -221,8 +236,6 @@ inline void dsacache::Cache::Init(CachePolicy* cache_policy_function, CopyPolicy cache_state_.insert({node,{}}); } } - - std::cout << "[-] Cache Initialized" << std::endl; } inline std::unique_ptr dsacache::Cache::Access(uint8_t* data, const size_t size) { @@ -314,9 +327,6 @@ inline uint8_t* dsacache::Cache::AllocOnNode(const size_t size, const int node) } inline void dsacache::Cache::SubmitTask(CacheData* task, const int dst_node, const int src_node) { - std::cout << "[+] Allocating " << task->size_ << "B on node " << dst_node 
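// Editor's sketch of the C++20 wait/notify handoff that WaitOnCompletion()
// builds on: copies of a CacheData block in std::atomic::wait(nullptr) until
// the instance owning the handlers publishes the finished pointer and
// notifies all waiters.

#include <atomic>
#include <cstdint>
#include <thread>

void WaitNotifyExample() {
    std::atomic<uint8_t*> cache{ nullptr };
    static uint8_t buffer[64];

    std::thread waiter([&cache]() {
        cache.wait(nullptr);            // returns once the stored value differs from nullptr
        uint8_t* ready = cache.load();  // the published pointer is now safe to use
        (void)ready;
    });

    cache.store(buffer);                // publish the completed copy ...
    cache.notify_all();                 // ... and wake every thread waiting on the change

    waiter.join();
}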
<< " for " << std::hex << (uint64_t)task->src_ << std::dec << std::endl; - - uint8_t* dst = AllocOnNode(task->size_, dst_node); if (dst == nullptr) { @@ -338,8 +348,6 @@ inline void dsacache::Cache::SubmitTask(CacheData* task, const int dst_node, con const size_t size = task->size_ / task_count; const size_t last_size = size + task->size_ % task_count; - std::cout << "[-] Splitting Copy into " << task_count << " tasks of " << size << "B 0x" << std::hex << (uint64_t)task->src_ << std::dec << std::endl; - // save the current numa node mask to restore later // as executing the copy task will place this thread // on a different node @@ -389,8 +397,6 @@ inline void dsacache::Cache::GetCacheNode(uint8_t* src, const size_t size, int* } inline void dsacache::Cache::Flush(const int node) { - std::cout << "[-] Flushing Cache for " << (node == -1 ? "all nodes" : "node " + std::to_string(node)) << std::endl; - // this lambda is used because below we have two code paths that // flush nodes, either one single or all successively @@ -463,16 +469,12 @@ inline std::unique_ptr dsacache::Cache::GetFromCache(uint8_ // now check whether the sizes match if (search->second.size_ >= size) { - std::cout << "[+] Found Cached version for 0x" << std::hex << (uint64_t)src << std::dec << std::endl; - // return a unique copy of the entry which uses the object // lifetime and destructor to safely handle deallocation return std::move(std::make_unique(search->second)); } else { - std::cout << "[!] Found Cached version with size missmatch for 0x" << std::hex << (uint64_t)src << std::dec << std::endl; - // if the sizes missmatch then we clear the current entry from cache // which will cause its deletion only after the last possible outside // reference is also destroyed @@ -485,8 +487,6 @@ inline std::unique_ptr dsacache::Cache::GetFromCache(uint8_ } inline dsacache::CacheData::CacheData(uint8_t* data, const size_t size) { - std::cout << "[-] New CacheData 0x" << std::hex << (uint64_t)data << std::dec << std::endl; - src_ = data; size_ = size; active_ = new std::atomic(1); @@ -496,8 +496,6 @@ inline dsacache::CacheData::CacheData(uint8_t* data, const size_t size) { } inline dsacache::CacheData::CacheData(const dsacache::CacheData& other) { - std::cout << "[-] Copy Created for CacheData 0x" << std::hex << (uint64_t)other.src_ << std::dec << std::endl; - // we copy the ptr to the global atomic reference counter // and increase the amount of active references @@ -521,8 +519,6 @@ inline dsacache::CacheData::CacheData(const dsacache::CacheData& other) { } inline dsacache::CacheData::~CacheData() { - std::cout << "[-] Destructor for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; - // if this is the first instance of this cache structure // and it has not been waited on and is now being destroyed // we must wait on completion here to ensure the cache @@ -542,8 +538,6 @@ inline dsacache::CacheData::~CacheData() { // as this was the last reference if (v <= 0) { - std::cout << "[!] Full Destructor for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; - Deallocate(); delete active_; @@ -552,8 +546,6 @@ inline dsacache::CacheData::~CacheData() { } inline void dsacache::CacheData::Deallocate() { - std::cout << "[!] 
Deallocating for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; - // although deallocate should only be called from // a safe context to do so, it can not hurt to // defensively perform the operation atomically @@ -582,23 +574,17 @@ inline void dsacache::CacheData::WaitOnCompletion() { // are non-null or it is not if (handlers_ == nullptr) { - std::cout << "[-] Waiting on cache-var-update for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; - // when no handlers are attached to this cache entry we wait on a // value change for the cache structure from nullptr to non-null // which will either go through immediately if the cache is valid // already or wait until the handler-owning thread notifies us cache_->wait(nullptr); - - std::cout << "[+] Finished waiting on cache-var-update for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; } else { // when the handlers are non-null there are some DSA task handlers // available on which we must wait here - std::cout << "[-] Waiting on handlers for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; - // abort is set if any operation encountered an error bool abort = false; @@ -637,8 +623,6 @@ inline void dsacache::CacheData::WaitOnCompletion() { numa_free(incomplete_cache_, size_); } else { - std::cout << "[+] Finished waiting on handlers for CacheData 0x" << std::hex << (uint64_t)src_ << std::dec << std::endl; - // incomplete cache is now safe to use and therefore we // swap it with the global cache state of this entry // and notify potentially waiting threads From 0fdf650fe4b3fe23c380e67c0f0ff0e927e0a5bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Constantin=20F=C3=BCrst?= Date: Mon, 15 Jan 2024 22:43:38 +0100 Subject: [PATCH 20/29] improve the class-comments for Cache and CacheData, also free incomplete_cache_ if it has not been waited for (see comment on this) --- offloading-cacher/cache.hpp | 125 ++++++++++++++++++++++++++++++++---- 1 file changed, 113 insertions(+), 12 deletions(-) diff --git a/offloading-cacher/cache.hpp b/offloading-cacher/cache.hpp index 6b0e712..d96f02b 100644 --- a/offloading-cacher/cache.hpp +++ b/offloading-cacher/cache.hpp @@ -59,13 +59,33 @@ namespace dml { namespace dsacache { class Cache; - // cache data holds all required information on - // one cache entry and will both be stored - // internally by the cache and handed out - // as copies to the user - // this class uses its object lifetime and - // a global reference counter to allow - // thread-safe copies and resource management + /* + * Class Description: + * Holds all required information on one cache entry and is used + * both internally by the Cache and externally by the user. + * + * Important Usage Notes: + * The pointer is only updated in WaitOnCompletion() which + * therefore must be called by the user at some point in order + * to use the cached data. Using this class as T for + * std::shared_ptr is not recommended as references are + * already counted internally. + * + * Cache Lifetime: + * As long as the instance is referenced, the pointer it stores + * is guaranteed to be either nullptr or pointing to a valid copy. + * + * Implementation Detail: + * Performs self-reference counting with a shared atomic integer. + * Therefore on creating a copy the reference count is increased + * and with the destructor it is deacresed. If the last copy is + * destroyed the actual underlying data is freed and all shared + * variables deleted. 
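// Editor's sketch of the reference-counting scheme described in the comment
// above, reduced to its core: a shared atomic counter that every copy
// increments and every destructor decrements, with cleanup performed by
// whichever instance drops the count to zero (assignment omitted for brevity).

#include <atomic>
#include <cstdint>

class RefCounted {
    std::atomic<int32_t>* active_;

public:
    RefCounted() : active_(new std::atomic<int32_t>(1)) {}

    RefCounted(const RefCounted& other) : active_(other.active_) {
        active_->fetch_add(1);                  // one more live reference
    }

    ~RefCounted() {
        // fetch_sub() returns the previous value, so subtract one locally
        if (active_->fetch_sub(1) - 1 <= 0) {
            delete active_;                     // last reference cleans up shared state
        }
    }
};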
+ * + * Notes on Thread Safety: + * Class is thread safe in any possible state and performs + * reference counting and deallocation itself entirely atomically. + */ class CacheData { public: @@ -101,6 +121,7 @@ namespace dsacache { bool Active() const; friend Cache; + public: CacheData(uint8_t* data, const size_t size); CacheData(const CacheData& other); @@ -118,11 +139,62 @@ namespace dsacache { uint8_t* GetDataLocation() const; }; - // cache class will handle access to data through the cache - // by managing the cache through work submission, it sticks - // to user-defined caching and copy policies, is thread - // safe after initialization and returns copies of - // cache data class to the user + /* + * Class Description: + * Class will handle access to data through internal copies. + * These are obtained via work submission to the Intel DSA which takes + * care of asynchronously duplicating the data. The user will define + * where these copies lie and which system nodes will perform the copy. + * This is done through policy functions set during initialization. + * + * Placement Policy: + * The Placement Policy Function decides on which node a particular + * entry is to be placed, given the current executing node and the + * data source node and data size. This in turn means that for one + * datum, multiple cached copies may exist at one time. + * + * Cache Lifetime: + * When accessing the cache, a CacheData-object will be returned. + * As long as this object lives, the pointer which it holds is + * guaranteed to be either nullptr or a valid copy. When destroyed + * the entry is marked for deletion which is only carried out + * when system memory pressure drives an automated cache flush. + * + * Restrictions: + * - Overlapping Pointers may lead to undefined behaviour during + * manual cache invalidation which should not be used if you + * intend to have these types of pointers + * - Cache Invalidation may only be performed manually and gives + * no ordering guarantees. Therefore, it is the users responsibility + * to ensure that results after invalidation have been generated + * using the latest state of data. The cache is best suited + * to static data. + * + * Notes on Thread Safety: + * - Cache is completely thread-safe after initialization + * - CacheData-class will handle deallocation of data itself by + * performing self-reference-counting atomically and only + * deallocating if the last reference is destroyed + * - The internal cache state has one lock which is either + * acquired shared for reading the state (upon accessing an already + * cached element) or unique (accessing a new element, flushing, invalidating) + * - Waiting on copy completion is done over an atomic-wait in copies + * of the original CacheData-instance + * - Overall this class may experience performance issues due to the use + * of locking (in any configuration), lock contention (worsens with higher + * core count, node count and utilization) and atomics (worse in the same + * situations as lock contention) + * + * Improving Performance: + * When data is never shared between threads or memory size for the cache is + * not an issue you may consider having one Cache-instance per thread and removing + * the lock in Cache and modifying the reference counting and waiting mechanisms + * of CacheData accordingly (although this is high effort and will yield little due + * to the atomics not being shared among cores/nodes). + * Otherwise, one Cache-instance per node could also be considered. 
This will allow + * the placement policy function to be barebones and reduces the lock contention and + * synchronization impact of the atomic variables. + */ class Cache { public: @@ -202,6 +274,8 @@ namespace dsacache { // be properly deleted, but the cache // will be fresh - use for testing void Clear(); + + void Invalidate(uint8_t* data); }; } @@ -486,6 +560,28 @@ inline std::unique_ptr dsacache::Cache::GetFromCache(uint8_ return nullptr; } +void dsacache::Cache::Invalidate(uint8_t* data) { + // as the cache is modified we must obtain a unique writers lock + + std::unique_lock lock(cache_mutex_); + + // loop through all per-node-caches available + + for (auto node : cache_state_) { + // search for an entry for the given data pointer + + auto search = node.second.find(data); + + if (search != node.second.end()) { + // if the data is represented in-cache + // then it will be erased to re-trigger + // caching on next access + + node.second.erase(search); + } + } +} + inline dsacache::CacheData::CacheData(uint8_t* data, const size_t size) { src_ = data; size_ = size; @@ -552,6 +648,11 @@ inline void dsacache::CacheData::Deallocate() { uint8_t* cache_local = cache_->exchange(nullptr); if (cache_local != nullptr) numa_free(cache_local, size_); + + // if the cache was never waited for then incomplete_cache_ + // may still contain a valid pointer which has to be freed + + if (incomplete_cache_ != nullptr) numa_free(incomplete_cache_, size_); } inline uint8_t* dsacache::CacheData::GetDataLocation() const { From e570a6fe696bba485c976c04ba808aa9ce6e182d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Constantin=20F=C3=BCrst?= Date: Mon, 15 Jan 2024 22:48:01 +0100 Subject: [PATCH 21/29] reduce the line-count of the switch statement for dml::StatusToString by inlining return with case statements --- offloading-cacher/cache.hpp | 54 +++++++++++++------------------------ 1 file changed, 18 insertions(+), 36 deletions(-) diff --git a/offloading-cacher/cache.hpp b/offloading-cacher/cache.hpp index d96f02b..6a717ff 100644 --- a/offloading-cacher/cache.hpp +++ b/offloading-cacher/cache.hpp @@ -16,42 +16,24 @@ namespace dml { inline const std::string StatusCodeToString(const dml::status_code code) { switch (code) { - case dml::status_code::ok: - return "ok"; - case dml::status_code::false_predicate: - return "false predicate"; - case dml::status_code::partial_completion: - return "partial completion"; - case dml::status_code::nullptr_error: - return "nullptr error"; - case dml::status_code::bad_size: - return "bad size"; - case dml::status_code::bad_length: - return "bad length"; - case dml::status_code::inconsistent_size: - return "inconsistent size"; - case dml::status_code::dualcast_bad_padding: - return "dualcast bad padding"; - case dml::status_code::bad_alignment: - return "bad alignment"; - case dml::status_code::buffers_overlapping: - return "buffers overlapping"; - case dml::status_code::delta_delta_empty: - return "delta delta empty"; - case dml::status_code::batch_overflow: - return "batch overflow"; - case dml::status_code::execution_failed: - return "execution failed"; - case dml::status_code::unsupported_operation: - return "unsupported operation"; - case dml::status_code::queue_busy: - return "queue busy"; - case dml::status_code::error: - return "unknown error"; - case dml::status_code::config_error: - return "config error"; - default: - return "unhandled error"; + case dml::status_code::ok: return "ok"; + case dml::status_code::false_predicate: return "false predicate"; + case 
dml::status_code::partial_completion: return "partial completion"; + case dml::status_code::nullptr_error: return "nullptr error"; + case dml::status_code::bad_size: return "bad size"; + case dml::status_code::bad_length: return "bad length"; + case dml::status_code::inconsistent_size: return "inconsistent size"; + case dml::status_code::dualcast_bad_padding: return "dualcast bad padding"; + case dml::status_code::bad_alignment: return "bad alignment"; + case dml::status_code::buffers_overlapping: return "buffers overlapping"; + case dml::status_code::delta_delta_empty: return "delta delta empty"; + case dml::status_code::batch_overflow: return "batch overflow"; + case dml::status_code::execution_failed: return "execution failed"; + case dml::status_code::unsupported_operation: return "unsupported operation"; + case dml::status_code::queue_busy: return "queue busy"; + case dml::status_code::error: return "unknown error"; + case dml::status_code::config_error: return "config error"; + default: return "unhandled error"; } } } From e4a681ac1efb2ca5c0d274b3110d903dc1c78c95 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Constantin=20F=C3=BCrst?= Date: Mon, 15 Jan 2024 22:50:34 +0100 Subject: [PATCH 22/29] delete the copy-constructor for cache as copying it is undesired behaviour --- offloading-cacher/cache.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/offloading-cacher/cache.hpp b/offloading-cacher/cache.hpp index 6a717ff..78d16b0 100644 --- a/offloading-cacher/cache.hpp +++ b/offloading-cacher/cache.hpp @@ -237,6 +237,8 @@ namespace dsacache { std::unique_ptr GetFromCache(uint8_t* src, const size_t size, const int dst_node); public: + Cache(const Cache& other) = delete; + // initializes the cache with the two policy functions // only after this is it safe to use in a threaded environment void Init(CachePolicy* cache_policy_function, CopyPolicy* copy_policy_function); From da38c048ca53bf2a1971eeae500e92242c4e235f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Constantin=20F=C3=BCrst?= Date: Tue, 16 Jan 2024 22:14:58 +0100 Subject: [PATCH 23/29] pass data size to copy policy function too --- offloading-cacher/cache.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/offloading-cacher/cache.hpp b/offloading-cacher/cache.hpp index 78d16b0..b84e347 100644 --- a/offloading-cacher/cache.hpp +++ b/offloading-cacher/cache.hpp @@ -191,7 +191,7 @@ namespace dsacache { // copy policy specifies the copy-executing nodes for a given task // which allows flexibility in assignment for optimizing raw throughput // or choosing a conservative usage policy - typedef std::vector (CopyPolicy)(const int numa_dst_node, const int numa_src_node); + typedef std::vector (CopyPolicy)(const int numa_dst_node, const int numa_src_node, const size_t data_size); private: // mutex for accessing the cache state map @@ -396,7 +396,7 @@ inline void dsacache::Cache::SubmitTask(CacheData* task, const int dst_node, con // querry copy policy function for the nodes to use for the copy - const std::vector executing_nodes = copy_policy_function_(dst_node, src_node); + const std::vector executing_nodes = copy_policy_function_(dst_node, src_node, task->size_); const size_t task_count = executing_nodes.size(); // each task will copy one fair part of the total size From 5578f06c80eb8db114e77ab965b2dc59ef484639 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Constantin=20F=C3=BCrst?= Date: Tue, 16 Jan 2024 22:15:36 +0100 Subject: [PATCH 24/29] adapt copy policy function to take data size as well and use this to only use 
destination nodes dsa engine for small data sizes on xeonmax --- offloading-cacher/main.cpp | 42 ++++++++++++++++++++++++++++++-------- 1 file changed, 33 insertions(+), 9 deletions(-) diff --git a/offloading-cacher/main.cpp b/offloading-cacher/main.cpp index 443b00b..8193f5a 100644 --- a/offloading-cacher/main.cpp +++ b/offloading-cacher/main.cpp @@ -7,6 +7,8 @@ #include "cache.hpp" +static constexpr size_t SIZE_64_MIB = 64 * 1024 * 1024; + dsacache::Cache CACHE; void InitCache(const std::string& device) { @@ -15,25 +17,47 @@ void InitCache(const std::string& device) { return numa_dst_node; }; - auto copy_policy = [](const int numa_dst_node, const int numa_src_node) { - return std::vector{ numa_src_node, numa_dst_node }; + auto copy_policy = [](const int numa_dst_node, const int numa_src_node, const size_t data_size) { + return std::vector{ numa_dst_node }; }; CACHE.Init(cache_policy,copy_policy); } else if (device == "xeonmax") { auto cache_policy = [](const int numa_dst_node, const int numa_src_node, const size_t data_size) { + // xeon max is configured to have hbm on node ids that are +8 + return numa_dst_node < 8 ? numa_dst_node + 8 : numa_dst_node; }; - auto copy_policy = [](const int numa_dst_node, const int numa_src_node) { - const bool same_socket = ((numa_dst_node ^ numa_src_node) & 4) == 0; - if (same_socket) { - const bool socket_number = numa_dst_node >> 2; - if (socket_number == 0) return std::vector{ 0, 1, 2, 3 }; - else return std::vector{ 4, 5, 6, 7 }; + auto copy_policy = [](const int numa_dst_node, const int numa_src_node, const size_t data_size) { + if (data_size < SIZE_64_MIB) { + // if the data size is small then the copy will just be carried + // out by the destination node which does not require setting numa + // thread affinity as the selected dsa engine is already the one + // present on the calling thread + + return std::vector{ (numa_dst_node >= 8 ? numa_dst_node - 8 : numa_dst_node) }; + } + else { + // for sufficiently large data, smart copy is used which will utilize + // all four engines for intra-socket copy operations and cross copy on + // the source and destination nodes for inter-socket copy + + const bool same_socket = ((numa_dst_node ^ numa_src_node) & 4) == 0; + + if (same_socket) { + const bool socket_number = numa_dst_node >> 2; + if (socket_number == 0) return std::vector{ 0, 1, 2, 3 }; + else return std::vector{ 4, 5, 6, 7 }; + } + else { + return std::vector{ + (numa_src_node >= 8 ? numa_src_node - 8 : numa_src_node), + (numa_dst_node >= 8 ? 
numa_dst_node - 8 : numa_dst_node) + }; + } } - else return std::vector{ numa_src_node, numa_dst_node }; }; CACHE.Init(cache_policy,copy_policy); From 641a7593feb2d12b72f79f7b1d1583ac052a75fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Constantin=20F=C3=BCrst?= Date: Wed, 17 Jan 2024 11:15:00 +0100 Subject: [PATCH 25/29] add query driven prefetching code repository copy --- qdp_project/.gitignore | 104 +++++ qdp_project/CMakeLists.txt | 104 +++++ qdp_project/README.md | 3 + qdp_project/bench_all_dimes.sh | 10 + qdp_project/bench_max.sh | 15 + qdp_project/cmake_all_dimes.sh | 33 ++ qdp_project/cmake_max.sh | 9 + qdp_project/src/.gitkeep | 0 .../src/algorithm/operators/aggregation.h | 316 ++++++++++++++ qdp_project/src/algorithm/operators/filter.h | 170 ++++++++ qdp_project/src/benchmark/DIMES_benchmark.cpp | 240 +++++++++++ .../src/benchmark/DIMES_cores_benchmark.cpp | 260 ++++++++++++ qdp_project/src/benchmark/MAX_benchmark.cpp | 289 +++++++++++++ qdp_project/src/benchmark/QDP_minimal.h | 147 +++++++ .../src/benchmark/doubly_filtered_agg.cpp | 149 +++++++ .../benchmark/filter_aggregate_pipeline.cpp | 184 ++++++++ qdp_project/src/benchmark/latency.cpp | 188 +++++++++ .../src/benchmark/micro_benchmarks.cpp | 271 ++++++++++++ .../pipelines/DIMES_scan_filter_pipe.h | 391 +++++++++++++++++ .../pipelines/MAX_scan_filter_pipe.h | 395 ++++++++++++++++++ .../benchmark/pipelines/scan_filter_pipe.h | 387 +++++++++++++++++ qdp_project/src/utils/array_utils.h | 80 ++++ qdp_project/src/utils/barrier_utils.h | 73 ++++ qdp_project/src/utils/const.h | 33 ++ qdp_project/src/utils/cpu_set_utils.h | 82 ++++ qdp_project/src/utils/execution_modes.h | 89 ++++ qdp_project/src/utils/file_output.h | 76 ++++ qdp_project/src/utils/iterable_range.h | 208 +++++++++ qdp_project/src/utils/measurement_utils.h | 152 +++++++ qdp_project/src/utils/memory_literals.h | 45 ++ qdp_project/src/utils/pcm.h | 6 + qdp_project/src/utils/timer_utils.h | 80 ++++ qdp_project/src/utils/vector_loader.h | 93 +++++ 33 files changed, 4682 insertions(+) create mode 100644 qdp_project/.gitignore create mode 100644 qdp_project/CMakeLists.txt create mode 100644 qdp_project/README.md create mode 100644 qdp_project/bench_all_dimes.sh create mode 100644 qdp_project/bench_max.sh create mode 100644 qdp_project/cmake_all_dimes.sh create mode 100644 qdp_project/cmake_max.sh create mode 100644 qdp_project/src/.gitkeep create mode 100644 qdp_project/src/algorithm/operators/aggregation.h create mode 100644 qdp_project/src/algorithm/operators/filter.h create mode 100644 qdp_project/src/benchmark/DIMES_benchmark.cpp create mode 100644 qdp_project/src/benchmark/DIMES_cores_benchmark.cpp create mode 100644 qdp_project/src/benchmark/MAX_benchmark.cpp create mode 100644 qdp_project/src/benchmark/QDP_minimal.h create mode 100644 qdp_project/src/benchmark/doubly_filtered_agg.cpp create mode 100644 qdp_project/src/benchmark/filter_aggregate_pipeline.cpp create mode 100644 qdp_project/src/benchmark/latency.cpp create mode 100644 qdp_project/src/benchmark/micro_benchmarks.cpp create mode 100644 qdp_project/src/benchmark/pipelines/DIMES_scan_filter_pipe.h create mode 100644 qdp_project/src/benchmark/pipelines/MAX_scan_filter_pipe.h create mode 100644 qdp_project/src/benchmark/pipelines/scan_filter_pipe.h create mode 100644 qdp_project/src/utils/array_utils.h create mode 100644 qdp_project/src/utils/barrier_utils.h create mode 100644 qdp_project/src/utils/const.h create mode 100644 qdp_project/src/utils/cpu_set_utils.h create mode 100644 
qdp_project/src/utils/execution_modes.h create mode 100644 qdp_project/src/utils/file_output.h create mode 100644 qdp_project/src/utils/iterable_range.h create mode 100644 qdp_project/src/utils/measurement_utils.h create mode 100644 qdp_project/src/utils/memory_literals.h create mode 100644 qdp_project/src/utils/pcm.h create mode 100644 qdp_project/src/utils/timer_utils.h create mode 100644 qdp_project/src/utils/vector_loader.h diff --git a/qdp_project/.gitignore b/qdp_project/.gitignore new file mode 100644 index 0000000..1a8b920 --- /dev/null +++ b/qdp_project/.gitignore @@ -0,0 +1,104 @@ + + +bin/ + + +# CMake building files +CMakeLists.txt.user +CMakeCache.txt +CMakeFiles +CMakeScripts +Testing +Makefile +cmake_install.cmake +install_manifest.txt +compile_commands.json +CTestTestfile.cmake +_deps +.cmake + +# Prerequisites +*.d + +# Object files +*.o +*.ko +*.obj +*.elf + +# Linker output +*.ilk +*.map +*.exp + +# Precompiled Headers +*.gch +*.pch + +# Libraries +*.lib +*.a +*.la +*.lo + +# Shared objects (inc. Windows DLLs) +*.dll +*.so +*.so.* +*.dylib + +# Executables +*.exe +*.out +*.app +*.i*86 +*.x86_64 +*.hex + +# Debug files +*.dSYM/ +*.su +*.idb +*.pdb + +# Kernel Module Compile Results +*.mod* +*.cmd +.tmp_versions/ +modules.order +Module.symvers +Mkfile.old +dkms.conf + +# Prerequisites +*.d + +# Compiled Object files +*.slo +*.lo +*.o +*.obj + +# Precompiled Headers +*.gch +*.pch + +# Compiled Dynamic libraries +*.so +*.dylib +*.dll + +# Fortran module files +*.mod +*.smod + +# Compiled Static libraries +*.lai +*.la +*.a +*.lib + +# Executables +*.exe +*.out +*.app diff --git a/qdp_project/CMakeLists.txt b/qdp_project/CMakeLists.txt new file mode 100644 index 0000000..71c8452 --- /dev/null +++ b/qdp_project/CMakeLists.txt @@ -0,0 +1,104 @@ +cmake_minimum_required(VERSION 3.18) + +# set the project name +project(NUMA_Slow_Fast_Datamigration_Test VERSION 0.1) + +# specify the C standard +set(CMAKE_CXX_STANDARD 20) +set(CMAKE_CXX_STANDARD_REQUIRED True) + +#set flags on need cross compile for sapphirerapids architecture +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=sapphirerapids") +#set flags on need cross compile for skylake micro architecture +#set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=skylake-avx512") +#set flags on need cross compile for knights landing micro architecture (for debugging) +#set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -mavx512cd -mavx512er -mavx512pf") + +#suppress selected! warnigs that are not very important to resolve. 
This is to keep the compileation output clean +set(SUPPRESS_WARNINGS "-Wno-literal-suffix -Wno-volatile") + +set(DEBUG_FLAGS "-g3" "-ggdb") +set(RELEASE_FLAGS "-O3") + +#set pcm location +set(PCM_LOCATION ./thirdParty/pcm) +set(PCM_LINKS -lpcm -L${CMAKE_CURRENT_LIST_DIR}/${PCM_LOCATION}/build/lib) +# pass the in formation about the shared library location to the linker +link_directories(${CMAKE_CURRENT_LIST_DIR}/${PCM_LOCATION}/build/lib) + +#set flags used for Release and Debug build type +add_compile_options( + "$<$:${RELEASE_FLAGS}>" + "$<$:${DEBUG_FLAGS}>" +) + +# evaluate custom variables +function(eval vname vvalid vdefault) + # is variable is set to the below value if its not already defined from the comand line + set(VALID ${vvalid} CACHE INTERNAL "Possible values for ${vname}") + set(${vname} ${vdefault} CACHE STRING "The barrier mode") + # command for GUI shenanigans + set_property(CACHE ${vname} PROPERTY STRINGS VALID) + + if(${vname} IN_LIST VALID) + message(STATUS "Variable ${vname} = ${${vname}}") + else() + message(STATUS "Variable ${vname} has invalid value ${${vname}}") + # set the fallback value for use in parent function + unset(${vname} CACHE) + message(STATUS "Fallback to default: ${vname} = ${vdefault}") + set(${vname} ${vdefault} PARENT_SCOPE) + endif() +endfunction() + +eval(WSUPPRESS "suppress;show" "show") +if($ EQUAL 1) + add_compile_options("${SUPPRESS_WARNINGS}") +endif() + +eval(BARRIER_MODE "global;local" "global") +add_definitions(-DBARRIER_MODE="${BARRIER_MODE}") + +eval(BUFFER_LIMIT "unlimited;limited" "unlimited") +add_definitions(-DBUFFER_LIMIT=$) + +eval(QUERY "simple;complex" "simple") +add_definitions(-DQUERY=$) + +eval(THREAD_FACTOR "1;2;3;4;5;6;7;8;9;10" "4") +add_definitions(-DTHREAD_GROUP_MULTIPLIER=${THREAD_FACTOR}) + +eval(PINNING "cpu;numa" "cpu") +add_definitions(-DPINNING=$) + +eval(PCM_M "true;false" "false") +add_definitions(-DPCM_M=$) +add_definitions(${PCM_LINKS}) + +# build directory +set(CMAKE_BINARY_DIR "../bin") #relative to inside build +set(EXECUTABLE_OUTPUT_PATH ${CMAKE_BINARY_DIR}) + + + +# include directories +include_directories(src/utils) +include_directories(src/algorithm) +include_directories(src/algorithm/operators) +include_directories(thirdParty/pcm/src) + +# link libraries +link_libraries(-lnuma -lpthread) + +# Add targets only below +# specify build targets +add_executable(FilterAggregatePipeline src/benchmark/filter_aggregate_pipeline.cpp) +add_executable(DoublyFiltered src/benchmark/doubly_filtered_agg.cpp) +add_executable(DIMESBench src/benchmark/DIMES_benchmark.cpp) +add_executable(DIMESCoreBench src/benchmark/DIMES_cores_benchmark.cpp) +add_executable(MicroBench src/benchmark/micro_benchmarks.cpp) +add_executable(MAXBench src/benchmark/MAX_benchmark.cpp + src/benchmark/QDP_minimal.h) +target_link_libraries(MAXBench libpcm.so) +add_executable(LatencyBench src/benchmark/latency.cpp) + diff --git a/qdp_project/README.md b/qdp_project/README.md new file mode 100644 index 0000000..afad56b --- /dev/null +++ b/qdp_project/README.md @@ -0,0 +1,3 @@ +This is a copy of the Query Driven Prefetching Repository +https://os.inf.tu-dresden.de/repo/gitbox/andre.berthold/Query-driven_Prefetching/src/branch/qdp_minimal/code +Original Authors: André Berthold and Anna Bartuschka diff --git a/qdp_project/bench_all_dimes.sh b/qdp_project/bench_all_dimes.sh new file mode 100644 index 0000000..9c05e62 --- /dev/null +++ b/qdp_project/bench_all_dimes.sh @@ -0,0 +1,10 @@ +#!bin/bash + +../bin/DIMESBench_gus +../bin/DIMESBench_guc 
+../bin/DIMESBench_gls +../bin/DIMESBench_glc +../bin/DIMESBench_lus +../bin/DIMESBench_luc +../bin/DIMESBench_lls +../bin/DIMESBench_llc \ No newline at end of file diff --git a/qdp_project/bench_max.sh b/qdp_project/bench_max.sh new file mode 100644 index 0000000..fb08bd8 --- /dev/null +++ b/qdp_project/bench_max.sh @@ -0,0 +1,15 @@ +#!bin/bash + +current_date_time=$(date) +echo "Benchmark start at: $current_date_time" + +../bin/MAXBench_gcc + +cp ../results/max_q-complex_bm-global_bl-unlimited_tc-121MiB-2MiB.csv ../results/max_q-complex_bm-global_bl-unlimited_tc-121MiB-2MiB_pin_c_HBM.csv + +../bin/MAXBench_gcn + +cp ../results/max_q-complex_bm-global_bl-unlimited_tc-121MiB-2MiB.csv ../results/max_q-complex_bm-global_bl-unlimited_tc-121MiB-2MiB_pin_n_HBM.csv + +current_date_time=$(date) +echo "Benchmark end at: $current_date_time" \ No newline at end of file diff --git a/qdp_project/cmake_all_dimes.sh b/qdp_project/cmake_all_dimes.sh new file mode 100644 index 0000000..9ce3a96 --- /dev/null +++ b/qdp_project/cmake_all_dimes.sh @@ -0,0 +1,33 @@ +#!bin/bash + +cmake -DCMAKE_BUILD_TYPE=Release -DWSUPPRESS=suppress -DBARRIER_MODE=global -DBUFFER_LIMIT=unlimited -DQUERY=simple .. +cmake --build . --target DIMESBench +mv ../bin/DIMESBench ../bin/DIMESBench_gus + +cmake -DCMAKE_BUILD_TYPE=Release -DWSUPPRESS=suppress -DBARRIER_MODE=global -DBUFFER_LIMIT=unlimited -DQUERY=complex .. +cmake --build . --target DIMESBench +mv ../bin/DIMESBench ../bin/DIMESBench_guc + +cmake -DCMAKE_BUILD_TYPE=Release -DWSUPPRESS=suppress -DBARRIER_MODE=global -DBUFFER_LIMIT=limited -DQUERY=simple .. +cmake --build . --target DIMESBench +mv ../bin/DIMESBench ../bin/DIMESBench_gls + +cmake -DCMAKE_BUILD_TYPE=Release -DWSUPPRESS=suppress -DBARRIER_MODE=global -DBUFFER_LIMIT=limited -DQUERY=complex .. +cmake --build . --target DIMESBench +mv ../bin/DIMESBench ../bin/DIMESBench_glc + +cmake -DCMAKE_BUILD_TYPE=Release -DWSUPPRESS=suppress -DBARRIER_MODE=local -DBUFFER_LIMIT=unlimited -DQUERY=simple .. +cmake --build . --target DIMESBench +mv ../bin/DIMESBench ../bin/DIMESBench_lus + +cmake -DCMAKE_BUILD_TYPE=Release -DWSUPPRESS=suppress -DBARRIER_MODE=local -DBUFFER_LIMIT=unlimited -DQUERY=complex .. +cmake --build . --target DIMESBench +mv ../bin/DIMESBench ../bin/DIMESBench_luc + +cmake -DCMAKE_BUILD_TYPE=Release -DWSUPPRESS=suppress -DBARRIER_MODE=local -DBUFFER_LIMIT=limited -DQUERY=simple .. +cmake --build . --target DIMESBench +mv ../bin/DIMESBench ../bin/DIMESBench_lls + +cmake -DCMAKE_BUILD_TYPE=Release -DWSUPPRESS=suppress -DBARRIER_MODE=local -DBUFFER_LIMIT=limited -DQUERY=complex .. +cmake --build . --target DIMESBench +mv ../bin/DIMESBench ../bin/DIMESBench_llc \ No newline at end of file diff --git a/qdp_project/cmake_max.sh b/qdp_project/cmake_max.sh new file mode 100644 index 0000000..03c137b --- /dev/null +++ b/qdp_project/cmake_max.sh @@ -0,0 +1,9 @@ +#!bin/bash + +cmake -DCMAKE_BUILD_TYPE=Release -DWSUPPRESS=suppress -DBARRIER_MODE=global -DBUFFER_LIMIT=unlimited -DQUERY=complex -DTHREAD_FACTOR=2 -DPINNING=cpu -DPCM_M=false .. +cmake --build . --target MAXBench +mv ../bin/MAXBench ../bin/MAXBench_gcc + +cmake -DCMAKE_BUILD_TYPE=Release -DWSUPPRESS=suppress -DBARRIER_MODE=global -DBUFFER_LIMIT=unlimited -DQUERY=complex -DTHREAD_FACTOR=2 -DPINNING=numa -DPCM_M=false .. +cmake --build . 
--target MAXBench +mv ../bin/MAXBench ../bin/MAXBench_gcn diff --git a/qdp_project/src/.gitkeep b/qdp_project/src/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/qdp_project/src/algorithm/operators/aggregation.h b/qdp_project/src/algorithm/operators/aggregation.h new file mode 100644 index 0000000..119ab14 --- /dev/null +++ b/qdp_project/src/algorithm/operators/aggregation.h @@ -0,0 +1,316 @@ +#pragma once + +#include +#include +#include +#include + +#include "vector_loader.h" +#include "const.h" + + +/** + * @brief Super Class for all Aggregation functions. Guards Sub Classes from having an non integral base type. + * + * @tparam T + */ +template +class AggFunction { + static_assert(std::is_integral::value, "The base type of an AggFunction must be an integral"); +}; + +/** + * @brief Template class that implements methods used for Summation. It wraps the corresponding vector intrinsics + * + * @tparam T base datatype for the implemented methods + */ +template +class Sum : public AggFunction { +public: + static inline __m512i simd_agg(__m512i aggregator, __m512i vector) { + if constexpr (sizeof(T) == 4) return _mm512_add_epi32(aggregator, vector); + else if constexpr (sizeof(T) == 8) return _mm512_add_epi64(aggregator, vector); + static_assert(sizeof(T) == 4 || sizeof(T) == 8, "Sum is only implemented for 32 and 64 wide integers"); + }; + + static inline __m512i simd_agg(__m512i aggregator, __mmask16 mask, __m512i vector) { + if constexpr (sizeof(T) == 4) return _mm512_mask_add_epi32(aggregator, mask, aggregator, vector); + else if constexpr (sizeof(T) == 8) return _mm512_mask_add_epi64(aggregator, mask, aggregator, vector); + static_assert(sizeof(T) == 4 || sizeof(T) == 8, "Sum is only implemented for 32 and 64 wide integers"); + }; + + static inline T simd_reduce(__m512i vector) { + if constexpr (sizeof(T) == 4) return _mm512_reduce_add_epi32(vector); + else if constexpr (sizeof(T) == 8) return _mm512_reduce_add_epi64(vector); + static_assert(sizeof(T) == 4 || sizeof(T) == 8, "Sum is only implemented for 32 and 64 wide integers"); + }; + + static inline T scalar_agg(T aggregator, T scalar) { return aggregator + scalar; }; + + static inline __m512i zero() { return _mm512_set1_epi32(0); }; +}; + + +/** + * @brief Template class that implements methods used for Maximum determination. 
It wraps the corresponding vector intrinsics + * + * @tparam T base datatype for the implemented methods + * + */ +template +class Max : public AggFunction { +public: + static inline __m512i simd_agg(__m512i aggregator, __m512i vector) { + if constexpr (sizeof(T) == 4) return _mm512_max_epi32(aggregator, vector); + else if constexpr (sizeof(T) == 8) return _mm512_max_epi64(aggregator, vector); + static_assert(sizeof(T) == 4 || sizeof(T) == 8, "Max is only implemented for 32 and 64 wide integers"); + } + + static inline __m512i simd_agg(__m512i aggregator, __mmask16 mask, __m512i vector) { + if constexpr (sizeof(T) == 4) return _mm512_mask_max_epi32(aggregator, mask, aggregator, vector); + else if constexpr (sizeof(T) == 8) return _mm512_mask_max_epi64(aggregator, mask, aggregator, vector); + static_assert(sizeof(T) == 4 || sizeof(T) == 8, "Max is only implemented for 32 and 64 wide integers"); + } + + static inline T simd_reduce(__m512i vector) { + if constexpr (sizeof(T) == 4) return _mm512_reduce_max_epi32(vector); + else if constexpr (sizeof(T) == 8) return _mm512_reduce_max_epi64(vector); + static_assert(sizeof(T) == 4 || sizeof(T) == 8, "Max is only implemented for 32 and 64 wide integers"); + } + + static inline T scalar_agg(T aggregator, T scalar) { return std::max(aggregator, scalar); } + + static inline __m512i zero() { + if constexpr (sizeof(T) == 4) { + if constexpr (std::is_signed::value) return _mm512_set1_epi32(0xFFFFFFFF); + else return _mm512_set1_epi32(0x0); + } + else if constexpr (sizeof(T) == 8) { + if constexpr (std::is_signed::value) return _mm512_set1_epi32(0xFFFFFFFFFFFFFFFF); + else return _mm512_set1_epi32(0x0); + } + static_assert(sizeof(T) == 4 || sizeof(T) == 8, "Max is only implemented for 32 and 64 wide integers"); + } +}; + +/** + * @brief Template class that implements methods used for Minimum determination. 
It wraps the corresponding vector intrinsics + * + * @tparam T base datatype for the implemented methods + * + */ +template +class Min : public AggFunction { +public: + static inline __m512i simd_agg(__m512i aggregator, __m512i vector) { + if constexpr (sizeof(T) == 4) return _mm512_min_epi32(aggregator, vector); + else if constexpr (sizeof(T) == 8) return _mm512_min_epi64(aggregator, vector); + static_assert(sizeof(T) == 4 || sizeof(T) == 8, "Min is only implemented for 32 and 64 wide integers"); + } + + static inline __m512i simd_agg(__m512i aggregator, __mmask16 mask, __m512i vector) { + if constexpr (sizeof(T) == 4) return _mm512_mask_min_epi32(aggregator, mask, aggregator, vector); + else if constexpr (sizeof(T) == 8) return _mm512_mask_min_epi64(aggregator, mask, aggregator, vector); + static_assert(sizeof(T) == 4 || sizeof(T) == 8, "Min is only implemented for 32 and 64 wide integers"); + } + + static inline T simd_reduce(__m512i vector) { + if constexpr (sizeof(T) == 4) return _mm512_reduce_min_epi32(vector); + else if constexpr (sizeof(T) == 8) return _mm512_reduce_min_epi64(vector); + static_assert(sizeof(T) == 4 || sizeof(T) == 8, "Min is only implemented for 32 and 64 wide integers"); + } + + static inline T scalar_agg(T aggregator, T scalar) { return std::min(aggregator, scalar); } + + static inline __m512i zero() { + if constexpr (sizeof(T) == 4) { + if constexpr (std::is_signed::value) return _mm512_set1_epi32(0xEFFFFFFF); + else return _mm512_set1_epi32(0xFFFFFFFF); + } + else if constexpr (sizeof(T) == 8) { + if constexpr (std::is_signed::value) return _mm512_set1_epi32(0xEFFFFFFFFFFFFFFF); + else return _mm512_set1_epi32(0xFFFFFFFFFFFFFFFF); + } + static_assert(sizeof(T) == 4 || sizeof(T) == 8, "Min is only implemented for 32 and 64 wide integers"); + } +}; + +/** + * @brief Template Class that implements an aggregation operation. + * + * @tparam base_t Base type of the values for aggregation + * @tparam func + * @tparam load_mode + */ +template class func, load_mode load_mode> +class Aggregation{ +public: + + static_assert(std::is_same_v, "Enforce unsigned 64 bit ints."); + + using OP = func; + /** + * @brief Calculates the memory maximal needed to store a chunk's processing result. + * + * @param chunk_size_b Size of the chunk in byte + * @return size_t Size of the chunk's processing result in byte + */ + static size_t result_bytes_per_chunk(size_t chunk_size_b) { + // aggregation returns a single value of type base_t + return sizeof(base_t); + } + + /** + * @brief Applies the aggregation function on the chunk starting at *src* and spanning *chunk_size_b* bytes. + * The result is written to main memory. + * + * @param dest Pointer to the start of the result chunk + * @param src Pointer to the start of the source chunk + * @param chunk_size_b Size of the source chunk in bytes + * @return true When the aggregation is done + * @return false Never + */ + static bool apply (base_t *dest, base_t *src, size_t chunk_size_b) { + constexpr size_t lanes = VECTOR_SIZE(); + size_t value_count = chunk_size_b / sizeof(base_t); + __m512i agg_vec = func::zero(); + size_t i = 0; + base_t result = 0; + // stop before! running out of space + if(value_count >= lanes) {// keep in mind value_count is unsigned so if it becomes negative, it doesn't. 
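+            // note: value_count and lanes are unsigned, so for value_count < lanes the
+            // expression value_count - lanes below would wrap around to a huge number and
+            // the loop would read far past the chunk; the guard above prevents exactly that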
+ for(; i <= value_count - lanes; i += lanes) { + __m512i vec = Vector_Loader::load(src + i); + + agg_vec = func::simd_agg(agg_vec, vec); + } + result = func::simd_reduce(agg_vec); + } + + for(; i < value_count; ++i) { + result = func::scalar_agg(result, src[i]); + } + *dest = result; + + return true; + } + + /** + * @brief Applies the aggregation function on the chunk starting at *src* and spanning *chunk_size_b* bytes, + * while applying the bit string stored in *masks*. The result is written to main memory. + * + * @param dest Pointer to the start of the result chunk + * @param src Pointer to the start of the source chunk + * @param masks Pointer the bitstring that marks the values that should be aggregated + * @param chunk_size_b Size of the source chunk in bytes + * @return true When the aggregation is done + * @return false Never + */ + static bool apply_masked (base_t *dest, base_t *src, uint16_t* msks, size_t chunk_size_b) { + constexpr size_t lanes = VECTOR_SIZE(); + uint8_t* masks = (uint8_t *)msks; + size_t value_count = chunk_size_b / sizeof(base_t); + __m512i agg_vec = func::zero(); + size_t i = 0; + // stop before! running out of space + if(value_count >= lanes) // keep in mind size_w is unsigned so if it becomes negative, it doesn't. + for(; i <= value_count - lanes; i += lanes) { + __m512i vec = Vector_Loader::load(src + i); + __mmask8 mask = _mm512_int2mask(masks[i / lanes]); + + agg_vec = func::simd_mask_agg(agg_vec, mask, vec); + } + *dest = func::simd_reduce(agg_vec); + + for(; i < value_count; ++i) { + uint8_t mask = masks[i / lanes]; + if(mask & (0b1 << (i % lanes))){ + *dest = func::scalar_agg(*dest, src[i]); + } + } + + return true; + } + + /** + * @brief Applies the aggregation function on the chunk starting at *src* and spanning *chunk_size_b* bytes, + * while applying the bit string stored in *masks*. The values are agggegated in the register *dest* without + * clearing beforehand. + * + * NOTE! This function only works correctly if the the chunk_size_b is a multiple of 64 byte + * + * @param dest Vector register used for storing and passing the result around + * @param src Pointer to the start of the source chunk + * @param masks Pointer the bitstring that marks the values that should be aggregated + * @param chunk_size_b Size of the source chunk in bytes + * @return __m512i Vector register holding the aggregation result + */ + static __m512i apply_masked (__m512i dest, base_t *src, uint16_t* msks, size_t chunk_size_b) { + constexpr size_t lanes = VECTOR_SIZE(); + uint8_t* masks = (uint8_t*) msks; + //TODO this function does not work if value_count % lanes != 0 + size_t value_count = chunk_size_b / sizeof(base_t); + size_t i = 0; + // stop before! running out of space + if(value_count >= lanes) // keep in mind size_w is unsigned so if it becomes negative, it doesn't. + for(; i <= value_count - lanes; i += lanes) { + __m512i vec = Vector_Loader::load(src + i); + __mmask8 mask = _mm512_int2mask(masks[i / lanes]); + dest = func::simd_agg(dest, mask, vec); + } + + return dest; + } + + /** + * @brief Applies the aggregation function on the chunk starting at *src* and spanning *chunk_size_b* bytes, + * while applying two bit strings stored in *masks_0* and *masks_1*. The values are aggregated in the register + * *dest* without clearing beforehand. + * + * NOTE! 
This function only works correctly if the the chunk_size_b is a multiple of 64 byte + * + * @param dest Vector register used for storing and passing the result around + * @param src Pointer to the start of the source chunk + * @param masks_0 Pointer the bitstring that marks the values that should be aggregated + * @param masks_1 Pointer the bitstring that marks the values that should be aggregated + * @param chunk_size_b Size of the source chunk in bytes + * @return __m512i Vector register holding the aggregation result + */ + static __m512i apply_masked (__m512i dest, base_t *src, uint16_t* msks0, uint16_t* msks1, size_t chunk_size_b) { + constexpr size_t lanes = VECTOR_SIZE(); + uint8_t* masks0 = (uint8_t*) msks0; + uint8_t* masks1 = (uint8_t*) msks1; + //TODO this function does not work if value_count % lanes != 0 + size_t value_count = chunk_size_b / sizeof(base_t); + size_t i = 0; + // stop before! running out of space + if(value_count >= lanes) // keep in mind value_count is unsigned so if it becomes negative, it doesn't. + for(; i <= value_count - lanes; i += lanes) { + __m512i vec = Vector_Loader::load(src + i); + __mmask8 mask0 = _mm512_int2mask(masks0[i / lanes]); + __mmask8 mask1 = _mm512_int2mask(masks1[i / lanes]); + + mask0 = _kand_mask8(mask0, mask1); + dest = func::simd_agg(dest, mask0, vec); + } + + return dest; + } + + /** + * @brief Reduces a vector by applying the aggregation function horizontally. + * + * @param dest Result of the horizontal aggregation + * @param src Vector as source for the horizontal aggregation + * @return true When the operation is done + * @return false Never + */ + static bool happly (base_t *dest, __m512i src) { + *dest = func::simd_reduce(src); + + return true; + } + + static __m512i get_zero() { + return func::zero(); + } +}; \ No newline at end of file diff --git a/qdp_project/src/algorithm/operators/filter.h b/qdp_project/src/algorithm/operators/filter.h new file mode 100644 index 0000000..a58a761 --- /dev/null +++ b/qdp_project/src/algorithm/operators/filter.h @@ -0,0 +1,170 @@ +#pragma once + +#include +#include + +#include + +#include "vector_loader.h" + +/** + * @brief Super Class for all Aggregation functions. Guards Sub Classes from having an non integral base type. + * + * @tparam T An integral datatype + */ +template +class FilterFunction { + static_assert(std::is_integral::value, "The base type of a FilterFunction must be an integeral."); +}; + +/** + * @brief Template class that implements methods used for finding values that are not equal to the compare value. + * It wraps the corresponding vector intrinsics. 
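+ *
+ * e.g. for T = uint64_t one call covers eight values at once:
+ *   __mmask16 m = NEQ<uint64_t>::simd_filter(vec, _mm512_set1_epi64(7));
+ *   bit i of m is set exactly when 64-bit lane i of vec differs from 7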
+ * + * @tparam T base datatype for the implemented methods + */ +template +class NEQ : public FilterFunction { +public: + static inline __mmask16 simd_filter(__m512i vector, __m512i comp) { + if constexpr (sizeof(T) == 4) return _mm512_cmpneq_epi32_mask(vector, comp); + else if constexpr (sizeof(T) == 8) return _mm512_cmpneq_epi64_mask(vector, comp); + static_assert(sizeof(T) == 4 || sizeof(T) == 8, "NEQ is only implemented for 32 and 64 wide integers"); + } + + static inline bool scalar_filter(T scalar, T comp) { return scalar != comp; } +}; + +template +class EQ : public FilterFunction { +public: + static inline __mmask16 simd_filter(__m512i vector, __m512i comp) { + if constexpr (sizeof(T) == 4) return _mm512_cmpeq_epi32_mask(vector, comp); + else if constexpr (sizeof(T) == 8) return _mm512_cmpeq_epi64_mask(vector, comp); + static_assert(sizeof(T) == 4 || sizeof(T) == 8, "EQ is only implemented for 32 and 64 wide integers"); + } + + static inline bool scalar_filter(T scalar, T comp) { return scalar == comp; } +}; + +template +class LT : public FilterFunction { +public: + static inline __mmask16 simd_filter(__m512i vector, __m512i comp) { + if constexpr (sizeof(T) == 4) return _mm512_cmplt_epi32_mask(vector, comp); + else if constexpr (sizeof(T) == 8) return _mm512_cmplt_epi64_mask(vector, comp); + static_assert(sizeof(T) == 4 || sizeof(T) == 8, "LT is only implemented for 32 and 64 wide integers"); + } + + static inline bool scalar_filter(T scalar, T comp) { return scalar < comp; } +}; + +template +class LEQ : public FilterFunction { +public: + static inline __mmask16 simd_filter(__m512i vector, __m512i comp) { + if constexpr (sizeof(T) == 4) return _mm512_cmple_epi32_mask(vector, comp); + else if constexpr (sizeof(T) == 8) return _mm512_cmple_epi64_mask(vector, comp); + static_assert(sizeof(T) == 4 || sizeof(T) == 8, "LEQ is only implemented for 32 and 64 wide integers"); + } + + static inline bool scalar_filter(T scalar, T comp) { return scalar <= comp; } +}; + +template +class GT : public FilterFunction { +public: + static inline __mmask16 simd_filter(__m512i vector, __m512i comp) { + if constexpr (sizeof(T) == 4) return _mm512_cmpgt_epi32_mask(vector, comp); + else if constexpr (sizeof(T) == 8) return _mm512_cmpgt_epi64_mask(vector, comp); + static_assert(sizeof(T) == 4 || sizeof(T) == 8, "GT is only implemented for 32 and 64 wide integers"); + } + + static inline bool scalar_filter(T scalar, T comp) { return scalar > comp; } +}; + +template +class GEQ : public FilterFunction { +public: + static inline __mmask16 simd_filter(__m512i vector, __m512i comp) { + if constexpr (sizeof(T) == 4) return _mm512_cmpge_epi32_mask(vector, comp); + else if constexpr (sizeof(T) == 8) return _mm512_cmpge_epi64_mask(vector, comp); + static_assert(sizeof(T) == 4 || sizeof(T) == 8, "GEQ is only implemented for 32 and 64 wide integers"); + } + + static inline bool scalar_filter(T scalar, T comp) { return scalar >= comp; } +}; + + +template class func, load_mode load_mode, bool copy> +class Filter { +public: + + static_assert(std::is_same_v, "We enforce 64 bit integer"); + + /** + * @brief Calculates the memory maximal needed to store a chunk's processing result. 
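+ *
+ * (e.g. a 2 MiB chunk of uint64_t holds 262144 values, so the resulting
+ * bit mask takes (262144 + 7) / 8 = 32768 bytes)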
+ * + * @param chunk_size_b Size of the chunk in byte + * @return size_t Size of the chunk's processing result in byte + */ + static size_t result_bytes_per_chunk(size_t chunk_size_b) { + // + 7 to enshure that we have enougth bytes -> / 8 -> rounds down + // if we had 17 / 8 = 2 but (17 + 7) / 8 = 3 + // if we hat 16 / 8 = 2 is right, as well as, 16 + 7 / 8 = 2 + return (chunk_size_b / sizeof(base_t) + 7) / 8; + } + + + /** + * @brief Applies the filter function on the chunk starting at *src* and spanning *chunk_size_b* bytes, while comparing with he same value every time. + * The resulting bit string is written to main memory. + * + * @param dest Pointer to the start of the result chunk + * @param src Pointer to the start of the source chunk + * @param cmp_value Comparision value to compare the values from source to + * @param chunk_size_b Size of the source chunk in bytes + * @return true When the filter operation is done + * @return false Never + */ + // we only need this impl. yet, as all filter are at the end of a pipeline + static bool apply_same (uint16_t *dst, base_t *buffer, base_t *src, base_t cmp_value, size_t chunk_size_b) { + constexpr uint32_t lanes = VECTOR_SIZE(); + uint8_t* dest = (uint8_t*) dst; + size_t value_count = chunk_size_b / sizeof(base_t); + __m512i cmp_vec = _mm512_set1_epi64(cmp_value); + size_t i = 0; + // this weird implementetion is neccessary, see analogous impl in aggregation for explaination + if(value_count > lanes) { + for(; (i < value_count - lanes); i += lanes) { + __m512i vec = Vector_Loader::load(src + i); + __mmask8 bitmask = func::simd_filter(vec, cmp_vec); + + uint8_t int_mask = (uint8_t) _mm512_mask2int(bitmask); + + dest[i / lanes] = int_mask; + if constexpr(copy){ + Vector_Loader::store(buffer + i, vec); + } + } + } + + auto dest_pos = i / lanes; + uint8_t int_mask = 0; + for(; i < value_count; ++i) { + base_t val = src[i]; + + uint8_t result = func::scalar_filter(val, cmp_value); + + int_mask |= (result << (i % lanes)); + + if constexpr(copy){ + buffer[i] = val; + } + } + dest[dest_pos] = int_mask; + + return true; + } + +}; \ No newline at end of file diff --git a/qdp_project/src/benchmark/DIMES_benchmark.cpp b/qdp_project/src/benchmark/DIMES_benchmark.cpp new file mode 100644 index 0000000..2ca9705 --- /dev/null +++ b/qdp_project/src/benchmark/DIMES_benchmark.cpp @@ -0,0 +1,240 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#ifndef THREAD_GROUP_MULTIPLIER +#define THREAD_GROUP_MULTIPLIER 8 +#endif + +#ifndef QUERY +#define QUERY 1 +#endif + +#ifndef BARRIER_MODE +#define BARRIER_MODE "global" +#endif + +#ifndef BUFFER_LIMIT +#define BUFFER_LIMIT 1 +#endif + +#include "const.h" + +#include "file_output.h" +#include "array_utils.h" +#include "timer_utils.h" +#include "barrier_utils.h" +#include "cpu_set_utils.h" +#include "iterable_range.h" +#include "memory_literals.h" +#include "pipelines/DIMES_scan_filter_pipe.h" + +#include "aggregation.h" +#include "filter.h" + +using base_t = uint64_t; + +base_t sum_check(base_t compare_value, base_t* row_A, base_t* row_B, size_t row_size) { + base_t sum = 0; + for(int i = 0; i < row_size / sizeof(base_t); ++i) { + sum += (row_A[i] < compare_value) * row_B[i]; + } + return sum; +} + +base_t sum_check_complex(base_t compare_value_a, base_t compare_value_b, base_t* row_A, base_t* row_B, size_t row_size) { + base_t sum = 0; + for(int i = 0; i < row_size / sizeof(base_t); ++i) 
{ + sum += (row_A[i] < compare_value_a && row_B[i] < compare_value_b) * row_B[i]; + } + return sum; +} + +int main(int argc, char** argv) { + // set constants + const size_t workload_b = 4_GiB; + const base_t compare_value_a = 50; + const base_t compare_value_b = 42; + constexpr bool simple_query = (QUERY == 1); + + const size_t thread_count = 6; + std::ofstream out_file; + out_file.open("../results/dimes_" + "q-" + (std::string)(simple_query == true ? "simple" : "complex") + + "_bm-" + (std::string) BARRIER_MODE + + "_bl-" + (std::string)(BUFFER_LIMIT == 1 ? "limited" : "unlimited") + + "_tc-" + std::to_string(thread_count * THREAD_GROUP_MULTIPLIER) + ".csv"); + + // set benchmark parameter + Linear_Int_Range run("run"); + Exp_Int_Range chunk_size("chunk_size"); + Range mode("mode"); + + uint32_t remote_node = 3; + uint32_t remote_node_2 = 2; + uint32_t local_node = 10; + + print_to_file(out_file, generateHead(run, chunk_size, mode), "thread_group", "time", + #ifdef THREAD_TIMINGS + "scan_a", "scan_b", "aggr_j", + #endif + #ifdef BARRIER_TIMINGS + "wait_scan_a", "wait_scan_b", "wait_aggr_j", + #endif + "result"); + + + /*** alloc data and buffers ************************************************/ + base_t* data_a = (base_t*) numa_alloc_onnode(workload_b, remote_node); + base_t* data_b = (base_t*) numa_alloc_onnode(workload_b, remote_node_2); + base_t* data_a_hbm = (base_t*) numa_alloc_onnode(workload_b, local_node); + base_t* data_b_hbm = (base_t*) numa_alloc_onnode(workload_b, local_node); + fill_mt(data_a, workload_b, 0, 100, 42); + fill_mt(data_b, workload_b, 0, 100, 420); + std::memcpy(data_a_hbm, data_a, workload_b); + std::memcpy(data_b_hbm, data_b, workload_b); + base_t* results = (base_t*) numa_alloc_onnode(THREAD_GROUP_MULTIPLIER * thread_count * sizeof(base_t), remote_node); + + std::ofstream check_file; + check_file.open("../results/dimes_" + "q-" + (std::string)(simple_query == true ? "simple" : "complex") + + "_bm-" + (std::string) BARRIER_MODE + + "_bl-" + (std::string)(BUFFER_LIMIT == 1 ? "limited" : "unlimited") + + "_tc-" + std::to_string(thread_count * THREAD_GROUP_MULTIPLIER) + ".checksum"); + if constexpr (QUERY == 1) { + //calculate simple checksum if QUERY == 1 -> simple query is applied + check_file << sum_check(compare_value_a, data_a, data_b, workload_b); + } else { + check_file << sum_check_complex(compare_value_a, compare_value_b, data_a, data_b, workload_b); + } + check_file.close(); + + std::string iteration("init"); + Query_Wrapper* qw = nullptr; + while(iteration != "false") { + + std::promise p; + std::shared_future ready_future(p.get_future()); + + if(iteration != "run") { + + if(qw != nullptr) { + delete qw; + } + + std::cout << "Changing to mode " << mode.current << " chunksize " << chunk_size.current << std::endl; + + uint8_t tc_filter = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, SCAN_A); + uint8_t tc_copy = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, SCAN_B); + uint8_t tc_agg = new_mode_manager::thread_count(simple_query ? 
SIMPLE_Q : COMPLEX_Q, mode.current, AGGR_J); + switch(mode.current) { + case NewPMode::DRAM_base: + qw = new Query_Wrapper(&ready_future, workload_b, chunk_size.current, data_a, data_b, results, local_node, remote_node, + tc_filter, tc_copy, tc_agg, mode.current, THREAD_GROUP_MULTIPLIER, (base_t) 50, (base_t) 42, true); + break; + case NewPMode::HBM_base: + qw = new Query_Wrapper(&ready_future, workload_b, chunk_size.current, data_a_hbm, data_b_hbm, results, local_node, remote_node, + tc_filter, tc_copy, tc_agg, mode.current, THREAD_GROUP_MULTIPLIER, (base_t) 50, (base_t) 42, true); + break; + case NewPMode::Mixed_base: + qw = new Query_Wrapper(&ready_future, workload_b, chunk_size.current, data_a, data_b_hbm, results, local_node, remote_node, + tc_filter, tc_copy, tc_agg, mode.current, THREAD_GROUP_MULTIPLIER, (base_t) 50, (base_t) 42, true); + break; + case NewPMode::Prefetch: + qw = new Query_Wrapper(&ready_future, workload_b, chunk_size.current, data_a, data_b, results, local_node, remote_node, + tc_filter, tc_copy, tc_agg, mode.current, THREAD_GROUP_MULTIPLIER, (base_t) 50, (base_t) 42, false); + break; + } + } + + qw->ready_future = &ready_future; + qw->clear_buffers(); + + auto filter_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_a(gid, gcnt, tid); }; + auto copy_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_b(gid, gcnt, tid); }; + auto aggregation_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->aggr_j(gid, gcnt, tid); }; + + std::vector filter_pool; + std::vector copy_pool; + std::vector agg_pool; + + uint8_t tc_filter = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, SCAN_A); + uint8_t tc_copy = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, SCAN_B); + uint8_t tc_agg = new_mode_manager::thread_count(simple_query ? 
SIMPLE_Q : COMPLEX_Q, mode.current, AGGR_J); + + int thread_id = 0; + // std::vector> pinning_ranges {std::make_pair(28, 42), std::make_pair(84, 98)}; // node 2 heacboehm II + //std::vector> pinning_ranges {std::make_pair(32, 48), std::make_pair(96, 112)}; // node 2 heacboehm + //std::vector> pinning_ranges {std::make_pair(24, 36), std::make_pair(120, 132)}; // node 2 sapphire rapids + //std::vector> pinning_ranges {std::make_pair(24, 48)}; // node 2+3 sapphire rapids + std::vector> pinning_ranges {std::make_pair(0, 48)}; // node 0-3 sapphire rapids + + for(uint32_t gid = 0; gid < THREAD_GROUP_MULTIPLIER; ++gid) { + + for(uint32_t tid = 0; tid < tc_filter; ++tid) { + filter_pool.emplace_back(filter_lambda, gid, THREAD_GROUP_MULTIPLIER, tid); + pin_thread_in_range(filter_pool.back(), thread_id++, pinning_ranges); + } + + // if tc_copy == 0 this loop is skipped + for(uint32_t tid = 0; tid < tc_copy; ++tid) { + copy_pool.emplace_back(copy_lambda, gid, THREAD_GROUP_MULTIPLIER, tid); + pin_thread_in_range(copy_pool.back(), thread_id++, pinning_ranges); + } + + for(uint32_t tid = 0; tid < tc_agg; ++tid) { + agg_pool.emplace_back(aggregation_lambda, gid, THREAD_GROUP_MULTIPLIER, tid); + pin_thread_in_range(agg_pool.back(), thread_id++, pinning_ranges); + } + } + + auto start = std::chrono::steady_clock::now(); + p.set_value(); + + for(std::thread& t : filter_pool) { t.join(); } + for(std::thread& t : copy_pool) { t.join(); } + for(std::thread& t : agg_pool) { t.join(); } + + Aggregation::apply(results, results, sizeof(base_t) * tc_agg * THREAD_GROUP_MULTIPLIER); + auto end = std::chrono::steady_clock::now(); + + constexpr double nanos_per_second = ((double)1000) * 1000 * 1000; + uint64_t nanos = std::chrono::duration_cast(end - start).count(); + double seconds = (double)(nanos) / nanos_per_second; + + + print_to_file(out_file, run, chunk_size, new_mode_manager::string(mode.current), THREAD_GROUP_MULTIPLIER, seconds, + #ifdef THREAD_TIMINGS + qw->trt->summarize_time(0), qw->trt->summarize_time(1), qw->trt->summarize_time(2), + #endif + #ifdef BARRIER_TIMINGS + qw->bt->summarize_time(0), qw->bt->summarize_time(1), qw->bt->summarize_time(2), + #endif + results[0]); + + + iteration = IterateOnce(run, chunk_size, mode); + } + + numa_free(data_b_hbm, workload_b); + numa_free(data_a, workload_b); + numa_free(data_b, workload_b); + + numa_free(results, THREAD_GROUP_MULTIPLIER * thread_count * sizeof(base_t)); + +} \ No newline at end of file diff --git a/qdp_project/src/benchmark/DIMES_cores_benchmark.cpp b/qdp_project/src/benchmark/DIMES_cores_benchmark.cpp new file mode 100644 index 0000000..93c6b1b --- /dev/null +++ b/qdp_project/src/benchmark/DIMES_cores_benchmark.cpp @@ -0,0 +1,260 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#ifndef QUERY +#define QUERY 1 +#endif + +#ifndef BARRIER_MODE +#define BARRIER_MODE "global" +#endif + +#define BUFFER_LIMIT 0 + +#include "const.h" + +#include "file_output.h" +#include "array_utils.h" +#include "timer_utils.h" +#include "barrier_utils.h" +#include "cpu_set_utils.h" +#include "iterable_range.h" +#include "memory_literals.h" +#include "pipelines/DIMES_scan_filter_pipe.h" + +#include "aggregation.h" +#include "filter.h" + +using base_t = uint64_t; + +base_t sum_check(base_t compare_value, base_t* row_A, base_t* row_B, size_t row_size) { + base_t sum = 0; + for(int i = 0; i < row_size / sizeof(base_t); ++i) { + sum += (row_A[i] < 
compare_value) * row_B[i]; + } + return sum; +} + +base_t sum_check_complex(base_t compare_value_a, base_t compare_value_b, base_t* row_A, base_t* row_B, size_t row_size) { + base_t sum = 0; + for(int i = 0; i < row_size / sizeof(base_t); ++i) { + sum += (row_A[i] < compare_value_a && row_B[i] < compare_value_b) * row_B[i]; + } + return sum; +} + + +int main(int argc, char** argv) { + // set constants + const size_t workload_b = 4_GiB; + const size_t chunk_size = 2_MiB; + const base_t compare_value_a = 50; + const base_t compare_value_b = 42; + constexpr bool simple_query = (QUERY == 1); + + + std::ofstream out_file; + out_file.open("../results/dimes_cores_" + "q-" + (std::string)(simple_query == true ? "simple" : "complex") + + "_bm-" + (std::string) BARRIER_MODE + + "_bl-" + (std::string)(BUFFER_LIMIT == 1 ? "limited" : "unlimited") + + ".csv"); + + // set benchmark parameter + Linear_Int_Range run("run"); + + Exp_Int_Range scan_a_thread("scan_a_tc"); + Exp_Int_Range scan_b_thread("scan_b_tc"); + Exp_Int_Range aggr_j_thread("aggr_j_tc"); + Linear_Int_Range thread_group_count("thread_group_c"); + Range mode("mode"); + + uint32_t remote_node = 1; + uint32_t remote_node_2 = 0;//on heacboehm II: node 0 is two hops away from node 2 -> prefetching is more beneficial + uint32_t local_node = 2; + + print_to_file(out_file, generateHead(run, thread_group_count, mode, scan_a_thread, scan_b_thread, aggr_j_thread), + "time", + #ifdef THREAD_TIMINGS + "scan_a", "scan_b", "aggr_j", + #endif + #ifdef BARRIER_TIMINGS + "wait_scan_a", "wait_scan_b", "wait_aggr_j", + #endif + "result"); + + + /*** alloc data and buffers ************************************************/ + base_t* data_a = (base_t*) numa_alloc_onnode(workload_b, remote_node); + base_t* data_b = (base_t*) numa_alloc_onnode(workload_b, remote_node_2); + base_t* data_a_hbm = (base_t*) numa_alloc_onnode(workload_b, local_node); + base_t* data_b_hbm = (base_t*) numa_alloc_onnode(workload_b, local_node); + fill_mt(data_a, workload_b, 0, 100, 42); + fill_mt(data_b, workload_b, 0, 100, 420); + std::memcpy(data_a_hbm, data_a, workload_b); + std::memcpy(data_b_hbm, data_b, workload_b); + base_t* results = (base_t*) numa_alloc_onnode(thread_group_count.max * aggr_j_thread.max * sizeof(base_t), remote_node); + + std::ofstream check_file; + check_file.open("../results/dimes_cores_" + "q-" + (std::string)(simple_query == true ? "simple" : "complex") + + "_bm-" + (std::string) BARRIER_MODE + + "_bl-" + (std::string)(BUFFER_LIMIT == 1 ? 
"limited" : "unlimited") + + ".checksum"); + if constexpr (QUERY == 1) { + //calculate simple checksum if QUERY == 1 -> simple query is applied + check_file << sum_check(compare_value_a, data_a, data_b, workload_b); + } else { + check_file << sum_check_complex(compare_value_a, compare_value_b, data_a, data_b, workload_b); + } + check_file.close(); + + std::string iteration("init"); + Query_Wrapper* qw = nullptr; + while(iteration != "false") { + + std::promise p; + std::shared_future ready_future(p.get_future()); + + // skipping iteration through scan_b_thread while not used + while(simple_query && mode.current != NewPMode::Prefetch && scan_b_thread.current != 1) { + iteration = IterateOnce(run, thread_group_count, mode, scan_a_thread, scan_b_thread, aggr_j_thread); + } + + if(iteration != "run") { + std::cout << "Changing to mode " << mode.current + << " thread_group_count " << thread_group_count.current + << " thread_ratio " << scan_a_thread.current <<":"<< scan_b_thread.current <<":"<< aggr_j_thread.current + << std::endl; + + if(qw != nullptr) { + if (iteration == thread_group_count.label) { + + } else { + delete qw; + + uint32_t sat = scan_a_thread.current; + uint32_t sbt = simple_query && mode.current != NewPMode::Prefetch ? 0 : scan_b_thread.current; + uint32_t ajt = aggr_j_thread.current; + + switch(mode.current) { + case NewPMode::DRAM_base: + qw = new Query_Wrapper(&ready_future, workload_b, chunk_size, data_a, data_b, results, local_node, remote_node, + sat, sbt, ajt, mode.current, thread_group_count.current, (base_t) 50, (base_t) 42, true); + break; + case NewPMode::HBM_base: + qw = new Query_Wrapper(&ready_future, workload_b, chunk_size, data_a_hbm, data_b_hbm, results, local_node, remote_node, + sat, sbt, ajt, mode.current, thread_group_count.current, (base_t) 50, (base_t) 42, true); + break; + case NewPMode::Mixed_base: + qw = new Query_Wrapper(&ready_future, workload_b, chunk_size, data_a, data_b_hbm, results, local_node, remote_node, + sat, sbt, ajt, mode.current, thread_group_count.current, (base_t) 50, (base_t) 42, true); + break; + case NewPMode::Prefetch: + qw = new Query_Wrapper(&ready_future, workload_b, chunk_size, data_a, data_b, results, local_node, remote_node, + sat, sbt, ajt, mode.current, thread_group_count.current, (base_t) 50, (base_t) 42, false); + break; + } + } + } + } + + qw->ready_future = &ready_future; + qw->clear_buffers(); + + auto filter_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_a(gid, gcnt, tid); }; + auto copy_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_b(gid, gcnt, tid); }; + auto aggregation_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->aggr_j(gid, gcnt, tid); }; + + std::vector filter_pool; + std::vector copy_pool; + std::vector agg_pool; + + uint8_t tc_filter = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, SCAN_A); + uint8_t tc_copy = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, SCAN_B); + uint8_t tc_agg = new_mode_manager::thread_count(simple_query ? 
SIMPLE_Q : COMPLEX_Q, mode.current, AGGR_J); + + int thread_id = 0; + // std::vector> pinning_ranges {std::make_pair(28, 42), std::make_pair(84, 98)}; // node 2 heacboehm II + std::vector> pinning_ranges {std::make_pair(32, 48), std::make_pair(96, 112)}; // node 2 heacboehm + + for(uint32_t gid = 0; gid < thread_group_count.current; ++gid) { + + for(uint32_t tid = 0; tid < tc_filter; ++tid) { + filter_pool.emplace_back(filter_lambda, gid, thread_group_count.current, tid); + pin_thread_in_range(filter_pool.back(), thread_id++, pinning_ranges); + } + + // if tc_copy == 0 this loop is skipped + for(uint32_t tid = 0; tid < tc_copy; ++tid) { + copy_pool.emplace_back(copy_lambda, gid, thread_group_count.current, tid); + pin_thread_in_range(copy_pool.back(), thread_id++, pinning_ranges); + } + + for(uint32_t tid = 0; tid < tc_agg; ++tid) { + agg_pool.emplace_back(aggregation_lambda, gid, thread_group_count.current, tid); + pin_thread_in_range(agg_pool.back(), thread_id++, pinning_ranges); + } + } + + auto start = std::chrono::steady_clock::now(); + p.set_value(); + + for(std::thread& t : filter_pool) { t.join(); } + for(std::thread& t : copy_pool) { t.join(); } + for(std::thread& t : agg_pool) { t.join(); } + + Aggregation::apply(results, results, sizeof(base_t) * tc_agg * thread_group_count.current); + auto end = std::chrono::steady_clock::now(); + + constexpr double nanos_per_second = ((double)1000) * 1000 * 1000; + uint64_t nanos = std::chrono::duration_cast(end - start).count(); + double seconds = (double)(nanos) / nanos_per_second; + +print_to_file(out_file, generateHead(run, thread_group_count, mode, scan_a_thread, scan_b_thread, aggr_j_thread), + "time", + #ifdef THREAD_TIMINGS + "scan_a", "scan_b", "aggr_j", + #endif + #ifdef BARRIER_TIMINGS + "wait_scan_a", "wait_scan_b", "wait_aggr_j", + #endif + "result"); + + print_to_file(out_file, run, thread_group_count.current, new_mode_manager::string(mode.current), scan_a_thread, + (simple_query && mode.current != NewPMode::Prefetch ? 
0 : scan_b_thread.current), + aggr_j_thread, seconds, + #ifdef THREAD_TIMINGS + qw->trt->summarize_time(0), qw->trt->summarize_time(1), qw->trt->summarize_time(2), + #endif + #ifdef BARRIER_TIMINGS + qw->bt->summarize_time(0), qw->bt->summarize_time(1), qw->bt->summarize_time(2), + #endif + results[0]); + + iteration = IterateOnce(run, thread_group_count, mode, scan_a_thread, scan_b_thread, aggr_j_thread); + } + + numa_free(data_b_hbm, workload_b); + numa_free(data_a, workload_b); + numa_free(data_b, workload_b); + + numa_free(results, thread_group_count.max * aggr_j_thread.max * sizeof(base_t)); + +} \ No newline at end of file diff --git a/qdp_project/src/benchmark/MAX_benchmark.cpp b/qdp_project/src/benchmark/MAX_benchmark.cpp new file mode 100644 index 0000000..fb50f5a --- /dev/null +++ b/qdp_project/src/benchmark/MAX_benchmark.cpp @@ -0,0 +1,289 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#ifndef THREAD_GROUP_MULTIPLIER +#define THREAD_GROUP_MULTIPLIER 2 +#endif + +#ifndef QUERY +#define QUERY 1 +#endif + +#ifndef BARRIER_MODE +#define BARRIER_MODE "global" +#endif + +#ifndef BUFFER_LIMIT +#define BUFFER_LIMIT 1 +#endif + +#ifndef PINNING +#define PINNING 1 +#endif + +#ifndef PCM_M +#define PCM_M 0 +#endif + +#if PCM_M == 1 +#include "pcm.h" +#endif + +#include "const.h" + +#include "file_output.h" +#include "array_utils.h" +#include "timer_utils.h" +#include "barrier_utils.h" +#include "measurement_utils.h" +#include "cpu_set_utils.h" +#include "iterable_range.h" +#include "memory_literals.h" +#include "pipelines/MAX_scan_filter_pipe.h" + +#include "aggregation.h" +#include "filter.h" + +using base_t = uint64_t; + +base_t sum_check(base_t compare_value, base_t* row_A, base_t* row_B, size_t row_size) { + base_t sum = 0; + for(int i = 0; i < row_size / sizeof(base_t); ++i) { + sum += (row_A[i] < compare_value) * row_B[i]; + } + return sum; +} + +base_t sum_check_complex(base_t compare_value_a, base_t compare_value_b, base_t* row_A, base_t* row_B, size_t row_size) { + base_t sum = 0; + for(int i = 0; i < row_size / sizeof(base_t); ++i) { + sum += (row_A[i] < compare_value_a && row_B[i] < compare_value_b) * row_B[i]; + } + return sum; +} + +int main(int argc, char** argv) { +#if PCM == 1 + pcm::PCM *pcm = pcm::PCM::getInstance(); + //and check for errors + auto error_code = pcm->program(); + if(error_code != pcm::PCM::Success) { + std::cerr << "PCM couldn't start" << std::endl; + std::cerr << "Error code: " << error_code << std::endl; + std::cerr << "Try to execute 'sudo modprobe msr' and execute this program with root privigeges."; + return 1; + } +#endif + + // set constants + const size_t workload_b = 2_GiB; + const base_t compare_value_a = 50; + const base_t compare_value_b = 42; + constexpr bool simple_query = (QUERY == 1); + + const size_t thread_count = 6; + std::ofstream out_file; + out_file.open("../results/max_" + "q-" + (std::string)(simple_query == true ? "simple" : "complex") + + "_bm-" + (std::string) BARRIER_MODE + + "_bl-" + (std::string)(BUFFER_LIMIT == 1 ? 
"limited" : "unlimited") + + "_tc-" + std::to_string(thread_count * THREAD_GROUP_MULTIPLIER) + "1MiB-2MiB.csv"); + + // set benchmark parameter + Linear_Int_Range run("run"); + constexpr size_t chunk_min = 1_MiB; constexpr size_t chunk_max = 8_MiB + 1; constexpr size_t chunk_incr = 128_kiB; + Linear_Int_Range chunk_size("chunk_size"); + Range mode("mode"); + + uint32_t remote_node = 2; + uint32_t remote_node_2 = 2; + uint32_t local_node = 10; + + /*uint32_t remote_node = 6; + uint32_t remote_node_2 = 6; + uint32_t local_node = 2;*/ + + print_to_file(out_file, generateHead(run, chunk_size, mode), "thread_group", "time", + #ifdef THREAD_TIMINGS + "scan_a", "scan_b", "aggr_j", + #endif + #ifdef BARRIER_TIMINGS + "wait_scan_a", "wait_scan_b", "wait_aggr_j", + #endif + #if PCM == 1 + pcm_value_collector::getHead("scan_a"), + pcm_value_collector::getHead("scan_b"), + pcm_value_collector::getHead("aggr_j"), + #endif + "result"); + + + /*** alloc data and buffers ************************************************/ + base_t* data_a = (base_t*) numa_alloc_onnode(workload_b, remote_node); + base_t* data_b = (base_t*) numa_alloc_onnode(workload_b, remote_node_2); + base_t* data_a_hbm = (base_t*) numa_alloc_onnode(workload_b, local_node); + base_t* data_b_hbm = (base_t*) numa_alloc_onnode(workload_b, local_node); + fill_mt(data_a, workload_b, 0, 100, 42); + fill_mt(data_b, workload_b, 0, 100, 420); + std::memcpy(data_a_hbm, data_a, workload_b); + std::memcpy(data_b_hbm, data_b, workload_b); + base_t* results = (base_t*) numa_alloc_onnode(THREAD_GROUP_MULTIPLIER * thread_count * sizeof(base_t), remote_node); + + std::ofstream check_file; + check_file.open("../results/max_" + "q-" + (std::string)(simple_query == true ? "simple" : "complex") + + "_bm-" + (std::string) BARRIER_MODE + + "_bl-" + (std::string)(BUFFER_LIMIT == 1 ? "limited" : "unlimited") + + "_tc-" + std::to_string(thread_count * THREAD_GROUP_MULTIPLIER) + ".checksum"); + if constexpr (QUERY == 1) { + //calculate simple checksum if QUERY == 1 -> simple query is applied + check_file << sum_check(compare_value_a, data_a, data_b, workload_b); + } else { + check_file << sum_check_complex(compare_value_a, compare_value_b, data_a, data_b, workload_b); + } + check_file.close(); + + std::string iteration("init"); + Query_Wrapper* qw = nullptr; + while(iteration != "false") { + + std::promise p; + std::shared_future ready_future(p.get_future()); + + if(iteration != "run") { + + if(qw != nullptr) { + delete qw; + } + uint8_t tc_filter = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, SCAN_A); + uint8_t tc_copy = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, SCAN_B); + uint8_t tc_agg = new_mode_manager::thread_count(simple_query ? 
SIMPLE_Q : COMPLEX_Q, mode.current, AGGR_J); + switch(mode.current) { + case NewPMode::DRAM_base: + qw = new Query_Wrapper(&ready_future, workload_b, chunk_size.current, data_a, data_b, results, local_node, remote_node, + tc_filter, tc_copy, tc_agg, mode.current, THREAD_GROUP_MULTIPLIER, (base_t) 50, (base_t) 42, true); + break; + case NewPMode::HBM_base: + qw = new Query_Wrapper(&ready_future, workload_b, chunk_size.current, data_a_hbm, data_b_hbm, results, local_node, remote_node, + tc_filter, tc_copy, tc_agg, mode.current, THREAD_GROUP_MULTIPLIER, (base_t) 50, (base_t) 42, true); + break; + case NewPMode::Mixed_base: + qw = new Query_Wrapper(&ready_future, workload_b, chunk_size.current, data_a, data_b_hbm, results, local_node, remote_node, + tc_filter, tc_copy, tc_agg, mode.current, THREAD_GROUP_MULTIPLIER, (base_t) 50, (base_t) 42, true); + break; + case NewPMode::Prefetch: + qw = new Query_Wrapper(&ready_future, workload_b, chunk_size.current, data_a, data_b, results, local_node, remote_node, + tc_filter, tc_copy, tc_agg, mode.current, THREAD_GROUP_MULTIPLIER, (base_t) 50, (base_t) 42, false); + break; + } + } + + qw->ready_future = &ready_future; + qw->clear_buffers(); + + auto filter_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_a(gid, gcnt, tid); }; + auto copy_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_b(gid, gcnt, tid); }; + auto aggregation_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->aggr_j(gid, gcnt, tid); }; + + std::vector filter_pool; + std::vector copy_pool; + std::vector agg_pool; + + uint8_t tc_filter = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, SCAN_A); + uint8_t tc_copy = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, SCAN_B); + uint8_t tc_agg = new_mode_manager::thread_count(simple_query ? 
SIMPLE_Q : COMPLEX_Q, mode.current, AGGR_J); + + int thread_id = 0; + // std::vector> pinning_ranges {std::make_pair(28, 42), std::make_pair(84, 98)}; // node 2 heacboehm II + //std::vector> pinning_ranges {std::make_pair(32, 48), std::make_pair(96, 112)}; // node 2 heacboehm + std::vector> pinning_ranges {std::make_pair(24, 36), std::make_pair(120, 132)}; // node 2 sapphire rapids + //std::vector> pinning_ranges {std::make_pair(24, 48)}; // node 2+3 sapphire rapids + //std::vector> pinning_ranges {std::make_pair(0, 48)}; // node 0-3 sapphire rapids + + for(uint32_t gid = 0; gid < THREAD_GROUP_MULTIPLIER; ++gid) { + + for(uint32_t tid = 0; tid < tc_filter; ++tid) { + filter_pool.emplace_back(filter_lambda, gid, THREAD_GROUP_MULTIPLIER, tid); +#if PINNING + pin_thread_in_range(filter_pool.back(), thread_id++, pinning_ranges); +#else + pin_thread_in_range(filter_pool.back(), pinning_ranges); +#endif + } + + // if tc_copy == 0 this loop is skipped + for(uint32_t tid = 0; tid < tc_copy; ++tid) { + copy_pool.emplace_back(copy_lambda, gid, THREAD_GROUP_MULTIPLIER, tid); +#if PINNING + pin_thread_in_range(copy_pool.back(), thread_id++, pinning_ranges); +#else + pin_thread_in_range(copy_pool.back(), pinning_ranges); +#endif + } + + for(uint32_t tid = 0; tid < tc_agg; ++tid) { + agg_pool.emplace_back(aggregation_lambda, gid, THREAD_GROUP_MULTIPLIER, tid); +#if PINNING + pin_thread_in_range(agg_pool.back(), thread_id++, pinning_ranges); +#else + pin_thread_in_range(agg_pool.back(), pinning_ranges); +#endif + } + } + + auto start = std::chrono::steady_clock::now(); + p.set_value(); + + for(std::thread& t : filter_pool) { t.join(); } + for(std::thread& t : copy_pool) { t.join(); } + for(std::thread& t : agg_pool) { t.join(); } + + Aggregation::apply(results, results, sizeof(base_t) * tc_agg * THREAD_GROUP_MULTIPLIER); + auto end = std::chrono::steady_clock::now(); + + constexpr double nanos_per_second = ((double)1000) * 1000 * 1000; + uint64_t nanos = std::chrono::duration_cast(end - start).count(); + double seconds = (double)(nanos) / nanos_per_second; + + + + print_to_file(out_file, run, chunk_size, new_mode_manager::string(mode.current), THREAD_GROUP_MULTIPLIER, seconds, + #ifdef THREAD_TIMINGS + qw->trt->summarize_time(0), qw->trt->summarize_time(1), qw->trt->summarize_time(2), + #endif + #ifdef BARRIER_TIMINGS + qw->bt->summarize_time(0), qw->bt->summarize_time(1), qw->bt->summarize_time(2), + #endif + #if PCM == 1 + qw->pvc->summarize_as_string("scan_a"), + qw->pvc->summarize_as_string("scan_b"), + qw->pvc->summarize_as_string("aggr_j"), + #endif + results[0]); + + iteration = IterateOnce(run, chunk_size, mode); + } + + numa_free(data_b_hbm, workload_b); + numa_free(data_a, workload_b); + numa_free(data_b, workload_b); + + numa_free(results, THREAD_GROUP_MULTIPLIER * thread_count * sizeof(base_t)); + +} \ No newline at end of file diff --git a/qdp_project/src/benchmark/QDP_minimal.h b/qdp_project/src/benchmark/QDP_minimal.h new file mode 100644 index 0000000..007d0d9 --- /dev/null +++ b/qdp_project/src/benchmark/QDP_minimal.h @@ -0,0 +1,147 @@ +#include +#include +#include +#include +#include + +#include "const.h" +#include "array_utils.h" +#include "cpu_set_utils.h" +#include "iterable_range.h" +#include "memory_literals.h" +#include "pipelines/MAX_scan_filter_pipe.h" +#include "aggregation.h" + +using base_t = uint64_t; + +// calculate the checksum for the simple query +base_t sum_check(base_t compare_value, base_t* row_A, base_t* row_B, size_t row_size) { + base_t sum = 0; + for(int i = 0; i 
< row_size / sizeof(base_t); ++i) { + sum += (row_A[i] < compare_value) * row_B[i]; + } + return sum; +} + +// calculate the checksum for the complex query +base_t sum_check_complex(base_t compare_value_a, base_t compare_value_b, base_t* row_A, base_t* row_B, size_t row_size) { + base_t sum = 0; + for(int i = 0; i < row_size / sizeof(base_t); ++i) { + sum += (row_A[i] < compare_value_a && row_B[i] < compare_value_b) * row_B[i]; + } + return sum; +} + +class QDP_minimal { +private: + // values used for comparisons in the filter operations + const base_t compare_value_a = 50; + const base_t compare_value_b = 42; + // define, which numa nodes to use + // Xeon Max: node 0-7 DRAM and 8-15 HBM + // if the nodes are changed, the pinning ranges in run should be adjusted accordingly too + uint32_t dram_node = 2; + uint32_t dram_node_2 = 2; + uint32_t hbm_node = 10; + +public: + // results of running qdp, set by run() + base_t result; + base_t checksum; + double exec_time; + + // run qdp + void run(const size_t workload_b, size_t chunk_size, uint8_t tc_filter, uint8_t tc_copy, uint8_t tc_agg){ + // allocate data + base_t* data_a = (base_t*) numa_alloc_onnode(workload_b, dram_node); + base_t* data_b = (base_t*) numa_alloc_onnode(workload_b, dram_node_2); + base_t* results = (base_t*) numa_alloc_onnode(THREAD_GROUP_MULTIPLIER * tc_agg * sizeof(base_t), dram_node); + + // fill the memory with acutal values + fill_mt(data_a, workload_b, 0, 100, 42); + fill_mt(data_b, workload_b, 0, 100, 420); + + // run qdp + run(data_a, data_b, results, workload_b, chunk_size, tc_filter, tc_copy, tc_agg); + + // free the allocated memory + numa_free(data_a, workload_b); + numa_free(data_b, workload_b); + numa_free(results, THREAD_GROUP_MULTIPLIER * tc_agg * sizeof(base_t)); + } + + // run qdp, work on provided memory pointers to enable memory reuse across multiple runs + void run(base_t* data_a, base_t* data_b, base_t* results, const size_t workload_b, size_t chunk_size, uint8_t tc_filter, uint8_t tc_copy, uint8_t tc_agg){ + constexpr bool simple_query = (QUERY == 1); + // sync objects + std::promise p; + std::shared_future ready_future(p.get_future()); + + // create the query wrapper, that is managing the to-be-used threads + Query_Wrapper* qw = new Query_Wrapper(&ready_future, workload_b, chunk_size, data_a, data_b, results, hbm_node, dram_node, + tc_filter, tc_copy, tc_agg, NewPMode::Prefetch, THREAD_GROUP_MULTIPLIER, compare_value_a, compare_value_b, false); + + // clear buffers to make sure, that they have been written and are fully mapped before running qdp + qw->clear_buffers(); + + // creating lambdas for executing filter (scan_a), copy (scan_b), and aggregation tasks on the query wrapper + // passing gid (group id), gcnt (group count) and tid (thread id) + auto filter_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_a(gid, gcnt, tid); }; + auto copy_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_b(gid, gcnt, tid); }; + auto aggregation_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->aggr_j(gid, gcnt, tid); }; + + // creating thread pools, holding all used threads + std::vector filter_pool; + std::vector copy_pool; + std::vector agg_pool; + + int thread_id = 0; + // cpus on node 2 (for sapphire rapids), that the threads should be executed on + std::vector> pinning_ranges {std::make_pair(24, 36), std::make_pair(120, 132)}; + + // create all threads for all thread groups and for every task (copy, filter, aggregation), according their specific theadcount + 
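+        // note: in total THREAD_GROUP_MULTIPLIER * (tc_filter + tc_copy + tc_agg) threads are spawned;
+        // thread_id is incremented once per created thread so that pin_thread_in_range can map each
+        // thread to a distinct index inside the pinning ranges configured above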
for(uint32_t gid = 0; gid < THREAD_GROUP_MULTIPLIER; ++gid) { + for(uint32_t tid = 0; tid < tc_filter; ++tid) { + filter_pool.emplace_back(filter_lambda, gid, THREAD_GROUP_MULTIPLIER, tid); + pin_thread_in_range(filter_pool.back(), thread_id++, pinning_ranges); + } + for(uint32_t tid = 0; tid < tc_copy; ++tid) { + copy_pool.emplace_back(copy_lambda, gid, THREAD_GROUP_MULTIPLIER, tid); + pin_thread_in_range(copy_pool.back(), thread_id++, pinning_ranges); + } + for(uint32_t tid = 0; tid < tc_agg; ++tid) { + agg_pool.emplace_back(aggregation_lambda, gid, THREAD_GROUP_MULTIPLIER, tid); + pin_thread_in_range(agg_pool.back(), thread_id++, pinning_ranges); + } + } + + // start the clock + auto start = std::chrono::steady_clock::now(); + // set value to the promise, to signal the waiting threads, that they can start now + p.set_value(); + + // wait for all thread to be finished + for(std::thread& t : filter_pool) { t.join(); } + for(std::thread& t : copy_pool) { t.join(); } + for(std::thread& t : agg_pool) { t.join(); } + + // sum up the results of all the aggregation threads to get a final result + Aggregation::apply(&result, results, sizeof(base_t) * tc_agg * THREAD_GROUP_MULTIPLIER); + auto end = std::chrono::steady_clock::now(); + + // get the overall execution time in seconds + constexpr double nanos_per_second = ((double)1000) * 1000 * 1000; + uint64_t nanos = std::chrono::duration_cast(end - start).count(); + exec_time = (double)(nanos) / nanos_per_second; + + // calculate the checksum according to the used query + if constexpr (QUERY == 1) { + // QUERY == 1 -> simple query is applied + checksum = sum_check(compare_value_a, data_a, data_b, workload_b); + } else { + checksum = sum_check_complex(compare_value_a, compare_value_b, data_a, data_b, workload_b); + } + + delete qw; + } +}; diff --git a/qdp_project/src/benchmark/doubly_filtered_agg.cpp b/qdp_project/src/benchmark/doubly_filtered_agg.cpp new file mode 100644 index 0000000..eaee93d --- /dev/null +++ b/qdp_project/src/benchmark/doubly_filtered_agg.cpp @@ -0,0 +1,149 @@ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "aggregation.h" +#include "array_utils.h" +#include "cpu_set_utils.h" +#include "file_output.h" +#include "iterable_range.h" +#include "memory_literals.h" +#include "pipelines/scan_filter_pipe.h" + +int main () { + + using base_t = uint64_t; + + + const size_t workload = 2_GiB; + const char filename[256] = "../results/doubly_filtered_results_stronger_affinity_.csv"; + const uint32_t numa_local = 2; + const uint32_t numa_remote = 3; + + + Linear_Int_Range thread_group("thread_groups"); + Exp_Int_Range thread_count_filter("thread_cnt_filter"); + Exp_Int_Range thread_count_filter_copy("thread_cnt_filter_copy"); + Exp_Int_Range thread_count_aggregation("thread_cnt_agg"); + Linear_Int_Range run("run"); + Range mode("mode"); + Exp_Int_Range chunk_size("chunk_size"); + + std::ofstream out_file; + out_file.open(filename); + print_to_file(out_file, generateHead(run, chunk_size, mode, thread_count_filter, thread_count_filter_copy, + thread_count_aggregation, thread_group), "time", "scan_a", "scan_b", "aggr_j", "wait_aggr", "results"); + + base_t* data_a = (base_t*) numa_alloc_onnode(workload, numa_remote); + base_t* data_b = (base_t*) numa_alloc_onnode(workload, numa_remote); + base_t* data_b_hbm = (base_t*) numa_alloc_onnode(workload, numa_local); + fill_mt(data_a, workload, 0, 100, 42); + fill_mt(data_b, workload, 0, 100, 420); + std::memcpy(data_b_hbm, data_b, workload); + base_t* 
result = (base_t*) numa_alloc_onnode(thread_group.max * thread_count_aggregation.max * sizeof(base_t), + numa_remote); + + std::string iteration("init"); + Query_Wrapper* qw = nullptr; + + while(iteration != "false") { + + std::promise p; + std::shared_future ready_future(p.get_future()); + + if(iteration != "run") { + if(qw != nullptr) { + delete qw; + } + + switch(mode.current) { + case PMode::expl_copy: + qw = new Query_Wrapper(&ready_future, workload, chunk_size.current, data_a, data_b, result, numa_local, numa_remote, + thread_count_filter.current, thread_count_filter_copy.current, thread_count_aggregation.current, + mode.current, thread_group.current, (base_t) 50, (base_t) 42, false); + break; + case PMode::no_copy: + qw = new Query_Wrapper(&ready_future, workload, chunk_size.current, data_a, data_b, result, numa_local, numa_remote, + thread_count_filter.current, thread_count_filter_copy.current, thread_count_aggregation.current, + mode.current, thread_group.current, (base_t) 50, (base_t) 42, true); + break; + case PMode::hbm: + qw = new Query_Wrapper(&ready_future, workload, chunk_size.current, data_a, data_b_hbm, result, numa_local, numa_remote, + thread_count_filter.current, thread_count_filter_copy.current, thread_count_aggregation.current, + mode.current, thread_group.current, (base_t) 50, (base_t) 42, true); + break; + } + } + qw->ready_future = &ready_future; + qw->clear_buffers(); + + + // todo create threads depending on mode + std::vector thread_pool; + auto filter_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_a(gid, gcnt, tid); }; + auto filter_copy_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_b(gid, gcnt, tid); }; + auto aggregation_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->aggr_j(gid, gcnt, tid); }; + + + /* Intel Xeon Gold 6130 // todo implement different for 5120 -> fewer cpus + node 0 cpus: 0-15 64- 79 + node 1 cpus: 16-31 80- 95 + node 2 cpus: 32-47 96-111 + node 3 cpus: 48-63 112-127 + */ + int thread_id = 0; + std::vector> range {std::make_pair(0, 16), std::make_pair(64, 80)}; + for(uint32_t gid = 0; gid < thread_group.current; ++gid) { + + + for(uint32_t tid = 0; tid < thread_count_filter.current; ++tid) { + thread_pool.emplace_back(filter_lambda, gid, thread_group.current, tid); + pin_thread_in_range(thread_pool.back(), thread_id++, range); + } + + for(uint32_t tid = 0; tid < thread_count_filter_copy.current; ++tid) { + thread_pool.emplace_back(filter_copy_lambda, gid, thread_group.current, tid); + pin_thread_in_range(thread_pool.back(), thread_id++, range); + } + + for(uint32_t tid = 0; tid < thread_count_aggregation.current; ++tid) { + thread_pool.emplace_back(aggregation_lambda, gid, thread_group.current, tid); + pin_thread_in_range(thread_pool.back(), thread_id++, range); + } + } + + auto start = std::chrono::steady_clock::now(); + p.set_value(); + + // wait for every thread to join + for(std::thread& t : thread_pool) t.join(); + // aggregate all partial results + Aggregation::apply(result, result, + sizeof(base_t) * thread_count_aggregation.current * thread_group.current); + + auto end = std::chrono::steady_clock::now(); + + double duration = std::chrono::duration_cast(end-start).count() / (double)1000000000; + + + //TODO add mode + print_to_file(out_file, run, chunk_size, mode_manager::string(mode.current), thread_count_filter, + thread_count_filter_copy, thread_count_aggregation, thread_group, duration, + qw->trt->summarize_time(0), qw->trt->summarize_time(1), + 
qw->trt->summarize_time(2), qw->trt->summarize_time(3), *result); + iteration = IterateOnce(run, chunk_size, mode, thread_count_filter, thread_count_filter_copy, thread_count_aggregation, thread_group); + } + + auto end = std::chrono::system_clock::now(); + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "finished computation at " << std::ctime(&end_time) << std::endl; + + print_to_file(out_file, std::ctime(&end_time)); +} \ No newline at end of file diff --git a/qdp_project/src/benchmark/filter_aggregate_pipeline.cpp b/qdp_project/src/benchmark/filter_aggregate_pipeline.cpp new file mode 100644 index 0000000..b4a6753 --- /dev/null +++ b/qdp_project/src/benchmark/filter_aggregate_pipeline.cpp @@ -0,0 +1,184 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "const.h" + +#include "file_output.h" +#include "array_utils.h" +#include "timer_utils.h" +#include "barrier_utils.h" +#include "cpu_set_utils.h" +#include "iterable_range.h" +#include "memory_literals.h" +#include "pipelines/scan_filter_pipe.h" + +#include "aggregation.h" +#include "filter.h" + +using base_t = uint64_t; + +base_t sum_check(base_t compare_value, base_t* row_A, base_t* row_B, size_t row_size) { + base_t sum = 0; + for(int i = 0; i < row_size / sizeof(base_t); ++i) { + sum += (row_A[i] < compare_value) * row_B[i]; + } + return sum; +} + + +int main(int argc, char** argv) { + size_t workload_b = 2_GiB; + std::ofstream out_file; + out_file.open("filter_aggreagate_pipe_bm_" + (std::string) BARRIER_MODE + ".csv"); + + Linear_Int_Range thread_group("thread_groups"); + Linear_Int_Range run("run"); + Exp_Int_Range chunk_size("chunk_size"); + Linear_Int_Range thread_count_filter("thread_cnt_filter"); + Linear_Int_Range thread_count_copy("thread_cnt_copy"); + Linear_Int_Range thread_count_aggregation("thread_cnt_agg"); + Range mode("mode"); + + uint32_t remote_node = 2; + uint32_t remote_node_2 = 2; + uint32_t local_node = 10; + + print_to_file(out_file, generateHead(run, chunk_size, mode, thread_count_filter, thread_count_copy, + thread_count_aggregation, thread_group), "time", + #ifdef THREAD_TIMINGS + "scan_a", "scan_b", "aggr_j", + #endif + #ifdef BARRIER_TIMINGS + "wait_scan_a", "wait_scan_b", "wait_aggr_j", + #endif + "result"); + + + /*** alloc data and buffers ************************************************/ + base_t* data_a = (base_t*) numa_alloc_onnode(workload_b, remote_node); + base_t* data_b = (base_t*) numa_alloc_onnode(workload_b, remote_node_2); + base_t* data_b_hbm = (base_t *) numa_alloc_onnode(workload_b, local_node); + fill_mt(data_a, workload_b, 0, 100, 42); + fill_mt(data_b, workload_b, 0, 100, 420); + std::memcpy(data_b_hbm, data_b, workload_b); + base_t* results = (base_t*) numa_alloc_onnode(thread_group.max * thread_count_aggregation.max * sizeof(base_t), remote_node); + + std::string iteration("init"); + const bool simple_query = true; + Query_Wrapper* qw = nullptr; + while(iteration != "false") { + base_t compare_value = 50; + std::promise p; + std::shared_future ready_future(p.get_future()); + + if(iteration != "run") { + + if(qw != nullptr) { + delete qw; + } + + std::cout << "Changing to mode " << mode.current << " chunksize " << chunk_size.current << " thread_group " << thread_group.current << std::endl; + switch(mode.current) { + case PMode::expl_copy: + qw = new Query_Wrapper(&ready_future, workload_b, chunk_size.current, data_a, 
data_b, results, local_node, remote_node, + thread_count_filter.current, thread_count_copy.current, thread_count_aggregation.current, mode.current, thread_group.current, (base_t) 50, (base_t) 42, false); + break; + case PMode::no_copy: + qw = new Query_Wrapper(&ready_future, workload_b, chunk_size.current, data_a, data_b, results, local_node, remote_node, + thread_count_filter.current, thread_count_copy.current, thread_count_aggregation.current, mode.current, thread_group.current, (base_t) 50, (base_t) 42, true); + break; + case PMode::hbm: + qw = new Query_Wrapper(&ready_future, workload_b, chunk_size.current, data_a, data_b_hbm, results, local_node, remote_node, + thread_count_filter.current, thread_count_copy.current, thread_count_aggregation.current, mode.current, thread_group.current, (base_t) 50, (base_t) 42, true); + break; + } + } + + qw->ready_future = &ready_future; + qw->clear_buffers(); + + auto filter_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_a(gid, gcnt, tid); }; + auto copy_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_b(gid, gcnt, tid); }; + auto aggregation_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->aggr_j(gid, gcnt, tid); }; + + std::vector filter_pool; + std::vector copy_pool; + std::vector agg_pool; + + int thread_id = 0; + // std::vector> pinning_ranges {std::make_pair(28, 42), std::make_pair(84, 98)}; // node 2 heacboehm2 + std::vector> pinning_ranges {std::make_pair(32, 48), std::make_pair(96, 112)}; // node 2 heacboehm + + for(uint32_t gid = 0; gid < thread_group.current; ++gid) { + + for(uint32_t tid = 0; tid < thread_count_filter.current; ++tid) { + filter_pool.emplace_back(filter_lambda, gid, thread_group.current, tid); + pin_thread_in_range(filter_pool.back(), thread_id++, pinning_ranges); + } + + if(mode.current == PMode::expl_copy){ + for(uint32_t tid = 0; tid < thread_count_copy.current; ++tid) { + copy_pool.emplace_back(copy_lambda, gid, thread_group.current, tid); + pin_thread_in_range(copy_pool.back(), thread_id++, pinning_ranges); + } + } + + for(uint32_t tid = 0; tid < thread_count_aggregation.current; ++tid) { + agg_pool.emplace_back(aggregation_lambda, gid, thread_group.current, tid); + pin_thread_in_range(agg_pool.back(), thread_id++, pinning_ranges); + } + } + + auto start = std::chrono::steady_clock::now(); + p.set_value(); + + for(std::thread& t : filter_pool) { t.join(); } + for(std::thread& t : copy_pool) { t.join(); } + for(std::thread& t : agg_pool) { t.join(); } + + Aggregation::apply(results, results, sizeof(base_t) * thread_count_aggregation.current * thread_group.current); + auto end = std::chrono::steady_clock::now(); + + constexpr double nanos_per_second = ((double)1000) * 1000 * 1000; + uint64_t nanos = std::chrono::duration_cast(end - start).count(); + double seconds = (double)(nanos) / nanos_per_second; + + + + print_to_file(out_file, run, chunk_size, mode_manager::string(mode.current), thread_count_filter, + thread_count_copy, thread_count_aggregation, thread_group, seconds, + #ifdef THREAD_TIMINGS + qw->trt->summarize_time(0), qw->trt->summarize_time(1), qw->trt->summarize_time(2), + #endif + #ifdef BARRIER_TIMINGS + qw->bt->summarize_time(0), qw->bt->summarize_time(1), qw->bt->summarize_time(2), + #endif + results[0]); + + + iteration = IterateOnce(run, chunk_size, mode, thread_count_filter, thread_count_copy, thread_count_aggregation, thread_group); + + } + + numa_free(data_b_hbm, workload_b); + numa_free(data_a, workload_b); + numa_free(data_b, workload_b); + 
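+    // numa_free expects the same byte size that was passed to numa_alloc_onnode for the respective pointer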
numa_free(results, thread_group.max * sizeof(base_t)); + +} \ No newline at end of file diff --git a/qdp_project/src/benchmark/latency.cpp b/qdp_project/src/benchmark/latency.cpp new file mode 100644 index 0000000..011066a --- /dev/null +++ b/qdp_project/src/benchmark/latency.cpp @@ -0,0 +1,188 @@ +/* + * numa_memory_latency + * Copyright (c) 2017 UMEZAWA Takeshi + * This software is licensed under GNU GPL version 2 or later. + * + * This file has been modified + */ + +#include +#include +#include +#include +#include +#include +#include +#include "file_output.h" +#include +#include +#include +#include + +#ifndef VOLATILE +#define VOLATILE 0 +#endif + +#define cachelinesize 64 +union CACHELINE { + char cacheline[cachelinesize]; + #if VOLATILE + volatile CACHELINE* next; + #else + CACHELINE* next; + #endif /*VOLATILE*/ +}; + +#define REPT4(x) do { x; x; x; x; } while(0) +#define REPT16(x) do { REPT4(x); REPT4(x); REPT4(x); REPT4(x); } while(0); +#define REPT64(x) do { REPT16(x); REPT16(x); REPT16(x); REPT16(x); } while(0); +#define REPT256(x) do { REPT64(x); REPT64(x); REPT64(x); REPT64(x); } while(0); +#define REPT1024(x) do { REPT256(x); REPT256(x); REPT256(x); REPT256(x); } while(0); + +size_t bufsize = 1 * 1024 * 1024 * 1024; +size_t nloop = 128 * 1024; +std::vector offsets; + +#if VOLATILE + +volatile CACHELINE* walk(volatile CACHELINE* start) +{ + volatile CACHELINE* p = start; + for (size_t i = 0; i < nloop; ++i) { + REPT1024(p = p->next); + } + return p; +} + +#else + +CACHELINE* walk(CACHELINE* start, uint64_t* sum) +{ + CACHELINE* p = start; + for (size_t i = 0; i < nloop; ++i) { + REPT1024( + *sum += static_cast(p->cacheline[cachelinesize-1]); + p = p->next; + ); + } + return p; +} + +#endif /*VOLATILE*/ + +void bench(int tasknode, int memnode, std::ofstream* out_file) +{ + struct timespec ts_begin, ts_end, ts_elapsed; + + printf("bench(task=%d, mem=%d)\n", tasknode, memnode); + + if (numa_run_on_node(tasknode) != 0) { + printf("failed to run on node: %s\n", strerror(errno)); + return; + } + + CACHELINE* const buf = (CACHELINE*)numa_alloc_onnode(bufsize, memnode); + if (buf == NULL) { + printf("failed to allocate memory\n"); + return; + } + + for (size_t i = 0; i < offsets.size() - 1; ++i) { + // assuming that next-pointer never overwrites last Byte of the cacheline/union + buf[offsets[i]].cacheline[cachelinesize-1] = offsets[i] % 128; + buf[offsets[i]].next = buf + offsets[i+1]; + } + buf[offsets[offsets.size() - 1]].next = buf; + buf[offsets[offsets.size() - 1]].cacheline[cachelinesize-1] = offsets[offsets.size() - 1] % 128; + + uint64_t value = 0; + uint64_t* sum = &value; + + clock_gettime(CLOCK_MONOTONIC, &ts_begin); + + #if VOLATILE + walk(buf); + #else + walk(buf, sum); + #endif /*VOLATILE*/ + + clock_gettime(CLOCK_MONOTONIC, &ts_end); + + ts_elapsed.tv_nsec = ts_end.tv_nsec - ts_begin.tv_nsec; + ts_elapsed.tv_sec = ts_end.tv_sec - ts_begin.tv_sec; + if (ts_elapsed.tv_nsec < 0) { + --ts_elapsed.tv_sec; + ts_elapsed.tv_nsec += 1000*1000*1000; + } + double elapsed = ts_elapsed.tv_sec + 0.000000001 * ts_elapsed.tv_nsec; + printf("took %fsec. 
%fns/load\n", elapsed, elapsed/(1024*nloop)*(1000*1000*1000)); + print_to_file(*out_file, tasknode, memnode, elapsed/(1024*nloop)*(1000*1000*1000), *sum); + numa_free(buf, bufsize); +} + +struct RND { + std::mt19937 mt; + RND() : mt(time(NULL)) {} + std::mt19937::result_type operator()(std::mt19937::result_type n) { return mt() % n; } +} r; + +void usage(const char* prog) +{ + printf("usage: %s [-h] [bufsize] [nloop]\n", prog); +} + +int main(int argc, char* argv[]) +{ + int ch; + + while ((ch = getopt(argc, argv, "h")) != -1) { + switch (ch) { + case 'h': + default: + usage(argv[0]); + exit(1); + } + } + + argc -= optind; + argv += optind; + + if (argc > 1) { + // 1048576 KiB = 1 GiB + bufsize = atoi(argv[0]) * 1024; // in KiB + nloop = atoi(argv[1]) * 1024; + } + + offsets.resize(bufsize / cachelinesize); + + for (size_t i = 0; i < offsets.size(); ++i) + offsets[i] = i; + std::random_shuffle(offsets.begin() + 1, offsets.end(), r); + + uint64_t expected_checksum = 0; + #if VOLATILE == 0 + for (size_t i = 0; i < nloop * 1024; ++i) { + expected_checksum += offsets[i % offsets.size()] % 128; + } + #endif + + std::ofstream check_file; + check_file.open("../results/micro_bench/latency/micro_bench_latency_" + (std::string)(VOLATILE == 1 ? "volatile" : "sum") + ".checksum"); + check_file << expected_checksum; + check_file.close(); + + + printf("benchmark bufsize=%zuKiB, nloop=%zuKi\n", bufsize/1024, nloop/1024); + + std::ofstream out_file; + out_file.open("../results/micro_bench/latency/micro_bench_latency_"+ (std::string)(VOLATILE == 1 ? "volatile" : "sum") + ".csv"); + print_to_file(out_file, "tasknode", "memnode", "latency", "checksum"); + + for (int tasknode = 0; tasknode < 8; tasknode++) { + for (int memnode = 0; memnode < 16; memnode++) { + bench(tasknode, memnode, &out_file); + } + } + + return 0; +} \ No newline at end of file diff --git a/qdp_project/src/benchmark/micro_benchmarks.cpp b/qdp_project/src/benchmark/micro_benchmarks.cpp new file mode 100644 index 0000000..4e63f82 --- /dev/null +++ b/qdp_project/src/benchmark/micro_benchmarks.cpp @@ -0,0 +1,271 @@ +#include +#include +#include +#include +#include +#include +#include "memory_literals.h" +#include "array_utils.h" +#include "file_output.h" +#include "aggregation.h" + + +using base_t = uint64_t; + +size_t thread_cnt_memcpy = 128; +size_t thread_cnt_read = 128; +size_t runs = 10; + + +base_t sum_up(base_t* data, size_t workload){ + base_t sum = 0; + for(int i = 0; i < workload/sizeof(base_t); i++){ + sum += data[i]; + } + return sum; +} + +int reverse_bits(int number, size_t bit_count) { + int result = 0; + for(int i = 0; i < bit_count; i++) { + result <<= 1; + result |= (number & 1); + number >>= 1; + } + return result; +} + + +double measure_memcpy_bw(base_t* src, base_t* dest, size_t workload, base_t* result){ + std::promise p; + std::shared_future ready_future(p.get_future()); + + auto thread_lambda = [&](base_t* source, base_t* destination, size_t count) { + ready_future.wait(); + memcpy(destination, source, count); + }; + + std::vector thread_pool; + size_t total_elements = workload / sizeof(base_t); + size_t elements_per_thread = total_elements / thread_cnt_memcpy; + size_t remainder = total_elements % thread_cnt_memcpy; + + for(size_t tid = 0; tid < thread_cnt_memcpy; tid++) { + size_t elements_to_process = elements_per_thread + (tid < remainder ? 
1 : 0); + size_t byte_offset = (elements_per_thread * tid + std::min(tid, remainder)) * sizeof(base_t); + + thread_pool.emplace_back(thread_lambda, src + byte_offset / sizeof(base_t), dest + byte_offset / sizeof(base_t), elements_to_process * sizeof(base_t)); + } + + auto start = std::chrono::steady_clock::now(); + p.set_value(); + for(std::thread& t : thread_pool) { t.join(); } + auto stop = std::chrono::steady_clock::now(); + + auto duration = std::chrono::duration_cast(stop - start); + double seconds = duration.count() / 1e9; + double throughput = (workload / seconds) / (1024 * 1024 * 1024); + *result = sum_up(dest, workload); + return throughput; +} + +double measure_read_bw(base_t* data, size_t workload, base_t* results){ + const size_t chunk_size = sizeof(__m512i); + const size_t num_chunks = (workload) / chunk_size; + __m512i* src = reinterpret_cast<__m512i*>(data); + std::promise p; + std::shared_future ready_future(p.get_future()); + size_t num_chunks_per_thread = num_chunks / thread_cnt_read; + size_t num_chunks_remainder = num_chunks % thread_cnt_read; + + auto thread_lambda = [&](__m512i* src, int tid, int num_chunks) { + __m512i accumulator = _mm512_setzero_si512(); + ready_future.wait(); + for (int i = 0; i < num_chunks; i++) { + __m512i chunk = _mm512_load_si512(&src[i]); + accumulator = _mm512_add_epi64(accumulator, chunk); + } + results[tid] = _mm512_reduce_add_epi64(accumulator); + }; + + std::vector thread_pool; + int offset; + for(int tid = 0; tid < thread_cnt_read; tid++){ + if(tid < num_chunks_remainder){ + offset = tid * (num_chunks_per_thread + 1); + thread_pool.emplace_back(thread_lambda, &src[offset], tid, (num_chunks_per_thread + 1)); + } else { + offset = tid*num_chunks_per_thread + num_chunks_remainder; + thread_pool.emplace_back(thread_lambda, &src[offset], tid, num_chunks_per_thread); + } + + } + + auto start = std::chrono::steady_clock::now(); + p.set_value(); + for(std::thread& t : thread_pool) { t.join(); } + auto stop = std::chrono::steady_clock::now(); + + Aggregation::apply(results, results, sizeof(base_t) * thread_cnt_read); + auto duration = std::chrono::duration_cast(stop - start); + double seconds = duration.count() / 1e9; + double throughput = (workload / seconds) / (1024 * 1024 * 1024); + return throughput; +} + +void exec_multiple_runs_memcpy(size_t workload, int exec_node, int src_node, int dest_node, std::ofstream* out_file, std::string iteration_type){ + base_t value; + base_t* result = &value; + base_t* src = (base_t*) numa_alloc_onnode(workload, src_node); + base_t* dest = (base_t*) numa_alloc_onnode(workload, dest_node); + fill_mt(src, workload, 0, 100, 42); + fill_mt(dest, workload, 0, 100, 12); + numa_run_on_node(exec_node); + + if(dest_node == 0 && src_node == 0){ + std::ofstream check_file; + check_file.open("../results/micro_bench/micro_bench_bw_memcpy_execnode_" + std::to_string(exec_node) + + "_threadcnt_" + std::to_string(thread_cnt_memcpy) + "_" + iteration_type + ".checksum"); + check_file << sum_up(src, workload); + check_file.close(); + } + + for(size_t run = 0; run < runs; run++){ + double bw = measure_memcpy_bw(src, dest, workload, result); + std::cout << "Copy throughput executed on node " << exec_node << " form node " << src_node << " to node " + << dest_node << ": " << bw << " GiB/s" << std::endl; + print_to_file(*out_file, run, src_node, dest_node, bw, *result); + std::memset(dest, 0x00, workload); + *result = 0; + } + numa_free(src, workload); + numa_free(dest, workload); +} + +void 
measure_all_memcpy_bw_for_chosen_execnode(int exec_node){ + std::ofstream out_file; + out_file.open("../results/micro_bench/micro_bench_bw_memcpy_execnode_" + std::to_string(exec_node) + + "_threadcnt_" + std::to_string(thread_cnt_memcpy) + ".csv"); + print_to_file(out_file, "run", "src_node", "dest_node", "bw", "result"); + const size_t workload = 4_GiB; + + for(int src_node = 0; src_node < 16; src_node++){ + for(int dest_node = 0; dest_node < 16; dest_node++){ + exec_multiple_runs_memcpy(workload, exec_node, src_node, dest_node, &out_file, ""); + } + } + out_file.close(); +} + +void measure_all_memcpy_bw_for_chosen_execnode_reversed(int exec_node){ + std::ofstream out_file; + out_file.open("../results/micro_bench/micro_bench_bw_memcpy_execnode_" + std::to_string(exec_node) + + "_threadcnt_" + std::to_string(thread_cnt_memcpy) + "_reversed.csv"); + print_to_file(out_file, "run", "src_node", "dest_node", "bw", "result"); + const size_t workload = 4_GiB; + + for(int src_node = 15; src_node >= 0; src_node--){ + for(int dest_node = 15; dest_node >= 0; dest_node--){ + exec_multiple_runs_memcpy(workload, exec_node, src_node, dest_node, &out_file, "reversed"); + } + } + out_file.close(); +} + + + +void measure_all_memcpy_bw_for_chosen_execnode_reversed_bitwise(int exec_node){ + std::ofstream out_file; + out_file.open("../results/micro_bench/micro_bench_bw_memcpy_execnode_" + std::to_string(exec_node) + + "_threadcnt_" + std::to_string(thread_cnt_memcpy) + "_reversed_bitwise.csv"); + print_to_file(out_file, "run", "src_node", "dest_node", "bw", "result"); + const size_t workload = 4_GiB; + + for(int src_node = 0; src_node < 16; src_node++){ + for(int dest_node = 0; dest_node < 16; dest_node++){ + int reversed_src_node = reverse_bits(src_node, 4); + int reversed_dest_node = reverse_bits(dest_node, 4); + exec_multiple_runs_memcpy(workload, exec_node, reversed_src_node, reversed_dest_node, &out_file, "reversed_bitwise"); + } + } + out_file.close(); +} + + +void exec_multiple_runs_read(size_t workload, int mem_node, int exec_node, std::ofstream *out_file, std::string iteration_type){ + base_t* data = (base_t*) numa_alloc_onnode(workload, mem_node); + fill_mt(data, workload, 0, 100, 42); + base_t* results = (base_t*) numa_alloc_onnode(thread_cnt_read * sizeof(base_t), exec_node); + numa_run_on_node(exec_node); + + if(mem_node == 0 && exec_node == 0){ + std::ofstream check_file; + check_file.open("../results/micro_bench/micro_bench_bw_read_threadcnt_" + std::to_string(thread_cnt_read) + "_" + iteration_type + ".checksum"); + check_file << sum_up(data, workload); + check_file.close(); + } + + for(size_t run = 0; run < runs; run++){ + double bw = measure_read_bw(data, workload, results); + std::cout << "Read throughput executed on node " << exec_node << " for node " << mem_node << ": " << bw << " GiB/s" << std::endl; + print_to_file(*out_file, run, exec_node, mem_node, bw, results[0]); + std::memset(results, 0x00, thread_cnt_read * sizeof(base_t)); + } + numa_free(data, workload); + numa_free(results, thread_cnt_read * sizeof(base_t)); +} + +void measure_all_read_bw(){ + std::ofstream out_file; + out_file.open("../results/micro_bench/micro_bench_bw_read_threadcnt_" + std::to_string(thread_cnt_read) + ".csv"); + print_to_file(out_file, "run", "exec_node", "mem_node", "bw", "result"); + const size_t workload = 8_GiB; + + for(int exec_node = 0; exec_node < 8; exec_node++){ + for(int mem_node = 0; mem_node < 16; mem_node++){ + exec_multiple_runs_read(workload, mem_node, exec_node, &out_file, ""); + } + } + 
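+    // one CSV row is written per (run, exec_node, mem_node) combination; the file is closed
+    // once the full 8 x 16 node sweep has finished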
out_file.close(); +} + +void measure_all_read_bw_reversed(){ + std::ofstream out_file; + out_file.open("../results/micro_bench/micro_bench_bw_read_threadcnt_" + std::to_string(thread_cnt_read) + "_reversed.csv"); + print_to_file(out_file, "run", "exec_node", "mem_node", "bw", "result"); + const size_t workload = 8_GiB; + + for(int exec_node = 7; exec_node >= 0; exec_node--){ + for(int mem_node = 15; mem_node >= 0; mem_node--){ + exec_multiple_runs_read(workload, mem_node, exec_node, &out_file, "reversed"); + } + } + out_file.close(); +} + +void measure_all_read_bw_reversed_bitwise(){ + std::ofstream out_file; + out_file.open("../results/micro_bench/micro_bench_bw_read_threadcnt_" + std::to_string(thread_cnt_read) + "_reversed_bitwise.csv"); + print_to_file(out_file, "run", "exec_node", "mem_node", "bw", "result"); + const size_t workload = 8_GiB; + + for(int exec_node0 = 0; exec_node0 < 8; exec_node0++){ + for(int mem_node0 = 0; mem_node0 < 16; mem_node0++){ + int mem_node = reverse_bits(mem_node0, 4); + int exec_node = reverse_bits(exec_node0, 3); + exec_multiple_runs_read(workload, mem_node, exec_node, &out_file, "reversed_bitwise"); + } + } + out_file.close(); +} + + + +int main() { + // nodes 0-7 hold cores and DRAM, nodes 8-15 only HBM + + measure_all_read_bw_reversed_bitwise(); + measure_all_memcpy_bw_for_chosen_execnode_reversed_bitwise(0); + + return 0; +} \ No newline at end of file diff --git a/qdp_project/src/benchmark/pipelines/DIMES_scan_filter_pipe.h b/qdp_project/src/benchmark/pipelines/DIMES_scan_filter_pipe.h new file mode 100644 index 0000000..6dbc652 --- /dev/null +++ b/qdp_project/src/benchmark/pipelines/DIMES_scan_filter_pipe.h @@ -0,0 +1,391 @@ + +#include +#include +#include +#include + +#include + +#include "filter.h" +#include "aggregation.h" +#include "vector_loader.h" +#include "timer_utils.h" +#include "barrier_utils.h" +#include "execution_modes.h" + + +template +class Query_Wrapper { +public: + // sync + std::shared_future* ready_future; + + thread_runtime_timing* trt; + barrier_timing* bt; + +private: + // numa + uint32_t close_mem; + uint32_t far_mem; + + // data + size_t size_b; + size_t chunk_size_b; + size_t chunk_size_w; + size_t chunk_cnt; + base_t* data_a; + base_t* data_b; + base_t* dest; + + // ratios + uint32_t thread_count_fc; + uint32_t thread_count_fi; + uint32_t thread_count_ag; + uint32_t thread_group; + + // done bits + volatile uint8_t* ready_flag_a; + volatile uint8_t* ready_flag_b; + std::mutex ready_a_m; + std::mutex ready_b_m; + + // buffer + uint16_t* mask_a; + uint16_t* mask_b; + base_t** buffer_b; + + // params + base_t cmp_a; + base_t cmp_b; + bool no_copy; + NewPMode mode; + + // sync + std::unique_ptr*>> sync_barrier; + std::string barrier_mode = BARRIER_MODE; + + using filterCopy = Filter; + using filterNoCopy = Filter; + using filter = Filter; + using aggregation = Aggregation; + +public: + + + Query_Wrapper(std::shared_future* rdy_fut, size_t workload_b, size_t chunk_size_b, base_t* data_a, + base_t* data_b, base_t* dest, uint32_t numa_close, uint32_t numa_far, uint32_t tc_fi, uint32_t tc_fc, uint32_t tc_ag, + NewPMode mode, uint32_t thread_group, base_t cmp_a = 50, base_t cmp_b = 42, bool no_copy = false) : + ready_future(rdy_fut), size_b(workload_b), chunk_size_b(chunk_size_b), data_a(data_a), data_b(data_b), + dest(dest), close_mem(numa_close), far_mem(numa_far), mode(mode), thread_group(thread_group), cmp_a(cmp_a), cmp_b(cmp_b), no_copy(no_copy){ + + chunk_size_w = chunk_size_b / sizeof(base_t); + chunk_cnt = size_b / 
chunk_size_b; + thread_count_fi = tc_fi; + thread_count_fc = tc_fc; + thread_count_ag = tc_ag; + + ready_flag_a = (volatile uint8_t *) numa_alloc_onnode( + chunk_cnt * thread_count_fi / 8 + ((chunk_cnt * thread_count_fi % 8) != 0), close_mem); + ready_flag_b = (volatile uint8_t *) numa_alloc_onnode( + chunk_cnt * thread_count_fc / 8 + ((chunk_cnt * thread_count_fc % 8) != 0), close_mem); + + mask_a = (uint16_t *) numa_alloc_onnode(size_b / sizeof(base_t), close_mem); + mask_b = (uint16_t *) numa_alloc_onnode(size_b / sizeof(base_t), close_mem); + + trt = new thread_runtime_timing(4, 16*4*4*4, close_mem); + bt = new barrier_timing(4, 16*4*4*4, close_mem); + reset_barriers(); + + if constexpr(BUFFER_LIMIT==1) { + // TODO size ok like that? + buffer_b = (base_t**) numa_alloc_onnode(size_b * sizeof(base_t*), close_mem); + buffer_b[0] = (base_t*) numa_alloc_onnode(thread_group * chunk_size_b, close_mem); + buffer_b[1] = (base_t*) numa_alloc_onnode(thread_group * chunk_size_b, close_mem); + } else { + buffer_b = (base_t **) numa_alloc_onnode(sizeof(base_t*), close_mem); + base_t* buffer_tmp = (base_t *) numa_alloc_onnode(size_b, close_mem); + *buffer_b = buffer_tmp; + } + }; + + void reset_barriers(){ + if(sync_barrier != nullptr) { + for(auto& barrier : *sync_barrier) { + delete barrier; + } + sync_barrier.reset(); + } + + sync_barrier = std::make_unique*>>(thread_group); + uint32_t thread_count_sum = thread_count_ag + thread_count_fi + thread_count_fc; + uint32_t barrier_count = barrier_mode.compare("global") == 0 ? 1 : thread_group; + uint32_t barrier_thread_count; + + if constexpr(simple){ + barrier_thread_count = (thread_group / barrier_count) * + (mode == NewPMode::Prefetch ? thread_count_sum : (thread_count_ag + thread_count_fi)); + } else { + barrier_thread_count = (thread_group / barrier_count) * thread_count_sum; + } + for(uint32_t i = 0; i < barrier_count; ++i) { + (*sync_barrier)[i] = new std::barrier(barrier_thread_count); + } + } + + void clear_buffers () { + std::memset((void*)ready_flag_a, 0x00, chunk_cnt * thread_count_fi / 8 + ((chunk_cnt * thread_count_fi % 8) != 0)); + std::memset((void*)ready_flag_b, 0x00, chunk_cnt * thread_count_fc / 8 + ((chunk_cnt * thread_count_fc % 8) != 0)); + + std::memset(mask_a, 0x00, size_b / sizeof(base_t)); + std::memset(mask_b, 0x00, size_b / sizeof(base_t)); + if constexpr(BUFFER_LIMIT==1) { + std::memset(buffer_b[0], 0x00, thread_group * chunk_size_b); + std::memset(buffer_b[1], 0x00, thread_group * chunk_size_b); + } else { + std::memset(*buffer_b, 0x00, size_b); + } + + trt->reset_accumulator(); + bt->reset_accumulator(); + reset_barriers(); + }; + + ~Query_Wrapper() { + numa_free((void*)ready_flag_a, + chunk_cnt * thread_count_fi / 8 + ((chunk_cnt * thread_count_fi % 8) != 0)); + numa_free((void*)ready_flag_b, + chunk_cnt * thread_count_fc / 8 + ((chunk_cnt * thread_count_fc % 8) != 0)); + + numa_free(mask_a, size_b / sizeof(base_t)); + numa_free(mask_b, size_b / sizeof(base_t)); + if constexpr(BUFFER_LIMIT==1) { + numa_free(buffer_b[0], thread_group * chunk_size_b); + numa_free(buffer_b[1], thread_group * chunk_size_b); + numa_free(buffer_b, size_b * sizeof(base_t*)); + } else { + numa_free(*buffer_b, size_b); + } + + delete trt; + for(auto& barrier : *sync_barrier) { + delete barrier; + } + delete bt; + + }; + + //this can be set without need to change allocations + void set_thread_group_count(uint32_t value) { + this->thread_group = value; + }; + +private: + static inline base_t* get_sub_chunk_ptr(base_t* base_ptr, size_t chunk_id, 
size_t chunk_size_w, size_t tid, + size_t tcnt) { + base_t* chunk_ptr = base_ptr + chunk_id * chunk_size_w; + return chunk_ptr + tid * (chunk_size_w / tcnt); + } + + static inline uint16_t* get_sub_mask_ptr(uint16_t* base_ptr, size_t chunk_id, size_t chunk_size_w, size_t tid, + size_t tcnt) { + // 16 integer are addressed with one uint16_t in mask buffer + size_t offset = chunk_id * chunk_size_w + tid * (chunk_size_w / tcnt); + return base_ptr + (offset / 16); + } + + static bool bit_at(volatile uint8_t* bitmap, uint32_t bitpos) { + uint8_t value = bitmap[bitpos / 8]; + switch(bitpos % 8) { + case 0: return value & 0b00000001; + case 1: return value & 0b00000010; + case 2: return value & 0b00000100; + case 3: return value & 0b00001000; + case 4: return value & 0b00010000; + case 5: return value & 0b00100000; + case 6: return value & 0b01000000; + case 7: return value & 0b10000000; + default: return false; + } + } + + static void set_bit_at(volatile uint8_t* bitmap, std::mutex& mutex, uint32_t bitpos) { + mutex.lock(); + switch(bitpos % 8) { + case 0: bitmap[bitpos / 8] |= 0b00000001;break; + case 1: bitmap[bitpos / 8] |= 0b00000010;break; + case 2: bitmap[bitpos / 8] |= 0b00000100;break; + case 3: bitmap[bitpos / 8] |= 0b00001000;break; + case 4: bitmap[bitpos / 8] |= 0b00010000;break; + case 5: bitmap[bitpos / 8] |= 0b00100000;break; + case 6: bitmap[bitpos / 8] |= 0b01000000;break; + case 7: bitmap[bitpos / 8] |= 0b10000000;break; + } + mutex.unlock(); + } + +public: + + static base_t checksum(base_t* a, base_t* b, base_t cmp_a, base_t cmp_b, size_t size_b) { + base_t sum = 0; + for(int i = 0; i < size_b / sizeof(base_t); ++i) { + if(a[i] >= cmp_a && b[i] <= cmp_b) { + sum += b[i]; + } + } + return sum; + } + + static void checkmask(uint16_t* mask, base_t cmp, base_t* data, size_t size_b, bool leq) { + uint32_t cnt = 0; + for(int i = 0; i < size_b / sizeof(base_t); ++i) { + if(leq) { + if(((data[i] <= cmp) != bit_at((uint8_t*)mask, i))) { + ++cnt; + } + } else { + if(((data[i] >= cmp) != bit_at((uint8_t*)mask, i))) { + ++cnt; + } + } + } + } + + static void checkmask_16(uint16_t* mask, base_t cmp, base_t* data, size_t size_b, bool leq) { + for(int i = 0; i < size_b / sizeof(base_t) / 16 ; ++i) { + std::bitset<16> m(mask[i]); + uint16_t ch = 0; + for(int j = 0; j < 16; ++j) { + if(data[i*16 + j] <= cmp) { + ch |= 0x1 << j; + } + } + std::bitset<16> c(ch); + + std::cout << "act " << m << std::endl; + std::cout << "rea " << c << std::endl << std::endl; + } + } + + + void scan_b(size_t gid, size_t gcnt, size_t tid) { + size_t tcnt = thread_count_fc; + assert(chunk_size_w % tcnt == 0); + assert(chunk_size_w % 16 == 0); + assert(chunk_size_w % tcnt * 16 == 0); + + // wait till everyone can start + ready_future->wait(); + + // the lower gids run once more if the chunks are not evenly distributable + uint32_t runs = chunk_cnt / gcnt + (chunk_cnt % gcnt > gid); + uint32_t barrier_idx = barrier_mode.compare("global") == 0 ? 
0 : gid; + for(uint32_t i = 0; i < runs; ++i) { + trt->start_timer(1, tid * gcnt + gid); + + // calculate pointers + size_t chunk_id = gid + gcnt * i; + base_t* chunk_ptr = get_sub_chunk_ptr(data_b , chunk_id, chunk_size_w, tid, tcnt); + uint16_t* mask_ptr = get_sub_mask_ptr (mask_b , chunk_id, chunk_size_w, tid, tcnt); + + if constexpr(simple){ + base_t* buffer_ptr; + if constexpr(BUFFER_LIMIT==1) { + buffer_ptr = get_sub_chunk_ptr(buffer_b[i % 2], gid, chunk_size_w, tid, tcnt); + } else { + buffer_ptr = get_sub_chunk_ptr(*buffer_b, chunk_id, chunk_size_w, tid, tcnt); + } + std::memcpy(buffer_ptr, chunk_ptr, chunk_size_b / tcnt); + } else { + if(no_copy) { + filterNoCopy::apply_same(mask_ptr, nullptr, chunk_ptr, cmp_b, chunk_size_b / tcnt); + } else { + base_t* buffer_ptr; + if constexpr(BUFFER_LIMIT==1) { + buffer_ptr = get_sub_chunk_ptr(buffer_b[i % 2], gid, chunk_size_w, tid, tcnt); + } else { + buffer_ptr = get_sub_chunk_ptr(*buffer_b, chunk_id, chunk_size_w, tid, tcnt); + } + filterCopy::apply_same(mask_ptr, buffer_ptr, chunk_ptr, cmp_b, chunk_size_b / tcnt); + } + } + + trt->stop_timer(1, tid * gcnt + gid); + bt->timed_wait(*(*sync_barrier)[barrier_idx], 1, tid * gcnt + gid); + + } + (*(*sync_barrier)[barrier_idx]).arrive_and_drop(); + + } + + void scan_a(size_t gid, size_t gcnt, size_t tid) { + size_t tcnt = thread_count_fi; + assert(chunk_size_w % tcnt == 0); + assert(chunk_size_w % 16 == 0); + assert(chunk_size_w % tcnt * 16 == 0); + + // wait till everyone can start + ready_future->wait(); + + // the lower gids run once more if the chunks are not evenly distributable + uint32_t runs = chunk_cnt / gcnt + (chunk_cnt % gcnt > gid); + uint32_t barrier_idx = barrier_mode.compare("global") == 0 ? 0 : gid; + for(uint32_t i = 0; i < runs; ++i) { + trt->start_timer(0, tid * gcnt + gid); + // calculate pointers + size_t chunk_id = gid + gcnt * i; + base_t* chunk_ptr = get_sub_chunk_ptr(data_a, chunk_id, chunk_size_w, tid, tcnt); + uint16_t* mask_ptr = get_sub_mask_ptr (mask_a, chunk_id, chunk_size_w, tid, tcnt); + + filter::apply_same(mask_ptr, nullptr, chunk_ptr, cmp_a, chunk_size_b / tcnt); + + trt->stop_timer(0, tid * gcnt + gid); + bt->timed_wait(*(*sync_barrier)[barrier_idx], 0, tid * gcnt + gid); + } + (*(*sync_barrier)[barrier_idx]).arrive_and_drop(); + } + + void aggr_j(size_t gid, size_t gcnt, size_t tid) { + size_t tcnt = thread_count_ag; + // wait till everyone can start + ready_future->wait(); + + // calculate values + __m512i aggregator = aggregation::OP::zero(); + // the lower gids run once more if the chunks are not evenly distributable + uint32_t runs = chunk_cnt / gcnt + (chunk_cnt % gcnt > gid); + uint32_t barrier_idx = barrier_mode.compare("global") == 0 ? 
0 : gid; + for(uint32_t i = 0; i < runs; ++i) { + + bt->timed_wait(*(*sync_barrier)[barrier_idx], 2, tid * gcnt + gid); + trt->start_timer(2, tid * gcnt + gid); + + // calculate pointers + size_t chunk_id = gid + gcnt * i; + base_t* chunk_ptr; + if(no_copy) { + chunk_ptr = get_sub_chunk_ptr(data_b, chunk_id, chunk_size_w, tid, tcnt); + } else { + if constexpr(BUFFER_LIMIT==1) { + chunk_ptr = get_sub_chunk_ptr(buffer_b[i % 2], gid, chunk_size_w, tid, tcnt); + } else { + chunk_ptr = get_sub_chunk_ptr(*buffer_b, chunk_id, chunk_size_w, tid, tcnt); + } + } + uint16_t* mask_ptr_a = get_sub_mask_ptr (mask_a, chunk_id, chunk_size_w, tid, tcnt); + uint16_t* mask_ptr_b = get_sub_mask_ptr (mask_b, chunk_id, chunk_size_w, tid, tcnt); + + base_t tmp = _mm512_reduce_add_epi64(aggregator); + if constexpr(simple){ + aggregator = aggregation::apply_masked(aggregator, chunk_ptr, mask_ptr_a, chunk_size_b / tcnt); + } else { + aggregator = aggregation::apply_masked(aggregator, chunk_ptr, mask_ptr_a, mask_ptr_b, chunk_size_b / tcnt); + } + trt->stop_timer(2, tid * gcnt + gid); + } + + // so threads with more runs dont wait for finished threads + (*(*sync_barrier)[barrier_idx]).arrive_and_drop(); + + aggregation::happly(dest + (tid * gcnt + gid), aggregator); + } +}; \ No newline at end of file diff --git a/qdp_project/src/benchmark/pipelines/MAX_scan_filter_pipe.h b/qdp_project/src/benchmark/pipelines/MAX_scan_filter_pipe.h new file mode 100644 index 0000000..3b1d861 --- /dev/null +++ b/qdp_project/src/benchmark/pipelines/MAX_scan_filter_pipe.h @@ -0,0 +1,395 @@ + +#include +#include +#include +#include +#include + +#include + +#include "filter.h" +#include "aggregation.h" +#include "vector_loader.h" +#include "timer_utils.h" +#include "barrier_utils.h" +#include "measurement_utils.h" +#include "execution_modes.h" + +#include "../../../thirdParty/dsa_offload/offloading-cacher/cache.hpp" + +template +class Query_Wrapper { +public: + // sync + std::shared_future* ready_future; + + thread_runtime_timing* trt; + barrier_timing* bt; + pcm_value_collector* pvc; + +private: + dsacache::Cache cache_; + + // numa + uint32_t close_mem; + uint32_t far_mem; + + // data + size_t size_b; + size_t chunk_size_b; + size_t chunk_size_w; + size_t chunk_cnt; + base_t* data_a; + base_t* data_b; + base_t* dest; + + // ratios + uint32_t thread_count_fc; + uint32_t thread_count_fi; + uint32_t thread_count_ag; + uint32_t thread_group; + + // done bits + volatile uint8_t* ready_flag_a; + volatile uint8_t* ready_flag_b; + std::mutex ready_a_m; + std::mutex ready_b_m; + + // buffer + uint16_t* mask_a; + uint16_t* mask_b; + + // params + base_t cmp_a; + base_t cmp_b; + NewPMode mode; + + // sync + std::unique_ptr*>> sync_barrier; + std::string barrier_mode = BARRIER_MODE; + + using filterCopy = Filter; + using filterNoCopy = Filter; + using filter = Filter; + using aggregation = Aggregation; + + void InitCache(const std::string& device) { + if (device == "default") { + static const auto cache_policy = [](const int numa_dst_node, const int numa_src_node, const size_t data_size) { + return numa_dst_node; + }; + + static const auto copy_policy = [](const int numa_dst_node, const int numa_src_node) { + return std::vector{ numa_src_node, numa_dst_node }; + }; + + cache_.Init(cache_policy,copy_policy); + } + else if (device == "xeonmax") { + static const auto cache_policy = [](const int numa_dst_node, const int numa_src_node, const size_t data_size) { + return numa_dst_node < 8 ? 
numa_dst_node + 8 : numa_dst_node; + }; + + static const auto copy_policy = [](const int numa_dst_node, const int numa_src_node) { + const bool same_socket = ((numa_dst_node ^ numa_src_node) & 4) == 0; + if (same_socket) { + const bool socket_number = numa_dst_node >> 2; + if (socket_number == 0) return std::vector{ 0, 1, 2, 3 }; + else return std::vector{ 4, 5, 6, 7 }; + } + else return std::vector{ numa_src_node, numa_dst_node }; + }; + + cache_.Init(cache_policy,copy_policy); + } + else { + std::cerr << "Given device '" << device << "' not supported!" << std::endl; + exit(-1); + } + } + +public: + + + Query_Wrapper(std::shared_future* rdy_fut, size_t workload_b, size_t chunk_size_b, base_t* data_a, + base_t* data_b, base_t* dest, uint32_t numa_close, uint32_t numa_far, uint32_t tc_fi, uint32_t tc_fc, uint32_t tc_ag, + NewPMode mode, uint32_t thread_group, base_t cmp_a = 50, base_t cmp_b = 42) : + ready_future(rdy_fut), size_b(workload_b), chunk_size_b(chunk_size_b), data_a(data_a), data_b(data_b), + dest(dest), close_mem(numa_close), far_mem(numa_far), mode(mode), thread_group(thread_group), cmp_a(cmp_a), cmp_b(cmp_b){ + + chunk_size_w = chunk_size_b / sizeof(base_t); + chunk_cnt = size_b / chunk_size_b; + thread_count_fi = tc_fi; + thread_count_fc = tc_fc; + thread_count_ag = tc_ag; + + ready_flag_a = (volatile uint8_t *) numa_alloc_onnode( + chunk_cnt * thread_count_fi / 8 + ((chunk_cnt * thread_count_fi % 8) != 0), close_mem); + ready_flag_b = (volatile uint8_t *) numa_alloc_onnode( + chunk_cnt * thread_count_fc / 8 + ((chunk_cnt * thread_count_fc % 8) != 0), close_mem); + + mask_a = (uint16_t *) numa_alloc_onnode(size_b / sizeof(base_t), close_mem); + mask_b = (uint16_t *) numa_alloc_onnode(size_b / sizeof(base_t), close_mem); + + InitCache("xeonmax"); + + size_t measurement_space = THREAD_GROUP_MULTIPLIER * std::max(std::max(tc_fi, tc_fc), tc_ag); + trt = new thread_runtime_timing(3, measurement_space, far_mem); + bt = new barrier_timing(3, measurement_space, far_mem); + pvc = new pcm_value_collector({"scan_a", "scan_b", "aggr_j"}, measurement_space, far_mem); + reset_barriers(); + }; + + void reset_barriers(){ + if(sync_barrier != nullptr) { + for(auto& barrier : *sync_barrier) { + delete barrier; + } + sync_barrier.reset(); + } + + sync_barrier = std::make_unique*>>(thread_group); + uint32_t thread_count_sum = thread_count_ag + thread_count_fi + thread_count_fc; + uint32_t barrier_count = barrier_mode.compare("global") == 0 ? 1 : thread_group; + uint32_t barrier_thread_count; + + if constexpr(simple){ + barrier_thread_count = (thread_group / barrier_count) * + (mode == NewPMode::Prefetch ? 
thread_count_sum : (thread_count_ag + thread_count_fi)); + } else { + barrier_thread_count = (thread_group / barrier_count) * thread_count_sum; + } + for(uint32_t i = 0; i < barrier_count; ++i) { + (*sync_barrier)[i] = new std::barrier(barrier_thread_count); + } + } + + void clear_buffers () { + std::memset((void*)ready_flag_a, 0x00, chunk_cnt * thread_count_fi / 8 + ((chunk_cnt * thread_count_fi % 8) != 0)); + std::memset((void*)ready_flag_b, 0x00, chunk_cnt * thread_count_fc / 8 + ((chunk_cnt * thread_count_fc % 8) != 0)); + + std::memset(mask_a, 0x00, size_b / sizeof(base_t)); + std::memset(mask_b, 0x00, size_b / sizeof(base_t)); + + cache_.Clear(); + + trt->reset_accumulator(); + bt->reset_accumulator(); + pvc->reset(); + reset_barriers(); + }; + + ~Query_Wrapper() { + numa_free((void*)ready_flag_a, + chunk_cnt * thread_count_fi / 8 + ((chunk_cnt * thread_count_fi % 8) != 0)); + numa_free((void*)ready_flag_b, + chunk_cnt * thread_count_fc / 8 + ((chunk_cnt * thread_count_fc % 8) != 0)); + + numa_free(mask_a, size_b / sizeof(base_t)); + numa_free(mask_b, size_b / sizeof(base_t)); + + delete trt; + for(auto& barrier : *sync_barrier) { + delete barrier; + } + delete bt; + delete pvc; + }; + + //this can be set without need to change allocations + void set_thread_group_count(uint32_t value) { + this->thread_group = value; + }; + +private: + static inline base_t* get_sub_chunk_ptr(base_t* base_ptr, size_t chunk_id, size_t chunk_size_w, size_t tid, + size_t tcnt) { + base_t* chunk_ptr = base_ptr + chunk_id * chunk_size_w; + return chunk_ptr + tid * (chunk_size_w / tcnt); + } + + static inline uint16_t* get_sub_mask_ptr(uint16_t* base_ptr, size_t chunk_id, size_t chunk_size_w, size_t tid, + size_t tcnt) { + // 16 integer are addressed with one uint16_t in mask buffer + size_t offset = chunk_id * chunk_size_w + tid * (chunk_size_w / tcnt); + return base_ptr + (offset / 16); + } + + static bool bit_at(volatile uint8_t* bitmap, uint32_t bitpos) { + uint8_t value = bitmap[bitpos / 8]; + switch(bitpos % 8) { + case 0: return value & 0b00000001; + case 1: return value & 0b00000010; + case 2: return value & 0b00000100; + case 3: return value & 0b00001000; + case 4: return value & 0b00010000; + case 5: return value & 0b00100000; + case 6: return value & 0b01000000; + case 7: return value & 0b10000000; + default: return false; + } + } + + static void set_bit_at(volatile uint8_t* bitmap, std::mutex& mutex, uint32_t bitpos) { + mutex.lock(); + switch(bitpos % 8) { + case 0: bitmap[bitpos / 8] |= 0b00000001;break; + case 1: bitmap[bitpos / 8] |= 0b00000010;break; + case 2: bitmap[bitpos / 8] |= 0b00000100;break; + case 3: bitmap[bitpos / 8] |= 0b00001000;break; + case 4: bitmap[bitpos / 8] |= 0b00010000;break; + case 5: bitmap[bitpos / 8] |= 0b00100000;break; + case 6: bitmap[bitpos / 8] |= 0b01000000;break; + case 7: bitmap[bitpos / 8] |= 0b10000000;break; + } + mutex.unlock(); + } + +public: + void scan_b(size_t gid, size_t gcnt, size_t tid) { + size_t tcnt = thread_count_fc; + assert(chunk_size_w % tcnt == 0); + assert(chunk_size_w % 16 == 0); + assert(chunk_size_w % tcnt * 16 == 0); + + // wait till everyone can start + ready_future->wait(); + + // the lower gids run once more if the chunks are not evenly distributable + uint32_t runs = chunk_cnt / gcnt + (chunk_cnt % gcnt > gid); + uint32_t barrier_idx = barrier_mode.compare("global") == 0 ? 
0 : gid; + for(uint32_t i = 0; i < runs; ++i) { + trt->start_timer(1, tid * gcnt + gid); + pvc->start("scan_b", tid * gcnt + gid); + + // calculate pointers + size_t chunk_id = gid + gcnt * i; + base_t* chunk_ptr = get_sub_chunk_ptr(data_b, chunk_id, chunk_size_w, tid, tcnt); + uint16_t* mask_ptr = get_sub_mask_ptr(mask_b, chunk_id, chunk_size_w, tid, tcnt); + + if constexpr(simple){ + cache_.Access(chunk_ptr, chunk_size_b / tcnt); + } else { + const auto data = cache_.Access(chunk_ptr, chunk_size_b / tcnt); + + // wait on copy to complete - during this time other threads may + // continue with their calculation which leads to little impact + // and we will be faster if the cache is used + + data->WaitOnCompletion(); + + // obtain the data location from the cache entry + + base_t* data_ptr = data->GetDataLocation(); + + // nullptr is still a legal return value for CacheData::GetLocation() + // even after waiting, so this must be checked + + if (data_ptr == nullptr) { + data_ptr = chunk_ptr; + } + + filterNoCopy::apply_same(mask_ptr, nullptr, data_ptr, cmp_b, chunk_size_b / tcnt); + } + + pvc->stop("scan_b", tid * gcnt + gid); + trt->stop_timer(1, tid * gcnt + gid); + + bt->timed_wait(*(*sync_barrier)[barrier_idx], 1, tid * gcnt + gid); + } + (*(*sync_barrier)[barrier_idx]).arrive_and_drop(); + } + + void scan_a(size_t gid, size_t gcnt, size_t tid) { + size_t tcnt = thread_count_fi; + assert(chunk_size_w % tcnt == 0); + assert(chunk_size_w % 16 == 0); + assert(chunk_size_w % tcnt * 16 == 0); + + // wait till everyone can start + ready_future->wait(); + + // the lower gids run once more if the chunks are not evenly distributable + uint32_t runs = chunk_cnt / gcnt + (chunk_cnt % gcnt > gid); + uint32_t barrier_idx = barrier_mode.compare("global") == 0 ? 0 : gid; + + for(uint32_t i = 0; i < runs; ++i) { + trt->start_timer(0, tid * gcnt + gid); + pvc->start("scan_a", tid * gcnt + gid); + // calculate pointers + size_t chunk_id = gid + gcnt * i; + base_t* chunk_ptr = get_sub_chunk_ptr(data_a, chunk_id, chunk_size_w, tid, tcnt); + uint16_t* mask_ptr = get_sub_mask_ptr (mask_a, chunk_id, chunk_size_w, tid, tcnt); + + filter::apply_same(mask_ptr, nullptr, chunk_ptr, cmp_a, chunk_size_b / tcnt); + + pvc->stop("scan_a", tid * gcnt + gid); + trt->stop_timer(0, tid * gcnt + gid); + bt->timed_wait(*(*sync_barrier)[barrier_idx], 0, tid * gcnt + gid); + } + (*(*sync_barrier)[barrier_idx]).arrive_and_drop(); + } + + void aggr_j(size_t gid, size_t gcnt, size_t tid) { + size_t tcnt = thread_count_ag; + // wait till everyone can start + ready_future->wait(); + + // calculate values + __m512i aggregator = aggregation::OP::zero(); + // the lower gids run once more if the chunks are not evenly distributable + uint32_t runs = chunk_cnt / gcnt + (chunk_cnt % gcnt > gid); + uint32_t barrier_idx = barrier_mode.compare("global") == 0 ? 
0 : gid; + for(uint32_t i = 0; i < runs; ++i) { + + bt->timed_wait(*(*sync_barrier)[barrier_idx], 2, tid * gcnt + gid); + trt->start_timer(2, tid * gcnt + gid); + pvc->start("aggr_j", tid * gcnt + gid); + + // calculate pointers + size_t chunk_id = gid + gcnt * i; + const base_t* chunk_ptr = get_sub_chunk_ptr(data_b, chunk_id, chunk_size_w, tid, tcnt); + + // access the cache for the given chunk which will have been accessed in scan_b + + const auto data = cache_.Access(chunk_ptr, chunk_size_b / tcnt); + + // wait on the caching task to complete, this will give time for other processes + // to make progress here which will therefore not hurt performance + + data->WaitOnCompletion(); + + // after the copy task has finished we obtain the pointer to the cached + // copy of data_b which is then used from now on + + const base_t* data_ptr = data->GetDataLocation(); + + // nullptr is still a legal return value for CacheData::GetLocation() + // even after waiting, so this must be checked + + if (data_ptr == nullptr) { + data_ptr = chunk_ptr; + std::cerr << "Cache Miss" << std::endl; + } + + uint16_t* mask_ptr_a = get_sub_mask_ptr (mask_a, chunk_id, chunk_size_w, tid, tcnt); + uint16_t* mask_ptr_b = get_sub_mask_ptr (mask_b, chunk_id, chunk_size_w, tid, tcnt); + + base_t tmp = _mm512_reduce_add_epi64(aggregator); + + if constexpr(simple){ + aggregator = aggregation::apply_masked(aggregator, data_ptr, mask_ptr_a, chunk_size_b / tcnt); + } else { + aggregator = aggregation::apply_masked(aggregator, data_ptr, mask_ptr_a, mask_ptr_b, chunk_size_b / tcnt); + } + + pvc->stop("aggr_j", tid * gcnt + gid); + trt->stop_timer(2, tid * gcnt + gid); + } + + // so threads with more runs dont wait for alerady finished threads + (*(*sync_barrier)[barrier_idx]).arrive_and_drop(); + + aggregation::happly(dest + (tid * gcnt + gid), aggregator); + } +}; \ No newline at end of file diff --git a/qdp_project/src/benchmark/pipelines/scan_filter_pipe.h b/qdp_project/src/benchmark/pipelines/scan_filter_pipe.h new file mode 100644 index 0000000..2b10b06 --- /dev/null +++ b/qdp_project/src/benchmark/pipelines/scan_filter_pipe.h @@ -0,0 +1,387 @@ + +#include +#include +#include +#include + +#include + +#include "filter.h" +#include "aggregation.h" +#include "vector_loader.h" +#include "timer_utils.h" +#include "barrier_utils.h" +#include "execution_modes.h" + + +template +class Query_Wrapper { +public: + // sync + std::shared_future* ready_future; + + thread_runtime_timing* trt; + barrier_timing* bt; + +private: + // numa + uint32_t close_mem; + uint32_t far_mem; + + // data + size_t size_b; + size_t chunk_size_b; + size_t chunk_size_w; + size_t chunk_cnt; + base_t* data_a; + base_t* data_b; + base_t* dest; + + // ratios + uint32_t thread_count_fc; + uint32_t thread_count_fi; + uint32_t thread_count_ag; + uint32_t thread_group; + + // done bits + volatile uint8_t* ready_flag_a; + volatile uint8_t* ready_flag_b; + std::mutex ready_a_m; + std::mutex ready_b_m; + + // buffer + uint16_t* mask_a; + uint16_t* mask_b; + base_t** buffer_b; + + // params + base_t cmp_a; + base_t cmp_b; + bool no_copy; + PMode mode; + + // sync + std::unique_ptr*>> sync_barrier; + std::string barrier_mode = BARRIER_MODE; + + using filterCopy = Filter; + using filterNoCopy = Filter; + using filter = Filter; + using aggregation = Aggregation; + +public: + + + Query_Wrapper(std::shared_future* rdy_fut, size_t workload_b, size_t chunk_size_b, base_t* data_a, + base_t* data_b, base_t* dest, uint32_t numa_close, uint32_t numa_far, uint32_t tc_fi, uint32_t 
tc_fc, uint32_t tc_ag, + PMode mode, uint32_t thread_group, base_t cmp_a = 50, base_t cmp_b = 42, bool no_copy = false) : + ready_future(rdy_fut), size_b(workload_b), chunk_size_b(chunk_size_b), data_a(data_a), data_b(data_b), + dest(dest), close_mem(numa_close), far_mem(numa_far), mode(mode), thread_group(thread_group), cmp_a(cmp_a), cmp_b(cmp_b), no_copy(no_copy){ + + chunk_size_w = chunk_size_b / sizeof(base_t); + chunk_cnt = size_b / chunk_size_b; + thread_count_fi = tc_fi; + thread_count_fc = tc_fc; + thread_count_ag = tc_ag; + + ready_flag_a = (volatile uint8_t *) numa_alloc_onnode( + chunk_cnt * thread_count_fi / 8 + ((chunk_cnt * thread_count_fi % 8) != 0), close_mem); + ready_flag_b = (volatile uint8_t *) numa_alloc_onnode( + chunk_cnt * thread_count_fc / 8 + ((chunk_cnt * thread_count_fc % 8) != 0), close_mem); + + mask_a = (uint16_t *) numa_alloc_onnode(size_b / sizeof(base_t), close_mem); + mask_b = (uint16_t *) numa_alloc_onnode(size_b / sizeof(base_t), close_mem); + + trt = new thread_runtime_timing(4, 20, close_mem); + bt = new barrier_timing(4, 20, close_mem); + reset_barriers(); + + if constexpr(BUFFER_LIMIT==1) { + // TODO size ok like that? + buffer_b = (base_t**) numa_alloc_onnode(size_b * sizeof(base_t*), close_mem); + buffer_b[0] = (base_t*) numa_alloc_onnode(thread_group * chunk_size_b, close_mem); + buffer_b[1] = (base_t*) numa_alloc_onnode(thread_group * chunk_size_b, close_mem); + } else { + buffer_b = (base_t **) numa_alloc_onnode(sizeof(base_t*), close_mem); + base_t* buffer_tmp = (base_t *) numa_alloc_onnode(size_b, close_mem); + *buffer_b = buffer_tmp; + } + }; + + void reset_barriers(){ + if(sync_barrier != nullptr) { + for(auto& barrier : *sync_barrier) { + delete barrier; + } + sync_barrier.reset(); + } + + sync_barrier = std::make_unique*>>(thread_group); + uint32_t thread_count_sum = thread_count_ag + thread_count_fi + thread_count_fc; + uint32_t barrier_count = barrier_mode.compare("global") == 0 ? 1 : thread_group; + uint32_t barrier_thread_count; + + if constexpr(simple){ + barrier_thread_count = (thread_group / barrier_count) * + (mode == PMode::expl_copy ? 
thread_count_sum : (thread_count_ag + thread_count_fi)); + } else { + barrier_thread_count = (thread_group / barrier_count) * thread_count_sum; + } + for(uint32_t i = 0; i < barrier_count; ++i) { + (*sync_barrier)[i] = new std::barrier(barrier_thread_count); + } + } + + + void clear_buffers () { + std::memset((void*)ready_flag_a, 0x00, chunk_cnt * thread_count_fi / 8 + ((chunk_cnt * thread_count_fi % 8) != 0)); + std::memset((void*)ready_flag_b, 0x00, chunk_cnt * thread_count_fc / 8 + ((chunk_cnt * thread_count_fc % 8) != 0)); + + std::memset(mask_a, 0x00, size_b / sizeof(base_t)); + std::memset(mask_b, 0x00, size_b / sizeof(base_t)); + if constexpr(BUFFER_LIMIT==1) { + std::memset(buffer_b[0], 0x00, thread_group * chunk_size_b); + std::memset(buffer_b[1], 0x00, thread_group * chunk_size_b); + } else { + std::memset(*buffer_b, 0x00, size_b); + } + + trt->reset_accumulator(); + bt->reset_accumulator(); + reset_barriers(); + }; + + ~Query_Wrapper() { + numa_free((void*)ready_flag_a, + chunk_cnt * thread_count_fi / 8 + ((chunk_cnt * thread_count_fi % 8) != 0)); + numa_free((void*)ready_flag_b, + chunk_cnt * thread_count_fc / 8 + ((chunk_cnt * thread_count_fc % 8) != 0)); + + numa_free(mask_a, size_b / sizeof(base_t)); + numa_free(mask_b, size_b / sizeof(base_t)); + if constexpr(BUFFER_LIMIT==1) { + numa_free(buffer_b[0], thread_group * chunk_size_b); + numa_free(buffer_b[1], thread_group * chunk_size_b); + numa_free(buffer_b, size_b * sizeof(base_t*)); + } else { + numa_free(*buffer_b, size_b); + } + + delete trt; + for(auto& barrier : *sync_barrier) { + delete barrier; + } + delete bt; + + }; + +private: + static inline base_t* get_sub_chunk_ptr(base_t* base_ptr, size_t chunk_id, size_t chunk_size_w, size_t tid, + size_t tcnt) { + base_t* chunk_ptr = base_ptr + chunk_id * chunk_size_w; + return chunk_ptr + tid * (chunk_size_w / tcnt); + } + + static inline uint16_t* get_sub_mask_ptr(uint16_t* base_ptr, size_t chunk_id, size_t chunk_size_w, size_t tid, + size_t tcnt) { + // 16 integer are addressed with one uint16_t in mask buffer + size_t offset = chunk_id * chunk_size_w + tid * (chunk_size_w / tcnt); + return base_ptr + (offset / 16); + } + + static bool bit_at(volatile uint8_t* bitmap, uint32_t bitpos) { + uint8_t value = bitmap[bitpos / 8]; + switch(bitpos % 8) { + case 0: return value & 0b00000001; + case 1: return value & 0b00000010; + case 2: return value & 0b00000100; + case 3: return value & 0b00001000; + case 4: return value & 0b00010000; + case 5: return value & 0b00100000; + case 6: return value & 0b01000000; + case 7: return value & 0b10000000; + default: return false; + } + } + + static void set_bit_at(volatile uint8_t* bitmap, std::mutex& mutex, uint32_t bitpos) { + mutex.lock(); + switch(bitpos % 8) { + case 0: bitmap[bitpos / 8] |= 0b00000001;break; + case 1: bitmap[bitpos / 8] |= 0b00000010;break; + case 2: bitmap[bitpos / 8] |= 0b00000100;break; + case 3: bitmap[bitpos / 8] |= 0b00001000;break; + case 4: bitmap[bitpos / 8] |= 0b00010000;break; + case 5: bitmap[bitpos / 8] |= 0b00100000;break; + case 6: bitmap[bitpos / 8] |= 0b01000000;break; + case 7: bitmap[bitpos / 8] |= 0b10000000;break; + } + mutex.unlock(); + } + +public: + + static base_t checksum(base_t* a, base_t* b, base_t cmp_a, base_t cmp_b, size_t size_b) { + base_t sum = 0; + for(int i = 0; i < size_b / sizeof(base_t); ++i) { + if(a[i] >= cmp_a && b[i] <= cmp_b) { + sum += b[i]; + } + } + return sum; + } + + static void checkmask(uint16_t* mask, base_t cmp, base_t* data, size_t size_b, bool leq) { + uint32_t 
cnt = 0; + for(int i = 0; i < size_b / sizeof(base_t); ++i) { + if(leq) { + if(((data[i] <= cmp) != bit_at((uint8_t*)mask, i))) { + ++cnt; + } + } else { + if(((data[i] >= cmp) != bit_at((uint8_t*)mask, i))) { + ++cnt; + } + } + } + } + + static void checkmask_16(uint16_t* mask, base_t cmp, base_t* data, size_t size_b, bool leq) { + for(int i = 0; i < size_b / sizeof(base_t) / 16 ; ++i) { + std::bitset<16> m(mask[i]); + uint16_t ch = 0; + for(int j = 0; j < 16; ++j) { + if(data[i*16 + j] <= cmp) { + ch |= 0x1 << j; + } + } + std::bitset<16> c(ch); + + std::cout << "act " << m << std::endl; + std::cout << "rea " << c << std::endl << std::endl; + } + } + + + void scan_b(size_t gid, size_t gcnt, size_t tid) { + size_t tcnt = thread_count_fc; + assert(chunk_size_w % tcnt == 0); + assert(chunk_size_w % 16 == 0); + assert(chunk_size_w % tcnt * 16 == 0); + + // wait till everyone can start + ready_future->wait(); + + // the lower gids run once more if the chunks are not evenly distributable + uint32_t runs = chunk_cnt / gcnt + (chunk_cnt % gcnt > gid); + uint32_t barrier_idx = barrier_mode.compare("global") == 0 ? 0 : gid; + for(uint32_t i = 0; i < runs; ++i) { + trt->start_timer(1, tid * gcnt + gid); + + // calculate pointers + size_t chunk_id = gid + gcnt * i; + base_t* chunk_ptr = get_sub_chunk_ptr(data_b , chunk_id, chunk_size_w, tid, tcnt); + uint16_t* mask_ptr = get_sub_mask_ptr (mask_b , chunk_id, chunk_size_w, tid, tcnt); + + if constexpr(simple){ + base_t* buffer_ptr; + if constexpr(BUFFER_LIMIT==1) { + buffer_ptr = get_sub_chunk_ptr(buffer_b[i % 2], gid, chunk_size_w, tid, tcnt); + } else { + buffer_ptr = get_sub_chunk_ptr(*buffer_b, chunk_id, chunk_size_w, tid, tcnt); + } + std::memcpy(buffer_ptr, chunk_ptr, chunk_size_b / tcnt); + } else { + if(no_copy) { + filterNoCopy::apply_same(mask_ptr, nullptr, chunk_ptr, cmp_b, chunk_size_b / tcnt); + } else { + base_t* buffer_ptr; + if constexpr(BUFFER_LIMIT==1) { + buffer_ptr = get_sub_chunk_ptr(buffer_b[i % 2], gid, chunk_size_w, tid, tcnt); + } else { + buffer_ptr = get_sub_chunk_ptr(*buffer_b, chunk_id, chunk_size_w, tid, tcnt); + } + filterCopy::apply_same(mask_ptr, buffer_ptr, chunk_ptr, cmp_b, chunk_size_b / tcnt); + } + } + + trt->stop_timer(1, tid * gcnt + gid); + bt->timed_wait(*(*sync_barrier)[barrier_idx], 1, tid * gcnt + gid); + + } + (*(*sync_barrier)[barrier_idx]).arrive_and_drop(); + + } + + void scan_a(size_t gid, size_t gcnt, size_t tid) { + size_t tcnt = thread_count_fi; + assert(chunk_size_w % tcnt == 0); + assert(chunk_size_w % 16 == 0); + assert(chunk_size_w % tcnt * 16 == 0); + + // wait till everyone can start + ready_future->wait(); + + // the lower gids run once more if the chunks are not evenly distributable + uint32_t runs = chunk_cnt / gcnt + (chunk_cnt % gcnt > gid); + uint32_t barrier_idx = barrier_mode.compare("global") == 0 ? 
0 : gid; + for(uint32_t i = 0; i < runs; ++i) { + trt->start_timer(0, tid * gcnt + gid); + // calculate pointers + size_t chunk_id = gid + gcnt * i; + base_t* chunk_ptr = get_sub_chunk_ptr(data_a, chunk_id, chunk_size_w, tid, tcnt); + uint16_t* mask_ptr = get_sub_mask_ptr (mask_a, chunk_id, chunk_size_w, tid, tcnt); + + filter::apply_same(mask_ptr, nullptr, chunk_ptr, cmp_a, chunk_size_b / tcnt); + + trt->stop_timer(0, tid * gcnt + gid); + bt->timed_wait(*(*sync_barrier)[barrier_idx], 0, tid * gcnt + gid); + } + (*(*sync_barrier)[barrier_idx]).arrive_and_drop(); + } + + void aggr_j(size_t gid, size_t gcnt, size_t tid) { + size_t tcnt = thread_count_ag; + // wait till everyone can start + ready_future->wait(); + + // calculate values + __m512i aggregator = aggregation::OP::zero(); + // the lower gids run once more if the chunks are not evenly distributable + uint32_t runs = chunk_cnt / gcnt + (chunk_cnt % gcnt > gid); + uint32_t barrier_idx = barrier_mode.compare("global") == 0 ? 0 : gid; + for(uint32_t i = 0; i < runs; ++i) { + + bt->timed_wait(*(*sync_barrier)[barrier_idx], 2, tid * gcnt + gid); + trt->start_timer(2, tid * gcnt + gid); + + // calculate pointers + size_t chunk_id = gid + gcnt * i; + base_t* chunk_ptr; + if(no_copy) { + chunk_ptr = get_sub_chunk_ptr(data_b, chunk_id, chunk_size_w, tid, tcnt); + } else { + if constexpr(BUFFER_LIMIT==1) { + chunk_ptr = get_sub_chunk_ptr(buffer_b[i%2], gid, chunk_size_w, tid, tcnt); + } else { + chunk_ptr = get_sub_chunk_ptr(*buffer_b, chunk_id, chunk_size_w, tid, tcnt); + } + } + uint16_t* mask_ptr_a = get_sub_mask_ptr (mask_a, chunk_id, chunk_size_w, tid, tcnt); + uint16_t* mask_ptr_b = get_sub_mask_ptr (mask_b, chunk_id, chunk_size_w, tid, tcnt); + + base_t tmp = _mm512_reduce_add_epi64(aggregator); + if constexpr(simple){ + aggregator = aggregation::apply_masked(aggregator, chunk_ptr, mask_ptr_a, chunk_size_b / tcnt); + } else { + aggregator = aggregation::apply_masked(aggregator, chunk_ptr, mask_ptr_a, mask_ptr_b, chunk_size_b / tcnt); + } + trt->stop_timer(2, tid * gcnt + gid); + } + + // so threads with more runs dont wait for finished threads + (*(*sync_barrier)[barrier_idx]).arrive_and_drop(); + + aggregation::happly(dest + (tid * gcnt + gid), aggregator); + } +}; \ No newline at end of file diff --git a/qdp_project/src/utils/array_utils.h b/qdp_project/src/utils/array_utils.h new file mode 100644 index 0000000..52eba76 --- /dev/null +++ b/qdp_project/src/utils/array_utils.h @@ -0,0 +1,80 @@ +#pragma once +#include +#include +#include +#include +#include +#include + +#include + +/// @brief Fills a given array with random generated integers. +/// @tparam base_t Datatype of the array +/// @param dest Pointer to the array +/// @param size Size of the array +/// @param min Minumum value of the generated integers +/// @param max Maximum value of the generated integers +template +void fill(base_t * dest, uint64_t size, base_t min, base_t max) { + std::srand(std::time(nullptr)); + for(uint64_t i = 0; i < size/sizeof(base_t); ++i) { + dest[i] = (std::rand() % (max - min)) + min; + } +} + +/// @brief Fills a given array with random generated integers using the mersenne twister engine (type std::mt19937). 
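+/// A seed argument of 0 (the default) derives the seed from std::random_device combined with the
+/// current system and high-resolution clock values; any other value is used as the seed directly.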
+/// @tparam base_t Datatype of the array +/// @param dest Pointer to the array +/// @param size Size of the array +/// @param min Minumum value of the generated integers +/// @param max Maximum value of the generated integers +template +void fill_mt(T* array, uint64_t size, T min, T max, uint64_t int_seed = 0) { + static_assert(std::is_integral::value, "Data type is not integral."); + + size = size / sizeof(T); + + std::mt19937::result_type seed; + if (int_seed == 0) { + std::random_device rd; + seed = rd() ^ ( + (std::mt19937::result_type) std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()).count() + + (std::mt19937::result_type) std::chrono::duration_cast( + std::chrono::high_resolution_clock::now().time_since_epoch()).count()); + } else seed = int_seed; + + std::mt19937 gen(seed); + std::uniform_int_distribution distrib(min, max); + + for (uint64_t j = 0; j < size; ++j) { + array[j] = distrib(gen); + } + +} + +/** + * @brief Checks if two arrays of the integral type *T* contain the same values + * + * @tparam T Integral type of *array0* and *array1* + * @param array0 Array 0 to check + * @param array1 Array 1 to check + * @param size_b Size of the two arrays in byte + * @param verbose Decides if outputs are verbose of not (print every not matching numbers with their index) + * @return bool Weathor or not the content is equal or not + */ +template +typename std::enable_if::value, bool>::type + check_same(T* array0, T* array1, size_t size_b, bool verbose) { + for(uint64_t i = 0; i <= size_b / sizeof(T); i += 64 / sizeof(T)) { + __m512i vec0 = _mm512_stream_load_si512(array0 + i); + __m512i vec1 = _mm512_stream_load_si512(array1 + i); + + __mmask8 res = _mm512_cmpeq_epi64_mask(vec0, vec1); + } + + //TODO complete function + + return false; +} + diff --git a/qdp_project/src/utils/barrier_utils.h b/qdp_project/src/utils/barrier_utils.h new file mode 100644 index 0000000..a68f801 --- /dev/null +++ b/qdp_project/src/utils/barrier_utils.h @@ -0,0 +1,73 @@ +#pragma once + +#include +#include +#include +#include + +#define BARRIER_TIMINGS 1 + + +struct barrier_completion_function { + inline void operator() () { + return; + } +}; + +struct barrier_timing { + + uint32_t time_points, time_threads; + double** time_accumulator; + + barrier_timing(uint32_t timing_points, uint32_t timing_threads, uint32_t memory_node) { +#ifdef BARRIER_TIMINGS + time_points = timing_points; + time_threads = timing_threads; + time_accumulator = (double**) numa_alloc_onnode(timing_points * sizeof(double*), memory_node); + for(uint32_t i = 0; i < timing_points; ++i) { + time_accumulator[i] = (double*) numa_alloc_onnode(timing_threads * sizeof(double), memory_node); + } +#endif + } + + ~barrier_timing() { +#ifdef BARRIER_TIMINGS + for(uint32_t i = 0; i < time_points; ++i) { + numa_free(time_accumulator[i], time_threads * sizeof(double)); + } + numa_free(time_accumulator, time_points * sizeof(double*)); +#endif + } + + void reset_accumulator() { +#ifdef BARRIER_TIMINGS + for(uint32_t i = 0; i < time_points; ++i){ + for(uint32_t j = 0; j < time_threads; ++j){ + time_accumulator[i][j] = 0.0; + }} +#endif + } + + double summarize_time(uint32_t time_point) { +#ifdef BARRIER_TIMINGS + double sum = 0.0; + for(uint32_t i = 0; i < time_threads; ++i) { + sum += time_accumulator[time_point][i]; + } + return sum; +#endif + } + + void timed_wait(std::barrier& barrier, uint32_t point_id, uint32_t thread_id) { +#ifdef BARRIER_TIMINGS + auto before_barrier = std::chrono::steady_clock::now(); +#endif + 
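+        // the barrier wait below always executes; only the surrounding time measurement is
+        // compiled in when BARRIER_TIMINGS is defined, so disabling the timings does not
+        // change the synchronization behaviour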
barrier.arrive_and_wait(); +#ifdef BARRIER_TIMINGS + auto after_barrier = std::chrono::steady_clock::now(); + uint64_t barrier_wait_time = std::chrono::duration_cast(after_barrier - before_barrier).count(); + double seconds = barrier_wait_time / (1000.0 * 1000.0 * 1000.0); + time_accumulator[point_id][thread_id] += seconds; +#endif + } +}; \ No newline at end of file diff --git a/qdp_project/src/utils/const.h b/qdp_project/src/utils/const.h new file mode 100644 index 0000000..fde4b55 --- /dev/null +++ b/qdp_project/src/utils/const.h @@ -0,0 +1,33 @@ +/** + * @file const.h + * @author André Berthold + * @brief Defines handy constants. + * @version 0.1 + * @date 2023-05-25 + * + * @copyright Copyright (c) 2023 + * + */ + +#pragma once + +#include +#include + +constexpr size_t VECTOR_SIZE_I = 512; +constexpr size_t VECTOR_SIZE_B = VECTOR_SIZE_I / 8; +constexpr size_t VECTOR_SIZE_H = VECTOR_SIZE_B / sizeof(uint32_t); +constexpr size_t VECTOR_SIZE_W = VECTOR_SIZE_B / sizeof(uint64_t); + +template +constexpr size_t VECTOR_SIZE() { + return VECTOR_SIZE_B / sizeof(T); +} + +template +constexpr size_t V_MASK_SIZE() { + return VECTOR_SIZE() / 8; +} + + +const __mmask16 full_m16 = _mm512_int2mask(0xFFFF); \ No newline at end of file diff --git a/qdp_project/src/utils/cpu_set_utils.h b/qdp_project/src/utils/cpu_set_utils.h new file mode 100644 index 0000000..ba82604 --- /dev/null +++ b/qdp_project/src/utils/cpu_set_utils.h @@ -0,0 +1,82 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +/** Sets all bits in a given cpu_set_t between L and H (condition L <= H)*/ +#define CPU_BETWEEN(L, H, SET) assert(L <= H); for(; L < H; ++L) {CPU_SET(L, SET);} + +/** + * Applies the affinity defined in set to the thread, through pthread library + * calls. If it fails it wites the problem to stderr and terminated the program. +*/ +inline void pin_thread(std::thread& thread, cpu_set_t* set) { + int error_code = pthread_setaffinity_np(thread.native_handle(), sizeof(cpu_set_t), set); + if (error_code != 0) { + std::cerr << "Error calling pthread_setaffinity_np in copy_pool assignment: " << error_code << std::endl; + exit(-1); + } +} + +/** + * Returns the cpu id of the thread_id-th cpu in a given (multi)range. Thread_id + * greater than the number of cpus in the (multi)range are valid. In this case + * the (thread_id % #cpus in the range)-th cpu in the range is returned. +*/ +int get_cpu_id(int thread_id, const std::vector>& range) { + int subrange_size = range[0].second - range[0].first; + + int i = 0; + while(subrange_size <= thread_id) { + thread_id -= subrange_size; + i = (i + 1) % range.size(); + subrange_size = range[i].second - range[i].first; + } + return thread_id + range[i].first; +} + +/*inline void cpu_set_between(cpu_set_t* set, uint32_t low, uint32_t high) { + assert(low != high); + if (low > high) std::swap(low, high); + + for(; low < high; ++low) { + CPU_SET(low, set); + } +}*/ + +/** + * Pins the given thread to the thread_id-th cpu in the given range. +*/ +void pin_thread_in_range(std::thread& thread, int thread_id, std::vector>& range) { + cpu_set_t set; + CPU_ZERO(&set); + CPU_SET(get_cpu_id(thread_id, range), &set); + + pin_thread(thread, &set); +} + +/** + * Pins the given thread to all cpus in the given range. 
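+ * Illustrative call (thread name and core ranges are made up; the pair layout follows
+ * get_cpu_id above):
+ *   std::vector<std::pair<int, int>> cores = { {0, 8}, {16, 24} };
+ *   pin_thread_in_range(worker_thread, cores);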
+*/ +void pin_thread_in_range(std::thread& thread, std::vector>& range) { + cpu_set_t set; + CPU_ZERO(&set); + for(auto r : range) { CPU_BETWEEN(r.first, r.second, &set); } + + pin_thread(thread, &set); +} + +/** + * Pins the given thread to all cpu ids between low (incl.) and high (excl.). +*/ +inline void pin_thread_between(std::thread& thread, uint32_t low, uint32_t high) { + cpu_set_t set; + CPU_ZERO(&set); + CPU_BETWEEN(low, high, &set); + + pin_thread(thread, &set); +} \ No newline at end of file diff --git a/qdp_project/src/utils/execution_modes.h b/qdp_project/src/utils/execution_modes.h new file mode 100644 index 0000000..ca04b4f --- /dev/null +++ b/qdp_project/src/utils/execution_modes.h @@ -0,0 +1,89 @@ +#include + +enum PMode{no_copy = 0, hbm = 1, expl_copy = 2}; +struct mode_manager { + static inline PMode inc(PMode value) { + return static_cast(value + 1); + }; + static inline bool pred(PMode value) { + return no_copy <= value && value <= expl_copy; + }; + static std::string string(PMode value) { + switch(value) { + case no_copy: return "no_copy"; + case hbm: return "hbm_pre"; + case expl_copy:return "expl_co"; + } return "no_copy"; + }; +}; + +#define SIMPLE_Q 0 +#define COMPLEX_Q 1 + +#define SCAN_A 0 +#define SCAN_B 1 +#define AGGR_J 2 + +enum NewPMode{DRAM_base = 0, HBM_base = 1, Mixed_base = 2, Prefetch = 3}; +struct new_mode_manager { + /*constexpr static int thread_counts[2][4][3] = { + //simple query + //scan_a, scan_b, aggr_j + {{3, 0, 3}, // DRAM_base + {3, 0, 3}, // HBM_base + {3, 0, 3}, // Mixed_base + {1, 4, 1}},// Prefetching + //complex query + {{1, 4, 1}, // DRAM_base + {1, 4, 1}, // HBM_base + {1, 4, 1}, // Mixed_base + {1, 4, 1}},// Prefetching + };*/ + + /*constexpr static int thread_counts[2][4][3] = { + //simple query + //scan_a, scan_b, aggr_j + {{2, 0, 4}, // DRAM_base + {2, 0, 4}, // HBM_base + {2, 0, 4}, // Mixed_base + {1, 4, 1}},// Prefetching + //complex query + {{1, 4, 1}, // DRAM_base + {1, 4, 1}, // HBM_base + {1, 4, 1}, // Mixed_base + {1, 4, 1}},// Prefetching + };*/ + + constexpr static int thread_counts[2][4][3] = { + //simple query + //scan_a, scan_b, aggr_j + {{4, 0, 2}, // DRAM_base + {4, 0, 2}, // HBM_base + {4, 0, 2}, // Mixed_base + {1, 4, 1}},// Prefetching + //complex query + {{1, 4, 1}, // DRAM_base + {1, 4, 1}, // HBM_base + {1, 4, 1}, // Mixed_base + {1, 4, 1}},// Prefetching + }; + + static inline NewPMode inc(NewPMode value) { + return static_cast(value + 1); + }; + static inline bool pred(NewPMode value) { + return DRAM_base <= value && value <= Prefetch; + }; + static int thread_count(uint8_t query_type, NewPMode mode, uint8_t thread_type){ + if(query_type > 1) query_type = 1; + if(thread_type > 2) thread_type = 2; + return (thread_counts[query_type][mode][thread_type]); + }; + static std::string string(NewPMode value) { + switch(value) { + case DRAM_base: return "DRAM_Baseline"; + case HBM_base: return "HBM_Baseline"; + case Mixed_base: return "DRAM_HBM_Baseline"; + } return "Q-d_Prefetching"; + }; +}; \ No newline at end of file diff --git a/qdp_project/src/utils/file_output.h b/qdp_project/src/utils/file_output.h new file mode 100644 index 0000000..1dd85ba --- /dev/null +++ b/qdp_project/src/utils/file_output.h @@ -0,0 +1,76 @@ +/** + * @file file_output.h + * @author André Berthold + * @brief Implements a template-function that accepts an arbitrary number of parameters that should be printed + * @version 0.1 + * @date 2023-05-25 + * + * @copyright Copyright (c) 2023 + * + */ +#pragma once + +#include +#include +#include + 
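+// the helpers below serialize an arbitrary list of values into a single comma-separated line,
+// which the terminating print_to_file overload ends with std::endl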
+#include "iterable_range.h" + +template +inline constexpr bool is_numeric_v = std::disjunction< + std::is_integral, + std::is_floating_point>::value; + +/** + * @brief Converts a parameter to a string by either using it directly or its member current (if it is of type Labeled) + * as parameter to the std::string-Constructor. + * + * @tparam T Type of the parameter + * @param value Parameter to be converted + * @return std::string The converted parameter + */ +template +inline std::string to_string(T value) { + if constexpr(std::is_base_of::value){ + // integrals cannot be use in the string constructor and must be translated by the std::to_string-function + if constexpr (is_numeric_v) { + return std::to_string(value.current); + } else { + return std::string(value.current); + } + } else { + // integrals cannot be use in the string constructor and must be translated by the std::to_string-function + if constexpr (is_numeric_v) { + return std::to_string(value); + } else { + return std::string(value); + } + } +} + +/** + * @brief This function wites the content of *val* to *file*. Terminates terecursive function definition. + * + * @tparam type Type of the paramter *val* (is usually implicitly defeined) + * @param file File that is written to + * @param val Value that is translated to a char stream and written to the file + */ +template +inline void print_to_file(std::ofstream &file, type val) { + file << to_string(val) << std::endl; +} + +/** + * @brief This function wites the content of *val* and that content if *vals* to *file*. + * + * @tparam type Type of the paramter *val* (is usually implicitly defeined) + * @tparam types Parameter pack that describes the types of *vals* + * @param file File that is written to + * @param val Value that is translated to a char stream and written to the file + * @param vals Paramater pack of values that are gonna be printed to the file + */ +template +inline void print_to_file(std::ofstream &file, type val, types ... vals) { + file << to_string(val) << ","; + print_to_file(file, vals...); +} \ No newline at end of file diff --git a/qdp_project/src/utils/iterable_range.h b/qdp_project/src/utils/iterable_range.h new file mode 100644 index 0000000..95fc57e --- /dev/null +++ b/qdp_project/src/utils/iterable_range.h @@ -0,0 +1,208 @@ + #pragma once + +#include +#include +#include + + +constexpr auto NO_NEXT = "false"; + +/** + * @brief Class that adds an label member-parameter to a sub-class + * + */ +class Labeled { +public: + std::string label; +public: + Labeled(std::string str) : label(str) {}; + Labeled(const char* str) { this->label = std::string(str); }; +}; + +/** + * @brief Converts a parameter to a string by either reading the member label (if it is of type Labeled) or using it + * as parameter to the std::string-Constructor. + * + * @tparam T Type of the parameter + * @param value Parameter to be converted + * @return std::string The converted parameter + */ +template +inline std::string generateHead(T value) { + if constexpr(std::is_base_of::value){ + return value.label; + } else { + return std::string(value); + } +} + +/** + * @brief Converts a parameter-pack to a string calling genarateHead(T) on every parameter and concatenatin the results. 
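+ *
+ * For example, generateHead(some_range, "iteration") yields "chunk_size,iteration" when
+ * some_range is a Labeled object whose label is "chunk_size" (names chosen purely for illustration).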
+ * + * @tparam T Type of the first parameter + * @tparam Ts Parameter pack specifying the preceeding parameters' types + * @param value Parameter to be transformed + * @param values Parameter-pack of the next prameters to be transformed + * @return std::string Comma-separated concatenation of all parameters string representation + */ +template +inline std::string generateHead(T value, Ts... values) { + return generateHead(value) + ',' + generateHead(values...); +} + + +/** + * @brief Takes a single Range object and calls its next function. + * + * @tparam T Specific type of the Range object + * @param t Instance of the Range object + * @return std::string Label of the Range object or "false" if the Range reaced its end and was reset + */ +template +std::string IterateOnce(T& t) { + if(t.next()) return t.label; + else t.reset(); + return std::string(NO_NEXT); //the string signalises that the iteration has to be terminiated. +} + +/** + * @brief Takes a number of Range objects and recusively increments them till the first Range does not reach its end + * upon incrementing. It tarts at the first Range object given. Every Range object that reached its end is reset to + * its start value. + * + * @tparam T Specific type of the first Range object + * @tparam Ts Types to the following Range objects + * @param t First instance of the Range object + * @param ts Parameter pack of the following Range objects + * @return std::string Label of the highest index Range object that was altered, or "false" if the last Range object + * reache its end and was reset + */ +template +std::string IterateOnce(T& t , Ts&... ts) { + if(t.next()) return t.label; + else t.reset(); + return IterateOnce(ts...); +} + + +/** + * @brief Class that provides a convenient interface for iteratin throug a parameter range. It stores a public value + * that can be altered by the classes' methods. + * + * @tparam T Base type of the parameter + * @tparam INIT Initial value of the current pointer + * @tparam PRED Struct providing an apply function testing if the current value is in range or not + * @tparam INC Struct providing an apply function setting the current value to the value following the current value + */ +template +class Range : public Labeled { +public: + /** + * @brief Current value of the parameter + */ + T current = INIT; + + /** + * @brief Resets current to its initial value + */ + void reset() {current = INIT; }; + + /** + * @brief Sets current to its next value (according to INC::inc) and returns if the range Reached its end + * (accordingt to PRED::pred). + * + * @return true The newly assigned value of current is in the range + * @return false Otherwise + */ + bool next() { + current = INC::inc(current); + return PRED::pred(current); + }; + + /** + * @brief Checks if current is in the Range (according to PRED). + * + * @return true PRED returns true + * @return false Otherwise + */ + bool valid() { return PRED::apply(current); }; +}; + +/** + * @brief Class that is in contrast to Range specialized for integral values. 
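+ *
+ * A minimal increment policy for illustration (the struct name is made up and not part of this header):
+ *   struct IncByOne { static uint32_t inc(uint32_t v) { return v + 1; } };
+ *   Int_Range<uint32_t, 0, 8, IncByOne> runs{"runs"}; // current counts up to (excluding) 8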
+ * + * @tparam T Integral base type of the Range + * @tparam INIT Initial value of the parameter + * @tparam MAX Maximal value of the parameter + * @tparam INC Struct providing an apply function setting the current value to the value following the current value + */ +template +class Int_Range : public Labeled { +static_assert(std::is_integral::value, "Int_Range requires an integral base type"); + +public: + const T max = MAX; + T current = INIT; + + void reset() {current = INIT; }; + + bool next() { + current = INC::inc(current); + return current < MAX; + }; + + bool valid() { return current < MAX; }; + +}; + +/** + * @brief Class that is in contrast to Int_Range specialized for integrals that grow linearly. + * + * @tparam T Integral base type of the Range + * @tparam INIT Initial value of the parameter + * @tparam MAX Maximal value of the parameter + * @tparam STEP Increase of the value per next()-call + */ +template +class Linear_Int_Range : public Labeled { +static_assert(std::is_integral::value, "Linear_Int_Range requires an integral base type"); + +public: + const T max = MAX; + T current = INIT; + + void reset() {current = INIT; }; + + bool next() { + current += STEP; + return current < MAX; + }; + + bool valid() { return current < MAX; }; +}; + +/** + * @brief Class that is in contrast to Int_Range specialized for integrals that grow exponetially. + * + * @tparam T Integral base type of the Range + * @tparam INIT Initial value of the parameter + * @tparam MAX Maximal value of the parameter + * @tparam FACTOR Multiplicative Increase of the value per next()-call + */ +template +class Exp_Int_Range : public Labeled { +static_assert(std::is_integral::value, "Exp_Int_Range requires an integral base type"); + +public: + const T max = MAX; + T current = INIT; + + void reset() {current = INIT; }; + + bool next() { + current *= FACTOR; + return current < MAX; + }; + + bool valid() { return current < MAX; }; +}; \ No newline at end of file diff --git a/qdp_project/src/utils/measurement_utils.h b/qdp_project/src/utils/measurement_utils.h new file mode 100644 index 0000000..f403de0 --- /dev/null +++ b/qdp_project/src/utils/measurement_utils.h @@ -0,0 +1,152 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include + + +#if PCM_M == 1 +#define PCM_MEASURE 1 +#include "pcm.h" +#endif + + + +struct pcm_value_collector { + const uint32_t value_count = 6; + + uint32_t threads; + std::vector points; +#ifdef PCM_MEASURE + pcm::SystemCounterState** states; +#endif + uint64_t** collection; + + pcm_value_collector(const std::vector& in_points, uint32_t threads, uint32_t memory_node) : threads(threads) { +#ifdef PCM_MEASURE + points = std::vector(in_points); + + collection = (uint64_t**) numa_alloc_onnode(threads * sizeof(uint64_t*), memory_node); + states = (pcm::SystemCounterState**) numa_alloc_onnode(threads * sizeof(pcm::SystemCounterState*), memory_node); + for(int i = 0; i < threads; ++i) { + collection[i] = (uint64_t*) numa_alloc_onnode(points.size() * value_count * sizeof(uint64_t), memory_node); + states[i] = (pcm::SystemCounterState*) numa_alloc_onnode(points.size() * sizeof(pcm::SystemCounterState), memory_node); + } +#endif + } + + ~pcm_value_collector() { +#ifdef PCM_MEASURE + for(int i = 0; i < threads; ++i) { + numa_free(collection[threads], points.size() * value_count * sizeof(uint64_t)); + } + numa_free(collection, threads * sizeof(uint64_t*)); + numa_free(states, threads * sizeof(pcm::SystemCounterState)); +#endif + } + + void reset() { +#ifdef PCM_MEASURE + 
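+        // zero the accumulated counter deltas for every thread and measurement point; the stored
+        // SystemCounterState snapshots need no reset, start() overwrites them on the next measurement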
for(int i = 0; i < threads; ++i) + for(uint32_t j = 0; j < points.size() * value_count; ++j){ + collection[i][j] = 0; + } +#endif + } + + int64_t point_index(const std::string& value) { + auto it = std::find(points.begin(), points.end(), value); + + if(it == points.end()) return -1; + else return it - points.begin(); + } + + std::vector summarize(const std::string &point) { +#ifdef PCM_MEASURE + std::vector sums(value_count); + int64_t idx = point_index(point); + if(idx < 0) return sums; + + for(uint32_t v = 0; v < value_count; ++v) { + for(uint32_t i = 0; i < threads; ++i) { + sums[v] += collection[i][static_cast(idx) + points.size() * v]; + } + } + return sums; +#endif + return std::vector {0}; + } + + std::string summarize_as_string(const std::string &point) { +#ifdef PCM_MEASURE + auto summary = summarize(point); + auto it = summary.begin(); + auto end = summary.end(); + + if(it >= end) return ""; + + std::string result(""); + result += std::to_string(*it); + ++it; + + while(it < end) { + result += ","; + result += std::to_string(*it); + ++it; + } + return result; +#endif + return ""; + } + + void start(const std::string& point, uint32_t thread) { +#ifdef PCM_MEASURE + int64_t idx = point_index(point); + if(idx < 0) { + std::cerr << "Invalid 'point' given. Ignored!" << std::endl; + return; + } + + states[thread][static_cast(idx)] = pcm::getSystemCounterState(); +#endif + } + + static std::string getHead(const std::string& point) { + return point + "_l2h," + + point + "_l2m," + + point + "_l3h," + + point + "_l3hns," + + point + "_l3m," + + point + "_mc"; + } + +#ifdef PCM_MEASURE + void read_values(uint32_t point_idx, uint32_t thread, pcm::SystemCounterState& start, pcm::SystemCounterState& end) { + collection[thread][point_idx + points.size() * 0] += getL2CacheHits(start, end); + collection[thread][point_idx + points.size() * 1] += getL2CacheMisses(start, end); + collection[thread][point_idx + points.size() * 2] += getL3CacheHits(start, end); + collection[thread][point_idx + points.size() * 3] += getL3CacheHitsNoSnoop(start, end); + collection[thread][point_idx + points.size() * 4] += getL3CacheMisses(start, end); + collection[thread][point_idx + points.size() * 5] += getBytesReadFromMC(start, end); + } +#endif + + void stop(const std::string& point, uint32_t thread) { +#ifdef PCM_MEASURE + auto state = pcm::getSystemCounterState(); + + int64_t idx = point_index(point); + if(idx < 0) { + std::cerr << "Invalid 'point' given. Ignored!" << std::endl; + return; + } + + auto start = states[thread][static_cast(idx)]; + read_values(static_cast(idx), thread, start, state); +#endif + } +}; diff --git a/qdp_project/src/utils/memory_literals.h b/qdp_project/src/utils/memory_literals.h new file mode 100644 index 0000000..bcf6395 --- /dev/null +++ b/qdp_project/src/utils/memory_literals.h @@ -0,0 +1,45 @@ +/** + * @file memory_literals.h + * @author André Berthold + * @brief Defines some operators that ease to define a certain size of memory. + * e.g. 
to alloc 3 Gib (Gibibit = 2^30 bit) of memory one can now simply write: "std::malloc(3_Gib)" + * to alloc 512 MB (Megabyte = 10^2 byte) of memory one can now simply write: "std::malloc(512_MB)" + * @version 0.1 + * @date 2023-05-25 + * + * @copyright Copyright (c) 2023 + * + */ +#pragma once + +#include + +typedef const unsigned long long int ull_int; +//***************************************************************************// +// Bit **********************************************************************// +//***************************************************************************// +constexpr size_t operator ""_b(ull_int value) { + // one byte is 8 bit + one byte if bit is no multiple of 8 + return value / 8 + value % 8; +} +constexpr size_t operator ""_kb (ull_int value) { return value * 1000 / 8; } +constexpr size_t operator ""_kib(ull_int value) { return value * 1024 / 8; } +constexpr size_t operator ""_Mb (ull_int value) { return value * 1000 * 1000 / 8; } +constexpr size_t operator ""_Mib(ull_int value) { return value * 1024 * 1024 / 8; } +constexpr size_t operator ""_Gb (ull_int value) { return value * 1000 * 1000 * 1000 / 8; } +constexpr size_t operator ""_Gib(ull_int value) { return value * 1024 * 1024 * 1024 / 8; } +constexpr size_t operator ""_Tb (ull_int value) { return value * 1000 * 1000 * 1000 * 1000 / 8; } +constexpr size_t operator ""_Tib(ull_int value) { return value * 1024 * 1024 * 1024 * 1024 / 8; } + +//***************************************************************************// +// Byte *********************************************************************// +//***************************************************************************// +constexpr size_t operator ""_B (ull_int value) { return value; } +constexpr size_t operator ""_kB (ull_int value) { return value * 1000; } +constexpr size_t operator ""_kiB(ull_int value) { return value * 1024; } +constexpr size_t operator ""_MB (ull_int value) { return value * 1000 * 1000; } +constexpr size_t operator ""_MiB(ull_int value) { return value * 1024 * 1024; } +constexpr size_t operator ""_GB (ull_int value) { return value * 1000 * 1000 * 1000; } +constexpr size_t operator ""_GiB(ull_int value) { return value * 1024 * 1024 * 1024; } +constexpr size_t operator ""_TB (ull_int value) { return value * 1000 * 1000 * 1000 * 1000; } +constexpr size_t operator ""_TiB(ull_int value) { return value * 1024 * 1024 * 1024 * 1024; } \ No newline at end of file diff --git a/qdp_project/src/utils/pcm.h b/qdp_project/src/utils/pcm.h new file mode 100644 index 0000000..91a19e0 --- /dev/null +++ b/qdp_project/src/utils/pcm.h @@ -0,0 +1,6 @@ +#pragma once +//this file includes all important header from the pcm repository +#include "cpucounters.h" +#include "msr.h" +#include "pci.h" +#include "mutex.h" diff --git a/qdp_project/src/utils/timer_utils.h b/qdp_project/src/utils/timer_utils.h new file mode 100644 index 0000000..b6ec54f --- /dev/null +++ b/qdp_project/src/utils/timer_utils.h @@ -0,0 +1,80 @@ +#pragma once + +#include +#include +#include + +#include + +#define THREAD_TIMINGS 1 + + + +struct thread_runtime_timing { + using time_point_t = std::chrono::time_point; + + uint32_t time_points, time_threads; + time_point_t** start_times; + double** time_accumulator; + + thread_runtime_timing(uint32_t timing_points, uint32_t timing_threads, uint32_t memory_node) { +#ifdef THREAD_TIMINGS + time_points = timing_points; + time_threads = timing_threads; + start_times = (time_point_t**) numa_alloc_onnode(timing_points * 
diff --git a/qdp_project/src/utils/pcm.h b/qdp_project/src/utils/pcm.h
new file mode 100644
index 0000000..91a19e0
--- /dev/null
+++ b/qdp_project/src/utils/pcm.h
@@ -0,0 +1,6 @@
+#pragma once
+//this file includes all important headers from the pcm repository
+#include "cpucounters.h"
+#include "msr.h"
+#include "pci.h"
+#include "mutex.h"
diff --git a/qdp_project/src/utils/timer_utils.h b/qdp_project/src/utils/timer_utils.h
new file mode 100644
index 0000000..b6ec54f
--- /dev/null
+++ b/qdp_project/src/utils/timer_utils.h
@@ -0,0 +1,80 @@
+#pragma once
+
+#include <cstdint>
+#include <cstddef>
+#include <chrono>
+
+#include <numa.h>
+
+#define THREAD_TIMINGS 1
+
+
+
+struct thread_runtime_timing {
+    using time_point_t = std::chrono::time_point<std::chrono::steady_clock>;
+
+    uint32_t time_points, time_threads;
+    time_point_t** start_times;
+    double** time_accumulator;
+
+    thread_runtime_timing(uint32_t timing_points, uint32_t timing_threads, uint32_t memory_node) {
+#ifdef THREAD_TIMINGS
+        time_points = timing_points;
+        time_threads = timing_threads;
+        start_times = (time_point_t**) numa_alloc_onnode(timing_points * sizeof(time_point_t*), memory_node);
+        time_accumulator = (double**) numa_alloc_onnode(timing_points * sizeof(double*), memory_node);
+        for(uint32_t i = 0; i < timing_points; ++i) {
+            start_times[i] = (time_point_t*) numa_alloc_onnode(timing_threads * sizeof(time_point_t), memory_node);
+            time_accumulator[i] = (double*) numa_alloc_onnode(timing_threads * sizeof(double), memory_node);
+        }
+#endif
+    }
+
+    ~thread_runtime_timing() {
+#ifdef THREAD_TIMINGS
+        for(uint32_t i = 0; i < time_points; ++i) {
+            numa_free(start_times[i], time_threads * sizeof(time_point_t));
+            numa_free(time_accumulator[i], time_threads * sizeof(double));
+        }
+        numa_free(start_times, time_points * sizeof(time_point_t*));
+        numa_free(time_accumulator, time_points * sizeof(double*));
+#endif
+    }
+
+    void reset_accumulator() {
+#ifdef THREAD_TIMINGS
+        for(uint32_t i = 0; i < time_points; ++i){
+            for(uint32_t j = 0; j < time_threads; ++j){
+                time_accumulator[i][j] = 0.0;
+        }}
+#endif
+    }
+
+    double summarize_time(uint32_t time_point) {
+#ifdef THREAD_TIMINGS
+        double sum = 0.0;
+        for(uint32_t i = 0; i < time_threads; ++i) {
+            sum += time_accumulator[time_point][i];
+        }
+        return sum;
+#endif
+    }
+
+    void stop_timer(uint32_t point_id, uint32_t thread_id) {
+#ifdef THREAD_TIMINGS
+        auto end_time = std::chrono::steady_clock::now();
+        auto start_time = start_times[point_id][thread_id];
+
+        uint64_t time = std::chrono::duration_cast<std::chrono::nanoseconds>(end_time - start_time).count();
+        double seconds = time / (1000.0 * 1000.0 * 1000.0);
+        time_accumulator[point_id][thread_id] += seconds;
+#endif
+    }
+
+    void start_timer(uint32_t point_id, uint32_t thread_id) {
+#ifdef THREAD_TIMINGS
+        start_times[point_id][thread_id] = std::chrono::steady_clock::now();
+#endif
+    }
+
+};
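A compact usage sketch for thread_runtime_timing (point and thread counts as well as the NUMA node are hypothetical; assumes libnuma is available and timer_utils.h is included):

    #include "timer_utils.h"

    void scan_worker(thread_runtime_timing& trt, uint32_t tid) {
        trt.start_timer(0, tid);   // timing point 0, e.g. the scan phase
        // ... perform the measured work ...
        trt.stop_timer(0, tid);    // adds the elapsed seconds for (point 0, thread tid)
    }

    int main() {
        thread_runtime_timing trt(3, 4, 0); // 3 timing points, 4 threads, buffers on NUMA node 0
        trt.reset_accumulator();
        // ... spawn 4 threads, each calling scan_worker(trt, tid) ...
        const double scan_seconds = trt.summarize_time(0); // sum over all threads for point 0
        (void) scan_seconds;
        return 0;
    }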
diff --git a/qdp_project/src/utils/vector_loader.h b/qdp_project/src/utils/vector_loader.h
new file mode 100644
index 0000000..ceab169
--- /dev/null
+++ b/qdp_project/src/utils/vector_loader.h
@@ -0,0 +1,93 @@
+/**
+ * @file vector_loader.h
+ * @author André Berthold
+ * @brief Provides an interface to easily exchange vector loading strategies
+ * @version 0.1
+ * @date 2023-05-25
+ *
+ * @copyright Copyright (c) 2023
+ *
+ */
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+
+#include <immintrin.h>
+
+enum load_mode {Unaligned = 0, Aligned = 1, Stream = 2};
+
+/**
+ * @brief A class template that provides functions for loading and storing data of type *base_t* into/from vectors using the strategy *mode*.
+ *
+ * @tparam base_t Base type of the data
+ * @tparam mode Strategy for loading the vector
+ */
+template<typename base_t, load_mode mode>
+class Vector_Loader {};
+
+/**
+ * @brief Template specialization for Vector_Loader with base_t = uint32_t.
+ *
+ * @tparam mode Strategy for loading the vector
+ */
+template<load_mode mode>
+class Vector_Loader<uint32_t, mode> {
+    using base_t = uint32_t;
+    using mask_t = __mmask16;
+    using mask_base_t = uint8_t;
+public:
+
+    /**
+     * @brief Loads 512 bit of data into a vector register
+     *
+     * @param src Pointer to the data to load
+     * @return __m512i The vector register with the loaded data
+     */
+    static inline __m512i load(base_t* src) {
+        if constexpr (mode == load_mode::Unaligned) return _mm512_loadu_epi32(src);
+        else if constexpr (mode == load_mode::Aligned) return _mm512_load_epi32 (src);
+        else if constexpr (mode == load_mode::Stream) return _mm512_stream_load_si512(src);
+    };
+
+    /**
+     * @brief Stores data from a given vector register to a destination pointer
+     *
+     * @param dst Pointer to the data destination
+     * @param vector Vector register containing the data to store
+     */
+    static inline void store(base_t* dst, __m512i vector) {
+        if constexpr (mode == load_mode::Unaligned) _mm512_storeu_epi32(dst, vector);
+        else if constexpr (mode == load_mode::Aligned) _mm512_store_epi32 (dst, vector);
+        else if constexpr (mode == load_mode::Stream) _mm512_stream_si512((__m512i*)(dst), vector);
+    };
+};
+
+/**
+ * @brief Template specialization for Vector_Loader with base_t = uint64_t.
+ *
+ * @tparam mode Strategy for loading the vector
+ */
+template<load_mode mode>
+class Vector_Loader<uint64_t, mode> {
+    using base_t = uint64_t;
+    using mask_t = __mmask8;
+    using mask_base_t = uint8_t;
+public:
+
+
+
+    static inline __m512i load(base_t* src) {
+        if constexpr (mode == load_mode::Unaligned) return _mm512_loadu_epi64(src);
+        else if constexpr (mode == load_mode::Aligned) return _mm512_load_epi64 (src);
+        else if constexpr (mode == load_mode::Stream) return _mm512_stream_load_si512(src);
+    };
+
+    static inline void store(base_t* dst, __m512i vector) {
+        if constexpr (mode == load_mode::Unaligned) _mm512_storeu_epi64(dst, vector);
+        else if constexpr (mode == load_mode::Aligned) _mm512_store_epi64 (dst, vector);
+        else if constexpr (mode == load_mode::Stream) _mm512_stream_si512((__m512i*)(dst), vector);
+    };
+
+};
From 2b635d025d12e4853f5f4249156222d9e4fd3336 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Constantin=20F=C3=BCrst?=
Date: Wed, 17 Jan 2024 13:43:16 +0100
Subject: [PATCH 26/29] add defaulted-constructor for cache which got
 implicitly deleted by deleting the copy constructor

---
 offloading-cacher/cache.hpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/offloading-cacher/cache.hpp b/offloading-cacher/cache.hpp
index b84e347..058a1e1 100644
--- a/offloading-cacher/cache.hpp
+++ b/offloading-cacher/cache.hpp
@@ -237,6 +237,7 @@ namespace dsacache {
     std::unique_ptr GetFromCache(uint8_t* src, const size_t size, const int dst_node);
 
   public:
+    Cache() = default;
     Cache(const Cache& other) = delete;
 
     // initializes the cache with the two policy functions
From 6cc49daf893c349d00583c203154f1eb48fa9dc7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Constantin=20F=C3=BCrst?=
Date: Wed, 17 Jan 2024 13:45:24 +0100
Subject: [PATCH 27/29] remove all unused files and benchmark methods, adapt
 the MAX-Benchmark to use the cacher, remove the manually-set numa
 configuration and replace it with dynamic adaptation to the configured
 affinity, add two more template options to the worker that control whether
 column a is cached as well and whether scan_b waits on the caching

---
 qdp_project/CMakeLists.txt                    |  26 +-
 qdp_project/bench_all_dimes.sh                |  10 -
 qdp_project/bench_max.sh                      |   7 +-
 qdp_project/cmake_all_dimes.sh                |  33 --
 qdp_project/src/benchmark/DIMES_benchmark.cpp | 240 -----------
.../src/benchmark/DIMES_cores_benchmark.cpp | 260 ------------ qdp_project/src/benchmark/MAX_benchmark.cpp | 81 ++-- qdp_project/src/benchmark/QDP_minimal.h | 147 ------- .../src/benchmark/doubly_filtered_agg.cpp | 149 ------- .../benchmark/filter_aggregate_pipeline.cpp | 184 --------- qdp_project/src/benchmark/latency.cpp | 188 --------- .../src/benchmark/micro_benchmarks.cpp | 271 ------------ .../pipelines/DIMES_scan_filter_pipe.h | 391 ------------------ .../pipelines/MAX_scan_filter_pipe.h | 199 +++++---- .../benchmark/pipelines/scan_filter_pipe.h | 387 ----------------- qdp_project/src/utils/execution_modes.h | 41 +- 16 files changed, 181 insertions(+), 2433 deletions(-) delete mode 100644 qdp_project/bench_all_dimes.sh delete mode 100644 qdp_project/cmake_all_dimes.sh delete mode 100644 qdp_project/src/benchmark/DIMES_benchmark.cpp delete mode 100644 qdp_project/src/benchmark/DIMES_cores_benchmark.cpp delete mode 100644 qdp_project/src/benchmark/QDP_minimal.h delete mode 100644 qdp_project/src/benchmark/doubly_filtered_agg.cpp delete mode 100644 qdp_project/src/benchmark/filter_aggregate_pipeline.cpp delete mode 100644 qdp_project/src/benchmark/latency.cpp delete mode 100644 qdp_project/src/benchmark/micro_benchmarks.cpp delete mode 100644 qdp_project/src/benchmark/pipelines/DIMES_scan_filter_pipe.h delete mode 100644 qdp_project/src/benchmark/pipelines/scan_filter_pipe.h diff --git a/qdp_project/CMakeLists.txt b/qdp_project/CMakeLists.txt index 71c8452..97c1915 100644 --- a/qdp_project/CMakeLists.txt +++ b/qdp_project/CMakeLists.txt @@ -20,12 +20,6 @@ set(SUPPRESS_WARNINGS "-Wno-literal-suffix -Wno-volatile") set(DEBUG_FLAGS "-g3" "-ggdb") set(RELEASE_FLAGS "-O3") -#set pcm location -set(PCM_LOCATION ./thirdParty/pcm) -set(PCM_LINKS -lpcm -L${CMAKE_CURRENT_LIST_DIR}/${PCM_LOCATION}/build/lib) -# pass the in formation about the shared library location to the linker -link_directories(${CMAKE_CURRENT_LIST_DIR}/${PCM_LOCATION}/build/lib) - #set flags used for Release and Debug build type add_compile_options( "$<$:${RELEASE_FLAGS}>" @@ -71,34 +65,18 @@ add_definitions(-DTHREAD_GROUP_MULTIPLIER=${THREAD_FACTOR}) eval(PINNING "cpu;numa" "cpu") add_definitions(-DPINNING=$) -eval(PCM_M "true;false" "false") -add_definitions(-DPCM_M=$) -add_definitions(${PCM_LINKS}) - # build directory set(CMAKE_BINARY_DIR "../bin") #relative to inside build set(EXECUTABLE_OUTPUT_PATH ${CMAKE_BINARY_DIR}) - - # include directories include_directories(src/utils) include_directories(src/algorithm) include_directories(src/algorithm/operators) -include_directories(thirdParty/pcm/src) # link libraries -link_libraries(-lnuma -lpthread) +link_libraries(-lnuma -lpthread -l:libdml.a) # Add targets only below # specify build targets -add_executable(FilterAggregatePipeline src/benchmark/filter_aggregate_pipeline.cpp) -add_executable(DoublyFiltered src/benchmark/doubly_filtered_agg.cpp) -add_executable(DIMESBench src/benchmark/DIMES_benchmark.cpp) -add_executable(DIMESCoreBench src/benchmark/DIMES_cores_benchmark.cpp) -add_executable(MicroBench src/benchmark/micro_benchmarks.cpp) -add_executable(MAXBench src/benchmark/MAX_benchmark.cpp - src/benchmark/QDP_minimal.h) -target_link_libraries(MAXBench libpcm.so) -add_executable(LatencyBench src/benchmark/latency.cpp) - +add_executable(MAXBench src/benchmark/MAX_benchmark.cpp) \ No newline at end of file diff --git a/qdp_project/bench_all_dimes.sh b/qdp_project/bench_all_dimes.sh deleted file mode 100644 index 9c05e62..0000000 --- a/qdp_project/bench_all_dimes.sh +++ 
/dev/null @@ -1,10 +0,0 @@ -#!bin/bash - -../bin/DIMESBench_gus -../bin/DIMESBench_guc -../bin/DIMESBench_gls -../bin/DIMESBench_glc -../bin/DIMESBench_lus -../bin/DIMESBench_luc -../bin/DIMESBench_lls -../bin/DIMESBench_llc \ No newline at end of file diff --git a/qdp_project/bench_max.sh b/qdp_project/bench_max.sh index fb08bd8..b7e0168 100644 --- a/qdp_project/bench_max.sh +++ b/qdp_project/bench_max.sh @@ -3,13 +3,8 @@ current_date_time=$(date) echo "Benchmark start at: $current_date_time" -../bin/MAXBench_gcc -cp ../results/max_q-complex_bm-global_bl-unlimited_tc-121MiB-2MiB.csv ../results/max_q-complex_bm-global_bl-unlimited_tc-121MiB-2MiB_pin_c_HBM.csv - -../bin/MAXBench_gcn - -cp ../results/max_q-complex_bm-global_bl-unlimited_tc-121MiB-2MiB.csv ../results/max_q-complex_bm-global_bl-unlimited_tc-121MiB-2MiB_pin_n_HBM.csv +../bin/MAXBench current_date_time=$(date) echo "Benchmark end at: $current_date_time" \ No newline at end of file diff --git a/qdp_project/cmake_all_dimes.sh b/qdp_project/cmake_all_dimes.sh deleted file mode 100644 index 9ce3a96..0000000 --- a/qdp_project/cmake_all_dimes.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!bin/bash - -cmake -DCMAKE_BUILD_TYPE=Release -DWSUPPRESS=suppress -DBARRIER_MODE=global -DBUFFER_LIMIT=unlimited -DQUERY=simple .. -cmake --build . --target DIMESBench -mv ../bin/DIMESBench ../bin/DIMESBench_gus - -cmake -DCMAKE_BUILD_TYPE=Release -DWSUPPRESS=suppress -DBARRIER_MODE=global -DBUFFER_LIMIT=unlimited -DQUERY=complex .. -cmake --build . --target DIMESBench -mv ../bin/DIMESBench ../bin/DIMESBench_guc - -cmake -DCMAKE_BUILD_TYPE=Release -DWSUPPRESS=suppress -DBARRIER_MODE=global -DBUFFER_LIMIT=limited -DQUERY=simple .. -cmake --build . --target DIMESBench -mv ../bin/DIMESBench ../bin/DIMESBench_gls - -cmake -DCMAKE_BUILD_TYPE=Release -DWSUPPRESS=suppress -DBARRIER_MODE=global -DBUFFER_LIMIT=limited -DQUERY=complex .. -cmake --build . --target DIMESBench -mv ../bin/DIMESBench ../bin/DIMESBench_glc - -cmake -DCMAKE_BUILD_TYPE=Release -DWSUPPRESS=suppress -DBARRIER_MODE=local -DBUFFER_LIMIT=unlimited -DQUERY=simple .. -cmake --build . --target DIMESBench -mv ../bin/DIMESBench ../bin/DIMESBench_lus - -cmake -DCMAKE_BUILD_TYPE=Release -DWSUPPRESS=suppress -DBARRIER_MODE=local -DBUFFER_LIMIT=unlimited -DQUERY=complex .. -cmake --build . --target DIMESBench -mv ../bin/DIMESBench ../bin/DIMESBench_luc - -cmake -DCMAKE_BUILD_TYPE=Release -DWSUPPRESS=suppress -DBARRIER_MODE=local -DBUFFER_LIMIT=limited -DQUERY=simple .. -cmake --build . --target DIMESBench -mv ../bin/DIMESBench ../bin/DIMESBench_lls - -cmake -DCMAKE_BUILD_TYPE=Release -DWSUPPRESS=suppress -DBARRIER_MODE=local -DBUFFER_LIMIT=limited -DQUERY=complex .. -cmake --build . 
--target DIMESBench -mv ../bin/DIMESBench ../bin/DIMESBench_llc \ No newline at end of file diff --git a/qdp_project/src/benchmark/DIMES_benchmark.cpp b/qdp_project/src/benchmark/DIMES_benchmark.cpp deleted file mode 100644 index 2ca9705..0000000 --- a/qdp_project/src/benchmark/DIMES_benchmark.cpp +++ /dev/null @@ -1,240 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#ifndef THREAD_GROUP_MULTIPLIER -#define THREAD_GROUP_MULTIPLIER 8 -#endif - -#ifndef QUERY -#define QUERY 1 -#endif - -#ifndef BARRIER_MODE -#define BARRIER_MODE "global" -#endif - -#ifndef BUFFER_LIMIT -#define BUFFER_LIMIT 1 -#endif - -#include "const.h" - -#include "file_output.h" -#include "array_utils.h" -#include "timer_utils.h" -#include "barrier_utils.h" -#include "cpu_set_utils.h" -#include "iterable_range.h" -#include "memory_literals.h" -#include "pipelines/DIMES_scan_filter_pipe.h" - -#include "aggregation.h" -#include "filter.h" - -using base_t = uint64_t; - -base_t sum_check(base_t compare_value, base_t* row_A, base_t* row_B, size_t row_size) { - base_t sum = 0; - for(int i = 0; i < row_size / sizeof(base_t); ++i) { - sum += (row_A[i] < compare_value) * row_B[i]; - } - return sum; -} - -base_t sum_check_complex(base_t compare_value_a, base_t compare_value_b, base_t* row_A, base_t* row_B, size_t row_size) { - base_t sum = 0; - for(int i = 0; i < row_size / sizeof(base_t); ++i) { - sum += (row_A[i] < compare_value_a && row_B[i] < compare_value_b) * row_B[i]; - } - return sum; -} - -int main(int argc, char** argv) { - // set constants - const size_t workload_b = 4_GiB; - const base_t compare_value_a = 50; - const base_t compare_value_b = 42; - constexpr bool simple_query = (QUERY == 1); - - const size_t thread_count = 6; - std::ofstream out_file; - out_file.open("../results/dimes_" - "q-" + (std::string)(simple_query == true ? "simple" : "complex") + - "_bm-" + (std::string) BARRIER_MODE + - "_bl-" + (std::string)(BUFFER_LIMIT == 1 ? "limited" : "unlimited") + - "_tc-" + std::to_string(thread_count * THREAD_GROUP_MULTIPLIER) + ".csv"); - - // set benchmark parameter - Linear_Int_Range run("run"); - Exp_Int_Range chunk_size("chunk_size"); - Range mode("mode"); - - uint32_t remote_node = 3; - uint32_t remote_node_2 = 2; - uint32_t local_node = 10; - - print_to_file(out_file, generateHead(run, chunk_size, mode), "thread_group", "time", - #ifdef THREAD_TIMINGS - "scan_a", "scan_b", "aggr_j", - #endif - #ifdef BARRIER_TIMINGS - "wait_scan_a", "wait_scan_b", "wait_aggr_j", - #endif - "result"); - - - /*** alloc data and buffers ************************************************/ - base_t* data_a = (base_t*) numa_alloc_onnode(workload_b, remote_node); - base_t* data_b = (base_t*) numa_alloc_onnode(workload_b, remote_node_2); - base_t* data_a_hbm = (base_t*) numa_alloc_onnode(workload_b, local_node); - base_t* data_b_hbm = (base_t*) numa_alloc_onnode(workload_b, local_node); - fill_mt(data_a, workload_b, 0, 100, 42); - fill_mt(data_b, workload_b, 0, 100, 420); - std::memcpy(data_a_hbm, data_a, workload_b); - std::memcpy(data_b_hbm, data_b, workload_b); - base_t* results = (base_t*) numa_alloc_onnode(THREAD_GROUP_MULTIPLIER * thread_count * sizeof(base_t), remote_node); - - std::ofstream check_file; - check_file.open("../results/dimes_" - "q-" + (std::string)(simple_query == true ? "simple" : "complex") + - "_bm-" + (std::string) BARRIER_MODE + - "_bl-" + (std::string)(BUFFER_LIMIT == 1 ? 
"limited" : "unlimited") + - "_tc-" + std::to_string(thread_count * THREAD_GROUP_MULTIPLIER) + ".checksum"); - if constexpr (QUERY == 1) { - //calculate simple checksum if QUERY == 1 -> simple query is applied - check_file << sum_check(compare_value_a, data_a, data_b, workload_b); - } else { - check_file << sum_check_complex(compare_value_a, compare_value_b, data_a, data_b, workload_b); - } - check_file.close(); - - std::string iteration("init"); - Query_Wrapper* qw = nullptr; - while(iteration != "false") { - - std::promise p; - std::shared_future ready_future(p.get_future()); - - if(iteration != "run") { - - if(qw != nullptr) { - delete qw; - } - - std::cout << "Changing to mode " << mode.current << " chunksize " << chunk_size.current << std::endl; - - uint8_t tc_filter = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, SCAN_A); - uint8_t tc_copy = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, SCAN_B); - uint8_t tc_agg = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, AGGR_J); - switch(mode.current) { - case NewPMode::DRAM_base: - qw = new Query_Wrapper(&ready_future, workload_b, chunk_size.current, data_a, data_b, results, local_node, remote_node, - tc_filter, tc_copy, tc_agg, mode.current, THREAD_GROUP_MULTIPLIER, (base_t) 50, (base_t) 42, true); - break; - case NewPMode::HBM_base: - qw = new Query_Wrapper(&ready_future, workload_b, chunk_size.current, data_a_hbm, data_b_hbm, results, local_node, remote_node, - tc_filter, tc_copy, tc_agg, mode.current, THREAD_GROUP_MULTIPLIER, (base_t) 50, (base_t) 42, true); - break; - case NewPMode::Mixed_base: - qw = new Query_Wrapper(&ready_future, workload_b, chunk_size.current, data_a, data_b_hbm, results, local_node, remote_node, - tc_filter, tc_copy, tc_agg, mode.current, THREAD_GROUP_MULTIPLIER, (base_t) 50, (base_t) 42, true); - break; - case NewPMode::Prefetch: - qw = new Query_Wrapper(&ready_future, workload_b, chunk_size.current, data_a, data_b, results, local_node, remote_node, - tc_filter, tc_copy, tc_agg, mode.current, THREAD_GROUP_MULTIPLIER, (base_t) 50, (base_t) 42, false); - break; - } - } - - qw->ready_future = &ready_future; - qw->clear_buffers(); - - auto filter_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_a(gid, gcnt, tid); }; - auto copy_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_b(gid, gcnt, tid); }; - auto aggregation_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->aggr_j(gid, gcnt, tid); }; - - std::vector filter_pool; - std::vector copy_pool; - std::vector agg_pool; - - uint8_t tc_filter = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, SCAN_A); - uint8_t tc_copy = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, SCAN_B); - uint8_t tc_agg = new_mode_manager::thread_count(simple_query ? 
SIMPLE_Q : COMPLEX_Q, mode.current, AGGR_J); - - int thread_id = 0; - // std::vector> pinning_ranges {std::make_pair(28, 42), std::make_pair(84, 98)}; // node 2 heacboehm II - //std::vector> pinning_ranges {std::make_pair(32, 48), std::make_pair(96, 112)}; // node 2 heacboehm - //std::vector> pinning_ranges {std::make_pair(24, 36), std::make_pair(120, 132)}; // node 2 sapphire rapids - //std::vector> pinning_ranges {std::make_pair(24, 48)}; // node 2+3 sapphire rapids - std::vector> pinning_ranges {std::make_pair(0, 48)}; // node 0-3 sapphire rapids - - for(uint32_t gid = 0; gid < THREAD_GROUP_MULTIPLIER; ++gid) { - - for(uint32_t tid = 0; tid < tc_filter; ++tid) { - filter_pool.emplace_back(filter_lambda, gid, THREAD_GROUP_MULTIPLIER, tid); - pin_thread_in_range(filter_pool.back(), thread_id++, pinning_ranges); - } - - // if tc_copy == 0 this loop is skipped - for(uint32_t tid = 0; tid < tc_copy; ++tid) { - copy_pool.emplace_back(copy_lambda, gid, THREAD_GROUP_MULTIPLIER, tid); - pin_thread_in_range(copy_pool.back(), thread_id++, pinning_ranges); - } - - for(uint32_t tid = 0; tid < tc_agg; ++tid) { - agg_pool.emplace_back(aggregation_lambda, gid, THREAD_GROUP_MULTIPLIER, tid); - pin_thread_in_range(agg_pool.back(), thread_id++, pinning_ranges); - } - } - - auto start = std::chrono::steady_clock::now(); - p.set_value(); - - for(std::thread& t : filter_pool) { t.join(); } - for(std::thread& t : copy_pool) { t.join(); } - for(std::thread& t : agg_pool) { t.join(); } - - Aggregation::apply(results, results, sizeof(base_t) * tc_agg * THREAD_GROUP_MULTIPLIER); - auto end = std::chrono::steady_clock::now(); - - constexpr double nanos_per_second = ((double)1000) * 1000 * 1000; - uint64_t nanos = std::chrono::duration_cast(end - start).count(); - double seconds = (double)(nanos) / nanos_per_second; - - - print_to_file(out_file, run, chunk_size, new_mode_manager::string(mode.current), THREAD_GROUP_MULTIPLIER, seconds, - #ifdef THREAD_TIMINGS - qw->trt->summarize_time(0), qw->trt->summarize_time(1), qw->trt->summarize_time(2), - #endif - #ifdef BARRIER_TIMINGS - qw->bt->summarize_time(0), qw->bt->summarize_time(1), qw->bt->summarize_time(2), - #endif - results[0]); - - - iteration = IterateOnce(run, chunk_size, mode); - } - - numa_free(data_b_hbm, workload_b); - numa_free(data_a, workload_b); - numa_free(data_b, workload_b); - - numa_free(results, THREAD_GROUP_MULTIPLIER * thread_count * sizeof(base_t)); - -} \ No newline at end of file diff --git a/qdp_project/src/benchmark/DIMES_cores_benchmark.cpp b/qdp_project/src/benchmark/DIMES_cores_benchmark.cpp deleted file mode 100644 index 93c6b1b..0000000 --- a/qdp_project/src/benchmark/DIMES_cores_benchmark.cpp +++ /dev/null @@ -1,260 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#ifndef QUERY -#define QUERY 1 -#endif - -#ifndef BARRIER_MODE -#define BARRIER_MODE "global" -#endif - -#define BUFFER_LIMIT 0 - -#include "const.h" - -#include "file_output.h" -#include "array_utils.h" -#include "timer_utils.h" -#include "barrier_utils.h" -#include "cpu_set_utils.h" -#include "iterable_range.h" -#include "memory_literals.h" -#include "pipelines/DIMES_scan_filter_pipe.h" - -#include "aggregation.h" -#include "filter.h" - -using base_t = uint64_t; - -base_t sum_check(base_t compare_value, base_t* row_A, base_t* row_B, size_t row_size) { - base_t sum = 0; - for(int i = 0; i < row_size / sizeof(base_t); ++i) { - sum += (row_A[i] 
< compare_value) * row_B[i]; - } - return sum; -} - -base_t sum_check_complex(base_t compare_value_a, base_t compare_value_b, base_t* row_A, base_t* row_B, size_t row_size) { - base_t sum = 0; - for(int i = 0; i < row_size / sizeof(base_t); ++i) { - sum += (row_A[i] < compare_value_a && row_B[i] < compare_value_b) * row_B[i]; - } - return sum; -} - - -int main(int argc, char** argv) { - // set constants - const size_t workload_b = 4_GiB; - const size_t chunk_size = 2_MiB; - const base_t compare_value_a = 50; - const base_t compare_value_b = 42; - constexpr bool simple_query = (QUERY == 1); - - - std::ofstream out_file; - out_file.open("../results/dimes_cores_" - "q-" + (std::string)(simple_query == true ? "simple" : "complex") + - "_bm-" + (std::string) BARRIER_MODE + - "_bl-" + (std::string)(BUFFER_LIMIT == 1 ? "limited" : "unlimited") + - ".csv"); - - // set benchmark parameter - Linear_Int_Range run("run"); - - Exp_Int_Range scan_a_thread("scan_a_tc"); - Exp_Int_Range scan_b_thread("scan_b_tc"); - Exp_Int_Range aggr_j_thread("aggr_j_tc"); - Linear_Int_Range thread_group_count("thread_group_c"); - Range mode("mode"); - - uint32_t remote_node = 1; - uint32_t remote_node_2 = 0;//on heacboehm II: node 0 is two hops away from node 2 -> prefetching is more beneficial - uint32_t local_node = 2; - - print_to_file(out_file, generateHead(run, thread_group_count, mode, scan_a_thread, scan_b_thread, aggr_j_thread), - "time", - #ifdef THREAD_TIMINGS - "scan_a", "scan_b", "aggr_j", - #endif - #ifdef BARRIER_TIMINGS - "wait_scan_a", "wait_scan_b", "wait_aggr_j", - #endif - "result"); - - - /*** alloc data and buffers ************************************************/ - base_t* data_a = (base_t*) numa_alloc_onnode(workload_b, remote_node); - base_t* data_b = (base_t*) numa_alloc_onnode(workload_b, remote_node_2); - base_t* data_a_hbm = (base_t*) numa_alloc_onnode(workload_b, local_node); - base_t* data_b_hbm = (base_t*) numa_alloc_onnode(workload_b, local_node); - fill_mt(data_a, workload_b, 0, 100, 42); - fill_mt(data_b, workload_b, 0, 100, 420); - std::memcpy(data_a_hbm, data_a, workload_b); - std::memcpy(data_b_hbm, data_b, workload_b); - base_t* results = (base_t*) numa_alloc_onnode(thread_group_count.max * aggr_j_thread.max * sizeof(base_t), remote_node); - - std::ofstream check_file; - check_file.open("../results/dimes_cores_" - "q-" + (std::string)(simple_query == true ? "simple" : "complex") + - "_bm-" + (std::string) BARRIER_MODE + - "_bl-" + (std::string)(BUFFER_LIMIT == 1 ? 
"limited" : "unlimited") + - ".checksum"); - if constexpr (QUERY == 1) { - //calculate simple checksum if QUERY == 1 -> simple query is applied - check_file << sum_check(compare_value_a, data_a, data_b, workload_b); - } else { - check_file << sum_check_complex(compare_value_a, compare_value_b, data_a, data_b, workload_b); - } - check_file.close(); - - std::string iteration("init"); - Query_Wrapper* qw = nullptr; - while(iteration != "false") { - - std::promise p; - std::shared_future ready_future(p.get_future()); - - // skipping iteration through scan_b_thread while not used - while(simple_query && mode.current != NewPMode::Prefetch && scan_b_thread.current != 1) { - iteration = IterateOnce(run, thread_group_count, mode, scan_a_thread, scan_b_thread, aggr_j_thread); - } - - if(iteration != "run") { - std::cout << "Changing to mode " << mode.current - << " thread_group_count " << thread_group_count.current - << " thread_ratio " << scan_a_thread.current <<":"<< scan_b_thread.current <<":"<< aggr_j_thread.current - << std::endl; - - if(qw != nullptr) { - if (iteration == thread_group_count.label) { - - } else { - delete qw; - - uint32_t sat = scan_a_thread.current; - uint32_t sbt = simple_query && mode.current != NewPMode::Prefetch ? 0 : scan_b_thread.current; - uint32_t ajt = aggr_j_thread.current; - - switch(mode.current) { - case NewPMode::DRAM_base: - qw = new Query_Wrapper(&ready_future, workload_b, chunk_size, data_a, data_b, results, local_node, remote_node, - sat, sbt, ajt, mode.current, thread_group_count.current, (base_t) 50, (base_t) 42, true); - break; - case NewPMode::HBM_base: - qw = new Query_Wrapper(&ready_future, workload_b, chunk_size, data_a_hbm, data_b_hbm, results, local_node, remote_node, - sat, sbt, ajt, mode.current, thread_group_count.current, (base_t) 50, (base_t) 42, true); - break; - case NewPMode::Mixed_base: - qw = new Query_Wrapper(&ready_future, workload_b, chunk_size, data_a, data_b_hbm, results, local_node, remote_node, - sat, sbt, ajt, mode.current, thread_group_count.current, (base_t) 50, (base_t) 42, true); - break; - case NewPMode::Prefetch: - qw = new Query_Wrapper(&ready_future, workload_b, chunk_size, data_a, data_b, results, local_node, remote_node, - sat, sbt, ajt, mode.current, thread_group_count.current, (base_t) 50, (base_t) 42, false); - break; - } - } - } - } - - qw->ready_future = &ready_future; - qw->clear_buffers(); - - auto filter_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_a(gid, gcnt, tid); }; - auto copy_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_b(gid, gcnt, tid); }; - auto aggregation_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->aggr_j(gid, gcnt, tid); }; - - std::vector filter_pool; - std::vector copy_pool; - std::vector agg_pool; - - uint8_t tc_filter = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, SCAN_A); - uint8_t tc_copy = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, SCAN_B); - uint8_t tc_agg = new_mode_manager::thread_count(simple_query ? 
SIMPLE_Q : COMPLEX_Q, mode.current, AGGR_J); - - int thread_id = 0; - // std::vector> pinning_ranges {std::make_pair(28, 42), std::make_pair(84, 98)}; // node 2 heacboehm II - std::vector> pinning_ranges {std::make_pair(32, 48), std::make_pair(96, 112)}; // node 2 heacboehm - - for(uint32_t gid = 0; gid < thread_group_count.current; ++gid) { - - for(uint32_t tid = 0; tid < tc_filter; ++tid) { - filter_pool.emplace_back(filter_lambda, gid, thread_group_count.current, tid); - pin_thread_in_range(filter_pool.back(), thread_id++, pinning_ranges); - } - - // if tc_copy == 0 this loop is skipped - for(uint32_t tid = 0; tid < tc_copy; ++tid) { - copy_pool.emplace_back(copy_lambda, gid, thread_group_count.current, tid); - pin_thread_in_range(copy_pool.back(), thread_id++, pinning_ranges); - } - - for(uint32_t tid = 0; tid < tc_agg; ++tid) { - agg_pool.emplace_back(aggregation_lambda, gid, thread_group_count.current, tid); - pin_thread_in_range(agg_pool.back(), thread_id++, pinning_ranges); - } - } - - auto start = std::chrono::steady_clock::now(); - p.set_value(); - - for(std::thread& t : filter_pool) { t.join(); } - for(std::thread& t : copy_pool) { t.join(); } - for(std::thread& t : agg_pool) { t.join(); } - - Aggregation::apply(results, results, sizeof(base_t) * tc_agg * thread_group_count.current); - auto end = std::chrono::steady_clock::now(); - - constexpr double nanos_per_second = ((double)1000) * 1000 * 1000; - uint64_t nanos = std::chrono::duration_cast(end - start).count(); - double seconds = (double)(nanos) / nanos_per_second; - -print_to_file(out_file, generateHead(run, thread_group_count, mode, scan_a_thread, scan_b_thread, aggr_j_thread), - "time", - #ifdef THREAD_TIMINGS - "scan_a", "scan_b", "aggr_j", - #endif - #ifdef BARRIER_TIMINGS - "wait_scan_a", "wait_scan_b", "wait_aggr_j", - #endif - "result"); - - print_to_file(out_file, run, thread_group_count.current, new_mode_manager::string(mode.current), scan_a_thread, - (simple_query && mode.current != NewPMode::Prefetch ? 
0 : scan_b_thread.current), - aggr_j_thread, seconds, - #ifdef THREAD_TIMINGS - qw->trt->summarize_time(0), qw->trt->summarize_time(1), qw->trt->summarize_time(2), - #endif - #ifdef BARRIER_TIMINGS - qw->bt->summarize_time(0), qw->bt->summarize_time(1), qw->bt->summarize_time(2), - #endif - results[0]); - - iteration = IterateOnce(run, thread_group_count, mode, scan_a_thread, scan_b_thread, aggr_j_thread); - } - - numa_free(data_b_hbm, workload_b); - numa_free(data_a, workload_b); - numa_free(data_b, workload_b); - - numa_free(results, thread_group_count.max * aggr_j_thread.max * sizeof(base_t)); - -} \ No newline at end of file diff --git a/qdp_project/src/benchmark/MAX_benchmark.cpp b/qdp_project/src/benchmark/MAX_benchmark.cpp index fb50f5a..0414e29 100644 --- a/qdp_project/src/benchmark/MAX_benchmark.cpp +++ b/qdp_project/src/benchmark/MAX_benchmark.cpp @@ -92,33 +92,36 @@ int main(int argc, char** argv) { #endif // set constants - const size_t workload_b = 2_GiB; - const base_t compare_value_a = 50; - const base_t compare_value_b = 42; + constexpr size_t workload_b = 2_GiB; + constexpr base_t compare_value_a = 50; + constexpr base_t compare_value_b = 42; constexpr bool simple_query = (QUERY == 1); + constexpr bool cache_a = false; + constexpr bool wait_b = false; + + constexpr size_t chunk_min = 1_MiB; + constexpr size_t chunk_max = 8_MiB + 1; + constexpr size_t chunk_incr = 128_kiB; + + // thread count is 12 here but as the default measurement uses 6 + // we must restrict the core assignment of these 12 threads to + // 6 physical cpu cores on the executing node + + constexpr size_t thread_count = 12; - const size_t thread_count = 6; std::ofstream out_file; + out_file.open("../results/max_" "q-" + (std::string)(simple_query == true ? "simple" : "complex") + "_bm-" + (std::string) BARRIER_MODE + "_bl-" + (std::string)(BUFFER_LIMIT == 1 ? 
"limited" : "unlimited") + - "_tc-" + std::to_string(thread_count * THREAD_GROUP_MULTIPLIER) + "1MiB-2MiB.csv"); + "_tc-" + std::to_string(thread_count) + "1MiB-2MiB.csv"); // set benchmark parameter Linear_Int_Range run("run"); - constexpr size_t chunk_min = 1_MiB; constexpr size_t chunk_max = 8_MiB + 1; constexpr size_t chunk_incr = 128_kiB; Linear_Int_Range chunk_size("chunk_size"); Range mode("mode"); - uint32_t remote_node = 2; - uint32_t remote_node_2 = 2; - uint32_t local_node = 10; - - /*uint32_t remote_node = 6; - uint32_t remote_node_2 = 6; - uint32_t local_node = 2;*/ - print_to_file(out_file, generateHead(run, chunk_size, mode), "thread_group", "time", #ifdef THREAD_TIMINGS "scan_a", "scan_b", "aggr_j", @@ -133,24 +136,22 @@ int main(int argc, char** argv) { #endif "result"); - /*** alloc data and buffers ************************************************/ - base_t* data_a = (base_t*) numa_alloc_onnode(workload_b, remote_node); - base_t* data_b = (base_t*) numa_alloc_onnode(workload_b, remote_node_2); - base_t* data_a_hbm = (base_t*) numa_alloc_onnode(workload_b, local_node); - base_t* data_b_hbm = (base_t*) numa_alloc_onnode(workload_b, local_node); + + base_t* data_a = (base_t*) numa_alloc_local(workload_b); + base_t* data_b = (base_t*) numa_alloc_local(workload_b); + base_t* results = (base_t*) numa_alloc_local(thread_count * sizeof(base_t)); + fill_mt(data_a, workload_b, 0, 100, 42); fill_mt(data_b, workload_b, 0, 100, 420); - std::memcpy(data_a_hbm, data_a, workload_b); - std::memcpy(data_b_hbm, data_b, workload_b); - base_t* results = (base_t*) numa_alloc_onnode(THREAD_GROUP_MULTIPLIER * thread_count * sizeof(base_t), remote_node); + std::ofstream check_file; check_file.open("../results/max_" "q-" + (std::string)(simple_query == true ? "simple" : "complex") + "_bm-" + (std::string) BARRIER_MODE + "_bl-" + (std::string)(BUFFER_LIMIT == 1 ? "limited" : "unlimited") + - "_tc-" + std::to_string(thread_count * THREAD_GROUP_MULTIPLIER) + ".checksum"); + "_tc-" + std::to_string(thread_count) + ".checksum"); if constexpr (QUERY == 1) { //calculate simple checksum if QUERY == 1 -> simple query is applied check_file << sum_check(compare_value_a, data_a, data_b, workload_b); @@ -160,37 +161,34 @@ int main(int argc, char** argv) { check_file.close(); std::string iteration("init"); - Query_Wrapper* qw = nullptr; + Query_Wrapper* qw = nullptr; + while(iteration != "false") { std::promise p; std::shared_future ready_future(p.get_future()); if(iteration != "run") { - if(qw != nullptr) { delete qw; } + uint8_t tc_filter = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, SCAN_A); uint8_t tc_copy = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, SCAN_B); uint8_t tc_agg = new_mode_manager::thread_count(simple_query ? 
SIMPLE_Q : COMPLEX_Q, mode.current, AGGR_J); + switch(mode.current) { - case NewPMode::DRAM_base: - qw = new Query_Wrapper(&ready_future, workload_b, chunk_size.current, data_a, data_b, results, local_node, remote_node, - tc_filter, tc_copy, tc_agg, mode.current, THREAD_GROUP_MULTIPLIER, (base_t) 50, (base_t) 42, true); - break; - case NewPMode::HBM_base: - qw = new Query_Wrapper(&ready_future, workload_b, chunk_size.current, data_a_hbm, data_b_hbm, results, local_node, remote_node, - tc_filter, tc_copy, tc_agg, mode.current, THREAD_GROUP_MULTIPLIER, (base_t) 50, (base_t) 42, true); - break; - case NewPMode::Mixed_base: - qw = new Query_Wrapper(&ready_future, workload_b, chunk_size.current, data_a, data_b_hbm, results, local_node, remote_node, - tc_filter, tc_copy, tc_agg, mode.current, THREAD_GROUP_MULTIPLIER, (base_t) 50, (base_t) 42, true); - break; - case NewPMode::Prefetch: - qw = new Query_Wrapper(&ready_future, workload_b, chunk_size.current, data_a, data_b, results, local_node, remote_node, - tc_filter, tc_copy, tc_agg, mode.current, THREAD_GROUP_MULTIPLIER, (base_t) 50, (base_t) 42, false); + case NewPMode::Prefetch: + qw = new Query_Wrapper( + &ready_future, workload_b, chunk_size.current, + data_a, data_b, results, tc_filter, tc_copy, tc_agg, + mode.current, 50, 42 + ); + break; + default: + std::cerr << "[x] Unsupported Execution Mode by this build." << std::endl; + exit(-1); } } @@ -280,10 +278,7 @@ int main(int argc, char** argv) { iteration = IterateOnce(run, chunk_size, mode); } - numa_free(data_b_hbm, workload_b); numa_free(data_a, workload_b); numa_free(data_b, workload_b); - numa_free(results, THREAD_GROUP_MULTIPLIER * thread_count * sizeof(base_t)); - } \ No newline at end of file diff --git a/qdp_project/src/benchmark/QDP_minimal.h b/qdp_project/src/benchmark/QDP_minimal.h deleted file mode 100644 index 007d0d9..0000000 --- a/qdp_project/src/benchmark/QDP_minimal.h +++ /dev/null @@ -1,147 +0,0 @@ -#include -#include -#include -#include -#include - -#include "const.h" -#include "array_utils.h" -#include "cpu_set_utils.h" -#include "iterable_range.h" -#include "memory_literals.h" -#include "pipelines/MAX_scan_filter_pipe.h" -#include "aggregation.h" - -using base_t = uint64_t; - -// calculate the checksum for the simple query -base_t sum_check(base_t compare_value, base_t* row_A, base_t* row_B, size_t row_size) { - base_t sum = 0; - for(int i = 0; i < row_size / sizeof(base_t); ++i) { - sum += (row_A[i] < compare_value) * row_B[i]; - } - return sum; -} - -// calculate the checksum for the complex query -base_t sum_check_complex(base_t compare_value_a, base_t compare_value_b, base_t* row_A, base_t* row_B, size_t row_size) { - base_t sum = 0; - for(int i = 0; i < row_size / sizeof(base_t); ++i) { - sum += (row_A[i] < compare_value_a && row_B[i] < compare_value_b) * row_B[i]; - } - return sum; -} - -class QDP_minimal { -private: - // values used for comparisons in the filter operations - const base_t compare_value_a = 50; - const base_t compare_value_b = 42; - // define, which numa nodes to use - // Xeon Max: node 0-7 DRAM and 8-15 HBM - // if the nodes are changed, the pinning ranges in run should be adjusted accordingly too - uint32_t dram_node = 2; - uint32_t dram_node_2 = 2; - uint32_t hbm_node = 10; - -public: - // results of running qdp, set by run() - base_t result; - base_t checksum; - double exec_time; - - // run qdp - void run(const size_t workload_b, size_t chunk_size, uint8_t tc_filter, uint8_t tc_copy, uint8_t tc_agg){ - // allocate data - base_t* data_a = 
(base_t*) numa_alloc_onnode(workload_b, dram_node); - base_t* data_b = (base_t*) numa_alloc_onnode(workload_b, dram_node_2); - base_t* results = (base_t*) numa_alloc_onnode(THREAD_GROUP_MULTIPLIER * tc_agg * sizeof(base_t), dram_node); - - // fill the memory with acutal values - fill_mt(data_a, workload_b, 0, 100, 42); - fill_mt(data_b, workload_b, 0, 100, 420); - - // run qdp - run(data_a, data_b, results, workload_b, chunk_size, tc_filter, tc_copy, tc_agg); - - // free the allocated memory - numa_free(data_a, workload_b); - numa_free(data_b, workload_b); - numa_free(results, THREAD_GROUP_MULTIPLIER * tc_agg * sizeof(base_t)); - } - - // run qdp, work on provided memory pointers to enable memory reuse across multiple runs - void run(base_t* data_a, base_t* data_b, base_t* results, const size_t workload_b, size_t chunk_size, uint8_t tc_filter, uint8_t tc_copy, uint8_t tc_agg){ - constexpr bool simple_query = (QUERY == 1); - // sync objects - std::promise p; - std::shared_future ready_future(p.get_future()); - - // create the query wrapper, that is managing the to-be-used threads - Query_Wrapper* qw = new Query_Wrapper(&ready_future, workload_b, chunk_size, data_a, data_b, results, hbm_node, dram_node, - tc_filter, tc_copy, tc_agg, NewPMode::Prefetch, THREAD_GROUP_MULTIPLIER, compare_value_a, compare_value_b, false); - - // clear buffers to make sure, that they have been written and are fully mapped before running qdp - qw->clear_buffers(); - - // creating lambdas for executing filter (scan_a), copy (scan_b), and aggregation tasks on the query wrapper - // passing gid (group id), gcnt (group count) and tid (thread id) - auto filter_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_a(gid, gcnt, tid); }; - auto copy_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_b(gid, gcnt, tid); }; - auto aggregation_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->aggr_j(gid, gcnt, tid); }; - - // creating thread pools, holding all used threads - std::vector filter_pool; - std::vector copy_pool; - std::vector agg_pool; - - int thread_id = 0; - // cpus on node 2 (for sapphire rapids), that the threads should be executed on - std::vector> pinning_ranges {std::make_pair(24, 36), std::make_pair(120, 132)}; - - // create all threads for all thread groups and for every task (copy, filter, aggregation), according their specific theadcount - for(uint32_t gid = 0; gid < THREAD_GROUP_MULTIPLIER; ++gid) { - for(uint32_t tid = 0; tid < tc_filter; ++tid) { - filter_pool.emplace_back(filter_lambda, gid, THREAD_GROUP_MULTIPLIER, tid); - pin_thread_in_range(filter_pool.back(), thread_id++, pinning_ranges); - } - for(uint32_t tid = 0; tid < tc_copy; ++tid) { - copy_pool.emplace_back(copy_lambda, gid, THREAD_GROUP_MULTIPLIER, tid); - pin_thread_in_range(copy_pool.back(), thread_id++, pinning_ranges); - } - for(uint32_t tid = 0; tid < tc_agg; ++tid) { - agg_pool.emplace_back(aggregation_lambda, gid, THREAD_GROUP_MULTIPLIER, tid); - pin_thread_in_range(agg_pool.back(), thread_id++, pinning_ranges); - } - } - - // start the clock - auto start = std::chrono::steady_clock::now(); - // set value to the promise, to signal the waiting threads, that they can start now - p.set_value(); - - // wait for all thread to be finished - for(std::thread& t : filter_pool) { t.join(); } - for(std::thread& t : copy_pool) { t.join(); } - for(std::thread& t : agg_pool) { t.join(); } - - // sum up the results of all the aggregation threads to get a final result - Aggregation::apply(&result, 
results, sizeof(base_t) * tc_agg * THREAD_GROUP_MULTIPLIER); - auto end = std::chrono::steady_clock::now(); - - // get the overall execution time in seconds - constexpr double nanos_per_second = ((double)1000) * 1000 * 1000; - uint64_t nanos = std::chrono::duration_cast(end - start).count(); - exec_time = (double)(nanos) / nanos_per_second; - - // calculate the checksum according to the used query - if constexpr (QUERY == 1) { - // QUERY == 1 -> simple query is applied - checksum = sum_check(compare_value_a, data_a, data_b, workload_b); - } else { - checksum = sum_check_complex(compare_value_a, compare_value_b, data_a, data_b, workload_b); - } - - delete qw; - } -}; diff --git a/qdp_project/src/benchmark/doubly_filtered_agg.cpp b/qdp_project/src/benchmark/doubly_filtered_agg.cpp deleted file mode 100644 index eaee93d..0000000 --- a/qdp_project/src/benchmark/doubly_filtered_agg.cpp +++ /dev/null @@ -1,149 +0,0 @@ - -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "aggregation.h" -#include "array_utils.h" -#include "cpu_set_utils.h" -#include "file_output.h" -#include "iterable_range.h" -#include "memory_literals.h" -#include "pipelines/scan_filter_pipe.h" - -int main () { - - using base_t = uint64_t; - - - const size_t workload = 2_GiB; - const char filename[256] = "../results/doubly_filtered_results_stronger_affinity_.csv"; - const uint32_t numa_local = 2; - const uint32_t numa_remote = 3; - - - Linear_Int_Range thread_group("thread_groups"); - Exp_Int_Range thread_count_filter("thread_cnt_filter"); - Exp_Int_Range thread_count_filter_copy("thread_cnt_filter_copy"); - Exp_Int_Range thread_count_aggregation("thread_cnt_agg"); - Linear_Int_Range run("run"); - Range mode("mode"); - Exp_Int_Range chunk_size("chunk_size"); - - std::ofstream out_file; - out_file.open(filename); - print_to_file(out_file, generateHead(run, chunk_size, mode, thread_count_filter, thread_count_filter_copy, - thread_count_aggregation, thread_group), "time", "scan_a", "scan_b", "aggr_j", "wait_aggr", "results"); - - base_t* data_a = (base_t*) numa_alloc_onnode(workload, numa_remote); - base_t* data_b = (base_t*) numa_alloc_onnode(workload, numa_remote); - base_t* data_b_hbm = (base_t*) numa_alloc_onnode(workload, numa_local); - fill_mt(data_a, workload, 0, 100, 42); - fill_mt(data_b, workload, 0, 100, 420); - std::memcpy(data_b_hbm, data_b, workload); - base_t* result = (base_t*) numa_alloc_onnode(thread_group.max * thread_count_aggregation.max * sizeof(base_t), - numa_remote); - - std::string iteration("init"); - Query_Wrapper* qw = nullptr; - - while(iteration != "false") { - - std::promise p; - std::shared_future ready_future(p.get_future()); - - if(iteration != "run") { - if(qw != nullptr) { - delete qw; - } - - switch(mode.current) { - case PMode::expl_copy: - qw = new Query_Wrapper(&ready_future, workload, chunk_size.current, data_a, data_b, result, numa_local, numa_remote, - thread_count_filter.current, thread_count_filter_copy.current, thread_count_aggregation.current, - mode.current, thread_group.current, (base_t) 50, (base_t) 42, false); - break; - case PMode::no_copy: - qw = new Query_Wrapper(&ready_future, workload, chunk_size.current, data_a, data_b, result, numa_local, numa_remote, - thread_count_filter.current, thread_count_filter_copy.current, thread_count_aggregation.current, - mode.current, thread_group.current, (base_t) 50, (base_t) 42, true); - break; - case PMode::hbm: - qw = new Query_Wrapper(&ready_future, workload, chunk_size.current, data_a, 
data_b_hbm, result, numa_local, numa_remote, - thread_count_filter.current, thread_count_filter_copy.current, thread_count_aggregation.current, - mode.current, thread_group.current, (base_t) 50, (base_t) 42, true); - break; - } - } - qw->ready_future = &ready_future; - qw->clear_buffers(); - - - // todo create threads depending on mode - std::vector thread_pool; - auto filter_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_a(gid, gcnt, tid); }; - auto filter_copy_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_b(gid, gcnt, tid); }; - auto aggregation_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->aggr_j(gid, gcnt, tid); }; - - - /* Intel Xeon Gold 6130 // todo implement different for 5120 -> fewer cpus - node 0 cpus: 0-15 64- 79 - node 1 cpus: 16-31 80- 95 - node 2 cpus: 32-47 96-111 - node 3 cpus: 48-63 112-127 - */ - int thread_id = 0; - std::vector> range {std::make_pair(0, 16), std::make_pair(64, 80)}; - for(uint32_t gid = 0; gid < thread_group.current; ++gid) { - - - for(uint32_t tid = 0; tid < thread_count_filter.current; ++tid) { - thread_pool.emplace_back(filter_lambda, gid, thread_group.current, tid); - pin_thread_in_range(thread_pool.back(), thread_id++, range); - } - - for(uint32_t tid = 0; tid < thread_count_filter_copy.current; ++tid) { - thread_pool.emplace_back(filter_copy_lambda, gid, thread_group.current, tid); - pin_thread_in_range(thread_pool.back(), thread_id++, range); - } - - for(uint32_t tid = 0; tid < thread_count_aggregation.current; ++tid) { - thread_pool.emplace_back(aggregation_lambda, gid, thread_group.current, tid); - pin_thread_in_range(thread_pool.back(), thread_id++, range); - } - } - - auto start = std::chrono::steady_clock::now(); - p.set_value(); - - // wait for every thread to join - for(std::thread& t : thread_pool) t.join(); - // aggregate all partial results - Aggregation::apply(result, result, - sizeof(base_t) * thread_count_aggregation.current * thread_group.current); - - auto end = std::chrono::steady_clock::now(); - - double duration = std::chrono::duration_cast(end-start).count() / (double)1000000000; - - - //TODO add mode - print_to_file(out_file, run, chunk_size, mode_manager::string(mode.current), thread_count_filter, - thread_count_filter_copy, thread_count_aggregation, thread_group, duration, - qw->trt->summarize_time(0), qw->trt->summarize_time(1), - qw->trt->summarize_time(2), qw->trt->summarize_time(3), *result); - iteration = IterateOnce(run, chunk_size, mode, thread_count_filter, thread_count_filter_copy, thread_count_aggregation, thread_group); - } - - auto end = std::chrono::system_clock::now(); - std::time_t end_time = std::chrono::system_clock::to_time_t(end); - std::cout << "finished computation at " << std::ctime(&end_time) << std::endl; - - print_to_file(out_file, std::ctime(&end_time)); -} \ No newline at end of file diff --git a/qdp_project/src/benchmark/filter_aggregate_pipeline.cpp b/qdp_project/src/benchmark/filter_aggregate_pipeline.cpp deleted file mode 100644 index b4a6753..0000000 --- a/qdp_project/src/benchmark/filter_aggregate_pipeline.cpp +++ /dev/null @@ -1,184 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "const.h" - -#include "file_output.h" -#include "array_utils.h" -#include "timer_utils.h" -#include "barrier_utils.h" -#include "cpu_set_utils.h" -#include "iterable_range.h" -#include "memory_literals.h" 
-#include "pipelines/scan_filter_pipe.h" - -#include "aggregation.h" -#include "filter.h" - -using base_t = uint64_t; - -base_t sum_check(base_t compare_value, base_t* row_A, base_t* row_B, size_t row_size) { - base_t sum = 0; - for(int i = 0; i < row_size / sizeof(base_t); ++i) { - sum += (row_A[i] < compare_value) * row_B[i]; - } - return sum; -} - - -int main(int argc, char** argv) { - size_t workload_b = 2_GiB; - std::ofstream out_file; - out_file.open("filter_aggreagate_pipe_bm_" + (std::string) BARRIER_MODE + ".csv"); - - Linear_Int_Range thread_group("thread_groups"); - Linear_Int_Range run("run"); - Exp_Int_Range chunk_size("chunk_size"); - Linear_Int_Range thread_count_filter("thread_cnt_filter"); - Linear_Int_Range thread_count_copy("thread_cnt_copy"); - Linear_Int_Range thread_count_aggregation("thread_cnt_agg"); - Range mode("mode"); - - uint32_t remote_node = 2; - uint32_t remote_node_2 = 2; - uint32_t local_node = 10; - - print_to_file(out_file, generateHead(run, chunk_size, mode, thread_count_filter, thread_count_copy, - thread_count_aggregation, thread_group), "time", - #ifdef THREAD_TIMINGS - "scan_a", "scan_b", "aggr_j", - #endif - #ifdef BARRIER_TIMINGS - "wait_scan_a", "wait_scan_b", "wait_aggr_j", - #endif - "result"); - - - /*** alloc data and buffers ************************************************/ - base_t* data_a = (base_t*) numa_alloc_onnode(workload_b, remote_node); - base_t* data_b = (base_t*) numa_alloc_onnode(workload_b, remote_node_2); - base_t* data_b_hbm = (base_t *) numa_alloc_onnode(workload_b, local_node); - fill_mt(data_a, workload_b, 0, 100, 42); - fill_mt(data_b, workload_b, 0, 100, 420); - std::memcpy(data_b_hbm, data_b, workload_b); - base_t* results = (base_t*) numa_alloc_onnode(thread_group.max * thread_count_aggregation.max * sizeof(base_t), remote_node); - - std::string iteration("init"); - const bool simple_query = true; - Query_Wrapper* qw = nullptr; - while(iteration != "false") { - base_t compare_value = 50; - std::promise p; - std::shared_future ready_future(p.get_future()); - - if(iteration != "run") { - - if(qw != nullptr) { - delete qw; - } - - std::cout << "Changing to mode " << mode.current << " chunksize " << chunk_size.current << " thread_group " << thread_group.current << std::endl; - switch(mode.current) { - case PMode::expl_copy: - qw = new Query_Wrapper(&ready_future, workload_b, chunk_size.current, data_a, data_b, results, local_node, remote_node, - thread_count_filter.current, thread_count_copy.current, thread_count_aggregation.current, mode.current, thread_group.current, (base_t) 50, (base_t) 42, false); - break; - case PMode::no_copy: - qw = new Query_Wrapper(&ready_future, workload_b, chunk_size.current, data_a, data_b, results, local_node, remote_node, - thread_count_filter.current, thread_count_copy.current, thread_count_aggregation.current, mode.current, thread_group.current, (base_t) 50, (base_t) 42, true); - break; - case PMode::hbm: - qw = new Query_Wrapper(&ready_future, workload_b, chunk_size.current, data_a, data_b_hbm, results, local_node, remote_node, - thread_count_filter.current, thread_count_copy.current, thread_count_aggregation.current, mode.current, thread_group.current, (base_t) 50, (base_t) 42, true); - break; - } - } - - qw->ready_future = &ready_future; - qw->clear_buffers(); - - auto filter_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_a(gid, gcnt, tid); }; - auto copy_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_b(gid, gcnt, tid); }; - auto 
aggregation_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->aggr_j(gid, gcnt, tid); }; - - std::vector filter_pool; - std::vector copy_pool; - std::vector agg_pool; - - int thread_id = 0; - // std::vector> pinning_ranges {std::make_pair(28, 42), std::make_pair(84, 98)}; // node 2 heacboehm2 - std::vector> pinning_ranges {std::make_pair(32, 48), std::make_pair(96, 112)}; // node 2 heacboehm - - for(uint32_t gid = 0; gid < thread_group.current; ++gid) { - - for(uint32_t tid = 0; tid < thread_count_filter.current; ++tid) { - filter_pool.emplace_back(filter_lambda, gid, thread_group.current, tid); - pin_thread_in_range(filter_pool.back(), thread_id++, pinning_ranges); - } - - if(mode.current == PMode::expl_copy){ - for(uint32_t tid = 0; tid < thread_count_copy.current; ++tid) { - copy_pool.emplace_back(copy_lambda, gid, thread_group.current, tid); - pin_thread_in_range(copy_pool.back(), thread_id++, pinning_ranges); - } - } - - for(uint32_t tid = 0; tid < thread_count_aggregation.current; ++tid) { - agg_pool.emplace_back(aggregation_lambda, gid, thread_group.current, tid); - pin_thread_in_range(agg_pool.back(), thread_id++, pinning_ranges); - } - } - - auto start = std::chrono::steady_clock::now(); - p.set_value(); - - for(std::thread& t : filter_pool) { t.join(); } - for(std::thread& t : copy_pool) { t.join(); } - for(std::thread& t : agg_pool) { t.join(); } - - Aggregation::apply(results, results, sizeof(base_t) * thread_count_aggregation.current * thread_group.current); - auto end = std::chrono::steady_clock::now(); - - constexpr double nanos_per_second = ((double)1000) * 1000 * 1000; - uint64_t nanos = std::chrono::duration_cast(end - start).count(); - double seconds = (double)(nanos) / nanos_per_second; - - - - print_to_file(out_file, run, chunk_size, mode_manager::string(mode.current), thread_count_filter, - thread_count_copy, thread_count_aggregation, thread_group, seconds, - #ifdef THREAD_TIMINGS - qw->trt->summarize_time(0), qw->trt->summarize_time(1), qw->trt->summarize_time(2), - #endif - #ifdef BARRIER_TIMINGS - qw->bt->summarize_time(0), qw->bt->summarize_time(1), qw->bt->summarize_time(2), - #endif - results[0]); - - - iteration = IterateOnce(run, chunk_size, mode, thread_count_filter, thread_count_copy, thread_count_aggregation, thread_group); - - } - - numa_free(data_b_hbm, workload_b); - numa_free(data_a, workload_b); - numa_free(data_b, workload_b); - numa_free(results, thread_group.max * sizeof(base_t)); - -} \ No newline at end of file diff --git a/qdp_project/src/benchmark/latency.cpp b/qdp_project/src/benchmark/latency.cpp deleted file mode 100644 index 011066a..0000000 --- a/qdp_project/src/benchmark/latency.cpp +++ /dev/null @@ -1,188 +0,0 @@ -/* - * numa_memory_latency - * Copyright (c) 2017 UMEZAWA Takeshi - * This software is licensed under GNU GPL version 2 or later. 
- * - * This file has been modified - */ - -#include -#include -#include -#include -#include -#include -#include -#include "file_output.h" -#include -#include -#include -#include - -#ifndef VOLATILE -#define VOLATILE 0 -#endif - -#define cachelinesize 64 -union CACHELINE { - char cacheline[cachelinesize]; - #if VOLATILE - volatile CACHELINE* next; - #else - CACHELINE* next; - #endif /*VOLATILE*/ -}; - -#define REPT4(x) do { x; x; x; x; } while(0) -#define REPT16(x) do { REPT4(x); REPT4(x); REPT4(x); REPT4(x); } while(0); -#define REPT64(x) do { REPT16(x); REPT16(x); REPT16(x); REPT16(x); } while(0); -#define REPT256(x) do { REPT64(x); REPT64(x); REPT64(x); REPT64(x); } while(0); -#define REPT1024(x) do { REPT256(x); REPT256(x); REPT256(x); REPT256(x); } while(0); - -size_t bufsize = 1 * 1024 * 1024 * 1024; -size_t nloop = 128 * 1024; -std::vector offsets; - -#if VOLATILE - -volatile CACHELINE* walk(volatile CACHELINE* start) -{ - volatile CACHELINE* p = start; - for (size_t i = 0; i < nloop; ++i) { - REPT1024(p = p->next); - } - return p; -} - -#else - -CACHELINE* walk(CACHELINE* start, uint64_t* sum) -{ - CACHELINE* p = start; - for (size_t i = 0; i < nloop; ++i) { - REPT1024( - *sum += static_cast(p->cacheline[cachelinesize-1]); - p = p->next; - ); - } - return p; -} - -#endif /*VOLATILE*/ - -void bench(int tasknode, int memnode, std::ofstream* out_file) -{ - struct timespec ts_begin, ts_end, ts_elapsed; - - printf("bench(task=%d, mem=%d)\n", tasknode, memnode); - - if (numa_run_on_node(tasknode) != 0) { - printf("failed to run on node: %s\n", strerror(errno)); - return; - } - - CACHELINE* const buf = (CACHELINE*)numa_alloc_onnode(bufsize, memnode); - if (buf == NULL) { - printf("failed to allocate memory\n"); - return; - } - - for (size_t i = 0; i < offsets.size() - 1; ++i) { - // assuming that next-pointer never overwrites last Byte of the cacheline/union - buf[offsets[i]].cacheline[cachelinesize-1] = offsets[i] % 128; - buf[offsets[i]].next = buf + offsets[i+1]; - } - buf[offsets[offsets.size() - 1]].next = buf; - buf[offsets[offsets.size() - 1]].cacheline[cachelinesize-1] = offsets[offsets.size() - 1] % 128; - - uint64_t value = 0; - uint64_t* sum = &value; - - clock_gettime(CLOCK_MONOTONIC, &ts_begin); - - #if VOLATILE - walk(buf); - #else - walk(buf, sum); - #endif /*VOLATILE*/ - - clock_gettime(CLOCK_MONOTONIC, &ts_end); - - ts_elapsed.tv_nsec = ts_end.tv_nsec - ts_begin.tv_nsec; - ts_elapsed.tv_sec = ts_end.tv_sec - ts_begin.tv_sec; - if (ts_elapsed.tv_nsec < 0) { - --ts_elapsed.tv_sec; - ts_elapsed.tv_nsec += 1000*1000*1000; - } - double elapsed = ts_elapsed.tv_sec + 0.000000001 * ts_elapsed.tv_nsec; - printf("took %fsec. 
%fns/load\n", elapsed, elapsed/(1024*nloop)*(1000*1000*1000)); - print_to_file(*out_file, tasknode, memnode, elapsed/(1024*nloop)*(1000*1000*1000), *sum); - numa_free(buf, bufsize); -} - -struct RND { - std::mt19937 mt; - RND() : mt(time(NULL)) {} - std::mt19937::result_type operator()(std::mt19937::result_type n) { return mt() % n; } -} r; - -void usage(const char* prog) -{ - printf("usage: %s [-h] [bufsize] [nloop]\n", prog); -} - -int main(int argc, char* argv[]) -{ - int ch; - - while ((ch = getopt(argc, argv, "h")) != -1) { - switch (ch) { - case 'h': - default: - usage(argv[0]); - exit(1); - } - } - - argc -= optind; - argv += optind; - - if (argc > 1) { - // 1048576 KiB = 1 GiB - bufsize = atoi(argv[0]) * 1024; // in KiB - nloop = atoi(argv[1]) * 1024; - } - - offsets.resize(bufsize / cachelinesize); - - for (size_t i = 0; i < offsets.size(); ++i) - offsets[i] = i; - std::random_shuffle(offsets.begin() + 1, offsets.end(), r); - - uint64_t expected_checksum = 0; - #if VOLATILE == 0 - for (size_t i = 0; i < nloop * 1024; ++i) { - expected_checksum += offsets[i % offsets.size()] % 128; - } - #endif - - std::ofstream check_file; - check_file.open("../results/micro_bench/latency/micro_bench_latency_" + (std::string)(VOLATILE == 1 ? "volatile" : "sum") + ".checksum"); - check_file << expected_checksum; - check_file.close(); - - - printf("benchmark bufsize=%zuKiB, nloop=%zuKi\n", bufsize/1024, nloop/1024); - - std::ofstream out_file; - out_file.open("../results/micro_bench/latency/micro_bench_latency_"+ (std::string)(VOLATILE == 1 ? "volatile" : "sum") + ".csv"); - print_to_file(out_file, "tasknode", "memnode", "latency", "checksum"); - - for (int tasknode = 0; tasknode < 8; tasknode++) { - for (int memnode = 0; memnode < 16; memnode++) { - bench(tasknode, memnode, &out_file); - } - } - - return 0; -} \ No newline at end of file diff --git a/qdp_project/src/benchmark/micro_benchmarks.cpp b/qdp_project/src/benchmark/micro_benchmarks.cpp deleted file mode 100644 index 4e63f82..0000000 --- a/qdp_project/src/benchmark/micro_benchmarks.cpp +++ /dev/null @@ -1,271 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include "memory_literals.h" -#include "array_utils.h" -#include "file_output.h" -#include "aggregation.h" - - -using base_t = uint64_t; - -size_t thread_cnt_memcpy = 128; -size_t thread_cnt_read = 128; -size_t runs = 10; - - -base_t sum_up(base_t* data, size_t workload){ - base_t sum = 0; - for(int i = 0; i < workload/sizeof(base_t); i++){ - sum += data[i]; - } - return sum; -} - -int reverse_bits(int number, size_t bit_count) { - int result = 0; - for(int i = 0; i < bit_count; i++) { - result <<= 1; - result |= (number & 1); - number >>= 1; - } - return result; -} - - -double measure_memcpy_bw(base_t* src, base_t* dest, size_t workload, base_t* result){ - std::promise p; - std::shared_future ready_future(p.get_future()); - - auto thread_lambda = [&](base_t* source, base_t* destination, size_t count) { - ready_future.wait(); - memcpy(destination, source, count); - }; - - std::vector thread_pool; - size_t total_elements = workload / sizeof(base_t); - size_t elements_per_thread = total_elements / thread_cnt_memcpy; - size_t remainder = total_elements % thread_cnt_memcpy; - - for(size_t tid = 0; tid < thread_cnt_memcpy; tid++) { - size_t elements_to_process = elements_per_thread + (tid < remainder ? 
1 : 0); - size_t byte_offset = (elements_per_thread * tid + std::min(tid, remainder)) * sizeof(base_t); - - thread_pool.emplace_back(thread_lambda, src + byte_offset / sizeof(base_t), dest + byte_offset / sizeof(base_t), elements_to_process * sizeof(base_t)); - } - - auto start = std::chrono::steady_clock::now(); - p.set_value(); - for(std::thread& t : thread_pool) { t.join(); } - auto stop = std::chrono::steady_clock::now(); - - auto duration = std::chrono::duration_cast(stop - start); - double seconds = duration.count() / 1e9; - double throughput = (workload / seconds) / (1024 * 1024 * 1024); - *result = sum_up(dest, workload); - return throughput; -} - -double measure_read_bw(base_t* data, size_t workload, base_t* results){ - const size_t chunk_size = sizeof(__m512i); - const size_t num_chunks = (workload) / chunk_size; - __m512i* src = reinterpret_cast<__m512i*>(data); - std::promise p; - std::shared_future ready_future(p.get_future()); - size_t num_chunks_per_thread = num_chunks / thread_cnt_read; - size_t num_chunks_remainder = num_chunks % thread_cnt_read; - - auto thread_lambda = [&](__m512i* src, int tid, int num_chunks) { - __m512i accumulator = _mm512_setzero_si512(); - ready_future.wait(); - for (int i = 0; i < num_chunks; i++) { - __m512i chunk = _mm512_load_si512(&src[i]); - accumulator = _mm512_add_epi64(accumulator, chunk); - } - results[tid] = _mm512_reduce_add_epi64(accumulator); - }; - - std::vector thread_pool; - int offset; - for(int tid = 0; tid < thread_cnt_read; tid++){ - if(tid < num_chunks_remainder){ - offset = tid * (num_chunks_per_thread + 1); - thread_pool.emplace_back(thread_lambda, &src[offset], tid, (num_chunks_per_thread + 1)); - } else { - offset = tid*num_chunks_per_thread + num_chunks_remainder; - thread_pool.emplace_back(thread_lambda, &src[offset], tid, num_chunks_per_thread); - } - - } - - auto start = std::chrono::steady_clock::now(); - p.set_value(); - for(std::thread& t : thread_pool) { t.join(); } - auto stop = std::chrono::steady_clock::now(); - - Aggregation::apply(results, results, sizeof(base_t) * thread_cnt_read); - auto duration = std::chrono::duration_cast(stop - start); - double seconds = duration.count() / 1e9; - double throughput = (workload / seconds) / (1024 * 1024 * 1024); - return throughput; -} - -void exec_multiple_runs_memcpy(size_t workload, int exec_node, int src_node, int dest_node, std::ofstream* out_file, std::string iteration_type){ - base_t value; - base_t* result = &value; - base_t* src = (base_t*) numa_alloc_onnode(workload, src_node); - base_t* dest = (base_t*) numa_alloc_onnode(workload, dest_node); - fill_mt(src, workload, 0, 100, 42); - fill_mt(dest, workload, 0, 100, 12); - numa_run_on_node(exec_node); - - if(dest_node == 0 && src_node == 0){ - std::ofstream check_file; - check_file.open("../results/micro_bench/micro_bench_bw_memcpy_execnode_" + std::to_string(exec_node) - + "_threadcnt_" + std::to_string(thread_cnt_memcpy) + "_" + iteration_type + ".checksum"); - check_file << sum_up(src, workload); - check_file.close(); - } - - for(size_t run = 0; run < runs; run++){ - double bw = measure_memcpy_bw(src, dest, workload, result); - std::cout << "Copy throughput executed on node " << exec_node << " form node " << src_node << " to node " - << dest_node << ": " << bw << " GiB/s" << std::endl; - print_to_file(*out_file, run, src_node, dest_node, bw, *result); - std::memset(dest, 0x00, workload); - *result = 0; - } - numa_free(src, workload); - numa_free(dest, workload); -} - -void 
measure_all_memcpy_bw_for_chosen_execnode(int exec_node){ - std::ofstream out_file; - out_file.open("../results/micro_bench/micro_bench_bw_memcpy_execnode_" + std::to_string(exec_node) - + "_threadcnt_" + std::to_string(thread_cnt_memcpy) + ".csv"); - print_to_file(out_file, "run", "src_node", "dest_node", "bw", "result"); - const size_t workload = 4_GiB; - - for(int src_node = 0; src_node < 16; src_node++){ - for(int dest_node = 0; dest_node < 16; dest_node++){ - exec_multiple_runs_memcpy(workload, exec_node, src_node, dest_node, &out_file, ""); - } - } - out_file.close(); -} - -void measure_all_memcpy_bw_for_chosen_execnode_reversed(int exec_node){ - std::ofstream out_file; - out_file.open("../results/micro_bench/micro_bench_bw_memcpy_execnode_" + std::to_string(exec_node) - + "_threadcnt_" + std::to_string(thread_cnt_memcpy) + "_reversed.csv"); - print_to_file(out_file, "run", "src_node", "dest_node", "bw", "result"); - const size_t workload = 4_GiB; - - for(int src_node = 15; src_node >= 0; src_node--){ - for(int dest_node = 15; dest_node >= 0; dest_node--){ - exec_multiple_runs_memcpy(workload, exec_node, src_node, dest_node, &out_file, "reversed"); - } - } - out_file.close(); -} - - - -void measure_all_memcpy_bw_for_chosen_execnode_reversed_bitwise(int exec_node){ - std::ofstream out_file; - out_file.open("../results/micro_bench/micro_bench_bw_memcpy_execnode_" + std::to_string(exec_node) - + "_threadcnt_" + std::to_string(thread_cnt_memcpy) + "_reversed_bitwise.csv"); - print_to_file(out_file, "run", "src_node", "dest_node", "bw", "result"); - const size_t workload = 4_GiB; - - for(int src_node = 0; src_node < 16; src_node++){ - for(int dest_node = 0; dest_node < 16; dest_node++){ - int reversed_src_node = reverse_bits(src_node, 4); - int reversed_dest_node = reverse_bits(dest_node, 4); - exec_multiple_runs_memcpy(workload, exec_node, reversed_src_node, reversed_dest_node, &out_file, "reversed_bitwise"); - } - } - out_file.close(); -} - - -void exec_multiple_runs_read(size_t workload, int mem_node, int exec_node, std::ofstream *out_file, std::string iteration_type){ - base_t* data = (base_t*) numa_alloc_onnode(workload, mem_node); - fill_mt(data, workload, 0, 100, 42); - base_t* results = (base_t*) numa_alloc_onnode(thread_cnt_read * sizeof(base_t), exec_node); - numa_run_on_node(exec_node); - - if(mem_node == 0 && exec_node == 0){ - std::ofstream check_file; - check_file.open("../results/micro_bench/micro_bench_bw_read_threadcnt_" + std::to_string(thread_cnt_read) + "_" + iteration_type + ".checksum"); - check_file << sum_up(data, workload); - check_file.close(); - } - - for(size_t run = 0; run < runs; run++){ - double bw = measure_read_bw(data, workload, results); - std::cout << "Read throughput executed on node " << exec_node << " for node " << mem_node << ": " << bw << " GiB/s" << std::endl; - print_to_file(*out_file, run, exec_node, mem_node, bw, results[0]); - std::memset(results, 0x00, thread_cnt_read * sizeof(base_t)); - } - numa_free(data, workload); - numa_free(results, thread_cnt_read * sizeof(base_t)); -} - -void measure_all_read_bw(){ - std::ofstream out_file; - out_file.open("../results/micro_bench/micro_bench_bw_read_threadcnt_" + std::to_string(thread_cnt_read) + ".csv"); - print_to_file(out_file, "run", "exec_node", "mem_node", "bw", "result"); - const size_t workload = 8_GiB; - - for(int exec_node = 0; exec_node < 8; exec_node++){ - for(int mem_node = 0; mem_node < 16; mem_node++){ - exec_multiple_runs_read(workload, mem_node, exec_node, &out_file, ""); - } - } - 
out_file.close(); -} - -void measure_all_read_bw_reversed(){ - std::ofstream out_file; - out_file.open("../results/micro_bench/micro_bench_bw_read_threadcnt_" + std::to_string(thread_cnt_read) + "_reversed.csv"); - print_to_file(out_file, "run", "exec_node", "mem_node", "bw", "result"); - const size_t workload = 8_GiB; - - for(int exec_node = 7; exec_node >= 0; exec_node--){ - for(int mem_node = 15; mem_node >= 0; mem_node--){ - exec_multiple_runs_read(workload, mem_node, exec_node, &out_file, "reversed"); - } - } - out_file.close(); -} - -void measure_all_read_bw_reversed_bitwise(){ - std::ofstream out_file; - out_file.open("../results/micro_bench/micro_bench_bw_read_threadcnt_" + std::to_string(thread_cnt_read) + "_reversed_bitwise.csv"); - print_to_file(out_file, "run", "exec_node", "mem_node", "bw", "result"); - const size_t workload = 8_GiB; - - for(int exec_node0 = 0; exec_node0 < 8; exec_node0++){ - for(int mem_node0 = 0; mem_node0 < 16; mem_node0++){ - int mem_node = reverse_bits(mem_node0, 4); - int exec_node = reverse_bits(exec_node0, 3); - exec_multiple_runs_read(workload, mem_node, exec_node, &out_file, "reversed_bitwise"); - } - } - out_file.close(); -} - - - -int main() { - // nodes 0-7 hold cores and DRAM, nodes 8-15 only HBM - - measure_all_read_bw_reversed_bitwise(); - measure_all_memcpy_bw_for_chosen_execnode_reversed_bitwise(0); - - return 0; -} \ No newline at end of file diff --git a/qdp_project/src/benchmark/pipelines/DIMES_scan_filter_pipe.h b/qdp_project/src/benchmark/pipelines/DIMES_scan_filter_pipe.h deleted file mode 100644 index 6dbc652..0000000 --- a/qdp_project/src/benchmark/pipelines/DIMES_scan_filter_pipe.h +++ /dev/null @@ -1,391 +0,0 @@ - -#include -#include -#include -#include - -#include - -#include "filter.h" -#include "aggregation.h" -#include "vector_loader.h" -#include "timer_utils.h" -#include "barrier_utils.h" -#include "execution_modes.h" - - -template -class Query_Wrapper { -public: - // sync - std::shared_future* ready_future; - - thread_runtime_timing* trt; - barrier_timing* bt; - -private: - // numa - uint32_t close_mem; - uint32_t far_mem; - - // data - size_t size_b; - size_t chunk_size_b; - size_t chunk_size_w; - size_t chunk_cnt; - base_t* data_a; - base_t* data_b; - base_t* dest; - - // ratios - uint32_t thread_count_fc; - uint32_t thread_count_fi; - uint32_t thread_count_ag; - uint32_t thread_group; - - // done bits - volatile uint8_t* ready_flag_a; - volatile uint8_t* ready_flag_b; - std::mutex ready_a_m; - std::mutex ready_b_m; - - // buffer - uint16_t* mask_a; - uint16_t* mask_b; - base_t** buffer_b; - - // params - base_t cmp_a; - base_t cmp_b; - bool no_copy; - NewPMode mode; - - // sync - std::unique_ptr*>> sync_barrier; - std::string barrier_mode = BARRIER_MODE; - - using filterCopy = Filter; - using filterNoCopy = Filter; - using filter = Filter; - using aggregation = Aggregation; - -public: - - - Query_Wrapper(std::shared_future* rdy_fut, size_t workload_b, size_t chunk_size_b, base_t* data_a, - base_t* data_b, base_t* dest, uint32_t numa_close, uint32_t numa_far, uint32_t tc_fi, uint32_t tc_fc, uint32_t tc_ag, - NewPMode mode, uint32_t thread_group, base_t cmp_a = 50, base_t cmp_b = 42, bool no_copy = false) : - ready_future(rdy_fut), size_b(workload_b), chunk_size_b(chunk_size_b), data_a(data_a), data_b(data_b), - dest(dest), close_mem(numa_close), far_mem(numa_far), mode(mode), thread_group(thread_group), cmp_a(cmp_a), cmp_b(cmp_b), no_copy(no_copy){ - - chunk_size_w = chunk_size_b / sizeof(base_t); - chunk_cnt = size_b / 
chunk_size_b; - thread_count_fi = tc_fi; - thread_count_fc = tc_fc; - thread_count_ag = tc_ag; - - ready_flag_a = (volatile uint8_t *) numa_alloc_onnode( - chunk_cnt * thread_count_fi / 8 + ((chunk_cnt * thread_count_fi % 8) != 0), close_mem); - ready_flag_b = (volatile uint8_t *) numa_alloc_onnode( - chunk_cnt * thread_count_fc / 8 + ((chunk_cnt * thread_count_fc % 8) != 0), close_mem); - - mask_a = (uint16_t *) numa_alloc_onnode(size_b / sizeof(base_t), close_mem); - mask_b = (uint16_t *) numa_alloc_onnode(size_b / sizeof(base_t), close_mem); - - trt = new thread_runtime_timing(4, 16*4*4*4, close_mem); - bt = new barrier_timing(4, 16*4*4*4, close_mem); - reset_barriers(); - - if constexpr(BUFFER_LIMIT==1) { - // TODO size ok like that? - buffer_b = (base_t**) numa_alloc_onnode(size_b * sizeof(base_t*), close_mem); - buffer_b[0] = (base_t*) numa_alloc_onnode(thread_group * chunk_size_b, close_mem); - buffer_b[1] = (base_t*) numa_alloc_onnode(thread_group * chunk_size_b, close_mem); - } else { - buffer_b = (base_t **) numa_alloc_onnode(sizeof(base_t*), close_mem); - base_t* buffer_tmp = (base_t *) numa_alloc_onnode(size_b, close_mem); - *buffer_b = buffer_tmp; - } - }; - - void reset_barriers(){ - if(sync_barrier != nullptr) { - for(auto& barrier : *sync_barrier) { - delete barrier; - } - sync_barrier.reset(); - } - - sync_barrier = std::make_unique*>>(thread_group); - uint32_t thread_count_sum = thread_count_ag + thread_count_fi + thread_count_fc; - uint32_t barrier_count = barrier_mode.compare("global") == 0 ? 1 : thread_group; - uint32_t barrier_thread_count; - - if constexpr(simple){ - barrier_thread_count = (thread_group / barrier_count) * - (mode == NewPMode::Prefetch ? thread_count_sum : (thread_count_ag + thread_count_fi)); - } else { - barrier_thread_count = (thread_group / barrier_count) * thread_count_sum; - } - for(uint32_t i = 0; i < barrier_count; ++i) { - (*sync_barrier)[i] = new std::barrier(barrier_thread_count); - } - } - - void clear_buffers () { - std::memset((void*)ready_flag_a, 0x00, chunk_cnt * thread_count_fi / 8 + ((chunk_cnt * thread_count_fi % 8) != 0)); - std::memset((void*)ready_flag_b, 0x00, chunk_cnt * thread_count_fc / 8 + ((chunk_cnt * thread_count_fc % 8) != 0)); - - std::memset(mask_a, 0x00, size_b / sizeof(base_t)); - std::memset(mask_b, 0x00, size_b / sizeof(base_t)); - if constexpr(BUFFER_LIMIT==1) { - std::memset(buffer_b[0], 0x00, thread_group * chunk_size_b); - std::memset(buffer_b[1], 0x00, thread_group * chunk_size_b); - } else { - std::memset(*buffer_b, 0x00, size_b); - } - - trt->reset_accumulator(); - bt->reset_accumulator(); - reset_barriers(); - }; - - ~Query_Wrapper() { - numa_free((void*)ready_flag_a, - chunk_cnt * thread_count_fi / 8 + ((chunk_cnt * thread_count_fi % 8) != 0)); - numa_free((void*)ready_flag_b, - chunk_cnt * thread_count_fc / 8 + ((chunk_cnt * thread_count_fc % 8) != 0)); - - numa_free(mask_a, size_b / sizeof(base_t)); - numa_free(mask_b, size_b / sizeof(base_t)); - if constexpr(BUFFER_LIMIT==1) { - numa_free(buffer_b[0], thread_group * chunk_size_b); - numa_free(buffer_b[1], thread_group * chunk_size_b); - numa_free(buffer_b, size_b * sizeof(base_t*)); - } else { - numa_free(*buffer_b, size_b); - } - - delete trt; - for(auto& barrier : *sync_barrier) { - delete barrier; - } - delete bt; - - }; - - //this can be set without need to change allocations - void set_thread_group_count(uint32_t value) { - this->thread_group = value; - }; - -private: - static inline base_t* get_sub_chunk_ptr(base_t* base_ptr, size_t chunk_id, 
size_t chunk_size_w, size_t tid, - size_t tcnt) { - base_t* chunk_ptr = base_ptr + chunk_id * chunk_size_w; - return chunk_ptr + tid * (chunk_size_w / tcnt); - } - - static inline uint16_t* get_sub_mask_ptr(uint16_t* base_ptr, size_t chunk_id, size_t chunk_size_w, size_t tid, - size_t tcnt) { - // 16 integer are addressed with one uint16_t in mask buffer - size_t offset = chunk_id * chunk_size_w + tid * (chunk_size_w / tcnt); - return base_ptr + (offset / 16); - } - - static bool bit_at(volatile uint8_t* bitmap, uint32_t bitpos) { - uint8_t value = bitmap[bitpos / 8]; - switch(bitpos % 8) { - case 0: return value & 0b00000001; - case 1: return value & 0b00000010; - case 2: return value & 0b00000100; - case 3: return value & 0b00001000; - case 4: return value & 0b00010000; - case 5: return value & 0b00100000; - case 6: return value & 0b01000000; - case 7: return value & 0b10000000; - default: return false; - } - } - - static void set_bit_at(volatile uint8_t* bitmap, std::mutex& mutex, uint32_t bitpos) { - mutex.lock(); - switch(bitpos % 8) { - case 0: bitmap[bitpos / 8] |= 0b00000001;break; - case 1: bitmap[bitpos / 8] |= 0b00000010;break; - case 2: bitmap[bitpos / 8] |= 0b00000100;break; - case 3: bitmap[bitpos / 8] |= 0b00001000;break; - case 4: bitmap[bitpos / 8] |= 0b00010000;break; - case 5: bitmap[bitpos / 8] |= 0b00100000;break; - case 6: bitmap[bitpos / 8] |= 0b01000000;break; - case 7: bitmap[bitpos / 8] |= 0b10000000;break; - } - mutex.unlock(); - } - -public: - - static base_t checksum(base_t* a, base_t* b, base_t cmp_a, base_t cmp_b, size_t size_b) { - base_t sum = 0; - for(int i = 0; i < size_b / sizeof(base_t); ++i) { - if(a[i] >= cmp_a && b[i] <= cmp_b) { - sum += b[i]; - } - } - return sum; - } - - static void checkmask(uint16_t* mask, base_t cmp, base_t* data, size_t size_b, bool leq) { - uint32_t cnt = 0; - for(int i = 0; i < size_b / sizeof(base_t); ++i) { - if(leq) { - if(((data[i] <= cmp) != bit_at((uint8_t*)mask, i))) { - ++cnt; - } - } else { - if(((data[i] >= cmp) != bit_at((uint8_t*)mask, i))) { - ++cnt; - } - } - } - } - - static void checkmask_16(uint16_t* mask, base_t cmp, base_t* data, size_t size_b, bool leq) { - for(int i = 0; i < size_b / sizeof(base_t) / 16 ; ++i) { - std::bitset<16> m(mask[i]); - uint16_t ch = 0; - for(int j = 0; j < 16; ++j) { - if(data[i*16 + j] <= cmp) { - ch |= 0x1 << j; - } - } - std::bitset<16> c(ch); - - std::cout << "act " << m << std::endl; - std::cout << "rea " << c << std::endl << std::endl; - } - } - - - void scan_b(size_t gid, size_t gcnt, size_t tid) { - size_t tcnt = thread_count_fc; - assert(chunk_size_w % tcnt == 0); - assert(chunk_size_w % 16 == 0); - assert(chunk_size_w % tcnt * 16 == 0); - - // wait till everyone can start - ready_future->wait(); - - // the lower gids run once more if the chunks are not evenly distributable - uint32_t runs = chunk_cnt / gcnt + (chunk_cnt % gcnt > gid); - uint32_t barrier_idx = barrier_mode.compare("global") == 0 ? 
0 : gid; - for(uint32_t i = 0; i < runs; ++i) { - trt->start_timer(1, tid * gcnt + gid); - - // calculate pointers - size_t chunk_id = gid + gcnt * i; - base_t* chunk_ptr = get_sub_chunk_ptr(data_b , chunk_id, chunk_size_w, tid, tcnt); - uint16_t* mask_ptr = get_sub_mask_ptr (mask_b , chunk_id, chunk_size_w, tid, tcnt); - - if constexpr(simple){ - base_t* buffer_ptr; - if constexpr(BUFFER_LIMIT==1) { - buffer_ptr = get_sub_chunk_ptr(buffer_b[i % 2], gid, chunk_size_w, tid, tcnt); - } else { - buffer_ptr = get_sub_chunk_ptr(*buffer_b, chunk_id, chunk_size_w, tid, tcnt); - } - std::memcpy(buffer_ptr, chunk_ptr, chunk_size_b / tcnt); - } else { - if(no_copy) { - filterNoCopy::apply_same(mask_ptr, nullptr, chunk_ptr, cmp_b, chunk_size_b / tcnt); - } else { - base_t* buffer_ptr; - if constexpr(BUFFER_LIMIT==1) { - buffer_ptr = get_sub_chunk_ptr(buffer_b[i % 2], gid, chunk_size_w, tid, tcnt); - } else { - buffer_ptr = get_sub_chunk_ptr(*buffer_b, chunk_id, chunk_size_w, tid, tcnt); - } - filterCopy::apply_same(mask_ptr, buffer_ptr, chunk_ptr, cmp_b, chunk_size_b / tcnt); - } - } - - trt->stop_timer(1, tid * gcnt + gid); - bt->timed_wait(*(*sync_barrier)[barrier_idx], 1, tid * gcnt + gid); - - } - (*(*sync_barrier)[barrier_idx]).arrive_and_drop(); - - } - - void scan_a(size_t gid, size_t gcnt, size_t tid) { - size_t tcnt = thread_count_fi; - assert(chunk_size_w % tcnt == 0); - assert(chunk_size_w % 16 == 0); - assert(chunk_size_w % tcnt * 16 == 0); - - // wait till everyone can start - ready_future->wait(); - - // the lower gids run once more if the chunks are not evenly distributable - uint32_t runs = chunk_cnt / gcnt + (chunk_cnt % gcnt > gid); - uint32_t barrier_idx = barrier_mode.compare("global") == 0 ? 0 : gid; - for(uint32_t i = 0; i < runs; ++i) { - trt->start_timer(0, tid * gcnt + gid); - // calculate pointers - size_t chunk_id = gid + gcnt * i; - base_t* chunk_ptr = get_sub_chunk_ptr(data_a, chunk_id, chunk_size_w, tid, tcnt); - uint16_t* mask_ptr = get_sub_mask_ptr (mask_a, chunk_id, chunk_size_w, tid, tcnt); - - filter::apply_same(mask_ptr, nullptr, chunk_ptr, cmp_a, chunk_size_b / tcnt); - - trt->stop_timer(0, tid * gcnt + gid); - bt->timed_wait(*(*sync_barrier)[barrier_idx], 0, tid * gcnt + gid); - } - (*(*sync_barrier)[barrier_idx]).arrive_and_drop(); - } - - void aggr_j(size_t gid, size_t gcnt, size_t tid) { - size_t tcnt = thread_count_ag; - // wait till everyone can start - ready_future->wait(); - - // calculate values - __m512i aggregator = aggregation::OP::zero(); - // the lower gids run once more if the chunks are not evenly distributable - uint32_t runs = chunk_cnt / gcnt + (chunk_cnt % gcnt > gid); - uint32_t barrier_idx = barrier_mode.compare("global") == 0 ? 
0 : gid; - for(uint32_t i = 0; i < runs; ++i) { - - bt->timed_wait(*(*sync_barrier)[barrier_idx], 2, tid * gcnt + gid); - trt->start_timer(2, tid * gcnt + gid); - - // calculate pointers - size_t chunk_id = gid + gcnt * i; - base_t* chunk_ptr; - if(no_copy) { - chunk_ptr = get_sub_chunk_ptr(data_b, chunk_id, chunk_size_w, tid, tcnt); - } else { - if constexpr(BUFFER_LIMIT==1) { - chunk_ptr = get_sub_chunk_ptr(buffer_b[i % 2], gid, chunk_size_w, tid, tcnt); - } else { - chunk_ptr = get_sub_chunk_ptr(*buffer_b, chunk_id, chunk_size_w, tid, tcnt); - } - } - uint16_t* mask_ptr_a = get_sub_mask_ptr (mask_a, chunk_id, chunk_size_w, tid, tcnt); - uint16_t* mask_ptr_b = get_sub_mask_ptr (mask_b, chunk_id, chunk_size_w, tid, tcnt); - - base_t tmp = _mm512_reduce_add_epi64(aggregator); - if constexpr(simple){ - aggregator = aggregation::apply_masked(aggregator, chunk_ptr, mask_ptr_a, chunk_size_b / tcnt); - } else { - aggregator = aggregation::apply_masked(aggregator, chunk_ptr, mask_ptr_a, mask_ptr_b, chunk_size_b / tcnt); - } - trt->stop_timer(2, tid * gcnt + gid); - } - - // so threads with more runs dont wait for finished threads - (*(*sync_barrier)[barrier_idx]).arrive_and_drop(); - - aggregation::happly(dest + (tid * gcnt + gid), aggregator); - } -}; \ No newline at end of file diff --git a/qdp_project/src/benchmark/pipelines/MAX_scan_filter_pipe.h b/qdp_project/src/benchmark/pipelines/MAX_scan_filter_pipe.h index 3b1d861..e224391 100644 --- a/qdp_project/src/benchmark/pipelines/MAX_scan_filter_pipe.h +++ b/qdp_project/src/benchmark/pipelines/MAX_scan_filter_pipe.h @@ -15,9 +15,9 @@ #include "measurement_utils.h" #include "execution_modes.h" -#include "../../../thirdParty/dsa_offload/offloading-cacher/cache.hpp" +#include "../../../../offloading-cacher/cache.hpp" -template +template class Query_Wrapper { public: // sync @@ -28,11 +28,9 @@ public: pcm_value_collector* pvc; private: - dsacache::Cache cache_; + static constexpr size_t COPY_POLICY_MIN_SIZE = 64 * 1024 * 1024; - // numa - uint32_t close_mem; - uint32_t far_mem; + dsacache::Cache cache_; // data size_t size_b; @@ -47,13 +45,11 @@ private: uint32_t thread_count_fc; uint32_t thread_count_fi; uint32_t thread_count_ag; - uint32_t thread_group; + uint32_t thread_count; // done bits volatile uint8_t* ready_flag_a; volatile uint8_t* ready_flag_b; - std::mutex ready_a_m; - std::mutex ready_b_m; // buffer uint16_t* mask_a; @@ -73,70 +69,72 @@ private: using filter = Filter; using aggregation = Aggregation; - void InitCache(const std::string& device) { - if (device == "default") { - static const auto cache_policy = [](const int numa_dst_node, const int numa_src_node, const size_t data_size) { - return numa_dst_node; - }; + static int CachePlacementPolicy(const int numa_dst_node, const int numa_src_node, const size_t data_size) { + return numa_dst_node < 8 ? 
numa_dst_node + 8 : numa_dst_node; + } - static const auto copy_policy = [](const int numa_dst_node, const int numa_src_node) { - return std::vector{ numa_src_node, numa_dst_node }; - }; + static std::vector CopyMethodPolicy(const int numa_dst_node, const int numa_src_node, const size_t data_size) { + if (data_size < COPY_POLICY_MIN_SIZE) { + // if the data size is small then the copy will just be carried + // out by the destination node which does not require setting numa + // thread affinity as the selected dsa engine is already the one + // present on the calling thread - cache_.Init(cache_policy,copy_policy); - } - else if (device == "xeonmax") { - static const auto cache_policy = [](const int numa_dst_node, const int numa_src_node, const size_t data_size) { - return numa_dst_node < 8 ? numa_dst_node + 8 : numa_dst_node; - }; - - static const auto copy_policy = [](const int numa_dst_node, const int numa_src_node) { - const bool same_socket = ((numa_dst_node ^ numa_src_node) & 4) == 0; - if (same_socket) { - const bool socket_number = numa_dst_node >> 2; - if (socket_number == 0) return std::vector{ 0, 1, 2, 3 }; - else return std::vector{ 4, 5, 6, 7 }; - } - else return std::vector{ numa_src_node, numa_dst_node }; - }; - - cache_.Init(cache_policy,copy_policy); + return std::vector{ (numa_dst_node >= 8 ? numa_dst_node - 8 : numa_dst_node) }; } else { - std::cerr << "Given device '" << device << "' not supported!" << std::endl; - exit(-1); + // for sufficiently large data, smart copy is used which will utilize + // all four engines for intra-socket copy operations and cross copy on + // the source and destination nodes for inter-socket copy + + const bool same_socket = ((numa_dst_node ^ numa_src_node) & 4) == 0; + + if (same_socket) { + const bool socket_number = numa_dst_node >> 2; + if (socket_number == 0) return std::vector{ 0, 1, 2, 3 }; + else return std::vector{ 4, 5, 6, 7 }; + } + else { + return std::vector{ + (numa_src_node >= 8 ? numa_src_node - 8 : numa_src_node), + (numa_dst_node >= 8 ? 
numa_dst_node - 8 : numa_dst_node) + }; + } } } public: - - - Query_Wrapper(std::shared_future* rdy_fut, size_t workload_b, size_t chunk_size_b, base_t* data_a, - base_t* data_b, base_t* dest, uint32_t numa_close, uint32_t numa_far, uint32_t tc_fi, uint32_t tc_fc, uint32_t tc_ag, - NewPMode mode, uint32_t thread_group, base_t cmp_a = 50, base_t cmp_b = 42) : + Query_Wrapper(std::shared_future* rdy_fut, size_t workload_b, size_t chunk_size_b, base_t* data_a, + base_t* data_b, base_t* dest, uint32_t tc_fi, uint32_t tc_fc, uint32_t tc_ag, + NewPMode mode, base_t cmp_a = 50, base_t cmp_b = 42) : ready_future(rdy_fut), size_b(workload_b), chunk_size_b(chunk_size_b), data_a(data_a), data_b(data_b), - dest(dest), close_mem(numa_close), far_mem(numa_far), mode(mode), thread_group(thread_group), cmp_a(cmp_a), cmp_b(cmp_b){ - + dest(dest), mode(mode), cmp_a(cmp_a), cmp_b(cmp_b) { + + const int current_cpu = sched_getcpu(); + const int current_node = numa_node_of_cpu(current_cpu); + const int cache_node = CachePlacementPolicy(current_node, current_node, 0); + chunk_size_w = chunk_size_b / sizeof(base_t); chunk_cnt = size_b / chunk_size_b; + thread_count_fi = tc_fi; thread_count_fc = tc_fc; thread_count_ag = tc_ag; - ready_flag_a = (volatile uint8_t *) numa_alloc_onnode( - chunk_cnt * thread_count_fi / 8 + ((chunk_cnt * thread_count_fi % 8) != 0), close_mem); - ready_flag_b = (volatile uint8_t *) numa_alloc_onnode( - chunk_cnt * thread_count_fc / 8 + ((chunk_cnt * thread_count_fc % 8) != 0), close_mem); + thread_count = tc_fi + tc_fc + tc_ag; + + ready_flag_a = (volatile uint8_t *) numa_alloc_onnode( chunk_cnt * thread_count_fi / 8 + ((chunk_cnt * thread_count_fi % 8) != 0), cache_node); + ready_flag_b = (volatile uint8_t *) numa_alloc_onnode( chunk_cnt * thread_count_fc / 8 + ((chunk_cnt * thread_count_fc % 8) != 0), cache_node); - mask_a = (uint16_t *) numa_alloc_onnode(size_b / sizeof(base_t), close_mem); - mask_b = (uint16_t *) numa_alloc_onnode(size_b / sizeof(base_t), close_mem); + mask_a = (uint16_t *) numa_alloc_onnode(size_b / sizeof(base_t), cache_node); + mask_b = (uint16_t *) numa_alloc_onnode(size_b / sizeof(base_t), cache_node); - InitCache("xeonmax"); + cache_.Init(CachePlacementPolicy, CopyMethodPolicy); - size_t measurement_space = THREAD_GROUP_MULTIPLIER * std::max(std::max(tc_fi, tc_fc), tc_ag); - trt = new thread_runtime_timing(3, measurement_space, far_mem); - bt = new barrier_timing(3, measurement_space, far_mem); - pvc = new pcm_value_collector({"scan_a", "scan_b", "aggr_j"}, measurement_space, far_mem); + size_t measurement_space = std::max(std::max(tc_fi, tc_fc), tc_ag); + trt = new thread_runtime_timing(3, measurement_space, current_node); + bt = new barrier_timing(3, measurement_space, current_node); + pvc = new pcm_value_collector({"scan_a", "scan_b", "aggr_j"}, measurement_space, current_node); reset_barriers(); }; @@ -148,16 +146,15 @@ public: sync_barrier.reset(); } - sync_barrier = std::make_unique*>>(thread_group); + sync_barrier = std::make_unique*>>(thread_count); uint32_t thread_count_sum = thread_count_ag + thread_count_fi + thread_count_fc; - uint32_t barrier_count = barrier_mode.compare("global") == 0 ? 1 : thread_group; + uint32_t barrier_count = barrier_mode.compare("global") == 0 ? 1 : thread_count; uint32_t barrier_thread_count; if constexpr(simple){ - barrier_thread_count = (thread_group / barrier_count) * - (mode == NewPMode::Prefetch ? 
thread_count_sum : (thread_count_ag + thread_count_fi)); + barrier_thread_count = (thread_count / barrier_count) * (mode == NewPMode::Prefetch ? thread_count_sum : (thread_count_ag + thread_count_fi)); } else { - barrier_thread_count = (thread_group / barrier_count) * thread_count_sum; + barrier_thread_count = (thread_count / barrier_count) * thread_count_sum; } for(uint32_t i = 0; i < barrier_count; ++i) { (*sync_barrier)[i] = new std::barrier(barrier_thread_count); @@ -180,10 +177,8 @@ public: }; ~Query_Wrapper() { - numa_free((void*)ready_flag_a, - chunk_cnt * thread_count_fi / 8 + ((chunk_cnt * thread_count_fi % 8) != 0)); - numa_free((void*)ready_flag_b, - chunk_cnt * thread_count_fc / 8 + ((chunk_cnt * thread_count_fc % 8) != 0)); + numa_free((void*)ready_flag_a, chunk_cnt * thread_count_fi / 8 + ((chunk_cnt * thread_count_fi % 8) != 0)); + numa_free((void*)ready_flag_b, chunk_cnt * thread_count_fc / 8 + ((chunk_cnt * thread_count_fc % 8) != 0)); numa_free(mask_a, size_b / sizeof(base_t)); numa_free(mask_b, size_b / sizeof(base_t)); @@ -202,14 +197,12 @@ public: }; private: - static inline base_t* get_sub_chunk_ptr(base_t* base_ptr, size_t chunk_id, size_t chunk_size_w, size_t tid, - size_t tcnt) { + static inline base_t* get_sub_chunk_ptr(base_t* base_ptr, size_t chunk_id, size_t chunk_size_w, size_t tid, size_t tcnt) { base_t* chunk_ptr = base_ptr + chunk_id * chunk_size_w; return chunk_ptr + tid * (chunk_size_w / tcnt); } - static inline uint16_t* get_sub_mask_ptr(uint16_t* base_ptr, size_t chunk_id, size_t chunk_size_w, size_t tid, - size_t tcnt) { + static inline uint16_t* get_sub_mask_ptr(uint16_t* base_ptr, size_t chunk_id, size_t chunk_size_w, size_t tid, size_t tcnt) { // 16 integer are addressed with one uint16_t in mask buffer size_t offset = chunk_id * chunk_size_w + tid * (chunk_size_w / tcnt); return base_ptr + (offset / 16); @@ -258,6 +251,7 @@ public: // the lower gids run once more if the chunks are not evenly distributable uint32_t runs = chunk_cnt / gcnt + (chunk_cnt % gcnt > gid); uint32_t barrier_idx = barrier_mode.compare("global") == 0 ? 
0 : gid; + for(uint32_t i = 0; i < runs; ++i) { trt->start_timer(1, tid * gcnt + gid); pvc->start("scan_b", tid * gcnt + gid); @@ -268,28 +262,45 @@ public: uint16_t* mask_ptr = get_sub_mask_ptr(mask_b, chunk_id, chunk_size_w, tid, tcnt); if constexpr(simple){ - cache_.Access(chunk_ptr, chunk_size_b / tcnt); + cache_.Access(reinterpret_cast(chunk_ptr), chunk_size_b / tcnt); } else { - const auto data = cache_.Access(chunk_ptr, chunk_size_b / tcnt); + const auto data = cache_.Access(reinterpret_cast(chunk_ptr), chunk_size_b / tcnt); - // wait on copy to complete - during this time other threads may - // continue with their calculation which leads to little impact - // and we will be faster if the cache is used + if constexpr(wait_b) { + // wait on copy to complete - during this time other threads may + // continue with their calculation which leads to little impact + // and we will be faster if the cache is used - data->WaitOnCompletion(); + data->WaitOnCompletion(); - // obtain the data location from the cache entry + // obtain the data location from the cache entry - base_t* data_ptr = data->GetDataLocation(); + base_t* data_ptr = reinterpret_cast(data->GetDataLocation()); - // nullptr is still a legal return value for CacheData::GetLocation() - // even after waiting, so this must be checked + // nullptr is still a legal return value for CacheData::GetLocation() + // even after waiting, so this must be checked - if (data_ptr == nullptr) { - data_ptr = chunk_ptr; + if (data_ptr == nullptr) { + std::cerr << "[!] Cache Miss in ScanB" << std::endl; + data_ptr = chunk_ptr; + } + + filterNoCopy::apply_same(mask_ptr, nullptr, data_ptr, cmp_b, chunk_size_b / tcnt); } + else { + // obtain the data location from the cache entry + + base_t* data_ptr = reinterpret_cast(data->GetDataLocation()); + + // nullptr is still a legal return value for CacheData::GetLocation() + // even after waiting, so this must be checked - filterNoCopy::apply_same(mask_ptr, nullptr, data_ptr, cmp_b, chunk_size_b / tcnt); + if (data_ptr == nullptr) { + data_ptr = chunk_ptr; + } + + filterNoCopy::apply_same(mask_ptr, nullptr, data_ptr, cmp_b, chunk_size_b / tcnt); + } } pvc->stop("scan_b", tid * gcnt + gid); @@ -321,7 +332,21 @@ public: base_t* chunk_ptr = get_sub_chunk_ptr(data_a, chunk_id, chunk_size_w, tid, tcnt); uint16_t* mask_ptr = get_sub_mask_ptr (mask_a, chunk_id, chunk_size_w, tid, tcnt); - filter::apply_same(mask_ptr, nullptr, chunk_ptr, cmp_a, chunk_size_b / tcnt); + if constexpr (cache_a) { + const auto data = cache_.Access(reinterpret_cast(chunk_ptr), chunk_size_b / tcnt); + data->WaitOnCompletion(); + base_t* data_ptr = reinterpret_cast(data->GetDataLocation()); + + if (data_ptr == nullptr) { + std::cerr << "[!] Cache Miss in ScanA" << std::endl; + data_ptr = chunk_ptr; + } + + filter::apply_same(mask_ptr, nullptr, data_ptr, cmp_a, chunk_size_b / tcnt); + } + else { + filter::apply_same(mask_ptr, nullptr, chunk_ptr, cmp_a, chunk_size_b / tcnt); + } pvc->stop("scan_a", tid * gcnt + gid); trt->stop_timer(0, tid * gcnt + gid); @@ -340,19 +365,19 @@ public: // the lower gids run once more if the chunks are not evenly distributable uint32_t runs = chunk_cnt / gcnt + (chunk_cnt % gcnt > gid); uint32_t barrier_idx = barrier_mode.compare("global") == 0 ? 
0 : gid; + for(uint32_t i = 0; i < runs; ++i) { - bt->timed_wait(*(*sync_barrier)[barrier_idx], 2, tid * gcnt + gid); trt->start_timer(2, tid * gcnt + gid); pvc->start("aggr_j", tid * gcnt + gid); // calculate pointers size_t chunk_id = gid + gcnt * i; - const base_t* chunk_ptr = get_sub_chunk_ptr(data_b, chunk_id, chunk_size_w, tid, tcnt); + base_t* chunk_ptr = get_sub_chunk_ptr(data_b, chunk_id, chunk_size_w, tid, tcnt); // access the cache for the given chunk which will have been accessed in scan_b - const auto data = cache_.Access(chunk_ptr, chunk_size_b / tcnt); + const auto data = cache_.Access(reinterpret_cast(chunk_ptr), chunk_size_b / tcnt); // wait on the caching task to complete, this will give time for other processes // to make progress here which will therefore not hurt performance @@ -362,14 +387,14 @@ public: // after the copy task has finished we obtain the pointer to the cached // copy of data_b which is then used from now on - const base_t* data_ptr = data->GetDataLocation(); + base_t* data_ptr = reinterpret_cast(data->GetDataLocation()); // nullptr is still a legal return value for CacheData::GetLocation() // even after waiting, so this must be checked if (data_ptr == nullptr) { data_ptr = chunk_ptr; - std::cerr << "Cache Miss" << std::endl; + std::cerr << "[!] Cache Miss in AggrJ" << std::endl; } uint16_t* mask_ptr_a = get_sub_mask_ptr (mask_a, chunk_id, chunk_size_w, tid, tcnt); diff --git a/qdp_project/src/benchmark/pipelines/scan_filter_pipe.h b/qdp_project/src/benchmark/pipelines/scan_filter_pipe.h deleted file mode 100644 index 2b10b06..0000000 --- a/qdp_project/src/benchmark/pipelines/scan_filter_pipe.h +++ /dev/null @@ -1,387 +0,0 @@ - -#include -#include -#include -#include - -#include - -#include "filter.h" -#include "aggregation.h" -#include "vector_loader.h" -#include "timer_utils.h" -#include "barrier_utils.h" -#include "execution_modes.h" - - -template -class Query_Wrapper { -public: - // sync - std::shared_future* ready_future; - - thread_runtime_timing* trt; - barrier_timing* bt; - -private: - // numa - uint32_t close_mem; - uint32_t far_mem; - - // data - size_t size_b; - size_t chunk_size_b; - size_t chunk_size_w; - size_t chunk_cnt; - base_t* data_a; - base_t* data_b; - base_t* dest; - - // ratios - uint32_t thread_count_fc; - uint32_t thread_count_fi; - uint32_t thread_count_ag; - uint32_t thread_group; - - // done bits - volatile uint8_t* ready_flag_a; - volatile uint8_t* ready_flag_b; - std::mutex ready_a_m; - std::mutex ready_b_m; - - // buffer - uint16_t* mask_a; - uint16_t* mask_b; - base_t** buffer_b; - - // params - base_t cmp_a; - base_t cmp_b; - bool no_copy; - PMode mode; - - // sync - std::unique_ptr*>> sync_barrier; - std::string barrier_mode = BARRIER_MODE; - - using filterCopy = Filter; - using filterNoCopy = Filter; - using filter = Filter; - using aggregation = Aggregation; - -public: - - - Query_Wrapper(std::shared_future* rdy_fut, size_t workload_b, size_t chunk_size_b, base_t* data_a, - base_t* data_b, base_t* dest, uint32_t numa_close, uint32_t numa_far, uint32_t tc_fi, uint32_t tc_fc, uint32_t tc_ag, - PMode mode, uint32_t thread_group, base_t cmp_a = 50, base_t cmp_b = 42, bool no_copy = false) : - ready_future(rdy_fut), size_b(workload_b), chunk_size_b(chunk_size_b), data_a(data_a), data_b(data_b), - dest(dest), close_mem(numa_close), far_mem(numa_far), mode(mode), thread_group(thread_group), cmp_a(cmp_a), cmp_b(cmp_b), no_copy(no_copy){ - - chunk_size_w = chunk_size_b / sizeof(base_t); - chunk_cnt = size_b / chunk_size_b; - 
thread_count_fi = tc_fi; - thread_count_fc = tc_fc; - thread_count_ag = tc_ag; - - ready_flag_a = (volatile uint8_t *) numa_alloc_onnode( - chunk_cnt * thread_count_fi / 8 + ((chunk_cnt * thread_count_fi % 8) != 0), close_mem); - ready_flag_b = (volatile uint8_t *) numa_alloc_onnode( - chunk_cnt * thread_count_fc / 8 + ((chunk_cnt * thread_count_fc % 8) != 0), close_mem); - - mask_a = (uint16_t *) numa_alloc_onnode(size_b / sizeof(base_t), close_mem); - mask_b = (uint16_t *) numa_alloc_onnode(size_b / sizeof(base_t), close_mem); - - trt = new thread_runtime_timing(4, 20, close_mem); - bt = new barrier_timing(4, 20, close_mem); - reset_barriers(); - - if constexpr(BUFFER_LIMIT==1) { - // TODO size ok like that? - buffer_b = (base_t**) numa_alloc_onnode(size_b * sizeof(base_t*), close_mem); - buffer_b[0] = (base_t*) numa_alloc_onnode(thread_group * chunk_size_b, close_mem); - buffer_b[1] = (base_t*) numa_alloc_onnode(thread_group * chunk_size_b, close_mem); - } else { - buffer_b = (base_t **) numa_alloc_onnode(sizeof(base_t*), close_mem); - base_t* buffer_tmp = (base_t *) numa_alloc_onnode(size_b, close_mem); - *buffer_b = buffer_tmp; - } - }; - - void reset_barriers(){ - if(sync_barrier != nullptr) { - for(auto& barrier : *sync_barrier) { - delete barrier; - } - sync_barrier.reset(); - } - - sync_barrier = std::make_unique*>>(thread_group); - uint32_t thread_count_sum = thread_count_ag + thread_count_fi + thread_count_fc; - uint32_t barrier_count = barrier_mode.compare("global") == 0 ? 1 : thread_group; - uint32_t barrier_thread_count; - - if constexpr(simple){ - barrier_thread_count = (thread_group / barrier_count) * - (mode == PMode::expl_copy ? thread_count_sum : (thread_count_ag + thread_count_fi)); - } else { - barrier_thread_count = (thread_group / barrier_count) * thread_count_sum; - } - for(uint32_t i = 0; i < barrier_count; ++i) { - (*sync_barrier)[i] = new std::barrier(barrier_thread_count); - } - } - - - void clear_buffers () { - std::memset((void*)ready_flag_a, 0x00, chunk_cnt * thread_count_fi / 8 + ((chunk_cnt * thread_count_fi % 8) != 0)); - std::memset((void*)ready_flag_b, 0x00, chunk_cnt * thread_count_fc / 8 + ((chunk_cnt * thread_count_fc % 8) != 0)); - - std::memset(mask_a, 0x00, size_b / sizeof(base_t)); - std::memset(mask_b, 0x00, size_b / sizeof(base_t)); - if constexpr(BUFFER_LIMIT==1) { - std::memset(buffer_b[0], 0x00, thread_group * chunk_size_b); - std::memset(buffer_b[1], 0x00, thread_group * chunk_size_b); - } else { - std::memset(*buffer_b, 0x00, size_b); - } - - trt->reset_accumulator(); - bt->reset_accumulator(); - reset_barriers(); - }; - - ~Query_Wrapper() { - numa_free((void*)ready_flag_a, - chunk_cnt * thread_count_fi / 8 + ((chunk_cnt * thread_count_fi % 8) != 0)); - numa_free((void*)ready_flag_b, - chunk_cnt * thread_count_fc / 8 + ((chunk_cnt * thread_count_fc % 8) != 0)); - - numa_free(mask_a, size_b / sizeof(base_t)); - numa_free(mask_b, size_b / sizeof(base_t)); - if constexpr(BUFFER_LIMIT==1) { - numa_free(buffer_b[0], thread_group * chunk_size_b); - numa_free(buffer_b[1], thread_group * chunk_size_b); - numa_free(buffer_b, size_b * sizeof(base_t*)); - } else { - numa_free(*buffer_b, size_b); - } - - delete trt; - for(auto& barrier : *sync_barrier) { - delete barrier; - } - delete bt; - - }; - -private: - static inline base_t* get_sub_chunk_ptr(base_t* base_ptr, size_t chunk_id, size_t chunk_size_w, size_t tid, - size_t tcnt) { - base_t* chunk_ptr = base_ptr + chunk_id * chunk_size_w; - return chunk_ptr + tid * (chunk_size_w / tcnt); - } - - static 
inline uint16_t* get_sub_mask_ptr(uint16_t* base_ptr, size_t chunk_id, size_t chunk_size_w, size_t tid, - size_t tcnt) { - // 16 integer are addressed with one uint16_t in mask buffer - size_t offset = chunk_id * chunk_size_w + tid * (chunk_size_w / tcnt); - return base_ptr + (offset / 16); - } - - static bool bit_at(volatile uint8_t* bitmap, uint32_t bitpos) { - uint8_t value = bitmap[bitpos / 8]; - switch(bitpos % 8) { - case 0: return value & 0b00000001; - case 1: return value & 0b00000010; - case 2: return value & 0b00000100; - case 3: return value & 0b00001000; - case 4: return value & 0b00010000; - case 5: return value & 0b00100000; - case 6: return value & 0b01000000; - case 7: return value & 0b10000000; - default: return false; - } - } - - static void set_bit_at(volatile uint8_t* bitmap, std::mutex& mutex, uint32_t bitpos) { - mutex.lock(); - switch(bitpos % 8) { - case 0: bitmap[bitpos / 8] |= 0b00000001;break; - case 1: bitmap[bitpos / 8] |= 0b00000010;break; - case 2: bitmap[bitpos / 8] |= 0b00000100;break; - case 3: bitmap[bitpos / 8] |= 0b00001000;break; - case 4: bitmap[bitpos / 8] |= 0b00010000;break; - case 5: bitmap[bitpos / 8] |= 0b00100000;break; - case 6: bitmap[bitpos / 8] |= 0b01000000;break; - case 7: bitmap[bitpos / 8] |= 0b10000000;break; - } - mutex.unlock(); - } - -public: - - static base_t checksum(base_t* a, base_t* b, base_t cmp_a, base_t cmp_b, size_t size_b) { - base_t sum = 0; - for(int i = 0; i < size_b / sizeof(base_t); ++i) { - if(a[i] >= cmp_a && b[i] <= cmp_b) { - sum += b[i]; - } - } - return sum; - } - - static void checkmask(uint16_t* mask, base_t cmp, base_t* data, size_t size_b, bool leq) { - uint32_t cnt = 0; - for(int i = 0; i < size_b / sizeof(base_t); ++i) { - if(leq) { - if(((data[i] <= cmp) != bit_at((uint8_t*)mask, i))) { - ++cnt; - } - } else { - if(((data[i] >= cmp) != bit_at((uint8_t*)mask, i))) { - ++cnt; - } - } - } - } - - static void checkmask_16(uint16_t* mask, base_t cmp, base_t* data, size_t size_b, bool leq) { - for(int i = 0; i < size_b / sizeof(base_t) / 16 ; ++i) { - std::bitset<16> m(mask[i]); - uint16_t ch = 0; - for(int j = 0; j < 16; ++j) { - if(data[i*16 + j] <= cmp) { - ch |= 0x1 << j; - } - } - std::bitset<16> c(ch); - - std::cout << "act " << m << std::endl; - std::cout << "rea " << c << std::endl << std::endl; - } - } - - - void scan_b(size_t gid, size_t gcnt, size_t tid) { - size_t tcnt = thread_count_fc; - assert(chunk_size_w % tcnt == 0); - assert(chunk_size_w % 16 == 0); - assert(chunk_size_w % tcnt * 16 == 0); - - // wait till everyone can start - ready_future->wait(); - - // the lower gids run once more if the chunks are not evenly distributable - uint32_t runs = chunk_cnt / gcnt + (chunk_cnt % gcnt > gid); - uint32_t barrier_idx = barrier_mode.compare("global") == 0 ? 
0 : gid; - for(uint32_t i = 0; i < runs; ++i) { - trt->start_timer(1, tid * gcnt + gid); - - // calculate pointers - size_t chunk_id = gid + gcnt * i; - base_t* chunk_ptr = get_sub_chunk_ptr(data_b , chunk_id, chunk_size_w, tid, tcnt); - uint16_t* mask_ptr = get_sub_mask_ptr (mask_b , chunk_id, chunk_size_w, tid, tcnt); - - if constexpr(simple){ - base_t* buffer_ptr; - if constexpr(BUFFER_LIMIT==1) { - buffer_ptr = get_sub_chunk_ptr(buffer_b[i % 2], gid, chunk_size_w, tid, tcnt); - } else { - buffer_ptr = get_sub_chunk_ptr(*buffer_b, chunk_id, chunk_size_w, tid, tcnt); - } - std::memcpy(buffer_ptr, chunk_ptr, chunk_size_b / tcnt); - } else { - if(no_copy) { - filterNoCopy::apply_same(mask_ptr, nullptr, chunk_ptr, cmp_b, chunk_size_b / tcnt); - } else { - base_t* buffer_ptr; - if constexpr(BUFFER_LIMIT==1) { - buffer_ptr = get_sub_chunk_ptr(buffer_b[i % 2], gid, chunk_size_w, tid, tcnt); - } else { - buffer_ptr = get_sub_chunk_ptr(*buffer_b, chunk_id, chunk_size_w, tid, tcnt); - } - filterCopy::apply_same(mask_ptr, buffer_ptr, chunk_ptr, cmp_b, chunk_size_b / tcnt); - } - } - - trt->stop_timer(1, tid * gcnt + gid); - bt->timed_wait(*(*sync_barrier)[barrier_idx], 1, tid * gcnt + gid); - - } - (*(*sync_barrier)[barrier_idx]).arrive_and_drop(); - - } - - void scan_a(size_t gid, size_t gcnt, size_t tid) { - size_t tcnt = thread_count_fi; - assert(chunk_size_w % tcnt == 0); - assert(chunk_size_w % 16 == 0); - assert(chunk_size_w % tcnt * 16 == 0); - - // wait till everyone can start - ready_future->wait(); - - // the lower gids run once more if the chunks are not evenly distributable - uint32_t runs = chunk_cnt / gcnt + (chunk_cnt % gcnt > gid); - uint32_t barrier_idx = barrier_mode.compare("global") == 0 ? 0 : gid; - for(uint32_t i = 0; i < runs; ++i) { - trt->start_timer(0, tid * gcnt + gid); - // calculate pointers - size_t chunk_id = gid + gcnt * i; - base_t* chunk_ptr = get_sub_chunk_ptr(data_a, chunk_id, chunk_size_w, tid, tcnt); - uint16_t* mask_ptr = get_sub_mask_ptr (mask_a, chunk_id, chunk_size_w, tid, tcnt); - - filter::apply_same(mask_ptr, nullptr, chunk_ptr, cmp_a, chunk_size_b / tcnt); - - trt->stop_timer(0, tid * gcnt + gid); - bt->timed_wait(*(*sync_barrier)[barrier_idx], 0, tid * gcnt + gid); - } - (*(*sync_barrier)[barrier_idx]).arrive_and_drop(); - } - - void aggr_j(size_t gid, size_t gcnt, size_t tid) { - size_t tcnt = thread_count_ag; - // wait till everyone can start - ready_future->wait(); - - // calculate values - __m512i aggregator = aggregation::OP::zero(); - // the lower gids run once more if the chunks are not evenly distributable - uint32_t runs = chunk_cnt / gcnt + (chunk_cnt % gcnt > gid); - uint32_t barrier_idx = barrier_mode.compare("global") == 0 ? 
0 : gid; - for(uint32_t i = 0; i < runs; ++i) { - - bt->timed_wait(*(*sync_barrier)[barrier_idx], 2, tid * gcnt + gid); - trt->start_timer(2, tid * gcnt + gid); - - // calculate pointers - size_t chunk_id = gid + gcnt * i; - base_t* chunk_ptr; - if(no_copy) { - chunk_ptr = get_sub_chunk_ptr(data_b, chunk_id, chunk_size_w, tid, tcnt); - } else { - if constexpr(BUFFER_LIMIT==1) { - chunk_ptr = get_sub_chunk_ptr(buffer_b[i%2], gid, chunk_size_w, tid, tcnt); - } else { - chunk_ptr = get_sub_chunk_ptr(*buffer_b, chunk_id, chunk_size_w, tid, tcnt); - } - } - uint16_t* mask_ptr_a = get_sub_mask_ptr (mask_a, chunk_id, chunk_size_w, tid, tcnt); - uint16_t* mask_ptr_b = get_sub_mask_ptr (mask_b, chunk_id, chunk_size_w, tid, tcnt); - - base_t tmp = _mm512_reduce_add_epi64(aggregator); - if constexpr(simple){ - aggregator = aggregation::apply_masked(aggregator, chunk_ptr, mask_ptr_a, chunk_size_b / tcnt); - } else { - aggregator = aggregation::apply_masked(aggregator, chunk_ptr, mask_ptr_a, mask_ptr_b, chunk_size_b / tcnt); - } - trt->stop_timer(2, tid * gcnt + gid); - } - - // so threads with more runs dont wait for finished threads - (*(*sync_barrier)[barrier_idx]).arrive_and_drop(); - - aggregation::happly(dest + (tid * gcnt + gid), aggregator); - } -}; \ No newline at end of file diff --git a/qdp_project/src/utils/execution_modes.h b/qdp_project/src/utils/execution_modes.h index ca04b4f..b494fab 100644 --- a/qdp_project/src/utils/execution_modes.h +++ b/qdp_project/src/utils/execution_modes.h @@ -55,17 +55,24 @@ struct new_mode_manager { };*/ constexpr static int thread_counts[2][4][3] = { + // thread counts for both simple and complex querry + // inner layout: { scan_a, scan_b, aggr_j } + //simple query - //scan_a, scan_b, aggr_j - {{4, 0, 2}, // DRAM_base - {4, 0, 2}, // HBM_base - {4, 0, 2}, // Mixed_base - {1, 4, 1}},// Prefetching + { + {4, 0, 2}, // DRAM_base + {4, 0, 2}, // HBM_base + {4, 0, 2}, // Mixed_base + {4, 4, 4} // Prefetching + }, + //complex query - {{1, 4, 1}, // DRAM_base - {1, 4, 1}, // HBM_base - {1, 4, 1}, // Mixed_base - {1, 4, 1}},// Prefetching + { + {1, 4, 1}, // DRAM_base + {1, 4, 1}, // HBM_base + {1, 4, 1}, // Mixed_base + {4, 4, 4} // Prefetching + } }; static inline NewPMode inc(NewPMode value) { @@ -81,9 +88,17 @@ struct new_mode_manager { }; static std::string string(NewPMode value) { switch(value) { - case DRAM_base: return "DRAM_Baseline"; - case HBM_base: return "HBM_Baseline"; - case Mixed_base: return "DRAM_HBM_Baseline"; - } return "Q-d_Prefetching"; + case DRAM_base: + return "DRAM_Baseline"; + case HBM_base: + return "HBM_Baseline"; + case Mixed_base: + return "DRAM_HBM_Baseline"; + case Prefetch: + return "Q-d_Prefetching"; + default: + std::cerr << "[x] Unknown Processing Mode" << std::endl; + exit(-1); + } }; }; \ No newline at end of file From 0ad0e4af042dcc9bb30da22f26c486f68317f319 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Constantin=20F=C3=BCrst?= Date: Wed, 17 Jan 2024 13:48:09 +0100 Subject: [PATCH 28/29] remove the manual build script and add numa and cpu asignment to the execution script --- qdp_project/bench_max.sh | 5 ++--- qdp_project/cmake_max.sh | 9 --------- 2 files changed, 2 insertions(+), 12 deletions(-) delete mode 100644 qdp_project/cmake_max.sh diff --git a/qdp_project/bench_max.sh b/qdp_project/bench_max.sh index b7e0168..e49275b 100644 --- a/qdp_project/bench_max.sh +++ b/qdp_project/bench_max.sh @@ -1,10 +1,9 @@ -#!bin/bash +#!/bin/bash current_date_time=$(date) echo "Benchmark start at: $current_date_time" - -../bin/MAXBench +sudo 
numactl --cpunodebind=2 -- taskset -c 0,1,2,3,4,5 ../bin/MAXBench current_date_time=$(date) echo "Benchmark end at: $current_date_time" \ No newline at end of file diff --git a/qdp_project/cmake_max.sh b/qdp_project/cmake_max.sh deleted file mode 100644 index 03c137b..0000000 --- a/qdp_project/cmake_max.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!bin/bash - -cmake -DCMAKE_BUILD_TYPE=Release -DWSUPPRESS=suppress -DBARRIER_MODE=global -DBUFFER_LIMIT=unlimited -DQUERY=complex -DTHREAD_FACTOR=2 -DPINNING=cpu -DPCM_M=false .. -cmake --build . --target MAXBench -mv ../bin/MAXBench ../bin/MAXBench_gcc - -cmake -DCMAKE_BUILD_TYPE=Release -DWSUPPRESS=suppress -DBARRIER_MODE=global -DBUFFER_LIMIT=unlimited -DQUERY=complex -DTHREAD_FACTOR=2 -DPINNING=numa -DPCM_M=false .. -cmake --build . --target MAXBench -mv ../bin/MAXBench ../bin/MAXBench_gcn From 1a3cb6dada1c9d64461d1a3141cc2b64348dc59a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Constantin=20F=C3=BCrst?= Date: Wed, 17 Jan 2024 13:49:21 +0100 Subject: [PATCH 29/29] prettify credit in the readme --- qdp_project/README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/qdp_project/README.md b/qdp_project/README.md index afad56b..7b774b4 100644 --- a/qdp_project/README.md +++ b/qdp_project/README.md @@ -1,3 +1,5 @@ This is a copy of the Query Driven Prefetching Repository + https://os.inf.tu-dresden.de/repo/gitbox/andre.berthold/Query-driven_Prefetching/src/branch/qdp_minimal/code -Original Authors: André Berthold and Anna Bartuschka + +Original Authors: André Berthold and Anna Bartuschka \ No newline at end of file
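
Note on the pinning introduced for bench_max.sh in PATCH 28/29 above: combining numactl --cpunodebind with taskset fixes both the NUMA node and the exact CPU list the benchmark runs on. The following is only a minimal verification sketch, not part of the patch series; it assumes numactl and the util-linux taskset are installed, and the node/CPU ids are simply the ones the script above happens to use, which will differ on other machines.

#!/bin/bash
# print the NUMA topology so the --cpunodebind and -c values used in
# bench_max.sh can be checked against the machine's node-to-CPU mapping
numactl --hardware
# show the node and CPU binding the current shell would pass on to children
numactl --show
# show the CPU affinity of the current shell (PID $$) as a CPU list
taskset -cp $$

Checking the reported node-to-CPU mapping this way makes it easy to confirm that the CPU list handed to taskset actually belongs to the node given to --cpunodebind before starting a long benchmark run.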