From b14ca88e034d35834ae800f00fdf9774306b6c30 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Constantin=20F=C3=BCrst?=
Date: Sun, 26 Nov 2023 12:45:29 +0100
Subject: [PATCH] start implementation of benchmarks code, begin with state
 from test project, execute-move.hpp contains numa-aware task submit routine
 which is WIP

---
 benchmarks/CMakeLists.txt   |  15 ++++
 benchmarks/benchmarks.md    |   7 ++
 benchmarks/error.hpp        |  27 +++++++
 benchmarks/execute-move.hpp | 136 +++++++++++++++++++++++++++++++++++++
 benchmarks/main.cpp         |  43 ++++++++++++
 5 files changed, 228 insertions(+)
 create mode 100755 benchmarks/CMakeLists.txt
 create mode 100644 benchmarks/benchmarks.md
 create mode 100644 benchmarks/error.hpp
 create mode 100644 benchmarks/execute-move.hpp
 create mode 100644 benchmarks/main.cpp

diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
new file mode 100755
index 0000000..b2019e8
--- /dev/null
+++ b/benchmarks/CMakeLists.txt
@@ -0,0 +1,15 @@
+cmake_minimum_required(VERSION 3.18)
+
+project(dml-benchmark)
+
+set(CMAKE_CXX_STANDARD 20)
+
+include_directories("../../DML/include/")
+
+set(SOURCES main.cpp)
+
+add_executable(dml-benchmark ${SOURCES})
+
+target_link_libraries(dml-benchmark libdml.a numa pthread ${CMAKE_DL_LIBS})
+
+install(TARGETS dml-benchmark DESTINATION ${CMAKE_INSTALL_PREFIX})
\ No newline at end of file
diff --git a/benchmarks/benchmarks.md b/benchmarks/benchmarks.md
new file mode 100644
index 0000000..46a2cbc
--- /dev/null
+++ b/benchmarks/benchmarks.md
@@ -0,0 +1,7 @@
+- 1 to n engines per group
+- 1 to n threads running on one specific core / dsa engine
+- copy inside and across NUMA borders
+- cross-copy: 2 engines copying from their numa domain to the domain of the other
+- all with "packet sizes" of 1KiB, 2KiB, 4KiB, 8KiB, ..., 1GiB
+- all with both CPU and DSA for comparison
+- batch vs single submissions
diff --git a/benchmarks/error.hpp b/benchmarks/error.hpp
new file mode 100644
index 0000000..4c083e3
--- /dev/null
+++ b/benchmarks/error.hpp
@@ -0,0 +1,27 @@
+#include <ostream>
+#include <dml/dml.hpp>
+
+// Writes a bracketed, human-readable label for the given status code to strm.
+inline std::ostream& operator<<(std::ostream& strm, const dml::status_code code) {
+    switch(code) {
+        case dml::status_code::ok: strm << "[ok]"; break;
+        case dml::status_code::false_predicate: strm << "[false predicate]"; break;
+        case dml::status_code::partial_completion: strm << "[partial completion]"; break;
+        case dml::status_code::nullptr_error: strm << "[nullptr error]"; break;
+        case dml::status_code::bad_size: strm << "[bad size]"; break;
+        case dml::status_code::bad_length: strm << "[bad length]"; break;
+        case dml::status_code::inconsistent_size: strm << "[inconsistent size]"; break;
+        case dml::status_code::dualcast_bad_padding: strm << "[dualcast bad padding]"; break;
+        case dml::status_code::bad_alignment: strm << "[bad alignment]"; break;
+        case dml::status_code::buffers_overlapping: strm << "[buffers overlapping]"; break;
+        case dml::status_code::delta_delta_empty: strm << "[delta delta empty]"; break;
+        case dml::status_code::batch_overflow: strm << "[batch overflow]"; break;
+        case dml::status_code::execution_failed: strm << "[execution failed]"; break;
+        case dml::status_code::unsupported_operation: strm << "[unsupported operation]"; break;
+        case dml::status_code::queue_busy: strm << "[queue busy]"; break;
+        case dml::status_code::error: strm << "[unknown error]"; break;
+        case dml::status_code::config_error: strm << "[config error]"; break;
+        default: strm << "[unhandled error]"; break;
+    }
+    return strm;
+}
\ No newline at end of file
diff --git a/benchmarks/execute-move.hpp b/benchmarks/execute-move.hpp
new file mode 100644
index 0000000..c855496
--- /dev/null
+++ b/benchmarks/execute-move.hpp
@@ -0,0 +1,136 @@
+#pragma once
+
+#include <chrono>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <iostream>
+#include <vector>
+
+#include <numa.h>
+#include <pthread.h>
+#include <semaphore.h>
+
+#include <dml/dml.hpp>
+
+// Describes one benchmark task: where the worker thread runs, which memory
+// region it moves, and where the worker stores its outcome.
+struct ThreadArgs {
+    // thread placement / engine selection
+    uint8_t numa_node;
+    uint8_t core;
+    // region size and source+destination for move
+    size_t size;
+    uint8_t nnode_src;
+    uint8_t nnode_dst;
+    // repetition
+    uint8_t count; // TODO: unused
+    bool batched; // TODO: unused
+    // thread output
+    dml::status_code status;
+    std::chrono::microseconds duration;
+    // set by execution
+    sem_t* sig;
+};
+
+// Worker entry point for pthread_create; argp must point to a ThreadArgs
+// that stays alive until the thread has been joined.
+// Pins the thread, allocates source and destination on the requested nodes,
+// waits on args->sig, runs one timed dml::mem_move and stores status and
+// duration in *args. Always returns nullptr.
+template <typename path>
+void* thread_function(void* argp) {
+    ThreadArgs* args = static_cast<ThreadArgs*>(argp);
+
+    // report a failure unless the move below overwrites these
+    args->status = dml::status_code::error;
+    args->duration = std::chrono::microseconds::zero();
+
+    // set numa node and core affinity of the current thread
+    numa_run_on_node(args->numa_node);
+    cpu_set_t cpuset;
+    CPU_ZERO(&cpuset);
+    CPU_SET(args->core, &cpuset);
+    if (pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset) != 0) {
+        // uint8_t streams as a character, so widen to int for readable output
+        std::cerr << "Error setting affinity for thread designated to core " << static_cast<int>(args->core) << " on node " << static_cast<int>(args->numa_node) << std::endl;
+        return nullptr;
+    }
+
+    // allocate memory for the move operation on the requested numa nodes
+    void* src = numa_alloc_onnode(args->size, args->nnode_src);
+    void* dst = numa_alloc_onnode(args->size, args->nnode_dst);
+    if (src == nullptr || dst == nullptr) {
+        std::cerr << "Error allocating memory on node " << static_cast<int>(args->nnode_src) << " or node " << static_cast<int>(args->nnode_dst) << std::endl;
+        if (src != nullptr) numa_free(src, args->size);
+        if (dst != nullptr) numa_free(dst, args->size);
+        return nullptr;
+    }
+
+    dml::data_view srcv = dml::make_view(static_cast<uint8_t*>(src), args->size);
+    dml::data_view dstv = dml::make_view(static_cast<uint8_t*>(dst), args->size);
+
+    // wait for specified signal so that all operations start at the same time
+    sem_wait(args->sig);
+
+    const auto st = std::chrono::high_resolution_clock::now();
+
+    // we use the asynchronous submit-routine even though this is not required
+    // here, however the project later on will only use async operation
+    auto handler = dml::submit<path>(dml::mem_move, srcv, dstv, args->numa_node);
+    auto result = handler.get();
+
+    const auto et = std::chrono::high_resolution_clock::now();
+
+    // free the allocated memory regions on the selected nodes
+    numa_free(src, args->size);
+    numa_free(dst, args->size);
+
+    args->duration = std::chrono::duration_cast<std::chrono::microseconds>(et - st);
+    args->status = result.status;
+
+    return nullptr;
+}
+
+// Runs one worker thread per entry in args and blocks until all have finished.
+// Exits the process with code 1 if libnuma is unusable or a thread cannot be
+// created.
+// NOTE(review): args is taken by value, so the status and duration written by
+// the workers are not visible to the caller.
+template <typename path>
+void execute_mem_move(std::vector<ThreadArgs> args) {
+    sem_t sem;
+    std::vector<pthread_t> threads;
+    threads.reserve(args.size());
+
+    // initialize semaphore and numactl-library; libnuma documents all other
+    // calls as undefined when numa_available() reports -1
+    sem_init(&sem, 0, 0);
+    if (numa_available() < 0) {
+        std::cerr << "NUMA is not available on this system" << std::endl;
+        exit(1);
+    }
+
+    // for each submitted task we link the semaphore and create the thread;
+    // iterate by reference because the thread keeps the pointer until joined
+    for (auto& arg : args) {
+        arg.sig = &sem;
+        threads.emplace_back();
+
+        if (pthread_create(&threads.back(), nullptr, thread_function<path>, &arg) != 0) {
+            std::cerr << "Error creating thread" << std::endl;
+            exit(1);
+        }
+    }
+
+    // a single post releases a single waiter, so post once per thread
+    for (size_t i = 0; i < threads.size(); ++i) {
+        sem_post(&sem);
+    }
+
+    for (pthread_t& t : threads) {
+        pthread_join(t, nullptr);
+    }
+
+    sem_destroy(&sem);
+}
\ No newline at end of file
diff --git a/benchmarks/main.cpp b/benchmarks/main.cpp
new file mode 100644
index 0000000..8ca992f
--- /dev/null
+++ b/benchmarks/main.cpp
@@ -0,0 +1,43 @@
+#include <dml/dml.hpp>
+
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "error.hpp"
+#include "execute-move.hpp"
+
+int main(int argc, char **argv) {
+    if (argc < 2) {
+        std::cout << "Missing the execution path as the first parameter. Use hardware_path, software_path or automatic_path." << std::endl;
+        return 1;
+    }
+
+    const std::string path = argv[1];
+
+    // TODO: fill with the benchmark tasks; an empty list starts no threads
+    const std::vector<ThreadArgs> tasks;
+
+    // execute_mem_move returns void, so report success explicitly
+    if (path == "hardware_path") {
+        std::cout << "Executing using dml::hardware path" << std::endl;
+        execute_mem_move<dml::hardware>(tasks);
+        return 0;
+    }
+    else if (path == "software_path") {
+        std::cout << "Executing using dml::software path" << std::endl;
+        execute_mem_move<dml::software>(tasks);
+        return 0;
+    }
+    // "auto_path" was the only accepted spelling although the usage text
+    // advertises "automatic_path"; accept both
+    else if (path == "auto_path" || path == "automatic_path") {
+        std::cout << "Executing using dml::automatic path" << std::endl;
+        execute_mem_move<dml::automatic>(tasks);
+        return 0;
+    }
+    else {
+        std::cout << "Unrecognized value for parameter. Use hardware_path, software_path or automatic_path." << std::endl;
+        return 1;
+    }
+}
\ No newline at end of file