From 7e8c9acbc31eeada20f89822500c5bcc2beb7564 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Constantin=20F=C3=BCrst?=
Date: Sun, 26 Nov 2023 18:19:00 +0100
Subject: [PATCH] implement batch operation and add control parameters to the
 ThreadArgs struct; also add more timing information: submission and
 completion are now timed separately

---
 benchmarks/execute-move.hpp | 108 ++++++++++++++++++++++++++++++++----
 1 file changed, 96 insertions(+), 12 deletions(-)

diff --git a/benchmarks/execute-move.hpp b/benchmarks/execute-move.hpp
index 67650ec..56df6b0 100644
--- a/benchmarks/execute-move.hpp
+++ b/benchmarks/execute-move.hpp
@@ -10,6 +10,8 @@
 
 #include <dml/dml.hpp>
 
+#include "statuscode-tostring.hpp"
+
 struct ThreadArgs {
     // thread placement / engine selection
     uint8_t numa_node;
@@ -18,24 +20,54 @@ struct ThreadArgs {
     size_t size;
     uint8_t nnode_src;
     uint8_t nnode_dst;
+    // repetition
+    uint32_t rep_count;
+    bool batch_submit;
+    uint32_t batch_size;
+    uint32_t barrier_after_n_operations;
     // thread output
     dml::status_code status;
-    std::chrono::microseconds duration;
+    // average run duration in microseconds
+    double combined_duration;
+    double submit_duration;
+    double complete_duration;
+    // completed iterations
+    uint32_t rep_completed;
     // set by execution
     sem_t* sig;
 };
 
+double avg(const std::vector<uint64_t>& v) {
+    int n = 0;
+    double mean = 0.0;
+
+    for (const auto x : v) {
+        const double delta = static_cast<double>(x) - mean;
+        mean += delta / ++n;
+    }
+
+    return mean;
+}
+
+#define LOG_CODE_INFO "Location: " << __FILE__ << "@" << __LINE__ << "::" << __FUNCTION__ << std::endl
+#define LOG_ERR { pthread_t t = pthread_self(); std::cerr << "--- BEGIN ERROR MSG ---" << std::endl << "Physical: [Node " << args->numa_node << " | Core " << args->core << " | Thread " << t << "]" << std::endl; } std::cerr << LOG_CODE_INFO
+#define CHECK_STATUS(status,msg) { if (status != dml::status_code::ok) { LOG_ERR << "Status Code: " << StatusCodeToString(status) << std::endl << #msg << std::endl; args->status = status; return nullptr; }}
+
 template <typename path>
 void* thread_function(void* argp) {
     ThreadArgs* args = reinterpret_cast<ThreadArgs*>(argp);
 
+    std::vector<uint64_t> submission_durations;
+    std::vector<uint64_t> completion_durations;
+    std::vector<uint64_t> combined_durations;
+
     // set numa node and core affinity of the current thread
     numa_run_on_node(args->numa_node);
 
     cpu_set_t cpuset;
     CPU_ZERO(&cpuset);
     CPU_SET(args->core, &cpuset);
 
     if (pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset) != 0) {
-        std::cerr << "Error setting affinity for thread designated to core " << args->core << " on node " << args->numa_node << std::endl;
+        LOG_ERR << "Error setting affinity for thread" << std::endl;
         return nullptr;
     }
 
@@ -45,31 +77,83 @@ void* thread_function(void* argp) {
     dml::data_view srcv = dml::make_view(reinterpret_cast<uint8_t*>(src), args->size);
     dml::data_view dstv = dml::make_view(reinterpret_cast<uint8_t*>(dst), args->size);
 
+    args->status = dml::status_code::ok;
+    args->rep_completed = 0;
+
     // wait for specified signal so that all operations start at the same time
     sem_wait(args->sig);
-
-    const auto st = std::chrono::high_resolution_clock::now();
-
-    // we use the asynchronous submit-routine even though this is not required
-    // here, however the project later on will only use async operation
-    auto handler = dml::submit<path>(dml::mem_move, srcv, dstv);
-    auto result = handler.get();
+    for (uint32_t i = 0; i < args->rep_count; i++) {
+        if (args->batch_submit) {
+            uint32_t opcount = args->batch_size;
+
+            if (args->barrier_after_n_operations > 0) {
+                opcount += opcount / args->barrier_after_n_operations;
+            }
+
+            const auto st = std::chrono::high_resolution_clock::now();
+
+            auto sequence = dml::sequence(opcount, std::allocator<uint8_t>());
+
+            for (uint32_t j = 0; j < args->batch_size; j++) {
+                const auto status = sequence.add(dml::mem_copy, srcv, dstv);
+                CHECK_STATUS(status, "Adding operation to batch failed!");
+
+                // only insert barriers when they were requested, which also
+                // guards the modulo below against a division by zero
+                if (args->barrier_after_n_operations > 0 && j % args->barrier_after_n_operations == 0) {
+                    sequence.add(dml::nop);
+                }
+            }
 
-    const auto et = std::chrono::high_resolution_clock::now();
+            auto handler = dml::submit<path>(dml::batch, sequence);
+
+            const auto se = std::chrono::high_resolution_clock::now();
+
+            auto result = handler.get();
+
+            const auto et = std::chrono::high_resolution_clock::now();
+
+            const dml::status_code status = result.status;
+            CHECK_STATUS(status, "Operation completed with an Error!");
+
+            submission_durations.emplace_back(std::chrono::duration_cast<std::chrono::microseconds>(se - st).count());
+            completion_durations.emplace_back(std::chrono::duration_cast<std::chrono::microseconds>(et - se).count());
+            combined_durations.emplace_back(std::chrono::duration_cast<std::chrono::microseconds>(et - st).count());
+        }
+        else {
+            const auto st = std::chrono::high_resolution_clock::now();
+
+            // we use the asynchronous submit-routine even though this is not required
+            // here, however the project later on will only use async operation and
+            // therefore this behaviour should be benchmarked
+            auto handler = dml::submit<path>(dml::mem_copy, srcv, dstv);
+
+            const auto se = std::chrono::high_resolution_clock::now();
+
+            auto result = handler.get();
+
+            const auto et = std::chrono::high_resolution_clock::now();
+
+            const dml::status_code status = result.status;
+            CHECK_STATUS(status, "Operation completed with an Error!");
+
+            submission_durations.emplace_back(std::chrono::duration_cast<std::chrono::microseconds>(se - st).count());
+            completion_durations.emplace_back(std::chrono::duration_cast<std::chrono::microseconds>(et - se).count());
+            combined_durations.emplace_back(std::chrono::duration_cast<std::chrono::microseconds>(et - st).count());
+        }
+
+        args->rep_completed++;
+    }
 
     // free the allocated memory regions on the selected nodes
     numa_free(src, args->size);
     numa_free(dst, args->size);
 
-    args->duration = std::chrono::duration_cast<std::chrono::microseconds>(et - st);
-    args->status = result.status;
+    args->combined_duration = avg(combined_durations);
+    args->complete_duration = avg(completion_durations);
+    args->submit_duration = avg(submission_durations);
 
     args->sig = nullptr;
 
     return nullptr;
 }
 
 template <typename path>
-void execute_mem_move(std::vector<ThreadArgs>& args) {
+void execute_dml_memcpy(std::vector<ThreadArgs>& args) {
     sem_t sem;
     std::vector<pthread_t> threads;
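
For reference, below is a minimal standalone sketch of the batch-submission pattern that the new batch branch uses, assuming the Intel DML C++ API as it appears in the patch. It targets the dml::software execution path so it runs without DSA hardware; the buffer size, batch length, and the std::allocator<uint8_t> element type are illustrative assumptions, not values taken from the patch, and the nop barriers and NUMA-aware allocation are omitted.

#include <dml/dml.hpp>

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
    constexpr std::size_t size = 4096;
    constexpr std::uint32_t batch_size = 4;

    // plain heap buffers instead of numa_alloc_onnode to keep the sketch portable
    std::vector<std::uint8_t> src(size, 0xAB);
    std::vector<std::uint8_t> dst(size, 0x00);

    dml::data_view srcv = dml::make_view(src.data(), src.size());
    dml::data_view dstv = dml::make_view(dst.data(), dst.size());

    // build a sequence of identical copy descriptors, as the batch branch does
    auto sequence = dml::sequence(batch_size, std::allocator<std::uint8_t>());

    for (std::uint32_t i = 0; i < batch_size; i++) {
        if (sequence.add(dml::mem_copy, srcv, dstv) != dml::status_code::ok) {
            std::cerr << "failed to append copy operation to the batch" << std::endl;
            return 1;
        }
    }

    // asynchronous submit followed by a blocking wait, mirroring the benchmark;
    // dml::software executes the batch on the CPU instead of a DSA engine
    auto handler = dml::submit<dml::software>(dml::batch, sequence);
    auto result = handler.get();

    if (result.status != dml::status_code::ok) {
        std::cerr << "batch completed with an error" << std::endl;
        return 1;
    }

    std::cout << "batch of " << batch_size << " copies completed" << std::endl;
    return 0;
}

Timing the two calls separately, as the patch does around dml::submit and handler.get(), splits the cost of descriptor preparation and submission from the cost of waiting for completion.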