diff --git a/benchmarks/benchmark.hpp b/benchmarks/benchmark.hpp index c94c73a..62cb48f 100644 --- a/benchmarks/benchmark.hpp +++ b/benchmarks/benchmark.hpp @@ -19,10 +19,20 @@ double avg(const std::vector& v) { return static_cast(std::accumulate(v.begin(), v.end(), 0)) / static_cast(v.size()); } +double stdev(const std::vector& v, const double mean) { + std::vector diff(v.size()); + std::transform(v.begin(), v.end(), diff.begin(), [mean](double x) { return x - mean; }); + const double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0); + const double stdev = std::sqrt(sq_sum / static_cast(v.size())); + return stdev; +} + #define LOG_CODE_INFO "Location: " << __FILE__ << "@" << __LINE__ << "::" << __FUNCTION__ << std::endl #define LOG_ERR { pthread_t t = pthread_self(); std::cerr << "--- BEGIN ERROR MSG ---" << std::endl << "Physical: [Node " << args->numa_node << " | Thread " << t << "]" << std::endl; } std::cerr << LOG_CODE_INFO #define CHECK_STATUS(status,msg) { if (status != dml::status_code::ok) { LOG_ERR << "Status Code: " << StatusCodeToString(status) << std::endl << msg << std::endl; args->status = status; return nullptr; }} +#define ADD_TIMING_MESSUREMENT { if (i >= 5) { submission_durations.emplace_back(std::chrono::duration_cast(se - st).count()); completion_durations.emplace_back(std::chrono::duration_cast(et - se).count()); combined_durations.emplace_back(std::chrono::duration_cast(et - st).count());}} + template void* thread_function(void* argp) { TaskData* args = reinterpret_cast(argp); @@ -43,21 +53,17 @@ void* thread_function(void* argp) { args->status = dml::status_code::ok; args->rep_completed = 0; - for (uint32_t i = 0; i < args->rep_count; i++) { + // we add 5 as the first 5 iterations will not be measured + // to remove exceptional values encountered during warmup + for (uint32_t i = 0; i < args->rep_count + 5; i++) { // synchronize the start of each iteration // using the barrier structure args->barrier_->wait(); if 
(args->batch_submit) { - uint32_t opcount = args->batch_size; - - if (args->barrier_after_n_operations > 0) { - opcount += opcount / args->barrier_after_n_operations; - } - - const auto st = std::chrono::high_resolution_clock::now(); + const auto st = std::chrono::steady_clock::now(); - auto sequence = dml::sequence(opcount, std::allocator()); + auto sequence = dml::sequence(args->batch_size, std::allocator()); for (uint32_t j = 0; j < args->batch_size; j++) { // block_on_fault() is required to submit the task in a way so that the @@ -66,10 +72,6 @@ void* thread_function(void* argp) { const auto status = sequence.add(dml::mem_copy.block_on_fault(), srcv, dstv); CHECK_STATUS(status, "Adding operation to batch failed!"); - - if (j % args->barrier_after_n_operations == 0) { - sequence.add(dml::nop); - } } // we use the asynchronous submit-routine even though this is not required @@ -78,21 +80,48 @@ void* thread_function(void* argp) { auto handler = dml::submit(dml::batch, sequence); - const auto se = std::chrono::high_resolution_clock::now(); + const auto se = std::chrono::steady_clock::now(); auto result = handler.get(); - const auto et = std::chrono::high_resolution_clock::now(); + const auto et = std::chrono::steady_clock::now(); const dml::status_code status = result.status; CHECK_STATUS(status, "Batch completed with an Error!"); - submission_durations.emplace_back(std::chrono::duration_cast(se - st).count()); - completion_durations.emplace_back(std::chrono::duration_cast(et - se).count()); - combined_durations.emplace_back(std::chrono::duration_cast(et - st).count()); + ADD_TIMING_MESSUREMENT; + } + else if (args->batch_size > 1) { + // implementation for non-batched batch submit follows here + // this means we submit a bunch of work as single descriptors + // but then dont wait for the completion immediately + + std::vector>> handlers; + + const auto st = std::chrono::steady_clock::now(); + + for (uint32_t j = 0; j < args->batch_size; j++) { + // 
block_on_fault() is required to submit the task in a way so that the + // DSA engine can handle page faults itself together with the IOMMU which + // requires the WQ to be configured to allow this too + + handlers.emplace_back(dml::submit(dml::mem_copy.block_on_fault(), srcv, dstv)); + } + + const auto se = std::chrono::steady_clock::now(); + + for (auto& handler : handlers) { + auto result = handler.get(); + const dml::status_code status = result.status; + CHECK_STATUS(status, "Operation completed with an Error!"); + } + + const auto et = std::chrono::steady_clock::now(); + + ADD_TIMING_MESSUREMENT; } else { - const auto st = std::chrono::high_resolution_clock::now(); + const auto st = std::chrono::steady_clock::now(); // we use the asynchronous submit-routine even though this is not required // here, however the project later on will only use async operation and @@ -102,18 +131,16 @@ void* thread_function(void* argp) { // requires the WQ to be configured to allow this too auto handler = dml::submit(dml::mem_copy.block_on_fault(), srcv, dstv); - const auto se = std::chrono::high_resolution_clock::now(); + const auto se = std::chrono::steady_clock::now(); auto result = handler.get(); - const auto et = std::chrono::high_resolution_clock::now(); + const auto et = std::chrono::steady_clock::now(); const dml::status_code status = result.status; CHECK_STATUS(status, "Operation completed with an Error!"); - submission_durations.emplace_back(std::chrono::duration_cast(se - st).count()); - completion_durations.emplace_back(std::chrono::duration_cast(et - se).count()); - combined_durations.emplace_back(std::chrono::duration_cast(et - st).count()); + ADD_TIMING_MESSUREMENT; } args->rep_completed++; @@ -126,6 +153,9 @@ void* thread_function(void* argp) { args->combined_duration = avg(combined_durations); args->complete_duration = avg(completion_durations); args->submit_duration = avg(submission_durations); + args->combined_duration_stdev = stdev(combined_durations, 
args->combined_duration); + args->complete_duration_stdev = stdev(completion_durations, args->complete_duration); + args->submit_duration_stdev = stdev(submission_durations, args->submit_duration); return nullptr; } diff --git a/benchmarks/task-data.hpp b/benchmarks/task-data.hpp index 17ca8a0..4174c22 100644 --- a/benchmarks/task-data.hpp +++ b/benchmarks/task-data.hpp @@ -18,25 +18,41 @@ struct TaskData { uint32_t rep_count; bool batch_submit; uint32_t batch_size; - uint32_t barrier_after_n_operations; // thread output dml::status_code status; // average run duration in microseconds double combined_duration; double submit_duration; double complete_duration; + double combined_duration_stdev; + double submit_duration_stdev; + double complete_duration_stdev; // completed iterations uint32_t rep_completed; // set by execution barrier* barrier_; }; +struct ReadTaskData { + // thread placement / engine selection + uint8_t numa_node; + // region size and source+destination for move + std::vector sizes; + uint8_t nnode_src; + uint8_t nnode_dst; + // repetition + uint32_t rep_count; + bool batch_submit; + uint32_t batch_size; + + void AddToTaskVector(std::vector& v) const; +}; + inline void to_json(nlohmann::json& j, const TaskData& a) { j["task"]["size"] = a.size; j["task"]["iterations"] = a.rep_count; - j["task"]["batching"]["enabled"] = a.batch_submit; + j["task"]["batching"]["batch_submit"] = a.batch_submit; j["task"]["batching"]["batch_size"] = a.batch_size; - j["task"]["batching"]["barrier_after_n_operations"] = a.barrier_after_n_operations; j["affinity"]["node"] = a.numa_node; j["affinity"]["nnode_src"] = a.nnode_src; j["affinity"]["nnode_dst"] = a.nnode_dst; @@ -44,21 +60,41 @@ inline void to_json(nlohmann::json& j, const TaskData& a) { j["report"]["time"]["completion_avg"] = a.complete_duration; j["report"]["time"]["submission_avg"] = a.submit_duration; j["report"]["time"]["combined_avg"] = a.combined_duration; + j["report"]["time"]["completion_stdev"] = 
 a.complete_duration_stdev; + j["report"]["time"]["submission_stdev"] = a.submit_duration_stdev; + j["report"]["time"]["combined_stdev"] = a.combined_duration_stdev; j["report"]["iterations_completed"] = a.rep_completed; j["report"]["status"] = StatusCodeToString(a.status); } -inline void from_json(const nlohmann::json& j, TaskData& a) { - j["task"]["size"].get_to(a.size); +inline void from_json(const nlohmann::json& j, ReadTaskData& a) { + const uint32_t size_count = j["task"]["size_count"].template get(); + for (uint32_t i = 0; i < size_count; i++) { + a.sizes.emplace_back(j["task"]["size"][i].template get()); + } + j["task"]["iterations"].get_to(a.rep_count); - j["task"]["batching"]["enabled"].get_to(a.batch_submit); + j["task"]["batching"]["batch_submit"].get_to(a.batch_submit); j["task"]["batching"]["batch_size"].get_to(a.batch_size); - j["task"]["batching"]["barrier_after_n_operations"].get_to(a.barrier_after_n_operations); j["affinity"]["node"].get_to(a.numa_node); j["affinity"]["nnode_src"].get_to(a.nnode_src); j["affinity"]["nnode_dst"].get_to(a.nnode_dst); } +inline void ReadTaskData::AddToTaskVector(std::vector &v) const { + for (const auto s : sizes) { + TaskData t; + t.size = s; + t.rep_count = rep_count; + t.batch_submit = batch_submit; + t.batch_size = batch_size; + t.numa_node = numa_node; + t.nnode_dst = nnode_dst; + t.nnode_src = nnode_src; + v.push_back(t); + } +} + inline void WriteResultLog(const std::vector& args, const std::string& path, std::ostream& os) { nlohmann::json json; @@ -74,10 +110,15 @@ inline void ReadWorkDescription(std::vector& args, std::string& path, is >> json; const uint32_t count = json.at("count"); - args.resize(count); + std::vector rtd; + rtd.resize(count); path = json.at("path"); for (uint32_t i = 0; i < count; i++) { - args[i] = json["list"][i].template get(); + rtd[i] = json["list"][i].template get(); + } + + for (const auto& e : rtd) { + e.AddToTaskVector(args); + } } \ No newline at end of file diff --git 
a/benchmarks/task-description.json b/benchmarks/task-description.json index 3d6c3cf..fc648b4 100644 --- a/benchmarks/task-description.json +++ b/benchmarks/task-description.json @@ -1,15 +1,15 @@ { "count": 1, - "path" : "sw", + "path" : "hw", "list": [ { "task": { - "size": 4096, - "iterations": 10000, + "size_count": 5, + "size": [ 1024, 4096, 1048576, 134217728, 1073741824 ], + "iterations": 1000, "batching": { - "enabled": false, - "batch_size": 100, - "barrier_after_n_operations": 10 + "batch_submit": false, + "batch_size": 0 } }, "affinity": {