ignore first five runs to reduce influence of warmup, add non-batch-descriptor batch loop for testing, calculate standard deviation for all three measurements

1 year ago · 1bfb1f316c
3 changed files with 109 additions and 39 deletions
--- a/benchmarks/benchmark.hpp
+++ b/benchmarks/benchmark.hpp
@ -19,10 +19,20 @@ double avg(const std::vector<uint64_t>& v) {
    return static_cast<long double>(std::accumulate(v.begin(), v.end(), 0)) / static_cast<long double>(v.size());
 }

+// Population standard deviation of the samples in v around the given mean,
+// i.e. sqrt(sum((x - mean)^2) / v.size()).
+//
+// v    - measured values
+// mean - mean of v as computed by the caller (it is not recomputed here)
+//
+// Returns 0.0 for an empty sample set instead of dividing zero by zero,
+// which would otherwise yield NaN.
+double stdev(const std::vector<uint64_t>& v, const double mean) {
+    if (v.empty()) {
+        return 0.0;
+    }
+
+    // accumulate the squared deviations in one pass, this avoids the
+    // temporary vector of differences that was allocated per call before
+    const double sq_sum = std::accumulate(v.begin(), v.end(), 0.0, [mean](const double acc, const uint64_t x) {
+        const double diff = static_cast<double>(x) - mean;
+        return acc + diff * diff;
+    });
+
+    return std::sqrt(sq_sum / static_cast<double>(v.size()));
+}
+
+// LOG_CODE_INFO: stream fragment with the file, line and function of the
+// place where the macro is expanded, terminated by std::endl.
 #define LOG_CODE_INFO "Location: " << __FILE__ << "@" << __LINE__ << "::" << __FUNCTION__ << std::endl
+// LOG_ERR: writes an error header containing args->numa_node and the id of
+// the calling pthread to std::cerr and then leaves "std::cerr << LOG_CODE_INFO"
+// open, so the user can keep streaming further details with <<.
+// Requires a variable named args to be in scope at the point of use.
 #define LOG_ERR { pthread_t t = pthread_self(); std::cerr << "--- BEGIN ERROR MSG ---" << std::endl << "Physical: [Node " << args->numa_node << " | Thread " << t << "]" << std::endl; } std::cerr << LOG_CODE_INFO
+// CHECK_STATUS: if status is not dml::status_code::ok, logs the status and msg
+// through LOG_ERR, stores the status in args->status and returns nullptr
+// from the enclosing function. Requires args to be in scope as well.
 #define CHECK_STATUS(status,msg) { if (status != dml::status_code::ok) { LOG_ERR << "Status Code: " << StatusCodeToString(status) << std::endl << msg << std::endl; args->status = status; return nullptr; }}
 
+// ADD_TIMING_MESSUREMENT: appends the submission (se - st), completion (et - se)
+// and combined (et - st) durations in microseconds to submission_durations,
+// completion_durations and combined_durations. Nothing is recorded while the
+// iteration index i is below 5. Expects i, st, se, et and the three
+// containers to be visible at the point of use.
+#define ADD_TIMING_MESSUREMENT { if (i >= 5) { submission_durations.emplace_back(std::chrono::duration_cast<std::chrono::microseconds>(se - st).count()); completion_durations.emplace_back(std::chrono::duration_cast<std::chrono::microseconds>(et - se).count()); combined_durations.emplace_back(std::chrono::duration_cast<std::chrono::microseconds>(et - st).count());}}
+
 template <typename path>
 void* thread_function(void* argp) {
    TaskData* args = reinterpret_cast<TaskData*>(argp);
@ -43,21 +53,17 @@ void* thread_function(void* argp) {
    args->status = dml::status_code::ok;
    args->rep_completed = 0;

-    for (uint32_t i = 0; i < args->rep_count; i++) {
+    // we add 5 as the first 5 iterations will not be measured
+    // to remove exceptional values encountered during warmup
+    for (uint32_t i = 0; i < args->rep_count + 5; i++) {
        // synchronize the start of each iteration
        // using the barrier structure
        args->barrier_->wait();

        if (args->batch_submit) {
-            uint32_t opcount = args->batch_size;
+            const auto st = std::chrono::steady_clock::now();

-            if (args->barrier_after_n_operations > 0) {
-                opcount += opcount / args->barrier_after_n_operations;
-            }
-
-            const auto st = std::chrono::high_resolution_clock::now();
-
-            auto sequence = dml::sequence(opcount, std::allocator<dml::byte_t>());
+            auto sequence = dml::sequence(args->batch_size, std::allocator<dml::byte_t>());

            for (uint32_t j = 0; j < args->batch_size; j++) {
                // block_on_fault() is required to submit the task in a way so that the
@ -66,10 +72,6 @@ void* thread_function(void* argp) {

                const auto status = sequence.add(dml::mem_copy.block_on_fault(), srcv, dstv);
                CHECK_STATUS(status, "Adding operation to batch failed!");
-
-                if (j % args->barrier_after_n_operations == 0) {
-                    sequence.add(dml::nop);
-                }
            }

            // we use the asynchronous submit-routine even though this is not required
@ -78,21 +80,48 @@ void* thread_function(void* argp) {
            
            auto handler = dml::submit<path>(dml::batch, sequence);

-            const auto se = std::chrono::high_resolution_clock::now();
+            const auto se = std::chrono::steady_clock::now();

            auto result = handler.get();

-            const auto et = std::chrono::high_resolution_clock::now();
+            const auto et = std::chrono::steady_clock::now();

            const dml::status_code status = result.status;
            CHECK_STATUS(status, "Batch completed with an Error!");

-            submission_durations.emplace_back(std::chrono::duration_cast<std::chrono::microseconds>(se - st).count());
-            completion_durations.emplace_back(std::chrono::duration_cast<std::chrono::microseconds>(et - se).count());
-            combined_durations.emplace_back(std::chrono::duration_cast<std::chrono::microseconds>(et - st).count());
+            ADD_TIMING_MESSUREMENT;
+        }
+        else if (args->batch_size > 1) {
+            // implementation for non-batched batch submit follows here
+            // this means we submit a bunch of work as single descriptors
+            // but then don't wait for the completion immediately
+
+            std::vector<dml::handler<dml::mem_copy_operation, std::allocator<uint8_t>>> handlers;
+
+            const auto st = std::chrono::steady_clock::now();
+
+            for (uint32_t j = 0; j < args->batch_size; j++) {
+                // block_on_fault() is required to submit the task in a way so that the
+                // DSA engine can handle page faults itself together with the IOMMU which
+                // requires the WQ to be configured to allow this too
+
+                handlers.emplace_back(dml::submit<path>(dml::mem_copy.block_on_fault(), srcv, dstv));
+            }
+
+            const auto se = std::chrono::steady_clock::now();
+
+            for (auto& handler : handlers) {
+                auto result = handler.get();
+                const dml::status_code status = result.status;
+                CHECK_STATUS(status, "Operation completed with an Error!");
+            }
+
+            const auto et = std::chrono::steady_clock::now();
+
+            ADD_TIMING_MESSUREMENT;
        }
        else {
-            const auto st = std::chrono::high_resolution_clock::now();
+            const auto st = std::chrono::steady_clock::now();

            // we use the asynchronous submit-routine even though this is not required
            // here, however the project later on will only use async operation and
@ -102,18 +131,16 @@ void* thread_function(void* argp) {
            // requires the WQ to be configured to allow this too
            auto handler = dml::submit<path>(dml::mem_copy.block_on_fault(), srcv, dstv);

-            const auto se = std::chrono::high_resolution_clock::now();
+            const auto se = std::chrono::steady_clock::now();

            auto result = handler.get();

-            const auto et = std::chrono::high_resolution_clock::now();
+            const auto et = std::chrono::steady_clock::now();

            const dml::status_code status = result.status;
            CHECK_STATUS(status, "Operation completed with an Error!");

-            submission_durations.emplace_back(std::chrono::duration_cast<std::chrono::microseconds>(se - st).count());
-            completion_durations.emplace_back(std::chrono::duration_cast<std::chrono::microseconds>(et - se).count());
-            combined_durations.emplace_back(std::chrono::duration_cast<std::chrono::microseconds>(et - st).count());
+            ADD_TIMING_MESSUREMENT;
        }

        args->rep_completed++;
@ -126,6 +153,9 @@ void* thread_function(void* argp) {
    args->combined_duration = avg(combined_durations);
    args->complete_duration = avg(completion_durations);
    args->submit_duration = avg(submission_durations);
+    args->combined_duration_stdev = stdev(combined_durations, args->combined_duration);
+    args->complete_duration_stdev = stdev(completion_durations, args->complete_duration);
+    args->submit_duration_stdev = stdev(submission_durations, args->submit_duration);

    return nullptr;
 }
--- a/benchmarks/task-data.hpp
+++ b/benchmarks/task-data.hpp
@ -18,25 +18,41 @@ struct TaskData {
    uint32_t rep_count;
    bool batch_submit;
    uint32_t batch_size;
-    uint32_t barrier_after_n_operations;
    // thread output
    dml::status_code status;
    // average run duration in microseconds
    double combined_duration;
    double submit_duration;
    double complete_duration;
+    double combined_duration_stdev;
+    double submit_duration_stdev;
+    double complete_duration_stdev;
    // completed iterations
    uint32_t rep_completed;
    // set by execution
    barrier* barrier_;
 };

+// Task description in the form it is read from the JSON work description.
+// In contrast to TaskData it carries a list of sizes; the remaining
+// settings are shared by all of them.
+struct ReadTaskData {
+    // thread placement / engine selection
+    // (read from ["affinity"]["node"])
+    uint8_t numa_node;
+    // region sizes (read from the ["task"]["size"] array) and
+    // source+destination node for the move
+    std::vector<size_t> sizes;
+    uint8_t nnode_src;
+    uint8_t nnode_dst;
+    // repetition and batching settings, copied unchanged into each TaskData
+    uint32_t rep_count;
+    bool batch_submit;
+    uint32_t batch_size;
+
+    // builds one TaskData per entry of sizes from the shared settings above,
+    // intended to extend the task vector v
+    void AddToTaskVector(std::vector<TaskData>& v) const;
+};
+
 inline void to_json(nlohmann::json& j, const TaskData& a) {
    j["task"]["size"] = a.size;
    j["task"]["iterations"] = a.rep_count;
-    j["task"]["batching"]["enabled"] = a.batch_submit;
+    j["task"]["batching"]["batch_submit"] = a.batch_submit;
    j["task"]["batching"]["batch_size"] = a.batch_size;
-    j["task"]["batching"]["barrier_after_n_operations"] = a.barrier_after_n_operations;
    j["affinity"]["node"] = a.numa_node;
    j["affinity"]["nnode_src"] = a.nnode_src;
    j["affinity"]["nnode_dst"] = a.nnode_dst;
@ -44,21 +60,40 @@ inline void to_json(nlohmann::json& j, const TaskData& a) {
    j["report"]["time"]["completion_avg"] = a.complete_duration;
    j["report"]["time"]["submission_avg"] = a.submit_duration;
    j["report"]["time"]["combined_avg"] = a.combined_duration;
+    j["report"]["time"]["completion_stdev"] = a.complete_duration_stdev;
+    j["report"]["time"]["submission_stdev"] = a.submit_duration_stdev;
+    j["report"]["time"]["combined_stdev"] = a.combined_duration_stdev;
    j["report"]["iterations_completed"] = a.rep_completed;
    j["report"]["status"] = StatusCodeToString(a.status);
 }

-inline void from_json(const nlohmann::json& j, TaskData& a) {
-    j["task"]["size"].get_to(a.size);
+// Deserialises one entry of the work description into a ReadTaskData.
+//
+// j - json object holding the "task" and "affinity" sections
+// a - output; the sizes listed in j are appended to a.sizes and the
+//     remaining fields are overwritten
+//
+// Reads ["task"]["size_count"] elements from the ["task"]["size"] array.
+inline void from_json(const nlohmann::json& j, ReadTaskData& a) {
+    // the count lives in the "task" object, looking it up under a misspelled
+    // key on a const json does not yield a usable value
+    const uint32_t size_count = j["task"]["size_count"].template get<uint32_t>();
+    for (uint32_t i = 0; i < size_count; i++) {
+        // read with the element type of a.sizes so that sizes of
+        // 4 GiB and above are not truncated
+        a.sizes.emplace_back(j["task"]["size"][i].template get<size_t>());
+    }
+
    j["task"]["iterations"].get_to(a.rep_count);
-    j["task"]["batching"]["enabled"].get_to(a.batch_submit);
+    j["task"]["batching"]["batch_submit"].get_to(a.batch_submit);
    j["task"]["batching"]["batch_size"].get_to(a.batch_size);
-    j["task"]["batching"]["barrier_after_n_operations"].get_to(a.barrier_after_n_operations);
    j["affinity"]["node"].get_to(a.numa_node);
    j["affinity"]["nnode_src"].get_to(a.nnode_src);
    j["affinity"]["nnode_dst"].get_to(a.nnode_dst);
 }

+// Expands this description into one TaskData per entry of sizes and
+// appends them to v in the order of sizes.
+//
+// v - task vector to extend, existing elements are left untouched
+//
+// Each created task receives the shared repetition, batching and affinity
+// settings; all other TaskData fields are value-initialised.
+inline void ReadTaskData::AddToTaskVector(std::vector<TaskData> &v) const {
+    for (const auto s : sizes) {
+        // value-initialise so the output fields (status, durations,
+        // rep_completed, barrier_) do not hold indeterminate values
+        TaskData t{};
+        t.size = s;
+        t.rep_count = rep_count;
+        t.batch_submit = batch_submit;
+        t.batch_size = batch_size;
+        t.numa_node = numa_node;
+        t.nnode_dst = nnode_dst;
+        t.nnode_src = nnode_src;
+
+        // without this the freshly built task is discarded at the end of
+        // the iteration and no work ever reaches the caller
+        v.push_back(t);
+    }
+}
+
 inline void WriteResultLog(const std::vector<TaskData>& args, const std::string& path, std::ostream& os) {
    nlohmann::json json;

@ -74,10 +109,15 @@ inline void ReadWorkDescription(std::vector<TaskData>& args, std::string& path,
    is >> json;

    const uint32_t count = json.at("count");
-    args.resize(count);
+    std::vector<ReadTaskData> rtd;
+    rtd.resize(count);
    path = json.at("path");

    for (uint32_t i = 0; i < count; i++) {
-        args[i] = json["list"][i].template get<TaskData>();
+        rtd[i] = json["list"][i].template get<ReadTaskData>();
+    }
+
+    for (const auto& e : rtd) {
+        e.AddToTaskVector(args);
    }
 }
--- a/benchmarks/task-description.json
+++ b/benchmarks/task-description.json
@ -1,15 +1,15 @@
 {
  "count": 1,
-  "path" : "sw",
+  "path" : "hw",
  "list": [
    {
      "task": {
-        "size": 4096,
-        "iterations": 10000,
+        "size_count": 5,
+        "size": [ 1024, 4096, 1048576, 134217728, 1073741824 ],
+        "iterations": 1000,
        "batching": {
-          "enabled": false,
-          "batch_size": 100,
-          "barrier_after_n_operations": 10
+          "batch_submit": false,
+          "batch_size": 0
        }
      },
      "affinity": {