diff --git a/benchmarks/benchmark.hpp b/benchmarks/benchmark.hpp index c94c73a..62cb48f 100644 --- a/benchmarks/benchmark.hpp +++ b/benchmarks/benchmark.hpp @@ -19,10 +19,20 @@ double avg(const std::vector& v) { return static_cast(std::accumulate(v.begin(), v.end(), 0)) / static_cast(v.size()); } +double stdev(const std::vector& v, const double mean) { + std::vector diff(v.size()); + std::transform(v.begin(), v.end(), diff.begin(), [mean](double x) { return x - mean; }); + const double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0); + const double stdev = std::sqrt(sq_sum / static_cast(v.size())); + return stdev; +} + #define LOG_CODE_INFO "Location: " << __FILE__ << "@" << __LINE__ << "::" << __FUNCTION__ << std::endl #define LOG_ERR { pthread_t t = pthread_self(); std::cerr << "--- BEGIN ERROR MSG ---" << std::endl << "Physical: [Node " << args->numa_node << " | Thread " << t << "]" << std::endl; } std::cerr << LOG_CODE_INFO #define CHECK_STATUS(status,msg) { if (status != dml::status_code::ok) { LOG_ERR << "Status Code: " << StatusCodeToString(status) << std::endl << msg << std::endl; args->status = status; return nullptr; }} +#define ADD_TIMING_MESSUREMENT { if (i >= 5) { submission_durations.emplace_back(std::chrono::duration_cast(se - st).count()); completion_durations.emplace_back(std::chrono::duration_cast(et - se).count()); combined_durations.emplace_back(std::chrono::duration_cast(et - st).count());}} + template void* thread_function(void* argp) { TaskData* args = reinterpret_cast(argp); @@ -43,21 +53,17 @@ void* thread_function(void* argp) { args->status = dml::status_code::ok; args->rep_completed = 0; - for (uint32_t i = 0; i < args->rep_count; i++) { + // we add 5 as the first 5 iterations will not be measured + // to remove exceptional values encountered during warmup + for (uint32_t i = 0; i < args->rep_count + 5; i++) { // synchronize the start of each iteration // using the barrier structure args->barrier_->wait(); if 
(args->batch_submit) { - uint32_t opcount = args->batch_size; - - if (args->barrier_after_n_operations > 0) { - opcount += opcount / args->barrier_after_n_operations; - } - - const auto st = std::chrono::high_resolution_clock::now(); + const auto st = std::chrono::steady_clock::now(); - auto sequence = dml::sequence(opcount, std::allocator()); + auto sequence = dml::sequence(args->batch_size, std::allocator()); for (uint32_t j = 0; j < args->batch_size; j++) { // block_on_fault() is required to submit the task in a way so that the @@ -66,10 +72,6 @@ void* thread_function(void* argp) { const auto status = sequence.add(dml::mem_copy.block_on_fault(), srcv, dstv); CHECK_STATUS(status, "Adding operation to batch failed!"); - - if (j % args->barrier_after_n_operations == 0) { - sequence.add(dml::nop); - } } // we use the asynchronous submit-routine even though this is not required @@ -78,21 +80,48 @@ void* thread_function(void* argp) { auto handler = dml::submit(dml::batch, sequence); - const auto se = std::chrono::high_resolution_clock::now(); + const auto se = std::chrono::steady_clock::now(); auto result = handler.get(); - const auto et = std::chrono::high_resolution_clock::now(); + const auto et = std::chrono::steady_clock::now(); const dml::status_code status = result.status; CHECK_STATUS(status, "Batch completed with an Error!"); - submission_durations.emplace_back(std::chrono::duration_cast(se - st).count()); - completion_durations.emplace_back(std::chrono::duration_cast(et - se).count()); - combined_durations.emplace_back(std::chrono::duration_cast(et - st).count()); + ADD_TIMING_MESSUREMENT; + } + else if (args->batch_size > 1) { + // implementation for non-batched batch submit follows here + // this means we submit a bunch of work as single descriptors + // but then dont wait for the completion immediately + + std::vector>> handlers; + + const auto st = std::chrono::steady_clock::now(); + + for (uint32_t j = 0; j < args->batch_size; j++) { + // 
block_on_fault() is required to submit the task in a way so that the + // DSA engine can handle page faults itself together with the IOMMU which + // requires the WQ to be configured to allow this too + + handlers.emplace_back(dml::submit(dml::mem_copy.block_on_fault(), srcv, dstv)); + } + + const auto se = std::chrono::steady_clock::now(); + + for (auto& handler : handlers) { + auto result = handler.get(); + const dml::status_code status = result.status; + CHECK_STATUS(status, "Operation completed with an Error!"); + } + + const auto et = std::chrono::steady_clock::now(); + + ADD_TIMING_MESSUREMENT; } else { - const auto st = std::chrono::high_resolution_clock::now(); + const auto st = std::chrono::steady_clock::now(); // we use the asynchronous submit-routine even though this is not required // here, however the project later on will only use async operation and @@ -102,18 +131,16 @@ void* thread_function(void* argp) { // requires the WQ to be configured to allow this too auto handler = dml::submit(dml::mem_copy.block_on_fault(), srcv, dstv); - const auto se = std::chrono::high_resolution_clock::now(); + const auto se = std::chrono::steady_clock::now(); auto result = handler.get(); - const auto et = std::chrono::high_resolution_clock::now(); + const auto et = std::chrono::steady_clock::now(); const dml::status_code status = result.status; CHECK_STATUS(status, "Operation completed with an Error!"); - submission_durations.emplace_back(std::chrono::duration_cast(se - st).count()); - completion_durations.emplace_back(std::chrono::duration_cast(et - se).count()); - combined_durations.emplace_back(std::chrono::duration_cast(et - st).count()); + ADD_TIMING_MESSUREMENT; } args->rep_completed++; @@ -126,6 +153,9 @@ void* thread_function(void* argp) { args->combined_duration = avg(combined_durations); args->complete_duration = avg(completion_durations); args->submit_duration = avg(submission_durations); + args->combined_duration_stdev = stdev(combined_durations, 
args->combined_duration); + args->complete_duration_stdev = stdev(completion_durations, args->complete_duration); + args->submit_duration_stdev = stdev(submission_durations, args->submit_duration); return nullptr; } diff --git a/benchmarks/task-data.hpp b/benchmarks/task-data.hpp index 17ca8a0..4174c22 100644 --- a/benchmarks/task-data.hpp +++ b/benchmarks/task-data.hpp @@ -18,25 +18,41 @@ struct TaskData { uint32_t rep_count; bool batch_submit; uint32_t batch_size; - uint32_t barrier_after_n_operations; // thread output dml::status_code status; // average run duration in microseconds double combined_duration; double submit_duration; double complete_duration; + double combined_duration_stdev; + double submit_duration_stdev; + double complete_duration_stdev; // completed iterations uint32_t rep_completed; // set by execution barrier* barrier_; }; +struct ReadTaskData { + // thread placement / engine selection + uint8_t numa_node; + // region size and source+destination for move + std::vector sizes; + uint8_t nnode_src; + uint8_t nnode_dst; + // repetition + uint32_t rep_count; + bool batch_submit; + uint32_t batch_size; + + void AddToTaskVector(std::vector& v) const; +}; + inline void to_json(nlohmann::json& j, const TaskData& a) { j["task"]["size"] = a.size; j["task"]["iterations"] = a.rep_count; - j["task"]["batching"]["enabled"] = a.batch_submit; + j["task"]["batching"]["batch_submit"] = a.batch_submit; j["task"]["batching"]["batch_size"] = a.batch_size; - j["task"]["batching"]["barrier_after_n_operations"] = a.barrier_after_n_operations; j["affinity"]["node"] = a.numa_node; j["affinity"]["nnode_src"] = a.nnode_src; j["affinity"]["nnode_dst"] = a.nnode_dst; @@ -44,21 +60,41 @@ inline void to_json(nlohmann::json& j, const TaskData& a) { j["report"]["time"]["completion_avg"] = a.complete_duration; j["report"]["time"]["submission_avg"] = a.submit_duration; j["report"]["time"]["combined_avg"] = a.combined_duration; + j["report"]["time"]["completion_stdev"] = 
 a.complete_duration_stdev; + j["report"]["time"]["submission_stdev"] = a.submit_duration_stdev; + j["report"]["time"]["combined_stdev"] = a.combined_duration_stdev; j["report"]["iterations_completed"] = a.rep_completed; j["report"]["status"] = StatusCodeToString(a.status); } -inline void from_json(const nlohmann::json& j, TaskData& a) { - j["task"]["size"].get_to(a.size); +inline void from_json(const nlohmann::json& j, ReadTaskData& a) { + const uint32_t size_count = j["task"]["size_count"].template get(); + for (uint32_t i = 0; i < size_count; i++) { + a.sizes.emplace_back(j["task"]["size"][i].template get()); + } + j["task"]["iterations"].get_to(a.rep_count); - j["task"]["batching"]["enabled"].get_to(a.batch_submit); + j["task"]["batching"]["batch_submit"].get_to(a.batch_submit); j["task"]["batching"]["batch_size"].get_to(a.batch_size); - j["task"]["batching"]["barrier_after_n_operations"].get_to(a.barrier_after_n_operations); j["affinity"]["node"].get_to(a.numa_node); j["affinity"]["nnode_src"].get_to(a.nnode_src); j["affinity"]["nnode_dst"].get_to(a.nnode_dst); } +inline void ReadTaskData::AddToTaskVector(std::vector &v) const { + for (const auto s : sizes) { + TaskData t; + t.size = s; + t.rep_count = rep_count; + t.batch_submit = batch_submit; + t.batch_size = batch_size; + t.numa_node = numa_node; + t.nnode_dst = nnode_dst; + t.nnode_src = nnode_src; + v.push_back(t); + } +} + inline void WriteResultLog(const std::vector& args, const std::string& path, std::ostream& os) { nlohmann::json json; @@ -74,10 +110,15 @@ inline void ReadWorkDescription(std::vector& args, std::string& path, is >> json; const uint32_t count = json.at("count"); - args.resize(count); + std::vector rtd; + rtd.resize(count); path = json.at("path"); for (uint32_t i = 0; i < count; i++) { - args[i] = json["list"][i].template get(); + rtd[i] = json["list"][i].template get(); + } + + for (const auto& e : rtd) { + e.AddToTaskVector(args); + } } \ No newline at end of file diff --git 
a/benchmarks/task-description.json b/benchmarks/task-description.json index 3d6c3cf..fc648b4 100644 --- a/benchmarks/task-description.json +++ b/benchmarks/task-description.json @@ -1,15 +1,15 @@ { "count": 1, - "path" : "sw", + "path" : "hw", "list": [ { "task": { - "size": 4096, - "iterations": 10000, + "size_count": 5, + "size": [ 1024, 4096, 1048576, 134217728, 1073741824 ], + "iterations": 1000, "batching": { - "enabled": false, - "batch_size": 100, - "barrier_after_n_operations": 10 + "batch_submit": false, + "batch_size": 0 } }, "affinity": {