diff --git a/.gitmodules b/.gitmodules
index 2d9be9d..3c5a7e5 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,6 +1,6 @@
 [submodule "gosh"]
 	path = thesis/gosh
 	url = https://github.com/nfeske/gosh.git
-[submodule "benchmarks/json"]
-	path = benchmarks/json
+[submodule "benchmarks/util/json"]
+	path = benchmarks/util/json
 	url = https://github.com/nlohmann/json
diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
index d396e70..3980ceb 100755
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -10,7 +10,6 @@ find_package(NUMA REQUIRED)
 
 set(DML_SOURCE_DIR "../../DML/include/")
 set(SOURCES main.cpp)
-set(INCLUDES benchmark.hpp statuscode-tostring.hpp task-data.hpp)
 
 add_executable(dml-benchmark ${SOURCES})
 
diff --git a/benchmarks/benchmark-findings.md b/benchmarks/benchmark-findings.md
deleted file mode 100644
index eac5519..0000000
--- a/benchmarks/benchmark-findings.md
+++ /dev/null
@@ -1,4 +0,0 @@
-# cross-copy
-- for internode copy, the src-node-engine is only slightly faster than the dst
-- for intersocket copy, the src-node-engine is faster than dst-node-engine
-- using both in tandem can give about 1.4x speedup over only src-node
diff --git a/benchmarks/benchmark-plan.md b/benchmarks/benchmark-plan.md
deleted file mode 100644
index 1278a15..0000000
--- a/benchmarks/benchmark-plan.md
+++ /dev/null
@@ -1,57 +0,0 @@
-# peak-perf
-
-- meassure ddr to ddr
-- meassure ddr to hbm
-
-All for 1KiB, 4KiB, 1MiB, 1GiB
-
-All for HW and also SW path
-
-All for intra-node, inter-node and inter-socket
-
---> conclude how much overhead DSA engine has
-
---> conclude size after which using HW makes sense
-    this point is reached when submit overhead for
-    hw execution is smaller than entire copy time
-    for sw execution
-
-# submit // done
-
-- single submit-and-wait
-- multi submit
-- batch submit
-
-All with both 1 and 4 engines per WQ
-
-All for 1KiB, 4KiB, 1MiB, 1GiB
-
-All only on DDR and intra-node
-
---> conclude which work submission strategy is best for which size
-
---> conclude whether multiple engines significantly improve batch perf
-
-# mtsubmit // done
-
-- multiple threads submit to the same WQ
-- use 1,2,4,8,12 threads
-
-All using DDR and 1MiB
-
-All for 1 vs 4 engines
-
-All on DDR and intra-node
-
---> conclude how bad mt submit hurts performance
-
---> conclude whether multiple engines help mt submit
-
-# cross-copy // done
-
-- compare which is faster: xcopy, copy from source node, copy from dst node
-
-All for both inter-node and inter-socket copy using DDR and 1MiB on 4E
-
---> conclude where a copy thread should live
-
diff --git a/benchmarks/benchmark-results/task-description.json b/benchmarks/benchmark-results/task-description.json
deleted file mode 100644
index 3c8086c..0000000
--- a/benchmarks/benchmark-results/task-description.json
+++ /dev/null
@@ -1,21 +0,0 @@
-{
-  "count": 1,
-  "path" : "sw",
-  "list": [
-    {
-      "task": {
-        "size": 4096,
-        "iterations": 1000,
-        "batching": {
-          "batch_submit": false,
-          "batch_size": 0
-        }
-      },
-      "affinity": {
-        "node": 0,
-        "nnode_src": 0,
-        "nnode_dst": 0
-      }
-    }
-  ]
-}
\ No newline at end of file
diff --git a/benchmarks/benchmark.hpp b/benchmarks/benchmark.hpp
index 4752716..aec1ba6 100644
--- a/benchmarks/benchmark.hpp
+++ b/benchmarks/benchmark.hpp
@@ -11,9 +11,10 @@
 
 #include <dml/dml.hpp>
 
-#include "barrier.hpp"
-#include "statuscode-tostring.hpp"
-#include "task-data.hpp"
+#include "util/barrier.hpp"
+#include "util/statuscode-tostring.hpp"
+#include "util/task-data.hpp"
+#include "util/array_utils.h"
 
 #define LOG_CODE_INFO "Location: " << __FILE__ << "@" << __LINE__ << "::" << __FUNCTION__ << std::endl
 #define LOG_ERR { pthread_t t = pthread_self(); std::cerr << "--- BEGIN ERROR MSG ---" << std::endl << "Physical: [Node " << args->numa_node << " | Thread " << t << "]" << std::endl; } std::cerr << LOG_CODE_INFO
@@ -34,6 +35,8 @@ void* thread_function(void* argp) {
     dml::data_view srcv = dml::make_view(reinterpret_cast<uint8_t*>(src), args->size);
     dml::data_view dstv = dml::make_view(reinterpret_cast<uint8_t*>(dst), args->size);
 
+    fill_mt(reinterpret_cast<uint8_t*>(src), args->size, std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max());
+
     args->status = dml::status_code::ok;
     args->rep_completed = 0;
 
diff --git a/benchmarks/json b/benchmarks/json
deleted file mode 160000
index 360ce45..0000000
--- a/benchmarks/json
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 360ce457f46f03111332f473fdbb3a353f16723c
diff --git a/benchmarks/util/array_utils.h b/benchmarks/util/array_utils.h
new file mode 100644
index 0000000..d9f8a85
--- /dev/null
+++ b/benchmarks/util/array_utils.h
@@ -0,0 +1,82 @@
+// source: https://os.inf.tu-dresden.de/repo/gitbox/andre.berthold/Query-driven_Prefetching/src/branch/intel_xeon_max/code/src/utils/array_utils.h
+// author: Andre Berthold
+
+#pragma once
+#include <cstdlib>
+#include <ctime>
+#include <cstdint>
+#include <type_traits>
+#include <random>
+#include <chrono>
+
+#include <immintrin.h>
+
+/// @brief Fills a given array with pseudo-random integers generated by std::rand().
+/// @tparam base_t Datatype of the array
+/// @param dest Pointer to the array
+/// @param size Size of the array in bytes; size / sizeof(base_t) elements are written
+/// @param min Minimum value of the generated integers (inclusive)
+/// @param max Upper bound of the generated integers (exclusive); if max equals min, every element is set to min
+template<typename base_t>
+void fill(base_t * dest, uint64_t size, base_t min, base_t max) {
+    std::srand(std::time(nullptr));  // note: reseeded on every call with the current time in seconds
+    for(uint64_t i = 0; i < size/sizeof(base_t); ++i) {
+        dest[i] = (max == min) ? min : static_cast<base_t>((std::rand() % (max - min)) + min);  // max == min would be a modulo by zero
+    }
+}
+
+/// @brief Fills an array with uniformly distributed random integers from the closed range [min, max],
+///        generated by the Mersenne Twister engine (std::mt19937).
+/// @tparam T Integral element type of the array
+/// @param array Pointer to the first element of the array
+/// @param size Size of the array in bytes; size / sizeof(T) elements are written
+/// @param min Minimum value of the generated integers (inclusive)
+/// @param max Maximum value of the generated integers (inclusive); must not be smaller than min
+/// @param int_seed Seed for the engine; 0 (the default) derives a seed from std::random_device and the clocks
+template <typename T>
+void fill_mt(T* array, uint64_t size, T min, T max, uint64_t int_seed = 0) {
+    static_assert(std::is_integral<T>::value, "Data type is not integral.");
+
+    // std::uniform_int_distribution is undefined for one-byte types (e.g. uint8_t), so such
+    // values are drawn as (unsigned) short and narrowed on store.
+    using wide_t = typename std::conditional<std::is_signed<T>::value, short, unsigned short>::type;
+    using dist_t = typename std::conditional<(sizeof(T) < sizeof(short)), wide_t, T>::type;
+
+    std::mt19937::result_type seed = static_cast<std::mt19937::result_type>(int_seed);
+    if (int_seed == 0) {
+        const auto now_sys = std::chrono::system_clock::now().time_since_epoch();
+        const auto now_hr = std::chrono::high_resolution_clock::now().time_since_epoch();
+        seed = std::random_device{}() ^ static_cast<std::mt19937::result_type>(
+            std::chrono::duration_cast<std::chrono::seconds>(now_sys).count() +
+            std::chrono::duration_cast<std::chrono::microseconds>(now_hr).count());
+    }
+
+    std::mt19937 gen(seed);
+    std::uniform_int_distribution<dist_t> distrib(min, max);
+    for (uint64_t j = 0; j < size / sizeof(T); ++j) array[j] = static_cast<T>(distrib(gen));
+}
+
+/**
+ * @brief Checks whether two arrays of the integral type *T* contain the same values.
+ *        Full 64-byte chunks are compared with _mm512_stream_load_si512, which requires both arrays to be
+ *        64-byte aligned; elements behind the last full chunk are compared one by one.
+ * @tparam T Integral type of *array0* and *array1*
+ * @param array0 Array 0 to check
+ * @param array1 Array 1 to check
+ * @param size_b Size of the two arrays in bytes
+ * @param verbose Currently unused. TODO: print every mismatching value together with its index
+ * @return bool Whether the contents of both arrays are equal
+ */
+template <typename T>
+typename std::enable_if<std::is_integral<T>::value, bool>::type
+        check_same(T* array0, T* array1, size_t size_b, bool verbose) {
+    const size_t count = size_b / sizeof(T), step = 64 / sizeof(T);
+    size_t i = 0;
+    for (; i + step <= count; i += step) {  // stop before a chunk that would reach past the end
+        const __m512i vec0 = _mm512_stream_load_si512(array0 + i);
+        const __m512i vec1 = _mm512_stream_load_si512(array1 + i);
+        if (_mm512_cmpneq_epi64_mask(vec0, vec1) != 0) return false;
+    }
+    for (; i < count; ++i) if (array0[i] != array1[i]) return false;
+    return true;
+}
diff --git a/benchmarks/barrier.hpp b/benchmarks/util/barrier.hpp
similarity index 100%
rename from benchmarks/barrier.hpp
rename to benchmarks/util/barrier.hpp
diff --git a/benchmarks/util/json b/benchmarks/util/json
new file mode 160000
index 0000000..9cca280
--- /dev/null
+++ b/benchmarks/util/json
@@ -0,0 +1 @@
+Subproject commit 9cca280a4d0ccf0c08f47a99aa71d1b0e52f8d03
diff --git a/benchmarks/statuscode-tostring.hpp b/benchmarks/util/statuscode-tostring.hpp
similarity index 100%
rename from benchmarks/statuscode-tostring.hpp
rename to benchmarks/util/statuscode-tostring.hpp
diff --git a/benchmarks/task-data.hpp b/benchmarks/util/task-data.hpp
similarity index 83%
rename from benchmarks/task-data.hpp
rename to benchmarks/util/task-data.hpp
index a78f631..2c770c4 100644
--- a/benchmarks/task-data.hpp
+++ b/benchmarks/util/task-data.hpp
@@ -21,9 +21,12 @@ struct TaskData {
     // thread output
     dml::status_code status;
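-    // average run duration in microseconds
+    // average run duration and its standard deviation; to_json reports these values with the unit "nanoseconds"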
-    std::vector<double> combined_duration;
-    std::vector<double> submit_duration;
-    std::vector<double> complete_duration;
+    double combined_duration;
+    double submit_duration;
+    double complete_duration;
+    double combined_duration_stdev;
+    double submit_duration_stdev;
+    double complete_duration_stdev;
     // completed iterations
     uint32_t rep_completed;
     // set by execution
@@ -38,10 +41,13 @@ inline void to_json(nlohmann::json& j, const TaskData& a) {
     j["affinity"]["node"] = a.numa_node;
     j["affinity"]["nnode_src"] = a.nnode_src;
     j["affinity"]["nnode_dst"] = a.nnode_dst;
-    j["report"]["time"]["unit"] = "microseconds";
+    j["report"]["time"]["unit"] = "nanoseconds";
     j["report"]["time"]["completion_avg"] = a.complete_duration;
     j["report"]["time"]["submission_avg"] = a.submit_duration;
     j["report"]["time"]["combined_avg"] = a.combined_duration;
+    j["report"]["time"]["completion_stdev"] = a.complete_duration_stdev;
+    j["report"]["time"]["submission_stdev"] = a.submit_duration_stdev;
+    j["report"]["time"]["combined_stdev"] = a.combined_duration_stdev;
     j["report"]["iterations_completed"] = a.rep_completed;
     j["report"]["status"] = StatusCodeToString(a.status);
 }