restructure of directory layout

1 year ago · bc8c4f8ab3
12 changed files with 101 additions and 93 deletions
--- a/.gitmodules
+++ b/.gitmodules
@ -1,6 +1,6 @@
 [submodule "gosh"]
 	path = thesis/gosh
 	url = https://github.com/nfeske/gosh.git
-[submodule "benchmarks/json"]
-	path = benchmarks/json
+[submodule "benchmarks/util/json"]
+	path = benchmarks/util/json
 	url = https://github.com/nlohmann/json
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@ -10,7 +10,6 @@ find_package(NUMA REQUIRED)

 set(DML_SOURCE_DIR "../../DML/include/")
 set(SOURCES main.cpp)
-set(INCLUDES benchmark.hpp statuscode-tostring.hpp task-data.hpp)

 add_executable(dml-benchmark ${SOURCES})

--- a/benchmarks/benchmark-findings.md
+++ b/benchmarks/benchmark-findings.md
@ -1,4 +0,0 @@
-# cross-copy
- for internode copy, the src-node-engine is only slightly faster than the dst
- for intersocket copy, the src-node-engine is faster than dst-node-engine
- using both in tandem can give about 1.4x speedup over only src-node
--- a/benchmarks/benchmark-plan.md
+++ b/benchmarks/benchmark-plan.md
@ -1,57 +0,0 @@
-# peak-perf
-
- meassure ddr to ddr
- meassure ddr to hbm
-
-All for 1KiB, 4KiB, 1MiB, 1GiB
-
-All for HW and also SW path
-
-All for intra-node, inter-node and inter-socket
-
--> conclude how much overhead DSA engine has
-
--> conclude size after which using HW makes sense
-    this point is reached when submit overhead for
-    hw execution is smaller than entire copy time
-    for sw execution
-
-# submit // done
-
- single submit-and-wait
- multi submit
- batch submit
-
-All with both 1 and 4 engines per WQ
-
-All for 1KiB, 4KiB, 1MiB, 1GiB
-
-All only on DDR and intra-node
-
--> conclude which work submission strategy is best for which size
-
--> conclude whether multiple engines significantly improve batch perf
-
-# mtsubmit // done
-
- multiple threads submit to the same WQ
- use 1,2,4,8,12 threads
-
-All using DDR and 1MiB
-
-All for 1 vs 4 engines
-
-All on DDR and intra-node
-
--> conclude how bad mt submit hurts performance
-
--> conclude whether multiple engines help mt submit
-
-# cross-copy // done
-
- compare which is faster: xcopy, copy from source node, copy from dst node
-
-All for both inter-node and inter-socket copy using DDR and 1MiB on 4E
-
--> conclude where a copy thread should live
-
--- a/benchmarks/benchmark-results/task-description.json
+++ b/benchmarks/benchmark-results/task-description.json
@ -1,21 +0,0 @@
-{
-  "count": 1,
-  "path" : "sw",
-  "list": [
-    {
-      "task": {
-        "size": 4096,
-        "iterations": 1000,
-        "batching": {
-          "batch_submit": false,
-          "batch_size": 0
-        }
-      },
-      "affinity": {
-        "node": 0,
-        "nnode_src": 0,
-        "nnode_dst": 0
-      }
-    }
-  ]
-}
--- a/benchmarks/benchmark.hpp
+++ b/benchmarks/benchmark.hpp
@ -11,9 +11,10 @@

 #include <dml/dml.hpp>

-#include "barrier.hpp"
-#include "statuscode-tostring.hpp"
-#include "task-data.hpp"
+#include "util/barrier.hpp"
+#include "util/statuscode-tostring.hpp"
+#include "util/task-data.hpp"
+#include "util/array_utils.h"

 #define LOG_CODE_INFO "Location: " << __FILE__ << "@" << __LINE__ << "::" << __FUNCTION__ << std::endl
 #define LOG_ERR { pthread_t t = pthread_self(); std::cerr << "--- BEGIN ERROR MSG ---" << std::endl << "Physical: [Node " << args->numa_node << " | Thread " << t << "]" << std::endl; } std::cerr << LOG_CODE_INFO
@ -34,6 +35,8 @@ void* thread_function(void* argp) {
    dml::data_view srcv = dml::make_view(reinterpret_cast<uint8_t*>(src), args->size);
    dml::data_view dstv = dml::make_view(reinterpret_cast<uint8_t*>(dst), args->size);

+    fill_mt(reinterpret_cast<uint8_t*>(src), args->size, std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max());
+
    args->status = dml::status_code::ok;
    args->rep_completed = 0;

--- a/benchmarks/json
+++ b/benchmarks/json
@ -1 +0,0 @@
-Subproject commit 360ce457f46f03111332f473fdbb3a353f16723c
--- a/benchmarks/util/array_utils.h
+++ b/benchmarks/util/array_utils.h
@ -0,0 +1,82 @@
+// source: https://os.inf.tu-dresden.de/repo/gitbox/andre.berthold/Query-driven_Prefetching/src/branch/intel_xeon_max/code/src/utils/array_utils.h
+// author: Andre Berthold
+
+#pragma once
+#include <cstdlib>
+#include <ctime>
+#include <cstdint>
+#include <type_traits>
+#include <random>
+#include <chrono>
+
+#include <immintrin.h>
+
+/// @brief Fills a given array with pseudo-random integers drawn from std::rand(), reseeding via std::srand on every call.
+/// @tparam base_t Datatype of the array
+/// @param dest Pointer to the array
+/// @param size Size of the array in bytes (the loop writes size/sizeof(base_t) elements)
+/// @param min Minimum value of the generated integers (inclusive)
+/// @param max Upper bound of the generated integers (exclusive, because of the modulo by max - min)
+template<typename base_t>
+void fill(base_t * dest, uint64_t size, base_t min, base_t max) {
+    std::srand(std::time(nullptr)); // seed from the current calendar time
+    for(uint64_t i = 0; i < size/sizeof(base_t); ++i) {
+        dest[i] = (std::rand() % (max - min)) + min; // NOTE(review): max == min makes this a modulo by zero - confirm callers never pass equal bounds
+    }
+}
+
+/// @brief Fills a given array with randomly generated integers using the Mersenne Twister engine (type std::mt19937).
+/// @tparam T Integral datatype of the array elements (enforced by the static_assert below)
+/// @param array Pointer to the array
+/// @param size Size of the array in bytes (converted to an element count via size / sizeof(T))
+/// @param min Minimum value of the generated integers (inclusive)
+/// @param max Maximum value of the generated integers (inclusive)
+template <typename T>
+void fill_mt(T* array, uint64_t size, T min, T max, uint64_t int_seed = 0) { // int_seed: explicit seed; 0 (the default) requests a freshly derived seed
+	static_assert(std::is_integral<T>::value, "Data type is not integral.");
+        
+    size = size / sizeof(T); // from here on, size is the number of elements, not bytes
+
+    std::mt19937::result_type seed;
+    if (int_seed == 0) {
+        std::random_device rd; // the derived seed mixes one random_device draw with two clock readings
+        seed = rd() ^ (
+            (std::mt19937::result_type) std::chrono::duration_cast<std::chrono::seconds>(
+                std::chrono::system_clock::now().time_since_epoch()).count() + 
+            (std::mt19937::result_type) std::chrono::duration_cast<std::chrono::microseconds>(
+                std::chrono::high_resolution_clock::now().time_since_epoch()).count());
+    } else seed = int_seed; // caller-supplied seed, converted to the engine's result_type
+        
+    std::mt19937 gen(seed);
+    std::uniform_int_distribution<T> distrib(min, max); // NOTE(review): the standard does not list 8-bit types as valid IntType for std::uniform_int_distribution - confirm behavior for T = uint8_t
+        
+    for (uint64_t j = 0; j < size; ++j) {
+        array[j] = distrib(gen);
+    }
+	
+}
+
+/**
+ * @brief Intended to check whether two arrays of the integral type *T* contain the same values (unfinished: currently always returns false)
+ * 
+ * @tparam T Integral type of *array0* and *array1*
+ * @param array0 Array 0 to check
+ * @param array1 Array 1 to check
+ * @param size_b Size of the two arrays in bytes
+ * @param verbose Decides whether output is verbose or not (print every non-matching number with its index); currently unused
+ * @return bool Whether the content is equal or not; the current implementation returns false unconditionally
+ */
+template <typename T>
+typename std::enable_if<std::is_integral<T>::value, bool>::type
+        check_same(T* array0, T* array1, size_t size_b, bool verbose) {
+    for(uint64_t i = 0; i <= size_b / sizeof(T); i += 64 / sizeof(T)) { // NOTE(review): with "<=" the loop still runs when i equals the element count, loading 64 bytes starting at the end of the arrays - confirm "<" was intended
+        __m512i vec0 = _mm512_stream_load_si512(array0 + i); // 64-byte (512-bit) load; Intel documents this intrinsic as requiring a 64-byte aligned address
+        __m512i vec1 = _mm512_stream_load_si512(array1 + i);
+
+        __mmask8 res = _mm512_cmpeq_epi64_mask(vec0, vec1); // equality mask per 64-bit lane; computed but never inspected yet
+    }
+
+    //TODO complete function
+
+    return false; // placeholder result until the comparison above is actually evaluated
+}
--- a/benchmarks/util/barrier.hpp
+++ b/benchmarks/util/barrier.hpp
--- a/benchmarks/util/json
+++ b/benchmarks/util/json
@ -0,0 +1 @@
+Subproject commit 9cca280a4d0ccf0c08f47a99aa71d1b0e52f8d03
--- a/benchmarks/util/statuscode-tostring.hpp
+++ b/benchmarks/util/statuscode-tostring.hpp
--- a/benchmarks/util/task-data.hpp
+++ b/benchmarks/util/task-data.hpp
@ -21,9 +21,12 @@ struct TaskData {
    // thread output
    dml::status_code status;
    // average run duration in microseconds
-    std::vector<double> combined_duration;
-    std::vector<double> submit_duration;
-    std::vector<double> complete_duration;
+    double combined_duration;
+    double submit_duration;
+    double complete_duration;
+    double combined_duration_stdev;
+    double submit_duration_stdev;
+    double complete_duration_stdev;
    // completed iterations
    uint32_t rep_completed;
    // set by execution
@ -38,10 +41,13 @@ inline void to_json(nlohmann::json& j, const TaskData& a) {
    j["affinity"]["node"] = a.numa_node;
    j["affinity"]["nnode_src"] = a.nnode_src;
    j["affinity"]["nnode_dst"] = a.nnode_dst;
-    j["report"]["time"]["unit"] = "microseconds";
+    j["report"]["time"]["unit"] = "nanoseconds";
    j["report"]["time"]["completion_avg"] = a.complete_duration;
    j["report"]["time"]["submission_avg"] = a.submit_duration;
    j["report"]["time"]["combined_avg"] = a.combined_duration;
+    j["report"]["time"]["completion_stdev"] = a.complete_duration_stdev;
+    j["report"]["time"]["submission_stdev"] = a.submit_duration_stdev;
+    j["report"]["time"]["combined_stdev"] = a.combined_duration_stdev;
    j["report"]["iterations_completed"] = a.rep_completed;
    j["report"]["status"] = StatusCodeToString(a.status);
 }