diff --git a/.gitmodules b/.gitmodules index 2d9be9d..3c5a7e5 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,6 @@ [submodule "gosh"] path = thesis/gosh url = https://github.com/nfeske/gosh.git -[submodule "benchmarks/json"] - path = benchmarks/json +[submodule "benchmarks/util/json"] + path = benchmarks/util/json url = https://github.com/nlohmann/json diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index d396e70..3980ceb 100755 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -10,7 +10,6 @@ find_package(NUMA REQUIRED) set(DML_SOURCE_DIR "../../DML/include/") set(SOURCES main.cpp) -set(INCLUDES benchmark.hpp statuscode-tostring.hpp task-data.hpp) add_executable(dml-benchmark ${SOURCES}) diff --git a/benchmarks/benchmark-findings.md b/benchmarks/benchmark-findings.md deleted file mode 100644 index eac5519..0000000 --- a/benchmarks/benchmark-findings.md +++ /dev/null @@ -1,4 +0,0 @@ -# cross-copy -- for internode copy, the src-node-engine is only slightly faster than the dst -- for intersocket copy, the src-node-engine is faster than dst-node-engine -- using both in tandem can give about 1.4x speedup over only src-node diff --git a/benchmarks/benchmark-plan.md b/benchmarks/benchmark-plan.md deleted file mode 100644 index 1278a15..0000000 --- a/benchmarks/benchmark-plan.md +++ /dev/null @@ -1,57 +0,0 @@ -# peak-perf - -- meassure ddr to ddr -- meassure ddr to hbm - -All for 1KiB, 4KiB, 1MiB, 1GiB - -All for HW and also SW path - -All for intra-node, inter-node and inter-socket - ---> conclude how much overhead DSA engine has - ---> conclude size after which using HW makes sense - this point is reached when submit overhead for - hw execution is smaller than entire copy time - for sw execution - -# submit // done - -- single submit-and-wait -- multi submit -- batch submit - -All with both 1 and 4 engines per WQ - -All for 1KiB, 4KiB, 1MiB, 1GiB - -All only on DDR and intra-node - ---> conclude which work submission strategy is best for which size - ---> conclude whether multiple engines significantly improve batch perf - -# mtsubmit // done - -- multiple threads submit to the same WQ -- use 1,2,4,8,12 threads - -All using DDR and 1MiB - -All for 1 vs 4 engines - -All on DDR and intra-node - ---> conclude how bad mt submit hurts performance - ---> conclude whether multiple engines help mt submit - -# cross-copy // done - -- compare which is faster: xcopy, copy from source node, copy from dst node - -All for both inter-node and inter-socket copy using DDR and 1MiB on 4E - ---> conclude where a copy thread should live - diff --git a/benchmarks/benchmark-results/task-description.json b/benchmarks/benchmark-results/task-description.json deleted file mode 100644 index 3c8086c..0000000 --- a/benchmarks/benchmark-results/task-description.json +++ /dev/null @@ -1,21 +0,0 @@ -{ - "count": 1, - "path" : "sw", - "list": [ - { - "task": { - "size": 4096, - "iterations": 1000, - "batching": { - "batch_submit": false, - "batch_size": 0 - } - }, - "affinity": { - "node": 0, - "nnode_src": 0, - "nnode_dst": 0 - } - } - ] -} \ No newline at end of file diff --git a/benchmarks/benchmark.hpp b/benchmarks/benchmark.hpp index 4752716..aec1ba6 100644 --- a/benchmarks/benchmark.hpp +++ b/benchmarks/benchmark.hpp @@ -11,9 +11,10 @@ #include -#include "barrier.hpp" -#include "statuscode-tostring.hpp" -#include "task-data.hpp" +#include "util/barrier.hpp" +#include "util/statuscode-tostring.hpp" +#include "util/task-data.hpp" +#include "util/array_utils.h" #define LOG_CODE_INFO "Location: " << __FILE__ << "@" << __LINE__ << "::" << __FUNCTION__ << std::endl #define LOG_ERR { pthread_t t = pthread_self(); std::cerr << "--- BEGIN ERROR MSG ---" << std::endl << "Physical: [Node " << args->numa_node << " | Thread " << t << "]" << std::endl; } std::cerr << LOG_CODE_INFO @@ -34,6 +35,8 @@ void* thread_function(void* argp) { dml::data_view srcv = dml::make_view(reinterpret_cast(src), args->size); dml::data_view dstv = dml::make_view(reinterpret_cast(dst), args->size); + fill_mt(reinterpret_cast(src), args->size, std::numeric_limits::min(), std::numeric_limits::max()); + args->status = dml::status_code::ok; args->rep_completed = 0; diff --git a/benchmarks/json b/benchmarks/json deleted file mode 160000 index 360ce45..0000000 --- a/benchmarks/json +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 360ce457f46f03111332f473fdbb3a353f16723c diff --git a/benchmarks/util/array_utils.h b/benchmarks/util/array_utils.h new file mode 100644 index 0000000..d9f8a85 --- /dev/null +++ b/benchmarks/util/array_utils.h @@ -0,0 +1,82 @@ +// source: https://os.inf.tu-dresden.de/repo/gitbox/andre.berthold/Query-driven_Prefetching/src/branch/intel_xeon_max/code/src/utils/array_utils.h +// author: Andre Berthold + +#pragma once +#include +#include +#include +#include +#include +#include + +#include + +/// @brief Fills a given array with random generated integers. +/// @tparam base_t Datatype of the array +/// @param dest Pointer to the array +/// @param size Size of the array +/// @param min Minumum value of the generated integers +/// @param max Maximum value of the generated integers +template +void fill(base_t * dest, uint64_t size, base_t min, base_t max) { + std::srand(std::time(nullptr)); + for(uint64_t i = 0; i < size/sizeof(base_t); ++i) { + dest[i] = (std::rand() % (max - min)) + min; + } +} + +/// @brief Fills a given array with random generated integers using the mersenne twister engine (type std::mt19937). +/// @tparam base_t Datatype of the array +/// @param dest Pointer to the array +/// @param size Size of the array +/// @param min Minumum value of the generated integers +/// @param max Maximum value of the generated integers +template +void fill_mt(T* array, uint64_t size, T min, T max, uint64_t int_seed = 0) { + static_assert(std::is_integral::value, "Data type is not integral."); + + size = size / sizeof(T); + + std::mt19937::result_type seed; + if (int_seed == 0) { + std::random_device rd; + seed = rd() ^ ( + (std::mt19937::result_type) std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()).count() + + (std::mt19937::result_type) std::chrono::duration_cast( + std::chrono::high_resolution_clock::now().time_since_epoch()).count()); + } else seed = int_seed; + + std::mt19937 gen(seed); + std::uniform_int_distribution distrib(min, max); + + for (uint64_t j = 0; j < size; ++j) { + array[j] = distrib(gen); + } + +} + +/** + * @brief Checks if two arrays of the integral type *T* contain the same values + * + * @tparam T Integral type of *array0* and *array1* + * @param array0 Array 0 to check + * @param array1 Array 1 to check + * @param size_b Size of the two arrays in byte + * @param verbose Decides if outputs are verbose of not (print every not matching numbers with their index) + * @return bool Weathor or not the content is equal or not + */ +template +typename std::enable_if::value, bool>::type + check_same(T* array0, T* array1, size_t size_b, bool verbose) { + for(uint64_t i = 0; i <= size_b / sizeof(T); i += 64 / sizeof(T)) { + __m512i vec0 = _mm512_stream_load_si512(array0 + i); + __m512i vec1 = _mm512_stream_load_si512(array1 + i); + + __mmask8 res = _mm512_cmpeq_epi64_mask(vec0, vec1); + } + + //TODO complete function + + return false; +} diff --git a/benchmarks/barrier.hpp b/benchmarks/util/barrier.hpp similarity index 100% rename from benchmarks/barrier.hpp rename to benchmarks/util/barrier.hpp diff --git a/benchmarks/util/json b/benchmarks/util/json new file mode 160000 index 0000000..9cca280 --- /dev/null +++ b/benchmarks/util/json @@ -0,0 +1 @@ +Subproject commit 9cca280a4d0ccf0c08f47a99aa71d1b0e52f8d03 diff --git a/benchmarks/statuscode-tostring.hpp b/benchmarks/util/statuscode-tostring.hpp similarity index 100% rename from benchmarks/statuscode-tostring.hpp rename to benchmarks/util/statuscode-tostring.hpp diff --git a/benchmarks/task-data.hpp b/benchmarks/util/task-data.hpp similarity index 83% rename from benchmarks/task-data.hpp rename to benchmarks/util/task-data.hpp index a78f631..2c770c4 100644 --- a/benchmarks/task-data.hpp +++ b/benchmarks/util/task-data.hpp @@ -21,9 +21,12 @@ struct TaskData { // thread output dml::status_code status; // average run duration in microseconds - std::vector combined_duration; - std::vector submit_duration; - std::vector complete_duration; + double combined_duration; + double submit_duration; + double complete_duration; + double combined_duration_stdev; + double submit_duration_stdev; + double complete_duration_stdev; // completed iterations uint32_t rep_completed; // set by execution @@ -38,10 +41,13 @@ inline void to_json(nlohmann::json& j, const TaskData& a) { j["affinity"]["node"] = a.numa_node; j["affinity"]["nnode_src"] = a.nnode_src; j["affinity"]["nnode_dst"] = a.nnode_dst; - j["report"]["time"]["unit"] = "microseconds"; + j["report"]["time"]["unit"] = "nanoseconds"; j["report"]["time"]["completion_avg"] = a.complete_duration; j["report"]["time"]["submission_avg"] = a.submit_duration; j["report"]["time"]["combined_avg"] = a.combined_duration; + j["report"]["time"]["completion_stdev"] = a.complete_duration_stdev; + j["report"]["time"]["submission_stdev"] = a.submit_duration_stdev; + j["report"]["time"]["combined_stdev"] = a.combined_duration_stdev; j["report"]["iterations_completed"] = a.rep_completed; j["report"]["status"] = StatusCodeToString(a.status); }