Browse Source

restructure of directory layout

master
Constantin Fürst 5 months ago
parent
commit
bc8c4f8ab3
  1. 4
      .gitmodules
  2. 1
      benchmarks/CMakeLists.txt
  3. 4
      benchmarks/benchmark-findings.md
  4. 57
      benchmarks/benchmark-plan.md
  5. 21
      benchmarks/benchmark-results/task-description.json
  6. 9
      benchmarks/benchmark.hpp
  7. 1
      benchmarks/json
  8. 82
      benchmarks/util/array_utils.h
  9. 0
      benchmarks/util/barrier.hpp
  10. 1
      benchmarks/util/json
  11. 0
      benchmarks/util/statuscode-tostring.hpp
  12. 14
      benchmarks/util/task-data.hpp

4
.gitmodules

@ -1,6 +1,6 @@
[submodule "gosh"]
path = thesis/gosh
url = https://github.com/nfeske/gosh.git
[submodule "benchmarks/json"]
path = benchmarks/json
[submodule "benchmarks/util/json"]
path = benchmarks/util/json
url = https://github.com/nlohmann/json

1
benchmarks/CMakeLists.txt

@ -10,7 +10,6 @@ find_package(NUMA REQUIRED)
set(DML_SOURCE_DIR "../../DML/include/")
set(SOURCES main.cpp)
set(INCLUDES benchmark.hpp statuscode-tostring.hpp task-data.hpp)
add_executable(dml-benchmark ${SOURCES})

4
benchmarks/benchmark-findings.md

@ -1,4 +0,0 @@
# cross-copy
- for internode copy, the src-node-engine is only slightly faster than the dst
- for intersocket copy, the src-node-engine is faster than dst-node-engine
- using both in tandem can give about 1.4x speedup over only src-node

57
benchmarks/benchmark-plan.md

@ -1,57 +0,0 @@
# peak-perf
- measure ddr to ddr
- measure ddr to hbm
All for 1KiB, 4KiB, 1MiB, 1GiB
All for HW and also SW path
All for intra-node, inter-node and inter-socket
--> conclude how much overhead DSA engine has
--> conclude size after which using HW makes sense
this point is reached when submit overhead for
hw execution is smaller than entire copy time
for sw execution
# submit // done
- single submit-and-wait
- multi submit
- batch submit
All with both 1 and 4 engines per WQ
All for 1KiB, 4KiB, 1MiB, 1GiB
All only on DDR and intra-node
--> conclude which work submission strategy is best for which size
--> conclude whether multiple engines significantly improve batch perf
# mtsubmit // done
- multiple threads submit to the same WQ
- use 1,2,4,8,12 threads
All using DDR and 1MiB
All for 1 vs 4 engines
All on DDR and intra-node
--> conclude how bad mt submit hurts performance
--> conclude whether multiple engines help mt submit
# cross-copy // done
- compare which is faster: xcopy, copy from source node, copy from dst node
All for both inter-node and inter-socket copy using DDR and 1MiB on 4E
--> conclude where a copy thread should live

21
benchmarks/benchmark-results/task-description.json

@ -1,21 +0,0 @@
{
"count": 1,
"path" : "sw",
"list": [
{
"task": {
"size": 4096,
"iterations": 1000,
"batching": {
"batch_submit": false,
"batch_size": 0
}
},
"affinity": {
"node": 0,
"nnode_src": 0,
"nnode_dst": 0
}
}
]
}

9
benchmarks/benchmark.hpp

@ -11,9 +11,10 @@
#include <dml/dml.hpp>
#include "barrier.hpp"
#include "statuscode-tostring.hpp"
#include "task-data.hpp"
#include "util/barrier.hpp"
#include "util/statuscode-tostring.hpp"
#include "util/task-data.hpp"
#include "util/array_utils.h"
// Stream fragment describing the expansion site: file name, line number and
// enclosing function, terminated by std::endl. Meant to be spliced into an
// existing `<<` chain.
#define LOG_CODE_INFO "Location: " << __FILE__ << "@" << __LINE__ << "::" << __FUNCTION__ << std::endl
// Starts an error report on std::cerr. First, inside its own braces, prints a
// banner with `args->numa_node` and the calling thread's pthread id; then opens
// a `std::cerr << LOG_CODE_INFO` expression so the caller can append
// `<< "message" ...;`.
// Requires a pointer named `args` with a `numa_node` member at the expansion site.
// NOTE(review): expands to a braced block followed by a second statement, so it
// must not be used as the sole body of an unbraced if/else/loop.
#define LOG_ERR { pthread_t t = pthread_self(); std::cerr << "--- BEGIN ERROR MSG ---" << std::endl << "Physical: [Node " << args->numa_node << " | Thread " << t << "]" << std::endl; } std::cerr << LOG_CODE_INFO
@ -34,6 +35,8 @@ void* thread_function(void* argp) {
dml::data_view srcv = dml::make_view(reinterpret_cast<uint8_t*>(src), args->size);
dml::data_view dstv = dml::make_view(reinterpret_cast<uint8_t*>(dst), args->size);
fill_mt(reinterpret_cast<uint8_t*>(src), args->size, std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max());
args->status = dml::status_code::ok;
args->rep_completed = 0;

1
benchmarks/json

@ -1 +0,0 @@
Subproject commit 360ce457f46f03111332f473fdbb3a353f16723c

82
benchmarks/util/array_utils.h

@ -0,0 +1,82 @@
// source: https://os.inf.tu-dresden.de/repo/gitbox/andre.berthold/Query-driven_Prefetching/src/branch/intel_xeon_max/code/src/utils/array_utils.h
// author: Andre Berthold
#pragma once
#include <chrono>
#include <cstdint>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <random>
#include <type_traits>
#include <immintrin.h>
/// @brief Fills a given array with pseudo-random integers generated by std::rand().
/// @tparam base_t Datatype of the array
/// @param dest Pointer to the array
/// @param size Size of the array in bytes; size / sizeof(base_t) elements are written
/// @param min Minimum value of the generated integers (inclusive)
/// @param max Upper bound of the generated integers (exclusive); if max == min,
///            every element is set to min
/// @note std::rand() is re-seeded from std::time() on every call, so two calls
///       within the same second produce the same sequence.
template<typename base_t>
void fill(base_t * dest, uint64_t size, base_t min, base_t max) {
    std::srand(std::time(nullptr));
    const uint64_t count = size / sizeof(base_t);

    // An empty value range would make the modulo below divide by zero
    // (undefined behaviour), so handle it explicitly.
    if (max == min) {
        for (uint64_t i = 0; i < count; ++i) {
            dest[i] = min;
        }
        return;
    }

    for (uint64_t i = 0; i < count; ++i) {
        dest[i] = (std::rand() % (max - min)) + min;
    }
}
/// @brief Fills a given array with random generated integers using the mersenne twister engine (type std::mt19937).
/// @tparam T Integral datatype of the array
/// @param array Pointer to the array
/// @param size Size of the array in bytes; size / sizeof(T) elements are written
/// @param min Minimum value of the generated integers (inclusive)
/// @param max Maximum value of the generated integers (inclusive)
/// @param int_seed Seed for the engine. 0 (the default) means "pick a seed":
///                 std::random_device output is XOR-ed with the current
///                 system-clock and high-resolution-clock readings. Any other
///                 value is used as the seed, giving a reproducible sequence.
template <typename T>
void fill_mt(T* array, uint64_t size, T min, T max, uint64_t int_seed = 0) {
    static_assert(std::is_integral<T>::value, "Data type is not integral.");

    // std::uniform_int_distribution is only specified for short, int, long,
    // long long and their unsigned counterparts. Instantiating it with an
    // 8-bit type (e.g. uint8_t) is undefined and rejected by some standard
    // libraries, so such types are drawn from a (unsigned) short distribution
    // and narrowed afterwards. The requested [min, max] range always fits.
    using dist_t = typename std::conditional<
        (sizeof(T) < sizeof(short)),
        typename std::conditional<std::is_signed<T>::value, short, unsigned short>::type,
        T>::type;

    const uint64_t count = size / sizeof(T);

    std::mt19937::result_type seed;
    if (int_seed == 0) {
        std::random_device rd;
        const auto now_s = std::chrono::duration_cast<std::chrono::seconds>(
            std::chrono::system_clock::now().time_since_epoch()).count();
        const auto now_us = std::chrono::duration_cast<std::chrono::microseconds>(
            std::chrono::high_resolution_clock::now().time_since_epoch()).count();
        seed = rd() ^ (static_cast<std::mt19937::result_type>(now_s) +
                       static_cast<std::mt19937::result_type>(now_us));
    } else {
        seed = int_seed;
    }

    std::mt19937 gen(seed);
    std::uniform_int_distribution<dist_t> distrib(min, max);
    for (uint64_t j = 0; j < count; ++j) {
        array[j] = static_cast<T>(distrib(gen));
    }
}
/**
 * @brief Checks if two arrays of the integral type *T* contain the same values
 *
 * Compares the first size_b / sizeof(T) elements one by one. The comparison is
 * a plain element loop, so it places no alignment requirement on the pointers
 * and never reads past the given size (the earlier AVX-512 draft needed
 * 64-byte aligned input and read one full vector beyond the end).
 *
 * @tparam T Integral type of *array0* and *array1*
 * @param array0 Array 0 to check
 * @param array1 Array 1 to check
 * @param size_b Size of the two arrays in bytes; trailing bytes that do not
 *               form a whole T are ignored
 * @param verbose If true, every mismatching pair is printed to std::cout with
 *                its index and all elements are inspected; if false, the
 *                function returns at the first mismatch without printing
 * @return bool Whether the contents are equal (true for zero elements)
 */
template <typename T>
typename std::enable_if<std::is_integral<T>::value, bool>::type
check_same(T* array0, T* array1, size_t size_b, bool verbose) {
    const size_t count = size_b / sizeof(T);
    bool equal = true;

    for (size_t i = 0; i < count; ++i) {
        if (array0[i] == array1[i]) {
            continue;
        }
        if (!verbose) {
            return false;
        }
        equal = false;
        // Unary plus promotes character-sized types so they print as numbers.
        std::cout << "Mismatch at index " << i << ": "
                  << +array0[i] << " != " << +array1[i] << '\n';
    }

    return equal;
}

0
benchmarks/barrier.hpp → benchmarks/util/barrier.hpp

1
benchmarks/util/json

@ -0,0 +1 @@
Subproject commit 9cca280a4d0ccf0c08f47a99aa71d1b0e52f8d03

0
benchmarks/statuscode-tostring.hpp → benchmarks/util/statuscode-tostring.hpp

14
benchmarks/task-data.hpp → benchmarks/util/task-data.hpp

@ -21,9 +21,12 @@ struct TaskData {
// thread output
dml::status_code status;
// average run duration in microseconds
std::vector<double> combined_duration;
std::vector<double> submit_duration;
std::vector<double> complete_duration;
double combined_duration;
double submit_duration;
double complete_duration;
double combined_duration_stdev;
double submit_duration_stdev;
double complete_duration_stdev;
// completed iterations
uint32_t rep_completed;
// set by execution
@ -38,10 +41,13 @@ inline void to_json(nlohmann::json& j, const TaskData& a) {
j["affinity"]["node"] = a.numa_node;
j["affinity"]["nnode_src"] = a.nnode_src;
j["affinity"]["nnode_dst"] = a.nnode_dst;
j["report"]["time"]["unit"] = "microseconds";
j["report"]["time"]["unit"] = "nanoseconds";
j["report"]["time"]["completion_avg"] = a.complete_duration;
j["report"]["time"]["submission_avg"] = a.submit_duration;
j["report"]["time"]["combined_avg"] = a.combined_duration;
j["report"]["time"]["completion_stdev"] = a.complete_duration_stdev;
j["report"]["time"]["submission_stdev"] = a.submit_duration_stdev;
j["report"]["time"]["combined_stdev"] = a.combined_duration_stdev;
j["report"]["iterations_completed"] = a.rep_completed;
j["report"]["status"] = StatusCodeToString(a.status);
}
Loading…
Cancel
Save