Constantin Fürst
1 year ago
12 changed files with 101 additions and 93 deletions
-
4.gitmodules
-
1benchmarks/CMakeLists.txt
-
4benchmarks/benchmark-findings.md
-
57benchmarks/benchmark-plan.md
-
21benchmarks/benchmark-results/task-description.json
-
9benchmarks/benchmark.hpp
-
1benchmarks/json
-
82benchmarks/util/array_utils.h
-
0benchmarks/util/barrier.hpp
-
1benchmarks/util/json
-
0benchmarks/util/statuscode-tostring.hpp
-
14benchmarks/util/task-data.hpp
@ -1,6 +1,6 @@ |
|||
[submodule "gosh"] |
|||
path = thesis/gosh |
|||
url = https://github.com/nfeske/gosh.git |
|||
[submodule "benchmarks/json"] |
|||
path = benchmarks/json |
|||
[submodule "benchmarks/util/json"] |
|||
path = benchmarks/util/json |
|||
url = https://github.com/nlohmann/json |
@ -1,4 +0,0 @@ |
|||
# cross-copy |
|||
- for internode copy, the src-node-engine is only slightly faster than the dst |
|||
- for intersocket copy, the src-node-engine is faster than dst-node-engine |
|||
- using both in tandem can give about 1.4x speedup over only src-node |
@ -1,57 +0,0 @@ |
|||
# peak-perf |
|||
|
|||
- measure ddr to ddr |
|||
- measure ddr to hbm |
|||
|
|||
All for 1KiB, 4KiB, 1MiB, 1GiB |
|||
|
|||
All for HW and also SW path |
|||
|
|||
All for intra-node, inter-node and inter-socket |
|||
|
|||
--> conclude how much overhead DSA engine has |
|||
|
|||
--> conclude size after which using HW makes sense |
|||
this point is reached when submit overhead for |
|||
hw execution is smaller than entire copy time |
|||
for sw execution |
|||
|
|||
# submit // done |
|||
|
|||
- single submit-and-wait |
|||
- multi submit |
|||
- batch submit |
|||
|
|||
All with both 1 and 4 engines per WQ |
|||
|
|||
All for 1KiB, 4KiB, 1MiB, 1GiB |
|||
|
|||
All only on DDR and intra-node |
|||
|
|||
--> conclude which work submission strategy is best for which size |
|||
|
|||
--> conclude whether multiple engines significantly improve batch perf |
|||
|
|||
# mtsubmit // done |
|||
|
|||
- multiple threads submit to the same WQ |
|||
- use 1,2,4,8,12 threads |
|||
|
|||
All using DDR and 1MiB |
|||
|
|||
All for 1 vs 4 engines |
|||
|
|||
All on DDR and intra-node |
|||
|
|||
--> conclude how badly mt submit hurts performance |
|||
|
|||
--> conclude whether multiple engines help mt submit |
|||
|
|||
# cross-copy // done |
|||
|
|||
- compare which is faster: xcopy, copy from source node, copy from dst node |
|||
|
|||
All for both inter-node and inter-socket copy using DDR and 1MiB on 4E |
|||
|
|||
--> conclude where a copy thread should live |
|||
|
@ -1,21 +0,0 @@ |
|||
{ |
|||
"count": 1, |
|||
"path" : "sw", |
|||
"list": [ |
|||
{ |
|||
"task": { |
|||
"size": 4096, |
|||
"iterations": 1000, |
|||
"batching": { |
|||
"batch_submit": false, |
|||
"batch_size": 0 |
|||
} |
|||
}, |
|||
"affinity": { |
|||
"node": 0, |
|||
"nnode_src": 0, |
|||
"nnode_dst": 0 |
|||
} |
|||
} |
|||
] |
|||
} |
@ -0,0 +1,82 @@ |
|||
// source: https://os.inf.tu-dresden.de/repo/gitbox/andre.berthold/Query-driven_Prefetching/src/branch/intel_xeon_max/code/src/utils/array_utils.h |
|||
// author: Andre Berthold |
|||
|
|||
#pragma once |
|||
#include <chrono>
#include <cstdint>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <random>
#include <type_traits>

#include <immintrin.h>
|||
|
|||
/// @brief Fills a given array with pseudo-random integers generated by std::rand.
/// @tparam base_t Datatype of the array elements
/// @param dest Pointer to the array
/// @param size Size of the array in bytes; size / sizeof(base_t) elements are written
/// @param min Minimum value of the generated integers (inclusive)
/// @param max Upper bound of the generated integers (exclusive for min < max);
///            if max == min every element is set to min
/// @note Re-seeds the global std::rand state with the current time on every call.
template<typename base_t>
void fill(base_t * dest, uint64_t size, base_t min, base_t max) {
    std::srand(std::time(nullptr));

    const uint64_t count = size / sizeof(base_t);

    // An empty value range would turn the modulo below into a division by zero.
    if (max == min) {
        for (uint64_t i = 0; i < count; ++i) {
            dest[i] = min;
        }
        return;
    }

    for (uint64_t i = 0; i < count; ++i) {
        dest[i] = (std::rand() % (max - min)) + min;
    }
}
|||
|
|||
/// @brief Fills a given array with random integers using the Mersenne Twister engine (type std::mt19937).
/// @tparam T Integral datatype of the array elements
/// @param array Pointer to the array
/// @param size Size of the array in bytes; size / sizeof(T) elements are written
/// @param min Minimum value of the generated integers (inclusive)
/// @param max Maximum value of the generated integers (inclusive); must not be smaller than min
/// @param int_seed Seed for the engine; 0 (the default) derives a seed from
///                 std::random_device combined with the current time
template <typename T>
void fill_mt(T* array, uint64_t size, T min, T max, uint64_t int_seed = 0) {
    static_assert(std::is_integral<T>::value, "Data type is not integral.");

    // std::uniform_int_distribution is only specified for short, int, long,
    // long long and their unsigned counterparts. Single-byte types are drawn
    // through a 16-bit distribution and narrowed back afterwards.
    using dist_t = typename std::conditional<
        sizeof(T) == 1,
        typename std::conditional<std::is_signed<T>::value, short, unsigned short>::type,
        T>::type;
    using seed_t = std::mt19937::result_type;

    const uint64_t count = size / sizeof(T);

    seed_t seed;
    if (int_seed == 0) {
        // No explicit seed requested: mix hardware entropy with wall-clock time.
        std::random_device rd;
        const auto seconds = std::chrono::duration_cast<std::chrono::seconds>(
            std::chrono::system_clock::now().time_since_epoch()).count();
        const auto micros = std::chrono::duration_cast<std::chrono::microseconds>(
            std::chrono::high_resolution_clock::now().time_since_epoch()).count();
        seed = rd() ^ (static_cast<seed_t>(seconds) + static_cast<seed_t>(micros));
    } else {
        seed = static_cast<seed_t>(int_seed);
    }

    std::mt19937 gen(seed);
    std::uniform_int_distribution<dist_t> distrib(min, max);

    for (uint64_t j = 0; j < count; ++j) {
        array[j] = static_cast<T>(distrib(gen));
    }
}
|||
|
|||
/**
 * @brief Checks if two arrays of the integral type *T* contain the same values
 *
 * The arrays are compared element by element with plain loads, so the buffers
 * need no particular alignment and no specific instruction-set extension.
 * Only the size_b / sizeof(T) complete elements are compared; trailing bytes
 * that do not form a whole element are ignored.
 *
 * @tparam T Integral type of *array0* and *array1*
 * @param array0 Array 0 to check
 * @param array1 Array 1 to check
 * @param size_b Size of the two arrays in bytes
 * @param verbose If true, every mismatching pair is printed to stdout together
 *                with its index; if false, the check stops at the first mismatch
 * @return bool Whether the contents of the two arrays are equal
 */
template <typename T>
typename std::enable_if<std::is_integral<T>::value, bool>::type
check_same(T* array0, T* array1, size_t size_b, bool verbose) {
    const size_t count = size_b / sizeof(T);
    bool same = true;

    for (size_t i = 0; i < count; ++i) {
        if (array0[i] == array1[i]) {
            continue;
        }

        if (!verbose) {
            // Nothing to report, so the first difference already decides the result.
            return false;
        }

        same = false;
        // Unary plus promotes character types so they print as numbers.
        std::cout << "Mismatch at index " << i << ": "
                  << +array0[i] << " != " << +array1[i] << '\n';
    }

    return same;
}
Write
Preview
Loading…
Cancel
Save
Reference in new issue