Constantin Fürst
11 months ago
33 changed files with 4682 additions and 0 deletions
-
104qdp_project/.gitignore
-
104qdp_project/CMakeLists.txt
-
3qdp_project/README.md
-
10qdp_project/bench_all_dimes.sh
-
15qdp_project/bench_max.sh
-
33qdp_project/cmake_all_dimes.sh
-
9qdp_project/cmake_max.sh
-
0qdp_project/src/.gitkeep
-
316qdp_project/src/algorithm/operators/aggregation.h
-
170qdp_project/src/algorithm/operators/filter.h
-
240qdp_project/src/benchmark/DIMES_benchmark.cpp
-
260qdp_project/src/benchmark/DIMES_cores_benchmark.cpp
-
289qdp_project/src/benchmark/MAX_benchmark.cpp
-
147qdp_project/src/benchmark/QDP_minimal.h
-
149qdp_project/src/benchmark/doubly_filtered_agg.cpp
-
184qdp_project/src/benchmark/filter_aggregate_pipeline.cpp
-
188qdp_project/src/benchmark/latency.cpp
-
271qdp_project/src/benchmark/micro_benchmarks.cpp
-
391qdp_project/src/benchmark/pipelines/DIMES_scan_filter_pipe.h
-
395qdp_project/src/benchmark/pipelines/MAX_scan_filter_pipe.h
-
387qdp_project/src/benchmark/pipelines/scan_filter_pipe.h
-
80qdp_project/src/utils/array_utils.h
-
73qdp_project/src/utils/barrier_utils.h
-
33qdp_project/src/utils/const.h
-
82qdp_project/src/utils/cpu_set_utils.h
-
89qdp_project/src/utils/execution_modes.h
-
76qdp_project/src/utils/file_output.h
-
208qdp_project/src/utils/iterable_range.h
-
152qdp_project/src/utils/measurement_utils.h
-
45qdp_project/src/utils/memory_literals.h
-
6qdp_project/src/utils/pcm.h
-
80qdp_project/src/utils/timer_utils.h
-
93qdp_project/src/utils/vector_loader.h
@ -0,0 +1,104 @@ |
|||||
|
|
||||
|
|
||||
|
bin/ |
||||
|
|
||||
|
|
||||
|
# CMake building files |
||||
|
CMakeLists.txt.user |
||||
|
CMakeCache.txt |
||||
|
CMakeFiles |
||||
|
CMakeScripts |
||||
|
Testing |
||||
|
Makefile |
||||
|
cmake_install.cmake |
||||
|
install_manifest.txt |
||||
|
compile_commands.json |
||||
|
CTestTestfile.cmake |
||||
|
_deps |
||||
|
.cmake |
||||
|
|
||||
|
# Prerequisites |
||||
|
*.d |
||||
|
|
||||
|
# Object files |
||||
|
*.o |
||||
|
*.ko |
||||
|
*.obj |
||||
|
*.elf |
||||
|
|
||||
|
# Linker output |
||||
|
*.ilk |
||||
|
*.map |
||||
|
*.exp |
||||
|
|
||||
|
# Precompiled Headers |
||||
|
*.gch |
||||
|
*.pch |
||||
|
|
||||
|
# Libraries |
||||
|
*.lib |
||||
|
*.a |
||||
|
*.la |
||||
|
*.lo |
||||
|
|
||||
|
# Shared objects (inc. Windows DLLs) |
||||
|
*.dll |
||||
|
*.so |
||||
|
*.so.* |
||||
|
*.dylib |
||||
|
|
||||
|
# Executables |
||||
|
*.exe |
||||
|
*.out |
||||
|
*.app |
||||
|
*.i*86 |
||||
|
*.x86_64 |
||||
|
*.hex |
||||
|
|
||||
|
# Debug files |
||||
|
*.dSYM/ |
||||
|
*.su |
||||
|
*.idb |
||||
|
*.pdb |
||||
|
|
||||
|
# Kernel Module Compile Results |
||||
|
*.mod* |
||||
|
*.cmd |
||||
|
.tmp_versions/ |
||||
|
modules.order |
||||
|
Module.symvers |
||||
|
Mkfile.old |
||||
|
dkms.conf |
||||
|
|
||||
|
# Prerequisites |
||||
|
*.d |
||||
|
|
||||
|
# Compiled Object files |
||||
|
*.slo |
||||
|
*.lo |
||||
|
*.o |
||||
|
*.obj |
||||
|
|
||||
|
# Precompiled Headers |
||||
|
*.gch |
||||
|
*.pch |
||||
|
|
||||
|
# Compiled Dynamic libraries |
||||
|
*.so |
||||
|
*.dylib |
||||
|
*.dll |
||||
|
|
||||
|
# Fortran module files |
||||
|
*.mod |
||||
|
*.smod |
||||
|
|
||||
|
# Compiled Static libraries |
||||
|
*.lai |
||||
|
*.la |
||||
|
*.a |
||||
|
*.lib |
||||
|
|
||||
|
# Executables |
||||
|
*.exe |
||||
|
*.out |
||||
|
*.app |
@ -0,0 +1,104 @@ |
|||||
|
cmake_minimum_required(VERSION 3.18) |
||||
|
|
||||
|
# set the project name |
||||
|
project(NUMA_Slow_Fast_Datamigration_Test VERSION 0.1) |
||||
|
|
||||
|
# specify the C standard |
||||
|
set(CMAKE_CXX_STANDARD 20) |
||||
|
set(CMAKE_CXX_STANDARD_REQUIRED True) |
||||
|
|
||||
|
#set flags on need cross compile for sapphirerapids architecture |
||||
|
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=sapphirerapids") |
||||
|
#set flags on need cross compile for skylake micro architecture |
||||
|
#set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=skylake-avx512") |
||||
|
#set flags on need cross compile for knights landing micro architecture (for debugging) |
||||
|
#set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -mavx512cd -mavx512er -mavx512pf") |
||||
|
|
||||
|
#suppress selected! warnigs that are not very important to resolve. This is to keep the compileation output clean |
||||
|
set(SUPPRESS_WARNINGS "-Wno-literal-suffix -Wno-volatile") |
||||
|
|
||||
|
set(DEBUG_FLAGS "-g3" "-ggdb") |
||||
|
set(RELEASE_FLAGS "-O3") |
||||
|
|
||||
|
#set pcm location |
||||
|
set(PCM_LOCATION ./thirdParty/pcm) |
||||
|
set(PCM_LINKS -lpcm -L${CMAKE_CURRENT_LIST_DIR}/${PCM_LOCATION}/build/lib) |
||||
|
# pass the in formation about the shared library location to the linker |
||||
|
link_directories(${CMAKE_CURRENT_LIST_DIR}/${PCM_LOCATION}/build/lib) |
||||
|
|
||||
|
#set flags used for Release and Debug build type |
||||
|
add_compile_options( |
||||
|
"$<$<CONFIG:Release>:${RELEASE_FLAGS}>" |
||||
|
"$<$<CONFIG:Debug>:${DEBUG_FLAGS}>" |
||||
|
) |
||||
|
|
||||
|
# evaluate custom variables |
||||
|
function(eval vname vvalid vdefault) |
||||
|
# is variable is set to the below value if its not already defined from the comand line |
||||
|
set(VALID ${vvalid} CACHE INTERNAL "Possible values for ${vname}") |
||||
|
set(${vname} ${vdefault} CACHE STRING "The barrier mode") |
||||
|
# command for GUI shenanigans |
||||
|
set_property(CACHE ${vname} PROPERTY STRINGS VALID) |
||||
|
|
||||
|
if(${vname} IN_LIST VALID) |
||||
|
message(STATUS "Variable ${vname} = ${${vname}}") |
||||
|
else() |
||||
|
message(STATUS "Variable ${vname} has invalid value ${${vname}}") |
||||
|
# set the fallback value for use in parent function |
||||
|
unset(${vname} CACHE) |
||||
|
message(STATUS "Fallback to default: ${vname} = ${vdefault}") |
||||
|
set(${vname} ${vdefault} PARENT_SCOPE) |
||||
|
endif() |
||||
|
endfunction() |
||||
|
|
||||
|
eval(WSUPPRESS "suppress;show" "show") |
||||
|
if($<STREQUAL:${BUFFER_LIMIT},suppress> EQUAL 1) |
||||
|
add_compile_options("${SUPPRESS_WARNINGS}") |
||||
|
endif() |
||||
|
|
||||
|
eval(BARRIER_MODE "global;local" "global") |
||||
|
add_definitions(-DBARRIER_MODE="${BARRIER_MODE}") |
||||
|
|
||||
|
eval(BUFFER_LIMIT "unlimited;limited" "unlimited") |
||||
|
add_definitions(-DBUFFER_LIMIT=$<STREQUAL:${BUFFER_LIMIT},limited>) |
||||
|
|
||||
|
eval(QUERY "simple;complex" "simple") |
||||
|
add_definitions(-DQUERY=$<STREQUAL:${QUERY},simple>) |
||||
|
|
||||
|
eval(THREAD_FACTOR "1;2;3;4;5;6;7;8;9;10" "4") |
||||
|
add_definitions(-DTHREAD_GROUP_MULTIPLIER=${THREAD_FACTOR}) |
||||
|
|
||||
|
eval(PINNING "cpu;numa" "cpu") |
||||
|
add_definitions(-DPINNING=$<STREQUAL:${PINNING},cpu>) |
||||
|
|
||||
|
eval(PCM_M "true;false" "false") |
||||
|
add_definitions(-DPCM_M=$<STREQUAL:${PCM_M},true>) |
||||
|
add_definitions(${PCM_LINKS}) |
||||
|
|
||||
|
# build directory |
||||
|
set(CMAKE_BINARY_DIR "../bin") #relative to inside build |
||||
|
set(EXECUTABLE_OUTPUT_PATH ${CMAKE_BINARY_DIR}) |
||||
|
|
||||
|
|
||||
|
|
||||
|
# include directories |
||||
|
include_directories(src/utils) |
||||
|
include_directories(src/algorithm) |
||||
|
include_directories(src/algorithm/operators) |
||||
|
include_directories(thirdParty/pcm/src) |
||||
|
|
||||
|
# link libraries |
||||
|
link_libraries(-lnuma -lpthread) |
||||
|
|
||||
|
# Add targets only below |
||||
|
# specify build targets |
||||
|
add_executable(FilterAggregatePipeline src/benchmark/filter_aggregate_pipeline.cpp) |
||||
|
add_executable(DoublyFiltered src/benchmark/doubly_filtered_agg.cpp) |
||||
|
add_executable(DIMESBench src/benchmark/DIMES_benchmark.cpp) |
||||
|
add_executable(DIMESCoreBench src/benchmark/DIMES_cores_benchmark.cpp) |
||||
|
add_executable(MicroBench src/benchmark/micro_benchmarks.cpp) |
||||
|
add_executable(MAXBench src/benchmark/MAX_benchmark.cpp |
||||
|
src/benchmark/QDP_minimal.h) |
||||
|
target_link_libraries(MAXBench libpcm.so) |
||||
|
add_executable(LatencyBench src/benchmark/latency.cpp) |
||||
|
|
@ -0,0 +1,3 @@ |
|||||
|
This is a copy of the Query Driven Prefetching Repository |
||||
|
https://os.inf.tu-dresden.de/repo/gitbox/andre.berthold/Query-driven_Prefetching/src/branch/qdp_minimal/code |
||||
|
Original Authors: André Berthold and Anna Bartuschka |
@ -0,0 +1,10 @@ |
|||||
|
#!bin/bash |
||||
|
|
||||
|
../bin/DIMESBench_gus |
||||
|
../bin/DIMESBench_guc |
||||
|
../bin/DIMESBench_gls |
||||
|
../bin/DIMESBench_glc |
||||
|
../bin/DIMESBench_lus |
||||
|
../bin/DIMESBench_luc |
||||
|
../bin/DIMESBench_lls |
||||
|
../bin/DIMESBench_llc |
@ -0,0 +1,15 @@ |
|||||
|
#!bin/bash |
||||
|
|
||||
|
current_date_time=$(date) |
||||
|
echo "Benchmark start at: $current_date_time" |
||||
|
|
||||
|
../bin/MAXBench_gcc |
||||
|
|
||||
|
cp ../results/max_q-complex_bm-global_bl-unlimited_tc-121MiB-2MiB.csv ../results/max_q-complex_bm-global_bl-unlimited_tc-121MiB-2MiB_pin_c_HBM.csv |
||||
|
|
||||
|
../bin/MAXBench_gcn |
||||
|
|
||||
|
cp ../results/max_q-complex_bm-global_bl-unlimited_tc-121MiB-2MiB.csv ../results/max_q-complex_bm-global_bl-unlimited_tc-121MiB-2MiB_pin_n_HBM.csv |
||||
|
|
||||
|
current_date_time=$(date) |
||||
|
echo "Benchmark end at: $current_date_time" |
@ -0,0 +1,33 @@ |
|||||
|
#!bin/bash |
||||
|
|
||||
|
cmake -DCMAKE_BUILD_TYPE=Release -DWSUPPRESS=suppress -DBARRIER_MODE=global -DBUFFER_LIMIT=unlimited -DQUERY=simple .. |
||||
|
cmake --build . --target DIMESBench |
||||
|
mv ../bin/DIMESBench ../bin/DIMESBench_gus |
||||
|
|
||||
|
cmake -DCMAKE_BUILD_TYPE=Release -DWSUPPRESS=suppress -DBARRIER_MODE=global -DBUFFER_LIMIT=unlimited -DQUERY=complex .. |
||||
|
cmake --build . --target DIMESBench |
||||
|
mv ../bin/DIMESBench ../bin/DIMESBench_guc |
||||
|
|
||||
|
cmake -DCMAKE_BUILD_TYPE=Release -DWSUPPRESS=suppress -DBARRIER_MODE=global -DBUFFER_LIMIT=limited -DQUERY=simple .. |
||||
|
cmake --build . --target DIMESBench |
||||
|
mv ../bin/DIMESBench ../bin/DIMESBench_gls |
||||
|
|
||||
|
cmake -DCMAKE_BUILD_TYPE=Release -DWSUPPRESS=suppress -DBARRIER_MODE=global -DBUFFER_LIMIT=limited -DQUERY=complex .. |
||||
|
cmake --build . --target DIMESBench |
||||
|
mv ../bin/DIMESBench ../bin/DIMESBench_glc |
||||
|
|
||||
|
cmake -DCMAKE_BUILD_TYPE=Release -DWSUPPRESS=suppress -DBARRIER_MODE=local -DBUFFER_LIMIT=unlimited -DQUERY=simple .. |
||||
|
cmake --build . --target DIMESBench |
||||
|
mv ../bin/DIMESBench ../bin/DIMESBench_lus |
||||
|
|
||||
|
cmake -DCMAKE_BUILD_TYPE=Release -DWSUPPRESS=suppress -DBARRIER_MODE=local -DBUFFER_LIMIT=unlimited -DQUERY=complex .. |
||||
|
cmake --build . --target DIMESBench |
||||
|
mv ../bin/DIMESBench ../bin/DIMESBench_luc |
||||
|
|
||||
|
cmake -DCMAKE_BUILD_TYPE=Release -DWSUPPRESS=suppress -DBARRIER_MODE=local -DBUFFER_LIMIT=limited -DQUERY=simple .. |
||||
|
cmake --build . --target DIMESBench |
||||
|
mv ../bin/DIMESBench ../bin/DIMESBench_lls |
||||
|
|
||||
|
cmake -DCMAKE_BUILD_TYPE=Release -DWSUPPRESS=suppress -DBARRIER_MODE=local -DBUFFER_LIMIT=limited -DQUERY=complex .. |
||||
|
cmake --build . --target DIMESBench |
||||
|
mv ../bin/DIMESBench ../bin/DIMESBench_llc |
@ -0,0 +1,9 @@ |
|||||
|
#!bin/bash |
||||
|
|
||||
|
cmake -DCMAKE_BUILD_TYPE=Release -DWSUPPRESS=suppress -DBARRIER_MODE=global -DBUFFER_LIMIT=unlimited -DQUERY=complex -DTHREAD_FACTOR=2 -DPINNING=cpu -DPCM_M=false .. |
||||
|
cmake --build . --target MAXBench |
||||
|
mv ../bin/MAXBench ../bin/MAXBench_gcc |
||||
|
|
||||
|
cmake -DCMAKE_BUILD_TYPE=Release -DWSUPPRESS=suppress -DBARRIER_MODE=global -DBUFFER_LIMIT=unlimited -DQUERY=complex -DTHREAD_FACTOR=2 -DPINNING=numa -DPCM_M=false .. |
||||
|
cmake --build . --target MAXBench |
||||
|
mv ../bin/MAXBench ../bin/MAXBench_gcn |
@ -0,0 +1,316 @@ |
|||||
|
#pragma once |
||||
|
|
||||
|
#include <cstdint> |
||||
|
#include <algorithm> |
||||
|
#include <immintrin.h> |
||||
|
#include <type_traits> |
||||
|
|
||||
|
#include "vector_loader.h" |
||||
|
#include "const.h" |
||||
|
|
||||
|
|
||||
|
/** |
||||
|
* @brief Super Class for all Aggregation functions. Guards Sub Classes from having an non integral base type. |
||||
|
* |
||||
|
* @tparam T |
||||
|
*/ |
||||
|
template <typename T> |
||||
|
class AggFunction { |
||||
|
static_assert(std::is_integral<T>::value, "The base type of an AggFunction must be an integral"); |
||||
|
}; |
||||
|
|
||||
|
/** |
||||
|
* @brief Template class that implements methods used for Summation. It wraps the corresponding vector intrinsics |
||||
|
* |
||||
|
* @tparam T base datatype for the implemented methods |
||||
|
*/ |
||||
|
template<typename T> |
||||
|
class Sum : public AggFunction<T> { |
||||
|
public: |
||||
|
static inline __m512i simd_agg(__m512i aggregator, __m512i vector) { |
||||
|
if constexpr (sizeof(T) == 4) return _mm512_add_epi32(aggregator, vector); |
||||
|
else if constexpr (sizeof(T) == 8) return _mm512_add_epi64(aggregator, vector); |
||||
|
static_assert(sizeof(T) == 4 || sizeof(T) == 8, "Sum is only implemented for 32 and 64 wide integers"); |
||||
|
}; |
||||
|
|
||||
|
static inline __m512i simd_agg(__m512i aggregator, __mmask16 mask, __m512i vector) { |
||||
|
if constexpr (sizeof(T) == 4) return _mm512_mask_add_epi32(aggregator, mask, aggregator, vector); |
||||
|
else if constexpr (sizeof(T) == 8) return _mm512_mask_add_epi64(aggregator, mask, aggregator, vector); |
||||
|
static_assert(sizeof(T) == 4 || sizeof(T) == 8, "Sum is only implemented for 32 and 64 wide integers"); |
||||
|
}; |
||||
|
|
||||
|
static inline T simd_reduce(__m512i vector) { |
||||
|
if constexpr (sizeof(T) == 4) return _mm512_reduce_add_epi32(vector); |
||||
|
else if constexpr (sizeof(T) == 8) return _mm512_reduce_add_epi64(vector); |
||||
|
static_assert(sizeof(T) == 4 || sizeof(T) == 8, "Sum is only implemented for 32 and 64 wide integers"); |
||||
|
}; |
||||
|
|
||||
|
static inline T scalar_agg(T aggregator, T scalar) { return aggregator + scalar; }; |
||||
|
|
||||
|
static inline __m512i zero() { return _mm512_set1_epi32(0); }; |
||||
|
}; |
||||
|
|
||||
|
|
||||
|
/** |
||||
|
* @brief Template class that implements methods used for Maximum determination. It wraps the corresponding vector intrinsics |
||||
|
* |
||||
|
* @tparam T base datatype for the implemented methods |
||||
|
* |
||||
|
*/ |
||||
|
template<typename T> |
||||
|
class Max : public AggFunction<T> { |
||||
|
public: |
||||
|
static inline __m512i simd_agg(__m512i aggregator, __m512i vector) { |
||||
|
if constexpr (sizeof(T) == 4) return _mm512_max_epi32(aggregator, vector); |
||||
|
else if constexpr (sizeof(T) == 8) return _mm512_max_epi64(aggregator, vector); |
||||
|
static_assert(sizeof(T) == 4 || sizeof(T) == 8, "Max is only implemented for 32 and 64 wide integers"); |
||||
|
} |
||||
|
|
||||
|
static inline __m512i simd_agg(__m512i aggregator, __mmask16 mask, __m512i vector) { |
||||
|
if constexpr (sizeof(T) == 4) return _mm512_mask_max_epi32(aggregator, mask, aggregator, vector); |
||||
|
else if constexpr (sizeof(T) == 8) return _mm512_mask_max_epi64(aggregator, mask, aggregator, vector); |
||||
|
static_assert(sizeof(T) == 4 || sizeof(T) == 8, "Max is only implemented for 32 and 64 wide integers"); |
||||
|
} |
||||
|
|
||||
|
static inline T simd_reduce(__m512i vector) { |
||||
|
if constexpr (sizeof(T) == 4) return _mm512_reduce_max_epi32(vector); |
||||
|
else if constexpr (sizeof(T) == 8) return _mm512_reduce_max_epi64(vector); |
||||
|
static_assert(sizeof(T) == 4 || sizeof(T) == 8, "Max is only implemented for 32 and 64 wide integers"); |
||||
|
} |
||||
|
|
||||
|
static inline T scalar_agg(T aggregator, T scalar) { return std::max(aggregator, scalar); } |
||||
|
|
||||
|
static inline __m512i zero() { |
||||
|
if constexpr (sizeof(T) == 4) { |
||||
|
if constexpr (std::is_signed<T>::value) return _mm512_set1_epi32(0xFFFFFFFF); |
||||
|
else return _mm512_set1_epi32(0x0); |
||||
|
} |
||||
|
else if constexpr (sizeof(T) == 8) { |
||||
|
if constexpr (std::is_signed<T>::value) return _mm512_set1_epi32(0xFFFFFFFFFFFFFFFF); |
||||
|
else return _mm512_set1_epi32(0x0); |
||||
|
} |
||||
|
static_assert(sizeof(T) == 4 || sizeof(T) == 8, "Max is only implemented for 32 and 64 wide integers"); |
||||
|
} |
||||
|
}; |
||||
|
|
||||
|
/** |
||||
|
* @brief Template class that implements methods used for Minimum determination. It wraps the corresponding vector intrinsics |
||||
|
* |
||||
|
* @tparam T base datatype for the implemented methods |
||||
|
* |
||||
|
*/ |
||||
|
template<typename T> |
||||
|
class Min : public AggFunction<T> { |
||||
|
public: |
||||
|
static inline __m512i simd_agg(__m512i aggregator, __m512i vector) { |
||||
|
if constexpr (sizeof(T) == 4) return _mm512_min_epi32(aggregator, vector); |
||||
|
else if constexpr (sizeof(T) == 8) return _mm512_min_epi64(aggregator, vector); |
||||
|
static_assert(sizeof(T) == 4 || sizeof(T) == 8, "Min is only implemented for 32 and 64 wide integers"); |
||||
|
} |
||||
|
|
||||
|
static inline __m512i simd_agg(__m512i aggregator, __mmask16 mask, __m512i vector) { |
||||
|
if constexpr (sizeof(T) == 4) return _mm512_mask_min_epi32(aggregator, mask, aggregator, vector); |
||||
|
else if constexpr (sizeof(T) == 8) return _mm512_mask_min_epi64(aggregator, mask, aggregator, vector); |
||||
|
static_assert(sizeof(T) == 4 || sizeof(T) == 8, "Min is only implemented for 32 and 64 wide integers"); |
||||
|
} |
||||
|
|
||||
|
static inline T simd_reduce(__m512i vector) { |
||||
|
if constexpr (sizeof(T) == 4) return _mm512_reduce_min_epi32(vector); |
||||
|
else if constexpr (sizeof(T) == 8) return _mm512_reduce_min_epi64(vector); |
||||
|
static_assert(sizeof(T) == 4 || sizeof(T) == 8, "Min is only implemented for 32 and 64 wide integers"); |
||||
|
} |
||||
|
|
||||
|
static inline T scalar_agg(T aggregator, T scalar) { return std::min(aggregator, scalar); } |
||||
|
|
||||
|
static inline __m512i zero() { |
||||
|
if constexpr (sizeof(T) == 4) { |
||||
|
if constexpr (std::is_signed<T>::value) return _mm512_set1_epi32(0xEFFFFFFF); |
||||
|
else return _mm512_set1_epi32(0xFFFFFFFF); |
||||
|
} |
||||
|
else if constexpr (sizeof(T) == 8) { |
||||
|
if constexpr (std::is_signed<T>::value) return _mm512_set1_epi32(0xEFFFFFFFFFFFFFFF); |
||||
|
else return _mm512_set1_epi32(0xFFFFFFFFFFFFFFFF); |
||||
|
} |
||||
|
static_assert(sizeof(T) == 4 || sizeof(T) == 8, "Min is only implemented for 32 and 64 wide integers"); |
||||
|
} |
||||
|
}; |
||||
|
|
||||
|
/** |
||||
|
* @brief Template Class that implements an aggregation operation. |
||||
|
* |
||||
|
* @tparam base_t Base type of the values for aggregation |
||||
|
* @tparam func |
||||
|
* @tparam load_mode |
||||
|
*/ |
||||
|
template<typename base_t, template<typename _base_t> class func, load_mode load_mode> |
||||
|
class Aggregation{ |
||||
|
public: |
||||
|
|
||||
|
static_assert(std::is_same_v<base_t, uint64_t>, "Enforce unsigned 64 bit ints."); |
||||
|
|
||||
|
using OP = func<base_t>; |
||||
|
/** |
||||
|
* @brief Calculates the memory maximal needed to store a chunk's processing result. |
||||
|
* |
||||
|
* @param chunk_size_b Size of the chunk in byte |
||||
|
* @return size_t Size of the chunk's processing result in byte |
||||
|
*/ |
||||
|
static size_t result_bytes_per_chunk(size_t chunk_size_b) { |
||||
|
// aggregation returns a single value of type base_t |
||||
|
return sizeof(base_t); |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* @brief Applies the aggregation function on the chunk starting at *src* and spanning *chunk_size_b* bytes. |
||||
|
* The result is written to main memory. |
||||
|
* |
||||
|
* @param dest Pointer to the start of the result chunk |
||||
|
* @param src Pointer to the start of the source chunk |
||||
|
* @param chunk_size_b Size of the source chunk in bytes |
||||
|
* @return true When the aggregation is done |
||||
|
* @return false Never |
||||
|
*/ |
||||
|
static bool apply (base_t *dest, base_t *src, size_t chunk_size_b) { |
||||
|
constexpr size_t lanes = VECTOR_SIZE<base_t>(); |
||||
|
size_t value_count = chunk_size_b / sizeof(base_t); |
||||
|
__m512i agg_vec = func<base_t>::zero(); |
||||
|
size_t i = 0; |
||||
|
base_t result = 0; |
||||
|
// stop before! running out of space |
||||
|
if(value_count >= lanes) {// keep in mind value_count is unsigned so if it becomes negative, it doesn't. |
||||
|
for(; i <= value_count - lanes; i += lanes) { |
||||
|
__m512i vec = Vector_Loader<base_t, load_mode>::load(src + i); |
||||
|
|
||||
|
agg_vec = func<base_t>::simd_agg(agg_vec, vec); |
||||
|
} |
||||
|
result = func<base_t>::simd_reduce(agg_vec); |
||||
|
} |
||||
|
|
||||
|
for(; i < value_count; ++i) { |
||||
|
result = func<base_t>::scalar_agg(result, src[i]); |
||||
|
} |
||||
|
*dest = result; |
||||
|
|
||||
|
return true; |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* @brief Applies the aggregation function on the chunk starting at *src* and spanning *chunk_size_b* bytes, |
||||
|
* while applying the bit string stored in *masks*. The result is written to main memory. |
||||
|
* |
||||
|
* @param dest Pointer to the start of the result chunk |
||||
|
* @param src Pointer to the start of the source chunk |
||||
|
* @param masks Pointer the bitstring that marks the values that should be aggregated |
||||
|
* @param chunk_size_b Size of the source chunk in bytes |
||||
|
* @return true When the aggregation is done |
||||
|
* @return false Never |
||||
|
*/ |
||||
|
static bool apply_masked (base_t *dest, base_t *src, uint16_t* msks, size_t chunk_size_b) { |
||||
|
constexpr size_t lanes = VECTOR_SIZE<base_t>(); |
||||
|
uint8_t* masks = (uint8_t *)msks; |
||||
|
size_t value_count = chunk_size_b / sizeof(base_t); |
||||
|
__m512i agg_vec = func<base_t>::zero(); |
||||
|
size_t i = 0; |
||||
|
// stop before! running out of space |
||||
|
if(value_count >= lanes) // keep in mind size_w is unsigned so if it becomes negative, it doesn't. |
||||
|
for(; i <= value_count - lanes; i += lanes) { |
||||
|
__m512i vec = Vector_Loader<base_t, load_mode>::load(src + i); |
||||
|
__mmask8 mask = _mm512_int2mask(masks[i / lanes]); |
||||
|
|
||||
|
agg_vec = func<base_t>::simd_mask_agg(agg_vec, mask, vec); |
||||
|
} |
||||
|
*dest = func<base_t>::simd_reduce(agg_vec); |
||||
|
|
||||
|
for(; i < value_count; ++i) { |
||||
|
uint8_t mask = masks[i / lanes]; |
||||
|
if(mask & (0b1 << (i % lanes))){ |
||||
|
*dest = func<base_t>::scalar_agg(*dest, src[i]); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
return true; |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* @brief Applies the aggregation function on the chunk starting at *src* and spanning *chunk_size_b* bytes, |
||||
|
* while applying the bit string stored in *masks*. The values are agggegated in the register *dest* without |
||||
|
* clearing beforehand. |
||||
|
* |
||||
|
* NOTE! This function only works correctly if the the chunk_size_b is a multiple of 64 byte |
||||
|
* |
||||
|
* @param dest Vector register used for storing and passing the result around |
||||
|
* @param src Pointer to the start of the source chunk |
||||
|
* @param masks Pointer the bitstring that marks the values that should be aggregated |
||||
|
* @param chunk_size_b Size of the source chunk in bytes |
||||
|
* @return __m512i Vector register holding the aggregation result |
||||
|
*/ |
||||
|
static __m512i apply_masked (__m512i dest, base_t *src, uint16_t* msks, size_t chunk_size_b) { |
||||
|
constexpr size_t lanes = VECTOR_SIZE<base_t>(); |
||||
|
uint8_t* masks = (uint8_t*) msks; |
||||
|
//TODO this function does not work if value_count % lanes != 0 |
||||
|
size_t value_count = chunk_size_b / sizeof(base_t); |
||||
|
size_t i = 0; |
||||
|
// stop before! running out of space |
||||
|
if(value_count >= lanes) // keep in mind size_w is unsigned so if it becomes negative, it doesn't. |
||||
|
for(; i <= value_count - lanes; i += lanes) { |
||||
|
__m512i vec = Vector_Loader<base_t, load_mode>::load(src + i); |
||||
|
__mmask8 mask = _mm512_int2mask(masks[i / lanes]); |
||||
|
dest = func<base_t>::simd_agg(dest, mask, vec); |
||||
|
} |
||||
|
|
||||
|
return dest; |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* @brief Applies the aggregation function on the chunk starting at *src* and spanning *chunk_size_b* bytes, |
||||
|
* while applying two bit strings stored in *masks_0* and *masks_1*. The values are aggregated in the register |
||||
|
* *dest* without clearing beforehand. |
||||
|
* |
||||
|
* NOTE! This function only works correctly if the the chunk_size_b is a multiple of 64 byte |
||||
|
* |
||||
|
* @param dest Vector register used for storing and passing the result around |
||||
|
* @param src Pointer to the start of the source chunk |
||||
|
* @param masks_0 Pointer the bitstring that marks the values that should be aggregated |
||||
|
* @param masks_1 Pointer the bitstring that marks the values that should be aggregated |
||||
|
* @param chunk_size_b Size of the source chunk in bytes |
||||
|
* @return __m512i Vector register holding the aggregation result |
||||
|
*/ |
||||
|
static __m512i apply_masked (__m512i dest, base_t *src, uint16_t* msks0, uint16_t* msks1, size_t chunk_size_b) { |
||||
|
constexpr size_t lanes = VECTOR_SIZE<base_t>(); |
||||
|
uint8_t* masks0 = (uint8_t*) msks0; |
||||
|
uint8_t* masks1 = (uint8_t*) msks1; |
||||
|
//TODO this function does not work if value_count % lanes != 0 |
||||
|
size_t value_count = chunk_size_b / sizeof(base_t); |
||||
|
size_t i = 0; |
||||
|
// stop before! running out of space |
||||
|
if(value_count >= lanes) // keep in mind value_count is unsigned so if it becomes negative, it doesn't. |
||||
|
for(; i <= value_count - lanes; i += lanes) { |
||||
|
__m512i vec = Vector_Loader<base_t, load_mode>::load(src + i); |
||||
|
__mmask8 mask0 = _mm512_int2mask(masks0[i / lanes]); |
||||
|
__mmask8 mask1 = _mm512_int2mask(masks1[i / lanes]); |
||||
|
|
||||
|
mask0 = _kand_mask8(mask0, mask1); |
||||
|
dest = func<base_t>::simd_agg(dest, mask0, vec); |
||||
|
} |
||||
|
|
||||
|
return dest; |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* @brief Reduces a vector by applying the aggregation function horizontally. |
||||
|
* |
||||
|
* @param dest Result of the horizontal aggregation |
||||
|
* @param src Vector as source for the horizontal aggregation |
||||
|
* @return true When the operation is done |
||||
|
* @return false Never |
||||
|
*/ |
||||
|
static bool happly (base_t *dest, __m512i src) { |
||||
|
*dest = func<base_t>::simd_reduce(src); |
||||
|
|
||||
|
return true; |
||||
|
} |
||||
|
|
||||
|
static __m512i get_zero() { |
||||
|
return func<base_t>::zero(); |
||||
|
} |
||||
|
}; |
@ -0,0 +1,170 @@ |
|||||
|
#pragma once |
||||
|
|
||||
|
#include<cstdint> |
||||
|
#include<type_traits> |
||||
|
|
||||
|
#include <immintrin.h> |
||||
|
|
||||
|
#include "vector_loader.h" |
||||
|
|
||||
|
/** |
||||
|
* @brief Super Class for all Aggregation functions. Guards Sub Classes from having an non integral base type. |
||||
|
* |
||||
|
* @tparam T An integral datatype |
||||
|
*/ |
||||
|
template<typename T> |
||||
|
class FilterFunction { |
||||
|
static_assert(std::is_integral<T>::value, "The base type of a FilterFunction must be an integeral."); |
||||
|
}; |
||||
|
|
||||
|
/** |
||||
|
* @brief Template class that implements methods used for finding values that are not equal to the compare value. |
||||
|
* It wraps the corresponding vector intrinsics. |
||||
|
* |
||||
|
* @tparam T base datatype for the implemented methods |
||||
|
*/ |
||||
|
template<typename T> |
||||
|
class NEQ : public FilterFunction<T> { |
||||
|
public: |
||||
|
static inline __mmask16 simd_filter(__m512i vector, __m512i comp) { |
||||
|
if constexpr (sizeof(T) == 4) return _mm512_cmpneq_epi32_mask(vector, comp); |
||||
|
else if constexpr (sizeof(T) == 8) return _mm512_cmpneq_epi64_mask(vector, comp); |
||||
|
static_assert(sizeof(T) == 4 || sizeof(T) == 8, "NEQ is only implemented for 32 and 64 wide integers"); |
||||
|
} |
||||
|
|
||||
|
static inline bool scalar_filter(T scalar, T comp) { return scalar != comp; } |
||||
|
}; |
||||
|
|
||||
|
template<typename T> |
||||
|
class EQ : public FilterFunction<T> { |
||||
|
public: |
||||
|
static inline __mmask16 simd_filter(__m512i vector, __m512i comp) { |
||||
|
if constexpr (sizeof(T) == 4) return _mm512_cmpeq_epi32_mask(vector, comp); |
||||
|
else if constexpr (sizeof(T) == 8) return _mm512_cmpeq_epi64_mask(vector, comp); |
||||
|
static_assert(sizeof(T) == 4 || sizeof(T) == 8, "EQ is only implemented for 32 and 64 wide integers"); |
||||
|
} |
||||
|
|
||||
|
static inline bool scalar_filter(T scalar, T comp) { return scalar == comp; } |
||||
|
}; |
||||
|
|
||||
|
template<typename T> |
||||
|
class LT : public FilterFunction<T> { |
||||
|
public: |
||||
|
static inline __mmask16 simd_filter(__m512i vector, __m512i comp) { |
||||
|
if constexpr (sizeof(T) == 4) return _mm512_cmplt_epi32_mask(vector, comp); |
||||
|
else if constexpr (sizeof(T) == 8) return _mm512_cmplt_epi64_mask(vector, comp); |
||||
|
static_assert(sizeof(T) == 4 || sizeof(T) == 8, "LT is only implemented for 32 and 64 wide integers"); |
||||
|
} |
||||
|
|
||||
|
static inline bool scalar_filter(T scalar, T comp) { return scalar < comp; } |
||||
|
}; |
||||
|
|
||||
|
template<typename T> |
||||
|
class LEQ : public FilterFunction<T> { |
||||
|
public: |
||||
|
static inline __mmask16 simd_filter(__m512i vector, __m512i comp) { |
||||
|
if constexpr (sizeof(T) == 4) return _mm512_cmple_epi32_mask(vector, comp); |
||||
|
else if constexpr (sizeof(T) == 8) return _mm512_cmple_epi64_mask(vector, comp); |
||||
|
static_assert(sizeof(T) == 4 || sizeof(T) == 8, "LEQ is only implemented for 32 and 64 wide integers"); |
||||
|
} |
||||
|
|
||||
|
static inline bool scalar_filter(T scalar, T comp) { return scalar <= comp; } |
||||
|
}; |
||||
|
|
||||
|
template<typename T> |
||||
|
class GT : public FilterFunction<T> { |
||||
|
public: |
||||
|
static inline __mmask16 simd_filter(__m512i vector, __m512i comp) { |
||||
|
if constexpr (sizeof(T) == 4) return _mm512_cmpgt_epi32_mask(vector, comp); |
||||
|
else if constexpr (sizeof(T) == 8) return _mm512_cmpgt_epi64_mask(vector, comp); |
||||
|
static_assert(sizeof(T) == 4 || sizeof(T) == 8, "GT is only implemented for 32 and 64 wide integers"); |
||||
|
} |
||||
|
|
||||
|
static inline bool scalar_filter(T scalar, T comp) { return scalar > comp; } |
||||
|
}; |
||||
|
|
||||
|
template<typename T> |
||||
|
class GEQ : public FilterFunction<T> { |
||||
|
public: |
||||
|
static inline __mmask16 simd_filter(__m512i vector, __m512i comp) { |
||||
|
if constexpr (sizeof(T) == 4) return _mm512_cmpge_epi32_mask(vector, comp); |
||||
|
else if constexpr (sizeof(T) == 8) return _mm512_cmpge_epi64_mask(vector, comp); |
||||
|
static_assert(sizeof(T) == 4 || sizeof(T) == 8, "GEQ is only implemented for 32 and 64 wide integers"); |
||||
|
} |
||||
|
|
||||
|
static inline bool scalar_filter(T scalar, T comp) { return scalar >= comp; } |
||||
|
}; |
||||
|
|
||||
|
|
||||
|
template<typename base_t, template<typename _base_t> class func, load_mode load_mode, bool copy> |
||||
|
class Filter { |
||||
|
public: |
||||
|
|
||||
|
static_assert(std::is_same_v<base_t, uint64_t>, "We enforce 64 bit integer"); |
||||
|
|
||||
|
/** |
||||
|
* @brief Calculates the memory maximal needed to store a chunk's processing result. |
||||
|
* |
||||
|
* @param chunk_size_b Size of the chunk in byte |
||||
|
* @return size_t Size of the chunk's processing result in byte |
||||
|
*/ |
||||
|
static size_t result_bytes_per_chunk(size_t chunk_size_b) { |
||||
|
// + 7 to enshure that we have enougth bytes -> / 8 -> rounds down |
||||
|
// if we had 17 / 8 = 2 but (17 + 7) / 8 = 3 |
||||
|
// if we hat 16 / 8 = 2 is right, as well as, 16 + 7 / 8 = 2 |
||||
|
return (chunk_size_b / sizeof(base_t) + 7) / 8; |
||||
|
} |
||||
|
|
||||
|
|
||||
|
/** |
||||
|
* @brief Applies the filter function on the chunk starting at *src* and spanning *chunk_size_b* bytes, while comparing with he same value every time. |
||||
|
* The resulting bit string is written to main memory. |
||||
|
* |
||||
|
* @param dest Pointer to the start of the result chunk |
||||
|
* @param src Pointer to the start of the source chunk |
||||
|
* @param cmp_value Comparision value to compare the values from source to |
||||
|
* @param chunk_size_b Size of the source chunk in bytes |
||||
|
* @return true When the filter operation is done |
||||
|
* @return false Never |
||||
|
*/ |
||||
|
// we only need this impl. yet, as all filter are at the end of a pipeline |
||||
|
static bool apply_same (uint16_t *dst, base_t *buffer, base_t *src, base_t cmp_value, size_t chunk_size_b) { |
||||
|
constexpr uint32_t lanes = VECTOR_SIZE<base_t>(); |
||||
|
uint8_t* dest = (uint8_t*) dst; |
||||
|
size_t value_count = chunk_size_b / sizeof(base_t); |
||||
|
__m512i cmp_vec = _mm512_set1_epi64(cmp_value); |
||||
|
size_t i = 0; |
||||
|
// this weird implementetion is neccessary, see analogous impl in aggregation for explaination |
||||
|
if(value_count > lanes) { |
||||
|
for(; (i < value_count - lanes); i += lanes) { |
||||
|
__m512i vec = Vector_Loader<base_t, load_mode>::load(src + i); |
||||
|
__mmask8 bitmask = func<base_t>::simd_filter(vec, cmp_vec); |
||||
|
|
||||
|
uint8_t int_mask = (uint8_t) _mm512_mask2int(bitmask); |
||||
|
|
||||
|
dest[i / lanes] = int_mask; |
||||
|
if constexpr(copy){ |
||||
|
Vector_Loader<base_t, load_mode>::store(buffer + i, vec); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
auto dest_pos = i / lanes; |
||||
|
uint8_t int_mask = 0; |
||||
|
for(; i < value_count; ++i) { |
||||
|
base_t val = src[i]; |
||||
|
|
||||
|
uint8_t result = func<base_t>::scalar_filter(val, cmp_value); |
||||
|
|
||||
|
int_mask |= (result << (i % lanes)); |
||||
|
|
||||
|
if constexpr(copy){ |
||||
|
buffer[i] = val; |
||||
|
} |
||||
|
} |
||||
|
dest[dest_pos] = int_mask; |
||||
|
|
||||
|
return true; |
||||
|
} |
||||
|
|
||||
|
}; |
@ -0,0 +1,240 @@ |
|||||
|
#include <atomic>
|
||||
|
#include <barrier>
|
||||
|
#include <chrono>
|
||||
|
#include <condition_variable>
|
||||
|
#include <cstdlib>
|
||||
|
#include <cstring>
|
||||
|
#include <fstream>
|
||||
|
#include <future>
|
||||
|
#include <iostream>
|
||||
|
#include <limits>
|
||||
|
#include <list>
|
||||
|
#include <mutex>
|
||||
|
#include <queue>
|
||||
|
#include <thread>
|
||||
|
#include <tuple>
|
||||
|
#include <utility>
|
||||
|
|
||||
|
#include <numa.h>
|
||||
|
|
||||
|
#ifndef THREAD_GROUP_MULTIPLIER
|
||||
|
#define THREAD_GROUP_MULTIPLIER 8
|
||||
|
#endif
|
||||
|
|
||||
|
#ifndef QUERY
|
||||
|
#define QUERY 1
|
||||
|
#endif
|
||||
|
|
||||
|
#ifndef BARRIER_MODE
|
||||
|
#define BARRIER_MODE "global"
|
||||
|
#endif
|
||||
|
|
||||
|
#ifndef BUFFER_LIMIT
|
||||
|
#define BUFFER_LIMIT 1
|
||||
|
#endif
|
||||
|
|
||||
|
#include "const.h"
|
||||
|
|
||||
|
#include "file_output.h"
|
||||
|
#include "array_utils.h"
|
||||
|
#include "timer_utils.h"
|
||||
|
#include "barrier_utils.h"
|
||||
|
#include "cpu_set_utils.h"
|
||||
|
#include "iterable_range.h"
|
||||
|
#include "memory_literals.h"
|
||||
|
#include "pipelines/DIMES_scan_filter_pipe.h"
|
||||
|
|
||||
|
#include "aggregation.h"
|
||||
|
#include "filter.h"
|
||||
|
|
||||
|
using base_t = uint64_t; |
||||
|
|
||||
|
base_t sum_check(base_t compare_value, base_t* row_A, base_t* row_B, size_t row_size) { |
||||
|
base_t sum = 0; |
||||
|
for(int i = 0; i < row_size / sizeof(base_t); ++i) { |
||||
|
sum += (row_A[i] < compare_value) * row_B[i]; |
||||
|
} |
||||
|
return sum; |
||||
|
} |
||||
|
|
||||
|
base_t sum_check_complex(base_t compare_value_a, base_t compare_value_b, base_t* row_A, base_t* row_B, size_t row_size) { |
||||
|
base_t sum = 0; |
||||
|
for(int i = 0; i < row_size / sizeof(base_t); ++i) { |
||||
|
sum += (row_A[i] < compare_value_a && row_B[i] < compare_value_b) * row_B[i]; |
||||
|
} |
||||
|
return sum; |
||||
|
} |
||||
|
|
||||
|
int main(int argc, char** argv) { |
||||
|
// set constants
|
||||
|
const size_t workload_b = 4_GiB; |
||||
|
const base_t compare_value_a = 50; |
||||
|
const base_t compare_value_b = 42; |
||||
|
constexpr bool simple_query = (QUERY == 1); |
||||
|
|
||||
|
const size_t thread_count = 6; |
||||
|
std::ofstream out_file; |
||||
|
out_file.open("../results/dimes_" |
||||
|
"q-" + (std::string)(simple_query == true ? "simple" : "complex") + |
||||
|
"_bm-" + (std::string) BARRIER_MODE + |
||||
|
"_bl-" + (std::string)(BUFFER_LIMIT == 1 ? "limited" : "unlimited") + |
||||
|
"_tc-" + std::to_string(thread_count * THREAD_GROUP_MULTIPLIER) + ".csv"); |
||||
|
|
||||
|
// set benchmark parameter
|
||||
|
Linear_Int_Range<uint32_t, 0, 10, 1> run("run"); |
||||
|
Exp_Int_Range<size_t, 1_MiB, 8_MiB + 1, 2> chunk_size("chunk_size"); |
||||
|
Range<NewPMode, DRAM_base, new_mode_manager, new_mode_manager> mode("mode"); |
||||
|
|
||||
|
uint32_t remote_node = 3; |
||||
|
uint32_t remote_node_2 = 2; |
||||
|
uint32_t local_node = 10; |
||||
|
|
||||
|
print_to_file(out_file, generateHead(run, chunk_size, mode), "thread_group", "time", |
||||
|
#ifdef THREAD_TIMINGS
|
||||
|
"scan_a", "scan_b", "aggr_j", |
||||
|
#endif
|
||||
|
#ifdef BARRIER_TIMINGS
|
||||
|
"wait_scan_a", "wait_scan_b", "wait_aggr_j", |
||||
|
#endif
|
||||
|
"result"); |
||||
|
|
||||
|
|
||||
|
/*** alloc data and buffers ************************************************/ |
||||
|
base_t* data_a = (base_t*) numa_alloc_onnode(workload_b, remote_node); |
||||
|
base_t* data_b = (base_t*) numa_alloc_onnode(workload_b, remote_node_2); |
||||
|
base_t* data_a_hbm = (base_t*) numa_alloc_onnode(workload_b, local_node); |
||||
|
base_t* data_b_hbm = (base_t*) numa_alloc_onnode(workload_b, local_node); |
||||
|
fill_mt<base_t>(data_a, workload_b, 0, 100, 42); |
||||
|
fill_mt<base_t>(data_b, workload_b, 0, 100, 420); |
||||
|
std::memcpy(data_a_hbm, data_a, workload_b); |
||||
|
std::memcpy(data_b_hbm, data_b, workload_b); |
||||
|
base_t* results = (base_t*) numa_alloc_onnode(THREAD_GROUP_MULTIPLIER * thread_count * sizeof(base_t), remote_node); |
||||
|
|
||||
|
std::ofstream check_file; |
||||
|
check_file.open("../results/dimes_" |
||||
|
"q-" + (std::string)(simple_query == true ? "simple" : "complex") + |
||||
|
"_bm-" + (std::string) BARRIER_MODE + |
||||
|
"_bl-" + (std::string)(BUFFER_LIMIT == 1 ? "limited" : "unlimited") + |
||||
|
"_tc-" + std::to_string(thread_count * THREAD_GROUP_MULTIPLIER) + ".checksum"); |
||||
|
if constexpr (QUERY == 1) { |
||||
|
//calculate simple checksum if QUERY == 1 -> simple query is applied
|
||||
|
check_file << sum_check(compare_value_a, data_a, data_b, workload_b); |
||||
|
} else { |
||||
|
check_file << sum_check_complex(compare_value_a, compare_value_b, data_a, data_b, workload_b); |
||||
|
} |
||||
|
check_file.close(); |
||||
|
|
||||
|
std::string iteration("init"); |
||||
|
Query_Wrapper<base_t, simple_query>* qw = nullptr; |
||||
|
while(iteration != "false") { |
||||
|
|
||||
|
std::promise<void> p; |
||||
|
std::shared_future<void> ready_future(p.get_future()); |
||||
|
|
||||
|
if(iteration != "run") { |
||||
|
|
||||
|
if(qw != nullptr) { |
||||
|
delete qw; |
||||
|
} |
||||
|
|
||||
|
std::cout << "Changing to mode " << mode.current << " chunksize " << chunk_size.current << std::endl; |
||||
|
|
||||
|
uint8_t tc_filter = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, SCAN_A); |
||||
|
uint8_t tc_copy = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, SCAN_B); |
||||
|
uint8_t tc_agg = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, AGGR_J); |
||||
|
switch(mode.current) { |
||||
|
case NewPMode::DRAM_base: |
||||
|
qw = new Query_Wrapper<base_t, simple_query>(&ready_future, workload_b, chunk_size.current, data_a, data_b, results, local_node, remote_node, |
||||
|
tc_filter, tc_copy, tc_agg, mode.current, THREAD_GROUP_MULTIPLIER, (base_t) 50, (base_t) 42, true); |
||||
|
break; |
||||
|
case NewPMode::HBM_base: |
||||
|
qw = new Query_Wrapper<base_t, simple_query>(&ready_future, workload_b, chunk_size.current, data_a_hbm, data_b_hbm, results, local_node, remote_node, |
||||
|
tc_filter, tc_copy, tc_agg, mode.current, THREAD_GROUP_MULTIPLIER, (base_t) 50, (base_t) 42, true); |
||||
|
break; |
||||
|
case NewPMode::Mixed_base: |
||||
|
qw = new Query_Wrapper<base_t, simple_query>(&ready_future, workload_b, chunk_size.current, data_a, data_b_hbm, results, local_node, remote_node, |
||||
|
tc_filter, tc_copy, tc_agg, mode.current, THREAD_GROUP_MULTIPLIER, (base_t) 50, (base_t) 42, true); |
||||
|
break; |
||||
|
case NewPMode::Prefetch: |
||||
|
qw = new Query_Wrapper<base_t, simple_query>(&ready_future, workload_b, chunk_size.current, data_a, data_b, results, local_node, remote_node, |
||||
|
tc_filter, tc_copy, tc_agg, mode.current, THREAD_GROUP_MULTIPLIER, (base_t) 50, (base_t) 42, false); |
||||
|
break; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
qw->ready_future = &ready_future; |
||||
|
qw->clear_buffers(); |
||||
|
|
||||
|
auto filter_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_a(gid, gcnt, tid); }; |
||||
|
auto copy_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_b(gid, gcnt, tid); }; |
||||
|
auto aggregation_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->aggr_j(gid, gcnt, tid); }; |
||||
|
|
||||
|
std::vector<std::thread> filter_pool; |
||||
|
std::vector<std::thread> copy_pool; |
||||
|
std::vector<std::thread> agg_pool; |
||||
|
|
||||
|
uint8_t tc_filter = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, SCAN_A); |
||||
|
uint8_t tc_copy = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, SCAN_B); |
||||
|
uint8_t tc_agg = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, AGGR_J); |
||||
|
|
||||
|
int thread_id = 0; |
||||
|
// std::vector<std::pair<int, int>> pinning_ranges {std::make_pair(28, 42), std::make_pair(84, 98)}; // node 2 heacboehm II
|
||||
|
//std::vector<std::pair<int, int>> pinning_ranges {std::make_pair(32, 48), std::make_pair(96, 112)}; // node 2 heacboehm
|
||||
|
//std::vector<std::pair<int, int>> pinning_ranges {std::make_pair(24, 36), std::make_pair(120, 132)}; // node 2 sapphire rapids
|
||||
|
//std::vector<std::pair<int, int>> pinning_ranges {std::make_pair(24, 48)}; // node 2+3 sapphire rapids
|
||||
|
std::vector<std::pair<int, int>> pinning_ranges {std::make_pair(0, 48)}; // node 0-3 sapphire rapids
|
||||
|
|
||||
|
for(uint32_t gid = 0; gid < THREAD_GROUP_MULTIPLIER; ++gid) { |
||||
|
|
||||
|
for(uint32_t tid = 0; tid < tc_filter; ++tid) { |
||||
|
filter_pool.emplace_back(filter_lambda, gid, THREAD_GROUP_MULTIPLIER, tid); |
||||
|
pin_thread_in_range(filter_pool.back(), thread_id++, pinning_ranges); |
||||
|
} |
||||
|
|
||||
|
// if tc_copy == 0 this loop is skipped
|
||||
|
for(uint32_t tid = 0; tid < tc_copy; ++tid) { |
||||
|
copy_pool.emplace_back(copy_lambda, gid, THREAD_GROUP_MULTIPLIER, tid); |
||||
|
pin_thread_in_range(copy_pool.back(), thread_id++, pinning_ranges); |
||||
|
} |
||||
|
|
||||
|
for(uint32_t tid = 0; tid < tc_agg; ++tid) { |
||||
|
agg_pool.emplace_back(aggregation_lambda, gid, THREAD_GROUP_MULTIPLIER, tid); |
||||
|
pin_thread_in_range(agg_pool.back(), thread_id++, pinning_ranges); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
auto start = std::chrono::steady_clock::now(); |
||||
|
p.set_value(); |
||||
|
|
||||
|
for(std::thread& t : filter_pool) { t.join(); } |
||||
|
for(std::thread& t : copy_pool) { t.join(); } |
||||
|
for(std::thread& t : agg_pool) { t.join(); } |
||||
|
|
||||
|
Aggregation<base_t, Sum, load_mode::Aligned>::apply(results, results, sizeof(base_t) * tc_agg * THREAD_GROUP_MULTIPLIER); |
||||
|
auto end = std::chrono::steady_clock::now(); |
||||
|
|
||||
|
constexpr double nanos_per_second = ((double)1000) * 1000 * 1000; |
||||
|
uint64_t nanos = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count(); |
||||
|
double seconds = (double)(nanos) / nanos_per_second; |
||||
|
|
||||
|
|
||||
|
print_to_file(out_file, run, chunk_size, new_mode_manager::string(mode.current), THREAD_GROUP_MULTIPLIER, seconds, |
||||
|
#ifdef THREAD_TIMINGS
|
||||
|
qw->trt->summarize_time(0), qw->trt->summarize_time(1), qw->trt->summarize_time(2), |
||||
|
#endif
|
||||
|
#ifdef BARRIER_TIMINGS
|
||||
|
qw->bt->summarize_time(0), qw->bt->summarize_time(1), qw->bt->summarize_time(2), |
||||
|
#endif
|
||||
|
results[0]); |
||||
|
|
||||
|
|
||||
|
iteration = IterateOnce(run, chunk_size, mode); |
||||
|
} |
||||
|
|
||||
|
numa_free(data_b_hbm, workload_b); |
||||
|
numa_free(data_a, workload_b); |
||||
|
numa_free(data_b, workload_b); |
||||
|
|
||||
|
numa_free(results, THREAD_GROUP_MULTIPLIER * thread_count * sizeof(base_t)); |
||||
|
|
||||
|
} |
@ -0,0 +1,260 @@ |
|||||
|
#include <atomic>
|
||||
|
#include <barrier>
|
||||
|
#include <chrono>
|
||||
|
#include <condition_variable>
|
||||
|
#include <cstdlib>
|
||||
|
#include <cstring>
|
||||
|
#include <fstream>
|
||||
|
#include <future>
|
||||
|
#include <iostream>
|
||||
|
#include <limits>
|
||||
|
#include <list>
|
||||
|
#include <mutex>
|
||||
|
#include <queue>
|
||||
|
#include <thread>
|
||||
|
#include <tuple>
|
||||
|
#include <utility>
|
||||
|
|
||||
|
#include <numa.h>
|
||||
|
|
||||
|
#ifndef QUERY
|
||||
|
#define QUERY 1
|
||||
|
#endif
|
||||
|
|
||||
|
#ifndef BARRIER_MODE
|
||||
|
#define BARRIER_MODE "global"
|
||||
|
#endif
|
||||
|
|
||||
|
#define BUFFER_LIMIT 0
|
||||
|
|
||||
|
#include "const.h"
|
||||
|
|
||||
|
#include "file_output.h"
|
||||
|
#include "array_utils.h"
|
||||
|
#include "timer_utils.h"
|
||||
|
#include "barrier_utils.h"
|
||||
|
#include "cpu_set_utils.h"
|
||||
|
#include "iterable_range.h"
|
||||
|
#include "memory_literals.h"
|
||||
|
#include "pipelines/DIMES_scan_filter_pipe.h"
|
||||
|
|
||||
|
#include "aggregation.h"
|
||||
|
#include "filter.h"
|
||||
|
|
||||
|
using base_t = uint64_t; |
||||
|
|
||||
|
base_t sum_check(base_t compare_value, base_t* row_A, base_t* row_B, size_t row_size) { |
||||
|
base_t sum = 0; |
||||
|
for(int i = 0; i < row_size / sizeof(base_t); ++i) { |
||||
|
sum += (row_A[i] < compare_value) * row_B[i]; |
||||
|
} |
||||
|
return sum; |
||||
|
} |
||||
|
|
||||
|
base_t sum_check_complex(base_t compare_value_a, base_t compare_value_b, base_t* row_A, base_t* row_B, size_t row_size) { |
||||
|
base_t sum = 0; |
||||
|
for(int i = 0; i < row_size / sizeof(base_t); ++i) { |
||||
|
sum += (row_A[i] < compare_value_a && row_B[i] < compare_value_b) * row_B[i]; |
||||
|
} |
||||
|
return sum; |
||||
|
} |
||||
|
|
||||
|
|
||||
|
int main(int argc, char** argv) { |
||||
|
// set constants
|
||||
|
const size_t workload_b = 4_GiB; |
||||
|
const size_t chunk_size = 2_MiB; |
||||
|
const base_t compare_value_a = 50; |
||||
|
const base_t compare_value_b = 42; |
||||
|
constexpr bool simple_query = (QUERY == 1); |
||||
|
|
||||
|
|
||||
|
std::ofstream out_file; |
||||
|
out_file.open("../results/dimes_cores_" |
||||
|
"q-" + (std::string)(simple_query == true ? "simple" : "complex") + |
||||
|
"_bm-" + (std::string) BARRIER_MODE + |
||||
|
"_bl-" + (std::string)(BUFFER_LIMIT == 1 ? "limited" : "unlimited") + |
||||
|
".csv"); |
||||
|
|
||||
|
// set benchmark parameter
|
||||
|
Linear_Int_Range<uint32_t, 0, 3, 1> run("run"); |
||||
|
|
||||
|
Exp_Int_Range<uint32_t, 1, 4+1, 2> scan_a_thread("scan_a_tc"); |
||||
|
Exp_Int_Range<uint32_t, 1, 4+1, 2> scan_b_thread("scan_b_tc"); |
||||
|
Exp_Int_Range<uint32_t, 1, 4+1, 2> aggr_j_thread("aggr_j_tc"); |
||||
|
Linear_Int_Range<uint32_t, 1, 16+1, 1> thread_group_count("thread_group_c"); |
||||
|
Range<NewPMode, DRAM_base, new_mode_manager, new_mode_manager> mode("mode"); |
||||
|
|
||||
|
uint32_t remote_node = 1; |
||||
|
uint32_t remote_node_2 = 0;//on heacboehm II: node 0 is two hops away from node 2 -> prefetching is more beneficial
|
||||
|
uint32_t local_node = 2; |
||||
|
|
||||
|
print_to_file(out_file, generateHead(run, thread_group_count, mode, scan_a_thread, scan_b_thread, aggr_j_thread), |
||||
|
"time", |
||||
|
#ifdef THREAD_TIMINGS
|
||||
|
"scan_a", "scan_b", "aggr_j", |
||||
|
#endif
|
||||
|
#ifdef BARRIER_TIMINGS
|
||||
|
"wait_scan_a", "wait_scan_b", "wait_aggr_j", |
||||
|
#endif
|
||||
|
"result"); |
||||
|
|
||||
|
|
||||
|
/*** alloc data and buffers ************************************************/ |
||||
|
base_t* data_a = (base_t*) numa_alloc_onnode(workload_b, remote_node); |
||||
|
base_t* data_b = (base_t*) numa_alloc_onnode(workload_b, remote_node_2); |
||||
|
base_t* data_a_hbm = (base_t*) numa_alloc_onnode(workload_b, local_node); |
||||
|
base_t* data_b_hbm = (base_t*) numa_alloc_onnode(workload_b, local_node); |
||||
|
fill_mt<base_t>(data_a, workload_b, 0, 100, 42); |
||||
|
fill_mt<base_t>(data_b, workload_b, 0, 100, 420); |
||||
|
std::memcpy(data_a_hbm, data_a, workload_b); |
||||
|
std::memcpy(data_b_hbm, data_b, workload_b); |
||||
|
base_t* results = (base_t*) numa_alloc_onnode(thread_group_count.max * aggr_j_thread.max * sizeof(base_t), remote_node); |
||||
|
|
||||
|
std::ofstream check_file; |
||||
|
check_file.open("../results/dimes_cores_" |
||||
|
"q-" + (std::string)(simple_query == true ? "simple" : "complex") + |
||||
|
"_bm-" + (std::string) BARRIER_MODE + |
||||
|
"_bl-" + (std::string)(BUFFER_LIMIT == 1 ? "limited" : "unlimited") + |
||||
|
".checksum"); |
||||
|
if constexpr (QUERY == 1) { |
||||
|
//calculate simple checksum if QUERY == 1 -> simple query is applied
|
||||
|
check_file << sum_check(compare_value_a, data_a, data_b, workload_b); |
||||
|
} else { |
||||
|
check_file << sum_check_complex(compare_value_a, compare_value_b, data_a, data_b, workload_b); |
||||
|
} |
||||
|
check_file.close(); |
||||
|
|
||||
|
std::string iteration("init"); |
||||
|
Query_Wrapper<base_t, simple_query>* qw = nullptr; |
||||
|
while(iteration != "false") { |
||||
|
|
||||
|
std::promise<void> p; |
||||
|
std::shared_future<void> ready_future(p.get_future()); |
||||
|
|
||||
|
// skipping iteration through scan_b_thread while not used
|
||||
|
while(simple_query && mode.current != NewPMode::Prefetch && scan_b_thread.current != 1) { |
||||
|
iteration = IterateOnce(run, thread_group_count, mode, scan_a_thread, scan_b_thread, aggr_j_thread); |
||||
|
} |
||||
|
|
||||
|
if(iteration != "run") { |
||||
|
std::cout << "Changing to mode " << mode.current |
||||
|
<< " thread_group_count " << thread_group_count.current |
||||
|
<< " thread_ratio " << scan_a_thread.current <<":"<< scan_b_thread.current <<":"<< aggr_j_thread.current |
||||
|
<< std::endl; |
||||
|
|
||||
|
if(qw != nullptr) { |
||||
|
if (iteration == thread_group_count.label) { |
||||
|
|
||||
|
} else { |
||||
|
delete qw; |
||||
|
|
||||
|
uint32_t sat = scan_a_thread.current; |
||||
|
uint32_t sbt = simple_query && mode.current != NewPMode::Prefetch ? 0 : scan_b_thread.current; |
||||
|
uint32_t ajt = aggr_j_thread.current; |
||||
|
|
||||
|
switch(mode.current) { |
||||
|
case NewPMode::DRAM_base: |
||||
|
qw = new Query_Wrapper<base_t, simple_query>(&ready_future, workload_b, chunk_size, data_a, data_b, results, local_node, remote_node, |
||||
|
sat, sbt, ajt, mode.current, thread_group_count.current, (base_t) 50, (base_t) 42, true); |
||||
|
break; |
||||
|
case NewPMode::HBM_base: |
||||
|
qw = new Query_Wrapper<base_t, simple_query>(&ready_future, workload_b, chunk_size, data_a_hbm, data_b_hbm, results, local_node, remote_node, |
||||
|
sat, sbt, ajt, mode.current, thread_group_count.current, (base_t) 50, (base_t) 42, true); |
||||
|
break; |
||||
|
case NewPMode::Mixed_base: |
||||
|
qw = new Query_Wrapper<base_t, simple_query>(&ready_future, workload_b, chunk_size, data_a, data_b_hbm, results, local_node, remote_node, |
||||
|
sat, sbt, ajt, mode.current, thread_group_count.current, (base_t) 50, (base_t) 42, true); |
||||
|
break; |
||||
|
case NewPMode::Prefetch: |
||||
|
qw = new Query_Wrapper<base_t, simple_query>(&ready_future, workload_b, chunk_size, data_a, data_b, results, local_node, remote_node, |
||||
|
sat, sbt, ajt, mode.current, thread_group_count.current, (base_t) 50, (base_t) 42, false); |
||||
|
break; |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
qw->ready_future = &ready_future; |
||||
|
qw->clear_buffers(); |
||||
|
|
||||
|
auto filter_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_a(gid, gcnt, tid); }; |
||||
|
auto copy_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_b(gid, gcnt, tid); }; |
||||
|
auto aggregation_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->aggr_j(gid, gcnt, tid); }; |
||||
|
|
||||
|
std::vector<std::thread> filter_pool; |
||||
|
std::vector<std::thread> copy_pool; |
||||
|
std::vector<std::thread> agg_pool; |
||||
|
|
||||
|
uint8_t tc_filter = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, SCAN_A); |
||||
|
uint8_t tc_copy = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, SCAN_B); |
||||
|
uint8_t tc_agg = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, AGGR_J); |
||||
|
|
||||
|
int thread_id = 0; |
||||
|
// std::vector<std::pair<int, int>> pinning_ranges {std::make_pair(28, 42), std::make_pair(84, 98)}; // node 2 heacboehm II
|
||||
|
std::vector<std::pair<int, int>> pinning_ranges {std::make_pair(32, 48), std::make_pair(96, 112)}; // node 2 heacboehm
|
||||
|
|
||||
|
for(uint32_t gid = 0; gid < thread_group_count.current; ++gid) { |
||||
|
|
||||
|
for(uint32_t tid = 0; tid < tc_filter; ++tid) { |
||||
|
filter_pool.emplace_back(filter_lambda, gid, thread_group_count.current, tid); |
||||
|
pin_thread_in_range(filter_pool.back(), thread_id++, pinning_ranges); |
||||
|
} |
||||
|
|
||||
|
// if tc_copy == 0 this loop is skipped
|
||||
|
for(uint32_t tid = 0; tid < tc_copy; ++tid) { |
||||
|
copy_pool.emplace_back(copy_lambda, gid, thread_group_count.current, tid); |
||||
|
pin_thread_in_range(copy_pool.back(), thread_id++, pinning_ranges); |
||||
|
} |
||||
|
|
||||
|
for(uint32_t tid = 0; tid < tc_agg; ++tid) { |
||||
|
agg_pool.emplace_back(aggregation_lambda, gid, thread_group_count.current, tid); |
||||
|
pin_thread_in_range(agg_pool.back(), thread_id++, pinning_ranges); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
auto start = std::chrono::steady_clock::now(); |
||||
|
p.set_value(); |
||||
|
|
||||
|
for(std::thread& t : filter_pool) { t.join(); } |
||||
|
for(std::thread& t : copy_pool) { t.join(); } |
||||
|
for(std::thread& t : agg_pool) { t.join(); } |
||||
|
|
||||
|
Aggregation<base_t, Sum, load_mode::Aligned>::apply(results, results, sizeof(base_t) * tc_agg * thread_group_count.current); |
||||
|
auto end = std::chrono::steady_clock::now(); |
||||
|
|
||||
|
constexpr double nanos_per_second = ((double)1000) * 1000 * 1000; |
||||
|
uint64_t nanos = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count(); |
||||
|
double seconds = (double)(nanos) / nanos_per_second; |
||||
|
|
||||
|
print_to_file(out_file, generateHead(run, thread_group_count, mode, scan_a_thread, scan_b_thread, aggr_j_thread), |
||||
|
"time", |
||||
|
#ifdef THREAD_TIMINGS
|
||||
|
"scan_a", "scan_b", "aggr_j", |
||||
|
#endif
|
||||
|
#ifdef BARRIER_TIMINGS
|
||||
|
"wait_scan_a", "wait_scan_b", "wait_aggr_j", |
||||
|
#endif
|
||||
|
"result"); |
||||
|
|
||||
|
print_to_file(out_file, run, thread_group_count.current, new_mode_manager::string(mode.current), scan_a_thread, |
||||
|
(simple_query && mode.current != NewPMode::Prefetch ? 0 : scan_b_thread.current), |
||||
|
aggr_j_thread, seconds, |
||||
|
#ifdef THREAD_TIMINGS
|
||||
|
qw->trt->summarize_time(0), qw->trt->summarize_time(1), qw->trt->summarize_time(2), |
||||
|
#endif
|
||||
|
#ifdef BARRIER_TIMINGS
|
||||
|
qw->bt->summarize_time(0), qw->bt->summarize_time(1), qw->bt->summarize_time(2), |
||||
|
#endif
|
||||
|
results[0]); |
||||
|
|
||||
|
iteration = IterateOnce(run, thread_group_count, mode, scan_a_thread, scan_b_thread, aggr_j_thread); |
||||
|
} |
||||
|
|
||||
|
numa_free(data_b_hbm, workload_b); |
||||
|
numa_free(data_a, workload_b); |
||||
|
numa_free(data_b, workload_b); |
||||
|
|
||||
|
numa_free(results, thread_group_count.max * aggr_j_thread.max * sizeof(base_t)); |
||||
|
|
||||
|
} |
@ -0,0 +1,289 @@ |
|||||
|
#include <atomic>
|
||||
|
#include <barrier>
|
||||
|
#include <chrono>
|
||||
|
#include <condition_variable>
|
||||
|
#include <cstdlib>
|
||||
|
#include <cstring>
|
||||
|
#include <fstream>
|
||||
|
#include <future>
|
||||
|
#include <iostream>
|
||||
|
#include <limits>
|
||||
|
#include <list>
|
||||
|
#include <mutex>
|
||||
|
#include <queue>
|
||||
|
#include <thread>
|
||||
|
#include <tuple>
|
||||
|
#include <utility>
|
||||
|
|
||||
|
#include <numa.h>
|
||||
|
|
||||
|
#ifndef THREAD_GROUP_MULTIPLIER
|
||||
|
#define THREAD_GROUP_MULTIPLIER 2
|
||||
|
#endif
|
||||
|
|
||||
|
#ifndef QUERY
|
||||
|
#define QUERY 1
|
||||
|
#endif
|
||||
|
|
||||
|
#ifndef BARRIER_MODE
|
||||
|
#define BARRIER_MODE "global"
|
||||
|
#endif
|
||||
|
|
||||
|
#ifndef BUFFER_LIMIT
|
||||
|
#define BUFFER_LIMIT 1
|
||||
|
#endif
|
||||
|
|
||||
|
#ifndef PINNING
|
||||
|
#define PINNING 1
|
||||
|
#endif
|
||||
|
|
||||
|
#ifndef PCM_M
|
||||
|
#define PCM_M 0
|
||||
|
#endif
|
||||
|
|
||||
|
#if PCM_M == 1
|
||||
|
#include "pcm.h"
|
||||
|
#endif
|
||||
|
|
||||
|
#include "const.h"
|
||||
|
|
||||
|
#include "file_output.h"
|
||||
|
#include "array_utils.h"
|
||||
|
#include "timer_utils.h"
|
||||
|
#include "barrier_utils.h"
|
||||
|
#include "measurement_utils.h"
|
||||
|
#include "cpu_set_utils.h"
|
||||
|
#include "iterable_range.h"
|
||||
|
#include "memory_literals.h"
|
||||
|
#include "pipelines/MAX_scan_filter_pipe.h"
|
||||
|
|
||||
|
#include "aggregation.h"
|
||||
|
#include "filter.h"
|
||||
|
|
||||
|
using base_t = uint64_t; |
||||
|
|
||||
|
base_t sum_check(base_t compare_value, base_t* row_A, base_t* row_B, size_t row_size) { |
||||
|
base_t sum = 0; |
||||
|
for(int i = 0; i < row_size / sizeof(base_t); ++i) { |
||||
|
sum += (row_A[i] < compare_value) * row_B[i]; |
||||
|
} |
||||
|
return sum; |
||||
|
} |
||||
|
|
||||
|
base_t sum_check_complex(base_t compare_value_a, base_t compare_value_b, base_t* row_A, base_t* row_B, size_t row_size) { |
||||
|
base_t sum = 0; |
||||
|
for(int i = 0; i < row_size / sizeof(base_t); ++i) { |
||||
|
sum += (row_A[i] < compare_value_a && row_B[i] < compare_value_b) * row_B[i]; |
||||
|
} |
||||
|
return sum; |
||||
|
} |
||||
|
|
||||
|
int main(int argc, char** argv) { |
||||
|
#if PCM == 1
|
||||
|
pcm::PCM *pcm = pcm::PCM::getInstance(); |
||||
|
//and check for errors
|
||||
|
auto error_code = pcm->program(); |
||||
|
if(error_code != pcm::PCM::Success) { |
||||
|
std::cerr << "PCM couldn't start" << std::endl; |
||||
|
std::cerr << "Error code: " << error_code << std::endl; |
||||
|
std::cerr << "Try to execute 'sudo modprobe msr' and execute this program with root privigeges."; |
||||
|
return 1; |
||||
|
} |
||||
|
#endif
|
||||
|
|
||||
|
// set constants
|
||||
|
const size_t workload_b = 2_GiB; |
||||
|
const base_t compare_value_a = 50; |
||||
|
const base_t compare_value_b = 42; |
||||
|
constexpr bool simple_query = (QUERY == 1); |
||||
|
|
||||
|
const size_t thread_count = 6; |
||||
|
std::ofstream out_file; |
||||
|
out_file.open("../results/max_" |
||||
|
"q-" + (std::string)(simple_query == true ? "simple" : "complex") + |
||||
|
"_bm-" + (std::string) BARRIER_MODE + |
||||
|
"_bl-" + (std::string)(BUFFER_LIMIT == 1 ? "limited" : "unlimited") + |
||||
|
"_tc-" + std::to_string(thread_count * THREAD_GROUP_MULTIPLIER) + "1MiB-2MiB.csv"); |
||||
|
|
||||
|
// set benchmark parameter
|
||||
|
Linear_Int_Range<uint32_t, 0, 30, 1> run("run"); |
||||
|
constexpr size_t chunk_min = 1_MiB; constexpr size_t chunk_max = 8_MiB + 1; constexpr size_t chunk_incr = 128_kiB; |
||||
|
Linear_Int_Range<size_t, chunk_min, chunk_max, chunk_incr> chunk_size("chunk_size"); |
||||
|
Range<NewPMode, DRAM_base, new_mode_manager, new_mode_manager> mode("mode"); |
||||
|
|
||||
|
uint32_t remote_node = 2; |
||||
|
uint32_t remote_node_2 = 2; |
||||
|
uint32_t local_node = 10; |
||||
|
|
||||
|
/*uint32_t remote_node = 6;
|
||||
|
uint32_t remote_node_2 = 6; |
||||
|
uint32_t local_node = 2;*/ |
||||
|
|
||||
|
print_to_file(out_file, generateHead(run, chunk_size, mode), "thread_group", "time", |
||||
|
#ifdef THREAD_TIMINGS
|
||||
|
"scan_a", "scan_b", "aggr_j", |
||||
|
#endif
|
||||
|
#ifdef BARRIER_TIMINGS
|
||||
|
"wait_scan_a", "wait_scan_b", "wait_aggr_j", |
||||
|
#endif
|
||||
|
#if PCM == 1
|
||||
|
pcm_value_collector::getHead("scan_a"), |
||||
|
pcm_value_collector::getHead("scan_b"), |
||||
|
pcm_value_collector::getHead("aggr_j"), |
||||
|
#endif
|
||||
|
"result"); |
||||
|
|
||||
|
|
||||
|
/*** alloc data and buffers ************************************************/ |
||||
|
base_t* data_a = (base_t*) numa_alloc_onnode(workload_b, remote_node); |
||||
|
base_t* data_b = (base_t*) numa_alloc_onnode(workload_b, remote_node_2); |
||||
|
base_t* data_a_hbm = (base_t*) numa_alloc_onnode(workload_b, local_node); |
||||
|
base_t* data_b_hbm = (base_t*) numa_alloc_onnode(workload_b, local_node); |
||||
|
fill_mt<base_t>(data_a, workload_b, 0, 100, 42); |
||||
|
fill_mt<base_t>(data_b, workload_b, 0, 100, 420); |
||||
|
std::memcpy(data_a_hbm, data_a, workload_b); |
||||
|
std::memcpy(data_b_hbm, data_b, workload_b); |
||||
|
base_t* results = (base_t*) numa_alloc_onnode(THREAD_GROUP_MULTIPLIER * thread_count * sizeof(base_t), remote_node); |
||||
|
|
||||
|
std::ofstream check_file; |
||||
|
check_file.open("../results/max_" |
||||
|
"q-" + (std::string)(simple_query == true ? "simple" : "complex") + |
||||
|
"_bm-" + (std::string) BARRIER_MODE + |
||||
|
"_bl-" + (std::string)(BUFFER_LIMIT == 1 ? "limited" : "unlimited") + |
||||
|
"_tc-" + std::to_string(thread_count * THREAD_GROUP_MULTIPLIER) + ".checksum"); |
||||
|
if constexpr (QUERY == 1) { |
||||
|
//calculate simple checksum if QUERY == 1 -> simple query is applied
|
||||
|
check_file << sum_check(compare_value_a, data_a, data_b, workload_b); |
||||
|
} else { |
||||
|
check_file << sum_check_complex(compare_value_a, compare_value_b, data_a, data_b, workload_b); |
||||
|
} |
||||
|
check_file.close(); |
||||
|
|
||||
|
std::string iteration("init"); |
||||
|
Query_Wrapper<base_t, simple_query>* qw = nullptr; |
||||
|
while(iteration != "false") { |
||||
|
|
||||
|
std::promise<void> p; |
||||
|
std::shared_future<void> ready_future(p.get_future()); |
||||
|
|
||||
|
if(iteration != "run") { |
||||
|
|
||||
|
if(qw != nullptr) { |
||||
|
delete qw; |
||||
|
} |
||||
|
uint8_t tc_filter = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, SCAN_A); |
||||
|
uint8_t tc_copy = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, SCAN_B); |
||||
|
uint8_t tc_agg = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, AGGR_J); |
||||
|
switch(mode.current) { |
||||
|
case NewPMode::DRAM_base: |
||||
|
qw = new Query_Wrapper<base_t, simple_query>(&ready_future, workload_b, chunk_size.current, data_a, data_b, results, local_node, remote_node, |
||||
|
tc_filter, tc_copy, tc_agg, mode.current, THREAD_GROUP_MULTIPLIER, (base_t) 50, (base_t) 42, true); |
||||
|
break; |
||||
|
case NewPMode::HBM_base: |
||||
|
qw = new Query_Wrapper<base_t, simple_query>(&ready_future, workload_b, chunk_size.current, data_a_hbm, data_b_hbm, results, local_node, remote_node, |
||||
|
tc_filter, tc_copy, tc_agg, mode.current, THREAD_GROUP_MULTIPLIER, (base_t) 50, (base_t) 42, true); |
||||
|
break; |
||||
|
case NewPMode::Mixed_base: |
||||
|
qw = new Query_Wrapper<base_t, simple_query>(&ready_future, workload_b, chunk_size.current, data_a, data_b_hbm, results, local_node, remote_node, |
||||
|
tc_filter, tc_copy, tc_agg, mode.current, THREAD_GROUP_MULTIPLIER, (base_t) 50, (base_t) 42, true); |
||||
|
break; |
||||
|
case NewPMode::Prefetch: |
||||
|
qw = new Query_Wrapper<base_t, simple_query>(&ready_future, workload_b, chunk_size.current, data_a, data_b, results, local_node, remote_node, |
||||
|
tc_filter, tc_copy, tc_agg, mode.current, THREAD_GROUP_MULTIPLIER, (base_t) 50, (base_t) 42, false); |
||||
|
break; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
qw->ready_future = &ready_future; |
||||
|
qw->clear_buffers(); |
||||
|
|
||||
|
auto filter_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_a(gid, gcnt, tid); }; |
||||
|
auto copy_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_b(gid, gcnt, tid); }; |
||||
|
auto aggregation_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->aggr_j(gid, gcnt, tid); }; |
||||
|
|
||||
|
std::vector<std::thread> filter_pool; |
||||
|
std::vector<std::thread> copy_pool; |
||||
|
std::vector<std::thread> agg_pool; |
||||
|
|
||||
|
uint8_t tc_filter = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, SCAN_A); |
||||
|
uint8_t tc_copy = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, SCAN_B); |
||||
|
uint8_t tc_agg = new_mode_manager::thread_count(simple_query ? SIMPLE_Q : COMPLEX_Q, mode.current, AGGR_J); |
||||
|
|
||||
|
int thread_id = 0; |
||||
|
// std::vector<std::pair<int, int>> pinning_ranges {std::make_pair(28, 42), std::make_pair(84, 98)}; // node 2 heacboehm II
|
||||
|
//std::vector<std::pair<int, int>> pinning_ranges {std::make_pair(32, 48), std::make_pair(96, 112)}; // node 2 heacboehm
|
||||
|
std::vector<std::pair<int, int>> pinning_ranges {std::make_pair(24, 36), std::make_pair(120, 132)}; // node 2 sapphire rapids
|
||||
|
//std::vector<std::pair<int, int>> pinning_ranges {std::make_pair(24, 48)}; // node 2+3 sapphire rapids
|
||||
|
//std::vector<std::pair<int, int>> pinning_ranges {std::make_pair(0, 48)}; // node 0-3 sapphire rapids
|
||||
|
|
||||
|
for(uint32_t gid = 0; gid < THREAD_GROUP_MULTIPLIER; ++gid) { |
||||
|
|
||||
|
for(uint32_t tid = 0; tid < tc_filter; ++tid) { |
||||
|
filter_pool.emplace_back(filter_lambda, gid, THREAD_GROUP_MULTIPLIER, tid); |
||||
|
#if PINNING
|
||||
|
pin_thread_in_range(filter_pool.back(), thread_id++, pinning_ranges); |
||||
|
#else
|
||||
|
pin_thread_in_range(filter_pool.back(), pinning_ranges); |
||||
|
#endif
|
||||
|
} |
||||
|
|
||||
|
// if tc_copy == 0 this loop is skipped
|
||||
|
for(uint32_t tid = 0; tid < tc_copy; ++tid) { |
||||
|
copy_pool.emplace_back(copy_lambda, gid, THREAD_GROUP_MULTIPLIER, tid); |
||||
|
#if PINNING
|
||||
|
pin_thread_in_range(copy_pool.back(), thread_id++, pinning_ranges); |
||||
|
#else
|
||||
|
pin_thread_in_range(copy_pool.back(), pinning_ranges); |
||||
|
#endif
|
||||
|
} |
||||
|
|
||||
|
for(uint32_t tid = 0; tid < tc_agg; ++tid) { |
||||
|
agg_pool.emplace_back(aggregation_lambda, gid, THREAD_GROUP_MULTIPLIER, tid); |
||||
|
#if PINNING
|
||||
|
pin_thread_in_range(agg_pool.back(), thread_id++, pinning_ranges); |
||||
|
#else
|
||||
|
pin_thread_in_range(agg_pool.back(), pinning_ranges); |
||||
|
#endif
|
||||
|
} |
||||
|
} |
||||
|
|
||||
|
auto start = std::chrono::steady_clock::now(); |
||||
|
p.set_value(); |
||||
|
|
||||
|
for(std::thread& t : filter_pool) { t.join(); } |
||||
|
for(std::thread& t : copy_pool) { t.join(); } |
||||
|
for(std::thread& t : agg_pool) { t.join(); } |
||||
|
|
||||
|
Aggregation<base_t, Sum, load_mode::Aligned>::apply(results, results, sizeof(base_t) * tc_agg * THREAD_GROUP_MULTIPLIER); |
||||
|
auto end = std::chrono::steady_clock::now(); |
||||
|
|
||||
|
constexpr double nanos_per_second = ((double)1000) * 1000 * 1000; |
||||
|
uint64_t nanos = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count(); |
||||
|
double seconds = (double)(nanos) / nanos_per_second; |
||||
|
|
||||
|
|
||||
|
|
||||
|
print_to_file(out_file, run, chunk_size, new_mode_manager::string(mode.current), THREAD_GROUP_MULTIPLIER, seconds, |
||||
|
#ifdef THREAD_TIMINGS
|
||||
|
qw->trt->summarize_time(0), qw->trt->summarize_time(1), qw->trt->summarize_time(2), |
||||
|
#endif
|
||||
|
#ifdef BARRIER_TIMINGS
|
||||
|
qw->bt->summarize_time(0), qw->bt->summarize_time(1), qw->bt->summarize_time(2), |
||||
|
#endif
|
||||
|
#if PCM == 1
|
||||
|
qw->pvc->summarize_as_string("scan_a"), |
||||
|
qw->pvc->summarize_as_string("scan_b"), |
||||
|
qw->pvc->summarize_as_string("aggr_j"), |
||||
|
#endif
|
||||
|
results[0]); |
||||
|
|
||||
|
iteration = IterateOnce(run, chunk_size, mode); |
||||
|
} |
||||
|
|
||||
|
numa_free(data_b_hbm, workload_b); |
||||
|
numa_free(data_a, workload_b); |
||||
|
numa_free(data_b, workload_b); |
||||
|
|
||||
|
numa_free(results, THREAD_GROUP_MULTIPLIER * thread_count * sizeof(base_t)); |
||||
|
|
||||
|
} |
@ -0,0 +1,147 @@ |
|||||
|
#include <chrono> |
||||
|
#include <iostream> |
||||
|
#include <thread> |
||||
|
#include <future> |
||||
|
#include <numa.h> |
||||
|
|
||||
|
#include "const.h" |
||||
|
#include "array_utils.h" |
||||
|
#include "cpu_set_utils.h" |
||||
|
#include "iterable_range.h" |
||||
|
#include "memory_literals.h" |
||||
|
#include "pipelines/MAX_scan_filter_pipe.h" |
||||
|
#include "aggregation.h" |
||||
|
|
||||
|
using base_t = uint64_t; |
||||
|
|
||||
|
// calculate the checksum for the simple query |
||||
|
base_t sum_check(base_t compare_value, base_t* row_A, base_t* row_B, size_t row_size) { |
||||
|
base_t sum = 0; |
||||
|
for(int i = 0; i < row_size / sizeof(base_t); ++i) { |
||||
|
sum += (row_A[i] < compare_value) * row_B[i]; |
||||
|
} |
||||
|
return sum; |
||||
|
} |
||||
|
|
||||
|
// calculate the checksum for the complex query |
||||
|
base_t sum_check_complex(base_t compare_value_a, base_t compare_value_b, base_t* row_A, base_t* row_B, size_t row_size) { |
||||
|
base_t sum = 0; |
||||
|
for(int i = 0; i < row_size / sizeof(base_t); ++i) { |
||||
|
sum += (row_A[i] < compare_value_a && row_B[i] < compare_value_b) * row_B[i]; |
||||
|
} |
||||
|
return sum; |
||||
|
} |
||||
|
|
||||
|
class QDP_minimal { |
||||
|
private: |
||||
|
// values used for comparisons in the filter operations |
||||
|
const base_t compare_value_a = 50; |
||||
|
const base_t compare_value_b = 42; |
||||
|
// define, which numa nodes to use |
||||
|
// Xeon Max: node 0-7 DRAM and 8-15 HBM |
||||
|
// if the nodes are changed, the pinning ranges in run should be adjusted accordingly too |
||||
|
uint32_t dram_node = 2; |
||||
|
uint32_t dram_node_2 = 2; |
||||
|
uint32_t hbm_node = 10; |
||||
|
|
||||
|
public: |
||||
|
// results of running qdp, set by run() |
||||
|
base_t result; |
||||
|
base_t checksum; |
||||
|
double exec_time; |
||||
|
|
||||
|
// run qdp |
||||
|
void run(const size_t workload_b, size_t chunk_size, uint8_t tc_filter, uint8_t tc_copy, uint8_t tc_agg){ |
||||
|
// allocate data |
||||
|
base_t* data_a = (base_t*) numa_alloc_onnode(workload_b, dram_node); |
||||
|
base_t* data_b = (base_t*) numa_alloc_onnode(workload_b, dram_node_2); |
||||
|
base_t* results = (base_t*) numa_alloc_onnode(THREAD_GROUP_MULTIPLIER * tc_agg * sizeof(base_t), dram_node); |
||||
|
|
||||
|
// fill the memory with acutal values |
||||
|
fill_mt<base_t>(data_a, workload_b, 0, 100, 42); |
||||
|
fill_mt<base_t>(data_b, workload_b, 0, 100, 420); |
||||
|
|
||||
|
// run qdp |
||||
|
run(data_a, data_b, results, workload_b, chunk_size, tc_filter, tc_copy, tc_agg); |
||||
|
|
||||
|
// free the allocated memory |
||||
|
numa_free(data_a, workload_b); |
||||
|
numa_free(data_b, workload_b); |
||||
|
numa_free(results, THREAD_GROUP_MULTIPLIER * tc_agg * sizeof(base_t)); |
||||
|
} |
||||
|
|
||||
|
// run qdp, work on provided memory pointers to enable memory reuse across multiple runs |
||||
|
void run(base_t* data_a, base_t* data_b, base_t* results, const size_t workload_b, size_t chunk_size, uint8_t tc_filter, uint8_t tc_copy, uint8_t tc_agg){ |
||||
|
constexpr bool simple_query = (QUERY == 1); |
||||
|
// sync objects |
||||
|
std::promise<void> p; |
||||
|
std::shared_future<void> ready_future(p.get_future()); |
||||
|
|
||||
|
// create the query wrapper, that is managing the to-be-used threads |
||||
|
Query_Wrapper<base_t, simple_query>* qw = new Query_Wrapper<base_t, simple_query>(&ready_future, workload_b, chunk_size, data_a, data_b, results, hbm_node, dram_node, |
||||
|
tc_filter, tc_copy, tc_agg, NewPMode::Prefetch, THREAD_GROUP_MULTIPLIER, compare_value_a, compare_value_b, false); |
||||
|
|
||||
|
// clear buffers to make sure, that they have been written and are fully mapped before running qdp |
||||
|
qw->clear_buffers(); |
||||
|
|
||||
|
// creating lambdas for executing filter (scan_a), copy (scan_b), and aggregation tasks on the query wrapper |
||||
|
// passing gid (group id), gcnt (group count) and tid (thread id) |
||||
|
auto filter_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_a(gid, gcnt, tid); }; |
||||
|
auto copy_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_b(gid, gcnt, tid); }; |
||||
|
auto aggregation_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->aggr_j(gid, gcnt, tid); }; |
||||
|
|
||||
|
// creating thread pools, holding all used threads |
||||
|
std::vector<std::thread> filter_pool; |
||||
|
std::vector<std::thread> copy_pool; |
||||
|
std::vector<std::thread> agg_pool; |
||||
|
|
||||
|
int thread_id = 0; |
||||
|
// cpus on node 2 (for sapphire rapids), that the threads should be executed on |
||||
|
std::vector<std::pair<int, int>> pinning_ranges {std::make_pair(24, 36), std::make_pair(120, 132)}; |
||||
|
|
||||
|
// create all threads for all thread groups and for every task (copy, filter, aggregation), according their specific theadcount |
||||
|
for(uint32_t gid = 0; gid < THREAD_GROUP_MULTIPLIER; ++gid) { |
||||
|
for(uint32_t tid = 0; tid < tc_filter; ++tid) { |
||||
|
filter_pool.emplace_back(filter_lambda, gid, THREAD_GROUP_MULTIPLIER, tid); |
||||
|
pin_thread_in_range(filter_pool.back(), thread_id++, pinning_ranges); |
||||
|
} |
||||
|
for(uint32_t tid = 0; tid < tc_copy; ++tid) { |
||||
|
copy_pool.emplace_back(copy_lambda, gid, THREAD_GROUP_MULTIPLIER, tid); |
||||
|
pin_thread_in_range(copy_pool.back(), thread_id++, pinning_ranges); |
||||
|
} |
||||
|
for(uint32_t tid = 0; tid < tc_agg; ++tid) { |
||||
|
agg_pool.emplace_back(aggregation_lambda, gid, THREAD_GROUP_MULTIPLIER, tid); |
||||
|
pin_thread_in_range(agg_pool.back(), thread_id++, pinning_ranges); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// start the clock |
||||
|
auto start = std::chrono::steady_clock::now(); |
||||
|
// set value to the promise, to signal the waiting threads, that they can start now |
||||
|
p.set_value(); |
||||
|
|
||||
|
// wait for all thread to be finished |
||||
|
for(std::thread& t : filter_pool) { t.join(); } |
||||
|
for(std::thread& t : copy_pool) { t.join(); } |
||||
|
for(std::thread& t : agg_pool) { t.join(); } |
||||
|
|
||||
|
// sum up the results of all the aggregation threads to get a final result |
||||
|
Aggregation<base_t, Sum, load_mode::Aligned>::apply(&result, results, sizeof(base_t) * tc_agg * THREAD_GROUP_MULTIPLIER); |
||||
|
auto end = std::chrono::steady_clock::now(); |
||||
|
|
||||
|
// get the overall execution time in seconds |
||||
|
constexpr double nanos_per_second = ((double)1000) * 1000 * 1000; |
||||
|
uint64_t nanos = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count(); |
||||
|
exec_time = (double)(nanos) / nanos_per_second; |
||||
|
|
||||
|
// calculate the checksum according to the used query |
||||
|
if constexpr (QUERY == 1) { |
||||
|
// QUERY == 1 -> simple query is applied |
||||
|
checksum = sum_check(compare_value_a, data_a, data_b, workload_b); |
||||
|
} else { |
||||
|
checksum = sum_check_complex(compare_value_a, compare_value_b, data_a, data_b, workload_b); |
||||
|
} |
||||
|
|
||||
|
delete qw; |
||||
|
} |
||||
|
}; |
@ -0,0 +1,149 @@ |
|||||
|
|
||||
|
#include <cstring>
|
||||
|
#include <fstream>
|
||||
|
#include <future>
|
||||
|
#include <iostream>
|
||||
|
#include <string>
|
||||
|
#include <thread>
|
||||
|
#include <vector>
|
||||
|
|
||||
|
#include <numa.h>
|
||||
|
|
||||
|
#include "aggregation.h"
|
||||
|
#include "array_utils.h"
|
||||
|
#include "cpu_set_utils.h"
|
||||
|
#include "file_output.h"
|
||||
|
#include "iterable_range.h"
|
||||
|
#include "memory_literals.h"
|
||||
|
#include "pipelines/scan_filter_pipe.h"
|
||||
|
|
||||
|
int main () { |
||||
|
|
||||
|
using base_t = uint64_t; |
||||
|
|
||||
|
|
||||
|
const size_t workload = 2_GiB; |
||||
|
const char filename[256] = "../results/doubly_filtered_results_stronger_affinity_.csv"; |
||||
|
const uint32_t numa_local = 2; |
||||
|
const uint32_t numa_remote = 3; |
||||
|
|
||||
|
|
||||
|
Linear_Int_Range<uint32_t, 1, 6, 1> thread_group("thread_groups"); |
||||
|
Exp_Int_Range<uint32_t, 1, 5, 2> thread_count_filter("thread_cnt_filter"); |
||||
|
Exp_Int_Range<uint32_t, 1, 5, 2> thread_count_filter_copy("thread_cnt_filter_copy"); |
||||
|
Exp_Int_Range<uint32_t, 1, 5, 2> thread_count_aggregation("thread_cnt_agg"); |
||||
|
Linear_Int_Range<uint32_t, 0, 30, 1> run("run"); |
||||
|
Range<PMode, no_copy, mode_manager, mode_manager> mode("mode"); |
||||
|
Exp_Int_Range<size_t, 1_MiB, 8_MiB + 1, 2> chunk_size("chunk_size"); |
||||
|
|
||||
|
std::ofstream out_file; |
||||
|
out_file.open(filename); |
||||
|
print_to_file(out_file, generateHead(run, chunk_size, mode, thread_count_filter, thread_count_filter_copy, |
||||
|
thread_count_aggregation, thread_group), "time", "scan_a", "scan_b", "aggr_j", "wait_aggr", "results"); |
||||
|
|
||||
|
base_t* data_a = (base_t*) numa_alloc_onnode(workload, numa_remote); |
||||
|
base_t* data_b = (base_t*) numa_alloc_onnode(workload, numa_remote); |
||||
|
base_t* data_b_hbm = (base_t*) numa_alloc_onnode(workload, numa_local); |
||||
|
fill_mt<base_t>(data_a, workload, 0, 100, 42); |
||||
|
fill_mt<base_t>(data_b, workload, 0, 100, 420); |
||||
|
std::memcpy(data_b_hbm, data_b, workload); |
||||
|
base_t* result = (base_t*) numa_alloc_onnode(thread_group.max * thread_count_aggregation.max * sizeof(base_t), |
||||
|
numa_remote); |
||||
|
|
||||
|
std::string iteration("init"); |
||||
|
Query_Wrapper<base_t, false>* qw = nullptr; |
||||
|
|
||||
|
while(iteration != "false") { |
||||
|
|
||||
|
std::promise<void> p; |
||||
|
std::shared_future<void> ready_future(p.get_future()); |
||||
|
|
||||
|
if(iteration != "run") { |
||||
|
if(qw != nullptr) { |
||||
|
delete qw; |
||||
|
} |
||||
|
|
||||
|
switch(mode.current) { |
||||
|
case PMode::expl_copy: |
||||
|
qw = new Query_Wrapper<base_t, false>(&ready_future, workload, chunk_size.current, data_a, data_b, result, numa_local, numa_remote, |
||||
|
thread_count_filter.current, thread_count_filter_copy.current, thread_count_aggregation.current, |
||||
|
mode.current, thread_group.current, (base_t) 50, (base_t) 42, false); |
||||
|
break; |
||||
|
case PMode::no_copy: |
||||
|
qw = new Query_Wrapper<base_t, false>(&ready_future, workload, chunk_size.current, data_a, data_b, result, numa_local, numa_remote, |
||||
|
thread_count_filter.current, thread_count_filter_copy.current, thread_count_aggregation.current, |
||||
|
mode.current, thread_group.current, (base_t) 50, (base_t) 42, true); |
||||
|
break; |
||||
|
case PMode::hbm: |
||||
|
qw = new Query_Wrapper<base_t, false>(&ready_future, workload, chunk_size.current, data_a, data_b_hbm, result, numa_local, numa_remote, |
||||
|
thread_count_filter.current, thread_count_filter_copy.current, thread_count_aggregation.current, |
||||
|
mode.current, thread_group.current, (base_t) 50, (base_t) 42, true); |
||||
|
break; |
||||
|
} |
||||
|
} |
||||
|
qw->ready_future = &ready_future; |
||||
|
qw->clear_buffers(); |
||||
|
|
||||
|
|
||||
|
// todo create threads depending on mode
|
||||
|
std::vector<std::thread> thread_pool; |
||||
|
auto filter_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_a(gid, gcnt, tid); }; |
||||
|
auto filter_copy_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_b(gid, gcnt, tid); }; |
||||
|
auto aggregation_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->aggr_j(gid, gcnt, tid); }; |
||||
|
|
||||
|
|
||||
|
/* Intel Xeon Gold 6130 // todo implement different for 5120 -> fewer cpus
|
||||
|
node 0 cpus: 0-15 64- 79 |
||||
|
node 1 cpus: 16-31 80- 95 |
||||
|
node 2 cpus: 32-47 96-111 |
||||
|
node 3 cpus: 48-63 112-127 |
||||
|
*/ |
||||
|
int thread_id = 0; |
||||
|
std::vector<std::pair<int, int>> range {std::make_pair(0, 16), std::make_pair(64, 80)}; |
||||
|
for(uint32_t gid = 0; gid < thread_group.current; ++gid) { |
||||
|
|
||||
|
|
||||
|
for(uint32_t tid = 0; tid < thread_count_filter.current; ++tid) { |
||||
|
thread_pool.emplace_back(filter_lambda, gid, thread_group.current, tid); |
||||
|
pin_thread_in_range(thread_pool.back(), thread_id++, range); |
||||
|
} |
||||
|
|
||||
|
for(uint32_t tid = 0; tid < thread_count_filter_copy.current; ++tid) { |
||||
|
thread_pool.emplace_back(filter_copy_lambda, gid, thread_group.current, tid); |
||||
|
pin_thread_in_range(thread_pool.back(), thread_id++, range); |
||||
|
} |
||||
|
|
||||
|
for(uint32_t tid = 0; tid < thread_count_aggregation.current; ++tid) { |
||||
|
thread_pool.emplace_back(aggregation_lambda, gid, thread_group.current, tid); |
||||
|
pin_thread_in_range(thread_pool.back(), thread_id++, range); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
auto start = std::chrono::steady_clock::now(); |
||||
|
p.set_value(); |
||||
|
|
||||
|
// wait for every thread to join
|
||||
|
for(std::thread& t : thread_pool) t.join(); |
||||
|
// aggregate all partial results
|
||||
|
Aggregation<base_t, Sum, load_mode::Aligned>::apply(result, result, |
||||
|
sizeof(base_t) * thread_count_aggregation.current * thread_group.current); |
||||
|
|
||||
|
auto end = std::chrono::steady_clock::now(); |
||||
|
|
||||
|
double duration = std::chrono::duration_cast<std::chrono::nanoseconds>(end-start).count() / (double)1000000000; |
||||
|
|
||||
|
|
||||
|
//TODO add mode
|
||||
|
print_to_file(out_file, run, chunk_size, mode_manager::string(mode.current), thread_count_filter, |
||||
|
thread_count_filter_copy, thread_count_aggregation, thread_group, duration, |
||||
|
qw->trt->summarize_time(0), qw->trt->summarize_time(1), |
||||
|
qw->trt->summarize_time(2), qw->trt->summarize_time(3), *result); |
||||
|
iteration = IterateOnce(run, chunk_size, mode, thread_count_filter, thread_count_filter_copy, thread_count_aggregation, thread_group); |
||||
|
} |
||||
|
|
||||
|
auto end = std::chrono::system_clock::now(); |
||||
|
std::time_t end_time = std::chrono::system_clock::to_time_t(end); |
||||
|
std::cout << "finished computation at " << std::ctime(&end_time) << std::endl; |
||||
|
|
||||
|
print_to_file(out_file, std::ctime(&end_time)); |
||||
|
} |
@ -0,0 +1,184 @@ |
|||||
|
#include <atomic>
|
||||
|
#include <barrier>
|
||||
|
#include <chrono>
|
||||
|
#include <condition_variable>
|
||||
|
#include <cstdlib>
|
||||
|
#include <cstring>
|
||||
|
#include <fstream>
|
||||
|
#include <future>
|
||||
|
#include <iostream>
|
||||
|
#include <limits>
|
||||
|
#include <list>
|
||||
|
#include <mutex>
|
||||
|
#include <queue>
|
||||
|
#include <thread>
|
||||
|
#include <tuple>
|
||||
|
#include <utility>
|
||||
|
|
||||
|
#include <numa.h>
|
||||
|
|
||||
|
#include "const.h"
|
||||
|
|
||||
|
#include "file_output.h"
|
||||
|
#include "array_utils.h"
|
||||
|
#include "timer_utils.h"
|
||||
|
#include "barrier_utils.h"
|
||||
|
#include "cpu_set_utils.h"
|
||||
|
#include "iterable_range.h"
|
||||
|
#include "memory_literals.h"
|
||||
|
#include "pipelines/scan_filter_pipe.h"
|
||||
|
|
||||
|
#include "aggregation.h"
|
||||
|
#include "filter.h"
|
||||
|
|
||||
|
using base_t = uint64_t; |
||||
|
|
||||
|
base_t sum_check(base_t compare_value, base_t* row_A, base_t* row_B, size_t row_size) { |
||||
|
base_t sum = 0; |
||||
|
for(int i = 0; i < row_size / sizeof(base_t); ++i) { |
||||
|
sum += (row_A[i] < compare_value) * row_B[i]; |
||||
|
} |
||||
|
return sum; |
||||
|
} |
||||
|
|
||||
|
|
||||
|
int main(int argc, char** argv) { |
||||
|
size_t workload_b = 2_GiB; |
||||
|
std::ofstream out_file; |
||||
|
out_file.open("filter_aggreagate_pipe_bm_" + (std::string) BARRIER_MODE + ".csv"); |
||||
|
|
||||
|
Linear_Int_Range<uint32_t, 1, 7, 1> thread_group("thread_groups"); |
||||
|
Linear_Int_Range<uint32_t, 0, 10, 1> run("run"); |
||||
|
Exp_Int_Range<size_t, 1_MiB, 8_MiB + 1, 2> chunk_size("chunk_size"); |
||||
|
Linear_Int_Range<uint32_t, 1, 2, 1> thread_count_filter("thread_cnt_filter"); |
||||
|
Linear_Int_Range<uint32_t, 2, 3, 1> thread_count_copy("thread_cnt_copy"); |
||||
|
Linear_Int_Range<uint32_t, 1, 2, 1> thread_count_aggregation("thread_cnt_agg"); |
||||
|
Range<PMode, no_copy, mode_manager, mode_manager> mode("mode"); |
||||
|
|
||||
|
uint32_t remote_node = 2; |
||||
|
uint32_t remote_node_2 = 2; |
||||
|
uint32_t local_node = 10; |
||||
|
|
||||
|
print_to_file(out_file, generateHead(run, chunk_size, mode, thread_count_filter, thread_count_copy, |
||||
|
thread_count_aggregation, thread_group), "time", |
||||
|
#ifdef THREAD_TIMINGS
|
||||
|
"scan_a", "scan_b", "aggr_j", |
||||
|
#endif
|
||||
|
#ifdef BARRIER_TIMINGS
|
||||
|
"wait_scan_a", "wait_scan_b", "wait_aggr_j", |
||||
|
#endif
|
||||
|
"result"); |
||||
|
|
||||
|
|
||||
|
/*** alloc data and buffers ************************************************/ |
||||
|
base_t* data_a = (base_t*) numa_alloc_onnode(workload_b, remote_node); |
||||
|
base_t* data_b = (base_t*) numa_alloc_onnode(workload_b, remote_node_2); |
||||
|
base_t* data_b_hbm = (base_t *) numa_alloc_onnode(workload_b, local_node); |
||||
|
fill_mt<base_t>(data_a, workload_b, 0, 100, 42); |
||||
|
fill_mt<base_t>(data_b, workload_b, 0, 100, 420); |
||||
|
std::memcpy(data_b_hbm, data_b, workload_b); |
||||
|
base_t* results = (base_t*) numa_alloc_onnode(thread_group.max * thread_count_aggregation.max * sizeof(base_t), remote_node); |
||||
|
|
||||
|
std::string iteration("init"); |
||||
|
const bool simple_query = true; |
||||
|
Query_Wrapper<base_t, simple_query>* qw = nullptr; |
||||
|
while(iteration != "false") { |
||||
|
base_t compare_value = 50; |
||||
|
std::promise<void> p; |
||||
|
std::shared_future<void> ready_future(p.get_future()); |
||||
|
|
||||
|
if(iteration != "run") { |
||||
|
|
||||
|
if(qw != nullptr) { |
||||
|
delete qw; |
||||
|
} |
||||
|
|
||||
|
std::cout << "Changing to mode " << mode.current << " chunksize " << chunk_size.current << " thread_group " << thread_group.current << std::endl; |
||||
|
switch(mode.current) { |
||||
|
case PMode::expl_copy: |
||||
|
qw = new Query_Wrapper<base_t, simple_query>(&ready_future, workload_b, chunk_size.current, data_a, data_b, results, local_node, remote_node, |
||||
|
thread_count_filter.current, thread_count_copy.current, thread_count_aggregation.current, mode.current, thread_group.current, (base_t) 50, (base_t) 42, false); |
||||
|
break; |
||||
|
case PMode::no_copy: |
||||
|
qw = new Query_Wrapper<base_t, simple_query>(&ready_future, workload_b, chunk_size.current, data_a, data_b, results, local_node, remote_node, |
||||
|
thread_count_filter.current, thread_count_copy.current, thread_count_aggregation.current, mode.current, thread_group.current, (base_t) 50, (base_t) 42, true); |
||||
|
break; |
||||
|
case PMode::hbm: |
||||
|
qw = new Query_Wrapper<base_t, simple_query>(&ready_future, workload_b, chunk_size.current, data_a, data_b_hbm, results, local_node, remote_node, |
||||
|
thread_count_filter.current, thread_count_copy.current, thread_count_aggregation.current, mode.current, thread_group.current, (base_t) 50, (base_t) 42, true); |
||||
|
break; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
qw->ready_future = &ready_future; |
||||
|
qw->clear_buffers(); |
||||
|
|
||||
|
auto filter_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_a(gid, gcnt, tid); }; |
||||
|
auto copy_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->scan_b(gid, gcnt, tid); }; |
||||
|
auto aggregation_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw->aggr_j(gid, gcnt, tid); }; |
||||
|
|
||||
|
std::vector<std::thread> filter_pool; |
||||
|
std::vector<std::thread> copy_pool; |
||||
|
std::vector<std::thread> agg_pool; |
||||
|
|
||||
|
int thread_id = 0; |
||||
|
// std::vector<std::pair<int, int>> pinning_ranges {std::make_pair(28, 42), std::make_pair(84, 98)}; // node 2 heacboehm2
|
||||
|
std::vector<std::pair<int, int>> pinning_ranges {std::make_pair(32, 48), std::make_pair(96, 112)}; // node 2 heacboehm
|
||||
|
|
||||
|
for(uint32_t gid = 0; gid < thread_group.current; ++gid) { |
||||
|
|
||||
|
for(uint32_t tid = 0; tid < thread_count_filter.current; ++tid) { |
||||
|
filter_pool.emplace_back(filter_lambda, gid, thread_group.current, tid); |
||||
|
pin_thread_in_range(filter_pool.back(), thread_id++, pinning_ranges); |
||||
|
} |
||||
|
|
||||
|
if(mode.current == PMode::expl_copy){ |
||||
|
for(uint32_t tid = 0; tid < thread_count_copy.current; ++tid) { |
||||
|
copy_pool.emplace_back(copy_lambda, gid, thread_group.current, tid); |
||||
|
pin_thread_in_range(copy_pool.back(), thread_id++, pinning_ranges); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
for(uint32_t tid = 0; tid < thread_count_aggregation.current; ++tid) { |
||||
|
agg_pool.emplace_back(aggregation_lambda, gid, thread_group.current, tid); |
||||
|
pin_thread_in_range(agg_pool.back(), thread_id++, pinning_ranges); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
auto start = std::chrono::steady_clock::now(); |
||||
|
p.set_value(); |
||||
|
|
||||
|
for(std::thread& t : filter_pool) { t.join(); } |
||||
|
for(std::thread& t : copy_pool) { t.join(); } |
||||
|
for(std::thread& t : agg_pool) { t.join(); } |
||||
|
|
||||
|
Aggregation<base_t, Sum, load_mode::Aligned>::apply(results, results, sizeof(base_t) * thread_count_aggregation.current * thread_group.current); |
||||
|
auto end = std::chrono::steady_clock::now(); |
||||
|
|
||||
|
constexpr double nanos_per_second = ((double)1000) * 1000 * 1000; |
||||
|
uint64_t nanos = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count(); |
||||
|
double seconds = (double)(nanos) / nanos_per_second; |
||||
|
|
||||
|
|
||||
|
|
||||
|
print_to_file(out_file, run, chunk_size, mode_manager::string(mode.current), thread_count_filter, |
||||
|
thread_count_copy, thread_count_aggregation, thread_group, seconds, |
||||
|
#ifdef THREAD_TIMINGS
|
||||
|
qw->trt->summarize_time(0), qw->trt->summarize_time(1), qw->trt->summarize_time(2), |
||||
|
#endif
|
||||
|
#ifdef BARRIER_TIMINGS
|
||||
|
qw->bt->summarize_time(0), qw->bt->summarize_time(1), qw->bt->summarize_time(2), |
||||
|
#endif
|
||||
|
results[0]); |
||||
|
|
||||
|
|
||||
|
iteration = IterateOnce(run, chunk_size, mode, thread_count_filter, thread_count_copy, thread_count_aggregation, thread_group); |
||||
|
|
||||
|
} |
||||
|
|
||||
|
numa_free(data_b_hbm, workload_b); |
||||
|
numa_free(data_a, workload_b); |
||||
|
numa_free(data_b, workload_b); |
||||
|
numa_free(results, thread_group.max * sizeof(base_t)); |
||||
|
|
||||
|
} |
@ -0,0 +1,188 @@ |
|||||
|
/*
|
||||
|
* numa_memory_latency |
||||
|
* Copyright (c) 2017 UMEZAWA Takeshi |
||||
|
* This software is licensed under GNU GPL version 2 or later. |
||||
|
* |
||||
|
* This file has been modified |
||||
|
*/ |
||||
|
|
||||
|
#include <algorithm>
|
||||
|
#include <stdio.h>
|
||||
|
#include <stdint.h>
|
||||
|
#include <stdlib.h>
|
||||
|
#include <iostream>
|
||||
|
#include <unistd.h>
|
||||
|
#include <ctime>
|
||||
|
#include "file_output.h"
|
||||
|
#include <vector>
|
||||
|
#include <random>
|
||||
|
#include <algorithm>
|
||||
|
#include <numa.h>
|
||||
|
|
||||
|
#ifndef VOLATILE
|
||||
|
#define VOLATILE 0
|
||||
|
#endif
|
||||
|
|
||||
|
#define cachelinesize 64
|
||||
|
union CACHELINE { |
||||
|
char cacheline[cachelinesize]; |
||||
|
#if VOLATILE
|
||||
|
volatile CACHELINE* next; |
||||
|
#else
|
||||
|
CACHELINE* next; |
||||
|
#endif /*VOLATILE*/
|
||||
|
}; |
||||
|
|
||||
|
#define REPT4(x) do { x; x; x; x; } while(0)
|
||||
|
#define REPT16(x) do { REPT4(x); REPT4(x); REPT4(x); REPT4(x); } while(0);
|
||||
|
#define REPT64(x) do { REPT16(x); REPT16(x); REPT16(x); REPT16(x); } while(0);
|
||||
|
#define REPT256(x) do { REPT64(x); REPT64(x); REPT64(x); REPT64(x); } while(0);
|
||||
|
#define REPT1024(x) do { REPT256(x); REPT256(x); REPT256(x); REPT256(x); } while(0);
|
||||
|
|
||||
|
size_t bufsize = 1 * 1024 * 1024 * 1024; |
||||
|
size_t nloop = 128 * 1024; |
||||
|
std::vector<size_t> offsets; |
||||
|
|
||||
|
#if VOLATILE
|
||||
|
|
||||
|
volatile CACHELINE* walk(volatile CACHELINE* start) |
||||
|
{ |
||||
|
volatile CACHELINE* p = start; |
||||
|
for (size_t i = 0; i < nloop; ++i) { |
||||
|
REPT1024(p = p->next); |
||||
|
} |
||||
|
return p; |
||||
|
} |
||||
|
|
||||
|
#else
|
||||
|
|
||||
|
CACHELINE* walk(CACHELINE* start, uint64_t* sum) |
||||
|
{ |
||||
|
CACHELINE* p = start; |
||||
|
for (size_t i = 0; i < nloop; ++i) { |
||||
|
REPT1024( |
||||
|
*sum += static_cast<uint64_t>(p->cacheline[cachelinesize-1]); |
||||
|
p = p->next; |
||||
|
); |
||||
|
} |
||||
|
return p; |
||||
|
} |
||||
|
|
||||
|
#endif /*VOLATILE*/
|
||||
|
|
||||
|
void bench(int tasknode, int memnode, std::ofstream* out_file) |
||||
|
{ |
||||
|
struct timespec ts_begin, ts_end, ts_elapsed; |
||||
|
|
||||
|
printf("bench(task=%d, mem=%d)\n", tasknode, memnode); |
||||
|
|
||||
|
if (numa_run_on_node(tasknode) != 0) { |
||||
|
printf("failed to run on node: %s\n", strerror(errno)); |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
CACHELINE* const buf = (CACHELINE*)numa_alloc_onnode(bufsize, memnode); |
||||
|
if (buf == NULL) { |
||||
|
printf("failed to allocate memory\n"); |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
for (size_t i = 0; i < offsets.size() - 1; ++i) { |
||||
|
// assuming that next-pointer never overwrites last Byte of the cacheline/union
|
||||
|
buf[offsets[i]].cacheline[cachelinesize-1] = offsets[i] % 128; |
||||
|
buf[offsets[i]].next = buf + offsets[i+1]; |
||||
|
} |
||||
|
buf[offsets[offsets.size() - 1]].next = buf; |
||||
|
buf[offsets[offsets.size() - 1]].cacheline[cachelinesize-1] = offsets[offsets.size() - 1] % 128; |
||||
|
|
||||
|
uint64_t value = 0; |
||||
|
uint64_t* sum = &value; |
||||
|
|
||||
|
clock_gettime(CLOCK_MONOTONIC, &ts_begin); |
||||
|
|
||||
|
#if VOLATILE
|
||||
|
walk(buf); |
||||
|
#else
|
||||
|
walk(buf, sum); |
||||
|
#endif /*VOLATILE*/
|
||||
|
|
||||
|
clock_gettime(CLOCK_MONOTONIC, &ts_end); |
||||
|
|
||||
|
ts_elapsed.tv_nsec = ts_end.tv_nsec - ts_begin.tv_nsec; |
||||
|
ts_elapsed.tv_sec = ts_end.tv_sec - ts_begin.tv_sec; |
||||
|
if (ts_elapsed.tv_nsec < 0) { |
||||
|
--ts_elapsed.tv_sec; |
||||
|
ts_elapsed.tv_nsec += 1000*1000*1000; |
||||
|
} |
||||
|
double elapsed = ts_elapsed.tv_sec + 0.000000001 * ts_elapsed.tv_nsec; |
||||
|
printf("took %fsec. %fns/load\n", elapsed, elapsed/(1024*nloop)*(1000*1000*1000)); |
||||
|
print_to_file(*out_file, tasknode, memnode, elapsed/(1024*nloop)*(1000*1000*1000), *sum); |
||||
|
numa_free(buf, bufsize); |
||||
|
} |
||||
|
|
||||
|
struct RND { |
||||
|
std::mt19937 mt; |
||||
|
RND() : mt(time(NULL)) {} |
||||
|
std::mt19937::result_type operator()(std::mt19937::result_type n) { return mt() % n; } |
||||
|
} r; |
||||
|
|
||||
|
void usage(const char* prog) |
||||
|
{ |
||||
|
printf("usage: %s [-h] [bufsize] [nloop]\n", prog); |
||||
|
} |
||||
|
|
||||
|
int main(int argc, char* argv[]) |
||||
|
{ |
||||
|
int ch; |
||||
|
|
||||
|
while ((ch = getopt(argc, argv, "h")) != -1) { |
||||
|
switch (ch) { |
||||
|
case 'h': |
||||
|
default: |
||||
|
usage(argv[0]); |
||||
|
exit(1); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
argc -= optind; |
||||
|
argv += optind; |
||||
|
|
||||
|
if (argc > 1) { |
||||
|
// 1048576 KiB = 1 GiB
|
||||
|
bufsize = atoi(argv[0]) * 1024; // in KiB
|
||||
|
nloop = atoi(argv[1]) * 1024; |
||||
|
} |
||||
|
|
||||
|
offsets.resize(bufsize / cachelinesize); |
||||
|
|
||||
|
for (size_t i = 0; i < offsets.size(); ++i) |
||||
|
offsets[i] = i; |
||||
|
std::random_shuffle(offsets.begin() + 1, offsets.end(), r); |
||||
|
|
||||
|
uint64_t expected_checksum = 0; |
||||
|
#if VOLATILE == 0
|
||||
|
for (size_t i = 0; i < nloop * 1024; ++i) { |
||||
|
expected_checksum += offsets[i % offsets.size()] % 128; |
||||
|
} |
||||
|
#endif
|
||||
|
|
||||
|
std::ofstream check_file; |
||||
|
check_file.open("../results/micro_bench/latency/micro_bench_latency_" + (std::string)(VOLATILE == 1 ? "volatile" : "sum") + ".checksum"); |
||||
|
check_file << expected_checksum; |
||||
|
check_file.close(); |
||||
|
|
||||
|
|
||||
|
printf("benchmark bufsize=%zuKiB, nloop=%zuKi\n", bufsize/1024, nloop/1024); |
||||
|
|
||||
|
std::ofstream out_file; |
||||
|
out_file.open("../results/micro_bench/latency/micro_bench_latency_"+ (std::string)(VOLATILE == 1 ? "volatile" : "sum") + ".csv"); |
||||
|
print_to_file(out_file, "tasknode", "memnode", "latency", "checksum"); |
||||
|
|
||||
|
for (int tasknode = 0; tasknode < 8; tasknode++) { |
||||
|
for (int memnode = 0; memnode < 16; memnode++) { |
||||
|
bench(tasknode, memnode, &out_file); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
return 0; |
||||
|
} |
@ -0,0 +1,271 @@ |
|||||
|
#include <iostream>
|
||||
|
#include <chrono>
|
||||
|
#include <future>
|
||||
|
#include <numa.h>
|
||||
|
#include <algorithm>
|
||||
|
#include <cstring>
|
||||
|
#include "memory_literals.h"
|
||||
|
#include "array_utils.h"
|
||||
|
#include "file_output.h"
|
||||
|
#include "aggregation.h"
|
||||
|
|
||||
|
|
||||
|
using base_t = uint64_t; |
||||
|
|
||||
|
size_t thread_cnt_memcpy = 128; |
||||
|
size_t thread_cnt_read = 128; |
||||
|
size_t runs = 10; |
||||
|
|
||||
|
|
||||
|
base_t sum_up(base_t* data, size_t workload){ |
||||
|
base_t sum = 0; |
||||
|
for(int i = 0; i < workload/sizeof(base_t); i++){ |
||||
|
sum += data[i]; |
||||
|
} |
||||
|
return sum; |
||||
|
} |
||||
|
|
||||
|
int reverse_bits(int number, size_t bit_count) { |
||||
|
int result = 0; |
||||
|
for(int i = 0; i < bit_count; i++) { |
||||
|
result <<= 1; |
||||
|
result |= (number & 1); |
||||
|
number >>= 1; |
||||
|
} |
||||
|
return result; |
||||
|
} |
||||
|
|
||||
|
|
||||
|
double measure_memcpy_bw(base_t* src, base_t* dest, size_t workload, base_t* result){ |
||||
|
std::promise<void> p; |
||||
|
std::shared_future<void> ready_future(p.get_future()); |
||||
|
|
||||
|
auto thread_lambda = [&](base_t* source, base_t* destination, size_t count) { |
||||
|
ready_future.wait(); |
||||
|
memcpy(destination, source, count); |
||||
|
}; |
||||
|
|
||||
|
std::vector<std::thread> thread_pool; |
||||
|
size_t total_elements = workload / sizeof(base_t); |
||||
|
size_t elements_per_thread = total_elements / thread_cnt_memcpy; |
||||
|
size_t remainder = total_elements % thread_cnt_memcpy; |
||||
|
|
||||
|
for(size_t tid = 0; tid < thread_cnt_memcpy; tid++) { |
||||
|
size_t elements_to_process = elements_per_thread + (tid < remainder ? 1 : 0); |
||||
|
size_t byte_offset = (elements_per_thread * tid + std::min(tid, remainder)) * sizeof(base_t); |
||||
|
|
||||
|
thread_pool.emplace_back(thread_lambda, src + byte_offset / sizeof(base_t), dest + byte_offset / sizeof(base_t), elements_to_process * sizeof(base_t)); |
||||
|
} |
||||
|
|
||||
|
auto start = std::chrono::steady_clock::now(); |
||||
|
p.set_value(); |
||||
|
for(std::thread& t : thread_pool) { t.join(); } |
||||
|
auto stop = std::chrono::steady_clock::now(); |
||||
|
|
||||
|
auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(stop - start); |
||||
|
double seconds = duration.count() / 1e9; |
||||
|
double throughput = (workload / seconds) / (1024 * 1024 * 1024); |
||||
|
*result = sum_up(dest, workload); |
||||
|
return throughput; |
||||
|
} |
||||
|
|
||||
|
double measure_read_bw(base_t* data, size_t workload, base_t* results){ |
||||
|
const size_t chunk_size = sizeof(__m512i); |
||||
|
const size_t num_chunks = (workload) / chunk_size; |
||||
|
__m512i* src = reinterpret_cast<__m512i*>(data); |
||||
|
std::promise<void> p; |
||||
|
std::shared_future<void> ready_future(p.get_future()); |
||||
|
size_t num_chunks_per_thread = num_chunks / thread_cnt_read; |
||||
|
size_t num_chunks_remainder = num_chunks % thread_cnt_read; |
||||
|
|
||||
|
auto thread_lambda = [&](__m512i* src, int tid, int num_chunks) { |
||||
|
__m512i accumulator = _mm512_setzero_si512(); |
||||
|
ready_future.wait(); |
||||
|
for (int i = 0; i < num_chunks; i++) { |
||||
|
__m512i chunk = _mm512_load_si512(&src[i]); |
||||
|
accumulator = _mm512_add_epi64(accumulator, chunk); |
||||
|
} |
||||
|
results[tid] = _mm512_reduce_add_epi64(accumulator); |
||||
|
}; |
||||
|
|
||||
|
std::vector<std::thread> thread_pool; |
||||
|
int offset; |
||||
|
for(int tid = 0; tid < thread_cnt_read; tid++){ |
||||
|
if(tid < num_chunks_remainder){ |
||||
|
offset = tid * (num_chunks_per_thread + 1); |
||||
|
thread_pool.emplace_back(thread_lambda, &src[offset], tid, (num_chunks_per_thread + 1)); |
||||
|
} else { |
||||
|
offset = tid*num_chunks_per_thread + num_chunks_remainder; |
||||
|
thread_pool.emplace_back(thread_lambda, &src[offset], tid, num_chunks_per_thread); |
||||
|
} |
||||
|
|
||||
|
} |
||||
|
|
||||
|
auto start = std::chrono::steady_clock::now(); |
||||
|
p.set_value(); |
||||
|
for(std::thread& t : thread_pool) { t.join(); } |
||||
|
auto stop = std::chrono::steady_clock::now(); |
||||
|
|
||||
|
Aggregation<base_t, Sum, load_mode::Aligned>::apply(results, results, sizeof(base_t) * thread_cnt_read); |
||||
|
auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(stop - start); |
||||
|
double seconds = duration.count() / 1e9; |
||||
|
double throughput = (workload / seconds) / (1024 * 1024 * 1024); |
||||
|
return throughput; |
||||
|
} |
||||
|
|
||||
|
void exec_multiple_runs_memcpy(size_t workload, int exec_node, int src_node, int dest_node, std::ofstream* out_file, std::string iteration_type){ |
||||
|
base_t value; |
||||
|
base_t* result = &value; |
||||
|
base_t* src = (base_t*) numa_alloc_onnode(workload, src_node); |
||||
|
base_t* dest = (base_t*) numa_alloc_onnode(workload, dest_node); |
||||
|
fill_mt<base_t>(src, workload, 0, 100, 42); |
||||
|
fill_mt<base_t>(dest, workload, 0, 100, 12); |
||||
|
numa_run_on_node(exec_node); |
||||
|
|
||||
|
if(dest_node == 0 && src_node == 0){ |
||||
|
std::ofstream check_file; |
||||
|
check_file.open("../results/micro_bench/micro_bench_bw_memcpy_execnode_" + std::to_string(exec_node) |
||||
|
+ "_threadcnt_" + std::to_string(thread_cnt_memcpy) + "_" + iteration_type + ".checksum"); |
||||
|
check_file << sum_up(src, workload); |
||||
|
check_file.close(); |
||||
|
} |
||||
|
|
||||
|
for(size_t run = 0; run < runs; run++){ |
||||
|
double bw = measure_memcpy_bw(src, dest, workload, result); |
||||
|
std::cout << "Copy throughput executed on node " << exec_node << " form node " << src_node << " to node " |
||||
|
<< dest_node << ": " << bw << " GiB/s" << std::endl; |
||||
|
print_to_file(*out_file, run, src_node, dest_node, bw, *result); |
||||
|
std::memset(dest, 0x00, workload); |
||||
|
*result = 0; |
||||
|
} |
||||
|
numa_free(src, workload); |
||||
|
numa_free(dest, workload); |
||||
|
} |
||||
|
|
||||
|
void measure_all_memcpy_bw_for_chosen_execnode(int exec_node){ |
||||
|
std::ofstream out_file; |
||||
|
out_file.open("../results/micro_bench/micro_bench_bw_memcpy_execnode_" + std::to_string(exec_node) |
||||
|
+ "_threadcnt_" + std::to_string(thread_cnt_memcpy) + ".csv"); |
||||
|
print_to_file(out_file, "run", "src_node", "dest_node", "bw", "result"); |
||||
|
const size_t workload = 4_GiB; |
||||
|
|
||||
|
for(int src_node = 0; src_node < 16; src_node++){ |
||||
|
for(int dest_node = 0; dest_node < 16; dest_node++){ |
||||
|
exec_multiple_runs_memcpy(workload, exec_node, src_node, dest_node, &out_file, ""); |
||||
|
} |
||||
|
} |
||||
|
out_file.close(); |
||||
|
} |
||||
|
|
||||
|
void measure_all_memcpy_bw_for_chosen_execnode_reversed(int exec_node){ |
||||
|
std::ofstream out_file; |
||||
|
out_file.open("../results/micro_bench/micro_bench_bw_memcpy_execnode_" + std::to_string(exec_node) |
||||
|
+ "_threadcnt_" + std::to_string(thread_cnt_memcpy) + "_reversed.csv"); |
||||
|
print_to_file(out_file, "run", "src_node", "dest_node", "bw", "result"); |
||||
|
const size_t workload = 4_GiB; |
||||
|
|
||||
|
for(int src_node = 15; src_node >= 0; src_node--){ |
||||
|
for(int dest_node = 15; dest_node >= 0; dest_node--){ |
||||
|
exec_multiple_runs_memcpy(workload, exec_node, src_node, dest_node, &out_file, "reversed"); |
||||
|
} |
||||
|
} |
||||
|
out_file.close(); |
||||
|
} |
||||
|
|
||||
|
|
||||
|
|
||||
|
void measure_all_memcpy_bw_for_chosen_execnode_reversed_bitwise(int exec_node){ |
||||
|
std::ofstream out_file; |
||||
|
out_file.open("../results/micro_bench/micro_bench_bw_memcpy_execnode_" + std::to_string(exec_node) |
||||
|
+ "_threadcnt_" + std::to_string(thread_cnt_memcpy) + "_reversed_bitwise.csv"); |
||||
|
print_to_file(out_file, "run", "src_node", "dest_node", "bw", "result"); |
||||
|
const size_t workload = 4_GiB; |
||||
|
|
||||
|
for(int src_node = 0; src_node < 16; src_node++){ |
||||
|
for(int dest_node = 0; dest_node < 16; dest_node++){ |
||||
|
int reversed_src_node = reverse_bits(src_node, 4); |
||||
|
int reversed_dest_node = reverse_bits(dest_node, 4); |
||||
|
exec_multiple_runs_memcpy(workload, exec_node, reversed_src_node, reversed_dest_node, &out_file, "reversed_bitwise"); |
||||
|
} |
||||
|
} |
||||
|
out_file.close(); |
||||
|
} |
||||
|
|
||||
|
|
||||
|
void exec_multiple_runs_read(size_t workload, int mem_node, int exec_node, std::ofstream *out_file, std::string iteration_type){ |
||||
|
base_t* data = (base_t*) numa_alloc_onnode(workload, mem_node); |
||||
|
fill_mt<base_t>(data, workload, 0, 100, 42); |
||||
|
base_t* results = (base_t*) numa_alloc_onnode(thread_cnt_read * sizeof(base_t), exec_node); |
||||
|
numa_run_on_node(exec_node); |
||||
|
|
||||
|
if(mem_node == 0 && exec_node == 0){ |
||||
|
std::ofstream check_file; |
||||
|
check_file.open("../results/micro_bench/micro_bench_bw_read_threadcnt_" + std::to_string(thread_cnt_read) + "_" + iteration_type + ".checksum"); |
||||
|
check_file << sum_up(data, workload); |
||||
|
check_file.close(); |
||||
|
} |
||||
|
|
||||
|
for(size_t run = 0; run < runs; run++){ |
||||
|
double bw = measure_read_bw(data, workload, results); |
||||
|
std::cout << "Read throughput executed on node " << exec_node << " for node " << mem_node << ": " << bw << " GiB/s" << std::endl; |
||||
|
print_to_file(*out_file, run, exec_node, mem_node, bw, results[0]); |
||||
|
std::memset(results, 0x00, thread_cnt_read * sizeof(base_t)); |
||||
|
} |
||||
|
numa_free(data, workload); |
||||
|
numa_free(results, thread_cnt_read * sizeof(base_t)); |
||||
|
} |
||||
|
|
||||
|
void measure_all_read_bw(){ |
||||
|
std::ofstream out_file; |
||||
|
out_file.open("../results/micro_bench/micro_bench_bw_read_threadcnt_" + std::to_string(thread_cnt_read) + ".csv"); |
||||
|
print_to_file(out_file, "run", "exec_node", "mem_node", "bw", "result"); |
||||
|
const size_t workload = 8_GiB; |
||||
|
|
||||
|
for(int exec_node = 0; exec_node < 8; exec_node++){ |
||||
|
for(int mem_node = 0; mem_node < 16; mem_node++){ |
||||
|
exec_multiple_runs_read(workload, mem_node, exec_node, &out_file, ""); |
||||
|
} |
||||
|
} |
||||
|
out_file.close(); |
||||
|
} |
||||
|
|
||||
|
void measure_all_read_bw_reversed(){ |
||||
|
std::ofstream out_file; |
||||
|
out_file.open("../results/micro_bench/micro_bench_bw_read_threadcnt_" + std::to_string(thread_cnt_read) + "_reversed.csv"); |
||||
|
print_to_file(out_file, "run", "exec_node", "mem_node", "bw", "result"); |
||||
|
const size_t workload = 8_GiB; |
||||
|
|
||||
|
for(int exec_node = 7; exec_node >= 0; exec_node--){ |
||||
|
for(int mem_node = 15; mem_node >= 0; mem_node--){ |
||||
|
exec_multiple_runs_read(workload, mem_node, exec_node, &out_file, "reversed"); |
||||
|
} |
||||
|
} |
||||
|
out_file.close(); |
||||
|
} |
||||
|
|
||||
|
void measure_all_read_bw_reversed_bitwise(){ |
||||
|
std::ofstream out_file; |
||||
|
out_file.open("../results/micro_bench/micro_bench_bw_read_threadcnt_" + std::to_string(thread_cnt_read) + "_reversed_bitwise.csv"); |
||||
|
print_to_file(out_file, "run", "exec_node", "mem_node", "bw", "result"); |
||||
|
const size_t workload = 8_GiB; |
||||
|
|
||||
|
for(int exec_node0 = 0; exec_node0 < 8; exec_node0++){ |
||||
|
for(int mem_node0 = 0; mem_node0 < 16; mem_node0++){ |
||||
|
int mem_node = reverse_bits(mem_node0, 4); |
||||
|
int exec_node = reverse_bits(exec_node0, 3); |
||||
|
exec_multiple_runs_read(workload, mem_node, exec_node, &out_file, "reversed_bitwise"); |
||||
|
} |
||||
|
} |
||||
|
out_file.close(); |
||||
|
} |
||||
|
|
||||
|
|
||||
|
|
||||
|
int main() { |
||||
|
// nodes 0-7 hold cores and DRAM, nodes 8-15 only HBM
|
||||
|
|
||||
|
measure_all_read_bw_reversed_bitwise(); |
||||
|
measure_all_memcpy_bw_for_chosen_execnode_reversed_bitwise(0); |
||||
|
|
||||
|
return 0; |
||||
|
} |
@ -0,0 +1,391 @@ |
|||||
|
|
||||
|
#include <cassert> |
||||
|
#include <mutex> |
||||
|
#include <cstring> |
||||
|
#include <bitset> |
||||
|
|
||||
|
#include <numa.h> |
||||
|
|
||||
|
#include "filter.h" |
||||
|
#include "aggregation.h" |
||||
|
#include "vector_loader.h" |
||||
|
#include "timer_utils.h" |
||||
|
#include "barrier_utils.h" |
||||
|
#include "execution_modes.h" |
||||
|
|
||||
|
|
||||
|
template<typename base_t, bool simple> |
||||
|
class Query_Wrapper { |
||||
|
public: |
||||
|
// sync |
||||
|
std::shared_future<void>* ready_future; |
||||
|
|
||||
|
thread_runtime_timing* trt; |
||||
|
barrier_timing* bt; |
||||
|
|
||||
|
private: |
||||
|
// numa |
||||
|
uint32_t close_mem; |
||||
|
uint32_t far_mem; |
||||
|
|
||||
|
// data |
||||
|
size_t size_b; |
||||
|
size_t chunk_size_b; |
||||
|
size_t chunk_size_w; |
||||
|
size_t chunk_cnt; |
||||
|
base_t* data_a; |
||||
|
base_t* data_b; |
||||
|
base_t* dest; |
||||
|
|
||||
|
// ratios |
||||
|
uint32_t thread_count_fc; |
||||
|
uint32_t thread_count_fi; |
||||
|
uint32_t thread_count_ag; |
||||
|
uint32_t thread_group; |
||||
|
|
||||
|
// done bits |
||||
|
volatile uint8_t* ready_flag_a; |
||||
|
volatile uint8_t* ready_flag_b; |
||||
|
std::mutex ready_a_m; |
||||
|
std::mutex ready_b_m; |
||||
|
|
||||
|
// buffer |
||||
|
uint16_t* mask_a; |
||||
|
uint16_t* mask_b; |
||||
|
base_t** buffer_b; |
||||
|
|
||||
|
// params |
||||
|
base_t cmp_a; |
||||
|
base_t cmp_b; |
||||
|
bool no_copy; |
||||
|
NewPMode mode; |
||||
|
|
||||
|
// sync |
||||
|
std::unique_ptr<std::vector<std::barrier<barrier_completion_function>*>> sync_barrier; |
||||
|
std::string barrier_mode = BARRIER_MODE; |
||||
|
|
||||
|
using filterCopy = Filter<base_t, LT, load_mode::Stream, true>; |
||||
|
using filterNoCopy = Filter<base_t, LT, load_mode::Stream, false>; |
||||
|
using filter = Filter<base_t, LT, load_mode::Stream, false>; |
||||
|
using aggregation = Aggregation<base_t, Sum, load_mode::Stream>; |
||||
|
|
||||
|
public: |
||||
|
|
||||
|
|
||||
|
Query_Wrapper(std::shared_future<void>* rdy_fut, size_t workload_b, size_t chunk_size_b, base_t* data_a, |
||||
|
base_t* data_b, base_t* dest, uint32_t numa_close, uint32_t numa_far, uint32_t tc_fi, uint32_t tc_fc, uint32_t tc_ag, |
||||
|
NewPMode mode, uint32_t thread_group, base_t cmp_a = 50, base_t cmp_b = 42, bool no_copy = false) : |
||||
|
ready_future(rdy_fut), size_b(workload_b), chunk_size_b(chunk_size_b), data_a(data_a), data_b(data_b), |
||||
|
dest(dest), close_mem(numa_close), far_mem(numa_far), mode(mode), thread_group(thread_group), cmp_a(cmp_a), cmp_b(cmp_b), no_copy(no_copy){ |
||||
|
|
||||
|
chunk_size_w = chunk_size_b / sizeof(base_t); |
||||
|
chunk_cnt = size_b / chunk_size_b; |
||||
|
thread_count_fi = tc_fi; |
||||
|
thread_count_fc = tc_fc; |
||||
|
thread_count_ag = tc_ag; |
||||
|
|
||||
|
ready_flag_a = (volatile uint8_t *) numa_alloc_onnode( |
||||
|
chunk_cnt * thread_count_fi / 8 + ((chunk_cnt * thread_count_fi % 8) != 0), close_mem); |
||||
|
ready_flag_b = (volatile uint8_t *) numa_alloc_onnode( |
||||
|
chunk_cnt * thread_count_fc / 8 + ((chunk_cnt * thread_count_fc % 8) != 0), close_mem); |
||||
|
|
||||
|
mask_a = (uint16_t *) numa_alloc_onnode(size_b / sizeof(base_t), close_mem); |
||||
|
mask_b = (uint16_t *) numa_alloc_onnode(size_b / sizeof(base_t), close_mem); |
||||
|
|
||||
|
trt = new thread_runtime_timing(4, 16*4*4*4, close_mem); |
||||
|
bt = new barrier_timing(4, 16*4*4*4, close_mem); |
||||
|
reset_barriers(); |
||||
|
|
||||
|
if constexpr(BUFFER_LIMIT==1) { |
||||
|
// TODO size ok like that? |
||||
|
buffer_b = (base_t**) numa_alloc_onnode(size_b * sizeof(base_t*), close_mem); |
||||
|
buffer_b[0] = (base_t*) numa_alloc_onnode(thread_group * chunk_size_b, close_mem); |
||||
|
buffer_b[1] = (base_t*) numa_alloc_onnode(thread_group * chunk_size_b, close_mem); |
||||
|
} else { |
||||
|
buffer_b = (base_t **) numa_alloc_onnode(sizeof(base_t*), close_mem); |
||||
|
base_t* buffer_tmp = (base_t *) numa_alloc_onnode(size_b, close_mem); |
||||
|
*buffer_b = buffer_tmp; |
||||
|
} |
||||
|
}; |
||||
|
|
||||
|
void reset_barriers(){ |
||||
|
if(sync_barrier != nullptr) { |
||||
|
for(auto& barrier : *sync_barrier) { |
||||
|
delete barrier; |
||||
|
} |
||||
|
sync_barrier.reset(); |
||||
|
} |
||||
|
|
||||
|
sync_barrier = std::make_unique<std::vector<std::barrier<barrier_completion_function>*>>(thread_group); |
||||
|
uint32_t thread_count_sum = thread_count_ag + thread_count_fi + thread_count_fc; |
||||
|
uint32_t barrier_count = barrier_mode.compare("global") == 0 ? 1 : thread_group; |
||||
|
uint32_t barrier_thread_count; |
||||
|
|
||||
|
if constexpr(simple){ |
||||
|
barrier_thread_count = (thread_group / barrier_count) * |
||||
|
(mode == NewPMode::Prefetch ? thread_count_sum : (thread_count_ag + thread_count_fi)); |
||||
|
} else { |
||||
|
barrier_thread_count = (thread_group / barrier_count) * thread_count_sum; |
||||
|
} |
||||
|
for(uint32_t i = 0; i < barrier_count; ++i) { |
||||
|
(*sync_barrier)[i] = new std::barrier<barrier_completion_function>(barrier_thread_count); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
void clear_buffers () { |
||||
|
std::memset((void*)ready_flag_a, 0x00, chunk_cnt * thread_count_fi / 8 + ((chunk_cnt * thread_count_fi % 8) != 0)); |
||||
|
std::memset((void*)ready_flag_b, 0x00, chunk_cnt * thread_count_fc / 8 + ((chunk_cnt * thread_count_fc % 8) != 0)); |
||||
|
|
||||
|
std::memset(mask_a, 0x00, size_b / sizeof(base_t)); |
||||
|
std::memset(mask_b, 0x00, size_b / sizeof(base_t)); |
||||
|
if constexpr(BUFFER_LIMIT==1) { |
||||
|
std::memset(buffer_b[0], 0x00, thread_group * chunk_size_b); |
||||
|
std::memset(buffer_b[1], 0x00, thread_group * chunk_size_b); |
||||
|
} else { |
||||
|
std::memset(*buffer_b, 0x00, size_b); |
||||
|
} |
||||
|
|
||||
|
trt->reset_accumulator(); |
||||
|
bt->reset_accumulator(); |
||||
|
reset_barriers(); |
||||
|
}; |
||||
|
|
||||
|
~Query_Wrapper() { |
||||
|
numa_free((void*)ready_flag_a, |
||||
|
chunk_cnt * thread_count_fi / 8 + ((chunk_cnt * thread_count_fi % 8) != 0)); |
||||
|
numa_free((void*)ready_flag_b, |
||||
|
chunk_cnt * thread_count_fc / 8 + ((chunk_cnt * thread_count_fc % 8) != 0)); |
||||
|
|
||||
|
numa_free(mask_a, size_b / sizeof(base_t)); |
||||
|
numa_free(mask_b, size_b / sizeof(base_t)); |
||||
|
if constexpr(BUFFER_LIMIT==1) { |
||||
|
numa_free(buffer_b[0], thread_group * chunk_size_b); |
||||
|
numa_free(buffer_b[1], thread_group * chunk_size_b); |
||||
|
numa_free(buffer_b, size_b * sizeof(base_t*)); |
||||
|
} else { |
||||
|
numa_free(*buffer_b, size_b); |
||||
|
} |
||||
|
|
||||
|
delete trt; |
||||
|
for(auto& barrier : *sync_barrier) { |
||||
|
delete barrier; |
||||
|
} |
||||
|
delete bt; |
||||
|
|
||||
|
}; |
||||
|
|
||||
|
//this can be set without need to change allocations |
||||
|
void set_thread_group_count(uint32_t value) { |
||||
|
this->thread_group = value; |
||||
|
}; |
||||
|
|
||||
|
private: |
||||
|
static inline base_t* get_sub_chunk_ptr(base_t* base_ptr, size_t chunk_id, size_t chunk_size_w, size_t tid, |
||||
|
size_t tcnt) { |
||||
|
base_t* chunk_ptr = base_ptr + chunk_id * chunk_size_w; |
||||
|
return chunk_ptr + tid * (chunk_size_w / tcnt); |
||||
|
} |
||||
|
|
||||
|
static inline uint16_t* get_sub_mask_ptr(uint16_t* base_ptr, size_t chunk_id, size_t chunk_size_w, size_t tid, |
||||
|
size_t tcnt) { |
||||
|
// 16 integer are addressed with one uint16_t in mask buffer |
||||
|
size_t offset = chunk_id * chunk_size_w + tid * (chunk_size_w / tcnt); |
||||
|
return base_ptr + (offset / 16); |
||||
|
} |
||||
|
|
||||
|
static bool bit_at(volatile uint8_t* bitmap, uint32_t bitpos) { |
||||
|
uint8_t value = bitmap[bitpos / 8]; |
||||
|
switch(bitpos % 8) { |
||||
|
case 0: return value & 0b00000001; |
||||
|
case 1: return value & 0b00000010; |
||||
|
case 2: return value & 0b00000100; |
||||
|
case 3: return value & 0b00001000; |
||||
|
case 4: return value & 0b00010000; |
||||
|
case 5: return value & 0b00100000; |
||||
|
case 6: return value & 0b01000000; |
||||
|
case 7: return value & 0b10000000; |
||||
|
default: return false; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
static void set_bit_at(volatile uint8_t* bitmap, std::mutex& mutex, uint32_t bitpos) { |
||||
|
mutex.lock(); |
||||
|
switch(bitpos % 8) { |
||||
|
case 0: bitmap[bitpos / 8] |= 0b00000001;break; |
||||
|
case 1: bitmap[bitpos / 8] |= 0b00000010;break; |
||||
|
case 2: bitmap[bitpos / 8] |= 0b00000100;break; |
||||
|
case 3: bitmap[bitpos / 8] |= 0b00001000;break; |
||||
|
case 4: bitmap[bitpos / 8] |= 0b00010000;break; |
||||
|
case 5: bitmap[bitpos / 8] |= 0b00100000;break; |
||||
|
case 6: bitmap[bitpos / 8] |= 0b01000000;break; |
||||
|
case 7: bitmap[bitpos / 8] |= 0b10000000;break; |
||||
|
} |
||||
|
mutex.unlock(); |
||||
|
} |
||||
|
|
||||
|
public: |
||||
|
|
||||
|
static base_t checksum(base_t* a, base_t* b, base_t cmp_a, base_t cmp_b, size_t size_b) { |
||||
|
base_t sum = 0; |
||||
|
for(int i = 0; i < size_b / sizeof(base_t); ++i) { |
||||
|
if(a[i] >= cmp_a && b[i] <= cmp_b) { |
||||
|
sum += b[i]; |
||||
|
} |
||||
|
} |
||||
|
return sum; |
||||
|
} |
||||
|
|
||||
|
static void checkmask(uint16_t* mask, base_t cmp, base_t* data, size_t size_b, bool leq) { |
||||
|
uint32_t cnt = 0; |
||||
|
for(int i = 0; i < size_b / sizeof(base_t); ++i) { |
||||
|
if(leq) { |
||||
|
if(((data[i] <= cmp) != bit_at((uint8_t*)mask, i))) { |
||||
|
++cnt; |
||||
|
} |
||||
|
} else { |
||||
|
if(((data[i] >= cmp) != bit_at((uint8_t*)mask, i))) { |
||||
|
++cnt; |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
static void checkmask_16(uint16_t* mask, base_t cmp, base_t* data, size_t size_b, bool leq) { |
||||
|
for(int i = 0; i < size_b / sizeof(base_t) / 16 ; ++i) { |
||||
|
std::bitset<16> m(mask[i]); |
||||
|
uint16_t ch = 0; |
||||
|
for(int j = 0; j < 16; ++j) { |
||||
|
if(data[i*16 + j] <= cmp) { |
||||
|
ch |= 0x1 << j; |
||||
|
} |
||||
|
} |
||||
|
std::bitset<16> c(ch); |
||||
|
|
||||
|
std::cout << "act " << m << std::endl; |
||||
|
std::cout << "rea " << c << std::endl << std::endl; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
|
||||
|
void scan_b(size_t gid, size_t gcnt, size_t tid) { |
||||
|
size_t tcnt = thread_count_fc; |
||||
|
assert(chunk_size_w % tcnt == 0); |
||||
|
assert(chunk_size_w % 16 == 0); |
||||
|
assert(chunk_size_w % tcnt * 16 == 0); |
||||
|
|
||||
|
// wait till everyone can start |
||||
|
ready_future->wait(); |
||||
|
|
||||
|
// the lower gids run once more if the chunks are not evenly distributable |
||||
|
uint32_t runs = chunk_cnt / gcnt + (chunk_cnt % gcnt > gid); |
||||
|
uint32_t barrier_idx = barrier_mode.compare("global") == 0 ? 0 : gid; |
||||
|
for(uint32_t i = 0; i < runs; ++i) { |
||||
|
trt->start_timer(1, tid * gcnt + gid); |
||||
|
|
||||
|
// calculate pointers |
||||
|
size_t chunk_id = gid + gcnt * i; |
||||
|
base_t* chunk_ptr = get_sub_chunk_ptr(data_b , chunk_id, chunk_size_w, tid, tcnt); |
||||
|
uint16_t* mask_ptr = get_sub_mask_ptr (mask_b , chunk_id, chunk_size_w, tid, tcnt); |
||||
|
|
||||
|
if constexpr(simple){ |
||||
|
base_t* buffer_ptr; |
||||
|
if constexpr(BUFFER_LIMIT==1) { |
||||
|
buffer_ptr = get_sub_chunk_ptr(buffer_b[i % 2], gid, chunk_size_w, tid, tcnt); |
||||
|
} else { |
||||
|
buffer_ptr = get_sub_chunk_ptr(*buffer_b, chunk_id, chunk_size_w, tid, tcnt); |
||||
|
} |
||||
|
std::memcpy(buffer_ptr, chunk_ptr, chunk_size_b / tcnt); |
||||
|
} else { |
||||
|
if(no_copy) { |
||||
|
filterNoCopy::apply_same(mask_ptr, nullptr, chunk_ptr, cmp_b, chunk_size_b / tcnt); |
||||
|
} else { |
||||
|
base_t* buffer_ptr; |
||||
|
if constexpr(BUFFER_LIMIT==1) { |
||||
|
buffer_ptr = get_sub_chunk_ptr(buffer_b[i % 2], gid, chunk_size_w, tid, tcnt); |
||||
|
} else { |
||||
|
buffer_ptr = get_sub_chunk_ptr(*buffer_b, chunk_id, chunk_size_w, tid, tcnt); |
||||
|
} |
||||
|
filterCopy::apply_same(mask_ptr, buffer_ptr, chunk_ptr, cmp_b, chunk_size_b / tcnt); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
trt->stop_timer(1, tid * gcnt + gid); |
||||
|
bt->timed_wait(*(*sync_barrier)[barrier_idx], 1, tid * gcnt + gid); |
||||
|
|
||||
|
} |
||||
|
(*(*sync_barrier)[barrier_idx]).arrive_and_drop(); |
||||
|
|
||||
|
} |
||||
|
|
||||
|
void scan_a(size_t gid, size_t gcnt, size_t tid) { |
||||
|
size_t tcnt = thread_count_fi; |
||||
|
assert(chunk_size_w % tcnt == 0); |
||||
|
assert(chunk_size_w % 16 == 0); |
||||
|
assert(chunk_size_w % tcnt * 16 == 0); |
||||
|
|
||||
|
// wait till everyone can start |
||||
|
ready_future->wait(); |
||||
|
|
||||
|
// the lower gids run once more if the chunks are not evenly distributable |
||||
|
uint32_t runs = chunk_cnt / gcnt + (chunk_cnt % gcnt > gid); |
||||
|
uint32_t barrier_idx = barrier_mode.compare("global") == 0 ? 0 : gid; |
||||
|
for(uint32_t i = 0; i < runs; ++i) { |
||||
|
trt->start_timer(0, tid * gcnt + gid); |
||||
|
// calculate pointers |
||||
|
size_t chunk_id = gid + gcnt * i; |
||||
|
base_t* chunk_ptr = get_sub_chunk_ptr(data_a, chunk_id, chunk_size_w, tid, tcnt); |
||||
|
uint16_t* mask_ptr = get_sub_mask_ptr (mask_a, chunk_id, chunk_size_w, tid, tcnt); |
||||
|
|
||||
|
filter::apply_same(mask_ptr, nullptr, chunk_ptr, cmp_a, chunk_size_b / tcnt); |
||||
|
|
||||
|
trt->stop_timer(0, tid * gcnt + gid); |
||||
|
bt->timed_wait(*(*sync_barrier)[barrier_idx], 0, tid * gcnt + gid); |
||||
|
} |
||||
|
(*(*sync_barrier)[barrier_idx]).arrive_and_drop(); |
||||
|
} |
||||
|
|
||||
|
void aggr_j(size_t gid, size_t gcnt, size_t tid) { |
||||
|
size_t tcnt = thread_count_ag; |
||||
|
// wait till everyone can start |
||||
|
ready_future->wait(); |
||||
|
|
||||
|
// calculate values |
||||
|
__m512i aggregator = aggregation::OP::zero(); |
||||
|
// the lower gids run once more if the chunks are not evenly distributable |
||||
|
uint32_t runs = chunk_cnt / gcnt + (chunk_cnt % gcnt > gid); |
||||
|
uint32_t barrier_idx = barrier_mode.compare("global") == 0 ? 0 : gid; |
||||
|
for(uint32_t i = 0; i < runs; ++i) { |
||||
|
|
||||
|
bt->timed_wait(*(*sync_barrier)[barrier_idx], 2, tid * gcnt + gid); |
||||
|
trt->start_timer(2, tid * gcnt + gid); |
||||
|
|
||||
|
// calculate pointers |
||||
|
size_t chunk_id = gid + gcnt * i; |
||||
|
base_t* chunk_ptr; |
||||
|
if(no_copy) { |
||||
|
chunk_ptr = get_sub_chunk_ptr(data_b, chunk_id, chunk_size_w, tid, tcnt); |
||||
|
} else { |
||||
|
if constexpr(BUFFER_LIMIT==1) { |
||||
|
chunk_ptr = get_sub_chunk_ptr(buffer_b[i % 2], gid, chunk_size_w, tid, tcnt); |
||||
|
} else { |
||||
|
chunk_ptr = get_sub_chunk_ptr(*buffer_b, chunk_id, chunk_size_w, tid, tcnt); |
||||
|
} |
||||
|
} |
||||
|
uint16_t* mask_ptr_a = get_sub_mask_ptr (mask_a, chunk_id, chunk_size_w, tid, tcnt); |
||||
|
uint16_t* mask_ptr_b = get_sub_mask_ptr (mask_b, chunk_id, chunk_size_w, tid, tcnt); |
||||
|
|
||||
|
base_t tmp = _mm512_reduce_add_epi64(aggregator); |
||||
|
if constexpr(simple){ |
||||
|
aggregator = aggregation::apply_masked(aggregator, chunk_ptr, mask_ptr_a, chunk_size_b / tcnt); |
||||
|
} else { |
||||
|
aggregator = aggregation::apply_masked(aggregator, chunk_ptr, mask_ptr_a, mask_ptr_b, chunk_size_b / tcnt); |
||||
|
} |
||||
|
trt->stop_timer(2, tid * gcnt + gid); |
||||
|
} |
||||
|
|
||||
|
// so threads with more runs dont wait for finished threads |
||||
|
(*(*sync_barrier)[barrier_idx]).arrive_and_drop(); |
||||
|
|
||||
|
aggregation::happly(dest + (tid * gcnt + gid), aggregator); |
||||
|
} |
||||
|
}; |
@ -0,0 +1,395 @@ |
|||||
|
|
||||
|
#include <cassert> |
||||
|
#include <mutex> |
||||
|
#include <cstring> |
||||
|
#include <bitset> |
||||
|
#include <algorithm> |
||||
|
|
||||
|
#include <numa.h> |
||||
|
|
||||
|
#include "filter.h" |
||||
|
#include "aggregation.h" |
||||
|
#include "vector_loader.h" |
||||
|
#include "timer_utils.h" |
||||
|
#include "barrier_utils.h" |
||||
|
#include "measurement_utils.h" |
||||
|
#include "execution_modes.h" |
||||
|
|
||||
|
#include "../../../thirdParty/dsa_offload/offloading-cacher/cache.hpp" |
||||
|
|
||||
|
template<typename base_t, bool simple> |
||||
|
class Query_Wrapper { |
||||
|
public: |
||||
|
// sync |
||||
|
std::shared_future<void>* ready_future; |
||||
|
|
||||
|
thread_runtime_timing* trt; |
||||
|
barrier_timing* bt; |
||||
|
pcm_value_collector* pvc; |
||||
|
|
||||
|
private: |
||||
|
dsacache::Cache cache_; |
||||
|
|
||||
|
// numa |
||||
|
uint32_t close_mem; |
||||
|
uint32_t far_mem; |
||||
|
|
||||
|
// data |
||||
|
size_t size_b; |
||||
|
size_t chunk_size_b; |
||||
|
size_t chunk_size_w; |
||||
|
size_t chunk_cnt; |
||||
|
base_t* data_a; |
||||
|
base_t* data_b; |
||||
|
base_t* dest; |
||||
|
|
||||
|
// ratios |
||||
|
uint32_t thread_count_fc; |
||||
|
uint32_t thread_count_fi; |
||||
|
uint32_t thread_count_ag; |
||||
|
uint32_t thread_group; |
||||
|
|
||||
|
// done bits |
||||
|
volatile uint8_t* ready_flag_a; |
||||
|
volatile uint8_t* ready_flag_b; |
||||
|
std::mutex ready_a_m; |
||||
|
std::mutex ready_b_m; |
||||
|
|
||||
|
// buffer |
||||
|
uint16_t* mask_a; |
||||
|
uint16_t* mask_b; |
||||
|
|
||||
|
// params |
||||
|
base_t cmp_a; |
||||
|
base_t cmp_b; |
||||
|
NewPMode mode; |
||||
|
|
||||
|
// sync |
||||
|
std::unique_ptr<std::vector<std::barrier<barrier_completion_function>*>> sync_barrier; |
||||
|
std::string barrier_mode = BARRIER_MODE; |
||||
|
|
||||
|
using filterCopy = Filter<base_t, LT, load_mode::Stream, true>; |
||||
|
using filterNoCopy = Filter<base_t, LT, load_mode::Stream, false>; |
||||
|
using filter = Filter<base_t, LT, load_mode::Stream, false>; |
||||
|
using aggregation = Aggregation<base_t, Sum, load_mode::Stream>; |
||||
|
|
||||
|
void InitCache(const std::string& device) { |
||||
|
if (device == "default") { |
||||
|
static const auto cache_policy = [](const int numa_dst_node, const int numa_src_node, const size_t data_size) { |
||||
|
return numa_dst_node; |
||||
|
}; |
||||
|
|
||||
|
static const auto copy_policy = [](const int numa_dst_node, const int numa_src_node) { |
||||
|
return std::vector<int>{ numa_src_node, numa_dst_node }; |
||||
|
}; |
||||
|
|
||||
|
cache_.Init(cache_policy,copy_policy); |
||||
|
} |
||||
|
else if (device == "xeonmax") { |
||||
|
static const auto cache_policy = [](const int numa_dst_node, const int numa_src_node, const size_t data_size) { |
||||
|
return numa_dst_node < 8 ? numa_dst_node + 8 : numa_dst_node; |
||||
|
}; |
||||
|
|
||||
|
static const auto copy_policy = [](const int numa_dst_node, const int numa_src_node) { |
||||
|
const bool same_socket = ((numa_dst_node ^ numa_src_node) & 4) == 0; |
||||
|
if (same_socket) { |
||||
|
const bool socket_number = numa_dst_node >> 2; |
||||
|
if (socket_number == 0) return std::vector<int>{ 0, 1, 2, 3 }; |
||||
|
else return std::vector<int>{ 4, 5, 6, 7 }; |
||||
|
} |
||||
|
else return std::vector<int>{ numa_src_node, numa_dst_node }; |
||||
|
}; |
||||
|
|
||||
|
cache_.Init(cache_policy,copy_policy); |
||||
|
} |
||||
|
else { |
||||
|
std::cerr << "Given device '" << device << "' not supported!" << std::endl; |
||||
|
exit(-1); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
public: |
||||
|
|
||||
|
|
||||
|
Query_Wrapper(std::shared_future<void>* rdy_fut, size_t workload_b, size_t chunk_size_b, base_t* data_a, |
||||
|
base_t* data_b, base_t* dest, uint32_t numa_close, uint32_t numa_far, uint32_t tc_fi, uint32_t tc_fc, uint32_t tc_ag, |
||||
|
NewPMode mode, uint32_t thread_group, base_t cmp_a = 50, base_t cmp_b = 42) : |
||||
|
ready_future(rdy_fut), size_b(workload_b), chunk_size_b(chunk_size_b), data_a(data_a), data_b(data_b), |
||||
|
dest(dest), close_mem(numa_close), far_mem(numa_far), mode(mode), thread_group(thread_group), cmp_a(cmp_a), cmp_b(cmp_b){ |
||||
|
|
||||
|
chunk_size_w = chunk_size_b / sizeof(base_t); |
||||
|
chunk_cnt = size_b / chunk_size_b; |
||||
|
thread_count_fi = tc_fi; |
||||
|
thread_count_fc = tc_fc; |
||||
|
thread_count_ag = tc_ag; |
||||
|
|
||||
|
ready_flag_a = (volatile uint8_t *) numa_alloc_onnode( |
||||
|
chunk_cnt * thread_count_fi / 8 + ((chunk_cnt * thread_count_fi % 8) != 0), close_mem); |
||||
|
ready_flag_b = (volatile uint8_t *) numa_alloc_onnode( |
||||
|
chunk_cnt * thread_count_fc / 8 + ((chunk_cnt * thread_count_fc % 8) != 0), close_mem); |
||||
|
|
||||
|
mask_a = (uint16_t *) numa_alloc_onnode(size_b / sizeof(base_t), close_mem); |
||||
|
mask_b = (uint16_t *) numa_alloc_onnode(size_b / sizeof(base_t), close_mem); |
||||
|
|
||||
|
InitCache("xeonmax"); |
||||
|
|
||||
|
size_t measurement_space = THREAD_GROUP_MULTIPLIER * std::max(std::max(tc_fi, tc_fc), tc_ag); |
||||
|
trt = new thread_runtime_timing(3, measurement_space, far_mem); |
||||
|
bt = new barrier_timing(3, measurement_space, far_mem); |
||||
|
pvc = new pcm_value_collector({"scan_a", "scan_b", "aggr_j"}, measurement_space, far_mem); |
||||
|
reset_barriers(); |
||||
|
}; |
||||
|
|
||||
|
void reset_barriers(){ |
||||
|
if(sync_barrier != nullptr) { |
||||
|
for(auto& barrier : *sync_barrier) { |
||||
|
delete barrier; |
||||
|
} |
||||
|
sync_barrier.reset(); |
||||
|
} |
||||
|
|
||||
|
sync_barrier = std::make_unique<std::vector<std::barrier<barrier_completion_function>*>>(thread_group); |
||||
|
uint32_t thread_count_sum = thread_count_ag + thread_count_fi + thread_count_fc; |
||||
|
uint32_t barrier_count = barrier_mode.compare("global") == 0 ? 1 : thread_group; |
||||
|
uint32_t barrier_thread_count; |
||||
|
|
||||
|
if constexpr(simple){ |
||||
|
barrier_thread_count = (thread_group / barrier_count) * |
||||
|
(mode == NewPMode::Prefetch ? thread_count_sum : (thread_count_ag + thread_count_fi)); |
||||
|
} else { |
||||
|
barrier_thread_count = (thread_group / barrier_count) * thread_count_sum; |
||||
|
} |
||||
|
for(uint32_t i = 0; i < barrier_count; ++i) { |
||||
|
(*sync_barrier)[i] = new std::barrier<barrier_completion_function>(barrier_thread_count); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
void clear_buffers () { |
||||
|
std::memset((void*)ready_flag_a, 0x00, chunk_cnt * thread_count_fi / 8 + ((chunk_cnt * thread_count_fi % 8) != 0)); |
||||
|
std::memset((void*)ready_flag_b, 0x00, chunk_cnt * thread_count_fc / 8 + ((chunk_cnt * thread_count_fc % 8) != 0)); |
||||
|
|
||||
|
std::memset(mask_a, 0x00, size_b / sizeof(base_t)); |
||||
|
std::memset(mask_b, 0x00, size_b / sizeof(base_t)); |
||||
|
|
||||
|
cache_.Clear(); |
||||
|
|
||||
|
trt->reset_accumulator(); |
||||
|
bt->reset_accumulator(); |
||||
|
pvc->reset(); |
||||
|
reset_barriers(); |
||||
|
}; |
||||
|
|
||||
|
~Query_Wrapper() { |
||||
|
numa_free((void*)ready_flag_a, |
||||
|
chunk_cnt * thread_count_fi / 8 + ((chunk_cnt * thread_count_fi % 8) != 0)); |
||||
|
numa_free((void*)ready_flag_b, |
||||
|
chunk_cnt * thread_count_fc / 8 + ((chunk_cnt * thread_count_fc % 8) != 0)); |
||||
|
|
||||
|
numa_free(mask_a, size_b / sizeof(base_t)); |
||||
|
numa_free(mask_b, size_b / sizeof(base_t)); |
||||
|
|
||||
|
delete trt; |
||||
|
for(auto& barrier : *sync_barrier) { |
||||
|
delete barrier; |
||||
|
} |
||||
|
delete bt; |
||||
|
delete pvc; |
||||
|
}; |
||||
|
|
||||
|
//this can be set without need to change allocations |
||||
|
void set_thread_group_count(uint32_t value) { |
||||
|
this->thread_group = value; |
||||
|
}; |
||||
|
|
||||
|
private: |
||||
|
static inline base_t* get_sub_chunk_ptr(base_t* base_ptr, size_t chunk_id, size_t chunk_size_w, size_t tid, |
||||
|
size_t tcnt) { |
||||
|
base_t* chunk_ptr = base_ptr + chunk_id * chunk_size_w; |
||||
|
return chunk_ptr + tid * (chunk_size_w / tcnt); |
||||
|
} |
||||
|
|
||||
|
static inline uint16_t* get_sub_mask_ptr(uint16_t* base_ptr, size_t chunk_id, size_t chunk_size_w, size_t tid, |
||||
|
size_t tcnt) { |
||||
|
// 16 integer are addressed with one uint16_t in mask buffer |
||||
|
size_t offset = chunk_id * chunk_size_w + tid * (chunk_size_w / tcnt); |
||||
|
return base_ptr + (offset / 16); |
||||
|
} |
||||
|
|
||||
|
static bool bit_at(volatile uint8_t* bitmap, uint32_t bitpos) { |
||||
|
uint8_t value = bitmap[bitpos / 8]; |
||||
|
switch(bitpos % 8) { |
||||
|
case 0: return value & 0b00000001; |
||||
|
case 1: return value & 0b00000010; |
||||
|
case 2: return value & 0b00000100; |
||||
|
case 3: return value & 0b00001000; |
||||
|
case 4: return value & 0b00010000; |
||||
|
case 5: return value & 0b00100000; |
||||
|
case 6: return value & 0b01000000; |
||||
|
case 7: return value & 0b10000000; |
||||
|
default: return false; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
static void set_bit_at(volatile uint8_t* bitmap, std::mutex& mutex, uint32_t bitpos) { |
||||
|
mutex.lock(); |
||||
|
switch(bitpos % 8) { |
||||
|
case 0: bitmap[bitpos / 8] |= 0b00000001;break; |
||||
|
case 1: bitmap[bitpos / 8] |= 0b00000010;break; |
||||
|
case 2: bitmap[bitpos / 8] |= 0b00000100;break; |
||||
|
case 3: bitmap[bitpos / 8] |= 0b00001000;break; |
||||
|
case 4: bitmap[bitpos / 8] |= 0b00010000;break; |
||||
|
case 5: bitmap[bitpos / 8] |= 0b00100000;break; |
||||
|
case 6: bitmap[bitpos / 8] |= 0b01000000;break; |
||||
|
case 7: bitmap[bitpos / 8] |= 0b10000000;break; |
||||
|
} |
||||
|
mutex.unlock(); |
||||
|
} |
||||
|
|
||||
|
public: |
||||
|
void scan_b(size_t gid, size_t gcnt, size_t tid) { |
||||
|
size_t tcnt = thread_count_fc; |
||||
|
assert(chunk_size_w % tcnt == 0); |
||||
|
assert(chunk_size_w % 16 == 0); |
||||
|
assert(chunk_size_w % tcnt * 16 == 0); |
||||
|
|
||||
|
// wait till everyone can start |
||||
|
ready_future->wait(); |
||||
|
|
||||
|
// the lower gids run once more if the chunks are not evenly distributable |
||||
|
uint32_t runs = chunk_cnt / gcnt + (chunk_cnt % gcnt > gid); |
||||
|
uint32_t barrier_idx = barrier_mode.compare("global") == 0 ? 0 : gid; |
||||
|
for(uint32_t i = 0; i < runs; ++i) { |
||||
|
trt->start_timer(1, tid * gcnt + gid); |
||||
|
pvc->start("scan_b", tid * gcnt + gid); |
||||
|
|
||||
|
// calculate pointers |
||||
|
size_t chunk_id = gid + gcnt * i; |
||||
|
base_t* chunk_ptr = get_sub_chunk_ptr(data_b, chunk_id, chunk_size_w, tid, tcnt); |
||||
|
uint16_t* mask_ptr = get_sub_mask_ptr(mask_b, chunk_id, chunk_size_w, tid, tcnt); |
||||
|
|
||||
|
if constexpr(simple){ |
||||
|
cache_.Access(chunk_ptr, chunk_size_b / tcnt); |
||||
|
} else { |
||||
|
const auto data = cache_.Access(chunk_ptr, chunk_size_b / tcnt); |
||||
|
|
||||
|
// wait on copy to complete - during this time other threads may |
||||
|
// continue with their calculation which leads to little impact |
||||
|
// and we will be faster if the cache is used |
||||
|
|
||||
|
data->WaitOnCompletion(); |
||||
|
|
||||
|
// obtain the data location from the cache entry |
||||
|
|
||||
|
base_t* data_ptr = data->GetDataLocation(); |
||||
|
|
||||
|
// nullptr is still a legal return value for CacheData::GetLocation() |
||||
|
// even after waiting, so this must be checked |
||||
|
|
||||
|
if (data_ptr == nullptr) { |
||||
|
data_ptr = chunk_ptr; |
||||
|
} |
||||
|
|
||||
|
filterNoCopy::apply_same(mask_ptr, nullptr, data_ptr, cmp_b, chunk_size_b / tcnt); |
||||
|
} |
||||
|
|
||||
|
pvc->stop("scan_b", tid * gcnt + gid); |
||||
|
trt->stop_timer(1, tid * gcnt + gid); |
||||
|
|
||||
|
bt->timed_wait(*(*sync_barrier)[barrier_idx], 1, tid * gcnt + gid); |
||||
|
} |
||||
|
(*(*sync_barrier)[barrier_idx]).arrive_and_drop(); |
||||
|
} |
||||
|
|
||||
|
void scan_a(size_t gid, size_t gcnt, size_t tid) { |
||||
|
size_t tcnt = thread_count_fi; |
||||
|
assert(chunk_size_w % tcnt == 0); |
||||
|
assert(chunk_size_w % 16 == 0); |
||||
|
assert(chunk_size_w % tcnt * 16 == 0); |
||||
|
|
||||
|
// wait till everyone can start |
||||
|
ready_future->wait(); |
||||
|
|
||||
|
// the lower gids run once more if the chunks are not evenly distributable |
||||
|
uint32_t runs = chunk_cnt / gcnt + (chunk_cnt % gcnt > gid); |
||||
|
uint32_t barrier_idx = barrier_mode.compare("global") == 0 ? 0 : gid; |
||||
|
|
||||
|
for(uint32_t i = 0; i < runs; ++i) { |
||||
|
trt->start_timer(0, tid * gcnt + gid); |
||||
|
pvc->start("scan_a", tid * gcnt + gid); |
||||
|
// calculate pointers |
||||
|
size_t chunk_id = gid + gcnt * i; |
||||
|
base_t* chunk_ptr = get_sub_chunk_ptr(data_a, chunk_id, chunk_size_w, tid, tcnt); |
||||
|
uint16_t* mask_ptr = get_sub_mask_ptr (mask_a, chunk_id, chunk_size_w, tid, tcnt); |
||||
|
|
||||
|
filter::apply_same(mask_ptr, nullptr, chunk_ptr, cmp_a, chunk_size_b / tcnt); |
||||
|
|
||||
|
pvc->stop("scan_a", tid * gcnt + gid); |
||||
|
trt->stop_timer(0, tid * gcnt + gid); |
||||
|
bt->timed_wait(*(*sync_barrier)[barrier_idx], 0, tid * gcnt + gid); |
||||
|
} |
||||
|
(*(*sync_barrier)[barrier_idx]).arrive_and_drop(); |
||||
|
} |
||||
|
|
||||
|
void aggr_j(size_t gid, size_t gcnt, size_t tid) { |
||||
|
size_t tcnt = thread_count_ag; |
||||
|
// wait till everyone can start |
||||
|
ready_future->wait(); |
||||
|
|
||||
|
// calculate values |
||||
|
__m512i aggregator = aggregation::OP::zero(); |
||||
|
// the lower gids run once more if the chunks are not evenly distributable |
||||
|
uint32_t runs = chunk_cnt / gcnt + (chunk_cnt % gcnt > gid); |
||||
|
uint32_t barrier_idx = barrier_mode.compare("global") == 0 ? 0 : gid; |
||||
|
for(uint32_t i = 0; i < runs; ++i) { |
||||
|
|
||||
|
bt->timed_wait(*(*sync_barrier)[barrier_idx], 2, tid * gcnt + gid); |
||||
|
trt->start_timer(2, tid * gcnt + gid); |
||||
|
pvc->start("aggr_j", tid * gcnt + gid); |
||||
|
|
||||
|
// calculate pointers |
||||
|
size_t chunk_id = gid + gcnt * i; |
||||
|
const base_t* chunk_ptr = get_sub_chunk_ptr(data_b, chunk_id, chunk_size_w, tid, tcnt); |
||||
|
|
||||
|
// access the cache for the given chunk which will have been accessed in scan_b |
||||
|
|
||||
|
const auto data = cache_.Access(chunk_ptr, chunk_size_b / tcnt); |
||||
|
|
||||
|
// wait on the caching task to complete, this will give time for other processes |
||||
|
// to make progress here which will therefore not hurt performance |
||||
|
|
||||
|
data->WaitOnCompletion(); |
||||
|
|
||||
|
// after the copy task has finished we obtain the pointer to the cached |
||||
|
// copy of data_b which is then used from now on |
||||
|
|
||||
|
const base_t* data_ptr = data->GetDataLocation(); |
||||
|
|
||||
|
// nullptr is still a legal return value for CacheData::GetLocation() |
||||
|
// even after waiting, so this must be checked |
||||
|
|
||||
|
if (data_ptr == nullptr) { |
||||
|
data_ptr = chunk_ptr; |
||||
|
std::cerr << "Cache Miss" << std::endl; |
||||
|
} |
||||
|
|
||||
|
uint16_t* mask_ptr_a = get_sub_mask_ptr (mask_a, chunk_id, chunk_size_w, tid, tcnt); |
||||
|
uint16_t* mask_ptr_b = get_sub_mask_ptr (mask_b, chunk_id, chunk_size_w, tid, tcnt); |
||||
|
|
||||
|
base_t tmp = _mm512_reduce_add_epi64(aggregator); |
||||
|
|
||||
|
if constexpr(simple){ |
||||
|
aggregator = aggregation::apply_masked(aggregator, data_ptr, mask_ptr_a, chunk_size_b / tcnt); |
||||
|
} else { |
||||
|
aggregator = aggregation::apply_masked(aggregator, data_ptr, mask_ptr_a, mask_ptr_b, chunk_size_b / tcnt); |
||||
|
} |
||||
|
|
||||
|
pvc->stop("aggr_j", tid * gcnt + gid); |
||||
|
trt->stop_timer(2, tid * gcnt + gid); |
||||
|
} |
||||
|
|
||||
|
// so threads with more runs dont wait for alerady finished threads |
||||
|
(*(*sync_barrier)[barrier_idx]).arrive_and_drop(); |
||||
|
|
||||
|
aggregation::happly(dest + (tid * gcnt + gid), aggregator); |
||||
|
} |
||||
|
}; |
@ -0,0 +1,387 @@ |
|||||
|
|
||||
|
#include <cassert> |
||||
|
#include <mutex> |
||||
|
#include <cstring> |
||||
|
#include <bitset> |
||||
|
|
||||
|
#include <numa.h> |
||||
|
|
||||
|
#include "filter.h" |
||||
|
#include "aggregation.h" |
||||
|
#include "vector_loader.h" |
||||
|
#include "timer_utils.h" |
||||
|
#include "barrier_utils.h" |
||||
|
#include "execution_modes.h" |
||||
|
|
||||
|
|
||||
|
template<typename base_t, bool simple> |
||||
|
class Query_Wrapper { |
||||
|
public: |
||||
|
// sync |
||||
|
std::shared_future<void>* ready_future; |
||||
|
|
||||
|
thread_runtime_timing* trt; |
||||
|
barrier_timing* bt; |
||||
|
|
||||
|
private: |
||||
|
// numa |
||||
|
uint32_t close_mem; |
||||
|
uint32_t far_mem; |
||||
|
|
||||
|
// data |
||||
|
size_t size_b; |
||||
|
size_t chunk_size_b; |
||||
|
size_t chunk_size_w; |
||||
|
size_t chunk_cnt; |
||||
|
base_t* data_a; |
||||
|
base_t* data_b; |
||||
|
base_t* dest; |
||||
|
|
||||
|
// ratios |
||||
|
uint32_t thread_count_fc; |
||||
|
uint32_t thread_count_fi; |
||||
|
uint32_t thread_count_ag; |
||||
|
uint32_t thread_group; |
||||
|
|
||||
|
// done bits |
||||
|
volatile uint8_t* ready_flag_a; |
||||
|
volatile uint8_t* ready_flag_b; |
||||
|
std::mutex ready_a_m; |
||||
|
std::mutex ready_b_m; |
||||
|
|
||||
|
// buffer |
||||
|
uint16_t* mask_a; |
||||
|
uint16_t* mask_b; |
||||
|
base_t** buffer_b; |
||||
|
|
||||
|
// params |
||||
|
base_t cmp_a; |
||||
|
base_t cmp_b; |
||||
|
bool no_copy; |
||||
|
PMode mode; |
||||
|
|
||||
|
// sync |
||||
|
std::unique_ptr<std::vector<std::barrier<barrier_completion_function>*>> sync_barrier; |
||||
|
std::string barrier_mode = BARRIER_MODE; |
||||
|
|
||||
|
using filterCopy = Filter<base_t, LEQ, load_mode::Aligned, true>; |
||||
|
using filterNoCopy = Filter<base_t, LEQ, load_mode::Aligned, false>; |
||||
|
using filter = Filter<base_t, GEQ, load_mode::Aligned, false>; |
||||
|
using aggregation = Aggregation<base_t, Sum, load_mode::Aligned>; |
||||
|
|
||||
|
public: |
||||
|
|
||||
|
|
||||
|
Query_Wrapper(std::shared_future<void>* rdy_fut, size_t workload_b, size_t chunk_size_b, base_t* data_a, |
||||
|
base_t* data_b, base_t* dest, uint32_t numa_close, uint32_t numa_far, uint32_t tc_fi, uint32_t tc_fc, uint32_t tc_ag, |
||||
|
PMode mode, uint32_t thread_group, base_t cmp_a = 50, base_t cmp_b = 42, bool no_copy = false) : |
||||
|
ready_future(rdy_fut), size_b(workload_b), chunk_size_b(chunk_size_b), data_a(data_a), data_b(data_b), |
||||
|
dest(dest), close_mem(numa_close), far_mem(numa_far), mode(mode), thread_group(thread_group), cmp_a(cmp_a), cmp_b(cmp_b), no_copy(no_copy){ |
||||
|
|
||||
|
chunk_size_w = chunk_size_b / sizeof(base_t); |
||||
|
chunk_cnt = size_b / chunk_size_b; |
||||
|
thread_count_fi = tc_fi; |
||||
|
thread_count_fc = tc_fc; |
||||
|
thread_count_ag = tc_ag; |
||||
|
|
||||
|
ready_flag_a = (volatile uint8_t *) numa_alloc_onnode( |
||||
|
chunk_cnt * thread_count_fi / 8 + ((chunk_cnt * thread_count_fi % 8) != 0), close_mem); |
||||
|
ready_flag_b = (volatile uint8_t *) numa_alloc_onnode( |
||||
|
chunk_cnt * thread_count_fc / 8 + ((chunk_cnt * thread_count_fc % 8) != 0), close_mem); |
||||
|
|
||||
|
mask_a = (uint16_t *) numa_alloc_onnode(size_b / sizeof(base_t), close_mem); |
||||
|
mask_b = (uint16_t *) numa_alloc_onnode(size_b / sizeof(base_t), close_mem); |
||||
|
|
||||
|
trt = new thread_runtime_timing(4, 20, close_mem); |
||||
|
bt = new barrier_timing(4, 20, close_mem); |
||||
|
reset_barriers(); |
||||
|
|
||||
|
if constexpr(BUFFER_LIMIT==1) { |
||||
|
// TODO size ok like that? |
||||
|
buffer_b = (base_t**) numa_alloc_onnode(size_b * sizeof(base_t*), close_mem); |
||||
|
buffer_b[0] = (base_t*) numa_alloc_onnode(thread_group * chunk_size_b, close_mem); |
||||
|
buffer_b[1] = (base_t*) numa_alloc_onnode(thread_group * chunk_size_b, close_mem); |
||||
|
} else { |
||||
|
buffer_b = (base_t **) numa_alloc_onnode(sizeof(base_t*), close_mem); |
||||
|
base_t* buffer_tmp = (base_t *) numa_alloc_onnode(size_b, close_mem); |
||||
|
*buffer_b = buffer_tmp; |
||||
|
} |
||||
|
}; |
||||
|
|
||||
|
void reset_barriers(){ |
||||
|
if(sync_barrier != nullptr) { |
||||
|
for(auto& barrier : *sync_barrier) { |
||||
|
delete barrier; |
||||
|
} |
||||
|
sync_barrier.reset(); |
||||
|
} |
||||
|
|
||||
|
sync_barrier = std::make_unique<std::vector<std::barrier<barrier_completion_function>*>>(thread_group); |
||||
|
uint32_t thread_count_sum = thread_count_ag + thread_count_fi + thread_count_fc; |
||||
|
uint32_t barrier_count = barrier_mode.compare("global") == 0 ? 1 : thread_group; |
||||
|
uint32_t barrier_thread_count; |
||||
|
|
||||
|
if constexpr(simple){ |
||||
|
barrier_thread_count = (thread_group / barrier_count) * |
||||
|
(mode == PMode::expl_copy ? thread_count_sum : (thread_count_ag + thread_count_fi)); |
||||
|
} else { |
||||
|
barrier_thread_count = (thread_group / barrier_count) * thread_count_sum; |
||||
|
} |
||||
|
for(uint32_t i = 0; i < barrier_count; ++i) { |
||||
|
(*sync_barrier)[i] = new std::barrier<barrier_completion_function>(barrier_thread_count); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
|
||||
|
void clear_buffers () { |
||||
|
std::memset((void*)ready_flag_a, 0x00, chunk_cnt * thread_count_fi / 8 + ((chunk_cnt * thread_count_fi % 8) != 0)); |
||||
|
std::memset((void*)ready_flag_b, 0x00, chunk_cnt * thread_count_fc / 8 + ((chunk_cnt * thread_count_fc % 8) != 0)); |
||||
|
|
||||
|
std::memset(mask_a, 0x00, size_b / sizeof(base_t)); |
||||
|
std::memset(mask_b, 0x00, size_b / sizeof(base_t)); |
||||
|
if constexpr(BUFFER_LIMIT==1) { |
||||
|
std::memset(buffer_b[0], 0x00, thread_group * chunk_size_b); |
||||
|
std::memset(buffer_b[1], 0x00, thread_group * chunk_size_b); |
||||
|
} else { |
||||
|
std::memset(*buffer_b, 0x00, size_b); |
||||
|
} |
||||
|
|
||||
|
trt->reset_accumulator(); |
||||
|
bt->reset_accumulator(); |
||||
|
reset_barriers(); |
||||
|
}; |
||||
|
|
||||
|
~Query_Wrapper() { |
||||
|
numa_free((void*)ready_flag_a, |
||||
|
chunk_cnt * thread_count_fi / 8 + ((chunk_cnt * thread_count_fi % 8) != 0)); |
||||
|
numa_free((void*)ready_flag_b, |
||||
|
chunk_cnt * thread_count_fc / 8 + ((chunk_cnt * thread_count_fc % 8) != 0)); |
||||
|
|
||||
|
numa_free(mask_a, size_b / sizeof(base_t)); |
||||
|
numa_free(mask_b, size_b / sizeof(base_t)); |
||||
|
if constexpr(BUFFER_LIMIT==1) { |
||||
|
numa_free(buffer_b[0], thread_group * chunk_size_b); |
||||
|
numa_free(buffer_b[1], thread_group * chunk_size_b); |
||||
|
numa_free(buffer_b, size_b * sizeof(base_t*)); |
||||
|
} else { |
||||
|
numa_free(*buffer_b, size_b); |
||||
|
} |
||||
|
|
||||
|
delete trt; |
||||
|
for(auto& barrier : *sync_barrier) { |
||||
|
delete barrier; |
||||
|
} |
||||
|
delete bt; |
||||
|
|
||||
|
}; |
||||
|
|
||||
|
private: |
||||
|
static inline base_t* get_sub_chunk_ptr(base_t* base_ptr, size_t chunk_id, size_t chunk_size_w, size_t tid, |
||||
|
size_t tcnt) { |
||||
|
base_t* chunk_ptr = base_ptr + chunk_id * chunk_size_w; |
||||
|
return chunk_ptr + tid * (chunk_size_w / tcnt); |
||||
|
} |
||||
|
|
||||
|
static inline uint16_t* get_sub_mask_ptr(uint16_t* base_ptr, size_t chunk_id, size_t chunk_size_w, size_t tid, |
||||
|
size_t tcnt) { |
||||
|
// 16 integer are addressed with one uint16_t in mask buffer |
||||
|
size_t offset = chunk_id * chunk_size_w + tid * (chunk_size_w / tcnt); |
||||
|
return base_ptr + (offset / 16); |
||||
|
} |
||||
|
|
||||
|
static bool bit_at(volatile uint8_t* bitmap, uint32_t bitpos) { |
||||
|
uint8_t value = bitmap[bitpos / 8]; |
||||
|
switch(bitpos % 8) { |
||||
|
case 0: return value & 0b00000001; |
||||
|
case 1: return value & 0b00000010; |
||||
|
case 2: return value & 0b00000100; |
||||
|
case 3: return value & 0b00001000; |
||||
|
case 4: return value & 0b00010000; |
||||
|
case 5: return value & 0b00100000; |
||||
|
case 6: return value & 0b01000000; |
||||
|
case 7: return value & 0b10000000; |
||||
|
default: return false; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
static void set_bit_at(volatile uint8_t* bitmap, std::mutex& mutex, uint32_t bitpos) { |
||||
|
mutex.lock(); |
||||
|
switch(bitpos % 8) { |
||||
|
case 0: bitmap[bitpos / 8] |= 0b00000001;break; |
||||
|
case 1: bitmap[bitpos / 8] |= 0b00000010;break; |
||||
|
case 2: bitmap[bitpos / 8] |= 0b00000100;break; |
||||
|
case 3: bitmap[bitpos / 8] |= 0b00001000;break; |
||||
|
case 4: bitmap[bitpos / 8] |= 0b00010000;break; |
||||
|
case 5: bitmap[bitpos / 8] |= 0b00100000;break; |
||||
|
case 6: bitmap[bitpos / 8] |= 0b01000000;break; |
||||
|
case 7: bitmap[bitpos / 8] |= 0b10000000;break; |
||||
|
} |
||||
|
mutex.unlock(); |
||||
|
} |
||||
|
|
||||
|
public: |
||||
|
|
||||
|
static base_t checksum(base_t* a, base_t* b, base_t cmp_a, base_t cmp_b, size_t size_b) { |
||||
|
base_t sum = 0; |
||||
|
for(int i = 0; i < size_b / sizeof(base_t); ++i) { |
||||
|
if(a[i] >= cmp_a && b[i] <= cmp_b) { |
||||
|
sum += b[i]; |
||||
|
} |
||||
|
} |
||||
|
return sum; |
||||
|
} |
||||
|
|
||||
|
static void checkmask(uint16_t* mask, base_t cmp, base_t* data, size_t size_b, bool leq) { |
||||
|
uint32_t cnt = 0; |
||||
|
for(int i = 0; i < size_b / sizeof(base_t); ++i) { |
||||
|
if(leq) { |
||||
|
if(((data[i] <= cmp) != bit_at((uint8_t*)mask, i))) { |
||||
|
++cnt; |
||||
|
} |
||||
|
} else { |
||||
|
if(((data[i] >= cmp) != bit_at((uint8_t*)mask, i))) { |
||||
|
++cnt; |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
static void checkmask_16(uint16_t* mask, base_t cmp, base_t* data, size_t size_b, bool leq) { |
||||
|
for(int i = 0; i < size_b / sizeof(base_t) / 16 ; ++i) { |
||||
|
std::bitset<16> m(mask[i]); |
||||
|
uint16_t ch = 0; |
||||
|
for(int j = 0; j < 16; ++j) { |
||||
|
if(data[i*16 + j] <= cmp) { |
||||
|
ch |= 0x1 << j; |
||||
|
} |
||||
|
} |
||||
|
std::bitset<16> c(ch); |
||||
|
|
||||
|
std::cout << "act " << m << std::endl; |
||||
|
std::cout << "rea " << c << std::endl << std::endl; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
|
||||
|
void scan_b(size_t gid, size_t gcnt, size_t tid) { |
||||
|
size_t tcnt = thread_count_fc; |
||||
|
assert(chunk_size_w % tcnt == 0); |
||||
|
assert(chunk_size_w % 16 == 0); |
||||
|
assert(chunk_size_w % tcnt * 16 == 0); |
||||
|
|
||||
|
// wait till everyone can start |
||||
|
ready_future->wait(); |
||||
|
|
||||
|
// the lower gids run once more if the chunks are not evenly distributable |
||||
|
uint32_t runs = chunk_cnt / gcnt + (chunk_cnt % gcnt > gid); |
||||
|
uint32_t barrier_idx = barrier_mode.compare("global") == 0 ? 0 : gid; |
||||
|
for(uint32_t i = 0; i < runs; ++i) { |
||||
|
trt->start_timer(1, tid * gcnt + gid); |
||||
|
|
||||
|
// calculate pointers |
||||
|
size_t chunk_id = gid + gcnt * i; |
||||
|
base_t* chunk_ptr = get_sub_chunk_ptr(data_b , chunk_id, chunk_size_w, tid, tcnt); |
||||
|
uint16_t* mask_ptr = get_sub_mask_ptr (mask_b , chunk_id, chunk_size_w, tid, tcnt); |
||||
|
|
||||
|
if constexpr(simple){ |
||||
|
base_t* buffer_ptr; |
||||
|
if constexpr(BUFFER_LIMIT==1) { |
||||
|
buffer_ptr = get_sub_chunk_ptr(buffer_b[i % 2], gid, chunk_size_w, tid, tcnt); |
||||
|
} else { |
||||
|
buffer_ptr = get_sub_chunk_ptr(*buffer_b, chunk_id, chunk_size_w, tid, tcnt); |
||||
|
} |
||||
|
std::memcpy(buffer_ptr, chunk_ptr, chunk_size_b / tcnt); |
||||
|
} else { |
||||
|
if(no_copy) { |
||||
|
filterNoCopy::apply_same(mask_ptr, nullptr, chunk_ptr, cmp_b, chunk_size_b / tcnt); |
||||
|
} else { |
||||
|
base_t* buffer_ptr; |
||||
|
if constexpr(BUFFER_LIMIT==1) { |
||||
|
buffer_ptr = get_sub_chunk_ptr(buffer_b[i % 2], gid, chunk_size_w, tid, tcnt); |
||||
|
} else { |
||||
|
buffer_ptr = get_sub_chunk_ptr(*buffer_b, chunk_id, chunk_size_w, tid, tcnt); |
||||
|
} |
||||
|
filterCopy::apply_same(mask_ptr, buffer_ptr, chunk_ptr, cmp_b, chunk_size_b / tcnt); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
trt->stop_timer(1, tid * gcnt + gid); |
||||
|
bt->timed_wait(*(*sync_barrier)[barrier_idx], 1, tid * gcnt + gid); |
||||
|
|
||||
|
} |
||||
|
(*(*sync_barrier)[barrier_idx]).arrive_and_drop(); |
||||
|
|
||||
|
} |
||||
|
|
||||
|
void scan_a(size_t gid, size_t gcnt, size_t tid) { |
||||
|
size_t tcnt = thread_count_fi; |
||||
|
assert(chunk_size_w % tcnt == 0); |
||||
|
assert(chunk_size_w % 16 == 0); |
||||
|
assert(chunk_size_w % tcnt * 16 == 0); |
||||
|
|
||||
|
// wait till everyone can start |
||||
|
ready_future->wait(); |
||||
|
|
||||
|
// the lower gids run once more if the chunks are not evenly distributable |
||||
|
uint32_t runs = chunk_cnt / gcnt + (chunk_cnt % gcnt > gid); |
||||
|
uint32_t barrier_idx = barrier_mode.compare("global") == 0 ? 0 : gid; |
||||
|
for(uint32_t i = 0; i < runs; ++i) { |
||||
|
trt->start_timer(0, tid * gcnt + gid); |
||||
|
// calculate pointers |
||||
|
size_t chunk_id = gid + gcnt * i; |
||||
|
base_t* chunk_ptr = get_sub_chunk_ptr(data_a, chunk_id, chunk_size_w, tid, tcnt); |
||||
|
uint16_t* mask_ptr = get_sub_mask_ptr (mask_a, chunk_id, chunk_size_w, tid, tcnt); |
||||
|
|
||||
|
filter::apply_same(mask_ptr, nullptr, chunk_ptr, cmp_a, chunk_size_b / tcnt); |
||||
|
|
||||
|
trt->stop_timer(0, tid * gcnt + gid); |
||||
|
bt->timed_wait(*(*sync_barrier)[barrier_idx], 0, tid * gcnt + gid); |
||||
|
} |
||||
|
(*(*sync_barrier)[barrier_idx]).arrive_and_drop(); |
||||
|
} |
||||
|
|
||||
|
void aggr_j(size_t gid, size_t gcnt, size_t tid) { |
||||
|
size_t tcnt = thread_count_ag; |
||||
|
// wait till everyone can start |
||||
|
ready_future->wait(); |
||||
|
|
||||
|
// calculate values |
||||
|
__m512i aggregator = aggregation::OP::zero(); |
||||
|
// the lower gids run once more if the chunks are not evenly distributable |
||||
|
uint32_t runs = chunk_cnt / gcnt + (chunk_cnt % gcnt > gid); |
||||
|
uint32_t barrier_idx = barrier_mode.compare("global") == 0 ? 0 : gid; |
||||
|
for(uint32_t i = 0; i < runs; ++i) { |
||||
|
|
||||
|
bt->timed_wait(*(*sync_barrier)[barrier_idx], 2, tid * gcnt + gid); |
||||
|
trt->start_timer(2, tid * gcnt + gid); |
||||
|
|
||||
|
// calculate pointers |
||||
|
size_t chunk_id = gid + gcnt * i; |
||||
|
base_t* chunk_ptr; |
||||
|
if(no_copy) { |
||||
|
chunk_ptr = get_sub_chunk_ptr(data_b, chunk_id, chunk_size_w, tid, tcnt); |
||||
|
} else { |
||||
|
if constexpr(BUFFER_LIMIT==1) { |
||||
|
chunk_ptr = get_sub_chunk_ptr(buffer_b[i%2], gid, chunk_size_w, tid, tcnt); |
||||
|
} else { |
||||
|
chunk_ptr = get_sub_chunk_ptr(*buffer_b, chunk_id, chunk_size_w, tid, tcnt); |
||||
|
} |
||||
|
} |
||||
|
uint16_t* mask_ptr_a = get_sub_mask_ptr (mask_a, chunk_id, chunk_size_w, tid, tcnt); |
||||
|
uint16_t* mask_ptr_b = get_sub_mask_ptr (mask_b, chunk_id, chunk_size_w, tid, tcnt); |
||||
|
|
||||
|
base_t tmp = _mm512_reduce_add_epi64(aggregator); |
||||
|
if constexpr(simple){ |
||||
|
aggregator = aggregation::apply_masked(aggregator, chunk_ptr, mask_ptr_a, chunk_size_b / tcnt); |
||||
|
} else { |
||||
|
aggregator = aggregation::apply_masked(aggregator, chunk_ptr, mask_ptr_a, mask_ptr_b, chunk_size_b / tcnt); |
||||
|
} |
||||
|
trt->stop_timer(2, tid * gcnt + gid); |
||||
|
} |
||||
|
|
||||
|
// so threads with more runs dont wait for finished threads |
||||
|
(*(*sync_barrier)[barrier_idx]).arrive_and_drop(); |
||||
|
|
||||
|
aggregation::happly(dest + (tid * gcnt + gid), aggregator); |
||||
|
} |
||||
|
}; |
@ -0,0 +1,80 @@ |
|||||
|
#pragma once |
||||
|
#include <cstdlib> |
||||
|
#include <ctime> |
||||
|
#include <cstdint> |
||||
|
#include <type_traits> |
||||
|
#include <random> |
||||
|
#include <chrono> |
||||
|
|
||||
|
#include <immintrin.h> |
||||
|
|
||||
|
/// @brief Fills a given array with random generated integers. |
||||
|
/// @tparam base_t Datatype of the array |
||||
|
/// @param dest Pointer to the array |
||||
|
/// @param size Size of the array |
||||
|
/// @param min Minumum value of the generated integers |
||||
|
/// @param max Maximum value of the generated integers |
||||
|
template<typename base_t> |
||||
|
void fill(base_t * dest, uint64_t size, base_t min, base_t max) { |
||||
|
std::srand(std::time(nullptr)); |
||||
|
for(uint64_t i = 0; i < size/sizeof(base_t); ++i) { |
||||
|
dest[i] = (std::rand() % (max - min)) + min; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
/// @brief Fills a given array with random generated integers using the mersenne twister engine (type std::mt19937). |
||||
|
/// @tparam base_t Datatype of the array |
||||
|
/// @param dest Pointer to the array |
||||
|
/// @param size Size of the array |
||||
|
/// @param min Minumum value of the generated integers |
||||
|
/// @param max Maximum value of the generated integers |
||||
|
template <typename T> |
||||
|
void fill_mt(T* array, uint64_t size, T min, T max, uint64_t int_seed = 0) { |
||||
|
static_assert(std::is_integral<T>::value, "Data type is not integral."); |
||||
|
|
||||
|
size = size / sizeof(T); |
||||
|
|
||||
|
std::mt19937::result_type seed; |
||||
|
if (int_seed == 0) { |
||||
|
std::random_device rd; |
||||
|
seed = rd() ^ ( |
||||
|
(std::mt19937::result_type) std::chrono::duration_cast<std::chrono::seconds>( |
||||
|
std::chrono::system_clock::now().time_since_epoch()).count() + |
||||
|
(std::mt19937::result_type) std::chrono::duration_cast<std::chrono::microseconds>( |
||||
|
std::chrono::high_resolution_clock::now().time_since_epoch()).count()); |
||||
|
} else seed = int_seed; |
||||
|
|
||||
|
std::mt19937 gen(seed); |
||||
|
std::uniform_int_distribution<T> distrib(min, max); |
||||
|
|
||||
|
for (uint64_t j = 0; j < size; ++j) { |
||||
|
array[j] = distrib(gen); |
||||
|
} |
||||
|
|
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* @brief Checks if two arrays of the integral type *T* contain the same values |
||||
|
* |
||||
|
* @tparam T Integral type of *array0* and *array1* |
||||
|
* @param array0 Array 0 to check |
||||
|
* @param array1 Array 1 to check |
||||
|
* @param size_b Size of the two arrays in byte |
||||
|
* @param verbose Decides if outputs are verbose of not (print every not matching numbers with their index) |
||||
|
* @return bool Weathor or not the content is equal or not |
||||
|
*/ |
||||
|
template <typename T> |
||||
|
typename std::enable_if<std::is_integral<T>::value, bool>::type |
||||
|
check_same(T* array0, T* array1, size_t size_b, bool verbose) { |
||||
|
for(uint64_t i = 0; i <= size_b / sizeof(T); i += 64 / sizeof(T)) { |
||||
|
__m512i vec0 = _mm512_stream_load_si512(array0 + i); |
||||
|
__m512i vec1 = _mm512_stream_load_si512(array1 + i); |
||||
|
|
||||
|
__mmask8 res = _mm512_cmpeq_epi64_mask(vec0, vec1); |
||||
|
} |
||||
|
|
||||
|
//TODO complete function |
||||
|
|
||||
|
return false; |
||||
|
} |
||||
|
|
@ -0,0 +1,73 @@ |
|||||
|
#pragma once |
||||
|
|
||||
|
#include <cstdint> |
||||
|
#include <numa.h> |
||||
|
#include <barrier> |
||||
|
#include <chrono> |
||||
|
|
||||
|
#define BARRIER_TIMINGS 1 |
||||
|
|
||||
|
|
||||
|
struct barrier_completion_function { |
||||
|
inline void operator() () { |
||||
|
return; |
||||
|
} |
||||
|
}; |
||||
|
|
||||
|
struct barrier_timing { |
||||
|
|
||||
|
uint32_t time_points, time_threads; |
||||
|
double** time_accumulator; |
||||
|
|
||||
|
barrier_timing(uint32_t timing_points, uint32_t timing_threads, uint32_t memory_node) { |
||||
|
#ifdef BARRIER_TIMINGS |
||||
|
time_points = timing_points; |
||||
|
time_threads = timing_threads; |
||||
|
time_accumulator = (double**) numa_alloc_onnode(timing_points * sizeof(double*), memory_node); |
||||
|
for(uint32_t i = 0; i < timing_points; ++i) { |
||||
|
time_accumulator[i] = (double*) numa_alloc_onnode(timing_threads * sizeof(double), memory_node); |
||||
|
} |
||||
|
#endif |
||||
|
} |
||||
|
|
||||
|
~barrier_timing() { |
||||
|
#ifdef BARRIER_TIMINGS |
||||
|
for(uint32_t i = 0; i < time_points; ++i) { |
||||
|
numa_free(time_accumulator[i], time_threads * sizeof(double)); |
||||
|
} |
||||
|
numa_free(time_accumulator, time_points * sizeof(double*)); |
||||
|
#endif |
||||
|
} |
||||
|
|
||||
|
void reset_accumulator() { |
||||
|
#ifdef BARRIER_TIMINGS |
||||
|
for(uint32_t i = 0; i < time_points; ++i){ |
||||
|
for(uint32_t j = 0; j < time_threads; ++j){ |
||||
|
time_accumulator[i][j] = 0.0; |
||||
|
}} |
||||
|
#endif |
||||
|
} |
||||
|
|
||||
|
double summarize_time(uint32_t time_point) { |
||||
|
#ifdef BARRIER_TIMINGS |
||||
|
double sum = 0.0; |
||||
|
for(uint32_t i = 0; i < time_threads; ++i) { |
||||
|
sum += time_accumulator[time_point][i]; |
||||
|
} |
||||
|
return sum; |
||||
|
#endif |
||||
|
} |
||||
|
|
||||
|
void timed_wait(std::barrier<struct barrier_completion_function>& barrier, uint32_t point_id, uint32_t thread_id) { |
||||
|
#ifdef BARRIER_TIMINGS |
||||
|
auto before_barrier = std::chrono::steady_clock::now(); |
||||
|
#endif |
||||
|
barrier.arrive_and_wait(); |
||||
|
#ifdef BARRIER_TIMINGS |
||||
|
auto after_barrier = std::chrono::steady_clock::now(); |
||||
|
uint64_t barrier_wait_time = std::chrono::duration_cast<std::chrono::nanoseconds>(after_barrier - before_barrier).count(); |
||||
|
double seconds = barrier_wait_time / (1000.0 * 1000.0 * 1000.0); |
||||
|
time_accumulator[point_id][thread_id] += seconds; |
||||
|
#endif |
||||
|
} |
||||
|
}; |
@ -0,0 +1,33 @@ |
|||||
|
/** |
||||
|
* @file const.h |
||||
|
* @author André Berthold |
||||
|
* @brief Defines handy constants. |
||||
|
* @version 0.1 |
||||
|
* @date 2023-05-25 |
||||
|
* |
||||
|
* @copyright Copyright (c) 2023 |
||||
|
* |
||||
|
*/ |
||||
|
|
||||
|
#pragma once |
||||
|
|
||||
|
#include <cstdint> |
||||
|
#include <immintrin.h> |
||||
|
|
||||
|
constexpr size_t VECTOR_SIZE_I = 512; |
||||
|
constexpr size_t VECTOR_SIZE_B = VECTOR_SIZE_I / 8; |
||||
|
constexpr size_t VECTOR_SIZE_H = VECTOR_SIZE_B / sizeof(uint32_t); |
||||
|
constexpr size_t VECTOR_SIZE_W = VECTOR_SIZE_B / sizeof(uint64_t); |
||||
|
|
||||
|
template<typename T> |
||||
|
constexpr size_t VECTOR_SIZE() { |
||||
|
return VECTOR_SIZE_B / sizeof(T); |
||||
|
} |
||||
|
|
||||
|
template<typename T> |
||||
|
constexpr size_t V_MASK_SIZE() { |
||||
|
return VECTOR_SIZE<T>() / 8; |
||||
|
} |
||||
|
|
||||
|
|
||||
|
const __mmask16 full_m16 = _mm512_int2mask(0xFFFF); |
@ -0,0 +1,82 @@ |
|||||
|
#pragma once |
||||
|
|
||||
|
#include <cstdint> |
||||
|
#include <thread> |
||||
|
#include <cassert> |
||||
|
#include <iostream> |
||||
|
#include <vector> |
||||
|
#include <utility> |
||||
|
|
||||
|
/** Sets all bits in a given cpu_set_t between L and H (condition L <= H)*/ |
||||
|
#define CPU_BETWEEN(L, H, SET) assert(L <= H); for(; L < H; ++L) {CPU_SET(L, SET);} |
||||
|
|
||||
|
/** |
||||
|
* Applies the affinity defined in set to the thread, through pthread library |
||||
|
* calls. If it fails it wites the problem to stderr and terminated the program. |
||||
|
*/ |
||||
|
inline void pin_thread(std::thread& thread, cpu_set_t* set) { |
||||
|
int error_code = pthread_setaffinity_np(thread.native_handle(), sizeof(cpu_set_t), set); |
||||
|
if (error_code != 0) { |
||||
|
std::cerr << "Error calling pthread_setaffinity_np in copy_pool assignment: " << error_code << std::endl; |
||||
|
exit(-1); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* Returns the cpu id of the thread_id-th cpu in a given (multi)range. Thread_id |
||||
|
* greater than the number of cpus in the (multi)range are valid. In this case |
||||
|
* the (thread_id % #cpus in the range)-th cpu in the range is returned. |
||||
|
*/ |
||||
|
int get_cpu_id(int thread_id, const std::vector<std::pair<int, int>>& range) { |
||||
|
int subrange_size = range[0].second - range[0].first; |
||||
|
|
||||
|
int i = 0; |
||||
|
while(subrange_size <= thread_id) { |
||||
|
thread_id -= subrange_size; |
||||
|
i = (i + 1) % range.size(); |
||||
|
subrange_size = range[i].second - range[i].first; |
||||
|
} |
||||
|
return thread_id + range[i].first; |
||||
|
} |
||||
|
|
||||
|
/*inline void cpu_set_between(cpu_set_t* set, uint32_t low, uint32_t high) { |
||||
|
assert(low != high); |
||||
|
if (low > high) std::swap(low, high); |
||||
|
|
||||
|
for(; low < high; ++low) { |
||||
|
CPU_SET(low, set); |
||||
|
} |
||||
|
}*/ |
||||
|
|
||||
|
/** |
||||
|
* Pins the given thread to the thread_id-th cpu in the given range. |
||||
|
*/ |
||||
|
void pin_thread_in_range(std::thread& thread, int thread_id, std::vector<std::pair<int, int>>& range) { |
||||
|
cpu_set_t set; |
||||
|
CPU_ZERO(&set); |
||||
|
CPU_SET(get_cpu_id(thread_id, range), &set); |
||||
|
|
||||
|
pin_thread(thread, &set); |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* Pins the given thread to all cpus in the given range. |
||||
|
*/ |
||||
|
void pin_thread_in_range(std::thread& thread, std::vector<std::pair<int, int>>& range) { |
||||
|
cpu_set_t set; |
||||
|
CPU_ZERO(&set); |
||||
|
for(auto r : range) { CPU_BETWEEN(r.first, r.second, &set); } |
||||
|
|
||||
|
pin_thread(thread, &set); |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* Pins the given thread to all cpu ids between low (incl.) and high (excl.). |
||||
|
*/ |
||||
|
inline void pin_thread_between(std::thread& thread, uint32_t low, uint32_t high) { |
||||
|
cpu_set_t set; |
||||
|
CPU_ZERO(&set); |
||||
|
CPU_BETWEEN(low, high, &set); |
||||
|
|
||||
|
pin_thread(thread, &set); |
||||
|
} |
@ -0,0 +1,89 @@ |
|||||
|
#include <string> |
||||
|
|
||||
|
enum PMode{no_copy = 0, hbm = 1, expl_copy = 2}; |
||||
|
struct mode_manager { |
||||
|
static inline PMode inc(PMode value) { |
||||
|
return static_cast<PMode>(value + 1); |
||||
|
}; |
||||
|
static inline bool pred(PMode value) { |
||||
|
return no_copy <= value && value <= expl_copy; |
||||
|
}; |
||||
|
static std::string string(PMode value) { |
||||
|
switch(value) { |
||||
|
case no_copy: return "no_copy"; |
||||
|
case hbm: return "hbm_pre"; |
||||
|
case expl_copy:return "expl_co"; |
||||
|
} return "no_copy"; |
||||
|
}; |
||||
|
}; |
||||
|
|
||||
|
#define SIMPLE_Q 0 |
||||
|
#define COMPLEX_Q 1 |
||||
|
|
||||
|
#define SCAN_A 0 |
||||
|
#define SCAN_B 1 |
||||
|
#define AGGR_J 2 |
||||
|
|
||||
|
enum NewPMode{DRAM_base = 0, HBM_base = 1, Mixed_base = 2, Prefetch = 3}; |
||||
|
struct new_mode_manager { |
||||
|
/*constexpr static int thread_counts[2][4][3] = { |
||||
|
//simple query |
||||
|
//scan_a, scan_b, aggr_j |
||||
|
{{3, 0, 3}, // DRAM_base |
||||
|
{3, 0, 3}, // HBM_base |
||||
|
{3, 0, 3}, // Mixed_base |
||||
|
{1, 4, 1}},// Prefetching |
||||
|
//complex query |
||||
|
{{1, 4, 1}, // DRAM_base |
||||
|
{1, 4, 1}, // HBM_base |
||||
|
{1, 4, 1}, // Mixed_base |
||||
|
{1, 4, 1}},// Prefetching |
||||
|
};*/ |
||||
|
|
||||
|
/*constexpr static int thread_counts[2][4][3] = { |
||||
|
//simple query |
||||
|
//scan_a, scan_b, aggr_j |
||||
|
{{2, 0, 4}, // DRAM_base |
||||
|
{2, 0, 4}, // HBM_base |
||||
|
{2, 0, 4}, // Mixed_base |
||||
|
{1, 4, 1}},// Prefetching |
||||
|
//complex query |
||||
|
{{1, 4, 1}, // DRAM_base |
||||
|
{1, 4, 1}, // HBM_base |
||||
|
{1, 4, 1}, // Mixed_base |
||||
|
{1, 4, 1}},// Prefetching |
||||
|
};*/ |
||||
|
|
||||
|
constexpr static int thread_counts[2][4][3] = { |
||||
|
//simple query |
||||
|
//scan_a, scan_b, aggr_j |
||||
|
{{4, 0, 2}, // DRAM_base |
||||
|
{4, 0, 2}, // HBM_base |
||||
|
{4, 0, 2}, // Mixed_base |
||||
|
{1, 4, 1}},// Prefetching |
||||
|
//complex query |
||||
|
{{1, 4, 1}, // DRAM_base |
||||
|
{1, 4, 1}, // HBM_base |
||||
|
{1, 4, 1}, // Mixed_base |
||||
|
{1, 4, 1}},// Prefetching |
||||
|
}; |
||||
|
|
||||
|
static inline NewPMode inc(NewPMode value) { |
||||
|
return static_cast<NewPMode>(value + 1); |
||||
|
}; |
||||
|
static inline bool pred(NewPMode value) { |
||||
|
return DRAM_base <= value && value <= Prefetch; |
||||
|
}; |
||||
|
static int thread_count(uint8_t query_type, NewPMode mode, uint8_t thread_type){ |
||||
|
if(query_type > 1) query_type = 1; |
||||
|
if(thread_type > 2) thread_type = 2; |
||||
|
return (thread_counts[query_type][mode][thread_type]); |
||||
|
}; |
||||
|
static std::string string(NewPMode value) { |
||||
|
switch(value) { |
||||
|
case DRAM_base: return "DRAM_Baseline"; |
||||
|
case HBM_base: return "HBM_Baseline"; |
||||
|
case Mixed_base: return "DRAM_HBM_Baseline"; |
||||
|
} return "Q-d_Prefetching"; |
||||
|
}; |
||||
|
}; |
@ -0,0 +1,76 @@ |
|||||
|
/** |
||||
|
* @file file_output.h |
||||
|
* @author André Berthold |
||||
|
* @brief Implements a template-function that accepts an arbitrary number of parameters that should be printed |
||||
|
* @version 0.1 |
||||
|
* @date 2023-05-25 |
||||
|
* |
||||
|
* @copyright Copyright (c) 2023 |
||||
|
* |
||||
|
*/ |
||||
|
#pragma once |
||||
|
|
||||
|
#include <fstream> |
||||
|
#include <string> |
||||
|
#include <type_traits> |
||||
|
|
||||
|
#include "iterable_range.h" |
||||
|
|
||||
|
template<class T> |
||||
|
inline constexpr bool is_numeric_v = std::disjunction< |
||||
|
std::is_integral<T>, |
||||
|
std::is_floating_point<T>>::value; |
||||
|
|
||||
|
/** |
||||
|
* @brief Converts a parameter to a string by either using it directly or its member current (if it is of type Labeled) |
||||
|
* as parameter to the std::string-Constructor. |
||||
|
* |
||||
|
* @tparam T Type of the parameter |
||||
|
* @param value Parameter to be converted |
||||
|
* @return std::string The converted parameter |
||||
|
*/ |
||||
|
template<typename T> |
||||
|
inline std::string to_string(T value) { |
||||
|
if constexpr(std::is_base_of<Labeled, T>::value){ |
||||
|
// integrals cannot be use in the string constructor and must be translated by the std::to_string-function |
||||
|
if constexpr (is_numeric_v<decltype(value.current)>) { |
||||
|
return std::to_string(value.current); |
||||
|
} else { |
||||
|
return std::string(value.current); |
||||
|
} |
||||
|
} else { |
||||
|
// integrals cannot be use in the string constructor and must be translated by the std::to_string-function |
||||
|
if constexpr (is_numeric_v<decltype(value)>) { |
||||
|
return std::to_string(value); |
||||
|
} else { |
||||
|
return std::string(value); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* @brief This function wites the content of *val* to *file*. Terminates terecursive function definition. |
||||
|
* |
||||
|
* @tparam type Type of the paramter *val* (is usually implicitly defeined) |
||||
|
* @param file File that is written to |
||||
|
* @param val Value that is translated to a char stream and written to the file |
||||
|
*/ |
||||
|
template<typename type> |
||||
|
inline void print_to_file(std::ofstream &file, type val) { |
||||
|
file << to_string(val) << std::endl; |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* @brief This function wites the content of *val* and that content if *vals* to *file*. |
||||
|
* |
||||
|
* @tparam type Type of the paramter *val* (is usually implicitly defeined) |
||||
|
* @tparam types Parameter pack that describes the types of *vals* |
||||
|
* @param file File that is written to |
||||
|
* @param val Value that is translated to a char stream and written to the file |
||||
|
* @param vals Paramater pack of values that are gonna be printed to the file |
||||
|
*/ |
||||
|
template<typename type, typename... types> |
||||
|
inline void print_to_file(std::ofstream &file, type val, types ... vals) { |
||||
|
file << to_string(val) << ","; |
||||
|
print_to_file(file, vals...); |
||||
|
} |
@ -0,0 +1,208 @@ |
|||||
|
#pragma once |
||||
|
|
||||
|
#include <cstdint> |
||||
|
#include <type_traits> |
||||
|
#include <string> |
||||
|
|
||||
|
|
||||
|
constexpr auto NO_NEXT = "false"; |
||||
|
|
||||
|
/** |
||||
|
* @brief Class that adds an label member-parameter to a sub-class |
||||
|
* |
||||
|
*/ |
||||
|
class Labeled { |
||||
|
public: |
||||
|
std::string label; |
||||
|
public: |
||||
|
Labeled(std::string str) : label(str) {}; |
||||
|
Labeled(const char* str) { this->label = std::string(str); }; |
||||
|
}; |
||||
|
|
||||
|
/** |
||||
|
* @brief Converts a parameter to a string by either reading the member label (if it is of type Labeled) or using it |
||||
|
* as parameter to the std::string-Constructor. |
||||
|
* |
||||
|
* @tparam T Type of the parameter |
||||
|
* @param value Parameter to be converted |
||||
|
* @return std::string The converted parameter |
||||
|
*/ |
||||
|
template<typename T> |
||||
|
inline std::string generateHead(T value) { |
||||
|
if constexpr(std::is_base_of<Labeled, T>::value){ |
||||
|
return value.label; |
||||
|
} else { |
||||
|
return std::string(value); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* @brief Converts a parameter-pack to a string calling genarateHead(T) on every parameter and concatenatin the results. |
||||
|
* |
||||
|
* @tparam T Type of the first parameter |
||||
|
* @tparam Ts Parameter pack specifying the preceeding parameters' types |
||||
|
* @param value Parameter to be transformed |
||||
|
* @param values Parameter-pack of the next prameters to be transformed |
||||
|
* @return std::string Comma-separated concatenation of all parameters string representation |
||||
|
*/ |
||||
|
template<typename T, typename... Ts> |
||||
|
inline std::string generateHead(T value, Ts... values) { |
||||
|
return generateHead(value) + ',' + generateHead(values...); |
||||
|
} |
||||
|
|
||||
|
|
||||
|
/** |
||||
|
* @brief Takes a single Range object and calls its next function. |
||||
|
* |
||||
|
* @tparam T Specific type of the Range object |
||||
|
* @param t Instance of the Range object |
||||
|
* @return std::string Label of the Range object or "false" if the Range reaced its end and was reset |
||||
|
*/ |
||||
|
template<typename T> |
||||
|
std::string IterateOnce(T& t) { |
||||
|
if(t.next()) return t.label; |
||||
|
else t.reset(); |
||||
|
return std::string(NO_NEXT); //the string signalises that the iteration has to be terminiated. |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* @brief Takes a number of Range objects and recusively increments them till the first Range does not reach its end |
||||
|
* upon incrementing. It tarts at the first Range object given. Every Range object that reached its end is reset to |
||||
|
* its start value. |
||||
|
* |
||||
|
* @tparam T Specific type of the first Range object |
||||
|
* @tparam Ts Types to the following Range objects |
||||
|
* @param t First instance of the Range object |
||||
|
* @param ts Parameter pack of the following Range objects |
||||
|
* @return std::string Label of the highest index Range object that was altered, or "false" if the last Range object |
||||
|
* reache its end and was reset |
||||
|
*/ |
||||
|
template<typename T, typename... Ts> |
||||
|
std::string IterateOnce(T& t , Ts&... ts) { |
||||
|
if(t.next()) return t.label; |
||||
|
else t.reset(); |
||||
|
return IterateOnce<Ts...>(ts...); |
||||
|
} |
||||
|
|
||||
|
|
||||
|
/** |
||||
|
* @brief Class that provides a convenient interface for iteratin throug a parameter range. It stores a public value |
||||
|
* that can be altered by the classes' methods. |
||||
|
* |
||||
|
* @tparam T Base type of the parameter |
||||
|
* @tparam INIT Initial value of the current pointer |
||||
|
* @tparam PRED Struct providing an apply function testing if the current value is in range or not |
||||
|
* @tparam INC Struct providing an apply function setting the current value to the value following the current value |
||||
|
*/ |
||||
|
template<typename T, T INIT, typename PRED, typename INC> |
||||
|
class Range : public Labeled { |
||||
|
public: |
||||
|
/** |
||||
|
* @brief Current value of the parameter |
||||
|
*/ |
||||
|
T current = INIT; |
||||
|
|
||||
|
/** |
||||
|
* @brief Resets current to its initial value |
||||
|
*/ |
||||
|
void reset() {current = INIT; }; |
||||
|
|
||||
|
/** |
||||
|
* @brief Sets current to its next value (according to INC::inc) and returns if the range Reached its end |
||||
|
* (accordingt to PRED::pred). |
||||
|
* |
||||
|
* @return true The newly assigned value of current is in the range |
||||
|
* @return false Otherwise |
||||
|
*/ |
||||
|
bool next() { |
||||
|
current = INC::inc(current); |
||||
|
return PRED::pred(current); |
||||
|
}; |
||||
|
|
||||
|
/** |
||||
|
* @brief Checks if current is in the Range (according to PRED). |
||||
|
* |
||||
|
* @return true PRED returns true |
||||
|
* @return false Otherwise |
||||
|
*/ |
||||
|
bool valid() { return PRED::apply(current); }; |
||||
|
}; |
||||
|
|
||||
|
/** |
||||
|
* @brief Class that is in contrast to Range specialized for integral values. |
||||
|
* |
||||
|
* @tparam T Integral base type of the Range |
||||
|
* @tparam INIT Initial value of the parameter |
||||
|
* @tparam MAX Maximal value of the parameter |
||||
|
* @tparam INC Struct providing an apply function setting the current value to the value following the current value |
||||
|
*/ |
||||
|
template<typename T, T INIT, T MAX, typename INC> |
||||
|
class Int_Range : public Labeled { |
||||
|
static_assert(std::is_integral<T>::value, "Int_Range requires an integral base type"); |
||||
|
|
||||
|
public: |
||||
|
const T max = MAX; |
||||
|
T current = INIT; |
||||
|
|
||||
|
void reset() {current = INIT; }; |
||||
|
|
||||
|
bool next() { |
||||
|
current = INC::inc(current); |
||||
|
return current < MAX; |
||||
|
}; |
||||
|
|
||||
|
bool valid() { return current < MAX; }; |
||||
|
|
||||
|
}; |
||||
|
|
||||
|
/** |
||||
|
* @brief Class that is in contrast to Int_Range specialized for integrals that grow linearly. |
||||
|
* |
||||
|
* @tparam T Integral base type of the Range |
||||
|
* @tparam INIT Initial value of the parameter |
||||
|
* @tparam MAX Maximal value of the parameter |
||||
|
* @tparam STEP Increase of the value per next()-call |
||||
|
*/ |
||||
|
template<typename T, T INIT, T MAX, T STEP = 1> |
||||
|
class Linear_Int_Range : public Labeled { |
||||
|
static_assert(std::is_integral<T>::value, "Linear_Int_Range requires an integral base type"); |
||||
|
|
||||
|
public: |
||||
|
const T max = MAX; |
||||
|
T current = INIT; |
||||
|
|
||||
|
void reset() {current = INIT; }; |
||||
|
|
||||
|
bool next() { |
||||
|
current += STEP; |
||||
|
return current < MAX; |
||||
|
}; |
||||
|
|
||||
|
bool valid() { return current < MAX; }; |
||||
|
}; |
||||
|
|
||||
|
/** |
||||
|
* @brief Class that is in contrast to Int_Range specialized for integrals that grow exponetially. |
||||
|
* |
||||
|
* @tparam T Integral base type of the Range |
||||
|
* @tparam INIT Initial value of the parameter |
||||
|
* @tparam MAX Maximal value of the parameter |
||||
|
* @tparam FACTOR Multiplicative Increase of the value per next()-call |
||||
|
*/ |
||||
|
template<typename T, T INIT, T MAX, T FACTOR = 2> |
||||
|
class Exp_Int_Range : public Labeled { |
||||
|
static_assert(std::is_integral<T>::value, "Exp_Int_Range requires an integral base type"); |
||||
|
|
||||
|
public: |
||||
|
const T max = MAX; |
||||
|
T current = INIT; |
||||
|
|
||||
|
void reset() {current = INIT; }; |
||||
|
|
||||
|
bool next() { |
||||
|
current *= FACTOR; |
||||
|
return current < MAX; |
||||
|
}; |
||||
|
|
||||
|
bool valid() { return current < MAX; }; |
||||
|
}; |
@ -0,0 +1,152 @@ |
|||||
|
#pragma once |
||||
|
|
||||
|
#include <cstdint> |
||||
|
#include <chrono> |
||||
|
#include <vector> |
||||
|
#include <string> |
||||
|
#include <algorithm> |
||||
|
|
||||
|
#include <numa.h> |
||||
|
|
||||
|
|
||||
|
#if PCM_M == 1 |
||||
|
#define PCM_MEASURE 1 |
||||
|
#include "pcm.h" |
||||
|
#endif |
||||
|
|
||||
|
|
||||
|
|
||||
|
struct pcm_value_collector { |
||||
|
const uint32_t value_count = 6; |
||||
|
|
||||
|
uint32_t threads; |
||||
|
std::vector<std::string> points; |
||||
|
#ifdef PCM_MEASURE |
||||
|
pcm::SystemCounterState** states; |
||||
|
#endif |
||||
|
uint64_t** collection; |
||||
|
|
||||
|
pcm_value_collector(const std::vector<std::string>& in_points, uint32_t threads, uint32_t memory_node) : threads(threads) { |
||||
|
#ifdef PCM_MEASURE |
||||
|
points = std::vector(in_points); |
||||
|
|
||||
|
collection = (uint64_t**) numa_alloc_onnode(threads * sizeof(uint64_t*), memory_node); |
||||
|
states = (pcm::SystemCounterState**) numa_alloc_onnode(threads * sizeof(pcm::SystemCounterState*), memory_node); |
||||
|
for(int i = 0; i < threads; ++i) { |
||||
|
collection[i] = (uint64_t*) numa_alloc_onnode(points.size() * value_count * sizeof(uint64_t), memory_node); |
||||
|
states[i] = (pcm::SystemCounterState*) numa_alloc_onnode(points.size() * sizeof(pcm::SystemCounterState), memory_node); |
||||
|
} |
||||
|
#endif |
||||
|
} |
||||
|
|
||||
|
~pcm_value_collector() { |
||||
|
#ifdef PCM_MEASURE |
||||
|
for(int i = 0; i < threads; ++i) { |
||||
|
numa_free(collection[threads], points.size() * value_count * sizeof(uint64_t)); |
||||
|
} |
||||
|
numa_free(collection, threads * sizeof(uint64_t*)); |
||||
|
numa_free(states, threads * sizeof(pcm::SystemCounterState)); |
||||
|
#endif |
||||
|
} |
||||
|
|
||||
|
void reset() { |
||||
|
#ifdef PCM_MEASURE |
||||
|
for(int i = 0; i < threads; ++i) |
||||
|
for(uint32_t j = 0; j < points.size() * value_count; ++j){ |
||||
|
collection[i][j] = 0; |
||||
|
} |
||||
|
#endif |
||||
|
} |
||||
|
|
||||
|
int64_t point_index(const std::string& value) { |
||||
|
auto it = std::find(points.begin(), points.end(), value); |
||||
|
|
||||
|
if(it == points.end()) return -1; |
||||
|
else return it - points.begin(); |
||||
|
} |
||||
|
|
||||
|
std::vector<uint64_t> summarize(const std::string &point) { |
||||
|
#ifdef PCM_MEASURE |
||||
|
std::vector<uint64_t> sums(value_count); |
||||
|
int64_t idx = point_index(point); |
||||
|
if(idx < 0) return sums; |
||||
|
|
||||
|
for(uint32_t v = 0; v < value_count; ++v) { |
||||
|
for(uint32_t i = 0; i < threads; ++i) { |
||||
|
sums[v] += collection[i][static_cast<uint32_t>(idx) + points.size() * v]; |
||||
|
} |
||||
|
} |
||||
|
return sums; |
||||
|
#endif |
||||
|
return std::vector<uint64_t> {0}; |
||||
|
} |
||||
|
|
||||
|
std::string summarize_as_string(const std::string &point) { |
||||
|
#ifdef PCM_MEASURE |
||||
|
auto summary = summarize(point); |
||||
|
auto it = summary.begin(); |
||||
|
auto end = summary.end(); |
||||
|
|
||||
|
if(it >= end) return ""; |
||||
|
|
||||
|
std::string result(""); |
||||
|
result += std::to_string(*it); |
||||
|
++it; |
||||
|
|
||||
|
while(it < end) { |
||||
|
result += ","; |
||||
|
result += std::to_string(*it); |
||||
|
++it; |
||||
|
} |
||||
|
return result; |
||||
|
#endif |
||||
|
return ""; |
||||
|
} |
||||
|
|
||||
|
void start(const std::string& point, uint32_t thread) { |
||||
|
#ifdef PCM_MEASURE |
||||
|
int64_t idx = point_index(point); |
||||
|
if(idx < 0) { |
||||
|
std::cerr << "Invalid 'point' given. Ignored!" << std::endl; |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
states[thread][static_cast<uint32_t>(idx)] = pcm::getSystemCounterState(); |
||||
|
#endif |
||||
|
} |
||||
|
|
||||
|
static std::string getHead(const std::string& point) { |
||||
|
return point + "_l2h," + |
||||
|
point + "_l2m," + |
||||
|
point + "_l3h," + |
||||
|
point + "_l3hns," + |
||||
|
point + "_l3m," + |
||||
|
point + "_mc"; |
||||
|
} |
||||
|
|
||||
|
#ifdef PCM_MEASURE |
||||
|
void read_values(uint32_t point_idx, uint32_t thread, pcm::SystemCounterState& start, pcm::SystemCounterState& end) { |
||||
|
collection[thread][point_idx + points.size() * 0] += getL2CacheHits(start, end); |
||||
|
collection[thread][point_idx + points.size() * 1] += getL2CacheMisses(start, end); |
||||
|
collection[thread][point_idx + points.size() * 2] += getL3CacheHits(start, end); |
||||
|
collection[thread][point_idx + points.size() * 3] += getL3CacheHitsNoSnoop(start, end); |
||||
|
collection[thread][point_idx + points.size() * 4] += getL3CacheMisses(start, end); |
||||
|
collection[thread][point_idx + points.size() * 5] += getBytesReadFromMC(start, end); |
||||
|
} |
||||
|
#endif |
||||
|
|
||||
|
void stop(const std::string& point, uint32_t thread) { |
||||
|
#ifdef PCM_MEASURE |
||||
|
auto state = pcm::getSystemCounterState(); |
||||
|
|
||||
|
int64_t idx = point_index(point); |
||||
|
if(idx < 0) { |
||||
|
std::cerr << "Invalid 'point' given. Ignored!" << std::endl; |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
auto start = states[thread][static_cast<uint32_t>(idx)]; |
||||
|
read_values(static_cast<uint32_t>(idx), thread, start, state); |
||||
|
#endif |
||||
|
} |
||||
|
}; |
@ -0,0 +1,45 @@ |
|||||
|
/** |
||||
|
* @file memory_literals.h |
||||
|
* @author André Berthold |
||||
|
* @brief Defines some operators that ease to define a certain size of memory. |
||||
|
* e.g. to alloc 3 Gib (Gibibit = 2^30 bit) of memory one can now simply write: "std::malloc(3_Gib)" |
||||
|
* to alloc 512 MB (Megabyte = 10^2 byte) of memory one can now simply write: "std::malloc(512_MB)" |
||||
|
* @version 0.1 |
||||
|
* @date 2023-05-25 |
||||
|
* |
||||
|
* @copyright Copyright (c) 2023 |
||||
|
* |
||||
|
*/ |
||||
|
#pragma once |
||||
|
|
||||
|
#include <cstdint> |
||||
|
|
||||
|
typedef const unsigned long long int ull_int; |
||||
|
//***************************************************************************// |
||||
|
// Bit **********************************************************************// |
||||
|
//***************************************************************************// |
||||
|
constexpr size_t operator ""_b(ull_int value) { |
||||
|
// one byte is 8 bit + one byte if bit is no multiple of 8 |
||||
|
return value / 8 + value % 8; |
||||
|
} |
||||
|
constexpr size_t operator ""_kb (ull_int value) { return value * 1000 / 8; } |
||||
|
constexpr size_t operator ""_kib(ull_int value) { return value * 1024 / 8; } |
||||
|
constexpr size_t operator ""_Mb (ull_int value) { return value * 1000 * 1000 / 8; } |
||||
|
constexpr size_t operator ""_Mib(ull_int value) { return value * 1024 * 1024 / 8; } |
||||
|
constexpr size_t operator ""_Gb (ull_int value) { return value * 1000 * 1000 * 1000 / 8; } |
||||
|
constexpr size_t operator ""_Gib(ull_int value) { return value * 1024 * 1024 * 1024 / 8; } |
||||
|
constexpr size_t operator ""_Tb (ull_int value) { return value * 1000 * 1000 * 1000 * 1000 / 8; } |
||||
|
constexpr size_t operator ""_Tib(ull_int value) { return value * 1024 * 1024 * 1024 * 1024 / 8; } |
||||
|
|
||||
|
//***************************************************************************// |
||||
|
// Byte *********************************************************************// |
||||
|
//***************************************************************************// |
||||
|
constexpr size_t operator ""_B (ull_int value) { return value; } |
||||
|
constexpr size_t operator ""_kB (ull_int value) { return value * 1000; } |
||||
|
constexpr size_t operator ""_kiB(ull_int value) { return value * 1024; } |
||||
|
constexpr size_t operator ""_MB (ull_int value) { return value * 1000 * 1000; } |
||||
|
constexpr size_t operator ""_MiB(ull_int value) { return value * 1024 * 1024; } |
||||
|
constexpr size_t operator ""_GB (ull_int value) { return value * 1000 * 1000 * 1000; } |
||||
|
constexpr size_t operator ""_GiB(ull_int value) { return value * 1024 * 1024 * 1024; } |
||||
|
constexpr size_t operator ""_TB (ull_int value) { return value * 1000 * 1000 * 1000 * 1000; } |
||||
|
constexpr size_t operator ""_TiB(ull_int value) { return value * 1024 * 1024 * 1024 * 1024; } |
@ -0,0 +1,6 @@ |
|||||
|
#pragma once |
||||
|
//this file includes all important header from the pcm repository |
||||
|
#include "cpucounters.h" |
||||
|
#include "msr.h" |
||||
|
#include "pci.h" |
||||
|
#include "mutex.h" |
@ -0,0 +1,80 @@ |
|||||
|
#pragma once |
||||
|
|
||||
|
#include <cstdint> |
||||
|
#include <chrono> |
||||
|
#include <barrier> |
||||
|
|
||||
|
#include <numa.h> |
||||
|
|
||||
|
#define THREAD_TIMINGS 1 |
||||
|
|
||||
|
|
||||
|
|
||||
|
struct thread_runtime_timing { |
||||
|
using time_point_t = std::chrono::time_point<std::chrono::steady_clock>; |
||||
|
|
||||
|
uint32_t time_points, time_threads; |
||||
|
time_point_t** start_times; |
||||
|
double** time_accumulator; |
||||
|
|
||||
|
thread_runtime_timing(uint32_t timing_points, uint32_t timing_threads, uint32_t memory_node) { |
||||
|
#ifdef THREAD_TIMINGS |
||||
|
time_points = timing_points; |
||||
|
time_threads = timing_threads; |
||||
|
start_times = (time_point_t**) numa_alloc_onnode(timing_points * sizeof(time_point_t*), memory_node); |
||||
|
time_accumulator = (double**) numa_alloc_onnode(timing_points * sizeof(double*), memory_node); |
||||
|
for(uint32_t i = 0; i < timing_points; ++i) { |
||||
|
start_times[i] = (time_point_t*) numa_alloc_onnode(timing_threads * sizeof(time_point_t), memory_node); |
||||
|
time_accumulator[i] = (double*) numa_alloc_onnode(timing_threads * sizeof(double), memory_node); |
||||
|
} |
||||
|
#endif |
||||
|
} |
||||
|
|
||||
|
~thread_runtime_timing() { |
||||
|
#ifdef THREAD_TIMINGS |
||||
|
for(uint32_t i = 0; i < time_points; ++i) { |
||||
|
numa_free(start_times[i], time_threads * sizeof(time_point_t)); |
||||
|
numa_free(time_accumulator[i], time_threads * sizeof(double)); |
||||
|
} |
||||
|
numa_free(start_times, time_points * sizeof(time_point_t*)); |
||||
|
numa_free(time_accumulator, time_points * sizeof(double*)); |
||||
|
#endif |
||||
|
} |
||||
|
|
||||
|
void reset_accumulator() { |
||||
|
#ifdef THREAD_TIMINGS |
||||
|
for(uint32_t i = 0; i < time_points; ++i){ |
||||
|
for(uint32_t j = 0; j < time_threads; ++j){ |
||||
|
time_accumulator[i][j] = 0.0; |
||||
|
}} |
||||
|
#endif |
||||
|
} |
||||
|
|
||||
|
double summarize_time(uint32_t time_point) { |
||||
|
#ifdef THREAD_TIMINGS |
||||
|
double sum = 0.0; |
||||
|
for(uint32_t i = 0; i < time_threads; ++i) { |
||||
|
sum += time_accumulator[time_point][i]; |
||||
|
} |
||||
|
return sum; |
||||
|
#endif |
||||
|
} |
||||
|
|
||||
|
void stop_timer(uint32_t point_id, uint32_t thread_id) { |
||||
|
#ifdef THREAD_TIMINGS |
||||
|
auto end_time = std::chrono::steady_clock::now(); |
||||
|
auto start_time = start_times[point_id][thread_id]; |
||||
|
|
||||
|
uint64_t time = std::chrono::duration_cast<std::chrono::nanoseconds>(end_time - start_time).count(); |
||||
|
double seconds = time / (1000.0 * 1000.0 * 1000.0); |
||||
|
time_accumulator[point_id][thread_id] += seconds; |
||||
|
#endif |
||||
|
} |
||||
|
|
||||
|
void start_timer(uint32_t point_id, uint32_t thread_id) { |
||||
|
#ifdef THREAD_TIMINGS |
||||
|
start_times[point_id][thread_id] = std::chrono::steady_clock::now(); |
||||
|
#endif |
||||
|
} |
||||
|
|
||||
|
}; |
@ -0,0 +1,93 @@ |
|||||
|
/** |
||||
|
* @file vector_loader.h |
||||
|
* @author André Berthold |
||||
|
* @brief Provides an interface to easily excange vector loading strategies |
||||
|
* @version 0.1 |
||||
|
* @date 2023-05-25 |
||||
|
* |
||||
|
* @copyright Copyright (c) 2023 |
||||
|
* |
||||
|
*/ |
||||
|
|
||||
|
#pragma once |
||||
|
|
||||
|
#include <cstdint> |
||||
|
#include <type_traits> |
||||
|
|
||||
|
#include <immintrin.h> |
||||
|
|
||||
|
enum load_mode {Unaligned = 0, Aligned = 1, Stream = 2}; |
||||
|
|
||||
|
/** |
||||
|
* @brief A class template that provides functions for loading and storing data of type *base_t* into/from vectors using the stretegy *mode*. |
||||
|
* |
||||
|
* @tparam base_t Base type of the data |
||||
|
* @tparam mode Strategy for loading the vector |
||||
|
*/ |
||||
|
template<typename base_t, load_mode mode> |
||||
|
class Vector_Loader {}; |
||||
|
|
||||
|
/** |
||||
|
* @brief Template specialization for Vector_Loader with base_t = uint32_t. |
||||
|
* |
||||
|
* @tparam mode Strategy for loading the vector |
||||
|
*/ |
||||
|
template<load_mode mode> |
||||
|
class Vector_Loader<uint32_t, mode> { |
||||
|
using base_t = uint32_t; |
||||
|
using mask_t = __mmask16; |
||||
|
using mask_base_t = uint8_t; |
||||
|
public: |
||||
|
|
||||
|
/** |
||||
|
* @brief Loads 512 bit of data into a vector register |
||||
|
* |
||||
|
* @param src Pointer to the data to load |
||||
|
* @return __m512i The vector register with the loaded data |
||||
|
*/ |
||||
|
static inline __m512i load(base_t* src) { |
||||
|
if constexpr (mode == load_mode::Unaligned) return _mm512_loadu_epi32(src); |
||||
|
else if constexpr (mode == load_mode::Aligned) return _mm512_load_epi32 (src); |
||||
|
else if constexpr (mode == load_mode::Stream) return _mm512_stream_load_si512(src); |
||||
|
}; |
||||
|
|
||||
|
/** |
||||
|
* @brief Stroes data from a given vector register to a destination pointer |
||||
|
* |
||||
|
* @param dst Pointer to the data destination |
||||
|
* @param vector Vector register containing the data to store |
||||
|
*/ |
||||
|
static inline void store(base_t* dst, __m512i vector) { |
||||
|
if constexpr (mode == load_mode::Unaligned) _mm512_storeu_epi32(dst, vector); |
||||
|
else if constexpr (mode == load_mode::Aligned) _mm512_store_epi32 (dst, vector); |
||||
|
else if constexpr (mode == load_mode::Stream) _mm512_stream_si512((__m512i*)(dst), vector); |
||||
|
}; |
||||
|
}; |
||||
|
|
||||
|
/** |
||||
|
* @brief Template specialization for Vector_Loader with base_t = uint64_t. |
||||
|
* |
||||
|
* @tparam mode Strategy for loading the vector |
||||
|
*/ |
||||
|
template<load_mode mode> |
||||
|
class Vector_Loader<uint64_t, mode> { |
||||
|
using base_t = uint64_t; |
||||
|
using mask_t = __mmask8; |
||||
|
using mask_base_t = uint8_t; |
||||
|
public: |
||||
|
|
||||
|
|
||||
|
|
||||
|
static inline __m512i load(base_t* src) { |
||||
|
if constexpr (mode == load_mode::Unaligned) return _mm512_loadu_epi64(src); |
||||
|
else if constexpr (mode == load_mode::Aligned) return _mm512_load_epi64 (src); |
||||
|
else if constexpr (mode == load_mode::Stream) return _mm512_stream_load_si512(src); |
||||
|
}; |
||||
|
|
||||
|
static inline void store(base_t* dst, __m512i vector) { |
||||
|
if constexpr (mode == load_mode::Unaligned) _mm512_storeu_epi64(dst, vector); |
||||
|
else if constexpr (mode == load_mode::Aligned) _mm512_store_epi64 (dst, vector); |
||||
|
else if constexpr (mode == load_mode::Stream) _mm512_stream_si512((__m512i*)(dst), vector); |
||||
|
}; |
||||
|
|
||||
|
}; |
Write
Preview
Loading…
Cancel
Save
Reference in new issue