From 50560606a360f492810c9a4470e5631805a57700 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Constantin=20F=C3=BCrst?= Date: Thu, 25 Jan 2024 14:43:54 +0100 Subject: [PATCH] add complex query as benchmarking option and evaluate results --- qdp_project/src/Benchmark.cpp | 150 +++++++++++---------- qdp_project/src/BenchmarkModes.hpp | 72 ++++++++++ qdp_project/src/utils/BenchmarkHelpers.cpp | 24 ++-- 3 files changed, 170 insertions(+), 76 deletions(-) create mode 100644 qdp_project/src/BenchmarkModes.hpp diff --git a/qdp_project/src/Benchmark.cpp b/qdp_project/src/Benchmark.cpp index ecc1969..32fcf22 100644 --- a/qdp_project/src/Benchmark.cpp +++ b/qdp_project/src/Benchmark.cpp @@ -14,62 +14,22 @@ #include "../../offloading-cacher/cache.hpp" -#include "BenchmarkHelpers.cpp" - -#define MODE_PREFETCH - -//////////////////////////////// -/// BENCHMARK SETUP - -constexpr size_t WL_SIZE_B = 4_GiB; -constexpr uint32_t WARMUP_ITERATION_COUNT = 5; -constexpr uint32_t ITERATION_COUNT = 5; - -#ifdef MODE_PREFETCH -constexpr uint32_t GROUP_COUNT = 16; -constexpr size_t CHUNK_SIZE_B = 8_MiB; -constexpr uint32_t TC_SCANA = 2; -constexpr uint32_t TC_SCANB = 1; -constexpr uint32_t TC_AGGRJ = 2; -constexpr bool PERFORM_CACHING = true; -constexpr bool DATA_IN_HBM = false; -constexpr char MODE_STRING[] = "prefetch"; -#endif -#ifdef MODE_DRAM -constexpr size_t CHUNK_SIZE_B = 2_MiB; -constexpr uint32_t GROUP_COUNT = 8; -constexpr uint32_t TC_SCANA = 2; -constexpr uint32_t TC_SCANB = 0; -constexpr uint32_t TC_AGGRJ = 1; -constexpr bool PERFORM_CACHING = false; -constexpr bool DATA_IN_HBM = false; -constexpr char MODE_STRING[] = "dram"; -#endif -#ifdef MODE_HBM -constexpr size_t CHUNK_SIZE_B = 2_MiB; -constexpr uint32_t GROUP_COUNT = 8; -constexpr uint32_t TC_SCANA = 2; -constexpr uint32_t TC_SCANB = 0; -constexpr uint32_t TC_AGGRJ = 1; -constexpr bool PERFORM_CACHING = false; -constexpr bool DATA_IN_HBM = true; -constexpr char MODE_STRING[] = "hbm"; +#ifndef MODE_SET_BY_CMAKE +#define MODE_COMPLEX_PREFETCH #endif -/// DO NOT CONFIGURE BEYOND THIS -//////////////////////////////// +#include "BenchmarkModes.hpp" +#include "BenchmarkHelpers.cpp" constexpr uint64_t CMP_A = 50; +constexpr uint64_t CMP_B = 42; constexpr uint32_t TC_COMBINED = TC_SCANA + TC_SCANB + TC_AGGRJ; constexpr size_t WL_SIZE_ELEMENTS = WL_SIZE_B / sizeof(uint64_t); constexpr size_t CHUNK_COUNT = WL_SIZE_B / CHUNK_SIZE_B; constexpr size_t CHUNK_SIZE_ELEMENTS = CHUNK_SIZE_B / sizeof(uint64_t); constexpr size_t RUN_COUNT = CHUNK_COUNT / GROUP_COUNT; -constexpr size_t MASK_ELEMENT_SIZE = 16; -constexpr size_t MASK_STEP_SIZE = CHUNK_SIZE_ELEMENTS / MASK_ELEMENT_SIZE; static_assert(RUN_COUNT > 0); -static_assert(TC_SCANB <= TC_AGGRJ); static_assert(WL_SIZE_B % 16 == 0); static_assert(CHUNK_SIZE_B % 16 == 0); @@ -96,6 +56,7 @@ std::shared_future LAUNCH_; uint64_t* DATA_A_; uint64_t* DATA_B_; uint16_t* MASK_A_; +uint16_t* MASK_B_; uint64_t* DATA_DST_; inline uint64_t get_chunk_index(const size_t gid, const size_t rid) { @@ -179,11 +140,24 @@ void process_timings( } } +// if more b than j -> perform b normal, subsplit j +// if more j than b -> subsplit b like it is now + void scan_b(size_t gid, size_t tid) { - constexpr size_t split = TC_AGGRJ / (TC_SCANB == 0 ? 1 : TC_SCANB); + constexpr bool SUBSPLIT_SCANB = TC_AGGRJ > TC_SCANB; + constexpr size_t SUBCHUNK_SIZE_ELEMENTS_SCANB = TC_AGGRJ / (TC_SCANB == 0 ? 1 : TC_SCANB); + constexpr uint32_t TC_SUBSPLIT_SCANB = SUBSPLIT_SCANB ? TC_SCANB : TC_AGGRJ; - const size_t start = tid * split; - const size_t end = start + split; + size_t start, end; + + if constexpr (SUBSPLIT_SCANB) { + start = tid * SUBCHUNK_SIZE_ELEMENTS_SCANB; + end = start + SUBCHUNK_SIZE_ELEMENTS_SCANB; + } + else { + start = 0; + end = RUN_COUNT; + } THREAD_TIMING_[SCANB_TIMING_INDEX][tid * gid].clear(); THREAD_TIMING_[SCANB_TIMING_INDEX][tid * gid].resize(1); @@ -194,11 +168,20 @@ void scan_b(size_t gid, size_t tid) { if constexpr (PERFORM_CACHING) { for (size_t i = start; i < end; i++) { + const size_t chunk_index = get_chunk_index(gid, 0); + uint64_t* chunk_ptr = get_chunk(DATA_B_, chunk_index, i); + + CACHE_.Access(reinterpret_cast(chunk_ptr), CHUNK_SIZE_B / TC_SUBSPLIT_SCANB); + } + } + if constexpr (COMPLEX_QUERY) { + for (size_t i = start; i < end; i++) { const size_t chunk_index = get_chunk_index(gid, 0); - uint64_t* chunk_ptr = get_chunk(DATA_B_, chunk_index, i); + uint64_t* chunk_ptr = get_chunk(DATA_B_, chunk_index, i); + uint16_t* mask_ptr = get_mask(MASK_B_, chunk_index, i); - CACHE_.Access(reinterpret_cast(chunk_ptr), CHUNK_SIZE_B / TC_AGGRJ); + filter::apply_same(mask_ptr, nullptr, chunk_ptr, CMP_B, CHUNK_SIZE_B / TC_SUBSPLIT_SCANB); } } @@ -217,13 +200,11 @@ void scan_a(size_t gid, size_t tid) { THREAD_TIMING_[SCANA_TIMING_INDEX][tid * gid][0][TIME_STAMP_BEGIN] = std::chrono::steady_clock::now(); for (size_t i = 0; i < RUN_COUNT; i++) { - const size_t chunk_index = get_chunk_index(gid, i); uint64_t* chunk_ptr = get_chunk(DATA_A_, chunk_index, tid); uint16_t* mask_ptr = get_mask(MASK_A_, chunk_index, tid); filter::apply_same(mask_ptr, nullptr, chunk_ptr, CMP_A, CHUNK_SIZE_B / TC_SCANA); - } THREAD_TIMING_[SCANA_TIMING_INDEX][tid * gid][0][TIME_STAMP_WAIT] = std::chrono::steady_clock::now(); @@ -233,31 +214,47 @@ void scan_a(size_t gid, size_t tid) { } void aggr_j(size_t gid, size_t tid) { + constexpr bool SUBSPLIT_AGGRJ = TC_SCANB > TC_AGGRJ; + constexpr size_t SUBCHUNK_SIZE_ELEMENTS_AGGRJ = TC_SCANB / TC_AGGRJ; + constexpr uint32_t TC_SUBSPLIT_AGGRJ = SUBSPLIT_AGGRJ ? TC_AGGRJ : TC_SCANB; + + size_t start, end; + + if constexpr (SUBSPLIT_AGGRJ) { + start = tid * SUBCHUNK_SIZE_ELEMENTS_AGGRJ; + end = start + SUBCHUNK_SIZE_ELEMENTS_AGGRJ; + } + else { + start = 0; + end = RUN_COUNT; + } + CACHE_HITS_[gid * tid] = 0; THREAD_TIMING_[AGGRJ_TIMING_INDEX][tid * gid].clear(); THREAD_TIMING_[AGGRJ_TIMING_INDEX][tid * gid].resize(1); - LAUNCH_.wait(); - __m512i aggregator = aggregation::OP::zero(); + LAUNCH_.wait(); + THREAD_TIMING_[AGGRJ_TIMING_INDEX][tid * gid][0][TIME_STAMP_BEGIN] = std::chrono::steady_clock::now(); BARRIERS_[gid]->arrive_and_wait(); THREAD_TIMING_[AGGRJ_TIMING_INDEX][tid * gid][0][TIME_STAMP_WAIT] = std::chrono::steady_clock::now(); - for (size_t i = 0; i < RUN_COUNT; i++) { + for (size_t i = start; i < end; i++) { const size_t chunk_index = get_chunk_index(gid, i); - uint64_t* chunk_ptr = get_chunk(DATA_B_, chunk_index, tid); - uint16_t* mask_ptr = get_mask(MASK_A_, chunk_index, tid); + uint64_t* chunk_ptr = get_chunk(DATA_B_, chunk_index, tid); + uint16_t* mask_ptr_a = get_mask(MASK_A_, chunk_index, tid); + uint16_t* mask_ptr_b = get_mask(MASK_B_, chunk_index, tid); std::unique_ptr data; uint64_t* data_ptr; if constexpr (PERFORM_CACHING) { - data = CACHE_.Access(reinterpret_cast(chunk_ptr), CHUNK_SIZE_B / TC_AGGRJ); + data = CACHE_.Access(reinterpret_cast(chunk_ptr), CHUNK_SIZE_B / TC_SUBSPLIT_AGGRJ); data->WaitOnCompletion(dsacache::WAIT_WEAK); data_ptr = reinterpret_cast(data->GetDataLocation()); @@ -273,7 +270,13 @@ void aggr_j(size_t gid, size_t tid) { } uint64_t tmp = _mm512_reduce_add_epi64(aggregator); - aggregator = aggregation::apply_masked(aggregator, data_ptr, mask_ptr, CHUNK_SIZE_B / TC_AGGRJ); + + if constexpr (COMPLEX_QUERY) { + aggregator = aggregation::apply_masked(aggregator, chunk_ptr, mask_ptr_a, mask_ptr_b, CHUNK_SIZE_B / TC_SUBSPLIT_AGGRJ); + } + else { + aggregator = aggregation::apply_masked(aggregator, data_ptr, mask_ptr_a, CHUNK_SIZE_B / TC_SUBSPLIT_AGGRJ); + } } THREAD_TIMING_[AGGRJ_TIMING_INDEX][tid * gid][0][TIME_STAMP_END] = std::chrono::steady_clock::now(); @@ -297,17 +300,23 @@ int main() { fout << "run;rt-ns;rt-s;result[0];scana-run;scana-wait;scanb-run;scanb-wait;aggrj-run;aggrj-wait;cache-hr;" << std::endl; - if constexpr (DATA_IN_HBM) { - DATA_A_ = (uint64_t*) numa_alloc_onnode(WL_SIZE_B, cache_node); + // a is allways allocated in DRAM + + DATA_A_ = (uint64_t*) numa_alloc_local(WL_SIZE_B); + + // resulting masks for a and b and total result will allways reside in HBM + + MASK_A_ = (uint16_t*) numa_alloc_onnode(WL_SIZE_ELEMENTS, cache_node); + MASK_B_ = (uint16_t*) numa_alloc_onnode(WL_SIZE_ELEMENTS, cache_node); + DATA_DST_ = (uint64_t*) numa_alloc_onnode(TC_AGGRJ * GROUP_COUNT * sizeof(uint64_t), cache_node); + + // location of b depends on configuration + + if constexpr (STORE_B_IN_HBM) { DATA_B_ = (uint64_t*) numa_alloc_onnode(WL_SIZE_B, cache_node); - MASK_A_ = (uint16_t*) numa_alloc_onnode(WL_SIZE_ELEMENTS, cache_node); - DATA_DST_ = (uint64_t*) numa_alloc_onnode(TC_AGGRJ * GROUP_COUNT * sizeof(uint64_t), cache_node); } else { - DATA_A_ = (uint64_t*) numa_alloc_local(WL_SIZE_B); DATA_B_ = (uint64_t*) numa_alloc_local(WL_SIZE_B); - MASK_A_ = (uint16_t*) numa_alloc_local(WL_SIZE_ELEMENTS); - DATA_DST_ = (uint64_t*) numa_alloc_local(TC_AGGRJ * GROUP_COUNT * sizeof(uint64_t)); } if constexpr (PERFORM_CACHING) { @@ -315,7 +324,7 @@ int main() { } fill_mt(DATA_A_, WL_SIZE_B, 0, 100, 42); - fill_mt(DATA_A_, WL_SIZE_B, 0, 100, 420); + fill_mt(DATA_B_, WL_SIZE_B, 0, 100, 420); for (uint32_t i = 0; i < ITERATION_COUNT + WARMUP_ITERATION_COUNT; i++) { std::promise launch_promise; @@ -353,6 +362,11 @@ int main() { const auto time_end = std::chrono::steady_clock::now(); + const uint64_t result_actual = DATA_DST_[0]; + const uint64_t result_expected = COMPLEX_QUERY ? sum_check(CMP_A, DATA_A_, DATA_B_, WL_SIZE_B) : sum_check_complex(CMP_A, CMP_B, DATA_A_, DATA_B_, WL_SIZE_B); + + std::cout << "Result Expected: " << result_expected << ", Result Actual: " << result_actual << std::endl; + if (i >= WARMUP_ITERATION_COUNT) { uint64_t scana_run = 0, scana_wait = 0, scanb_run = 0, scanb_wait = 0, aggrj_run = 0, aggrj_wait = 0; process_timings(&scana_run, &scana_wait, &scanb_run, &scanb_wait, &aggrj_run, &aggrj_wait); @@ -364,7 +378,7 @@ int main() { fout << i - WARMUP_ITERATION_COUNT << ";" << nanos << ";" << seconds << ";" - << std::hex << DATA_DST_[0] << std::dec << ";" + << DATA_DST_[0] << ";" << scana_run << ";" << scana_wait << ";" << scanb_run << ";" << scanb_wait << ";" << aggrj_run << ";" << aggrj_wait << ";" << process_cache_hitrate() << ";" << std::endl; diff --git a/qdp_project/src/BenchmarkModes.hpp b/qdp_project/src/BenchmarkModes.hpp new file mode 100644 index 0000000..3d9f842 --- /dev/null +++ b/qdp_project/src/BenchmarkModes.hpp @@ -0,0 +1,72 @@ +#pragma once + +constexpr size_t WL_SIZE_B = 4_GiB; +constexpr uint32_t WARMUP_ITERATION_COUNT = 5; +constexpr uint32_t ITERATION_COUNT = 5; + +#ifdef MODE_SIMPLE_PREFETCH +constexpr uint32_t GROUP_COUNT = 16; +constexpr size_t CHUNK_SIZE_B = 8_MiB; +constexpr uint32_t TC_SCANA = 2; +constexpr uint32_t TC_SCANB = 1; +constexpr uint32_t TC_AGGRJ = 2; +constexpr bool PERFORM_CACHING = true; +constexpr bool STORE_B_IN_HBM = false; +constexpr char MODE_STRING[] = "prefetch"; +constexpr bool COMPLEX_QUERY = false; +#endif +#ifdef MODE_SIMPLE_DRAM +constexpr size_t CHUNK_SIZE_B = 2_MiB; +constexpr uint32_t GROUP_COUNT = 16; +constexpr uint32_t TC_SCANA = 2; +constexpr uint32_t TC_SCANB = 0; +constexpr uint32_t TC_AGGRJ = 1; +constexpr bool PERFORM_CACHING = false; +constexpr bool STORE_B_IN_HBM = false; +constexpr char MODE_STRING[] = "dram"; +constexpr bool COMPLEX_QUERY = false; +#endif +#ifdef MODE_SIMPLE_HBM +constexpr size_t CHUNK_SIZE_B = 2_MiB; +constexpr uint32_t GROUP_COUNT = 16; +constexpr uint32_t TC_SCANA = 2; +constexpr uint32_t TC_SCANB = 0; +constexpr uint32_t TC_AGGRJ = 1; +constexpr bool PERFORM_CACHING = false; +constexpr bool STORE_B_IN_HBM = true; +constexpr char MODE_STRING[] = "hbm"; +constexpr bool COMPLEX_QUERY = false; +#endif +#ifdef MODE_COMPLEX_PREFETCH +constexpr uint32_t GROUP_COUNT = 16; +constexpr size_t CHUNK_SIZE_B = 8_MiB; +constexpr uint32_t TC_SCANA = 1; +constexpr uint32_t TC_SCANB = 1; +constexpr uint32_t TC_AGGRJ = 2; +constexpr bool PERFORM_CACHING = true; +constexpr bool STORE_B_IN_HBM = false; +constexpr char MODE_STRING[] = "prefetch"; +constexpr bool COMPLEX_QUERY = true; +#endif +#ifdef MODE_COMPLEX_DRAM +constexpr size_t CHUNK_SIZE_B = 2_MiB; +constexpr uint32_t GROUP_COUNT = 16; +constexpr uint32_t TC_SCANA = 1; +constexpr uint32_t TC_SCANB = 1; +constexpr uint32_t TC_AGGRJ = 2; +constexpr bool PERFORM_CACHING = false; +constexpr bool STORE_B_IN_HBM = false; +constexpr char MODE_STRING[] = "dram"; +constexpr bool COMPLEX_QUERY = true; +#endif +#ifdef MODE_COMPLEX_HBM +constexpr size_t CHUNK_SIZE_B = 2_MiB; +constexpr uint32_t GROUP_COUNT = 16; +constexpr uint32_t TC_SCANA = 1; +constexpr uint32_t TC_SCANB = 1; +constexpr uint32_t TC_AGGRJ = 2; +constexpr bool PERFORM_CACHING = false; +constexpr bool STORE_B_IN_HBM = true; +constexpr char MODE_STRING[] = "hbm"; +constexpr bool COMPLEX_QUERY = true; +#endif diff --git a/qdp_project/src/utils/BenchmarkHelpers.cpp b/qdp_project/src/utils/BenchmarkHelpers.cpp index f14cf2d..bde2c48 100644 --- a/qdp_project/src/utils/BenchmarkHelpers.cpp +++ b/qdp_project/src/utils/BenchmarkHelpers.cpp @@ -1,5 +1,21 @@ #include +uint64_t sum_check(uint64_t compare_value, uint64_t* row_A, uint64_t* row_B, size_t row_size) { + uint64_t sum = 0; + for(int i = 0; i < row_size / sizeof(uint64_t); ++i) { + sum += (row_A[i] < compare_value) * row_B[i]; + } + return sum; +} + +uint64_t sum_check_complex(uint64_t compare_value_a, uint64_t compare_value_b, uint64_t* row_A, uint64_t* row_B, size_t row_size) { + uint64_t sum = 0; + for(int i = 0; i < row_size / sizeof(uint64_t); ++i) { + sum += (row_A[i] < compare_value_a && row_B[i] < compare_value_b) * row_B[i]; + } + return sum; +} + int CachePlacementPolicy(const int numa_dst_node, const int numa_src_node, const size_t data_size) { return numa_dst_node < 8 ? numa_dst_node + 8 : numa_dst_node; } @@ -29,11 +45,3 @@ struct NopStruct { return; } }; - -uint64_t sum_check(uint64_t compare_value, uint64_t* row_A, uint64_t* row_B, size_t row_size) { - uint64_t sum = 0; - for(int i = 0; i < row_size / sizeof(uint64_t); ++i) { - sum += (row_A[i] < compare_value) * row_B[i]; - } - return sum; -} \ No newline at end of file