diff --git a/qdp_project/src/Benchmark.cpp b/qdp_project/src/Benchmark.cpp index 5b3e84b..3d7de33 100644 --- a/qdp_project/src/Benchmark.cpp +++ b/qdp_project/src/Benchmark.cpp @@ -18,40 +18,14 @@ #define MODE_COMPLEX_PREFETCH #endif -#include "BenchmarkModes.hpp" +#include "Configuration.hpp" #include "BenchmarkHelpers.cpp" -constexpr uint64_t CMP_A = 50; -constexpr uint64_t CMP_B = 42; -constexpr uint32_t TC_COMBINED = TC_SCANA + TC_SCANB + TC_AGGRJ; -constexpr size_t WL_SIZE_ELEMENTS = WL_SIZE_B / sizeof(uint64_t); -constexpr size_t CHUNK_COUNT = WL_SIZE_B / CHUNK_SIZE_B; -constexpr size_t CHUNK_SIZE_ELEMENTS = CHUNK_SIZE_B / sizeof(uint64_t); -constexpr size_t RUN_COUNT = CHUNK_COUNT / GROUP_COUNT; - -static_assert(TC_AGGRJ % (TC_SCANB > 0 ? TC_SCANB : TC_AGGRJ) == 0); -static_assert(TC_AGGRJ >= TC_SCANB); -static_assert(RUN_COUNT > 0); -static_assert(WL_SIZE_B % 16 == 0); -static_assert(CHUNK_SIZE_B % 16 == 0); - using filter = Filter; using aggregation = Aggregation; dsacache::Cache CACHE_; -constexpr size_t SCANA_TIMING_INDEX = 0; -constexpr size_t SCANB_TIMING_INDEX = 1; -constexpr size_t AGGRJ_TIMING_INDEX = 2; -constexpr size_t TIME_STAMP_BEGIN = 0; -constexpr size_t TIME_STAMP_WAIT = 1; -constexpr size_t TIME_STAMP_END = 2; - -// THREAD_TIMING_[TYPE][TID][ITERATION][STAMP] = TIMEPOINT -std::array>>, 3> THREAD_TIMING_; - -std::array CACHE_HITS_; - std::vector*> BARRIERS_; std::shared_future LAUNCH_; @@ -61,92 +35,10 @@ uint16_t* MASK_A_; uint16_t* MASK_B_; uint64_t* DATA_DST_; -inline uint64_t get_chunk_index(const size_t gid, const size_t rid) { - return gid + GROUP_COUNT * rid; -} - -template -inline uint64_t* get_chunk(uint64_t* base, const size_t chunk_index, const size_t tid) { - uint64_t* chunk_ptr = base + chunk_index * CHUNK_SIZE_ELEMENTS; - return chunk_ptr + tid * (CHUNK_SIZE_ELEMENTS / TC); -} - -template -inline uint16_t* get_mask(uint16_t* base, const size_t chunk_index, const size_t tid) { - size_t offset = chunk_index * CHUNK_SIZE_ELEMENTS 
+ tid * (CHUNK_SIZE_ELEMENTS / TC); - return base + (offset / 16); -} - -double process_cache_hitrate() { - double hr = 0.0; - - for (const uint32_t& e : CACHE_HITS_) { - hr += e; - } - - return hr / (double)(TC_AGGRJ * GROUP_COUNT * RUN_COUNT); -} - -void process_timings( - uint64_t* scana_run, uint64_t* scana_wait, - uint64_t* scanb_run, uint64_t* scanb_wait, - uint64_t* aggrj_run, uint64_t* aggrj_wait -) { - { - uint64_t scana_rc = 0; - - for (const auto& e : THREAD_TIMING_[SCANA_TIMING_INDEX]) { - for (const auto& m : e) { - *scana_run += std::chrono::duration_cast(m[TIME_STAMP_WAIT] - m[TIME_STAMP_BEGIN]).count(); - *scana_wait += std::chrono::duration_cast(m[TIME_STAMP_END] - m[TIME_STAMP_WAIT]).count(); - scana_rc++; - } - } - - if (scana_rc != 0) { - *scana_run /= scana_rc; - *scana_wait /= scana_rc; - } - } - { - uint64_t scanb_rc = 0; - - for (const auto& e : THREAD_TIMING_[SCANB_TIMING_INDEX]) { - for (const auto& m : e) { - *scanb_run += std::chrono::duration_cast(m[TIME_STAMP_WAIT] - m[TIME_STAMP_BEGIN]).count(); - *scanb_wait += std::chrono::duration_cast(m[TIME_STAMP_END] - m[TIME_STAMP_WAIT]).count(); - scanb_rc++; - } - } - - if (scanb_rc != 0) { - *scana_run /= scanb_rc; - *scana_wait /= scanb_rc; - } - } - { - uint64_t aggrj_rc = 0; - - for (const auto& e : THREAD_TIMING_[AGGRJ_TIMING_INDEX]) { - for (const auto& m : e) { - *aggrj_wait += std::chrono::duration_cast(m[TIME_STAMP_WAIT] - m[TIME_STAMP_BEGIN]).count(); - *aggrj_run += std::chrono::duration_cast(m[TIME_STAMP_END] - m[TIME_STAMP_WAIT]).count(); - aggrj_rc++; - } - } - - if (aggrj_rc != 0) { - *aggrj_run /= aggrj_rc; - *aggrj_wait /= aggrj_rc; - } - } -} - // if more b than j -> perform b normal, subsplit j // if more j than b -> subsplit b like it is now void scan_b(size_t gid, size_t tid) { - constexpr bool SUBSPLIT_SCANB = TC_AGGRJ > TC_SCANB; constexpr size_t SUBCHUNK_COUNT = TC_AGGRJ / (TC_SCANB == 0 ? 
1 : TC_SCANB); constexpr size_t SUBCHUNK_SIZE_B = CHUNK_SIZE_B / SUBCHUNK_COUNT; @@ -159,7 +51,7 @@ void scan_b(size_t gid, size_t tid) { if constexpr (PERFORM_CACHING) { for (size_t i = 0; i < RUN_COUNT; i++) { - const size_t chunk_index = get_chunk_index(gid, 0); + const size_t chunk_index = get_chunk_index(gid, i); uint64_t* chunk_ptr = get_chunk(DATA_B_, chunk_index, i); for (size_t j = 0; j < SUBCHUNK_COUNT; j++) { @@ -171,7 +63,7 @@ void scan_b(size_t gid, size_t tid) { if constexpr (COMPLEX_QUERY) { for (size_t i = 0; i < RUN_COUNT; i++) { - const size_t chunk_index = get_chunk_index(gid, 0); + const size_t chunk_index = get_chunk_index(gid, i); uint64_t* chunk_ptr = get_chunk(DATA_B_, chunk_index, i); uint16_t* mask_ptr = get_mask(MASK_B_, chunk_index, i); @@ -251,7 +143,7 @@ void aggr_j(size_t gid, size_t tid) { uint64_t tmp = _mm512_reduce_add_epi64(aggregator); if constexpr (COMPLEX_QUERY) { - aggregator = aggregation::apply_masked(aggregator, chunk_ptr, mask_ptr_a, mask_ptr_b, CHUNK_SIZE_B / TC_AGGRJ); + aggregator = aggregation::apply_masked(aggregator, data_ptr, mask_ptr_a, mask_ptr_b, CHUNK_SIZE_B / TC_AGGRJ); } else { aggregator = aggregation::apply_masked(aggregator, data_ptr, mask_ptr_a, CHUNK_SIZE_B / TC_AGGRJ); diff --git a/qdp_project/src/BenchmarkModes.hpp b/qdp_project/src/Configuration.hpp similarity index 80% rename from qdp_project/src/BenchmarkModes.hpp rename to qdp_project/src/Configuration.hpp index b2abc93..81e8e18 100644 --- a/qdp_project/src/BenchmarkModes.hpp +++ b/qdp_project/src/Configuration.hpp @@ -70,3 +70,17 @@ constexpr bool STORE_B_IN_HBM = true; constexpr char MODE_STRING[] = "complex-hbm"; constexpr bool COMPLEX_QUERY = true; #endif + +constexpr uint64_t CMP_A = 50; +constexpr uint64_t CMP_B = 42; +constexpr uint32_t TC_COMBINED = TC_SCANA + TC_SCANB + TC_AGGRJ; +constexpr size_t WL_SIZE_ELEMENTS = WL_SIZE_B / sizeof(uint64_t); +constexpr size_t CHUNK_COUNT = WL_SIZE_B / CHUNK_SIZE_B; +constexpr size_t CHUNK_SIZE_ELEMENTS 
= CHUNK_SIZE_B / sizeof(uint64_t); +constexpr size_t RUN_COUNT = CHUNK_COUNT / GROUP_COUNT; + +static_assert(TC_AGGRJ % (TC_SCANB > 0 ? TC_SCANB : TC_AGGRJ) == 0); +static_assert(TC_AGGRJ >= TC_SCANB); +static_assert(RUN_COUNT > 0); +static_assert(WL_SIZE_B % 16 == 0); +static_assert(CHUNK_SIZE_B % 16 == 0); \ No newline at end of file diff --git a/qdp_project/src/utils/BenchmarkHelpers.cpp b/qdp_project/src/utils/BenchmarkHelpers.cpp index bde2c48..713b925 100644 --- a/qdp_project/src/utils/BenchmarkHelpers.cpp +++ b/qdp_project/src/utils/BenchmarkHelpers.cpp @@ -1,5 +1,18 @@ #include +#include "../Configuration.hpp" + +constexpr size_t SCANA_TIMING_INDEX = 0; +constexpr size_t SCANB_TIMING_INDEX = 1; +constexpr size_t AGGRJ_TIMING_INDEX = 2; +constexpr size_t TIME_STAMP_BEGIN = 0; +constexpr size_t TIME_STAMP_WAIT = 1; +constexpr size_t TIME_STAMP_END = 2; + +std::array>>, 3> THREAD_TIMING_; + +std::array CACHE_HITS_; + uint64_t sum_check(uint64_t compare_value, uint64_t* row_A, uint64_t* row_B, size_t row_size) { uint64_t sum = 0; for(int i = 0; i < row_size / sizeof(uint64_t); ++i) { @@ -45,3 +58,84 @@ struct NopStruct { return; } }; + +inline uint64_t get_chunk_index(const size_t gid, const size_t rid) { + return gid + GROUP_COUNT * rid; +} + +template +inline uint64_t* get_chunk(uint64_t* base, const size_t chunk_index, const size_t tid) { + uint64_t* chunk_ptr = base + chunk_index * CHUNK_SIZE_ELEMENTS; + return chunk_ptr + tid * (CHUNK_SIZE_ELEMENTS / TC); +} + +template +inline uint16_t* get_mask(uint16_t* base, const size_t chunk_index, const size_t tid) { + size_t offset = chunk_index * CHUNK_SIZE_ELEMENTS + tid * (CHUNK_SIZE_ELEMENTS / TC); + return base + (offset / 16); +} + +double process_cache_hitrate() { + double hr = 0.0; + + for (const uint32_t& e : CACHE_HITS_) { + hr += e; + } + + return hr / (double)(TC_AGGRJ * GROUP_COUNT * RUN_COUNT); +} + +void process_timings( + uint64_t* scana_run, uint64_t* scana_wait, + uint64_t* scanb_run, uint64_t* 
scanb_wait, + uint64_t* aggrj_run, uint64_t* aggrj_wait +) { + { + uint64_t scana_rc = 0; + + for (const auto& e : THREAD_TIMING_[SCANA_TIMING_INDEX]) { + for (const auto& m : e) { + *scana_run += std::chrono::duration_cast(m[TIME_STAMP_WAIT] - m[TIME_STAMP_BEGIN]).count(); + *scana_wait += std::chrono::duration_cast(m[TIME_STAMP_END] - m[TIME_STAMP_WAIT]).count(); + scana_rc++; + } + } + + if (scana_rc != 0) { + *scana_run /= scana_rc; + *scana_wait /= scana_rc; + } + } + { + uint64_t scanb_rc = 0; + + for (const auto& e : THREAD_TIMING_[SCANB_TIMING_INDEX]) { + for (const auto& m : e) { + *scanb_run += std::chrono::duration_cast(m[TIME_STAMP_WAIT] - m[TIME_STAMP_BEGIN]).count(); + *scanb_wait += std::chrono::duration_cast(m[TIME_STAMP_END] - m[TIME_STAMP_WAIT]).count(); + scanb_rc++; + } + } + + if (scanb_rc != 0) { + *scanb_run /= scanb_rc; + *scanb_wait /= scanb_rc; + } + } + { + uint64_t aggrj_rc = 0; + + for (const auto& e : THREAD_TIMING_[AGGRJ_TIMING_INDEX]) { + for (const auto& m : e) { + *aggrj_wait += std::chrono::duration_cast(m[TIME_STAMP_WAIT] - m[TIME_STAMP_BEGIN]).count(); + *aggrj_run += std::chrono::duration_cast(m[TIME_STAMP_END] - m[TIME_STAMP_WAIT]).count(); + aggrj_rc++; + } + } + + if (aggrj_rc != 0) { + *aggrj_run /= aggrj_rc; + *aggrj_wait /= aggrj_rc; + } + } +} \ No newline at end of file