Browse Source

fix chunk selection in scan_b, use the data pointer in aggr_j complex mode, and move shared helper functions to src/utils/BenchmarkHelpers.cpp

master
Constantin Fürst 11 months ago
parent
commit
c2b9e6656d
  1. 116
      qdp_project/src/Benchmark.cpp
  2. 14
      qdp_project/src/Configuration.hpp
  3. 94
      qdp_project/src/utils/BenchmarkHelpers.cpp

116
qdp_project/src/Benchmark.cpp

@ -18,40 +18,14 @@
#define MODE_COMPLEX_PREFETCH
#endif
#include "BenchmarkModes.hpp"
#include "Configuration.hpp"
#include "BenchmarkHelpers.cpp"
// Filter comparison constants: column A is filtered against CMP_A,
// column B against CMP_B (used by the LT filter below).
constexpr uint64_t CMP_A = 50;
constexpr uint64_t CMP_B = 42;
// Total worker threads per group: scan-A + scan-B + aggregate/join.
constexpr uint32_t TC_COMBINED = TC_SCANA + TC_SCANB + TC_AGGRJ;
// Workload and chunk geometry, derived from the byte sizes in the
// configuration (WL_SIZE_B, CHUNK_SIZE_B, GROUP_COUNT come from elsewhere).
constexpr size_t WL_SIZE_ELEMENTS = WL_SIZE_B / sizeof(uint64_t);
constexpr size_t CHUNK_COUNT = WL_SIZE_B / CHUNK_SIZE_B;
constexpr size_t CHUNK_SIZE_ELEMENTS = CHUNK_SIZE_B / sizeof(uint64_t);
// Number of chunks each group processes (chunks are spread across groups).
constexpr size_t RUN_COUNT = CHUNK_COUNT / GROUP_COUNT;
// Scan-B thread count must evenly divide the aggregation thread count so
// sub-chunk splitting in scan_b lines up with aggr_j slices.
static_assert(TC_AGGRJ % (TC_SCANB > 0 ? TC_SCANB : TC_AGGRJ) == 0);
static_assert(TC_AGGRJ >= TC_SCANB);
static_assert(RUN_COUNT > 0);
// Sizes must be 16-element aligned: one uint16_t mask word covers 16 values.
static_assert(WL_SIZE_B % 16 == 0);
static_assert(CHUNK_SIZE_B % 16 == 0);
// Less-than filter over uint64_t columns using streaming loads; the final
// template flag is false — presumably "no masked input", TODO confirm.
using filter = Filter<uint64_t, LT, load_mode::Stream, false>;
// Sum aggregation over uint64_t with streaming loads.
using aggregation = Aggregation<uint64_t, Sum, load_mode::Stream>;
// Process-wide DSA cache shared by all worker threads.
dsacache::Cache CACHE_;
// First index into THREAD_TIMING_ selects the pipeline stage.
constexpr size_t SCANA_TIMING_INDEX = 0;
constexpr size_t SCANB_TIMING_INDEX = 1;
constexpr size_t AGGRJ_TIMING_INDEX = 2;
// Last index selects one of the three timestamps taken per iteration.
constexpr size_t TIME_STAMP_BEGIN = 0;
constexpr size_t TIME_STAMP_WAIT = 1;
constexpr size_t TIME_STAMP_END = 2;
// THREAD_TIMING_[TYPE][TID][ITERATION][STAMP] = TIMEPOINT
std::array<std::vector<std::vector<std::array<std::chrono::steady_clock::time_point, 3>>>, 3> THREAD_TIMING_;
// One cache-hit counter per aggregation thread across all groups.
std::array<uint32_t, GROUP_COUNT * TC_AGGRJ> CACHE_HITS_;
// Synchronization primitives shared by the worker threads; LAUNCH_ releases
// all threads at once so they start timing together.
std::vector<std::barrier<NopStruct>*> BARRIERS_;
std::shared_future<void> LAUNCH_;
@ -61,92 +35,10 @@ uint16_t* MASK_A_;
uint16_t* MASK_B_;
uint64_t* DATA_DST_;
// Maps a (group id, run id) pair to a global chunk index. Chunks are dealt
// out round-robin across groups, so consecutive runs of one group are
// GROUP_COUNT chunks apart.
inline uint64_t get_chunk_index(const size_t gid, const size_t rid) {
    const size_t stride = GROUP_COUNT;
    return rid * stride + gid;
}
// Returns the start of thread tid's slice within chunk chunk_index of the
// column starting at base; each chunk is split evenly among TC threads.
template<size_t TC>
inline uint64_t* get_chunk(uint64_t* base, const size_t chunk_index, const size_t tid) {
    constexpr size_t SLICE_ELEMENTS = CHUNK_SIZE_ELEMENTS / TC;
    const size_t element_offset = chunk_index * CHUNK_SIZE_ELEMENTS + tid * SLICE_ELEMENTS;
    return base + element_offset;
}
// Returns the mask word containing the first bit for thread tid's slice of
// chunk chunk_index; one uint16_t mask word covers 16 data elements.
template<size_t TC>
inline uint16_t* get_mask(uint16_t* base, const size_t chunk_index, const size_t tid) {
    const size_t first_element = chunk_index * CHUNK_SIZE_ELEMENTS + tid * (CHUNK_SIZE_ELEMENTS / TC);
    return &base[first_element / 16];
}
// Average cache-hit count per aggregation task: the sum of all entries in
// CACHE_HITS_ normalized by thread count, group count and run count.
double process_cache_hitrate() {
    double total = 0.0;
    for (size_t i = 0; i < CACHE_HITS_.size(); ++i) {
        total += static_cast<double>(CACHE_HITS_[i]);
    }
    const double samples = static_cast<double>(TC_AGGRJ * GROUP_COUNT * RUN_COUNT);
    return total / samples;
}
// Averages the per-thread, per-iteration interval lengths recorded in
// THREAD_TIMING_ into one run/wait pair (nanoseconds) per pipeline stage.
// SCANA/SCANB record run time first (BEGIN->WAIT) and wait time second
// (WAIT->END); AGGRJ records them in the opposite order, so its pointers
// are passed to the helper swapped.
// All six outputs must point to zero-initialized accumulators.
// BUGFIX: the scanb branch previously divided *scana_run / *scana_wait by
// the scanb sample count, corrupting scana and never averaging scanb.
void process_timings(
    uint64_t* scana_run, uint64_t* scana_wait,
    uint64_t* scanb_run, uint64_t* scanb_wait,
    uint64_t* aggrj_run, uint64_t* aggrj_wait
) {
    // Accumulate both interval lengths for one stage, then average them.
    const auto average_stage = [](const size_t timing_index, uint64_t* first_interval, uint64_t* second_interval) {
        uint64_t sample_count = 0;
        for (const auto& per_thread : THREAD_TIMING_[timing_index]) {
            for (const auto& stamps : per_thread) {
                *first_interval += std::chrono::duration_cast<std::chrono::nanoseconds>(stamps[TIME_STAMP_WAIT] - stamps[TIME_STAMP_BEGIN]).count();
                *second_interval += std::chrono::duration_cast<std::chrono::nanoseconds>(stamps[TIME_STAMP_END] - stamps[TIME_STAMP_WAIT]).count();
                sample_count++;
            }
        }
        if (sample_count != 0) {
            *first_interval /= sample_count;
            *second_interval /= sample_count;
        }
    };
    average_stage(SCANA_TIMING_INDEX, scana_run, scana_wait);
    average_stage(SCANB_TIMING_INDEX, scanb_run, scanb_wait);
    // AGGRJ waits first and runs second -> interval roles are swapped.
    average_stage(AGGRJ_TIMING_INDEX, aggrj_wait, aggrj_run);
}
// if more b than j -> perform b normal, subsplit j
// if more j than b -> subsplit b like it is now
void scan_b(size_t gid, size_t tid) {
constexpr bool SUBSPLIT_SCANB = TC_AGGRJ > TC_SCANB;
constexpr size_t SUBCHUNK_COUNT = TC_AGGRJ / (TC_SCANB == 0 ? 1 : TC_SCANB);
constexpr size_t SUBCHUNK_SIZE_B = CHUNK_SIZE_B / SUBCHUNK_COUNT;
@ -159,7 +51,7 @@ void scan_b(size_t gid, size_t tid) {
if constexpr (PERFORM_CACHING) {
for (size_t i = 0; i < RUN_COUNT; i++) {
const size_t chunk_index = get_chunk_index(gid, 0);
const size_t chunk_index = get_chunk_index(gid, i);
uint64_t* chunk_ptr = get_chunk<TC_SCANB>(DATA_B_, chunk_index, i);
for (size_t j = 0; j < SUBCHUNK_COUNT; j++) {
@ -171,7 +63,7 @@ void scan_b(size_t gid, size_t tid) {
if constexpr (COMPLEX_QUERY) {
for (size_t i = 0; i < RUN_COUNT; i++) {
const size_t chunk_index = get_chunk_index(gid, 0);
const size_t chunk_index = get_chunk_index(gid, i);
uint64_t* chunk_ptr = get_chunk<TC_SCANB>(DATA_B_, chunk_index, i);
uint16_t* mask_ptr = get_mask<TC_SCANB>(MASK_B_, chunk_index, i);
@ -251,7 +143,7 @@ void aggr_j(size_t gid, size_t tid) {
uint64_t tmp = _mm512_reduce_add_epi64(aggregator);
if constexpr (COMPLEX_QUERY) {
aggregator = aggregation::apply_masked(aggregator, chunk_ptr, mask_ptr_a, mask_ptr_b, CHUNK_SIZE_B / TC_AGGRJ);
aggregator = aggregation::apply_masked(aggregator, data_ptr, mask_ptr_a, mask_ptr_b, CHUNK_SIZE_B / TC_AGGRJ);
}
else {
aggregator = aggregation::apply_masked(aggregator, data_ptr, mask_ptr_a, CHUNK_SIZE_B / TC_AGGRJ);

14
qdp_project/src/BenchmarkModes.hpp → qdp_project/src/Configuration.hpp

@ -70,3 +70,17 @@ constexpr bool STORE_B_IN_HBM = true;
constexpr char MODE_STRING[] = "complex-hbm";
constexpr bool COMPLEX_QUERY = true;
#endif
// Filter comparison constants for the two input columns.
constexpr uint64_t CMP_A = 50;
constexpr uint64_t CMP_B = 42;
// Total worker threads per group: scan-A + scan-B + aggregate/join.
constexpr uint32_t TC_COMBINED = TC_SCANA + TC_SCANB + TC_AGGRJ;
// Workload and chunk geometry derived from the byte sizes configured above.
constexpr size_t WL_SIZE_ELEMENTS = WL_SIZE_B / sizeof(uint64_t);
constexpr size_t CHUNK_COUNT = WL_SIZE_B / CHUNK_SIZE_B;
constexpr size_t CHUNK_SIZE_ELEMENTS = CHUNK_SIZE_B / sizeof(uint64_t);
// Chunks processed per group over the whole benchmark.
constexpr size_t RUN_COUNT = CHUNK_COUNT / GROUP_COUNT;
// Scan-B threads must evenly divide aggregation threads for sub-chunking.
static_assert(TC_AGGRJ % (TC_SCANB > 0 ? TC_SCANB : TC_AGGRJ) == 0);
static_assert(TC_AGGRJ >= TC_SCANB);
static_assert(RUN_COUNT > 0);
// One uint16_t mask word covers 16 elements, so sizes must be multiples of 16.
static_assert(WL_SIZE_B % 16 == 0);
static_assert(CHUNK_SIZE_B % 16 == 0);

94
qdp_project/src/utils/BenchmarkHelpers.cpp

@ -1,5 +1,18 @@
#include <vector>
#include "../Configuration.hpp"
// First index into THREAD_TIMING_ selects the pipeline stage.
constexpr size_t SCANA_TIMING_INDEX = 0;
constexpr size_t SCANB_TIMING_INDEX = 1;
constexpr size_t AGGRJ_TIMING_INDEX = 2;
// Last index selects one of the three timestamps taken per iteration.
constexpr size_t TIME_STAMP_BEGIN = 0;
constexpr size_t TIME_STAMP_WAIT = 1;
constexpr size_t TIME_STAMP_END = 2;
// THREAD_TIMING_[TYPE][TID][ITERATION][STAMP] = TIMEPOINT
std::array<std::vector<std::vector<std::array<std::chrono::steady_clock::time_point, 3>>>, 3> THREAD_TIMING_;
// One cache-hit counter per aggregation thread across all groups.
std::array<uint32_t, GROUP_COUNT * TC_AGGRJ> CACHE_HITS_;
uint64_t sum_check(uint64_t compare_value, uint64_t* row_A, uint64_t* row_B, size_t row_size) {
uint64_t sum = 0;
for(int i = 0; i < row_size / sizeof(uint64_t); ++i) {
@ -45,3 +58,84 @@ struct NopStruct {
return;
}
};
inline uint64_t get_chunk_index(const size_t gid, const size_t rid) {
    // Round-robin: run rid of group gid owns chunk gid + rid * GROUP_COUNT.
    return static_cast<uint64_t>(GROUP_COUNT) * rid + gid;
}
// Start of thread tid's equal-sized slice inside the given chunk of base.
template<size_t TC>
inline uint64_t* get_chunk(uint64_t* base, const size_t chunk_index, const size_t tid) {
    const size_t slice = CHUNK_SIZE_ELEMENTS / TC;
    return &base[chunk_index * CHUNK_SIZE_ELEMENTS + tid * slice];
}
// Mask word holding the first bit of thread tid's slice of the given chunk;
// each uint16_t mask word covers 16 data elements.
template<size_t TC>
inline uint16_t* get_mask(uint16_t* base, const size_t chunk_index, const size_t tid) {
    const size_t first_element = chunk_index * CHUNK_SIZE_ELEMENTS + tid * (CHUNK_SIZE_ELEMENTS / TC);
    return base + first_element / 16;
}
// Sum all per-thread hit counters and normalize by the total number of
// aggregation tasks (threads x groups x runs).
double process_cache_hitrate() {
    double total = 0.0;
    for (size_t i = 0; i < CACHE_HITS_.size(); ++i) {
        total += static_cast<double>(CACHE_HITS_[i]);
    }
    return total / static_cast<double>(TC_AGGRJ * GROUP_COUNT * RUN_COUNT);
}
// Averages the per-thread, per-iteration interval lengths recorded in
// THREAD_TIMING_ into one run/wait pair (nanoseconds) per pipeline stage.
// SCANA/SCANB record run time first (BEGIN->WAIT) and wait time second
// (WAIT->END); AGGRJ records them in the opposite order, so its pointers
// are passed to the helper swapped.
// All six outputs must point to zero-initialized accumulators.
// BUGFIX: the scanb branch previously divided *scana_run / *scana_wait by
// the scanb sample count, corrupting scana and never averaging scanb.
void process_timings(
    uint64_t* scana_run, uint64_t* scana_wait,
    uint64_t* scanb_run, uint64_t* scanb_wait,
    uint64_t* aggrj_run, uint64_t* aggrj_wait
) {
    // Accumulate both interval lengths for one stage, then average them.
    const auto average_stage = [](const size_t timing_index, uint64_t* first_interval, uint64_t* second_interval) {
        uint64_t sample_count = 0;
        for (const auto& per_thread : THREAD_TIMING_[timing_index]) {
            for (const auto& stamps : per_thread) {
                *first_interval += std::chrono::duration_cast<std::chrono::nanoseconds>(stamps[TIME_STAMP_WAIT] - stamps[TIME_STAMP_BEGIN]).count();
                *second_interval += std::chrono::duration_cast<std::chrono::nanoseconds>(stamps[TIME_STAMP_END] - stamps[TIME_STAMP_WAIT]).count();
                sample_count++;
            }
        }
        if (sample_count != 0) {
            *first_interval /= sample_count;
            *second_interval /= sample_count;
        }
    };
    average_stage(SCANA_TIMING_INDEX, scana_run, scana_wait);
    average_stage(SCANB_TIMING_INDEX, scanb_run, scanb_wait);
    // AGGRJ waits first and runs second -> interval roles are swapped.
    average_stage(AGGRJ_TIMING_INDEX, aggrj_wait, aggrj_run);
}
Loading…
Cancel
Save