Remove the experimental code branches that turned out not to yield any benefit (sched_yield incurs too high a delay, and with the new load balancer, sub-chunking for aggr_j is no longer needed).

11 months ago · 10a791dea1
1 changed files with 10 additions and 43 deletions
--- a/qdp_project/src/Benchmark.cpp
+++ b/qdp_project/src/Benchmark.cpp
@ -1,9 +1,5 @@
-#include <memory>
-#include <cassert>
 #include <mutex>
-#include <cstring>
 #include <bitset>
-#include <algorithm>
 #include <barrier>
 #include <vector>
 #include <fstream>
@ -20,7 +16,7 @@

 #include "BenchmarkHelpers.cpp"

-#define MODE_HBM
+#define MODE_PREFETCH

 ////////////////////////////////
 /// BENCHMARK SETUP
@ -30,15 +26,13 @@ constexpr uint32_t WARMUP_ITERATION_COUNT = 5;
 constexpr uint32_t ITERATION_COUNT = 5;

 #ifdef MODE_PREFETCH
-constexpr uint32_t GROUP_COUNT = 8;
-constexpr size_t CHUNK_SIZE_B = 16_MiB;
-constexpr uint32_t TC_SCANA = 4;
+constexpr uint32_t GROUP_COUNT = 16;
+constexpr size_t CHUNK_SIZE_B = 8_MiB;
+constexpr uint32_t TC_SCANA = 2;
 constexpr uint32_t TC_SCANB = 1;
-constexpr uint32_t TC_AGGRJ = 4;
+constexpr uint32_t TC_AGGRJ = 2;
 constexpr bool PERFORM_CACHING = true;
-constexpr bool YIELD_ON_CACHE_MISS = false;
 constexpr bool DATA_IN_HBM = false;
-constexpr bool AGGRJ_ITERATIVE = true;
 constexpr char MODE_STRING[] = "prefetch";
 #endif
 #ifdef MODE_DRAM
@ -48,9 +42,7 @@ constexpr uint32_t TC_SCANA = 2;
 constexpr uint32_t TC_SCANB = 0;
 constexpr uint32_t TC_AGGRJ = 1;
 constexpr bool PERFORM_CACHING = false;
-constexpr bool YIELD_ON_CACHE_MISS = false;
 constexpr bool DATA_IN_HBM = false;
-constexpr bool AGGRJ_ITERATIVE = false;
 constexpr char MODE_STRING[] = "dram";
 #endif
 #ifdef MODE_HBM
@ -60,9 +52,7 @@ constexpr uint32_t TC_SCANA = 2;
 constexpr uint32_t TC_SCANB = 0;
 constexpr uint32_t TC_AGGRJ = 1;
 constexpr bool PERFORM_CACHING = false;
-constexpr bool YIELD_ON_CACHE_MISS = false;
 constexpr bool DATA_IN_HBM = true;
-constexpr bool AGGRJ_ITERATIVE = false;
 constexpr char MODE_STRING[] = "hbm";
 #endif

@ -254,12 +244,7 @@ void aggr_j(size_t gid, size_t tid) {

    THREAD_TIMING_[AGGRJ_TIMING_INDEX][tid * gid][0][TIME_STAMP_BEGIN] = std::chrono::steady_clock::now();

-    if constexpr (AGGRJ_ITERATIVE) {
-       if (tid == 0) BARRIERS_[gid]->arrive_and_wait();
-    }
-    else {
-        BARRIERS_[gid]->arrive_and_wait();
-    }
+    BARRIERS_[gid]->arrive_and_wait();

    THREAD_TIMING_[AGGRJ_TIMING_INDEX][tid * gid][0][TIME_STAMP_WAIT] = std::chrono::steady_clock::now();

@ -278,7 +263,6 @@ void aggr_j(size_t gid, size_t tid) {

            if (data_ptr == nullptr) {
                data_ptr = chunk_ptr;
-                if constexpr (YIELD_ON_CACHE_MISS) sched_yield();
            }
            else {
                CACHE_HITS_[gid * tid]++;
@ -294,16 +278,9 @@ void aggr_j(size_t gid, size_t tid) {

    THREAD_TIMING_[AGGRJ_TIMING_INDEX][tid * gid][0][TIME_STAMP_END] = std::chrono::steady_clock::now();

-    if constexpr (!AGGRJ_ITERATIVE) {
-        BARRIERS_[gid]->arrive_and_drop();
-    }
+    BARRIERS_[gid]->arrive_and_drop();

    aggregation::happly(DATA_DST_ + (tid * GROUP_COUNT + gid), aggregator);
-
-    if constexpr (AGGRJ_ITERATIVE) {
-        if (++tid < TC_AGGRJ) aggr_j(gid, tid);
-        else BARRIERS_[gid]->arrive_and_drop();
-    }
 }

 int main() {
@ -349,12 +326,7 @@ int main() {
        std::vector<std::thread>  agg_pool;

        for(uint32_t gid = 0; gid < GROUP_COUNT; ++gid) {
-            if constexpr (AGGRJ_ITERATIVE) {
-                BARRIERS_.emplace_back(new std::barrier<NopStruct>(TC_SCANA + TC_SCANB + 1));
-            }
-            else {
-                BARRIERS_.emplace_back(new std::barrier<NopStruct>(TC_COMBINED));
-            }
+            BARRIERS_.emplace_back(new std::barrier<NopStruct>(TC_COMBINED));

            for(uint32_t tid = 0; tid < TC_SCANA; ++tid) {
                filter_pool.emplace_back(scan_a, gid, tid);
@ -364,13 +336,8 @@ int main() {
                copy_pool.emplace_back(scan_b, gid, tid);
            }

-            if constexpr (AGGRJ_ITERATIVE) {
-                agg_pool.emplace_back(aggr_j, gid, 0);
-            }
-            else {
-                for(uint32_t tid = 0; tid < TC_AGGRJ; ++tid) {
-                    agg_pool.emplace_back(aggr_j, gid,  tid);
-                }
+            for(uint32_t tid = 0; tid < TC_AGGRJ; ++tid) {
+                agg_pool.emplace_back(aggr_j, gid,  tid);
            }
        }