Remove the experimental code branches that turned out not to yield any benefit (sched_yield introduces too much delay, and with the new load balancer, subchunking for aggr_j is no longer needed either)

11 months ago · 10a791dea1
1 changed file with 10 additions and 43 deletions
--- a/qdp_project/src/Benchmark.cpp
+++ b/qdp_project/src/Benchmark.cpp
@ -1,9 +1,5 @@
 #include <memory>
 #include <cassert>
 #include <mutex>
 #include <cstring>
 #include <bitset>
 #include <algorithm>
 #include <barrier>
 #include <vector>
 #include <fstream>
@ -20,7 +16,7 @@
 #include "BenchmarkHelpers.cpp"
 #define MODE_HBM
 #define MODE_PREFETCH
 ////////////////////////////////
 /// BENCHMARK SETUP
@ -30,15 +26,13 @@ constexpr uint32_t WARMUP_ITERATION_COUNT = 5;
 constexpr uint32_t ITERATION_COUNT = 5;
 #ifdef MODE_PREFETCH
 constexpr uint32_t GROUP_COUNT = 8;
 constexpr size_t CHUNK_SIZE_B = 16_MiB;
 constexpr uint32_t TC_SCANA = 4;
 constexpr uint32_t GROUP_COUNT = 16;
 constexpr size_t CHUNK_SIZE_B = 8_MiB;
 constexpr uint32_t TC_SCANA = 2;
 constexpr uint32_t TC_SCANB = 1;
 constexpr uint32_t TC_AGGRJ = 4;
 constexpr uint32_t TC_AGGRJ = 2;
 constexpr bool PERFORM_CACHING = true;
 constexpr bool YIELD_ON_CACHE_MISS = false;
 constexpr bool DATA_IN_HBM = false;
 constexpr bool AGGRJ_ITERATIVE = true;
 constexpr char MODE_STRING[] = "prefetch";
 #endif
 #ifdef MODE_DRAM
@ -48,9 +42,7 @@ constexpr uint32_t TC_SCANA = 2;
 constexpr uint32_t TC_SCANB = 0;
 constexpr uint32_t TC_AGGRJ = 1;
 constexpr bool PERFORM_CACHING = false;
 constexpr bool YIELD_ON_CACHE_MISS = false;
 constexpr bool DATA_IN_HBM = false;
 constexpr bool AGGRJ_ITERATIVE = false;
 constexpr char MODE_STRING[] = "dram";
 #endif
 #ifdef MODE_HBM
@ -60,9 +52,7 @@ constexpr uint32_t TC_SCANA = 2;
 constexpr uint32_t TC_SCANB = 0;
 constexpr uint32_t TC_AGGRJ = 1;
 constexpr bool PERFORM_CACHING = false;
 constexpr bool YIELD_ON_CACHE_MISS = false;
 constexpr bool DATA_IN_HBM = true;
 constexpr bool AGGRJ_ITERATIVE = false;
 constexpr char MODE_STRING[] = "hbm";
 #endif
@ -254,12 +244,7 @@ void aggr_j(size_t gid, size_t tid) {
    THREAD_TIMING_[AGGRJ_TIMING_INDEX][tid * gid][0][TIME_STAMP_BEGIN] = std::chrono::steady_clock::now();
    if constexpr (AGGRJ_ITERATIVE) {
       if (tid == 0) BARRIERS_[gid]->arrive_and_wait();
    }
    else {
        BARRIERS_[gid]->arrive_and_wait();
    }
    BARRIERS_[gid]->arrive_and_wait();
    THREAD_TIMING_[AGGRJ_TIMING_INDEX][tid * gid][0][TIME_STAMP_WAIT] = std::chrono::steady_clock::now();
@ -278,7 +263,6 @@ void aggr_j(size_t gid, size_t tid) {
            if (data_ptr == nullptr) {
                data_ptr = chunk_ptr;
                if constexpr (YIELD_ON_CACHE_MISS) sched_yield();
            }
            else {
                CACHE_HITS_[gid * tid]++;
@ -294,16 +278,9 @@ void aggr_j(size_t gid, size_t tid) {
    THREAD_TIMING_[AGGRJ_TIMING_INDEX][tid * gid][0][TIME_STAMP_END] = std::chrono::steady_clock::now();
    if constexpr (!AGGRJ_ITERATIVE) {
        BARRIERS_[gid]->arrive_and_drop();
    }
    BARRIERS_[gid]->arrive_and_drop();
    aggregation::happly(DATA_DST_ + (tid * GROUP_COUNT + gid), aggregator);
    if constexpr (AGGRJ_ITERATIVE) {
        if (++tid < TC_AGGRJ) aggr_j(gid, tid);
        else BARRIERS_[gid]->arrive_and_drop();
    }
 }
 int main() {
@ -349,12 +326,7 @@ int main() {
        std::vector<std::thread>  agg_pool;
        for(uint32_t gid = 0; gid < GROUP_COUNT; ++gid) {
            if constexpr (AGGRJ_ITERATIVE) {
                BARRIERS_.emplace_back(new std::barrier<NopStruct>(TC_SCANA + TC_SCANB + 1));
            }
            else {
                BARRIERS_.emplace_back(new std::barrier<NopStruct>(TC_COMBINED));
            }
            BARRIERS_.emplace_back(new std::barrier<NopStruct>(TC_COMBINED));
            for(uint32_t tid = 0; tid < TC_SCANA; ++tid) {
                filter_pool.emplace_back(scan_a, gid, tid);
@ -364,13 +336,8 @@ int main() {
                copy_pool.emplace_back(scan_b, gid, tid);
            }
            if constexpr (AGGRJ_ITERATIVE) {
                agg_pool.emplace_back(aggr_j, gid, 0);
            }
            else {
                for(uint32_t tid = 0; tid < TC_AGGRJ; ++tid) {
                    agg_pool.emplace_back(aggr_j, gid,  tid);
                }
            for(uint32_t tid = 0; tid < TC_AGGRJ; ++tid) {
                agg_pool.emplace_back(aggr_j, gid,  tid);
            }
        }