From 178d45fafa3cbf2d6064c2a16d15d3f89881fc9f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Constantin=20F=C3=BCrst?= Date: Wed, 24 Jan 2024 20:54:33 +0100 Subject: [PATCH] use weak wait, add options to tweak for caching mode --- qdp_project/src/Benchmark.cpp | 53 ++++++++++++++++++++++++++++------- 1 file changed, 43 insertions(+), 10 deletions(-) diff --git a/qdp_project/src/Benchmark.cpp b/qdp_project/src/Benchmark.cpp index 9687c35..0ba05e9 100644 --- a/qdp_project/src/Benchmark.cpp +++ b/qdp_project/src/Benchmark.cpp @@ -32,11 +32,13 @@ constexpr uint32_t ITERATION_COUNT = 5; #ifdef MODE_PREFETCH constexpr uint32_t GROUP_COUNT = 16; constexpr size_t CHUNK_SIZE_B = WL_SIZE_B / GROUP_COUNT; -constexpr uint32_t TC_SCANA = 1; +constexpr uint32_t TC_SCANA = 2; constexpr uint32_t TC_SCANB = 1; -constexpr uint32_t TC_AGGRJ = 4; +constexpr uint32_t TC_AGGRJ = 8; constexpr bool PERFORM_CACHING = true; +constexpr bool YIELD_ON_CACHE_MISS = false; constexpr bool DATA_IN_HBM = false; +constexpr bool AGGRJ_ITERATIVE = true; constexpr char MODE_STRING[] = "prefetch"; #endif #ifdef MODE_DRAM @@ -46,7 +48,9 @@ constexpr uint32_t TC_SCANA = 4; constexpr uint32_t TC_SCANB = 0; constexpr uint32_t TC_AGGRJ = 2; constexpr bool PERFORM_CACHING = false; +constexpr bool YIELD_ON_CACHE_MISS = false; constexpr bool DATA_IN_HBM = false; +constexpr bool AGGRJ_ITERATIVE = false; constexpr char MODE_STRING[] = "dram"; #endif #ifdef MODE_HBM @@ -56,7 +60,9 @@ constexpr uint32_t TC_SCANA = 4; constexpr uint32_t TC_SCANB = 0; constexpr uint32_t TC_AGGRJ = 2; constexpr bool PERFORM_CACHING = false; +constexpr bool YIELD_ON_CACHE_MISS = false; constexpr bool DATA_IN_HBM = true; +constexpr bool AGGRJ_ITERATIVE = false; constexpr char MODE_STRING[] = "hbm"; #endif @@ -92,6 +98,8 @@ constexpr size_t TIME_STAMP_END = 2; // THREAD_TIMING_[TYPE][TID][ITERATION][STAMP] = TIMEPOINT std::array>>, 3> THREAD_TIMING_; +std::array CACHE_HITS_; + std::vector*> BARRIERS_; std::shared_future LAUNCH_; @@ -114,7 +122,16 @@ template inline uint16_t* get_mask(uint16_t* base, const size_t chunk_index, const size_t tid) { size_t offset = chunk_index * CHUNK_SIZE_ELEMENTS + tid * (CHUNK_SIZE_ELEMENTS / TC); return base + (offset / 16); +} + +double process_cache_hitrate() { + double hr = 0.0; + for (const uint32_t& e : CACHE_HITS_) { + hr += e; + } + + return hr / (double)(TC_AGGRJ * GROUP_COUNT * RUN_COUNT); } void process_timings( @@ -226,8 +243,10 @@ void scan_a(size_t gid, size_t tid) { } void aggr_j(size_t gid, size_t tid) { - THREAD_TIMING_[AGGRJ_TIMING_INDEX][tid * gid].clear(); - THREAD_TIMING_[AGGRJ_TIMING_INDEX][tid * gid].resize(1); + CACHE_HITS_[gid * tid] = 0; + + THREAD_TIMING_[AGGRJ_TIMING_INDEX][tid * gid].clear(); + THREAD_TIMING_[AGGRJ_TIMING_INDEX][tid * gid].resize(1); LAUNCH_.wait(); @@ -249,11 +268,15 @@ void aggr_j(size_t gid, size_t tid) { if constexpr (PERFORM_CACHING) { data = CACHE_.Access(reinterpret_cast(chunk_ptr), CHUNK_SIZE_B / TC_AGGRJ); - data->WaitOnCompletion(); + data->WaitOnCompletion(dsacache::WAIT_WEAK); data_ptr = reinterpret_cast(data->GetDataLocation()); + if (data_ptr == nullptr) { - std::cerr << "[x] Cache Miss!" << std::endl; - exit(-1); + data_ptr = chunk_ptr; + if constexpr (YIELD_ON_CACHE_MISS) sched_yield(); + } + else { + CACHE_HITS_[gid * tid]++; } } else { @@ -269,6 +292,10 @@ void aggr_j(size_t gid, size_t tid) { BARRIERS_[gid]->arrive_and_drop(); aggregation::happly(DATA_DST_ + (tid * GROUP_COUNT + gid), aggregator); + + if constexpr (AGGRJ_ITERATIVE) { + if (++tid < TC_AGGRJ) aggr_j(gid, tid); + } } int main() { @@ -283,7 +310,7 @@ int main() { const std::string ofname = "results/qdp-xeonmax-simpleq-" + std::string(MODE_STRING) + "-tca" + std::to_string(TC_SCANA) + "-tcb" + std::to_string(TC_SCANB) + "-tcj" + std::to_string(TC_AGGRJ) + "-tmul" + std::to_string(GROUP_COUNT) + "-wl" + std::to_string(WL_SIZE_B) + "-cs" + std::to_string(CHUNK_SIZE_B) + ".csv"; std::ofstream fout(ofname); - fout << "run;rt-ns;rt-s;result[0];scana-run;scana-wait;scanb-run;scanb-wait;aggrj-run;aggrj-wait;" << std::endl; + fout << "run;rt-ns;rt-s;result[0];scana-run;scana-wait;scanb-run;scanb-wait;aggrj-run;aggrj-wait;cache-hr;" << std::endl; if constexpr (DATA_IN_HBM) { DATA_A_ = (uint64_t*) numa_alloc_onnode(WL_SIZE_B, cache_node); @@ -324,8 +351,13 @@ int main() { copy_pool.emplace_back(scan_b, gid, tid); } - for(uint32_t tid = 0; tid < TC_AGGRJ; ++tid) { - agg_pool.emplace_back(aggr_j, gid, tid); + if constexpr (AGGRJ_ITERATIVE) { + agg_pool.emplace_back(aggr_j, gid, 0); + } + else { + for(uint32_t tid = 0; tid < TC_AGGRJ; ++tid) { + agg_pool.emplace_back(aggr_j, gid, tid); + } } } @@ -354,6 +386,7 @@ int main() { << nanos << ";" << seconds << ";" << std::hex << DATA_DST_[0] << std::dec << ";" << scana_run << ";" << scana_wait << ";" << scanb_run << ";" << scanb_wait << ";" << aggrj_run << ";" << aggrj_wait << ";" + << process_cache_hitrate() << ";" << std::endl; }