From 4a587a36e2226ff0aad623a3421062372e4aa6dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Constantin=20F=C3=BCrst?= Date: Wed, 7 Feb 2024 16:03:36 +0100 Subject: [PATCH] remove overlap-execution barriers and run for the entire block --- qdp_project/src/Benchmark.cpp | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/qdp_project/src/Benchmark.cpp b/qdp_project/src/Benchmark.cpp index 2afc916..d11e403 100644 --- a/qdp_project/src/Benchmark.cpp +++ b/qdp_project/src/Benchmark.cpp @@ -54,46 +54,44 @@ void scan_b(size_t gid, size_t tid) { void scan_a(size_t gid, size_t tid) { THREAD_TIMING_[SCANA_TIMING_INDEX][UniqueIndex(gid,tid)].clear(); - THREAD_TIMING_[SCANA_TIMING_INDEX][UniqueIndex(gid,tid)].resize(RUN_COUNT); + THREAD_TIMING_[SCANA_TIMING_INDEX][UniqueIndex(gid,tid)].resize(1); LAUNCH_.wait(); - for (size_t i = 0; i < RUN_COUNT; i++) { - THREAD_TIMING_[SCANA_TIMING_INDEX][UniqueIndex(gid,tid)][i][TIME_STAMP_BEGIN] = std::chrono::steady_clock::now(); + THREAD_TIMING_[SCANA_TIMING_INDEX][UniqueIndex(gid,tid)][0][TIME_STAMP_BEGIN] = std::chrono::steady_clock::now(); + for (size_t i = 0; i < RUN_COUNT; i++) { const size_t chunk_index = get_chunk_index(gid, i); uint64_t* chunk_ptr = get_chunk(DATA_A_, chunk_index, tid); uint16_t* mask_ptr = get_mask(MASK_A_, chunk_index, tid); filter::apply_same(mask_ptr, nullptr, chunk_ptr, CMP_A, SUBCHUNK_SIZE_B_SCANA); - - THREAD_TIMING_[SCANA_TIMING_INDEX][UniqueIndex(gid,tid)][i][TIME_STAMP_WAIT] = std::chrono::steady_clock::now(); - - BARRIERS_[gid]->arrive_and_wait(); - - THREAD_TIMING_[SCANA_TIMING_INDEX][UniqueIndex(gid,tid)][i][TIME_STAMP_END] = std::chrono::steady_clock::now(); } + THREAD_TIMING_[SCANA_TIMING_INDEX][UniqueIndex(gid,tid)][0][TIME_STAMP_WAIT] = std::chrono::steady_clock::now(); + BARRIERS_[gid]->arrive_and_drop(); + + THREAD_TIMING_[SCANA_TIMING_INDEX][UniqueIndex(gid,tid)][0][TIME_STAMP_END] = std::chrono::steady_clock::now(); } void aggr_j(size_t gid, size_t tid) { CACHE_HITS_[UniqueIndex(gid,tid)] = 0; THREAD_TIMING_[AGGRJ_TIMING_INDEX][UniqueIndex(gid,tid)].clear(); - THREAD_TIMING_[AGGRJ_TIMING_INDEX][UniqueIndex(gid,tid)].resize(RUN_COUNT); + THREAD_TIMING_[AGGRJ_TIMING_INDEX][UniqueIndex(gid,tid)].resize(1); __m512i aggregator = aggregation::OP::zero(); LAUNCH_.wait(); - - for (size_t i = 0; i < RUN_COUNT; i++) { - THREAD_TIMING_[AGGRJ_TIMING_INDEX][UniqueIndex(gid,tid)][i][TIME_STAMP_BEGIN] = std::chrono::steady_clock::now(); - BARRIERS_[gid]->arrive_and_wait(); + THREAD_TIMING_[AGGRJ_TIMING_INDEX][UniqueIndex(gid,tid)][0][TIME_STAMP_BEGIN] = std::chrono::steady_clock::now(); + + BARRIERS_[gid]->arrive_and_drop(); - THREAD_TIMING_[AGGRJ_TIMING_INDEX][UniqueIndex(gid,tid)][i][TIME_STAMP_WAIT] = std::chrono::steady_clock::now(); + THREAD_TIMING_[AGGRJ_TIMING_INDEX][UniqueIndex(gid,tid)][0][TIME_STAMP_WAIT] = std::chrono::steady_clock::now(); + for (size_t i = 0; i < RUN_COUNT; i++) { const size_t chunk_index = get_chunk_index(gid, i); uint64_t* chunk_ptr = get_chunk(DATA_B_, chunk_index, tid); uint16_t* mask_ptr_a = get_mask(MASK_A_, chunk_index, tid); @@ -124,12 +122,11 @@ void aggr_j(size_t gid, size_t tid) { uint64_t tmp = _mm512_reduce_add_epi64(aggregator); aggregator = aggregation::apply_masked(aggregator, data_ptr, mask_ptr_a, SUBCHUNK_SIZE_B_AGGRJ); - THREAD_TIMING_[AGGRJ_TIMING_INDEX][UniqueIndex(gid,tid)][i][TIME_STAMP_END] = std::chrono::steady_clock::now(); } aggregation::happly(&DATA_DST_[UniqueIndex(gid,tid)], aggregator); - BARRIERS_[gid]->arrive_and_drop(); + THREAD_TIMING_[AGGRJ_TIMING_INDEX][UniqueIndex(gid,tid)][0][TIME_STAMP_END] = std::chrono::steady_clock::now(); } int main() { @@ -162,6 +159,7 @@ int main() { // which is configured for xeonmax with smart assignment uint64_t cache_flags = 0; cache_flags |= dsacache::FLAG_WAIT_WEAK; + cache_flags |= dsacache::FLAG_HANDLE_PF; CACHE_.SetFlags(cache_flags); CACHE_.Init(CachePlacementPolicy, CopyMethodPolicy); }