diff --git a/qdp_project/src/Benchmark.cpp b/qdp_project/src/Benchmark.cpp
index 769fa82..2afc916 100644
--- a/qdp_project/src/Benchmark.cpp
+++ b/qdp_project/src/Benchmark.cpp
@@ -29,51 +29,6 @@
 uint64_t* DATA_B_;
 uint16_t* MASK_A_;
 uint64_t* DATA_DST_;
-// if more b than j -> perform b normal, subsplit j
-// if more j than b -> subsplit b like it is now
-
-template <size_t TC_CACHING>
-void caching(size_t gid, size_t tid) {
-    constexpr size_t VIRT_TID_INCREMENT = TC_CACHING / TC_AGGRJ;
-    constexpr size_t SUBCHUNK_THREAD_RATIO = TC_AGGRJ / (TC_CACHING == 0 ? 1 : TC_CACHING);
-    constexpr bool CACHE_SUBCHUNKING = SUBCHUNK_THREAD_RATIO > 1;
-    constexpr bool CACHE_OVERCHUNKING = VIRT_TID_INCREMENT > 1;
-
-    if constexpr (CACHE_SUBCHUNKING) {
-        constexpr size_t SUBCHUNK_COUNT = SUBCHUNK_THREAD_RATIO > 0 ? SUBCHUNK_THREAD_RATIO : 1;
-        constexpr size_t SUBCHUNK_SIZE_B = CHUNK_SIZE_B / SUBCHUNK_COUNT;
-        constexpr size_t SUBCHUNK_SIZE_ELEMENTS = CHUNK_SIZE_ELEMENTS / SUBCHUNK_COUNT;
-
-        for (size_t i = 0; i < RUN_COUNT; i++) {
-            const size_t chunk_index = get_chunk_index(gid, i);
-            uint64_t* chunk_ptr = get_chunk(DATA_B_, chunk_index, tid);
-
-            for (size_t j = 0; j < SUBCHUNK_COUNT; j++) {
-                uint64_t* sub_chunk_ptr = &chunk_ptr[j * SUBCHUNK_SIZE_ELEMENTS];
-                CACHE_.Access(reinterpret_cast<uint8_t*>(sub_chunk_ptr), SUBCHUNK_SIZE_B);
-            }
-        }
-    }
-    else if constexpr (CACHE_OVERCHUNKING) {
-        for (size_t tid_virt = tid; tid_virt < TC_AGGRJ; tid_virt += VIRT_TID_INCREMENT) {
-            for (size_t i = 0; i < RUN_COUNT; i++) {
-                const size_t chunk_index = get_chunk_index(gid, i);
-                uint64_t *chunk_ptr = get_chunk(DATA_B_, chunk_index, tid_virt);
-
-                CACHE_.Access(reinterpret_cast<uint8_t*>(chunk_ptr), CHUNK_SIZE_B);
-            }
-        }
-    }
-    else {
-        for (size_t i = 0; i < RUN_COUNT; i++) {
-            const size_t chunk_index = get_chunk_index(gid, i);
-            uint64_t* chunk_ptr = get_chunk(DATA_B_, chunk_index, tid);
-
-            CACHE_.Access(reinterpret_cast<uint8_t*>(chunk_ptr), CHUNK_SIZE_B);
-        }
-    }
-}
-
 void scan_b(size_t gid, size_t tid) {
     THREAD_TIMING_[SCANB_TIMING_INDEX][UniqueIndex(gid,tid)].clear();
     THREAD_TIMING_[SCANB_TIMING_INDEX][UniqueIndex(gid,tid)].resize(1);
@@ -83,7 +38,14 @@ void scan_b(size_t gid, size_t tid) {
     THREAD_TIMING_[SCANB_TIMING_INDEX][UniqueIndex(gid,tid)][0][TIME_STAMP_BEGIN] = std::chrono::steady_clock::now();

     if constexpr (PERFORM_CACHING) {
-        caching<TC_SCANB>(gid, tid);
+        static_assert(TC_AGGRJ == TC_SCANB);
+
+        for (size_t i = 0; i < RUN_COUNT; i++) {
+            const size_t chunk_index = get_chunk_index(gid, i);
+            uint64_t* chunk_ptr = get_chunk(DATA_B_, chunk_index, tid);
+
+            CACHE_.Access(reinterpret_cast<uint8_t*>(chunk_ptr), SUBCHUNK_SIZE_B_AGGRJ);
+        }
     }

     THREAD_TIMING_[SCANB_TIMING_INDEX][UniqueIndex(gid,tid)][0][TIME_STAMP_WAIT] = std::chrono::steady_clock::now();
@@ -97,25 +59,25 @@ void scan_a(size_t gid, size_t tid) {
     LAUNCH_.wait();

     for (size_t i = 0; i < RUN_COUNT; i++) {
-        THREAD_TIMING_[SCANA_TIMING_INDEX][UniqueIndex(gid,tid)][0][TIME_STAMP_BEGIN] = std::chrono::steady_clock::now();
+        THREAD_TIMING_[SCANA_TIMING_INDEX][UniqueIndex(gid,tid)][i][TIME_STAMP_BEGIN] = std::chrono::steady_clock::now();

         const size_t chunk_index = get_chunk_index(gid, i);
         uint64_t* chunk_ptr = get_chunk(DATA_A_, chunk_index, tid);
         uint16_t* mask_ptr = get_mask(MASK_A_, chunk_index, tid);

-        filter::apply_same(mask_ptr, nullptr, chunk_ptr, CMP_A, CHUNK_SIZE_B / TC_SCANA);
+        filter::apply_same(mask_ptr, nullptr, chunk_ptr, CMP_A, SUBCHUNK_SIZE_B_SCANA);
+
+        THREAD_TIMING_[SCANA_TIMING_INDEX][UniqueIndex(gid,tid)][i][TIME_STAMP_WAIT] = std::chrono::steady_clock::now();
-        THREAD_TIMING_[SCANA_TIMING_INDEX][UniqueIndex(gid,tid)][0][TIME_STAMP_WAIT] = std::chrono::steady_clock::now();
-        THREAD_TIMING_[SCANA_TIMING_INDEX][UniqueIndex(gid,tid)][0][TIME_STAMP_END] = std::chrono::steady_clock::now();
+        BARRIERS_[gid]->arrive_and_wait();
+
+        THREAD_TIMING_[SCANA_TIMING_INDEX][UniqueIndex(gid,tid)][i][TIME_STAMP_END] = std::chrono::steady_clock::now();
     }

     BARRIERS_[gid]->arrive_and_drop();
 }

 void aggr_j(size_t gid, size_t tid) {
-    constexpr size_t SUBCHUNK_SIZE_B = CHUNK_SIZE_B / TC_AGGRJ;
-    constexpr size_t LAST_CHUNK_SIZE_B = SUBCHUNK_SIZE_B + (CHUNK_SIZE_B % (TC_AGGRJ * GROUP_COUNT));
-
     CACHE_HITS_[UniqueIndex(gid,tid)] = 0;

     THREAD_TIMING_[AGGRJ_TIMING_INDEX][UniqueIndex(gid,tid)].clear();
@@ -124,11 +86,12 @@ void aggr_j(size_t gid, size_t tid) {
     __m512i aggregator = aggregation::OP::zero();

     LAUNCH_.wait();
-
-    BARRIERS_[gid]->arrive_and_drop();
-
+
     for (size_t i = 0; i < RUN_COUNT; i++) {
         THREAD_TIMING_[AGGRJ_TIMING_INDEX][UniqueIndex(gid,tid)][i][TIME_STAMP_BEGIN] = std::chrono::steady_clock::now();
+
+        BARRIERS_[gid]->arrive_and_wait();
+
         THREAD_TIMING_[AGGRJ_TIMING_INDEX][UniqueIndex(gid,tid)][i][TIME_STAMP_WAIT] = std::chrono::steady_clock::now();

         const size_t chunk_index = get_chunk_index(gid, i);
@@ -139,7 +102,7 @@ void aggr_j(size_t gid, size_t tid) {
         uint64_t* data_ptr;

         if constexpr (PERFORM_CACHING) {
-            data = CACHE_.Access(reinterpret_cast<uint8_t*>(chunk_ptr), SUBCHUNK_SIZE_B, dsacache::FLAG_ACCESS_WEAK);
+            data = CACHE_.Access(reinterpret_cast<uint8_t*>(chunk_ptr), SUBCHUNK_SIZE_B_AGGRJ, dsacache::FLAG_ACCESS_WEAK);
             data->WaitOnCompletion();
             data_ptr = reinterpret_cast<uint64_t*>(data->GetDataLocation());
@@ -159,12 +122,14 @@ void aggr_j(size_t gid, size_t tid) {
         }

         uint64_t tmp = _mm512_reduce_add_epi64(aggregator);
-        aggregator = aggregation::apply_masked(aggregator, data_ptr, mask_ptr_a, SUBCHUNK_SIZE_B);
+        aggregator = aggregation::apply_masked(aggregator, data_ptr, mask_ptr_a, SUBCHUNK_SIZE_B_AGGRJ);

         THREAD_TIMING_[AGGRJ_TIMING_INDEX][UniqueIndex(gid,tid)][i][TIME_STAMP_END] = std::chrono::steady_clock::now();
     }

     aggregation::happly(&DATA_DST_[UniqueIndex(gid,tid)], aggregator);
+
+    BARRIERS_[gid]->arrive_and_drop();
 }

 int main() {
diff --git a/qdp_project/src/Configuration.hpp b/qdp_project/src/Configuration.hpp
index c7a6445..ca0891e 100644
--- a/qdp_project/src/Configuration.hpp
+++ b/qdp_project/src/Configuration.hpp
@@ -13,37 +13,34 @@
 constexpr int MEM_NODE_HBM = 8;
 constexpr int MEM_NODE_DRAM = 0;

 #ifdef MODE_PREFETCH
-constexpr uint32_t GROUP_COUNT = 8;
-constexpr size_t CHUNK_SIZE_B = 16_MiB;
-constexpr uint32_t TC_SCANA = 2;
+constexpr uint32_t GROUP_COUNT = 4;
+constexpr size_t CHUNK_SIZE_B = 8_MiB;
+constexpr uint32_t TC_SCANA = 4;
 constexpr uint32_t TC_SCANB = 1;
 constexpr uint32_t TC_AGGRJ = 1;
 constexpr bool PERFORM_CACHING = true;
-constexpr bool PERFORM_CACHING_IN_AGGREGATION = false;
-constexpr int MEM_NODE_A = 1;
-constexpr int MEM_NODE_B = 2;
+constexpr int MEM_NODE_A = 0;
+constexpr int MEM_NODE_B = 0;
 constexpr char MODE_STRING[] = "prefetch";
 #endif

 #ifdef MODE_DRAM
 constexpr size_t CHUNK_SIZE_B = 2_MiB;
-constexpr uint32_t GROUP_COUNT = 8;
-constexpr uint32_t TC_SCANA = 2;
+constexpr uint32_t GROUP_COUNT = 4;
+constexpr uint32_t TC_SCANA = 4;
 constexpr uint32_t TC_SCANB = 0;
 constexpr uint32_t TC_AGGRJ = 1;
 constexpr bool PERFORM_CACHING = false;
-constexpr bool PERFORM_CACHING_IN_AGGREGATION = false;
 constexpr int MEM_NODE_A = 0;
 constexpr int MEM_NODE_B = 0;
 constexpr char MODE_STRING[] = "dram";
 #endif

 #ifdef MODE_HBM
 constexpr size_t CHUNK_SIZE_B = 2_MiB;
-constexpr uint32_t GROUP_COUNT = 8;
-constexpr uint32_t TC_SCANA = 2;
+constexpr uint32_t GROUP_COUNT = 4;
+constexpr uint32_t TC_SCANA = 4;
 constexpr uint32_t TC_SCANB = 0;
 constexpr uint32_t TC_AGGRJ = 1;
 constexpr bool PERFORM_CACHING = false;
-constexpr bool PERFORM_CACHING_IN_AGGREGATION = false;
 constexpr int MEM_NODE_A = 0;
 constexpr int MEM_NODE_B = 8;
 constexpr char MODE_STRING[] = "hbm";
@@ -55,6 +52,9 @@ constexpr size_t CHUNK_COUNT = WL_SIZE_B / CHUNK_SIZE_B;
 constexpr size_t CHUNK_SIZE_ELEMENTS = CHUNK_SIZE_B / sizeof(uint64_t);
 constexpr size_t RUN_COUNT = CHUNK_COUNT / GROUP_COUNT;

+constexpr size_t SUBCHUNK_SIZE_B_SCANA = CHUNK_SIZE_B / TC_SCANA;
+constexpr size_t SUBCHUNK_SIZE_B_AGGRJ = CHUNK_SIZE_B / TC_AGGRJ;
+
 static_assert(RUN_COUNT > 0);
 static_assert(WL_SIZE_B % 16 == 0);
 static_assert(CHUNK_SIZE_B % 16 == 0);
diff --git a/qdp_project/src/utils/aggregation.h b/qdp_project/src/utils/aggregation.h
index 91920d2..648be48 100644
--- a/qdp_project/src/utils/aggregation.h
+++ b/qdp_project/src/utils/aggregation.h
@@ -252,8 +252,6 @@ public:
         size_t value_count = chunk_size_b / sizeof(base_t);
         size_t i = 0;

-        *time_load = 0;
-
         // stop before! running out of space
         if(value_count >= lanes) // keep in mind size_w is unsigned so if it becomes negative, it doesn't.
             for(; i <= value_count - lanes; i += lanes) {
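
Note on the synchronization change in Benchmark.cpp: before this patch, aggr_j dropped out of its group barrier once before the processing loop and scan_a never signalled per chunk, so scan_a's timestamps were also only ever written to index 0. After the patch, scan_a signals the barrier at the end of every loop iteration and aggr_j blocks on it at the top of every iteration, so aggregation of chunk i can only start after the masks for chunk i exist. The sketch below is a minimal standalone reproduction of that handshake, assuming the arrive_and_wait/arrive_and_drop calls map to C++20 std::barrier; the participant count, RUN_COUNT, and the printf loop bodies are illustrative placeholders, not the project's values.

#include <barrier>
#include <cstdio>
#include <thread>
#include <vector>

int main() {
    constexpr int RUN_COUNT = 4;  // placeholder; the project derives this from the workload size
    constexpr int NUM_SCANA = 2;  // placeholder thread count

    // scan_a threads plus the single aggr_j thread participate in the barrier
    std::barrier barrier(NUM_SCANA + 1);

    auto scan_a = [&](int tid) {
        for (int i = 0; i < RUN_COUNT; i++) {
            std::printf("scan_a %d: filtered chunk %d\n", tid, i);
            barrier.arrive_and_wait();   // publish the mask for chunk i
        }
        barrier.arrive_and_drop();       // leave; later phases no longer wait for this thread
    };

    auto aggr_j = [&]() {
        for (int i = 0; i < RUN_COUNT; i++) {
            barrier.arrive_and_wait();   // block until the masks for chunk i exist
            std::printf("aggr_j: aggregated chunk %d\n", i);
        }
        barrier.arrive_and_drop();
    };

    std::vector<std::jthread> workers;
    for (int t = 0; t < NUM_SCANA; t++) workers.emplace_back(scan_a, t);
    workers.emplace_back(aggr_j);
}

With one barrier phase per chunk, the consumer can never overtake the producers, while the producers may run at most one chunk ahead; the final arrive_and_drop lets each side leave without stalling the last phase, mirroring its placement in the patch.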
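
For reference, the derived constants under the new MODE_PREFETCH settings (GROUP_COUNT = 4, CHUNK_SIZE_B = 8_MiB, TC_SCANA = 4, TC_AGGRJ = 1) work out as in the following self-contained check. WL_SIZE_B and the _MiB/_GiB literals are not shown in the diff, so the 4_GiB workload size and the literal definitions here are assumptions for illustration only.

#include <cstddef>
#include <cstdint>

// _MiB/_GiB mirrored here for self-containment; the project's own
// definitions are not shown in the diff.
constexpr size_t operator""_MiB(unsigned long long v) { return v << 20; }
constexpr size_t operator""_GiB(unsigned long long v) { return v << 30; }

constexpr size_t WL_SIZE_B = 4_GiB;  // assumed example value, not from the diff
constexpr uint32_t GROUP_COUNT = 4;
constexpr size_t CHUNK_SIZE_B = 8_MiB;
constexpr uint32_t TC_SCANA = 4;
constexpr uint32_t TC_AGGRJ = 1;

constexpr size_t CHUNK_COUNT = WL_SIZE_B / CHUNK_SIZE_B;                 // 512 chunks
constexpr size_t CHUNK_SIZE_ELEMENTS = CHUNK_SIZE_B / sizeof(uint64_t);  // 1048576 values
constexpr size_t RUN_COUNT = CHUNK_COUNT / GROUP_COUNT;                  // 128 iterations per group
constexpr size_t SUBCHUNK_SIZE_B_SCANA = CHUNK_SIZE_B / TC_SCANA;        // 2 MiB per scan_a thread
constexpr size_t SUBCHUNK_SIZE_B_AGGRJ = CHUNK_SIZE_B / TC_AGGRJ;        // 8 MiB, the whole chunk

static_assert(RUN_COUNT > 0);
static_assert(SUBCHUNK_SIZE_B_SCANA == 2_MiB);
static_assert(SUBCHUNK_SIZE_B_AGGRJ == CHUNK_SIZE_B);

int main() {}

Since TC_AGGRJ == 1, SUBCHUNK_SIZE_B_AGGRJ equals the full chunk size; together with the new static_assert(TC_AGGRJ == TC_SCANB), this is presumably what lets scan_b pass exactly the size to CACHE_.Access that aggr_j later requests, so cache entries line up between the two stages.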