Browse Source

Remove the experimental code branches that turned out not to yield any benefit (sched_yield incurs too high a delay, and with the new load balancer, sub-chunking for aggr_j is no longer needed either)

master
Constantin Fürst 11 months ago
parent
commit
10a791dea1
  1. 53
      qdp_project/src/Benchmark.cpp

53
qdp_project/src/Benchmark.cpp

@ -1,9 +1,5 @@
#include <memory>
#include <cassert>
#include <mutex>
#include <cstring>
#include <bitset>
#include <algorithm>
#include <barrier>
#include <vector>
#include <fstream>
@ -20,7 +16,7 @@
#include "BenchmarkHelpers.cpp"
#define MODE_HBM
#define MODE_PREFETCH
////////////////////////////////
/// BENCHMARK SETUP
@ -30,15 +26,13 @@ constexpr uint32_t WARMUP_ITERATION_COUNT = 5;
constexpr uint32_t ITERATION_COUNT = 5;
#ifdef MODE_PREFETCH
constexpr uint32_t GROUP_COUNT = 8;
constexpr size_t CHUNK_SIZE_B = 16_MiB;
constexpr uint32_t TC_SCANA = 4;
constexpr uint32_t GROUP_COUNT = 16;
constexpr size_t CHUNK_SIZE_B = 8_MiB;
constexpr uint32_t TC_SCANA = 2;
constexpr uint32_t TC_SCANB = 1;
constexpr uint32_t TC_AGGRJ = 4;
constexpr uint32_t TC_AGGRJ = 2;
constexpr bool PERFORM_CACHING = true;
constexpr bool YIELD_ON_CACHE_MISS = false;
constexpr bool DATA_IN_HBM = false;
constexpr bool AGGRJ_ITERATIVE = true;
constexpr char MODE_STRING[] = "prefetch";
#endif
#ifdef MODE_DRAM
@ -48,9 +42,7 @@ constexpr uint32_t TC_SCANA = 2;
constexpr uint32_t TC_SCANB = 0;
constexpr uint32_t TC_AGGRJ = 1;
constexpr bool PERFORM_CACHING = false;
constexpr bool YIELD_ON_CACHE_MISS = false;
constexpr bool DATA_IN_HBM = false;
constexpr bool AGGRJ_ITERATIVE = false;
constexpr char MODE_STRING[] = "dram";
#endif
#ifdef MODE_HBM
@ -60,9 +52,7 @@ constexpr uint32_t TC_SCANA = 2;
constexpr uint32_t TC_SCANB = 0;
constexpr uint32_t TC_AGGRJ = 1;
constexpr bool PERFORM_CACHING = false;
constexpr bool YIELD_ON_CACHE_MISS = false;
constexpr bool DATA_IN_HBM = true;
constexpr bool AGGRJ_ITERATIVE = false;
constexpr char MODE_STRING[] = "hbm";
#endif
@ -254,12 +244,7 @@ void aggr_j(size_t gid, size_t tid) {
THREAD_TIMING_[AGGRJ_TIMING_INDEX][tid * gid][0][TIME_STAMP_BEGIN] = std::chrono::steady_clock::now();
if constexpr (AGGRJ_ITERATIVE) {
if (tid == 0) BARRIERS_[gid]->arrive_and_wait();
}
else {
BARRIERS_[gid]->arrive_and_wait();
}
BARRIERS_[gid]->arrive_and_wait();
THREAD_TIMING_[AGGRJ_TIMING_INDEX][tid * gid][0][TIME_STAMP_WAIT] = std::chrono::steady_clock::now();
@ -278,7 +263,6 @@ void aggr_j(size_t gid, size_t tid) {
if (data_ptr == nullptr) {
data_ptr = chunk_ptr;
if constexpr (YIELD_ON_CACHE_MISS) sched_yield();
}
else {
CACHE_HITS_[gid * tid]++;
@ -294,16 +278,9 @@ void aggr_j(size_t gid, size_t tid) {
THREAD_TIMING_[AGGRJ_TIMING_INDEX][tid * gid][0][TIME_STAMP_END] = std::chrono::steady_clock::now();
if constexpr (!AGGRJ_ITERATIVE) {
BARRIERS_[gid]->arrive_and_drop();
}
BARRIERS_[gid]->arrive_and_drop();
aggregation::happly(DATA_DST_ + (tid * GROUP_COUNT + gid), aggregator);
if constexpr (AGGRJ_ITERATIVE) {
if (++tid < TC_AGGRJ) aggr_j(gid, tid);
else BARRIERS_[gid]->arrive_and_drop();
}
}
int main() {
@ -349,12 +326,7 @@ int main() {
std::vector<std::thread> agg_pool;
for(uint32_t gid = 0; gid < GROUP_COUNT; ++gid) {
if constexpr (AGGRJ_ITERATIVE) {
BARRIERS_.emplace_back(new std::barrier<NopStruct>(TC_SCANA + TC_SCANB + 1));
}
else {
BARRIERS_.emplace_back(new std::barrier<NopStruct>(TC_COMBINED));
}
BARRIERS_.emplace_back(new std::barrier<NopStruct>(TC_COMBINED));
for(uint32_t tid = 0; tid < TC_SCANA; ++tid) {
filter_pool.emplace_back(scan_a, gid, tid);
@ -364,13 +336,8 @@ int main() {
copy_pool.emplace_back(scan_b, gid, tid);
}
if constexpr (AGGRJ_ITERATIVE) {
agg_pool.emplace_back(aggr_j, gid, 0);
}
else {
for(uint32_t tid = 0; tid < TC_AGGRJ; ++tid) {
agg_pool.emplace_back(aggr_j, gid, tid);
}
for(uint32_t tid = 0; tid < TC_AGGRJ; ++tid) {
agg_pool.emplace_back(aggr_j, gid, tid);
}
}

Loading…
Cancel
Save