Browse Source

use weak wait, add options to tweak for caching mode

master
Constantin Fürst 11 months ago
parent
commit
178d45fafa
  1. 45
      qdp_project/src/Benchmark.cpp

45
qdp_project/src/Benchmark.cpp

@ -32,11 +32,13 @@ constexpr uint32_t ITERATION_COUNT = 5;
#ifdef MODE_PREFETCH #ifdef MODE_PREFETCH
constexpr uint32_t GROUP_COUNT = 16; constexpr uint32_t GROUP_COUNT = 16;
constexpr size_t CHUNK_SIZE_B = WL_SIZE_B / GROUP_COUNT; constexpr size_t CHUNK_SIZE_B = WL_SIZE_B / GROUP_COUNT;
constexpr uint32_t TC_SCANA = 1;
constexpr uint32_t TC_SCANA = 2;
constexpr uint32_t TC_SCANB = 1; constexpr uint32_t TC_SCANB = 1;
constexpr uint32_t TC_AGGRJ = 4;
constexpr uint32_t TC_AGGRJ = 8;
constexpr bool PERFORM_CACHING = true; constexpr bool PERFORM_CACHING = true;
constexpr bool YIELD_ON_CACHE_MISS = false;
constexpr bool DATA_IN_HBM = false; constexpr bool DATA_IN_HBM = false;
constexpr bool AGGRJ_ITERATIVE = true;
constexpr char MODE_STRING[] = "prefetch"; constexpr char MODE_STRING[] = "prefetch";
#endif #endif
#ifdef MODE_DRAM #ifdef MODE_DRAM
@ -46,7 +48,9 @@ constexpr uint32_t TC_SCANA = 4;
constexpr uint32_t TC_SCANB = 0; constexpr uint32_t TC_SCANB = 0;
constexpr uint32_t TC_AGGRJ = 2; constexpr uint32_t TC_AGGRJ = 2;
constexpr bool PERFORM_CACHING = false; constexpr bool PERFORM_CACHING = false;
constexpr bool YIELD_ON_CACHE_MISS = false;
constexpr bool DATA_IN_HBM = false; constexpr bool DATA_IN_HBM = false;
constexpr bool AGGRJ_ITERATIVE = false;
constexpr char MODE_STRING[] = "dram"; constexpr char MODE_STRING[] = "dram";
#endif #endif
#ifdef MODE_HBM #ifdef MODE_HBM
@ -56,7 +60,9 @@ constexpr uint32_t TC_SCANA = 4;
constexpr uint32_t TC_SCANB = 0; constexpr uint32_t TC_SCANB = 0;
constexpr uint32_t TC_AGGRJ = 2; constexpr uint32_t TC_AGGRJ = 2;
constexpr bool PERFORM_CACHING = false; constexpr bool PERFORM_CACHING = false;
constexpr bool YIELD_ON_CACHE_MISS = false;
constexpr bool DATA_IN_HBM = true; constexpr bool DATA_IN_HBM = true;
constexpr bool AGGRJ_ITERATIVE = false;
constexpr char MODE_STRING[] = "hbm"; constexpr char MODE_STRING[] = "hbm";
#endif #endif
@ -92,6 +98,8 @@ constexpr size_t TIME_STAMP_END = 2;
// THREAD_TIMING_[TYPE][TID][ITERATION][STAMP] = TIMEPOINT // THREAD_TIMING_[TYPE][TID][ITERATION][STAMP] = TIMEPOINT
std::array<std::vector<std::vector<std::array<std::chrono::steady_clock::time_point, 3>>>, 3> THREAD_TIMING_; std::array<std::vector<std::vector<std::array<std::chrono::steady_clock::time_point, 3>>>, 3> THREAD_TIMING_;
std::array<uint32_t, GROUP_COUNT * TC_AGGRJ> CACHE_HITS_;
std::vector<std::barrier<NopStruct>*> BARRIERS_; std::vector<std::barrier<NopStruct>*> BARRIERS_;
std::shared_future<void> LAUNCH_; std::shared_future<void> LAUNCH_;
@ -114,7 +122,16 @@ template<size_t TC>
inline uint16_t* get_mask(uint16_t* base, const size_t chunk_index, const size_t tid) { inline uint16_t* get_mask(uint16_t* base, const size_t chunk_index, const size_t tid) {
size_t offset = chunk_index * CHUNK_SIZE_ELEMENTS + tid * (CHUNK_SIZE_ELEMENTS / TC); size_t offset = chunk_index * CHUNK_SIZE_ELEMENTS + tid * (CHUNK_SIZE_ELEMENTS / TC);
return base + (offset / 16); return base + (offset / 16);
}
// Computes the value written to the "cache-hr" column of the result CSV.
// Adds up every counter stored in CACHE_HITS_ and divides the total by
// TC_AGGRJ * GROUP_COUNT * RUN_COUNT.
// NOTE(review): aggr_j addresses CACHE_HITS_ with gid * tid, so different
// (gid, tid) pairs can land on the same slot — confirm the slot mapping
// matches the denominator used here.
double process_cache_hitrate() {
// accumulate as double so the final division is done in floating point
double hr = 0.0;
for (const uint32_t& e : CACHE_HITS_) {
hr += e;
}
// the product is evaluated in integer arithmetic before the cast to double
return hr / (double)(TC_AGGRJ * GROUP_COUNT * RUN_COUNT);
} }
void process_timings( void process_timings(
@ -226,6 +243,8 @@ void scan_a(size_t gid, size_t tid) {
} }
void aggr_j(size_t gid, size_t tid) { void aggr_j(size_t gid, size_t tid) {
CACHE_HITS_[gid * tid] = 0;
THREAD_TIMING_[AGGRJ_TIMING_INDEX][tid * gid].clear(); THREAD_TIMING_[AGGRJ_TIMING_INDEX][tid * gid].clear();
THREAD_TIMING_[AGGRJ_TIMING_INDEX][tid * gid].resize(1); THREAD_TIMING_[AGGRJ_TIMING_INDEX][tid * gid].resize(1);
@ -249,11 +268,15 @@ void aggr_j(size_t gid, size_t tid) {
if constexpr (PERFORM_CACHING) { if constexpr (PERFORM_CACHING) {
data = CACHE_.Access(reinterpret_cast<uint8_t *>(chunk_ptr), CHUNK_SIZE_B / TC_AGGRJ); data = CACHE_.Access(reinterpret_cast<uint8_t *>(chunk_ptr), CHUNK_SIZE_B / TC_AGGRJ);
data->WaitOnCompletion();
data->WaitOnCompletion(dsacache::WAIT_WEAK);
data_ptr = reinterpret_cast<uint64_t*>(data->GetDataLocation()); data_ptr = reinterpret_cast<uint64_t*>(data->GetDataLocation());
if (data_ptr == nullptr) { if (data_ptr == nullptr) {
std::cerr << "[x] Cache Miss!" << std::endl;
exit(-1);
data_ptr = chunk_ptr;
if constexpr (YIELD_ON_CACHE_MISS) sched_yield();
}
else {
CACHE_HITS_[gid * tid]++;
} }
} }
else { else {
@ -269,6 +292,10 @@ void aggr_j(size_t gid, size_t tid) {
BARRIERS_[gid]->arrive_and_drop(); BARRIERS_[gid]->arrive_and_drop();
aggregation::happly(DATA_DST_ + (tid * GROUP_COUNT + gid), aggregator); aggregation::happly(DATA_DST_ + (tid * GROUP_COUNT + gid), aggregator);
if constexpr (AGGRJ_ITERATIVE) {
if (++tid < TC_AGGRJ) aggr_j(gid, tid);
}
} }
int main() { int main() {
@ -283,7 +310,7 @@ int main() {
const std::string ofname = "results/qdp-xeonmax-simpleq-" + std::string(MODE_STRING) + "-tca" + std::to_string(TC_SCANA) + "-tcb" + std::to_string(TC_SCANB) + "-tcj" + std::to_string(TC_AGGRJ) + "-tmul" + std::to_string(GROUP_COUNT) + "-wl" + std::to_string(WL_SIZE_B) + "-cs" + std::to_string(CHUNK_SIZE_B) + ".csv"; const std::string ofname = "results/qdp-xeonmax-simpleq-" + std::string(MODE_STRING) + "-tca" + std::to_string(TC_SCANA) + "-tcb" + std::to_string(TC_SCANB) + "-tcj" + std::to_string(TC_AGGRJ) + "-tmul" + std::to_string(GROUP_COUNT) + "-wl" + std::to_string(WL_SIZE_B) + "-cs" + std::to_string(CHUNK_SIZE_B) + ".csv";
std::ofstream fout(ofname); std::ofstream fout(ofname);
fout << "run;rt-ns;rt-s;result[0];scana-run;scana-wait;scanb-run;scanb-wait;aggrj-run;aggrj-wait;" << std::endl;
fout << "run;rt-ns;rt-s;result[0];scana-run;scana-wait;scanb-run;scanb-wait;aggrj-run;aggrj-wait;cache-hr;" << std::endl;
if constexpr (DATA_IN_HBM) { if constexpr (DATA_IN_HBM) {
DATA_A_ = (uint64_t*) numa_alloc_onnode(WL_SIZE_B, cache_node); DATA_A_ = (uint64_t*) numa_alloc_onnode(WL_SIZE_B, cache_node);
@ -324,10 +351,15 @@ int main() {
copy_pool.emplace_back(scan_b, gid, tid); copy_pool.emplace_back(scan_b, gid, tid);
} }
if constexpr (AGGRJ_ITERATIVE) {
agg_pool.emplace_back(aggr_j, gid, 0);
}
else {
for(uint32_t tid = 0; tid < TC_AGGRJ; ++tid) { for(uint32_t tid = 0; tid < TC_AGGRJ; ++tid) {
agg_pool.emplace_back(aggr_j, gid, tid); agg_pool.emplace_back(aggr_j, gid, tid);
} }
} }
}
const auto time_start = std::chrono::steady_clock::now(); const auto time_start = std::chrono::steady_clock::now();
@ -354,6 +386,7 @@ int main() {
<< nanos << ";" << seconds << ";" << nanos << ";" << seconds << ";"
<< std::hex << DATA_DST_[0] << std::dec << ";" << std::hex << DATA_DST_[0] << std::dec << ";"
<< scana_run << ";" << scana_wait << ";" << scanb_run << ";" << scanb_wait << ";" << aggrj_run << ";" << aggrj_wait << ";" << scana_run << ";" << scana_wait << ";" << scanb_run << ";" << scanb_wait << ";" << aggrj_run << ";" << aggrj_wait << ";"
<< process_cache_hitrate() << ";"
<< std::endl; << std::endl;
} }

Loading…
Cancel
Save