|
@ -32,11 +32,13 @@ constexpr uint32_t ITERATION_COUNT = 5; |
|
|
#ifdef MODE_PREFETCH
|
|
|
#ifdef MODE_PREFETCH
|
|
|
constexpr uint32_t GROUP_COUNT = 16; |
|
|
constexpr uint32_t GROUP_COUNT = 16; |
|
|
constexpr size_t CHUNK_SIZE_B = WL_SIZE_B / GROUP_COUNT; |
|
|
constexpr size_t CHUNK_SIZE_B = WL_SIZE_B / GROUP_COUNT; |
|
|
constexpr uint32_t TC_SCANA = 1; |
|
|
|
|
|
|
|
|
constexpr uint32_t TC_SCANA = 2; |
|
|
constexpr uint32_t TC_SCANB = 1; |
|
|
constexpr uint32_t TC_SCANB = 1; |
|
|
constexpr uint32_t TC_AGGRJ = 4; |
|
|
|
|
|
|
|
|
constexpr uint32_t TC_AGGRJ = 8; |
|
|
constexpr bool PERFORM_CACHING = true; |
|
|
constexpr bool PERFORM_CACHING = true; |
|
|
|
|
|
constexpr bool YIELD_ON_CACHE_MISS = false; |
|
|
constexpr bool DATA_IN_HBM = false; |
|
|
constexpr bool DATA_IN_HBM = false; |
|
|
|
|
|
constexpr bool AGGRJ_ITERATIVE = true; |
|
|
constexpr char MODE_STRING[] = "prefetch"; |
|
|
constexpr char MODE_STRING[] = "prefetch"; |
|
|
#endif
|
|
|
#endif
|
|
|
#ifdef MODE_DRAM
|
|
|
#ifdef MODE_DRAM
|
|
@ -46,7 +48,9 @@ constexpr uint32_t TC_SCANA = 4; |
|
|
constexpr uint32_t TC_SCANB = 0; |
|
|
constexpr uint32_t TC_SCANB = 0; |
|
|
constexpr uint32_t TC_AGGRJ = 2; |
|
|
constexpr uint32_t TC_AGGRJ = 2; |
|
|
constexpr bool PERFORM_CACHING = false; |
|
|
constexpr bool PERFORM_CACHING = false; |
|
|
|
|
|
constexpr bool YIELD_ON_CACHE_MISS = false; |
|
|
constexpr bool DATA_IN_HBM = false; |
|
|
constexpr bool DATA_IN_HBM = false; |
|
|
|
|
|
constexpr bool AGGRJ_ITERATIVE = false; |
|
|
constexpr char MODE_STRING[] = "dram"; |
|
|
constexpr char MODE_STRING[] = "dram"; |
|
|
#endif
|
|
|
#endif
|
|
|
#ifdef MODE_HBM
|
|
|
#ifdef MODE_HBM
|
|
@ -56,7 +60,9 @@ constexpr uint32_t TC_SCANA = 4; |
|
|
constexpr uint32_t TC_SCANB = 0; |
|
|
constexpr uint32_t TC_SCANB = 0; |
|
|
constexpr uint32_t TC_AGGRJ = 2; |
|
|
constexpr uint32_t TC_AGGRJ = 2; |
|
|
constexpr bool PERFORM_CACHING = false; |
|
|
constexpr bool PERFORM_CACHING = false; |
|
|
|
|
|
constexpr bool YIELD_ON_CACHE_MISS = false; |
|
|
constexpr bool DATA_IN_HBM = true; |
|
|
constexpr bool DATA_IN_HBM = true; |
|
|
|
|
|
constexpr bool AGGRJ_ITERATIVE = false; |
|
|
constexpr char MODE_STRING[] = "hbm"; |
|
|
constexpr char MODE_STRING[] = "hbm"; |
|
|
#endif
|
|
|
#endif
|
|
|
|
|
|
|
|
@ -92,6 +98,8 @@ constexpr size_t TIME_STAMP_END = 2; |
|
|
// THREAD_TIMING_[TYPE][TID][ITERATION][STAMP] = TIMEPOINT
|
|
|
// THREAD_TIMING_[TYPE][TID][ITERATION][STAMP] = TIMEPOINT
|
|
|
std::array<std::vector<std::vector<std::array<std::chrono::steady_clock::time_point, 3>>>, 3> THREAD_TIMING_; |
|
|
std::array<std::vector<std::vector<std::array<std::chrono::steady_clock::time_point, 3>>>, 3> THREAD_TIMING_; |
|
|
|
|
|
|
|
|
|
|
|
std::array<uint32_t, GROUP_COUNT * TC_AGGRJ> CACHE_HITS_; |
|
|
|
|
|
|
|
|
std::vector<std::barrier<NopStruct>*> BARRIERS_; |
|
|
std::vector<std::barrier<NopStruct>*> BARRIERS_; |
|
|
std::shared_future<void> LAUNCH_; |
|
|
std::shared_future<void> LAUNCH_; |
|
|
|
|
|
|
|
@ -114,7 +122,16 @@ template<size_t TC> |
|
|
inline uint16_t* get_mask(uint16_t* base, const size_t chunk_index, const size_t tid) { |
|
|
inline uint16_t* get_mask(uint16_t* base, const size_t chunk_index, const size_t tid) { |
|
|
size_t offset = chunk_index * CHUNK_SIZE_ELEMENTS + tid * (CHUNK_SIZE_ELEMENTS / TC); |
|
|
size_t offset = chunk_index * CHUNK_SIZE_ELEMENTS + tid * (CHUNK_SIZE_ELEMENTS / TC); |
|
|
return base + (offset / 16); |
|
|
return base + (offset / 16); |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
double process_cache_hitrate() { |
|
|
|
|
|
double hr = 0.0; |
|
|
|
|
|
|
|
|
|
|
|
for (const uint32_t& e : CACHE_HITS_) { |
|
|
|
|
|
hr += e; |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
return hr / (double)(TC_AGGRJ * GROUP_COUNT * RUN_COUNT); |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
void process_timings( |
|
|
void process_timings( |
|
@ -226,8 +243,10 @@ void scan_a(size_t gid, size_t tid) { |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
void aggr_j(size_t gid, size_t tid) { |
|
|
void aggr_j(size_t gid, size_t tid) { |
|
|
THREAD_TIMING_[AGGRJ_TIMING_INDEX][tid * gid].clear(); |
|
|
|
|
|
THREAD_TIMING_[AGGRJ_TIMING_INDEX][tid * gid].resize(1); |
|
|
|
|
|
|
|
|
CACHE_HITS_[gid * tid] = 0; |
|
|
|
|
|
|
|
|
|
|
|
THREAD_TIMING_[AGGRJ_TIMING_INDEX][tid * gid].clear(); |
|
|
|
|
|
THREAD_TIMING_[AGGRJ_TIMING_INDEX][tid * gid].resize(1); |
|
|
|
|
|
|
|
|
LAUNCH_.wait(); |
|
|
LAUNCH_.wait(); |
|
|
|
|
|
|
|
@ -249,11 +268,15 @@ void aggr_j(size_t gid, size_t tid) { |
|
|
|
|
|
|
|
|
if constexpr (PERFORM_CACHING) { |
|
|
if constexpr (PERFORM_CACHING) { |
|
|
data = CACHE_.Access(reinterpret_cast<uint8_t *>(chunk_ptr), CHUNK_SIZE_B / TC_AGGRJ); |
|
|
data = CACHE_.Access(reinterpret_cast<uint8_t *>(chunk_ptr), CHUNK_SIZE_B / TC_AGGRJ); |
|
|
data->WaitOnCompletion(); |
|
|
|
|
|
|
|
|
data->WaitOnCompletion(dsacache::WAIT_WEAK); |
|
|
data_ptr = reinterpret_cast<uint64_t*>(data->GetDataLocation()); |
|
|
data_ptr = reinterpret_cast<uint64_t*>(data->GetDataLocation()); |
|
|
|
|
|
|
|
|
if (data_ptr == nullptr) { |
|
|
if (data_ptr == nullptr) { |
|
|
std::cerr << "[x] Cache Miss!" << std::endl; |
|
|
|
|
|
exit(-1); |
|
|
|
|
|
|
|
|
data_ptr = chunk_ptr; |
|
|
|
|
|
if constexpr (YIELD_ON_CACHE_MISS) sched_yield(); |
|
|
|
|
|
} |
|
|
|
|
|
else { |
|
|
|
|
|
CACHE_HITS_[gid * tid]++; |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
else { |
|
|
else { |
|
@ -269,6 +292,10 @@ void aggr_j(size_t gid, size_t tid) { |
|
|
BARRIERS_[gid]->arrive_and_drop(); |
|
|
BARRIERS_[gid]->arrive_and_drop(); |
|
|
|
|
|
|
|
|
aggregation::happly(DATA_DST_ + (tid * GROUP_COUNT + gid), aggregator); |
|
|
aggregation::happly(DATA_DST_ + (tid * GROUP_COUNT + gid), aggregator); |
|
|
|
|
|
|
|
|
|
|
|
if constexpr (AGGRJ_ITERATIVE) { |
|
|
|
|
|
if (++tid < TC_AGGRJ) aggr_j(gid, tid); |
|
|
|
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
int main() { |
|
|
int main() { |
|
@ -283,7 +310,7 @@ int main() { |
|
|
const std::string ofname = "results/qdp-xeonmax-simpleq-" + std::string(MODE_STRING) + "-tca" + std::to_string(TC_SCANA) + "-tcb" + std::to_string(TC_SCANB) + "-tcj" + std::to_string(TC_AGGRJ) + "-tmul" + std::to_string(GROUP_COUNT) + "-wl" + std::to_string(WL_SIZE_B) + "-cs" + std::to_string(CHUNK_SIZE_B) + ".csv"; |
|
|
const std::string ofname = "results/qdp-xeonmax-simpleq-" + std::string(MODE_STRING) + "-tca" + std::to_string(TC_SCANA) + "-tcb" + std::to_string(TC_SCANB) + "-tcj" + std::to_string(TC_AGGRJ) + "-tmul" + std::to_string(GROUP_COUNT) + "-wl" + std::to_string(WL_SIZE_B) + "-cs" + std::to_string(CHUNK_SIZE_B) + ".csv"; |
|
|
std::ofstream fout(ofname); |
|
|
std::ofstream fout(ofname); |
|
|
|
|
|
|
|
|
fout << "run;rt-ns;rt-s;result[0];scana-run;scana-wait;scanb-run;scanb-wait;aggrj-run;aggrj-wait;" << std::endl; |
|
|
|
|
|
|
|
|
fout << "run;rt-ns;rt-s;result[0];scana-run;scana-wait;scanb-run;scanb-wait;aggrj-run;aggrj-wait;cache-hr;" << std::endl; |
|
|
|
|
|
|
|
|
if constexpr (DATA_IN_HBM) { |
|
|
if constexpr (DATA_IN_HBM) { |
|
|
DATA_A_ = (uint64_t*) numa_alloc_onnode(WL_SIZE_B, cache_node); |
|
|
DATA_A_ = (uint64_t*) numa_alloc_onnode(WL_SIZE_B, cache_node); |
|
@ -324,8 +351,13 @@ int main() { |
|
|
copy_pool.emplace_back(scan_b, gid, tid); |
|
|
copy_pool.emplace_back(scan_b, gid, tid); |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
for(uint32_t tid = 0; tid < TC_AGGRJ; ++tid) { |
|
|
|
|
|
agg_pool.emplace_back(aggr_j, gid, tid); |
|
|
|
|
|
|
|
|
if constexpr (AGGRJ_ITERATIVE) { |
|
|
|
|
|
agg_pool.emplace_back(aggr_j, gid, 0); |
|
|
|
|
|
} |
|
|
|
|
|
else { |
|
|
|
|
|
for(uint32_t tid = 0; tid < TC_AGGRJ; ++tid) { |
|
|
|
|
|
agg_pool.emplace_back(aggr_j, gid, tid); |
|
|
|
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
@ -354,6 +386,7 @@ int main() { |
|
|
<< nanos << ";" << seconds << ";" |
|
|
<< nanos << ";" << seconds << ";" |
|
|
<< std::hex << DATA_DST_[0] << std::dec << ";" |
|
|
<< std::hex << DATA_DST_[0] << std::dec << ";" |
|
|
<< scana_run << ";" << scana_wait << ";" << scanb_run << ";" << scanb_wait << ";" << aggrj_run << ";" << aggrj_wait << ";" |
|
|
<< scana_run << ";" << scana_wait << ";" << scanb_run << ";" << scanb_wait << ";" << aggrj_run << ";" << aggrj_wait << ";" |
|
|
|
|
|
<< process_cache_hitrate() << ";" |
|
|
<< std::endl; |
|
|
<< std::endl; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|