|
|
@ -70,12 +70,11 @@ base_t sum_check_complex(base_t compare_value_a, base_t compare_value_b, base_t* |
|
|
|
enum class ExecMode { |
|
|
|
DramBaseline, |
|
|
|
HbmPeak, |
|
|
|
HbmPrefetch, |
|
|
|
HbmPrefetchOpt |
|
|
|
HbmPrefetch |
|
|
|
}; |
|
|
|
|
|
|
|
int main(int argc, char** argv) { |
|
|
|
constexpr ExecMode mode = ExecMode::HbmPrefetchOpt; |
|
|
|
constexpr ExecMode mode = ExecMode::HbmPrefetch; |
|
|
|
|
|
|
|
constexpr size_t workload_b = 4_GiB; |
|
|
|
constexpr base_t compare_value_a = 50; |
|
|
@ -97,13 +96,8 @@ int main(int argc, char** argv) { |
|
|
|
|
|
|
|
if constexpr (mode == ExecMode::HbmPrefetch) { |
|
|
|
tc_filter = 8; |
|
|
|
tc_copy = 8; |
|
|
|
tc_agg = 8; |
|
|
|
} |
|
|
|
else if constexpr (mode == ExecMode::HbmPrefetchOpt) { |
|
|
|
tc_filter = 0; |
|
|
|
tc_copy = 0; |
|
|
|
tc_agg = 1; |
|
|
|
tc_copy = 1; |
|
|
|
tc_agg = 4; |
|
|
|
} |
|
|
|
else { |
|
|
|
tc_filter = 8; |
|
|
@ -133,9 +127,6 @@ int main(int argc, char** argv) { |
|
|
|
else if constexpr (mode == ExecMode::DramBaseline) { |
|
|
|
mode_string = "DramBaseline"; |
|
|
|
} |
|
|
|
else if constexpr (mode == ExecMode::HbmPrefetchOpt) { |
|
|
|
mode_string = "HbmDsaPrefetchOpt"; |
|
|
|
} |
|
|
|
else { |
|
|
|
mode_string = "Unknown"; |
|
|
|
} |
|
|
@ -175,7 +166,7 @@ int main(int argc, char** argv) { |
|
|
|
std::promise<void> p; |
|
|
|
std::shared_future<void> ready_future(p.get_future()); |
|
|
|
|
|
|
|
Query_Wrapper<base_t, mode == ExecMode::HbmPrefetch || mode == ExecMode::HbmPrefetchOpt> qw ( |
|
|
|
Query_Wrapper<base_t, mode == ExecMode::HbmPrefetch> qw ( |
|
|
|
&ready_future, workload_b, chunk_size, |
|
|
|
data_a, data_b, results, tc_filter, tc_copy, |
|
|
|
tc_agg,compare_value_a, compare_value_b |
|
|
@ -187,7 +178,6 @@ int main(int argc, char** argv) { |
|
|
|
auto filter_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw.scan_a(gid, gcnt, tid); }; |
|
|
|
auto copy_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw.scan_b(gid, gcnt, tid); }; |
|
|
|
auto aggregation_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw.aggr_j(gid, gcnt, tid); }; |
|
|
|
auto combined_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw.combined(gid, gcnt, tid); }; |
|
|
|
|
|
|
|
std::vector<std::thread> filter_pool; |
|
|
|
std::vector<std::thread> copy_pool; |
|
|
@ -195,24 +185,17 @@ int main(int argc, char** argv) { |
|
|
|
std::vector<std::thread> combined_pool; |
|
|
|
|
|
|
|
for(uint32_t gid = 0; gid < THREAD_GROUP_MULTIPLIER; ++gid) { |
|
|
|
if constexpr (mode == ExecMode::HbmPrefetchOpt) { |
|
|
|
for(uint32_t tid = 0; tid < tc_agg; ++tid) { |
|
|
|
agg_pool.emplace_back(combined_lambda, gid, THREAD_GROUP_MULTIPLIER, tid); |
|
|
|
} |
|
|
|
for(uint32_t tid = 0; tid < tc_filter; ++tid) { |
|
|
|
filter_pool.emplace_back(filter_lambda, gid, THREAD_GROUP_MULTIPLIER, tid); |
|
|
|
} |
|
|
|
else { |
|
|
|
for(uint32_t tid = 0; tid < tc_filter; ++tid) { |
|
|
|
filter_pool.emplace_back(filter_lambda, gid, THREAD_GROUP_MULTIPLIER, tid); |
|
|
|
} |
|
|
|
|
|
|
|
// if tc_copy == 0 this loop is skipped
|
|
|
|
for(uint32_t tid = 0; tid < tc_copy; ++tid) { |
|
|
|
copy_pool.emplace_back(copy_lambda, gid, THREAD_GROUP_MULTIPLIER, tid); |
|
|
|
} |
|
|
|
|
|
|
|
for(uint32_t tid = 0; tid < tc_agg; ++tid) { |
|
|
|
agg_pool.emplace_back(aggregation_lambda, gid, THREAD_GROUP_MULTIPLIER, tid); |
|
|
|
} |
|
|
|
|
|
|
|
// if tc_copy == 0 this loop is skipped
|
|
|
|
for(uint32_t tid = 0; tid < tc_copy; ++tid) { |
|
|
|
copy_pool.emplace_back(copy_lambda, gid, THREAD_GROUP_MULTIPLIER, tid); |
|
|
|
} |
|
|
|
|
|
|
|
for(uint32_t tid = 0; tid < tc_agg; ++tid) { |
|
|
|
agg_pool.emplace_back(aggregation_lambda, gid, THREAD_GROUP_MULTIPLIER, tid); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|