@@ -70,17 +70,18 @@ base_t sum_check_complex(base_t compare_value_a, base_t compare_value_b, base_t*
 enum class ExecMode {
     DramBaseline,
     HbmPeak,
-    HbmPrefetch
+    HbmPrefetch,
+    HbmPrefetchOpt
 };
 
 int main(int argc, char** argv) {
-    constexpr ExecMode mode = ExecMode::HbmPrefetch;
+    constexpr ExecMode mode = ExecMode::HbmPrefetchOpt;
 
     constexpr size_t workload_b = 4_GiB;
     constexpr base_t compare_value_a = 50;
     constexpr base_t compare_value_b = 42;
 
-    constexpr size_t chunk_size = 256_MiB;
+    constexpr size_t chunk_size = 64_MiB;
 
     // thread count is 12 here but as the default measurement uses 6
     // we must restrict the core assignment of these 12 threads to
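
The comment above is cut off at the hunk boundary; it concerns restricting the core assignment of the spawned worker threads. As a minimal, hypothetical sketch of such a core restriction (not part of the patch; it assumes Linux with a pthreads-backed std::thread, and pin_to_cores is an illustrative helper, not taken from the repository):

// Illustrative sketch only -- assumes Linux/glibc and std::thread on top of pthreads.
#include <pthread.h>
#include <sched.h>
#include <thread>
#include <vector>

// Hypothetical helper: pin an already-constructed thread to the given logical cores.
// A caller would invoke it right after emplacing each worker thread into its pool.
static void pin_to_cores(std::thread& t, const std::vector<int>& cores) {
    cpu_set_t set;
    CPU_ZERO(&set);
    for (int c : cores) CPU_SET(c, &set);
    // native_handle() yields the underlying pthread_t on pthread-based implementations.
    pthread_setaffinity_np(t.native_handle(), sizeof(cpu_set_t), &set);
}
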
@@ -99,6 +100,11 @@ int main(int argc, char** argv) {
         tc_copy = 8;
         tc_agg = 8;
     }
+    else if constexpr (mode == ExecMode::HbmPrefetchOpt) {
+        tc_filter = 0;
+        tc_copy = 0;
+        tc_agg = 1;
+    }
     else {
         tc_filter = 8;
         tc_copy = 0;
@@ -127,6 +133,9 @@ int main(int argc, char** argv) {
     else if constexpr (mode == ExecMode::DramBaseline) {
         mode_string = "DramBaseline";
     }
+    else if constexpr (mode == ExecMode::HbmPrefetchOpt) {
+        mode_string = "HbmDsaPrefetchOpt";
+    }
     else {
         mode_string = "Unknown";
     }
@@ -166,7 +175,7 @@ int main(int argc, char** argv) {
     std::promise<void> p;
     std::shared_future<void> ready_future(p.get_future());
 
-    Query_Wrapper<base_t, mode == ExecMode::HbmPrefetch> qw (
+    Query_Wrapper<base_t, mode == ExecMode::HbmPrefetch || mode == ExecMode::HbmPrefetchOpt> qw (
         &ready_future, workload_b, chunk_size,
         data_a, data_b, results, tc_filter, tc_copy,
         tc_agg, compare_value_a, compare_value_b
@@ -178,12 +187,20 @@ int main(int argc, char** argv) {
     auto filter_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw.scan_a(gid, gcnt, tid); };
     auto copy_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw.scan_b(gid, gcnt, tid); };
     auto aggregation_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw.aggr_j(gid, gcnt, tid); };
+    auto combined_lambda = [&qw](uint32_t gid, uint32_t gcnt, uint32_t tid) { qw.combined(gid, gcnt, tid); };
 
     std::vector<std::thread> filter_pool;
     std::vector<std::thread> copy_pool;
     std::vector<std::thread> agg_pool;
+    std::vector<std::thread> combined_pool;
 
     for(uint32_t gid = 0; gid < THREAD_GROUP_MULTIPLIER; ++gid) {
+        if constexpr (mode == ExecMode::HbmPrefetchOpt) {
+            for(uint32_t tid = 0; tid < tc_agg; ++tid) {
+                agg_pool.emplace_back(combined_lambda, gid, THREAD_GROUP_MULTIPLIER, tid);
+            }
+        }
+        else {
         for(uint32_t tid = 0; tid < tc_filter; ++tid) {
             filter_pool.emplace_back(filter_lambda, gid, THREAD_GROUP_MULTIPLIER, tid);
         }
@@ -197,6 +214,7 @@ int main(int argc, char** argv) {
             agg_pool.emplace_back(aggregation_lambda, gid, THREAD_GROUP_MULTIPLIER, tid);
         }
+        }
     }
 
     auto start = std::chrono::steady_clock::now();
     p.set_value();