
publish best run config for dsa-prefetch

commit 5c896dbf04 (branch: master)
Constantin Fürst, 11 months ago
Changed files:

  1. qdp_project/bench_max.sh (2 changes)
  2. qdp_project/results/max_q-simple_bm-global_bl-6t-121MiB-2MiB-tc_16_8_16.csv (21 deletions)
  3. qdp_project/results/max_q-simple_bm-global_bl-6t-121MiB-2MiB-tc_4_2_4.csv (21 deletions)
  4. qdp_project/results/max_q-simple_bm-global_bl-6t-121MiB-2MiB-tc_8_4_8.csv (21 deletions)
  5. qdp_project/src/benchmark/MAX_benchmark.cpp (32 changes)
  6. qdp_project/src/benchmark/pipelines/MAX_scan_filter_pipe.h (12 changes)

qdp_project/bench_max.sh

@@ -3,7 +3,7 @@
 current_date_time=$(date)
 echo "Benchmark start at: $current_date_time"
-sudo numactl --cpunodebind=0 -- taskset -c 0,1,2,3,4,5 cmake-build-release/MAXBench
+sudo numactl --cpunodebind=0 cmake-build-release/MAXBench
 current_date_time=$(date)
 echo "Benchmark end at: $current_date_time"

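Review note: dropping taskset leaves MAXBench confined to NUMA node 0 by numactl alone, without an explicit pin to cores 0-5, presumably to give the larger thread counts introduced in MAX_benchmark.cpp below room beyond six cores. If explicit pinning is ever wanted again, it can also be done in-process rather than in the launcher; a minimal sketch, assuming Linux/glibc (pthread_setaffinity_np is a GNU extension) and a hypothetical pin_to_core helper:

    #include <pthread.h>
    #include <sched.h>
    #include <cstdio>

    // pin the calling thread to a single core; returns true on success
    static bool pin_to_core(int core) {
        cpu_set_t set;
        CPU_ZERO(&set);
        CPU_SET(core, &set);
        return pthread_setaffinity_np(pthread_self(), sizeof(set), &set) == 0;
    }

    int main() {
        if (!pin_to_core(0)) {
            std::fprintf(stderr, "failed to pin thread to core 0\n");
            return 1;
        }
        // ... launch worker threads and pin each one analogously ...
        return 0;
    }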
qdp_project/results/max_q-simple_bm-global_bl-6t-121MiB-2MiB-tc_16_8_16.csv (deleted)

@@ -1,21 +0,0 @@
-run,chunk_size,mode,thread_group,time,scan_a,scan_b,aggr_j,wait_scan_a,wait_scan_b,wait_aggr_j,result
-0,1048576,DSA-HBM-Prefetch,1,5.731291,0.242809,3.994670,37.579085,91.376857,41.818492,54.057804,6644468581
-0,1048576,DSA-HBM-Prefetch,1,5.625771,0.226044,3.602548,35.034827,89.717724,41.368940,54.929067,6644468581
-0,1048576,DSA-HBM-Prefetch,1,5.680908,0.257387,3.754257,36.585247,90.564257,41.657704,54.241928,6644468581
-0,1048576,DSA-HBM-Prefetch,1,5.781943,0.241056,4.126496,37.855612,92.198931,42.094104,54.583788,6644468581
-0,1048576,DSA-HBM-Prefetch,1,5.549320,0.228519,3.124105,35.097378,88.492427,41.235822,53.630639,6644468581
-0,1048576,DSA-HBM-Prefetch,1,5.777009,0.251029,4.437124,37.964752,92.110037,41.743883,54.397299,6644468581
-0,1048576,DSA-HBM-Prefetch,1,5.670860,0.250156,3.967051,36.702871,90.416031,41.364926,53.975008,6644468581
-0,1048576,DSA-HBM-Prefetch,1,5.625941,0.238702,3.556221,36.190441,89.711710,41.418562,53.754304,6644468581
-0,1048576,DSA-HBM-Prefetch,1,5.612005,0.233228,3.329597,36.169258,89.492509,41.531469,53.571190,6644468581
-0,1048576,DSA-HBM-Prefetch,1,5.737394,0.273648,4.268621,37.213991,91.457626,41.595946,54.518085,6644468581

qdp_project/results/max_q-simple_bm-global_bl-6t-121MiB-2MiB-tc_4_2_4.csv (deleted)

@@ -1,21 +0,0 @@
-run,chunk_size,mode,thread_group,time,scan_a,scan_b,aggr_j,wait_scan_a,wait_scan_b,wait_aggr_j,result
-0,1048576,DSA-HBM-Prefetch,1,6.075639,0.252186,0.466490,12.072944,24.036687,11.678257,12.221073,6644468581
-0,1048576,DSA-HBM-Prefetch,1,5.137456,0.247107,0.463454,11.995021,20.289227,9.804897,8.542523,6644468581
-0,1048576,DSA-HBM-Prefetch,1,5.208535,0.245603,0.394126,11.709888,20.574239,10.016171,9.111716,6644468581
-0,1048576,DSA-HBM-Prefetch,1,5.239474,0.249562,0.512251,12.281586,20.694555,9.960118,8.667565,6644468581
-0,1048576,DSA-HBM-Prefetch,1,5.890495,0.236555,0.362265,12.314338,23.308626,11.410607,11.239033,6644468581
-0,1048576,DSA-HBM-Prefetch,1,5.537051,0.242188,0.416423,11.960895,21.891977,10.650780,10.177964,6644468581
-0,1048576,DSA-HBM-Prefetch,1,5.066196,0.246366,0.565382,12.489395,20.002075,9.558870,7.767503,6644468581
-0,1048576,DSA-HBM-Prefetch,1,5.742250,0.244299,0.355104,12.181505,22.711610,11.123181,10.778970,6644468581
-0,1048576,DSA-HBM-Prefetch,1,4.877906,0.263570,0.551561,12.230428,19.230705,9.195729,7.272716,6644468581
-0,1048576,DSA-HBM-Prefetch,1,5.089114,0.264795,0.455927,11.982116,20.077993,9.715731,8.363290,6644468581

qdp_project/results/max_q-simple_bm-global_bl-6t-121MiB-2MiB-tc_8_4_8.csv (deleted)

@@ -1,21 +0,0 @@
-run,chunk_size,mode,thread_group,time,scan_a,scan_b,aggr_j,wait_scan_a,wait_scan_b,wait_aggr_j,result
-0,1048576,DSA-HBM-Prefetch,1,5.915989,0.248599,1.540611,20.985578,47.025569,22.095341,26.314946,6644468581
-0,1048576,DSA-HBM-Prefetch,1,5.808085,0.255004,1.910872,21.808717,46.179794,21.307009,24.634189,6644468581
-0,1048576,DSA-HBM-Prefetch,1,5.689409,0.244086,1.530266,20.217566,45.238112,21.211273,25.274080,6644468581
-0,1048576,DSA-HBM-Prefetch,1,5.648455,0.259139,1.824648,20.801884,44.893138,20.750985,24.363573,6644468581
-0,1048576,DSA-HBM-Prefetch,1,5.845340,0.273403,1.774087,21.006538,46.448981,21.586245,25.736062,6644468581
-0,1048576,DSA-HBM-Prefetch,1,5.756498,0.249321,1.565101,20.486862,45.773878,21.445992,25.540477,6644468581
-0,1048576,DSA-HBM-Prefetch,1,6.012016,0.259178,1.556886,21.666145,47.807526,22.476177,26.403770,6644468581
-0,1048576,DSA-HBM-Prefetch,1,5.763325,0.257247,1.686996,20.699718,45.823583,21.351852,25.381857,6644468581
-0,1048576,DSA-HBM-Prefetch,1,5.900840,0.244329,1.704807,20.110657,46.917694,21.876056,27.065674,6644468581
-0,1048576,DSA-HBM-Prefetch,1,5.957580,0.232572,1.208086,20.515314,47.396213,22.607704,27.120903,6644468581

qdp_project/src/benchmark/MAX_benchmark.cpp

@@ -74,15 +74,13 @@ enum class ExecMode {
 };

 int main(int argc, char** argv) {
-    constexpr ExecMode mode = ExecMode::DramBaseline;
+    constexpr ExecMode mode = ExecMode::HbmPrefetch;
-    constexpr size_t workload_b = 2_GiB;
+    constexpr size_t workload_b = 4_GiB;
     constexpr base_t compare_value_a = 50;
     constexpr base_t compare_value_b = 42;
-    constexpr size_t chunk_min = 1_MiB;
-    constexpr size_t chunk_max = 8_MiB + 1;
-    constexpr size_t chunk_incr = 128_kiB;
+    constexpr size_t chunk_size = 256_MiB;

     // thread count is 12 here but as the default measurement uses 6
     // we must restrict the core assignment of these 12 threads to

@@ -97,14 +95,14 @@ int main(int argc, char** argv) {
     uint8_t tc_agg;

     if constexpr (mode == ExecMode::HbmPrefetch) {
-        tc_filter = 4;
-        tc_copy = 1;
-        tc_agg = 1;
+        tc_filter = 8;
+        tc_copy = 8;
+        tc_agg = 8;
     }
     else {
-        tc_filter = 4;
+        tc_filter = 8;
         tc_copy = 0;
-        tc_agg = 2;
+        tc_agg = 4;
     }

     const uint8_t tc_combined = tc_filter + tc_copy + tc_agg;
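Review note: the new values give tc_combined = 8 + 8 + 8 = 24 in prefetch mode and 8 + 0 + 4 = 12 in the baseline branch, up from 4 + 1 + 1 = 6 and 4 + 0 + 2 = 6 before. The earlier "thread count is 12" comment (presumably 6 threads times a THREAD_GROUP_MULTIPLIER of 2) no longer matches the prefetch path and may be worth updating together with this change.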
@@ -133,19 +131,15 @@ int main(int argc, char** argv) {
         mode_string = "Unknown";
     }

-    const std::string ofname = "qdp-xeonmax-simpleq-" + mode_string + "-tca" + std::to_string(tc_filter) + "-tcb" + std::to_string(tc_copy) + "-tcj" + std::to_string(tc_agg) + "-tmul" + std::to_string(THREAD_GROUP_MULTIPLIER) ;
+    const std::string ofname = "results/qdp-xeonmax-simpleq-" + mode_string + "-tca" + std::to_string(tc_filter) + "-tcb" + std::to_string(tc_copy) + "-tcj" + std::to_string(tc_agg) + "-tmul" + std::to_string(THREAD_GROUP_MULTIPLIER) + "-wl" + std::to_string(workload_b) + "-cs" + std::to_string(chunk_size) + ".csv";
     out_file.open(ofname);

-    if (out_file.bad()) {
+    if (!out_file.is_open()) {
         std::cerr << "Failed to open Output File '" << ofname << "'" << std::endl;
     }

-    // set benchmark parameter
-    Linear_Int_Range<uint32_t, 0, 30, 1> run("run");
-    Linear_Int_Range<size_t, chunk_min, chunk_max, chunk_incr> chunk_size("chunk_size");
-    print_to_file(out_file, generateHead(run, chunk_size), "thread_group", "time",
+    print_to_file(out_file, "thread_group", "time",
 #ifdef THREAD_TIMINGS
         "scan_a", "scan_b", "aggr_j",
 #endif
@@ -173,7 +167,7 @@
     std::shared_future<void> ready_future(p.get_future());

     Query_Wrapper<base_t, mode == ExecMode::HbmPrefetch> qw (
-        &ready_future, workload_b, chunk_size.current,
+        &ready_future, workload_b, chunk_size,
         data_a, data_b, results, tc_filter, tc_copy,
         tc_agg,compare_value_a, compare_value_b
     );

@@ -219,7 +213,7 @@
     double seconds = (double)(nanos) / nanos_per_second;

     if (i >= 5) {
-        print_to_file(out_file, run, chunk_size, THREAD_GROUP_MULTIPLIER, seconds,
+        print_to_file(out_file, THREAD_GROUP_MULTIPLIER, seconds,
 #ifdef THREAD_TIMINGS
         qw.trt->summarize_time(0), qw.trt->summarize_time(1), qw.trt->summarize_time(2),
 #endif

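Review note: with run and chunk_size no longer Linear_Int_Range instances, generateHead() and the per-run print_to_file() calls lose their first two columns, which is why the deleted result files above (whose headers still begin with run,chunk_size) no longer match the new output format; the dropped parameters are encoded in the file name instead ("-wl", "-cs"). For orientation, a variadic CSV writer in the spirit of print_to_file() could look like the sketch below; this is an illustrative assumption, not the project's actual helper:

    #include <fstream>
    #include <utility>

    // write all arguments comma-separated, terminated by a newline
    template <typename T>
    void print_to_file(std::ofstream& out, T&& last) {
        out << std::forward<T>(last) << '\n';
    }

    template <typename T, typename... Ts>
    void print_to_file(std::ofstream& out, T&& first, Ts&&... rest) {
        out << std::forward<T>(first) << ',';
        print_to_file(out, std::forward<Ts>(rest)...);
    }

With such a helper, print_to_file(out_file, "thread_group", "time") emits the line "thread_group,time".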
qdp_project/src/benchmark/pipelines/MAX_scan_filter_pipe.h

@@ -253,7 +253,8 @@ public:
     base_t* chunk_ptr = get_sub_chunk_ptr(data_b, chunk_id, chunk_size_w, tid, tcnt);

     if constexpr (caching) {
-        cache_.Access(reinterpret_cast<uint8_t *>(chunk_ptr), chunk_size_b / tcnt);
+        const auto data = cache_.Access(reinterpret_cast<uint8_t *>(chunk_ptr), chunk_size_b / tcnt);
+        data->WaitOnCompletion();
     }

     pvc->stop("scan_b", tid * gcnt + gid);
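Review note: this hunk moves the wait on the DSA copy into scan_b itself; previously the handle returned by cache_.Access() was discarded here and the wait happened later in the aggregation step (removed below). The copy now overlaps only with the rest of scan_b, but aggr_j can rely on the cached chunk being present. The shape of the pattern, sketched with std::async standing in for the DSA-backed cache_.Access():

    #include <cstdint>
    #include <cstring>
    #include <future>
    #include <vector>

    // hedged stand-in for cache_.Access(): issues an asynchronous copy
    // and returns a waitable handle
    std::future<void> AsyncCopy(uint8_t* dst, const uint8_t* src, std::size_t n) {
        return std::async(std::launch::async, [=] { std::memcpy(dst, src, n); });
    }

    int main() {
        std::vector<uint8_t> src(1 << 20, 42), dst(1 << 20);
        auto handle = AsyncCopy(dst.data(), src.data(), src.size()); // issue copy
        // ... remaining scan work can proceed here while the copy runs ...
        handle.wait(); // wait in the producer (scan_b), not the consumer (aggr_j)
        return 0;
    }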
@@ -322,11 +323,6 @@
     data = cache_.Access(reinterpret_cast<uint8_t *>(chunk_ptr), chunk_size_b / tcnt);

-    // wait on the caching task to complete, this will give time for other processes
-    // to make progress here which will therefore not hurt performance
-    data->WaitOnCompletion();

     // after the copy task has finished we obtain the pointer to the cached
     // copy of data_b which is then used from now on

@@ -336,8 +332,8 @@
     // even after waiting, so this must be checked

     if (data_ptr == nullptr) {
-        data_ptr = chunk_ptr;
-        std::cerr << "[!] Cache Miss in AggrJ" << std::endl;
+        std::cerr << "[x] Cache Miss!" << std::endl;
+        exit(-1);
     }
 }
 else {
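Review note: because the wait now happens in scan_b, a null data_ptr in aggr_j no longer means "copy still in flight" but a genuine logic error, so replacing the silent fallback to the DRAM-resident chunk_ptr with an error message and exit(-1) looks intentional: a broken prefetch path now fails fast instead of being measured as if it had succeeded.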
