diff --git a/qdp_project/bench_max.sh b/qdp_project/bench_max.sh index 079c25a..f05aacd 100644 --- a/qdp_project/bench_max.sh +++ b/qdp_project/bench_max.sh @@ -3,7 +3,7 @@ current_date_time=$(date) echo "Benchmark start at: $current_date_time" -sudo numactl --cpunodebind=0 -- taskset -c 0,1,2,3,4,5 cmake-build-release/MAXBench +sudo numactl --cpunodebind=0 cmake-build-release/MAXBench current_date_time=$(date) echo "Benchmark end at: $current_date_time" diff --git a/qdp_project/results/max_q-simple_bm-global_bl-6t-121MiB-2MiB-tc_16_8_16.csv b/qdp_project/results/max_q-simple_bm-global_bl-6t-121MiB-2MiB-tc_16_8_16.csv deleted file mode 100644 index 6f0ec38..0000000 --- a/qdp_project/results/max_q-simple_bm-global_bl-6t-121MiB-2MiB-tc_16_8_16.csv +++ /dev/null @@ -1,21 +0,0 @@ -run,chunk_size,mode,thread_group,time,scan_a,scan_b,aggr_j,wait_scan_a,wait_scan_b,wait_aggr_j,result -0,1048576,DSA-HBM-Prefetch,1,5.731291,0.242809,3.994670,37.579085,91.376857,41.818492,54.057804,6644468581 - -0,1048576,DSA-HBM-Prefetch,1,5.625771,0.226044,3.602548,35.034827,89.717724,41.368940,54.929067,6644468581 - -0,1048576,DSA-HBM-Prefetch,1,5.680908,0.257387,3.754257,36.585247,90.564257,41.657704,54.241928,6644468581 - -0,1048576,DSA-HBM-Prefetch,1,5.781943,0.241056,4.126496,37.855612,92.198931,42.094104,54.583788,6644468581 - -0,1048576,DSA-HBM-Prefetch,1,5.549320,0.228519,3.124105,35.097378,88.492427,41.235822,53.630639,6644468581 - -0,1048576,DSA-HBM-Prefetch,1,5.777009,0.251029,4.437124,37.964752,92.110037,41.743883,54.397299,6644468581 - -0,1048576,DSA-HBM-Prefetch,1,5.670860,0.250156,3.967051,36.702871,90.416031,41.364926,53.975008,6644468581 - -0,1048576,DSA-HBM-Prefetch,1,5.625941,0.238702,3.556221,36.190441,89.711710,41.418562,53.754304,6644468581 - -0,1048576,DSA-HBM-Prefetch,1,5.612005,0.233228,3.329597,36.169258,89.492509,41.531469,53.571190,6644468581 - -0,1048576,DSA-HBM-Prefetch,1,5.737394,0.273648,4.268621,37.213991,91.457626,41.595946,54.518085,6644468581 - diff --git a/qdp_project/results/max_q-simple_bm-global_bl-6t-121MiB-2MiB-tc_4_2_4.csv b/qdp_project/results/max_q-simple_bm-global_bl-6t-121MiB-2MiB-tc_4_2_4.csv deleted file mode 100644 index 23cd73e..0000000 --- a/qdp_project/results/max_q-simple_bm-global_bl-6t-121MiB-2MiB-tc_4_2_4.csv +++ /dev/null @@ -1,21 +0,0 @@ -run,chunk_size,mode,thread_group,time,scan_a,scan_b,aggr_j,wait_scan_a,wait_scan_b,wait_aggr_j,result -0,1048576,DSA-HBM-Prefetch,1,6.075639,0.252186,0.466490,12.072944,24.036687,11.678257,12.221073,6644468581 - -0,1048576,DSA-HBM-Prefetch,1,5.137456,0.247107,0.463454,11.995021,20.289227,9.804897,8.542523,6644468581 - -0,1048576,DSA-HBM-Prefetch,1,5.208535,0.245603,0.394126,11.709888,20.574239,10.016171,9.111716,6644468581 - -0,1048576,DSA-HBM-Prefetch,1,5.239474,0.249562,0.512251,12.281586,20.694555,9.960118,8.667565,6644468581 - -0,1048576,DSA-HBM-Prefetch,1,5.890495,0.236555,0.362265,12.314338,23.308626,11.410607,11.239033,6644468581 - -0,1048576,DSA-HBM-Prefetch,1,5.537051,0.242188,0.416423,11.960895,21.891977,10.650780,10.177964,6644468581 - -0,1048576,DSA-HBM-Prefetch,1,5.066196,0.246366,0.565382,12.489395,20.002075,9.558870,7.767503,6644468581 - -0,1048576,DSA-HBM-Prefetch,1,5.742250,0.244299,0.355104,12.181505,22.711610,11.123181,10.778970,6644468581 - -0,1048576,DSA-HBM-Prefetch,1,4.877906,0.263570,0.551561,12.230428,19.230705,9.195729,7.272716,6644468581 - -0,1048576,DSA-HBM-Prefetch,1,5.089114,0.264795,0.455927,11.982116,20.077993,9.715731,8.363290,6644468581 - diff --git a/qdp_project/results/max_q-simple_bm-global_bl-6t-121MiB-2MiB-tc_8_4_8.csv b/qdp_project/results/max_q-simple_bm-global_bl-6t-121MiB-2MiB-tc_8_4_8.csv deleted file mode 100644 index bb38d48..0000000 --- a/qdp_project/results/max_q-simple_bm-global_bl-6t-121MiB-2MiB-tc_8_4_8.csv +++ /dev/null @@ -1,21 +0,0 @@ -run,chunk_size,mode,thread_group,time,scan_a,scan_b,aggr_j,wait_scan_a,wait_scan_b,wait_aggr_j,result -0,1048576,DSA-HBM-Prefetch,1,5.915989,0.248599,1.540611,20.985578,47.025569,22.095341,26.314946,6644468581 - -0,1048576,DSA-HBM-Prefetch,1,5.808085,0.255004,1.910872,21.808717,46.179794,21.307009,24.634189,6644468581 - -0,1048576,DSA-HBM-Prefetch,1,5.689409,0.244086,1.530266,20.217566,45.238112,21.211273,25.274080,6644468581 - -0,1048576,DSA-HBM-Prefetch,1,5.648455,0.259139,1.824648,20.801884,44.893138,20.750985,24.363573,6644468581 - -0,1048576,DSA-HBM-Prefetch,1,5.845340,0.273403,1.774087,21.006538,46.448981,21.586245,25.736062,6644468581 - -0,1048576,DSA-HBM-Prefetch,1,5.756498,0.249321,1.565101,20.486862,45.773878,21.445992,25.540477,6644468581 - -0,1048576,DSA-HBM-Prefetch,1,6.012016,0.259178,1.556886,21.666145,47.807526,22.476177,26.403770,6644468581 - -0,1048576,DSA-HBM-Prefetch,1,5.763325,0.257247,1.686996,20.699718,45.823583,21.351852,25.381857,6644468581 - -0,1048576,DSA-HBM-Prefetch,1,5.900840,0.244329,1.704807,20.110657,46.917694,21.876056,27.065674,6644468581 - -0,1048576,DSA-HBM-Prefetch,1,5.957580,0.232572,1.208086,20.515314,47.396213,22.607704,27.120903,6644468581 - diff --git a/qdp_project/src/benchmark/MAX_benchmark.cpp b/qdp_project/src/benchmark/MAX_benchmark.cpp index c015666..57e6c18 100644 --- a/qdp_project/src/benchmark/MAX_benchmark.cpp +++ b/qdp_project/src/benchmark/MAX_benchmark.cpp @@ -74,15 +74,13 @@ enum class ExecMode { }; int main(int argc, char** argv) { - constexpr ExecMode mode = ExecMode::DramBaseline; + constexpr ExecMode mode = ExecMode::HbmPrefetch; - constexpr size_t workload_b = 2_GiB; + constexpr size_t workload_b = 4_GiB; constexpr base_t compare_value_a = 50; constexpr base_t compare_value_b = 42; - constexpr size_t chunk_min = 1_MiB; - constexpr size_t chunk_max = 8_MiB + 1; - constexpr size_t chunk_incr = 128_kiB; + constexpr size_t chunk_size = 256_MiB; // thread count is 12 here but as the default measurement uses 6 // we must restrict the core assignment of these 12 threads to @@ -97,14 +95,14 @@ int main(int argc, char** argv) { uint8_t tc_agg; if constexpr (mode == ExecMode::HbmPrefetch) { - tc_filter = 4; - tc_copy = 1; - tc_agg = 1; + tc_filter = 8; + tc_copy = 8; + tc_agg = 8; } else { - tc_filter = 4; + tc_filter = 8; tc_copy = 0; - tc_agg = 2; + tc_agg = 4; } const uint8_t tc_combined = tc_filter + tc_copy + tc_agg; @@ -133,19 +131,15 @@ int main(int argc, char** argv) { mode_string = "Unknown"; } - const std::string ofname = "qdp-xeonmax-simpleq-" + mode_string + "-tca" + std::to_string(tc_filter) + "-tcb" + std::to_string(tc_copy) + "-tcj" + std::to_string(tc_agg) + "-tmul" + std::to_string(THREAD_GROUP_MULTIPLIER) ; + const std::string ofname = "results/qdp-xeonmax-simpleq-" + mode_string + "-tca" + std::to_string(tc_filter) + "-tcb" + std::to_string(tc_copy) + "-tcj" + std::to_string(tc_agg) + "-tmul" + std::to_string(THREAD_GROUP_MULTIPLIER) + "-wl" + std::to_string(workload_b) + "-cs" + std::to_string(chunk_size) + ".csv"; out_file.open(ofname); - if (out_file.bad()) { + if (!out_file.is_open()) { std::cerr << "Failed to open Output File '" << ofname << "'" << std::endl; } - // set benchmark parameter - Linear_Int_Range run("run"); - Linear_Int_Range chunk_size("chunk_size"); - - print_to_file(out_file, generateHead(run, chunk_size), "thread_group", "time", + print_to_file(out_file, "thread_group", "time", #ifdef THREAD_TIMINGS "scan_a", "scan_b", "aggr_j", #endif @@ -173,7 +167,7 @@ int main(int argc, char** argv) { std::shared_future ready_future(p.get_future()); Query_Wrapper qw ( - &ready_future, workload_b, chunk_size.current, + &ready_future, workload_b, chunk_size, data_a, data_b, results, tc_filter, tc_copy, tc_agg,compare_value_a, compare_value_b ); @@ -219,7 +213,7 @@ int main(int argc, char** argv) { double seconds = (double)(nanos) / nanos_per_second; if (i >= 5) { - print_to_file(out_file, run, chunk_size, THREAD_GROUP_MULTIPLIER, seconds, + print_to_file(out_file, THREAD_GROUP_MULTIPLIER, seconds, #ifdef THREAD_TIMINGS qw.trt->summarize_time(0), qw.trt->summarize_time(1), qw.trt->summarize_time(2), #endif @@ -234,4 +228,4 @@ int main(int argc, char** argv) { numa_free(data_a, workload_b); numa_free(data_b, workload_b); numa_free(results, THREAD_GROUP_MULTIPLIER * tc_combined * sizeof(base_t)); -} \ No newline at end of file +} diff --git a/qdp_project/src/benchmark/pipelines/MAX_scan_filter_pipe.h b/qdp_project/src/benchmark/pipelines/MAX_scan_filter_pipe.h index 88ebef0..152df14 100755 --- a/qdp_project/src/benchmark/pipelines/MAX_scan_filter_pipe.h +++ b/qdp_project/src/benchmark/pipelines/MAX_scan_filter_pipe.h @@ -253,7 +253,8 @@ public: base_t* chunk_ptr = get_sub_chunk_ptr(data_b, chunk_id, chunk_size_w, tid, tcnt); if constexpr (caching) { - cache_.Access(reinterpret_cast(chunk_ptr), chunk_size_b / tcnt); + const auto data = cache_.Access(reinterpret_cast(chunk_ptr), chunk_size_b / tcnt); + data->WaitOnCompletion(); } pvc->stop("scan_b", tid * gcnt + gid); @@ -322,11 +323,6 @@ public: data = cache_.Access(reinterpret_cast(chunk_ptr), chunk_size_b / tcnt); - // wait on the caching task to complete, this will give time for other processes - // to make progress here which will therefore not hurt performance - - data->WaitOnCompletion(); - // after the copy task has finished we obtain the pointer to the cached // copy of data_b which is then used from now on @@ -336,8 +332,8 @@ public: // even after waiting, so this must be checked if (data_ptr == nullptr) { - data_ptr = chunk_ptr; - std::cerr << "[!] Cache Miss in AggrJ" << std::endl; + std::cerr << "[x] Cache Miss!" << std::endl; + exit(-1); } } else { @@ -359,4 +355,4 @@ public: aggregation::happly(dest + (tid * gcnt + gid), aggregator); } -}; \ No newline at end of file +};