publish best run config for dsa-prefetch

11 months ago · 5c896dbf04
6 changed files with 20 additions and 93 deletions
--- a/qdp_project/bench_max.sh
+++ b/qdp_project/bench_max.sh
@@ -3,7 +3,7 @@
 current_date_time=$(date)
 echo "Benchmark start at: $current_date_time"

-sudo numactl --cpunodebind=0 -- taskset -c 0,1,2,3,4,5 cmake-build-release/MAXBench
+sudo numactl --cpunodebind=0 cmake-build-release/MAXBench

 current_date_time=$(date)
 echo "Benchmark end at: $current_date_time"
--- a/qdp_project/results/max_q-simple_bm-global_bl-6t-121MiB-2MiB-tc_16_8_16.csv
+++ b/qdp_project/results/max_q-simple_bm-global_bl-6t-121MiB-2MiB-tc_16_8_16.csv
@@ -1,21 +0,0 @@
-run,chunk_size,mode,thread_group,time,scan_a,scan_b,aggr_j,wait_scan_a,wait_scan_b,wait_aggr_j,result
-0,1048576,DSA-HBM-Prefetch,1,5.731291,0.242809,3.994670,37.579085,91.376857,41.818492,54.057804,6644468581
-
-0,1048576,DSA-HBM-Prefetch,1,5.625771,0.226044,3.602548,35.034827,89.717724,41.368940,54.929067,6644468581
-
-0,1048576,DSA-HBM-Prefetch,1,5.680908,0.257387,3.754257,36.585247,90.564257,41.657704,54.241928,6644468581
-
-0,1048576,DSA-HBM-Prefetch,1,5.781943,0.241056,4.126496,37.855612,92.198931,42.094104,54.583788,6644468581
-
-0,1048576,DSA-HBM-Prefetch,1,5.549320,0.228519,3.124105,35.097378,88.492427,41.235822,53.630639,6644468581
-
-0,1048576,DSA-HBM-Prefetch,1,5.777009,0.251029,4.437124,37.964752,92.110037,41.743883,54.397299,6644468581
-
-0,1048576,DSA-HBM-Prefetch,1,5.670860,0.250156,3.967051,36.702871,90.416031,41.364926,53.975008,6644468581
-
-0,1048576,DSA-HBM-Prefetch,1,5.625941,0.238702,3.556221,36.190441,89.711710,41.418562,53.754304,6644468581
-
-0,1048576,DSA-HBM-Prefetch,1,5.612005,0.233228,3.329597,36.169258,89.492509,41.531469,53.571190,6644468581
-
-0,1048576,DSA-HBM-Prefetch,1,5.737394,0.273648,4.268621,37.213991,91.457626,41.595946,54.518085,6644468581
-
--- a/qdp_project/results/max_q-simple_bm-global_bl-6t-121MiB-2MiB-tc_4_2_4.csv
+++ b/qdp_project/results/max_q-simple_bm-global_bl-6t-121MiB-2MiB-tc_4_2_4.csv
@@ -1,21 +0,0 @@
-run,chunk_size,mode,thread_group,time,scan_a,scan_b,aggr_j,wait_scan_a,wait_scan_b,wait_aggr_j,result
-0,1048576,DSA-HBM-Prefetch,1,6.075639,0.252186,0.466490,12.072944,24.036687,11.678257,12.221073,6644468581
-
-0,1048576,DSA-HBM-Prefetch,1,5.137456,0.247107,0.463454,11.995021,20.289227,9.804897,8.542523,6644468581
-
-0,1048576,DSA-HBM-Prefetch,1,5.208535,0.245603,0.394126,11.709888,20.574239,10.016171,9.111716,6644468581
-
-0,1048576,DSA-HBM-Prefetch,1,5.239474,0.249562,0.512251,12.281586,20.694555,9.960118,8.667565,6644468581
-
-0,1048576,DSA-HBM-Prefetch,1,5.890495,0.236555,0.362265,12.314338,23.308626,11.410607,11.239033,6644468581
-
-0,1048576,DSA-HBM-Prefetch,1,5.537051,0.242188,0.416423,11.960895,21.891977,10.650780,10.177964,6644468581
-
-0,1048576,DSA-HBM-Prefetch,1,5.066196,0.246366,0.565382,12.489395,20.002075,9.558870,7.767503,6644468581
-
-0,1048576,DSA-HBM-Prefetch,1,5.742250,0.244299,0.355104,12.181505,22.711610,11.123181,10.778970,6644468581
-
-0,1048576,DSA-HBM-Prefetch,1,4.877906,0.263570,0.551561,12.230428,19.230705,9.195729,7.272716,6644468581
-
-0,1048576,DSA-HBM-Prefetch,1,5.089114,0.264795,0.455927,11.982116,20.077993,9.715731,8.363290,6644468581
-
--- a/qdp_project/results/max_q-simple_bm-global_bl-6t-121MiB-2MiB-tc_8_4_8.csv
+++ b/qdp_project/results/max_q-simple_bm-global_bl-6t-121MiB-2MiB-tc_8_4_8.csv
@@ -1,21 +0,0 @@
-run,chunk_size,mode,thread_group,time,scan_a,scan_b,aggr_j,wait_scan_a,wait_scan_b,wait_aggr_j,result
-0,1048576,DSA-HBM-Prefetch,1,5.915989,0.248599,1.540611,20.985578,47.025569,22.095341,26.314946,6644468581
-
-0,1048576,DSA-HBM-Prefetch,1,5.808085,0.255004,1.910872,21.808717,46.179794,21.307009,24.634189,6644468581
-
-0,1048576,DSA-HBM-Prefetch,1,5.689409,0.244086,1.530266,20.217566,45.238112,21.211273,25.274080,6644468581
-
-0,1048576,DSA-HBM-Prefetch,1,5.648455,0.259139,1.824648,20.801884,44.893138,20.750985,24.363573,6644468581
-
-0,1048576,DSA-HBM-Prefetch,1,5.845340,0.273403,1.774087,21.006538,46.448981,21.586245,25.736062,6644468581
-
-0,1048576,DSA-HBM-Prefetch,1,5.756498,0.249321,1.565101,20.486862,45.773878,21.445992,25.540477,6644468581
-
-0,1048576,DSA-HBM-Prefetch,1,6.012016,0.259178,1.556886,21.666145,47.807526,22.476177,26.403770,6644468581
-
-0,1048576,DSA-HBM-Prefetch,1,5.763325,0.257247,1.686996,20.699718,45.823583,21.351852,25.381857,6644468581
-
-0,1048576,DSA-HBM-Prefetch,1,5.900840,0.244329,1.704807,20.110657,46.917694,21.876056,27.065674,6644468581
-
-0,1048576,DSA-HBM-Prefetch,1,5.957580,0.232572,1.208086,20.515314,47.396213,22.607704,27.120903,6644468581
-
--- a/qdp_project/src/benchmark/MAX_benchmark.cpp
+++ b/qdp_project/src/benchmark/MAX_benchmark.cpp
@@ -74,15 +74,13 @@ enum class ExecMode {
 };

 int main(int argc, char** argv) {
-    constexpr ExecMode mode = ExecMode::DramBaseline;
+    constexpr ExecMode mode = ExecMode::HbmPrefetch;

-    constexpr size_t workload_b = 2_GiB;
+    constexpr size_t workload_b = 4_GiB;
    constexpr base_t compare_value_a = 50;
    constexpr base_t compare_value_b = 42;

-    constexpr size_t chunk_min = 1_MiB;
-    constexpr size_t chunk_max = 8_MiB + 1;
-    constexpr size_t chunk_incr = 128_kiB;
+    constexpr size_t chunk_size = 256_MiB;

    // thread count is 12 here but as the default measurement uses 6
    // we must restrict the core assignment of these 12 threads to
@@ -97,14 +95,14 @@ int main(int argc, char** argv) {
    uint8_t tc_agg;

    if constexpr (mode == ExecMode::HbmPrefetch) {
-        tc_filter = 4;
-        tc_copy   = 1;
-        tc_agg    = 1;
+        tc_filter = 8;
+        tc_copy   = 8;
+        tc_agg    = 8; 
    }
    else {
-        tc_filter = 4;
+        tc_filter = 8;
        tc_copy   = 0;
-        tc_agg    = 2;
+        tc_agg    = 4;
    }

    const uint8_t tc_combined = tc_filter + tc_copy + tc_agg;
@@ -133,19 +131,15 @@ int main(int argc, char** argv) {
        mode_string = "Unknown";
    }

-    const std::string ofname = "qdp-xeonmax-simpleq-" + mode_string + "-tca" + std::to_string(tc_filter) + "-tcb" + std::to_string(tc_copy) + "-tcj" + std::to_string(tc_agg) + "-tmul" + std::to_string(THREAD_GROUP_MULTIPLIER) ;
+    const std::string ofname = "results/qdp-xeonmax-simpleq-" + mode_string + "-tca" + std::to_string(tc_filter) + "-tcb" + std::to_string(tc_copy) + "-tcj" + std::to_string(tc_agg) + "-tmul" + std::to_string(THREAD_GROUP_MULTIPLIER) + "-wl" + std::to_string(workload_b) + "-cs" + std::to_string(chunk_size) + ".csv";

    out_file.open(ofname);

-    if (out_file.bad()) {
+    if (!out_file.is_open()) {
        std::cerr << "Failed to open Output File '" << ofname << "'" << std::endl;
    }

-    // set benchmark parameter
-    Linear_Int_Range<uint32_t, 0, 30, 1> run("run");
-    Linear_Int_Range<size_t, chunk_min, chunk_max, chunk_incr> chunk_size("chunk_size");
-
-    print_to_file(out_file, generateHead(run, chunk_size), "thread_group", "time",
+    print_to_file(out_file, "thread_group", "time",
 #ifdef THREAD_TIMINGS
                  "scan_a", "scan_b", "aggr_j",
 #endif
@@ -173,7 +167,7 @@ int main(int argc, char** argv) {
        std::shared_future<void> ready_future(p.get_future());

        Query_Wrapper<base_t, mode == ExecMode::HbmPrefetch> qw (
-                &ready_future, workload_b, chunk_size.current,
+                &ready_future, workload_b, chunk_size,
                data_a, data_b, results, tc_filter, tc_copy,
                tc_agg,compare_value_a, compare_value_b
        );
@@ -219,7 +213,7 @@ int main(int argc, char** argv) {
        double seconds   = (double)(nanos) / nanos_per_second;

        if (i >= 5) {
-            print_to_file(out_file, run, chunk_size, THREAD_GROUP_MULTIPLIER, seconds,
+            print_to_file(out_file, THREAD_GROUP_MULTIPLIER, seconds,
                #ifdef THREAD_TIMINGS
                          qw.trt->summarize_time(0),  qw.trt->summarize_time(1),  qw.trt->summarize_time(2),
                #endif
--- a/qdp_project/src/benchmark/pipelines/MAX_scan_filter_pipe.h
+++ b/qdp_project/src/benchmark/pipelines/MAX_scan_filter_pipe.h
@@ -253,7 +253,8 @@ public:
            base_t*  chunk_ptr = get_sub_chunk_ptr(data_b, chunk_id, chunk_size_w, tid, tcnt);

            if constexpr (caching) {
-                cache_.Access(reinterpret_cast<uint8_t *>(chunk_ptr), chunk_size_b / tcnt);
+                const auto data = cache_.Access(reinterpret_cast<uint8_t *>(chunk_ptr), chunk_size_b / tcnt);
+		data->WaitOnCompletion();
            }

            pvc->stop("scan_b", tid * gcnt + gid);
@@ -322,11 +323,6 @@ public:

                data = cache_.Access(reinterpret_cast<uint8_t *>(chunk_ptr), chunk_size_b / tcnt);

-                // wait on the caching task to complete, this will give time for other processes
-                // to make progress here which will therefore not hurt performance
-
-                data->WaitOnCompletion();
-
                // after the copy task has finished we obtain the pointer to the cached
                // copy of data_b which is then used from now on

@@ -336,8 +332,8 @@ public:
                // even after waiting, so this must be checked

                if (data_ptr == nullptr) {
-                    data_ptr = chunk_ptr;
-                    std::cerr << "[!] Cache Miss in AggrJ" << std::endl;
+			std::cerr << "[x] Cache Miss!" << std::endl;
+			exit(-1);
                }
            }
            else {