|
|
@ -21,10 +21,6 @@ |
|
|
|
#define THREAD_GROUP_MULTIPLIER 2
|
|
|
|
#endif
|
|
|
|
|
|
|
|
// Query variant selector. main() derives `simple_query` from QUERY == 1;
// any other value selects the complex-query code path.
#ifndef QUERY
#define QUERY 1
#endif

// Barrier mode label. In the visible code it is only used as a string when
// composing result/checksum file names.
#ifndef BARRIER_MODE
#define BARRIER_MODE "global"
#endif
|
|
|
@ -33,18 +29,6 @@ |
|
|
|
#define BUFFER_LIMIT 1
|
|
|
|
#endif
|
|
|
|
|
|
|
|
// Defaults to 1 unless overridden on the compiler command line.
// NOTE(review): presumably toggles thread pinning; its consumers are not
// visible in this excerpt - confirm.
#ifndef PINNING
#define PINNING 1
#endif

// Switch for the optional PCM header below; defaults to 0 (header not included).
#ifndef PCM_M
#define PCM_M 0
#endif

// pcm.h is only pulled in when PCM_M is set to 1.
// NOTE(review): main() guards its PCM code with `#if PCM == 1`, not PCM_M -
// confirm which macro name is intended so include and usage stay in sync.
#if PCM_M == 1
#include "pcm.h"
#endif
|
|
|
|
|
|
|
|
#include "const.h"
|
|
|
|
|
|
|
|
#include "file_output.h"
|
|
|
@ -55,6 +39,7 @@ |
|
|
|
#include "cpu_set_utils.h"
|
|
|
|
#include "iterable_range.h"
|
|
|
|
#include "memory_literals.h"
|
|
|
|
|
|
|
|
#include "pipelines/MAX_scan_filter_pipe.h"
|
|
|
|
|
|
|
|
#include "aggregation.h"
|
|
|
@ -62,6 +47,10 @@ |
|
|
|
|
|
|
|
/// Element type of the benchmark's data and result buffers.
using base_t = uint64_t;
|
|
|
|
|
|
|
/// Chooses the NUMA node on which cached data should be placed.
///
/// Only the destination node influences the decision; the other two
/// parameters are accepted but ignored by this policy.
///
/// @param numa_dst_node  node the data is destined for.
/// @param numa_src_node  unused.
/// @param data_size      unused.
/// @return numa_dst_node + 8 when numa_dst_node is below 8, otherwise
///         numa_dst_node unchanged.
static int CachePlacementPolicy(const int numa_dst_node, [[maybe_unused]] const int numa_src_node, [[maybe_unused]] const size_t data_size) {
    // NOTE(review): presumably nodes 0-7 are DRAM nodes and 8-15 their paired
    // HBM nodes on the target machine - confirm against the system topology.
    constexpr int node_offset = 8;

    if (numa_dst_node < node_offset) {
        return numa_dst_node + node_offset;
    }
    return numa_dst_node;
}
|
|
|
|
|
|
|
base_t sum_check(base_t compare_value, base_t* row_A, base_t* row_B, size_t row_size) { |
|
|
|
base_t sum = 0; |
|
|
|
for(int i = 0; i < row_size / sizeof(base_t); ++i) { |
|
|
@ -78,26 +67,18 @@ base_t sum_check_complex(base_t compare_value_a, base_t compare_value_b, base_t* |
|
|
|
return sum; |
|
|
|
} |
|
|
|
|
|
|
|
/// Execution variants of the benchmark; main() selects one at compile time
/// and branches on it with `if constexpr`.
enum class ExecMode {
    /// Labelled "DramBaseline" in the output file name; buffers are allocated
    /// on the current NUMA node.
    DramBaseline,
    /// Labelled "HbmAllocPeak"; buffers are allocated on the node returned by
    /// the cache placement policy.
    HbmPeak,
    /// Labelled "HbmDsaPrefetch"; the only mode that assigns a copy thread
    /// (tc_copy = 1) in the visible thread-count setup.
    HbmPrefetch
};
|
|
|
|
|
|
|
int main(int argc, char** argv) { |
|
|
|
#if PCM == 1
|
|
|
|
pcm::PCM *pcm = pcm::PCM::getInstance(); |
|
|
|
//and check for errors
|
|
|
|
auto error_code = pcm->program(); |
|
|
|
if(error_code != pcm::PCM::Success) { |
|
|
|
std::cerr << "PCM couldn't start" << std::endl; |
|
|
|
std::cerr << "Error code: " << error_code << std::endl; |
|
|
|
std::cerr << "Try to execute 'sudo modprobe msr' and execute this program with root privigeges."; |
|
|
|
return 1; |
|
|
|
} |
|
|
|
#endif
|
|
|
|
constexpr ExecMode mode = ExecMode::DramBaseline; |
|
|
|
|
|
|
|
// set constants
|
|
|
|
constexpr size_t workload_b = 2_GiB; |
|
|
|
constexpr base_t compare_value_a = 50; |
|
|
|
constexpr base_t compare_value_b = 42; |
|
|
|
constexpr bool simple_query = (QUERY == 1); |
|
|
|
constexpr bool cache_a = false; |
|
|
|
constexpr bool wait_b = false; |
|
|
|
|
|
|
|
constexpr size_t chunk_min = 1_MiB; |
|
|
|
constexpr size_t chunk_max = 8_MiB + 1; |
|
|
@ -107,15 +88,52 @@ int main(int argc, char** argv) { |
|
|
|
// we must restrict the core assignment of these 12 threads to
|
|
|
|
// 6 physical cpu cores on the executing node
|
|
|
|
|
|
|
|
constexpr size_t thread_count = 12; |
|
|
|
|
|
|
|
|
|
|
|
/*** alloc data and buffers ************************************************/ |
|
|
|
|
|
|
|
uint8_t tc_filter; |
|
|
|
uint8_t tc_copy; |
|
|
|
uint8_t tc_agg; |
|
|
|
|
|
|
|
if constexpr (mode == ExecMode::HbmPrefetch) { |
|
|
|
tc_filter = 4; |
|
|
|
tc_copy = 1; |
|
|
|
tc_agg = 1; |
|
|
|
} |
|
|
|
else { |
|
|
|
tc_filter = 4; |
|
|
|
tc_copy = 0; |
|
|
|
tc_agg = 2; |
|
|
|
} |
|
|
|
|
|
|
|
const uint8_t tc_combined = tc_filter + tc_copy + tc_agg; |
|
|
|
|
|
|
|
base_t* data_a; |
|
|
|
base_t* data_b; |
|
|
|
base_t* results; |
|
|
|
|
|
|
|
const int current_cpu = sched_getcpu(); |
|
|
|
const int current_node = numa_node_of_cpu(current_cpu); |
|
|
|
const int cache_node = CachePlacementPolicy(current_node, current_node, 0); |
|
|
|
|
|
|
|
std::ofstream out_file; |
|
|
|
|
|
|
|
const std::string ofname = std::string("results/max_") + |
|
|
|
"q-" + (std::string)(simple_query == true ? "simple" : "complex") + |
|
|
|
"_bm-" + (std::string) BARRIER_MODE + |
|
|
|
"_bl-" + (std::string)(BUFFER_LIMIT == 1 ? "limited" : "unlimited") + |
|
|
|
"_tc-" + std::to_string(thread_count) + "1MiB-2MiB.csv"; |
|
|
|
std::string mode_string; |
|
|
|
if constexpr (mode == ExecMode::HbmPrefetch) { |
|
|
|
mode_string = "HbmDsaPrefetch"; |
|
|
|
} |
|
|
|
else if constexpr (mode == ExecMode::HbmPeak) { |
|
|
|
mode_string = "HbmAllocPeak"; |
|
|
|
} |
|
|
|
else if constexpr (mode == ExecMode::DramBaseline) { |
|
|
|
mode_string = "DramBaseline"; |
|
|
|
} |
|
|
|
else { |
|
|
|
mode_string = "Unknown"; |
|
|
|
} |
|
|
|
|
|
|
|
const std::string ofname = "qdp-xeonmax-simpleq-" + mode_string + "-tca" + std::to_string(tc_filter) + "-tcb" + std::to_string(tc_copy) + "-tcj" + std::to_string(tc_agg) + "-tmul" + std::to_string(THREAD_GROUP_MULTIPLIER) ; |
|
|
|
|
|
|
|
out_file.open(ofname); |
|
|
|
|
|
|
@ -127,61 +145,37 @@ int main(int argc, char** argv) { |
|
|
|
Linear_Int_Range<uint32_t, 0, 30, 1> run("run"); |
|
|
|
Linear_Int_Range<size_t, chunk_min, chunk_max, chunk_incr> chunk_size("chunk_size"); |
|
|
|
|
|
|
|
print_to_file(out_file, generateHead(run, chunk_size, "mode"), "thread_group", "time", |
|
|
|
print_to_file(out_file, generateHead(run, chunk_size), "thread_group", "time", |
|
|
|
#ifdef THREAD_TIMINGS
|
|
|
|
"scan_a", "scan_b", "aggr_j", |
|
|
|
#endif
|
|
|
|
#ifdef BARRIER_TIMINGS
|
|
|
|
"wait_scan_a", "wait_scan_b", "wait_aggr_j", |
|
|
|
#endif
|
|
|
|
#if PCM == 1
|
|
|
|
pcm_value_collector::getHead("scan_a"), |
|
|
|
pcm_value_collector::getHead("scan_b"), |
|
|
|
pcm_value_collector::getHead("aggr_j"), |
|
|
|
#endif
|
|
|
|
"result"); |
|
|
|
|
|
|
|
/*** alloc data and buffers ************************************************/ |
|
|
|
|
|
|
|
base_t* data_a = (base_t*) numa_alloc_local(workload_b); |
|
|
|
base_t* data_b = (base_t*) numa_alloc_local(workload_b); |
|
|
|
base_t* results = (base_t*) numa_alloc_local(thread_count * sizeof(base_t)); |
|
|
|
if constexpr (mode == ExecMode::HbmPeak) { |
|
|
|
data_a = (base_t*) numa_alloc_onnode(workload_b, cache_node); |
|
|
|
data_b = (base_t*) numa_alloc_onnode(workload_b, cache_node); |
|
|
|
results = (base_t*) numa_alloc_onnode(tc_combined * sizeof(base_t), cache_node); |
|
|
|
} |
|
|
|
else { |
|
|
|
data_a = (base_t*) numa_alloc_onnode(workload_b, current_node); |
|
|
|
data_b = (base_t*) numa_alloc_onnode(workload_b, current_node); |
|
|
|
results = (base_t*) numa_alloc_onnode(tc_combined * sizeof(base_t), current_node); |
|
|
|
} |
|
|
|
|
|
|
|
fill_mt<base_t>(data_a, workload_b, 0, 100, 42); |
|
|
|
fill_mt<base_t>(data_b, workload_b, 0, 100, 420); |
|
|
|
|
|
|
|
const std::string cfname = std::string("results/max_") + |
|
|
|
"q-" + (std::string)(simple_query == true ? "simple" : "complex") + |
|
|
|
"_bm-" + (std::string) BARRIER_MODE + |
|
|
|
"_bl-" + (std::string)(BUFFER_LIMIT == 1 ? "limited" : "unlimited") + |
|
|
|
"_tc-" + std::to_string(thread_count) + ".checksum"; |
|
|
|
|
|
|
|
std::ofstream check_file; |
|
|
|
check_file.open(cfname); |
|
|
|
|
|
|
|
if (check_file.bad()) { |
|
|
|
std::cerr << "Failed to open Checksum File '" << cfname << "'" << std::endl; |
|
|
|
} |
|
|
|
|
|
|
|
if constexpr (QUERY == 1) { |
|
|
|
//calculate simple checksum if QUERY == 1 -> simple query is applied
|
|
|
|
check_file << sum_check(compare_value_a, data_a, data_b, workload_b); |
|
|
|
} else { |
|
|
|
check_file << sum_check_complex(compare_value_a, compare_value_b, data_a, data_b, workload_b); |
|
|
|
} |
|
|
|
check_file.close(); |
|
|
|
|
|
|
|
for(uint32_t i = 0; i < 15; i++) { |
|
|
|
std::promise<void> p; |
|
|
|
std::shared_future<void> ready_future(p.get_future()); |
|
|
|
|
|
|
|
const uint8_t tc_filter = 6; |
|
|
|
const uint8_t tc_copy = 2; |
|
|
|
const uint8_t tc_agg = 4; |
|
|
|
|
|
|
|
Query_Wrapper<base_t, simple_query, cache_a, wait_b> qw ( |
|
|
|
Query_Wrapper<base_t, mode == ExecMode::HbmPrefetch> qw ( |
|
|
|
&ready_future, workload_b, chunk_size.current, |
|
|
|
data_a, data_b, results, tc_filter, tc_copy, tc_agg, 50, 42 |
|
|
|
data_a, data_b, results, tc_filter, tc_copy, |
|
|
|
tc_agg,compare_value_a, compare_value_b |
|
|
|
); |
|
|
|
|
|
|
|
qw.ready_future = &ready_future; |
|
|
@ -225,7 +219,7 @@ int main(int argc, char** argv) { |
|
|
|
double seconds = (double)(nanos) / nanos_per_second; |
|
|
|
|
|
|
|
if (i >= 5) { |
|
|
|
print_to_file(out_file, run, chunk_size, "DSA-HBM-Prefetch", THREAD_GROUP_MULTIPLIER, seconds, |
|
|
|
print_to_file(out_file, run, chunk_size, THREAD_GROUP_MULTIPLIER, seconds, |
|
|
|
#ifdef THREAD_TIMINGS
|
|
|
|
qw.trt->summarize_time(0), qw.trt->summarize_time(1), qw.trt->summarize_time(2), |
|
|
|
#endif
|
|
|
@ -239,5 +233,5 @@ int main(int argc, char** argv) { |
|
|
|
|
|
|
|
numa_free(data_a, workload_b); |
|
|
|
numa_free(data_b, workload_b); |
|
|
|
numa_free(results, THREAD_GROUP_MULTIPLIER * thread_count * sizeof(base_t)); |
|
|
|
numa_free(results, THREAD_GROUP_MULTIPLIER * tc_combined * sizeof(base_t)); |
|
|
|
} |