Constantin Fürst 11 months ago
parent
commit
cff9081f4c
  1. 18
      offloading-cacher/cache.hpp
  2. 6
      qdp_project/results/qdp-xeonmax-simpleq-dram-tca4-tcb0-tcj2-tmul4-wl2147483648-cs2097152.csv
  3. 6
      qdp_project/results/qdp-xeonmax-simpleq-dram-tca4-tcb0-tcj2-tmul4-wl4294967296-cs2097152.csv
  4. 6
      qdp_project/results/qdp-xeonmax-simpleq-hbm-tca4-tcb0-tcj2-tmul4-wl2147483648-cs2097152.csv
  5. 6
      qdp_project/results/qdp-xeonmax-simpleq-hbm-tca4-tcb0-tcj2-tmul4-wl4294967296-cs2097152.csv
  6. 6
      qdp_project/results/qdp-xeonmax-simpleq-prefetch-tca1-tcb1-tcj1-tmul32-wl2147483648-cs134217728.csv
  7. 6
      qdp_project/results/qdp-xeonmax-simpleq-prefetch-tca1-tcb1-tcj1-tmul32-wl4294967296-cs134217728.csv
  8. 9
      qdp_project/src/Benchmark.cpp

18
offloading-cacher/cache.hpp

@ -400,12 +400,6 @@ inline void dsacache::Cache::SubmitTask(CacheData* task, const int dst_node, con
const size_t size = task->GetSize() / task_count;
const size_t last_size = size + task->GetSize() % task_count;
// save the current numa node mask to restore later
// as executing the copy task will place this thread
// on a different node
bitmask* nodemask = numa_get_run_node_mask();
auto handlers = new std::vector<dml_job_t*>();
for (uint32_t i = 0; i < task_count; i++) {
@ -418,18 +412,11 @@ inline void dsacache::Cache::SubmitTask(CacheData* task, const int dst_node, con
}
task->SetTaskHandlersAndCache(dst, handlers);
// restore the previous nodemask
numa_run_on_node_mask(nodemask);
numa_free_nodemask(nodemask);
}
inline dml_job_t* dsacache::Cache::ExecuteCopy(
const uint8_t* src, uint8_t* dst, const size_t size, const int node
) const {
numa_run_on_node(node);
uint32_t job_size = 0;
dml_status_t status = dml_get_job_size(DML_PATH_HW, &job_size);
@ -452,6 +439,7 @@ inline dml_job_t* dsacache::Cache::ExecuteCopy(
job->destination_first_ptr = dst;
job->source_length = size;
job->flags |= DML_FLAG_BLOCK_ON_FAULT | DML_FLAG_COPY_ONLY;
job->numa_id = node;
status = dml_submit_job(job);
@ -642,9 +630,13 @@ inline dsacache::CacheData::~CacheData() {
Deallocate();
std::vector<dml_job_t*>* handlers = handlers_->load();
if (handlers != nullptr && handlers != reinterpret_cast<std::vector<dml_job_t*>*>(maxptr)) {
for (dml_job_t* job : *handlers_->load()) {
if (job != nullptr) delete job;
}
}
delete active_;
delete cache_;

6
qdp_project/results/qdp-xeonmax-simpleq-dram-tca4-tcb0-tcj2-tmul4-wl2147483648-cs2097152.csv

@ -1,6 +0,0 @@
run;time;result[0];
0;22199017;0;
1;16588422;0;
2;18267635;0;
3;17026004;0;
4;16958071;0;

6
qdp_project/results/qdp-xeonmax-simpleq-dram-tca4-tcb0-tcj2-tmul4-wl4294967296-cs2097152.csv

@ -0,0 +1,6 @@
run;time;result[0];
0;97400868;0;
1;97565944;0;
2;89098555;0;
3;93226925;0;
4;97550283;0;

6
qdp_project/results/qdp-xeonmax-simpleq-hbm-tca4-tcb0-tcj2-tmul4-wl2147483648-cs2097152.csv

@ -1,6 +0,0 @@
run;time;result[0];
0;14448722;0;
1;17734795;0;
2;19240141;0;
3;15579654;0;
4;14252101;0;

6
qdp_project/results/qdp-xeonmax-simpleq-hbm-tca4-tcb0-tcj2-tmul4-wl4294967296-cs2097152.csv

@ -0,0 +1,6 @@
run;time;result[0];
0;67853704;0;
1;85513791;0;
2;66482278;0;
3;67492755;0;
4;68083298;0;

6
qdp_project/results/qdp-xeonmax-simpleq-prefetch-tca1-tcb1-tcj1-tmul32-wl2147483648-cs134217728.csv

@ -1,6 +0,0 @@
run;time;result[0];
0;916843;0;
1;1060229;0;
2;914006;0;
3;1217119;0;
4;1029607;0;

6
qdp_project/results/qdp-xeonmax-simpleq-prefetch-tca1-tcb1-tcj1-tmul32-wl4294967296-cs134217728.csv

@ -0,0 +1,6 @@
run;time;result[0];
0;6136464327;0;
1;6125809164;0;
2;6122382339;0;
3;6207611081;0;
4;6149495534;0;

9
qdp_project/src/Benchmark.cpp

@ -20,7 +20,7 @@
#include "BenchmarkHelpers.cpp"
#define MODE_PREFETCH
#define MODE_HBM
////////////////////////////////
/// BENCHMARK SETUP
@ -30,8 +30,8 @@ constexpr uint32_t WARMUP_ITERATION_COUNT = 5;
constexpr uint32_t ITERATION_COUNT = 5;
#ifdef MODE_PREFETCH
constexpr size_t CHUNK_SIZE_B = 256_MiB;
constexpr uint32_t GROUP_COUNT = 8;
constexpr size_t CHUNK_SIZE_B = 128_MiB;
constexpr uint32_t GROUP_COUNT = 32;
constexpr uint32_t TC_SCANA = 1;
constexpr uint32_t TC_SCANB = 2;
constexpr uint32_t TC_AGGRJ = 2;
@ -74,7 +74,6 @@ constexpr size_t MASK_STEP_SIZE = CHUNK_SIZE_ELEMENTS / MASK_ELEMENT_SIZE;
static_assert(RUN_COUNT > 0);
static_assert(TC_SCANB <= TC_AGGRJ);
static_assert(TC_AGGRJ % TC_SCANB == 0);
static_assert(WL_SIZE_B % 16 == 0);
static_assert(CHUNK_SIZE_B % 16 == 0);
@ -168,7 +167,7 @@ void process_timings(
}
void scan_b(size_t gid, size_t tid) {
constexpr size_t split = TC_AGGRJ / TC_SCANB;
constexpr size_t split = TC_AGGRJ / (TC_SCANB == 0 ? 1 : TC_SCANB);
const size_t start = tid * split;
const size_t end = start + split;

Loading…
Cancel
Save