Merge branch 'master' of https://git.constantin-fuerst.com/constantin/bachelor-thesis

11 months ago · cff9081f4c
8 changed files with 29 additions and 38 deletions
--- a/offloading-cacher/cache.hpp
+++ b/offloading-cacher/cache.hpp
@ -400,12 +400,6 @@ inline void dsacache::Cache::SubmitTask(CacheData* task, const int dst_node, con
    const size_t size = task->GetSize() / task_count;
    const size_t last_size = size + task->GetSize() % task_count;
    // save the current numa node mask to restore later
    // as executing the copy task will place this thread
    // on a different node
    bitmask* nodemask = numa_get_run_node_mask();
    auto handlers = new std::vector<dml_job_t*>();
    for (uint32_t i = 0; i < task_count; i++) {
@ -418,18 +412,11 @@ inline void dsacache::Cache::SubmitTask(CacheData* task, const int dst_node, con
    }
    task->SetTaskHandlersAndCache(dst, handlers);
    // restore the previous nodemask
    numa_run_on_node_mask(nodemask);
    numa_free_nodemask(nodemask);
 }
 inline dml_job_t* dsacache::Cache::ExecuteCopy(
        const uint8_t* src, uint8_t* dst, const size_t size, const int node
 ) const {
    numa_run_on_node(node);
    uint32_t job_size = 0;
    dml_status_t status = dml_get_job_size(DML_PATH_HW, &job_size);
@ -452,6 +439,7 @@ inline dml_job_t* dsacache::Cache::ExecuteCopy(
    job->destination_first_ptr = dst;
    job->source_length = size;
    job->flags |= DML_FLAG_BLOCK_ON_FAULT | DML_FLAG_COPY_ONLY;
    job->numa_id = node;
    status = dml_submit_job(job);
@ -642,8 +630,12 @@ inline dsacache::CacheData::~CacheData() {
        Deallocate();
        for (dml_job_t* job : *handlers_->load()) {
            if (job != nullptr) delete job;
 	std::vector<dml_job_t*>* handlers = handlers_->load();
 	if (handlers != nullptr && handlers != reinterpret_cast<std::vector<dml_job_t*>*>(maxptr)) {
 	    for (dml_job_t* job : *handlers_->load()) {
 	        if (job != nullptr) delete job;
 	    }
        }
        delete active_;
--- a/qdp_project/results/qdp-xeonmax-simpleq-dram-tca4-tcb0-tcj2-tmul4-wl2147483648-cs2097152.csv
+++ b/qdp_project/results/qdp-xeonmax-simpleq-dram-tca4-tcb0-tcj2-tmul4-wl2147483648-cs2097152.csv
@ -1,6 +0,0 @@
 run;time;result[0];
 0;22199017;0;
 1;16588422;0;
 2;18267635;0;
 3;17026004;0;
 4;16958071;0;
--- a/qdp_project/results/qdp-xeonmax-simpleq-dram-tca4-tcb0-tcj2-tmul4-wl4294967296-cs2097152.csv
+++ b/qdp_project/results/qdp-xeonmax-simpleq-dram-tca4-tcb0-tcj2-tmul4-wl4294967296-cs2097152.csv
@ -0,0 +1,6 @@
 run;time;result[0];
 0;97400868;0;
 1;97565944;0;
 2;89098555;0;
 3;93226925;0;
 4;97550283;0;
--- a/qdp_project/results/qdp-xeonmax-simpleq-hbm-tca4-tcb0-tcj2-tmul4-wl2147483648-cs2097152.csv
+++ b/qdp_project/results/qdp-xeonmax-simpleq-hbm-tca4-tcb0-tcj2-tmul4-wl2147483648-cs2097152.csv
@ -1,6 +0,0 @@
 run;time;result[0];
 0;14448722;0;
 1;17734795;0;
 2;19240141;0;
 3;15579654;0;
 4;14252101;0;
--- a/qdp_project/results/qdp-xeonmax-simpleq-hbm-tca4-tcb0-tcj2-tmul4-wl4294967296-cs2097152.csv
+++ b/qdp_project/results/qdp-xeonmax-simpleq-hbm-tca4-tcb0-tcj2-tmul4-wl4294967296-cs2097152.csv
@ -0,0 +1,6 @@
 run;time;result[0];
 0;67853704;0;
 1;85513791;0;
 2;66482278;0;
 3;67492755;0;
 4;68083298;0;
--- a/qdp_project/results/qdp-xeonmax-simpleq-prefetch-tca1-tcb1-tcj1-tmul32-wl2147483648-cs134217728.csv
+++ b/qdp_project/results/qdp-xeonmax-simpleq-prefetch-tca1-tcb1-tcj1-tmul32-wl2147483648-cs134217728.csv
@ -1,6 +0,0 @@
 run;time;result[0];
 0;916843;0;
 1;1060229;0;
 2;914006;0;
 3;1217119;0;
 4;1029607;0;
--- a/qdp_project/results/qdp-xeonmax-simpleq-prefetch-tca1-tcb1-tcj1-tmul32-wl4294967296-cs134217728.csv
+++ b/qdp_project/results/qdp-xeonmax-simpleq-prefetch-tca1-tcb1-tcj1-tmul32-wl4294967296-cs134217728.csv
@ -0,0 +1,6 @@
 run;time;result[0];
 0;6136464327;0;
 1;6125809164;0;
 2;6122382339;0;
 3;6207611081;0;
 4;6149495534;0;
--- a/qdp_project/src/Benchmark.cpp
+++ b/qdp_project/src/Benchmark.cpp
@ -20,7 +20,7 @@
 #include "BenchmarkHelpers.cpp"
 #define MODE_PREFETCH
 #define MODE_HBM
 ////////////////////////////////
 /// BENCHMARK SETUP
@ -30,8 +30,8 @@ constexpr uint32_t WARMUP_ITERATION_COUNT = 5;
 constexpr uint32_t ITERATION_COUNT = 5;
 #ifdef MODE_PREFETCH
 constexpr size_t CHUNK_SIZE_B = 256_MiB;
 constexpr uint32_t GROUP_COUNT = 8;
 constexpr size_t CHUNK_SIZE_B = 128_MiB;
 constexpr uint32_t GROUP_COUNT = 32;
 constexpr uint32_t TC_SCANA = 1;
 constexpr uint32_t TC_SCANB = 2;
 constexpr uint32_t TC_AGGRJ = 2;
@ -74,7 +74,6 @@ constexpr size_t MASK_STEP_SIZE = CHUNK_SIZE_ELEMENTS / MASK_ELEMENT_SIZE;
 static_assert(RUN_COUNT > 0);
 static_assert(TC_SCANB <= TC_AGGRJ);
 static_assert(TC_AGGRJ % TC_SCANB == 0);
 static_assert(WL_SIZE_B % 16 == 0);
 static_assert(CHUNK_SIZE_B % 16 == 0);
@ -168,7 +167,7 @@ void process_timings(
 }
 void scan_b(size_t gid, size_t tid) {
    constexpr size_t split =  TC_AGGRJ / TC_SCANB;
    constexpr size_t split =  TC_AGGRJ / (TC_SCANB == 0 ? 1 : TC_SCANB);
    const size_t start = tid * split;
    const size_t end = start + split;