This repository contains my bachelor's thesis, the associated TeX files, code snippets, and possibly more. Topic: Data Movement in Heterogeneous Memories with Intel Data Streaming Accelerator
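The example program below exercises the dsacache::Cache interface from the code snippets: InitCache installs cache-placement and copy policies for either a generic system ("default") or the Xeon Max HBM node layout ("xeonmax"), and the st, mt and flt modes run a single-threaded test, a multi-threaded test and a flush test over randomly filled buffers.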

#include <iostream>
#include <random>
#include <vector>
#include <string>
#include <memory>
#include <limits>
#include <cstdint>
#include <cstdlib>
#include <omp.h>

#include "cache.hpp"

static constexpr size_t SIZE_64_MIB = 64 * 1024 * 1024;

dsacache::Cache CACHE;
void InitCache(const std::string& device) {
    if (device == "default") {
        auto cache_policy = [](const int numa_dst_node, const int numa_src_node, const size_t data_size) {
            return numa_dst_node;
        };

        auto copy_policy = [](const int numa_dst_node, const int numa_src_node, const size_t data_size) {
            return std::vector<int>{ numa_dst_node };
        };

        CACHE.Init(cache_policy, copy_policy);
    }
    else if (device == "xeonmax") {
        auto cache_policy = [](const int numa_dst_node, const int numa_src_node, const size_t data_size) {
            // xeon max is configured to have hbm on node ids that are +8
            return numa_dst_node < 8 ? numa_dst_node + 8 : numa_dst_node;
        };

        auto copy_policy = [](const int numa_dst_node, const int numa_src_node, const size_t data_size) {
            if (data_size < SIZE_64_MIB) {
                // if the data size is small then the copy will just be carried
                // out by the destination node which does not require setting numa
                // thread affinity as the selected dsa engine is already the one
                // present on the calling thread
                return std::vector<int>{ (numa_dst_node >= 8 ? numa_dst_node - 8 : numa_dst_node) };
            }
            else {
                // for sufficiently large data, smart copy is used which will utilize
                // all four engines for intra-socket copy operations and cross copy on
                // the source and destination nodes for inter-socket copy
                const bool same_socket = ((numa_dst_node ^ numa_src_node) & 4) == 0;

                if (same_socket) {
                    // bit 2 selects the socket, which also holds for hbm node ids (+8)
                    const int socket_number = (numa_dst_node >> 2) & 1;
                    if (socket_number == 0) return std::vector<int>{ 0, 1, 2, 3 };
                    else return std::vector<int>{ 4, 5, 6, 7 };
                }
                else {
                    return std::vector<int>{
                        (numa_src_node >= 8 ? numa_src_node - 8 : numa_src_node),
                        (numa_dst_node >= 8 ? numa_dst_node - 8 : numa_dst_node)
                    };
                }
            }
        };

        CACHE.Init(cache_policy, copy_policy);
    }
    else {
        std::cerr << "Given device '" << device << "' not supported!" << std::endl;
        exit(-1);
    }
}
uint8_t* GetRandomArray(const size_t size) {
    uint8_t* array = new uint8_t[size];

    // uniform_int_distribution is not specified for 8-bit types,
    // therefore draw ints and narrow them to uint8_t
    std::uniform_int_distribution<int> unif(std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max());
    std::default_random_engine re;

    for (size_t i = 0; i < size; i++) {
        array[i] = static_cast<uint8_t>(unif(re));
    }

    return array;
}
bool IsEqual(const uint8_t* a, const uint8_t* b, const size_t size) {
    // plain element-wise comparison; an invalid pointer access can not
    // be caught as a c++ exception, so no try-catch is attempted here
    for (size_t i = 0; i < size; i++) {
        if (a[i] != b[i]) return false;
    }

    return true;
}
std::unique_ptr<dsacache::CacheData> PerformAccessAndTest(uint8_t* src, const size_t size, const int tid) {
    std::unique_ptr<dsacache::CacheData> data_cache = CACHE.Access(
        reinterpret_cast<uint8_t *>(src),
        size * sizeof(uint8_t)
    );

    data_cache->SetFlags(dsacache::FLAG_WAIT_WEAK);
    data_cache->WaitOnCompletion();

    uint8_t* cached_imm = reinterpret_cast<uint8_t *>(data_cache->GetDataLocation());

    // check the value immediately just to see if ram or cache was returned
    if (src == cached_imm) {
        std::cout << "[" << tid << "] Caching did not immediately yield different data location." << std::endl;
    }
    else if (cached_imm == nullptr) {
        std::cout << "[" << tid << "] Immediately got nullptr." << std::endl;
    }
    else {
        std::cout << "[" << tid << "] Immediately got different data location." << std::endl;
    }

    // waits for the completion of the asynchronous caching operation
    data_cache->SetFlags(dsacache::FLAG_DEFAULT);
    data_cache->WaitOnCompletion();

    // gets the cache-data-location from the struct
    uint8_t* cached = reinterpret_cast<uint8_t *>(data_cache->GetDataLocation());

    // tests on the resulting value
    if (src == cached) {
        std::cout << "[" << tid << "] Caching did not affect data location." << std::endl;
    }
    else if (cached == nullptr) {
        std::cerr << "[" << tid << "] Got nullptr from cache." << std::endl;
    }
    else {
        std::cout << "[" << tid << "] Got different data location from cache." << std::endl;
    }

    if (IsEqual(src, cached, size)) {
        std::cout << "[" << tid << "] Cached data is correct." << std::endl;
    }
    else {
        std::cerr << "[" << tid << "] Cached data is wrong." << std::endl;
    }

    return data_cache;
}
void RunTestST(const size_t size) {
    uint8_t* data = GetRandomArray(size);
    static constexpr int tid = 0;

    std::cout << "[" << tid << "] first access --- " << std::endl;
    PerformAccessAndTest(data, size, tid);

    std::cout << "[" << tid << "] second access --- " << std::endl;
    PerformAccessAndTest(data, size, tid);

    std::cout << "[" << tid << "] end of application --- " << std::endl;
}

void RunTestMT(const size_t size) {
    uint8_t* data = GetRandomArray(size);

    #pragma omp parallel
    {
        const int tid = omp_get_thread_num();

        std::cout << "[" << tid << "] first access --- " << std::endl;
        PerformAccessAndTest(data, size, tid);

        std::cout << "[" << tid << "] second access --- " << std::endl;
        PerformAccessAndTest(data, size, tid);

        std::cout << "[" << tid << "] end of block --- " << std::endl;
    }
}
void RunTestFlush(const size_t size) {
    uint8_t* data1 = GetRandomArray(size);
    uint8_t* data2 = GetRandomArray(size);
    uint8_t* data3 = GetRandomArray(size);

    static constexpr int tid = 0;

    std::cout << "[" << tid << "] first access to data d1 and keepalive --- " << std::endl;
    const auto c1 = PerformAccessAndTest(data1, size, tid);

    std::cout << "[" << tid << "] second access to d2 lets d2 vanish --- " << std::endl;
    PerformAccessAndTest(data2, size, tid);

    std::cout << "[" << tid << "] third access to d3 should clear d2 --- " << std::endl;
    PerformAccessAndTest(data3, size, tid);

    std::cout << "[" << tid << "] end of block and test d1 == cache1 --- " << std::endl;

    if (IsEqual(data1, reinterpret_cast<uint8_t *>(c1->GetDataLocation()), size)) {
        std::cout << "[" << tid << "] Cached d1 is still correct." << std::endl;
    }
    else {
        std::cerr << "[" << tid << "] Cached d1 is bad." << std::endl;
    }
}
int main(int argc, char **argv) {
    if (argc != 4) {
        std::cerr << "This application requires three parameters!" << std::endl;
        std::cout << "Please provide the following positional arguments: [device] [mode] [size]" << std::endl;
        std::cout << "[device] from { default, xeonmax } which influences cache and execution placement" << std::endl;
        std::cout << "[mode] from { st, mt, flt } for single-threaded, multi-threaded and flush test respectively" << std::endl;
        std::cout << "[size] positive integral number, amount of bytes in data array" << std::endl;
        std::cout << "for the flush test the given size should be 1/3 of the available cache size" << std::endl;
        exit(-1);
    }

    const std::string device = argv[1];
    const std::string mode = argv[2];
    const std::string size_s = argv[3];

    size_t size = 0;

    try {
        size = std::stoul(size_s);
    }
    catch (...) {
        std::cerr << "Given Size '" << size_s << "' caused error during conversion to number!" << std::endl;
        exit(-1);
    }

    InitCache(device);

    if (mode == "st") {
        RunTestST(size);
    }
    else if (mode == "mt") {
        RunTestMT(size);
    }
    else if (mode == "flt") {
        RunTestFlush(size);
    }
    else {
        std::cerr << "Given Mode '" << mode << "' not supported!" << std::endl;
        exit(-1);
    }
}
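
For a quick orientation, the access pattern from PerformAccessAndTest can be condensed into the following minimal sketch. It reuses only calls that appear above (Init, Access, WaitOnCompletion, GetDataLocation); the trivial policies and the 64 MiB buffer size are illustrative placeholders, not recommendations.

#include <cstdint>
#include <cstddef>
#include <memory>
#include <vector>
#include "cache.hpp"

int main() {
    dsacache::Cache cache;

    // illustrative policies: cache on the requesting node and let
    // the dsa engine of that same node perform the copy
    cache.Init(
        [](const int dst, const int src, const size_t size) { return dst; },
        [](const int dst, const int src, const size_t size) { return std::vector<int>{ dst }; }
    );

    const size_t size = 64 * 1024 * 1024;
    uint8_t* data = new uint8_t[size];

    // request a cached copy of the buffer and block until it is ready
    std::unique_ptr<dsacache::CacheData> entry = cache.Access(data, size);
    entry->WaitOnCompletion();

    // location of the cached copy, or the source pointer / nullptr if caching failed
    uint8_t* cached = reinterpret_cast<uint8_t*>(entry->GetDataLocation());
    (void)cached;

    // release the cache entry before freeing the source buffer
    entry.reset();
    delete[] data;

    return 0;
}

A typical invocation of the full test program above would then look like 'main xeonmax mt 134217728' (the binary name and the 128 MiB size are only examples), matching the positional arguments documented in main.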