// bachelor-thesis/offloading-cacher/offloading-cache.hpp


								#pragma once


								#include <atomic>

								#include <vector>

								#include <thread>

								#include <unordered_map>


								#include <semaphore.h>


								#include <numa.h>


								#include <dml/dml.hpp>


								namespace offcache {

								    // Selects how a cache request is to be served. Under every policy
								    // the returned data location is valid; on top of that each policy
								    // guarantees the following:
								    //
								    // Relaxed:          no rapid return is needed; triggers caching and
								    //                   may return only once caching was successful,
								    //                   but can still provide the data located in RAM
								    // Immediate:        returns as fast as possible; may return cached
								    //                   data or data located in RAM and triggers
								    //                   caching of the data provided
								    // ImmediateNoCache: same as Immediate, but never triggers caching
								    enum class ExecutionPolicy {
								        Relaxed,
								        Immediate,
								        ImmediateNoCache
								    };


								    // A single unit of work handed to a cache worker.
								    // NOTE(review): judging by the member names this describes a copy of
								    // size_ bytes from src_ to dst_ - no code in this file reads them yet,
								    // confirm once the worker loop is implemented.
								    //
								    // completed_ starts out false; CacheCoordinator::WaitOnCompletion
								    // blocks on it until its value is no longer false.
								    //
								    // Every member carries a default initializer so that a default-
								    // constructed task never exposes indeterminate pointers or sizes.
								    struct WorkerTask {
								        uint8_t* src_ { nullptr };
								        uint8_t* dst_ { nullptr };
								        size_t size_ { 0 };
								        std::atomic<bool> completed_ { false };
								    };


								    // The cache task structure is used to submit and control a cache
								    // element; data_ and size_ provide the source pointer and the size
								    // in bytes for submission, policy_ the requested execution policy.
								    //
								    // result_ holds the location of the data that is handed back to the
								    // submitter and active_ flags whether the element is still in use
								    // (CacheCoordinator::SignalDataUnused stores false into it).
								    //
								    // NOTE(review): the previous comment described waiting on an atomic
								    // "result" that the cache worker notifies. result_ is a plain pointer;
								    // the only waiting done in this file is on sub_tasks_[i].completed_.
								    // Confirm which synchronisation scheme is intended.
								    //
								    // All members are default-initialized so a freshly created task never
								    // carries indeterminate pointers (result_ may be returned to callers).
								    struct CacheTask {
								        uint8_t* data_ { nullptr };
								        size_t size_ { 0 };
								        ExecutionPolicy policy_ { ExecutionPolicy::Relaxed };
								        uint8_t* result_ { nullptr };
								        std::atomic<bool> active_ { false };
								        std::vector<WorkerTask> sub_tasks_;
								    };


								    // Cache worker, of which one is created per numa node.
								    // By design a worker discovers its node configuration on startup
								    // and keeps track of the memory available on that node.
								    class CacheWorker {
								    public:
								        // id of the numa node this worker belongs to
								        uint8_t numa_node_ { 0 };

								        // Mailbox of the worker: a new task is submitted by exchanging
								        // the stored nullptr with a valid task pointer and notifying on
								        // the atomic, after which ownership of the submitted task
								        // structure is transferred to the worker.
								        std::atomic<WorkerTask*>* task_slot_ { nullptr };

								        // entry point of the worker thread, receives the worker
								        // instance it operates on
								        static void run(CacheWorker* this_);
								    };


								    // Holds the cache workers and is the place where work is submitted.
								    // Intended to be used as a singleton.
								    class CacheCoordinator {
								    public:
								        // The cache policy is a function type so that the cacher can be
								        // used flexibly. Given the numa destination node (where the data
								        // will be needed), the numa source node (current location of the
								        // data) and the data size, the function should return the optimal
								        // cache placement. The returned node may differ from the
								        // destination node if the system, for example, has HBM attached
								        // that is directly accessible to node n under a different node id m.
								        using CachePolicy = uint8_t(const uint8_t numa_dst_node, const uint8_t numa_src_node, const size_t data_size);

								        // The copy policy names the nodes that execute the copy for a
								        // given task, which allows tuning the assignment for raw
								        // throughput or choosing a conservative usage policy.
								        using CopyPolicy = std::vector<uint8_t>(const uint8_t numa_dst_node, const uint8_t numa_src_node);

								        // stores both policy functions and sets up one worker per
								        // usable numa node
								        void Init(CachePolicy* cache_policy_function, CopyPolicy* copy_policy_function);

								        // submits the given task and takes ownership of the pointer
								        void SubmitTask(CacheTask* task, const ExecutionPolicy policy) const;

								        // waits upon completion of caching
								        // returns the location of the data
								        static uint8_t* WaitOnCompletion(CacheTask* task);

								        // marks the data of the given task as no longer in use
								        // by clearing its active_ flag
								        static void SignalDataUnused(CacheTask* task);

								    private:
								        // workers, keyed by the numa node they belong to
								        std::unordered_map<uint8_t, CacheWorker> workers_;

								        // NOTE(review): presumably maps a source data pointer to the
								        // task caching it; nothing in this file fills it yet
								        std::unordered_map<uint8_t*, CacheTask*> cache_state_;

								        CachePolicy* cache_policy_function_ { nullptr };
								        CopyPolicy* copy_policy_function_ { nullptr };
								    };

								}


								// Entry point of a worker thread; receives the worker it operates on.
								// The worker loop is not implemented yet, the function returns at once.
								//
								// Declared inline because this definition lives in a header: without it
								// every translation unit including the header emits its own definition
								// and linking fails with a multiple-definition error.
								inline void offcache::CacheWorker::run(CacheWorker* this_) {
								    // intentionally empty
								}


								// Stores the two policy functions and creates one CacheWorker, plus a
								// detached thread running CacheWorker::run, for every numa node the
								// process is allowed to allocate memory from.
								//
								// cache_policy_function: chooses the node on which data gets cached
								// copy_policy_function:  chooses the nodes that execute a copy
								//
								// If libnuma reports that numa is unavailable, or the allowed-nodes
								// mask cannot be obtained, no workers are created; the policies are
								// stored regardless. Calling Init again does not spawn a second thread
								// for a node that already has a worker.
								//
								// Declared inline because this definition lives in a header.
								inline void offcache::CacheCoordinator::Init(CachePolicy* cache_policy_function, CopyPolicy* copy_policy_function) {
								    cache_policy_function_ = cache_policy_function;
								    copy_policy_function_ = copy_policy_function;

								    // numa_available() has to be called before any other libnuma
								    // function; when it returns a negative value the results of all
								    // other libnuma calls are undefined
								    if (numa_available() < 0) {
								        return;
								    }

								    // nodes this process may allocate memory from
								    struct bitmask* valid_nodes = numa_get_mems_allowed();

								    if (valid_nodes == nullptr) {
								        return;
								    }

								    // node ids need not be contiguous, therefore iterate up to the
								    // highest node id instead of up to the number of nodes
								    const int max_node = numa_max_node();

								    // workers_ is keyed by uint8_t, larger node ids are not representable
								    constexpr int max_representable_node = 0xFF;

								    for (int node = 0; node <= max_node && node <= max_representable_node; node++) {
								        if (!numa_bitmask_isbitset(valid_nodes, static_cast<unsigned int>(node))) {
								            continue;
								        }

								        const uint8_t node_id = static_cast<uint8_t>(node);

								        const auto [entry, inserted] = workers_.try_emplace(node_id);

								        // a worker for this node is already running
								        if (!inserted) {
								            continue;
								        }

								        // references to unordered_map elements stay valid when further
								        // elements are inserted, so the address may be handed to the
								        // thread. NOTE(review): the detached thread must not outlive
								        // this coordinator - confirm it really lives as a singleton.
								        CacheWorker& worker = entry->second;
								        worker.numa_node_ = node_id;

								        std::thread t (CacheWorker::run, &worker);
								        t.detach();
								    }

								    // the mask returned by numa_get_mems_allowed() is owned by the caller
								    numa_bitmask_free(valid_nodes);
								}


								// Submits the given task and takes ownership of the pointer.
								// Submission is not implemented yet: the function currently returns
								// without touching either argument.
								//
								// Declared inline because this definition lives in a header: without it
								// every translation unit including the header emits its own definition
								// and linking fails with a multiple-definition error.
								inline void offcache::CacheCoordinator::SubmitTask(CacheTask* task, const ExecutionPolicy policy) const {
								    // intentionally empty
								}


								// Waits until every sub task of the given task is completed and
								// returns the location of the data (task->result_).
								//
								// Sub tasks are waited on back to front and removed from
								// task->sub_tasks_ once completed, so the vector is empty on return.
								// task must not be null.
								//
								// Declared inline because this definition lives in a header.
								inline uint8_t* offcache::CacheCoordinator::WaitOnCompletion(CacheTask* task) {
								    while (!task->sub_tasks_.empty()) {
								        // blocks for as long as completed_ still holds false
								        task->sub_tasks_.back().completed_.wait(false);
								        task->sub_tasks_.pop_back();
								    }

								    // previously the function fell off its end without returning a
								    // value, which is undefined behaviour for a non-void function
								    return task->result_;
								}


								// Signals that the data of the given task is no longer in use by
								// storing false into task->active_. task must not be null.
								//
								// Declared inline because this definition lives in a header: without it
								// every translation unit including the header emits its own definition
								// and linking fails with a multiple-definition error.
								inline void offcache::CacheCoordinator::SignalDataUnused(CacheTask* task) {
								    task->active_.store(false);
								}