diff --git a/offloading-cacher/cache.hpp b/offloading-cacher/cache.hpp
index 6a75305..01e4f49 100644
--- a/offloading-cacher/cache.hpp
+++ b/offloading-cacher/cache.hpp
@@ -39,7 +39,25 @@ namespace dml {
 }
 
 namespace dsacache {
-    constexpr bool WAIT_WEAK = true;
+    // true if at least one bit of flag is set in value
+    inline bool CheckFlag(const uint64_t value, const uint64_t flag) {
+        return (value & flag) != 0ULL;
+    }
+    // value with every bit of flag cleared
+    inline uint64_t UnsetFlag(const uint64_t value, const uint64_t flag) {
+        return value & ~flag;
+    }
+    // value with every bit of flag set, bitwise or keeps this
+    // correct when the flag is already present in value
+    inline uint64_t SetFlag(const uint64_t value, const uint64_t flag) {
+        return value | flag;
+    }
+
+    // CacheData::WaitOnCompletion returns without waiting
+    // while the copy handlers have not finished
+    constexpr uint64_t FLAG_WAIT_WEAK = 0b1ULL << 63;
+    // copies are submitted with dml::mem_copy.block_on_fault()
+    constexpr uint64_t FLAG_HANDLE_PF = 0b1ULL << 62;
 
     class Cache;
 
@@ -95,6 +113,9 @@ namespace dsacache {
         // contract: only access when being in sole posession of handlers
         uint8_t** incomplete_cache_;
 
+        // flags inherited from parent cache
+        uint64_t flags_ = 0;
+
         // dml handler vector pointer which is used
         // to wait on caching task completion
         std::atomic* handler_;
@@ -122,13 +143,16 @@ namespace dsacache {
         // waits on completion of caching operations
         // for this task and is safe to be called in
         // any state of the object
-        void WaitOnCompletion(const bool weak = false);
+        void WaitOnCompletion();
 
         // returns the cache data location for this
         // instance which is valid as long as the
         // instance is alive - !!! this may also
         // yield a nullptr !!!
         uint8_t* GetDataLocation() const { return cache_->load(); }
+
+        void SetFlags(const uint64_t flags) { flags_ = flags; }
+        uint64_t GetFlags() const { return flags_; }
     };
 
     /*
@@ -204,8 +228,9 @@ namespace dsacache {
         typedef std::vector (CopyPolicy)(const int numa_dst_node, const int numa_src_node, const size_t data_size);
 
     private:
-        // mutex for accessing the cache state map
+        // option flags (FLAG_* bits) copied into every task created by Access
 
+        uint64_t flags_ = 0;
 
         // map from [dst-numa-node,map2]
         // map2 from [data-ptr,cache-structure]
@@ -276,6 +301,9 @@ namespace dsacache {
         void Clear();
 
         void Invalidate(uint8_t* data);
+
+        void SetFlags(const uint64_t flags) { flags_ = flags; }
+        uint64_t GetFlags() const { return flags_; }
     };
 }
 
@@ -334,9 +362,10 @@ inline std::unique_ptr dsacache::Cache::Access(uint8_t* dat
     }
 
     // at this point the requested data is not present in cache
-    // and we create a caching task for it
+    // and we create a caching task for it, copying our current flags
 
     task = std::make_unique(data, size);
+    task->SetFlags(flags_);
 
     {
         LockedNodeCacheState* local_cache_state = cache_state_[dst_node];
@@ -439,10 +468,18 @@ inline dml::handler> dsacache::
     dml::const_data_view srcv = dml::make_view(src, size);
     dml::data_view dstv = dml::make_view(dst, size);
 
-    return dml::submit(
-        dml::mem_copy.block_on_fault(), srcv, dstv,
-        dml::execution_interface>(), node
-    );
+    if (CheckFlag(flags_, FLAG_HANDLE_PF)) {
+        return dml::submit(
+            dml::mem_copy.block_on_fault(), srcv, dstv,
+            dml::execution_interface>(), node
+        );
+    }
+    else {
+        return dml::submit(
+            dml::mem_copy, srcv, dstv,
+            dml::execution_interface>(), node
+        );
+    }
 }
 
 inline void dsacache::Cache::GetCacheNode(uint8_t* src, const size_t size, int* OUT_DST_NODE, int* OUT_SRC_NODE) const {
@@ -598,6 +635,7 @@ inline dsacache::CacheData::CacheData(const dsacache::CacheData& other) {
     src_ = other.src_;
     size_ = other.size_;
     cache_ = other.cache_;
+    flags_ = other.flags_;
 
     incomplete_cache_ = other.incomplete_cache_;
     handler_ = other.handler_;
@@ -643,7 +681,7 @@ inline void dsacache::CacheData::Deallocate() {
     else;
 }
 
-inline void dsacache::CacheData::WaitOnCompletion(const bool weak) {
+inline void dsacache::CacheData::WaitOnCompletion() {
     // first check if waiting is even neccessary as a valid
     // cache pointer signals that no waiting is to be performed
 
@@ -674,7 +712,7 @@ inline void dsacache::CacheData::WaitOnCompletion(const bool weak) {
     // at this point we are responsible for waiting for the handlers
     // and handling any error that comes through them gracefully
 
-    if (weak && !local_handler->is_finished()) {
+    if (CheckFlag(flags_, FLAG_WAIT_WEAK) && !local_handler->is_finished()) {
         handler_->store(local_handler);
         return;
     }
diff --git a/qdp_project/src/Benchmark.cpp b/qdp_project/src/Benchmark.cpp
index d895635..12d829c 100644
--- a/qdp_project/src/Benchmark.cpp
+++ b/qdp_project/src/Benchmark.cpp
@@ -15,7 +15,7 @@
 #include "../../offloading-cacher/cache.hpp"
 
 #ifndef MODE_SET_BY_CMAKE
-#define MODE_COMPLEX_HBM
+#define MODE_SIMPLE_PREFETCH
 #endif
 
 #include "Configuration.hpp"
@@ -126,7 +126,7 @@ void aggr_j(size_t gid, size_t tid) {
 
         if constexpr (PERFORM_CACHING) {
             data = CACHE_.Access(reinterpret_cast(chunk_ptr), CHUNK_SIZE_B / TC_AGGRJ);
-            data->WaitOnCompletion(dsacache::WAIT_WEAK);
+            data->WaitOnCompletion();
             data_ptr = reinterpret_cast(data->GetDataLocation());
 
             if (data_ptr == nullptr) {
@@ -195,6 +195,13 @@ int main() {
     DATA_DST_ = (uint64_t*) numa_alloc_local(TC_AGGRJ * GROUP_COUNT * sizeof(uint64_t));
 
     if constexpr (PERFORM_CACHING) {
+        // cache will be configured to wait weak by default
+        // it will also not handle page faults which cause delay
+        // it will use the copy and caching policy from BenchmarkHelpers.cpp
+        // which is configured for xeonmax with smart assignment
+        uint64_t cache_flags = 0;
+        cache_flags |= dsacache::FLAG_WAIT_WEAK;
+        CACHE_.SetFlags(cache_flags);
         CACHE_.Init(CachePlacementPolicy, CopyMethodPolicy);
     }
 
diff --git a/qdp_project/src/Configuration.hpp b/qdp_project/src/Configuration.hpp
index 81e8e18..545068f 100644
--- a/qdp_project/src/Configuration.hpp
+++ b/qdp_project/src/Configuration.hpp
@@ -5,11 +5,11 @@ constexpr uint32_t WARMUP_ITERATION_COUNT = 5;
 constexpr uint32_t ITERATION_COUNT = 5;
 
 #ifdef MODE_SIMPLE_PREFETCH
-constexpr uint32_t GROUP_COUNT = 16;
+constexpr uint32_t GROUP_COUNT = 8;
 constexpr size_t CHUNK_SIZE_B = 8_MiB;
-constexpr uint32_t TC_SCANA = 2;
+constexpr uint32_t TC_SCANA = 4;
 constexpr uint32_t TC_SCANB = 1;
-constexpr uint32_t TC_AGGRJ = 2;
+constexpr uint32_t TC_AGGRJ = 4;
 constexpr bool PERFORM_CACHING = true;
 constexpr bool STORE_B_IN_HBM = false;
 constexpr char MODE_STRING[] = "simple-prefetch";
@@ -38,11 +38,11 @@ constexpr char MODE_STRING[] = "simple-hbm";
 constexpr bool COMPLEX_QUERY = false;
 #endif
 #ifdef MODE_COMPLEX_PREFETCH
-constexpr uint32_t GROUP_COUNT = 16;
+constexpr uint32_t GROUP_COUNT = 8;
 constexpr size_t CHUNK_SIZE_B = 8_MiB;
-constexpr uint32_t TC_SCANA = 1;
+constexpr uint32_t TC_SCANA = 4;
 constexpr uint32_t TC_SCANB = 1;
-constexpr uint32_t TC_AGGRJ = 2;
+constexpr uint32_t TC_AGGRJ = 4;
 constexpr bool PERFORM_CACHING = true;
 constexpr bool STORE_B_IN_HBM = false;
 constexpr char MODE_STRING[] = "complex-prefetch";