
add a flags concept to the cacher and an option to select whether page faults are handled

branch: master
Constantin Fürst, 11 months ago
commit b3607329a6
changed files:
1. offloading-cacher/cache.hpp (51 changed lines)
2. qdp_project/src/Benchmark.cpp (11 changed lines)
3. qdp_project/src/Configuration.hpp (12 changed lines)
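
A minimal usage sketch of the new flags concept (not part of the commit): the cache-wide flags are set before Init and are copied into every CacheData returned by Access. CachePlacementPolicy and CopyMethodPolicy are trivial stand-ins here; their signatures are assumed to match the policy typedefs in cache.hpp, and the include path is assumed relative to the repository root.

    #include <cstdint>
    #include <cstddef>
    #include <memory>
    #include <vector>
    #include "offloading-cacher/cache.hpp"

    // stand-in policies for this sketch; the benchmark uses the ones from BenchmarkHelpers.cpp
    int CachePlacementPolicy(const int numa_dst_node, const int numa_src_node, const size_t data_size) {
        return numa_dst_node;
    }
    std::vector<int> CopyMethodPolicy(const int numa_dst_node, const int numa_src_node, const size_t data_size) {
        return { numa_src_node };
    }

    dsacache::Cache cache;

    void example(uint8_t* src, const size_t size) {
        // wait weakly on completion; FLAG_HANDLE_PF stays unset,
        // so the copy is submitted without block_on_fault()
        uint64_t flags = 0;
        flags = dsacache::SetFlag(flags, dsacache::FLAG_WAIT_WEAK);

        cache.SetFlags(flags);
        cache.Init(CachePlacementPolicy, CopyMethodPolicy);

        // the flags set on the cache are copied into the returned CacheData
        std::unique_ptr<dsacache::CacheData> data = cache.Access(src, size);

        // with FLAG_WAIT_WEAK this may return before the copy has finished
        data->WaitOnCompletion();

        uint8_t* location = data->GetDataLocation();
        if (location == nullptr) {
            // cache entry not (yet) usable, fall back to the source data
            location = src;
        }
    }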

offloading-cacher/cache.hpp

@@ -39,7 +39,18 @@ namespace dml {
 }
 namespace dsacache {
-constexpr bool WAIT_WEAK = true;
+inline bool CheckFlag(const uint64_t value, const uint64_t flag) {
+return (value & flag) != 0ULL;
+}
+inline uint64_t UnsetFlag(const uint64_t value, const uint64_t flag) {
+return value & ~flag;
+}
+inline uint64_t SetFlag(const uint64_t value, const uint64_t flag) {
+return value | flag;
+}
+constexpr uint64_t FLAG_WAIT_WEAK = 0b1ULL << 63;
+constexpr uint64_t FLAG_HANDLE_PF = 0b1ULL << 62;
 class Cache;
@@ -95,6 +106,9 @@ namespace dsacache {
 // contract: only access when being in sole posession of handlers
 uint8_t** incomplete_cache_;
+// flags inherited from parent cache
+uint64_t flags_ = 0;
 // dml handler vector pointer which is used
 // to wait on caching task completion
 std::atomic<dml_handler*>* handler_;
@@ -122,13 +136,16 @@
 // waits on completion of caching operations
 // for this task and is safe to be called in
 // any state of the object
-void WaitOnCompletion(const bool weak = false);
+void WaitOnCompletion();
 // returns the cache data location for this
 // instance which is valid as long as the
 // instance is alive - !!! this may also
 // yield a nullptr !!!
 uint8_t* GetDataLocation() const { return cache_->load(); }
+void SetFlags(const uint64_t flags) { flags_ = flags; }
+uint64_t GetFlags() const { return flags_; }
 };
 /*
@@ -204,8 +221,9 @@ namespace dsacache {
 typedef std::vector<int> (CopyPolicy)(const int numa_dst_node, const int numa_src_node, const size_t data_size);
 private:
-// mutex for accessing the cache state map
+// flags to store cache-wide options
+uint64_t flags_ = 0;
 // map from [dst-numa-node,map2]
 // map2 from [data-ptr,cache-structure]
@@ -276,6 +294,9 @@ namespace dsacache {
 void Clear();
 void Invalidate(uint8_t* data);
+void SetFlags(const uint64_t flags) { flags_ = flags; }
+uint64_t GetFlags() { return flags_; }
 };
 }
@@ -334,9 +355,10 @@ inline std::unique_ptr<dsacache::CacheData> dsacache::Cache::Access(uint8_t* dat
 }
 // at this point the requested data is not present in cache
-// and we create a caching task for it
+// and we create a caching task for it, copying our current flags
 task = std::make_unique<CacheData>(data, size);
+task->SetFlags(flags_);
 {
 LockedNodeCacheState* local_cache_state = cache_state_[dst_node];
@@ -439,10 +461,18 @@ inline dml::handler<dml::mem_copy_operation, std::allocator<uint8_t>> dsacache::
 dml::const_data_view srcv = dml::make_view(src, size);
 dml::data_view dstv = dml::make_view(dst, size);
-return dml::submit<dml::hardware>(
-dml::mem_copy.block_on_fault(), srcv, dstv,
-dml::execution_interface<dml::hardware,std::allocator<uint8_t>>(), node
-);
+if (CheckFlag(flags_, FLAG_HANDLE_PF)) {
+return dml::submit<dml::hardware>(
+dml::mem_copy.block_on_fault(), srcv, dstv,
+dml::execution_interface<dml::hardware,std::allocator<uint8_t>>(), node
+);
+}
+else {
+return dml::submit<dml::hardware>(
+dml::mem_copy, srcv, dstv,
+dml::execution_interface<dml::hardware,std::allocator<uint8_t>>(), node
+);
+}
 }
 inline void dsacache::Cache::GetCacheNode(uint8_t* src, const size_t size, int* OUT_DST_NODE, int* OUT_SRC_NODE) const {
@@ -598,6 +628,7 @@ inline dsacache::CacheData::CacheData(const dsacache::CacheData& other) {
 src_ = other.src_;
 size_ = other.size_;
 cache_ = other.cache_;
+flags_ = other.flags_;
 incomplete_cache_ = other.incomplete_cache_;
 handler_ = other.handler_;
@@ -643,7 +674,7 @@ inline void dsacache::CacheData::Deallocate() {
 else;
 }
-inline void dsacache::CacheData::WaitOnCompletion(const bool weak) {
+inline void dsacache::CacheData::WaitOnCompletion() {
 // first check if waiting is even neccessary as a valid
 // cache pointer signals that no waiting is to be performed
@@ -674,7 +705,7 @@ inline void dsacache::CacheData::WaitOnCompletion(const bool weak) {
 // at this point we are responsible for waiting for the handlers
 // and handling any error that comes through them gracefully
-if (weak && !local_handler->is_finished()) {
+if (CheckFlag(flags_, FLAG_WAIT_WEAK) && !local_handler->is_finished()) {
 handler_->store(local_handler);
 return;
 }
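
For reference, a standalone sketch (not part of the commit) of how the flag helpers above are meant to combine; the include path is assumed relative to the repository root:

    #include <cassert>
    #include <cstdint>
    #include "offloading-cacher/cache.hpp"

    int main() {
        uint64_t flags = 0;

        // set, test and clear individual option bits
        flags = dsacache::SetFlag(flags, dsacache::FLAG_WAIT_WEAK);
        assert(dsacache::CheckFlag(flags, dsacache::FLAG_WAIT_WEAK));
        assert(!dsacache::CheckFlag(flags, dsacache::FLAG_HANDLE_PF));

        flags = dsacache::UnsetFlag(flags, dsacache::FLAG_WAIT_WEAK);
        assert(!dsacache::CheckFlag(flags, dsacache::FLAG_WAIT_WEAK));

        return 0;
    }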

qdp_project/src/Benchmark.cpp

@@ -15,7 +15,7 @@
 #include "../../offloading-cacher/cache.hpp"
 #ifndef MODE_SET_BY_CMAKE
-#define MODE_COMPLEX_HBM
+#define MODE_SIMPLE_PREFETCH
 #endif
 #include "Configuration.hpp"
@@ -126,7 +126,7 @@ void aggr_j(size_t gid, size_t tid) {
 if constexpr (PERFORM_CACHING) {
 data = CACHE_.Access(reinterpret_cast<uint8_t *>(chunk_ptr), CHUNK_SIZE_B / TC_AGGRJ);
-data->WaitOnCompletion(dsacache::WAIT_WEAK);
+data->WaitOnCompletion();
 data_ptr = reinterpret_cast<uint64_t*>(data->GetDataLocation());
 if (data_ptr == nullptr) {
@@ -195,6 +195,13 @@ int main() {
 DATA_DST_ = (uint64_t*) numa_alloc_local(TC_AGGRJ * GROUP_COUNT * sizeof(uint64_t));
 if constexpr (PERFORM_CACHING) {
+// cache will be configured to wait weakly by default
+// it will also not handle page faults, as doing so causes delays
+// it will use the copy and caching policy from BenchmarkHelpers.cpp
+// which is configured for xeonmax with smart assignment
+uint64_t cache_flags = 0;
+cache_flags += dsacache::FLAG_WAIT_WEAK;
+CACHE_.SetFlags(cache_flags);
 CACHE_.Init(CachePlacementPolicy, CopyMethodPolicy);
 }

qdp_project/src/Configuration.hpp

@@ -5,11 +5,11 @@ constexpr uint32_t WARMUP_ITERATION_COUNT = 5;
 constexpr uint32_t ITERATION_COUNT = 5;
 #ifdef MODE_SIMPLE_PREFETCH
-constexpr uint32_t GROUP_COUNT = 16;
+constexpr uint32_t GROUP_COUNT = 8;
 constexpr size_t CHUNK_SIZE_B = 8_MiB;
-constexpr uint32_t TC_SCANA = 2;
+constexpr uint32_t TC_SCANA = 4;
 constexpr uint32_t TC_SCANB = 1;
-constexpr uint32_t TC_AGGRJ = 2;
+constexpr uint32_t TC_AGGRJ = 4;
 constexpr bool PERFORM_CACHING = true;
 constexpr bool STORE_B_IN_HBM = false;
 constexpr char MODE_STRING[] = "simple-prefetch";
@@ -38,11 +38,11 @@ constexpr char MODE_STRING[] = "simple-hbm";
 constexpr bool COMPLEX_QUERY = false;
 #endif
 #ifdef MODE_COMPLEX_PREFETCH
-constexpr uint32_t GROUP_COUNT = 16;
+constexpr uint32_t GROUP_COUNT = 8;
 constexpr size_t CHUNK_SIZE_B = 8_MiB;
-constexpr uint32_t TC_SCANA = 1;
+constexpr uint32_t TC_SCANA = 4;
 constexpr uint32_t TC_SCANB = 1;
-constexpr uint32_t TC_AGGRJ = 2;
+constexpr uint32_t TC_AGGRJ = 4;
 constexpr bool PERFORM_CACHING = true;
 constexpr bool STORE_B_IN_HBM = false;
 constexpr char MODE_STRING[] = "complex-prefetch";
