|
@ -59,13 +59,33 @@ namespace dml { |
|
|
namespace dsacache { |
|
|
namespace dsacache { |
|
|
class Cache; |
|
|
class Cache; |
|
|
|
|
|
|
|
|
// cache data holds all required information on
|
|
|
|
|
|
// one cache entry and will both be stored
|
|
|
|
|
|
// internally by the cache and handed out
|
|
|
|
|
|
// as copies to the user
|
|
|
|
|
|
// this class uses its object lifetime and
|
|
|
|
|
|
// a global reference counter to allow
|
|
|
|
|
|
// thread-safe copies and resource management
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
|
* Class Description: |
|
|
|
|
|
* Holds all required information on one cache entry and is used |
|
|
|
|
|
* both internally by the Cache and externally by the user. |
|
|
|
|
|
* |
|
|
|
|
|
* Important Usage Notes: |
|
|
|
|
|
* The pointer is only updated in WaitOnCompletion() which |
|
|
|
|
|
* therefore must be called by the user at some point in order |
|
|
|
|
|
* to use the cached data. Using this class as T for |
|
|
|
|
|
* std::shared_ptr<T> is not recommended as references are |
|
|
|
|
|
* already counted internally. |
|
|
|
|
|
* |
|
|
|
|
|
* Cache Lifetime: |
|
|
|
|
|
* As long as the instance is referenced, the pointer it stores |
|
|
|
|
|
* is guaranteed to be either nullptr or pointing to a valid copy. |
|
|
|
|
|
* |
|
|
|
|
|
* Implementation Detail: |
|
|
|
|
|
* Performs self-reference counting with a shared atomic integer. |
|
|
|
|
|
* Therefore on creating a copy the reference count is increased |
|
|
|
|
|
* and with the destructor it is decreased. If the last copy is |
|
|
|
|
|
* destroyed the actual underlying data is freed and all shared |
|
|
|
|
|
* variables deleted. |
|
|
|
|
|
* |
|
|
|
|
|
* Notes on Thread Safety: |
|
|
|
|
|
* Class is thread safe in any possible state and performs |
|
|
|
|
|
* reference counting and deallocation itself entirely atomically. |
|
|
|
|
|
*/ |
|
|
|
|
|
|
|
|
class CacheData { |
|
|
class CacheData { |
|
|
public: |
|
|
public: |
|
@ -101,6 +121,7 @@ namespace dsacache { |
|
|
bool Active() const; |
|
|
bool Active() const; |
|
|
|
|
|
|
|
|
friend Cache; |
|
|
friend Cache; |
|
|
|
|
|
|
|
|
public: |
|
|
public: |
|
|
CacheData(uint8_t* data, const size_t size); |
|
|
CacheData(uint8_t* data, const size_t size); |
|
|
CacheData(const CacheData& other); |
|
|
CacheData(const CacheData& other); |
|
@ -118,11 +139,62 @@ namespace dsacache { |
|
|
uint8_t* GetDataLocation() const; |
|
|
uint8_t* GetDataLocation() const; |
|
|
}; |
|
|
}; |
|
|
|
|
|
|
|
|
// cache class will handle access to data through the cache
|
|
|
|
|
|
// by managing the cache through work submission, it sticks
|
|
|
|
|
|
// to user-defined caching and copy policies, is thread
|
|
|
|
|
|
// safe after initialization and returns copies of
|
|
|
|
|
|
// cache data class to the user
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
|
* Class Description: |
|
|
|
|
|
* Class will handle access to data through internal copies. |
|
|
|
|
|
* These are obtained via work submission to the Intel DSA which takes |
|
|
|
|
|
* care of asynchronously duplicating the data. The user will define |
|
|
|
|
|
* where these copies lie and which system nodes will perform the copy. |
|
|
|
|
|
* This is done through policy functions set during initialization. |
|
|
|
|
|
* |
|
|
|
|
|
* Placement Policy: |
|
|
|
|
|
* The Placement Policy Function decides on which node a particular |
|
|
|
|
|
* entry is to be placed, given the current executing node and the |
|
|
|
|
|
* data source node and data size. This in turn means that for one |
|
|
|
|
|
* datum, multiple cached copies may exist at one time. |
|
|
|
|
|
* |
|
|
|
|
|
* Cache Lifetime: |
|
|
|
|
|
* When accessing the cache, a CacheData-object will be returned. |
|
|
|
|
|
* As long as this object lives, the pointer which it holds is |
|
|
|
|
|
* guaranteed to be either nullptr or a valid copy. When destroyed |
|
|
|
|
|
* the entry is marked for deletion which is only carried out |
|
|
|
|
|
* when system memory pressure drives an automated cache flush. |
|
|
|
|
|
* |
|
|
|
|
|
* Restrictions: |
|
|
|
|
|
* - Overlapping Pointers may lead to undefined behaviour during |
|
|
|
|
|
* manual cache invalidation which should not be used if you |
|
|
|
|
|
* intend to have these types of pointers |
|
|
|
|
|
* - Cache Invalidation may only be performed manually and gives |
|
|
|
|
|
* no ordering guarantees. Therefore, it is the user's responsibility |
|
|
|
|
|
* to ensure that results after invalidation have been generated |
|
|
|
|
|
* using the latest state of data. The cache is best suited |
|
|
|
|
|
* to static data. |
|
|
|
|
|
* |
|
|
|
|
|
* Notes on Thread Safety: |
|
|
|
|
|
* - Cache is completely thread-safe after initialization |
|
|
|
|
|
* - CacheData-class will handle deallocation of data itself by |
|
|
|
|
|
* performing self-reference-counting atomically and only |
|
|
|
|
|
* deallocating if the last reference is destroyed |
|
|
|
|
|
* - The internal cache state has one lock which is either |
|
|
|
|
|
* acquired shared for reading the state (upon accessing an already |
|
|
|
|
|
* cached element) or unique (accessing a new element, flushing, invalidating) |
|
|
|
|
|
* - Waiting on copy completion is done over an atomic-wait in copies |
|
|
|
|
|
* of the original CacheData-instance |
|
|
|
|
|
* - Overall this class may experience performance issues due to the use |
|
|
|
|
|
* of locking (in any configuration), lock contention (worsens with higher |
|
|
|
|
|
* core count, node count and utilization) and atomics (worse in the same |
|
|
|
|
|
* situations as lock contention) |
|
|
|
|
|
* |
|
|
|
|
|
* Improving Performance: |
|
|
|
|
|
* When data is never shared between threads or memory size for the cache is |
|
|
|
|
|
* not an issue you may consider having one Cache-instance per thread and removing |
|
|
|
|
|
* the lock in Cache and modifying the reference counting and waiting mechanisms |
|
|
|
|
|
* of CacheData accordingly (although this is high effort and will yield little due |
|
|
|
|
|
* to the atomics not being shared among cores/nodes). |
|
|
|
|
|
* Otherwise, one Cache-instance per node could also be considered. This will allow |
|
|
|
|
|
* the placement policy function to be barebones and reduces the lock contention and |
|
|
|
|
|
* synchronization impact of the atomic variables. |
|
|
|
|
|
*/ |
|
|
|
|
|
|
|
|
class Cache { |
|
|
class Cache { |
|
|
public: |
|
|
public: |
|
@ -202,6 +274,8 @@ namespace dsacache { |
|
|
// be properly deleted, but the cache
|
|
|
// be properly deleted, but the cache
|
|
|
// will be fresh - use for testing
|
|
|
// will be fresh - use for testing
|
|
|
void Clear(); |
|
|
void Clear(); |
|
|
|
|
|
|
|
|
|
|
|
void Invalidate(uint8_t* data); |
|
|
}; |
|
|
}; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
@ -486,6 +560,28 @@ inline std::unique_ptr<dsacache::CacheData> dsacache::Cache::GetFromCache(uint8_ |
|
|
return nullptr; |
|
|
return nullptr; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
void dsacache::Cache::Invalidate(uint8_t* data) { |
|
|
|
|
|
// as the cache is modified we must obtain a unique writers lock
|
|
|
|
|
|
|
|
|
|
|
|
std::unique_lock<std::shared_mutex> lock(cache_mutex_); |
|
|
|
|
|
|
|
|
|
|
|
// loop through all per-node-caches available
|
|
|
|
|
|
|
|
|
|
|
|
for (auto node : cache_state_) { |
|
|
|
|
|
// search for an entry for the given data pointer
|
|
|
|
|
|
|
|
|
|
|
|
auto search = node.second.find(data); |
|
|
|
|
|
|
|
|
|
|
|
if (search != node.second.end()) { |
|
|
|
|
|
// if the data is represented in-cache
|
|
|
|
|
|
// then it will be erased to re-trigger
|
|
|
|
|
|
// caching on next access
|
|
|
|
|
|
|
|
|
|
|
|
node.second.erase(search); |
|
|
|
|
|
} |
|
|
|
|
|
} |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
inline dsacache::CacheData::CacheData(uint8_t* data, const size_t size) { |
|
|
inline dsacache::CacheData::CacheData(uint8_t* data, const size_t size) { |
|
|
src_ = data; |
|
|
src_ = data; |
|
|
size_ = size; |
|
|
size_ = size; |
|
@ -552,6 +648,11 @@ inline void dsacache::CacheData::Deallocate() { |
|
|
|
|
|
|
|
|
uint8_t* cache_local = cache_->exchange(nullptr); |
|
|
uint8_t* cache_local = cache_->exchange(nullptr); |
|
|
if (cache_local != nullptr) numa_free(cache_local, size_); |
|
|
if (cache_local != nullptr) numa_free(cache_local, size_); |
|
|
|
|
|
|
|
|
|
|
|
// if the cache was never waited for then incomplete_cache_
|
|
|
|
|
|
// may still contain a valid pointer which has to be freed
|
|
|
|
|
|
|
|
|
|
|
|
if (incomplete_cache_ != nullptr) numa_free(incomplete_cache_, size_); |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
inline uint8_t* dsacache::CacheData::GetDataLocation() const { |
|
|
inline uint8_t* dsacache::CacheData::GetDataLocation() const { |
|
|