Browse Source
finish first stage of caching implementation and provide a rudimentary test function in the main
master
finish first stage of caching implementation and provide a rudimentary test function in the main
master
Constantin Fürst
12 months ago
3 changed files with 236 additions and 308 deletions
-
174offloading-cacher/benchmark.hpp
-
80offloading-cacher/main.cpp
-
290offloading-cacher/offloading-cache.hpp
@ -1,174 +0,0 @@ |
|||||
#pragma once

#include <iostream>
#include <vector>
#include <chrono>
#include <numeric>
#include <cstring>

#include <pthread.h>
#include <semaphore.h>
#include <numa.h>

#include <dml/dml.hpp>

#include "util/barrier.hpp"
#include "util/dml-helper.hpp"
#include "util/task-data.hpp"

// identifies the emitting code location for error logging
#define LOG_CODE_INFO "Location: " << __FILE__ << "@" << __LINE__ << "::" << __FUNCTION__ << std::endl

// prints an error banner identifying the physical placement (node/thread) of the caller;
// expects a TaskData* named `args` to be in scope
#define LOG_ERR { pthread_t t = pthread_self(); std::cerr << "--- BEGIN ERROR MSG ---" << std::endl << "Physical: [Node " << args->numa_node << " | Thread " << t << "]" << std::endl; } std::cerr << LOG_CODE_INFO

// on a non-ok status: log, record the status in the task and abort the thread function
#define CHECK_STATUS(status,msg) { if (status != dml::status_code::ok) { LOG_ERR << "Status Code: " << StatusCodeToString(status) << std::endl << msg << std::endl; args->status = status; return nullptr; }}

// records submit/complete/combined durations; the first 5 (warmup) iterations are skipped
#define ADD_TIMING_MESSUREMENT { if (i >= 5) { args->submit_duration.emplace_back(std::chrono::duration_cast<std::chrono::nanoseconds>(se - st).count()); args->complete_duration.emplace_back(std::chrono::duration_cast<std::chrono::nanoseconds>(et - se).count()); args->combined_duration.emplace_back(std::chrono::duration_cast<std::chrono::nanoseconds>(et - st).count());}}

// Worker thread entry point: runs the configured memcpy benchmark for one task.
// `argp` must point to a TaskData describing placement, sizes and repetition count;
// results (timings, status) are written back into that TaskData.
template <typename path>
void* thread_function(void* argp) {
    TaskData* args = reinterpret_cast<TaskData*>(argp);

    // set numa node affinity of the current thread
    numa_run_on_node(args->numa_node);

    // RAII guard for numa allocations so memory is released on every exit
    // path — including the early returns hidden inside CHECK_STATUS, which
    // previously leaked both buffers on error
    struct NumaBuffer {
        void* ptr;
        size_t size;
        NumaBuffer(size_t s, int node) : ptr(numa_alloc_onnode(s, node)), size(s) {}
        ~NumaBuffer() { if (ptr != nullptr) numa_free(ptr, size); }
        NumaBuffer(const NumaBuffer&) = delete;
        NumaBuffer& operator=(const NumaBuffer&) = delete;
    };

    // allocate memory for the move operation on the requested numa nodes
    NumaBuffer src(args->size, args->nnode_src);
    NumaBuffer dst(args->size, args->nnode_dst);

    dml::data_view srcv = dml::make_view(reinterpret_cast<uint8_t*>(src.ptr), args->size);
    dml::data_view dstv = dml::make_view(reinterpret_cast<uint8_t*>(dst.ptr), args->size);

    // touch the pages up front so they are mapped before timing starts
    std::memset(src.ptr, 0, args->size);
    std::memset(dst.ptr, 0, args->size);

    args->status = dml::status_code::ok;
    args->rep_completed = 0;

    std::chrono::time_point<std::chrono::steady_clock> tps;

    // we add 5 as the first 5 iterations will not be measured
    // to remove exceptional values encountered during warmup
    for (uint32_t i = 0; i < args->rep_count + 5; i++) {
        // synchronize the start of each iteration
        // using the barrier structure
        args->barrier_->wait();

        if (args->batch_submit) {
            const auto st = std::chrono::steady_clock::now();

            auto sequence = dml::sequence(args->batch_size, std::allocator<dml::byte_t>());

            for (uint32_t j = 0; j < args->batch_size; j++) {
                // block_on_fault() is required to submit the task in a way so that the
                // DSA engine can handle page faults itself together with the IOMMU which
                // requires the WQ to be configured to allow this too
                const auto status = sequence.add(dml::mem_copy.block_on_fault(), srcv, dstv);
                CHECK_STATUS(status, "Adding operation to batch failed!");
            }

            // we use the asynchronous submit-routine even though this is not required
            // here, however the project later on will only use async operation and
            // therefore this behaviour should be benchmarked
            auto handler = dml::submit<path>(dml::batch, sequence);

            const auto se = std::chrono::steady_clock::now();

            auto result = handler.get();

            const auto et = std::chrono::steady_clock::now();

            const dml::status_code status = result.status;
            CHECK_STATUS(status, "Batch completed with an Error!");

            ADD_TIMING_MESSUREMENT;
        }
        else if (args->batch_size > 1) {
            // implementation for non-batched batch submit follows here
            // this means we submit a bunch of work as single descriptors
            // but then dont wait for the completion immediately
            std::vector<dml::handler<dml::mem_copy_operation, std::allocator<uint8_t>>> handlers;

            const auto st = std::chrono::steady_clock::now();

            for (uint32_t j = 0; j < args->batch_size; j++) {
                // block_on_fault() is required to submit the task in a way so that the
                // DSA engine can handle page faults itself together with the IOMMU which
                // requires the WQ to be configured to allow this too
                handlers.emplace_back(dml::submit<path>(dml::mem_copy.block_on_fault(), srcv, dstv));
            }

            const auto se = std::chrono::steady_clock::now();

            for (auto& handler : handlers) {
                auto result = handler.get();
                const dml::status_code status = result.status;
                CHECK_STATUS(status, "Operation completed with an Error!");
            }

            const auto et = std::chrono::steady_clock::now();

            ADD_TIMING_MESSUREMENT;
        }
        else {
            const auto st = std::chrono::steady_clock::now();

            // we use the asynchronous submit-routine even though this is not required
            // here, however the project later on will only use async operation and
            // therefore this behaviour should be benchmarked
            // block_on_fault() is required to submit the task in a way so that the
            // DSA engine can handle page faults itself together with the IOMMU which
            // requires the WQ to be configured to allow this too
            auto handler = dml::submit<path>(dml::mem_copy.block_on_fault(), srcv, dstv);

            const auto se = std::chrono::steady_clock::now();

            auto result = handler.get();

            const auto et = std::chrono::steady_clock::now();

            const dml::status_code status = result.status;
            CHECK_STATUS(status, "Operation completed with an Error!");

            ADD_TIMING_MESSUREMENT;
        }

        // again: we do not count the first 5 repetitions
        if (i == 5) tps = std::chrono::steady_clock::now();
        if (i >= 5) args->rep_completed++;
    }

    const auto tpe = std::chrono::steady_clock::now();

    args->total_time = std::chrono::duration_cast<std::chrono::nanoseconds>(tpe - tps).count();

    // buffers are released by the NumaBuffer destructors
    return nullptr;
}
|
||||
|
|
||||
template <typename path> |
|
||||
void execute_dml_memcpy(std::vector<TaskData>& args) { |
|
||||
barrier task_barrier(args.size()); |
|
||||
std::vector<pthread_t> threads; |
|
||||
|
|
||||
// initialize numa library
|
|
||||
numa_available(); |
|
||||
|
|
||||
// for each submitted task we link the semaphore
|
|
||||
// and create the thread, passing the argument
|
|
||||
for (auto& arg : args) { |
|
||||
arg.barrier_ = &task_barrier; |
|
||||
threads.emplace_back(); |
|
||||
|
|
||||
if (pthread_create(&threads.back(), nullptr, thread_function<path>, &arg) != 0) { |
|
||||
std::cerr << "Error creating thread" << std::endl; |
|
||||
exit(1); |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
for (pthread_t& t : threads) { |
|
||||
pthread_join(t, nullptr); |
|
||||
} |
|
||||
} |
|
@ -1,42 +1,64 @@ |
|||||
#include <dml/dml.hpp>

#include <cstddef>
#include <cstdint>
#include <fstream>
#include <iostream>
#include <limits>
#include <memory>
#include <random>
#include <vector>

#include "benchmark.hpp"
#include "offloading-cache.hpp"
|
||||
|
|
||||
int main(int argc, char **argv) { |
|
||||
if (argc < 3) { |
|
||||
std::cout << "Missing input and output file names." << std::endl; |
|
||||
std::cout << "Usage: ./benchmarks [input.json] [output.json]" << std::endl; |
|
||||
return 1; |
|
||||
} |
|
||||
|
// Allocates a buffer of `size` doubles filled with pseudo-random values and
// transfers ownership to the caller (caller must delete[] it).
// The engine is default-seeded, so every call yields the identical sequence.
// NOTE(review): numeric_limits<double>::min() is the smallest *positive*
// normal value, so only positive doubles are produced — presumably chosen
// because lowest()..max() would overflow the distribution's range; confirm.
double* GetRandomArray(const size_t size) {
    double* values = new double[size];

    std::default_random_engine engine;
    std::uniform_real_distribution<double> distribution(
        std::numeric_limits<double>::min(), std::numeric_limits<double>::max());

    size_t index = 0;
    while (index < size) {
        values[index] = distribution(engine);
        ++index;
    }

    return values;
}
||||
|
|
||||
if (path == "hw") { |
|
||||
execute_dml_memcpy<dml::hardware>(args); |
|
||||
|
// Element-wise equality check of two double arrays of length `size`.
// Exact (bitwise-value) comparison is intentional: the cache is expected to
// produce an identical copy, so no epsilon tolerance is wanted.
// The previous version wrapped the comparison in try/catch(...), but
// comparing doubles cannot throw — the handler was dead code that would only
// have masked real problems, so it has been removed.
bool IsEqual(const double* a, const double* b, const size_t size) {
    for (size_t i = 0; i < size; i++) {
        if (a[i] != b[i]) return false;
    }

    return true;
}
||||
|
|
||||
|
int main(int argc, char **argv) { |
||||
|
offcache::Cache cache; |
||||
|
|
||||
|
auto cache_policy = [](const int numa_dst_node, const int numa_src_node, const size_t data_size) { |
||||
|
return numa_dst_node; |
||||
|
}; |
||||
|
|
||||
|
auto copy_policy = [](const int numa_dst_node, const int numa_src_node) { |
||||
|
return std::vector{ numa_src_node }; |
||||
|
}; |
||||
|
|
||||
|
cache.Init(cache_policy,copy_policy); |
||||
|
|
||||
|
static constexpr size_t data_size = 8192; |
||||
|
double* data = GetRandomArray(data_size); |
||||
|
|
||||
|
std::unique_ptr<offcache::CacheData> data_cache = cache.Access(reinterpret_cast<uint8_t *>(data), data_size * sizeof(double), offcache::ExecutionPolicy::Relaxed); |
||||
|
|
||||
|
data_cache->WaitOnCompletion(); |
||||
|
|
||||
|
double* cached = reinterpret_cast<double *>(data_cache->GetDataLocation()); |
||||
|
|
||||
|
if (data == cached) { |
||||
|
std::cout << "Caching did not affect data location." << std::endl; |
||||
} |
} |
||||
|
|
||||
std::ofstream os(output); |
|
||||
WriteResultLog(args, path, os); |
|
||||
os.close(); |
|
||||
|
if (IsEqual(data,cached,data_size)) { |
||||
|
std::cout << "Cached data is correct." << std::endl; |
||||
|
} |
||||
|
else { |
||||
|
std::cout << "Cached data is wrong." << std::endl; |
||||
|
} |
||||
} |
} |
Write
Preview
Loading…
Cancel
Save
Reference in new issue