Browse Source
Finish the first stage of the caching implementation and provide a rudimentary test function in main()
master
finish first stage of caching implementation and provide a rudimentary test function in the main
master
Constantin Fürst
12 months ago
3 changed files with 236 additions and 308 deletions
-
174offloading-cacher/benchmark.hpp
-
80offloading-cacher/main.cpp
-
290offloading-cacher/offloading-cache.hpp
@ -1,174 +0,0 @@ |
|||
#pragma once
|
|||
|
|||
#include <iostream>
|
|||
#include <vector>
|
|||
#include <chrono>
|
|||
#include <numeric>
|
|||
|
|||
#include <pthread.h>
|
|||
#include <semaphore.h>
|
|||
#include <numa.h>
|
|||
|
|||
#include <dml/dml.hpp>
|
|||
|
|||
#include "util/barrier.hpp"
|
|||
#include "util/dml-helper.hpp"
|
|||
#include "util/task-data.hpp"
|
|||
|
|||
#define LOG_CODE_INFO "Location: " << __FILE__ << "@" << __LINE__ << "::" << __FUNCTION__ << std::endl
|
|||
#define LOG_ERR { pthread_t t = pthread_self(); std::cerr << "--- BEGIN ERROR MSG ---" << std::endl << "Physical: [Node " << args->numa_node << " | Thread " << t << "]" << std::endl; } std::cerr << LOG_CODE_INFO
|
|||
#define CHECK_STATUS(status,msg) { if (status != dml::status_code::ok) { LOG_ERR << "Status Code: " << StatusCodeToString(status) << std::endl << msg << std::endl; args->status = status; return nullptr; }}
|
|||
|
|||
#define ADD_TIMING_MESSUREMENT { if (i >= 5) { args->submit_duration.emplace_back(std::chrono::duration_cast<std::chrono::nanoseconds>(se - st).count()); args->complete_duration.emplace_back(std::chrono::duration_cast<std::chrono::nanoseconds>(et - se).count()); args->combined_duration.emplace_back(std::chrono::duration_cast<std::chrono::nanoseconds>(et - st).count());}}
|
|||
|
|||
// Per-thread benchmark worker: copies a buffer between two NUMA nodes via the
// Intel DML library, once per repetition, and records submit/complete timings.
// `path` selects the DML execution path (dml::hardware / dml::software /
// dml::automatic). `argp` must point to a TaskData describing the task; results
// (status, durations, total_time) are written back into that struct.
// Returns nullptr; on DML error, CHECK_STATUS stores the status and bails out.
template <typename path>
void* thread_function(void* argp) {
  TaskData* args = reinterpret_cast<TaskData*>(argp);

  // set numa node and core affinity of the current thread
  numa_run_on_node(args->numa_node);

  // allocate memory for the move operation on the requested numa nodes
  void* src = numa_alloc_onnode(args->size, args->nnode_src);
  void* dst = numa_alloc_onnode(args->size, args->nnode_dst);
  dml::data_view srcv = dml::make_view(reinterpret_cast<uint8_t*>(src), args->size);
  dml::data_view dstv = dml::make_view(reinterpret_cast<uint8_t*>(dst), args->size);

  // touch both buffers so the pages are actually faulted in and zeroed
  // before the first timed iteration
  std::memset(src, 0, args->size);
  std::memset(dst, 0, args->size);

  args->status = dml::status_code::ok;
  args->rep_completed = 0;

  // start-of-measurement timestamp; set once warmup (first 5 reps) is done
  std::chrono::time_point<std::chrono::steady_clock> tps;

  // we add 5 as the first 5 iterations will not be meassured
  // to remove exceptional values encountered during warmup
  for (uint32_t i = 0; i < args->rep_count + 5; i++) {
    // synchronize the start of each iteration
    // using the barrier structure
    args->barrier_->wait();

    if (args->batch_submit) {
      // --- mode 1: true DML batch: all copies in one sequence, one submit ---
      const auto st = std::chrono::steady_clock::now();

      auto sequence = dml::sequence(args->batch_size, std::allocator<dml::byte_t>());

      for (uint32_t j = 0; j < args->batch_size; j++) {
        // block_on_fault() is required to submit the task in a way so that the
        // DSA engine can handle page faults itself together with the IOMMU which
        // requires the WQ to be configured to allow this too

        const auto status = sequence.add(dml::mem_copy.block_on_fault(), srcv, dstv);
        CHECK_STATUS(status, "Adding operation to batch failed!");
      }

      // we use the asynchronous submit-routine even though this is not required
      // here, however the project later on will only use async operation and
      // therefore this behaviour should be benchmarked

      auto handler = dml::submit<path>(dml::batch, sequence);

      const auto se = std::chrono::steady_clock::now();

      auto result = handler.get();

      const auto et = std::chrono::steady_clock::now();

      const dml::status_code status = result.status;
      CHECK_STATUS(status, "Batch completed with an Error!");

      // st/se/et are picked up by the macro via their names
      ADD_TIMING_MESSUREMENT;
    }
    else if (args->batch_size > 1) {
      // implementation for non-batched batch submit follows here
      // this means we submit a bunch of work as single descriptors
      // but then dont wait for the completion immediately

      std::vector<dml::handler<dml::mem_copy_operation, std::allocator<uint8_t>>> handlers;

      const auto st = std::chrono::steady_clock::now();

      for (uint32_t j = 0; j < args->batch_size; j++) {
        // block_on_fault() is required to submit the task in a way so that the
        // DSA engine can handle page faults itself together with the IOMMU which
        // requires the WQ to be configured to allow this too

        handlers.emplace_back(dml::submit<path>(dml::mem_copy.block_on_fault(), srcv, dstv));
      }

      const auto se = std::chrono::steady_clock::now();

      // only now collect all completions; any failure aborts the thread
      for (auto& handler : handlers) {
        auto result = handler.get();
        const dml::status_code status = result.status;
        CHECK_STATUS(status, "Operation completed with an Error!");
      }

      const auto et = std::chrono::steady_clock::now();

      ADD_TIMING_MESSUREMENT;
    }
    else {
      // --- mode 3: single descriptor, submit then immediately wait ---
      const auto st = std::chrono::steady_clock::now();

      // we use the asynchronous submit-routine even though this is not required
      // here, however the project later on will only use async operation and
      // therefore this behaviour should be benchmarked
      // block_on_fault() is required to submit the task in a way so that the
      // DSA engine can handle page faults itself together with the IOMMU which
      // requires the WQ to be configured to allow this too
      auto handler = dml::submit<path>(dml::mem_copy.block_on_fault(), srcv, dstv);

      const auto se = std::chrono::steady_clock::now();

      auto result = handler.get();

      const auto et = std::chrono::steady_clock::now();

      const dml::status_code status = result.status;
      CHECK_STATUS(status, "Operation completed with an Error!");

      ADD_TIMING_MESSUREMENT;
    }

    // again: we do not count the first 5 repetitions
    if (i == 5) tps = std::chrono::steady_clock::now();
    if (i >= 5) args->rep_completed++;
  }

  const auto tpe = std::chrono::steady_clock::now();

  args->total_time = std::chrono::duration_cast<std::chrono::nanoseconds>(tpe - tps).count();

  // free the allocated memory regions on the selected nodes
  numa_free(src, args->size);
  numa_free(dst, args->size);

  return nullptr;
}
|||
|
|||
template <typename path> |
|||
void execute_dml_memcpy(std::vector<TaskData>& args) { |
|||
barrier task_barrier(args.size()); |
|||
std::vector<pthread_t> threads; |
|||
|
|||
// initialize numa library
|
|||
numa_available(); |
|||
|
|||
// for each submitted task we link the semaphore
|
|||
// and create the thread, passing the argument
|
|||
for (auto& arg : args) { |
|||
arg.barrier_ = &task_barrier; |
|||
threads.emplace_back(); |
|||
|
|||
if (pthread_create(&threads.back(), nullptr, thread_function<path>, &arg) != 0) { |
|||
std::cerr << "Error creating thread" << std::endl; |
|||
exit(1); |
|||
} |
|||
} |
|||
|
|||
for (pthread_t& t : threads) { |
|||
pthread_join(t, nullptr); |
|||
} |
|||
} |
@ -1,42 +1,64 @@ |
|||
#include <dml/dml.hpp>
|
|||
|
|||
#include <vector>
|
|||
#include <iostream>
|
|||
#include <fstream>
|
|||
#include <random>
|
|||
|
|||
#include "benchmark.hpp"
|
|||
#include "offloading-cache.hpp"
|
|||
|
|||
int main(int argc, char **argv) { |
|||
if (argc < 3) { |
|||
std::cout << "Missing input and output file names." << std::endl; |
|||
std::cout << "Usage: ./benchmarks [input.json] [output.json]" << std::endl; |
|||
return 1; |
|||
} |
|||
// Allocates a heap buffer of `size` doubles and fills it with pseudo-random
// values drawn uniformly from [DBL_MIN, DBL_MAX).
// The engine is default-seeded, so the sequence is deterministic across runs.
// NOTE: the caller owns the returned buffer and must release it with delete[].
double* GetRandomArray(const size_t size) {
    double* const buffer = new double[size];

    std::uniform_real_distribution<double> dist(std::numeric_limits<double>::min(), std::numeric_limits<double>::max());
    std::default_random_engine engine;

    size_t idx = 0;
    while (idx < size) {
        buffer[idx] = dist(engine);
        ++idx;
    }

    return buffer;
}
|||
|
|||
if (path == "hw") { |
|||
execute_dml_memcpy<dml::hardware>(args); |
|||
// Element-wise comparison of two double arrays of length `size`.
// Returns true iff every element compares equal (and trivially for size == 0).
// Exact (==) comparison is intended here: the cached copy must be bit-identical
// to the source buffer, so no epsilon tolerance is applied.
// Fix: removed the try/catch(...) wrapper — indexing a raw pointer never throws
// a C++ exception (an invalid access is UB/a fault, not catchable), so the
// handler was dead code that only obscured the comparison.
bool IsEqual(const double* a, const double* b, const size_t size) {
    for (size_t i = 0; i < size; i++) {
        if (a[i] != b[i]) return false;
    }

    return true;
}
|||
|
|||
int main(int argc, char **argv) { |
|||
offcache::Cache cache; |
|||
|
|||
auto cache_policy = [](const int numa_dst_node, const int numa_src_node, const size_t data_size) { |
|||
return numa_dst_node; |
|||
}; |
|||
|
|||
auto copy_policy = [](const int numa_dst_node, const int numa_src_node) { |
|||
return std::vector{ numa_src_node }; |
|||
}; |
|||
|
|||
cache.Init(cache_policy,copy_policy); |
|||
|
|||
static constexpr size_t data_size = 8192; |
|||
double* data = GetRandomArray(data_size); |
|||
|
|||
std::unique_ptr<offcache::CacheData> data_cache = cache.Access(reinterpret_cast<uint8_t *>(data), data_size * sizeof(double), offcache::ExecutionPolicy::Relaxed); |
|||
|
|||
data_cache->WaitOnCompletion(); |
|||
|
|||
double* cached = reinterpret_cast<double *>(data_cache->GetDataLocation()); |
|||
|
|||
if (data == cached) { |
|||
std::cout << "Caching did not affect data location." << std::endl; |
|||
} |
|||
else if (path == "auto") { |
|||
execute_dml_memcpy<dml::automatic>(args); |
|||
|
|||
if (IsEqual(data,cached,data_size)) { |
|||
std::cout << "Cached data is correct." << std::endl; |
|||
} |
|||
else { |
|||
std::cerr << "Path is neither hw/sw/auto." << std::endl; |
|||
std::cout << "Cached data is wrong." << std::endl; |
|||
} |
|||
|
|||
std::ofstream os(output); |
|||
WriteResultLog(args, path, os); |
|||
os.close(); |
|||
} |
Write
Preview
Loading…
Cancel
Save
Reference in new issue