Browse Source

ignore first five runs to reduce influence of warmup, add non-batch-descriptor batch loop for testing, calculate standard deviation for all three measurements

master
Constantin Fürst 1 year ago
parent
commit
1bfb1f316c
  1. 78
      benchmarks/benchmark.hpp
  2. 58
      benchmarks/task-data.hpp
  3. 12
      benchmarks/task-description.json

78
benchmarks/benchmark.hpp

@ -19,10 +19,20 @@ double avg(const std::vector<uint64_t>& v) {
return static_cast<long double>(std::accumulate(v.begin(), v.end(), 0)) / static_cast<long double>(v.size()); return static_cast<long double>(std::accumulate(v.begin(), v.end(), 0)) / static_cast<long double>(v.size());
} }
// Computes the population standard deviation of the samples in v around mean.
//
// v:    the samples
// mean: the value the deviation is measured from; callers in this file pass
//       the result of avg() for the same vector
//
// Returns sqrt(sum((x - mean)^2) / v.size()), or 0.0 when v is empty.
double stdev(const std::vector<uint64_t>& v, const double mean) {
    // without this guard an empty sample set evaluates 0.0 / 0.0 and the
    // resulting NaN would end up in the report
    if (v.empty()) {
        return 0.0;
    }

    // sum the squared deviations in a single pass, no temporary vector needed
    double sq_sum = 0.0;
    for (const uint64_t sample : v) {
        const double diff = static_cast<double>(sample) - mean;
        sq_sum += diff * diff;
    }

    return std::sqrt(sq_sum / static_cast<double>(v.size()));
}
#define LOG_CODE_INFO "Location: " << __FILE__ << "@" << __LINE__ << "::" << __FUNCTION__ << std::endl #define LOG_CODE_INFO "Location: " << __FILE__ << "@" << __LINE__ << "::" << __FUNCTION__ << std::endl
#define LOG_ERR { pthread_t t = pthread_self(); std::cerr << "--- BEGIN ERROR MSG ---" << std::endl << "Physical: [Node " << args->numa_node << " | Thread " << t << "]" << std::endl; } std::cerr << LOG_CODE_INFO #define LOG_ERR { pthread_t t = pthread_self(); std::cerr << "--- BEGIN ERROR MSG ---" << std::endl << "Physical: [Node " << args->numa_node << " | Thread " << t << "]" << std::endl; } std::cerr << LOG_CODE_INFO
#define CHECK_STATUS(status,msg) { if (status != dml::status_code::ok) { LOG_ERR << "Status Code: " << StatusCodeToString(status) << std::endl << msg << std::endl; args->status = status; return nullptr; }} #define CHECK_STATUS(status,msg) { if (status != dml::status_code::ok) { LOG_ERR << "Status Code: " << StatusCodeToString(status) << std::endl << msg << std::endl; args->status = status; return nullptr; }}
// Appends the submission (se - st), completion (et - se) and combined (et - st)
// durations, in microseconds, to submission_durations, completion_durations and
// combined_durations, but only when the loop index i is at least 5, so the
// first five iterations are left out of the measurement.
// Expands in place: i, st, se, et and the three duration vectors must all be
// in scope at the point of use.
#define ADD_TIMING_MESSUREMENT { if (i >= 5) { submission_durations.emplace_back(std::chrono::duration_cast<std::chrono::microseconds>(se - st).count()); completion_durations.emplace_back(std::chrono::duration_cast<std::chrono::microseconds>(et - se).count()); combined_durations.emplace_back(std::chrono::duration_cast<std::chrono::microseconds>(et - st).count());}}
template <typename path> template <typename path>
void* thread_function(void* argp) { void* thread_function(void* argp) {
TaskData* args = reinterpret_cast<TaskData*>(argp); TaskData* args = reinterpret_cast<TaskData*>(argp);
@ -43,21 +53,17 @@ void* thread_function(void* argp) {
args->status = dml::status_code::ok; args->status = dml::status_code::ok;
args->rep_completed = 0; args->rep_completed = 0;
for (uint32_t i = 0; i < args->rep_count; i++) {
// we add 5 because the first 5 iterations are not measured,
// which removes exceptional values encountered during warmup
for (uint32_t i = 0; i < args->rep_count + 5; i++) {
// synchronize the start of each iteration // synchronize the start of each iteration
// using the barrier structure // using the barrier structure
args->barrier_->wait(); args->barrier_->wait();
if (args->batch_submit) { if (args->batch_submit) {
uint32_t opcount = args->batch_size;
const auto st = std::chrono::steady_clock::now();
if (args->barrier_after_n_operations > 0) {
opcount += opcount / args->barrier_after_n_operations;
}
const auto st = std::chrono::high_resolution_clock::now();
auto sequence = dml::sequence(opcount, std::allocator<dml::byte_t>());
auto sequence = dml::sequence(args->batch_size, std::allocator<dml::byte_t>());
for (uint32_t j = 0; j < args->batch_size; j++) { for (uint32_t j = 0; j < args->batch_size; j++) {
// block_on_fault() is required to submit the task in a way so that the // block_on_fault() is required to submit the task in a way so that the
@ -66,10 +72,6 @@ void* thread_function(void* argp) {
const auto status = sequence.add(dml::mem_copy.block_on_fault(), srcv, dstv); const auto status = sequence.add(dml::mem_copy.block_on_fault(), srcv, dstv);
CHECK_STATUS(status, "Adding operation to batch failed!"); CHECK_STATUS(status, "Adding operation to batch failed!");
if (j % args->barrier_after_n_operations == 0) {
sequence.add(dml::nop);
}
} }
// we use the asynchronous submit-routine even though this is not required // we use the asynchronous submit-routine even though this is not required
@ -78,21 +80,48 @@ void* thread_function(void* argp) {
auto handler = dml::submit<path>(dml::batch, sequence); auto handler = dml::submit<path>(dml::batch, sequence);
const auto se = std::chrono::high_resolution_clock::now();
const auto se = std::chrono::steady_clock::now();
auto result = handler.get(); auto result = handler.get();
const auto et = std::chrono::high_resolution_clock::now();
const auto et = std::chrono::steady_clock::now();
const dml::status_code status = result.status; const dml::status_code status = result.status;
CHECK_STATUS(status, "Batch completed with an Error!"); CHECK_STATUS(status, "Batch completed with an Error!");
submission_durations.emplace_back(std::chrono::duration_cast<std::chrono::microseconds>(se - st).count());
completion_durations.emplace_back(std::chrono::duration_cast<std::chrono::microseconds>(et - se).count());
combined_durations.emplace_back(std::chrono::duration_cast<std::chrono::microseconds>(et - st).count());
ADD_TIMING_MESSUREMENT;
}
else if (args->batch_size > 1) {
// implementation for non-batched batch submit follows here
// this means we submit a bunch of work as single descriptors
// but then dont wait for the completion immediately
std::vector<dml::handler<dml::mem_copy_operation, std::allocator<uint8_t>>> handlers;
const auto st = std::chrono::steady_clock::now();
for (uint32_t j = 0; j < args->batch_size; j++) {
// block_on_fault() is required to submit the task in a way so that the
// DSA engine can handle page faults itself together with the IOMMU which
// requires the WQ to be configured to allow this too
handlers.emplace_back(dml::submit<path>(dml::mem_copy.block_on_fault(), srcv, dstv));
}
const auto se = std::chrono::steady_clock::now();
for (auto& handler : handlers) {
auto result = handler.get();
const dml::status_code status = result.status;
CHECK_STATUS(status, "Operation completed with an Error!");
}
const auto et = std::chrono::steady_clock::now();
ADD_TIMING_MESSUREMENT;
} }
else { else {
const auto st = std::chrono::high_resolution_clock::now();
const auto st = std::chrono::steady_clock::now();
// we use the asynchronous submit-routine even though this is not required // we use the asynchronous submit-routine even though this is not required
// here, however the project later on will only use async operation and // here, however the project later on will only use async operation and
@ -102,18 +131,16 @@ void* thread_function(void* argp) {
// requires the WQ to be configured to allow this too // requires the WQ to be configured to allow this too
auto handler = dml::submit<path>(dml::mem_copy.block_on_fault(), srcv, dstv); auto handler = dml::submit<path>(dml::mem_copy.block_on_fault(), srcv, dstv);
const auto se = std::chrono::high_resolution_clock::now();
const auto se = std::chrono::steady_clock::now();
auto result = handler.get(); auto result = handler.get();
const auto et = std::chrono::high_resolution_clock::now();
const auto et = std::chrono::steady_clock::now();
const dml::status_code status = result.status; const dml::status_code status = result.status;
CHECK_STATUS(status, "Operation completed with an Error!"); CHECK_STATUS(status, "Operation completed with an Error!");
submission_durations.emplace_back(std::chrono::duration_cast<std::chrono::microseconds>(se - st).count());
completion_durations.emplace_back(std::chrono::duration_cast<std::chrono::microseconds>(et - se).count());
combined_durations.emplace_back(std::chrono::duration_cast<std::chrono::microseconds>(et - st).count());
ADD_TIMING_MESSUREMENT;
} }
args->rep_completed++; args->rep_completed++;
@ -126,6 +153,9 @@ void* thread_function(void* argp) {
args->combined_duration = avg(combined_durations); args->combined_duration = avg(combined_durations);
args->complete_duration = avg(completion_durations); args->complete_duration = avg(completion_durations);
args->submit_duration = avg(submission_durations); args->submit_duration = avg(submission_durations);
args->combined_duration_stdev = stdev(combined_durations, args->combined_duration);
args->complete_duration_stdev = stdev(completion_durations, args->complete_duration);
args->submit_duration_stdev = stdev(submission_durations, args->submit_duration);
return nullptr; return nullptr;
} }

58
benchmarks/task-data.hpp

@ -18,25 +18,41 @@ struct TaskData {
uint32_t rep_count; uint32_t rep_count;
bool batch_submit; bool batch_submit;
uint32_t batch_size; uint32_t batch_size;
uint32_t barrier_after_n_operations;
// thread output // thread output
dml::status_code status; dml::status_code status;
// average run duration in microseconds // average run duration in microseconds
double combined_duration; double combined_duration;
double submit_duration; double submit_duration;
double complete_duration; double complete_duration;
double combined_duration_stdev;
double submit_duration_stdev;
double complete_duration_stdev;
// completed iterations // completed iterations
uint32_t rep_completed; uint32_t rep_completed;
// set by execution // set by execution
barrier* barrier_; barrier* barrier_;
}; };
// Description of a benchmark task in the form it is read from the JSON work
// description. Unlike TaskData it holds a list of region sizes;
// AddToTaskVector builds one TaskData per listed size from the settings
// stored here.
struct ReadTaskData {
// thread placement / engine selection
uint8_t numa_node;
// region sizes to run, and source+destination for the move
std::vector<size_t> sizes;
uint8_t nnode_src;
uint8_t nnode_dst;
// repetition count and batching settings, copied unchanged into each TaskData
uint32_t rep_count;
bool batch_submit;
uint32_t batch_size;
// builds one TaskData per entry of sizes from the fields above;
// v is the task vector those entries are meant for
void AddToTaskVector(std::vector<TaskData>& v) const;
};
inline void to_json(nlohmann::json& j, const TaskData& a) { inline void to_json(nlohmann::json& j, const TaskData& a) {
j["task"]["size"] = a.size; j["task"]["size"] = a.size;
j["task"]["iterations"] = a.rep_count; j["task"]["iterations"] = a.rep_count;
j["task"]["batching"]["enabled"] = a.batch_submit;
j["task"]["batching"]["batch_submit"] = a.batch_submit;
j["task"]["batching"]["batch_size"] = a.batch_size; j["task"]["batching"]["batch_size"] = a.batch_size;
j["task"]["batching"]["barrier_after_n_operations"] = a.barrier_after_n_operations;
j["affinity"]["node"] = a.numa_node; j["affinity"]["node"] = a.numa_node;
j["affinity"]["nnode_src"] = a.nnode_src; j["affinity"]["nnode_src"] = a.nnode_src;
j["affinity"]["nnode_dst"] = a.nnode_dst; j["affinity"]["nnode_dst"] = a.nnode_dst;
@ -44,21 +60,40 @@ inline void to_json(nlohmann::json& j, const TaskData& a) {
j["report"]["time"]["completion_avg"] = a.complete_duration; j["report"]["time"]["completion_avg"] = a.complete_duration;
j["report"]["time"]["submission_avg"] = a.submit_duration; j["report"]["time"]["submission_avg"] = a.submit_duration;
j["report"]["time"]["combined_avg"] = a.combined_duration; j["report"]["time"]["combined_avg"] = a.combined_duration;
j["report"]["time"]["completion_stdev"] = a.complete_duration_stdev;
j["report"]["time"]["submission_stdev"] = a.submit_duration_stdev;
j["report"]["time"]["combined_stdev"] = a.combined_duration_stdev;
j["report"]["iterations_completed"] = a.rep_completed; j["report"]["iterations_completed"] = a.rep_completed;
j["report"]["status"] = StatusCodeToString(a.status); j["report"]["status"] = StatusCodeToString(a.status);
} }
inline void from_json(const nlohmann::json& j, TaskData& a) {
j["task"]["size"].get_to(a.size);
// Deserializes one entry of the work description's "list" into a ReadTaskData.
//
// Expected layout of j:
//   task.size_count           number of entries to read from task.size
//   task.size                 array of region sizes
//   task.iterations           repetition count
//   task.batching.batch_submit / task.batching.batch_size
//   affinity.node / affinity.nnode_src / affinity.nnode_dst
//
// The sizes are appended to a.sizes; all other fields are overwritten.
inline void from_json(const nlohmann::json& j, ReadTaskData& a) {
    // "size_count" sits under "task", next to the "size" array it describes.
    // The lookup goes through the const operator[], which must only be used
    // with keys that exist, so the key has to be spelled exactly.
    const uint32_t size_count = j["task"]["size_count"].template get<uint32_t>();

    for (uint32_t i = 0; i < size_count; i++) {
        a.sizes.emplace_back(j["task"]["size"][i].template get<uint32_t>());
    }

    j["task"]["iterations"].get_to(a.rep_count);
    j["task"]["batching"]["batch_submit"].get_to(a.batch_submit);
    j["task"]["batching"]["batch_size"].get_to(a.batch_size);

    j["affinity"]["node"].get_to(a.numa_node);
    j["affinity"]["nnode_src"].get_to(a.nnode_src);
    j["affinity"]["nnode_dst"].get_to(a.nnode_dst);
}
inline void ReadTaskData::AddToTaskVector(std::vector<TaskData> &v) const {
for (const auto s : sizes) {
TaskData t;
t.size = s;
t.rep_count = rep_count;
t.batch_submit = batch_submit;
t.batch_size = batch_size;
t.numa_node = numa_node;
t.nnode_dst = nnode_dst;
t.nnode_src = nnode_src;
}
}
inline void WriteResultLog(const std::vector<TaskData>& args, const std::string& path, std::ostream& os) { inline void WriteResultLog(const std::vector<TaskData>& args, const std::string& path, std::ostream& os) {
nlohmann::json json; nlohmann::json json;
@ -74,10 +109,15 @@ inline void ReadWorkDescription(std::vector<TaskData>& args, std::string& path,
is >> json; is >> json;
const uint32_t count = json.at("count"); const uint32_t count = json.at("count");
args.resize(count);
std::vector<ReadTaskData> rtd;
rtd.resize(count);
path = json.at("path"); path = json.at("path");
for (uint32_t i = 0; i < count; i++) { for (uint32_t i = 0; i < count; i++) {
args[i] = json["list"][i].template get<TaskData>();
rtd[i] = json["list"][i].template get<ReadTaskData>();
}
for (const auto& e : rtd) {
e.AddToTaskVector(args);
} }
} }

12
benchmarks/task-description.json

@ -1,15 +1,15 @@
{ {
"count": 1, "count": 1,
"path" : "sw",
"path" : "hw",
"list": [ "list": [
{ {
"task": { "task": {
"size": 4096,
"iterations": 10000,
"size_count": 5,
"size": [ 1024, 4096, 1048576, 134217728, 1073741824 ],
"iterations": 1000,
"batching": { "batching": {
"enabled": false,
"batch_size": 100,
"barrier_after_n_operations": 10
"batch_submit": false,
"batch_size": 0
} }
}, },
"affinity": { "affinity": {

Loading…
Cancel
Save