@ -10,6 +10,8 @@
# include <dml/dml.hpp>
# include "statuscode-tostring.hpp"
// Per-thread parameter and result record. thread_function receives a pointer
// to one of these through its void* argument, reads the configuration fields
// and writes the output fields before returning.
// NOTE(review): part of this definition is not visible in this excerpt (the
// worker also reads a `core` member) — confirm the full field list in the file.
struct ThreadArgs {
// thread placement / engine selection
// node handed to numa_run_on_node() by the worker thread
uint8_t numa_node ;
@ -18,24 +20,54 @@ struct ThreadArgs {
// byte length used for the source/destination data views and for numa_free()
size_t size ;
// presumably the NUMA nodes holding the source / destination buffers;
// the allocation itself is not visible in this excerpt — TODO confirm
uint8_t nnode_src ;
uint8_t nnode_dst ;
// repetition
// number of iterations of the measurement loop
uint32_t rep_count ;
// true: each iteration submits one dml::batch sequence,
// false: each iteration submits a single dml::mem_copy
bool batch_submit ;
// number of mem_copy operations added to the sequence per iteration
uint32_t batch_size ;
// a dml::nop is added to the sequence whenever the copy index is a
// multiple of this value; 0 leaves the reserved operation count unchanged
uint32_t barrier_after_n_operations ;
// thread output
// reset to ok before the run; CHECK_STATUS stores the failing code here
dml : : status_code status ;
// NOTE(review): looks like a leftover from before the averaged durations
// below were introduced — confirm whether it is still part of the struct
std : : chrono : : microseconds duration ;
// average run duration in microseconds
// combined = submit + wait, submit = time spent in dml::submit,
// complete = time spent in handler.get(); each averaged over all iterations
double combined_duration ;
double submit_duration ;
double complete_duration ;
// completed iterations
uint32_t rep_completed ;
// set by execution
// semaphore the worker waits on before starting; the worker sets this
// pointer to nullptr when it finishes
sem_t * sig ;
} ;
// Arithmetic mean of the samples in `v`, computed incrementally: after each
// sample the running mean is nudged by (sample - mean) / samples_seen, so no
// large intermediate sum is ever formed.
// Returns 0.0 when `v` is empty.
double avg(const std::vector<double>& v) {
    double running_mean = 0.0;
    int samples_seen = 0;

    for (auto it = v.cbegin(); it != v.cend(); ++it) {
        samples_seen += 1;
        const double offset = *it - running_mean;
        running_mean += offset / samples_seen;
    }

    return running_mean;
}
// Source-location fragment for log output: "Location: <file>@<line>::<function>".
// It ends with std::endl, so anything streamed after it starts on a new line.
#define LOG_CODE_INFO "Location: " << __FILE__ << "@" << __LINE__ << "::" << __FUNCTION__ << std::endl

// Prints an error banner with the node, core and pthread id of the calling
// thread to std::cerr, then leaves an open `std::cerr << ...` expression so the
// caller can keep streaming the actual message: LOG_ERR << "text" << std::endl;
// Requires a pointer named `args` with `numa_node` and `core` members in scope.
// The ids are cast to unsigned int before streaming: numa_node is a uint8_t,
// which operator<< would otherwise emit as a raw character instead of a number.
// The expansion is a braced block followed by an expression statement, so it
// must not be used as the unbraced body of an if/else.
#define LOG_ERR { pthread_t t = pthread_self(); std::cerr << "--- BEGIN ERROR MSG ---" << std::endl << "Physical: [Node " << static_cast<unsigned int>(args->numa_node) << " | Core " << static_cast<unsigned int>(args->core) << " | Thread " << t << "]" << std::endl; } std::cerr << LOG_CODE_INFO

// If `stat` is not dml::status_code::ok: logs the status code and the
// stringified `msg`, stores the code in args->status and returns nullptr from
// the enclosing function. Requires `args` in scope (see LOG_ERR).
// The parameter is deliberately not called `status`: the preprocessor would
// then also rewrite the member name in `args->status`, so the macro would only
// compile when the argument expression is spelled exactly `status`.
// `stat` is evaluated more than once; pass a side-effect-free expression.
#define CHECK_STATUS(stat,msg) { if ((stat) != dml::status_code::ok) { LOG_ERR << "Status Code: " << StatusCodeToString(stat) << std::endl << #msg << std::endl; args->status = (stat); return nullptr; }}
// pthread entry point for one benchmark worker.
// `argp` is reinterpreted as ThreadArgs*; the template parameter `path` is
// forwarded to dml::submit<path>. The thread pins itself to the requested
// node and core, waits on args->sig, runs args->rep_count timed submissions
// (batched or single mem_copy) and writes the averaged timings back into
// *args. The return value is always nullptr; errors are reported through
// args->status and std::cerr.
// NOTE(review): this excerpt has a gap in the middle of the body and appears
// to interleave lines from two revisions (several locals are declared more
// than once) — read the notes below against the complete file.
template < typename path >
void * thread_function ( void * argp ) {
ThreadArgs * args = reinterpret_cast < ThreadArgs * > ( argp ) ;
// per-iteration timings in microseconds, averaged at the end of the run
std : : vector < double > submission_durations ;
std : : vector < double > completion_durations ;
std : : vector < double > combined_durations ;
// set numa node and core affinity of the current thread
numa_run_on_node ( args - > numa_node ) ;
cpu_set_t cpuset ;
CPU_ZERO ( & cpuset ) ;
CPU_SET ( args - > core , & cpuset ) ;
if ( pthread_setaffinity_np ( pthread_self ( ) , sizeof ( cpu_set_t ) , & cpuset ) ! = 0 ) {
// NOTE(review): numa_node is a uint8_t, so streaming it directly prints a
// character rather than a number; args->core may behave the same
// depending on its type — verify
std : : cerr < < " Error setting affinity for thread designated to core " < < args - > core < < " on node " < < args - > numa_node < < std : : endl ;
LOG_ERR < < " Error setting affinity for thread " < < std : : endl ;
// pinning failed: give up without running the benchmark
return nullptr ;
@ -45,31 +77,83 @@ void* thread_function(void* argp) {
// `src` and `dst` are set up in lines not visible in this excerpt;
// presumably they are numa-allocated buffers of args->size bytes, since
// they are released with numa_free below — TODO confirm
dml : : data_view srcv = dml : : make_view ( reinterpret_cast < uint8_t * > ( src ) , args - > size ) ;
dml : : data_view dstv = dml : : make_view ( reinterpret_cast < uint8_t * > ( dst ) , args - > size ) ;
// reset the output fields before the measured run
args - > status = dml : : status_code : : ok ;
args - > rep_completed = 0 ;
// wait for specified signal so that all operations start at the same time
sem_wait ( args - > sig ) ;
// NOTE(review): the following single mem_move submission looks like it
// belongs to the earlier, non-looping revision; `st`, `handler` and
// `result` are declared again inside the loop below — confirm which
// lines are live
const auto st = std : : chrono : : high_resolution_clock : : now ( ) ;
// we use the asynchronous submit-routine even though this is not required
// here, however the project later on will only use async operation
auto handler = dml : : submit < path > ( dml : : mem_move , srcv , dstv ) ;
auto result = handler . get ( ) ;
// one timed submission per repetition
for ( uint32_t i = 0 ; i < args - > rep_count ; i + + ) {
if ( args - > batch_submit ) {
// batch mode: reserve room for batch_size copies plus the nops inserted
// between them
uint32_t opcount = args - > batch_size ;
if ( args - > barrier_after_n_operations > 0 ) {
opcount + = opcount / args - > barrier_after_n_operations ;
// building the sequence is inside the timed region (st is taken first)
const auto st = std : : chrono : : high_resolution_clock : : now ( ) ;
auto sequence = dml : : sequence ( opcount , std : : allocator < dml : : byte_t > ( ) ) ;
for ( uint32_t j = 0 ; j < args - > batch_size ; j + + ) {
// NOTE(review): `status` is not inspected in the lines visible here
const auto status = sequence . add ( dml : : mem_copy , srcv , dstv ) ;
// NOTE(review): unlike the opcount computation above, this modulo is not
// guarded against barrier_after_n_operations == 0. j == 0 also
// satisfies the condition, so the number of nops added can exceed the
// batch_size / barrier_after_n_operations entries reserved in opcount
// — verify
if ( j % args - > barrier_after_n_operations = = 0 ) {
sequence . add ( dml : : nop ) ;
// NOTE(review): `et` is declared again after handler.get() below; this
// line looks like a remnant of the other revision — confirm
const auto et = std : : chrono : : high_resolution_clock : : now ( ) ;
auto handler = dml : : submit < path > ( dml : : batch , sequence ) ;
// se marks the end of submission, et the end of completion
const auto se = std : : chrono : : high_resolution_clock : : now ( ) ;
auto result = handler . get ( ) ;
const auto et = std : : chrono : : high_resolution_clock : : now ( ) ;
// NOTE(review): no CHECK_STATUS on the batch result is visible here,
// unlike the single-copy branch — confirm
submission_durations . emplace_back ( std : : chrono : : duration_cast < std : : chrono : : microseconds > ( se - st ) . count ( ) ) ;
completion_durations . emplace_back ( std : : chrono : : duration_cast < std : : chrono : : microseconds > ( et - se ) . count ( ) ) ;
combined_durations . emplace_back ( std : : chrono : : duration_cast < std : : chrono : : microseconds > ( et - st ) . count ( ) ) ;
else {
// single-operation mode: one mem_copy per repetition
const auto st = std : : chrono : : high_resolution_clock : : now ( ) ;
// we use the asynchronous submit-routine even though this is not required
// here, however the project later on will only use async operation and
// therefore this behaviour should be benchmarked
auto handler = dml : : submit < path > ( dml : : mem_copy , srcv , dstv ) ;
const auto se = std : : chrono : : high_resolution_clock : : now ( ) ;
auto result = handler . get ( ) ;
const auto et = std : : chrono : : high_resolution_clock : : now ( ) ;
const dml : : status_code status = result . status ;
// on failure this logs, stores the code in args->status and returns nullptr
// NOTE(review): that early return happens before the numa_free calls
// below, so src/dst would not be released on this path — verify
CHECK_STATUS ( status , " Operation completed with an Error! " ) ;
submission_durations . emplace_back ( std : : chrono : : duration_cast < std : : chrono : : microseconds > ( se - st ) . count ( ) ) ;
completion_durations . emplace_back ( std : : chrono : : duration_cast < std : : chrono : : microseconds > ( et - se ) . count ( ) ) ;
combined_durations . emplace_back ( std : : chrono : : duration_cast < std : : chrono : : microseconds > ( et - st ) . count ( ) ) ;
// count the iteration as completed
args - > rep_completed + + ;
// free the allocated memory regions on the selected nodes
numa_free ( src , args - > size ) ;
numa_free ( dst , args - > size ) ;
// NOTE(review): the next two assignments use `et`, `st` and `result`
// from the single-submission variant above and look like leftovers of
// the earlier revision — confirm
args - > duration = std : : chrono : : duration_cast < std : : chrono : : microseconds > ( et - st ) ;
args - > status = result . status ;
// publish the per-iteration averages (microseconds)
args - > combined_duration = avg ( combined_durations ) ;
args - > complete_duration = avg ( completion_durations ) ;
args - > submit_duration = avg ( submission_durations ) ;
// drop the reference to the start semaphore
args - > sig = nullptr ;
return nullptr ;
template < typename path >
void execute_mem_move ( std : : vector < ThreadArgs > & args ) {
void execute_dml_memcpy ( std : : vector < ThreadArgs > & args ) {
sem_t sem ;
std : : vector < pthread_t > threads ;