This contains my bachelor's thesis and associated TeX files, code snippets, and maybe more. Topic: Data Movement in Heterogeneous Memories with the Intel Data Streaming Accelerator
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

116 lines
4.6 KiB

#pragma once
#include <chrono>
#include <cstring>
#include <future>
#include <iostream>
#include <thread>
#include <vector>
#include <numa.h>
#include <dml/dml.hpp>
#include "util/dml-helper.hpp"
#include "util/task-data.hpp"
// Streams the current source location (file, line, enclosing function) for
// error reporting; meant to be appended to an ostream chain.
#define LOG_CODE_INFO "Location: " << __FILE__ << "@" << __LINE__ << "::" << __FUNCTION__ << std::endl
// Prints an error header with the physical placement (NUMA node, thread id) of
// the failing task, then the code location. NOTE: expansion requires a
// `TaskData* task` and a `tid` to be in scope at the point of use.
#define LOG_ERR { std::cerr << "--- BEGIN ERROR MSG ---" << std::endl << "Physical: [Node " << task->numa_node << " | Thread " << tid << "]" << std::endl; } std::cerr << LOG_CODE_INFO
// Checks a dml::status_code; on failure logs `msg`, records the status in
// task->status and returns from the enclosing (void) function. Like LOG_ERR,
// this relies on `task` and `tid` being visible at the expansion site.
#define CHECK_STATUS(stat,msg) { if (stat != dml::status_code::ok) { LOG_ERR << "Status Code: " << StatusCodeToString(stat) << std::endl << msg << std::endl; task->status = stat; return; }}
  14. std::shared_future<void> LAUNCH_;
  15. std::vector<uint64_t> ITERATION_TIMING_;
  16. std::vector<void*> SOURCE_;
  17. std::vector<void*> DESTINATION_;
// Worker body for one benchmark thread: waits for the global launch signal,
// then copies this thread's buffer (SOURCE_[tid] -> DESTINATION_[tid]) via
// the DML execution path given as template parameter, either as a single
// mem_copy or as a batch of identical copies. Errors are reported through
// task->status (CHECK_STATUS returns early on failure).
template <typename path>
void thread_function(const uint32_t tid, TaskData* task) {
    // Views over this thread's pre-allocated buffers; the buffers must have
    // been set up (and sized to task->size) by execute_dml_memcpy beforehand.
    dml::data_view srcv = dml::make_view(reinterpret_cast<uint8_t*>(SOURCE_[tid]), task->size);
    dml::data_view dstv = dml::make_view(reinterpret_cast<uint8_t*>(DESTINATION_[tid]), task->size);
    task->status = dml::status_code::ok;
    // Block until the driver fires the launch promise, so all worker threads
    // begin their copies at (approximately) the same moment.
    LAUNCH_.wait();
    if (task->batch_size > 1) {
        // Batched mode: enqueue batch_size identical copy descriptors of the
        // same source/destination region, then submit them as one batch.
        auto sequence = dml::sequence(task->batch_size, std::allocator<dml::byte_t>());
        for (uint32_t j = 0; j < task->batch_size; j++) {
            const auto status = sequence.add(dml::mem_copy, srcv, dstv);
            CHECK_STATUS(status, "Adding operation to batch failed!");
        }
        // we use the asynchronous submit-routine even though this is not required
        // here, however the project later on will only use async operation and
        // therefore this behaviour should be benchmarked
        auto handler = dml::submit<path>(dml::batch, sequence, dml::execution_interface<path, std::allocator<dml::byte_t>>(), task->numa_node);
        // get() blocks until the batch completes.
        auto result = handler.get();
        const dml::status_code status = result.status;
        CHECK_STATUS(status, "Batch completed with an Error!");
    }
    else {
        // we use the asynchronous submit-routine even though this is not required
        // here, however the project later on will only use async operation and
        // therefore this behaviour should be benchmarked
        auto handler = dml::submit<path>(dml::mem_copy, srcv, dstv, dml::execution_interface<path, std::allocator<dml::byte_t>>(), task->numa_node);
        // get() blocks until the single copy completes.
        auto result = handler.get();
        const dml::status_code status = result.status;
        CHECK_STATUS(status, "Operation completed with an Error!");
    }
}
  48. template <typename path>
  49. void execute_dml_memcpy(std::vector<TaskData>& args, const uint64_t iterations) {
  50. // initialize numa library
  51. numa_available();
  52. // initialize data fields for use
  53. for (uint32_t tid = 0; tid < args.size(); tid++) {
  54. SOURCE_[tid] = numa_alloc_onnode(args[tid].size, args[tid].nnode_src);
  55. DESTINATION_[tid] = numa_alloc_onnode(args[tid].size, args[tid].nnode_dst);
  56. std::memset(SOURCE_[tid], 0xAB, args[tid].size);
  57. std::memset(DESTINATION_[tid], 0xAB, args[tid].size);
  58. }
  59. // for each requested iteration this is repeated, plus 5 iterations as warmup
  60. for (uint64_t i = 0; i < iterations + 5; i++) {
  61. std::vector<std::thread> threads;
  62. std::promise<void> launch_promise;
  63. LAUNCH_ = launch_promise.get_future();
  64. for (uint32_t tid = 0; tid < args.size(); tid++) {
  65. // we flush the cache for the memory regions to avoid any caching effects
  66. dml::data_view srcv = dml::make_view(reinterpret_cast<uint8_t*>(SOURCE_[tid]), args[tid].size);
  67. dml::data_view dstv = dml::make_view(reinterpret_cast<uint8_t*>(DESTINATION_[tid]), args[tid].size);
  68. auto rsrc = dml::execute<dml::software>(dml::cache_flush, srcv);
  69. auto rdst = dml::execute<dml::software>(dml::cache_flush, dstv);
  70. TaskData* task = &args[tid];
  71. CHECK_STATUS(rsrc.status, "Flushing Cache for Source failed!");
  72. CHECK_STATUS(rdst.status, "Flushing Cache for Destination failed!");
  73. // then spawn the thread
  74. threads.emplace_back(thread_function<path>, tid, &args[tid]);
  75. }
  76. using namespace std::chrono_literals;
  77. std::this_thread::sleep_for(1ms);
  78. const auto time_start = std::chrono::steady_clock::now();
  79. launch_promise.set_value();
  80. for(std::thread& t : threads) { t.join(); }
  81. const auto time_end = std::chrono::steady_clock::now();
  82. if (i >= 5) ITERATION_TIMING_.emplace_back(std::chrono::duration_cast<std::chrono::nanoseconds>(time_end - time_start).count());
  83. }
  84. }