This contains my bachelor's thesis and associated TeX files, code snippets, and maybe more. Topic: Data Movement in Heterogeneous Memories with Intel Data Streaming Accelerator
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

98 lines
3.0 KiB

#pragma once
// C / system headers
#include <pthread.h>     // pthread_self, pthread_create, pthread_setaffinity_np (glibc: needs _GNU_SOURCE)
#include <pthread_np.h>  // NOTE(review): BSD-only header; on Linux, <pthread.h> alone suffices
#include <sched.h>       // cpu_set_t, CPU_ZERO, CPU_SET
#include <semaphore.h>   // sem_t, sem_init/wait/post/destroy
#include <numa.h>        // numa_available, numa_run_on_node, numa_alloc_onnode, numa_free
// C++ standard library
#include <chrono>
#include <iostream>
#include <vector>
// third-party
#include <dml/dml.hpp>
// Description of one benchmark task: where the worker thread runs, what
// memory region it moves between which NUMA nodes, and where it reports
// its results. One instance is shared with the worker via a raw pointer,
// so it must outlive the thread (see execute_mem_move).
struct ThreadArgs {
    // thread placement / engine selection
    uint8_t numa_node;  // node the thread is bound to; also forwarded to dml::submit
    uint8_t core;       // core for pthread affinity (uint8_t caps this at core 255)
    // region size and source+destination for move
    size_t size;        // bytes to copy
    uint8_t nnode_src;  // NUMA node on which the source buffer is allocated
    uint8_t nnode_dst;  // NUMA node on which the destination buffer is allocated
    // repetition
    uint8_t count;      // TODO: unused
    bool batched;       // TODO: unused
    // thread output
    dml::status_code status;             // completion status of the dml::mem_move
    std::chrono::microseconds duration;  // wall-clock time of the timed move
    // set by execution
    sem_t* sig;         // start signal: worker blocks on this before moving
};
  25. template <typename path>
  26. void* thread_function(void* argp) {
  27. ThreadArgs* args = reinterpret_cast<ThreadArgs*>(argp);
  28. // set numa node and core affinity of the current thread
  29. numa_run_on_node(args->numa_node);
  30. cpu_set_t cpuset;
  31. CPU_ZERO(&cpuset);
  32. CPU_SET(args->core, &cpuset);
  33. if (pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset) != 0) {
  34. std::cerr << "Error setting affinity for thread designated to core " << args->core << " on node " << args->numa_node << std::endl;
  35. return nullptr;
  36. }
  37. // allocate memory for the move operation on the requested numa nodes
  38. void* src = numa_alloc_onnode(args->size, args->nnode_src);
  39. void* dst = numa_alloc_onnode(args->size, args->nnode_dst);
  40. dml::data_view srcv = dml::make_view(reinterpret_cast<uint8_t*>(src), args->size);
  41. dml::data_view dstv = dml::make_view(reinterpret_cast<uint8_t*>(dst), args->size);
  42. // wait for specified signal so that all operations start at the same time
  43. sem_wait(args->sig);
  44. const auto st = std::chrono::high_resolution_clock::now();
  45. // we use the asynchronous submit-routine even though this is not required
  46. // here, however the project later on will only use async operation
  47. auto handler = dml::submit<path>(dml::mem_move, srcv, dstv, args->numa_node);
  48. auto result = handler.get();
  49. const auto et = std::chrono::high_resolution_clock::now();
  50. // free the allocated memory regions on the selected nodes
  51. numa_free(src, args->size);
  52. numa_free(dst, args->size);
  53. args->duration = std::chrono::duration_cast<std::chrono::microseconds>(et - st);
  54. args->status = result.status;
  55. return nullptr;
  56. }
  57. template <typename path>
  58. void execute_mem_move(std::vector<ThreadArgs> args) {
  59. sem_t sem;
  60. std::vector<pthread_t> threads;
  61. // initialize semaphore and numactl-library
  62. sem_init(&sem, 0, 0);
  63. numa_available();
  64. // for each submitted task we link the semaphore
  65. // and create the thread, passing the argument
  66. for (auto arg : args) {
  67. arg.sig = &sem;
  68. threads.emplace_back();
  69. if (pthread_create(&threads.back(), nullptr, thread_function<path>, &arg) != 0) {
  70. std::cerr << "Error creating thread" << std::endl;
  71. exit(1);
  72. }
  73. }
  74. // post will make all waiting threads pass
  75. sem_post(&sem);
  76. for (pthread_t& t : threads) {
  77. pthread_join(t, nullptr);
  78. }
  79. sem_destroy(&sem);
  80. }