@ -1,9 +1,5 @@
# include <memory>
# include <cassert>
# include <mutex>
# include <cstring>
# include <bitset>
# include <algorithm>
# include <barrier>
# include <vector>
# include <fstream>
@ -20,7 +16,7 @@
# include "BenchmarkHelpers.cpp"
# define MODE_HBM
# define MODE_PREFETC H
////////////////////////////////
/// BENCHMARK SETUP
@ -30,15 +26,13 @@ constexpr uint32_t WARMUP_ITERATION_COUNT = 5;
constexpr uint32_t ITERATION_COUNT = 5 ;
// Compile-time benchmark configuration selected when MODE_PREFETCH is defined.
//
// NOTE(review): GROUP_COUNT, CHUNK_SIZE_B, TC_SCANA and TC_AGGRJ each appear
// twice in this block with different values. This looks like the removed and
// added sides of a change flattened together — confirm which value of each
// pair is intended and drop the other, since defining the same constexpr
// variable twice in one scope is a redefinition.
# ifdef MODE_PREFETCH
// Number of groups; main() loops gid over [0, GROUP_COUNT) and creates one
// barrier per group.
constexpr uint32_t GROUP_COUNT = 8 ;
// Chunk size in bytes — presumably the unit of work handed to each step;
// _MiB is declared outside this view. TODO confirm.
constexpr size_t CHUNK_SIZE_B = 16 _MiB ;
// Number of scan_a threads started per group in main().
constexpr uint32_t TC_SCANA = 4 ;
// NOTE(review): second definition of GROUP_COUNT (see note at top of block).
constexpr uint32_t GROUP_COUNT = 16 ;
// NOTE(review): second definition of CHUNK_SIZE_B.
constexpr size_t CHUNK_SIZE_B = 8 _MiB ;
// NOTE(review): second definition of TC_SCANA.
constexpr uint32_t TC_SCANA = 2 ;
// Presumably the number of scan_b threads per group; the loop bound is not
// visible in this view — verify against main().
constexpr uint32_t TC_SCANB = 1 ;
// Upper bound on the aggr_j thread id (tid) per group: main() spawns one
// aggr_j thread per tid below this bound, and the iterative variant of
// aggr_j re-invokes itself while ++tid stays below it.
constexpr uint32_t TC_AGGRJ = 4 ;
// NOTE(review): second definition of TC_AGGRJ.
constexpr uint32_t TC_AGGRJ = 2 ;
// Presumably enables the caching path; not referenced in the visible code.
constexpr bool PERFORM_CACHING = true ;
// When true, aggr_j calls sched_yield() after finding data_ptr == nullptr.
constexpr bool YIELD_ON_CACHE_MISS = false ;
// Presumably selects HBM placement for the input data; not referenced in the
// visible code — TODO confirm.
constexpr bool DATA_IN_HBM = false ;
// Selects the `if constexpr (AGGRJ_ITERATIVE)` branches in aggr_j and main():
// when true, main() starts a single aggr_j thread per group (tid 0) and
// aggr_j re-invokes itself for the following tids; it also changes the
// participant count the per-group barrier is constructed with.
constexpr bool AGGRJ_ITERATIVE = true ;
// Label for this mode; its consumer is outside this view.
constexpr char MODE_STRING [ ] = " prefetch " ;
# endif
# ifdef MODE_DRAM
@ -48,9 +42,7 @@ constexpr uint32_t TC_SCANA = 2;
constexpr uint32_t TC_SCANB = 0 ;
constexpr uint32_t TC_AGGRJ = 1 ;
constexpr bool PERFORM_CACHING = false ;
constexpr bool YIELD_ON_CACHE_MISS = false ;
constexpr bool DATA_IN_HBM = false ;
constexpr bool AGGRJ_ITERATIVE = false ;
constexpr char MODE_STRING [ ] = " dram " ;
# endif
# ifdef MODE_HBM
@ -60,9 +52,7 @@ constexpr uint32_t TC_SCANA = 2;
constexpr uint32_t TC_SCANB = 0 ;
constexpr uint32_t TC_AGGRJ = 1 ;
constexpr bool PERFORM_CACHING = false ;
constexpr bool YIELD_ON_CACHE_MISS = false ;
constexpr bool DATA_IN_HBM = true ;
constexpr bool AGGRJ_ITERATIVE = false ;
constexpr char MODE_STRING [ ] = " hbm " ;
# endif
@ -254,12 +244,7 @@ void aggr_j(size_t gid, size_t tid) {
THREAD_TIMING_ [ AGGRJ_TIMING_INDEX ] [ tid * gid ] [ 0 ] [ TIME_STAMP_BEGIN ] = std : : chrono : : steady_clock : : now ( ) ;
if constexpr ( AGGRJ_ITERATIVE ) {
if ( tid = = 0 ) BARRIERS_ [ gid ] - > arrive_and_wait ( ) ;
}
else {
BARRIERS_ [ gid ] - > arrive_and_wait ( ) ;
}
BARRIERS_ [ gid ] - > arrive_and_wait ( ) ;
THREAD_TIMING_ [ AGGRJ_TIMING_INDEX ] [ tid * gid ] [ 0 ] [ TIME_STAMP_WAIT ] = std : : chrono : : steady_clock : : now ( ) ;
@ -278,7 +263,6 @@ void aggr_j(size_t gid, size_t tid) {
if ( data_ptr = = nullptr ) {
data_ptr = chunk_ptr ;
if constexpr ( YIELD_ON_CACHE_MISS ) sched_yield ( ) ;
}
else {
CACHE_HITS_ [ gid * tid ] + + ;
@ -294,16 +278,9 @@ void aggr_j(size_t gid, size_t tid) {
THREAD_TIMING_ [ AGGRJ_TIMING_INDEX ] [ tid * gid ] [ 0 ] [ TIME_STAMP_END ] = std : : chrono : : steady_clock : : now ( ) ;
if constexpr ( ! AGGRJ_ITERATIVE ) {
BARRIERS_ [ gid ] - > arrive_and_drop ( ) ;
}
BARRIERS_ [ gid ] - > arrive_and_drop ( ) ;
aggregation : : happly ( DATA_DST_ + ( tid * GROUP_COUNT + gid ) , aggregator ) ;
if constexpr ( AGGRJ_ITERATIVE ) {
if ( + + tid < TC_AGGRJ ) aggr_j ( gid , tid ) ;
else BARRIERS_ [ gid ] - > arrive_and_drop ( ) ;
}
}
int main ( ) {
@ -349,12 +326,7 @@ int main() {
std : : vector < std : : thread > agg_pool ;
for ( uint32_t gid = 0 ; gid < GROUP_COUNT ; + + gid ) {
if constexpr ( AGGRJ_ITERATIVE ) {
BARRIERS_ . emplace_back ( new std : : barrier < NopStruct > ( TC_SCANA + TC_SCANB + 1 ) ) ;
}
else {
BARRIERS_ . emplace_back ( new std : : barrier < NopStruct > ( TC_COMBINED ) ) ;
}
BARRIERS_ . emplace_back ( new std : : barrier < NopStruct > ( TC_COMBINED ) ) ;
for ( uint32_t tid = 0 ; tid < TC_SCANA ; + + tid ) {
filter_pool . emplace_back ( scan_a , gid , tid ) ;
@ -364,13 +336,8 @@ int main() {
copy_pool . emplace_back ( scan_b , gid , tid ) ;
}
if constexpr ( AGGRJ_ITERATIVE ) {
agg_pool . emplace_back ( aggr_j , gid , 0 ) ;
}
else {
for ( uint32_t tid = 0 ; tid < TC_AGGRJ ; + + tid ) {
agg_pool . emplace_back ( aggr_j , gid , tid ) ;
}
for ( uint32_t tid = 0 ; tid < TC_AGGRJ ; + + tid ) {
agg_pool . emplace_back ( aggr_j , gid , tid ) ;
}
}