diff --git a/qdp_project/src/Benchmark.cpp b/qdp_project/src/Benchmark.cpp
index ecc1969..0934f7d 100644
--- a/qdp_project/src/Benchmark.cpp
+++ b/qdp_project/src/Benchmark.cpp
@@ -32,27 +32,27 @@ constexpr uint32_t TC_SCANA = 2;
 constexpr uint32_t TC_SCANB = 1;
 constexpr uint32_t TC_AGGRJ = 2;
 constexpr bool PERFORM_CACHING = true;
-constexpr bool DATA_IN_HBM = false;
+constexpr bool DATA_B_IN_HBM = false;
 constexpr char MODE_STRING[] = "prefetch";
 #endif
 #ifdef MODE_DRAM
 constexpr size_t CHUNK_SIZE_B = 2_MiB;
-constexpr uint32_t GROUP_COUNT = 8;
-constexpr uint32_t TC_SCANA = 2;
+constexpr uint32_t GROUP_COUNT = 32; // also scales the DATA_DST_ allocation in main(): TC_AGGRJ * GROUP_COUNT * sizeof(uint64_t) bytes
+constexpr uint32_t TC_SCANA = 1; // NOTE(review): presumably the scan-A thread count - confirm against its uses outside this hunk
 constexpr uint32_t TC_SCANB = 0;
 constexpr uint32_t TC_AGGRJ = 1;
 constexpr bool PERFORM_CACHING = false;
-constexpr bool DATA_IN_HBM = false;
+constexpr bool DATA_B_IN_HBM = false; // false: main() allocates DATA_B_ via numa_alloc_local rather than numa_alloc_onnode(..., cache_node)
 constexpr char MODE_STRING[] = "dram";
 #endif
 #ifdef MODE_HBM
 constexpr size_t CHUNK_SIZE_B = 2_MiB;
-constexpr uint32_t GROUP_COUNT = 8;
-constexpr uint32_t TC_SCANA = 2;
+constexpr uint32_t GROUP_COUNT = 32; // also scales the DATA_DST_ allocation in main(): TC_AGGRJ * GROUP_COUNT * sizeof(uint64_t) bytes
+constexpr uint32_t TC_SCANA = 1; // NOTE(review): presumably the scan-A thread count - confirm against its uses outside this hunk
 constexpr uint32_t TC_SCANB = 0;
 constexpr uint32_t TC_AGGRJ = 1;
 constexpr bool PERFORM_CACHING = false;
-constexpr bool DATA_IN_HBM = true;
+constexpr bool DATA_B_IN_HBM = true; // true: main() allocates only DATA_B_ on cache_node; DATA_A_, MASK_A_ and DATA_DST_ use numa_alloc_local
 constexpr char MODE_STRING[] = "hbm";
 #endif
 
@@ -297,25 +297,23 @@ int main() {
 
     fout << "run;rt-ns;rt-s;result[0];scana-run;scana-wait;scanb-run;scanb-wait;aggrj-run;aggrj-wait;cache-hr;" << std::endl;
 
-    if constexpr (DATA_IN_HBM) {
-        DATA_A_ = (uint64_t*) numa_alloc_onnode(WL_SIZE_B, cache_node);
+    if constexpr (DATA_B_IN_HBM) {
         DATA_B_ = (uint64_t*) numa_alloc_onnode(WL_SIZE_B, cache_node);
-        MASK_A_ = (uint16_t*) numa_alloc_onnode(WL_SIZE_ELEMENTS, cache_node);
-        DATA_DST_ = (uint64_t*) numa_alloc_onnode(TC_AGGRJ * GROUP_COUNT * sizeof(uint64_t), cache_node);
     }
     else {
-        DATA_A_ = (uint64_t*) numa_alloc_local(WL_SIZE_B);
         DATA_B_ = (uint64_t*) numa_alloc_local(WL_SIZE_B);
-        MASK_A_ = (uint16_t*) numa_alloc_local(WL_SIZE_ELEMENTS);
-        DATA_DST_ = (uint64_t*) numa_alloc_local(TC_AGGRJ * GROUP_COUNT * sizeof(uint64_t));
     }
 
+    DATA_A_ = (uint64_t*) numa_alloc_local(WL_SIZE_B);
+    MASK_A_ = (uint16_t*) numa_alloc_local(WL_SIZE_ELEMENTS);
+    DATA_DST_ = (uint64_t*) numa_alloc_local(TC_AGGRJ * GROUP_COUNT * sizeof(uint64_t));
+
     if constexpr (PERFORM_CACHING) {
         CACHE_.Init(CachePlacementPolicy, CopyMethodPolicy);
     }
 
     fill_mt<uint64_t>(DATA_A_, WL_SIZE_B, 0, 100, 42);
-    fill_mt<uint64_t>(DATA_A_, WL_SIZE_B, 0, 100, 420);
+    fill_mt<uint64_t>(DATA_B_, WL_SIZE_B, 0, 100, 420);
 
     for (uint32_t i = 0; i < ITERATION_COUNT + WARMUP_ITERATION_COUNT; i++) {
         std::promise<void> launch_promise;