From 099f454f19941a2ef927a1cda7c1161032ad4b3a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Constantin=20F=C3=BCrst?=
Date: Mon, 11 Dec 2023 18:45:05 +0100
Subject: [PATCH] modify plotters to a more streamlined state, all now use the
 file-loop in main and have a function that processes one file into the
 dataset, also adds the peakthroughput plotter and removes the defunct
 opt-submitmethod plotter

---
 .../benchmark-plotters/plot-cost-mtsubmit.py  |  47 ++++----
 .../plot-opt-submitmethod.py                  | 104 ------------------
 .../plot-perf-enginelocation.py               |  16 +--
 .../plot-perf-peakthroughput.py               |  80 ++++++++++++++
 .../plot-perf-submitmethod.py                 |  86 ++++++++-------
 5 files changed, 155 insertions(+), 178 deletions(-)
 delete mode 100644 benchmarks/benchmark-plotters/plot-opt-submitmethod.py
 create mode 100644 benchmarks/benchmark-plotters/plot-perf-peakthroughput.py
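Every plotter touched by this patch now follows the same pattern: main() loops over the expected result files and a process_file_to_dataset() helper appends one row per recorded run to a shared data list, silently skipping missing files. The following is a minimal sketch of that pattern; the column labels, file names and the JSON path to the timing list are simplified placeholders, not the real benchmark layout.

import os
import json
import pandas as pd

# placeholder column labels; each real plotter defines its own set
runid, x_label, var_label, y_label = "Run ID", "File", "Variant", "Throughput"
data = []

def calc_throughput(size_bytes, time_ns):
    # shared helper, identical to the one in the plotters below
    time_seconds = time_ns * 1e-9
    size_gib = size_bytes / (1024 ** 3)
    return size_gib / time_seconds

def process_file_to_dataset(file_path, variant_label):
    # one result file becomes several dataset rows, one per recorded run;
    # missing files are skipped so partial result sets still produce a plot
    try:
        with open(file_path, 'r') as file:
            times = json.load(file)["list"][0]["report"]["time"]["total"]
    except FileNotFoundError:
        return
    for run_idx, t in enumerate(times):
        data.append({runid: run_idx, x_label: os.path.basename(file_path),
                     var_label: variant_label, y_label: calc_throughput(1024, t)})

def main():
    folder_path = "benchmark-results/"
    for variant_label in ["a", "b"]:
        file = os.path.join(folder_path, f"example-{variant_label}.json")
        process_file_to_dataset(file, variant_label)
    return pd.DataFrame(data)

if __name__ == "__main__":
    main()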
diff --git a/benchmarks/benchmark-plotters/plot-cost-mtsubmit.py b/benchmarks/benchmark-plotters/plot-cost-mtsubmit.py
index c9c72ee..e1d4879 100644
--- a/benchmarks/benchmark-plotters/plot-cost-mtsubmit.py
+++ b/benchmarks/benchmark-plotters/plot-cost-mtsubmit.py
@@ -18,8 +18,8 @@ title = "Per-Thread Throughput - 120 Copy Operations split on Threads Intra-Node
 index = [runid, x_label, var_label]
 data = []
 
-def calc_throughput(size_bytes,time_nanosec):
-    time_seconds = time_nanosec * 1e-9
+def calc_throughput(size_bytes,time_ns):
+    time_seconds = time_ns * 1e-9
     size_gib = size_bytes / (1024 ** 3)
     throughput_gibs = size_gib / time_seconds
     return throughput_gibs
@@ -31,13 +31,16 @@ def index_from_element(value,array):
     return 0
 
 
-def load_and_process_copy_json(file_path):
+def load_time_mesurements(file_path):
     with open(file_path, 'r') as file:
         data = json.load(file)
        count = data["count"]
         iterations = data["list"][0]["task"]["iterations"]
 
+        # work queue size is 120 which is split over all available threads
+        # therefore we divide the result by 120/n_threads to get the per-element speed
+
         return {
             "total" : sum([x / (iterations * (120 / count)) for x in list(chain(*[data["list"][i]["report"]["time"]["total"] for i in range(count)]))]),
             "combined" : [x / (120 / count) for x in list(chain(*[data["list"][i]["report"]["time"]["combined"] for i in range(count)]))],
@@ -45,41 +48,35 @@ def load_and_process_copy_json(file_path):
             "completion" : [x / (120 / count) for x in list(chain(*[data["list"][i]["report"]["time"]["completion"] for i in range(count)]))]
         }
 
-# Function to plot the graph for the new benchmark
-def create_mtsubmit_dataset(file_paths, engine_label):
-    times = []
-
+def process_file_to_dataset(file_path, engine_label, thread_count):
     engine_index = index_from_element(engine_label,engine_counts)
     engine_nice = engine_counts_nice[engine_index]
+    threadc_index = index_from_element(thread_count, thread_counts)
+    thread_count_nice = thread_counts_nice[threadc_index]
+    data_size = 0
 
-    idx = 0
-    for file_path in file_paths:
-        time = load_and_process_copy_json(file_path)
-        times.append(time["total"])
-        idx = idx + 1
-
-    throughput = []
     if engine_label in ["1gib-1e", "1gib-4e"]:
-        throughput = [[calc_throughput(1024*1024*1024,time) for time in t] for t in times]
+        data_size = 1024*1024*1024
     else:
-        throughput = [[calc_throughput(1024*1024,time) for time in t] for t in times]
+        data_size = 1024*1024
 
-    idx = 0
-    for run_set in throughput:
+    try:
+        time = load_time_mesurements(file_path)["total"]
         run_idx = 0
-        for run in run_set:
-            data.append({ runid : run_idx, x_label: thread_counts_nice[idx], var_label : engine_nice, y_label : throughput[idx][run_idx]})
+        for t in time:
+            data.append({ runid : run_idx, x_label: thread_count_nice, var_label : engine_nice, y_label : calc_throughput(data_size, t)})
             run_idx = run_idx + 1
-        idx = idx + 1
+    except FileNotFoundError:
+        return
 
-# Main function to iterate over files and create plots for the new benchmark
 def main():
-    folder_path = "benchmark-results/" # Replace with the actual path to your folder
+    folder_path = "benchmark-results/"
 
     for engine_label in engine_counts:
-        mt_file_paths = [os.path.join(folder_path, f"mtsubmit-{thread_count}-{engine_label}.json") for thread_count in thread_counts]
-        create_mtsubmit_dataset(mt_file_paths, engine_label)
+        for thread_count in thread_counts:
+            file = os.path.join(folder_path, f"mtsubmit-{thread_count}-{engine_label}.json")
+            process_file_to_dataset(file, engine_label, thread_count)
 
     df = pd.DataFrame(data)
     df.set_index(index, inplace=True)
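The per-element normalisation explained in the comment above can be checked with a small worked example; the thread count and timing value below are made up, only the 120-element work queue comes from the benchmark.

# 120 copy operations are split evenly over the worker threads,
# so one thread's reported time covers 120 / n_threads elements
n_threads = 4
elements_per_thread = 120 / n_threads      # 30 copies handled by each thread

measured_thread_time_ns = 3_000_000        # made-up total time of one thread
per_element_time_ns = measured_thread_time_ns / elements_per_thread

print(per_element_time_ns)                 # 100000.0 ns per single copy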
diff --git a/benchmarks/benchmark-plotters/plot-opt-submitmethod.py b/benchmarks/benchmark-plotters/plot-opt-submitmethod.py
deleted file mode 100644
index ae1f5c4..0000000
--- a/benchmarks/benchmark-plotters/plot-opt-submitmethod.py
+++ /dev/null
@@ -1,104 +0,0 @@
-import os
-import json
-import pandas as pd
-from pandas.core.ops import methods
-from typing import List
-import seaborn as sns
-import matplotlib.pyplot as plt
-
-runid = "Run ID"
-x_label = "Size of Submitted Task"
-y_label = "Throughput in GiB/s, LogScale"
-var_label = "Submission Type"
-sizes = ["1kib", "4kib", "1mib", "32mib"]
-sizes_nice = ["1 KiB", "4 KiB", "1 MiB", "32 MiB"]
-types = ["bs10", "bs50", "ms10", "ms50", "ssaw"]
-types_nice = ["Batch, Size 10", "Batch, Size 50", "Multi-Submit, Count 10", "Multi-Submit, Count 50", "Single Submit"]
-title = "Optimal Submission Method - Copy Operation tested Intra-Node on DDR"
-
-index = [runid, x_label, var_label]
-data = []
-
-def calc_throughput(size_bytes,time_microseconds):
-    time_seconds = time_microseconds * 1e-9
-    size_gib = size_bytes / (1024 ** 3)
-    throughput_gibs = size_gib / time_seconds
-    return throughput_gibs
-
-
-def index_from_element(value,array):
-    for (idx,val) in enumerate(array):
-        if val == value: return idx
-    return 0
-
-
-def load_and_process_submit_json(file_path):
-    with open(file_path, 'r') as file:
-        data = json.load(file)
-        iterations = data["list"][0]["task"]["iterations"]
-
-        return {
-            "total": data["list"][0]["report"]["total"] / iterations,
-            "combined": data["list"][0]["report"]["combined"],
-            "submission": data["list"][0]["report"]["submission"],
-            "completion": data["list"][0]["report"]["completion"]
-        }
-
-
-# Function to plot the graph for the new benchmark
-def create_submit_dataset(file_paths, type_label):
-    times = []
-
-    type_index = index_from_element(type_label,types)
-    type_nice = types_nice[type_index]
-
-    idx = 0
-    for file_path in file_paths:
-        time = load_and_process_submit_json(file_path)
-        times.append(time["total"])
-        idx = idx + 1
-
-    # Adjust time measurements based on type
-    # which can contain multiple submissions
-    if type_label in {"bs10", "ms10"}:
-        times = [[t / 10 for t in time] for time in times]
-    elif type_label in {"ms50", "bs50"}:
-        times = [[t / 50 for t in time] for time in times]
-
-    times[0] = [t / 1 for t in times[0]]
-    times[1] = [t / 4 for t in times[1]]
-    times[2] = [t / (1024) for t in times[2]]
-    times[3] = [t / (32*1024) for t in times[3]]
-
-    throughput = [[calc_throughput(1024,time) for time in t] for t in times]
-
-    idx = 0
-    for run_set in throughput:
-        run_idx = 0
-        for run in run_set:
-            data.append({ runid : run_idx, x_label: sizes_nice[idx], var_label : type_nice, y_label : throughput[idx][run_idx]})
-            run_idx = run_idx + 1
-        idx = idx + 1
-
-
-# Main function to iterate over files and create plots for the new benchmark
-def main():
-    folder_path = "benchmark-results/" # Replace with the actual path to your folder
-
-    for type_label in types:
-        file_paths = [os.path.join(folder_path, f"submit-{type_label}-{size}-1e.json") for size in sizes]
-        create_submit_dataset(file_paths, type_label)
-
-    df = pd.DataFrame(data)
-    df.set_index(index, inplace=True)
-    df = df.sort_values(y_label)
-
-    ax = sns.barplot(x=x_label, y=y_label, hue=var_label, data=df, palette="rocket", errorbar="sd")
-    ax.set(yscale="log")
-    sns.move_legend(ax, "lower right")
-    plt.title(title)
-    plt.savefig(os.path.join(folder_path, "plot-opt-submitmethod.png"), bbox_inches='tight')
-    plt.show()
-
-if __name__ == "__main__":
-    main()
\ No newline at end of file
diff --git a/benchmarks/benchmark-plotters/plot-perf-enginelocation.py b/benchmarks/benchmark-plotters/plot-perf-enginelocation.py
index c4df13c..2111cd7 100644
--- a/benchmarks/benchmark-plotters/plot-perf-enginelocation.py
+++ b/benchmarks/benchmark-plotters/plot-perf-enginelocation.py
@@ -18,8 +18,8 @@ title = "Performance of Engine Location - Copy Operation on DDR with 1 Engine pe
 index = [runid, x_label, var_label]
 data = []
 
-def calc_throughput(size_bytes,time_microseconds):
-    time_seconds = time_microseconds * 1e-9
+def calc_throughput(size_bytes,time_ns):
+    time_seconds = time_ns * 1e-9
     size_gib = size_bytes / (1024 ** 3)
     throughput_gibs = size_gib / time_seconds
     return throughput_gibs
@@ -31,14 +31,16 @@ def index_from_element(value,array):
     return 0
 
 
-def load_and_process_copy_json(file_path,method_label):
+def load_time_mesurements(file_path,method_label):
     with open(file_path, 'r') as file:
         data = json.load(file)
         iterations = data["list"][0]["task"]["iterations"]
 
-        # Extracting time from JSON structure
         if method_label == "xcopy":
-            # For xcopy method, add times from two entries and divide by 3
+            # xcopy runs on two engines that both copy 1/2 of the entire
+            # specified size of 1gib, therefore the maximum time between
+            # these two is going to be the total time for copy
+
             time0 = data["list"][0]["report"]["time"]
             time1 = data["list"][1]["report"]["time"]
 
@@ -52,7 +54,6 @@ def load_and_process_copy_json(file_path,method_label):
         else:
             return data["list"][0]["report"]["time"]
 
-# Function to plot the graph for the new benchmark
 def create_copy_dataset(file_path, method_label, type_label):
     method_index = index_from_element(method_label,copy_methods)
     method_nice = copy_methods_nice[method_index]
@@ -66,7 +67,7 @@ def create_copy_dataset(file_path, method_label, type_label):
         data_size = 1024*1024*1024
 
     try:
-        time = load_and_process_copy_json(file_path,method_label)["total"]
+        time = load_time_mesurements(file_path,method_label)["total"]
         run_idx = 0
         for t in time:
             data.append({ runid : run_idx, x_label: type_nice, var_label : method_nice, y_label : calc_throughput(data_size, t)})
@@ -74,7 +75,6 @@
     except FileNotFoundError:
         return
 
-# Main function to iterate over files and create plots for the new benchmark
 def main():
     folder_path = "benchmark-results/"
 
diff --git a/benchmarks/benchmark-plotters/plot-perf-peakthroughput.py b/benchmarks/benchmark-plotters/plot-perf-peakthroughput.py
new file mode 100644
index 0000000..fc65159
--- /dev/null
+++ b/benchmarks/benchmark-plotters/plot-perf-peakthroughput.py
@@ -0,0 +1,80 @@
+import os
+import json
+import pandas as pd
+from pandas.core.ops import methods
+from typing import List
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+runid = "Run ID"
+x_label = "Destination Node"
+y_label = "Source Node"
+v_label = "Throughput"
+title = "Copy Throughput for 1GiB Elements running on SRC Node"
+
+data = []
+
+
+def mean_without_outliers(x):
+    return x.sort_values()[2:-2].mean()
+
+
+def calc_throughput(size_bytes,time_ns):
+    time_seconds = time_ns * 1e-9
+    size_gib = size_bytes / (1024 ** 3)
+    throughput_gibs = size_gib / time_seconds
+    return throughput_gibs
+
+
+def index_from_element(value,array):
+    for (idx,val) in enumerate(array):
+        if val == value: return idx
+    return 0
+
+
+def load_time_mesurements(file_path):
+    with open(file_path, 'r') as file:
+        data = json.load(file)
+        iterations = data["list"][0]["task"]["iterations"]
+
+        return {
+            "total": data["list"][0]["report"]["total"] / iterations,
+            "combined": data["list"][0]["report"]["combined"],
+            "submission": data["list"][0]["report"]["submission"],
+            "completion": data["list"][0]["report"]["completion"]
+        }
+
+
+def process_file_to_dataset(file_path, src_node, dst_node):
+    data_size = 1024*1024*1024
+
+    try:
+        time = load_time_mesurements(file_path)["total"]
+        run_idx = 0
+        for t in time:
+            data.append({ runid : run_idx, x_label : dst_node, y_label : src_node, v_label: calc_throughput(data_size, t)})
+            run_idx = run_idx + 1
+    except FileNotFoundError:
+        return
+
+
+def main():
+    folder_path = "benchmark-results/"
+
+    for src_node in range(16):
+        for dst_node in range(16):
+            file = os.path.join(folder_path, f"copy-n{src_node}ton{dst_node}-1gib-1e.json")
+            process_file_to_dataset(file, src_node, dst_node)
+
+    df = pd.DataFrame(data)
+    data_pivot = df.pivot_table(index=y_label, columns=x_label, values=v_label, aggfunc=mean_without_outliers)
+
+    sns.heatmap(data_pivot, annot=True, cmap="rocket", fmt=".0f")
+
+    plt.title(title)
+    plt.savefig(os.path.join(folder_path, "plot-perf-peakthroughput.png"), bbox_inches='tight')
+    plt.show()
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/benchmarks/benchmark-plotters/plot-perf-submitmethod.py b/benchmarks/benchmark-plotters/plot-perf-submitmethod.py
index c2381bd..22a5cc2 100644
--- a/benchmarks/benchmark-plotters/plot-perf-submitmethod.py
+++ b/benchmarks/benchmark-plotters/plot-perf-submitmethod.py
@@ -8,19 +8,19 @@ import matplotlib.pyplot as plt
 
 runid = "Run ID"
 x_label = "Size of Submitted Task"
-y_label = "Throughput in GiB/s"
+y_label = "Throughput in GiB/s, LogScale"
 var_label = "Submission Type"
 sizes = ["1kib", "4kib", "1mib", "32mib"]
 sizes_nice = ["1 KiB", "4 KiB", "1 MiB", "32 MiB"]
 types = ["bs10", "bs50", "ms10", "ms50", "ssaw"]
 types_nice = ["Batch, Size 10", "Batch, Size 50", "Multi-Submit, Count 10", "Multi-Submit, Count 50", "Single Submit"]
-title = "Performance of Submission Methods - Copy Operation tested Intra-Node on DDR"
+title = "Optimal Submission Method - Copy Operation tested Intra-Node on DDR"
 
 index = [runid, x_label, var_label]
 data = []
 
-def calc_throughput(size_bytes,time_microseconds):
-    time_seconds = time_microseconds * 1e-9
+def calc_throughput(size_bytes,time_ns):
+    time_seconds = time_ns * 1e-9
     size_gib = size_bytes / (1024 ** 3)
     throughput_gibs = size_gib / time_seconds
     return throughput_gibs
@@ -32,64 +32,68 @@ def index_from_element(value,array):
     return 0
 
 
-def load_and_process_submit_json(file_path):
+def load_time_mesurements(file_path,type_label):
     with open(file_path, 'r') as file:
         data = json.load(file)
-        return data["list"][0]["report"]["time"]
+        iterations = data["list"][0]["task"]["iterations"]
+        divisor = 1
 
+        # bs and ms types for submission process more than one
+        # element per run and the results therefore must be
+        # divided by this number
 
-# Function to plot the graph for the new benchmark
-def create_submit_dataset(file_paths, type_label):
-    times = []
+        if type_label in ["bs10", "ms10"]: divisor = 10
+        elif type_label in ["ms50", "bs50"]: divisor = 50
+        else: divisor = 1
 
-    type_index = index_from_element(type_label,types)
-    type_nice = types_nice[type_index]
-
-    idx = 0
-    for file_path in file_paths:
-        time = load_and_process_submit_json(file_path)
-        times.append(time["combined"])
-        idx = idx + 1
-
-    # Adjust time measurements based on type
-    # which can contain multiple submissions
-    if type_label in {"bs10", "ms10"}:
-        times = [[t / 10 for t in time] for time in times]
-    elif type_label in {"ms50", "bs50"}:
-        times = [[t / 50 for t in time] for time in times]
-
-    times[0] = [t / 1 for t in times[0]]
-    times[1] = [t / 4 for t in times[1]]
-    times[2] = [t / (1024) for t in times[2]]
-    times[3] = [t / (32*1024) for t in times[3]]
+        return {
+            "total": data["list"][0]["report"]["total"] / (iterations * divisor),
+            "combined": [ x / divisor for x in data["list"][0]["report"]["combined"]],
+            "submission": [ x / divisor for x in data["list"][0]["report"]["submission"]],
+            "completion": [ x / divisor for x in data["list"][0]["report"]["completion"]]
+        }
 
-    throughput = [[calc_throughput(1024,time) for time in t] for t in times]
 
-    idx = 0
-    for run_set in throughput:
+def process_file_to_dataset(file_path, type_label,size_label):
+    type_index = index_from_element(type_label,types)
+    type_nice = types_nice[type_index]
+    size_index = index_from_element(size_label, sizes)
+    size_nice = sizes_nice[size_index]
+    data_size = 0
+
+    if size_label == "1kib": data_size = 1024;
+    elif size_label == "4kib": data_size = 4 * 1024;
+    elif size_label == "1mib": data_size = 1024 * 1024;
+    elif size_label == "32mib": data_size = 32 * 1024 * 1024;
+    elif size_label == "1gib": data_size = 1024 * 1024 * 1024;
+    else: data_size = 0
+
+    try:
+        time = load_time_mesurements(file_path,type_label)["total"]
         run_idx = 0
-        for run in run_set:
-            data.append({ runid : run_idx, x_label: sizes_nice[idx], var_label : type_nice, y_label : throughput[idx][run_idx]})
+        for t in time:
+            data.append({ runid : run_idx, x_label: size_nice, var_label : type_nice, y_label : calc_throughput(data_size, t)})
             run_idx = run_idx + 1
-        idx = idx + 1
+    except FileNotFoundError:
+        return
+
 
-# Main function to iterate over files and create plots for the new benchmark
 def main():
-    folder_path = "benchmark-results/" # Replace with the actual path to your folder
+    folder_path = "benchmark-results/"
 
     for type_label in types:
-        file_paths = [os.path.join(folder_path, f"submit-{type_label}-{size}-1e.json") for size in sizes]
-        create_submit_dataset(file_paths, type_label)
+        for size in sizes:
+            file = os.path.join(folder_path, f"submit-{type_label}-{size}-1e.json")
+            process_file_to_dataset(file, type_label, size)
 
     df = pd.DataFrame(data)
     df.set_index(index, inplace=True)
     df = df.sort_values(y_label)
 
     sns.barplot(x=x_label, y=y_label, hue=var_label, data=df, palette="rocket", errorbar="sd")
-    plt.title(title)
-    plt.savefig(os.path.join(folder_path, "plot-perf-submitmethod.png"), bbox_inches='tight')
+    plt.savefig(os.path.join(folder_path, "plot-opt-submitmethod.png"), bbox_inches='tight')
     plt.show()
 
 if __name__ == "__main__":
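A quick standalone sanity check of the shared throughput conversion and of the trimmed mean used by the peakthroughput heatmap above; the sample numbers are made up for illustration, while the two helpers mirror the ones defined in the plotters.

import pandas as pd

def calc_throughput(size_bytes, time_ns):
    # same conversion the plotters use: nanoseconds -> seconds, bytes -> GiB
    return (size_bytes / (1024 ** 3)) / (time_ns * 1e-9)

def mean_without_outliers(x):
    # drops the two smallest and two largest samples before averaging
    return x.sort_values()[2:-2].mean()

# copying 1 GiB in 0.5 s should come out as 2 GiB/s
print(calc_throughput(1024 ** 3, 0.5e9))                               # ~2.0

# the trimmed mean ignores the extreme samples 1 and 100
print(mean_without_outliers(pd.Series([100, 10, 10, 1, 10, 10, 10])))  # 10.0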