From 62dca0a7f77c53d2f69b7c73eedd724623166230 Mon Sep 17 00:00:00 2001
From: Rutwik
Date: Thu, 27 Jun 2024 22:41:54 -0500
Subject: [PATCH] A2 experiments for losweep updated

---
 Readme.md                  | 9 +++------
 blox/blox_manager.py       | 3 ++-
 blox_new_flow_multi_run.py | 3 +++
 placement/pal.py           | 7 ++++---
 plot_sia_sim_baseline.py   | 4 ++--
 plot_sia_sim_losweep.py    | 4 ++--
 run_sia_sim_losweep.sh     | 7 ++++++-
 7 files changed, 22 insertions(+), 15 deletions(-)

diff --git a/Readme.md b/Readme.md
index c829bd7..96c5e41 100644
--- a/Readme.md
+++ b/Readme.md
@@ -106,15 +106,12 @@ python plot_sia_sim_losweep.py
 We vary the job arrival rate, and in turn, the contention level on the cluster, and measure JCTs to evaluate the performance of our policies. We use the Synergy workload and simulate a 256 GPU cluster. These experiments are also run with three different scheduling policies - FIFO, LAS, and SRTF.
 
 ### Execution
-We provide a run script `run_synergy_baseline.sh` to vary job load from 8 to 14 jobs/hour and run experiments for all 3 schedulers.
+We provide a run script `run_synergy_sim.sh` to vary the job load from 8 to 14 jobs/hour and run experiments for all 3 schedulers.
 
-{{ \footnotesize
-\begin{minted}{bash}
+```bash
 conda activate envpal
 ./run_synergy_sim.sh
-\end{minted}
-}}
-
+```
 Lines 441-447 of `simulator_synergy.py` specify the job load and scheduling policies to run as arrays. These can be modified to reduce the number of experiments, and consequently the execution time for A3.
 
 To reproduce Figure 13 in the paper, run the following plotting script:
diff --git a/blox/blox_manager.py b/blox/blox_manager.py
index dae8215..99a5f5b 100644
--- a/blox/blox_manager.py
+++ b/blox/blox_manager.py
@@ -48,6 +48,7 @@ def __init__(self, args: argparse.ArgumentParser) -> None:
         self.first_submit_time = None
         self.last_round_time = 0
         self.terminate = False
+        self.locality_penalty = args.locality_penalty
         return None
 
     def reset(self, args: argparse.ArgumentParser) -> None:
@@ -436,7 +437,7 @@ def _get_locality_penalty(self, gpu_df, gpus_to_launch, job_model):
         else:
             penalty = 1.0
 
-        const_lf = 1.7 # const_lf locality penalty if allocation is ACROSS nodes
+        const_lf = self.locality_penalty  # locality penalty applied if the allocation is ACROSS nodes
         node_df = gpu_df[gpu_df["GPU_ID"].isin(gpus_to_launch)]
         unique_node_ids = list(node_df["Node_ID"].unique())
 
diff --git a/blox_new_flow_multi_run.py b/blox_new_flow_multi_run.py
index 69c9219..35e91c7 100644
--- a/blox_new_flow_multi_run.py
+++ b/blox_new_flow_multi_run.py
@@ -199,6 +199,9 @@ def parse_args(parser):
     parser.add_argument(
         "--round-duration", type=int, default=300, help="Round duration in seconds"
     )
+    parser.add_argument(
+        "--locality-penalty", type=float, default=1.7, help="Locality penalty (slowdown factor for allocations that spill across nodes)"
+    )
     parser.add_argument(
         "--start-id-track", type=int, default=3000, help="Starting ID to track"
     )
diff --git a/placement/pal.py b/placement/pal.py
index 996ab7a..d9402c4 100644
--- a/placement/pal.py
+++ b/placement/pal.py
@@ -18,6 +18,7 @@ def __init__(self, args):
         self.alloc_order = {}
         # per class dataframes containing GPU_ID, Node_ID and sf for all GPUs
         self.dict_of_dfs = {}
+        self.locality_penalty = args.locality_penalty
         pass
 
     @staticmethod
@@ -54,7 +55,7 @@ def place(
         """
         if not bool(self.alloc_order):
             if not gpu_df.empty: #get alloc order based on L-V Matrix only if there are GPUs registered
-                self.alloc_order, self.dict_of_dfs = get_slowdown_factors(gpu_df)
+                self.alloc_order, self.dict_of_dfs = get_slowdown_factors(gpu_df, self.locality_penalty)
         #gpu_df.to_csv("/scratch1/08503/rnjain/blox-pal/logs/pal-runs/gpu_df.csv")
         job_order = new_job_schedule["job_order"]
         scheduler = new_job_schedule.get("scheduler")
@@ -498,7 +499,7 @@ def get_optimal_k(data: pd.DataFrame, outliers: pd.Series) -> int:
     #optimal_k += num_outliers
     return optimal_k
 
-def get_slowdown_factors(gpu_df: pd.DataFrame) -> Tuple[dict,dict]:
+def get_slowdown_factors(gpu_df: pd.DataFrame, locality_penalty: float) -> Tuple[dict,dict]:
     slowdown_factors = {}
     locality_dict = {}
     dict_of_dfs = {}
@@ -590,7 +591,7 @@ def get_slowdown_factors(gpu_df: pd.DataFrame) -> Tuple[dict,dict]:
         lf_within = 1.0
         # Constant locality penalty of 1.7
         # lf_across = lf_model[key]
-        lf_across = 1.7
+        lf_across = locality_penalty
         sfs_list = df_copy['sf'].unique()
 
         # Ordering of sfs vs locality for each class
diff --git a/plot_sia_sim_baseline.py b/plot_sia_sim_baseline.py
index 80ae9db..2c348ca 100644
--- a/plot_sia_sim_baseline.py
+++ b/plot_sia_sim_baseline.py
@@ -63,8 +63,8 @@ def _get_jct_dist(time_dict):
 exp_prefix = "None"
 placement = ["Default-Packed-S", "Default-Packed-NS", "Default-Random-NS", "Default-Random-S", "PMFirst", "PAL"]
 placement_labels = {
-    "Default-Packed-S": "Packed-Sticky",
-    "Default-Packed-NS": "Packed-Non-Sticky",
+    "Default-Packed-S": "Packed-Non-Sticky",
+    "Default-Packed-NS": "Packed-Sticky",
     "Default-Random-S": "Random-Sticky",
     "Default-Random-NS": "Random-Non-Sticky",
     "PMFirst":"PMFirst",
diff --git a/plot_sia_sim_losweep.py b/plot_sia_sim_losweep.py
index cea6de0..7c5a6a0 100644
--- a/plot_sia_sim_losweep.py
+++ b/plot_sia_sim_losweep.py
@@ -149,7 +149,7 @@ def _get_jct_dist(time_dict):
 
 
 # Sort the DataFrame for better presentation
-df_jct.sort_values(by=['placement', 'workload'], inplace=True)
+df_jct.sort_values(by=['placement', 'locality_penalty'], inplace=True)
 # print(result_df)
 # result_df.to_csv("geomean.csv")
 
@@ -171,7 +171,7 @@ def _get_jct_dist(time_dict):
 # Sort the dataframe based on avg_jct in descending order
 g = sns.catplot(
     x="locality_penalty",
-    y="norm_perf",
+    y="avg_jct",
     hue="placement",
     hue_order=custom_order,
     legend=False,
diff --git a/run_sia_sim_losweep.sh b/run_sia_sim_losweep.sh
index 8d97fd1..82dd428 100755
--- a/run_sia_sim_losweep.sh
+++ b/run_sia_sim_losweep.sh
@@ -23,7 +23,12 @@ for lf in "${locality_penalty[@]}"; do
         --trace $trace_path &
     pid1=$!
 
-    PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python python blox_new_flow_multi_run.py --start-id-track 0 --stop-id-track 159 --round-duration 360 --simulate &
+    PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python python blox_new_flow_multi_run.py \
+        --start-id-track 0 \
+        --stop-id-track 159 \
+        --round-duration 360 \
+        --locality-penalty "$lf" \
+        --simulate &
     pid2=$!
 
     # Wait for resource_manager.py to finish
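
Usage note (not part of the patch): the sweep script drives the new `--locality-penalty` flag through the `locality_penalty` array, but the flag can also be set directly for a single run. The sketch below mirrors the invocation in `run_sia_sim_losweep.sh`; `2.0` is an arbitrary example value, and the ID range and round duration are simply the script's values, not requirements.

```bash
# Single simulated run with a custom locality penalty.
# --locality-penalty defaults to 1.7, the constant previously hard-coded
# in blox/blox_manager.py and placement/pal.py.
PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python python blox_new_flow_multi_run.py \
    --start-id-track 0 \
    --stop-id-track 159 \
    --round-duration 360 \
    --locality-penalty 2.0 \
    --simulate
```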