A2 experiments for losweep updated

hal-uw · Jun 28, 2024 · 62dca0a · 62dca0a
1 parent 237299a
commit 62dca0a
Show file tree

Hide file tree

Showing 7 changed files with 22 additions and 15 deletions.
diff --git a/Readme.md b/Readme.md
@@ -106,15 +106,12 @@ python plot_sia_sim_losweep.py
 We vary the job arrival rate, and in turn, the contention level on the cluster, and measure JCTs to evaluate the performance of our policies. We use the Synergy workload and simulate a 256 GPU cluster. These experiments are also run with three different scheduling policies - FIFO, LAS, and SRTF. 
 
 ### Execution
-We provide a run script `run_synergy_baseline.sh` to vary job load from 8 to 14 jobs/hour and run experiments for all 3 schedulers. 
+We provide a run script `run_synergy_sim.sh` to vary job load from 8 to 14 jobs/hour and run experiments for all 3 schedulers. 
 
-{{ \footnotesize
-\begin{minted}{bash}
+```bash
 conda activate envpal
 ./run_synergy_sim.sh
-\end{minted}
-}}
-
+```
 Lines 441-447 of `simulator_synergy.py` specify the job load and scheduling policies to run as arrays. These can be modified to reduce the number of experiments, and consequently the execution time for A<sub>3</sub>. 
 
 To reproduce Figure 13 in the paper, run the following plotting script:

diff --git a/blox/blox_manager.py b/blox/blox_manager.py
@@ -48,6 +48,7 @@ def __init__(self, args: argparse.ArgumentParser) -> None:
         self.first_submit_time = None
         self.last_round_time = 0
         self.terminate = False
+        self.locality_penalty = args.locality_penalty
         return None
 
     def reset(self, args: argparse.ArgumentParser) -> None:
@@ -436,7 +437,7 @@ def _get_locality_penalty(self, gpu_df, gpus_to_launch, job_model):
         else:
             penalty = 1.0
 
-        const_lf = 1.7  # const_lf locality penalty if allocation is ACROSS nodes   
+        const_lf = self.locality_penalty # const_lf locality penalty if allocation is ACROSS nodes   
 
         node_df = gpu_df[gpu_df["GPU_ID"].isin(gpus_to_launch)]
         unique_node_ids = list(node_df["Node_ID"].unique())

diff --git a/blox_new_flow_multi_run.py b/blox_new_flow_multi_run.py
@@ -199,6 +199,9 @@ def parse_args(parser):
     parser.add_argument(
         "--round-duration", type=int, default=300, help="Round duration in seconds"
     )
+    parser.add_argument(
+        "--locality-penalty", type=float, default=1.7, help="Locality Penalty (Cost for allocation spilling across nodes)"
+    )
     parser.add_argument(
         "--start-id-track", type=int, default=3000, help="Starting ID to track"
     )

diff --git a/placement/pal.py b/placement/pal.py
@@ -18,6 +18,7 @@ def __init__(self, args):
         self.alloc_order = {}
         # per class dataframes containing GPU_ID, Node_ID and sf for all GPUs
         self.dict_of_dfs = {}
+        self.locality_penalty = args.locality_penalty
         pass
 
     @staticmethod
@@ -54,7 +55,7 @@ def place(
         """
         if not bool(self.alloc_order):
             if not gpu_df.empty:   #get alloc order based on L-V Matrix only if there are GPUs registered
-                self.alloc_order, self.dict_of_dfs = get_slowdown_factors(gpu_df)
+                self.alloc_order, self.dict_of_dfs = get_slowdown_factors(gpu_df, self.locality_penalty)
         #gpu_df.to_csv("/scratch1/08503/rnjain/blox-pal/logs/pal-runs/gpu_df.csv")
         job_order = new_job_schedule["job_order"]
         scheduler = new_job_schedule.get("scheduler")
@@ -498,7 +499,7 @@ def get_optimal_k(data: pd.DataFrame, outliers: pd.Series) -> int:
         #optimal_k += num_outliers
         return optimal_k
 
-def get_slowdown_factors(gpu_df: pd.DataFrame) ->  Tuple[dict,dict]:
+def get_slowdown_factors(gpu_df: pd.DataFrame, locality_penalty) ->  Tuple[dict,dict]:
     slowdown_factors = {}
     locality_dict = {}
     dict_of_dfs = {}
@@ -590,7 +591,7 @@ def get_slowdown_factors(gpu_df: pd.DataFrame) ->  Tuple[dict,dict]:
         lf_within        = 1.0
         # Constant locality penalty of 1.7
         # lf_across        = lf_model[key]
-        lf_across        = 1.7
+        lf_across        = locality_penalty
         sfs_list = df_copy['sf'].unique()
 
         # Ordering of sfs vs locality for each class

diff --git a/plot_sia_sim_baseline.py b/plot_sia_sim_baseline.py
@@ -63,8 +63,8 @@ def _get_jct_dist(time_dict):
 exp_prefix = "None"
 placement = ["Default-Packed-S", "Default-Packed-NS", "Default-Random-NS", "Default-Random-S", "PMFirst", "PAL"]
 placement_labels = {
-    "Default-Packed-S": "Packed-Sticky",
-    "Default-Packed-NS": "Packed-Non-Sticky", 
+    "Default-Packed-S": "Packed-Non-Sticky",
+    "Default-Packed-NS": "Packed-Sticky", 
     "Default-Random-S": "Random-Sticky", 
     "Default-Random-NS": "Random-Non-Sticky",
     "PMFirst":"PMFirst",

diff --git a/plot_sia_sim_losweep.py b/plot_sia_sim_losweep.py
@@ -149,7 +149,7 @@ def _get_jct_dist(time_dict):
 
 
 # Sort the DataFrame for better presentation
-df_jct.sort_values(by=['placement', 'workload'], inplace=True)
+df_jct.sort_values(by=['placement', 'locality_penalty'], inplace=True)
 
 # print(result_df)
 # result_df.to_csv("geomean.csv")
@@ -171,7 +171,7 @@ def _get_jct_dist(time_dict):
 # Sort the dataframe based on avg_jct in descending order
 g = sns.catplot(
     x="locality_penalty",
-    y="norm_perf",
+    y="avg_jct",
     hue="placement",
     hue_order=custom_order,
     legend=False,

diff --git a/run_sia_sim_losweep.sh b/run_sia_sim_losweep.sh
@@ -23,7 +23,12 @@ for lf in "${locality_penalty[@]}"; do
     --trace $trace_path &
     pid1=$!
 
-    PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python python blox_new_flow_multi_run.py --start-id-track 0 --stop-id-track 159 --round-duration 360 --simulate &
+    PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python python blox_new_flow_multi_run.py \
+    --start-id-track 0 \
+    --stop-id-track 159 \
+    --round-duration 360 \
+    --locality-penalty $lf \
+    --simulate &
     pid2=$!
 
     # Wait for resource_manager.py to finish