From 62dca0a7f77c53d2f69b7c73eedd724623166230 Mon Sep 17 00:00:00 2001
From: Rutwik
Date: Thu, 27 Jun 2024 22:41:54 -0500
Subject: [PATCH] A2 experiments for losweep updated

---
 Readme.md                  | 9 +++------
 blox/blox_manager.py       | 3 ++-
 blox_new_flow_multi_run.py | 3 +++
 placement/pal.py           | 7 ++++---
 plot_sia_sim_baseline.py   | 4 ++--
 plot_sia_sim_losweep.py    | 4 ++--
 run_sia_sim_losweep.sh     | 7 ++++++-
 7 files changed, 22 insertions(+), 15 deletions(-)

diff --git a/Readme.md b/Readme.md
index c829bd7..96c5e41 100644
--- a/Readme.md
+++ b/Readme.md
@@ -106,15 +106,12 @@ python plot_sia_sim_losweep.py
 We vary the job arrival rate, and in turn, the contention level on the cluster, and measure JCTs to evaluate the performance of our policies. We use the Synergy workload and simulate a 256 GPU cluster. These experiments are also run with three different scheduling policies - FIFO, LAS, and SRTF.
 
 ### Execution
-We provide a run script `run_synergy_baseline.sh` to vary job load from 8 to 14 jobs/hour and run experiments for all 3 schedulers.
+We provide a run script `run_synergy_sim.sh` to vary the job load from 8 to 14 jobs/hour and run experiments for all 3 schedulers.
 
-{{ \footnotesize
-\begin{minted}{bash}
+```bash
 conda activate envpal
 ./run_synergy_sim.sh
-\end{minted}
-}}
-
+```
 Lines 441-447 of `simulator_synergy.py` specify the job load and scheduling policies to run as arrays. These can be modified to reduce the number of experiments, and consequently the execution time for A3.
 
 To reproduce Figure 13 in the paper, run the following plotting script:
diff --git a/blox/blox_manager.py b/blox/blox_manager.py
index dae8215..99a5f5b 100644
--- a/blox/blox_manager.py
+++ b/blox/blox_manager.py
@@ -48,6 +48,7 @@ def __init__(self, args: argparse.ArgumentParser) -> None:
         self.first_submit_time = None
         self.last_round_time = 0
         self.terminate = False
+        self.locality_penalty = args.locality_penalty
         return None
 
     def reset(self, args: argparse.ArgumentParser) -> None:
@@ -436,7 +437,7 @@ def _get_locality_penalty(self, gpu_df, gpus_to_launch, job_model):
         else:
             penalty = 1.0
 
-        const_lf = 1.7 # const_lf locality penalty if allocation is ACROSS nodes
+        const_lf = self.locality_penalty  # locality penalty applied if the allocation is ACROSS nodes
         node_df = gpu_df[gpu_df["GPU_ID"].isin(gpus_to_launch)]
         unique_node_ids = list(node_df["Node_ID"].unique())
 
diff --git a/blox_new_flow_multi_run.py b/blox_new_flow_multi_run.py
index 69c9219..35e91c7 100644
--- a/blox_new_flow_multi_run.py
+++ b/blox_new_flow_multi_run.py
@@ -199,6 +199,9 @@ def parse_args(parser):
     parser.add_argument(
         "--round-duration", type=int, default=300, help="Round duration in seconds"
     )
+    parser.add_argument(
+        "--locality-penalty", type=float, default=1.7, help="Locality penalty (slowdown factor for allocations that spill across nodes)"
+    )
     parser.add_argument(
         "--start-id-track", type=int, default=3000, help="Starting ID to track"
     )
diff --git a/placement/pal.py b/placement/pal.py
index 996ab7a..d9402c4 100644
--- a/placement/pal.py
+++ b/placement/pal.py
@@ -18,6 +18,7 @@ def __init__(self, args):
         self.alloc_order = {}
         # per class dataframes containing GPU_ID, Node_ID and sf for all GPUs
         self.dict_of_dfs = {}
+        self.locality_penalty = args.locality_penalty
         pass
 
     @staticmethod
@@ -54,7 +55,7 @@ def place(
         """
         if not bool(self.alloc_order):
             if not gpu_df.empty: #get alloc order based on L-V Matrix only if there are GPUs registered
-                self.alloc_order, self.dict_of_dfs = get_slowdown_factors(gpu_df)
+                self.alloc_order, self.dict_of_dfs = get_slowdown_factors(gpu_df, self.locality_penalty)
         #gpu_df.to_csv("/scratch1/08503/rnjain/blox-pal/logs/pal-runs/gpu_df.csv")
         job_order = new_job_schedule["job_order"]
         scheduler = new_job_schedule.get("scheduler")
@@ -498,7 +499,7 @@ def get_optimal_k(data: pd.DataFrame, outliers: pd.Series) -> int:
     #optimal_k += num_outliers
     return optimal_k
 
-def get_slowdown_factors(gpu_df: pd.DataFrame) -> Tuple[dict,dict]:
+def get_slowdown_factors(gpu_df: pd.DataFrame, locality_penalty: float) -> Tuple[dict,dict]:
     slowdown_factors = {}
     locality_dict = {}
     dict_of_dfs = {}
@@ -590,7 +591,7 @@ def get_slowdown_factors(gpu_df: pd.DataFrame) -> Tuple[dict,dict]:
         lf_within = 1.0
         # Constant locality penalty of 1.7
         # lf_across = lf_model[key]
-        lf_across = 1.7
+        lf_across = locality_penalty
         sfs_list = df_copy['sf'].unique()
 
         # Ordering of sfs vs locality for each class
diff --git a/plot_sia_sim_baseline.py b/plot_sia_sim_baseline.py
index 80ae9db..2c348ca 100644
--- a/plot_sia_sim_baseline.py
+++ b/plot_sia_sim_baseline.py
@@ -63,8 +63,8 @@ def _get_jct_dist(time_dict):
 exp_prefix = "None"
 placement = ["Default-Packed-S", "Default-Packed-NS", "Default-Random-NS", "Default-Random-S", "PMFirst", "PAL"]
 placement_labels = {
-    "Default-Packed-S": "Packed-Sticky",
-    "Default-Packed-NS": "Packed-Non-Sticky",
+    "Default-Packed-S": "Packed-Non-Sticky",
+    "Default-Packed-NS": "Packed-Sticky",
     "Default-Random-S": "Random-Sticky",
     "Default-Random-NS": "Random-Non-Sticky",
     "PMFirst":"PMFirst",
diff --git a/plot_sia_sim_losweep.py b/plot_sia_sim_losweep.py
index cea6de0..7c5a6a0 100644
--- a/plot_sia_sim_losweep.py
+++ b/plot_sia_sim_losweep.py
@@ -149,7 +149,7 @@ def _get_jct_dist(time_dict):
 
 
 # Sort the DataFrame for better presentation
-df_jct.sort_values(by=['placement', 'workload'], inplace=True)
+df_jct.sort_values(by=['placement', 'locality_penalty'], inplace=True)
 # print(result_df)
 # result_df.to_csv("geomean.csv")
 
@@ -171,7 +171,7 @@ def _get_jct_dist(time_dict):
 # Sort the dataframe based on avg_jct in descending order
 g = sns.catplot(
     x="locality_penalty",
-    y="norm_perf",
+    y="avg_jct",
     hue="placement",
     hue_order=custom_order,
     legend=False,
diff --git a/run_sia_sim_losweep.sh b/run_sia_sim_losweep.sh
index 8d97fd1..82dd428 100755
--- a/run_sia_sim_losweep.sh
+++ b/run_sia_sim_losweep.sh
@@ -23,7 +23,12 @@ for lf in "${locality_penalty[@]}"; do
         --trace $trace_path &
     pid1=$!
 
-    PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python python blox_new_flow_multi_run.py --start-id-track 0 --stop-id-track 159 --round-duration 360 --simulate &
+    PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python python blox_new_flow_multi_run.py \
+        --start-id-track 0 \
+        --stop-id-track 159 \
+        --round-duration 360 \
+        --locality-penalty "$lf" \
+        --simulate &
     pid2=$!
 
     # Wait for resource_manager.py to finish
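
Usage note (not part of the patch): the sweep script drives the new `--locality-penalty` flag through the `locality_penalty` array, but the flag can also be set directly for a single run. The sketch below mirrors the invocation in `run_sia_sim_losweep.sh`; `2.0` is an arbitrary example value, and the ID range and round duration are simply the script's values, not requirements.

```bash
# Single simulated run with a custom locality penalty.
# --locality-penalty defaults to 1.7, the constant previously hard-coded
# in blox/blox_manager.py and placement/pal.py.
PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python python blox_new_flow_multi_run.py \
    --start-id-track 0 \
    --stop-id-track 159 \
    --round-duration 360 \
    --locality-penalty 2.0 \
    --simulate
```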