Skip to content

Commit

Permalink
A2 experiments for losweep updated
Browse files Browse the repository at this point in the history
  • Loading branch information
Rutwik authored and Rutwik committed Jun 28, 2024
1 parent 237299a commit 62dca0a
Show file tree
Hide file tree
Showing 7 changed files with 22 additions and 15 deletions.
9 changes: 3 additions & 6 deletions Readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -106,15 +106,12 @@ python plot_sia_sim_losweep.py
We vary the job arrival rate and, in turn, the contention level on the cluster, and measure JCTs to evaluate the performance of our policies. We use the Synergy workload and simulate a 256-GPU cluster. These experiments are also run with three different scheduling policies: FIFO, LAS, and SRTF.

### Execution
We provide a run script `run_synergy_baseline.sh` to vary job load from 8 to 14 jobs/hour and run experiments for all 3 schedulers.
We provide a run script `run_synergy_sim.sh` to vary job load from 8 to 14 jobs/hour and run experiments for all 3 schedulers.

{{ \footnotesize
\begin{minted}{bash}
```
conda activate envpal
./run_synergy_sim.sh
\end{minted}
}}

```
Lines 441-447 of `simulator_synergy.py` specify, as arrays, the job loads and scheduling policies to run. These can be modified to reduce the number of experiments and, consequently, the execution time for A<sub>3</sub>.

To reproduce Figure 13 in the paper, run the following plotting script:
Expand Down
3 changes: 2 additions & 1 deletion blox/blox_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ def __init__(self, args: argparse.ArgumentParser) -> None:
self.first_submit_time = None
self.last_round_time = 0
self.terminate = False
self.locality_penalty = args.locality_penalty
return None

def reset(self, args: argparse.ArgumentParser) -> None:
Expand Down Expand Up @@ -436,7 +437,7 @@ def _get_locality_penalty(self, gpu_df, gpus_to_launch, job_model):
else:
penalty = 1.0

const_lf = 1.7 # const_lf locality penalty if allocation is ACROSS nodes
const_lf = self.locality_penalty # const_lf locality penalty if allocation is ACROSS nodes

node_df = gpu_df[gpu_df["GPU_ID"].isin(gpus_to_launch)]
unique_node_ids = list(node_df["Node_ID"].unique())
Expand Down
3 changes: 3 additions & 0 deletions blox_new_flow_multi_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,9 @@ def parse_args(parser):
parser.add_argument(
"--round-duration", type=int, default=300, help="Round duration in seconds"
)
parser.add_argument(
"--locality-penalty", type=float, default=1.7, help="Locality Penalty (Cost for allocation spilling across nodes)"
)
parser.add_argument(
"--start-id-track", type=int, default=3000, help="Starting ID to track"
)
Expand Down
7 changes: 4 additions & 3 deletions placement/pal.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ def __init__(self, args):
self.alloc_order = {}
# per class dataframes containing GPU_ID, Node_ID and sf for all GPUs
self.dict_of_dfs = {}
self.locality_penalty = args.locality_penalty
pass

@staticmethod
Expand Down Expand Up @@ -54,7 +55,7 @@ def place(
"""
if not bool(self.alloc_order):
if not gpu_df.empty: #get alloc order based on L-V Matrix only if there are GPUs registered
self.alloc_order, self.dict_of_dfs = get_slowdown_factors(gpu_df)
self.alloc_order, self.dict_of_dfs = get_slowdown_factors(gpu_df, self.locality_penalty)
#gpu_df.to_csv("/scratch1/08503/rnjain/blox-pal/logs/pal-runs/gpu_df.csv")
job_order = new_job_schedule["job_order"]
scheduler = new_job_schedule.get("scheduler")
Expand Down Expand Up @@ -498,7 +499,7 @@ def get_optimal_k(data: pd.DataFrame, outliers: pd.Series) -> int:
#optimal_k += num_outliers
return optimal_k

def get_slowdown_factors(gpu_df: pd.DataFrame) -> Tuple[dict,dict]:
def get_slowdown_factors(gpu_df: pd.DataFrame, locality_penalty) -> Tuple[dict,dict]:
slowdown_factors = {}
locality_dict = {}
dict_of_dfs = {}
Expand Down Expand Up @@ -590,7 +591,7 @@ def get_slowdown_factors(gpu_df: pd.DataFrame) -> Tuple[dict,dict]:
lf_within = 1.0
# Locality penalty for allocations that span nodes (configurable via --locality-penalty; defaults to 1.7)
# lf_across = lf_model[key]
lf_across = 1.7
lf_across = locality_penalty
sfs_list = df_copy['sf'].unique()

# Ordering of sfs vs locality for each class
Expand Down
4 changes: 2 additions & 2 deletions plot_sia_sim_baseline.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,8 @@ def _get_jct_dist(time_dict):
exp_prefix = "None"
placement = ["Default-Packed-S", "Default-Packed-NS", "Default-Random-NS", "Default-Random-S", "PMFirst", "PAL"]
placement_labels = {
"Default-Packed-S": "Packed-Sticky",
"Default-Packed-NS": "Packed-Non-Sticky",
"Default-Packed-S": "Packed-Non-Sticky",
"Default-Packed-NS": "Packed-Sticky",
"Default-Random-S": "Random-Sticky",
"Default-Random-NS": "Random-Non-Sticky",
"PMFirst":"PMFirst",
Expand Down
4 changes: 2 additions & 2 deletions plot_sia_sim_losweep.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ def _get_jct_dist(time_dict):


# Sort the DataFrame for better presentation
df_jct.sort_values(by=['placement', 'workload'], inplace=True)
df_jct.sort_values(by=['placement', 'locality_penalty'], inplace=True)

# print(result_df)
# result_df.to_csv("geomean.csv")
Expand All @@ -171,7 +171,7 @@ def _get_jct_dist(time_dict):
# Sort the dataframe based on avg_jct in descending order
g = sns.catplot(
x="locality_penalty",
y="norm_perf",
y="avg_jct",
hue="placement",
hue_order=custom_order,
legend=False,
Expand Down
7 changes: 6 additions & 1 deletion run_sia_sim_losweep.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,12 @@ for lf in "${locality_penalty[@]}"; do
--trace $trace_path &
pid1=$!

PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python python blox_new_flow_multi_run.py --start-id-track 0 --stop-id-track 159 --round-duration 360 --simulate &
PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python python blox_new_flow_multi_run.py \
--start-id-track 0 \
--stop-id-track 159 \
--round-duration 360 \
--locality-penalty $lf \
--simulate &
pid2=$!

# Wait for resource_manager.py to finish
Expand Down

0 comments on commit 62dca0a

Please sign in to comment.