From 8fd8302c73a4dbd8da88aed414100359becbf062 Mon Sep 17 00:00:00 2001 From: Sukrit Kalra Date: Mon, 29 Jan 2024 11:53:11 -0800 Subject: [PATCH] Updated scripts / numbers for motivation. --- main.py | 6 + .../workers/alibaba_cluster_30_slots.yaml | 6 + schedulers/tetrisched_scheduler.py | 4 +- scripts/run_alibaba_motivation_experiments.sh | 156 ++++++++++++++++++ 4 files changed, 171 insertions(+), 1 deletion(-) create mode 100644 profiles/workers/alibaba_cluster_30_slots.yaml create mode 100755 scripts/run_alibaba_motivation_experiments.sh diff --git a/main.py b/main.py index 2aa6aa21..c501e4cc 100644 --- a/main.py +++ b/main.py @@ -451,6 +451,12 @@ "If `scheduler_selective_rescheduling` is True, then this flag defines the number " "of TaskGraphs to sample for rescheduling.", ) +flags.DEFINE_float( + "scheduler_reconsideration_period", + 0.1, + "The percentage of critical path duration until which the scheduler will try " + "placing the TaskGraph, and drop the TaskGraph if it cannot be placed after.", +) # Workload definition related flags. flags.DEFINE_integer( diff --git a/profiles/workers/alibaba_cluster_30_slots.yaml b/profiles/workers/alibaba_cluster_30_slots.yaml new file mode 100644 index 00000000..7e48e942 --- /dev/null +++ b/profiles/workers/alibaba_cluster_30_slots.yaml @@ -0,0 +1,6 @@ +- name: WorkerPool_1 + workers: + - name: Worker_1_1 + resources: + - name: Slot_1 + quantity: 30 diff --git a/schedulers/tetrisched_scheduler.py b/schedulers/tetrisched_scheduler.py index aa737878..9e0624eb 100644 --- a/schedulers/tetrisched_scheduler.py +++ b/schedulers/tetrisched_scheduler.py @@ -248,7 +248,9 @@ def __init__( # the release time and the deadline. So, if a TaskGraph was released at 100 and # has a deadline of 500, it will be retried until scheduler invocations upto # 180, and will be dropped after. - self._task_graph_reconsideration_period = 0.10 + self._task_graph_reconsideration_period = ( + 0.10 if _flags is None else _flags.scheduler_reconsideration_period + ) self._previously_considered_task_graphs: Set[str] = set() # A cache for the STRLs generated for individual tasks. diff --git a/scripts/run_alibaba_motivation_experiments.sh b/scripts/run_alibaba_motivation_experiments.sh new file mode 100755 index 00000000..df1b5e7f --- /dev/null +++ b/scripts/run_alibaba_motivation_experiments.sh @@ -0,0 +1,156 @@ +#!/bin/bash +# Move to the simulator directory. +if [[ -z ${ERDOS_SIMULATOR_DIR} ]]; then + echo "[x] ERRROR: ERDOS_SIMULATOR_DIR is not set" + exit 1 +fi +cd ${ERDOS_SIMULATOR_DIR} + +LOG_DIR=$1 +if [[ -z ${LOG_DIR} ]]; then + echo "[x] ERROR: Please provide a directory to output results to as the first argument." + exit 2 +fi + + +# Random seeds. +# We use different seeds so that we can run with a different set of TaskGraphs +# being chosen from the trace, along with different arrival patterns. +RANDOM_SEEDS=(420665456 9165261 106432512 95498947 105937176 66362485 416681780) + +# Schedulers +# We use the following baseline schedulers to compare the performance of DAGSched with. +SCHEDULERS=(EDF TetriSched GraphenePrime DAGSched) + +# Poisson arrival rates. +# We use the following arrival rates for the Poisson arrival process. +MEDIUM_ARRIVAL_RATES=(0.0075 0.005 0.0046 0.004 0.0036) +HARD_ARRIVAL_RATES=(0.0025 0.0025 0.002 0.002 0.0015) + +execute_experiment () { + SCHEDULER=$1 + RANDOM_SEED=$2 + LOG_DIR=$3 + LOG_BASE="run_${RANDOM_SEED}" + MEDIUM_ARRIVAL_RATE=$4 + HARD_ARRIVAL_RATE=$5 + + EXPERIMENT_DIR="${LOG_DIR}/${SCHEDULER}/${LOG_BASE}/arrival_rate_${MEDIUM_ARRIVAL_RATE}_${HARD_ARRIVAL_RATE}" + mkdir -p ${EXPERIMENT_DIR} + + if [ -f "${EXPERIMENT_DIR}/alibaba_trace_replay.csv" ]; then + echo "[x] The experiment for ${SCHEDULER} with random seed ${RANDOM_SEED} has already been run." + return + fi + + # Build the baseline configuration for the experiment. + EXPERIMENT_CONF="\ + # Output configuration. + --log_dir=${EXPERIMENT_DIR} + --log_file_name=alibaba_trace_replay.log + --csv_file_name=alibaba_trace_replay.csv + --log_level=debug + " + + EXPERIMENT_CONF+=" + # Worker configuration. + --worker_profile_path=profiles/workers/alibaba_cluster_30_slots.yaml + " + + EXPERIMENT_CONF+=" + # Workload configuration. + --execution_mode=replay + --replay_trace=alibaba + --workload_profile_paths=traces/alibaba-cluster-trace-v2018/easy_dag_sukrit_10k.pkl,traces/alibaba-cluster-trace-v2018/medium_dag_sukrit_10k.pkl,traces/alibaba-cluster-trace-v2018/hard_dag_sukrit_10k.pkl + --workload_profile_path_labels=easy,medium,hard + --override_release_policies=poisson,poisson,poisson + --override_num_invocations=0,200,100 + --override_poisson_arrival_rates=0.0075,${MEDIUM_ARRIVAL_RATE},${HARD_ARRIVAL_RATE} + --randomize_start_time_max=50 + --min_deadline=5 + --max_deadline=500 + --min_deadline_variances=25,50,10 + --max_deadline_variances=50,100,25 + + # Loader configuration. + --alibaba_loader_task_cpu_divisor=10 + --alibaba_loader_min_critical_path_runtimes=200,500,600 + --alibaba_loader_max_critical_path_runtimes=500,1000,1000 + " + + if [[ ${SCHEDULER} == "EDF" ]]; then + EXPERIMENT_CONF+=" + # Scheduler configuration. + --scheduler=EDF + --enforce_deadlines + " + elif [[ ${SCHEDULER} == "TetriSched" ]]; then + EXPERIMENT_CONF+=" + # Scheduler configuration. + --scheduler=TetriSched + --enforce_deadlines + --scheduler_time_discretization=1 + --scheduler_enable_optimization_pass + --retract_schedules + --scheduler_time_limit=120 + " + elif [[ ${SCHEDULER} == "GraphenePrime" ]]; then + EXPERIMENT_CONF+=" + # Scheduler configuration. + --scheduler=GraphenePrime + --scheduler_selective_rescheduling + --scheduler_selective_rescheduling_sample_size=2 + --scheduler_time_discretization=2 + --scheduler_enable_optimization_pass + --scheduler_plan_ahead=1000 + --retract_schedules + --scheduler_time_limit=120 + " + elif [[ ${SCHEDULER} == "DAGSched" ]]; then + EXPERIMENT_CONF+=" + # Scheduler configuration. + --scheduler=TetriSched + --release_taskgraphs + --enforce_deadlines + --scheduler_time_discretization=1 + --scheduler_enable_optimization_pass + --scheduler_reconsideration_period=0.2 + --retract_schedules + --scheduler_time_limit=120 + " + else + echo "[x] ERROR: Unknown scheduler ${SCHEDULER}" + exit 1 + fi + + EXPERIMENT_CONF+="\ + --scheduler_runtime=0 + --random_seed=${RANDOM_SEED}" + + echo "${EXPERIMENT_CONF}" | sed -e 's/^[ \t]*//' > ${EXPERIMENT_DIR}/alibaba_trace_replay.conf + + echo "[x] Constructed configuration for ${EXPERIMENT_DIR}. Beginning experiment" + + if ! python3 main.py --flagfile=${EXPERIMENT_DIR}/alibaba_trace_replay.conf > ${EXPERIMENT_DIR}/alibaba_trace_replay.output; then + echo "[x] Failed in the execution of ${LOG_BASE}. Exiting." + exit 3 + fi + + echo "[x] Finished execution of ${EXPERIMENT_DIR}." +} + +if [ ${#MEDIUM_ARRIVAL_RATES[@]} -ne ${#HARD_ARRIVAL_RATES[@]} ]; then + echo "[x] ERROR: The number of medium and hard arrival rates must be the same." + exit 1 +fi + +for SCHEDULER in ${SCHEDULERS[@]}; do + for RANDOM_SEED in ${RANDOM_SEEDS[@]}; do + for ((i=0; i<${#MEDIUM_ARRIVAL_RATES[@]}; i++)); do + MEDIUM_ARRIVAL_RATE=${MEDIUM_ARRIVAL_RATES[$i]} + HARD_ARRIVAL_RATE=${HARD_ARRIVAL_RATES[$i]} + echo "[x] Running ${SCHEDULER} with random seed ${RANDOM_SEED} and arrival rates ${MEDIUM_ARRIVAL_RATE} and ${HARD_ARRIVAL_RATE}" + execute_experiment ${SCHEDULER} ${RANDOM_SEED} ${LOG_DIR} ${MEDIUM_ARRIVAL_RATE} ${HARD_ARRIVAL_RATE} + done + done +done \ No newline at end of file