-
Notifications
You must be signed in to change notification settings - Fork 0
/
cifar-l.submit_file
58 lines (46 loc) · 2.26 KB
/
cifar-l.submit_file
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
####################
#
# Example Job for HTCondor
#
####################
#---------------------------------------------
# Batch name: labels this submission so it is easy to distinguish in condor_q.
JobBatchName = "CIFAR10-l"
# --------------------------------------------
# Executable: the Python 3.11 interpreter of the cu118_py311 conda environment.
# Choose an environment whose CUDA build matches the docker_image selected
# below (the environment name suggests CUDA 11.8 - confirm).
executable = /mnt/fast/nobackup/users/nt00601/miniconda3/envs/cu118_py311/bin/python3.11
# ---------------------------------------------------
# Universe (vanilla, docker): this job uses the docker universe.
# Choose an image whose CUDA version suits the CUDADriverVersion that
# condor_status shows for the target machines;
# see https://docs.pages.surrey.ac.uk/research_computing/condor/tips.html#cuda-requirements
universe = docker
docker_image = nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04
# Alternative CUDA 11.7.1 images, kept for reference (disabled):
#docker_image = nvidia/cuda:11.7.1-cudnn8-runtime-ubuntu22.04
#docker_image = nvidia/cuda:11.7.1-cudnn8-runtime-ubuntu20.04
# -------------------------------------------------
# Event, out and error logs.
# One file of each kind per job, named after the cluster and process ids.
#   log    - HTCondor's event log for the job
#   output - the job's stdout; without this command stdout is discarded
#   error  - the job's stderr
log = c$(cluster).p$(process).log
output = c$(cluster).p$(process).out
error = c$(cluster).p$(process).error
# --------------------------------------
# GPU, Storage and CUDA Requirements for the Job
# The expression below only matches machines that advertise:
#   - CUDAGlobalMemoryMb greater than 10000 and less than 75000,
#   - CUDACapability greater than 7,
#   - HasWeka (presumably access to the Weka-backed storage - confirm).
# The intended targets are rtx2080 or rtx3090 cards.
# NOTE(review): the memory window is wide and may also match other GPU
# models inside that range; tighten it if only those two are wanted.
requirements = (CUDAGlobalMemoryMb > 10000) && (CUDAGlobalMemoryMb < 75000) && (CUDACapability > 7) && \
(HasWeka)
# Resources
# One GPU, one CPU and 26G of memory are requested for the job.
request_GPUs = 1
# Custom job attribute; presumably the GPU memory needed in MB, in line with
# the lower CUDAGlobalMemoryMb bound in the requirements expression - confirm.
+GPUMem = 10000
# NOTE(review): the arguments line passes --workers 4 while only one CPU is
# requested here; confirm that this combination is intended.
request_CPUs = 1
request_memory = 26G
# Declared run time for this job: 4 (presumably hours - confirm against the
# pool documentation).
+JobRunTime = 4
# This job can checkpoint.
+CanCheckpoint = true
# Request for guaranteed run time (measured in seconds, to match the epoch runtime).
# 0 means the job is happy to checkpoint and move at any time.
# This lets Condor remove our job ASAP if a machine needs rebooting, which is
# useful when we can checkpoint and restore.
MaxJobRetirementTime = 0
# -----------------------------------
# Per-job macros, referenced as $(name) further down.
# script  - training entry point passed to the interpreter
# root    - value given to --data_location
# dataset - value given to --data_set
# save    - value given to --save-dir; built from $(model) and $(dataset)
script = /mnt/fast/nobackup/users/nt00601/AttentionRollout_ReImplementation-main/main.py
root = /mnt/fast/nobackup/users/nt00601/content
dataset = CIFAR10
save = /mnt/fast/nobackup/users/nt00601/AttentionRollout_ReImplementation-main/logs/$(model)/$(dataset)
# Command line handed to the executable: the script followed by its options.
arguments = $(script) --use-avai-gpus --workers 4 --model $(model) --data_location $(root) --data_set $(dataset) --optim sgd --lr 0.5 --weight-decay 2e-05 --lr-scheduler sequential --epochs 4 --batch-size 128 --save-dir $(save)
# Queue one job per entry of the list, with $(model) set to that entry.
queue 1 model in vit_large_32