# setting_llama2.yaml
namespace: teammlb # namespace to deploy to (required)
jobName: ptgen-llm # name of the generated AppWrapper and PyTorchJob objects (required)
queueName: default-queue # local queue to submit to (default: default-queue)
numPods: 2 # total pod count including master and worker pods (default: 1)
numCpusPerPod: 2 # requested number of CPUs per pod (default: 1)
numGpusPerPod: 1 # requested number of GPUs per pod (default: 0)
totalMemoryPerPod: 48Gi # requested amount of memory per pod (default: 1Gi)
priority: default-priority # default-priority (default), low-priority, or high-priority
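# With the values above, the generated job consists of 2 pods (assuming the
# usual split of 1 master + 1 worker), each requesting 2 CPUs, 1 GPU, and 48Gi
# of memory, i.e. 4 CPUs, 2 GPUs, and 96Gi in total across the job.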
volumes:
- name: llama2
claimName: pvc-lroc
mountPath: /mnt/fs1/lroc/
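# Illustration only (not read by the generator): the volume entry above is
# typically expanded into a persistentVolumeClaim volume plus a matching
# volumeMount in the generated pod spec, roughly:
#
#   volumes:
#     - name: llama2
#       persistentVolumeClaim:
#         claimName: pvc-lroc
#   containers:
#     - name: ...
#       volumeMounts:
#         - name: llama2
#           mountPath: /mnt/fs1/lroc/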
# initContainers:
# - name: install-git
# image: centos:8
# command: ['sh', '-c', 'yum install -y git']
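# If uncommented, the block above adds a standard Kubernetes init container
# that runs 'yum install -y git' in a centos:8 image and must complete before
# the main container starts.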
# container image for the pods (required)
containerImage: ghcr.io/foundation-model-stack/base:pytorch-latest-nightly-20230126
# containerImage: pytorch/pytorch:1.13.0-cuda11.6-cudnn8-runtime
# setup commands to run in each pod (optional)
setupCommands:
# - git clone https://github.com/leipanhz/collections
- pip install --upgrade transformers sentencepiece datasets
- pip install accelerate -U
- git clone https://github.com/leipanhz/collections.git
- cd collections
- if [ -d /mnt/fs1/lroc ]; then echo 'PV Exists'; else echo 'PV Not found'; fi
- if [ -d /mnt/fs1/lroc/llama2/hf_model ]; then echo 'hf_model dir Exists'; else echo 'hf_model Not found'; fi
- ls /mnt/fs1/lroc/llama2/hf_model
- export HF_HOME='/tmp/hf_cache'
- export TRANSFORMERS_CACHE='/tmp/hf_cache'
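# Note (assuming the generator runs the setup commands and the main program in
# the same shell): the pip installs, the clone of the collections repo, and the
# HF_HOME/TRANSFORMERS_CACHE exports above all take effect for the training
# process launched below.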
# main program to invoke via torchrun (optional)
mainProgram: train.py
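# Sketch (assumption; flag values derived from numPods and numGpusPerPod above,
# not emitted verbatim by the generator): the resulting PyTorchJob typically
# launches the main program on each pod via torchrun with one process per GPU,
# roughly:
#
#   torchrun --nnodes=2 --nproc_per_node=1 \
#            --rdzv_backend=c10d --rdzv_endpoint=$MASTER_ADDR:$MASTER_PORT \
#            train.py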