# 125m.yaml
data_remote: &data_remote ./my-copy-c4
data_local: &data_local ./my-copy-c4
max_seq_len: &max_seq_len 2048
tokenizer_name: &tokenizer_name gpt2
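# Note: the &name / *name pairs are standard YAML anchors and aliases, so the data paths,
# sequence length, and tokenizer set once above are reused by reference throughout this file.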
# Run Name
run_name: gpt-125m
# Model
model:
  name: mosaic_gpt
  device: meta
  tokenizer_name: *tokenizer_name
  d_model: 768
  n_heads: 12
  n_layers: 12
  mlp_ratio: 4
  max_seq_len: *max_seq_len
  vocab_size: 50257
  init_std: 0.02
  attn_pdrop: 0.0
  resid_pdrop: 0.0
  emb_pdrop: 0.0
  attn_impl: flash
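# For reference: d_model=768, n_heads=12, n_layers=12 with a 50257-token vocabulary is the
# GPT-2 "small" shape, which works out to roughly 125M parameters (hence the run name).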
# Tokenizer
tokenizer:
  type: hftokenizer
  args:
    tokenizer_name: *tokenizer_name
    max_seq_len: *max_seq_len
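# hftokenizer loads the named Hugging Face tokenizer (GPT-2 BPE here) capped at max_seq_len;
# the exact wrapper class is defined in this repo's code, not in this file.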
# Dataloaders
train_loader:
  dataset:
    remote: *data_remote
    local: *data_local
    split: train
    shuffle: true
    prefetch: 1_000_000
    tokenizer_name: *tokenizer_name
    max_seq_len: *max_seq_len
    group_method: concat
  drop_last: true
  num_workers: 8
  pin_memory: true
  prefetch_factor: 2
  persistent_workers: true
  timeout: 0

eval_loader:
  dataset:
    remote: *data_remote
    local: *data_local
    split: val
    shuffle: false
    prefetch: 1000
    tokenizer_name: *tokenizer_name
    max_seq_len: *max_seq_len
    group_method: truncate
  drop_last: false
  num_workers: 8
  pin_memory: true
  prefetch_factor: 2
  persistent_workers: true
  timeout: 0
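# group_method controls how the streaming C4 dataset forms sequences (as implemented in this
# repo): concat packs tokenized samples back-to-back and chunks them into full max_seq_len
# sequences with no padding, while truncate keeps one sample per sequence and clips it at
# max_seq_len, which is why eval uses it.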
# Optimization
scheduler:
  name: cosine_with_warmup
  t_warmup: 100ba
  alpha_f: 0.1

optimizer:
  name: decoupled_adamw
  lr: 6.0e-4
  betas:
  - 0.9
  - 0.95
  eps: 1.0e-08
  weight_decay: 0.0

algorithms:
  gradient_clipping:
    clipping_type: 'norm'
    clipping_threshold: 1.0
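# Composer time units: "ba" means batches, so the LR warms up linearly for 100 batches and then
# follows a cosine decay toward alpha_f * lr = 0.1 * 6.0e-4 = 6.0e-5 over the run's max_duration.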
max_duration: 4800ba # ~ 2.5B tokens
eval_interval: 500ba
global_train_batch_size: 256
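# Token math for the note above: 4800 batches * 256 sequences/batch * 2048 tokens/sequence ≈ 2.5B tokens.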
# System
seed: 17
device_eval_batch_size: 16
device_train_microbatch_size: 16
# device_train_microbatch_size: auto
precision: amp_bf16
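# device_train_microbatch_size controls gradient accumulation: e.g. on 8 GPUs each device sees
# 256 / 8 = 32 samples per step, split into two microbatches of 16. The commented-out "auto"
# setting lets Composer pick a microbatch size that fits in GPU memory.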
# FSDP
fsdp_config:
  sharding_strategy: FULL_SHARD
  min_params: 1e8
  mixed_precision: DEFAULT
  activation_checkpointing: false
  activation_cpu_offload: false
  verbose: true
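# FULL_SHARD is PyTorch FSDP's ZeRO-3-style mode: parameters, gradients, and optimizer state
# are sharded across ranks. min_params is understood here as the auto-wrap threshold, i.e.
# submodules with at least this many parameters become their own FSDP unit.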
# Logging
progress_bar: true
log_to_console: true
callbacks:
  speed_monitor:
    window_size: 10
  lr_monitor: {}
  memory_monitor: {}
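# speed_monitor logs training throughput averaged over the last window_size=10 batches;
# lr_monitor and memory_monitor log the current learning rate and device memory statistics.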
# loggers:
#   wandb: {}
# Checkpoint to local filesystem or remote object store
# save_interval: 500ba
# save_num_checkpoints_to_keep: 1  # Important: this cleans up checkpoints saved to DISK
# save_folder: ./{run_name}/checkpoints
# save_folder: s3://my-bucket/my-folder/{run_name}/checkpoints
# Load from local filesystem or remote object store
# load_path: ./gpt-125m/checkpoints/latest-rank{rank}.pt
# load_path: s3://my-bucket/my-folder/gpt-125m/checkpoints/latest-rank{rank}.pt
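# Launch sketch (the entrypoint name is an assumption about this repo's layout, not part of
# this file): something like
#   composer main.py 125m.yaml
# where `composer` is the Composer launcher, which starts one training process per GPU and
# passes this YAML to the training script.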