diff --git a/launcher_scripts/conf/training/gpt3/175b.yaml b/launcher_scripts/conf/training/gpt3/175b.yaml index b1acc80ca3..cdf74931d9 100755 --- a/launcher_scripts/conf/training/gpt3/175b.yaml +++ b/launcher_scripts/conf/training/gpt3/175b.yaml @@ -156,6 +156,7 @@ model: use_cpu_initialization: False # Init weights on the CPU (slow for large models) onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter. apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this + gc_interval: 10 # Manual garbage collection # Nsys profiling options nsys_profile: diff --git a/launcher_scripts/conf/training/llama/llama2_70b.yaml b/launcher_scripts/conf/training/llama/llama2_70b.yaml index 4a79bc886e..891ed31cde 100644 --- a/launcher_scripts/conf/training/llama/llama2_70b.yaml +++ b/launcher_scripts/conf/training/llama/llama2_70b.yaml @@ -139,7 +139,7 @@ model: use_flash_attention: true overlap_p2p_comm: true batch_p2p_comm: false - gc_interval: 100 + gc_interval: 10 optim: name: distributed_fused_adam lr: 0.00015