Add perf model configs gb200 #12140

Merged 13 commits on Feb 14, 2025
scripts/llm/performance/finetune_llama31_405b.py (2 changes: 1 addition & 1 deletion)
@@ -77,7 +77,7 @@ def override_recipe_configs(
recipe.trainer.plugins = bf16_with_fp8_mixed()
recipe.trainer.plugins.grad_reduce_in_fp32 = False

enable_cuda_graph = bool(args.gpu.lower() in [])
enable_cuda_graph = bool(args.gpu.lower() in ["gb200"])
recipe.model.config.enable_cuda_graph = enable_cuda_graph
recipe.trainer.strategy.use_te_rng_tracker = enable_cuda_graph
recipe.data.packed_sequence_specs.pad_cu_seqlens = enable_cuda_graph
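The hunk above is the same pattern repeated across these performance scripts: a set of CUDA-graph-related knobs toggled together from a GPU allowlist. A minimal sketch of that shared pattern, assuming only the recipe attributes visible in the diffs (the helper itself is hypothetical and not part of the PR):

def apply_cuda_graph_settings(recipe, gpu: str, allowed_gpus=("gb200",)) -> None:
    # Enable CUDA graphs only for GPUs on the per-script allowlist.
    enable_cuda_graph = gpu.lower() in allowed_gpus
    # The knobs travel together: CUDA graphs need the TE RNG tracker, and
    # packed-sequence fine-tuning additionally needs padded cu_seqlens.
    recipe.model.config.enable_cuda_graph = enable_cuda_graph
    recipe.trainer.strategy.use_te_rng_tracker = enable_cuda_graph
    recipe.data.packed_sequence_specs.pad_cu_seqlens = enable_cuda_graph

The pretraining script further down sets only the first two of these, since it does not use packed sequences.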
scripts/llm/performance/finetune_llama3_70b.py (7 changes: 6 additions & 1 deletion)
@@ -50,7 +50,12 @@
NOTE: Use fp8 precision training with caution. It might not give desirable results.
"""
finetuning_scheme = "none" if args.finetuning == "sft" else args.finetuning
recipe = finetune_recipe(peft_scheme=finetuning_scheme, performance_mode=True)

Check warning (Code scanning / CodeQL): Variable defined multiple times.
This assignment to 'recipe' is unnecessary as it is redefined before this value is used.
(One way to keep a single assignment is sketched after this file's diff.)
gpu_type = args.gpu.lower()
if gpu_type in ["gb200"] and finetuning_scheme == "lora":
recipe = finetune_recipe(peft_scheme=finetuning_scheme, performance_mode=True, seq_length=2048)
else:
recipe = finetune_recipe(peft_scheme=finetuning_scheme, performance_mode=True)
recipe = set_primary_perf_configs(
recipe,
args.tensorboard,
@@ -77,7 +82,7 @@
recipe.trainer.plugins = bf16_with_fp8_mixed()
recipe.trainer.plugins.grad_reduce_in_fp32 = False

enable_cuda_graph = bool(args.gpu.lower() in [] and finetuning_scheme != "lora")
enable_cuda_graph = bool(args.gpu.lower() in ["gb200"] and finetuning_scheme == "lora")
recipe.model.config.enable_cuda_graph = enable_cuda_graph
recipe.trainer.strategy.use_te_rng_tracker = enable_cuda_graph
recipe.data.packed_sequence_specs.pad_cu_seqlens = enable_cuda_graph
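The CodeQL alert above flags a 'recipe' assignment that is overwritten before it is read. One way to keep a single finetune_recipe call is to build the keyword arguments first. This is only a sketch using the names and values already in this diff, not necessarily how the PR resolved the alert:

recipe_kwargs = {"peft_scheme": finetuning_scheme, "performance_mode": True}
if args.gpu.lower() in ["gb200"] and finetuning_scheme == "lora":
    # shorter sequences for LoRA on GB200, as in the branch above
    recipe_kwargs["seq_length"] = 2048
recipe = finetune_recipe(**recipe_kwargs)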
scripts/llm/performance/finetune_llama3_8b.py (4 changes: 2 additions & 2 deletions)
@@ -52,7 +52,7 @@ def override_recipe_configs(
finetuning_scheme = "none" if args.finetuning == "sft" else args.finetuning

gpu_type = args.gpu.lower()
if gpu_type == "b200":
if gpu_type in ["b200", "gb200"]:
recipe = finetune_recipe(peft_scheme=finetuning_scheme, performance_mode=True, seq_length=16384)
else:
recipe = finetune_recipe(peft_scheme=finetuning_scheme, performance_mode=True)
@@ -82,7 +82,7 @@ def override_recipe_configs(
recipe.trainer.plugins = bf16_with_fp8_mixed()
recipe.trainer.plugins.grad_reduce_in_fp32 = False

enable_cuda_graph = bool(gpu_type in ["b200"])
enable_cuda_graph = bool(gpu_type in ["b200", "gb200"])
recipe.model.config.enable_cuda_graph = enable_cuda_graph
recipe.trainer.strategy.use_te_rng_tracker = enable_cuda_graph
recipe.data.packed_sequence_specs.pad_cu_seqlens = enable_cuda_graph
scripts/llm/performance/pretrain_llama3_8b.py (2 changes: 1 addition & 1 deletion)
@@ -65,7 +65,7 @@ def override_recipe_configs(
recipe.trainer.plugins = bf16_with_fp8_mixed()
recipe.trainer.plugins.grad_reduce_in_fp32 = False

enable_cuda_graph = bool(args.gpu.lower() in [])
enable_cuda_graph = bool(args.gpu.lower() in ["gb200"])
recipe.model.config.enable_cuda_graph = enable_cuda_graph
recipe.trainer.strategy.use_te_rng_tracker = enable_cuda_graph

scripts/llm/performance/pretrain_mixtral_8x22b.py (4 changes: 4 additions & 0 deletions)
@@ -66,6 +66,10 @@ def override_recipe_configs(
recipe.trainer.plugins = bf16_with_fp8_mixed()
recipe.trainer.plugins.grad_reduce_in_fp32 = False

# To mitigate the incorrect gradient_scaling_factor calculation in megatron.core
# when average_in_collective=True and etp_size > 1, disable average_in_collective.
recipe.trainer.strategy.ddp.average_in_collective = False

return recipe


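The comment above hinges on the difference between averaging inside the collective and summing in the collective followed by explicit scaling. The following standalone illustration of the two reduction modes uses plain torch.distributed (it assumes an already initialized NCCL process group and is not megatron.core code):

import torch
import torch.distributed as dist

def reduce_gradient(grad: torch.Tensor, average_in_collective: bool, dp_size: int) -> torch.Tensor:
    if average_in_collective:
        # One fused op: the collective itself averages across data-parallel ranks.
        dist.all_reduce(grad, op=dist.ReduceOp.AVG)
    else:
        # Sum in the collective, then apply the gradient scaling factor explicitly.
        # This is the path the recipe falls back to with average_in_collective=False.
        dist.all_reduce(grad, op=dist.ReduceOp.SUM)
        grad.mul_(1.0 / dp_size)
    return grad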
@@ -0,0 +1,11 @@
task,model,size,system,dtype,num_gpus,seq_len,tp_size,pp_size,cp_size,dp_size,ep_size,etp_size,vp_size,mbs,gbs
lora,llama3,70b,gb200,fp8,8.0,2048.0,1.0,4.0,1.0,2.0,1.0,,20.0,1.0,32.0
lora,llama3,70b,gb200,bf16,8.0,2048.0,1.0,4.0,1.0,2.0,1.0,,20.0,1.0,32.0
lora,llama3,8b,gb200,bf16,8.0,4096.0,1.0,1.0,1.0,8.0,1.0,,1.0,1.0,8.0
lora,llama3,8b,gb200,fp8,8.0,4096.0,1.0,1.0,1.0,8.0,1.0,,1.0,1.0,8.0
lora,llama31,405b,gb200,fp8,24.0,2048.0,4.0,6.0,1.0,1.0,1.0,,7.0,1.0,24.0
lora,llama31,405b,gb200,bf16,24.0,4096.0,4.0,6.0,1.0,1.0,1.0,,7.0,1.0,24.0
pre_train,llama3,8b,gb200,bf16,8.0,8192.0,1.0,1.0,1.0,8.0,1.0,,1.0,2.0,128.0
pre_train,llama3,8b,gb200,fp8,8.0,8192.0,1.0,1.0,1.0,8.0,1.0,,1.0,2.0,128.0
sft,llama3,8b,gb200,fp8,8.0,4096.0,1.0,1.0,1.0,8.0,1.0,,1.0,1.0,8.0
sft,llama3,8b,gb200,bf16,8.0,4096.0,1.0,1.0,1.0,8.0,1.0,,1.0,1.0,8.0
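The new table maps each task/model/size/system/dtype combination on GB200 to its parallelism and batch settings. How the repository consumes this file is not shown in the diff; the reader below is a hypothetical sketch, assuming only the column names above:

import csv

def load_perf_configs(path: str) -> list[dict]:
    # Parse the CSV and coerce the float-formatted numeric columns ("8.0") to ints,
    # leaving empty cells (e.g. etp_size) as None.
    with open(path, newline="") as f:
        rows = list(csv.DictReader(f))
    int_cols = ["num_gpus", "seq_len", "tp_size", "pp_size", "cp_size", "dp_size",
                "ep_size", "etp_size", "vp_size", "mbs", "gbs"]
    for row in rows:
        for col in int_cols:
            row[col] = int(float(row[col])) if row.get(col) else None
    return rows

def find_config(rows: list[dict], task: str, model: str, size: str, system: str, dtype: str):
    # Return the first row matching the lookup key, or None if absent.
    for row in rows:
        if (row["task"], row["model"], row["size"], row["system"], row["dtype"]) == \
           (task, model, size, system, dtype):
            return row
    return None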