
Add perf model configs gb200 #12140

Merged · 13 commits · Feb 14, 2025
2 changes: 1 addition & 1 deletion scripts/llm/performance/finetune_llama31_405b.py
@@ -77,7 +77,7 @@ def override_recipe_configs(
recipe.trainer.plugins = bf16_with_fp8_mixed()
recipe.trainer.plugins.grad_reduce_in_fp32 = False

enable_cuda_graph = bool(args.gpu.lower() in [])
enable_cuda_graph = bool(args.gpu.lower() in ["gb200"])
recipe.model.config.enable_cuda_graph = enable_cuda_graph
recipe.trainer.strategy.use_te_rng_tracker = enable_cuda_graph
recipe.data.packed_sequence_specs.pad_cu_seqlens = enable_cuda_graph
8 changes: 6 additions & 2 deletions scripts/llm/performance/finetune_llama3_70b.py
@@ -50,7 +50,11 @@ def override_recipe_configs(
NOTE: Use fp8 precision training with caution. It might not give desirable results.
"""
finetuning_scheme = "none" if args.finetuning == "sft" else args.finetuning
recipe = finetune_recipe(peft_scheme=finetuning_scheme, performance_mode=True)
gpu_type = args.gpu.lower()
if gpu_type in ["gb200"] and finetuning_scheme == "lora":
recipe = finetune_recipe(peft_scheme=finetuning_scheme, performance_mode=True, seq_length=2048)
else:
recipe = finetune_recipe(peft_scheme=finetuning_scheme, performance_mode=True)
recipe = set_primary_perf_configs(
recipe,
args.tensorboard,
@@ -77,7 +81,7 @@ def override_recipe_configs(
recipe.trainer.plugins = bf16_with_fp8_mixed()
recipe.trainer.plugins.grad_reduce_in_fp32 = False

enable_cuda_graph = bool(args.gpu.lower() in [] and finetuning_scheme != "lora")
enable_cuda_graph = bool(args.gpu.lower() in ["gb200"] and finetuning_scheme == "lora")
recipe.model.config.enable_cuda_graph = enable_cuda_graph
recipe.trainer.strategy.use_te_rng_tracker = enable_cuda_graph
recipe.data.packed_sequence_specs.pad_cu_seqlens = enable_cuda_graph
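
Taken together, the two hunks above gate both the sequence length and CUDA-graph enablement on the GPU type and finetuning scheme: gb200 LoRA runs get a 2048-token sequence length and CUDA graphs, everything else keeps the defaults. A minimal standalone sketch of that decision logic (the helper and type names are illustrative, not part of the script):

from typing import NamedTuple, Optional


class Llama70bFinetunePerf(NamedTuple):
    seq_length: Optional[int]  # None keeps the recipe's default sequence length
    enable_cuda_graph: bool


def gb200_lora_overrides(gpu: str, finetuning_scheme: str) -> Llama70bFinetunePerf:
    # Only the gb200 + LoRA combination shortens the sequence length and turns on CUDA graphs.
    is_gb200_lora = gpu.lower() in ["gb200"] and finetuning_scheme == "lora"
    return Llama70bFinetunePerf(
        seq_length=2048 if is_gb200_lora else None,
        enable_cuda_graph=is_gb200_lora,
    )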
4 changes: 2 additions & 2 deletions scripts/llm/performance/finetune_llama3_8b.py
@@ -52,7 +52,7 @@ def override_recipe_configs(
finetuning_scheme = "none" if args.finetuning == "sft" else args.finetuning

gpu_type = args.gpu.lower()
if gpu_type == "b200":
if gpu_type in ["b200", "gb200"]:
recipe = finetune_recipe(peft_scheme=finetuning_scheme, performance_mode=True, seq_length=16384)
else:
recipe = finetune_recipe(peft_scheme=finetuning_scheme, performance_mode=True)
@@ -82,7 +82,7 @@ def override_recipe_configs(
recipe.trainer.plugins = bf16_with_fp8_mixed()
recipe.trainer.plugins.grad_reduce_in_fp32 = False

enable_cuda_graph = bool(gpu_type in ["b200"])
enable_cuda_graph = bool(gpu_type in ["b200", "gb200"])
recipe.model.config.enable_cuda_graph = enable_cuda_graph
recipe.trainer.strategy.use_te_rng_tracker = enable_cuda_graph
recipe.data.packed_sequence_specs.pad_cu_seqlens = enable_cuda_graph
2 changes: 1 addition & 1 deletion scripts/llm/performance/pretrain_llama3_8b.py
@@ -65,7 +65,7 @@ def override_recipe_configs(
recipe.trainer.plugins = bf16_with_fp8_mixed()
recipe.trainer.plugins.grad_reduce_in_fp32 = False

enable_cuda_graph = bool(args.gpu.lower() in [])
enable_cuda_graph = bool(args.gpu.lower() in ["gb200"])
recipe.model.config.enable_cuda_graph = enable_cuda_graph
recipe.trainer.strategy.use_te_rng_tracker = enable_cuda_graph

8 changes: 7 additions & 1 deletion scripts/llm/performance/pretrain_mixtral_8x22b.py
@@ -13,6 +13,7 @@
# limitations under the License.

from os.path import basename, splitext
from typing import Optional

import nemo_run as run
from argument_parser import parse_cli_args
@@ -33,7 +34,7 @@ def override_recipe_configs(
cp_size: int,
vp_size: int,
ep_size: int,
etp_size: int,
etp_size: Optional[int],
):
"""
mixtral 8x22b pre-train recipe aimed at achieving best possible performance.
@@ -66,6 +67,11 @@
recipe.trainer.plugins = bf16_with_fp8_mixed()
recipe.trainer.plugins.grad_reduce_in_fp32 = False

# Mitigate the incorrect gradient_scaling_factor calculation in megatron.core that occurs
# when average_in_collective=True and tp_size != etp_size by disabling average_in_collective.
if etp_size is not None and etp_size != tp_size:
recipe.trainer.strategy.ddp.average_in_collective = False

return recipe


@@ -1,27 +1,27 @@
task,model,size,system,dtype,num_gpus,seq_len,tp_size,pp_size,cp_size,dp_size,ep_size,etp_size,vp_size,mbs,gbs,etp_size
lora,llama3,70b,b200,fp8,8.0,4096.0,1.0,4.0,1.0,2.0,1.0,,20.0,1.0,32.0,
lora,llama3,70b,b200,bf16,8.0,4096.0,1.0,4.0,1.0,2.0,1.0,,20.0,1.0,32.0,
lora,llama3,8b,b200,bf16,8.0,4096.0,1.0,1.0,1.0,8.0,1.0,,1.0,1.0,8.0,
lora,llama3,8b,b200,fp8,8.0,4096.0,1.0,1.0,1.0,8.0,1.0,,1.0,1.0,8.0,
lora,llama31,405b,b200,fp8,24.0,2048.0,4.0,6.0,1.0,1.0,1.0,,7.0,1.0,24.0,
lora,llama31,405b,b200,bf16,24.0,4096.0,4.0,6.0,1.0,1.0,1.0,,7.0,1.0,24.0,
pre_train,gpt3,175b,b200,bf16,128.0,2048.0,4.0,4.0,1.0,8.0,1.0,,12.0,2.0,256.0,
pre_train,gpt3,175b,b200,fp8,128.0,2048.0,4.0,4.0,1.0,8.0,1.0,,12.0,2.0,256.0,
pre_train,llama3,8b,b200,bf16,8.0,8192.0,1.0,1.0,1.0,8.0,1.0,,1.0,2.0,128.0,
pre_train,llama3,8b,b200,fp8,8.0,8192.0,1.0,1.0,1.0,8.0,1.0,,1.0,2.0,128.0,
pre_train,llama3,70b,b200,fp8,64.0,8192.0,2.0,4.0,2.0,4.0,1.0,,5.0,1.0,128.0,
pre_train,llama3,70b,b200,bf16,64.0,8192.0,2.0,4.0,2.0,4.0,1.0,,5.0,1.0,128.0,
pre_train,llama31,405b,b200,bf16,144.0,8192.0,4.0,9.0,2.0,2.0,1.0,,7.0,1.0,36.0,
pre_train,llama31,405b,b200,fp8,144.0,8192.0,4.0,9.0,2.0,2.0,1.0,,7.0,1.0,36.0,
pre_train,mixtral,8x7b,b200,bf16,64.0,4096.0,1.0,1.0,1.0,64.0,8.0,,1.0,2.0,256.0,
pre_train,mixtral,8x7b,b200,fp8,64.0,4096.0,1.0,1.0,1.0,64.0,8.0,,1.0,2.0,256.0,
pre_train,mixtral,8x22b,b200,bf16,256.0,65536.0,2.0,4.0,8.0,4.0,8.0,1.0,14.0,1.0,64.0,1.0
pre_train,mixtral,8x22b,b200,fp8,256.0,65536.0,2.0,4.0,8.0,4.0,8.0,1.0,14.0,1.0,64.0,1.0
pre_train,nemotron4,15b,b200,bf16,64.0,4096.0,1.0,1.0,1.0,32.0,1.0,,1.0,2.0,256.0,
pre_train,nemotron4,340b,b200,bf16,128.0,4096.0,8.0,4.0,1.0,4.0,1.0,,12.0,1.0,32.0,
pre_train,nemotron4,340b,b200,fp8,128.0,4096.0,8.0,4.0,1.0,4.0,1.0,,12.0,1.0,32.0,
pre_train,nemotron4,15b,b200,fp8,64.0,4096.0,1.0,1.0,1.0,32.0,1.0,,1.0,2.0,256.0,
sft,llama3,8b,b200,fp8,8.0,4096.0,1.0,1.0,1.0,8.0,1.0,,1.0,1.0,8.0,
sft,llama3,70b,b200,bf16,32.0,4096.0,2.0,4.0,1.0,4.0,1.0,,5.0,1.0,32.0,
sft,llama3,70b,b200,fp8,32.0,4096.0,2.0,4.0,1.0,4.0,1.0,,5.0,1.0,32.0,
sft,llama3,8b,b200,bf16,8.0,4096.0,1.0,1.0,1.0,8.0,1.0,,1.0,1.0,8.0,
task,model,size,system,dtype,num_gpus,seq_len,tp_size,pp_size,cp_size,dp_size,ep_size,vp_size,mbs,gbs,etp_size
lora,llama3,70b,b200,fp8,8.0,4096.0,1.0,4.0,1.0,2.0,1.0,20.0,1.0,32.0,
lora,llama3,70b,b200,bf16,8.0,4096.0,1.0,4.0,1.0,2.0,1.0,20.0,1.0,32.0,
lora,llama3,8b,b200,bf16,8.0,4096.0,1.0,1.0,1.0,8.0,1.0,1.0,1.0,8.0,
lora,llama3,8b,b200,fp8,8.0,4096.0,1.0,1.0,1.0,8.0,1.0,1.0,1.0,8.0,
lora,llama31,405b,b200,fp8,24.0,2048.0,4.0,6.0,1.0,1.0,1.0,7.0,1.0,24.0,
lora,llama31,405b,b200,bf16,24.0,4096.0,4.0,6.0,1.0,1.0,1.0,7.0,1.0,24.0,
pre_train,gpt3,175b,b200,bf16,128.0,2048.0,4.0,4.0,1.0,8.0,1.0,12.0,2.0,256.0,
pre_train,gpt3,175b,b200,fp8,128.0,2048.0,4.0,4.0,1.0,8.0,1.0,12.0,2.0,256.0,
pre_train,llama3,8b,b200,bf16,8.0,8192.0,1.0,1.0,1.0,8.0,1.0,1.0,2.0,128.0,
pre_train,llama3,8b,b200,fp8,8.0,8192.0,1.0,1.0,1.0,8.0,1.0,1.0,2.0,128.0,
pre_train,llama3,70b,b200,fp8,64.0,8192.0,2.0,4.0,2.0,4.0,1.0,5.0,1.0,128.0,
pre_train,llama3,70b,b200,bf16,64.0,8192.0,2.0,4.0,2.0,4.0,1.0,5.0,1.0,128.0,
pre_train,llama31,405b,b200,bf16,512.0,8192.0,4.0,8.0,2.0,8.0,1.0,8.0,1.0,128.0,
pre_train,llama31,405b,b200,fp8,512.0,8192.0,4.0,8.0,2.0,8.0,1.0,8.0,1.0,128.0,
pre_train,mixtral,8x7b,b200,bf16,64.0,4096.0,1.0,1.0,1.0,64.0,8.0,1.0,2.0,256.0,
pre_train,mixtral,8x7b,b200,fp8,64.0,4096.0,1.0,1.0,1.0,64.0,8.0,1.0,2.0,256.0,
pre_train,mixtral,8x22b,b200,bf16,256.0,65536.0,2.0,4.0,8.0,4.0,8.0,14.0,1.0,64.0,1.0
pre_train,mixtral,8x22b,b200,fp8,256.0,65536.0,2.0,4.0,8.0,4.0,8.0,14.0,1.0,64.0,1.0
pre_train,nemotron4,15b,b200,bf16,64.0,4096.0,1.0,1.0,1.0,32.0,1.0,1.0,2.0,256.0,
pre_train,nemotron4,340b,b200,bf16,128.0,4096.0,8.0,4.0,1.0,4.0,1.0,12.0,1.0,32.0,
pre_train,nemotron4,340b,b200,fp8,128.0,4096.0,8.0,4.0,1.0,4.0,1.0,12.0,1.0,32.0,
pre_train,nemotron4,15b,b200,fp8,64.0,4096.0,1.0,1.0,1.0,32.0,1.0,1.0,2.0,256.0,
sft,llama3,8b,b200,fp8,8.0,4096.0,1.0,1.0,1.0,8.0,1.0,1.0,1.0,8.0,
sft,llama3,70b,b200,bf16,32.0,4096.0,2.0,4.0,1.0,4.0,1.0,5.0,1.0,32.0,
sft,llama3,70b,b200,fp8,32.0,4096.0,2.0,4.0,1.0,4.0,1.0,5.0,1.0,32.0,
sft,llama3,8b,b200,bf16,8.0,4096.0,1.0,1.0,1.0,8.0,1.0,1.0,1.0,8.0,
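
Every row in these recommended-config tables satisfies num_gpus = tp_size * pp_size * cp_size * dp_size; for example, the new llama31 405b b200 rows use 4 * 8 * 2 * 8 = 512 GPUs. A quick consistency check one could run over any of these CSVs (the file path below is a placeholder, not a path from this PR):

import csv


def check_parallelism(csv_path: str) -> None:
    """Assert num_gpus == tp * pp * cp * dp for every recommended config row."""
    with open(csv_path, newline="") as f:
        for row in csv.DictReader(f):
            product = (
                float(row["tp_size"])
                * float(row["pp_size"])
                * float(row["cp_size"])
                * float(row["dp_size"])
            )
            assert float(row["num_gpus"]) == product, (row["model"], row["size"], row["system"])


# check_parallelism("model_configs_b200.csv")  # hypothetical file name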
@@ -0,0 +1,11 @@
task,model,size,system,dtype,num_gpus,seq_len,tp_size,pp_size,cp_size,dp_size,ep_size,vp_size,mbs,gbs,etp_size
lora,llama3,70b,gb200,fp8,8.0,2048.0,1.0,4.0,1.0,2.0,1.0,20.0,1.0,32.0,
lora,llama3,70b,gb200,bf16,8.0,2048.0,1.0,4.0,1.0,2.0,1.0,20.0,1.0,32.0,
lora,llama3,8b,gb200,bf16,8.0,4096.0,1.0,1.0,1.0,8.0,1.0,1.0,1.0,8.0,
lora,llama3,8b,gb200,fp8,8.0,4096.0,1.0,1.0,1.0,8.0,1.0,1.0,1.0,8.0,
lora,llama31,405b,gb200,fp8,24.0,2048.0,4.0,6.0,1.0,1.0,1.0,7.0,1.0,24.0,
lora,llama31,405b,gb200,bf16,24.0,4096.0,4.0,6.0,1.0,1.0,1.0,7.0,1.0,24.0,
pre_train,llama3,8b,gb200,bf16,8.0,8192.0,1.0,1.0,1.0,8.0,1.0,1.0,2.0,128.0,
pre_train,llama3,8b,gb200,fp8,8.0,8192.0,1.0,1.0,1.0,8.0,1.0,1.0,2.0,128.0,
sft,llama3,8b,gb200,fp8,8.0,4096.0,1.0,1.0,1.0,8.0,1.0,1.0,1.0,8.0,
sft,llama3,8b,gb200,bf16,8.0,4096.0,1.0,1.0,1.0,8.0,1.0,1.0,1.0,8.0,
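
The new gb200 table follows the same schema, so a launcher could look up the recommended parallelism for a given workload before building a recipe. A minimal sketch, with a hypothetical file name and helper (neither is defined by this PR):

import csv
from typing import Dict, Optional


def recommended_config(
    csv_path: str, task: str, model: str, size: str, dtype: str
) -> Optional[Dict[str, str]]:
    """Return the first row matching (task, model, size, dtype), or None if absent."""
    with open(csv_path, newline="") as f:
        for row in csv.DictReader(f):
            if (row["task"], row["model"], row["size"], row["dtype"]) == (task, model, size, dtype):
                return row
    return None


# e.g. recommended_config("model_configs_gb200.csv", "sft", "llama3", "8b", "bf16")
# would yield tp=1, pp=1, cp=1, dp=8, mbs=1, gbs=8 per the table above.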
@@ -8,8 +8,8 @@ lora,llama31,405b,h100,fp8,24.0,2048.0,4.0,6.0,1.0,1.0,1.0,7.0,1.0,24.0,
pre_train,gpt3,175b,h100,fp8,512.0,2048.0,4.0,8.0,1.0,16.0,1.0,6.0,2.0,2048.0,
pre_train,gpt3,175b,h100,fp8,128.0,2048.0,4.0,8.0,1.0,4.0,1.0,6.0,1.0,256.0,
pre_train,gpt3,175b,h100,bf16,128.0,2048.0,4.0,8.0,1.0,4.0,1.0,6.0,1.0,256.0,
pre_train,llama31,405b,h100,fp8,576.0,8192.0,8.0,9.0,2.0,4.0,1.0,7.0,1.0,252.0,
pre_train,llama31,405b,h100,bf16,576.0,8192.0,8.0,9.0,2.0,4.0,1.0,7.0,1.0,252.0,
pre_train,llama31,405b,h100,fp8,512.0,8192.0,8.0,8.0,2.0,4.0,1.0,8.0,1.0,128.0,
pre_train,llama31,405b,h100,bf16,512.0,8192.0,8.0,8.0,2.0,4.0,1.0,8.0,1.0,128.0,
pre_train,llama3,70b,h100,fp8,64.0,8192.0,4.0,8.0,1.0,2.0,1.0,5.0,1.0,128.0,
pre_train,llama3,70b,h100,bf16,64.0,8192.0,4.0,4.0,2.0,2.0,1.0,5.0,1.0,128.0,
pre_train,llama3,8b,h100,fp8,8.0,8192.0,1.0,1.0,2.0,4.0,1.0,1.0,1.0,128.0,
@@ -1,11 +1,11 @@
task,model,size,system,dtype,num_gpus,seq_len,tp_size,pp_size,cp_size,dp_size,ep_size,vp_size,mbs,gbs
pre_train,nemotron4,15b,h100,fp8,128.0,4096.0,4.0,1.0,1.0,32.0,1.0,1.0,4.0,1024.0
pre_train,nemotron4,15b,h100,fp8,256.0,4096.0,4.0,1.0,1.0,64.0,1.0,1.0,4.0,1024.0
pre_train,nemotron4,15b,h100,fp8,64.0,4096.0,4.0,1.0,1.0,16.0,1.0,1.0,4.0,1024.0
pre_train,nemotron4,15b,h100,fp8,16.0,4096.0,4.0,1.0,1.0,4.0,1.0,1.0,4.0,1024.0
pre_train,nemotron4,15b,h100,fp8,512.0,4096.0,4.0,1.0,1.0,128.0,1.0,1.0,4.0,1024.0
pre_train,nemotron4,15b,h100,fp8,1024.0,4096.0,4.0,1.0,1.0,256.0,1.0,1.0,4.0,1024.0
pre_train,nemotron4,340b,h100,fp8,128.0,4096.0,8.0,8.0,1.0,2.0,1.0,12.0,1.0,512.0
pre_train,nemotron4,340b,h100,fp8,256.0,4096.0,8.0,8.0,1.0,4.0,1.0,12.0,1.0,512.0
pre_train,nemotron4,340b,h100,fp8,512.0,4096.0,8.0,8.0,1.0,8.0,1.0,12.0,1.0,512.0
pre_train,nemotron4,340b,h100,fp8,1024.0,4096.0,8.0,8.0,1.0,16.0,1.0,12.0,1.0,512.0
task,model,size,system,dtype,num_gpus,seq_len,tp_size,pp_size,cp_size,dp_size,ep_size,vp_size,mbs,gbs,etp_size
pre_train,nemotron4,15b,h100,fp8,128.0,4096.0,4.0,1.0,1.0,32.0,1.0,1.0,4.0,1024.0,
pre_train,nemotron4,15b,h100,fp8,256.0,4096.0,4.0,1.0,1.0,64.0,1.0,1.0,4.0,1024.0,
pre_train,nemotron4,15b,h100,fp8,64.0,4096.0,4.0,1.0,1.0,16.0,1.0,1.0,4.0,1024.0,
pre_train,nemotron4,15b,h100,fp8,16.0,4096.0,4.0,1.0,1.0,4.0,1.0,1.0,4.0,1024.0,
pre_train,nemotron4,15b,h100,fp8,512.0,4096.0,4.0,1.0,1.0,128.0,1.0,1.0,4.0,1024.0,
pre_train,nemotron4,15b,h100,fp8,1024.0,4096.0,4.0,1.0,1.0,256.0,1.0,1.0,4.0,1024.0,
pre_train,nemotron4,340b,h100,fp8,128.0,4096.0,8.0,8.0,1.0,2.0,1.0,12.0,1.0,512.0,
pre_train,nemotron4,340b,h100,fp8,256.0,4096.0,8.0,8.0,1.0,4.0,1.0,12.0,1.0,512.0,
pre_train,nemotron4,340b,h100,fp8,512.0,4096.0,8.0,8.0,1.0,8.0,1.0,12.0,1.0,512.0,
pre_train,nemotron4,340b,h100,fp8,1024.0,4096.0,8.0,8.0,1.0,16.0,1.0,12.0,1.0,512.0,