From 7a0088690a5f868a498f51567e00023c3ffe7915 Mon Sep 17 00:00:00 2001 From: guyueh1 <140554423+guyueh1@users.noreply.github.com> Date: Fri, 14 Feb 2025 08:21:51 -0800 Subject: [PATCH] Add perf model configs gb200 (#12140) * Bug fix with generation of expert_tensor_parallel_rank Signed-off-by: Guyue Huang * Fix pylint Signed-off-by: Guyue Huang * Support perf script with gb200 Signed-off-by: Guyue Huang * fix Signed-off-by: Guyue Huang * mixtral 8x22b recipe change to mitigate a megatron.core bug Signed-off-by: Guyue Huang * Make column names of all csvs align; fix for 8x22b Signed-off-by: Guyue Huang * typing fix Signed-off-by: Guyue Huang * Adjust 405B config due to asymmetric PP Signed-off-by: Guyue Huang * Add a comment Signed-off-by: Guyue Huang * Change 405B GBS Signed-off-by: Guyue Huang * fix gbs for gb200 finetuning Signed-off-by: Guyue Huang --------- Signed-off-by: Guyue Huang --- .../llm/performance/finetune_llama31_405b.py | 2 +- .../llm/performance/finetune_llama3_70b.py | 11 +++- scripts/llm/performance/finetune_llama3_8b.py | 4 +- scripts/llm/performance/pretrain_llama3_8b.py | 2 +- .../llm/performance/pretrain_mixtral_8x22b.py | 8 ++- .../model_configs_b200.csv | 54 +++++++++---------- .../model_configs_gb200.csv | 11 ++++ .../model_configs_h100.csv | 4 +- .../strong_scaling_model_configs_h100.csv | 22 ++++---- 9 files changed, 71 insertions(+), 47 deletions(-) create mode 100644 scripts/llm/performance/recommended_model_configs/model_configs_gb200.csv diff --git a/scripts/llm/performance/finetune_llama31_405b.py b/scripts/llm/performance/finetune_llama31_405b.py index f87aa713f2af..67ab8a720380 100644 --- a/scripts/llm/performance/finetune_llama31_405b.py +++ b/scripts/llm/performance/finetune_llama31_405b.py @@ -77,7 +77,7 @@ def override_recipe_configs( recipe.trainer.plugins = bf16_with_fp8_mixed() recipe.trainer.plugins.grad_reduce_in_fp32 = False - enable_cuda_graph = bool(args.gpu.lower() in []) + enable_cuda_graph = bool(args.gpu.lower() in ["gb200"]) recipe.model.config.enable_cuda_graph = enable_cuda_graph recipe.trainer.strategy.use_te_rng_tracker = enable_cuda_graph recipe.data.packed_sequence_specs.pad_cu_seqlens = enable_cuda_graph diff --git a/scripts/llm/performance/finetune_llama3_70b.py b/scripts/llm/performance/finetune_llama3_70b.py index 8676754078b5..90b0e73ba4b7 100644 --- a/scripts/llm/performance/finetune_llama3_70b.py +++ b/scripts/llm/performance/finetune_llama3_70b.py @@ -50,7 +50,14 @@ def override_recipe_configs( NOTE: Use fp8 precision training with caution. It might not give desirable results. """ finetuning_scheme = "none" if args.finetuning == "sft" else args.finetuning - recipe = finetune_recipe(peft_scheme=finetuning_scheme, performance_mode=True) + gpu_type = args.gpu.lower() + if gpu_type in ["gb200"] and finetuning_scheme == "lora": + # On GB200 for lora task, we need to enable Cuda Graph for optimal performance. + # However, Cuda Graph increases memory usage, so in order to avoid OOM, we need + # to reduce the sequence length. 
+ recipe = finetune_recipe(peft_scheme=finetuning_scheme, performance_mode=True, seq_length=2048) + else: + recipe = finetune_recipe(peft_scheme=finetuning_scheme, performance_mode=True) recipe = set_primary_perf_configs( recipe, args.tensorboard, @@ -77,7 +84,7 @@ def override_recipe_configs( recipe.trainer.plugins = bf16_with_fp8_mixed() recipe.trainer.plugins.grad_reduce_in_fp32 = False - enable_cuda_graph = bool(args.gpu.lower() in [] and finetuning_scheme != "lora") + enable_cuda_graph = bool(args.gpu.lower() in ["gb200"] and finetuning_scheme == "lora") recipe.model.config.enable_cuda_graph = enable_cuda_graph recipe.trainer.strategy.use_te_rng_tracker = enable_cuda_graph recipe.data.packed_sequence_specs.pad_cu_seqlens = enable_cuda_graph diff --git a/scripts/llm/performance/finetune_llama3_8b.py b/scripts/llm/performance/finetune_llama3_8b.py index 83016e9b7cf4..f227093b86bc 100644 --- a/scripts/llm/performance/finetune_llama3_8b.py +++ b/scripts/llm/performance/finetune_llama3_8b.py @@ -52,7 +52,7 @@ def override_recipe_configs( finetuning_scheme = "none" if args.finetuning == "sft" else args.finetuning gpu_type = args.gpu.lower() - if gpu_type == "b200": + if gpu_type in ["b200", "gb200"]: recipe = finetune_recipe(peft_scheme=finetuning_scheme, performance_mode=True, seq_length=16384) else: recipe = finetune_recipe(peft_scheme=finetuning_scheme, performance_mode=True) @@ -82,7 +82,7 @@ def override_recipe_configs( recipe.trainer.plugins = bf16_with_fp8_mixed() recipe.trainer.plugins.grad_reduce_in_fp32 = False - enable_cuda_graph = bool(gpu_type in ["b200"]) + enable_cuda_graph = bool(gpu_type in ["b200", "gb200"]) recipe.model.config.enable_cuda_graph = enable_cuda_graph recipe.trainer.strategy.use_te_rng_tracker = enable_cuda_graph recipe.data.packed_sequence_specs.pad_cu_seqlens = enable_cuda_graph diff --git a/scripts/llm/performance/pretrain_llama3_8b.py b/scripts/llm/performance/pretrain_llama3_8b.py index 3543608482a3..756105079219 100644 --- a/scripts/llm/performance/pretrain_llama3_8b.py +++ b/scripts/llm/performance/pretrain_llama3_8b.py @@ -65,7 +65,7 @@ def override_recipe_configs( recipe.trainer.plugins = bf16_with_fp8_mixed() recipe.trainer.plugins.grad_reduce_in_fp32 = False - enable_cuda_graph = bool(args.gpu.lower() in []) + enable_cuda_graph = bool(args.gpu.lower() in ["gb200"]) recipe.model.config.enable_cuda_graph = enable_cuda_graph recipe.trainer.strategy.use_te_rng_tracker = enable_cuda_graph diff --git a/scripts/llm/performance/pretrain_mixtral_8x22b.py b/scripts/llm/performance/pretrain_mixtral_8x22b.py index 8d28f290fbd9..14c7a46714ca 100644 --- a/scripts/llm/performance/pretrain_mixtral_8x22b.py +++ b/scripts/llm/performance/pretrain_mixtral_8x22b.py @@ -13,6 +13,7 @@ # limitations under the License. from os.path import basename, splitext +from typing import Optional import nemo_run as run from argument_parser import parse_cli_args @@ -33,7 +34,7 @@ def override_recipe_configs( cp_size: int, vp_size: int, ep_size: int, - etp_size: int, + etp_size: Optional[int], ): """ mixtral 8x22b pre-train recipe aimed at achieving best possible performance. @@ -66,6 +67,11 @@ def override_recipe_configs( recipe.trainer.plugins = bf16_with_fp8_mixed() recipe.trainer.plugins.grad_reduce_in_fp32 = False + # to mitigate the incorrect gradient_scaling_factor calculation in megatron.core + # under scenario average_in_collective=True and tp_size != etp_size, disabling average_in_collective. 
+ if etp_size is not None and etp_size != tp_size: + recipe.trainer.strategy.ddp.average_in_collective = False + return recipe diff --git a/scripts/llm/performance/recommended_model_configs/model_configs_b200.csv b/scripts/llm/performance/recommended_model_configs/model_configs_b200.csv index b585d17c3e10..a564ffb03494 100644 --- a/scripts/llm/performance/recommended_model_configs/model_configs_b200.csv +++ b/scripts/llm/performance/recommended_model_configs/model_configs_b200.csv @@ -1,27 +1,27 @@ -task,model,size,system,dtype,num_gpus,seq_len,tp_size,pp_size,cp_size,dp_size,ep_size,etp_size,vp_size,mbs,gbs,etp_size -lora,llama3,70b,b200,fp8,8.0,4096.0,1.0,4.0,1.0,2.0,1.0,,20.0,1.0,32.0, -lora,llama3,70b,b200,bf16,8.0,4096.0,1.0,4.0,1.0,2.0,1.0,,20.0,1.0,32.0, -lora,llama3,8b,b200,bf16,8.0,4096.0,1.0,1.0,1.0,8.0,1.0,,1.0,1.0,8.0, -lora,llama3,8b,b200,fp8,8.0,4096.0,1.0,1.0,1.0,8.0,1.0,,1.0,1.0,8.0, -lora,llama31,405b,b200,fp8,24.0,2048.0,4.0,6.0,1.0,1.0,1.0,,7.0,1.0,24.0, -lora,llama31,405b,b200,bf16,24.0,4096.0,4.0,6.0,1.0,1.0,1.0,,7.0,1.0,24.0, -pre_train,gpt3,175b,b200,bf16,128.0,2048.0,4.0,4.0,1.0,8.0,1.0,,12.0,2.0,256.0, -pre_train,gpt3,175b,b200,fp8,128.0,2048.0,4.0,4.0,1.0,8.0,1.0,,12.0,2.0,256.0, -pre_train,llama3,8b,b200,bf16,8.0,8192.0,1.0,1.0,1.0,8.0,1.0,,1.0,2.0,128.0, -pre_train,llama3,8b,b200,fp8,8.0,8192.0,1.0,1.0,1.0,8.0,1.0,,1.0,2.0,128.0, -pre_train,llama3,70b,b200,fp8,64.0,8192.0,2.0,4.0,2.0,4.0,1.0,,5.0,1.0,128.0, -pre_train,llama3,70b,b200,bf16,64.0,8192.0,2.0,4.0,2.0,4.0,1.0,,5.0,1.0,128.0, -pre_train,llama31,405b,b200,bf16,144.0,8192.0,4.0,9.0,2.0,2.0,1.0,,7.0,1.0,36.0, -pre_train,llama31,405b,b200,fp8,144.0,8192.0,4.0,9.0,2.0,2.0,1.0,,7.0,1.0,36.0, -pre_train,mixtral,8x7b,b200,bf16,64.0,4096.0,1.0,1.0,1.0,64.0,8.0,,1.0,2.0,256.0, -pre_train,mixtral,8x7b,b200,fp8,64.0,4096.0,1.0,1.0,1.0,64.0,8.0,,1.0,2.0,256.0, -pre_train,mixtral,8x22b,b200,bf16,256.0,65536.0,2.0,4.0,8.0,4.0,8.0,1.0,14.0,1.0,64.0,1.0 -pre_train,mixtral,8x22b,b200,fp8,256.0,65536.0,2.0,4.0,8.0,4.0,8.0,1.0,14.0,1.0,64.0,1.0 -pre_train,nemotron4,15b,b200,bf16,64.0,4096.0,1.0,1.0,1.0,32.0,1.0,,1.0,2.0,256.0, -pre_train,nemotron4,340b,b200,bf16,128.0,4096.0,8.0,4.0,1.0,4.0,1.0,,12.0,1.0,32.0, -pre_train,nemotron4,340b,b200,fp8,128.0,4096.0,8.0,4.0,1.0,4.0,1.0,,12.0,1.0,32.0, -pre_train,nemotron4,15b,b200,fp8,64.0,4096.0,1.0,1.0,1.0,32.0,1.0,,1.0,2.0,256.0, -sft,llama3,8b,b200,fp8,8.0,4096.0,1.0,1.0,1.0,8.0,1.0,,1.0,1.0,8.0, -sft,llama3,70b,b200,bf16,32.0,4096.0,2.0,4.0,1.0,4.0,1.0,,5.0,1.0,32.0, -sft,llama3,70b,b200,fp8,32.0,4096.0,2.0,4.0,1.0,4.0,1.0,,5.0,1.0,32.0, -sft,llama3,8b,b200,bf16,8.0,4096.0,1.0,1.0,1.0,8.0,1.0,,1.0,1.0,8.0, +task,model,size,system,dtype,num_gpus,seq_len,tp_size,pp_size,cp_size,dp_size,ep_size,vp_size,mbs,gbs,etp_size +lora,llama3,70b,b200,fp8,8.0,4096.0,1.0,4.0,1.0,2.0,1.0,20.0,1.0,32.0, +lora,llama3,70b,b200,bf16,8.0,4096.0,1.0,4.0,1.0,2.0,1.0,20.0,1.0,32.0, +lora,llama3,8b,b200,bf16,8.0,4096.0,1.0,1.0,1.0,8.0,1.0,1.0,1.0,8.0, +lora,llama3,8b,b200,fp8,8.0,4096.0,1.0,1.0,1.0,8.0,1.0,1.0,1.0,8.0, +lora,llama31,405b,b200,fp8,24.0,2048.0,4.0,6.0,1.0,1.0,1.0,7.0,1.0,24.0, +lora,llama31,405b,b200,bf16,24.0,4096.0,4.0,6.0,1.0,1.0,1.0,7.0,1.0,24.0, +pre_train,gpt3,175b,b200,bf16,128.0,2048.0,4.0,4.0,1.0,8.0,1.0,12.0,2.0,256.0, +pre_train,gpt3,175b,b200,fp8,128.0,2048.0,4.0,4.0,1.0,8.0,1.0,12.0,2.0,256.0, +pre_train,llama3,8b,b200,bf16,8.0,8192.0,1.0,1.0,1.0,8.0,1.0,1.0,2.0,128.0, +pre_train,llama3,8b,b200,fp8,8.0,8192.0,1.0,1.0,1.0,8.0,1.0,1.0,2.0,128.0, 
+pre_train,llama3,70b,b200,fp8,64.0,8192.0,2.0,4.0,2.0,4.0,1.0,5.0,1.0,128.0, +pre_train,llama3,70b,b200,bf16,64.0,8192.0,2.0,4.0,2.0,4.0,1.0,5.0,1.0,128.0, +pre_train,llama31,405b,b200,bf16,512.0,8192.0,4.0,8.0,2.0,8.0,1.0,8.0,1.0,256.0, +pre_train,llama31,405b,b200,fp8,512.0,8192.0,4.0,8.0,2.0,8.0,1.0,8.0,1.0,256.0, +pre_train,mixtral,8x7b,b200,bf16,64.0,4096.0,1.0,1.0,1.0,64.0,8.0,1.0,2.0,256.0, +pre_train,mixtral,8x7b,b200,fp8,64.0,4096.0,1.0,1.0,1.0,64.0,8.0,1.0,2.0,256.0, +pre_train,mixtral,8x22b,b200,bf16,256.0,65536.0,2.0,4.0,8.0,4.0,8.0,14.0,1.0,64.0,1.0 +pre_train,mixtral,8x22b,b200,fp8,256.0,65536.0,2.0,4.0,8.0,4.0,8.0,14.0,1.0,64.0,1.0 +pre_train,nemotron4,15b,b200,bf16,64.0,4096.0,1.0,1.0,1.0,32.0,1.0,1.0,2.0,256.0, +pre_train,nemotron4,340b,b200,bf16,128.0,4096.0,8.0,4.0,1.0,4.0,1.0,12.0,1.0,32.0, +pre_train,nemotron4,340b,b200,fp8,128.0,4096.0,8.0,4.0,1.0,4.0,1.0,12.0,1.0,32.0, +pre_train,nemotron4,15b,b200,fp8,64.0,4096.0,1.0,1.0,1.0,32.0,1.0,1.0,2.0,256.0, +sft,llama3,8b,b200,fp8,8.0,4096.0,1.0,1.0,1.0,8.0,1.0,1.0,1.0,8.0, +sft,llama3,70b,b200,bf16,32.0,4096.0,2.0,4.0,1.0,4.0,1.0,5.0,1.0,32.0, +sft,llama3,70b,b200,fp8,32.0,4096.0,2.0,4.0,1.0,4.0,1.0,5.0,1.0,32.0, +sft,llama3,8b,b200,bf16,8.0,4096.0,1.0,1.0,1.0,8.0,1.0,1.0,1.0,8.0, diff --git a/scripts/llm/performance/recommended_model_configs/model_configs_gb200.csv b/scripts/llm/performance/recommended_model_configs/model_configs_gb200.csv new file mode 100644 index 000000000000..275555b13174 --- /dev/null +++ b/scripts/llm/performance/recommended_model_configs/model_configs_gb200.csv @@ -0,0 +1,11 @@ +task,model,size,system,dtype,num_gpus,seq_len,tp_size,pp_size,cp_size,dp_size,ep_size,vp_size,mbs,gbs,etp_size +lora,llama3,70b,gb200,fp8,8.0,2048.0,1.0,4.0,1.0,2.0,1.0,20.0,1.0,64.0, +lora,llama3,70b,gb200,bf16,8.0,2048.0,1.0,4.0,1.0,2.0,1.0,20.0,1.0,64.0, +lora,llama3,8b,gb200,bf16,8.0,16384.0,1.0,1.0,1.0,8.0,1.0,1.0,1.0,8.0, +lora,llama3,8b,gb200,fp8,8.0,16384.0,1.0,1.0,1.0,8.0,1.0,1.0,1.0,8.0, +lora,llama31,405b,gb200,fp8,24.0,2048.0,4.0,6.0,1.0,1.0,1.0,7.0,1.0,24.0, +lora,llama31,405b,gb200,bf16,24.0,2048.0,4.0,6.0,1.0,1.0,1.0,7.0,1.0,24.0, +pre_train,llama3,8b,gb200,bf16,8.0,8192.0,1.0,1.0,1.0,8.0,1.0,1.0,2.0,128.0, +pre_train,llama3,8b,gb200,fp8,8.0,8192.0,1.0,1.0,1.0,8.0,1.0,1.0,2.0,128.0, +sft,llama3,8b,gb200,fp8,8.0,16384.0,1.0,1.0,1.0,8.0,1.0,1.0,1.0,8.0, +sft,llama3,8b,gb200,bf16,8.0,16384.0,1.0,1.0,1.0,8.0,1.0,1.0,1.0,8.0, diff --git a/scripts/llm/performance/recommended_model_configs/model_configs_h100.csv b/scripts/llm/performance/recommended_model_configs/model_configs_h100.csv index 5dc2b5c04cee..cef1379dc366 100644 --- a/scripts/llm/performance/recommended_model_configs/model_configs_h100.csv +++ b/scripts/llm/performance/recommended_model_configs/model_configs_h100.csv @@ -8,8 +8,8 @@ lora,llama31,405b,h100,fp8,24.0,2048.0,4.0,6.0,1.0,1.0,1.0,7.0,1.0,24.0, pre_train,gpt3,175b,h100,fp8,512.0,2048.0,4.0,8.0,1.0,16.0,1.0,6.0,2.0,2048.0, pre_train,gpt3,175b,h100,fp8,128.0,2048.0,4.0,8.0,1.0,4.0,1.0,6.0,1.0,256.0, pre_train,gpt3,175b,h100,bf16,128.0,2048.0,4.0,8.0,1.0,4.0,1.0,6.0,1.0,256.0, -pre_train,llama31,405b,h100,fp8,576.0,8192.0,8.0,9.0,2.0,4.0,1.0,7.0,1.0,252.0, -pre_train,llama31,405b,h100,bf16,576.0,8192.0,8.0,9.0,2.0,4.0,1.0,7.0,1.0,252.0, +pre_train,llama31,405b,h100,fp8,512.0,8192.0,8.0,8.0,2.0,4.0,1.0,8.0,1.0,256.0, +pre_train,llama31,405b,h100,bf16,512.0,8192.0,8.0,8.0,2.0,4.0,1.0,8.0,1.0,256.0, pre_train,llama3,70b,h100,fp8,64.0,8192.0,4.0,8.0,1.0,2.0,1.0,5.0,1.0,128.0, 
pre_train,llama3,70b,h100,bf16,64.0,8192.0,4.0,4.0,2.0,2.0,1.0,5.0,1.0,128.0, pre_train,llama3,8b,h100,fp8,8.0,8192.0,1.0,1.0,2.0,4.0,1.0,1.0,1.0,128.0, diff --git a/scripts/llm/performance/recommended_model_configs/strong_scaling_model_configs_h100.csv b/scripts/llm/performance/recommended_model_configs/strong_scaling_model_configs_h100.csv index 4984243fe066..e43af08fdb0a 100644 --- a/scripts/llm/performance/recommended_model_configs/strong_scaling_model_configs_h100.csv +++ b/scripts/llm/performance/recommended_model_configs/strong_scaling_model_configs_h100.csv @@ -1,11 +1,11 @@ -task,model,size,system,dtype,num_gpus,seq_len,tp_size,pp_size,cp_size,dp_size,ep_size,vp_size,mbs,gbs -pre_train,nemotron4,15b,h100,fp8,128.0,4096.0,4.0,1.0,1.0,32.0,1.0,1.0,4.0,1024.0 -pre_train,nemotron4,15b,h100,fp8,256.0,4096.0,4.0,1.0,1.0,64.0,1.0,1.0,4.0,1024.0 -pre_train,nemotron4,15b,h100,fp8,64.0,4096.0,4.0,1.0,1.0,16.0,1.0,1.0,4.0,1024.0 -pre_train,nemotron4,15b,h100,fp8,16.0,4096.0,4.0,1.0,1.0,4.0,1.0,1.0,4.0,1024.0 -pre_train,nemotron4,15b,h100,fp8,512.0,4096.0,4.0,1.0,1.0,128.0,1.0,1.0,4.0,1024.0 -pre_train,nemotron4,15b,h100,fp8,1024.0,4096.0,4.0,1.0,1.0,256.0,1.0,1.0,4.0,1024.0 -pre_train,nemotron4,340b,h100,fp8,128.0,4096.0,8.0,8.0,1.0,2.0,1.0,12.0,1.0,512.0 -pre_train,nemotron4,340b,h100,fp8,256.0,4096.0,8.0,8.0,1.0,4.0,1.0,12.0,1.0,512.0 -pre_train,nemotron4,340b,h100,fp8,512.0,4096.0,8.0,8.0,1.0,8.0,1.0,12.0,1.0,512.0 -pre_train,nemotron4,340b,h100,fp8,1024.0,4096.0,8.0,8.0,1.0,16.0,1.0,12.0,1.0,512.0 +task,model,size,system,dtype,num_gpus,seq_len,tp_size,pp_size,cp_size,dp_size,ep_size,vp_size,mbs,gbs,etp_size +pre_train,nemotron4,15b,h100,fp8,128.0,4096.0,4.0,1.0,1.0,32.0,1.0,1.0,4.0,1024.0, +pre_train,nemotron4,15b,h100,fp8,256.0,4096.0,4.0,1.0,1.0,64.0,1.0,1.0,4.0,1024.0, +pre_train,nemotron4,15b,h100,fp8,64.0,4096.0,4.0,1.0,1.0,16.0,1.0,1.0,4.0,1024.0, +pre_train,nemotron4,15b,h100,fp8,16.0,4096.0,4.0,1.0,1.0,4.0,1.0,1.0,4.0,1024.0, +pre_train,nemotron4,15b,h100,fp8,512.0,4096.0,4.0,1.0,1.0,128.0,1.0,1.0,4.0,1024.0, +pre_train,nemotron4,15b,h100,fp8,1024.0,4096.0,4.0,1.0,1.0,256.0,1.0,1.0,4.0,1024.0, +pre_train,nemotron4,340b,h100,fp8,128.0,4096.0,8.0,8.0,1.0,2.0,1.0,12.0,1.0,512.0, +pre_train,nemotron4,340b,h100,fp8,256.0,4096.0,8.0,8.0,1.0,4.0,1.0,12.0,1.0,512.0, +pre_train,nemotron4,340b,h100,fp8,512.0,4096.0,8.0,8.0,1.0,8.0,1.0,12.0,1.0,512.0, +pre_train,nemotron4,340b,h100,fp8,1024.0,4096.0,8.0,8.0,1.0,16.0,1.0,12.0,1.0,512.0,
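
For readers skimming the diff, the Python changes above follow one recurring pattern: CUDA graphs are switched on only for the GPU types where they were validated (GB200 here, B200/GB200 in the 8B scripts), the GB200 LoRA recipe trades sequence length for the extra graph-capture memory, and average_in_collective is disabled when the expert tensor-parallel size differs from the tensor-parallel size to sidestep the megatron.core gradient_scaling_factor issue noted in the comment. The following is a condensed sketch of that pattern only; the function name is illustrative, while `recipe` and `args` are the objects the scripts already build via finetune_recipe/pretrain recipes and argument_parser.

def apply_gpu_specific_overrides(recipe, args, finetuning_scheme: str, tp_size: int, etp_size=None):
    gpu_type = args.gpu.lower()

    # CUDA graphs raise memory use, which is why the GB200 LoRA path for llama3 70b
    # is built with seq_length=2048 in the patch; the gating list varies per script.
    enable_cuda_graph = gpu_type in ["gb200"] and finetuning_scheme == "lora"
    recipe.model.config.enable_cuda_graph = enable_cuda_graph
    recipe.trainer.strategy.use_te_rng_tracker = enable_cuda_graph
    recipe.data.packed_sequence_specs.pad_cu_seqlens = enable_cuda_graph

    # Mitigation for the incorrect gradient_scaling_factor calculation in
    # megatron.core when average_in_collective=True and tp_size != etp_size.
    if etp_size is not None and etp_size != tp_size:
        recipe.trainer.strategy.ddp.average_in_collective = False
    return recipe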
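
The CSV changes align every recommended-config file (b200, gb200, h100, strong-scaling h100) on the same column order, with etp_size as a trailing column that is left blank when expert tensor parallelism is unused; this matches the `etp_size: Optional[int]` signature change in pretrain_mixtral_8x22b.py. Below is a minimal sketch of how such a row could be parsed, assuming the standard csv module; the RecommendedConfig dataclass and load_recommended_configs helper are illustrative names, not code from the repository.

import csv
from dataclasses import dataclass
from typing import Optional


@dataclass
class RecommendedConfig:
    task: str
    model: str
    size: str
    system: str
    dtype: str
    num_gpus: int
    seq_len: int
    tp_size: int
    pp_size: int
    cp_size: int
    dp_size: int
    ep_size: int
    vp_size: int
    mbs: int
    gbs: int
    etp_size: Optional[int]  # blank in the CSV when expert TP is not used


def _to_int(value: str) -> int:
    # Values are stored as floats like "8.0"; normalize to int.
    return int(float(value))


def load_recommended_configs(path: str) -> list[RecommendedConfig]:
    configs = []
    with open(path, newline="") as f:
        for row in csv.DictReader(f):
            configs.append(
                RecommendedConfig(
                    task=row["task"],
                    model=row["model"],
                    size=row["size"],
                    system=row["system"],
                    dtype=row["dtype"],
                    num_gpus=_to_int(row["num_gpus"]),
                    seq_len=_to_int(row["seq_len"]),
                    tp_size=_to_int(row["tp_size"]),
                    pp_size=_to_int(row["pp_size"]),
                    cp_size=_to_int(row["cp_size"]),
                    dp_size=_to_int(row["dp_size"]),
                    ep_size=_to_int(row["ep_size"]),
                    vp_size=_to_int(row["vp_size"]),
                    mbs=_to_int(row["mbs"]),
                    gbs=_to_int(row["gbs"]),
                    etp_size=_to_int(row["etp_size"]) if row["etp_size"].strip() else None,
                )
            )
    return configs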