From 8cc23e8058414ef1d591a10fc7e9d5015dd4e5fe Mon Sep 17 00:00:00 2001 From: Guyue Huang Date: Mon, 10 Feb 2025 14:13:51 -0800 Subject: [PATCH 01/11] Bug fix with generation of expert_tensor_parallel_rank Signed-off-by: Guyue Huang --- nemo/lightning/megatron_init.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/lightning/megatron_init.py b/nemo/lightning/megatron_init.py index 5f1d744e5b77..798bc4edb8bc 100644 --- a/nemo/lightning/megatron_init.py +++ b/nemo/lightning/megatron_init.py @@ -498,7 +498,7 @@ def generator_wrapper(group_type, is_expert=False, **kwargs): # ETP expert_tensor_parallel_rank = 0 if expert_tensor_parallel_size_ is not None and expert_tensor_parallel_size_ > 1: - for ranks in generator_wrapper('tp-ep', is_expert=True): + for ranks in generator_wrapper('tp', is_expert=True): if rank in ranks: expert_tensor_parallel_rank = list(ranks).index(rank) From bc8441e4cf7f68aaf341e80ce331a381033ce029 Mon Sep 17 00:00:00 2001 From: Guyue Huang Date: Mon, 10 Feb 2025 14:18:23 -0800 Subject: [PATCH 02/11] Fix pylint Signed-off-by: Guyue Huang --- nemo/lightning/megatron_init.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/lightning/megatron_init.py b/nemo/lightning/megatron_init.py index 798bc4edb8bc..fab6d17da3cb 100644 --- a/nemo/lightning/megatron_init.py +++ b/nemo/lightning/megatron_init.py @@ -108,7 +108,7 @@ def initialize_model_parallel_for_nemo( use_tp_pp_dp_mapping=False, use_te_rng_tracker=False, ): - + """Initialize model parallel groups in NeMo.""" if virtual_pipeline_model_parallel_size is not None and not HAVE_INTERLEAVED: raise ValueError("set_virtual_pipeline_model_parallel_world_size is needed in megatron-core for interleaved.") From 37dbae31cbbbb5a71ab8b413891b0f510b958932 Mon Sep 17 00:00:00 2001 From: Guyue Huang Date: Tue, 11 Feb 2025 11:53:57 -0800 Subject: [PATCH 03/11] Support perf script with gb200 Signed-off-by: Guyue Huang --- scripts/llm/performance/finetune_llama31_405b.py | 2 +- scripts/llm/performance/finetune_llama3_70b.py | 2 +- scripts/llm/performance/finetune_llama3_8b.py | 4 ++-- scripts/llm/performance/pretrain_llama3_8b.py | 2 +- .../recommended_model_configs/model_configs_gb200.csv | 11 +++++++++++ 5 files changed, 16 insertions(+), 5 deletions(-) create mode 100644 scripts/llm/performance/recommended_model_configs/model_configs_gb200.csv diff --git a/scripts/llm/performance/finetune_llama31_405b.py b/scripts/llm/performance/finetune_llama31_405b.py index f87aa713f2af..67ab8a720380 100644 --- a/scripts/llm/performance/finetune_llama31_405b.py +++ b/scripts/llm/performance/finetune_llama31_405b.py @@ -77,7 +77,7 @@ def override_recipe_configs( recipe.trainer.plugins = bf16_with_fp8_mixed() recipe.trainer.plugins.grad_reduce_in_fp32 = False - enable_cuda_graph = bool(args.gpu.lower() in []) + enable_cuda_graph = bool(args.gpu.lower() in ["gb200"]) recipe.model.config.enable_cuda_graph = enable_cuda_graph recipe.trainer.strategy.use_te_rng_tracker = enable_cuda_graph recipe.data.packed_sequence_specs.pad_cu_seqlens = enable_cuda_graph diff --git a/scripts/llm/performance/finetune_llama3_70b.py b/scripts/llm/performance/finetune_llama3_70b.py index 8676754078b5..3ba26aa8a7e9 100644 --- a/scripts/llm/performance/finetune_llama3_70b.py +++ b/scripts/llm/performance/finetune_llama3_70b.py @@ -77,7 +77,7 @@ def override_recipe_configs( recipe.trainer.plugins = bf16_with_fp8_mixed() recipe.trainer.plugins.grad_reduce_in_fp32 = False - enable_cuda_graph = bool(args.gpu.lower() in [] and 
finetuning_scheme != "lora") + enable_cuda_graph = bool(args.gpu.lower() in ["gb200"] and finetuning_scheme == "lora") recipe.model.config.enable_cuda_graph = enable_cuda_graph recipe.trainer.strategy.use_te_rng_tracker = enable_cuda_graph recipe.data.packed_sequence_specs.pad_cu_seqlens = enable_cuda_graph diff --git a/scripts/llm/performance/finetune_llama3_8b.py b/scripts/llm/performance/finetune_llama3_8b.py index 83016e9b7cf4..f227093b86bc 100644 --- a/scripts/llm/performance/finetune_llama3_8b.py +++ b/scripts/llm/performance/finetune_llama3_8b.py @@ -52,7 +52,7 @@ def override_recipe_configs( finetuning_scheme = "none" if args.finetuning == "sft" else args.finetuning gpu_type = args.gpu.lower() - if gpu_type == "b200": + if gpu_type in ["b200", "gb200"]: recipe = finetune_recipe(peft_scheme=finetuning_scheme, performance_mode=True, seq_length=16384) else: recipe = finetune_recipe(peft_scheme=finetuning_scheme, performance_mode=True) @@ -82,7 +82,7 @@ def override_recipe_configs( recipe.trainer.plugins = bf16_with_fp8_mixed() recipe.trainer.plugins.grad_reduce_in_fp32 = False - enable_cuda_graph = bool(gpu_type in ["b200"]) + enable_cuda_graph = bool(gpu_type in ["b200", "gb200"]) recipe.model.config.enable_cuda_graph = enable_cuda_graph recipe.trainer.strategy.use_te_rng_tracker = enable_cuda_graph recipe.data.packed_sequence_specs.pad_cu_seqlens = enable_cuda_graph diff --git a/scripts/llm/performance/pretrain_llama3_8b.py b/scripts/llm/performance/pretrain_llama3_8b.py index 3543608482a3..756105079219 100644 --- a/scripts/llm/performance/pretrain_llama3_8b.py +++ b/scripts/llm/performance/pretrain_llama3_8b.py @@ -65,7 +65,7 @@ def override_recipe_configs( recipe.trainer.plugins = bf16_with_fp8_mixed() recipe.trainer.plugins.grad_reduce_in_fp32 = False - enable_cuda_graph = bool(args.gpu.lower() in []) + enable_cuda_graph = bool(args.gpu.lower() in ["gb200"]) recipe.model.config.enable_cuda_graph = enable_cuda_graph recipe.trainer.strategy.use_te_rng_tracker = enable_cuda_graph diff --git a/scripts/llm/performance/recommended_model_configs/model_configs_gb200.csv b/scripts/llm/performance/recommended_model_configs/model_configs_gb200.csv new file mode 100644 index 000000000000..c258b1da165d --- /dev/null +++ b/scripts/llm/performance/recommended_model_configs/model_configs_gb200.csv @@ -0,0 +1,11 @@ +task,model,size,system,dtype,num_gpus,seq_len,tp_size,pp_size,cp_size,dp_size,ep_size,etp_size,vp_size,mbs,gbs,etp_size +lora,llama3,70b,gb200,fp8,8.0,2048.0,1.0,4.0,1.0,2.0,1.0,,20.0,1.0,32.0, +lora,llama3,70b,gb200,bf16,8.0,2048.0,1.0,4.0,1.0,2.0,1.0,,20.0,1.0,32.0, +lora,llama3,8b,gb200,bf16,8.0,4096.0,1.0,1.0,1.0,8.0,1.0,,1.0,1.0,8.0, +lora,llama3,8b,gb200,fp8,8.0,4096.0,1.0,1.0,1.0,8.0,1.0,,1.0,1.0,8.0, +lora,llama31,405b,gb200,fp8,24.0,2048.0,4.0,6.0,1.0,1.0,1.0,,7.0,1.0,24.0, +lora,llama31,405b,gb200,bf16,24.0,4096.0,4.0,6.0,1.0,1.0,1.0,,7.0,1.0,24.0, +pre_train,llama3,8b,gb200,bf16,8.0,8192.0,1.0,1.0,1.0,8.0,1.0,,1.0,2.0,128.0, +pre_train,llama3,8b,gb200,fp8,8.0,8192.0,1.0,1.0,1.0,8.0,1.0,,1.0,2.0,128.0, +sft,llama3,8b,gb200,fp8,8.0,4096.0,1.0,1.0,1.0,8.0,1.0,,1.0,1.0,8.0, +sft,llama3,8b,gb200,bf16,8.0,4096.0,1.0,1.0,1.0,8.0,1.0,,1.0,1.0,8.0, From 2db23e2caa1b22681ec39908cdae1b3d59697ae6 Mon Sep 17 00:00:00 2001 From: Guyue Huang Date: Tue, 11 Feb 2025 12:06:50 -0800 Subject: [PATCH 04/11] fix Signed-off-by: Guyue Huang --- scripts/llm/performance/finetune_llama3_70b.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/scripts/llm/performance/finetune_llama3_70b.py 
b/scripts/llm/performance/finetune_llama3_70b.py index 3ba26aa8a7e9..bf47bfb4cdbf 100644 --- a/scripts/llm/performance/finetune_llama3_70b.py +++ b/scripts/llm/performance/finetune_llama3_70b.py @@ -51,6 +51,11 @@ def override_recipe_configs( """ finetuning_scheme = "none" if args.finetuning == "sft" else args.finetuning recipe = finetune_recipe(peft_scheme=finetuning_scheme, performance_mode=True) + gpu_type = args.gpu.lower() + if gpu_type in ["gb200"] and finetuning_scheme == "lora": + recipe = finetune_recipe(peft_scheme=finetuning_scheme, performance_mode=True, seq_length=2048) + else: + recipe = finetune_recipe(peft_scheme=finetuning_scheme, performance_mode=True) recipe = set_primary_perf_configs( recipe, args.tensorboard, From 8d87cbfea79eb13ed550eb41fc2af70caed42b04 Mon Sep 17 00:00:00 2001 From: Guyue Huang Date: Tue, 11 Feb 2025 13:21:35 -0800 Subject: [PATCH 05/11] mixtral 8x22b recipe change to mitigate a megatron.core bug Signed-off-by: Guyue Huang --- scripts/llm/performance/pretrain_mixtral_8x22b.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/llm/performance/pretrain_mixtral_8x22b.py b/scripts/llm/performance/pretrain_mixtral_8x22b.py index 8d28f290fbd9..b2458fdc90fe 100644 --- a/scripts/llm/performance/pretrain_mixtral_8x22b.py +++ b/scripts/llm/performance/pretrain_mixtral_8x22b.py @@ -66,6 +66,10 @@ def override_recipe_configs( recipe.trainer.plugins = bf16_with_fp8_mixed() recipe.trainer.plugins.grad_reduce_in_fp32 = False + # to mitigate the incorrect gradient_scaling_factor calculation in megatron.core + # under scenario average_in_collective=True and etp_size>1, disabling average_in_collective. + recipe.trainer.strategy.ddp.average_in_collective = False + return recipe From 215b51eee641cee70a1111b24135f3da67a28133 Mon Sep 17 00:00:00 2001 From: Guyue Huang Date: Wed, 12 Feb 2025 09:46:34 -0800 Subject: [PATCH 06/11] Make column names of all csvs align; fix for 8x22b Signed-off-by: Guyue Huang --- .../llm/performance/finetune_llama3_70b.py | 1 - .../llm/performance/pretrain_mixtral_8x22b.py | 7 +-- .../model_configs_b200.csv | 54 +++++++++---------- .../model_configs_gb200.csv | 22 ++++---- .../strong_scaling_model_configs_h100.csv | 22 ++++---- 5 files changed, 53 insertions(+), 53 deletions(-) diff --git a/scripts/llm/performance/finetune_llama3_70b.py b/scripts/llm/performance/finetune_llama3_70b.py index bf47bfb4cdbf..7c9cab5c40bb 100644 --- a/scripts/llm/performance/finetune_llama3_70b.py +++ b/scripts/llm/performance/finetune_llama3_70b.py @@ -50,7 +50,6 @@ def override_recipe_configs( NOTE: Use fp8 precision training with caution. It might not give desirable results. """ finetuning_scheme = "none" if args.finetuning == "sft" else args.finetuning - recipe = finetune_recipe(peft_scheme=finetuning_scheme, performance_mode=True) gpu_type = args.gpu.lower() if gpu_type in ["gb200"] and finetuning_scheme == "lora": recipe = finetune_recipe(peft_scheme=finetuning_scheme, performance_mode=True, seq_length=2048) diff --git a/scripts/llm/performance/pretrain_mixtral_8x22b.py b/scripts/llm/performance/pretrain_mixtral_8x22b.py index b2458fdc90fe..d75efc9a5936 100644 --- a/scripts/llm/performance/pretrain_mixtral_8x22b.py +++ b/scripts/llm/performance/pretrain_mixtral_8x22b.py @@ -33,7 +33,7 @@ def override_recipe_configs( cp_size: int, vp_size: int, ep_size: int, - etp_size: int, + etp_size: Optional[int], ): """ mixtral 8x22b pre-train recipe aimed at achieving best possible performance. 
@@ -67,8 +67,9 @@ def override_recipe_configs( recipe.trainer.plugins.grad_reduce_in_fp32 = False # to mitigate the incorrect gradient_scaling_factor calculation in megatron.core - # under scenario average_in_collective=True and etp_size>1, disabling average_in_collective. - recipe.trainer.strategy.ddp.average_in_collective = False + # under scenario average_in_collective=True and tp_size != etp_size, disabling average_in_collective. + if etp_size is not None and etp_size != tp_size: + recipe.trainer.strategy.ddp.average_in_collective = False return recipe diff --git a/scripts/llm/performance/recommended_model_configs/model_configs_b200.csv b/scripts/llm/performance/recommended_model_configs/model_configs_b200.csv index b585d17c3e10..8e8b940be04f 100644 --- a/scripts/llm/performance/recommended_model_configs/model_configs_b200.csv +++ b/scripts/llm/performance/recommended_model_configs/model_configs_b200.csv @@ -1,27 +1,27 @@ -task,model,size,system,dtype,num_gpus,seq_len,tp_size,pp_size,cp_size,dp_size,ep_size,etp_size,vp_size,mbs,gbs,etp_size -lora,llama3,70b,b200,fp8,8.0,4096.0,1.0,4.0,1.0,2.0,1.0,,20.0,1.0,32.0, -lora,llama3,70b,b200,bf16,8.0,4096.0,1.0,4.0,1.0,2.0,1.0,,20.0,1.0,32.0, -lora,llama3,8b,b200,bf16,8.0,4096.0,1.0,1.0,1.0,8.0,1.0,,1.0,1.0,8.0, -lora,llama3,8b,b200,fp8,8.0,4096.0,1.0,1.0,1.0,8.0,1.0,,1.0,1.0,8.0, -lora,llama31,405b,b200,fp8,24.0,2048.0,4.0,6.0,1.0,1.0,1.0,,7.0,1.0,24.0, -lora,llama31,405b,b200,bf16,24.0,4096.0,4.0,6.0,1.0,1.0,1.0,,7.0,1.0,24.0, -pre_train,gpt3,175b,b200,bf16,128.0,2048.0,4.0,4.0,1.0,8.0,1.0,,12.0,2.0,256.0, -pre_train,gpt3,175b,b200,fp8,128.0,2048.0,4.0,4.0,1.0,8.0,1.0,,12.0,2.0,256.0, -pre_train,llama3,8b,b200,bf16,8.0,8192.0,1.0,1.0,1.0,8.0,1.0,,1.0,2.0,128.0, -pre_train,llama3,8b,b200,fp8,8.0,8192.0,1.0,1.0,1.0,8.0,1.0,,1.0,2.0,128.0, -pre_train,llama3,70b,b200,fp8,64.0,8192.0,2.0,4.0,2.0,4.0,1.0,,5.0,1.0,128.0, -pre_train,llama3,70b,b200,bf16,64.0,8192.0,2.0,4.0,2.0,4.0,1.0,,5.0,1.0,128.0, -pre_train,llama31,405b,b200,bf16,144.0,8192.0,4.0,9.0,2.0,2.0,1.0,,7.0,1.0,36.0, -pre_train,llama31,405b,b200,fp8,144.0,8192.0,4.0,9.0,2.0,2.0,1.0,,7.0,1.0,36.0, -pre_train,mixtral,8x7b,b200,bf16,64.0,4096.0,1.0,1.0,1.0,64.0,8.0,,1.0,2.0,256.0, -pre_train,mixtral,8x7b,b200,fp8,64.0,4096.0,1.0,1.0,1.0,64.0,8.0,,1.0,2.0,256.0, -pre_train,mixtral,8x22b,b200,bf16,256.0,65536.0,2.0,4.0,8.0,4.0,8.0,1.0,14.0,1.0,64.0,1.0 -pre_train,mixtral,8x22b,b200,fp8,256.0,65536.0,2.0,4.0,8.0,4.0,8.0,1.0,14.0,1.0,64.0,1.0 -pre_train,nemotron4,15b,b200,bf16,64.0,4096.0,1.0,1.0,1.0,32.0,1.0,,1.0,2.0,256.0, -pre_train,nemotron4,340b,b200,bf16,128.0,4096.0,8.0,4.0,1.0,4.0,1.0,,12.0,1.0,32.0, -pre_train,nemotron4,340b,b200,fp8,128.0,4096.0,8.0,4.0,1.0,4.0,1.0,,12.0,1.0,32.0, -pre_train,nemotron4,15b,b200,fp8,64.0,4096.0,1.0,1.0,1.0,32.0,1.0,,1.0,2.0,256.0, -sft,llama3,8b,b200,fp8,8.0,4096.0,1.0,1.0,1.0,8.0,1.0,,1.0,1.0,8.0, -sft,llama3,70b,b200,bf16,32.0,4096.0,2.0,4.0,1.0,4.0,1.0,,5.0,1.0,32.0, -sft,llama3,70b,b200,fp8,32.0,4096.0,2.0,4.0,1.0,4.0,1.0,,5.0,1.0,32.0, -sft,llama3,8b,b200,bf16,8.0,4096.0,1.0,1.0,1.0,8.0,1.0,,1.0,1.0,8.0, +task,model,size,system,dtype,num_gpus,seq_len,tp_size,pp_size,cp_size,dp_size,ep_size,vp_size,mbs,gbs,etp_size +lora,llama3,70b,b200,fp8,8.0,4096.0,1.0,4.0,1.0,2.0,1.0,20.0,1.0,32.0, +lora,llama3,70b,b200,bf16,8.0,4096.0,1.0,4.0,1.0,2.0,1.0,20.0,1.0,32.0, +lora,llama3,8b,b200,bf16,8.0,4096.0,1.0,1.0,1.0,8.0,1.0,1.0,1.0,8.0, +lora,llama3,8b,b200,fp8,8.0,4096.0,1.0,1.0,1.0,8.0,1.0,1.0,1.0,8.0, 
+lora,llama31,405b,b200,fp8,24.0,2048.0,4.0,6.0,1.0,1.0,1.0,7.0,1.0,24.0, +lora,llama31,405b,b200,bf16,24.0,4096.0,4.0,6.0,1.0,1.0,1.0,7.0,1.0,24.0, +pre_train,gpt3,175b,b200,bf16,128.0,2048.0,4.0,4.0,1.0,8.0,1.0,12.0,2.0,256.0, +pre_train,gpt3,175b,b200,fp8,128.0,2048.0,4.0,4.0,1.0,8.0,1.0,12.0,2.0,256.0, +pre_train,llama3,8b,b200,bf16,8.0,8192.0,1.0,1.0,1.0,8.0,1.0,1.0,2.0,128.0, +pre_train,llama3,8b,b200,fp8,8.0,8192.0,1.0,1.0,1.0,8.0,1.0,1.0,2.0,128.0, +pre_train,llama3,70b,b200,fp8,64.0,8192.0,2.0,4.0,2.0,4.0,1.0,5.0,1.0,128.0, +pre_train,llama3,70b,b200,bf16,64.0,8192.0,2.0,4.0,2.0,4.0,1.0,5.0,1.0,128.0, +pre_train,llama31,405b,b200,bf16,144.0,8192.0,4.0,9.0,2.0,2.0,1.0,7.0,1.0,36.0, +pre_train,llama31,405b,b200,fp8,144.0,8192.0,4.0,9.0,2.0,2.0,1.0,7.0,1.0,36.0, +pre_train,mixtral,8x7b,b200,bf16,64.0,4096.0,1.0,1.0,1.0,64.0,8.0,1.0,2.0,256.0, +pre_train,mixtral,8x7b,b200,fp8,64.0,4096.0,1.0,1.0,1.0,64.0,8.0,1.0,2.0,256.0, +pre_train,mixtral,8x22b,b200,bf16,256.0,65536.0,2.0,4.0,8.0,4.0,8.0,14.0,1.0,64.0,1.0 +pre_train,mixtral,8x22b,b200,fp8,256.0,65536.0,2.0,4.0,8.0,4.0,8.0,14.0,1.0,64.0,1.0 +pre_train,nemotron4,15b,b200,bf16,64.0,4096.0,1.0,1.0,1.0,32.0,1.0,1.0,2.0,256.0, +pre_train,nemotron4,340b,b200,bf16,128.0,4096.0,8.0,4.0,1.0,4.0,1.0,12.0,1.0,32.0, +pre_train,nemotron4,340b,b200,fp8,128.0,4096.0,8.0,4.0,1.0,4.0,1.0,12.0,1.0,32.0, +pre_train,nemotron4,15b,b200,fp8,64.0,4096.0,1.0,1.0,1.0,32.0,1.0,1.0,2.0,256.0, +sft,llama3,8b,b200,fp8,8.0,4096.0,1.0,1.0,1.0,8.0,1.0,1.0,1.0,8.0, +sft,llama3,70b,b200,bf16,32.0,4096.0,2.0,4.0,1.0,4.0,1.0,5.0,1.0,32.0, +sft,llama3,70b,b200,fp8,32.0,4096.0,2.0,4.0,1.0,4.0,1.0,5.0,1.0,32.0, +sft,llama3,8b,b200,bf16,8.0,4096.0,1.0,1.0,1.0,8.0,1.0,1.0,1.0,8.0, diff --git a/scripts/llm/performance/recommended_model_configs/model_configs_gb200.csv b/scripts/llm/performance/recommended_model_configs/model_configs_gb200.csv index c258b1da165d..54a873c0e976 100644 --- a/scripts/llm/performance/recommended_model_configs/model_configs_gb200.csv +++ b/scripts/llm/performance/recommended_model_configs/model_configs_gb200.csv @@ -1,11 +1,11 @@ -task,model,size,system,dtype,num_gpus,seq_len,tp_size,pp_size,cp_size,dp_size,ep_size,etp_size,vp_size,mbs,gbs,etp_size -lora,llama3,70b,gb200,fp8,8.0,2048.0,1.0,4.0,1.0,2.0,1.0,,20.0,1.0,32.0, -lora,llama3,70b,gb200,bf16,8.0,2048.0,1.0,4.0,1.0,2.0,1.0,,20.0,1.0,32.0, -lora,llama3,8b,gb200,bf16,8.0,4096.0,1.0,1.0,1.0,8.0,1.0,,1.0,1.0,8.0, -lora,llama3,8b,gb200,fp8,8.0,4096.0,1.0,1.0,1.0,8.0,1.0,,1.0,1.0,8.0, -lora,llama31,405b,gb200,fp8,24.0,2048.0,4.0,6.0,1.0,1.0,1.0,,7.0,1.0,24.0, -lora,llama31,405b,gb200,bf16,24.0,4096.0,4.0,6.0,1.0,1.0,1.0,,7.0,1.0,24.0, -pre_train,llama3,8b,gb200,bf16,8.0,8192.0,1.0,1.0,1.0,8.0,1.0,,1.0,2.0,128.0, -pre_train,llama3,8b,gb200,fp8,8.0,8192.0,1.0,1.0,1.0,8.0,1.0,,1.0,2.0,128.0, -sft,llama3,8b,gb200,fp8,8.0,4096.0,1.0,1.0,1.0,8.0,1.0,,1.0,1.0,8.0, -sft,llama3,8b,gb200,bf16,8.0,4096.0,1.0,1.0,1.0,8.0,1.0,,1.0,1.0,8.0, +task,model,size,system,dtype,num_gpus,seq_len,tp_size,pp_size,cp_size,dp_size,ep_size,vp_size,mbs,gbs,etp_size +lora,llama3,70b,gb200,fp8,8.0,2048.0,1.0,4.0,1.0,2.0,1.0,20.0,1.0,32.0, +lora,llama3,70b,gb200,bf16,8.0,2048.0,1.0,4.0,1.0,2.0,1.0,20.0,1.0,32.0, +lora,llama3,8b,gb200,bf16,8.0,4096.0,1.0,1.0,1.0,8.0,1.0,1.0,1.0,8.0, +lora,llama3,8b,gb200,fp8,8.0,4096.0,1.0,1.0,1.0,8.0,1.0,1.0,1.0,8.0, +lora,llama31,405b,gb200,fp8,24.0,2048.0,4.0,6.0,1.0,1.0,1.0,7.0,1.0,24.0, +lora,llama31,405b,gb200,bf16,24.0,4096.0,4.0,6.0,1.0,1.0,1.0,7.0,1.0,24.0, 
+pre_train,llama3,8b,gb200,bf16,8.0,8192.0,1.0,1.0,1.0,8.0,1.0,1.0,2.0,128.0, +pre_train,llama3,8b,gb200,fp8,8.0,8192.0,1.0,1.0,1.0,8.0,1.0,1.0,2.0,128.0, +sft,llama3,8b,gb200,fp8,8.0,4096.0,1.0,1.0,1.0,8.0,1.0,1.0,1.0,8.0, +sft,llama3,8b,gb200,bf16,8.0,4096.0,1.0,1.0,1.0,8.0,1.0,1.0,1.0,8.0, diff --git a/scripts/llm/performance/recommended_model_configs/strong_scaling_model_configs_h100.csv b/scripts/llm/performance/recommended_model_configs/strong_scaling_model_configs_h100.csv index 4984243fe066..e43af08fdb0a 100644 --- a/scripts/llm/performance/recommended_model_configs/strong_scaling_model_configs_h100.csv +++ b/scripts/llm/performance/recommended_model_configs/strong_scaling_model_configs_h100.csv @@ -1,11 +1,11 @@ -task,model,size,system,dtype,num_gpus,seq_len,tp_size,pp_size,cp_size,dp_size,ep_size,vp_size,mbs,gbs -pre_train,nemotron4,15b,h100,fp8,128.0,4096.0,4.0,1.0,1.0,32.0,1.0,1.0,4.0,1024.0 -pre_train,nemotron4,15b,h100,fp8,256.0,4096.0,4.0,1.0,1.0,64.0,1.0,1.0,4.0,1024.0 -pre_train,nemotron4,15b,h100,fp8,64.0,4096.0,4.0,1.0,1.0,16.0,1.0,1.0,4.0,1024.0 -pre_train,nemotron4,15b,h100,fp8,16.0,4096.0,4.0,1.0,1.0,4.0,1.0,1.0,4.0,1024.0 -pre_train,nemotron4,15b,h100,fp8,512.0,4096.0,4.0,1.0,1.0,128.0,1.0,1.0,4.0,1024.0 -pre_train,nemotron4,15b,h100,fp8,1024.0,4096.0,4.0,1.0,1.0,256.0,1.0,1.0,4.0,1024.0 -pre_train,nemotron4,340b,h100,fp8,128.0,4096.0,8.0,8.0,1.0,2.0,1.0,12.0,1.0,512.0 -pre_train,nemotron4,340b,h100,fp8,256.0,4096.0,8.0,8.0,1.0,4.0,1.0,12.0,1.0,512.0 -pre_train,nemotron4,340b,h100,fp8,512.0,4096.0,8.0,8.0,1.0,8.0,1.0,12.0,1.0,512.0 -pre_train,nemotron4,340b,h100,fp8,1024.0,4096.0,8.0,8.0,1.0,16.0,1.0,12.0,1.0,512.0 +task,model,size,system,dtype,num_gpus,seq_len,tp_size,pp_size,cp_size,dp_size,ep_size,vp_size,mbs,gbs,etp_size +pre_train,nemotron4,15b,h100,fp8,128.0,4096.0,4.0,1.0,1.0,32.0,1.0,1.0,4.0,1024.0, +pre_train,nemotron4,15b,h100,fp8,256.0,4096.0,4.0,1.0,1.0,64.0,1.0,1.0,4.0,1024.0, +pre_train,nemotron4,15b,h100,fp8,64.0,4096.0,4.0,1.0,1.0,16.0,1.0,1.0,4.0,1024.0, +pre_train,nemotron4,15b,h100,fp8,16.0,4096.0,4.0,1.0,1.0,4.0,1.0,1.0,4.0,1024.0, +pre_train,nemotron4,15b,h100,fp8,512.0,4096.0,4.0,1.0,1.0,128.0,1.0,1.0,4.0,1024.0, +pre_train,nemotron4,15b,h100,fp8,1024.0,4096.0,4.0,1.0,1.0,256.0,1.0,1.0,4.0,1024.0, +pre_train,nemotron4,340b,h100,fp8,128.0,4096.0,8.0,8.0,1.0,2.0,1.0,12.0,1.0,512.0, +pre_train,nemotron4,340b,h100,fp8,256.0,4096.0,8.0,8.0,1.0,4.0,1.0,12.0,1.0,512.0, +pre_train,nemotron4,340b,h100,fp8,512.0,4096.0,8.0,8.0,1.0,8.0,1.0,12.0,1.0,512.0, +pre_train,nemotron4,340b,h100,fp8,1024.0,4096.0,8.0,8.0,1.0,16.0,1.0,12.0,1.0,512.0, From 3071d05ef1cd4f26f260bb23bacf725c53bf50ad Mon Sep 17 00:00:00 2001 From: Guyue Huang Date: Wed, 12 Feb 2025 09:48:08 -0800 Subject: [PATCH 07/11] typing fix Signed-off-by: Guyue Huang --- scripts/llm/performance/pretrain_mixtral_8x22b.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/llm/performance/pretrain_mixtral_8x22b.py b/scripts/llm/performance/pretrain_mixtral_8x22b.py index d75efc9a5936..14c7a46714ca 100644 --- a/scripts/llm/performance/pretrain_mixtral_8x22b.py +++ b/scripts/llm/performance/pretrain_mixtral_8x22b.py @@ -13,6 +13,7 @@ # limitations under the License. 
from os.path import basename, splitext +from typing import Optional import nemo_run as run from argument_parser import parse_cli_args From a9531976f0517cdc420364c7fb58b8e8380dc504 Mon Sep 17 00:00:00 2001 From: Guyue Huang Date: Wed, 12 Feb 2025 10:21:54 -0800 Subject: [PATCH 08/11] Adjust 405B config due to asymmetric PP Signed-off-by: Guyue Huang --- .../recommended_model_configs/model_configs_b200.csv | 4 ++-- .../recommended_model_configs/model_configs_h100.csv | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/llm/performance/recommended_model_configs/model_configs_b200.csv b/scripts/llm/performance/recommended_model_configs/model_configs_b200.csv index 8e8b940be04f..9d7b44ff6d3a 100644 --- a/scripts/llm/performance/recommended_model_configs/model_configs_b200.csv +++ b/scripts/llm/performance/recommended_model_configs/model_configs_b200.csv @@ -11,8 +11,8 @@ pre_train,llama3,8b,b200,bf16,8.0,8192.0,1.0,1.0,1.0,8.0,1.0,1.0,2.0,128.0, pre_train,llama3,8b,b200,fp8,8.0,8192.0,1.0,1.0,1.0,8.0,1.0,1.0,2.0,128.0, pre_train,llama3,70b,b200,fp8,64.0,8192.0,2.0,4.0,2.0,4.0,1.0,5.0,1.0,128.0, pre_train,llama3,70b,b200,bf16,64.0,8192.0,2.0,4.0,2.0,4.0,1.0,5.0,1.0,128.0, -pre_train,llama31,405b,b200,bf16,144.0,8192.0,4.0,9.0,2.0,2.0,1.0,7.0,1.0,36.0, -pre_train,llama31,405b,b200,fp8,144.0,8192.0,4.0,9.0,2.0,2.0,1.0,7.0,1.0,36.0, +pre_train,llama31,405b,b200,bf16,512.0,8192.0,4.0,8.0,2.0,8.0,1.0,8.0,1.0,128.0, +pre_train,llama31,405b,b200,fp8,512.0,8192.0,4.0,8.0,2.0,8.0,1.0,8.0,1.0,128.0, pre_train,mixtral,8x7b,b200,bf16,64.0,4096.0,1.0,1.0,1.0,64.0,8.0,1.0,2.0,256.0, pre_train,mixtral,8x7b,b200,fp8,64.0,4096.0,1.0,1.0,1.0,64.0,8.0,1.0,2.0,256.0, pre_train,mixtral,8x22b,b200,bf16,256.0,65536.0,2.0,4.0,8.0,4.0,8.0,14.0,1.0,64.0,1.0 diff --git a/scripts/llm/performance/recommended_model_configs/model_configs_h100.csv b/scripts/llm/performance/recommended_model_configs/model_configs_h100.csv index 5dc2b5c04cee..965d0fc5b8ea 100644 --- a/scripts/llm/performance/recommended_model_configs/model_configs_h100.csv +++ b/scripts/llm/performance/recommended_model_configs/model_configs_h100.csv @@ -8,8 +8,8 @@ lora,llama31,405b,h100,fp8,24.0,2048.0,4.0,6.0,1.0,1.0,1.0,7.0,1.0,24.0, pre_train,gpt3,175b,h100,fp8,512.0,2048.0,4.0,8.0,1.0,16.0,1.0,6.0,2.0,2048.0, pre_train,gpt3,175b,h100,fp8,128.0,2048.0,4.0,8.0,1.0,4.0,1.0,6.0,1.0,256.0, pre_train,gpt3,175b,h100,bf16,128.0,2048.0,4.0,8.0,1.0,4.0,1.0,6.0,1.0,256.0, -pre_train,llama31,405b,h100,fp8,576.0,8192.0,8.0,9.0,2.0,4.0,1.0,7.0,1.0,252.0, -pre_train,llama31,405b,h100,bf16,576.0,8192.0,8.0,9.0,2.0,4.0,1.0,7.0,1.0,252.0, +pre_train,llama31,405b,h100,fp8,512.0,8192.0,8.0,8.0,2.0,4.0,1.0,8.0,1.0,128.0, +pre_train,llama31,405b,h100,bf16,512.0,8192.0,8.0,8.0,2.0,4.0,1.0,8.0,1.0,128.0, pre_train,llama3,70b,h100,fp8,64.0,8192.0,4.0,8.0,1.0,2.0,1.0,5.0,1.0,128.0, pre_train,llama3,70b,h100,bf16,64.0,8192.0,4.0,4.0,2.0,2.0,1.0,5.0,1.0,128.0, pre_train,llama3,8b,h100,fp8,8.0,8192.0,1.0,1.0,2.0,4.0,1.0,1.0,1.0,128.0, From 7e51eabc79326f7fa3fbb4c7bd7e13ed46e80859 Mon Sep 17 00:00:00 2001 From: Guyue Huang Date: Wed, 12 Feb 2025 10:29:46 -0800 Subject: [PATCH 09/11] Add a comment Signed-off-by: Guyue Huang --- scripts/llm/performance/finetune_llama3_70b.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scripts/llm/performance/finetune_llama3_70b.py b/scripts/llm/performance/finetune_llama3_70b.py index 7c9cab5c40bb..90b0e73ba4b7 100644 --- a/scripts/llm/performance/finetune_llama3_70b.py +++ 
b/scripts/llm/performance/finetune_llama3_70b.py @@ -52,6 +52,9 @@ def override_recipe_configs( finetuning_scheme = "none" if args.finetuning == "sft" else args.finetuning gpu_type = args.gpu.lower() if gpu_type in ["gb200"] and finetuning_scheme == "lora": + # On GB200 for lora task, we need to enable Cuda Graph for optimal performance. + # However, Cuda Graph increases memory usage, so in order to avoid OOM, we need + # to reduce the sequence length. recipe = finetune_recipe(peft_scheme=finetuning_scheme, performance_mode=True, seq_length=2048) else: recipe = finetune_recipe(peft_scheme=finetuning_scheme, performance_mode=True) From 0f28241cd139539f1802a364e200395d998d7964 Mon Sep 17 00:00:00 2001 From: Guyue Huang Date: Wed, 12 Feb 2025 11:03:06 -0800 Subject: [PATCH 10/11] Change 405B GBS Signed-off-by: Guyue Huang --- .../recommended_model_configs/model_configs_b200.csv | 4 ++-- .../recommended_model_configs/model_configs_h100.csv | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/llm/performance/recommended_model_configs/model_configs_b200.csv b/scripts/llm/performance/recommended_model_configs/model_configs_b200.csv index 9d7b44ff6d3a..a564ffb03494 100644 --- a/scripts/llm/performance/recommended_model_configs/model_configs_b200.csv +++ b/scripts/llm/performance/recommended_model_configs/model_configs_b200.csv @@ -11,8 +11,8 @@ pre_train,llama3,8b,b200,bf16,8.0,8192.0,1.0,1.0,1.0,8.0,1.0,1.0,2.0,128.0, pre_train,llama3,8b,b200,fp8,8.0,8192.0,1.0,1.0,1.0,8.0,1.0,1.0,2.0,128.0, pre_train,llama3,70b,b200,fp8,64.0,8192.0,2.0,4.0,2.0,4.0,1.0,5.0,1.0,128.0, pre_train,llama3,70b,b200,bf16,64.0,8192.0,2.0,4.0,2.0,4.0,1.0,5.0,1.0,128.0, -pre_train,llama31,405b,b200,bf16,512.0,8192.0,4.0,8.0,2.0,8.0,1.0,8.0,1.0,128.0, -pre_train,llama31,405b,b200,fp8,512.0,8192.0,4.0,8.0,2.0,8.0,1.0,8.0,1.0,128.0, +pre_train,llama31,405b,b200,bf16,512.0,8192.0,4.0,8.0,2.0,8.0,1.0,8.0,1.0,256.0, +pre_train,llama31,405b,b200,fp8,512.0,8192.0,4.0,8.0,2.0,8.0,1.0,8.0,1.0,256.0, pre_train,mixtral,8x7b,b200,bf16,64.0,4096.0,1.0,1.0,1.0,64.0,8.0,1.0,2.0,256.0, pre_train,mixtral,8x7b,b200,fp8,64.0,4096.0,1.0,1.0,1.0,64.0,8.0,1.0,2.0,256.0, pre_train,mixtral,8x22b,b200,bf16,256.0,65536.0,2.0,4.0,8.0,4.0,8.0,14.0,1.0,64.0,1.0 diff --git a/scripts/llm/performance/recommended_model_configs/model_configs_h100.csv b/scripts/llm/performance/recommended_model_configs/model_configs_h100.csv index 965d0fc5b8ea..cef1379dc366 100644 --- a/scripts/llm/performance/recommended_model_configs/model_configs_h100.csv +++ b/scripts/llm/performance/recommended_model_configs/model_configs_h100.csv @@ -8,8 +8,8 @@ lora,llama31,405b,h100,fp8,24.0,2048.0,4.0,6.0,1.0,1.0,1.0,7.0,1.0,24.0, pre_train,gpt3,175b,h100,fp8,512.0,2048.0,4.0,8.0,1.0,16.0,1.0,6.0,2.0,2048.0, pre_train,gpt3,175b,h100,fp8,128.0,2048.0,4.0,8.0,1.0,4.0,1.0,6.0,1.0,256.0, pre_train,gpt3,175b,h100,bf16,128.0,2048.0,4.0,8.0,1.0,4.0,1.0,6.0,1.0,256.0, -pre_train,llama31,405b,h100,fp8,512.0,8192.0,8.0,8.0,2.0,4.0,1.0,8.0,1.0,128.0, -pre_train,llama31,405b,h100,bf16,512.0,8192.0,8.0,8.0,2.0,4.0,1.0,8.0,1.0,128.0, +pre_train,llama31,405b,h100,fp8,512.0,8192.0,8.0,8.0,2.0,4.0,1.0,8.0,1.0,256.0, +pre_train,llama31,405b,h100,bf16,512.0,8192.0,8.0,8.0,2.0,4.0,1.0,8.0,1.0,256.0, pre_train,llama3,70b,h100,fp8,64.0,8192.0,4.0,8.0,1.0,2.0,1.0,5.0,1.0,128.0, pre_train,llama3,70b,h100,bf16,64.0,8192.0,4.0,4.0,2.0,2.0,1.0,5.0,1.0,128.0, pre_train,llama3,8b,h100,fp8,8.0,8192.0,1.0,1.0,2.0,4.0,1.0,1.0,1.0,128.0, From 18fa9d425f1e4241e7489ece0ebc4f761ff7501e Mon 
Sep 17 00:00:00 2001
From: Guyue Huang
Date: Wed, 12 Feb 2025 15:00:35 -0800
Subject: [PATCH 11/11] fix gbs for gb200 finetuning

Signed-off-by: Guyue Huang
---
 .../model_configs_gb200.csv | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/scripts/llm/performance/recommended_model_configs/model_configs_gb200.csv b/scripts/llm/performance/recommended_model_configs/model_configs_gb200.csv
index 54a873c0e976..275555b13174 100644
--- a/scripts/llm/performance/recommended_model_configs/model_configs_gb200.csv
+++ b/scripts/llm/performance/recommended_model_configs/model_configs_gb200.csv
@@ -1,11 +1,11 @@
 task,model,size,system,dtype,num_gpus,seq_len,tp_size,pp_size,cp_size,dp_size,ep_size,vp_size,mbs,gbs,etp_size
-lora,llama3,70b,gb200,fp8,8.0,2048.0,1.0,4.0,1.0,2.0,1.0,20.0,1.0,32.0,
-lora,llama3,70b,gb200,bf16,8.0,2048.0,1.0,4.0,1.0,2.0,1.0,20.0,1.0,32.0,
-lora,llama3,8b,gb200,bf16,8.0,4096.0,1.0,1.0,1.0,8.0,1.0,1.0,1.0,8.0,
-lora,llama3,8b,gb200,fp8,8.0,4096.0,1.0,1.0,1.0,8.0,1.0,1.0,1.0,8.0,
+lora,llama3,70b,gb200,fp8,8.0,2048.0,1.0,4.0,1.0,2.0,1.0,20.0,1.0,64.0,
+lora,llama3,70b,gb200,bf16,8.0,2048.0,1.0,4.0,1.0,2.0,1.0,20.0,1.0,64.0,
+lora,llama3,8b,gb200,bf16,8.0,16384.0,1.0,1.0,1.0,8.0,1.0,1.0,1.0,8.0,
+lora,llama3,8b,gb200,fp8,8.0,16384.0,1.0,1.0,1.0,8.0,1.0,1.0,1.0,8.0,
 lora,llama31,405b,gb200,fp8,24.0,2048.0,4.0,6.0,1.0,1.0,1.0,7.0,1.0,24.0,
-lora,llama31,405b,gb200,bf16,24.0,4096.0,4.0,6.0,1.0,1.0,1.0,7.0,1.0,24.0,
+lora,llama31,405b,gb200,bf16,24.0,2048.0,4.0,6.0,1.0,1.0,1.0,7.0,1.0,24.0,
 pre_train,llama3,8b,gb200,bf16,8.0,8192.0,1.0,1.0,1.0,8.0,1.0,1.0,2.0,128.0,
 pre_train,llama3,8b,gb200,fp8,8.0,8192.0,1.0,1.0,1.0,8.0,1.0,1.0,2.0,128.0,
-sft,llama3,8b,gb200,fp8,8.0,4096.0,1.0,1.0,1.0,8.0,1.0,1.0,1.0,8.0,
-sft,llama3,8b,gb200,bf16,8.0,4096.0,1.0,1.0,1.0,8.0,1.0,1.0,1.0,8.0,
+sft,llama3,8b,gb200,fp8,8.0,16384.0,1.0,1.0,1.0,8.0,1.0,1.0,1.0,8.0,
+sft,llama3,8b,gb200,bf16,8.0,16384.0,1.0,1.0,1.0,8.0,1.0,1.0,1.0,8.0,
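
Illustrative sketch (not part of the patch series): patches 05-07 note that megatron.core computes gradient_scaling_factor incorrectly when average_in_collective=True and the expert tensor-parallel size differs from the tensor-parallel size, and they guard against it inline in pretrain_mixtral_8x22b.py. The helper below restates that guard on its own, assuming only a recipe object that exposes trainer.strategy.ddp.average_in_collective as the patched script does; the function name and the factoring into a helper are hypothetical.

from typing import Optional


def maybe_disable_average_in_collective(recipe, tp_size: int, etp_size: Optional[int]):
    """Hypothetical helper restating the workaround from pretrain_mixtral_8x22b.py.

    megatron.core miscalculates gradient_scaling_factor when
    average_in_collective=True and the expert tensor-parallel size differs
    from the tensor-parallel size, so gradient averaging is taken out of the
    collective in that case.
    """
    if etp_size is not None and etp_size != tp_size:
        # Same condition and attribute path as the inline guard in patches 06/07.
        recipe.trainer.strategy.ddp.average_in_collective = False
    return recipe

For every other tp_size/etp_size combination the recipe is returned unchanged, matching the behaviour of the inline version in the patches.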