Add perf model configs gb200 (#12140)
* Fix a bug in the generation of expert_tensor_parallel_rank

Signed-off-by: Guyue Huang <[email protected]>

* Fix pylint

Signed-off-by: Guyue Huang <[email protected]>

* Support perf scripts with GB200

Signed-off-by: Guyue Huang <[email protected]>

* Fix

Signed-off-by: Guyue Huang <[email protected]>

* Mixtral 8x22b recipe change to mitigate a megatron.core bug

Signed-off-by: Guyue Huang <[email protected]>

* Make column names of all CSVs align; fix for 8x22b

Signed-off-by: Guyue Huang <[email protected]>

* Typing fix

Signed-off-by: Guyue Huang <[email protected]>

* Adjust 405B config due to asymmetric PP

Signed-off-by: Guyue Huang <[email protected]>

* Add a comment

Signed-off-by: Guyue Huang <[email protected]>

* Change 405B GBS

Signed-off-by: Guyue Huang <[email protected]>

* Fix GBS for GB200 finetuning

Signed-off-by: Guyue Huang <[email protected]>

---------

Signed-off-by: Guyue Huang <[email protected]>
guyueh1 authored Feb 14, 2025
1 parent 245f97c commit 7a00886
Showing 9 changed files with 71 additions and 47 deletions.
scripts/llm/performance/finetune_llama31_405b.py (2 changes: 1 addition & 1 deletion)
@@ -77,7 +77,7 @@ def override_recipe_configs(
recipe.trainer.plugins = bf16_with_fp8_mixed()
recipe.trainer.plugins.grad_reduce_in_fp32 = False

- enable_cuda_graph = bool(args.gpu.lower() in [])
+ enable_cuda_graph = bool(args.gpu.lower() in ["gb200"])
recipe.model.config.enable_cuda_graph = enable_cuda_graph
recipe.trainer.strategy.use_te_rng_tracker = enable_cuda_graph
recipe.data.packed_sequence_specs.pad_cu_seqlens = enable_cuda_graph
scripts/llm/performance/finetune_llama3_70b.py (11 changes: 9 additions & 2 deletions)
@@ -50,7 +50,14 @@ def override_recipe_configs(
NOTE: Use fp8 precision training with caution. It might not give desirable results.
"""
finetuning_scheme = "none" if args.finetuning == "sft" else args.finetuning
- recipe = finetune_recipe(peft_scheme=finetuning_scheme, performance_mode=True)
+ gpu_type = args.gpu.lower()
+ if gpu_type in ["gb200"] and finetuning_scheme == "lora":
+     # On GB200, for the LoRA task, we need to enable CUDA Graph for optimal performance.
+     # However, CUDA Graph increases memory usage, so to avoid OOM we need to reduce
+     # the sequence length.
+     recipe = finetune_recipe(peft_scheme=finetuning_scheme, performance_mode=True, seq_length=2048)
+ else:
+     recipe = finetune_recipe(peft_scheme=finetuning_scheme, performance_mode=True)
recipe = set_primary_perf_configs(
recipe,
args.tensorboard,
@@ -77,7 +84,7 @@ def override_recipe_configs(
recipe.trainer.plugins = bf16_with_fp8_mixed()
recipe.trainer.plugins.grad_reduce_in_fp32 = False

- enable_cuda_graph = bool(args.gpu.lower() in [] and finetuning_scheme != "lora")
+ enable_cuda_graph = bool(args.gpu.lower() in ["gb200"] and finetuning_scheme == "lora")
recipe.model.config.enable_cuda_graph = enable_cuda_graph
recipe.trainer.strategy.use_te_rng_tracker = enable_cuda_graph
recipe.data.packed_sequence_specs.pad_cu_seqlens = enable_cuda_graph
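The fine-tuning scripts in this commit toggle the same three recipe settings from a single enable_cuda_graph flag (the pretraining hunk further below touches only the first two). If you mirror these overrides in your own script, a small helper keeps them in sync. This is a hypothetical sketch for illustration, not code from this commit:

    def apply_cuda_graph_overrides(recipe, enabled: bool):
        # Toggle together the settings the perf scripts flip when CUDA graphs are enabled.
        recipe.model.config.enable_cuda_graph = enabled
        recipe.trainer.strategy.use_te_rng_tracker = enabled
        recipe.data.packed_sequence_specs.pad_cu_seqlens = enabled
        return recipe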
scripts/llm/performance/finetune_llama3_8b.py (4 changes: 2 additions & 2 deletions)
@@ -52,7 +52,7 @@ def override_recipe_configs(
finetuning_scheme = "none" if args.finetuning == "sft" else args.finetuning

gpu_type = args.gpu.lower()
if gpu_type == "b200":
if gpu_type in ["b200", "gb200"]:
recipe = finetune_recipe(peft_scheme=finetuning_scheme, performance_mode=True, seq_length=16384)
else:
recipe = finetune_recipe(peft_scheme=finetuning_scheme, performance_mode=True)
@@ -82,7 +82,7 @@ def override_recipe_configs(
recipe.trainer.plugins = bf16_with_fp8_mixed()
recipe.trainer.plugins.grad_reduce_in_fp32 = False

- enable_cuda_graph = bool(gpu_type in ["b200"])
+ enable_cuda_graph = bool(gpu_type in ["b200", "gb200"])
recipe.model.config.enable_cuda_graph = enable_cuda_graph
recipe.trainer.strategy.use_te_rng_tracker = enable_cuda_graph
recipe.data.packed_sequence_specs.pad_cu_seqlens = enable_cuda_graph
scripts/llm/performance/pretrain_llama3_8b.py (2 changes: 1 addition & 1 deletion)
@@ -65,7 +65,7 @@ def override_recipe_configs(
recipe.trainer.plugins = bf16_with_fp8_mixed()
recipe.trainer.plugins.grad_reduce_in_fp32 = False

- enable_cuda_graph = bool(args.gpu.lower() in [])
+ enable_cuda_graph = bool(args.gpu.lower() in ["gb200"])
recipe.model.config.enable_cuda_graph = enable_cuda_graph
recipe.trainer.strategy.use_te_rng_tracker = enable_cuda_graph

scripts/llm/performance/pretrain_mixtral_8x22b.py (8 changes: 7 additions & 1 deletion)
@@ -13,6 +13,7 @@
# limitations under the License.

from os.path import basename, splitext
+ from typing import Optional

import nemo_run as run
from argument_parser import parse_cli_args
@@ -33,7 +34,7 @@ def override_recipe_configs(
cp_size: int,
vp_size: int,
ep_size: int,
- etp_size: int,
+ etp_size: Optional[int],
):
"""
mixtral 8x22b pre-train recipe aimed at achieving best possible performance.
@@ -66,6 +67,11 @@
recipe.trainer.plugins = bf16_with_fp8_mixed()
recipe.trainer.plugins.grad_reduce_in_fp32 = False

+ # To mitigate the incorrect gradient_scaling_factor calculation in megatron.core
+ # under the scenario average_in_collective=True and tp_size != etp_size, disable average_in_collective.
+ if etp_size is not None and etp_size != tp_size:
+     recipe.trainer.strategy.ddp.average_in_collective = False

return recipe
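For reference, the mixtral 8x22b rows in the recommended-config tables below use tp_size=2 with etp_size=1, so this new guard does fire for that recipe. A small illustrative check with those values (editorial sketch, not part of the diff):

    # Values taken from the mixtral 8x22b rows (256 GPUs) in the b200 config CSV below.
    tp_size, ep_size, etp_size = 2, 8, 1
    # tp_size != etp_size, so the workaround above disables average_in_collective.
    assert etp_size is not None and etp_size != tp_size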


@@ -1,27 +1,27 @@
- task,model,size,system,dtype,num_gpus,seq_len,tp_size,pp_size,cp_size,dp_size,ep_size,etp_size,vp_size,mbs,gbs,etp_size
- lora,llama3,70b,b200,fp8,8.0,4096.0,1.0,4.0,1.0,2.0,1.0,,20.0,1.0,32.0,
- lora,llama3,70b,b200,bf16,8.0,4096.0,1.0,4.0,1.0,2.0,1.0,,20.0,1.0,32.0,
- lora,llama3,8b,b200,bf16,8.0,4096.0,1.0,1.0,1.0,8.0,1.0,,1.0,1.0,8.0,
- lora,llama3,8b,b200,fp8,8.0,4096.0,1.0,1.0,1.0,8.0,1.0,,1.0,1.0,8.0,
- lora,llama31,405b,b200,fp8,24.0,2048.0,4.0,6.0,1.0,1.0,1.0,,7.0,1.0,24.0,
- lora,llama31,405b,b200,bf16,24.0,4096.0,4.0,6.0,1.0,1.0,1.0,,7.0,1.0,24.0,
- pre_train,gpt3,175b,b200,bf16,128.0,2048.0,4.0,4.0,1.0,8.0,1.0,,12.0,2.0,256.0,
- pre_train,gpt3,175b,b200,fp8,128.0,2048.0,4.0,4.0,1.0,8.0,1.0,,12.0,2.0,256.0,
- pre_train,llama3,8b,b200,bf16,8.0,8192.0,1.0,1.0,1.0,8.0,1.0,,1.0,2.0,128.0,
- pre_train,llama3,8b,b200,fp8,8.0,8192.0,1.0,1.0,1.0,8.0,1.0,,1.0,2.0,128.0,
- pre_train,llama3,70b,b200,fp8,64.0,8192.0,2.0,4.0,2.0,4.0,1.0,,5.0,1.0,128.0,
- pre_train,llama3,70b,b200,bf16,64.0,8192.0,2.0,4.0,2.0,4.0,1.0,,5.0,1.0,128.0,
- pre_train,llama31,405b,b200,bf16,144.0,8192.0,4.0,9.0,2.0,2.0,1.0,,7.0,1.0,36.0,
- pre_train,llama31,405b,b200,fp8,144.0,8192.0,4.0,9.0,2.0,2.0,1.0,,7.0,1.0,36.0,
- pre_train,mixtral,8x7b,b200,bf16,64.0,4096.0,1.0,1.0,1.0,64.0,8.0,,1.0,2.0,256.0,
- pre_train,mixtral,8x7b,b200,fp8,64.0,4096.0,1.0,1.0,1.0,64.0,8.0,,1.0,2.0,256.0,
- pre_train,mixtral,8x22b,b200,bf16,256.0,65536.0,2.0,4.0,8.0,4.0,8.0,1.0,14.0,1.0,64.0,1.0
- pre_train,mixtral,8x22b,b200,fp8,256.0,65536.0,2.0,4.0,8.0,4.0,8.0,1.0,14.0,1.0,64.0,1.0
- pre_train,nemotron4,15b,b200,bf16,64.0,4096.0,1.0,1.0,1.0,32.0,1.0,,1.0,2.0,256.0,
- pre_train,nemotron4,340b,b200,bf16,128.0,4096.0,8.0,4.0,1.0,4.0,1.0,,12.0,1.0,32.0,
- pre_train,nemotron4,340b,b200,fp8,128.0,4096.0,8.0,4.0,1.0,4.0,1.0,,12.0,1.0,32.0,
- pre_train,nemotron4,15b,b200,fp8,64.0,4096.0,1.0,1.0,1.0,32.0,1.0,,1.0,2.0,256.0,
- sft,llama3,8b,b200,fp8,8.0,4096.0,1.0,1.0,1.0,8.0,1.0,,1.0,1.0,8.0,
- sft,llama3,70b,b200,bf16,32.0,4096.0,2.0,4.0,1.0,4.0,1.0,,5.0,1.0,32.0,
- sft,llama3,70b,b200,fp8,32.0,4096.0,2.0,4.0,1.0,4.0,1.0,,5.0,1.0,32.0,
- sft,llama3,8b,b200,bf16,8.0,4096.0,1.0,1.0,1.0,8.0,1.0,,1.0,1.0,8.0,
+ task,model,size,system,dtype,num_gpus,seq_len,tp_size,pp_size,cp_size,dp_size,ep_size,vp_size,mbs,gbs,etp_size
+ lora,llama3,70b,b200,fp8,8.0,4096.0,1.0,4.0,1.0,2.0,1.0,20.0,1.0,32.0,
+ lora,llama3,70b,b200,bf16,8.0,4096.0,1.0,4.0,1.0,2.0,1.0,20.0,1.0,32.0,
+ lora,llama3,8b,b200,bf16,8.0,4096.0,1.0,1.0,1.0,8.0,1.0,1.0,1.0,8.0,
+ lora,llama3,8b,b200,fp8,8.0,4096.0,1.0,1.0,1.0,8.0,1.0,1.0,1.0,8.0,
+ lora,llama31,405b,b200,fp8,24.0,2048.0,4.0,6.0,1.0,1.0,1.0,7.0,1.0,24.0,
+ lora,llama31,405b,b200,bf16,24.0,4096.0,4.0,6.0,1.0,1.0,1.0,7.0,1.0,24.0,
+ pre_train,gpt3,175b,b200,bf16,128.0,2048.0,4.0,4.0,1.0,8.0,1.0,12.0,2.0,256.0,
+ pre_train,gpt3,175b,b200,fp8,128.0,2048.0,4.0,4.0,1.0,8.0,1.0,12.0,2.0,256.0,
+ pre_train,llama3,8b,b200,bf16,8.0,8192.0,1.0,1.0,1.0,8.0,1.0,1.0,2.0,128.0,
+ pre_train,llama3,8b,b200,fp8,8.0,8192.0,1.0,1.0,1.0,8.0,1.0,1.0,2.0,128.0,
+ pre_train,llama3,70b,b200,fp8,64.0,8192.0,2.0,4.0,2.0,4.0,1.0,5.0,1.0,128.0,
+ pre_train,llama3,70b,b200,bf16,64.0,8192.0,2.0,4.0,2.0,4.0,1.0,5.0,1.0,128.0,
+ pre_train,llama31,405b,b200,bf16,512.0,8192.0,4.0,8.0,2.0,8.0,1.0,8.0,1.0,256.0,
+ pre_train,llama31,405b,b200,fp8,512.0,8192.0,4.0,8.0,2.0,8.0,1.0,8.0,1.0,256.0,
+ pre_train,mixtral,8x7b,b200,bf16,64.0,4096.0,1.0,1.0,1.0,64.0,8.0,1.0,2.0,256.0,
+ pre_train,mixtral,8x7b,b200,fp8,64.0,4096.0,1.0,1.0,1.0,64.0,8.0,1.0,2.0,256.0,
+ pre_train,mixtral,8x22b,b200,bf16,256.0,65536.0,2.0,4.0,8.0,4.0,8.0,14.0,1.0,64.0,1.0
+ pre_train,mixtral,8x22b,b200,fp8,256.0,65536.0,2.0,4.0,8.0,4.0,8.0,14.0,1.0,64.0,1.0
+ pre_train,nemotron4,15b,b200,bf16,64.0,4096.0,1.0,1.0,1.0,32.0,1.0,1.0,2.0,256.0,
+ pre_train,nemotron4,340b,b200,bf16,128.0,4096.0,8.0,4.0,1.0,4.0,1.0,12.0,1.0,32.0,
+ pre_train,nemotron4,340b,b200,fp8,128.0,4096.0,8.0,4.0,1.0,4.0,1.0,12.0,1.0,32.0,
+ pre_train,nemotron4,15b,b200,fp8,64.0,4096.0,1.0,1.0,1.0,32.0,1.0,1.0,2.0,256.0,
+ sft,llama3,8b,b200,fp8,8.0,4096.0,1.0,1.0,1.0,8.0,1.0,1.0,1.0,8.0,
+ sft,llama3,70b,b200,bf16,32.0,4096.0,2.0,4.0,1.0,4.0,1.0,5.0,1.0,32.0,
+ sft,llama3,70b,b200,fp8,32.0,4096.0,2.0,4.0,1.0,4.0,1.0,5.0,1.0,32.0,
+ sft,llama3,8b,b200,bf16,8.0,4096.0,1.0,1.0,1.0,8.0,1.0,1.0,1.0,8.0,
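After this change the recommended-config files share one header layout, so rows can be picked programmatically outside the perf scripts. A minimal sketch using only the standard library; the file path and helper name are hypothetical, not part of this commit:

    import csv

    def pick_config(csv_path, task, model, size, system, dtype):
        # Return the parallelism/batch settings of the first row matching the given keys.
        keys = ("task", "model", "size", "system", "dtype")
        with open(csv_path, newline="") as f:
            for row in csv.DictReader(f):
                if tuple(row[k] for k in keys) == (task, model, size, system, dtype):
                    return {k: (float(v) if v else None) for k, v in row.items() if k not in keys}
        return None

    # Example against the gb200 table below (hypothetical path):
    # pick_config("recommended_model_configs_gb200.csv", "sft", "llama3", "8b", "gb200", "bf16")
    # -> {"num_gpus": 8.0, "seq_len": 16384.0, "tp_size": 1.0, ..., "etp_size": None}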
@@ -0,0 +1,11 @@
+ task,model,size,system,dtype,num_gpus,seq_len,tp_size,pp_size,cp_size,dp_size,ep_size,vp_size,mbs,gbs,etp_size
+ lora,llama3,70b,gb200,fp8,8.0,2048.0,1.0,4.0,1.0,2.0,1.0,20.0,1.0,64.0,
+ lora,llama3,70b,gb200,bf16,8.0,2048.0,1.0,4.0,1.0,2.0,1.0,20.0,1.0,64.0,
+ lora,llama3,8b,gb200,bf16,8.0,16384.0,1.0,1.0,1.0,8.0,1.0,1.0,1.0,8.0,
+ lora,llama3,8b,gb200,fp8,8.0,16384.0,1.0,1.0,1.0,8.0,1.0,1.0,1.0,8.0,
+ lora,llama31,405b,gb200,fp8,24.0,2048.0,4.0,6.0,1.0,1.0,1.0,7.0,1.0,24.0,
+ lora,llama31,405b,gb200,bf16,24.0,2048.0,4.0,6.0,1.0,1.0,1.0,7.0,1.0,24.0,
+ pre_train,llama3,8b,gb200,bf16,8.0,8192.0,1.0,1.0,1.0,8.0,1.0,1.0,2.0,128.0,
+ pre_train,llama3,8b,gb200,fp8,8.0,8192.0,1.0,1.0,1.0,8.0,1.0,1.0,2.0,128.0,
+ sft,llama3,8b,gb200,fp8,8.0,16384.0,1.0,1.0,1.0,8.0,1.0,1.0,1.0,8.0,
+ sft,llama3,8b,gb200,bf16,8.0,16384.0,1.0,1.0,1.0,8.0,1.0,1.0,1.0,8.0,
@@ -8,8 +8,8 @@ lora,llama31,405b,h100,fp8,24.0,2048.0,4.0,6.0,1.0,1.0,1.0,7.0,1.0,24.0,
pre_train,gpt3,175b,h100,fp8,512.0,2048.0,4.0,8.0,1.0,16.0,1.0,6.0,2.0,2048.0,
pre_train,gpt3,175b,h100,fp8,128.0,2048.0,4.0,8.0,1.0,4.0,1.0,6.0,1.0,256.0,
pre_train,gpt3,175b,h100,bf16,128.0,2048.0,4.0,8.0,1.0,4.0,1.0,6.0,1.0,256.0,
- pre_train,llama31,405b,h100,fp8,576.0,8192.0,8.0,9.0,2.0,4.0,1.0,7.0,1.0,252.0,
- pre_train,llama31,405b,h100,bf16,576.0,8192.0,8.0,9.0,2.0,4.0,1.0,7.0,1.0,252.0,
+ pre_train,llama31,405b,h100,fp8,512.0,8192.0,8.0,8.0,2.0,4.0,1.0,8.0,1.0,256.0,
+ pre_train,llama31,405b,h100,bf16,512.0,8192.0,8.0,8.0,2.0,4.0,1.0,8.0,1.0,256.0,
pre_train,llama3,70b,h100,fp8,64.0,8192.0,4.0,8.0,1.0,2.0,1.0,5.0,1.0,128.0,
pre_train,llama3,70b,h100,bf16,64.0,8192.0,4.0,4.0,2.0,2.0,1.0,5.0,1.0,128.0,
pre_train,llama3,8b,h100,fp8,8.0,8192.0,1.0,1.0,2.0,4.0,1.0,1.0,1.0,128.0,
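The 405B changes above match the commit notes 'Adjust 405B config due to asymmetric PP' and 'Change 405B GBS': the pipeline drops from 9 to 8 stages and the global batch size moves from 252 to 256. For these dense-model rows the usual identity num_gpus = tp_size * pp_size * cp_size * dp_size holds, and gbs must be divisible by dp_size * mbs; a quick worked check (editorial, not part of the commit):

    # (tp, pp, cp, dp, mbs, gbs, num_gpus) for the old (pp=9) and new (pp=8) 405B h100 rows.
    for tp, pp, cp, dp, mbs, gbs, gpus in [(8, 9, 2, 4, 1, 252, 576), (8, 8, 2, 4, 1, 256, 512)]:
        assert tp * pp * cp * dp == gpus  # 8*9*2*4 = 576, 8*8*2*4 = 512
        assert gbs % (dp * mbs) == 0      # 252 and 256 are both multiples of 4

The new b200 405B rows follow the same pattern: 4 * 8 * 2 * 8 = 512 GPUs with gbs 256.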
@@ -1,11 +1,11 @@
- task,model,size,system,dtype,num_gpus,seq_len,tp_size,pp_size,cp_size,dp_size,ep_size,vp_size,mbs,gbs
- pre_train,nemotron4,15b,h100,fp8,128.0,4096.0,4.0,1.0,1.0,32.0,1.0,1.0,4.0,1024.0
- pre_train,nemotron4,15b,h100,fp8,256.0,4096.0,4.0,1.0,1.0,64.0,1.0,1.0,4.0,1024.0
- pre_train,nemotron4,15b,h100,fp8,64.0,4096.0,4.0,1.0,1.0,16.0,1.0,1.0,4.0,1024.0
- pre_train,nemotron4,15b,h100,fp8,16.0,4096.0,4.0,1.0,1.0,4.0,1.0,1.0,4.0,1024.0
- pre_train,nemotron4,15b,h100,fp8,512.0,4096.0,4.0,1.0,1.0,128.0,1.0,1.0,4.0,1024.0
- pre_train,nemotron4,15b,h100,fp8,1024.0,4096.0,4.0,1.0,1.0,256.0,1.0,1.0,4.0,1024.0
- pre_train,nemotron4,340b,h100,fp8,128.0,4096.0,8.0,8.0,1.0,2.0,1.0,12.0,1.0,512.0
- pre_train,nemotron4,340b,h100,fp8,256.0,4096.0,8.0,8.0,1.0,4.0,1.0,12.0,1.0,512.0
- pre_train,nemotron4,340b,h100,fp8,512.0,4096.0,8.0,8.0,1.0,8.0,1.0,12.0,1.0,512.0
- pre_train,nemotron4,340b,h100,fp8,1024.0,4096.0,8.0,8.0,1.0,16.0,1.0,12.0,1.0,512.0
+ task,model,size,system,dtype,num_gpus,seq_len,tp_size,pp_size,cp_size,dp_size,ep_size,vp_size,mbs,gbs,etp_size
+ pre_train,nemotron4,15b,h100,fp8,128.0,4096.0,4.0,1.0,1.0,32.0,1.0,1.0,4.0,1024.0,
+ pre_train,nemotron4,15b,h100,fp8,256.0,4096.0,4.0,1.0,1.0,64.0,1.0,1.0,4.0,1024.0,
+ pre_train,nemotron4,15b,h100,fp8,64.0,4096.0,4.0,1.0,1.0,16.0,1.0,1.0,4.0,1024.0,
+ pre_train,nemotron4,15b,h100,fp8,16.0,4096.0,4.0,1.0,1.0,4.0,1.0,1.0,4.0,1024.0,
+ pre_train,nemotron4,15b,h100,fp8,512.0,4096.0,4.0,1.0,1.0,128.0,1.0,1.0,4.0,1024.0,
+ pre_train,nemotron4,15b,h100,fp8,1024.0,4096.0,4.0,1.0,1.0,256.0,1.0,1.0,4.0,1024.0,
+ pre_train,nemotron4,340b,h100,fp8,128.0,4096.0,8.0,8.0,1.0,2.0,1.0,12.0,1.0,512.0,
+ pre_train,nemotron4,340b,h100,fp8,256.0,4096.0,8.0,8.0,1.0,4.0,1.0,12.0,1.0,512.0,
+ pre_train,nemotron4,340b,h100,fp8,512.0,4096.0,8.0,8.0,1.0,8.0,1.0,12.0,1.0,512.0,
+ pre_train,nemotron4,340b,h100,fp8,1024.0,4096.0,8.0,8.0,1.0,16.0,1.0,12.0,1.0,512.0,
