Add perf model configs gb200 #12140

Merged 13 commits on Feb 14, 2025
scripts/llm/performance/finetune_llama31_405b.py (2 changes: 1 addition & 1 deletion)
@@ -77,7 +77,7 @@ def override_recipe_configs(
recipe.trainer.plugins = bf16_with_fp8_mixed()
recipe.trainer.plugins.grad_reduce_in_fp32 = False

enable_cuda_graph = bool(args.gpu.lower() in [])
enable_cuda_graph = bool(args.gpu.lower() in ["gb200"])
recipe.model.config.enable_cuda_graph = enable_cuda_graph
recipe.trainer.strategy.use_te_rng_tracker = enable_cuda_graph
recipe.data.packed_sequence_specs.pad_cu_seqlens = enable_cuda_graph
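The hunk above is the same pattern repeated across these performance scripts: a set of CUDA-graph-related knobs toggled together from a GPU allowlist. A minimal sketch of that shared pattern, assuming only the recipe attributes visible in the diffs (the helper itself is hypothetical and not part of the PR):

def apply_cuda_graph_settings(recipe, gpu: str, allowed_gpus=("gb200",)) -> None:
    # Enable CUDA graphs only for GPUs on the per-script allowlist.
    enable_cuda_graph = gpu.lower() in allowed_gpus
    # The knobs travel together: CUDA graphs need the TE RNG tracker, and
    # packed-sequence fine-tuning additionally needs padded cu_seqlens.
    recipe.model.config.enable_cuda_graph = enable_cuda_graph
    recipe.trainer.strategy.use_te_rng_tracker = enable_cuda_graph
    recipe.data.packed_sequence_specs.pad_cu_seqlens = enable_cuda_graph

The pretraining script further down sets only the first two of these, since it does not use packed sequences.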
scripts/llm/performance/finetune_llama3_70b.py (7 changes: 6 additions & 1 deletion)
@@ -50,7 +50,12 @@
NOTE: Use fp8 precision training with caution. It might not give desirable results.
"""
finetuning_scheme = "none" if args.finetuning == "sft" else args.finetuning
recipe = finetune_recipe(peft_scheme=finetuning_scheme, performance_mode=True)

Check warning (Code scanning / CodeQL): Variable defined multiple times.
This assignment to 'recipe' is unnecessary as it is redefined before this value is used.
(One way to keep a single assignment is sketched after this file's diff.)
gpu_type = args.gpu.lower()
if gpu_type in ["gb200"] and finetuning_scheme == "lora":
recipe = finetune_recipe(peft_scheme=finetuning_scheme, performance_mode=True, seq_length=2048)
else:
recipe = finetune_recipe(peft_scheme=finetuning_scheme, performance_mode=True)
recipe = set_primary_perf_configs(
recipe,
args.tensorboard,
@@ -77,7 +82,7 @@
recipe.trainer.plugins = bf16_with_fp8_mixed()
recipe.trainer.plugins.grad_reduce_in_fp32 = False

enable_cuda_graph = bool(args.gpu.lower() in [] and finetuning_scheme != "lora")
enable_cuda_graph = bool(args.gpu.lower() in ["gb200"] and finetuning_scheme == "lora")
recipe.model.config.enable_cuda_graph = enable_cuda_graph
recipe.trainer.strategy.use_te_rng_tracker = enable_cuda_graph
recipe.data.packed_sequence_specs.pad_cu_seqlens = enable_cuda_graph
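The CodeQL alert above flags a 'recipe' assignment that is overwritten before it is read. One way to keep a single finetune_recipe call is to build the keyword arguments first. This is only a sketch using the names and values already in this diff, not necessarily how the PR resolved the alert:

recipe_kwargs = {"peft_scheme": finetuning_scheme, "performance_mode": True}
if args.gpu.lower() in ["gb200"] and finetuning_scheme == "lora":
    # shorter sequences for LoRA on GB200, as in the branch above
    recipe_kwargs["seq_length"] = 2048
recipe = finetune_recipe(**recipe_kwargs)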
scripts/llm/performance/finetune_llama3_8b.py (4 changes: 2 additions & 2 deletions)
@@ -52,7 +52,7 @@ def override_recipe_configs(
finetuning_scheme = "none" if args.finetuning == "sft" else args.finetuning

gpu_type = args.gpu.lower()
if gpu_type == "b200":
if gpu_type in ["b200", "gb200"]:
recipe = finetune_recipe(peft_scheme=finetuning_scheme, performance_mode=True, seq_length=16384)
else:
recipe = finetune_recipe(peft_scheme=finetuning_scheme, performance_mode=True)
@@ -82,7 +82,7 @@ def override_recipe_configs(
recipe.trainer.plugins = bf16_with_fp8_mixed()
recipe.trainer.plugins.grad_reduce_in_fp32 = False

enable_cuda_graph = bool(gpu_type in ["b200"])
enable_cuda_graph = bool(gpu_type in ["b200", "gb200"])
recipe.model.config.enable_cuda_graph = enable_cuda_graph
recipe.trainer.strategy.use_te_rng_tracker = enable_cuda_graph
recipe.data.packed_sequence_specs.pad_cu_seqlens = enable_cuda_graph
scripts/llm/performance/pretrain_llama3_8b.py (2 changes: 1 addition & 1 deletion)
@@ -65,7 +65,7 @@ def override_recipe_configs(
recipe.trainer.plugins = bf16_with_fp8_mixed()
recipe.trainer.plugins.grad_reduce_in_fp32 = False

enable_cuda_graph = bool(args.gpu.lower() in [])
enable_cuda_graph = bool(args.gpu.lower() in ["gb200"])
recipe.model.config.enable_cuda_graph = enable_cuda_graph
recipe.trainer.strategy.use_te_rng_tracker = enable_cuda_graph

scripts/llm/performance/pretrain_mixtral_8x22b.py (4 changes: 4 additions & 0 deletions)
@@ -66,6 +66,10 @@ def override_recipe_configs(
recipe.trainer.plugins = bf16_with_fp8_mixed()
recipe.trainer.plugins.grad_reduce_in_fp32 = False

# To mitigate the incorrect gradient_scaling_factor calculation in megatron.core
# when average_in_collective=True and etp_size > 1, disable average_in_collective.
recipe.trainer.strategy.ddp.average_in_collective = False

return recipe


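The comment above hinges on the difference between averaging inside the collective and summing in the collective followed by explicit scaling. The following standalone illustration of the two reduction modes uses plain torch.distributed (it assumes an already initialized NCCL process group and is not megatron.core code):

import torch
import torch.distributed as dist

def reduce_gradient(grad: torch.Tensor, average_in_collective: bool, dp_size: int) -> torch.Tensor:
    if average_in_collective:
        # One fused op: the collective itself averages across data-parallel ranks.
        dist.all_reduce(grad, op=dist.ReduceOp.AVG)
    else:
        # Sum in the collective, then apply the gradient scaling factor explicitly.
        # This is the path the recipe falls back to with average_in_collective=False.
        dist.all_reduce(grad, op=dist.ReduceOp.SUM)
        grad.mul_(1.0 / dp_size)
    return grad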
@@ -0,0 +1,11 @@
task,model,size,system,dtype,num_gpus,seq_len,tp_size,pp_size,cp_size,dp_size,ep_size,etp_size,vp_size,mbs,gbs
lora,llama3,70b,gb200,fp8,8.0,2048.0,1.0,4.0,1.0,2.0,1.0,,20.0,1.0,32.0
lora,llama3,70b,gb200,bf16,8.0,2048.0,1.0,4.0,1.0,2.0,1.0,,20.0,1.0,32.0
lora,llama3,8b,gb200,bf16,8.0,4096.0,1.0,1.0,1.0,8.0,1.0,,1.0,1.0,8.0
lora,llama3,8b,gb200,fp8,8.0,4096.0,1.0,1.0,1.0,8.0,1.0,,1.0,1.0,8.0
lora,llama31,405b,gb200,fp8,24.0,2048.0,4.0,6.0,1.0,1.0,1.0,,7.0,1.0,24.0
lora,llama31,405b,gb200,bf16,24.0,4096.0,4.0,6.0,1.0,1.0,1.0,,7.0,1.0,24.0
pre_train,llama3,8b,gb200,bf16,8.0,8192.0,1.0,1.0,1.0,8.0,1.0,,1.0,2.0,128.0
pre_train,llama3,8b,gb200,fp8,8.0,8192.0,1.0,1.0,1.0,8.0,1.0,,1.0,2.0,128.0
sft,llama3,8b,gb200,fp8,8.0,4096.0,1.0,1.0,1.0,8.0,1.0,,1.0,1.0,8.0
sft,llama3,8b,gb200,bf16,8.0,4096.0,1.0,1.0,1.0,8.0,1.0,,1.0,1.0,8.0
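The new table maps each task/model/size/system/dtype combination on GB200 to its parallelism and batch settings. How the repository consumes this file is not shown in the diff; the reader below is a hypothetical sketch, assuming only the column names above:

import csv

def load_perf_configs(path: str) -> list[dict]:
    # Parse the CSV and coerce the float-formatted numeric columns ("8.0") to ints,
    # leaving empty cells (e.g. etp_size) as None.
    with open(path, newline="") as f:
        rows = list(csv.DictReader(f))
    int_cols = ["num_gpus", "seq_len", "tp_size", "pp_size", "cp_size", "dp_size",
                "ep_size", "etp_size", "vp_size", "mbs", "gbs"]
    for row in rows:
        for col in int_cols:
            row[col] = int(float(row[col])) if row.get(col) else None
    return rows

def find_config(rows: list[dict], task: str, model: str, size: str, system: str, dtype: str):
    # Return the first row matching the lookup key, or None if absent.
    for row in rows:
        if (row["task"], row["model"], row["size"], row["system"], row["dtype"]) == \
           (task, model, size, system, dtype):
            return row
    return None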