NVIDIA · pablo-garay · Feb 14, 2025 · Feb 13, 2025 · Feb 13, 2025 · Feb 13, 2025
diff --git a/nemo/lightning/pytorch/strategies/megatron_strategy.py b/nemo/lightning/pytorch/strategies/megatron_strategy.py
@@ -60,7 +60,7 @@
 
 from nemo.core.optim.mcore_optim import McoreDistributedOptimizer
 from nemo.lightning import _strategy_lib, io
-from nemo.lightning.megatron_parallel import CallbackConnector, MegatronParallel, aggregate_moe_loss_stats
+from nemo.lightning.megatron_parallel import CallbackConnector, MegatronParallel
 from nemo.lightning.pytorch.callbacks import ModelTransform
 from nemo.lightning.pytorch.strategies.utils import (
     RestoreConfig,
@@ -628,9 +628,11 @@
                     "reduced_train_loss", reduced_train_loss, prog_bar=True, batch_size=1, sync_dist=False
                 )
                 # Log any MoE losses.
+                # NOTE(@akoumparouli): MoE loss logging is disabled for now because it hangs with DeepSeek.
                 # TODO(@akoumparouli): loss_scale depends on the GBS.
-                for loss_name, loss_value in aggregate_moe_loss_stats(loss_scale=1.0).items():
-                    self.lightning_module.log(loss_name, loss_value, prog_bar=True, rank_zero_only=True, batch_size=1)
+            # for loss_name, loss_value in aggregate_moe_loss_stats(loss_scale=1.0).items():
+            #    self.lightning_module.log(
+            #    loss_name, loss_value, prog_bar=True, rank_zero_only=True, batch_size=1)
@@ -632,5 +632,5 @@
                # TODO(@akoumparouli): loss_scale depends on the GBS.
-            # for loss_name, loss_value in aggregate_moe_loss_stats(loss_scale=1.0).items():
-            #    self.lightning_module.log(
-            #    loss_name, loss_value, prog_bar=True, rank_zero_only=True, batch_size=1)
+
+
+
@@ -632,5 +632,5 @@
                # TODO(@akoumparouli): loss_scale depends on the GBS.
-            # for loss_name, loss_value in aggregate_moe_loss_stats(loss_scale=1.0).items():
-            #    self.lightning_module.log(
-            #    loss_name, loss_value, prog_bar=True, rank_zero_only=True, batch_size=1)
+
+
+

 
             return out