[DPO] add reference log-prob outputs in DPO (#521)

## Summary

Since DPO uses a reference model, we also need to return the reference log-probs in DPO.

## Testing Done

- Hardware Type: <BLANK>
- [x] run `make test` to ensure correctness
- [x] run `make checkstyle` to ensure code style
- [ ] run `make test-convergence` to ensure convergence

---------

Co-authored-by: Shao Tang <[email protected]>
linkedin · Jan 30, 2025 · eed8af3 · eed8af3
1 parent aa2d23d
commit eed8af3
Show file tree

Hide file tree

Showing 2 changed files with 9 additions and 3 deletions.
diff --git a/src/liger_kernel/chunked_loss/dpo_loss.py b/src/liger_kernel/chunked_loss/dpo_loss.py
@@ -45,9 +45,12 @@ def preference_loss_fn(
         chosen_logratios = chosen_logps - ref_chosen_logps
         rejected_logratios = rejected_logps - ref_rejected_logps
 
+        chosen_rewards = beta * (chosen_logps - ref_chosen_logps)
+        rejected_rewards = beta * (rejected_logps - ref_rejected_logps)
+
         logits_diff = beta * (chosen_logratios - rejected_logratios)
         loss = -F.logsigmoid(logits_diff).sum() / (full_target.shape[0] // 2)
-        return loss
+        return loss, chosen_rewards, rejected_rewards
 
     @staticmethod
     def forward(
@@ -99,7 +102,7 @@ def __init__(
         beta: float = 0.1,
         compute_nll_loss: bool = False,
         compiled: bool = True,
-        use_ref_model: bool = False,
+        use_ref_model: bool = True,
     ):
         """
         Args:

diff --git a/test/chunked_loss/test_dpo_loss.py b/test/chunked_loss/test_dpo_loss.py
@@ -56,9 +56,12 @@ def alignment_loss(
         chosen_logratios = policy_chosen_logps - ref_chosen_logps
         rejected_logratios = policy_rejected_logps - ref_rejected_logps
 
+        chosen_rewards = self.beta * (policy_chosen_logps - ref_chosen_logps)   
+        rejected_rewards = self.beta * (policy_rejected_logps - ref_rejected_logps)
+
         logits_diff = self.beta * (chosen_logratios - rejected_logratios)
         losses = -F.logsigmoid(logits_diff)
-        return losses
+        return losses, chosen_rewards, rejected_rewards
 
 
 class TorchLMHeadDPO(torch.nn.Module):