
Commit d0a98cb

EnayatUllah authored and facebook-github-bot committed
Fix Fast Gradient Clipping bias gradient calculation for three dim data (meta-pytorch#751)
Summary: The bias gradient calculation for three-dimensional data was incorrect. Let `G = g g^T`, where `g`, of dimensions `T x d`, is the per-sample activation gradient, with `T` the number of tokens and `d` the dimension. The squared per-sample gradient norm with respect to the bias is `vec(G)^T vec(1)` (the sum of all entries of `G`), not the erroneous `vec(G)^T vec(G)` computed before. This diff fixes it.

Reviewed By: HuanyuZhang

Differential Revision: D70823094
1 parent 8cbf8e0 commit d0a98cb
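A minimal numeric sketch of the summary's claim (not part of the commit; the shapes `n`, `T`, `d` are arbitrary): the corrected quantity `vec(G)^T vec(1)` equals the squared norm of the actual per-sample bias gradient `sum_t g_t`, whereas the old expression `vec(G)^T vec(G)` is the squared Frobenius norm of `G`.

```python
import torch

# Illustrative check only; not part of the commit.
n, T, d = 4, 3, 5                         # batch size, tokens, dimension (arbitrary)
backprops = torch.randn(n, T, d)          # stand-in for per-sample activation gradients g

ggT = torch.einsum("nik,njk->nij", backprops, backprops)          # batchwise G = g g^T
fixed = torch.sqrt(torch.einsum("n...i->n", ggT).clamp(min=0))    # sqrt(vec(G)^T vec(1)), the fix
old = torch.sqrt(torch.einsum("n...i,n...i->n", ggT, ggT))        # sqrt(vec(G)^T vec(G)), the bug

# Reference: norm of the true per-sample bias gradient, i.e. g summed over tokens.
reference = backprops.sum(dim=1).norm(dim=1)

print(torch.allclose(fixed, reference, atol=1e-5))  # True: the fix matches the reference
print(torch.allclose(old, reference, atol=1e-5))    # generally False: ||G||_F is a different quantity
```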

File tree: 2 files changed (+23, −26 lines)


opacus/grad_sample/linear.py

Lines changed: 2 additions & 3 deletions
@@ -83,7 +83,6 @@ def compute_linear_norm_sample(
 
         ret[layer.weight] = torch.sqrt(ga)
         if layer.bias is not None and layer.bias.requires_grad:
-            ggT = torch.einsum("nik,njk->nij", backprops, backprops)
-            gg = torch.einsum("n...i,n...i->n", ggT, ggT).clamp(min=0)
-            ret[layer.bias] = torch.sqrt(gg)
+            ggT = torch.einsum("nik,njk->nij", backprops, backprops)  # batchwise g g^T
+            ret[layer.bias] = torch.sqrt(torch.einsum("n...i->n", ggT).clamp(min=0))
     return ret
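As a cross-check of the fixed hunk above, the sketch below (illustrative only; the layer, loss, and shapes are assumptions) computes per-sample bias gradients for an `nn.Linear` on three-dimensional input with an explicit per-sample autograd loop and compares their norms against the `ggT`-based formula.

```python
import torch
import torch.nn as nn

torch.manual_seed(0)
n, T, d_in, d_out = 4, 6, 3, 5            # arbitrary shapes for the check
layer = nn.Linear(d_in, d_out)
x = torch.randn(n, T, d_in)

bias_norms, backprops = [], []
for i in range(n):
    layer.zero_grad()
    out = layer(x[i : i + 1])             # (1, T, d_out)
    out.retain_grad()                     # keep the gradient of the layer output (the "backprop" g)
    loss = (out ** 2).sum()               # any scalar loss works for this check
    loss.backward()
    bias_norms.append(layer.bias.grad.norm())
    backprops.append(out.grad.squeeze(0)) # g for sample i, shape (T, d_out)

bias_norms = torch.stack(bias_norms)      # per-sample ||grad_bias|| from autograd
g = torch.stack(backprops)                # (n, T, d_out)
ggT = torch.einsum("nik,njk->nij", g, g)
ghost = torch.sqrt(torch.einsum("n...i->n", ggT).clamp(min=0))

print(torch.allclose(bias_norms, ghost, atol=1e-4))  # True: the formula matches autograd
```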

opacus/tests/grad_sample_module_fast_gradient_clipping_test.py

Lines changed: 21 additions & 23 deletions
@@ -15,6 +15,7 @@
 
 import logging
 import unittest
+import copy
 
 import hypothesis.strategies as st
 import torch
@@ -27,6 +28,7 @@
 from opacus.utils.per_sample_gradients_utils import clone_module
 from torch.utils.data import DataLoader, Dataset
 
+
 from .grad_sample_module_test import GradSampleModuleTest, SampleConvNet
 
 
@@ -54,8 +56,8 @@ def __init__(self):
         super(SampleModule, self).__init__()
         self.fc1 = nn.Linear(2, 2)
         self.fc3 = nn.Linear(2, 1024)
-        self.fc4 = nn.Linear(1024, 1024)
-        self.fc5 = nn.Linear(1024, 1)
+        self.fc4 = nn.Linear(1024, 10)
+        self.fc5 = nn.Linear(10, 1)
         self.layer_norm = nn.LayerNorm(2)
 
     def forward(self, x):
@@ -119,7 +121,7 @@ def setUp_data_sequantial(self, size, length, dim):
 
     @given(
         size=st.sampled_from([10]),
-        length=st.sampled_from([1]),
+        length=st.sampled_from([1, 10]),
         dim=st.sampled_from([2]),
     )
     @settings(deadline=1000000)
@@ -131,7 +133,7 @@ def test_norm_calculation_fast_gradient_clipping(self, size, length, dim):
         self.size = size
         self.dim = dim
 
-        self.criterion = torch.nn.CrossEntropyLoss(reduction="none")
+        self.criterion = torch.nn.CrossEntropyLoss(reduction="mean")
         self.setUp_data_sequantial(self.size, self.length, self.dim)
         noise_multiplier = 0.0
         batch_size = self.size
@@ -150,19 +152,21 @@ def test_norm_calculation_fast_gradient_clipping(self, size, length, dim):
             clone_module(sample_module),
             max_grad_norm=max_grad_norm,
             use_ghost_clipping=True,
+            loss_reduction="mean",
         )
         optimizer_gc = torch.optim.SGD(self.grad_sample_module.parameters(), lr=1)
         optimizer_gc = DPOptimizerFastGradientClipping(
             optimizer_gc,
             noise_multiplier=noise_multiplier,
             max_grad_norm=max_grad_norm,
             expected_batch_size=batch_size,
+            loss_reduction="mean",
         )
 
         (input_data, target_data) = list(self.dl)[0]
         optimizer_normal.zero_grad()
         output_normal = self.model_normal(input_data)
-        loss_normal = torch.mean(self.criterion(output_normal, target_data), dim=0)
+        loss_normal = self.criterion(output_normal, target_data)
         loss_normal.backward()
         all_norms_normal = torch.stack(
             [
@@ -173,19 +177,13 @@ def test_norm_calculation_fast_gradient_clipping(self, size, length, dim):
         )
         flat_norms_normal = torch.cat([p.flatten() for p in all_norms_normal])
 
-        self.grad_sample_module.enable_hooks()
-        output_gc = self.grad_sample_module(input_data)
-
-        first_loss_per_sample = self.criterion(output_gc, target_data)
-        first_loss = torch.mean(first_loss_per_sample)
-        first_loss.backward(retain_graph=True)
-
         optimizer_gc.zero_grad()
-        coeff = self.grad_sample_module.get_clipping_coef()
-        second_loss_per_sample = coeff * first_loss_per_sample
-        second_loss = torch.sum(second_loss_per_sample)
-        self.grad_sample_module.disable_hooks()
-        second_loss.backward()
+        criterion_gc = DPLossFastGradientClipping(
+            self.grad_sample_module, optimizer_gc, copy.deepcopy(self.criterion)
+        )
+        output_gc = self.grad_sample_module(input_data)
+        loss_gc = criterion_gc(output_gc, target_data)
+        loss_gc.backward()
 
         all_norms_gc = [
             param._norm_sample for param in self.grad_sample_module.parameters()
@@ -194,13 +192,13 @@ def test_norm_calculation_fast_gradient_clipping(self, size, length, dim):
 
         diff = flat_norms_normal - flat_norms_gc
 
-        logging.info(f"Diff = {diff}"),
+        logging.info(f"Max difference between (vanilla) Opacus and FGC = {max(diff)}")
         msg = "Fail: Gradients from vanilla DP-SGD and from fast gradient clipping are different"
         assert torch.allclose(flat_norms_normal, flat_norms_gc, atol=1e-3), msg
 
     @given(
         size=st.sampled_from([10]),
-        length=st.sampled_from([1, 5]),
+        length=st.sampled_from([1, 10]),
         dim=st.sampled_from([2]),
     )
     @settings(deadline=1000000)
@@ -243,7 +241,7 @@ def test_gradient_calculation_fast_gradient_clipping(self, size, length, dim):
         )
 
         criterion_gc = DPLossFastGradientClipping(
-            self.grad_sample_module, optimizer_gc, self.criterion
+            self.grad_sample_module, optimizer_gc, copy.deepcopy(self.criterion)
        )
 
         (input_data, target_data) = list(self.dl)[0]
@@ -273,7 +271,7 @@ def test_gradient_calculation_fast_gradient_clipping(self, size, length, dim):
                 for (g_gc, g_normal) in zip(flat_grads_gc, flat_grads_normal)
             ]
         )
-        logging.info(f"Diff = {diff}")
+        logging.info(f"Max difference between (vanilla) Opacus and FGC = {max(diff)}")
         msg = "Fail: Gradients from vanilla DP-SGD and from fast gradient clipping are different"
         assert torch.allclose(flat_grads_normal, flat_grads_gc, atol=1e-3), msg
 
@@ -350,7 +348,7 @@ def test_norm_calculation(self):
 
         diff = flat_norms_normal - flat_norms_gc
 
-        logging.info(f"Diff = {diff}")
+        logging.info(f"Max difference between (vanilla) Opacus and FGC = {max(diff)}")
         msg = "Fail: Gradient norms from vanilla DP-SGD and from fast gradient clipping are different"
         assert torch.allclose(flat_norms_normal, flat_norms_gc, atol=1e-3), msg
 
@@ -421,6 +419,6 @@ def test_gradient_calculation(self):
             ]
         )
 
-        logging.info(f"Diff = {diff}")
+        logging.info(f"Max difference between (vanilla) Opacus and FGC = {max(diff)}")
         msg = "Fail: Gradients from vanilla DP-SGD and from fast gradient clipping are different"
         assert torch.allclose(flat_grads_normal, flat_grads_gc, atol=1e-3), msg
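The updated test replaces the hand-rolled two-pass ghost-clipping loop with the `DPLossFastGradientClipping` wrapper and a single `backward()` call, and adds `length=10` so three-dimensional (sequence) data exercises the fixed bias path. Below is a condensed sketch of that training-step pattern; the toy model, data shapes, and import paths are assumptions (paths may differ across Opacus versions), while the constructor arguments mirror those used in the test.

```python
import torch
import torch.nn as nn

# Import paths are assumptions based on recent Opacus releases; adjust as needed.
from opacus.grad_sample import GradSampleModuleFastGradientClipping
from opacus.optimizers import DPOptimizerFastGradientClipping
from opacus.utils.fast_gradient_clipping_utils import DPLossFastGradientClipping


class ToySeqClassifier(nn.Module):
    # Hypothetical stand-in for the test's SampleModule.
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(2, 16)
        self.fc2 = nn.Linear(16, 4)

    def forward(self, x):                 # x: (batch, tokens, 2)
        h = torch.relu(self.fc1(x))
        return self.fc2(h).mean(dim=1)    # (batch, 4) logits


batch_size, length = 10, 10               # three-dim data, as in the updated test
x = torch.randn(batch_size, length, 2)
y = torch.randint(0, 4, (batch_size,))

model = GradSampleModuleFastGradientClipping(
    ToySeqClassifier(), max_grad_norm=1.0, use_ghost_clipping=True, loss_reduction="mean"
)
optimizer = DPOptimizerFastGradientClipping(
    torch.optim.SGD(model.parameters(), lr=1.0),
    noise_multiplier=0.0,
    max_grad_norm=1.0,
    expected_batch_size=batch_size,
    loss_reduction="mean",
)
criterion = DPLossFastGradientClipping(
    model, optimizer, nn.CrossEntropyLoss(reduction="mean")
)

optimizer.zero_grad()
output = model(x)
loss = criterion(output, y)   # wrapper handles per-sample norms and clipping internally
loss.backward()               # single backward call replaces the old two-pass loop
optimizer.step()
```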
