
Commit 7555297

aparna-aketi authored and facebook-github-bot committed
Remove presence of grad_sample from optimizer for FGC (#756)
Summary:
Pull Request resolved: #756

In case of FGC, grad_sample is set to None in the backward hook after computing the per-layer norm. There is no need to set p.grad_sample to None in the optimizer.

Reviewed By: EnayatUllah

Differential Revision: D74418221

fbshipit-source-id: 0f91288e0839d35887e5ec6add36fc3baf89dd85
1 parent 4acea9f commit 7555297
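
The summary above hinges on the fact that, with Fast Gradient Clipping, per-sample gradients exist only transiently inside the backward hook. The following is a minimal, self-contained sketch of that flow; the function names and the norm computation are illustrative assumptions, not Opacus APIs:

import torch

def per_sample_norm(grad_sample: torch.Tensor) -> torch.Tensor:
    # Flatten all but the batch dimension and take an L2 norm per sample
    # (illustrative; the real per-layer norm computation lives in Opacus).
    return grad_sample.flatten(start_dim=1).norm(2, dim=1)

def capture_backprops_hook_sketch(p: torch.nn.Parameter) -> None:
    # The hook reduces per-sample gradients to a per-layer norm ...
    p._norm_sample = per_sample_norm(p.grad_sample)
    # ... and releases them immediately, so p.grad_sample is already None
    # by the time the optimizer runs; zero_grad need not touch it.
    p.grad_sample = None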

File tree

2 files changed: +4 −6 lines


opacus/grad_sample/grad_sample_module_fast_gradient_clipping.py

Lines changed: 1 addition & 1 deletion

@@ -240,7 +240,7 @@ def capture_backprops_hook(
                 grad_sample=p.grad_sample,
                 max_batch_len=module.max_batch_len,
             )
-            del p.grad_sample
+            p.grad_sample = None
         if len(module.activations) == 0:
             if hasattr(module, "max_batch_len"):
                 del module.max_batch_len
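
A note on the replacement itself: assigning None keeps the grad_sample attribute defined on the parameter, whereas del removes it entirely. A small illustration of the difference follows; the motivation is an inference, not something stated in the commit message:

import torch

p = torch.nn.Parameter(torch.zeros(4, 3))
p.grad_sample = torch.zeros(2, 4, 3)

# After `p.grad_sample = None`, later code can still test the attribute cheaply:
p.grad_sample = None
assert p.grad_sample is None

# After `del p.grad_sample`, the same test would raise AttributeError
# unless guarded with hasattr/getattr.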

opacus/optimizers/optimizer_fast_gradient_clipping.py

Lines changed: 3 additions & 5 deletions

@@ -123,10 +123,10 @@ def zero_grad(self, set_to_none: bool = False):
         """
         Clear gradients.
 
-        Clears ``p.grad``, ``p.grad_sample`` and ``p.summed_grad`` for all of it's parameters
+        Clears ``p.grad`` and ``p.summed_grad`` for all of it's parameters
 
         Notes:
-            ``set_to_none`` argument only affects ``p.grad``. ``p.grad_sample`` and
+            ``set_to_none`` argument only affects ``p.grad`` and
             ``p.summed_grad`` is never zeroed out and always set to None.
             Normal grads can do this, because their shape is always the same.
             Grad samples do not behave like this, as we accumulate gradients from different
@@ -140,13 +140,11 @@ def zero_grad(self, set_to_none: bool = False):
         if set_to_none is False:
             logger.debug(
                 "Despite set_to_none is set to False, "
-                "opacus will set p.grad_sample and p.summed_grad to None due to "
+                "opacus will set p.summed_grad to None due to "
                 "non-trivial gradient accumulation behaviour"
             )
 
         for p in self.params:
-            p.grad_sample = None
-
             if not self._is_last_step_skipped:
                 p.summed_grad = None
         self.original_optimizer.zero_grad(set_to_none)
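
The docstring's rationale for always dropping the accumulated state rather than zeroing it in place can be seen from shapes alone. A short illustration, with arbitrary batch sizes chosen purely for demonstration:

import torch

weight_shape = (4, 3)

# p.grad always has the parameter's own shape, so zeroing it in place is well defined:
grad = torch.zeros(weight_shape)

# Per-sample gradients carry a leading batch dimension that can differ between
# accumulation steps (e.g. a final partial batch), so there is no single fixed
# shape to zero them to; resetting the attribute to None is the safe reset.
grad_sample_step1 = torch.randn(16, *weight_shape)  # full batch of 16
grad_sample_step2 = torch.randn(9, *weight_shape)   # partial batch of 9
print(grad_sample_step1.shape, grad_sample_step2.shape)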
