@@ -26,7 +26,8 @@
     create_or_accumulate_grad_sample,
     promote_current_grad_sample,
 )
-from opacus.utils.module_utils import requires_grad, trainable_parameters
+from opacus.layers.dp_rnn import DPGRU, DPLSTM, DPRNN
+from opacus.utils.module_utils import requires_grad, trainable_modules, trainable_parameters
 
 
 logger = logging.getLogger(__name__)
@@ -109,6 +110,8 @@ def __init__( |
                 If ``strict`` is set to ``True`` and module ``m`` (or any of its
                 submodules) includes a buffer.
         """
+        if logger.isEnabledFor(logging.INFO):
+            self.module_gradient_sample_mode(module=m, force_functorch=force_functorch, use_ghost_clipping=use_ghost_clipping)
 
         super().__init__(
             m,
@@ -233,7 +236,43 @@ def capture_backprops_hook( |
         if len(module.activations) == 0:
             if hasattr(module, "max_batch_len"):
                 del module.max_batch_len
+
+    def module_gradient_sample_mode(
+        self, module: nn.Module, *, force_functorch=False, use_ghost_clipping=True
+    ):
+        """
+        Add logs to track gradient sample mode for each part of the module, including 1) Ghost Clipping, 2) Fast Gradient Clipping (hook mode), and 3) Fast Gradient Clipping (functorch mode).
 
+        Args:
+            module: nn.Module to be checked
+            force_functorch: If set to ``True``, will use functorch to compute
+                all per sample gradients. Otherwise, functorch will be used only
+                for layers without registered grad sampler methods.
+            use_ghost_clipping: If set to ``True``, Ghost Clipping
+                will be used for clipping gradients of supported layers. If ``False``, Fast
+                Gradient Clipping will be used for all layers.
+        """
+        for m_name, m in trainable_modules(module):
+            if type(m) in [DPRNN, DPLSTM, DPGRU]:
+                logger.info(
+                    f"Module name: {m_name}, module type: {type(m)}. No hook or functorch is added."
+                )
+
+            elif use_ghost_clipping and type(m) in self.NORM_SAMPLERS:
+                logger.info(
+                    f"Module name: {m_name}, module type: {type(m)}, under Ghost Clipping."
+                )
+
+            else:
+                if not force_functorch and type(m) in self.GRAD_SAMPLERS:
+                    logger.info(
+                        f"Module name: {m_name}, module type: {type(m)}, under Fast Gradient Clipping (hook mode)."
+                    )
+                else:
+                    logger.info(
+                        f"Module name: {m_name}, module type: {type(m)}, under Fast Gradient Clipping (functorch mode)."
+                    )
+
     @property
     def per_sample_gradient_norms(self) -> torch.Tensor:
         """Returns per sample gradient norms. Note that these are not privatized and should only be used for debugging purposes or in non-private settings"""
|