
Commit 8fb03aa

Merge branch 'main' into benchmarks-ci
2 parents 82474f1 + 12cf9ed commit 8fb03aa

26 files changed (+798, -672 lines)

.circleci/config.yml

Lines changed: 1 addition & 1 deletion

@@ -204,7 +204,7 @@ commands:
           echo "Using $(python -V) ($(which python))"
           echo "Using $(pip -V) ($(which pip))"
           pip install --user datasets transformers
-          python examples/imdb.py --lr 0.02 --sigma 0.56 -c 1.0 --batch-size 32 --max-sequence-length 256 --epochs 1 --data-root runs/imdb/data --device <<parameters.device>>
+          python examples/imdb.py --lr 0.02 --sigma 1.0 -c 1.0 --batch-size 64 --max-sequence-length 256 --epochs 2 --data-root runs/imdb/data --device <<parameters.device>>
           python -c "import torch; accuracy = torch.load('run_results_imdb_classification.pt'); exit(0) if (accuracy>0.54 and accuracy<0.66) else exit(1)"
       when: always
       - store_test_results:
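For reference, the accuracy gate invoked by this step boils down to the following check (a minimal sketch; the output file name and the 0.54-0.66 band come straight from the config above, everything else is illustrative):

```
import sys

import torch

# examples/imdb.py saves its final test accuracy; the CI job fails
# if the value falls outside the expected band for the new hyperparameters.
accuracy = torch.load("run_results_imdb_classification.pt")
sys.exit(0 if 0.54 < accuracy < 0.66 else 1)
```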

CONTRIBUTING.md

Lines changed: 26 additions & 0 deletions

@@ -95,6 +95,32 @@ Run following command from `website` folder. It will build the docs and serve th
     ./scripts/build_website.sh
     ```

+ You can also run spell checks on the documentation automatically (besides IDEs) using [```sphinxcontrib-spelling```](https://sphinxcontrib-spelling.readthedocs.io/en/latest/install.html).
+ Note that you will also need [```PyEnchant```](https://pyenchant.github.io/pyenchant/) to run ```sphinxcontrib-spelling```, and thus the Enchant C library; see the PyEnchant guide linked in step 3 below.
+
+ Steps:
+ 1. Install the extension with pip: ```pip install sphinxcontrib-spelling```
+ 2. Add ```sphinxcontrib.spelling``` to the extensions list in ```conf.py```.
+ 3. Install ```PyEnchant``` by following the [installation guide](https://pyenchant.github.io/pyenchant/install.html). Note that Apple Silicon may require a workaround; see the section "Apple Silicon related errors".
+ 4. Make sure you have a ```source``` and a ```build``` folder, then pass "spelling" as the builder argument to ```sphinx-build```:
+ ```
+ cd website/sphinx
+ mkdir build  # if you do not already have one
+ sphinx-build -b spelling source build
+ ```
+ 5. Look for files with spelling errors in ```build``` (remember to check each folder). A file is generated for each source file that contains spelling errors. Example:
+    * File name: ```batch_memory_manager.spelling```
+    * File content:
+ ```
+ ../../opacus/utils/batch_memory_manager.py:docstring of opacus.utils.batch_memory_manager.BatchMemoryManager:5: (occasinal) safeguarding against occasinal large batches produced by
+ ../../opacus/utils/batch_memory_manager.py:docstring of opacus.utils.batch_memory_manager.BatchMemoryManager:13: (optimzer) On every step optimzer will check if the batch was the last physical batch comprising
+ ../../opacus/utils/batch_memory_manager.py:docstring of opacus.utils.batch_memory_manager.BatchMemoryManager:14: (behaviour) a logical one, and will change behaviour accordignly.
+ ../../opacus/utils/batch_memory_manager.py:docstring of opacus.utils.batch_memory_manager.BatchMemoryManager:14: (accordignly) a logical one, and will change behaviour accordignly.
+ ../../opacus/utils/batch_memory_manager.py:docstring of opacus.utils.batch_memory_manager.BatchSplittingSampler:4: (physocal) Used to split large logical batches into physocal batches of a smaller size,
+ ```
+ 6. Manually review the spelling files and fix the source files accordingly. Some detections are false positives; for example, "nn" (from torch.nn) can be flagged as a spelling error.
+
 ## Pull Requests
 We actively welcome your pull requests.
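As a rough illustration of step 2 above, the ```conf.py``` change could look like this (a sketch; the pre-existing extensions list is assumed, and the optional word list file is an illustrative addition supported by sphinxcontrib-spelling):

```
# conf.py (illustrative excerpt)
extensions = [
    "sphinx.ext.autodoc",       # assumed to be present already
    "sphinxcontrib.spelling",   # enables `sphinx-build -b spelling`
]

# Optional: whitelist known false positives such as "nn".
spelling_word_list_filename = "spelling_wordlist.txt"
```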

dev_requirements.txt

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
-torch==1.8.1
+torch
 torchvision>=0.9.1
 tqdm>=4.40
 requests>=2.25.1

opacus/accountants/accountant.py

Lines changed: 2 additions & 2 deletions

@@ -72,7 +72,7 @@ def get_optimizer_hook_fn(
         """
         Returns a callback function which can be used to attach to DPOptimizer
         Args:
-            sample_rate: Expected samping rate used for accounting
+            sample_rate: Expected sampling rate used for accounting
         """

         def hook_fn(optim: DPOptimizer):
@@ -88,7 +88,7 @@ def hook_fn(optim: DPOptimizer):

     def state_dict(self, destination: T_state_dict = None) -> T_state_dict:
         """
-        Retruns a dictionary containing the state of the accountant.
+        Returns a dictionary containing the state of the accountant.
         Args:
             destination: a mappable object to populate the current state_dict into.
                 If this arg is None, an OrderedDict is created and populated.
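For context, the two docstrings touched here are typically exercised as follows (a minimal sketch assuming the ```RDPAccountant``` class and ```DPOptimizer.attach_step_hook``` from the same codebase; ```dp_optimizer``` stands for an already-constructed DPOptimizer and the ```sample_rate``` value is illustrative):

```
from opacus.accountants import RDPAccountant

accountant = RDPAccountant()

# Record privacy spending on every optimizer step at the expected sampling rate.
dp_optimizer.attach_step_hook(accountant.get_optimizer_hook_fn(sample_rate=0.01))

# Save and restore the accountant together with a checkpoint.
state = accountant.state_dict()
accountant.load_state_dict(state)
```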

opacus/grad_sample/functorch.py

Lines changed: 61 additions & 0 deletions

@@ -0,0 +1,61 @@
+from opacus.layers.dp_rnn import RNNLinear
+
+
+def prepare_layer(layer, batch_first=True):
+    """
+    Prepare a layer to compute grad samples using functorch.
+    The grad samples are computed by redoing the forward and
+    backward passes on the functional version of the module.
+
+    Args:
+        layer: the layer to prepare
+        batch_first: whether the input is batch_first or not
+    """
+    from functorch import grad, make_functional, vmap
+
+    if len(list(layer.buffers())) > 0:
+        raise NotImplementedError(
+            "This layer has buffers and is not supported by Opacus"
+        )
+    flayer, _ = make_functional(layer)
+
+    def compute_loss_stateless_model(params, activations, backprops):
+        if batch_first or type(layer) is RNNLinear:
+            batched_activations = activations.unsqueeze(0)
+            batched_backprops = backprops.unsqueeze(0)
+        else:
+            # If batch_first is False, the batch dimension is the second dimension
+            batched_activations = activations.unsqueeze(1)
+            batched_backprops = backprops.unsqueeze(1)
+
+        output = flayer(params, batched_activations)
+        loss = (output * batched_backprops).sum()
+
+        return loss
+
+    ft_compute_grad = grad(compute_loss_stateless_model)
+    # Note that the vmap is done on the first dimension, regardless of batch_first
+    # This is because the activations and backprops given by the GradSampleModule
+    # are always batch_first=True
+    layer.ft_compute_sample_grad = vmap(ft_compute_grad, in_dims=(None, 0, 0))
+
+
+def ft_compute_per_sample_gradient(layer, activations, backprops):
+    """
+    Compute the per-sample gradient of the layer.
+    Args:
+        layer: the layer on which to compute the gradient
+        activations: the input to the layer
+        backprops: the gradient of the loss w.r.t. outputs of the layer
+    """
+    parameters = list(layer.parameters())
+    if not hasattr(layer, "ft_compute_sample_grad"):
+        prepare_layer(layer)
+
+    per_sample_grads = layer.ft_compute_sample_grad(parameters, activations, backprops)
+
+    ret = {}
+    for i_p, p in enumerate(parameters):
+        ret[p] = per_sample_grads[i_p]
+
+    return ret
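These helpers can be exercised directly on a standalone layer, which is roughly what GradSampleModule does through its hooks (a sketch; the ```nn.Linear``` layer, the shapes, and the random tensors are illustrative, and functorch must be installed):

```
import torch
import torch.nn as nn
from opacus.grad_sample.functorch import ft_compute_per_sample_gradient, prepare_layer

layer = nn.Linear(4, 2)
prepare_layer(layer, batch_first=True)

batch_size = 8
activations = torch.randn(batch_size, 4)  # per-sample inputs to the layer
backprops = torch.randn(batch_size, 2)    # per-sample grads w.r.t. the layer's outputs

# Returns a dict mapping each parameter to a (batch_size, *param.shape) tensor.
per_sample_grads = ft_compute_per_sample_gradient(layer, activations, backprops)
print(per_sample_grads[layer.weight].shape)  # torch.Size([8, 2, 4])
```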

opacus/grad_sample/grad_sample_module.py

Lines changed: 59 additions & 18 deletions

@@ -16,13 +16,15 @@
 from __future__ import annotations

 import logging
+import warnings
 from functools import partial
 from typing import List, Tuple

 import torch
 import torch.nn as nn
+from opacus.grad_sample.functorch import ft_compute_per_sample_gradient, prepare_layer
 from opacus.grad_sample.gsm_base import AbstractGradSampleModule
-from opacus.layers.dp_rnn import DPRNNBase, DPRNNCellBase, RNNLinear
+from opacus.layers.dp_rnn import DPGRU, DPLSTM, DPRNN, RNNLinear
 from opacus.utils.module_utils import (
     requires_grad,
     trainable_modules,
@@ -89,6 +91,7 @@ def __init__(
         batch_first=True,
         loss_reduction="mean",
         strict: bool = True,
+        force_functorch=False,
     ):
         """

@@ -108,6 +111,9 @@ def __init__(
                 possible and set to None otherwise. This is not recommended, because
                 some unsupported modules (e.g. BatchNorm) affect other parameters and
                 invalidate the concept of per sample gradients for the entire model.
+            force_functorch: If set to ``True``, will use functorch to compute
+                all per sample gradients. Otherwise, functorch will be used only
+                for layers without registered grad sampler methods.

         Raises:
             NotImplementedError
@@ -128,13 +134,24 @@ def __init__(
         )

         self.hooks_enabled = False
-        self.add_hooks(loss_reduction=loss_reduction, batch_first=batch_first)
+        self.batch_first = batch_first
+        self.loss_reduction = loss_reduction
+        self.force_functorch = force_functorch
+        self.add_hooks(
+            loss_reduction=loss_reduction,
+            batch_first=batch_first,
+            force_functorch=force_functorch,
+        )

     def forward(self, *args, **kwargs):
         return self._module(*args, **kwargs)

     def add_hooks(
-        self, *, loss_reduction: str = "mean", batch_first: bool = True
+        self,
+        *,
+        loss_reduction: str = "mean",
+        batch_first: bool = True,
+        force_functorch: bool = False,
     ) -> None:
         """
         Adds hooks to model to save activations and backprop values.
@@ -151,6 +168,8 @@ def add_hooks(
                 ``[K, batch_size, ...]``
             loss_reduction: Indicates if the loss reduction (for aggregating the gradients)
                 is a sum or a mean operation. Can take values "sum" or "mean"
+            force_functorch: If set to ``True``, will use functorch to compute all per sample gradients.
+                Otherwise, functorch will be used only for layers without registered grad sampler methods.
         """
         if hasattr(self._module, "autograd_grad_sample_hooks"):
             raise ValueError("Trying to add hooks twice to the same model")
@@ -159,20 +178,27 @@ def add_hooks(
             self.autograd_grad_sample_hooks = self._module.autograd_grad_sample_hooks

         for _module_name, module in trainable_modules(self._module):
-            if type(module) in self.GRAD_SAMPLERS:
-                self.autograd_grad_sample_hooks.append(
-                    module.register_forward_hook(self.capture_activations_hook)
-                )
+            # Do not add hooks to DPRNN, DPLSTM or DPGRU as the hooks are handled by the `RNNLinear`
+            if type(module) in [DPRNN, DPLSTM, DPGRU]:
+                continue
+
+            if force_functorch or not type(module) in self.GRAD_SAMPLERS:
+                prepare_layer(module, batch_first=batch_first)
+
+            self.autograd_grad_sample_hooks.append(
+                module.register_forward_hook(self.capture_activations_hook)
+            )

-                self.autograd_grad_sample_hooks.append(
-                    module.register_backward_hook(
-                        partial(
-                            self.capture_backprops_hook,
-                            loss_reduction=loss_reduction,
-                            batch_first=batch_first,
-                        )
+            self.autograd_grad_sample_hooks.append(
+                module.register_backward_hook(
+                    partial(
+                        self.capture_backprops_hook,
+                        loss_reduction=loss_reduction,
+                        batch_first=batch_first,
                     )
                 )
+            )
+
         self.enable_hooks()

     def remove_hooks(self) -> None:
@@ -197,6 +223,11 @@ def remove_hooks(self) -> None:
             delattr(self, "autograd_grad_sample_hooks")
             delattr(self._module, "autograd_grad_sample_hooks")

+        # Remove functorch hooks
+        for _module_name, module in trainable_modules(self._module):
+            if hasattr(module, "ft_compute_sample_grad"):
+                delattr(module, "ft_compute_sample_grad")
+
     def disable_hooks(self) -> None:
         r"""
         Globally disable all hooks installed by this library.
@@ -282,7 +313,11 @@ def capture_backprops_hook(
             loss_reduction=loss_reduction,
             batch_first=batch_first,
         )
-        grad_sampler_fn = self.GRAD_SAMPLERS[type(module)]
+        if not self.force_functorch and type(module) in self.GRAD_SAMPLERS:
+            grad_sampler_fn = self.GRAD_SAMPLERS[type(module)]
+        else:
+            grad_sampler_fn = ft_compute_per_sample_gradient
+
         grad_samples = grad_sampler_fn(module, activations, backprops)
         for param, gs in grad_samples.items():
             create_or_accumulate_grad_sample(
@@ -374,10 +409,13 @@ def is_supported(cls, module: nn.Module) -> bool:
         Returns:
             ``True`` if grad sampler is found, ``False`` otherwise
         """
-        return type(module) in cls.GRAD_SAMPLERS or isinstance(
-            module, (DPRNNBase, DPRNNCellBase)
+        warnings.warn(
+            "GradSampleModule.is_supported is deprecated, as all layers can now be used with functorch.",
+            DeprecationWarning,
         )

+        return True
+
     @classmethod
     def validate(
         cls, module: nn.Module, *, strict: bool = False
@@ -409,7 +447,10 @@ def validate(
                     f"(See opacus.grad_sample.utils.register_grad_sampler)"
                 )
                 for m_name, m in trainable_modules(module)
-                if not cls.is_supported(m)
+                # With functorch, all modules are trainable
+                # We still want to avoid module that have buffers (e.g. BatchNorm)
+                # as the buffers are not private
+                if len(list(m.buffers())) > 0
             ]
         )
         # raise or return errors as needed
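With these hooks in place, the new ```force_functorch``` switch can be exercised end to end like this (a minimal sketch; the toy model, batch size, and input shapes are illustrative, and functorch must be installed):

```
import torch
import torch.nn as nn
from opacus.grad_sample import GradSampleModule

model = nn.Sequential(nn.Linear(8, 16), nn.ReLU(), nn.Linear(16, 2))

# Route every layer through functorch instead of the registered grad samplers.
gsm = GradSampleModule(model, batch_first=True, force_functorch=True)

x = torch.randn(4, 8)
gsm(x).sum().backward()

# Each trainable parameter now carries per-sample gradients of shape (batch, *param.shape).
print(model[0].weight.grad_sample.shape)  # torch.Size([4, 16, 8])
```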

opacus/grad_sample/utils.py

Lines changed: 3 additions & 1 deletion

@@ -52,6 +52,8 @@ def decorator(f):

 def wrap_model(model: nn.Module, grad_sample_mode: str, *args, **kwargs):
     cls = get_gsm_class(grad_sample_mode)
+    if grad_sample_mode == "functorch":
+        kwargs["force_functorch"] = True
     return cls(model, *args, **kwargs)


@@ -63,7 +65,7 @@ def get_gsm_class(grad_sample_mode: str) -> Type[AbstractGradSampleModule]:
     :param grad_sample_mode:
     :return:
     """
-    if grad_sample_mode == "hooks":
+    if grad_sample_mode in ["hooks", "functorch"]:
         return GradSampleModule
     elif grad_sample_mode == "ew":
         return GradSampleModuleExpandedWeights
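The dispatch above means the new mode can also be selected by name through ```wrap_model``` (a sketch; ```model``` stands for any ```nn.Module``` you want to wrap):

```
from opacus.grad_sample.utils import wrap_model

# "hooks" and "functorch" both resolve to GradSampleModule; "functorch"
# additionally sets force_functorch=True, while "ew" resolves to
# GradSampleModuleExpandedWeights.
gsm = wrap_model(model, grad_sample_mode="functorch", batch_first=True)
```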

opacus/optimizers/ddp_perlayeroptimizer.py

Lines changed: 1 addition & 1 deletion

@@ -67,7 +67,7 @@ def __init__(
 class DistributedPerLayerOptimizer(DPOptimizer):
     """
     :class:`~opacus.optimizers.optimizer.DPOptimizer` that implements
-    per layer clipping strategy and is compatible with distibured data parallel
+    per layer clipping strategy and is compatible with distributed data parallel
     """

     def __init__(

opacus/optimizers/optimizer.py

Lines changed: 2 additions & 2 deletions

@@ -113,7 +113,7 @@ def _generate_noise(
         reference: The reference Tensor to get the appropriate shape and device
             for generating the noise
         generator: The PyTorch noise generator
-        secure_mode: boolean showing if "secure" noise need to be generate
+        secure_mode: boolean showing if "secure" noise need to be generated
             (see the notes)

     Notes:
@@ -186,7 +186,7 @@ class DPOptimizer(Optimizer):
     Examples:
         >>> module = MyCustomModel()
         >>> optimizer = torch.optim.SGD(module.parameters(), lr=0.1)
-        >>> dp_optimzer = DPOptimizer(
+        >>> dp_optimizer = DPOptimizer(
         ...     optimizer=optimizer,
         ...     noise_multiplier=1.0,
         ...     max_grad_norm=1.0,
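The corrected docstring example corresponds to roughly this usage (a sketch; ```MyCustomModel``` is a placeholder from the docstring, and ```expected_batch_size``` is an illustrative addition needed when ```loss_reduction="mean"```):

```
import torch
from opacus.optimizers import DPOptimizer

module = MyCustomModel()
optimizer = torch.optim.SGD(module.parameters(), lr=0.1)
dp_optimizer = DPOptimizer(
    optimizer=optimizer,
    noise_multiplier=1.0,
    max_grad_norm=1.0,
    expected_batch_size=32,  # required for "mean" loss reduction
)
```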

opacus/tests/grad_sample_module_test.py

Lines changed: 10 additions & 6 deletions

@@ -212,11 +212,18 @@ def __init__(self, in_f, out_f):
             def forward(self, x: torch.Tensor):
                 return F.linear(x, self.p)

-        with self.assertRaises(NotImplementedError):
-            GradSampleModule(SimpleLinear(4, 2))
+        # Should be handled by functorch
+        try:
+            gsm = GradSampleModule(SimpleLinear(4, 2))
+            self.assertTrue(hasattr(gsm._module, "ft_compute_sample_grad"))
+        except ImportError:
+            print("Test could not be run because functorch is not available")

         # Should not raise exception if strict=False
-        GradSampleModule(SimpleLinear(4, 2), strict=False)
+        try:
+            GradSampleModule(SimpleLinear(4, 2), strict=False)
+        except ImportError:
+            print("Test could not be run because functorch is not available")

         # Should not fail after relevant grad sampler has been registered
         register_grad_sampler(SimpleLinear)(compute_linear_grad_sample)
@@ -226,9 +233,6 @@ def test_custom_module_validation(self):
         with self.assertRaises(NotImplementedError):
             GradSampleModule(mobilenet_v3_small())

-        # Should not raise exception if strict=False
-        GradSampleModule(mobilenet_v3_small(), strict=False)
-
     def test_submodule_access(self):
         _ = self.grad_sample_module.fc1
         _ = self.grad_sample_module.fc2
