
Commit 78f73b6

kddnewton authored and facebook-github-bot committed
Allow a context manager to be called around apply_jit
Summary: When running torch.jit.script over the various forward functions, you can run into issues if any other utilities are interacting with the function definitions. For example, if another JIT is running, it needs to be disabled while scripting takes place. This commit adds the ability to pass an apply_jit_context context manager wherever apply_jit is currently accepted; the context manager is entered around the application of the torch JIT.

Differential Revision: D73781040
1 parent 97f8dea commit 78f73b6

3 files changed: +59, -7 lines
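To make the intent concrete, here is a minimal, self-contained sketch (not part of the commit) of the kind of context manager the summary describes: something that pauses another utility that would interfere with torch.jit.script and restores it afterwards. The OtherJit class and the pause_other_jit helper are purely illustrative stand-ins, not torchrec or PyTorch APIs.

from contextlib import contextmanager
from typing import Generator


class OtherJit:
    """Illustrative stand-in for some other JIT-like utility that must be paused."""

    def __init__(self) -> None:
        self.enabled = True

    def disable(self) -> None:
        self.enabled = False

    def enable(self) -> None:
        self.enabled = True


other_jit = OtherJit()


@contextmanager
def pause_other_jit() -> Generator[None, None, None]:
    # Entered just before torch.jit.script is applied to the unsharded modules.
    other_jit.disable()
    try:
        yield
    finally:
        # Always re-enable, even if scripting raises.
        other_jit.enable()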

torchrec/distributed/train_pipeline/tests/test_train_pipelines_utils.py

Lines changed: 25 additions & 1 deletion

@@ -10,7 +10,8 @@
 import copy
 import enum
 import unittest
-from typing import List
+from contextlib import contextmanager
+from typing import Generator, List
 from unittest.mock import MagicMock
 
 import torch
@@ -42,6 +43,29 @@ class ModelType(enum.Enum):
 
 
 class TrainPipelineUtilsTest(TrainPipelineSparseDistTestBase):
+    # pyre-fixme[56]: Pyre was not able to infer the type of argument
+    @unittest.skipIf(
+        not torch.cuda.is_available(),
+        "Not enough GPUs, this test requires at least one GPU",
+    )
+    def test_rewrite_model_apply_jit(self) -> None:
+        @contextmanager
+        def apply_jit_context(events: list[str]) -> Generator[None, None, None]:
+            events.append("__enter__")
+            yield
+            events.append("__exit__")
+
+        events = []
+        _rewrite_model(
+            model=self._setup_model(),
+            context=TrainPipelineContext(),
+            dist_stream=None,
+            apply_jit=True,
+            apply_jit_context=apply_jit_context(events),
+        )
+
+        self.assertEqual(events, ["__enter__", "__exit__"])
+
     # pyre-fixme[56]: Pyre was not able to infer the type of argument
     @unittest.skipIf(
         not torch.cuda.is_available(),

torchrec/distributed/train_pipeline/train_pipelines.py

Lines changed: 23 additions & 1 deletion

@@ -11,6 +11,7 @@
 import contextlib
 import logging
 from collections import deque
+from contextlib import nullcontext
 from dataclasses import dataclass
 from typing import (
     Any,
@@ -318,6 +319,9 @@ def progress(self, dataloader_iter: Iterator[In]) -> Out:
         return output
 
 
+_apply_jit_context_default: ContextManager[None] = nullcontext()
+
+
 class TrainPipelineSparseDist(TrainPipeline[In, Out]):
     """
     This pipeline overlaps device transfer, and `ShardedModule.input_dist()` with
@@ -343,6 +347,8 @@ class TrainPipelineSparseDist(TrainPipeline[In, Out]):
         execute_all_batches (bool): executes remaining batches in pipeline after
             exhausting dataloader iterator.
         apply_jit (bool): apply torch.jit.script to non-pipelined (unsharded) modules.
+        apply_jit_context (ContextManager): a context manager that will surround the
+            application of the JIT
     """
 
     # The PipelinedForward class that is used in _rewrite_model
@@ -355,6 +361,7 @@ def __init__(
         device: torch.device,
         execute_all_batches: bool = True,
         apply_jit: bool = False,
+        apply_jit_context: ContextManager[None] = _apply_jit_context_default,
         context_type: Type[TrainPipelineContext] = TrainPipelineContext,
         # keep for backward compatibility
         pipeline_postproc: bool = False,
@@ -367,6 +374,7 @@ def __init__(
         self._device = device
         self._execute_all_batches = execute_all_batches
         self._apply_jit = apply_jit
+        self._apply_jit_context = apply_jit_context
 
         if device.type == "cuda":
             # use two data streams to support two concurrent batches
@@ -641,6 +649,7 @@ def _pipeline_model(
             default_stream=torch.get_device_module(self._device).current_stream(),
             batch=batch,
             apply_jit=self._apply_jit,
+            apply_jit_context=self._apply_jit_context,
             pipelined_forward=pipelined_forward,
             pipeline_postproc=self._pipeline_postproc,
         )
@@ -820,6 +829,8 @@ class TrainPipelineSemiSync(TrainPipelineSparseDist[In, Out]):
         execute_all_batches (bool): executes remaining batches in pipeline after
             exhausting dataloader iterator.
         apply_jit (bool): apply torch.jit.script to non-pipelined (unsharded) modules.
+        apply_jit_context (ContextManager): a context manager that will surround the
+            application of the JIT
         start_batch (int): batch to begin semi-sync training. Typically small period of synchronous training reduces early stage NEX.
         stash_gradients (bool): if True, will store gradients for each parameter to insure true "Semi-Sync"
             training. If False, will update dense optimizer as soon as gradients available (naive "Semi-Sync)
@@ -835,6 +846,7 @@ def __init__(
         device: torch.device,
         execute_all_batches: bool = True,
         apply_jit: bool = False,
+        apply_jit_context: ContextManager[None] = _apply_jit_context_default,
         start_batch: int = 900,
         stash_gradients: bool = False,
         pipeline_postproc: bool = True,
@@ -849,6 +861,7 @@ def __init__(
             device=device,
             execute_all_batches=execute_all_batches,
             apply_jit=apply_jit,
+            apply_jit_context=apply_jit_context,
             context_type=EmbeddingTrainPipelineContext,
             pipeline_postproc=pipeline_postproc,
             custom_model_fwd=custom_model_fwd,
@@ -1135,6 +1148,8 @@ class PrefetchTrainPipelineSparseDist(TrainPipelineSparseDist[In, Out]):
         execute_all_batches (bool): executes remaining batches in pipeline after
             exhausting dataloader iterator.
         apply_jit (bool): apply torch.jit.script to non-pipelined (unsharded) modules.
+        apply_jit_context (ContextManager): a context manager that will surround the
+            application of the JIT
     """
 
     # The PipelinedForward class that is used in _rewrite_model
@@ -1147,6 +1162,7 @@ def __init__(
         device: torch.device,
         execute_all_batches: bool = True,
         apply_jit: bool = False,
+        apply_jit_context: ContextManager[None] = _apply_jit_context_default,
         pipeline_postproc: bool = True,
         custom_model_fwd: Optional[
             Callable[[Optional[In]], Tuple[torch.Tensor, Out]]
@@ -1158,6 +1174,7 @@ def __init__(
             device=device,
             execute_all_batches=execute_all_batches,
             apply_jit=apply_jit,
+            apply_jit_context=apply_jit_context,
             context_type=PrefetchTrainPipelineContext,
             pipeline_postproc=pipeline_postproc,
             custom_model_fwd=custom_model_fwd,
@@ -1292,6 +1309,8 @@ class EvalPipelineSparseDist(TrainPipelineSparseDist[In, Out]):
         device (torch.device): device where device transfer, sparse data dist, and
             forward/backward pass will happen.
         apply_jit (bool): apply torch.jit.script to non-pipelined (unsharded) modules.
+        apply_jit_context (ContextManager): a context manager that will surround the
+            application of the JIT
     """
 
     # The PipelinedForward class that is used in _rewrite_model
@@ -1303,8 +1322,9 @@ def __init__(
         optimizer: torch.optim.Optimizer,
         device: torch.device,
         apply_jit: bool = False,
+        apply_jit_context: ContextManager[None] = _apply_jit_context_default,
     ) -> None:
-        super().__init__(model, optimizer, device, True, apply_jit)
+        super().__init__(model, optimizer, device, True, apply_jit, apply_jit_context)
         self._batch_loader: Optional[DataLoadingThread[In]] = None
 
     def __del__(self) -> None:
@@ -1661,6 +1681,7 @@ def __init__(
         device: torch.device,
         execute_all_batches: bool = True,
         apply_jit: bool = False,
+        apply_jit_context: ContextManager[None] = _apply_jit_context_default,
         context_type: Type[TrainPipelineContext] = TrainPipelineContext,
         pipeline_postproc: bool = False,
         custom_model_fwd: Optional[
@@ -1673,6 +1694,7 @@ def __init__(
             device,
             execute_all_batches,
             apply_jit,
+            apply_jit_context,
             context_type,
             pipeline_postproc,
             custom_model_fwd,
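
A hedged usage sketch of the new keyword on TrainPipelineSparseDist, assuming model, optimizer, and device have already been set up as for any sharded train pipeline, and that pause_other_jit is the kind of context manager sketched near the top of this page; only apply_jit_context is new in this diff.

from torchrec.distributed.train_pipeline.train_pipelines import TrainPipelineSparseDist

pipeline = TrainPipelineSparseDist(
    model=model,
    optimizer=optimizer,
    device=device,
    apply_jit=True,
    # Entered around the torch.jit.script pass over non-pipelined modules;
    # defaults to a no-op nullcontext() when omitted.
    apply_jit_context=pause_other_jit(),
)

Because the parameter is plumbed through TrainPipelineSemiSync, PrefetchTrainPipelineSparseDist, and EvalPipelineSparseDist as well, the same keyword works on those constructors.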

torchrec/distributed/train_pipeline/utils.py

Lines changed: 11 additions & 5 deletions

@@ -11,7 +11,7 @@
 import itertools
 import logging
 from collections import defaultdict, OrderedDict
-from contextlib import AbstractContextManager
+from contextlib import AbstractContextManager, nullcontext
 from dataclasses import dataclass, field
 
 from itertools import chain
@@ -20,6 +20,7 @@
     Any,
     Callable,
     cast,
+    ContextManager,
     Dict,
     Generic,
     Iterable,
@@ -1454,13 +1455,17 @@ def _pipeline_detach_model(
         setattr(model, postproc_mod.fqn, postproc_mod.postproc_module)
 
 
+_rewrite_model_apply_jit_context_default: ContextManager[None] = nullcontext()
+
+
 # pyre-ignore[3]
 def _rewrite_model(  # noqa C901
     model: torch.nn.Module,
     context: TForwardContext,
     dist_stream: Optional[torch.Stream],
     batch: Optional[In] = None,
     apply_jit: bool = False,
+    apply_jit_context: ContextManager[None] = _rewrite_model_apply_jit_context_default,
     pipelined_forward: Type[BaseForward[TrainPipelineContext]] = PipelinedForward,
     pipeline_postproc: bool = False,
     default_stream: Optional[torch.Stream] = None,
@@ -1546,10 +1551,11 @@ def _rewrite_model(  # noqa C901
 
     # JIT script unsharded modules if applicable.
     if apply_jit:
-        graph_model = torch.fx.GraphModule(model, graph)
-        _jit_modules(graph_model, "")
-        if isinstance(input_model, DistributedModelParallel):
-            input_model.module = graph_model
+        with apply_jit_context:
+            graph_model = torch.fx.GraphModule(model, graph)
+            _jit_modules(graph_model, "")
+            if isinstance(input_model, DistributedModelParallel):
+                input_model.module = graph_model
 
     if non_pipelined_sharded_modules:
         logger.warn(
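
One note on the defaults added here: contextlib.nullcontext() is a reusable, no-op context manager, so a single module-level instance such as _rewrite_model_apply_jit_context_default can safely serve as the shared default, and callers that never pass apply_jit_context keep exactly the previous behavior. A tiny illustration of that no-op default (run_scripting is a hypothetical stand-in, not a torchrec function):

from contextlib import nullcontext
from typing import ContextManager


def run_scripting(apply_jit_context: ContextManager[None] = nullcontext()) -> None:
    # Stand-in for the GraphModule construction and _jit_modules call that
    # _rewrite_model now wraps; with the default nullcontext() this runs as before.
    with apply_jit_context:
        print("scripting would happen here")


run_scripting()  # no context manager supplied: behaves exactly as before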
