
Commit abf4da0

sraikund16 authored and pytorchmergebot committed
[Profiler] Induce Inductor Import before Profiling (pytorch#155243)
Fixes pytorch#151829

Summary: Inductor currently has a lazy init that causes certain aten ops to run during a profiling run, which clutters the function events, especially for smaller traces. One attempted fix was to simply remove the inductor import from the profiler entirely, but the import happens somewhere downstream anyway and the events still flood the profile. Instead, we induce the inductor import during prepare_trace if inductor is present. This way, regardless of how the workload imports inductor, the lazy init completes before tracing starts, resulting in more accurate traces.

Test Plan: Added a test. Also ran N7316820 manually; instead of many extraneous events on the first run, it now produces the following output (the only remaining difference is the "Runtime Triggered Module Loading" entries, which are CUPTI overhead):

| Name | Self CPU % | Self CPU | CPU total % | CPU total | CPU time avg | Self CUDA | Self CUDA % | CUDA total | CUDA time avg | # of Calls |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| aten::mul_ | 1.40% | 340.638us | 99.92% | 24.390ms | 24.390ms | 1.535us | 100.00% | 4.605us | 4.605us | 1 |
| cudaLaunchKernel | 0.60% | 146.533us | 98.52% | 24.049ms | 24.049ms | 0.000us | 0.00% | 3.070us | 3.070us | 1 |
| Runtime Triggered Module Loading | 6.14% | 1.500ms | 6.14% | 1.500ms | 1.500ms | 1.535us | 100.00% | 1.535us | 1.535us | 1 |
| Runtime Triggered Module Loading | 91.78% | 22.403ms | 91.78% | 22.403ms | 22.403ms | 1.535us | 100.00% | 1.535us | 1.535us | 1 |
| void at::native::vectorized_elementwise_kernel<4, at... | 0.00% | 0.000us | 0.00% | 0.000us | 0.000us | 1.535us | 100.00% | 1.535us | 1.535us | 1 |
| cudaDeviceSynchronize | 0.08% | 20.031us | 0.08% | 20.031us | 20.031us | 0.000us | 0.00% | 0.000us | 0.000us | 1 |

| Name | Self CPU % | Self CPU | CPU total % | CPU total | CPU time avg | Self CUDA | Self CUDA % | CUDA total | CUDA time avg | # of Calls |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| aten::mul_ | 82.81% | 484.396us | 94.26% | 551.378us | 551.378us | 1.440us | 100.00% | 1.440us | 1.440us | 1 |
| cudaLaunchKernel | 11.45% | 66.982us | 11.45% | 66.982us | 66.982us | 0.000us | 0.00% | 0.000us | 0.000us | 1 |
| void at::native::vectorized_elementwise_kernel<4, at... | 0.00% | 0.000us | 0.00% | 0.000us | 0.000us | 1.440us | 100.00% | 1.440us | 1.440us | 1 |
| cudaDeviceSynchronize | 5.74% | 33.581us | 5.74% | 33.581us | 33.581us | 0.000us | 0.00% | 0.000us | 0.000us | 1 |

Rollback Plan:

Differential Revision: D76056511

Pull Request resolved: pytorch#155243
Approved by: https://github.com/ngimel
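As an illustration of the Test Plan, here is a minimal sketch of the kind of run being measured (this assumes a CUDA-capable build; the tensor size, synchronization point, and table options are illustrative and not part of the commit):

```python
import torch

# Profile a single in-place multiply on CUDA and inspect which events appear.
# Before this change, inductor's lazy init could add extra aten ops to the
# first profiling run; afterwards only the expected events remain.
x = torch.randn(4, device="cuda")
with torch.profiler.profile(with_stack=True) as p:
    x *= 2
    torch.cuda.synchronize()

print(p.key_averages().table(sort_by="cpu_time_total", row_limit=10))
```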
1 parent f1f49e5 commit abf4da0

File tree

2 files changed (+35, -20 lines)


test/profiler/test_profiler.py

Lines changed: 13 additions & 0 deletions
@@ -2033,6 +2033,19 @@ def test_user_annotation(self):
             else:
                 self.assertFalse(evt.is_user_annotation)
 
+    @unittest.skipIf(not torch.cuda.is_available(), "CUDA is required")
+    @skipIfTorchDynamo("profiler gets ignored if dynamo activated")
+    def test_basic_profile(self):
+        # test a really basic profile to make sure no erroneous aten ops are run
+        x = torch.randn(4, device="cuda")
+        with torch.profiler.profile(with_stack=True) as p:
+            x *= 2
+        names = [e.name for e in p.events()]
+        for name in names:
+            if name.startswith("aten") and name != "aten::mul_":
+                self.assertTrue(False, "Found unexpected event: " + name)
+        self.assertTrue("aten::mul_" in names)
+
     @unittest.skipIf(not torch.cuda.is_available(), "CUDA is required")
     @skipIfTorchDynamo("profiler gets ignored if dynamo activated")
     def test_dynamic_toggle(self):
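On builds that predate this change, a similar effect can be approximated in user code by forcing the inductor import before tracing starts, mirroring what prepare_trace now does (a sketch under that assumption; the workload and event check follow the test above):

```python
import torch

# Trigger inductor's import (and therefore its lazy init) before tracing
# starts, so its setup ops do not land in the profile. This mirrors the
# hasattr check that prepare_trace now performs.
if hasattr(torch, "_inductor"):
    import torch._inductor.config  # noqa: F401

x = torch.randn(4, device="cuda")
with torch.profiler.profile(with_stack=True) as p:
    x *= 2

names = [e.name for e in p.events()]
assert "aten::mul_" in names
assert not any(n.startswith("aten") and n != "aten::mul_" for n in names)
```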

torch/profiler/profiler.py

Lines changed: 22 additions & 20 deletions
@@ -157,6 +157,7 @@ def __init__(
         self.acc_events = acc_events
         self.custom_trace_id_callback = custom_trace_id_callback
         self.profiler: Optional[prof.profile] = None
+        self.has_cudagraphs = False
         self.mem_tl: Optional[MemoryProfileTimeline] = None
         self.use_device = None
         if ProfilerActivity.CUDA in self.activities:
@@ -181,6 +182,10 @@ def stop(self):
         self.stop_trace()
 
     def prepare_trace(self):
+        if hasattr(torch, "_inductor"):
+            import torch._inductor.config as inductor_config
+
+            self.has_cudagraphs = inductor_config.triton.cudagraphs
         if (self.profiler is None) or (not self.acc_events):
             self.profiler = prof.profile(
                 use_cpu=(ProfilerActivity.CPU in self.activities),
@@ -221,26 +226,23 @@ def start_trace(self):
                 "distributedInfo", json.dumps(dist_info, cls=_NumpyEncoder)
             )
 
-        if hasattr(torch, "_inductor"):
-            import torch._inductor.config as inductor_config
-
-            cuda_version = None
-            if hasattr(torch, "version"):
-                from torch.torch_version import TorchVersion
-
-                cuda_version = TorchVersion(getattr(torch.version, "cuda", "0.0"))
-
-            if inductor_config.triton.cudagraphs and (
-                (cuda_version and cuda_version < "12.6")
-                or not profiler_allow_cudagraph_cupti_lazy_reinit_cuda12()
-            ):
-                os.environ["DISABLE_CUPTI_LAZY_REINIT"] = "1"
-                self.add_metadata_json("DISABLE_CUPTI_LAZY_REINIT", "1")
-                # FIXME: CUDA Graph does not work well with CUPTI teardown.
-                #   1) crashes on 1st lazy CUPTI re-init after teardown (CUDA 11)
-                #   2) crashes on 2nd non-lazy CUPTI re-init after teardown (CUDA 12)
-                # Workaround: turn off CUPTI teardown when using CUDA Graphs.
-                os.environ["TEARDOWN_CUPTI"] = "0"
+        cuda_version = None
+        if hasattr(torch, "version"):
+            from torch.torch_version import TorchVersion
+
+            cuda_version = TorchVersion(getattr(torch.version, "cuda", "0.0"))
+
+        if self.has_cudagraphs and (
+            (cuda_version and cuda_version < "12.6")
+            or not profiler_allow_cudagraph_cupti_lazy_reinit_cuda12()
+        ):
+            os.environ["DISABLE_CUPTI_LAZY_REINIT"] = "1"
+            self.add_metadata_json("DISABLE_CUPTI_LAZY_REINIT", "1")
+            # FIXME: CUDA Graph does not work well with CUPTI teardown.
+            #   1) crashes on 1st lazy CUPTI re-init after teardown (CUDA 11)
+            #   2) crashes on 2nd non-lazy CUPTI re-init after teardown (CUDA 12)
+            # Workaround: turn off CUPTI teardown when using CUDA Graphs.
+            os.environ["TEARDOWN_CUPTI"] = "0"
 
         # Insert the preset user metadata to the trace
         for k, v in self.preset_metadata.items():
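For reference, the gating that remains in start_trace can be sketched in isolation. Here has_cudagraphs stands for the flag cached by prepare_trace, and allow_lazy_reinit stands in for profiler_allow_cudagraph_cupti_lazy_reinit_cuda12(); the free function below is purely illustrative, not the actual method:

```python
import os

import torch
from torch.torch_version import TorchVersion


def maybe_disable_cupti_teardown(has_cudagraphs: bool, allow_lazy_reinit: bool) -> None:
    # Sketch of the logic left in start_trace(): the cudagraphs flag was
    # captured earlier (in prepare_trace), so only the CUPTI workaround
    # remains here.
    cuda_version = None
    if hasattr(torch, "version"):
        # Fall back to "0.0" when torch.version.cuda is None (CPU-only build);
        # the real code passes the raw value through.
        cuda_version = TorchVersion(getattr(torch.version, "cuda", "0.0") or "0.0")

    if has_cudagraphs and (
        (cuda_version and cuda_version < "12.6") or not allow_lazy_reinit
    ):
        # CUDA Graphs do not work well with CUPTI teardown; keep CUPTI alive.
        os.environ["DISABLE_CUPTI_LAZY_REINIT"] = "1"
        os.environ["TEARDOWN_CUPTI"] = "0"
```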
