
Commit 856f291

PR4: Adding automatic scheduler
Squashed WIP history: added test case; prevent cacheAndForkOutputs and disable cacheInputs for the offsets TVs; change domain handling in the reference TV; revert unused changes (WIP: something isn't working right yet).
1 parent f79ec33 · commit 856f291

File tree: 5 files changed (+56, −3 lines)


csrc/ir/utils.cpp

Lines changed: 6 additions & 0 deletions
@@ -777,6 +777,12 @@ bool isIndexSelectLookupTv(const TensorView* tv) {
         return true;
       }
     }
+    if (expr->isA<PreprocessGroupedMatmulInputSf>()) {
+      auto layout = expr->as<PreprocessGroupedMatmulInputSf>();
+      if (tv == layout->inputOffsets() || tv == layout->outputOffsets()) {
+        return true;
+      }
+    }
   }
   return false;
 }

csrc/scheduler/registry.cpp

Lines changed: 3 additions & 2 deletions
@@ -49,15 +49,15 @@ bool checkCanSchedule(Fusion* fusion, SchedulerType scheduler_type) {
           GroupedMmaOp,
           ScaledMmaOp,
           CutlassNvfp4GroupedMmaOp,
-          // TODO: remove this once we have a scheduler for it
-          PreprocessGroupedMatmulInputSf,
           TopKOp,
           ScanOp>(fusion)) {
     scheduler_debug_utils::canScheduleRejectReason(
         scheduler_type, "Has unsupported ops");
     return false;
   }
 
+  // TODO: check PreprocessGroupedMatmulInputSf's output is in global memory / fusion output
+
   // Fusions with `MatmulOp, LinearOp, MmaOp` can only be accepted by Matmul
   // scheduler.
   if (scheduler_type != SchedulerType::Matmul &&
@@ -72,6 +72,7 @@ bool checkCanSchedule(Fusion* fusion, SchedulerType scheduler_type) {
         scheduler_type, "Connected fusion graph check failed!");
     return false;
   }
+
   if (IterDomainGraph(fusion, /*allow_self_mapping=*/true).hasSelfMapping()) {
     scheduler_debug_utils::canScheduleRejectReason(
         scheduler_type, "Iter domain graph check failed!");

csrc/scheduler/tools/domain_map.cpp

Lines changed: 4 additions & 0 deletions
@@ -58,6 +58,10 @@ bool canIgnoreIndexedInputDomainID(
              ->isBroadcast()) {
         return false;
       }
+    } else if (auto layout = dynamic_cast<PreprocessGroupedMatmulInputSf*>(use)) {
+      if (input_tv == layout->inputOffsets() || input_tv == layout->outputOffsets()) {
+        continue;
+      }
     } else {
       // If the input TV is used by any other ops
       return false;

csrc/scheduler/utils.cpp

Lines changed: 2 additions & 1 deletion
@@ -1341,7 +1341,8 @@ std::vector<std::pair<TensorView*, TensorView*>> cacheAndForkOutputs(
     if (output->definition() == nullptr ||
         // the output of ScatterOp must on the global memory due to the random
         // or atomic access.
-        output->definition()->isA<ScatterOp>()) {
+        output->definition()->isA<ScatterOp>() ||
+        output->definition()->isA<PreprocessGroupedMatmulInputSf>()) {
       continue;
     }
     if (!output->uses().empty()) {

tests/cpp/test_layout_op.cpp

Lines changed: 41 additions & 0 deletions
@@ -134,4 +134,45 @@ TEST_F(LayoutOpTest, ManualKernel) {
       t2));
 }
 
+TEST_F(LayoutOpTest, SchedulerKernel) {
+  auto fusion_ptr = std::make_unique<Fusion>();
+  Fusion& fusion = *fusion_ptr.get();
+  FusionGuard fg(&fusion);
+
+  auto inp = makeSymbolicTensor(2);
+  auto offsets = makeSymbolicTensor(1, DataType::Int32);
+  auto rounded_offsets = makeSymbolicTensor(1, DataType::Int32);
+  fusion.addInput(inp);
+  fusion.addInput(offsets);
+  fusion.addInput(rounded_offsets);
+
+  auto inp_tv = set(inp);
+  auto out_tv = preprocessGroupedMatmulInputSf(
+      inp_tv, offsets, rounded_offsets, BlockScalingFactorLayout::Block128x4);
+  // NOTE: output of preprocessGroupedMatmulInputSf needs to be on global
+  // memory, because we do indexing on output inside the runtime function.
+  fusion.addOutput(out_tv);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  int m = 512;
+  int k = 9; // note: padded column size would be 12
+  auto t0 = at::randn({m, k}, options);
+  // tokens per group are [100, 150, 262] respectively, so each group would be
+  // padded to multiple of 128. Hence the total output row span would cover a
+  // length of 128 + 256 + 384 = 768.
+  auto t1 = at::tensor({0, 100, 250, 512}, options.dtype(at::kInt));
+  auto t2 = at::tensor({0, 128, 384, 768}, options.dtype(at::kInt));
+
+  // naive scheduling.
+  FusionExecutorCache executor_cache(std::move(fusion_ptr));
+  auto outputs = executor_cache.runFusionWithInputs({t0, t1, t2});
+
+  ASSERT_TRUE(validateGroupedLayout(
+      BlockScalingFactorLayout::Block128x4,
+      outputs[0].as<at::Tensor>(),
+      t0,
+      t1,
+      t2));
+}
+
 } // namespace nvfuser
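
Worked check of the expected offsets in the test above, as a standalone sketch (not part of the commit; roundedOffsets is a hypothetical helper): for the Block128x4 layout each group's row count is rounded up to a multiple of 128, which turns the input offsets {0, 100, 250, 512} into the output offsets {0, 128, 384, 768}.

// Standalone sketch (hypothetical helper, not in the PR): derive the padded
// output offsets used as t2 from the input group offsets used as t1.
#include <cstdint>
#include <vector>

std::vector<int32_t> roundedOffsets(
    const std::vector<int32_t>& offsets,
    int32_t row_multiple = 128) {
  std::vector<int32_t> rounded = {0};
  for (size_t g = 1; g < offsets.size(); ++g) {
    // Group sizes for {0, 100, 250, 512} are 100, 150, 262 ...
    int32_t group_rows = offsets[g] - offsets[g - 1];
    // ... each padded up to a multiple of 128: 128, 256, 384.
    int32_t padded =
        (group_rows + row_multiple - 1) / row_multiple * row_multiple;
    rounded.push_back(rounded.back() + padded);
  }
  return rounded;
}

// roundedOffsets({0, 100, 250, 512}) == {0, 128, 384, 768}, matching t2.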
