
Commit c4e65d3
review comment
1 parent daa7c9d

11 files changed: +110 additions, -98 deletions

csrc/device_lower/utils.cpp

Lines changed: 1 addition & 1 deletion

@@ -125,7 +125,7 @@ bool isTvOp(const Expr* expr) {
           SliceOp,
           CatOp,
           ScanOp,
-          GroupedBlockScalingFactorLayoutOp,
+          PreprocessGroupedMatmulInputSf,
           kir::AllocTMem,
           kir::GridReduction,
           kir::GroupedGridReduction,

csrc/dispatch.h

Lines changed: 54 additions & 54 deletions

@@ -68,60 +68,60 @@ class Val;
 #define DISPATCH_FOR_ALL_KIR_VALS(f) f(Predicate) f(TensorIndex)
 #define DISPATCH_FOR_ALL_HIR_VALS(f) f(Stream)
 
-#define DISPATCH_FOR_ALL_EXPRS(f) \
-  f(FullOp); \
-  f(IotaOp); \
-  f(EyeOp); \
-  f(UnaryOp); \
-  f(BinaryOp); \
-  f(TernaryOp); \
-  f(ArrayConstruct); \
-  f(StructConstruct); \
-  f(GetAttr); \
-  f(GetItem); \
-  f(ReverseArray); \
-  f(GetMetaData); \
-  f(TensorConstruct); \
-  f(SelectOp); \
-  f(IndexSelectOp); \
-  f(IndexPutAccumulateOp); \
-  f(GatherOp); \
-  f(ScatterOp); \
-  f(RNGOp); \
-  f(ReductionOp); \
-  f(GroupedReductionOp); \
-  f(WelfordOp); \
-  f(GroupedWelfordOp); \
-  f(LoadStoreOp); \
-  f(MmaOp); \
-  f(BroadcastOp); \
-  f(SqueezeOp); \
-  f(ExpandOp); \
-  f(RepeatOp); \
-  f(ViewAsScalar); \
-  f(ReshapeOp); \
-  f(CatOp); \
-  f(PadOp); \
-  f(SliceOp); \
-  f(Split); \
-  f(ArgsortOp); \
-  f(GroupedMmaOp); \
-  f(ScaledMmaOp); \
-  f(CutlassNvfp4GroupedMmaOp); \
-  f(GroupedBlockScalingFactorLayoutOp); \
-  f(TopKOp); \
-  f(ScanOp); \
-  f(Merge); \
-  f(Swizzle); \
-  f(Swizzle2D); \
-  f(Resize); \
-  f(MatmulOp); \
-  f(LinearOp); \
-  f(SdpaFwdOp); \
-  f(SdpaBwdOp); \
-  f(EmbeddingFwdOp); \
-  f(Communication); \
-  f(ForLoop); \
+#define DISPATCH_FOR_ALL_EXPRS(f) \
+  f(FullOp); \
+  f(IotaOp); \
+  f(EyeOp); \
+  f(UnaryOp); \
+  f(BinaryOp); \
+  f(TernaryOp); \
+  f(ArrayConstruct); \
+  f(StructConstruct); \
+  f(GetAttr); \
+  f(GetItem); \
+  f(ReverseArray); \
+  f(GetMetaData); \
+  f(TensorConstruct); \
+  f(SelectOp); \
+  f(IndexSelectOp); \
+  f(IndexPutAccumulateOp); \
+  f(GatherOp); \
+  f(ScatterOp); \
+  f(RNGOp); \
+  f(ReductionOp); \
+  f(GroupedReductionOp); \
+  f(WelfordOp); \
+  f(GroupedWelfordOp); \
+  f(LoadStoreOp); \
+  f(MmaOp); \
+  f(BroadcastOp); \
+  f(SqueezeOp); \
+  f(ExpandOp); \
+  f(RepeatOp); \
+  f(ViewAsScalar); \
+  f(ReshapeOp); \
+  f(CatOp); \
+  f(PadOp); \
+  f(SliceOp); \
+  f(Split); \
+  f(ArgsortOp); \
+  f(GroupedMmaOp); \
+  f(ScaledMmaOp); \
+  f(CutlassNvfp4GroupedMmaOp); \
+  f(PreprocessGroupedMatmulInputSf); \
+  f(TopKOp); \
+  f(ScanOp); \
+  f(Merge); \
+  f(Swizzle); \
+  f(Swizzle2D); \
+  f(Resize); \
+  f(MatmulOp); \
+  f(LinearOp); \
+  f(SdpaFwdOp); \
+  f(SdpaBwdOp); \
+  f(EmbeddingFwdOp); \
+  f(Communication); \
+  f(ForLoop); \
   f(P2PCommunication);
 #define DISPATCH_FOR_ALL_KIR_EXPRS(f) \
   f(Allocate); \
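Context for the hunk above: DISPATCH_FOR_ALL_EXPRS is an X-macro list, so renaming an IR node only requires touching its single f(...) entry, and every dispatcher stamped out from the list picks up the new type. A minimal sketch of the pattern with hypothetical node names (not the actual nvFuser dispatch machinery):

#include <iostream>
#include <string>

// A tiny node list in the same style as DISPATCH_FOR_ALL_EXPRS (hypothetical types).
#define FOR_ALL_NODES(f) \
  f(FullOp);             \
  f(IotaOp);             \
  f(UnaryOp);

// Stamp out one handler function per listed node type.
#define DEFINE_HANDLER(T)                       \
  void handle_##T(const std::string& payload) { \
    std::cout << #T << ": " << payload << "\n"; \
  }

FOR_ALL_NODES(DEFINE_HANDLER)

int main() {
  handle_FullOp("fill");
  handle_IotaOp("arange");
  handle_UnaryOp("neg");
  return 0;
}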

csrc/ir/internal_nodes.h

Lines changed: 20 additions & 12 deletions

@@ -3442,38 +3442,46 @@ class CutlassNvfp4GroupedMmaOp : public Expr {
   }
 };
 
-//! NOTE -- [ GroupedBlockScalingFactorLayoutOp ]
+//! NOTE -- [ PreprocessGroupedMatmulInputSf ]
 //!
 //! This operation performs a layout change on the input, it's currently used
 //! for block scaling factor accompanying narrow precision inputs.
 //!
-//! 1. This can be viewed as a point-wise operation, where output loop domain
-//! would match the input logical domain.
+//! PreprocessGroupedMatmulInputSf(TensorView* output, TensorView* input, ...)
+//!
+//! input:  logical domain: (i0, i1)
+//! output: root domain:    (i0, i1)
+//!         logical domain: (i2, i3)
+//!         loop domain:    (i0, i1)
+//!
+//! 1. This can be viewed as a point-wise operation, since output loop domain
+//! matches the input logical domain.
+//!
 //! 2. Because of the potential padding/swizzle, the logical domain of the
 //! output does not map to input. We don't rely on codegen for indexing, so we
 //! don't care about mapping the logical/allocation of output to anything else.
-//! Indexing will be done in runtime function, utilizing `expert_offsets` and
-//! `sf_offsets`.
-//! 3. Output has a root domain, which is identical to its loop domain. We add
-//! this so we can map it to input.
-class GroupedBlockScalingFactorLayoutOp : public Expr {
+//! Indexing will be done in runtime function, utilizing `input_offsets` and
+//! `output_offsets`.
+//!
+//! 3. Output has a root domain that matches the logical domain of the input.
+class PreprocessGroupedMatmulInputSf : public Expr {
  public:
   using Expr::Expr;
 
-  GroupedBlockScalingFactorLayoutOp(
+  PreprocessGroupedMatmulInputSf(
       IrBuilderPasskey,
       Val* output,
       Val* input,
-      Val* expert_offsets,
-      Val* sf_offsets,
+      Val* input_offsets,
+      Val* output_offsets,
       BlockScalingFactorLayout layout,
       Val* k,
       Val* g);
 
   NVFUSER_DECLARE_CLONE_AND_CREATE
 
   const char* getOpString() const override {
-    return "GroupedBlockScalingFactorLayoutOp";
+    return "PreprocessGroupedMatmulInputSf";
   }
 
   std::string toString(int indent_size = 0) const override;

csrc/ir/nodes.cpp

Lines changed: 8 additions & 9 deletions

@@ -6555,7 +6555,7 @@ std::vector<PolymorphicValue> CutlassNvfp4GroupedMmaOp::evaluate(
 
 NVFUSER_DEFINE_CLONE_AND_CREATE(CutlassNvfp4GroupedMmaOp)
 
-GroupedBlockScalingFactorLayoutOp::GroupedBlockScalingFactorLayoutOp(
+PreprocessGroupedMatmulInputSf::PreprocessGroupedMatmulInputSf(
     IrBuilderPasskey passkey,
     Val* output,
     Val* input,
@@ -6574,11 +6574,11 @@ GroupedBlockScalingFactorLayoutOp::GroupedBlockScalingFactorLayoutOp(
   addDataAttribute(layout);
 }
 
-std::string GroupedBlockScalingFactorLayoutOp::toString(int indent_size) const {
+std::string PreprocessGroupedMatmulInputSf::toString(int indent_size) const {
   std::stringstream ss;
   indent(ss, indent_size) << output(0)->toString() << "\n";
   indent_size++;
-  indent(ss, indent_size) << " = grouped_block_scaling_factor_layout(\n";
+  indent(ss, indent_size) << " = preprocessGroupedMatmulInputSf(\n";
   indent_size++;
   indent(ss, indent_size) << "input = " << in()->toString() << ",\n";
   indent(ss, indent_size) << "expert_offsets = " << expertOffsets()->toString()
@@ -6595,19 +6595,18 @@ std::string GroupedBlockScalingFactorLayoutOp::toString(int indent_size) const {
   return ss.str();
 }
 
-std::string GroupedBlockScalingFactorLayoutOp::toInlineString(
+std::string PreprocessGroupedMatmulInputSf::toInlineString(
     int indent_size) const {
-  NVF_CHECK(
-      false, "GroupedBlockScalingFactorLayoutOp can not be printed inline");
+  NVF_CHECK(false, "PreprocessGroupedMatmulInputSf can not be printed inline");
 }
 
-std::vector<PolymorphicValue> GroupedBlockScalingFactorLayoutOp::evaluate(
+std::vector<PolymorphicValue> PreprocessGroupedMatmulInputSf::evaluate(
     const ExpressionEvaluator& ee,
     const std::vector<PolymorphicValue>& inputs) const {
   // This is a placeholder, currently we don't have a fallback kernel available
-  NVF_THROW("GroupedBlockScalingFactorLayoutOp evaluation not yet implemented");
+  NVF_THROW("PreprocessGroupedMatmulInputSf evaluation not yet implemented");
 }
 
-NVFUSER_DEFINE_CLONE_AND_CREATE(GroupedBlockScalingFactorLayoutOp)
+NVFUSER_DEFINE_CLONE_AND_CREATE(PreprocessGroupedMatmulInputSf)
 
 } // namespace nvfuser

csrc/logical_domain_map.cpp

Lines changed: 1 addition & 1 deletion

@@ -133,7 +133,7 @@ std::pair<std::unordered_set<IterDomain*>, bool> getNonMappingDomainInfo(
     }
   } else if (
       auto* grouped_block_sf_layout =
-          dynamic_cast<GroupedBlockScalingFactorLayoutOp*>(
+          dynamic_cast<PreprocessGroupedMatmulInputSf*>(
              consumer_tv->definition())) {
     if (producer_tv != grouped_block_sf_layout->in()) {
       auto producer_logical =

csrc/ops/indexing.cpp

Lines changed: 14 additions & 14 deletions

@@ -292,10 +292,10 @@ TensorView* takeAlongAxis(TensorView* inp, TensorView* index, int64_t dim) {
   return out_tensor->as<TensorView>();
 }
 
-TensorView* groupedBlockSfLayout(
+TensorView* preprocessGroupedMatmulInputSf(
     TensorView* input,
-    TensorView* expert_offsets,
-    TensorView* sf_offsets,
+    TensorView* input_offsets,
+    TensorView* output_offsets,
     BlockScalingFactorLayout layout) {
   // only support input matrix;
   auto input_logical_dom =
@@ -312,9 +312,6 @@
   });
 
   // Create the logical domain of output.
-  // Note: output logical domain handles potential padding required for the
-  // layout. Since the actual padding size is data-dependent, we allocate for
-  // the maximum padding (reflected on logical/allocation domain).
   std::vector<IterDomain*> out_logical;
   out_logical.reserve(input_logical_dom.size());
 
@@ -325,11 +322,15 @@
 
   auto* one_val = input->fusion()->oneVal(DataType::Index);
   std::vector<IterDomain*> offset_logical_dom =
-      TensorDomain::noReductions(expert_offsets->getLogicalDomain());
+      TensorDomain::noReductions(input_offsets->getLogicalDomain());
   Val* num_groups =
       SimplifyingIrBuilder::subExpr(offset_logical_dom[0]->extent(), one_val);
-  // padded row size:
-  //   num_groups * (row_multiple - 1) + row_size
+
+  // Note: output logical domain handles potential padding required for the
+  // layout. Since the actual padding size is data-dependent, we allocate for
+  // the maximum padding (reflected on logical/allocation domain).
+
+  // pad row size: num_groups * (row_multiple - 1) + row_size
   auto pad_to_max_extent = [&](IterDomain* id, int multiple) -> IterDomain* {
     auto* maximum_pad_value_per_group =
         IrBuilder::create<Val>(multiple - 1, DataType::Index);
@@ -340,8 +341,7 @@
   };
   out_logical.push_back(pad_to_max_extent(out_root[0], row_multiple));
 
-  // padded col size:
-  //   (col_size + col_multiple - 1) / col_multiple * col_multiple
+  // pad col size: (col_size + col_multiple - 1) / col_multiple * col_multiple
   auto pad_to_multiple = [&](IterDomain* id, int multiple) -> IterDomain* {
     Val* ext = id->extent();
     auto* multiple_val = IrBuilder::create<Val>(multiple, DataType::Index);
@@ -370,11 +370,11 @@
           /*skip_checks=*/true),
       input->getDataType().value());
 
-  IrBuilder::create<GroupedBlockScalingFactorLayoutOp>(
+  IrBuilder::create<PreprocessGroupedMatmulInputSf>(
       out_tv,
       input,
-      expert_offsets,
-      sf_offsets,
+      input_offsets,
+      output_offsets,
      layout,
      input_logical_dom[1]->getMaybeExpandedExtent(),
      num_groups);
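To make the two padding rules in the hunk above concrete, here is a small standalone sketch that evaluates them for one made-up case. The multiples are illustrative assumptions (row_multiple = 128 and col_multiple = 4, suggested by the Block128x4 name); the actual values come from the layout handling in the real code.

#include <cstdint>
#include <iostream>

int main() {
  // Hypothetical problem: 3 groups, 1000 scale-factor rows in total, 18 columns.
  const int64_t num_groups = 3;
  const int64_t row_size = 1000;
  const int64_t col_size = 18;

  // Assumed multiples for Block128x4 (illustration only).
  const int64_t row_multiple = 128;
  const int64_t col_multiple = 4;

  // pad row size: num_groups * (row_multiple - 1) + row_size
  // Every group may independently need up to (row_multiple - 1) rows of
  // padding, so the output allocates for the worst case over all groups.
  const int64_t padded_rows = num_groups * (row_multiple - 1) + row_size;

  // pad col size: (col_size + col_multiple - 1) / col_multiple * col_multiple
  // Plain round-up of the column count to the next multiple of col_multiple.
  const int64_t padded_cols =
      (col_size + col_multiple - 1) / col_multiple * col_multiple;

  std::cout << "padded rows: " << padded_rows << "\n"; // 3 * 127 + 1000 = 1381
  std::cout << "padded cols: " << padded_cols << "\n"; // 18 rounds up to 20
  return 0;
}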

csrc/ops/indexing.h

Lines changed: 6 additions & 3 deletions

@@ -83,10 +83,13 @@ NVF_API TensorView* takeAlongAxis(
     TensorView* index,
     int64_t dim);
 
-NVF_API TensorView* groupedBlockSfLayout(
+//! Changes the layout of input to satisfy the requirement of grouped matmul on
+//! block scaling factor. see:
+//! https://docs.nvidia.com/cutlass/media/docs/cpp/blackwell_functionality.html#scale-factor-layouts
+NVF_API TensorView* preprocessGroupedMatmulInputSf(
     TensorView* input,
-    TensorView* expert_offsets,
-    TensorView* sf_offsets,
+    TensorView* input_offsets,
+    TensorView* output_offsets,
     BlockScalingFactorLayout layout);
 
 } // namespace nvfuser
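Since this header declares the public entry point, here is a hedged sketch of how a caller might use it when building a fusion. The surrounding setup (Fusion, FusionGuard, TensorViewBuilder, the chosen data types, and the required headers) is assumed from typical nvFuser front-end code rather than taken from this commit.

// Sketch only; assumes the usual nvFuser headers (e.g. fusion.h, ops/all_ops.h)
// and namespace nvfuser. Dtypes for the scaling factor and offsets are guesses.
Fusion fusion;
FusionGuard fg(&fusion);

// 2D block scaling factor accompanying a grouped-matmul operand.
TensorView* sf_in = TensorViewBuilder().ndims(2).dtype(DataType::Float).build();
// Per-group offsets into the input rows and into the (padded) output rows.
TensorView* input_offsets =
    TensorViewBuilder().ndims(1).dtype(DataType::Int32).build();
TensorView* output_offsets =
    TensorViewBuilder().ndims(1).dtype(DataType::Int32).build();

fusion.addInput(sf_in);
fusion.addInput(input_offsets);
fusion.addInput(output_offsets);

// Reorder/pad the scaling factor into the layout the grouped matmul expects.
TensorView* sf_out = preprocessGroupedMatmulInputSf(
    sf_in, input_offsets, output_offsets, BlockScalingFactorLayout::Block128x4);
fusion.addOutput(sf_out);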

csrc/scheduler/registry.cpp

Lines changed: 1 addition & 1 deletion

@@ -50,7 +50,7 @@ bool checkCanSchedule(Fusion* fusion, SchedulerType scheduler_type) {
           ScaledMmaOp,
           CutlassNvfp4GroupedMmaOp,
           // TODO: remove this once we have a scheduler for it
-          GroupedBlockScalingFactorLayoutOp,
+          PreprocessGroupedMatmulInputSf,
           TopKOp,
           ScanOp>(fusion)) {
     scheduler_debug_utils::canScheduleRejectReason(

csrc/type.cpp

Lines changed: 1 addition & 2 deletions

@@ -338,9 +338,8 @@ const char* block_sf_layout2string(BlockScalingFactorLayout t) {
   switch (t) {
     case BlockScalingFactorLayout::Block128x4:
       return "block_128_4";
-    default:
-      NVF_THROW("No string found for layout.");
   }
+  std::unreachable();
 }
 
 const char* predicate_type2string(PredicateType t) {
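On the switch change above: removing the default: branch keeps the switch exhaustive over the enum, so compilers can warn (e.g. -Wswitch) when a new BlockScalingFactorLayout value is added without a matching case, while std::unreachable() (C++23, from <utility>) still tells the optimizer that falling out of the switch cannot happen. A generic sketch of the pattern with a hypothetical enum:

#include <utility> // std::unreachable, C++23

// Hypothetical enum for illustration.
enum class Color { Red, Green };

const char* to_string(Color c) {
  switch (c) {
    case Color::Red:
      return "red";
    case Color::Green:
      return "green";
  }
  // All enumerators are handled above; adding a new Color without a case
  // produces a compiler warning instead of silently hitting a default branch.
  std::unreachable();
}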

csrc/type.h

Lines changed: 3 additions & 0 deletions

@@ -1176,6 +1176,9 @@ std::ostream& operator<<(std::ostream&, TMemRegisterDataPath);
 
 std::ostream& operator<<(std::ostream&, cudaDriverEntryPointQueryResult);
 
+// Layout for block scaling factor used by mx-format with narrow precision, this
+// indicates how to index into block scaling factor. see:
+// https://docs.nvidia.com/cutlass/media/docs/cpp/blackwell_functionality.html#scale-factor-layouts
 enum class BlockScalingFactorLayout {
   Block128x4,
 };
