@@ -134,4 +134,45 @@ TEST_F(LayoutOpTest, ManualKernel) {
134134 t2));
135135}
136136
137+ TEST_F (LayoutOpTest, SchedulerKernel) {
138+ auto fusion_ptr = std::make_unique<Fusion>();
139+ Fusion& fusion = *fusion_ptr.get ();
140+ FusionGuard fg (&fusion);
141+
142+ auto inp = makeSymbolicTensor (2 );
143+ auto offsets = makeSymbolicTensor (1 , DataType::Int32);
144+ auto rounded_offsets = makeSymbolicTensor (1 , DataType::Int32);
145+ fusion.addInput (inp);
146+ fusion.addInput (offsets);
147+ fusion.addInput (rounded_offsets);
148+
149+ auto inp_tv = set (inp);
150+ auto out_tv = preprocessGroupedMatmulInputSf (
151+ inp_tv, offsets, rounded_offsets, BlockScalingFactorLayout::Block128x4);
152+ // NOTE: output of preprocessGroupedMatmulInputSf needs to be on global
153+ // memory, because we do indexing on output inside the runtime function.
154+ fusion.addOutput (out_tv);
155+
156+ auto options = at::TensorOptions ().dtype (at::kFloat ).device (at::kCUDA , 0 );
157+ int m = 512 ;
158+ int k = 9 ; // note: padded column size would be 12
159+ auto t0 = at::randn ({m, k}, options);
160+ // tokens per group are [100, 150, 262] respectively, so each group would be
161+ // padded to multiple of 128. Hence the total output row span would cover a
162+ // length of 128 + 256 + 384 = 768.
163+ auto t1 = at::tensor ({0 , 100 , 250 , 512 }, options.dtype (at::kInt ));
164+ auto t2 = at::tensor ({0 , 128 , 384 , 768 }, options.dtype (at::kInt ));
165+
166+ // naive scheduling.
167+ FusionExecutorCache executor_cache (std::move (fusion_ptr));
168+ auto outputs = executor_cache.runFusionWithInputs ({t0, t1, t2});
169+
170+ ASSERT_TRUE (validateGroupedLayout (
171+ BlockScalingFactorLayout::Block128x4,
172+ outputs[0 ].as <at::Tensor>(),
173+ t0,
174+ t1,
175+ t2));
176+ }
177+
137178} // namespace nvfuser
0 commit comments