8 changes: 7 additions & 1 deletion csrc/scheduler/pointwise.cpp
@@ -295,7 +295,10 @@ std::unique_ptr<PointwiseParams> getPointwiseHeuristics(
});

int64_t max_dtype_size_bit_for_vectorization = 0;
// ugly WAR.
Collaborator:
Please give more details rather than just saying it's ugly. Why is it ugly? What should have been done instead?

Also, this variable is only used hundreds of lines below. For better code readability, it should be moved down there.

bool has_sub_byte = false;
for (auto inp : vectorizable_inputs_outputs_entry.get()) {
has_sub_byte |= dataTypeSizeBit(inp->getDataType().value()) < 8;
max_dtype_size_bit_for_vectorization = std::max(
max_dtype_size_bit_for_vectorization,
dataTypeSizeBit(inp->getDataType().value(), index_type));
@@ -484,8 +487,11 @@ std::unique_ptr<PointwiseParams> getPointwiseHeuristics(
}
}

// If we have sub-byte data types, we don't want to clamp the vectorization
// factor to 1; otherwise we could end up with an illegal array type of
// sub-byte length.
params->vectorization_factor = std::min(
- max_vect_factor,
+ has_sub_byte ? std::max(2l, max_vect_factor) : max_vect_factor,
@protonu (Collaborator), Oct 24, 2025:
Thanks - I need something like this as well. I thought I may need this to have a minimum of 4/8(bf16).

I guess these are all computed at compile time so there's no information about alignment yet. Can increasing the vectorization width (or having any vectorization at all) lead to incorrect behavior?

Collaborator (Author):
This shouldn't cause any more issues than would already have been there.

This only changes the heuristic's lower bound on max_vect_factor when fp4 is present; the actual alignment constraint is still applied on top of this by the outer std::min.
If we cannot vectorize at the proper width, the scheduler still won't do it, and codegen will fail later. (For fp4 types, we'll hit an assert in the runtime function.)
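
A minimal sketch of the clamping described above, in Python pseudocode rather than the actual C++ heuristic; pick_vectorization_factor and alignment_limited_factor are hypothetical names standing in for the logic in getPointwiseHeuristics and vectorize_helper::getVectorizationFactor:

# Sketch only; not nvFuser code.
def pick_vectorization_factor(
    max_vect_factor: int, has_sub_byte: bool, alignment_limited_factor: int
) -> int:
    # For a 4-bit dtype, a factor of 1 would imply a 4-bit array element,
    # which is not byte-addressable; a factor of 2 packs two fp4 values
    # into one byte.
    lower_bound = max(2, max_vect_factor) if has_sub_byte else max_vect_factor
    # Alignment still caps the result via the outer min.
    return min(lower_bound, alignment_limited_factor)

# Example: the size-based heuristic alone would give 1, alignment allows 4,
# so an fp4 input still ends up with a factor of at least 2.
assert pick_vectorization_factor(1, True, 4) == 2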

Collaborator:
Do you think we should assert on the params->vectorization_factor here in the case that we have a packed fp4 type and we aren't able to vectorize to 2 or more?

Collaborator (Author):
^ I would prefer not.

Generally speaking, erroring out earlier seems like a good idea, but vectorization should not be the answer.
It's really the inline that's failing us, i.e. we don't necessarily need the fp4 TV to be vectorized if we have correct inline. We really only need to have packed storage.

I don't know how to fix that properly yet, and I wanted it to work for now. That's why I was calling it a hacky patch. 😰

Collaborator:

Sorry, I don't understand what you meant by "the inline".

vectorize_helper::getVectorizationFactor(
runtime_info,
largest_out,
33 changes: 33 additions & 0 deletions tests/python/direct/test_narrow_precision.py
@@ -19,6 +19,7 @@
    linear_to_swizzled_128_4,
    round_up,
    activation_scale_to_nvfp4,
    to_fp4,
)

import pytest
@@ -274,3 +275,35 @@ def nvfuser_fusion_id0(fd: FusionDefinition) -> None:
    )

    assert torch.allclose(o_decomposed_ref, o[0], atol=1e-2, rtol=1e-2)


@pytest.mark.skipif(
is_pre_blackwell(), reason="Only supported on blackwell and newer devices."
)
@pytest.mark.skipif(
not microarchitecture_is_pre(12), reason="Does not support blackwell compute 12.0"
)
@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float])
def test_fp4_vectorization(
    nvfuser_direct_test,
    dtype,
):
    inputs = [
        torch.ones(4, 8, dtype=dtype, device="cuda"),
        torch.ones(4, dtype=dtype, device="cuda"),
    ]

    def nvfuser_fusion_id0(fd: FusionDefinition) -> None:
        T0 = fd.from_pytorch(inputs[0])
        T1 = fd.from_pytorch(inputs[1])
        T2 = fd.ops.cast(T0, DataType.Float)
        cast_T1 = fd.ops.cast(T1, DataType.Float)
        broadcast_T1 = fd.ops.broadcast(cast_T1, [False, True])
        T3 = fd.ops.div(T2, broadcast_T1)
        T4 = fd.ops.cast(T3, DataType.Float4_e2m1fn)
        T5 = fd.ops.reshape(T4, [32])
        fd.add_output(T5)

    o, _ = nvfuser_direct_test.exec_nvfuser(nvfuser_fusion_id0, inputs)

    ref_o = to_fp4(inputs[0].to(torch.float) / inputs[1].unsqueeze(-1)).reshape(-1)
Comment on lines +307 to +309
Collaborator:
Can somebody teach me how validation is done here? IIUC, exec_nvfuser just executes the fusion. There's ref_o. Shouldn't we compare o and ref_o?

Please don't use names like o. Single-letter names should be avoided except established common names like i for loop indices.

Collaborator:
And also, this is for vectorization, but I'm not seeing where the vectorization is validated.
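
As a reference for the question above, a hedged sketch of what an explicit check at the end of test_fp4_vectorization might look like, reusing the int8-view trick from direct_utils below and assuming to_fp4 returns a packed torch.float4_e2m1fn_x2 tensor like the fusion output:

# Sketch only; would be appended at the end of test_fp4_vectorization.
out_bits = o[0].view(torch.int8)  # reinterpret packed fp4 output as raw bytes
ref_bits = ref_o.view(torch.int8)
assert torch.equal(out_bits, ref_bits)

Whether the kernel actually vectorizes is a separate question that such a value comparison does not cover.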

10 changes: 8 additions & 2 deletions tests/python/direct_utils/utils.py
@@ -54,9 +54,15 @@ def check_captured_python_definition(reference_outputs, fd, inputs, device=None)
    # torch.allclose does not work with fp8 datatype, so cast to fp64.
Collaborator:
Might be useful to add to the comment here about the packed fp4.

    # However, casting complex values to real discards the imaginary
    # part, so skip complex dtypes.
-   if not ref_out.dtype.is_complex:
    # Similarly, the packed fp4 dtype cannot be compared directly either, so we
    # view it as int8 and compare the raw bytes.
    if ref_out.dtype == torch.float4_e2m1fn_x2:
Collaborator (Author):
@rdspring1 probably not the cleanest thing, just a naive python patch

        ref_out = ref_out.view(torch.int8)
    elif not ref_out.dtype.is_complex:
        ref_out = ref_out.to(torch.float64)
-   if not cap_out.dtype.is_complex:
    if cap_out.dtype == torch.float4_e2m1fn_x2:
        cap_out = cap_out.view(torch.int8)
    elif not cap_out.dtype.is_complex:
        cap_out = cap_out.to(torch.float64)
    if not torch.allclose(ref_out, cap_out, equal_nan=True):
        return False
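
For illustration, a small hedged example of why the int8 view is used for packed fp4 tensors, assuming a PyTorch build that exposes torch.float4_e2m1fn_x2; the byte values are arbitrary:

import torch

raw = torch.tensor([0x12, 0x34, 0x56, 0x78], dtype=torch.int8)
packed = raw.view(torch.float4_e2m1fn_x2)  # each byte holds two fp4 values
# torch.allclose does not work with fp4 (or fp8) dtypes, so compare the raw
# bytes after viewing back as int8.
assert torch.equal(packed.view(torch.int8), raw)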