Skip to content

Commit e207ba6

Browse files
committed
complete patch
1 parent 36230f4 commit e207ba6

File tree

9 files changed

+343
-140
lines changed

9 files changed

+343
-140
lines changed

csrc/dispatch_utils.h

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,22 @@
3333
throw std::invalid_argument(err_msg.str()); \
3434
}
3535

// Dispatch a runtime head dimension to a compile-time constant for the
// QK (query/key) path. Inside the `...` body, HEAD_DIM is available as a
// `constexpr int`. Supported sizes: 64, 128, 192 — the 192 case is the
// addition for the *_dsk_* kernels (presumably DeepSeek-style heads where
// the RoPE part is concatenated onto the base head dim; confirm against
// the kernel implementations). Any other value throws
// std::invalid_argument, matching the sibling DISPATCH_* macros in this
// header (see the surrounding definitions, e.g. DISPATCH_CAUSAL).
#define DISPATCH_HEAD_DIM_QK(head_dim, HEAD_DIM, ...)      \
  if (head_dim == 64) {                                    \
    constexpr int HEAD_DIM = 64;                           \
    __VA_ARGS__                                            \
  } else if (head_dim == 128) {                            \
    constexpr int HEAD_DIM = 128;                          \
    __VA_ARGS__                                            \
  } else if (head_dim == 192) {                            \
    constexpr int HEAD_DIM = 192;                          \
    __VA_ARGS__                                            \
  } else {                                                 \
    std::ostringstream err_msg;                            \
    err_msg << "Unsupported head dim: " << int(head_dim);  \
    throw std::invalid_argument(err_msg.str());            \
  }
3652
#define DISPATCH_CAUSAL(is_causal, IS_CAUSAL, ...) \
3753
if (is_causal == 1) { \
3854
constexpr bool IS_CAUSAL = true; \

csrc/fused/fused.cu

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -652,7 +652,7 @@ void quant_per_block_int8_fuse_sub_mean_cuda(
652652

653653
DISPATCH_PYTORCH_DTYPE_TO_CTYPE_FP16(input_dtype, c_type, {
654654
DISPATCH_BLOCK_SIZE(block_size, BLOCK_SIZE, {
655-
DISPATCH_HEAD_DIM(head_dim, HEAD_DIM, {
655+
DISPATCH_HEAD_DIM_QK(head_dim, HEAD_DIM, {
656656

657657
CHECK_SHAPE(mean, batch_size, num_heads, head_dim);
658658
CHECK_SHAPE(output, input.size(0), input.size(1), input.size(2), input.size(3));
@@ -738,7 +738,7 @@ void quant_per_warp_int8_cuda(
738738
DISPATCH_PYTORCH_DTYPE_TO_CTYPE_FP16(input_dtype, c_type, {
739739
DISPATCH_BLOCK_SIZE(block_size, BLOCK_SIZE, {
740740
DISPATCH_WARP_BLOCK_SIZE(warp_block_size, WARP_BLOCK_SIZE, {
741-
DISPATCH_HEAD_DIM(head_dim, HEAD_DIM, {
741+
DISPATCH_HEAD_DIM_QK(head_dim, HEAD_DIM, {
742742

743743
CHECK_SHAPE(output, input.size(0), input.size(1), input.size(2), input.size(3));
744744
CHECK_SHAPE(scale, batch_size, num_heads, (num_tokens + BLOCK_SIZE - 1) / BLOCK_SIZE * (BLOCK_SIZE / WARP_BLOCK_SIZE));

csrc/qattn/attn_cuda_sm90.h

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,21 @@ torch::Tensor qk_int8_sv_f8_accum_f32_attn_inst_buf(
2929
float sm_scale,
3030
int return_lse);
3131

32+
torch::Tensor qk_int8_sv_f8_accum_f32_attn_inst_buf_dsk_sm90(
33+
torch::Tensor query,
34+
torch::Tensor key,
35+
torch::Tensor query_pe,
36+
torch::Tensor key_pe,
37+
torch::Tensor value,
38+
torch::Tensor output,
39+
torch::Tensor query_scale,
40+
torch::Tensor key_scale,
41+
int tensor_layout,
42+
int is_causal,
43+
int qk_quant_gran,
44+
float sm_scale,
45+
int return_lse);
46+
3247
torch::Tensor qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf(
3348
torch::Tensor query,
3449
torch::Tensor key,
@@ -41,4 +56,20 @@ torch::Tensor qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf(
4156
int is_causal,
4257
int qk_quant_gran,
4358
float sm_scale,
59+
int return_lse);
60+
61+
torch::Tensor qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf_dsk_sm90(
62+
torch::Tensor query,
63+
torch::Tensor key,
64+
torch::Tensor query_pe,
65+
torch::Tensor key_pe,
66+
torch::Tensor value,
67+
torch::Tensor output,
68+
torch::Tensor query_scale,
69+
torch::Tensor key_scale,
70+
torch::Tensor value_scale,
71+
int tensor_layout,
72+
int is_causal,
73+
int qk_quant_gran,
74+
float sm_scale,
4475
int return_lse);

csrc/qattn/pybind_sm90.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,4 +22,6 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
2222
{
2323
m.def("qk_int8_sv_f8_accum_f32_attn_inst_buf", &qk_int8_sv_f8_accum_f32_attn_inst_buf);
2424
m.def("qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf", &qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf);
25+
m.def("qk_int8_sv_f8_accum_f32_attn_inst_buf_dsk_sm90", &qk_int8_sv_f8_accum_f32_attn_inst_buf_dsk_sm90);
26+
m.def("qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf_dsk_sm90", &qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf_dsk_sm90);
2527
}

0 commit comments

Comments (0)