
Commit 19fd99f

fix kernel & requirements
1 parent 7a7c6c1 commit 19fd99f

File tree

5 files changed: +33 -24 lines

  requirements.txt
  src/gpu_ops.cc
  src/kernels.cc.cu
  src/kernels.h
  src/sooki/ops.py

requirements.txt

Lines changed: 2 additions & 1 deletion
@@ -1,3 +1,4 @@
 jax
 pybind11
-scikit-build-core
+scikit-build-core
+pytest
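
pytest joins the dependency list so the reworked Python wrapper can be exercised from a test suite. Below is a minimal sketch of such a test; the file layout, the sooki.ops import path, and the exact assertions are assumptions rather than part of this commit, and only shapes declared in the ops.py diff below are checked, not numerical results:

import jax.numpy as jnp

from sooki.ops import foo_fwd  # hypothetical import path


def test_foo_fwd_shapes():
    a = jnp.ones((4,), dtype=jnp.float32)
    b = jnp.ones((4,), dtype=jnp.float32)
    result, (a_saved, b_plus_1) = foo_fwd(a, b)
    assert result.shape == ()          # scalar output declared in ops.py
    assert b_plus_1.shape == a.shape   # intermediate matches the input shape


def test_foo_fwd_empty_input():
    a = jnp.zeros((0,), dtype=jnp.float32)
    b = jnp.zeros((0,), dtype=jnp.float32)
    result, _ = foo_fwd(a, b)
    assert result == 0.0               # early return added for empty inputs in ops.py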

src/gpu_ops.cc

Lines changed: 2 additions & 2 deletions
@@ -12,7 +12,7 @@ XLA_FFI_DEFINE_HANDLER_SYMBOL(
         .Arg<ffi::Buffer<ffi::F32>>()   // b
         .Ret<ffi::Buffer<ffi::F32>>()   // result (scalar)
         .Ret<ffi::Buffer<ffi::F32>>()   // b_plus_1
-        .Attr<size_t>("n"),
+        .Attr<int64_t>("n"),
     {xla::ffi::Traits::kCmdBufferCompatible});  // cudaGraph enabled

 // Creates symbol FooBwd with C linkage that can be loaded using Python ctypes
@@ -25,7 +25,7 @@ XLA_FFI_DEFINE_HANDLER_SYMBOL(
         .Arg<ffi::Buffer<ffi::F32>>()   // b_plus_1
         .Ret<ffi::Buffer<ffi::F32>>()   // a_grad
         .Ret<ffi::Buffer<ffi::F32>>()   // b_grad
-        .Attr<size_t>("n"),
+        .Attr<int64_t>("n"),
     {xla::ffi::Traits::kCmdBufferCompatible});  // cudaGraph enabled

 template <typename T>
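
Both handler declarations switch the "n" attribute from size_t to int64_t. The Python wrapper (src/sooki/ops.py below) passes n as a NumPy int64 keyword argument, and the FFI checks the declared attribute type against what the caller encodes, so a signed 64-bit declaration is the one that matches. A calling-side sketch, assuming the foo_fwd_cpu target is registered as it is elsewhere in this repo:

import jax
import jax.numpy as jnp
import numpy as np

a = jnp.ones((8,), dtype=jnp.float32)
b = jnp.ones((8,), dtype=jnp.float32)
n = np.prod(a.shape).astype(np.int64)  # sent to the handler as a 64-bit integer attribute

scalar_type = jax.ShapeDtypeStruct((), a.dtype)              # result
intermediate_type = jax.ShapeDtypeStruct(a.shape, a.dtype)   # b_plus_1

# With .Attr<int64_t>("n") on the C++ side this attribute is accepted; the
# unsigned size_t declaration did not line up with what JAX sends here.
result, b_plus_1 = jax.ffi.ffi_call(
    "foo_fwd_cpu", (scalar_type, intermediate_type), vmap_method="sequential"
)(a, b, n=n)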

src/kernels.cc.cu

Lines changed: 4 additions & 4 deletions
@@ -3,7 +3,7 @@

 namespace ffi = xla::ffi;
 __global__ void FooFwdKernel(const float *a, const float *b, float *result,
-                             float *b_plus_1, size_t n)
+                             float *b_plus_1, int64_t n)
 {
   size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
   const size_t grid_stride = blockDim.x * gridDim.x;
@@ -22,7 +22,7 @@ __global__ void FooFwdKernel(const float *a, const float *b, float *result,

 ffi::Error FooFwdHost(cudaStream_t stream, ffi::Buffer<ffi::F32> a,
                       ffi::Buffer<ffi::F32> b, ffi::ResultBuffer<ffi::F32> result,
-                      ffi::ResultBuffer<ffi::F32> b_plus_1, size_t n)
+                      ffi::ResultBuffer<ffi::F32> b_plus_1, int64_t n)
 {
   const int block_dim = 128;
   const int grid_dim = std::min(32, (int)((n + block_dim - 1) / block_dim));
@@ -47,7 +47,7 @@ __global__ void FooBwdKernel(const float *scalar_grad,
                              const float *b_plus_1,
                              float *a_grad,
                              float *b_grad,
-                             size_t n)
+                             int64_t n)
 {
   size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
   const size_t grid_stride = blockDim.x * gridDim.x;
@@ -67,7 +67,7 @@ ffi::Error FooBwdHost(cudaStream_t stream,
                       ffi::Buffer<ffi::F32> b_plus_1,
                       ffi::ResultBuffer<ffi::F32> a_grad,
                       ffi::ResultBuffer<ffi::F32> b_grad,
-                      size_t n)
+                      int64_t n)
 {
   const int block_dim = 128;
   const int grid_dim = std::min(32, (int)((n + block_dim - 1) / block_dim));
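
The host wrappers keep the same launch configuration while n becomes int64_t. A quick check of the grid-dimension formula in plain Python (not part of the commit), which also shows the n == 0 corner case that the Python wrapper below now guards against:

# Launch-dimension formula from FooFwdHost/FooBwdHost: at most 32 blocks of
# 128 threads, with the kernel's grid-stride loop covering remaining elements.
block_dim = 128
for n in (0, 1, 127, 128, 129, 1 << 20):
    grid_dim = min(32, (n + block_dim - 1) // block_dim)
    print(f"n={n:>8}  grid_dim={grid_dim}")
# n == 0 gives grid_dim == 0, i.e. no blocks at all, which is presumably why
# src/sooki/ops.py below now returns early for empty inputs instead of calling the FFI.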

src/kernels.h

Lines changed: 2 additions & 2 deletions
@@ -8,14 +8,14 @@ namespace ffi = xla::ffi;

 ffi::Error FooFwdHost(cudaStream_t stream, ffi::Buffer<ffi::F32> a,
                       ffi::Buffer<ffi::F32> b, ffi::ResultBuffer<ffi::F32> result,
-                      ffi::ResultBuffer<ffi::F32> b_plus_1, size_t n);
+                      ffi::ResultBuffer<ffi::F32> b_plus_1, int64_t n);

 ffi::Error FooBwdHost(cudaStream_t stream,
                       ffi::Buffer<ffi::F32> scalar_grad,
                       ffi::Buffer<ffi::F32> a,
                       ffi::Buffer<ffi::F32> b_plus_1,
                       ffi::ResultBuffer<ffi::F32> a_grad,
                       ffi::ResultBuffer<ffi::F32> b_grad,
-                      size_t n);
+                      int64_t n);

 #endif  // KERNELS_H_

src/sooki/ops.py

Lines changed: 23 additions & 15 deletions
@@ -4,8 +4,7 @@
 import jax.numpy as jnp
 import sooki

-# Note: Using float32 to match FFI expectations
-# jax.config.update("jax_enable_x64", True)
+jax.config.update("jax_enable_x64", True)

 gpu = False
 gpu_targets = {}
@@ -28,33 +27,42 @@
 def foo_fwd(a, b):
     assert a.shape == b.shape
     assert a.dtype == b.dtype
+
+    if a.size == 0:
+        return jnp.array(0.0, dtype=a.dtype), (a, b)
+
     n = np.prod(a.shape).astype(np.int64)
-    scalar_type = jax.ShapeDtypeStruct((), a.dtype)  # scalar output
-    intermediate_type = jax.ShapeDtypeStruct(a.shape, a.dtype)  # b_plus_1 shape
+    scalar_type = jax.ShapeDtypeStruct((), a.dtype)
+    intermediate_type = jax.ShapeDtypeStruct(a.shape, a.dtype)

-    # Use GPU if available, otherwise use CPU
-    ffi_name = "foo_fwd" if gpu else "foo_fwd_cpu"
+    def impl(target_name):
+        return lambda: jax.ffi.ffi_call(
+            target_name, (scalar_type, intermediate_type), vmap_method="sequential"
+        )(a, b, n=n)

-    result, b_plus_1 = jax.ffi.ffi_call(
-        ffi_name, (scalar_type, intermediate_type), vmap_method="sequential"
-    )(a, b, n=n)
+    result, b_plus_1 = jax.lax.platform_dependent(
+        cpu=impl("foo_fwd_cpu"), cuda=impl("foo_fwd")
+    )
     return result, (a, b_plus_1)


 def foo_bwd(res, c_grad):
     a, b_plus_1 = res
+
+    if a.size == 0:
+        return jnp.zeros_like(a), jnp.zeros_like(a)
+
     assert c_grad.dtype == a.dtype
     assert a.dtype == b_plus_1.dtype
     n = np.prod(a.shape).astype(np.int64)
     out_type = jax.ShapeDtypeStruct(a.shape, a.dtype)

-    # Use GPU if available, otherwise use CPU
-    ffi_name = "foo_bwd" if gpu else "foo_bwd_cpu"
+    def impl(target_name):
+        return lambda: jax.ffi.ffi_call(
+            target_name, (out_type, out_type), vmap_method="sequential"
+        )(c_grad, a, b_plus_1, n=n)

-    # c_grad is now a scalar, pass it directly to the FFI function
-    return jax.ffi.ffi_call(ffi_name, (out_type, out_type), vmap_method="sequential")(
-        c_grad, a, b_plus_1, n=n
-    )
+    return jax.lax.platform_dependent(cpu=impl("foo_bwd_cpu"), cuda=impl("foo_bwd"))


 @jax.custom_vjp
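
The forward and backward wrappers above feed the @jax.custom_vjp definition that appears as the last context line of this hunk. A usage sketch follows; it assumes the decorated primal function is named foo and takes the same (a, b) arguments as foo_fwd (the name is not visible in this diff), and that the package imports as sooki.ops:

import jax
import jax.numpy as jnp

from sooki.ops import foo  # hypothetical name and import path

a = jnp.arange(8, dtype=jnp.float32)
b = jnp.arange(8, dtype=jnp.float32)

# jax.lax.platform_dependent picks "foo_fwd_cpu" or "foo_fwd" when the call is
# lowered for the current backend, so the same code runs with or without a GPU.
value = jax.jit(foo)(a, b)

# The backward FFI target is reached through the custom_vjp rule (foo_bwd).
a_grad, b_grad = jax.grad(foo, argnums=(0, 1))(a, b)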
