Felix-Petersen · sguada · Dec 16, 2024
diff --git a/difflogic/difflogic.py b/difflogic/difflogic.py
@@ -1,10 +1,21 @@
 import torch
-import difflogic_cuda
+from pallas import autograd as pallas_autograd
 import numpy as np
+import jax
+import jax.numpy as jnp
+from jax.nn import softmax, one_hot
 from .functional import bin_op_s, get_unique_connections, GradFactor
 from .packbitstensor import PackBitsTensor
 
 
+from pallas import compile as pallas_compile
+
+@pallas_compile(backend="cuda")  # NOTE(review): confirm the `pallas` module imported in this file exposes `compile`
+def logic_layer_kernel(x, a, b, w, y):  # NOTE(review): a, b and w are accepted but never read in this body
+    i = pallas_autograd.program_id(0)  # index of this program instance along axis 0 — presumably one per element; verify
+    y[i] = x[i]  # copies x[i] into y[i] unchanged; no bin_op / weight mixing happens here
+
+
 ########################################################################################################################
 
 
@@ -53,30 +64,14 @@ def __init__(
         assert self.connections in ['random', 'unique'], self.connections
         self.indices = self.get_connections(self.connections, device)
 
-        if self.implementation == 'cuda':
-            """
-            Defining additional indices for improving the efficiency of the backward of the CUDA implementation.
-            """
-            given_x_indices_of_y = [[] for _ in range(in_dim)]
-            indices_0_np = self.indices[0].cpu().numpy()
-            indices_1_np = self.indices[1].cpu().numpy()
-            for y in range(out_dim):
-                given_x_indices_of_y[indices_0_np[y]].append(y)
-                given_x_indices_of_y[indices_1_np[y]].append(y)
-            self.given_x_indices_of_y_start = torch.tensor(
-                np.array([0] + [len(g) for g in given_x_indices_of_y]).cumsum(), device=device, dtype=torch.int64)
-            self.given_x_indices_of_y = torch.tensor(
-                [item for sublist in given_x_indices_of_y for item in sublist], dtype=torch.int64, device=device)
-
         self.num_neurons = out_dim
         self.num_weights = out_dim
 
-    def forward(self, x):
+    def forward(self, x, training: bool):
         if isinstance(x, PackBitsTensor):
-            assert not self.training, 'PackBitsTensor is not supported for the differentiable training mode.'
-            assert self.device == 'cuda', 'PackBitsTensor is only supported for CUDA, not for {}. ' \
-                                          'If you want fast inference on CPU, please use CompiledDiffLogicModel.' \
-                                          ''.format(self.device)
+            assert not training, 'PackBitsTensor is not supported for the differentiable training mode.'
+            assert self.device == 'cuda', 'PackBitsTensor is only supported for CUDA, not for {}. '.format(self.device) + \
+                                          'If you want fast inference on CPU, please use CompiledDiffLogicModel.'
 
         else:
             if self.grad_factor != 1.:
@@ -85,68 +80,41 @@ def forward(self, x):
         if self.implementation == 'cuda':
             if isinstance(x, PackBitsTensor):
                 return self.forward_cuda_eval(x)
-            return self.forward_cuda(x)
+            return self.forward_cuda(x, training)
         elif self.implementation == 'python':
-            return self.forward_python(x)
+            return self.forward_python(x, training)
         else:
             raise ValueError(self.implementation)
 
-    def forward_python(self, x):
-        assert x.shape[-1] == self.in_dim, (x[0].shape[-1], self.in_dim)
-
-        if self.indices[0].dtype == torch.int64 or self.indices[1].dtype == torch.int64:
-            print(self.indices[0].dtype, self.indices[1].dtype)
-            self.indices = self.indices[0].long(), self.indices[1].long()
-            print(self.indices[0].dtype, self.indices[1].dtype)
+    def forward_python(self, x, training: bool):  # `training` replaces the former self.training check
+        assert x.shape[-1] == self.in_dim, (x.shape[-1], self.in_dim)
 
         a, b = x[..., self.indices[0]], x[..., self.indices[1]]
-        if self.training:
-            x = bin_op_s(a, b, torch.nn.functional.softmax(self.weights, dim=-1))
+        weights = jnp.array(self.weights)  # Convert to JAX array — NOTE(review): confirm self.weights converts cleanly (e.g. if it is a torch Parameter)
+        if training:
+            x = bin_op_s(a, b, softmax(weights, axis=-1))  # training: softmax over the last axis weights the ops
         else:
-            weights = torch.nn.functional.one_hot(self.weights.argmax(-1), 16).to(torch.float32)
+            weights = one_hot(jnp.argmax(weights, axis=-1), 16).astype(jnp.float32)  # eval: hard one-hot over 16 ops at the argmax
             x = bin_op_s(a, b, weights)
         return x
 
-    def forward_cuda(self, x):
-        if self.training:
-            assert x.device.type == 'cuda', x.device
-        assert x.ndim == 2, x.ndim
+    def forward_cuda(self, x, training: bool):  # NOTE(review): `training` is never read in this body
+        x = jnp.array(x)
+        a = jnp.array(self.indices[0])  # same index arrays forward_python uses to gather its a / b operands
+        b = jnp.array(self.indices[1])
+        w = jnp.array(self.weights)  # raw weights: no softmax / one_hot is applied before the kernel call
+        y = jnp.zeros((x.shape[0], self.out_dim), dtype=x.dtype)  # zero-initialised output buffer
 
-        x = x.transpose(0, 1)
-        x = x.contiguous()
+        grid_dim = (x.shape[0],)
+        block_dim = (min(x.shape[0], 1024),)
 
-        assert x.shape[0] == self.in_dim, (x.shape, self.in_dim)
+        logic_layer_kernel[grid_dim, block_dim](x[:,0], a, b, w, y)  # NOTE(review): passes only x[:,0] (first column of x) — confirm intent
 
-        a, b = self.indices
-
-        if self.training:
-            w = torch.nn.functional.softmax(self.weights, dim=-1).to(x.dtype)
-            return LogicLayerCudaFunction.apply(
-                x, a, b, w, self.given_x_indices_of_y_start, self.given_x_indices_of_y
-            ).transpose(0, 1)
-        else:
-            w = torch.nn.functional.one_hot(self.weights.argmax(-1), 16).to(x.dtype)
-            with torch.no_grad():
-                return LogicLayerCudaFunction.apply(
-                    x, a, b, w, self.given_x_indices_of_y_start, self.given_x_indices_of_y
-                ).transpose(0, 1)
+        return y  # returns the zeros buffer created above — NOTE(review): confirm the kernel launch actually populates it
 
     def forward_cuda_eval(self, x: PackBitsTensor):
-        """
-        WARNING: this is an in-place operation.
-
-        :param x:
-        :return:
-        """
-        assert not self.training
-        assert isinstance(x, PackBitsTensor)
-        assert x.t.shape[0] == self.in_dim, (x.t.shape, self.in_dim)
-
-        a, b = self.indices
-        w = self.weights.argmax(-1).to(torch.uint8)
-        x.t = difflogic_cuda.eval(x.t, a, b, w)
-
-        return x
+        raise NotImplementedError("`forward_cuda_eval` is not yet implemented for the JAX version. "
+                                  "PackBitsTensor is currently not supported in JAX.")  # packed-bit inference path is stubbed out: always raises
 
     def extra_repr(self):
         return '{}, {}, {}'.format(self.in_dim, self.out_dim, 'train' if self.training else 'eval')
@@ -172,53 +140,22 @@ def get_connections(self, connections, device='cuda'):
 ########################################################################################################################
 
 
-class GroupSum(torch.nn.Module):
+class GroupSum:  # NOTE(review): no longer a torch.nn.Module and this hunk adds no __call__ — call .forward(x) explicitly
     """
     The GroupSum module.
     """
     def __init__(self, k: int, tau: float = 1., device='cuda'):
-        """
-
-        :param k: number of intended real valued outputs, e.g., number of classes
-        :param tau: the (softmax) temperature tau. The summed outputs are divided by tau.
-        :param device:
-        """
-        super().__init__()
         self.k = k
         self.tau = tau
         self.device = device
 
     def forward(self, x):
         if isinstance(x, PackBitsTensor):
-            return x.group_sum(self.k)
+            raise NotImplementedError("PackBitsTensor is not yet supported in JAX.")  # packed inputs are rejected outright
 
         assert x.shape[-1] % self.k == 0, (x.shape, self.k)
-        return x.reshape(*x.shape[:-1], self.k, x.shape[-1] // self.k).sum(-1) / self.tau
-
-    def extra_repr(self):
-        return 'k={}, tau={}'.format(self.k, self.tau)
-
+        return x.reshape(*x.shape[:-1], self.k, x.shape[-1] // self.k).sum(axis=-1) / self.tau  # split last axis into k groups, sum each, divide by tau
 
-########################################################################################################################
-
-
-class LogicLayerCudaFunction(torch.autograd.Function):
-    @staticmethod
-    def forward(ctx, x, a, b, w, given_x_indices_of_y_start, given_x_indices_of_y):
-        ctx.save_for_backward(x, a, b, w, given_x_indices_of_y_start, given_x_indices_of_y)
-        return difflogic_cuda.forward(x, a, b, w)
-
-    @staticmethod
-    def backward(ctx, grad_y):
-        x, a, b, w, given_x_indices_of_y_start, given_x_indices_of_y = ctx.saved_tensors
-        grad_y = grad_y.contiguous()
+    def __repr__(self):  # takes over from the removed extra_repr
+        return f'GroupSum(k={self.k}, tau={self.tau})'
 
-        grad_w = grad_x = None
-        if ctx.needs_input_grad[0]:
-            grad_x = difflogic_cuda.backward_x(x, a, b, w, grad_y, given_x_indices_of_y_start, given_x_indices_of_y)
-        if ctx.needs_input_grad[3]:
-            grad_w = difflogic_cuda.backward_w(x, a, b, grad_y)
-        return grad_x, None, None, grad_w, None, None, None
-
-
-########################################################################################################################
diff --git a/difflogic/functional.py b/difflogic/functional.py
@@ -1,4 +1,4 @@
-import torch
+import jax.numpy as jnp
 import numpy as np
 
 BITS_TO_NP_DTYPE = {8: np.int8, 16: np.int16, 32: np.int32, 64: np.int64}
@@ -29,7 +29,7 @@ def bin_op(a, b, i):
         assert a[1].shape == b[1].shape, (a[1].shape, b[1].shape)
 
     if i == 0:
-        return torch.zeros_like(a)
+        return jnp.zeros_like(a)
     elif i == 1:
         return a * b
     elif i == 2:
@@ -59,11 +59,11 @@ def bin_op(a, b, i):
     elif i == 14:
         return 1 - a * b
     elif i == 15:
-        return torch.ones_like(a)
+        return jnp.ones_like(a)
 
 
 def bin_op_s(a, b, i_s):
-    r = torch.zeros_like(a)
+    r = jnp.zeros_like(a)
     for i in range(16):
         u = bin_op(a, b, i)
         r = r + i_s[..., i] * u

diff --git a/difflogic/tests/test_jax.py b/difflogic/tests/test_jax.py
@@ -0,0 +1,26 @@
+import jax
+import jax.numpy as jnp
+from jax.nn import softmax, one_hot
+from difflogic import LogicLayer, GroupSum
+from difflogic.functional import bin_op_s
+import numpy as np
+
+def test_logic_layer_forward():
+    """Smoke-test the python LogicLayer path: a (2, 4) batch must yield a (2, 2) output."""
+    n_in, n_out = 4, 2
+    batch = jnp.array([[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8]])
+    layer = LogicLayer(in_dim=n_in, out_dim=n_out, implementation="python", connections="unique")
+    layer.weights = np.random.randn(n_out, 16)  # random per-neuron logits over the 16 ops; only the shape is checked
+    result = layer(batch, training=True)
+    assert result.shape == (2, 2)
+
+def test_group_sum_forward():
+    """GroupSum splits the last axis into k contiguous groups and sums each (tau defaults to 1)."""
+    group_sum = GroupSum(k=2)
+    x = jnp.array([[1., 2., 3., 4.], [5., 6., 7., 8.]])
+    output = group_sum.forward(x)
+    np.testing.assert_allclose(output, np.array([[3., 7.], [11., 15.]]))  # [1+2, 3+4], [5+6, 7+8]; jax.numpy has no `testing`
+
+if __name__ == "__main__":  # keep direct-run support without executing the tests at import / pytest collection time
+    test_logic_layer_forward()
+    test_group_sum_forward()