From e74a43560db8e00f28ea7e563dcd8462c017406f Mon Sep 17 00:00:00 2001
From: adamomainz <amainz@meta.com>
Date: Fri, 23 Aug 2024 09:38:03 -0700
Subject: [PATCH 1/9] fixing tests and adding github action

---
 .github/workflows/tests.yml                   | 24 +++++++++++++++++++
 .../requirements.txt => requirements.txt      |  0
 test/test_blocksparse.py                      | 20 +++++++++-------
 test/test_cross_entropy.py                    |  7 +++---
 test/test_flash_attention.py                  |  8 +++----
 test/test_inductor.py                         |  8 +++----
 test/test_matmul.py                           |  8 +++----
 7 files changed, 50 insertions(+), 25 deletions(-)
 create mode 100644 .github/workflows/tests.yml
 rename models/llama/requirements.txt => requirements.txt (100%)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
new file mode 100644
index 0000000..f5330bb
--- /dev/null
+++ b/.github/workflows/tests.yml
@@ -0,0 +1,24 @@
+name: build
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main, "*"]
+
+jobs:
+
+  build:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v3
+    - name: Setup Python
+      uses: actions/setup-python@v4
+      with:
+        python-version: '3.12'
+    - name:  Install pip dependencies
+      run: |
+        python3 -m pip install --upgrade pip
+        python3 -m pip install -r requirements.txt
+    - name: Run Python unit tests
+      run: python3 -u -m pytest tests/tests.py
diff --git a/models/llama/requirements.txt b/requirements.txt
similarity index 100%
rename from models/llama/requirements.txt
rename to requirements.txt
diff --git a/test/test_blocksparse.py b/test/test_blocksparse.py
index cf456af..abc71b6 100644
--- a/test/test_blocksparse.py
+++ b/test/test_blocksparse.py
@@ -2,7 +2,7 @@
 import torch
 
 import triton
-import triton.ops
+import kernels
 
 
 def is_hip_mi200():
@@ -57,7 +57,7 @@ def mask_tensor(x, mask, block, value=0):
 @pytest.mark.parametrize("BLOCK", [16, 32, 64])
 @pytest.mark.parametrize("DTYPE", [torch.float16])
 def test_matmul(
-    MODE, TRANS_A, TRANS_B, BLOCK, DTYPE, device, Z=3, H=2, M=512, N=384, K=256
+    MODE, TRANS_A, TRANS_B, BLOCK, DTYPE, device="cuda", Z=3, H=2, M=512, N=384, K=256
 ):
     seed = 0
     torch.manual_seed(seed)
@@ -103,7 +103,7 @@ def test_matmul(
     b_tri = do_sparsify(b_tri) if is_dds else b_tri
     a_tri.retain_grad()
     b_tri.retain_grad()
-    op = triton.ops.blocksparse.matmul(
+    op = kernels.blocksparse.matmul(
         layout, BLOCK, MODE, trans_a=TRANS_A, trans_b=TRANS_B, device=device
     )
     c_tri = op(a_tri, b_tri)
@@ -132,7 +132,9 @@ def test_matmul(
 
 @pytest.mark.parametrize("is_dense", [False, True])
 @pytest.mark.parametrize("BLOCK, WIDTH", configs)
-def test_softmax(BLOCK, WIDTH, is_dense, device, Z=2, H=2, is_causal=True, scale=0.4):
+def test_softmax(
+    BLOCK, WIDTH, is_dense, device="cuda", Z=2, H=2, is_causal=True, scale=0.4
+):
     # set seed
     torch.random.manual_seed(0)
     Z, H, M, N = 2, 3, WIDTH, WIDTH
@@ -164,7 +166,7 @@ def test_softmax(BLOCK, WIDTH, is_dense, device, Z=2, H=2, is_causal=True, scale
     a_tri = sparsify_tensor(a_tri, layout, BLOCK)
     a_tri.retain_grad()
     dout_tri = sparsify_tensor(dout_tri, layout, BLOCK)
-    op = triton.ops.blocksparse.softmax(layout, BLOCK, device=device, is_dense=is_dense)
+    op = kernels.blocksparse.softmax(layout, BLOCK, device=device, is_dense=is_dense)
     out_tri = op(a_tri, scale=scale, is_causal=is_causal)
     out_tri.backward(dout_tri)
     da_tri = a_tri.grad
@@ -178,7 +180,7 @@ def test_softmax(BLOCK, WIDTH, is_dense, device, Z=2, H=2, is_causal=True, scale
 def test_attention_fwd_bwd(
     block,
     dtype,
-    device,
+    device="cuda",
     input_scale=1.0,
     scale=1 / 8.0,
     n_ctx=256,
@@ -251,13 +253,13 @@ def triton_attention(
     value: torch.Tensor,
     scale: float,
 ):
-    sparse_dot_sdd_nt = triton.ops.blocksparse.matmul(
+    sparse_dot_sdd_nt = kernels.blocksparse.matmul(
         layout, block, "sdd", trans_a=False, trans_b=True, device=value.device
     )
-    sparse_dot_dsd_nn = triton.ops.blocksparse.matmul(
+    sparse_dot_dsd_nn = kernels.blocksparse.matmul(
         layout, block, "dsd", trans_a=False, trans_b=False, device=value.device
     )
-    sparse_softmax = triton.ops.blocksparse.softmax(layout, block, device=value.device)
+    sparse_softmax = kernels.blocksparse.softmax(layout, block, device=value.device)
 
     w = sparse_dot_sdd_nt(query, key)
     w = sparse_softmax(w, scale=scale, is_causal=True)
diff --git a/test/test_cross_entropy.py b/test/test_cross_entropy.py
index 701f54a..05ac67c 100644
--- a/test/test_cross_entropy.py
+++ b/test/test_cross_entropy.py
@@ -1,8 +1,7 @@
 import pytest
 import torch
 
-import triton
-import triton.ops
+import kernels
 
 
 @pytest.mark.parametrize(
@@ -15,7 +14,7 @@
         for mode in ["forward", "backward"]
     ],
 )
-def test_op(M, N, dtype, mode, device):
+def test_op(M, N, dtype, mode, device="cuda"):
     capability = torch.cuda.get_device_capability()
     if capability[0] < 8 and dtype == "bfloat16":
         pytest.skip("Only test bfloat16 on devices with sm >= 80")
@@ -28,7 +27,7 @@ def test_op(M, N, dtype, mode, device):
     x = torch.randn(M, N, dtype=dtype, device=device, requires_grad=True)
     idx = 4 + torch.ones(M, dtype=torch.int64, device=device)
     # forward pass
-    tt_y = triton.ops.cross_entropy(x, idx)
+    tt_y = kernels.cross_entropy(x, idx)
     th_y = torch.nn.CrossEntropyLoss(reduction="none")(x, idx)
     if mode == "forward":
         torch.testing.assert_close(th_y, tt_y)
diff --git a/test/test_flash_attention.py b/test/test_flash_attention.py
index 724028c..af8a645 100644
--- a/test/test_flash_attention.py
+++ b/test/test_flash_attention.py
@@ -2,8 +2,8 @@
 import torch
 import os
 
+import kernels
 import triton
-import triton.ops
 
 
 @pytest.mark.interpreter
@@ -19,7 +19,7 @@
 @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
 @pytest.mark.parametrize("causal", [True, False])
 @pytest.mark.parametrize("seq_par", [True, False])
-def test_op(Z, H, N_CTX, D_HEAD, dtype, causal, seq_par, device):
+def test_op(Z, H, N_CTX, D_HEAD, dtype, causal, seq_par, device="cuda"):
     capability = torch.cuda.get_device_capability()
     if capability[0] < 8:
         pytest.skip("Flash attention only supported for compute capability >= 80")
@@ -56,7 +56,7 @@ def test_op(Z, H, N_CTX, D_HEAD, dtype, causal, seq_par, device):
     ref_dk, k.grad = k.grad.clone(), None
     ref_dq, q.grad = q.grad.clone(), None
     # # triton implementation
-    tri_out = triton.ops.attention(q, k, v, causal, sm_scale, seq_par)
+    tri_out = kernels.attention(q, k, v, causal, sm_scale, seq_par)
     tri_out.backward(dout)
     tri_dv, v.grad = v.grad.clone(), None
     tri_dk, k.grad = k.grad.clone(), None
@@ -151,7 +151,7 @@ def bench_flash_attention(
         (BATCH, H, N_CTX, D_HEAD), dtype=dtype, device="cuda", requires_grad=True
     )
     if provider == "triton":
-        fn = lambda: triton.ops.attention(q, k, v, casual, sm_scale, seq_par)
+        fn = lambda: kernels.attention(q, k, v, casual, sm_scale, seq_par)
         if mode == "bwd":
             o = fn()
             do = torch.randn_like(o)
diff --git a/test/test_inductor.py b/test/test_inductor.py
index 73c3b9b..2820876 100644
--- a/test/test_inductor.py
+++ b/test/test_inductor.py
@@ -5,7 +5,7 @@
 import triton.language as tl
 
 
-def test_normalization_with_remat(device):
+def test_normalization_with_remat(device="cuda"):
 
     @triton.jit
     def triton_(
@@ -80,7 +80,7 @@ def triton_(
     )
 
 
-def test_avg_pool_bw(device):
+def test_avg_pool_bw(device="cuda"):
 
     @triton.jit
     def triton_(in_ptr0, out_ptr0, XBLOCK: tl.constexpr):
@@ -200,7 +200,7 @@ def triton_(in_ptr0, out_ptr0, XBLOCK: tl.constexpr):
 
 @pytest.mark.parametrize("RBLOCK", [1, 16, 32, 64, 128])
 @pytest.mark.parametrize("num_warps", [1, 4])
-def test_scan2d_broadcast(RBLOCK, num_warps, device):
+def test_scan2d_broadcast(RBLOCK, num_warps, device="cuda"):
 
     @triton.jit(debug=True)
     def fn(in_ptr, out_ptr, XBLOCK: tl.constexpr, RBLOCK: tl.constexpr):
@@ -220,7 +220,7 @@ def fn(in_ptr, out_ptr, XBLOCK: tl.constexpr, RBLOCK: tl.constexpr):
     torch.testing.assert_close(output, ref)
 
 
-def test_scan2d_for(device):
+def test_scan2d_for(device="cuda"):
 
     @triton.jit
     def fn(out_ptr0, rnumel, RBLOCK: tl.constexpr):
diff --git a/test/test_matmul.py b/test/test_matmul.py
index dab9f82..632165e 100644
--- a/test/test_matmul.py
+++ b/test/test_matmul.py
@@ -5,7 +5,7 @@
 
 import triton
 import triton.language as tl
-import triton.ops
+import kernels
 
 
 def is_hip():
@@ -1006,7 +1006,7 @@ def test_op(
             kwargs=kwargs, num_warps=NWARP, num_stages=NSTAGE, pre_hook=pre_hook
         )
     ]
-    kernel = triton.ops._matmul.kernel
+    kernel = kernels._matmul.kernel
     kernel.configs = configs
     # kernel.run = kernel.run.run.run
 
@@ -1071,7 +1071,7 @@ def init_input(m, n, dtype, acc_dtype):
     # run test
     th_a = upcast_if_fp8(a, ADTYPE)
     th_b = upcast_if_fp8(b, BDTYPE)
-    ab_dtype = triton.ops.get_higher_dtype(th_a.dtype, th_b.dtype)
+    ab_dtype = kernels.get_higher_dtype(th_a.dtype, th_b.dtype)
     acc_dtype = getattr(torch, ACC_DTYPE) if ACC_DTYPE else ab_dtype
     output_dtype = getattr(torch, OUTPUT_DTYPE) if OUTPUT_DTYPE else ab_dtype
     th_c = torch.matmul(th_a.to(output_dtype), th_b.to(output_dtype))
@@ -1080,7 +1080,7 @@ def init_input(m, n, dtype, acc_dtype):
             a = triton.reinterpret(a, getattr(tl, ADTYPE))
         if is_fp8(BDTYPE):
             b = triton.reinterpret(b, getattr(tl, BDTYPE))
-        tt_c = triton.ops.matmul(
+        tt_c = kernels.matmul(
             a,
             b,
             acc_dtype if ACC_DTYPE else None,

From ab062273f00c1a0d4b7124714420c8be31239094 Mon Sep 17 00:00:00 2001
From: adamomainz <amainz@meta.com>
Date: Fri, 23 Aug 2024 09:42:41 -0700
Subject: [PATCH 2/9] forgot to add pytest to the installs

---
 .github/workflows/tests.yml | 2 +-
 requirements.txt            | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index f5330bb..d7c7785 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -1,4 +1,4 @@
-name: build
+name: pytests
 
 on:
   push:
diff --git a/requirements.txt b/requirements.txt
index df41a7a..af7ed72 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,3 +6,4 @@ tiktoken
 fairscale
 triton
 pandas
+pytest

From c78d320af740d609a402eb1abb58f8f29db5a5af Mon Sep 17 00:00:00 2001
From: adamomainz <amainz@meta.com>
Date: Fri, 23 Aug 2024 09:46:45 -0700
Subject: [PATCH 3/9] fixing pytests. only way to test this is unfortunately
 with commits

---
 .github/workflows/tests.yml | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index d7c7785..72c9f4f 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -7,8 +7,7 @@ on:
     branches: [ main, "*"]
 
 jobs:
-
-  build:
+  tests:
     runs-on: ubuntu-latest
     steps:
     - uses: actions/checkout@v3
@@ -21,4 +20,4 @@ jobs:
         python3 -m pip install --upgrade pip
         python3 -m pip install -r requirements.txt
     - name: Run Python unit tests
-      run: python3 -u -m pytest tests/tests.py
+      run: python3  -m pytest

From 277a0f10520c872348102aec6ac56f8aac3d4459 Mon Sep 17 00:00:00 2001
From: adamomainz <amainz@meta.com>
Date: Fri, 23 Aug 2024 10:02:07 -0700
Subject: [PATCH 4/9] cuda wasn't working last run so trying with different
 configs

---
 .github/workflows/tests.yml | 77 ++++++++++++++++++++++++++++++-------
 requirements.txt            |  1 +
 2 files changed, 65 insertions(+), 13 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 72c9f4f..55f62d7 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -1,4 +1,4 @@
-name: pytests
+name: tests
 
 on:
   push:
@@ -7,17 +7,68 @@ on:
     branches: [ main, "*"]
 
 jobs:
-  tests:
+  Runner-Preparation:
+    runs-on: ubuntu-latest
+    timeout-minutes: 30
+    outputs:
+      matrix-CUDA: ${{ steps.set-matrix.outputs.matrix-CUDA }}
+      matrix-HIP: ${{ steps.set-matrix.outputs.matrix-HIP }}
+      matrix-MACOS: ${{ steps.set-matrix.outputs.matrix-MACOS }}
+    steps:
+      - name: Prepare runner matrix
+        id: set-matrix
+        if: env.enable_integration == 'true'
+        run: |
+          if [ x"${{ github.repository }}" == x"triton-lang/kernels" ]; then
+            echo '::set-output name=matrix-CUDA::[["a100-runner-set"], ["h100-runner-set"]]'
+            echo '::set-output name=matrix-HIP::[["self-hosted", "gfx90a"]]'
+            echo '::set-output name=matrix-MACOS::[["macos-latest"]]'
+          else
+            echo '::set-output name=matrix-CUDA::["ubuntu-latest"]'
+            echo '::set-output name=matrix-HIP::["ubuntu-latest"]'
+            echo '::set-output name=matrix-MACOS::[["macos-latest"]]'
+          fi
+  pre-commit:
+    name: pre-commit (code formatting)
+    needs: Runner-Preparation
     runs-on: ubuntu-latest
     steps:
-    - uses: actions/checkout@v3
-    - name: Setup Python
-      uses: actions/setup-python@v4
-      with:
-        python-version: '3.12'
-    - name:  Install pip dependencies
-      run: |
-        python3 -m pip install --upgrade pip
-        python3 -m pip install -r requirements.txt
-    - name: Run Python unit tests
-      run: python3  -m pytest
+      - name: Checkout
+        uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.12'
+          cache: 'pip'
+      - name: install dependencies
+        run: |
+          python3 -m pip install --upgrade pip
+          python3 -m pip install -r requirements.txt
+      - name: Check pre-commit
+        run: |
+          # TODO: ignore the first yapf failure until https://github.com/google/yapf/issues/1164 is fixed
+          python3 -m pre_commit run --all-files --verbose yapf &> /dev/null || true
+          # If first run of yapf worked and made changes reset the tree to the original state
+          git reset --hard
+          python3 -m pre_commit run --all-files --verbose
+      - name: Print diff of changes if pre-commit failed
+        if: failure()
+        run: |
+          git diff
+  tests:
+    needs: pre-commit
+    if: needs.Runner-Preparation.outputs.matrix-CUDA != ''
+    strategy:
+      matrix:
+        runner: ${{fromJson(needs.Runner-Preparation.outputs.matrix-CUDA)}}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          submodules: "true"
+      - uses: actions/checkout@v3
+      - name: Setup Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.12'
+      - name: Run Python unit tests
+        run: python3  -m pytest
diff --git a/requirements.txt b/requirements.txt
index af7ed72..d16b448 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,3 +7,4 @@ fairscale
 triton
 pandas
 pytest
+pre-commit

From 8a0489d1fd6435c81176be9ad1ab1758eae0aafe Mon Sep 17 00:00:00 2001
From: adamomainz <amainz@meta.com>
Date: Fri, 23 Aug 2024 10:12:31 -0700
Subject: [PATCH 5/9] removing pre commit

---
 .github/workflows/tests.yml | 27 +--------------------------
 1 file changed, 1 insertion(+), 26 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 55f62d7..d0f52fd 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -28,11 +28,6 @@ jobs:
             echo '::set-output name=matrix-HIP::["ubuntu-latest"]'
             echo '::set-output name=matrix-MACOS::[["macos-latest"]]'
           fi
-  pre-commit:
-    name: pre-commit (code formatting)
-    needs: Runner-Preparation
-    runs-on: ubuntu-latest
-    steps:
       - name: Checkout
         uses: actions/checkout@v4
       - uses: actions/setup-python@v5
@@ -43,32 +38,12 @@ jobs:
         run: |
           python3 -m pip install --upgrade pip
           python3 -m pip install -r requirements.txt
-      - name: Check pre-commit
-        run: |
-          # TODO: ignore the first yapf failure until https://github.com/google/yapf/issues/1164 is fixed
-          python3 -m pre_commit run --all-files --verbose yapf &> /dev/null || true
-          # If first run of yapf worked and made changes reset the tree to the original state
-          git reset --hard
-          python3 -m pre_commit run --all-files --verbose
-      - name: Print diff of changes if pre-commit failed
-        if: failure()
-        run: |
-          git diff
   tests:
-    needs: pre-commit
+    needs: Runner-Preparation
     if: needs.Runner-Preparation.outputs.matrix-CUDA != ''
     strategy:
       matrix:
         runner: ${{fromJson(needs.Runner-Preparation.outputs.matrix-CUDA)}}
     steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-        with:
-          submodules: "true"
-      - uses: actions/checkout@v3
-      - name: Setup Python
-        uses: actions/setup-python@v4
-        with:
-          python-version: '3.12'
       - name: Run Python unit tests
         run: python3  -m pytest

From 13cadd4dabc1719fba8f1ced99e976c76410d1a3 Mon Sep 17 00:00:00 2001
From: adamomainz <amainz@meta.com>
Date: Fri, 23 Aug 2024 10:14:08 -0700
Subject: [PATCH 6/9] following github actions page to see why this isn't
 running

---
 .github/workflows/tests.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index d0f52fd..bbf47b3 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -40,6 +40,7 @@ jobs:
           python3 -m pip install -r requirements.txt
   tests:
     needs: Runner-Preparation
+    runs-on: ubuntu-latest
     if: needs.Runner-Preparation.outputs.matrix-CUDA != ''
     strategy:
       matrix:

From 2a2925b457a4a4d5e0dd65a6c0ed36ce22b56686 Mon Sep 17 00:00:00 2001
From: adamomainz <amainz@meta.com>
Date: Fri, 23 Aug 2024 10:18:21 -0700
Subject: [PATCH 7/9] fixing if statement in action

---
 .github/workflows/tests.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index bbf47b3..1bd2780 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -17,7 +17,6 @@ jobs:
     steps:
       - name: Prepare runner matrix
         id: set-matrix
-        if: env.enable_integration == 'true'
         run: |
           if [ x"${{ github.repository }}" == x"triton-lang/kernels" ]; then
             echo '::set-output name=matrix-CUDA::[["a100-runner-set"], ["h100-runner-set"]]'

From 26884ab7e83b19b33f27952b1b29282cb2395f9d Mon Sep 17 00:00:00 2001
From: adamomainz <amainz@meta.com>
Date: Fri, 23 Aug 2024 10:39:50 -0700
Subject: [PATCH 8/9] sorry for all the emails folks...

---
 .github/workflows/tests.yml | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 1bd2780..3be93be 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -27,6 +27,14 @@ jobs:
             echo '::set-output name=matrix-HIP::["ubuntu-latest"]'
             echo '::set-output name=matrix-MACOS::[["macos-latest"]]'
           fi
+  tests:
+    needs: Runner-Preparation
+    runs-on: ubuntu-latest
+    if: needs.Runner-Preparation.outputs.matrix-CUDA != ''
+    strategy:
+      matrix:
+        runner: ${{fromJson(needs.Runner-Preparation.outputs.matrix-CUDA)}}
+    steps:
       - name: Checkout
         uses: actions/checkout@v4
       - uses: actions/setup-python@v5
@@ -37,13 +45,5 @@ jobs:
         run: |
           python3 -m pip install --upgrade pip
           python3 -m pip install -r requirements.txt
-  tests:
-    needs: Runner-Preparation
-    runs-on: ubuntu-latest
-    if: needs.Runner-Preparation.outputs.matrix-CUDA != ''
-    strategy:
-      matrix:
-        runner: ${{fromJson(needs.Runner-Preparation.outputs.matrix-CUDA)}}
-    steps:
       - name: Run Python unit tests
         run: python3  -m pytest

From 2d6054fccc808a3bdba0f5d5932ef8fcd3ea3084 Mon Sep 17 00:00:00 2001
From: adamomainz <amainz@meta.com>
Date: Fri, 23 Aug 2024 11:34:20 -0700
Subject: [PATCH 9/9] looking to switch the runs-on to cuda

---
 .github/workflows/tests.yml | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 3be93be..c3fca51 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -29,7 +29,7 @@ jobs:
           fi
   tests:
     needs: Runner-Preparation
-    runs-on: ubuntu-latest
+    runs-on: ${{ matrix.runner }}
     if: needs.Runner-Preparation.outputs.matrix-CUDA != ''
     strategy:
       matrix:
@@ -46,4 +46,6 @@ jobs:
           python3 -m pip install --upgrade pip
           python3 -m pip install -r requirements.txt
       - name: Run Python unit tests
-        run: python3  -m pytest
+        env:
+          CUDA_HOME: "/usr/local/cuda"
+        run: python3  -m pytest -s -n 8