Skip to content

Commit d5a876d

Browse files
authored
Implementation of Atomic And, Min, and Max (#92)
1 parent 2e095fb commit d5a876d

File tree

6 files changed

+369
-2
lines changed

6 files changed

+369
-2
lines changed

iris/__init__.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,10 @@
1818
atomic_cas,
1919
atomic_xchg,
2020
atomic_xor,
21-
atomic_or
21+
atomic_or,
22+
atomic_and,
23+
atomic_min,
24+
atomic_max,
2225
)
2326

2427
from .util import (
@@ -60,6 +63,9 @@
6063
"atomic_xchg",
6164
"atomic_xor",
6265
"atomic_or",
66+
"atomic_and",
67+
"atomic_min",
68+
"atomic_max",
6369
"do_bench",
6470
"memset_tensor",
6571
"hip",

iris/iris.py

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -576,6 +576,33 @@ def atomic_xor(pointer, val, from_rank, to_rank, heap_bases, mask=None, sem=None
576576
return tl.atomic_xor(translated_ptr, val, mask=mask, sem=sem, scope=scope)
577577

578578

579+
@triton.jit
580+
def atomic_and(pointer, val, from_rank, to_rank, heap_bases, mask=None, sem=None, scope=None):
581+
"""
582+
Performs an atomic and at the specified rank's memory location.
583+
584+
This function performs an atomic and operation by translating the pointer
585+
from the from_rank's address space to the to_rank's address space and atomically
586+
ANDing the provided data into the to_rank memory location. If the from_rank and to_rank are the same,
587+
this function performs a local atomic and operation.
588+
589+
Args:
590+
pointer (triton.PointerType, or block of dtype=triton.PointerType): The memory locations in the from_rank's address space that will be translated to the to_rank's address space. Must be the current rank where the pointer is local.
591+
val (Block of dtype=pointer.dtype.element_ty): The values with which to perform the atomic operation.
592+
from_rank (int): The rank ID from which the pointer originates. Must be the current rank where the pointer is local.
593+
to_rank (int): The rank ID to which the atomic operation will be performed.
594+
heap_bases (triton.PointerType): Array containing the heap base addresses for all ranks.
595+
mask (Block of triton.int1, optional): If mask[idx] is false, do not perform the atomic operation at address pointer[idx]. Defaults to None.
596+
sem (str, optional): Specifies the memory semantics for the operation. Acceptable values are "acquire", "release", "acq_rel" (stands for "ACQUIRE_RELEASE"), and "relaxed". If not provided, the function defaults to using "acq_rel" semantics.
597+
scope (str, optional): Defines the scope of threads that observe the synchronizing effect of the atomic operation. Acceptable values are "gpu" (default), "cta" (cooperative thread array, thread block), or "sys" (stands for "SYSTEM"). The default value is "gpu".
598+
599+
Returns:
600+
Block: The data stored at pointer before the atomic operation.
601+
"""
602+
translated_ptr = __translate(pointer, from_rank, to_rank, heap_bases)
603+
return tl.atomic_and(translated_ptr, val, mask=mask, sem=sem, scope=scope)
604+
605+
579606
@triton.jit
580607
def atomic_or(pointer, val, from_rank, to_rank, heap_bases, mask=None, sem=None, scope=None):
581608
"""
@@ -603,6 +630,60 @@ def atomic_or(pointer, val, from_rank, to_rank, heap_bases, mask=None, sem=None,
603630
return tl.atomic_or(translated_ptr, val, mask=mask, sem=sem, scope=scope)
604631

605632

633+
@triton.jit
634+
def atomic_min(pointer, val, from_rank, to_rank, heap_bases, mask=None, sem=None, scope=None):
635+
"""
636+
Performs an atomic min at the specified rank's memory location.
637+
638+
This function performs an atomic min operation by translating the pointer
639+
from the from_rank's address space to the to_rank's address space and atomically
640+
performing the min on the provided data to the to_rank memory location. If the from_rank and to_rank are the same,
641+
this function performs a local atomic min operation.
642+
643+
Args:
644+
pointer (triton.PointerType, or block of dtype=triton.PointerType): The memory locations in the from_rank's address space that will be translated to the to_rank's address space. Must be the current rank where the pointer is local.
645+
val (Block of dtype=pointer.dtype.element_ty): The values with which to perform the atomic operation.
646+
from_rank (int): The rank ID from which the pointer originates. Must be the current rank where the pointer is local.
647+
to_rank (int): The rank ID to which the atomic operation will be performed.
648+
heap_bases (triton.PointerType): Array containing the heap base addresses for all ranks.
649+
mask (Block of triton.int1, optional): If mask[idx] is false, do not perform the atomic operation at address pointer[idx]. Defaults to None.
650+
sem (str, optional): Specifies the memory semantics for the operation. Acceptable values are "acquire", "release", "acq_rel" (stands for "ACQUIRE_RELEASE"), and "relaxed". If not provided, the function defaults to using "acq_rel" semantics.
651+
scope (str, optional): Defines the scope of threads that observe the synchronizing effect of the atomic operation. Acceptable values are "gpu" (default), "cta" (cooperative thread array, thread block), or "sys" (stands for "SYSTEM"). The default value is "gpu".
652+
653+
Returns:
654+
Block: The data stored at pointer before the atomic operation.
655+
"""
656+
translated_ptr = __translate(pointer, from_rank, to_rank, heap_bases)
657+
return tl.atomic_min(translated_ptr, val, mask=mask, sem=sem, scope=scope)
658+
659+
660+
@triton.jit
661+
def atomic_max(pointer, val, from_rank, to_rank, heap_bases, mask=None, sem=None, scope=None):
662+
"""
663+
Performs an atomic max at the specified rank's memory location.
664+
665+
This function performs an atomic max operation by translating the pointer
666+
from the from_rank's address space to the to_rank's address space and atomically
667+
performing the max on the provided data to the to_rank memory location. If the from_rank and to_rank are the same,
668+
this function performs a local atomic max operation.
669+
670+
Args:
671+
pointer (triton.PointerType, or block of dtype=triton.PointerType): The memory locations in the from_rank's address space that will be translated to the to_rank's address space. Must be the current rank where the pointer is local.
672+
val (Block of dtype=pointer.dtype.element_ty): The values with which to perform the atomic operation.
673+
from_rank (int): The rank ID from which the pointer originates. Must be the current rank where the pointer is local.
674+
to_rank (int): The rank ID to which the atomic operation will be performed.
675+
heap_bases (triton.PointerType): Array containing the heap base addresses for all ranks.
676+
mask (Block of triton.int1, optional): If mask[idx] is false, do not perform the atomic operation at address pointer[idx]. Defaults to None.
677+
sem (str, optional): Specifies the memory semantics for the operation. Acceptable values are "acquire", "release", "acq_rel" (stands for "ACQUIRE_RELEASE"), and "relaxed". If not provided, the function defaults to using "acq_rel" semantics.
678+
scope (str, optional): Defines the scope of threads that observe the synchronizing effect of the atomic operation. Acceptable values are "gpu" (default), "cta" (cooperative thread array, thread block), or "sys" (stands for "SYSTEM"). The default value is "gpu".
679+
680+
Returns:
681+
Block: The data stored at pointer before the atomic operation.
682+
"""
683+
translated_ptr = __translate(pointer, from_rank, to_rank, heap_bases)
684+
return tl.atomic_max(translated_ptr, val, mask=mask, sem=sem, scope=scope)
685+
686+
606687
def iris(heap_size=1 << 30):
607688
"""
608689
Create and return an Iris instance with the specified heap size.

tests/unittests/test_atomic_and.py

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
# SPDX-License-Identifier: MIT
2+
# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
3+
4+
import torch
5+
import triton
6+
import triton.language as tl
7+
import pytest
8+
import iris
9+
10+
11+
@triton.jit
12+
def atomic_and_kernel(
13+
results,
14+
sem: tl.constexpr,
15+
scope: tl.constexpr,
16+
cur_rank: tl.constexpr,
17+
num_ranks: tl.constexpr,
18+
BLOCK_SIZE: tl.constexpr,
19+
heap_bases: tl.tensor,
20+
):
21+
pid = tl.program_id(0)
22+
block_start = pid * BLOCK_SIZE
23+
offsets = block_start + tl.arange(0, BLOCK_SIZE)
24+
mask = offsets < BLOCK_SIZE
25+
26+
bit = (cur_rank // 32) % 2
27+
val = bit << (cur_rank % results.type.element_ty.primitive_bitwidth)
28+
acc = tl.full([BLOCK_SIZE], val, dtype=results.type.element_ty)
29+
30+
for target_rank in range(num_ranks):
31+
iris.atomic_and(results + offsets, acc, cur_rank, target_rank, heap_bases, mask, sem=sem, scope=scope)
32+
33+
34+
@pytest.mark.parametrize(
35+
"dtype",
36+
[
37+
torch.int32,
38+
torch.int64,
39+
],
40+
)
41+
@pytest.mark.parametrize(
42+
"sem",
43+
[
44+
"acquire",
45+
"release",
46+
"acq_rel",
47+
],
48+
)
49+
@pytest.mark.parametrize(
50+
"scope",
51+
[
52+
"cta",
53+
"gpu",
54+
"sys",
55+
],
56+
)
57+
@pytest.mark.parametrize(
58+
"BLOCK_SIZE",
59+
[
60+
1,
61+
8,
62+
16,
63+
32,
64+
],
65+
)
66+
def test_atomic_and_api(dtype, sem, scope, BLOCK_SIZE):
67+
# TODO: Adjust heap size.
68+
shmem = iris.iris(1 << 20)
69+
num_ranks = shmem.get_num_ranks()
70+
heap_bases = shmem.get_heap_bases()
71+
cur_rank = shmem.get_rank()
72+
73+
bit_width = 32 if dtype == torch.int32 else 64
74+
effective_bits = min(num_ranks, bit_width)
75+
initial_mask = (1 << effective_bits) - 1
76+
77+
results = shmem.full((BLOCK_SIZE,), initial_mask, dtype=dtype)
78+
79+
grid = lambda meta: (1,)
80+
atomic_and_kernel[grid](results, sem, scope, cur_rank, num_ranks, BLOCK_SIZE, heap_bases)
81+
shmem.barrier()
82+
83+
# All ranks start out with a full mask vector (initial_mask: the low effective_bits bits set)
84+
# All ranks then take turns in clearing their bit position in the mask
85+
# By the end we would have effective_bits - num_ranks many ones followed by num_ranks zeros
86+
expected_scalar = ~((1 << num_ranks) - 1) & initial_mask
87+
expected = torch.full((BLOCK_SIZE,), expected_scalar, dtype=dtype, device="cuda")
88+
89+
try:
90+
torch.testing.assert_close(results, expected, rtol=0, atol=0)
91+
except AssertionError as e:
92+
print(e)
93+
print("Expected:", expected)
94+
print("Actual :", results)
95+
raise

tests/unittests/test_atomic_max.py

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
# SPDX-License-Identifier: MIT
2+
# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
3+
4+
import torch
5+
import triton
6+
import triton.language as tl
7+
import pytest
8+
import iris
9+
10+
11+
@triton.jit
12+
def atomic_max_kernel(
13+
results,
14+
sem: tl.constexpr,
15+
scope: tl.constexpr,
16+
cur_rank: tl.constexpr,
17+
num_ranks: tl.constexpr,
18+
BLOCK_SIZE: tl.constexpr,
19+
heap_bases: tl.tensor,
20+
):
21+
pid = tl.program_id(0)
22+
block_start = pid * BLOCK_SIZE
23+
offsets = block_start + tl.arange(0, BLOCK_SIZE)
24+
mask = offsets < BLOCK_SIZE
25+
26+
acc = tl.full([BLOCK_SIZE], cur_rank + 1, dtype=results.type.element_ty)
27+
28+
for target_rank in range(num_ranks):
29+
iris.atomic_max(results + offsets, acc, cur_rank, target_rank, heap_bases, mask, sem=sem, scope=scope)
30+
31+
32+
@pytest.mark.parametrize(
33+
"dtype",
34+
[
35+
torch.int32,
36+
torch.int64,
37+
],
38+
)
39+
@pytest.mark.parametrize(
40+
"sem",
41+
[
42+
"acquire",
43+
"release",
44+
"acq_rel",
45+
],
46+
)
47+
@pytest.mark.parametrize(
48+
"scope",
49+
[
50+
"cta",
51+
"gpu",
52+
"sys",
53+
],
54+
)
55+
@pytest.mark.parametrize(
56+
"BLOCK_SIZE",
57+
[
58+
1,
59+
8,
60+
16,
61+
32,
62+
],
63+
)
64+
def test_atomic_max_api(dtype, sem, scope, BLOCK_SIZE):
65+
# TODO: Adjust heap size.
66+
shmem = iris.iris(1 << 20)
67+
num_ranks = shmem.get_num_ranks()
68+
heap_bases = shmem.get_heap_bases()
69+
cur_rank = shmem.get_rank()
70+
71+
min_val = torch.iinfo(dtype).min
72+
results = shmem.full((BLOCK_SIZE,), min_val, dtype=dtype)
73+
74+
grid = lambda meta: (1,)
75+
atomic_max_kernel[grid](results, sem, scope, cur_rank, num_ranks, BLOCK_SIZE, heap_bases)
76+
shmem.barrier()
77+
78+
# All ranks participate in performing the max operation
79+
# Each rank performs the atomic operation: max(rank_id + 1)
80+
# The result equals the ID of the last rank + 1
81+
expected = torch.full((BLOCK_SIZE,), num_ranks, dtype=dtype, device="cuda")
82+
83+
try:
84+
torch.testing.assert_close(results, expected, rtol=0, atol=0)
85+
except AssertionError as e:
86+
print(e)
87+
print("Expected:", expected)
88+
print("Actual :", results)
89+
raise

tests/unittests/test_atomic_min.py

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
# SPDX-License-Identifier: MIT
2+
# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
3+
4+
import torch
5+
import triton
6+
import triton.language as tl
7+
import pytest
8+
import iris
9+
10+
11+
@triton.jit
12+
def atomic_min_kernel(
13+
results,
14+
sem: tl.constexpr,
15+
scope: tl.constexpr,
16+
cur_rank: tl.constexpr,
17+
num_ranks: tl.constexpr,
18+
BLOCK_SIZE: tl.constexpr,
19+
heap_bases: tl.tensor,
20+
):
21+
pid = tl.program_id(0)
22+
block_start = pid * BLOCK_SIZE
23+
offsets = block_start + tl.arange(0, BLOCK_SIZE)
24+
mask = offsets < BLOCK_SIZE
25+
26+
acc = tl.full([BLOCK_SIZE], cur_rank + 1, dtype=results.type.element_ty)
27+
28+
for target_rank in range(num_ranks):
29+
iris.atomic_min(results + offsets, acc, cur_rank, target_rank, heap_bases, mask, sem=sem, scope=scope)
30+
31+
32+
@pytest.mark.parametrize(
33+
"dtype",
34+
[
35+
torch.int32,
36+
torch.int64,
37+
],
38+
)
39+
@pytest.mark.parametrize(
40+
"sem",
41+
[
42+
"acquire",
43+
"release",
44+
"acq_rel",
45+
],
46+
)
47+
@pytest.mark.parametrize(
48+
"scope",
49+
[
50+
"cta",
51+
"gpu",
52+
"sys",
53+
],
54+
)
55+
@pytest.mark.parametrize(
56+
"BLOCK_SIZE",
57+
[
58+
1,
59+
8,
60+
16,
61+
32,
62+
],
63+
)
64+
def test_atomic_min_api(dtype, sem, scope, BLOCK_SIZE):
65+
# TODO: Adjust heap size.
66+
shmem = iris.iris(1 << 20)
67+
num_ranks = shmem.get_num_ranks()
68+
heap_bases = shmem.get_heap_bases()
69+
cur_rank = shmem.get_rank()
70+
71+
max_val = torch.iinfo(dtype).max
72+
results = shmem.full((BLOCK_SIZE,), max_val, dtype=dtype)
73+
74+
grid = lambda meta: (1,)
75+
atomic_min_kernel[grid](results, sem, scope, cur_rank, num_ranks, BLOCK_SIZE, heap_bases)
76+
shmem.barrier()
77+
# All ranks participate in performing the min operation
78+
# Each rank performs the atomic operation: min(rank_id + 1)
79+
# The result equals the ID of the first rank + 1
80+
expected = torch.full((BLOCK_SIZE,), 1, dtype=dtype, device="cuda")
81+
82+
try:
83+
torch.testing.assert_close(results, expected, rtol=0, atol=0)
84+
except AssertionError as e:
85+
print(e)
86+
print("Expected:", expected)
87+
print("Actual :", results)
88+
raise

0 commit comments

Comments
 (0)