From 5bea679265e0fd87917ce283d89d54b238e38192 Mon Sep 17 00:00:00 2001 From: Steboss Date: Fri, 31 Jan 2025 17:27:58 +0000 Subject: [PATCH 01/89] start drafting support for axlearn --- .github/container/Dockerfile.axlearn | 44 ++++++++ .github/container/test-axlearn.sh | 146 +++++++++++++++++++++++++++ 2 files changed, 190 insertions(+) create mode 100644 .github/container/Dockerfile.axlearn create mode 100644 .github/container/test-axlearn.sh diff --git a/.github/container/Dockerfile.axlearn b/.github/container/Dockerfile.axlearn new file mode 100644 index 000000000..857e3941f --- /dev/null +++ b/.github/container/Dockerfile.axlearn @@ -0,0 +1,44 @@ +# syntax=docker/dockerfile:1-labs +ARG BASE_IMAGE=ghcr.io/nvidia/jax-mealkit:jax +ARG URLREF_AXLEARN=https://github.com/apple/axlearn.git +ARG SRC_PATH_AXLEARN=/opt/axlearn + +############################################################################### +## Download source and configure dependencies +############################################################################### +FROM ${BASE_IMAGE} AS mealkit +ARG URLREF_AXLEARN +ARG SRC_PATH_AXLEARN + +RUN <<"EOF" bash -ex + git clone "${URLREF_AXLEARN}" "${SRC_PATH_AXLEARN}" +EOF + +RUN <<"EOF" bash -ex + echo "-e ${SRC_PATH_AXLEARN}" > /opt/pip-tools.d/requirements-axlearn.in + echo <> /opt/pip-tools.d/requirements-axlearn.in +aqtp==0.8.2 +einops==0.8.0 +nltk==3.7 +portpicker==1.6.0 +seqio==0.0.18 +protobuf==3.20.3 +tensorflow==2.18.0 +tensorflow-datasets==4.9.7 +tensorflow-io==0.37.1 +tensorflow-io-gcs-filesystem==0.37.1 +tensorflow-metadata==1.13.1 +tensorflow-probability==0.24.0 +tensorflow-text==2.18.1 +pytest>=7.4.3 +REQUIREMENTS +EOF + +############################################################################### +## Install accumulated packages from the base image and the previous stage +############################################################################### +FROM mealkit AS final + +RUN pip-finalize.sh + +WORKDIR ${SRC_PATH_AXLEARN} diff --git 
a/.github/container/test-axlearn.sh b/.github/container/test-axlearn.sh new file mode 100644 index 000000000..5d36ad11c --- /dev/null +++ b/.github/container/test-axlearn.sh @@ -0,0 +1,146 @@ +#!/bin/bash + +set -euo pipefail + +usage() { + echo "Run tests in axlearn with specified options." + echo "" + echo "Usage: $0 [OPTIONS]" + echo "" + echo " OPTIONS DESCRIPTION" + echo " -d, --directory DIR Directory to run tests in." + echo " Default: 'axlearn/axlearn/common'." + echo " -p, --packages PACKAGES Space-separated list of packages to install via pip." + echo " Default: 'attrs scikit-learn torch evaluate transformers timm wandb grain'." + echo " -c, --cuda-devices DEVICES CUDA devices to use. Default: '0,1'." + echo " -t, --test-files PATTERN Pattern for test files to run." + echo " Default: '*_test.py'." + echo " --test-files-list FILE File containing the list of test files to run." + echo " -o, --output DIRECTORY Output directory for logs and summary." + echo " Default: 'test_runs/'." + echo " -h, --help Show this help message and exit." + exit 1 +} + +# Default values +DIR='axlearn/axlearn/common' +PACKAGES='attrs scikit-learn torch evaluate transformers timm wandb grain' +CUDNN_VERSION='9.7.0.66' # TODO check the cudnn version on compute +CUDA_DEVICES='0,1' +TEST_FILES_PATTERN='*_test.py' +TEST_FILES_LIST='' +OUTPUT_DIRECTORY='' + +# Parse args +args=$(getopt -o d:p:c:t:o:h --long directory:,packages:,cuda-devices:,test-files:,test-files-list:,output:,help -- "$@") +if [ $? 
-ne 0 ]; then + usage + exit 1 +fi + +eval set -- "$args" + +while true; do + case "$1" in + -d|--directory) + DIR="$2" + shift 2 + ;; + -p|--packages) + PACKAGES="$2" + shift 2 + ;; + -c|--cuda-devices) + CUDA_DEVICES="$2" + shift 2 + ;; + -t|--test-files) + TEST_FILES_PATTERN="$2" + shift 2 + ;; + --test-files-list) + TEST_FILES_LIST="$2" + shift 2 + ;; + -o|--output) + OUTPUT_DIRECTORY="$2" + shift 2 + ;; + -h|--help) + usage + ;; + --) + shift + break + ;; + *) + echo "Unknown option: $1" + usage + ;; + esac +done + +# TODO double check what's the best choice +if [ -z "$OUTPUT_DIRECTORY" ]; then + timestamp=$(date +%Y%m%d_%H%M%S) + OUTPUT_DIRECTORY="test_runs/${timestamp}" +fi +LOG_DIRECTORY="${OUTPUT_DIRECTORY}/logs" + +mkdir -p "${LOG_DIRECTORY}" + +# Print out config for sanity check +echo "Configuration:" +echo " Directory: $DIR" +echo " Packages: $PACKAGES" +echo " CUDA Devices: $CUDA_DEVICES" +if [ -n "$TEST_FILES_LIST" ]; then + echo " Test Files List: $TEST_FILES_LIST" +else + echo " Test Files Pattern: $TEST_FILES_PATTERN" +fi +echo " Output Directory: $OUTPUT_DIRECTORY" +echo "" + + +cd "$DIR" || exit 1 + +# Install all the neeeded packages +echo "Installing packages..." +pip install $PACKAGES + +# Set CUDA devices +export CUDA_VISIBLE_DEVICES="${CUDA_DEVICES}" +echo "Using CUDA devices: $CUDA_VISIBLE_DEVICES" + +echo "Running tests..." + +if [ -n "$TEST_FILES_LIST" ]; then + mapfile -t test_files < "$TEST_FILES_LIST" +else + shopt -s nullglob + test_files=($TEST_FILES_PATTERN) + shopt -u nullglob +fi + +if [ "${#test_files[@]}" -eq 0 ]; then + echo "No test files found to run." + exit 1 +fi + +for test_file in "${test_files[@]}"; do + echo "Running: ${test_file}" + # Ensure the test file exists + if [ ! 
-f "${test_file}" ]; then + echo "${test_file}: NOT FOUND" >> "${SUMMARY_FILE}" + echo "Test file not found: ${test_file}" + ((errors++)) + continue + fi + log_file_name=$(echo "${test_file%.py}" | sed 's/\//__/g').log + log_file="${LOG_DIRECTORY}/${log_file_name}" + # run the tests and save them as *.log + pytest "${test_file}" -v --capture=tee-sys | tee "${log_file}" + # TODO parse the logs + #echo ${PIPESTATUS[0]} +done From 807d3df8d3c1a0b7295a7d25d3db5a4e3e1fc64b Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 3 Feb 2025 16:47:57 +0000 Subject: [PATCH 02/89] fix test for axlearn --- .github/container/test-axlearn.sh | 131 ++++++++++++++++++++++++------ 1 file changed, 105 insertions(+), 26 deletions(-) diff --git a/.github/container/test-axlearn.sh b/.github/container/test-axlearn.sh index 5d36ad11c..70d7ecb00 100644 --- a/.github/container/test-axlearn.sh +++ b/.github/container/test-axlearn.sh @@ -1,6 +1,6 @@ #!/bin/bash -set -euo pipefail +set -uo pipefail usage() { echo "Run tests in axlearn with specified options." @@ -13,9 +13,8 @@ usage() { echo " -p, --packages PACKAGES Space-separated list of packages to install via pip." echo " Default: 'attrs scikit-learn torch evaluate transformers timm wandb grain'." echo " -c, --cuda-devices DEVICES CUDA devices to use. Default: '0,1'." - echo " -t, --test-files PATTERN Pattern for test files to run." + echo " -t, --test-files FILES Pattern for test files to run." echo " Default: '*_test.py'." - echo " --test-files-list FILE File containing the list of test files to run." echo " -o, --output DIRECTORY Output directory for logs and summary." echo " Default: 'test_runs/'." echo " -h, --help Show this help message and exit." 
@@ -27,12 +26,11 @@ DIR='axlearn/axlearn/common' PACKAGES='attrs scikit-learn torch evaluate transformers timm wandb grain' CUDNN_VERSION='9.7.0.66' # TODO check the cudnn version on compute CUDA_DEVICES='0,1' -TEST_FILES_PATTERN='*_test.py' -TEST_FILES_LIST='' +TEST_FILES=() OUTPUT_DIRECTORY='' # Parse args -args=$(getopt -o d:p:c:t:o:h --long directory:,packages:,cuda-devices:,test-files:,test-files-list:,output:,help -- "$@") +args=$(getopt -o d:p:c:t:o:h --long directory:,packages:,cuda-devices:,test-files:,output:,help -- "$@") if [ $? -ne 0 ]; then usage exit 1 @@ -55,12 +53,12 @@ while true; do shift 2 ;; -t|--test-files) - TEST_FILES_PATTERN="$2" - shift 2 - ;; - --test-files-list) - TEST_FILES_LIST="$2" - shift 2 + shift + # Collect all arguments until the next option (starting with '-') + while [[ $# -gt 0 && ! "$1" =~ ^- ]]; do + TEST_FILES+=("$1") + shift + done ;; -o|--output) OUTPUT_DIRECTORY="$2" @@ -94,10 +92,13 @@ echo "Configuration:" echo " Directory: $DIR" echo " Packages: $PACKAGES" echo " CUDA Devices: $CUDA_DEVICES" -if [ -n "$TEST_FILES_LIST" ]; then - echo " Test Files List: $TEST_FILES_LIST" +if [ "${#TEST_FILES[@]}" -gt 0 ]; then + echo " Test Files:" + for f in "${TEST_FILES[@]}"; do + echo " $f" + done else - echo " Test Files Pattern: $TEST_FILES_PATTERN" + echo " Test Files Pattern: '*_test.py' (default)" fi echo " Output Directory: $OUTPUT_DIRECTORY" echo "" @@ -115,20 +116,73 @@ echo "Using CUDA devices: $CUDA_VISIBLE_DEVICES" echo "Running tests..." 
-if [ -n "$TEST_FILES_LIST" ]; then - mapfile -t test_files < "$TEST_FILES_LIST" -else - shopt -s nullglob - test_files=($TEST_FILES_PATTERN) - shopt -u nullglob +if [ "${#TEST_FILES[@]}" -eq 0 ]; then + TEST_FILES=("*_test.py") fi +expanded_test_files=() +for pattern in "${TEST_FILES[@]}"; do + # Use globbing to expand pattern + files=( $pattern ) + if [ "${#files[@]}" -gt 0 ]; then + expanded_test_files+=( "${files[@]}" ) + else + echo "Warning: No files matched pattern '$pattern'" + fi +done + -if [ "${#test_files[@]}" -eq 0 ]; then +if [ "${#expanded_test_files[@]}" -eq 0 ]; then echo "No test files found to run." exit 1 fi -for test_file in "${test_files[@]}"; do +echo "These are the test files:" +for f in "${expanded_test_files[@]}"; do + echo " $f" +done + +# Get the directory where the script is located +#SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" +EXCLUDE_LIST_FILE="$DIR/exclusion_list.txt" +EXCLUDE_PATTERNS=() + +if [ -f "$EXCLUDE_LIST_FILE" ]; then + echo "Reading exclusion list from '$EXCLUDE_LIST_FILE'" + mapfile -t EXCLUDE_PATTERNS < "$EXCLUDE_LIST_FILE" +else + echo "Exclusion list file not found at '$EXCLUDE_LIST_FILE'" +fi +echo "Exclude patterns read:" +for pattern in "${EXCLUDE_PATTERNS[@]}"; do + echo "$pattern" +done + +#expanded_test_files=( "${expanded_test_files[@]:0:10}" ) +# we are skipping some tests as there's still wip by Apple +final_test_files=() + +for test_file in "${expanded_test_files[@]}"; do + exclude=false + #echo $test_file + for pattern in "${EXCLUDE_PATTERNS[@]}"; do + if [[ "$(basename "$test_file")" == "$(basename "$pattern")" ]]; then + exclude=true + break + fi + done + if [ "$exclude" = false ]; then + final_test_files+=("$test_file") + fi +done + +# Initialize counters +errors=0 +failures=0 +passed=0 +SUMMARY_FILE="${OUTPUT_DIRECTORY}/summary.txt" + + +for test_file in "${final_test_files[@]:0:10}"; do echo "Running: ${test_file}" # Ensure the test file exists if [ ! 
-f "${test_file}" ]; then @@ -140,7 +194,32 @@ for test_file in "${test_files[@]}"; do log_file_name=$(echo "${test_file%.py}" | sed 's/\//__/g').log log_file="${LOG_DIRECTORY}/${log_file_name}" # run the tests and save them as *.log - pytest "${test_file}" -v --capture=tee-sys | tee "${log_file}" - # TODO parse the logs - #echo ${PIPESTATUS[0]} + pytest "${test_file}" --capture=tee-sys | tee "${log_file}" + # TODO parse the logs? + exit_code=${PIPESTATUS[0]} + echo $exit_code + if [ $exit_code -eq 0 ]; then + echo "${test_file}: PASSED" >> "${SUMMARY_FILE}" + ((passed++)) + else + echo "${test_file}: FAILED (Exit code: $exit_code)" >> "${SUMMARY_FILE}" + ((failures++)) + fi + echo "" done + +echo $errors +echo $passed +echo $failures + +# e.g. of output summary +#/opt/axlearn/axlearn/common/adapter_flax_test.py: PASSED +#/opt/axlearn/axlearn/common/attention_bias_test.py: PASSED +#/opt/axlearn/axlearn/common/bert_test.py: FAILED (Exit code: 1) +#/opt/axlearn/axlearn/common/causal_lm_test.py: FAILED (Exit code: 1) +#/opt/axlearn/axlearn/common/checkpointer_orbax_test.py: PASSED +#/opt/axlearn/axlearn/common/checkpointer_test.py: PASSED +#/opt/axlearn/axlearn/common/compiler_options_test.py: PASSED +#/opt/axlearn/axlearn/common/config_test.py: PASSED +#/opt/axlearn/axlearn/common/conformer_test.py: FAILED (Exit code: 1) +#/opt/axlearn/axlearn/common/convolution_test.py: FAILED (Exit code: 1) From 9947c0843862c1d668434be8e827705692bffdae Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 3 Feb 2025 17:38:45 +0000 Subject: [PATCH 03/89] add build for axlearn --- .github/workflows/_ci.yaml | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 167c4f009..b31359d0c 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -203,6 +203,21 @@ jobs: URLREF_PANOPTICAPI=${{ fromJson(inputs.SOURCE_URLREFS).PANOPTICAPI }} secrets: inherit + build-axlearn: + needs: build-jax + uses: 
./.github/workflows/_build.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: artifact-axlearn-build + BADGE_FILENAME: badge-axlearn-build + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + CONTAINER_NAME: axlearn + DOCKERFILE: .github/container/Dockerfile.axlearn + EXTRA_BUILD_ARGS: | + URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }} + secrets: inherit + collect-docker-tags: runs-on: ubuntu-22.04 if: "!cancelled()" @@ -218,6 +233,7 @@ jobs: - build-rosetta-t5x - build-rosetta-pax - build-gemma + - build-axlearn outputs: TAGS: ${{ steps.collect-tags.outputs.TAGS }} steps: @@ -237,6 +253,7 @@ jobs: {"flavor": "t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "pax", "stage": "final", "priority": 900, "tag": "${{ needs.build-rosetta-pax.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "gemma", "stage": "final", "priority": 900, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "axlearn", "stage": "final", "priority": 900, "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "jax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}"},\ {"flavor": "triton", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_MEALKIT }}"},\ {"flavor": "equinox", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_MEALKIT }}"},\ @@ -247,6 +264,7 @@ jobs: {"flavor": "t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ {"flavor": "pax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-pax.outputs.DOCKER_TAG_MEALKIT }}"},\ {"flavor": "gemma", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "axlearn", "stage": "mealkit", "priority": 500, 
"tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_MEALKIT }}"},\ {}\ ] From ef775d5fd8d54e9ba1965afad9c29414eed1f3d6 Mon Sep 17 00:00:00 2001 From: Steboss Date: Tue, 4 Feb 2025 11:04:28 +0000 Subject: [PATCH 04/89] install dependencies --- .github/container/Dockerfile.axlearn | 15 +++++++++++++++ .github/container/test-axlearn.sh | 10 ++-------- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/.github/container/Dockerfile.axlearn b/.github/container/Dockerfile.axlearn index 857e3941f..1ec98c86b 100644 --- a/.github/container/Dockerfile.axlearn +++ b/.github/container/Dockerfile.axlearn @@ -31,9 +31,24 @@ tensorflow-metadata==1.13.1 tensorflow-probability==0.24.0 tensorflow-text==2.18.1 pytest>=7.4.3 +scikit-learn +torch +evaluate +transformers +timm +wandb +grain +nvidia-cudnn-cu12==9.7.0.66 REQUIREMENTS EOF + +############################################################################### +## Add test script to the path +############################################################################### + +ADD test-axlearn.sh /usr/local/bin + ############################################################################### ## Install accumulated packages from the base image and the previous stage ############################################################################### diff --git a/.github/container/test-axlearn.sh b/.github/container/test-axlearn.sh index 70d7ecb00..e59933129 100644 --- a/.github/container/test-axlearn.sh +++ b/.github/container/test-axlearn.sh @@ -12,7 +12,7 @@ usage() { echo " Default: 'axlearn/axlearn/common'." echo " -p, --packages PACKAGES Space-separated list of packages to install via pip." echo " Default: 'attrs scikit-learn torch evaluate transformers timm wandb grain'." - echo " -c, --cuda-devices DEVICES CUDA devices to use. Default: '0,1'." + echo " -c, --cuda-devices DEVICES CUDA devices to use. Default: '0,1,2,3,4,5,6,7'." echo " -t, --test-files FILES Pattern for test files to run." echo " Default: '*_test.py'." 
echo " -o, --output DIRECTORY Output directory for logs and summary." @@ -23,9 +23,7 @@ usage() { # Default values DIR='axlearn/axlearn/common' -PACKAGES='attrs scikit-learn torch evaluate transformers timm wandb grain' -CUDNN_VERSION='9.7.0.66' # TODO check the cudnn version on compute -CUDA_DEVICES='0,1' +CUDA_DEVICES='0,1,2,3,4,5,6,7' TEST_FILES=() OUTPUT_DIRECTORY='' @@ -106,10 +104,6 @@ echo "" cd "$DIR" || exit 1 -# Install all the neeeded packages -echo "Installing packages..." -pip install $PACKAGES - # Set CUDA devices export CUDA_VISIBLE_DEVICES="${CUDA_DEVICES}" echo "Using CUDA devices: $CUDA_VISIBLE_DEVICES" From 15927d2eaa6871d7c4a17c28b56f31ba4b7448be Mon Sep 17 00:00:00 2001 From: Steboss Date: Tue, 4 Feb 2025 17:26:30 +0000 Subject: [PATCH 05/89] check tests --- .github/workflows/_ci.yaml | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index b31359d0c..84fdc05fc 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -710,3 +710,21 @@ jobs: with: MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }} secrets: inherit + + test-axlearn: + needs: build-axlearn + if: inputs.ARCHITECTURE == 'amd64' # no arm64 gpu runners + uses: ./.github/workflows/_test_unit.yaml + with: # fix the arguments below + TEST_NAME: axlearn + EXECUTE: | + docker run -i --shm-size=1g --gpus all \ + ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }} \ + bash <<"EOF" |& tee test-backend-independent.log + test-axlearn.sh --directory $pwd --output /opt/output/output.log --test-files /opt/axlearn/axlearn/common/*_test.py + EOF + STATISTICS_SCRIPT: | + echo "Todo" + ARTIFACTS: | + test-backend-independent.log + secrets: inherit \ No newline at end of file From af9dad37a01de86eb7e2886bd331a8b67e02b59a Mon Sep 17 00:00:00 2001 From: Steboss Date: Thu, 6 Feb 2025 10:37:56 +0000 Subject: [PATCH 06/89] make the bash script executable --- .github/container/test-axlearn.sh | 0 1 
file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 .github/container/test-axlearn.sh diff --git a/.github/container/test-axlearn.sh b/.github/container/test-axlearn.sh old mode 100644 new mode 100755 From 9e4d4a551b7df060a055140790934e5769481327 Mon Sep 17 00:00:00 2001 From: Steboss Date: Fri, 7 Feb 2025 09:31:44 +0100 Subject: [PATCH 07/89] minimal ci to test axlearn --- .github/workflows/_ci.yaml | 1124 ++++++++++++++++++------------------ 1 file changed, 553 insertions(+), 571 deletions(-) diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 84fdc05fc..514545edb 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -66,142 +66,142 @@ jobs: URLREF_TRANSFORMER_ENGINE=${{ fromJson(inputs.SOURCE_URLREFS).TRANSFORMER_ENGINE }} secrets: inherit - build-triton: - needs: build-jax - if: inputs.ARCHITECTURE == 'amd64' # Triton does not seem to support arm64 - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: artifact-triton-build - BADGE_FILENAME: badge-triton-build - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: triton - DOCKERFILE: .github/container/Dockerfile.triton - RUNNER_SIZE: large - EXTRA_BUILD_ARGS: | - URLREF_JAX_TRITON=${{ fromJson(inputs.SOURCE_URLREFS).JAX_TRITON }} - secrets: inherit + # build-triton: + # needs: build-jax + # if: inputs.ARCHITECTURE == 'amd64' # Triton does not seem to support arm64 + # uses: ./.github/workflows/_build.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: artifact-triton-build + # BADGE_FILENAME: badge-triton-build + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: triton + # DOCKERFILE: .github/container/Dockerfile.triton + # RUNNER_SIZE: large + # EXTRA_BUILD_ARGS: | + # URLREF_JAX_TRITON=${{ 
fromJson(inputs.SOURCE_URLREFS).JAX_TRITON }} + # secrets: inherit - build-equinox: - needs: build-jax - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: artifact-equinox-build - BADGE_FILENAME: badge-equinox-build - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: equinox - DOCKERFILE: .github/container/Dockerfile.equinox - EXTRA_BUILD_ARGS: | - URLREF_EQUINOX=${{ fromJson(inputs.SOURCE_URLREFS).EQUINOX }} - secrets: inherit + # build-equinox: + # needs: build-jax + # uses: ./.github/workflows/_build.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: artifact-equinox-build + # BADGE_FILENAME: badge-equinox-build + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: equinox + # DOCKERFILE: .github/container/Dockerfile.equinox + # EXTRA_BUILD_ARGS: | + # URLREF_EQUINOX=${{ fromJson(inputs.SOURCE_URLREFS).EQUINOX }} + # secrets: inherit - build-maxtext: - needs: build-jax - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: artifact-maxtext-build - BADGE_FILENAME: badge-maxtext-build - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: maxtext - DOCKERFILE: .github/container/Dockerfile.maxtext - EXTRA_BUILD_ARGS: | - URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }} - secrets: inherit + # build-maxtext: + # needs: build-jax + # uses: ./.github/workflows/_build.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: artifact-maxtext-build + # BADGE_FILENAME: badge-maxtext-build + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: maxtext + # DOCKERFILE: .github/container/Dockerfile.maxtext + # EXTRA_BUILD_ARGS: | + # 
URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }} + # secrets: inherit - build-levanter: - needs: [build-jax] - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: "artifact-levanter-build" - BADGE_FILENAME: "badge-levanter-build" - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: levanter - DOCKERFILE: .github/container/Dockerfile.levanter - EXTRA_BUILD_ARGS: | - URLREF_LEVANTER=${{ fromJson(inputs.SOURCE_URLREFS).LEVANTER }} - URLREF_HALIAX=${{ fromJson(inputs.SOURCE_URLREFS).HALIAX }} - secrets: inherit + # build-levanter: + # needs: [build-jax] + # uses: ./.github/workflows/_build.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: "artifact-levanter-build" + # BADGE_FILENAME: "badge-levanter-build" + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: levanter + # DOCKERFILE: .github/container/Dockerfile.levanter + # EXTRA_BUILD_ARGS: | + # URLREF_LEVANTER=${{ fromJson(inputs.SOURCE_URLREFS).LEVANTER }} + # URLREF_HALIAX=${{ fromJson(inputs.SOURCE_URLREFS).HALIAX }} + # secrets: inherit - build-upstream-t5x: - needs: build-jax - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: "artifact-t5x-build" - BADGE_FILENAME: "badge-t5x-build" - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: upstream-t5x - DOCKERFILE: .github/container/Dockerfile.t5x - EXTRA_BUILD_ARGS: | - URLREF_T5X=${{ fromJson(inputs.SOURCE_URLREFS).T5X }} - URLREF_AIRIO=${{ fromJson(inputs.SOURCE_URLREFS).AIRIO }} - secrets: inherit + # build-upstream-t5x: + # needs: build-jax + # uses: ./.github/workflows/_build.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: "artifact-t5x-build" + # BADGE_FILENAME: 
"badge-t5x-build" + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: upstream-t5x + # DOCKERFILE: .github/container/Dockerfile.t5x + # EXTRA_BUILD_ARGS: | + # URLREF_T5X=${{ fromJson(inputs.SOURCE_URLREFS).T5X }} + # URLREF_AIRIO=${{ fromJson(inputs.SOURCE_URLREFS).AIRIO }} + # secrets: inherit - build-upstream-pax: - needs: build-jax - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: artifact-pax-build - BADGE_FILENAME: badge-pax-build - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: upstream-pax - DOCKERFILE: .github/container/Dockerfile.pax - EXTRA_BUILD_ARGS: | - URLREF_PAXML=${{ fromJson(inputs.SOURCE_URLREFS).PAXML }} - URLREF_PRAXIS=${{ fromJson(inputs.SOURCE_URLREFS).PRAXIS }} - URLREF_LINGVO=${{ fromJson(inputs.SOURCE_URLREFS).LINGVO }} - secrets: inherit + # build-upstream-pax: + # needs: build-jax + # uses: ./.github/workflows/_build.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: artifact-pax-build + # BADGE_FILENAME: badge-pax-build + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: upstream-pax + # DOCKERFILE: .github/container/Dockerfile.pax + # EXTRA_BUILD_ARGS: | + # URLREF_PAXML=${{ fromJson(inputs.SOURCE_URLREFS).PAXML }} + # URLREF_PRAXIS=${{ fromJson(inputs.SOURCE_URLREFS).PRAXIS }} + # URLREF_LINGVO=${{ fromJson(inputs.SOURCE_URLREFS).LINGVO }} + # secrets: inherit - build-rosetta-t5x: - needs: build-upstream-t5x - uses: ./.github/workflows/_build_rosetta.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }} - BASE_LIBRARY: t5x - secrets: inherit + # build-rosetta-t5x: + # needs: build-upstream-t5x + # uses: 
./.github/workflows/_build_rosetta.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }} + # BASE_LIBRARY: t5x + # secrets: inherit - build-rosetta-pax: - needs: build-upstream-pax - uses: ./.github/workflows/_build_rosetta.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_MEALKIT }} - BASE_LIBRARY: pax - secrets: inherit + # build-rosetta-pax: + # needs: build-upstream-pax + # uses: ./.github/workflows/_build_rosetta.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_MEALKIT }} + # BASE_LIBRARY: pax + # secrets: inherit - build-gemma: - needs: build-jax - uses: ./.github/workflows/_build.yaml - if: inputs.ARCHITECTURE == 'amd64' # build only amd64 - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: artifact-gemma-build - BADGE_FILENAME: badge-gemma-build - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: gemma - DOCKERFILE: rosetta/Dockerfile.gemma - DOCKER_CONTEXT: . 
- EXTRA_BUILD_ARGS: | - URLREF_GEMMA=${{ fromJson(inputs.SOURCE_URLREFS).GEMMA }} - URLREF_BIG_VISION=${{ fromJson(inputs.SOURCE_URLREFS).BIG_VISION }} - URLREF_COMMON_LOOP_UTILS=${{ fromJson(inputs.SOURCE_URLREFS).COMMON_LOOP_UTILS }} - URLREF_FLAXFORMER=${{ fromJson(inputs.SOURCE_URLREFS).FLAXFORMER }} - URLREF_PANOPTICAPI=${{ fromJson(inputs.SOURCE_URLREFS).PANOPTICAPI }} - secrets: inherit + # build-gemma: + # needs: build-jax + # uses: ./.github/workflows/_build.yaml + # if: inputs.ARCHITECTURE == 'amd64' # build only amd64 + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: artifact-gemma-build + # BADGE_FILENAME: badge-gemma-build + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: gemma + # DOCKERFILE: rosetta/Dockerfile.gemma + # DOCKER_CONTEXT: . + # EXTRA_BUILD_ARGS: | + # URLREF_GEMMA=${{ fromJson(inputs.SOURCE_URLREFS).GEMMA }} + # URLREF_BIG_VISION=${{ fromJson(inputs.SOURCE_URLREFS).BIG_VISION }} + # URLREF_COMMON_LOOP_UTILS=${{ fromJson(inputs.SOURCE_URLREFS).COMMON_LOOP_UTILS }} + # URLREF_FLAXFORMER=${{ fromJson(inputs.SOURCE_URLREFS).FLAXFORMER }} + # URLREF_PANOPTICAPI=${{ fromJson(inputs.SOURCE_URLREFS).PANOPTICAPI }} + # secrets: inherit build-axlearn: needs: build-jax @@ -224,15 +224,15 @@ jobs: needs: - build-base - build-jax - - build-triton - - build-equinox - - build-maxtext - - build-levanter - - build-upstream-t5x - - build-upstream-pax - - build-rosetta-t5x - - build-rosetta-pax - - build-gemma + # - build-triton + # - build-equinox + # - build-maxtext + # - build-levanter + # - build-upstream-t5x + # - build-upstream-pax + # - build-rosetta-t5x + # - build-rosetta-pax + # - build-gemma - build-axlearn outputs: TAGS: ${{ steps.collect-tags.outputs.TAGS }} @@ -244,26 +244,8 @@ jobs: [\ {"flavor": "base", "stage": "final", "priority": 800, "tag": "${{ needs.build-base.outputs.DOCKER_TAG }}"},\ {"flavor": "jax", "stage": "final", "priority": 
1000, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "triton", "stage": "final", "priority": 900, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "equinox", "stage": "final", "priority": 900, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "maxtext", "stage": "final", "priority": 900, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "levanter", "stage": "final", "priority": 900, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "upstream-t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "upstream-pax", "stage": "final", "priority": 900, "tag": "${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "pax", "stage": "final", "priority": 900, "tag": "${{ needs.build-rosetta-pax.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "gemma", "stage": "final", "priority": 900, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "axlearn", "stage": "final", "priority": 900, "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "jax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "triton", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "equinox", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "maxtext", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "levanter", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "upstream-t5x", "stage": "mealkit", "priority": 
500, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "upstream-pax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-upstream-pax.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "pax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-pax.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "gemma", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_MEALKIT }}"},\ {"flavor": "axlearn", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_MEALKIT }}"},\ {}\ @@ -273,447 +255,447 @@ jobs: echo "TAGS=${TAGS}" >> $GITHUB_OUTPUT - test-distribution: - runs-on: ubuntu-22.04 - strategy: - matrix: - TEST_SCRIPT: - - extra-only-distribution.sh - - mirror-only-distribution.sh - - upstream-only-distribution.sh - - local-patch-distribution.sh - fail-fast: false - steps: - - name: Print environment variables - run: env - - name: Set git login for tests - run: | - git config --global user.email "jax@nvidia.com" - git config --global user.name "JAX-Toolbox CI" - - name: Check out the repository under ${GITHUB_WORKSPACE} - uses: actions/checkout@v4 - - name: Run integration test ${{ matrix.TEST_SCRIPT }} - run: bash rosetta/tests/${{ matrix.TEST_SCRIPT }} - - test-jax: - needs: build-jax - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_unit.yaml - with: - TEST_NAME: jax - EXECUTE: | - docker run -i --shm-size=1g --gpus all \ - ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee test-backend-independent.log - test-jax.sh -b backend-independent - EOF - docker run -i --shm-size=1g --gpus all \ - ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee tee test-gpu.log - nvidia-cuda-mps-control -d - test-jax.sh -b gpu - EOF - STATISTICS_SCRIPT: | - 
errors=$(cat test-*.log | grep -c 'ERROR:' || true) - failed_tests=$(cat test-*.log | grep -c 'FAILED in' || true) - passed_tests=$(cat test-*.log | grep -c 'PASSED in' || true) - total_tests=$((failed_tests + passed_tests)) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - ARTIFACTS: | - test-backend-independent.log - test-gpu.log - secrets: inherit - - test-nsys-jax: - needs: build-jax - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_unit.yaml - with: - TEST_NAME: nsys-jax - EXECUTE: | - set -o pipefail - num_tests=0 - num_failures=0 - # Run the pytest-driven tests; failure is explicitly handled below so set +e to - # avoid an early abort here. - set +e - docker run -i --shm-size=1g --gpus all \ - -v $PWD:/opt/output \ - ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee test-nsys-jax.log - # nsys-jax is already installed, this is just adding the test dependencies - pip install pytest-reportlog nsys-jax[test] - # abuse knowledge that nsys-jax is installed editable, so the tests exist - test_path=$(python -c 'import importlib.resources; print(importlib.resources.files("nsys_jax").joinpath("..", "tests").resolve())') - pytest --report-log=/opt/output/pytest-report.jsonl "${test_path}" - EOF - set -e - GPUS_PER_NODE=$(nvidia-smi -L | grep -c '^GPU') - for mode in 1-process 2-process process-per-gpu; do - DOCKER="docker run --shm-size=1g --gpus all --env XLA_FLAGS=--xla_gpu_enable_command_buffer= --env XLA_PYTHON_CLIENT_MEM_FRACTION=0.9 -v ${PWD}:/opt/output ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}" - if [[ "${mode}" == "1-process" ]]; then - PROCESS_COUNT=1 - ARGS="" - elif [[ "${mode}" == "2-process" ]]; then - # Use two processes with GPUS_PER_NODE/2 GPUs per process in the hope that - # this will flush out more bugs than 
process-per-node or process-per-GPU. - PROCESS_COUNT=2 - ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process $((GPUS_PER_NODE/2)) --distributed" - else - PROCESS_COUNT=${GPUS_PER_NODE} - ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process 1 --distributed" - fi - for collection in full partial; do - NSYS_JAX="nsys-jax" - if [[ "${mode}" == "1-process" ]]; then - # We will not run nsys-jax-combine, so run analyses eagerly - NSYS_JAX+=" --nsys-jax-analysis communication --nsys-jax-analysis summary" - fi - NSYS_JAX+=" --output=/opt/output/${mode}-${collection}-execution-%q{RANK}" - if [[ "${collection}" == "partial" ]]; then - NSYS_JAX+=" --capture-range=cudaProfilerApi --capture-range-end=stop" - # nvbug/4801401 - NSYS_JAX+=" --sample=none" - fi - set +e - ${DOCKER} parallel-launch RANK ${PROCESS_COUNT} ${NSYS_JAX} \ - -- jax-nccl-test ${ARGS} |& tee ${mode}-${collection}-execution.log - num_failures=$((num_failures + ($? 
!= 0))) - set -e - num_tests=$((num_tests + 1)) - done - if [[ "${mode}" != "1-process" ]]; then - # Run nsys-jax-combine - NSYS_JAX_COMBINE="nsys-jax-combine --analysis communication --analysis summary --output=/opt/output/${mode}-${collection}-execution.zip" - for (( i=0; i> $GITHUB_ENV - echo "NSYS_JAX_FAIL_COUNT=${num_failures}" >> $GITHUB_ENV - exit $num_failures - STATISTICS_SCRIPT: | - summary_line=$(tail -n1 test-nsys-jax.log) - num_errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) - failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) - total_tests=$(( NSYS_JAX_TEST_COUNT + passed_tests + failed_tests )) - num_passed=$(( passed_tests + NSYS_JAX_TEST_COUNT - NSYS_JAX_FAIL_COUNT )) - num_failed=$(( failed_tests + NSYS_JAX_FAIL_COUNT )) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${num_errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${num_passed}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${num_failed}" >> $GITHUB_OUTPUT - ARTIFACTS: | - # pytest-driven part - test-nsys-jax.log - pytest-report.jsonl - # nsys-jax logfiles - *process-*-execution.log - # nsys-jax output for the case that doesn't use nsys-jax-combine - 1-process-*-execution-0.zip - # nsys-jax-combine output/logfiles - *process*-*-execution.zip - *-execution-combine.log - secrets: inherit - - # test-nsys-jax generates several fresh .zip archive outputs by running nsys-jax with real GPU hardware; this test - # runs on a regular GitHub Actions runner and checks that offline post-processing works in an environment that does - # not already have nsys-jax installed - test-nsys-jax-archive: - needs: test-nsys-jax - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - strategy: - 
matrix: - os: [ubuntu-22.04, ubuntu-24.04, macOS-latest] - runs-on: ${{ matrix.os }} - steps: - - name: Download nsys-jax output .zip files - uses: actions/download-artifact@v4 - with: - name: nsys-jax-unit-test-A100 - - name: Extract archives and execute install scripts - run: | - pip install virtualenv # for install.sh - for zip in $(ls *.zip); do - ZIP="${PWD}/${zip}" - pushd $(mktemp -d) - unzip "${ZIP}" - ls -l - # TODO: verify this isn't needed, or make sure it isn't needed - chmod 755 install.sh - # Run the notebook with IPython, not Jupyter Lab, so it exits and prints something informative to stdout - # Skip executing Jupyter lab - NSYS_JAX_JUPYTER_EXECUTE_NOT_LAB=1 ./install.sh - popd - done - - test-nsys-jax-eks: - needs: build-jax - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - runs-on: eks - env: - JAX_DOCKER_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} - JOB_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-jax - POSTPROCESS_JOB_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-postprocess - TOKEN_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-token - steps: - - name: Check out the repository - uses: actions/checkout@v4 - - name: Login to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} - - name: Store GitHub Container Registry token as Kubernetes secret - run: | - kubectl create secret generic \ - ${{ github.run_id }}-${{ github.run_attempt }}-token \ - --from-file=.dockerconfigjson=$HOME/.docker/config.json \ - --type=kubernetes.io/dockerconfigjson - - name: Configure Kubernetes job - run: | - yq -i ea 'select(di == 0).spec.selector.job-name = strenv(JOB_NAME) - | select(di == 1).metadata.name = strenv(JOB_NAME) - | select(di == 1).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) - | select(di == 1).spec.template.spec.containers[0].image = strenv(JAX_DOCKER_IMAGE) - | select(di == 
1).spec.template.spec.containers[0].env[0].value = strenv(JOB_NAME)' \ - .github/eks-workflow-files/job.yml - git diff .github/eks-workflow-files/job.yml - - name: Submit Kubernetes job - run: kubectl apply -f .github/eks-workflow-files/job.yml - - name: Wait for Kubernetes job to start - run: | - while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ github.run_id }}-${{ github.run_attempt }}-jax --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do - sleep 2 - done - - name: Stream Kubernetes job output - run: kubectl logs --all-containers=true --all-pods=true --follow job/${{ github.run_id }}-${{ github.run_attempt }}-jax - # Clean up in case of errors as well as success - - name: Delete Kubernetes job - if: always() - run: kubectl delete job ${{ github.run_id }}-${{ github.run_attempt }}-jax - - name: Configure post-processing job - run: | - export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip" - yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME) - | .spec.template.spec.containers[].image = strenv(JAX_DOCKER_IMAGE) - | .spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) - | .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \ - .github/eks-workflow-files/post-process-job.yml - git diff .github/eks-workflow-files/post-process-job.yml - - name: Submit post-processing Kubernetes job - run: kubectl apply -f .github/eks-workflow-files/post-process-job.yml - - name: Wait for post-processing Kubernetes job to start - run: | - while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ github.run_id }}-${{ github.run_attempt }}-postprocess --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do - sleep 2 - done - - name: Stream post-processing Kubernetes job output - run: kubectl logs --all-containers=true --all-pods=true --follow job/${{ github.run_id }}-${{ github.run_attempt }}-postprocess - # Clean up in case of errors as well as success 
- - name: Delete post-processing Kubernetes job - if: always() - run: kubectl delete job ${{ github.run_id }}-${{ github.run_attempt }}-postprocess - - name: Delete GitHub Container Registry token - if: always() - run: kubectl delete secret ${{ github.run_id }}-${{ github.run_attempt }}-token - - # test-equinox: - # needs: build-equinox + # test-distribution: + # runs-on: ubuntu-22.04 + # strategy: + # matrix: + # TEST_SCRIPT: + # - extra-only-distribution.sh + # - mirror-only-distribution.sh + # - upstream-only-distribution.sh + # - local-patch-distribution.sh + # fail-fast: false + # steps: + # - name: Print environment variables + # run: env + # - name: Set git login for tests + # run: | + # git config --global user.email "jax@nvidia.com" + # git config --global user.name "JAX-Toolbox CI" + # - name: Check out the repository under ${GITHUB_WORKSPACE} + # uses: actions/checkout@v4 + # - name: Run integration test ${{ matrix.TEST_SCRIPT }} + # run: bash rosetta/tests/${{ matrix.TEST_SCRIPT }} + + # test-jax: + # needs: build-jax # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a # uses: ./.github/workflows/_test_unit.yaml # with: - # IMAGE: ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} - # TEST_NAME: equinox + # TEST_NAME: jax # EXECUTE: | - # docker run --shm-size=1g --gpus all ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} \ - # bash -exc -o pipefail \ - # 'pushd /opt/equinox/tests && pip install -r requirements.txt && pytest .' 
| tee test-equinox.log + # docker run -i --shm-size=1g --gpus all \ + # ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-backend-independent.log + # test-jax.sh -b backend-independent + # EOF + # docker run -i --shm-size=1g --gpus all \ + # ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee tee test-gpu.log + # nvidia-cuda-mps-control -d + # test-jax.sh -b gpu + # EOF # STATISTICS_SCRIPT: | - # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - # failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') - # passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') + # errors=$(cat test-*.log | grep -c 'ERROR:' || true) + # failed_tests=$(cat test-*.log | grep -c 'FAILED in' || true) + # passed_tests=$(cat test-*.log | grep -c 'PASSED in' || true) # total_tests=$((failed_tests + passed_tests)) # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT # ARTIFACTS: | - # test-equinox.log + # test-backend-independent.log + # test-gpu.log # secrets: inherit - test-te-multigpu: - needs: build-upstream-pax - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_te.yaml - with: - TE_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} - secrets: inherit + # test-nsys-jax: + # needs: build-jax + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # uses: ./.github/workflows/_test_unit.yaml + # with: + # TEST_NAME: nsys-jax + # EXECUTE: | + # set -o pipefail + # num_tests=0 + # num_failures=0 + # # Run the pytest-driven tests; failure is explicitly handled below so set +e to + # # avoid an early abort here. 
+ # set +e + # docker run -i --shm-size=1g --gpus all \ + # -v $PWD:/opt/output \ + # ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-nsys-jax.log + # # nsys-jax is already installed, this is just adding the test dependencies + # pip install pytest-reportlog nsys-jax[test] + # # abuse knowledge that nsys-jax is installed editable, so the tests exist + # test_path=$(python -c 'import importlib.resources; print(importlib.resources.files("nsys_jax").joinpath("..", "tests").resolve())') + # pytest --report-log=/opt/output/pytest-report.jsonl "${test_path}" + # EOF + # set -e + # GPUS_PER_NODE=$(nvidia-smi -L | grep -c '^GPU') + # for mode in 1-process 2-process process-per-gpu; do + # DOCKER="docker run --shm-size=1g --gpus all --env XLA_FLAGS=--xla_gpu_enable_command_buffer= --env XLA_PYTHON_CLIENT_MEM_FRACTION=0.9 -v ${PWD}:/opt/output ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}" + # if [[ "${mode}" == "1-process" ]]; then + # PROCESS_COUNT=1 + # ARGS="" + # elif [[ "${mode}" == "2-process" ]]; then + # # Use two processes with GPUS_PER_NODE/2 GPUs per process in the hope that + # # this will flush out more bugs than process-per-node or process-per-GPU. 
+ # PROCESS_COUNT=2 + # ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process $((GPUS_PER_NODE/2)) --distributed" + # else + # PROCESS_COUNT=${GPUS_PER_NODE} + # ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process 1 --distributed" + # fi + # for collection in full partial; do + # NSYS_JAX="nsys-jax" + # if [[ "${mode}" == "1-process" ]]; then + # # We will not run nsys-jax-combine, so run analyses eagerly + # NSYS_JAX+=" --nsys-jax-analysis communication --nsys-jax-analysis summary" + # fi + # NSYS_JAX+=" --output=/opt/output/${mode}-${collection}-execution-%q{RANK}" + # if [[ "${collection}" == "partial" ]]; then + # NSYS_JAX+=" --capture-range=cudaProfilerApi --capture-range-end=stop" + # # nvbug/4801401 + # NSYS_JAX+=" --sample=none" + # fi + # set +e + # ${DOCKER} parallel-launch RANK ${PROCESS_COUNT} ${NSYS_JAX} \ + # -- jax-nccl-test ${ARGS} |& tee ${mode}-${collection}-execution.log + # num_failures=$((num_failures + ($? 
!= 0))) + # set -e + # num_tests=$((num_tests + 1)) + # done + # if [[ "${mode}" != "1-process" ]]; then + # # Run nsys-jax-combine + # NSYS_JAX_COMBINE="nsys-jax-combine --analysis communication --analysis summary --output=/opt/output/${mode}-${collection}-execution.zip" + # for (( i=0; i> $GITHUB_ENV + # echo "NSYS_JAX_FAIL_COUNT=${num_failures}" >> $GITHUB_ENV + # exit $num_failures + # STATISTICS_SCRIPT: | + # summary_line=$(tail -n1 test-nsys-jax.log) + # num_errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + # passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) + # failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) + # total_tests=$(( NSYS_JAX_TEST_COUNT + passed_tests + failed_tests )) + # num_passed=$(( passed_tests + NSYS_JAX_TEST_COUNT - NSYS_JAX_FAIL_COUNT )) + # num_failed=$(( failed_tests + NSYS_JAX_FAIL_COUNT )) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${num_errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${num_passed}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${num_failed}" >> $GITHUB_OUTPUT + # ARTIFACTS: | + # # pytest-driven part + # test-nsys-jax.log + # pytest-report.jsonl + # # nsys-jax logfiles + # *process-*-execution.log + # # nsys-jax output for the case that doesn't use nsys-jax-combine + # 1-process-*-execution-0.zip + # # nsys-jax-combine output/logfiles + # *process*-*-execution.zip + # *-execution-combine.log + # secrets: inherit - test-upstream-t5x: - needs: build-upstream-t5x - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_upstream_t5x.yaml - with: - T5X_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }} - secrets: inherit + # # test-nsys-jax generates several fresh .zip archive outputs by 
running nsys-jax with real GPU hardware; this test + # # runs on a regular GitHub Actions runner and checks that offline post-processing works in an environment that does + # # not already have nsys-jax installed + # test-nsys-jax-archive: + # needs: test-nsys-jax + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # strategy: + # matrix: + # os: [ubuntu-22.04, ubuntu-24.04, macOS-latest] + # runs-on: ${{ matrix.os }} + # steps: + # - name: Download nsys-jax output .zip files + # uses: actions/download-artifact@v4 + # with: + # name: nsys-jax-unit-test-A100 + # - name: Extract archives and execute install scripts + # run: | + # pip install virtualenv # for install.sh + # for zip in $(ls *.zip); do + # ZIP="${PWD}/${zip}" + # pushd $(mktemp -d) + # unzip "${ZIP}" + # ls -l + # # TODO: verify this isn't needed, or make sure it isn't needed + # chmod 755 install.sh + # # Run the notebook with IPython, not Jupyter Lab, so it exits and prints something informative to stdout + # # Skip executing Jupyter lab + # NSYS_JAX_JUPYTER_EXECUTE_NOT_LAB=1 ./install.sh + # popd + # done + + # test-nsys-jax-eks: + # needs: build-jax + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # runs-on: eks + # env: + # JAX_DOCKER_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} + # JOB_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-jax + # POSTPROCESS_JOB_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-postprocess + # TOKEN_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-token + # steps: + # - name: Check out the repository + # uses: actions/checkout@v4 + # - name: Login to GitHub Container Registry + # uses: docker/login-action@v3 + # with: + # registry: ghcr.io + # username: ${{ github.repository_owner }} + # password: ${{ secrets.GITHUB_TOKEN }} + # - name: Store GitHub Container Registry token as Kubernetes secret + # run: | + # kubectl create secret generic \ + # ${{ github.run_id }}-${{ github.run_attempt }}-token \ + # 
--from-file=.dockerconfigjson=$HOME/.docker/config.json \ + # --type=kubernetes.io/dockerconfigjson + # - name: Configure Kubernetes job + # run: | + # yq -i ea 'select(di == 0).spec.selector.job-name = strenv(JOB_NAME) + # | select(di == 1).metadata.name = strenv(JOB_NAME) + # | select(di == 1).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) + # | select(di == 1).spec.template.spec.containers[0].image = strenv(JAX_DOCKER_IMAGE) + # | select(di == 1).spec.template.spec.containers[0].env[0].value = strenv(JOB_NAME)' \ + # .github/eks-workflow-files/job.yml + # git diff .github/eks-workflow-files/job.yml + # - name: Submit Kubernetes job + # run: kubectl apply -f .github/eks-workflow-files/job.yml + # - name: Wait for Kubernetes job to start + # run: | + # while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ github.run_id }}-${{ github.run_attempt }}-jax --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do + # sleep 2 + # done + # - name: Stream Kubernetes job output + # run: kubectl logs --all-containers=true --all-pods=true --follow job/${{ github.run_id }}-${{ github.run_attempt }}-jax + # # Clean up in case of errors as well as success + # - name: Delete Kubernetes job + # if: always() + # run: kubectl delete job ${{ github.run_id }}-${{ github.run_attempt }}-jax + # - name: Configure post-processing job + # run: | + # export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip" + # yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME) + # | .spec.template.spec.containers[].image = strenv(JAX_DOCKER_IMAGE) + # | .spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) + # | .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \ + # .github/eks-workflow-files/post-process-job.yml + # git diff .github/eks-workflow-files/post-process-job.yml + # - name: Submit post-processing Kubernetes job + # run: kubectl apply -f .github/eks-workflow-files/post-process-job.yml + # - name: 
Wait for post-processing Kubernetes job to start + # run: | + # while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ github.run_id }}-${{ github.run_attempt }}-postprocess --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do + # sleep 2 + # done + # - name: Stream post-processing Kubernetes job output + # run: kubectl logs --all-containers=true --all-pods=true --follow job/${{ github.run_id }}-${{ github.run_attempt }}-postprocess + # # Clean up in case of errors as well as success + # - name: Delete post-processing Kubernetes job + # if: always() + # run: kubectl delete job ${{ github.run_id }}-${{ github.run_attempt }}-postprocess + # - name: Delete GitHub Container Registry token + # if: always() + # run: kubectl delete secret ${{ github.run_id }}-${{ github.run_attempt }}-token + + # # test-equinox: + # # needs: build-equinox + # # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # # uses: ./.github/workflows/_test_unit.yaml + # # with: + # # IMAGE: ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} + # # TEST_NAME: equinox + # # EXECUTE: | + # # docker run --shm-size=1g --gpus all ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} \ + # # bash -exc -o pipefail \ + # # 'pushd /opt/equinox/tests && pip install -r requirements.txt && pytest .' 
| tee test-equinox.log + # # STATISTICS_SCRIPT: | + # # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + # # failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') + # # passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') + # # total_tests=$((failed_tests + passed_tests)) + # # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + # # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # # ARTIFACTS: | + # # test-equinox.log + # # secrets: inherit + + # test-te-multigpu: + # needs: build-upstream-pax + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # uses: ./.github/workflows/_test_te.yaml + # with: + # TE_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit - test-rosetta-t5x: - needs: build-rosetta-t5x - if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 - uses: ./.github/workflows/_test_t5x_rosetta.yaml - with: - T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }} - secrets: inherit + # test-upstream-t5x: + # needs: build-upstream-t5x + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # uses: ./.github/workflows/_test_upstream_t5x.yaml + # with: + # T5X_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit - test-triton: - needs: build-triton - if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 - uses: ./.github/workflows/_test_unit.yaml - with: - TEST_NAME: triton - EXECUTE: | - docker run -i --shm-size=1g --gpus all --volume $PWD:/output \ - ${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee test-triton.log - # autotuner tests from jax-triton now hit a triton code path that uses utilities from pytorch; this relies on - # actually having a CUDA backend for pytoch - 
pip install --no-deps torch - python /opt/jax-triton/tests/triton_call_test.py --xml_output_file /output/triton_test.xml - EOF - STATISTICS_SCRIPT: | - curl -L -o yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg --print-architecture) && chmod 777 yq; - total_tests=$(./yq '.testsuites."+@tests"' triton_test.xml) - errors=$(./yq '.testsuites."+@errors"' triton_test.xml) - failed_tests=$(./yq '.testsuites."+@failures"' triton_test.xml) - passed_tests=$((total_tests - errors - failed_tests)) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - ARTIFACTS: | - test-triton.log - secrets: inherit + # test-rosetta-t5x: + # needs: build-rosetta-t5x + # if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 + # uses: ./.github/workflows/_test_t5x_rosetta.yaml + # with: + # T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit - test-levanter: - needs: build-levanter - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_unit.yaml - with: - TEST_NAME: levanter - EXECUTE: | - docker run -i --gpus all --shm-size=1g \ - ${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee test-levanter.log - pip install flake8 pytest soundfile librosa - PYTHONPATH=/opt/levanter/tests:$PYTHONPATH pytest /opt/levanter/tests -m "not entry and not slow and not ray" - EOF - STATISTICS_SCRIPT: | - summary_line=$(tail -n1 test-levanter.log) - errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') - passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') - total_tests=$((failed_tests + passed_tests)) - echo 
"TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - ARTIFACTS: | - test-levanter.log - secrets: inherit + # test-triton: + # needs: build-triton + # if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 + # uses: ./.github/workflows/_test_unit.yaml + # with: + # TEST_NAME: triton + # EXECUTE: | + # docker run -i --shm-size=1g --gpus all --volume $PWD:/output \ + # ${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-triton.log + # # autotuner tests from jax-triton now hit a triton code path that uses utilities from pytorch; this relies on + # # actually having a CUDA backend for pytoch + # pip install --no-deps torch + # python /opt/jax-triton/tests/triton_call_test.py --xml_output_file /output/triton_test.xml + # EOF + # STATISTICS_SCRIPT: | + # curl -L -o yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg --print-architecture) && chmod 777 yq; + # total_tests=$(./yq '.testsuites."+@tests"' triton_test.xml) + # errors=$(./yq '.testsuites."+@errors"' triton_test.xml) + # failed_tests=$(./yq '.testsuites."+@failures"' triton_test.xml) + # passed_tests=$((total_tests - errors - failed_tests)) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # ARTIFACTS: | + # test-triton.log + # secrets: inherit - test-te: - needs: build-upstream-pax - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_unit.yaml - with: - TEST_NAME: te - EXECUTE: | - docker run -i --gpus all --shm-size=1g -v $PWD:/log \ - ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee test-te.log - pip install pytest-reportlog - pytest --report-log=log/pytest-report.jsonl 
${SRC_PATH_TRANSFORMER_ENGINE}/tests/jax - EOF - STATISTICS_SCRIPT: | - summary_line=$(tail -n1 test-te.log) - errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) - failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) - total_tests=$((failed_tests + passed_tests)) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - TIMEOUT_MINUTES: 120 - ARTIFACTS: | - test-te.log - pytest-report.jsonl - secrets: inherit + # test-levanter: + # needs: build-levanter + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # uses: ./.github/workflows/_test_unit.yaml + # with: + # TEST_NAME: levanter + # EXECUTE: | + # docker run -i --gpus all --shm-size=1g \ + # ${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-levanter.log + # pip install flake8 pytest soundfile librosa + # PYTHONPATH=/opt/levanter/tests:$PYTHONPATH pytest /opt/levanter/tests -m "not entry and not slow and not ray" + # EOF + # STATISTICS_SCRIPT: | + # summary_line=$(tail -n1 test-levanter.log) + # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + # failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') + # passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') + # total_tests=$((failed_tests + passed_tests)) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # echo 
"FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # ARTIFACTS: | + # test-levanter.log + # secrets: inherit - test-upstream-pax: - needs: build-upstream-pax - if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 - uses: ./.github/workflows/_test_upstream_pax.yaml - with: - PAX_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} - secrets: inherit + # test-te: + # needs: build-upstream-pax + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # uses: ./.github/workflows/_test_unit.yaml + # with: + # TEST_NAME: te + # EXECUTE: | + # docker run -i --gpus all --shm-size=1g -v $PWD:/log \ + # ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-te.log + # pip install pytest-reportlog + # pytest --report-log=log/pytest-report.jsonl ${SRC_PATH_TRANSFORMER_ENGINE}/tests/jax + # EOF + # STATISTICS_SCRIPT: | + # summary_line=$(tail -n1 test-te.log) + # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + # passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) + # failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) + # total_tests=$((failed_tests + passed_tests)) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # TIMEOUT_MINUTES: 120 + # ARTIFACTS: | + # test-te.log + # pytest-report.jsonl + # secrets: inherit - test-rosetta-pax: - needs: build-rosetta-pax - if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 - uses: ./.github/workflows/_test_pax_rosetta.yaml - with: - PAX_IMAGE: ${{ needs.build-rosetta-pax.outputs.DOCKER_TAG_FINAL }} - secrets: inherit + # test-upstream-pax: + # needs: 
build-upstream-pax + # if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 + # uses: ./.github/workflows/_test_upstream_pax.yaml + # with: + # PAX_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit + + # test-rosetta-pax: + # needs: build-rosetta-pax + # if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 + # uses: ./.github/workflows/_test_pax_rosetta.yaml + # with: + # PAX_IMAGE: ${{ needs.build-rosetta-pax.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit - test-gemma: - needs: build-gemma - uses: ./.github/workflows/_test_unit.yaml - if: inputs.ARCHITECTURE == 'amd64' - with: - TEST_NAME: gemma - EXECUTE: | - docker run --shm-size=1g --gpus all ${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }} \ - bash -ec \ - "cd /opt/gemma && pip install -e .[dev] && pytest ." | tee test-gemma.log - STATISTICS_SCRIPT: | - summary_line=$(tail -n1 test-gemma.log) - errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') - passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') - total_tests=$((failed_tests + passed_tests)) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - ARTIFACTS: | - test-gemma.log - secrets: inherit + # test-gemma: + # needs: build-gemma + # uses: ./.github/workflows/_test_unit.yaml + # if: inputs.ARCHITECTURE == 'amd64' + # with: + # TEST_NAME: gemma + # EXECUTE: | + # docker run --shm-size=1g --gpus all ${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }} \ + # bash -ec \ + # "cd /opt/gemma && pip install -e .[dev] && pytest ." 
| tee test-gemma.log + # STATISTICS_SCRIPT: | + # summary_line=$(tail -n1 test-gemma.log) + # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + # failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') + # passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') + # total_tests=$((failed_tests + passed_tests)) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # ARTIFACTS: | + # test-gemma.log + # secrets: inherit - test-maxtext: - needs: build-maxtext - if: inputs.ARCHITECTURE == 'amd64' # no arm64 gpu runners - uses: ./.github/workflows/_test_maxtext.yaml - with: - MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }} - secrets: inherit + # test-maxtext: + # needs: build-maxtext + # if: inputs.ARCHITECTURE == 'amd64' # no arm64 gpu runners + # uses: ./.github/workflows/_test_maxtext.yaml + # with: + # MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit test-axlearn: needs: build-axlearn - if: inputs.ARCHITECTURE == 'amd64' # no arm64 gpu runners + if: inputs.ARCHITECTURE == 'amd64' uses: ./.github/workflows/_test_unit.yaml with: # fix the arguments below TEST_NAME: axlearn From a3b8f266dcaef5d266622545c5e323f679b4ce2d Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 10 Feb 2025 14:12:36 +0000 Subject: [PATCH 08/89] fix requirements --- .github/container/Dockerfile.axlearn | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/container/Dockerfile.axlearn b/.github/container/Dockerfile.axlearn index 1ec98c86b..c09b2c08c 100644 --- a/.github/container/Dockerfile.axlearn +++ b/.github/container/Dockerfile.axlearn @@ -16,7 +16,7 @@ EOF RUN <<"EOF" bash -ex echo "-e ${SRC_PATH_AXLEARN}" > 
/opt/pip-tools.d/requirements-axlearn.in - echo <> /opt/pip-tools.d/requirements-axlearn.in + cat <> /opt/pip-tools.d/requirements-axlearn.in aqtp==0.8.2 einops==0.8.0 nltk==3.7 From 94054fd34bfb73e91ad13ca794090cd337b25c21 Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 10 Feb 2025 14:15:00 +0000 Subject: [PATCH 09/89] fix installation from pip --- .github/container/test-axlearn.sh | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/.github/container/test-axlearn.sh b/.github/container/test-axlearn.sh index e59933129..2ffda9c4d 100755 --- a/.github/container/test-axlearn.sh +++ b/.github/container/test-axlearn.sh @@ -10,8 +10,6 @@ usage() { echo " OPTIONS DESCRIPTION" echo " -d, --directory DIR Directory to run tests in." echo " Default: 'axlearn/axlearn/common'." - echo " -p, --packages PACKAGES Space-separated list of packages to install via pip." - echo " Default: 'attrs scikit-learn torch evaluate transformers timm wandb grain'." echo " -c, --cuda-devices DEVICES CUDA devices to use. Default: '0,1,2,3,4,5,6,7'." echo " -t, --test-files FILES Pattern for test files to run." echo " Default: '*_test.py'." @@ -28,7 +26,7 @@ TEST_FILES=() OUTPUT_DIRECTORY='' # Parse args -args=$(getopt -o d:p:c:t:o:h --long directory:,packages:,cuda-devices:,test-files:,output:,help -- "$@") +args=$(getopt -o d:p:c:t:o:h --long directory:,cuda-devices:,test-files:,output:,help -- "$@") if [ $? 
-ne 0 ]; then usage exit 1 @@ -42,10 +40,6 @@ while true; do DIR="$2" shift 2 ;; - -p|--packages) - PACKAGES="$2" - shift 2 - ;; -c|--cuda-devices) CUDA_DEVICES="$2" shift 2 @@ -88,7 +82,6 @@ mkdir -p "${LOG_DIRECTORY}" # Print out config for sanity check echo "Configuration:" echo " Directory: $DIR" -echo " Packages: $PACKAGES" echo " CUDA Devices: $CUDA_DEVICES" if [ "${#TEST_FILES[@]}" -gt 0 ]; then echo " Test Files:" @@ -99,8 +92,7 @@ else echo " Test Files Pattern: '*_test.py' (default)" fi echo " Output Directory: $OUTPUT_DIRECTORY" -echo "" - +echo "" cd "$DIR" || exit 1 From b9e893c52aa97c93903eae7fe8ec2cff30eae61f Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 10 Feb 2025 15:33:17 +0000 Subject: [PATCH 10/89] remove the nvidia-cunn package --- .github/container/Dockerfile.axlearn | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/container/Dockerfile.axlearn b/.github/container/Dockerfile.axlearn index c09b2c08c..6a0d3ac76 100644 --- a/.github/container/Dockerfile.axlearn +++ b/.github/container/Dockerfile.axlearn @@ -38,7 +38,6 @@ transformers timm wandb grain -nvidia-cudnn-cu12==9.7.0.66 REQUIREMENTS EOF From 4088e2ffaaf659414033f0013e30445cb5e4eeb8 Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 10 Feb 2025 18:37:46 +0000 Subject: [PATCH 11/89] fix input for tests --- .github/workflows/_ci.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 514545edb..3ea68ced4 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -703,7 +703,7 @@ jobs: docker run -i --shm-size=1g --gpus all \ ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }} \ bash <<"EOF" |& tee test-backend-independent.log - test-axlearn.sh --directory $pwd --output /opt/output/output.log --test-files /opt/axlearn/axlearn/common/*_test.py + test-axlearn.sh --directory "." 
--output "/opt/output/" --test-files "/opt/axlearn/axlearn/common/*_test.py" EOF STATISTICS_SCRIPT: | echo "Todo" From 15781cbed8854ea86f100275a750041453da5cd2 Mon Sep 17 00:00:00 2001 From: Steboss Date: Tue, 11 Feb 2025 12:03:48 +0000 Subject: [PATCH 12/89] fix test and create output --- .github/container/test-axlearn.sh | 19 ++----------------- .github/workflows/_ci.yaml | 10 +++++++++- 2 files changed, 11 insertions(+), 18 deletions(-) diff --git a/.github/container/test-axlearn.sh b/.github/container/test-axlearn.sh index 2ffda9c4d..57b9d7080 100755 --- a/.github/container/test-axlearn.sh +++ b/.github/container/test-axlearn.sh @@ -162,19 +162,17 @@ for test_file in "${expanded_test_files[@]}"; do done # Initialize counters -errors=0 failures=0 passed=0 SUMMARY_FILE="${OUTPUT_DIRECTORY}/summary.txt" -for test_file in "${final_test_files[@]:0:10}"; do +for test_file in "${final_test_files[@]}"; do echo "Running: ${test_file}" # Ensure the test file exists if [ ! -f "${test_file}" ]; then echo "${test_file}: NOT FOUND" >> "${SUMMARY_FILE}" echo "Test file not found: ${test_file}" - ((errors++)) continue fi log_file_name=$(echo "${test_file%.py}" | sed 's/\//__/g').log @@ -194,18 +192,5 @@ for test_file in "${final_test_files[@]:0:10}"; do echo "" done -echo $errors echo $passed -echo $failures - -# e.g. 
of output summary -#/opt/axlearn/axlearn/common/adapter_flax_test.py: PASSED -#/opt/axlearn/axlearn/common/attention_bias_test.py: PASSED -#/opt/axlearn/axlearn/common/bert_test.py: FAILED (Exit code: 1) -#/opt/axlearn/axlearn/common/causal_lm_test.py: FAILED (Exit code: 1) -#/opt/axlearn/axlearn/common/checkpointer_orbax_test.py: PASSED -#/opt/axlearn/axlearn/common/checkpointer_test.py: PASSED -#/opt/axlearn/axlearn/common/compiler_options_test.py: PASSED -#/opt/axlearn/axlearn/common/config_test.py: PASSED -#/opt/axlearn/axlearn/common/conformer_test.py: FAILED (Exit code: 1) -#/opt/axlearn/axlearn/common/convolution_test.py: FAILED (Exit code: 1) +echo $failures \ No newline at end of file diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 3ea68ced4..563a1aa2c 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -706,7 +706,15 @@ jobs: test-axlearn.sh --directory "." --output "/opt/output/" --test-files "/opt/axlearn/axlearn/common/*_test.py" EOF STATISTICS_SCRIPT: | - echo "Todo" + # Parse the summary.txt file to count passed/failed/error tests + # Adjust greps if your output format changes. 
+ passed_tests=$(grep -c ": PASSED" /opt/output/summary.txt || true) + failed_tests=$(grep -c ": FAILED" /opt/output/summary.txt || true) + total_tests=$((failed_tests + passed_tests)) + + echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT ARTIFACTS: | test-backend-independent.log secrets: inherit \ No newline at end of file From 031cfb0119adfcbfeb488a548611e682f52f5dc0 Mon Sep 17 00:00:00 2001 From: Steboss Date: Tue, 11 Feb 2025 16:18:55 +0000 Subject: [PATCH 13/89] fix requirements --- .github/container/Dockerfile.axlearn | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/.github/container/Dockerfile.axlearn b/.github/container/Dockerfile.axlearn index 6a0d3ac76..88cbc458c 100644 --- a/.github/container/Dockerfile.axlearn +++ b/.github/container/Dockerfile.axlearn @@ -23,21 +23,7 @@ nltk==3.7 portpicker==1.6.0 seqio==0.0.18 protobuf==3.20.3 -tensorflow==2.18.0 -tensorflow-datasets==4.9.7 -tensorflow-io==0.37.1 -tensorflow-io-gcs-filesystem==0.37.1 -tensorflow-metadata==1.13.1 -tensorflow-probability==0.24.0 -tensorflow-text==2.18.1 pytest>=7.4.3 -scikit-learn -torch -evaluate -transformers -timm -wandb -grain REQUIREMENTS EOF From 1fce714bbe35fbe3257d0a4a710643fcd666e7fc Mon Sep 17 00:00:00 2001 From: Steboss Date: Thu, 13 Feb 2025 12:59:13 +0000 Subject: [PATCH 14/89] setup for running axlearn tests on k8s --- .../axlearn/axlearn-job.yml | 63 ++++++++++ .../axlearn/axlearn-postprocess-job.yml | 45 +++++++ .github/workflows/_ci.yaml | 119 ++++++++++++++---- 3 files changed, 202 insertions(+), 25 deletions(-) create mode 100644 .github/eks-workflow-files/axlearn/axlearn-job.yml create mode 100644 .github/eks-workflow-files/axlearn/axlearn-postprocess-job.yml diff --git a/.github/eks-workflow-files/axlearn/axlearn-job.yml b/.github/eks-workflow-files/axlearn/axlearn-job.yml new file mode 100644 index 000000000..e5fbca44f --- /dev/null +++ 
b/.github/eks-workflow-files/axlearn/axlearn-job.yml @@ -0,0 +1,63 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: PLACEHOLDER + labels: + kueue.x-k8s.io/queue-name: p5-queue +spec: + completions: 1 + parallelism: 1 + template: + spec: + restartPolicy: Never + containers: + - name: axlearn + image: PLACEHOLDER + command: + - bash + - -exo + - pipefail + - -c + - | + # Example test command; adapted from your Docker run snippet + # Writes logs to /opt/output/test-backend-independent.log + # Also writes a summary file to /opt/output/summary.txt + test-axlearn.sh \ + --directory "." \ + --output "/opt/output/" \ + --test-files "/opt/axlearn/axlearn/common/*_test.py" + + # Wait a moment to ensure logs are flushed + sync + resources: + limits: + nvidia.com/gpu: 8 + volumeMounts: + - name: output + mountPath: /opt/output + + - name: upload + image: amazon/aws-cli + command: + - sh + - -c + - | + # Wait for the summary file to appear + while [ ! -f /opt/output/summary.txt ]; do + sleep 1 + done + # Also wait for the main log + while [ ! 
-f /opt/output/test-backend-independent.log ]; do + sleep 1 + done + # Now upload to your S3 bucket + aws s3 cp /opt/output/summary.txt s3://jax-toolbox-eks-output/ + aws s3 cp /opt/output/test-backend-independent.log s3://jax-toolbox-eks-output/ + volumeMounts: + - name: output + mountPath: /opt/output + imagePullSecrets: + - name: PLACEHOLDER + volumes: + - name: output + emptyDir: {} diff --git a/.github/eks-workflow-files/axlearn/axlearn-postprocess-job.yml b/.github/eks-workflow-files/axlearn/axlearn-postprocess-job.yml new file mode 100644 index 000000000..b6404a559 --- /dev/null +++ b/.github/eks-workflow-files/axlearn/axlearn-postprocess-job.yml @@ -0,0 +1,45 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: PLACEHOLDER +spec: + template: + spec: + restartPolicy: Never + initContainers: + - name: download + image: amazon/aws-cli + command: + - sh + - -c + - | + aws s3 cp s3://jax-toolbox-eks-output/summary.txt /opt/output/ + aws s3 cp s3://jax-toolbox-eks-output/test-backend-independent.log /opt/output/ + volumeMounts: + - mountPath: /opt/output + name: output + containers: + - name: parse-axlearn + image: ubuntu:22.04 + command: + - bash + - -exo + - pipefail + - -c + - | + if [ ! -f /opt/output/summary.txt ]; then + echo "summary.txt not found!" 
+ exit 1 + fi + + passed_tests=$(grep -c ": PASSED" /opt/output/summary.txt || true) + failed_tests=$(grep -c ": FAILED" /opt/output/summary.txt || true) + total_tests=$((failed_tests + passed_tests)) + volumeMounts: + - mountPath: /opt/output + name: output + imagePullSecrets: + - name: PLACEHOLDER + volumes: + - name: output + emptyDir: {} diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 563a1aa2c..9686170ce 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -693,28 +693,97 @@ jobs: # MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }} # secrets: inherit - test-axlearn: - needs: build-axlearn - if: inputs.ARCHITECTURE == 'amd64' - uses: ./.github/workflows/_test_unit.yaml - with: # fix the arguments below - TEST_NAME: axlearn - EXECUTE: | - docker run -i --shm-size=1g --gpus all \ - ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee test-backend-independent.log - test-axlearn.sh --directory "." --output "/opt/output/" --test-files "/opt/axlearn/axlearn/common/*_test.py" - EOF - STATISTICS_SCRIPT: | - # Parse the summary.txt file to count passed/failed/error tests - # Adjust greps if your output format changes. 
- passed_tests=$(grep -c ": PASSED" /opt/output/summary.txt || true) - failed_tests=$(grep -c ": FAILED" /opt/output/summary.txt || true) - total_tests=$((failed_tests + passed_tests)) - - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - ARTIFACTS: | - test-backend-independent.log - secrets: inherit \ No newline at end of file + # test-axlearn-slurm: + # needs: build-axlearn + # if: inputs.ARCHITECTURE == 'amd64' + # uses: ./.github/workflows/_test_unit.yaml + # with: # fix the arguments below + # TEST_NAME: axlearn + # EXECUTE: | + # docker run -i --shm-size=1g --gpus all \ + # ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-backend-independent.log + # test-axlearn.sh --directory "." --output "/opt/output/" --test-files "/opt/axlearn/axlearn/common/*_test.py" + # EOF + # STATISTICS_SCRIPT: | + # # Parse the summary.txt file to count passed/failed/error tests + # # Adjust greps if your output format changes. 
+ # passed_tests=$(grep -c ": PASSED" /opt/output/summary.txt || true) + # failed_tests=$(grep -c ": FAILED" /opt/output/summary.txt || true) + # total_tests=$((failed_tests + passed_tests)) + + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # ARTIFACTS: | + # test-backend-independent.log + # secrets: inherit + +# TODO WE CAN CREATE A RESUABLE ACTION HERE +# FIX everything with env.something +test-axlearn-eks: + needs: build-axlearn + if: inputs.ARCHITECTURE == 'amd64' + runs-on: eks + env: + AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }} + JOB_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-axlearn + POSTPROCESS_JOB_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-axlearn-postprocess + TOKEN_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-token + steps: + - name: Check out the repository + uses: actions/checkout@v4 + - name: Login to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + - name: Store GitHub Container Registry token as Kubernetes secret + run: | + kubectl create secret generic \ + ${{ github.run_id }}-${{ github.run_attempt }}-token \ + --from-file=.dockerconfigjson=$HOME/.docker/config.json \ + --type=kubernetes.io/dockerconfigjson + - name: Configure axlearn test job + run: | + # Replace placeholders in axlearn-job.yml with environment variables + yq -i ea 'select(di == 0).metadata.name = strenv(JOB_NAME) + | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE) + | select(di == 0).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)' \ + .github/eks-workflow-files/axlearn-job.yml + git diff .github/eks-workflow-files/axlearn-job.yml + - name: Submit axlearn test job + run: kubectl apply -f 
.github/eks-workflow-files/axlearn/axlearn-job.yml + - name: Wait for axlearn test job to start + run: | + while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ github.run_id }}-${{ github.run_attempt }}-axlearn --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do + sleep 10 + done + - name: Stream axlearn test job output + run: kubectl logs --all-containers=true --all-pods=true --follow job/${{ github.run_id }}-${{ github.run_attempt }}-axlearn + - name: Delete axlearn test job + if: always() + run: kubectl delete job ${{ env.JOB_NAME }} + - name: Configure axlearn post-processing job + run: | + yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME) + | .spec.template.spec.containers[].imagePullSecrets[].name = strenv(TOKEN_NAME) + ' \ + .github/eks-workflow-files/axlearn/axlearn-postprocess-job.yml + git diff .github/eks-workflow-files/axlearn-postprocess-job.yml + - name: Submit axlearn post-processing job + run: kubectl apply -f .github/eks-workflow-files/axlearn-postprocess-job.yml + - name: Wait for axlearn post-processing job to start + run: | + while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ github.run_id }}-${{ github.run_attempt }}-axlearn-postprocess --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do + sleep 10 + done + - name: Stream axlearn post-processing job output + run: kubectl logs --all-containers=true --all-pods=true --follow job/${{ github.run_id }}-${{ github.run_attempt }}-axlearn-postprocess + - name: Delete axlearn post-processing job + if: always() + run: kubectl delete job ${{ github.run_id }}-${{ github.run_attempt }}-axlearn-postprocess + - name: Delete GitHub Container Registry token + if: always() + run: kubectl delete secret ${{ github.run_id }}-${{ github.run_attempt }}-token From 89d388ca1237ea599be0b1b2671670418ccf0067 Mon Sep 17 00:00:00 2001 From: Steboss Date: Thu, 13 Feb 2025 13:05:19 +0000 Subject: [PATCH 15/89] fix 
indentation --- .github/workflows/_ci.yaml | 136 ++++++++++++++++++------------------- 1 file changed, 68 insertions(+), 68 deletions(-) diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 9686170ce..6e277fcec 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -719,71 +719,71 @@ jobs: # test-backend-independent.log # secrets: inherit -# TODO WE CAN CREATE A RESUABLE ACTION HERE -# FIX everything with env.something -test-axlearn-eks: - needs: build-axlearn - if: inputs.ARCHITECTURE == 'amd64' - runs-on: eks - env: - AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }} - JOB_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-axlearn - POSTPROCESS_JOB_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-axlearn-postprocess - TOKEN_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-token - steps: - - name: Check out the repository - uses: actions/checkout@v4 - - name: Login to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} - - name: Store GitHub Container Registry token as Kubernetes secret - run: | - kubectl create secret generic \ - ${{ github.run_id }}-${{ github.run_attempt }}-token \ - --from-file=.dockerconfigjson=$HOME/.docker/config.json \ - --type=kubernetes.io/dockerconfigjson - - name: Configure axlearn test job - run: | - # Replace placeholders in axlearn-job.yml with environment variables - yq -i ea 'select(di == 0).metadata.name = strenv(JOB_NAME) - | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE) - | select(di == 0).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)' \ - .github/eks-workflow-files/axlearn-job.yml - git diff .github/eks-workflow-files/axlearn-job.yml - - name: Submit axlearn test job - run: kubectl apply -f .github/eks-workflow-files/axlearn/axlearn-job.yml - - name: Wait for axlearn test job to 
start - run: | - while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ github.run_id }}-${{ github.run_attempt }}-axlearn --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do - sleep 10 - done - - name: Stream axlearn test job output - run: kubectl logs --all-containers=true --all-pods=true --follow job/${{ github.run_id }}-${{ github.run_attempt }}-axlearn - - name: Delete axlearn test job - if: always() - run: kubectl delete job ${{ env.JOB_NAME }} - - name: Configure axlearn post-processing job - run: | - yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME) - | .spec.template.spec.containers[].imagePullSecrets[].name = strenv(TOKEN_NAME) - ' \ - .github/eks-workflow-files/axlearn/axlearn-postprocess-job.yml - git diff .github/eks-workflow-files/axlearn-postprocess-job.yml - - name: Submit axlearn post-processing job - run: kubectl apply -f .github/eks-workflow-files/axlearn-postprocess-job.yml - - name: Wait for axlearn post-processing job to start - run: | - while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ github.run_id }}-${{ github.run_attempt }}-axlearn-postprocess --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do - sleep 10 - done - - name: Stream axlearn post-processing job output - run: kubectl logs --all-containers=true --all-pods=true --follow job/${{ github.run_id }}-${{ github.run_attempt }}-axlearn-postprocess - - name: Delete axlearn post-processing job - if: always() - run: kubectl delete job ${{ github.run_id }}-${{ github.run_attempt }}-axlearn-postprocess - - name: Delete GitHub Container Registry token - if: always() - run: kubectl delete secret ${{ github.run_id }}-${{ github.run_attempt }}-token + # TODO WE CAN CREATE A RESUABLE ACTION HERE + # FIX everything with env.something + test-axlearn-eks: + needs: build-axlearn + if: inputs.ARCHITECTURE == 'amd64' + runs-on: eks + env: + AXLEARN_DOCKER_IMAGE: ${{ 
needs.build-axlearn.outputs.DOCKER_TAG_FINAL }} + JOB_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-axlearn + POSTPROCESS_JOB_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-axlearn-postprocess + TOKEN_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-token + steps: + - name: Check out the repository + uses: actions/checkout@v4 + - name: Login to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + - name: Store GitHub Container Registry token as Kubernetes secret + run: | + kubectl create secret generic \ + ${{ github.run_id }}-${{ github.run_attempt }}-token \ + --from-file=.dockerconfigjson=$HOME/.docker/config.json \ + --type=kubernetes.io/dockerconfigjson + - name: Configure axlearn test job + run: | + # Replace placeholders in axlearn-job.yml with environment variables + yq -i ea 'select(di == 0).metadata.name = strenv(JOB_NAME) + | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE) + | select(di == 0).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)' \ + .github/eks-workflow-files/axlearn-job.yml + git diff .github/eks-workflow-files/axlearn-job.yml + - name: Submit axlearn test job + run: kubectl apply -f .github/eks-workflow-files/axlearn/axlearn-job.yml + - name: Wait for axlearn test job to start + run: | + while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ github.run_id }}-${{ github.run_attempt }}-axlearn --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do + sleep 10 + done + - name: Stream axlearn test job output + run: kubectl logs --all-containers=true --all-pods=true --follow job/${{ github.run_id }}-${{ github.run_attempt }}-axlearn + - name: Delete axlearn test job + if: always() + run: kubectl delete job ${{ env.JOB_NAME }} + - name: Configure axlearn post-processing job + run: | + yq -i '.metadata.name = 
strenv(POSTPROCESS_JOB_NAME) + | .spec.template.spec.containers[].imagePullSecrets[].name = strenv(TOKEN_NAME) + ' \ + .github/eks-workflow-files/axlearn/axlearn-postprocess-job.yml + git diff .github/eks-workflow-files/axlearn-postprocess-job.yml + - name: Submit axlearn post-processing job + run: kubectl apply -f .github/eks-workflow-files/axlearn-postprocess-job.yml + - name: Wait for axlearn post-processing job to start + run: | + while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ github.run_id }}-${{ github.run_attempt }}-axlearn-postprocess --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do + sleep 10 + done + - name: Stream axlearn post-processing job output + run: kubectl logs --all-containers=true --all-pods=true --follow job/${{ github.run_id }}-${{ github.run_attempt }}-axlearn-postprocess + - name: Delete axlearn post-processing job + if: always() + run: kubectl delete job ${{ github.run_id }}-${{ github.run_attempt }}-axlearn-postprocess + - name: Delete GitHub Container Registry token + if: always() + run: kubectl delete secret ${{ github.run_id }}-${{ github.run_attempt }}-token From 6c47cf57b372fd16ea3b9bcf7333facd46c4e4b3 Mon Sep 17 00:00:00 2001 From: Steboss Date: Thu, 13 Feb 2025 14:25:00 +0000 Subject: [PATCH 16/89] what an error --- .github/workflows/_ci.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 6e277fcec..c3e0b3b2c 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -751,8 +751,8 @@ jobs: yq -i ea 'select(di == 0).metadata.name = strenv(JOB_NAME) | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE) | select(di == 0).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)' \ - .github/eks-workflow-files/axlearn-job.yml - git diff .github/eks-workflow-files/axlearn-job.yml + .github/eks-workflow-files/axlearn/axlearn-job.yml + 
git diff .github/eks-workflow-files/axlearn/axlearn-job.yml - name: Submit axlearn test job run: kubectl apply -f .github/eks-workflow-files/axlearn/axlearn-job.yml - name: Wait for axlearn test job to start @@ -771,9 +771,9 @@ jobs: | .spec.template.spec.containers[].imagePullSecrets[].name = strenv(TOKEN_NAME) ' \ .github/eks-workflow-files/axlearn/axlearn-postprocess-job.yml - git diff .github/eks-workflow-files/axlearn-postprocess-job.yml + git diff .github/eks-workflow-files/axlearn/axlearn-postprocess-job.yml - name: Submit axlearn post-processing job - run: kubectl apply -f .github/eks-workflow-files/axlearn-postprocess-job.yml + run: kubectl apply -f .github/eks-workflow-files/axlearn/axlearn-postprocess-job.yml - name: Wait for axlearn post-processing job to start run: | while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ github.run_id }}-${{ github.run_attempt }}-axlearn-postprocess --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do From 44ffb6ef2c6e34db359bfd72573e13ab1967c81a Mon Sep 17 00:00:00 2001 From: Steboss Date: Thu, 13 Feb 2025 16:01:44 +0000 Subject: [PATCH 17/89] add the k8s option --- .github/container/test-axlearn.sh | 53 +++++++++++++------ .../axlearn/axlearn-job.yml | 3 +- 2 files changed, 39 insertions(+), 17 deletions(-) diff --git a/.github/container/test-axlearn.sh b/.github/container/test-axlearn.sh index 57b9d7080..a82cdefe1 100755 --- a/.github/container/test-axlearn.sh +++ b/.github/container/test-axlearn.sh @@ -15,6 +15,7 @@ usage() { echo " Default: '*_test.py'." echo " -o, --output DIRECTORY Output directory for logs and summary." echo " Default: 'test_runs/'." + echo " -k, --k8s Whether to run on a Kubernetes cluster." echo " -h, --help Show this help message and exit." 
exit 1 } @@ -24,45 +25,57 @@ DIR='axlearn/axlearn/common' CUDA_DEVICES='0,1,2,3,4,5,6,7' TEST_FILES=() OUTPUT_DIRECTORY='' +K8S=false -# Parse args -args=$(getopt -o d:p:c:t:o:h --long directory:,cuda-devices:,test-files:,output:,help -- "$@") -if [ $? -ne 0 ]; then - usage - exit 1 -fi - -eval set -- "$args" - -while true; do - case "$1" in +# Parse args manually +while [[ $# -gt 0 ]]; do + key="$1" + case $key in -d|--directory) + if [[ -z "$2" ]]; then + echo "Error: --directory requires an argument." + usage + fi DIR="$2" shift 2 ;; -c|--cuda-devices) + if [[ -z "$2" ]]; then + echo "Error: --cuda-devices requires an argument." + usage + fi CUDA_DEVICES="$2" shift 2 ;; -t|--test-files) shift # Collect all arguments until the next option (starting with '-') + if [[ $# -eq 0 ]]; then + echo "Error: --test-files requires at least one file pattern." + usage + fi + echo "Option -t|--test-files with arguments:" while [[ $# -gt 0 && ! "$1" =~ ^- ]]; do + echo " $1" TEST_FILES+=("$1") shift done ;; -o|--output) + if [[ -z "$2" ]]; then + echo "Error: --output requires an argument." + usage + fi OUTPUT_DIRECTORY="$2" shift 2 ;; + -k|--k8s) + K8S=true + shift + ;; -h|--help) usage ;; - --) - shift - break - ;; *) echo "Unknown option: $1" usage @@ -70,7 +83,7 @@ while true; do esac done -# TODO double check what's the best choice + if [ -z "$OUTPUT_DIRECTORY" ]; then timestamp=$(date +%Y%m%d_%H%M%S) OUTPUT_DIRECTORY="test_runs/${timestamp}" @@ -92,8 +105,10 @@ else echo " Test Files Pattern: '*_test.py' (default)" fi echo " Output Directory: $OUTPUT_DIRECTORY" +echo " Kubernetes mode: $K8S" echo "" + cd "$DIR" || exit 1 # Set CUDA devices @@ -102,6 +117,12 @@ echo "Using CUDA devices: $CUDA_VISIBLE_DEVICES" echo "Running tests..." +# If we are on Kubernetes, install torch +if [ "$K8S" = true ]; then + echo "K8S mode is true. Installing torch..." 
+ pip install torch +fi + if [ "${#TEST_FILES[@]}" -eq 0 ]; then TEST_FILES=("*_test.py") fi diff --git a/.github/eks-workflow-files/axlearn/axlearn-job.yml b/.github/eks-workflow-files/axlearn/axlearn-job.yml index e5fbca44f..eb75faad5 100644 --- a/.github/eks-workflow-files/axlearn/axlearn-job.yml +++ b/.github/eks-workflow-files/axlearn/axlearn-job.yml @@ -25,7 +25,8 @@ spec: test-axlearn.sh \ --directory "." \ --output "/opt/output/" \ - --test-files "/opt/axlearn/axlearn/common/*_test.py" + --test-files "/opt/axlearn/axlearn/common/*_test.py" \ + --k8s # Wait a moment to ensure logs are flushed sync From a926bf99cc216d3d00df650dd28509b55e7d03fc Mon Sep 17 00:00:00 2001 From: Steboss Date: Thu, 13 Feb 2025 18:57:58 +0000 Subject: [PATCH 18/89] try a test with 5 files and avoid postprocessing on k8s --- .github/container/test-axlearn.sh | 8 +++-- .github/workflows/_ci.yaml | 59 +++++++++++++++++++------------ 2 files changed, 42 insertions(+), 25 deletions(-) diff --git a/.github/container/test-axlearn.sh b/.github/container/test-axlearn.sh index a82cdefe1..9d7ec05ed 100755 --- a/.github/container/test-axlearn.sh +++ b/.github/container/test-axlearn.sh @@ -119,8 +119,10 @@ echo "Running tests..." # If we are on Kubernetes, install torch if [ "$K8S" = true ]; then - echo "K8S mode is true. Installing torch..." 
- pip install torch + uname -a + python --version + #pip install torch # install cpu version + #nvidia-cudnn-cu12==9.7.0.66 fi if [ "${#TEST_FILES[@]}" -eq 0 ]; then @@ -168,7 +170,7 @@ done # we are skipping some tests as there's still wip by Apple final_test_files=() -for test_file in "${expanded_test_files[@]}"; do +for test_file in "${expanded_test_files[@]:0:5}"; do exclude=false #echo $test_file for pattern in "${EXCLUDE_PATTERNS[@]}"; do diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index c3e0b3b2c..cd047662c 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -742,7 +742,7 @@ jobs: - name: Store GitHub Container Registry token as Kubernetes secret run: | kubectl create secret generic \ - ${{ github.run_id }}-${{ github.run_attempt }}-token \ + ${{ env.TOKEN_NAME }} \ --from-file=.dockerconfigjson=$HOME/.docker/config.json \ --type=kubernetes.io/dockerconfigjson - name: Configure axlearn test job @@ -757,33 +757,48 @@ jobs: run: kubectl apply -f .github/eks-workflow-files/axlearn/axlearn-job.yml - name: Wait for axlearn test job to start run: | - while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ github.run_id }}-${{ github.run_attempt }}-axlearn --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do + while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ env.JOB_NAME }} --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do sleep 10 done - name: Stream axlearn test job output - run: kubectl logs --all-containers=true --all-pods=true --follow job/${{ github.run_id }}-${{ github.run_attempt }}-axlearn + run: kubectl logs --all-containers=true --all-pods=true --follow job/${{ env.JOB_NAME }} - name: Delete axlearn test job if: always() run: kubectl delete job ${{ env.JOB_NAME }} - - name: Configure axlearn post-processing job + - name: Download logs from S3 run: | - yq -i '.metadata.name = 
strenv(POSTPROCESS_JOB_NAME) - | .spec.template.spec.containers[].imagePullSecrets[].name = strenv(TOKEN_NAME) - ' \ - .github/eks-workflow-files/axlearn/axlearn-postprocess-job.yml - git diff .github/eks-workflow-files/axlearn/axlearn-postprocess-job.yml - - name: Submit axlearn post-processing job - run: kubectl apply -f .github/eks-workflow-files/axlearn/axlearn-postprocess-job.yml - - name: Wait for axlearn post-processing job to start - run: | - while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ github.run_id }}-${{ github.run_attempt }}-axlearn-postprocess --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do - sleep 10 - done - - name: Stream axlearn post-processing job output - run: kubectl logs --all-containers=true --all-pods=true --follow job/${{ github.run_id }}-${{ github.run_attempt }}-axlearn-postprocess - - name: Delete axlearn post-processing job - if: always() - run: kubectl delete job ${{ github.run_id }}-${{ github.run_attempt }}-axlearn-postprocess + mkdir -p /tmp/axlearn-output + aws s3 cp s3://jax-toolbox-eks-output/summary.txt /tmp/axlearn-output/ + aws s3 cp s3://jax-toolbox-eks-output/test-backend-independent.log /tmp/axlearn-output/ + + passed_tests=$(grep -c ": PASSED" /tmp/axlearn-output/summary.txt || true) + failed_tests=$(grep -c ": FAILED" /tmp/axlearn-output/summary.txt || true) + total_tests=$((failed_tests + passed_tests)) + + echo "Passed tests: $passed_tests" + echo "Failed tests: $failed_tests" + echo "Total tests: $total_tests" + # - name: Configure axlearn post-processing job + # run: | + # yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME) + # | .spec.template.spec.containers[].imagePullSecrets[].name = strenv(TOKEN_NAME) + # ' \ + # .github/eks-workflow-files/axlearn/axlearn-postprocess-job.yml + # git diff .github/eks-workflow-files/axlearn/axlearn-postprocess-job.yml + # - name: Submit axlearn post-processing job + # run: kubectl apply -f 
.github/eks-workflow-files/axlearn/axlearn-postprocess-job.yml + # - name: Wait for axlearn post-processing job to start + # run: | + # while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ env.POSTPROCESS_JOB_NAME }} --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do + # sleep 10 + # done + # - name: Stream axlearn post-processing job output + # run: kubectl logs --all-containers=true --all-pods=true --follow job/${{ env.POSTPROCESS_JOB_NAME }} + # - name: Delete axlearn post-processing job + # if: always() + # run: kubectl delete job ${{ env.POSTPROCESS_JOB_NAME }} + # TODO upload aritfacts to github - name: Delete GitHub Container Registry token if: always() - run: kubectl delete secret ${{ github.run_id }}-${{ github.run_attempt }}-token + run: kubectl delete secret ${{ env.TOKEN }} + From 6349f445b32ea3aae0c2e8e16aa422a201c24c6b Mon Sep 17 00:00:00 2001 From: Steboss Date: Fri, 14 Feb 2025 12:03:09 +0000 Subject: [PATCH 19/89] fix test --- .github/container/test-axlearn.sh | 10 +-- .../axlearn/axlearn-job.yml | 11 +-- .github/workflows/_ci.yaml | 78 +++++++++++++------ 3 files changed, 62 insertions(+), 37 deletions(-) diff --git a/.github/container/test-axlearn.sh b/.github/container/test-axlearn.sh index 9d7ec05ed..e8e87a10b 100755 --- a/.github/container/test-axlearn.sh +++ b/.github/container/test-axlearn.sh @@ -119,9 +119,7 @@ echo "Running tests..." 
# If we are on Kubernetes, install torch if [ "$K8S" = true ]; then - uname -a - python --version - #pip install torch # install cpu version + pip install torch==2.6.0+cpu.cxx11.abi-cp312-cp312-linux_x86_64.whl --index-url https://download.pytorch.org/whl/torch/ #nvidia-cudnn-cu12==9.7.0.66 fi @@ -166,11 +164,9 @@ for pattern in "${EXCLUDE_PATTERNS[@]}"; do echo "$pattern" done -#expanded_test_files=( "${expanded_test_files[@]:0:10}" ) -# we are skipping some tests as there's still wip by Apple final_test_files=() -for test_file in "${expanded_test_files[@]:0:5}"; do +for test_file in "${expanded_test_files[@]}"; do exclude=false #echo $test_file for pattern in "${EXCLUDE_PATTERNS[@]}"; do @@ -190,7 +186,7 @@ passed=0 SUMMARY_FILE="${OUTPUT_DIRECTORY}/summary.txt" -for test_file in "${final_test_files[@]}"; do +for test_file in "${final_test_files[@]:0:5}"; do echo "Running: ${test_file}" # Ensure the test file exists if [ ! -f "${test_file}" ]; then diff --git a/.github/eks-workflow-files/axlearn/axlearn-job.yml b/.github/eks-workflow-files/axlearn/axlearn-job.yml index eb75faad5..56183f35a 100644 --- a/.github/eks-workflow-files/axlearn/axlearn-job.yml +++ b/.github/eks-workflow-files/axlearn/axlearn-job.yml @@ -36,9 +36,11 @@ spec: volumeMounts: - name: output mountPath: /opt/output - - name: upload image: amazon/aws-cli + env: + - name: TEST_DATE + value: PLACEHOLDER command: - sh - -c @@ -47,13 +49,8 @@ spec: while [ ! -f /opt/output/summary.txt ]; do sleep 1 done - # Also wait for the main log - while [ ! 
-f /opt/output/test-backend-independent.log ]; do - sleep 1 - done # Now upload to your S3 bucket - aws s3 cp /opt/output/summary.txt s3://jax-toolbox-eks-output/ - aws s3 cp /opt/output/test-backend-independent.log s3://jax-toolbox-eks-output/ + aws s3 cp /opt/output/summary.txt s3://jax-toolbox-eks-output/axlearn/${TEST_DATE}/summary.txt volumeMounts: - name: output mountPath: /opt/output diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index cd047662c..d4e6097cd 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -745,11 +745,16 @@ jobs: ${{ env.TOKEN_NAME }} \ --from-file=.dockerconfigjson=$HOME/.docker/config.json \ --type=kubernetes.io/dockerconfigjson + - name: Set date environment variable + run: | + echo "DATE_TEST_RAN=$(date +'%Y-%m-%d-%H-%M-%S')" >> $GITHUB_ENV - name: Configure axlearn test job run: | # Replace placeholders in axlearn-job.yml with environment variables - yq -i ea 'select(di == 0).metadata.name = strenv(JOB_NAME) + yq -i ea ' + select(di == 0).metadata.name = strenv(JOB_NAME) | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE) + | select(di == 0).spec.template.spec.containers[1].env[0].value = strenv(DATE_TEST_RAN) | select(di == 0).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)' \ .github/eks-workflow-files/axlearn/axlearn-job.yml git diff .github/eks-workflow-files/axlearn/axlearn-job.yml @@ -768,8 +773,7 @@ jobs: - name: Download logs from S3 run: | mkdir -p /tmp/axlearn-output - aws s3 cp s3://jax-toolbox-eks-output/summary.txt /tmp/axlearn-output/ - aws s3 cp s3://jax-toolbox-eks-output/test-backend-independent.log /tmp/axlearn-output/ + aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ env.DATE_TEST_RAN }}/summary.txt /tmp/axlearn-output/ passed_tests=$(grep -c ": PASSED" /tmp/axlearn-output/summary.txt || true) failed_tests=$(grep -c ": FAILED" /tmp/axlearn-output/summary.txt || true) @@ -778,26 +782,54 @@ jobs: echo "Passed tests: 
$passed_tests" echo "Failed tests: $failed_tests" echo "Total tests: $total_tests" - # - name: Configure axlearn post-processing job - # run: | - # yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME) - # | .spec.template.spec.containers[].imagePullSecrets[].name = strenv(TOKEN_NAME) - # ' \ - # .github/eks-workflow-files/axlearn/axlearn-postprocess-job.yml - # git diff .github/eks-workflow-files/axlearn/axlearn-postprocess-job.yml - # - name: Submit axlearn post-processing job - # run: kubectl apply -f .github/eks-workflow-files/axlearn/axlearn-postprocess-job.yml - # - name: Wait for axlearn post-processing job to start - # run: | - # while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ env.POSTPROCESS_JOB_NAME }} --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do - # sleep 10 - # done - # - name: Stream axlearn post-processing job output - # run: kubectl logs --all-containers=true --all-pods=true --follow job/${{ env.POSTPROCESS_JOB_NAME }} - # - name: Delete axlearn post-processing job - # if: always() - # run: kubectl delete job ${{ env.POSTPROCESS_JOB_NAME }} - # TODO upload aritfacts to github + + echo "PASSED_TESTS=$passed_tests" >> $GITHUB_OUTPUT + echo "FAILED_TESTS=$failed_tests" >> $GITHUB_OUTPUT + echo "TOTAL_TESTS=$total_tests" >> $GITHUB_OUTPUT + + - name: Generate sitrep + id: sitrep + if: "!cancelled()" + shell: bash -x -e {0} + run: | + # bring in utility functions + source .github/workflows/scripts/to_json.sh + + badge_label='Axlearn EKS Unit' + + total_tests=${{ steps.test-stats.outputs.TOTAL_TESTS }} \ + failed_tests=${{ steps.test-stats.outputs.FAILED_TESTS }} \ + passed_tests=${{ steps.test-stats.outputs.PASSED_TESTS }} \ + errors="0" \ + summary="All tests: $total_tests. Passed: $passed_tests. Failed: $failed_tests." \ + badge_message="Passed $passed_tests out of $total_tests." 
\ + badge_color="brightgreen" + if [ "$failed_tests" -gt 0 ]; then + badge_color="red" + fi \ + + to_json \ + summary \ + errors total_tests passed_tests failed_tests \ + badge_label badge_color badge_message \ + > sitrep.json + + schemaVersion=1 \ + label="${badge_label}" \ + message="Passed $passed_tests out of $total_tests." \ + color=$badge_color \ + to_json schemaVersion label message color \ + > "badge-axlearn-test" + + - name: Upload artifacts + if: "!cancelled()" + uses: actions/upload-artifact@v4 + with: + name: "artifact-axlearn-test" + path: | + sitrep.json + "badge-axlearn-test" + summary.txt - name: Delete GitHub Container Registry token if: always() run: kubectl delete secret ${{ env.TOKEN }} From 004ed787e7c65f1c7fd21582d6c26a1ea9e92730 Mon Sep 17 00:00:00 2001 From: Steboss Date: Fri, 14 Feb 2025 13:18:44 +0000 Subject: [PATCH 20/89] remove postprocess --- .../axlearn/axlearn-postprocess-job.yml | 45 ------------------- 1 file changed, 45 deletions(-) delete mode 100644 .github/eks-workflow-files/axlearn/axlearn-postprocess-job.yml diff --git a/.github/eks-workflow-files/axlearn/axlearn-postprocess-job.yml b/.github/eks-workflow-files/axlearn/axlearn-postprocess-job.yml deleted file mode 100644 index b6404a559..000000000 --- a/.github/eks-workflow-files/axlearn/axlearn-postprocess-job.yml +++ /dev/null @@ -1,45 +0,0 @@ -apiVersion: batch/v1 -kind: Job -metadata: - name: PLACEHOLDER -spec: - template: - spec: - restartPolicy: Never - initContainers: - - name: download - image: amazon/aws-cli - command: - - sh - - -c - - | - aws s3 cp s3://jax-toolbox-eks-output/summary.txt /opt/output/ - aws s3 cp s3://jax-toolbox-eks-output/test-backend-independent.log /opt/output/ - volumeMounts: - - mountPath: /opt/output - name: output - containers: - - name: parse-axlearn - image: ubuntu:22.04 - command: - - bash - - -exo - - pipefail - - -c - - | - if [ ! -f /opt/output/summary.txt ]; then - echo "summary.txt not found!" 
- exit 1 - fi - - passed_tests=$(grep -c ": PASSED" /opt/output/summary.txt || true) - failed_tests=$(grep -c ": FAILED" /opt/output/summary.txt || true) - total_tests=$((failed_tests + passed_tests)) - volumeMounts: - - mountPath: /opt/output - name: output - imagePullSecrets: - - name: PLACEHOLDER - volumes: - - name: output - emptyDir: {} From cf858141c7b173fe13b84bd7ae92a1d2d47a981a Mon Sep 17 00:00:00 2001 From: Steboss Date: Fri, 14 Feb 2025 14:17:56 +0000 Subject: [PATCH 21/89] reusable actions test --- .../actions/checkout-ghcr-login/action.yml | 34 +++++++++++ .github/actions/delete-ghcr-token/action.yml | 16 ++++++ .github/actions/delete-k8s-job/action.yml | 15 +++++ .github/actions/submit-k8s-job/action.yml | 31 ++++++++++ .github/container/test-axlearn.sh | 31 +++------- .github/workflows/_ci.yaml | 56 ++++++++----------- 6 files changed, 127 insertions(+), 56 deletions(-) create mode 100644 .github/actions/checkout-ghcr-login/action.yml create mode 100644 .github/actions/delete-ghcr-token/action.yml create mode 100644 .github/actions/delete-k8s-job/action.yml create mode 100644 .github/actions/submit-k8s-job/action.yml diff --git a/.github/actions/checkout-ghcr-login/action.yml b/.github/actions/checkout-ghcr-login/action.yml new file mode 100644 index 000000000..a71a1be12 --- /dev/null +++ b/.github/actions/checkout-ghcr-login/action.yml @@ -0,0 +1,34 @@ +name: Checkout, GHCR login, K8s secret +description: Performs repository checkout, logs into GitHub Container Registry, and stores the token as a Kubernetes secret. 
+ +inputs: + docker-username: + description: Username for GHCR + required: true + docker-password: + description: Password (e.g., GITHUB_TOKEN) + required: true + token-name: + description: Name of the K8s secret to create + required: true + +runs: + using: "composite" + steps: + - name: Check out the repository + uses: actions/checkout@v4 + + - name: Login to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: "ghcr.io" + username: ${{ inputs.docker-username }} + password: ${{ inputs.docker-password }} + + - name: Store GitHub Container Registry token as Kubernetes secret + shell: bash + run: | + kubectl create secret generic \ + ${{ inputs.token-name }} \ + --from-file=.dockerconfigjson=$HOME/.docker/config.json \ + --type=kubernetes.io/dockerconfigjson diff --git a/.github/actions/delete-ghcr-token/action.yml b/.github/actions/delete-ghcr-token/action.yml new file mode 100644 index 000000000..0d90dd168 --- /dev/null +++ b/.github/actions/delete-ghcr-token/action.yml @@ -0,0 +1,16 @@ +name: Delete GHCR Token +description: Deletes the K8s secret used for pulling images from GHCR. + +inputs: + token-name: + description: Name of the K8s secret to delete + required: true + +runs: + using: "composite" + steps: + - name: Delete GitHub Container Registry token + shell: bash + if: always() + run: | + kubectl delete secret ${{ inputs.token-name }} diff --git a/.github/actions/delete-k8s-job/action.yml b/.github/actions/delete-k8s-job/action.yml new file mode 100644 index 000000000..cf8011fcc --- /dev/null +++ b/.github/actions/delete-k8s-job/action.yml @@ -0,0 +1,15 @@ +name: Delete K8s Job +description: Cleans up the Job resource to avoid leaving pods behind. 
+ +inputs: + job-name: + description: The job name to delete + required: true + +runs: + using: "composite" + steps: + - name: Delete Kubernetes job + if: always() + run: | + kubectl delete job ${{ inputs.job-name }} diff --git a/.github/actions/submit-k8s-job/action.yml b/.github/actions/submit-k8s-job/action.yml new file mode 100644 index 000000000..c00826897 --- /dev/null +++ b/.github/actions/submit-k8s-job/action.yml @@ -0,0 +1,31 @@ +name: Submit & Stream K8s Job +description: Submits a Kubernetes job and then streams its logs to GitHub Actions. + +inputs: + job-config-file: + description: Path to the Kubernetes job YAML + required: true + job-name: + description: The job name + required: true + +runs: + using: "composite" + steps: + - name: Submit Kubernetes job + shell: bash + run: | + kubectl apply -f "${{ inputs.job-config-file }}" + + - name: Wait for Kubernetes job to start + shell: bash + run: | + while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ inputs.job-name }} --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do + echo "Waiting for pods to start..." + sleep 10 + done + + - name: Stream Kubernetes job output + shell: bash + run: | + kubectl logs --all-containers=true --all-pods=true --follow job/${{ inputs.job-name }} diff --git a/.github/container/test-axlearn.sh b/.github/container/test-axlearn.sh index e8e87a10b..8cf2a94d6 100755 --- a/.github/container/test-axlearn.sh +++ b/.github/container/test-axlearn.sh @@ -117,18 +117,18 @@ echo "Using CUDA devices: $CUDA_VISIBLE_DEVICES" echo "Running tests..." 
-# If we are on Kubernetes, install torch +# If we are on Kubernetes, install torch for cpu only if [ "$K8S" = true ]; then - pip install torch==2.6.0+cpu.cxx11.abi-cp312-cp312-linux_x86_64.whl --index-url https://download.pytorch.org/whl/torch/ - #nvidia-cudnn-cu12==9.7.0.66 + pip install torch --extra-index-url https://download.pytorch.org/whl/cpu fi if [ "${#TEST_FILES[@]}" -eq 0 ]; then TEST_FILES=("*_test.py") fi + expanded_test_files=() for pattern in "${TEST_FILES[@]}"; do - # Use globbing to expand pattern + # retrieve all the files files=( $pattern ) if [ "${#files[@]}" -gt 0 ]; then expanded_test_files+=( "${files[@]}" ) @@ -137,19 +137,12 @@ for pattern in "${TEST_FILES[@]}"; do fi done - if [ "${#expanded_test_files[@]}" -eq 0 ]; then echo "No test files found to run." exit 1 fi -echo "These are the test files:" -for f in "${expanded_test_files[@]}"; do - echo " $f" -done - -# Get the directory where the script is located -#SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" +# in case we have the exclusion list file EXCLUDE_LIST_FILE="$DIR/exclusion_list.txt" EXCLUDE_PATTERNS=() @@ -159,16 +152,11 @@ if [ -f "$EXCLUDE_LIST_FILE" ]; then else echo "Exclusion list file not found at '$EXCLUDE_LIST_FILE'" fi -echo "Exclude patterns read:" -for pattern in "${EXCLUDE_PATTERNS[@]}"; do - echo "$pattern" -done final_test_files=() for test_file in "${expanded_test_files[@]}"; do exclude=false - #echo $test_file for pattern in "${EXCLUDE_PATTERNS[@]}"; do if [[ "$(basename "$test_file")" == "$(basename "$pattern")" ]]; then exclude=true @@ -180,7 +168,7 @@ for test_file in "${expanded_test_files[@]}"; do fi done -# Initialize counters +# Initialize counters for test failures=0 passed=0 SUMMARY_FILE="${OUTPUT_DIRECTORY}/summary.txt" @@ -198,9 +186,9 @@ for test_file in "${final_test_files[@]:0:5}"; do log_file="${LOG_DIRECTORY}/${log_file_name}" # run the tests and save them as *.log pytest "${test_file}" --capture=tee-sys | tee 
"${log_file}" - # TODO parse the logs? exit_code=${PIPESTATUS[0]} echo $exit_code + # write number of tests passed and failed if [ $exit_code -eq 0 ]; then echo "${test_file}: PASSED" >> "${SUMMARY_FILE}" ((passed++)) @@ -209,7 +197,4 @@ for test_file in "${final_test_files[@]:0:5}"; do ((failures++)) fi echo "" -done - -echo $passed -echo $failures \ No newline at end of file +done \ No newline at end of file diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index d4e6097cd..d36750e10 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -719,35 +719,25 @@ jobs: # test-backend-independent.log # secrets: inherit - # TODO WE CAN CREATE A RESUABLE ACTION HERE - # FIX everything with env.something + test-axlearn-eks: needs: build-axlearn if: inputs.ARCHITECTURE == 'amd64' runs-on: eks env: AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }} - JOB_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-axlearn - POSTPROCESS_JOB_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-axlearn-postprocess - TOKEN_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-token + JOB_NAME: axlearn-${{ github.run_id }} + TOKEN_NAME: axlearn-${{ github.run_id }}-token steps: - - name: Check out the repository - uses: actions/checkout@v4 - - name: Login to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} - - name: Store GitHub Container Registry token as Kubernetes secret - run: | - kubectl create secret generic \ - ${{ env.TOKEN_NAME }} \ - --from-file=.dockerconfigjson=$HOME/.docker/config.json \ - --type=kubernetes.io/dockerconfigjson - - name: Set date environment variable + - name: Set date env var for saving files run: | echo "DATE_TEST_RAN=$(date +'%Y-%m-%d-%H-%M-%S')" >> $GITHUB_ENV + - name: Check and GHCR Login + uses: /.github/actions/checkout-ghcr-login + with: + docker-username: ${{ 
github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + token-name: ${{ env.TOKEN_NAME }} - name: Configure axlearn test job run: | # Replace placeholders in axlearn-job.yml with environment variables @@ -758,18 +748,17 @@ jobs: | select(di == 0).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)' \ .github/eks-workflow-files/axlearn/axlearn-job.yml git diff .github/eks-workflow-files/axlearn/axlearn-job.yml - - name: Submit axlearn test job - run: kubectl apply -f .github/eks-workflow-files/axlearn/axlearn-job.yml - - name: Wait for axlearn test job to start - run: | - while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ env.JOB_NAME }} --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do - sleep 10 - done - - name: Stream axlearn test job output - run: kubectl logs --all-containers=true --all-pods=true --follow job/${{ env.JOB_NAME }} + - name: Submit & wait for axlearn test job + uses: ./.github/actions/submit-k8s-job + with: + job-config-file: ".github/eks-workflow-files/axlearn/axlearn-job.yml" + job-name: ${{ env.JOB_NAME }} + - name: Delete axlearn test job - if: always() - run: kubectl delete job ${{ env.JOB_NAME }} + uses: ./.github/actions/delete-k8s-job + with: + job-name: ${{ env.JOB_NAME }} + - name: Download logs from S3 run: | mkdir -p /tmp/axlearn-output @@ -831,6 +820,7 @@ jobs: "badge-axlearn-test" summary.txt - name: Delete GitHub Container Registry token - if: always() - run: kubectl delete secret ${{ env.TOKEN }} + uses: ./.github/actions/delete-ghcr-token + with: + token-name: ${{ env.TOKEN_NAME }} From f32ee766dfc12f83c2cd3f1f9b4c0bc7b3486e74 Mon Sep 17 00:00:00 2001 From: Steboss Date: Fri, 14 Feb 2025 14:36:31 +0000 Subject: [PATCH 22/89] fix --- .github/actions/delete-k8s-job/action.yml | 1 + .github/workflows/_ci.yaml | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/actions/delete-k8s-job/action.yml 
b/.github/actions/delete-k8s-job/action.yml index cf8011fcc..877039672 100644 --- a/.github/actions/delete-k8s-job/action.yml +++ b/.github/actions/delete-k8s-job/action.yml @@ -10,6 +10,7 @@ runs: using: "composite" steps: - name: Delete Kubernetes job + shell: bash if: always() run: | kubectl delete job ${{ inputs.job-name }} diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index d36750e10..f0323e040 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -733,10 +733,10 @@ jobs: run: | echo "DATE_TEST_RAN=$(date +'%Y-%m-%d-%H-%M-%S')" >> $GITHUB_ENV - name: Check and GHCR Login - uses: /.github/actions/checkout-ghcr-login + uses: ./.github/actions/checkout-ghcr-login with: docker-username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} + docker-password: ${{ secrets.GITHUB_TOKEN }} token-name: ${{ env.TOKEN_NAME }} - name: Configure axlearn test job run: | @@ -748,6 +748,7 @@ jobs: | select(di == 0).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)' \ .github/eks-workflow-files/axlearn/axlearn-job.yml git diff .github/eks-workflow-files/axlearn/axlearn-job.yml + - name: Submit & wait for axlearn test job uses: ./.github/actions/submit-k8s-job with: From 04c6cf9ab7edb70187e2f9a8801af8208e75c675 Mon Sep 17 00:00:00 2001 From: Steboss Date: Fri, 14 Feb 2025 15:39:04 +0000 Subject: [PATCH 23/89] test on single piece --- .../actions/checkout-ghcr-login/action.yml | 34 --- .github/workflows/_ci.yaml | 231 +++++++++--------- 2 files changed, 111 insertions(+), 154 deletions(-) delete mode 100644 .github/actions/checkout-ghcr-login/action.yml diff --git a/.github/actions/checkout-ghcr-login/action.yml b/.github/actions/checkout-ghcr-login/action.yml deleted file mode 100644 index a71a1be12..000000000 --- a/.github/actions/checkout-ghcr-login/action.yml +++ /dev/null @@ -1,34 +0,0 @@ -name: Checkout, GHCR login, K8s secret -description: Performs repository checkout, logs into GitHub Container 
Registry, and stores the token as a Kubernetes secret. - -inputs: - docker-username: - description: Username for GHCR - required: true - docker-password: - description: Password (e.g., GITHUB_TOKEN) - required: true - token-name: - description: Name of the K8s secret to create - required: true - -runs: - using: "composite" - steps: - - name: Check out the repository - uses: actions/checkout@v4 - - - name: Login to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: "ghcr.io" - username: ${{ inputs.docker-username }} - password: ${{ inputs.docker-password }} - - - name: Store GitHub Container Registry token as Kubernetes secret - shell: bash - run: | - kubectl create secret generic \ - ${{ inputs.token-name }} \ - --from-file=.dockerconfigjson=$HOME/.docker/config.json \ - --type=kubernetes.io/dockerconfigjson diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index f0323e040..7bfd93bbd 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -38,33 +38,33 @@ permissions: jobs: - build-base: - uses: ./.github/workflows/_build_base.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - BASE_IMAGE: ${{ inputs.CUDA_IMAGE }} - BUILD_DATE: ${{ inputs.BUILD_DATE }} - MANIFEST_ARTIFACT_NAME: ${{ inputs.MANIFEST_ARTIFACT_NAME }} - secrets: inherit - - build-jax: - needs: build-base - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: artifact-jax-build - BADGE_FILENAME: badge-jax-build - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-base.outputs.DOCKER_TAG }} - CONTAINER_NAME: jax - DOCKERFILE: .github/container/Dockerfile.jax - RUNNER_SIZE: large - EXTRA_BUILD_ARGS: | - URLREF_JAX=${{ fromJson(inputs.SOURCE_URLREFS).JAX }} - URLREF_XLA=${{ fromJson(inputs.SOURCE_URLREFS).XLA }} - URLREF_FLAX=${{ fromJson(inputs.SOURCE_URLREFS).FLAX }} - URLREF_TRANSFORMER_ENGINE=${{ fromJson(inputs.SOURCE_URLREFS).TRANSFORMER_ENGINE }} - secrets: 
inherit + # build-base: + # uses: ./.github/workflows/_build_base.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # BASE_IMAGE: ${{ inputs.CUDA_IMAGE }} + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # MANIFEST_ARTIFACT_NAME: ${{ inputs.MANIFEST_ARTIFACT_NAME }} + # secrets: inherit + + # build-jax: + # needs: build-base + # uses: ./.github/workflows/_build.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: artifact-jax-build + # BADGE_FILENAME: badge-jax-build + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-base.outputs.DOCKER_TAG }} + # CONTAINER_NAME: jax + # DOCKERFILE: .github/container/Dockerfile.jax + # RUNNER_SIZE: large + # EXTRA_BUILD_ARGS: | + # URLREF_JAX=${{ fromJson(inputs.SOURCE_URLREFS).JAX }} + # URLREF_XLA=${{ fromJson(inputs.SOURCE_URLREFS).XLA }} + # URLREF_FLAX=${{ fromJson(inputs.SOURCE_URLREFS).FLAX }} + # URLREF_TRANSFORMER_ENGINE=${{ fromJson(inputs.SOURCE_URLREFS).TRANSFORMER_ENGINE }} + # secrets: inherit # build-triton: # needs: build-jax @@ -203,57 +203,57 @@ jobs: # URLREF_PANOPTICAPI=${{ fromJson(inputs.SOURCE_URLREFS).PANOPTICAPI }} # secrets: inherit - build-axlearn: - needs: build-jax - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: artifact-axlearn-build - BADGE_FILENAME: badge-axlearn-build - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: axlearn - DOCKERFILE: .github/container/Dockerfile.axlearn - EXTRA_BUILD_ARGS: | - URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }} - secrets: inherit - - collect-docker-tags: - runs-on: ubuntu-22.04 - if: "!cancelled()" - needs: - - build-base - - build-jax - # - build-triton - # - build-equinox - # - build-maxtext - # - build-levanter - # - build-upstream-t5x - # - build-upstream-pax - # - build-rosetta-t5x - # - build-rosetta-pax - # - build-gemma - - build-axlearn - outputs: - TAGS: ${{ 
steps.collect-tags.outputs.TAGS }} - steps: - - name: Save docker tags as a JSON object - id: collect-tags - run: | - TAGS=$(cat <> $GITHUB_OUTPUT + # build-axlearn: + # needs: build-jax + # uses: ./.github/workflows/_build.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: artifact-axlearn-build + # BADGE_FILENAME: badge-axlearn-build + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: axlearn + # DOCKERFILE: .github/container/Dockerfile.axlearn + # EXTRA_BUILD_ARGS: | + # URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }} + # secrets: inherit + + # collect-docker-tags: + # runs-on: ubuntu-22.04 + # if: "!cancelled()" + # needs: + # - build-base + # - build-jax + # # - build-triton + # # - build-equinox + # # - build-maxtext + # # - build-levanter + # # - build-upstream-t5x + # # - build-upstream-pax + # # - build-rosetta-t5x + # # - build-rosetta-pax + # # - build-gemma + # - build-axlearn + # outputs: + # TAGS: ${{ steps.collect-tags.outputs.TAGS }} + # steps: + # - name: Save docker tags as a JSON object + # id: collect-tags + # run: | + # TAGS=$(cat <> $GITHUB_OUTPUT # test-distribution: # runs-on: ubuntu-22.04 @@ -450,20 +450,12 @@ jobs: # POSTPROCESS_JOB_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-postprocess # TOKEN_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-token # steps: - # - name: Check out the repository - # uses: actions/checkout@v4 - # - name: Login to GitHub Container Registry - # uses: docker/login-action@v3 - # with: - # registry: ghcr.io - # username: ${{ github.repository_owner }} - # password: ${{ secrets.GITHUB_TOKEN }} - # - name: Store GitHub Container Registry token as Kubernetes secret - # run: | - # kubectl create secret generic \ - # ${{ github.run_id }}-${{ github.run_attempt }}-token \ - # --from-file=.dockerconfigjson=$HOME/.docker/config.json \ - # --type=kubernetes.io/dockerconfigjson + # - name: GHCR 
login + # uses: ./.github/actions/ghcr-login + # with: + # docker-username: ${{ github.repository_owner }} + # docker-password: ${{ secrets.GITHUB_TOKEN}} + # token-name: ${{ env.TOKEN_NAME }} # - name: Configure Kubernetes job # run: | # yq -i ea 'select(di == 0).spec.selector.job-name = strenv(JOB_NAME) @@ -474,18 +466,17 @@ jobs: # .github/eks-workflow-files/job.yml # git diff .github/eks-workflow-files/job.yml # - name: Submit Kubernetes job - # run: kubectl apply -f .github/eks-workflow-files/job.yml - # - name: Wait for Kubernetes job to start - # run: | - # while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ github.run_id }}-${{ github.run_attempt }}-jax --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do - # sleep 2 - # done - # - name: Stream Kubernetes job output - # run: kubectl logs --all-containers=true --all-pods=true --follow job/${{ github.run_id }}-${{ github.run_attempt }}-jax + # uses: ./.github/acitons/submit-k8s-job + # with: + # job-config-file: .github/eks-workflow-files/job.yml + # job-name: ${{ env.JOB_NAME }} + # # Clean up in case of errors as well as success # - name: Delete Kubernetes job - # if: always() - # run: kubectl delete job ${{ github.run_id }}-${{ github.run_attempt }}-jax + # uses: ./.github/actions/delete-k8s-job + # with: + # job-name: ${{ env.JOB_NAME }} + # - name: Configure post-processing job # run: | # export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip" @@ -495,22 +486,19 @@ jobs: # | .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \ # .github/eks-workflow-files/post-process-job.yml # git diff .github/eks-workflow-files/post-process-job.yml - # - name: Submit post-processing Kubernetes job - # run: kubectl apply -f .github/eks-workflow-files/post-process-job.yml - # - name: Wait for post-processing Kubernetes job to start - # run: | - # while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ github.run_id }}-${{ 
github.run_attempt }}-postprocess --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do - # sleep 2 - # done - # - name: Stream post-processing Kubernetes job output - # run: kubectl logs --all-containers=true --all-pods=true --follow job/${{ github.run_id }}-${{ github.run_attempt }}-postprocess - # # Clean up in case of errors as well as success + # - name: Submit Kubernetes job + # uses: ./.github/acitons/submit-k8s-job + # with: + # job-config-file: .github/eks-workflow-files/post-process-job.yml + # job-name: ${{ env.POSTPROCESS_JOB_NAME }} # - name: Delete post-processing Kubernetes job - # if: always() - # run: kubectl delete job ${{ github.run_id }}-${{ github.run_attempt }}-postprocess + # uses: ./.github/actions/delete-k8s-job + # with: + # job-name: ${{ env.POSTPROCESS_JOB_NAME }} # - name: Delete GitHub Container Registry token - # if: always() - # run: kubectl delete secret ${{ github.run_id }}-${{ github.run_attempt }}-token + # uses: ./.github/actions/delete-ghcr-token + # with: + # token-name: ${{ env.TOKEN_NAME }} # # test-equinox: # # needs: build-equinox @@ -721,19 +709,22 @@ jobs: test-axlearn-eks: - needs: build-axlearn + #needs: build-axlearn if: inputs.ARCHITECTURE == 'amd64' runs-on: eks env: - AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }} + # needs.build-axlearn.outputs.DOCKER_TAG_FINAL + AXLEARN_DOCKER_IMAGE: "ghcr.io/nvidia/jax-toolbox-internal:13331372559-axlearn-amd6" JOB_NAME: axlearn-${{ github.run_id }} TOKEN_NAME: axlearn-${{ github.run_id }}-token steps: - name: Set date env var for saving files run: | echo "DATE_TEST_RAN=$(date +'%Y-%m-%d-%H-%M-%S')" >> $GITHUB_ENV - - name: Check and GHCR Login - uses: ./.github/actions/checkout-ghcr-login + - name: Check out the repository + uses: actions/checkout@v4 + - name: GHCR Login + uses: ./.github/actions/ghcr-login with: docker-username: ${{ github.repository_owner }} docker-password: ${{ secrets.GITHUB_TOKEN }} From 
cfc68db19e41a67c858ec06ac59a587446f359dc Mon Sep 17 00:00:00 2001 From: Steboss Date: Fri, 14 Feb 2025 15:42:55 +0000 Subject: [PATCH 24/89] add checkout --- .github/actions/ghcr-login/action.yml | 31 +++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 .github/actions/ghcr-login/action.yml diff --git a/.github/actions/ghcr-login/action.yml b/.github/actions/ghcr-login/action.yml new file mode 100644 index 000000000..2c62591ed --- /dev/null +++ b/.github/actions/ghcr-login/action.yml @@ -0,0 +1,31 @@ +name: Checkout, GHCR login, K8s secret +description: Performs repository checkout, logs into GitHub Container Registry, and stores the token as a Kubernetes secret. + +inputs: + docker-username: + description: Username for GHCR + required: true + docker-password: + description: Password (e.g., GITHUB_TOKEN) + required: true + token-name: + description: Name of the K8s secret to create + required: true + +runs: + using: "composite" + steps: + - name: Login to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: "ghcr.io" + username: ${{ inputs.docker-username }} + password: ${{ inputs.docker-password }} + + - name: Store GitHub Container Registry token as Kubernetes secret + shell: bash + run: | + kubectl create secret generic \ + ${{ inputs.token-name }} \ + --from-file=.dockerconfigjson=$HOME/.docker/config.json \ + --type=kubernetes.io/dockerconfigjson From 65eca97ac54d52f8d707369766f0799bf22384e9 Mon Sep 17 00:00:00 2001 From: Steboss Date: Fri, 14 Feb 2025 15:52:16 +0000 Subject: [PATCH 25/89] restart ci --- .github/workflows/_ci.yaml | 159 ++++++++++++++++++------------------- 1 file changed, 78 insertions(+), 81 deletions(-) diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 7bfd93bbd..5d1028a17 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -38,33 +38,33 @@ permissions: jobs: - # build-base: - # uses: ./.github/workflows/_build_base.yaml - # with: - # 
ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # BASE_IMAGE: ${{ inputs.CUDA_IMAGE }} - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # MANIFEST_ARTIFACT_NAME: ${{ inputs.MANIFEST_ARTIFACT_NAME }} - # secrets: inherit - - # build-jax: - # needs: build-base - # uses: ./.github/workflows/_build.yaml - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # ARTIFACT_NAME: artifact-jax-build - # BADGE_FILENAME: badge-jax-build - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-base.outputs.DOCKER_TAG }} - # CONTAINER_NAME: jax - # DOCKERFILE: .github/container/Dockerfile.jax - # RUNNER_SIZE: large - # EXTRA_BUILD_ARGS: | - # URLREF_JAX=${{ fromJson(inputs.SOURCE_URLREFS).JAX }} - # URLREF_XLA=${{ fromJson(inputs.SOURCE_URLREFS).XLA }} - # URLREF_FLAX=${{ fromJson(inputs.SOURCE_URLREFS).FLAX }} - # URLREF_TRANSFORMER_ENGINE=${{ fromJson(inputs.SOURCE_URLREFS).TRANSFORMER_ENGINE }} - # secrets: inherit + build-base: + uses: ./.github/workflows/_build_base.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + BASE_IMAGE: ${{ inputs.CUDA_IMAGE }} + BUILD_DATE: ${{ inputs.BUILD_DATE }} + MANIFEST_ARTIFACT_NAME: ${{ inputs.MANIFEST_ARTIFACT_NAME }} + secrets: inherit + + build-jax: + needs: build-base + uses: ./.github/workflows/_build.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: artifact-jax-build + BADGE_FILENAME: badge-jax-build + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-base.outputs.DOCKER_TAG }} + CONTAINER_NAME: jax + DOCKERFILE: .github/container/Dockerfile.jax + RUNNER_SIZE: large + EXTRA_BUILD_ARGS: | + URLREF_JAX=${{ fromJson(inputs.SOURCE_URLREFS).JAX }} + URLREF_XLA=${{ fromJson(inputs.SOURCE_URLREFS).XLA }} + URLREF_FLAX=${{ fromJson(inputs.SOURCE_URLREFS).FLAX }} + URLREF_TRANSFORMER_ENGINE=${{ fromJson(inputs.SOURCE_URLREFS).TRANSFORMER_ENGINE }} + secrets: inherit # build-triton: # needs: build-jax @@ -203,57 +203,55 @@ jobs: # URLREF_PANOPTICAPI=${{ 
fromJson(inputs.SOURCE_URLREFS).PANOPTICAPI }} # secrets: inherit - # build-axlearn: - # needs: build-jax - # uses: ./.github/workflows/_build.yaml - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # ARTIFACT_NAME: artifact-axlearn-build - # BADGE_FILENAME: badge-axlearn-build - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - # CONTAINER_NAME: axlearn - # DOCKERFILE: .github/container/Dockerfile.axlearn - # EXTRA_BUILD_ARGS: | - # URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }} - # secrets: inherit - - # collect-docker-tags: - # runs-on: ubuntu-22.04 - # if: "!cancelled()" - # needs: - # - build-base - # - build-jax - # # - build-triton - # # - build-equinox - # # - build-maxtext - # # - build-levanter - # # - build-upstream-t5x - # # - build-upstream-pax - # # - build-rosetta-t5x - # # - build-rosetta-pax - # # - build-gemma - # - build-axlearn - # outputs: - # TAGS: ${{ steps.collect-tags.outputs.TAGS }} - # steps: - # - name: Save docker tags as a JSON object - # id: collect-tags - # run: | - # TAGS=$(cat <> $GITHUB_OUTPUT + build-axlearn: + needs: build-jax + uses: ./.github/workflows/_build.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: artifact-axlearn-build + BADGE_FILENAME: badge-axlearn-build + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + CONTAINER_NAME: axlearn + DOCKERFILE: .github/container/Dockerfile.axlearn + secrets: inherit + + collect-docker-tags: + runs-on: ubuntu-22.04 + if: "!cancelled()" + needs: + - build-base + - build-jax + # - build-triton + # - build-equinox + # - build-maxtext + # - build-levanter + # - build-upstream-t5x + # - build-upstream-pax + # - build-rosetta-t5x + # - build-rosetta-pax + # - build-gemma + - build-axlearn + outputs: + TAGS: ${{ steps.collect-tags.outputs.TAGS }} + steps: + - name: Save docker tags as a JSON object + id: collect-tags + run: | + TAGS=$(cat 
<> $GITHUB_OUTPUT # test-distribution: # runs-on: ubuntu-22.04 @@ -709,12 +707,11 @@ jobs: test-axlearn-eks: - #needs: build-axlearn + needs: build-axlearn if: inputs.ARCHITECTURE == 'amd64' runs-on: eks env: - # needs.build-axlearn.outputs.DOCKER_TAG_FINAL - AXLEARN_DOCKER_IMAGE: "ghcr.io/nvidia/jax-toolbox-internal:13331372559-axlearn-amd6" + AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }} JOB_NAME: axlearn-${{ github.run_id }} TOKEN_NAME: axlearn-${{ github.run_id }}-token steps: From 580bf733ffd0d328dabf2f05b360872b145760a8 Mon Sep 17 00:00:00 2001 From: Steboss Date: Fri, 14 Feb 2025 17:23:23 +0000 Subject: [PATCH 26/89] general clean up --- .github/container/test-axlearn.sh | 9 +- .github/workflows/_ci.yaml | 1125 ++++++++++++++--------------- 2 files changed, 560 insertions(+), 574 deletions(-) diff --git a/.github/container/test-axlearn.sh b/.github/container/test-axlearn.sh index 8cf2a94d6..27118b7a0 100755 --- a/.github/container/test-axlearn.sh +++ b/.github/container/test-axlearn.sh @@ -120,6 +120,7 @@ echo "Running tests..." # If we are on Kubernetes, install torch for cpu only if [ "$K8S" = true ]; then pip install torch --extra-index-url https://download.pytorch.org/whl/cpu + pip install transformers fi if [ "${#TEST_FILES[@]}" -eq 0 ]; then @@ -174,14 +175,8 @@ passed=0 SUMMARY_FILE="${OUTPUT_DIRECTORY}/summary.txt" -for test_file in "${final_test_files[@]:0:5}"; do +for test_file in "${final_test_files[@]}"; do echo "Running: ${test_file}" - # Ensure the test file exists - if [ ! 
-f "${test_file}" ]; then - echo "${test_file}: NOT FOUND" >> "${SUMMARY_FILE}" - echo "Test file not found: ${test_file}" - continue - fi log_file_name=$(echo "${test_file%.py}" | sed 's/\//__/g').log log_file="${LOG_DIRECTORY}/${log_file_name}" # run the tests and save them as *.log diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 5d1028a17..02c2c4611 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -66,142 +66,142 @@ jobs: URLREF_TRANSFORMER_ENGINE=${{ fromJson(inputs.SOURCE_URLREFS).TRANSFORMER_ENGINE }} secrets: inherit - # build-triton: - # needs: build-jax - # if: inputs.ARCHITECTURE == 'amd64' # Triton does not seem to support arm64 - # uses: ./.github/workflows/_build.yaml - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # ARTIFACT_NAME: artifact-triton-build - # BADGE_FILENAME: badge-triton-build - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - # CONTAINER_NAME: triton - # DOCKERFILE: .github/container/Dockerfile.triton - # RUNNER_SIZE: large - # EXTRA_BUILD_ARGS: | - # URLREF_JAX_TRITON=${{ fromJson(inputs.SOURCE_URLREFS).JAX_TRITON }} - # secrets: inherit + build-triton: + needs: build-jax + if: inputs.ARCHITECTURE == 'amd64' # Triton does not seem to support arm64 + uses: ./.github/workflows/_build.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: artifact-triton-build + BADGE_FILENAME: badge-triton-build + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + CONTAINER_NAME: triton + DOCKERFILE: .github/container/Dockerfile.triton + RUNNER_SIZE: large + EXTRA_BUILD_ARGS: | + URLREF_JAX_TRITON=${{ fromJson(inputs.SOURCE_URLREFS).JAX_TRITON }} + secrets: inherit - # build-equinox: - # needs: build-jax - # uses: ./.github/workflows/_build.yaml - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # ARTIFACT_NAME: artifact-equinox-build - # BADGE_FILENAME: 
badge-equinox-build - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - # CONTAINER_NAME: equinox - # DOCKERFILE: .github/container/Dockerfile.equinox - # EXTRA_BUILD_ARGS: | - # URLREF_EQUINOX=${{ fromJson(inputs.SOURCE_URLREFS).EQUINOX }} - # secrets: inherit + build-equinox: + needs: build-jax + uses: ./.github/workflows/_build.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: artifact-equinox-build + BADGE_FILENAME: badge-equinox-build + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + CONTAINER_NAME: equinox + DOCKERFILE: .github/container/Dockerfile.equinox + EXTRA_BUILD_ARGS: | + URLREF_EQUINOX=${{ fromJson(inputs.SOURCE_URLREFS).EQUINOX }} + secrets: inherit - # build-maxtext: - # needs: build-jax - # uses: ./.github/workflows/_build.yaml - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # ARTIFACT_NAME: artifact-maxtext-build - # BADGE_FILENAME: badge-maxtext-build - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - # CONTAINER_NAME: maxtext - # DOCKERFILE: .github/container/Dockerfile.maxtext - # EXTRA_BUILD_ARGS: | - # URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }} - # secrets: inherit + build-maxtext: + needs: build-jax + uses: ./.github/workflows/_build.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: artifact-maxtext-build + BADGE_FILENAME: badge-maxtext-build + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + CONTAINER_NAME: maxtext + DOCKERFILE: .github/container/Dockerfile.maxtext + EXTRA_BUILD_ARGS: | + URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }} + secrets: inherit - # build-levanter: - # needs: [build-jax] - # uses: ./.github/workflows/_build.yaml - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # ARTIFACT_NAME: 
"artifact-levanter-build" - # BADGE_FILENAME: "badge-levanter-build" - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - # CONTAINER_NAME: levanter - # DOCKERFILE: .github/container/Dockerfile.levanter - # EXTRA_BUILD_ARGS: | - # URLREF_LEVANTER=${{ fromJson(inputs.SOURCE_URLREFS).LEVANTER }} - # URLREF_HALIAX=${{ fromJson(inputs.SOURCE_URLREFS).HALIAX }} - # secrets: inherit + build-levanter: + needs: [build-jax] + uses: ./.github/workflows/_build.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: "artifact-levanter-build" + BADGE_FILENAME: "badge-levanter-build" + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + CONTAINER_NAME: levanter + DOCKERFILE: .github/container/Dockerfile.levanter + EXTRA_BUILD_ARGS: | + URLREF_LEVANTER=${{ fromJson(inputs.SOURCE_URLREFS).LEVANTER }} + URLREF_HALIAX=${{ fromJson(inputs.SOURCE_URLREFS).HALIAX }} + secrets: inherit - # build-upstream-t5x: - # needs: build-jax - # uses: ./.github/workflows/_build.yaml - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # ARTIFACT_NAME: "artifact-t5x-build" - # BADGE_FILENAME: "badge-t5x-build" - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - # CONTAINER_NAME: upstream-t5x - # DOCKERFILE: .github/container/Dockerfile.t5x - # EXTRA_BUILD_ARGS: | - # URLREF_T5X=${{ fromJson(inputs.SOURCE_URLREFS).T5X }} - # URLREF_AIRIO=${{ fromJson(inputs.SOURCE_URLREFS).AIRIO }} - # secrets: inherit + build-upstream-t5x: + needs: build-jax + uses: ./.github/workflows/_build.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: "artifact-t5x-build" + BADGE_FILENAME: "badge-t5x-build" + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + CONTAINER_NAME: upstream-t5x + DOCKERFILE: .github/container/Dockerfile.t5x + EXTRA_BUILD_ARGS: | + 
URLREF_T5X=${{ fromJson(inputs.SOURCE_URLREFS).T5X }} + URLREF_AIRIO=${{ fromJson(inputs.SOURCE_URLREFS).AIRIO }} + secrets: inherit - # build-upstream-pax: - # needs: build-jax - # uses: ./.github/workflows/_build.yaml - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # ARTIFACT_NAME: artifact-pax-build - # BADGE_FILENAME: badge-pax-build - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - # CONTAINER_NAME: upstream-pax - # DOCKERFILE: .github/container/Dockerfile.pax - # EXTRA_BUILD_ARGS: | - # URLREF_PAXML=${{ fromJson(inputs.SOURCE_URLREFS).PAXML }} - # URLREF_PRAXIS=${{ fromJson(inputs.SOURCE_URLREFS).PRAXIS }} - # URLREF_LINGVO=${{ fromJson(inputs.SOURCE_URLREFS).LINGVO }} - # secrets: inherit + build-upstream-pax: + needs: build-jax + uses: ./.github/workflows/_build.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: artifact-pax-build + BADGE_FILENAME: badge-pax-build + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + CONTAINER_NAME: upstream-pax + DOCKERFILE: .github/container/Dockerfile.pax + EXTRA_BUILD_ARGS: | + URLREF_PAXML=${{ fromJson(inputs.SOURCE_URLREFS).PAXML }} + URLREF_PRAXIS=${{ fromJson(inputs.SOURCE_URLREFS).PRAXIS }} + URLREF_LINGVO=${{ fromJson(inputs.SOURCE_URLREFS).LINGVO }} + secrets: inherit - # build-rosetta-t5x: - # needs: build-upstream-t5x - # uses: ./.github/workflows/_build_rosetta.yaml - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }} - # BASE_LIBRARY: t5x - # secrets: inherit + build-rosetta-t5x: + needs: build-upstream-t5x + uses: ./.github/workflows/_build_rosetta.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }} + BASE_LIBRARY: t5x + secrets: inherit - 
# build-rosetta-pax: - # needs: build-upstream-pax - # uses: ./.github/workflows/_build_rosetta.yaml - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_MEALKIT }} - # BASE_LIBRARY: pax - # secrets: inherit + build-rosetta-pax: + needs: build-upstream-pax + uses: ./.github/workflows/_build_rosetta.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_MEALKIT }} + BASE_LIBRARY: pax + secrets: inherit - # build-gemma: - # needs: build-jax - # uses: ./.github/workflows/_build.yaml - # if: inputs.ARCHITECTURE == 'amd64' # build only amd64 - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # ARTIFACT_NAME: artifact-gemma-build - # BADGE_FILENAME: badge-gemma-build - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - # CONTAINER_NAME: gemma - # DOCKERFILE: rosetta/Dockerfile.gemma - # DOCKER_CONTEXT: . - # EXTRA_BUILD_ARGS: | - # URLREF_GEMMA=${{ fromJson(inputs.SOURCE_URLREFS).GEMMA }} - # URLREF_BIG_VISION=${{ fromJson(inputs.SOURCE_URLREFS).BIG_VISION }} - # URLREF_COMMON_LOOP_UTILS=${{ fromJson(inputs.SOURCE_URLREFS).COMMON_LOOP_UTILS }} - # URLREF_FLAXFORMER=${{ fromJson(inputs.SOURCE_URLREFS).FLAXFORMER }} - # URLREF_PANOPTICAPI=${{ fromJson(inputs.SOURCE_URLREFS).PANOPTICAPI }} - # secrets: inherit + build-gemma: + needs: build-jax + uses: ./.github/workflows/_build.yaml + if: inputs.ARCHITECTURE == 'amd64' # build only amd64 + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: artifact-gemma-build + BADGE_FILENAME: badge-gemma-build + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + CONTAINER_NAME: gemma + DOCKERFILE: rosetta/Dockerfile.gemma + DOCKER_CONTEXT: . 
+ EXTRA_BUILD_ARGS: | + URLREF_GEMMA=${{ fromJson(inputs.SOURCE_URLREFS).GEMMA }} + URLREF_BIG_VISION=${{ fromJson(inputs.SOURCE_URLREFS).BIG_VISION }} + URLREF_COMMON_LOOP_UTILS=${{ fromJson(inputs.SOURCE_URLREFS).COMMON_LOOP_UTILS }} + URLREF_FLAXFORMER=${{ fromJson(inputs.SOURCE_URLREFS).FLAXFORMER }} + URLREF_PANOPTICAPI=${{ fromJson(inputs.SOURCE_URLREFS).PANOPTICAPI }} + secrets: inherit build-axlearn: needs: build-jax @@ -222,15 +222,15 @@ jobs: needs: - build-base - build-jax - # - build-triton - # - build-equinox - # - build-maxtext - # - build-levanter - # - build-upstream-t5x - # - build-upstream-pax - # - build-rosetta-t5x - # - build-rosetta-pax - # - build-gemma + - build-triton + - build-equinox + - build-maxtext + - build-levanter + - build-upstream-t5x + - build-upstream-pax + - build-rosetta-t5x + - build-rosetta-pax + - build-gemma - build-axlearn outputs: TAGS: ${{ steps.collect-tags.outputs.TAGS }} @@ -242,8 +242,26 @@ jobs: [\ {"flavor": "base", "stage": "final", "priority": 800, "tag": "${{ needs.build-base.outputs.DOCKER_TAG }}"},\ {"flavor": "jax", "stage": "final", "priority": 1000, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "triton", "stage": "final", "priority": 900, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "equinox", "stage": "final", "priority": 900, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "maxtext", "stage": "final", "priority": 900, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "levanter", "stage": "final", "priority": 900, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "upstream-t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "upstream-pax", "stage": "final", "priority": 900, "tag": "${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "t5x", "stage": "final", "priority": 900, 
"tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "pax", "stage": "final", "priority": 900, "tag": "${{ needs.build-rosetta-pax.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "gemma", "stage": "final", "priority": 900, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "axlearn", "stage": "final", "priority": 900, "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "jax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "triton", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "equinox", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "maxtext", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "levanter", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "upstream-t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "upstream-pax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-upstream-pax.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "pax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-pax.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "gemma", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_MEALKIT }}"},\ {"flavor": "axlearn", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_MEALKIT }}"},\ {}\ @@ -253,343 +271,263 @@ jobs: echo "TAGS=${TAGS}" >> $GITHUB_OUTPUT - # test-distribution: - # runs-on: ubuntu-22.04 - # strategy: - # matrix: - # TEST_SCRIPT: - # 
- extra-only-distribution.sh - # - mirror-only-distribution.sh - # - upstream-only-distribution.sh - # - local-patch-distribution.sh - # fail-fast: false - # steps: - # - name: Print environment variables - # run: env - # - name: Set git login for tests - # run: | - # git config --global user.email "jax@nvidia.com" - # git config --global user.name "JAX-Toolbox CI" - # - name: Check out the repository under ${GITHUB_WORKSPACE} - # uses: actions/checkout@v4 - # - name: Run integration test ${{ matrix.TEST_SCRIPT }} - # run: bash rosetta/tests/${{ matrix.TEST_SCRIPT }} - - # test-jax: - # needs: build-jax - # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # uses: ./.github/workflows/_test_unit.yaml - # with: - # TEST_NAME: jax - # EXECUTE: | - # docker run -i --shm-size=1g --gpus all \ - # ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - # bash <<"EOF" |& tee test-backend-independent.log - # test-jax.sh -b backend-independent - # EOF - # docker run -i --shm-size=1g --gpus all \ - # ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - # bash <<"EOF" |& tee tee test-gpu.log - # nvidia-cuda-mps-control -d - # test-jax.sh -b gpu - # EOF - # STATISTICS_SCRIPT: | - # errors=$(cat test-*.log | grep -c 'ERROR:' || true) - # failed_tests=$(cat test-*.log | grep -c 'FAILED in' || true) - # passed_tests=$(cat test-*.log | grep -c 'PASSED in' || true) - # total_tests=$((failed_tests + passed_tests)) - # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - # ARTIFACTS: | - # test-backend-independent.log - # test-gpu.log - # secrets: inherit + test-distribution: + runs-on: ubuntu-22.04 + strategy: + matrix: + TEST_SCRIPT: + - extra-only-distribution.sh + - mirror-only-distribution.sh + - upstream-only-distribution.sh + - local-patch-distribution.sh + fail-fast: false + steps: + - name: Print environment 
variables + run: env + - name: Set git login for tests + run: | + git config --global user.email "jax@nvidia.com" + git config --global user.name "JAX-Toolbox CI" + - name: Check out the repository under ${GITHUB_WORKSPACE} + uses: actions/checkout@v4 + - name: Run integration test ${{ matrix.TEST_SCRIPT }} + run: bash rosetta/tests/${{ matrix.TEST_SCRIPT }} + + test-jax: + needs: build-jax + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + uses: ./.github/workflows/_test_unit.yaml + with: + TEST_NAME: jax + EXECUTE: | + docker run -i --shm-size=1g --gpus all \ + ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ + bash <<"EOF" |& tee test-backend-independent.log + test-jax.sh -b backend-independent + EOF + docker run -i --shm-size=1g --gpus all \ + ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ + bash <<"EOF" |& tee tee test-gpu.log + nvidia-cuda-mps-control -d + test-jax.sh -b gpu + EOF + STATISTICS_SCRIPT: | + errors=$(cat test-*.log | grep -c 'ERROR:' || true) + failed_tests=$(cat test-*.log | grep -c 'FAILED in' || true) + passed_tests=$(cat test-*.log | grep -c 'PASSED in' || true) + total_tests=$((failed_tests + passed_tests)) + echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + ARTIFACTS: | + test-backend-independent.log + test-gpu.log + secrets: inherit - # test-nsys-jax: - # needs: build-jax - # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # uses: ./.github/workflows/_test_unit.yaml - # with: - # TEST_NAME: nsys-jax - # EXECUTE: | - # set -o pipefail - # num_tests=0 - # num_failures=0 - # # Run the pytest-driven tests; failure is explicitly handled below so set +e to - # # avoid an early abort here. 
- # set +e - # docker run -i --shm-size=1g --gpus all \ - # -v $PWD:/opt/output \ - # ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - # bash <<"EOF" |& tee test-nsys-jax.log - # # nsys-jax is already installed, this is just adding the test dependencies - # pip install pytest-reportlog nsys-jax[test] - # # abuse knowledge that nsys-jax is installed editable, so the tests exist - # test_path=$(python -c 'import importlib.resources; print(importlib.resources.files("nsys_jax").joinpath("..", "tests").resolve())') - # pytest --report-log=/opt/output/pytest-report.jsonl "${test_path}" - # EOF - # set -e - # GPUS_PER_NODE=$(nvidia-smi -L | grep -c '^GPU') - # for mode in 1-process 2-process process-per-gpu; do - # DOCKER="docker run --shm-size=1g --gpus all --env XLA_FLAGS=--xla_gpu_enable_command_buffer= --env XLA_PYTHON_CLIENT_MEM_FRACTION=0.9 -v ${PWD}:/opt/output ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}" - # if [[ "${mode}" == "1-process" ]]; then - # PROCESS_COUNT=1 - # ARGS="" - # elif [[ "${mode}" == "2-process" ]]; then - # # Use two processes with GPUS_PER_NODE/2 GPUs per process in the hope that - # # this will flush out more bugs than process-per-node or process-per-GPU. 
- # PROCESS_COUNT=2 - # ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process $((GPUS_PER_NODE/2)) --distributed" - # else - # PROCESS_COUNT=${GPUS_PER_NODE} - # ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process 1 --distributed" - # fi - # for collection in full partial; do - # NSYS_JAX="nsys-jax" - # if [[ "${mode}" == "1-process" ]]; then - # # We will not run nsys-jax-combine, so run analyses eagerly - # NSYS_JAX+=" --nsys-jax-analysis communication --nsys-jax-analysis summary" - # fi - # NSYS_JAX+=" --output=/opt/output/${mode}-${collection}-execution-%q{RANK}" - # if [[ "${collection}" == "partial" ]]; then - # NSYS_JAX+=" --capture-range=cudaProfilerApi --capture-range-end=stop" - # # nvbug/4801401 - # NSYS_JAX+=" --sample=none" - # fi - # set +e - # ${DOCKER} parallel-launch RANK ${PROCESS_COUNT} ${NSYS_JAX} \ - # -- jax-nccl-test ${ARGS} |& tee ${mode}-${collection}-execution.log - # num_failures=$((num_failures + ($? 
!= 0))) - # set -e - # num_tests=$((num_tests + 1)) - # done - # if [[ "${mode}" != "1-process" ]]; then - # # Run nsys-jax-combine - # NSYS_JAX_COMBINE="nsys-jax-combine --analysis communication --analysis summary --output=/opt/output/${mode}-${collection}-execution.zip" - # for (( i=0; i> $GITHUB_ENV - # echo "NSYS_JAX_FAIL_COUNT=${num_failures}" >> $GITHUB_ENV - # exit $num_failures - # STATISTICS_SCRIPT: | - # summary_line=$(tail -n1 test-nsys-jax.log) - # num_errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - # passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) - # failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) - # total_tests=$(( NSYS_JAX_TEST_COUNT + passed_tests + failed_tests )) - # num_passed=$(( passed_tests + NSYS_JAX_TEST_COUNT - NSYS_JAX_FAIL_COUNT )) - # num_failed=$(( failed_tests + NSYS_JAX_FAIL_COUNT )) - # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - # echo "ERRORS=${num_errors}" >> $GITHUB_OUTPUT - # echo "PASSED_TESTS=${num_passed}" >> $GITHUB_OUTPUT - # echo "FAILED_TESTS=${num_failed}" >> $GITHUB_OUTPUT - # ARTIFACTS: | - # # pytest-driven part - # test-nsys-jax.log - # pytest-report.jsonl - # # nsys-jax logfiles - # *process-*-execution.log - # # nsys-jax output for the case that doesn't use nsys-jax-combine - # 1-process-*-execution-0.zip - # # nsys-jax-combine output/logfiles - # *process*-*-execution.zip - # *-execution-combine.log - # secrets: inherit + test-nsys-jax: + needs: build-jax + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + uses: ./.github/workflows/_test_unit.yaml + with: + TEST_NAME: nsys-jax + EXECUTE: | + set -o pipefail + num_tests=0 + num_failures=0 + # Run the pytest-driven tests; failure is explicitly handled below so set +e to + # avoid an early 
abort here. + set +e + docker run -i --shm-size=1g --gpus all \ + -v $PWD:/opt/output \ + ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ + bash <<"EOF" |& tee test-nsys-jax.log + # nsys-jax is already installed, this is just adding the test dependencies + pip install pytest-reportlog nsys-jax[test] + # abuse knowledge that nsys-jax is installed editable, so the tests exist + test_path=$(python -c 'import importlib.resources; print(importlib.resources.files("nsys_jax").joinpath("..", "tests").resolve())') + pytest --report-log=/opt/output/pytest-report.jsonl "${test_path}" + EOF + set -e + GPUS_PER_NODE=$(nvidia-smi -L | grep -c '^GPU') + for mode in 1-process 2-process process-per-gpu; do + DOCKER="docker run --shm-size=1g --gpus all --env XLA_FLAGS=--xla_gpu_enable_command_buffer= --env XLA_PYTHON_CLIENT_MEM_FRACTION=0.9 -v ${PWD}:/opt/output ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}" + if [[ "${mode}" == "1-process" ]]; then + PROCESS_COUNT=1 + ARGS="" + elif [[ "${mode}" == "2-process" ]]; then + # Use two processes with GPUS_PER_NODE/2 GPUs per process in the hope that + # this will flush out more bugs than process-per-node or process-per-GPU. 
+ PROCESS_COUNT=2 + ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process $((GPUS_PER_NODE/2)) --distributed" + else + PROCESS_COUNT=${GPUS_PER_NODE} + ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process 1 --distributed" + fi + for collection in full partial; do + NSYS_JAX="nsys-jax" + if [[ "${mode}" == "1-process" ]]; then + # We will not run nsys-jax-combine, so run analyses eagerly + NSYS_JAX+=" --nsys-jax-analysis communication --nsys-jax-analysis summary" + fi + NSYS_JAX+=" --output=/opt/output/${mode}-${collection}-execution-%q{RANK}" + if [[ "${collection}" == "partial" ]]; then + NSYS_JAX+=" --capture-range=cudaProfilerApi --capture-range-end=stop" + # nvbug/4801401 + NSYS_JAX+=" --sample=none" + fi + set +e + ${DOCKER} parallel-launch RANK ${PROCESS_COUNT} ${NSYS_JAX} \ + -- jax-nccl-test ${ARGS} |& tee ${mode}-${collection}-execution.log + num_failures=$((num_failures + ($? 
!= 0))) + set -e + num_tests=$((num_tests + 1)) + done + if [[ "${mode}" != "1-process" ]]; then + # Run nsys-jax-combine + NSYS_JAX_COMBINE="nsys-jax-combine --analysis communication --analysis summary --output=/opt/output/${mode}-${collection}-execution.zip" + for (( i=0; i> $GITHUB_ENV + echo "NSYS_JAX_FAIL_COUNT=${num_failures}" >> $GITHUB_ENV + exit $num_failures + STATISTICS_SCRIPT: | + summary_line=$(tail -n1 test-nsys-jax.log) + num_errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) + failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) + total_tests=$(( NSYS_JAX_TEST_COUNT + passed_tests + failed_tests )) + num_passed=$(( passed_tests + NSYS_JAX_TEST_COUNT - NSYS_JAX_FAIL_COUNT )) + num_failed=$(( failed_tests + NSYS_JAX_FAIL_COUNT )) + echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + echo "ERRORS=${num_errors}" >> $GITHUB_OUTPUT + echo "PASSED_TESTS=${num_passed}" >> $GITHUB_OUTPUT + echo "FAILED_TESTS=${num_failed}" >> $GITHUB_OUTPUT + ARTIFACTS: | + # pytest-driven part + test-nsys-jax.log + pytest-report.jsonl + # nsys-jax logfiles + *process-*-execution.log + # nsys-jax output for the case that doesn't use nsys-jax-combine + 1-process-*-execution-0.zip + # nsys-jax-combine output/logfiles + *process*-*-execution.zip + *-execution-combine.log + secrets: inherit - # # test-nsys-jax generates several fresh .zip archive outputs by running nsys-jax with real GPU hardware; this test - # # runs on a regular GitHub Actions runner and checks that offline post-processing works in an environment that does - # # not already have nsys-jax installed - # test-nsys-jax-archive: - # needs: test-nsys-jax - # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # 
strategy: - # matrix: - # os: [ubuntu-22.04, ubuntu-24.04, macOS-latest] - # runs-on: ${{ matrix.os }} - # steps: - # - name: Download nsys-jax output .zip files - # uses: actions/download-artifact@v4 - # with: - # name: nsys-jax-unit-test-A100 - # - name: Extract archives and execute install scripts - # run: | - # pip install virtualenv # for install.sh - # for zip in $(ls *.zip); do - # ZIP="${PWD}/${zip}" - # pushd $(mktemp -d) - # unzip "${ZIP}" - # ls -l - # # TODO: verify this isn't needed, or make sure it isn't needed - # chmod 755 install.sh - # # Run the notebook with IPython, not Jupyter Lab, so it exits and prints something informative to stdout - # # Skip executing Jupyter lab - # NSYS_JAX_JUPYTER_EXECUTE_NOT_LAB=1 ./install.sh - # popd - # done - - # test-nsys-jax-eks: - # needs: build-jax - # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # runs-on: eks - # env: - # JAX_DOCKER_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} - # JOB_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-jax - # POSTPROCESS_JOB_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-postprocess - # TOKEN_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-token - # steps: - # - name: GHCR login - # uses: ./.github/actions/ghcr-login - # with: - # docker-username: ${{ github.repository_owner }} - # docker-password: ${{ secrets.GITHUB_TOKEN}} - # token-name: ${{ env.TOKEN_NAME }} - # - name: Configure Kubernetes job - # run: | - # yq -i ea 'select(di == 0).spec.selector.job-name = strenv(JOB_NAME) - # | select(di == 1).metadata.name = strenv(JOB_NAME) - # | select(di == 1).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) - # | select(di == 1).spec.template.spec.containers[0].image = strenv(JAX_DOCKER_IMAGE) - # | select(di == 1).spec.template.spec.containers[0].env[0].value = strenv(JOB_NAME)' \ - # .github/eks-workflow-files/job.yml - # git diff .github/eks-workflow-files/job.yml - # - name: Submit Kubernetes job - # uses: 
./.github/acitons/submit-k8s-job - # with: - # job-config-file: .github/eks-workflow-files/job.yml - # job-name: ${{ env.JOB_NAME }} - - # # Clean up in case of errors as well as success - # - name: Delete Kubernetes job - # uses: ./.github/actions/delete-k8s-job - # with: - # job-name: ${{ env.JOB_NAME }} + # test-nsys-jax generates several fresh .zip archive outputs by running nsys-jax with real GPU hardware; this test + # runs on a regular GitHub Actions runner and checks that offline post-processing works in an environment that does + # not already have nsys-jax installed + test-nsys-jax-archive: + needs: test-nsys-jax + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + strategy: + matrix: + os: [ubuntu-22.04, ubuntu-24.04, macOS-latest] + runs-on: ${{ matrix.os }} + steps: + - name: Download nsys-jax output .zip files + uses: actions/download-artifact@v4 + with: + name: nsys-jax-unit-test-A100 + - name: Extract archives and execute install scripts + run: | + pip install virtualenv # for install.sh + for zip in $(ls *.zip); do + ZIP="${PWD}/${zip}" + pushd $(mktemp -d) + unzip "${ZIP}" + ls -l + # TODO: verify this isn't needed, or make sure it isn't needed + chmod 755 install.sh + # Run the notebook with IPython, not Jupyter Lab, so it exits and prints something informative to stdout + # Skip executing Jupyter lab + NSYS_JAX_JUPYTER_EXECUTE_NOT_LAB=1 ./install.sh + popd + done + + test-nsys-jax-eks: + needs: build-jax + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + runs-on: eks + env: + JAX_DOCKER_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} + JOB_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-jax + POSTPROCESS_JOB_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-postprocess + TOKEN_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-token + steps: + - name: GHCR login + uses: ./.github/actions/ghcr-login + with: + docker-username: ${{ github.repository_owner }} + docker-password: ${{ secrets.GITHUB_TOKEN}} + token-name: 
${{ env.TOKEN_NAME }} + - name: Configure Kubernetes job + run: | + yq -i ea 'select(di == 0).spec.selector.job-name = strenv(JOB_NAME) + | select(di == 1).metadata.name = strenv(JOB_NAME) + | select(di == 1).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) + | select(di == 1).spec.template.spec.containers[0].image = strenv(JAX_DOCKER_IMAGE) + | select(di == 1).spec.template.spec.containers[0].env[0].value = strenv(JOB_NAME)' \ + .github/eks-workflow-files/job.yml + git diff .github/eks-workflow-files/job.yml + - name: Submit Kubernetes job + uses: ./.github/acitons/submit-k8s-job + with: + job-config-file: .github/eks-workflow-files/job.yml + job-name: ${{ env.JOB_NAME }} + + # Clean up in case of errors as well as success + - name: Delete Kubernetes job + uses: ./.github/actions/delete-k8s-job + with: + job-name: ${{ env.JOB_NAME }} - # - name: Configure post-processing job - # run: | - # export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip" - # yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME) - # | .spec.template.spec.containers[].image = strenv(JAX_DOCKER_IMAGE) - # | .spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) - # | .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \ - # .github/eks-workflow-files/post-process-job.yml - # git diff .github/eks-workflow-files/post-process-job.yml - # - name: Submit Kubernetes job - # uses: ./.github/acitons/submit-k8s-job - # with: - # job-config-file: .github/eks-workflow-files/post-process-job.yml - # job-name: ${{ env.POSTPROCESS_JOB_NAME }} - # - name: Delete post-processing Kubernetes job - # uses: ./.github/actions/delete-k8s-job - # with: - # job-name: ${{ env.POSTPROCESS_JOB_NAME }} - # - name: Delete GitHub Container Registry token - # uses: ./.github/actions/delete-ghcr-token - # with: - # token-name: ${{ env.TOKEN_NAME }} - - # # test-equinox: - # # needs: build-equinox - # # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # # uses: 
./.github/workflows/_test_unit.yaml - # # with: - # # IMAGE: ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} - # # TEST_NAME: equinox - # # EXECUTE: | - # # docker run --shm-size=1g --gpus all ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} \ - # # bash -exc -o pipefail \ - # # 'pushd /opt/equinox/tests && pip install -r requirements.txt && pytest .' | tee test-equinox.log - # # STATISTICS_SCRIPT: | - # # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - # # failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') - # # passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') - # # total_tests=$((failed_tests + passed_tests)) - # # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - # # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - # # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - # # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - # # ARTIFACTS: | - # # test-equinox.log - # # secrets: inherit - - # test-te-multigpu: - # needs: build-upstream-pax - # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # uses: ./.github/workflows/_test_te.yaml - # with: - # TE_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} - # secrets: inherit + - name: Configure post-processing job + run: | + export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip" + yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME) + | .spec.template.spec.containers[].image = strenv(JAX_DOCKER_IMAGE) + | .spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) + | .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \ + .github/eks-workflow-files/post-process-job.yml + git diff .github/eks-workflow-files/post-process-job.yml + - name: Submit Kubernetes job + uses: ./.github/acitons/submit-k8s-job + with: + job-config-file: .github/eks-workflow-files/post-process-job.yml + job-name: ${{ 
env.POSTPROCESS_JOB_NAME }} + - name: Delete post-processing Kubernetes job + uses: ./.github/actions/delete-k8s-job + with: + job-name: ${{ env.POSTPROCESS_JOB_NAME }} + - name: Delete GitHub Container Registry token + uses: ./.github/actions/delete-ghcr-token + with: + token-name: ${{ env.TOKEN_NAME }} - # test-upstream-t5x: - # needs: build-upstream-t5x + # test-equinox: + # needs: build-equinox # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # uses: ./.github/workflows/_test_upstream_t5x.yaml - # with: - # T5X_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }} - # secrets: inherit - - # test-rosetta-t5x: - # needs: build-rosetta-t5x - # if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 - # uses: ./.github/workflows/_test_t5x_rosetta.yaml - # with: - # T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }} - # secrets: inherit - - # test-triton: - # needs: build-triton - # if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 - # uses: ./.github/workflows/_test_unit.yaml - # with: - # TEST_NAME: triton - # EXECUTE: | - # docker run -i --shm-size=1g --gpus all --volume $PWD:/output \ - # ${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }} \ - # bash <<"EOF" |& tee test-triton.log - # # autotuner tests from jax-triton now hit a triton code path that uses utilities from pytorch; this relies on - # # actually having a CUDA backend for pytoch - # pip install --no-deps torch - # python /opt/jax-triton/tests/triton_call_test.py --xml_output_file /output/triton_test.xml - # EOF - # STATISTICS_SCRIPT: | - # curl -L -o yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg --print-architecture) && chmod 777 yq; - # total_tests=$(./yq '.testsuites."+@tests"' triton_test.xml) - # errors=$(./yq '.testsuites."+@errors"' triton_test.xml) - # failed_tests=$(./yq '.testsuites."+@failures"' triton_test.xml) - # passed_tests=$((total_tests - errors - failed_tests)) - # echo "TOTAL_TESTS=${total_tests}" >> 
$GITHUB_OUTPUT - # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - # ARTIFACTS: | - # test-triton.log - # secrets: inherit - - # test-levanter: - # needs: build-levanter - # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a # uses: ./.github/workflows/_test_unit.yaml # with: - # TEST_NAME: levanter + # IMAGE: ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} + # TEST_NAME: equinox # EXECUTE: | - # docker run -i --gpus all --shm-size=1g \ - # ${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }} \ - # bash <<"EOF" |& tee test-levanter.log - # pip install flake8 pytest soundfile librosa - # PYTHONPATH=/opt/levanter/tests:$PYTHONPATH pytest /opt/levanter/tests -m "not entry and not slow and not ray" - # EOF + # docker run --shm-size=1g --gpus all ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} \ + # bash -exc -o pipefail \ + # 'pushd /opt/equinox/tests && pip install -r requirements.txt && pytest .' 
| tee test-equinox.log # STATISTICS_SCRIPT: | - # summary_line=$(tail -n1 test-levanter.log) # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') # failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') # passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') @@ -599,112 +537,165 @@ jobs: # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT # ARTIFACTS: | - # test-levanter.log + # test-equinox.log # secrets: inherit - # test-te: - # needs: build-upstream-pax - # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # uses: ./.github/workflows/_test_unit.yaml - # with: - # TEST_NAME: te - # EXECUTE: | - # docker run -i --gpus all --shm-size=1g -v $PWD:/log \ - # ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} \ - # bash <<"EOF" |& tee test-te.log - # pip install pytest-reportlog - # pytest --report-log=log/pytest-report.jsonl ${SRC_PATH_TRANSFORMER_ENGINE}/tests/jax - # EOF - # STATISTICS_SCRIPT: | - # summary_line=$(tail -n1 test-te.log) - # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - # passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) - # failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) - # total_tests=$((failed_tests + passed_tests)) - # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - # TIMEOUT_MINUTES: 120 - # ARTIFACTS: | - # test-te.log - # pytest-report.jsonl - # secrets: inherit + test-te-multigpu: + needs: 
build-upstream-pax + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + uses: ./.github/workflows/_test_te.yaml + with: + TE_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} + secrets: inherit - # test-upstream-pax: - # needs: build-upstream-pax - # if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 - # uses: ./.github/workflows/_test_upstream_pax.yaml - # with: - # PAX_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} - # secrets: inherit + test-upstream-t5x: + needs: build-upstream-t5x + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + uses: ./.github/workflows/_test_upstream_t5x.yaml + with: + T5X_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }} + secrets: inherit - # test-rosetta-pax: - # needs: build-rosetta-pax - # if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 - # uses: ./.github/workflows/_test_pax_rosetta.yaml - # with: - # PAX_IMAGE: ${{ needs.build-rosetta-pax.outputs.DOCKER_TAG_FINAL }} - # secrets: inherit - - # test-gemma: - # needs: build-gemma - # uses: ./.github/workflows/_test_unit.yaml - # if: inputs.ARCHITECTURE == 'amd64' - # with: - # TEST_NAME: gemma - # EXECUTE: | - # docker run --shm-size=1g --gpus all ${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }} \ - # bash -ec \ - # "cd /opt/gemma && pip install -e .[dev] && pytest ." 
| tee test-gemma.log - # STATISTICS_SCRIPT: | - # summary_line=$(tail -n1 test-gemma.log) - # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - # failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') - # passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') - # total_tests=$((failed_tests + passed_tests)) - # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - # ARTIFACTS: | - # test-gemma.log - # secrets: inherit + test-rosetta-t5x: + needs: build-rosetta-t5x + if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 + uses: ./.github/workflows/_test_t5x_rosetta.yaml + with: + T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }} + secrets: inherit - # test-maxtext: - # needs: build-maxtext - # if: inputs.ARCHITECTURE == 'amd64' # no arm64 gpu runners - # uses: ./.github/workflows/_test_maxtext.yaml - # with: - # MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }} - # secrets: inherit + test-triton: + needs: build-triton + if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 + uses: ./.github/workflows/_test_unit.yaml + with: + TEST_NAME: triton + EXECUTE: | + docker run -i --shm-size=1g --gpus all --volume $PWD:/output \ + ${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }} \ + bash <<"EOF" |& tee test-triton.log + # autotuner tests from jax-triton now hit a triton code path that uses utilities from pytorch; this relies on + # actually having a CUDA backend for pytoch + pip install --no-deps torch + python /opt/jax-triton/tests/triton_call_test.py --xml_output_file /output/triton_test.xml + EOF + STATISTICS_SCRIPT: | + curl -L -o yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg 
--print-architecture) && chmod 777 yq; + total_tests=$(./yq '.testsuites."+@tests"' triton_test.xml) + errors=$(./yq '.testsuites."+@errors"' triton_test.xml) + failed_tests=$(./yq '.testsuites."+@failures"' triton_test.xml) + passed_tests=$((total_tests - errors - failed_tests)) + echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + ARTIFACTS: | + test-triton.log + secrets: inherit - # test-axlearn-slurm: - # needs: build-axlearn - # if: inputs.ARCHITECTURE == 'amd64' - # uses: ./.github/workflows/_test_unit.yaml - # with: # fix the arguments below - # TEST_NAME: axlearn - # EXECUTE: | - # docker run -i --shm-size=1g --gpus all \ - # ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }} \ - # bash <<"EOF" |& tee test-backend-independent.log - # test-axlearn.sh --directory "." --output "/opt/output/" --test-files "/opt/axlearn/axlearn/common/*_test.py" - # EOF - # STATISTICS_SCRIPT: | - # # Parse the summary.txt file to count passed/failed/error tests - # # Adjust greps if your output format changes. 
- # passed_tests=$(grep -c ": PASSED" /opt/output/summary.txt || true) - # failed_tests=$(grep -c ": FAILED" /opt/output/summary.txt || true) - # total_tests=$((failed_tests + passed_tests)) + test-levanter: + needs: build-levanter + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + uses: ./.github/workflows/_test_unit.yaml + with: + TEST_NAME: levanter + EXECUTE: | + docker run -i --gpus all --shm-size=1g \ + ${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }} \ + bash <<"EOF" |& tee test-levanter.log + pip install flake8 pytest soundfile librosa + PYTHONPATH=/opt/levanter/tests:$PYTHONPATH pytest /opt/levanter/tests -m "not entry and not slow and not ray" + EOF + STATISTICS_SCRIPT: | + summary_line=$(tail -n1 test-levanter.log) + errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') + passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') + total_tests=$((failed_tests + passed_tests)) + echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + ARTIFACTS: | + test-levanter.log + secrets: inherit - # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - # ARTIFACTS: | - # test-backend-independent.log - # secrets: inherit + test-te: + needs: build-upstream-pax + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + uses: ./.github/workflows/_test_unit.yaml + with: + TEST_NAME: te + EXECUTE: | + docker run -i --gpus all --shm-size=1g -v $PWD:/log \ + ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} \ + bash <<"EOF" |& tee test-te.log + pip install pytest-reportlog + pytest 
--report-log=log/pytest-report.jsonl ${SRC_PATH_TRANSFORMER_ENGINE}/tests/jax + EOF + STATISTICS_SCRIPT: | + summary_line=$(tail -n1 test-te.log) + errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) + failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) + total_tests=$((failed_tests + passed_tests)) + echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + TIMEOUT_MINUTES: 120 + ARTIFACTS: | + test-te.log + pytest-report.jsonl + secrets: inherit + test-upstream-pax: + needs: build-upstream-pax + if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 + uses: ./.github/workflows/_test_upstream_pax.yaml + with: + PAX_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} + secrets: inherit + + test-rosetta-pax: + needs: build-rosetta-pax + if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 + uses: ./.github/workflows/_test_pax_rosetta.yaml + with: + PAX_IMAGE: ${{ needs.build-rosetta-pax.outputs.DOCKER_TAG_FINAL }} + secrets: inherit + + test-gemma: + needs: build-gemma + uses: ./.github/workflows/_test_unit.yaml + if: inputs.ARCHITECTURE == 'amd64' + with: + TEST_NAME: gemma + EXECUTE: | + docker run --shm-size=1g --gpus all ${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }} \ + bash -ec \ + "cd /opt/gemma && pip install -e .[dev] && pytest ." 
| tee test-gemma.log + STATISTICS_SCRIPT: | + summary_line=$(tail -n1 test-gemma.log) + errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') + passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') + total_tests=$((failed_tests + passed_tests)) + echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + ARTIFACTS: | + test-gemma.log + secrets: inherit + + test-maxtext: + needs: build-maxtext + if: inputs.ARCHITECTURE == 'amd64' # no arm64 gpu runners + uses: ./.github/workflows/_test_maxtext.yaml + with: + MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }} + secrets: inherit test-axlearn-eks: needs: build-axlearn @@ -749,6 +740,7 @@ jobs: job-name: ${{ env.JOB_NAME }} - name: Download logs from S3 + id: log-s3 run: | mkdir -p /tmp/axlearn-output aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ env.DATE_TEST_RAN }}/summary.txt /tmp/axlearn-output/ @@ -764,7 +756,6 @@ jobs: echo "PASSED_TESTS=$passed_tests" >> $GITHUB_OUTPUT echo "FAILED_TESTS=$failed_tests" >> $GITHUB_OUTPUT echo "TOTAL_TESTS=$total_tests" >> $GITHUB_OUTPUT - - name: Generate sitrep id: sitrep if: "!cancelled()" @@ -775,9 +766,9 @@ jobs: badge_label='Axlearn EKS Unit' - total_tests=${{ steps.test-stats.outputs.TOTAL_TESTS }} \ - failed_tests=${{ steps.test-stats.outputs.FAILED_TESTS }} \ - passed_tests=${{ steps.test-stats.outputs.PASSED_TESTS }} \ + total_tests=${{ steps.log-s3.outputs.TOTAL_TESTS }} \ + failed_tests=${{ steps.log-s3.outputs.FAILED_TESTS }} \ + passed_tests=${{ steps.log-s3.outputs.PASSED_TESTS }} \ errors="0" \ summary="All tests: $total_tests. Passed: $passed_tests. Failed: $failed_tests." 
\ badge_message="Passed $passed_tests out of $total_tests." \ From 3cd5b7842c88db218db868c510291a4d578eaa86 Mon Sep 17 00:00:00 2001 From: Steboss Date: Sat, 15 Feb 2025 19:58:19 +0000 Subject: [PATCH 27/89] Fix nsys --- .github/workflows/_ci.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 02c2c4611..bab13b8eb 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -466,6 +466,8 @@ jobs: POSTPROCESS_JOB_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-postprocess TOKEN_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-token steps: + - name: Check out the repository + uses: actions/checkout@v4 - name: GHCR login uses: ./.github/actions/ghcr-login with: From 51307d9cd04c9bbab3ba48cb3a698a705e6a59f0 Mon Sep 17 00:00:00 2001 From: Steboss Date: Sun, 16 Feb 2025 20:50:18 +0000 Subject: [PATCH 28/89] fix typo --- .github/workflows/_ci.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index bab13b8eb..078449f63 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -484,7 +484,7 @@ jobs: .github/eks-workflow-files/job.yml git diff .github/eks-workflow-files/job.yml - name: Submit Kubernetes job - uses: ./.github/acitons/submit-k8s-job + uses: ./.github/actions/submit-k8s-job with: job-config-file: .github/eks-workflow-files/job.yml job-name: ${{ env.JOB_NAME }} @@ -505,7 +505,7 @@ jobs: .github/eks-workflow-files/post-process-job.yml git diff .github/eks-workflow-files/post-process-job.yml - name: Submit Kubernetes job - uses: ./.github/acitons/submit-k8s-job + uses: ./.github/actions/submit-k8s-job with: job-config-file: .github/eks-workflow-files/post-process-job.yml job-name: ${{ env.POSTPROCESS_JOB_NAME }} From 8d7af610ad81805d6ffbfd316730a6ba07715906 Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 17 Feb 2025 10:37:46 +0000 Subject: [PATCH 29/89] test on eks --- 
.github/actions/delete-k8s-job/action.yml | 21 +- .github/workflows/_ci.yaml | 915 +++++++++++----------- 2 files changed, 477 insertions(+), 459 deletions(-) diff --git a/.github/actions/delete-k8s-job/action.yml b/.github/actions/delete-k8s-job/action.yml index 877039672..97e2b12a9 100644 --- a/.github/actions/delete-k8s-job/action.yml +++ b/.github/actions/delete-k8s-job/action.yml @@ -5,12 +5,29 @@ inputs: job-name: description: The job name to delete required: true + token-name: + description: Name of the K8s secret to delete + required: true runs: using: "composite" steps: - name: Delete Kubernetes job shell: bash - if: always() run: | - kubectl delete job ${{ inputs.job-name }} + # make sure we're deleting all the resources + pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ inputs.job-name }} -o jsonpath='{.items[*].metadata.name}') + + for pod in $pods; do + status=$(kubectl get pod "$pod" -o jsonpath='{.status.phase}' || true) + echo "Pod: $pod, status: $status" + if [ "$status" = "Running" ] || [ "$status" = "Pending" ]; then + kubectl delete pod "$pod" --force --grace-period=0 || true + fi + + # make sure job is deleted + kubectl delete job ${{ inputs.job-name }} --force --grace-period=0 || true + + + # delet eghcr secret + kubectl delete secret ${{ inputs.token-name }} || true diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 078449f63..87c0b689a 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -66,142 +66,142 @@ jobs: URLREF_TRANSFORMER_ENGINE=${{ fromJson(inputs.SOURCE_URLREFS).TRANSFORMER_ENGINE }} secrets: inherit - build-triton: - needs: build-jax - if: inputs.ARCHITECTURE == 'amd64' # Triton does not seem to support arm64 - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: artifact-triton-build - BADGE_FILENAME: badge-triton-build - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ 
needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: triton - DOCKERFILE: .github/container/Dockerfile.triton - RUNNER_SIZE: large - EXTRA_BUILD_ARGS: | - URLREF_JAX_TRITON=${{ fromJson(inputs.SOURCE_URLREFS).JAX_TRITON }} - secrets: inherit + # build-triton: + # needs: build-jax + # if: inputs.ARCHITECTURE == 'amd64' # Triton does not seem to support arm64 + # uses: ./.github/workflows/_build.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: artifact-triton-build + # BADGE_FILENAME: badge-triton-build + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: triton + # DOCKERFILE: .github/container/Dockerfile.triton + # RUNNER_SIZE: large + # EXTRA_BUILD_ARGS: | + # URLREF_JAX_TRITON=${{ fromJson(inputs.SOURCE_URLREFS).JAX_TRITON }} + # secrets: inherit - build-equinox: - needs: build-jax - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: artifact-equinox-build - BADGE_FILENAME: badge-equinox-build - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: equinox - DOCKERFILE: .github/container/Dockerfile.equinox - EXTRA_BUILD_ARGS: | - URLREF_EQUINOX=${{ fromJson(inputs.SOURCE_URLREFS).EQUINOX }} - secrets: inherit + # build-equinox: + # needs: build-jax + # uses: ./.github/workflows/_build.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: artifact-equinox-build + # BADGE_FILENAME: badge-equinox-build + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: equinox + # DOCKERFILE: .github/container/Dockerfile.equinox + # EXTRA_BUILD_ARGS: | + # URLREF_EQUINOX=${{ fromJson(inputs.SOURCE_URLREFS).EQUINOX }} + # secrets: inherit - build-maxtext: - needs: build-jax - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: ${{ 
inputs.ARCHITECTURE }} - ARTIFACT_NAME: artifact-maxtext-build - BADGE_FILENAME: badge-maxtext-build - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: maxtext - DOCKERFILE: .github/container/Dockerfile.maxtext - EXTRA_BUILD_ARGS: | - URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }} - secrets: inherit + # build-maxtext: + # needs: build-jax + # uses: ./.github/workflows/_build.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: artifact-maxtext-build + # BADGE_FILENAME: badge-maxtext-build + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: maxtext + # DOCKERFILE: .github/container/Dockerfile.maxtext + # EXTRA_BUILD_ARGS: | + # URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }} + # secrets: inherit - build-levanter: - needs: [build-jax] - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: "artifact-levanter-build" - BADGE_FILENAME: "badge-levanter-build" - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: levanter - DOCKERFILE: .github/container/Dockerfile.levanter - EXTRA_BUILD_ARGS: | - URLREF_LEVANTER=${{ fromJson(inputs.SOURCE_URLREFS).LEVANTER }} - URLREF_HALIAX=${{ fromJson(inputs.SOURCE_URLREFS).HALIAX }} - secrets: inherit + # build-levanter: + # needs: [build-jax] + # uses: ./.github/workflows/_build.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: "artifact-levanter-build" + # BADGE_FILENAME: "badge-levanter-build" + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: levanter + # DOCKERFILE: .github/container/Dockerfile.levanter + # EXTRA_BUILD_ARGS: | + # URLREF_LEVANTER=${{ fromJson(inputs.SOURCE_URLREFS).LEVANTER }} + # 
URLREF_HALIAX=${{ fromJson(inputs.SOURCE_URLREFS).HALIAX }} + # secrets: inherit - build-upstream-t5x: - needs: build-jax - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: "artifact-t5x-build" - BADGE_FILENAME: "badge-t5x-build" - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: upstream-t5x - DOCKERFILE: .github/container/Dockerfile.t5x - EXTRA_BUILD_ARGS: | - URLREF_T5X=${{ fromJson(inputs.SOURCE_URLREFS).T5X }} - URLREF_AIRIO=${{ fromJson(inputs.SOURCE_URLREFS).AIRIO }} - secrets: inherit + # build-upstream-t5x: + # needs: build-jax + # uses: ./.github/workflows/_build.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: "artifact-t5x-build" + # BADGE_FILENAME: "badge-t5x-build" + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: upstream-t5x + # DOCKERFILE: .github/container/Dockerfile.t5x + # EXTRA_BUILD_ARGS: | + # URLREF_T5X=${{ fromJson(inputs.SOURCE_URLREFS).T5X }} + # URLREF_AIRIO=${{ fromJson(inputs.SOURCE_URLREFS).AIRIO }} + # secrets: inherit - build-upstream-pax: - needs: build-jax - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: artifact-pax-build - BADGE_FILENAME: badge-pax-build - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: upstream-pax - DOCKERFILE: .github/container/Dockerfile.pax - EXTRA_BUILD_ARGS: | - URLREF_PAXML=${{ fromJson(inputs.SOURCE_URLREFS).PAXML }} - URLREF_PRAXIS=${{ fromJson(inputs.SOURCE_URLREFS).PRAXIS }} - URLREF_LINGVO=${{ fromJson(inputs.SOURCE_URLREFS).LINGVO }} - secrets: inherit + # build-upstream-pax: + # needs: build-jax + # uses: ./.github/workflows/_build.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: artifact-pax-build + # 
BADGE_FILENAME: badge-pax-build + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: upstream-pax + # DOCKERFILE: .github/container/Dockerfile.pax + # EXTRA_BUILD_ARGS: | + # URLREF_PAXML=${{ fromJson(inputs.SOURCE_URLREFS).PAXML }} + # URLREF_PRAXIS=${{ fromJson(inputs.SOURCE_URLREFS).PRAXIS }} + # URLREF_LINGVO=${{ fromJson(inputs.SOURCE_URLREFS).LINGVO }} + # secrets: inherit - build-rosetta-t5x: - needs: build-upstream-t5x - uses: ./.github/workflows/_build_rosetta.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }} - BASE_LIBRARY: t5x - secrets: inherit + # build-rosetta-t5x: + # needs: build-upstream-t5x + # uses: ./.github/workflows/_build_rosetta.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }} + # BASE_LIBRARY: t5x + # secrets: inherit - build-rosetta-pax: - needs: build-upstream-pax - uses: ./.github/workflows/_build_rosetta.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_MEALKIT }} - BASE_LIBRARY: pax - secrets: inherit + # build-rosetta-pax: + # needs: build-upstream-pax + # uses: ./.github/workflows/_build_rosetta.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_MEALKIT }} + # BASE_LIBRARY: pax + # secrets: inherit - build-gemma: - needs: build-jax - uses: ./.github/workflows/_build.yaml - if: inputs.ARCHITECTURE == 'amd64' # build only amd64 - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: artifact-gemma-build - BADGE_FILENAME: badge-gemma-build - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ 
needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: gemma - DOCKERFILE: rosetta/Dockerfile.gemma - DOCKER_CONTEXT: . - EXTRA_BUILD_ARGS: | - URLREF_GEMMA=${{ fromJson(inputs.SOURCE_URLREFS).GEMMA }} - URLREF_BIG_VISION=${{ fromJson(inputs.SOURCE_URLREFS).BIG_VISION }} - URLREF_COMMON_LOOP_UTILS=${{ fromJson(inputs.SOURCE_URLREFS).COMMON_LOOP_UTILS }} - URLREF_FLAXFORMER=${{ fromJson(inputs.SOURCE_URLREFS).FLAXFORMER }} - URLREF_PANOPTICAPI=${{ fromJson(inputs.SOURCE_URLREFS).PANOPTICAPI }} - secrets: inherit + # build-gemma: + # needs: build-jax + # uses: ./.github/workflows/_build.yaml + # if: inputs.ARCHITECTURE == 'amd64' # build only amd64 + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: artifact-gemma-build + # BADGE_FILENAME: badge-gemma-build + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: gemma + # DOCKERFILE: rosetta/Dockerfile.gemma + # DOCKER_CONTEXT: . + # EXTRA_BUILD_ARGS: | + # URLREF_GEMMA=${{ fromJson(inputs.SOURCE_URLREFS).GEMMA }} + # URLREF_BIG_VISION=${{ fromJson(inputs.SOURCE_URLREFS).BIG_VISION }} + # URLREF_COMMON_LOOP_UTILS=${{ fromJson(inputs.SOURCE_URLREFS).COMMON_LOOP_UTILS }} + # URLREF_FLAXFORMER=${{ fromJson(inputs.SOURCE_URLREFS).FLAXFORMER }} + # URLREF_PANOPTICAPI=${{ fromJson(inputs.SOURCE_URLREFS).PANOPTICAPI }} + # secrets: inherit build-axlearn: needs: build-jax @@ -222,15 +222,15 @@ jobs: needs: - build-base - build-jax - - build-triton - - build-equinox - - build-maxtext - - build-levanter - - build-upstream-t5x - - build-upstream-pax - - build-rosetta-t5x - - build-rosetta-pax - - build-gemma + # - build-triton + # - build-equinox + # - build-maxtext + # - build-levanter + # - build-upstream-t5x + # - build-upstream-pax + # - build-rosetta-t5x + # - build-rosetta-pax + # - build-gemma - build-axlearn outputs: TAGS: ${{ steps.collect-tags.outputs.TAGS }} @@ -242,26 +242,26 @@ jobs: [\ {"flavor": "base", "stage": 
"final", "priority": 800, "tag": "${{ needs.build-base.outputs.DOCKER_TAG }}"},\ {"flavor": "jax", "stage": "final", "priority": 1000, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "triton", "stage": "final", "priority": 900, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "equinox", "stage": "final", "priority": 900, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "maxtext", "stage": "final", "priority": 900, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "levanter", "stage": "final", "priority": 900, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "upstream-t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "upstream-pax", "stage": "final", "priority": 900, "tag": "${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "pax", "stage": "final", "priority": 900, "tag": "${{ needs.build-rosetta-pax.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "gemma", "stage": "final", "priority": 900, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }}"},\ + # {"flavor": "triton", "stage": "final", "priority": 900, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }}"},\ + # {"flavor": "equinox", "stage": "final", "priority": 900, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }}"},\ + # {"flavor": "maxtext", "stage": "final", "priority": 900, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}"},\ + # {"flavor": "levanter", "stage": "final", "priority": 900, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }}"},\ + # {"flavor": "upstream-t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }}"},\ + # {"flavor": "upstream-pax", "stage": 
"final", "priority": 900, "tag": "${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }}"},\ + # {"flavor": "t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }}"},\ + # {"flavor": "pax", "stage": "final", "priority": 900, "tag": "${{ needs.build-rosetta-pax.outputs.DOCKER_TAG_FINAL }}"},\ + # {"flavor": "gemma", "stage": "final", "priority": 900, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "axlearn", "stage": "final", "priority": 900, "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "jax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "triton", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "equinox", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "maxtext", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "levanter", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "upstream-t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "upstream-pax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-upstream-pax.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "pax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-pax.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "gemma", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_MEALKIT }}"},\ + # {"flavor": "triton", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_MEALKIT }}"},\ + # 
{"flavor": "equinox", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_MEALKIT }}"},\ + # {"flavor": "maxtext", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_MEALKIT }}"},\ + # {"flavor": "levanter", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_MEALKIT }}"},\ + # {"flavor": "upstream-t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ + # {"flavor": "upstream-pax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-upstream-pax.outputs.DOCKER_TAG_MEALKIT }}"},\ + # {"flavor": "t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ + # {"flavor": "pax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-pax.outputs.DOCKER_TAG_MEALKIT }}"},\ + # {"flavor": "gemma", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_MEALKIT }}"},\ {"flavor": "axlearn", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_MEALKIT }}"},\ {}\ @@ -271,27 +271,27 @@ jobs: echo "TAGS=${TAGS}" >> $GITHUB_OUTPUT - test-distribution: - runs-on: ubuntu-22.04 - strategy: - matrix: - TEST_SCRIPT: - - extra-only-distribution.sh - - mirror-only-distribution.sh - - upstream-only-distribution.sh - - local-patch-distribution.sh - fail-fast: false - steps: - - name: Print environment variables - run: env - - name: Set git login for tests - run: | - git config --global user.email "jax@nvidia.com" - git config --global user.name "JAX-Toolbox CI" - - name: Check out the repository under ${GITHUB_WORKSPACE} - uses: actions/checkout@v4 - - name: Run integration test ${{ matrix.TEST_SCRIPT }} - run: bash rosetta/tests/${{ matrix.TEST_SCRIPT }} + # test-distribution: + # runs-on: ubuntu-22.04 + # strategy: + # matrix: + # TEST_SCRIPT: + # - extra-only-distribution.sh + # - 
mirror-only-distribution.sh + # - upstream-only-distribution.sh + # - local-patch-distribution.sh + # fail-fast: false + # steps: + # - name: Print environment variables + # run: env + # - name: Set git login for tests + # run: | + # git config --global user.email "jax@nvidia.com" + # git config --global user.name "JAX-Toolbox CI" + # - name: Check out the repository under ${GITHUB_WORKSPACE} + # uses: actions/checkout@v4 + # - name: Run integration test ${{ matrix.TEST_SCRIPT }} + # run: bash rosetta/tests/${{ matrix.TEST_SCRIPT }} test-jax: needs: build-jax @@ -325,136 +325,136 @@ jobs: test-gpu.log secrets: inherit - test-nsys-jax: - needs: build-jax - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_unit.yaml - with: - TEST_NAME: nsys-jax - EXECUTE: | - set -o pipefail - num_tests=0 - num_failures=0 - # Run the pytest-driven tests; failure is explicitly handled below so set +e to - # avoid an early abort here. - set +e - docker run -i --shm-size=1g --gpus all \ - -v $PWD:/opt/output \ - ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee test-nsys-jax.log - # nsys-jax is already installed, this is just adding the test dependencies - pip install pytest-reportlog nsys-jax[test] - # abuse knowledge that nsys-jax is installed editable, so the tests exist - test_path=$(python -c 'import importlib.resources; print(importlib.resources.files("nsys_jax").joinpath("..", "tests").resolve())') - pytest --report-log=/opt/output/pytest-report.jsonl "${test_path}" - EOF - set -e - GPUS_PER_NODE=$(nvidia-smi -L | grep -c '^GPU') - for mode in 1-process 2-process process-per-gpu; do - DOCKER="docker run --shm-size=1g --gpus all --env XLA_FLAGS=--xla_gpu_enable_command_buffer= --env XLA_PYTHON_CLIENT_MEM_FRACTION=0.9 -v ${PWD}:/opt/output ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}" - if [[ "${mode}" == "1-process" ]]; then - PROCESS_COUNT=1 - ARGS="" - elif [[ "${mode}" == "2-process" ]]; then - # Use two processes 
with GPUS_PER_NODE/2 GPUs per process in the hope that - # this will flush out more bugs than process-per-node or process-per-GPU. - PROCESS_COUNT=2 - ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process $((GPUS_PER_NODE/2)) --distributed" - else - PROCESS_COUNT=${GPUS_PER_NODE} - ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process 1 --distributed" - fi - for collection in full partial; do - NSYS_JAX="nsys-jax" - if [[ "${mode}" == "1-process" ]]; then - # We will not run nsys-jax-combine, so run analyses eagerly - NSYS_JAX+=" --nsys-jax-analysis communication --nsys-jax-analysis summary" - fi - NSYS_JAX+=" --output=/opt/output/${mode}-${collection}-execution-%q{RANK}" - if [[ "${collection}" == "partial" ]]; then - NSYS_JAX+=" --capture-range=cudaProfilerApi --capture-range-end=stop" - # nvbug/4801401 - NSYS_JAX+=" --sample=none" - fi - set +e - ${DOCKER} parallel-launch RANK ${PROCESS_COUNT} ${NSYS_JAX} \ - -- jax-nccl-test ${ARGS} |& tee ${mode}-${collection}-execution.log - num_failures=$((num_failures + ($? 
!= 0))) - set -e - num_tests=$((num_tests + 1)) - done - if [[ "${mode}" != "1-process" ]]; then - # Run nsys-jax-combine - NSYS_JAX_COMBINE="nsys-jax-combine --analysis communication --analysis summary --output=/opt/output/${mode}-${collection}-execution.zip" - for (( i=0; i> $GITHUB_ENV - echo "NSYS_JAX_FAIL_COUNT=${num_failures}" >> $GITHUB_ENV - exit $num_failures - STATISTICS_SCRIPT: | - summary_line=$(tail -n1 test-nsys-jax.log) - num_errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) - failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) - total_tests=$(( NSYS_JAX_TEST_COUNT + passed_tests + failed_tests )) - num_passed=$(( passed_tests + NSYS_JAX_TEST_COUNT - NSYS_JAX_FAIL_COUNT )) - num_failed=$(( failed_tests + NSYS_JAX_FAIL_COUNT )) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${num_errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${num_passed}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${num_failed}" >> $GITHUB_OUTPUT - ARTIFACTS: | - # pytest-driven part - test-nsys-jax.log - pytest-report.jsonl - # nsys-jax logfiles - *process-*-execution.log - # nsys-jax output for the case that doesn't use nsys-jax-combine - 1-process-*-execution-0.zip - # nsys-jax-combine output/logfiles - *process*-*-execution.zip - *-execution-combine.log - secrets: inherit + # test-nsys-jax: + # needs: build-jax + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # uses: ./.github/workflows/_test_unit.yaml + # with: + # TEST_NAME: nsys-jax + # EXECUTE: | + # set -o pipefail + # num_tests=0 + # num_failures=0 + # # Run the pytest-driven tests; failure is explicitly handled below so set +e to + # # avoid an early abort here. 
+ # set +e + # docker run -i --shm-size=1g --gpus all \ + # -v $PWD:/opt/output \ + # ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-nsys-jax.log + # # nsys-jax is already installed, this is just adding the test dependencies + # pip install pytest-reportlog nsys-jax[test] + # # abuse knowledge that nsys-jax is installed editable, so the tests exist + # test_path=$(python -c 'import importlib.resources; print(importlib.resources.files("nsys_jax").joinpath("..", "tests").resolve())') + # pytest --report-log=/opt/output/pytest-report.jsonl "${test_path}" + # EOF + # set -e + # GPUS_PER_NODE=$(nvidia-smi -L | grep -c '^GPU') + # for mode in 1-process 2-process process-per-gpu; do + # DOCKER="docker run --shm-size=1g --gpus all --env XLA_FLAGS=--xla_gpu_enable_command_buffer= --env XLA_PYTHON_CLIENT_MEM_FRACTION=0.9 -v ${PWD}:/opt/output ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}" + # if [[ "${mode}" == "1-process" ]]; then + # PROCESS_COUNT=1 + # ARGS="" + # elif [[ "${mode}" == "2-process" ]]; then + # # Use two processes with GPUS_PER_NODE/2 GPUs per process in the hope that + # # this will flush out more bugs than process-per-node or process-per-GPU. 
+ # PROCESS_COUNT=2 + # ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process $((GPUS_PER_NODE/2)) --distributed" + # else + # PROCESS_COUNT=${GPUS_PER_NODE} + # ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process 1 --distributed" + # fi + # for collection in full partial; do + # NSYS_JAX="nsys-jax" + # if [[ "${mode}" == "1-process" ]]; then + # # We will not run nsys-jax-combine, so run analyses eagerly + # NSYS_JAX+=" --nsys-jax-analysis communication --nsys-jax-analysis summary" + # fi + # NSYS_JAX+=" --output=/opt/output/${mode}-${collection}-execution-%q{RANK}" + # if [[ "${collection}" == "partial" ]]; then + # NSYS_JAX+=" --capture-range=cudaProfilerApi --capture-range-end=stop" + # # nvbug/4801401 + # NSYS_JAX+=" --sample=none" + # fi + # set +e + # ${DOCKER} parallel-launch RANK ${PROCESS_COUNT} ${NSYS_JAX} \ + # -- jax-nccl-test ${ARGS} |& tee ${mode}-${collection}-execution.log + # num_failures=$((num_failures + ($? 
!= 0))) + # set -e + # num_tests=$((num_tests + 1)) + # done + # if [[ "${mode}" != "1-process" ]]; then + # # Run nsys-jax-combine + # NSYS_JAX_COMBINE="nsys-jax-combine --analysis communication --analysis summary --output=/opt/output/${mode}-${collection}-execution.zip" + # for (( i=0; i> $GITHUB_ENV + # echo "NSYS_JAX_FAIL_COUNT=${num_failures}" >> $GITHUB_ENV + # exit $num_failures + # STATISTICS_SCRIPT: | + # summary_line=$(tail -n1 test-nsys-jax.log) + # num_errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + # passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) + # failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) + # total_tests=$(( NSYS_JAX_TEST_COUNT + passed_tests + failed_tests )) + # num_passed=$(( passed_tests + NSYS_JAX_TEST_COUNT - NSYS_JAX_FAIL_COUNT )) + # num_failed=$(( failed_tests + NSYS_JAX_FAIL_COUNT )) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${num_errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${num_passed}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${num_failed}" >> $GITHUB_OUTPUT + # ARTIFACTS: | + # # pytest-driven part + # test-nsys-jax.log + # pytest-report.jsonl + # # nsys-jax logfiles + # *process-*-execution.log + # # nsys-jax output for the case that doesn't use nsys-jax-combine + # 1-process-*-execution-0.zip + # # nsys-jax-combine output/logfiles + # *process*-*-execution.zip + # *-execution-combine.log + # secrets: inherit # test-nsys-jax generates several fresh .zip archive outputs by running nsys-jax with real GPU hardware; this test # runs on a regular GitHub Actions runner and checks that offline post-processing works in an environment that does # not already have nsys-jax installed - test-nsys-jax-archive: - needs: test-nsys-jax - if: 
inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - strategy: - matrix: - os: [ubuntu-22.04, ubuntu-24.04, macOS-latest] - runs-on: ${{ matrix.os }} - steps: - - name: Download nsys-jax output .zip files - uses: actions/download-artifact@v4 - with: - name: nsys-jax-unit-test-A100 - - name: Extract archives and execute install scripts - run: | - pip install virtualenv # for install.sh - for zip in $(ls *.zip); do - ZIP="${PWD}/${zip}" - pushd $(mktemp -d) - unzip "${ZIP}" - ls -l - # TODO: verify this isn't needed, or make sure it isn't needed - chmod 755 install.sh - # Run the notebook with IPython, not Jupyter Lab, so it exits and prints something informative to stdout - # Skip executing Jupyter lab - NSYS_JAX_JUPYTER_EXECUTE_NOT_LAB=1 ./install.sh - popd - done + # test-nsys-jax-archive: + # needs: test-nsys-jax + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # strategy: + # matrix: + # os: [ubuntu-22.04, ubuntu-24.04, macOS-latest] + # runs-on: ${{ matrix.os }} + # steps: + # - name: Download nsys-jax output .zip files + # uses: actions/download-artifact@v4 + # with: + # name: nsys-jax-unit-test-A100 + # - name: Extract archives and execute install scripts + # run: | + # pip install virtualenv # for install.sh + # for zip in $(ls *.zip); do + # ZIP="${PWD}/${zip}" + # pushd $(mktemp -d) + # unzip "${ZIP}" + # ls -l + # # TODO: verify this isn't needed, or make sure it isn't needed + # chmod 755 install.sh + # # Run the notebook with IPython, not Jupyter Lab, so it exits and prints something informative to stdout + # # Skip executing Jupyter lab + # NSYS_JAX_JUPYTER_EXECUTE_NOT_LAB=1 ./install.sh + # popd + # done test-nsys-jax-eks: needs: build-jax @@ -489,11 +489,12 @@ jobs: job-config-file: .github/eks-workflow-files/job.yml job-name: ${{ env.JOB_NAME }} - # Clean up in case of errors as well as success - - name: Delete Kubernetes job + - name: Delete eks job uses: ./.github/actions/delete-k8s-job + if: ( cancelled() || always() ) with: job-name: 
${{ env.JOB_NAME }} + token-name: ${{ env.TOKEN_NAME }} - name: Configure post-processing job run: | @@ -509,10 +510,12 @@ jobs: with: job-config-file: .github/eks-workflow-files/post-process-job.yml job-name: ${{ env.POSTPROCESS_JOB_NAME }} - - name: Delete post-processing Kubernetes job + - name: Delete eks postprocess job uses: ./.github/actions/delete-k8s-job + if: ( cancelled() || always() ) with: - job-name: ${{ env.POSTPROCESS_JOB_NAME }} + job-name: ${{ env.POSTPROCESS_JOB_NAME }} + token-name: ${{ env.TOKEN_NAME }} - name: Delete GitHub Container Registry token uses: ./.github/actions/delete-ghcr-token with: @@ -542,162 +545,162 @@ jobs: # test-equinox.log # secrets: inherit - test-te-multigpu: - needs: build-upstream-pax - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_te.yaml - with: - TE_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} - secrets: inherit + # test-te-multigpu: + # needs: build-upstream-pax + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # uses: ./.github/workflows/_test_te.yaml + # with: + # TE_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit - test-upstream-t5x: - needs: build-upstream-t5x - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_upstream_t5x.yaml - with: - T5X_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }} - secrets: inherit + # test-upstream-t5x: + # needs: build-upstream-t5x + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # uses: ./.github/workflows/_test_upstream_t5x.yaml + # with: + # T5X_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit - test-rosetta-t5x: - needs: 
build-rosetta-t5x + # if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 + # uses: ./.github/workflows/_test_t5x_rosetta.yaml + # with: + # T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit - test-triton: - needs: build-triton - if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 - uses: ./.github/workflows/_test_unit.yaml - with: - TEST_NAME: triton - EXECUTE: | - docker run -i --shm-size=1g --gpus all --volume $PWD:/output \ - ${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee test-triton.log - # autotuner tests from jax-triton now hit a triton code path that uses utilities from pytorch; this relies on - # actually having a CUDA backend for pytoch - pip install --no-deps torch - python /opt/jax-triton/tests/triton_call_test.py --xml_output_file /output/triton_test.xml - EOF - STATISTICS_SCRIPT: | - curl -L -o yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg --print-architecture) && chmod 777 yq; - total_tests=$(./yq '.testsuites."+@tests"' triton_test.xml) - errors=$(./yq '.testsuites."+@errors"' triton_test.xml) - failed_tests=$(./yq '.testsuites."+@failures"' triton_test.xml) - passed_tests=$((total_tests - errors - failed_tests)) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - ARTIFACTS: | - test-triton.log - secrets: inherit + # test-triton: + # needs: build-triton + # if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 + # uses: ./.github/workflows/_test_unit.yaml + # with: + # TEST_NAME: triton + # EXECUTE: | + # docker run -i --shm-size=1g --gpus all --volume $PWD:/output \ + # ${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-triton.log + # # autotuner tests from jax-triton now hit a triton code path that uses utilities from pytorch; this relies on + # # 
actually having a CUDA backend for pytoch + # pip install --no-deps torch + # python /opt/jax-triton/tests/triton_call_test.py --xml_output_file /output/triton_test.xml + # EOF + # STATISTICS_SCRIPT: | + # curl -L -o yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg --print-architecture) && chmod 777 yq; + # total_tests=$(./yq '.testsuites."+@tests"' triton_test.xml) + # errors=$(./yq '.testsuites."+@errors"' triton_test.xml) + # failed_tests=$(./yq '.testsuites."+@failures"' triton_test.xml) + # passed_tests=$((total_tests - errors - failed_tests)) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # ARTIFACTS: | + # test-triton.log + # secrets: inherit - test-levanter: - needs: build-levanter - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_unit.yaml - with: - TEST_NAME: levanter - EXECUTE: | - docker run -i --gpus all --shm-size=1g \ - ${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee test-levanter.log - pip install flake8 pytest soundfile librosa - PYTHONPATH=/opt/levanter/tests:$PYTHONPATH pytest /opt/levanter/tests -m "not entry and not slow and not ray" - EOF - STATISTICS_SCRIPT: | - summary_line=$(tail -n1 test-levanter.log) - errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') - passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') - total_tests=$((failed_tests + passed_tests)) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - ARTIFACTS: 
| - test-levanter.log - secrets: inherit + # test-levanter: + # needs: build-levanter + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # uses: ./.github/workflows/_test_unit.yaml + # with: + # TEST_NAME: levanter + # EXECUTE: | + # docker run -i --gpus all --shm-size=1g \ + # ${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-levanter.log + # pip install flake8 pytest soundfile librosa + # PYTHONPATH=/opt/levanter/tests:$PYTHONPATH pytest /opt/levanter/tests -m "not entry and not slow and not ray" + # EOF + # STATISTICS_SCRIPT: | + # summary_line=$(tail -n1 test-levanter.log) + # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + # failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') + # passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') + # total_tests=$((failed_tests + passed_tests)) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # ARTIFACTS: | + # test-levanter.log + # secrets: inherit - test-te: - needs: build-upstream-pax - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_unit.yaml - with: - TEST_NAME: te - EXECUTE: | - docker run -i --gpus all --shm-size=1g -v $PWD:/log \ - ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee test-te.log - pip install pytest-reportlog - pytest --report-log=log/pytest-report.jsonl ${SRC_PATH_TRANSFORMER_ENGINE}/tests/jax - EOF - STATISTICS_SCRIPT: | - summary_line=$(tail -n1 test-te.log) - errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" 
and .outcome == "passed") | .outcome' | wc -l) - failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) - total_tests=$((failed_tests + passed_tests)) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - TIMEOUT_MINUTES: 120 - ARTIFACTS: | - test-te.log - pytest-report.jsonl - secrets: inherit + # test-te: + # needs: build-upstream-pax + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # uses: ./.github/workflows/_test_unit.yaml + # with: + # TEST_NAME: te + # EXECUTE: | + # docker run -i --gpus all --shm-size=1g -v $PWD:/log \ + # ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-te.log + # pip install pytest-reportlog + # pytest --report-log=log/pytest-report.jsonl ${SRC_PATH_TRANSFORMER_ENGINE}/tests/jax + # EOF + # STATISTICS_SCRIPT: | + # summary_line=$(tail -n1 test-te.log) + # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + # passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) + # failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) + # total_tests=$((failed_tests + passed_tests)) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # TIMEOUT_MINUTES: 120 + # ARTIFACTS: | + # test-te.log + # pytest-report.jsonl + # secrets: inherit - test-upstream-pax: - needs: build-upstream-pax - if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 - 
uses: ./.github/workflows/_test_upstream_pax.yaml - with: - PAX_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} - secrets: inherit + # test-upstream-pax: + # needs: build-upstream-pax + # if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 + # uses: ./.github/workflows/_test_upstream_pax.yaml + # with: + # PAX_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit - test-rosetta-pax: - needs: build-rosetta-pax - if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 - uses: ./.github/workflows/_test_pax_rosetta.yaml - with: - PAX_IMAGE: ${{ needs.build-rosetta-pax.outputs.DOCKER_TAG_FINAL }} - secrets: inherit + # test-rosetta-pax: + # needs: build-rosetta-pax + # if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 + # uses: ./.github/workflows/_test_pax_rosetta.yaml + # with: + # PAX_IMAGE: ${{ needs.build-rosetta-pax.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit - test-gemma: - needs: build-gemma - uses: ./.github/workflows/_test_unit.yaml - if: inputs.ARCHITECTURE == 'amd64' - with: - TEST_NAME: gemma - EXECUTE: | - docker run --shm-size=1g --gpus all ${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }} \ - bash -ec \ - "cd /opt/gemma && pip install -e .[dev] && pytest ." 
| tee test-gemma.log - STATISTICS_SCRIPT: | - summary_line=$(tail -n1 test-gemma.log) - errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') - passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') - total_tests=$((failed_tests + passed_tests)) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - ARTIFACTS: | - test-gemma.log - secrets: inherit + # test-gemma: + # needs: build-gemma + # uses: ./.github/workflows/_test_unit.yaml + # if: inputs.ARCHITECTURE == 'amd64' + # with: + # TEST_NAME: gemma + # EXECUTE: | + # docker run --shm-size=1g --gpus all ${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }} \ + # bash -ec \ + # "cd /opt/gemma && pip install -e .[dev] && pytest ." 
| tee test-gemma.log + # STATISTICS_SCRIPT: | + # summary_line=$(tail -n1 test-gemma.log) + # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + # failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') + # passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') + # total_tests=$((failed_tests + passed_tests)) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # ARTIFACTS: | + # test-gemma.log + # secrets: inherit - test-maxtext: - needs: build-maxtext - if: inputs.ARCHITECTURE == 'amd64' # no arm64 gpu runners - uses: ./.github/workflows/_test_maxtext.yaml - with: - MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }} - secrets: inherit + # test-maxtext: + # needs: build-maxtext + # if: inputs.ARCHITECTURE == 'amd64' # no arm64 gpu runners + # uses: ./.github/workflows/_test_maxtext.yaml + # with: + # MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit test-axlearn-eks: needs: build-axlearn @@ -738,8 +741,10 @@ jobs: - name: Delete axlearn test job uses: ./.github/actions/delete-k8s-job + if: ( cancelled() || always() ) with: job-name: ${{ env.JOB_NAME }} + token-name: ${{ env.TOKEN_NAME }} - name: Download logs from S3 id: log-s3 @@ -801,8 +806,4 @@ jobs: sitrep.json "badge-axlearn-test" summary.txt - - name: Delete GitHub Container Registry token - uses: ./.github/actions/delete-ghcr-token - with: - token-name: ${{ env.TOKEN_NAME }} From ca15908a9b6c0de0039093ae50aace4ab04d78ba Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 17 Feb 2025 11:21:44 +0000 Subject: [PATCH 30/89] forgot the done for --- .github/actions/delete-k8s-job/action.yml | 1 + 1 file changed, 1 insertion(+) diff --git 
a/.github/actions/delete-k8s-job/action.yml b/.github/actions/delete-k8s-job/action.yml index 97e2b12a9..db749daab 100644 --- a/.github/actions/delete-k8s-job/action.yml +++ b/.github/actions/delete-k8s-job/action.yml @@ -24,6 +24,7 @@ runs: if [ "$status" = "Running" ] || [ "$status" = "Pending" ]; then kubectl delete pod "$pod" --force --grace-period=0 || true fi + done # make sure job is deleted kubectl delete job ${{ inputs.job-name }} --force --grace-period=0 || true From 9fe301c82ee8f3c8a7629de95865fa18bb9c2907 Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 17 Feb 2025 12:03:48 +0000 Subject: [PATCH 31/89] move ghcr deletion a part --- .github/actions/delete-k8s-job/action.yml | 10 ++-------- .github/workflows/_ci.yaml | 8 ++++++-- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/.github/actions/delete-k8s-job/action.yml b/.github/actions/delete-k8s-job/action.yml index db749daab..15a5add64 100644 --- a/.github/actions/delete-k8s-job/action.yml +++ b/.github/actions/delete-k8s-job/action.yml @@ -5,9 +5,7 @@ inputs: job-name: description: The job name to delete required: true - token-name: - description: Name of the K8s secret to delete - required: true + runs: using: "composite" @@ -27,8 +25,4 @@ runs: done # make sure job is deleted - kubectl delete job ${{ inputs.job-name }} --force --grace-period=0 || true - - - # delet eghcr secret - kubectl delete secret ${{ inputs.token-name }} || true + kubectl delete job ${{ inputs.job-name }} --force --grace-period=0 || true \ No newline at end of file diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 87c0b689a..b2fa91b25 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -494,7 +494,6 @@ jobs: if: ( cancelled() || always() ) with: job-name: ${{ env.JOB_NAME }} - token-name: ${{ env.TOKEN_NAME }} - name: Configure post-processing job run: | @@ -515,9 +514,9 @@ jobs: if: ( cancelled() || always() ) with: job-name: ${{ env.JOB_NAME }} - token-name: ${{ 
env.TOKEN_NAME }} - name: Delete GitHub Container Registry token uses: ./.github/actions/delete-ghcr-token + if: ( cancelled() || always() ) with: token-name: ${{ env.TOKEN_NAME }} @@ -744,6 +743,11 @@ jobs: if: ( cancelled() || always() ) with: job-name: ${{ env.JOB_NAME }} + + - name: Delete GitHub Container Registry token + uses: ./.github/actions/delete-ghcr-token + if: ( cancelled() || always() ) + with: token-name: ${{ env.TOKEN_NAME }} - name: Download logs from S3 From 9125c820cc4aca7db768c8937c32fff297c0ccff Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 17 Feb 2025 14:22:03 +0000 Subject: [PATCH 32/89] try to replace postprocess --- .github/container/test-axlearn.sh | 2 +- .github/workflows/_ci.yaml | 26 +++++++++----------------- 2 files changed, 10 insertions(+), 18 deletions(-) diff --git a/.github/container/test-axlearn.sh b/.github/container/test-axlearn.sh index 27118b7a0..088c955df 100755 --- a/.github/container/test-axlearn.sh +++ b/.github/container/test-axlearn.sh @@ -120,7 +120,7 @@ echo "Running tests..." 
# If we are on Kubernetes, install torch for cpu only if [ "$K8S" = true ]; then pip install torch --extra-index-url https://download.pytorch.org/whl/cpu - pip install transformers + pip install transformers sklearn timm fi if [ "${#TEST_FILES[@]}" -eq 0 ]; then diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index b2fa91b25..0c09e3ba5 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -495,25 +495,17 @@ jobs: with: job-name: ${{ env.JOB_NAME }} - - name: Configure post-processing job + + - name: Postprocess retrieve test run: | export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip" - yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME) - | .spec.template.spec.containers[].image = strenv(JAX_DOCKER_IMAGE) - | .spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) - | .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \ - .github/eks-workflow-files/post-process-job.yml - git diff .github/eks-workflow-files/post-process-job.yml - - name: Submit Kubernetes job - uses: ./.github/actions/submit-k8s-job - with: - job-config-file: .github/eks-workflow-files/post-process-job.yml - job-name: ${{ env.POSTPROCESS_JOB_NAME }} - - name: Delete eks postprocess job - uses: ./.github/actions/delete-k8s-job - if: ( cancelled() || always() ) - with: - job-name: ${{ env.JOB_NAME }} + mkdir -p /tmp/axlearn-output + aws s3 cp --recursive --exclude "*" --include "${JOB_OUTPUT_PATTERN}" s3://jax-toolbox-eks-output/ /tmp/axlearn-output + + - name: Combine with nsys-jax-combine + run: | + cd /tmp/axlearn-output + nsys-jax-combine -o combined.zip ./*.zip --analysis communication - name: Delete GitHub Container Registry token uses: ./.github/actions/delete-ghcr-token if: ( cancelled() || always() ) From 4b39c9cdd2ea1b0ce241e48bd0be742df3e7f331 Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 17 Feb 2025 16:20:36 +0000 Subject: [PATCH 33/89] fix nccl test --- .github/actions/submit-k8s-job/action.yml | 13 ++- 
.github/workflows/_ci.yaml | 41 ++++--- .github/workflows/_test_nccl.yaml | 131 ---------------------- .github/workflows/nccl-k8s.yaml | 105 ++++++++++++++++- 4 files changed, 136 insertions(+), 154 deletions(-) delete mode 100644 .github/workflows/_test_nccl.yaml diff --git a/.github/actions/submit-k8s-job/action.yml b/.github/actions/submit-k8s-job/action.yml index c00826897..aa73cf2e2 100644 --- a/.github/actions/submit-k8s-job/action.yml +++ b/.github/actions/submit-k8s-job/action.yml @@ -17,12 +17,21 @@ runs: run: | kubectl apply -f "${{ inputs.job-config-file }}" - - name: Wait for Kubernetes job to start + - name: Wait for job to be un-suspended (Kueue) shell: bash + run: | + # wait for the job to be created + kubectl wait --for=create job/${{ inputs.job-name }} --timeout=60 + + # wait for the 'spec.suspend' field to become false. Necessary for kueue + kubectl wait --for=jsonpath='{.spec.suspend}=false' job/${{ inputs.job-name }} --timeout=3600s + + - name: Wait for pods to start + shell: bash run: | while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ inputs.job-name }} --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do echo "Waiting for pods to start..." 
- sleep 10 + sleep 20 done - name: Stream Kubernetes job output diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 0c09e3ba5..cfe15607c 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -462,9 +462,9 @@ jobs: runs-on: eks env: JAX_DOCKER_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} - JOB_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-jax - POSTPROCESS_JOB_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-postprocess - TOKEN_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-token + JOB_NAME: ${{ github.run_id }}-nsys-jax + POSTPROCESS_JOB_NAME: ${{ github.run_id }}-nsys-jax-postprocess + TOKEN_NAME: ${{ github.run_id }}-nsys-jax-token steps: - name: Check out the repository uses: actions/checkout@v4 @@ -488,27 +488,32 @@ jobs: with: job-config-file: .github/eks-workflow-files/job.yml job-name: ${{ env.JOB_NAME }} - - name: Delete eks job uses: ./.github/actions/delete-k8s-job - if: ( cancelled() || always() ) + if: always() with: job-name: ${{ env.JOB_NAME }} - - - - name: Postprocess retrieve test + - name: Configure post-processing job run: | export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip" - mkdir -p /tmp/axlearn-output - aws s3 cp --recursive --exclude "*" --include "${JOB_OUTPUT_PATTERN}" s3://jax-toolbox-eks-output/ /tmp/axlearn-output - - - name: Combine with nsys-jax-combine - run: | - cd /tmp/axlearn-output - nsys-jax-combine -o combined.zip ./*.zip --analysis communication + yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME) + | .spec.template.spec.containers[].image = strenv(JAX_DOCKER_IMAGE) + | .spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) + | .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \ + .github/eks-workflow-files/post-process-job.yml + git diff .github/eks-workflow-files/post-process-job.yml + - name: Submit post process k8s job + uses: ./.github/actions/submit-k8s/job + with: + job-config-file: 
.github/eks-workflow-files/post-process-job.yml + job-name: ${{ env.POSTPROCESS_JOB_NAME }} + - name: Delete post process k8s job + uses: ./.github/actions/delete-k8s-job + with: + job-name: ${{ env.POSTPROCESS_JOB_NAME }} - name: Delete GitHub Container Registry token uses: ./.github/actions/delete-ghcr-token - if: ( cancelled() || always() ) + if: always() with: token-name: ${{ env.TOKEN_NAME }} @@ -732,13 +737,13 @@ jobs: - name: Delete axlearn test job uses: ./.github/actions/delete-k8s-job - if: ( cancelled() || always() ) + if: always() with: job-name: ${{ env.JOB_NAME }} - name: Delete GitHub Container Registry token uses: ./.github/actions/delete-ghcr-token - if: ( cancelled() || always() ) + if: always() with: token-name: ${{ env.TOKEN_NAME }} diff --git a/.github/workflows/_test_nccl.yaml b/.github/workflows/_test_nccl.yaml deleted file mode 100644 index 54da0886e..000000000 --- a/.github/workflows/_test_nccl.yaml +++ /dev/null @@ -1,131 +0,0 @@ -name: ~run NCCL tests - -on: - workflow_call: - inputs: - # Note that cuda-dl-base installs the NCCL tests, while the vanilla nvidia/cuda - # images do not; when JAX-Toolbox moves to using cuda-dl-base this workflow ought - # to be modified to test one of the JAX-Toolbox containers. - CONTAINER: - type: string - description: CUDA image to use as base, e.g. 
nvcr.io/nvidia/cuda-dl-base:24.11-cuda12.6-devel-ubuntu24.04 - required: true - -permissions: - actions: write # to cancel previous workflows - contents: read # to fetch code - packages: write # to upload container - -jobs: - build-mpi-operator-compatible-base: - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: amd64 - ARTIFACT_NAME: artifact-mpi-operator-compatible-base-build - BADGE_FILENAME: badge-mpi-operator-compatible-base-build - BUILD_DATE: 0000-00-00 # not important; this image is never published - BASE_IMAGE: ${{ inputs.CONTAINER }} - CONTAINER_NAME: mpi-operator-compatible-base - DOCKERFILE: .github/container/Dockerfile.mpi-operator-compatible-base - RUNNER_SIZE: small - secrets: inherit - nccl-test: - needs: build-mpi-operator-compatible-base - strategy: - matrix: - test: [all_gather_perf_mpi, all_reduce_perf_mpi, broadcast_perf_mpi, reduce_scatter_perf_mpi] - runs-on: eks - env: - BASE_IMAGE: ${{ needs.build-mpi-operator-compatible-base.outputs.DOCKER_TAG_FINAL }} - TEST_NAME: ${{ matrix.test }} - steps: - - name: Check out the repository - uses: actions/checkout@v4 - - name: Login to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} - - name: Store GitHub Container Registry token as Kubernetes secret - run: | - # Replace underscores in TEST_NAME with - to make a valid Kubernetes name - JOB_NAME="nccl-test-${{ github.run_id }}-${{ github.run_attempt }}-${TEST_NAME//_/-}" - LAUNCHER_NAME="${JOB_NAME}-launcher" - TOKEN_NAME="${JOB_NAME}-token" - # Make these available to later steps - echo "JOB_NAME=${JOB_NAME}" >> "$GITHUB_ENV" - echo "LAUNCHER_NAME=${LAUNCHER_NAME}" >> "$GITHUB_ENV" - echo "TOKEN_NAME=${TOKEN_NAME}" >> "$GITHUB_ENV" - kubectl create secret generic \ - ${TOKEN_NAME} \ - --from-file=.dockerconfigjson=$HOME/.docker/config.json \ - --type=kubernetes.io/dockerconfigjson - - name: Configure Kubernetes job - run: 
| - export WORKER_NAME="${JOB_NAME}-worker" - yq -i '.metadata.name = strenv(JOB_NAME) - | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].image = strenv(BASE_IMAGE) - | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].name = strenv(LAUNCHER_NAME) - | .spec.mpiReplicaSpecs.Launcher.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) - | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].command[3] = strenv(TEST_NAME) - | .spec.mpiReplicaSpecs.Worker.template.spec.containers[].image = strenv(BASE_IMAGE) - | .spec.mpiReplicaSpecs.Worker.template.spec.containers[].name = strenv(WORKER_NAME) - | .spec.mpiReplicaSpecs.Worker.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)' \ - .github/eks-workflow-files/mpi-nccl-test.yml - git diff .github/eks-workflow-files/mpi-nccl-test.yml - - name: Submit Kubernetes job - run: kubectl apply -f .github/eks-workflow-files/mpi-nccl-test.yml - - name: Wait for Kubernetes job to start - # Note that this is *not* using JOB_NAME - run: | - # Launcher job is created eagerly, but suspended. Kueue un-suspends it when - # resources are available, but that is where there can be a long wait if the - # cluster is busy executing other jobs. 
- kubectl wait --for=create job/${LAUNCHER_NAME} - kubectl wait --for=jsonpath='{.spec.suspend}=false' job/${LAUNCHER_NAME} --timeout=3600s - - name: Stream Kubernetes job output - # Note that this is *not* JOB_NAME - run: | - # Streaming logs will fail if the container/pod is still pending - while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${LAUNCHER_NAME} --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do - sleep 1 - done - # TODO: --all-containers=true --all-pods=true could make sense here, but it - # prefixes lines with a rather verbose tag - kubectl logs --follow job/${LAUNCHER_NAME} - - name: Retrieve Kubernetes job status - shell: bash -exo pipefail {0} - run: | - while readarray -d : -t status < <(kubectl get job/${LAUNCHER_NAME} -o 'jsonpath={.status.failed}:{.status.succeeded}'); do - failure=${status[0]:-0} - success=${status[1]:-0} - total=$((failure+success)) - if [[ ${total} < 1 ]]; then - sleep 1 - elif [[ ${total} == 1 ]]; then - break - else - # Shouldn't happen, maybe a sign the job being monitored does not have a - # single launcher pod? - exit 255 - fi - done - exit ${failure} - # Provide more debug output in case of failure; note that some kinds of launch - # failure do not produce any log output. 
- - name: Debug failed Kubernetes job - if: failure() - run: | - # Provide better debug in case of launch failures that will not produce log output - pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${LAUNCHER_NAME} -o name) - if [[ -n "${pods}" ]]; then - kubectl describe ${pods} - fi - # Clean up in case of errors as well as success - - name: Delete Kubernetes job - if: always() - run: kubectl delete -f .github/eks-workflow-files/mpi-nccl-test.yml - - name: Delete GitHub Container Registry token - if: always() - run: kubectl delete secret ${TOKEN_NAME} diff --git a/.github/workflows/nccl-k8s.yaml b/.github/workflows/nccl-k8s.yaml index d51c12382..65dcc660c 100644 --- a/.github/workflows/nccl-k8s.yaml +++ b/.github/workflows/nccl-k8s.yaml @@ -31,8 +31,107 @@ permissions: packages: write # to upload container jobs: - nccl-tests: - uses: ./.github/workflows/_test_nccl.yaml + build-mpi-operator-compatible-base: + uses: ./.github/workflows/_build.yaml with: - CONTAINER: ${{ inputs.CONTAINER || 'nvcr.io/nvidia/cuda-dl-base:24.12-cuda12.6-devel-ubuntu24.04' }} + ARCHITECTURE: amd64 + ARTIFACT_NAME: artifact-mpi-operator-compatible-base-build + BADGE_FILENAME: badge-mpi-operator-compatible-base-build + BUILD_DATE: 0000-00-00 # Not important; this image is never published + BASE_IMAGE: ${{ inputs.CONTAINER || 'nvcr.io/nvidia/cuda-dl-base:24.12-cuda12.6-devel-ubuntu24.04' }} + CONTAINER_NAME: mpi-operator-compatible-base + DOCKERFILE: .github/container/Dockerfile.mpi-operator-compatible-base + RUNNER_SIZE: small secrets: inherit + + nccl-tests: + needs: build-mpi-operator-compatible-base + runs-on: eks + strategy: + matrix: + test: [all_gather_perf_mpi, all_reduce_perf_mpi, broadcast_perf_mpi, reduce_scatter_perf_mpi] + env: + BASE_IMAGE: ${{ needs.build-mpi-operator-compatible-base.outputs.DOCKER_TAG_FINAL }} + TEST_NAME: ${{ matrix.test }} + JOB_NAME: "nccl-test-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.test }}" + LAUNCHER_NAME: "${{ 
env.JOB_NAME }}-launcher" + TOKEN_NAME: "${{ env.JOB_NAME }}-token" + + + steps: + - name: Check out the repository + uses: actions/checkout@v4 + + - name: GHCR login and store K8s secret + uses: ./.github/actions/ghcr-login + with: + docker-username: ${{ github.repository_owner }} + docker-password: ${{ secrets.GITHUB_TOKEN }} + token-name: ${{ env.TOKEN_NAME }} + - name: Configure Kubernetes job + shell: bash + run: | + export JOB_NAME="${{ env.JOB_NAME }}" + export LAUNCHER_NAME="${{ env.LAUNCHER_NAME }}" + export TOKEN_NAME="${{ env.TOKEN_NAME }}" + export TEST_NAME="${{ env.TEST_NAME }}" + export WORKER_NAME="${JOB_NAME}-worker" + + # Use yq to set our fields in-place + yq -i '.metadata.name = strenv(JOB_NAME) + | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].image = strenv(BASE_IMAGE) + | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].name = strenv(LAUNCHER_NAME) + | .spec.mpiReplicaSpecs.Launcher.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) + | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].command[3] = strenv(TEST_NAME) + | .spec.mpiReplicaSpecs.Worker.template.spec.containers[].image = strenv(BASE_IMAGE) + | .spec.mpiReplicaSpecs.Worker.template.spec.containers[].name = strenv(WORKER_NAME) + | .spec.mpiReplicaSpecs.Worker.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)' \ + .github/eks-workflow-files/mpi-nccl-test.yml + + # (Optional) Show diff for debugging + git diff .github/eks-workflow-files/mpi-nccl-test.yml + + - name: Submit & stream K8s job + uses: ./.github/actions/submit-k8s-job + with: + job-config-file: .github/eks-workflow-files/mpi-nccl-test.yml + job-name: ${{ env.LAUNCHER_NAME }} + - name: Retrieve Kubernetes job status + shell: bash -exo pipefail + run: | + LAUNCHER_NAME="${{ env.LAUNCHER_NAME }}" + while readarray -d : -t status < <(kubectl get job/${LAUNCHER_NAME} -o 'jsonpath={.status.failed}:{.status.succeeded}'); do + failure=${status[0]:-0} + success=${status[1]:-0} + 
total=$((failure+success)) + if [[ ${total} < 1 ]]; then + sleep 1 + elif [[ ${total} == 1 ]]; then + break + else + # If total > 1, that suggests a mismatch that can occur if there's more than one launcher pod + exit 255 + fi + done + exit ${failure} + - name: Debug failed Kubernetes job + if: failure() + shell: bash + run: | + LAUNCHER_NAME="${{ env.LAUNCHER_NAME }}" + pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${LAUNCHER_NAME} -o name) + if [[ -n "${pods}" ]]; then + kubectl describe ${pods} + fi + - name: Delete Kubernetes job + if: always() + uses: ./.github/actions/delete-k8s-job + with: + job-name: ${{ env.LAUNCHER_NAME }} + + job-name: ${{ env.POSTPROCESS_JOB_NAME }} + - name: Delete GitHub Container Registry token + uses: ./.github/actions/delete-ghcr-token + if: always() + with: + token-name: ${{ env.TOKEN_NAME }} \ No newline at end of file From 9516183231a94a2272e366e5bbad806a0d4bbce9 Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 17 Feb 2025 16:50:34 +0000 Subject: [PATCH 34/89] fix errors --- .github/actions/submit-k8s-job/action.yml | 2 +- .github/workflows/nccl-k8s.yaml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/actions/submit-k8s-job/action.yml b/.github/actions/submit-k8s-job/action.yml index aa73cf2e2..49ddad748 100644 --- a/.github/actions/submit-k8s-job/action.yml +++ b/.github/actions/submit-k8s-job/action.yml @@ -21,7 +21,7 @@ runs: shell: bash run: | # wait for the job to be created - kubectl wait --for=create job/${{ inputs.job-name }} --timeout=60 + kubectl wait --for=create job/${{ inputs.job-name }} --timeout=60s # wait for the 'spec.suspend' field to become false. 
Necessary for kueue kubectl wait --for=jsonpath='{.spec.suspend}=false' job/${{ inputs.job-name }} --timeout=3600s diff --git a/.github/workflows/nccl-k8s.yaml b/.github/workflows/nccl-k8s.yaml index 65dcc660c..805aba9e8 100644 --- a/.github/workflows/nccl-k8s.yaml +++ b/.github/workflows/nccl-k8s.yaml @@ -54,8 +54,8 @@ jobs: BASE_IMAGE: ${{ needs.build-mpi-operator-compatible-base.outputs.DOCKER_TAG_FINAL }} TEST_NAME: ${{ matrix.test }} JOB_NAME: "nccl-test-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.test }}" - LAUNCHER_NAME: "${{ env.JOB_NAME }}-launcher" - TOKEN_NAME: "${{ env.JOB_NAME }}-token" + LAUNCHER_NAME: "nccl-test-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.test }}-launcher" + TOKEN_NAME: "nccl-test-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.test }}-token" steps: From cbee8bbc6909b1598aee4ab8bb5404a8f888ce52 Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 17 Feb 2025 17:19:01 +0000 Subject: [PATCH 35/89] fix typo --- .github/workflows/_ci.yaml | 2 +- .github/workflows/nccl-k8s.yaml | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index cfe15607c..74a840cb0 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -503,7 +503,7 @@ jobs: .github/eks-workflow-files/post-process-job.yml git diff .github/eks-workflow-files/post-process-job.yml - name: Submit post process k8s job - uses: ./.github/actions/submit-k8s/job + uses: ./.github/actions/submit-k8s-job with: job-config-file: .github/eks-workflow-files/post-process-job.yml job-name: ${{ env.POSTPROCESS_JOB_NAME }} diff --git a/.github/workflows/nccl-k8s.yaml b/.github/workflows/nccl-k8s.yaml index 805aba9e8..176217234 100644 --- a/.github/workflows/nccl-k8s.yaml +++ b/.github/workflows/nccl-k8s.yaml @@ -128,8 +128,6 @@ jobs: uses: ./.github/actions/delete-k8s-job with: job-name: ${{ env.LAUNCHER_NAME }} - - job-name: ${{ env.POSTPROCESS_JOB_NAME }} - name: Delete 
GitHub Container Registry token uses: ./.github/actions/delete-ghcr-token if: always() From 8aed044a4eb29a7fe70484b591672371bef8b89c Mon Sep 17 00:00:00 2001 From: Steboss Date: Tue, 18 Feb 2025 10:29:41 +0000 Subject: [PATCH 36/89] make a test with 5 files --- .github/container/test-axlearn.sh | 2 +- .../axlearn/axlearn-job.yml | 5 ++-- .github/workflows/nccl-k8s.yaml | 29 +++++++++++-------- 3 files changed, 21 insertions(+), 15 deletions(-) diff --git a/.github/container/test-axlearn.sh b/.github/container/test-axlearn.sh index 088c955df..5d256706e 100755 --- a/.github/container/test-axlearn.sh +++ b/.github/container/test-axlearn.sh @@ -175,7 +175,7 @@ passed=0 SUMMARY_FILE="${OUTPUT_DIRECTORY}/summary.txt" -for test_file in "${final_test_files[@]}"; do +for test_file in "${final_test_files[@]:0:10}"; do echo "Running: ${test_file}" log_file_name=$(echo "${test_file%.py}" | sed 's/\//__/g').log log_file="${LOG_DIRECTORY}/${log_file_name}" diff --git a/.github/eks-workflow-files/axlearn/axlearn-job.yml b/.github/eks-workflow-files/axlearn/axlearn-job.yml index 56183f35a..7e7fe0f15 100644 --- a/.github/eks-workflow-files/axlearn/axlearn-job.yml +++ b/.github/eks-workflow-files/axlearn/axlearn-job.yml @@ -15,7 +15,7 @@ spec: image: PLACEHOLDER command: - bash - - -exo + - -xo - pipefail - -c - | @@ -29,7 +29,8 @@ spec: --k8s # Wait a moment to ensure logs are flushed - sync + sync + resources: limits: nvidia.com/gpu: 8 diff --git a/.github/workflows/nccl-k8s.yaml b/.github/workflows/nccl-k8s.yaml index 176217234..816979355 100644 --- a/.github/workflows/nccl-k8s.yaml +++ b/.github/workflows/nccl-k8s.yaml @@ -53,27 +53,32 @@ jobs: env: BASE_IMAGE: ${{ needs.build-mpi-operator-compatible-base.outputs.DOCKER_TAG_FINAL }} TEST_NAME: ${{ matrix.test }} - JOB_NAME: "nccl-test-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.test }}" - LAUNCHER_NAME: "nccl-test-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.test }}-launcher" - TOKEN_NAME: 
"nccl-test-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.test }}-token" steps: - name: Check out the repository uses: actions/checkout@v4 + - name: Modify variables + id: var + shell: bash + run: | + echo "JOB_NAME=${{ env.JOB_NAME}//_/-}" >> $GITHUB_OUTPUT + echo "LAUNCHER_NAME=nccl-test-${{ env.JOB_NAME}//_/-}-launcher" >> $GITHUB_OUTPUT + echo "TOKEN_NAME=nccl-test-${{ env.JOB_NAME}//_/-}-token" >> $GITHUB_OUTPUT + - name: GHCR login and store K8s secret uses: ./.github/actions/ghcr-login with: docker-username: ${{ github.repository_owner }} docker-password: ${{ secrets.GITHUB_TOKEN }} - token-name: ${{ env.TOKEN_NAME }} + token-name: ${{ steps.var.TOKEN_NAME }} - name: Configure Kubernetes job shell: bash run: | - export JOB_NAME="${{ env.JOB_NAME }}" - export LAUNCHER_NAME="${{ env.LAUNCHER_NAME }}" - export TOKEN_NAME="${{ env.TOKEN_NAME }}" + export JOB_NAME="${{ steps.var.JOB_NAME }}" + export LAUNCHER_NAME="${{ steps.var.LAUNCHER_NAME }}" + export TOKEN_NAME="${{ steps.var.TOKEN_NAME }}" export TEST_NAME="${{ env.TEST_NAME }}" export WORKER_NAME="${JOB_NAME}-worker" @@ -95,11 +100,11 @@ jobs: uses: ./.github/actions/submit-k8s-job with: job-config-file: .github/eks-workflow-files/mpi-nccl-test.yml - job-name: ${{ env.LAUNCHER_NAME }} + job-name: ${{ steps.var.LAUNCHER_NAME }} - name: Retrieve Kubernetes job status shell: bash -exo pipefail run: | - LAUNCHER_NAME="${{ env.LAUNCHER_NAME }}" + LAUNCHER_NAME="${{ steps.var.LAUNCHER_NAME }}" while readarray -d : -t status < <(kubectl get job/${LAUNCHER_NAME} -o 'jsonpath={.status.failed}:{.status.succeeded}'); do failure=${status[0]:-0} success=${status[1]:-0} @@ -118,7 +123,7 @@ jobs: if: failure() shell: bash run: | - LAUNCHER_NAME="${{ env.LAUNCHER_NAME }}" + LAUNCHER_NAME="${{ steps.var.LAUNCHER_NAME }}" pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${LAUNCHER_NAME} -o name) if [[ -n "${pods}" ]]; then kubectl describe ${pods} @@ -127,9 +132,9 @@ jobs: if: always() uses: 
./.github/actions/delete-k8s-job with: - job-name: ${{ env.LAUNCHER_NAME }} + job-name: ${{ steps.var.LAUNCHER_NAME }} - name: Delete GitHub Container Registry token uses: ./.github/actions/delete-ghcr-token if: always() with: - token-name: ${{ env.TOKEN_NAME }} \ No newline at end of file + token-name: ${{ steps.var.TOKEN_NAME }} \ No newline at end of file From 91a2bf7a2d38ee3536830a5f9ba13f7a062606c1 Mon Sep 17 00:00:00 2001 From: Steboss Date: Tue, 18 Feb 2025 10:32:28 +0000 Subject: [PATCH 37/89] fix conflicts --- .github/workflows/_ci.yaml | 610 ++++++++++++++++++------------------- 1 file changed, 305 insertions(+), 305 deletions(-) diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 74a840cb0..1164e2a3d 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -66,142 +66,142 @@ jobs: URLREF_TRANSFORMER_ENGINE=${{ fromJson(inputs.SOURCE_URLREFS).TRANSFORMER_ENGINE }} secrets: inherit - # build-triton: - # needs: build-jax - # if: inputs.ARCHITECTURE == 'amd64' # Triton does not seem to support arm64 - # uses: ./.github/workflows/_build.yaml - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # ARTIFACT_NAME: artifact-triton-build - # BADGE_FILENAME: badge-triton-build - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - # CONTAINER_NAME: triton - # DOCKERFILE: .github/container/Dockerfile.triton - # RUNNER_SIZE: large - # EXTRA_BUILD_ARGS: | - # URLREF_JAX_TRITON=${{ fromJson(inputs.SOURCE_URLREFS).JAX_TRITON }} - # secrets: inherit + build-triton: + needs: build-jax + if: inputs.ARCHITECTURE == 'amd64' # Triton does not seem to support arm64 + uses: ./.github/workflows/_build.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: artifact-triton-build + BADGE_FILENAME: badge-triton-build + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + CONTAINER_NAME: triton + DOCKERFILE: 
.github/container/Dockerfile.triton + RUNNER_SIZE: large + EXTRA_BUILD_ARGS: | + URLREF_JAX_TRITON=${{ fromJson(inputs.SOURCE_URLREFS).JAX_TRITON }} + secrets: inherit - # build-equinox: - # needs: build-jax - # uses: ./.github/workflows/_build.yaml - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # ARTIFACT_NAME: artifact-equinox-build - # BADGE_FILENAME: badge-equinox-build - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - # CONTAINER_NAME: equinox - # DOCKERFILE: .github/container/Dockerfile.equinox - # EXTRA_BUILD_ARGS: | - # URLREF_EQUINOX=${{ fromJson(inputs.SOURCE_URLREFS).EQUINOX }} - # secrets: inherit + build-equinox: + needs: build-jax + uses: ./.github/workflows/_build.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: artifact-equinox-build + BADGE_FILENAME: badge-equinox-build + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + CONTAINER_NAME: equinox + DOCKERFILE: .github/container/Dockerfile.equinox + EXTRA_BUILD_ARGS: | + URLREF_EQUINOX=${{ fromJson(inputs.SOURCE_URLREFS).EQUINOX }} + secrets: inherit - # build-maxtext: - # needs: build-jax - # uses: ./.github/workflows/_build.yaml - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # ARTIFACT_NAME: artifact-maxtext-build - # BADGE_FILENAME: badge-maxtext-build - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - # CONTAINER_NAME: maxtext - # DOCKERFILE: .github/container/Dockerfile.maxtext - # EXTRA_BUILD_ARGS: | - # URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }} - # secrets: inherit + build-maxtext: + needs: build-jax + uses: ./.github/workflows/_build.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: artifact-maxtext-build + BADGE_FILENAME: badge-maxtext-build + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT 
}} + CONTAINER_NAME: maxtext + DOCKERFILE: .github/container/Dockerfile.maxtext + EXTRA_BUILD_ARGS: | + URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }} + secrets: inherit - # build-levanter: - # needs: [build-jax] - # uses: ./.github/workflows/_build.yaml - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # ARTIFACT_NAME: "artifact-levanter-build" - # BADGE_FILENAME: "badge-levanter-build" - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - # CONTAINER_NAME: levanter - # DOCKERFILE: .github/container/Dockerfile.levanter - # EXTRA_BUILD_ARGS: | - # URLREF_LEVANTER=${{ fromJson(inputs.SOURCE_URLREFS).LEVANTER }} - # URLREF_HALIAX=${{ fromJson(inputs.SOURCE_URLREFS).HALIAX }} - # secrets: inherit + build-levanter: + needs: [build-jax] + uses: ./.github/workflows/_build.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: "artifact-levanter-build" + BADGE_FILENAME: "badge-levanter-build" + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + CONTAINER_NAME: levanter + DOCKERFILE: .github/container/Dockerfile.levanter + EXTRA_BUILD_ARGS: | + URLREF_LEVANTER=${{ fromJson(inputs.SOURCE_URLREFS).LEVANTER }} + URLREF_HALIAX=${{ fromJson(inputs.SOURCE_URLREFS).HALIAX }} + secrets: inherit - # build-upstream-t5x: - # needs: build-jax - # uses: ./.github/workflows/_build.yaml - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # ARTIFACT_NAME: "artifact-t5x-build" - # BADGE_FILENAME: "badge-t5x-build" - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - # CONTAINER_NAME: upstream-t5x - # DOCKERFILE: .github/container/Dockerfile.t5x - # EXTRA_BUILD_ARGS: | - # URLREF_T5X=${{ fromJson(inputs.SOURCE_URLREFS).T5X }} - # URLREF_AIRIO=${{ fromJson(inputs.SOURCE_URLREFS).AIRIO }} - # secrets: inherit + build-upstream-t5x: + needs: build-jax + uses: ./.github/workflows/_build.yaml 
+ with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: "artifact-t5x-build" + BADGE_FILENAME: "badge-t5x-build" + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + CONTAINER_NAME: upstream-t5x + DOCKERFILE: .github/container/Dockerfile.t5x + EXTRA_BUILD_ARGS: | + URLREF_T5X=${{ fromJson(inputs.SOURCE_URLREFS).T5X }} + URLREF_AIRIO=${{ fromJson(inputs.SOURCE_URLREFS).AIRIO }} + secrets: inherit - # build-upstream-pax: - # needs: build-jax - # uses: ./.github/workflows/_build.yaml - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # ARTIFACT_NAME: artifact-pax-build - # BADGE_FILENAME: badge-pax-build - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - # CONTAINER_NAME: upstream-pax - # DOCKERFILE: .github/container/Dockerfile.pax - # EXTRA_BUILD_ARGS: | - # URLREF_PAXML=${{ fromJson(inputs.SOURCE_URLREFS).PAXML }} - # URLREF_PRAXIS=${{ fromJson(inputs.SOURCE_URLREFS).PRAXIS }} - # URLREF_LINGVO=${{ fromJson(inputs.SOURCE_URLREFS).LINGVO }} - # secrets: inherit + build-upstream-pax: + needs: build-jax + uses: ./.github/workflows/_build.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: artifact-pax-build + BADGE_FILENAME: badge-pax-build + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + CONTAINER_NAME: upstream-pax + DOCKERFILE: .github/container/Dockerfile.pax + EXTRA_BUILD_ARGS: | + URLREF_PAXML=${{ fromJson(inputs.SOURCE_URLREFS).PAXML }} + URLREF_PRAXIS=${{ fromJson(inputs.SOURCE_URLREFS).PRAXIS }} + URLREF_LINGVO=${{ fromJson(inputs.SOURCE_URLREFS).LINGVO }} + secrets: inherit - # build-rosetta-t5x: - # needs: build-upstream-t5x - # uses: ./.github/workflows/_build_rosetta.yaml - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }} - # 
BASE_LIBRARY: t5x - # secrets: inherit + build-rosetta-t5x: + needs: build-upstream-t5x + uses: ./.github/workflows/_build_rosetta.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }} + BASE_LIBRARY: t5x + secrets: inherit - # build-rosetta-pax: - # needs: build-upstream-pax - # uses: ./.github/workflows/_build_rosetta.yaml - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_MEALKIT }} - # BASE_LIBRARY: pax - # secrets: inherit + build-rosetta-pax: + needs: build-upstream-pax + uses: ./.github/workflows/_build_rosetta.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_MEALKIT }} + BASE_LIBRARY: pax + secrets: inherit - # build-gemma: - # needs: build-jax - # uses: ./.github/workflows/_build.yaml - # if: inputs.ARCHITECTURE == 'amd64' # build only amd64 - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # ARTIFACT_NAME: artifact-gemma-build - # BADGE_FILENAME: badge-gemma-build - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - # CONTAINER_NAME: gemma - # DOCKERFILE: rosetta/Dockerfile.gemma - # DOCKER_CONTEXT: . 
- # EXTRA_BUILD_ARGS: | - # URLREF_GEMMA=${{ fromJson(inputs.SOURCE_URLREFS).GEMMA }} - # URLREF_BIG_VISION=${{ fromJson(inputs.SOURCE_URLREFS).BIG_VISION }} - # URLREF_COMMON_LOOP_UTILS=${{ fromJson(inputs.SOURCE_URLREFS).COMMON_LOOP_UTILS }} - # URLREF_FLAXFORMER=${{ fromJson(inputs.SOURCE_URLREFS).FLAXFORMER }} - # URLREF_PANOPTICAPI=${{ fromJson(inputs.SOURCE_URLREFS).PANOPTICAPI }} - # secrets: inherit + build-gemma: + needs: build-jax + uses: ./.github/workflows/_build.yaml + if: inputs.ARCHITECTURE == 'amd64' # build only amd64 + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: artifact-gemma-build + BADGE_FILENAME: badge-gemma-build + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + CONTAINER_NAME: gemma + DOCKERFILE: rosetta/Dockerfile.gemma + DOCKER_CONTEXT: . + EXTRA_BUILD_ARGS: | + URLREF_GEMMA=${{ fromJson(inputs.SOURCE_URLREFS).GEMMA }} + URLREF_BIG_VISION=${{ fromJson(inputs.SOURCE_URLREFS).BIG_VISION }} + URLREF_COMMON_LOOP_UTILS=${{ fromJson(inputs.SOURCE_URLREFS).COMMON_LOOP_UTILS }} + URLREF_FLAXFORMER=${{ fromJson(inputs.SOURCE_URLREFS).FLAXFORMER }} + URLREF_PANOPTICAPI=${{ fromJson(inputs.SOURCE_URLREFS).PANOPTICAPI }} + secrets: inherit build-axlearn: needs: build-jax @@ -222,15 +222,15 @@ jobs: needs: - build-base - build-jax - # - build-triton - # - build-equinox - # - build-maxtext - # - build-levanter - # - build-upstream-t5x - # - build-upstream-pax - # - build-rosetta-t5x - # - build-rosetta-pax - # - build-gemma + - build-triton + - build-equinox + - build-maxtext + - build-levanter + - build-upstream-t5x + - build-upstream-pax + - build-rosetta-t5x + - build-rosetta-pax + - build-gemma - build-axlearn outputs: TAGS: ${{ steps.collect-tags.outputs.TAGS }} @@ -242,26 +242,26 @@ jobs: [\ {"flavor": "base", "stage": "final", "priority": 800, "tag": "${{ needs.build-base.outputs.DOCKER_TAG }}"},\ {"flavor": "jax", "stage": "final", "priority": 1000, "tag": "${{ 
needs.build-jax.outputs.DOCKER_TAG_FINAL }}"},\ - # {"flavor": "triton", "stage": "final", "priority": 900, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }}"},\ - # {"flavor": "equinox", "stage": "final", "priority": 900, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }}"},\ - # {"flavor": "maxtext", "stage": "final", "priority": 900, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}"},\ - # {"flavor": "levanter", "stage": "final", "priority": 900, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }}"},\ - # {"flavor": "upstream-t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }}"},\ - # {"flavor": "upstream-pax", "stage": "final", "priority": 900, "tag": "${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }}"},\ - # {"flavor": "t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }}"},\ - # {"flavor": "pax", "stage": "final", "priority": 900, "tag": "${{ needs.build-rosetta-pax.outputs.DOCKER_TAG_FINAL }}"},\ - # {"flavor": "gemma", "stage": "final", "priority": 900, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "triton", "stage": "final", "priority": 900, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "equinox", "stage": "final", "priority": 900, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "maxtext", "stage": "final", "priority": 900, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "levanter", "stage": "final", "priority": 900, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "upstream-t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "upstream-pax", "stage": "final", "priority": 900, "tag": "${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "t5x", "stage": "final", "priority": 
900, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "pax", "stage": "final", "priority": 900, "tag": "${{ needs.build-rosetta-pax.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "gemma", "stage": "final", "priority": 900, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "axlearn", "stage": "final", "priority": 900, "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "jax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}"},\ - # {"flavor": "triton", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_MEALKIT }}"},\ - # {"flavor": "equinox", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_MEALKIT }}"},\ - # {"flavor": "maxtext", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_MEALKIT }}"},\ - # {"flavor": "levanter", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_MEALKIT }}"},\ - # {"flavor": "upstream-t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ - # {"flavor": "upstream-pax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-upstream-pax.outputs.DOCKER_TAG_MEALKIT }}"},\ - # {"flavor": "t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ - # {"flavor": "pax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-pax.outputs.DOCKER_TAG_MEALKIT }}"},\ - # {"flavor": "gemma", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "triton", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "equinox", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": 
"maxtext", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "levanter", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "upstream-t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "upstream-pax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-upstream-pax.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "pax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-pax.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "gemma", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_MEALKIT }}"},\ {"flavor": "axlearn", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_MEALKIT }}"},\ {}\ @@ -271,27 +271,27 @@ jobs: echo "TAGS=${TAGS}" >> $GITHUB_OUTPUT - # test-distribution: - # runs-on: ubuntu-22.04 - # strategy: - # matrix: - # TEST_SCRIPT: - # - extra-only-distribution.sh - # - mirror-only-distribution.sh - # - upstream-only-distribution.sh - # - local-patch-distribution.sh - # fail-fast: false - # steps: - # - name: Print environment variables - # run: env - # - name: Set git login for tests - # run: | - # git config --global user.email "jax@nvidia.com" - # git config --global user.name "JAX-Toolbox CI" - # - name: Check out the repository under ${GITHUB_WORKSPACE} - # uses: actions/checkout@v4 - # - name: Run integration test ${{ matrix.TEST_SCRIPT }} - # run: bash rosetta/tests/${{ matrix.TEST_SCRIPT }} + test-distribution: + runs-on: ubuntu-22.04 + strategy: + matrix: + TEST_SCRIPT: + - extra-only-distribution.sh + - mirror-only-distribution.sh + - upstream-only-distribution.sh + - local-patch-distribution.sh + fail-fast: false + steps: + 
- name: Print environment variables + run: env + - name: Set git login for tests + run: | + git config --global user.email "jax@nvidia.com" + git config --global user.name "JAX-Toolbox CI" + - name: Check out the repository under ${GITHUB_WORKSPACE} + uses: actions/checkout@v4 + - name: Run integration test ${{ matrix.TEST_SCRIPT }} + run: bash rosetta/tests/${{ matrix.TEST_SCRIPT }} test-jax: needs: build-jax @@ -325,136 +325,136 @@ jobs: test-gpu.log secrets: inherit - # test-nsys-jax: - # needs: build-jax - # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # uses: ./.github/workflows/_test_unit.yaml - # with: - # TEST_NAME: nsys-jax - # EXECUTE: | - # set -o pipefail - # num_tests=0 - # num_failures=0 - # # Run the pytest-driven tests; failure is explicitly handled below so set +e to - # # avoid an early abort here. - # set +e - # docker run -i --shm-size=1g --gpus all \ - # -v $PWD:/opt/output \ - # ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - # bash <<"EOF" |& tee test-nsys-jax.log - # # nsys-jax is already installed, this is just adding the test dependencies - # pip install pytest-reportlog nsys-jax[test] - # # abuse knowledge that nsys-jax is installed editable, so the tests exist - # test_path=$(python -c 'import importlib.resources; print(importlib.resources.files("nsys_jax").joinpath("..", "tests").resolve())') - # pytest --report-log=/opt/output/pytest-report.jsonl "${test_path}" - # EOF - # set -e - # GPUS_PER_NODE=$(nvidia-smi -L | grep -c '^GPU') - # for mode in 1-process 2-process process-per-gpu; do - # DOCKER="docker run --shm-size=1g --gpus all --env XLA_FLAGS=--xla_gpu_enable_command_buffer= --env XLA_PYTHON_CLIENT_MEM_FRACTION=0.9 -v ${PWD}:/opt/output ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}" - # if [[ "${mode}" == "1-process" ]]; then - # PROCESS_COUNT=1 - # ARGS="" - # elif [[ "${mode}" == "2-process" ]]; then - # # Use two processes with GPUS_PER_NODE/2 GPUs per process in the hope that - # # this will flush out more 
bugs than process-per-node or process-per-GPU. - # PROCESS_COUNT=2 - # ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process $((GPUS_PER_NODE/2)) --distributed" - # else - # PROCESS_COUNT=${GPUS_PER_NODE} - # ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process 1 --distributed" - # fi - # for collection in full partial; do - # NSYS_JAX="nsys-jax" - # if [[ "${mode}" == "1-process" ]]; then - # # We will not run nsys-jax-combine, so run analyses eagerly - # NSYS_JAX+=" --nsys-jax-analysis communication --nsys-jax-analysis summary" - # fi - # NSYS_JAX+=" --output=/opt/output/${mode}-${collection}-execution-%q{RANK}" - # if [[ "${collection}" == "partial" ]]; then - # NSYS_JAX+=" --capture-range=cudaProfilerApi --capture-range-end=stop" - # # nvbug/4801401 - # NSYS_JAX+=" --sample=none" - # fi - # set +e - # ${DOCKER} parallel-launch RANK ${PROCESS_COUNT} ${NSYS_JAX} \ - # -- jax-nccl-test ${ARGS} |& tee ${mode}-${collection}-execution.log - # num_failures=$((num_failures + ($? 
!= 0))) - # set -e - # num_tests=$((num_tests + 1)) - # done - # if [[ "${mode}" != "1-process" ]]; then - # # Run nsys-jax-combine - # NSYS_JAX_COMBINE="nsys-jax-combine --analysis communication --analysis summary --output=/opt/output/${mode}-${collection}-execution.zip" - # for (( i=0; i> $GITHUB_ENV - # echo "NSYS_JAX_FAIL_COUNT=${num_failures}" >> $GITHUB_ENV - # exit $num_failures - # STATISTICS_SCRIPT: | - # summary_line=$(tail -n1 test-nsys-jax.log) - # num_errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - # passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) - # failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) - # total_tests=$(( NSYS_JAX_TEST_COUNT + passed_tests + failed_tests )) - # num_passed=$(( passed_tests + NSYS_JAX_TEST_COUNT - NSYS_JAX_FAIL_COUNT )) - # num_failed=$(( failed_tests + NSYS_JAX_FAIL_COUNT )) - # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - # echo "ERRORS=${num_errors}" >> $GITHUB_OUTPUT - # echo "PASSED_TESTS=${num_passed}" >> $GITHUB_OUTPUT - # echo "FAILED_TESTS=${num_failed}" >> $GITHUB_OUTPUT - # ARTIFACTS: | - # # pytest-driven part - # test-nsys-jax.log - # pytest-report.jsonl - # # nsys-jax logfiles - # *process-*-execution.log - # # nsys-jax output for the case that doesn't use nsys-jax-combine - # 1-process-*-execution-0.zip - # # nsys-jax-combine output/logfiles - # *process*-*-execution.zip - # *-execution-combine.log - # secrets: inherit + test-nsys-jax: + needs: build-jax + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + uses: ./.github/workflows/_test_unit.yaml + with: + TEST_NAME: nsys-jax + EXECUTE: | + set -o pipefail + num_tests=0 + num_failures=0 + # Run the pytest-driven tests; failure is explicitly handled below so set +e to + # avoid an early 
abort here. + set +e + docker run -i --shm-size=1g --gpus all \ + -v $PWD:/opt/output \ + ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ + bash <<"EOF" |& tee test-nsys-jax.log + # nsys-jax is already installed, this is just adding the test dependencies + pip install pytest-reportlog nsys-jax[test] + # abuse knowledge that nsys-jax is installed editable, so the tests exist + test_path=$(python -c 'import importlib.resources; print(importlib.resources.files("nsys_jax").joinpath("..", "tests").resolve())') + pytest --report-log=/opt/output/pytest-report.jsonl "${test_path}" + EOF + set -e + GPUS_PER_NODE=$(nvidia-smi -L | grep -c '^GPU') + for mode in 1-process 2-process process-per-gpu; do + DOCKER="docker run --shm-size=1g --gpus all --env XLA_FLAGS=--xla_gpu_enable_command_buffer= --env XLA_PYTHON_CLIENT_MEM_FRACTION=0.9 -v ${PWD}:/opt/output ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}" + if [[ "${mode}" == "1-process" ]]; then + PROCESS_COUNT=1 + ARGS="" + elif [[ "${mode}" == "2-process" ]]; then + # Use two processes with GPUS_PER_NODE/2 GPUs per process in the hope that + # this will flush out more bugs than process-per-node or process-per-GPU. 
+ PROCESS_COUNT=2 + ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process $((GPUS_PER_NODE/2)) --distributed" + else + PROCESS_COUNT=${GPUS_PER_NODE} + ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process 1 --distributed" + fi + for collection in full partial; do + NSYS_JAX="nsys-jax" + if [[ "${mode}" == "1-process" ]]; then + # We will not run nsys-jax-combine, so run analyses eagerly + NSYS_JAX+=" --nsys-jax-analysis communication --nsys-jax-analysis summary" + fi + NSYS_JAX+=" --output=/opt/output/${mode}-${collection}-execution-%q{RANK}" + if [[ "${collection}" == "partial" ]]; then + NSYS_JAX+=" --capture-range=cudaProfilerApi --capture-range-end=stop" + # nvbug/4801401 + NSYS_JAX+=" --sample=none" + fi + set +e + ${DOCKER} parallel-launch RANK ${PROCESS_COUNT} ${NSYS_JAX} \ + -- jax-nccl-test ${ARGS} |& tee ${mode}-${collection}-execution.log + num_failures=$((num_failures + ($? 
!= 0))) + set -e + num_tests=$((num_tests + 1)) + done + if [[ "${mode}" != "1-process" ]]; then + # Run nsys-jax-combine + NSYS_JAX_COMBINE="nsys-jax-combine --analysis communication --analysis summary --output=/opt/output/${mode}-${collection}-execution.zip" + for (( i=0; i> $GITHUB_ENV + echo "NSYS_JAX_FAIL_COUNT=${num_failures}" >> $GITHUB_ENV + exit $num_failures + STATISTICS_SCRIPT: | + summary_line=$(tail -n1 test-nsys-jax.log) + num_errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) + failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) + total_tests=$(( NSYS_JAX_TEST_COUNT + passed_tests + failed_tests )) + num_passed=$(( passed_tests + NSYS_JAX_TEST_COUNT - NSYS_JAX_FAIL_COUNT )) + num_failed=$(( failed_tests + NSYS_JAX_FAIL_COUNT )) + echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + echo "ERRORS=${num_errors}" >> $GITHUB_OUTPUT + echo "PASSED_TESTS=${num_passed}" >> $GITHUB_OUTPUT + echo "FAILED_TESTS=${num_failed}" >> $GITHUB_OUTPUT + ARTIFACTS: | + # pytest-driven part + test-nsys-jax.log + pytest-report.jsonl + # nsys-jax logfiles + *process-*-execution.log + # nsys-jax output for the case that doesn't use nsys-jax-combine + 1-process-*-execution-0.zip + # nsys-jax-combine output/logfiles + *process*-*-execution.zip + *-execution-combine.log + secrets: inherit - # test-nsys-jax generates several fresh .zip archive outputs by running nsys-jax with real GPU hardware; this test - # runs on a regular GitHub Actions runner and checks that offline post-processing works in an environment that does - # not already have nsys-jax installed - # test-nsys-jax-archive: - # needs: test-nsys-jax - # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # 
strategy: - # matrix: - # os: [ubuntu-22.04, ubuntu-24.04, macOS-latest] - # runs-on: ${{ matrix.os }} - # steps: - # - name: Download nsys-jax output .zip files - # uses: actions/download-artifact@v4 - # with: - # name: nsys-jax-unit-test-A100 - # - name: Extract archives and execute install scripts - # run: | - # pip install virtualenv # for install.sh - # for zip in $(ls *.zip); do - # ZIP="${PWD}/${zip}" - # pushd $(mktemp -d) - # unzip "${ZIP}" - # ls -l - # # TODO: verify this isn't needed, or make sure it isn't needed - # chmod 755 install.sh - # # Run the notebook with IPython, not Jupyter Lab, so it exits and prints something informative to stdout - # # Skip executing Jupyter lab - # NSYS_JAX_JUPYTER_EXECUTE_NOT_LAB=1 ./install.sh - # popd - # done + test-nsys-jax generates several fresh .zip archive outputs by running nsys-jax with real GPU hardware; this test + runs on a regular GitHub Actions runner and checks that offline post-processing works in an environment that does + not already have nsys-jax installed + test-nsys-jax-archive: + needs: test-nsys-jax + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + strategy: + matrix: + os: [ubuntu-22.04, ubuntu-24.04, macOS-latest] + runs-on: ${{ matrix.os }} + steps: + - name: Download nsys-jax output .zip files + uses: actions/download-artifact@v4 + with: + name: nsys-jax-unit-test-A100 + - name: Extract archives and execute install scripts + run: | + pip install virtualenv # for install.sh + for zip in $(ls *.zip); do + ZIP="${PWD}/${zip}" + pushd $(mktemp -d) + unzip "${ZIP}" + ls -l + # TODO: verify this isn't needed, or make sure it isn't needed + chmod 755 install.sh + # Run the notebook with IPython, not Jupyter Lab, so it exits and prints something informative to stdout + # Skip executing Jupyter lab + NSYS_JAX_JUPYTER_EXECUTE_NOT_LAB=1 ./install.sh + popd + done test-nsys-jax-eks: needs: build-jax From 1a97746a6b59a7c254854b86a960e49af98574c8 Mon Sep 17 00:00:00 2001 From: Steboss Date: Tue, 
18 Feb 2025 10:33:38 +0000 Subject: [PATCH 38/89] fix comments --- .github/workflows/_ci.yaml | 364 ++++++++++++++++++------------------- 1 file changed, 182 insertions(+), 182 deletions(-) diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 1164e2a3d..89397634d 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -425,9 +425,9 @@ jobs: *-execution-combine.log secrets: inherit - test-nsys-jax generates several fresh .zip archive outputs by running nsys-jax with real GPU hardware; this test - runs on a regular GitHub Actions runner and checks that offline post-processing works in an environment that does - not already have nsys-jax installed + #test-nsys-jax generates several fresh .zip archive outputs by running nsys-jax with real GPU hardware; this test + #runs on a regular GitHub Actions runner and checks that offline post-processing works in an environment that does + #not already have nsys-jax installed test-nsys-jax-archive: needs: test-nsys-jax if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a @@ -517,186 +517,186 @@ jobs: with: token-name: ${{ env.TOKEN_NAME }} - # test-equinox: - # needs: build-equinox - # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # uses: ./.github/workflows/_test_unit.yaml - # with: - # IMAGE: ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} - # TEST_NAME: equinox - # EXECUTE: | - # docker run --shm-size=1g --gpus all ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} \ - # bash -exc -o pipefail \ - # 'pushd /opt/equinox/tests && pip install -r requirements.txt && pytest .' 
| tee test-equinox.log - # STATISTICS_SCRIPT: | - # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - # failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') - # passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') - # total_tests=$((failed_tests + passed_tests)) - # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - # ARTIFACTS: | - # test-equinox.log - # secrets: inherit - - # test-te-multigpu: - # needs: build-upstream-pax - # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # uses: ./.github/workflows/_test_te.yaml - # with: - # TE_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} - # secrets: inherit - - # test-upstream-t5x: - # needs: build-upstream-t5x - # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # uses: ./.github/workflows/_test_upstream_t5x.yaml - # with: - # T5X_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }} - # secrets: inherit - - # test-rosetta-t5x: - # needs: build-rosetta-t5x - # if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 - # uses: ./.github/workflows/_test_t5x_rosetta.yaml - # with: - # T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }} - # secrets: inherit - - # test-triton: - # needs: build-triton - # if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 - # uses: ./.github/workflows/_test_unit.yaml - # with: - # TEST_NAME: triton - # EXECUTE: | - # docker run -i --shm-size=1g --gpus all --volume $PWD:/output \ - # ${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }} \ - # bash <<"EOF" |& tee test-triton.log - # # autotuner tests from jax-triton now hit a triton code path that uses utilities from pytorch; this relies on - # # actually having a CUDA 
backend for pytoch - # pip install --no-deps torch - # python /opt/jax-triton/tests/triton_call_test.py --xml_output_file /output/triton_test.xml - # EOF - # STATISTICS_SCRIPT: | - # curl -L -o yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg --print-architecture) && chmod 777 yq; - # total_tests=$(./yq '.testsuites."+@tests"' triton_test.xml) - # errors=$(./yq '.testsuites."+@errors"' triton_test.xml) - # failed_tests=$(./yq '.testsuites."+@failures"' triton_test.xml) - # passed_tests=$((total_tests - errors - failed_tests)) - # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - # ARTIFACTS: | - # test-triton.log - # secrets: inherit - - # test-levanter: - # needs: build-levanter - # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # uses: ./.github/workflows/_test_unit.yaml - # with: - # TEST_NAME: levanter - # EXECUTE: | - # docker run -i --gpus all --shm-size=1g \ - # ${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }} \ - # bash <<"EOF" |& tee test-levanter.log - # pip install flake8 pytest soundfile librosa - # PYTHONPATH=/opt/levanter/tests:$PYTHONPATH pytest /opt/levanter/tests -m "not entry and not slow and not ray" - # EOF - # STATISTICS_SCRIPT: | - # summary_line=$(tail -n1 test-levanter.log) - # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - # failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') - # passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') - # total_tests=$((failed_tests + passed_tests)) - # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - # echo "FAILED_TESTS=${failed_tests}" >> 
$GITHUB_OUTPUT - # ARTIFACTS: | - # test-levanter.log - # secrets: inherit - - # test-te: - # needs: build-upstream-pax - # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # uses: ./.github/workflows/_test_unit.yaml - # with: - # TEST_NAME: te - # EXECUTE: | - # docker run -i --gpus all --shm-size=1g -v $PWD:/log \ - # ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} \ - # bash <<"EOF" |& tee test-te.log - # pip install pytest-reportlog - # pytest --report-log=log/pytest-report.jsonl ${SRC_PATH_TRANSFORMER_ENGINE}/tests/jax - # EOF - # STATISTICS_SCRIPT: | - # summary_line=$(tail -n1 test-te.log) - # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - # passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) - # failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) - # total_tests=$((failed_tests + passed_tests)) - # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - # TIMEOUT_MINUTES: 120 - # ARTIFACTS: | - # test-te.log - # pytest-report.jsonl - # secrets: inherit - - # test-upstream-pax: - # needs: build-upstream-pax - # if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 - # uses: ./.github/workflows/_test_upstream_pax.yaml - # with: - # PAX_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} - # secrets: inherit - - # test-rosetta-pax: - # needs: build-rosetta-pax - # if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 - # uses: ./.github/workflows/_test_pax_rosetta.yaml - # with: - # PAX_IMAGE: ${{ needs.build-rosetta-pax.outputs.DOCKER_TAG_FINAL }} - # secrets: inherit + test-equinox: + needs: build-equinox + if: 
inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + uses: ./.github/workflows/_test_unit.yaml + with: + IMAGE: ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} + TEST_NAME: equinox + EXECUTE: | + docker run --shm-size=1g --gpus all ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} \ + bash -exc -o pipefail \ + 'pushd /opt/equinox/tests && pip install -r requirements.txt && pytest .' | tee test-equinox.log + STATISTICS_SCRIPT: | + errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') + passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') + total_tests=$((failed_tests + passed_tests)) + echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + ARTIFACTS: | + test-equinox.log + secrets: inherit + + test-te-multigpu: + needs: build-upstream-pax + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + uses: ./.github/workflows/_test_te.yaml + with: + TE_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} + secrets: inherit + + test-upstream-t5x: + needs: build-upstream-t5x + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + uses: ./.github/workflows/_test_upstream_t5x.yaml + with: + T5X_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }} + secrets: inherit + + test-rosetta-t5x: + needs: build-rosetta-t5x + if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 + uses: ./.github/workflows/_test_t5x_rosetta.yaml + with: + T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }} + secrets: inherit + + test-triton: + needs: build-triton + if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 + uses: ./.github/workflows/_test_unit.yaml + with: + TEST_NAME: triton + EXECUTE: 
| + docker run -i --shm-size=1g --gpus all --volume $PWD:/output \ + ${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }} \ + bash <<"EOF" |& tee test-triton.log + # autotuner tests from jax-triton now hit a triton code path that uses utilities from pytorch; this relies on + # actually having a CUDA backend for pytoch + pip install --no-deps torch + python /opt/jax-triton/tests/triton_call_test.py --xml_output_file /output/triton_test.xml + EOF + STATISTICS_SCRIPT: | + curl -L -o yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg --print-architecture) && chmod 777 yq; + total_tests=$(./yq '.testsuites."+@tests"' triton_test.xml) + errors=$(./yq '.testsuites."+@errors"' triton_test.xml) + failed_tests=$(./yq '.testsuites."+@failures"' triton_test.xml) + passed_tests=$((total_tests - errors - failed_tests)) + echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + ARTIFACTS: | + test-triton.log + secrets: inherit + + test-levanter: + needs: build-levanter + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + uses: ./.github/workflows/_test_unit.yaml + with: + TEST_NAME: levanter + EXECUTE: | + docker run -i --gpus all --shm-size=1g \ + ${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }} \ + bash <<"EOF" |& tee test-levanter.log + pip install flake8 pytest soundfile librosa + PYTHONPATH=/opt/levanter/tests:$PYTHONPATH pytest /opt/levanter/tests -m "not entry and not slow and not ray" + EOF + STATISTICS_SCRIPT: | + summary_line=$(tail -n1 test-levanter.log) + errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') + passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') + 
total_tests=$((failed_tests + passed_tests)) + echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + ARTIFACTS: | + test-levanter.log + secrets: inherit + + test-te: + needs: build-upstream-pax + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + uses: ./.github/workflows/_test_unit.yaml + with: + TEST_NAME: te + EXECUTE: | + docker run -i --gpus all --shm-size=1g -v $PWD:/log \ + ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} \ + bash <<"EOF" |& tee test-te.log + pip install pytest-reportlog + pytest --report-log=log/pytest-report.jsonl ${SRC_PATH_TRANSFORMER_ENGINE}/tests/jax + EOF + STATISTICS_SCRIPT: | + summary_line=$(tail -n1 test-te.log) + errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) + failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) + total_tests=$((failed_tests + passed_tests)) + echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + TIMEOUT_MINUTES: 120 + ARTIFACTS: | + test-te.log + pytest-report.jsonl + secrets: inherit + + test-upstream-pax: + needs: build-upstream-pax + if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 + uses: ./.github/workflows/_test_upstream_pax.yaml + with: + PAX_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} + secrets: inherit + + test-rosetta-pax: + needs: build-rosetta-pax + if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 + uses: 
./.github/workflows/_test_pax_rosetta.yaml + with: + PAX_IMAGE: ${{ needs.build-rosetta-pax.outputs.DOCKER_TAG_FINAL }} + secrets: inherit - # test-gemma: - # needs: build-gemma - # uses: ./.github/workflows/_test_unit.yaml - # if: inputs.ARCHITECTURE == 'amd64' - # with: - # TEST_NAME: gemma - # EXECUTE: | - # docker run --shm-size=1g --gpus all ${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }} \ - # bash -ec \ - # "cd /opt/gemma && pip install -e .[dev] && pytest ." | tee test-gemma.log - # STATISTICS_SCRIPT: | - # summary_line=$(tail -n1 test-gemma.log) - # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - # failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') - # passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') - # total_tests=$((failed_tests + passed_tests)) - # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - # ARTIFACTS: | - # test-gemma.log - # secrets: inherit - - # test-maxtext: - # needs: build-maxtext - # if: inputs.ARCHITECTURE == 'amd64' # no arm64 gpu runners - # uses: ./.github/workflows/_test_maxtext.yaml - # with: - # MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }} - # secrets: inherit + test-gemma: + needs: build-gemma + uses: ./.github/workflows/_test_unit.yaml + if: inputs.ARCHITECTURE == 'amd64' + with: + TEST_NAME: gemma + EXECUTE: | + docker run --shm-size=1g --gpus all ${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }} \ + bash -ec \ + "cd /opt/gemma && pip install -e .[dev] && pytest ." 
| tee test-gemma.log + STATISTICS_SCRIPT: | + summary_line=$(tail -n1 test-gemma.log) + errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') + passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') + total_tests=$((failed_tests + passed_tests)) + echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + ARTIFACTS: | + test-gemma.log + secrets: inherit + + test-maxtext: + needs: build-maxtext + if: inputs.ARCHITECTURE == 'amd64' # no arm64 gpu runners + uses: ./.github/workflows/_test_maxtext.yaml + with: + MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }} + secrets: inherit test-axlearn-eks: needs: build-axlearn From 852d381c189fb1d46e501c8e138d001a33ecb2b3 Mon Sep 17 00:00:00 2001 From: Steboss Date: Tue, 18 Feb 2025 10:37:41 +0000 Subject: [PATCH 39/89] test axlearn --- .github/workflows/_ci.yaml | 974 ++++++++++++++++++------------------- 1 file changed, 487 insertions(+), 487 deletions(-) diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 5d7a2cf0a..d2055d6a8 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -66,115 +66,115 @@ jobs: URLREF_TRANSFORMER_ENGINE=${{ fromJson(inputs.SOURCE_URLREFS).TRANSFORMER_ENGINE }} secrets: inherit - build-triton: - needs: build-jax - if: inputs.ARCHITECTURE == 'amd64' # Triton does not seem to support arm64 - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: artifact-triton-build - BADGE_FILENAME: badge-triton-build - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: triton - DOCKERFILE: 
.github/container/Dockerfile.triton - RUNNER_SIZE: large - EXTRA_BUILD_ARGS: | - URLREF_JAX_TRITON=${{ fromJson(inputs.SOURCE_URLREFS).JAX_TRITON }} - secrets: inherit + # build-triton: + # needs: build-jax + # if: inputs.ARCHITECTURE == 'amd64' # Triton does not seem to support arm64 + # uses: ./.github/workflows/_build.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: artifact-triton-build + # BADGE_FILENAME: badge-triton-build + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: triton + # DOCKERFILE: .github/container/Dockerfile.triton + # RUNNER_SIZE: large + # EXTRA_BUILD_ARGS: | + # URLREF_JAX_TRITON=${{ fromJson(inputs.SOURCE_URLREFS).JAX_TRITON }} + # secrets: inherit - build-equinox: - needs: build-jax - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: artifact-equinox-build - BADGE_FILENAME: badge-equinox-build - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: equinox - DOCKERFILE: .github/container/Dockerfile.equinox - EXTRA_BUILD_ARGS: | - URLREF_EQUINOX=${{ fromJson(inputs.SOURCE_URLREFS).EQUINOX }} - secrets: inherit + # build-equinox: + # needs: build-jax + # uses: ./.github/workflows/_build.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: artifact-equinox-build + # BADGE_FILENAME: badge-equinox-build + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: equinox + # DOCKERFILE: .github/container/Dockerfile.equinox + # EXTRA_BUILD_ARGS: | + # URLREF_EQUINOX=${{ fromJson(inputs.SOURCE_URLREFS).EQUINOX }} + # secrets: inherit - build-maxtext: - needs: build-jax - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: artifact-maxtext-build - BADGE_FILENAME: 
badge-maxtext-build - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: maxtext - DOCKERFILE: .github/container/Dockerfile.maxtext - EXTRA_BUILD_ARGS: | - URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }} - secrets: inherit + # build-maxtext: + # needs: build-jax + # uses: ./.github/workflows/_build.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: artifact-maxtext-build + # BADGE_FILENAME: badge-maxtext-build + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: maxtext + # DOCKERFILE: .github/container/Dockerfile.maxtext + # EXTRA_BUILD_ARGS: | + # URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }} + # secrets: inherit - build-levanter: - needs: [build-jax] - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: "artifact-levanter-build" - BADGE_FILENAME: "badge-levanter-build" - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: levanter - DOCKERFILE: .github/container/Dockerfile.levanter - EXTRA_BUILD_ARGS: | - URLREF_LEVANTER=${{ fromJson(inputs.SOURCE_URLREFS).LEVANTER }} - URLREF_HALIAX=${{ fromJson(inputs.SOURCE_URLREFS).HALIAX }} - secrets: inherit + # build-levanter: + # needs: [build-jax] + # uses: ./.github/workflows/_build.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: "artifact-levanter-build" + # BADGE_FILENAME: "badge-levanter-build" + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: levanter + # DOCKERFILE: .github/container/Dockerfile.levanter + # EXTRA_BUILD_ARGS: | + # URLREF_LEVANTER=${{ fromJson(inputs.SOURCE_URLREFS).LEVANTER }} + # URLREF_HALIAX=${{ fromJson(inputs.SOURCE_URLREFS).HALIAX }} + # secrets: inherit - 
build-upstream-t5x: - needs: build-jax - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: "artifact-t5x-build" - BADGE_FILENAME: "badge-t5x-build" - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: upstream-t5x - DOCKERFILE: .github/container/Dockerfile.t5x - EXTRA_BUILD_ARGS: | - URLREF_T5X=${{ fromJson(inputs.SOURCE_URLREFS).T5X }} - URLREF_AIRIO=${{ fromJson(inputs.SOURCE_URLREFS).AIRIO }} - secrets: inherit + # build-upstream-t5x: + # needs: build-jax + # uses: ./.github/workflows/_build.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: "artifact-t5x-build" + # BADGE_FILENAME: "badge-t5x-build" + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: upstream-t5x + # DOCKERFILE: .github/container/Dockerfile.t5x + # EXTRA_BUILD_ARGS: | + # URLREF_T5X=${{ fromJson(inputs.SOURCE_URLREFS).T5X }} + # URLREF_AIRIO=${{ fromJson(inputs.SOURCE_URLREFS).AIRIO }} + # secrets: inherit - build-rosetta-t5x: - needs: build-upstream-t5x - uses: ./.github/workflows/_build_rosetta.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }} - BASE_LIBRARY: t5x - secrets: inherit + # build-rosetta-t5x: + # needs: build-upstream-t5x + # uses: ./.github/workflows/_build_rosetta.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }} + # BASE_LIBRARY: t5x + # secrets: inherit - build-gemma: - needs: build-jax - uses: ./.github/workflows/_build.yaml - if: inputs.ARCHITECTURE == 'amd64' # build only amd64 - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: artifact-gemma-build - BADGE_FILENAME: badge-gemma-build - 
BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: gemma - DOCKERFILE: rosetta/Dockerfile.gemma - DOCKER_CONTEXT: . - EXTRA_BUILD_ARGS: | - URLREF_GEMMA=${{ fromJson(inputs.SOURCE_URLREFS).GEMMA }} - URLREF_BIG_VISION=${{ fromJson(inputs.SOURCE_URLREFS).BIG_VISION }} - URLREF_COMMON_LOOP_UTILS=${{ fromJson(inputs.SOURCE_URLREFS).COMMON_LOOP_UTILS }} - URLREF_FLAXFORMER=${{ fromJson(inputs.SOURCE_URLREFS).FLAXFORMER }} - URLREF_PANOPTICAPI=${{ fromJson(inputs.SOURCE_URLREFS).PANOPTICAPI }} - secrets: inherit + # build-gemma: + # needs: build-jax + # uses: ./.github/workflows/_build.yaml + # if: inputs.ARCHITECTURE == 'amd64' # build only amd64 + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: artifact-gemma-build + # BADGE_FILENAME: badge-gemma-build + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: gemma + # DOCKERFILE: rosetta/Dockerfile.gemma + # DOCKER_CONTEXT: . 
+ # EXTRA_BUILD_ARGS: | + # URLREF_GEMMA=${{ fromJson(inputs.SOURCE_URLREFS).GEMMA }} + # URLREF_BIG_VISION=${{ fromJson(inputs.SOURCE_URLREFS).BIG_VISION }} + # URLREF_COMMON_LOOP_UTILS=${{ fromJson(inputs.SOURCE_URLREFS).COMMON_LOOP_UTILS }} + # URLREF_FLAXFORMER=${{ fromJson(inputs.SOURCE_URLREFS).FLAXFORMER }} + # URLREF_PANOPTICAPI=${{ fromJson(inputs.SOURCE_URLREFS).PANOPTICAPI }} + # secrets: inherit build-axlearn: needs: build-jax @@ -194,14 +194,14 @@ jobs: if: "!cancelled()" needs: - build-base - - build-jax - - build-triton - - build-equinox - - build-maxtext - - build-levanter - - build-upstream-t5x - - build-rosetta-t5x - - build-gemma + # - build-jax + # - build-triton + # - build-equinox + # - build-maxtext + # - build-levanter + # - build-upstream-t5x + # - build-rosetta-t5x + # - build-gemma - build-axlearn outputs: TAGS: ${{ steps.collect-tags.outputs.TAGS }} @@ -213,22 +213,22 @@ jobs: [\ {"flavor": "base", "stage": "final", "priority": 800, "tag": "${{ needs.build-base.outputs.DOCKER_TAG }}"},\ {"flavor": "jax", "stage": "final", "priority": 1000, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "triton", "stage": "final", "priority": 900, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "equinox", "stage": "final", "priority": 900, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "maxtext", "stage": "final", "priority": 900, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "levanter", "stage": "final", "priority": 900, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "upstream-t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "gemma", "stage": "final", "priority": 900, "tag": "${{ 
needs.build-gemma.outputs.DOCKER_TAG_FINAL }}"},\ + # {"flavor": "triton", "stage": "final", "priority": 900, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }}"},\ + # {"flavor": "equinox", "stage": "final", "priority": 900, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }}"},\ + # {"flavor": "maxtext", "stage": "final", "priority": 900, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}"},\ + # {"flavor": "levanter", "stage": "final", "priority": 900, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }}"},\ + # {"flavor": "upstream-t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }}"},\ + # {"flavor": "t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }}"},\ + # {"flavor": "gemma", "stage": "final", "priority": 900, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "axlearn", "stage": "final", "priority": 900, "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "jax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "triton", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "equinox", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "maxtext", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "levanter", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "upstream-t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "gemma", "stage": "mealkit", "priority": 500, 
"tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_MEALKIT }}"},\ + # {"flavor": "triton", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_MEALKIT }}"},\ + # {"flavor": "equinox", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_MEALKIT }}"},\ + # {"flavor": "maxtext", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_MEALKIT }}"},\ + # {"flavor": "levanter", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_MEALKIT }}"},\ + # {"flavor": "upstream-t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ + # {"flavor": "t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ + # {"flavor": "gemma", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_MEALKIT }}"},\ {"flavor": "axlearn", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_MEALKIT }}"},\ {}\ @@ -238,275 +238,275 @@ jobs: echo "TAGS=${TAGS}" >> $GITHUB_OUTPUT - test-distribution: - runs-on: ubuntu-22.04 - strategy: - matrix: - TEST_SCRIPT: - - extra-only-distribution.sh - - mirror-only-distribution.sh - - upstream-only-distribution.sh - - local-patch-distribution.sh - fail-fast: false - steps: - - name: Print environment variables - run: env - - name: Set git login for tests - run: | - git config --global user.email "jax@nvidia.com" - git config --global user.name "JAX-Toolbox CI" - - name: Check out the repository under ${GITHUB_WORKSPACE} - uses: actions/checkout@v4 - - name: Run integration test ${{ matrix.TEST_SCRIPT }} - run: bash rosetta/tests/${{ matrix.TEST_SCRIPT }} - - test-jax: - needs: build-jax - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_unit.yaml - with: - TEST_NAME: jax - EXECUTE: | - docker run -i --shm-size=1g 
--gpus all \ - ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee test-backend-independent.log - test-jax.sh -b backend-independent - EOF - docker run -i --shm-size=1g --gpus all \ - ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee tee test-gpu.log - nvidia-cuda-mps-control -d - test-jax.sh -b gpu - EOF - STATISTICS_SCRIPT: | - errors=$(cat test-*.log | grep -c 'ERROR:' || true) - failed_tests=$(cat test-*.log | grep -c 'FAILED in' || true) - passed_tests=$(cat test-*.log | grep -c 'PASSED in' || true) - total_tests=$((failed_tests + passed_tests)) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - ARTIFACTS: | - test-backend-independent.log - test-gpu.log - secrets: inherit + # test-distribution: + # runs-on: ubuntu-22.04 + # strategy: + # matrix: + # TEST_SCRIPT: + # - extra-only-distribution.sh + # - mirror-only-distribution.sh + # - upstream-only-distribution.sh + # - local-patch-distribution.sh + # fail-fast: false + # steps: + # - name: Print environment variables + # run: env + # - name: Set git login for tests + # run: | + # git config --global user.email "jax@nvidia.com" + # git config --global user.name "JAX-Toolbox CI" + # - name: Check out the repository under ${GITHUB_WORKSPACE} + # uses: actions/checkout@v4 + # - name: Run integration test ${{ matrix.TEST_SCRIPT }} + # run: bash rosetta/tests/${{ matrix.TEST_SCRIPT }} + + # test-jax: + # needs: build-jax + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # uses: ./.github/workflows/_test_unit.yaml + # with: + # TEST_NAME: jax + # EXECUTE: | + # docker run -i --shm-size=1g --gpus all \ + # ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-backend-independent.log + # test-jax.sh -b backend-independent + # EOF + # docker run -i --shm-size=1g --gpus all \ + # ${{ 
needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee tee test-gpu.log + # nvidia-cuda-mps-control -d + # test-jax.sh -b gpu + # EOF + # STATISTICS_SCRIPT: | + # errors=$(cat test-*.log | grep -c 'ERROR:' || true) + # failed_tests=$(cat test-*.log | grep -c 'FAILED in' || true) + # passed_tests=$(cat test-*.log | grep -c 'PASSED in' || true) + # total_tests=$((failed_tests + passed_tests)) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # ARTIFACTS: | + # test-backend-independent.log + # test-gpu.log + # secrets: inherit - test-nsys-jax: - needs: build-jax - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_unit.yaml - with: - TEST_NAME: nsys-jax - EXECUTE: | - set -o pipefail - num_tests=0 - num_failures=0 - # Run the pytest-driven tests; failure is explicitly handled below so set +e to - # avoid an early abort here. 
- set +e - docker run -i --shm-size=1g --gpus all \ - -v $PWD:/opt/output \ - ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee test-nsys-jax.log - # nsys-jax is already installed, this is just adding the test dependencies - pip install pytest-reportlog nsys-jax[test] - # abuse knowledge that nsys-jax is installed editable, so the tests exist - test_path=$(python -c 'import importlib.resources; print(importlib.resources.files("nsys_jax").joinpath("..", "tests").resolve())') - pytest --report-log=/opt/output/pytest-report.jsonl "${test_path}" - EOF - set -e - GPUS_PER_NODE=$(nvidia-smi -L | grep -c '^GPU') - for mode in 1-process 2-process process-per-gpu; do - DOCKER="docker run --shm-size=1g --gpus all --env XLA_FLAGS=--xla_gpu_enable_command_buffer= --env XLA_PYTHON_CLIENT_MEM_FRACTION=0.9 -v ${PWD}:/opt/output ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}" - if [[ "${mode}" == "1-process" ]]; then - PROCESS_COUNT=1 - ARGS="" - elif [[ "${mode}" == "2-process" ]]; then - # Use two processes with GPUS_PER_NODE/2 GPUs per process in the hope that - # this will flush out more bugs than process-per-node or process-per-GPU. 
- PROCESS_COUNT=2 - ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process $((GPUS_PER_NODE/2)) --distributed" - else - PROCESS_COUNT=${GPUS_PER_NODE} - ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process 1 --distributed" - fi - for collection in full partial; do - NSYS_JAX="nsys-jax" - if [[ "${mode}" == "1-process" ]]; then - # We will not run nsys-jax-combine, so run analyses eagerly - NSYS_JAX+=" --nsys-jax-analysis communication --nsys-jax-analysis summary" - fi - NSYS_JAX+=" --output=/opt/output/${mode}-${collection}-execution-%q{RANK}" - if [[ "${collection}" == "partial" ]]; then - NSYS_JAX+=" --capture-range=cudaProfilerApi --capture-range-end=stop" - # nvbug/4801401 - NSYS_JAX+=" --sample=none" - fi - set +e - ${DOCKER} parallel-launch RANK ${PROCESS_COUNT} ${NSYS_JAX} \ - -- jax-nccl-test ${ARGS} |& tee ${mode}-${collection}-execution.log - num_failures=$((num_failures + ($? 
!= 0))) - set -e - num_tests=$((num_tests + 1)) - done - if [[ "${mode}" != "1-process" ]]; then - # Run nsys-jax-combine - NSYS_JAX_COMBINE="nsys-jax-combine --analysis communication --analysis summary --output=/opt/output/${mode}-${collection}-execution.zip" - for (( i=0; i> $GITHUB_ENV - echo "NSYS_JAX_FAIL_COUNT=${num_failures}" >> $GITHUB_ENV - exit $num_failures - STATISTICS_SCRIPT: | - summary_line=$(tail -n1 test-nsys-jax.log) - num_errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) - failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) - total_tests=$(( NSYS_JAX_TEST_COUNT + passed_tests + failed_tests )) - num_passed=$(( passed_tests + NSYS_JAX_TEST_COUNT - NSYS_JAX_FAIL_COUNT )) - num_failed=$(( failed_tests + NSYS_JAX_FAIL_COUNT )) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${num_errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${num_passed}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${num_failed}" >> $GITHUB_OUTPUT - ARTIFACTS: | - # pytest-driven part - test-nsys-jax.log - pytest-report.jsonl - # nsys-jax logfiles - *process-*-execution.log - # nsys-jax output for the case that doesn't use nsys-jax-combine - 1-process-*-execution-0.zip - # nsys-jax-combine output/logfiles - *process*-*-execution.zip - *-execution-combine.log - secrets: inherit + # test-nsys-jax: + # needs: build-jax + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # uses: ./.github/workflows/_test_unit.yaml + # with: + # TEST_NAME: nsys-jax + # EXECUTE: | + # set -o pipefail + # num_tests=0 + # num_failures=0 + # # Run the pytest-driven tests; failure is explicitly handled below so set +e to + # # avoid an early abort here. 
+ # set +e + # docker run -i --shm-size=1g --gpus all \ + # -v $PWD:/opt/output \ + # ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-nsys-jax.log + # # nsys-jax is already installed, this is just adding the test dependencies + # pip install pytest-reportlog nsys-jax[test] + # # abuse knowledge that nsys-jax is installed editable, so the tests exist + # test_path=$(python -c 'import importlib.resources; print(importlib.resources.files("nsys_jax").joinpath("..", "tests").resolve())') + # pytest --report-log=/opt/output/pytest-report.jsonl "${test_path}" + # EOF + # set -e + # GPUS_PER_NODE=$(nvidia-smi -L | grep -c '^GPU') + # for mode in 1-process 2-process process-per-gpu; do + # DOCKER="docker run --shm-size=1g --gpus all --env XLA_FLAGS=--xla_gpu_enable_command_buffer= --env XLA_PYTHON_CLIENT_MEM_FRACTION=0.9 -v ${PWD}:/opt/output ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}" + # if [[ "${mode}" == "1-process" ]]; then + # PROCESS_COUNT=1 + # ARGS="" + # elif [[ "${mode}" == "2-process" ]]; then + # # Use two processes with GPUS_PER_NODE/2 GPUs per process in the hope that + # # this will flush out more bugs than process-per-node or process-per-GPU. 
+ # PROCESS_COUNT=2 + # ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process $((GPUS_PER_NODE/2)) --distributed" + # else + # PROCESS_COUNT=${GPUS_PER_NODE} + # ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process 1 --distributed" + # fi + # for collection in full partial; do + # NSYS_JAX="nsys-jax" + # if [[ "${mode}" == "1-process" ]]; then + # # We will not run nsys-jax-combine, so run analyses eagerly + # NSYS_JAX+=" --nsys-jax-analysis communication --nsys-jax-analysis summary" + # fi + # NSYS_JAX+=" --output=/opt/output/${mode}-${collection}-execution-%q{RANK}" + # if [[ "${collection}" == "partial" ]]; then + # NSYS_JAX+=" --capture-range=cudaProfilerApi --capture-range-end=stop" + # # nvbug/4801401 + # NSYS_JAX+=" --sample=none" + # fi + # set +e + # ${DOCKER} parallel-launch RANK ${PROCESS_COUNT} ${NSYS_JAX} \ + # -- jax-nccl-test ${ARGS} |& tee ${mode}-${collection}-execution.log + # num_failures=$((num_failures + ($? 
!= 0))) + # set -e + # num_tests=$((num_tests + 1)) + # done + # if [[ "${mode}" != "1-process" ]]; then + # # Run nsys-jax-combine + # NSYS_JAX_COMBINE="nsys-jax-combine --analysis communication --analysis summary --output=/opt/output/${mode}-${collection}-execution.zip" + # for (( i=0; i> $GITHUB_ENV + # echo "NSYS_JAX_FAIL_COUNT=${num_failures}" >> $GITHUB_ENV + # exit $num_failures + # STATISTICS_SCRIPT: | + # summary_line=$(tail -n1 test-nsys-jax.log) + # num_errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + # passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) + # failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) + # total_tests=$(( NSYS_JAX_TEST_COUNT + passed_tests + failed_tests )) + # num_passed=$(( passed_tests + NSYS_JAX_TEST_COUNT - NSYS_JAX_FAIL_COUNT )) + # num_failed=$(( failed_tests + NSYS_JAX_FAIL_COUNT )) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${num_errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${num_passed}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${num_failed}" >> $GITHUB_OUTPUT + # ARTIFACTS: | + # # pytest-driven part + # test-nsys-jax.log + # pytest-report.jsonl + # # nsys-jax logfiles + # *process-*-execution.log + # # nsys-jax output for the case that doesn't use nsys-jax-combine + # 1-process-*-execution-0.zip + # # nsys-jax-combine output/logfiles + # *process*-*-execution.zip + # *-execution-combine.log + # secrets: inherit #test-nsys-jax generates several fresh .zip archive outputs by running nsys-jax with real GPU hardware; this test #runs on a regular GitHub Actions runner and checks that offline post-processing works in an environment that does #not already have nsys-jax installed - test-nsys-jax-archive: - needs: test-nsys-jax - if: 
inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - strategy: - matrix: - os: [ubuntu-22.04, ubuntu-24.04, macOS-latest] - runs-on: ${{ matrix.os }} - steps: - - name: Download nsys-jax output .zip files - uses: actions/download-artifact@v4 - with: - name: nsys-jax-unit-test-A100 - - name: Extract archives and execute install scripts - run: | - pip install virtualenv # for install.sh - for zip in $(ls *.zip); do - ZIP="${PWD}/${zip}" - pushd $(mktemp -d) - unzip "${ZIP}" - ls -l - # TODO: verify this isn't needed, or make sure it isn't needed - chmod 755 install.sh - # Run the notebook with IPython, not Jupyter Lab, so it exits and prints something informative to stdout - # Skip executing Jupyter lab - NSYS_JAX_JUPYTER_EXECUTE_NOT_LAB=1 ./install.sh - popd - done - - test-nsys-jax-eks: - needs: build-jax - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - runs-on: eks - env: - JAX_DOCKER_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} - JOB_NAME: ${{ github.run_id }}-nsys-jax - POSTPROCESS_JOB_NAME: ${{ github.run_id }}-nsys-jax-postprocess - TOKEN_NAME: ${{ github.run_id }}-nsys-jax-token - steps: - - name: Check out the repository - uses: actions/checkout@v4 - - name: GHCR login - uses: ./.github/actions/ghcr-login - with: - docker-username: ${{ github.repository_owner }} - docker-password: ${{ secrets.GITHUB_TOKEN}} - token-name: ${{ env.TOKEN_NAME }} - - name: Configure Kubernetes job - run: | - yq -i ea 'select(di == 0).spec.selector.job-name = strenv(JOB_NAME) - | select(di == 1).metadata.name = strenv(JOB_NAME) - | select(di == 1).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) - | select(di == 1).spec.template.spec.containers[0].image = strenv(JAX_DOCKER_IMAGE) - | select(di == 1).spec.template.spec.containers[0].env[0].value = strenv(JOB_NAME)' \ - .github/eks-workflow-files/job.yml - git diff .github/eks-workflow-files/job.yml - - name: Submit Kubernetes job - uses: ./.github/actions/submit-k8s-job - with: - job-config-file: 
.github/eks-workflow-files/job.yml - job-name: ${{ env.JOB_NAME }} - - name: Delete eks job - uses: ./.github/actions/delete-k8s-job - if: always() - with: - job-name: ${{ env.JOB_NAME }} - - name: Configure post-processing job - run: | - export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip" - yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME) - | .spec.template.spec.containers[].image = strenv(JAX_DOCKER_IMAGE) - | .spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) - | .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \ - .github/eks-workflow-files/post-process-job.yml - git diff .github/eks-workflow-files/post-process-job.yml - - name: Submit post process k8s job - uses: ./.github/actions/submit-k8s-job - with: - job-config-file: .github/eks-workflow-files/post-process-job.yml - job-name: ${{ env.POSTPROCESS_JOB_NAME }} - - name: Delete post process k8s job - uses: ./.github/actions/delete-k8s-job - with: - job-name: ${{ env.POSTPROCESS_JOB_NAME }} - - name: Delete GitHub Container Registry token - uses: ./.github/actions/delete-ghcr-token - if: always() - with: - token-name: ${{ env.TOKEN_NAME }} - - test-equinox: - needs: build-equinox - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_unit.yaml - with: - IMAGE: ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} - TEST_NAME: equinox - EXECUTE: | - docker run --shm-size=1g --gpus all ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} \ - bash -exc -o pipefail \ - 'pushd /opt/equinox/tests && pip install -r requirements.txt && pytest .' 
| tee test-equinox.log - STATISTICS_SCRIPT: | - errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') - passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') - total_tests=$((failed_tests + passed_tests)) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - ARTIFACTS: | - test-equinox.log - secrets: inherit + # test-nsys-jax-archive: + # needs: test-nsys-jax + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # strategy: + # matrix: + # os: [ubuntu-22.04, ubuntu-24.04, macOS-latest] + # runs-on: ${{ matrix.os }} + # steps: + # - name: Download nsys-jax output .zip files + # uses: actions/download-artifact@v4 + # with: + # name: nsys-jax-unit-test-A100 + # - name: Extract archives and execute install scripts + # run: | + # pip install virtualenv # for install.sh + # for zip in $(ls *.zip); do + # ZIP="${PWD}/${zip}" + # pushd $(mktemp -d) + # unzip "${ZIP}" + # ls -l + # # TODO: verify this isn't needed, or make sure it isn't needed + # chmod 755 install.sh + # # Run the notebook with IPython, not Jupyter Lab, so it exits and prints something informative to stdout + # # Skip executing Jupyter lab + # NSYS_JAX_JUPYTER_EXECUTE_NOT_LAB=1 ./install.sh + # popd + # done + + # test-nsys-jax-eks: + # needs: build-jax + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # runs-on: eks + # env: + # JAX_DOCKER_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} + # JOB_NAME: ${{ github.run_id }}-nsys-jax + # POSTPROCESS_JOB_NAME: ${{ github.run_id }}-nsys-jax-postprocess + # TOKEN_NAME: ${{ github.run_id }}-nsys-jax-token + # steps: + # - name: Check out the repository + # uses: actions/checkout@v4 + # 
- name: GHCR login + # uses: ./.github/actions/ghcr-login + # with: + # docker-username: ${{ github.repository_owner }} + # docker-password: ${{ secrets.GITHUB_TOKEN}} + # token-name: ${{ env.TOKEN_NAME }} + # - name: Configure Kubernetes job + # run: | + # yq -i ea 'select(di == 0).spec.selector.job-name = strenv(JOB_NAME) + # | select(di == 1).metadata.name = strenv(JOB_NAME) + # | select(di == 1).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) + # | select(di == 1).spec.template.spec.containers[0].image = strenv(JAX_DOCKER_IMAGE) + # | select(di == 1).spec.template.spec.containers[0].env[0].value = strenv(JOB_NAME)' \ + # .github/eks-workflow-files/job.yml + # git diff .github/eks-workflow-files/job.yml + # - name: Submit Kubernetes job + # uses: ./.github/actions/submit-k8s-job + # with: + # job-config-file: .github/eks-workflow-files/job.yml + # job-name: ${{ env.JOB_NAME }} + # - name: Delete eks job + # uses: ./.github/actions/delete-k8s-job + # if: always() + # with: + # job-name: ${{ env.JOB_NAME }} + # - name: Configure post-processing job + # run: | + # export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip" + # yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME) + # | .spec.template.spec.containers[].image = strenv(JAX_DOCKER_IMAGE) + # | .spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) + # | .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \ + # .github/eks-workflow-files/post-process-job.yml + # git diff .github/eks-workflow-files/post-process-job.yml + # - name: Submit post process k8s job + # uses: ./.github/actions/submit-k8s-job + # with: + # job-config-file: .github/eks-workflow-files/post-process-job.yml + # job-name: ${{ env.POSTPROCESS_JOB_NAME }} + # - name: Delete post process k8s job + # uses: ./.github/actions/delete-k8s-job + # with: + # job-name: ${{ env.POSTPROCESS_JOB_NAME }} + # - name: Delete GitHub Container Registry token + # uses: ./.github/actions/delete-ghcr-token + # if: 
always() + # with: + # token-name: ${{ env.TOKEN_NAME }} + + # test-equinox: + # needs: build-equinox + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # uses: ./.github/workflows/_test_unit.yaml + # with: + # IMAGE: ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} + # TEST_NAME: equinox + # EXECUTE: | + # docker run --shm-size=1g --gpus all ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} \ + # bash -exc -o pipefail \ + # 'pushd /opt/equinox/tests && pip install -r requirements.txt && pytest .' | tee test-equinox.log + # STATISTICS_SCRIPT: | + # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + # failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') + # passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') + # total_tests=$((failed_tests + passed_tests)) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # ARTIFACTS: | + # test-equinox.log + # secrets: inherit # test-te-multigpu: # needs: build-upstream-pax @@ -516,77 +516,77 @@ jobs: # TE_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} # secrets: inherit - test-upstream-t5x: - needs: build-upstream-t5x - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_upstream_t5x.yaml - with: - T5X_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }} - secrets: inherit + # test-upstream-t5x: + # needs: build-upstream-t5x + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # uses: ./.github/workflows/_test_upstream_t5x.yaml + # with: + # T5X_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit - test-rosetta-t5x: - needs: build-rosetta-t5x - if: inputs.ARCHITECTURE == 'amd64' # no images for 
arm64 - uses: ./.github/workflows/_test_t5x_rosetta.yaml - with: - T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }} - secrets: inherit + # test-rosetta-t5x: + # needs: build-rosetta-t5x + # if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 + # uses: ./.github/workflows/_test_t5x_rosetta.yaml + # with: + # T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit - test-triton: - needs: build-triton - if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 - uses: ./.github/workflows/_test_unit.yaml - with: - TEST_NAME: triton - EXECUTE: | - docker run -i --shm-size=1g --gpus all --volume $PWD:/output \ - ${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee test-triton.log - # autotuner tests from jax-triton now hit a triton code path that uses utilities from pytorch; this relies on - # actually having a CUDA backend for pytoch - pip install --no-deps torch - python /opt/jax-triton/tests/triton_call_test.py --xml_output_file /output/triton_test.xml - EOF - STATISTICS_SCRIPT: | - curl -L -o yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg --print-architecture) && chmod 777 yq; - total_tests=$(./yq '.testsuites."+@tests"' triton_test.xml) - errors=$(./yq '.testsuites."+@errors"' triton_test.xml) - failed_tests=$(./yq '.testsuites."+@failures"' triton_test.xml) - passed_tests=$((total_tests - errors - failed_tests)) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - ARTIFACTS: | - test-triton.log - secrets: inherit + # test-triton: + # needs: build-triton + # if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 + # uses: ./.github/workflows/_test_unit.yaml + # with: + # TEST_NAME: triton + # EXECUTE: | + # docker run -i --shm-size=1g --gpus all --volume $PWD:/output \ + # ${{ 
needs.build-triton.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-triton.log + # # autotuner tests from jax-triton now hit a triton code path that uses utilities from pytorch; this relies on + # # actually having a CUDA backend for pytoch + # pip install --no-deps torch + # python /opt/jax-triton/tests/triton_call_test.py --xml_output_file /output/triton_test.xml + # EOF + # STATISTICS_SCRIPT: | + # curl -L -o yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg --print-architecture) && chmod 777 yq; + # total_tests=$(./yq '.testsuites."+@tests"' triton_test.xml) + # errors=$(./yq '.testsuites."+@errors"' triton_test.xml) + # failed_tests=$(./yq '.testsuites."+@failures"' triton_test.xml) + # passed_tests=$((total_tests - errors - failed_tests)) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # ARTIFACTS: | + # test-triton.log + # secrets: inherit - test-levanter: - needs: build-levanter - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_unit.yaml - with: - TEST_NAME: levanter - EXECUTE: | - docker run -i --gpus all --shm-size=1g \ - ${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee test-levanter.log - pip install flake8 pytest soundfile librosa - PYTHONPATH=/opt/levanter/tests:$PYTHONPATH pytest /opt/levanter/tests -m "not entry and not slow and not ray" - EOF - STATISTICS_SCRIPT: | - summary_line=$(tail -n1 test-levanter.log) - errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') - passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') - total_tests=$((failed_tests + passed_tests)) - echo 
"TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - ARTIFACTS: | - test-levanter.log - secrets: inherit + # test-levanter: + # needs: build-levanter + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # uses: ./.github/workflows/_test_unit.yaml + # with: + # TEST_NAME: levanter + # EXECUTE: | + # docker run -i --gpus all --shm-size=1g \ + # ${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-levanter.log + # pip install flake8 pytest soundfile librosa + # PYTHONPATH=/opt/levanter/tests:$PYTHONPATH pytest /opt/levanter/tests -m "not entry and not slow and not ray" + # EOF + # STATISTICS_SCRIPT: | + # summary_line=$(tail -n1 test-levanter.log) + # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + # failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') + # passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') + # total_tests=$((failed_tests + passed_tests)) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # ARTIFACTS: | + # test-levanter.log + # secrets: inherit # test-te: # needs: build-upstream-pax @@ -617,37 +617,37 @@ jobs: # pytest-report.jsonl # secrets: inherit - test-gemma: - needs: build-gemma - uses: ./.github/workflows/_test_unit.yaml - if: inputs.ARCHITECTURE == 'amd64' - with: - TEST_NAME: gemma - EXECUTE: | - docker run --shm-size=1g --gpus all ${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }} \ - bash -ec \ - "cd /opt/gemma && pip install -e .[dev] && pytest ." 
| tee test-gemma.log - STATISTICS_SCRIPT: | - summary_line=$(tail -n1 test-gemma.log) - errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') - passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') - total_tests=$((failed_tests + passed_tests)) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - ARTIFACTS: | - test-gemma.log - secrets: inherit + # test-gemma: + # needs: build-gemma + # uses: ./.github/workflows/_test_unit.yaml + # if: inputs.ARCHITECTURE == 'amd64' + # with: + # TEST_NAME: gemma + # EXECUTE: | + # docker run --shm-size=1g --gpus all ${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }} \ + # bash -ec \ + # "cd /opt/gemma && pip install -e .[dev] && pytest ." 
| tee test-gemma.log + # STATISTICS_SCRIPT: | + # summary_line=$(tail -n1 test-gemma.log) + # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + # failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') + # passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') + # total_tests=$((failed_tests + passed_tests)) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # ARTIFACTS: | + # test-gemma.log + # secrets: inherit - test-maxtext: - needs: build-maxtext - if: inputs.ARCHITECTURE == 'amd64' # no arm64 gpu runners - uses: ./.github/workflows/_test_maxtext.yaml - with: - MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }} - secrets: inherit + # test-maxtext: + # needs: build-maxtext + # if: inputs.ARCHITECTURE == 'amd64' # no arm64 gpu runners + # uses: ./.github/workflows/_test_maxtext.yaml + # with: + # MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit test-axlearn-eks: needs: build-axlearn From c4d3bbfd2a7302d57fbc11cd768baa4ebb7092e2 Mon Sep 17 00:00:00 2001 From: Steboss Date: Tue, 18 Feb 2025 11:36:25 +0000 Subject: [PATCH 40/89] fix nccl test variables, install in test file, make a signal for test finished --- .github/container/test-axlearn.sh | 3 +- .../axlearn/axlearn-job.yml | 8 +++-- .github/workflows/_ci.yaml | 14 ++++----- .github/workflows/nccl-k8s.yaml | 29 ++++++++++--------- 4 files changed, 29 insertions(+), 25 deletions(-) diff --git a/.github/container/test-axlearn.sh b/.github/container/test-axlearn.sh index 5d256706e..d2786a8ca 100755 --- a/.github/container/test-axlearn.sh +++ b/.github/container/test-axlearn.sh @@ -120,7 +120,8 @@ echo "Running tests..." 
# If we are on Kubernetes, install torch for cpu only if [ "$K8S" = true ]; then pip install torch --extra-index-url https://download.pytorch.org/whl/cpu - pip install transformers sklearn timm + pip install transformers + pip install scikit-learn timm fi if [ "${#TEST_FILES[@]}" -eq 0 ]; then diff --git a/.github/eks-workflow-files/axlearn/axlearn-job.yml b/.github/eks-workflow-files/axlearn/axlearn-job.yml index 7e7fe0f15..fd4a63d31 100644 --- a/.github/eks-workflow-files/axlearn/axlearn-job.yml +++ b/.github/eks-workflow-files/axlearn/axlearn-job.yml @@ -30,7 +30,9 @@ spec: # Wait a moment to ensure logs are flushed sync - + wait + # after execution flag the results have been produced + touch /opt/output/done resources: limits: nvidia.com/gpu: 8 @@ -46,8 +48,8 @@ spec: - sh - -c - | - # Wait for the summary file to appear - while [ ! -f /opt/output/summary.txt ]; do + # Wait for the tests to finish + while [ ! -f /opt/output/done ]; do sleep 1 done # Now upload to your S3 bucket diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index d2055d6a8..0a02fe54a 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -191,7 +191,7 @@ jobs: collect-docker-tags: runs-on: ubuntu-22.04 - if: "!cancelled()" + if: ${{ !cancelled() }} needs: - build-base # - build-jax @@ -457,7 +457,7 @@ jobs: # job-name: ${{ env.JOB_NAME }} # - name: Delete eks job # uses: ./.github/actions/delete-k8s-job - # if: always() + # if: ${{ always() }} # with: # job-name: ${{ env.JOB_NAME }} # - name: Configure post-processing job @@ -480,7 +480,7 @@ jobs: # job-name: ${{ env.POSTPROCESS_JOB_NAME }} # - name: Delete GitHub Container Registry token # uses: ./.github/actions/delete-ghcr-token - # if: always() + # if: ${{ always() }} # with: # token-name: ${{ env.TOKEN_NAME }} @@ -688,13 +688,13 @@ jobs: - name: Delete axlearn test job uses: ./.github/actions/delete-k8s-job - if: always() + if: ${{ always() }} with: job-name: ${{ env.JOB_NAME }} - name: Delete 
GitHub Container Registry token uses: ./.github/actions/delete-ghcr-token - if: always() + if: ${{ always() }} with: token-name: ${{ env.TOKEN_NAME }} @@ -717,7 +717,7 @@ jobs: echo "TOTAL_TESTS=$total_tests" >> $GITHUB_OUTPUT - name: Generate sitrep id: sitrep - if: "!cancelled()" + if: ${{ !cancelled() }} shell: bash -x -e {0} run: | # bring in utility functions @@ -750,7 +750,7 @@ jobs: > "badge-axlearn-test" - name: Upload artifacts - if: "!cancelled()" + if: ${{ !cancelled() }} uses: actions/upload-artifact@v4 with: name: "artifact-axlearn-test" diff --git a/.github/workflows/nccl-k8s.yaml b/.github/workflows/nccl-k8s.yaml index 816979355..aff2f8709 100644 --- a/.github/workflows/nccl-k8s.yaml +++ b/.github/workflows/nccl-k8s.yaml @@ -63,9 +63,10 @@ jobs: id: var shell: bash run: | - echo "JOB_NAME=${{ env.JOB_NAME}//_/-}" >> $GITHUB_OUTPUT - echo "LAUNCHER_NAME=nccl-test-${{ env.JOB_NAME}//_/-}-launcher" >> $GITHUB_OUTPUT - echo "TOKEN_NAME=nccl-test-${{ env.JOB_NAME}//_/-}-token" >> $GITHUB_OUTPUT + export JOB_NAME="nccl-test-${{ github.run_id }}-${TEST_NAME//_/-}" + echo "JOB_NAME=${JOB_NAME}" >> $GITHUB_OUTPUT + echo "LAUNCHER_NAME=${JOB_NAME}-launcher" >> $GITHUB_OUTPUT + echo "TOKEN_NAME=nccl-test-${JOB_NAME}-token" >> $GITHUB_OUTPUT - name: GHCR login and store K8s secret uses: ./.github/actions/ghcr-login @@ -76,9 +77,9 @@ jobs: - name: Configure Kubernetes job shell: bash run: | - export JOB_NAME="${{ steps.var.JOB_NAME }}" - export LAUNCHER_NAME="${{ steps.var.LAUNCHER_NAME }}" - export TOKEN_NAME="${{ steps.var.TOKEN_NAME }}" + export JOB_NAME="${{ steps.var.outputs.JOB_NAME }}" + export LAUNCHER_NAME="${{ steps.var.outputs.LAUNCHER_NAME }}" + export TOKEN_NAME="${{ steps.var.outputs.TOKEN_NAME }}" export TEST_NAME="${{ env.TEST_NAME }}" export WORKER_NAME="${JOB_NAME}-worker" @@ -100,11 +101,11 @@ jobs: uses: ./.github/actions/submit-k8s-job with: job-config-file: .github/eks-workflow-files/mpi-nccl-test.yml - job-name: ${{ steps.var.LAUNCHER_NAME 
}} + job-name: ${{ steps.var.outputs.LAUNCHER_NAME }} - name: Retrieve Kubernetes job status shell: bash -exo pipefail run: | - LAUNCHER_NAME="${{ steps.var.LAUNCHER_NAME }}" + LAUNCHER_NAME="${{ steps.var.outputs.LAUNCHER_NAME }}" while readarray -d : -t status < <(kubectl get job/${LAUNCHER_NAME} -o 'jsonpath={.status.failed}:{.status.succeeded}'); do failure=${status[0]:-0} success=${status[1]:-0} @@ -120,21 +121,21 @@ jobs: done exit ${failure} - name: Debug failed Kubernetes job - if: failure() + if: ${{ failure() }} shell: bash run: | - LAUNCHER_NAME="${{ steps.var.LAUNCHER_NAME }}" + LAUNCHER_NAME="${{ steps.var.outputs.LAUNCHER_NAME }}" pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${LAUNCHER_NAME} -o name) if [[ -n "${pods}" ]]; then kubectl describe ${pods} fi - name: Delete Kubernetes job - if: always() + if: ${{ always() }} uses: ./.github/actions/delete-k8s-job with: - job-name: ${{ steps.var.LAUNCHER_NAME }} + job-name: ${{ steps.var.outputs.LAUNCHER_NAME }} - name: Delete GitHub Container Registry token uses: ./.github/actions/delete-ghcr-token - if: always() + if: ${{ always() }} with: - token-name: ${{ steps.var.TOKEN_NAME }} \ No newline at end of file + token-name: ${{ steps.var.outputs.TOKEN_NAME }} \ No newline at end of file From 4b5a56b0d1220dd3914e370a5b0343c87addbbb5 Mon Sep 17 00:00:00 2001 From: Steboss Date: Tue, 18 Feb 2025 11:41:49 +0000 Subject: [PATCH 41/89] Fix var output --- .github/workflows/nccl-k8s.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nccl-k8s.yaml b/.github/workflows/nccl-k8s.yaml index aff2f8709..321fd861f 100644 --- a/.github/workflows/nccl-k8s.yaml +++ b/.github/workflows/nccl-k8s.yaml @@ -73,7 +73,7 @@ jobs: with: docker-username: ${{ github.repository_owner }} docker-password: ${{ secrets.GITHUB_TOKEN }} - token-name: ${{ steps.var.TOKEN_NAME }} + token-name: ${{ steps.var.outputs.TOKEN_NAME }} - name: Configure Kubernetes job shell: bash run: | From 
d205f6a2e199bf1944d45004e4fa942b6172633a Mon Sep 17 00:00:00 2001 From: Steboss Date: Tue, 18 Feb 2025 13:40:02 +0000 Subject: [PATCH 42/89] test clean --- .github/container/test-axlearn.sh | 2 +- .../axlearn/axlearn-job.yml | 9 +- .github/workflows/_ci.yaml | 1074 ++++++++--------- .github/workflows/nccl-k8s.yaml | 2 +- 4 files changed, 541 insertions(+), 546 deletions(-) diff --git a/.github/container/test-axlearn.sh b/.github/container/test-axlearn.sh index d2786a8ca..c62f36f5b 100755 --- a/.github/container/test-axlearn.sh +++ b/.github/container/test-axlearn.sh @@ -176,7 +176,7 @@ passed=0 SUMMARY_FILE="${OUTPUT_DIRECTORY}/summary.txt" -for test_file in "${final_test_files[@]:0:10}"; do +for test_file in "${final_test_files[@]}"; do echo "Running: ${test_file}" log_file_name=$(echo "${test_file%.py}" | sed 's/\//__/g').log log_file="${LOG_DIRECTORY}/${log_file_name}" diff --git a/.github/eks-workflow-files/axlearn/axlearn-job.yml b/.github/eks-workflow-files/axlearn/axlearn-job.yml index fd4a63d31..7c1022f61 100644 --- a/.github/eks-workflow-files/axlearn/axlearn-job.yml +++ b/.github/eks-workflow-files/axlearn/axlearn-job.yml @@ -19,16 +19,12 @@ spec: - pipefail - -c - | - # Example test command; adapted from your Docker run snippet - # Writes logs to /opt/output/test-backend-independent.log - # Also writes a summary file to /opt/output/summary.txt test-axlearn.sh \ --directory "." \ --output "/opt/output/" \ --test-files "/opt/axlearn/axlearn/common/*_test.py" \ --k8s - # Wait a moment to ensure logs are flushed sync wait # after execution flag the results have been produced @@ -48,11 +44,10 @@ spec: - sh - -c - | - # Wait for the tests to finish while [ ! 
-f /opt/output/done ]; do - sleep 1 + sleep 5 done - # Now upload to your S3 bucket + # Upload to S3 bucket aws s3 cp /opt/output/summary.txt s3://jax-toolbox-eks-output/axlearn/${TEST_DATE}/summary.txt volumeMounts: - name: output diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 0a02fe54a..1439f515c 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -66,115 +66,115 @@ jobs: URLREF_TRANSFORMER_ENGINE=${{ fromJson(inputs.SOURCE_URLREFS).TRANSFORMER_ENGINE }} secrets: inherit - # build-triton: - # needs: build-jax - # if: inputs.ARCHITECTURE == 'amd64' # Triton does not seem to support arm64 - # uses: ./.github/workflows/_build.yaml - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # ARTIFACT_NAME: artifact-triton-build - # BADGE_FILENAME: badge-triton-build - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - # CONTAINER_NAME: triton - # DOCKERFILE: .github/container/Dockerfile.triton - # RUNNER_SIZE: large - # EXTRA_BUILD_ARGS: | - # URLREF_JAX_TRITON=${{ fromJson(inputs.SOURCE_URLREFS).JAX_TRITON }} - # secrets: inherit - - # build-equinox: - # needs: build-jax - # uses: ./.github/workflows/_build.yaml - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # ARTIFACT_NAME: artifact-equinox-build - # BADGE_FILENAME: badge-equinox-build - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - # CONTAINER_NAME: equinox - # DOCKERFILE: .github/container/Dockerfile.equinox - # EXTRA_BUILD_ARGS: | - # URLREF_EQUINOX=${{ fromJson(inputs.SOURCE_URLREFS).EQUINOX }} - # secrets: inherit - - # build-maxtext: - # needs: build-jax - # uses: ./.github/workflows/_build.yaml - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # ARTIFACT_NAME: artifact-maxtext-build - # BADGE_FILENAME: badge-maxtext-build - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} 
- # CONTAINER_NAME: maxtext - # DOCKERFILE: .github/container/Dockerfile.maxtext - # EXTRA_BUILD_ARGS: | - # URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }} - # secrets: inherit - - # build-levanter: - # needs: [build-jax] - # uses: ./.github/workflows/_build.yaml - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # ARTIFACT_NAME: "artifact-levanter-build" - # BADGE_FILENAME: "badge-levanter-build" - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - # CONTAINER_NAME: levanter - # DOCKERFILE: .github/container/Dockerfile.levanter - # EXTRA_BUILD_ARGS: | - # URLREF_LEVANTER=${{ fromJson(inputs.SOURCE_URLREFS).LEVANTER }} - # URLREF_HALIAX=${{ fromJson(inputs.SOURCE_URLREFS).HALIAX }} - # secrets: inherit - - # build-upstream-t5x: - # needs: build-jax - # uses: ./.github/workflows/_build.yaml - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # ARTIFACT_NAME: "artifact-t5x-build" - # BADGE_FILENAME: "badge-t5x-build" - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - # CONTAINER_NAME: upstream-t5x - # DOCKERFILE: .github/container/Dockerfile.t5x - # EXTRA_BUILD_ARGS: | - # URLREF_T5X=${{ fromJson(inputs.SOURCE_URLREFS).T5X }} - # URLREF_AIRIO=${{ fromJson(inputs.SOURCE_URLREFS).AIRIO }} - # secrets: inherit - - # build-rosetta-t5x: - # needs: build-upstream-t5x - # uses: ./.github/workflows/_build_rosetta.yaml - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }} - # BASE_LIBRARY: t5x - # secrets: inherit - - # build-gemma: - # needs: build-jax - # uses: ./.github/workflows/_build.yaml - # if: inputs.ARCHITECTURE == 'amd64' # build only amd64 - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # ARTIFACT_NAME: artifact-gemma-build - # BADGE_FILENAME: badge-gemma-build - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - 
# BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - # CONTAINER_NAME: gemma - # DOCKERFILE: rosetta/Dockerfile.gemma - # DOCKER_CONTEXT: . - # EXTRA_BUILD_ARGS: | - # URLREF_GEMMA=${{ fromJson(inputs.SOURCE_URLREFS).GEMMA }} - # URLREF_BIG_VISION=${{ fromJson(inputs.SOURCE_URLREFS).BIG_VISION }} - # URLREF_COMMON_LOOP_UTILS=${{ fromJson(inputs.SOURCE_URLREFS).COMMON_LOOP_UTILS }} - # URLREF_FLAXFORMER=${{ fromJson(inputs.SOURCE_URLREFS).FLAXFORMER }} - # URLREF_PANOPTICAPI=${{ fromJson(inputs.SOURCE_URLREFS).PANOPTICAPI }} - # secrets: inherit + build-triton: + needs: build-jax + if: inputs.ARCHITECTURE == 'amd64' # Triton does not seem to support arm64 + uses: ./.github/workflows/_build.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: artifact-triton-build + BADGE_FILENAME: badge-triton-build + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + CONTAINER_NAME: triton + DOCKERFILE: .github/container/Dockerfile.triton + RUNNER_SIZE: large + EXTRA_BUILD_ARGS: | + URLREF_JAX_TRITON=${{ fromJson(inputs.SOURCE_URLREFS).JAX_TRITON }} + secrets: inherit + + build-equinox: + needs: build-jax + uses: ./.github/workflows/_build.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: artifact-equinox-build + BADGE_FILENAME: badge-equinox-build + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + CONTAINER_NAME: equinox + DOCKERFILE: .github/container/Dockerfile.equinox + EXTRA_BUILD_ARGS: | + URLREF_EQUINOX=${{ fromJson(inputs.SOURCE_URLREFS).EQUINOX }} + secrets: inherit + + build-maxtext: + needs: build-jax + uses: ./.github/workflows/_build.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: artifact-maxtext-build + BADGE_FILENAME: badge-maxtext-build + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + CONTAINER_NAME: maxtext + DOCKERFILE: 
.github/container/Dockerfile.maxtext + EXTRA_BUILD_ARGS: | + URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }} + secrets: inherit + + build-levanter: + needs: [build-jax] + uses: ./.github/workflows/_build.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: "artifact-levanter-build" + BADGE_FILENAME: "badge-levanter-build" + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + CONTAINER_NAME: levanter + DOCKERFILE: .github/container/Dockerfile.levanter + EXTRA_BUILD_ARGS: | + URLREF_LEVANTER=${{ fromJson(inputs.SOURCE_URLREFS).LEVANTER }} + URLREF_HALIAX=${{ fromJson(inputs.SOURCE_URLREFS).HALIAX }} + secrets: inherit + + build-upstream-t5x: + needs: build-jax + uses: ./.github/workflows/_build.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: "artifact-t5x-build" + BADGE_FILENAME: "badge-t5x-build" + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + CONTAINER_NAME: upstream-t5x + DOCKERFILE: .github/container/Dockerfile.t5x + EXTRA_BUILD_ARGS: | + URLREF_T5X=${{ fromJson(inputs.SOURCE_URLREFS).T5X }} + URLREF_AIRIO=${{ fromJson(inputs.SOURCE_URLREFS).AIRIO }} + secrets: inherit + + build-rosetta-t5x: + needs: build-upstream-t5x + uses: ./.github/workflows/_build_rosetta.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }} + BASE_LIBRARY: t5x + secrets: inherit + + build-gemma: + needs: build-jax + uses: ./.github/workflows/_build.yaml + if: inputs.ARCHITECTURE == 'amd64' # build only amd64 + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: artifact-gemma-build + BADGE_FILENAME: badge-gemma-build + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + CONTAINER_NAME: gemma + DOCKERFILE: rosetta/Dockerfile.gemma + DOCKER_CONTEXT: . 
+ EXTRA_BUILD_ARGS: | + URLREF_GEMMA=${{ fromJson(inputs.SOURCE_URLREFS).GEMMA }} + URLREF_BIG_VISION=${{ fromJson(inputs.SOURCE_URLREFS).BIG_VISION }} + URLREF_COMMON_LOOP_UTILS=${{ fromJson(inputs.SOURCE_URLREFS).COMMON_LOOP_UTILS }} + URLREF_FLAXFORMER=${{ fromJson(inputs.SOURCE_URLREFS).FLAXFORMER }} + URLREF_PANOPTICAPI=${{ fromJson(inputs.SOURCE_URLREFS).PANOPTICAPI }} + secrets: inherit build-axlearn: needs: build-jax @@ -194,14 +194,14 @@ jobs: if: ${{ !cancelled() }} needs: - build-base - # - build-jax - # - build-triton - # - build-equinox - # - build-maxtext - # - build-levanter - # - build-upstream-t5x - # - build-rosetta-t5x - # - build-gemma + - build-jax + - build-triton + - build-equinox + - build-maxtext + - build-levanter + - build-upstream-t5x + - build-rosetta-t5x + - build-gemma - build-axlearn outputs: TAGS: ${{ steps.collect-tags.outputs.TAGS }} @@ -213,22 +213,22 @@ jobs: [\ {"flavor": "base", "stage": "final", "priority": 800, "tag": "${{ needs.build-base.outputs.DOCKER_TAG }}"},\ {"flavor": "jax", "stage": "final", "priority": 1000, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}"},\ - # {"flavor": "triton", "stage": "final", "priority": 900, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }}"},\ - # {"flavor": "equinox", "stage": "final", "priority": 900, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }}"},\ - # {"flavor": "maxtext", "stage": "final", "priority": 900, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}"},\ - # {"flavor": "levanter", "stage": "final", "priority": 900, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }}"},\ - # {"flavor": "upstream-t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }}"},\ - # {"flavor": "t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }}"},\ - # {"flavor": "gemma", "stage": "final", "priority": 900, "tag": "${{ 
needs.build-gemma.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "triton", "stage": "final", "priority": 900, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "equinox", "stage": "final", "priority": 900, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "maxtext", "stage": "final", "priority": 900, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "levanter", "stage": "final", "priority": 900, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "upstream-t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "gemma", "stage": "final", "priority": 900, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "axlearn", "stage": "final", "priority": 900, "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "jax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}"},\ - # {"flavor": "triton", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_MEALKIT }}"},\ - # {"flavor": "equinox", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_MEALKIT }}"},\ - # {"flavor": "maxtext", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_MEALKIT }}"},\ - # {"flavor": "levanter", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_MEALKIT }}"},\ - # {"flavor": "upstream-t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ - # {"flavor": "t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ - # {"flavor": "gemma", "stage": "mealkit", "priority": 500, 
"tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "triton", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "equinox", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "maxtext", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "levanter", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "upstream-t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "gemma", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_MEALKIT }}"},\ {"flavor": "axlearn", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_MEALKIT }}"},\ {}\ @@ -238,416 +238,416 @@ jobs: echo "TAGS=${TAGS}" >> $GITHUB_OUTPUT - # test-distribution: - # runs-on: ubuntu-22.04 - # strategy: - # matrix: - # TEST_SCRIPT: - # - extra-only-distribution.sh - # - mirror-only-distribution.sh - # - upstream-only-distribution.sh - # - local-patch-distribution.sh - # fail-fast: false - # steps: - # - name: Print environment variables - # run: env - # - name: Set git login for tests - # run: | - # git config --global user.email "jax@nvidia.com" - # git config --global user.name "JAX-Toolbox CI" - # - name: Check out the repository under ${GITHUB_WORKSPACE} - # uses: actions/checkout@v4 - # - name: Run integration test ${{ matrix.TEST_SCRIPT }} - # run: bash rosetta/tests/${{ matrix.TEST_SCRIPT }} - - # test-jax: - # needs: build-jax - # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # uses: ./.github/workflows/_test_unit.yaml - # with: - # TEST_NAME: jax - 
# EXECUTE: | - # docker run -i --shm-size=1g --gpus all \ - # ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - # bash <<"EOF" |& tee test-backend-independent.log - # test-jax.sh -b backend-independent - # EOF - # docker run -i --shm-size=1g --gpus all \ - # ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - # bash <<"EOF" |& tee tee test-gpu.log - # nvidia-cuda-mps-control -d - # test-jax.sh -b gpu - # EOF - # STATISTICS_SCRIPT: | - # errors=$(cat test-*.log | grep -c 'ERROR:' || true) - # failed_tests=$(cat test-*.log | grep -c 'FAILED in' || true) - # passed_tests=$(cat test-*.log | grep -c 'PASSED in' || true) - # total_tests=$((failed_tests + passed_tests)) - # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - # ARTIFACTS: | - # test-backend-independent.log - # test-gpu.log - # secrets: inherit - - # test-nsys-jax: - # needs: build-jax - # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # uses: ./.github/workflows/_test_unit.yaml - # with: - # TEST_NAME: nsys-jax - # EXECUTE: | - # set -o pipefail - # num_tests=0 - # num_failures=0 - # # Run the pytest-driven tests; failure is explicitly handled below so set +e to - # # avoid an early abort here. 
- # set +e - # docker run -i --shm-size=1g --gpus all \ - # -v $PWD:/opt/output \ - # ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - # bash <<"EOF" |& tee test-nsys-jax.log - # # nsys-jax is already installed, this is just adding the test dependencies - # pip install pytest-reportlog nsys-jax[test] - # # abuse knowledge that nsys-jax is installed editable, so the tests exist - # test_path=$(python -c 'import importlib.resources; print(importlib.resources.files("nsys_jax").joinpath("..", "tests").resolve())') - # pytest --report-log=/opt/output/pytest-report.jsonl "${test_path}" - # EOF - # set -e - # GPUS_PER_NODE=$(nvidia-smi -L | grep -c '^GPU') - # for mode in 1-process 2-process process-per-gpu; do - # DOCKER="docker run --shm-size=1g --gpus all --env XLA_FLAGS=--xla_gpu_enable_command_buffer= --env XLA_PYTHON_CLIENT_MEM_FRACTION=0.9 -v ${PWD}:/opt/output ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}" - # if [[ "${mode}" == "1-process" ]]; then - # PROCESS_COUNT=1 - # ARGS="" - # elif [[ "${mode}" == "2-process" ]]; then - # # Use two processes with GPUS_PER_NODE/2 GPUs per process in the hope that - # # this will flush out more bugs than process-per-node or process-per-GPU. 
- # PROCESS_COUNT=2 - # ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process $((GPUS_PER_NODE/2)) --distributed" - # else - # PROCESS_COUNT=${GPUS_PER_NODE} - # ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process 1 --distributed" - # fi - # for collection in full partial; do - # NSYS_JAX="nsys-jax" - # if [[ "${mode}" == "1-process" ]]; then - # # We will not run nsys-jax-combine, so run analyses eagerly - # NSYS_JAX+=" --nsys-jax-analysis communication --nsys-jax-analysis summary" - # fi - # NSYS_JAX+=" --output=/opt/output/${mode}-${collection}-execution-%q{RANK}" - # if [[ "${collection}" == "partial" ]]; then - # NSYS_JAX+=" --capture-range=cudaProfilerApi --capture-range-end=stop" - # # nvbug/4801401 - # NSYS_JAX+=" --sample=none" - # fi - # set +e - # ${DOCKER} parallel-launch RANK ${PROCESS_COUNT} ${NSYS_JAX} \ - # -- jax-nccl-test ${ARGS} |& tee ${mode}-${collection}-execution.log - # num_failures=$((num_failures + ($? 
!= 0))) - # set -e - # num_tests=$((num_tests + 1)) - # done - # if [[ "${mode}" != "1-process" ]]; then - # # Run nsys-jax-combine - # NSYS_JAX_COMBINE="nsys-jax-combine --analysis communication --analysis summary --output=/opt/output/${mode}-${collection}-execution.zip" - # for (( i=0; i> $GITHUB_ENV - # echo "NSYS_JAX_FAIL_COUNT=${num_failures}" >> $GITHUB_ENV - # exit $num_failures - # STATISTICS_SCRIPT: | - # summary_line=$(tail -n1 test-nsys-jax.log) - # num_errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - # passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) - # failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) - # total_tests=$(( NSYS_JAX_TEST_COUNT + passed_tests + failed_tests )) - # num_passed=$(( passed_tests + NSYS_JAX_TEST_COUNT - NSYS_JAX_FAIL_COUNT )) - # num_failed=$(( failed_tests + NSYS_JAX_FAIL_COUNT )) - # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - # echo "ERRORS=${num_errors}" >> $GITHUB_OUTPUT - # echo "PASSED_TESTS=${num_passed}" >> $GITHUB_OUTPUT - # echo "FAILED_TESTS=${num_failed}" >> $GITHUB_OUTPUT - # ARTIFACTS: | - # # pytest-driven part - # test-nsys-jax.log - # pytest-report.jsonl - # # nsys-jax logfiles - # *process-*-execution.log - # # nsys-jax output for the case that doesn't use nsys-jax-combine - # 1-process-*-execution-0.zip - # # nsys-jax-combine output/logfiles - # *process*-*-execution.zip - # *-execution-combine.log - # secrets: inherit + test-distribution: + runs-on: ubuntu-22.04 + strategy: + matrix: + TEST_SCRIPT: + - extra-only-distribution.sh + - mirror-only-distribution.sh + - upstream-only-distribution.sh + - local-patch-distribution.sh + fail-fast: false + steps: + - name: Print environment variables + run: env + - name: Set git login for tests + run: 
| + git config --global user.email "jax@nvidia.com" + git config --global user.name "JAX-Toolbox CI" + - name: Check out the repository under ${GITHUB_WORKSPACE} + uses: actions/checkout@v4 + - name: Run integration test ${{ matrix.TEST_SCRIPT }} + run: bash rosetta/tests/${{ matrix.TEST_SCRIPT }} + + test-jax: + needs: build-jax + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + uses: ./.github/workflows/_test_unit.yaml + with: + TEST_NAME: jax + EXECUTE: | + docker run -i --shm-size=1g --gpus all \ + ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ + bash <<"EOF" |& tee test-backend-independent.log + test-jax.sh -b backend-independent + EOF + docker run -i --shm-size=1g --gpus all \ + ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ + bash <<"EOF" |& tee test-gpu.log + nvidia-cuda-mps-control -d + test-jax.sh -b gpu + EOF + STATISTICS_SCRIPT: | + errors=$(cat test-*.log | grep -c 'ERROR:' || true) + failed_tests=$(cat test-*.log | grep -c 'FAILED in' || true) + passed_tests=$(cat test-*.log | grep -c 'PASSED in' || true) + total_tests=$((failed_tests + passed_tests)) + echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + ARTIFACTS: | + test-backend-independent.log + test-gpu.log + secrets: inherit + + test-nsys-jax: + needs: build-jax + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + uses: ./.github/workflows/_test_unit.yaml + with: + TEST_NAME: nsys-jax + EXECUTE: | + set -o pipefail + num_tests=0 + num_failures=0 + # Run the pytest-driven tests; failure is explicitly handled below so set +e to + # avoid an early abort here.
+ set +e + docker run -i --shm-size=1g --gpus all \ + -v $PWD:/opt/output \ + ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ + bash <<"EOF" |& tee test-nsys-jax.log + # nsys-jax is already installed, this is just adding the test dependencies + pip install pytest-reportlog nsys-jax[test] + # abuse knowledge that nsys-jax is installed editable, so the tests exist + test_path=$(python -c 'import importlib.resources; print(importlib.resources.files("nsys_jax").joinpath("..", "tests").resolve())') + pytest --report-log=/opt/output/pytest-report.jsonl "${test_path}" + EOF + set -e + GPUS_PER_NODE=$(nvidia-smi -L | grep -c '^GPU') + for mode in 1-process 2-process process-per-gpu; do + DOCKER="docker run --shm-size=1g --gpus all --env XLA_FLAGS=--xla_gpu_enable_command_buffer= --env XLA_PYTHON_CLIENT_MEM_FRACTION=0.9 -v ${PWD}:/opt/output ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}" + if [[ "${mode}" == "1-process" ]]; then + PROCESS_COUNT=1 + ARGS="" + elif [[ "${mode}" == "2-process" ]]; then + # Use two processes with GPUS_PER_NODE/2 GPUs per process in the hope that + # this will flush out more bugs than process-per-node or process-per-GPU. 
+ PROCESS_COUNT=2 + ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process $((GPUS_PER_NODE/2)) --distributed" + else + PROCESS_COUNT=${GPUS_PER_NODE} + ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process 1 --distributed" + fi + for collection in full partial; do + NSYS_JAX="nsys-jax" + if [[ "${mode}" == "1-process" ]]; then + # We will not run nsys-jax-combine, so run analyses eagerly + NSYS_JAX+=" --nsys-jax-analysis communication --nsys-jax-analysis summary" + fi + NSYS_JAX+=" --output=/opt/output/${mode}-${collection}-execution-%q{RANK}" + if [[ "${collection}" == "partial" ]]; then + NSYS_JAX+=" --capture-range=cudaProfilerApi --capture-range-end=stop" + # nvbug/4801401 + NSYS_JAX+=" --sample=none" + fi + set +e + ${DOCKER} parallel-launch RANK ${PROCESS_COUNT} ${NSYS_JAX} \ + -- jax-nccl-test ${ARGS} |& tee ${mode}-${collection}-execution.log + num_failures=$((num_failures + ($? 
!= 0))) + set -e + num_tests=$((num_tests + 1)) + done + if [[ "${mode}" != "1-process" ]]; then + # Run nsys-jax-combine + NSYS_JAX_COMBINE="nsys-jax-combine --analysis communication --analysis summary --output=/opt/output/${mode}-${collection}-execution.zip" + for (( i=0; i> $GITHUB_ENV + echo "NSYS_JAX_FAIL_COUNT=${num_failures}" >> $GITHUB_ENV + exit $num_failures + STATISTICS_SCRIPT: | + summary_line=$(tail -n1 test-nsys-jax.log) + num_errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) + failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) + total_tests=$(( NSYS_JAX_TEST_COUNT + passed_tests + failed_tests )) + num_passed=$(( passed_tests + NSYS_JAX_TEST_COUNT - NSYS_JAX_FAIL_COUNT )) + num_failed=$(( failed_tests + NSYS_JAX_FAIL_COUNT )) + echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + echo "ERRORS=${num_errors}" >> $GITHUB_OUTPUT + echo "PASSED_TESTS=${num_passed}" >> $GITHUB_OUTPUT + echo "FAILED_TESTS=${num_failed}" >> $GITHUB_OUTPUT + ARTIFACTS: | + # pytest-driven part + test-nsys-jax.log + pytest-report.jsonl + # nsys-jax logfiles + *process-*-execution.log + # nsys-jax output for the case that doesn't use nsys-jax-combine + 1-process-*-execution-0.zip + # nsys-jax-combine output/logfiles + *process*-*-execution.zip + *-execution-combine.log + secrets: inherit #test-nsys-jax generates several fresh .zip archive outputs by running nsys-jax with real GPU hardware; this test #runs on a regular GitHub Actions runner and checks that offline post-processing works in an environment that does #not already have nsys-jax installed - # test-nsys-jax-archive: - # needs: test-nsys-jax - # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # strategy: - # 
matrix: - # os: [ubuntu-22.04, ubuntu-24.04, macOS-latest] - # runs-on: ${{ matrix.os }} - # steps: - # - name: Download nsys-jax output .zip files - # uses: actions/download-artifact@v4 - # with: - # name: nsys-jax-unit-test-A100 - # - name: Extract archives and execute install scripts - # run: | - # pip install virtualenv # for install.sh - # for zip in $(ls *.zip); do - # ZIP="${PWD}/${zip}" - # pushd $(mktemp -d) - # unzip "${ZIP}" - # ls -l - # # TODO: verify this isn't needed, or make sure it isn't needed - # chmod 755 install.sh - # # Run the notebook with IPython, not Jupyter Lab, so it exits and prints something informative to stdout - # # Skip executing Jupyter lab - # NSYS_JAX_JUPYTER_EXECUTE_NOT_LAB=1 ./install.sh - # popd - # done - - # test-nsys-jax-eks: - # needs: build-jax - # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # runs-on: eks - # env: - # JAX_DOCKER_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} - # JOB_NAME: ${{ github.run_id }}-nsys-jax - # POSTPROCESS_JOB_NAME: ${{ github.run_id }}-nsys-jax-postprocess - # TOKEN_NAME: ${{ github.run_id }}-nsys-jax-token - # steps: - # - name: Check out the repository - # uses: actions/checkout@v4 - # - name: GHCR login - # uses: ./.github/actions/ghcr-login - # with: - # docker-username: ${{ github.repository_owner }} - # docker-password: ${{ secrets.GITHUB_TOKEN}} - # token-name: ${{ env.TOKEN_NAME }} - # - name: Configure Kubernetes job - # run: | - # yq -i ea 'select(di == 0).spec.selector.job-name = strenv(JOB_NAME) - # | select(di == 1).metadata.name = strenv(JOB_NAME) - # | select(di == 1).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) - # | select(di == 1).spec.template.spec.containers[0].image = strenv(JAX_DOCKER_IMAGE) - # | select(di == 1).spec.template.spec.containers[0].env[0].value = strenv(JOB_NAME)' \ - # .github/eks-workflow-files/job.yml - # git diff .github/eks-workflow-files/job.yml - # - name: Submit Kubernetes job - # uses: 
./.github/actions/submit-k8s-job - # with: - # job-config-file: .github/eks-workflow-files/job.yml - # job-name: ${{ env.JOB_NAME }} - # - name: Delete eks job - # uses: ./.github/actions/delete-k8s-job - # if: ${{ always() }} - # with: - # job-name: ${{ env.JOB_NAME }} - # - name: Configure post-processing job - # run: | - # export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip" - # yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME) - # | .spec.template.spec.containers[].image = strenv(JAX_DOCKER_IMAGE) - # | .spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) - # | .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \ - # .github/eks-workflow-files/post-process-job.yml - # git diff .github/eks-workflow-files/post-process-job.yml - # - name: Submit post process k8s job - # uses: ./.github/actions/submit-k8s-job - # with: - # job-config-file: .github/eks-workflow-files/post-process-job.yml - # job-name: ${{ env.POSTPROCESS_JOB_NAME }} - # - name: Delete post process k8s job - # uses: ./.github/actions/delete-k8s-job - # with: - # job-name: ${{ env.POSTPROCESS_JOB_NAME }} - # - name: Delete GitHub Container Registry token - # uses: ./.github/actions/delete-ghcr-token - # if: ${{ always() }} - # with: - # token-name: ${{ env.TOKEN_NAME }} - - # test-equinox: - # needs: build-equinox - # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # uses: ./.github/workflows/_test_unit.yaml - # with: - # IMAGE: ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} - # TEST_NAME: equinox - # EXECUTE: | - # docker run --shm-size=1g --gpus all ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} \ - # bash -exc -o pipefail \ - # 'pushd /opt/equinox/tests && pip install -r requirements.txt && pytest .' 
| tee test-equinox.log - # STATISTICS_SCRIPT: | - # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - # failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') - # passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') - # total_tests=$((failed_tests + passed_tests)) - # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - # ARTIFACTS: | - # test-equinox.log - # secrets: inherit - - # test-te-multigpu: - # needs: build-upstream-pax - # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # uses: ./.github/workflows/_test_te.yaml - # with: - # TE_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} - # secrets: inherit - - # test-upstream-t5x: - # needs: build-upstream-t5x - # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # uses: ./.github/workflows/_test_upstream_t5x.yaml - # with: - # T5X_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }} - # secrets: inherit - - # test-rosetta-t5x: - # needs: build-rosetta-t5x - # if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 - # uses: ./.github/workflows/_test_t5x_rosetta.yaml - # with: - # T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }} - # secrets: inherit - - # test-triton: - # needs: build-triton - # if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 - # uses: ./.github/workflows/_test_unit.yaml - # with: - # TEST_NAME: triton - # EXECUTE: | - # docker run -i --shm-size=1g --gpus all --volume $PWD:/output \ - # ${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }} \ - # bash <<"EOF" |& tee test-triton.log - # # autotuner tests from jax-triton now hit a triton code path that uses utilities from pytorch; this relies on - # # actually having a CUDA 
backend for pytoch - # pip install --no-deps torch - # python /opt/jax-triton/tests/triton_call_test.py --xml_output_file /output/triton_test.xml - # EOF - # STATISTICS_SCRIPT: | - # curl -L -o yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg --print-architecture) && chmod 777 yq; - # total_tests=$(./yq '.testsuites."+@tests"' triton_test.xml) - # errors=$(./yq '.testsuites."+@errors"' triton_test.xml) - # failed_tests=$(./yq '.testsuites."+@failures"' triton_test.xml) - # passed_tests=$((total_tests - errors - failed_tests)) - # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - # ARTIFACTS: | - # test-triton.log - # secrets: inherit - - # test-levanter: - # needs: build-levanter - # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # uses: ./.github/workflows/_test_unit.yaml - # with: - # TEST_NAME: levanter - # EXECUTE: | - # docker run -i --gpus all --shm-size=1g \ - # ${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }} \ - # bash <<"EOF" |& tee test-levanter.log - # pip install flake8 pytest soundfile librosa - # PYTHONPATH=/opt/levanter/tests:$PYTHONPATH pytest /opt/levanter/tests -m "not entry and not slow and not ray" - # EOF - # STATISTICS_SCRIPT: | - # summary_line=$(tail -n1 test-levanter.log) - # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - # failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') - # passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') - # total_tests=$((failed_tests + passed_tests)) - # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - # echo "FAILED_TESTS=${failed_tests}" >> 
$GITHUB_OUTPUT - # ARTIFACTS: | - # test-levanter.log - # secrets: inherit - - # test-te: - # needs: build-upstream-pax - # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # uses: ./.github/workflows/_test_unit.yaml - # with: - # TEST_NAME: te - # EXECUTE: | - # docker run -i --gpus all --shm-size=1g -v $PWD:/log \ - # ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} \ - # bash <<"EOF" |& tee test-te.log - # pip install pytest-reportlog - # pytest --report-log=log/pytest-report.jsonl ${SRC_PATH_TRANSFORMER_ENGINE}/tests/jax - # EOF - # STATISTICS_SCRIPT: | - # summary_line=$(tail -n1 test-te.log) - # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - # passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) - # failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) - # total_tests=$((failed_tests + passed_tests)) - # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - # TIMEOUT_MINUTES: 120 - # ARTIFACTS: | - # test-te.log - # pytest-report.jsonl - # secrets: inherit - - # test-gemma: - # needs: build-gemma - # uses: ./.github/workflows/_test_unit.yaml - # if: inputs.ARCHITECTURE == 'amd64' - # with: - # TEST_NAME: gemma - # EXECUTE: | - # docker run --shm-size=1g --gpus all ${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }} \ - # bash -ec \ - # "cd /opt/gemma && pip install -e .[dev] && pytest ." 
| tee test-gemma.log - # STATISTICS_SCRIPT: | - # summary_line=$(tail -n1 test-gemma.log) - # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - # failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') - # passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') - # total_tests=$((failed_tests + passed_tests)) - # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - # ARTIFACTS: | - # test-gemma.log - # secrets: inherit - - # test-maxtext: - # needs: build-maxtext - # if: inputs.ARCHITECTURE == 'amd64' # no arm64 gpu runners - # uses: ./.github/workflows/_test_maxtext.yaml - # with: - # MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }} - # secrets: inherit + test-nsys-jax-archive: + needs: test-nsys-jax + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + strategy: + matrix: + os: [ubuntu-22.04, ubuntu-24.04, macOS-latest] + runs-on: ${{ matrix.os }} + steps: + - name: Download nsys-jax output .zip files + uses: actions/download-artifact@v4 + with: + name: nsys-jax-unit-test-A100 + - name: Extract archives and execute install scripts + run: | + pip install virtualenv # for install.sh + for zip in $(ls *.zip); do + ZIP="${PWD}/${zip}" + pushd $(mktemp -d) + unzip "${ZIP}" + ls -l + # TODO: verify this isn't needed, or make sure it isn't needed + chmod 755 install.sh + # Run the notebook with IPython, not Jupyter Lab, so it exits and prints something informative to stdout + # Skip executing Jupyter lab + NSYS_JAX_JUPYTER_EXECUTE_NOT_LAB=1 ./install.sh + popd + done + + test-nsys-jax-eks: + needs: build-jax + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + runs-on: eks + env: + JAX_DOCKER_IMAGE: ${{ 
needs.build-jax.outputs.DOCKER_TAG_FINAL }} + JOB_NAME: ${{ github.run_id }}-nsys-jax + POSTPROCESS_JOB_NAME: ${{ github.run_id }}-nsys-jax-postprocess + TOKEN_NAME: ${{ github.run_id }}-nsys-jax-token + steps: + - name: Check out the repository + uses: actions/checkout@v4 + - name: GHCR login + uses: ./.github/actions/ghcr-login + with: + docker-username: ${{ github.repository_owner }} + docker-password: ${{ secrets.GITHUB_TOKEN}} + token-name: ${{ env.TOKEN_NAME }} + - name: Configure Kubernetes job + run: | + yq -i ea 'select(di == 0).spec.selector.job-name = strenv(JOB_NAME) + | select(di == 1).metadata.name = strenv(JOB_NAME) + | select(di == 1).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) + | select(di == 1).spec.template.spec.containers[0].image = strenv(JAX_DOCKER_IMAGE) + | select(di == 1).spec.template.spec.containers[0].env[0].value = strenv(JOB_NAME)' \ + .github/eks-workflow-files/job.yml + git diff .github/eks-workflow-files/job.yml + - name: Submit Kubernetes job + uses: ./.github/actions/submit-k8s-job + with: + job-config-file: .github/eks-workflow-files/job.yml + job-name: ${{ env.JOB_NAME }} + - name: Delete eks job + uses: ./.github/actions/delete-k8s-job + if: ${{ always() }} + with: + job-name: ${{ env.JOB_NAME }} + - name: Configure post-processing job + run: | + export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip" + yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME) + | .spec.template.spec.containers[].image = strenv(JAX_DOCKER_IMAGE) + | .spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) + | .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \ + .github/eks-workflow-files/post-process-job.yml + git diff .github/eks-workflow-files/post-process-job.yml + - name: Submit post process k8s job + uses: ./.github/actions/submit-k8s-job + with: + job-config-file: .github/eks-workflow-files/post-process-job.yml + job-name: ${{ env.POSTPROCESS_JOB_NAME }} + - name: Delete post process k8s job 
+ uses: ./.github/actions/delete-k8s-job + with: + job-name: ${{ env.POSTPROCESS_JOB_NAME }} + - name: Delete GitHub Container Registry token + uses: ./.github/actions/delete-ghcr-token + if: ${{ always() }} + with: + token-name: ${{ env.TOKEN_NAME }} + + test-equinox: + needs: build-equinox + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + uses: ./.github/workflows/_test_unit.yaml + with: + IMAGE: ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} + TEST_NAME: equinox + EXECUTE: | + docker run --shm-size=1g --gpus all ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} \ + bash -exc -o pipefail \ + 'pushd /opt/equinox/tests && pip install -r requirements.txt && pytest .' | tee test-equinox.log + STATISTICS_SCRIPT: | + errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') + passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') + total_tests=$((failed_tests + passed_tests)) + echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + ARTIFACTS: | + test-equinox.log + secrets: inherit + + test-te-multigpu: + needs: build-upstream-pax + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + uses: ./.github/workflows/_test_te.yaml + with: + TE_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} + secrets: inherit + + test-upstream-t5x: + needs: build-upstream-t5x + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + uses: ./.github/workflows/_test_upstream_t5x.yaml + with: + T5X_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }} + secrets: inherit + + test-rosetta-t5x: + needs: build-rosetta-t5x + if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 + uses: 
./.github/workflows/_test_t5x_rosetta.yaml + with: + T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }} + secrets: inherit + + test-triton: + needs: build-triton + if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 + uses: ./.github/workflows/_test_unit.yaml + with: + TEST_NAME: triton + EXECUTE: | + docker run -i --shm-size=1g --gpus all --volume $PWD:/output \ + ${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }} \ + bash <<"EOF" |& tee test-triton.log + # autotuner tests from jax-triton now hit a triton code path that uses utilities from pytorch; this relies on + # actually having a CUDA backend for pytoch + pip install --no-deps torch + python /opt/jax-triton/tests/triton_call_test.py --xml_output_file /output/triton_test.xml + EOF + STATISTICS_SCRIPT: | + curl -L -o yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg --print-architecture) && chmod 777 yq; + total_tests=$(./yq '.testsuites."+@tests"' triton_test.xml) + errors=$(./yq '.testsuites."+@errors"' triton_test.xml) + failed_tests=$(./yq '.testsuites."+@failures"' triton_test.xml) + passed_tests=$((total_tests - errors - failed_tests)) + echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + ARTIFACTS: | + test-triton.log + secrets: inherit + + test-levanter: + needs: build-levanter + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + uses: ./.github/workflows/_test_unit.yaml + with: + TEST_NAME: levanter + EXECUTE: | + docker run -i --gpus all --shm-size=1g \ + ${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }} \ + bash <<"EOF" |& tee test-levanter.log + pip install flake8 pytest soundfile librosa + PYTHONPATH=/opt/levanter/tests:$PYTHONPATH pytest /opt/levanter/tests -m "not entry and not slow and not ray" + EOF + STATISTICS_SCRIPT: | + summary_line=$(tail -n1 test-levanter.log) + errors=$(echo 
$summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') + passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') + total_tests=$((failed_tests + passed_tests)) + echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + ARTIFACTS: | + test-levanter.log + secrets: inherit + + test-te: + needs: build-upstream-pax + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + uses: ./.github/workflows/_test_unit.yaml + with: + TEST_NAME: te + EXECUTE: | + docker run -i --gpus all --shm-size=1g -v $PWD:/log \ + ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} \ + bash <<"EOF" |& tee test-te.log + pip install pytest-reportlog + pytest --report-log=log/pytest-report.jsonl ${SRC_PATH_TRANSFORMER_ENGINE}/tests/jax + EOF + STATISTICS_SCRIPT: | + summary_line=$(tail -n1 test-te.log) + errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) + failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) + total_tests=$((failed_tests + passed_tests)) + echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + TIMEOUT_MINUTES: 120 + ARTIFACTS: | + test-te.log + pytest-report.jsonl + secrets: inherit + + test-gemma: + needs: build-gemma + uses: ./.github/workflows/_test_unit.yaml + if: inputs.ARCHITECTURE == 
'amd64' + with: + TEST_NAME: gemma + EXECUTE: | + docker run --shm-size=1g --gpus all ${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }} \ + bash -ec \ + "cd /opt/gemma && pip install -e .[dev] && pytest ." | tee test-gemma.log + STATISTICS_SCRIPT: | + summary_line=$(tail -n1 test-gemma.log) + errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') + passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') + total_tests=$((failed_tests + passed_tests)) + echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + ARTIFACTS: | + test-gemma.log + secrets: inherit + + test-maxtext: + needs: build-maxtext + if: inputs.ARCHITECTURE == 'amd64' # no arm64 gpu runners + uses: ./.github/workflows/_test_maxtext.yaml + with: + MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }} + secrets: inherit test-axlearn-eks: needs: build-axlearn diff --git a/.github/workflows/nccl-k8s.yaml b/.github/workflows/nccl-k8s.yaml index 321fd861f..6f39ebe0b 100644 --- a/.github/workflows/nccl-k8s.yaml +++ b/.github/workflows/nccl-k8s.yaml @@ -103,7 +103,7 @@ jobs: job-config-file: .github/eks-workflow-files/mpi-nccl-test.yml job-name: ${{ steps.var.outputs.LAUNCHER_NAME }} - name: Retrieve Kubernetes job status - shell: bash -exo pipefail + shell: bash -exo pipefail {0} run: | LAUNCHER_NAME="${{ steps.var.outputs.LAUNCHER_NAME }}" while readarray -d : -t status < <(kubectl get job/${LAUNCHER_NAME} -o 'jsonpath={.status.failed}:{.status.succeeded}'); do From 8a9de05af5b072decd3b41012a35c40acad49486 Mon Sep 17 00:00:00 2001 From: Steboss Date: Tue, 18 Feb 2025 13:50:31 +0000 Subject: [PATCH 43/89] fix test --- .github/workflows/_ci.yaml | 120 
++++++++++++++++++------------------- 1 file changed, 60 insertions(+), 60 deletions(-) diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 1439f515c..d4837449b 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -174,7 +174,7 @@ jobs: URLREF_COMMON_LOOP_UTILS=${{ fromJson(inputs.SOURCE_URLREFS).COMMON_LOOP_UTILS }} URLREF_FLAXFORMER=${{ fromJson(inputs.SOURCE_URLREFS).FLAXFORMER }} URLREF_PANOPTICAPI=${{ fromJson(inputs.SOURCE_URLREFS).PANOPTICAPI }} - secrets: inherit + secrets: inherit build-axlearn: needs: build-jax @@ -484,37 +484,37 @@ jobs: with: token-name: ${{ env.TOKEN_NAME }} - test-equinox: - needs: build-equinox - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_unit.yaml - with: - IMAGE: ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} - TEST_NAME: equinox - EXECUTE: | - docker run --shm-size=1g --gpus all ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} \ - bash -exc -o pipefail \ - 'pushd /opt/equinox/tests && pip install -r requirements.txt && pytest .' 
| tee test-equinox.log - STATISTICS_SCRIPT: | - errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') - passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') - total_tests=$((failed_tests + passed_tests)) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - ARTIFACTS: | - test-equinox.log - secrets: inherit - - test-te-multigpu: - needs: build-upstream-pax - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_te.yaml - with: - TE_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} - secrets: inherit + # test-equinox: + # needs: build-equinox + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # uses: ./.github/workflows/_test_unit.yaml + # with: + # IMAGE: ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} + # TEST_NAME: equinox + # EXECUTE: | + # docker run --shm-size=1g --gpus all ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} \ + # bash -exc -o pipefail \ + # 'pushd /opt/equinox/tests && pip install -r requirements.txt && pytest .' 
| tee test-equinox.log + # STATISTICS_SCRIPT: | + # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + # failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') + # passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') + # total_tests=$((failed_tests + passed_tests)) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # ARTIFACTS: | + # test-equinox.log + # secrets: inherit + + # test-te-multigpu: + # needs: build-upstream-pax + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # uses: ./.github/workflows/_test_te.yaml + # with: + # TE_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit test-upstream-t5x: needs: build-upstream-t5x @@ -588,34 +588,34 @@ jobs: test-levanter.log secrets: inherit - test-te: - needs: build-upstream-pax - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_unit.yaml - with: - TEST_NAME: te - EXECUTE: | - docker run -i --gpus all --shm-size=1g -v $PWD:/log \ - ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee test-te.log - pip install pytest-reportlog - pytest --report-log=log/pytest-report.jsonl ${SRC_PATH_TRANSFORMER_ENGINE}/tests/jax - EOF - STATISTICS_SCRIPT: | - summary_line=$(tail -n1 test-te.log) - errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) - failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) - 
total_tests=$((failed_tests + passed_tests)) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - TIMEOUT_MINUTES: 120 - ARTIFACTS: | - test-te.log - pytest-report.jsonl - secrets: inherit + # test-te: + # needs: build-upstream-pax + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # uses: ./.github/workflows/_test_unit.yaml + # with: + # TEST_NAME: te + # EXECUTE: | + # docker run -i --gpus all --shm-size=1g -v $PWD:/log \ + # ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-te.log + # pip install pytest-reportlog + # pytest --report-log=log/pytest-report.jsonl ${SRC_PATH_TRANSFORMER_ENGINE}/tests/jax + # EOF + # STATISTICS_SCRIPT: | + # summary_line=$(tail -n1 test-te.log) + # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + # passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) + # failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) + # total_tests=$((failed_tests + passed_tests)) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # TIMEOUT_MINUTES: 120 + # ARTIFACTS: | + # test-te.log + # pytest-report.jsonl + # secrets: inherit test-gemma: needs: build-gemma From 5a0bb04d71be7f1cad97cf8821ebda5fb7424ed7 Mon Sep 17 00:00:00 2001 From: Steboss Date: Tue, 18 Feb 2025 13:52:15 +0000 Subject: [PATCH 44/89] remove always --- .github/actions/delete-ghcr-token/action.yml | 1 - 1 file changed, 1 deletion(-) diff --git 
a/.github/actions/delete-ghcr-token/action.yml b/.github/actions/delete-ghcr-token/action.yml index 0d90dd168..1a246bb8f 100644 --- a/.github/actions/delete-ghcr-token/action.yml +++ b/.github/actions/delete-ghcr-token/action.yml @@ -11,6 +11,5 @@ runs: steps: - name: Delete GitHub Container Registry token shell: bash - if: always() run: | kubectl delete secret ${{ inputs.token-name }} From d7fb8c3e70444f8424289e328db6bf691e0448d9 Mon Sep 17 00:00:00 2001 From: Steboss Date: Tue, 18 Feb 2025 14:11:18 +0000 Subject: [PATCH 45/89] indentention error --- .github/workflows/_ci.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index d4837449b..2ebdf149f 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -174,7 +174,7 @@ jobs: URLREF_COMMON_LOOP_UTILS=${{ fromJson(inputs.SOURCE_URLREFS).COMMON_LOOP_UTILS }} URLREF_FLAXFORMER=${{ fromJson(inputs.SOURCE_URLREFS).FLAXFORMER }} URLREF_PANOPTICAPI=${{ fromJson(inputs.SOURCE_URLREFS).PANOPTICAPI }} - secrets: inherit + secrets: inherit build-axlearn: needs: build-jax From d3500bdf9c592cd148a162b8095412dd23f2b681 Mon Sep 17 00:00:00 2001 From: Steboss Date: Tue, 18 Feb 2025 16:48:14 +0000 Subject: [PATCH 46/89] fix runner size --- .github/workflows/_ci.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 2ebdf149f..513e74306 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -187,6 +187,7 @@ jobs: BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} CONTAINER_NAME: axlearn DOCKERFILE: .github/container/Dockerfile.axlearn + RUNNER_SIZE: large secrets: inherit collect-docker-tags: From 569fb5f81627c1a844fef570a5965f79e245eebb Mon Sep 17 00:00:00 2001 From: Steboss Date: Thu, 20 Feb 2025 11:31:01 +0000 Subject: [PATCH 47/89] try with post step --- .github/actions/delete-ghcr-token/action.yml | 9 ++-- 
.github/actions/delete-k8s-job/action.yml | 30 +++++++------ .github/actions/with-post-step/action.yml | 42 ++++++++++++++++++ .github/actions/with-post-step/main.js | 46 ++++++++++++++++++++ 4 files changed, 110 insertions(+), 17 deletions(-) create mode 100644 .github/actions/with-post-step/action.yml create mode 100644 .github/actions/with-post-step/main.js diff --git a/.github/actions/delete-ghcr-token/action.yml b/.github/actions/delete-ghcr-token/action.yml index 1a246bb8f..c6069908b 100644 --- a/.github/actions/delete-ghcr-token/action.yml +++ b/.github/actions/delete-ghcr-token/action.yml @@ -10,6 +10,9 @@ runs: using: "composite" steps: - name: Delete GitHub Container Registry token - shell: bash - run: | - kubectl delete secret ${{ inputs.token-name }} + uses: ./.github/actions/with-post-step + with: + main: | + echo "Main post step action: no action required" + post: | + kubectl delete secret ${{ inputs.token-name }} diff --git a/.github/actions/delete-k8s-job/action.yml b/.github/actions/delete-k8s-job/action.yml index 15a5add64..74f1e3129 100644 --- a/.github/actions/delete-k8s-job/action.yml +++ b/.github/actions/delete-k8s-job/action.yml @@ -11,18 +11,20 @@ runs: using: "composite" steps: - name: Delete Kubernetes job - shell: bash - run: | - # make sure we're deleting all the resources - pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ inputs.job-name }} -o jsonpath='{.items[*].metadata.name}') + uses: ./.github/actions/with-post-step + with: + main: | + echo "Main post step action: no action required" + post: | + pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ inputs.job-name }} -o jsonpath='{.items[*].metadata.name}') - for pod in $pods; do - status=$(kubectl get pod "$pod" -o jsonpath='{.status.phase}' || true) - echo "Pod: $pod, status: $status" - if [ "$status" = "Running" ] || [ "$status" = "Pending" ]; then - kubectl delete pod "$pod" --force --grace-period=0 || true - fi - done - - # make sure job is 
deleted - kubectl delete job ${{ inputs.job-name }} --force --grace-period=0 || true \ No newline at end of file + for pod in $pods; do + status=$(kubectl get pod "$pod" -o jsonpath='{.status.phase}' || true) + echo "Pod: $pod, status: $status" + if [ "$status" = "Running" ] || [ "$status" = "Pending" ]; then + kubectl delete pod "$pod" --force --grace-period=0 || true + fi + done + + # make sure job is deleted + kubectl delete job ${{ inputs.job-name }} --force --grace-period=0 || true \ No newline at end of file diff --git a/.github/actions/with-post-step/action.yml b/.github/actions/with-post-step/action.yml new file mode 100644 index 000000000..9816ee888 --- /dev/null +++ b/.github/actions/with-post-step/action.yml @@ -0,0 +1,42 @@ +# ==================================================================================================================== # +# Authors: # +# Patrick Lehmann # +# Unai Martinez-Corral # +# # +# ==================================================================================================================== # +# Copyright 2020-2024 The pyTooling Authors # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. # +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. # +# # +# SPDX-License-Identifier: Apache-2.0 # +# ==================================================================================================================== # +name: With post step + +description: 'Generic JS Action to execute a main command and set a command as a post step.' 
+ +inputs: + main: + description: 'Main command/script.' + required: true + post: + description: 'Post command/script.' + required: true + key: + description: 'Name of the state variable used to detect the post step.' + required: false + default: POST + +runs: + using: 'node20' + main: 'main.js' + post: 'main.js' \ No newline at end of file diff --git a/.github/actions/with-post-step/main.js b/.github/actions/with-post-step/main.js new file mode 100644 index 000000000..47a817cbc --- /dev/null +++ b/.github/actions/with-post-step/main.js @@ -0,0 +1,46 @@ +/* ================================================================================================================== * + * Authors: * + * Unai Martinez-Corral * + * * + * ================================================================================================================== * + * Copyright 2021-2022 Unai Martinez-Corral * + * Copyright 2022 Unai Martinez-Corral * + * * + * Licensed under the Apache License, Version 2.0 (the "License"); * + * you may not use this file except in compliance with the License. * + * You may obtain a copy of the License at * + * * + * http://www.apache.org/licenses/LICENSE-2.0 * + * * + * Unless required by applicable law or agreed to in writing, software * + * distributed under the License is distributed on an "AS IS" BASIS, * + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * + * See the License for the specific language governing permissions and * + * limitations under the License. 
* + * * + * SPDX-License-Identifier: Apache-2.0 * + * ================================================================================================================== * + * * + * Context: * + * * https://github.com/docker/login-action/issues/72 * + * * https://github.com/actions/runner/issues/1478 * + * ================================================================================================================== */ +const { spawn } = require("child_process"); +const { appendFileSync } = require("fs"); +const { EOL } = require("os"); + +function run(cmd) { + const subprocess = spawn(cmd, { stdio: "inherit", shell: true }); + subprocess.on("exit", (exitCode) => { + process.exitCode = exitCode; + }); +} + +const key = process.env.INPUT_KEY.toUpperCase(); + +if ( process.env[`STATE_${key}`] !== undefined ) { // Are we in the 'post' step? + run(process.env.INPUT_POST); +} else { // Otherwise, this is the main step + appendFileSync(process.env.GITHUB_STATE, `${key}=true${EOL}`); + run(process.env.INPUT_MAIN); +} \ No newline at end of file From 0de66b0d2cf2a40df0617713bd6a38c7392585f1 Mon Sep 17 00:00:00 2001 From: Steboss Date: Thu, 20 Feb 2025 17:45:44 +0000 Subject: [PATCH 48/89] build axlearn with tensorflow-cpu --- .github/container/Dockerfile.axlearn | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/container/Dockerfile.axlearn b/.github/container/Dockerfile.axlearn index 88cbc458c..dde1ae081 100644 --- a/.github/container/Dockerfile.axlearn +++ b/.github/container/Dockerfile.axlearn @@ -24,6 +24,7 @@ portpicker==1.6.0 seqio==0.0.18 protobuf==3.20.3 pytest>=7.4.3 +tensorflow-cpu REQUIREMENTS EOF From 8fbacde799e9bb4ccf3ad01352442a47cf34bf72 Mon Sep 17 00:00:00 2001 From: Steboss Date: Fri, 21 Feb 2025 16:50:25 +0000 Subject: [PATCH 49/89] placeholder for models on eks --- .../axlearn/axlearn-1B-model.yml | 77 ++++++++++++++++++ .../axlearn/axlearn-3B-model.yml | 78 +++++++++++++++++++ .github/workflows/_ci.yaml | 45 +++++++++++ 3 files changed, 200 
insertions(+) create mode 100644 .github/eks-workflow-files/axlearn/axlearn-1B-model.yml create mode 100644 .github/eks-workflow-files/axlearn/axlearn-3B-model.yml diff --git a/.github/eks-workflow-files/axlearn/axlearn-1B-model.yml b/.github/eks-workflow-files/axlearn/axlearn-1B-model.yml new file mode 100644 index 000000000..de1d77aa8 --- /dev/null +++ b/.github/eks-workflow-files/axlearn/axlearn-1B-model.yml @@ -0,0 +1,77 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: PLACEHOLDER + labels: + kueue.x-k8s.io/queue-name: p5-queue +spec: + completions: 1 + parallelism: 1 + template: + spec: + restartPolicy: Never + containers: + - name: axlearn + image: PLACEHOLDER + command: + - bash + - -xo + - pipefail + - -c + - | + + BASEDIR="/opt/axlearn" + CONFIG="fuji-1B-v3-flash-single-host" + HLO_DUMP=0 + POSTFIX="" + + AR_THRESHOLD=1073741824 + AG_THRESHOLD=8589934592 + RS_THRESHOLD=8589934592 + XLA_BASE_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true + --xla_gpu_enable_triton_gemm=false + --xla_gpu_enable_highest_priority_async_stream=true + --xla_gpu_all_gather_combine_threshold_bytes=${AG_THRESHOLD} + --xla_gpu_reduce_scatter_combine_threshold_bytes=${RS_THRESHOLD} + --xla_gpu_enable_pipelined_all_gather=true + --xla_gpu_enable_pipelined_reduce_scatter=true + --xla_gpu_enable_nccl_comm_splitting=false" + + export XLA_PYTHON_CLIENT_PREALLOCATE=false + export TF_GPU_ALLOCATOR=cuda_malloc_async + export XLA_FLAGS="${XLA_BASE_FLAGS}" + + export NCCL_BUFFSIZE=8388608 + export NCCL_P2P_NET_CHUNKSIZE=524288 + export NCCL_LAUNCH_MODE=GROUP + export NCCL_DEBUG=INFO + + LOG_DIR=${BASEDIR}/logs + TRAINER_DIR=${LOG_DIR}/${CONFIG}${POSTFIX}-eks/trainer-dir + mkdir -p ${TRAINER_DIR} + + cat << EOF > tf_gpu_fix.py + import tensorflow as tf + tf.config.set_visible_devices([], 'GPU') + import runpy + runpy.run_module('axlearn.common.launch_trainer_main', run_name='__main__') + EOF + + python3 tf_gpu_fix.py \ + --module=text.gpt.c4_trainer \ + --config=${CONFIG} \ + 
--trainer_dir=${TRAINER_DIR} \ + --data_dir=gs://axlearn-public/tensorflow_datasets \ + --jax_backend=gpu + + resources: + limits: + nvidia.com/gpu: 8 + volumeMounts: + - name: output + mountPath: /opt/output + imagePullSecrets: + - name: PLACEHOLDER + volumes: + - name: output + emptyDir: {} diff --git a/.github/eks-workflow-files/axlearn/axlearn-3B-model.yml b/.github/eks-workflow-files/axlearn/axlearn-3B-model.yml new file mode 100644 index 000000000..419d8bb0b --- /dev/null +++ b/.github/eks-workflow-files/axlearn/axlearn-3B-model.yml @@ -0,0 +1,78 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: PLACEHOLDER + labels: + kueue.x-k8s.io/queue-name: p5-queue +spec: + completions: 1 + parallelism: 1 + template: + spec: + restartPolicy: Never + containers: + - name: axlearn + image: PLACEHOLDER + command: + - bash + - -xo + - pipefail + - -c + - | + + BASEDIR="/opt/axlearn" + CONFIG="fuji-3B-v3-flash-single-host" + HLO_DUMP=0 + POSTFIX="" + + AR_THRESHOLD=1073741824 + AG_THRESHOLD=8589934592 + RS_THRESHOLD=8589934592 + XLA_BASE_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true + --xla_gpu_enable_triton_gemm=false + --xla_gpu_enable_highest_priority_async_stream=true + --xla_gpu_all_gather_combine_threshold_bytes=${AG_THRESHOLD} + --xla_gpu_reduce_scatter_combine_threshold_bytes=${RS_THRESHOLD} + --xla_gpu_enable_pipelined_all_gather=true + --xla_gpu_enable_pipelined_reduce_scatter=true + --xla_gpu_enable_nccl_comm_splitting=false" + + export XLA_PYTHON_CLEINT_PREALLOCATE=false + export TF_GPU_ALLOCATOR=cuda_malloc_async + export XLA_FLAGS="${XLA_BASE_FLAGS}" + + export NCCL_BUFFSIZE=8388608 + export NCCL_P2P_NET_CHUNKSIZE=524288 + export NCCL_LAUNCH_MODE=GROUP + export NCCL_DEBUG=INFO + + LOG_DIR=${BASEDIR}/logs + TRAINER_DIR=${LOG_DIR}/${CONFIG}${POSTFIX}-eks/trainer-dir + mkdir -p ${TRAINER_DIR} + + echo "Executing TF" + cat << EOF > tf_fix_gpu.py + import tensorflow as tf + tf.config.set_visible_devices([], 'GPU') + import runpy + 
runpy.run_module('axlearn.common.launch_trainer_main', run_name='__main__') + EOF + + python3 tf_fix_gpu.py \ + --module=text.gpt.c4_trainer \ + --config=${CONFIG} \ + --trainer_dir=${TRAINER_DIR} \ + --data_dir=gs://axlearn-public/tensorflow_datasets \ + --jax_backend=gpu + + resources: + limits: + nvidia.com/gpu: 8 + volumeMounts: + - name: output + mountPath: /opt/output + imagePullSecrets: + - name: PLACEHOLDER + volumes: + - name: output + emptyDir: {} diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 513e74306..c2e240642 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -759,4 +759,49 @@ jobs: sitrep.json "badge-axlearn-test" summary.txt + + + test-axlearn-fuji-1B: + needs: build-axlearn + if: inputs.ARCHITECTURE == 'amd64' + runs-on: eks + env: + AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }} + JOB_NAME: axlearn-fuji-1B-${{ github.run_id }} + TOKEN_NAME: axlearn-fuji-1B-${{ github.run_id }}-token + steps: + - name: Check out the repository + uses: actions/checkout@v4 + - name: GHCR Login + uses: ./.github/actions/ghcr-login + with: + docker-username: ${{ github.repository_owner }} + docker-password: ${{ secrets.GITHUB_TOKEN }} + token-name: ${{ env.TOKEN_NAME }} + - name: Configure axlearn test job + run: | + yq -i ea ' + select(di == 0).metadata.name = strenv(JOB_NAME) + | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE) + | select(di == 0).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)' \ + .github/eks-workflow-files/axlearn/axlearn-1B-model.yml + git diff .github/eks-workflow-files/axlearn/axlearn-1B-model.yml + + - name: Submit & wait for axlearn test job + uses: ./.github/actions/submit-k8s-job + with: + job-config-file: ".github/eks-workflow-files/axlearn/axlearn-1B-model.yml" + job-name: ${{ env.JOB_NAME }} + + - name: Delete axlearn test job + uses: ./.github/actions/delete-k8s-job + if: ${{ always() }} + with: + 
job-name: ${{ env.JOB_NAME }} + + - name: Delete GitHub Container Registry token + uses: ./.github/actions/delete-ghcr-token + if: ${{ always() }} + with: + token-name: ${{ env.TOKEN_NAME }} From 026b37aae67af015b894d1b0bf4e668cb9426e72 Mon Sep 17 00:00:00 2001 From: Steboss Date: Fri, 21 Feb 2025 18:49:27 +0000 Subject: [PATCH 50/89] test a setup for running fuji 1B on slurm --- .github/container/Dockerfile.axlearn | 2 +- .github/container/test-fuji-1B.sh | 39 ++ .github/workflows/_ci.yaml | 938 ++++++++++++++------------- .github/workflows/_test_fuji_1B.yaml | 106 +++ 4 files changed, 619 insertions(+), 466 deletions(-) create mode 100644 .github/container/test-fuji-1B.sh create mode 100644 .github/workflows/_test_fuji_1B.yaml diff --git a/.github/container/Dockerfile.axlearn b/.github/container/Dockerfile.axlearn index dde1ae081..c441bdc68 100644 --- a/.github/container/Dockerfile.axlearn +++ b/.github/container/Dockerfile.axlearn @@ -34,7 +34,7 @@ EOF ############################################################################### ADD test-axlearn.sh /usr/local/bin - +ADD test-fuji-1B.sh /usr/local/bin ############################################################################### ## Install accumulated packages from the base image and the previous stage ############################################################################### diff --git a/.github/container/test-fuji-1B.sh b/.github/container/test-fuji-1B.sh new file mode 100644 index 000000000..94042de13 --- /dev/null +++ b/.github/container/test-fuji-1B.sh @@ -0,0 +1,39 @@ +#! 
/bin/bash +BASEDIR="/opt/host/" +CONFIG="fuji-7B-v3-flash" +POSTFIX=${POSTFIX:=""} + + +export XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true + --xla_gpu_graph_level=0 + --xla_gpu_enable_highest_priority_async_stream=true + --xla_gpu_all_reduce_combine_threshold_bytes=1073741824 + --xla_gpu_all_gather_combine_threshold_bytes=1073741824 + --xla_gpu_reduce_scatter_combine_threshold_bytes=1073741824 + --xla_gpu_enable_pipelined_all_gather=true + --xla_gpu_enable_pipelined_reduce_scatter=true + --xla_gpu_enable_pipelined_all_reduce=true + --xla_gpu_enable_while_loop_double_buffering=true + --xla_gpu_enable_triton_gemm=false + --xla_gpu_enable_all_gather_combine_by_dim=false + --xla_gpu_enable_reduce_scatter_combine_by_dim=false + --xla_disable_hlo_passes=rematerialization" + +export XLA_PYTHON_CLIENT_PREALLOCATE=false +export TF_GPU_ALLOCATOR=cuda_malloc_async +export NCCL_BUFFSIZE=8388608 +export NCCL_P2P_NET_CHUNKSIZE=524288 +export NCCL_LAUNCH_MODE=GROUP +export NCCL_DEBUG=INFO +LOG_DIF=${BASEDIR}/logs +TRAINER_DIR=${LOG_DIF}/${CONFIG}_N${SLURM_JOB_NUM_NODES}_n${SLURM_NTASKS}/trainer-logs +mkdir -p ${TRAINER_DIR} + +#test "${WITH_MP}" == 1 && export MP_ARGS="--num_processes=${SLURM_NTASKS} --distributed_coordinator=${SLURM_LAUNCH_NODE_IPADDR}:12345 --process_id=${SLURM_PROCID}" + +python3 -m axlearn.common.launch_trainer_main \ + --module=text.gpt.c4_trainer \ + --config=${CONFIG} \ + --trainer_dir=${TRAINER_DIR} \ + --data_dir=gs://axlearn-public/tensorflow_datasets \ + --jax_backend=gpu diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index c2e240642..29ac71306 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -66,115 +66,115 @@ jobs: URLREF_TRANSFORMER_ENGINE=${{ fromJson(inputs.SOURCE_URLREFS).TRANSFORMER_ENGINE }} secrets: inherit - build-triton: - needs: build-jax - if: inputs.ARCHITECTURE == 'amd64' # Triton does not seem to support arm64 - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: 
${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: artifact-triton-build - BADGE_FILENAME: badge-triton-build - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: triton - DOCKERFILE: .github/container/Dockerfile.triton - RUNNER_SIZE: large - EXTRA_BUILD_ARGS: | - URLREF_JAX_TRITON=${{ fromJson(inputs.SOURCE_URLREFS).JAX_TRITON }} - secrets: inherit + # build-triton: + # needs: build-jax + # if: inputs.ARCHITECTURE == 'amd64' # Triton does not seem to support arm64 + # uses: ./.github/workflows/_build.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: artifact-triton-build + # BADGE_FILENAME: badge-triton-build + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: triton + # DOCKERFILE: .github/container/Dockerfile.triton + # RUNNER_SIZE: large + # EXTRA_BUILD_ARGS: | + # URLREF_JAX_TRITON=${{ fromJson(inputs.SOURCE_URLREFS).JAX_TRITON }} + # secrets: inherit - build-equinox: - needs: build-jax - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: artifact-equinox-build - BADGE_FILENAME: badge-equinox-build - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: equinox - DOCKERFILE: .github/container/Dockerfile.equinox - EXTRA_BUILD_ARGS: | - URLREF_EQUINOX=${{ fromJson(inputs.SOURCE_URLREFS).EQUINOX }} - secrets: inherit + # build-equinox: + # needs: build-jax + # uses: ./.github/workflows/_build.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: artifact-equinox-build + # BADGE_FILENAME: badge-equinox-build + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: equinox + # DOCKERFILE: .github/container/Dockerfile.equinox + # EXTRA_BUILD_ARGS: | + # URLREF_EQUINOX=${{ 
fromJson(inputs.SOURCE_URLREFS).EQUINOX }} + # secrets: inherit - build-maxtext: - needs: build-jax - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: artifact-maxtext-build - BADGE_FILENAME: badge-maxtext-build - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: maxtext - DOCKERFILE: .github/container/Dockerfile.maxtext - EXTRA_BUILD_ARGS: | - URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }} - secrets: inherit + # build-maxtext: + # needs: build-jax + # uses: ./.github/workflows/_build.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: artifact-maxtext-build + # BADGE_FILENAME: badge-maxtext-build + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: maxtext + # DOCKERFILE: .github/container/Dockerfile.maxtext + # EXTRA_BUILD_ARGS: | + # URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }} + # secrets: inherit - build-levanter: - needs: [build-jax] - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: "artifact-levanter-build" - BADGE_FILENAME: "badge-levanter-build" - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: levanter - DOCKERFILE: .github/container/Dockerfile.levanter - EXTRA_BUILD_ARGS: | - URLREF_LEVANTER=${{ fromJson(inputs.SOURCE_URLREFS).LEVANTER }} - URLREF_HALIAX=${{ fromJson(inputs.SOURCE_URLREFS).HALIAX }} - secrets: inherit + # build-levanter: + # needs: [build-jax] + # uses: ./.github/workflows/_build.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: "artifact-levanter-build" + # BADGE_FILENAME: "badge-levanter-build" + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: 
levanter + # DOCKERFILE: .github/container/Dockerfile.levanter + # EXTRA_BUILD_ARGS: | + # URLREF_LEVANTER=${{ fromJson(inputs.SOURCE_URLREFS).LEVANTER }} + # URLREF_HALIAX=${{ fromJson(inputs.SOURCE_URLREFS).HALIAX }} + # secrets: inherit - build-upstream-t5x: - needs: build-jax - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: "artifact-t5x-build" - BADGE_FILENAME: "badge-t5x-build" - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: upstream-t5x - DOCKERFILE: .github/container/Dockerfile.t5x - EXTRA_BUILD_ARGS: | - URLREF_T5X=${{ fromJson(inputs.SOURCE_URLREFS).T5X }} - URLREF_AIRIO=${{ fromJson(inputs.SOURCE_URLREFS).AIRIO }} - secrets: inherit + # build-upstream-t5x: + # needs: build-jax + # uses: ./.github/workflows/_build.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: "artifact-t5x-build" + # BADGE_FILENAME: "badge-t5x-build" + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: upstream-t5x + # DOCKERFILE: .github/container/Dockerfile.t5x + # EXTRA_BUILD_ARGS: | + # URLREF_T5X=${{ fromJson(inputs.SOURCE_URLREFS).T5X }} + # URLREF_AIRIO=${{ fromJson(inputs.SOURCE_URLREFS).AIRIO }} + # secrets: inherit - build-rosetta-t5x: - needs: build-upstream-t5x - uses: ./.github/workflows/_build_rosetta.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }} - BASE_LIBRARY: t5x - secrets: inherit + # build-rosetta-t5x: + # needs: build-upstream-t5x + # uses: ./.github/workflows/_build_rosetta.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }} + # BASE_LIBRARY: t5x + # secrets: inherit - build-gemma: - needs: 
build-jax - uses: ./.github/workflows/_build.yaml - if: inputs.ARCHITECTURE == 'amd64' # build only amd64 - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: artifact-gemma-build - BADGE_FILENAME: badge-gemma-build - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: gemma - DOCKERFILE: rosetta/Dockerfile.gemma - DOCKER_CONTEXT: . - EXTRA_BUILD_ARGS: | - URLREF_GEMMA=${{ fromJson(inputs.SOURCE_URLREFS).GEMMA }} - URLREF_BIG_VISION=${{ fromJson(inputs.SOURCE_URLREFS).BIG_VISION }} - URLREF_COMMON_LOOP_UTILS=${{ fromJson(inputs.SOURCE_URLREFS).COMMON_LOOP_UTILS }} - URLREF_FLAXFORMER=${{ fromJson(inputs.SOURCE_URLREFS).FLAXFORMER }} - URLREF_PANOPTICAPI=${{ fromJson(inputs.SOURCE_URLREFS).PANOPTICAPI }} - secrets: inherit + # build-gemma: + # needs: build-jax + # uses: ./.github/workflows/_build.yaml + # if: inputs.ARCHITECTURE == 'amd64' # build only amd64 + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: artifact-gemma-build + # BADGE_FILENAME: badge-gemma-build + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: gemma + # DOCKERFILE: rosetta/Dockerfile.gemma + # DOCKER_CONTEXT: . 
+ # EXTRA_BUILD_ARGS: | + # URLREF_GEMMA=${{ fromJson(inputs.SOURCE_URLREFS).GEMMA }} + # URLREF_BIG_VISION=${{ fromJson(inputs.SOURCE_URLREFS).BIG_VISION }} + # URLREF_COMMON_LOOP_UTILS=${{ fromJson(inputs.SOURCE_URLREFS).COMMON_LOOP_UTILS }} + # URLREF_FLAXFORMER=${{ fromJson(inputs.SOURCE_URLREFS).FLAXFORMER }} + # URLREF_PANOPTICAPI=${{ fromJson(inputs.SOURCE_URLREFS).PANOPTICAPI }} + # secrets: inherit build-axlearn: needs: build-jax @@ -196,13 +196,13 @@ jobs: needs: - build-base - build-jax - - build-triton - - build-equinox - - build-maxtext - - build-levanter - - build-upstream-t5x - - build-rosetta-t5x - - build-gemma + # - build-triton + # - build-equinox + # - build-maxtext + # - build-levanter + # - build-upstream-t5x + # - build-rosetta-t5x + # - build-gemma - build-axlearn outputs: TAGS: ${{ steps.collect-tags.outputs.TAGS }} @@ -214,22 +214,22 @@ jobs: [\ {"flavor": "base", "stage": "final", "priority": 800, "tag": "${{ needs.build-base.outputs.DOCKER_TAG }}"},\ {"flavor": "jax", "stage": "final", "priority": 1000, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "triton", "stage": "final", "priority": 900, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "equinox", "stage": "final", "priority": 900, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "maxtext", "stage": "final", "priority": 900, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "levanter", "stage": "final", "priority": 900, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "upstream-t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "gemma", "stage": "final", "priority": 900, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }}"},\ + # {"flavor": "triton", 
"stage": "final", "priority": 900, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }}"},\ + # {"flavor": "equinox", "stage": "final", "priority": 900, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }}"},\ + # {"flavor": "maxtext", "stage": "final", "priority": 900, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}"},\ + # {"flavor": "levanter", "stage": "final", "priority": 900, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }}"},\ + # {"flavor": "upstream-t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }}"},\ + # {"flavor": "t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }}"},\ + # {"flavor": "gemma", "stage": "final", "priority": 900, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "axlearn", "stage": "final", "priority": 900, "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "jax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "triton", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "equinox", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "maxtext", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "levanter", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "upstream-t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "gemma", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_MEALKIT }}"},\ + # 
{"flavor": "triton", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_MEALKIT }}"},\ + # {"flavor": "equinox", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_MEALKIT }}"},\ + # {"flavor": "maxtext", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_MEALKIT }}"},\ + # {"flavor": "levanter", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_MEALKIT }}"},\ + # {"flavor": "upstream-t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ + # {"flavor": "t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ + # {"flavor": "gemma", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_MEALKIT }}"},\ {"flavor": "axlearn", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_MEALKIT }}"},\ {}\ @@ -239,252 +239,252 @@ jobs: echo "TAGS=${TAGS}" >> $GITHUB_OUTPUT - test-distribution: - runs-on: ubuntu-22.04 - strategy: - matrix: - TEST_SCRIPT: - - extra-only-distribution.sh - - mirror-only-distribution.sh - - upstream-only-distribution.sh - - local-patch-distribution.sh - fail-fast: false - steps: - - name: Print environment variables - run: env - - name: Set git login for tests - run: | - git config --global user.email "jax@nvidia.com" - git config --global user.name "JAX-Toolbox CI" - - name: Check out the repository under ${GITHUB_WORKSPACE} - uses: actions/checkout@v4 - - name: Run integration test ${{ matrix.TEST_SCRIPT }} - run: bash rosetta/tests/${{ matrix.TEST_SCRIPT }} - - test-jax: - needs: build-jax - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_unit.yaml - with: - TEST_NAME: jax - EXECUTE: | - docker run -i --shm-size=1g --gpus all \ - ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - 
bash <<"EOF" |& tee test-backend-independent.log - test-jax.sh -b backend-independent - EOF - docker run -i --shm-size=1g --gpus all \ - ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee tee test-gpu.log - nvidia-cuda-mps-control -d - test-jax.sh -b gpu - EOF - STATISTICS_SCRIPT: | - errors=$(cat test-*.log | grep -c 'ERROR:' || true) - failed_tests=$(cat test-*.log | grep -c 'FAILED in' || true) - passed_tests=$(cat test-*.log | grep -c 'PASSED in' || true) - total_tests=$((failed_tests + passed_tests)) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - ARTIFACTS: | - test-backend-independent.log - test-gpu.log - secrets: inherit + # test-distribution: + # runs-on: ubuntu-22.04 + # strategy: + # matrix: + # TEST_SCRIPT: + # - extra-only-distribution.sh + # - mirror-only-distribution.sh + # - upstream-only-distribution.sh + # - local-patch-distribution.sh + # fail-fast: false + # steps: + # - name: Print environment variables + # run: env + # - name: Set git login for tests + # run: | + # git config --global user.email "jax@nvidia.com" + # git config --global user.name "JAX-Toolbox CI" + # - name: Check out the repository under ${GITHUB_WORKSPACE} + # uses: actions/checkout@v4 + # - name: Run integration test ${{ matrix.TEST_SCRIPT }} + # run: bash rosetta/tests/${{ matrix.TEST_SCRIPT }} + + # test-jax: + # needs: build-jax + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # uses: ./.github/workflows/_test_unit.yaml + # with: + # TEST_NAME: jax + # EXECUTE: | + # docker run -i --shm-size=1g --gpus all \ + # ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-backend-independent.log + # test-jax.sh -b backend-independent + # EOF + # docker run -i --shm-size=1g --gpus all \ + # ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee 
tee test-gpu.log + # nvidia-cuda-mps-control -d + # test-jax.sh -b gpu + # EOF + # STATISTICS_SCRIPT: | + # errors=$(cat test-*.log | grep -c 'ERROR:' || true) + # failed_tests=$(cat test-*.log | grep -c 'FAILED in' || true) + # passed_tests=$(cat test-*.log | grep -c 'PASSED in' || true) + # total_tests=$((failed_tests + passed_tests)) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # ARTIFACTS: | + # test-backend-independent.log + # test-gpu.log + # secrets: inherit - test-nsys-jax: - needs: build-jax - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_unit.yaml - with: - TEST_NAME: nsys-jax - EXECUTE: | - set -o pipefail - num_tests=0 - num_failures=0 - # Run the pytest-driven tests; failure is explicitly handled below so set +e to - # avoid an early abort here. - set +e - docker run -i --shm-size=1g --gpus all \ - -v $PWD:/opt/output \ - ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee test-nsys-jax.log - # nsys-jax is already installed, this is just adding the test dependencies - pip install pytest-reportlog nsys-jax[test] - # abuse knowledge that nsys-jax is installed editable, so the tests exist - test_path=$(python -c 'import importlib.resources; print(importlib.resources.files("nsys_jax").joinpath("..", "tests").resolve())') - pytest --report-log=/opt/output/pytest-report.jsonl "${test_path}" - EOF - set -e - GPUS_PER_NODE=$(nvidia-smi -L | grep -c '^GPU') - for mode in 1-process 2-process process-per-gpu; do - DOCKER="docker run --shm-size=1g --gpus all --env XLA_FLAGS=--xla_gpu_enable_command_buffer= --env XLA_PYTHON_CLIENT_MEM_FRACTION=0.9 -v ${PWD}:/opt/output ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}" - if [[ "${mode}" == "1-process" ]]; then - PROCESS_COUNT=1 - ARGS="" - elif [[ "${mode}" == "2-process" ]]; then - 
# Use two processes with GPUS_PER_NODE/2 GPUs per process in the hope that - # this will flush out more bugs than process-per-node or process-per-GPU. - PROCESS_COUNT=2 - ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process $((GPUS_PER_NODE/2)) --distributed" - else - PROCESS_COUNT=${GPUS_PER_NODE} - ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process 1 --distributed" - fi - for collection in full partial; do - NSYS_JAX="nsys-jax" - if [[ "${mode}" == "1-process" ]]; then - # We will not run nsys-jax-combine, so run analyses eagerly - NSYS_JAX+=" --nsys-jax-analysis communication --nsys-jax-analysis summary" - fi - NSYS_JAX+=" --output=/opt/output/${mode}-${collection}-execution-%q{RANK}" - if [[ "${collection}" == "partial" ]]; then - NSYS_JAX+=" --capture-range=cudaProfilerApi --capture-range-end=stop" - # nvbug/4801401 - NSYS_JAX+=" --sample=none" - fi - set +e - ${DOCKER} parallel-launch RANK ${PROCESS_COUNT} ${NSYS_JAX} \ - -- jax-nccl-test ${ARGS} |& tee ${mode}-${collection}-execution.log - num_failures=$((num_failures + ($? 
!= 0))) - set -e - num_tests=$((num_tests + 1)) - done - if [[ "${mode}" != "1-process" ]]; then - # Run nsys-jax-combine - NSYS_JAX_COMBINE="nsys-jax-combine --analysis communication --analysis summary --output=/opt/output/${mode}-${collection}-execution.zip" - for (( i=0; i> $GITHUB_ENV - echo "NSYS_JAX_FAIL_COUNT=${num_failures}" >> $GITHUB_ENV - exit $num_failures - STATISTICS_SCRIPT: | - summary_line=$(tail -n1 test-nsys-jax.log) - num_errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) - failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) - total_tests=$(( NSYS_JAX_TEST_COUNT + passed_tests + failed_tests )) - num_passed=$(( passed_tests + NSYS_JAX_TEST_COUNT - NSYS_JAX_FAIL_COUNT )) - num_failed=$(( failed_tests + NSYS_JAX_FAIL_COUNT )) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${num_errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${num_passed}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${num_failed}" >> $GITHUB_OUTPUT - ARTIFACTS: | - # pytest-driven part - test-nsys-jax.log - pytest-report.jsonl - # nsys-jax logfiles - *process-*-execution.log - # nsys-jax output for the case that doesn't use nsys-jax-combine - 1-process-*-execution-0.zip - # nsys-jax-combine output/logfiles - *process*-*-execution.zip - *-execution-combine.log - secrets: inherit + # test-nsys-jax: + # needs: build-jax + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # uses: ./.github/workflows/_test_unit.yaml + # with: + # TEST_NAME: nsys-jax + # EXECUTE: | + # set -o pipefail + # num_tests=0 + # num_failures=0 + # # Run the pytest-driven tests; failure is explicitly handled below so set +e to + # # avoid an early abort here. 
+ # set +e + # docker run -i --shm-size=1g --gpus all \ + # -v $PWD:/opt/output \ + # ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-nsys-jax.log + # # nsys-jax is already installed, this is just adding the test dependencies + # pip install pytest-reportlog nsys-jax[test] + # # abuse knowledge that nsys-jax is installed editable, so the tests exist + # test_path=$(python -c 'import importlib.resources; print(importlib.resources.files("nsys_jax").joinpath("..", "tests").resolve())') + # pytest --report-log=/opt/output/pytest-report.jsonl "${test_path}" + # EOF + # set -e + # GPUS_PER_NODE=$(nvidia-smi -L | grep -c '^GPU') + # for mode in 1-process 2-process process-per-gpu; do + # DOCKER="docker run --shm-size=1g --gpus all --env XLA_FLAGS=--xla_gpu_enable_command_buffer= --env XLA_PYTHON_CLIENT_MEM_FRACTION=0.9 -v ${PWD}:/opt/output ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}" + # if [[ "${mode}" == "1-process" ]]; then + # PROCESS_COUNT=1 + # ARGS="" + # elif [[ "${mode}" == "2-process" ]]; then + # # Use two processes with GPUS_PER_NODE/2 GPUs per process in the hope that + # # this will flush out more bugs than process-per-node or process-per-GPU. 
+ # PROCESS_COUNT=2 + # ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process $((GPUS_PER_NODE/2)) --distributed" + # else + # PROCESS_COUNT=${GPUS_PER_NODE} + # ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process 1 --distributed" + # fi + # for collection in full partial; do + # NSYS_JAX="nsys-jax" + # if [[ "${mode}" == "1-process" ]]; then + # # We will not run nsys-jax-combine, so run analyses eagerly + # NSYS_JAX+=" --nsys-jax-analysis communication --nsys-jax-analysis summary" + # fi + # NSYS_JAX+=" --output=/opt/output/${mode}-${collection}-execution-%q{RANK}" + # if [[ "${collection}" == "partial" ]]; then + # NSYS_JAX+=" --capture-range=cudaProfilerApi --capture-range-end=stop" + # # nvbug/4801401 + # NSYS_JAX+=" --sample=none" + # fi + # set +e + # ${DOCKER} parallel-launch RANK ${PROCESS_COUNT} ${NSYS_JAX} \ + # -- jax-nccl-test ${ARGS} |& tee ${mode}-${collection}-execution.log + # num_failures=$((num_failures + ($? 
!= 0))) + # set -e + # num_tests=$((num_tests + 1)) + # done + # if [[ "${mode}" != "1-process" ]]; then + # # Run nsys-jax-combine + # NSYS_JAX_COMBINE="nsys-jax-combine --analysis communication --analysis summary --output=/opt/output/${mode}-${collection}-execution.zip" + # for (( i=0; i> $GITHUB_ENV + # echo "NSYS_JAX_FAIL_COUNT=${num_failures}" >> $GITHUB_ENV + # exit $num_failures + # STATISTICS_SCRIPT: | + # summary_line=$(tail -n1 test-nsys-jax.log) + # num_errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + # passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) + # failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) + # total_tests=$(( NSYS_JAX_TEST_COUNT + passed_tests + failed_tests )) + # num_passed=$(( passed_tests + NSYS_JAX_TEST_COUNT - NSYS_JAX_FAIL_COUNT )) + # num_failed=$(( failed_tests + NSYS_JAX_FAIL_COUNT )) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${num_errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${num_passed}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${num_failed}" >> $GITHUB_OUTPUT + # ARTIFACTS: | + # # pytest-driven part + # test-nsys-jax.log + # pytest-report.jsonl + # # nsys-jax logfiles + # *process-*-execution.log + # # nsys-jax output for the case that doesn't use nsys-jax-combine + # 1-process-*-execution-0.zip + # # nsys-jax-combine output/logfiles + # *process*-*-execution.zip + # *-execution-combine.log + # secrets: inherit #test-nsys-jax generates several fresh .zip archive outputs by running nsys-jax with real GPU hardware; this test #runs on a regular GitHub Actions runner and checks that offline post-processing works in an environment that does #not already have nsys-jax installed - test-nsys-jax-archive: - needs: test-nsys-jax - if: 
inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - strategy: - matrix: - os: [ubuntu-22.04, ubuntu-24.04, macOS-latest] - runs-on: ${{ matrix.os }} - steps: - - name: Download nsys-jax output .zip files - uses: actions/download-artifact@v4 - with: - name: nsys-jax-unit-test-A100 - - name: Extract archives and execute install scripts - run: | - pip install virtualenv # for install.sh - for zip in $(ls *.zip); do - ZIP="${PWD}/${zip}" - pushd $(mktemp -d) - unzip "${ZIP}" - ls -l - # TODO: verify this isn't needed, or make sure it isn't needed - chmod 755 install.sh - # Run the notebook with IPython, not Jupyter Lab, so it exits and prints something informative to stdout - # Skip executing Jupyter lab - NSYS_JAX_JUPYTER_EXECUTE_NOT_LAB=1 ./install.sh - popd - done - - test-nsys-jax-eks: - needs: build-jax - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - runs-on: eks - env: - JAX_DOCKER_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} - JOB_NAME: ${{ github.run_id }}-nsys-jax - POSTPROCESS_JOB_NAME: ${{ github.run_id }}-nsys-jax-postprocess - TOKEN_NAME: ${{ github.run_id }}-nsys-jax-token - steps: - - name: Check out the repository - uses: actions/checkout@v4 - - name: GHCR login - uses: ./.github/actions/ghcr-login - with: - docker-username: ${{ github.repository_owner }} - docker-password: ${{ secrets.GITHUB_TOKEN}} - token-name: ${{ env.TOKEN_NAME }} - - name: Configure Kubernetes job - run: | - yq -i ea 'select(di == 0).spec.selector.job-name = strenv(JOB_NAME) - | select(di == 1).metadata.name = strenv(JOB_NAME) - | select(di == 1).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) - | select(di == 1).spec.template.spec.containers[0].image = strenv(JAX_DOCKER_IMAGE) - | select(di == 1).spec.template.spec.containers[0].env[0].value = strenv(JOB_NAME)' \ - .github/eks-workflow-files/job.yml - git diff .github/eks-workflow-files/job.yml - - name: Submit Kubernetes job - uses: ./.github/actions/submit-k8s-job - with: - job-config-file: 
.github/eks-workflow-files/job.yml - job-name: ${{ env.JOB_NAME }} - - name: Delete eks job - uses: ./.github/actions/delete-k8s-job - if: ${{ always() }} - with: - job-name: ${{ env.JOB_NAME }} - - name: Configure post-processing job - run: | - export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip" - yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME) - | .spec.template.spec.containers[].image = strenv(JAX_DOCKER_IMAGE) - | .spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) - | .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \ - .github/eks-workflow-files/post-process-job.yml - git diff .github/eks-workflow-files/post-process-job.yml - - name: Submit post process k8s job - uses: ./.github/actions/submit-k8s-job - with: - job-config-file: .github/eks-workflow-files/post-process-job.yml - job-name: ${{ env.POSTPROCESS_JOB_NAME }} - - name: Delete post process k8s job - uses: ./.github/actions/delete-k8s-job - with: - job-name: ${{ env.POSTPROCESS_JOB_NAME }} - - name: Delete GitHub Container Registry token - uses: ./.github/actions/delete-ghcr-token - if: ${{ always() }} - with: - token-name: ${{ env.TOKEN_NAME }} - + # test-nsys-jax-archive: + # needs: test-nsys-jax + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # strategy: + # matrix: + # os: [ubuntu-22.04, ubuntu-24.04, macOS-latest] + # runs-on: ${{ matrix.os }} + # steps: + # - name: Download nsys-jax output .zip files + # uses: actions/download-artifact@v4 + # with: + # name: nsys-jax-unit-test-A100 + # - name: Extract archives and execute install scripts + # run: | + # pip install virtualenv # for install.sh + # for zip in $(ls *.zip); do + # ZIP="${PWD}/${zip}" + # pushd $(mktemp -d) + # unzip "${ZIP}" + # ls -l + # # TODO: verify this isn't needed, or make sure it isn't needed + # chmod 755 install.sh + # # Run the notebook with IPython, not Jupyter Lab, so it exits and prints something informative to stdout + # # Skip executing Jupyter lab + # 
NSYS_JAX_JUPYTER_EXECUTE_NOT_LAB=1 ./install.sh + # popd + # done + + # test-nsys-jax-eks: + # needs: build-jax + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # runs-on: eks + # env: + # JAX_DOCKER_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} + # JOB_NAME: ${{ github.run_id }}-nsys-jax + # POSTPROCESS_JOB_NAME: ${{ github.run_id }}-nsys-jax-postprocess + # TOKEN_NAME: ${{ github.run_id }}-nsys-jax-token + # steps: + # - name: Check out the repository + # uses: actions/checkout@v4 + # - name: GHCR login + # uses: ./.github/actions/ghcr-login + # with: + # docker-username: ${{ github.repository_owner }} + # docker-password: ${{ secrets.GITHUB_TOKEN}} + # token-name: ${{ env.TOKEN_NAME }} + # - name: Configure Kubernetes job + # run: | + # yq -i ea 'select(di == 0).spec.selector.job-name = strenv(JOB_NAME) + # | select(di == 1).metadata.name = strenv(JOB_NAME) + # | select(di == 1).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) + # | select(di == 1).spec.template.spec.containers[0].image = strenv(JAX_DOCKER_IMAGE) + # | select(di == 1).spec.template.spec.containers[0].env[0].value = strenv(JOB_NAME)' \ + # .github/eks-workflow-files/job.yml + # git diff .github/eks-workflow-files/job.yml + # - name: Submit Kubernetes job + # uses: ./.github/actions/submit-k8s-job + # with: + # job-config-file: .github/eks-workflow-files/job.yml + # job-name: ${{ env.JOB_NAME }} + # - name: Delete eks job + # uses: ./.github/actions/delete-k8s-job + # if: ${{ always() }} + # with: + # job-name: ${{ env.JOB_NAME }} + # - name: Configure post-processing job + # run: | + # export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip" + # yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME) + # | .spec.template.spec.containers[].image = strenv(JAX_DOCKER_IMAGE) + # | .spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) + # | .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \ + # 
.github/eks-workflow-files/post-process-job.yml + # git diff .github/eks-workflow-files/post-process-job.yml + # - name: Submit post process k8s job + # uses: ./.github/actions/submit-k8s-job + # with: + # job-config-file: .github/eks-workflow-files/post-process-job.yml + # job-name: ${{ env.POSTPROCESS_JOB_NAME }} + # - name: Delete post process k8s job + # uses: ./.github/actions/delete-k8s-job + # with: + # job-name: ${{ env.POSTPROCESS_JOB_NAME }} + # - name: Delete GitHub Container Registry token + # uses: ./.github/actions/delete-ghcr-token + # if: ${{ always() }} + # with: + # token-name: ${{ env.TOKEN_NAME }} + # COMMENT THIS # test-equinox: # needs: build-equinox # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a @@ -508,7 +508,7 @@ jobs: # ARTIFACTS: | # test-equinox.log # secrets: inherit - + # COMMENT THIS # test-te-multigpu: # needs: build-upstream-pax # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a @@ -517,78 +517,79 @@ jobs: # TE_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} # secrets: inherit - test-upstream-t5x: - needs: build-upstream-t5x - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_upstream_t5x.yaml - with: - T5X_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }} - secrets: inherit - - test-rosetta-t5x: - needs: build-rosetta-t5x - if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 - uses: ./.github/workflows/_test_t5x_rosetta.yaml - with: - T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }} - secrets: inherit + # test-upstream-t5x: + # needs: build-upstream-t5x + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # uses: ./.github/workflows/_test_upstream_t5x.yaml + # with: + # T5X_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit - test-triton: - needs: build-triton - if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 - uses: ./.github/workflows/_test_unit.yaml - with: - TEST_NAME: 
triton - EXECUTE: | - docker run -i --shm-size=1g --gpus all --volume $PWD:/output \ - ${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee test-triton.log - # autotuner tests from jax-triton now hit a triton code path that uses utilities from pytorch; this relies on - # actually having a CUDA backend for pytoch - pip install --no-deps torch - python /opt/jax-triton/tests/triton_call_test.py --xml_output_file /output/triton_test.xml - EOF - STATISTICS_SCRIPT: | - curl -L -o yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg --print-architecture) && chmod 777 yq; - total_tests=$(./yq '.testsuites."+@tests"' triton_test.xml) - errors=$(./yq '.testsuites."+@errors"' triton_test.xml) - failed_tests=$(./yq '.testsuites."+@failures"' triton_test.xml) - passed_tests=$((total_tests - errors - failed_tests)) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - ARTIFACTS: | - test-triton.log - secrets: inherit + # test-rosetta-t5x: + # needs: build-rosetta-t5x + # if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 + # uses: ./.github/workflows/_test_t5x_rosetta.yaml + # with: + # T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit - test-levanter: - needs: build-levanter - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_unit.yaml - with: - TEST_NAME: levanter - EXECUTE: | - docker run -i --gpus all --shm-size=1g \ - ${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee test-levanter.log - pip install flake8 pytest soundfile librosa - PYTHONPATH=/opt/levanter/tests:$PYTHONPATH pytest /opt/levanter/tests -m "not entry and not slow and not ray" - EOF - STATISTICS_SCRIPT: | - summary_line=$(tail -n1 test-levanter.log) - errors=$(echo $summary_line | grep -oE '[0-9]+ error' 
| awk '{print $1} END { if (!NR) print 0}') - failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') - passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') - total_tests=$((failed_tests + passed_tests)) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - ARTIFACTS: | - test-levanter.log - secrets: inherit + # test-triton: + # needs: build-triton + # if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 + # uses: ./.github/workflows/_test_unit.yaml + # with: + # TEST_NAME: triton + # EXECUTE: | + # docker run -i --shm-size=1g --gpus all --volume $PWD:/output \ + # ${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-triton.log + # # autotuner tests from jax-triton now hit a triton code path that uses utilities from pytorch; this relies on + # # actually having a CUDA backend for pytoch + # pip install --no-deps torch + # python /opt/jax-triton/tests/triton_call_test.py --xml_output_file /output/triton_test.xml + # EOF + # STATISTICS_SCRIPT: | + # curl -L -o yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg --print-architecture) && chmod 777 yq; + # total_tests=$(./yq '.testsuites."+@tests"' triton_test.xml) + # errors=$(./yq '.testsuites."+@errors"' triton_test.xml) + # failed_tests=$(./yq '.testsuites."+@failures"' triton_test.xml) + # passed_tests=$((total_tests - errors - failed_tests)) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # ARTIFACTS: | + # test-triton.log + # secrets: inherit + # test-levanter: + # needs: build-levanter + # if: inputs.ARCHITECTURE == 'amd64' # arm64 
runners n/a + # uses: ./.github/workflows/_test_unit.yaml + # with: + # TEST_NAME: levanter + # EXECUTE: | + # docker run -i --gpus all --shm-size=1g \ + # ${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-levanter.log + # pip install flake8 pytest soundfile librosa + # PYTHONPATH=/opt/levanter/tests:$PYTHONPATH pytest /opt/levanter/tests -m "not entry and not slow and not ray" + # EOF + # STATISTICS_SCRIPT: | + # summary_line=$(tail -n1 test-levanter.log) + # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + # failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') + # passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') + # total_tests=$((failed_tests + passed_tests)) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # ARTIFACTS: | + # test-levanter.log + # secrets: inherit + + # COMMENT THIS # test-te: # needs: build-upstream-pax # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a @@ -618,37 +619,37 @@ jobs: # pytest-report.jsonl # secrets: inherit - test-gemma: - needs: build-gemma - uses: ./.github/workflows/_test_unit.yaml - if: inputs.ARCHITECTURE == 'amd64' - with: - TEST_NAME: gemma - EXECUTE: | - docker run --shm-size=1g --gpus all ${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }} \ - bash -ec \ - "cd /opt/gemma && pip install -e .[dev] && pytest ." 
| tee test-gemma.log - STATISTICS_SCRIPT: | - summary_line=$(tail -n1 test-gemma.log) - errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') - passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') - total_tests=$((failed_tests + passed_tests)) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - ARTIFACTS: | - test-gemma.log - secrets: inherit + # test-gemma: + # needs: build-gemma + # uses: ./.github/workflows/_test_unit.yaml + # if: inputs.ARCHITECTURE == 'amd64' + # with: + # TEST_NAME: gemma + # EXECUTE: | + # docker run --shm-size=1g --gpus all ${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }} \ + # bash -ec \ + # "cd /opt/gemma && pip install -e .[dev] && pytest ." 
| tee test-gemma.log + # STATISTICS_SCRIPT: | + # summary_line=$(tail -n1 test-gemma.log) + # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + # failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') + # passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') + # total_tests=$((failed_tests + passed_tests)) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # ARTIFACTS: | + # test-gemma.log + # secrets: inherit - test-maxtext: - needs: build-maxtext - if: inputs.ARCHITECTURE == 'amd64' # no arm64 gpu runners - uses: ./.github/workflows/_test_maxtext.yaml - with: - MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }} - secrets: inherit + # test-maxtext: + # needs: build-maxtext + # if: inputs.ARCHITECTURE == 'amd64' # no arm64 gpu runners + # uses: ./.github/workflows/_test_maxtext.yaml + # with: + # MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit test-axlearn-eks: needs: build-axlearn @@ -760,6 +761,13 @@ jobs: "badge-axlearn-test" summary.txt + test-axlearn-fuji-1B-slurm: + needs: build-axlearn + if: inputs.ARCHITECTURE == 'amd64' + uses: ./.github/workflows/_test_fuji_1B.yaml + with: + AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }} + secrets: inherit test-axlearn-fuji-1B: needs: build-axlearn diff --git a/.github/workflows/_test_fuji_1B.yaml b/.github/workflows/_test_fuji_1B.yaml new file mode 100644 index 000000000..54e72b53d --- /dev/null +++ b/.github/workflows/_test_fuji_1B.yaml @@ -0,0 +1,106 @@ +name: ~test MaxText functionality + +on: + workflow_call: + inputs: + AXLEARN_DOCKER_IMAGE: + type: string + description: Axlearn image from ghcr.io/nvidia + default: 
ghcr.io/nvidia/jax:axlearn + required: false + +jobs: + single-process-single-node: + runs-on: jumpbox + steps: + - name: Check out the repository under ${GITHUB_WORKSPACE} + uses: actions/checkout@v4 + + - name: Setup SSH + id: setup-ssh + uses: ./.github/actions/setup-ssh + with: + ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} + ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }} + + - name: Labels and metadata + id: meta + shell: bash -x -e {0} + run: | + IMAGE="$(echo ${{inputs.AXLEARN_IMAGE}} | sed 's/\//#/')" + TOTAL_TASKS=1 + MAX_GPUS_PER_NODE=8 + NODES=1 + GPUS_PER_NODE=8 + JOB_NAME=axlearn-fuji-1B-${GITHUB_RUN_ID}-${TEST_CASE_NAME} + LOG_FILE=/nfs/cluster/${JOB_NAME}.log + MODEL_PATH=/nfs/cluster/${JOB_NAME} + for var in IMAGE TEST_CASE_NAME TOTAL_TASKS NODES GPUS_PER_NODE JOB_NAME LOG_FILE MODEL_PATH; do + echo "$var=${!var}" >> $GITHUB_OUTPUT + done + + - name: Submit SLURM jobs over SSH + id: submit + shell: bash -O expand_aliases -x -e {0} + run: | + alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}' + sshx "date && hostname && sinfo" + sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }} + JOB=$(sshx sbatch --parsable << EOF + #!/bin/bash + #SBATCH --job-name=${{ steps.meta.outputs.JOB_NAME }} + #SBATCH --exclusive + #SBATCH --nodes=${{ steps.meta.outputs.NODES }} + #SBATCH --gpus-per-node=${{ steps.meta.outputs.GPUS_PER_NODE }} + #SBATCH --time=00:30:00 + #SBATCH --output=${{ steps.meta.outputs.LOG_FILE }} + + # preload enroot container using one task per node + time srun \ + --ntasks-per-node=1 \ + --container-name=runtime \ + --container-image=${{ steps.meta.outputs.IMAGE }} \ + true + + # run job with tasks on each node sharing one container + time srun \ + --ntasks=${{ steps.meta.outputs.TOTAL_TASKS }} \ + --container-name=runtime \ + --container-mounts=${{ steps.meta.outputs.MODEL_PATH }}:/output \ + --container-entrypoint \ + test-fuji-1B.sh + EOF + ) + + echo "SLURM_JOB_ID=${JOB}" >> 
$GITHUB_OUTPUT + + . .github/workflows/scripts/wait_for_slurm_job.sh + + wait_for_slurm_job ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} ${JOB} + + # Gather job info + SLURM_STATE=$(sshx sacct -j $JOB --format=State --parsable2 --noheader |& head -n 1) + SLURM_EXITCODE=$(sshx sacct -j $JOB --format=exitcode --parsable2 --noheader | sort -r -u | head -1 | cut -f 1 -d":" | sed 's/ //g') + echo "SLURM Job state is ${SLURM_STATE}" + echo "SLURM Job exit code is ${SLURM_EXITCODE}" + echo "SLURM_STATE=${SLURM_STATE}" >> "$GITHUB_OUTPUT" + echo "SLURM_EXITCODE=${SLURM_EXITCODE}" >> "$GITHUB_OUTPUT" + set -x + + + - name: Remove orphaned SLURM job if the CI job is canceled + if: cancelled() + shell: bash -x -e {0} + run: | + ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \ + scancel ${{ steps.submit.outputs.SLURM_JOB_ID }} + + - name: Write SLURM job status to file + shell: bash -x -e {0} + run: | + python << EOF + import json + with open("output/${{ steps.meta.outputs.TEST_CASE_NAME }}-status.json", "w") as f: + dump = {'state': "${{ steps.submit.outputs.SLURM_STATE }}", 'exitcode': "${{ steps.submit.outputs.SLURM_EXITCODE }}"} + json.dump(dump, f) + EOF \ No newline at end of file From 2dd21ad9ce01eaee4a31611214c86036dd139e4c Mon Sep 17 00:00:00 2001 From: Steboss Date: Fri, 21 Feb 2025 18:50:52 +0000 Subject: [PATCH 51/89] fix naming --- .github/workflows/_ci.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 29ac71306..38e112e24 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -775,8 +775,8 @@ jobs: runs-on: eks env: AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }} - JOB_NAME: axlearn-fuji-1B-${{ github.run_id }} - TOKEN_NAME: axlearn-fuji-1B-${{ github.run_id }}-token + JOB_NAME: axlearn-fuji-1b-${{ github.run_id }} + TOKEN_NAME: axlearn-fuji-1b-${{ github.run_id }}-token steps: - name: 
Check out the repository uses: actions/checkout@v4 From e35043492e8dbe4d6b8acd70d63d1bfbf0e3b455 Mon Sep 17 00:00:00 2001 From: Steboss Date: Fri, 21 Feb 2025 18:58:21 +0000 Subject: [PATCH 52/89] fix indt --- .github/workflows/_test_fuji_1B.yaml | 76 ++++++++++++++-------------- 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/.github/workflows/_test_fuji_1B.yaml b/.github/workflows/_test_fuji_1B.yaml index 54e72b53d..cd058b59a 100644 --- a/.github/workflows/_test_fuji_1B.yaml +++ b/.github/workflows/_test_fuji_1B.yaml @@ -8,7 +8,7 @@ on: description: Axlearn image from ghcr.io/nvidia default: ghcr.io/nvidia/jax:axlearn required: false - + jobs: single-process-single-node: runs-on: jumpbox @@ -43,49 +43,49 @@ jobs: id: submit shell: bash -O expand_aliases -x -e {0} run: | - alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}' - sshx "date && hostname && sinfo" - sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }} - JOB=$(sshx sbatch --parsable << EOF - #!/bin/bash - #SBATCH --job-name=${{ steps.meta.outputs.JOB_NAME }} - #SBATCH --exclusive - #SBATCH --nodes=${{ steps.meta.outputs.NODES }} - #SBATCH --gpus-per-node=${{ steps.meta.outputs.GPUS_PER_NODE }} - #SBATCH --time=00:30:00 - #SBATCH --output=${{ steps.meta.outputs.LOG_FILE }} + alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}' + sshx "date && hostname && sinfo" + sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }} + JOB=$(sshx sbatch --parsable << EOF + #!/bin/bash + #SBATCH --job-name=${{ steps.meta.outputs.JOB_NAME }} + #SBATCH --exclusive + #SBATCH --nodes=${{ steps.meta.outputs.NODES }} + #SBATCH --gpus-per-node=${{ steps.meta.outputs.GPUS_PER_NODE }} + #SBATCH --time=00:30:00 + #SBATCH --output=${{ steps.meta.outputs.LOG_FILE }} - # preload enroot container using one task per node - time srun \ - --ntasks-per-node=1 \ - --container-name=runtime \ - --container-image=${{ 
steps.meta.outputs.IMAGE }} \ - true + # preload enroot container using one task per node + time srun \ + --ntasks-per-node=1 \ + --container-name=runtime \ + --container-image=${{ steps.meta.outputs.IMAGE }} \ + true - # run job with tasks on each node sharing one container - time srun \ - --ntasks=${{ steps.meta.outputs.TOTAL_TASKS }} \ - --container-name=runtime \ - --container-mounts=${{ steps.meta.outputs.MODEL_PATH }}:/output \ - --container-entrypoint \ - test-fuji-1B.sh - EOF - ) + # run job with tasks on each node sharing one container + time srun \ + --ntasks=${{ steps.meta.outputs.TOTAL_TASKS }} \ + --container-name=runtime \ + --container-mounts=${{ steps.meta.outputs.MODEL_PATH }}:/output \ + --container-entrypoint \ + test-fuji-1B.sh + EOF + ) - echo "SLURM_JOB_ID=${JOB}" >> $GITHUB_OUTPUT + echo "SLURM_JOB_ID=${JOB}" >> $GITHUB_OUTPUT - . .github/workflows/scripts/wait_for_slurm_job.sh + . .github/workflows/scripts/wait_for_slurm_job.sh - wait_for_slurm_job ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} ${JOB} + wait_for_slurm_job ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} ${JOB} - # Gather job info - SLURM_STATE=$(sshx sacct -j $JOB --format=State --parsable2 --noheader |& head -n 1) - SLURM_EXITCODE=$(sshx sacct -j $JOB --format=exitcode --parsable2 --noheader | sort -r -u | head -1 | cut -f 1 -d":" | sed 's/ //g') - echo "SLURM Job state is ${SLURM_STATE}" - echo "SLURM Job exit code is ${SLURM_EXITCODE}" - echo "SLURM_STATE=${SLURM_STATE}" >> "$GITHUB_OUTPUT" - echo "SLURM_EXITCODE=${SLURM_EXITCODE}" >> "$GITHUB_OUTPUT" - set -x + # Gather job info + SLURM_STATE=$(sshx sacct -j $JOB --format=State --parsable2 --noheader |& head -n 1) + SLURM_EXITCODE=$(sshx sacct -j $JOB --format=exitcode --parsable2 --noheader | sort -r -u | head -1 | cut -f 1 -d":" | sed 's/ //g') + echo "SLURM Job state is ${SLURM_STATE}" + echo "SLURM Job exit code is ${SLURM_EXITCODE}" + echo "SLURM_STATE=${SLURM_STATE}" >> 
"$GITHUB_OUTPUT" + echo "SLURM_EXITCODE=${SLURM_EXITCODE}" >> "$GITHUB_OUTPUT" + set -x - name: Remove orphaned SLURM job if the CI job is canceled From 2c8409d8d02847cc32b1fbefe37c64ced6666d43 Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 24 Feb 2025 09:45:37 +0000 Subject: [PATCH 53/89] set k8s jobs to run for 20 min --- .github/eks-workflow-files/axlearn/axlearn-1B-model.yml | 2 ++ .github/eks-workflow-files/axlearn/axlearn-3B-model.yml | 2 ++ 2 files changed, 4 insertions(+) diff --git a/.github/eks-workflow-files/axlearn/axlearn-1B-model.yml b/.github/eks-workflow-files/axlearn/axlearn-1B-model.yml index de1d77aa8..60ac97e3a 100644 --- a/.github/eks-workflow-files/axlearn/axlearn-1B-model.yml +++ b/.github/eks-workflow-files/axlearn/axlearn-1B-model.yml @@ -5,6 +5,8 @@ metadata: labels: kueue.x-k8s.io/queue-name: p5-queue spec: + # the job will run for 20 mins, as we can' tset max_steps + activeDeadlineSeconds: 1200 completions: 1 parallelism: 1 template: diff --git a/.github/eks-workflow-files/axlearn/axlearn-3B-model.yml b/.github/eks-workflow-files/axlearn/axlearn-3B-model.yml index 419d8bb0b..2461c097a 100644 --- a/.github/eks-workflow-files/axlearn/axlearn-3B-model.yml +++ b/.github/eks-workflow-files/axlearn/axlearn-3B-model.yml @@ -5,6 +5,8 @@ metadata: labels: kueue.x-k8s.io/queue-name: p5-queue spec: + # the job will run for 20 mins, as we can' tset max_steps + activeDeadlineSeconds: 1200 completions: 1 parallelism: 1 template: From 477735935844ff90204dd8e75200ef875863d1aa Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 24 Feb 2025 11:57:13 +0000 Subject: [PATCH 54/89] try a test on fuji 7B params --- .github/container/Dockerfile.axlearn | 1 + .github/container/test-fuji-1B.sh | 2 +- .../axlearn/axlearn-3B-model.yml | 2 +- .github/workflows/_ci.yaml | 14 ++- .github/workflows/_test_fuji_7B.yaml | 106 ++++++++++++++++++ 5 files changed, 120 insertions(+), 5 deletions(-) mode change 100644 => 100755 .github/container/test-fuji-1B.sh create mode 
100644 .github/workflows/_test_fuji_7B.yaml diff --git a/.github/container/Dockerfile.axlearn b/.github/container/Dockerfile.axlearn index c441bdc68..5c51697a2 100644 --- a/.github/container/Dockerfile.axlearn +++ b/.github/container/Dockerfile.axlearn @@ -35,6 +35,7 @@ EOF ADD test-axlearn.sh /usr/local/bin ADD test-fuji-1B.sh /usr/local/bin +ADD test-fuji-7B.sh /usr/local/bin ############################################################################### ## Install accumulated packages from the base image and the previous stage ############################################################################### diff --git a/.github/container/test-fuji-1B.sh b/.github/container/test-fuji-1B.sh old mode 100644 new mode 100755 index 94042de13..9cd9faebd --- a/.github/container/test-fuji-1B.sh +++ b/.github/container/test-fuji-1B.sh @@ -1,6 +1,6 @@ #! /bin/bash BASEDIR="/opt/host/" -CONFIG="fuji-7B-v3-flash" +CONFIG="fuji-1B-v3-flash" POSTFIX=${POSTFIX:=""} diff --git a/.github/eks-workflow-files/axlearn/axlearn-3B-model.yml b/.github/eks-workflow-files/axlearn/axlearn-3B-model.yml index 2461c097a..39fbce6be 100644 --- a/.github/eks-workflow-files/axlearn/axlearn-3B-model.yml +++ b/.github/eks-workflow-files/axlearn/axlearn-3B-model.yml @@ -39,7 +39,7 @@ spec: --xla_gpu_enable_pipelined_reduce_scatter=true --xla_gpu_enable_nccl_comm_splitting=false" - export XLA_PYTHON_CLEINT_PREALLOCATE=false + export XLA_PYTHON_CLIENT_PREALLOCATE=false export TF_GPU_ALLOCATOR=cuda_malloc_async export XLA_FLAGS="${XLA_BASE_FLAGS}" diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 38e112e24..6634b9661 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -761,15 +761,23 @@ jobs: "badge-axlearn-test" summary.txt - test-axlearn-fuji-1B-slurm: + # test-axlearn-fuji-1B-slurm: + # needs: build-axlearn + # if: inputs.ARCHITECTURE == 'amd64' + # uses: ./.github/workflows/_test_fuji_1B.yaml + # with: + # AXLEARN_DOCKER_IMAGE: ${{ 
needs.build-axlearn.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit + + test-axlearn-fuji-7B-slurm: needs: build-axlearn if: inputs.ARCHITECTURE == 'amd64' - uses: ./.github/workflows/_test_fuji_1B.yaml + uses: ./.github/workflows/_test_fuji_7B.yaml with: AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }} secrets: inherit - test-axlearn-fuji-1B: + test-axlearn-fuji-1B-eks: needs: build-axlearn if: inputs.ARCHITECTURE == 'amd64' runs-on: eks diff --git a/.github/workflows/_test_fuji_7B.yaml b/.github/workflows/_test_fuji_7B.yaml new file mode 100644 index 000000000..544de815a --- /dev/null +++ b/.github/workflows/_test_fuji_7B.yaml @@ -0,0 +1,106 @@ +name: ~test MaxText functionality + +on: + workflow_call: + inputs: + AXLEARN_DOCKER_IMAGE: + type: string + description: Axlearn image from ghcr.io/nvidia + default: ghcr.io/nvidia/jax:axlearn + required: false + +jobs: + single-process-single-node: + runs-on: jumpbox + steps: + - name: Check out the repository under ${GITHUB_WORKSPACE} + uses: actions/checkout@v4 + + - name: Setup SSH + id: setup-ssh + uses: ./.github/actions/setup-ssh + with: + ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} + ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }} + + - name: Labels and metadata + id: meta + shell: bash -x -e {0} + run: | + IMAGE="$(echo ${{inputs.AXLEARN_IMAGE}} | sed 's/\//#/')" + TOTAL_TASKS=1 + MAX_GPUS_PER_NODE=8 + NODES=1 + GPUS_PER_NODE=8 + JOB_NAME=axlearn-fuji-7B-${GITHUB_RUN_ID}-${TEST_CASE_NAME} + LOG_FILE=/nfs/cluster/${JOB_NAME}.log + MODEL_PATH=/nfs/cluster/${JOB_NAME} + for var in IMAGE TEST_CASE_NAME TOTAL_TASKS NODES GPUS_PER_NODE JOB_NAME LOG_FILE MODEL_PATH; do + echo "$var=${!var}" >> $GITHUB_OUTPUT + done + + - name: Submit SLURM jobs over SSH + id: submit + shell: bash -O expand_aliases -x -e {0} + run: | + alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}' + sshx "date && hostname && sinfo" + sshx mkdir -p ${{ 
steps.meta.outputs.MODEL_PATH }} + JOB=$(sshx sbatch --parsable << EOF + #!/bin/bash + #SBATCH --job-name=${{ steps.meta.outputs.JOB_NAME }} + #SBATCH --exclusive + #SBATCH --nodes=${{ steps.meta.outputs.NODES }} + #SBATCH --gpus-per-node=${{ steps.meta.outputs.GPUS_PER_NODE }} + #SBATCH --time=00:30:00 + #SBATCH --output=${{ steps.meta.outputs.LOG_FILE }} + + # preload enroot container using one task per node + time srun \ + --ntasks-per-node=1 \ + --container-name=runtime \ + --container-image=${{ steps.meta.outputs.IMAGE }} \ + true + + # run job with tasks on each node sharing one container + time srun \ + --ntasks=${{ steps.meta.outputs.TOTAL_TASKS }} \ + --container-name=runtime \ + --container-mounts=${{ steps.meta.outputs.MODEL_PATH }}:/output \ + --container-entrypoint \ + test-fuji-7B.sh + EOF + ) + + echo "SLURM_JOB_ID=${JOB}" >> $GITHUB_OUTPUT + + . .github/workflows/scripts/wait_for_slurm_job.sh + + wait_for_slurm_job ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} ${JOB} + + # Gather job info + SLURM_STATE=$(sshx sacct -j $JOB --format=State --parsable2 --noheader |& head -n 1) + SLURM_EXITCODE=$(sshx sacct -j $JOB --format=exitcode --parsable2 --noheader | sort -r -u | head -1 | cut -f 1 -d":" | sed 's/ //g') + echo "SLURM Job state is ${SLURM_STATE}" + echo "SLURM Job exit code is ${SLURM_EXITCODE}" + echo "SLURM_STATE=${SLURM_STATE}" >> "$GITHUB_OUTPUT" + echo "SLURM_EXITCODE=${SLURM_EXITCODE}" >> "$GITHUB_OUTPUT" + set -x + + + - name: Remove orphaned SLURM job if the CI job is canceled + if: cancelled() + shell: bash -x -e {0} + run: | + ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \ + scancel ${{ steps.submit.outputs.SLURM_JOB_ID }} + + - name: Write SLURM job status to file + shell: bash -x -e {0} + run: | + python << EOF + import json + with open("output/${{ steps.meta.outputs.TEST_CASE_NAME }}-status.json", "w") as f: + dump = {'state': "${{ steps.submit.outputs.SLURM_STATE }}", 'exitcode': "${{ 
steps.submit.outputs.SLURM_EXITCODE }}"} + json.dump(dump, f) + EOF \ No newline at end of file From 1f3e1e426c043ca0b0d98a2164f9f03d3bbc51cd Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 24 Feb 2025 12:39:14 +0000 Subject: [PATCH 55/89] upload test script for testing --- .github/container/test-fuji-7B.sh | 40 +++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100755 .github/container/test-fuji-7B.sh diff --git a/.github/container/test-fuji-7B.sh b/.github/container/test-fuji-7B.sh new file mode 100755 index 000000000..e2ff8dde6 --- /dev/null +++ b/.github/container/test-fuji-7B.sh @@ -0,0 +1,40 @@ +#! /bin/bash +BASEDIR="/opt/host/" +CONFIG="fuji-7B-v2-flash" +POSTFIX=${POSTFIX:=""} + + +export XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true + --xla_gpu_graph_level=0 + --xla_gpu_enable_highest_priority_async_stream=true + --xla_gpu_all_reduce_combine_threshold_bytes=1073741824 + --xla_gpu_all_gather_combine_threshold_bytes=1073741824 + --xla_gpu_reduce_scatter_combine_threshold_bytes=1073741824 + --xla_gpu_enable_pipelined_all_gather=true + --xla_gpu_enable_pipelined_reduce_scatter=true + --xla_gpu_enable_pipelined_all_reduce=true + --xla_gpu_enable_while_loop_double_buffering=true + --xla_gpu_enable_triton_gemm=false + --xla_gpu_enable_all_gather_combine_by_dim=false + --xla_gpu_enable_reduce_scatter_combine_by_dim=false + --xla_disable_hlo_passes=rematerialization" + +export XLA_PYTHON_CLIENT_PREALLOCATE=false +export TF_GPU_ALLOCATOR=cuda_malloc_async +export NCCL_BUFFSIZE=8388608 +export NCCL_P2P_NET_CHUNKSIZE=524288 +export NCCL_LAUNCH_MODE=GROUP +export NCCL_DEBUG=INFO +LOG_DIF=${BASEDIR}/logs +TRAINER_DIR=${LOG_DIF}/${CONFIG}_N${SLURM_JOB_NUM_NODES}_n${SLURM_NTASKS}/trainer-logs +mkdir -p ${TRAINER_DIR} + +export MP_ARGS="--num_processes=${SLURM_NTASKS} --distributed_coordinator=127.0.0.1:8080 --process_id=${SLURM_PROCID}" + +python3 -m axlearn.common.launch_trainer_main \ + --module=text.gpt.c4_trainer \ + 
--config=${CONFIG} \ + --trainer_dir=${TRAINER_DIR} \ + --data_dir=gs://axlearn-public/tensorflow_datasets \ + --jax_backend=gpu \ + ${MP_ARGS} From ea2a2651112a2abd5d7ce847170d11a4cddefa69 Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 24 Feb 2025 14:45:35 +0000 Subject: [PATCH 56/89] reset the 7B --- .github/container/Dockerfile.axlearn | 9 +++++++-- .github/container/test-fuji-1B.sh | 2 +- .github/workflows/_ci.yaml | 13 ++----------- 3 files changed, 10 insertions(+), 14 deletions(-) diff --git a/.github/container/Dockerfile.axlearn b/.github/container/Dockerfile.axlearn index 5c51697a2..2c6dad33f 100644 --- a/.github/container/Dockerfile.axlearn +++ b/.github/container/Dockerfile.axlearn @@ -24,8 +24,13 @@ portpicker==1.6.0 seqio==0.0.18 protobuf==3.20.3 pytest>=7.4.3 -tensorflow-cpu REQUIREMENTS + # Only append "tensorflow-cpu" if running on x86_64 + if [ "$(uname -m)" = "x86_64" ]; then + echo "tensorflow-cpu" >> /opt/pip-tools.d/requirements-axlearn.in + else + echo "Skipping TF on $(uname -m)" + fi EOF @@ -35,7 +40,7 @@ EOF ADD test-axlearn.sh /usr/local/bin ADD test-fuji-1B.sh /usr/local/bin -ADD test-fuji-7B.sh /usr/local/bin + ############################################################################### ## Install accumulated packages from the base image and the previous stage ############################################################################### diff --git a/.github/container/test-fuji-1B.sh b/.github/container/test-fuji-1B.sh index 9cd9faebd..8208b4003 100755 --- a/.github/container/test-fuji-1B.sh +++ b/.github/container/test-fuji-1B.sh @@ -24,7 +24,7 @@ export TF_GPU_ALLOCATOR=cuda_malloc_async export NCCL_BUFFSIZE=8388608 export NCCL_P2P_NET_CHUNKSIZE=524288 export NCCL_LAUNCH_MODE=GROUP -export NCCL_DEBUG=INFO + LOG_DIF=${BASEDIR}/logs TRAINER_DIR=${LOG_DIF}/${CONFIG}_N${SLURM_JOB_NUM_NODES}_n${SLURM_NTASKS}/trainer-logs mkdir -p ${TRAINER_DIR} diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 
6634b9661..eef5a78bb 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -761,18 +761,10 @@ jobs: "badge-axlearn-test" summary.txt - # test-axlearn-fuji-1B-slurm: - # needs: build-axlearn - # if: inputs.ARCHITECTURE == 'amd64' - # uses: ./.github/workflows/_test_fuji_1B.yaml - # with: - # AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }} - # secrets: inherit - - test-axlearn-fuji-7B-slurm: + test-axlearn-fuji-1B-slurm: needs: build-axlearn if: inputs.ARCHITECTURE == 'amd64' - uses: ./.github/workflows/_test_fuji_7B.yaml + uses: ./.github/workflows/_test_fuji_1B.yaml with: AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }} secrets: inherit @@ -820,4 +812,3 @@ jobs: if: ${{ always() }} with: token-name: ${{ env.TOKEN_NAME }} - From 5fd34003adf20484ba3f3fd82d9a990f3b964d08 Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 24 Feb 2025 17:31:55 +0000 Subject: [PATCH 57/89] address comments --- .github/actions/ghcr-login/action.yml | 31 ----- .../action.yml | 8 +- .../action.yml | 19 ++- .github/actions/submit-k8s-job/action.yml | 40 ------- .github/actions/with-post-step/action.yml | 2 +- .github/container/Dockerfile.axlearn | 9 +- .github/container/test-axlearn.sh | 31 +---- .github/container/test-fuji-1B.sh | 12 +- .github/container/test-fuji-7B.sh | 40 ------- .../axlearn/axlearn-1B-model.yml | 4 +- .../axlearn/axlearn-3B-model.yml | 80 ------------- .github/workflows/_ci.yaml | 77 ++++-------- .github/workflows/_test_fuji_1B.yaml | 106 ----------------- .github/workflows/_test_fuji_7B.yaml | 106 ----------------- .github/workflows/_test_nccl.yaml | 110 ++++++++++++++++++ .github/workflows/nccl-k8s.yaml | 109 +---------------- 16 files changed, 175 insertions(+), 609 deletions(-) delete mode 100644 .github/actions/ghcr-login/action.yml rename .github/actions/{delete-ghcr-token => store-delete-k8s-ghcr}/action.yml (53%) rename .github/actions/{delete-k8s-job => submit-delete-k8s-job}/action.yml 
(51%) delete mode 100644 .github/actions/submit-k8s-job/action.yml delete mode 100755 .github/container/test-fuji-7B.sh delete mode 100644 .github/eks-workflow-files/axlearn/axlearn-3B-model.yml delete mode 100644 .github/workflows/_test_fuji_1B.yaml delete mode 100644 .github/workflows/_test_fuji_7B.yaml create mode 100644 .github/workflows/_test_nccl.yaml diff --git a/.github/actions/ghcr-login/action.yml b/.github/actions/ghcr-login/action.yml deleted file mode 100644 index 2c62591ed..000000000 --- a/.github/actions/ghcr-login/action.yml +++ /dev/null @@ -1,31 +0,0 @@ -name: Checkout, GHCR login, K8s secret -description: Performs repository checkout, logs into GitHub Container Registry, and stores the token as a Kubernetes secret. - -inputs: - docker-username: - description: Username for GHCR - required: true - docker-password: - description: Password (e.g., GITHUB_TOKEN) - required: true - token-name: - description: Name of the K8s secret to create - required: true - -runs: - using: "composite" - steps: - - name: Login to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: "ghcr.io" - username: ${{ inputs.docker-username }} - password: ${{ inputs.docker-password }} - - - name: Store GitHub Container Registry token as Kubernetes secret - shell: bash - run: | - kubectl create secret generic \ - ${{ inputs.token-name }} \ - --from-file=.dockerconfigjson=$HOME/.docker/config.json \ - --type=kubernetes.io/dockerconfigjson diff --git a/.github/actions/delete-ghcr-token/action.yml b/.github/actions/store-delete-k8s-ghcr/action.yml similarity index 53% rename from .github/actions/delete-ghcr-token/action.yml rename to .github/actions/store-delete-k8s-ghcr/action.yml index c6069908b..51eb8b625 100644 --- a/.github/actions/delete-ghcr-token/action.yml +++ b/.github/actions/store-delete-k8s-ghcr/action.yml @@ -13,6 +13,10 @@ runs: uses: ./.github/actions/with-post-step with: main: | - echo "Main post step action: no action required" + # Store 
GitHub Container Registry token as Kubernetes secret + kubectl create secret generic \ + ${{ inputs.token-name }} \ + --from-file=.dockerconfigjson=$HOME/.docker/config.json \ + --type=kubernetes.io/dockerconfigjson post: | - kubectl delete secret ${{ inputs.token-name }} + kubectl delete secret ${{ inputs.token-name }} \ No newline at end of file diff --git a/.github/actions/delete-k8s-job/action.yml b/.github/actions/submit-delete-k8s-job/action.yml similarity index 51% rename from .github/actions/delete-k8s-job/action.yml rename to .github/actions/submit-delete-k8s-job/action.yml index 74f1e3129..e97d4b921 100644 --- a/.github/actions/delete-k8s-job/action.yml +++ b/.github/actions/submit-delete-k8s-job/action.yml @@ -5,6 +5,9 @@ inputs: job-name: description: The job name to delete required: true + job-config-file: + description: Path to the Kubernetes job YAML + required: true runs: @@ -14,7 +17,21 @@ runs: uses: ./.github/actions/with-post-step with: main: | - echo "Main post step action: no action required" + echo "Submit K8s job" + kubectl apply -f "${{ inputs.job-config-file }}" + # wait for the job to be created + kubectl wait --for=create job/${{ inputs.job-name }} --timeout=60s + + # wait for the 'spec.suspend' field to become false. Necessary for kueue + kubectl wait --for=jsonpath='{.spec.suspend}=false' job/${{ inputs.job-name }} --timeout=7200s + + while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ inputs.job-name }} --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do + echo "Waiting for pods to start..." 
+ sleep 20 + done + + # stream the logs + kubectl logs --all-containers=true --all-pods=true --follow job/${{ inputs.job-name }} post: | pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ inputs.job-name }} -o jsonpath='{.items[*].metadata.name}') diff --git a/.github/actions/submit-k8s-job/action.yml b/.github/actions/submit-k8s-job/action.yml deleted file mode 100644 index 49ddad748..000000000 --- a/.github/actions/submit-k8s-job/action.yml +++ /dev/null @@ -1,40 +0,0 @@ -name: Submit & Stream K8s Job -description: Submits a Kubernetes job and then streams its logs to GitHub Actions. - -inputs: - job-config-file: - description: Path to the Kubernetes job YAML - required: true - job-name: - description: The job name - required: true - -runs: - using: "composite" - steps: - - name: Submit Kubernetes job - shell: bash - run: | - kubectl apply -f "${{ inputs.job-config-file }}" - - - name: Wait for job to be un-suspended (Kueue) - shell: bash - run: | - # wait for the job to be created - kubectl wait --for=create job/${{ inputs.job-name }} --timeout=60s - - # wait for the 'spec.suspend' field to become false. Necessary for kueue - kubectl wait --for=jsonpath='{.spec.suspend}=false' job/${{ inputs.job-name }} --timeout=3600s - - - name: Wait for pods to start - shell: bash - run: | - while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ inputs.job-name }} --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do - echo "Waiting for pods to start..." 
- sleep 20 - done - - - name: Stream Kubernetes job output - shell: bash - run: | - kubectl logs --all-containers=true --all-pods=true --follow job/${{ inputs.job-name }} diff --git a/.github/actions/with-post-step/action.yml b/.github/actions/with-post-step/action.yml index 9816ee888..69c2a6eff 100644 --- a/.github/actions/with-post-step/action.yml +++ b/.github/actions/with-post-step/action.yml @@ -39,4 +39,4 @@ inputs: runs: using: 'node20' main: 'main.js' - post: 'main.js' \ No newline at end of file + post: 'main.js' diff --git a/.github/container/Dockerfile.axlearn b/.github/container/Dockerfile.axlearn index 2c6dad33f..039f767ee 100644 --- a/.github/container/Dockerfile.axlearn +++ b/.github/container/Dockerfile.axlearn @@ -10,10 +10,10 @@ FROM ${BASE_IMAGE} AS mealkit ARG URLREF_AXLEARN ARG SRC_PATH_AXLEARN -RUN <<"EOF" bash -ex - git clone "${URLREF_AXLEARN}" "${SRC_PATH_AXLEARN}" -EOF +RUN git-clone.sh "${URLREF_AXLEARN}" "${SRC_PATH_AXLEARN}" +# these packages are needed to run axlearn tests +# https://github.com/apple/axlearn/blob/main/pyproject.toml as reference RUN <<"EOF" bash -ex echo "-e ${SRC_PATH_AXLEARN}" > /opt/pip-tools.d/requirements-axlearn.in cat <> /opt/pip-tools.d/requirements-axlearn.in @@ -38,8 +38,7 @@ EOF ## Add test script to the path ############################################################################### -ADD test-axlearn.sh /usr/local/bin -ADD test-fuji-1B.sh /usr/local/bin +ADD test-axlearn.sh test-fuji-1B.sh /usr/local/bin/ ############################################################################### ## Install accumulated packages from the base image and the previous stage diff --git a/.github/container/test-axlearn.sh b/.github/container/test-axlearn.sh index c62f36f5b..d5e783f56 100755 --- a/.github/container/test-axlearn.sh +++ b/.github/container/test-axlearn.sh @@ -10,19 +10,16 @@ usage() { echo " OPTIONS DESCRIPTION" echo " -d, --directory DIR Directory to run tests in." echo " Default: 'axlearn/axlearn/common'." 
- echo " -c, --cuda-devices DEVICES CUDA devices to use. Default: '0,1,2,3,4,5,6,7'." echo " -t, --test-files FILES Pattern for test files to run." echo " Default: '*_test.py'." echo " -o, --output DIRECTORY Output directory for logs and summary." echo " Default: 'test_runs/'." - echo " -k, --k8s Whether to run on a Kubernetes cluster." echo " -h, --help Show this help message and exit." exit 1 } # Default values DIR='axlearn/axlearn/common' -CUDA_DEVICES='0,1,2,3,4,5,6,7' TEST_FILES=() OUTPUT_DIRECTORY='' K8S=false @@ -39,14 +36,6 @@ while [[ $# -gt 0 ]]; do DIR="$2" shift 2 ;; - -c|--cuda-devices) - if [[ -z "$2" ]]; then - echo "Error: --cuda-devices requires an argument." - usage - fi - CUDA_DEVICES="$2" - shift 2 - ;; -t|--test-files) shift # Collect all arguments until the next option (starting with '-') @@ -69,10 +58,6 @@ while [[ $# -gt 0 ]]; do OUTPUT_DIRECTORY="$2" shift 2 ;; - -k|--k8s) - K8S=true - shift - ;; -h|--help) usage ;; @@ -95,7 +80,6 @@ mkdir -p "${LOG_DIRECTORY}" # Print out config for sanity check echo "Configuration:" echo " Directory: $DIR" -echo " CUDA Devices: $CUDA_DEVICES" if [ "${#TEST_FILES[@]}" -gt 0 ]; then echo " Test Files:" for f in "${TEST_FILES[@]}"; do @@ -106,23 +90,14 @@ else fi echo " Output Directory: $OUTPUT_DIRECTORY" echo " Kubernetes mode: $K8S" -echo "" - cd "$DIR" || exit 1 -# Set CUDA devices -export CUDA_VISIBLE_DEVICES="${CUDA_DEVICES}" -echo "Using CUDA devices: $CUDA_VISIBLE_DEVICES" - echo "Running tests..." 
-# If we are on Kubernetes, install torch for cpu only -if [ "$K8S" = true ]; then - pip install torch --extra-index-url https://download.pytorch.org/whl/cpu - pip install transformers - pip install scikit-learn timm -fi +pip install torch --extra-index-url https://download.pytorch.org/whl/cpu +pip install transformers scikit-learn timm + if [ "${#TEST_FILES[@]}" -eq 0 ]; then TEST_FILES=("*_test.py") diff --git a/.github/container/test-fuji-1B.sh b/.github/container/test-fuji-1B.sh index 8208b4003..9018f37de 100755 --- a/.github/container/test-fuji-1B.sh +++ b/.github/container/test-fuji-1B.sh @@ -3,9 +3,7 @@ BASEDIR="/opt/host/" CONFIG="fuji-1B-v3-flash" POSTFIX=${POSTFIX:=""} - -export XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true - --xla_gpu_graph_level=0 +BASE_XLA_FLAGS=${BASE_XLA_FLAGS:---xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_highest_priority_async_stream=true --xla_gpu_all_reduce_combine_threshold_bytes=1073741824 --xla_gpu_all_gather_combine_threshold_bytes=1073741824 @@ -17,13 +15,9 @@ export XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_triton_gemm=false --xla_gpu_enable_all_gather_combine_by_dim=false --xla_gpu_enable_reduce_scatter_combine_by_dim=false - --xla_disable_hlo_passes=rematerialization" + --xla_disable_hlo_passes=rematerialization} -export XLA_PYTHON_CLIENT_PREALLOCATE=false -export TF_GPU_ALLOCATOR=cuda_malloc_async -export NCCL_BUFFSIZE=8388608 -export NCCL_P2P_NET_CHUNKSIZE=524288 -export NCCL_LAUNCH_MODE=GROUP +export XLA_FLAGS="$BASE_XLA_FLAGS ${XLA_FLAGS:-}" LOG_DIF=${BASEDIR}/logs TRAINER_DIR=${LOG_DIF}/${CONFIG}_N${SLURM_JOB_NUM_NODES}_n${SLURM_NTASKS}/trainer-logs diff --git a/.github/container/test-fuji-7B.sh b/.github/container/test-fuji-7B.sh deleted file mode 100755 index e2ff8dde6..000000000 --- a/.github/container/test-fuji-7B.sh +++ /dev/null @@ -1,40 +0,0 @@ -#! 
/bin/bash -BASEDIR="/opt/host/" -CONFIG="fuji-7B-v2-flash" -POSTFIX=${POSTFIX:=""} - - -export XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true - --xla_gpu_graph_level=0 - --xla_gpu_enable_highest_priority_async_stream=true - --xla_gpu_all_reduce_combine_threshold_bytes=1073741824 - --xla_gpu_all_gather_combine_threshold_bytes=1073741824 - --xla_gpu_reduce_scatter_combine_threshold_bytes=1073741824 - --xla_gpu_enable_pipelined_all_gather=true - --xla_gpu_enable_pipelined_reduce_scatter=true - --xla_gpu_enable_pipelined_all_reduce=true - --xla_gpu_enable_while_loop_double_buffering=true - --xla_gpu_enable_triton_gemm=false - --xla_gpu_enable_all_gather_combine_by_dim=false - --xla_gpu_enable_reduce_scatter_combine_by_dim=false - --xla_disable_hlo_passes=rematerialization" - -export XLA_PYTHON_CLIENT_PREALLOCATE=false -export TF_GPU_ALLOCATOR=cuda_malloc_async -export NCCL_BUFFSIZE=8388608 -export NCCL_P2P_NET_CHUNKSIZE=524288 -export NCCL_LAUNCH_MODE=GROUP -export NCCL_DEBUG=INFO -LOG_DIF=${BASEDIR}/logs -TRAINER_DIR=${LOG_DIF}/${CONFIG}_N${SLURM_JOB_NUM_NODES}_n${SLURM_NTASKS}/trainer-logs -mkdir -p ${TRAINER_DIR} - -export MP_ARGS="--num_processes=${SLURM_NTASKS} --distributed_coordinator=127.0.0.1:8080 --process_id=${SLURM_PROCID}" - -python3 -m axlearn.common.launch_trainer_main \ - --module=text.gpt.c4_trainer \ - --config=${CONFIG} \ - --trainer_dir=${TRAINER_DIR} \ - --data_dir=gs://axlearn-public/tensorflow_datasets \ - --jax_backend=gpu \ - ${MP_ARGS} diff --git a/.github/eks-workflow-files/axlearn/axlearn-1B-model.yml b/.github/eks-workflow-files/axlearn/axlearn-1B-model.yml index 60ac97e3a..76b767089 100644 --- a/.github/eks-workflow-files/axlearn/axlearn-1B-model.yml +++ b/.github/eks-workflow-files/axlearn/axlearn-1B-model.yml @@ -5,7 +5,7 @@ metadata: labels: kueue.x-k8s.io/queue-name: p5-queue spec: - # the job will run for 20 mins, as we can' tset max_steps + # the job will run for 20 mins, as we can't set max_steps activeDeadlineSeconds: 
1200 completions: 1 parallelism: 1 @@ -13,7 +13,7 @@ spec: spec: restartPolicy: Never containers: - - name: axlearn + - name: axlearn-fuji-1B image: PLACEHOLDER command: - bash diff --git a/.github/eks-workflow-files/axlearn/axlearn-3B-model.yml b/.github/eks-workflow-files/axlearn/axlearn-3B-model.yml deleted file mode 100644 index 39fbce6be..000000000 --- a/.github/eks-workflow-files/axlearn/axlearn-3B-model.yml +++ /dev/null @@ -1,80 +0,0 @@ -apiVersion: batch/v1 -kind: Job -metadata: - name: PLACEHOLDER - labels: - kueue.x-k8s.io/queue-name: p5-queue -spec: - # the job will run for 20 mins, as we can' tset max_steps - activeDeadlineSeconds: 1200 - completions: 1 - parallelism: 1 - template: - spec: - restartPolicy: Never - containers: - - name: axlearn - image: PLACEHOLDER - command: - - bash - - -xo - - pipefail - - -c - - | - - BASEDIR="/opt/axlearn" - CONFIG="fuji-3B-v3-flash-single-host" - HLO_DUMP=0 - POSTFIX="" - - AR_THRESHOLD=1073741824 - AG_THRESHOLD=8589934592 - RS_THRESHOLD=8589934592 - XLA_BASE_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true - --xla_gpu_enable_triton_gemm=false - --xla_gpu_enable_highest_priority_async_stream=true - --xla_gpu_all_gather_combine_threshold_bytes=${AG_THRESHOLD} - --xla_gpu_reduce_scatter_combine_threshold_bytes=${RS_THRESHOLD} - --xla_gpu_enable_pipelined_all_gather=true - --xla_gpu_enable_pipelined_reduce_scatter=true - --xla_gpu_enable_nccl_comm_splitting=false" - - export XLA_PYTHON_CLIENT_PREALLOCATE=false - export TF_GPU_ALLOCATOR=cuda_malloc_async - export XLA_FLAGS="${XLA_BASE_FLAGS}" - - export NCCL_BUFFSIZE=8388608 - export NCCL_P2P_NET_CHUNKSIZE=524288 - export NCCL_LAUNCH_MODE=GROUP - export NCCL_DEBUG=INFO - - LOG_DIR=${BASEDIR}/logs - TRAINER_DIR=${LOG_DIR}/${CONFIG}${POSTFIX}-eks/trainer-dir - mkdir -p ${TRAINER_DIR} - - echo "Executing TF" - cat << EOF > tf_fix_gpu.py - import tensorflow as tf - tf.config.set_visible_devices([], 'GPU') - import runpy - 
runpy.run_module('axlearn.common.launch_trainer_main', run_name='__main__') - EOF - - python3 tf_fix_gpu.py \ - --module=text.gpt.c4_trainer \ - --config=${CONFIG} \ - --trainer_dir=${TRAINER_DIR} \ - --data_dir=gs://axlearn-public/tensorflow_datasets \ - --jax_backend=gpu - - resources: - limits: - nvidia.com/gpu: 8 - volumeMounts: - - name: output - mountPath: /opt/output - imagePullSecrets: - - name: PLACEHOLDER - volumes: - - name: output - emptyDir: {} diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index eef5a78bb..d5d0005af 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -660,16 +660,17 @@ jobs: JOB_NAME: axlearn-${{ github.run_id }} TOKEN_NAME: axlearn-${{ github.run_id }}-token steps: - - name: Set date env var for saving files - run: | - echo "DATE_TEST_RAN=$(date +'%Y-%m-%d-%H-%M-%S')" >> $GITHUB_ENV - name: Check out the repository uses: actions/checkout@v4 - - name: GHCR Login - uses: ./.github/actions/ghcr-login + - name: Login to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + - name: K8s GHCR login and delete + uses: ./.github/actions/store-delete-k8s-ghcr with: - docker-username: ${{ github.repository_owner }} - docker-password: ${{ secrets.GITHUB_TOKEN }} token-name: ${{ env.TOKEN_NAME }} - name: Configure axlearn test job run: | @@ -677,34 +678,20 @@ jobs: yq -i ea ' select(di == 0).metadata.name = strenv(JOB_NAME) | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE) - | select(di == 0).spec.template.spec.containers[1].env[0].value = strenv(DATE_TEST_RAN) + | select(di == 0).spec.template.spec.containers[1].env[0].value = "${{ github.run_id }}" | select(di == 0).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)' \ .github/eks-workflow-files/axlearn/axlearn-job.yml git diff .github/eks-workflow-files/axlearn/axlearn-job.yml - - - 
name: Submit & wait for axlearn test job - uses: ./.github/actions/submit-k8s-job + - name: Submit & delete axlearn test + uses: ./.github/actions/submit-delete-k8s-job with: job-config-file: ".github/eks-workflow-files/axlearn/axlearn-job.yml" job-name: ${{ env.JOB_NAME }} - - - name: Delete axlearn test job - uses: ./.github/actions/delete-k8s-job - if: ${{ always() }} - with: - job-name: ${{ env.JOB_NAME }} - - - name: Delete GitHub Container Registry token - uses: ./.github/actions/delete-ghcr-token - if: ${{ always() }} - with: - token-name: ${{ env.TOKEN_NAME }} - - name: Download logs from S3 id: log-s3 run: | mkdir -p /tmp/axlearn-output - aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ env.DATE_TEST_RAN }}/summary.txt /tmp/axlearn-output/ + aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/summary.txt /tmp/axlearn-output/ passed_tests=$(grep -c ": PASSED" /tmp/axlearn-output/summary.txt || true) failed_tests=$(grep -c ": FAILED" /tmp/axlearn-output/summary.txt || true) @@ -713,7 +700,6 @@ jobs: echo "Passed tests: $passed_tests" echo "Failed tests: $failed_tests" echo "Total tests: $total_tests" - echo "PASSED_TESTS=$passed_tests" >> $GITHUB_OUTPUT echo "FAILED_TESTS=$failed_tests" >> $GITHUB_OUTPUT echo "TOTAL_TESTS=$total_tests" >> $GITHUB_OUTPUT @@ -761,14 +747,8 @@ jobs: "badge-axlearn-test" summary.txt - test-axlearn-fuji-1B-slurm: - needs: build-axlearn - if: inputs.ARCHITECTURE == 'amd64' - uses: ./.github/workflows/_test_fuji_1B.yaml - with: - AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }} - secrets: inherit - + # the fuji test will run for 20 minutes only, as per 2025-02-24 + # is not possible to set the `max_steps` value test-axlearn-fuji-1B-eks: needs: build-axlearn if: inputs.ARCHITECTURE == 'amd64' @@ -780,11 +760,15 @@ jobs: steps: - name: Check out the repository uses: actions/checkout@v4 - - name: GHCR Login - uses: ./.github/actions/ghcr-login + - name: Login to GitHub Container Registry + 
uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + - name: K8s GHCR login and delete + uses: ./.github/actions/store-delete-k8s-ghcr with: - docker-username: ${{ github.repository_owner }} - docker-password: ${{ secrets.GITHUB_TOKEN }} token-name: ${{ env.TOKEN_NAME }} - name: Configure axlearn test job run: | @@ -795,20 +779,9 @@ jobs: .github/eks-workflow-files/axlearn/axlearn-1B-model.yml git diff .github/eks-workflow-files/axlearn/axlearn-1B-model.yml - - name: Submit & wait for axlearn test job - uses: ./.github/actions/submit-k8s-job + - name: Submit & delete axlearn test + uses: ./.github/actions/submit-delete-k8s-job with: - job-config-file: ".github/eks-workflow-files/axlearn/axlearn-1B-model.yml" + job-config-file: ".github/eks-workflow-files/axlearn/axlearn-1B-model.yml" job-name: ${{ env.JOB_NAME }} - - name: Delete axlearn test job - uses: ./.github/actions/delete-k8s-job - if: ${{ always() }} - with: - job-name: ${{ env.JOB_NAME }} - - - name: Delete GitHub Container Registry token - uses: ./.github/actions/delete-ghcr-token - if: ${{ always() }} - with: - token-name: ${{ env.TOKEN_NAME }} diff --git a/.github/workflows/_test_fuji_1B.yaml b/.github/workflows/_test_fuji_1B.yaml deleted file mode 100644 index cd058b59a..000000000 --- a/.github/workflows/_test_fuji_1B.yaml +++ /dev/null @@ -1,106 +0,0 @@ -name: ~test MaxText functionality - -on: - workflow_call: - inputs: - AXLEARN_DOCKER_IMAGE: - type: string - description: Axlearn image from ghcr.io/nvidia - default: ghcr.io/nvidia/jax:axlearn - required: false - -jobs: - single-process-single-node: - runs-on: jumpbox - steps: - - name: Check out the repository under ${GITHUB_WORKSPACE} - uses: actions/checkout@v4 - - - name: Setup SSH - id: setup-ssh - uses: ./.github/actions/setup-ssh - with: - ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} - ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }} - - - name: Labels and 
metadata - id: meta - shell: bash -x -e {0} - run: | - IMAGE="$(echo ${{inputs.AXLEARN_IMAGE}} | sed 's/\//#/')" - TOTAL_TASKS=1 - MAX_GPUS_PER_NODE=8 - NODES=1 - GPUS_PER_NODE=8 - JOB_NAME=axlearn-fuji-1B-${GITHUB_RUN_ID}-${TEST_CASE_NAME} - LOG_FILE=/nfs/cluster/${JOB_NAME}.log - MODEL_PATH=/nfs/cluster/${JOB_NAME} - for var in IMAGE TEST_CASE_NAME TOTAL_TASKS NODES GPUS_PER_NODE JOB_NAME LOG_FILE MODEL_PATH; do - echo "$var=${!var}" >> $GITHUB_OUTPUT - done - - - name: Submit SLURM jobs over SSH - id: submit - shell: bash -O expand_aliases -x -e {0} - run: | - alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}' - sshx "date && hostname && sinfo" - sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }} - JOB=$(sshx sbatch --parsable << EOF - #!/bin/bash - #SBATCH --job-name=${{ steps.meta.outputs.JOB_NAME }} - #SBATCH --exclusive - #SBATCH --nodes=${{ steps.meta.outputs.NODES }} - #SBATCH --gpus-per-node=${{ steps.meta.outputs.GPUS_PER_NODE }} - #SBATCH --time=00:30:00 - #SBATCH --output=${{ steps.meta.outputs.LOG_FILE }} - - # preload enroot container using one task per node - time srun \ - --ntasks-per-node=1 \ - --container-name=runtime \ - --container-image=${{ steps.meta.outputs.IMAGE }} \ - true - - # run job with tasks on each node sharing one container - time srun \ - --ntasks=${{ steps.meta.outputs.TOTAL_TASKS }} \ - --container-name=runtime \ - --container-mounts=${{ steps.meta.outputs.MODEL_PATH }}:/output \ - --container-entrypoint \ - test-fuji-1B.sh - EOF - ) - - echo "SLURM_JOB_ID=${JOB}" >> $GITHUB_OUTPUT - - . 
.github/workflows/scripts/wait_for_slurm_job.sh - - wait_for_slurm_job ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} ${JOB} - - # Gather job info - SLURM_STATE=$(sshx sacct -j $JOB --format=State --parsable2 --noheader |& head -n 1) - SLURM_EXITCODE=$(sshx sacct -j $JOB --format=exitcode --parsable2 --noheader | sort -r -u | head -1 | cut -f 1 -d":" | sed 's/ //g') - echo "SLURM Job state is ${SLURM_STATE}" - echo "SLURM Job exit code is ${SLURM_EXITCODE}" - echo "SLURM_STATE=${SLURM_STATE}" >> "$GITHUB_OUTPUT" - echo "SLURM_EXITCODE=${SLURM_EXITCODE}" >> "$GITHUB_OUTPUT" - set -x - - - - name: Remove orphaned SLURM job if the CI job is canceled - if: cancelled() - shell: bash -x -e {0} - run: | - ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \ - scancel ${{ steps.submit.outputs.SLURM_JOB_ID }} - - - name: Write SLURM job status to file - shell: bash -x -e {0} - run: | - python << EOF - import json - with open("output/${{ steps.meta.outputs.TEST_CASE_NAME }}-status.json", "w") as f: - dump = {'state': "${{ steps.submit.outputs.SLURM_STATE }}", 'exitcode': "${{ steps.submit.outputs.SLURM_EXITCODE }}"} - json.dump(dump, f) - EOF \ No newline at end of file diff --git a/.github/workflows/_test_fuji_7B.yaml b/.github/workflows/_test_fuji_7B.yaml deleted file mode 100644 index 544de815a..000000000 --- a/.github/workflows/_test_fuji_7B.yaml +++ /dev/null @@ -1,106 +0,0 @@ -name: ~test MaxText functionality - -on: - workflow_call: - inputs: - AXLEARN_DOCKER_IMAGE: - type: string - description: Axlearn image from ghcr.io/nvidia - default: ghcr.io/nvidia/jax:axlearn - required: false - -jobs: - single-process-single-node: - runs-on: jumpbox - steps: - - name: Check out the repository under ${GITHUB_WORKSPACE} - uses: actions/checkout@v4 - - - name: Setup SSH - id: setup-ssh - uses: ./.github/actions/setup-ssh - with: - ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} - ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }} - - - name: 
Labels and metadata - id: meta - shell: bash -x -e {0} - run: | - IMAGE="$(echo ${{inputs.AXLEARN_IMAGE}} | sed 's/\//#/')" - TOTAL_TASKS=1 - MAX_GPUS_PER_NODE=8 - NODES=1 - GPUS_PER_NODE=8 - JOB_NAME=axlearn-fuji-7B-${GITHUB_RUN_ID}-${TEST_CASE_NAME} - LOG_FILE=/nfs/cluster/${JOB_NAME}.log - MODEL_PATH=/nfs/cluster/${JOB_NAME} - for var in IMAGE TEST_CASE_NAME TOTAL_TASKS NODES GPUS_PER_NODE JOB_NAME LOG_FILE MODEL_PATH; do - echo "$var=${!var}" >> $GITHUB_OUTPUT - done - - - name: Submit SLURM jobs over SSH - id: submit - shell: bash -O expand_aliases -x -e {0} - run: | - alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}' - sshx "date && hostname && sinfo" - sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }} - JOB=$(sshx sbatch --parsable << EOF - #!/bin/bash - #SBATCH --job-name=${{ steps.meta.outputs.JOB_NAME }} - #SBATCH --exclusive - #SBATCH --nodes=${{ steps.meta.outputs.NODES }} - #SBATCH --gpus-per-node=${{ steps.meta.outputs.GPUS_PER_NODE }} - #SBATCH --time=00:30:00 - #SBATCH --output=${{ steps.meta.outputs.LOG_FILE }} - - # preload enroot container using one task per node - time srun \ - --ntasks-per-node=1 \ - --container-name=runtime \ - --container-image=${{ steps.meta.outputs.IMAGE }} \ - true - - # run job with tasks on each node sharing one container - time srun \ - --ntasks=${{ steps.meta.outputs.TOTAL_TASKS }} \ - --container-name=runtime \ - --container-mounts=${{ steps.meta.outputs.MODEL_PATH }}:/output \ - --container-entrypoint \ - test-fuji-7B.sh - EOF - ) - - echo "SLURM_JOB_ID=${JOB}" >> $GITHUB_OUTPUT - - . 
.github/workflows/scripts/wait_for_slurm_job.sh - - wait_for_slurm_job ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} ${JOB} - - # Gather job info - SLURM_STATE=$(sshx sacct -j $JOB --format=State --parsable2 --noheader |& head -n 1) - SLURM_EXITCODE=$(sshx sacct -j $JOB --format=exitcode --parsable2 --noheader | sort -r -u | head -1 | cut -f 1 -d":" | sed 's/ //g') - echo "SLURM Job state is ${SLURM_STATE}" - echo "SLURM Job exit code is ${SLURM_EXITCODE}" - echo "SLURM_STATE=${SLURM_STATE}" >> "$GITHUB_OUTPUT" - echo "SLURM_EXITCODE=${SLURM_EXITCODE}" >> "$GITHUB_OUTPUT" - set -x - - - - name: Remove orphaned SLURM job if the CI job is canceled - if: cancelled() - shell: bash -x -e {0} - run: | - ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \ - scancel ${{ steps.submit.outputs.SLURM_JOB_ID }} - - - name: Write SLURM job status to file - shell: bash -x -e {0} - run: | - python << EOF - import json - with open("output/${{ steps.meta.outputs.TEST_CASE_NAME }}-status.json", "w") as f: - dump = {'state': "${{ steps.submit.outputs.SLURM_STATE }}", 'exitcode': "${{ steps.submit.outputs.SLURM_EXITCODE }}"} - json.dump(dump, f) - EOF \ No newline at end of file diff --git a/.github/workflows/_test_nccl.yaml b/.github/workflows/_test_nccl.yaml new file mode 100644 index 000000000..2102c214e --- /dev/null +++ b/.github/workflows/_test_nccl.yaml @@ -0,0 +1,110 @@ +name: ~run NCCL tests + +on: + workflow_call: + inputs: + # Note that cuda-dl-base installs the NCCL tests, while the vanilla nvidia/cuda + # images do not; when JAX-Toolbox moves to using cuda-dl-base this workflow ought + # to be modified to test one of the JAX-Toolbox containers. + CONTAINER: + type: string + description: CUDA image to use as base, e.g. 
nvcr.io/nvidia/cuda-dl-base:24.11-cuda12.6-devel-ubuntu24.04 + required: true + +permissions: + actions: write # to cancel previous workflows + contents: read # to fetch code + packages: write # to upload container + +jobs: + build-mpi-operator-compatible-base: + uses: ./.github/workflows/_build.yaml + with: + ARCHITECTURE: amd64 + ARTIFACT_NAME: artifact-mpi-operator-compatible-base-build + BADGE_FILENAME: badge-mpi-operator-compatible-base-build + BUILD_DATE: 0000-00-00 # not important; this image is never published + BASE_IMAGE: ${{ inputs.CONTAINER }} + CONTAINER_NAME: mpi-operator-compatible-base + DOCKERFILE: .github/container/Dockerfile.mpi-operator-compatible-base + RUNNER_SIZE: small + secrets: inherit + nccl-test: + needs: build-mpi-operator-compatible-base + strategy: + matrix: + test: [all_gather_perf_mpi, all_reduce_perf_mpi, broadcast_perf_mpi, reduce_scatter_perf_mpi] + runs-on: eks + env: + BASE_IMAGE: ${{ needs.build-mpi-operator-compatible-base.outputs.DOCKER_TAG_FINAL }} + TEST_NAME: ${{ matrix.test }} + + steps: + - name: Check out the repository + uses: actions/checkout@v4 + - name: Modify variables + id: var + shell: bash + run: | + export JOB_NAME="nccl-test-${{ github.run_id }}-${TEST_NAME//_/-}" + echo "JOB_NAME=${JOB_NAME}" >> $GITHUB_OUTPUT + echo "LAUNCHER_NAME=${JOB_NAME}-launcher" >> $GITHUB_OUTPUT + echo "TOKEN_NAME=nccl-test-${JOB_NAME}-token" >> $GITHUB_OUTPUT + - name: Login to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + - name: K8s GHCR login and delete + uses: ./.github/actions/store-delete-k8s-ghcr + with: + token-name: ${{ steps.var.outputs.TOKEN_NAME }} + - name: Configure Kubernetes job + run: | + export WORKER_NAME="${JOB_NAME}-worker" + yq -i '.metadata.name = strenv(JOB_NAME) + | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].image = strenv(BASE_IMAGE) + | 
.spec.mpiReplicaSpecs.Launcher.template.spec.containers[].name = strenv(LAUNCHER_NAME) + | .spec.mpiReplicaSpecs.Launcher.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) + | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].command[3] = strenv(TEST_NAME) + | .spec.mpiReplicaSpecs.Worker.template.spec.containers[].image = strenv(BASE_IMAGE) + | .spec.mpiReplicaSpecs.Worker.template.spec.containers[].name = strenv(WORKER_NAME) + | .spec.mpiReplicaSpecs.Worker.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)' \ + .github/eks-workflow-files/mpi-nccl-test.yml + git diff .github/eks-workflow-files/mpi-nccl-test.yml + - name: Submit & delete Kubernetes test + uses: ./.github/actions/submit-delete-k8s-job + with: + job-config-file: ".github/eks-workflow-files/mpi-nccl-test.yml" + job-name: ${{ steps.var.output.JOB_NAME }} + - name: Retrieve Kubernetes job status + shell: bash -exo pipefail {0} + run: | + LAUNCHER_NAME="${{ steps.var.outputs.LAUNCHER_NAME }}" + while readarray -d : -t status < <(kubectl get job/${LAUNCHER_NAME} -o 'jsonpath={.status.failed}:{.status.succeeded}'); do + failure=${status[0]:-0} + success=${status[1]:-0} + total=$((failure+success)) + if [[ ${total} < 1 ]]; then + sleep 1 + elif [[ ${total} == 1 ]]; then + break + else + # Shouldn't happen, maybe a sign the job being monitored does not have a + # single launcher pod? + exit 255 + fi + done + exit ${failure} + # Provide more debug output in case of failure; note that some kinds of launch + # failure do not produce any log output. 
+ - name: Debug failed Kubernetes job + if: failure() + run: | + LAUNCHER_NAME="${{ steps.var.outputs.LAUNCHER_NAME }}" + # Provide better debug in case of launch failures that will not produce log output + pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${LAUNCHER_NAME} -o name) + if [[ -n "${pods}" ]]; then + kubectl describe ${pods} + fi diff --git a/.github/workflows/nccl-k8s.yaml b/.github/workflows/nccl-k8s.yaml index 6f39ebe0b..d51c12382 100644 --- a/.github/workflows/nccl-k8s.yaml +++ b/.github/workflows/nccl-k8s.yaml @@ -31,111 +31,8 @@ permissions: packages: write # to upload container jobs: - build-mpi-operator-compatible-base: - uses: ./.github/workflows/_build.yaml + nccl-tests: + uses: ./.github/workflows/_test_nccl.yaml with: - ARCHITECTURE: amd64 - ARTIFACT_NAME: artifact-mpi-operator-compatible-base-build - BADGE_FILENAME: badge-mpi-operator-compatible-base-build - BUILD_DATE: 0000-00-00 # Not important; this image is never published - BASE_IMAGE: ${{ inputs.CONTAINER || 'nvcr.io/nvidia/cuda-dl-base:24.12-cuda12.6-devel-ubuntu24.04' }} - CONTAINER_NAME: mpi-operator-compatible-base - DOCKERFILE: .github/container/Dockerfile.mpi-operator-compatible-base - RUNNER_SIZE: small + CONTAINER: ${{ inputs.CONTAINER || 'nvcr.io/nvidia/cuda-dl-base:24.12-cuda12.6-devel-ubuntu24.04' }} secrets: inherit - - nccl-tests: - needs: build-mpi-operator-compatible-base - runs-on: eks - strategy: - matrix: - test: [all_gather_perf_mpi, all_reduce_perf_mpi, broadcast_perf_mpi, reduce_scatter_perf_mpi] - env: - BASE_IMAGE: ${{ needs.build-mpi-operator-compatible-base.outputs.DOCKER_TAG_FINAL }} - TEST_NAME: ${{ matrix.test }} - - - steps: - - name: Check out the repository - uses: actions/checkout@v4 - - - name: Modify variables - id: var - shell: bash - run: | - export JOB_NAME="nccl-test-${{ github.run_id }}-${TEST_NAME//_/-}" - echo "JOB_NAME=${JOB_NAME}" >> $GITHUB_OUTPUT - echo "LAUNCHER_NAME=${JOB_NAME}-launcher" >> $GITHUB_OUTPUT - echo 
"TOKEN_NAME=nccl-test-${JOB_NAME}-token" >> $GITHUB_OUTPUT - - - name: GHCR login and store K8s secret - uses: ./.github/actions/ghcr-login - with: - docker-username: ${{ github.repository_owner }} - docker-password: ${{ secrets.GITHUB_TOKEN }} - token-name: ${{ steps.var.outputs.TOKEN_NAME }} - - name: Configure Kubernetes job - shell: bash - run: | - export JOB_NAME="${{ steps.var.outputs.JOB_NAME }}" - export LAUNCHER_NAME="${{ steps.var.outputs.LAUNCHER_NAME }}" - export TOKEN_NAME="${{ steps.var.outputs.TOKEN_NAME }}" - export TEST_NAME="${{ env.TEST_NAME }}" - export WORKER_NAME="${JOB_NAME}-worker" - - # Use yq to set our fields in-place - yq -i '.metadata.name = strenv(JOB_NAME) - | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].image = strenv(BASE_IMAGE) - | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].name = strenv(LAUNCHER_NAME) - | .spec.mpiReplicaSpecs.Launcher.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) - | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].command[3] = strenv(TEST_NAME) - | .spec.mpiReplicaSpecs.Worker.template.spec.containers[].image = strenv(BASE_IMAGE) - | .spec.mpiReplicaSpecs.Worker.template.spec.containers[].name = strenv(WORKER_NAME) - | .spec.mpiReplicaSpecs.Worker.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)' \ - .github/eks-workflow-files/mpi-nccl-test.yml - - # (Optional) Show diff for debugging - git diff .github/eks-workflow-files/mpi-nccl-test.yml - - - name: Submit & stream K8s job - uses: ./.github/actions/submit-k8s-job - with: - job-config-file: .github/eks-workflow-files/mpi-nccl-test.yml - job-name: ${{ steps.var.outputs.LAUNCHER_NAME }} - - name: Retrieve Kubernetes job status - shell: bash -exo pipefail {0} - run: | - LAUNCHER_NAME="${{ steps.var.outputs.LAUNCHER_NAME }}" - while readarray -d : -t status < <(kubectl get job/${LAUNCHER_NAME} -o 'jsonpath={.status.failed}:{.status.succeeded}'); do - failure=${status[0]:-0} - success=${status[1]:-0} - 
total=$((failure+success)) - if [[ ${total} < 1 ]]; then - sleep 1 - elif [[ ${total} == 1 ]]; then - break - else - # If total > 1, that suggests a mismatch that can occur if there's more than one launcher pod - exit 255 - fi - done - exit ${failure} - - name: Debug failed Kubernetes job - if: ${{ failure() }} - shell: bash - run: | - LAUNCHER_NAME="${{ steps.var.outputs.LAUNCHER_NAME }}" - pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${LAUNCHER_NAME} -o name) - if [[ -n "${pods}" ]]; then - kubectl describe ${pods} - fi - - name: Delete Kubernetes job - if: ${{ always() }} - uses: ./.github/actions/delete-k8s-job - with: - job-name: ${{ steps.var.outputs.LAUNCHER_NAME }} - - name: Delete GitHub Container Registry token - uses: ./.github/actions/delete-ghcr-token - if: ${{ always() }} - with: - token-name: ${{ steps.var.outputs.TOKEN_NAME }} \ No newline at end of file From d680e667a9f55819d6b57dd7432110087fea6f13 Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 24 Feb 2025 19:01:38 +0000 Subject: [PATCH 58/89] fix path for git --- .github/container/Dockerfile.axlearn | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/container/Dockerfile.axlearn b/.github/container/Dockerfile.axlearn index 039f767ee..ba63fb0c9 100644 --- a/.github/container/Dockerfile.axlearn +++ b/.github/container/Dockerfile.axlearn @@ -1,6 +1,6 @@ # syntax=docker/dockerfile:1-labs ARG BASE_IMAGE=ghcr.io/nvidia/jax-mealkit:jax -ARG URLREF_AXLEARN=https://github.com/apple/axlearn.git +ARG URLREF_AXLEARN=https://github.com/apple/axlearn.git#main ARG SRC_PATH_AXLEARN=/opt/axlearn ############################################################################### From c200dea5289198c9fcee5ac767d6744b93f31107 Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 24 Feb 2025 20:43:29 +0000 Subject: [PATCH 59/89] fix error in bash --- .github/actions/store-delete-k8s-ghcr/action.yml | 1 + .github/actions/submit-delete-k8s-job/action.yml | 1 + 2 files changed, 2 
insertions(+) diff --git a/.github/actions/store-delete-k8s-ghcr/action.yml b/.github/actions/store-delete-k8s-ghcr/action.yml index 51eb8b625..803163f9a 100644 --- a/.github/actions/store-delete-k8s-ghcr/action.yml +++ b/.github/actions/store-delete-k8s-ghcr/action.yml @@ -11,6 +11,7 @@ runs: steps: - name: Delete GitHub Container Registry token uses: ./.github/actions/with-post-step + shell: bash with: main: | # Store GitHub Container Registry token as Kubernetes secret diff --git a/.github/actions/submit-delete-k8s-job/action.yml b/.github/actions/submit-delete-k8s-job/action.yml index e97d4b921..d2c546273 100644 --- a/.github/actions/submit-delete-k8s-job/action.yml +++ b/.github/actions/submit-delete-k8s-job/action.yml @@ -15,6 +15,7 @@ runs: steps: - name: Delete Kubernetes job uses: ./.github/actions/with-post-step + shell: bash with: main: | echo "Submit K8s job" From 0b1a61f8728c87c701e6ca057124b5a930177a61 Mon Sep 17 00:00:00 2001 From: Steboss Date: Tue, 25 Feb 2025 11:29:51 +0000 Subject: [PATCH 60/89] fix the 3B model run on k8s --- ...earn-1B-model.yml => axlearn-3B-model.yml} | 42 ++++++++----------- .github/workflows/_ci.yaml | 12 +++--- 2 files changed, 23 insertions(+), 31 deletions(-) rename .github/eks-workflow-files/axlearn/{axlearn-1B-model.yml => axlearn-3B-model.yml} (54%) diff --git a/.github/eks-workflow-files/axlearn/axlearn-1B-model.yml b/.github/eks-workflow-files/axlearn/axlearn-3B-model.yml similarity index 54% rename from .github/eks-workflow-files/axlearn/axlearn-1B-model.yml rename to .github/eks-workflow-files/axlearn/axlearn-3B-model.yml index 76b767089..9045044e8 100644 --- a/.github/eks-workflow-files/axlearn/axlearn-1B-model.yml +++ b/.github/eks-workflow-files/axlearn/axlearn-3B-model.yml @@ -13,7 +13,7 @@ spec: spec: restartPolicy: Never containers: - - name: axlearn-fuji-1B + - name: axlearn-fuji-3B image: PLACEHOLDER command: - bash @@ -23,31 +23,23 @@ spec: - | BASEDIR="/opt/axlearn" - 
CONFIG="fuji-1B-v3-flash-single-host" - HLO_DUMP=0 - POSTFIX="" - - AR_THRESHOLD=1073741824 - AG_THRESHOLD=8589934592 - RS_THRESHOLD=8589934592 - XLA_BASE_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true - --xla_gpu_enable_triton_gemm=false - --xla_gpu_enable_highest_priority_async_stream=true - --xla_gpu_all_gather_combine_threshold_bytes=${AG_THRESHOLD} - --xla_gpu_reduce_scatter_combine_threshold_bytes=${RS_THRESHOLD} - --xla_gpu_enable_pipelined_all_gather=true - --xla_gpu_enable_pipelined_reduce_scatter=true - --xla_gpu_enable_nccl_comm_splitting=false" - - export XLA_PYTHON_CLIENT_PREALLOCATE=false - export TF_GPU_ALLOCATOR=cuda_malloc_async - export XLA_FLAGS="${XLA_BASE_FLAGS}" - - export NCCL_BUFFSIZE=8388608 - export NCCL_P2P_NET_CHUNKSIZE=524288 - export NCCL_LAUNCH_MODE=GROUP - export NCCL_DEBUG=INFO + CONFIG="fuji-3B-v3-flash-single-host" + BASE_XLA_FLAGS=${BASE_XLA_FLAGS:---xla_gpu_enable_latency_hiding_scheduler=true + --xla_gpu_enable_highest_priority_async_stream=true + --xla_gpu_all_reduce_combine_threshold_bytes=1073741824 + --xla_gpu_all_gather_combine_threshold_bytes=1073741824 + --xla_gpu_reduce_scatter_combine_threshold_bytes=1073741824 + --xla_gpu_enable_pipelined_all_gather=true + --xla_gpu_enable_pipelined_reduce_scatter=true + --xla_gpu_enable_pipelined_all_reduce=true + --xla_gpu_enable_while_loop_double_buffering=true + --xla_gpu_enable_triton_gemm=false + --xla_gpu_enable_all_gather_combine_by_dim=false + --xla_gpu_enable_reduce_scatter_combine_by_dim=false + --xla_disable_hlo_passes=rematerialization} + export XLA_FLAGS="$BASE_XLA_FLAGS ${XLA_FLAGS:-}" + LOG_DIR=${BASEDIR}/logs TRAINER_DIR=${LOG_DIR}/${CONFIG}${POSTFIX}-eks/trainer-dir mkdir -p ${TRAINER_DIR} diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index d5d0005af..52b514c9d 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -749,14 +749,14 @@ jobs: # the fuji test will run for 20 minutes only, as per 2025-02-24 # is not 
possible to set the `max_steps` value - test-axlearn-fuji-1B-eks: + test-axlearn-fuji-3B-eks: needs: build-axlearn if: inputs.ARCHITECTURE == 'amd64' runs-on: eks env: AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }} - JOB_NAME: axlearn-fuji-1b-${{ github.run_id }} - TOKEN_NAME: axlearn-fuji-1b-${{ github.run_id }}-token + JOB_NAME: axlearn-fuji-3b-${{ github.run_id }} + TOKEN_NAME: axlearn-fuji-3b-${{ github.run_id }}-token steps: - name: Check out the repository uses: actions/checkout@v4 @@ -776,12 +776,12 @@ jobs: select(di == 0).metadata.name = strenv(JOB_NAME) | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE) | select(di == 0).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)' \ - .github/eks-workflow-files/axlearn/axlearn-1B-model.yml - git diff .github/eks-workflow-files/axlearn/axlearn-1B-model.yml + .github/eks-workflow-files/axlearn/axlearn-3B-model.yml + git diff .github/eks-workflow-files/axlearn/axlearn-3B-model.yml - name: Submit & delete axlearn test uses: ./.github/actions/submit-delete-k8s-job with: - job-config-file: ".github/eks-workflow-files/axlearn/axlearn-1B-model.yml" + job-config-file: ".github/eks-workflow-files/axlearn/axlearn-3B-model.yml" job-name: ${{ env.JOB_NAME }} From 5693a5c6cd4a21eee8cb79650c084d705c5e638f Mon Sep 17 00:00:00 2001 From: Steboss Date: Wed, 26 Feb 2025 11:01:21 +0000 Subject: [PATCH 61/89] @olupton comments --- .../actions/store-delete-k8s-ghcr/action.yml | 17 ++- .../actions/submit-delete-k8s-job/action.yml | 14 +-- .github/container/Dockerfile.axlearn | 2 +- .github/container/test-axlearn.sh | 3 +- .../{test-fuji-1B.sh => test-fuji.sh} | 0 .../axlearn/axlearn-3B-model.yml | 71 ------------ .../axlearn/axlearn-fuji-model.yml | 34 ++++++ .../axlearn/axlearn-job.yml | 108 +++++++++--------- .github/workflows/_ci.yaml | 22 ++-- .github/workflows/_test_nccl.yaml | 46 ++++---- 10 files changed, 133 insertions(+), 184 deletions(-) rename 
.github/container/{test-fuji-1B.sh => test-fuji.sh} (100%) delete mode 100644 .github/eks-workflow-files/axlearn/axlearn-3B-model.yml create mode 100644 .github/eks-workflow-files/axlearn/axlearn-fuji-model.yml diff --git a/.github/actions/store-delete-k8s-ghcr/action.yml b/.github/actions/store-delete-k8s-ghcr/action.yml index 803163f9a..33a69ebe2 100644 --- a/.github/actions/store-delete-k8s-ghcr/action.yml +++ b/.github/actions/store-delete-k8s-ghcr/action.yml @@ -1,14 +1,19 @@ -name: Delete GHCR Token -description: Deletes the K8s secret used for pulling images from GHCR. +name: Store & Delete GHCR Token +description: Store and Delete the docker credentials for pulling from GHCR -inputs: +outputs: token-name: description: Name of the K8s secret to delete - required: true + value: ${{ steps.token.outputs.token-name }} runs: using: "composite" steps: + - name: Generate a UUID token + shell: bash + id: token + run: | + echo "token-name=$(uuidgen)" >> $GITHUB_OUTPUT - name: Delete GitHub Container Registry token uses: ./.github/actions/with-post-step shell: bash @@ -16,8 +21,8 @@ runs: main: | # Store GitHub Container Registry token as Kubernetes secret kubectl create secret generic \ - ${{ inputs.token-name }} \ + ${{ steps.token.outputs.token-name }} \ --from-file=.dockerconfigjson=$HOME/.docker/config.json \ --type=kubernetes.io/dockerconfigjson post: | - kubectl delete secret ${{ inputs.token-name }} \ No newline at end of file + kubectl delete secret ${{ steps.token.outputs.token-name }} diff --git a/.github/actions/submit-delete-k8s-job/action.yml b/.github/actions/submit-delete-k8s-job/action.yml index d2c546273..d8b8cb472 100644 --- a/.github/actions/submit-delete-k8s-job/action.yml +++ b/.github/actions/submit-delete-k8s-job/action.yml @@ -34,15 +34,5 @@ runs: # stream the logs kubectl logs --all-containers=true --all-pods=true --follow job/${{ inputs.job-name }} post: | - pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ inputs.job-name }} -o 
jsonpath='{.items[*].metadata.name}') - - for pod in $pods; do - status=$(kubectl get pod "$pod" -o jsonpath='{.status.phase}' || true) - echo "Pod: $pod, status: $status" - if [ "$status" = "Running" ] || [ "$status" = "Pending" ]; then - kubectl delete pod "$pod" --force --grace-period=0 || true - fi - done - - # make sure job is deleted - kubectl delete job ${{ inputs.job-name }} --force --grace-period=0 || true \ No newline at end of file + kubectl delete job ${{ inputs.job-name }} + \ No newline at end of file diff --git a/.github/container/Dockerfile.axlearn b/.github/container/Dockerfile.axlearn index ba63fb0c9..8c609d08d 100644 --- a/.github/container/Dockerfile.axlearn +++ b/.github/container/Dockerfile.axlearn @@ -38,7 +38,7 @@ EOF ## Add test script to the path ############################################################################### -ADD test-axlearn.sh test-fuji-1B.sh /usr/local/bin/ +ADD test-axlearn.sh test-fuji.sh /usr/local/bin/ ############################################################################### ## Install accumulated packages from the base image and the previous stage diff --git a/.github/container/test-axlearn.sh b/.github/container/test-axlearn.sh index d5e783f56..b9c3f2dfe 100755 --- a/.github/container/test-axlearn.sh +++ b/.github/container/test-axlearn.sh @@ -89,7 +89,6 @@ else echo " Test Files Pattern: '*_test.py' (default)" fi echo " Output Directory: $OUTPUT_DIRECTORY" -echo " Kubernetes mode: $K8S" cd "$DIR" || exit 1 @@ -168,4 +167,4 @@ for test_file in "${final_test_files[@]}"; do ((failures++)) fi echo "" -done \ No newline at end of file +done diff --git a/.github/container/test-fuji-1B.sh b/.github/container/test-fuji.sh similarity index 100% rename from .github/container/test-fuji-1B.sh rename to .github/container/test-fuji.sh diff --git a/.github/eks-workflow-files/axlearn/axlearn-3B-model.yml b/.github/eks-workflow-files/axlearn/axlearn-3B-model.yml deleted file mode 100644 index 9045044e8..000000000 --- 
a/.github/eks-workflow-files/axlearn/axlearn-3B-model.yml +++ /dev/null @@ -1,71 +0,0 @@ -apiVersion: batch/v1 -kind: Job -metadata: - name: PLACEHOLDER - labels: - kueue.x-k8s.io/queue-name: p5-queue -spec: - # the job will run for 20 mins, as we can't set max_steps - activeDeadlineSeconds: 1200 - completions: 1 - parallelism: 1 - template: - spec: - restartPolicy: Never - containers: - - name: axlearn-fuji-3B - image: PLACEHOLDER - command: - - bash - - -xo - - pipefail - - -c - - | - - BASEDIR="/opt/axlearn" - CONFIG="fuji-3B-v3-flash-single-host" - BASE_XLA_FLAGS=${BASE_XLA_FLAGS:---xla_gpu_enable_latency_hiding_scheduler=true - --xla_gpu_enable_highest_priority_async_stream=true - --xla_gpu_all_reduce_combine_threshold_bytes=1073741824 - --xla_gpu_all_gather_combine_threshold_bytes=1073741824 - --xla_gpu_reduce_scatter_combine_threshold_bytes=1073741824 - --xla_gpu_enable_pipelined_all_gather=true - --xla_gpu_enable_pipelined_reduce_scatter=true - --xla_gpu_enable_pipelined_all_reduce=true - --xla_gpu_enable_while_loop_double_buffering=true - --xla_gpu_enable_triton_gemm=false - --xla_gpu_enable_all_gather_combine_by_dim=false - --xla_gpu_enable_reduce_scatter_combine_by_dim=false - --xla_disable_hlo_passes=rematerialization} - - export XLA_FLAGS="$BASE_XLA_FLAGS ${XLA_FLAGS:-}" - - LOG_DIR=${BASEDIR}/logs - TRAINER_DIR=${LOG_DIR}/${CONFIG}${POSTFIX}-eks/trainer-dir - mkdir -p ${TRAINER_DIR} - - cat << EOF > tf_gpu_fix.py - import tensorflow as tf - tf.config.set_visible_devices([], 'GPU') - import runpy - runpy.run_module('axlearn.common.launch_trainer_main', run_name='__main__') - EOF - - python3 tf_gpu_fix.py \ - --module=text.gpt.c4_trainer \ - --config=${CONFIG} \ - --trainer_dir=${TRAINER_DIR} \ - --data_dir=gs://axlearn-public/tensorflow_datasets \ - --jax_backend=gpu - - resources: - limits: - nvidia.com/gpu: 8 - volumeMounts: - - name: output - mountPath: /opt/output - imagePullSecrets: - - name: PLACEHOLDER - volumes: - - name: output - emptyDir: {} 
diff --git a/.github/eks-workflow-files/axlearn/axlearn-fuji-model.yml b/.github/eks-workflow-files/axlearn/axlearn-fuji-model.yml new file mode 100644 index 000000000..c6d9db3ab --- /dev/null +++ b/.github/eks-workflow-files/axlearn/axlearn-fuji-model.yml @@ -0,0 +1,34 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: PLACEHOLDER + labels: + kueue.x-k8s.io/queue-name: p5-queue +spec: + # the job will run for 20 mins, as we can't set max_steps + activeDeadlineSeconds: 1200 + completions: 1 + parallelism: 1 + template: + spec: + restartPolicy: Never + containers: + - name: axlearn-fuji-3B + image: PLACEHOLDER + command: + - bash + - -xo + - pipefail + - -c + - "\nBASEDIR=\"/opt/axlearn\"\nCONFIG=\"fuji-3B-v3-flash-single-host\"\nBASE_XLA_FLAGS=${BASE_XLA_FLAGS:---xla_gpu_enable_latency_hiding_scheduler=true\n --xla_gpu_enable_highest_priority_async_stream=true\n --xla_gpu_all_reduce_combine_threshold_bytes=1073741824\n --xla_gpu_all_gather_combine_threshold_bytes=1073741824\n --xla_gpu_reduce_scatter_combine_threshold_bytes=1073741824\n --xla_gpu_enable_pipelined_all_gather=true\n --xla_gpu_enable_pipelined_reduce_scatter=true\n --xla_gpu_enable_pipelined_all_reduce=true\n --xla_gpu_enable_while_loop_double_buffering=true\n --xla_gpu_enable_triton_gemm=false\n --xla_gpu_enable_all_gather_combine_by_dim=false\n --xla_gpu_enable_reduce_scatter_combine_by_dim=false\n --xla_disable_hlo_passes=rematerialization}\n\nexport XLA_FLAGS=\"$BASE_XLA_FLAGS ${XLA_FLAGS:-}\" \n\nLOG_DIR=${BASEDIR}/logs\nTRAINER_DIR=${LOG_DIR}/${CONFIG}${POSTFIX}-eks/trainer-dir\nmkdir -p ${TRAINER_DIR}\n\npython3 -m axlearn.common.launch_trainer_main \\\n --module=text.gpt.c4_trainer \\\n --config=${CONFIG} \\\n --trainer_dir=${TRAINER_DIR} \\\n --data_dir=gs://axlearn-public/tensorflow_datasets \\\n --jax_backend=gpu \n" + resources: + limits: + nvidia.com/gpu: 8 + volumeMounts: + - name: output + mountPath: /opt/output + imagePullSecrets: + - name: PLACEHOLDER + volumes: + - name: output + 
emptyDir: {} diff --git a/.github/eks-workflow-files/axlearn/axlearn-job.yml b/.github/eks-workflow-files/axlearn/axlearn-job.yml index 7c1022f61..b1ac81909 100644 --- a/.github/eks-workflow-files/axlearn/axlearn-job.yml +++ b/.github/eks-workflow-files/axlearn/axlearn-job.yml @@ -1,59 +1,59 @@ apiVersion: batch/v1 kind: Job metadata: - name: PLACEHOLDER - labels: - kueue.x-k8s.io/queue-name: p5-queue + name: PLACEHOLDER + labels: + kueue.x-k8s.io/queue-name: p5-queue spec: - completions: 1 - parallelism: 1 - template: - spec: - restartPolicy: Never - containers: - - name: axlearn - image: PLACEHOLDER - command: - - bash - - -xo - - pipefail - - -c - - | - test-axlearn.sh \ - --directory "." \ - --output "/opt/output/" \ - --test-files "/opt/axlearn/axlearn/common/*_test.py" \ - --k8s + completions: 1 + parallelism: 1 + template: + spec: + restartPolicy: Never + containers: + - name: axlearn + image: PLACEHOLDER + command: + - bash + - -xo + - pipefail + - -c + - | + test-axlearn.sh \ + --directory "." \ + --output "/opt/output/" \ + --test-files "/opt/axlearn/axlearn/common/*_test.py" \ + --k8s - sync - wait - # after execution flag the results have been produced - touch /opt/output/done - resources: - limits: - nvidia.com/gpu: 8 - volumeMounts: - - name: output - mountPath: /opt/output - - name: upload - image: amazon/aws-cli - env: - - name: TEST_DATE - value: PLACEHOLDER - command: - - sh - - -c - - | - while [ ! 
-f /opt/output/done ]; do - sleep 5 - done - # Upload to S3 bucket - aws s3 cp /opt/output/summary.txt s3://jax-toolbox-eks-output/axlearn/${TEST_DATE}/summary.txt - volumeMounts: - - name: output - mountPath: /opt/output - imagePullSecrets: - - name: PLACEHOLDER - volumes: - - name: output - emptyDir: {} + sync + wait + # after execution flag the results have been produced + touch /opt/output/done + resources: + limits: + nvidia.com/gpu: 8 + volumeMounts: + - name: output + mountPath: /opt/output + - name: upload + image: amazon/aws-cli + env: + - name: TEST_DATE + value: PLACEHOLDER + command: + - sh + - -c + - | + while [ ! -f /opt/output/done ]; do + sleep 5 + done + # Upload to S3 bucket + aws s3 cp /opt/output/summary.txt s3://jax-toolbox-eks-output/axlearn/${TEST_DATE}/summary.txt + volumeMounts: + - name: output + mountPath: /opt/output + imagePullSecrets: + - name: PLACEHOLDER + volumes: + - name: output + emptyDir: {} diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 52b514c9d..17bb9262b 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -658,7 +658,6 @@ jobs: env: AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }} JOB_NAME: axlearn-${{ github.run_id }} - TOKEN_NAME: axlearn-${{ github.run_id }}-token steps: - name: Check out the repository uses: actions/checkout@v4 @@ -668,10 +667,9 @@ jobs: registry: ghcr.io username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} - - name: K8s GHCR login and delete + - name: K8s GHCR store and delete token + id: store-token uses: ./.github/actions/store-delete-k8s-ghcr - with: - token-name: ${{ env.TOKEN_NAME }} - name: Configure axlearn test job run: | # Replace placeholders in axlearn-job.yml with environment variables @@ -679,7 +677,7 @@ jobs: select(di == 0).metadata.name = strenv(JOB_NAME) | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE) | select(di == 
0).spec.template.spec.containers[1].env[0].value = "${{ github.run_id }}" - | select(di == 0).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)' \ + | select(di == 0).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \ .github/eks-workflow-files/axlearn/axlearn-job.yml git diff .github/eks-workflow-files/axlearn/axlearn-job.yml - name: Submit & delete axlearn test @@ -756,7 +754,6 @@ jobs: env: AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }} JOB_NAME: axlearn-fuji-3b-${{ github.run_id }} - TOKEN_NAME: axlearn-fuji-3b-${{ github.run_id }}-token steps: - name: Check out the repository uses: actions/checkout@v4 @@ -766,22 +763,21 @@ jobs: registry: ghcr.io username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} - - name: K8s GHCR login and delete + - name: K8s GHCR store and delete token + id: store-token uses: ./.github/actions/store-delete-k8s-ghcr - with: - token-name: ${{ env.TOKEN_NAME }} - name: Configure axlearn test job run: | yq -i ea ' select(di == 0).metadata.name = strenv(JOB_NAME) | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE) - | select(di == 0).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)' \ - .github/eks-workflow-files/axlearn/axlearn-3B-model.yml - git diff .github/eks-workflow-files/axlearn/axlearn-3B-model.yml + | select(di == 0).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \ + .github/eks-workflow-files/axlearn/axlearn-fuji-model.yml + git diff .github/eks-workflow-files/axlearn/axlearn-fuji-model.yml - name: Submit & delete axlearn test uses: ./.github/actions/submit-delete-k8s-job with: - job-config-file: ".github/eks-workflow-files/axlearn/axlearn-3B-model.yml" + job-config-file: ".github/eks-workflow-files/axlearn/axlearn-fuji-model.yml" job-name: ${{ env.JOB_NAME }} diff --git a/.github/workflows/_test_nccl.yaml 
b/.github/workflows/_test_nccl.yaml index 2102c214e..200ef3b37 100644 --- a/.github/workflows/_test_nccl.yaml +++ b/.github/workflows/_test_nccl.yaml @@ -3,18 +3,15 @@ name: ~run NCCL tests on: workflow_call: inputs: - # Note that cuda-dl-base installs the NCCL tests, while the vanilla nvidia/cuda - # images do not; when JAX-Toolbox moves to using cuda-dl-base this workflow ought - # to be modified to test one of the JAX-Toolbox containers. CONTAINER: type: string - description: CUDA image to use as base, e.g. nvcr.io/nvidia/cuda-dl-base:24.11-cuda12.6-devel-ubuntu24.04 + description: CUDA image to use as base required: true permissions: - actions: write # to cancel previous workflows - contents: read # to fetch code - packages: write # to upload container + actions: write + contents: read + packages: write jobs: build-mpi-operator-compatible-base: @@ -23,12 +20,13 @@ jobs: ARCHITECTURE: amd64 ARTIFACT_NAME: artifact-mpi-operator-compatible-base-build BADGE_FILENAME: badge-mpi-operator-compatible-base-build - BUILD_DATE: 0000-00-00 # not important; this image is never published + BUILD_DATE: 0000-00-00 BASE_IMAGE: ${{ inputs.CONTAINER }} CONTAINER_NAME: mpi-operator-compatible-base DOCKERFILE: .github/container/Dockerfile.mpi-operator-compatible-base RUNNER_SIZE: small secrets: inherit + nccl-test: needs: build-mpi-operator-compatible-base strategy: @@ -42,6 +40,7 @@ jobs: steps: - name: Check out the repository uses: actions/checkout@v4 + - name: Modify variables id: var shell: bash @@ -49,35 +48,36 @@ jobs: export JOB_NAME="nccl-test-${{ github.run_id }}-${TEST_NAME//_/-}" echo "JOB_NAME=${JOB_NAME}" >> $GITHUB_OUTPUT echo "LAUNCHER_NAME=${JOB_NAME}-launcher" >> $GITHUB_OUTPUT - echo "TOKEN_NAME=nccl-test-${JOB_NAME}-token" >> $GITHUB_OUTPUT + - name: Login to GitHub Container Registry uses: docker/login-action@v3 with: registry: ghcr.io username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} - - name: K8s GHCR login and delete - uses: 
./.github/actions/store-delete-k8s-ghcr - with: - token-name: ${{ steps.var.outputs.TOKEN_NAME }} + - name: K8s GHCR store and delete token + id: store-token + uses: ./.github/actions/store-delete-k8s-ghcr - name: Configure Kubernetes job run: | export WORKER_NAME="${JOB_NAME}-worker" yq -i '.metadata.name = strenv(JOB_NAME) | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].image = strenv(BASE_IMAGE) | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].name = strenv(LAUNCHER_NAME) - | .spec.mpiReplicaSpecs.Launcher.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) + | .spec.mpiReplicaSpecs.Launcher.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}" | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].command[3] = strenv(TEST_NAME) | .spec.mpiReplicaSpecs.Worker.template.spec.containers[].image = strenv(BASE_IMAGE) | .spec.mpiReplicaSpecs.Worker.template.spec.containers[].name = strenv(WORKER_NAME) - | .spec.mpiReplicaSpecs.Worker.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)' \ + | .spec.mpiReplicaSpecs.Worker.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \ .github/eks-workflow-files/mpi-nccl-test.yml git diff .github/eks-workflow-files/mpi-nccl-test.yml - - name: Submit & delete Kubernetes test - uses: ./.github/actions/submit-delete-k8s-job - with: - job-config-file: ".github/eks-workflow-files/mpi-nccl-test.yml" - job-name: ${{ steps.var.output.JOB_NAME }} + + - name: Submit & delete Kubernetes test + uses: ./.github/actions/submit-delete-k8s-job + with: + job-config-file: ".github/eks-workflow-files/mpi-nccl-test.yml" + job-name: ${{ steps.var.outputs.JOB_NAME }} # Fixed outputs instead of output + - name: Retrieve Kubernetes job status shell: bash -exo pipefail {0} run: | @@ -91,19 +91,15 @@ jobs: elif [[ ${total} == 1 ]]; then break else - # Shouldn't happen, maybe a sign the job being monitored does not have a - # single launcher pod? 
exit 255 fi done exit ${failure} - # Provide more debug output in case of failure; note that some kinds of launch - # failure do not produce any log output. + - name: Debug failed Kubernetes job if: failure() run: | LAUNCHER_NAME="${{ steps.var.outputs.LAUNCHER_NAME }}" - # Provide better debug in case of launch failures that will not produce log output pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${LAUNCHER_NAME} -o name) if [[ -n "${pods}" ]]; then kubectl describe ${pods} From 64c646f245873e59df8951722b455783986d70f7 Mon Sep 17 00:00:00 2001 From: Steboss Date: Wed, 26 Feb 2025 11:43:20 +0000 Subject: [PATCH 62/89] fix errors --- .github/actions/store-delete-k8s-ghcr/action.yml | 1 - .github/workflows/_ci.yaml | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/actions/store-delete-k8s-ghcr/action.yml b/.github/actions/store-delete-k8s-ghcr/action.yml index 33a69ebe2..e089dfd21 100644 --- a/.github/actions/store-delete-k8s-ghcr/action.yml +++ b/.github/actions/store-delete-k8s-ghcr/action.yml @@ -16,7 +16,6 @@ runs: echo "token-name=$(uuidgen)" >> $GITHUB_OUTPUT - name: Delete GitHub Container Registry token uses: ./.github/actions/with-post-step - shell: bash with: main: | # Store GitHub Container Registry token as Kubernetes secret diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 17bb9262b..bf530d533 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -747,7 +747,7 @@ jobs: # the fuji test will run for 20 minutes only, as per 2025-02-24 # is not possible to set the `max_steps` value - test-axlearn-fuji-3B-eks: + test-axlearn-fuji-models-eks: needs: build-axlearn if: inputs.ARCHITECTURE == 'amd64' runs-on: eks From 9009dc4f0159337b348e42fc4b2582442cc0f048 Mon Sep 17 00:00:00 2001 From: Steboss Date: Wed, 26 Feb 2025 11:49:56 +0000 Subject: [PATCH 63/89] test uuidgen --- .github/actions/store-delete-k8s-ghcr/action.yml | 1 - 1 file changed, 1 deletion(-) diff --git 
a/.github/actions/store-delete-k8s-ghcr/action.yml b/.github/actions/store-delete-k8s-ghcr/action.yml index e089dfd21..4a0018c7d 100644 --- a/.github/actions/store-delete-k8s-ghcr/action.yml +++ b/.github/actions/store-delete-k8s-ghcr/action.yml @@ -10,7 +10,6 @@ runs: using: "composite" steps: - name: Generate a UUID token - shell: bash id: token run: | echo "token-name=$(uuidgen)" >> $GITHUB_OUTPUT From 56047630046640e3d1d50917c25c03371103be33 Mon Sep 17 00:00:00 2001 From: Steboss Date: Wed, 26 Feb 2025 12:00:12 +0000 Subject: [PATCH 64/89] test with random --- .github/actions/store-delete-k8s-ghcr/action.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/actions/store-delete-k8s-ghcr/action.yml b/.github/actions/store-delete-k8s-ghcr/action.yml index 4a0018c7d..e8761d570 100644 --- a/.github/actions/store-delete-k8s-ghcr/action.yml +++ b/.github/actions/store-delete-k8s-ghcr/action.yml @@ -10,9 +10,10 @@ runs: using: "composite" steps: - name: Generate a UUID token + shell: bash id: token run: | - echo "token-name=$(uuidgen)" >> $GITHUB_OUTPUT + echo "token-name=${RANDOM}-${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}" >> $GITHUB_OUTPUT - name: Delete GitHub Container Registry token uses: ./.github/actions/with-post-step with: From 9d53298c44d36a7ec15f0591b4f20d82be3a8372 Mon Sep 17 00:00:00 2001 From: Steboss Date: Wed, 26 Feb 2025 12:03:00 +0000 Subject: [PATCH 65/89] no shell needed --- .github/actions/submit-delete-k8s-job/action.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/actions/submit-delete-k8s-job/action.yml b/.github/actions/submit-delete-k8s-job/action.yml index d8b8cb472..a1ed4029c 100644 --- a/.github/actions/submit-delete-k8s-job/action.yml +++ b/.github/actions/submit-delete-k8s-job/action.yml @@ -15,7 +15,6 @@ runs: steps: - name: Delete Kubernetes job uses: ./.github/actions/with-post-step - shell: bash with: main: | echo "Submit K8s job" From 2eba3b7694d1ec269ca5ae821789015d619f0ecd Mon Sep 17 00:00:00 
2001 From: Steboss Date: Wed, 26 Feb 2025 13:06:39 +0000 Subject: [PATCH 66/89] revert test nccl and simplify the submit k8s --- .../actions/submit-delete-k8s-job/action.yml | 25 +++---- .github/workflows/_test_nccl.yaml | 66 +++++++++++-------- 2 files changed, 53 insertions(+), 38 deletions(-) diff --git a/.github/actions/submit-delete-k8s-job/action.yml b/.github/actions/submit-delete-k8s-job/action.yml index a1ed4029c..b58179326 100644 --- a/.github/actions/submit-delete-k8s-job/action.yml +++ b/.github/actions/submit-delete-k8s-job/action.yml @@ -9,29 +9,30 @@ inputs: description: Path to the Kubernetes job YAML required: true - runs: using: "composite" steps: - name: Delete Kubernetes job uses: ./.github/actions/with-post-step + shell: bash with: main: | echo "Submit K8s job" kubectl apply -f "${{ inputs.job-config-file }}" - # wait for the job to be created + + # Wait for job to be created kubectl wait --for=create job/${{ inputs.job-name }} --timeout=60s - - # wait for the 'spec.suspend' field to become false. Necessary for kueue + + # Wait for job to be unsuspended kubectl wait --for=jsonpath='{.spec.suspend}=false' job/${{ inputs.job-name }} --timeout=7200s - - while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ inputs.job-name }} --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do - echo "Waiting for pods to start..." 
- sleep 20 - done - - # stream the logs + + # Wait for pods to be running + kubectl wait --for=condition=Ready \ + --selector=batch.kubernetes.io/job-name=${{ inputs.job-name }} \ + --timeout=600s pod + + # Stream logs kubectl logs --all-containers=true --all-pods=true --follow job/${{ inputs.job-name }} + post: | kubectl delete job ${{ inputs.job-name }} - \ No newline at end of file diff --git a/.github/workflows/_test_nccl.yaml b/.github/workflows/_test_nccl.yaml index 200ef3b37..3ccf55809 100644 --- a/.github/workflows/_test_nccl.yaml +++ b/.github/workflows/_test_nccl.yaml @@ -3,15 +3,18 @@ name: ~run NCCL tests on: workflow_call: inputs: + # Note that cuda-dl-base installs the NCCL tests, while the vanilla nvidia/cuda + # images do not; when JAX-Toolbox moves to using cuda-dl-base this workflow ought + # to be modified to test one of the JAX-Toolbox containers. CONTAINER: type: string - description: CUDA image to use as base + description: CUDA image to use as base, e.g. nvcr.io/nvidia/cuda-dl-base:24.11-cuda12.6-devel-ubuntu24.04 required: true permissions: - actions: write - contents: read - packages: write + actions: write # to cancel previous workflows + contents: read # to fetch code + packages: write # to upload container jobs: build-mpi-operator-compatible-base: @@ -20,13 +23,12 @@ jobs: ARCHITECTURE: amd64 ARTIFACT_NAME: artifact-mpi-operator-compatible-base-build BADGE_FILENAME: badge-mpi-operator-compatible-base-build - BUILD_DATE: 0000-00-00 + BUILD_DATE: 0000-00-00 # not important; this image is never published BASE_IMAGE: ${{ inputs.CONTAINER }} CONTAINER_NAME: mpi-operator-compatible-base DOCKERFILE: .github/container/Dockerfile.mpi-operator-compatible-base RUNNER_SIZE: small secrets: inherit - nccl-test: needs: build-mpi-operator-compatible-base strategy: @@ -36,19 +38,9 @@ jobs: env: BASE_IMAGE: ${{ needs.build-mpi-operator-compatible-base.outputs.DOCKER_TAG_FINAL }} TEST_NAME: ${{ matrix.test }} - steps: - name: Check out the repository uses: 
actions/checkout@v4 - - - name: Modify variables - id: var - shell: bash - run: | - export JOB_NAME="nccl-test-${{ github.run_id }}-${TEST_NAME//_/-}" - echo "JOB_NAME=${JOB_NAME}" >> $GITHUB_OUTPUT - echo "LAUNCHER_NAME=${JOB_NAME}-launcher" >> $GITHUB_OUTPUT - - name: Login to GitHub Container Registry uses: docker/login-action@v3 with: @@ -71,17 +63,29 @@ jobs: | .spec.mpiReplicaSpecs.Worker.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \ .github/eks-workflow-files/mpi-nccl-test.yml git diff .github/eks-workflow-files/mpi-nccl-test.yml - - - name: Submit & delete Kubernetes test - uses: ./.github/actions/submit-delete-k8s-job - with: - job-config-file: ".github/eks-workflow-files/mpi-nccl-test.yml" - job-name: ${{ steps.var.outputs.JOB_NAME }} # Fixed outputs instead of output - + - name: Submit Kubernetes job + run: kubectl apply -f .github/eks-workflow-files/mpi-nccl-test.yml + - name: Wait for Kubernetes job to start + # Note that this is *not* using JOB_NAME + run: | + # Launcher job is created eagerly, but suspended. Kueue un-suspends it when + # resources are available, but that is where there can be a long wait if the + # cluster is busy executing other jobs. 
+ kubectl wait --for=create job/${LAUNCHER_NAME} + kubectl wait --for=jsonpath='{.spec.suspend}=false' job/${LAUNCHER_NAME} --timeout=3600s + - name: Stream Kubernetes job output + # Note that this is *not* JOB_NAME + run: | + # Streaming logs will fail if the container/pod is still pending + while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${LAUNCHER_NAME} --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do + sleep 1 + done + # TODO: --all-containers=true --all-pods=true could make sense here, but it + # prefixes lines with a rather verbose tag + kubectl logs --follow job/${LAUNCHER_NAME} - name: Retrieve Kubernetes job status shell: bash -exo pipefail {0} run: | - LAUNCHER_NAME="${{ steps.var.outputs.LAUNCHER_NAME }}" while readarray -d : -t status < <(kubectl get job/${LAUNCHER_NAME} -o 'jsonpath={.status.failed}:{.status.succeeded}'); do failure=${status[0]:-0} success=${status[1]:-0} @@ -91,16 +95,26 @@ jobs: elif [[ ${total} == 1 ]]; then break else + # Shouldn't happen, maybe a sign the job being monitored does not have a + # single launcher pod? exit 255 fi done exit ${failure} - + # Provide more debug output in case of failure; note that some kinds of launch + # failure do not produce any log output. 
- name: Debug failed Kubernetes job if: failure() run: | - LAUNCHER_NAME="${{ steps.var.outputs.LAUNCHER_NAME }}" + # Provide better debug in case of launch failures that will not produce log output pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${LAUNCHER_NAME} -o name) if [[ -n "${pods}" ]]; then kubectl describe ${pods} fi + # Clean up in case of errors as well as success + - name: Delete Kubernetes job + if: always() + run: kubectl delete -f .github/eks-workflow-files/mpi-nccl-test.yml + - name: Delete GitHub Container Registry token + if: always() + run: kubectl delete secret ${TOKEN_NAME} \ No newline at end of file From 900ebb220eb63f1a55e5ad5845e6f212f5c5cc35 Mon Sep 17 00:00:00 2001 From: Steboss Date: Wed, 26 Feb 2025 13:29:49 +0000 Subject: [PATCH 67/89] Fix the nccl test --- .github/workflows/_test_nccl.yaml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.github/workflows/_test_nccl.yaml b/.github/workflows/_test_nccl.yaml index 3ccf55809..53dbcdaca 100644 --- a/.github/workflows/_test_nccl.yaml +++ b/.github/workflows/_test_nccl.yaml @@ -47,6 +47,16 @@ jobs: registry: ghcr.io username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} + - name: Create env vars + id: var + shell: bash + run: | + JOB_NAME="nccl-test-${{ github.run_id }}-${{ github.run_attempt }}-${TEST_NAME//_/-}" + LAUNCHER_NAME="${JOB_NAME}-launcher" + TOKEN_NAME="${JOB_NAME}-token" + # Make these available to later steps + echo "JOB_NAME=${JOB_NAME}" >> "$GITHUB_ENV" + echo "LAUNCHER_NAME=${LAUNCHER_NAME}" >> "$GITHUB_ENV" - name: K8s GHCR store and delete token id: store-token uses: ./.github/actions/store-delete-k8s-ghcr From a5b8e082ae2ad4f3acb9b18b3e0f9296f06933ef Mon Sep 17 00:00:00 2001 From: Steboss Date: Wed, 26 Feb 2025 14:09:34 +0000 Subject: [PATCH 68/89] do not add the shell --- .github/actions/store-delete-k8s-ghcr/action.yml | 2 +- .github/actions/submit-delete-k8s-job/action.yml | 9 ++++----- 2 files changed, 5 
insertions(+), 6 deletions(-) diff --git a/.github/actions/store-delete-k8s-ghcr/action.yml b/.github/actions/store-delete-k8s-ghcr/action.yml index e8761d570..1d3acec18 100644 --- a/.github/actions/store-delete-k8s-ghcr/action.yml +++ b/.github/actions/store-delete-k8s-ghcr/action.yml @@ -24,4 +24,4 @@ runs: --from-file=.dockerconfigjson=$HOME/.docker/config.json \ --type=kubernetes.io/dockerconfigjson post: | - kubectl delete secret ${{ inputs.token-name }} + kubectl delete secret ${{ steps.token.outputs.token-name }} diff --git a/.github/actions/submit-delete-k8s-job/action.yml b/.github/actions/submit-delete-k8s-job/action.yml index b58179326..5c91af1f4 100644 --- a/.github/actions/submit-delete-k8s-job/action.yml +++ b/.github/actions/submit-delete-k8s-job/action.yml @@ -1,9 +1,9 @@ -name: Delete K8s Job -description: Cleans up the Job resource to avoid leaving pods behind. +name: Submit & Delete K8s Job +description: Submit and delete a K8s job after its execution inputs: job-name: - description: The job name to delete + description: The job name required: true job-config-file: description: Path to the Kubernetes job YAML @@ -12,9 +12,8 @@ inputs: runs: using: "composite" steps: - - name: Delete Kubernetes job + - name: Submit and Delete Kubernetes job uses: ./.github/actions/with-post-step - shell: bash with: main: | echo "Submit K8s job" From 43f75a6ddfc1edf92caa7bcba5892d205d5d302f Mon Sep 17 00:00:00 2001 From: Steboss Date: Wed, 26 Feb 2025 15:07:18 +0000 Subject: [PATCH 69/89] correct typos --- .github/eks-workflow-files/axlearn/axlearn-fuji-model.yml | 2 +- .github/eks-workflow-files/axlearn/axlearn-job.yml | 3 +-- .github/workflows/_test_nccl.yaml | 5 +---- 3 files changed, 3 insertions(+), 7 deletions(-) diff --git a/.github/eks-workflow-files/axlearn/axlearn-fuji-model.yml b/.github/eks-workflow-files/axlearn/axlearn-fuji-model.yml index c6d9db3ab..de6f6c7ad 100644 --- a/.github/eks-workflow-files/axlearn/axlearn-fuji-model.yml +++ 
b/.github/eks-workflow-files/axlearn/axlearn-fuji-model.yml @@ -13,7 +13,7 @@ spec: spec: restartPolicy: Never containers: - - name: axlearn-fuji-3B + - name: axlearn-fuji image: PLACEHOLDER command: - bash diff --git a/.github/eks-workflow-files/axlearn/axlearn-job.yml b/.github/eks-workflow-files/axlearn/axlearn-job.yml index b1ac81909..f3998ef9f 100644 --- a/.github/eks-workflow-files/axlearn/axlearn-job.yml +++ b/.github/eks-workflow-files/axlearn/axlearn-job.yml @@ -22,8 +22,7 @@ spec: test-axlearn.sh \ --directory "." \ --output "/opt/output/" \ - --test-files "/opt/axlearn/axlearn/common/*_test.py" \ - --k8s + --test-files "/opt/axlearn/axlearn/common/*_test.py" sync wait diff --git a/.github/workflows/_test_nccl.yaml b/.github/workflows/_test_nccl.yaml index 53dbcdaca..76d66ab9a 100644 --- a/.github/workflows/_test_nccl.yaml +++ b/.github/workflows/_test_nccl.yaml @@ -124,7 +124,4 @@ jobs: # Clean up in case of errors as well as success - name: Delete Kubernetes job if: always() - run: kubectl delete -f .github/eks-workflow-files/mpi-nccl-test.yml - - name: Delete GitHub Container Registry token - if: always() - run: kubectl delete secret ${TOKEN_NAME} \ No newline at end of file + run: kubectl delete -f .github/eks-workflow-files/mpi-nccl-test.yml \ No newline at end of file From e3a9e4e78f0e8281dd1ccbea97eb7ea2c02c3e4c Mon Sep 17 00:00:00 2001 From: Steboss Date: Wed, 26 Feb 2025 18:40:40 +0000 Subject: [PATCH 70/89] fix the fuji eks model --- .../axlearn/axlearn-fuji-model.yml | 40 ++++++++++++++++++- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/.github/eks-workflow-files/axlearn/axlearn-fuji-model.yml b/.github/eks-workflow-files/axlearn/axlearn-fuji-model.yml index de6f6c7ad..e2662d040 100644 --- a/.github/eks-workflow-files/axlearn/axlearn-fuji-model.yml +++ b/.github/eks-workflow-files/axlearn/axlearn-fuji-model.yml @@ -13,14 +13,50 @@ spec: spec: restartPolicy: Never containers: - - name: axlearn-fuji + - name: axlearn-fuji-model 
image: PLACEHOLDER command: - bash - -xo - pipefail - -c - - "\nBASEDIR=\"/opt/axlearn\"\nCONFIG=\"fuji-3B-v3-flash-single-host\"\nBASE_XLA_FLAGS=${BASE_XLA_FLAGS:---xla_gpu_enable_latency_hiding_scheduler=true\n --xla_gpu_enable_highest_priority_async_stream=true\n --xla_gpu_all_reduce_combine_threshold_bytes=1073741824\n --xla_gpu_all_gather_combine_threshold_bytes=1073741824\n --xla_gpu_reduce_scatter_combine_threshold_bytes=1073741824\n --xla_gpu_enable_pipelined_all_gather=true\n --xla_gpu_enable_pipelined_reduce_scatter=true\n --xla_gpu_enable_pipelined_all_reduce=true\n --xla_gpu_enable_while_loop_double_buffering=true\n --xla_gpu_enable_triton_gemm=false\n --xla_gpu_enable_all_gather_combine_by_dim=false\n --xla_gpu_enable_reduce_scatter_combine_by_dim=false\n --xla_disable_hlo_passes=rematerialization}\n\nexport XLA_FLAGS=\"$BASE_XLA_FLAGS ${XLA_FLAGS:-}\" \n\nLOG_DIR=${BASEDIR}/logs\nTRAINER_DIR=${LOG_DIR}/${CONFIG}${POSTFIX}-eks/trainer-dir\nmkdir -p ${TRAINER_DIR}\n\npython3 -m axlearn.common.launch_trainer_main \\\n --module=text.gpt.c4_trainer \\\n --config=${CONFIG} \\\n --trainer_dir=${TRAINER_DIR} \\\n --data_dir=gs://axlearn-public/tensorflow_datasets \\\n --jax_backend=gpu \n" + - | + BASEDIR="/opt/axlearn" + CONFIG="fuji-3B-v3-flash-single-host" + HLO_DUMP=0 + POSTFIX="" + + AR_THRESHOLD=1073741824 + AG_THRESHOLD=8589934592 + RS_THRESHOLD=8589934592 + BASE_XLA_FLAGS=${BASE_XLA_FLAGS:---xla_gpu_enable_latency_hiding_scheduler=true + --xla_gpu_enable_highest_priority_async_stream=true + --xla_gpu_all_reduce_combine_threshold_bytes=1073741824 + --xla_gpu_all_gather_combine_threshold_bytes=1073741824 + --xla_gpu_reduce_scatter_combine_threshold_bytes=1073741824 + --xla_gpu_enable_pipelined_all_gather=true + --xla_gpu_enable_pipelined_reduce_scatter=true + --xla_gpu_enable_pipelined_all_reduce=true + --xla_gpu_enable_while_loop_double_buffering=true + --xla_gpu_enable_triton_gemm=false + --xla_gpu_enable_all_gather_combine_by_dim=false + 
--xla_gpu_enable_reduce_scatter_combine_by_dim=false + --xla_disable_hlo_passes=rematerialization} + + export XLA_FLAGS="$BASE_XLA_FLAGS ${XLA_FLAGS:-}" + export TF_GPU_ALLOCATOR=cuda_malloc_async + + LOG_DIR=${BASEDIR}/logs + TRAINER_DIR=${LOG_DIR}/${CONFIG}${POSTFIX}-eks/trainer-dir + mkdir -p ${TRAINER_DIR} + + + python3 -m axlearn.common.launch_trainer_main \ + --module=text.gpt.c4_trainer \ + --config=${CONFIG} \ + --trainer_dir=${TRAINER_DIR} \ + --data_dir=gs://axlearn-public/tensorflow_datasets \ + --jax_backend=gpu resources: limits: nvidia.com/gpu: 8 From 785f8ae6d3e8bce5524763e46128323d738cd001 Mon Sep 17 00:00:00 2001 From: Steboss Date: Thu, 27 Feb 2025 08:58:11 +0000 Subject: [PATCH 71/89] remove k8s --- .github/container/test-axlearn.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/container/test-axlearn.sh b/.github/container/test-axlearn.sh index b9c3f2dfe..579582a80 100755 --- a/.github/container/test-axlearn.sh +++ b/.github/container/test-axlearn.sh @@ -22,7 +22,6 @@ usage() { DIR='axlearn/axlearn/common' TEST_FILES=() OUTPUT_DIRECTORY='' -K8S=false # Parse args manually while [[ $# -gt 0 ]]; do From 7c2da3fe3aad525c39709e34bd5148d427eb8728 Mon Sep 17 00:00:00 2001 From: Steboss Date: Thu, 27 Feb 2025 09:41:57 +0000 Subject: [PATCH 72/89] remove test-fuji.sh, test with slurm --- .github/container/Dockerfile.axlearn | 2 +- .github/container/test-fuji.sh | 33 ---------------------------- 2 files changed, 1 insertion(+), 34 deletions(-) delete mode 100755 .github/container/test-fuji.sh diff --git a/.github/container/Dockerfile.axlearn b/.github/container/Dockerfile.axlearn index 8c609d08d..b34923e29 100644 --- a/.github/container/Dockerfile.axlearn +++ b/.github/container/Dockerfile.axlearn @@ -38,7 +38,7 @@ EOF ## Add test script to the path ############################################################################### -ADD test-axlearn.sh test-fuji.sh /usr/local/bin/ +ADD test-axlearn.sh /usr/local/bin/ 
############################################################################### ## Install accumulated packages from the base image and the previous stage diff --git a/.github/container/test-fuji.sh b/.github/container/test-fuji.sh deleted file mode 100755 index 9018f37de..000000000 --- a/.github/container/test-fuji.sh +++ /dev/null @@ -1,33 +0,0 @@ -#! /bin/bash -BASEDIR="/opt/host/" -CONFIG="fuji-1B-v3-flash" -POSTFIX=${POSTFIX:=""} - -BASE_XLA_FLAGS=${BASE_XLA_FLAGS:---xla_gpu_enable_latency_hiding_scheduler=true - --xla_gpu_enable_highest_priority_async_stream=true - --xla_gpu_all_reduce_combine_threshold_bytes=1073741824 - --xla_gpu_all_gather_combine_threshold_bytes=1073741824 - --xla_gpu_reduce_scatter_combine_threshold_bytes=1073741824 - --xla_gpu_enable_pipelined_all_gather=true - --xla_gpu_enable_pipelined_reduce_scatter=true - --xla_gpu_enable_pipelined_all_reduce=true - --xla_gpu_enable_while_loop_double_buffering=true - --xla_gpu_enable_triton_gemm=false - --xla_gpu_enable_all_gather_combine_by_dim=false - --xla_gpu_enable_reduce_scatter_combine_by_dim=false - --xla_disable_hlo_passes=rematerialization} - -export XLA_FLAGS="$BASE_XLA_FLAGS ${XLA_FLAGS:-}" - -LOG_DIF=${BASEDIR}/logs -TRAINER_DIR=${LOG_DIF}/${CONFIG}_N${SLURM_JOB_NUM_NODES}_n${SLURM_NTASKS}/trainer-logs -mkdir -p ${TRAINER_DIR} - -#test "${WITH_MP}" == 1 && export MP_ARGS="--num_processes=${SLURM_NTASKS} --distributed_coordinator=${SLURM_LAUNCH_NODE_IPADDR}:12345 --process_id=${SLURM_PROCID}" - -python3 -m axlearn.common.launch_trainer_main \ - --module=text.gpt.c4_trainer \ - --config=${CONFIG} \ - --trainer_dir=${TRAINER_DIR} \ - --data_dir=gs://axlearn-public/tensorflow_datasets \ - --jax_backend=gpu From d2823b8d74b51ae045dfd00ceab575581cee52e3 Mon Sep 17 00:00:00 2001 From: Steboss Date: Thu, 27 Feb 2025 16:32:52 +0000 Subject: [PATCH 73/89] try to not install seqio for tensorflow --- .github/container/Dockerfile.axlearn | 1 - 1 file changed, 1 deletion(-) diff --git 
a/.github/container/Dockerfile.axlearn b/.github/container/Dockerfile.axlearn index b34923e29..8f0ceabac 100644 --- a/.github/container/Dockerfile.axlearn +++ b/.github/container/Dockerfile.axlearn @@ -21,7 +21,6 @@ aqtp==0.8.2 einops==0.8.0 nltk==3.7 portpicker==1.6.0 -seqio==0.0.18 protobuf==3.20.3 pytest>=7.4.3 REQUIREMENTS From 5afc8d9a4011cc4cfa09062d37fda2bea5a93d80 Mon Sep 17 00:00:00 2001 From: Steboss Date: Thu, 27 Feb 2025 17:23:23 +0000 Subject: [PATCH 74/89] recommit seqio --- .github/container/Dockerfile.axlearn | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/container/Dockerfile.axlearn b/.github/container/Dockerfile.axlearn index 8f0ceabac..b34923e29 100644 --- a/.github/container/Dockerfile.axlearn +++ b/.github/container/Dockerfile.axlearn @@ -21,6 +21,7 @@ aqtp==0.8.2 einops==0.8.0 nltk==3.7 portpicker==1.6.0 +seqio==0.0.18 protobuf==3.20.3 pytest>=7.4.3 REQUIREMENTS From bbe8c3bff04f91130675922d720705a664188399 Mon Sep 17 00:00:00 2001 From: Steboss Date: Fri, 28 Feb 2025 09:28:23 +0000 Subject: [PATCH 75/89] substitute tensorflow with cpu one --- .github/container/pip-finalize.sh | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.github/container/pip-finalize.sh b/.github/container/pip-finalize.sh index 6d8ceac9b..56013ac78 100755 --- a/.github/container/pip-finalize.sh +++ b/.github/container/pip-finalize.sh @@ -46,6 +46,15 @@ if [[ $(echo -n "$unpinned_vcs_dependencies" | wc -l) -gt 0 ]]; then exit 1 fi +# Replace any tensorflow==X with tensorflow-cpu==X in requirements.txt +sed -i 's/^tensorflow==\([0-9.*]\+\)$/tensorflow-cpu==\1/' requirements.txt + +# Replace any torch==Y with torch==Y+cpu in requirements.txt +sed -i 's/^torch==\([0-9.*]\+\)$/torch==\1+cpu/' requirements.txt + +# Add the --find-links option for PyTorch wheels +echo "--find-links https://download.pytorch.org/whl/torch" >> requirements.txt + # --no-deps is required since conflicts can still appear during pip-sync pip-sync --pip-args '--no-deps --src /opt' 
requirements.txt From f711efc38d6918a2131faa6cc6a1fc86f9cf790c Mon Sep 17 00:00:00 2001 From: Steboss Date: Fri, 28 Feb 2025 09:52:18 +0000 Subject: [PATCH 76/89] fix the test --- .github/container/test-axlearn.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/container/test-axlearn.sh b/.github/container/test-axlearn.sh index 579582a80..a46bc0e83 100755 --- a/.github/container/test-axlearn.sh +++ b/.github/container/test-axlearn.sh @@ -93,8 +93,8 @@ cd "$DIR" || exit 1 echo "Running tests..." -pip install torch --extra-index-url https://download.pytorch.org/whl/cpu -pip install transformers scikit-learn timm +pip install transformers --no-deps +pip install scikit-learn timm if [ "${#TEST_FILES[@]}" -eq 0 ]; then From faf0b83ec0afe32d05418f3c37a4df3a39e550fa Mon Sep 17 00:00:00 2001 From: Steboss Date: Fri, 28 Feb 2025 10:38:35 +0000 Subject: [PATCH 77/89] fix installation process --- .github/container/Dockerfile.axlearn | 6 ------ .github/container/pip-finalize.sh | 15 ++++++--------- .github/container/test-axlearn.sh | 4 ++-- 3 files changed, 8 insertions(+), 17 deletions(-) diff --git a/.github/container/Dockerfile.axlearn b/.github/container/Dockerfile.axlearn index b34923e29..ac73d07c6 100644 --- a/.github/container/Dockerfile.axlearn +++ b/.github/container/Dockerfile.axlearn @@ -25,12 +25,6 @@ seqio==0.0.18 protobuf==3.20.3 pytest>=7.4.3 REQUIREMENTS - # Only append "tensorflow-cpu" if running on x86_64 - if [ "$(uname -m)" = "x86_64" ]; then - echo "tensorflow-cpu" >> /opt/pip-tools.d/requirements-axlearn.in - else - echo "Skipping TF on $(uname -m)" - fi EOF diff --git a/.github/container/pip-finalize.sh b/.github/container/pip-finalize.sh index 56013ac78..285da565c 100755 --- a/.github/container/pip-finalize.sh +++ b/.github/container/pip-finalize.sh @@ -46,15 +46,12 @@ if [[ $(echo -n "$unpinned_vcs_dependencies" | wc -l) -gt 0 ]]; then exit 1 fi -# Replace any tensorflow==X with tensorflow-cpu==X in requirements.txt -sed 
-i 's/^tensorflow==\([0-9.*]\+\)$/tensorflow-cpu==\1/' requirements.txt - -# Replace any torch==Y with torch==Y+cpu in requirements.txt -sed -i 's/^torch==\([0-9.*]\+\)$/torch==\1+cpu/' requirements.txt - -# Add the --find-links option for PyTorch wheels -echo "--find-links https://download.pytorch.org/whl/torch" >> requirements.txt - +# Replace any tensorflow==X with tensorflow-cpu==X in requirements.txt only on amd64 +if [ "$(uname -m)" = "x86_64" ]; then + sed -i 's/^tensorflow==\([0-9.*]\+\)$/tensorflow-cpu==\1/' requirements.txt +else + echo "Skipping TF on $(uname -m)" +fi # --no-deps is required since conflicts can still appear during pip-sync pip-sync --pip-args '--no-deps --src /opt' requirements.txt diff --git a/.github/container/test-axlearn.sh b/.github/container/test-axlearn.sh index a46bc0e83..d1993cc03 100755 --- a/.github/container/test-axlearn.sh +++ b/.github/container/test-axlearn.sh @@ -93,8 +93,8 @@ cd "$DIR" || exit 1 echo "Running tests..." -pip install transformers --no-deps -pip install scikit-learn timm +pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu +pip install timm transformers scikit-learn if [ "${#TEST_FILES[@]}" -eq 0 ]; then From b2579cb0f03b94a387e80e05fc6687ca77477694 Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 3 Mar 2025 11:29:56 +0000 Subject: [PATCH 78/89] @olupton comments work --- .../actions/submit-delete-k8s-job/action.yml | 2 +- .../axlearn/axlearn-job.yml | 8 ++- .github/workflows/_ci.yaml | 17 ++--- .github/workflows/_test_nccl.yaml | 2 +- README.md | 25 +++++++ rosetta/rosetta/projects/axlearn/README.md | 59 +++++++++++++++ .../projects/axlearn/scripts/eks-fuji.yaml | 66 +++++++++++++++++ .../projects/axlearn/scripts/multinode.py | 71 +++++++++++++++++++ 8 files changed, 238 insertions(+), 12 deletions(-) create mode 100644 rosetta/rosetta/projects/axlearn/README.md create mode 100644 rosetta/rosetta/projects/axlearn/scripts/eks-fuji.yaml create mode 100644 
rosetta/rosetta/projects/axlearn/scripts/multinode.py diff --git a/.github/actions/submit-delete-k8s-job/action.yml b/.github/actions/submit-delete-k8s-job/action.yml index 5c91af1f4..dbeabe668 100644 --- a/.github/actions/submit-delete-k8s-job/action.yml +++ b/.github/actions/submit-delete-k8s-job/action.yml @@ -34,4 +34,4 @@ runs: kubectl logs --all-containers=true --all-pods=true --follow job/${{ inputs.job-name }} post: | - kubectl delete job ${{ inputs.job-name }} + kubectl delete -f "${{ inputs.job-config-file }}" diff --git a/.github/eks-workflow-files/axlearn/axlearn-job.yml b/.github/eks-workflow-files/axlearn/axlearn-job.yml index f3998ef9f..d27ee53d5 100644 --- a/.github/eks-workflow-files/axlearn/axlearn-job.yml +++ b/.github/eks-workflow-files/axlearn/axlearn-job.yml @@ -37,7 +37,7 @@ spec: - name: upload image: amazon/aws-cli env: - - name: TEST_DATE + - name: RUN_ID value: PLACEHOLDER command: - sh @@ -47,7 +47,11 @@ spec: sleep 5 done # Upload to S3 bucket - aws s3 cp /opt/output/summary.txt s3://jax-toolbox-eks-output/axlearn/${TEST_DATE}/summary.txt + aws s3 cp /opt/output/summary.txt s3://jax-toolbox-eks-output/axlearn/${RUN_ID}/summary.txt + # Zip the results of all the tests + tar -czf test_logs.tar.gz /opt/output + # Upload logs to S3 bucket + aws s3 cp /opt/output/summary.txt s3://jax-toolbox-eks-output/axlearn/${RUN_ID}/test_logs.tar.gz volumeMounts: - name: output mountPath: /opt/output diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index bf530d533..eaaf82b21 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -688,11 +688,12 @@ jobs: - name: Download logs from S3 id: log-s3 run: | - mkdir -p /tmp/axlearn-output - aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/summary.txt /tmp/axlearn-output/ + mkdir -p axlearn-output + aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/summary.txt axlearn-output/ + aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ 
github.run_id }}/test_logs.tar.gz axlearn-output/ - passed_tests=$(grep -c ": PASSED" /tmp/axlearn-output/summary.txt || true) - failed_tests=$(grep -c ": FAILED" /tmp/axlearn-output/summary.txt || true) + passed_tests=$(grep -c ": PASSED" axlearn-output/summary.txt || true) + failed_tests=$(grep -c ": FAILED" axlearn-output/summary.txt || true) total_tests=$((failed_tests + passed_tests)) echo "Passed tests: $passed_tests" @@ -733,7 +734,7 @@ jobs: message="Passed $passed_tests out of $total_tests." \ color=$badge_color \ to_json schemaVersion label message color \ - > "badge-axlearn-test" + > badge-axlearn-test.json - name: Upload artifacts if: ${{ !cancelled() }} @@ -742,8 +743,8 @@ jobs: name: "artifact-axlearn-test" path: | sitrep.json - "badge-axlearn-test" - summary.txt + badge-axlearn-test.json + axlearn-output/* # the fuji test will run for 20 minutes only, as per 2025-02-24 # is not possible to set the `max_steps` value @@ -779,5 +780,5 @@ jobs: uses: ./.github/actions/submit-delete-k8s-job with: job-config-file: ".github/eks-workflow-files/axlearn/axlearn-fuji-model.yml" - job-name: ${{ env.JOB_NAME }} + job-name: ${{ env.JOB_NAME }}https://docs.google.com/spreadsheets/d/12JIThodWLhf-H7Ob9p3CGZHLjKEPp17ogp9Do5Ofa6U/edit?gid=1030128481#gid=1030128481 diff --git a/.github/workflows/_test_nccl.yaml b/.github/workflows/_test_nccl.yaml index 76d66ab9a..f8b328b76 100644 --- a/.github/workflows/_test_nccl.yaml +++ b/.github/workflows/_test_nccl.yaml @@ -124,4 +124,4 @@ jobs: # Clean up in case of errors as well as success - name: Delete Kubernetes job if: always() - run: kubectl delete -f .github/eks-workflow-files/mpi-nccl-test.yml \ No newline at end of file + run: kubectl delete -f .github/eks-workflow-files/mpi-nccl-test.yml diff --git a/README.md b/README.md index 648208205..83053215e 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,7 @@ We support and test the following JAX frameworks and model architectures. 
More d | [t5x](./rosetta/rosetta/projects/imagen) | Imagen | pre-training | `ghcr.io/nvidia/t5x:imagen-2023-10-02.v3` | | [big vision](./rosetta/rosetta/projects/paligemma) | PaliGemma | fine-tuning, evaluation | `ghcr.io/nvidia/jax:gemma` | | levanter | GPT, LLaMA, MPT, Backpacks | pretraining, fine-tuning | `ghcr.io/nvidia/jax:levanter` | +| axlearn | Fuji | pretraining | `gchr.io/nvidia/jax:axlearn` | # Build Pipeline Status @@ -248,6 +249,30 @@ We support and test the following JAX frameworks and model architectures. More d + + + + + +
+ + + + + ghcr.io/nvidia/jax:axlearn + + + + +
+ + + +
+ + + +
diff --git a/rosetta/rosetta/projects/axlearn/README.md b/rosetta/rosetta/projects/axlearn/README.md new file mode 100644 index 000000000..f4c8f6679 --- /dev/null +++ b/rosetta/rosetta/projects/axlearn/README.md @@ -0,0 +1,59 @@ +# AXLearn +[AXLearn](https://github.com/apple/axlearn) is a deep learning design framework, built on top of JAX and XLA, to support the development of large-scale models. + + +## Hardware and Software Specifications + +Functionality have been validated on AWS p5.48xlarge EKS cluster (8x H100 80G); please refer to the [Configs](#configs) section below for some initial configs and performance numbers. We will continue to populate it with more models and configs. We provide both singlenode and multinode pre-training support. If running on a machine with less than 80G memory, some of the default configurations may run out of memory; if you run out of memory and have more GPUs available, increase your GPU count and decrease your batch size per GPU. + + +## Containers +We provide a fully built and ready-to-use multi-arch container, bleeding edge: `ghcr.io/nvidia/jax:axlearn`. We also provide nightly dated images with the naming pattern `ghcr.io/nvidia/jax:axlearn-YYYY-MM-DD`, but we encourage you to use the latest ones for the best performance. + +*Note*: All paths mentioned in subsequent sections are relative to the top-level directory of the AXLearn repository. When working interactively with containers, make sure you navigate to `/opt/axlearn` before running any commmands. + +## Launching a container +Use the following command to launch a container: +``` +docker run -ti --gpus=all --net=host --ipc=host -v :/opt/axlearn/workspace -w /opt/axlearn /bin/bash +``` +where `WORKSPACE_PATH` is the path to the directory where you would like to store any persistent files and `container` is the name of the maxtext container. You can additionally add dataset and vocab paths with the `-v` flag. 
+ +## Running a Fuji model +### Quick Runs + +#### EKS Single node: `fuji-3B-v3-flash-single-host` +Fuji models are defined with 1B, 3B, 7B or 70B parameters. In this example, we deploy the training for a Fuji-3B model, that uses flash attention, and runs on a single host. [Here](scripts/eks-fuji.yaml) we provide an example deployment file. The core point of the deployment is: +```bash +python3 -m axlearn.common.launch_trainer_main \ + --module=text.gpt.c4_trainer \ + --config=${CONFIG} \ + --trainer_dir=${TRAINER_DIR} \ + --data_dir=gs://axlearn-public/tensorflow_datasets \ + --jax_backend=gpu +``` +Where `CONFIG="fuji-3B-v3-flash-single-host`. The input dataset is the public tensorflow [C4 dataset](https://www.tensorflow.org/datasets/catalog/c4). + +#### Running a multinode job for `fuji-XB-v2-flash` + +For running a multinode job we provide a [custom example](scripts/multinode.py). The code access AXLearn directly, it allows to specify a custom dataset, the number of GPUs to use, the global batch size, as well as the `max_sequence_length`. + + +## XLA Flags +The [GPU Performance document](../../../docs/GPU_performance.md) provides a detailed description of the XLA flags that can be set to optimize performance. These are the recommended XLA flags to get good performance for AXLearn. 
+ +``` +XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true + --xla_gpu_enable_triton_gemm=false + --xla_gpu_enable_command_buffer= + --xla_gpu_all_reduce_combine_threshold_bytes=1073741824 + --xla_gpu_all_gather_combine_threshold_bytes=1073741824 + --xla_gpu_reduce_scatter_combine_threshold_bytes=1073741824 + --xla_gpu_enable_pipelined_all_gather=true + --xla_gpu_enable_pipelined_reduce_scatter=true + --xla_gpu_enable_pipelined_all_reduce=true + --xla_gpu_enable_while_loop_double_buffering=true + --xla_gpu_enable_all_gather_combine_by_dim=false + --xla_gpu_enable_reduce_scatter_combine_by_dim=false + --xla_disable_hlo_passes=rematerialization" +``` \ No newline at end of file diff --git a/rosetta/rosetta/projects/axlearn/scripts/eks-fuji.yaml b/rosetta/rosetta/projects/axlearn/scripts/eks-fuji.yaml new file mode 100644 index 000000000..8d24a1658 --- /dev/null +++ b/rosetta/rosetta/projects/axlearn/scripts/eks-fuji.yaml @@ -0,0 +1,66 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: axlearn-fuji + # Specify any labels for running on a dedicated queue +spec: + completions: 1 + parallelism: 1 + template: + spec: + restartPolicy: Never + containers: + - name: axlearn-fuji-model + image: gchr.io/nvidia/jax:axlearn + command: + - bash + - -xo + - pipefail + - -c + - | + BASEDIR="/opt/axlearn" + CONFIG="fuji-3B-v3-flash-single-host" + HLO_DUMP=0 + POSTFIX="" + + AR_THRESHOLD=1073741824 + AG_THRESHOLD=8589934592 + RS_THRESHOLD=8589934592 + BASE_XLA_FLAGS=${BASE_XLA_FLAGS:---xla_gpu_enable_latency_hiding_scheduler=true + --xla_gpu_enable_highest_priority_async_stream=true + --xla_gpu_all_reduce_combine_threshold_bytes=1073741824 + --xla_gpu_all_gather_combine_threshold_bytes=1073741824 + --xla_gpu_reduce_scatter_combine_threshold_bytes=1073741824 + --xla_gpu_enable_pipelined_all_gather=true + --xla_gpu_enable_pipelined_reduce_scatter=true + --xla_gpu_enable_pipelined_all_reduce=true + --xla_gpu_enable_while_loop_double_buffering=true + 
--xla_gpu_enable_triton_gemm=false + --xla_gpu_enable_all_gather_combine_by_dim=false + --xla_gpu_enable_reduce_scatter_combine_by_dim=false + --xla_disable_hlo_passes=rematerialization} + + export XLA_FLAGS="$BASE_XLA_FLAGS ${XLA_FLAGS:-}" + export TF_GPU_ALLOCATOR=cuda_malloc_async + + LOG_DIR=${BASEDIR}/logs + TRAINER_DIR=${LOG_DIR}/${CONFIG}${POSTFIX}-eks/trainer-dir + mkdir -p ${TRAINER_DIR} + + + python3 -m axlearn.common.launch_trainer_main \ + --module=text.gpt.c4_trainer \ + --config=${CONFIG} \ + --trainer_dir=${TRAINER_DIR} \ + --data_dir=gs://axlearn-public/tensorflow_datasets \ + --jax_backend=gpu + resources: + limits: + nvidia.com/gpu: 8 + volumeMounts: + - name: output + mountPath: /opt/output + # specify any image secret if needed + volumes: + - name: output + emptyDir: {} diff --git a/rosetta/rosetta/projects/axlearn/scripts/multinode.py b/rosetta/rosetta/projects/axlearn/scripts/multinode.py new file mode 100644 index 000000000..0107ebddc --- /dev/null +++ b/rosetta/rosetta/projects/axlearn/scripts/multinode.py @@ -0,0 +1,71 @@ +import os + +from absl import app, flags +from axlearn.common.launch_trainer import run_trainer +from axlearn.common.config import config_for_function +from axlearn.experiments.text.gpt import c4_trainer +from axlearn.common.trainer import SpmdTrainer + +FLAGS = flags.FLAGS +FLAGS.set_default("module", "text.gpt.c4_trainer") +FLAGS.set_default("config", "fuji-7B-v2-flash") # Set the model +FLAGS.set_default("trainer_dir", "/opt/host/axlearn-checkpoints") # Set the trainer directory + +def main(_): + axlearn_path = "/opt/axlearn" + os.environ["PYTHONPATH"] = f"{axlearn_path}:{os.environ.get('PYTHONPATH', '')}" + + n_gpus = 16 # This can be also an env variable + # Base XLA flags + base_flags = [ + "--xla_gpu_enable_latency_hiding_scheduler=true", + "--xla_gpu_enable_command_buffer=", + "--xla_gpu_enable_highest_priority_async_stream=true", + "--xla_gpu_all_reduce_combine_threshold_bytes=1073741824", + 
"--xla_gpu_all_gather_combine_threshold_bytes=1073741824", + "--xla_gpu_reduce_scatter_combine_threshold_bytes=1073741824", + "--xla_gpu_enable_pipelined_all_gather=true", + "--xla_gpu_enable_pipelined_reduce_scatter=true", + "--xla_gpu_enable_pipelined_all_reduce=true", + "--xla_gpu_enable_while_loop_double_buffering=true", + "--xla_gpu_enable_triton_gemm=false", + "--xla_gpu_enable_all_gather_combine_by_dim=false", + "--xla_gpu_enable_reduce_scatter_combine_by_dim=false", + "--xla_disable_hlo_passes=rematerialization", + ] + # Get existing flags from environment with proper fallback. + existing_xla_flags = os.environ.get("XLA_FLAGS", "").split() + # XLA flags + os.environ.update({ + "XLA_FLAGS": " ".join([ + *base_flags, + *existing_xla_flags + ])}) + + os.environ.update({ + "DATA_DIR":"gs://axlearn-public/tensorflow_datasets", # Set up your input dataset + "NUM_PROCESSES":f"{n_gpus}", + "DISTRIBUTED_COORDINATOR":"127.0.0.1:8080", + "PROCESS_ID":"0", + }) + + # Raw config + config_fn = c4_trainer.named_trainer_configs()[FLAGS.config] + trainer_config: SpmdTrainer.Config = config_for_function(config_fn).fn() + + trainer_config.max_step = 100 # Set the max number of steps to run + trainer_config.dir = "/opt/host/axlearn-checkpoints" # Use 'dir' instead of 'model_dir' + trainer_config.input.input_dispatcher.global_logical_batch_size = 8 # Tune the batch size for training + #trainer_config.input.source.max_sequence_length = 2048 # Tune the max sequence length if running in OOM + trainer_config.checkpointer.save_policy.n = 500 # Save every 500 steps + trainer_config.checkpointer.keep_every_n_steps = 500 # Keep checkpoints + trainer_config.summary_writer.write_every_n_steps = 100 # Log every 100 steps + + run_trainer( + trainer_config=trainer_config, + ) + + +if __name__ == "__main__": + from absl import app + app.run(main) From fc64bbd5e6005025771ace9ef8bafebd99d621c1 Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 3 Mar 2025 12:47:21 +0000 Subject: [PATCH 79/89] 
fix typo --- .github/workflows/_ci.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index eaaf82b21..82e1bdb35 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -780,5 +780,5 @@ jobs: uses: ./.github/actions/submit-delete-k8s-job with: job-config-file: ".github/eks-workflow-files/axlearn/axlearn-fuji-model.yml" - job-name: ${{ env.JOB_NAME }}https://docs.google.com/spreadsheets/d/12JIThodWLhf-H7Ob9p3CGZHLjKEPp17ogp9Do5Ofa6U/edit?gid=1030128481#gid=1030128481 + job-name: ${{ env.JOB_NAME }} From 7f186cc72c94bec0ef61204ffdfd23ebc5c46b4e Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 3 Mar 2025 15:17:25 +0000 Subject: [PATCH 80/89] fix readme, and copy of zip file, and xla flags --- .../axlearn/axlearn-fuji-model.yml | 4 -- .../axlearn/axlearn-job.yml | 2 +- docs/frameworks/axlearn/README.md | 40 +++++++++++ rosetta/rosetta/projects/axlearn/README.md | 59 --------------- .../projects/axlearn/scripts/eks-fuji.yaml | 66 ----------------- .../projects/axlearn/scripts/multinode.py | 71 ------------------- 6 files changed, 41 insertions(+), 201 deletions(-) create mode 100644 docs/frameworks/axlearn/README.md delete mode 100644 rosetta/rosetta/projects/axlearn/README.md delete mode 100644 rosetta/rosetta/projects/axlearn/scripts/eks-fuji.yaml delete mode 100644 rosetta/rosetta/projects/axlearn/scripts/multinode.py diff --git a/.github/eks-workflow-files/axlearn/axlearn-fuji-model.yml b/.github/eks-workflow-files/axlearn/axlearn-fuji-model.yml index e2662d040..a36411d73 100644 --- a/.github/eks-workflow-files/axlearn/axlearn-fuji-model.yml +++ b/.github/eks-workflow-files/axlearn/axlearn-fuji-model.yml @@ -30,7 +30,6 @@ spec: AG_THRESHOLD=8589934592 RS_THRESHOLD=8589934592 BASE_XLA_FLAGS=${BASE_XLA_FLAGS:---xla_gpu_enable_latency_hiding_scheduler=true - --xla_gpu_enable_highest_priority_async_stream=true --xla_gpu_all_reduce_combine_threshold_bytes=1073741824 
--xla_gpu_all_gather_combine_threshold_bytes=1073741824 --xla_gpu_reduce_scatter_combine_threshold_bytes=1073741824 @@ -38,9 +37,6 @@ spec: --xla_gpu_enable_pipelined_reduce_scatter=true --xla_gpu_enable_pipelined_all_reduce=true --xla_gpu_enable_while_loop_double_buffering=true - --xla_gpu_enable_triton_gemm=false - --xla_gpu_enable_all_gather_combine_by_dim=false - --xla_gpu_enable_reduce_scatter_combine_by_dim=false --xla_disable_hlo_passes=rematerialization} export XLA_FLAGS="$BASE_XLA_FLAGS ${XLA_FLAGS:-}" diff --git a/.github/eks-workflow-files/axlearn/axlearn-job.yml b/.github/eks-workflow-files/axlearn/axlearn-job.yml index d27ee53d5..8d0eda9e2 100644 --- a/.github/eks-workflow-files/axlearn/axlearn-job.yml +++ b/.github/eks-workflow-files/axlearn/axlearn-job.yml @@ -51,7 +51,7 @@ spec: # Zip the results of all the tests tar -czf test_logs.tar.gz /opt/output # Upload logs to S3 bucket - aws s3 cp /opt/output/summary.txt s3://jax-toolbox-eks-output/axlearn/${RUN_ID}/test_logs.tar.gz + aws s3 cp test_logs.tar.gz s3://jax-toolbox-eks-output/axlearn/${RUN_ID}/test_logs.tar.gz volumeMounts: - name: output mountPath: /opt/output diff --git a/docs/frameworks/axlearn/README.md b/docs/frameworks/axlearn/README.md new file mode 100644 index 000000000..ad7172ca7 --- /dev/null +++ b/docs/frameworks/axlearn/README.md @@ -0,0 +1,40 @@ +# AXLearn +[AXLearn](https://github.com/apple/axlearn) is a deep learning design framework, built on top of JAX and XLA, to support the development of large-scale models. + + +## Hardware and Software Specifications + +The functionality have been validated on AWS p5.48xlarge EKS cluster (8x H100 80G). + + +## Containers +We provide a multi-architecture container that is regularly updated. Use these containers to avoid dependency and environment issues. 
+- Latest container: ghcr.io/nvidia/jax:axlearn
+- Nightly dated container: ghcr.io/nvidia/jax:axlearn-YYYY-MM-DD
+
+When you start an interactive session:
+
+- Navigate to `/opt/axlearn` inside the container.
+- Place your persistent files in a mounted directory (e.g. `/opt/axlearn/workspace`).
+
+## Launching a container
+Use the following command to launch a container:
+```bash
+docker run -ti --gpus=all --net=host --ipc=host -v <WORKSPACE_PATH>:/opt/axlearn/workspace -w /opt/axlearn <container> /bin/bash
+```
+where `WORKSPACE_PATH` is the path to the directory where you would like to store any persistent files and `container` is the name of the AXLearn container. You can additionally add dataset and vocab paths with the `-v` flag.
+
+## Example: training `fuji-3B-v3-flash-single-host` on EKS
+[Here is the YAML file](../../../.github/eks-workflow-files/axlearn/axlearn-fuji-model.yml) we're using for deploying the training of a Fuji-3B model, which uses flash attention and runs on a single host. The core part of the deployment is:
+```bash
+python3 -m axlearn.common.launch_trainer_main \
+    --module=text.gpt.c4_trainer \
+    --config=${CONFIG} \
+    --trainer_dir=${TRAINER_DIR} \
+    --data_dir=gs://axlearn-public/tensorflow_datasets \
+    --jax_backend=gpu
+```
+Where `CONFIG="fuji-3B-v3-flash-single-host"`. The input dataset is the public tensorflow [C4 dataset](https://www.tensorflow.org/datasets/catalog/c4).
+
+## Testing
+[Here is the YAML file](../../../.github/eks-workflow-files/axlearn/axlearn-job.yml) used for testing AXLearn functionalities. In particular, this test makes use of the [`test-axlearn.sh` script](../../../.github/container/test-axlearn.sh). The test runs `pytest` against all the tests contained in the `/opt/axlearn/axlearn/common` folder.
diff --git a/rosetta/rosetta/projects/axlearn/README.md b/rosetta/rosetta/projects/axlearn/README.md deleted file mode 100644 index f4c8f6679..000000000 --- a/rosetta/rosetta/projects/axlearn/README.md +++ /dev/null @@ -1,59 +0,0 @@ -# AXLearn -[AXLearn](https://github.com/apple/axlearn) is a deep learning design framework, built on top of JAX and XLA, to support the development of large-scale models. - - -## Hardware and Software Specifications - -Functionality have been validated on AWS p5.48xlarge EKS cluster (8x H100 80G); please refer to the [Configs](#configs) section below for some initial configs and performance numbers. We will continue to populate it with more models and configs. We provide both singlenode and multinode pre-training support. If running on a machine with less than 80G memory, some of the default configurations may run out of memory; if you run out of memory and have more GPUs available, increase your GPU count and decrease your batch size per GPU. - - -## Containers -We provide a fully built and ready-to-use multi-arch container, bleeding edge: `ghcr.io/nvidia/jax:axlearn`. We also provide nightly dated images with the naming pattern `ghcr.io/nvidia/jax:axlearn-YYYY-MM-DD`, but we encourage you to use the latest ones for the best performance. - -*Note*: All paths mentioned in subsequent sections are relative to the top-level directory of the AXLearn repository. When working interactively with containers, make sure you navigate to `/opt/axlearn` before running any commmands. - -## Launching a container -Use the following command to launch a container: -``` -docker run -ti --gpus=all --net=host --ipc=host -v :/opt/axlearn/workspace -w /opt/axlearn /bin/bash -``` -where `WORKSPACE_PATH` is the path to the directory where you would like to store any persistent files and `container` is the name of the maxtext container. You can additionally add dataset and vocab paths with the `-v` flag. 
- -## Running a Fuji model -### Quick Runs - -#### EKS Single node: `fuji-3B-v3-flash-single-host` -Fuji models are defined with 1B, 3B, 7B or 70B parameters. In this example, we deploy the training for a Fuji-3B model, that uses flash attention, and runs on a single host. [Here](scripts/eks-fuji.yaml) we provide an example deployment file. The core point of the deployment is: -```bash -python3 -m axlearn.common.launch_trainer_main \ - --module=text.gpt.c4_trainer \ - --config=${CONFIG} \ - --trainer_dir=${TRAINER_DIR} \ - --data_dir=gs://axlearn-public/tensorflow_datasets \ - --jax_backend=gpu -``` -Where `CONFIG="fuji-3B-v3-flash-single-host`. The input dataset is the public tensorflow [C4 dataset](https://www.tensorflow.org/datasets/catalog/c4). - -#### Running a multinode job for `fuji-XB-v2-flash` - -For running a multinode job we provide a [custom example](scripts/multinode.py). The code access AXLearn directly, it allows to specify a custom dataset, the number of GPUs to use, the global batch size, as well as the `max_sequence_length`. - - -## XLA Flags -The [GPU Performance document](../../../docs/GPU_performance.md) provides a detailed description of the XLA flags that can be set to optimize performance. These are the recommended XLA flags to get good performance for AXLearn. 
- -``` -XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true - --xla_gpu_enable_triton_gemm=false - --xla_gpu_enable_command_buffer= - --xla_gpu_all_reduce_combine_threshold_bytes=1073741824 - --xla_gpu_all_gather_combine_threshold_bytes=1073741824 - --xla_gpu_reduce_scatter_combine_threshold_bytes=1073741824 - --xla_gpu_enable_pipelined_all_gather=true - --xla_gpu_enable_pipelined_reduce_scatter=true - --xla_gpu_enable_pipelined_all_reduce=true - --xla_gpu_enable_while_loop_double_buffering=true - --xla_gpu_enable_all_gather_combine_by_dim=false - --xla_gpu_enable_reduce_scatter_combine_by_dim=false - --xla_disable_hlo_passes=rematerialization" -``` \ No newline at end of file diff --git a/rosetta/rosetta/projects/axlearn/scripts/eks-fuji.yaml b/rosetta/rosetta/projects/axlearn/scripts/eks-fuji.yaml deleted file mode 100644 index 8d24a1658..000000000 --- a/rosetta/rosetta/projects/axlearn/scripts/eks-fuji.yaml +++ /dev/null @@ -1,66 +0,0 @@ -apiVersion: batch/v1 -kind: Job -metadata: - name: axlearn-fuji - # Specify any labels for running on a dedicated queue -spec: - completions: 1 - parallelism: 1 - template: - spec: - restartPolicy: Never - containers: - - name: axlearn-fuji-model - image: gchr.io/nvidia/jax:axlearn - command: - - bash - - -xo - - pipefail - - -c - - | - BASEDIR="/opt/axlearn" - CONFIG="fuji-3B-v3-flash-single-host" - HLO_DUMP=0 - POSTFIX="" - - AR_THRESHOLD=1073741824 - AG_THRESHOLD=8589934592 - RS_THRESHOLD=8589934592 - BASE_XLA_FLAGS=${BASE_XLA_FLAGS:---xla_gpu_enable_latency_hiding_scheduler=true - --xla_gpu_enable_highest_priority_async_stream=true - --xla_gpu_all_reduce_combine_threshold_bytes=1073741824 - --xla_gpu_all_gather_combine_threshold_bytes=1073741824 - --xla_gpu_reduce_scatter_combine_threshold_bytes=1073741824 - --xla_gpu_enable_pipelined_all_gather=true - --xla_gpu_enable_pipelined_reduce_scatter=true - --xla_gpu_enable_pipelined_all_reduce=true - --xla_gpu_enable_while_loop_double_buffering=true - 
--xla_gpu_enable_triton_gemm=false - --xla_gpu_enable_all_gather_combine_by_dim=false - --xla_gpu_enable_reduce_scatter_combine_by_dim=false - --xla_disable_hlo_passes=rematerialization} - - export XLA_FLAGS="$BASE_XLA_FLAGS ${XLA_FLAGS:-}" - export TF_GPU_ALLOCATOR=cuda_malloc_async - - LOG_DIR=${BASEDIR}/logs - TRAINER_DIR=${LOG_DIR}/${CONFIG}${POSTFIX}-eks/trainer-dir - mkdir -p ${TRAINER_DIR} - - - python3 -m axlearn.common.launch_trainer_main \ - --module=text.gpt.c4_trainer \ - --config=${CONFIG} \ - --trainer_dir=${TRAINER_DIR} \ - --data_dir=gs://axlearn-public/tensorflow_datasets \ - --jax_backend=gpu - resources: - limits: - nvidia.com/gpu: 8 - volumeMounts: - - name: output - mountPath: /opt/output - # specify any image secret if needed - volumes: - - name: output - emptyDir: {} diff --git a/rosetta/rosetta/projects/axlearn/scripts/multinode.py b/rosetta/rosetta/projects/axlearn/scripts/multinode.py deleted file mode 100644 index 0107ebddc..000000000 --- a/rosetta/rosetta/projects/axlearn/scripts/multinode.py +++ /dev/null @@ -1,71 +0,0 @@ -import os - -from absl import app, flags -from axlearn.common.launch_trainer import run_trainer -from axlearn.common.config import config_for_function -from axlearn.experiments.text.gpt import c4_trainer -from axlearn.common.trainer import SpmdTrainer - -FLAGS = flags.FLAGS -FLAGS.set_default("module", "text.gpt.c4_trainer") -FLAGS.set_default("config", "fuji-7B-v2-flash") # Set the model -FLAGS.set_default("trainer_dir", "/opt/host/axlearn-checkpoints") # Set the trainer directory - -def main(_): - axlearn_path = "/opt/axlearn" - os.environ["PYTHONPATH"] = f"{axlearn_path}:{os.environ.get('PYTHONPATH', '')}" - - n_gpus = 16 # This can be also an env variable - # Base XLA flags - base_flags = [ - "--xla_gpu_enable_latency_hiding_scheduler=true", - "--xla_gpu_enable_command_buffer=", - "--xla_gpu_enable_highest_priority_async_stream=true", - "--xla_gpu_all_reduce_combine_threshold_bytes=1073741824", - 
"--xla_gpu_all_gather_combine_threshold_bytes=1073741824", - "--xla_gpu_reduce_scatter_combine_threshold_bytes=1073741824", - "--xla_gpu_enable_pipelined_all_gather=true", - "--xla_gpu_enable_pipelined_reduce_scatter=true", - "--xla_gpu_enable_pipelined_all_reduce=true", - "--xla_gpu_enable_while_loop_double_buffering=true", - "--xla_gpu_enable_triton_gemm=false", - "--xla_gpu_enable_all_gather_combine_by_dim=false", - "--xla_gpu_enable_reduce_scatter_combine_by_dim=false", - "--xla_disable_hlo_passes=rematerialization", - ] - # Get existing flags from environment with proper fallback. - existing_xla_flags = os.environ.get("XLA_FLAGS", "").split() - # XLA flags - os.environ.update({ - "XLA_FLAGS": " ".join([ - *base_flags, - *existing_xla_flags - ])}) - - os.environ.update({ - "DATA_DIR":"gs://axlearn-public/tensorflow_datasets", # Set up your input dataset - "NUM_PROCESSES":f"{n_gpus}", - "DISTRIBUTED_COORDINATOR":"127.0.0.1:8080", - "PROCESS_ID":"0", - }) - - # Raw config - config_fn = c4_trainer.named_trainer_configs()[FLAGS.config] - trainer_config: SpmdTrainer.Config = config_for_function(config_fn).fn() - - trainer_config.max_step = 100 # Set the max number of steps to run - trainer_config.dir = "/opt/host/axlearn-checkpoints" # Use 'dir' instead of 'model_dir' - trainer_config.input.input_dispatcher.global_logical_batch_size = 8 # Tune the batch size for training - #trainer_config.input.source.max_sequence_length = 2048 # Tune the max sequence length if running in OOM - trainer_config.checkpointer.save_policy.n = 500 # Save every 500 steps - trainer_config.checkpointer.keep_every_n_steps = 500 # Keep checkpoints - trainer_config.summary_writer.write_every_n_steps = 100 # Log every 100 steps - - run_trainer( - trainer_config=trainer_config, - ) - - -if __name__ == "__main__": - from absl import app - app.run(main) From 63ace5d8ceb38d31e57f663f1c5224f92a3491d2 Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 3 Mar 2025 17:45:15 +0000 Subject: [PATCH 81/89] 
fix test error --- .github/eks-workflow-files/axlearn/axlearn-job.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/eks-workflow-files/axlearn/axlearn-job.yml b/.github/eks-workflow-files/axlearn/axlearn-job.yml index 8d0eda9e2..d3dd154df 100644 --- a/.github/eks-workflow-files/axlearn/axlearn-job.yml +++ b/.github/eks-workflow-files/axlearn/axlearn-job.yml @@ -49,7 +49,7 @@ spec: # Upload to S3 bucket aws s3 cp /opt/output/summary.txt s3://jax-toolbox-eks-output/axlearn/${RUN_ID}/summary.txt # Zip the results of all the tests - tar -czf test_logs.tar.gz /opt/output + tar cvzf test_logs.tar.gz /opt/output # Upload logs to S3 bucket aws s3 cp test_logs.tar.gz s3://jax-toolbox-eks-output/axlearn/${RUN_ID}/test_logs.tar.gz volumeMounts: From ec6b54871e8bf8333dc3cc9b81c6699599470432 Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 3 Mar 2025 17:46:26 +0000 Subject: [PATCH 82/89] run small test --- .github/container/test-axlearn.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/container/test-axlearn.sh b/.github/container/test-axlearn.sh index d1993cc03..e31b0bf30 100755 --- a/.github/container/test-axlearn.sh +++ b/.github/container/test-axlearn.sh @@ -149,7 +149,7 @@ passed=0 SUMMARY_FILE="${OUTPUT_DIRECTORY}/summary.txt" -for test_file in "${final_test_files[@]}"; do +for test_file in "${final_test_files[@]:0:10}"; do echo "Running: ${test_file}" log_file_name=$(echo "${test_file%.py}" | sed 's/\//__/g').log log_file="${LOG_DIRECTORY}/${log_file_name}" From 68c001088101f7e43c18f5e43139fbcfb5401c90 Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 3 Mar 2025 19:32:27 +0000 Subject: [PATCH 83/89] change with zip --- .github/eks-workflow-files/axlearn/axlearn-job.yml | 4 ++-- .github/workflows/_ci.yaml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/eks-workflow-files/axlearn/axlearn-job.yml b/.github/eks-workflow-files/axlearn/axlearn-job.yml index d3dd154df..14124cc4c 100644 --- 
a/.github/eks-workflow-files/axlearn/axlearn-job.yml +++ b/.github/eks-workflow-files/axlearn/axlearn-job.yml @@ -49,9 +49,9 @@ spec: # Upload to S3 bucket aws s3 cp /opt/output/summary.txt s3://jax-toolbox-eks-output/axlearn/${RUN_ID}/summary.txt # Zip the results of all the tests - tar cvzf test_logs.tar.gz /opt/output + zip test_logs.zip /opt/output # Upload logs to S3 bucket - aws s3 cp test_logs.tar.gz s3://jax-toolbox-eks-output/axlearn/${RUN_ID}/test_logs.tar.gz + aws s3 cp test_logs.tar.gz s3://jax-toolbox-eks-output/axlearn/${RUN_ID}/test_logs.zip volumeMounts: - name: output mountPath: /opt/output diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 82e1bdb35..35784c5a3 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -690,7 +690,7 @@ jobs: run: | mkdir -p axlearn-output aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/summary.txt axlearn-output/ - aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/test_logs.tar.gz axlearn-output/ + aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/test_logs.zip axlearn-output/ passed_tests=$(grep -c ": PASSED" axlearn-output/summary.txt || true) failed_tests=$(grep -c ": FAILED" axlearn-output/summary.txt || true) From 46b6c9ee3f9e2dc344cc9d6d85fef5e27e67076c Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 3 Mar 2025 19:32:56 +0000 Subject: [PATCH 84/89] change with zip --- .github/eks-workflow-files/axlearn/axlearn-job.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/eks-workflow-files/axlearn/axlearn-job.yml b/.github/eks-workflow-files/axlearn/axlearn-job.yml index 14124cc4c..f2da9ff0d 100644 --- a/.github/eks-workflow-files/axlearn/axlearn-job.yml +++ b/.github/eks-workflow-files/axlearn/axlearn-job.yml @@ -49,7 +49,7 @@ spec: # Upload to S3 bucket aws s3 cp /opt/output/summary.txt s3://jax-toolbox-eks-output/axlearn/${RUN_ID}/summary.txt # Zip the results of all the tests - zip 
test_logs.zip /opt/output + zip -r test_logs.zip /opt/output # Upload logs to S3 bucket aws s3 cp test_logs.tar.gz s3://jax-toolbox-eks-output/axlearn/${RUN_ID}/test_logs.zip volumeMounts: From 7ef84c401c7e4f019905f6369700dc5dacbc5729 Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 3 Mar 2025 20:45:46 +0000 Subject: [PATCH 85/89] fix the copy --- .github/container/test-axlearn.sh | 2 +- .github/eks-workflow-files/axlearn/axlearn-job.yml | 4 +--- .github/workflows/_ci.yaml | 2 +- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/.github/container/test-axlearn.sh b/.github/container/test-axlearn.sh index e31b0bf30..9d7faf9dd 100755 --- a/.github/container/test-axlearn.sh +++ b/.github/container/test-axlearn.sh @@ -149,7 +149,7 @@ passed=0 SUMMARY_FILE="${OUTPUT_DIRECTORY}/summary.txt" -for test_file in "${final_test_files[@]:0:10}"; do +for test_file in "${final_test_files[@]:0:3}"; do echo "Running: ${test_file}" log_file_name=$(echo "${test_file%.py}" | sed 's/\//__/g').log log_file="${LOG_DIRECTORY}/${log_file_name}" diff --git a/.github/eks-workflow-files/axlearn/axlearn-job.yml b/.github/eks-workflow-files/axlearn/axlearn-job.yml index f2da9ff0d..8f70908da 100644 --- a/.github/eks-workflow-files/axlearn/axlearn-job.yml +++ b/.github/eks-workflow-files/axlearn/axlearn-job.yml @@ -48,10 +48,8 @@ spec: done # Upload to S3 bucket aws s3 cp /opt/output/summary.txt s3://jax-toolbox-eks-output/axlearn/${RUN_ID}/summary.txt - # Zip the results of all the tests - zip -r test_logs.zip /opt/output # Upload logs to S3 bucket - aws s3 cp test_logs.tar.gz s3://jax-toolbox-eks-output/axlearn/${RUN_ID}/test_logs.zip + aws s3 cp /opt/output/ s3://jax-toolbox-eks-output/axlearn/${RUN_ID}/ --recursive --exclude "*" --include "*.log" volumeMounts: - name: output mountPath: /opt/output diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 35784c5a3..a1d333d91 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -690,7 +690,7 
@@ jobs: run: | mkdir -p axlearn-output aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/summary.txt axlearn-output/ - aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/test_logs.zip axlearn-output/ + aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/ axlearn-output/ --recursive --exclude "*" --include "*.log" passed_tests=$(grep -c ": PASSED" axlearn-output/summary.txt || true) failed_tests=$(grep -c ": FAILED" axlearn-output/summary.txt || true) From 828073c14059954d8eed46ca52304a09a6a401d6 Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 3 Mar 2025 22:20:54 +0000 Subject: [PATCH 86/89] fixed tests and comments @olupton --- .github/container/test-axlearn.sh | 2 +- .github/workflows/_ci.yaml | 976 +++++++++++++++--------------- 2 files changed, 489 insertions(+), 489 deletions(-) diff --git a/.github/container/test-axlearn.sh b/.github/container/test-axlearn.sh index 9d7faf9dd..d1993cc03 100755 --- a/.github/container/test-axlearn.sh +++ b/.github/container/test-axlearn.sh @@ -149,7 +149,7 @@ passed=0 SUMMARY_FILE="${OUTPUT_DIRECTORY}/summary.txt" -for test_file in "${final_test_files[@]:0:3}"; do +for test_file in "${final_test_files[@]}"; do echo "Running: ${test_file}" log_file_name=$(echo "${test_file%.py}" | sed 's/\//__/g').log log_file="${LOG_DIRECTORY}/${log_file_name}" diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index a1d333d91..9ded946d2 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -66,115 +66,115 @@ jobs: URLREF_TRANSFORMER_ENGINE=${{ fromJson(inputs.SOURCE_URLREFS).TRANSFORMER_ENGINE }} secrets: inherit - # build-triton: - # needs: build-jax - # if: inputs.ARCHITECTURE == 'amd64' # Triton does not seem to support arm64 - # uses: ./.github/workflows/_build.yaml - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # ARTIFACT_NAME: artifact-triton-build - # BADGE_FILENAME: badge-triton-build - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # 
BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - # CONTAINER_NAME: triton - # DOCKERFILE: .github/container/Dockerfile.triton - # RUNNER_SIZE: large - # EXTRA_BUILD_ARGS: | - # URLREF_JAX_TRITON=${{ fromJson(inputs.SOURCE_URLREFS).JAX_TRITON }} - # secrets: inherit + build-triton: + needs: build-jax + if: inputs.ARCHITECTURE == 'amd64' # Triton does not seem to support arm64 + uses: ./.github/workflows/_build.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: artifact-triton-build + BADGE_FILENAME: badge-triton-build + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + CONTAINER_NAME: triton + DOCKERFILE: .github/container/Dockerfile.triton + RUNNER_SIZE: large + EXTRA_BUILD_ARGS: | + URLREF_JAX_TRITON=${{ fromJson(inputs.SOURCE_URLREFS).JAX_TRITON }} + secrets: inherit - # build-equinox: - # needs: build-jax - # uses: ./.github/workflows/_build.yaml - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # ARTIFACT_NAME: artifact-equinox-build - # BADGE_FILENAME: badge-equinox-build - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - # CONTAINER_NAME: equinox - # DOCKERFILE: .github/container/Dockerfile.equinox - # EXTRA_BUILD_ARGS: | - # URLREF_EQUINOX=${{ fromJson(inputs.SOURCE_URLREFS).EQUINOX }} - # secrets: inherit + build-equinox: + needs: build-jax + uses: ./.github/workflows/_build.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: artifact-equinox-build + BADGE_FILENAME: badge-equinox-build + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + CONTAINER_NAME: equinox + DOCKERFILE: .github/container/Dockerfile.equinox + EXTRA_BUILD_ARGS: | + URLREF_EQUINOX=${{ fromJson(inputs.SOURCE_URLREFS).EQUINOX }} + secrets: inherit - # build-maxtext: - # needs: build-jax - # uses: ./.github/workflows/_build.yaml - # with: - # ARCHITECTURE: ${{ 
inputs.ARCHITECTURE }} - # ARTIFACT_NAME: artifact-maxtext-build - # BADGE_FILENAME: badge-maxtext-build - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - # CONTAINER_NAME: maxtext - # DOCKERFILE: .github/container/Dockerfile.maxtext - # EXTRA_BUILD_ARGS: | - # URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }} - # secrets: inherit + build-maxtext: + needs: build-jax + uses: ./.github/workflows/_build.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: artifact-maxtext-build + BADGE_FILENAME: badge-maxtext-build + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + CONTAINER_NAME: maxtext + DOCKERFILE: .github/container/Dockerfile.maxtext + EXTRA_BUILD_ARGS: | + URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }} + secrets: inherit - # build-levanter: - # needs: [build-jax] - # uses: ./.github/workflows/_build.yaml - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # ARTIFACT_NAME: "artifact-levanter-build" - # BADGE_FILENAME: "badge-levanter-build" - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - # CONTAINER_NAME: levanter - # DOCKERFILE: .github/container/Dockerfile.levanter - # EXTRA_BUILD_ARGS: | - # URLREF_LEVANTER=${{ fromJson(inputs.SOURCE_URLREFS).LEVANTER }} - # URLREF_HALIAX=${{ fromJson(inputs.SOURCE_URLREFS).HALIAX }} - # secrets: inherit + build-levanter: + needs: [build-jax] + uses: ./.github/workflows/_build.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: "artifact-levanter-build" + BADGE_FILENAME: "badge-levanter-build" + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + CONTAINER_NAME: levanter + DOCKERFILE: .github/container/Dockerfile.levanter + EXTRA_BUILD_ARGS: | + URLREF_LEVANTER=${{ fromJson(inputs.SOURCE_URLREFS).LEVANTER }} + URLREF_HALIAX=${{ 
fromJson(inputs.SOURCE_URLREFS).HALIAX }} + secrets: inherit - # build-upstream-t5x: - # needs: build-jax - # uses: ./.github/workflows/_build.yaml - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # ARTIFACT_NAME: "artifact-t5x-build" - # BADGE_FILENAME: "badge-t5x-build" - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - # CONTAINER_NAME: upstream-t5x - # DOCKERFILE: .github/container/Dockerfile.t5x - # EXTRA_BUILD_ARGS: | - # URLREF_T5X=${{ fromJson(inputs.SOURCE_URLREFS).T5X }} - # URLREF_AIRIO=${{ fromJson(inputs.SOURCE_URLREFS).AIRIO }} - # secrets: inherit + build-upstream-t5x: + needs: build-jax + uses: ./.github/workflows/_build.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: "artifact-t5x-build" + BADGE_FILENAME: "badge-t5x-build" + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + CONTAINER_NAME: upstream-t5x + DOCKERFILE: .github/container/Dockerfile.t5x + EXTRA_BUILD_ARGS: | + URLREF_T5X=${{ fromJson(inputs.SOURCE_URLREFS).T5X }} + URLREF_AIRIO=${{ fromJson(inputs.SOURCE_URLREFS).AIRIO }} + secrets: inherit - # build-rosetta-t5x: - # needs: build-upstream-t5x - # uses: ./.github/workflows/_build_rosetta.yaml - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }} - # BASE_LIBRARY: t5x - # secrets: inherit + build-rosetta-t5x: + needs: build-upstream-t5x + uses: ./.github/workflows/_build_rosetta.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }} + BASE_LIBRARY: t5x + secrets: inherit - # build-gemma: - # needs: build-jax - # uses: ./.github/workflows/_build.yaml - # if: inputs.ARCHITECTURE == 'amd64' # build only amd64 - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # 
ARTIFACT_NAME: artifact-gemma-build - # BADGE_FILENAME: badge-gemma-build - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - # CONTAINER_NAME: gemma - # DOCKERFILE: rosetta/Dockerfile.gemma - # DOCKER_CONTEXT: . - # EXTRA_BUILD_ARGS: | - # URLREF_GEMMA=${{ fromJson(inputs.SOURCE_URLREFS).GEMMA }} - # URLREF_BIG_VISION=${{ fromJson(inputs.SOURCE_URLREFS).BIG_VISION }} - # URLREF_COMMON_LOOP_UTILS=${{ fromJson(inputs.SOURCE_URLREFS).COMMON_LOOP_UTILS }} - # URLREF_FLAXFORMER=${{ fromJson(inputs.SOURCE_URLREFS).FLAXFORMER }} - # URLREF_PANOPTICAPI=${{ fromJson(inputs.SOURCE_URLREFS).PANOPTICAPI }} - # secrets: inherit + build-gemma: + needs: build-jax + uses: ./.github/workflows/_build.yaml + if: inputs.ARCHITECTURE == 'amd64' # build only amd64 + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: artifact-gemma-build + BADGE_FILENAME: badge-gemma-build + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + CONTAINER_NAME: gemma + DOCKERFILE: rosetta/Dockerfile.gemma + DOCKER_CONTEXT: . 
+ EXTRA_BUILD_ARGS: | + URLREF_GEMMA=${{ fromJson(inputs.SOURCE_URLREFS).GEMMA }} + URLREF_BIG_VISION=${{ fromJson(inputs.SOURCE_URLREFS).BIG_VISION }} + URLREF_COMMON_LOOP_UTILS=${{ fromJson(inputs.SOURCE_URLREFS).COMMON_LOOP_UTILS }} + URLREF_FLAXFORMER=${{ fromJson(inputs.SOURCE_URLREFS).FLAXFORMER }} + URLREF_PANOPTICAPI=${{ fromJson(inputs.SOURCE_URLREFS).PANOPTICAPI }} + secrets: inherit build-axlearn: needs: build-jax @@ -196,13 +196,13 @@ jobs: needs: - build-base - build-jax - # - build-triton - # - build-equinox - # - build-maxtext - # - build-levanter - # - build-upstream-t5x - # - build-rosetta-t5x - # - build-gemma + - build-triton + - build-equinox + - build-maxtext + - build-levanter + - build-upstream-t5x + - build-rosetta-t5x + - build-gemma - build-axlearn outputs: TAGS: ${{ steps.collect-tags.outputs.TAGS }} @@ -214,22 +214,22 @@ jobs: [\ {"flavor": "base", "stage": "final", "priority": 800, "tag": "${{ needs.build-base.outputs.DOCKER_TAG }}"},\ {"flavor": "jax", "stage": "final", "priority": 1000, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}"},\ - # {"flavor": "triton", "stage": "final", "priority": 900, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }}"},\ - # {"flavor": "equinox", "stage": "final", "priority": 900, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }}"},\ - # {"flavor": "maxtext", "stage": "final", "priority": 900, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}"},\ - # {"flavor": "levanter", "stage": "final", "priority": 900, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }}"},\ - # {"flavor": "upstream-t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }}"},\ - # {"flavor": "t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }}"},\ - # {"flavor": "gemma", "stage": "final", "priority": 900, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "triton", 
"stage": "final", "priority": 900, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "equinox", "stage": "final", "priority": 900, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "maxtext", "stage": "final", "priority": 900, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "levanter", "stage": "final", "priority": 900, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "upstream-t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "gemma", "stage": "final", "priority": 900, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "axlearn", "stage": "final", "priority": 900, "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "jax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}"},\ - # {"flavor": "triton", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_MEALKIT }}"},\ - # {"flavor": "equinox", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_MEALKIT }}"},\ - # {"flavor": "maxtext", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_MEALKIT }}"},\ - # {"flavor": "levanter", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_MEALKIT }}"},\ - # {"flavor": "upstream-t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ - # {"flavor": "t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ - # {"flavor": "gemma", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_MEALKIT }}"},\ + 
{"flavor": "triton", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "equinox", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "maxtext", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "levanter", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "upstream-t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "gemma", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_MEALKIT }}"},\ {"flavor": "axlearn", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_MEALKIT }}"},\ {}\ @@ -239,276 +239,276 @@ jobs: echo "TAGS=${TAGS}" >> $GITHUB_OUTPUT - # test-distribution: - # runs-on: ubuntu-22.04 - # strategy: - # matrix: - # TEST_SCRIPT: - # - extra-only-distribution.sh - # - mirror-only-distribution.sh - # - upstream-only-distribution.sh - # - local-patch-distribution.sh - # fail-fast: false - # steps: - # - name: Print environment variables - # run: env - # - name: Set git login for tests - # run: | - # git config --global user.email "jax@nvidia.com" - # git config --global user.name "JAX-Toolbox CI" - # - name: Check out the repository under ${GITHUB_WORKSPACE} - # uses: actions/checkout@v4 - # - name: Run integration test ${{ matrix.TEST_SCRIPT }} - # run: bash rosetta/tests/${{ matrix.TEST_SCRIPT }} - - # test-jax: - # needs: build-jax - # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # uses: ./.github/workflows/_test_unit.yaml - # with: - # TEST_NAME: jax - # EXECUTE: | - # docker run -i --shm-size=1g --gpus all \ - # ${{ 
needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - # bash <<"EOF" |& tee test-backend-independent.log - # test-jax.sh -b backend-independent - # EOF - # docker run -i --shm-size=1g --gpus all \ - # ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - # bash <<"EOF" |& tee tee test-gpu.log - # nvidia-cuda-mps-control -d - # test-jax.sh -b gpu - # EOF - # STATISTICS_SCRIPT: | - # errors=$(cat test-*.log | grep -c 'ERROR:' || true) - # failed_tests=$(cat test-*.log | grep -c 'FAILED in' || true) - # passed_tests=$(cat test-*.log | grep -c 'PASSED in' || true) - # total_tests=$((failed_tests + passed_tests)) - # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - # ARTIFACTS: | - # test-backend-independent.log - # test-gpu.log - # secrets: inherit + test-distribution: + runs-on: ubuntu-22.04 + strategy: + matrix: + TEST_SCRIPT: + - extra-only-distribution.sh + - mirror-only-distribution.sh + - upstream-only-distribution.sh + - local-patch-distribution.sh + fail-fast: false + steps: + - name: Print environment variables + run: env + - name: Set git login for tests + run: | + git config --global user.email "jax@nvidia.com" + git config --global user.name "JAX-Toolbox CI" + - name: Check out the repository under ${GITHUB_WORKSPACE} + uses: actions/checkout@v4 + - name: Run integration test ${{ matrix.TEST_SCRIPT }} + run: bash rosetta/tests/${{ matrix.TEST_SCRIPT }} + + test-jax: + needs: build-jax + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + uses: ./.github/workflows/_test_unit.yaml + with: + TEST_NAME: jax + EXECUTE: | + docker run -i --shm-size=1g --gpus all \ + ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ + bash <<"EOF" |& tee test-backend-independent.log + test-jax.sh -b backend-independent + EOF + docker run -i --shm-size=1g --gpus all \ + ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ + 
bash <<"EOF" |& tee tee test-gpu.log + nvidia-cuda-mps-control -d + test-jax.sh -b gpu + EOF + STATISTICS_SCRIPT: | + errors=$(cat test-*.log | grep -c 'ERROR:' || true) + failed_tests=$(cat test-*.log | grep -c 'FAILED in' || true) + passed_tests=$(cat test-*.log | grep -c 'PASSED in' || true) + total_tests=$((failed_tests + passed_tests)) + echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + ARTIFACTS: | + test-backend-independent.log + test-gpu.log + secrets: inherit - # test-nsys-jax: - # needs: build-jax - # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # uses: ./.github/workflows/_test_unit.yaml - # with: - # TEST_NAME: nsys-jax - # EXECUTE: | - # set -o pipefail - # num_tests=0 - # num_failures=0 - # # Run the pytest-driven tests; failure is explicitly handled below so set +e to - # # avoid an early abort here. - # set +e - # docker run -i --shm-size=1g --gpus all \ - # -v $PWD:/opt/output \ - # ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - # bash <<"EOF" |& tee test-nsys-jax.log - # # nsys-jax is already installed, this is just adding the test dependencies - # pip install pytest-reportlog nsys-jax[test] - # # abuse knowledge that nsys-jax is installed editable, so the tests exist - # test_path=$(python -c 'import importlib.resources; print(importlib.resources.files("nsys_jax").joinpath("..", "tests").resolve())') - # pytest --report-log=/opt/output/pytest-report.jsonl "${test_path}" - # EOF - # set -e - # GPUS_PER_NODE=$(nvidia-smi -L | grep -c '^GPU') - # for mode in 1-process 2-process process-per-gpu; do - # DOCKER="docker run --shm-size=1g --gpus all --env XLA_FLAGS=--xla_gpu_enable_command_buffer= --env XLA_PYTHON_CLIENT_MEM_FRACTION=0.9 -v ${PWD}:/opt/output ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}" - # if [[ "${mode}" == "1-process" ]]; then - # PROCESS_COUNT=1 - # 
ARGS="" - # elif [[ "${mode}" == "2-process" ]]; then - # # Use two processes with GPUS_PER_NODE/2 GPUs per process in the hope that - # # this will flush out more bugs than process-per-node or process-per-GPU. - # PROCESS_COUNT=2 - # ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process $((GPUS_PER_NODE/2)) --distributed" - # else - # PROCESS_COUNT=${GPUS_PER_NODE} - # ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process 1 --distributed" - # fi - # for collection in full partial; do - # NSYS_JAX="nsys-jax" - # if [[ "${mode}" == "1-process" ]]; then - # # We will not run nsys-jax-combine, so run analyses eagerly - # NSYS_JAX+=" --nsys-jax-analysis communication --nsys-jax-analysis summary" - # fi - # NSYS_JAX+=" --output=/opt/output/${mode}-${collection}-execution-%q{RANK}" - # if [[ "${collection}" == "partial" ]]; then - # NSYS_JAX+=" --capture-range=cudaProfilerApi --capture-range-end=stop" - # # nvbug/4801401 - # NSYS_JAX+=" --sample=none" - # fi - # set +e - # ${DOCKER} parallel-launch RANK ${PROCESS_COUNT} ${NSYS_JAX} \ - # -- jax-nccl-test ${ARGS} |& tee ${mode}-${collection}-execution.log - # num_failures=$((num_failures + ($? 
!= 0))) - # set -e - # num_tests=$((num_tests + 1)) - # done - # if [[ "${mode}" != "1-process" ]]; then - # # Run nsys-jax-combine - # NSYS_JAX_COMBINE="nsys-jax-combine --analysis communication --analysis summary --output=/opt/output/${mode}-${collection}-execution.zip" - # for (( i=0; i> $GITHUB_ENV - # echo "NSYS_JAX_FAIL_COUNT=${num_failures}" >> $GITHUB_ENV - # exit $num_failures - # STATISTICS_SCRIPT: | - # summary_line=$(tail -n1 test-nsys-jax.log) - # num_errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - # passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) - # failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) - # total_tests=$(( NSYS_JAX_TEST_COUNT + passed_tests + failed_tests )) - # num_passed=$(( passed_tests + NSYS_JAX_TEST_COUNT - NSYS_JAX_FAIL_COUNT )) - # num_failed=$(( failed_tests + NSYS_JAX_FAIL_COUNT )) - # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - # echo "ERRORS=${num_errors}" >> $GITHUB_OUTPUT - # echo "PASSED_TESTS=${num_passed}" >> $GITHUB_OUTPUT - # echo "FAILED_TESTS=${num_failed}" >> $GITHUB_OUTPUT - # ARTIFACTS: | - # # pytest-driven part - # test-nsys-jax.log - # pytest-report.jsonl - # # nsys-jax logfiles - # *process-*-execution.log - # # nsys-jax output for the case that doesn't use nsys-jax-combine - # 1-process-*-execution-0.zip - # # nsys-jax-combine output/logfiles - # *process*-*-execution.zip - # *-execution-combine.log - # secrets: inherit + test-nsys-jax: + needs: build-jax + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + uses: ./.github/workflows/_test_unit.yaml + with: + TEST_NAME: nsys-jax + EXECUTE: | + set -o pipefail + num_tests=0 + num_failures=0 + # Run the pytest-driven tests; failure is explicitly handled below so set +e to + # avoid an early 
abort here. + set +e + docker run -i --shm-size=1g --gpus all \ + -v $PWD:/opt/output \ + ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ + bash <<"EOF" |& tee test-nsys-jax.log + # nsys-jax is already installed, this is just adding the test dependencies + pip install pytest-reportlog nsys-jax[test] + # abuse knowledge that nsys-jax is installed editable, so the tests exist + test_path=$(python -c 'import importlib.resources; print(importlib.resources.files("nsys_jax").joinpath("..", "tests").resolve())') + pytest --report-log=/opt/output/pytest-report.jsonl "${test_path}" + EOF + set -e + GPUS_PER_NODE=$(nvidia-smi -L | grep -c '^GPU') + for mode in 1-process 2-process process-per-gpu; do + DOCKER="docker run --shm-size=1g --gpus all --env XLA_FLAGS=--xla_gpu_enable_command_buffer= --env XLA_PYTHON_CLIENT_MEM_FRACTION=0.9 -v ${PWD}:/opt/output ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}" + if [[ "${mode}" == "1-process" ]]; then + PROCESS_COUNT=1 + ARGS="" + elif [[ "${mode}" == "2-process" ]]; then + # Use two processes with GPUS_PER_NODE/2 GPUs per process in the hope that + # this will flush out more bugs than process-per-node or process-per-GPU. 
+ PROCESS_COUNT=2 + ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process $((GPUS_PER_NODE/2)) --distributed" + else + PROCESS_COUNT=${GPUS_PER_NODE} + ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process 1 --distributed" + fi + for collection in full partial; do + NSYS_JAX="nsys-jax" + if [[ "${mode}" == "1-process" ]]; then + # We will not run nsys-jax-combine, so run analyses eagerly + NSYS_JAX+=" --nsys-jax-analysis communication --nsys-jax-analysis summary" + fi + NSYS_JAX+=" --output=/opt/output/${mode}-${collection}-execution-%q{RANK}" + if [[ "${collection}" == "partial" ]]; then + NSYS_JAX+=" --capture-range=cudaProfilerApi --capture-range-end=stop" + # nvbug/4801401 + NSYS_JAX+=" --sample=none" + fi + set +e + ${DOCKER} parallel-launch RANK ${PROCESS_COUNT} ${NSYS_JAX} \ + -- jax-nccl-test ${ARGS} |& tee ${mode}-${collection}-execution.log + num_failures=$((num_failures + ($? 
!= 0))) + set -e + num_tests=$((num_tests + 1)) + done + if [[ "${mode}" != "1-process" ]]; then + # Run nsys-jax-combine + NSYS_JAX_COMBINE="nsys-jax-combine --analysis communication --analysis summary --output=/opt/output/${mode}-${collection}-execution.zip" + for (( i=0; i> $GITHUB_ENV + echo "NSYS_JAX_FAIL_COUNT=${num_failures}" >> $GITHUB_ENV + exit $num_failures + STATISTICS_SCRIPT: | + summary_line=$(tail -n1 test-nsys-jax.log) + num_errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) + failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) + total_tests=$(( NSYS_JAX_TEST_COUNT + passed_tests + failed_tests )) + num_passed=$(( passed_tests + NSYS_JAX_TEST_COUNT - NSYS_JAX_FAIL_COUNT )) + num_failed=$(( failed_tests + NSYS_JAX_FAIL_COUNT )) + echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + echo "ERRORS=${num_errors}" >> $GITHUB_OUTPUT + echo "PASSED_TESTS=${num_passed}" >> $GITHUB_OUTPUT + echo "FAILED_TESTS=${num_failed}" >> $GITHUB_OUTPUT + ARTIFACTS: | + # pytest-driven part + test-nsys-jax.log + pytest-report.jsonl + # nsys-jax logfiles + *process-*-execution.log + # nsys-jax output for the case that doesn't use nsys-jax-combine + 1-process-*-execution-0.zip + # nsys-jax-combine output/logfiles + *process*-*-execution.zip + *-execution-combine.log + secrets: inherit #test-nsys-jax generates several fresh .zip archive outputs by running nsys-jax with real GPU hardware; this test #runs on a regular GitHub Actions runner and checks that offline post-processing works in an environment that does #not already have nsys-jax installed - # test-nsys-jax-archive: - # needs: test-nsys-jax - # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # strategy: - # 
matrix: - # os: [ubuntu-22.04, ubuntu-24.04, macOS-latest] - # runs-on: ${{ matrix.os }} - # steps: - # - name: Download nsys-jax output .zip files - # uses: actions/download-artifact@v4 - # with: - # name: nsys-jax-unit-test-A100 - # - name: Extract archives and execute install scripts - # run: | - # pip install virtualenv # for install.sh - # for zip in $(ls *.zip); do - # ZIP="${PWD}/${zip}" - # pushd $(mktemp -d) - # unzip "${ZIP}" - # ls -l - # # TODO: verify this isn't needed, or make sure it isn't needed - # chmod 755 install.sh - # # Run the notebook with IPython, not Jupyter Lab, so it exits and prints something informative to stdout - # # Skip executing Jupyter lab - # NSYS_JAX_JUPYTER_EXECUTE_NOT_LAB=1 ./install.sh - # popd - # done - - # test-nsys-jax-eks: - # needs: build-jax - # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # runs-on: eks - # env: - # JAX_DOCKER_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} - # JOB_NAME: ${{ github.run_id }}-nsys-jax - # POSTPROCESS_JOB_NAME: ${{ github.run_id }}-nsys-jax-postprocess - # TOKEN_NAME: ${{ github.run_id }}-nsys-jax-token - # steps: - # - name: Check out the repository - # uses: actions/checkout@v4 - # - name: GHCR login - # uses: ./.github/actions/ghcr-login - # with: - # docker-username: ${{ github.repository_owner }} - # docker-password: ${{ secrets.GITHUB_TOKEN}} - # token-name: ${{ env.TOKEN_NAME }} - # - name: Configure Kubernetes job - # run: | - # yq -i ea 'select(di == 0).spec.selector.job-name = strenv(JOB_NAME) - # | select(di == 1).metadata.name = strenv(JOB_NAME) - # | select(di == 1).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) - # | select(di == 1).spec.template.spec.containers[0].image = strenv(JAX_DOCKER_IMAGE) - # | select(di == 1).spec.template.spec.containers[0].env[0].value = strenv(JOB_NAME)' \ - # .github/eks-workflow-files/job.yml - # git diff .github/eks-workflow-files/job.yml - # - name: Submit Kubernetes job - # uses: 
./.github/actions/submit-k8s-job - # with: - # job-config-file: .github/eks-workflow-files/job.yml - # job-name: ${{ env.JOB_NAME }} - # - name: Delete eks job - # uses: ./.github/actions/delete-k8s-job - # if: ${{ always() }} - # with: - # job-name: ${{ env.JOB_NAME }} - # - name: Configure post-processing job - # run: | - # export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip" - # yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME) - # | .spec.template.spec.containers[].image = strenv(JAX_DOCKER_IMAGE) - # | .spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) - # | .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \ - # .github/eks-workflow-files/post-process-job.yml - # git diff .github/eks-workflow-files/post-process-job.yml - # - name: Submit post process k8s job - # uses: ./.github/actions/submit-k8s-job - # with: - # job-config-file: .github/eks-workflow-files/post-process-job.yml - # job-name: ${{ env.POSTPROCESS_JOB_NAME }} - # - name: Delete post process k8s job - # uses: ./.github/actions/delete-k8s-job - # with: - # job-name: ${{ env.POSTPROCESS_JOB_NAME }} - # - name: Delete GitHub Container Registry token - # uses: ./.github/actions/delete-ghcr-token - # if: ${{ always() }} - # with: - # token-name: ${{ env.TOKEN_NAME }} - # COMMENT THIS - # test-equinox: - # needs: build-equinox - # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # uses: ./.github/workflows/_test_unit.yaml - # with: - # IMAGE: ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} - # TEST_NAME: equinox - # EXECUTE: | - # docker run --shm-size=1g --gpus all ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} \ - # bash -exc -o pipefail \ - # 'pushd /opt/equinox/tests && pip install -r requirements.txt && pytest .' 
| tee test-equinox.log - # STATISTICS_SCRIPT: | - # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - # failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') - # passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') - # total_tests=$((failed_tests + passed_tests)) - # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - # ARTIFACTS: | - # test-equinox.log - # secrets: inherit - # COMMENT THIS + test-nsys-jax-archive: + needs: test-nsys-jax + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + strategy: + matrix: + os: [ubuntu-22.04, ubuntu-24.04, macOS-latest] + runs-on: ${{ matrix.os }} + steps: + - name: Download nsys-jax output .zip files + uses: actions/download-artifact@v4 + with: + name: nsys-jax-unit-test-A100 + - name: Extract archives and execute install scripts + run: | + pip install virtualenv # for install.sh + for zip in $(ls *.zip); do + ZIP="${PWD}/${zip}" + pushd $(mktemp -d) + unzip "${ZIP}" + ls -l + # TODO: verify this isn't needed, or make sure it isn't needed + chmod 755 install.sh + # Run the notebook with IPython, not Jupyter Lab, so it exits and prints something informative to stdout + # Skip executing Jupyter lab + NSYS_JAX_JUPYTER_EXECUTE_NOT_LAB=1 ./install.sh + popd + done + + test-nsys-jax-eks: + needs: build-jax + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + runs-on: eks + env: + JAX_DOCKER_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} + JOB_NAME: ${{ github.run_id }}-nsys-jax + POSTPROCESS_JOB_NAME: ${{ github.run_id }}-nsys-jax-postprocess + TOKEN_NAME: ${{ github.run_id }}-nsys-jax-token + steps: + - name: Check out the repository + uses: actions/checkout@v4 + - name: GHCR login + uses: 
./.github/actions/ghcr-login + with: + docker-username: ${{ github.repository_owner }} + docker-password: ${{ secrets.GITHUB_TOKEN}} + token-name: ${{ env.TOKEN_NAME }} + - name: Configure Kubernetes job + run: | + yq -i ea 'select(di == 0).spec.selector.job-name = strenv(JOB_NAME) + | select(di == 1).metadata.name = strenv(JOB_NAME) + | select(di == 1).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) + | select(di == 1).spec.template.spec.containers[0].image = strenv(JAX_DOCKER_IMAGE) + | select(di == 1).spec.template.spec.containers[0].env[0].value = strenv(JOB_NAME)' \ + .github/eks-workflow-files/job.yml + git diff .github/eks-workflow-files/job.yml + - name: Submit Kubernetes job + uses: ./.github/actions/submit-k8s-job + with: + job-config-file: .github/eks-workflow-files/job.yml + job-name: ${{ env.JOB_NAME }} + - name: Delete eks job + uses: ./.github/actions/delete-k8s-job + if: ${{ always() }} + with: + job-name: ${{ env.JOB_NAME }} + - name: Configure post-processing job + run: | + export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip" + yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME) + | .spec.template.spec.containers[].image = strenv(JAX_DOCKER_IMAGE) + | .spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) + | .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \ + .github/eks-workflow-files/post-process-job.yml + git diff .github/eks-workflow-files/post-process-job.yml + - name: Submit post process k8s job + uses: ./.github/actions/submit-k8s-job + with: + job-config-file: .github/eks-workflow-files/post-process-job.yml + job-name: ${{ env.POSTPROCESS_JOB_NAME }} + - name: Delete post process k8s job + uses: ./.github/actions/delete-k8s-job + with: + job-name: ${{ env.POSTPROCESS_JOB_NAME }} + - name: Delete GitHub Container Registry token + uses: ./.github/actions/delete-ghcr-token + if: ${{ always() }} + with: + token-name: ${{ env.TOKEN_NAME }} + COMMENT THIS + test-equinox: + needs: 
build-equinox + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + uses: ./.github/workflows/_test_unit.yaml + with: + IMAGE: ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} + TEST_NAME: equinox + EXECUTE: | + docker run --shm-size=1g --gpus all ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} \ + bash -exc -o pipefail \ + 'pushd /opt/equinox/tests && pip install -r requirements.txt && pytest .' | tee test-equinox.log + STATISTICS_SCRIPT: | + errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') + passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') + total_tests=$((failed_tests + passed_tests)) + echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + ARTIFACTS: | + test-equinox.log + secrets: inherit + # test-te-multigpu: # needs: build-upstream-pax # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a @@ -517,79 +517,78 @@ jobs: # TE_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} # secrets: inherit - # test-upstream-t5x: - # needs: build-upstream-t5x - # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # uses: ./.github/workflows/_test_upstream_t5x.yaml - # with: - # T5X_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }} - # secrets: inherit + test-upstream-t5x: + needs: build-upstream-t5x + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + uses: ./.github/workflows/_test_upstream_t5x.yaml + with: + T5X_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }} + secrets: inherit - # test-rosetta-t5x: - # needs: build-rosetta-t5x - # if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 - # uses: ./.github/workflows/_test_t5x_rosetta.yaml - # 
with: - # T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }} - # secrets: inherit + test-rosetta-t5x: + needs: build-rosetta-t5x + if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 + uses: ./.github/workflows/_test_t5x_rosetta.yaml + with: + T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }} + secrets: inherit - # test-triton: - # needs: build-triton - # if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 - # uses: ./.github/workflows/_test_unit.yaml - # with: - # TEST_NAME: triton - # EXECUTE: | - # docker run -i --shm-size=1g --gpus all --volume $PWD:/output \ - # ${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }} \ - # bash <<"EOF" |& tee test-triton.log - # # autotuner tests from jax-triton now hit a triton code path that uses utilities from pytorch; this relies on - # # actually having a CUDA backend for pytoch - # pip install --no-deps torch - # python /opt/jax-triton/tests/triton_call_test.py --xml_output_file /output/triton_test.xml - # EOF - # STATISTICS_SCRIPT: | - # curl -L -o yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg --print-architecture) && chmod 777 yq; - # total_tests=$(./yq '.testsuites."+@tests"' triton_test.xml) - # errors=$(./yq '.testsuites."+@errors"' triton_test.xml) - # failed_tests=$(./yq '.testsuites."+@failures"' triton_test.xml) - # passed_tests=$((total_tests - errors - failed_tests)) - # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - # ARTIFACTS: | - # test-triton.log - # secrets: inherit + test-triton: + needs: build-triton + if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 + uses: ./.github/workflows/_test_unit.yaml + with: + TEST_NAME: triton + EXECUTE: | + docker run -i --shm-size=1g --gpus all --volume $PWD:/output \ + ${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }} \ + bash 
<<"EOF" |& tee test-triton.log + # autotuner tests from jax-triton now hit a triton code path that uses utilities from pytorch; this relies on + # actually having a CUDA backend for pytoch + pip install --no-deps torch + python /opt/jax-triton/tests/triton_call_test.py --xml_output_file /output/triton_test.xml + EOF + STATISTICS_SCRIPT: | + curl -L -o yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg --print-architecture) && chmod 777 yq; + total_tests=$(./yq '.testsuites."+@tests"' triton_test.xml) + errors=$(./yq '.testsuites."+@errors"' triton_test.xml) + failed_tests=$(./yq '.testsuites."+@failures"' triton_test.xml) + passed_tests=$((total_tests - errors - failed_tests)) + echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + ARTIFACTS: | + test-triton.log + secrets: inherit - # test-levanter: - # needs: build-levanter - # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # uses: ./.github/workflows/_test_unit.yaml - # with: - # TEST_NAME: levanter - # EXECUTE: | - # docker run -i --gpus all --shm-size=1g \ - # ${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }} \ - # bash <<"EOF" |& tee test-levanter.log - # pip install flake8 pytest soundfile librosa - # PYTHONPATH=/opt/levanter/tests:$PYTHONPATH pytest /opt/levanter/tests -m "not entry and not slow and not ray" - # EOF - # STATISTICS_SCRIPT: | - # summary_line=$(tail -n1 test-levanter.log) - # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - # failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') - # passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') - # total_tests=$((failed_tests + passed_tests)) - # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - # echo 
"ERRORS=${errors}" >> $GITHUB_OUTPUT - # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - # ARTIFACTS: | - # test-levanter.log - # secrets: inherit + test-levanter: + needs: build-levanter + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + uses: ./.github/workflows/_test_unit.yaml + with: + TEST_NAME: levanter + EXECUTE: | + docker run -i --gpus all --shm-size=1g \ + ${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }} \ + bash <<"EOF" |& tee test-levanter.log + pip install flake8 pytest soundfile librosa + PYTHONPATH=/opt/levanter/tests:$PYTHONPATH pytest /opt/levanter/tests -m "not entry and not slow and not ray" + EOF + STATISTICS_SCRIPT: | + summary_line=$(tail -n1 test-levanter.log) + errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') + passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') + total_tests=$((failed_tests + passed_tests)) + echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + ARTIFACTS: | + test-levanter.log + secrets: inherit - # COMMENT THIS # test-te: # needs: build-upstream-pax # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a @@ -619,37 +618,37 @@ jobs: # pytest-report.jsonl # secrets: inherit - # test-gemma: - # needs: build-gemma - # uses: ./.github/workflows/_test_unit.yaml - # if: inputs.ARCHITECTURE == 'amd64' - # with: - # TEST_NAME: gemma - # EXECUTE: | - # docker run --shm-size=1g --gpus all ${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }} \ - # bash -ec \ - # "cd /opt/gemma && pip install -e .[dev] && pytest ." 
| tee test-gemma.log - # STATISTICS_SCRIPT: | - # summary_line=$(tail -n1 test-gemma.log) - # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - # failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') - # passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') - # total_tests=$((failed_tests + passed_tests)) - # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - # ARTIFACTS: | - # test-gemma.log - # secrets: inherit + test-gemma: + needs: build-gemma + uses: ./.github/workflows/_test_unit.yaml + if: inputs.ARCHITECTURE == 'amd64' + with: + TEST_NAME: gemma + EXECUTE: | + docker run --shm-size=1g --gpus all ${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }} \ + bash -ec \ + "cd /opt/gemma && pip install -e .[dev] && pytest ." 
| tee test-gemma.log + STATISTICS_SCRIPT: | + summary_line=$(tail -n1 test-gemma.log) + errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') + passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') + total_tests=$((failed_tests + passed_tests)) + echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + ARTIFACTS: | + test-gemma.log + secrets: inherit - # test-maxtext: - # needs: build-maxtext - # if: inputs.ARCHITECTURE == 'amd64' # no arm64 gpu runners - # uses: ./.github/workflows/_test_maxtext.yaml - # with: - # MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }} - # secrets: inherit + test-maxtext: + needs: build-maxtext + if: inputs.ARCHITECTURE == 'amd64' # no arm64 gpu runners + uses: ./.github/workflows/_test_maxtext.yaml + with: + MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }} + secrets: inherit test-axlearn-eks: needs: build-axlearn @@ -748,6 +747,7 @@ jobs: # the fuji test will run for 20 minutes only, as per 2025-02-24 # is not possible to set the `max_steps` value + # this will be done with a customer python code test-axlearn-fuji-models-eks: needs: build-axlearn if: inputs.ARCHITECTURE == 'amd64' From 97f02157f35ae71faf68daf59ab18abf97431f87 Mon Sep 17 00:00:00 2001 From: Steboss Date: Tue, 4 Mar 2025 13:33:43 +0000 Subject: [PATCH 87/89] fix ci typo --- .github/workflows/_ci.yaml | 48 +++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 9ded946d2..8072e282d 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -484,30 +484,30 @@ jobs: 
if: ${{ always() }} with: token-name: ${{ env.TOKEN_NAME }} - COMMENT THIS - test-equinox: - needs: build-equinox - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_unit.yaml - with: - IMAGE: ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} - TEST_NAME: equinox - EXECUTE: | - docker run --shm-size=1g --gpus all ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} \ - bash -exc -o pipefail \ - 'pushd /opt/equinox/tests && pip install -r requirements.txt && pytest .' | tee test-equinox.log - STATISTICS_SCRIPT: | - errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') - passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') - total_tests=$((failed_tests + passed_tests)) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - ARTIFACTS: | - test-equinox.log - secrets: inherit + + # test-equinox: + # needs: build-equinox + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # uses: ./.github/workflows/_test_unit.yaml + # with: + # IMAGE: ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} + # TEST_NAME: equinox + # EXECUTE: | + # docker run --shm-size=1g --gpus all ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} \ + # bash -exc -o pipefail \ + # 'pushd /opt/equinox/tests && pip install -r requirements.txt && pytest .' 
| tee test-equinox.log + # STATISTICS_SCRIPT: | + # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + # failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') + # passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') + # total_tests=$((failed_tests + passed_tests)) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # ARTIFACTS: | + # test-equinox.log + # secrets: inherit # test-te-multigpu: # needs: build-upstream-pax From f1fbff2a06fc7c3d67f2f2d8b61d2708daf90712 Mon Sep 17 00:00:00 2001 From: Steboss Date: Tue, 4 Mar 2025 14:00:05 +0000 Subject: [PATCH 88/89] Fix test-nsys-jax-eks --- .github/workflows/_ci.yaml | 33 ++++++++------------------------- 1 file changed, 8 insertions(+), 25 deletions(-) diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 8072e282d..9f0bb971a 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -436,12 +436,9 @@ jobs: steps: - name: Check out the repository uses: actions/checkout@v4 - - name: GHCR login - uses: ./.github/actions/ghcr-login - with: - docker-username: ${{ github.repository_owner }} - docker-password: ${{ secrets.GITHUB_TOKEN}} - token-name: ${{ env.TOKEN_NAME }} + - name: K8s GHCR store and delete token + id: store-token + uses: ./.github/actions/store-delete-k8s-ghcr - name: Configure Kubernetes job run: | yq -i ea 'select(di == 0).spec.selector.job-name = strenv(JOB_NAME) @@ -452,15 +449,10 @@ jobs: .github/eks-workflow-files/job.yml git diff .github/eks-workflow-files/job.yml - name: Submit Kubernetes job - uses: ./.github/actions/submit-k8s-job + uses: ./.github/actions/submit-delete-k8s-job with: job-config-file: .github/eks-workflow-files/job.yml job-name: 
${{ env.JOB_NAME }} - - name: Delete eks job - uses: ./.github/actions/delete-k8s-job - if: ${{ always() }} - with: - job-name: ${{ env.JOB_NAME }} - name: Configure post-processing job run: | export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip" @@ -470,20 +462,11 @@ jobs: | .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \ .github/eks-workflow-files/post-process-job.yml git diff .github/eks-workflow-files/post-process-job.yml - - name: Submit post process k8s job - uses: ./.github/actions/submit-k8s-job - with: - job-config-file: .github/eks-workflow-files/post-process-job.yml - job-name: ${{ env.POSTPROCESS_JOB_NAME }} - - name: Delete post process k8s job - uses: ./.github/actions/delete-k8s-job - with: - job-name: ${{ env.POSTPROCESS_JOB_NAME }} - - name: Delete GitHub Container Registry token - uses: ./.github/actions/delete-ghcr-token - if: ${{ always() }} + - name: Submit post process Kubernetes job + uses: ./.github/actions/submit-delete-k8s-job with: - token-name: ${{ env.TOKEN_NAME }} + job-config-file: .github/eks-workflow-files/post-process-job.yml + job-name: ${{ env.POSTPROCESS_JOB_NAME }} # test-equinox: # needs: build-equinox From 626d1a76da5ca1decfd9822f512849a2b5164cef Mon Sep 17 00:00:00 2001 From: Steboss Date: Tue, 4 Mar 2025 14:25:29 +0000 Subject: [PATCH 89/89] fix names in CI --- .github/workflows/_ci.yaml | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 9f0bb971a..8ed17d9d6 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -432,10 +432,15 @@ jobs: JAX_DOCKER_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} JOB_NAME: ${{ github.run_id }}-nsys-jax POSTPROCESS_JOB_NAME: ${{ github.run_id }}-nsys-jax-postprocess - TOKEN_NAME: ${{ github.run_id }}-nsys-jax-token steps: - name: Check out the repository uses: actions/checkout@v4 + - name: Login to GitHub Container Registry + uses: 
docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} - name: K8s GHCR store and delete token id: store-token uses: ./.github/actions/store-delete-k8s-ghcr @@ -443,7 +448,7 @@ jobs: run: | yq -i ea 'select(di == 0).spec.selector.job-name = strenv(JOB_NAME) | select(di == 1).metadata.name = strenv(JOB_NAME) - | select(di == 1).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) + | select(di == 1).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}" | select(di == 1).spec.template.spec.containers[0].image = strenv(JAX_DOCKER_IMAGE) | select(di == 1).spec.template.spec.containers[0].env[0].value = strenv(JOB_NAME)' \ .github/eks-workflow-files/job.yml @@ -458,7 +463,7 @@ jobs: export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip" yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME) | .spec.template.spec.containers[].image = strenv(JAX_DOCKER_IMAGE) - | .spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) + | .spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}" | .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \ .github/eks-workflow-files/post-process-job.yml git diff .github/eks-workflow-files/post-process-job.yml