From 5bea679265e0fd87917ce283d89d54b238e38192 Mon Sep 17 00:00:00 2001 From: Steboss Date: Fri, 31 Jan 2025 17:27:58 +0000 Subject: [PATCH 01/89] start drafting support for axlearn --- .github/container/Dockerfile.axlearn | 44 ++++++++ .github/container/test-axlearn.sh | 146 +++++++++++++++++++++++++++ 2 files changed, 190 insertions(+) create mode 100644 .github/container/Dockerfile.axlearn create mode 100644 .github/container/test-axlearn.sh diff --git a/.github/container/Dockerfile.axlearn b/.github/container/Dockerfile.axlearn new file mode 100644 index 000000000..857e3941f --- /dev/null +++ b/.github/container/Dockerfile.axlearn @@ -0,0 +1,44 @@ +# syntax=docker/dockerfile:1-labs +ARG BASE_IMAGE=ghcr.io/nvidia/jax-mealkit:jax +ARG URLREF_AXLEARN=https://github.com/apple/axlearn.git +ARG SRC_PATH_AXLEARN=/opt/axlearn + +############################################################################### +## Download source and configure dependencies +############################################################################### +FROM ${BASE_IMAGE} AS mealkit +ARG URLREF_AXLEARN +ARG SRC_PATH_AXLEARN + +RUN <<"EOF" bash -ex + git clone "${URLREF_AXLEARN}" "${SRC_PATH_AXLEARN}" +EOF + +RUN <<"EOF" bash -ex + echo "-e ${SRC_PATH_AXLEARN}" > /opt/pip-tools.d/requirements-axlearn.in + echo <> /opt/pip-tools.d/requirements-axlearn.in +aqtp==0.8.2 +einops==0.8.0 +nltk==3.7 +portpicker==1.6.0 +seqio==0.0.18 +protobuf==3.20.3 +tensorflow==2.18.0 +tensorflow-datasets==4.9.7 +tensorflow-io==0.37.1 +tensorflow-io-gcs-filesystem==0.37.1 +tensorflow-metadata==1.13.1 +tensorflow-probability==0.24.0 +tensorflow-text==2.18.1 +pytest>=7.4.3 +REQUIREMENTS +EOF + +############################################################################### +## Install accumulated packages from the base image and the previous stage +############################################################################### +FROM mealkit AS final + +RUN pip-finalize.sh + +WORKDIR ${SRC_PATH_AXLEARN} diff --git 
a/.github/container/test-axlearn.sh b/.github/container/test-axlearn.sh new file mode 100644 index 000000000..5d36ad11c --- /dev/null +++ b/.github/container/test-axlearn.sh @@ -0,0 +1,146 @@ +#!/bin/bash + +set -euo pipefail + +usage() { + echo "Run tests in axlearn with specified options." + echo "" + echo "Usage: $0 [OPTIONS]" + echo "" + echo " OPTIONS DESCRIPTION" + echo " -d, --directory DIR Directory to run tests in." + echo " Default: 'axlearn/axlearn/common'." + echo " -p, --packages PACKAGES Space-separated list of packages to install via pip." + echo " Default: 'attrs scikit-learn torch evaluate transformers timm wandb grain'." + echo " -c, --cuda-devices DEVICES CUDA devices to use. Default: '0,1'." + echo " -t, --test-files PATTERN Pattern for test files to run." + echo " Default: '*_test.py'." + echo " --test-files-list FILE File containing the list of test files to run." + echo " -o, --output DIRECTORY Output directory for logs and summary." + echo " Default: 'test_runs/'." + echo " -h, --help Show this help message and exit." + exit 1 +} + +# Default values +DIR='axlearn/axlearn/common' +PACKAGES='attrs scikit-learn torch evaluate transformers timm wandb grain' +CUDNN_VERSION='9.7.0.66' # TODO check the cudnn version on compute +CUDA_DEVICES='0,1' +TEST_FILES_PATTERN='*_test.py' +TEST_FILES_LIST='' +OUTPUT_DIRECTORY='' + +# Parse args +args=$(getopt -o d:p:c:t:o:h --long directory:,packages:,cuda-devices:,test-files:,test-files-list:,output:,help -- "$@") +if [ $? 
-ne 0 ]; then + usage + exit 1 +fi + +eval set -- "$args" + +while true; do + case "$1" in + -d|--directory) + DIR="$2" + shift 2 + ;; + -p|--packages) + PACKAGES="$2" + shift 2 + ;; + -c|--cuda-devices) + CUDA_DEVICES="$2" + shift 2 + ;; + -t|--test-files) + TEST_FILES_PATTERN="$2" + shift 2 + ;; + --test-files-list) + TEST_FILES_LIST="$2" + shift 2 + ;; + -o|--output) + OUTPUT_DIRECTORY="$2" + shift 2 + ;; + -h|--help) + usage + ;; + --) + shift + break + ;; + *) + echo "Unknown option: $1" + usage + ;; + esac +done + +# TODO double check what's the best choice +if [ -z "$OUTPUT_DIRECTORY" ]; then + timestamp=$(date +%Y%m%d_%H%M%S) + OUTPUT_DIRECTORY="test_runs/${timestamp}" +fi +LOG_DIRECTORY="${OUTPUT_DIRECTORY}/logs" + +mkdir -p "${LOG_DIRECTORY}" + +# Print out config for sanity check +echo "Configuration:" +echo " Directory: $DIR" +echo " Packages: $PACKAGES" +echo " CUDA Devices: $CUDA_DEVICES" +if [ -n "$TEST_FILES_LIST" ]; then + echo " Test Files List: $TEST_FILES_LIST" +else + echo " Test Files Pattern: $TEST_FILES_PATTERN" +fi +echo " Output Directory: $OUTPUT_DIRECTORY" +echo "" + + +cd "$DIR" || exit 1 + +# Install all the neeeded packages +echo "Installing packages..." +pip install $PACKAGES + +# Set CUDA devices +export CUDA_VISIBLE_DEVICES="${CUDA_DEVICES}" +echo "Using CUDA devices: $CUDA_VISIBLE_DEVICES" + +echo "Running tests..." + +if [ -n "$TEST_FILES_LIST" ]; then + mapfile -t test_files < "$TEST_FILES_LIST" +else + shopt -s nullglob + test_files=($TEST_FILES_PATTERN) + shopt -u nullglob +fi + +if [ "${#test_files[@]}" -eq 0 ]; then + echo "No test files found to run." + exit 1 +fi + +for test_file in "${test_files[@]}"; do + echo "Running: ${test_file}" + # Ensure the test file exists + if [ ! 
-f "${test_file}" ]; then + echo "${test_file}: NOT FOUND" >> "${SUMMARY_FILE}" + echo "Test file not found: ${test_file}" + ((errors++)) + continue + fi + log_file_name=$(echo "${test_file%.py}" | sed 's/\//__/g').log + log_file="${LOG_DIRECTORY}/${log_file_name}" + # run the tests and save them as *.log + pytest "${test_file}" -v --capture=tee-sys | tee "${log_file}" + # TODO parse the logs + #echo ${PIPESTATUS[0]} +done From 807d3df8d3c1a0b7295a7d25d3db5a4e3e1fc64b Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 3 Feb 2025 16:47:57 +0000 Subject: [PATCH 02/89] fix test for axlearn --- .github/container/test-axlearn.sh | 131 ++++++++++++++++++++++++------ 1 file changed, 105 insertions(+), 26 deletions(-) diff --git a/.github/container/test-axlearn.sh b/.github/container/test-axlearn.sh index 5d36ad11c..70d7ecb00 100644 --- a/.github/container/test-axlearn.sh +++ b/.github/container/test-axlearn.sh @@ -1,6 +1,6 @@ #!/bin/bash -set -euo pipefail +set -uo pipefail usage() { echo "Run tests in axlearn with specified options." @@ -13,9 +13,8 @@ usage() { echo " -p, --packages PACKAGES Space-separated list of packages to install via pip." echo " Default: 'attrs scikit-learn torch evaluate transformers timm wandb grain'." echo " -c, --cuda-devices DEVICES CUDA devices to use. Default: '0,1'." - echo " -t, --test-files PATTERN Pattern for test files to run." + echo " -t, --test-files FILES Pattern for test files to run." echo " Default: '*_test.py'." - echo " --test-files-list FILE File containing the list of test files to run." echo " -o, --output DIRECTORY Output directory for logs and summary." echo " Default: 'test_runs/'." echo " -h, --help Show this help message and exit." 
@@ -27,12 +26,11 @@ DIR='axlearn/axlearn/common' PACKAGES='attrs scikit-learn torch evaluate transformers timm wandb grain' CUDNN_VERSION='9.7.0.66' # TODO check the cudnn version on compute CUDA_DEVICES='0,1' -TEST_FILES_PATTERN='*_test.py' -TEST_FILES_LIST='' +TEST_FILES=() OUTPUT_DIRECTORY='' # Parse args -args=$(getopt -o d:p:c:t:o:h --long directory:,packages:,cuda-devices:,test-files:,test-files-list:,output:,help -- "$@") +args=$(getopt -o d:p:c:t:o:h --long directory:,packages:,cuda-devices:,test-files:,output:,help -- "$@") if [ $? -ne 0 ]; then usage exit 1 @@ -55,12 +53,12 @@ while true; do shift 2 ;; -t|--test-files) - TEST_FILES_PATTERN="$2" - shift 2 - ;; - --test-files-list) - TEST_FILES_LIST="$2" - shift 2 + shift + # Collect all arguments until the next option (starting with '-') + while [[ $# -gt 0 && ! "$1" =~ ^- ]]; do + TEST_FILES+=("$1") + shift + done ;; -o|--output) OUTPUT_DIRECTORY="$2" @@ -94,10 +92,13 @@ echo "Configuration:" echo " Directory: $DIR" echo " Packages: $PACKAGES" echo " CUDA Devices: $CUDA_DEVICES" -if [ -n "$TEST_FILES_LIST" ]; then - echo " Test Files List: $TEST_FILES_LIST" +if [ "${#TEST_FILES[@]}" -gt 0 ]; then + echo " Test Files:" + for f in "${TEST_FILES[@]}"; do + echo " $f" + done else - echo " Test Files Pattern: $TEST_FILES_PATTERN" + echo " Test Files Pattern: '*_test.py' (default)" fi echo " Output Directory: $OUTPUT_DIRECTORY" echo "" @@ -115,20 +116,73 @@ echo "Using CUDA devices: $CUDA_VISIBLE_DEVICES" echo "Running tests..." 
-if [ -n "$TEST_FILES_LIST" ]; then - mapfile -t test_files < "$TEST_FILES_LIST" -else - shopt -s nullglob - test_files=($TEST_FILES_PATTERN) - shopt -u nullglob +if [ "${#TEST_FILES[@]}" -eq 0 ]; then + TEST_FILES=("*_test.py") fi +expanded_test_files=() +for pattern in "${TEST_FILES[@]}"; do + # Use globbing to expand pattern + files=( $pattern ) + if [ "${#files[@]}" -gt 0 ]; then + expanded_test_files+=( "${files[@]}" ) + else + echo "Warning: No files matched pattern '$pattern'" + fi +done + -if [ "${#test_files[@]}" -eq 0 ]; then +if [ "${#expanded_test_files[@]}" -eq 0 ]; then echo "No test files found to run." exit 1 fi -for test_file in "${test_files[@]}"; do +echo "These are the test files:" +for f in "${expanded_test_files[@]}"; do + echo " $f" +done + +# Get the directory where the script is located +#SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" +EXCLUDE_LIST_FILE="$DIR/exclusion_list.txt" +EXCLUDE_PATTERNS=() + +if [ -f "$EXCLUDE_LIST_FILE" ]; then + echo "Reading exclusion list from '$EXCLUDE_LIST_FILE'" + mapfile -t EXCLUDE_PATTERNS < "$EXCLUDE_LIST_FILE" +else + echo "Exclusion list file not found at '$EXCLUDE_LIST_FILE'" +fi +echo "Exclude patterns read:" +for pattern in "${EXCLUDE_PATTERNS[@]}"; do + echo "$pattern" +done + +#expanded_test_files=( "${expanded_test_files[@]:0:10}" ) +# we are skipping some tests as there's still wip by Apple +final_test_files=() + +for test_file in "${expanded_test_files[@]}"; do + exclude=false + #echo $test_file + for pattern in "${EXCLUDE_PATTERNS[@]}"; do + if [[ "$(basename "$test_file")" == "$(basename "$pattern")" ]]; then + exclude=true + break + fi + done + if [ "$exclude" = false ]; then + final_test_files+=("$test_file") + fi +done + +# Initialize counters +errors=0 +failures=0 +passed=0 +SUMMARY_FILE="${OUTPUT_DIRECTORY}/summary.txt" + + +for test_file in "${final_test_files[@]:0:10}"; do echo "Running: ${test_file}" # Ensure the test file exists if [ ! 
-f "${test_file}" ]; then @@ -140,7 +194,32 @@ for test_file in "${test_files[@]}"; do log_file_name=$(echo "${test_file%.py}" | sed 's/\//__/g').log log_file="${LOG_DIRECTORY}/${log_file_name}" # run the tests and save them as *.log - pytest "${test_file}" -v --capture=tee-sys | tee "${log_file}" - # TODO parse the logs - #echo ${PIPESTATUS[0]} + pytest "${test_file}" --capture=tee-sys | tee "${log_file}" + # TODO parse the logs? + exit_code=${PIPESTATUS[0]} + echo $exit_code + if [ $exit_code -eq 0 ]; then + echo "${test_file}: PASSED" >> "${SUMMARY_FILE}" + ((passed++)) + else + echo "${test_file}: FAILED (Exit code: $exit_code)" >> "${SUMMARY_FILE}" + ((failures++)) + fi + echo "" done + +echo $errors +echo $passed +echo $failures + +# e.g. of output summary +#/opt/axlearn/axlearn/common/adapter_flax_test.py: PASSED +#/opt/axlearn/axlearn/common/attention_bias_test.py: PASSED +#/opt/axlearn/axlearn/common/bert_test.py: FAILED (Exit code: 1) +#/opt/axlearn/axlearn/common/causal_lm_test.py: FAILED (Exit code: 1) +#/opt/axlearn/axlearn/common/checkpointer_orbax_test.py: PASSED +#/opt/axlearn/axlearn/common/checkpointer_test.py: PASSED +#/opt/axlearn/axlearn/common/compiler_options_test.py: PASSED +#/opt/axlearn/axlearn/common/config_test.py: PASSED +#/opt/axlearn/axlearn/common/conformer_test.py: FAILED (Exit code: 1) +#/opt/axlearn/axlearn/common/convolution_test.py: FAILED (Exit code: 1) From 9947c0843862c1d668434be8e827705692bffdae Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 3 Feb 2025 17:38:45 +0000 Subject: [PATCH 03/89] add build for axlearn --- .github/workflows/_ci.yaml | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 167c4f009..b31359d0c 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -203,6 +203,21 @@ jobs: URLREF_PANOPTICAPI=${{ fromJson(inputs.SOURCE_URLREFS).PANOPTICAPI }} secrets: inherit + build-axlearn: + needs: build-jax + uses: 
./.github/workflows/_build.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: artifact-axlearn-build + BADGE_FILENAME: badge-axlearn-build + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + CONTAINER_NAME: axlearn + DOCKERFILE: .github/container/Dockerfile.axlearn + EXTRA_BUILD_ARGS: | + URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }} + secrets: inherit + collect-docker-tags: runs-on: ubuntu-22.04 if: "!cancelled()" @@ -218,6 +233,7 @@ jobs: - build-rosetta-t5x - build-rosetta-pax - build-gemma + - build-axlearn outputs: TAGS: ${{ steps.collect-tags.outputs.TAGS }} steps: @@ -237,6 +253,7 @@ jobs: {"flavor": "t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "pax", "stage": "final", "priority": 900, "tag": "${{ needs.build-rosetta-pax.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "gemma", "stage": "final", "priority": 900, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "axlearn", "stage": "final", "priority": 900, "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "jax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}"},\ {"flavor": "triton", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_MEALKIT }}"},\ {"flavor": "equinox", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_MEALKIT }}"},\ @@ -247,6 +264,7 @@ jobs: {"flavor": "t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ {"flavor": "pax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-pax.outputs.DOCKER_TAG_MEALKIT }}"},\ {"flavor": "gemma", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "axlearn", "stage": "mealkit", "priority": 500, 
"tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_MEALKIT }}"},\ {}\ ] From ef775d5fd8d54e9ba1965afad9c29414eed1f3d6 Mon Sep 17 00:00:00 2001 From: Steboss Date: Tue, 4 Feb 2025 11:04:28 +0000 Subject: [PATCH 04/89] install dependencies --- .github/container/Dockerfile.axlearn | 15 +++++++++++++++ .github/container/test-axlearn.sh | 10 ++-------- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/.github/container/Dockerfile.axlearn b/.github/container/Dockerfile.axlearn index 857e3941f..1ec98c86b 100644 --- a/.github/container/Dockerfile.axlearn +++ b/.github/container/Dockerfile.axlearn @@ -31,9 +31,24 @@ tensorflow-metadata==1.13.1 tensorflow-probability==0.24.0 tensorflow-text==2.18.1 pytest>=7.4.3 +scikit-learn +torch +evaluate +transformers +timm +wandb +grain +nvidia-cudnn-cu12==9.7.0.66 REQUIREMENTS EOF + +############################################################################### +## Add test script to the path +############################################################################### + +ADD test-axlearn.sh /usr/local/bin + ############################################################################### ## Install accumulated packages from the base image and the previous stage ############################################################################### diff --git a/.github/container/test-axlearn.sh b/.github/container/test-axlearn.sh index 70d7ecb00..e59933129 100644 --- a/.github/container/test-axlearn.sh +++ b/.github/container/test-axlearn.sh @@ -12,7 +12,7 @@ usage() { echo " Default: 'axlearn/axlearn/common'." echo " -p, --packages PACKAGES Space-separated list of packages to install via pip." echo " Default: 'attrs scikit-learn torch evaluate transformers timm wandb grain'." - echo " -c, --cuda-devices DEVICES CUDA devices to use. Default: '0,1'." + echo " -c, --cuda-devices DEVICES CUDA devices to use. Default: '0,1,2,3,4,5,6,7'." echo " -t, --test-files FILES Pattern for test files to run." echo " Default: '*_test.py'." 
echo " -o, --output DIRECTORY Output directory for logs and summary." @@ -23,9 +23,7 @@ usage() { # Default values DIR='axlearn/axlearn/common' -PACKAGES='attrs scikit-learn torch evaluate transformers timm wandb grain' -CUDNN_VERSION='9.7.0.66' # TODO check the cudnn version on compute -CUDA_DEVICES='0,1' +CUDA_DEVICES='0,1,2,3,4,5,6,7' TEST_FILES=() OUTPUT_DIRECTORY='' @@ -106,10 +104,6 @@ echo "" cd "$DIR" || exit 1 -# Install all the neeeded packages -echo "Installing packages..." -pip install $PACKAGES - # Set CUDA devices export CUDA_VISIBLE_DEVICES="${CUDA_DEVICES}" echo "Using CUDA devices: $CUDA_VISIBLE_DEVICES" From 15927d2eaa6871d7c4a17c28b56f31ba4b7448be Mon Sep 17 00:00:00 2001 From: Steboss Date: Tue, 4 Feb 2025 17:26:30 +0000 Subject: [PATCH 05/89] check tests --- .github/workflows/_ci.yaml | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index b31359d0c..84fdc05fc 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -710,3 +710,21 @@ jobs: with: MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }} secrets: inherit + + test-axlearn: + needs: build-axlearn + if: inputs.ARCHITECTURE == 'amd64' # no arm64 gpu runners + uses: ./.github/workflows/_test_unit.yaml + with: # fix the arguments below + TEST_NAME: axlearn + EXECUTE: | + docker run -i --shm-size=1g --gpus all \ + ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }} \ + bash <<"EOF" |& tee test-backend-independent.log + test-axlearn.sh --directory $pwd --output /opt/output/output.log --test-files /opt/axlearn/axlearn/common/*_test.py + EOF + STATISTICS_SCRIPT: | + echo "Todo" + ARTIFACTS: | + test-backend-independent.log + secrets: inherit \ No newline at end of file From af9dad37a01de86eb7e2886bd331a8b67e02b59a Mon Sep 17 00:00:00 2001 From: Steboss Date: Thu, 6 Feb 2025 10:37:56 +0000 Subject: [PATCH 06/89] make the bash script executable --- .github/container/test-axlearn.sh | 0 1 
file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 .github/container/test-axlearn.sh diff --git a/.github/container/test-axlearn.sh b/.github/container/test-axlearn.sh old mode 100644 new mode 100755 From 9e4d4a551b7df060a055140790934e5769481327 Mon Sep 17 00:00:00 2001 From: Steboss Date: Fri, 7 Feb 2025 09:31:44 +0100 Subject: [PATCH 07/89] minimal ci to test axlearn --- .github/workflows/_ci.yaml | 1124 ++++++++++++++++++------------------ 1 file changed, 553 insertions(+), 571 deletions(-) diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 84fdc05fc..514545edb 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -66,142 +66,142 @@ jobs: URLREF_TRANSFORMER_ENGINE=${{ fromJson(inputs.SOURCE_URLREFS).TRANSFORMER_ENGINE }} secrets: inherit - build-triton: - needs: build-jax - if: inputs.ARCHITECTURE == 'amd64' # Triton does not seem to support arm64 - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: artifact-triton-build - BADGE_FILENAME: badge-triton-build - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: triton - DOCKERFILE: .github/container/Dockerfile.triton - RUNNER_SIZE: large - EXTRA_BUILD_ARGS: | - URLREF_JAX_TRITON=${{ fromJson(inputs.SOURCE_URLREFS).JAX_TRITON }} - secrets: inherit + # build-triton: + # needs: build-jax + # if: inputs.ARCHITECTURE == 'amd64' # Triton does not seem to support arm64 + # uses: ./.github/workflows/_build.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: artifact-triton-build + # BADGE_FILENAME: badge-triton-build + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: triton + # DOCKERFILE: .github/container/Dockerfile.triton + # RUNNER_SIZE: large + # EXTRA_BUILD_ARGS: | + # URLREF_JAX_TRITON=${{ 
fromJson(inputs.SOURCE_URLREFS).JAX_TRITON }} + # secrets: inherit - build-equinox: - needs: build-jax - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: artifact-equinox-build - BADGE_FILENAME: badge-equinox-build - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: equinox - DOCKERFILE: .github/container/Dockerfile.equinox - EXTRA_BUILD_ARGS: | - URLREF_EQUINOX=${{ fromJson(inputs.SOURCE_URLREFS).EQUINOX }} - secrets: inherit + # build-equinox: + # needs: build-jax + # uses: ./.github/workflows/_build.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: artifact-equinox-build + # BADGE_FILENAME: badge-equinox-build + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: equinox + # DOCKERFILE: .github/container/Dockerfile.equinox + # EXTRA_BUILD_ARGS: | + # URLREF_EQUINOX=${{ fromJson(inputs.SOURCE_URLREFS).EQUINOX }} + # secrets: inherit - build-maxtext: - needs: build-jax - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: artifact-maxtext-build - BADGE_FILENAME: badge-maxtext-build - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: maxtext - DOCKERFILE: .github/container/Dockerfile.maxtext - EXTRA_BUILD_ARGS: | - URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }} - secrets: inherit + # build-maxtext: + # needs: build-jax + # uses: ./.github/workflows/_build.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: artifact-maxtext-build + # BADGE_FILENAME: badge-maxtext-build + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: maxtext + # DOCKERFILE: .github/container/Dockerfile.maxtext + # EXTRA_BUILD_ARGS: | + # 
URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }} + # secrets: inherit - build-levanter: - needs: [build-jax] - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: "artifact-levanter-build" - BADGE_FILENAME: "badge-levanter-build" - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: levanter - DOCKERFILE: .github/container/Dockerfile.levanter - EXTRA_BUILD_ARGS: | - URLREF_LEVANTER=${{ fromJson(inputs.SOURCE_URLREFS).LEVANTER }} - URLREF_HALIAX=${{ fromJson(inputs.SOURCE_URLREFS).HALIAX }} - secrets: inherit + # build-levanter: + # needs: [build-jax] + # uses: ./.github/workflows/_build.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: "artifact-levanter-build" + # BADGE_FILENAME: "badge-levanter-build" + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: levanter + # DOCKERFILE: .github/container/Dockerfile.levanter + # EXTRA_BUILD_ARGS: | + # URLREF_LEVANTER=${{ fromJson(inputs.SOURCE_URLREFS).LEVANTER }} + # URLREF_HALIAX=${{ fromJson(inputs.SOURCE_URLREFS).HALIAX }} + # secrets: inherit - build-upstream-t5x: - needs: build-jax - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: "artifact-t5x-build" - BADGE_FILENAME: "badge-t5x-build" - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: upstream-t5x - DOCKERFILE: .github/container/Dockerfile.t5x - EXTRA_BUILD_ARGS: | - URLREF_T5X=${{ fromJson(inputs.SOURCE_URLREFS).T5X }} - URLREF_AIRIO=${{ fromJson(inputs.SOURCE_URLREFS).AIRIO }} - secrets: inherit + # build-upstream-t5x: + # needs: build-jax + # uses: ./.github/workflows/_build.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: "artifact-t5x-build" + # BADGE_FILENAME: 
"badge-t5x-build" + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: upstream-t5x + # DOCKERFILE: .github/container/Dockerfile.t5x + # EXTRA_BUILD_ARGS: | + # URLREF_T5X=${{ fromJson(inputs.SOURCE_URLREFS).T5X }} + # URLREF_AIRIO=${{ fromJson(inputs.SOURCE_URLREFS).AIRIO }} + # secrets: inherit - build-upstream-pax: - needs: build-jax - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: artifact-pax-build - BADGE_FILENAME: badge-pax-build - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: upstream-pax - DOCKERFILE: .github/container/Dockerfile.pax - EXTRA_BUILD_ARGS: | - URLREF_PAXML=${{ fromJson(inputs.SOURCE_URLREFS).PAXML }} - URLREF_PRAXIS=${{ fromJson(inputs.SOURCE_URLREFS).PRAXIS }} - URLREF_LINGVO=${{ fromJson(inputs.SOURCE_URLREFS).LINGVO }} - secrets: inherit + # build-upstream-pax: + # needs: build-jax + # uses: ./.github/workflows/_build.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: artifact-pax-build + # BADGE_FILENAME: badge-pax-build + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: upstream-pax + # DOCKERFILE: .github/container/Dockerfile.pax + # EXTRA_BUILD_ARGS: | + # URLREF_PAXML=${{ fromJson(inputs.SOURCE_URLREFS).PAXML }} + # URLREF_PRAXIS=${{ fromJson(inputs.SOURCE_URLREFS).PRAXIS }} + # URLREF_LINGVO=${{ fromJson(inputs.SOURCE_URLREFS).LINGVO }} + # secrets: inherit - build-rosetta-t5x: - needs: build-upstream-t5x - uses: ./.github/workflows/_build_rosetta.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }} - BASE_LIBRARY: t5x - secrets: inherit + # build-rosetta-t5x: + # needs: build-upstream-t5x + # uses: 
./.github/workflows/_build_rosetta.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }} + # BASE_LIBRARY: t5x + # secrets: inherit - build-rosetta-pax: - needs: build-upstream-pax - uses: ./.github/workflows/_build_rosetta.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_MEALKIT }} - BASE_LIBRARY: pax - secrets: inherit + # build-rosetta-pax: + # needs: build-upstream-pax + # uses: ./.github/workflows/_build_rosetta.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_MEALKIT }} + # BASE_LIBRARY: pax + # secrets: inherit - build-gemma: - needs: build-jax - uses: ./.github/workflows/_build.yaml - if: inputs.ARCHITECTURE == 'amd64' # build only amd64 - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: artifact-gemma-build - BADGE_FILENAME: badge-gemma-build - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: gemma - DOCKERFILE: rosetta/Dockerfile.gemma - DOCKER_CONTEXT: . 
- EXTRA_BUILD_ARGS: | - URLREF_GEMMA=${{ fromJson(inputs.SOURCE_URLREFS).GEMMA }} - URLREF_BIG_VISION=${{ fromJson(inputs.SOURCE_URLREFS).BIG_VISION }} - URLREF_COMMON_LOOP_UTILS=${{ fromJson(inputs.SOURCE_URLREFS).COMMON_LOOP_UTILS }} - URLREF_FLAXFORMER=${{ fromJson(inputs.SOURCE_URLREFS).FLAXFORMER }} - URLREF_PANOPTICAPI=${{ fromJson(inputs.SOURCE_URLREFS).PANOPTICAPI }} - secrets: inherit + # build-gemma: + # needs: build-jax + # uses: ./.github/workflows/_build.yaml + # if: inputs.ARCHITECTURE == 'amd64' # build only amd64 + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: artifact-gemma-build + # BADGE_FILENAME: badge-gemma-build + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: gemma + # DOCKERFILE: rosetta/Dockerfile.gemma + # DOCKER_CONTEXT: . + # EXTRA_BUILD_ARGS: | + # URLREF_GEMMA=${{ fromJson(inputs.SOURCE_URLREFS).GEMMA }} + # URLREF_BIG_VISION=${{ fromJson(inputs.SOURCE_URLREFS).BIG_VISION }} + # URLREF_COMMON_LOOP_UTILS=${{ fromJson(inputs.SOURCE_URLREFS).COMMON_LOOP_UTILS }} + # URLREF_FLAXFORMER=${{ fromJson(inputs.SOURCE_URLREFS).FLAXFORMER }} + # URLREF_PANOPTICAPI=${{ fromJson(inputs.SOURCE_URLREFS).PANOPTICAPI }} + # secrets: inherit build-axlearn: needs: build-jax @@ -224,15 +224,15 @@ jobs: needs: - build-base - build-jax - - build-triton - - build-equinox - - build-maxtext - - build-levanter - - build-upstream-t5x - - build-upstream-pax - - build-rosetta-t5x - - build-rosetta-pax - - build-gemma + # - build-triton + # - build-equinox + # - build-maxtext + # - build-levanter + # - build-upstream-t5x + # - build-upstream-pax + # - build-rosetta-t5x + # - build-rosetta-pax + # - build-gemma - build-axlearn outputs: TAGS: ${{ steps.collect-tags.outputs.TAGS }} @@ -244,26 +244,8 @@ jobs: [\ {"flavor": "base", "stage": "final", "priority": 800, "tag": "${{ needs.build-base.outputs.DOCKER_TAG }}"},\ {"flavor": "jax", "stage": "final", "priority": 
1000, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "triton", "stage": "final", "priority": 900, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "equinox", "stage": "final", "priority": 900, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "maxtext", "stage": "final", "priority": 900, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "levanter", "stage": "final", "priority": 900, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "upstream-t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "upstream-pax", "stage": "final", "priority": 900, "tag": "${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "pax", "stage": "final", "priority": 900, "tag": "${{ needs.build-rosetta-pax.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "gemma", "stage": "final", "priority": 900, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "axlearn", "stage": "final", "priority": 900, "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "jax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "triton", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "equinox", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "maxtext", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "levanter", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "upstream-t5x", "stage": "mealkit", "priority": 
500, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "upstream-pax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-upstream-pax.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "pax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-pax.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "gemma", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_MEALKIT }}"},\ {"flavor": "axlearn", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_MEALKIT }}"},\ {}\ @@ -273,447 +255,447 @@ jobs: echo "TAGS=${TAGS}" >> $GITHUB_OUTPUT - test-distribution: - runs-on: ubuntu-22.04 - strategy: - matrix: - TEST_SCRIPT: - - extra-only-distribution.sh - - mirror-only-distribution.sh - - upstream-only-distribution.sh - - local-patch-distribution.sh - fail-fast: false - steps: - - name: Print environment variables - run: env - - name: Set git login for tests - run: | - git config --global user.email "jax@nvidia.com" - git config --global user.name "JAX-Toolbox CI" - - name: Check out the repository under ${GITHUB_WORKSPACE} - uses: actions/checkout@v4 - - name: Run integration test ${{ matrix.TEST_SCRIPT }} - run: bash rosetta/tests/${{ matrix.TEST_SCRIPT }} - - test-jax: - needs: build-jax - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_unit.yaml - with: - TEST_NAME: jax - EXECUTE: | - docker run -i --shm-size=1g --gpus all \ - ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee test-backend-independent.log - test-jax.sh -b backend-independent - EOF - docker run -i --shm-size=1g --gpus all \ - ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee tee test-gpu.log - nvidia-cuda-mps-control -d - test-jax.sh -b gpu - EOF - STATISTICS_SCRIPT: | - 
errors=$(cat test-*.log | grep -c 'ERROR:' || true) - failed_tests=$(cat test-*.log | grep -c 'FAILED in' || true) - passed_tests=$(cat test-*.log | grep -c 'PASSED in' || true) - total_tests=$((failed_tests + passed_tests)) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - ARTIFACTS: | - test-backend-independent.log - test-gpu.log - secrets: inherit - - test-nsys-jax: - needs: build-jax - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_unit.yaml - with: - TEST_NAME: nsys-jax - EXECUTE: | - set -o pipefail - num_tests=0 - num_failures=0 - # Run the pytest-driven tests; failure is explicitly handled below so set +e to - # avoid an early abort here. - set +e - docker run -i --shm-size=1g --gpus all \ - -v $PWD:/opt/output \ - ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee test-nsys-jax.log - # nsys-jax is already installed, this is just adding the test dependencies - pip install pytest-reportlog nsys-jax[test] - # abuse knowledge that nsys-jax is installed editable, so the tests exist - test_path=$(python -c 'import importlib.resources; print(importlib.resources.files("nsys_jax").joinpath("..", "tests").resolve())') - pytest --report-log=/opt/output/pytest-report.jsonl "${test_path}" - EOF - set -e - GPUS_PER_NODE=$(nvidia-smi -L | grep -c '^GPU') - for mode in 1-process 2-process process-per-gpu; do - DOCKER="docker run --shm-size=1g --gpus all --env XLA_FLAGS=--xla_gpu_enable_command_buffer= --env XLA_PYTHON_CLIENT_MEM_FRACTION=0.9 -v ${PWD}:/opt/output ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}" - if [[ "${mode}" == "1-process" ]]; then - PROCESS_COUNT=1 - ARGS="" - elif [[ "${mode}" == "2-process" ]]; then - # Use two processes with GPUS_PER_NODE/2 GPUs per process in the hope that - # this will flush out more bugs than 
process-per-node or process-per-GPU. - PROCESS_COUNT=2 - ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process $((GPUS_PER_NODE/2)) --distributed" - else - PROCESS_COUNT=${GPUS_PER_NODE} - ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process 1 --distributed" - fi - for collection in full partial; do - NSYS_JAX="nsys-jax" - if [[ "${mode}" == "1-process" ]]; then - # We will not run nsys-jax-combine, so run analyses eagerly - NSYS_JAX+=" --nsys-jax-analysis communication --nsys-jax-analysis summary" - fi - NSYS_JAX+=" --output=/opt/output/${mode}-${collection}-execution-%q{RANK}" - if [[ "${collection}" == "partial" ]]; then - NSYS_JAX+=" --capture-range=cudaProfilerApi --capture-range-end=stop" - # nvbug/4801401 - NSYS_JAX+=" --sample=none" - fi - set +e - ${DOCKER} parallel-launch RANK ${PROCESS_COUNT} ${NSYS_JAX} \ - -- jax-nccl-test ${ARGS} |& tee ${mode}-${collection}-execution.log - num_failures=$((num_failures + ($? 
!= 0))) - set -e - num_tests=$((num_tests + 1)) - done - if [[ "${mode}" != "1-process" ]]; then - # Run nsys-jax-combine - NSYS_JAX_COMBINE="nsys-jax-combine --analysis communication --analysis summary --output=/opt/output/${mode}-${collection}-execution.zip" - for (( i=0; i> $GITHUB_ENV - echo "NSYS_JAX_FAIL_COUNT=${num_failures}" >> $GITHUB_ENV - exit $num_failures - STATISTICS_SCRIPT: | - summary_line=$(tail -n1 test-nsys-jax.log) - num_errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) - failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) - total_tests=$(( NSYS_JAX_TEST_COUNT + passed_tests + failed_tests )) - num_passed=$(( passed_tests + NSYS_JAX_TEST_COUNT - NSYS_JAX_FAIL_COUNT )) - num_failed=$(( failed_tests + NSYS_JAX_FAIL_COUNT )) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${num_errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${num_passed}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${num_failed}" >> $GITHUB_OUTPUT - ARTIFACTS: | - # pytest-driven part - test-nsys-jax.log - pytest-report.jsonl - # nsys-jax logfiles - *process-*-execution.log - # nsys-jax output for the case that doesn't use nsys-jax-combine - 1-process-*-execution-0.zip - # nsys-jax-combine output/logfiles - *process*-*-execution.zip - *-execution-combine.log - secrets: inherit - - # test-nsys-jax generates several fresh .zip archive outputs by running nsys-jax with real GPU hardware; this test - # runs on a regular GitHub Actions runner and checks that offline post-processing works in an environment that does - # not already have nsys-jax installed - test-nsys-jax-archive: - needs: test-nsys-jax - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - strategy: - 
matrix: - os: [ubuntu-22.04, ubuntu-24.04, macOS-latest] - runs-on: ${{ matrix.os }} - steps: - - name: Download nsys-jax output .zip files - uses: actions/download-artifact@v4 - with: - name: nsys-jax-unit-test-A100 - - name: Extract archives and execute install scripts - run: | - pip install virtualenv # for install.sh - for zip in $(ls *.zip); do - ZIP="${PWD}/${zip}" - pushd $(mktemp -d) - unzip "${ZIP}" - ls -l - # TODO: verify this isn't needed, or make sure it isn't needed - chmod 755 install.sh - # Run the notebook with IPython, not Jupyter Lab, so it exits and prints something informative to stdout - # Skip executing Jupyter lab - NSYS_JAX_JUPYTER_EXECUTE_NOT_LAB=1 ./install.sh - popd - done - - test-nsys-jax-eks: - needs: build-jax - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - runs-on: eks - env: - JAX_DOCKER_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} - JOB_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-jax - POSTPROCESS_JOB_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-postprocess - TOKEN_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-token - steps: - - name: Check out the repository - uses: actions/checkout@v4 - - name: Login to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} - - name: Store GitHub Container Registry token as Kubernetes secret - run: | - kubectl create secret generic \ - ${{ github.run_id }}-${{ github.run_attempt }}-token \ - --from-file=.dockerconfigjson=$HOME/.docker/config.json \ - --type=kubernetes.io/dockerconfigjson - - name: Configure Kubernetes job - run: | - yq -i ea 'select(di == 0).spec.selector.job-name = strenv(JOB_NAME) - | select(di == 1).metadata.name = strenv(JOB_NAME) - | select(di == 1).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) - | select(di == 1).spec.template.spec.containers[0].image = strenv(JAX_DOCKER_IMAGE) - | select(di == 
1).spec.template.spec.containers[0].env[0].value = strenv(JOB_NAME)' \ - .github/eks-workflow-files/job.yml - git diff .github/eks-workflow-files/job.yml - - name: Submit Kubernetes job - run: kubectl apply -f .github/eks-workflow-files/job.yml - - name: Wait for Kubernetes job to start - run: | - while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ github.run_id }}-${{ github.run_attempt }}-jax --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do - sleep 2 - done - - name: Stream Kubernetes job output - run: kubectl logs --all-containers=true --all-pods=true --follow job/${{ github.run_id }}-${{ github.run_attempt }}-jax - # Clean up in case of errors as well as success - - name: Delete Kubernetes job - if: always() - run: kubectl delete job ${{ github.run_id }}-${{ github.run_attempt }}-jax - - name: Configure post-processing job - run: | - export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip" - yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME) - | .spec.template.spec.containers[].image = strenv(JAX_DOCKER_IMAGE) - | .spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) - | .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \ - .github/eks-workflow-files/post-process-job.yml - git diff .github/eks-workflow-files/post-process-job.yml - - name: Submit post-processing Kubernetes job - run: kubectl apply -f .github/eks-workflow-files/post-process-job.yml - - name: Wait for post-processing Kubernetes job to start - run: | - while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ github.run_id }}-${{ github.run_attempt }}-postprocess --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do - sleep 2 - done - - name: Stream post-processing Kubernetes job output - run: kubectl logs --all-containers=true --all-pods=true --follow job/${{ github.run_id }}-${{ github.run_attempt }}-postprocess - # Clean up in case of errors as well as success 
- - name: Delete post-processing Kubernetes job - if: always() - run: kubectl delete job ${{ github.run_id }}-${{ github.run_attempt }}-postprocess - - name: Delete GitHub Container Registry token - if: always() - run: kubectl delete secret ${{ github.run_id }}-${{ github.run_attempt }}-token - - # test-equinox: - # needs: build-equinox + # test-distribution: + # runs-on: ubuntu-22.04 + # strategy: + # matrix: + # TEST_SCRIPT: + # - extra-only-distribution.sh + # - mirror-only-distribution.sh + # - upstream-only-distribution.sh + # - local-patch-distribution.sh + # fail-fast: false + # steps: + # - name: Print environment variables + # run: env + # - name: Set git login for tests + # run: | + # git config --global user.email "jax@nvidia.com" + # git config --global user.name "JAX-Toolbox CI" + # - name: Check out the repository under ${GITHUB_WORKSPACE} + # uses: actions/checkout@v4 + # - name: Run integration test ${{ matrix.TEST_SCRIPT }} + # run: bash rosetta/tests/${{ matrix.TEST_SCRIPT }} + + # test-jax: + # needs: build-jax # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a # uses: ./.github/workflows/_test_unit.yaml # with: - # IMAGE: ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} - # TEST_NAME: equinox + # TEST_NAME: jax # EXECUTE: | - # docker run --shm-size=1g --gpus all ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} \ - # bash -exc -o pipefail \ - # 'pushd /opt/equinox/tests && pip install -r requirements.txt && pytest .' 
| tee test-equinox.log + # docker run -i --shm-size=1g --gpus all \ + # ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-backend-independent.log + # test-jax.sh -b backend-independent + # EOF + # docker run -i --shm-size=1g --gpus all \ + # ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee tee test-gpu.log + # nvidia-cuda-mps-control -d + # test-jax.sh -b gpu + # EOF # STATISTICS_SCRIPT: | - # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - # failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') - # passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') + # errors=$(cat test-*.log | grep -c 'ERROR:' || true) + # failed_tests=$(cat test-*.log | grep -c 'FAILED in' || true) + # passed_tests=$(cat test-*.log | grep -c 'PASSED in' || true) # total_tests=$((failed_tests + passed_tests)) # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT # ARTIFACTS: | - # test-equinox.log + # test-backend-independent.log + # test-gpu.log # secrets: inherit - test-te-multigpu: - needs: build-upstream-pax - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_te.yaml - with: - TE_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} - secrets: inherit + # test-nsys-jax: + # needs: build-jax + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # uses: ./.github/workflows/_test_unit.yaml + # with: + # TEST_NAME: nsys-jax + # EXECUTE: | + # set -o pipefail + # num_tests=0 + # num_failures=0 + # # Run the pytest-driven tests; failure is explicitly handled below so set +e to + # # avoid an early abort here. 
+ # set +e + # docker run -i --shm-size=1g --gpus all \ + # -v $PWD:/opt/output \ + # ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-nsys-jax.log + # # nsys-jax is already installed, this is just adding the test dependencies + # pip install pytest-reportlog nsys-jax[test] + # # abuse knowledge that nsys-jax is installed editable, so the tests exist + # test_path=$(python -c 'import importlib.resources; print(importlib.resources.files("nsys_jax").joinpath("..", "tests").resolve())') + # pytest --report-log=/opt/output/pytest-report.jsonl "${test_path}" + # EOF + # set -e + # GPUS_PER_NODE=$(nvidia-smi -L | grep -c '^GPU') + # for mode in 1-process 2-process process-per-gpu; do + # DOCKER="docker run --shm-size=1g --gpus all --env XLA_FLAGS=--xla_gpu_enable_command_buffer= --env XLA_PYTHON_CLIENT_MEM_FRACTION=0.9 -v ${PWD}:/opt/output ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}" + # if [[ "${mode}" == "1-process" ]]; then + # PROCESS_COUNT=1 + # ARGS="" + # elif [[ "${mode}" == "2-process" ]]; then + # # Use two processes with GPUS_PER_NODE/2 GPUs per process in the hope that + # # this will flush out more bugs than process-per-node or process-per-GPU. 
+ # PROCESS_COUNT=2 + # ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process $((GPUS_PER_NODE/2)) --distributed" + # else + # PROCESS_COUNT=${GPUS_PER_NODE} + # ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process 1 --distributed" + # fi + # for collection in full partial; do + # NSYS_JAX="nsys-jax" + # if [[ "${mode}" == "1-process" ]]; then + # # We will not run nsys-jax-combine, so run analyses eagerly + # NSYS_JAX+=" --nsys-jax-analysis communication --nsys-jax-analysis summary" + # fi + # NSYS_JAX+=" --output=/opt/output/${mode}-${collection}-execution-%q{RANK}" + # if [[ "${collection}" == "partial" ]]; then + # NSYS_JAX+=" --capture-range=cudaProfilerApi --capture-range-end=stop" + # # nvbug/4801401 + # NSYS_JAX+=" --sample=none" + # fi + # set +e + # ${DOCKER} parallel-launch RANK ${PROCESS_COUNT} ${NSYS_JAX} \ + # -- jax-nccl-test ${ARGS} |& tee ${mode}-${collection}-execution.log + # num_failures=$((num_failures + ($? 
!= 0))) + # set -e + # num_tests=$((num_tests + 1)) + # done + # if [[ "${mode}" != "1-process" ]]; then + # # Run nsys-jax-combine + # NSYS_JAX_COMBINE="nsys-jax-combine --analysis communication --analysis summary --output=/opt/output/${mode}-${collection}-execution.zip" + # for (( i=0; i> $GITHUB_ENV + # echo "NSYS_JAX_FAIL_COUNT=${num_failures}" >> $GITHUB_ENV + # exit $num_failures + # STATISTICS_SCRIPT: | + # summary_line=$(tail -n1 test-nsys-jax.log) + # num_errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + # passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) + # failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) + # total_tests=$(( NSYS_JAX_TEST_COUNT + passed_tests + failed_tests )) + # num_passed=$(( passed_tests + NSYS_JAX_TEST_COUNT - NSYS_JAX_FAIL_COUNT )) + # num_failed=$(( failed_tests + NSYS_JAX_FAIL_COUNT )) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${num_errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${num_passed}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${num_failed}" >> $GITHUB_OUTPUT + # ARTIFACTS: | + # # pytest-driven part + # test-nsys-jax.log + # pytest-report.jsonl + # # nsys-jax logfiles + # *process-*-execution.log + # # nsys-jax output for the case that doesn't use nsys-jax-combine + # 1-process-*-execution-0.zip + # # nsys-jax-combine output/logfiles + # *process*-*-execution.zip + # *-execution-combine.log + # secrets: inherit - test-upstream-t5x: - needs: build-upstream-t5x - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_upstream_t5x.yaml - with: - T5X_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }} - secrets: inherit + # # test-nsys-jax generates several fresh .zip archive outputs by 
running nsys-jax with real GPU hardware; this test + # # runs on a regular GitHub Actions runner and checks that offline post-processing works in an environment that does + # # not already have nsys-jax installed + # test-nsys-jax-archive: + # needs: test-nsys-jax + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # strategy: + # matrix: + # os: [ubuntu-22.04, ubuntu-24.04, macOS-latest] + # runs-on: ${{ matrix.os }} + # steps: + # - name: Download nsys-jax output .zip files + # uses: actions/download-artifact@v4 + # with: + # name: nsys-jax-unit-test-A100 + # - name: Extract archives and execute install scripts + # run: | + # pip install virtualenv # for install.sh + # for zip in $(ls *.zip); do + # ZIP="${PWD}/${zip}" + # pushd $(mktemp -d) + # unzip "${ZIP}" + # ls -l + # # TODO: verify this isn't needed, or make sure it isn't needed + # chmod 755 install.sh + # # Run the notebook with IPython, not Jupyter Lab, so it exits and prints something informative to stdout + # # Skip executing Jupyter lab + # NSYS_JAX_JUPYTER_EXECUTE_NOT_LAB=1 ./install.sh + # popd + # done + + # test-nsys-jax-eks: + # needs: build-jax + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # runs-on: eks + # env: + # JAX_DOCKER_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} + # JOB_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-jax + # POSTPROCESS_JOB_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-postprocess + # TOKEN_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-token + # steps: + # - name: Check out the repository + # uses: actions/checkout@v4 + # - name: Login to GitHub Container Registry + # uses: docker/login-action@v3 + # with: + # registry: ghcr.io + # username: ${{ github.repository_owner }} + # password: ${{ secrets.GITHUB_TOKEN }} + # - name: Store GitHub Container Registry token as Kubernetes secret + # run: | + # kubectl create secret generic \ + # ${{ github.run_id }}-${{ github.run_attempt }}-token \ + # 
--from-file=.dockerconfigjson=$HOME/.docker/config.json \ + # --type=kubernetes.io/dockerconfigjson + # - name: Configure Kubernetes job + # run: | + # yq -i ea 'select(di == 0).spec.selector.job-name = strenv(JOB_NAME) + # | select(di == 1).metadata.name = strenv(JOB_NAME) + # | select(di == 1).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) + # | select(di == 1).spec.template.spec.containers[0].image = strenv(JAX_DOCKER_IMAGE) + # | select(di == 1).spec.template.spec.containers[0].env[0].value = strenv(JOB_NAME)' \ + # .github/eks-workflow-files/job.yml + # git diff .github/eks-workflow-files/job.yml + # - name: Submit Kubernetes job + # run: kubectl apply -f .github/eks-workflow-files/job.yml + # - name: Wait for Kubernetes job to start + # run: | + # while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ github.run_id }}-${{ github.run_attempt }}-jax --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do + # sleep 2 + # done + # - name: Stream Kubernetes job output + # run: kubectl logs --all-containers=true --all-pods=true --follow job/${{ github.run_id }}-${{ github.run_attempt }}-jax + # # Clean up in case of errors as well as success + # - name: Delete Kubernetes job + # if: always() + # run: kubectl delete job ${{ github.run_id }}-${{ github.run_attempt }}-jax + # - name: Configure post-processing job + # run: | + # export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip" + # yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME) + # | .spec.template.spec.containers[].image = strenv(JAX_DOCKER_IMAGE) + # | .spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) + # | .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \ + # .github/eks-workflow-files/post-process-job.yml + # git diff .github/eks-workflow-files/post-process-job.yml + # - name: Submit post-processing Kubernetes job + # run: kubectl apply -f .github/eks-workflow-files/post-process-job.yml + # - name: 
Wait for post-processing Kubernetes job to start + # run: | + # while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ github.run_id }}-${{ github.run_attempt }}-postprocess --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do + # sleep 2 + # done + # - name: Stream post-processing Kubernetes job output + # run: kubectl logs --all-containers=true --all-pods=true --follow job/${{ github.run_id }}-${{ github.run_attempt }}-postprocess + # # Clean up in case of errors as well as success + # - name: Delete post-processing Kubernetes job + # if: always() + # run: kubectl delete job ${{ github.run_id }}-${{ github.run_attempt }}-postprocess + # - name: Delete GitHub Container Registry token + # if: always() + # run: kubectl delete secret ${{ github.run_id }}-${{ github.run_attempt }}-token + + # # test-equinox: + # # needs: build-equinox + # # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # # uses: ./.github/workflows/_test_unit.yaml + # # with: + # # IMAGE: ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} + # # TEST_NAME: equinox + # # EXECUTE: | + # # docker run --shm-size=1g --gpus all ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} \ + # # bash -exc -o pipefail \ + # # 'pushd /opt/equinox/tests && pip install -r requirements.txt && pytest .' 
| tee test-equinox.log + # # STATISTICS_SCRIPT: | + # # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + # # failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') + # # passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') + # # total_tests=$((failed_tests + passed_tests)) + # # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + # # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # # ARTIFACTS: | + # # test-equinox.log + # # secrets: inherit + + # test-te-multigpu: + # needs: build-upstream-pax + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # uses: ./.github/workflows/_test_te.yaml + # with: + # TE_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit - test-rosetta-t5x: - needs: build-rosetta-t5x - if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 - uses: ./.github/workflows/_test_t5x_rosetta.yaml - with: - T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }} - secrets: inherit + # test-upstream-t5x: + # needs: build-upstream-t5x + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # uses: ./.github/workflows/_test_upstream_t5x.yaml + # with: + # T5X_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit - test-triton: - needs: build-triton - if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 - uses: ./.github/workflows/_test_unit.yaml - with: - TEST_NAME: triton - EXECUTE: | - docker run -i --shm-size=1g --gpus all --volume $PWD:/output \ - ${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee test-triton.log - # autotuner tests from jax-triton now hit a triton code path that uses utilities from pytorch; this relies on - # actually having a CUDA backend for pytoch - 
pip install --no-deps torch - python /opt/jax-triton/tests/triton_call_test.py --xml_output_file /output/triton_test.xml - EOF - STATISTICS_SCRIPT: | - curl -L -o yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg --print-architecture) && chmod 777 yq; - total_tests=$(./yq '.testsuites."+@tests"' triton_test.xml) - errors=$(./yq '.testsuites."+@errors"' triton_test.xml) - failed_tests=$(./yq '.testsuites."+@failures"' triton_test.xml) - passed_tests=$((total_tests - errors - failed_tests)) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - ARTIFACTS: | - test-triton.log - secrets: inherit + # test-rosetta-t5x: + # needs: build-rosetta-t5x + # if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 + # uses: ./.github/workflows/_test_t5x_rosetta.yaml + # with: + # T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit - test-levanter: - needs: build-levanter - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_unit.yaml - with: - TEST_NAME: levanter - EXECUTE: | - docker run -i --gpus all --shm-size=1g \ - ${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee test-levanter.log - pip install flake8 pytest soundfile librosa - PYTHONPATH=/opt/levanter/tests:$PYTHONPATH pytest /opt/levanter/tests -m "not entry and not slow and not ray" - EOF - STATISTICS_SCRIPT: | - summary_line=$(tail -n1 test-levanter.log) - errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') - passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') - total_tests=$((failed_tests + passed_tests)) - echo 
"TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - ARTIFACTS: | - test-levanter.log - secrets: inherit + # test-triton: + # needs: build-triton + # if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 + # uses: ./.github/workflows/_test_unit.yaml + # with: + # TEST_NAME: triton + # EXECUTE: | + # docker run -i --shm-size=1g --gpus all --volume $PWD:/output \ + # ${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-triton.log + # # autotuner tests from jax-triton now hit a triton code path that uses utilities from pytorch; this relies on + # # actually having a CUDA backend for pytoch + # pip install --no-deps torch + # python /opt/jax-triton/tests/triton_call_test.py --xml_output_file /output/triton_test.xml + # EOF + # STATISTICS_SCRIPT: | + # curl -L -o yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg --print-architecture) && chmod 777 yq; + # total_tests=$(./yq '.testsuites."+@tests"' triton_test.xml) + # errors=$(./yq '.testsuites."+@errors"' triton_test.xml) + # failed_tests=$(./yq '.testsuites."+@failures"' triton_test.xml) + # passed_tests=$((total_tests - errors - failed_tests)) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # ARTIFACTS: | + # test-triton.log + # secrets: inherit - test-te: - needs: build-upstream-pax - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_unit.yaml - with: - TEST_NAME: te - EXECUTE: | - docker run -i --gpus all --shm-size=1g -v $PWD:/log \ - ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee test-te.log - pip install pytest-reportlog - pytest --report-log=log/pytest-report.jsonl 
${SRC_PATH_TRANSFORMER_ENGINE}/tests/jax - EOF - STATISTICS_SCRIPT: | - summary_line=$(tail -n1 test-te.log) - errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) - failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) - total_tests=$((failed_tests + passed_tests)) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - TIMEOUT_MINUTES: 120 - ARTIFACTS: | - test-te.log - pytest-report.jsonl - secrets: inherit + # test-levanter: + # needs: build-levanter + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # uses: ./.github/workflows/_test_unit.yaml + # with: + # TEST_NAME: levanter + # EXECUTE: | + # docker run -i --gpus all --shm-size=1g \ + # ${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-levanter.log + # pip install flake8 pytest soundfile librosa + # PYTHONPATH=/opt/levanter/tests:$PYTHONPATH pytest /opt/levanter/tests -m "not entry and not slow and not ray" + # EOF + # STATISTICS_SCRIPT: | + # summary_line=$(tail -n1 test-levanter.log) + # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + # failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') + # passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') + # total_tests=$((failed_tests + passed_tests)) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # echo 
"FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # ARTIFACTS: | + # test-levanter.log + # secrets: inherit - test-upstream-pax: - needs: build-upstream-pax - if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 - uses: ./.github/workflows/_test_upstream_pax.yaml - with: - PAX_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} - secrets: inherit + # test-te: + # needs: build-upstream-pax + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # uses: ./.github/workflows/_test_unit.yaml + # with: + # TEST_NAME: te + # EXECUTE: | + # docker run -i --gpus all --shm-size=1g -v $PWD:/log \ + # ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-te.log + # pip install pytest-reportlog + # pytest --report-log=log/pytest-report.jsonl ${SRC_PATH_TRANSFORMER_ENGINE}/tests/jax + # EOF + # STATISTICS_SCRIPT: | + # summary_line=$(tail -n1 test-te.log) + # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + # passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) + # failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) + # total_tests=$((failed_tests + passed_tests)) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # TIMEOUT_MINUTES: 120 + # ARTIFACTS: | + # test-te.log + # pytest-report.jsonl + # secrets: inherit - test-rosetta-pax: - needs: build-rosetta-pax - if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 - uses: ./.github/workflows/_test_pax_rosetta.yaml - with: - PAX_IMAGE: ${{ needs.build-rosetta-pax.outputs.DOCKER_TAG_FINAL }} - secrets: inherit + # test-upstream-pax: + # needs: 
build-upstream-pax + # if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 + # uses: ./.github/workflows/_test_upstream_pax.yaml + # with: + # PAX_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit + + # test-rosetta-pax: + # needs: build-rosetta-pax + # if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 + # uses: ./.github/workflows/_test_pax_rosetta.yaml + # with: + # PAX_IMAGE: ${{ needs.build-rosetta-pax.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit - test-gemma: - needs: build-gemma - uses: ./.github/workflows/_test_unit.yaml - if: inputs.ARCHITECTURE == 'amd64' - with: - TEST_NAME: gemma - EXECUTE: | - docker run --shm-size=1g --gpus all ${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }} \ - bash -ec \ - "cd /opt/gemma && pip install -e .[dev] && pytest ." | tee test-gemma.log - STATISTICS_SCRIPT: | - summary_line=$(tail -n1 test-gemma.log) - errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') - passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') - total_tests=$((failed_tests + passed_tests)) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - ARTIFACTS: | - test-gemma.log - secrets: inherit + # test-gemma: + # needs: build-gemma + # uses: ./.github/workflows/_test_unit.yaml + # if: inputs.ARCHITECTURE == 'amd64' + # with: + # TEST_NAME: gemma + # EXECUTE: | + # docker run --shm-size=1g --gpus all ${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }} \ + # bash -ec \ + # "cd /opt/gemma && pip install -e .[dev] && pytest ." 
| tee test-gemma.log + # STATISTICS_SCRIPT: | + # summary_line=$(tail -n1 test-gemma.log) + # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + # failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') + # passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') + # total_tests=$((failed_tests + passed_tests)) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # ARTIFACTS: | + # test-gemma.log + # secrets: inherit - test-maxtext: - needs: build-maxtext - if: inputs.ARCHITECTURE == 'amd64' # no arm64 gpu runners - uses: ./.github/workflows/_test_maxtext.yaml - with: - MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }} - secrets: inherit + # test-maxtext: + # needs: build-maxtext + # if: inputs.ARCHITECTURE == 'amd64' # no arm64 gpu runners + # uses: ./.github/workflows/_test_maxtext.yaml + # with: + # MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit test-axlearn: needs: build-axlearn - if: inputs.ARCHITECTURE == 'amd64' # no arm64 gpu runners + if: inputs.ARCHITECTURE == 'amd64' uses: ./.github/workflows/_test_unit.yaml with: # fix the arguments below TEST_NAME: axlearn From a3b8f266dcaef5d266622545c5e323f679b4ce2d Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 10 Feb 2025 14:12:36 +0000 Subject: [PATCH 08/89] fix requirements --- .github/container/Dockerfile.axlearn | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/container/Dockerfile.axlearn b/.github/container/Dockerfile.axlearn index 1ec98c86b..c09b2c08c 100644 --- a/.github/container/Dockerfile.axlearn +++ b/.github/container/Dockerfile.axlearn @@ -16,7 +16,7 @@ EOF RUN <<"EOF" bash -ex echo "-e ${SRC_PATH_AXLEARN}" > 
/opt/pip-tools.d/requirements-axlearn.in - echo <> /opt/pip-tools.d/requirements-axlearn.in + cat <> /opt/pip-tools.d/requirements-axlearn.in aqtp==0.8.2 einops==0.8.0 nltk==3.7 From 94054fd34bfb73e91ad13ca794090cd337b25c21 Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 10 Feb 2025 14:15:00 +0000 Subject: [PATCH 09/89] fix installation from pip --- .github/container/test-axlearn.sh | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/.github/container/test-axlearn.sh b/.github/container/test-axlearn.sh index e59933129..2ffda9c4d 100755 --- a/.github/container/test-axlearn.sh +++ b/.github/container/test-axlearn.sh @@ -10,8 +10,6 @@ usage() { echo " OPTIONS DESCRIPTION" echo " -d, --directory DIR Directory to run tests in." echo " Default: 'axlearn/axlearn/common'." - echo " -p, --packages PACKAGES Space-separated list of packages to install via pip." - echo " Default: 'attrs scikit-learn torch evaluate transformers timm wandb grain'." echo " -c, --cuda-devices DEVICES CUDA devices to use. Default: '0,1,2,3,4,5,6,7'." echo " -t, --test-files FILES Pattern for test files to run." echo " Default: '*_test.py'." @@ -28,7 +26,7 @@ TEST_FILES=() OUTPUT_DIRECTORY='' # Parse args -args=$(getopt -o d:p:c:t:o:h --long directory:,packages:,cuda-devices:,test-files:,output:,help -- "$@") +args=$(getopt -o d:p:c:t:o:h --long directory:,cuda-devices:,test-files:,output:,help -- "$@") if [ $? 
-ne 0 ]; then usage exit 1 @@ -42,10 +40,6 @@ while true; do DIR="$2" shift 2 ;; - -p|--packages) - PACKAGES="$2" - shift 2 - ;; -c|--cuda-devices) CUDA_DEVICES="$2" shift 2 @@ -88,7 +82,6 @@ mkdir -p "${LOG_DIRECTORY}" # Print out config for sanity check echo "Configuration:" echo " Directory: $DIR" -echo " Packages: $PACKAGES" echo " CUDA Devices: $CUDA_DEVICES" if [ "${#TEST_FILES[@]}" -gt 0 ]; then echo " Test Files:" @@ -99,8 +92,7 @@ else echo " Test Files Pattern: '*_test.py' (default)" fi echo " Output Directory: $OUTPUT_DIRECTORY" -echo "" - +echo "" cd "$DIR" || exit 1 From b9e893c52aa97c93903eae7fe8ec2cff30eae61f Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 10 Feb 2025 15:33:17 +0000 Subject: [PATCH 10/89] remove the nvidia-cunn package --- .github/container/Dockerfile.axlearn | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/container/Dockerfile.axlearn b/.github/container/Dockerfile.axlearn index c09b2c08c..6a0d3ac76 100644 --- a/.github/container/Dockerfile.axlearn +++ b/.github/container/Dockerfile.axlearn @@ -38,7 +38,6 @@ transformers timm wandb grain -nvidia-cudnn-cu12==9.7.0.66 REQUIREMENTS EOF From 4088e2ffaaf659414033f0013e30445cb5e4eeb8 Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 10 Feb 2025 18:37:46 +0000 Subject: [PATCH 11/89] fix input for tests --- .github/workflows/_ci.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 514545edb..3ea68ced4 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -703,7 +703,7 @@ jobs: docker run -i --shm-size=1g --gpus all \ ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }} \ bash <<"EOF" |& tee test-backend-independent.log - test-axlearn.sh --directory $pwd --output /opt/output/output.log --test-files /opt/axlearn/axlearn/common/*_test.py + test-axlearn.sh --directory "." 
--output "/opt/output/" --test-files "/opt/axlearn/axlearn/common/*_test.py" EOF STATISTICS_SCRIPT: | echo "Todo" From 15781cbed8854ea86f100275a750041453da5cd2 Mon Sep 17 00:00:00 2001 From: Steboss Date: Tue, 11 Feb 2025 12:03:48 +0000 Subject: [PATCH 12/89] fix test and create output --- .github/container/test-axlearn.sh | 19 ++----------------- .github/workflows/_ci.yaml | 10 +++++++++- 2 files changed, 11 insertions(+), 18 deletions(-) diff --git a/.github/container/test-axlearn.sh b/.github/container/test-axlearn.sh index 2ffda9c4d..57b9d7080 100755 --- a/.github/container/test-axlearn.sh +++ b/.github/container/test-axlearn.sh @@ -162,19 +162,17 @@ for test_file in "${expanded_test_files[@]}"; do done # Initialize counters -errors=0 failures=0 passed=0 SUMMARY_FILE="${OUTPUT_DIRECTORY}/summary.txt" -for test_file in "${final_test_files[@]:0:10}"; do +for test_file in "${final_test_files[@]}"; do echo "Running: ${test_file}" # Ensure the test file exists if [ ! -f "${test_file}" ]; then echo "${test_file}: NOT FOUND" >> "${SUMMARY_FILE}" echo "Test file not found: ${test_file}" - ((errors++)) continue fi log_file_name=$(echo "${test_file%.py}" | sed 's/\//__/g').log @@ -194,18 +192,5 @@ for test_file in "${final_test_files[@]:0:10}"; do echo "" done -echo $errors echo $passed -echo $failures - -# e.g. 
of output summary -#/opt/axlearn/axlearn/common/adapter_flax_test.py: PASSED -#/opt/axlearn/axlearn/common/attention_bias_test.py: PASSED -#/opt/axlearn/axlearn/common/bert_test.py: FAILED (Exit code: 1) -#/opt/axlearn/axlearn/common/causal_lm_test.py: FAILED (Exit code: 1) -#/opt/axlearn/axlearn/common/checkpointer_orbax_test.py: PASSED -#/opt/axlearn/axlearn/common/checkpointer_test.py: PASSED -#/opt/axlearn/axlearn/common/compiler_options_test.py: PASSED -#/opt/axlearn/axlearn/common/config_test.py: PASSED -#/opt/axlearn/axlearn/common/conformer_test.py: FAILED (Exit code: 1) -#/opt/axlearn/axlearn/common/convolution_test.py: FAILED (Exit code: 1) +echo $failures \ No newline at end of file diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 3ea68ced4..563a1aa2c 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -706,7 +706,15 @@ jobs: test-axlearn.sh --directory "." --output "/opt/output/" --test-files "/opt/axlearn/axlearn/common/*_test.py" EOF STATISTICS_SCRIPT: | - echo "Todo" + # Parse the summary.txt file to count passed/failed/error tests + # Adjust greps if your output format changes. 
+ passed_tests=$(grep -c ": PASSED" /opt/output/summary.txt || true) + failed_tests=$(grep -c ": FAILED" /opt/output/summary.txt || true) + total_tests=$((failed_tests + passed_tests)) + + echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT ARTIFACTS: | test-backend-independent.log secrets: inherit \ No newline at end of file From 031cfb0119adfcbfeb488a548611e682f52f5dc0 Mon Sep 17 00:00:00 2001 From: Steboss Date: Tue, 11 Feb 2025 16:18:55 +0000 Subject: [PATCH 13/89] fix requirements --- .github/container/Dockerfile.axlearn | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/.github/container/Dockerfile.axlearn b/.github/container/Dockerfile.axlearn index 6a0d3ac76..88cbc458c 100644 --- a/.github/container/Dockerfile.axlearn +++ b/.github/container/Dockerfile.axlearn @@ -23,21 +23,7 @@ nltk==3.7 portpicker==1.6.0 seqio==0.0.18 protobuf==3.20.3 -tensorflow==2.18.0 -tensorflow-datasets==4.9.7 -tensorflow-io==0.37.1 -tensorflow-io-gcs-filesystem==0.37.1 -tensorflow-metadata==1.13.1 -tensorflow-probability==0.24.0 -tensorflow-text==2.18.1 pytest>=7.4.3 -scikit-learn -torch -evaluate -transformers -timm -wandb -grain REQUIREMENTS EOF From 1fce714bbe35fbe3257d0a4a710643fcd666e7fc Mon Sep 17 00:00:00 2001 From: Steboss Date: Thu, 13 Feb 2025 12:59:13 +0000 Subject: [PATCH 14/89] setup for running axlearn tests on k8s --- .../axlearn/axlearn-job.yml | 63 ++++++++++ .../axlearn/axlearn-postprocess-job.yml | 45 +++++++ .github/workflows/_ci.yaml | 119 ++++++++++++++---- 3 files changed, 202 insertions(+), 25 deletions(-) create mode 100644 .github/eks-workflow-files/axlearn/axlearn-job.yml create mode 100644 .github/eks-workflow-files/axlearn/axlearn-postprocess-job.yml diff --git a/.github/eks-workflow-files/axlearn/axlearn-job.yml b/.github/eks-workflow-files/axlearn/axlearn-job.yml new file mode 100644 index 000000000..e5fbca44f --- /dev/null +++ 
b/.github/eks-workflow-files/axlearn/axlearn-job.yml @@ -0,0 +1,63 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: PLACEHOLDER + labels: + kueue.x-k8s.io/queue-name: p5-queue +spec: + completions: 1 + parallelism: 1 + template: + spec: + restartPolicy: Never + containers: + - name: axlearn + image: PLACEHOLDER + command: + - bash + - -exo + - pipefail + - -c + - | + # Example test command; adapted from your Docker run snippet + # Writes logs to /opt/output/test-backend-independent.log + # Also writes a summary file to /opt/output/summary.txt + test-axlearn.sh \ + --directory "." \ + --output "/opt/output/" \ + --test-files "/opt/axlearn/axlearn/common/*_test.py" + + # Wait a moment to ensure logs are flushed + sync + resources: + limits: + nvidia.com/gpu: 8 + volumeMounts: + - name: output + mountPath: /opt/output + + - name: upload + image: amazon/aws-cli + command: + - sh + - -c + - | + # Wait for the summary file to appear + while [ ! -f /opt/output/summary.txt ]; do + sleep 1 + done + # Also wait for the main log + while [ ! 
-f /opt/output/test-backend-independent.log ]; do + sleep 1 + done + # Now upload to your S3 bucket + aws s3 cp /opt/output/summary.txt s3://jax-toolbox-eks-output/ + aws s3 cp /opt/output/test-backend-independent.log s3://jax-toolbox-eks-output/ + volumeMounts: + - name: output + mountPath: /opt/output + imagePullSecrets: + - name: PLACEHOLDER + volumes: + - name: output + emptyDir: {} diff --git a/.github/eks-workflow-files/axlearn/axlearn-postprocess-job.yml b/.github/eks-workflow-files/axlearn/axlearn-postprocess-job.yml new file mode 100644 index 000000000..b6404a559 --- /dev/null +++ b/.github/eks-workflow-files/axlearn/axlearn-postprocess-job.yml @@ -0,0 +1,45 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: PLACEHOLDER +spec: + template: + spec: + restartPolicy: Never + initContainers: + - name: download + image: amazon/aws-cli + command: + - sh + - -c + - | + aws s3 cp s3://jax-toolbox-eks-output/summary.txt /opt/output/ + aws s3 cp s3://jax-toolbox-eks-output/test-backend-independent.log /opt/output/ + volumeMounts: + - mountPath: /opt/output + name: output + containers: + - name: parse-axlearn + image: ubuntu:22.04 + command: + - bash + - -exo + - pipefail + - -c + - | + if [ ! -f /opt/output/summary.txt ]; then + echo "summary.txt not found!" 
+ exit 1 + fi + + passed_tests=$(grep -c ": PASSED" /opt/output/summary.txt || true) + failed_tests=$(grep -c ": FAILED" /opt/output/summary.txt || true) + total_tests=$((failed_tests + passed_tests)) + volumeMounts: + - mountPath: /opt/output + name: output + imagePullSecrets: + - name: PLACEHOLDER + volumes: + - name: output + emptyDir: {} diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 563a1aa2c..9686170ce 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -693,28 +693,97 @@ jobs: # MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }} # secrets: inherit - test-axlearn: - needs: build-axlearn - if: inputs.ARCHITECTURE == 'amd64' - uses: ./.github/workflows/_test_unit.yaml - with: # fix the arguments below - TEST_NAME: axlearn - EXECUTE: | - docker run -i --shm-size=1g --gpus all \ - ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee test-backend-independent.log - test-axlearn.sh --directory "." --output "/opt/output/" --test-files "/opt/axlearn/axlearn/common/*_test.py" - EOF - STATISTICS_SCRIPT: | - # Parse the summary.txt file to count passed/failed/error tests - # Adjust greps if your output format changes. 
- passed_tests=$(grep -c ": PASSED" /opt/output/summary.txt || true) - failed_tests=$(grep -c ": FAILED" /opt/output/summary.txt || true) - total_tests=$((failed_tests + passed_tests)) - - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - ARTIFACTS: | - test-backend-independent.log - secrets: inherit \ No newline at end of file + # test-axlearn-slurm: + # needs: build-axlearn + # if: inputs.ARCHITECTURE == 'amd64' + # uses: ./.github/workflows/_test_unit.yaml + # with: # fix the arguments below + # TEST_NAME: axlearn + # EXECUTE: | + # docker run -i --shm-size=1g --gpus all \ + # ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-backend-independent.log + # test-axlearn.sh --directory "." --output "/opt/output/" --test-files "/opt/axlearn/axlearn/common/*_test.py" + # EOF + # STATISTICS_SCRIPT: | + # # Parse the summary.txt file to count passed/failed/error tests + # # Adjust greps if your output format changes. 
+ # passed_tests=$(grep -c ": PASSED" /opt/output/summary.txt || true) + # failed_tests=$(grep -c ": FAILED" /opt/output/summary.txt || true) + # total_tests=$((failed_tests + passed_tests)) + + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # ARTIFACTS: | + # test-backend-independent.log + # secrets: inherit + +# TODO WE CAN CREATE A RESUABLE ACTION HERE +# FIX everything with env.something +test-axlearn-eks: + needs: build-axlearn + if: inputs.ARCHITECTURE == 'amd64' + runs-on: eks + env: + AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }} + JOB_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-axlearn + POSTPROCESS_JOB_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-axlearn-postprocess + TOKEN_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-token + steps: + - name: Check out the repository + uses: actions/checkout@v4 + - name: Login to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + - name: Store GitHub Container Registry token as Kubernetes secret + run: | + kubectl create secret generic \ + ${{ github.run_id }}-${{ github.run_attempt }}-token \ + --from-file=.dockerconfigjson=$HOME/.docker/config.json \ + --type=kubernetes.io/dockerconfigjson + - name: Configure axlearn test job + run: | + # Replace placeholders in axlearn-job.yml with environment variables + yq -i ea 'select(di == 0).metadata.name = strenv(JOB_NAME) + | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE) + | select(di == 0).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)' \ + .github/eks-workflow-files/axlearn-job.yml + git diff .github/eks-workflow-files/axlearn-job.yml + - name: Submit axlearn test job + run: kubectl apply -f 
.github/eks-workflow-files/axlearn/axlearn-job.yml + - name: Wait for axlearn test job to start + run: | + while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ github.run_id }}-${{ github.run_attempt }}-axlearn --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do + sleep 10 + done + - name: Stream axlearn test job output + run: kubectl logs --all-containers=true --all-pods=true --follow job/${{ github.run_id }}-${{ github.run_attempt }}-axlearn + - name: Delete axlearn test job + if: always() + run: kubectl delete job ${{ env.JOB_NAME }} + - name: Configure axlearn post-processing job + run: | + yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME) + | .spec.template.spec.containers[].imagePullSecrets[].name = strenv(TOKEN_NAME) + ' \ + .github/eks-workflow-files/axlearn/axlearn-postprocess-job.yml + git diff .github/eks-workflow-files/axlearn-postprocess-job.yml + - name: Submit axlearn post-processing job + run: kubectl apply -f .github/eks-workflow-files/axlearn-postprocess-job.yml + - name: Wait for axlearn post-processing job to start + run: | + while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ github.run_id }}-${{ github.run_attempt }}-axlearn-postprocess --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do + sleep 10 + done + - name: Stream axlearn post-processing job output + run: kubectl logs --all-containers=true --all-pods=true --follow job/${{ github.run_id }}-${{ github.run_attempt }}-axlearn-postprocess + - name: Delete axlearn post-processing job + if: always() + run: kubectl delete job ${{ github.run_id }}-${{ github.run_attempt }}-axlearn-postprocess + - name: Delete GitHub Container Registry token + if: always() + run: kubectl delete secret ${{ github.run_id }}-${{ github.run_attempt }}-token From 89d388ca1237ea599be0b1b2671670418ccf0067 Mon Sep 17 00:00:00 2001 From: Steboss Date: Thu, 13 Feb 2025 13:05:19 +0000 Subject: [PATCH 15/89] fix 
indentation --- .github/workflows/_ci.yaml | 136 ++++++++++++++++++------------------- 1 file changed, 68 insertions(+), 68 deletions(-) diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 9686170ce..6e277fcec 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -719,71 +719,71 @@ jobs: # test-backend-independent.log # secrets: inherit -# TODO WE CAN CREATE A RESUABLE ACTION HERE -# FIX everything with env.something -test-axlearn-eks: - needs: build-axlearn - if: inputs.ARCHITECTURE == 'amd64' - runs-on: eks - env: - AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }} - JOB_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-axlearn - POSTPROCESS_JOB_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-axlearn-postprocess - TOKEN_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-token - steps: - - name: Check out the repository - uses: actions/checkout@v4 - - name: Login to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} - - name: Store GitHub Container Registry token as Kubernetes secret - run: | - kubectl create secret generic \ - ${{ github.run_id }}-${{ github.run_attempt }}-token \ - --from-file=.dockerconfigjson=$HOME/.docker/config.json \ - --type=kubernetes.io/dockerconfigjson - - name: Configure axlearn test job - run: | - # Replace placeholders in axlearn-job.yml with environment variables - yq -i ea 'select(di == 0).metadata.name = strenv(JOB_NAME) - | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE) - | select(di == 0).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)' \ - .github/eks-workflow-files/axlearn-job.yml - git diff .github/eks-workflow-files/axlearn-job.yml - - name: Submit axlearn test job - run: kubectl apply -f .github/eks-workflow-files/axlearn/axlearn-job.yml - - name: Wait for axlearn test job to 
start - run: | - while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ github.run_id }}-${{ github.run_attempt }}-axlearn --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do - sleep 10 - done - - name: Stream axlearn test job output - run: kubectl logs --all-containers=true --all-pods=true --follow job/${{ github.run_id }}-${{ github.run_attempt }}-axlearn - - name: Delete axlearn test job - if: always() - run: kubectl delete job ${{ env.JOB_NAME }} - - name: Configure axlearn post-processing job - run: | - yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME) - | .spec.template.spec.containers[].imagePullSecrets[].name = strenv(TOKEN_NAME) - ' \ - .github/eks-workflow-files/axlearn/axlearn-postprocess-job.yml - git diff .github/eks-workflow-files/axlearn-postprocess-job.yml - - name: Submit axlearn post-processing job - run: kubectl apply -f .github/eks-workflow-files/axlearn-postprocess-job.yml - - name: Wait for axlearn post-processing job to start - run: | - while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ github.run_id }}-${{ github.run_attempt }}-axlearn-postprocess --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do - sleep 10 - done - - name: Stream axlearn post-processing job output - run: kubectl logs --all-containers=true --all-pods=true --follow job/${{ github.run_id }}-${{ github.run_attempt }}-axlearn-postprocess - - name: Delete axlearn post-processing job - if: always() - run: kubectl delete job ${{ github.run_id }}-${{ github.run_attempt }}-axlearn-postprocess - - name: Delete GitHub Container Registry token - if: always() - run: kubectl delete secret ${{ github.run_id }}-${{ github.run_attempt }}-token + # TODO WE CAN CREATE A RESUABLE ACTION HERE + # FIX everything with env.something + test-axlearn-eks: + needs: build-axlearn + if: inputs.ARCHITECTURE == 'amd64' + runs-on: eks + env: + AXLEARN_DOCKER_IMAGE: ${{ 
needs.build-axlearn.outputs.DOCKER_TAG_FINAL }} + JOB_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-axlearn + POSTPROCESS_JOB_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-axlearn-postprocess + TOKEN_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-token + steps: + - name: Check out the repository + uses: actions/checkout@v4 + - name: Login to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + - name: Store GitHub Container Registry token as Kubernetes secret + run: | + kubectl create secret generic \ + ${{ github.run_id }}-${{ github.run_attempt }}-token \ + --from-file=.dockerconfigjson=$HOME/.docker/config.json \ + --type=kubernetes.io/dockerconfigjson + - name: Configure axlearn test job + run: | + # Replace placeholders in axlearn-job.yml with environment variables + yq -i ea 'select(di == 0).metadata.name = strenv(JOB_NAME) + | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE) + | select(di == 0).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)' \ + .github/eks-workflow-files/axlearn-job.yml + git diff .github/eks-workflow-files/axlearn-job.yml + - name: Submit axlearn test job + run: kubectl apply -f .github/eks-workflow-files/axlearn/axlearn-job.yml + - name: Wait for axlearn test job to start + run: | + while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ github.run_id }}-${{ github.run_attempt }}-axlearn --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do + sleep 10 + done + - name: Stream axlearn test job output + run: kubectl logs --all-containers=true --all-pods=true --follow job/${{ github.run_id }}-${{ github.run_attempt }}-axlearn + - name: Delete axlearn test job + if: always() + run: kubectl delete job ${{ env.JOB_NAME }} + - name: Configure axlearn post-processing job + run: | + yq -i '.metadata.name = 
strenv(POSTPROCESS_JOB_NAME) + | .spec.template.spec.containers[].imagePullSecrets[].name = strenv(TOKEN_NAME) + ' \ + .github/eks-workflow-files/axlearn/axlearn-postprocess-job.yml + git diff .github/eks-workflow-files/axlearn-postprocess-job.yml + - name: Submit axlearn post-processing job + run: kubectl apply -f .github/eks-workflow-files/axlearn-postprocess-job.yml + - name: Wait for axlearn post-processing job to start + run: | + while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ github.run_id }}-${{ github.run_attempt }}-axlearn-postprocess --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do + sleep 10 + done + - name: Stream axlearn post-processing job output + run: kubectl logs --all-containers=true --all-pods=true --follow job/${{ github.run_id }}-${{ github.run_attempt }}-axlearn-postprocess + - name: Delete axlearn post-processing job + if: always() + run: kubectl delete job ${{ github.run_id }}-${{ github.run_attempt }}-axlearn-postprocess + - name: Delete GitHub Container Registry token + if: always() + run: kubectl delete secret ${{ github.run_id }}-${{ github.run_attempt }}-token From 6c47cf57b372fd16ea3b9bcf7333facd46c4e4b3 Mon Sep 17 00:00:00 2001 From: Steboss Date: Thu, 13 Feb 2025 14:25:00 +0000 Subject: [PATCH 16/89] what an error --- .github/workflows/_ci.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 6e277fcec..c3e0b3b2c 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -751,8 +751,8 @@ jobs: yq -i ea 'select(di == 0).metadata.name = strenv(JOB_NAME) | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE) | select(di == 0).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)' \ - .github/eks-workflow-files/axlearn-job.yml - git diff .github/eks-workflow-files/axlearn-job.yml + .github/eks-workflow-files/axlearn/axlearn-job.yml + 
git diff .github/eks-workflow-files/axlearn/axlearn-job.yml - name: Submit axlearn test job run: kubectl apply -f .github/eks-workflow-files/axlearn/axlearn-job.yml - name: Wait for axlearn test job to start @@ -771,9 +771,9 @@ jobs: | .spec.template.spec.containers[].imagePullSecrets[].name = strenv(TOKEN_NAME) ' \ .github/eks-workflow-files/axlearn/axlearn-postprocess-job.yml - git diff .github/eks-workflow-files/axlearn-postprocess-job.yml + git diff .github/eks-workflow-files/axlearn/axlearn-postprocess-job.yml - name: Submit axlearn post-processing job - run: kubectl apply -f .github/eks-workflow-files/axlearn-postprocess-job.yml + run: kubectl apply -f .github/eks-workflow-files/axlearn/axlearn-postprocess-job.yml - name: Wait for axlearn post-processing job to start run: | while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ github.run_id }}-${{ github.run_attempt }}-axlearn-postprocess --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do From 44ffb6ef2c6e34db359bfd72573e13ab1967c81a Mon Sep 17 00:00:00 2001 From: Steboss Date: Thu, 13 Feb 2025 16:01:44 +0000 Subject: [PATCH 17/89] add the k8s option --- .github/container/test-axlearn.sh | 53 +++++++++++++------ .../axlearn/axlearn-job.yml | 3 +- 2 files changed, 39 insertions(+), 17 deletions(-) diff --git a/.github/container/test-axlearn.sh b/.github/container/test-axlearn.sh index 57b9d7080..a82cdefe1 100755 --- a/.github/container/test-axlearn.sh +++ b/.github/container/test-axlearn.sh @@ -15,6 +15,7 @@ usage() { echo " Default: '*_test.py'." echo " -o, --output DIRECTORY Output directory for logs and summary." echo " Default: 'test_runs/'." + echo " -k, --k8s Whether to run on a Kubernetes cluster." echo " -h, --help Show this help message and exit." 
exit 1 } @@ -24,45 +25,57 @@ DIR='axlearn/axlearn/common' CUDA_DEVICES='0,1,2,3,4,5,6,7' TEST_FILES=() OUTPUT_DIRECTORY='' +K8S=false -# Parse args -args=$(getopt -o d:p:c:t:o:h --long directory:,cuda-devices:,test-files:,output:,help -- "$@") -if [ $? -ne 0 ]; then - usage - exit 1 -fi - -eval set -- "$args" - -while true; do - case "$1" in +# Parse args manually +while [[ $# -gt 0 ]]; do + key="$1" + case $key in -d|--directory) + if [[ -z "$2" ]]; then + echo "Error: --directory requires an argument." + usage + fi DIR="$2" shift 2 ;; -c|--cuda-devices) + if [[ -z "$2" ]]; then + echo "Error: --cuda-devices requires an argument." + usage + fi CUDA_DEVICES="$2" shift 2 ;; -t|--test-files) shift # Collect all arguments until the next option (starting with '-') + if [[ $# -eq 0 ]]; then + echo "Error: --test-files requires at least one file pattern." + usage + fi + echo "Option -t|--test-files with arguments:" while [[ $# -gt 0 && ! "$1" =~ ^- ]]; do + echo " $1" TEST_FILES+=("$1") shift done ;; -o|--output) + if [[ -z "$2" ]]; then + echo "Error: --output requires an argument." + usage + fi OUTPUT_DIRECTORY="$2" shift 2 ;; + -k|--k8s) + K8S=true + shift + ;; -h|--help) usage ;; - --) - shift - break - ;; *) echo "Unknown option: $1" usage @@ -70,7 +83,7 @@ while true; do esac done -# TODO double check what's the best choice + if [ -z "$OUTPUT_DIRECTORY" ]; then timestamp=$(date +%Y%m%d_%H%M%S) OUTPUT_DIRECTORY="test_runs/${timestamp}" @@ -92,8 +105,10 @@ else echo " Test Files Pattern: '*_test.py' (default)" fi echo " Output Directory: $OUTPUT_DIRECTORY" +echo " Kubernetes mode: $K8S" echo "" + cd "$DIR" || exit 1 # Set CUDA devices @@ -102,6 +117,12 @@ echo "Using CUDA devices: $CUDA_VISIBLE_DEVICES" echo "Running tests..." +# If we are on Kubernetes, install torch +if [ "$K8S" = true ]; then + echo "K8S mode is true. Installing torch..." 
+ pip install torch +fi + if [ "${#TEST_FILES[@]}" -eq 0 ]; then TEST_FILES=("*_test.py") fi diff --git a/.github/eks-workflow-files/axlearn/axlearn-job.yml b/.github/eks-workflow-files/axlearn/axlearn-job.yml index e5fbca44f..eb75faad5 100644 --- a/.github/eks-workflow-files/axlearn/axlearn-job.yml +++ b/.github/eks-workflow-files/axlearn/axlearn-job.yml @@ -25,7 +25,8 @@ spec: test-axlearn.sh \ --directory "." \ --output "/opt/output/" \ - --test-files "/opt/axlearn/axlearn/common/*_test.py" + --test-files "/opt/axlearn/axlearn/common/*_test.py" \ + --k8s # Wait a moment to ensure logs are flushed sync From a926bf99cc216d3d00df650dd28509b55e7d03fc Mon Sep 17 00:00:00 2001 From: Steboss Date: Thu, 13 Feb 2025 18:57:58 +0000 Subject: [PATCH 18/89] try a test with 5 files and avoid postprocessing on k8s --- .github/container/test-axlearn.sh | 8 +++-- .github/workflows/_ci.yaml | 59 +++++++++++++++++++------------ 2 files changed, 42 insertions(+), 25 deletions(-) diff --git a/.github/container/test-axlearn.sh b/.github/container/test-axlearn.sh index a82cdefe1..9d7ec05ed 100755 --- a/.github/container/test-axlearn.sh +++ b/.github/container/test-axlearn.sh @@ -119,8 +119,10 @@ echo "Running tests..." # If we are on Kubernetes, install torch if [ "$K8S" = true ]; then - echo "K8S mode is true. Installing torch..." 
- pip install torch + uname -a + python --version + #pip install torch # install cpu version + #nvidia-cudnn-cu12==9.7.0.66 fi if [ "${#TEST_FILES[@]}" -eq 0 ]; then @@ -168,7 +170,7 @@ done # we are skipping some tests as there's still wip by Apple final_test_files=() -for test_file in "${expanded_test_files[@]}"; do +for test_file in "${expanded_test_files[@]:0:5}"; do exclude=false #echo $test_file for pattern in "${EXCLUDE_PATTERNS[@]}"; do diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index c3e0b3b2c..cd047662c 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -742,7 +742,7 @@ jobs: - name: Store GitHub Container Registry token as Kubernetes secret run: | kubectl create secret generic \ - ${{ github.run_id }}-${{ github.run_attempt }}-token \ + ${{ env.TOKEN_NAME }} \ --from-file=.dockerconfigjson=$HOME/.docker/config.json \ --type=kubernetes.io/dockerconfigjson - name: Configure axlearn test job @@ -757,33 +757,48 @@ jobs: run: kubectl apply -f .github/eks-workflow-files/axlearn/axlearn-job.yml - name: Wait for axlearn test job to start run: | - while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ github.run_id }}-${{ github.run_attempt }}-axlearn --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do + while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ env.JOB_NAME }} --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do sleep 10 done - name: Stream axlearn test job output - run: kubectl logs --all-containers=true --all-pods=true --follow job/${{ github.run_id }}-${{ github.run_attempt }}-axlearn + run: kubectl logs --all-containers=true --all-pods=true --follow job/${{ env.JOB_NAME }} - name: Delete axlearn test job if: always() run: kubectl delete job ${{ env.JOB_NAME }} - - name: Configure axlearn post-processing job + - name: Download logs from S3 run: | - yq -i '.metadata.name = 
strenv(POSTPROCESS_JOB_NAME) - | .spec.template.spec.containers[].imagePullSecrets[].name = strenv(TOKEN_NAME) - ' \ - .github/eks-workflow-files/axlearn/axlearn-postprocess-job.yml - git diff .github/eks-workflow-files/axlearn/axlearn-postprocess-job.yml - - name: Submit axlearn post-processing job - run: kubectl apply -f .github/eks-workflow-files/axlearn/axlearn-postprocess-job.yml - - name: Wait for axlearn post-processing job to start - run: | - while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ github.run_id }}-${{ github.run_attempt }}-axlearn-postprocess --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do - sleep 10 - done - - name: Stream axlearn post-processing job output - run: kubectl logs --all-containers=true --all-pods=true --follow job/${{ github.run_id }}-${{ github.run_attempt }}-axlearn-postprocess - - name: Delete axlearn post-processing job - if: always() - run: kubectl delete job ${{ github.run_id }}-${{ github.run_attempt }}-axlearn-postprocess + mkdir -p /tmp/axlearn-output + aws s3 cp s3://jax-toolbox-eks-output/summary.txt /tmp/axlearn-output/ + aws s3 cp s3://jax-toolbox-eks-output/test-backend-independent.log /tmp/axlearn-output/ + + passed_tests=$(grep -c ": PASSED" /tmp/axlearn-output/summary.txt || true) + failed_tests=$(grep -c ": FAILED" /tmp/axlearn-output/summary.txt || true) + total_tests=$((failed_tests + passed_tests)) + + echo "Passed tests: $passed_tests" + echo "Failed tests: $failed_tests" + echo "Total tests: $total_tests" + # - name: Configure axlearn post-processing job + # run: | + # yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME) + # | .spec.template.spec.containers[].imagePullSecrets[].name = strenv(TOKEN_NAME) + # ' \ + # .github/eks-workflow-files/axlearn/axlearn-postprocess-job.yml + # git diff .github/eks-workflow-files/axlearn/axlearn-postprocess-job.yml + # - name: Submit axlearn post-processing job + # run: kubectl apply -f 
.github/eks-workflow-files/axlearn/axlearn-postprocess-job.yml + # - name: Wait for axlearn post-processing job to start + # run: | + # while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ env.POSTPROCESS_JOB_NAME }} --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do + # sleep 10 + # done + # - name: Stream axlearn post-processing job output + # run: kubectl logs --all-containers=true --all-pods=true --follow job/${{ env.POSTPROCESS_JOB_NAME }} + # - name: Delete axlearn post-processing job + # if: always() + # run: kubectl delete job ${{ env.POSTPROCESS_JOB_NAME }} + # TODO upload aritfacts to github - name: Delete GitHub Container Registry token if: always() - run: kubectl delete secret ${{ github.run_id }}-${{ github.run_attempt }}-token + run: kubectl delete secret ${{ env.TOKEN }} + From 6349f445b32ea3aae0c2e8e16aa422a201c24c6b Mon Sep 17 00:00:00 2001 From: Steboss Date: Fri, 14 Feb 2025 12:03:09 +0000 Subject: [PATCH 19/89] fix test --- .github/container/test-axlearn.sh | 10 +-- .../axlearn/axlearn-job.yml | 11 +-- .github/workflows/_ci.yaml | 78 +++++++++++++------ 3 files changed, 62 insertions(+), 37 deletions(-) diff --git a/.github/container/test-axlearn.sh b/.github/container/test-axlearn.sh index 9d7ec05ed..e8e87a10b 100755 --- a/.github/container/test-axlearn.sh +++ b/.github/container/test-axlearn.sh @@ -119,9 +119,7 @@ echo "Running tests..." 
# If we are on Kubernetes, install torch if [ "$K8S" = true ]; then - uname -a - python --version - #pip install torch # install cpu version + pip install torch==2.6.0+cpu.cxx11.abi-cp312-cp312-linux_x86_64.whl --index-url https://download.pytorch.org/whl/torch/ #nvidia-cudnn-cu12==9.7.0.66 fi @@ -166,11 +164,9 @@ for pattern in "${EXCLUDE_PATTERNS[@]}"; do echo "$pattern" done -#expanded_test_files=( "${expanded_test_files[@]:0:10}" ) -# we are skipping some tests as there's still wip by Apple final_test_files=() -for test_file in "${expanded_test_files[@]:0:5}"; do +for test_file in "${expanded_test_files[@]}"; do exclude=false #echo $test_file for pattern in "${EXCLUDE_PATTERNS[@]}"; do @@ -190,7 +186,7 @@ passed=0 SUMMARY_FILE="${OUTPUT_DIRECTORY}/summary.txt" -for test_file in "${final_test_files[@]}"; do +for test_file in "${final_test_files[@]:0:5}"; do echo "Running: ${test_file}" # Ensure the test file exists if [ ! -f "${test_file}" ]; then diff --git a/.github/eks-workflow-files/axlearn/axlearn-job.yml b/.github/eks-workflow-files/axlearn/axlearn-job.yml index eb75faad5..56183f35a 100644 --- a/.github/eks-workflow-files/axlearn/axlearn-job.yml +++ b/.github/eks-workflow-files/axlearn/axlearn-job.yml @@ -36,9 +36,11 @@ spec: volumeMounts: - name: output mountPath: /opt/output - - name: upload image: amazon/aws-cli + env: + - name: TEST_DATE + value: PLACEHOLDER command: - sh - -c @@ -47,13 +49,8 @@ spec: while [ ! -f /opt/output/summary.txt ]; do sleep 1 done - # Also wait for the main log - while [ ! 
-f /opt/output/test-backend-independent.log ]; do - sleep 1 - done # Now upload to your S3 bucket - aws s3 cp /opt/output/summary.txt s3://jax-toolbox-eks-output/ - aws s3 cp /opt/output/test-backend-independent.log s3://jax-toolbox-eks-output/ + aws s3 cp /opt/output/summary.txt s3://jax-toolbox-eks-output/axlearn/${TEST_DATE}/summary.txt volumeMounts: - name: output mountPath: /opt/output diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index cd047662c..d4e6097cd 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -745,11 +745,16 @@ jobs: ${{ env.TOKEN_NAME }} \ --from-file=.dockerconfigjson=$HOME/.docker/config.json \ --type=kubernetes.io/dockerconfigjson + - name: Set date environment variable + run: | + echo "DATE_TEST_RAN=$(date +'%Y-%m-%d-%H-%M-%S')" >> $GITHUB_ENV - name: Configure axlearn test job run: | # Replace placeholders in axlearn-job.yml with environment variables - yq -i ea 'select(di == 0).metadata.name = strenv(JOB_NAME) + yq -i ea ' + select(di == 0).metadata.name = strenv(JOB_NAME) | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE) + | select(di == 0).spec.template.spec.containers[1].env[0].value = strenv(DATE_TEST_RAN) | select(di == 0).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)' \ .github/eks-workflow-files/axlearn/axlearn-job.yml git diff .github/eks-workflow-files/axlearn/axlearn-job.yml @@ -768,8 +773,7 @@ jobs: - name: Download logs from S3 run: | mkdir -p /tmp/axlearn-output - aws s3 cp s3://jax-toolbox-eks-output/summary.txt /tmp/axlearn-output/ - aws s3 cp s3://jax-toolbox-eks-output/test-backend-independent.log /tmp/axlearn-output/ + aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ env.DATE_TEST_RAN }}/summary.txt /tmp/axlearn-output/ passed_tests=$(grep -c ": PASSED" /tmp/axlearn-output/summary.txt || true) failed_tests=$(grep -c ": FAILED" /tmp/axlearn-output/summary.txt || true) @@ -778,26 +782,54 @@ jobs: echo "Passed tests: 
$passed_tests" echo "Failed tests: $failed_tests" echo "Total tests: $total_tests" - # - name: Configure axlearn post-processing job - # run: | - # yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME) - # | .spec.template.spec.containers[].imagePullSecrets[].name = strenv(TOKEN_NAME) - # ' \ - # .github/eks-workflow-files/axlearn/axlearn-postprocess-job.yml - # git diff .github/eks-workflow-files/axlearn/axlearn-postprocess-job.yml - # - name: Submit axlearn post-processing job - # run: kubectl apply -f .github/eks-workflow-files/axlearn/axlearn-postprocess-job.yml - # - name: Wait for axlearn post-processing job to start - # run: | - # while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ env.POSTPROCESS_JOB_NAME }} --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do - # sleep 10 - # done - # - name: Stream axlearn post-processing job output - # run: kubectl logs --all-containers=true --all-pods=true --follow job/${{ env.POSTPROCESS_JOB_NAME }} - # - name: Delete axlearn post-processing job - # if: always() - # run: kubectl delete job ${{ env.POSTPROCESS_JOB_NAME }} - # TODO upload aritfacts to github + + echo "PASSED_TESTS=$passed_tests" >> $GITHUB_OUTPUT + echo "FAILED_TESTS=$failed_tests" >> $GITHUB_OUTPUT + echo "TOTAL_TESTS=$total_tests" >> $GITHUB_OUTPUT + + - name: Generate sitrep + id: sitrep + if: "!cancelled()" + shell: bash -x -e {0} + run: | + # bring in utility functions + source .github/workflows/scripts/to_json.sh + + badge_label='Axlearn EKS Unit' + + total_tests=${{ steps.test-stats.outputs.TOTAL_TESTS }} \ + failed_tests=${{ steps.test-stats.outputs.FAILED_TESTS }} \ + passed_tests=${{ steps.test-stats.outputs.PASSED_TESTS }} \ + errors="0" \ + summary="All tests: $total_tests. Passed: $passed_tests. Failed: $failed_tests." \ + badge_message="Passed $passed_tests out of $total_tests." 
\ + badge_color="brightgreen" + if [ "$failed_tests" -gt 0 ]; then + badge_color="red" + fi \ + + to_json \ + summary \ + errors total_tests passed_tests failed_tests \ + badge_label badge_color badge_message \ + > sitrep.json + + schemaVersion=1 \ + label="${badge_label}" \ + message="Passed $passed_tests out of $total_tests." \ + color=$badge_color \ + to_json schemaVersion label message color \ + > "badge-axlearn-test" + + - name: Upload artifacts + if: "!cancelled()" + uses: actions/upload-artifact@v4 + with: + name: "artifact-axlearn-test" + path: | + sitrep.json + "badge-axlearn-test" + summary.txt - name: Delete GitHub Container Registry token if: always() run: kubectl delete secret ${{ env.TOKEN }} From 004ed787e7c65f1c7fd21582d6c26a1ea9e92730 Mon Sep 17 00:00:00 2001 From: Steboss Date: Fri, 14 Feb 2025 13:18:44 +0000 Subject: [PATCH 20/89] remove postprocess --- .../axlearn/axlearn-postprocess-job.yml | 45 ------------------- 1 file changed, 45 deletions(-) delete mode 100644 .github/eks-workflow-files/axlearn/axlearn-postprocess-job.yml diff --git a/.github/eks-workflow-files/axlearn/axlearn-postprocess-job.yml b/.github/eks-workflow-files/axlearn/axlearn-postprocess-job.yml deleted file mode 100644 index b6404a559..000000000 --- a/.github/eks-workflow-files/axlearn/axlearn-postprocess-job.yml +++ /dev/null @@ -1,45 +0,0 @@ -apiVersion: batch/v1 -kind: Job -metadata: - name: PLACEHOLDER -spec: - template: - spec: - restartPolicy: Never - initContainers: - - name: download - image: amazon/aws-cli - command: - - sh - - -c - - | - aws s3 cp s3://jax-toolbox-eks-output/summary.txt /opt/output/ - aws s3 cp s3://jax-toolbox-eks-output/test-backend-independent.log /opt/output/ - volumeMounts: - - mountPath: /opt/output - name: output - containers: - - name: parse-axlearn - image: ubuntu:22.04 - command: - - bash - - -exo - - pipefail - - -c - - | - if [ ! -f /opt/output/summary.txt ]; then - echo "summary.txt not found!" 
- exit 1 - fi - - passed_tests=$(grep -c ": PASSED" /opt/output/summary.txt || true) - failed_tests=$(grep -c ": FAILED" /opt/output/summary.txt || true) - total_tests=$((failed_tests + passed_tests)) - volumeMounts: - - mountPath: /opt/output - name: output - imagePullSecrets: - - name: PLACEHOLDER - volumes: - - name: output - emptyDir: {} From cf858141c7b173fe13b84bd7ae92a1d2d47a981a Mon Sep 17 00:00:00 2001 From: Steboss Date: Fri, 14 Feb 2025 14:17:56 +0000 Subject: [PATCH 21/89] reusable actions test --- .../actions/checkout-ghcr-login/action.yml | 34 +++++++++++ .github/actions/delete-ghcr-token/action.yml | 16 ++++++ .github/actions/delete-k8s-job/action.yml | 15 +++++ .github/actions/submit-k8s-job/action.yml | 31 ++++++++++ .github/container/test-axlearn.sh | 31 +++------- .github/workflows/_ci.yaml | 56 ++++++++----------- 6 files changed, 127 insertions(+), 56 deletions(-) create mode 100644 .github/actions/checkout-ghcr-login/action.yml create mode 100644 .github/actions/delete-ghcr-token/action.yml create mode 100644 .github/actions/delete-k8s-job/action.yml create mode 100644 .github/actions/submit-k8s-job/action.yml diff --git a/.github/actions/checkout-ghcr-login/action.yml b/.github/actions/checkout-ghcr-login/action.yml new file mode 100644 index 000000000..a71a1be12 --- /dev/null +++ b/.github/actions/checkout-ghcr-login/action.yml @@ -0,0 +1,34 @@ +name: Checkout, GHCR login, K8s secret +description: Performs repository checkout, logs into GitHub Container Registry, and stores the token as a Kubernetes secret. 
+ +inputs: + docker-username: + description: Username for GHCR + required: true + docker-password: + description: Password (e.g., GITHUB_TOKEN) + required: true + token-name: + description: Name of the K8s secret to create + required: true + +runs: + using: "composite" + steps: + - name: Check out the repository + uses: actions/checkout@v4 + + - name: Login to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: "ghcr.io" + username: ${{ inputs.docker-username }} + password: ${{ inputs.docker-password }} + + - name: Store GitHub Container Registry token as Kubernetes secret + shell: bash + run: | + kubectl create secret generic \ + ${{ inputs.token-name }} \ + --from-file=.dockerconfigjson=$HOME/.docker/config.json \ + --type=kubernetes.io/dockerconfigjson diff --git a/.github/actions/delete-ghcr-token/action.yml b/.github/actions/delete-ghcr-token/action.yml new file mode 100644 index 000000000..0d90dd168 --- /dev/null +++ b/.github/actions/delete-ghcr-token/action.yml @@ -0,0 +1,16 @@ +name: Delete GHCR Token +description: Deletes the K8s secret used for pulling images from GHCR. + +inputs: + token-name: + description: Name of the K8s secret to delete + required: true + +runs: + using: "composite" + steps: + - name: Delete GitHub Container Registry token + shell: bash + if: always() + run: | + kubectl delete secret ${{ inputs.token-name }} diff --git a/.github/actions/delete-k8s-job/action.yml b/.github/actions/delete-k8s-job/action.yml new file mode 100644 index 000000000..cf8011fcc --- /dev/null +++ b/.github/actions/delete-k8s-job/action.yml @@ -0,0 +1,15 @@ +name: Delete K8s Job +description: Cleans up the Job resource to avoid leaving pods behind. 
+ +inputs: + job-name: + description: The job name to delete + required: true + +runs: + using: "composite" + steps: + - name: Delete Kubernetes job + if: always() + run: | + kubectl delete job ${{ inputs.job-name }} diff --git a/.github/actions/submit-k8s-job/action.yml b/.github/actions/submit-k8s-job/action.yml new file mode 100644 index 000000000..c00826897 --- /dev/null +++ b/.github/actions/submit-k8s-job/action.yml @@ -0,0 +1,31 @@ +name: Submit & Stream K8s Job +description: Submits a Kubernetes job and then streams its logs to GitHub Actions. + +inputs: + job-config-file: + description: Path to the Kubernetes job YAML + required: true + job-name: + description: The job name + required: true + +runs: + using: "composite" + steps: + - name: Submit Kubernetes job + shell: bash + run: | + kubectl apply -f "${{ inputs.job-config-file }}" + + - name: Wait for Kubernetes job to start + shell: bash + run: | + while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ inputs.job-name }} --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do + echo "Waiting for pods to start..." + sleep 10 + done + + - name: Stream Kubernetes job output + shell: bash + run: | + kubectl logs --all-containers=true --all-pods=true --follow job/${{ inputs.job-name }} diff --git a/.github/container/test-axlearn.sh b/.github/container/test-axlearn.sh index e8e87a10b..8cf2a94d6 100755 --- a/.github/container/test-axlearn.sh +++ b/.github/container/test-axlearn.sh @@ -117,18 +117,18 @@ echo "Using CUDA devices: $CUDA_VISIBLE_DEVICES" echo "Running tests..." 
-# If we are on Kubernetes, install torch +# If we are on Kubernetes, install torch for cpu only if [ "$K8S" = true ]; then - pip install torch==2.6.0+cpu.cxx11.abi-cp312-cp312-linux_x86_64.whl --index-url https://download.pytorch.org/whl/torch/ - #nvidia-cudnn-cu12==9.7.0.66 + pip install torch --extra-index-url https://download.pytorch.org/whl/cpu fi if [ "${#TEST_FILES[@]}" -eq 0 ]; then TEST_FILES=("*_test.py") fi + expanded_test_files=() for pattern in "${TEST_FILES[@]}"; do - # Use globbing to expand pattern + # retrieve all the files files=( $pattern ) if [ "${#files[@]}" -gt 0 ]; then expanded_test_files+=( "${files[@]}" ) @@ -137,19 +137,12 @@ for pattern in "${TEST_FILES[@]}"; do fi done - if [ "${#expanded_test_files[@]}" -eq 0 ]; then echo "No test files found to run." exit 1 fi -echo "These are the test files:" -for f in "${expanded_test_files[@]}"; do - echo " $f" -done - -# Get the directory where the script is located -#SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" +# in case we have the exclusion list file EXCLUDE_LIST_FILE="$DIR/exclusion_list.txt" EXCLUDE_PATTERNS=() @@ -159,16 +152,11 @@ if [ -f "$EXCLUDE_LIST_FILE" ]; then else echo "Exclusion list file not found at '$EXCLUDE_LIST_FILE'" fi -echo "Exclude patterns read:" -for pattern in "${EXCLUDE_PATTERNS[@]}"; do - echo "$pattern" -done final_test_files=() for test_file in "${expanded_test_files[@]}"; do exclude=false - #echo $test_file for pattern in "${EXCLUDE_PATTERNS[@]}"; do if [[ "$(basename "$test_file")" == "$(basename "$pattern")" ]]; then exclude=true @@ -180,7 +168,7 @@ for test_file in "${expanded_test_files[@]}"; do fi done -# Initialize counters +# Initialize counters for test failures=0 passed=0 SUMMARY_FILE="${OUTPUT_DIRECTORY}/summary.txt" @@ -198,9 +186,9 @@ for test_file in "${final_test_files[@]:0:5}"; do log_file="${LOG_DIRECTORY}/${log_file_name}" # run the tests and save them as *.log pytest "${test_file}" --capture=tee-sys | tee 
"${log_file}" - # TODO parse the logs? exit_code=${PIPESTATUS[0]} echo $exit_code + # write number of tests passed and failed if [ $exit_code -eq 0 ]; then echo "${test_file}: PASSED" >> "${SUMMARY_FILE}" ((passed++)) @@ -209,7 +197,4 @@ for test_file in "${final_test_files[@]:0:5}"; do ((failures++)) fi echo "" -done - -echo $passed -echo $failures \ No newline at end of file +done \ No newline at end of file diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index d4e6097cd..d36750e10 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -719,35 +719,25 @@ jobs: # test-backend-independent.log # secrets: inherit - # TODO WE CAN CREATE A RESUABLE ACTION HERE - # FIX everything with env.something + test-axlearn-eks: needs: build-axlearn if: inputs.ARCHITECTURE == 'amd64' runs-on: eks env: AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }} - JOB_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-axlearn - POSTPROCESS_JOB_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-axlearn-postprocess - TOKEN_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-token + JOB_NAME: axlearn-${{ github.run_id }} + TOKEN_NAME: axlearn-${{ github.run_id }}-token steps: - - name: Check out the repository - uses: actions/checkout@v4 - - name: Login to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} - - name: Store GitHub Container Registry token as Kubernetes secret - run: | - kubectl create secret generic \ - ${{ env.TOKEN_NAME }} \ - --from-file=.dockerconfigjson=$HOME/.docker/config.json \ - --type=kubernetes.io/dockerconfigjson - - name: Set date environment variable + - name: Set date env var for saving files run: | echo "DATE_TEST_RAN=$(date +'%Y-%m-%d-%H-%M-%S')" >> $GITHUB_ENV + - name: Check and GHCR Login + uses: /.github/actions/checkout-ghcr-login + with: + docker-username: ${{ 
github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + token-name: ${{ env.TOKEN_NAME }} - name: Configure axlearn test job run: | # Replace placeholders in axlearn-job.yml with environment variables @@ -758,18 +748,17 @@ jobs: | select(di == 0).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)' \ .github/eks-workflow-files/axlearn/axlearn-job.yml git diff .github/eks-workflow-files/axlearn/axlearn-job.yml - - name: Submit axlearn test job - run: kubectl apply -f .github/eks-workflow-files/axlearn/axlearn-job.yml - - name: Wait for axlearn test job to start - run: | - while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ env.JOB_NAME }} --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do - sleep 10 - done - - name: Stream axlearn test job output - run: kubectl logs --all-containers=true --all-pods=true --follow job/${{ env.JOB_NAME }} + - name: Submit & wait for axlearn test job + uses: ./.github/actions/submit-k8s-job + with: + job-config-file: ".github/eks-workflow-files/axlearn/axlearn-job.yml" + job-name: ${{ env.JOB_NAME }} + - name: Delete axlearn test job - if: always() - run: kubectl delete job ${{ env.JOB_NAME }} + uses: ./.github/actions/delete-k8s-job + with: + job-name: ${{ env.JOB_NAME }} + - name: Download logs from S3 run: | mkdir -p /tmp/axlearn-output @@ -831,6 +820,7 @@ jobs: "badge-axlearn-test" summary.txt - name: Delete GitHub Container Registry token - if: always() - run: kubectl delete secret ${{ env.TOKEN }} + uses: ./.github/actions/delete-ghcr-token + with: + token-name: ${{ env.TOKEN_NAME }} From f32ee766dfc12f83c2cd3f1f9b4c0bc7b3486e74 Mon Sep 17 00:00:00 2001 From: Steboss Date: Fri, 14 Feb 2025 14:36:31 +0000 Subject: [PATCH 22/89] fix --- .github/actions/delete-k8s-job/action.yml | 1 + .github/workflows/_ci.yaml | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/actions/delete-k8s-job/action.yml 
b/.github/actions/delete-k8s-job/action.yml index cf8011fcc..877039672 100644 --- a/.github/actions/delete-k8s-job/action.yml +++ b/.github/actions/delete-k8s-job/action.yml @@ -10,6 +10,7 @@ runs: using: "composite" steps: - name: Delete Kubernetes job + shell: bash if: always() run: | kubectl delete job ${{ inputs.job-name }} diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index d36750e10..f0323e040 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -733,10 +733,10 @@ jobs: run: | echo "DATE_TEST_RAN=$(date +'%Y-%m-%d-%H-%M-%S')" >> $GITHUB_ENV - name: Check and GHCR Login - uses: /.github/actions/checkout-ghcr-login + uses: ./.github/actions/checkout-ghcr-login with: docker-username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} + docker-password: ${{ secrets.GITHUB_TOKEN }} token-name: ${{ env.TOKEN_NAME }} - name: Configure axlearn test job run: | @@ -748,6 +748,7 @@ jobs: | select(di == 0).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)' \ .github/eks-workflow-files/axlearn/axlearn-job.yml git diff .github/eks-workflow-files/axlearn/axlearn-job.yml + - name: Submit & wait for axlearn test job uses: ./.github/actions/submit-k8s-job with: From 04c6cf9ab7edb70187e2f9a8801af8208e75c675 Mon Sep 17 00:00:00 2001 From: Steboss Date: Fri, 14 Feb 2025 15:39:04 +0000 Subject: [PATCH 23/89] test on single piece --- .../actions/checkout-ghcr-login/action.yml | 34 --- .github/workflows/_ci.yaml | 231 +++++++++--------- 2 files changed, 111 insertions(+), 154 deletions(-) delete mode 100644 .github/actions/checkout-ghcr-login/action.yml diff --git a/.github/actions/checkout-ghcr-login/action.yml b/.github/actions/checkout-ghcr-login/action.yml deleted file mode 100644 index a71a1be12..000000000 --- a/.github/actions/checkout-ghcr-login/action.yml +++ /dev/null @@ -1,34 +0,0 @@ -name: Checkout, GHCR login, K8s secret -description: Performs repository checkout, logs into GitHub Container 
Registry, and stores the token as a Kubernetes secret. - -inputs: - docker-username: - description: Username for GHCR - required: true - docker-password: - description: Password (e.g., GITHUB_TOKEN) - required: true - token-name: - description: Name of the K8s secret to create - required: true - -runs: - using: "composite" - steps: - - name: Check out the repository - uses: actions/checkout@v4 - - - name: Login to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: "ghcr.io" - username: ${{ inputs.docker-username }} - password: ${{ inputs.docker-password }} - - - name: Store GitHub Container Registry token as Kubernetes secret - shell: bash - run: | - kubectl create secret generic \ - ${{ inputs.token-name }} \ - --from-file=.dockerconfigjson=$HOME/.docker/config.json \ - --type=kubernetes.io/dockerconfigjson diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index f0323e040..7bfd93bbd 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -38,33 +38,33 @@ permissions: jobs: - build-base: - uses: ./.github/workflows/_build_base.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - BASE_IMAGE: ${{ inputs.CUDA_IMAGE }} - BUILD_DATE: ${{ inputs.BUILD_DATE }} - MANIFEST_ARTIFACT_NAME: ${{ inputs.MANIFEST_ARTIFACT_NAME }} - secrets: inherit - - build-jax: - needs: build-base - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: artifact-jax-build - BADGE_FILENAME: badge-jax-build - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-base.outputs.DOCKER_TAG }} - CONTAINER_NAME: jax - DOCKERFILE: .github/container/Dockerfile.jax - RUNNER_SIZE: large - EXTRA_BUILD_ARGS: | - URLREF_JAX=${{ fromJson(inputs.SOURCE_URLREFS).JAX }} - URLREF_XLA=${{ fromJson(inputs.SOURCE_URLREFS).XLA }} - URLREF_FLAX=${{ fromJson(inputs.SOURCE_URLREFS).FLAX }} - URLREF_TRANSFORMER_ENGINE=${{ fromJson(inputs.SOURCE_URLREFS).TRANSFORMER_ENGINE }} - secrets: 
inherit + # build-base: + # uses: ./.github/workflows/_build_base.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # BASE_IMAGE: ${{ inputs.CUDA_IMAGE }} + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # MANIFEST_ARTIFACT_NAME: ${{ inputs.MANIFEST_ARTIFACT_NAME }} + # secrets: inherit + + # build-jax: + # needs: build-base + # uses: ./.github/workflows/_build.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: artifact-jax-build + # BADGE_FILENAME: badge-jax-build + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-base.outputs.DOCKER_TAG }} + # CONTAINER_NAME: jax + # DOCKERFILE: .github/container/Dockerfile.jax + # RUNNER_SIZE: large + # EXTRA_BUILD_ARGS: | + # URLREF_JAX=${{ fromJson(inputs.SOURCE_URLREFS).JAX }} + # URLREF_XLA=${{ fromJson(inputs.SOURCE_URLREFS).XLA }} + # URLREF_FLAX=${{ fromJson(inputs.SOURCE_URLREFS).FLAX }} + # URLREF_TRANSFORMER_ENGINE=${{ fromJson(inputs.SOURCE_URLREFS).TRANSFORMER_ENGINE }} + # secrets: inherit # build-triton: # needs: build-jax @@ -203,57 +203,57 @@ jobs: # URLREF_PANOPTICAPI=${{ fromJson(inputs.SOURCE_URLREFS).PANOPTICAPI }} # secrets: inherit - build-axlearn: - needs: build-jax - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: artifact-axlearn-build - BADGE_FILENAME: badge-axlearn-build - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: axlearn - DOCKERFILE: .github/container/Dockerfile.axlearn - EXTRA_BUILD_ARGS: | - URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }} - secrets: inherit - - collect-docker-tags: - runs-on: ubuntu-22.04 - if: "!cancelled()" - needs: - - build-base - - build-jax - # - build-triton - # - build-equinox - # - build-maxtext - # - build-levanter - # - build-upstream-t5x - # - build-upstream-pax - # - build-rosetta-t5x - # - build-rosetta-pax - # - build-gemma - - build-axlearn - outputs: - TAGS: ${{ 
steps.collect-tags.outputs.TAGS }} - steps: - - name: Save docker tags as a JSON object - id: collect-tags - run: | - TAGS=$(cat <> $GITHUB_OUTPUT + # build-axlearn: + # needs: build-jax + # uses: ./.github/workflows/_build.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: artifact-axlearn-build + # BADGE_FILENAME: badge-axlearn-build + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: axlearn + # DOCKERFILE: .github/container/Dockerfile.axlearn + # EXTRA_BUILD_ARGS: | + # URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }} + # secrets: inherit + + # collect-docker-tags: + # runs-on: ubuntu-22.04 + # if: "!cancelled()" + # needs: + # - build-base + # - build-jax + # # - build-triton + # # - build-equinox + # # - build-maxtext + # # - build-levanter + # # - build-upstream-t5x + # # - build-upstream-pax + # # - build-rosetta-t5x + # # - build-rosetta-pax + # # - build-gemma + # - build-axlearn + # outputs: + # TAGS: ${{ steps.collect-tags.outputs.TAGS }} + # steps: + # - name: Save docker tags as a JSON object + # id: collect-tags + # run: | + # TAGS=$(cat <> $GITHUB_OUTPUT # test-distribution: # runs-on: ubuntu-22.04 @@ -450,20 +450,12 @@ jobs: # POSTPROCESS_JOB_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-postprocess # TOKEN_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-token # steps: - # - name: Check out the repository - # uses: actions/checkout@v4 - # - name: Login to GitHub Container Registry - # uses: docker/login-action@v3 - # with: - # registry: ghcr.io - # username: ${{ github.repository_owner }} - # password: ${{ secrets.GITHUB_TOKEN }} - # - name: Store GitHub Container Registry token as Kubernetes secret - # run: | - # kubectl create secret generic \ - # ${{ github.run_id }}-${{ github.run_attempt }}-token \ - # --from-file=.dockerconfigjson=$HOME/.docker/config.json \ - # --type=kubernetes.io/dockerconfigjson + # - name: GHCR 
login + # uses: ./.github/actions/ghcr-login + # with: + # docker-username: ${{ github.repository_owner }} + # docker-password: ${{ secrets.GITHUB_TOKEN}} + # token-name: ${{ env.TOKEN_NAME }} # - name: Configure Kubernetes job # run: | # yq -i ea 'select(di == 0).spec.selector.job-name = strenv(JOB_NAME) @@ -474,18 +466,17 @@ jobs: # .github/eks-workflow-files/job.yml # git diff .github/eks-workflow-files/job.yml # - name: Submit Kubernetes job - # run: kubectl apply -f .github/eks-workflow-files/job.yml - # - name: Wait for Kubernetes job to start - # run: | - # while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ github.run_id }}-${{ github.run_attempt }}-jax --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do - # sleep 2 - # done - # - name: Stream Kubernetes job output - # run: kubectl logs --all-containers=true --all-pods=true --follow job/${{ github.run_id }}-${{ github.run_attempt }}-jax + # uses: ./.github/acitons/submit-k8s-job + # with: + # job-config-file: .github/eks-workflow-files/job.yml + # job-name: ${{ env.JOB_NAME }} + # # Clean up in case of errors as well as success # - name: Delete Kubernetes job - # if: always() - # run: kubectl delete job ${{ github.run_id }}-${{ github.run_attempt }}-jax + # uses: ./.github/actions/delete-k8s-job + # with: + # job-name: ${{ env.JOB_NAME }} + # - name: Configure post-processing job # run: | # export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip" @@ -495,22 +486,19 @@ jobs: # | .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \ # .github/eks-workflow-files/post-process-job.yml # git diff .github/eks-workflow-files/post-process-job.yml - # - name: Submit post-processing Kubernetes job - # run: kubectl apply -f .github/eks-workflow-files/post-process-job.yml - # - name: Wait for post-processing Kubernetes job to start - # run: | - # while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ github.run_id }}-${{ 
github.run_attempt }}-postprocess --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do - # sleep 2 - # done - # - name: Stream post-processing Kubernetes job output - # run: kubectl logs --all-containers=true --all-pods=true --follow job/${{ github.run_id }}-${{ github.run_attempt }}-postprocess - # # Clean up in case of errors as well as success + # - name: Submit Kubernetes job + # uses: ./.github/acitons/submit-k8s-job + # with: + # job-config-file: .github/eks-workflow-files/post-process-job.yml + # job-name: ${{ env.POSTPROCESS_JOB_NAME }} # - name: Delete post-processing Kubernetes job - # if: always() - # run: kubectl delete job ${{ github.run_id }}-${{ github.run_attempt }}-postprocess + # uses: ./.github/actions/delete-k8s-job + # with: + # job-name: ${{ env.POSTPROCESS_JOB_NAME }} # - name: Delete GitHub Container Registry token - # if: always() - # run: kubectl delete secret ${{ github.run_id }}-${{ github.run_attempt }}-token + # uses: ./.github/actions/delete-ghcr-token + # with: + # token-name: ${{ env.TOKEN_NAME }} # # test-equinox: # # needs: build-equinox @@ -721,19 +709,22 @@ jobs: test-axlearn-eks: - needs: build-axlearn + #needs: build-axlearn if: inputs.ARCHITECTURE == 'amd64' runs-on: eks env: - AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }} + # needs.build-axlearn.outputs.DOCKER_TAG_FINAL + AXLEARN_DOCKER_IMAGE: "ghcr.io/nvidia/jax-toolbox-internal:13331372559-axlearn-amd6" JOB_NAME: axlearn-${{ github.run_id }} TOKEN_NAME: axlearn-${{ github.run_id }}-token steps: - name: Set date env var for saving files run: | echo "DATE_TEST_RAN=$(date +'%Y-%m-%d-%H-%M-%S')" >> $GITHUB_ENV - - name: Check and GHCR Login - uses: ./.github/actions/checkout-ghcr-login + - name: Check out the repository + uses: actions/checkout@v4 + - name: GHCR Login + uses: ./.github/actions/ghcr-login with: docker-username: ${{ github.repository_owner }} docker-password: ${{ secrets.GITHUB_TOKEN }} From 
cfc68db19e41a67c858ec06ac59a587446f359dc Mon Sep 17 00:00:00 2001 From: Steboss Date: Fri, 14 Feb 2025 15:42:55 +0000 Subject: [PATCH 24/89] add checkout --- .github/actions/ghcr-login/action.yml | 31 +++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 .github/actions/ghcr-login/action.yml diff --git a/.github/actions/ghcr-login/action.yml b/.github/actions/ghcr-login/action.yml new file mode 100644 index 000000000..2c62591ed --- /dev/null +++ b/.github/actions/ghcr-login/action.yml @@ -0,0 +1,31 @@ +name: Checkout, GHCR login, K8s secret +description: Performs repository checkout, logs into GitHub Container Registry, and stores the token as a Kubernetes secret. + +inputs: + docker-username: + description: Username for GHCR + required: true + docker-password: + description: Password (e.g., GITHUB_TOKEN) + required: true + token-name: + description: Name of the K8s secret to create + required: true + +runs: + using: "composite" + steps: + - name: Login to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: "ghcr.io" + username: ${{ inputs.docker-username }} + password: ${{ inputs.docker-password }} + + - name: Store GitHub Container Registry token as Kubernetes secret + shell: bash + run: | + kubectl create secret generic \ + ${{ inputs.token-name }} \ + --from-file=.dockerconfigjson=$HOME/.docker/config.json \ + --type=kubernetes.io/dockerconfigjson From 65eca97ac54d52f8d707369766f0799bf22384e9 Mon Sep 17 00:00:00 2001 From: Steboss Date: Fri, 14 Feb 2025 15:52:16 +0000 Subject: [PATCH 25/89] restart ci --- .github/workflows/_ci.yaml | 159 ++++++++++++++++++------------------- 1 file changed, 78 insertions(+), 81 deletions(-) diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 7bfd93bbd..5d1028a17 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -38,33 +38,33 @@ permissions: jobs: - # build-base: - # uses: ./.github/workflows/_build_base.yaml - # with: - # 
ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # BASE_IMAGE: ${{ inputs.CUDA_IMAGE }} - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # MANIFEST_ARTIFACT_NAME: ${{ inputs.MANIFEST_ARTIFACT_NAME }} - # secrets: inherit - - # build-jax: - # needs: build-base - # uses: ./.github/workflows/_build.yaml - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # ARTIFACT_NAME: artifact-jax-build - # BADGE_FILENAME: badge-jax-build - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-base.outputs.DOCKER_TAG }} - # CONTAINER_NAME: jax - # DOCKERFILE: .github/container/Dockerfile.jax - # RUNNER_SIZE: large - # EXTRA_BUILD_ARGS: | - # URLREF_JAX=${{ fromJson(inputs.SOURCE_URLREFS).JAX }} - # URLREF_XLA=${{ fromJson(inputs.SOURCE_URLREFS).XLA }} - # URLREF_FLAX=${{ fromJson(inputs.SOURCE_URLREFS).FLAX }} - # URLREF_TRANSFORMER_ENGINE=${{ fromJson(inputs.SOURCE_URLREFS).TRANSFORMER_ENGINE }} - # secrets: inherit + build-base: + uses: ./.github/workflows/_build_base.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + BASE_IMAGE: ${{ inputs.CUDA_IMAGE }} + BUILD_DATE: ${{ inputs.BUILD_DATE }} + MANIFEST_ARTIFACT_NAME: ${{ inputs.MANIFEST_ARTIFACT_NAME }} + secrets: inherit + + build-jax: + needs: build-base + uses: ./.github/workflows/_build.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: artifact-jax-build + BADGE_FILENAME: badge-jax-build + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-base.outputs.DOCKER_TAG }} + CONTAINER_NAME: jax + DOCKERFILE: .github/container/Dockerfile.jax + RUNNER_SIZE: large + EXTRA_BUILD_ARGS: | + URLREF_JAX=${{ fromJson(inputs.SOURCE_URLREFS).JAX }} + URLREF_XLA=${{ fromJson(inputs.SOURCE_URLREFS).XLA }} + URLREF_FLAX=${{ fromJson(inputs.SOURCE_URLREFS).FLAX }} + URLREF_TRANSFORMER_ENGINE=${{ fromJson(inputs.SOURCE_URLREFS).TRANSFORMER_ENGINE }} + secrets: inherit # build-triton: # needs: build-jax @@ -203,57 +203,55 @@ jobs: # URLREF_PANOPTICAPI=${{ 
fromJson(inputs.SOURCE_URLREFS).PANOPTICAPI }} # secrets: inherit - # build-axlearn: - # needs: build-jax - # uses: ./.github/workflows/_build.yaml - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # ARTIFACT_NAME: artifact-axlearn-build - # BADGE_FILENAME: badge-axlearn-build - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - # CONTAINER_NAME: axlearn - # DOCKERFILE: .github/container/Dockerfile.axlearn - # EXTRA_BUILD_ARGS: | - # URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }} - # secrets: inherit - - # collect-docker-tags: - # runs-on: ubuntu-22.04 - # if: "!cancelled()" - # needs: - # - build-base - # - build-jax - # # - build-triton - # # - build-equinox - # # - build-maxtext - # # - build-levanter - # # - build-upstream-t5x - # # - build-upstream-pax - # # - build-rosetta-t5x - # # - build-rosetta-pax - # # - build-gemma - # - build-axlearn - # outputs: - # TAGS: ${{ steps.collect-tags.outputs.TAGS }} - # steps: - # - name: Save docker tags as a JSON object - # id: collect-tags - # run: | - # TAGS=$(cat <> $GITHUB_OUTPUT + build-axlearn: + needs: build-jax + uses: ./.github/workflows/_build.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: artifact-axlearn-build + BADGE_FILENAME: badge-axlearn-build + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + CONTAINER_NAME: axlearn + DOCKERFILE: .github/container/Dockerfile.axlearn + secrets: inherit + + collect-docker-tags: + runs-on: ubuntu-22.04 + if: "!cancelled()" + needs: + - build-base + - build-jax + # - build-triton + # - build-equinox + # - build-maxtext + # - build-levanter + # - build-upstream-t5x + # - build-upstream-pax + # - build-rosetta-t5x + # - build-rosetta-pax + # - build-gemma + - build-axlearn + outputs: + TAGS: ${{ steps.collect-tags.outputs.TAGS }} + steps: + - name: Save docker tags as a JSON object + id: collect-tags + run: | + TAGS=$(cat 
<> $GITHUB_OUTPUT # test-distribution: # runs-on: ubuntu-22.04 @@ -709,12 +707,11 @@ jobs: test-axlearn-eks: - #needs: build-axlearn + needs: build-axlearn if: inputs.ARCHITECTURE == 'amd64' runs-on: eks env: - # needs.build-axlearn.outputs.DOCKER_TAG_FINAL - AXLEARN_DOCKER_IMAGE: "ghcr.io/nvidia/jax-toolbox-internal:13331372559-axlearn-amd6" + AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }} JOB_NAME: axlearn-${{ github.run_id }} TOKEN_NAME: axlearn-${{ github.run_id }}-token steps: From 580bf733ffd0d328dabf2f05b360872b145760a8 Mon Sep 17 00:00:00 2001 From: Steboss Date: Fri, 14 Feb 2025 17:23:23 +0000 Subject: [PATCH 26/89] general clean up --- .github/container/test-axlearn.sh | 9 +- .github/workflows/_ci.yaml | 1125 ++++++++++++++--------------- 2 files changed, 560 insertions(+), 574 deletions(-) diff --git a/.github/container/test-axlearn.sh b/.github/container/test-axlearn.sh index 8cf2a94d6..27118b7a0 100755 --- a/.github/container/test-axlearn.sh +++ b/.github/container/test-axlearn.sh @@ -120,6 +120,7 @@ echo "Running tests..." # If we are on Kubernetes, install torch for cpu only if [ "$K8S" = true ]; then pip install torch --extra-index-url https://download.pytorch.org/whl/cpu + pip install transformers fi if [ "${#TEST_FILES[@]}" -eq 0 ]; then @@ -174,14 +175,8 @@ passed=0 SUMMARY_FILE="${OUTPUT_DIRECTORY}/summary.txt" -for test_file in "${final_test_files[@]:0:5}"; do +for test_file in "${final_test_files[@]}"; do echo "Running: ${test_file}" - # Ensure the test file exists - if [ ! 
-f "${test_file}" ]; then - echo "${test_file}: NOT FOUND" >> "${SUMMARY_FILE}" - echo "Test file not found: ${test_file}" - continue - fi log_file_name=$(echo "${test_file%.py}" | sed 's/\//__/g').log log_file="${LOG_DIRECTORY}/${log_file_name}" # run the tests and save them as *.log diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 5d1028a17..02c2c4611 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -66,142 +66,142 @@ jobs: URLREF_TRANSFORMER_ENGINE=${{ fromJson(inputs.SOURCE_URLREFS).TRANSFORMER_ENGINE }} secrets: inherit - # build-triton: - # needs: build-jax - # if: inputs.ARCHITECTURE == 'amd64' # Triton does not seem to support arm64 - # uses: ./.github/workflows/_build.yaml - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # ARTIFACT_NAME: artifact-triton-build - # BADGE_FILENAME: badge-triton-build - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - # CONTAINER_NAME: triton - # DOCKERFILE: .github/container/Dockerfile.triton - # RUNNER_SIZE: large - # EXTRA_BUILD_ARGS: | - # URLREF_JAX_TRITON=${{ fromJson(inputs.SOURCE_URLREFS).JAX_TRITON }} - # secrets: inherit + build-triton: + needs: build-jax + if: inputs.ARCHITECTURE == 'amd64' # Triton does not seem to support arm64 + uses: ./.github/workflows/_build.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: artifact-triton-build + BADGE_FILENAME: badge-triton-build + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + CONTAINER_NAME: triton + DOCKERFILE: .github/container/Dockerfile.triton + RUNNER_SIZE: large + EXTRA_BUILD_ARGS: | + URLREF_JAX_TRITON=${{ fromJson(inputs.SOURCE_URLREFS).JAX_TRITON }} + secrets: inherit - # build-equinox: - # needs: build-jax - # uses: ./.github/workflows/_build.yaml - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # ARTIFACT_NAME: artifact-equinox-build - # BADGE_FILENAME: 
badge-equinox-build - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - # CONTAINER_NAME: equinox - # DOCKERFILE: .github/container/Dockerfile.equinox - # EXTRA_BUILD_ARGS: | - # URLREF_EQUINOX=${{ fromJson(inputs.SOURCE_URLREFS).EQUINOX }} - # secrets: inherit + build-equinox: + needs: build-jax + uses: ./.github/workflows/_build.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: artifact-equinox-build + BADGE_FILENAME: badge-equinox-build + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + CONTAINER_NAME: equinox + DOCKERFILE: .github/container/Dockerfile.equinox + EXTRA_BUILD_ARGS: | + URLREF_EQUINOX=${{ fromJson(inputs.SOURCE_URLREFS).EQUINOX }} + secrets: inherit - # build-maxtext: - # needs: build-jax - # uses: ./.github/workflows/_build.yaml - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # ARTIFACT_NAME: artifact-maxtext-build - # BADGE_FILENAME: badge-maxtext-build - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - # CONTAINER_NAME: maxtext - # DOCKERFILE: .github/container/Dockerfile.maxtext - # EXTRA_BUILD_ARGS: | - # URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }} - # secrets: inherit + build-maxtext: + needs: build-jax + uses: ./.github/workflows/_build.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: artifact-maxtext-build + BADGE_FILENAME: badge-maxtext-build + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + CONTAINER_NAME: maxtext + DOCKERFILE: .github/container/Dockerfile.maxtext + EXTRA_BUILD_ARGS: | + URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }} + secrets: inherit - # build-levanter: - # needs: [build-jax] - # uses: ./.github/workflows/_build.yaml - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # ARTIFACT_NAME: 
"artifact-levanter-build" - # BADGE_FILENAME: "badge-levanter-build" - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - # CONTAINER_NAME: levanter - # DOCKERFILE: .github/container/Dockerfile.levanter - # EXTRA_BUILD_ARGS: | - # URLREF_LEVANTER=${{ fromJson(inputs.SOURCE_URLREFS).LEVANTER }} - # URLREF_HALIAX=${{ fromJson(inputs.SOURCE_URLREFS).HALIAX }} - # secrets: inherit + build-levanter: + needs: [build-jax] + uses: ./.github/workflows/_build.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: "artifact-levanter-build" + BADGE_FILENAME: "badge-levanter-build" + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + CONTAINER_NAME: levanter + DOCKERFILE: .github/container/Dockerfile.levanter + EXTRA_BUILD_ARGS: | + URLREF_LEVANTER=${{ fromJson(inputs.SOURCE_URLREFS).LEVANTER }} + URLREF_HALIAX=${{ fromJson(inputs.SOURCE_URLREFS).HALIAX }} + secrets: inherit - # build-upstream-t5x: - # needs: build-jax - # uses: ./.github/workflows/_build.yaml - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # ARTIFACT_NAME: "artifact-t5x-build" - # BADGE_FILENAME: "badge-t5x-build" - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - # CONTAINER_NAME: upstream-t5x - # DOCKERFILE: .github/container/Dockerfile.t5x - # EXTRA_BUILD_ARGS: | - # URLREF_T5X=${{ fromJson(inputs.SOURCE_URLREFS).T5X }} - # URLREF_AIRIO=${{ fromJson(inputs.SOURCE_URLREFS).AIRIO }} - # secrets: inherit + build-upstream-t5x: + needs: build-jax + uses: ./.github/workflows/_build.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: "artifact-t5x-build" + BADGE_FILENAME: "badge-t5x-build" + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + CONTAINER_NAME: upstream-t5x + DOCKERFILE: .github/container/Dockerfile.t5x + EXTRA_BUILD_ARGS: | + 
URLREF_T5X=${{ fromJson(inputs.SOURCE_URLREFS).T5X }} + URLREF_AIRIO=${{ fromJson(inputs.SOURCE_URLREFS).AIRIO }} + secrets: inherit - # build-upstream-pax: - # needs: build-jax - # uses: ./.github/workflows/_build.yaml - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # ARTIFACT_NAME: artifact-pax-build - # BADGE_FILENAME: badge-pax-build - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - # CONTAINER_NAME: upstream-pax - # DOCKERFILE: .github/container/Dockerfile.pax - # EXTRA_BUILD_ARGS: | - # URLREF_PAXML=${{ fromJson(inputs.SOURCE_URLREFS).PAXML }} - # URLREF_PRAXIS=${{ fromJson(inputs.SOURCE_URLREFS).PRAXIS }} - # URLREF_LINGVO=${{ fromJson(inputs.SOURCE_URLREFS).LINGVO }} - # secrets: inherit + build-upstream-pax: + needs: build-jax + uses: ./.github/workflows/_build.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: artifact-pax-build + BADGE_FILENAME: badge-pax-build + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + CONTAINER_NAME: upstream-pax + DOCKERFILE: .github/container/Dockerfile.pax + EXTRA_BUILD_ARGS: | + URLREF_PAXML=${{ fromJson(inputs.SOURCE_URLREFS).PAXML }} + URLREF_PRAXIS=${{ fromJson(inputs.SOURCE_URLREFS).PRAXIS }} + URLREF_LINGVO=${{ fromJson(inputs.SOURCE_URLREFS).LINGVO }} + secrets: inherit - # build-rosetta-t5x: - # needs: build-upstream-t5x - # uses: ./.github/workflows/_build_rosetta.yaml - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }} - # BASE_LIBRARY: t5x - # secrets: inherit + build-rosetta-t5x: + needs: build-upstream-t5x + uses: ./.github/workflows/_build_rosetta.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }} + BASE_LIBRARY: t5x + secrets: inherit - 
# build-rosetta-pax: - # needs: build-upstream-pax - # uses: ./.github/workflows/_build_rosetta.yaml - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_MEALKIT }} - # BASE_LIBRARY: pax - # secrets: inherit + build-rosetta-pax: + needs: build-upstream-pax + uses: ./.github/workflows/_build_rosetta.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_MEALKIT }} + BASE_LIBRARY: pax + secrets: inherit - # build-gemma: - # needs: build-jax - # uses: ./.github/workflows/_build.yaml - # if: inputs.ARCHITECTURE == 'amd64' # build only amd64 - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # ARTIFACT_NAME: artifact-gemma-build - # BADGE_FILENAME: badge-gemma-build - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - # CONTAINER_NAME: gemma - # DOCKERFILE: rosetta/Dockerfile.gemma - # DOCKER_CONTEXT: . - # EXTRA_BUILD_ARGS: | - # URLREF_GEMMA=${{ fromJson(inputs.SOURCE_URLREFS).GEMMA }} - # URLREF_BIG_VISION=${{ fromJson(inputs.SOURCE_URLREFS).BIG_VISION }} - # URLREF_COMMON_LOOP_UTILS=${{ fromJson(inputs.SOURCE_URLREFS).COMMON_LOOP_UTILS }} - # URLREF_FLAXFORMER=${{ fromJson(inputs.SOURCE_URLREFS).FLAXFORMER }} - # URLREF_PANOPTICAPI=${{ fromJson(inputs.SOURCE_URLREFS).PANOPTICAPI }} - # secrets: inherit + build-gemma: + needs: build-jax + uses: ./.github/workflows/_build.yaml + if: inputs.ARCHITECTURE == 'amd64' # build only amd64 + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: artifact-gemma-build + BADGE_FILENAME: badge-gemma-build + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + CONTAINER_NAME: gemma + DOCKERFILE: rosetta/Dockerfile.gemma + DOCKER_CONTEXT: . 
+ EXTRA_BUILD_ARGS: | + URLREF_GEMMA=${{ fromJson(inputs.SOURCE_URLREFS).GEMMA }} + URLREF_BIG_VISION=${{ fromJson(inputs.SOURCE_URLREFS).BIG_VISION }} + URLREF_COMMON_LOOP_UTILS=${{ fromJson(inputs.SOURCE_URLREFS).COMMON_LOOP_UTILS }} + URLREF_FLAXFORMER=${{ fromJson(inputs.SOURCE_URLREFS).FLAXFORMER }} + URLREF_PANOPTICAPI=${{ fromJson(inputs.SOURCE_URLREFS).PANOPTICAPI }} + secrets: inherit build-axlearn: needs: build-jax @@ -222,15 +222,15 @@ jobs: needs: - build-base - build-jax - # - build-triton - # - build-equinox - # - build-maxtext - # - build-levanter - # - build-upstream-t5x - # - build-upstream-pax - # - build-rosetta-t5x - # - build-rosetta-pax - # - build-gemma + - build-triton + - build-equinox + - build-maxtext + - build-levanter + - build-upstream-t5x + - build-upstream-pax + - build-rosetta-t5x + - build-rosetta-pax + - build-gemma - build-axlearn outputs: TAGS: ${{ steps.collect-tags.outputs.TAGS }} @@ -242,8 +242,26 @@ jobs: [\ {"flavor": "base", "stage": "final", "priority": 800, "tag": "${{ needs.build-base.outputs.DOCKER_TAG }}"},\ {"flavor": "jax", "stage": "final", "priority": 1000, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "triton", "stage": "final", "priority": 900, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "equinox", "stage": "final", "priority": 900, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "maxtext", "stage": "final", "priority": 900, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "levanter", "stage": "final", "priority": 900, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "upstream-t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "upstream-pax", "stage": "final", "priority": 900, "tag": "${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "t5x", "stage": "final", "priority": 900, 
"tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "pax", "stage": "final", "priority": 900, "tag": "${{ needs.build-rosetta-pax.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "gemma", "stage": "final", "priority": 900, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "axlearn", "stage": "final", "priority": 900, "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "jax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "triton", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "equinox", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "maxtext", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "levanter", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "upstream-t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "upstream-pax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-upstream-pax.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "pax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-pax.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "gemma", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_MEALKIT }}"},\ {"flavor": "axlearn", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_MEALKIT }}"},\ {}\ @@ -253,343 +271,263 @@ jobs: echo "TAGS=${TAGS}" >> $GITHUB_OUTPUT - # test-distribution: - # runs-on: ubuntu-22.04 - # strategy: - # matrix: - # TEST_SCRIPT: - # 
- extra-only-distribution.sh - # - mirror-only-distribution.sh - # - upstream-only-distribution.sh - # - local-patch-distribution.sh - # fail-fast: false - # steps: - # - name: Print environment variables - # run: env - # - name: Set git login for tests - # run: | - # git config --global user.email "jax@nvidia.com" - # git config --global user.name "JAX-Toolbox CI" - # - name: Check out the repository under ${GITHUB_WORKSPACE} - # uses: actions/checkout@v4 - # - name: Run integration test ${{ matrix.TEST_SCRIPT }} - # run: bash rosetta/tests/${{ matrix.TEST_SCRIPT }} - - # test-jax: - # needs: build-jax - # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # uses: ./.github/workflows/_test_unit.yaml - # with: - # TEST_NAME: jax - # EXECUTE: | - # docker run -i --shm-size=1g --gpus all \ - # ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - # bash <<"EOF" |& tee test-backend-independent.log - # test-jax.sh -b backend-independent - # EOF - # docker run -i --shm-size=1g --gpus all \ - # ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - # bash <<"EOF" |& tee tee test-gpu.log - # nvidia-cuda-mps-control -d - # test-jax.sh -b gpu - # EOF - # STATISTICS_SCRIPT: | - # errors=$(cat test-*.log | grep -c 'ERROR:' || true) - # failed_tests=$(cat test-*.log | grep -c 'FAILED in' || true) - # passed_tests=$(cat test-*.log | grep -c 'PASSED in' || true) - # total_tests=$((failed_tests + passed_tests)) - # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - # ARTIFACTS: | - # test-backend-independent.log - # test-gpu.log - # secrets: inherit + test-distribution: + runs-on: ubuntu-22.04 + strategy: + matrix: + TEST_SCRIPT: + - extra-only-distribution.sh + - mirror-only-distribution.sh + - upstream-only-distribution.sh + - local-patch-distribution.sh + fail-fast: false + steps: + - name: Print environment 
variables + run: env + - name: Set git login for tests + run: | + git config --global user.email "jax@nvidia.com" + git config --global user.name "JAX-Toolbox CI" + - name: Check out the repository under ${GITHUB_WORKSPACE} + uses: actions/checkout@v4 + - name: Run integration test ${{ matrix.TEST_SCRIPT }} + run: bash rosetta/tests/${{ matrix.TEST_SCRIPT }} + + test-jax: + needs: build-jax + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + uses: ./.github/workflows/_test_unit.yaml + with: + TEST_NAME: jax + EXECUTE: | + docker run -i --shm-size=1g --gpus all \ + ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ + bash <<"EOF" |& tee test-backend-independent.log + test-jax.sh -b backend-independent + EOF + docker run -i --shm-size=1g --gpus all \ + ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ + bash <<"EOF" |& tee tee test-gpu.log + nvidia-cuda-mps-control -d + test-jax.sh -b gpu + EOF + STATISTICS_SCRIPT: | + errors=$(cat test-*.log | grep -c 'ERROR:' || true) + failed_tests=$(cat test-*.log | grep -c 'FAILED in' || true) + passed_tests=$(cat test-*.log | grep -c 'PASSED in' || true) + total_tests=$((failed_tests + passed_tests)) + echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + ARTIFACTS: | + test-backend-independent.log + test-gpu.log + secrets: inherit - # test-nsys-jax: - # needs: build-jax - # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # uses: ./.github/workflows/_test_unit.yaml - # with: - # TEST_NAME: nsys-jax - # EXECUTE: | - # set -o pipefail - # num_tests=0 - # num_failures=0 - # # Run the pytest-driven tests; failure is explicitly handled below so set +e to - # # avoid an early abort here. 
- # set +e - # docker run -i --shm-size=1g --gpus all \ - # -v $PWD:/opt/output \ - # ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - # bash <<"EOF" |& tee test-nsys-jax.log - # # nsys-jax is already installed, this is just adding the test dependencies - # pip install pytest-reportlog nsys-jax[test] - # # abuse knowledge that nsys-jax is installed editable, so the tests exist - # test_path=$(python -c 'import importlib.resources; print(importlib.resources.files("nsys_jax").joinpath("..", "tests").resolve())') - # pytest --report-log=/opt/output/pytest-report.jsonl "${test_path}" - # EOF - # set -e - # GPUS_PER_NODE=$(nvidia-smi -L | grep -c '^GPU') - # for mode in 1-process 2-process process-per-gpu; do - # DOCKER="docker run --shm-size=1g --gpus all --env XLA_FLAGS=--xla_gpu_enable_command_buffer= --env XLA_PYTHON_CLIENT_MEM_FRACTION=0.9 -v ${PWD}:/opt/output ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}" - # if [[ "${mode}" == "1-process" ]]; then - # PROCESS_COUNT=1 - # ARGS="" - # elif [[ "${mode}" == "2-process" ]]; then - # # Use two processes with GPUS_PER_NODE/2 GPUs per process in the hope that - # # this will flush out more bugs than process-per-node or process-per-GPU. 
- # PROCESS_COUNT=2 - # ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process $((GPUS_PER_NODE/2)) --distributed" - # else - # PROCESS_COUNT=${GPUS_PER_NODE} - # ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process 1 --distributed" - # fi - # for collection in full partial; do - # NSYS_JAX="nsys-jax" - # if [[ "${mode}" == "1-process" ]]; then - # # We will not run nsys-jax-combine, so run analyses eagerly - # NSYS_JAX+=" --nsys-jax-analysis communication --nsys-jax-analysis summary" - # fi - # NSYS_JAX+=" --output=/opt/output/${mode}-${collection}-execution-%q{RANK}" - # if [[ "${collection}" == "partial" ]]; then - # NSYS_JAX+=" --capture-range=cudaProfilerApi --capture-range-end=stop" - # # nvbug/4801401 - # NSYS_JAX+=" --sample=none" - # fi - # set +e - # ${DOCKER} parallel-launch RANK ${PROCESS_COUNT} ${NSYS_JAX} \ - # -- jax-nccl-test ${ARGS} |& tee ${mode}-${collection}-execution.log - # num_failures=$((num_failures + ($? 
!= 0))) - # set -e - # num_tests=$((num_tests + 1)) - # done - # if [[ "${mode}" != "1-process" ]]; then - # # Run nsys-jax-combine - # NSYS_JAX_COMBINE="nsys-jax-combine --analysis communication --analysis summary --output=/opt/output/${mode}-${collection}-execution.zip" - # for (( i=0; i> $GITHUB_ENV - # echo "NSYS_JAX_FAIL_COUNT=${num_failures}" >> $GITHUB_ENV - # exit $num_failures - # STATISTICS_SCRIPT: | - # summary_line=$(tail -n1 test-nsys-jax.log) - # num_errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - # passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) - # failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) - # total_tests=$(( NSYS_JAX_TEST_COUNT + passed_tests + failed_tests )) - # num_passed=$(( passed_tests + NSYS_JAX_TEST_COUNT - NSYS_JAX_FAIL_COUNT )) - # num_failed=$(( failed_tests + NSYS_JAX_FAIL_COUNT )) - # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - # echo "ERRORS=${num_errors}" >> $GITHUB_OUTPUT - # echo "PASSED_TESTS=${num_passed}" >> $GITHUB_OUTPUT - # echo "FAILED_TESTS=${num_failed}" >> $GITHUB_OUTPUT - # ARTIFACTS: | - # # pytest-driven part - # test-nsys-jax.log - # pytest-report.jsonl - # # nsys-jax logfiles - # *process-*-execution.log - # # nsys-jax output for the case that doesn't use nsys-jax-combine - # 1-process-*-execution-0.zip - # # nsys-jax-combine output/logfiles - # *process*-*-execution.zip - # *-execution-combine.log - # secrets: inherit + test-nsys-jax: + needs: build-jax + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + uses: ./.github/workflows/_test_unit.yaml + with: + TEST_NAME: nsys-jax + EXECUTE: | + set -o pipefail + num_tests=0 + num_failures=0 + # Run the pytest-driven tests; failure is explicitly handled below so set +e to + # avoid an early 
abort here. + set +e + docker run -i --shm-size=1g --gpus all \ + -v $PWD:/opt/output \ + ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ + bash <<"EOF" |& tee test-nsys-jax.log + # nsys-jax is already installed, this is just adding the test dependencies + pip install pytest-reportlog nsys-jax[test] + # abuse knowledge that nsys-jax is installed editable, so the tests exist + test_path=$(python -c 'import importlib.resources; print(importlib.resources.files("nsys_jax").joinpath("..", "tests").resolve())') + pytest --report-log=/opt/output/pytest-report.jsonl "${test_path}" + EOF + set -e + GPUS_PER_NODE=$(nvidia-smi -L | grep -c '^GPU') + for mode in 1-process 2-process process-per-gpu; do + DOCKER="docker run --shm-size=1g --gpus all --env XLA_FLAGS=--xla_gpu_enable_command_buffer= --env XLA_PYTHON_CLIENT_MEM_FRACTION=0.9 -v ${PWD}:/opt/output ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}" + if [[ "${mode}" == "1-process" ]]; then + PROCESS_COUNT=1 + ARGS="" + elif [[ "${mode}" == "2-process" ]]; then + # Use two processes with GPUS_PER_NODE/2 GPUs per process in the hope that + # this will flush out more bugs than process-per-node or process-per-GPU. 
+ PROCESS_COUNT=2 + ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process $((GPUS_PER_NODE/2)) --distributed" + else + PROCESS_COUNT=${GPUS_PER_NODE} + ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process 1 --distributed" + fi + for collection in full partial; do + NSYS_JAX="nsys-jax" + if [[ "${mode}" == "1-process" ]]; then + # We will not run nsys-jax-combine, so run analyses eagerly + NSYS_JAX+=" --nsys-jax-analysis communication --nsys-jax-analysis summary" + fi + NSYS_JAX+=" --output=/opt/output/${mode}-${collection}-execution-%q{RANK}" + if [[ "${collection}" == "partial" ]]; then + NSYS_JAX+=" --capture-range=cudaProfilerApi --capture-range-end=stop" + # nvbug/4801401 + NSYS_JAX+=" --sample=none" + fi + set +e + ${DOCKER} parallel-launch RANK ${PROCESS_COUNT} ${NSYS_JAX} \ + -- jax-nccl-test ${ARGS} |& tee ${mode}-${collection}-execution.log + num_failures=$((num_failures + ($? 
!= 0))) + set -e + num_tests=$((num_tests + 1)) + done + if [[ "${mode}" != "1-process" ]]; then + # Run nsys-jax-combine + NSYS_JAX_COMBINE="nsys-jax-combine --analysis communication --analysis summary --output=/opt/output/${mode}-${collection}-execution.zip" + for (( i=0; i> $GITHUB_ENV + echo "NSYS_JAX_FAIL_COUNT=${num_failures}" >> $GITHUB_ENV + exit $num_failures + STATISTICS_SCRIPT: | + summary_line=$(tail -n1 test-nsys-jax.log) + num_errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) + failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) + total_tests=$(( NSYS_JAX_TEST_COUNT + passed_tests + failed_tests )) + num_passed=$(( passed_tests + NSYS_JAX_TEST_COUNT - NSYS_JAX_FAIL_COUNT )) + num_failed=$(( failed_tests + NSYS_JAX_FAIL_COUNT )) + echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + echo "ERRORS=${num_errors}" >> $GITHUB_OUTPUT + echo "PASSED_TESTS=${num_passed}" >> $GITHUB_OUTPUT + echo "FAILED_TESTS=${num_failed}" >> $GITHUB_OUTPUT + ARTIFACTS: | + # pytest-driven part + test-nsys-jax.log + pytest-report.jsonl + # nsys-jax logfiles + *process-*-execution.log + # nsys-jax output for the case that doesn't use nsys-jax-combine + 1-process-*-execution-0.zip + # nsys-jax-combine output/logfiles + *process*-*-execution.zip + *-execution-combine.log + secrets: inherit - # # test-nsys-jax generates several fresh .zip archive outputs by running nsys-jax with real GPU hardware; this test - # # runs on a regular GitHub Actions runner and checks that offline post-processing works in an environment that does - # # not already have nsys-jax installed - # test-nsys-jax-archive: - # needs: test-nsys-jax - # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # 
strategy: - # matrix: - # os: [ubuntu-22.04, ubuntu-24.04, macOS-latest] - # runs-on: ${{ matrix.os }} - # steps: - # - name: Download nsys-jax output .zip files - # uses: actions/download-artifact@v4 - # with: - # name: nsys-jax-unit-test-A100 - # - name: Extract archives and execute install scripts - # run: | - # pip install virtualenv # for install.sh - # for zip in $(ls *.zip); do - # ZIP="${PWD}/${zip}" - # pushd $(mktemp -d) - # unzip "${ZIP}" - # ls -l - # # TODO: verify this isn't needed, or make sure it isn't needed - # chmod 755 install.sh - # # Run the notebook with IPython, not Jupyter Lab, so it exits and prints something informative to stdout - # # Skip executing Jupyter lab - # NSYS_JAX_JUPYTER_EXECUTE_NOT_LAB=1 ./install.sh - # popd - # done - - # test-nsys-jax-eks: - # needs: build-jax - # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # runs-on: eks - # env: - # JAX_DOCKER_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} - # JOB_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-jax - # POSTPROCESS_JOB_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-postprocess - # TOKEN_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-token - # steps: - # - name: GHCR login - # uses: ./.github/actions/ghcr-login - # with: - # docker-username: ${{ github.repository_owner }} - # docker-password: ${{ secrets.GITHUB_TOKEN}} - # token-name: ${{ env.TOKEN_NAME }} - # - name: Configure Kubernetes job - # run: | - # yq -i ea 'select(di == 0).spec.selector.job-name = strenv(JOB_NAME) - # | select(di == 1).metadata.name = strenv(JOB_NAME) - # | select(di == 1).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) - # | select(di == 1).spec.template.spec.containers[0].image = strenv(JAX_DOCKER_IMAGE) - # | select(di == 1).spec.template.spec.containers[0].env[0].value = strenv(JOB_NAME)' \ - # .github/eks-workflow-files/job.yml - # git diff .github/eks-workflow-files/job.yml - # - name: Submit Kubernetes job - # uses: 
./.github/acitons/submit-k8s-job - # with: - # job-config-file: .github/eks-workflow-files/job.yml - # job-name: ${{ env.JOB_NAME }} - - # # Clean up in case of errors as well as success - # - name: Delete Kubernetes job - # uses: ./.github/actions/delete-k8s-job - # with: - # job-name: ${{ env.JOB_NAME }} + # test-nsys-jax generates several fresh .zip archive outputs by running nsys-jax with real GPU hardware; this test + # runs on a regular GitHub Actions runner and checks that offline post-processing works in an environment that does + # not already have nsys-jax installed + test-nsys-jax-archive: + needs: test-nsys-jax + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + strategy: + matrix: + os: [ubuntu-22.04, ubuntu-24.04, macOS-latest] + runs-on: ${{ matrix.os }} + steps: + - name: Download nsys-jax output .zip files + uses: actions/download-artifact@v4 + with: + name: nsys-jax-unit-test-A100 + - name: Extract archives and execute install scripts + run: | + pip install virtualenv # for install.sh + for zip in $(ls *.zip); do + ZIP="${PWD}/${zip}" + pushd $(mktemp -d) + unzip "${ZIP}" + ls -l + # TODO: verify this isn't needed, or make sure it isn't needed + chmod 755 install.sh + # Run the notebook with IPython, not Jupyter Lab, so it exits and prints something informative to stdout + # Skip executing Jupyter lab + NSYS_JAX_JUPYTER_EXECUTE_NOT_LAB=1 ./install.sh + popd + done + + test-nsys-jax-eks: + needs: build-jax + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + runs-on: eks + env: + JAX_DOCKER_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} + JOB_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-jax + POSTPROCESS_JOB_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-postprocess + TOKEN_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-token + steps: + - name: GHCR login + uses: ./.github/actions/ghcr-login + with: + docker-username: ${{ github.repository_owner }} + docker-password: ${{ secrets.GITHUB_TOKEN}} + token-name: 
${{ env.TOKEN_NAME }} + - name: Configure Kubernetes job + run: | + yq -i ea 'select(di == 0).spec.selector.job-name = strenv(JOB_NAME) + | select(di == 1).metadata.name = strenv(JOB_NAME) + | select(di == 1).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) + | select(di == 1).spec.template.spec.containers[0].image = strenv(JAX_DOCKER_IMAGE) + | select(di == 1).spec.template.spec.containers[0].env[0].value = strenv(JOB_NAME)' \ + .github/eks-workflow-files/job.yml + git diff .github/eks-workflow-files/job.yml + - name: Submit Kubernetes job + uses: ./.github/acitons/submit-k8s-job + with: + job-config-file: .github/eks-workflow-files/job.yml + job-name: ${{ env.JOB_NAME }} + + # Clean up in case of errors as well as success + - name: Delete Kubernetes job + uses: ./.github/actions/delete-k8s-job + with: + job-name: ${{ env.JOB_NAME }} - # - name: Configure post-processing job - # run: | - # export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip" - # yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME) - # | .spec.template.spec.containers[].image = strenv(JAX_DOCKER_IMAGE) - # | .spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) - # | .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \ - # .github/eks-workflow-files/post-process-job.yml - # git diff .github/eks-workflow-files/post-process-job.yml - # - name: Submit Kubernetes job - # uses: ./.github/acitons/submit-k8s-job - # with: - # job-config-file: .github/eks-workflow-files/post-process-job.yml - # job-name: ${{ env.POSTPROCESS_JOB_NAME }} - # - name: Delete post-processing Kubernetes job - # uses: ./.github/actions/delete-k8s-job - # with: - # job-name: ${{ env.POSTPROCESS_JOB_NAME }} - # - name: Delete GitHub Container Registry token - # uses: ./.github/actions/delete-ghcr-token - # with: - # token-name: ${{ env.TOKEN_NAME }} - - # # test-equinox: - # # needs: build-equinox - # # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # # uses: 
./.github/workflows/_test_unit.yaml - # # with: - # # IMAGE: ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} - # # TEST_NAME: equinox - # # EXECUTE: | - # # docker run --shm-size=1g --gpus all ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} \ - # # bash -exc -o pipefail \ - # # 'pushd /opt/equinox/tests && pip install -r requirements.txt && pytest .' | tee test-equinox.log - # # STATISTICS_SCRIPT: | - # # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - # # failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') - # # passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') - # # total_tests=$((failed_tests + passed_tests)) - # # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - # # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - # # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - # # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - # # ARTIFACTS: | - # # test-equinox.log - # # secrets: inherit - - # test-te-multigpu: - # needs: build-upstream-pax - # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # uses: ./.github/workflows/_test_te.yaml - # with: - # TE_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} - # secrets: inherit + - name: Configure post-processing job + run: | + export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip" + yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME) + | .spec.template.spec.containers[].image = strenv(JAX_DOCKER_IMAGE) + | .spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) + | .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \ + .github/eks-workflow-files/post-process-job.yml + git diff .github/eks-workflow-files/post-process-job.yml + - name: Submit Kubernetes job + uses: ./.github/acitons/submit-k8s-job + with: + job-config-file: .github/eks-workflow-files/post-process-job.yml + job-name: ${{ 
env.POSTPROCESS_JOB_NAME }} + - name: Delete post-processing Kubernetes job + uses: ./.github/actions/delete-k8s-job + with: + job-name: ${{ env.POSTPROCESS_JOB_NAME }} + - name: Delete GitHub Container Registry token + uses: ./.github/actions/delete-ghcr-token + with: + token-name: ${{ env.TOKEN_NAME }} - # test-upstream-t5x: - # needs: build-upstream-t5x + # test-equinox: + # needs: build-equinox # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # uses: ./.github/workflows/_test_upstream_t5x.yaml - # with: - # T5X_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }} - # secrets: inherit - - # test-rosetta-t5x: - # needs: build-rosetta-t5x - # if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 - # uses: ./.github/workflows/_test_t5x_rosetta.yaml - # with: - # T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }} - # secrets: inherit - - # test-triton: - # needs: build-triton - # if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 - # uses: ./.github/workflows/_test_unit.yaml - # with: - # TEST_NAME: triton - # EXECUTE: | - # docker run -i --shm-size=1g --gpus all --volume $PWD:/output \ - # ${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }} \ - # bash <<"EOF" |& tee test-triton.log - # # autotuner tests from jax-triton now hit a triton code path that uses utilities from pytorch; this relies on - # # actually having a CUDA backend for pytoch - # pip install --no-deps torch - # python /opt/jax-triton/tests/triton_call_test.py --xml_output_file /output/triton_test.xml - # EOF - # STATISTICS_SCRIPT: | - # curl -L -o yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg --print-architecture) && chmod 777 yq; - # total_tests=$(./yq '.testsuites."+@tests"' triton_test.xml) - # errors=$(./yq '.testsuites."+@errors"' triton_test.xml) - # failed_tests=$(./yq '.testsuites."+@failures"' triton_test.xml) - # passed_tests=$((total_tests - errors - failed_tests)) - # echo "TOTAL_TESTS=${total_tests}" >> 
$GITHUB_OUTPUT - # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - # ARTIFACTS: | - # test-triton.log - # secrets: inherit - - # test-levanter: - # needs: build-levanter - # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a # uses: ./.github/workflows/_test_unit.yaml # with: - # TEST_NAME: levanter + # IMAGE: ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} + # TEST_NAME: equinox # EXECUTE: | - # docker run -i --gpus all --shm-size=1g \ - # ${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }} \ - # bash <<"EOF" |& tee test-levanter.log - # pip install flake8 pytest soundfile librosa - # PYTHONPATH=/opt/levanter/tests:$PYTHONPATH pytest /opt/levanter/tests -m "not entry and not slow and not ray" - # EOF + # docker run --shm-size=1g --gpus all ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} \ + # bash -exc -o pipefail \ + # 'pushd /opt/equinox/tests && pip install -r requirements.txt && pytest .' 
| tee test-equinox.log # STATISTICS_SCRIPT: | - # summary_line=$(tail -n1 test-levanter.log) # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') # failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') # passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') @@ -599,112 +537,165 @@ jobs: # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT # ARTIFACTS: | - # test-levanter.log + # test-equinox.log # secrets: inherit - # test-te: - # needs: build-upstream-pax - # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # uses: ./.github/workflows/_test_unit.yaml - # with: - # TEST_NAME: te - # EXECUTE: | - # docker run -i --gpus all --shm-size=1g -v $PWD:/log \ - # ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} \ - # bash <<"EOF" |& tee test-te.log - # pip install pytest-reportlog - # pytest --report-log=log/pytest-report.jsonl ${SRC_PATH_TRANSFORMER_ENGINE}/tests/jax - # EOF - # STATISTICS_SCRIPT: | - # summary_line=$(tail -n1 test-te.log) - # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - # passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) - # failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) - # total_tests=$((failed_tests + passed_tests)) - # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - # TIMEOUT_MINUTES: 120 - # ARTIFACTS: | - # test-te.log - # pytest-report.jsonl - # secrets: inherit + test-te-multigpu: + needs: 
build-upstream-pax + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + uses: ./.github/workflows/_test_te.yaml + with: + TE_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} + secrets: inherit - # test-upstream-pax: - # needs: build-upstream-pax - # if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 - # uses: ./.github/workflows/_test_upstream_pax.yaml - # with: - # PAX_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} - # secrets: inherit + test-upstream-t5x: + needs: build-upstream-t5x + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + uses: ./.github/workflows/_test_upstream_t5x.yaml + with: + T5X_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }} + secrets: inherit - # test-rosetta-pax: - # needs: build-rosetta-pax - # if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 - # uses: ./.github/workflows/_test_pax_rosetta.yaml - # with: - # PAX_IMAGE: ${{ needs.build-rosetta-pax.outputs.DOCKER_TAG_FINAL }} - # secrets: inherit - - # test-gemma: - # needs: build-gemma - # uses: ./.github/workflows/_test_unit.yaml - # if: inputs.ARCHITECTURE == 'amd64' - # with: - # TEST_NAME: gemma - # EXECUTE: | - # docker run --shm-size=1g --gpus all ${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }} \ - # bash -ec \ - # "cd /opt/gemma && pip install -e .[dev] && pytest ." 
| tee test-gemma.log - # STATISTICS_SCRIPT: | - # summary_line=$(tail -n1 test-gemma.log) - # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - # failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') - # passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') - # total_tests=$((failed_tests + passed_tests)) - # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - # ARTIFACTS: | - # test-gemma.log - # secrets: inherit + test-rosetta-t5x: + needs: build-rosetta-t5x + if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 + uses: ./.github/workflows/_test_t5x_rosetta.yaml + with: + T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }} + secrets: inherit - # test-maxtext: - # needs: build-maxtext - # if: inputs.ARCHITECTURE == 'amd64' # no arm64 gpu runners - # uses: ./.github/workflows/_test_maxtext.yaml - # with: - # MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }} - # secrets: inherit + test-triton: + needs: build-triton + if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 + uses: ./.github/workflows/_test_unit.yaml + with: + TEST_NAME: triton + EXECUTE: | + docker run -i --shm-size=1g --gpus all --volume $PWD:/output \ + ${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }} \ + bash <<"EOF" |& tee test-triton.log + # autotuner tests from jax-triton now hit a triton code path that uses utilities from pytorch; this relies on + # actually having a CUDA backend for pytoch + pip install --no-deps torch + python /opt/jax-triton/tests/triton_call_test.py --xml_output_file /output/triton_test.xml + EOF + STATISTICS_SCRIPT: | + curl -L -o yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg 
--print-architecture) && chmod 777 yq; + total_tests=$(./yq '.testsuites."+@tests"' triton_test.xml) + errors=$(./yq '.testsuites."+@errors"' triton_test.xml) + failed_tests=$(./yq '.testsuites."+@failures"' triton_test.xml) + passed_tests=$((total_tests - errors - failed_tests)) + echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + ARTIFACTS: | + test-triton.log + secrets: inherit - # test-axlearn-slurm: - # needs: build-axlearn - # if: inputs.ARCHITECTURE == 'amd64' - # uses: ./.github/workflows/_test_unit.yaml - # with: # fix the arguments below - # TEST_NAME: axlearn - # EXECUTE: | - # docker run -i --shm-size=1g --gpus all \ - # ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }} \ - # bash <<"EOF" |& tee test-backend-independent.log - # test-axlearn.sh --directory "." --output "/opt/output/" --test-files "/opt/axlearn/axlearn/common/*_test.py" - # EOF - # STATISTICS_SCRIPT: | - # # Parse the summary.txt file to count passed/failed/error tests - # # Adjust greps if your output format changes. 
- # passed_tests=$(grep -c ": PASSED" /opt/output/summary.txt || true) - # failed_tests=$(grep -c ": FAILED" /opt/output/summary.txt || true) - # total_tests=$((failed_tests + passed_tests)) + test-levanter: + needs: build-levanter + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + uses: ./.github/workflows/_test_unit.yaml + with: + TEST_NAME: levanter + EXECUTE: | + docker run -i --gpus all --shm-size=1g \ + ${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }} \ + bash <<"EOF" |& tee test-levanter.log + pip install flake8 pytest soundfile librosa + PYTHONPATH=/opt/levanter/tests:$PYTHONPATH pytest /opt/levanter/tests -m "not entry and not slow and not ray" + EOF + STATISTICS_SCRIPT: | + summary_line=$(tail -n1 test-levanter.log) + errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') + passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') + total_tests=$((failed_tests + passed_tests)) + echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + ARTIFACTS: | + test-levanter.log + secrets: inherit - # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - # ARTIFACTS: | - # test-backend-independent.log - # secrets: inherit + test-te: + needs: build-upstream-pax + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + uses: ./.github/workflows/_test_unit.yaml + with: + TEST_NAME: te + EXECUTE: | + docker run -i --gpus all --shm-size=1g -v $PWD:/log \ + ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} \ + bash <<"EOF" |& tee test-te.log + pip install pytest-reportlog + pytest 
--report-log=log/pytest-report.jsonl ${SRC_PATH_TRANSFORMER_ENGINE}/tests/jax + EOF + STATISTICS_SCRIPT: | + summary_line=$(tail -n1 test-te.log) + errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) + failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) + total_tests=$((failed_tests + passed_tests)) + echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + TIMEOUT_MINUTES: 120 + ARTIFACTS: | + test-te.log + pytest-report.jsonl + secrets: inherit + test-upstream-pax: + needs: build-upstream-pax + if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 + uses: ./.github/workflows/_test_upstream_pax.yaml + with: + PAX_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} + secrets: inherit + + test-rosetta-pax: + needs: build-rosetta-pax + if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 + uses: ./.github/workflows/_test_pax_rosetta.yaml + with: + PAX_IMAGE: ${{ needs.build-rosetta-pax.outputs.DOCKER_TAG_FINAL }} + secrets: inherit + + test-gemma: + needs: build-gemma + uses: ./.github/workflows/_test_unit.yaml + if: inputs.ARCHITECTURE == 'amd64' + with: + TEST_NAME: gemma + EXECUTE: | + docker run --shm-size=1g --gpus all ${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }} \ + bash -ec \ + "cd /opt/gemma && pip install -e .[dev] && pytest ." 
| tee test-gemma.log + STATISTICS_SCRIPT: | + summary_line=$(tail -n1 test-gemma.log) + errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') + passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') + total_tests=$((failed_tests + passed_tests)) + echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + ARTIFACTS: | + test-gemma.log + secrets: inherit + + test-maxtext: + needs: build-maxtext + if: inputs.ARCHITECTURE == 'amd64' # no arm64 gpu runners + uses: ./.github/workflows/_test_maxtext.yaml + with: + MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }} + secrets: inherit test-axlearn-eks: needs: build-axlearn @@ -749,6 +740,7 @@ jobs: job-name: ${{ env.JOB_NAME }} - name: Download logs from S3 + id: log-s3 run: | mkdir -p /tmp/axlearn-output aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ env.DATE_TEST_RAN }}/summary.txt /tmp/axlearn-output/ @@ -764,7 +756,6 @@ jobs: echo "PASSED_TESTS=$passed_tests" >> $GITHUB_OUTPUT echo "FAILED_TESTS=$failed_tests" >> $GITHUB_OUTPUT echo "TOTAL_TESTS=$total_tests" >> $GITHUB_OUTPUT - - name: Generate sitrep id: sitrep if: "!cancelled()" @@ -775,9 +766,9 @@ jobs: badge_label='Axlearn EKS Unit' - total_tests=${{ steps.test-stats.outputs.TOTAL_TESTS }} \ - failed_tests=${{ steps.test-stats.outputs.FAILED_TESTS }} \ - passed_tests=${{ steps.test-stats.outputs.PASSED_TESTS }} \ + total_tests=${{ steps.log-s3.outputs.TOTAL_TESTS }} \ + failed_tests=${{ steps.log-s3.outputs.FAILED_TESTS }} \ + passed_tests=${{ steps.log-s3.outputs.PASSED_TESTS }} \ errors="0" \ summary="All tests: $total_tests. Passed: $passed_tests. Failed: $failed_tests." 
\ badge_message="Passed $passed_tests out of $total_tests." \ From 3cd5b7842c88db218db868c510291a4d578eaa86 Mon Sep 17 00:00:00 2001 From: Steboss Date: Sat, 15 Feb 2025 19:58:19 +0000 Subject: [PATCH 27/89] Fix nsys --- .github/workflows/_ci.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 02c2c4611..bab13b8eb 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -466,6 +466,8 @@ jobs: POSTPROCESS_JOB_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-postprocess TOKEN_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-token steps: + - name: Check out the repository + uses: actions/checkout@v4 - name: GHCR login uses: ./.github/actions/ghcr-login with: From 51307d9cd04c9bbab3ba48cb3a698a705e6a59f0 Mon Sep 17 00:00:00 2001 From: Steboss Date: Sun, 16 Feb 2025 20:50:18 +0000 Subject: [PATCH 28/89] fix typo --- .github/workflows/_ci.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index bab13b8eb..078449f63 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -484,7 +484,7 @@ jobs: .github/eks-workflow-files/job.yml git diff .github/eks-workflow-files/job.yml - name: Submit Kubernetes job - uses: ./.github/acitons/submit-k8s-job + uses: ./.github/actions/submit-k8s-job with: job-config-file: .github/eks-workflow-files/job.yml job-name: ${{ env.JOB_NAME }} @@ -505,7 +505,7 @@ jobs: .github/eks-workflow-files/post-process-job.yml git diff .github/eks-workflow-files/post-process-job.yml - name: Submit Kubernetes job - uses: ./.github/acitons/submit-k8s-job + uses: ./.github/actions/submit-k8s-job with: job-config-file: .github/eks-workflow-files/post-process-job.yml job-name: ${{ env.POSTPROCESS_JOB_NAME }} From 8d7af610ad81805d6ffbfd316730a6ba07715906 Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 17 Feb 2025 10:37:46 +0000 Subject: [PATCH 29/89] test on eks --- 
.github/actions/delete-k8s-job/action.yml | 21 +- .github/workflows/_ci.yaml | 915 +++++++++++----------- 2 files changed, 477 insertions(+), 459 deletions(-) diff --git a/.github/actions/delete-k8s-job/action.yml b/.github/actions/delete-k8s-job/action.yml index 877039672..97e2b12a9 100644 --- a/.github/actions/delete-k8s-job/action.yml +++ b/.github/actions/delete-k8s-job/action.yml @@ -5,12 +5,29 @@ inputs: job-name: description: The job name to delete required: true + token-name: + description: Name of the K8s secret to delete + required: true runs: using: "composite" steps: - name: Delete Kubernetes job shell: bash - if: always() run: | - kubectl delete job ${{ inputs.job-name }} + # make sure we're deleting all the resources + pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ inputs.job-name }} -o jsonpath='{.items[*].metadata.name}') + + for pod in $pods; do + status=$(kubectl get pod "$pod" -o jsonpath='{.status.phase}' || true) + echo "Pod: $pod, status: $status" + if [ "$status" = "Running" ] || [ "$status" = "Pending" ]; then + kubectl delete pod "$pod" --force --grace-period=0 || true + fi + + # make sure job is deleted + kubectl delete job ${{ inputs.job-name }} --force --grace-period=0 || true + + + # delet eghcr secret + kubectl delete secret ${{ inputs.token-name }} || true diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 078449f63..87c0b689a 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -66,142 +66,142 @@ jobs: URLREF_TRANSFORMER_ENGINE=${{ fromJson(inputs.SOURCE_URLREFS).TRANSFORMER_ENGINE }} secrets: inherit - build-triton: - needs: build-jax - if: inputs.ARCHITECTURE == 'amd64' # Triton does not seem to support arm64 - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: artifact-triton-build - BADGE_FILENAME: badge-triton-build - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ 
needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: triton - DOCKERFILE: .github/container/Dockerfile.triton - RUNNER_SIZE: large - EXTRA_BUILD_ARGS: | - URLREF_JAX_TRITON=${{ fromJson(inputs.SOURCE_URLREFS).JAX_TRITON }} - secrets: inherit + # build-triton: + # needs: build-jax + # if: inputs.ARCHITECTURE == 'amd64' # Triton does not seem to support arm64 + # uses: ./.github/workflows/_build.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: artifact-triton-build + # BADGE_FILENAME: badge-triton-build + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: triton + # DOCKERFILE: .github/container/Dockerfile.triton + # RUNNER_SIZE: large + # EXTRA_BUILD_ARGS: | + # URLREF_JAX_TRITON=${{ fromJson(inputs.SOURCE_URLREFS).JAX_TRITON }} + # secrets: inherit - build-equinox: - needs: build-jax - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: artifact-equinox-build - BADGE_FILENAME: badge-equinox-build - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: equinox - DOCKERFILE: .github/container/Dockerfile.equinox - EXTRA_BUILD_ARGS: | - URLREF_EQUINOX=${{ fromJson(inputs.SOURCE_URLREFS).EQUINOX }} - secrets: inherit + # build-equinox: + # needs: build-jax + # uses: ./.github/workflows/_build.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: artifact-equinox-build + # BADGE_FILENAME: badge-equinox-build + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: equinox + # DOCKERFILE: .github/container/Dockerfile.equinox + # EXTRA_BUILD_ARGS: | + # URLREF_EQUINOX=${{ fromJson(inputs.SOURCE_URLREFS).EQUINOX }} + # secrets: inherit - build-maxtext: - needs: build-jax - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: ${{ 
inputs.ARCHITECTURE }} - ARTIFACT_NAME: artifact-maxtext-build - BADGE_FILENAME: badge-maxtext-build - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: maxtext - DOCKERFILE: .github/container/Dockerfile.maxtext - EXTRA_BUILD_ARGS: | - URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }} - secrets: inherit + # build-maxtext: + # needs: build-jax + # uses: ./.github/workflows/_build.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: artifact-maxtext-build + # BADGE_FILENAME: badge-maxtext-build + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: maxtext + # DOCKERFILE: .github/container/Dockerfile.maxtext + # EXTRA_BUILD_ARGS: | + # URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }} + # secrets: inherit - build-levanter: - needs: [build-jax] - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: "artifact-levanter-build" - BADGE_FILENAME: "badge-levanter-build" - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: levanter - DOCKERFILE: .github/container/Dockerfile.levanter - EXTRA_BUILD_ARGS: | - URLREF_LEVANTER=${{ fromJson(inputs.SOURCE_URLREFS).LEVANTER }} - URLREF_HALIAX=${{ fromJson(inputs.SOURCE_URLREFS).HALIAX }} - secrets: inherit + # build-levanter: + # needs: [build-jax] + # uses: ./.github/workflows/_build.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: "artifact-levanter-build" + # BADGE_FILENAME: "badge-levanter-build" + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: levanter + # DOCKERFILE: .github/container/Dockerfile.levanter + # EXTRA_BUILD_ARGS: | + # URLREF_LEVANTER=${{ fromJson(inputs.SOURCE_URLREFS).LEVANTER }} + # 
URLREF_HALIAX=${{ fromJson(inputs.SOURCE_URLREFS).HALIAX }} + # secrets: inherit - build-upstream-t5x: - needs: build-jax - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: "artifact-t5x-build" - BADGE_FILENAME: "badge-t5x-build" - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: upstream-t5x - DOCKERFILE: .github/container/Dockerfile.t5x - EXTRA_BUILD_ARGS: | - URLREF_T5X=${{ fromJson(inputs.SOURCE_URLREFS).T5X }} - URLREF_AIRIO=${{ fromJson(inputs.SOURCE_URLREFS).AIRIO }} - secrets: inherit + # build-upstream-t5x: + # needs: build-jax + # uses: ./.github/workflows/_build.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: "artifact-t5x-build" + # BADGE_FILENAME: "badge-t5x-build" + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: upstream-t5x + # DOCKERFILE: .github/container/Dockerfile.t5x + # EXTRA_BUILD_ARGS: | + # URLREF_T5X=${{ fromJson(inputs.SOURCE_URLREFS).T5X }} + # URLREF_AIRIO=${{ fromJson(inputs.SOURCE_URLREFS).AIRIO }} + # secrets: inherit - build-upstream-pax: - needs: build-jax - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: artifact-pax-build - BADGE_FILENAME: badge-pax-build - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: upstream-pax - DOCKERFILE: .github/container/Dockerfile.pax - EXTRA_BUILD_ARGS: | - URLREF_PAXML=${{ fromJson(inputs.SOURCE_URLREFS).PAXML }} - URLREF_PRAXIS=${{ fromJson(inputs.SOURCE_URLREFS).PRAXIS }} - URLREF_LINGVO=${{ fromJson(inputs.SOURCE_URLREFS).LINGVO }} - secrets: inherit + # build-upstream-pax: + # needs: build-jax + # uses: ./.github/workflows/_build.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: artifact-pax-build + # 
BADGE_FILENAME: badge-pax-build + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: upstream-pax + # DOCKERFILE: .github/container/Dockerfile.pax + # EXTRA_BUILD_ARGS: | + # URLREF_PAXML=${{ fromJson(inputs.SOURCE_URLREFS).PAXML }} + # URLREF_PRAXIS=${{ fromJson(inputs.SOURCE_URLREFS).PRAXIS }} + # URLREF_LINGVO=${{ fromJson(inputs.SOURCE_URLREFS).LINGVO }} + # secrets: inherit - build-rosetta-t5x: - needs: build-upstream-t5x - uses: ./.github/workflows/_build_rosetta.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }} - BASE_LIBRARY: t5x - secrets: inherit + # build-rosetta-t5x: + # needs: build-upstream-t5x + # uses: ./.github/workflows/_build_rosetta.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }} + # BASE_LIBRARY: t5x + # secrets: inherit - build-rosetta-pax: - needs: build-upstream-pax - uses: ./.github/workflows/_build_rosetta.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_MEALKIT }} - BASE_LIBRARY: pax - secrets: inherit + # build-rosetta-pax: + # needs: build-upstream-pax + # uses: ./.github/workflows/_build_rosetta.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_MEALKIT }} + # BASE_LIBRARY: pax + # secrets: inherit - build-gemma: - needs: build-jax - uses: ./.github/workflows/_build.yaml - if: inputs.ARCHITECTURE == 'amd64' # build only amd64 - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: artifact-gemma-build - BADGE_FILENAME: badge-gemma-build - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ 
needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: gemma - DOCKERFILE: rosetta/Dockerfile.gemma - DOCKER_CONTEXT: . - EXTRA_BUILD_ARGS: | - URLREF_GEMMA=${{ fromJson(inputs.SOURCE_URLREFS).GEMMA }} - URLREF_BIG_VISION=${{ fromJson(inputs.SOURCE_URLREFS).BIG_VISION }} - URLREF_COMMON_LOOP_UTILS=${{ fromJson(inputs.SOURCE_URLREFS).COMMON_LOOP_UTILS }} - URLREF_FLAXFORMER=${{ fromJson(inputs.SOURCE_URLREFS).FLAXFORMER }} - URLREF_PANOPTICAPI=${{ fromJson(inputs.SOURCE_URLREFS).PANOPTICAPI }} - secrets: inherit + # build-gemma: + # needs: build-jax + # uses: ./.github/workflows/_build.yaml + # if: inputs.ARCHITECTURE == 'amd64' # build only amd64 + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: artifact-gemma-build + # BADGE_FILENAME: badge-gemma-build + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: gemma + # DOCKERFILE: rosetta/Dockerfile.gemma + # DOCKER_CONTEXT: . + # EXTRA_BUILD_ARGS: | + # URLREF_GEMMA=${{ fromJson(inputs.SOURCE_URLREFS).GEMMA }} + # URLREF_BIG_VISION=${{ fromJson(inputs.SOURCE_URLREFS).BIG_VISION }} + # URLREF_COMMON_LOOP_UTILS=${{ fromJson(inputs.SOURCE_URLREFS).COMMON_LOOP_UTILS }} + # URLREF_FLAXFORMER=${{ fromJson(inputs.SOURCE_URLREFS).FLAXFORMER }} + # URLREF_PANOPTICAPI=${{ fromJson(inputs.SOURCE_URLREFS).PANOPTICAPI }} + # secrets: inherit build-axlearn: needs: build-jax @@ -222,15 +222,15 @@ jobs: needs: - build-base - build-jax - - build-triton - - build-equinox - - build-maxtext - - build-levanter - - build-upstream-t5x - - build-upstream-pax - - build-rosetta-t5x - - build-rosetta-pax - - build-gemma + # - build-triton + # - build-equinox + # - build-maxtext + # - build-levanter + # - build-upstream-t5x + # - build-upstream-pax + # - build-rosetta-t5x + # - build-rosetta-pax + # - build-gemma - build-axlearn outputs: TAGS: ${{ steps.collect-tags.outputs.TAGS }} @@ -242,26 +242,26 @@ jobs: [\ {"flavor": "base", "stage": 
"final", "priority": 800, "tag": "${{ needs.build-base.outputs.DOCKER_TAG }}"},\ {"flavor": "jax", "stage": "final", "priority": 1000, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "triton", "stage": "final", "priority": 900, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "equinox", "stage": "final", "priority": 900, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "maxtext", "stage": "final", "priority": 900, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "levanter", "stage": "final", "priority": 900, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "upstream-t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "upstream-pax", "stage": "final", "priority": 900, "tag": "${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "pax", "stage": "final", "priority": 900, "tag": "${{ needs.build-rosetta-pax.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "gemma", "stage": "final", "priority": 900, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }}"},\ + # {"flavor": "triton", "stage": "final", "priority": 900, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }}"},\ + # {"flavor": "equinox", "stage": "final", "priority": 900, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }}"},\ + # {"flavor": "maxtext", "stage": "final", "priority": 900, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}"},\ + # {"flavor": "levanter", "stage": "final", "priority": 900, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }}"},\ + # {"flavor": "upstream-t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }}"},\ + # {"flavor": "upstream-pax", "stage": 
"final", "priority": 900, "tag": "${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }}"},\ + # {"flavor": "t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }}"},\ + # {"flavor": "pax", "stage": "final", "priority": 900, "tag": "${{ needs.build-rosetta-pax.outputs.DOCKER_TAG_FINAL }}"},\ + # {"flavor": "gemma", "stage": "final", "priority": 900, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "axlearn", "stage": "final", "priority": 900, "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "jax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "triton", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "equinox", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "maxtext", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "levanter", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "upstream-t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "upstream-pax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-upstream-pax.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "pax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-pax.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "gemma", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_MEALKIT }}"},\ + # {"flavor": "triton", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_MEALKIT }}"},\ + # 
{"flavor": "equinox", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_MEALKIT }}"},\ + # {"flavor": "maxtext", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_MEALKIT }}"},\ + # {"flavor": "levanter", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_MEALKIT }}"},\ + # {"flavor": "upstream-t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ + # {"flavor": "upstream-pax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-upstream-pax.outputs.DOCKER_TAG_MEALKIT }}"},\ + # {"flavor": "t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ + # {"flavor": "pax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-pax.outputs.DOCKER_TAG_MEALKIT }}"},\ + # {"flavor": "gemma", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_MEALKIT }}"},\ {"flavor": "axlearn", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_MEALKIT }}"},\ {}\ @@ -271,27 +271,27 @@ jobs: echo "TAGS=${TAGS}" >> $GITHUB_OUTPUT - test-distribution: - runs-on: ubuntu-22.04 - strategy: - matrix: - TEST_SCRIPT: - - extra-only-distribution.sh - - mirror-only-distribution.sh - - upstream-only-distribution.sh - - local-patch-distribution.sh - fail-fast: false - steps: - - name: Print environment variables - run: env - - name: Set git login for tests - run: | - git config --global user.email "jax@nvidia.com" - git config --global user.name "JAX-Toolbox CI" - - name: Check out the repository under ${GITHUB_WORKSPACE} - uses: actions/checkout@v4 - - name: Run integration test ${{ matrix.TEST_SCRIPT }} - run: bash rosetta/tests/${{ matrix.TEST_SCRIPT }} + # test-distribution: + # runs-on: ubuntu-22.04 + # strategy: + # matrix: + # TEST_SCRIPT: + # - extra-only-distribution.sh + # - 
mirror-only-distribution.sh + # - upstream-only-distribution.sh + # - local-patch-distribution.sh + # fail-fast: false + # steps: + # - name: Print environment variables + # run: env + # - name: Set git login for tests + # run: | + # git config --global user.email "jax@nvidia.com" + # git config --global user.name "JAX-Toolbox CI" + # - name: Check out the repository under ${GITHUB_WORKSPACE} + # uses: actions/checkout@v4 + # - name: Run integration test ${{ matrix.TEST_SCRIPT }} + # run: bash rosetta/tests/${{ matrix.TEST_SCRIPT }} test-jax: needs: build-jax @@ -325,136 +325,136 @@ jobs: test-gpu.log secrets: inherit - test-nsys-jax: - needs: build-jax - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_unit.yaml - with: - TEST_NAME: nsys-jax - EXECUTE: | - set -o pipefail - num_tests=0 - num_failures=0 - # Run the pytest-driven tests; failure is explicitly handled below so set +e to - # avoid an early abort here. - set +e - docker run -i --shm-size=1g --gpus all \ - -v $PWD:/opt/output \ - ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee test-nsys-jax.log - # nsys-jax is already installed, this is just adding the test dependencies - pip install pytest-reportlog nsys-jax[test] - # abuse knowledge that nsys-jax is installed editable, so the tests exist - test_path=$(python -c 'import importlib.resources; print(importlib.resources.files("nsys_jax").joinpath("..", "tests").resolve())') - pytest --report-log=/opt/output/pytest-report.jsonl "${test_path}" - EOF - set -e - GPUS_PER_NODE=$(nvidia-smi -L | grep -c '^GPU') - for mode in 1-process 2-process process-per-gpu; do - DOCKER="docker run --shm-size=1g --gpus all --env XLA_FLAGS=--xla_gpu_enable_command_buffer= --env XLA_PYTHON_CLIENT_MEM_FRACTION=0.9 -v ${PWD}:/opt/output ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}" - if [[ "${mode}" == "1-process" ]]; then - PROCESS_COUNT=1 - ARGS="" - elif [[ "${mode}" == "2-process" ]]; then - # Use two processes 
with GPUS_PER_NODE/2 GPUs per process in the hope that - # this will flush out more bugs than process-per-node or process-per-GPU. - PROCESS_COUNT=2 - ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process $((GPUS_PER_NODE/2)) --distributed" - else - PROCESS_COUNT=${GPUS_PER_NODE} - ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process 1 --distributed" - fi - for collection in full partial; do - NSYS_JAX="nsys-jax" - if [[ "${mode}" == "1-process" ]]; then - # We will not run nsys-jax-combine, so run analyses eagerly - NSYS_JAX+=" --nsys-jax-analysis communication --nsys-jax-analysis summary" - fi - NSYS_JAX+=" --output=/opt/output/${mode}-${collection}-execution-%q{RANK}" - if [[ "${collection}" == "partial" ]]; then - NSYS_JAX+=" --capture-range=cudaProfilerApi --capture-range-end=stop" - # nvbug/4801401 - NSYS_JAX+=" --sample=none" - fi - set +e - ${DOCKER} parallel-launch RANK ${PROCESS_COUNT} ${NSYS_JAX} \ - -- jax-nccl-test ${ARGS} |& tee ${mode}-${collection}-execution.log - num_failures=$((num_failures + ($? 
!= 0))) - set -e - num_tests=$((num_tests + 1)) - done - if [[ "${mode}" != "1-process" ]]; then - # Run nsys-jax-combine - NSYS_JAX_COMBINE="nsys-jax-combine --analysis communication --analysis summary --output=/opt/output/${mode}-${collection}-execution.zip" - for (( i=0; i> $GITHUB_ENV - echo "NSYS_JAX_FAIL_COUNT=${num_failures}" >> $GITHUB_ENV - exit $num_failures - STATISTICS_SCRIPT: | - summary_line=$(tail -n1 test-nsys-jax.log) - num_errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) - failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) - total_tests=$(( NSYS_JAX_TEST_COUNT + passed_tests + failed_tests )) - num_passed=$(( passed_tests + NSYS_JAX_TEST_COUNT - NSYS_JAX_FAIL_COUNT )) - num_failed=$(( failed_tests + NSYS_JAX_FAIL_COUNT )) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${num_errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${num_passed}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${num_failed}" >> $GITHUB_OUTPUT - ARTIFACTS: | - # pytest-driven part - test-nsys-jax.log - pytest-report.jsonl - # nsys-jax logfiles - *process-*-execution.log - # nsys-jax output for the case that doesn't use nsys-jax-combine - 1-process-*-execution-0.zip - # nsys-jax-combine output/logfiles - *process*-*-execution.zip - *-execution-combine.log - secrets: inherit + # test-nsys-jax: + # needs: build-jax + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # uses: ./.github/workflows/_test_unit.yaml + # with: + # TEST_NAME: nsys-jax + # EXECUTE: | + # set -o pipefail + # num_tests=0 + # num_failures=0 + # # Run the pytest-driven tests; failure is explicitly handled below so set +e to + # # avoid an early abort here. 
+ # set +e + # docker run -i --shm-size=1g --gpus all \ + # -v $PWD:/opt/output \ + # ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-nsys-jax.log + # # nsys-jax is already installed, this is just adding the test dependencies + # pip install pytest-reportlog nsys-jax[test] + # # abuse knowledge that nsys-jax is installed editable, so the tests exist + # test_path=$(python -c 'import importlib.resources; print(importlib.resources.files("nsys_jax").joinpath("..", "tests").resolve())') + # pytest --report-log=/opt/output/pytest-report.jsonl "${test_path}" + # EOF + # set -e + # GPUS_PER_NODE=$(nvidia-smi -L | grep -c '^GPU') + # for mode in 1-process 2-process process-per-gpu; do + # DOCKER="docker run --shm-size=1g --gpus all --env XLA_FLAGS=--xla_gpu_enable_command_buffer= --env XLA_PYTHON_CLIENT_MEM_FRACTION=0.9 -v ${PWD}:/opt/output ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}" + # if [[ "${mode}" == "1-process" ]]; then + # PROCESS_COUNT=1 + # ARGS="" + # elif [[ "${mode}" == "2-process" ]]; then + # # Use two processes with GPUS_PER_NODE/2 GPUs per process in the hope that + # # this will flush out more bugs than process-per-node or process-per-GPU. 
+ # PROCESS_COUNT=2 + # ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process $((GPUS_PER_NODE/2)) --distributed" + # else + # PROCESS_COUNT=${GPUS_PER_NODE} + # ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process 1 --distributed" + # fi + # for collection in full partial; do + # NSYS_JAX="nsys-jax" + # if [[ "${mode}" == "1-process" ]]; then + # # We will not run nsys-jax-combine, so run analyses eagerly + # NSYS_JAX+=" --nsys-jax-analysis communication --nsys-jax-analysis summary" + # fi + # NSYS_JAX+=" --output=/opt/output/${mode}-${collection}-execution-%q{RANK}" + # if [[ "${collection}" == "partial" ]]; then + # NSYS_JAX+=" --capture-range=cudaProfilerApi --capture-range-end=stop" + # # nvbug/4801401 + # NSYS_JAX+=" --sample=none" + # fi + # set +e + # ${DOCKER} parallel-launch RANK ${PROCESS_COUNT} ${NSYS_JAX} \ + # -- jax-nccl-test ${ARGS} |& tee ${mode}-${collection}-execution.log + # num_failures=$((num_failures + ($? 
!= 0))) + # set -e + # num_tests=$((num_tests + 1)) + # done + # if [[ "${mode}" != "1-process" ]]; then + # # Run nsys-jax-combine + # NSYS_JAX_COMBINE="nsys-jax-combine --analysis communication --analysis summary --output=/opt/output/${mode}-${collection}-execution.zip" + # for (( i=0; i> $GITHUB_ENV + # echo "NSYS_JAX_FAIL_COUNT=${num_failures}" >> $GITHUB_ENV + # exit $num_failures + # STATISTICS_SCRIPT: | + # summary_line=$(tail -n1 test-nsys-jax.log) + # num_errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + # passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) + # failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) + # total_tests=$(( NSYS_JAX_TEST_COUNT + passed_tests + failed_tests )) + # num_passed=$(( passed_tests + NSYS_JAX_TEST_COUNT - NSYS_JAX_FAIL_COUNT )) + # num_failed=$(( failed_tests + NSYS_JAX_FAIL_COUNT )) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${num_errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${num_passed}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${num_failed}" >> $GITHUB_OUTPUT + # ARTIFACTS: | + # # pytest-driven part + # test-nsys-jax.log + # pytest-report.jsonl + # # nsys-jax logfiles + # *process-*-execution.log + # # nsys-jax output for the case that doesn't use nsys-jax-combine + # 1-process-*-execution-0.zip + # # nsys-jax-combine output/logfiles + # *process*-*-execution.zip + # *-execution-combine.log + # secrets: inherit # test-nsys-jax generates several fresh .zip archive outputs by running nsys-jax with real GPU hardware; this test # runs on a regular GitHub Actions runner and checks that offline post-processing works in an environment that does # not already have nsys-jax installed - test-nsys-jax-archive: - needs: test-nsys-jax - if: 
inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - strategy: - matrix: - os: [ubuntu-22.04, ubuntu-24.04, macOS-latest] - runs-on: ${{ matrix.os }} - steps: - - name: Download nsys-jax output .zip files - uses: actions/download-artifact@v4 - with: - name: nsys-jax-unit-test-A100 - - name: Extract archives and execute install scripts - run: | - pip install virtualenv # for install.sh - for zip in $(ls *.zip); do - ZIP="${PWD}/${zip}" - pushd $(mktemp -d) - unzip "${ZIP}" - ls -l - # TODO: verify this isn't needed, or make sure it isn't needed - chmod 755 install.sh - # Run the notebook with IPython, not Jupyter Lab, so it exits and prints something informative to stdout - # Skip executing Jupyter lab - NSYS_JAX_JUPYTER_EXECUTE_NOT_LAB=1 ./install.sh - popd - done + # test-nsys-jax-archive: + # needs: test-nsys-jax + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # strategy: + # matrix: + # os: [ubuntu-22.04, ubuntu-24.04, macOS-latest] + # runs-on: ${{ matrix.os }} + # steps: + # - name: Download nsys-jax output .zip files + # uses: actions/download-artifact@v4 + # with: + # name: nsys-jax-unit-test-A100 + # - name: Extract archives and execute install scripts + # run: | + # pip install virtualenv # for install.sh + # for zip in $(ls *.zip); do + # ZIP="${PWD}/${zip}" + # pushd $(mktemp -d) + # unzip "${ZIP}" + # ls -l + # # TODO: verify this isn't needed, or make sure it isn't needed + # chmod 755 install.sh + # # Run the notebook with IPython, not Jupyter Lab, so it exits and prints something informative to stdout + # # Skip executing Jupyter lab + # NSYS_JAX_JUPYTER_EXECUTE_NOT_LAB=1 ./install.sh + # popd + # done test-nsys-jax-eks: needs: build-jax @@ -489,11 +489,12 @@ jobs: job-config-file: .github/eks-workflow-files/job.yml job-name: ${{ env.JOB_NAME }} - # Clean up in case of errors as well as success - - name: Delete Kubernetes job + - name: Delete eks job uses: ./.github/actions/delete-k8s-job + if: ( cancelled() || always() ) with: job-name: 
${{ env.JOB_NAME }} + token-name: ${{ env.TOKEN_NAME }} - name: Configure post-processing job run: | @@ -509,10 +510,12 @@ jobs: with: job-config-file: .github/eks-workflow-files/post-process-job.yml job-name: ${{ env.POSTPROCESS_JOB_NAME }} - - name: Delete post-processing Kubernetes job + - name: Delete eks postprocess job uses: ./.github/actions/delete-k8s-job + if: ( cancelled() || always() ) with: - job-name: ${{ env.POSTPROCESS_JOB_NAME }} + job-name: ${{ env.POSTPROCESS_JOB_NAME }} + token-name: ${{ env.TOKEN_NAME }} - name: Delete GitHub Container Registry token uses: ./.github/actions/delete-ghcr-token with: @@ -542,162 +545,162 @@ jobs: # test-equinox.log # secrets: inherit - test-te-multigpu: - needs: build-upstream-pax - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_te.yaml - with: - TE_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} - secrets: inherit + # test-te-multigpu: + # needs: build-upstream-pax + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # uses: ./.github/workflows/_test_te.yaml + # with: + # TE_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit - test-upstream-t5x: - needs: build-upstream-t5x - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_upstream_t5x.yaml - with: - T5X_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }} - secrets: inherit + # test-upstream-t5x: + # needs: build-upstream-t5x + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # uses: ./.github/workflows/_test_upstream_t5x.yaml + # with: + # T5X_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit - test-rosetta-t5x: - needs: 
build-rosetta-t5x + # if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 + # uses: ./.github/workflows/_test_t5x_rosetta.yaml + # with: + # T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit - test-triton: - needs: build-triton - if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 - uses: ./.github/workflows/_test_unit.yaml - with: - TEST_NAME: triton - EXECUTE: | - docker run -i --shm-size=1g --gpus all --volume $PWD:/output \ - ${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee test-triton.log - # autotuner tests from jax-triton now hit a triton code path that uses utilities from pytorch; this relies on - # actually having a CUDA backend for pytoch - pip install --no-deps torch - python /opt/jax-triton/tests/triton_call_test.py --xml_output_file /output/triton_test.xml - EOF - STATISTICS_SCRIPT: | - curl -L -o yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg --print-architecture) && chmod 777 yq; - total_tests=$(./yq '.testsuites."+@tests"' triton_test.xml) - errors=$(./yq '.testsuites."+@errors"' triton_test.xml) - failed_tests=$(./yq '.testsuites."+@failures"' triton_test.xml) - passed_tests=$((total_tests - errors - failed_tests)) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - ARTIFACTS: | - test-triton.log - secrets: inherit + # test-triton: + # needs: build-triton + # if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 + # uses: ./.github/workflows/_test_unit.yaml + # with: + # TEST_NAME: triton + # EXECUTE: | + # docker run -i --shm-size=1g --gpus all --volume $PWD:/output \ + # ${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-triton.log + # # autotuner tests from jax-triton now hit a triton code path that uses utilities from pytorch; this relies on + # # 
actually having a CUDA backend for pytoch + # pip install --no-deps torch + # python /opt/jax-triton/tests/triton_call_test.py --xml_output_file /output/triton_test.xml + # EOF + # STATISTICS_SCRIPT: | + # curl -L -o yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg --print-architecture) && chmod 777 yq; + # total_tests=$(./yq '.testsuites."+@tests"' triton_test.xml) + # errors=$(./yq '.testsuites."+@errors"' triton_test.xml) + # failed_tests=$(./yq '.testsuites."+@failures"' triton_test.xml) + # passed_tests=$((total_tests - errors - failed_tests)) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # ARTIFACTS: | + # test-triton.log + # secrets: inherit - test-levanter: - needs: build-levanter - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_unit.yaml - with: - TEST_NAME: levanter - EXECUTE: | - docker run -i --gpus all --shm-size=1g \ - ${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee test-levanter.log - pip install flake8 pytest soundfile librosa - PYTHONPATH=/opt/levanter/tests:$PYTHONPATH pytest /opt/levanter/tests -m "not entry and not slow and not ray" - EOF - STATISTICS_SCRIPT: | - summary_line=$(tail -n1 test-levanter.log) - errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') - passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') - total_tests=$((failed_tests + passed_tests)) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - ARTIFACTS: 
| - test-levanter.log - secrets: inherit + # test-levanter: + # needs: build-levanter + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # uses: ./.github/workflows/_test_unit.yaml + # with: + # TEST_NAME: levanter + # EXECUTE: | + # docker run -i --gpus all --shm-size=1g \ + # ${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-levanter.log + # pip install flake8 pytest soundfile librosa + # PYTHONPATH=/opt/levanter/tests:$PYTHONPATH pytest /opt/levanter/tests -m "not entry and not slow and not ray" + # EOF + # STATISTICS_SCRIPT: | + # summary_line=$(tail -n1 test-levanter.log) + # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + # failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') + # passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') + # total_tests=$((failed_tests + passed_tests)) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # ARTIFACTS: | + # test-levanter.log + # secrets: inherit - test-te: - needs: build-upstream-pax - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_unit.yaml - with: - TEST_NAME: te - EXECUTE: | - docker run -i --gpus all --shm-size=1g -v $PWD:/log \ - ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee test-te.log - pip install pytest-reportlog - pytest --report-log=log/pytest-report.jsonl ${SRC_PATH_TRANSFORMER_ENGINE}/tests/jax - EOF - STATISTICS_SCRIPT: | - summary_line=$(tail -n1 test-te.log) - errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" 
and .outcome == "passed") | .outcome' | wc -l) - failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) - total_tests=$((failed_tests + passed_tests)) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - TIMEOUT_MINUTES: 120 - ARTIFACTS: | - test-te.log - pytest-report.jsonl - secrets: inherit + # test-te: + # needs: build-upstream-pax + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # uses: ./.github/workflows/_test_unit.yaml + # with: + # TEST_NAME: te + # EXECUTE: | + # docker run -i --gpus all --shm-size=1g -v $PWD:/log \ + # ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-te.log + # pip install pytest-reportlog + # pytest --report-log=log/pytest-report.jsonl ${SRC_PATH_TRANSFORMER_ENGINE}/tests/jax + # EOF + # STATISTICS_SCRIPT: | + # summary_line=$(tail -n1 test-te.log) + # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + # passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) + # failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) + # total_tests=$((failed_tests + passed_tests)) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # TIMEOUT_MINUTES: 120 + # ARTIFACTS: | + # test-te.log + # pytest-report.jsonl + # secrets: inherit - test-upstream-pax: - needs: build-upstream-pax - if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 - 
uses: ./.github/workflows/_test_upstream_pax.yaml - with: - PAX_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} - secrets: inherit + # test-upstream-pax: + # needs: build-upstream-pax + # if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 + # uses: ./.github/workflows/_test_upstream_pax.yaml + # with: + # PAX_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit - test-rosetta-pax: - needs: build-rosetta-pax - if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 - uses: ./.github/workflows/_test_pax_rosetta.yaml - with: - PAX_IMAGE: ${{ needs.build-rosetta-pax.outputs.DOCKER_TAG_FINAL }} - secrets: inherit + # test-rosetta-pax: + # needs: build-rosetta-pax + # if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 + # uses: ./.github/workflows/_test_pax_rosetta.yaml + # with: + # PAX_IMAGE: ${{ needs.build-rosetta-pax.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit - test-gemma: - needs: build-gemma - uses: ./.github/workflows/_test_unit.yaml - if: inputs.ARCHITECTURE == 'amd64' - with: - TEST_NAME: gemma - EXECUTE: | - docker run --shm-size=1g --gpus all ${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }} \ - bash -ec \ - "cd /opt/gemma && pip install -e .[dev] && pytest ." 
| tee test-gemma.log - STATISTICS_SCRIPT: | - summary_line=$(tail -n1 test-gemma.log) - errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') - passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') - total_tests=$((failed_tests + passed_tests)) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - ARTIFACTS: | - test-gemma.log - secrets: inherit + # test-gemma: + # needs: build-gemma + # uses: ./.github/workflows/_test_unit.yaml + # if: inputs.ARCHITECTURE == 'amd64' + # with: + # TEST_NAME: gemma + # EXECUTE: | + # docker run --shm-size=1g --gpus all ${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }} \ + # bash -ec \ + # "cd /opt/gemma && pip install -e .[dev] && pytest ." 
| tee test-gemma.log + # STATISTICS_SCRIPT: | + # summary_line=$(tail -n1 test-gemma.log) + # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + # failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') + # passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') + # total_tests=$((failed_tests + passed_tests)) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # ARTIFACTS: | + # test-gemma.log + # secrets: inherit - test-maxtext: - needs: build-maxtext - if: inputs.ARCHITECTURE == 'amd64' # no arm64 gpu runners - uses: ./.github/workflows/_test_maxtext.yaml - with: - MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }} - secrets: inherit + # test-maxtext: + # needs: build-maxtext + # if: inputs.ARCHITECTURE == 'amd64' # no arm64 gpu runners + # uses: ./.github/workflows/_test_maxtext.yaml + # with: + # MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit test-axlearn-eks: needs: build-axlearn @@ -738,8 +741,10 @@ jobs: - name: Delete axlearn test job uses: ./.github/actions/delete-k8s-job + if: ( cancelled() || always() ) with: job-name: ${{ env.JOB_NAME }} + token-name: ${{ env.TOKEN_NAME }} - name: Download logs from S3 id: log-s3 @@ -801,8 +806,4 @@ jobs: sitrep.json "badge-axlearn-test" summary.txt - - name: Delete GitHub Container Registry token - uses: ./.github/actions/delete-ghcr-token - with: - token-name: ${{ env.TOKEN_NAME }} From ca15908a9b6c0de0039093ae50aace4ab04d78ba Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 17 Feb 2025 11:21:44 +0000 Subject: [PATCH 30/89] forgot the done for --- .github/actions/delete-k8s-job/action.yml | 1 + 1 file changed, 1 insertion(+) diff --git 
a/.github/actions/delete-k8s-job/action.yml b/.github/actions/delete-k8s-job/action.yml index 97e2b12a9..db749daab 100644 --- a/.github/actions/delete-k8s-job/action.yml +++ b/.github/actions/delete-k8s-job/action.yml @@ -24,6 +24,7 @@ runs: if [ "$status" = "Running" ] || [ "$status" = "Pending" ]; then kubectl delete pod "$pod" --force --grace-period=0 || true fi + done # make sure job is deleted kubectl delete job ${{ inputs.job-name }} --force --grace-period=0 || true From 9fe301c82ee8f3c8a7629de95865fa18bb9c2907 Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 17 Feb 2025 12:03:48 +0000 Subject: [PATCH 31/89] move ghcr deletion a part --- .github/actions/delete-k8s-job/action.yml | 10 ++-------- .github/workflows/_ci.yaml | 8 ++++++-- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/.github/actions/delete-k8s-job/action.yml b/.github/actions/delete-k8s-job/action.yml index db749daab..15a5add64 100644 --- a/.github/actions/delete-k8s-job/action.yml +++ b/.github/actions/delete-k8s-job/action.yml @@ -5,9 +5,7 @@ inputs: job-name: description: The job name to delete required: true - token-name: - description: Name of the K8s secret to delete - required: true + runs: using: "composite" @@ -27,8 +25,4 @@ runs: done # make sure job is deleted - kubectl delete job ${{ inputs.job-name }} --force --grace-period=0 || true - - - # delet eghcr secret - kubectl delete secret ${{ inputs.token-name }} || true + kubectl delete job ${{ inputs.job-name }} --force --grace-period=0 || true \ No newline at end of file diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 87c0b689a..b2fa91b25 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -494,7 +494,6 @@ jobs: if: ( cancelled() || always() ) with: job-name: ${{ env.JOB_NAME }} - token-name: ${{ env.TOKEN_NAME }} - name: Configure post-processing job run: | @@ -515,9 +514,9 @@ jobs: if: ( cancelled() || always() ) with: job-name: ${{ env.JOB_NAME }} - token-name: ${{ 
env.TOKEN_NAME }} - name: Delete GitHub Container Registry token uses: ./.github/actions/delete-ghcr-token + if: ( cancelled() || always() ) with: token-name: ${{ env.TOKEN_NAME }} @@ -744,6 +743,11 @@ jobs: if: ( cancelled() || always() ) with: job-name: ${{ env.JOB_NAME }} + + - name: Delete GitHub Container Registry token + uses: ./.github/actions/delete-ghcr-token + if: ( cancelled() || always() ) + with: token-name: ${{ env.TOKEN_NAME }} - name: Download logs from S3 From 9125c820cc4aca7db768c8937c32fff297c0ccff Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 17 Feb 2025 14:22:03 +0000 Subject: [PATCH 32/89] try to replace postprocess --- .github/container/test-axlearn.sh | 2 +- .github/workflows/_ci.yaml | 26 +++++++++----------------- 2 files changed, 10 insertions(+), 18 deletions(-) diff --git a/.github/container/test-axlearn.sh b/.github/container/test-axlearn.sh index 27118b7a0..088c955df 100755 --- a/.github/container/test-axlearn.sh +++ b/.github/container/test-axlearn.sh @@ -120,7 +120,7 @@ echo "Running tests..." 
# If we are on Kubernetes, install torch for cpu only if [ "$K8S" = true ]; then pip install torch --extra-index-url https://download.pytorch.org/whl/cpu - pip install transformers + pip install transformers sklearn timm fi if [ "${#TEST_FILES[@]}" -eq 0 ]; then diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index b2fa91b25..0c09e3ba5 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -495,25 +495,17 @@ jobs: with: job-name: ${{ env.JOB_NAME }} - - name: Configure post-processing job + + - name: Postprocess retrieve test run: | export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip" - yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME) - | .spec.template.spec.containers[].image = strenv(JAX_DOCKER_IMAGE) - | .spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) - | .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \ - .github/eks-workflow-files/post-process-job.yml - git diff .github/eks-workflow-files/post-process-job.yml - - name: Submit Kubernetes job - uses: ./.github/actions/submit-k8s-job - with: - job-config-file: .github/eks-workflow-files/post-process-job.yml - job-name: ${{ env.POSTPROCESS_JOB_NAME }} - - name: Delete eks postprocess job - uses: ./.github/actions/delete-k8s-job - if: ( cancelled() || always() ) - with: - job-name: ${{ env.JOB_NAME }} + mkdir -p /tmp/axlearn-output + aws s3 cp --recursive --exclude "*" --include "${JOB_OUTPUT_PATTERN}" s3://jax-toolbox-eks-output/ /tmp/axlearn-output + + - name: Combine with nsys-jax-combine + run: | + cd /tmp/axlearn-output + nsys-jax-combine -o combined.zip ./*.zip --analysis communication - name: Delete GitHub Container Registry token uses: ./.github/actions/delete-ghcr-token if: ( cancelled() || always() ) From 4b39c9cdd2ea1b0ce241e48bd0be742df3e7f331 Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 17 Feb 2025 16:20:36 +0000 Subject: [PATCH 33/89] fix nccl test --- .github/actions/submit-k8s-job/action.yml | 13 ++- 
.github/workflows/_ci.yaml | 41 ++++--- .github/workflows/_test_nccl.yaml | 131 ---------------------- .github/workflows/nccl-k8s.yaml | 105 ++++++++++++++++- 4 files changed, 136 insertions(+), 154 deletions(-) delete mode 100644 .github/workflows/_test_nccl.yaml diff --git a/.github/actions/submit-k8s-job/action.yml b/.github/actions/submit-k8s-job/action.yml index c00826897..aa73cf2e2 100644 --- a/.github/actions/submit-k8s-job/action.yml +++ b/.github/actions/submit-k8s-job/action.yml @@ -17,12 +17,21 @@ runs: run: | kubectl apply -f "${{ inputs.job-config-file }}" - - name: Wait for Kubernetes job to start + - name: Wait for job to be un-suspended (Kueue) shell: bash + run: | + # wait for the job to be created + kubectl wait --for=create job/${{ inputs.job-name }} --timeout=60 + + # wait for the 'spec.suspend' field to become false. Necessary for kueue + kubectl wait --for=jsonpath='{.spec.suspend}=false' job/${{ inputs.job-name }} --timeout=3600s + + - name: Wait for pods to start + shell: bash run: | while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ inputs.job-name }} --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do echo "Waiting for pods to start..." 
- sleep 10 + sleep 20 done - name: Stream Kubernetes job output diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 0c09e3ba5..cfe15607c 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -462,9 +462,9 @@ jobs: runs-on: eks env: JAX_DOCKER_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} - JOB_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-jax - POSTPROCESS_JOB_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-postprocess - TOKEN_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-token + JOB_NAME: ${{ github.run_id }}-nsys-jax + POSTPROCESS_JOB_NAME: ${{ github.run_id }}-nsys-jax-postprocess + TOKEN_NAME: ${{ github.run_id }}-nsys-jax-token steps: - name: Check out the repository uses: actions/checkout@v4 @@ -488,27 +488,32 @@ jobs: with: job-config-file: .github/eks-workflow-files/job.yml job-name: ${{ env.JOB_NAME }} - - name: Delete eks job uses: ./.github/actions/delete-k8s-job - if: ( cancelled() || always() ) + if: always() with: job-name: ${{ env.JOB_NAME }} - - - - name: Postprocess retrieve test + - name: Configure post-processing job run: | export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip" - mkdir -p /tmp/axlearn-output - aws s3 cp --recursive --exclude "*" --include "${JOB_OUTPUT_PATTERN}" s3://jax-toolbox-eks-output/ /tmp/axlearn-output - - - name: Combine with nsys-jax-combine - run: | - cd /tmp/axlearn-output - nsys-jax-combine -o combined.zip ./*.zip --analysis communication + yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME) + | .spec.template.spec.containers[].image = strenv(JAX_DOCKER_IMAGE) + | .spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) + | .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \ + .github/eks-workflow-files/post-process-job.yml + git diff .github/eks-workflow-files/post-process-job.yml + - name: Submit post process k8s job + uses: ./.github/actions/submit-k8s/job + with: + job-config-file: 
.github/eks-workflow-files/post-process-job.yml + job-name: ${{ env.POSTPROCESS_JOB_NAME }} + - name: Delete post process k8s job + uses: ./.github/actions/delete-k8s-job + with: + job-name: ${{ env.POSTPROCESS_JOB_NAME }} - name: Delete GitHub Container Registry token uses: ./.github/actions/delete-ghcr-token - if: ( cancelled() || always() ) + if: always() with: token-name: ${{ env.TOKEN_NAME }} @@ -732,13 +737,13 @@ jobs: - name: Delete axlearn test job uses: ./.github/actions/delete-k8s-job - if: ( cancelled() || always() ) + if: always() with: job-name: ${{ env.JOB_NAME }} - name: Delete GitHub Container Registry token uses: ./.github/actions/delete-ghcr-token - if: ( cancelled() || always() ) + if: always() with: token-name: ${{ env.TOKEN_NAME }} diff --git a/.github/workflows/_test_nccl.yaml b/.github/workflows/_test_nccl.yaml deleted file mode 100644 index 54da0886e..000000000 --- a/.github/workflows/_test_nccl.yaml +++ /dev/null @@ -1,131 +0,0 @@ -name: ~run NCCL tests - -on: - workflow_call: - inputs: - # Note that cuda-dl-base installs the NCCL tests, while the vanilla nvidia/cuda - # images do not; when JAX-Toolbox moves to using cuda-dl-base this workflow ought - # to be modified to test one of the JAX-Toolbox containers. - CONTAINER: - type: string - description: CUDA image to use as base, e.g. 
nvcr.io/nvidia/cuda-dl-base:24.11-cuda12.6-devel-ubuntu24.04 - required: true - -permissions: - actions: write # to cancel previous workflows - contents: read # to fetch code - packages: write # to upload container - -jobs: - build-mpi-operator-compatible-base: - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: amd64 - ARTIFACT_NAME: artifact-mpi-operator-compatible-base-build - BADGE_FILENAME: badge-mpi-operator-compatible-base-build - BUILD_DATE: 0000-00-00 # not important; this image is never published - BASE_IMAGE: ${{ inputs.CONTAINER }} - CONTAINER_NAME: mpi-operator-compatible-base - DOCKERFILE: .github/container/Dockerfile.mpi-operator-compatible-base - RUNNER_SIZE: small - secrets: inherit - nccl-test: - needs: build-mpi-operator-compatible-base - strategy: - matrix: - test: [all_gather_perf_mpi, all_reduce_perf_mpi, broadcast_perf_mpi, reduce_scatter_perf_mpi] - runs-on: eks - env: - BASE_IMAGE: ${{ needs.build-mpi-operator-compatible-base.outputs.DOCKER_TAG_FINAL }} - TEST_NAME: ${{ matrix.test }} - steps: - - name: Check out the repository - uses: actions/checkout@v4 - - name: Login to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} - - name: Store GitHub Container Registry token as Kubernetes secret - run: | - # Replace underscores in TEST_NAME with - to make a valid Kubernetes name - JOB_NAME="nccl-test-${{ github.run_id }}-${{ github.run_attempt }}-${TEST_NAME//_/-}" - LAUNCHER_NAME="${JOB_NAME}-launcher" - TOKEN_NAME="${JOB_NAME}-token" - # Make these available to later steps - echo "JOB_NAME=${JOB_NAME}" >> "$GITHUB_ENV" - echo "LAUNCHER_NAME=${LAUNCHER_NAME}" >> "$GITHUB_ENV" - echo "TOKEN_NAME=${TOKEN_NAME}" >> "$GITHUB_ENV" - kubectl create secret generic \ - ${TOKEN_NAME} \ - --from-file=.dockerconfigjson=$HOME/.docker/config.json \ - --type=kubernetes.io/dockerconfigjson - - name: Configure Kubernetes job - run: 
| - export WORKER_NAME="${JOB_NAME}-worker" - yq -i '.metadata.name = strenv(JOB_NAME) - | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].image = strenv(BASE_IMAGE) - | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].name = strenv(LAUNCHER_NAME) - | .spec.mpiReplicaSpecs.Launcher.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) - | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].command[3] = strenv(TEST_NAME) - | .spec.mpiReplicaSpecs.Worker.template.spec.containers[].image = strenv(BASE_IMAGE) - | .spec.mpiReplicaSpecs.Worker.template.spec.containers[].name = strenv(WORKER_NAME) - | .spec.mpiReplicaSpecs.Worker.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)' \ - .github/eks-workflow-files/mpi-nccl-test.yml - git diff .github/eks-workflow-files/mpi-nccl-test.yml - - name: Submit Kubernetes job - run: kubectl apply -f .github/eks-workflow-files/mpi-nccl-test.yml - - name: Wait for Kubernetes job to start - # Note that this is *not* using JOB_NAME - run: | - # Launcher job is created eagerly, but suspended. Kueue un-suspends it when - # resources are available, but that is where there can be a long wait if the - # cluster is busy executing other jobs. 
- kubectl wait --for=create job/${LAUNCHER_NAME} - kubectl wait --for=jsonpath='{.spec.suspend}=false' job/${LAUNCHER_NAME} --timeout=3600s - - name: Stream Kubernetes job output - # Note that this is *not* JOB_NAME - run: | - # Streaming logs will fail if the container/pod is still pending - while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${LAUNCHER_NAME} --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do - sleep 1 - done - # TODO: --all-containers=true --all-pods=true could make sense here, but it - # prefixes lines with a rather verbose tag - kubectl logs --follow job/${LAUNCHER_NAME} - - name: Retrieve Kubernetes job status - shell: bash -exo pipefail {0} - run: | - while readarray -d : -t status < <(kubectl get job/${LAUNCHER_NAME} -o 'jsonpath={.status.failed}:{.status.succeeded}'); do - failure=${status[0]:-0} - success=${status[1]:-0} - total=$((failure+success)) - if [[ ${total} < 1 ]]; then - sleep 1 - elif [[ ${total} == 1 ]]; then - break - else - # Shouldn't happen, maybe a sign the job being monitored does not have a - # single launcher pod? - exit 255 - fi - done - exit ${failure} - # Provide more debug output in case of failure; note that some kinds of launch - # failure do not produce any log output. 
- - name: Debug failed Kubernetes job - if: failure() - run: | - # Provide better debug in case of launch failures that will not produce log output - pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${LAUNCHER_NAME} -o name) - if [[ -n "${pods}" ]]; then - kubectl describe ${pods} - fi - # Clean up in case of errors as well as success - - name: Delete Kubernetes job - if: always() - run: kubectl delete -f .github/eks-workflow-files/mpi-nccl-test.yml - - name: Delete GitHub Container Registry token - if: always() - run: kubectl delete secret ${TOKEN_NAME} diff --git a/.github/workflows/nccl-k8s.yaml b/.github/workflows/nccl-k8s.yaml index d51c12382..65dcc660c 100644 --- a/.github/workflows/nccl-k8s.yaml +++ b/.github/workflows/nccl-k8s.yaml @@ -31,8 +31,107 @@ permissions: packages: write # to upload container jobs: - nccl-tests: - uses: ./.github/workflows/_test_nccl.yaml + build-mpi-operator-compatible-base: + uses: ./.github/workflows/_build.yaml with: - CONTAINER: ${{ inputs.CONTAINER || 'nvcr.io/nvidia/cuda-dl-base:24.12-cuda12.6-devel-ubuntu24.04' }} + ARCHITECTURE: amd64 + ARTIFACT_NAME: artifact-mpi-operator-compatible-base-build + BADGE_FILENAME: badge-mpi-operator-compatible-base-build + BUILD_DATE: 0000-00-00 # Not important; this image is never published + BASE_IMAGE: ${{ inputs.CONTAINER || 'nvcr.io/nvidia/cuda-dl-base:24.12-cuda12.6-devel-ubuntu24.04' }} + CONTAINER_NAME: mpi-operator-compatible-base + DOCKERFILE: .github/container/Dockerfile.mpi-operator-compatible-base + RUNNER_SIZE: small secrets: inherit + + nccl-tests: + needs: build-mpi-operator-compatible-base + runs-on: eks + strategy: + matrix: + test: [all_gather_perf_mpi, all_reduce_perf_mpi, broadcast_perf_mpi, reduce_scatter_perf_mpi] + env: + BASE_IMAGE: ${{ needs.build-mpi-operator-compatible-base.outputs.DOCKER_TAG_FINAL }} + TEST_NAME: ${{ matrix.test }} + JOB_NAME: "nccl-test-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.test }}" + LAUNCHER_NAME: "${{ 
env.JOB_NAME }}-launcher" + TOKEN_NAME: "${{ env.JOB_NAME }}-token" + + + steps: + - name: Check out the repository + uses: actions/checkout@v4 + + - name: GHCR login and store K8s secret + uses: ./.github/actions/ghcr-login + with: + docker-username: ${{ github.repository_owner }} + docker-password: ${{ secrets.GITHUB_TOKEN }} + token-name: ${{ env.TOKEN_NAME }} + - name: Configure Kubernetes job + shell: bash + run: | + export JOB_NAME="${{ env.JOB_NAME }}" + export LAUNCHER_NAME="${{ env.LAUNCHER_NAME }}" + export TOKEN_NAME="${{ env.TOKEN_NAME }}" + export TEST_NAME="${{ env.TEST_NAME }}" + export WORKER_NAME="${JOB_NAME}-worker" + + # Use yq to set our fields in-place + yq -i '.metadata.name = strenv(JOB_NAME) + | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].image = strenv(BASE_IMAGE) + | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].name = strenv(LAUNCHER_NAME) + | .spec.mpiReplicaSpecs.Launcher.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) + | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].command[3] = strenv(TEST_NAME) + | .spec.mpiReplicaSpecs.Worker.template.spec.containers[].image = strenv(BASE_IMAGE) + | .spec.mpiReplicaSpecs.Worker.template.spec.containers[].name = strenv(WORKER_NAME) + | .spec.mpiReplicaSpecs.Worker.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)' \ + .github/eks-workflow-files/mpi-nccl-test.yml + + # (Optional) Show diff for debugging + git diff .github/eks-workflow-files/mpi-nccl-test.yml + + - name: Submit & stream K8s job + uses: ./.github/actions/submit-k8s-job + with: + job-config-file: .github/eks-workflow-files/mpi-nccl-test.yml + job-name: ${{ env.LAUNCHER_NAME }} + - name: Retrieve Kubernetes job status + shell: bash -exo pipefail + run: | + LAUNCHER_NAME="${{ env.LAUNCHER_NAME }}" + while readarray -d : -t status < <(kubectl get job/${LAUNCHER_NAME} -o 'jsonpath={.status.failed}:{.status.succeeded}'); do + failure=${status[0]:-0} + success=${status[1]:-0} + 
total=$((failure+success)) + if [[ ${total} < 1 ]]; then + sleep 1 + elif [[ ${total} == 1 ]]; then + break + else + # If total > 1, that suggests a mismatch that can occur if there's more than one launcher pod + exit 255 + fi + done + exit ${failure} + - name: Debug failed Kubernetes job + if: failure() + shell: bash + run: | + LAUNCHER_NAME="${{ env.LAUNCHER_NAME }}" + pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${LAUNCHER_NAME} -o name) + if [[ -n "${pods}" ]]; then + kubectl describe ${pods} + fi + - name: Delete Kubernetes job + if: always() + uses: ./.github/actions/delete-k8s-job + with: + job-name: ${{ env.LAUNCHER_NAME }} + + job-name: ${{ env.POSTPROCESS_JOB_NAME }} + - name: Delete GitHub Container Registry token + uses: ./.github/actions/delete-ghcr-token + if: always() + with: + token-name: ${{ env.TOKEN_NAME }} \ No newline at end of file From 9516183231a94a2272e366e5bbad806a0d4bbce9 Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 17 Feb 2025 16:50:34 +0000 Subject: [PATCH 34/89] fix errors --- .github/actions/submit-k8s-job/action.yml | 2 +- .github/workflows/nccl-k8s.yaml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/actions/submit-k8s-job/action.yml b/.github/actions/submit-k8s-job/action.yml index aa73cf2e2..49ddad748 100644 --- a/.github/actions/submit-k8s-job/action.yml +++ b/.github/actions/submit-k8s-job/action.yml @@ -21,7 +21,7 @@ runs: shell: bash run: | # wait for the job to be created - kubectl wait --for=create job/${{ inputs.job-name }} --timeout=60 + kubectl wait --for=create job/${{ inputs.job-name }} --timeout=60s # wait for the 'spec.suspend' field to become false. 
Necessary for kueue kubectl wait --for=jsonpath='{.spec.suspend}=false' job/${{ inputs.job-name }} --timeout=3600s diff --git a/.github/workflows/nccl-k8s.yaml b/.github/workflows/nccl-k8s.yaml index 65dcc660c..805aba9e8 100644 --- a/.github/workflows/nccl-k8s.yaml +++ b/.github/workflows/nccl-k8s.yaml @@ -54,8 +54,8 @@ jobs: BASE_IMAGE: ${{ needs.build-mpi-operator-compatible-base.outputs.DOCKER_TAG_FINAL }} TEST_NAME: ${{ matrix.test }} JOB_NAME: "nccl-test-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.test }}" - LAUNCHER_NAME: "${{ env.JOB_NAME }}-launcher" - TOKEN_NAME: "${{ env.JOB_NAME }}-token" + LAUNCHER_NAME: "nccl-test-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.test }}-launcher" + TOKEN_NAME: "nccl-test-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.test }}-token" steps: From cbee8bbc6909b1598aee4ab8bb5404a8f888ce52 Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 17 Feb 2025 17:19:01 +0000 Subject: [PATCH 35/89] fix typo --- .github/workflows/_ci.yaml | 2 +- .github/workflows/nccl-k8s.yaml | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index cfe15607c..74a840cb0 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -503,7 +503,7 @@ jobs: .github/eks-workflow-files/post-process-job.yml git diff .github/eks-workflow-files/post-process-job.yml - name: Submit post process k8s job - uses: ./.github/actions/submit-k8s/job + uses: ./.github/actions/submit-k8s-job with: job-config-file: .github/eks-workflow-files/post-process-job.yml job-name: ${{ env.POSTPROCESS_JOB_NAME }} diff --git a/.github/workflows/nccl-k8s.yaml b/.github/workflows/nccl-k8s.yaml index 805aba9e8..176217234 100644 --- a/.github/workflows/nccl-k8s.yaml +++ b/.github/workflows/nccl-k8s.yaml @@ -128,8 +128,6 @@ jobs: uses: ./.github/actions/delete-k8s-job with: job-name: ${{ env.LAUNCHER_NAME }} - - job-name: ${{ env.POSTPROCESS_JOB_NAME }} - name: Delete 
GitHub Container Registry token uses: ./.github/actions/delete-ghcr-token if: always() From 8aed044a4eb29a7fe70484b591672371bef8b89c Mon Sep 17 00:00:00 2001 From: Steboss Date: Tue, 18 Feb 2025 10:29:41 +0000 Subject: [PATCH 36/89] make a test with 5 files --- .github/container/test-axlearn.sh | 2 +- .../axlearn/axlearn-job.yml | 5 ++-- .github/workflows/nccl-k8s.yaml | 29 +++++++++++-------- 3 files changed, 21 insertions(+), 15 deletions(-) diff --git a/.github/container/test-axlearn.sh b/.github/container/test-axlearn.sh index 088c955df..5d256706e 100755 --- a/.github/container/test-axlearn.sh +++ b/.github/container/test-axlearn.sh @@ -175,7 +175,7 @@ passed=0 SUMMARY_FILE="${OUTPUT_DIRECTORY}/summary.txt" -for test_file in "${final_test_files[@]}"; do +for test_file in "${final_test_files[@]:0:10}"; do echo "Running: ${test_file}" log_file_name=$(echo "${test_file%.py}" | sed 's/\//__/g').log log_file="${LOG_DIRECTORY}/${log_file_name}" diff --git a/.github/eks-workflow-files/axlearn/axlearn-job.yml b/.github/eks-workflow-files/axlearn/axlearn-job.yml index 56183f35a..7e7fe0f15 100644 --- a/.github/eks-workflow-files/axlearn/axlearn-job.yml +++ b/.github/eks-workflow-files/axlearn/axlearn-job.yml @@ -15,7 +15,7 @@ spec: image: PLACEHOLDER command: - bash - - -exo + - -xo - pipefail - -c - | @@ -29,7 +29,8 @@ spec: --k8s # Wait a moment to ensure logs are flushed - sync + sync + resources: limits: nvidia.com/gpu: 8 diff --git a/.github/workflows/nccl-k8s.yaml b/.github/workflows/nccl-k8s.yaml index 176217234..816979355 100644 --- a/.github/workflows/nccl-k8s.yaml +++ b/.github/workflows/nccl-k8s.yaml @@ -53,27 +53,32 @@ jobs: env: BASE_IMAGE: ${{ needs.build-mpi-operator-compatible-base.outputs.DOCKER_TAG_FINAL }} TEST_NAME: ${{ matrix.test }} - JOB_NAME: "nccl-test-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.test }}" - LAUNCHER_NAME: "nccl-test-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.test }}-launcher" - TOKEN_NAME: 
"nccl-test-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.test }}-token" steps: - name: Check out the repository uses: actions/checkout@v4 + - name: Modify variables + id: var + shell: bash + run: | + echo "JOB_NAME=${{ env.JOB_NAME}//_/-}" >> $GITHUB_OUTPUT + echo "LAUNCHER_NAME=nccl-test-${{ env.JOB_NAME}//_/-}-launcher" >> $GITHUB_OUTPUT + echo "TOKEN_NAME=nccl-test-${{ env.JOB_NAME}//_/-}-token" >> $GITHUB_OUTPUT + - name: GHCR login and store K8s secret uses: ./.github/actions/ghcr-login with: docker-username: ${{ github.repository_owner }} docker-password: ${{ secrets.GITHUB_TOKEN }} - token-name: ${{ env.TOKEN_NAME }} + token-name: ${{ steps.var.TOKEN_NAME }} - name: Configure Kubernetes job shell: bash run: | - export JOB_NAME="${{ env.JOB_NAME }}" - export LAUNCHER_NAME="${{ env.LAUNCHER_NAME }}" - export TOKEN_NAME="${{ env.TOKEN_NAME }}" + export JOB_NAME="${{ steps.var.JOB_NAME }}" + export LAUNCHER_NAME="${{ steps.var.LAUNCHER_NAME }}" + export TOKEN_NAME="${{ steps.var.TOKEN_NAME }}" export TEST_NAME="${{ env.TEST_NAME }}" export WORKER_NAME="${JOB_NAME}-worker" @@ -95,11 +100,11 @@ jobs: uses: ./.github/actions/submit-k8s-job with: job-config-file: .github/eks-workflow-files/mpi-nccl-test.yml - job-name: ${{ env.LAUNCHER_NAME }} + job-name: ${{ steps.var.LAUNCHER_NAME }} - name: Retrieve Kubernetes job status shell: bash -exo pipefail run: | - LAUNCHER_NAME="${{ env.LAUNCHER_NAME }}" + LAUNCHER_NAME="${{ steps.var.LAUNCHER_NAME }}" while readarray -d : -t status < <(kubectl get job/${LAUNCHER_NAME} -o 'jsonpath={.status.failed}:{.status.succeeded}'); do failure=${status[0]:-0} success=${status[1]:-0} @@ -118,7 +123,7 @@ jobs: if: failure() shell: bash run: | - LAUNCHER_NAME="${{ env.LAUNCHER_NAME }}" + LAUNCHER_NAME="${{ steps.var.LAUNCHER_NAME }}" pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${LAUNCHER_NAME} -o name) if [[ -n "${pods}" ]]; then kubectl describe ${pods} @@ -127,9 +132,9 @@ jobs: if: always() uses: 
./.github/actions/delete-k8s-job with: - job-name: ${{ env.LAUNCHER_NAME }} + job-name: ${{ steps.var.LAUNCHER_NAME }} - name: Delete GitHub Container Registry token uses: ./.github/actions/delete-ghcr-token if: always() with: - token-name: ${{ env.TOKEN_NAME }} \ No newline at end of file + token-name: ${{ steps.var.TOKEN_NAME }} \ No newline at end of file From 91a2bf7a2d38ee3536830a5f9ba13f7a062606c1 Mon Sep 17 00:00:00 2001 From: Steboss Date: Tue, 18 Feb 2025 10:32:28 +0000 Subject: [PATCH 37/89] fix conflicts --- .github/workflows/_ci.yaml | 610 ++++++++++++++++++------------------- 1 file changed, 305 insertions(+), 305 deletions(-) diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 74a840cb0..1164e2a3d 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -66,142 +66,142 @@ jobs: URLREF_TRANSFORMER_ENGINE=${{ fromJson(inputs.SOURCE_URLREFS).TRANSFORMER_ENGINE }} secrets: inherit - # build-triton: - # needs: build-jax - # if: inputs.ARCHITECTURE == 'amd64' # Triton does not seem to support arm64 - # uses: ./.github/workflows/_build.yaml - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # ARTIFACT_NAME: artifact-triton-build - # BADGE_FILENAME: badge-triton-build - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - # CONTAINER_NAME: triton - # DOCKERFILE: .github/container/Dockerfile.triton - # RUNNER_SIZE: large - # EXTRA_BUILD_ARGS: | - # URLREF_JAX_TRITON=${{ fromJson(inputs.SOURCE_URLREFS).JAX_TRITON }} - # secrets: inherit + build-triton: + needs: build-jax + if: inputs.ARCHITECTURE == 'amd64' # Triton does not seem to support arm64 + uses: ./.github/workflows/_build.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: artifact-triton-build + BADGE_FILENAME: badge-triton-build + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + CONTAINER_NAME: triton + DOCKERFILE: 
.github/container/Dockerfile.triton + RUNNER_SIZE: large + EXTRA_BUILD_ARGS: | + URLREF_JAX_TRITON=${{ fromJson(inputs.SOURCE_URLREFS).JAX_TRITON }} + secrets: inherit - # build-equinox: - # needs: build-jax - # uses: ./.github/workflows/_build.yaml - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # ARTIFACT_NAME: artifact-equinox-build - # BADGE_FILENAME: badge-equinox-build - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - # CONTAINER_NAME: equinox - # DOCKERFILE: .github/container/Dockerfile.equinox - # EXTRA_BUILD_ARGS: | - # URLREF_EQUINOX=${{ fromJson(inputs.SOURCE_URLREFS).EQUINOX }} - # secrets: inherit + build-equinox: + needs: build-jax + uses: ./.github/workflows/_build.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: artifact-equinox-build + BADGE_FILENAME: badge-equinox-build + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + CONTAINER_NAME: equinox + DOCKERFILE: .github/container/Dockerfile.equinox + EXTRA_BUILD_ARGS: | + URLREF_EQUINOX=${{ fromJson(inputs.SOURCE_URLREFS).EQUINOX }} + secrets: inherit - # build-maxtext: - # needs: build-jax - # uses: ./.github/workflows/_build.yaml - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # ARTIFACT_NAME: artifact-maxtext-build - # BADGE_FILENAME: badge-maxtext-build - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - # CONTAINER_NAME: maxtext - # DOCKERFILE: .github/container/Dockerfile.maxtext - # EXTRA_BUILD_ARGS: | - # URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }} - # secrets: inherit + build-maxtext: + needs: build-jax + uses: ./.github/workflows/_build.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: artifact-maxtext-build + BADGE_FILENAME: badge-maxtext-build + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT 
}} + CONTAINER_NAME: maxtext + DOCKERFILE: .github/container/Dockerfile.maxtext + EXTRA_BUILD_ARGS: | + URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }} + secrets: inherit - # build-levanter: - # needs: [build-jax] - # uses: ./.github/workflows/_build.yaml - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # ARTIFACT_NAME: "artifact-levanter-build" - # BADGE_FILENAME: "badge-levanter-build" - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - # CONTAINER_NAME: levanter - # DOCKERFILE: .github/container/Dockerfile.levanter - # EXTRA_BUILD_ARGS: | - # URLREF_LEVANTER=${{ fromJson(inputs.SOURCE_URLREFS).LEVANTER }} - # URLREF_HALIAX=${{ fromJson(inputs.SOURCE_URLREFS).HALIAX }} - # secrets: inherit + build-levanter: + needs: [build-jax] + uses: ./.github/workflows/_build.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: "artifact-levanter-build" + BADGE_FILENAME: "badge-levanter-build" + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + CONTAINER_NAME: levanter + DOCKERFILE: .github/container/Dockerfile.levanter + EXTRA_BUILD_ARGS: | + URLREF_LEVANTER=${{ fromJson(inputs.SOURCE_URLREFS).LEVANTER }} + URLREF_HALIAX=${{ fromJson(inputs.SOURCE_URLREFS).HALIAX }} + secrets: inherit - # build-upstream-t5x: - # needs: build-jax - # uses: ./.github/workflows/_build.yaml - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # ARTIFACT_NAME: "artifact-t5x-build" - # BADGE_FILENAME: "badge-t5x-build" - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - # CONTAINER_NAME: upstream-t5x - # DOCKERFILE: .github/container/Dockerfile.t5x - # EXTRA_BUILD_ARGS: | - # URLREF_T5X=${{ fromJson(inputs.SOURCE_URLREFS).T5X }} - # URLREF_AIRIO=${{ fromJson(inputs.SOURCE_URLREFS).AIRIO }} - # secrets: inherit + build-upstream-t5x: + needs: build-jax + uses: ./.github/workflows/_build.yaml 
+ with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: "artifact-t5x-build" + BADGE_FILENAME: "badge-t5x-build" + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + CONTAINER_NAME: upstream-t5x + DOCKERFILE: .github/container/Dockerfile.t5x + EXTRA_BUILD_ARGS: | + URLREF_T5X=${{ fromJson(inputs.SOURCE_URLREFS).T5X }} + URLREF_AIRIO=${{ fromJson(inputs.SOURCE_URLREFS).AIRIO }} + secrets: inherit - # build-upstream-pax: - # needs: build-jax - # uses: ./.github/workflows/_build.yaml - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # ARTIFACT_NAME: artifact-pax-build - # BADGE_FILENAME: badge-pax-build - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - # CONTAINER_NAME: upstream-pax - # DOCKERFILE: .github/container/Dockerfile.pax - # EXTRA_BUILD_ARGS: | - # URLREF_PAXML=${{ fromJson(inputs.SOURCE_URLREFS).PAXML }} - # URLREF_PRAXIS=${{ fromJson(inputs.SOURCE_URLREFS).PRAXIS }} - # URLREF_LINGVO=${{ fromJson(inputs.SOURCE_URLREFS).LINGVO }} - # secrets: inherit + build-upstream-pax: + needs: build-jax + uses: ./.github/workflows/_build.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: artifact-pax-build + BADGE_FILENAME: badge-pax-build + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + CONTAINER_NAME: upstream-pax + DOCKERFILE: .github/container/Dockerfile.pax + EXTRA_BUILD_ARGS: | + URLREF_PAXML=${{ fromJson(inputs.SOURCE_URLREFS).PAXML }} + URLREF_PRAXIS=${{ fromJson(inputs.SOURCE_URLREFS).PRAXIS }} + URLREF_LINGVO=${{ fromJson(inputs.SOURCE_URLREFS).LINGVO }} + secrets: inherit - # build-rosetta-t5x: - # needs: build-upstream-t5x - # uses: ./.github/workflows/_build_rosetta.yaml - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }} - # 
BASE_LIBRARY: t5x - # secrets: inherit + build-rosetta-t5x: + needs: build-upstream-t5x + uses: ./.github/workflows/_build_rosetta.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }} + BASE_LIBRARY: t5x + secrets: inherit - # build-rosetta-pax: - # needs: build-upstream-pax - # uses: ./.github/workflows/_build_rosetta.yaml - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_MEALKIT }} - # BASE_LIBRARY: pax - # secrets: inherit + build-rosetta-pax: + needs: build-upstream-pax + uses: ./.github/workflows/_build_rosetta.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_MEALKIT }} + BASE_LIBRARY: pax + secrets: inherit - # build-gemma: - # needs: build-jax - # uses: ./.github/workflows/_build.yaml - # if: inputs.ARCHITECTURE == 'amd64' # build only amd64 - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # ARTIFACT_NAME: artifact-gemma-build - # BADGE_FILENAME: badge-gemma-build - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - # CONTAINER_NAME: gemma - # DOCKERFILE: rosetta/Dockerfile.gemma - # DOCKER_CONTEXT: . 
- # EXTRA_BUILD_ARGS: | - # URLREF_GEMMA=${{ fromJson(inputs.SOURCE_URLREFS).GEMMA }} - # URLREF_BIG_VISION=${{ fromJson(inputs.SOURCE_URLREFS).BIG_VISION }} - # URLREF_COMMON_LOOP_UTILS=${{ fromJson(inputs.SOURCE_URLREFS).COMMON_LOOP_UTILS }} - # URLREF_FLAXFORMER=${{ fromJson(inputs.SOURCE_URLREFS).FLAXFORMER }} - # URLREF_PANOPTICAPI=${{ fromJson(inputs.SOURCE_URLREFS).PANOPTICAPI }} - # secrets: inherit + build-gemma: + needs: build-jax + uses: ./.github/workflows/_build.yaml + if: inputs.ARCHITECTURE == 'amd64' # build only amd64 + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: artifact-gemma-build + BADGE_FILENAME: badge-gemma-build + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + CONTAINER_NAME: gemma + DOCKERFILE: rosetta/Dockerfile.gemma + DOCKER_CONTEXT: . + EXTRA_BUILD_ARGS: | + URLREF_GEMMA=${{ fromJson(inputs.SOURCE_URLREFS).GEMMA }} + URLREF_BIG_VISION=${{ fromJson(inputs.SOURCE_URLREFS).BIG_VISION }} + URLREF_COMMON_LOOP_UTILS=${{ fromJson(inputs.SOURCE_URLREFS).COMMON_LOOP_UTILS }} + URLREF_FLAXFORMER=${{ fromJson(inputs.SOURCE_URLREFS).FLAXFORMER }} + URLREF_PANOPTICAPI=${{ fromJson(inputs.SOURCE_URLREFS).PANOPTICAPI }} + secrets: inherit build-axlearn: needs: build-jax @@ -222,15 +222,15 @@ jobs: needs: - build-base - build-jax - # - build-triton - # - build-equinox - # - build-maxtext - # - build-levanter - # - build-upstream-t5x - # - build-upstream-pax - # - build-rosetta-t5x - # - build-rosetta-pax - # - build-gemma + - build-triton + - build-equinox + - build-maxtext + - build-levanter + - build-upstream-t5x + - build-upstream-pax + - build-rosetta-t5x + - build-rosetta-pax + - build-gemma - build-axlearn outputs: TAGS: ${{ steps.collect-tags.outputs.TAGS }} @@ -242,26 +242,26 @@ jobs: [\ {"flavor": "base", "stage": "final", "priority": 800, "tag": "${{ needs.build-base.outputs.DOCKER_TAG }}"},\ {"flavor": "jax", "stage": "final", "priority": 1000, "tag": "${{ 
needs.build-jax.outputs.DOCKER_TAG_FINAL }}"},\ - # {"flavor": "triton", "stage": "final", "priority": 900, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }}"},\ - # {"flavor": "equinox", "stage": "final", "priority": 900, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }}"},\ - # {"flavor": "maxtext", "stage": "final", "priority": 900, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}"},\ - # {"flavor": "levanter", "stage": "final", "priority": 900, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }}"},\ - # {"flavor": "upstream-t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }}"},\ - # {"flavor": "upstream-pax", "stage": "final", "priority": 900, "tag": "${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }}"},\ - # {"flavor": "t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }}"},\ - # {"flavor": "pax", "stage": "final", "priority": 900, "tag": "${{ needs.build-rosetta-pax.outputs.DOCKER_TAG_FINAL }}"},\ - # {"flavor": "gemma", "stage": "final", "priority": 900, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "triton", "stage": "final", "priority": 900, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "equinox", "stage": "final", "priority": 900, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "maxtext", "stage": "final", "priority": 900, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "levanter", "stage": "final", "priority": 900, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "upstream-t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "upstream-pax", "stage": "final", "priority": 900, "tag": "${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "t5x", "stage": "final", "priority": 
900, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "pax", "stage": "final", "priority": 900, "tag": "${{ needs.build-rosetta-pax.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "gemma", "stage": "final", "priority": 900, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "axlearn", "stage": "final", "priority": 900, "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "jax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}"},\ - # {"flavor": "triton", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_MEALKIT }}"},\ - # {"flavor": "equinox", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_MEALKIT }}"},\ - # {"flavor": "maxtext", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_MEALKIT }}"},\ - # {"flavor": "levanter", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_MEALKIT }}"},\ - # {"flavor": "upstream-t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ - # {"flavor": "upstream-pax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-upstream-pax.outputs.DOCKER_TAG_MEALKIT }}"},\ - # {"flavor": "t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ - # {"flavor": "pax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-pax.outputs.DOCKER_TAG_MEALKIT }}"},\ - # {"flavor": "gemma", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "triton", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "equinox", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": 
"maxtext", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "levanter", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "upstream-t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "upstream-pax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-upstream-pax.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "pax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-pax.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "gemma", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_MEALKIT }}"},\ {"flavor": "axlearn", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_MEALKIT }}"},\ {}\ @@ -271,27 +271,27 @@ jobs: echo "TAGS=${TAGS}" >> $GITHUB_OUTPUT - # test-distribution: - # runs-on: ubuntu-22.04 - # strategy: - # matrix: - # TEST_SCRIPT: - # - extra-only-distribution.sh - # - mirror-only-distribution.sh - # - upstream-only-distribution.sh - # - local-patch-distribution.sh - # fail-fast: false - # steps: - # - name: Print environment variables - # run: env - # - name: Set git login for tests - # run: | - # git config --global user.email "jax@nvidia.com" - # git config --global user.name "JAX-Toolbox CI" - # - name: Check out the repository under ${GITHUB_WORKSPACE} - # uses: actions/checkout@v4 - # - name: Run integration test ${{ matrix.TEST_SCRIPT }} - # run: bash rosetta/tests/${{ matrix.TEST_SCRIPT }} + test-distribution: + runs-on: ubuntu-22.04 + strategy: + matrix: + TEST_SCRIPT: + - extra-only-distribution.sh + - mirror-only-distribution.sh + - upstream-only-distribution.sh + - local-patch-distribution.sh + fail-fast: false + steps: + 
- name: Print environment variables + run: env + - name: Set git login for tests + run: | + git config --global user.email "jax@nvidia.com" + git config --global user.name "JAX-Toolbox CI" + - name: Check out the repository under ${GITHUB_WORKSPACE} + uses: actions/checkout@v4 + - name: Run integration test ${{ matrix.TEST_SCRIPT }} + run: bash rosetta/tests/${{ matrix.TEST_SCRIPT }} test-jax: needs: build-jax @@ -325,136 +325,136 @@ jobs: test-gpu.log secrets: inherit - # test-nsys-jax: - # needs: build-jax - # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # uses: ./.github/workflows/_test_unit.yaml - # with: - # TEST_NAME: nsys-jax - # EXECUTE: | - # set -o pipefail - # num_tests=0 - # num_failures=0 - # # Run the pytest-driven tests; failure is explicitly handled below so set +e to - # # avoid an early abort here. - # set +e - # docker run -i --shm-size=1g --gpus all \ - # -v $PWD:/opt/output \ - # ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - # bash <<"EOF" |& tee test-nsys-jax.log - # # nsys-jax is already installed, this is just adding the test dependencies - # pip install pytest-reportlog nsys-jax[test] - # # abuse knowledge that nsys-jax is installed editable, so the tests exist - # test_path=$(python -c 'import importlib.resources; print(importlib.resources.files("nsys_jax").joinpath("..", "tests").resolve())') - # pytest --report-log=/opt/output/pytest-report.jsonl "${test_path}" - # EOF - # set -e - # GPUS_PER_NODE=$(nvidia-smi -L | grep -c '^GPU') - # for mode in 1-process 2-process process-per-gpu; do - # DOCKER="docker run --shm-size=1g --gpus all --env XLA_FLAGS=--xla_gpu_enable_command_buffer= --env XLA_PYTHON_CLIENT_MEM_FRACTION=0.9 -v ${PWD}:/opt/output ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}" - # if [[ "${mode}" == "1-process" ]]; then - # PROCESS_COUNT=1 - # ARGS="" - # elif [[ "${mode}" == "2-process" ]]; then - # # Use two processes with GPUS_PER_NODE/2 GPUs per process in the hope that - # # this will flush out more 
bugs than process-per-node or process-per-GPU. - # PROCESS_COUNT=2 - # ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process $((GPUS_PER_NODE/2)) --distributed" - # else - # PROCESS_COUNT=${GPUS_PER_NODE} - # ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process 1 --distributed" - # fi - # for collection in full partial; do - # NSYS_JAX="nsys-jax" - # if [[ "${mode}" == "1-process" ]]; then - # # We will not run nsys-jax-combine, so run analyses eagerly - # NSYS_JAX+=" --nsys-jax-analysis communication --nsys-jax-analysis summary" - # fi - # NSYS_JAX+=" --output=/opt/output/${mode}-${collection}-execution-%q{RANK}" - # if [[ "${collection}" == "partial" ]]; then - # NSYS_JAX+=" --capture-range=cudaProfilerApi --capture-range-end=stop" - # # nvbug/4801401 - # NSYS_JAX+=" --sample=none" - # fi - # set +e - # ${DOCKER} parallel-launch RANK ${PROCESS_COUNT} ${NSYS_JAX} \ - # -- jax-nccl-test ${ARGS} |& tee ${mode}-${collection}-execution.log - # num_failures=$((num_failures + ($? 
!= 0))) - # set -e - # num_tests=$((num_tests + 1)) - # done - # if [[ "${mode}" != "1-process" ]]; then - # # Run nsys-jax-combine - # NSYS_JAX_COMBINE="nsys-jax-combine --analysis communication --analysis summary --output=/opt/output/${mode}-${collection}-execution.zip" - # for (( i=0; i> $GITHUB_ENV - # echo "NSYS_JAX_FAIL_COUNT=${num_failures}" >> $GITHUB_ENV - # exit $num_failures - # STATISTICS_SCRIPT: | - # summary_line=$(tail -n1 test-nsys-jax.log) - # num_errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - # passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) - # failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) - # total_tests=$(( NSYS_JAX_TEST_COUNT + passed_tests + failed_tests )) - # num_passed=$(( passed_tests + NSYS_JAX_TEST_COUNT - NSYS_JAX_FAIL_COUNT )) - # num_failed=$(( failed_tests + NSYS_JAX_FAIL_COUNT )) - # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - # echo "ERRORS=${num_errors}" >> $GITHUB_OUTPUT - # echo "PASSED_TESTS=${num_passed}" >> $GITHUB_OUTPUT - # echo "FAILED_TESTS=${num_failed}" >> $GITHUB_OUTPUT - # ARTIFACTS: | - # # pytest-driven part - # test-nsys-jax.log - # pytest-report.jsonl - # # nsys-jax logfiles - # *process-*-execution.log - # # nsys-jax output for the case that doesn't use nsys-jax-combine - # 1-process-*-execution-0.zip - # # nsys-jax-combine output/logfiles - # *process*-*-execution.zip - # *-execution-combine.log - # secrets: inherit + test-nsys-jax: + needs: build-jax + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + uses: ./.github/workflows/_test_unit.yaml + with: + TEST_NAME: nsys-jax + EXECUTE: | + set -o pipefail + num_tests=0 + num_failures=0 + # Run the pytest-driven tests; failure is explicitly handled below so set +e to + # avoid an early 
abort here. + set +e + docker run -i --shm-size=1g --gpus all \ + -v $PWD:/opt/output \ + ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ + bash <<"EOF" |& tee test-nsys-jax.log + # nsys-jax is already installed, this is just adding the test dependencies + pip install pytest-reportlog nsys-jax[test] + # abuse knowledge that nsys-jax is installed editable, so the tests exist + test_path=$(python -c 'import importlib.resources; print(importlib.resources.files("nsys_jax").joinpath("..", "tests").resolve())') + pytest --report-log=/opt/output/pytest-report.jsonl "${test_path}" + EOF + set -e + GPUS_PER_NODE=$(nvidia-smi -L | grep -c '^GPU') + for mode in 1-process 2-process process-per-gpu; do + DOCKER="docker run --shm-size=1g --gpus all --env XLA_FLAGS=--xla_gpu_enable_command_buffer= --env XLA_PYTHON_CLIENT_MEM_FRACTION=0.9 -v ${PWD}:/opt/output ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}" + if [[ "${mode}" == "1-process" ]]; then + PROCESS_COUNT=1 + ARGS="" + elif [[ "${mode}" == "2-process" ]]; then + # Use two processes with GPUS_PER_NODE/2 GPUs per process in the hope that + # this will flush out more bugs than process-per-node or process-per-GPU. 
+ PROCESS_COUNT=2 + ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process $((GPUS_PER_NODE/2)) --distributed" + else + PROCESS_COUNT=${GPUS_PER_NODE} + ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process 1 --distributed" + fi + for collection in full partial; do + NSYS_JAX="nsys-jax" + if [[ "${mode}" == "1-process" ]]; then + # We will not run nsys-jax-combine, so run analyses eagerly + NSYS_JAX+=" --nsys-jax-analysis communication --nsys-jax-analysis summary" + fi + NSYS_JAX+=" --output=/opt/output/${mode}-${collection}-execution-%q{RANK}" + if [[ "${collection}" == "partial" ]]; then + NSYS_JAX+=" --capture-range=cudaProfilerApi --capture-range-end=stop" + # nvbug/4801401 + NSYS_JAX+=" --sample=none" + fi + set +e + ${DOCKER} parallel-launch RANK ${PROCESS_COUNT} ${NSYS_JAX} \ + -- jax-nccl-test ${ARGS} |& tee ${mode}-${collection}-execution.log + num_failures=$((num_failures + ($? 
!= 0))) + set -e + num_tests=$((num_tests + 1)) + done + if [[ "${mode}" != "1-process" ]]; then + # Run nsys-jax-combine + NSYS_JAX_COMBINE="nsys-jax-combine --analysis communication --analysis summary --output=/opt/output/${mode}-${collection}-execution.zip" + for (( i=0; i> $GITHUB_ENV + echo "NSYS_JAX_FAIL_COUNT=${num_failures}" >> $GITHUB_ENV + exit $num_failures + STATISTICS_SCRIPT: | + summary_line=$(tail -n1 test-nsys-jax.log) + num_errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) + failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) + total_tests=$(( NSYS_JAX_TEST_COUNT + passed_tests + failed_tests )) + num_passed=$(( passed_tests + NSYS_JAX_TEST_COUNT - NSYS_JAX_FAIL_COUNT )) + num_failed=$(( failed_tests + NSYS_JAX_FAIL_COUNT )) + echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + echo "ERRORS=${num_errors}" >> $GITHUB_OUTPUT + echo "PASSED_TESTS=${num_passed}" >> $GITHUB_OUTPUT + echo "FAILED_TESTS=${num_failed}" >> $GITHUB_OUTPUT + ARTIFACTS: | + # pytest-driven part + test-nsys-jax.log + pytest-report.jsonl + # nsys-jax logfiles + *process-*-execution.log + # nsys-jax output for the case that doesn't use nsys-jax-combine + 1-process-*-execution-0.zip + # nsys-jax-combine output/logfiles + *process*-*-execution.zip + *-execution-combine.log + secrets: inherit - # test-nsys-jax generates several fresh .zip archive outputs by running nsys-jax with real GPU hardware; this test - # runs on a regular GitHub Actions runner and checks that offline post-processing works in an environment that does - # not already have nsys-jax installed - # test-nsys-jax-archive: - # needs: test-nsys-jax - # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # 
strategy: - # matrix: - # os: [ubuntu-22.04, ubuntu-24.04, macOS-latest] - # runs-on: ${{ matrix.os }} - # steps: - # - name: Download nsys-jax output .zip files - # uses: actions/download-artifact@v4 - # with: - # name: nsys-jax-unit-test-A100 - # - name: Extract archives and execute install scripts - # run: | - # pip install virtualenv # for install.sh - # for zip in $(ls *.zip); do - # ZIP="${PWD}/${zip}" - # pushd $(mktemp -d) - # unzip "${ZIP}" - # ls -l - # # TODO: verify this isn't needed, or make sure it isn't needed - # chmod 755 install.sh - # # Run the notebook with IPython, not Jupyter Lab, so it exits and prints something informative to stdout - # # Skip executing Jupyter lab - # NSYS_JAX_JUPYTER_EXECUTE_NOT_LAB=1 ./install.sh - # popd - # done + test-nsys-jax generates several fresh .zip archive outputs by running nsys-jax with real GPU hardware; this test + runs on a regular GitHub Actions runner and checks that offline post-processing works in an environment that does + not already have nsys-jax installed + test-nsys-jax-archive: + needs: test-nsys-jax + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + strategy: + matrix: + os: [ubuntu-22.04, ubuntu-24.04, macOS-latest] + runs-on: ${{ matrix.os }} + steps: + - name: Download nsys-jax output .zip files + uses: actions/download-artifact@v4 + with: + name: nsys-jax-unit-test-A100 + - name: Extract archives and execute install scripts + run: | + pip install virtualenv # for install.sh + for zip in $(ls *.zip); do + ZIP="${PWD}/${zip}" + pushd $(mktemp -d) + unzip "${ZIP}" + ls -l + # TODO: verify this isn't needed, or make sure it isn't needed + chmod 755 install.sh + # Run the notebook with IPython, not Jupyter Lab, so it exits and prints something informative to stdout + # Skip executing Jupyter lab + NSYS_JAX_JUPYTER_EXECUTE_NOT_LAB=1 ./install.sh + popd + done test-nsys-jax-eks: needs: build-jax From 1a97746a6b59a7c254854b86a960e49af98574c8 Mon Sep 17 00:00:00 2001 From: Steboss Date: Tue, 
18 Feb 2025 10:33:38 +0000 Subject: [PATCH 38/89] fix comments --- .github/workflows/_ci.yaml | 364 ++++++++++++++++++------------------- 1 file changed, 182 insertions(+), 182 deletions(-) diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 1164e2a3d..89397634d 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -425,9 +425,9 @@ jobs: *-execution-combine.log secrets: inherit - test-nsys-jax generates several fresh .zip archive outputs by running nsys-jax with real GPU hardware; this test - runs on a regular GitHub Actions runner and checks that offline post-processing works in an environment that does - not already have nsys-jax installed + #test-nsys-jax generates several fresh .zip archive outputs by running nsys-jax with real GPU hardware; this test + #runs on a regular GitHub Actions runner and checks that offline post-processing works in an environment that does + #not already have nsys-jax installed test-nsys-jax-archive: needs: test-nsys-jax if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a @@ -517,186 +517,186 @@ jobs: with: token-name: ${{ env.TOKEN_NAME }} - # test-equinox: - # needs: build-equinox - # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # uses: ./.github/workflows/_test_unit.yaml - # with: - # IMAGE: ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} - # TEST_NAME: equinox - # EXECUTE: | - # docker run --shm-size=1g --gpus all ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} \ - # bash -exc -o pipefail \ - # 'pushd /opt/equinox/tests && pip install -r requirements.txt && pytest .' 
| tee test-equinox.log - # STATISTICS_SCRIPT: | - # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - # failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') - # passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') - # total_tests=$((failed_tests + passed_tests)) - # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - # ARTIFACTS: | - # test-equinox.log - # secrets: inherit - - # test-te-multigpu: - # needs: build-upstream-pax - # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # uses: ./.github/workflows/_test_te.yaml - # with: - # TE_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} - # secrets: inherit - - # test-upstream-t5x: - # needs: build-upstream-t5x - # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # uses: ./.github/workflows/_test_upstream_t5x.yaml - # with: - # T5X_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }} - # secrets: inherit - - # test-rosetta-t5x: - # needs: build-rosetta-t5x - # if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 - # uses: ./.github/workflows/_test_t5x_rosetta.yaml - # with: - # T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }} - # secrets: inherit - - # test-triton: - # needs: build-triton - # if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 - # uses: ./.github/workflows/_test_unit.yaml - # with: - # TEST_NAME: triton - # EXECUTE: | - # docker run -i --shm-size=1g --gpus all --volume $PWD:/output \ - # ${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }} \ - # bash <<"EOF" |& tee test-triton.log - # # autotuner tests from jax-triton now hit a triton code path that uses utilities from pytorch; this relies on - # # actually having a CUDA 
backend for pytoch - # pip install --no-deps torch - # python /opt/jax-triton/tests/triton_call_test.py --xml_output_file /output/triton_test.xml - # EOF - # STATISTICS_SCRIPT: | - # curl -L -o yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg --print-architecture) && chmod 777 yq; - # total_tests=$(./yq '.testsuites."+@tests"' triton_test.xml) - # errors=$(./yq '.testsuites."+@errors"' triton_test.xml) - # failed_tests=$(./yq '.testsuites."+@failures"' triton_test.xml) - # passed_tests=$((total_tests - errors - failed_tests)) - # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - # ARTIFACTS: | - # test-triton.log - # secrets: inherit - - # test-levanter: - # needs: build-levanter - # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # uses: ./.github/workflows/_test_unit.yaml - # with: - # TEST_NAME: levanter - # EXECUTE: | - # docker run -i --gpus all --shm-size=1g \ - # ${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }} \ - # bash <<"EOF" |& tee test-levanter.log - # pip install flake8 pytest soundfile librosa - # PYTHONPATH=/opt/levanter/tests:$PYTHONPATH pytest /opt/levanter/tests -m "not entry and not slow and not ray" - # EOF - # STATISTICS_SCRIPT: | - # summary_line=$(tail -n1 test-levanter.log) - # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - # failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') - # passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') - # total_tests=$((failed_tests + passed_tests)) - # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - # echo "FAILED_TESTS=${failed_tests}" >> 
$GITHUB_OUTPUT - # ARTIFACTS: | - # test-levanter.log - # secrets: inherit - - # test-te: - # needs: build-upstream-pax - # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # uses: ./.github/workflows/_test_unit.yaml - # with: - # TEST_NAME: te - # EXECUTE: | - # docker run -i --gpus all --shm-size=1g -v $PWD:/log \ - # ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} \ - # bash <<"EOF" |& tee test-te.log - # pip install pytest-reportlog - # pytest --report-log=log/pytest-report.jsonl ${SRC_PATH_TRANSFORMER_ENGINE}/tests/jax - # EOF - # STATISTICS_SCRIPT: | - # summary_line=$(tail -n1 test-te.log) - # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - # passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) - # failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) - # total_tests=$((failed_tests + passed_tests)) - # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - # TIMEOUT_MINUTES: 120 - # ARTIFACTS: | - # test-te.log - # pytest-report.jsonl - # secrets: inherit - - # test-upstream-pax: - # needs: build-upstream-pax - # if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 - # uses: ./.github/workflows/_test_upstream_pax.yaml - # with: - # PAX_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} - # secrets: inherit - - # test-rosetta-pax: - # needs: build-rosetta-pax - # if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 - # uses: ./.github/workflows/_test_pax_rosetta.yaml - # with: - # PAX_IMAGE: ${{ needs.build-rosetta-pax.outputs.DOCKER_TAG_FINAL }} - # secrets: inherit + test-equinox: + needs: build-equinox + if: 
inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + uses: ./.github/workflows/_test_unit.yaml + with: + IMAGE: ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} + TEST_NAME: equinox + EXECUTE: | + docker run --shm-size=1g --gpus all ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} \ + bash -exc -o pipefail \ + 'pushd /opt/equinox/tests && pip install -r requirements.txt && pytest .' | tee test-equinox.log + STATISTICS_SCRIPT: | + errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') + passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') + total_tests=$((failed_tests + passed_tests)) + echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + ARTIFACTS: | + test-equinox.log + secrets: inherit + + test-te-multigpu: + needs: build-upstream-pax + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + uses: ./.github/workflows/_test_te.yaml + with: + TE_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} + secrets: inherit + + test-upstream-t5x: + needs: build-upstream-t5x + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + uses: ./.github/workflows/_test_upstream_t5x.yaml + with: + T5X_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }} + secrets: inherit + + test-rosetta-t5x: + needs: build-rosetta-t5x + if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 + uses: ./.github/workflows/_test_t5x_rosetta.yaml + with: + T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }} + secrets: inherit + + test-triton: + needs: build-triton + if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 + uses: ./.github/workflows/_test_unit.yaml + with: + TEST_NAME: triton + EXECUTE: 
| + docker run -i --shm-size=1g --gpus all --volume $PWD:/output \ + ${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }} \ + bash <<"EOF" |& tee test-triton.log + # autotuner tests from jax-triton now hit a triton code path that uses utilities from pytorch; this relies on + # actually having a CUDA backend for pytoch + pip install --no-deps torch + python /opt/jax-triton/tests/triton_call_test.py --xml_output_file /output/triton_test.xml + EOF + STATISTICS_SCRIPT: | + curl -L -o yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg --print-architecture) && chmod 777 yq; + total_tests=$(./yq '.testsuites."+@tests"' triton_test.xml) + errors=$(./yq '.testsuites."+@errors"' triton_test.xml) + failed_tests=$(./yq '.testsuites."+@failures"' triton_test.xml) + passed_tests=$((total_tests - errors - failed_tests)) + echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + ARTIFACTS: | + test-triton.log + secrets: inherit + + test-levanter: + needs: build-levanter + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + uses: ./.github/workflows/_test_unit.yaml + with: + TEST_NAME: levanter + EXECUTE: | + docker run -i --gpus all --shm-size=1g \ + ${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }} \ + bash <<"EOF" |& tee test-levanter.log + pip install flake8 pytest soundfile librosa + PYTHONPATH=/opt/levanter/tests:$PYTHONPATH pytest /opt/levanter/tests -m "not entry and not slow and not ray" + EOF + STATISTICS_SCRIPT: | + summary_line=$(tail -n1 test-levanter.log) + errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') + passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') + 
total_tests=$((failed_tests + passed_tests)) + echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + ARTIFACTS: | + test-levanter.log + secrets: inherit + + test-te: + needs: build-upstream-pax + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + uses: ./.github/workflows/_test_unit.yaml + with: + TEST_NAME: te + EXECUTE: | + docker run -i --gpus all --shm-size=1g -v $PWD:/log \ + ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} \ + bash <<"EOF" |& tee test-te.log + pip install pytest-reportlog + pytest --report-log=log/pytest-report.jsonl ${SRC_PATH_TRANSFORMER_ENGINE}/tests/jax + EOF + STATISTICS_SCRIPT: | + summary_line=$(tail -n1 test-te.log) + errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) + failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) + total_tests=$((failed_tests + passed_tests)) + echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + TIMEOUT_MINUTES: 120 + ARTIFACTS: | + test-te.log + pytest-report.jsonl + secrets: inherit + + test-upstream-pax: + needs: build-upstream-pax + if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 + uses: ./.github/workflows/_test_upstream_pax.yaml + with: + PAX_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} + secrets: inherit + + test-rosetta-pax: + needs: build-rosetta-pax + if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 + uses: 
./.github/workflows/_test_pax_rosetta.yaml + with: + PAX_IMAGE: ${{ needs.build-rosetta-pax.outputs.DOCKER_TAG_FINAL }} + secrets: inherit - # test-gemma: - # needs: build-gemma - # uses: ./.github/workflows/_test_unit.yaml - # if: inputs.ARCHITECTURE == 'amd64' - # with: - # TEST_NAME: gemma - # EXECUTE: | - # docker run --shm-size=1g --gpus all ${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }} \ - # bash -ec \ - # "cd /opt/gemma && pip install -e .[dev] && pytest ." | tee test-gemma.log - # STATISTICS_SCRIPT: | - # summary_line=$(tail -n1 test-gemma.log) - # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - # failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') - # passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') - # total_tests=$((failed_tests + passed_tests)) - # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - # ARTIFACTS: | - # test-gemma.log - # secrets: inherit - - # test-maxtext: - # needs: build-maxtext - # if: inputs.ARCHITECTURE == 'amd64' # no arm64 gpu runners - # uses: ./.github/workflows/_test_maxtext.yaml - # with: - # MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }} - # secrets: inherit + test-gemma: + needs: build-gemma + uses: ./.github/workflows/_test_unit.yaml + if: inputs.ARCHITECTURE == 'amd64' + with: + TEST_NAME: gemma + EXECUTE: | + docker run --shm-size=1g --gpus all ${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }} \ + bash -ec \ + "cd /opt/gemma && pip install -e .[dev] && pytest ." 
| tee test-gemma.log + STATISTICS_SCRIPT: | + summary_line=$(tail -n1 test-gemma.log) + errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') + passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') + total_tests=$((failed_tests + passed_tests)) + echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + ARTIFACTS: | + test-gemma.log + secrets: inherit + + test-maxtext: + needs: build-maxtext + if: inputs.ARCHITECTURE == 'amd64' # no arm64 gpu runners + uses: ./.github/workflows/_test_maxtext.yaml + with: + MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }} + secrets: inherit test-axlearn-eks: needs: build-axlearn From 852d381c189fb1d46e501c8e138d001a33ecb2b3 Mon Sep 17 00:00:00 2001 From: Steboss Date: Tue, 18 Feb 2025 10:37:41 +0000 Subject: [PATCH 39/89] test axlearn --- .github/workflows/_ci.yaml | 974 ++++++++++++++++++------------------- 1 file changed, 487 insertions(+), 487 deletions(-) diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 5d7a2cf0a..d2055d6a8 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -66,115 +66,115 @@ jobs: URLREF_TRANSFORMER_ENGINE=${{ fromJson(inputs.SOURCE_URLREFS).TRANSFORMER_ENGINE }} secrets: inherit - build-triton: - needs: build-jax - if: inputs.ARCHITECTURE == 'amd64' # Triton does not seem to support arm64 - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: artifact-triton-build - BADGE_FILENAME: badge-triton-build - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: triton - DOCKERFILE: 
.github/container/Dockerfile.triton - RUNNER_SIZE: large - EXTRA_BUILD_ARGS: | - URLREF_JAX_TRITON=${{ fromJson(inputs.SOURCE_URLREFS).JAX_TRITON }} - secrets: inherit + # build-triton: + # needs: build-jax + # if: inputs.ARCHITECTURE == 'amd64' # Triton does not seem to support arm64 + # uses: ./.github/workflows/_build.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: artifact-triton-build + # BADGE_FILENAME: badge-triton-build + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: triton + # DOCKERFILE: .github/container/Dockerfile.triton + # RUNNER_SIZE: large + # EXTRA_BUILD_ARGS: | + # URLREF_JAX_TRITON=${{ fromJson(inputs.SOURCE_URLREFS).JAX_TRITON }} + # secrets: inherit - build-equinox: - needs: build-jax - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: artifact-equinox-build - BADGE_FILENAME: badge-equinox-build - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: equinox - DOCKERFILE: .github/container/Dockerfile.equinox - EXTRA_BUILD_ARGS: | - URLREF_EQUINOX=${{ fromJson(inputs.SOURCE_URLREFS).EQUINOX }} - secrets: inherit + # build-equinox: + # needs: build-jax + # uses: ./.github/workflows/_build.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: artifact-equinox-build + # BADGE_FILENAME: badge-equinox-build + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: equinox + # DOCKERFILE: .github/container/Dockerfile.equinox + # EXTRA_BUILD_ARGS: | + # URLREF_EQUINOX=${{ fromJson(inputs.SOURCE_URLREFS).EQUINOX }} + # secrets: inherit - build-maxtext: - needs: build-jax - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: artifact-maxtext-build - BADGE_FILENAME: 
badge-maxtext-build - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: maxtext - DOCKERFILE: .github/container/Dockerfile.maxtext - EXTRA_BUILD_ARGS: | - URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }} - secrets: inherit + # build-maxtext: + # needs: build-jax + # uses: ./.github/workflows/_build.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: artifact-maxtext-build + # BADGE_FILENAME: badge-maxtext-build + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: maxtext + # DOCKERFILE: .github/container/Dockerfile.maxtext + # EXTRA_BUILD_ARGS: | + # URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }} + # secrets: inherit - build-levanter: - needs: [build-jax] - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: "artifact-levanter-build" - BADGE_FILENAME: "badge-levanter-build" - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: levanter - DOCKERFILE: .github/container/Dockerfile.levanter - EXTRA_BUILD_ARGS: | - URLREF_LEVANTER=${{ fromJson(inputs.SOURCE_URLREFS).LEVANTER }} - URLREF_HALIAX=${{ fromJson(inputs.SOURCE_URLREFS).HALIAX }} - secrets: inherit + # build-levanter: + # needs: [build-jax] + # uses: ./.github/workflows/_build.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: "artifact-levanter-build" + # BADGE_FILENAME: "badge-levanter-build" + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: levanter + # DOCKERFILE: .github/container/Dockerfile.levanter + # EXTRA_BUILD_ARGS: | + # URLREF_LEVANTER=${{ fromJson(inputs.SOURCE_URLREFS).LEVANTER }} + # URLREF_HALIAX=${{ fromJson(inputs.SOURCE_URLREFS).HALIAX }} + # secrets: inherit - 
build-upstream-t5x: - needs: build-jax - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: "artifact-t5x-build" - BADGE_FILENAME: "badge-t5x-build" - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: upstream-t5x - DOCKERFILE: .github/container/Dockerfile.t5x - EXTRA_BUILD_ARGS: | - URLREF_T5X=${{ fromJson(inputs.SOURCE_URLREFS).T5X }} - URLREF_AIRIO=${{ fromJson(inputs.SOURCE_URLREFS).AIRIO }} - secrets: inherit + # build-upstream-t5x: + # needs: build-jax + # uses: ./.github/workflows/_build.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: "artifact-t5x-build" + # BADGE_FILENAME: "badge-t5x-build" + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: upstream-t5x + # DOCKERFILE: .github/container/Dockerfile.t5x + # EXTRA_BUILD_ARGS: | + # URLREF_T5X=${{ fromJson(inputs.SOURCE_URLREFS).T5X }} + # URLREF_AIRIO=${{ fromJson(inputs.SOURCE_URLREFS).AIRIO }} + # secrets: inherit - build-rosetta-t5x: - needs: build-upstream-t5x - uses: ./.github/workflows/_build_rosetta.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }} - BASE_LIBRARY: t5x - secrets: inherit + # build-rosetta-t5x: + # needs: build-upstream-t5x + # uses: ./.github/workflows/_build_rosetta.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }} + # BASE_LIBRARY: t5x + # secrets: inherit - build-gemma: - needs: build-jax - uses: ./.github/workflows/_build.yaml - if: inputs.ARCHITECTURE == 'amd64' # build only amd64 - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: artifact-gemma-build - BADGE_FILENAME: badge-gemma-build - 
BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: gemma - DOCKERFILE: rosetta/Dockerfile.gemma - DOCKER_CONTEXT: . - EXTRA_BUILD_ARGS: | - URLREF_GEMMA=${{ fromJson(inputs.SOURCE_URLREFS).GEMMA }} - URLREF_BIG_VISION=${{ fromJson(inputs.SOURCE_URLREFS).BIG_VISION }} - URLREF_COMMON_LOOP_UTILS=${{ fromJson(inputs.SOURCE_URLREFS).COMMON_LOOP_UTILS }} - URLREF_FLAXFORMER=${{ fromJson(inputs.SOURCE_URLREFS).FLAXFORMER }} - URLREF_PANOPTICAPI=${{ fromJson(inputs.SOURCE_URLREFS).PANOPTICAPI }} - secrets: inherit + # build-gemma: + # needs: build-jax + # uses: ./.github/workflows/_build.yaml + # if: inputs.ARCHITECTURE == 'amd64' # build only amd64 + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: artifact-gemma-build + # BADGE_FILENAME: badge-gemma-build + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: gemma + # DOCKERFILE: rosetta/Dockerfile.gemma + # DOCKER_CONTEXT: . 
+ # EXTRA_BUILD_ARGS: | + # URLREF_GEMMA=${{ fromJson(inputs.SOURCE_URLREFS).GEMMA }} + # URLREF_BIG_VISION=${{ fromJson(inputs.SOURCE_URLREFS).BIG_VISION }} + # URLREF_COMMON_LOOP_UTILS=${{ fromJson(inputs.SOURCE_URLREFS).COMMON_LOOP_UTILS }} + # URLREF_FLAXFORMER=${{ fromJson(inputs.SOURCE_URLREFS).FLAXFORMER }} + # URLREF_PANOPTICAPI=${{ fromJson(inputs.SOURCE_URLREFS).PANOPTICAPI }} + # secrets: inherit build-axlearn: needs: build-jax @@ -194,14 +194,14 @@ jobs: if: "!cancelled()" needs: - build-base - - build-jax - - build-triton - - build-equinox - - build-maxtext - - build-levanter - - build-upstream-t5x - - build-rosetta-t5x - - build-gemma + # - build-jax + # - build-triton + # - build-equinox + # - build-maxtext + # - build-levanter + # - build-upstream-t5x + # - build-rosetta-t5x + # - build-gemma - build-axlearn outputs: TAGS: ${{ steps.collect-tags.outputs.TAGS }} @@ -213,22 +213,22 @@ jobs: [\ {"flavor": "base", "stage": "final", "priority": 800, "tag": "${{ needs.build-base.outputs.DOCKER_TAG }}"},\ {"flavor": "jax", "stage": "final", "priority": 1000, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "triton", "stage": "final", "priority": 900, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "equinox", "stage": "final", "priority": 900, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "maxtext", "stage": "final", "priority": 900, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "levanter", "stage": "final", "priority": 900, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "upstream-t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "gemma", "stage": "final", "priority": 900, "tag": "${{ 
needs.build-gemma.outputs.DOCKER_TAG_FINAL }}"},\ + # {"flavor": "triton", "stage": "final", "priority": 900, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }}"},\ + # {"flavor": "equinox", "stage": "final", "priority": 900, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }}"},\ + # {"flavor": "maxtext", "stage": "final", "priority": 900, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}"},\ + # {"flavor": "levanter", "stage": "final", "priority": 900, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }}"},\ + # {"flavor": "upstream-t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }}"},\ + # {"flavor": "t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }}"},\ + # {"flavor": "gemma", "stage": "final", "priority": 900, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "axlearn", "stage": "final", "priority": 900, "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "jax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "triton", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "equinox", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "maxtext", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "levanter", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "upstream-t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "gemma", "stage": "mealkit", "priority": 500, 
"tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_MEALKIT }}"},\ + # {"flavor": "triton", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_MEALKIT }}"},\ + # {"flavor": "equinox", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_MEALKIT }}"},\ + # {"flavor": "maxtext", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_MEALKIT }}"},\ + # {"flavor": "levanter", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_MEALKIT }}"},\ + # {"flavor": "upstream-t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ + # {"flavor": "t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ + # {"flavor": "gemma", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_MEALKIT }}"},\ {"flavor": "axlearn", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_MEALKIT }}"},\ {}\ @@ -238,275 +238,275 @@ jobs: echo "TAGS=${TAGS}" >> $GITHUB_OUTPUT - test-distribution: - runs-on: ubuntu-22.04 - strategy: - matrix: - TEST_SCRIPT: - - extra-only-distribution.sh - - mirror-only-distribution.sh - - upstream-only-distribution.sh - - local-patch-distribution.sh - fail-fast: false - steps: - - name: Print environment variables - run: env - - name: Set git login for tests - run: | - git config --global user.email "jax@nvidia.com" - git config --global user.name "JAX-Toolbox CI" - - name: Check out the repository under ${GITHUB_WORKSPACE} - uses: actions/checkout@v4 - - name: Run integration test ${{ matrix.TEST_SCRIPT }} - run: bash rosetta/tests/${{ matrix.TEST_SCRIPT }} - - test-jax: - needs: build-jax - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_unit.yaml - with: - TEST_NAME: jax - EXECUTE: | - docker run -i --shm-size=1g 
--gpus all \ - ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee test-backend-independent.log - test-jax.sh -b backend-independent - EOF - docker run -i --shm-size=1g --gpus all \ - ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee tee test-gpu.log - nvidia-cuda-mps-control -d - test-jax.sh -b gpu - EOF - STATISTICS_SCRIPT: | - errors=$(cat test-*.log | grep -c 'ERROR:' || true) - failed_tests=$(cat test-*.log | grep -c 'FAILED in' || true) - passed_tests=$(cat test-*.log | grep -c 'PASSED in' || true) - total_tests=$((failed_tests + passed_tests)) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - ARTIFACTS: | - test-backend-independent.log - test-gpu.log - secrets: inherit + # test-distribution: + # runs-on: ubuntu-22.04 + # strategy: + # matrix: + # TEST_SCRIPT: + # - extra-only-distribution.sh + # - mirror-only-distribution.sh + # - upstream-only-distribution.sh + # - local-patch-distribution.sh + # fail-fast: false + # steps: + # - name: Print environment variables + # run: env + # - name: Set git login for tests + # run: | + # git config --global user.email "jax@nvidia.com" + # git config --global user.name "JAX-Toolbox CI" + # - name: Check out the repository under ${GITHUB_WORKSPACE} + # uses: actions/checkout@v4 + # - name: Run integration test ${{ matrix.TEST_SCRIPT }} + # run: bash rosetta/tests/${{ matrix.TEST_SCRIPT }} + + # test-jax: + # needs: build-jax + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # uses: ./.github/workflows/_test_unit.yaml + # with: + # TEST_NAME: jax + # EXECUTE: | + # docker run -i --shm-size=1g --gpus all \ + # ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-backend-independent.log + # test-jax.sh -b backend-independent + # EOF + # docker run -i --shm-size=1g --gpus all \ + # ${{ 
needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee tee test-gpu.log + # nvidia-cuda-mps-control -d + # test-jax.sh -b gpu + # EOF + # STATISTICS_SCRIPT: | + # errors=$(cat test-*.log | grep -c 'ERROR:' || true) + # failed_tests=$(cat test-*.log | grep -c 'FAILED in' || true) + # passed_tests=$(cat test-*.log | grep -c 'PASSED in' || true) + # total_tests=$((failed_tests + passed_tests)) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # ARTIFACTS: | + # test-backend-independent.log + # test-gpu.log + # secrets: inherit - test-nsys-jax: - needs: build-jax - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_unit.yaml - with: - TEST_NAME: nsys-jax - EXECUTE: | - set -o pipefail - num_tests=0 - num_failures=0 - # Run the pytest-driven tests; failure is explicitly handled below so set +e to - # avoid an early abort here. 
- set +e - docker run -i --shm-size=1g --gpus all \ - -v $PWD:/opt/output \ - ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee test-nsys-jax.log - # nsys-jax is already installed, this is just adding the test dependencies - pip install pytest-reportlog nsys-jax[test] - # abuse knowledge that nsys-jax is installed editable, so the tests exist - test_path=$(python -c 'import importlib.resources; print(importlib.resources.files("nsys_jax").joinpath("..", "tests").resolve())') - pytest --report-log=/opt/output/pytest-report.jsonl "${test_path}" - EOF - set -e - GPUS_PER_NODE=$(nvidia-smi -L | grep -c '^GPU') - for mode in 1-process 2-process process-per-gpu; do - DOCKER="docker run --shm-size=1g --gpus all --env XLA_FLAGS=--xla_gpu_enable_command_buffer= --env XLA_PYTHON_CLIENT_MEM_FRACTION=0.9 -v ${PWD}:/opt/output ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}" - if [[ "${mode}" == "1-process" ]]; then - PROCESS_COUNT=1 - ARGS="" - elif [[ "${mode}" == "2-process" ]]; then - # Use two processes with GPUS_PER_NODE/2 GPUs per process in the hope that - # this will flush out more bugs than process-per-node or process-per-GPU. 
- PROCESS_COUNT=2 - ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process $((GPUS_PER_NODE/2)) --distributed" - else - PROCESS_COUNT=${GPUS_PER_NODE} - ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process 1 --distributed" - fi - for collection in full partial; do - NSYS_JAX="nsys-jax" - if [[ "${mode}" == "1-process" ]]; then - # We will not run nsys-jax-combine, so run analyses eagerly - NSYS_JAX+=" --nsys-jax-analysis communication --nsys-jax-analysis summary" - fi - NSYS_JAX+=" --output=/opt/output/${mode}-${collection}-execution-%q{RANK}" - if [[ "${collection}" == "partial" ]]; then - NSYS_JAX+=" --capture-range=cudaProfilerApi --capture-range-end=stop" - # nvbug/4801401 - NSYS_JAX+=" --sample=none" - fi - set +e - ${DOCKER} parallel-launch RANK ${PROCESS_COUNT} ${NSYS_JAX} \ - -- jax-nccl-test ${ARGS} |& tee ${mode}-${collection}-execution.log - num_failures=$((num_failures + ($? 
!= 0))) - set -e - num_tests=$((num_tests + 1)) - done - if [[ "${mode}" != "1-process" ]]; then - # Run nsys-jax-combine - NSYS_JAX_COMBINE="nsys-jax-combine --analysis communication --analysis summary --output=/opt/output/${mode}-${collection}-execution.zip" - for (( i=0; i> $GITHUB_ENV - echo "NSYS_JAX_FAIL_COUNT=${num_failures}" >> $GITHUB_ENV - exit $num_failures - STATISTICS_SCRIPT: | - summary_line=$(tail -n1 test-nsys-jax.log) - num_errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) - failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) - total_tests=$(( NSYS_JAX_TEST_COUNT + passed_tests + failed_tests )) - num_passed=$(( passed_tests + NSYS_JAX_TEST_COUNT - NSYS_JAX_FAIL_COUNT )) - num_failed=$(( failed_tests + NSYS_JAX_FAIL_COUNT )) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${num_errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${num_passed}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${num_failed}" >> $GITHUB_OUTPUT - ARTIFACTS: | - # pytest-driven part - test-nsys-jax.log - pytest-report.jsonl - # nsys-jax logfiles - *process-*-execution.log - # nsys-jax output for the case that doesn't use nsys-jax-combine - 1-process-*-execution-0.zip - # nsys-jax-combine output/logfiles - *process*-*-execution.zip - *-execution-combine.log - secrets: inherit + # test-nsys-jax: + # needs: build-jax + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # uses: ./.github/workflows/_test_unit.yaml + # with: + # TEST_NAME: nsys-jax + # EXECUTE: | + # set -o pipefail + # num_tests=0 + # num_failures=0 + # # Run the pytest-driven tests; failure is explicitly handled below so set +e to + # # avoid an early abort here. 
+ # set +e + # docker run -i --shm-size=1g --gpus all \ + # -v $PWD:/opt/output \ + # ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-nsys-jax.log + # # nsys-jax is already installed, this is just adding the test dependencies + # pip install pytest-reportlog nsys-jax[test] + # # abuse knowledge that nsys-jax is installed editable, so the tests exist + # test_path=$(python -c 'import importlib.resources; print(importlib.resources.files("nsys_jax").joinpath("..", "tests").resolve())') + # pytest --report-log=/opt/output/pytest-report.jsonl "${test_path}" + # EOF + # set -e + # GPUS_PER_NODE=$(nvidia-smi -L | grep -c '^GPU') + # for mode in 1-process 2-process process-per-gpu; do + # DOCKER="docker run --shm-size=1g --gpus all --env XLA_FLAGS=--xla_gpu_enable_command_buffer= --env XLA_PYTHON_CLIENT_MEM_FRACTION=0.9 -v ${PWD}:/opt/output ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}" + # if [[ "${mode}" == "1-process" ]]; then + # PROCESS_COUNT=1 + # ARGS="" + # elif [[ "${mode}" == "2-process" ]]; then + # # Use two processes with GPUS_PER_NODE/2 GPUs per process in the hope that + # # this will flush out more bugs than process-per-node or process-per-GPU. 
+ # PROCESS_COUNT=2 + # ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process $((GPUS_PER_NODE/2)) --distributed" + # else + # PROCESS_COUNT=${GPUS_PER_NODE} + # ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process 1 --distributed" + # fi + # for collection in full partial; do + # NSYS_JAX="nsys-jax" + # if [[ "${mode}" == "1-process" ]]; then + # # We will not run nsys-jax-combine, so run analyses eagerly + # NSYS_JAX+=" --nsys-jax-analysis communication --nsys-jax-analysis summary" + # fi + # NSYS_JAX+=" --output=/opt/output/${mode}-${collection}-execution-%q{RANK}" + # if [[ "${collection}" == "partial" ]]; then + # NSYS_JAX+=" --capture-range=cudaProfilerApi --capture-range-end=stop" + # # nvbug/4801401 + # NSYS_JAX+=" --sample=none" + # fi + # set +e + # ${DOCKER} parallel-launch RANK ${PROCESS_COUNT} ${NSYS_JAX} \ + # -- jax-nccl-test ${ARGS} |& tee ${mode}-${collection}-execution.log + # num_failures=$((num_failures + ($? 
!= 0))) + # set -e + # num_tests=$((num_tests + 1)) + # done + # if [[ "${mode}" != "1-process" ]]; then + # # Run nsys-jax-combine + # NSYS_JAX_COMBINE="nsys-jax-combine --analysis communication --analysis summary --output=/opt/output/${mode}-${collection}-execution.zip" + # for (( i=0; i> $GITHUB_ENV + # echo "NSYS_JAX_FAIL_COUNT=${num_failures}" >> $GITHUB_ENV + # exit $num_failures + # STATISTICS_SCRIPT: | + # summary_line=$(tail -n1 test-nsys-jax.log) + # num_errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + # passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) + # failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) + # total_tests=$(( NSYS_JAX_TEST_COUNT + passed_tests + failed_tests )) + # num_passed=$(( passed_tests + NSYS_JAX_TEST_COUNT - NSYS_JAX_FAIL_COUNT )) + # num_failed=$(( failed_tests + NSYS_JAX_FAIL_COUNT )) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${num_errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${num_passed}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${num_failed}" >> $GITHUB_OUTPUT + # ARTIFACTS: | + # # pytest-driven part + # test-nsys-jax.log + # pytest-report.jsonl + # # nsys-jax logfiles + # *process-*-execution.log + # # nsys-jax output for the case that doesn't use nsys-jax-combine + # 1-process-*-execution-0.zip + # # nsys-jax-combine output/logfiles + # *process*-*-execution.zip + # *-execution-combine.log + # secrets: inherit #test-nsys-jax generates several fresh .zip archive outputs by running nsys-jax with real GPU hardware; this test #runs on a regular GitHub Actions runner and checks that offline post-processing works in an environment that does #not already have nsys-jax installed - test-nsys-jax-archive: - needs: test-nsys-jax - if: 
inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - strategy: - matrix: - os: [ubuntu-22.04, ubuntu-24.04, macOS-latest] - runs-on: ${{ matrix.os }} - steps: - - name: Download nsys-jax output .zip files - uses: actions/download-artifact@v4 - with: - name: nsys-jax-unit-test-A100 - - name: Extract archives and execute install scripts - run: | - pip install virtualenv # for install.sh - for zip in $(ls *.zip); do - ZIP="${PWD}/${zip}" - pushd $(mktemp -d) - unzip "${ZIP}" - ls -l - # TODO: verify this isn't needed, or make sure it isn't needed - chmod 755 install.sh - # Run the notebook with IPython, not Jupyter Lab, so it exits and prints something informative to stdout - # Skip executing Jupyter lab - NSYS_JAX_JUPYTER_EXECUTE_NOT_LAB=1 ./install.sh - popd - done - - test-nsys-jax-eks: - needs: build-jax - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - runs-on: eks - env: - JAX_DOCKER_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} - JOB_NAME: ${{ github.run_id }}-nsys-jax - POSTPROCESS_JOB_NAME: ${{ github.run_id }}-nsys-jax-postprocess - TOKEN_NAME: ${{ github.run_id }}-nsys-jax-token - steps: - - name: Check out the repository - uses: actions/checkout@v4 - - name: GHCR login - uses: ./.github/actions/ghcr-login - with: - docker-username: ${{ github.repository_owner }} - docker-password: ${{ secrets.GITHUB_TOKEN}} - token-name: ${{ env.TOKEN_NAME }} - - name: Configure Kubernetes job - run: | - yq -i ea 'select(di == 0).spec.selector.job-name = strenv(JOB_NAME) - | select(di == 1).metadata.name = strenv(JOB_NAME) - | select(di == 1).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) - | select(di == 1).spec.template.spec.containers[0].image = strenv(JAX_DOCKER_IMAGE) - | select(di == 1).spec.template.spec.containers[0].env[0].value = strenv(JOB_NAME)' \ - .github/eks-workflow-files/job.yml - git diff .github/eks-workflow-files/job.yml - - name: Submit Kubernetes job - uses: ./.github/actions/submit-k8s-job - with: - job-config-file: 
.github/eks-workflow-files/job.yml - job-name: ${{ env.JOB_NAME }} - - name: Delete eks job - uses: ./.github/actions/delete-k8s-job - if: always() - with: - job-name: ${{ env.JOB_NAME }} - - name: Configure post-processing job - run: | - export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip" - yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME) - | .spec.template.spec.containers[].image = strenv(JAX_DOCKER_IMAGE) - | .spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) - | .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \ - .github/eks-workflow-files/post-process-job.yml - git diff .github/eks-workflow-files/post-process-job.yml - - name: Submit post process k8s job - uses: ./.github/actions/submit-k8s-job - with: - job-config-file: .github/eks-workflow-files/post-process-job.yml - job-name: ${{ env.POSTPROCESS_JOB_NAME }} - - name: Delete post process k8s job - uses: ./.github/actions/delete-k8s-job - with: - job-name: ${{ env.POSTPROCESS_JOB_NAME }} - - name: Delete GitHub Container Registry token - uses: ./.github/actions/delete-ghcr-token - if: always() - with: - token-name: ${{ env.TOKEN_NAME }} - - test-equinox: - needs: build-equinox - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_unit.yaml - with: - IMAGE: ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} - TEST_NAME: equinox - EXECUTE: | - docker run --shm-size=1g --gpus all ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} \ - bash -exc -o pipefail \ - 'pushd /opt/equinox/tests && pip install -r requirements.txt && pytest .' 
| tee test-equinox.log - STATISTICS_SCRIPT: | - errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') - passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') - total_tests=$((failed_tests + passed_tests)) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - ARTIFACTS: | - test-equinox.log - secrets: inherit + # test-nsys-jax-archive: + # needs: test-nsys-jax + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # strategy: + # matrix: + # os: [ubuntu-22.04, ubuntu-24.04, macOS-latest] + # runs-on: ${{ matrix.os }} + # steps: + # - name: Download nsys-jax output .zip files + # uses: actions/download-artifact@v4 + # with: + # name: nsys-jax-unit-test-A100 + # - name: Extract archives and execute install scripts + # run: | + # pip install virtualenv # for install.sh + # for zip in $(ls *.zip); do + # ZIP="${PWD}/${zip}" + # pushd $(mktemp -d) + # unzip "${ZIP}" + # ls -l + # # TODO: verify this isn't needed, or make sure it isn't needed + # chmod 755 install.sh + # # Run the notebook with IPython, not Jupyter Lab, so it exits and prints something informative to stdout + # # Skip executing Jupyter lab + # NSYS_JAX_JUPYTER_EXECUTE_NOT_LAB=1 ./install.sh + # popd + # done + + # test-nsys-jax-eks: + # needs: build-jax + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # runs-on: eks + # env: + # JAX_DOCKER_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} + # JOB_NAME: ${{ github.run_id }}-nsys-jax + # POSTPROCESS_JOB_NAME: ${{ github.run_id }}-nsys-jax-postprocess + # TOKEN_NAME: ${{ github.run_id }}-nsys-jax-token + # steps: + # - name: Check out the repository + # uses: actions/checkout@v4 + # 
- name: GHCR login + # uses: ./.github/actions/ghcr-login + # with: + # docker-username: ${{ github.repository_owner }} + # docker-password: ${{ secrets.GITHUB_TOKEN}} + # token-name: ${{ env.TOKEN_NAME }} + # - name: Configure Kubernetes job + # run: | + # yq -i ea 'select(di == 0).spec.selector.job-name = strenv(JOB_NAME) + # | select(di == 1).metadata.name = strenv(JOB_NAME) + # | select(di == 1).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) + # | select(di == 1).spec.template.spec.containers[0].image = strenv(JAX_DOCKER_IMAGE) + # | select(di == 1).spec.template.spec.containers[0].env[0].value = strenv(JOB_NAME)' \ + # .github/eks-workflow-files/job.yml + # git diff .github/eks-workflow-files/job.yml + # - name: Submit Kubernetes job + # uses: ./.github/actions/submit-k8s-job + # with: + # job-config-file: .github/eks-workflow-files/job.yml + # job-name: ${{ env.JOB_NAME }} + # - name: Delete eks job + # uses: ./.github/actions/delete-k8s-job + # if: always() + # with: + # job-name: ${{ env.JOB_NAME }} + # - name: Configure post-processing job + # run: | + # export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip" + # yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME) + # | .spec.template.spec.containers[].image = strenv(JAX_DOCKER_IMAGE) + # | .spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) + # | .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \ + # .github/eks-workflow-files/post-process-job.yml + # git diff .github/eks-workflow-files/post-process-job.yml + # - name: Submit post process k8s job + # uses: ./.github/actions/submit-k8s-job + # with: + # job-config-file: .github/eks-workflow-files/post-process-job.yml + # job-name: ${{ env.POSTPROCESS_JOB_NAME }} + # - name: Delete post process k8s job + # uses: ./.github/actions/delete-k8s-job + # with: + # job-name: ${{ env.POSTPROCESS_JOB_NAME }} + # - name: Delete GitHub Container Registry token + # uses: ./.github/actions/delete-ghcr-token + # if: 
always() + # with: + # token-name: ${{ env.TOKEN_NAME }} + + # test-equinox: + # needs: build-equinox + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # uses: ./.github/workflows/_test_unit.yaml + # with: + # IMAGE: ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} + # TEST_NAME: equinox + # EXECUTE: | + # docker run --shm-size=1g --gpus all ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} \ + # bash -exc -o pipefail \ + # 'pushd /opt/equinox/tests && pip install -r requirements.txt && pytest .' | tee test-equinox.log + # STATISTICS_SCRIPT: | + # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + # failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') + # passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') + # total_tests=$((failed_tests + passed_tests)) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # ARTIFACTS: | + # test-equinox.log + # secrets: inherit # test-te-multigpu: # needs: build-upstream-pax @@ -516,77 +516,77 @@ jobs: # TE_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} # secrets: inherit - test-upstream-t5x: - needs: build-upstream-t5x - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_upstream_t5x.yaml - with: - T5X_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }} - secrets: inherit + # test-upstream-t5x: + # needs: build-upstream-t5x + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # uses: ./.github/workflows/_test_upstream_t5x.yaml + # with: + # T5X_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit - test-rosetta-t5x: - needs: build-rosetta-t5x - if: inputs.ARCHITECTURE == 'amd64' # no images for 
arm64 - uses: ./.github/workflows/_test_t5x_rosetta.yaml - with: - T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }} - secrets: inherit + # test-rosetta-t5x: + # needs: build-rosetta-t5x + # if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 + # uses: ./.github/workflows/_test_t5x_rosetta.yaml + # with: + # T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit - test-triton: - needs: build-triton - if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 - uses: ./.github/workflows/_test_unit.yaml - with: - TEST_NAME: triton - EXECUTE: | - docker run -i --shm-size=1g --gpus all --volume $PWD:/output \ - ${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee test-triton.log - # autotuner tests from jax-triton now hit a triton code path that uses utilities from pytorch; this relies on - # actually having a CUDA backend for pytoch - pip install --no-deps torch - python /opt/jax-triton/tests/triton_call_test.py --xml_output_file /output/triton_test.xml - EOF - STATISTICS_SCRIPT: | - curl -L -o yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg --print-architecture) && chmod 777 yq; - total_tests=$(./yq '.testsuites."+@tests"' triton_test.xml) - errors=$(./yq '.testsuites."+@errors"' triton_test.xml) - failed_tests=$(./yq '.testsuites."+@failures"' triton_test.xml) - passed_tests=$((total_tests - errors - failed_tests)) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - ARTIFACTS: | - test-triton.log - secrets: inherit + # test-triton: + # needs: build-triton + # if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 + # uses: ./.github/workflows/_test_unit.yaml + # with: + # TEST_NAME: triton + # EXECUTE: | + # docker run -i --shm-size=1g --gpus all --volume $PWD:/output \ + # ${{ 
needs.build-triton.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-triton.log + # # autotuner tests from jax-triton now hit a triton code path that uses utilities from pytorch; this relies on + # # actually having a CUDA backend for pytoch + # pip install --no-deps torch + # python /opt/jax-triton/tests/triton_call_test.py --xml_output_file /output/triton_test.xml + # EOF + # STATISTICS_SCRIPT: | + # curl -L -o yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg --print-architecture) && chmod 777 yq; + # total_tests=$(./yq '.testsuites."+@tests"' triton_test.xml) + # errors=$(./yq '.testsuites."+@errors"' triton_test.xml) + # failed_tests=$(./yq '.testsuites."+@failures"' triton_test.xml) + # passed_tests=$((total_tests - errors - failed_tests)) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # ARTIFACTS: | + # test-triton.log + # secrets: inherit - test-levanter: - needs: build-levanter - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_unit.yaml - with: - TEST_NAME: levanter - EXECUTE: | - docker run -i --gpus all --shm-size=1g \ - ${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee test-levanter.log - pip install flake8 pytest soundfile librosa - PYTHONPATH=/opt/levanter/tests:$PYTHONPATH pytest /opt/levanter/tests -m "not entry and not slow and not ray" - EOF - STATISTICS_SCRIPT: | - summary_line=$(tail -n1 test-levanter.log) - errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') - passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') - total_tests=$((failed_tests + passed_tests)) - echo 
"TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - ARTIFACTS: | - test-levanter.log - secrets: inherit + # test-levanter: + # needs: build-levanter + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # uses: ./.github/workflows/_test_unit.yaml + # with: + # TEST_NAME: levanter + # EXECUTE: | + # docker run -i --gpus all --shm-size=1g \ + # ${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-levanter.log + # pip install flake8 pytest soundfile librosa + # PYTHONPATH=/opt/levanter/tests:$PYTHONPATH pytest /opt/levanter/tests -m "not entry and not slow and not ray" + # EOF + # STATISTICS_SCRIPT: | + # summary_line=$(tail -n1 test-levanter.log) + # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + # failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') + # passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') + # total_tests=$((failed_tests + passed_tests)) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # ARTIFACTS: | + # test-levanter.log + # secrets: inherit # test-te: # needs: build-upstream-pax @@ -617,37 +617,37 @@ jobs: # pytest-report.jsonl # secrets: inherit - test-gemma: - needs: build-gemma - uses: ./.github/workflows/_test_unit.yaml - if: inputs.ARCHITECTURE == 'amd64' - with: - TEST_NAME: gemma - EXECUTE: | - docker run --shm-size=1g --gpus all ${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }} \ - bash -ec \ - "cd /opt/gemma && pip install -e .[dev] && pytest ." 
| tee test-gemma.log - STATISTICS_SCRIPT: | - summary_line=$(tail -n1 test-gemma.log) - errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') - passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') - total_tests=$((failed_tests + passed_tests)) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - ARTIFACTS: | - test-gemma.log - secrets: inherit + # test-gemma: + # needs: build-gemma + # uses: ./.github/workflows/_test_unit.yaml + # if: inputs.ARCHITECTURE == 'amd64' + # with: + # TEST_NAME: gemma + # EXECUTE: | + # docker run --shm-size=1g --gpus all ${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }} \ + # bash -ec \ + # "cd /opt/gemma && pip install -e .[dev] && pytest ." 
| tee test-gemma.log + # STATISTICS_SCRIPT: | + # summary_line=$(tail -n1 test-gemma.log) + # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + # failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') + # passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') + # total_tests=$((failed_tests + passed_tests)) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # ARTIFACTS: | + # test-gemma.log + # secrets: inherit - test-maxtext: - needs: build-maxtext - if: inputs.ARCHITECTURE == 'amd64' # no arm64 gpu runners - uses: ./.github/workflows/_test_maxtext.yaml - with: - MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }} - secrets: inherit + # test-maxtext: + # needs: build-maxtext + # if: inputs.ARCHITECTURE == 'amd64' # no arm64 gpu runners + # uses: ./.github/workflows/_test_maxtext.yaml + # with: + # MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit test-axlearn-eks: needs: build-axlearn From c4d3bbfd2a7302d57fbc11cd768baa4ebb7092e2 Mon Sep 17 00:00:00 2001 From: Steboss Date: Tue, 18 Feb 2025 11:36:25 +0000 Subject: [PATCH 40/89] fix nccl test variables, install in test file, make a signal for test finished --- .github/container/test-axlearn.sh | 3 +- .../axlearn/axlearn-job.yml | 8 +++-- .github/workflows/_ci.yaml | 14 ++++----- .github/workflows/nccl-k8s.yaml | 29 ++++++++++--------- 4 files changed, 29 insertions(+), 25 deletions(-) diff --git a/.github/container/test-axlearn.sh b/.github/container/test-axlearn.sh index 5d256706e..d2786a8ca 100755 --- a/.github/container/test-axlearn.sh +++ b/.github/container/test-axlearn.sh @@ -120,7 +120,8 @@ echo "Running tests..." 
# If we are on Kubernetes, install torch for cpu only if [ "$K8S" = true ]; then pip install torch --extra-index-url https://download.pytorch.org/whl/cpu - pip install transformers sklearn timm + pip install transformers + pip install scikit-learn timm fi if [ "${#TEST_FILES[@]}" -eq 0 ]; then diff --git a/.github/eks-workflow-files/axlearn/axlearn-job.yml b/.github/eks-workflow-files/axlearn/axlearn-job.yml index 7e7fe0f15..fd4a63d31 100644 --- a/.github/eks-workflow-files/axlearn/axlearn-job.yml +++ b/.github/eks-workflow-files/axlearn/axlearn-job.yml @@ -30,7 +30,9 @@ spec: # Wait a moment to ensure logs are flushed sync - + wait + # after execution flag the results have been produced + touch /opt/output/done resources: limits: nvidia.com/gpu: 8 @@ -46,8 +48,8 @@ spec: - sh - -c - | - # Wait for the summary file to appear - while [ ! -f /opt/output/summary.txt ]; do + # Wait for the tests to finish + while [ ! -f /opt/output/done ]; do sleep 1 done # Now upload to your S3 bucket diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index d2055d6a8..0a02fe54a 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -191,7 +191,7 @@ jobs: collect-docker-tags: runs-on: ubuntu-22.04 - if: "!cancelled()" + if: ${{ !cancelled() }} needs: - build-base # - build-jax @@ -457,7 +457,7 @@ jobs: # job-name: ${{ env.JOB_NAME }} # - name: Delete eks job # uses: ./.github/actions/delete-k8s-job - # if: always() + # if: ${{ always() }} # with: # job-name: ${{ env.JOB_NAME }} # - name: Configure post-processing job @@ -480,7 +480,7 @@ jobs: # job-name: ${{ env.POSTPROCESS_JOB_NAME }} # - name: Delete GitHub Container Registry token # uses: ./.github/actions/delete-ghcr-token - # if: always() + # if: ${{ always() }} # with: # token-name: ${{ env.TOKEN_NAME }} @@ -688,13 +688,13 @@ jobs: - name: Delete axlearn test job uses: ./.github/actions/delete-k8s-job - if: always() + if: ${{ always() }} with: job-name: ${{ env.JOB_NAME }} - name: Delete 
GitHub Container Registry token uses: ./.github/actions/delete-ghcr-token - if: always() + if: ${{ always() }} with: token-name: ${{ env.TOKEN_NAME }} @@ -717,7 +717,7 @@ jobs: echo "TOTAL_TESTS=$total_tests" >> $GITHUB_OUTPUT - name: Generate sitrep id: sitrep - if: "!cancelled()" + if: ${{ !cancelled() }} shell: bash -x -e {0} run: | # bring in utility functions @@ -750,7 +750,7 @@ jobs: > "badge-axlearn-test" - name: Upload artifacts - if: "!cancelled()" + if: ${{ !cancelled() }} uses: actions/upload-artifact@v4 with: name: "artifact-axlearn-test" diff --git a/.github/workflows/nccl-k8s.yaml b/.github/workflows/nccl-k8s.yaml index 816979355..aff2f8709 100644 --- a/.github/workflows/nccl-k8s.yaml +++ b/.github/workflows/nccl-k8s.yaml @@ -63,9 +63,10 @@ jobs: id: var shell: bash run: | - echo "JOB_NAME=${{ env.JOB_NAME}//_/-}" >> $GITHUB_OUTPUT - echo "LAUNCHER_NAME=nccl-test-${{ env.JOB_NAME}//_/-}-launcher" >> $GITHUB_OUTPUT - echo "TOKEN_NAME=nccl-test-${{ env.JOB_NAME}//_/-}-token" >> $GITHUB_OUTPUT + export JOB_NAME="nccl-test-${{ github.run_id }}-${TEST_NAME//_/-}" + echo "JOB_NAME=${JOB_NAME}" >> $GITHUB_OUTPUT + echo "LAUNCHER_NAME=${JOB_NAME}-launcher" >> $GITHUB_OUTPUT + echo "TOKEN_NAME=nccl-test-${JOB_NAME}-token" >> $GITHUB_OUTPUT - name: GHCR login and store K8s secret uses: ./.github/actions/ghcr-login @@ -76,9 +77,9 @@ jobs: - name: Configure Kubernetes job shell: bash run: | - export JOB_NAME="${{ steps.var.JOB_NAME }}" - export LAUNCHER_NAME="${{ steps.var.LAUNCHER_NAME }}" - export TOKEN_NAME="${{ steps.var.TOKEN_NAME }}" + export JOB_NAME="${{ steps.var.outputs.JOB_NAME }}" + export LAUNCHER_NAME="${{ steps.var.outputs.LAUNCHER_NAME }}" + export TOKEN_NAME="${{ steps.var.outputs.TOKEN_NAME }}" export TEST_NAME="${{ env.TEST_NAME }}" export WORKER_NAME="${JOB_NAME}-worker" @@ -100,11 +101,11 @@ jobs: uses: ./.github/actions/submit-k8s-job with: job-config-file: .github/eks-workflow-files/mpi-nccl-test.yml - job-name: ${{ steps.var.LAUNCHER_NAME 
}} + job-name: ${{ steps.var.outputs.LAUNCHER_NAME }} - name: Retrieve Kubernetes job status shell: bash -exo pipefail run: | - LAUNCHER_NAME="${{ steps.var.LAUNCHER_NAME }}" + LAUNCHER_NAME="${{ steps.var.outputs.LAUNCHER_NAME }}" while readarray -d : -t status < <(kubectl get job/${LAUNCHER_NAME} -o 'jsonpath={.status.failed}:{.status.succeeded}'); do failure=${status[0]:-0} success=${status[1]:-0} @@ -120,21 +121,21 @@ jobs: done exit ${failure} - name: Debug failed Kubernetes job - if: failure() + if: ${{ failure() }} shell: bash run: | - LAUNCHER_NAME="${{ steps.var.LAUNCHER_NAME }}" + LAUNCHER_NAME="${{ steps.var.outputs.LAUNCHER_NAME }}" pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${LAUNCHER_NAME} -o name) if [[ -n "${pods}" ]]; then kubectl describe ${pods} fi - name: Delete Kubernetes job - if: always() + if: ${{ always() }} uses: ./.github/actions/delete-k8s-job with: - job-name: ${{ steps.var.LAUNCHER_NAME }} + job-name: ${{ steps.var.outputs.LAUNCHER_NAME }} - name: Delete GitHub Container Registry token uses: ./.github/actions/delete-ghcr-token - if: always() + if: ${{ always() }} with: - token-name: ${{ steps.var.TOKEN_NAME }} \ No newline at end of file + token-name: ${{ steps.var.outputs.TOKEN_NAME }} \ No newline at end of file From 4b5a56b0d1220dd3914e370a5b0343c87addbbb5 Mon Sep 17 00:00:00 2001 From: Steboss Date: Tue, 18 Feb 2025 11:41:49 +0000 Subject: [PATCH 41/89] Fix var output --- .github/workflows/nccl-k8s.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nccl-k8s.yaml b/.github/workflows/nccl-k8s.yaml index aff2f8709..321fd861f 100644 --- a/.github/workflows/nccl-k8s.yaml +++ b/.github/workflows/nccl-k8s.yaml @@ -73,7 +73,7 @@ jobs: with: docker-username: ${{ github.repository_owner }} docker-password: ${{ secrets.GITHUB_TOKEN }} - token-name: ${{ steps.var.TOKEN_NAME }} + token-name: ${{ steps.var.outputs.TOKEN_NAME }} - name: Configure Kubernetes job shell: bash run: | From 
d205f6a2e199bf1944d45004e4fa942b6172633a Mon Sep 17 00:00:00 2001 From: Steboss Date: Tue, 18 Feb 2025 13:40:02 +0000 Subject: [PATCH 42/89] test clean --- .github/container/test-axlearn.sh | 2 +- .../axlearn/axlearn-job.yml | 9 +- .github/workflows/_ci.yaml | 1074 ++++++++--------- .github/workflows/nccl-k8s.yaml | 2 +- 4 files changed, 541 insertions(+), 546 deletions(-) diff --git a/.github/container/test-axlearn.sh b/.github/container/test-axlearn.sh index d2786a8ca..c62f36f5b 100755 --- a/.github/container/test-axlearn.sh +++ b/.github/container/test-axlearn.sh @@ -176,7 +176,7 @@ passed=0 SUMMARY_FILE="${OUTPUT_DIRECTORY}/summary.txt" -for test_file in "${final_test_files[@]:0:10}"; do +for test_file in "${final_test_files[@]}"; do echo "Running: ${test_file}" log_file_name=$(echo "${test_file%.py}" | sed 's/\//__/g').log log_file="${LOG_DIRECTORY}/${log_file_name}" diff --git a/.github/eks-workflow-files/axlearn/axlearn-job.yml b/.github/eks-workflow-files/axlearn/axlearn-job.yml index fd4a63d31..7c1022f61 100644 --- a/.github/eks-workflow-files/axlearn/axlearn-job.yml +++ b/.github/eks-workflow-files/axlearn/axlearn-job.yml @@ -19,16 +19,12 @@ spec: - pipefail - -c - | - # Example test command; adapted from your Docker run snippet - # Writes logs to /opt/output/test-backend-independent.log - # Also writes a summary file to /opt/output/summary.txt test-axlearn.sh \ --directory "." \ --output "/opt/output/" \ --test-files "/opt/axlearn/axlearn/common/*_test.py" \ --k8s - # Wait a moment to ensure logs are flushed sync wait # after execution flag the results have been produced @@ -48,11 +44,10 @@ spec: - sh - -c - | - # Wait for the tests to finish while [ ! 
-f /opt/output/done ]; do - sleep 1 + sleep 5 done - # Now upload to your S3 bucket + # Upload to S3 bucket aws s3 cp /opt/output/summary.txt s3://jax-toolbox-eks-output/axlearn/${TEST_DATE}/summary.txt volumeMounts: - name: output diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 0a02fe54a..1439f515c 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -66,115 +66,115 @@ jobs: URLREF_TRANSFORMER_ENGINE=${{ fromJson(inputs.SOURCE_URLREFS).TRANSFORMER_ENGINE }} secrets: inherit - # build-triton: - # needs: build-jax - # if: inputs.ARCHITECTURE == 'amd64' # Triton does not seem to support arm64 - # uses: ./.github/workflows/_build.yaml - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # ARTIFACT_NAME: artifact-triton-build - # BADGE_FILENAME: badge-triton-build - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - # CONTAINER_NAME: triton - # DOCKERFILE: .github/container/Dockerfile.triton - # RUNNER_SIZE: large - # EXTRA_BUILD_ARGS: | - # URLREF_JAX_TRITON=${{ fromJson(inputs.SOURCE_URLREFS).JAX_TRITON }} - # secrets: inherit - - # build-equinox: - # needs: build-jax - # uses: ./.github/workflows/_build.yaml - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # ARTIFACT_NAME: artifact-equinox-build - # BADGE_FILENAME: badge-equinox-build - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - # CONTAINER_NAME: equinox - # DOCKERFILE: .github/container/Dockerfile.equinox - # EXTRA_BUILD_ARGS: | - # URLREF_EQUINOX=${{ fromJson(inputs.SOURCE_URLREFS).EQUINOX }} - # secrets: inherit - - # build-maxtext: - # needs: build-jax - # uses: ./.github/workflows/_build.yaml - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # ARTIFACT_NAME: artifact-maxtext-build - # BADGE_FILENAME: badge-maxtext-build - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} 
- # CONTAINER_NAME: maxtext - # DOCKERFILE: .github/container/Dockerfile.maxtext - # EXTRA_BUILD_ARGS: | - # URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }} - # secrets: inherit - - # build-levanter: - # needs: [build-jax] - # uses: ./.github/workflows/_build.yaml - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # ARTIFACT_NAME: "artifact-levanter-build" - # BADGE_FILENAME: "badge-levanter-build" - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - # CONTAINER_NAME: levanter - # DOCKERFILE: .github/container/Dockerfile.levanter - # EXTRA_BUILD_ARGS: | - # URLREF_LEVANTER=${{ fromJson(inputs.SOURCE_URLREFS).LEVANTER }} - # URLREF_HALIAX=${{ fromJson(inputs.SOURCE_URLREFS).HALIAX }} - # secrets: inherit - - # build-upstream-t5x: - # needs: build-jax - # uses: ./.github/workflows/_build.yaml - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # ARTIFACT_NAME: "artifact-t5x-build" - # BADGE_FILENAME: "badge-t5x-build" - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - # CONTAINER_NAME: upstream-t5x - # DOCKERFILE: .github/container/Dockerfile.t5x - # EXTRA_BUILD_ARGS: | - # URLREF_T5X=${{ fromJson(inputs.SOURCE_URLREFS).T5X }} - # URLREF_AIRIO=${{ fromJson(inputs.SOURCE_URLREFS).AIRIO }} - # secrets: inherit - - # build-rosetta-t5x: - # needs: build-upstream-t5x - # uses: ./.github/workflows/_build_rosetta.yaml - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }} - # BASE_LIBRARY: t5x - # secrets: inherit - - # build-gemma: - # needs: build-jax - # uses: ./.github/workflows/_build.yaml - # if: inputs.ARCHITECTURE == 'amd64' # build only amd64 - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # ARTIFACT_NAME: artifact-gemma-build - # BADGE_FILENAME: badge-gemma-build - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - 
# BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - # CONTAINER_NAME: gemma - # DOCKERFILE: rosetta/Dockerfile.gemma - # DOCKER_CONTEXT: . - # EXTRA_BUILD_ARGS: | - # URLREF_GEMMA=${{ fromJson(inputs.SOURCE_URLREFS).GEMMA }} - # URLREF_BIG_VISION=${{ fromJson(inputs.SOURCE_URLREFS).BIG_VISION }} - # URLREF_COMMON_LOOP_UTILS=${{ fromJson(inputs.SOURCE_URLREFS).COMMON_LOOP_UTILS }} - # URLREF_FLAXFORMER=${{ fromJson(inputs.SOURCE_URLREFS).FLAXFORMER }} - # URLREF_PANOPTICAPI=${{ fromJson(inputs.SOURCE_URLREFS).PANOPTICAPI }} - # secrets: inherit + build-triton: + needs: build-jax + if: inputs.ARCHITECTURE == 'amd64' # Triton does not seem to support arm64 + uses: ./.github/workflows/_build.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: artifact-triton-build + BADGE_FILENAME: badge-triton-build + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + CONTAINER_NAME: triton + DOCKERFILE: .github/container/Dockerfile.triton + RUNNER_SIZE: large + EXTRA_BUILD_ARGS: | + URLREF_JAX_TRITON=${{ fromJson(inputs.SOURCE_URLREFS).JAX_TRITON }} + secrets: inherit + + build-equinox: + needs: build-jax + uses: ./.github/workflows/_build.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: artifact-equinox-build + BADGE_FILENAME: badge-equinox-build + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + CONTAINER_NAME: equinox + DOCKERFILE: .github/container/Dockerfile.equinox + EXTRA_BUILD_ARGS: | + URLREF_EQUINOX=${{ fromJson(inputs.SOURCE_URLREFS).EQUINOX }} + secrets: inherit + + build-maxtext: + needs: build-jax + uses: ./.github/workflows/_build.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: artifact-maxtext-build + BADGE_FILENAME: badge-maxtext-build + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + CONTAINER_NAME: maxtext + DOCKERFILE: 
.github/container/Dockerfile.maxtext + EXTRA_BUILD_ARGS: | + URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }} + secrets: inherit + + build-levanter: + needs: [build-jax] + uses: ./.github/workflows/_build.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: "artifact-levanter-build" + BADGE_FILENAME: "badge-levanter-build" + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + CONTAINER_NAME: levanter + DOCKERFILE: .github/container/Dockerfile.levanter + EXTRA_BUILD_ARGS: | + URLREF_LEVANTER=${{ fromJson(inputs.SOURCE_URLREFS).LEVANTER }} + URLREF_HALIAX=${{ fromJson(inputs.SOURCE_URLREFS).HALIAX }} + secrets: inherit + + build-upstream-t5x: + needs: build-jax + uses: ./.github/workflows/_build.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: "artifact-t5x-build" + BADGE_FILENAME: "badge-t5x-build" + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + CONTAINER_NAME: upstream-t5x + DOCKERFILE: .github/container/Dockerfile.t5x + EXTRA_BUILD_ARGS: | + URLREF_T5X=${{ fromJson(inputs.SOURCE_URLREFS).T5X }} + URLREF_AIRIO=${{ fromJson(inputs.SOURCE_URLREFS).AIRIO }} + secrets: inherit + + build-rosetta-t5x: + needs: build-upstream-t5x + uses: ./.github/workflows/_build_rosetta.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }} + BASE_LIBRARY: t5x + secrets: inherit + + build-gemma: + needs: build-jax + uses: ./.github/workflows/_build.yaml + if: inputs.ARCHITECTURE == 'amd64' # build only amd64 + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: artifact-gemma-build + BADGE_FILENAME: badge-gemma-build + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + CONTAINER_NAME: gemma + DOCKERFILE: rosetta/Dockerfile.gemma + DOCKER_CONTEXT: . 
+ EXTRA_BUILD_ARGS: | + URLREF_GEMMA=${{ fromJson(inputs.SOURCE_URLREFS).GEMMA }} + URLREF_BIG_VISION=${{ fromJson(inputs.SOURCE_URLREFS).BIG_VISION }} + URLREF_COMMON_LOOP_UTILS=${{ fromJson(inputs.SOURCE_URLREFS).COMMON_LOOP_UTILS }} + URLREF_FLAXFORMER=${{ fromJson(inputs.SOURCE_URLREFS).FLAXFORMER }} + URLREF_PANOPTICAPI=${{ fromJson(inputs.SOURCE_URLREFS).PANOPTICAPI }} + secrets: inherit build-axlearn: needs: build-jax @@ -194,14 +194,14 @@ jobs: if: ${{ !cancelled() }} needs: - build-base - # - build-jax - # - build-triton - # - build-equinox - # - build-maxtext - # - build-levanter - # - build-upstream-t5x - # - build-rosetta-t5x - # - build-gemma + - build-jax + - build-triton + - build-equinox + - build-maxtext + - build-levanter + - build-upstream-t5x + - build-rosetta-t5x + - build-gemma - build-axlearn outputs: TAGS: ${{ steps.collect-tags.outputs.TAGS }} @@ -213,22 +213,22 @@ jobs: [\ {"flavor": "base", "stage": "final", "priority": 800, "tag": "${{ needs.build-base.outputs.DOCKER_TAG }}"},\ {"flavor": "jax", "stage": "final", "priority": 1000, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}"},\ - # {"flavor": "triton", "stage": "final", "priority": 900, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }}"},\ - # {"flavor": "equinox", "stage": "final", "priority": 900, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }}"},\ - # {"flavor": "maxtext", "stage": "final", "priority": 900, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}"},\ - # {"flavor": "levanter", "stage": "final", "priority": 900, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }}"},\ - # {"flavor": "upstream-t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }}"},\ - # {"flavor": "t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }}"},\ - # {"flavor": "gemma", "stage": "final", "priority": 900, "tag": "${{ 
needs.build-gemma.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "triton", "stage": "final", "priority": 900, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "equinox", "stage": "final", "priority": 900, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "maxtext", "stage": "final", "priority": 900, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "levanter", "stage": "final", "priority": 900, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "upstream-t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "gemma", "stage": "final", "priority": 900, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "axlearn", "stage": "final", "priority": 900, "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "jax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}"},\ - # {"flavor": "triton", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_MEALKIT }}"},\ - # {"flavor": "equinox", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_MEALKIT }}"},\ - # {"flavor": "maxtext", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_MEALKIT }}"},\ - # {"flavor": "levanter", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_MEALKIT }}"},\ - # {"flavor": "upstream-t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ - # {"flavor": "t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ - # {"flavor": "gemma", "stage": "mealkit", "priority": 500, 
"tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "triton", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "equinox", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "maxtext", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "levanter", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "upstream-t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "gemma", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_MEALKIT }}"},\ {"flavor": "axlearn", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_MEALKIT }}"},\ {}\ @@ -238,416 +238,416 @@ jobs: echo "TAGS=${TAGS}" >> $GITHUB_OUTPUT - # test-distribution: - # runs-on: ubuntu-22.04 - # strategy: - # matrix: - # TEST_SCRIPT: - # - extra-only-distribution.sh - # - mirror-only-distribution.sh - # - upstream-only-distribution.sh - # - local-patch-distribution.sh - # fail-fast: false - # steps: - # - name: Print environment variables - # run: env - # - name: Set git login for tests - # run: | - # git config --global user.email "jax@nvidia.com" - # git config --global user.name "JAX-Toolbox CI" - # - name: Check out the repository under ${GITHUB_WORKSPACE} - # uses: actions/checkout@v4 - # - name: Run integration test ${{ matrix.TEST_SCRIPT }} - # run: bash rosetta/tests/${{ matrix.TEST_SCRIPT }} - - # test-jax: - # needs: build-jax - # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # uses: ./.github/workflows/_test_unit.yaml - # with: - # TEST_NAME: jax - 
# EXECUTE: | - # docker run -i --shm-size=1g --gpus all \ - # ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - # bash <<"EOF" |& tee test-backend-independent.log - # test-jax.sh -b backend-independent - # EOF - # docker run -i --shm-size=1g --gpus all \ - # ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - # bash <<"EOF" |& tee tee test-gpu.log - # nvidia-cuda-mps-control -d - # test-jax.sh -b gpu - # EOF - # STATISTICS_SCRIPT: | - # errors=$(cat test-*.log | grep -c 'ERROR:' || true) - # failed_tests=$(cat test-*.log | grep -c 'FAILED in' || true) - # passed_tests=$(cat test-*.log | grep -c 'PASSED in' || true) - # total_tests=$((failed_tests + passed_tests)) - # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - # ARTIFACTS: | - # test-backend-independent.log - # test-gpu.log - # secrets: inherit - - # test-nsys-jax: - # needs: build-jax - # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # uses: ./.github/workflows/_test_unit.yaml - # with: - # TEST_NAME: nsys-jax - # EXECUTE: | - # set -o pipefail - # num_tests=0 - # num_failures=0 - # # Run the pytest-driven tests; failure is explicitly handled below so set +e to - # # avoid an early abort here. 
- # set +e - # docker run -i --shm-size=1g --gpus all \ - # -v $PWD:/opt/output \ - # ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - # bash <<"EOF" |& tee test-nsys-jax.log - # # nsys-jax is already installed, this is just adding the test dependencies - # pip install pytest-reportlog nsys-jax[test] - # # abuse knowledge that nsys-jax is installed editable, so the tests exist - # test_path=$(python -c 'import importlib.resources; print(importlib.resources.files("nsys_jax").joinpath("..", "tests").resolve())') - # pytest --report-log=/opt/output/pytest-report.jsonl "${test_path}" - # EOF - # set -e - # GPUS_PER_NODE=$(nvidia-smi -L | grep -c '^GPU') - # for mode in 1-process 2-process process-per-gpu; do - # DOCKER="docker run --shm-size=1g --gpus all --env XLA_FLAGS=--xla_gpu_enable_command_buffer= --env XLA_PYTHON_CLIENT_MEM_FRACTION=0.9 -v ${PWD}:/opt/output ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}" - # if [[ "${mode}" == "1-process" ]]; then - # PROCESS_COUNT=1 - # ARGS="" - # elif [[ "${mode}" == "2-process" ]]; then - # # Use two processes with GPUS_PER_NODE/2 GPUs per process in the hope that - # # this will flush out more bugs than process-per-node or process-per-GPU. 
- # PROCESS_COUNT=2 - # ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process $((GPUS_PER_NODE/2)) --distributed" - # else - # PROCESS_COUNT=${GPUS_PER_NODE} - # ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process 1 --distributed" - # fi - # for collection in full partial; do - # NSYS_JAX="nsys-jax" - # if [[ "${mode}" == "1-process" ]]; then - # # We will not run nsys-jax-combine, so run analyses eagerly - # NSYS_JAX+=" --nsys-jax-analysis communication --nsys-jax-analysis summary" - # fi - # NSYS_JAX+=" --output=/opt/output/${mode}-${collection}-execution-%q{RANK}" - # if [[ "${collection}" == "partial" ]]; then - # NSYS_JAX+=" --capture-range=cudaProfilerApi --capture-range-end=stop" - # # nvbug/4801401 - # NSYS_JAX+=" --sample=none" - # fi - # set +e - # ${DOCKER} parallel-launch RANK ${PROCESS_COUNT} ${NSYS_JAX} \ - # -- jax-nccl-test ${ARGS} |& tee ${mode}-${collection}-execution.log - # num_failures=$((num_failures + ($? 
!= 0))) - # set -e - # num_tests=$((num_tests + 1)) - # done - # if [[ "${mode}" != "1-process" ]]; then - # # Run nsys-jax-combine - # NSYS_JAX_COMBINE="nsys-jax-combine --analysis communication --analysis summary --output=/opt/output/${mode}-${collection}-execution.zip" - # for (( i=0; i> $GITHUB_ENV - # echo "NSYS_JAX_FAIL_COUNT=${num_failures}" >> $GITHUB_ENV - # exit $num_failures - # STATISTICS_SCRIPT: | - # summary_line=$(tail -n1 test-nsys-jax.log) - # num_errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - # passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) - # failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) - # total_tests=$(( NSYS_JAX_TEST_COUNT + passed_tests + failed_tests )) - # num_passed=$(( passed_tests + NSYS_JAX_TEST_COUNT - NSYS_JAX_FAIL_COUNT )) - # num_failed=$(( failed_tests + NSYS_JAX_FAIL_COUNT )) - # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - # echo "ERRORS=${num_errors}" >> $GITHUB_OUTPUT - # echo "PASSED_TESTS=${num_passed}" >> $GITHUB_OUTPUT - # echo "FAILED_TESTS=${num_failed}" >> $GITHUB_OUTPUT - # ARTIFACTS: | - # # pytest-driven part - # test-nsys-jax.log - # pytest-report.jsonl - # # nsys-jax logfiles - # *process-*-execution.log - # # nsys-jax output for the case that doesn't use nsys-jax-combine - # 1-process-*-execution-0.zip - # # nsys-jax-combine output/logfiles - # *process*-*-execution.zip - # *-execution-combine.log - # secrets: inherit + test-distribution: + runs-on: ubuntu-22.04 + strategy: + matrix: + TEST_SCRIPT: + - extra-only-distribution.sh + - mirror-only-distribution.sh + - upstream-only-distribution.sh + - local-patch-distribution.sh + fail-fast: false + steps: + - name: Print environment variables + run: env + - name: Set git login for tests + run: 
| + git config --global user.email "jax@nvidia.com" + git config --global user.name "JAX-Toolbox CI" + - name: Check out the repository under ${GITHUB_WORKSPACE} + uses: actions/checkout@v4 + - name: Run integration test ${{ matrix.TEST_SCRIPT }} + run: bash rosetta/tests/${{ matrix.TEST_SCRIPT }} + + test-jax: + needs: build-jax + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + uses: ./.github/workflows/_test_unit.yaml + with: + TEST_NAME: jax + EXECUTE: | + docker run -i --shm-size=1g --gpus all \ + ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ + bash <<"EOF" |& tee test-backend-independent.log + test-jax.sh -b backend-independent + EOF + docker run -i --shm-size=1g --gpus all \ + ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ + bash <<"EOF" |& tee test-gpu.log + nvidia-cuda-mps-control -d + test-jax.sh -b gpu + EOF + STATISTICS_SCRIPT: | + errors=$(cat test-*.log | grep -c 'ERROR:' || true) + failed_tests=$(cat test-*.log | grep -c 'FAILED in' || true) + passed_tests=$(cat test-*.log | grep -c 'PASSED in' || true) + total_tests=$((failed_tests + passed_tests)) + echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + ARTIFACTS: | + test-backend-independent.log + test-gpu.log + secrets: inherit + + test-nsys-jax: + needs: build-jax + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + uses: ./.github/workflows/_test_unit.yaml + with: + TEST_NAME: nsys-jax + EXECUTE: | + set -o pipefail + num_tests=0 + num_failures=0 + # Run the pytest-driven tests; failure is explicitly handled below so set +e to + # avoid an early abort here.
+ set +e + docker run -i --shm-size=1g --gpus all \ + -v $PWD:/opt/output \ + ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ + bash <<"EOF" |& tee test-nsys-jax.log + # nsys-jax is already installed, this is just adding the test dependencies + pip install pytest-reportlog nsys-jax[test] + # abuse knowledge that nsys-jax is installed editable, so the tests exist + test_path=$(python -c 'import importlib.resources; print(importlib.resources.files("nsys_jax").joinpath("..", "tests").resolve())') + pytest --report-log=/opt/output/pytest-report.jsonl "${test_path}" + EOF + set -e + GPUS_PER_NODE=$(nvidia-smi -L | grep -c '^GPU') + for mode in 1-process 2-process process-per-gpu; do + DOCKER="docker run --shm-size=1g --gpus all --env XLA_FLAGS=--xla_gpu_enable_command_buffer= --env XLA_PYTHON_CLIENT_MEM_FRACTION=0.9 -v ${PWD}:/opt/output ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}" + if [[ "${mode}" == "1-process" ]]; then + PROCESS_COUNT=1 + ARGS="" + elif [[ "${mode}" == "2-process" ]]; then + # Use two processes with GPUS_PER_NODE/2 GPUs per process in the hope that + # this will flush out more bugs than process-per-node or process-per-GPU. 
+ PROCESS_COUNT=2 + ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process $((GPUS_PER_NODE/2)) --distributed" + else + PROCESS_COUNT=${GPUS_PER_NODE} + ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process 1 --distributed" + fi + for collection in full partial; do + NSYS_JAX="nsys-jax" + if [[ "${mode}" == "1-process" ]]; then + # We will not run nsys-jax-combine, so run analyses eagerly + NSYS_JAX+=" --nsys-jax-analysis communication --nsys-jax-analysis summary" + fi + NSYS_JAX+=" --output=/opt/output/${mode}-${collection}-execution-%q{RANK}" + if [[ "${collection}" == "partial" ]]; then + NSYS_JAX+=" --capture-range=cudaProfilerApi --capture-range-end=stop" + # nvbug/4801401 + NSYS_JAX+=" --sample=none" + fi + set +e + ${DOCKER} parallel-launch RANK ${PROCESS_COUNT} ${NSYS_JAX} \ + -- jax-nccl-test ${ARGS} |& tee ${mode}-${collection}-execution.log + num_failures=$((num_failures + ($? 
!= 0))) + set -e + num_tests=$((num_tests + 1)) + done + if [[ "${mode}" != "1-process" ]]; then + # Run nsys-jax-combine + NSYS_JAX_COMBINE="nsys-jax-combine --analysis communication --analysis summary --output=/opt/output/${mode}-${collection}-execution.zip" + for (( i=0; i> $GITHUB_ENV + echo "NSYS_JAX_FAIL_COUNT=${num_failures}" >> $GITHUB_ENV + exit $num_failures + STATISTICS_SCRIPT: | + summary_line=$(tail -n1 test-nsys-jax.log) + num_errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) + failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) + total_tests=$(( NSYS_JAX_TEST_COUNT + passed_tests + failed_tests )) + num_passed=$(( passed_tests + NSYS_JAX_TEST_COUNT - NSYS_JAX_FAIL_COUNT )) + num_failed=$(( failed_tests + NSYS_JAX_FAIL_COUNT )) + echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + echo "ERRORS=${num_errors}" >> $GITHUB_OUTPUT + echo "PASSED_TESTS=${num_passed}" >> $GITHUB_OUTPUT + echo "FAILED_TESTS=${num_failed}" >> $GITHUB_OUTPUT + ARTIFACTS: | + # pytest-driven part + test-nsys-jax.log + pytest-report.jsonl + # nsys-jax logfiles + *process-*-execution.log + # nsys-jax output for the case that doesn't use nsys-jax-combine + 1-process-*-execution-0.zip + # nsys-jax-combine output/logfiles + *process*-*-execution.zip + *-execution-combine.log + secrets: inherit #test-nsys-jax generates several fresh .zip archive outputs by running nsys-jax with real GPU hardware; this test #runs on a regular GitHub Actions runner and checks that offline post-processing works in an environment that does #not already have nsys-jax installed - # test-nsys-jax-archive: - # needs: test-nsys-jax - # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # strategy: - # 
matrix: - # os: [ubuntu-22.04, ubuntu-24.04, macOS-latest] - # runs-on: ${{ matrix.os }} - # steps: - # - name: Download nsys-jax output .zip files - # uses: actions/download-artifact@v4 - # with: - # name: nsys-jax-unit-test-A100 - # - name: Extract archives and execute install scripts - # run: | - # pip install virtualenv # for install.sh - # for zip in $(ls *.zip); do - # ZIP="${PWD}/${zip}" - # pushd $(mktemp -d) - # unzip "${ZIP}" - # ls -l - # # TODO: verify this isn't needed, or make sure it isn't needed - # chmod 755 install.sh - # # Run the notebook with IPython, not Jupyter Lab, so it exits and prints something informative to stdout - # # Skip executing Jupyter lab - # NSYS_JAX_JUPYTER_EXECUTE_NOT_LAB=1 ./install.sh - # popd - # done - - # test-nsys-jax-eks: - # needs: build-jax - # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # runs-on: eks - # env: - # JAX_DOCKER_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} - # JOB_NAME: ${{ github.run_id }}-nsys-jax - # POSTPROCESS_JOB_NAME: ${{ github.run_id }}-nsys-jax-postprocess - # TOKEN_NAME: ${{ github.run_id }}-nsys-jax-token - # steps: - # - name: Check out the repository - # uses: actions/checkout@v4 - # - name: GHCR login - # uses: ./.github/actions/ghcr-login - # with: - # docker-username: ${{ github.repository_owner }} - # docker-password: ${{ secrets.GITHUB_TOKEN}} - # token-name: ${{ env.TOKEN_NAME }} - # - name: Configure Kubernetes job - # run: | - # yq -i ea 'select(di == 0).spec.selector.job-name = strenv(JOB_NAME) - # | select(di == 1).metadata.name = strenv(JOB_NAME) - # | select(di == 1).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) - # | select(di == 1).spec.template.spec.containers[0].image = strenv(JAX_DOCKER_IMAGE) - # | select(di == 1).spec.template.spec.containers[0].env[0].value = strenv(JOB_NAME)' \ - # .github/eks-workflow-files/job.yml - # git diff .github/eks-workflow-files/job.yml - # - name: Submit Kubernetes job - # uses: 
./.github/actions/submit-k8s-job - # with: - # job-config-file: .github/eks-workflow-files/job.yml - # job-name: ${{ env.JOB_NAME }} - # - name: Delete eks job - # uses: ./.github/actions/delete-k8s-job - # if: ${{ always() }} - # with: - # job-name: ${{ env.JOB_NAME }} - # - name: Configure post-processing job - # run: | - # export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip" - # yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME) - # | .spec.template.spec.containers[].image = strenv(JAX_DOCKER_IMAGE) - # | .spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) - # | .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \ - # .github/eks-workflow-files/post-process-job.yml - # git diff .github/eks-workflow-files/post-process-job.yml - # - name: Submit post process k8s job - # uses: ./.github/actions/submit-k8s-job - # with: - # job-config-file: .github/eks-workflow-files/post-process-job.yml - # job-name: ${{ env.POSTPROCESS_JOB_NAME }} - # - name: Delete post process k8s job - # uses: ./.github/actions/delete-k8s-job - # with: - # job-name: ${{ env.POSTPROCESS_JOB_NAME }} - # - name: Delete GitHub Container Registry token - # uses: ./.github/actions/delete-ghcr-token - # if: ${{ always() }} - # with: - # token-name: ${{ env.TOKEN_NAME }} - - # test-equinox: - # needs: build-equinox - # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # uses: ./.github/workflows/_test_unit.yaml - # with: - # IMAGE: ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} - # TEST_NAME: equinox - # EXECUTE: | - # docker run --shm-size=1g --gpus all ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} \ - # bash -exc -o pipefail \ - # 'pushd /opt/equinox/tests && pip install -r requirements.txt && pytest .' 
| tee test-equinox.log - # STATISTICS_SCRIPT: | - # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - # failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') - # passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') - # total_tests=$((failed_tests + passed_tests)) - # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - # ARTIFACTS: | - # test-equinox.log - # secrets: inherit - - # test-te-multigpu: - # needs: build-upstream-pax - # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # uses: ./.github/workflows/_test_te.yaml - # with: - # TE_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} - # secrets: inherit - - # test-upstream-t5x: - # needs: build-upstream-t5x - # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # uses: ./.github/workflows/_test_upstream_t5x.yaml - # with: - # T5X_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }} - # secrets: inherit - - # test-rosetta-t5x: - # needs: build-rosetta-t5x - # if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 - # uses: ./.github/workflows/_test_t5x_rosetta.yaml - # with: - # T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }} - # secrets: inherit - - # test-triton: - # needs: build-triton - # if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 - # uses: ./.github/workflows/_test_unit.yaml - # with: - # TEST_NAME: triton - # EXECUTE: | - # docker run -i --shm-size=1g --gpus all --volume $PWD:/output \ - # ${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }} \ - # bash <<"EOF" |& tee test-triton.log - # # autotuner tests from jax-triton now hit a triton code path that uses utilities from pytorch; this relies on - # # actually having a CUDA 
backend for pytoch - # pip install --no-deps torch - # python /opt/jax-triton/tests/triton_call_test.py --xml_output_file /output/triton_test.xml - # EOF - # STATISTICS_SCRIPT: | - # curl -L -o yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg --print-architecture) && chmod 777 yq; - # total_tests=$(./yq '.testsuites."+@tests"' triton_test.xml) - # errors=$(./yq '.testsuites."+@errors"' triton_test.xml) - # failed_tests=$(./yq '.testsuites."+@failures"' triton_test.xml) - # passed_tests=$((total_tests - errors - failed_tests)) - # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - # ARTIFACTS: | - # test-triton.log - # secrets: inherit - - # test-levanter: - # needs: build-levanter - # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # uses: ./.github/workflows/_test_unit.yaml - # with: - # TEST_NAME: levanter - # EXECUTE: | - # docker run -i --gpus all --shm-size=1g \ - # ${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }} \ - # bash <<"EOF" |& tee test-levanter.log - # pip install flake8 pytest soundfile librosa - # PYTHONPATH=/opt/levanter/tests:$PYTHONPATH pytest /opt/levanter/tests -m "not entry and not slow and not ray" - # EOF - # STATISTICS_SCRIPT: | - # summary_line=$(tail -n1 test-levanter.log) - # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - # failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') - # passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') - # total_tests=$((failed_tests + passed_tests)) - # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - # echo "FAILED_TESTS=${failed_tests}" >> 
$GITHUB_OUTPUT - # ARTIFACTS: | - # test-levanter.log - # secrets: inherit - - # test-te: - # needs: build-upstream-pax - # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # uses: ./.github/workflows/_test_unit.yaml - # with: - # TEST_NAME: te - # EXECUTE: | - # docker run -i --gpus all --shm-size=1g -v $PWD:/log \ - # ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} \ - # bash <<"EOF" |& tee test-te.log - # pip install pytest-reportlog - # pytest --report-log=log/pytest-report.jsonl ${SRC_PATH_TRANSFORMER_ENGINE}/tests/jax - # EOF - # STATISTICS_SCRIPT: | - # summary_line=$(tail -n1 test-te.log) - # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - # passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) - # failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) - # total_tests=$((failed_tests + passed_tests)) - # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - # TIMEOUT_MINUTES: 120 - # ARTIFACTS: | - # test-te.log - # pytest-report.jsonl - # secrets: inherit - - # test-gemma: - # needs: build-gemma - # uses: ./.github/workflows/_test_unit.yaml - # if: inputs.ARCHITECTURE == 'amd64' - # with: - # TEST_NAME: gemma - # EXECUTE: | - # docker run --shm-size=1g --gpus all ${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }} \ - # bash -ec \ - # "cd /opt/gemma && pip install -e .[dev] && pytest ." 
| tee test-gemma.log - # STATISTICS_SCRIPT: | - # summary_line=$(tail -n1 test-gemma.log) - # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - # failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') - # passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') - # total_tests=$((failed_tests + passed_tests)) - # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - # ARTIFACTS: | - # test-gemma.log - # secrets: inherit - - # test-maxtext: - # needs: build-maxtext - # if: inputs.ARCHITECTURE == 'amd64' # no arm64 gpu runners - # uses: ./.github/workflows/_test_maxtext.yaml - # with: - # MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }} - # secrets: inherit + test-nsys-jax-archive: + needs: test-nsys-jax + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + strategy: + matrix: + os: [ubuntu-22.04, ubuntu-24.04, macOS-latest] + runs-on: ${{ matrix.os }} + steps: + - name: Download nsys-jax output .zip files + uses: actions/download-artifact@v4 + with: + name: nsys-jax-unit-test-A100 + - name: Extract archives and execute install scripts + run: | + pip install virtualenv # for install.sh + for zip in $(ls *.zip); do + ZIP="${PWD}/${zip}" + pushd $(mktemp -d) + unzip "${ZIP}" + ls -l + # TODO: verify this isn't needed, or make sure it isn't needed + chmod 755 install.sh + # Run the notebook with IPython, not Jupyter Lab, so it exits and prints something informative to stdout + # Skip executing Jupyter lab + NSYS_JAX_JUPYTER_EXECUTE_NOT_LAB=1 ./install.sh + popd + done + + test-nsys-jax-eks: + needs: build-jax + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + runs-on: eks + env: + JAX_DOCKER_IMAGE: ${{ 
needs.build-jax.outputs.DOCKER_TAG_FINAL }} + JOB_NAME: ${{ github.run_id }}-nsys-jax + POSTPROCESS_JOB_NAME: ${{ github.run_id }}-nsys-jax-postprocess + TOKEN_NAME: ${{ github.run_id }}-nsys-jax-token + steps: + - name: Check out the repository + uses: actions/checkout@v4 + - name: GHCR login + uses: ./.github/actions/ghcr-login + with: + docker-username: ${{ github.repository_owner }} + docker-password: ${{ secrets.GITHUB_TOKEN}} + token-name: ${{ env.TOKEN_NAME }} + - name: Configure Kubernetes job + run: | + yq -i ea 'select(di == 0).spec.selector.job-name = strenv(JOB_NAME) + | select(di == 1).metadata.name = strenv(JOB_NAME) + | select(di == 1).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) + | select(di == 1).spec.template.spec.containers[0].image = strenv(JAX_DOCKER_IMAGE) + | select(di == 1).spec.template.spec.containers[0].env[0].value = strenv(JOB_NAME)' \ + .github/eks-workflow-files/job.yml + git diff .github/eks-workflow-files/job.yml + - name: Submit Kubernetes job + uses: ./.github/actions/submit-k8s-job + with: + job-config-file: .github/eks-workflow-files/job.yml + job-name: ${{ env.JOB_NAME }} + - name: Delete eks job + uses: ./.github/actions/delete-k8s-job + if: ${{ always() }} + with: + job-name: ${{ env.JOB_NAME }} + - name: Configure post-processing job + run: | + export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip" + yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME) + | .spec.template.spec.containers[].image = strenv(JAX_DOCKER_IMAGE) + | .spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) + | .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \ + .github/eks-workflow-files/post-process-job.yml + git diff .github/eks-workflow-files/post-process-job.yml + - name: Submit post process k8s job + uses: ./.github/actions/submit-k8s-job + with: + job-config-file: .github/eks-workflow-files/post-process-job.yml + job-name: ${{ env.POSTPROCESS_JOB_NAME }} + - name: Delete post process k8s job 
+ uses: ./.github/actions/delete-k8s-job + with: + job-name: ${{ env.POSTPROCESS_JOB_NAME }} + - name: Delete GitHub Container Registry token + uses: ./.github/actions/delete-ghcr-token + if: ${{ always() }} + with: + token-name: ${{ env.TOKEN_NAME }} + + test-equinox: + needs: build-equinox + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + uses: ./.github/workflows/_test_unit.yaml + with: + IMAGE: ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} + TEST_NAME: equinox + EXECUTE: | + docker run --shm-size=1g --gpus all ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} \ + bash -exc -o pipefail \ + 'pushd /opt/equinox/tests && pip install -r requirements.txt && pytest .' | tee test-equinox.log + STATISTICS_SCRIPT: | + errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') + passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') + total_tests=$((failed_tests + passed_tests)) + echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + ARTIFACTS: | + test-equinox.log + secrets: inherit + + test-te-multigpu: + needs: build-upstream-pax + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + uses: ./.github/workflows/_test_te.yaml + with: + TE_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} + secrets: inherit + + test-upstream-t5x: + needs: build-upstream-t5x + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + uses: ./.github/workflows/_test_upstream_t5x.yaml + with: + T5X_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }} + secrets: inherit + + test-rosetta-t5x: + needs: build-rosetta-t5x + if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 + uses: 
./.github/workflows/_test_t5x_rosetta.yaml + with: + T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }} + secrets: inherit + + test-triton: + needs: build-triton + if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 + uses: ./.github/workflows/_test_unit.yaml + with: + TEST_NAME: triton + EXECUTE: | + docker run -i --shm-size=1g --gpus all --volume $PWD:/output \ + ${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }} \ + bash <<"EOF" |& tee test-triton.log + # autotuner tests from jax-triton now hit a triton code path that uses utilities from pytorch; this relies on + # actually having a CUDA backend for pytoch + pip install --no-deps torch + python /opt/jax-triton/tests/triton_call_test.py --xml_output_file /output/triton_test.xml + EOF + STATISTICS_SCRIPT: | + curl -L -o yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg --print-architecture) && chmod 777 yq; + total_tests=$(./yq '.testsuites."+@tests"' triton_test.xml) + errors=$(./yq '.testsuites."+@errors"' triton_test.xml) + failed_tests=$(./yq '.testsuites."+@failures"' triton_test.xml) + passed_tests=$((total_tests - errors - failed_tests)) + echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + ARTIFACTS: | + test-triton.log + secrets: inherit + + test-levanter: + needs: build-levanter + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + uses: ./.github/workflows/_test_unit.yaml + with: + TEST_NAME: levanter + EXECUTE: | + docker run -i --gpus all --shm-size=1g \ + ${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }} \ + bash <<"EOF" |& tee test-levanter.log + pip install flake8 pytest soundfile librosa + PYTHONPATH=/opt/levanter/tests:$PYTHONPATH pytest /opt/levanter/tests -m "not entry and not slow and not ray" + EOF + STATISTICS_SCRIPT: | + summary_line=$(tail -n1 test-levanter.log) + errors=$(echo 
$summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') + passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') + total_tests=$((failed_tests + passed_tests)) + echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + ARTIFACTS: | + test-levanter.log + secrets: inherit + + test-te: + needs: build-upstream-pax + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + uses: ./.github/workflows/_test_unit.yaml + with: + TEST_NAME: te + EXECUTE: | + docker run -i --gpus all --shm-size=1g -v $PWD:/log \ + ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} \ + bash <<"EOF" |& tee test-te.log + pip install pytest-reportlog + pytest --report-log=log/pytest-report.jsonl ${SRC_PATH_TRANSFORMER_ENGINE}/tests/jax + EOF + STATISTICS_SCRIPT: | + summary_line=$(tail -n1 test-te.log) + errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) + failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) + total_tests=$((failed_tests + passed_tests)) + echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + TIMEOUT_MINUTES: 120 + ARTIFACTS: | + test-te.log + pytest-report.jsonl + secrets: inherit + + test-gemma: + needs: build-gemma + uses: ./.github/workflows/_test_unit.yaml + if: inputs.ARCHITECTURE == 
'amd64' + with: + TEST_NAME: gemma + EXECUTE: | + docker run --shm-size=1g --gpus all ${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }} \ + bash -ec \ + "cd /opt/gemma && pip install -e .[dev] && pytest ." | tee test-gemma.log + STATISTICS_SCRIPT: | + summary_line=$(tail -n1 test-gemma.log) + errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') + passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') + total_tests=$((failed_tests + passed_tests)) + echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + ARTIFACTS: | + test-gemma.log + secrets: inherit + + test-maxtext: + needs: build-maxtext + if: inputs.ARCHITECTURE == 'amd64' # no arm64 gpu runners + uses: ./.github/workflows/_test_maxtext.yaml + with: + MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }} + secrets: inherit test-axlearn-eks: needs: build-axlearn diff --git a/.github/workflows/nccl-k8s.yaml b/.github/workflows/nccl-k8s.yaml index 321fd861f..6f39ebe0b 100644 --- a/.github/workflows/nccl-k8s.yaml +++ b/.github/workflows/nccl-k8s.yaml @@ -103,7 +103,7 @@ jobs: job-config-file: .github/eks-workflow-files/mpi-nccl-test.yml job-name: ${{ steps.var.outputs.LAUNCHER_NAME }} - name: Retrieve Kubernetes job status - shell: bash -exo pipefail + shell: bash -exo pipefail {0} run: | LAUNCHER_NAME="${{ steps.var.outputs.LAUNCHER_NAME }}" while readarray -d : -t status < <(kubectl get job/${LAUNCHER_NAME} -o 'jsonpath={.status.failed}:{.status.succeeded}'); do From 8a9de05af5b072decd3b41012a35c40acad49486 Mon Sep 17 00:00:00 2001 From: Steboss Date: Tue, 18 Feb 2025 13:50:31 +0000 Subject: [PATCH 43/89] fix test --- .github/workflows/_ci.yaml | 120 
++++++++++++++++++------------------- 1 file changed, 60 insertions(+), 60 deletions(-) diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 1439f515c..d4837449b 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -174,7 +174,7 @@ jobs: URLREF_COMMON_LOOP_UTILS=${{ fromJson(inputs.SOURCE_URLREFS).COMMON_LOOP_UTILS }} URLREF_FLAXFORMER=${{ fromJson(inputs.SOURCE_URLREFS).FLAXFORMER }} URLREF_PANOPTICAPI=${{ fromJson(inputs.SOURCE_URLREFS).PANOPTICAPI }} - secrets: inherit + secrets: inherit build-axlearn: needs: build-jax @@ -484,37 +484,37 @@ jobs: with: token-name: ${{ env.TOKEN_NAME }} - test-equinox: - needs: build-equinox - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_unit.yaml - with: - IMAGE: ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} - TEST_NAME: equinox - EXECUTE: | - docker run --shm-size=1g --gpus all ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} \ - bash -exc -o pipefail \ - 'pushd /opt/equinox/tests && pip install -r requirements.txt && pytest .' 
| tee test-equinox.log - STATISTICS_SCRIPT: | - errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') - passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') - total_tests=$((failed_tests + passed_tests)) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - ARTIFACTS: | - test-equinox.log - secrets: inherit - - test-te-multigpu: - needs: build-upstream-pax - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_te.yaml - with: - TE_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} - secrets: inherit + # test-equinox: + # needs: build-equinox + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # uses: ./.github/workflows/_test_unit.yaml + # with: + # IMAGE: ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} + # TEST_NAME: equinox + # EXECUTE: | + # docker run --shm-size=1g --gpus all ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} \ + # bash -exc -o pipefail \ + # 'pushd /opt/equinox/tests && pip install -r requirements.txt && pytest .' 
| tee test-equinox.log + # STATISTICS_SCRIPT: | + # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + # failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') + # passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') + # total_tests=$((failed_tests + passed_tests)) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # ARTIFACTS: | + # test-equinox.log + # secrets: inherit + + # test-te-multigpu: + # needs: build-upstream-pax + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # uses: ./.github/workflows/_test_te.yaml + # with: + # TE_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit test-upstream-t5x: needs: build-upstream-t5x @@ -588,34 +588,34 @@ jobs: test-levanter.log secrets: inherit - test-te: - needs: build-upstream-pax - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_unit.yaml - with: - TEST_NAME: te - EXECUTE: | - docker run -i --gpus all --shm-size=1g -v $PWD:/log \ - ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee test-te.log - pip install pytest-reportlog - pytest --report-log=log/pytest-report.jsonl ${SRC_PATH_TRANSFORMER_ENGINE}/tests/jax - EOF - STATISTICS_SCRIPT: | - summary_line=$(tail -n1 test-te.log) - errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) - failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) - 
total_tests=$((failed_tests + passed_tests)) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - TIMEOUT_MINUTES: 120 - ARTIFACTS: | - test-te.log - pytest-report.jsonl - secrets: inherit + # test-te: + # needs: build-upstream-pax + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # uses: ./.github/workflows/_test_unit.yaml + # with: + # TEST_NAME: te + # EXECUTE: | + # docker run -i --gpus all --shm-size=1g -v $PWD:/log \ + # ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-te.log + # pip install pytest-reportlog + # pytest --report-log=log/pytest-report.jsonl ${SRC_PATH_TRANSFORMER_ENGINE}/tests/jax + # EOF + # STATISTICS_SCRIPT: | + # summary_line=$(tail -n1 test-te.log) + # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + # passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) + # failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) + # total_tests=$((failed_tests + passed_tests)) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # TIMEOUT_MINUTES: 120 + # ARTIFACTS: | + # test-te.log + # pytest-report.jsonl + # secrets: inherit test-gemma: needs: build-gemma From 5a0bb04d71be7f1cad97cf8821ebda5fb7424ed7 Mon Sep 17 00:00:00 2001 From: Steboss Date: Tue, 18 Feb 2025 13:52:15 +0000 Subject: [PATCH 44/89] remove always --- .github/actions/delete-ghcr-token/action.yml | 1 - 1 file changed, 1 deletion(-) diff --git 
a/.github/actions/delete-ghcr-token/action.yml b/.github/actions/delete-ghcr-token/action.yml index 0d90dd168..1a246bb8f 100644 --- a/.github/actions/delete-ghcr-token/action.yml +++ b/.github/actions/delete-ghcr-token/action.yml @@ -11,6 +11,5 @@ runs: steps: - name: Delete GitHub Container Registry token shell: bash - if: always() run: | kubectl delete secret ${{ inputs.token-name }} From d7fb8c3e70444f8424289e328db6bf691e0448d9 Mon Sep 17 00:00:00 2001 From: Steboss Date: Tue, 18 Feb 2025 14:11:18 +0000 Subject: [PATCH 45/89] indentention error --- .github/workflows/_ci.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index d4837449b..2ebdf149f 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -174,7 +174,7 @@ jobs: URLREF_COMMON_LOOP_UTILS=${{ fromJson(inputs.SOURCE_URLREFS).COMMON_LOOP_UTILS }} URLREF_FLAXFORMER=${{ fromJson(inputs.SOURCE_URLREFS).FLAXFORMER }} URLREF_PANOPTICAPI=${{ fromJson(inputs.SOURCE_URLREFS).PANOPTICAPI }} - secrets: inherit + secrets: inherit build-axlearn: needs: build-jax From d3500bdf9c592cd148a162b8095412dd23f2b681 Mon Sep 17 00:00:00 2001 From: Steboss Date: Tue, 18 Feb 2025 16:48:14 +0000 Subject: [PATCH 46/89] fix runner size --- .github/workflows/_ci.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 2ebdf149f..513e74306 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -187,6 +187,7 @@ jobs: BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} CONTAINER_NAME: axlearn DOCKERFILE: .github/container/Dockerfile.axlearn + RUNNER_SIZE: large secrets: inherit collect-docker-tags: From 569fb5f81627c1a844fef570a5965f79e245eebb Mon Sep 17 00:00:00 2001 From: Steboss Date: Thu, 20 Feb 2025 11:31:01 +0000 Subject: [PATCH 47/89] try with post step --- .github/actions/delete-ghcr-token/action.yml | 9 ++-- 
.github/actions/delete-k8s-job/action.yml | 30 +++++++------ .github/actions/with-post-step/action.yml | 42 ++++++++++++++++++ .github/actions/with-post-step/main.js | 46 ++++++++++++++++++++ 4 files changed, 110 insertions(+), 17 deletions(-) create mode 100644 .github/actions/with-post-step/action.yml create mode 100644 .github/actions/with-post-step/main.js diff --git a/.github/actions/delete-ghcr-token/action.yml b/.github/actions/delete-ghcr-token/action.yml index 1a246bb8f..c6069908b 100644 --- a/.github/actions/delete-ghcr-token/action.yml +++ b/.github/actions/delete-ghcr-token/action.yml @@ -10,6 +10,9 @@ runs: using: "composite" steps: - name: Delete GitHub Container Registry token - shell: bash - run: | - kubectl delete secret ${{ inputs.token-name }} + uses: ./.github/actions/with-post-step + with: + main: | + echo "Main post step action: no action required" + post: | + kubectl delete secret ${{ inputs.token-name }} diff --git a/.github/actions/delete-k8s-job/action.yml b/.github/actions/delete-k8s-job/action.yml index 15a5add64..74f1e3129 100644 --- a/.github/actions/delete-k8s-job/action.yml +++ b/.github/actions/delete-k8s-job/action.yml @@ -11,18 +11,20 @@ runs: using: "composite" steps: - name: Delete Kubernetes job - shell: bash - run: | - # make sure we're deleting all the resources - pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ inputs.job-name }} -o jsonpath='{.items[*].metadata.name}') + uses: ./.github/actions/with-post-step + with: + main: | + echo "Main post step action: no action required" + post: | + pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ inputs.job-name }} -o jsonpath='{.items[*].metadata.name}') - for pod in $pods; do - status=$(kubectl get pod "$pod" -o jsonpath='{.status.phase}' || true) - echo "Pod: $pod, status: $status" - if [ "$status" = "Running" ] || [ "$status" = "Pending" ]; then - kubectl delete pod "$pod" --force --grace-period=0 || true - fi - done - - # make sure job is 
deleted - kubectl delete job ${{ inputs.job-name }} --force --grace-period=0 || true \ No newline at end of file + for pod in $pods; do + status=$(kubectl get pod "$pod" -o jsonpath='{.status.phase}' || true) + echo "Pod: $pod, status: $status" + if [ "$status" = "Running" ] || [ "$status" = "Pending" ]; then + kubectl delete pod "$pod" --force --grace-period=0 || true + fi + done + + # make sure job is deleted + kubectl delete job ${{ inputs.job-name }} --force --grace-period=0 || true \ No newline at end of file diff --git a/.github/actions/with-post-step/action.yml b/.github/actions/with-post-step/action.yml new file mode 100644 index 000000000..9816ee888 --- /dev/null +++ b/.github/actions/with-post-step/action.yml @@ -0,0 +1,42 @@ +# ==================================================================================================================== # +# Authors: # +# Patrick Lehmann # +# Unai Martinez-Corral # +# # +# ==================================================================================================================== # +# Copyright 2020-2024 The pyTooling Authors # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. # +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. # +# # +# SPDX-License-Identifier: Apache-2.0 # +# ==================================================================================================================== # +name: With post step + +description: 'Generic JS Action to execute a main command and set a command as a post step.' 
+ +inputs: + main: + description: 'Main command/script.' + required: true + post: + description: 'Post command/script.' + required: true + key: + description: 'Name of the state variable used to detect the post step.' + required: false + default: POST + +runs: + using: 'node20' + main: 'main.js' + post: 'main.js' \ No newline at end of file diff --git a/.github/actions/with-post-step/main.js b/.github/actions/with-post-step/main.js new file mode 100644 index 000000000..47a817cbc --- /dev/null +++ b/.github/actions/with-post-step/main.js @@ -0,0 +1,46 @@ +/* ================================================================================================================== * + * Authors: * + * Unai Martinez-Corral * + * * + * ================================================================================================================== * + * Copyright 2021-2022 Unai Martinez-Corral * + * Copyright 2022 Unai Martinez-Corral * + * * + * Licensed under the Apache License, Version 2.0 (the "License"); * + * you may not use this file except in compliance with the License. * + * You may obtain a copy of the License at * + * * + * http://www.apache.org/licenses/LICENSE-2.0 * + * * + * Unless required by applicable law or agreed to in writing, software * + * distributed under the License is distributed on an "AS IS" BASIS, * + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * + * See the License for the specific language governing permissions and * + * limitations under the License. 
* + * * + * SPDX-License-Identifier: Apache-2.0 * + * ================================================================================================================== * + * * + * Context: * + * * https://github.com/docker/login-action/issues/72 * + * * https://github.com/actions/runner/issues/1478 * + * ================================================================================================================== */ +const { spawn } = require("child_process"); +const { appendFileSync } = require("fs"); +const { EOL } = require("os"); + +function run(cmd) { + const subprocess = spawn(cmd, { stdio: "inherit", shell: true }); + subprocess.on("exit", (exitCode) => { + process.exitCode = exitCode; + }); +} + +const key = process.env.INPUT_KEY.toUpperCase(); + +if ( process.env[`STATE_${key}`] !== undefined ) { // Are we in the 'post' step? + run(process.env.INPUT_POST); +} else { // Otherwise, this is the main step + appendFileSync(process.env.GITHUB_STATE, `${key}=true${EOL}`); + run(process.env.INPUT_MAIN); +} \ No newline at end of file From 0de66b0d2cf2a40df0617713bd6a38c7392585f1 Mon Sep 17 00:00:00 2001 From: Steboss Date: Thu, 20 Feb 2025 17:45:44 +0000 Subject: [PATCH 48/89] build axlearn with tensorflow-cpu --- .github/container/Dockerfile.axlearn | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/container/Dockerfile.axlearn b/.github/container/Dockerfile.axlearn index 88cbc458c..dde1ae081 100644 --- a/.github/container/Dockerfile.axlearn +++ b/.github/container/Dockerfile.axlearn @@ -24,6 +24,7 @@ portpicker==1.6.0 seqio==0.0.18 protobuf==3.20.3 pytest>=7.4.3 +tensorflow-cpu REQUIREMENTS EOF From 8fbacde799e9bb4ccf3ad01352442a47cf34bf72 Mon Sep 17 00:00:00 2001 From: Steboss Date: Fri, 21 Feb 2025 16:50:25 +0000 Subject: [PATCH 49/89] placeholder for models on eks --- .../axlearn/axlearn-1B-model.yml | 77 ++++++++++++++++++ .../axlearn/axlearn-3B-model.yml | 78 +++++++++++++++++++ .github/workflows/_ci.yaml | 45 +++++++++++ 3 files changed, 200 
insertions(+) create mode 100644 .github/eks-workflow-files/axlearn/axlearn-1B-model.yml create mode 100644 .github/eks-workflow-files/axlearn/axlearn-3B-model.yml diff --git a/.github/eks-workflow-files/axlearn/axlearn-1B-model.yml b/.github/eks-workflow-files/axlearn/axlearn-1B-model.yml new file mode 100644 index 000000000..de1d77aa8 --- /dev/null +++ b/.github/eks-workflow-files/axlearn/axlearn-1B-model.yml @@ -0,0 +1,77 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: PLACEHOLDER + labels: + kueue.x-k8s.io/queue-name: p5-queue +spec: + completions: 1 + parallelism: 1 + template: + spec: + restartPolicy: Never + containers: + - name: axlearn + image: PLACEHOLDER + command: + - bash + - -xo + - pipefail + - -c + - | + + BASEDIR="/opt/axlearn" + CONFIG="fuji-1B-v3-flash-single-host" + HLO_DUMP=0 + POSTFIX="" + + AR_THRESHOLD=1073741824 + AG_THRESHOLD=8589934592 + RS_THRESHOLD=8589934592 + XLA_BASE_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true + --xla_gpu_enable_triton_gemm=false + --xla_gpu_enable_highest_priority_async_stream=true + --xla_gpu_all_gather_combine_threshold_bytes=${AG_THRESHOLD} + --xla_gpu_reduce_scatter_combine_threshold_bytes=${RS_THRESHOLD} + --xla_gpu_enable_pipelined_all_gather=true + --xla_gpu_enable_pipelined_reduce_scatter=true + --xla_gpu_enable_nccl_comm_splitting=false" + + export XLA_PYTHON_CLIENT_PREALLOCATE=false + export TF_GPU_ALLOCATOR=cuda_malloc_async + export XLA_FLAGS="${XLA_BASE_FLAGS}" + + export NCCL_BUFFSIZE=8388608 + export NCCL_P2P_NET_CHUNKSIZE=524288 + export NCCL_LAUNCH_MODE=GROUP + export NCCL_DEBUG=INFO + + LOG_DIR=${BASEDIR}/logs + TRAINER_DIR=${LOG_DIR}/${CONFIG}${POSTFIX}-eks/trainer-dir + mkdir -p ${TRAINER_DIR} + + cat << EOF > tf_gpu_fix.py + import tensorflow as tf + tf.config.set_visible_devices([], 'GPU') + import runpy + runpy.run_module('axlearn.common.launch_trainer_main', run_name='__main__') + EOF + + python3 tf_gpu_fix.py \ + --module=text.gpt.c4_trainer \ + --config=${CONFIG} \ + 
--trainer_dir=${TRAINER_DIR} \ + --data_dir=gs://axlearn-public/tensorflow_datasets \ + --jax_backend=gpu + + resources: + limits: + nvidia.com/gpu: 8 + volumeMounts: + - name: output + mountPath: /opt/output + imagePullSecrets: + - name: PLACEHOLDER + volumes: + - name: output + emptyDir: {} diff --git a/.github/eks-workflow-files/axlearn/axlearn-3B-model.yml b/.github/eks-workflow-files/axlearn/axlearn-3B-model.yml new file mode 100644 index 000000000..419d8bb0b --- /dev/null +++ b/.github/eks-workflow-files/axlearn/axlearn-3B-model.yml @@ -0,0 +1,78 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: PLACEHOLDER + labels: + kueue.x-k8s.io/queue-name: p5-queue +spec: + completions: 1 + parallelism: 1 + template: + spec: + restartPolicy: Never + containers: + - name: axlearn + image: PLACEHOLDER + command: + - bash + - -xo + - pipefail + - -c + - | + + BASEDIR="/opt/axlearn" + CONFIG="fuji-3B-v3-flash-single-host" + HLO_DUMP=0 + POSTFIX="" + + AR_THRESHOLD=1073741824 + AG_THRESHOLD=8589934592 + RS_THRESHOLD=8589934592 + XLA_BASE_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true + --xla_gpu_enable_triton_gemm=false + --xla_gpu_enable_highest_priority_async_stream=true + --xla_gpu_all_gather_combine_threshold_bytes=${AG_THRESHOLD} + --xla_gpu_reduce_scatter_combine_threshold_bytes=${RS_THRESHOLD} + --xla_gpu_enable_pipelined_all_gather=true + --xla_gpu_enable_pipelined_reduce_scatter=true + --xla_gpu_enable_nccl_comm_splitting=false" + + export XLA_PYTHON_CLEINT_PREALLOCATE=false + export TF_GPU_ALLOCATOR=cuda_malloc_async + export XLA_FLAGS="${XLA_BASE_FLAGS}" + + export NCCL_BUFFSIZE=8388608 + export NCCL_P2P_NET_CHUNKSIZE=524288 + export NCCL_LAUNCH_MODE=GROUP + export NCCL_DEBUG=INFO + + LOG_DIR=${BASEDIR}/logs + TRAINER_DIR=${LOG_DIR}/${CONFIG}${POSTFIX}-eks/trainer-dir + mkdir -p ${TRAINER_DIR} + + echo "Executing TF" + cat << EOF > tf_fix_gpu.py + import tensorflow as tf + tf.config.set_visible_devices([], 'GPU') + import runpy + 
runpy.run_module('axlearn.common.launch_trainer_main', run_name='__main__') + EOF + + python3 tf_fix_gpu.py \ + --module=text.gpt.c4_trainer \ + --config=${CONFIG} \ + --trainer_dir=${TRAINER_DIR} \ + --data_dir=gs://axlearn-public/tensorflow_datasets \ + --jax_backend=gpu + + resources: + limits: + nvidia.com/gpu: 8 + volumeMounts: + - name: output + mountPath: /opt/output + imagePullSecrets: + - name: PLACEHOLDER + volumes: + - name: output + emptyDir: {} diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 513e74306..c2e240642 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -759,4 +759,49 @@ jobs: sitrep.json "badge-axlearn-test" summary.txt + + + test-axlearn-fuji-1B: + needs: build-axlearn + if: inputs.ARCHITECTURE == 'amd64' + runs-on: eks + env: + AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }} + JOB_NAME: axlearn-fuji-1B-${{ github.run_id }} + TOKEN_NAME: axlearn-fuji-1B-${{ github.run_id }}-token + steps: + - name: Check out the repository + uses: actions/checkout@v4 + - name: GHCR Login + uses: ./.github/actions/ghcr-login + with: + docker-username: ${{ github.repository_owner }} + docker-password: ${{ secrets.GITHUB_TOKEN }} + token-name: ${{ env.TOKEN_NAME }} + - name: Configure axlearn test job + run: | + yq -i ea ' + select(di == 0).metadata.name = strenv(JOB_NAME) + | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE) + | select(di == 0).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)' \ + .github/eks-workflow-files/axlearn/axlearn-1B-model.yml + git diff .github/eks-workflow-files/axlearn/axlearn-1B-model.yml + + - name: Submit & wait for axlearn test job + uses: ./.github/actions/submit-k8s-job + with: + job-config-file: ".github/eks-workflow-files/axlearn/axlearn-1B-model.yml" + job-name: ${{ env.JOB_NAME }} + + - name: Delete axlearn test job + uses: ./.github/actions/delete-k8s-job + if: ${{ always() }} + with: + 
job-name: ${{ env.JOB_NAME }} + + - name: Delete GitHub Container Registry token + uses: ./.github/actions/delete-ghcr-token + if: ${{ always() }} + with: + token-name: ${{ env.TOKEN_NAME }} From 026b37aae67af015b894d1b0bf4e668cb9426e72 Mon Sep 17 00:00:00 2001 From: Steboss Date: Fri, 21 Feb 2025 18:49:27 +0000 Subject: [PATCH 50/89] test a setup for running fuji 1B on slurm --- .github/container/Dockerfile.axlearn | 2 +- .github/container/test-fuji-1B.sh | 39 ++ .github/workflows/_ci.yaml | 938 ++++++++++++++------------- .github/workflows/_test_fuji_1B.yaml | 106 +++ 4 files changed, 619 insertions(+), 466 deletions(-) create mode 100644 .github/container/test-fuji-1B.sh create mode 100644 .github/workflows/_test_fuji_1B.yaml diff --git a/.github/container/Dockerfile.axlearn b/.github/container/Dockerfile.axlearn index dde1ae081..c441bdc68 100644 --- a/.github/container/Dockerfile.axlearn +++ b/.github/container/Dockerfile.axlearn @@ -34,7 +34,7 @@ EOF ############################################################################### ADD test-axlearn.sh /usr/local/bin - +ADD test-fuji-1B.sh /usr/local/bin ############################################################################### ## Install accumulated packages from the base image and the previous stage ############################################################################### diff --git a/.github/container/test-fuji-1B.sh b/.github/container/test-fuji-1B.sh new file mode 100644 index 000000000..94042de13 --- /dev/null +++ b/.github/container/test-fuji-1B.sh @@ -0,0 +1,39 @@ +#! 
/bin/bash +BASEDIR="/opt/host/" +CONFIG="fuji-7B-v3-flash" +POSTFIX=${POSTFIX:=""} + + +export XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true + --xla_gpu_graph_level=0 + --xla_gpu_enable_highest_priority_async_stream=true + --xla_gpu_all_reduce_combine_threshold_bytes=1073741824 + --xla_gpu_all_gather_combine_threshold_bytes=1073741824 + --xla_gpu_reduce_scatter_combine_threshold_bytes=1073741824 + --xla_gpu_enable_pipelined_all_gather=true + --xla_gpu_enable_pipelined_reduce_scatter=true + --xla_gpu_enable_pipelined_all_reduce=true + --xla_gpu_enable_while_loop_double_buffering=true + --xla_gpu_enable_triton_gemm=false + --xla_gpu_enable_all_gather_combine_by_dim=false + --xla_gpu_enable_reduce_scatter_combine_by_dim=false + --xla_disable_hlo_passes=rematerialization" + +export XLA_PYTHON_CLIENT_PREALLOCATE=false +export TF_GPU_ALLOCATOR=cuda_malloc_async +export NCCL_BUFFSIZE=8388608 +export NCCL_P2P_NET_CHUNKSIZE=524288 +export NCCL_LAUNCH_MODE=GROUP +export NCCL_DEBUG=INFO +LOG_DIF=${BASEDIR}/logs +TRAINER_DIR=${LOG_DIF}/${CONFIG}_N${SLURM_JOB_NUM_NODES}_n${SLURM_NTASKS}/trainer-logs +mkdir -p ${TRAINER_DIR} + +#test "${WITH_MP}" == 1 && export MP_ARGS="--num_processes=${SLURM_NTASKS} --distributed_coordinator=${SLURM_LAUNCH_NODE_IPADDR}:12345 --process_id=${SLURM_PROCID}" + +python3 -m axlearn.common.launch_trainer_main \ + --module=text.gpt.c4_trainer \ + --config=${CONFIG} \ + --trainer_dir=${TRAINER_DIR} \ + --data_dir=gs://axlearn-public/tensorflow_datasets \ + --jax_backend=gpu diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index c2e240642..29ac71306 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -66,115 +66,115 @@ jobs: URLREF_TRANSFORMER_ENGINE=${{ fromJson(inputs.SOURCE_URLREFS).TRANSFORMER_ENGINE }} secrets: inherit - build-triton: - needs: build-jax - if: inputs.ARCHITECTURE == 'amd64' # Triton does not seem to support arm64 - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: 
${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: artifact-triton-build - BADGE_FILENAME: badge-triton-build - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: triton - DOCKERFILE: .github/container/Dockerfile.triton - RUNNER_SIZE: large - EXTRA_BUILD_ARGS: | - URLREF_JAX_TRITON=${{ fromJson(inputs.SOURCE_URLREFS).JAX_TRITON }} - secrets: inherit + # build-triton: + # needs: build-jax + # if: inputs.ARCHITECTURE == 'amd64' # Triton does not seem to support arm64 + # uses: ./.github/workflows/_build.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: artifact-triton-build + # BADGE_FILENAME: badge-triton-build + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: triton + # DOCKERFILE: .github/container/Dockerfile.triton + # RUNNER_SIZE: large + # EXTRA_BUILD_ARGS: | + # URLREF_JAX_TRITON=${{ fromJson(inputs.SOURCE_URLREFS).JAX_TRITON }} + # secrets: inherit - build-equinox: - needs: build-jax - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: artifact-equinox-build - BADGE_FILENAME: badge-equinox-build - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: equinox - DOCKERFILE: .github/container/Dockerfile.equinox - EXTRA_BUILD_ARGS: | - URLREF_EQUINOX=${{ fromJson(inputs.SOURCE_URLREFS).EQUINOX }} - secrets: inherit + # build-equinox: + # needs: build-jax + # uses: ./.github/workflows/_build.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: artifact-equinox-build + # BADGE_FILENAME: badge-equinox-build + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: equinox + # DOCKERFILE: .github/container/Dockerfile.equinox + # EXTRA_BUILD_ARGS: | + # URLREF_EQUINOX=${{ 
fromJson(inputs.SOURCE_URLREFS).EQUINOX }} + # secrets: inherit - build-maxtext: - needs: build-jax - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: artifact-maxtext-build - BADGE_FILENAME: badge-maxtext-build - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: maxtext - DOCKERFILE: .github/container/Dockerfile.maxtext - EXTRA_BUILD_ARGS: | - URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }} - secrets: inherit + # build-maxtext: + # needs: build-jax + # uses: ./.github/workflows/_build.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: artifact-maxtext-build + # BADGE_FILENAME: badge-maxtext-build + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: maxtext + # DOCKERFILE: .github/container/Dockerfile.maxtext + # EXTRA_BUILD_ARGS: | + # URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }} + # secrets: inherit - build-levanter: - needs: [build-jax] - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: "artifact-levanter-build" - BADGE_FILENAME: "badge-levanter-build" - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: levanter - DOCKERFILE: .github/container/Dockerfile.levanter - EXTRA_BUILD_ARGS: | - URLREF_LEVANTER=${{ fromJson(inputs.SOURCE_URLREFS).LEVANTER }} - URLREF_HALIAX=${{ fromJson(inputs.SOURCE_URLREFS).HALIAX }} - secrets: inherit + # build-levanter: + # needs: [build-jax] + # uses: ./.github/workflows/_build.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: "artifact-levanter-build" + # BADGE_FILENAME: "badge-levanter-build" + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: 
levanter + # DOCKERFILE: .github/container/Dockerfile.levanter + # EXTRA_BUILD_ARGS: | + # URLREF_LEVANTER=${{ fromJson(inputs.SOURCE_URLREFS).LEVANTER }} + # URLREF_HALIAX=${{ fromJson(inputs.SOURCE_URLREFS).HALIAX }} + # secrets: inherit - build-upstream-t5x: - needs: build-jax - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: "artifact-t5x-build" - BADGE_FILENAME: "badge-t5x-build" - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: upstream-t5x - DOCKERFILE: .github/container/Dockerfile.t5x - EXTRA_BUILD_ARGS: | - URLREF_T5X=${{ fromJson(inputs.SOURCE_URLREFS).T5X }} - URLREF_AIRIO=${{ fromJson(inputs.SOURCE_URLREFS).AIRIO }} - secrets: inherit + # build-upstream-t5x: + # needs: build-jax + # uses: ./.github/workflows/_build.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: "artifact-t5x-build" + # BADGE_FILENAME: "badge-t5x-build" + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: upstream-t5x + # DOCKERFILE: .github/container/Dockerfile.t5x + # EXTRA_BUILD_ARGS: | + # URLREF_T5X=${{ fromJson(inputs.SOURCE_URLREFS).T5X }} + # URLREF_AIRIO=${{ fromJson(inputs.SOURCE_URLREFS).AIRIO }} + # secrets: inherit - build-rosetta-t5x: - needs: build-upstream-t5x - uses: ./.github/workflows/_build_rosetta.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }} - BASE_LIBRARY: t5x - secrets: inherit + # build-rosetta-t5x: + # needs: build-upstream-t5x + # uses: ./.github/workflows/_build_rosetta.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }} + # BASE_LIBRARY: t5x + # secrets: inherit - build-gemma: - needs: 
build-jax - uses: ./.github/workflows/_build.yaml - if: inputs.ARCHITECTURE == 'amd64' # build only amd64 - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: artifact-gemma-build - BADGE_FILENAME: badge-gemma-build - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: gemma - DOCKERFILE: rosetta/Dockerfile.gemma - DOCKER_CONTEXT: . - EXTRA_BUILD_ARGS: | - URLREF_GEMMA=${{ fromJson(inputs.SOURCE_URLREFS).GEMMA }} - URLREF_BIG_VISION=${{ fromJson(inputs.SOURCE_URLREFS).BIG_VISION }} - URLREF_COMMON_LOOP_UTILS=${{ fromJson(inputs.SOURCE_URLREFS).COMMON_LOOP_UTILS }} - URLREF_FLAXFORMER=${{ fromJson(inputs.SOURCE_URLREFS).FLAXFORMER }} - URLREF_PANOPTICAPI=${{ fromJson(inputs.SOURCE_URLREFS).PANOPTICAPI }} - secrets: inherit + # build-gemma: + # needs: build-jax + # uses: ./.github/workflows/_build.yaml + # if: inputs.ARCHITECTURE == 'amd64' # build only amd64 + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: artifact-gemma-build + # BADGE_FILENAME: badge-gemma-build + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: gemma + # DOCKERFILE: rosetta/Dockerfile.gemma + # DOCKER_CONTEXT: . 
+ # EXTRA_BUILD_ARGS: | + # URLREF_GEMMA=${{ fromJson(inputs.SOURCE_URLREFS).GEMMA }} + # URLREF_BIG_VISION=${{ fromJson(inputs.SOURCE_URLREFS).BIG_VISION }} + # URLREF_COMMON_LOOP_UTILS=${{ fromJson(inputs.SOURCE_URLREFS).COMMON_LOOP_UTILS }} + # URLREF_FLAXFORMER=${{ fromJson(inputs.SOURCE_URLREFS).FLAXFORMER }} + # URLREF_PANOPTICAPI=${{ fromJson(inputs.SOURCE_URLREFS).PANOPTICAPI }} + # secrets: inherit build-axlearn: needs: build-jax @@ -196,13 +196,13 @@ jobs: needs: - build-base - build-jax - - build-triton - - build-equinox - - build-maxtext - - build-levanter - - build-upstream-t5x - - build-rosetta-t5x - - build-gemma + # - build-triton + # - build-equinox + # - build-maxtext + # - build-levanter + # - build-upstream-t5x + # - build-rosetta-t5x + # - build-gemma - build-axlearn outputs: TAGS: ${{ steps.collect-tags.outputs.TAGS }} @@ -214,22 +214,22 @@ jobs: [\ {"flavor": "base", "stage": "final", "priority": 800, "tag": "${{ needs.build-base.outputs.DOCKER_TAG }}"},\ {"flavor": "jax", "stage": "final", "priority": 1000, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "triton", "stage": "final", "priority": 900, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "equinox", "stage": "final", "priority": 900, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "maxtext", "stage": "final", "priority": 900, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "levanter", "stage": "final", "priority": 900, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "upstream-t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "gemma", "stage": "final", "priority": 900, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }}"},\ + # {"flavor": "triton", 
"stage": "final", "priority": 900, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }}"},\ + # {"flavor": "equinox", "stage": "final", "priority": 900, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }}"},\ + # {"flavor": "maxtext", "stage": "final", "priority": 900, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}"},\ + # {"flavor": "levanter", "stage": "final", "priority": 900, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }}"},\ + # {"flavor": "upstream-t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }}"},\ + # {"flavor": "t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }}"},\ + # {"flavor": "gemma", "stage": "final", "priority": 900, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "axlearn", "stage": "final", "priority": 900, "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "jax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "triton", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "equinox", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "maxtext", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "levanter", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "upstream-t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "gemma", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_MEALKIT }}"},\ + # 
{"flavor": "triton", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_MEALKIT }}"},\ + # {"flavor": "equinox", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_MEALKIT }}"},\ + # {"flavor": "maxtext", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_MEALKIT }}"},\ + # {"flavor": "levanter", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_MEALKIT }}"},\ + # {"flavor": "upstream-t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ + # {"flavor": "t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ + # {"flavor": "gemma", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_MEALKIT }}"},\ {"flavor": "axlearn", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_MEALKIT }}"},\ {}\ @@ -239,252 +239,252 @@ jobs: echo "TAGS=${TAGS}" >> $GITHUB_OUTPUT - test-distribution: - runs-on: ubuntu-22.04 - strategy: - matrix: - TEST_SCRIPT: - - extra-only-distribution.sh - - mirror-only-distribution.sh - - upstream-only-distribution.sh - - local-patch-distribution.sh - fail-fast: false - steps: - - name: Print environment variables - run: env - - name: Set git login for tests - run: | - git config --global user.email "jax@nvidia.com" - git config --global user.name "JAX-Toolbox CI" - - name: Check out the repository under ${GITHUB_WORKSPACE} - uses: actions/checkout@v4 - - name: Run integration test ${{ matrix.TEST_SCRIPT }} - run: bash rosetta/tests/${{ matrix.TEST_SCRIPT }} - - test-jax: - needs: build-jax - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_unit.yaml - with: - TEST_NAME: jax - EXECUTE: | - docker run -i --shm-size=1g --gpus all \ - ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - 
bash <<"EOF" |& tee test-backend-independent.log - test-jax.sh -b backend-independent - EOF - docker run -i --shm-size=1g --gpus all \ - ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee tee test-gpu.log - nvidia-cuda-mps-control -d - test-jax.sh -b gpu - EOF - STATISTICS_SCRIPT: | - errors=$(cat test-*.log | grep -c 'ERROR:' || true) - failed_tests=$(cat test-*.log | grep -c 'FAILED in' || true) - passed_tests=$(cat test-*.log | grep -c 'PASSED in' || true) - total_tests=$((failed_tests + passed_tests)) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - ARTIFACTS: | - test-backend-independent.log - test-gpu.log - secrets: inherit + # test-distribution: + # runs-on: ubuntu-22.04 + # strategy: + # matrix: + # TEST_SCRIPT: + # - extra-only-distribution.sh + # - mirror-only-distribution.sh + # - upstream-only-distribution.sh + # - local-patch-distribution.sh + # fail-fast: false + # steps: + # - name: Print environment variables + # run: env + # - name: Set git login for tests + # run: | + # git config --global user.email "jax@nvidia.com" + # git config --global user.name "JAX-Toolbox CI" + # - name: Check out the repository under ${GITHUB_WORKSPACE} + # uses: actions/checkout@v4 + # - name: Run integration test ${{ matrix.TEST_SCRIPT }} + # run: bash rosetta/tests/${{ matrix.TEST_SCRIPT }} + + # test-jax: + # needs: build-jax + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # uses: ./.github/workflows/_test_unit.yaml + # with: + # TEST_NAME: jax + # EXECUTE: | + # docker run -i --shm-size=1g --gpus all \ + # ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-backend-independent.log + # test-jax.sh -b backend-independent + # EOF + # docker run -i --shm-size=1g --gpus all \ + # ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee 
tee test-gpu.log + # nvidia-cuda-mps-control -d + # test-jax.sh -b gpu + # EOF + # STATISTICS_SCRIPT: | + # errors=$(cat test-*.log | grep -c 'ERROR:' || true) + # failed_tests=$(cat test-*.log | grep -c 'FAILED in' || true) + # passed_tests=$(cat test-*.log | grep -c 'PASSED in' || true) + # total_tests=$((failed_tests + passed_tests)) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # ARTIFACTS: | + # test-backend-independent.log + # test-gpu.log + # secrets: inherit - test-nsys-jax: - needs: build-jax - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_unit.yaml - with: - TEST_NAME: nsys-jax - EXECUTE: | - set -o pipefail - num_tests=0 - num_failures=0 - # Run the pytest-driven tests; failure is explicitly handled below so set +e to - # avoid an early abort here. - set +e - docker run -i --shm-size=1g --gpus all \ - -v $PWD:/opt/output \ - ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee test-nsys-jax.log - # nsys-jax is already installed, this is just adding the test dependencies - pip install pytest-reportlog nsys-jax[test] - # abuse knowledge that nsys-jax is installed editable, so the tests exist - test_path=$(python -c 'import importlib.resources; print(importlib.resources.files("nsys_jax").joinpath("..", "tests").resolve())') - pytest --report-log=/opt/output/pytest-report.jsonl "${test_path}" - EOF - set -e - GPUS_PER_NODE=$(nvidia-smi -L | grep -c '^GPU') - for mode in 1-process 2-process process-per-gpu; do - DOCKER="docker run --shm-size=1g --gpus all --env XLA_FLAGS=--xla_gpu_enable_command_buffer= --env XLA_PYTHON_CLIENT_MEM_FRACTION=0.9 -v ${PWD}:/opt/output ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}" - if [[ "${mode}" == "1-process" ]]; then - PROCESS_COUNT=1 - ARGS="" - elif [[ "${mode}" == "2-process" ]]; then - 
# Use two processes with GPUS_PER_NODE/2 GPUs per process in the hope that - # this will flush out more bugs than process-per-node or process-per-GPU. - PROCESS_COUNT=2 - ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process $((GPUS_PER_NODE/2)) --distributed" - else - PROCESS_COUNT=${GPUS_PER_NODE} - ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process 1 --distributed" - fi - for collection in full partial; do - NSYS_JAX="nsys-jax" - if [[ "${mode}" == "1-process" ]]; then - # We will not run nsys-jax-combine, so run analyses eagerly - NSYS_JAX+=" --nsys-jax-analysis communication --nsys-jax-analysis summary" - fi - NSYS_JAX+=" --output=/opt/output/${mode}-${collection}-execution-%q{RANK}" - if [[ "${collection}" == "partial" ]]; then - NSYS_JAX+=" --capture-range=cudaProfilerApi --capture-range-end=stop" - # nvbug/4801401 - NSYS_JAX+=" --sample=none" - fi - set +e - ${DOCKER} parallel-launch RANK ${PROCESS_COUNT} ${NSYS_JAX} \ - -- jax-nccl-test ${ARGS} |& tee ${mode}-${collection}-execution.log - num_failures=$((num_failures + ($? 
!= 0))) - set -e - num_tests=$((num_tests + 1)) - done - if [[ "${mode}" != "1-process" ]]; then - # Run nsys-jax-combine - NSYS_JAX_COMBINE="nsys-jax-combine --analysis communication --analysis summary --output=/opt/output/${mode}-${collection}-execution.zip" - for (( i=0; i> $GITHUB_ENV - echo "NSYS_JAX_FAIL_COUNT=${num_failures}" >> $GITHUB_ENV - exit $num_failures - STATISTICS_SCRIPT: | - summary_line=$(tail -n1 test-nsys-jax.log) - num_errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) - failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) - total_tests=$(( NSYS_JAX_TEST_COUNT + passed_tests + failed_tests )) - num_passed=$(( passed_tests + NSYS_JAX_TEST_COUNT - NSYS_JAX_FAIL_COUNT )) - num_failed=$(( failed_tests + NSYS_JAX_FAIL_COUNT )) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${num_errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${num_passed}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${num_failed}" >> $GITHUB_OUTPUT - ARTIFACTS: | - # pytest-driven part - test-nsys-jax.log - pytest-report.jsonl - # nsys-jax logfiles - *process-*-execution.log - # nsys-jax output for the case that doesn't use nsys-jax-combine - 1-process-*-execution-0.zip - # nsys-jax-combine output/logfiles - *process*-*-execution.zip - *-execution-combine.log - secrets: inherit + # test-nsys-jax: + # needs: build-jax + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # uses: ./.github/workflows/_test_unit.yaml + # with: + # TEST_NAME: nsys-jax + # EXECUTE: | + # set -o pipefail + # num_tests=0 + # num_failures=0 + # # Run the pytest-driven tests; failure is explicitly handled below so set +e to + # # avoid an early abort here. 
+ # set +e + # docker run -i --shm-size=1g --gpus all \ + # -v $PWD:/opt/output \ + # ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-nsys-jax.log + # # nsys-jax is already installed, this is just adding the test dependencies + # pip install pytest-reportlog nsys-jax[test] + # # abuse knowledge that nsys-jax is installed editable, so the tests exist + # test_path=$(python -c 'import importlib.resources; print(importlib.resources.files("nsys_jax").joinpath("..", "tests").resolve())') + # pytest --report-log=/opt/output/pytest-report.jsonl "${test_path}" + # EOF + # set -e + # GPUS_PER_NODE=$(nvidia-smi -L | grep -c '^GPU') + # for mode in 1-process 2-process process-per-gpu; do + # DOCKER="docker run --shm-size=1g --gpus all --env XLA_FLAGS=--xla_gpu_enable_command_buffer= --env XLA_PYTHON_CLIENT_MEM_FRACTION=0.9 -v ${PWD}:/opt/output ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}" + # if [[ "${mode}" == "1-process" ]]; then + # PROCESS_COUNT=1 + # ARGS="" + # elif [[ "${mode}" == "2-process" ]]; then + # # Use two processes with GPUS_PER_NODE/2 GPUs per process in the hope that + # # this will flush out more bugs than process-per-node or process-per-GPU. 
+ # PROCESS_COUNT=2 + # ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process $((GPUS_PER_NODE/2)) --distributed" + # else + # PROCESS_COUNT=${GPUS_PER_NODE} + # ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process 1 --distributed" + # fi + # for collection in full partial; do + # NSYS_JAX="nsys-jax" + # if [[ "${mode}" == "1-process" ]]; then + # # We will not run nsys-jax-combine, so run analyses eagerly + # NSYS_JAX+=" --nsys-jax-analysis communication --nsys-jax-analysis summary" + # fi + # NSYS_JAX+=" --output=/opt/output/${mode}-${collection}-execution-%q{RANK}" + # if [[ "${collection}" == "partial" ]]; then + # NSYS_JAX+=" --capture-range=cudaProfilerApi --capture-range-end=stop" + # # nvbug/4801401 + # NSYS_JAX+=" --sample=none" + # fi + # set +e + # ${DOCKER} parallel-launch RANK ${PROCESS_COUNT} ${NSYS_JAX} \ + # -- jax-nccl-test ${ARGS} |& tee ${mode}-${collection}-execution.log + # num_failures=$((num_failures + ($? 
!= 0))) + # set -e + # num_tests=$((num_tests + 1)) + # done + # if [[ "${mode}" != "1-process" ]]; then + # # Run nsys-jax-combine + # NSYS_JAX_COMBINE="nsys-jax-combine --analysis communication --analysis summary --output=/opt/output/${mode}-${collection}-execution.zip" + # for (( i=0; i> $GITHUB_ENV + # echo "NSYS_JAX_FAIL_COUNT=${num_failures}" >> $GITHUB_ENV + # exit $num_failures + # STATISTICS_SCRIPT: | + # summary_line=$(tail -n1 test-nsys-jax.log) + # num_errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + # passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) + # failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) + # total_tests=$(( NSYS_JAX_TEST_COUNT + passed_tests + failed_tests )) + # num_passed=$(( passed_tests + NSYS_JAX_TEST_COUNT - NSYS_JAX_FAIL_COUNT )) + # num_failed=$(( failed_tests + NSYS_JAX_FAIL_COUNT )) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${num_errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${num_passed}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${num_failed}" >> $GITHUB_OUTPUT + # ARTIFACTS: | + # # pytest-driven part + # test-nsys-jax.log + # pytest-report.jsonl + # # nsys-jax logfiles + # *process-*-execution.log + # # nsys-jax output for the case that doesn't use nsys-jax-combine + # 1-process-*-execution-0.zip + # # nsys-jax-combine output/logfiles + # *process*-*-execution.zip + # *-execution-combine.log + # secrets: inherit #test-nsys-jax generates several fresh .zip archive outputs by running nsys-jax with real GPU hardware; this test #runs on a regular GitHub Actions runner and checks that offline post-processing works in an environment that does #not already have nsys-jax installed - test-nsys-jax-archive: - needs: test-nsys-jax - if: 
inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - strategy: - matrix: - os: [ubuntu-22.04, ubuntu-24.04, macOS-latest] - runs-on: ${{ matrix.os }} - steps: - - name: Download nsys-jax output .zip files - uses: actions/download-artifact@v4 - with: - name: nsys-jax-unit-test-A100 - - name: Extract archives and execute install scripts - run: | - pip install virtualenv # for install.sh - for zip in $(ls *.zip); do - ZIP="${PWD}/${zip}" - pushd $(mktemp -d) - unzip "${ZIP}" - ls -l - # TODO: verify this isn't needed, or make sure it isn't needed - chmod 755 install.sh - # Run the notebook with IPython, not Jupyter Lab, so it exits and prints something informative to stdout - # Skip executing Jupyter lab - NSYS_JAX_JUPYTER_EXECUTE_NOT_LAB=1 ./install.sh - popd - done - - test-nsys-jax-eks: - needs: build-jax - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - runs-on: eks - env: - JAX_DOCKER_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} - JOB_NAME: ${{ github.run_id }}-nsys-jax - POSTPROCESS_JOB_NAME: ${{ github.run_id }}-nsys-jax-postprocess - TOKEN_NAME: ${{ github.run_id }}-nsys-jax-token - steps: - - name: Check out the repository - uses: actions/checkout@v4 - - name: GHCR login - uses: ./.github/actions/ghcr-login - with: - docker-username: ${{ github.repository_owner }} - docker-password: ${{ secrets.GITHUB_TOKEN}} - token-name: ${{ env.TOKEN_NAME }} - - name: Configure Kubernetes job - run: | - yq -i ea 'select(di == 0).spec.selector.job-name = strenv(JOB_NAME) - | select(di == 1).metadata.name = strenv(JOB_NAME) - | select(di == 1).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) - | select(di == 1).spec.template.spec.containers[0].image = strenv(JAX_DOCKER_IMAGE) - | select(di == 1).spec.template.spec.containers[0].env[0].value = strenv(JOB_NAME)' \ - .github/eks-workflow-files/job.yml - git diff .github/eks-workflow-files/job.yml - - name: Submit Kubernetes job - uses: ./.github/actions/submit-k8s-job - with: - job-config-file: 
.github/eks-workflow-files/job.yml - job-name: ${{ env.JOB_NAME }} - - name: Delete eks job - uses: ./.github/actions/delete-k8s-job - if: ${{ always() }} - with: - job-name: ${{ env.JOB_NAME }} - - name: Configure post-processing job - run: | - export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip" - yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME) - | .spec.template.spec.containers[].image = strenv(JAX_DOCKER_IMAGE) - | .spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) - | .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \ - .github/eks-workflow-files/post-process-job.yml - git diff .github/eks-workflow-files/post-process-job.yml - - name: Submit post process k8s job - uses: ./.github/actions/submit-k8s-job - with: - job-config-file: .github/eks-workflow-files/post-process-job.yml - job-name: ${{ env.POSTPROCESS_JOB_NAME }} - - name: Delete post process k8s job - uses: ./.github/actions/delete-k8s-job - with: - job-name: ${{ env.POSTPROCESS_JOB_NAME }} - - name: Delete GitHub Container Registry token - uses: ./.github/actions/delete-ghcr-token - if: ${{ always() }} - with: - token-name: ${{ env.TOKEN_NAME }} - + # test-nsys-jax-archive: + # needs: test-nsys-jax + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # strategy: + # matrix: + # os: [ubuntu-22.04, ubuntu-24.04, macOS-latest] + # runs-on: ${{ matrix.os }} + # steps: + # - name: Download nsys-jax output .zip files + # uses: actions/download-artifact@v4 + # with: + # name: nsys-jax-unit-test-A100 + # - name: Extract archives and execute install scripts + # run: | + # pip install virtualenv # for install.sh + # for zip in $(ls *.zip); do + # ZIP="${PWD}/${zip}" + # pushd $(mktemp -d) + # unzip "${ZIP}" + # ls -l + # # TODO: verify this isn't needed, or make sure it isn't needed + # chmod 755 install.sh + # # Run the notebook with IPython, not Jupyter Lab, so it exits and prints something informative to stdout + # # Skip executing Jupyter lab + # 
NSYS_JAX_JUPYTER_EXECUTE_NOT_LAB=1 ./install.sh + # popd + # done + + # test-nsys-jax-eks: + # needs: build-jax + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # runs-on: eks + # env: + # JAX_DOCKER_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} + # JOB_NAME: ${{ github.run_id }}-nsys-jax + # POSTPROCESS_JOB_NAME: ${{ github.run_id }}-nsys-jax-postprocess + # TOKEN_NAME: ${{ github.run_id }}-nsys-jax-token + # steps: + # - name: Check out the repository + # uses: actions/checkout@v4 + # - name: GHCR login + # uses: ./.github/actions/ghcr-login + # with: + # docker-username: ${{ github.repository_owner }} + # docker-password: ${{ secrets.GITHUB_TOKEN}} + # token-name: ${{ env.TOKEN_NAME }} + # - name: Configure Kubernetes job + # run: | + # yq -i ea 'select(di == 0).spec.selector.job-name = strenv(JOB_NAME) + # | select(di == 1).metadata.name = strenv(JOB_NAME) + # | select(di == 1).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) + # | select(di == 1).spec.template.spec.containers[0].image = strenv(JAX_DOCKER_IMAGE) + # | select(di == 1).spec.template.spec.containers[0].env[0].value = strenv(JOB_NAME)' \ + # .github/eks-workflow-files/job.yml + # git diff .github/eks-workflow-files/job.yml + # - name: Submit Kubernetes job + # uses: ./.github/actions/submit-k8s-job + # with: + # job-config-file: .github/eks-workflow-files/job.yml + # job-name: ${{ env.JOB_NAME }} + # - name: Delete eks job + # uses: ./.github/actions/delete-k8s-job + # if: ${{ always() }} + # with: + # job-name: ${{ env.JOB_NAME }} + # - name: Configure post-processing job + # run: | + # export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip" + # yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME) + # | .spec.template.spec.containers[].image = strenv(JAX_DOCKER_IMAGE) + # | .spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) + # | .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \ + # 
.github/eks-workflow-files/post-process-job.yml + # git diff .github/eks-workflow-files/post-process-job.yml + # - name: Submit post process k8s job + # uses: ./.github/actions/submit-k8s-job + # with: + # job-config-file: .github/eks-workflow-files/post-process-job.yml + # job-name: ${{ env.POSTPROCESS_JOB_NAME }} + # - name: Delete post process k8s job + # uses: ./.github/actions/delete-k8s-job + # with: + # job-name: ${{ env.POSTPROCESS_JOB_NAME }} + # - name: Delete GitHub Container Registry token + # uses: ./.github/actions/delete-ghcr-token + # if: ${{ always() }} + # with: + # token-name: ${{ env.TOKEN_NAME }} + # COMMENT THIS # test-equinox: # needs: build-equinox # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a @@ -508,7 +508,7 @@ jobs: # ARTIFACTS: | # test-equinox.log # secrets: inherit - + # COMMENT THIS # test-te-multigpu: # needs: build-upstream-pax # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a @@ -517,78 +517,79 @@ jobs: # TE_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} # secrets: inherit - test-upstream-t5x: - needs: build-upstream-t5x - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_upstream_t5x.yaml - with: - T5X_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }} - secrets: inherit - - test-rosetta-t5x: - needs: build-rosetta-t5x - if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 - uses: ./.github/workflows/_test_t5x_rosetta.yaml - with: - T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }} - secrets: inherit + # test-upstream-t5x: + # needs: build-upstream-t5x + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # uses: ./.github/workflows/_test_upstream_t5x.yaml + # with: + # T5X_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit - test-triton: - needs: build-triton - if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 - uses: ./.github/workflows/_test_unit.yaml - with: - TEST_NAME: 
triton - EXECUTE: | - docker run -i --shm-size=1g --gpus all --volume $PWD:/output \ - ${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee test-triton.log - # autotuner tests from jax-triton now hit a triton code path that uses utilities from pytorch; this relies on - # actually having a CUDA backend for pytoch - pip install --no-deps torch - python /opt/jax-triton/tests/triton_call_test.py --xml_output_file /output/triton_test.xml - EOF - STATISTICS_SCRIPT: | - curl -L -o yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg --print-architecture) && chmod 777 yq; - total_tests=$(./yq '.testsuites."+@tests"' triton_test.xml) - errors=$(./yq '.testsuites."+@errors"' triton_test.xml) - failed_tests=$(./yq '.testsuites."+@failures"' triton_test.xml) - passed_tests=$((total_tests - errors - failed_tests)) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - ARTIFACTS: | - test-triton.log - secrets: inherit + # test-rosetta-t5x: + # needs: build-rosetta-t5x + # if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 + # uses: ./.github/workflows/_test_t5x_rosetta.yaml + # with: + # T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit - test-levanter: - needs: build-levanter - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_unit.yaml - with: - TEST_NAME: levanter - EXECUTE: | - docker run -i --gpus all --shm-size=1g \ - ${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee test-levanter.log - pip install flake8 pytest soundfile librosa - PYTHONPATH=/opt/levanter/tests:$PYTHONPATH pytest /opt/levanter/tests -m "not entry and not slow and not ray" - EOF - STATISTICS_SCRIPT: | - summary_line=$(tail -n1 test-levanter.log) - errors=$(echo $summary_line | grep -oE '[0-9]+ error' 
| awk '{print $1} END { if (!NR) print 0}') - failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') - passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') - total_tests=$((failed_tests + passed_tests)) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - ARTIFACTS: | - test-levanter.log - secrets: inherit + # test-triton: + # needs: build-triton + # if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 + # uses: ./.github/workflows/_test_unit.yaml + # with: + # TEST_NAME: triton + # EXECUTE: | + # docker run -i --shm-size=1g --gpus all --volume $PWD:/output \ + # ${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-triton.log + # # autotuner tests from jax-triton now hit a triton code path that uses utilities from pytorch; this relies on + # # actually having a CUDA backend for pytoch + # pip install --no-deps torch + # python /opt/jax-triton/tests/triton_call_test.py --xml_output_file /output/triton_test.xml + # EOF + # STATISTICS_SCRIPT: | + # curl -L -o yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg --print-architecture) && chmod 777 yq; + # total_tests=$(./yq '.testsuites."+@tests"' triton_test.xml) + # errors=$(./yq '.testsuites."+@errors"' triton_test.xml) + # failed_tests=$(./yq '.testsuites."+@failures"' triton_test.xml) + # passed_tests=$((total_tests - errors - failed_tests)) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # ARTIFACTS: | + # test-triton.log + # secrets: inherit + # test-levanter: + # needs: build-levanter + # if: inputs.ARCHITECTURE == 'amd64' # arm64 
runners n/a + # uses: ./.github/workflows/_test_unit.yaml + # with: + # TEST_NAME: levanter + # EXECUTE: | + # docker run -i --gpus all --shm-size=1g \ + # ${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-levanter.log + # pip install flake8 pytest soundfile librosa + # PYTHONPATH=/opt/levanter/tests:$PYTHONPATH pytest /opt/levanter/tests -m "not entry and not slow and not ray" + # EOF + # STATISTICS_SCRIPT: | + # summary_line=$(tail -n1 test-levanter.log) + # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + # failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') + # passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') + # total_tests=$((failed_tests + passed_tests)) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # ARTIFACTS: | + # test-levanter.log + # secrets: inherit + + # COMMENT THIS # test-te: # needs: build-upstream-pax # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a @@ -618,37 +619,37 @@ jobs: # pytest-report.jsonl # secrets: inherit - test-gemma: - needs: build-gemma - uses: ./.github/workflows/_test_unit.yaml - if: inputs.ARCHITECTURE == 'amd64' - with: - TEST_NAME: gemma - EXECUTE: | - docker run --shm-size=1g --gpus all ${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }} \ - bash -ec \ - "cd /opt/gemma && pip install -e .[dev] && pytest ." 
| tee test-gemma.log - STATISTICS_SCRIPT: | - summary_line=$(tail -n1 test-gemma.log) - errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') - passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') - total_tests=$((failed_tests + passed_tests)) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - ARTIFACTS: | - test-gemma.log - secrets: inherit + # test-gemma: + # needs: build-gemma + # uses: ./.github/workflows/_test_unit.yaml + # if: inputs.ARCHITECTURE == 'amd64' + # with: + # TEST_NAME: gemma + # EXECUTE: | + # docker run --shm-size=1g --gpus all ${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }} \ + # bash -ec \ + # "cd /opt/gemma && pip install -e .[dev] && pytest ." 
| tee test-gemma.log + # STATISTICS_SCRIPT: | + # summary_line=$(tail -n1 test-gemma.log) + # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + # failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') + # passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') + # total_tests=$((failed_tests + passed_tests)) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # ARTIFACTS: | + # test-gemma.log + # secrets: inherit - test-maxtext: - needs: build-maxtext - if: inputs.ARCHITECTURE == 'amd64' # no arm64 gpu runners - uses: ./.github/workflows/_test_maxtext.yaml - with: - MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }} - secrets: inherit + # test-maxtext: + # needs: build-maxtext + # if: inputs.ARCHITECTURE == 'amd64' # no arm64 gpu runners + # uses: ./.github/workflows/_test_maxtext.yaml + # with: + # MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit test-axlearn-eks: needs: build-axlearn @@ -760,6 +761,13 @@ jobs: "badge-axlearn-test" summary.txt + test-axlearn-fuji-1B-slurm: + needs: build-axlearn + if: inputs.ARCHITECTURE == 'amd64' + uses: ./.github/workflows/_test_fuji_1B.yaml + with: + AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }} + secrets: inherit test-axlearn-fuji-1B: needs: build-axlearn diff --git a/.github/workflows/_test_fuji_1B.yaml b/.github/workflows/_test_fuji_1B.yaml new file mode 100644 index 000000000..54e72b53d --- /dev/null +++ b/.github/workflows/_test_fuji_1B.yaml @@ -0,0 +1,106 @@ +name: ~test MaxText functionality + +on: + workflow_call: + inputs: + AXLEARN_DOCKER_IMAGE: + type: string + description: Axlearn image from ghcr.io/nvidia + default: 
ghcr.io/nvidia/jax:axlearn + required: false + +jobs: + single-process-single-node: + runs-on: jumpbox + steps: + - name: Check out the repository under ${GITHUB_WORKSPACE} + uses: actions/checkout@v4 + + - name: Setup SSH + id: setup-ssh + uses: ./.github/actions/setup-ssh + with: + ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} + ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }} + + - name: Labels and metadata + id: meta + shell: bash -x -e {0} + run: | + IMAGE="$(echo ${{inputs.AXLEARN_IMAGE}} | sed 's/\//#/')" + TOTAL_TASKS=1 + MAX_GPUS_PER_NODE=8 + NODES=1 + GPUS_PER_NODE=8 + JOB_NAME=axlearn-fuji-1B-${GITHUB_RUN_ID}-${TEST_CASE_NAME} + LOG_FILE=/nfs/cluster/${JOB_NAME}.log + MODEL_PATH=/nfs/cluster/${JOB_NAME} + for var in IMAGE TEST_CASE_NAME TOTAL_TASKS NODES GPUS_PER_NODE JOB_NAME LOG_FILE MODEL_PATH; do + echo "$var=${!var}" >> $GITHUB_OUTPUT + done + + - name: Submit SLURM jobs over SSH + id: submit + shell: bash -O expand_aliases -x -e {0} + run: | + alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}' + sshx "date && hostname && sinfo" + sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }} + JOB=$(sshx sbatch --parsable << EOF + #!/bin/bash + #SBATCH --job-name=${{ steps.meta.outputs.JOB_NAME }} + #SBATCH --exclusive + #SBATCH --nodes=${{ steps.meta.outputs.NODES }} + #SBATCH --gpus-per-node=${{ steps.meta.outputs.GPUS_PER_NODE }} + #SBATCH --time=00:30:00 + #SBATCH --output=${{ steps.meta.outputs.LOG_FILE }} + + # preload enroot container using one task per node + time srun \ + --ntasks-per-node=1 \ + --container-name=runtime \ + --container-image=${{ steps.meta.outputs.IMAGE }} \ + true + + # run job with tasks on each node sharing one container + time srun \ + --ntasks=${{ steps.meta.outputs.TOTAL_TASKS }} \ + --container-name=runtime \ + --container-mounts=${{ steps.meta.outputs.MODEL_PATH }}:/output \ + --container-entrypoint \ + test-fuji-1B.sh + EOF + ) + + echo "SLURM_JOB_ID=${JOB}" >> 
$GITHUB_OUTPUT + + . .github/workflows/scripts/wait_for_slurm_job.sh + + wait_for_slurm_job ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} ${JOB} + + # Gather job info + SLURM_STATE=$(sshx sacct -j $JOB --format=State --parsable2 --noheader |& head -n 1) + SLURM_EXITCODE=$(sshx sacct -j $JOB --format=exitcode --parsable2 --noheader | sort -r -u | head -1 | cut -f 1 -d":" | sed 's/ //g') + echo "SLURM Job state is ${SLURM_STATE}" + echo "SLURM Job exit code is ${SLURM_EXITCODE}" + echo "SLURM_STATE=${SLURM_STATE}" >> "$GITHUB_OUTPUT" + echo "SLURM_EXITCODE=${SLURM_EXITCODE}" >> "$GITHUB_OUTPUT" + set -x + + + - name: Remove orphaned SLURM job if the CI job is canceled + if: cancelled() + shell: bash -x -e {0} + run: | + ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \ + scancel ${{ steps.submit.outputs.SLURM_JOB_ID }} + + - name: Write SLURM job status to file + shell: bash -x -e {0} + run: | + python << EOF + import json + with open("output/${{ steps.meta.outputs.TEST_CASE_NAME }}-status.json", "w") as f: + dump = {'state': "${{ steps.submit.outputs.SLURM_STATE }}", 'exitcode': "${{ steps.submit.outputs.SLURM_EXITCODE }}"} + json.dump(dump, f) + EOF \ No newline at end of file From 2dd21ad9ce01eaee4a31611214c86036dd139e4c Mon Sep 17 00:00:00 2001 From: Steboss Date: Fri, 21 Feb 2025 18:50:52 +0000 Subject: [PATCH 51/89] fix naming --- .github/workflows/_ci.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 29ac71306..38e112e24 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -775,8 +775,8 @@ jobs: runs-on: eks env: AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }} - JOB_NAME: axlearn-fuji-1B-${{ github.run_id }} - TOKEN_NAME: axlearn-fuji-1B-${{ github.run_id }}-token + JOB_NAME: axlearn-fuji-1b-${{ github.run_id }} + TOKEN_NAME: axlearn-fuji-1b-${{ github.run_id }}-token steps: - name: 
Check out the repository uses: actions/checkout@v4 From e35043492e8dbe4d6b8acd70d63d1bfbf0e3b455 Mon Sep 17 00:00:00 2001 From: Steboss Date: Fri, 21 Feb 2025 18:58:21 +0000 Subject: [PATCH 52/89] fix indt --- .github/workflows/_test_fuji_1B.yaml | 76 ++++++++++++++-------------- 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/.github/workflows/_test_fuji_1B.yaml b/.github/workflows/_test_fuji_1B.yaml index 54e72b53d..cd058b59a 100644 --- a/.github/workflows/_test_fuji_1B.yaml +++ b/.github/workflows/_test_fuji_1B.yaml @@ -8,7 +8,7 @@ on: description: Axlearn image from ghcr.io/nvidia default: ghcr.io/nvidia/jax:axlearn required: false - + jobs: single-process-single-node: runs-on: jumpbox @@ -43,49 +43,49 @@ jobs: id: submit shell: bash -O expand_aliases -x -e {0} run: | - alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}' - sshx "date && hostname && sinfo" - sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }} - JOB=$(sshx sbatch --parsable << EOF - #!/bin/bash - #SBATCH --job-name=${{ steps.meta.outputs.JOB_NAME }} - #SBATCH --exclusive - #SBATCH --nodes=${{ steps.meta.outputs.NODES }} - #SBATCH --gpus-per-node=${{ steps.meta.outputs.GPUS_PER_NODE }} - #SBATCH --time=00:30:00 - #SBATCH --output=${{ steps.meta.outputs.LOG_FILE }} + alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}' + sshx "date && hostname && sinfo" + sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }} + JOB=$(sshx sbatch --parsable << EOF + #!/bin/bash + #SBATCH --job-name=${{ steps.meta.outputs.JOB_NAME }} + #SBATCH --exclusive + #SBATCH --nodes=${{ steps.meta.outputs.NODES }} + #SBATCH --gpus-per-node=${{ steps.meta.outputs.GPUS_PER_NODE }} + #SBATCH --time=00:30:00 + #SBATCH --output=${{ steps.meta.outputs.LOG_FILE }} - # preload enroot container using one task per node - time srun \ - --ntasks-per-node=1 \ - --container-name=runtime \ - --container-image=${{ 
steps.meta.outputs.IMAGE }} \ - true + # preload enroot container using one task per node + time srun \ + --ntasks-per-node=1 \ + --container-name=runtime \ + --container-image=${{ steps.meta.outputs.IMAGE }} \ + true - # run job with tasks on each node sharing one container - time srun \ - --ntasks=${{ steps.meta.outputs.TOTAL_TASKS }} \ - --container-name=runtime \ - --container-mounts=${{ steps.meta.outputs.MODEL_PATH }}:/output \ - --container-entrypoint \ - test-fuji-1B.sh - EOF - ) + # run job with tasks on each node sharing one container + time srun \ + --ntasks=${{ steps.meta.outputs.TOTAL_TASKS }} \ + --container-name=runtime \ + --container-mounts=${{ steps.meta.outputs.MODEL_PATH }}:/output \ + --container-entrypoint \ + test-fuji-1B.sh + EOF + ) - echo "SLURM_JOB_ID=${JOB}" >> $GITHUB_OUTPUT + echo "SLURM_JOB_ID=${JOB}" >> $GITHUB_OUTPUT - . .github/workflows/scripts/wait_for_slurm_job.sh + . .github/workflows/scripts/wait_for_slurm_job.sh - wait_for_slurm_job ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} ${JOB} + wait_for_slurm_job ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} ${JOB} - # Gather job info - SLURM_STATE=$(sshx sacct -j $JOB --format=State --parsable2 --noheader |& head -n 1) - SLURM_EXITCODE=$(sshx sacct -j $JOB --format=exitcode --parsable2 --noheader | sort -r -u | head -1 | cut -f 1 -d":" | sed 's/ //g') - echo "SLURM Job state is ${SLURM_STATE}" - echo "SLURM Job exit code is ${SLURM_EXITCODE}" - echo "SLURM_STATE=${SLURM_STATE}" >> "$GITHUB_OUTPUT" - echo "SLURM_EXITCODE=${SLURM_EXITCODE}" >> "$GITHUB_OUTPUT" - set -x + # Gather job info + SLURM_STATE=$(sshx sacct -j $JOB --format=State --parsable2 --noheader |& head -n 1) + SLURM_EXITCODE=$(sshx sacct -j $JOB --format=exitcode --parsable2 --noheader | sort -r -u | head -1 | cut -f 1 -d":" | sed 's/ //g') + echo "SLURM Job state is ${SLURM_STATE}" + echo "SLURM Job exit code is ${SLURM_EXITCODE}" + echo "SLURM_STATE=${SLURM_STATE}" >> 
"$GITHUB_OUTPUT" + echo "SLURM_EXITCODE=${SLURM_EXITCODE}" >> "$GITHUB_OUTPUT" + set -x - name: Remove orphaned SLURM job if the CI job is canceled From 2c8409d8d02847cc32b1fbefe37c64ced6666d43 Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 24 Feb 2025 09:45:37 +0000 Subject: [PATCH 53/89] set k8s jobs to run for 20 min --- .github/eks-workflow-files/axlearn/axlearn-1B-model.yml | 2 ++ .github/eks-workflow-files/axlearn/axlearn-3B-model.yml | 2 ++ 2 files changed, 4 insertions(+) diff --git a/.github/eks-workflow-files/axlearn/axlearn-1B-model.yml b/.github/eks-workflow-files/axlearn/axlearn-1B-model.yml index de1d77aa8..60ac97e3a 100644 --- a/.github/eks-workflow-files/axlearn/axlearn-1B-model.yml +++ b/.github/eks-workflow-files/axlearn/axlearn-1B-model.yml @@ -5,6 +5,8 @@ metadata: labels: kueue.x-k8s.io/queue-name: p5-queue spec: + # the job will run for 20 mins, as we can' tset max_steps + activeDeadlineSeconds: 1200 completions: 1 parallelism: 1 template: diff --git a/.github/eks-workflow-files/axlearn/axlearn-3B-model.yml b/.github/eks-workflow-files/axlearn/axlearn-3B-model.yml index 419d8bb0b..2461c097a 100644 --- a/.github/eks-workflow-files/axlearn/axlearn-3B-model.yml +++ b/.github/eks-workflow-files/axlearn/axlearn-3B-model.yml @@ -5,6 +5,8 @@ metadata: labels: kueue.x-k8s.io/queue-name: p5-queue spec: + # the job will run for 20 mins, as we can' tset max_steps + activeDeadlineSeconds: 1200 completions: 1 parallelism: 1 template: From 477735935844ff90204dd8e75200ef875863d1aa Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 24 Feb 2025 11:57:13 +0000 Subject: [PATCH 54/89] try a test on fuji 7B params --- .github/container/Dockerfile.axlearn | 1 + .github/container/test-fuji-1B.sh | 2 +- .../axlearn/axlearn-3B-model.yml | 2 +- .github/workflows/_ci.yaml | 14 ++- .github/workflows/_test_fuji_7B.yaml | 106 ++++++++++++++++++ 5 files changed, 120 insertions(+), 5 deletions(-) mode change 100644 => 100755 .github/container/test-fuji-1B.sh create mode 
100644 .github/workflows/_test_fuji_7B.yaml diff --git a/.github/container/Dockerfile.axlearn b/.github/container/Dockerfile.axlearn index c441bdc68..5c51697a2 100644 --- a/.github/container/Dockerfile.axlearn +++ b/.github/container/Dockerfile.axlearn @@ -35,6 +35,7 @@ EOF ADD test-axlearn.sh /usr/local/bin ADD test-fuji-1B.sh /usr/local/bin +ADD test-fuji-7B.sh /usr/local/bin ############################################################################### ## Install accumulated packages from the base image and the previous stage ############################################################################### diff --git a/.github/container/test-fuji-1B.sh b/.github/container/test-fuji-1B.sh old mode 100644 new mode 100755 index 94042de13..9cd9faebd --- a/.github/container/test-fuji-1B.sh +++ b/.github/container/test-fuji-1B.sh @@ -1,6 +1,6 @@ #! /bin/bash BASEDIR="/opt/host/" -CONFIG="fuji-7B-v3-flash" +CONFIG="fuji-1B-v3-flash" POSTFIX=${POSTFIX:=""} diff --git a/.github/eks-workflow-files/axlearn/axlearn-3B-model.yml b/.github/eks-workflow-files/axlearn/axlearn-3B-model.yml index 2461c097a..39fbce6be 100644 --- a/.github/eks-workflow-files/axlearn/axlearn-3B-model.yml +++ b/.github/eks-workflow-files/axlearn/axlearn-3B-model.yml @@ -39,7 +39,7 @@ spec: --xla_gpu_enable_pipelined_reduce_scatter=true --xla_gpu_enable_nccl_comm_splitting=false" - export XLA_PYTHON_CLEINT_PREALLOCATE=false + export XLA_PYTHON_CLIENT_PREALLOCATE=false export TF_GPU_ALLOCATOR=cuda_malloc_async export XLA_FLAGS="${XLA_BASE_FLAGS}" diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 38e112e24..6634b9661 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -761,15 +761,23 @@ jobs: "badge-axlearn-test" summary.txt - test-axlearn-fuji-1B-slurm: + # test-axlearn-fuji-1B-slurm: + # needs: build-axlearn + # if: inputs.ARCHITECTURE == 'amd64' + # uses: ./.github/workflows/_test_fuji_1B.yaml + # with: + # AXLEARN_DOCKER_IMAGE: ${{ 
needs.build-axlearn.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit + + test-axlearn-fuji-7B-slurm: needs: build-axlearn if: inputs.ARCHITECTURE == 'amd64' - uses: ./.github/workflows/_test_fuji_1B.yaml + uses: ./.github/workflows/_test_fuji_7B.yaml with: AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }} secrets: inherit - test-axlearn-fuji-1B: + test-axlearn-fuji-1B-eks: needs: build-axlearn if: inputs.ARCHITECTURE == 'amd64' runs-on: eks diff --git a/.github/workflows/_test_fuji_7B.yaml b/.github/workflows/_test_fuji_7B.yaml new file mode 100644 index 000000000..544de815a --- /dev/null +++ b/.github/workflows/_test_fuji_7B.yaml @@ -0,0 +1,106 @@ +name: ~test MaxText functionality + +on: + workflow_call: + inputs: + AXLEARN_DOCKER_IMAGE: + type: string + description: Axlearn image from ghcr.io/nvidia + default: ghcr.io/nvidia/jax:axlearn + required: false + +jobs: + single-process-single-node: + runs-on: jumpbox + steps: + - name: Check out the repository under ${GITHUB_WORKSPACE} + uses: actions/checkout@v4 + + - name: Setup SSH + id: setup-ssh + uses: ./.github/actions/setup-ssh + with: + ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} + ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }} + + - name: Labels and metadata + id: meta + shell: bash -x -e {0} + run: | + IMAGE="$(echo ${{inputs.AXLEARN_IMAGE}} | sed 's/\//#/')" + TOTAL_TASKS=1 + MAX_GPUS_PER_NODE=8 + NODES=1 + GPUS_PER_NODE=8 + JOB_NAME=axlearn-fuji-7B-${GITHUB_RUN_ID}-${TEST_CASE_NAME} + LOG_FILE=/nfs/cluster/${JOB_NAME}.log + MODEL_PATH=/nfs/cluster/${JOB_NAME} + for var in IMAGE TEST_CASE_NAME TOTAL_TASKS NODES GPUS_PER_NODE JOB_NAME LOG_FILE MODEL_PATH; do + echo "$var=${!var}" >> $GITHUB_OUTPUT + done + + - name: Submit SLURM jobs over SSH + id: submit + shell: bash -O expand_aliases -x -e {0} + run: | + alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}' + sshx "date && hostname && sinfo" + sshx mkdir -p ${{ 
steps.meta.outputs.MODEL_PATH }} + JOB=$(sshx sbatch --parsable << EOF + #!/bin/bash + #SBATCH --job-name=${{ steps.meta.outputs.JOB_NAME }} + #SBATCH --exclusive + #SBATCH --nodes=${{ steps.meta.outputs.NODES }} + #SBATCH --gpus-per-node=${{ steps.meta.outputs.GPUS_PER_NODE }} + #SBATCH --time=00:30:00 + #SBATCH --output=${{ steps.meta.outputs.LOG_FILE }} + + # preload enroot container using one task per node + time srun \ + --ntasks-per-node=1 \ + --container-name=runtime \ + --container-image=${{ steps.meta.outputs.IMAGE }} \ + true + + # run job with tasks on each node sharing one container + time srun \ + --ntasks=${{ steps.meta.outputs.TOTAL_TASKS }} \ + --container-name=runtime \ + --container-mounts=${{ steps.meta.outputs.MODEL_PATH }}:/output \ + --container-entrypoint \ + test-fuji-7B.sh + EOF + ) + + echo "SLURM_JOB_ID=${JOB}" >> $GITHUB_OUTPUT + + . .github/workflows/scripts/wait_for_slurm_job.sh + + wait_for_slurm_job ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} ${JOB} + + # Gather job info + SLURM_STATE=$(sshx sacct -j $JOB --format=State --parsable2 --noheader |& head -n 1) + SLURM_EXITCODE=$(sshx sacct -j $JOB --format=exitcode --parsable2 --noheader | sort -r -u | head -1 | cut -f 1 -d":" | sed 's/ //g') + echo "SLURM Job state is ${SLURM_STATE}" + echo "SLURM Job exit code is ${SLURM_EXITCODE}" + echo "SLURM_STATE=${SLURM_STATE}" >> "$GITHUB_OUTPUT" + echo "SLURM_EXITCODE=${SLURM_EXITCODE}" >> "$GITHUB_OUTPUT" + set -x + + + - name: Remove orphaned SLURM job if the CI job is canceled + if: cancelled() + shell: bash -x -e {0} + run: | + ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \ + scancel ${{ steps.submit.outputs.SLURM_JOB_ID }} + + - name: Write SLURM job status to file + shell: bash -x -e {0} + run: | + python << EOF + import json + with open("output/${{ steps.meta.outputs.TEST_CASE_NAME }}-status.json", "w") as f: + dump = {'state': "${{ steps.submit.outputs.SLURM_STATE }}", 'exitcode': "${{ 
steps.submit.outputs.SLURM_EXITCODE }}"} + json.dump(dump, f) + EOF \ No newline at end of file From 1f3e1e426c043ca0b0d98a2164f9f03d3bbc51cd Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 24 Feb 2025 12:39:14 +0000 Subject: [PATCH 55/89] upload test script for testing --- .github/container/test-fuji-7B.sh | 40 +++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100755 .github/container/test-fuji-7B.sh diff --git a/.github/container/test-fuji-7B.sh b/.github/container/test-fuji-7B.sh new file mode 100755 index 000000000..e2ff8dde6 --- /dev/null +++ b/.github/container/test-fuji-7B.sh @@ -0,0 +1,40 @@ +#! /bin/bash +BASEDIR="/opt/host/" +CONFIG="fuji-7B-v2-flash" +POSTFIX=${POSTFIX:=""} + + +export XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true + --xla_gpu_graph_level=0 + --xla_gpu_enable_highest_priority_async_stream=true + --xla_gpu_all_reduce_combine_threshold_bytes=1073741824 + --xla_gpu_all_gather_combine_threshold_bytes=1073741824 + --xla_gpu_reduce_scatter_combine_threshold_bytes=1073741824 + --xla_gpu_enable_pipelined_all_gather=true + --xla_gpu_enable_pipelined_reduce_scatter=true + --xla_gpu_enable_pipelined_all_reduce=true + --xla_gpu_enable_while_loop_double_buffering=true + --xla_gpu_enable_triton_gemm=false + --xla_gpu_enable_all_gather_combine_by_dim=false + --xla_gpu_enable_reduce_scatter_combine_by_dim=false + --xla_disable_hlo_passes=rematerialization" + +export XLA_PYTHON_CLIENT_PREALLOCATE=false +export TF_GPU_ALLOCATOR=cuda_malloc_async +export NCCL_BUFFSIZE=8388608 +export NCCL_P2P_NET_CHUNKSIZE=524288 +export NCCL_LAUNCH_MODE=GROUP +export NCCL_DEBUG=INFO +LOG_DIF=${BASEDIR}/logs +TRAINER_DIR=${LOG_DIF}/${CONFIG}_N${SLURM_JOB_NUM_NODES}_n${SLURM_NTASKS}/trainer-logs +mkdir -p ${TRAINER_DIR} + +export MP_ARGS="--num_processes=${SLURM_NTASKS} --distributed_coordinator=127.0.0.1:8080 --process_id=${SLURM_PROCID}" + +python3 -m axlearn.common.launch_trainer_main \ + --module=text.gpt.c4_trainer \ + 
--config=${CONFIG} \ + --trainer_dir=${TRAINER_DIR} \ + --data_dir=gs://axlearn-public/tensorflow_datasets \ + --jax_backend=gpu \ + ${MP_ARGS} From ea2a2651112a2abd5d7ce847170d11a4cddefa69 Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 24 Feb 2025 14:45:35 +0000 Subject: [PATCH 56/89] reset the 7B --- .github/container/Dockerfile.axlearn | 9 +++++++-- .github/container/test-fuji-1B.sh | 2 +- .github/workflows/_ci.yaml | 13 ++----------- 3 files changed, 10 insertions(+), 14 deletions(-) diff --git a/.github/container/Dockerfile.axlearn b/.github/container/Dockerfile.axlearn index 5c51697a2..2c6dad33f 100644 --- a/.github/container/Dockerfile.axlearn +++ b/.github/container/Dockerfile.axlearn @@ -24,8 +24,13 @@ portpicker==1.6.0 seqio==0.0.18 protobuf==3.20.3 pytest>=7.4.3 -tensorflow-cpu REQUIREMENTS + # Only append "tensorflow-cpu" if running on x86_64 + if [ "$(uname -m)" = "x86_64" ]; then + echo "tensorflow-cpu" >> /opt/pip-tools.d/requirements-axlearn.in + else + echo "Skipping TF on $(uname -m)" + fi EOF @@ -35,7 +40,7 @@ EOF ADD test-axlearn.sh /usr/local/bin ADD test-fuji-1B.sh /usr/local/bin -ADD test-fuji-7B.sh /usr/local/bin + ############################################################################### ## Install accumulated packages from the base image and the previous stage ############################################################################### diff --git a/.github/container/test-fuji-1B.sh b/.github/container/test-fuji-1B.sh index 9cd9faebd..8208b4003 100755 --- a/.github/container/test-fuji-1B.sh +++ b/.github/container/test-fuji-1B.sh @@ -24,7 +24,7 @@ export TF_GPU_ALLOCATOR=cuda_malloc_async export NCCL_BUFFSIZE=8388608 export NCCL_P2P_NET_CHUNKSIZE=524288 export NCCL_LAUNCH_MODE=GROUP -export NCCL_DEBUG=INFO + LOG_DIF=${BASEDIR}/logs TRAINER_DIR=${LOG_DIF}/${CONFIG}_N${SLURM_JOB_NUM_NODES}_n${SLURM_NTASKS}/trainer-logs mkdir -p ${TRAINER_DIR} diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 
6634b9661..eef5a78bb 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -761,18 +761,10 @@ jobs: "badge-axlearn-test" summary.txt - # test-axlearn-fuji-1B-slurm: - # needs: build-axlearn - # if: inputs.ARCHITECTURE == 'amd64' - # uses: ./.github/workflows/_test_fuji_1B.yaml - # with: - # AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }} - # secrets: inherit - - test-axlearn-fuji-7B-slurm: + test-axlearn-fuji-1B-slurm: needs: build-axlearn if: inputs.ARCHITECTURE == 'amd64' - uses: ./.github/workflows/_test_fuji_7B.yaml + uses: ./.github/workflows/_test_fuji_1B.yaml with: AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }} secrets: inherit @@ -820,4 +812,3 @@ jobs: if: ${{ always() }} with: token-name: ${{ env.TOKEN_NAME }} - From 5fd34003adf20484ba3f3fd82d9a990f3b964d08 Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 24 Feb 2025 17:31:55 +0000 Subject: [PATCH 57/89] address comments --- .github/actions/ghcr-login/action.yml | 31 ----- .../action.yml | 8 +- .../action.yml | 19 ++- .github/actions/submit-k8s-job/action.yml | 40 ------- .github/actions/with-post-step/action.yml | 2 +- .github/container/Dockerfile.axlearn | 9 +- .github/container/test-axlearn.sh | 31 +---- .github/container/test-fuji-1B.sh | 12 +- .github/container/test-fuji-7B.sh | 40 ------- .../axlearn/axlearn-1B-model.yml | 4 +- .../axlearn/axlearn-3B-model.yml | 80 ------------- .github/workflows/_ci.yaml | 77 ++++-------- .github/workflows/_test_fuji_1B.yaml | 106 ----------------- .github/workflows/_test_fuji_7B.yaml | 106 ----------------- .github/workflows/_test_nccl.yaml | 110 ++++++++++++++++++ .github/workflows/nccl-k8s.yaml | 109 +---------------- 16 files changed, 175 insertions(+), 609 deletions(-) delete mode 100644 .github/actions/ghcr-login/action.yml rename .github/actions/{delete-ghcr-token => store-delete-k8s-ghcr}/action.yml (53%) rename .github/actions/{delete-k8s-job => submit-delete-k8s-job}/action.yml 
(51%) delete mode 100644 .github/actions/submit-k8s-job/action.yml delete mode 100755 .github/container/test-fuji-7B.sh delete mode 100644 .github/eks-workflow-files/axlearn/axlearn-3B-model.yml delete mode 100644 .github/workflows/_test_fuji_1B.yaml delete mode 100644 .github/workflows/_test_fuji_7B.yaml create mode 100644 .github/workflows/_test_nccl.yaml diff --git a/.github/actions/ghcr-login/action.yml b/.github/actions/ghcr-login/action.yml deleted file mode 100644 index 2c62591ed..000000000 --- a/.github/actions/ghcr-login/action.yml +++ /dev/null @@ -1,31 +0,0 @@ -name: Checkout, GHCR login, K8s secret -description: Performs repository checkout, logs into GitHub Container Registry, and stores the token as a Kubernetes secret. - -inputs: - docker-username: - description: Username for GHCR - required: true - docker-password: - description: Password (e.g., GITHUB_TOKEN) - required: true - token-name: - description: Name of the K8s secret to create - required: true - -runs: - using: "composite" - steps: - - name: Login to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: "ghcr.io" - username: ${{ inputs.docker-username }} - password: ${{ inputs.docker-password }} - - - name: Store GitHub Container Registry token as Kubernetes secret - shell: bash - run: | - kubectl create secret generic \ - ${{ inputs.token-name }} \ - --from-file=.dockerconfigjson=$HOME/.docker/config.json \ - --type=kubernetes.io/dockerconfigjson diff --git a/.github/actions/delete-ghcr-token/action.yml b/.github/actions/store-delete-k8s-ghcr/action.yml similarity index 53% rename from .github/actions/delete-ghcr-token/action.yml rename to .github/actions/store-delete-k8s-ghcr/action.yml index c6069908b..51eb8b625 100644 --- a/.github/actions/delete-ghcr-token/action.yml +++ b/.github/actions/store-delete-k8s-ghcr/action.yml @@ -13,6 +13,10 @@ runs: uses: ./.github/actions/with-post-step with: main: | - echo "Main post step action: no action required" + # Store 
GitHub Container Registry token as Kubernetes secret + kubectl create secret generic \ + ${{ inputs.token-name }} \ + --from-file=.dockerconfigjson=$HOME/.docker/config.json \ + --type=kubernetes.io/dockerconfigjson post: | - kubectl delete secret ${{ inputs.token-name }} + kubectl delete secret ${{ inputs.token-name }} \ No newline at end of file diff --git a/.github/actions/delete-k8s-job/action.yml b/.github/actions/submit-delete-k8s-job/action.yml similarity index 51% rename from .github/actions/delete-k8s-job/action.yml rename to .github/actions/submit-delete-k8s-job/action.yml index 74f1e3129..e97d4b921 100644 --- a/.github/actions/delete-k8s-job/action.yml +++ b/.github/actions/submit-delete-k8s-job/action.yml @@ -5,6 +5,9 @@ inputs: job-name: description: The job name to delete required: true + job-config-file: + description: Path to the Kubernetes job YAML + required: true runs: @@ -14,7 +17,21 @@ runs: uses: ./.github/actions/with-post-step with: main: | - echo "Main post step action: no action required" + echo "Submit K8s job" + kubectl apply -f "${{ inputs.job-config-file }}" + # wait for the job to be created + kubectl wait --for=create job/${{ inputs.job-name }} --timeout=60s + + # wait for the 'spec.suspend' field to become false. Necessary for kueue + kubectl wait --for=jsonpath='{.spec.suspend}=false' job/${{ inputs.job-name }} --timeout=7200s + + while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ inputs.job-name }} --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do + echo "Waiting for pods to start..." 
+ sleep 20 + done + + # stream the logs + kubectl logs --all-containers=true --all-pods=true --follow job/${{ inputs.job-name }} post: | pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ inputs.job-name }} -o jsonpath='{.items[*].metadata.name}') diff --git a/.github/actions/submit-k8s-job/action.yml b/.github/actions/submit-k8s-job/action.yml deleted file mode 100644 index 49ddad748..000000000 --- a/.github/actions/submit-k8s-job/action.yml +++ /dev/null @@ -1,40 +0,0 @@ -name: Submit & Stream K8s Job -description: Submits a Kubernetes job and then streams its logs to GitHub Actions. - -inputs: - job-config-file: - description: Path to the Kubernetes job YAML - required: true - job-name: - description: The job name - required: true - -runs: - using: "composite" - steps: - - name: Submit Kubernetes job - shell: bash - run: | - kubectl apply -f "${{ inputs.job-config-file }}" - - - name: Wait for job to be un-suspended (Kueue) - shell: bash - run: | - # wait for the job to be created - kubectl wait --for=create job/${{ inputs.job-name }} --timeout=60s - - # wait for the 'spec.suspend' field to become false. Necessary for kueue - kubectl wait --for=jsonpath='{.spec.suspend}=false' job/${{ inputs.job-name }} --timeout=3600s - - - name: Wait for pods to start - shell: bash - run: | - while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ inputs.job-name }} --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do - echo "Waiting for pods to start..." 
- sleep 20 - done - - - name: Stream Kubernetes job output - shell: bash - run: | - kubectl logs --all-containers=true --all-pods=true --follow job/${{ inputs.job-name }} diff --git a/.github/actions/with-post-step/action.yml b/.github/actions/with-post-step/action.yml index 9816ee888..69c2a6eff 100644 --- a/.github/actions/with-post-step/action.yml +++ b/.github/actions/with-post-step/action.yml @@ -39,4 +39,4 @@ inputs: runs: using: 'node20' main: 'main.js' - post: 'main.js' \ No newline at end of file + post: 'main.js' diff --git a/.github/container/Dockerfile.axlearn b/.github/container/Dockerfile.axlearn index 2c6dad33f..039f767ee 100644 --- a/.github/container/Dockerfile.axlearn +++ b/.github/container/Dockerfile.axlearn @@ -10,10 +10,10 @@ FROM ${BASE_IMAGE} AS mealkit ARG URLREF_AXLEARN ARG SRC_PATH_AXLEARN -RUN <<"EOF" bash -ex - git clone "${URLREF_AXLEARN}" "${SRC_PATH_AXLEARN}" -EOF +RUN git-clone.sh "${URLREF_AXLEARN}" "${SRC_PATH_AXLEARN}" +# these packages are needed to run axlearn tests +# https://github.com/apple/axlearn/blob/main/pyproject.toml as reference RUN <<"EOF" bash -ex echo "-e ${SRC_PATH_AXLEARN}" > /opt/pip-tools.d/requirements-axlearn.in cat <> /opt/pip-tools.d/requirements-axlearn.in @@ -38,8 +38,7 @@ EOF ## Add test script to the path ############################################################################### -ADD test-axlearn.sh /usr/local/bin -ADD test-fuji-1B.sh /usr/local/bin +ADD test-axlearn.sh test-fuji-1B.sh /usr/local/bin/ ############################################################################### ## Install accumulated packages from the base image and the previous stage diff --git a/.github/container/test-axlearn.sh b/.github/container/test-axlearn.sh index c62f36f5b..d5e783f56 100755 --- a/.github/container/test-axlearn.sh +++ b/.github/container/test-axlearn.sh @@ -10,19 +10,16 @@ usage() { echo " OPTIONS DESCRIPTION" echo " -d, --directory DIR Directory to run tests in." echo " Default: 'axlearn/axlearn/common'." 
- echo " -c, --cuda-devices DEVICES CUDA devices to use. Default: '0,1,2,3,4,5,6,7'." echo " -t, --test-files FILES Pattern for test files to run." echo " Default: '*_test.py'." echo " -o, --output DIRECTORY Output directory for logs and summary." echo " Default: 'test_runs/'." - echo " -k, --k8s Whether to run on a Kubernetes cluster." echo " -h, --help Show this help message and exit." exit 1 } # Default values DIR='axlearn/axlearn/common' -CUDA_DEVICES='0,1,2,3,4,5,6,7' TEST_FILES=() OUTPUT_DIRECTORY='' K8S=false @@ -39,14 +36,6 @@ while [[ $# -gt 0 ]]; do DIR="$2" shift 2 ;; - -c|--cuda-devices) - if [[ -z "$2" ]]; then - echo "Error: --cuda-devices requires an argument." - usage - fi - CUDA_DEVICES="$2" - shift 2 - ;; -t|--test-files) shift # Collect all arguments until the next option (starting with '-') @@ -69,10 +58,6 @@ while [[ $# -gt 0 ]]; do OUTPUT_DIRECTORY="$2" shift 2 ;; - -k|--k8s) - K8S=true - shift - ;; -h|--help) usage ;; @@ -95,7 +80,6 @@ mkdir -p "${LOG_DIRECTORY}" # Print out config for sanity check echo "Configuration:" echo " Directory: $DIR" -echo " CUDA Devices: $CUDA_DEVICES" if [ "${#TEST_FILES[@]}" -gt 0 ]; then echo " Test Files:" for f in "${TEST_FILES[@]}"; do @@ -106,23 +90,14 @@ else fi echo " Output Directory: $OUTPUT_DIRECTORY" echo " Kubernetes mode: $K8S" -echo "" - cd "$DIR" || exit 1 -# Set CUDA devices -export CUDA_VISIBLE_DEVICES="${CUDA_DEVICES}" -echo "Using CUDA devices: $CUDA_VISIBLE_DEVICES" - echo "Running tests..." 
-# If we are on Kubernetes, install torch for cpu only -if [ "$K8S" = true ]; then - pip install torch --extra-index-url https://download.pytorch.org/whl/cpu - pip install transformers - pip install scikit-learn timm -fi +pip install torch --extra-index-url https://download.pytorch.org/whl/cpu +pip install transformers scikit-learn timm + if [ "${#TEST_FILES[@]}" -eq 0 ]; then TEST_FILES=("*_test.py") diff --git a/.github/container/test-fuji-1B.sh b/.github/container/test-fuji-1B.sh index 8208b4003..9018f37de 100755 --- a/.github/container/test-fuji-1B.sh +++ b/.github/container/test-fuji-1B.sh @@ -3,9 +3,7 @@ BASEDIR="/opt/host/" CONFIG="fuji-1B-v3-flash" POSTFIX=${POSTFIX:=""} - -export XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true - --xla_gpu_graph_level=0 +BASE_XLA_FLAGS=${BASE_XLA_FLAGS:---xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_highest_priority_async_stream=true --xla_gpu_all_reduce_combine_threshold_bytes=1073741824 --xla_gpu_all_gather_combine_threshold_bytes=1073741824 @@ -17,13 +15,9 @@ export XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_triton_gemm=false --xla_gpu_enable_all_gather_combine_by_dim=false --xla_gpu_enable_reduce_scatter_combine_by_dim=false - --xla_disable_hlo_passes=rematerialization" + --xla_disable_hlo_passes=rematerialization} -export XLA_PYTHON_CLIENT_PREALLOCATE=false -export TF_GPU_ALLOCATOR=cuda_malloc_async -export NCCL_BUFFSIZE=8388608 -export NCCL_P2P_NET_CHUNKSIZE=524288 -export NCCL_LAUNCH_MODE=GROUP +export XLA_FLAGS="$BASE_XLA_FLAGS ${XLA_FLAGS:-}" LOG_DIF=${BASEDIR}/logs TRAINER_DIR=${LOG_DIF}/${CONFIG}_N${SLURM_JOB_NUM_NODES}_n${SLURM_NTASKS}/trainer-logs diff --git a/.github/container/test-fuji-7B.sh b/.github/container/test-fuji-7B.sh deleted file mode 100755 index e2ff8dde6..000000000 --- a/.github/container/test-fuji-7B.sh +++ /dev/null @@ -1,40 +0,0 @@ -#! 
/bin/bash -BASEDIR="/opt/host/" -CONFIG="fuji-7B-v2-flash" -POSTFIX=${POSTFIX:=""} - - -export XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true - --xla_gpu_graph_level=0 - --xla_gpu_enable_highest_priority_async_stream=true - --xla_gpu_all_reduce_combine_threshold_bytes=1073741824 - --xla_gpu_all_gather_combine_threshold_bytes=1073741824 - --xla_gpu_reduce_scatter_combine_threshold_bytes=1073741824 - --xla_gpu_enable_pipelined_all_gather=true - --xla_gpu_enable_pipelined_reduce_scatter=true - --xla_gpu_enable_pipelined_all_reduce=true - --xla_gpu_enable_while_loop_double_buffering=true - --xla_gpu_enable_triton_gemm=false - --xla_gpu_enable_all_gather_combine_by_dim=false - --xla_gpu_enable_reduce_scatter_combine_by_dim=false - --xla_disable_hlo_passes=rematerialization" - -export XLA_PYTHON_CLIENT_PREALLOCATE=false -export TF_GPU_ALLOCATOR=cuda_malloc_async -export NCCL_BUFFSIZE=8388608 -export NCCL_P2P_NET_CHUNKSIZE=524288 -export NCCL_LAUNCH_MODE=GROUP -export NCCL_DEBUG=INFO -LOG_DIF=${BASEDIR}/logs -TRAINER_DIR=${LOG_DIF}/${CONFIG}_N${SLURM_JOB_NUM_NODES}_n${SLURM_NTASKS}/trainer-logs -mkdir -p ${TRAINER_DIR} - -export MP_ARGS="--num_processes=${SLURM_NTASKS} --distributed_coordinator=127.0.0.1:8080 --process_id=${SLURM_PROCID}" - -python3 -m axlearn.common.launch_trainer_main \ - --module=text.gpt.c4_trainer \ - --config=${CONFIG} \ - --trainer_dir=${TRAINER_DIR} \ - --data_dir=gs://axlearn-public/tensorflow_datasets \ - --jax_backend=gpu \ - ${MP_ARGS} diff --git a/.github/eks-workflow-files/axlearn/axlearn-1B-model.yml b/.github/eks-workflow-files/axlearn/axlearn-1B-model.yml index 60ac97e3a..76b767089 100644 --- a/.github/eks-workflow-files/axlearn/axlearn-1B-model.yml +++ b/.github/eks-workflow-files/axlearn/axlearn-1B-model.yml @@ -5,7 +5,7 @@ metadata: labels: kueue.x-k8s.io/queue-name: p5-queue spec: - # the job will run for 20 mins, as we can' tset max_steps + # the job will run for 20 mins, as we can't set max_steps activeDeadlineSeconds: 
1200 completions: 1 parallelism: 1 @@ -13,7 +13,7 @@ spec: spec: restartPolicy: Never containers: - - name: axlearn + - name: axlearn-fuji-1B image: PLACEHOLDER command: - bash diff --git a/.github/eks-workflow-files/axlearn/axlearn-3B-model.yml b/.github/eks-workflow-files/axlearn/axlearn-3B-model.yml deleted file mode 100644 index 39fbce6be..000000000 --- a/.github/eks-workflow-files/axlearn/axlearn-3B-model.yml +++ /dev/null @@ -1,80 +0,0 @@ -apiVersion: batch/v1 -kind: Job -metadata: - name: PLACEHOLDER - labels: - kueue.x-k8s.io/queue-name: p5-queue -spec: - # the job will run for 20 mins, as we can' tset max_steps - activeDeadlineSeconds: 1200 - completions: 1 - parallelism: 1 - template: - spec: - restartPolicy: Never - containers: - - name: axlearn - image: PLACEHOLDER - command: - - bash - - -xo - - pipefail - - -c - - | - - BASEDIR="/opt/axlearn" - CONFIG="fuji-3B-v3-flash-single-host" - HLO_DUMP=0 - POSTFIX="" - - AR_THRESHOLD=1073741824 - AG_THRESHOLD=8589934592 - RS_THRESHOLD=8589934592 - XLA_BASE_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true - --xla_gpu_enable_triton_gemm=false - --xla_gpu_enable_highest_priority_async_stream=true - --xla_gpu_all_gather_combine_threshold_bytes=${AG_THRESHOLD} - --xla_gpu_reduce_scatter_combine_threshold_bytes=${RS_THRESHOLD} - --xla_gpu_enable_pipelined_all_gather=true - --xla_gpu_enable_pipelined_reduce_scatter=true - --xla_gpu_enable_nccl_comm_splitting=false" - - export XLA_PYTHON_CLIENT_PREALLOCATE=false - export TF_GPU_ALLOCATOR=cuda_malloc_async - export XLA_FLAGS="${XLA_BASE_FLAGS}" - - export NCCL_BUFFSIZE=8388608 - export NCCL_P2P_NET_CHUNKSIZE=524288 - export NCCL_LAUNCH_MODE=GROUP - export NCCL_DEBUG=INFO - - LOG_DIR=${BASEDIR}/logs - TRAINER_DIR=${LOG_DIR}/${CONFIG}${POSTFIX}-eks/trainer-dir - mkdir -p ${TRAINER_DIR} - - echo "Executing TF" - cat << EOF > tf_fix_gpu.py - import tensorflow as tf - tf.config.set_visible_devices([], 'GPU') - import runpy - 
runpy.run_module('axlearn.common.launch_trainer_main', run_name='__main__') - EOF - - python3 tf_fix_gpu.py \ - --module=text.gpt.c4_trainer \ - --config=${CONFIG} \ - --trainer_dir=${TRAINER_DIR} \ - --data_dir=gs://axlearn-public/tensorflow_datasets \ - --jax_backend=gpu - - resources: - limits: - nvidia.com/gpu: 8 - volumeMounts: - - name: output - mountPath: /opt/output - imagePullSecrets: - - name: PLACEHOLDER - volumes: - - name: output - emptyDir: {} diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index eef5a78bb..d5d0005af 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -660,16 +660,17 @@ jobs: JOB_NAME: axlearn-${{ github.run_id }} TOKEN_NAME: axlearn-${{ github.run_id }}-token steps: - - name: Set date env var for saving files - run: | - echo "DATE_TEST_RAN=$(date +'%Y-%m-%d-%H-%M-%S')" >> $GITHUB_ENV - name: Check out the repository uses: actions/checkout@v4 - - name: GHCR Login - uses: ./.github/actions/ghcr-login + - name: Login to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + - name: K8s GHCR login and delete + uses: ./.github/actions/store-delete-k8s-ghcr with: - docker-username: ${{ github.repository_owner }} - docker-password: ${{ secrets.GITHUB_TOKEN }} token-name: ${{ env.TOKEN_NAME }} - name: Configure axlearn test job run: | @@ -677,34 +678,20 @@ jobs: yq -i ea ' select(di == 0).metadata.name = strenv(JOB_NAME) | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE) - | select(di == 0).spec.template.spec.containers[1].env[0].value = strenv(DATE_TEST_RAN) + | select(di == 0).spec.template.spec.containers[1].env[0].value = "${{ github.run_id }}" | select(di == 0).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)' \ .github/eks-workflow-files/axlearn/axlearn-job.yml git diff .github/eks-workflow-files/axlearn/axlearn-job.yml - - - 
name: Submit & wait for axlearn test job - uses: ./.github/actions/submit-k8s-job + - name: Submit & delete axlearn test + uses: ./.github/actions/submit-delete-k8s-job with: job-config-file: ".github/eks-workflow-files/axlearn/axlearn-job.yml" job-name: ${{ env.JOB_NAME }} - - - name: Delete axlearn test job - uses: ./.github/actions/delete-k8s-job - if: ${{ always() }} - with: - job-name: ${{ env.JOB_NAME }} - - - name: Delete GitHub Container Registry token - uses: ./.github/actions/delete-ghcr-token - if: ${{ always() }} - with: - token-name: ${{ env.TOKEN_NAME }} - - name: Download logs from S3 id: log-s3 run: | mkdir -p /tmp/axlearn-output - aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ env.DATE_TEST_RAN }}/summary.txt /tmp/axlearn-output/ + aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/summary.txt /tmp/axlearn-output/ passed_tests=$(grep -c ": PASSED" /tmp/axlearn-output/summary.txt || true) failed_tests=$(grep -c ": FAILED" /tmp/axlearn-output/summary.txt || true) @@ -713,7 +700,6 @@ jobs: echo "Passed tests: $passed_tests" echo "Failed tests: $failed_tests" echo "Total tests: $total_tests" - echo "PASSED_TESTS=$passed_tests" >> $GITHUB_OUTPUT echo "FAILED_TESTS=$failed_tests" >> $GITHUB_OUTPUT echo "TOTAL_TESTS=$total_tests" >> $GITHUB_OUTPUT @@ -761,14 +747,8 @@ jobs: "badge-axlearn-test" summary.txt - test-axlearn-fuji-1B-slurm: - needs: build-axlearn - if: inputs.ARCHITECTURE == 'amd64' - uses: ./.github/workflows/_test_fuji_1B.yaml - with: - AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }} - secrets: inherit - + # the fuji test will run for 20 minutes only, as per 2025-02-24 + # is not possible to set the `max_steps` value test-axlearn-fuji-1B-eks: needs: build-axlearn if: inputs.ARCHITECTURE == 'amd64' @@ -780,11 +760,15 @@ jobs: steps: - name: Check out the repository uses: actions/checkout@v4 - - name: GHCR Login - uses: ./.github/actions/ghcr-login + - name: Login to GitHub Container Registry + 
uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + - name: K8s GHCR login and delete + uses: ./.github/actions/store-delete-k8s-ghcr with: - docker-username: ${{ github.repository_owner }} - docker-password: ${{ secrets.GITHUB_TOKEN }} token-name: ${{ env.TOKEN_NAME }} - name: Configure axlearn test job run: | @@ -795,20 +779,9 @@ jobs: .github/eks-workflow-files/axlearn/axlearn-1B-model.yml git diff .github/eks-workflow-files/axlearn/axlearn-1B-model.yml - - name: Submit & wait for axlearn test job - uses: ./.github/actions/submit-k8s-job + - name: Submit & delete axlearn test + uses: ./.github/actions/submit-delete-k8s-job with: - job-config-file: ".github/eks-workflow-files/axlearn/axlearn-1B-model.yml" + job-config-file: ".github/eks-workflow-files/axlearn/axlearn-1B-model.yml" job-name: ${{ env.JOB_NAME }} - - name: Delete axlearn test job - uses: ./.github/actions/delete-k8s-job - if: ${{ always() }} - with: - job-name: ${{ env.JOB_NAME }} - - - name: Delete GitHub Container Registry token - uses: ./.github/actions/delete-ghcr-token - if: ${{ always() }} - with: - token-name: ${{ env.TOKEN_NAME }} diff --git a/.github/workflows/_test_fuji_1B.yaml b/.github/workflows/_test_fuji_1B.yaml deleted file mode 100644 index cd058b59a..000000000 --- a/.github/workflows/_test_fuji_1B.yaml +++ /dev/null @@ -1,106 +0,0 @@ -name: ~test MaxText functionality - -on: - workflow_call: - inputs: - AXLEARN_DOCKER_IMAGE: - type: string - description: Axlearn image from ghcr.io/nvidia - default: ghcr.io/nvidia/jax:axlearn - required: false - -jobs: - single-process-single-node: - runs-on: jumpbox - steps: - - name: Check out the repository under ${GITHUB_WORKSPACE} - uses: actions/checkout@v4 - - - name: Setup SSH - id: setup-ssh - uses: ./.github/actions/setup-ssh - with: - ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} - ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }} - - - name: Labels and 
metadata - id: meta - shell: bash -x -e {0} - run: | - IMAGE="$(echo ${{inputs.AXLEARN_IMAGE}} | sed 's/\//#/')" - TOTAL_TASKS=1 - MAX_GPUS_PER_NODE=8 - NODES=1 - GPUS_PER_NODE=8 - JOB_NAME=axlearn-fuji-1B-${GITHUB_RUN_ID}-${TEST_CASE_NAME} - LOG_FILE=/nfs/cluster/${JOB_NAME}.log - MODEL_PATH=/nfs/cluster/${JOB_NAME} - for var in IMAGE TEST_CASE_NAME TOTAL_TASKS NODES GPUS_PER_NODE JOB_NAME LOG_FILE MODEL_PATH; do - echo "$var=${!var}" >> $GITHUB_OUTPUT - done - - - name: Submit SLURM jobs over SSH - id: submit - shell: bash -O expand_aliases -x -e {0} - run: | - alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}' - sshx "date && hostname && sinfo" - sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }} - JOB=$(sshx sbatch --parsable << EOF - #!/bin/bash - #SBATCH --job-name=${{ steps.meta.outputs.JOB_NAME }} - #SBATCH --exclusive - #SBATCH --nodes=${{ steps.meta.outputs.NODES }} - #SBATCH --gpus-per-node=${{ steps.meta.outputs.GPUS_PER_NODE }} - #SBATCH --time=00:30:00 - #SBATCH --output=${{ steps.meta.outputs.LOG_FILE }} - - # preload enroot container using one task per node - time srun \ - --ntasks-per-node=1 \ - --container-name=runtime \ - --container-image=${{ steps.meta.outputs.IMAGE }} \ - true - - # run job with tasks on each node sharing one container - time srun \ - --ntasks=${{ steps.meta.outputs.TOTAL_TASKS }} \ - --container-name=runtime \ - --container-mounts=${{ steps.meta.outputs.MODEL_PATH }}:/output \ - --container-entrypoint \ - test-fuji-1B.sh - EOF - ) - - echo "SLURM_JOB_ID=${JOB}" >> $GITHUB_OUTPUT - - . 
.github/workflows/scripts/wait_for_slurm_job.sh - - wait_for_slurm_job ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} ${JOB} - - # Gather job info - SLURM_STATE=$(sshx sacct -j $JOB --format=State --parsable2 --noheader |& head -n 1) - SLURM_EXITCODE=$(sshx sacct -j $JOB --format=exitcode --parsable2 --noheader | sort -r -u | head -1 | cut -f 1 -d":" | sed 's/ //g') - echo "SLURM Job state is ${SLURM_STATE}" - echo "SLURM Job exit code is ${SLURM_EXITCODE}" - echo "SLURM_STATE=${SLURM_STATE}" >> "$GITHUB_OUTPUT" - echo "SLURM_EXITCODE=${SLURM_EXITCODE}" >> "$GITHUB_OUTPUT" - set -x - - - - name: Remove orphaned SLURM job if the CI job is canceled - if: cancelled() - shell: bash -x -e {0} - run: | - ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \ - scancel ${{ steps.submit.outputs.SLURM_JOB_ID }} - - - name: Write SLURM job status to file - shell: bash -x -e {0} - run: | - python << EOF - import json - with open("output/${{ steps.meta.outputs.TEST_CASE_NAME }}-status.json", "w") as f: - dump = {'state': "${{ steps.submit.outputs.SLURM_STATE }}", 'exitcode': "${{ steps.submit.outputs.SLURM_EXITCODE }}"} - json.dump(dump, f) - EOF \ No newline at end of file diff --git a/.github/workflows/_test_fuji_7B.yaml b/.github/workflows/_test_fuji_7B.yaml deleted file mode 100644 index 544de815a..000000000 --- a/.github/workflows/_test_fuji_7B.yaml +++ /dev/null @@ -1,106 +0,0 @@ -name: ~test MaxText functionality - -on: - workflow_call: - inputs: - AXLEARN_DOCKER_IMAGE: - type: string - description: Axlearn image from ghcr.io/nvidia - default: ghcr.io/nvidia/jax:axlearn - required: false - -jobs: - single-process-single-node: - runs-on: jumpbox - steps: - - name: Check out the repository under ${GITHUB_WORKSPACE} - uses: actions/checkout@v4 - - - name: Setup SSH - id: setup-ssh - uses: ./.github/actions/setup-ssh - with: - ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} - ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }} - - - name: 
Labels and metadata - id: meta - shell: bash -x -e {0} - run: | - IMAGE="$(echo ${{inputs.AXLEARN_IMAGE}} | sed 's/\//#/')" - TOTAL_TASKS=1 - MAX_GPUS_PER_NODE=8 - NODES=1 - GPUS_PER_NODE=8 - JOB_NAME=axlearn-fuji-7B-${GITHUB_RUN_ID}-${TEST_CASE_NAME} - LOG_FILE=/nfs/cluster/${JOB_NAME}.log - MODEL_PATH=/nfs/cluster/${JOB_NAME} - for var in IMAGE TEST_CASE_NAME TOTAL_TASKS NODES GPUS_PER_NODE JOB_NAME LOG_FILE MODEL_PATH; do - echo "$var=${!var}" >> $GITHUB_OUTPUT - done - - - name: Submit SLURM jobs over SSH - id: submit - shell: bash -O expand_aliases -x -e {0} - run: | - alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}' - sshx "date && hostname && sinfo" - sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }} - JOB=$(sshx sbatch --parsable << EOF - #!/bin/bash - #SBATCH --job-name=${{ steps.meta.outputs.JOB_NAME }} - #SBATCH --exclusive - #SBATCH --nodes=${{ steps.meta.outputs.NODES }} - #SBATCH --gpus-per-node=${{ steps.meta.outputs.GPUS_PER_NODE }} - #SBATCH --time=00:30:00 - #SBATCH --output=${{ steps.meta.outputs.LOG_FILE }} - - # preload enroot container using one task per node - time srun \ - --ntasks-per-node=1 \ - --container-name=runtime \ - --container-image=${{ steps.meta.outputs.IMAGE }} \ - true - - # run job with tasks on each node sharing one container - time srun \ - --ntasks=${{ steps.meta.outputs.TOTAL_TASKS }} \ - --container-name=runtime \ - --container-mounts=${{ steps.meta.outputs.MODEL_PATH }}:/output \ - --container-entrypoint \ - test-fuji-7B.sh - EOF - ) - - echo "SLURM_JOB_ID=${JOB}" >> $GITHUB_OUTPUT - - . 
.github/workflows/scripts/wait_for_slurm_job.sh - - wait_for_slurm_job ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} ${JOB} - - # Gather job info - SLURM_STATE=$(sshx sacct -j $JOB --format=State --parsable2 --noheader |& head -n 1) - SLURM_EXITCODE=$(sshx sacct -j $JOB --format=exitcode --parsable2 --noheader | sort -r -u | head -1 | cut -f 1 -d":" | sed 's/ //g') - echo "SLURM Job state is ${SLURM_STATE}" - echo "SLURM Job exit code is ${SLURM_EXITCODE}" - echo "SLURM_STATE=${SLURM_STATE}" >> "$GITHUB_OUTPUT" - echo "SLURM_EXITCODE=${SLURM_EXITCODE}" >> "$GITHUB_OUTPUT" - set -x - - - - name: Remove orphaned SLURM job if the CI job is canceled - if: cancelled() - shell: bash -x -e {0} - run: | - ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \ - scancel ${{ steps.submit.outputs.SLURM_JOB_ID }} - - - name: Write SLURM job status to file - shell: bash -x -e {0} - run: | - python << EOF - import json - with open("output/${{ steps.meta.outputs.TEST_CASE_NAME }}-status.json", "w") as f: - dump = {'state': "${{ steps.submit.outputs.SLURM_STATE }}", 'exitcode': "${{ steps.submit.outputs.SLURM_EXITCODE }}"} - json.dump(dump, f) - EOF \ No newline at end of file diff --git a/.github/workflows/_test_nccl.yaml b/.github/workflows/_test_nccl.yaml new file mode 100644 index 000000000..2102c214e --- /dev/null +++ b/.github/workflows/_test_nccl.yaml @@ -0,0 +1,110 @@ +name: ~run NCCL tests + +on: + workflow_call: + inputs: + # Note that cuda-dl-base installs the NCCL tests, while the vanilla nvidia/cuda + # images do not; when JAX-Toolbox moves to using cuda-dl-base this workflow ought + # to be modified to test one of the JAX-Toolbox containers. + CONTAINER: + type: string + description: CUDA image to use as base, e.g. 
nvcr.io/nvidia/cuda-dl-base:24.11-cuda12.6-devel-ubuntu24.04 + required: true + +permissions: + actions: write # to cancel previous workflows + contents: read # to fetch code + packages: write # to upload container + +jobs: + build-mpi-operator-compatible-base: + uses: ./.github/workflows/_build.yaml + with: + ARCHITECTURE: amd64 + ARTIFACT_NAME: artifact-mpi-operator-compatible-base-build + BADGE_FILENAME: badge-mpi-operator-compatible-base-build + BUILD_DATE: 0000-00-00 # not important; this image is never published + BASE_IMAGE: ${{ inputs.CONTAINER }} + CONTAINER_NAME: mpi-operator-compatible-base + DOCKERFILE: .github/container/Dockerfile.mpi-operator-compatible-base + RUNNER_SIZE: small + secrets: inherit + nccl-test: + needs: build-mpi-operator-compatible-base + strategy: + matrix: + test: [all_gather_perf_mpi, all_reduce_perf_mpi, broadcast_perf_mpi, reduce_scatter_perf_mpi] + runs-on: eks + env: + BASE_IMAGE: ${{ needs.build-mpi-operator-compatible-base.outputs.DOCKER_TAG_FINAL }} + TEST_NAME: ${{ matrix.test }} + + steps: + - name: Check out the repository + uses: actions/checkout@v4 + - name: Modify variables + id: var + shell: bash + run: | + export JOB_NAME="nccl-test-${{ github.run_id }}-${TEST_NAME//_/-}" + echo "JOB_NAME=${JOB_NAME}" >> $GITHUB_OUTPUT + echo "LAUNCHER_NAME=${JOB_NAME}-launcher" >> $GITHUB_OUTPUT + echo "TOKEN_NAME=nccl-test-${JOB_NAME}-token" >> $GITHUB_OUTPUT + - name: Login to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + - name: K8s GHCR login and delete + uses: ./.github/actions/store-delete-k8s-ghcr + with: + token-name: ${{ steps.var.outputs.TOKEN_NAME }} + - name: Configure Kubernetes job + run: | + export WORKER_NAME="${JOB_NAME}-worker" + yq -i '.metadata.name = strenv(JOB_NAME) + | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].image = strenv(BASE_IMAGE) + | 
.spec.mpiReplicaSpecs.Launcher.template.spec.containers[].name = strenv(LAUNCHER_NAME) + | .spec.mpiReplicaSpecs.Launcher.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) + | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].command[3] = strenv(TEST_NAME) + | .spec.mpiReplicaSpecs.Worker.template.spec.containers[].image = strenv(BASE_IMAGE) + | .spec.mpiReplicaSpecs.Worker.template.spec.containers[].name = strenv(WORKER_NAME) + | .spec.mpiReplicaSpecs.Worker.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)' \ + .github/eks-workflow-files/mpi-nccl-test.yml + git diff .github/eks-workflow-files/mpi-nccl-test.yml + - name: Submit & delete Kubernetes test + uses: ./.github/actions/submit-delete-k8s-job + with: + job-config-file: ".github/eks-workflow-files/mpi-nccl-test.yml" + job-name: ${{ steps.var.output.JOB_NAME }} + - name: Retrieve Kubernetes job status + shell: bash -exo pipefail {0} + run: | + LAUNCHER_NAME="${{ steps.var.outputs.LAUNCHER_NAME }}" + while readarray -d : -t status < <(kubectl get job/${LAUNCHER_NAME} -o 'jsonpath={.status.failed}:{.status.succeeded}'); do + failure=${status[0]:-0} + success=${status[1]:-0} + total=$((failure+success)) + if [[ ${total} < 1 ]]; then + sleep 1 + elif [[ ${total} == 1 ]]; then + break + else + # Shouldn't happen, maybe a sign the job being monitored does not have a + # single launcher pod? + exit 255 + fi + done + exit ${failure} + # Provide more debug output in case of failure; note that some kinds of launch + # failure do not produce any log output. 
+ - name: Debug failed Kubernetes job + if: failure() + run: | + LAUNCHER_NAME="${{ steps.var.outputs.LAUNCHER_NAME }}" + # Provide better debug in case of launch failures that will not produce log output + pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${LAUNCHER_NAME} -o name) + if [[ -n "${pods}" ]]; then + kubectl describe ${pods} + fi diff --git a/.github/workflows/nccl-k8s.yaml b/.github/workflows/nccl-k8s.yaml index 6f39ebe0b..d51c12382 100644 --- a/.github/workflows/nccl-k8s.yaml +++ b/.github/workflows/nccl-k8s.yaml @@ -31,111 +31,8 @@ permissions: packages: write # to upload container jobs: - build-mpi-operator-compatible-base: - uses: ./.github/workflows/_build.yaml + nccl-tests: + uses: ./.github/workflows/_test_nccl.yaml with: - ARCHITECTURE: amd64 - ARTIFACT_NAME: artifact-mpi-operator-compatible-base-build - BADGE_FILENAME: badge-mpi-operator-compatible-base-build - BUILD_DATE: 0000-00-00 # Not important; this image is never published - BASE_IMAGE: ${{ inputs.CONTAINER || 'nvcr.io/nvidia/cuda-dl-base:24.12-cuda12.6-devel-ubuntu24.04' }} - CONTAINER_NAME: mpi-operator-compatible-base - DOCKERFILE: .github/container/Dockerfile.mpi-operator-compatible-base - RUNNER_SIZE: small + CONTAINER: ${{ inputs.CONTAINER || 'nvcr.io/nvidia/cuda-dl-base:24.12-cuda12.6-devel-ubuntu24.04' }} secrets: inherit - - nccl-tests: - needs: build-mpi-operator-compatible-base - runs-on: eks - strategy: - matrix: - test: [all_gather_perf_mpi, all_reduce_perf_mpi, broadcast_perf_mpi, reduce_scatter_perf_mpi] - env: - BASE_IMAGE: ${{ needs.build-mpi-operator-compatible-base.outputs.DOCKER_TAG_FINAL }} - TEST_NAME: ${{ matrix.test }} - - - steps: - - name: Check out the repository - uses: actions/checkout@v4 - - - name: Modify variables - id: var - shell: bash - run: | - export JOB_NAME="nccl-test-${{ github.run_id }}-${TEST_NAME//_/-}" - echo "JOB_NAME=${JOB_NAME}" >> $GITHUB_OUTPUT - echo "LAUNCHER_NAME=${JOB_NAME}-launcher" >> $GITHUB_OUTPUT - echo 
"TOKEN_NAME=nccl-test-${JOB_NAME}-token" >> $GITHUB_OUTPUT - - - name: GHCR login and store K8s secret - uses: ./.github/actions/ghcr-login - with: - docker-username: ${{ github.repository_owner }} - docker-password: ${{ secrets.GITHUB_TOKEN }} - token-name: ${{ steps.var.outputs.TOKEN_NAME }} - - name: Configure Kubernetes job - shell: bash - run: | - export JOB_NAME="${{ steps.var.outputs.JOB_NAME }}" - export LAUNCHER_NAME="${{ steps.var.outputs.LAUNCHER_NAME }}" - export TOKEN_NAME="${{ steps.var.outputs.TOKEN_NAME }}" - export TEST_NAME="${{ env.TEST_NAME }}" - export WORKER_NAME="${JOB_NAME}-worker" - - # Use yq to set our fields in-place - yq -i '.metadata.name = strenv(JOB_NAME) - | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].image = strenv(BASE_IMAGE) - | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].name = strenv(LAUNCHER_NAME) - | .spec.mpiReplicaSpecs.Launcher.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) - | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].command[3] = strenv(TEST_NAME) - | .spec.mpiReplicaSpecs.Worker.template.spec.containers[].image = strenv(BASE_IMAGE) - | .spec.mpiReplicaSpecs.Worker.template.spec.containers[].name = strenv(WORKER_NAME) - | .spec.mpiReplicaSpecs.Worker.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)' \ - .github/eks-workflow-files/mpi-nccl-test.yml - - # (Optional) Show diff for debugging - git diff .github/eks-workflow-files/mpi-nccl-test.yml - - - name: Submit & stream K8s job - uses: ./.github/actions/submit-k8s-job - with: - job-config-file: .github/eks-workflow-files/mpi-nccl-test.yml - job-name: ${{ steps.var.outputs.LAUNCHER_NAME }} - - name: Retrieve Kubernetes job status - shell: bash -exo pipefail {0} - run: | - LAUNCHER_NAME="${{ steps.var.outputs.LAUNCHER_NAME }}" - while readarray -d : -t status < <(kubectl get job/${LAUNCHER_NAME} -o 'jsonpath={.status.failed}:{.status.succeeded}'); do - failure=${status[0]:-0} - success=${status[1]:-0} - 
total=$((failure+success)) - if [[ ${total} < 1 ]]; then - sleep 1 - elif [[ ${total} == 1 ]]; then - break - else - # If total > 1, that suggests a mismatch that can occur if there's more than one launcher pod - exit 255 - fi - done - exit ${failure} - - name: Debug failed Kubernetes job - if: ${{ failure() }} - shell: bash - run: | - LAUNCHER_NAME="${{ steps.var.outputs.LAUNCHER_NAME }}" - pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${LAUNCHER_NAME} -o name) - if [[ -n "${pods}" ]]; then - kubectl describe ${pods} - fi - - name: Delete Kubernetes job - if: ${{ always() }} - uses: ./.github/actions/delete-k8s-job - with: - job-name: ${{ steps.var.outputs.LAUNCHER_NAME }} - - name: Delete GitHub Container Registry token - uses: ./.github/actions/delete-ghcr-token - if: ${{ always() }} - with: - token-name: ${{ steps.var.outputs.TOKEN_NAME }} \ No newline at end of file From d680e667a9f55819d6b57dd7432110087fea6f13 Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 24 Feb 2025 19:01:38 +0000 Subject: [PATCH 58/89] fix path for git --- .github/container/Dockerfile.axlearn | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/container/Dockerfile.axlearn b/.github/container/Dockerfile.axlearn index 039f767ee..ba63fb0c9 100644 --- a/.github/container/Dockerfile.axlearn +++ b/.github/container/Dockerfile.axlearn @@ -1,6 +1,6 @@ # syntax=docker/dockerfile:1-labs ARG BASE_IMAGE=ghcr.io/nvidia/jax-mealkit:jax -ARG URLREF_AXLEARN=https://github.com/apple/axlearn.git +ARG URLREF_AXLEARN=https://github.com/apple/axlearn.git#main ARG SRC_PATH_AXLEARN=/opt/axlearn ############################################################################### From c200dea5289198c9fcee5ac767d6744b93f31107 Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 24 Feb 2025 20:43:29 +0000 Subject: [PATCH 59/89] fix error in bash --- .github/actions/store-delete-k8s-ghcr/action.yml | 1 + .github/actions/submit-delete-k8s-job/action.yml | 1 + 2 files changed, 2 
insertions(+) diff --git a/.github/actions/store-delete-k8s-ghcr/action.yml b/.github/actions/store-delete-k8s-ghcr/action.yml index 51eb8b625..803163f9a 100644 --- a/.github/actions/store-delete-k8s-ghcr/action.yml +++ b/.github/actions/store-delete-k8s-ghcr/action.yml @@ -11,6 +11,7 @@ runs: steps: - name: Delete GitHub Container Registry token uses: ./.github/actions/with-post-step + shell: bash with: main: | # Store GitHub Container Registry token as Kubernetes secret diff --git a/.github/actions/submit-delete-k8s-job/action.yml b/.github/actions/submit-delete-k8s-job/action.yml index e97d4b921..d2c546273 100644 --- a/.github/actions/submit-delete-k8s-job/action.yml +++ b/.github/actions/submit-delete-k8s-job/action.yml @@ -15,6 +15,7 @@ runs: steps: - name: Delete Kubernetes job uses: ./.github/actions/with-post-step + shell: bash with: main: | echo "Submit K8s job" From 0b1a61f8728c87c701e6ca057124b5a930177a61 Mon Sep 17 00:00:00 2001 From: Steboss Date: Tue, 25 Feb 2025 11:29:51 +0000 Subject: [PATCH 60/89] fix the 3B model run on k8s --- ...earn-1B-model.yml => axlearn-3B-model.yml} | 42 ++++++++----------- .github/workflows/_ci.yaml | 12 +++--- 2 files changed, 23 insertions(+), 31 deletions(-) rename .github/eks-workflow-files/axlearn/{axlearn-1B-model.yml => axlearn-3B-model.yml} (54%) diff --git a/.github/eks-workflow-files/axlearn/axlearn-1B-model.yml b/.github/eks-workflow-files/axlearn/axlearn-3B-model.yml similarity index 54% rename from .github/eks-workflow-files/axlearn/axlearn-1B-model.yml rename to .github/eks-workflow-files/axlearn/axlearn-3B-model.yml index 76b767089..9045044e8 100644 --- a/.github/eks-workflow-files/axlearn/axlearn-1B-model.yml +++ b/.github/eks-workflow-files/axlearn/axlearn-3B-model.yml @@ -13,7 +13,7 @@ spec: spec: restartPolicy: Never containers: - - name: axlearn-fuji-1B + - name: axlearn-fuji-3B image: PLACEHOLDER command: - bash @@ -23,31 +23,23 @@ spec: - | BASEDIR="/opt/axlearn" - 
CONFIG="fuji-1B-v3-flash-single-host" - HLO_DUMP=0 - POSTFIX="" - - AR_THRESHOLD=1073741824 - AG_THRESHOLD=8589934592 - RS_THRESHOLD=8589934592 - XLA_BASE_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true - --xla_gpu_enable_triton_gemm=false - --xla_gpu_enable_highest_priority_async_stream=true - --xla_gpu_all_gather_combine_threshold_bytes=${AG_THRESHOLD} - --xla_gpu_reduce_scatter_combine_threshold_bytes=${RS_THRESHOLD} - --xla_gpu_enable_pipelined_all_gather=true - --xla_gpu_enable_pipelined_reduce_scatter=true - --xla_gpu_enable_nccl_comm_splitting=false" - - export XLA_PYTHON_CLIENT_PREALLOCATE=false - export TF_GPU_ALLOCATOR=cuda_malloc_async - export XLA_FLAGS="${XLA_BASE_FLAGS}" - - export NCCL_BUFFSIZE=8388608 - export NCCL_P2P_NET_CHUNKSIZE=524288 - export NCCL_LAUNCH_MODE=GROUP - export NCCL_DEBUG=INFO + CONFIG="fuji-3B-v3-flash-single-host" + BASE_XLA_FLAGS=${BASE_XLA_FLAGS:---xla_gpu_enable_latency_hiding_scheduler=true + --xla_gpu_enable_highest_priority_async_stream=true + --xla_gpu_all_reduce_combine_threshold_bytes=1073741824 + --xla_gpu_all_gather_combine_threshold_bytes=1073741824 + --xla_gpu_reduce_scatter_combine_threshold_bytes=1073741824 + --xla_gpu_enable_pipelined_all_gather=true + --xla_gpu_enable_pipelined_reduce_scatter=true + --xla_gpu_enable_pipelined_all_reduce=true + --xla_gpu_enable_while_loop_double_buffering=true + --xla_gpu_enable_triton_gemm=false + --xla_gpu_enable_all_gather_combine_by_dim=false + --xla_gpu_enable_reduce_scatter_combine_by_dim=false + --xla_disable_hlo_passes=rematerialization} + export XLA_FLAGS="$BASE_XLA_FLAGS ${XLA_FLAGS:-}" + LOG_DIR=${BASEDIR}/logs TRAINER_DIR=${LOG_DIR}/${CONFIG}${POSTFIX}-eks/trainer-dir mkdir -p ${TRAINER_DIR} diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index d5d0005af..52b514c9d 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -749,14 +749,14 @@ jobs: # the fuji test will run for 20 minutes only, as per 2025-02-24 # is not 
possible to set the `max_steps` value - test-axlearn-fuji-1B-eks: + test-axlearn-fuji-3B-eks: needs: build-axlearn if: inputs.ARCHITECTURE == 'amd64' runs-on: eks env: AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }} - JOB_NAME: axlearn-fuji-1b-${{ github.run_id }} - TOKEN_NAME: axlearn-fuji-1b-${{ github.run_id }}-token + JOB_NAME: axlearn-fuji-3b-${{ github.run_id }} + TOKEN_NAME: axlearn-fuji-3b-${{ github.run_id }}-token steps: - name: Check out the repository uses: actions/checkout@v4 @@ -776,12 +776,12 @@ jobs: select(di == 0).metadata.name = strenv(JOB_NAME) | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE) | select(di == 0).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)' \ - .github/eks-workflow-files/axlearn/axlearn-1B-model.yml - git diff .github/eks-workflow-files/axlearn/axlearn-1B-model.yml + .github/eks-workflow-files/axlearn/axlearn-3B-model.yml + git diff .github/eks-workflow-files/axlearn/axlearn-3B-model.yml - name: Submit & delete axlearn test uses: ./.github/actions/submit-delete-k8s-job with: - job-config-file: ".github/eks-workflow-files/axlearn/axlearn-1B-model.yml" + job-config-file: ".github/eks-workflow-files/axlearn/axlearn-3B-model.yml" job-name: ${{ env.JOB_NAME }} From 5693a5c6cd4a21eee8cb79650c084d705c5e638f Mon Sep 17 00:00:00 2001 From: Steboss Date: Wed, 26 Feb 2025 11:01:21 +0000 Subject: [PATCH 61/89] @olupton comments --- .../actions/store-delete-k8s-ghcr/action.yml | 17 ++- .../actions/submit-delete-k8s-job/action.yml | 14 +-- .github/container/Dockerfile.axlearn | 2 +- .github/container/test-axlearn.sh | 3 +- .../{test-fuji-1B.sh => test-fuji.sh} | 0 .../axlearn/axlearn-3B-model.yml | 71 ------------ .../axlearn/axlearn-fuji-model.yml | 34 ++++++ .../axlearn/axlearn-job.yml | 108 +++++++++--------- .github/workflows/_ci.yaml | 22 ++-- .github/workflows/_test_nccl.yaml | 46 ++++---- 10 files changed, 133 insertions(+), 184 deletions(-) rename 
.github/container/{test-fuji-1B.sh => test-fuji.sh} (100%) delete mode 100644 .github/eks-workflow-files/axlearn/axlearn-3B-model.yml create mode 100644 .github/eks-workflow-files/axlearn/axlearn-fuji-model.yml diff --git a/.github/actions/store-delete-k8s-ghcr/action.yml b/.github/actions/store-delete-k8s-ghcr/action.yml index 803163f9a..33a69ebe2 100644 --- a/.github/actions/store-delete-k8s-ghcr/action.yml +++ b/.github/actions/store-delete-k8s-ghcr/action.yml @@ -1,14 +1,19 @@ -name: Delete GHCR Token -description: Deletes the K8s secret used for pulling images from GHCR. +name: Store & Delete GHCR Token +description: Store and Delete the docker credentials for pulling from GHCR -inputs: +outputs: token-name: description: Name of the K8s secret to delete - required: true + value: ${{ steps.token.outputs.token-name }} runs: using: "composite" steps: + - name: Generate a UUID token + shell: bash + id: token + run: | + echo "token-name=$(uuidgen)" >> $GITHUB_OUTPUT - name: Delete GitHub Container Registry token uses: ./.github/actions/with-post-step shell: bash @@ -16,8 +21,8 @@ runs: main: | # Store GitHub Container Registry token as Kubernetes secret kubectl create secret generic \ - ${{ inputs.token-name }} \ + ${{ steps.token.outputs.token-name }} \ --from-file=.dockerconfigjson=$HOME/.docker/config.json \ --type=kubernetes.io/dockerconfigjson post: | - kubectl delete secret ${{ inputs.token-name }} \ No newline at end of file + kubectl delete secret ${{ steps.token.outputs.token-name }} diff --git a/.github/actions/submit-delete-k8s-job/action.yml b/.github/actions/submit-delete-k8s-job/action.yml index d2c546273..d8b8cb472 100644 --- a/.github/actions/submit-delete-k8s-job/action.yml +++ b/.github/actions/submit-delete-k8s-job/action.yml @@ -34,15 +34,5 @@ runs: # stream the logs kubectl logs --all-containers=true --all-pods=true --follow job/${{ inputs.job-name }} post: | - pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ inputs.job-name }} -o 
jsonpath='{.items[*].metadata.name}') - - for pod in $pods; do - status=$(kubectl get pod "$pod" -o jsonpath='{.status.phase}' || true) - echo "Pod: $pod, status: $status" - if [ "$status" = "Running" ] || [ "$status" = "Pending" ]; then - kubectl delete pod "$pod" --force --grace-period=0 || true - fi - done - - # make sure job is deleted - kubectl delete job ${{ inputs.job-name }} --force --grace-period=0 || true \ No newline at end of file + kubectl delete job ${{ inputs.job-name }} + \ No newline at end of file diff --git a/.github/container/Dockerfile.axlearn b/.github/container/Dockerfile.axlearn index ba63fb0c9..8c609d08d 100644 --- a/.github/container/Dockerfile.axlearn +++ b/.github/container/Dockerfile.axlearn @@ -38,7 +38,7 @@ EOF ## Add test script to the path ############################################################################### -ADD test-axlearn.sh test-fuji-1B.sh /usr/local/bin/ +ADD test-axlearn.sh test-fuji.sh /usr/local/bin/ ############################################################################### ## Install accumulated packages from the base image and the previous stage diff --git a/.github/container/test-axlearn.sh b/.github/container/test-axlearn.sh index d5e783f56..b9c3f2dfe 100755 --- a/.github/container/test-axlearn.sh +++ b/.github/container/test-axlearn.sh @@ -89,7 +89,6 @@ else echo " Test Files Pattern: '*_test.py' (default)" fi echo " Output Directory: $OUTPUT_DIRECTORY" -echo " Kubernetes mode: $K8S" cd "$DIR" || exit 1 @@ -168,4 +167,4 @@ for test_file in "${final_test_files[@]}"; do ((failures++)) fi echo "" -done \ No newline at end of file +done diff --git a/.github/container/test-fuji-1B.sh b/.github/container/test-fuji.sh similarity index 100% rename from .github/container/test-fuji-1B.sh rename to .github/container/test-fuji.sh diff --git a/.github/eks-workflow-files/axlearn/axlearn-3B-model.yml b/.github/eks-workflow-files/axlearn/axlearn-3B-model.yml deleted file mode 100644 index 9045044e8..000000000 --- 
a/.github/eks-workflow-files/axlearn/axlearn-3B-model.yml +++ /dev/null @@ -1,71 +0,0 @@ -apiVersion: batch/v1 -kind: Job -metadata: - name: PLACEHOLDER - labels: - kueue.x-k8s.io/queue-name: p5-queue -spec: - # the job will run for 20 mins, as we can't set max_steps - activeDeadlineSeconds: 1200 - completions: 1 - parallelism: 1 - template: - spec: - restartPolicy: Never - containers: - - name: axlearn-fuji-3B - image: PLACEHOLDER - command: - - bash - - -xo - - pipefail - - -c - - | - - BASEDIR="/opt/axlearn" - CONFIG="fuji-3B-v3-flash-single-host" - BASE_XLA_FLAGS=${BASE_XLA_FLAGS:---xla_gpu_enable_latency_hiding_scheduler=true - --xla_gpu_enable_highest_priority_async_stream=true - --xla_gpu_all_reduce_combine_threshold_bytes=1073741824 - --xla_gpu_all_gather_combine_threshold_bytes=1073741824 - --xla_gpu_reduce_scatter_combine_threshold_bytes=1073741824 - --xla_gpu_enable_pipelined_all_gather=true - --xla_gpu_enable_pipelined_reduce_scatter=true - --xla_gpu_enable_pipelined_all_reduce=true - --xla_gpu_enable_while_loop_double_buffering=true - --xla_gpu_enable_triton_gemm=false - --xla_gpu_enable_all_gather_combine_by_dim=false - --xla_gpu_enable_reduce_scatter_combine_by_dim=false - --xla_disable_hlo_passes=rematerialization} - - export XLA_FLAGS="$BASE_XLA_FLAGS ${XLA_FLAGS:-}" - - LOG_DIR=${BASEDIR}/logs - TRAINER_DIR=${LOG_DIR}/${CONFIG}${POSTFIX}-eks/trainer-dir - mkdir -p ${TRAINER_DIR} - - cat << EOF > tf_gpu_fix.py - import tensorflow as tf - tf.config.set_visible_devices([], 'GPU') - import runpy - runpy.run_module('axlearn.common.launch_trainer_main', run_name='__main__') - EOF - - python3 tf_gpu_fix.py \ - --module=text.gpt.c4_trainer \ - --config=${CONFIG} \ - --trainer_dir=${TRAINER_DIR} \ - --data_dir=gs://axlearn-public/tensorflow_datasets \ - --jax_backend=gpu - - resources: - limits: - nvidia.com/gpu: 8 - volumeMounts: - - name: output - mountPath: /opt/output - imagePullSecrets: - - name: PLACEHOLDER - volumes: - - name: output - emptyDir: {} 
diff --git a/.github/eks-workflow-files/axlearn/axlearn-fuji-model.yml b/.github/eks-workflow-files/axlearn/axlearn-fuji-model.yml new file mode 100644 index 000000000..c6d9db3ab --- /dev/null +++ b/.github/eks-workflow-files/axlearn/axlearn-fuji-model.yml @@ -0,0 +1,34 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: PLACEHOLDER + labels: + kueue.x-k8s.io/queue-name: p5-queue +spec: + # the job will run for 20 mins, as we can't set max_steps + activeDeadlineSeconds: 1200 + completions: 1 + parallelism: 1 + template: + spec: + restartPolicy: Never + containers: + - name: axlearn-fuji-3B + image: PLACEHOLDER + command: + - bash + - -xo + - pipefail + - -c + - "\nBASEDIR=\"/opt/axlearn\"\nCONFIG=\"fuji-3B-v3-flash-single-host\"\nBASE_XLA_FLAGS=${BASE_XLA_FLAGS:---xla_gpu_enable_latency_hiding_scheduler=true\n --xla_gpu_enable_highest_priority_async_stream=true\n --xla_gpu_all_reduce_combine_threshold_bytes=1073741824\n --xla_gpu_all_gather_combine_threshold_bytes=1073741824\n --xla_gpu_reduce_scatter_combine_threshold_bytes=1073741824\n --xla_gpu_enable_pipelined_all_gather=true\n --xla_gpu_enable_pipelined_reduce_scatter=true\n --xla_gpu_enable_pipelined_all_reduce=true\n --xla_gpu_enable_while_loop_double_buffering=true\n --xla_gpu_enable_triton_gemm=false\n --xla_gpu_enable_all_gather_combine_by_dim=false\n --xla_gpu_enable_reduce_scatter_combine_by_dim=false\n --xla_disable_hlo_passes=rematerialization}\n\nexport XLA_FLAGS=\"$BASE_XLA_FLAGS ${XLA_FLAGS:-}\" \n\nLOG_DIR=${BASEDIR}/logs\nTRAINER_DIR=${LOG_DIR}/${CONFIG}${POSTFIX}-eks/trainer-dir\nmkdir -p ${TRAINER_DIR}\n\npython3 -m axlearn.common.launch_trainer_main \\\n --module=text.gpt.c4_trainer \\\n --config=${CONFIG} \\\n --trainer_dir=${TRAINER_DIR} \\\n --data_dir=gs://axlearn-public/tensorflow_datasets \\\n --jax_backend=gpu \n" + resources: + limits: + nvidia.com/gpu: 8 + volumeMounts: + - name: output + mountPath: /opt/output + imagePullSecrets: + - name: PLACEHOLDER + volumes: + - name: output + 
emptyDir: {} diff --git a/.github/eks-workflow-files/axlearn/axlearn-job.yml b/.github/eks-workflow-files/axlearn/axlearn-job.yml index 7c1022f61..b1ac81909 100644 --- a/.github/eks-workflow-files/axlearn/axlearn-job.yml +++ b/.github/eks-workflow-files/axlearn/axlearn-job.yml @@ -1,59 +1,59 @@ apiVersion: batch/v1 kind: Job metadata: - name: PLACEHOLDER - labels: - kueue.x-k8s.io/queue-name: p5-queue + name: PLACEHOLDER + labels: + kueue.x-k8s.io/queue-name: p5-queue spec: - completions: 1 - parallelism: 1 - template: - spec: - restartPolicy: Never - containers: - - name: axlearn - image: PLACEHOLDER - command: - - bash - - -xo - - pipefail - - -c - - | - test-axlearn.sh \ - --directory "." \ - --output "/opt/output/" \ - --test-files "/opt/axlearn/axlearn/common/*_test.py" \ - --k8s + completions: 1 + parallelism: 1 + template: + spec: + restartPolicy: Never + containers: + - name: axlearn + image: PLACEHOLDER + command: + - bash + - -xo + - pipefail + - -c + - | + test-axlearn.sh \ + --directory "." \ + --output "/opt/output/" \ + --test-files "/opt/axlearn/axlearn/common/*_test.py" \ + --k8s - sync - wait - # after execution flag the results have been produced - touch /opt/output/done - resources: - limits: - nvidia.com/gpu: 8 - volumeMounts: - - name: output - mountPath: /opt/output - - name: upload - image: amazon/aws-cli - env: - - name: TEST_DATE - value: PLACEHOLDER - command: - - sh - - -c - - | - while [ ! 
-f /opt/output/done ]; do - sleep 5 - done - # Upload to S3 bucket - aws s3 cp /opt/output/summary.txt s3://jax-toolbox-eks-output/axlearn/${TEST_DATE}/summary.txt - volumeMounts: - - name: output - mountPath: /opt/output - imagePullSecrets: - - name: PLACEHOLDER - volumes: - - name: output - emptyDir: {} + sync + wait + # after execution flag the results have been produced + touch /opt/output/done + resources: + limits: + nvidia.com/gpu: 8 + volumeMounts: + - name: output + mountPath: /opt/output + - name: upload + image: amazon/aws-cli + env: + - name: TEST_DATE + value: PLACEHOLDER + command: + - sh + - -c + - | + while [ ! -f /opt/output/done ]; do + sleep 5 + done + # Upload to S3 bucket + aws s3 cp /opt/output/summary.txt s3://jax-toolbox-eks-output/axlearn/${TEST_DATE}/summary.txt + volumeMounts: + - name: output + mountPath: /opt/output + imagePullSecrets: + - name: PLACEHOLDER + volumes: + - name: output + emptyDir: {} diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 52b514c9d..17bb9262b 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -658,7 +658,6 @@ jobs: env: AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }} JOB_NAME: axlearn-${{ github.run_id }} - TOKEN_NAME: axlearn-${{ github.run_id }}-token steps: - name: Check out the repository uses: actions/checkout@v4 @@ -668,10 +667,9 @@ jobs: registry: ghcr.io username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} - - name: K8s GHCR login and delete + - name: K8s GHCR store and delete token + id: store-token uses: ./.github/actions/store-delete-k8s-ghcr - with: - token-name: ${{ env.TOKEN_NAME }} - name: Configure axlearn test job run: | # Replace placeholders in axlearn-job.yml with environment variables @@ -679,7 +677,7 @@ jobs: select(di == 0).metadata.name = strenv(JOB_NAME) | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE) | select(di == 
0).spec.template.spec.containers[1].env[0].value = "${{ github.run_id }}" - | select(di == 0).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)' \ + | select(di == 0).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \ .github/eks-workflow-files/axlearn/axlearn-job.yml git diff .github/eks-workflow-files/axlearn/axlearn-job.yml - name: Submit & delete axlearn test @@ -756,7 +754,6 @@ jobs: env: AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }} JOB_NAME: axlearn-fuji-3b-${{ github.run_id }} - TOKEN_NAME: axlearn-fuji-3b-${{ github.run_id }}-token steps: - name: Check out the repository uses: actions/checkout@v4 @@ -766,22 +763,21 @@ jobs: registry: ghcr.io username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} - - name: K8s GHCR login and delete + - name: K8s GHCR store and delete token + id: store-token uses: ./.github/actions/store-delete-k8s-ghcr - with: - token-name: ${{ env.TOKEN_NAME }} - name: Configure axlearn test job run: | yq -i ea ' select(di == 0).metadata.name = strenv(JOB_NAME) | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE) - | select(di == 0).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)' \ - .github/eks-workflow-files/axlearn/axlearn-3B-model.yml - git diff .github/eks-workflow-files/axlearn/axlearn-3B-model.yml + | select(di == 0).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \ + .github/eks-workflow-files/axlearn/axlearn-fuji-model.yml + git diff .github/eks-workflow-files/axlearn/axlearn-fuji-model.yml - name: Submit & delete axlearn test uses: ./.github/actions/submit-delete-k8s-job with: - job-config-file: ".github/eks-workflow-files/axlearn/axlearn-3B-model.yml" + job-config-file: ".github/eks-workflow-files/axlearn/axlearn-fuji-model.yml" job-name: ${{ env.JOB_NAME }} diff --git a/.github/workflows/_test_nccl.yaml 
b/.github/workflows/_test_nccl.yaml index 2102c214e..200ef3b37 100644 --- a/.github/workflows/_test_nccl.yaml +++ b/.github/workflows/_test_nccl.yaml @@ -3,18 +3,15 @@ name: ~run NCCL tests on: workflow_call: inputs: - # Note that cuda-dl-base installs the NCCL tests, while the vanilla nvidia/cuda - # images do not; when JAX-Toolbox moves to using cuda-dl-base this workflow ought - # to be modified to test one of the JAX-Toolbox containers. CONTAINER: type: string - description: CUDA image to use as base, e.g. nvcr.io/nvidia/cuda-dl-base:24.11-cuda12.6-devel-ubuntu24.04 + description: CUDA image to use as base required: true permissions: - actions: write # to cancel previous workflows - contents: read # to fetch code - packages: write # to upload container + actions: write + contents: read + packages: write jobs: build-mpi-operator-compatible-base: @@ -23,12 +20,13 @@ jobs: ARCHITECTURE: amd64 ARTIFACT_NAME: artifact-mpi-operator-compatible-base-build BADGE_FILENAME: badge-mpi-operator-compatible-base-build - BUILD_DATE: 0000-00-00 # not important; this image is never published + BUILD_DATE: 0000-00-00 BASE_IMAGE: ${{ inputs.CONTAINER }} CONTAINER_NAME: mpi-operator-compatible-base DOCKERFILE: .github/container/Dockerfile.mpi-operator-compatible-base RUNNER_SIZE: small secrets: inherit + nccl-test: needs: build-mpi-operator-compatible-base strategy: @@ -42,6 +40,7 @@ jobs: steps: - name: Check out the repository uses: actions/checkout@v4 + - name: Modify variables id: var shell: bash @@ -49,35 +48,36 @@ jobs: export JOB_NAME="nccl-test-${{ github.run_id }}-${TEST_NAME//_/-}" echo "JOB_NAME=${JOB_NAME}" >> $GITHUB_OUTPUT echo "LAUNCHER_NAME=${JOB_NAME}-launcher" >> $GITHUB_OUTPUT - echo "TOKEN_NAME=nccl-test-${JOB_NAME}-token" >> $GITHUB_OUTPUT + - name: Login to GitHub Container Registry uses: docker/login-action@v3 with: registry: ghcr.io username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} - - name: K8s GHCR login and delete - uses: 
./.github/actions/store-delete-k8s-ghcr - with: - token-name: ${{ steps.var.outputs.TOKEN_NAME }} + - name: K8s GHCR store and delete token + id: store-token + uses: ./.github/actions/store-delete-k8s-ghcr - name: Configure Kubernetes job run: | export WORKER_NAME="${JOB_NAME}-worker" yq -i '.metadata.name = strenv(JOB_NAME) | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].image = strenv(BASE_IMAGE) | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].name = strenv(LAUNCHER_NAME) - | .spec.mpiReplicaSpecs.Launcher.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) + | .spec.mpiReplicaSpecs.Launcher.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}" | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].command[3] = strenv(TEST_NAME) | .spec.mpiReplicaSpecs.Worker.template.spec.containers[].image = strenv(BASE_IMAGE) | .spec.mpiReplicaSpecs.Worker.template.spec.containers[].name = strenv(WORKER_NAME) - | .spec.mpiReplicaSpecs.Worker.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)' \ + | .spec.mpiReplicaSpecs.Worker.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \ .github/eks-workflow-files/mpi-nccl-test.yml git diff .github/eks-workflow-files/mpi-nccl-test.yml - - name: Submit & delete Kubernetes test - uses: ./.github/actions/submit-delete-k8s-job - with: - job-config-file: ".github/eks-workflow-files/mpi-nccl-test.yml" - job-name: ${{ steps.var.output.JOB_NAME }} + + - name: Submit & delete Kubernetes test + uses: ./.github/actions/submit-delete-k8s-job + with: + job-config-file: ".github/eks-workflow-files/mpi-nccl-test.yml" + job-name: ${{ steps.var.outputs.JOB_NAME }} # Fixed outputs instead of output + - name: Retrieve Kubernetes job status shell: bash -exo pipefail {0} run: | @@ -91,19 +91,15 @@ jobs: elif [[ ${total} == 1 ]]; then break else - # Shouldn't happen, maybe a sign the job being monitored does not have a - # single launcher pod? 
exit 255 fi done exit ${failure} - # Provide more debug output in case of failure; note that some kinds of launch - # failure do not produce any log output. + - name: Debug failed Kubernetes job if: failure() run: | LAUNCHER_NAME="${{ steps.var.outputs.LAUNCHER_NAME }}" - # Provide better debug in case of launch failures that will not produce log output pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${LAUNCHER_NAME} -o name) if [[ -n "${pods}" ]]; then kubectl describe ${pods} From 64c646f245873e59df8951722b455783986d70f7 Mon Sep 17 00:00:00 2001 From: Steboss Date: Wed, 26 Feb 2025 11:43:20 +0000 Subject: [PATCH 62/89] fix errors --- .github/actions/store-delete-k8s-ghcr/action.yml | 1 - .github/workflows/_ci.yaml | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/actions/store-delete-k8s-ghcr/action.yml b/.github/actions/store-delete-k8s-ghcr/action.yml index 33a69ebe2..e089dfd21 100644 --- a/.github/actions/store-delete-k8s-ghcr/action.yml +++ b/.github/actions/store-delete-k8s-ghcr/action.yml @@ -16,7 +16,6 @@ runs: echo "token-name=$(uuidgen)" >> $GITHUB_OUTPUT - name: Delete GitHub Container Registry token uses: ./.github/actions/with-post-step - shell: bash with: main: | # Store GitHub Container Registry token as Kubernetes secret diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 17bb9262b..bf530d533 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -747,7 +747,7 @@ jobs: # the fuji test will run for 20 minutes only, as per 2025-02-24 # is not possible to set the `max_steps` value - test-axlearn-fuji-3B-eks: + test-axlearn-fuji-models-eks: needs: build-axlearn if: inputs.ARCHITECTURE == 'amd64' runs-on: eks From 9009dc4f0159337b348e42fc4b2582442cc0f048 Mon Sep 17 00:00:00 2001 From: Steboss Date: Wed, 26 Feb 2025 11:49:56 +0000 Subject: [PATCH 63/89] test uuidgen --- .github/actions/store-delete-k8s-ghcr/action.yml | 1 - 1 file changed, 1 deletion(-) diff --git 
a/.github/actions/store-delete-k8s-ghcr/action.yml b/.github/actions/store-delete-k8s-ghcr/action.yml index e089dfd21..4a0018c7d 100644 --- a/.github/actions/store-delete-k8s-ghcr/action.yml +++ b/.github/actions/store-delete-k8s-ghcr/action.yml @@ -10,7 +10,6 @@ runs: using: "composite" steps: - name: Generate a UUID token - shell: bash id: token run: | echo "token-name=$(uuidgen)" >> $GITHUB_OUTPUT From 56047630046640e3d1d50917c25c03371103be33 Mon Sep 17 00:00:00 2001 From: Steboss Date: Wed, 26 Feb 2025 12:00:12 +0000 Subject: [PATCH 64/89] test with random --- .github/actions/store-delete-k8s-ghcr/action.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/actions/store-delete-k8s-ghcr/action.yml b/.github/actions/store-delete-k8s-ghcr/action.yml index 4a0018c7d..e8761d570 100644 --- a/.github/actions/store-delete-k8s-ghcr/action.yml +++ b/.github/actions/store-delete-k8s-ghcr/action.yml @@ -10,9 +10,10 @@ runs: using: "composite" steps: - name: Generate a UUID token + shell: bash id: token run: | - echo "token-name=$(uuidgen)" >> $GITHUB_OUTPUT + echo "token-name=${RANDOM}-${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}" >> $GITHUB_OUTPUT - name: Delete GitHub Container Registry token uses: ./.github/actions/with-post-step with: From 9d53298c44d36a7ec15f0591b4f20d82be3a8372 Mon Sep 17 00:00:00 2001 From: Steboss Date: Wed, 26 Feb 2025 12:03:00 +0000 Subject: [PATCH 65/89] no shell needed --- .github/actions/submit-delete-k8s-job/action.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/actions/submit-delete-k8s-job/action.yml b/.github/actions/submit-delete-k8s-job/action.yml index d8b8cb472..a1ed4029c 100644 --- a/.github/actions/submit-delete-k8s-job/action.yml +++ b/.github/actions/submit-delete-k8s-job/action.yml @@ -15,7 +15,6 @@ runs: steps: - name: Delete Kubernetes job uses: ./.github/actions/with-post-step - shell: bash with: main: | echo "Submit K8s job" From 2eba3b7694d1ec269ca5ae821789015d619f0ecd Mon Sep 17 00:00:00 
2001 From: Steboss Date: Wed, 26 Feb 2025 13:06:39 +0000 Subject: [PATCH 66/89] revert test nccl and simplify the submit k8s --- .../actions/submit-delete-k8s-job/action.yml | 25 +++---- .github/workflows/_test_nccl.yaml | 66 +++++++++++-------- 2 files changed, 53 insertions(+), 38 deletions(-) diff --git a/.github/actions/submit-delete-k8s-job/action.yml b/.github/actions/submit-delete-k8s-job/action.yml index a1ed4029c..b58179326 100644 --- a/.github/actions/submit-delete-k8s-job/action.yml +++ b/.github/actions/submit-delete-k8s-job/action.yml @@ -9,29 +9,30 @@ inputs: description: Path to the Kubernetes job YAML required: true - runs: using: "composite" steps: - name: Delete Kubernetes job uses: ./.github/actions/with-post-step + shell: bash with: main: | echo "Submit K8s job" kubectl apply -f "${{ inputs.job-config-file }}" - # wait for the job to be created + + # Wait for job to be created kubectl wait --for=create job/${{ inputs.job-name }} --timeout=60s - - # wait for the 'spec.suspend' field to become false. Necessary for kueue + + # Wait for job to be unsuspended kubectl wait --for=jsonpath='{.spec.suspend}=false' job/${{ inputs.job-name }} --timeout=7200s - - while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ inputs.job-name }} --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do - echo "Waiting for pods to start..." 
- sleep 20 - done - - # stream the logs + + # Wait for pods to be running + kubectl wait --for=condition=Ready \ + --selector=batch.kubernetes.io/job-name=${{ inputs.job-name }} \ + --timeout=600s pod + + # Stream logs kubectl logs --all-containers=true --all-pods=true --follow job/${{ inputs.job-name }} + post: | kubectl delete job ${{ inputs.job-name }} - \ No newline at end of file diff --git a/.github/workflows/_test_nccl.yaml b/.github/workflows/_test_nccl.yaml index 200ef3b37..3ccf55809 100644 --- a/.github/workflows/_test_nccl.yaml +++ b/.github/workflows/_test_nccl.yaml @@ -3,15 +3,18 @@ name: ~run NCCL tests on: workflow_call: inputs: + # Note that cuda-dl-base installs the NCCL tests, while the vanilla nvidia/cuda + # images do not; when JAX-Toolbox moves to using cuda-dl-base this workflow ought + # to be modified to test one of the JAX-Toolbox containers. CONTAINER: type: string - description: CUDA image to use as base + description: CUDA image to use as base, e.g. nvcr.io/nvidia/cuda-dl-base:24.11-cuda12.6-devel-ubuntu24.04 required: true permissions: - actions: write - contents: read - packages: write + actions: write # to cancel previous workflows + contents: read # to fetch code + packages: write # to upload container jobs: build-mpi-operator-compatible-base: @@ -20,13 +23,12 @@ jobs: ARCHITECTURE: amd64 ARTIFACT_NAME: artifact-mpi-operator-compatible-base-build BADGE_FILENAME: badge-mpi-operator-compatible-base-build - BUILD_DATE: 0000-00-00 + BUILD_DATE: 0000-00-00 # not important; this image is never published BASE_IMAGE: ${{ inputs.CONTAINER }} CONTAINER_NAME: mpi-operator-compatible-base DOCKERFILE: .github/container/Dockerfile.mpi-operator-compatible-base RUNNER_SIZE: small secrets: inherit - nccl-test: needs: build-mpi-operator-compatible-base strategy: @@ -36,19 +38,9 @@ jobs: env: BASE_IMAGE: ${{ needs.build-mpi-operator-compatible-base.outputs.DOCKER_TAG_FINAL }} TEST_NAME: ${{ matrix.test }} - steps: - name: Check out the repository uses: 
actions/checkout@v4 - - - name: Modify variables - id: var - shell: bash - run: | - export JOB_NAME="nccl-test-${{ github.run_id }}-${TEST_NAME//_/-}" - echo "JOB_NAME=${JOB_NAME}" >> $GITHUB_OUTPUT - echo "LAUNCHER_NAME=${JOB_NAME}-launcher" >> $GITHUB_OUTPUT - - name: Login to GitHub Container Registry uses: docker/login-action@v3 with: @@ -71,17 +63,29 @@ jobs: | .spec.mpiReplicaSpecs.Worker.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \ .github/eks-workflow-files/mpi-nccl-test.yml git diff .github/eks-workflow-files/mpi-nccl-test.yml - - - name: Submit & delete Kubernetes test - uses: ./.github/actions/submit-delete-k8s-job - with: - job-config-file: ".github/eks-workflow-files/mpi-nccl-test.yml" - job-name: ${{ steps.var.outputs.JOB_NAME }} # Fixed outputs instead of output - + - name: Submit Kubernetes job + run: kubectl apply -f .github/eks-workflow-files/mpi-nccl-test.yml + - name: Wait for Kubernetes job to start + # Note that this is *not* using JOB_NAME + run: | + # Launcher job is created eagerly, but suspended. Kueue un-suspends it when + # resources are available, but that is where there can be a long wait if the + # cluster is busy executing other jobs. 
+ kubectl wait --for=create job/${LAUNCHER_NAME} + kubectl wait --for=jsonpath='{.spec.suspend}=false' job/${LAUNCHER_NAME} --timeout=3600s + - name: Stream Kubernetes job output + # Note that this is *not* JOB_NAME + run: | + # Streaming logs will fail if the container/pod is still pending + while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${LAUNCHER_NAME} --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do + sleep 1 + done + # TODO: --all-containers=true --all-pods=true could make sense here, but it + # prefixes lines with a rather verbose tag + kubectl logs --follow job/${LAUNCHER_NAME} - name: Retrieve Kubernetes job status shell: bash -exo pipefail {0} run: | - LAUNCHER_NAME="${{ steps.var.outputs.LAUNCHER_NAME }}" while readarray -d : -t status < <(kubectl get job/${LAUNCHER_NAME} -o 'jsonpath={.status.failed}:{.status.succeeded}'); do failure=${status[0]:-0} success=${status[1]:-0} @@ -91,16 +95,26 @@ jobs: elif [[ ${total} == 1 ]]; then break else + # Shouldn't happen, maybe a sign the job being monitored does not have a + # single launcher pod? exit 255 fi done exit ${failure} - + # Provide more debug output in case of failure; note that some kinds of launch + # failure do not produce any log output. 
- name: Debug failed Kubernetes job if: failure() run: | - LAUNCHER_NAME="${{ steps.var.outputs.LAUNCHER_NAME }}" + # Provide better debug in case of launch failures that will not produce log output pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${LAUNCHER_NAME} -o name) if [[ -n "${pods}" ]]; then kubectl describe ${pods} fi + # Clean up in case of errors as well as success + - name: Delete Kubernetes job + if: always() + run: kubectl delete -f .github/eks-workflow-files/mpi-nccl-test.yml + - name: Delete GitHub Container Registry token + if: always() + run: kubectl delete secret ${TOKEN_NAME} \ No newline at end of file From 900ebb220eb63f1a55e5ad5845e6f212f5c5cc35 Mon Sep 17 00:00:00 2001 From: Steboss Date: Wed, 26 Feb 2025 13:29:49 +0000 Subject: [PATCH 67/89] Fix the nccl test --- .github/workflows/_test_nccl.yaml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.github/workflows/_test_nccl.yaml b/.github/workflows/_test_nccl.yaml index 3ccf55809..53dbcdaca 100644 --- a/.github/workflows/_test_nccl.yaml +++ b/.github/workflows/_test_nccl.yaml @@ -47,6 +47,16 @@ jobs: registry: ghcr.io username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} + - name: Create env vars + id: var + shell: bash + run: | + JOB_NAME="nccl-test-${{ github.run_id }}-${{ github.run_attempt }}-${TEST_NAME//_/-}" + LAUNCHER_NAME="${JOB_NAME}-launcher" + TOKEN_NAME="${JOB_NAME}-token" + # Make these available to later steps + echo "JOB_NAME=${JOB_NAME}" >> "$GITHUB_ENV" + echo "LAUNCHER_NAME=${LAUNCHER_NAME}" >> "$GITHUB_ENV" - name: K8s GHCR store and delete token id: store-token uses: ./.github/actions/store-delete-k8s-ghcr From a5b8e082ae2ad4f3acb9b18b3e0f9296f06933ef Mon Sep 17 00:00:00 2001 From: Steboss Date: Wed, 26 Feb 2025 14:09:34 +0000 Subject: [PATCH 68/89] do not add the shell --- .github/actions/store-delete-k8s-ghcr/action.yml | 2 +- .github/actions/submit-delete-k8s-job/action.yml | 9 ++++----- 2 files changed, 5 
insertions(+), 6 deletions(-) diff --git a/.github/actions/store-delete-k8s-ghcr/action.yml b/.github/actions/store-delete-k8s-ghcr/action.yml index e8761d570..1d3acec18 100644 --- a/.github/actions/store-delete-k8s-ghcr/action.yml +++ b/.github/actions/store-delete-k8s-ghcr/action.yml @@ -24,4 +24,4 @@ runs: --from-file=.dockerconfigjson=$HOME/.docker/config.json \ --type=kubernetes.io/dockerconfigjson post: | - kubectl delete secret ${{ inputs.token-name }} + kubectl delete secret ${{ steps.token.outputs.token-name }} diff --git a/.github/actions/submit-delete-k8s-job/action.yml b/.github/actions/submit-delete-k8s-job/action.yml index b58179326..5c91af1f4 100644 --- a/.github/actions/submit-delete-k8s-job/action.yml +++ b/.github/actions/submit-delete-k8s-job/action.yml @@ -1,9 +1,9 @@ -name: Delete K8s Job -description: Cleans up the Job resource to avoid leaving pods behind. +name: Submit & Delete K8s Job +description: Submit and delete a K8s job after its execution inputs: job-name: - description: The job name to delete + description: The job name required: true job-config-file: description: Path to the Kubernetes job YAML @@ -12,9 +12,8 @@ inputs: runs: using: "composite" steps: - - name: Delete Kubernetes job + - name: Submit and Delete Kubernetes job uses: ./.github/actions/with-post-step - shell: bash with: main: | echo "Submit K8s job" From 43f75a6ddfc1edf92caa7bcba5892d205d5d302f Mon Sep 17 00:00:00 2001 From: Steboss Date: Wed, 26 Feb 2025 15:07:18 +0000 Subject: [PATCH 69/89] correct typos --- .github/eks-workflow-files/axlearn/axlearn-fuji-model.yml | 2 +- .github/eks-workflow-files/axlearn/axlearn-job.yml | 3 +-- .github/workflows/_test_nccl.yaml | 5 +---- 3 files changed, 3 insertions(+), 7 deletions(-) diff --git a/.github/eks-workflow-files/axlearn/axlearn-fuji-model.yml b/.github/eks-workflow-files/axlearn/axlearn-fuji-model.yml index c6d9db3ab..de6f6c7ad 100644 --- a/.github/eks-workflow-files/axlearn/axlearn-fuji-model.yml +++ 
b/.github/eks-workflow-files/axlearn/axlearn-fuji-model.yml @@ -13,7 +13,7 @@ spec: spec: restartPolicy: Never containers: - - name: axlearn-fuji-3B + - name: axlearn-fuji image: PLACEHOLDER command: - bash diff --git a/.github/eks-workflow-files/axlearn/axlearn-job.yml b/.github/eks-workflow-files/axlearn/axlearn-job.yml index b1ac81909..f3998ef9f 100644 --- a/.github/eks-workflow-files/axlearn/axlearn-job.yml +++ b/.github/eks-workflow-files/axlearn/axlearn-job.yml @@ -22,8 +22,7 @@ spec: test-axlearn.sh \ --directory "." \ --output "/opt/output/" \ - --test-files "/opt/axlearn/axlearn/common/*_test.py" \ - --k8s + --test-files "/opt/axlearn/axlearn/common/*_test.py" sync wait diff --git a/.github/workflows/_test_nccl.yaml b/.github/workflows/_test_nccl.yaml index 53dbcdaca..76d66ab9a 100644 --- a/.github/workflows/_test_nccl.yaml +++ b/.github/workflows/_test_nccl.yaml @@ -124,7 +124,4 @@ jobs: # Clean up in case of errors as well as success - name: Delete Kubernetes job if: always() - run: kubectl delete -f .github/eks-workflow-files/mpi-nccl-test.yml - - name: Delete GitHub Container Registry token - if: always() - run: kubectl delete secret ${TOKEN_NAME} \ No newline at end of file + run: kubectl delete -f .github/eks-workflow-files/mpi-nccl-test.yml \ No newline at end of file From e3a9e4e78f0e8281dd1ccbea97eb7ea2c02c3e4c Mon Sep 17 00:00:00 2001 From: Steboss Date: Wed, 26 Feb 2025 18:40:40 +0000 Subject: [PATCH 70/89] fix the fuji eks model --- .../axlearn/axlearn-fuji-model.yml | 40 ++++++++++++++++++- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/.github/eks-workflow-files/axlearn/axlearn-fuji-model.yml b/.github/eks-workflow-files/axlearn/axlearn-fuji-model.yml index de6f6c7ad..e2662d040 100644 --- a/.github/eks-workflow-files/axlearn/axlearn-fuji-model.yml +++ b/.github/eks-workflow-files/axlearn/axlearn-fuji-model.yml @@ -13,14 +13,50 @@ spec: spec: restartPolicy: Never containers: - - name: axlearn-fuji + - name: axlearn-fuji-model 
image: PLACEHOLDER command: - bash - -xo - pipefail - -c - - "\nBASEDIR=\"/opt/axlearn\"\nCONFIG=\"fuji-3B-v3-flash-single-host\"\nBASE_XLA_FLAGS=${BASE_XLA_FLAGS:---xla_gpu_enable_latency_hiding_scheduler=true\n --xla_gpu_enable_highest_priority_async_stream=true\n --xla_gpu_all_reduce_combine_threshold_bytes=1073741824\n --xla_gpu_all_gather_combine_threshold_bytes=1073741824\n --xla_gpu_reduce_scatter_combine_threshold_bytes=1073741824\n --xla_gpu_enable_pipelined_all_gather=true\n --xla_gpu_enable_pipelined_reduce_scatter=true\n --xla_gpu_enable_pipelined_all_reduce=true\n --xla_gpu_enable_while_loop_double_buffering=true\n --xla_gpu_enable_triton_gemm=false\n --xla_gpu_enable_all_gather_combine_by_dim=false\n --xla_gpu_enable_reduce_scatter_combine_by_dim=false\n --xla_disable_hlo_passes=rematerialization}\n\nexport XLA_FLAGS=\"$BASE_XLA_FLAGS ${XLA_FLAGS:-}\" \n\nLOG_DIR=${BASEDIR}/logs\nTRAINER_DIR=${LOG_DIR}/${CONFIG}${POSTFIX}-eks/trainer-dir\nmkdir -p ${TRAINER_DIR}\n\npython3 -m axlearn.common.launch_trainer_main \\\n --module=text.gpt.c4_trainer \\\n --config=${CONFIG} \\\n --trainer_dir=${TRAINER_DIR} \\\n --data_dir=gs://axlearn-public/tensorflow_datasets \\\n --jax_backend=gpu \n" + - | + BASEDIR="/opt/axlearn" + CONFIG="fuji-3B-v3-flash-single-host" + HLO_DUMP=0 + POSTFIX="" + + AR_THRESHOLD=1073741824 + AG_THRESHOLD=8589934592 + RS_THRESHOLD=8589934592 + BASE_XLA_FLAGS=${BASE_XLA_FLAGS:---xla_gpu_enable_latency_hiding_scheduler=true + --xla_gpu_enable_highest_priority_async_stream=true + --xla_gpu_all_reduce_combine_threshold_bytes=1073741824 + --xla_gpu_all_gather_combine_threshold_bytes=1073741824 + --xla_gpu_reduce_scatter_combine_threshold_bytes=1073741824 + --xla_gpu_enable_pipelined_all_gather=true + --xla_gpu_enable_pipelined_reduce_scatter=true + --xla_gpu_enable_pipelined_all_reduce=true + --xla_gpu_enable_while_loop_double_buffering=true + --xla_gpu_enable_triton_gemm=false + --xla_gpu_enable_all_gather_combine_by_dim=false + 
--xla_gpu_enable_reduce_scatter_combine_by_dim=false + --xla_disable_hlo_passes=rematerialization} + + export XLA_FLAGS="$BASE_XLA_FLAGS ${XLA_FLAGS:-}" + export TF_GPU_ALLOCATOR=cuda_malloc_async + + LOG_DIR=${BASEDIR}/logs + TRAINER_DIR=${LOG_DIR}/${CONFIG}${POSTFIX}-eks/trainer-dir + mkdir -p ${TRAINER_DIR} + + + python3 -m axlearn.common.launch_trainer_main \ + --module=text.gpt.c4_trainer \ + --config=${CONFIG} \ + --trainer_dir=${TRAINER_DIR} \ + --data_dir=gs://axlearn-public/tensorflow_datasets \ + --jax_backend=gpu resources: limits: nvidia.com/gpu: 8 From 785f8ae6d3e8bce5524763e46128323d738cd001 Mon Sep 17 00:00:00 2001 From: Steboss Date: Thu, 27 Feb 2025 08:58:11 +0000 Subject: [PATCH 71/89] remove k8s --- .github/container/test-axlearn.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/container/test-axlearn.sh b/.github/container/test-axlearn.sh index b9c3f2dfe..579582a80 100755 --- a/.github/container/test-axlearn.sh +++ b/.github/container/test-axlearn.sh @@ -22,7 +22,6 @@ usage() { DIR='axlearn/axlearn/common' TEST_FILES=() OUTPUT_DIRECTORY='' -K8S=false # Parse args manually while [[ $# -gt 0 ]]; do From 7c2da3fe3aad525c39709e34bd5148d427eb8728 Mon Sep 17 00:00:00 2001 From: Steboss Date: Thu, 27 Feb 2025 09:41:57 +0000 Subject: [PATCH 72/89] remove test-fuji.sh, test with slurm --- .github/container/Dockerfile.axlearn | 2 +- .github/container/test-fuji.sh | 33 ---------------------------- 2 files changed, 1 insertion(+), 34 deletions(-) delete mode 100755 .github/container/test-fuji.sh diff --git a/.github/container/Dockerfile.axlearn b/.github/container/Dockerfile.axlearn index 8c609d08d..b34923e29 100644 --- a/.github/container/Dockerfile.axlearn +++ b/.github/container/Dockerfile.axlearn @@ -38,7 +38,7 @@ EOF ## Add test script to the path ############################################################################### -ADD test-axlearn.sh test-fuji.sh /usr/local/bin/ +ADD test-axlearn.sh /usr/local/bin/ 
############################################################################### ## Install accumulated packages from the base image and the previous stage diff --git a/.github/container/test-fuji.sh b/.github/container/test-fuji.sh deleted file mode 100755 index 9018f37de..000000000 --- a/.github/container/test-fuji.sh +++ /dev/null @@ -1,33 +0,0 @@ -#! /bin/bash -BASEDIR="/opt/host/" -CONFIG="fuji-1B-v3-flash" -POSTFIX=${POSTFIX:=""} - -BASE_XLA_FLAGS=${BASE_XLA_FLAGS:---xla_gpu_enable_latency_hiding_scheduler=true - --xla_gpu_enable_highest_priority_async_stream=true - --xla_gpu_all_reduce_combine_threshold_bytes=1073741824 - --xla_gpu_all_gather_combine_threshold_bytes=1073741824 - --xla_gpu_reduce_scatter_combine_threshold_bytes=1073741824 - --xla_gpu_enable_pipelined_all_gather=true - --xla_gpu_enable_pipelined_reduce_scatter=true - --xla_gpu_enable_pipelined_all_reduce=true - --xla_gpu_enable_while_loop_double_buffering=true - --xla_gpu_enable_triton_gemm=false - --xla_gpu_enable_all_gather_combine_by_dim=false - --xla_gpu_enable_reduce_scatter_combine_by_dim=false - --xla_disable_hlo_passes=rematerialization} - -export XLA_FLAGS="$BASE_XLA_FLAGS ${XLA_FLAGS:-}" - -LOG_DIF=${BASEDIR}/logs -TRAINER_DIR=${LOG_DIF}/${CONFIG}_N${SLURM_JOB_NUM_NODES}_n${SLURM_NTASKS}/trainer-logs -mkdir -p ${TRAINER_DIR} - -#test "${WITH_MP}" == 1 && export MP_ARGS="--num_processes=${SLURM_NTASKS} --distributed_coordinator=${SLURM_LAUNCH_NODE_IPADDR}:12345 --process_id=${SLURM_PROCID}" - -python3 -m axlearn.common.launch_trainer_main \ - --module=text.gpt.c4_trainer \ - --config=${CONFIG} \ - --trainer_dir=${TRAINER_DIR} \ - --data_dir=gs://axlearn-public/tensorflow_datasets \ - --jax_backend=gpu From d2823b8d74b51ae045dfd00ceab575581cee52e3 Mon Sep 17 00:00:00 2001 From: Steboss Date: Thu, 27 Feb 2025 16:32:52 +0000 Subject: [PATCH 73/89] try to not install seqio for tensorflow --- .github/container/Dockerfile.axlearn | 1 - 1 file changed, 1 deletion(-) diff --git 
a/.github/container/Dockerfile.axlearn b/.github/container/Dockerfile.axlearn index b34923e29..8f0ceabac 100644 --- a/.github/container/Dockerfile.axlearn +++ b/.github/container/Dockerfile.axlearn @@ -21,7 +21,6 @@ aqtp==0.8.2 einops==0.8.0 nltk==3.7 portpicker==1.6.0 -seqio==0.0.18 protobuf==3.20.3 pytest>=7.4.3 REQUIREMENTS From 5afc8d9a4011cc4cfa09062d37fda2bea5a93d80 Mon Sep 17 00:00:00 2001 From: Steboss Date: Thu, 27 Feb 2025 17:23:23 +0000 Subject: [PATCH 74/89] recommit seqio --- .github/container/Dockerfile.axlearn | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/container/Dockerfile.axlearn b/.github/container/Dockerfile.axlearn index 8f0ceabac..b34923e29 100644 --- a/.github/container/Dockerfile.axlearn +++ b/.github/container/Dockerfile.axlearn @@ -21,6 +21,7 @@ aqtp==0.8.2 einops==0.8.0 nltk==3.7 portpicker==1.6.0 +seqio==0.0.18 protobuf==3.20.3 pytest>=7.4.3 REQUIREMENTS From bbe8c3bff04f91130675922d720705a664188399 Mon Sep 17 00:00:00 2001 From: Steboss Date: Fri, 28 Feb 2025 09:28:23 +0000 Subject: [PATCH 75/89] substitute tensorflow with cpu one --- .github/container/pip-finalize.sh | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.github/container/pip-finalize.sh b/.github/container/pip-finalize.sh index 6d8ceac9b..56013ac78 100755 --- a/.github/container/pip-finalize.sh +++ b/.github/container/pip-finalize.sh @@ -46,6 +46,15 @@ if [[ $(echo -n "$unpinned_vcs_dependencies" | wc -l) -gt 0 ]]; then exit 1 fi +# Replace any tensorflow==X with tensorflow-cpu==X in requirements.txt +sed -i 's/^tensorflow==\([0-9.*]\+\)$/tensorflow-cpu==\1/' requirements.txt + +# Replace any torch==Y with torch==Y+cpu in requirements.txt +sed -i 's/^torch==\([0-9.*]\+\)$/torch==\1+cpu/' requirements.txt + +# Add the --find-links option for PyTorch wheels +echo "--find-links https://download.pytorch.org/whl/torch" >> requirements.txt + # --no-deps is required since conflicts can still appear during pip-sync pip-sync --pip-args '--no-deps --src /opt' 
requirements.txt From f711efc38d6918a2131faa6cc6a1fc86f9cf790c Mon Sep 17 00:00:00 2001 From: Steboss Date: Fri, 28 Feb 2025 09:52:18 +0000 Subject: [PATCH 76/89] fix the test --- .github/container/test-axlearn.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/container/test-axlearn.sh b/.github/container/test-axlearn.sh index 579582a80..a46bc0e83 100755 --- a/.github/container/test-axlearn.sh +++ b/.github/container/test-axlearn.sh @@ -93,8 +93,8 @@ cd "$DIR" || exit 1 echo "Running tests..." -pip install torch --extra-index-url https://download.pytorch.org/whl/cpu -pip install transformers scikit-learn timm +pip install transformers --no-deps +pip install scikit-learn timm if [ "${#TEST_FILES[@]}" -eq 0 ]; then From faf0b83ec0afe32d05418f3c37a4df3a39e550fa Mon Sep 17 00:00:00 2001 From: Steboss Date: Fri, 28 Feb 2025 10:38:35 +0000 Subject: [PATCH 77/89] fix installation process --- .github/container/Dockerfile.axlearn | 6 ------ .github/container/pip-finalize.sh | 15 ++++++--------- .github/container/test-axlearn.sh | 4 ++-- 3 files changed, 8 insertions(+), 17 deletions(-) diff --git a/.github/container/Dockerfile.axlearn b/.github/container/Dockerfile.axlearn index b34923e29..ac73d07c6 100644 --- a/.github/container/Dockerfile.axlearn +++ b/.github/container/Dockerfile.axlearn @@ -25,12 +25,6 @@ seqio==0.0.18 protobuf==3.20.3 pytest>=7.4.3 REQUIREMENTS - # Only append "tensorflow-cpu" if running on x86_64 - if [ "$(uname -m)" = "x86_64" ]; then - echo "tensorflow-cpu" >> /opt/pip-tools.d/requirements-axlearn.in - else - echo "Skipping TF on $(uname -m)" - fi EOF diff --git a/.github/container/pip-finalize.sh b/.github/container/pip-finalize.sh index 56013ac78..285da565c 100755 --- a/.github/container/pip-finalize.sh +++ b/.github/container/pip-finalize.sh @@ -46,15 +46,12 @@ if [[ $(echo -n "$unpinned_vcs_dependencies" | wc -l) -gt 0 ]]; then exit 1 fi -# Replace any tensorflow==X with tensorflow-cpu==X in requirements.txt -sed 
-i 's/^tensorflow==\([0-9.*]\+\)$/tensorflow-cpu==\1/' requirements.txt - -# Replace any torch==Y with torch==Y+cpu in requirements.txt -sed -i 's/^torch==\([0-9.*]\+\)$/torch==\1+cpu/' requirements.txt - -# Add the --find-links option for PyTorch wheels -echo "--find-links https://download.pytorch.org/whl/torch" >> requirements.txt - +# Replace any tensorflow==X with tensorflow-cpu==X in requirements.txt only on amd64 +if [ "$(uname -m)" = "x86_64" ]; then + sed -i 's/^tensorflow==\([0-9.*]\+\)$/tensorflow-cpu==\1/' requirements.txt +else + echo "Skipping TF on $(uname -m)" +fi # --no-deps is required since conflicts can still appear during pip-sync pip-sync --pip-args '--no-deps --src /opt' requirements.txt diff --git a/.github/container/test-axlearn.sh b/.github/container/test-axlearn.sh index a46bc0e83..d1993cc03 100755 --- a/.github/container/test-axlearn.sh +++ b/.github/container/test-axlearn.sh @@ -93,8 +93,8 @@ cd "$DIR" || exit 1 echo "Running tests..." -pip install transformers --no-deps -pip install scikit-learn timm +pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu +pip install timm transformers scikit-learn if [ "${#TEST_FILES[@]}" -eq 0 ]; then From b2579cb0f03b94a387e80e05fc6687ca77477694 Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 3 Mar 2025 11:29:56 +0000 Subject: [PATCH 78/89] @olupton comments work --- .../actions/submit-delete-k8s-job/action.yml | 2 +- .../axlearn/axlearn-job.yml | 8 ++- .github/workflows/_ci.yaml | 17 ++--- .github/workflows/_test_nccl.yaml | 2 +- README.md | 25 +++++++ rosetta/rosetta/projects/axlearn/README.md | 59 +++++++++++++++ .../projects/axlearn/scripts/eks-fuji.yaml | 66 +++++++++++++++++ .../projects/axlearn/scripts/multinode.py | 71 +++++++++++++++++++ 8 files changed, 238 insertions(+), 12 deletions(-) create mode 100644 rosetta/rosetta/projects/axlearn/README.md create mode 100644 rosetta/rosetta/projects/axlearn/scripts/eks-fuji.yaml create mode 100644 
rosetta/rosetta/projects/axlearn/scripts/multinode.py diff --git a/.github/actions/submit-delete-k8s-job/action.yml b/.github/actions/submit-delete-k8s-job/action.yml index 5c91af1f4..dbeabe668 100644 --- a/.github/actions/submit-delete-k8s-job/action.yml +++ b/.github/actions/submit-delete-k8s-job/action.yml @@ -34,4 +34,4 @@ runs: kubectl logs --all-containers=true --all-pods=true --follow job/${{ inputs.job-name }} post: | - kubectl delete job ${{ inputs.job-name }} + kubectl delete -f "${{ inputs.job-config-file }}" diff --git a/.github/eks-workflow-files/axlearn/axlearn-job.yml b/.github/eks-workflow-files/axlearn/axlearn-job.yml index f3998ef9f..d27ee53d5 100644 --- a/.github/eks-workflow-files/axlearn/axlearn-job.yml +++ b/.github/eks-workflow-files/axlearn/axlearn-job.yml @@ -37,7 +37,7 @@ spec: - name: upload image: amazon/aws-cli env: - - name: TEST_DATE + - name: RUN_ID value: PLACEHOLDER command: - sh @@ -47,7 +47,11 @@ spec: sleep 5 done # Upload to S3 bucket - aws s3 cp /opt/output/summary.txt s3://jax-toolbox-eks-output/axlearn/${TEST_DATE}/summary.txt + aws s3 cp /opt/output/summary.txt s3://jax-toolbox-eks-output/axlearn/${RUN_ID}/summary.txt + # Zip the results of all the tests + tar -czf test_logs.tar.gz /opt/output + # Upload logs to S3 bucket + aws s3 cp /opt/output/summary.txt s3://jax-toolbox-eks-output/axlearn/${RUN_ID}/test_logs.tar.gz volumeMounts: - name: output mountPath: /opt/output diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index bf530d533..eaaf82b21 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -688,11 +688,12 @@ jobs: - name: Download logs from S3 id: log-s3 run: | - mkdir -p /tmp/axlearn-output - aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/summary.txt /tmp/axlearn-output/ + mkdir -p axlearn-output + aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/summary.txt axlearn-output/ + aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ 
github.run_id }}/test_logs.tar.gz axlearn-output/ - passed_tests=$(grep -c ": PASSED" /tmp/axlearn-output/summary.txt || true) - failed_tests=$(grep -c ": FAILED" /tmp/axlearn-output/summary.txt || true) + passed_tests=$(grep -c ": PASSED" axlearn-output/summary.txt || true) + failed_tests=$(grep -c ": FAILED" axlearn-output/summary.txt || true) total_tests=$((failed_tests + passed_tests)) echo "Passed tests: $passed_tests" @@ -733,7 +734,7 @@ jobs: message="Passed $passed_tests out of $total_tests." \ color=$badge_color \ to_json schemaVersion label message color \ - > "badge-axlearn-test" + > badge-axlearn-test.json - name: Upload artifacts if: ${{ !cancelled() }} @@ -742,8 +743,8 @@ jobs: name: "artifact-axlearn-test" path: | sitrep.json - "badge-axlearn-test" - summary.txt + badge-axlearn-test.json + axlearn-output/* # the fuji test will run for 20 minutes only, as per 2025-02-24 # is not possible to set the `max_steps` value @@ -779,5 +780,5 @@ jobs: uses: ./.github/actions/submit-delete-k8s-job with: job-config-file: ".github/eks-workflow-files/axlearn/axlearn-fuji-model.yml" - job-name: ${{ env.JOB_NAME }} + job-name: ${{ env.JOB_NAME }}https://docs.google.com/spreadsheets/d/12JIThodWLhf-H7Ob9p3CGZHLjKEPp17ogp9Do5Ofa6U/edit?gid=1030128481#gid=1030128481 diff --git a/.github/workflows/_test_nccl.yaml b/.github/workflows/_test_nccl.yaml index 76d66ab9a..f8b328b76 100644 --- a/.github/workflows/_test_nccl.yaml +++ b/.github/workflows/_test_nccl.yaml @@ -124,4 +124,4 @@ jobs: # Clean up in case of errors as well as success - name: Delete Kubernetes job if: always() - run: kubectl delete -f .github/eks-workflow-files/mpi-nccl-test.yml \ No newline at end of file + run: kubectl delete -f .github/eks-workflow-files/mpi-nccl-test.yml diff --git a/README.md b/README.md index 648208205..83053215e 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,7 @@ We support and test the following JAX frameworks and model architectures. 
More d | [t5x](./rosetta/rosetta/projects/imagen) | Imagen | pre-training | `ghcr.io/nvidia/t5x:imagen-2023-10-02.v3` | | [big vision](./rosetta/rosetta/projects/paligemma) | PaliGemma | fine-tuning, evaluation | `ghcr.io/nvidia/jax:gemma` | | levanter | GPT, LLaMA, MPT, Backpacks | pretraining, fine-tuning | `ghcr.io/nvidia/jax:levanter` | +| axlearn | Fuji | pretraining | `gchr.io/nvidia/jax:axlearn` | # Build Pipeline Status @@ -248,6 +249,30 @@ We support and test the following JAX frameworks and model architectures. More d + + + + + +
+ + + + + ghcr.io/nvidia/jax:axlearn + + + + +
+ + + +
+ + + +
diff --git a/rosetta/rosetta/projects/axlearn/README.md b/rosetta/rosetta/projects/axlearn/README.md new file mode 100644 index 000000000..f4c8f6679 --- /dev/null +++ b/rosetta/rosetta/projects/axlearn/README.md @@ -0,0 +1,59 @@ +# AXLearn +[AXLearn](https://github.com/apple/axlearn) is a deep learning design framework, built on top of JAX and XLA, to support the development of large-scale models. + + +## Hardware and Software Specifications + +Functionality have been validated on AWS p5.48xlarge EKS cluster (8x H100 80G); please refer to the [Configs](#configs) section below for some initial configs and performance numbers. We will continue to populate it with more models and configs. We provide both singlenode and multinode pre-training support. If running on a machine with less than 80G memory, some of the default configurations may run out of memory; if you run out of memory and have more GPUs available, increase your GPU count and decrease your batch size per GPU. + + +## Containers +We provide a fully built and ready-to-use multi-arch container, bleeding edge: `ghcr.io/nvidia/jax:axlearn`. We also provide nightly dated images with the naming pattern `ghcr.io/nvidia/jax:axlearn-YYYY-MM-DD`, but we encourage you to use the latest ones for the best performance. + +*Note*: All paths mentioned in subsequent sections are relative to the top-level directory of the AXLearn repository. When working interactively with containers, make sure you navigate to `/opt/axlearn` before running any commmands. + +## Launching a container +Use the following command to launch a container: +``` +docker run -ti --gpus=all --net=host --ipc=host -v :/opt/axlearn/workspace -w /opt/axlearn /bin/bash +``` +where `WORKSPACE_PATH` is the path to the directory where you would like to store any persistent files and `container` is the name of the maxtext container. You can additionally add dataset and vocab paths with the `-v` flag. 
+ +## Running a Fuji model +### Quick Runs + +#### EKS Single node: `fuji-3B-v3-flash-single-host` +Fuji models are defined with 1B, 3B, 7B or 70B parameters. In this example, we deploy the training for a Fuji-3B model, that uses flash attention, and runs on a single host. [Here](scripts/eks-fuji.yaml) we provide an example deployment file. The core point of the deployment is: +```bash +python3 -m axlearn.common.launch_trainer_main \ + --module=text.gpt.c4_trainer \ + --config=${CONFIG} \ + --trainer_dir=${TRAINER_DIR} \ + --data_dir=gs://axlearn-public/tensorflow_datasets \ + --jax_backend=gpu +``` +Where `CONFIG="fuji-3B-v3-flash-single-host`. The input dataset is the public tensorflow [C4 dataset](https://www.tensorflow.org/datasets/catalog/c4). + +#### Running a multinode job for `fuji-XB-v2-flash` + +For running a multinode job we provide a [custom example](scripts/multinode.py). The code access AXLearn directly, it allows to specify a custom dataset, the number of GPUs to use, the global batch size, as well as the `max_sequence_length`. + + +## XLA Flags +The [GPU Performance document](../../../docs/GPU_performance.md) provides a detailed description of the XLA flags that can be set to optimize performance. These are the recommended XLA flags to get good performance for AXLearn. 
+ +``` +XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true + --xla_gpu_enable_triton_gemm=false + --xla_gpu_enable_command_buffer= + --xla_gpu_all_reduce_combine_threshold_bytes=1073741824 + --xla_gpu_all_gather_combine_threshold_bytes=1073741824 + --xla_gpu_reduce_scatter_combine_threshold_bytes=1073741824 + --xla_gpu_enable_pipelined_all_gather=true + --xla_gpu_enable_pipelined_reduce_scatter=true + --xla_gpu_enable_pipelined_all_reduce=true + --xla_gpu_enable_while_loop_double_buffering=true + --xla_gpu_enable_all_gather_combine_by_dim=false + --xla_gpu_enable_reduce_scatter_combine_by_dim=false + --xla_disable_hlo_passes=rematerialization" +``` \ No newline at end of file diff --git a/rosetta/rosetta/projects/axlearn/scripts/eks-fuji.yaml b/rosetta/rosetta/projects/axlearn/scripts/eks-fuji.yaml new file mode 100644 index 000000000..8d24a1658 --- /dev/null +++ b/rosetta/rosetta/projects/axlearn/scripts/eks-fuji.yaml @@ -0,0 +1,66 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: axlearn-fuji + # Specify any labels for running on a dedicated queue +spec: + completions: 1 + parallelism: 1 + template: + spec: + restartPolicy: Never + containers: + - name: axlearn-fuji-model + image: gchr.io/nvidia/jax:axlearn + command: + - bash + - -xo + - pipefail + - -c + - | + BASEDIR="/opt/axlearn" + CONFIG="fuji-3B-v3-flash-single-host" + HLO_DUMP=0 + POSTFIX="" + + AR_THRESHOLD=1073741824 + AG_THRESHOLD=8589934592 + RS_THRESHOLD=8589934592 + BASE_XLA_FLAGS=${BASE_XLA_FLAGS:---xla_gpu_enable_latency_hiding_scheduler=true + --xla_gpu_enable_highest_priority_async_stream=true + --xla_gpu_all_reduce_combine_threshold_bytes=1073741824 + --xla_gpu_all_gather_combine_threshold_bytes=1073741824 + --xla_gpu_reduce_scatter_combine_threshold_bytes=1073741824 + --xla_gpu_enable_pipelined_all_gather=true + --xla_gpu_enable_pipelined_reduce_scatter=true + --xla_gpu_enable_pipelined_all_reduce=true + --xla_gpu_enable_while_loop_double_buffering=true + 
--xla_gpu_enable_triton_gemm=false + --xla_gpu_enable_all_gather_combine_by_dim=false + --xla_gpu_enable_reduce_scatter_combine_by_dim=false + --xla_disable_hlo_passes=rematerialization} + + export XLA_FLAGS="$BASE_XLA_FLAGS ${XLA_FLAGS:-}" + export TF_GPU_ALLOCATOR=cuda_malloc_async + + LOG_DIR=${BASEDIR}/logs + TRAINER_DIR=${LOG_DIR}/${CONFIG}${POSTFIX}-eks/trainer-dir + mkdir -p ${TRAINER_DIR} + + + python3 -m axlearn.common.launch_trainer_main \ + --module=text.gpt.c4_trainer \ + --config=${CONFIG} \ + --trainer_dir=${TRAINER_DIR} \ + --data_dir=gs://axlearn-public/tensorflow_datasets \ + --jax_backend=gpu + resources: + limits: + nvidia.com/gpu: 8 + volumeMounts: + - name: output + mountPath: /opt/output + # specify any image secret if needed + volumes: + - name: output + emptyDir: {} diff --git a/rosetta/rosetta/projects/axlearn/scripts/multinode.py b/rosetta/rosetta/projects/axlearn/scripts/multinode.py new file mode 100644 index 000000000..0107ebddc --- /dev/null +++ b/rosetta/rosetta/projects/axlearn/scripts/multinode.py @@ -0,0 +1,71 @@ +import os + +from absl import app, flags +from axlearn.common.launch_trainer import run_trainer +from axlearn.common.config import config_for_function +from axlearn.experiments.text.gpt import c4_trainer +from axlearn.common.trainer import SpmdTrainer + +FLAGS = flags.FLAGS +FLAGS.set_default("module", "text.gpt.c4_trainer") +FLAGS.set_default("config", "fuji-7B-v2-flash") # Set the model +FLAGS.set_default("trainer_dir", "/opt/host/axlearn-checkpoints") # Set the trainer directory + +def main(_): + axlearn_path = "/opt/axlearn" + os.environ["PYTHONPATH"] = f"{axlearn_path}:{os.environ.get('PYTHONPATH', '')}" + + n_gpus = 16 # This can be also an env variable + # Base XLA flags + base_flags = [ + "--xla_gpu_enable_latency_hiding_scheduler=true", + "--xla_gpu_enable_command_buffer=", + "--xla_gpu_enable_highest_priority_async_stream=true", + "--xla_gpu_all_reduce_combine_threshold_bytes=1073741824", + 
"--xla_gpu_all_gather_combine_threshold_bytes=1073741824", + "--xla_gpu_reduce_scatter_combine_threshold_bytes=1073741824", + "--xla_gpu_enable_pipelined_all_gather=true", + "--xla_gpu_enable_pipelined_reduce_scatter=true", + "--xla_gpu_enable_pipelined_all_reduce=true", + "--xla_gpu_enable_while_loop_double_buffering=true", + "--xla_gpu_enable_triton_gemm=false", + "--xla_gpu_enable_all_gather_combine_by_dim=false", + "--xla_gpu_enable_reduce_scatter_combine_by_dim=false", + "--xla_disable_hlo_passes=rematerialization", + ] + # Get existing flags from environment with proper fallback. + existing_xla_flags = os.environ.get("XLA_FLAGS", "").split() + # XLA flags + os.environ.update({ + "XLA_FLAGS": " ".join([ + *base_flags, + *existing_xla_flags + ])}) + + os.environ.update({ + "DATA_DIR":"gs://axlearn-public/tensorflow_datasets", # Set up your input dataset + "NUM_PROCESSES":f"{n_gpus}", + "DISTRIBUTED_COORDINATOR":"127.0.0.1:8080", + "PROCESS_ID":"0", + }) + + # Raw config + config_fn = c4_trainer.named_trainer_configs()[FLAGS.config] + trainer_config: SpmdTrainer.Config = config_for_function(config_fn).fn() + + trainer_config.max_step = 100 # Set the max number of steps to run + trainer_config.dir = "/opt/host/axlearn-checkpoints" # Use 'dir' instead of 'model_dir' + trainer_config.input.input_dispatcher.global_logical_batch_size = 8 # Tune the batch size for training + #trainer_config.input.source.max_sequence_length = 2048 # Tune the max sequence length if running in OOM + trainer_config.checkpointer.save_policy.n = 500 # Save every 500 steps + trainer_config.checkpointer.keep_every_n_steps = 500 # Keep checkpoints + trainer_config.summary_writer.write_every_n_steps = 100 # Log every 100 steps + + run_trainer( + trainer_config=trainer_config, + ) + + +if __name__ == "__main__": + from absl import app + app.run(main) From fc64bbd5e6005025771ace9ef8bafebd99d621c1 Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 3 Mar 2025 12:47:21 +0000 Subject: [PATCH 79/89] 
fix typo --- .github/workflows/_ci.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index eaaf82b21..82e1bdb35 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -780,5 +780,5 @@ jobs: uses: ./.github/actions/submit-delete-k8s-job with: job-config-file: ".github/eks-workflow-files/axlearn/axlearn-fuji-model.yml" - job-name: ${{ env.JOB_NAME }}https://docs.google.com/spreadsheets/d/12JIThodWLhf-H7Ob9p3CGZHLjKEPp17ogp9Do5Ofa6U/edit?gid=1030128481#gid=1030128481 + job-name: ${{ env.JOB_NAME }} From 7f186cc72c94bec0ef61204ffdfd23ebc5c46b4e Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 3 Mar 2025 15:17:25 +0000 Subject: [PATCH 80/89] fix readme, and copy of zip file, and xla flags --- .../axlearn/axlearn-fuji-model.yml | 4 -- .../axlearn/axlearn-job.yml | 2 +- docs/frameworks/axlearn/README.md | 40 +++++++++++ rosetta/rosetta/projects/axlearn/README.md | 59 --------------- .../projects/axlearn/scripts/eks-fuji.yaml | 66 ----------------- .../projects/axlearn/scripts/multinode.py | 71 ------------------- 6 files changed, 41 insertions(+), 201 deletions(-) create mode 100644 docs/frameworks/axlearn/README.md delete mode 100644 rosetta/rosetta/projects/axlearn/README.md delete mode 100644 rosetta/rosetta/projects/axlearn/scripts/eks-fuji.yaml delete mode 100644 rosetta/rosetta/projects/axlearn/scripts/multinode.py diff --git a/.github/eks-workflow-files/axlearn/axlearn-fuji-model.yml b/.github/eks-workflow-files/axlearn/axlearn-fuji-model.yml index e2662d040..a36411d73 100644 --- a/.github/eks-workflow-files/axlearn/axlearn-fuji-model.yml +++ b/.github/eks-workflow-files/axlearn/axlearn-fuji-model.yml @@ -30,7 +30,6 @@ spec: AG_THRESHOLD=8589934592 RS_THRESHOLD=8589934592 BASE_XLA_FLAGS=${BASE_XLA_FLAGS:---xla_gpu_enable_latency_hiding_scheduler=true - --xla_gpu_enable_highest_priority_async_stream=true --xla_gpu_all_reduce_combine_threshold_bytes=1073741824 
--xla_gpu_all_gather_combine_threshold_bytes=1073741824 --xla_gpu_reduce_scatter_combine_threshold_bytes=1073741824 @@ -38,9 +37,6 @@ spec: --xla_gpu_enable_pipelined_reduce_scatter=true --xla_gpu_enable_pipelined_all_reduce=true --xla_gpu_enable_while_loop_double_buffering=true - --xla_gpu_enable_triton_gemm=false - --xla_gpu_enable_all_gather_combine_by_dim=false - --xla_gpu_enable_reduce_scatter_combine_by_dim=false --xla_disable_hlo_passes=rematerialization} export XLA_FLAGS="$BASE_XLA_FLAGS ${XLA_FLAGS:-}" diff --git a/.github/eks-workflow-files/axlearn/axlearn-job.yml b/.github/eks-workflow-files/axlearn/axlearn-job.yml index d27ee53d5..8d0eda9e2 100644 --- a/.github/eks-workflow-files/axlearn/axlearn-job.yml +++ b/.github/eks-workflow-files/axlearn/axlearn-job.yml @@ -51,7 +51,7 @@ spec: # Zip the results of all the tests tar -czf test_logs.tar.gz /opt/output # Upload logs to S3 bucket - aws s3 cp /opt/output/summary.txt s3://jax-toolbox-eks-output/axlearn/${RUN_ID}/test_logs.tar.gz + aws s3 cp test_logs.tar.gz s3://jax-toolbox-eks-output/axlearn/${RUN_ID}/test_logs.tar.gz volumeMounts: - name: output mountPath: /opt/output diff --git a/docs/frameworks/axlearn/README.md b/docs/frameworks/axlearn/README.md new file mode 100644 index 000000000..ad7172ca7 --- /dev/null +++ b/docs/frameworks/axlearn/README.md @@ -0,0 +1,40 @@ +# AXLearn +[AXLearn](https://github.com/apple/axlearn) is a deep learning design framework, built on top of JAX and XLA, to support the development of large-scale models. + + +## Hardware and Software Specifications + +The functionality have been validated on AWS p5.48xlarge EKS cluster (8x H100 80G). + + +## Containers +We provide a multi-architecture container that is regularly updated. Use these containers to avoid dependency and environment issues. 
+- Latest container: ghcr.io/nvidia/jax:axlearn
+- Nightly dated container: ghcr.io/nvidia/jax:axlearn-YYYY-MM-DD
+
+When you start an interactive session:
+
+- Navigate to `/opt/axlearn` inside the container.
+- Place your persistent files in a mounted directory (e.g. `/opt/axlearn/workspace`).
+
+## Launching a container
+Use the following command to launch a container:
+```bash
+docker run -ti --gpus=all --net=host --ipc=host -v <WORKSPACE_PATH>:/opt/axlearn/workspace -w /opt/axlearn <container> /bin/bash
+```
+where `WORKSPACE_PATH` is the path to the directory where you would like to store any persistent files and `container` is the name of the AXLearn container. You can additionally add dataset and vocab paths with the `-v` flag.
+
+## Example: training `fuji-3B-v3-flash-single-host` on EKS
+[Here is the YAML file](../../../.github/eks-workflow-files/axlearn/axlearn-fuji-model.yml) we're using for deploying the training of a Fuji-3B model, which uses flash attention and runs on a single host. The core part of the deployment is:
+```bash
+python3 -m axlearn.common.launch_trainer_main \
+    --module=text.gpt.c4_trainer \
+    --config=${CONFIG} \
+    --trainer_dir=${TRAINER_DIR} \
+    --data_dir=gs://axlearn-public/tensorflow_datasets \
+    --jax_backend=gpu
+```
+Where `CONFIG="fuji-3B-v3-flash-single-host"`. The input dataset is the public tensorflow [C4 dataset](https://www.tensorflow.org/datasets/catalog/c4).
+
+## Testing
+[Here is the YAML file](../../../.github/eks-workflow-files/axlearn/axlearn-job.yml) used for testing AXLearn functionalities. In particular, this test makes use of the [`test-axlearn.sh` script](../../../.github/container/test-axlearn.sh). The test runs `pytest` against all the tests contained in the `/opt/axlearn/axlearn/common` folder.
diff --git a/rosetta/rosetta/projects/axlearn/README.md b/rosetta/rosetta/projects/axlearn/README.md deleted file mode 100644 index f4c8f6679..000000000 --- a/rosetta/rosetta/projects/axlearn/README.md +++ /dev/null @@ -1,59 +0,0 @@ -# AXLearn -[AXLearn](https://github.com/apple/axlearn) is a deep learning design framework, built on top of JAX and XLA, to support the development of large-scale models. - - -## Hardware and Software Specifications - -Functionality have been validated on AWS p5.48xlarge EKS cluster (8x H100 80G); please refer to the [Configs](#configs) section below for some initial configs and performance numbers. We will continue to populate it with more models and configs. We provide both singlenode and multinode pre-training support. If running on a machine with less than 80G memory, some of the default configurations may run out of memory; if you run out of memory and have more GPUs available, increase your GPU count and decrease your batch size per GPU. - - -## Containers -We provide a fully built and ready-to-use multi-arch container, bleeding edge: `ghcr.io/nvidia/jax:axlearn`. We also provide nightly dated images with the naming pattern `ghcr.io/nvidia/jax:axlearn-YYYY-MM-DD`, but we encourage you to use the latest ones for the best performance. - -*Note*: All paths mentioned in subsequent sections are relative to the top-level directory of the AXLearn repository. When working interactively with containers, make sure you navigate to `/opt/axlearn` before running any commmands. - -## Launching a container -Use the following command to launch a container: -``` -docker run -ti --gpus=all --net=host --ipc=host -v :/opt/axlearn/workspace -w /opt/axlearn /bin/bash -``` -where `WORKSPACE_PATH` is the path to the directory where you would like to store any persistent files and `container` is the name of the maxtext container. You can additionally add dataset and vocab paths with the `-v` flag. 
- -## Running a Fuji model -### Quick Runs - -#### EKS Single node: `fuji-3B-v3-flash-single-host` -Fuji models are defined with 1B, 3B, 7B or 70B parameters. In this example, we deploy the training for a Fuji-3B model, that uses flash attention, and runs on a single host. [Here](scripts/eks-fuji.yaml) we provide an example deployment file. The core point of the deployment is: -```bash -python3 -m axlearn.common.launch_trainer_main \ - --module=text.gpt.c4_trainer \ - --config=${CONFIG} \ - --trainer_dir=${TRAINER_DIR} \ - --data_dir=gs://axlearn-public/tensorflow_datasets \ - --jax_backend=gpu -``` -Where `CONFIG="fuji-3B-v3-flash-single-host`. The input dataset is the public tensorflow [C4 dataset](https://www.tensorflow.org/datasets/catalog/c4). - -#### Running a multinode job for `fuji-XB-v2-flash` - -For running a multinode job we provide a [custom example](scripts/multinode.py). The code access AXLearn directly, it allows to specify a custom dataset, the number of GPUs to use, the global batch size, as well as the `max_sequence_length`. - - -## XLA Flags -The [GPU Performance document](../../../docs/GPU_performance.md) provides a detailed description of the XLA flags that can be set to optimize performance. These are the recommended XLA flags to get good performance for AXLearn. 
- -``` -XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true - --xla_gpu_enable_triton_gemm=false - --xla_gpu_enable_command_buffer= - --xla_gpu_all_reduce_combine_threshold_bytes=1073741824 - --xla_gpu_all_gather_combine_threshold_bytes=1073741824 - --xla_gpu_reduce_scatter_combine_threshold_bytes=1073741824 - --xla_gpu_enable_pipelined_all_gather=true - --xla_gpu_enable_pipelined_reduce_scatter=true - --xla_gpu_enable_pipelined_all_reduce=true - --xla_gpu_enable_while_loop_double_buffering=true - --xla_gpu_enable_all_gather_combine_by_dim=false - --xla_gpu_enable_reduce_scatter_combine_by_dim=false - --xla_disable_hlo_passes=rematerialization" -``` \ No newline at end of file diff --git a/rosetta/rosetta/projects/axlearn/scripts/eks-fuji.yaml b/rosetta/rosetta/projects/axlearn/scripts/eks-fuji.yaml deleted file mode 100644 index 8d24a1658..000000000 --- a/rosetta/rosetta/projects/axlearn/scripts/eks-fuji.yaml +++ /dev/null @@ -1,66 +0,0 @@ -apiVersion: batch/v1 -kind: Job -metadata: - name: axlearn-fuji - # Specify any labels for running on a dedicated queue -spec: - completions: 1 - parallelism: 1 - template: - spec: - restartPolicy: Never - containers: - - name: axlearn-fuji-model - image: gchr.io/nvidia/jax:axlearn - command: - - bash - - -xo - - pipefail - - -c - - | - BASEDIR="/opt/axlearn" - CONFIG="fuji-3B-v3-flash-single-host" - HLO_DUMP=0 - POSTFIX="" - - AR_THRESHOLD=1073741824 - AG_THRESHOLD=8589934592 - RS_THRESHOLD=8589934592 - BASE_XLA_FLAGS=${BASE_XLA_FLAGS:---xla_gpu_enable_latency_hiding_scheduler=true - --xla_gpu_enable_highest_priority_async_stream=true - --xla_gpu_all_reduce_combine_threshold_bytes=1073741824 - --xla_gpu_all_gather_combine_threshold_bytes=1073741824 - --xla_gpu_reduce_scatter_combine_threshold_bytes=1073741824 - --xla_gpu_enable_pipelined_all_gather=true - --xla_gpu_enable_pipelined_reduce_scatter=true - --xla_gpu_enable_pipelined_all_reduce=true - --xla_gpu_enable_while_loop_double_buffering=true - 
--xla_gpu_enable_triton_gemm=false - --xla_gpu_enable_all_gather_combine_by_dim=false - --xla_gpu_enable_reduce_scatter_combine_by_dim=false - --xla_disable_hlo_passes=rematerialization} - - export XLA_FLAGS="$BASE_XLA_FLAGS ${XLA_FLAGS:-}" - export TF_GPU_ALLOCATOR=cuda_malloc_async - - LOG_DIR=${BASEDIR}/logs - TRAINER_DIR=${LOG_DIR}/${CONFIG}${POSTFIX}-eks/trainer-dir - mkdir -p ${TRAINER_DIR} - - - python3 -m axlearn.common.launch_trainer_main \ - --module=text.gpt.c4_trainer \ - --config=${CONFIG} \ - --trainer_dir=${TRAINER_DIR} \ - --data_dir=gs://axlearn-public/tensorflow_datasets \ - --jax_backend=gpu - resources: - limits: - nvidia.com/gpu: 8 - volumeMounts: - - name: output - mountPath: /opt/output - # specify any image secret if needed - volumes: - - name: output - emptyDir: {} diff --git a/rosetta/rosetta/projects/axlearn/scripts/multinode.py b/rosetta/rosetta/projects/axlearn/scripts/multinode.py deleted file mode 100644 index 0107ebddc..000000000 --- a/rosetta/rosetta/projects/axlearn/scripts/multinode.py +++ /dev/null @@ -1,71 +0,0 @@ -import os - -from absl import app, flags -from axlearn.common.launch_trainer import run_trainer -from axlearn.common.config import config_for_function -from axlearn.experiments.text.gpt import c4_trainer -from axlearn.common.trainer import SpmdTrainer - -FLAGS = flags.FLAGS -FLAGS.set_default("module", "text.gpt.c4_trainer") -FLAGS.set_default("config", "fuji-7B-v2-flash") # Set the model -FLAGS.set_default("trainer_dir", "/opt/host/axlearn-checkpoints") # Set the trainer directory - -def main(_): - axlearn_path = "/opt/axlearn" - os.environ["PYTHONPATH"] = f"{axlearn_path}:{os.environ.get('PYTHONPATH', '')}" - - n_gpus = 16 # This can be also an env variable - # Base XLA flags - base_flags = [ - "--xla_gpu_enable_latency_hiding_scheduler=true", - "--xla_gpu_enable_command_buffer=", - "--xla_gpu_enable_highest_priority_async_stream=true", - "--xla_gpu_all_reduce_combine_threshold_bytes=1073741824", - 
"--xla_gpu_all_gather_combine_threshold_bytes=1073741824", - "--xla_gpu_reduce_scatter_combine_threshold_bytes=1073741824", - "--xla_gpu_enable_pipelined_all_gather=true", - "--xla_gpu_enable_pipelined_reduce_scatter=true", - "--xla_gpu_enable_pipelined_all_reduce=true", - "--xla_gpu_enable_while_loop_double_buffering=true", - "--xla_gpu_enable_triton_gemm=false", - "--xla_gpu_enable_all_gather_combine_by_dim=false", - "--xla_gpu_enable_reduce_scatter_combine_by_dim=false", - "--xla_disable_hlo_passes=rematerialization", - ] - # Get existing flags from environment with proper fallback. - existing_xla_flags = os.environ.get("XLA_FLAGS", "").split() - # XLA flags - os.environ.update({ - "XLA_FLAGS": " ".join([ - *base_flags, - *existing_xla_flags - ])}) - - os.environ.update({ - "DATA_DIR":"gs://axlearn-public/tensorflow_datasets", # Set up your input dataset - "NUM_PROCESSES":f"{n_gpus}", - "DISTRIBUTED_COORDINATOR":"127.0.0.1:8080", - "PROCESS_ID":"0", - }) - - # Raw config - config_fn = c4_trainer.named_trainer_configs()[FLAGS.config] - trainer_config: SpmdTrainer.Config = config_for_function(config_fn).fn() - - trainer_config.max_step = 100 # Set the max number of steps to run - trainer_config.dir = "/opt/host/axlearn-checkpoints" # Use 'dir' instead of 'model_dir' - trainer_config.input.input_dispatcher.global_logical_batch_size = 8 # Tune the batch size for training - #trainer_config.input.source.max_sequence_length = 2048 # Tune the max sequence length if running in OOM - trainer_config.checkpointer.save_policy.n = 500 # Save every 500 steps - trainer_config.checkpointer.keep_every_n_steps = 500 # Keep checkpoints - trainer_config.summary_writer.write_every_n_steps = 100 # Log every 100 steps - - run_trainer( - trainer_config=trainer_config, - ) - - -if __name__ == "__main__": - from absl import app - app.run(main) From 63ace5d8ceb38d31e57f663f1c5224f92a3491d2 Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 3 Mar 2025 17:45:15 +0000 Subject: [PATCH 81/89] 
fix test error --- .github/eks-workflow-files/axlearn/axlearn-job.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/eks-workflow-files/axlearn/axlearn-job.yml b/.github/eks-workflow-files/axlearn/axlearn-job.yml index 8d0eda9e2..d3dd154df 100644 --- a/.github/eks-workflow-files/axlearn/axlearn-job.yml +++ b/.github/eks-workflow-files/axlearn/axlearn-job.yml @@ -49,7 +49,7 @@ spec: # Upload to S3 bucket aws s3 cp /opt/output/summary.txt s3://jax-toolbox-eks-output/axlearn/${RUN_ID}/summary.txt # Zip the results of all the tests - tar -czf test_logs.tar.gz /opt/output + tar cvzf test_logs.tar.gz /opt/output # Upload logs to S3 bucket aws s3 cp test_logs.tar.gz s3://jax-toolbox-eks-output/axlearn/${RUN_ID}/test_logs.tar.gz volumeMounts: From ec6b54871e8bf8333dc3cc9b81c6699599470432 Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 3 Mar 2025 17:46:26 +0000 Subject: [PATCH 82/89] run small test --- .github/container/test-axlearn.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/container/test-axlearn.sh b/.github/container/test-axlearn.sh index d1993cc03..e31b0bf30 100755 --- a/.github/container/test-axlearn.sh +++ b/.github/container/test-axlearn.sh @@ -149,7 +149,7 @@ passed=0 SUMMARY_FILE="${OUTPUT_DIRECTORY}/summary.txt" -for test_file in "${final_test_files[@]}"; do +for test_file in "${final_test_files[@]:0:10}"; do echo "Running: ${test_file}" log_file_name=$(echo "${test_file%.py}" | sed 's/\//__/g').log log_file="${LOG_DIRECTORY}/${log_file_name}" From 68c001088101f7e43c18f5e43139fbcfb5401c90 Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 3 Mar 2025 19:32:27 +0000 Subject: [PATCH 83/89] change with zip --- .github/eks-workflow-files/axlearn/axlearn-job.yml | 4 ++-- .github/workflows/_ci.yaml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/eks-workflow-files/axlearn/axlearn-job.yml b/.github/eks-workflow-files/axlearn/axlearn-job.yml index d3dd154df..14124cc4c 100644 --- 
a/.github/eks-workflow-files/axlearn/axlearn-job.yml +++ b/.github/eks-workflow-files/axlearn/axlearn-job.yml @@ -49,9 +49,9 @@ spec: # Upload to S3 bucket aws s3 cp /opt/output/summary.txt s3://jax-toolbox-eks-output/axlearn/${RUN_ID}/summary.txt # Zip the results of all the tests - tar cvzf test_logs.tar.gz /opt/output + zip test_logs.zip /opt/output # Upload logs to S3 bucket - aws s3 cp test_logs.tar.gz s3://jax-toolbox-eks-output/axlearn/${RUN_ID}/test_logs.tar.gz + aws s3 cp test_logs.tar.gz s3://jax-toolbox-eks-output/axlearn/${RUN_ID}/test_logs.zip volumeMounts: - name: output mountPath: /opt/output diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 82e1bdb35..35784c5a3 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -690,7 +690,7 @@ jobs: run: | mkdir -p axlearn-output aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/summary.txt axlearn-output/ - aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/test_logs.tar.gz axlearn-output/ + aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/test_logs.zip axlearn-output/ passed_tests=$(grep -c ": PASSED" axlearn-output/summary.txt || true) failed_tests=$(grep -c ": FAILED" axlearn-output/summary.txt || true) From 46b6c9ee3f9e2dc344cc9d6d85fef5e27e67076c Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 3 Mar 2025 19:32:56 +0000 Subject: [PATCH 84/89] change with zip --- .github/eks-workflow-files/axlearn/axlearn-job.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/eks-workflow-files/axlearn/axlearn-job.yml b/.github/eks-workflow-files/axlearn/axlearn-job.yml index 14124cc4c..f2da9ff0d 100644 --- a/.github/eks-workflow-files/axlearn/axlearn-job.yml +++ b/.github/eks-workflow-files/axlearn/axlearn-job.yml @@ -49,7 +49,7 @@ spec: # Upload to S3 bucket aws s3 cp /opt/output/summary.txt s3://jax-toolbox-eks-output/axlearn/${RUN_ID}/summary.txt # Zip the results of all the tests - zip 
test_logs.zip /opt/output + zip -r test_logs.zip /opt/output # Upload logs to S3 bucket aws s3 cp test_logs.tar.gz s3://jax-toolbox-eks-output/axlearn/${RUN_ID}/test_logs.zip volumeMounts: From 7ef84c401c7e4f019905f6369700dc5dacbc5729 Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 3 Mar 2025 20:45:46 +0000 Subject: [PATCH 85/89] fix the copy --- .github/container/test-axlearn.sh | 2 +- .github/eks-workflow-files/axlearn/axlearn-job.yml | 4 +--- .github/workflows/_ci.yaml | 2 +- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/.github/container/test-axlearn.sh b/.github/container/test-axlearn.sh index e31b0bf30..9d7faf9dd 100755 --- a/.github/container/test-axlearn.sh +++ b/.github/container/test-axlearn.sh @@ -149,7 +149,7 @@ passed=0 SUMMARY_FILE="${OUTPUT_DIRECTORY}/summary.txt" -for test_file in "${final_test_files[@]:0:10}"; do +for test_file in "${final_test_files[@]:0:3}"; do echo "Running: ${test_file}" log_file_name=$(echo "${test_file%.py}" | sed 's/\//__/g').log log_file="${LOG_DIRECTORY}/${log_file_name}" diff --git a/.github/eks-workflow-files/axlearn/axlearn-job.yml b/.github/eks-workflow-files/axlearn/axlearn-job.yml index f2da9ff0d..8f70908da 100644 --- a/.github/eks-workflow-files/axlearn/axlearn-job.yml +++ b/.github/eks-workflow-files/axlearn/axlearn-job.yml @@ -48,10 +48,8 @@ spec: done # Upload to S3 bucket aws s3 cp /opt/output/summary.txt s3://jax-toolbox-eks-output/axlearn/${RUN_ID}/summary.txt - # Zip the results of all the tests - zip -r test_logs.zip /opt/output # Upload logs to S3 bucket - aws s3 cp test_logs.tar.gz s3://jax-toolbox-eks-output/axlearn/${RUN_ID}/test_logs.zip + aws s3 cp /opt/output/ s3://jax-toolbox-eks-output/axlearn/${RUN_ID}/ --recursive --exclude "*" --include "*.log" volumeMounts: - name: output mountPath: /opt/output diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 35784c5a3..a1d333d91 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -690,7 +690,7 
@@ jobs: run: | mkdir -p axlearn-output aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/summary.txt axlearn-output/ - aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/test_logs.zip axlearn-output/ + aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/ axlearn-output/ --recursive --exclude "*" --include "*.log" passed_tests=$(grep -c ": PASSED" axlearn-output/summary.txt || true) failed_tests=$(grep -c ": FAILED" axlearn-output/summary.txt || true) From 828073c14059954d8eed46ca52304a09a6a401d6 Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 3 Mar 2025 22:20:54 +0000 Subject: [PATCH 86/89] fixed tests and comments @olupton --- .github/container/test-axlearn.sh | 2 +- .github/workflows/_ci.yaml | 976 +++++++++++++++--------------- 2 files changed, 489 insertions(+), 489 deletions(-) diff --git a/.github/container/test-axlearn.sh b/.github/container/test-axlearn.sh index 9d7faf9dd..d1993cc03 100755 --- a/.github/container/test-axlearn.sh +++ b/.github/container/test-axlearn.sh @@ -149,7 +149,7 @@ passed=0 SUMMARY_FILE="${OUTPUT_DIRECTORY}/summary.txt" -for test_file in "${final_test_files[@]:0:3}"; do +for test_file in "${final_test_files[@]}"; do echo "Running: ${test_file}" log_file_name=$(echo "${test_file%.py}" | sed 's/\//__/g').log log_file="${LOG_DIRECTORY}/${log_file_name}" diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index a1d333d91..9ded946d2 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -66,115 +66,115 @@ jobs: URLREF_TRANSFORMER_ENGINE=${{ fromJson(inputs.SOURCE_URLREFS).TRANSFORMER_ENGINE }} secrets: inherit - # build-triton: - # needs: build-jax - # if: inputs.ARCHITECTURE == 'amd64' # Triton does not seem to support arm64 - # uses: ./.github/workflows/_build.yaml - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # ARTIFACT_NAME: artifact-triton-build - # BADGE_FILENAME: badge-triton-build - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # 
BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - # CONTAINER_NAME: triton - # DOCKERFILE: .github/container/Dockerfile.triton - # RUNNER_SIZE: large - # EXTRA_BUILD_ARGS: | - # URLREF_JAX_TRITON=${{ fromJson(inputs.SOURCE_URLREFS).JAX_TRITON }} - # secrets: inherit + build-triton: + needs: build-jax + if: inputs.ARCHITECTURE == 'amd64' # Triton does not seem to support arm64 + uses: ./.github/workflows/_build.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: artifact-triton-build + BADGE_FILENAME: badge-triton-build + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + CONTAINER_NAME: triton + DOCKERFILE: .github/container/Dockerfile.triton + RUNNER_SIZE: large + EXTRA_BUILD_ARGS: | + URLREF_JAX_TRITON=${{ fromJson(inputs.SOURCE_URLREFS).JAX_TRITON }} + secrets: inherit - # build-equinox: - # needs: build-jax - # uses: ./.github/workflows/_build.yaml - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # ARTIFACT_NAME: artifact-equinox-build - # BADGE_FILENAME: badge-equinox-build - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - # CONTAINER_NAME: equinox - # DOCKERFILE: .github/container/Dockerfile.equinox - # EXTRA_BUILD_ARGS: | - # URLREF_EQUINOX=${{ fromJson(inputs.SOURCE_URLREFS).EQUINOX }} - # secrets: inherit + build-equinox: + needs: build-jax + uses: ./.github/workflows/_build.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: artifact-equinox-build + BADGE_FILENAME: badge-equinox-build + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + CONTAINER_NAME: equinox + DOCKERFILE: .github/container/Dockerfile.equinox + EXTRA_BUILD_ARGS: | + URLREF_EQUINOX=${{ fromJson(inputs.SOURCE_URLREFS).EQUINOX }} + secrets: inherit - # build-maxtext: - # needs: build-jax - # uses: ./.github/workflows/_build.yaml - # with: - # ARCHITECTURE: ${{ 
inputs.ARCHITECTURE }} - # ARTIFACT_NAME: artifact-maxtext-build - # BADGE_FILENAME: badge-maxtext-build - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - # CONTAINER_NAME: maxtext - # DOCKERFILE: .github/container/Dockerfile.maxtext - # EXTRA_BUILD_ARGS: | - # URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }} - # secrets: inherit + build-maxtext: + needs: build-jax + uses: ./.github/workflows/_build.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: artifact-maxtext-build + BADGE_FILENAME: badge-maxtext-build + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + CONTAINER_NAME: maxtext + DOCKERFILE: .github/container/Dockerfile.maxtext + EXTRA_BUILD_ARGS: | + URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }} + secrets: inherit - # build-levanter: - # needs: [build-jax] - # uses: ./.github/workflows/_build.yaml - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # ARTIFACT_NAME: "artifact-levanter-build" - # BADGE_FILENAME: "badge-levanter-build" - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - # CONTAINER_NAME: levanter - # DOCKERFILE: .github/container/Dockerfile.levanter - # EXTRA_BUILD_ARGS: | - # URLREF_LEVANTER=${{ fromJson(inputs.SOURCE_URLREFS).LEVANTER }} - # URLREF_HALIAX=${{ fromJson(inputs.SOURCE_URLREFS).HALIAX }} - # secrets: inherit + build-levanter: + needs: [build-jax] + uses: ./.github/workflows/_build.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: "artifact-levanter-build" + BADGE_FILENAME: "badge-levanter-build" + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + CONTAINER_NAME: levanter + DOCKERFILE: .github/container/Dockerfile.levanter + EXTRA_BUILD_ARGS: | + URLREF_LEVANTER=${{ fromJson(inputs.SOURCE_URLREFS).LEVANTER }} + URLREF_HALIAX=${{ 
fromJson(inputs.SOURCE_URLREFS).HALIAX }} + secrets: inherit - # build-upstream-t5x: - # needs: build-jax - # uses: ./.github/workflows/_build.yaml - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # ARTIFACT_NAME: "artifact-t5x-build" - # BADGE_FILENAME: "badge-t5x-build" - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - # CONTAINER_NAME: upstream-t5x - # DOCKERFILE: .github/container/Dockerfile.t5x - # EXTRA_BUILD_ARGS: | - # URLREF_T5X=${{ fromJson(inputs.SOURCE_URLREFS).T5X }} - # URLREF_AIRIO=${{ fromJson(inputs.SOURCE_URLREFS).AIRIO }} - # secrets: inherit + build-upstream-t5x: + needs: build-jax + uses: ./.github/workflows/_build.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: "artifact-t5x-build" + BADGE_FILENAME: "badge-t5x-build" + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + CONTAINER_NAME: upstream-t5x + DOCKERFILE: .github/container/Dockerfile.t5x + EXTRA_BUILD_ARGS: | + URLREF_T5X=${{ fromJson(inputs.SOURCE_URLREFS).T5X }} + URLREF_AIRIO=${{ fromJson(inputs.SOURCE_URLREFS).AIRIO }} + secrets: inherit - # build-rosetta-t5x: - # needs: build-upstream-t5x - # uses: ./.github/workflows/_build_rosetta.yaml - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }} - # BASE_LIBRARY: t5x - # secrets: inherit + build-rosetta-t5x: + needs: build-upstream-t5x + uses: ./.github/workflows/_build_rosetta.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }} + BASE_LIBRARY: t5x + secrets: inherit - # build-gemma: - # needs: build-jax - # uses: ./.github/workflows/_build.yaml - # if: inputs.ARCHITECTURE == 'amd64' # build only amd64 - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # 
ARTIFACT_NAME: artifact-gemma-build - # BADGE_FILENAME: badge-gemma-build - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - # CONTAINER_NAME: gemma - # DOCKERFILE: rosetta/Dockerfile.gemma - # DOCKER_CONTEXT: . - # EXTRA_BUILD_ARGS: | - # URLREF_GEMMA=${{ fromJson(inputs.SOURCE_URLREFS).GEMMA }} - # URLREF_BIG_VISION=${{ fromJson(inputs.SOURCE_URLREFS).BIG_VISION }} - # URLREF_COMMON_LOOP_UTILS=${{ fromJson(inputs.SOURCE_URLREFS).COMMON_LOOP_UTILS }} - # URLREF_FLAXFORMER=${{ fromJson(inputs.SOURCE_URLREFS).FLAXFORMER }} - # URLREF_PANOPTICAPI=${{ fromJson(inputs.SOURCE_URLREFS).PANOPTICAPI }} - # secrets: inherit + build-gemma: + needs: build-jax + uses: ./.github/workflows/_build.yaml + if: inputs.ARCHITECTURE == 'amd64' # build only amd64 + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: artifact-gemma-build + BADGE_FILENAME: badge-gemma-build + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + CONTAINER_NAME: gemma + DOCKERFILE: rosetta/Dockerfile.gemma + DOCKER_CONTEXT: . 
+ EXTRA_BUILD_ARGS: | + URLREF_GEMMA=${{ fromJson(inputs.SOURCE_URLREFS).GEMMA }} + URLREF_BIG_VISION=${{ fromJson(inputs.SOURCE_URLREFS).BIG_VISION }} + URLREF_COMMON_LOOP_UTILS=${{ fromJson(inputs.SOURCE_URLREFS).COMMON_LOOP_UTILS }} + URLREF_FLAXFORMER=${{ fromJson(inputs.SOURCE_URLREFS).FLAXFORMER }} + URLREF_PANOPTICAPI=${{ fromJson(inputs.SOURCE_URLREFS).PANOPTICAPI }} + secrets: inherit build-axlearn: needs: build-jax @@ -196,13 +196,13 @@ jobs: needs: - build-base - build-jax - # - build-triton - # - build-equinox - # - build-maxtext - # - build-levanter - # - build-upstream-t5x - # - build-rosetta-t5x - # - build-gemma + - build-triton + - build-equinox + - build-maxtext + - build-levanter + - build-upstream-t5x + - build-rosetta-t5x + - build-gemma - build-axlearn outputs: TAGS: ${{ steps.collect-tags.outputs.TAGS }} @@ -214,22 +214,22 @@ jobs: [\ {"flavor": "base", "stage": "final", "priority": 800, "tag": "${{ needs.build-base.outputs.DOCKER_TAG }}"},\ {"flavor": "jax", "stage": "final", "priority": 1000, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}"},\ - # {"flavor": "triton", "stage": "final", "priority": 900, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }}"},\ - # {"flavor": "equinox", "stage": "final", "priority": 900, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }}"},\ - # {"flavor": "maxtext", "stage": "final", "priority": 900, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}"},\ - # {"flavor": "levanter", "stage": "final", "priority": 900, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }}"},\ - # {"flavor": "upstream-t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }}"},\ - # {"flavor": "t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }}"},\ - # {"flavor": "gemma", "stage": "final", "priority": 900, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "triton", 
"stage": "final", "priority": 900, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "equinox", "stage": "final", "priority": 900, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "maxtext", "stage": "final", "priority": 900, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "levanter", "stage": "final", "priority": 900, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "upstream-t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "gemma", "stage": "final", "priority": 900, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "axlearn", "stage": "final", "priority": 900, "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "jax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}"},\ - # {"flavor": "triton", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_MEALKIT }}"},\ - # {"flavor": "equinox", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_MEALKIT }}"},\ - # {"flavor": "maxtext", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_MEALKIT }}"},\ - # {"flavor": "levanter", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_MEALKIT }}"},\ - # {"flavor": "upstream-t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ - # {"flavor": "t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ - # {"flavor": "gemma", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_MEALKIT }}"},\ + 
{"flavor": "triton", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "equinox", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "maxtext", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "levanter", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "upstream-t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "gemma", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_MEALKIT }}"},\ {"flavor": "axlearn", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_MEALKIT }}"},\ {}\ @@ -239,276 +239,276 @@ jobs: echo "TAGS=${TAGS}" >> $GITHUB_OUTPUT - # test-distribution: - # runs-on: ubuntu-22.04 - # strategy: - # matrix: - # TEST_SCRIPT: - # - extra-only-distribution.sh - # - mirror-only-distribution.sh - # - upstream-only-distribution.sh - # - local-patch-distribution.sh - # fail-fast: false - # steps: - # - name: Print environment variables - # run: env - # - name: Set git login for tests - # run: | - # git config --global user.email "jax@nvidia.com" - # git config --global user.name "JAX-Toolbox CI" - # - name: Check out the repository under ${GITHUB_WORKSPACE} - # uses: actions/checkout@v4 - # - name: Run integration test ${{ matrix.TEST_SCRIPT }} - # run: bash rosetta/tests/${{ matrix.TEST_SCRIPT }} - - # test-jax: - # needs: build-jax - # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # uses: ./.github/workflows/_test_unit.yaml - # with: - # TEST_NAME: jax - # EXECUTE: | - # docker run -i --shm-size=1g --gpus all \ - # ${{ 
needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - # bash <<"EOF" |& tee test-backend-independent.log - # test-jax.sh -b backend-independent - # EOF - # docker run -i --shm-size=1g --gpus all \ - # ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - # bash <<"EOF" |& tee tee test-gpu.log - # nvidia-cuda-mps-control -d - # test-jax.sh -b gpu - # EOF - # STATISTICS_SCRIPT: | - # errors=$(cat test-*.log | grep -c 'ERROR:' || true) - # failed_tests=$(cat test-*.log | grep -c 'FAILED in' || true) - # passed_tests=$(cat test-*.log | grep -c 'PASSED in' || true) - # total_tests=$((failed_tests + passed_tests)) - # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - # ARTIFACTS: | - # test-backend-independent.log - # test-gpu.log - # secrets: inherit + test-distribution: + runs-on: ubuntu-22.04 + strategy: + matrix: + TEST_SCRIPT: + - extra-only-distribution.sh + - mirror-only-distribution.sh + - upstream-only-distribution.sh + - local-patch-distribution.sh + fail-fast: false + steps: + - name: Print environment variables + run: env + - name: Set git login for tests + run: | + git config --global user.email "jax@nvidia.com" + git config --global user.name "JAX-Toolbox CI" + - name: Check out the repository under ${GITHUB_WORKSPACE} + uses: actions/checkout@v4 + - name: Run integration test ${{ matrix.TEST_SCRIPT }} + run: bash rosetta/tests/${{ matrix.TEST_SCRIPT }} + + test-jax: + needs: build-jax + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + uses: ./.github/workflows/_test_unit.yaml + with: + TEST_NAME: jax + EXECUTE: | + docker run -i --shm-size=1g --gpus all \ + ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ + bash <<"EOF" |& tee test-backend-independent.log + test-jax.sh -b backend-independent + EOF + docker run -i --shm-size=1g --gpus all \ + ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ + 
bash <<"EOF" |& tee tee test-gpu.log + nvidia-cuda-mps-control -d + test-jax.sh -b gpu + EOF + STATISTICS_SCRIPT: | + errors=$(cat test-*.log | grep -c 'ERROR:' || true) + failed_tests=$(cat test-*.log | grep -c 'FAILED in' || true) + passed_tests=$(cat test-*.log | grep -c 'PASSED in' || true) + total_tests=$((failed_tests + passed_tests)) + echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + ARTIFACTS: | + test-backend-independent.log + test-gpu.log + secrets: inherit - # test-nsys-jax: - # needs: build-jax - # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # uses: ./.github/workflows/_test_unit.yaml - # with: - # TEST_NAME: nsys-jax - # EXECUTE: | - # set -o pipefail - # num_tests=0 - # num_failures=0 - # # Run the pytest-driven tests; failure is explicitly handled below so set +e to - # # avoid an early abort here. - # set +e - # docker run -i --shm-size=1g --gpus all \ - # -v $PWD:/opt/output \ - # ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - # bash <<"EOF" |& tee test-nsys-jax.log - # # nsys-jax is already installed, this is just adding the test dependencies - # pip install pytest-reportlog nsys-jax[test] - # # abuse knowledge that nsys-jax is installed editable, so the tests exist - # test_path=$(python -c 'import importlib.resources; print(importlib.resources.files("nsys_jax").joinpath("..", "tests").resolve())') - # pytest --report-log=/opt/output/pytest-report.jsonl "${test_path}" - # EOF - # set -e - # GPUS_PER_NODE=$(nvidia-smi -L | grep -c '^GPU') - # for mode in 1-process 2-process process-per-gpu; do - # DOCKER="docker run --shm-size=1g --gpus all --env XLA_FLAGS=--xla_gpu_enable_command_buffer= --env XLA_PYTHON_CLIENT_MEM_FRACTION=0.9 -v ${PWD}:/opt/output ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}" - # if [[ "${mode}" == "1-process" ]]; then - # PROCESS_COUNT=1 - # 
ARGS="" - # elif [[ "${mode}" == "2-process" ]]; then - # # Use two processes with GPUS_PER_NODE/2 GPUs per process in the hope that - # # this will flush out more bugs than process-per-node or process-per-GPU. - # PROCESS_COUNT=2 - # ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process $((GPUS_PER_NODE/2)) --distributed" - # else - # PROCESS_COUNT=${GPUS_PER_NODE} - # ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process 1 --distributed" - # fi - # for collection in full partial; do - # NSYS_JAX="nsys-jax" - # if [[ "${mode}" == "1-process" ]]; then - # # We will not run nsys-jax-combine, so run analyses eagerly - # NSYS_JAX+=" --nsys-jax-analysis communication --nsys-jax-analysis summary" - # fi - # NSYS_JAX+=" --output=/opt/output/${mode}-${collection}-execution-%q{RANK}" - # if [[ "${collection}" == "partial" ]]; then - # NSYS_JAX+=" --capture-range=cudaProfilerApi --capture-range-end=stop" - # # nvbug/4801401 - # NSYS_JAX+=" --sample=none" - # fi - # set +e - # ${DOCKER} parallel-launch RANK ${PROCESS_COUNT} ${NSYS_JAX} \ - # -- jax-nccl-test ${ARGS} |& tee ${mode}-${collection}-execution.log - # num_failures=$((num_failures + ($? 
!= 0))) - # set -e - # num_tests=$((num_tests + 1)) - # done - # if [[ "${mode}" != "1-process" ]]; then - # # Run nsys-jax-combine - # NSYS_JAX_COMBINE="nsys-jax-combine --analysis communication --analysis summary --output=/opt/output/${mode}-${collection}-execution.zip" - # for (( i=0; i> $GITHUB_ENV - # echo "NSYS_JAX_FAIL_COUNT=${num_failures}" >> $GITHUB_ENV - # exit $num_failures - # STATISTICS_SCRIPT: | - # summary_line=$(tail -n1 test-nsys-jax.log) - # num_errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - # passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) - # failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) - # total_tests=$(( NSYS_JAX_TEST_COUNT + passed_tests + failed_tests )) - # num_passed=$(( passed_tests + NSYS_JAX_TEST_COUNT - NSYS_JAX_FAIL_COUNT )) - # num_failed=$(( failed_tests + NSYS_JAX_FAIL_COUNT )) - # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - # echo "ERRORS=${num_errors}" >> $GITHUB_OUTPUT - # echo "PASSED_TESTS=${num_passed}" >> $GITHUB_OUTPUT - # echo "FAILED_TESTS=${num_failed}" >> $GITHUB_OUTPUT - # ARTIFACTS: | - # # pytest-driven part - # test-nsys-jax.log - # pytest-report.jsonl - # # nsys-jax logfiles - # *process-*-execution.log - # # nsys-jax output for the case that doesn't use nsys-jax-combine - # 1-process-*-execution-0.zip - # # nsys-jax-combine output/logfiles - # *process*-*-execution.zip - # *-execution-combine.log - # secrets: inherit + test-nsys-jax: + needs: build-jax + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + uses: ./.github/workflows/_test_unit.yaml + with: + TEST_NAME: nsys-jax + EXECUTE: | + set -o pipefail + num_tests=0 + num_failures=0 + # Run the pytest-driven tests; failure is explicitly handled below so set +e to + # avoid an early 
abort here. + set +e + docker run -i --shm-size=1g --gpus all \ + -v $PWD:/opt/output \ + ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ + bash <<"EOF" |& tee test-nsys-jax.log + # nsys-jax is already installed, this is just adding the test dependencies + pip install pytest-reportlog nsys-jax[test] + # abuse knowledge that nsys-jax is installed editable, so the tests exist + test_path=$(python -c 'import importlib.resources; print(importlib.resources.files("nsys_jax").joinpath("..", "tests").resolve())') + pytest --report-log=/opt/output/pytest-report.jsonl "${test_path}" + EOF + set -e + GPUS_PER_NODE=$(nvidia-smi -L | grep -c '^GPU') + for mode in 1-process 2-process process-per-gpu; do + DOCKER="docker run --shm-size=1g --gpus all --env XLA_FLAGS=--xla_gpu_enable_command_buffer= --env XLA_PYTHON_CLIENT_MEM_FRACTION=0.9 -v ${PWD}:/opt/output ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}" + if [[ "${mode}" == "1-process" ]]; then + PROCESS_COUNT=1 + ARGS="" + elif [[ "${mode}" == "2-process" ]]; then + # Use two processes with GPUS_PER_NODE/2 GPUs per process in the hope that + # this will flush out more bugs than process-per-node or process-per-GPU. 
+ PROCESS_COUNT=2 + ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process $((GPUS_PER_NODE/2)) --distributed" + else + PROCESS_COUNT=${GPUS_PER_NODE} + ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process 1 --distributed" + fi + for collection in full partial; do + NSYS_JAX="nsys-jax" + if [[ "${mode}" == "1-process" ]]; then + # We will not run nsys-jax-combine, so run analyses eagerly + NSYS_JAX+=" --nsys-jax-analysis communication --nsys-jax-analysis summary" + fi + NSYS_JAX+=" --output=/opt/output/${mode}-${collection}-execution-%q{RANK}" + if [[ "${collection}" == "partial" ]]; then + NSYS_JAX+=" --capture-range=cudaProfilerApi --capture-range-end=stop" + # nvbug/4801401 + NSYS_JAX+=" --sample=none" + fi + set +e + ${DOCKER} parallel-launch RANK ${PROCESS_COUNT} ${NSYS_JAX} \ + -- jax-nccl-test ${ARGS} |& tee ${mode}-${collection}-execution.log + num_failures=$((num_failures + ($? 
!= 0))) + set -e + num_tests=$((num_tests + 1)) + done + if [[ "${mode}" != "1-process" ]]; then + # Run nsys-jax-combine + NSYS_JAX_COMBINE="nsys-jax-combine --analysis communication --analysis summary --output=/opt/output/${mode}-${collection}-execution.zip" + for (( i=0; i> $GITHUB_ENV + echo "NSYS_JAX_FAIL_COUNT=${num_failures}" >> $GITHUB_ENV + exit $num_failures + STATISTICS_SCRIPT: | + summary_line=$(tail -n1 test-nsys-jax.log) + num_errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) + failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) + total_tests=$(( NSYS_JAX_TEST_COUNT + passed_tests + failed_tests )) + num_passed=$(( passed_tests + NSYS_JAX_TEST_COUNT - NSYS_JAX_FAIL_COUNT )) + num_failed=$(( failed_tests + NSYS_JAX_FAIL_COUNT )) + echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + echo "ERRORS=${num_errors}" >> $GITHUB_OUTPUT + echo "PASSED_TESTS=${num_passed}" >> $GITHUB_OUTPUT + echo "FAILED_TESTS=${num_failed}" >> $GITHUB_OUTPUT + ARTIFACTS: | + # pytest-driven part + test-nsys-jax.log + pytest-report.jsonl + # nsys-jax logfiles + *process-*-execution.log + # nsys-jax output for the case that doesn't use nsys-jax-combine + 1-process-*-execution-0.zip + # nsys-jax-combine output/logfiles + *process*-*-execution.zip + *-execution-combine.log + secrets: inherit #test-nsys-jax generates several fresh .zip archive outputs by running nsys-jax with real GPU hardware; this test #runs on a regular GitHub Actions runner and checks that offline post-processing works in an environment that does #not already have nsys-jax installed - # test-nsys-jax-archive: - # needs: test-nsys-jax - # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # strategy: - # 
matrix: - # os: [ubuntu-22.04, ubuntu-24.04, macOS-latest] - # runs-on: ${{ matrix.os }} - # steps: - # - name: Download nsys-jax output .zip files - # uses: actions/download-artifact@v4 - # with: - # name: nsys-jax-unit-test-A100 - # - name: Extract archives and execute install scripts - # run: | - # pip install virtualenv # for install.sh - # for zip in $(ls *.zip); do - # ZIP="${PWD}/${zip}" - # pushd $(mktemp -d) - # unzip "${ZIP}" - # ls -l - # # TODO: verify this isn't needed, or make sure it isn't needed - # chmod 755 install.sh - # # Run the notebook with IPython, not Jupyter Lab, so it exits and prints something informative to stdout - # # Skip executing Jupyter lab - # NSYS_JAX_JUPYTER_EXECUTE_NOT_LAB=1 ./install.sh - # popd - # done - - # test-nsys-jax-eks: - # needs: build-jax - # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # runs-on: eks - # env: - # JAX_DOCKER_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} - # JOB_NAME: ${{ github.run_id }}-nsys-jax - # POSTPROCESS_JOB_NAME: ${{ github.run_id }}-nsys-jax-postprocess - # TOKEN_NAME: ${{ github.run_id }}-nsys-jax-token - # steps: - # - name: Check out the repository - # uses: actions/checkout@v4 - # - name: GHCR login - # uses: ./.github/actions/ghcr-login - # with: - # docker-username: ${{ github.repository_owner }} - # docker-password: ${{ secrets.GITHUB_TOKEN}} - # token-name: ${{ env.TOKEN_NAME }} - # - name: Configure Kubernetes job - # run: | - # yq -i ea 'select(di == 0).spec.selector.job-name = strenv(JOB_NAME) - # | select(di == 1).metadata.name = strenv(JOB_NAME) - # | select(di == 1).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) - # | select(di == 1).spec.template.spec.containers[0].image = strenv(JAX_DOCKER_IMAGE) - # | select(di == 1).spec.template.spec.containers[0].env[0].value = strenv(JOB_NAME)' \ - # .github/eks-workflow-files/job.yml - # git diff .github/eks-workflow-files/job.yml - # - name: Submit Kubernetes job - # uses: 
./.github/actions/submit-k8s-job - # with: - # job-config-file: .github/eks-workflow-files/job.yml - # job-name: ${{ env.JOB_NAME }} - # - name: Delete eks job - # uses: ./.github/actions/delete-k8s-job - # if: ${{ always() }} - # with: - # job-name: ${{ env.JOB_NAME }} - # - name: Configure post-processing job - # run: | - # export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip" - # yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME) - # | .spec.template.spec.containers[].image = strenv(JAX_DOCKER_IMAGE) - # | .spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) - # | .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \ - # .github/eks-workflow-files/post-process-job.yml - # git diff .github/eks-workflow-files/post-process-job.yml - # - name: Submit post process k8s job - # uses: ./.github/actions/submit-k8s-job - # with: - # job-config-file: .github/eks-workflow-files/post-process-job.yml - # job-name: ${{ env.POSTPROCESS_JOB_NAME }} - # - name: Delete post process k8s job - # uses: ./.github/actions/delete-k8s-job - # with: - # job-name: ${{ env.POSTPROCESS_JOB_NAME }} - # - name: Delete GitHub Container Registry token - # uses: ./.github/actions/delete-ghcr-token - # if: ${{ always() }} - # with: - # token-name: ${{ env.TOKEN_NAME }} - # COMMENT THIS - # test-equinox: - # needs: build-equinox - # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # uses: ./.github/workflows/_test_unit.yaml - # with: - # IMAGE: ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} - # TEST_NAME: equinox - # EXECUTE: | - # docker run --shm-size=1g --gpus all ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} \ - # bash -exc -o pipefail \ - # 'pushd /opt/equinox/tests && pip install -r requirements.txt && pytest .' 
| tee test-equinox.log - # STATISTICS_SCRIPT: | - # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - # failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') - # passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') - # total_tests=$((failed_tests + passed_tests)) - # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - # ARTIFACTS: | - # test-equinox.log - # secrets: inherit - # COMMENT THIS + test-nsys-jax-archive: + needs: test-nsys-jax + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + strategy: + matrix: + os: [ubuntu-22.04, ubuntu-24.04, macOS-latest] + runs-on: ${{ matrix.os }} + steps: + - name: Download nsys-jax output .zip files + uses: actions/download-artifact@v4 + with: + name: nsys-jax-unit-test-A100 + - name: Extract archives and execute install scripts + run: | + pip install virtualenv # for install.sh + for zip in $(ls *.zip); do + ZIP="${PWD}/${zip}" + pushd $(mktemp -d) + unzip "${ZIP}" + ls -l + # TODO: verify this isn't needed, or make sure it isn't needed + chmod 755 install.sh + # Run the notebook with IPython, not Jupyter Lab, so it exits and prints something informative to stdout + # Skip executing Jupyter lab + NSYS_JAX_JUPYTER_EXECUTE_NOT_LAB=1 ./install.sh + popd + done + + test-nsys-jax-eks: + needs: build-jax + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + runs-on: eks + env: + JAX_DOCKER_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} + JOB_NAME: ${{ github.run_id }}-nsys-jax + POSTPROCESS_JOB_NAME: ${{ github.run_id }}-nsys-jax-postprocess + TOKEN_NAME: ${{ github.run_id }}-nsys-jax-token + steps: + - name: Check out the repository + uses: actions/checkout@v4 + - name: GHCR login + uses: 
./.github/actions/ghcr-login + with: + docker-username: ${{ github.repository_owner }} + docker-password: ${{ secrets.GITHUB_TOKEN}} + token-name: ${{ env.TOKEN_NAME }} + - name: Configure Kubernetes job + run: | + yq -i ea 'select(di == 0).spec.selector.job-name = strenv(JOB_NAME) + | select(di == 1).metadata.name = strenv(JOB_NAME) + | select(di == 1).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) + | select(di == 1).spec.template.spec.containers[0].image = strenv(JAX_DOCKER_IMAGE) + | select(di == 1).spec.template.spec.containers[0].env[0].value = strenv(JOB_NAME)' \ + .github/eks-workflow-files/job.yml + git diff .github/eks-workflow-files/job.yml + - name: Submit Kubernetes job + uses: ./.github/actions/submit-k8s-job + with: + job-config-file: .github/eks-workflow-files/job.yml + job-name: ${{ env.JOB_NAME }} + - name: Delete eks job + uses: ./.github/actions/delete-k8s-job + if: ${{ always() }} + with: + job-name: ${{ env.JOB_NAME }} + - name: Configure post-processing job + run: | + export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip" + yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME) + | .spec.template.spec.containers[].image = strenv(JAX_DOCKER_IMAGE) + | .spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) + | .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \ + .github/eks-workflow-files/post-process-job.yml + git diff .github/eks-workflow-files/post-process-job.yml + - name: Submit post process k8s job + uses: ./.github/actions/submit-k8s-job + with: + job-config-file: .github/eks-workflow-files/post-process-job.yml + job-name: ${{ env.POSTPROCESS_JOB_NAME }} + - name: Delete post process k8s job + uses: ./.github/actions/delete-k8s-job + with: + job-name: ${{ env.POSTPROCESS_JOB_NAME }} + - name: Delete GitHub Container Registry token + uses: ./.github/actions/delete-ghcr-token + if: ${{ always() }} + with: + token-name: ${{ env.TOKEN_NAME }} + COMMENT THIS + test-equinox: + needs: 
build-equinox + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + uses: ./.github/workflows/_test_unit.yaml + with: + IMAGE: ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} + TEST_NAME: equinox + EXECUTE: | + docker run --shm-size=1g --gpus all ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} \ + bash -exc -o pipefail \ + 'pushd /opt/equinox/tests && pip install -r requirements.txt && pytest .' | tee test-equinox.log + STATISTICS_SCRIPT: | + errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') + passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') + total_tests=$((failed_tests + passed_tests)) + echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + ARTIFACTS: | + test-equinox.log + secrets: inherit + # test-te-multigpu: # needs: build-upstream-pax # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a @@ -517,79 +517,78 @@ jobs: # TE_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} # secrets: inherit - # test-upstream-t5x: - # needs: build-upstream-t5x - # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # uses: ./.github/workflows/_test_upstream_t5x.yaml - # with: - # T5X_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }} - # secrets: inherit + test-upstream-t5x: + needs: build-upstream-t5x + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + uses: ./.github/workflows/_test_upstream_t5x.yaml + with: + T5X_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }} + secrets: inherit - # test-rosetta-t5x: - # needs: build-rosetta-t5x - # if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 - # uses: ./.github/workflows/_test_t5x_rosetta.yaml - # 
with: - # T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }} - # secrets: inherit + test-rosetta-t5x: + needs: build-rosetta-t5x + if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 + uses: ./.github/workflows/_test_t5x_rosetta.yaml + with: + T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }} + secrets: inherit - # test-triton: - # needs: build-triton - # if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 - # uses: ./.github/workflows/_test_unit.yaml - # with: - # TEST_NAME: triton - # EXECUTE: | - # docker run -i --shm-size=1g --gpus all --volume $PWD:/output \ - # ${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }} \ - # bash <<"EOF" |& tee test-triton.log - # # autotuner tests from jax-triton now hit a triton code path that uses utilities from pytorch; this relies on - # # actually having a CUDA backend for pytoch - # pip install --no-deps torch - # python /opt/jax-triton/tests/triton_call_test.py --xml_output_file /output/triton_test.xml - # EOF - # STATISTICS_SCRIPT: | - # curl -L -o yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg --print-architecture) && chmod 777 yq; - # total_tests=$(./yq '.testsuites."+@tests"' triton_test.xml) - # errors=$(./yq '.testsuites."+@errors"' triton_test.xml) - # failed_tests=$(./yq '.testsuites."+@failures"' triton_test.xml) - # passed_tests=$((total_tests - errors - failed_tests)) - # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - # ARTIFACTS: | - # test-triton.log - # secrets: inherit + test-triton: + needs: build-triton + if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 + uses: ./.github/workflows/_test_unit.yaml + with: + TEST_NAME: triton + EXECUTE: | + docker run -i --shm-size=1g --gpus all --volume $PWD:/output \ + ${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }} \ + bash 
<<"EOF" |& tee test-triton.log + # autotuner tests from jax-triton now hit a triton code path that uses utilities from pytorch; this relies on + # actually having a CUDA backend for pytoch + pip install --no-deps torch + python /opt/jax-triton/tests/triton_call_test.py --xml_output_file /output/triton_test.xml + EOF + STATISTICS_SCRIPT: | + curl -L -o yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg --print-architecture) && chmod 777 yq; + total_tests=$(./yq '.testsuites."+@tests"' triton_test.xml) + errors=$(./yq '.testsuites."+@errors"' triton_test.xml) + failed_tests=$(./yq '.testsuites."+@failures"' triton_test.xml) + passed_tests=$((total_tests - errors - failed_tests)) + echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + ARTIFACTS: | + test-triton.log + secrets: inherit - # test-levanter: - # needs: build-levanter - # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # uses: ./.github/workflows/_test_unit.yaml - # with: - # TEST_NAME: levanter - # EXECUTE: | - # docker run -i --gpus all --shm-size=1g \ - # ${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }} \ - # bash <<"EOF" |& tee test-levanter.log - # pip install flake8 pytest soundfile librosa - # PYTHONPATH=/opt/levanter/tests:$PYTHONPATH pytest /opt/levanter/tests -m "not entry and not slow and not ray" - # EOF - # STATISTICS_SCRIPT: | - # summary_line=$(tail -n1 test-levanter.log) - # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - # failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') - # passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') - # total_tests=$((failed_tests + passed_tests)) - # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - # echo 
"ERRORS=${errors}" >> $GITHUB_OUTPUT - # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - # ARTIFACTS: | - # test-levanter.log - # secrets: inherit + test-levanter: + needs: build-levanter + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + uses: ./.github/workflows/_test_unit.yaml + with: + TEST_NAME: levanter + EXECUTE: | + docker run -i --gpus all --shm-size=1g \ + ${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }} \ + bash <<"EOF" |& tee test-levanter.log + pip install flake8 pytest soundfile librosa + PYTHONPATH=/opt/levanter/tests:$PYTHONPATH pytest /opt/levanter/tests -m "not entry and not slow and not ray" + EOF + STATISTICS_SCRIPT: | + summary_line=$(tail -n1 test-levanter.log) + errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') + passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') + total_tests=$((failed_tests + passed_tests)) + echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + ARTIFACTS: | + test-levanter.log + secrets: inherit - # COMMENT THIS # test-te: # needs: build-upstream-pax # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a @@ -619,37 +618,37 @@ jobs: # pytest-report.jsonl # secrets: inherit - # test-gemma: - # needs: build-gemma - # uses: ./.github/workflows/_test_unit.yaml - # if: inputs.ARCHITECTURE == 'amd64' - # with: - # TEST_NAME: gemma - # EXECUTE: | - # docker run --shm-size=1g --gpus all ${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }} \ - # bash -ec \ - # "cd /opt/gemma && pip install -e .[dev] && pytest ." 
| tee test-gemma.log - # STATISTICS_SCRIPT: | - # summary_line=$(tail -n1 test-gemma.log) - # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - # failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') - # passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') - # total_tests=$((failed_tests + passed_tests)) - # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - # ARTIFACTS: | - # test-gemma.log - # secrets: inherit + test-gemma: + needs: build-gemma + uses: ./.github/workflows/_test_unit.yaml + if: inputs.ARCHITECTURE == 'amd64' + with: + TEST_NAME: gemma + EXECUTE: | + docker run --shm-size=1g --gpus all ${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }} \ + bash -ec \ + "cd /opt/gemma && pip install -e .[dev] && pytest ." 
| tee test-gemma.log + STATISTICS_SCRIPT: | + summary_line=$(tail -n1 test-gemma.log) + errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') + passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') + total_tests=$((failed_tests + passed_tests)) + echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + ARTIFACTS: | + test-gemma.log + secrets: inherit - # test-maxtext: - # needs: build-maxtext - # if: inputs.ARCHITECTURE == 'amd64' # no arm64 gpu runners - # uses: ./.github/workflows/_test_maxtext.yaml - # with: - # MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }} - # secrets: inherit + test-maxtext: + needs: build-maxtext + if: inputs.ARCHITECTURE == 'amd64' # no arm64 gpu runners + uses: ./.github/workflows/_test_maxtext.yaml + with: + MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }} + secrets: inherit test-axlearn-eks: needs: build-axlearn @@ -748,6 +747,7 @@ jobs: # the fuji test will run for 20 minutes only, as per 2025-02-24 # is not possible to set the `max_steps` value + # this will be done with a customer python code test-axlearn-fuji-models-eks: needs: build-axlearn if: inputs.ARCHITECTURE == 'amd64' From 97f02157f35ae71faf68daf59ab18abf97431f87 Mon Sep 17 00:00:00 2001 From: Steboss Date: Tue, 4 Mar 2025 13:33:43 +0000 Subject: [PATCH 87/89] fix ci typo --- .github/workflows/_ci.yaml | 48 +++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 9ded946d2..8072e282d 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -484,30 +484,30 @@ jobs: 
if: ${{ always() }} with: token-name: ${{ env.TOKEN_NAME }} - COMMENT THIS - test-equinox: - needs: build-equinox - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_unit.yaml - with: - IMAGE: ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} - TEST_NAME: equinox - EXECUTE: | - docker run --shm-size=1g --gpus all ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} \ - bash -exc -o pipefail \ - 'pushd /opt/equinox/tests && pip install -r requirements.txt && pytest .' | tee test-equinox.log - STATISTICS_SCRIPT: | - errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') - passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') - total_tests=$((failed_tests + passed_tests)) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - ARTIFACTS: | - test-equinox.log - secrets: inherit + + # test-equinox: + # needs: build-equinox + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # uses: ./.github/workflows/_test_unit.yaml + # with: + # IMAGE: ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} + # TEST_NAME: equinox + # EXECUTE: | + # docker run --shm-size=1g --gpus all ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} \ + # bash -exc -o pipefail \ + # 'pushd /opt/equinox/tests && pip install -r requirements.txt && pytest .' 
| tee test-equinox.log + # STATISTICS_SCRIPT: | + # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + # failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') + # passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') + # total_tests=$((failed_tests + passed_tests)) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # ARTIFACTS: | + # test-equinox.log + # secrets: inherit # test-te-multigpu: # needs: build-upstream-pax From f1fbff2a06fc7c3d67f2f2d8b61d2708daf90712 Mon Sep 17 00:00:00 2001 From: Steboss Date: Tue, 4 Mar 2025 14:00:05 +0000 Subject: [PATCH 88/89] Fix test-nsys-jax-eks --- .github/workflows/_ci.yaml | 33 ++++++++------------------------- 1 file changed, 8 insertions(+), 25 deletions(-) diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 8072e282d..9f0bb971a 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -436,12 +436,9 @@ jobs: steps: - name: Check out the repository uses: actions/checkout@v4 - - name: GHCR login - uses: ./.github/actions/ghcr-login - with: - docker-username: ${{ github.repository_owner }} - docker-password: ${{ secrets.GITHUB_TOKEN}} - token-name: ${{ env.TOKEN_NAME }} + - name: K8s GHCR store and delete token + id: store-token + uses: ./.github/actions/store-delete-k8s-ghcr - name: Configure Kubernetes job run: | yq -i ea 'select(di == 0).spec.selector.job-name = strenv(JOB_NAME) @@ -452,15 +449,10 @@ jobs: .github/eks-workflow-files/job.yml git diff .github/eks-workflow-files/job.yml - name: Submit Kubernetes job - uses: ./.github/actions/submit-k8s-job + uses: ./.github/actions/submit-delete-k8s-job with: job-config-file: .github/eks-workflow-files/job.yml job-name: 
${{ env.JOB_NAME }} - - name: Delete eks job - uses: ./.github/actions/delete-k8s-job - if: ${{ always() }} - with: - job-name: ${{ env.JOB_NAME }} - name: Configure post-processing job run: | export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip" @@ -470,20 +462,11 @@ jobs: | .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \ .github/eks-workflow-files/post-process-job.yml git diff .github/eks-workflow-files/post-process-job.yml - - name: Submit post process k8s job - uses: ./.github/actions/submit-k8s-job - with: - job-config-file: .github/eks-workflow-files/post-process-job.yml - job-name: ${{ env.POSTPROCESS_JOB_NAME }} - - name: Delete post process k8s job - uses: ./.github/actions/delete-k8s-job - with: - job-name: ${{ env.POSTPROCESS_JOB_NAME }} - - name: Delete GitHub Container Registry token - uses: ./.github/actions/delete-ghcr-token - if: ${{ always() }} + - name: Submit post process Kubernetes job + uses: ./.github/actions/submit-delete-k8s-job with: - token-name: ${{ env.TOKEN_NAME }} + job-config-file: .github/eks-workflow-files/post-process-job.yml + job-name: ${{ env.POSTPROCESS_JOB_NAME }} # test-equinox: # needs: build-equinox From 626d1a76da5ca1decfd9822f512849a2b5164cef Mon Sep 17 00:00:00 2001 From: Steboss Date: Tue, 4 Mar 2025 14:25:29 +0000 Subject: [PATCH 89/89] fix names in CI --- .github/workflows/_ci.yaml | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 9f0bb971a..8ed17d9d6 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -432,10 +432,15 @@ jobs: JAX_DOCKER_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} JOB_NAME: ${{ github.run_id }}-nsys-jax POSTPROCESS_JOB_NAME: ${{ github.run_id }}-nsys-jax-postprocess - TOKEN_NAME: ${{ github.run_id }}-nsys-jax-token steps: - name: Check out the repository uses: actions/checkout@v4 + - name: Login to GitHub Container Registry + uses: 
docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} - name: K8s GHCR store and delete token id: store-token uses: ./.github/actions/store-delete-k8s-ghcr @@ -443,7 +448,7 @@ jobs: run: | yq -i ea 'select(di == 0).spec.selector.job-name = strenv(JOB_NAME) | select(di == 1).metadata.name = strenv(JOB_NAME) - | select(di == 1).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) + | select(di == 1).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}" | select(di == 1).spec.template.spec.containers[0].image = strenv(JAX_DOCKER_IMAGE) | select(di == 1).spec.template.spec.containers[0].env[0].value = strenv(JOB_NAME)' \ .github/eks-workflow-files/job.yml @@ -458,7 +463,7 @@ jobs: export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip" yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME) | .spec.template.spec.containers[].image = strenv(JAX_DOCKER_IMAGE) - | .spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) + | .spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}" | .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \ .github/eks-workflow-files/post-process-job.yml git diff .github/eks-workflow-files/post-process-job.yml