Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
139 commits
Select commit Hold shift + click to select a range
acbb950
ci: Fix issues
ko3n1g Oct 14, 2025
503dbdd
test
ko3n1g Oct 14, 2025
ddb7734
fix
ko3n1g Oct 15, 2025
e34600b
fix
ko3n1g Oct 15, 2025
18b1476
format
ko3n1g Oct 15, 2025
860d3ca
update lockfile
ko3n1g Oct 15, 2025
44a4118
fix
ko3n1g Oct 15, 2025
bf9301f
update with nemo-run
ko3n1g Oct 15, 2025
2b8b544
format
ko3n1g Oct 15, 2025
cf4be18
switch to nemorun
ko3n1g Oct 17, 2025
617808d
pythonpath
ko3n1g Oct 17, 2025
63d3e12
parallel
ko3n1g Oct 17, 2025
f801418
parse from matrix
ko3n1g Oct 17, 2025
705c843
fix
ko3n1g Oct 17, 2025
0c4f13b
ghci
ko3n1g Oct 17, 2025
969365b
test
ko3n1g Oct 17, 2025
3028065
test
ko3n1g Oct 17, 2025
caad66f
fix
ko3n1g Oct 17, 2025
a28c36f
test
ko3n1g Oct 17, 2025
0fffc6c
test
ko3n1g Oct 17, 2025
7bcad07
container
ko3n1g Oct 17, 2025
4f52645
env
ko3n1g Oct 17, 2025
1b1a6ec
set context
ko3n1g Oct 17, 2025
1d99930
tags
ko3n1g Oct 17, 2025
0b294de
tag
ko3n1g Oct 17, 2025
54fe251
riles
ko3n1g Oct 17, 2025
6bba0d6
wait for build
ko3n1g Oct 17, 2025
3504f7e
sha
ko3n1g Oct 17, 2025
1cae2fa
fix
ko3n1g Oct 17, 2025
26e2fac
test
ko3n1g Oct 17, 2025
becc0ac
test
ko3n1g Oct 17, 2025
9254853
test
ko3n1g Oct 17, 2025
bcf016c
try run
ko3n1g Oct 17, 2025
343c59c
set NEMORUN_HOME
ko3n1g Oct 17, 2025
2494378
mount data dir
ko3n1g Oct 17, 2025
da0fdc5
fix
ko3n1g Oct 17, 2025
9d38352
fix
ko3n1g Oct 17, 2025
77473b6
test
ko3n1g Oct 17, 2025
b117a76
/mnt/datadrive/TestData/megatron-lm/artifacts
ko3n1g Oct 17, 2025
4847729
test
ko3n1g Oct 17, 2025
8bd510e
remove
ko3n1g Oct 17, 2025
ba2f5a4
remove
ko3n1g Oct 17, 2025
f24ebb6
remove
ko3n1g Oct 17, 2025
59d8107
python
ko3n1g Oct 17, 2025
f3c4362
staging
ko3n1g Oct 18, 2025
e53e2cf
test
ko3n1g Oct 18, 2025
b08eb19
fix
ko3n1g Oct 18, 2025
b044061
use generic mounts
ko3n1g Oct 18, 2025
f97fc22
fix
ko3n1g Oct 18, 2025
c57ad99
pull
ko3n1g Oct 18, 2025
3870c2c
fix
ko3n1g Oct 18, 2025
83b7c9d
fix
ko3n1g Oct 18, 2025
d5927c7
all
ko3n1g Oct 18, 2025
841c2c7
bert
ko3n1g Oct 18, 2025
45f5d9b
ckpts
ko3n1g Oct 18, 2025
6260dc6
error
ko3n1g Oct 18, 2025
6ee9811
fix
ko3n1g Oct 18, 2025
5d79303
exitcode
ko3n1g Oct 18, 2025
a590e77
format
ko3n1g Oct 18, 2025
7d806ef
test
ko3n1g Oct 18, 2025
d053f56
fix
ko3n1g Oct 18, 2025
f96c85d
test
ko3n1g Oct 18, 2025
276e578
fix t5
ko3n1g Oct 18, 2025
b032d2e
no nemo2 tests anymore
ko3n1g Oct 18, 2025
219ba83
N_REPEAT
ko3n1g Oct 18, 2025
9981e2b
refactorings
ko3n1g Oct 19, 2025
799e80e
format
ko3n1g Oct 19, 2025
ca28bac
paths
ko3n1g Oct 19, 2025
75926fd
unit tests
ko3n1g Oct 19, 2025
1cbf99a
latest
ko3n1g Oct 19, 2025
cdef674
lightweight
ko3n1g Oct 19, 2025
ea9da95
test
ko3n1g Oct 19, 2025
00cd098
fix
ko3n1g Oct 19, 2025
4937651
test
ko3n1g Oct 19, 2025
05689d3
coverage only for unit tests
ko3n1g Oct 19, 2025
dc1f28f
lightweight mode
ko3n1g Oct 19, 2025
49b26a8
lightweight
ko3n1g Oct 19, 2025
ab73d32
coverage
ko3n1g Oct 19, 2025
a5e797d
uuid-runtime
ko3n1g Oct 19, 2025
196c517
coverage
ko3n1g Oct 19, 2025
b427674
coverage
ko3n1g Oct 19, 2025
362e906
fix
ko3n1g Oct 19, 2025
31d9ae0
fix
ko3n1g Oct 19, 2025
94c70e5
fixes
ko3n1g Oct 19, 2025
f06315e
fix
ko3n1g Oct 19, 2025
ef64666
fix
ko3n1g Oct 19, 2025
25c4169
use image
ko3n1g Oct 19, 2025
92bad94
fix
ko3n1g Oct 19, 2025
9ee914c
fixes
ko3n1g Oct 19, 2025
8aead6c
fix
ko3n1g Oct 19, 2025
e24f0ab
test
ko3n1g Oct 20, 2025
6d88674
test
ko3n1g Oct 20, 2025
9e58cff
fix
ko3n1g Oct 20, 2025
593996c
set permissions
ko3n1g Oct 20, 2025
7c0542e
rename bert
ko3n1g Oct 20, 2025
358a566
set -exo pipefail
ko3n1g Oct 20, 2025
5f43ee4
fix
ko3n1g Oct 20, 2025
b9d793c
Merge remote-tracking branch 'gitlab/main' into ko3n1g/ci/fix-issues
ko3n1g Oct 20, 2025
07dbbae
lock
ko3n1g Oct 20, 2025
9c09308
sortby
ko3n1g Oct 20, 2025
79a199d
nemo-run
ko3n1g Oct 20, 2025
fe1c6a7
vocab file
ko3n1g Oct 20, 2025
b2316a9
less output
ko3n1g Oct 20, 2025
53c5d8a
rename
ko3n1g Oct 20, 2025
24a1fe0
345m_
ko3n1g Oct 20, 2025
ed55fa3
suppress
ko3n1g Oct 20, 2025
e5e00fc
fix
ko3n1g Oct 20, 2025
152eac3
suppress
ko3n1g Oct 20, 2025
d8cc60d
--timing-log-level
ko3n1g Oct 20, 2025
154c91a
moe
ko3n1g Oct 20, 2025
0df0e17
t5
ko3n1g Oct 20, 2025
873623a
fix
ko3n1g Oct 20, 2025
88140e6
less verbose
ko3n1g Oct 20, 2025
839d2a3
multimodal-llava
ko3n1g Oct 20, 2025
86b0f8a
fixes
ko3n1g Oct 20, 2025
9c76de9
pytest only if exit_code 0
ko3n1g Oct 20, 2025
6c6bcca
fix
ko3n1g Oct 20, 2025
5fbca55
fix
ko3n1g Oct 20, 2025
34ae48a
logs
ko3n1g Oct 20, 2025
53aa21d
fixes
ko3n1g Oct 20, 2025
43fbaee
test
ko3n1g Oct 20, 2025
7e745fd
fix
ko3n1g Oct 20, 2025
ae97413
fix thresholds
ko3n1g Oct 20, 2025
c0031e5
fix
ko3n1g Oct 20, 2025
ce71087
fix
ko3n1g Oct 20, 2025
c05581e
if always()
ko3n1g Oct 21, 2025
4c2f803
auto-retry flaky failure
ko3n1g Oct 21, 2025
990b99b
linting
ko3n1g Oct 21, 2025
8f9823d
debug
ko3n1g Oct 21, 2025
40c030d
debug
ko3n1g Oct 21, 2025
e982b55
debug
ko3n1g Oct 21, 2025
d66e1bb
test
ko3n1g Oct 21, 2025
9293946
debug
ko3n1g Oct 21, 2025
cad4e3b
fix
ko3n1g Oct 21, 2025
8fcd4ae
fiux
ko3n1g Oct 21, 2025
7b07ea1
logging
ko3n1g Oct 21, 2025
5a04a5e
test
ko3n1g Oct 21, 2025
22cd2cc
remove debugb
ko3n1g Oct 21, 2025
c781d04
cleanup
ko3n1g Oct 21, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
218 changes: 63 additions & 155 deletions .github/actions/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@ name: "Test Template"
description: "Template for running NeMo tests in a containerized environment"

inputs:
container-image:
description: "Container image to use for test"
required: true
timeout:
description: "Max runtime of test in minutes"
required: false
Expand Down Expand Up @@ -46,210 +49,118 @@ inputs:
runs:
using: "composite"
steps:
- name: Copy data
shell: bash
if: inputs.is_unit_test == 'false'
env:
SOURCE_DIR: /mnt/datadrive/TestData/megatron-lm/artifacts
TARGET_DIR: /home/runner/_work/TestData/megatron-lm/artifacts
MODEL: ${{ inputs.model }}
run: |
mkdir -p $TARGET_DIR/text/data/

if [[ "$MODEL" == "bert" ]]; then
mkdir -p $TARGET_DIR/text/the_pile/bert_shard00/
cp -a $SOURCE_DIR/text/the_pile/bert_shard00/. $TARGET_DIR/text/data/
elif [[ "$MODEL" == "gpt" ]] || [[ "$MODEL" == "moe" ]]; then
cp -a $SOURCE_DIR/text/the_pile/shard00/. $TARGET_DIR/text/data/
fi

- name: Install curl, sudo
shell: bash
run: |
sudo apt-get update
sudo apt-get install -y curl uuid-runtime

- name: Checkout repository
uses: actions/checkout@v2
with:
path: ${{ github.workspace }}/Megatron-LM

- name: Cache uv
uses: actions/cache@v4
id: cache
with:
path: cache-mount
key: ${{ runner.os }}-uv-${{ hashFiles('**/uv.lock') }}
restore-keys: |
${{ runner.os }}-uv-

- name: Restore Docker cache mounts
uses: reproducible-containers/buildkit-cache-dance@5b81f4d29dc8397a7d341dba3aeecc7ec54d6361
with:
cache-dir: cache-mount
dockerfile: docker/Dockerfile.ci.dev
skip-extraction: ${{ steps.cache.outputs.cache-hit }}
- name: Change ownership of /home/runner/
shell: bash
run: sudo chown -R $(whoami) /home/runner/

- name: Setup python
uses: actions/setup-python@v5
with:
python-version: 3.12

- name: Download test data
shell: bash
env:
GH_TOKEN: ${{ inputs.PAT }}
TIMEOUT: ${{ inputs.timeout }}
IS_UNIT_TEST: ${{ inputs.is_unit_test == 'true' }}
- name: Install uuidgen
shell: bash -x -e -u -o pipefail {0}
run: |
echo "::group::Download test data"
pip install --no-cache-dir pygithub click
python tests/test_utils/python_scripts/download_unit_tests_dataset.py --assets-dir ./assets
echo "::endgroup::"
apt-get update
apt-get install -y uuid-runtime

- name: Create run-script (unit test)
shell: bash
shell: bash -x -e -u -o pipefail {0}
if: inputs.is_unit_test == 'true'
run: |
echo "::group::Create run-script"
cmd=$(cat <<'RUN_TEST_EOF'
#!/bin/bash

docker exec -t test_container_${{ github.run_id }} bash -c '
set -e
bash /opt/megatron-lm/tests/unit_tests/run_ci_test.sh \
--tag ${{ inputs.tag }} \
--environment dev \
--bucket '\''${{ inputs.test_case }}'\'' \
--log-dir /opt/megatron-lm/outputs/logs
'
export PYTHONPATH=$(pwd)
export NEMORUN_HOME=$(pwd)
pip install --no-cache-dir uv
uv sync --only-group test
uv run python tests/test_utils/python_scripts/launch_nemo_run_workload.py \
--scope unit-tests \
--model unit-tests \
--test-case '${{ inputs.test_case }}' \
--environment dev \
--platform dgx_h100 \
--tag ${{ inputs.tag }} \
--container-image ${{ inputs.container-image }}

RUN_TEST_EOF
)
echo "$cmd" | tee "job.sh"
echo "::endgroup::"

- name: Create run-script (e2e test)
shell: bash
shell: bash -x -e -u -o pipefail {0}
if: inputs.is_unit_test == 'false'
env:
MODEL: ${{ inputs.model }}
run: |
echo "::group::Create run-script"
cmd=$(cat <<'RUN_TEST_EOF'
#!/bin/bash



docker exec -t test_container_${{ github.run_id }} bash -c '

set -e
ls -al /workspace/data

if [[ "${{ inputs.model }}" == "bert" ]]; then
TRAINING_SCRIPT_PATH=pretrain_bert.py
elif [[ "${{ inputs.model }}" == "gpt" ]] || [[ "${{ inputs.model }}" == "moe" ]]; then
TRAINING_SCRIPT_PATH=pretrain_gpt.py
fi

ARGUMENTS=(
"DATA_PATH=/workspace/data"
"DATA_CACHE_PATH=/workspace/data/cache"
"OUTPUT_PATH=$(pwd)/outputs/"
"TENSORBOARD_PATH=$(pwd)/tensorboard"
"CHECKPOINT_SAVE_PATH=$(pwd)/checkpoints"
"CHECKPOINT_LOAD_PATH=/workspace/checkpoints/$NAME"
"TRAINING_SCRIPT_PATH=$TRAINING_SCRIPT_PATH"
"TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/${{inputs.model}}/${{inputs.test_case}}/model_config.yaml"
"GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/${{inputs.model}}/${{inputs.test_case}}/golden_values_dev_dgx_h100.json"
"N_REPEAT=5"
"ENABLE_LIGHTWEIGHT_MODE=false"
"RECORD_CHECKPOINTS=false"
)

bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${ARGUMENTS[@]}
'
set -euxo pipefail

export PYTHONPATH=$(pwd)
export NEMORUN_HOME=$(pwd)
pip install --no-cache-dir uv
uv sync --only-group test
uv run python tests/test_utils/python_scripts/launch_nemo_run_workload.py \
--scope mr \
--model ${{ inputs.model }} \
--test-case ${{ inputs.test_case }} \
--environment dev \
--platform dgx_h100 \
--container-image ${{ inputs.container-image }} \
--data-dir /mnt/datadrive/TestData/megatron-lm/artifacts

RUN_TEST_EOF
)
echo "$cmd" | tee "job.sh"
echo "::endgroup::"

- name: Build container
shell: bash
env:
GH_TOKEN: ${{ inputs.PAT }}
run: |
echo "::group::Build test container"
docker build -f docker/Dockerfile.ci.dev --build-arg FROM_IMAGE_NAME="nvcr.io/nvidia/pytorch:25.06-py3" --target=main -t megatron-core .
echo "::endgroup::"

- name: Start container
shell: bash
run: |
echo "::group::Start test container"
set -x

cmd=$(cat <<RUN_TEST_EOF
#!/bin/bash
docker container rm -f test_container_${{ github.run_id }} || true
docker run \
--rm \
-d \
--name test_container_${{ github.run_id }} \
--runtime=nvidia --gpus all \
--shm-size=64g \
--ipc=host \
-e NCCL_IB_DISABLE=1 \
-e NCCL_P2P_LEVEL=NVL \
--workdir /opt/megatron-lm/ \
-v /home/runner/_work/TestData/megatron-lm/artifacts/text/data/:/workspace/data \
--volume ${{ github.workspace }}/Megatron-LM:/opt/megatron-lm/ \
$VOLUME_ARGS \
megatron-core \
bash -c "sleep $(( ${{ inputs.timeout }} * 60 + 60 ))"
RUN_TEST_EOF
)

echo "$cmd" | tee "retry_job.sh"
bash retry_job.sh
echo "::endgroup::"

- name: Set timeout
shell: bash
shell: bash -x -e -u -o pipefail {0}
id: timeout_in_seconds
run: |
echo "::group::Set timeout"
echo "main=$(( ${{ inputs.timeout }} * 60 ))" | tee -a "$GITHUB_OUTPUT"
echo "::endgroup::"

- name: Pull container
shell: bash -x -e -u -o pipefail {0}
run: |
echo "::group::Pull container"
docker pull ${{ inputs.container-image }}
echo "::endgroup::"

- name: Run main script
uses: nick-fields/retry@v3
shell: bash -x -e -u -o pipefail {0}
id: run-main-script
with:
timeout_seconds: ${{ steps.timeout_in_seconds.outputs.main }}
max_attempts: 3
shell: bash
retry_on: any
command: /bin/bash job.sh
on_retry_command: /bin/bash retry_job.sh
run: |
echo "::group::Run main script"
EXIT_CODE=0
/bin/bash job.sh || EXIT_CODE=$?
echo "exit_code=$EXIT_CODE" | tee -a "$GITHUB_OUTPUT"
exit $EXIT_CODE
echo "::endgroup::"

- name: Check result
id: check
shell: bash
shell: bash -x -e -u -o pipefail {0}
if: always()
env:
IS_UNIT_TEST: ${{ inputs.is_unit_test == 'true' }}
run: |
echo "::group::Check result"

docker cp test_container_${{ github.run_id }}:/opt/megatron-lm/outputs/logs ./
logs_report=logs-${{ inputs.test_case }}-${{ github.run_id }}-$(uuidgen)
echo "logs_report=$logs_report" | sed 's/\//-/g' | sed 's/\*/-/g' | tee -a "$GITHUB_OUTPUT"

if [[ "$IS_UNIT_TEST" == "true" ]]; then
docker exec test_container_${{ github.run_id }} /opt/venv/bin/coverage xml
docker cp test_container_${{ github.run_id }}:/opt/megatron-lm/.coverage .coverage
docker cp test_container_${{ github.run_id }}:/opt/megatron-lm/coverage.xml coverage.xml
coverage_report=coverage-${{ inputs.is_unit_test == 'true' && 'unit-test' || 'e2e' }}-${{ github.run_id }}-$(uuidgen)
else
coverage_report=none
Expand All @@ -267,16 +178,18 @@ runs:
if [[ "$IS_SUCCESS" == "false" ]]; then
echo Test did not finish successfully.
exit 1
else
docker exec -t test_container_${{ github.run_id }} /opt/venv/bin/coverage report -i
fi

if [[ "$coverage_report" != "none" ]]; then
uv run coverage report -i
fi

exit $EXIT_CODE
echo "::endgroup::"

- name: Upload coverage
uses: actions/upload-artifact@v4
if: ${{ steps.check.outputs.coverage_report != 'none' }}
if: ${{ always() && steps.check.outputs.coverage_report != 'none' }}
with:
name: ${{ steps.check.outputs.coverage_report }}
path: |
Expand All @@ -286,13 +199,8 @@ runs:

- name: Upload logs
uses: actions/upload-artifact@v4
if: always()
with:
name: ${{ steps.check.outputs.logs_report }}
path: logs
path: ${{ inputs.is_unit_test == 'true' && 'logs' || 'assets_dir/logs' }}
include-hidden-files: true

- name: Container shutdown
if: always()
shell: bash
run: |
docker container rm -f test_container_${{ github.run_id }} || true
Loading
Loading