
Commit cb0bb7e

add pretraining installer for abci 3.0 (#82)
1 parent 967826e commit cb0bb7e

13 files changed, +261 -0 lines changed
Lines changed: 53 additions & 0 deletions (qsub_setup.sh)
@@ -0,0 +1,53 @@
#!/bin/bash
#PBS -P gcg51557
#PBS -q R10415
#PBS -v RTYPE=rt_HF
#PBS -l select=1
#PBS -l walltime=01:00:00
#PBS -o /dev/null
#PBS -e /dev/null

cd $PBS_O_WORKDIR

TIMESTAMP=$(date +%Y%m%d%H%M%S)
JOBID=${PBS_JOBID%%.*}
mkdir -p logs
LOGFILE=logs/install-$JOBID.out
ERRFILE=logs/install-$JOBID.err
exec > $LOGFILE 2> $ERRFILE

set -eu -o pipefail

echo "TARGET_DIR=${TARGET_DIR}"

# Find the script directory
if [ -n "${PBS_JOBID:-}" ]; then
    SCRIPT_PATH="$PBS_O_WORKDIR/$(basename "$0")"
else
    SCRIPT_PATH=$(realpath "$0")
fi
SCRIPT_DIR=$(dirname "${SCRIPT_PATH}")
echo "SCRIPT_DIR=${SCRIPT_DIR}"

mkdir ${TARGET_DIR}
mkdir ${TARGET_DIR}/src

# Copy necessary scripts
cp -r ${SCRIPT_DIR}/scripts ${TARGET_DIR}

# Set variables
source ${TARGET_DIR}/scripts/environment.sh
set > ${TARGET_DIR}/installer_envvar.log

# Install libraries
source ${SCRIPT_DIR}/src/install_python.sh
source ${SCRIPT_DIR}/src/install_venv.sh
source ${SCRIPT_DIR}/src/install_pytorch.sh
source ${SCRIPT_DIR}/src/install_requirements.sh
source ${SCRIPT_DIR}/src/install_apex.sh
source ${SCRIPT_DIR}/src/install_flash_attention.sh
source ${SCRIPT_DIR}/src/install_transformer_engine.sh
source ${SCRIPT_DIR}/src/install_megatron_lm.sh
source ${SCRIPT_DIR}/src/install_tokenizer.sh

echo "Done"
Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
#!/bin/bash

set -eu -o pipefail

if [ $# -ne 1 ]; then
    >&2 echo "Usage: $0 <target-dir>"
    >&2 echo "Example: $0 /path/to/target_dir"
    exit 1
fi

target_dir=$1; shift

qsub \
    -v TARGET_DIR=${target_dir},RTYPE=rt_HF \
    -o /dev/null -e /dev/null \
    -m n \
    qsub_setup.sh
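
A usage sketch for the launcher above (the launcher's own filename is not shown in this diff, so run_setup.sh below is only an assumed name; the submission flow itself comes from the script):

# Hypothetical invocation from a login node; the real script name may differ.
bash run_setup.sh /path/to/target_dir
# The submitted job then writes logs/install-<jobid>.out and logs/install-<jobid>.err
# in the directory the job was submitted from.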
Lines changed: 34 additions & 0 deletions (scripts/environment.sh)
@@ -0,0 +1,34 @@
#!/bin/bash
# List of environment variables and module loads for pretrain tasks

export PRETRAIN_CUDA_VERSION_MAJOR=12
export PRETRAIN_CUDA_VERSION_MINOR=4
export PRETRAIN_CUDA_VERSION_PATCH=1

export PRETRAIN_CUDA_VERSION=${PRETRAIN_CUDA_VERSION_MAJOR}.${PRETRAIN_CUDA_VERSION_MINOR}
export PRETRAIN_CUDA_VERSION_SHORT=${PRETRAIN_CUDA_VERSION_MAJOR}${PRETRAIN_CUDA_VERSION_MINOR}
export PRETRAIN_CUDNN_VERSION=9.5
export PRETRAIN_CUDNN_VERSION_WITH_PATCH=9.5.1
export PRETRAIN_HPCX_VERSION=2.20
export PRETRAIN_NCCL_VERSION=2.25
export PRETRAIN_NCCL_VERSION_WITH_PATCH=2.25.1-1

export PRETRAIN_PYTHON_VERSION=3.10.4
export PRETRAIN_TORCH_VERSION=2.6.0
export PRETRAIN_TORCHVISION_VERSION=0.21.0
export PRETRAIN_APEX_COMMIT=312acb44f9fe05cab8c67bba6daa0e64d3737863
export PRETRAIN_FLASH_ATTENTION_VERSION=2.5.8
export PRETRAIN_TRANSFORMER_ENGINE_VERSION=1.13.0

export PRETRAIN_MEGATRON_TAG=v4
# Ensure the appropriate Huggingface tokenizer is included
# https://github.com/llm-jp/scripts/pull/12#discussion_r1708415209
export PRETRAIN_TOKENIZER_TAG=v3.0b2

source /etc/profile.d/modules.sh
module load cuda/${PRETRAIN_CUDA_VERSION}/${PRETRAIN_CUDA_VERSION}.${PRETRAIN_CUDA_VERSION_PATCH}
module load cudnn/${PRETRAIN_CUDNN_VERSION}/${PRETRAIN_CUDNN_VERSION_WITH_PATCH}
module load hpcx/${PRETRAIN_HPCX_VERSION}
module load nccl/${PRETRAIN_NCCL_VERSION}/${PRETRAIN_NCCL_VERSION_WITH_PATCH}

export LD_LIBRARY_PATH=/usr/lib64:$LD_LIBRARY_PATH
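
As a minimal sketch of how a later job script might consume this environment file together with the venv built by the installer (the job script itself is not part of this commit; only the paths created by the installer are assumed):

#!/bin/bash
# Hypothetical usage of the installed environment.
TARGET_DIR=/path/to/target_dir                 # same directory passed to the installer
source ${TARGET_DIR}/scripts/environment.sh    # loads modules and exports PRETRAIN_* variables
source ${TARGET_DIR}/venv/bin/activate         # activates the Python stack installed below
python -c "import torch; print(torch.__version__)"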
Lines changed: 25 additions & 0 deletions (src/install_apex.sh)
@@ -0,0 +1,25 @@
# Installs NVIDIA Apex.

echo "Installing apex with commit ${PRETRAIN_APEX_COMMIT}"
source ${TARGET_DIR}/venv/bin/activate
pushd ${TARGET_DIR}/src

git clone --recurse-submodules https://github.com/NVIDIA/apex
pushd apex

# Checkout the specific commit
git checkout ${PRETRAIN_APEX_COMMIT}
git submodule update --init --recursive


python -m pip install \
    -v \
    --no-cache-dir \
    --no-build-isolation \
    --config-settings "--build-option=--cpp_ext" \
    --config-settings "--build-option=--cuda_ext" \
    ./
popd # apex

popd # ${TARGET_DIR}/src
deactivate
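
As an optional, hedged post-install check (not part of this commit), importing one of Apex's compiled modules from the venv confirms that the --cpp_ext/--cuda_ext build actually produced the extensions:

# Hypothetical check; amp_C and fused_layer_norm_cuda are only present when the CUDA extensions built.
source ${TARGET_DIR}/venv/bin/activate
python -c "import amp_C, fused_layer_norm_cuda; print('apex extensions OK')"
deactivate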
Lines changed: 11 additions & 0 deletions (src/install_flash_attention.sh)
@@ -0,0 +1,11 @@
# Installs flash attention.

echo "Installing Flash Attention ${PRETRAIN_FLASH_ATTENTION_VERSION}"
source ${TARGET_DIR}/venv/bin/activate

python -m pip install \
    --no-build-isolation \
    --no-cache-dir \
    "flash-attn==${PRETRAIN_FLASH_ATTENTION_VERSION}"

deactivate
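
An optional sanity check (an assumption, not part of this commit) that the pinned wheel is importable from the venv:

# Hypothetical check: print the installed flash-attn version.
source ${TARGET_DIR}/venv/bin/activate
python -c "import flash_attn; print(flash_attn.__version__)"
deactivate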
Lines changed: 30 additions & 0 deletions (src/install_megatron_lm.sh)
@@ -0,0 +1,30 @@
# Installs Megatron-LM.

echo "Installing Megatron-LM ${PRETRAIN_MEGATRON_TAG}"
source ${TARGET_DIR}/venv/bin/activate
pushd ${TARGET_DIR}/src

# Download our Megatron-LM fork and build the dataset helper library
git clone https://github.com/llm-jp/Megatron-LM -b ${PRETRAIN_MEGATRON_TAG}
pushd Megatron-LM
pushd megatron/core/datasets

# NOTE(odashi):
# The original Makefile in this directory uses the system's (or pyenv's) python3-config,
# but we need to invoke the python3-config installed in our target directory.
MEGATRON_HELPER_CPPFLAGS=(
    -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color
    $(python -m pybind11 --includes)
)
MEGATRON_HELPER_EXT=$(${TARGET_DIR}/python/bin/python3-config --extension-suffix)

# NOTE(odashi):
# Newer versions of Megatron-LM renamed the extension 'helpers' to 'helpers_cpp'.
#g++ ${MEGATRON_HELPER_CPPFLAGS[@]} helpers.cpp -o helpers_cpp${MEGATRON_HELPER_EXT}
g++ ${MEGATRON_HELPER_CPPFLAGS[@]} helpers.cpp -o helpers${MEGATRON_HELPER_EXT}

popd # megatron/core/datasets
popd # Megatron-LM

popd # ${TARGET_DIR}/src
deactivate
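
To confirm that the freshly compiled helpers extension loads with the venv's Python, a hedged check (not part of this commit) can be run from the Megatron-LM checkout, since the package is used in place rather than pip-installed:

# Hypothetical check; run from the repository root so 'megatron' resolves as a package.
source ${TARGET_DIR}/venv/bin/activate
pushd ${TARGET_DIR}/src/Megatron-LM
python -c "from megatron.core.datasets import helpers; print('helpers extension OK')"
popd # ${TARGET_DIR}/src/Megatron-LM
deactivate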
Lines changed: 17 additions & 0 deletions (src/install_python.sh)
@@ -0,0 +1,17 @@
# Script to install Python to TARGET_DIR
#
# This script will make the following directories:
# * ${TARGET_DIR}/src/cpython ... Python source tree
# * ${TARGET_DIR}/python ... installed Python binaries

echo "Installing Python ${PRETRAIN_PYTHON_VERSION}"
pushd ${TARGET_DIR}/src

git clone https://github.com/python/cpython -b v${PRETRAIN_PYTHON_VERSION}
pushd cpython
./configure --prefix="${TARGET_DIR}/python" --enable-optimizations
make -j 64
make install
popd # cpython

popd # ${TARGET_DIR}/src
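
A simple post-install check (an assumption, not part of this commit) that the interpreter landed in the expected prefix:

# Hypothetical check: the locally built interpreter should report the pinned version.
${TARGET_DIR}/python/bin/python3 --version    # expected: Python ${PRETRAIN_PYTHON_VERSION}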
Lines changed: 13 additions & 0 deletions (src/install_pytorch.sh)
@@ -0,0 +1,13 @@
# Installs PyTorch and torchvision.

echo "Installing torch ${PRETRAIN_TORCH_VERSION}+cu${PRETRAIN_CUDA_VERSION_SHORT} and torchvision ${PRETRAIN_TORCHVISION_VERSION}+cu${PRETRAIN_CUDA_VERSION_SHORT}"

source ${TARGET_DIR}/venv/bin/activate

python -m pip install \
    --no-cache-dir \
    torch==${PRETRAIN_TORCH_VERSION} \
    torchvision==${PRETRAIN_TORCHVISION_VERSION} \
    --index-url https://download.pytorch.org/whl/cu${PRETRAIN_CUDA_VERSION_SHORT}

deactivate
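
An optional check (not part of this commit) that the CUDA-enabled wheels were picked up from the cu${PRETRAIN_CUDA_VERSION_SHORT} index rather than the CPU-only default:

# Hypothetical check: torch should report the matching CUDA build.
source ${TARGET_DIR}/venv/bin/activate
python -c "import torch; print(torch.__version__, torch.version.cuda)"
deactivate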
Lines changed: 9 additions & 0 deletions (src/install_requirements.sh)
@@ -0,0 +1,9 @@
# Installs prerequisite packages

echo "Installing requirements"

source ${TARGET_DIR}/venv/bin/activate

python -m pip install --no-cache-dir -U -r ${SCRIPT_DIR}/src/requirements.txt

deactivate
Lines changed: 10 additions & 0 deletions (src/install_tokenizer.sh)
@@ -0,0 +1,10 @@
# Installs the LLM-jp Tokenizer.

echo "Installing LLM-jp Tokenizer ${PRETRAIN_TOKENIZER_TAG}"
pushd ${TARGET_DIR}/src

# Download our tokenizer
# (Only the repository checkout is needed; nothing is pip-installed here.)
git clone https://github.com/llm-jp/llm-jp-tokenizer -b ${PRETRAIN_TOKENIZER_TAG}

popd # ${TARGET_DIR}/src
Lines changed: 12 additions & 0 deletions (src/install_transformer_engine.sh)
@@ -0,0 +1,12 @@
# Installs Transformer Engine.

echo "Installing Transformer Engine ${PRETRAIN_TRANSFORMER_ENGINE_VERSION}"
source ${TARGET_DIR}/venv/bin/activate

# Install Transformer Engine
# NOTE(odashi):
# This implicitly installs flash-attn at the version recommended by Transformer Engine.
# If the auto-installed flash-attn causes problems, we need to reinstall it.
python -m pip install --no-build-isolation --no-cache-dir "transformer_engine[pytorch]==${PRETRAIN_TRANSFORMER_ENGINE_VERSION}"

deactivate
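
A hedged import check (not part of this commit); importing the PyTorch bindings may require a GPU-visible environment, so it is best run inside a compute job rather than on a login node:

# Hypothetical check: importing the bindings exercises the compiled Transformer Engine libraries.
source ${TARGET_DIR}/venv/bin/activate
python -c "import transformer_engine.pytorch as te; print('transformer_engine OK')"
deactivate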
Lines changed: 15 additions & 0 deletions (src/install_venv.sh)
@@ -0,0 +1,15 @@
# Script to set up a Python venv in TARGET_DIR
#
# This script will make the following directories:
# * ${TARGET_DIR}/venv ... venv created from the Python installed above

echo "Setup venv"
pushd ${TARGET_DIR}

python/bin/python3 -m venv venv

source venv/bin/activate
python -m pip install --no-cache-dir -U pip setuptools wheel
deactivate

popd # ${TARGET_DIR}
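
A small check (an assumption, not part of this commit) that the venv really inherits the locally built interpreter rather than a system Python:

# Hypothetical check: the venv's python should resolve back to ${TARGET_DIR}/python.
${TARGET_DIR}/venv/bin/python --version                                # expected: Python ${PRETRAIN_PYTHON_VERSION}
${TARGET_DIR}/venv/bin/python -c "import sys; print(sys.base_prefix)"  # expected: ${TARGET_DIR}/python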
Lines changed: 14 additions & 0 deletions (src/requirements.txt)
@@ -0,0 +1,14 @@
accelerate==1.0.1
cmake==3.30.1
einops==0.8.0
ninja==1.11.1.1
numpy==1.26.3
packaging==24.1
pybind11==2.13.6
regex==2024.9.11
safetensors==0.4.5
sentencepiece==0.2.0
six==1.16.0
transformers==4.46.0
wandb==0.18.5
wheel==0.44.0
