
Commit cb0bb7e

add pretraining installer for abci 3.0 (#82)
1 parent 967826e commit cb0bb7e

13 files changed, +261 -0 lines changed
Lines changed: 53 additions & 0 deletions (qsub_setup.sh)
@@ -0,0 +1,53 @@
#!/bin/bash
#PBS -P gcg51557
#PBS -q R10415
#PBS -v RTYPE=rt_HF
#PBS -l select=1
#PBS -l walltime=01:00:00
#PBS -o /dev/null
#PBS -e /dev/null

cd $PBS_O_WORKDIR

TIMESTAMP=$(date +%Y%m%d%H%M%S)
JOBID=${PBS_JOBID%%.*}
mkdir -p logs
LOGFILE=logs/install-$JOBID.out
ERRFILE=logs/install-$JOBID.err
exec > $LOGFILE 2> $ERRFILE

set -eu -o pipefail

echo "TARGET_DIR=${TARGET_DIR}"

# Find the script directory
if [ -n "${PBS_JOBID:-}" ]; then
    SCRIPT_PATH="$PBS_O_WORKDIR/$(basename "$0")"
else
    SCRIPT_PATH=$(realpath "$0")
fi
SCRIPT_DIR=$(dirname "${SCRIPT_PATH}")
echo "SCRIPT_DIR=${SCRIPT_DIR}"

mkdir ${TARGET_DIR}
mkdir ${TARGET_DIR}/src

# Copy necessary scripts
cp -r ${SCRIPT_DIR}/scripts ${TARGET_DIR}

# Set variables
source ${TARGET_DIR}/scripts/environment.sh
set > ${TARGET_DIR}/installer_envvar.log

# Install libraries
source ${SCRIPT_DIR}/src/install_python.sh
source ${SCRIPT_DIR}/src/install_venv.sh
source ${SCRIPT_DIR}/src/install_pytorch.sh
source ${SCRIPT_DIR}/src/install_requirements.sh
source ${SCRIPT_DIR}/src/install_apex.sh
source ${SCRIPT_DIR}/src/install_flash_attention.sh
source ${SCRIPT_DIR}/src/install_transformer_engine.sh
source ${SCRIPT_DIR}/src/install_megatron_lm.sh
source ${SCRIPT_DIR}/src/install_tokenizer.sh

echo "Done"
Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
#!/bin/bash

set -eu -o pipefail

if [ $# -ne 1 ]; then
    >&2 echo "Usage: $0 <target-dir>"
    >&2 echo "Example: $0 /path/to/target_dir"
    exit 1
fi

target_dir=$1; shift

qsub \
    -v TARGET_DIR=${target_dir},RTYPE=rt_HF \
    -o /dev/null -e /dev/null \
    -m n \
    qsub_setup.sh
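
A usage sketch for the launcher above (the launcher's own filename is not shown in this diff, so run_setup.sh below is only an assumed name; the submission flow itself comes from the script):

# Hypothetical invocation from a login node; the real script name may differ.
bash run_setup.sh /path/to/target_dir
# The submitted job then writes logs/install-<jobid>.out and logs/install-<jobid>.err
# in the directory the job was submitted from.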
Lines changed: 34 additions & 0 deletions (scripts/environment.sh)
@@ -0,0 +1,34 @@
#!/bin/bash
# List of environment variables and module loads for pretrain tasks

export PRETRAIN_CUDA_VERSION_MAJOR=12
export PRETRAIN_CUDA_VERSION_MINOR=4
export PRETRAIN_CUDA_VERSION_PATCH=1

export PRETRAIN_CUDA_VERSION=${PRETRAIN_CUDA_VERSION_MAJOR}.${PRETRAIN_CUDA_VERSION_MINOR}
export PRETRAIN_CUDA_VERSION_SHORT=${PRETRAIN_CUDA_VERSION_MAJOR}${PRETRAIN_CUDA_VERSION_MINOR}
export PRETRAIN_CUDNN_VERSION=9.5
export PRETRAIN_CUDNN_VERSION_WITH_PATCH=9.5.1
export PRETRAIN_HPCX_VERSION=2.20
export PRETRAIN_NCCL_VERSION=2.25
export PRETRAIN_NCCL_VERSION_WITH_PATCH=2.25.1-1

export PRETRAIN_PYTHON_VERSION=3.10.4
export PRETRAIN_TORCH_VERSION=2.6.0
export PRETRAIN_TORCHVISION_VERSION=0.21.0
export PRETRAIN_APEX_COMMIT=312acb44f9fe05cab8c67bba6daa0e64d3737863
export PRETRAIN_FLASH_ATTENTION_VERSION=2.5.8
export PRETRAIN_TRANSFORMER_ENGINE_VERSION=1.13.0

export PRETRAIN_MEGATRON_TAG=v4
# Ensure the appropriate Huggingface tokenizer is included
# https://github.com/llm-jp/scripts/pull/12#discussion_r1708415209
export PRETRAIN_TOKENIZER_TAG=v3.0b2

source /etc/profile.d/modules.sh
module load cuda/${PRETRAIN_CUDA_VERSION}/${PRETRAIN_CUDA_VERSION}.${PRETRAIN_CUDA_VERSION_PATCH}
module load cudnn/${PRETRAIN_CUDNN_VERSION}/${PRETRAIN_CUDNN_VERSION_WITH_PATCH}
module load hpcx/${PRETRAIN_HPCX_VERSION}
module load nccl/${PRETRAIN_NCCL_VERSION}/${PRETRAIN_NCCL_VERSION_WITH_PATCH}

export LD_LIBRARY_PATH=/usr/lib64:$LD_LIBRARY_PATH
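
As a minimal sketch of how a later job script might consume this environment file together with the venv built by the installer (the job script itself is not part of this commit; only the paths created by the installer are assumed):

#!/bin/bash
# Hypothetical usage of the installed environment.
TARGET_DIR=/path/to/target_dir                 # same directory passed to the installer
source ${TARGET_DIR}/scripts/environment.sh    # loads modules and exports PRETRAIN_* variables
source ${TARGET_DIR}/venv/bin/activate         # activates the Python stack installed below
python -c "import torch; print(torch.__version__)"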
Lines changed: 25 additions & 0 deletions (src/install_apex.sh)
@@ -0,0 +1,25 @@
# Installs NVIDIA Apex.

echo "Installing apex with commit ${PRETRAIN_APEX_COMMIT}"
source ${TARGET_DIR}/venv/bin/activate
pushd ${TARGET_DIR}/src

git clone --recurse-submodules https://github.com/NVIDIA/apex
pushd apex

# Checkout the specific commit
git checkout ${PRETRAIN_APEX_COMMIT}
git submodule update --init --recursive


python -m pip install \
    -v \
    --no-cache-dir \
    --no-build-isolation \
    --config-settings "--build-option=--cpp_ext" \
    --config-settings "--build-option=--cuda_ext" \
    ./
popd # apex

popd # ${TARGET_DIR}/src
deactivate
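
As an optional, hedged post-install check (not part of this commit), importing one of Apex's compiled modules from the venv confirms that the --cpp_ext/--cuda_ext build actually produced the extensions:

# Hypothetical check; amp_C and fused_layer_norm_cuda are only present when the CUDA extensions built.
source ${TARGET_DIR}/venv/bin/activate
python -c "import amp_C, fused_layer_norm_cuda; print('apex extensions OK')"
deactivate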
Lines changed: 11 additions & 0 deletions (src/install_flash_attention.sh)
@@ -0,0 +1,11 @@
# Installs flash attention.

echo "Installing Flash Attention ${PRETRAIN_FLASH_ATTENTION_VERSION}"
source ${TARGET_DIR}/venv/bin/activate

python -m pip install \
    --no-build-isolation \
    --no-cache-dir \
    "flash-attn==${PRETRAIN_FLASH_ATTENTION_VERSION}"

deactivate
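
An optional sanity check (an assumption, not part of this commit) that the pinned wheel is importable from the venv:

# Hypothetical check: print the installed flash-attn version.
source ${TARGET_DIR}/venv/bin/activate
python -c "import flash_attn; print(flash_attn.__version__)"
deactivate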
Lines changed: 30 additions & 0 deletions (src/install_megatron_lm.sh)
@@ -0,0 +1,30 @@
# Installs Megatron-LM.

echo "Installing Megatron-LM ${PRETRAIN_MEGATRON_TAG}"
source ${TARGET_DIR}/venv/bin/activate
pushd ${TARGET_DIR}/src

# Download our Megatron-LM fork and build the dataset helper library
git clone https://github.com/llm-jp/Megatron-LM -b ${PRETRAIN_MEGATRON_TAG}
pushd Megatron-LM
pushd megatron/core/datasets

# NOTE(odashi):
# The original Makefile in this directory uses the system's (or pyenv's) python3-config,
# but we need to invoke the python3-config installed in our target directory.
MEGATRON_HELPER_CPPFLAGS=(
    -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color
    $(python -m pybind11 --includes)
)
MEGATRON_HELPER_EXT=$(${TARGET_DIR}/python/bin/python3-config --extension-suffix)

# NOTE(odashi):
# Newer versions of Megatron-LM renamed the extension 'helpers' to 'helpers_cpp'.
#g++ ${MEGATRON_HELPER_CPPFLAGS[@]} helpers.cpp -o helpers_cpp${MEGATRON_HELPER_EXT}
g++ ${MEGATRON_HELPER_CPPFLAGS[@]} helpers.cpp -o helpers${MEGATRON_HELPER_EXT}

popd # megatron/core/datasets
popd # Megatron-LM

popd # ${TARGET_DIR}/src
deactivate
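
To confirm that the freshly compiled helpers extension loads with the venv's Python, a hedged check (not part of this commit) can be run from the Megatron-LM checkout, since the package is used in place rather than pip-installed:

# Hypothetical check; run from the repository root so 'megatron' resolves as a package.
source ${TARGET_DIR}/venv/bin/activate
pushd ${TARGET_DIR}/src/Megatron-LM
python -c "from megatron.core.datasets import helpers; print('helpers extension OK')"
popd # ${TARGET_DIR}/src/Megatron-LM
deactivate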
Lines changed: 17 additions & 0 deletions (src/install_python.sh)
@@ -0,0 +1,17 @@
# Script to install Python to TARGET_DIR
#
# This script will make the following directories:
# * ${TARGET_DIR}/src/cpython ... Python source tree
# * ${TARGET_DIR}/python ... installed Python binaries

echo "Installing Python ${PRETRAIN_PYTHON_VERSION}"
pushd ${TARGET_DIR}/src

git clone https://github.com/python/cpython -b v${PRETRAIN_PYTHON_VERSION}
pushd cpython
./configure --prefix="${TARGET_DIR}/python" --enable-optimizations
make -j 64
make install
popd # cpython

popd # ${TARGET_DIR}/src
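
A simple post-install check (an assumption, not part of this commit) that the interpreter landed in the expected prefix:

# Hypothetical check: the locally built interpreter should report the pinned version.
${TARGET_DIR}/python/bin/python3 --version    # expected: Python ${PRETRAIN_PYTHON_VERSION}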
Lines changed: 13 additions & 0 deletions (src/install_pytorch.sh)
@@ -0,0 +1,13 @@
# Installs PyTorch and torchvision.

echo "Installing torch ${PRETRAIN_TORCH_VERSION}+cu${PRETRAIN_CUDA_VERSION_SHORT} and torchvision ${PRETRAIN_TORCHVISION_VERSION}+cu${PRETRAIN_CUDA_VERSION_SHORT}"

source ${TARGET_DIR}/venv/bin/activate

python -m pip install \
    --no-cache-dir \
    torch==${PRETRAIN_TORCH_VERSION} \
    torchvision==${PRETRAIN_TORCHVISION_VERSION} \
    --index-url https://download.pytorch.org/whl/cu${PRETRAIN_CUDA_VERSION_SHORT}

deactivate
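
An optional check (not part of this commit) that the CUDA-enabled wheels were picked up from the cu${PRETRAIN_CUDA_VERSION_SHORT} index rather than the CPU-only default:

# Hypothetical check: torch should report the matching CUDA build.
source ${TARGET_DIR}/venv/bin/activate
python -c "import torch; print(torch.__version__, torch.version.cuda)"
deactivate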
Lines changed: 9 additions & 0 deletions (src/install_requirements.sh)
@@ -0,0 +1,9 @@
# Installs prerequisite packages

echo "Installing requirements"

source ${TARGET_DIR}/venv/bin/activate

python -m pip install --no-cache-dir -U -r ${SCRIPT_DIR}/src/requirements.txt

deactivate
Lines changed: 10 additions & 0 deletions (src/install_tokenizer.sh)
@@ -0,0 +1,10 @@
# Installs the LLM-jp Tokenizer.

echo "Installing LLM-jp Tokenizer ${PRETRAIN_TOKENIZER_TAG}"
pushd ${TARGET_DIR}/src

# Download our tokenizer
# (Only the repository checkout is needed; nothing is pip-installed here.)
git clone https://github.com/llm-jp/llm-jp-tokenizer -b ${PRETRAIN_TOKENIZER_TAG}

popd # ${TARGET_DIR}/src
Lines changed: 12 additions & 0 deletions (src/install_transformer_engine.sh)
@@ -0,0 +1,12 @@
# Installs Transformer Engine.

echo "Installing Transformer Engine ${PRETRAIN_TRANSFORMER_ENGINE_VERSION}"
source ${TARGET_DIR}/venv/bin/activate

# Install Transformer Engine
# NOTE(odashi):
# This implicitly installs flash-attn at the version recommended by Transformer Engine.
# If the auto-installed flash-attn causes problems, we need to reinstall it.
python -m pip install --no-build-isolation --no-cache-dir "transformer_engine[pytorch]==${PRETRAIN_TRANSFORMER_ENGINE_VERSION}"

deactivate
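
A hedged import check (not part of this commit); importing the PyTorch bindings may require a GPU-visible environment, so it is best run inside a compute job rather than on a login node:

# Hypothetical check: importing the bindings exercises the compiled Transformer Engine libraries.
source ${TARGET_DIR}/venv/bin/activate
python -c "import transformer_engine.pytorch as te; print('transformer_engine OK')"
deactivate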
Lines changed: 15 additions & 0 deletions (src/install_venv.sh)
@@ -0,0 +1,15 @@
# Script to set up a Python venv in TARGET_DIR
#
# This script will make the following directories:
# * ${TARGET_DIR}/venv ... venv created from the Python installed above

echo "Setup venv"
pushd ${TARGET_DIR}

python/bin/python3 -m venv venv

source venv/bin/activate
python -m pip install --no-cache-dir -U pip setuptools wheel
deactivate

popd # ${TARGET_DIR}
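
A small check (an assumption, not part of this commit) that the venv really inherits the locally built interpreter rather than a system Python:

# Hypothetical check: the venv's python should resolve back to ${TARGET_DIR}/python.
${TARGET_DIR}/venv/bin/python --version                                # expected: Python ${PRETRAIN_PYTHON_VERSION}
${TARGET_DIR}/venv/bin/python -c "import sys; print(sys.base_prefix)"  # expected: ${TARGET_DIR}/python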
Lines changed: 14 additions & 0 deletions (src/requirements.txt)
@@ -0,0 +1,14 @@
accelerate==1.0.1
cmake==3.30.1
einops==0.8.0
ninja==1.11.1.1
numpy==1.26.3
packaging==24.1
pybind11==2.13.6
regex==2024.9.11
safetensors==0.4.5
sentencepiece==0.2.0
six==1.16.0
transformers==4.46.0
wandb==0.18.5
wheel==0.44.0
