Add template scripts for training on Compute Canada #55

Open · wants to merge 2 commits into main
training_scripts/job_compute_canada.sh (new file, +27 lines)
#!/bin/bash
#SBATCH --account=def-jcohen
#SBATCH --job-name=job1 # set a more descriptive job-name
#SBATCH --nodes=1
#SBATCH --gpus-per-node=1
#SBATCH --cpus-per-task=12
#SBATCH --mem=32G
#SBATCH --time=1-06:00:00 # DD-HH:MM:SS
#SBATCH --output=/home/<your-user-name>/code/nnunet-v2/jobs/sciseg-v2/region-based/outs/%x_%A_v2.out
#SBATCH --error=/home/<your-user-name>/code/nnunet-v2/jobs/sciseg-v2/region-based/errs/%x_%A_v2.err
#SBATCH --mail-user=<your-email-id> # an email will be sent when the job begins and ends
#SBATCH --mail-type=begin,end
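
# NOTE: SLURM will not create the folders passed to --output/--error; make sure
# they exist before submitting, e.g. (path copied from the directives above):
#   mkdir -p /home/<your-user-name>/code/nnunet-v2/jobs/sciseg-v2/region-based/{outs,errs}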

# Echo time and hostname into log
echo "Date: $(date)"
echo "Hostname: $(hostname)"

# load the required modules
echo "Loading modules ..."
module load python/3.10.13 cuda/12.2 # TODO: adjust to the Python and CUDA versions available on your cluster

# activate environment
echo "Activating environment ..."
source /home/$(whoami)/envs/venv_nnunet/bin/activate # TODO: update to match the name of your environment

# Run the model
bash <path/to/run_nnunet_compute_canada.sh> # TODO: point this to the training script below
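
To use these templates, submit the job script with sbatch from a login node and monitor it from there. A minimal usage sketch, assuming the repository is cloned under your home directory:

    sbatch training_scripts/job_compute_canada.sh    # prints the assigned job ID
    squeue -u $(whoami)                              # check the job's queue status
    sacct -j <job-id> --format=JobID,State,Elapsed   # accounting info once the job has started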
training_scripts/run_nnunet_compute_canada.sh (new file, +89 lines)
#!/bin/bash
# Training nnUNet on a dataset

# define arguments for nnUNet
dataset_num="XXX"
dataset_name="Dataset${dataset_num}_<your-dataset-name>"
# nnunet_trainer="nnUNetTrainerDiceCELoss_noSmooth" # default: nnUNetTrainer or nnUNetTrainer_2000epochs
nnunet_trainer="nnUNetTrainerDA5_DiceCELoss_noSmooth" # custom trainer
# nnunet_trainer="nnUNetTrainer_5epochs" # options: nnUNetTrainer_1epoch, nnUNetTrainer_5epochs

# model_type="M" # options: "M" or "L" or "XL"
# nnunet_planner="nnUNetPlannerResEnc${model_type}" # default: nnUNetPlannerResEncM/L
nnunet_plans_file="nnUNetPlans"

# configurations=("3d_fullres" "2d") # for 2D training, use "2d"
configurations=("3d_fullres") # for 2D training, use "2d"
cuda_visible_devices=0
fold=1
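
# NOTE (suggested usage, not part of the original template): nnUNet's default
# cross-validation uses folds 0-4. To train all folds sequentially, the single
# fold above could be replaced by a loop, e.g.:
#   for fold in 0 1 2 3 4; do
#       CUDA_VISIBLE_DEVICES=${cuda_visible_devices} nnUNetv2_train ${dataset_num} 3d_fullres $fold -tr ${nnunet_trainer} -p ${nnunet_plans_file}
#   done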

# define final variables where the data will be copied
# NOTE: this assumes that the following folders exist at the defined paths -- update to match your folder structure
final_prepro_dir="/home/$(whoami)/projects/def-jcohen/$(whoami)/datasets/nnUNet_preprocessed"
final_results_dir="/home/$(whoami)/code/nnunet-v2/nnUNet_results"

# print the job-local scratch directory for reference
echo $SLURM_TMPDIR

# NOTE: Compute Canada recommends moving data to $SLURM_TMPDIR because this node-local folder has
# fast read/write speeds, which makes data loading faster. Hence, we copy the datasets to $SLURM_TMPDIR before starting the training
echo "-------------------------------------------"
echo "Moving the dataset to SLURM_TMPDIR: ${SLURM_TMPDIR}"
echo "-------------------------------------------"

# create the raw-data folder in SLURM_TMPDIR and copy the dataset into it
if [[ ! -d $SLURM_TMPDIR/nnUNet_raw ]]; then
    mkdir $SLURM_TMPDIR/nnUNet_raw

    # copy the dataset to SLURM_TMPDIR
    cp -r /home/$(whoami)/projects/rrg-bengioy-ad/$(whoami)/datasets/nnUNet_raw/${dataset_name} ${SLURM_TMPDIR}/nnUNet_raw
fi

# create folders in SLURM_TMPDIR
mkdir $SLURM_TMPDIR/nnUNet_preprocessed
mkdir $SLURM_TMPDIR/nnUNet_results

# temporarily export the nnUNet environment variables (to make nnUNet happy)
export nnUNet_raw=$SLURM_TMPDIR/nnUNet_raw
export nnUNet_preprocessed=$SLURM_TMPDIR/nnUNet_preprocessed
export nnUNet_results=$SLURM_TMPDIR/nnUNet_results
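
# Optional sanity check (a suggested addition, not required by nnUNet): fail fast
# if any of the exported nnUNet directories is missing before training starts
for dir in "$nnUNet_raw" "$nnUNet_preprocessed" "$nnUNet_results"; do
    [[ -d $dir ]] || { echo "Missing nnUNet directory: $dir"; exit 1; }
done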


echo "-------------------------------------------------------"
echo "Running preprocessing and verifying dataset integrity"
echo "-------------------------------------------------------"
nnUNetv2_plan_and_preprocess -d ${dataset_num} -c "${configurations[@]}" --verify_dataset_integrity


for configuration in "${configurations[@]}"; do
    echo "-------------------------------------------"
    echo "Training on Fold $fold, Configuration $configuration"
    echo "-------------------------------------------"

    # training
    CUDA_VISIBLE_DEVICES=${cuda_visible_devices} nnUNetv2_train ${dataset_num} $configuration $fold -tr ${nnunet_trainer} -p ${nnunet_plans_file}

    echo ""
    echo "-------------------------------------------"
    echo "Training completed, Testing on Fold $fold"
    echo "-------------------------------------------"

    # run inference on the test set
    CUDA_VISIBLE_DEVICES=${cuda_visible_devices} nnUNetv2_predict -i ${nnUNet_raw}/${dataset_name}/imagesTs -tr ${nnunet_trainer} -p ${nnunet_plans_file} -o ${nnUNet_results}/${dataset_name}/${nnunet_trainer}__${nnunet_plans_file}__${configuration}/fold_${fold}/test -d ${dataset_num} -f $fold -c ${configuration} # -step_size 0.9 --disable_tta

done

echo ""
echo "--------------------------------------------------------------------------------------------"
echo "Testing done, Moving the results/preprocessed data from ${SLURM_TMPDIR} to the home directory"
echo "-----------------------------------------------------------------------------------------"


# copy the preprocessed data back to the home directory
cp -r ${SLURM_TMPDIR}/nnUNet_preprocessed/${dataset_name} ${final_prepro_dir}

# copy the results back to the home directory
cp -r ${SLURM_TMPDIR}/nnUNet_results/${dataset_name} ${final_results_dir}

echo "-------------------"
echo "Job Done!"
echo "-------------------"