From d0ce24ac9783c2675d0370198cf12558dd6b7c65 Mon Sep 17 00:00:00 2001 From: Matt Archer Date: Wed, 23 Jul 2025 04:20:55 -0600 Subject: [PATCH 1/7] Add derecho-gpu machine config --- machines/config_batch.xml | 15 ++ machines/config_compilers.xml | 423 ++++++++++++++++++++++++++++++++++ machines/config_machines.xml | 102 ++++++++ 3 files changed, 540 insertions(+) create mode 100644 machines/config_compilers.xml diff --git a/machines/config_batch.xml b/machines/config_batch.xml index cb56c334..f35d2838 100644 --- a/machines/config_batch.xml +++ b/machines/config_batch.xml @@ -374,6 +374,21 @@ + + qsub + + -l job_priority="$JOB_PRIORITY" + + + -S {{ shell }} + -l select={{ num_nodes }}:ncpus={{ max_tasks_per_node }}:mpiprocs={{ tasks_per_node }}:ompthreads={{ thread_count }}:ngpus=4 + + + main + develop + + + sbatch diff --git a/machines/config_compilers.xml b/machines/config_compilers.xml new file mode 100644 index 00000000..6f302100 --- /dev/null +++ b/machines/config_compilers.xml @@ -0,0 +1,423 @@ + + + -D_USE_FLOW_CONTROL + + FALSE + + + + + -h noomp + -g -O0 + -O2 + + + + -DFORTRANUNDERSCORE -DNO_R16 -DCPRCRAY + -DDIR=NOOP + -DDIR=NOOP + + + -s real64 + + + -f free -N 255 -h byteswapio -x dir + -h noomp + -g -O0 -K trap=fp -m1 + -O2,ipa2 -em + + + -O1,fp2,ipa0,scalar0,vector0 + + TRUE + + -Wl,--allow-multiple-definition -h byteswapio + + + + + + -std=gnu99 + -fopenmp + -g -Wall -Og -fbacktrace -ffpe-trap=invalid,zero,overflow -fcheck=bounds + -O + + + + -DFORTRANUNDERSCORE -DNO_R16 -DCPRGNU + + FORTRAN + + -fdefault-real-8 + + + + -fconvert=big-endian -ffree-line-length-none -ffixed-line-length-none + -fopenmp + + -g -Wall -Og -fbacktrace -ffpe-trap=zero,overflow -fcheck=bounds + -O + + + -O0 + + + -ffixed-form + + + -ffree-form + + FALSE + + -fopenmp + + mpicc + mpicxx + mpif90 + gcc + g++ + gfortran + TRUE + + + + + -g -qfullpath -qmaxmem=-1 + -O3 + -qsmp=omp + -qsmp=omp:noopt + + + + -DFORTRAN_SAME -DCPRIBM + + -WF,-D + + -qrealsize=8 + + + -g -qfullpath -qmaxmem=-1 + -O2 -qstrict -qinline=auto + -qsmp=omp + -qinitauto=7FF7FFFF -qflttrap=ov:zero:inv:en + -qsmp=omp:noopt + -C + + + -qsuffix=f=f -qfixed=132 + + + -qsuffix=f=f90:cpp=F90 + + TRUE + + -qsmp=omp + -qsmp=omp:noopt + + + + + + -qno-opt-dynamic-align -fp-model precise -std=gnu99 + -qopenmp + -O2 -debug minimal + -O0 -g + + + + -DFORTRANUNDERSCORE -DCPRINTEL + + + -cxxlib + + FORTRAN + + -r8 + + + -qno-opt-dynamic-align -convert big_endian -assume byterecl -ftz -traceback -assume realloc_lhs -fp-model source + -qopenmp + -O0 -g -check uninit -check bounds -check pointers -fpe0 -check noarg_temp_created + -O2 -debug minimal + + + -O0 + -qopenmp + + + -fixed -132 + + + -free + + + -qopenmp + + mpicc + mpicxx + mpif90 + icc + icpc + ifort + + -mkl=cluster + -mkl=cluster + -mkl=cluster + -mkl=cluster + -mkl=cluster + -mkl=cluster + -mkl=cluster + -mkl + + TRUE + + + + + -std=gnu99 + -g + + + -DFORTRANUNDERSCORE -DNO_CRAY_POINTERS -DNO_SHR_VMATH -DCPRNAG + + + -r8 + + + + + -Wp,-macro=no_com -convert=BIG_ENDIAN -indirect $ENV{CIMEROOT}/config/cesm/machines/nag_mpi_argument.txt + + -ieee=full -O2 + + + -C=all -g -time -f2003 -ieee=stop + -gline + + -mismatch_all + + + -Wp,-macro=no_com -convert=BIG_ENDIAN -indirect $ENV{CIMEROOT}/config/cesm/machines/nag_mpi_argument.txt + -ieee=full + + + + -g -time -f2003 -ieee=stop + -gline + + + -fixed + + + -free + + FALSE + mpicc + mpif90 + gcc + nagfor + + + + + -gopt -time + -mp + + + + + + + + + + + + + + + + + + + + + + + + + + + + -DFORTRANUNDERSCORE -DNO_SHR_VMATH -DNO_R16 -DCPRPGI + + CXX + + -r8 + + + -i4 -gopt -time -Mextend -byteswapio -Mflushz -Kieee + -mp + -O0 -g -Ktrap=fp -Mbounds -Kieee + -Mnovect + -Mnovect + -Mnovect + -Mnovect + -Mnovect + -Mnovect + + + -O0 -g -Ktrap=fp -Mbounds -Kieee + -mp + + + -Mfixed + + + -Mfree + + + + FALSE + + -time -Wl,--allow-multiple-definition + -mp + + mpicc + mpicxx + mpif90 + pgcc + pgc++ + pgf95 + + + + + -qarch=auto -qtune=auto -qcache=auto + + /usr/bin/bash + + -qarch=auto -qtune=auto -qcache=auto -qsclk=micro + -qspill=6000 + + + -qsigtrap=xl__trcedump + -bdatapsize:64K -bstackpsize:64K -btextpsize:32K + + mpcc_r + mpxlf2003_r + cc_r + xlf2003_r + + -lmassv -lessl + -lmass + + + + + + --build=powerpc-bgp-linux --host=powerpc64-suse-linux + + + -DLINUX + + + -g -qfullpath -qmaxmem=-1 -qspillsize=2500 -qextname=flush + -O3 -qstrict -qinline=auto + -qsmp=omp + -qsmp=omp:noopt + + + -Wl,--relax -Wl,--allow-multiple-definition + + + + + + -DCMAKE_SYSTEM_NAME=Catamount + + + -DLINUX + -DHAVE_NANOTIME -DBIT64 -DHAVE_VPRINTF -DHAVE_BACKTRACE -DHAVE_SLASHPROC -DHAVE_COMM_F2C -DHAVE_TIMES -DHAVE_GETTIMEOFDAY + + cc + CC + ftn + $ENV{NETCDF_DIR} + lustre + $ENV{PARALLEL_NETCDF_DIR} + cc + CC + ftn + + + + + + + /glade/u/apps/derecho/23.06/spack/opt/spack/netcdf/4.9.2/cray-mpich/8.1.25/oneapi/2023.0.0/wzol + /glade/u/apps/derecho/23.06/spack/opt/spack/parallel-netcdf/1.12.3/cray-mpich/8.1.25/oneapi/2023.0.0/blyr + + -lnetcdff -lnetcdf + + + + + + -qno-opt-dynamic-align -fp-model precise -std=gnu99 + -std=gnu89 + -march=core-avx2 -no-fma + + + -march=core-avx2 -no-fma + + + + + + diff --git a/machines/config_machines.xml b/machines/config_machines.xml index fa3a4492..d4904157 100644 --- a/machines/config_machines.xml +++ b/machines/config_machines.xml @@ -1255,6 +1255,108 @@ This allows using a different mpirun command to launch unit tests 64M + + + NCAR AMD EPYC + de.*.hpc.ucar.edu + CNL + intel + + mpich + $ENV{SCRATCH} + $ENV{CESMDATAROOT}/inputdata + $ENV{CESMDATAROOT}/inputdata/atm/datm7 + $CIME_OUTPUT_ROOT/archive/$CASE + $ENV{CESMDATAROOT}/cesm_baselines + $ENV{CESMDATAROOT}/cprnc/cprnc + 16 + pbs + cseg + 64 + 64 + TRUE + + mpiexec + + --label + --line-buffer + -n {{ total_tasks }} + + + + $LMOD_ROOT/lmod/init/perl + $LMOD_ROOT/lmod/init/env_modules_python.py + $LMOD_ROOT/lmod/init/sh + $LMOD_ROOT/lmod/init/csh + $LMOD_ROOT/lmod/libexec/lmod perl + $LMOD_ROOT/lmod/libexec/lmod python + module + module + + cesmdev/1.0 + ncarenv/23.09 + + craype + conda/latest + + + intel/2023.2.1 + mkl + spherepack/3.2 + + + intel-oneapi/2023.2.1 + mkl + + + intel-classic/2023.2.1 + mkl + + + cce/15.0.1 + cray-libsci/23.02.1.1 + + + gcc/12.2.0 + cray-libsci/23.02.1.1 + + + nvhpc/23.7 + + + ncarcompilers/1.0.0 + cmake + + + cray-mpich/8.1.27 + + + mpi-serial/2.3.0 + + + + netcdf/4.9.2 + + + + netcdf-mpi/4.9.2 + parallel-netcdf/1.12.3 + + + + esmf/8.6.0 + + + + + + 64M + hybrid + memhooks + $ENV{NCAR_ROOT_SPHEREPACK}/lib + + + NCAR AMD EPYC system From 83bb1a353b02c8da35c3923ddbac404e7162413b Mon Sep 17 00:00:00 2001 From: Matt Archer Date: Wed, 23 Jul 2025 05:03:33 -0600 Subject: [PATCH 2/7] Remove comment --- machines/config_compilers.xml | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/machines/config_compilers.xml b/machines/config_compilers.xml index 6f302100..a7612f31 100644 --- a/machines/config_compilers.xml +++ b/machines/config_compilers.xml @@ -401,23 +401,3 @@ - - - From 74ec5dc008cea51eb48dd1d7f67edcc09bc438f8 Mon Sep 17 00:00:00 2001 From: Matt Archer Date: Tue, 29 Jul 2025 09:31:53 -0600 Subject: [PATCH 3/7] Remove config_compilers.xml and add derecho-gpu.cmake --- machines/cmake_macros/derecho-gpu.cmake | 11 + machines/config_compilers.xml | 403 ------------------------ 2 files changed, 11 insertions(+), 403 deletions(-) create mode 100644 machines/cmake_macros/derecho-gpu.cmake delete mode 100644 machines/config_compilers.xml diff --git a/machines/cmake_macros/derecho-gpu.cmake b/machines/cmake_macros/derecho-gpu.cmake new file mode 100644 index 00000000..2772a168 --- /dev/null +++ b/machines/cmake_macros/derecho-gpu.cmake @@ -0,0 +1,11 @@ +if (COMP_NAME STREQUAL gptl) + string(APPEND CPPDEFS " -DHAVE_NANOTIME -DBIT64 -DHAVE_VPRINTF -DHAVE_BACKTRACE -DHAVE_SLASHPROC -DHAVE_COMM_F2C -DHAVE_TIMES -DHAVE_GETTIMEOFDAY") +endif() +set(NETCDF_PATH "$ENV{NETCDF}") +set(PIO_FILESYSTEM_HINTS "lustre") +set(PNETCDF_PATH "$ENV{PNETCDF}") +# If we want to use cray-libsci instead of mkl uncomment this line as well as the module in config_machines.xml +string(REPLACE "-mkl=cluster" "" SLIBS "${SLIBS}") +#string(REPLACE "-mkl=cluster" "-qmkl=cluster" SLIBS "${SLIBS}") +#string(APPEND CPPDEFS " -DNO_SHR_VMATH ") +string(APPEND CPPDEFS " -DHAVE_GETTID") diff --git a/machines/config_compilers.xml b/machines/config_compilers.xml deleted file mode 100644 index a7612f31..00000000 --- a/machines/config_compilers.xml +++ /dev/null @@ -1,403 +0,0 @@ - - - -D_USE_FLOW_CONTROL - - FALSE - - - - - -h noomp - -g -O0 - -O2 - - - - -DFORTRANUNDERSCORE -DNO_R16 -DCPRCRAY - -DDIR=NOOP - -DDIR=NOOP - - - -s real64 - - - -f free -N 255 -h byteswapio -x dir - -h noomp - -g -O0 -K trap=fp -m1 - -O2,ipa2 -em - - - -O1,fp2,ipa0,scalar0,vector0 - - TRUE - - -Wl,--allow-multiple-definition -h byteswapio - - - - - - -std=gnu99 - -fopenmp - -g -Wall -Og -fbacktrace -ffpe-trap=invalid,zero,overflow -fcheck=bounds - -O - - - - -DFORTRANUNDERSCORE -DNO_R16 -DCPRGNU - - FORTRAN - - -fdefault-real-8 - - - - -fconvert=big-endian -ffree-line-length-none -ffixed-line-length-none - -fopenmp - - -g -Wall -Og -fbacktrace -ffpe-trap=zero,overflow -fcheck=bounds - -O - - - -O0 - - - -ffixed-form - - - -ffree-form - - FALSE - - -fopenmp - - mpicc - mpicxx - mpif90 - gcc - g++ - gfortran - TRUE - - - - - -g -qfullpath -qmaxmem=-1 - -O3 - -qsmp=omp - -qsmp=omp:noopt - - - - -DFORTRAN_SAME -DCPRIBM - - -WF,-D - - -qrealsize=8 - - - -g -qfullpath -qmaxmem=-1 - -O2 -qstrict -qinline=auto - -qsmp=omp - -qinitauto=7FF7FFFF -qflttrap=ov:zero:inv:en - -qsmp=omp:noopt - -C - - - -qsuffix=f=f -qfixed=132 - - - -qsuffix=f=f90:cpp=F90 - - TRUE - - -qsmp=omp - -qsmp=omp:noopt - - - - - - -qno-opt-dynamic-align -fp-model precise -std=gnu99 - -qopenmp - -O2 -debug minimal - -O0 -g - - - - -DFORTRANUNDERSCORE -DCPRINTEL - - - -cxxlib - - FORTRAN - - -r8 - - - -qno-opt-dynamic-align -convert big_endian -assume byterecl -ftz -traceback -assume realloc_lhs -fp-model source - -qopenmp - -O0 -g -check uninit -check bounds -check pointers -fpe0 -check noarg_temp_created - -O2 -debug minimal - - - -O0 - -qopenmp - - - -fixed -132 - - - -free - - - -qopenmp - - mpicc - mpicxx - mpif90 - icc - icpc - ifort - - -mkl=cluster - -mkl=cluster - -mkl=cluster - -mkl=cluster - -mkl=cluster - -mkl=cluster - -mkl=cluster - -mkl - - TRUE - - - - - -std=gnu99 - -g - - - -DFORTRANUNDERSCORE -DNO_CRAY_POINTERS -DNO_SHR_VMATH -DCPRNAG - - - -r8 - - - - - -Wp,-macro=no_com -convert=BIG_ENDIAN -indirect $ENV{CIMEROOT}/config/cesm/machines/nag_mpi_argument.txt - - -ieee=full -O2 - - - -C=all -g -time -f2003 -ieee=stop - -gline - - -mismatch_all - - - -Wp,-macro=no_com -convert=BIG_ENDIAN -indirect $ENV{CIMEROOT}/config/cesm/machines/nag_mpi_argument.txt - -ieee=full - - - - -g -time -f2003 -ieee=stop - -gline - - - -fixed - - - -free - - FALSE - mpicc - mpif90 - gcc - nagfor - - - - - -gopt -time - -mp - - - - - - - - - - - - - - - - - - - - - - - - - - - - -DFORTRANUNDERSCORE -DNO_SHR_VMATH -DNO_R16 -DCPRPGI - - CXX - - -r8 - - - -i4 -gopt -time -Mextend -byteswapio -Mflushz -Kieee - -mp - -O0 -g -Ktrap=fp -Mbounds -Kieee - -Mnovect - -Mnovect - -Mnovect - -Mnovect - -Mnovect - -Mnovect - - - -O0 -g -Ktrap=fp -Mbounds -Kieee - -mp - - - -Mfixed - - - -Mfree - - - - FALSE - - -time -Wl,--allow-multiple-definition - -mp - - mpicc - mpicxx - mpif90 - pgcc - pgc++ - pgf95 - - - - - -qarch=auto -qtune=auto -qcache=auto - - /usr/bin/bash - - -qarch=auto -qtune=auto -qcache=auto -qsclk=micro - -qspill=6000 - - - -qsigtrap=xl__trcedump - -bdatapsize:64K -bstackpsize:64K -btextpsize:32K - - mpcc_r - mpxlf2003_r - cc_r - xlf2003_r - - -lmassv -lessl - -lmass - - - - - - --build=powerpc-bgp-linux --host=powerpc64-suse-linux - - - -DLINUX - - - -g -qfullpath -qmaxmem=-1 -qspillsize=2500 -qextname=flush - -O3 -qstrict -qinline=auto - -qsmp=omp - -qsmp=omp:noopt - - - -Wl,--relax -Wl,--allow-multiple-definition - - - - - - -DCMAKE_SYSTEM_NAME=Catamount - - - -DLINUX - -DHAVE_NANOTIME -DBIT64 -DHAVE_VPRINTF -DHAVE_BACKTRACE -DHAVE_SLASHPROC -DHAVE_COMM_F2C -DHAVE_TIMES -DHAVE_GETTIMEOFDAY - - cc - CC - ftn - $ENV{NETCDF_DIR} - lustre - $ENV{PARALLEL_NETCDF_DIR} - cc - CC - ftn - - - - - - - /glade/u/apps/derecho/23.06/spack/opt/spack/netcdf/4.9.2/cray-mpich/8.1.25/oneapi/2023.0.0/wzol - /glade/u/apps/derecho/23.06/spack/opt/spack/parallel-netcdf/1.12.3/cray-mpich/8.1.25/oneapi/2023.0.0/blyr - - -lnetcdff -lnetcdf - - - - - - -qno-opt-dynamic-align -fp-model precise -std=gnu99 - -std=gnu89 - -march=core-avx2 -no-fma - - - -march=core-avx2 -no-fma - - - From b6498274d2c08d72eaaa7674c352e01db631781a Mon Sep 17 00:00:00 2001 From: Matt Archer Date: Tue, 29 Jul 2025 10:38:46 -0600 Subject: [PATCH 4/7] Set some sensible memory defaults for derecho-gpu --- machines/config_batch.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machines/config_batch.xml b/machines/config_batch.xml index f35d2838..70b5ec8f 100644 --- a/machines/config_batch.xml +++ b/machines/config_batch.xml @@ -381,7 +381,7 @@ -S {{ shell }} - -l select={{ num_nodes }}:ncpus={{ max_tasks_per_node }}:mpiprocs={{ tasks_per_node }}:ompthreads={{ thread_count }}:ngpus=4 + -l select={{ num_nodes }}:ncpus={{ max_tasks_per_node }}:mpiprocs={{ tasks_per_node }}:ompthreads={{ thread_count }}:mem=480GB:ngpus={{ ngpus_per_node }} main From 5d54f6f1356100eaa0e7b73c28b31f682c52161c Mon Sep 17 00:00:00 2001 From: Matt Archer Date: Tue, 19 Aug 2025 03:55:44 -0600 Subject: [PATCH 5/7] Fix formatting and add sensible defaults --- machines/config_machines.xml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/machines/config_machines.xml b/machines/config_machines.xml index d4904157..cfcd4ab2 100644 --- a/machines/config_machines.xml +++ b/machines/config_machines.xml @@ -1272,15 +1272,15 @@ This allows using a different mpirun command to launch unit tests 16 pbs cseg - 64 - 64 + 128 + 128 TRUE mpiexec - --label - --line-buffer - -n {{ total_tasks }} + --label + --line-buffer + -n {{ total_tasks }} From 41b32c325c61cb077e55a9d7e5e944a64a729d12 Mon Sep 17 00:00:00 2001 From: Matt Archer Date: Tue, 19 Aug 2025 04:01:22 -0600 Subject: [PATCH 6/7] Change back to 64 tasks --- machines/config_machines.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/machines/config_machines.xml b/machines/config_machines.xml index cfcd4ab2..c7bd1337 100644 --- a/machines/config_machines.xml +++ b/machines/config_machines.xml @@ -1272,8 +1272,8 @@ This allows using a different mpirun command to launch unit tests 16 pbs cseg - 128 - 128 + 64 + 64 TRUE mpiexec From d6d079af1f05cf9943f79f7e788944b082daf86a Mon Sep 17 00:00:00 2001 From: Matt Archer Date: Tue, 19 Aug 2025 04:02:10 -0600 Subject: [PATCH 7/7] Hardcode ngpus to 4 --- machines/config_batch.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machines/config_batch.xml b/machines/config_batch.xml index 70b5ec8f..d6789f10 100644 --- a/machines/config_batch.xml +++ b/machines/config_batch.xml @@ -381,7 +381,7 @@ -S {{ shell }} - -l select={{ num_nodes }}:ncpus={{ max_tasks_per_node }}:mpiprocs={{ tasks_per_node }}:ompthreads={{ thread_count }}:mem=480GB:ngpus={{ ngpus_per_node }} + -l select={{ num_nodes }}:ncpus={{ max_tasks_per_node }}:mpiprocs={{ tasks_per_node }}:ompthreads={{ thread_count }}:mem=480GB:ngpus=4 main