#!/bin/bash

## Batch Job Parameters
#SBATCH -p PPPPPPP
##SBATCH -t 03:59:59              # run time 03hr59min59sec
#SBATCH --nodes=400               # 400*56=22400 cores
##SBATCH --mem=16G                # memory per node
##SBATCH --gpus-per-node=2        # GPUs per node
##SBATCH --gres=gpu:volta:2       # see https://slurm.schedmd.com/gres.html
##########SBATCH --gres=gpu:2
##SBATCH --gres=gpu:0             # GPUs per node @T3
##SBATCH --gres=gpu:1             # GPUs per node @T3
##SBATCH --gres=gpu::2
##SBATCH --ntasks=32              # number of parallel processes (tasks)
#SBATCH --ntasks-per-node=56      # tasks to run per node, Round Robin
##SBATCH --cpus-per-task=8
##SBATCH --mem-per-cpu=40
#SBATCH --account=XXXXXXXXX
#SBATCH -o %j.out
#SBATCH -e %j.err

module purge

NWINPUT=carot_ground_small_22400

# source /pkg/nwchem/7.0.0/etc/default.nwchemrc
# source /pkg/nwchem/setnwchem.Casper

export ARMCI_NETWORK=OPENIB
#export MSG_COMMS=MPI

#source /work/$USER/t3/nwchem/7.0.2/setnwchem.${ARMCI_NETWORK}.i20cuda10.2   # confirmed to work
source /work/$USER/t3/nwchem/7.0.2/setnwchem.${ARMCI_NETWORK}.i21            # confirmed to work
#source /work/$USER/t3/nwchem/7.0.2/setnwchem.${ARMCI_NETWORK}.i20-nocuda    # compiled without CUDA support
#source /work/$USER/t3/nwchem/7.0.2/setnwchem.${ARMCI_NETWORK}.i20-cpuonly   # compiled without CUDA support, libibumad

# MRCC.hu
#source /opt/ohpc/twcc/intel/2020/update1/compilers_and_libraries_2020.1.217/linux/mpi/intel64/bin/mpivars.sh release_mt -ofi_internal=0

#module load nvidia/cuda/11.0    # not necessary
####source /work/$USER/t3/nwchem/7.0.2/setnwchem.${ARMCI_NETWORK}.i18cuda11.0   # Malfunction confirmed!
#source /work/$USER/t3/nwchem/7.0.2/setnwchem.${ARMCI_NETWORK}

#export ARMCI_DEFAULT_SHMMAX=131072
#export ARMCI_DEFAULT_SHMMAX=32768
#export ARMCI_DEFAULT_SHMMAX=65536
#export ARMCI_DEFAULT_SHMMAX=4096
export ARMCI_DEFAULT_SHMMAX=8192
export ARMCI_DEFAULT_SHMMAX_UBOUND=131072

#export I_MPI_DEBUG=5

# HPC settings for very large runs, especially > 4000 cores
export I_MPI_HYDRA_PMI_CONNECT=cache
export I_MPI_HYDRA_BRANCH_COUNT=-1
export USE_NOFSCHECK=1
export USE_NOIO=1

#source /work/$USER/t3/nwchem/7.0.2/setnwchem.${ARMCI_NETWORK}.i20cuda11.0_phase4
#
#module load intel/2020
#export LD_LIBRARY_PATH="$HOME/lib:$LD_LIBRARY_PATH"
#module load nvidia/cuda/11.0
#source /opt/ohpc/twcc/intel/2020/update1/compilers_and_libraries_2020.1.217/linux/mpi/intel64/bin/mpivars.sh release_mt intel64
#source /opt/ohpc/twcc/intel/2020/update1/compilers_and_libraries_2020.1.217/linux/mpi/intel64/bin/mpivars.sh debug_mt intel64

# For Casper, reference
#NODES=2
#PPN=8

## Must set this for PSM2 > 4000 cores using PBS Pro
#export I_MPI_HYDRA_PMI_CONNECT=cache
#export I_MPI_HYDRA_BRANCH_COUNT=-1
#export I_MPI_OFI_LIBRARY_INTERNAL=yes
#export I_MPI_PMI_LIBRARY=/usr/lib64/libpmi2.so
#export PSM2_MEMORY=large

# Fix UCX bug: mm_posix.c:195 UCX ERROR open(file_name=/proc/64773/fd/21 flags=0x0) failed: Permission denied
# See https://github.com/openucx/ucx/issues/5571
#export UCX_POSIX_USE_PROC_LINK=n

#env | grep I_MPI_
#env | grep UCX
#echo "===== ALL env VARIABLES ====="
#env
#echo "===== End of ALL env VARIABLES ====="

# Re-define SCRATCH_DIR
#export SCRATCH_DIR=/scratch/$USER/nwchem/nwchem.$SLURM_JOBID
export SCRATCH_DIR=/work/$USER/scratch/nwchem/nwchem.$SLURM_JOBID
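# --- Optional sanity check (a minimal sketch, not part of the original script) ---
# The request above assumes 400 nodes * 56 tasks/node = 22400 MPI ranks.
# This hedged, commented-out check compares Slurm's own counts and only warns on a
# mismatch; the hard-coded 56 mirrors --ntasks-per-node and must be kept in sync by hand.
#EXPECTED_TASKS=$(( SLURM_NNODES * 56 ))
#if [ "${SLURM_NTASKS:-0}" -ne "$EXPECTED_TASKS" ]; then
#    echo "WARNING: SLURM_NTASKS=${SLURM_NTASKS} differs from expected $EXPECTED_TASKS (nodes*56)"
#fi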
if [ ! -d "$SCRATCH_DIR" ]; then
    mkdir -p "$SCRATCH_DIR"
fi

echo "SLURM_JOBID="$SLURM_JOBID
echo "SLURM_JOB_NODELIST="$SLURM_JOB_NODELIST
echo "SLURM_NNODES="$SLURM_NNODES
echo "SLURM_TMPDIR="$SLURM_TMPDIR
#export LD_LIBRARY_PATH="$HOME/lib:$LD_LIBRARY_PATH"
echo "LD_LIBRARY_PATH="$LD_LIBRARY_PATH
echo "SLURM_NTASKS="$SLURM_NTASKS
echo "working directory = "$SLURM_SUBMIT_DIR

cd $SLURM_SUBMIT_DIR

echo "NWCHEM scratch directory="$SCRATCH_DIR
echo "The NWCHEM job begins at `date` "

# Choose -OFI/-IB for InfiniBand
export I_MPI_FABRICS=shm:ofi
#export I_MPI_FABRICS=ofi:shm
#export I_MPI_FABRIC=shm:ofi
export FI_PROVIDER=mlx
#export FI_LOG_LEVEL=debug
#export UCX_TLS=ud,sm,self
export UCX_TLS=rc,ud,sm,self
#export UCX_TLS=all

#echo "mpirun -OFI -n $(($NODES*$(($PPN)))) \
#echo "$I_MPI_ROOT/intel64/bin/mpiexec.hydra -bootstrap slurm -n $SLURM_NTASKS -OFI \
#      $NWCHEM_EXECUTABLE $NWINPUT.nw > $NWINPUT.out"
echo "The working dir. is $(pwd)"

# ARMCI_NETWORK=OPENIB
# Taiwania 2 uses ARMCI_NETWORK=OPENIB and mpirun -ofi
#mpirun -ofi -n $(($NODES*$(($PPN)))) \
# Taiwania 3 uses ARMCI_NETWORK=OPENIB and mpirun -ib
#$I_MPI_ROOT/intel64/bin/mpiexec.hydra -bootstrap slurm -OFI \
#$I_MPI_ROOT/intel64/bin/mpiexec.hydra -bootstrap slurm -n $SLURM_NTASKS -OFI \
#which mpiexec.hydra

#### Use mpirun
MPIRUN=$(which mpirun)
echo "MPIRUN="$MPIRUN
$MPIRUN -n $SLURM_NTASKS \
    $NWCHEM_EXECUTABLE $NWINPUT.nw > $NWINPUT.out

# Use mpiexec.hydra
#MPIRUN=$(which mpiexec.hydra)
#echo "MPIRUN="$MPIRUN
#$MPIRUN -OFI -bootstrap slurm -n $SLURM_NTASKS \
#    $NWCHEM_EXECUTABLE $NWINPUT.nw > $NWINPUT.out

#rm -rf $SCRATCH_DIR

echo "Your NWCHEM job completed at `date` "
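# --- Optional scratch cleanup (a hedged sketch, not enabled above) ---
# The original script leaves "rm -rf $SCRATCH_DIR" commented out so scratch files survive
# for debugging. One possible refinement, assuming the launcher's exit status is captured
# right after the mpirun line (e.g. a hypothetical NWCHEM_RC=$?), is to clean up only on
# success:
#if [ "${NWCHEM_RC:-1}" -eq 0 ]; then
#    rm -rf "$SCRATCH_DIR"
#fi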