Pulled useful changes from the benchmark branch. GPUDirect RDMA (unpinned) is now the default for MPI communication.

2020-07-29 16:39:24 +03:00
parent 6cab3586cf
commit 003c202e8c
4 changed files with 238 additions and 22 deletions
--- a/samples/genbenchmarkscripts/main.c
+++ b/samples/genbenchmarkscripts/main.c
@@ -21,6 +21,7 @@ main(void)
        fprintf(fp, "#SBATCH --time=00:14:59\n");
        fprintf(fp, "#SBATCH --mem=32000\n");
        fprintf(fp, "#SBATCH --partition=gpu\n");
+        fprintf(fp, "#SBATCH --cpus-per-task=10\n");

        // nprocs, nodes, gpus
        const int max_gpus_per_node = 4;
@@ -29,22 +30,30 @@ main(void)
        fprintf(fp, "#SBATCH --gres=gpu:v100:%d\n", gpus_per_node);
        fprintf(fp, "#SBATCH -n %d\n", nprocs);
        fprintf(fp, "#SBATCH -N %d\n", nodes);
-        fprintf(fp, "#SBATCH --exclusive\n");
+        //fprintf(fp, "#SBATCH --exclusive\n");
+        if (nprocs > 4)
+            fprintf(fp, "#SBATCH --ntasks-per-socket=2\n");

        // Modules
-        fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake hpcx-mpi/2.5.0-cuda nccl\n");
+        // OpenMPI
+        fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake openmpi nccl\n");
+        // HPCX
+        //fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake hpcx-mpi/2.5.0-cuda nccl\n");
        fprintf(fp, "export UCX_MEMTYPE_CACHE=n\n");

        // Profile and run
-        fprintf(fp, "mkdir -p profile_%d\n", nprocs);
+        //fprintf(fp, "mkdir -p profile_%d\n", nprocs);

        const int nx = 256; // max size 1792;
        const int ny = nx;
        const int nz = nx;
+        /*
        fprintf(fp,
                //"srun nvprof --annotate-mpi openmpi -o profile_%d/%%p.nvprof ./benchmark %d %d "
                //"%d\n",
                "srun ./benchmark %d %d %d\n", nx, ny, nz);
+        */
+        fprintf(fp, "srun ./benchmark %d %d %d\n", nx, ny, nz);

        fclose(fp);
    }