From 7f7b0b89ea0b9b259dc6aa4d4a7a5de241eca334 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Wed, 19 Aug 2020 12:03:15 +0300
Subject: [PATCH] Fetched improvements to benchmarks from the
 mpi-paper-benchmarks branch

---
 samples/benchmark/main.cc          | 65 ------------------------------
 samples/genbenchmarkscripts/main.c | 54 ++++++++++++++++++++-----
 2 files changed, 44 insertions(+), 75 deletions(-)
diff --git a/samples/benchmark/main.cc b/samples/benchmark/main.cc
index 16a99df..733b064 100644
--- a/samples/benchmark/main.cc
+++ b/samples/benchmark/main.cc
@@ -149,30 +149,6 @@ main(int argc, char** argv)
         }
     }*/
 
-    /*
-    // Basic
-    const size_t num_iters = 100;
-
-    // Warmup
-    for (size_t i = 0; i < num_iters / 10; ++i)
-        acGridIntegrate(STREAM_DEFAULT, FLT_EPSILON);
-
-    // Benchmark
-    Timer t;
-    const AcReal dt = FLT_EPSILON;
-
-    acGridSynchronizeStream(STREAM_ALL);
-    timer_reset(&t);
-    acGridSynchronizeStream(STREAM_ALL);
-
-    for (size_t i = 0; i < num_iters; ++i)
-        acGridIntegrate(STREAM_DEFAULT, dt);
-
-    acGridSynchronizeStream(STREAM_ALL);
-    if (!pid)
-        timer_diff_print(t);
-    acGridSynchronizeStream(STREAM_ALL);
-    */
 
     // Percentiles
     const size_t num_iters      = 1000;
@@ -217,47 +193,6 @@ main(int argc, char** argv)
         fclose(fp);
     }
 
-    /*
-const size_t num_iters      = 1000;
-const double nth_percentile = 0.90;
-
-std::vector<double> results; // ms
-results.reserve(num_iters);
-
-for (size_t i = 0; i < num_iters; ++i) {
-    acGridSynchronizeStream(STREAM_ALL);
-    timer_reset(&t);
-    acGridSynchronizeStream(STREAM_ALL);
-    acGridIntegrate(STREAM_DEFAULT, dt);
-    acGridSynchronizeStream(STREAM_ALL);
-    results.push_back(timer_diff_nsec(t) / 1e6);
-}
-
-// Write benchmark to file
-if (!pid) {
-    std::sort(results.begin(), results.end(),
-              [](const double& a, const double& b) { return a < b; });
-    fprintf(stdout,
-            "Integration step time %g ms (%gth "
-            "percentile)--------------------------------------\n",
-            results[nth_percentile * num_iters], 100 * nth_percentile);
-
-    char path[4096] = "";
-    if (test == TEST_STRONG_SCALING)
-        strncpy(path, "strong_scaling.csv", sizeof(path));
-    else if (test == TEST_WEAK_SCALING)
-        strncpy(path, "weak_scaling.csv", sizeof(path));
-    else
-        ERROR("Invalid test type");
-
-    FILE* fp = fopen(path, "a");
-    ERRCHK_ALWAYS(fp);
-    // Format
-    // nprocs, measured (ms)
-    fprintf(fp, "%d, %g\n", nprocs, results[nth_percentile * num_iters]);
-
-    fclose(fp);
-}*/
 
     acGridQuit();
     MPI_Finalize();
diff --git a/samples/genbenchmarkscripts/main.c b/samples/genbenchmarkscripts/main.c
index ce782ed..7be0872 100644
--- a/samples/genbenchmarkscripts/main.c
+++ b/samples/genbenchmarkscripts/main.c
@@ -2,11 +2,12 @@
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <string.h>
 
 int
 main(void)
 {
-    const int max_nprocs = 128;
+    const int max_nprocs = 64;
     for (int nprocs = 1; nprocs <= max_nprocs; nprocs *= 2) {
         char filename[4096];
         sprintf(filename, "benchmark_%d.sh", nprocs);
@@ -18,10 +19,11 @@ main(void)
         fprintf(fp, "#!/bin/bash\n");
         fprintf(fp, "#BATCH --job-name=astaroth\n");
         fprintf(fp, "#SBATCH --account=project_2000403\n");
-        fprintf(fp, "#SBATCH --time=00:14:59\n");
+        fprintf(fp, "#SBATCH --time=03:00:00\n");
         fprintf(fp, "#SBATCH --mem=32000\n");
         fprintf(fp, "#SBATCH --partition=gpu\n");
-        fprintf(fp, "#SBATCH --cpus-per-task=10\n");
+        fprintf(fp, "#SBATCH --output=benchmark-%d-%%j.out\n", nprocs);
+        // fprintf(fp, "#SBATCH --cpus-per-task=10\n");
 
         // nprocs, nodes, gpus
         const int max_gpus_per_node = 4;
@@ -30,30 +32,62 @@ main(void)
         fprintf(fp, "#SBATCH --gres=gpu:v100:%d\n", gpus_per_node);
         fprintf(fp, "#SBATCH -n %d\n", nprocs);
         fprintf(fp, "#SBATCH -N %d\n", nodes);
-        //fprintf(fp, "#SBATCH --exclusive\n");
-        if (nprocs > 4)
+        // fprintf(fp, "#SBATCH --exclusive\n");
+        if (nprocs >= 4)
             fprintf(fp, "#SBATCH --ntasks-per-socket=2\n");
 
         // Modules
         // OpenMPI
-        fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake openmpi nccl\n");
+        fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake openmpi/4.0.3-cuda nccl\n");
+        //fprintf(fp, "export UCX_TLS=rc,sm,cuda_copy,gdr_copy,cuda_ipc\n"); // https://www.open-mpi.org/fa
+        //fprintf(fp, "export PSM2_CUDA=1\nexport PSM2_GPUDIRECT=1\n");
+        //if (nprocs >= 32)
+        //    fprintf(fp, "export UCX_TLS=ud_x,cuda_copy,gdr_copy,cuda_ipc\n"); // https://www.open-mpi.org/fa
+
         // HPCX
         //fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake hpcx-mpi/2.5.0-cuda nccl\n");
-        fprintf(fp, "export UCX_MEMTYPE_CACHE=n\n");
+        //fprintf(fp, "export UCX_MEMTYPE_CACHE=n\n"); // Workaround for bug in hpcx-mpi/2.5.0
 
         // Profile and run
-        //fprintf(fp, "mkdir -p profile_%d\n", nprocs);
+        // fprintf(fp, "mkdir -p profile_%d\n", nprocs);
 
+        /*
         const int nx = 256; // max size 1792;
         const int ny = nx;
         const int nz = nx;
-        /*
+
         fprintf(fp,
                 //"srun nvprof --annotate-mpi openmpi -o profile_%d/%%p.nvprof ./benchmark %d %d "
                 //"%d\n",
                 "srun ./benchmark %d %d %d\n", nx, ny, nz);
         */
-        fprintf(fp, "srun ./benchmark %d %d %d\n", nx, ny, nz);
+        // fprintf(fp, "srun ./benchmark %d %d %d\n", nx, ny, nz);
+
+        const char* files[] = {
+            "benchmark_decomp_1D",       "benchmark_decomp_2D",      "benchmark_decomp_3D",
+            "benchmark_decomp_1D_comm",  "benchmark_decomp_2D_comm", "benchmark_decomp_3D_comm",
+            "benchmark_meshsize_256",    "benchmark_meshsize_512",   "benchmark_meshsize_1024",
+            "benchmark_meshsize_1792",   "benchmark_stencilord_2",   "benchmark_stencilord_4",
+            "benchmark_stencilord_6",    "benchmark_stencilord_8",   "benchmark_timings_control",
+            "benchmark_timings_comp",    "benchmark_timings_comm",   "benchmark_timings_default",
+            "benchmark_timings_corners", "benchmark_weak_128",       "benchmark_weak_256",
+            "benchmark_weak_448",
+        };
+        for (size_t i = 0; i < sizeof(files) / sizeof(files[0]); ++i) { // one run per directory
+            int nn = 256; // Default mesh side length; overridden per directory below
+            if (strcmp(files[i], "benchmark_meshsize_512") == 0)
+                nn = 512;
+            else if (strcmp(files[i], "benchmark_meshsize_1024") == 0)
+                nn = 1024;
+            else if (strcmp(files[i], "benchmark_meshsize_1792") == 0)
+                nn = 1792;
+            else if (strcmp(files[i], "benchmark_weak_128") == 0)
+                nn = 128;
+            else if (strcmp(files[i], "benchmark_weak_448") == 0)
+                nn = 448;
+            // Plain subshell: "$(...)" would run the benchmark's stdout as a shell command
+            fprintf(fp, "(cd %s && srun ./benchmark %d %d %d)\n", files[i], nn, nn, nn);
+        }
 
         fclose(fp);
     }