From 9840b817d081339cc04272225a7505bad3b4b0c0 Mon Sep 17 00:00:00 2001
From: jpekkila
Date: Sun, 7 Jun 2020 21:59:33 +0300
Subject: [PATCH] Added the (hopefully final) basic test case used for the benchmarks

---
 samples/benchmark/main.cc          |  7 ++--
 samples/genbenchmarkscripts/main.c |  9 ++---
 src/core/device.cc                 | 57 +++++++++++++++++++++---------
 src/core/kernels/integration.cuh   |  2 +-
 4 files changed, 50 insertions(+), 25 deletions(-)

diff --git a/samples/benchmark/main.cc b/samples/benchmark/main.cc
index dd14129..962a316 100644
--- a/samples/benchmark/main.cc
+++ b/samples/benchmark/main.cc
@@ -56,11 +56,12 @@ morton3D(const uint64_t pid)
 {
     uint64_t i, j, k;
     i = j = k = 0;
+
     for (int bit = 0; bit <= 21; ++bit) {
         const uint64_t mask = 0x1l << 3 * bit;
-        i |= ((pid & (mask << 0)) >> 2 * bit) >> 0;
+        k |= ((pid & (mask << 0)) >> 2 * bit) >> 0;
         j |= ((pid & (mask << 1)) >> 2 * bit) >> 1;
-        k |= ((pid & (mask << 2)) >> 2 * bit) >> 2;
+        i |= ((pid & (mask << 2)) >> 2 * bit) >> 2;
     }
 
     return (uint3_64){i, j, k};
@@ -174,7 +175,7 @@ main(int argc, char** argv)
     */
 
     // Percentiles
-    const size_t num_iters      = 100;
+    const size_t num_iters      = 1000;
     const double nth_percentile = 0.90;
     std::vector<double> results; // ms
     results.reserve(num_iters);
diff --git a/samples/genbenchmarkscripts/main.c b/samples/genbenchmarkscripts/main.c
index 6f160b3..a45bf1a 100644
--- a/samples/genbenchmarkscripts/main.c
+++ b/samples/genbenchmarkscripts/main.c
@@ -29,6 +29,7 @@ main(void)
         fprintf(fp, "#SBATCH --gres=gpu:v100:%d\n", gpus_per_node);
         fprintf(fp, "#SBATCH -n %d\n", nprocs);
         fprintf(fp, "#SBATCH -N %d\n", nodes);
+        fprintf(fp, "#SBATCH --exclusive\n");
 
         // Modules
         fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake hpcx-mpi/2.5.0-cuda nccl\n");
@@ -37,13 +38,13 @@
         // Profile and run
         fprintf(fp, "mkdir -p profile_%d\n", nprocs);
 
-        const int nx = 1792;
+        const int nx = 256; // max size 1792;
         const int ny = nx;
         const int nz = nx;
         fprintf(fp,
-                "srun nvprof --annotate-mpi openmpi -o profile_%d/%%p.nvprof ./benchmark %d %d "
-                "%d\n",
-                nprocs, nx, ny, nz);
+                //"srun nvprof --annotate-mpi openmpi -o profile_%d/%%p.nvprof ./benchmark %d %d "
+                //"%d\n",
+                "srun ./benchmark %d %d %d\n", nx, ny, nz);
 
         fclose(fp);
     }
diff --git a/src/core/device.cc b/src/core/device.cc
index 1e070cb..35af82d 100644
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -527,6 +527,15 @@ morton3D(const uint64_t pid)
     i = j = k = 0;
 
     if (DECOMPOSITION_AXES == 3) {
+        for (int bit = 0; bit <= 21; ++bit) {
+            const uint64_t mask = 0x1l << 3 * bit;
+            k |= ((pid & (mask << 0)) >> 2 * bit) >> 0;
+            j |= ((pid & (mask << 1)) >> 2 * bit) >> 1;
+            i |= ((pid & (mask << 2)) >> 2 * bit) >> 2;
+        }
+    }
+    /*
+    else if (DECOMPOSITION_AXES == 3) {
         for (int bit = 0; bit <= 21; ++bit) {
             const uint64_t mask = 0x1l << 3 * bit;
             i |= ((pid & (mask << 0)) >> 2 * bit) >> 0;
@@ -534,18 +543,19 @@ morton3D(const uint64_t pid)
             j |= ((pid & (mask << 1)) >> 2 * bit) >> 1;
             k |= ((pid & (mask << 2)) >> 2 * bit) >> 2;
         }
     }
+    */
     // Just a quick copy/paste for other decomp dims
     else if (DECOMPOSITION_AXES == 2) {
         for (int bit = 0; bit <= 21; ++bit) {
             const uint64_t mask = 0x1l << 2 * bit;
-            i |= ((pid & (mask << 0)) >> 1 * bit) >> 0;
-            j |= ((pid & (mask << 1)) >> 1 * bit) >> 1;
+            j |= ((pid & (mask << 0)) >> 1 * bit) >> 0;
+            k |= ((pid & (mask << 1)) >> 1 * bit) >> 1;
         }
     }
     else if (DECOMPOSITION_AXES == 1) {
         for (int bit = 0; bit <= 21; ++bit) {
             const uint64_t mask = 0x1l << 1 * bit;
-            i |= ((pid & (mask << 0)) >> 0 * bit) >> 0;
+            k |= ((pid & (mask << 0)) >> 0 * bit) >> 0;
         }
     }
     else {
@@ -562,24 +572,33 @@ morton1D(const uint3_64 pid)
     uint64_t i = 0;
 
     if (DECOMPOSITION_AXES == 3) {
+        for (int bit = 0; bit <= 21; ++bit) {
+            const uint64_t mask = 0x1l << bit;
+            i |= ((pid.z & mask) << 0) << 2 * bit;
+            i |= ((pid.y & mask) << 1) << 2 * bit;
+            i |= ((pid.x & mask) << 2) << 2 * bit;
+        }
+    }
+    /*
+    else if (DECOMPOSITION_AXES == 3) {
         for (int bit = 0; bit <= 21; ++bit) {
             const uint64_t mask = 0x1l << bit;
             i |= ((pid.x & mask) << 0) << 2 * bit;
             i |= ((pid.y & mask) << 1) << 2 * bit;
             i |= ((pid.z & mask) << 2) << 2 * bit;
         }
-    }
+    }*/
     else if (DECOMPOSITION_AXES == 2) {
         for (int bit = 0; bit <= 21; ++bit) {
             const uint64_t mask = 0x1l << bit;
-            i |= ((pid.x & mask) << 0) << 1 * bit;
-            i |= ((pid.y & mask) << 1) << 1 * bit;
+            i |= ((pid.y & mask) << 0) << 1 * bit;
+            i |= ((pid.z & mask) << 1) << 1 * bit;
         }
     }
     else if (DECOMPOSITION_AXES == 1) {
         for (int bit = 0; bit <= 21; ++bit) {
             const uint64_t mask = 0x1l << bit;
-            i |= ((pid.x & mask) << 0) << 0 * bit;
+            i |= ((pid.z & mask) << 0) << 0 * bit;
         }
     }
     else {
@@ -1204,6 +1223,8 @@ typedef struct {
     CommData sidexy_data;
     CommData sidexz_data;
     CommData sideyz_data;
+
+    // int comm_cart;
 } Grid;
 
 static Grid grid = {};
@@ -1444,16 +1465,6 @@ acGridIntegrate(const Stream stream, const AcReal dt)
     acPackCommData(device, sideyz_b0s, &sideyz_data);
 #endif
 
-#if MPI_COMPUTE_ENABLED
-    //////////// INNER INTEGRATION //////////////
-    {
-        const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
-        const int3 m2 = nn;
-        acDeviceIntegrateSubstep(device, STREAM_16, isubstep, m1, m2, dt);
-    }
-////////////////////////////////////////////
-#endif // MPI_COMPUTE_ENABLED
-
 #if MPI_COMM_ENABLED
     MPI_Barrier(MPI_COMM_WORLD);
 
@@ -1474,7 +1485,19 @@ acGridIntegrate(const Stream stream, const AcReal dt)
     acTransferCommData(device, sidexy_b0s, &sidexy_data);
     acTransferCommData(device, sidexz_b0s, &sidexz_data);
     acTransferCommData(device, sideyz_b0s, &sideyz_data);
+#endif // MPI_COMM_ENABLED
+
+#if MPI_COMPUTE_ENABLED
+    //////////// INNER INTEGRATION //////////////
+    {
+        const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
+        const int3 m2 = nn;
+        acDeviceIntegrateSubstep(device, STREAM_16, isubstep, m1, m2, dt);
+    }
+////////////////////////////////////////////
+#endif // MPI_COMPUTE_ENABLED
+
+#if MPI_COMM_ENABLED
     // acTransferCommDataWait(corner_data); // Do not rm: required for corners
     acTransferCommDataWait(edgex_data);
     acTransferCommDataWait(edgey_data);
diff --git a/src/core/kernels/integration.cuh b/src/core/kernels/integration.cuh
index 97326ad..4c01148 100644
--- a/src/core/kernels/integration.cuh
+++ b/src/core/kernels/integration.cuh
@@ -134,7 +134,7 @@ acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferAr
     // RK3
     dim3 best_dims(0, 0, 0);
     float best_time          = INFINITY;
-    const int num_iterations = 5;
+    const int num_iterations = 10;
 
     for (int z = 1; z <= MAX_THREADS_PER_BLOCK; ++z) {
         for (int y = 1; y <= MAX_THREADS_PER_BLOCK; ++y) {
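
The main functional change in the diff is the Morton index mapping: the z index is now taken from the lowest interleaved bit and x from the highest, in both the decode (morton3D) and encode (morton1D) directions. Below is a small standalone sketch of that bit order. The uint3_64 struct and the two functions are re-declared locally for illustration only; they mirror the patched versions (with an unsigned mask constant), and the round-trip assertion is an added sanity check, not something taken from the patch.

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    struct uint3_64 {
        uint64_t x, y, z;
    };

    // Decode: bit 3*b + 0 -> z, 3*b + 1 -> y, 3*b + 2 -> x (the patched order).
    static uint3_64
    morton3D(const uint64_t pid)
    {
        uint64_t i = 0, j = 0, k = 0;
        for (int bit = 0; bit <= 21; ++bit) {
            const uint64_t mask = UINT64_C(1) << 3 * bit;
            k |= ((pid & (mask << 0)) >> 2 * bit) >> 0;
            j |= ((pid & (mask << 1)) >> 2 * bit) >> 1;
            i |= ((pid & (mask << 2)) >> 2 * bit) >> 2;
        }
        return {i, j, k};
    }

    // Encode: the inverse interleave, z taken from the lowest bit.
    static uint64_t
    morton1D(const uint3_64 pid)
    {
        uint64_t i = 0;
        for (int bit = 0; bit <= 21; ++bit) {
            const uint64_t mask = UINT64_C(1) << bit;
            i |= ((pid.z & mask) << 0) << 2 * bit;
            i |= ((pid.y & mask) << 1) << 2 * bit;
            i |= ((pid.x & mask) << 2) << 2 * bit;
        }
        return i;
    }

    int
    main(void)
    {
        // Every process id must map to a unique (x, y, z) block and back.
        for (uint64_t pid = 0; pid < 4096; ++pid)
            assert(morton1D(morton3D(pid)) == pid);

        // With the new order, consecutive pids advance z fastest:
        // pid 0 -> (0,0,0), pid 1 -> (0,0,1), pid 2 -> (0,1,0), pid 4 -> (1,0,0).
        const uint3_64 p = morton3D(1);
        printf("pid 1 -> (x=%llu, y=%llu, z=%llu)\n", (unsigned long long)p.x,
               (unsigned long long)p.y, (unsigned long long)p.z);
        return 0;
    }

Placing z in the fastest-varying position means neighboring ranks along z get consecutive ids, which matters when ranks are packed onto nodes in id order.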
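
The acGridIntegrate change moves the INNER INTEGRATION block so that it is launched only after the non-blocking halo transfers have been posted: interior work then overlaps with communication, and only the boundary-dependent work waits on the transfers. The sketch below shows the same ordering on a toy 1-D halo exchange with plain MPI. It is not Astaroth API; NGHOST and NN are reused merely as illustrative constants, and the "+= 1.0" loops stand in for the actual stencil integration.

    #include <mpi.h>
    #include <cstdio>

    #define NGHOST 3
    #define NN 64 /* interior points per rank (toy value) */

    int
    main(int argc, char** argv)
    {
        MPI_Init(&argc, &argv);

        int rank, nprocs;
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

        const int left  = (rank - 1 + nprocs) % nprocs;
        const int right = (rank + 1) % nprocs;

        // Layout: [0, NGHOST) left ghost | [NGHOST, NGHOST+NN) interior | right ghost.
        double field[NGHOST + NN + NGHOST];
        for (int i = 0; i < NGHOST + NN + NGHOST; ++i)
            field[i] = rank;

        // 1. Post the halo exchange (analogous to acPackCommData + acTransferCommData).
        MPI_Request reqs[4];
        MPI_Irecv(field, NGHOST, MPI_DOUBLE, left, 0, MPI_COMM_WORLD, &reqs[0]);
        MPI_Irecv(field + NGHOST + NN, NGHOST, MPI_DOUBLE, right, 1, MPI_COMM_WORLD, &reqs[1]);
        MPI_Isend(field + NGHOST, NGHOST, MPI_DOUBLE, left, 1, MPI_COMM_WORLD, &reqs[2]);
        MPI_Isend(field + NN, NGHOST, MPI_DOUBLE, right, 0, MPI_COMM_WORLD, &reqs[3]);

        // 2. Update the interior that cannot depend on the halos, i.e. [2*NGHOST, NN),
        //    analogous to the INNER INTEGRATION block over [2*NGHOST, nn).
        for (int i = 2 * NGHOST; i < NN; ++i)
            field[i] += 1.0;

        // 3. Wait for the halos, then update the halo-dependent boundary regions.
        MPI_Waitall(4, reqs, MPI_STATUSES_IGNORE);
        for (int i = NGHOST; i < 2 * NGHOST; ++i)
            field[i] += 1.0;
        for (int i = NN; i < NGHOST + NN; ++i)
            field[i] += 1.0;

        if (rank == 0)
            printf("interior updated while halos were in flight\n");

        MPI_Finalize();
        return 0;
    }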
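
On the benchmark side, num_iters is raised to 1000 so that the 0.90 percentile is taken over a larger sample of per-iteration timings. The patch only shows the constants num_iters and nth_percentile, not the reduction itself; a reduction along those lines might look like the following hypothetical helper, which is a sketch rather than code from the sample.

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    // Sort the per-iteration timings (taken by value so the caller's vector is
    // untouched) and return the value below which the requested fraction of
    // samples falls. Assumes results is non-empty.
    static double
    percentile(std::vector<double> results, const double nth) // e.g. nth = 0.90
    {
        std::sort(results.begin(), results.end());
        size_t idx = static_cast<size_t>(nth * results.size());
        if (idx >= results.size())
            idx = results.size() - 1;
        return results[idx];
    }

Reporting a high percentile instead of the mean makes the measurement less sensitive to occasional slow iterations caused by system noise.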