From 0d80834619fa973052686fbf887dcca1fa664f23 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Tue, 2 Jun 2020 14:08:34 +0300
Subject: [PATCH 01/10] Disabled forcing and upwinding for performance tests.
 Set default grid size to 512^3. Set default cmake params s.t. benchmarks can
 be reproduced out-of-the-box.

---
 CMakeLists.txt                   | 10 +++++-----
 acc/mhd_solver/stencil_kernel.ac |  4 ++--
 config/astaroth.conf             |  6 +++---
 src/utils/modelsolver.c          |  2 +-
 4 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a5a514b..45c3a2b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -30,11 +30,11 @@ endif()
 message(STATUS "Build type: " ${CMAKE_BUILD_TYPE})
 
 ## Options
-option(DOUBLE_PRECISION "Generates double precision code."                    OFF)
-option(BUILD_SAMPLES    "Builds projects in samples subdirectory."            OFF)
-option(BUILD_STANDALONE "Builds standalone Astaroth."                         ON)
-option(MPI_ENABLED      "Enables additional functions for MPI communciation." OFF)
-option(MULTIGPU_ENABLED "Enables multi-GPU on a single node. Uses peer-to-peer communication instead of MPI. Affects Legacy & Node layers only." ON)
+option(DOUBLE_PRECISION "Generates double precision code."                    ON)
+option(BUILD_SAMPLES    "Builds projects in samples subdirectory."            ON)
+option(BUILD_STANDALONE "Builds standalone Astaroth."                         OFF)
+option(MPI_ENABLED      "Enables additional functions for MPI communciation." ON)
+option(MULTIGPU_ENABLED "Enables multi-GPU on a single node. Uses peer-to-peer communication instead of MPI. Affects Legacy & Node layers only." OFF)
 
 ## Options (DEPRECATED)
 # option(BUILD_DEBUG              "Builds the program with extensive error checking"          OFF)
diff --git a/acc/mhd_solver/stencil_kernel.ac b/acc/mhd_solver/stencil_kernel.ac
index 905eb65..0f37ea5 100644
--- a/acc/mhd_solver/stencil_kernel.ac
+++ b/acc/mhd_solver/stencil_kernel.ac
@@ -5,8 +5,8 @@
 #define LMAGNETIC (1)
 #define LENTROPY (1)
 #define LTEMPERATURE (0)
-#define LFORCING (1)
-#define LUPWD (1)
+#define LFORCING (0)
+#define LUPWD (0)
 #define LSINK (0)
 
 #define AC_THERMAL_CONDUCTIVITY (0.001) // TODO: make an actual config parameter
diff --git a/config/astaroth.conf b/config/astaroth.conf
index abc1613..ccefc45 100644
--- a/config/astaroth.conf
+++ b/config/astaroth.conf
@@ -5,9 +5,9 @@
  * "Compile-time" params
  * =============================================================================
  */
-AC_nx = 128
-AC_ny = 128
-AC_nz = 128
+AC_nx = 512
+AC_ny = 512
+AC_nz = 512
 
 AC_dsx = 0.04908738521
 AC_dsy = 0.04908738521
diff --git a/src/utils/modelsolver.c b/src/utils/modelsolver.c
index 92eb71c..e482bd9 100644
--- a/src/utils/modelsolver.c
+++ b/src/utils/modelsolver.c
@@ -39,7 +39,7 @@
 #define LENTROPY (1)
 #define LTEMPERATURE (0)
 #define LFORCING (0)
-#define LUPWD (1)
+#define LUPWD (0)
 #define AC_THERMAL_CONDUCTIVITY ((Scalar)(0.001)) // TODO: make an actual config parameter
 
 typedef AcReal Scalar;

From 17a4f314519dad18c25caf920c709d6da5ecfc2e Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Thu, 4 Jun 2020 20:47:03 +0300
Subject: [PATCH 02/10] Added the latest setup used for benchmarks

---
 CMakeLists.txt                     |  6 +-
 config/astaroth.conf               | 14 ++---
 samples/benchmark/main.cc          | 65 ++++++++++++++++++++-
 samples/genbenchmarkscripts/main.c | 10 +++-
 src/core/device.cc                 | 93 ++++++++++++++++++++++++++++--
 src/core/kernels/integration.cuh   | 21 ++++---
 6 files changed, 182 insertions(+), 27 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 45c3a2b..04100bc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,7 +1,7 @@
 ## CMake settings
 # V3.9 required for first-class CUDA support
 # V3.17 required for the FindCUDAToolkit package
-cmake_minimum_required(VERSION 3.17) 
+cmake_minimum_required(VERSION 3.17)
 find_program(CMAKE_C_COMPILER NAMES $ENV{CC} gcc PATHS ENV PATH NO_DEFAULT_PATH)
 find_program(CMAKE_CXX_COMPILER NAMES $ENV{CXX} g++ PATHS ENV PATH NO_DEFAULT_PATH)
 
@@ -10,7 +10,7 @@ project(astaroth C CXX CUDA)
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR})
 
 ## Project-wide compilation flags
-set(COMMON_FLAGS "-mavx -Wall -Wextra -Werror -Wdouble-promotion -Wfloat-conversion -Wshadow")
+set(COMMON_FLAGS "-mavx -Wall -Wextra -Wdouble-promotion -Wfloat-conversion -Wshadow")
 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${COMMON_FLAGS}")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COMMON_FLAGS}")
 set(CMAKE_C_STANDARD 11)
@@ -19,7 +19,7 @@ set(CMAKE_CXX_STANDARD 11)
 find_package(CUDA) # Still required for various macros, such as cuda_select_nvcc_...
 cuda_select_nvcc_arch_flags(ARCHLIST Common) # Common architectures depend on the available CUDA version. Listed here: https://github.com/Kitware/CMake/blob/master/Modules/FindCUDA/select_compute_arch.cmake
 string(REPLACE ";" " " CUDA_ARCH_FLAGS "${ARCHLIST}")
-set(COMMON_FLAGS_CUDA "-mavx,-Wall,-Wextra,-Werror,-Wdouble-promotion,-Wfloat-conversion,-Wshadow")
+set(COMMON_FLAGS_CUDA "-mavx,-Wall,-Wextra,-Wdouble-promotion,-Wfloat-conversion,-Wshadow")
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${CUDA_ARCH_FLAGS} -ccbin=${CMAKE_CXX_COMPILER} --compiler-options=${COMMON_FLAGS_CUDA}")
 
 
diff --git a/config/astaroth.conf b/config/astaroth.conf
index ccefc45..83e93d9 100644
--- a/config/astaroth.conf
+++ b/config/astaroth.conf
@@ -5,9 +5,9 @@
  * "Compile-time" params
  * =============================================================================
  */
-AC_nx = 512
-AC_ny = 512
-AC_nz = 512
+AC_nx = 256
+AC_ny = 256
+AC_nz = 256
 
 AC_dsx = 0.04908738521
 AC_dsy = 0.04908738521
@@ -24,11 +24,11 @@ AC_bin_steps = 1000
 AC_bin_save_t = 1e666
 
 // Set to 0 if you want to run the simulation from the beginning, or just a new
-// simulation. If continuing from a saved step, specify the step number here.  
-AC_start_step = 0 
+// simulation. If continuing from a saved step, specify the step number here.
+AC_start_step = 0
 
 // Maximum time in code units. If negative, there is no time limit
-AC_max_time = -1.0 
+AC_max_time = -1.0
 
 // Hydro
 AC_cdt = 0.4
@@ -49,7 +49,7 @@ AC_forcing_magnitude = 1e-5
 AC_kmin              = 0.8
 AC_kmax              = 1.2
 // Switches forcing off and accretion on
-AC_switch_accretion  = 0 
+AC_switch_accretion  = 0
 
 // Entropy
 AC_cp_sound = 1.0
diff --git a/samples/benchmark/main.cc b/samples/benchmark/main.cc
index 5ab4349..dd14129 100644
--- a/samples/benchmark/main.cc
+++ b/samples/benchmark/main.cc
@@ -39,8 +39,46 @@ typedef enum {
     NUM_TESTS,
 } TestType;
 
+#include <stdint.h>
+
+typedef struct {
+    uint64_t x, y, z;
+} uint3_64;
+
+static uint3_64
+operator+(const uint3_64& a, const uint3_64& b)
+{
+    return (uint3_64){a.x + b.x, a.y + b.y, a.z + b.z};
+}
+
+static uint3_64
+morton3D(const uint64_t pid)
+{
+    uint64_t i, j, k;
+    i = j = k = 0;
+    for (int bit = 0; bit <= 21; ++bit) {
+        const uint64_t mask = 0x1l << 3 * bit;
+        i |= ((pid & (mask << 0)) >> 2 * bit) >> 0;
+        j |= ((pid & (mask << 1)) >> 2 * bit) >> 1;
+        k |= ((pid & (mask << 2)) >> 2 * bit) >> 2;
+    }
+
+    return (uint3_64){i, j, k};
+}
+
+static uint3_64
+decompose(const uint64_t target)
+{
+    // This is just so beautifully elegant. Complex and efficient decomposition
+    // in just one line of code.
+    uint3_64 p = morton3D(target - 1) + (uint3_64){1, 1, 1};
+
+    ERRCHK_ALWAYS(p.x * p.y * p.z == target);
+    return p;
+}
+
 int
-main(void)
+main(int argc, char** argv)
 {
     MPI_Init(NULL, NULL);
     int nprocs, pid;
@@ -51,9 +89,30 @@ main(void)
     AcMeshInfo info;
     acLoadConfig(AC_DEFAULT_CONFIG, &info);
 
+    if (argc > 1) {
+        if (argc == 4) {
+            const int nx           = atoi(argv[1]);
+            const int ny           = atoi(argv[2]);
+            const int nz           = atoi(argv[3]);
+            info.int_params[AC_nx] = nx;
+            info.int_params[AC_ny] = ny;
+            info.int_params[AC_nz] = nz;
+            acUpdateBuiltinParams(&info);
+            printf("Updated mesh dimensions to (%d, %d, %d)\n", nx, ny, nz);
+        }
+        else {
+            fprintf(stderr, "Could not parse arguments. Usage: ./benchmark <nx> <ny> <nz>.\n");
+            exit(EXIT_FAILURE);
+        }
+    }
+
     const TestType test = TEST_STRONG_SCALING;
-    if (test == TEST_WEAK_SCALING)
-        info.int_params[AC_nz] *= nprocs;
+    if (test == TEST_WEAK_SCALING) {
+        uint3_64 decomp = decompose(nprocs);
+        info.int_params[AC_nx] *= decomp.x;
+        info.int_params[AC_ny] *= decomp.y;
+        info.int_params[AC_nz] *= decomp.z;
+    }
 
     /*
     AcMesh model, candidate;
diff --git a/samples/genbenchmarkscripts/main.c b/samples/genbenchmarkscripts/main.c
index 8d35ae9..6f160b3 100644
--- a/samples/genbenchmarkscripts/main.c
+++ b/samples/genbenchmarkscripts/main.c
@@ -36,8 +36,14 @@ main(void)
 
         // Profile and run
         fprintf(fp, "mkdir -p profile_%d\n", nprocs);
-        fprintf(fp, "srun nvprof --annotate-mpi openmpi -o profile_%d/%%p.nvprof ./benchmark\n",
-                nprocs);
+
+        const int nx = 1792;
+        const int ny = nx;
+        const int nz = nx;
+        fprintf(fp,
+                "srun nvprof --annotate-mpi openmpi -o profile_%d/%%p.nvprof ./benchmark %d %d "
+                "%d\n",
+                nprocs, nx, ny, nz);
 
         fclose(fp);
     }
diff --git a/src/core/device.cc b/src/core/device.cc
index f473fc2..1e070cb 100644
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -518,6 +518,78 @@ mod(const int a, const int b)
     return r < 0 ? r + b : r;
 }
 
+#define DECOMPOSITION_AXES (3)
+
+static uint3_64
+morton3D(const uint64_t pid)
+{
+    uint64_t i, j, k;
+    i = j = k = 0;
+
+    if (DECOMPOSITION_AXES == 3) {
+        for (int bit = 0; bit <= 21; ++bit) {
+            const uint64_t mask = 0x1l << 3 * bit;
+            i |= ((pid & (mask << 0)) >> 2 * bit) >> 0;
+            j |= ((pid & (mask << 1)) >> 2 * bit) >> 1;
+            k |= ((pid & (mask << 2)) >> 2 * bit) >> 2;
+        }
+    }
+    // Just a quick copy/paste for other decomp dims
+    else if (DECOMPOSITION_AXES == 2) {
+        for (int bit = 0; bit <= 21; ++bit) {
+            const uint64_t mask = 0x1l << 2 * bit;
+            i |= ((pid & (mask << 0)) >> 1 * bit) >> 0;
+            j |= ((pid & (mask << 1)) >> 1 * bit) >> 1;
+        }
+    }
+    else if (DECOMPOSITION_AXES == 1) {
+        for (int bit = 0; bit <= 21; ++bit) {
+            const uint64_t mask = 0x1l << 1 * bit;
+            i |= ((pid & (mask << 0)) >> 0 * bit) >> 0;
+        }
+    }
+    else {
+        fprintf(stderr, "Invalid DECOMPOSITION_AXES\n");
+        ERRCHK_ALWAYS(0);
+    }
+
+    return (uint3_64){i, j, k};
+}
+
+static uint64_t
+morton1D(const uint3_64 pid)
+{
+    uint64_t i = 0;
+
+    if (DECOMPOSITION_AXES == 3) {
+        for (int bit = 0; bit <= 21; ++bit) {
+            const uint64_t mask = 0x1l << bit;
+            i |= ((pid.x & mask) << 0) << 2 * bit;
+            i |= ((pid.y & mask) << 1) << 2 * bit;
+            i |= ((pid.z & mask) << 2) << 2 * bit;
+        }
+    }
+    else if (DECOMPOSITION_AXES == 2) {
+        for (int bit = 0; bit <= 21; ++bit) {
+            const uint64_t mask = 0x1l << bit;
+            i |= ((pid.x & mask) << 0) << 1 * bit;
+            i |= ((pid.y & mask) << 1) << 1 * bit;
+        }
+    }
+    else if (DECOMPOSITION_AXES == 1) {
+        for (int bit = 0; bit <= 21; ++bit) {
+            const uint64_t mask = 0x1l << bit;
+            i |= ((pid.x & mask) << 0) << 0 * bit;
+        }
+    }
+    else {
+        fprintf(stderr, "Invalid DECOMPOSITION_AXES\n");
+        ERRCHK_ALWAYS(0);
+    }
+
+    return i;
+}
+/*
 static uint3_64
 morton3D(const uint64_t pid)
 {
@@ -545,6 +617,7 @@ morton1D(const uint3_64 pid)
     }
     return i;
 }
+*/
 
 static uint3_64
 decompose(const uint64_t target)
@@ -1277,15 +1350,18 @@ acGridStoreMesh(const Stream stream, AcMesh* host_mesh)
     return AC_SUCCESS;
 }
 
+#define MPI_COMPUTE_ENABLED (1)
+#define MPI_COMM_ENABLED (1)
+
 AcResult
 acGridIntegrate(const Stream stream, const AcReal dt)
 {
     ERRCHK(grid.initialized);
     // acGridSynchronizeStream(stream);
 
-    const Device device  = grid.device;
-    const int3 nn        = grid.nn;
-    //CommData corner_data = grid.corner_data; // Do not rm: required for corners
+    const Device device = grid.device;
+    const int3 nn       = grid.nn;
+    // CommData corner_data = grid.corner_data; // Do not rm: required for corners
     CommData edgex_data  = grid.edgex_data;
     CommData edgey_data  = grid.edgey_data;
     CommData edgez_data  = grid.edgez_data;
@@ -1357,6 +1433,8 @@ acGridIntegrate(const Stream stream, const AcReal dt)
     };
 
     for (int isubstep = 0; isubstep < 3; ++isubstep) {
+
+#if MPI_COMM_ENABLED
         // acPackCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners
         acPackCommData(device, edgex_b0s, &edgex_data);
         acPackCommData(device, edgey_b0s, &edgey_data);
@@ -1364,15 +1442,19 @@ acGridIntegrate(const Stream stream, const AcReal dt)
         acPackCommData(device, sidexy_b0s, &sidexy_data);
         acPackCommData(device, sidexz_b0s, &sidexz_data);
         acPackCommData(device, sideyz_b0s, &sideyz_data);
+#endif
 
+#if MPI_COMPUTE_ENABLED
         //////////// INNER INTEGRATION //////////////
         {
             const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
             const int3 m2 = nn;
             acDeviceIntegrateSubstep(device, STREAM_16, isubstep, m1, m2, dt);
         }
-        ////////////////////////////////////////////
+////////////////////////////////////////////
+#endif // MPI_COMPUTE_ENABLED
 
+#if MPI_COMM_ENABLED
         MPI_Barrier(MPI_COMM_WORLD);
 
 #if MPI_GPUDIRECT_DISABLED
@@ -1436,6 +1518,8 @@ acGridIntegrate(const Stream stream, const AcReal dt)
         acSyncCommData(sidexy_data);
         acSyncCommData(sidexz_data);
         acSyncCommData(sideyz_data);
+#endif // MPI_COMM_ENABLED
+#if MPI_COMPUTE_ENABLED
         { // Front
             const int3 m1 = (int3){NGHOST, NGHOST, NGHOST};
             const int3 m2 = m1 + (int3){nn.x, nn.y, NGHOST};
@@ -1466,6 +1550,7 @@ acGridIntegrate(const Stream stream, const AcReal dt)
             const int3 m2 = m1 + (int3){NGHOST, nn.y - 2 * NGHOST, nn.z - 2 * NGHOST};
             acDeviceIntegrateSubstep(device, STREAM_5, isubstep, m1, m2, dt);
         }
+#endif // MPI_COMPUTE_ENABLED
         acDeviceSwapBuffers(device);
         acDeviceSynchronizeStream(device, STREAM_ALL); // Wait until inner and outer done
         ////////////////////////////////////////////
diff --git a/src/core/kernels/integration.cuh b/src/core/kernels/integration.cuh
index 8d66fd2..97326ad 100644
--- a/src/core/kernels/integration.cuh
+++ b/src/core/kernels/integration.cuh
@@ -41,10 +41,12 @@ static __device__ __forceinline__ AcReal3
 rk3_integrate(const AcReal3 state_previous, const AcReal3 state_current,
               const AcReal3 rate_of_change, const AcReal dt)
 {
-    return (AcReal3){
-        rk3_integrate<step_number>(state_previous.x, state_current.x, rate_of_change.x, dt),
-        rk3_integrate<step_number>(state_previous.y, state_current.y, rate_of_change.y, dt),
-        rk3_integrate<step_number>(state_previous.z, state_current.z, rate_of_change.z, dt)};
+    return (AcReal3){rk3_integrate<step_number>(state_previous.x, state_current.x, rate_of_change.x,
+                                                dt),
+                     rk3_integrate<step_number>(state_previous.y, state_current.y, rate_of_change.y,
+                                                dt),
+                     rk3_integrate<step_number>(state_previous.z, state_current.z, rate_of_change.z,
+                                                dt)};
 }
 
 #define rk3(state_previous, state_current, rate_of_change, dt)                                     \
@@ -132,7 +134,7 @@ acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferAr
     // RK3
     dim3 best_dims(0, 0, 0);
     float best_time          = INFINITY;
-    const int num_iterations = 10;
+    const int num_iterations = 5;
 
     for (int z = 1; z <= MAX_THREADS_PER_BLOCK; ++z) {
         for (int y = 1; y <= MAX_THREADS_PER_BLOCK; ++y) {
@@ -192,9 +194,9 @@ acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferAr
         }
     }
 #if VERBOSE_PRINTING
-    printf(
-        "Auto-optimization done. The best threadblock dimensions for rkStep: (%d, %d, %d) %f ms\n",
-        best_dims.x, best_dims.y, best_dims.z, double(best_time) / num_iterations);
+    printf("Auto-optimization done. The best threadblock dimensions for rkStep: (%d, %d, %d) %f "
+           "ms\n",
+           best_dims.x, best_dims.y, best_dims.z, double(best_time) / num_iterations);
 #endif
     /*
     FILE* fp = fopen("../config/rk3_tbdims.cuh", "w");
@@ -204,6 +206,9 @@ acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferAr
     */
 
     rk3_tpb = best_dims;
+
+    // Failed to find valid thread block dimensions
+    ERRCHK_ALWAYS(rk3_tpb.x * rk3_tpb.y * rk3_tpb.z > 0);
     return AC_SUCCESS;
 }
 

From 9840b817d081339cc04272225a7505bad3b4b0c0 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Sun, 7 Jun 2020 21:59:33 +0300
Subject: [PATCH 03/10] Added the (hopefully final) basic test case used for
 the benchmarks

---
 samples/benchmark/main.cc          |  7 ++--
 samples/genbenchmarkscripts/main.c |  9 ++---
 src/core/device.cc                 | 57 +++++++++++++++++++++---------
 src/core/kernels/integration.cuh   |  2 +-
 4 files changed, 50 insertions(+), 25 deletions(-)

diff --git a/samples/benchmark/main.cc b/samples/benchmark/main.cc
index dd14129..962a316 100644
--- a/samples/benchmark/main.cc
+++ b/samples/benchmark/main.cc
@@ -56,11 +56,12 @@ morton3D(const uint64_t pid)
 {
     uint64_t i, j, k;
     i = j = k = 0;
+
     for (int bit = 0; bit <= 21; ++bit) {
         const uint64_t mask = 0x1l << 3 * bit;
-        i |= ((pid & (mask << 0)) >> 2 * bit) >> 0;
+        k |= ((pid & (mask << 0)) >> 2 * bit) >> 0;
         j |= ((pid & (mask << 1)) >> 2 * bit) >> 1;
-        k |= ((pid & (mask << 2)) >> 2 * bit) >> 2;
+        i |= ((pid & (mask << 2)) >> 2 * bit) >> 2;
     }
 
     return (uint3_64){i, j, k};
@@ -174,7 +175,7 @@ main(int argc, char** argv)
     */
 
     // Percentiles
-    const size_t num_iters      = 100;
+    const size_t num_iters      = 1000;
     const double nth_percentile = 0.90;
     std::vector<double> results; // ms
     results.reserve(num_iters);
diff --git a/samples/genbenchmarkscripts/main.c b/samples/genbenchmarkscripts/main.c
index 6f160b3..a45bf1a 100644
--- a/samples/genbenchmarkscripts/main.c
+++ b/samples/genbenchmarkscripts/main.c
@@ -29,6 +29,7 @@ main(void)
         fprintf(fp, "#SBATCH --gres=gpu:v100:%d\n", gpus_per_node);
         fprintf(fp, "#SBATCH -n %d\n", nprocs);
         fprintf(fp, "#SBATCH -N %d\n", nodes);
+        fprintf(fp, "#SBATCH --exclusive\n");
 
         // Modules
         fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake hpcx-mpi/2.5.0-cuda nccl\n");
@@ -37,13 +38,13 @@ main(void)
         // Profile and run
         fprintf(fp, "mkdir -p profile_%d\n", nprocs);
 
-        const int nx = 1792;
+        const int nx = 256; // max size 1792;
         const int ny = nx;
         const int nz = nx;
         fprintf(fp,
-                "srun nvprof --annotate-mpi openmpi -o profile_%d/%%p.nvprof ./benchmark %d %d "
-                "%d\n",
-                nprocs, nx, ny, nz);
+                //"srun nvprof --annotate-mpi openmpi -o profile_%d/%%p.nvprof ./benchmark %d %d "
+                //"%d\n",
+                "srun ./benchmark %d %d %d\n", nx, ny, nz);
 
         fclose(fp);
     }
diff --git a/src/core/device.cc b/src/core/device.cc
index 1e070cb..35af82d 100644
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -527,6 +527,15 @@ morton3D(const uint64_t pid)
     i = j = k = 0;
 
     if (DECOMPOSITION_AXES == 3) {
+        for (int bit = 0; bit <= 21; ++bit) {
+            const uint64_t mask = 0x1l << 3 * bit;
+            k |= ((pid & (mask << 0)) >> 2 * bit) >> 0;
+            j |= ((pid & (mask << 1)) >> 2 * bit) >> 1;
+            i |= ((pid & (mask << 2)) >> 2 * bit) >> 2;
+        }
+    }
+    /*
+    else if (DECOMPOSITION_AXES == 3) {
         for (int bit = 0; bit <= 21; ++bit) {
             const uint64_t mask = 0x1l << 3 * bit;
             i |= ((pid & (mask << 0)) >> 2 * bit) >> 0;
@@ -534,18 +543,19 @@ morton3D(const uint64_t pid)
             k |= ((pid & (mask << 2)) >> 2 * bit) >> 2;
         }
     }
+    */
     // Just a quick copy/paste for other decomp dims
     else if (DECOMPOSITION_AXES == 2) {
         for (int bit = 0; bit <= 21; ++bit) {
             const uint64_t mask = 0x1l << 2 * bit;
-            i |= ((pid & (mask << 0)) >> 1 * bit) >> 0;
-            j |= ((pid & (mask << 1)) >> 1 * bit) >> 1;
+            j |= ((pid & (mask << 0)) >> 1 * bit) >> 0;
+            k |= ((pid & (mask << 1)) >> 1 * bit) >> 1;
         }
     }
     else if (DECOMPOSITION_AXES == 1) {
         for (int bit = 0; bit <= 21; ++bit) {
             const uint64_t mask = 0x1l << 1 * bit;
-            i |= ((pid & (mask << 0)) >> 0 * bit) >> 0;
+            k |= ((pid & (mask << 0)) >> 0 * bit) >> 0;
         }
     }
     else {
@@ -562,24 +572,33 @@ morton1D(const uint3_64 pid)
     uint64_t i = 0;
 
     if (DECOMPOSITION_AXES == 3) {
+        for (int bit = 0; bit <= 21; ++bit) {
+            const uint64_t mask = 0x1l << bit;
+            i |= ((pid.z & mask) << 0) << 2 * bit;
+            i |= ((pid.y & mask) << 1) << 2 * bit;
+            i |= ((pid.x & mask) << 2) << 2 * bit;
+        }
+    }
+    /*
+    else if (DECOMPOSITION_AXES == 3) {
         for (int bit = 0; bit <= 21; ++bit) {
             const uint64_t mask = 0x1l << bit;
             i |= ((pid.x & mask) << 0) << 2 * bit;
             i |= ((pid.y & mask) << 1) << 2 * bit;
             i |= ((pid.z & mask) << 2) << 2 * bit;
         }
-    }
+    }*/
     else if (DECOMPOSITION_AXES == 2) {
         for (int bit = 0; bit <= 21; ++bit) {
             const uint64_t mask = 0x1l << bit;
-            i |= ((pid.x & mask) << 0) << 1 * bit;
-            i |= ((pid.y & mask) << 1) << 1 * bit;
+            i |= ((pid.y & mask) << 0) << 1 * bit;
+            i |= ((pid.z & mask) << 1) << 1 * bit;
         }
     }
     else if (DECOMPOSITION_AXES == 1) {
         for (int bit = 0; bit <= 21; ++bit) {
             const uint64_t mask = 0x1l << bit;
-            i |= ((pid.x & mask) << 0) << 0 * bit;
+            i |= ((pid.z & mask) << 0) << 0 * bit;
         }
     }
     else {
@@ -1204,6 +1223,8 @@ typedef struct {
     CommData sidexy_data;
     CommData sidexz_data;
     CommData sideyz_data;
+
+    // int comm_cart;
 } Grid;
 
 static Grid grid = {};
@@ -1444,16 +1465,6 @@ acGridIntegrate(const Stream stream, const AcReal dt)
         acPackCommData(device, sideyz_b0s, &sideyz_data);
 #endif
 
-#if MPI_COMPUTE_ENABLED
-        //////////// INNER INTEGRATION //////////////
-        {
-            const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
-            const int3 m2 = nn;
-            acDeviceIntegrateSubstep(device, STREAM_16, isubstep, m1, m2, dt);
-        }
-////////////////////////////////////////////
-#endif // MPI_COMPUTE_ENABLED
-
 #if MPI_COMM_ENABLED
         MPI_Barrier(MPI_COMM_WORLD);
 
@@ -1474,7 +1485,19 @@ acGridIntegrate(const Stream stream, const AcReal dt)
         acTransferCommData(device, sidexy_b0s, &sidexy_data);
         acTransferCommData(device, sidexz_b0s, &sidexz_data);
         acTransferCommData(device, sideyz_b0s, &sideyz_data);
+#endif // MPI_COMM_ENABLED
 
+#if MPI_COMPUTE_ENABLED
+        //////////// INNER INTEGRATION //////////////
+        {
+            const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
+            const int3 m2 = nn;
+            acDeviceIntegrateSubstep(device, STREAM_16, isubstep, m1, m2, dt);
+        }
+////////////////////////////////////////////
+#endif // MPI_COMPUTE_ENABLED
+
+#if MPI_COMM_ENABLED
         // acTransferCommDataWait(corner_data); // Do not rm: required for corners
         acTransferCommDataWait(edgex_data);
         acTransferCommDataWait(edgey_data);
diff --git a/src/core/kernels/integration.cuh b/src/core/kernels/integration.cuh
index 97326ad..4c01148 100644
--- a/src/core/kernels/integration.cuh
+++ b/src/core/kernels/integration.cuh
@@ -134,7 +134,7 @@ acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferAr
     // RK3
     dim3 best_dims(0, 0, 0);
     float best_time          = INFINITY;
-    const int num_iterations = 5;
+    const int num_iterations = 10;
 
     for (int z = 1; z <= MAX_THREADS_PER_BLOCK; ++z) {
         for (int y = 1; y <= MAX_THREADS_PER_BLOCK; ++y) {

From fa422cf4575cb0a8eace8f0409693e881fc0f709 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Wed, 10 Jun 2020 02:16:23 +0300
Subject: [PATCH 04/10] Added a better-pipelined version of the acGridIntegrate
 and a switch for toggling the transfer of corners

---
 src/core/device.cc | 211 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 211 insertions(+)

diff --git a/src/core/device.cc b/src/core/device.cc
index 35af82d..481b465 100644
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -1373,9 +1373,220 @@ acGridStoreMesh(const Stream stream, AcMesh* host_mesh)
 
 #define MPI_COMPUTE_ENABLED (1)
 #define MPI_COMM_ENABLED (1)
+#define MPI_INCL_CORNERS (0)
 
 AcResult
 acGridIntegrate(const Stream stream, const AcReal dt)
+{
+    ERRCHK(grid.initialized);
+    acGridSynchronizeStream(stream);
+
+    const Device device = grid.device;
+    const int3 nn       = grid.nn;
+#if MPI_INCL_CORNERS
+    CommData corner_data = grid.corner_data; // Do not rm: required for corners
+#endif                                       // MPI_INCL_CORNERS
+    CommData edgex_data  = grid.edgex_data;
+    CommData edgey_data  = grid.edgey_data;
+    CommData edgez_data  = grid.edgez_data;
+    CommData sidexy_data = grid.sidexy_data;
+    CommData sidexz_data = grid.sidexz_data;
+    CommData sideyz_data = grid.sideyz_data;
+
+// Corners
+#if MPI_INCL_CORNERS
+    // Do not rm: required for corners
+    const int3 corner_b0s[] = {
+        (int3){0, 0, 0},
+        (int3){NGHOST + nn.x, 0, 0},
+        (int3){0, NGHOST + nn.y, 0},
+        (int3){0, 0, NGHOST + nn.z},
+
+        (int3){NGHOST + nn.x, NGHOST + nn.y, 0},
+        (int3){NGHOST + nn.x, 0, NGHOST + nn.z},
+        (int3){0, NGHOST + nn.y, NGHOST + nn.z},
+        (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST + nn.z},
+    };
+#endif // MPI_INCL_CORNERS
+
+    // Edges X
+    const int3 edgex_b0s[] = {
+        (int3){NGHOST, 0, 0},
+        (int3){NGHOST, NGHOST + nn.y, 0},
+
+        (int3){NGHOST, 0, NGHOST + nn.z},
+        (int3){NGHOST, NGHOST + nn.y, NGHOST + nn.z},
+    };
+
+    // Edges Y
+    const int3 edgey_b0s[] = {
+        (int3){0, NGHOST, 0},
+        (int3){NGHOST + nn.x, NGHOST, 0},
+
+        (int3){0, NGHOST, NGHOST + nn.z},
+        (int3){NGHOST + nn.x, NGHOST, NGHOST + nn.z},
+    };
+
+    // Edges Z
+    const int3 edgez_b0s[] = {
+        (int3){0, 0, NGHOST},
+        (int3){NGHOST + nn.x, 0, NGHOST},
+
+        (int3){0, NGHOST + nn.y, NGHOST},
+        (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST},
+    };
+
+    // Sides XY
+    const int3 sidexy_b0s[] = {
+        (int3){NGHOST, NGHOST, 0},             //
+        (int3){NGHOST, NGHOST, NGHOST + nn.z}, //
+    };
+
+    // Sides XZ
+    const int3 sidexz_b0s[] = {
+        (int3){NGHOST, 0, NGHOST},             //
+        (int3){NGHOST, NGHOST + nn.y, NGHOST}, //
+    };
+
+    // Sides YZ
+    const int3 sideyz_b0s[] = {
+        (int3){0, NGHOST, NGHOST},             //
+        (int3){NGHOST + nn.x, NGHOST, NGHOST}, //
+    };
+
+    for (int isubstep = 0; isubstep < 3; ++isubstep) {
+        acDeviceSynchronizeStream(device, STREAM_ALL);
+        MPI_Barrier(MPI_COMM_WORLD);
+
+#if MPI_COMPUTE_ENABLED
+        acPackCommData(device, sidexy_b0s, &sidexy_data);
+        acPackCommData(device, sidexz_b0s, &sidexz_data);
+        acPackCommData(device, sideyz_b0s, &sideyz_data);
+#endif // MPI_COMPUTE_ENABLED
+
+#if MPI_COMM_ENABLED
+        acTransferCommData(device, sidexy_b0s, &sidexy_data);
+        acTransferCommData(device, sidexz_b0s, &sidexz_data);
+        acTransferCommData(device, sideyz_b0s, &sideyz_data);
+#endif // MPI_COMM_ENABLED
+
+#if MPI_COMPUTE_ENABLED
+        //////////// INNER INTEGRATION //////////////
+        {
+            const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
+            const int3 m2 = nn;
+            acDeviceIntegrateSubstep(device, STREAM_16, isubstep, m1, m2, dt);
+        }
+
+        acPackCommData(device, edgex_b0s, &edgex_data);
+        acPackCommData(device, edgey_b0s, &edgey_data);
+        acPackCommData(device, edgez_b0s, &edgez_data);
+#endif // MPI_COMPUTE_ENABLED
+
+#if MPI_COMM_ENABLED
+        acTransferCommDataWait(sidexy_data);
+        acUnpinCommData(device, &sidexy_data);
+        acTransferCommDataWait(sidexz_data);
+        acUnpinCommData(device, &sidexz_data);
+        acTransferCommDataWait(sideyz_data);
+        acUnpinCommData(device, &sideyz_data);
+
+        acTransferCommData(device, edgex_b0s, &edgex_data);
+        acTransferCommData(device, edgey_b0s, &edgey_data);
+        acTransferCommData(device, edgez_b0s, &edgez_data);
+#endif // MPI_COMM_ENABLED
+
+#if MPI_COMPUTE_ENABLED
+#if MPI_INCL_CORNERS
+        acPackCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners
+#endif                                                    // MPI_INCL_CORNERS
+        acUnpackCommData(device, sidexy_b0s, &sidexy_data);
+        acUnpackCommData(device, sidexz_b0s, &sidexz_data);
+        acUnpackCommData(device, sideyz_b0s, &sideyz_data);
+#endif // MPI_COMPUTE_ENABLED
+
+#if MPI_COMM_ENABLED
+        acTransferCommDataWait(edgex_data);
+        acUnpinCommData(device, &edgex_data);
+        acTransferCommDataWait(edgey_data);
+        acUnpinCommData(device, &edgey_data);
+        acTransferCommDataWait(edgez_data);
+        acUnpinCommData(device, &edgez_data);
+
+#if MPI_INCL_CORNERS
+        acTransferCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners
+#endif                                                        // MPI_INCL_CORNERS
+#endif                                                        // MPI_COMM_ENABLED
+
+#if MPI_COMPUTE_ENABLED
+        acUnpackCommData(device, edgex_b0s, &edgex_data);
+        acUnpackCommData(device, edgey_b0s, &edgey_data);
+        acUnpackCommData(device, edgez_b0s, &edgez_data);
+#endif // MPI_COMPUTE_ENABLED
+
+#if MPI_COMM_ENABLED
+#if MPI_INCL_CORNERS
+        acTransferCommDataWait(corner_data);   // Do not rm: required for corners
+        acUnpinCommData(device, &corner_data); // Do not rm: required for corners
+#endif                                         // MPI_INCL_CORNERS
+#endif                                         // MPI_COMM_ENABLED
+#if MPI_COMPUTE_ENABLED
+#if MPI_INCL_CORNERS
+        acUnpackCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners
+#endif                                                      // MPI_INCL_CORNERS
+#endif                                                      // MPI_COMPUTE_ENABLED
+
+        // Wait for unpacking
+        acSyncCommData(sidexy_data);
+        acSyncCommData(sidexz_data);
+        acSyncCommData(sideyz_data);
+        acSyncCommData(edgex_data);
+        acSyncCommData(edgey_data);
+        acSyncCommData(edgez_data);
+#if MPI_INCL_CORNERS
+        acSyncCommData(corner_data); // Do not rm: required for corners
+#endif                               // MPI_INCL_CORNERS
+
+#if MPI_COMPUTE_ENABLED
+        { // Front
+            const int3 m1 = (int3){NGHOST, NGHOST, NGHOST};
+            const int3 m2 = m1 + (int3){nn.x, nn.y, NGHOST};
+            acDeviceIntegrateSubstep(device, STREAM_0, isubstep, m1, m2, dt);
+        }
+        { // Back
+            const int3 m1 = (int3){NGHOST, NGHOST, nn.z};
+            const int3 m2 = m1 + (int3){nn.x, nn.y, NGHOST};
+            acDeviceIntegrateSubstep(device, STREAM_1, isubstep, m1, m2, dt);
+        }
+        { // Bottom
+            const int3 m1 = (int3){NGHOST, NGHOST, 2 * NGHOST};
+            const int3 m2 = m1 + (int3){nn.x, NGHOST, nn.z - 2 * NGHOST};
+            acDeviceIntegrateSubstep(device, STREAM_2, isubstep, m1, m2, dt);
+        }
+        { // Top
+            const int3 m1 = (int3){NGHOST, nn.y, 2 * NGHOST};
+            const int3 m2 = m1 + (int3){nn.x, NGHOST, nn.z - 2 * NGHOST};
+            acDeviceIntegrateSubstep(device, STREAM_3, isubstep, m1, m2, dt);
+        }
+        { // Left
+            const int3 m1 = (int3){NGHOST, 2 * NGHOST, 2 * NGHOST};
+            const int3 m2 = m1 + (int3){NGHOST, nn.y - 2 * NGHOST, nn.z - 2 * NGHOST};
+            acDeviceIntegrateSubstep(device, STREAM_4, isubstep, m1, m2, dt);
+        }
+        { // Right
+            const int3 m1 = (int3){nn.x, 2 * NGHOST, 2 * NGHOST};
+            const int3 m2 = m1 + (int3){NGHOST, nn.y - 2 * NGHOST, nn.z - 2 * NGHOST};
+            acDeviceIntegrateSubstep(device, STREAM_5, isubstep, m1, m2, dt);
+        }
+#endif // MPI_COMPUTE_ENABLED
+        acDeviceSwapBuffers(device);
+    }
+
+    return AC_SUCCESS;
+}
+
+AcResult
+acGridIntegrateORIGINAL(const Stream stream, const AcReal dt)
 {
     ERRCHK(grid.initialized);
     // acGridSynchronizeStream(stream);

From 1cdb9e2ce7c3eb675bf466b6bd5e3a6432c956d3 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Wed, 10 Jun 2020 12:32:56 +0300
Subject: [PATCH 05/10] Added missing synchronization to the end of the new
 integration function

---
 src/core/device.cc | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/src/core/device.cc b/src/core/device.cc
index 481b465..f47f7a0 100644
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -12,6 +12,11 @@
 #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0]))
 #define MPI_GPUDIRECT_DISABLED (0)
 
+#define DECOMPOSITION_AXES (3)
+#define MPI_COMPUTE_ENABLED (1)
+#define MPI_COMM_ENABLED (1)
+#define MPI_INCL_CORNERS (0)
+
 AcResult
 acDevicePrintInfo(const Device device)
 {
@@ -518,8 +523,6 @@ mod(const int a, const int b)
     return r < 0 ? r + b : r;
 }
 
-#define DECOMPOSITION_AXES (3)
-
 static uint3_64
 morton3D(const uint64_t pid)
 {
@@ -1371,10 +1374,6 @@ acGridStoreMesh(const Stream stream, AcMesh* host_mesh)
     return AC_SUCCESS;
 }
 
-#define MPI_COMPUTE_ENABLED (1)
-#define MPI_COMM_ENABLED (1)
-#define MPI_INCL_CORNERS (0)
-
 AcResult
 acGridIntegrate(const Stream stream, const AcReal dt)
 {
@@ -1582,6 +1581,9 @@ acGridIntegrate(const Stream stream, const AcReal dt)
         acDeviceSwapBuffers(device);
     }
 
+    // Does not have to be STREAM_ALL, only the streams used with
+    // acDeviceIntegrateSubstep (less likely to break this way though)
+    acDeviceSynchronizeStream(device, STREAM_ALL); // Wait until inner and outer done
     return AC_SUCCESS;
 }
 

From 0e4b39d6d78210b57bd8b38dd09ec770bde2e1b4 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Thu, 11 Jun 2020 11:28:52 +0300
Subject: [PATCH 06/10] Added a toggle for using pinned memory

---
 src/core/device.cc | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/core/device.cc b/src/core/device.cc
index f47f7a0..01d5e87 100644
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -16,6 +16,7 @@
 #define MPI_COMPUTE_ENABLED (1)
 #define MPI_COMM_ENABLED (1)
 #define MPI_INCL_CORNERS (0)
+#define MPI_USE_PINNED (1)
 
 AcResult
 acDevicePrintInfo(const Device device)
@@ -1165,7 +1166,7 @@ acTransferCommData(const Device device, //
         const int npid = getPid(pid3d + neighbor, decomp);
 
         PackedData* dst = &data->dsts[b0_idx];
-        if (onTheSameNode(pid, npid)) {
+        if (onTheSameNode(pid, npid) || !MPI_USE_PINNED) {
             MPI_Irecv(dst->data, count, datatype, npid, b0_idx, //
                       MPI_COMM_WORLD, &data->recv_reqs[b0_idx]);
             dst->pinned = false;
@@ -1187,7 +1188,7 @@ acTransferCommData(const Device device, //
         const int npid = getPid(pid3d - neighbor, decomp);
 
         PackedData* src = &data->srcs[b0_idx];
-        if (onTheSameNode(pid, npid)) {
+        if (onTheSameNode(pid, npid) || !MPI_USE_PINNED) {
             cudaStreamSynchronize(data->streams[b0_idx]);
             MPI_Isend(src->data, count, datatype, npid, b0_idx, //
                       MPI_COMM_WORLD, &data->send_reqs[b0_idx]);

From f04e347c451c1555ce24b0e24c37cfcf699c6289 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Wed, 24 Jun 2020 15:13:15 +0300
Subject: [PATCH 07/10] Cleanup before merging to the master merge candidate
 branch

---
 samples/bwtest/CMakeLists.txt |   2 +-
 samples/bwtest/main.c         |  28 ++++
 src/core/CMakeLists.txt       |   2 +-
 src/core/device.cc            | 289 +++-------------------------------
 4 files changed, 52 insertions(+), 269 deletions(-)

diff --git a/samples/bwtest/CMakeLists.txt b/samples/bwtest/CMakeLists.txt
index cd4329f..229e7e2 100644
--- a/samples/bwtest/CMakeLists.txt
+++ b/samples/bwtest/CMakeLists.txt
@@ -5,5 +5,5 @@ find_package(OpenMP)
 find_package(CUDAToolkit)
 
 add_executable(bwtest main.c)
-target_link_libraries(bwtest MPI::MPI_C OpenMP::OpenMP_C CUDA::cudart_static)
+target_link_libraries(bwtest MPI::MPI_C OpenMP::OpenMP_C CUDA::cudart_static CUDA::cuda_driver)
 target_compile_options(bwtest PRIVATE -O3)
diff --git a/samples/bwtest/main.c b/samples/bwtest/main.c
index 73f4387..9dd25d9 100644
--- a/samples/bwtest/main.c
+++ b/samples/bwtest/main.c
@@ -7,6 +7,7 @@
 #include <mpi.h>
 
 #include <cuda_runtime_api.h>
+#include <cuda.h> // CUDA driver API
 
 #include "timer_hires.h" // From src/common
 
@@ -56,6 +57,17 @@ allocDevice(const size_t bytes)
 static uint8_t*
 allocDevicePinned(const size_t bytes)
 {
+    #define USE_CUDA_DRIVER_PINNING (1)
+    #if USE_CUDA_DRIVER_PINNING
+    uint8_t* arr = allocDevice(bytes);
+
+    unsigned int flag = 1;
+    CUresult retval = cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, (CUdeviceptr)arr);
+
+    errchk(retval == CUDA_SUCCESS);
+    return arr;
+
+    #else
     uint8_t* arr;
     // Standard (20 GiB/s internode, 85 GiB/s intranode)
     // const cudaError_t retval = cudaMalloc((void**)&arr, bytes);
@@ -65,8 +77,24 @@ allocDevicePinned(const size_t bytes)
     const cudaError_t retval = cudaMallocHost((void**)&arr, bytes);
     errchk(retval == cudaSuccess);
     return arr;
+    #endif
 }
 
+/*
+static uint8_t*
+allocDevicePinned(const size_t bytes)
+{
+    uint8_t* arr;
+    // Standard (20 GiB/s internode, 85 GiB/s intranode)
+    // const cudaError_t retval = cudaMalloc((void**)&arr, bytes);
+    // Unified mem (5 GiB/s internode, 6 GiB/s intranode)
+    // const cudaError_t retval = cudaMallocManaged((void**)&arr, bytes, cudaMemAttachGlobal);
+    // Pinned (40 GiB/s internode, 10 GiB/s intranode)
+    const cudaError_t retval = cudaMallocHost((void**)&arr, bytes);
+    errchk(retval == cudaSuccess);
+    return arr;
+}*/
+
 static void
 freeDevice(uint8_t* arr)
 {
diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index 757cbfe..81bcf14 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -2,7 +2,7 @@ find_package(CUDAToolkit)
 
 ## Astaroth Core
 add_library(astaroth_core STATIC device.cc node.cc astaroth.cc)
-target_link_libraries(astaroth_core astaroth_utils astaroth_kernels CUDA::cudart)
+target_link_libraries(astaroth_core astaroth_utils astaroth_kernels CUDA::cudart CUDA::cuda_driver)
 
 ## Options
 if (MPI_ENABLED)
diff --git a/src/core/device.cc b/src/core/device.cc
index 01d5e87..688ed89 100644
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -10,13 +10,16 @@
 #include "kernels/kernels.h"
 
 #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0]))
-#define MPI_GPUDIRECT_DISABLED (0)
 
-#define DECOMPOSITION_AXES (3)
+#define MPI_GPUDIRECT_DISABLED (0) // Buffer through host memory, deprecated
+#define MPI_DECOMPOSITION_AXES (3)
 #define MPI_COMPUTE_ENABLED (1)
 #define MPI_COMM_ENABLED (1)
 #define MPI_INCL_CORNERS (0)
-#define MPI_USE_PINNED (1)
+#define MPI_USE_PINNED (1) // Do inter-node comm with pinned memory
+#define MPI_USE_CUDA_DRIVER_PINNING (0) // Pin with cuPointerSetAttribute, otherwise cudaMallocHost
+
+#include <cuda.h> // CUDA driver API (needed if MPI_USE_CUDA_DRIVER_PINNING is set)
 
 AcResult
 acDevicePrintInfo(const Device device)
@@ -530,7 +533,7 @@ morton3D(const uint64_t pid)
     uint64_t i, j, k;
     i = j = k = 0;
 
-    if (DECOMPOSITION_AXES == 3) {
+    if (MPI_DECOMPOSITION_AXES == 3) {
         for (int bit = 0; bit <= 21; ++bit) {
             const uint64_t mask = 0x1l << 3 * bit;
             k |= ((pid & (mask << 0)) >> 2 * bit) >> 0;
@@ -538,32 +541,22 @@ morton3D(const uint64_t pid)
             i |= ((pid & (mask << 2)) >> 2 * bit) >> 2;
         }
     }
-    /*
-    else if (DECOMPOSITION_AXES == 3) {
-        for (int bit = 0; bit <= 21; ++bit) {
-            const uint64_t mask = 0x1l << 3 * bit;
-            i |= ((pid & (mask << 0)) >> 2 * bit) >> 0;
-            j |= ((pid & (mask << 1)) >> 2 * bit) >> 1;
-            k |= ((pid & (mask << 2)) >> 2 * bit) >> 2;
-        }
-    }
-    */
     // Just a quick copy/paste for other decomp dims
-    else if (DECOMPOSITION_AXES == 2) {
+    else if (MPI_DECOMPOSITION_AXES == 2) {
         for (int bit = 0; bit <= 21; ++bit) {
             const uint64_t mask = 0x1l << 2 * bit;
             j |= ((pid & (mask << 0)) >> 1 * bit) >> 0;
             k |= ((pid & (mask << 1)) >> 1 * bit) >> 1;
         }
     }
-    else if (DECOMPOSITION_AXES == 1) {
+    else if (MPI_DECOMPOSITION_AXES == 1) {
         for (int bit = 0; bit <= 21; ++bit) {
             const uint64_t mask = 0x1l << 1 * bit;
             k |= ((pid & (mask << 0)) >> 0 * bit) >> 0;
         }
     }
     else {
-        fprintf(stderr, "Invalid DECOMPOSITION_AXES\n");
+        fprintf(stderr, "Invalid MPI_DECOMPOSITION_AXES\n");
         ERRCHK_ALWAYS(0);
     }
 
@@ -575,7 +568,7 @@ morton1D(const uint3_64 pid)
 {
     uint64_t i = 0;
 
-    if (DECOMPOSITION_AXES == 3) {
+    if (MPI_DECOMPOSITION_AXES == 3) {
         for (int bit = 0; bit <= 21; ++bit) {
             const uint64_t mask = 0x1l << bit;
             i |= ((pid.z & mask) << 0) << 2 * bit;
@@ -583,64 +576,26 @@ morton1D(const uint3_64 pid)
             i |= ((pid.x & mask) << 2) << 2 * bit;
         }
     }
-    /*
-    else if (DECOMPOSITION_AXES == 3) {
-        for (int bit = 0; bit <= 21; ++bit) {
-            const uint64_t mask = 0x1l << bit;
-            i |= ((pid.x & mask) << 0) << 2 * bit;
-            i |= ((pid.y & mask) << 1) << 2 * bit;
-            i |= ((pid.z & mask) << 2) << 2 * bit;
-        }
-    }*/
-    else if (DECOMPOSITION_AXES == 2) {
+    else if (MPI_DECOMPOSITION_AXES == 2) {
         for (int bit = 0; bit <= 21; ++bit) {
             const uint64_t mask = 0x1l << bit;
             i |= ((pid.y & mask) << 0) << 1 * bit;
             i |= ((pid.z & mask) << 1) << 1 * bit;
         }
     }
-    else if (DECOMPOSITION_AXES == 1) {
+    else if (MPI_DECOMPOSITION_AXES == 1) {
         for (int bit = 0; bit <= 21; ++bit) {
             const uint64_t mask = 0x1l << bit;
             i |= ((pid.z & mask) << 0) << 0 * bit;
         }
     }
     else {
-        fprintf(stderr, "Invalid DECOMPOSITION_AXES\n");
+        fprintf(stderr, "Invalid MPI_DECOMPOSITION_AXES\n");
         ERRCHK_ALWAYS(0);
     }
 
     return i;
 }
-/*
-static uint3_64
-morton3D(const uint64_t pid)
-{
-    uint64_t i, j, k;
-    i = j = k = 0;
-    for (int bit = 0; bit <= 21; ++bit) {
-        const uint64_t mask = 0x1l << 3 * bit;
-        i |= ((pid & (mask << 0)) >> 2 * bit) >> 0;
-        j |= ((pid & (mask << 1)) >> 2 * bit) >> 1;
-        k |= ((pid & (mask << 2)) >> 2 * bit) >> 2;
-    }
-
-    return (uint3_64){i, j, k};
-}
-
-static uint64_t
-morton1D(const uint3_64 pid)
-{
-    uint64_t i = 0;
-    for (int bit = 0; bit <= 21; ++bit) {
-        const uint64_t mask = 0x1l << bit;
-        i |= ((pid.x & mask) << 0) << 2 * bit;
-        i |= ((pid.y & mask) << 1) << 2 * bit;
-        i |= ((pid.z & mask) << 2) << 2 * bit;
-    }
-    return i;
-}
-*/
 
 static uint3_64
 decompose(const uint64_t target)
@@ -701,9 +656,17 @@ acCreatePackedData(const int3 dims)
     const size_t bytes = dims.x * dims.y * dims.z * sizeof(data.data[0]) * NUM_VTXBUF_HANDLES;
     ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&data.data, bytes));
 
+    #if MPI_USE_CUDA_DRIVER_PINNING
+      ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&data.data_pinned, bytes));
+
+      unsigned int flag = 1;
+      CUresult retval = cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, (CUdeviceptr)data.data_pinned);
+      ERRCHK_ALWAYS(retval == CUDA_SUCCESS);
+    #else
     ERRCHK_CUDA_ALWAYS(cudaMallocHost((void**)&data.data_pinned, bytes));
     // ERRCHK_CUDA_ALWAYS(cudaMallocManaged((void**)&data.data_pinned, bytes)); // Significantly
     // slower than pinned (38 ms vs. 125 ms)
+    #fi // USE_CUDA_DRIVER_PINNING
 
     return data;
 }
@@ -1588,214 +1551,6 @@ acGridIntegrate(const Stream stream, const AcReal dt)
     return AC_SUCCESS;
 }
 
-AcResult
-acGridIntegrateORIGINAL(const Stream stream, const AcReal dt)
-{
-    ERRCHK(grid.initialized);
-    // acGridSynchronizeStream(stream);
-
-    const Device device = grid.device;
-    const int3 nn       = grid.nn;
-    // CommData corner_data = grid.corner_data; // Do not rm: required for corners
-    CommData edgex_data  = grid.edgex_data;
-    CommData edgey_data  = grid.edgey_data;
-    CommData edgez_data  = grid.edgez_data;
-    CommData sidexy_data = grid.sidexy_data;
-    CommData sidexz_data = grid.sidexz_data;
-    CommData sideyz_data = grid.sideyz_data;
-
-    acDeviceSynchronizeStream(device, stream);
-
-    // Corners
-    /*
-    // Do not rm: required for corners
-    const int3 corner_b0s[] = {
-        (int3){0, 0, 0},
-        (int3){NGHOST + nn.x, 0, 0},
-        (int3){0, NGHOST + nn.y, 0},
-        (int3){0, 0, NGHOST + nn.z},
-
-        (int3){NGHOST + nn.x, NGHOST + nn.y, 0},
-        (int3){NGHOST + nn.x, 0, NGHOST + nn.z},
-        (int3){0, NGHOST + nn.y, NGHOST + nn.z},
-        (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST + nn.z},
-    };
-    */
-
-    // Edges X
-    const int3 edgex_b0s[] = {
-        (int3){NGHOST, 0, 0},
-        (int3){NGHOST, NGHOST + nn.y, 0},
-
-        (int3){NGHOST, 0, NGHOST + nn.z},
-        (int3){NGHOST, NGHOST + nn.y, NGHOST + nn.z},
-    };
-
-    // Edges Y
-    const int3 edgey_b0s[] = {
-        (int3){0, NGHOST, 0},
-        (int3){NGHOST + nn.x, NGHOST, 0},
-
-        (int3){0, NGHOST, NGHOST + nn.z},
-        (int3){NGHOST + nn.x, NGHOST, NGHOST + nn.z},
-    };
-
-    // Edges Z
-    const int3 edgez_b0s[] = {
-        (int3){0, 0, NGHOST},
-        (int3){NGHOST + nn.x, 0, NGHOST},
-
-        (int3){0, NGHOST + nn.y, NGHOST},
-        (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST},
-    };
-
-    // Sides XY
-    const int3 sidexy_b0s[] = {
-        (int3){NGHOST, NGHOST, 0},             //
-        (int3){NGHOST, NGHOST, NGHOST + nn.z}, //
-    };
-
-    // Sides XZ
-    const int3 sidexz_b0s[] = {
-        (int3){NGHOST, 0, NGHOST},             //
-        (int3){NGHOST, NGHOST + nn.y, NGHOST}, //
-    };
-
-    // Sides YZ
-    const int3 sideyz_b0s[] = {
-        (int3){0, NGHOST, NGHOST},             //
-        (int3){NGHOST + nn.x, NGHOST, NGHOST}, //
-    };
-
-    for (int isubstep = 0; isubstep < 3; ++isubstep) {
-
-#if MPI_COMM_ENABLED
-        // acPackCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners
-        acPackCommData(device, edgex_b0s, &edgex_data);
-        acPackCommData(device, edgey_b0s, &edgey_data);
-        acPackCommData(device, edgez_b0s, &edgez_data);
-        acPackCommData(device, sidexy_b0s, &sidexy_data);
-        acPackCommData(device, sidexz_b0s, &sidexz_data);
-        acPackCommData(device, sideyz_b0s, &sideyz_data);
-#endif
-
-#if MPI_COMM_ENABLED
-        MPI_Barrier(MPI_COMM_WORLD);
-
-#if MPI_GPUDIRECT_DISABLED
-        // acTransferCommDataToHost(device, &corner_data); // Do not rm: required for corners
-        acTransferCommDataToHost(device, &edgex_data);
-        acTransferCommDataToHost(device, &edgey_data);
-        acTransferCommDataToHost(device, &edgez_data);
-        acTransferCommDataToHost(device, &sidexy_data);
-        acTransferCommDataToHost(device, &sidexz_data);
-        acTransferCommDataToHost(device, &sideyz_data);
-#endif
-
-        // acTransferCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners
-        acTransferCommData(device, edgex_b0s, &edgex_data);
-        acTransferCommData(device, edgey_b0s, &edgey_data);
-        acTransferCommData(device, edgez_b0s, &edgez_data);
-        acTransferCommData(device, sidexy_b0s, &sidexy_data);
-        acTransferCommData(device, sidexz_b0s, &sidexz_data);
-        acTransferCommData(device, sideyz_b0s, &sideyz_data);
-#endif // MPI_COMM_ENABLED
-
-#if MPI_COMPUTE_ENABLED
-        //////////// INNER INTEGRATION //////////////
-        {
-            const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
-            const int3 m2 = nn;
-            acDeviceIntegrateSubstep(device, STREAM_16, isubstep, m1, m2, dt);
-        }
-////////////////////////////////////////////
-#endif // MPI_COMPUTE_ENABLED
-
-#if MPI_COMM_ENABLED
-        // acTransferCommDataWait(corner_data); // Do not rm: required for corners
-        acTransferCommDataWait(edgex_data);
-        acTransferCommDataWait(edgey_data);
-        acTransferCommDataWait(edgez_data);
-        acTransferCommDataWait(sidexy_data);
-        acTransferCommDataWait(sidexz_data);
-        acTransferCommDataWait(sideyz_data);
-
-#if MPI_GPUDIRECT_DISABLED
-        // acTransferCommDataToDevice(device, &corner_data); // Do not rm: required for corners
-        acTransferCommDataToDevice(device, &edgex_data);
-        acTransferCommDataToDevice(device, &edgey_data);
-        acTransferCommDataToDevice(device, &edgez_data);
-        acTransferCommDataToDevice(device, &sidexy_data);
-        acTransferCommDataToDevice(device, &sidexz_data);
-        acTransferCommDataToDevice(device, &sideyz_data);
-#endif
-
-        // acUnpinCommData(device, &corner_data); // Do not rm: required for corners
-        acUnpinCommData(device, &edgex_data);
-        acUnpinCommData(device, &edgey_data);
-        acUnpinCommData(device, &edgez_data);
-        acUnpinCommData(device, &sidexy_data);
-        acUnpinCommData(device, &sidexz_data);
-        acUnpinCommData(device, &sideyz_data);
-
-        // acUnpackCommData(device, corner_b0s, &corner_data);
-        acUnpackCommData(device, edgex_b0s, &edgex_data);
-        acUnpackCommData(device, edgey_b0s, &edgey_data);
-        acUnpackCommData(device, edgez_b0s, &edgez_data);
-        acUnpackCommData(device, sidexy_b0s, &sidexy_data);
-        acUnpackCommData(device, sidexz_b0s, &sidexz_data);
-        acUnpackCommData(device, sideyz_b0s, &sideyz_data);
-        //////////// OUTER INTEGRATION //////////////
-
-        // Wait for unpacking
-        // acSyncCommData(corner_data); // Do not rm: required for corners
-        acSyncCommData(edgex_data);
-        acSyncCommData(edgey_data);
-        acSyncCommData(edgez_data);
-        acSyncCommData(sidexy_data);
-        acSyncCommData(sidexz_data);
-        acSyncCommData(sideyz_data);
-#endif // MPI_COMM_ENABLED
-#if MPI_COMPUTE_ENABLED
-        { // Front
-            const int3 m1 = (int3){NGHOST, NGHOST, NGHOST};
-            const int3 m2 = m1 + (int3){nn.x, nn.y, NGHOST};
-            acDeviceIntegrateSubstep(device, STREAM_0, isubstep, m1, m2, dt);
-        }
-        { // Back
-            const int3 m1 = (int3){NGHOST, NGHOST, nn.z};
-            const int3 m2 = m1 + (int3){nn.x, nn.y, NGHOST};
-            acDeviceIntegrateSubstep(device, STREAM_1, isubstep, m1, m2, dt);
-        }
-        { // Bottom
-            const int3 m1 = (int3){NGHOST, NGHOST, 2 * NGHOST};
-            const int3 m2 = m1 + (int3){nn.x, NGHOST, nn.z - 2 * NGHOST};
-            acDeviceIntegrateSubstep(device, STREAM_2, isubstep, m1, m2, dt);
-        }
-        { // Top
-            const int3 m1 = (int3){NGHOST, nn.y, 2 * NGHOST};
-            const int3 m2 = m1 + (int3){nn.x, NGHOST, nn.z - 2 * NGHOST};
-            acDeviceIntegrateSubstep(device, STREAM_3, isubstep, m1, m2, dt);
-        }
-        { // Left
-            const int3 m1 = (int3){NGHOST, 2 * NGHOST, 2 * NGHOST};
-            const int3 m2 = m1 + (int3){NGHOST, nn.y - 2 * NGHOST, nn.z - 2 * NGHOST};
-            acDeviceIntegrateSubstep(device, STREAM_4, isubstep, m1, m2, dt);
-        }
-        { // Right
-            const int3 m1 = (int3){nn.x, 2 * NGHOST, 2 * NGHOST};
-            const int3 m2 = m1 + (int3){NGHOST, nn.y - 2 * NGHOST, nn.z - 2 * NGHOST};
-            acDeviceIntegrateSubstep(device, STREAM_5, isubstep, m1, m2, dt);
-        }
-#endif // MPI_COMPUTE_ENABLED
-        acDeviceSwapBuffers(device);
-        acDeviceSynchronizeStream(device, STREAM_ALL); // Wait until inner and outer done
-        ////////////////////////////////////////////
-    }
-
-    return AC_SUCCESS;
-}
-
 AcResult
 acGridPeriodicBoundconds(const Stream stream)
 {

From 88f99c12e4d3a035bcc53db630adc3c8eeb4db52 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Wed, 24 Jun 2020 15:20:43 +0300
Subject: [PATCH 08/10] Fixed #fi -> #endif

---
 src/core/device.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/core/device.cc b/src/core/device.cc
index 688ed89..181e802 100644
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -666,7 +666,7 @@ acCreatePackedData(const int3 dims)
     ERRCHK_CUDA_ALWAYS(cudaMallocHost((void**)&data.data_pinned, bytes));
     // ERRCHK_CUDA_ALWAYS(cudaMallocManaged((void**)&data.data_pinned, bytes)); // Significantly
     // slower than pinned (38 ms vs. 125 ms)
-    #fi // USE_CUDA_DRIVER_PINNING
+    #endif // USE_CUDA_DRIVER_PINNING
 
     return data;
 }

From 3c3b2a188593f8b2900012788b46f2b3f50af451 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Wed, 24 Jun 2020 15:35:19 +0300
Subject: [PATCH 09/10] Reverted the default settings to what they were before
 merge. Note: LFORCING (1) is potentially not tested properly, TODO recheck.

---
 CMakeLists.txt                   | 4 ++--
 acc/mhd_solver/stencil_kernel.ac | 4 ++--
 config/astaroth.conf             | 6 +++---
 src/utils/modelsolver.c          | 4 ++--
 4 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 04100bc..682be55 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -10,7 +10,7 @@ project(astaroth C CXX CUDA)
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR})
 
 ## Project-wide compilation flags
-set(COMMON_FLAGS "-mavx -Wall -Wextra -Wdouble-promotion -Wfloat-conversion -Wshadow")
+set(COMMON_FLAGS "-mavx -Wall -Wextra -Werror -Wdouble-promotion -Wfloat-conversion -Wshadow")
 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${COMMON_FLAGS}")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COMMON_FLAGS}")
 set(CMAKE_C_STANDARD 11)
@@ -19,7 +19,7 @@ set(CMAKE_CXX_STANDARD 11)
 find_package(CUDA) # Still required for various macros, such as cuda_select_nvcc_...
 cuda_select_nvcc_arch_flags(ARCHLIST Common) # Common architectures depend on the available CUDA version. Listed here: https://github.com/Kitware/CMake/blob/master/Modules/FindCUDA/select_compute_arch.cmake
 string(REPLACE ";" " " CUDA_ARCH_FLAGS "${ARCHLIST}")
-set(COMMON_FLAGS_CUDA "-mavx,-Wall,-Wextra,-Wdouble-promotion,-Wfloat-conversion,-Wshadow")
+set(COMMON_FLAGS_CUDA "-mavx,-Wall,-Wextra,-Werror,-Wdouble-promotion,-Wfloat-conversion,-Wshadow")
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${CUDA_ARCH_FLAGS} -ccbin=${CMAKE_CXX_COMPILER} --compiler-options=${COMMON_FLAGS_CUDA}")
 
 
diff --git a/acc/mhd_solver/stencil_kernel.ac b/acc/mhd_solver/stencil_kernel.ac
index 0f37ea5..905eb65 100644
--- a/acc/mhd_solver/stencil_kernel.ac
+++ b/acc/mhd_solver/stencil_kernel.ac
@@ -5,8 +5,8 @@
 #define LMAGNETIC (1)
 #define LENTROPY (1)
 #define LTEMPERATURE (0)
-#define LFORCING (0)
-#define LUPWD (0)
+#define LFORCING (1)
+#define LUPWD (1)
 #define LSINK (0)
 
 #define AC_THERMAL_CONDUCTIVITY (0.001) // TODO: make an actual config parameter
diff --git a/config/astaroth.conf b/config/astaroth.conf
index 83e93d9..190948b 100644
--- a/config/astaroth.conf
+++ b/config/astaroth.conf
@@ -5,9 +5,9 @@
  * "Compile-time" params
  * =============================================================================
  */
-AC_nx = 256
-AC_ny = 256
-AC_nz = 256
+AC_nx = 128
+AC_ny = 128
+AC_nz = 128
 
 AC_dsx = 0.04908738521
 AC_dsy = 0.04908738521
diff --git a/src/utils/modelsolver.c b/src/utils/modelsolver.c
index e482bd9..d9446eb 100644
--- a/src/utils/modelsolver.c
+++ b/src/utils/modelsolver.c
@@ -38,8 +38,8 @@
 #define LMAGNETIC (1)
 #define LENTROPY (1)
 #define LTEMPERATURE (0)
-#define LFORCING (0)
-#define LUPWD (0)
+#define LFORCING (1)
+#define LUPWD (1)
 #define AC_THERMAL_CONDUCTIVITY ((Scalar)(0.001)) // TODO: make an actual config parameter
 
 typedef AcReal Scalar;

From 0d1c5b3911f502c3fe8a7d91b344a81921a2d822 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Wed, 24 Jun 2020 15:56:30 +0300
Subject: [PATCH 10/10] Autoformatted

---
 samples/bwtest/main.c           | 47 ++++++++++++--------
 src/core/device.cc              | 21 ++++-----
 src/core/kernels/kernels.cu     | 25 ++++++-----
 src/core/kernels/reductions.cuh |  5 ++-
 src/utils/modelsolver.c         | 78 ++++++++++++++++++++++-----------
 5 files changed, 108 insertions(+), 68 deletions(-)

diff --git a/samples/bwtest/main.c b/samples/bwtest/main.c
index 9dd25d9..35d98d5 100644
--- a/samples/bwtest/main.c
+++ b/samples/bwtest/main.c
@@ -6,15 +6,21 @@
 
 #include <mpi.h>
 
-#include <cuda_runtime_api.h>
 #include <cuda.h> // CUDA driver API
+#include <cuda_runtime_api.h>
 
 #include "timer_hires.h" // From src/common
 
 //#define BLOCK_SIZE (100 * 1024 * 1024) // Bytes
 #define BLOCK_SIZE (256 * 256 * 3 * 8 * 8)
 
-#define errchk(x) { if (!(x)) { fprintf(stderr, "errchk(%s) failed", #x); assert(x); }}
+#define errchk(x)                                                                                  \
+    {                                                                                              \
+        if (!(x)) {                                                                                \
+            fprintf(stderr, "errchk(%s) failed", #x);                                              \
+            assert(x);                                                                             \
+        }                                                                                          \
+    }
 
 /*
   Findings:
@@ -57,17 +63,18 @@ allocDevice(const size_t bytes)
 static uint8_t*
 allocDevicePinned(const size_t bytes)
 {
-    #define USE_CUDA_DRIVER_PINNING (1)
-    #if USE_CUDA_DRIVER_PINNING
+#define USE_CUDA_DRIVER_PINNING (1)
+#if USE_CUDA_DRIVER_PINNING
     uint8_t* arr = allocDevice(bytes);
 
     unsigned int flag = 1;
-    CUresult retval = cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, (CUdeviceptr)arr);
+    CUresult retval   = cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
+                                            (CUdeviceptr)arr);
 
     errchk(retval == CUDA_SUCCESS);
     return arr;
 
-    #else
+#else
     uint8_t* arr;
     // Standard (20 GiB/s internode, 85 GiB/s intranode)
     // const cudaError_t retval = cudaMalloc((void**)&arr, bytes);
@@ -77,7 +84,7 @@ allocDevicePinned(const size_t bytes)
     const cudaError_t retval = cudaMallocHost((void**)&arr, bytes);
     errchk(retval == cudaSuccess);
     return arr;
-    #endif
+#endif
 }
 
 /*
@@ -267,7 +274,6 @@ send_h2d(uint8_t* src, uint8_t* dst)
     cudaMemcpy(dst, src, BLOCK_SIZE, cudaMemcpyHostToDevice);
 }
 
-
 static void
 sendrecv_d2h2d(uint8_t* dsrc, uint8_t* hdst, uint8_t* hsrc, uint8_t* ddst)
 {
@@ -327,10 +333,10 @@ measurebw(const char* msg, const size_t bytes, void (*sendrecv)(uint8_t*, uint8_
     MPI_Barrier(MPI_COMM_WORLD);
 }
 
-
 static void
-measurebw2(const char* msg, const size_t bytes, void (*sendrecv)(uint8_t*, uint8_t*, uint8_t*, uint8_t*), uint8_t* dsrc, uint8_t* hdst,
-                                                                                                            uint8_t* hsrc, uint8_t* ddst)
+measurebw2(const char* msg, const size_t bytes,
+           void (*sendrecv)(uint8_t*, uint8_t*, uint8_t*, uint8_t*), uint8_t* dsrc, uint8_t* hdst,
+           uint8_t* hsrc, uint8_t* ddst)
 {
     const size_t num_samples = 100;
 
@@ -414,8 +420,8 @@ main(void)
         measurebw("Bidirectional bandwidth, twoway (Host)", //
                   2 * BLOCK_SIZE, sendrecv_twoway, src, dst);
         measurebw("Bidirectional bandwidth, async multiple (Host)", //
-                  2 * (nprocs-1) * BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);
-        //measurebw("Bidirectional bandwidth, async multiple parallel (Host)", //
+                  2 * (nprocs - 1) * BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);
+        // measurebw("Bidirectional bandwidth, async multiple parallel (Host)", //
         //          2 * (nprocs-1) * BLOCK_SIZE, sendrecv_nonblocking_multiple_parallel, src, dst);
 
         freeHost(src);
@@ -434,11 +440,12 @@ main(void)
         measurebw("Bidirectional bandwidth, twoway (Device)", //
                   2 * BLOCK_SIZE, sendrecv_twoway, src, dst);
         measurebw("Bidirectional bandwidth, async multiple (Device)", //
-                  2 *  (nprocs-1) *BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);
-        //measurebw("Bidirectional bandwidth, async multiple parallel (Device)", //
+                  2 * (nprocs - 1) * BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);
+        // measurebw("Bidirectional bandwidth, async multiple parallel (Device)", //
         //          2 *  (nprocs-1) *BLOCK_SIZE, sendrecv_nonblocking_multiple_parallel, src, dst);
         measurebw("Bidirectional bandwidth, async multiple (Device, rt pinning)", //
-                  2 *  (nprocs-1) *BLOCK_SIZE, sendrecv_nonblocking_multiple_rt_pinning, src, dst);
+                  2 * (nprocs - 1) * BLOCK_SIZE, sendrecv_nonblocking_multiple_rt_pinning, src,
+                  dst);
 
         freeDevice(src);
         freeDevice(dst);
@@ -456,7 +463,7 @@ main(void)
         measurebw("Bidirectional bandwidth, twoway (Device, pinned)", //
                   2 * BLOCK_SIZE, sendrecv_twoway, src, dst);
         measurebw("Bidirectional bandwidth, async multiple (Device, pinned)", //
-                  2 *  (nprocs-1) *BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);
+                  2 * (nprocs - 1) * BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);
 
         freeDevice(src);
         freeDevice(dst);
@@ -472,7 +479,8 @@ main(void)
         measurebw("Unidirectional D2H", BLOCK_SIZE, send_d2h, dsrc, hdst);
         measurebw("Unidirectional H2D", BLOCK_SIZE, send_h2d, hsrc, ddst);
 
-        measurebw2("Bidirectional D2H & H2D", 2 * BLOCK_SIZE, sendrecv_d2h2d, dsrc, hdst, hsrc, ddst);
+        measurebw2("Bidirectional D2H & H2D", 2 * BLOCK_SIZE, sendrecv_d2h2d, dsrc, hdst, hsrc,
+                   ddst);
 
         freeDevice(dsrc);
         freeDevice(ddst);
@@ -490,7 +498,8 @@ main(void)
         measurebw("Unidirectional D2H (pinned)", BLOCK_SIZE, send_d2h, dsrc, hdst);
         measurebw("Unidirectional H2D (pinned)", BLOCK_SIZE, send_h2d, hsrc, ddst);
 
-        measurebw2("Bidirectional D2H & H2D (pinned)", 2 * BLOCK_SIZE, sendrecv_d2h2d, dsrc, hdst, hsrc, ddst);
+        measurebw2("Bidirectional D2H & H2D (pinned)", 2 * BLOCK_SIZE, sendrecv_d2h2d, dsrc, hdst,
+                   hsrc, ddst);
 
         freeDevice(dsrc);
         freeDevice(ddst);
diff --git a/src/core/device.cc b/src/core/device.cc
index 181e802..8cda677 100644
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -16,7 +16,7 @@
 #define MPI_COMPUTE_ENABLED (1)
 #define MPI_COMM_ENABLED (1)
 #define MPI_INCL_CORNERS (0)
-#define MPI_USE_PINNED (1) // Do inter-node comm with pinned memory
+#define MPI_USE_PINNED (1)              // Do inter-node comm with pinned memory
 #define MPI_USE_CUDA_DRIVER_PINNING (0) // Pin with cuPointerSetAttribute, otherwise cudaMallocHost
 
 #include <cuda.h> // CUDA driver API (needed if MPI_USE_CUDA_DRIVER_PINNING is set)
@@ -656,17 +656,18 @@ acCreatePackedData(const int3 dims)
     const size_t bytes = dims.x * dims.y * dims.z * sizeof(data.data[0]) * NUM_VTXBUF_HANDLES;
     ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&data.data, bytes));
 
-    #if MPI_USE_CUDA_DRIVER_PINNING
-      ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&data.data_pinned, bytes));
+#if MPI_USE_CUDA_DRIVER_PINNING
+    ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&data.data_pinned, bytes));
 
-      unsigned int flag = 1;
-      CUresult retval = cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, (CUdeviceptr)data.data_pinned);
-      ERRCHK_ALWAYS(retval == CUDA_SUCCESS);
-    #else
+    unsigned int flag = 1;
+    CUresult retval   = cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
+                                            (CUdeviceptr)data.data_pinned);
+    ERRCHK_ALWAYS(retval == CUDA_SUCCESS);
+#else
     ERRCHK_CUDA_ALWAYS(cudaMallocHost((void**)&data.data_pinned, bytes));
-    // ERRCHK_CUDA_ALWAYS(cudaMallocManaged((void**)&data.data_pinned, bytes)); // Significantly
-    // slower than pinned (38 ms vs. 125 ms)
-    #endif // USE_CUDA_DRIVER_PINNING
+// ERRCHK_CUDA_ALWAYS(cudaMallocManaged((void**)&data.data_pinned, bytes)); // Significantly
+// slower than pinned (38 ms vs. 125 ms)
+#endif // USE_CUDA_DRIVER_PINNING
 
     return data;
 }
diff --git a/src/core/kernels/kernels.cu b/src/core/kernels/kernels.cu
index 3c59486..2878f38 100644
--- a/src/core/kernels/kernels.cu
+++ b/src/core/kernels/kernels.cu
@@ -75,17 +75,20 @@ exp(const acComplex& val)
 {
     return acComplex(exp(val.x) * cos(val.y), exp(val.x) * sin(val.y));
 }
-static __device__ inline acComplex operator*(const AcReal& a, const acComplex& b)
+static __device__ inline acComplex
+operator*(const AcReal& a, const acComplex& b)
 {
     return (acComplex){a * b.x, a * b.y};
 }
 
-static __device__ inline acComplex operator*(const acComplex& b, const AcReal& a)
+static __device__ inline acComplex
+operator*(const acComplex& b, const AcReal& a)
 {
     return (acComplex){a * b.x, a * b.y};
 }
 
-static __device__ inline acComplex operator*(const acComplex& a, const acComplex& b)
+static __device__ inline acComplex
+operator*(const acComplex& a, const acComplex& b)
 {
     return (acComplex){a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x};
 }
@@ -116,7 +119,7 @@ acDeviceLoadScalarUniform(const Device device, const Stream stream, const AcReal
 
     const size_t offset = (size_t)&d_mesh_info.real_params[param] - (size_t)&d_mesh_info;
     ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
-                                               cudaMemcpyHostToDevice, device->streams[stream]));
+                                        cudaMemcpyHostToDevice, device->streams[stream]));
     return AC_SUCCESS;
 }
 
@@ -141,7 +144,7 @@ acDeviceLoadVectorUniform(const Device device, const Stream stream, const AcReal
 
     const size_t offset = (size_t)&d_mesh_info.real3_params[param] - (size_t)&d_mesh_info;
     ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
-                                               cudaMemcpyHostToDevice, device->streams[stream]));
+                                        cudaMemcpyHostToDevice, device->streams[stream]));
     return AC_SUCCESS;
 }
 
@@ -165,7 +168,7 @@ acDeviceLoadIntUniform(const Device device, const Stream stream, const AcIntPara
 
     const size_t offset = (size_t)&d_mesh_info.int_params[param] - (size_t)&d_mesh_info;
     ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
-                                               cudaMemcpyHostToDevice, device->streams[stream]));
+                                        cudaMemcpyHostToDevice, device->streams[stream]));
     return AC_SUCCESS;
 }
 
@@ -179,10 +182,10 @@ acDeviceLoadInt3Uniform(const Device device, const Stream stream, const AcInt3Pa
     }
 
     if (!is_valid(value.x) || !is_valid(value.y) || !is_valid(value.z)) {
-        fprintf(
-            stderr,
-            "WARNING: Passed an invalid value (%d, %d, %def) to device constant %s. Skipping.\n",
-            value.x, value.y, value.z, int3param_names[param]);
+        fprintf(stderr,
+                "WARNING: Passed an invalid value (%d, %d, %def) to device constant %s. "
+                "Skipping.\n",
+                value.x, value.y, value.z, int3param_names[param]);
         return AC_FAILURE;
     }
 
@@ -229,7 +232,7 @@ acDeviceLoadDefaultUniforms(const Device device)
 {
     cudaSetDevice(device->id);
 
-    // clang-format off
+// clang-format off
     // Scalar
     #define LOAD_DEFAULT_UNIFORM(X) acDeviceLoadScalarUniform(device, STREAM_DEFAULT, X, X##_DEFAULT_VALUE);
     AC_FOR_USER_REAL_PARAM_TYPES(LOAD_DEFAULT_UNIFORM)
diff --git a/src/core/kernels/reductions.cuh b/src/core/kernels/reductions.cuh
index 8877e7e..1f40df3 100644
--- a/src/core/kernels/reductions.cuh
+++ b/src/core/kernels/reductions.cuh
@@ -92,8 +92,9 @@ kernel_filter_vec(const __restrict__ AcReal* src0, const __restrict__ AcReal* sr
     assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
     assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);
 
-    dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(
-        src0[IDX(src_idx)], src1[IDX(src_idx)], src2[IDX(src_idx)]);
+    dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(src0[IDX(src_idx)],
+                                                                   src1[IDX(src_idx)],
+                                                                   src2[IDX(src_idx)]);
 }
 
 template <ReduceFunc reduce>
diff --git a/src/utils/modelsolver.c b/src/utils/modelsolver.c
index d9446eb..4c2edcf 100644
--- a/src/utils/modelsolver.c
+++ b/src/utils/modelsolver.c
@@ -103,11 +103,16 @@ first_derivative(const Scalar* pencil, const Scalar inv_ds)
 #elif STENCIL_ORDER == 4
     const Scalar coefficients[] = {0, (Scalar)(2.0 / 3.0), (Scalar)(-1.0 / 12.0)};
 #elif STENCIL_ORDER == 6
-    const Scalar coefficients[] = {0, (Scalar)(3.0 / 4.0), (Scalar)(-3.0 / 20.0),
-                                   (Scalar)(1.0 / 60.0)};
+    const Scalar coefficients[] = {
+        0,
+        (Scalar)(3.0 / 4.0),
+        (Scalar)(-3.0 / 20.0),
+        (Scalar)(1.0 / 60.0),
+    };
 #elif STENCIL_ORDER == 8
-    const Scalar coefficients[] = {0, (Scalar)(4.0 / 5.0), (Scalar)(-1.0 / 5.0),
-                                   (Scalar)(4.0 / 105.0), (Scalar)(-1.0 / 280.0)};
+    const Scalar coefficients[] = {
+        0, (Scalar)(4.0 / 5.0), (Scalar)(-1.0 / 5.0), (Scalar)(4.0 / 105.0), (Scalar)(-1.0 / 280.0),
+    };
 #endif
 
 #define MID (STENCIL_ORDER / 2)
@@ -126,15 +131,23 @@ second_derivative(const Scalar* pencil, const Scalar inv_ds)
 #if STENCIL_ORDER == 2
     const Scalar coefficients[] = {-2, 1};
 #elif STENCIL_ORDER == 4
-    const Scalar coefficients[] = {(Scalar)(-5.0 / 2.0), (Scalar)(4.0 / 3.0),
-                                   (Scalar)(-1.0 / 12.0)};
+    const Scalar coefficients[] = {
+        (Scalar)(-5.0 / 2.0),
+        (Scalar)(4.0 / 3.0),
+        (Scalar)(-1.0 / 12.0),
+    };
 #elif STENCIL_ORDER == 6
-    const Scalar coefficients[] = {(Scalar)(-49.0 / 18.0), (Scalar)(3.0 / 2.0),
-                                   (Scalar)(-3.0 / 20.0), (Scalar)(1.0 / 90.0)};
+    const Scalar coefficients[] = {
+        (Scalar)(-49.0 / 18.0),
+        (Scalar)(3.0 / 2.0),
+        (Scalar)(-3.0 / 20.0),
+        (Scalar)(1.0 / 90.0),
+    };
 #elif STENCIL_ORDER == 8
-    const Scalar coefficients[] = {(Scalar)(-205.0 / 72.0), (Scalar)(8.0 / 5.0),
-                                   (Scalar)(-1.0 / 5.0), (Scalar)(8.0 / 315.0),
-                                   (Scalar)(-1.0 / 560.0)};
+    const Scalar coefficients[] = {
+        (Scalar)(-205.0 / 72.0), (Scalar)(8.0 / 5.0),    (Scalar)(-1.0 / 5.0),
+        (Scalar)(8.0 / 315.0),   (Scalar)(-1.0 / 560.0),
+    };
 #endif
 
 #define MID (STENCIL_ORDER / 2)
@@ -156,16 +169,27 @@ cross_derivative(const Scalar* pencil_a, const Scalar* pencil_b, const Scalar in
     const Scalar coefficients[] = {0, (Scalar)(1.0 / 4.0)};
 #elif STENCIL_ORDER == 4
     const Scalar coefficients[] = {
-        0, (Scalar)(1.0 / 32.0),
-        (Scalar)(1.0 / 64.0)}; // TODO correct coefficients, these are just placeholders
+        0,
+        (Scalar)(1.0 / 32.0),
+        (Scalar)(1.0 / 64.0),
+    }; // TODO correct coefficients, these are just placeholders
 #elif STENCIL_ORDER == 6
     const Scalar fac            = ((Scalar)(1. / 720.));
-    const Scalar coefficients[] = {0 * fac, (Scalar)(270.0) * fac, (Scalar)(-27.0) * fac,
-                                   (Scalar)(2.0) * fac};
+    const Scalar coefficients[] = {
+        0 * fac,
+        (Scalar)(270.0) * fac,
+        (Scalar)(-27.0) * fac,
+        (Scalar)(2.0) * fac,
+    };
 #elif STENCIL_ORDER == 8
     const Scalar fac            = ((Scalar)(1. / 20160.));
-    const Scalar coefficients[] = {0 * fac, (Scalar)(8064.) * fac, (Scalar)(-1008.) * fac,
-                                   (Scalar)(128.) * fac, (Scalar)(-9.) * fac};
+    const Scalar coefficients[] = {
+        0 * fac,
+        (Scalar)(8064.) * fac,
+        (Scalar)(-1008.) * fac,
+        (Scalar)(128.) * fac,
+        (Scalar)(-9.) * fac,
+    };
 #endif
 
 #define MID (STENCIL_ORDER / 2)
@@ -207,14 +231,14 @@ derxy(const int i, const int j, const int k, const Scalar* arr)
     Scalar pencil_a[STENCIL_ORDER + 1];
     //#pragma unroll
     for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
-        pencil_a[offset] = arr[IDX(i + offset - STENCIL_ORDER / 2, j + offset - STENCIL_ORDER / 2,
-                                   k)];
+        pencil_a[offset] = arr[IDX(i + offset - STENCIL_ORDER / 2, //
+                                   j + offset - STENCIL_ORDER / 2, k)];
 
     Scalar pencil_b[STENCIL_ORDER + 1];
     //#pragma unroll
     for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
-        pencil_b[offset] = arr[IDX(i + offset - STENCIL_ORDER / 2, j + STENCIL_ORDER / 2 - offset,
-                                   k)];
+        pencil_b[offset] = arr[IDX(i + offset - STENCIL_ORDER / 2, //
+                                   j + STENCIL_ORDER / 2 - offset, k)];
 
     return cross_derivative(pencil_a, pencil_b, getReal(AC_inv_dsx), getReal(AC_inv_dsy));
 }
@@ -539,7 +563,8 @@ gradient_of_divergence(const VectorData vec)
     return (Vector){
         hessian(vec.xdata).row[0][0] + hessian(vec.ydata).row[0][1] + hessian(vec.zdata).row[0][2],
         hessian(vec.xdata).row[1][0] + hessian(vec.ydata).row[1][1] + hessian(vec.zdata).row[1][2],
-        hessian(vec.xdata).row[2][0] + hessian(vec.ydata).row[2][1] + hessian(vec.zdata).row[2][2]};
+        hessian(vec.xdata).row[2][0] + hessian(vec.ydata).row[2][1] + hessian(vec.zdata).row[2][2],
+    };
 }
 
 // Takes uu gradients and returns S
@@ -805,10 +830,11 @@ forcing(int3 globalVertexIdx, Scalar dt)
                                        getInt(AC_ny) * getReal(AC_dsy),
                                        getInt(AC_nz) * getReal(AC_dsz)}; // source (origin)
     (void)a;                                                             // WARNING: not used
-    Vector xx        = (Vector){(globalVertexIdx.x - getInt(AC_nx_min)) * getReal(AC_dsx),
-                         (globalVertexIdx.y - getInt(AC_ny_min)) * getReal(AC_dsy),
-                         (globalVertexIdx.z - getInt(AC_nz_min)) *
-                             getReal(AC_dsz)}; // sink (current index)
+    Vector xx = (Vector){
+        (globalVertexIdx.x - getInt(AC_nx_min)) * getReal(AC_dsx),
+        (globalVertexIdx.y - getInt(AC_ny_min)) * getReal(AC_dsy),
+        (globalVertexIdx.z - getInt(AC_nz_min)) * getReal(AC_dsz),
+    }; // sink (current index)
     const Scalar cs2 = getReal(AC_cs2_sound);
     const Scalar cs  = sqrt(cs2);