From 9065381b2a0dbebd224e8df1291c3f537343d391 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Mon, 30 Mar 2020 18:01:35 +0300
Subject: [PATCH 01/89] Added the configuration used for benchmarking (not to
 be merged to master)

---
 CMakeLists.txt       | 10 +++++-----
 config/astaroth.conf | 14 +++++++-------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9fe066c..328c23b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -28,11 +28,11 @@ endif()
 message(STATUS "Build type: " ${CMAKE_BUILD_TYPE})
 
 ## Options
-option(DOUBLE_PRECISION "Generates double precision code."                    OFF)
-option(BUILD_SAMPLES    "Builds projects in samples subdirectory."            OFF)
-option(BUILD_STANDALONE "Builds standalone Astaroth."                         ON)
-option(MPI_ENABLED      "Enables additional functions for MPI communciation." OFF)
-option(MULTIGPU_ENABLED "Enables multi-GPU on a single node. Uses peer-to-peer communication instead of MPI. Affects Legacy & Node layers only." ON)
+option(DOUBLE_PRECISION "Generates double precision code."                    ON)
+option(BUILD_SAMPLES    "Builds projects in samples subdirectory."            ON)
+option(BUILD_STANDALONE "Builds standalone Astaroth."                         OFF)
+option(MPI_ENABLED      "Enables additional functions for MPI communciation." ON)
+option(MULTIGPU_ENABLED "Enables multi-GPU on a single node. Uses peer-to-peer communication instead of MPI. Affects Legacy & Node layers only." OFF)
 
 ## Options (DEPRECATED)
 # option(BUILD_DEBUG              "Builds the program with extensive error checking"          OFF)
diff --git a/config/astaroth.conf b/config/astaroth.conf
index abc1613..6c34cb8 100644
--- a/config/astaroth.conf
+++ b/config/astaroth.conf
@@ -5,9 +5,9 @@
  * "Compile-time" params
  * =============================================================================
  */
-AC_nx = 128
-AC_ny = 128
-AC_nz = 128
+AC_nx = 512
+AC_ny = 512
+AC_nz = 512
 
 AC_dsx = 0.04908738521
 AC_dsy = 0.04908738521
@@ -24,11 +24,11 @@ AC_bin_steps = 1000
 AC_bin_save_t = 1e666
 
 // Set to 0 if you want to run the simulation from the beginning, or just a new
-// simulation. If continuing from a saved step, specify the step number here.  
-AC_start_step = 0 
+// simulation. If continuing from a saved step, specify the step number here.
+AC_start_step = 0
 
 // Maximum time in code units. If negative, there is no time limit
-AC_max_time = -1.0 
+AC_max_time = -1.0
 
 // Hydro
 AC_cdt = 0.4
@@ -49,7 +49,7 @@ AC_forcing_magnitude = 1e-5
 AC_kmin              = 0.8
 AC_kmax              = 1.2
 // Switches forcing off and accretion on
-AC_switch_accretion  = 0 
+AC_switch_accretion  = 0
 
 // Entropy
 AC_cp_sound = 1.0

From 24e65ab02db13c587f32b83513cd92ff1c428bbb Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Mon, 30 Mar 2020 18:13:50 +0300
Subject: [PATCH 02/89] Set decompositions for some nprocs by hand

---
 src/core/device.cc | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/core/device.cc b/src/core/device.cc
index 8c78b09..8e97772 100644
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -495,6 +495,10 @@ decompose(const int target)
         return (int3){4, 2, 2};
     if (target == 32)
         return (int3){4, 4, 2};
+    if (target == 128)
+        return (int3){8, 4, 4};
+    if (target == 256)
+        return (int3){8, 8, 4};
 
     int decomposition[] = {1, 1, 1};
 

From 742dcc26977fd9b1c26de53624623f50254bda90 Mon Sep 17 00:00:00 2001
From: Johannes Pekkila <jpekkila@daint101.login.cscs.ch>
Date: Tue, 31 Mar 2020 12:36:25 +0200
Subject: [PATCH 03/89] Optimized MPI synchronization a bit

---
 src/core/device.cc | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/src/core/device.cc b/src/core/device.cc
index 8e97772..59ad2e8 100644
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -1220,7 +1220,7 @@ AcResult
 acGridIntegrate(const Stream stream, const AcReal dt)
 {
     ERRCHK(grid.initialized);
-    acGridSynchronizeStream(stream);
+    //acGridSynchronizeStream(stream);
 
     const Device device  = grid.device;
     const int3 nn        = grid.nn;
@@ -1231,6 +1231,8 @@ acGridIntegrate(const Stream stream, const AcReal dt)
     CommData sidexy_data = grid.sidexy_data;
     CommData sidexz_data = grid.sidexz_data;
     CommData sideyz_data = grid.sideyz_data;
+	
+	acDeviceSynchronizeStream(device, stream);
 
     // Corners
     const int3 corner_a0s[] = {
@@ -1343,13 +1345,7 @@ acGridIntegrate(const Stream stream, const AcReal dt)
         acPackCommData(device, sidexz_a0s, &sidexz_data);
         acPackCommData(device, sideyz_a0s, &sideyz_data);
 
-        //////////// INNER INTEGRATION //////////////
-        {
-            const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
-            const int3 m2 = nn;
-            acDeviceIntegrateSubstep(device, STREAM_16, isubstep, m1, m2, dt);
-        }
-        ////////////////////////////////////////////
+	MPI_Barrier(MPI_COMM_WORLD);
 
 #if MPI_GPUDIRECT_DISABLED
         acTransferCommDataToHost(device, &corner_data);
@@ -1368,6 +1364,14 @@ acGridIntegrate(const Stream stream, const AcReal dt)
         acTransferCommData(device, sidexy_a0s, sidexy_b0s, &sidexy_data);
         acTransferCommData(device, sidexz_a0s, sidexz_b0s, &sidexz_data);
         acTransferCommData(device, sideyz_a0s, sideyz_b0s, &sideyz_data);
+	
+        //////////// INNER INTEGRATION //////////////
+        {
+            const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
+            const int3 m2 = nn;
+            acDeviceIntegrateSubstep(device, STREAM_16, isubstep, m1, m2, dt);
+        }
+        ////////////////////////////////////////////
 
         acTransferCommDataWait(corner_data);
         acTransferCommDataWait(edgex_data);

From 9b6d927cf164e8fe1c857e5a6cbc826d1c0db1be Mon Sep 17 00:00:00 2001
From: Johannes Pekkila <jpekkila@daint101.login.cscs.ch>
Date: Tue, 31 Mar 2020 12:37:54 +0200
Subject: [PATCH 04/89] It might be better to benchmark MPI codes without
 synchronization because of overhead of timing individual steps

---
 samples/benchmark/main.cc | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/samples/benchmark/main.cc b/samples/benchmark/main.cc
index 847e872..4b50bfb 100644
--- a/samples/benchmark/main.cc
+++ b/samples/benchmark/main.cc
@@ -96,6 +96,20 @@ main(void)
     // Benchmark
     Timer t;
     const AcReal dt             = FLT_EPSILON;
+
+	acGridSynchronizeStream(STREAM_ALL);
+	timer_reset(&t);
+	acGridSynchronizeStream(STREAM_ALL);
+
+	const size_t num_iters = 50;
+	for (size_t i = 0; i < num_iters; ++i)
+		acGridIntegrate(STREAM_DEFAULT, dt);
+
+	acGridSynchronizeStream(STREAM_ALL);
+	if (!pid)
+		timer_diff_print(t);
+	acGridSynchronizeStream(STREAM_ALL);
+	/*
     const size_t num_iters      = 100;
     const double nth_percentile = 0.90;
 
@@ -135,7 +149,7 @@ main(void)
         fprintf(fp, "%d, %g\n", nprocs, results[nth_percentile * num_iters]);
 
         fclose(fp);
-    }
+    }*/
 
     acGridQuit();
     MPI_Finalize();

From fe14ae4665b33ec890f4161a064f2362edc05949 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Thu, 2 Apr 2020 17:59:53 +0300
Subject: [PATCH 05/89] Added an alternative MPI implementation which uses
 one-sided communication

---
 src/core/device.cc         | 223 +++++++++++++++++++++++++++++++++----
 src/core/kernels/kernels.h |  10 ++
 2 files changed, 209 insertions(+), 24 deletions(-)

diff --git a/src/core/device.cc b/src/core/device.cc
index 59ad2e8..a784c69 100644
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -530,12 +530,22 @@ acCreatePackedData(const int3 dims)
     const size_t bytes = dims.x * dims.y * dims.z * sizeof(data.data[0]) * NUM_VTXBUF_HANDLES;
     ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&data.data, bytes));
 
+#if AC_MPI_UNIDIRECTIONAL_COMM
+    ERRCHK_ALWAYS(MPI_Win_create(data.data, bytes, sizeof(AcReal), MPI_INFO_NULL, MPI_COMM_WORLD,
+                                 &data.win) == MPI_SUCCESS);
+    MPI_Win_fence(0, data.win);
+#endif // AC_MPI_UNIDIRECTIONAL_COMM
+
     return data;
 }
 
 static AcResult
 acDestroyPackedData(PackedData* data)
 {
+#if AC_MPI_UNIDIRECTIONAL_COMM
+    MPI_Win_free(&data->win);
+#endif // AC_MPI_UNIDIRECTIONAL_COMM
+
     data->dims = (int3){-1, -1, -1};
     cudaFree(data->data);
     data->data = NULL;
@@ -555,9 +565,29 @@ acCreatePackedDataHost(const int3 dims)
     data.data          = (AcReal*)malloc(bytes);
     ERRCHK_ALWAYS(data.data);
 
+#if AC_MPI_UNIDIRECTIONAL_COMM
+    ERRCHK_ALWAYS(MPI_Win_create(data.data, bytes, sizeof(AcReal), MPI_INFO_NULL, MPI_COMM_WORLD,
+                                 &data.win) == MPI_SUCCESS);
+    MPI_Win_fence(0, data.win);
+#endif // AC_MPI_UNIDIRECTIONAL_COMM
+
     return data;
 }
 
+static AcResult
+acDestroyPackedDataHost(PackedData* data)
+{
+#if AC_MPI_UNIDIRECTIONAL_COMM
+    MPI_Win_free(&data->win);
+#endif // AC_MPI_UNIDIRECTIONAL_COMM
+
+    data->dims = (int3){-1, -1, -1};
+    free(data->data);
+    data->data = NULL;
+
+    return AC_SUCCESS;
+}
+
 static void
 acTransferPackedDataToHost(const Device device, const cudaStream_t stream, const PackedData ddata,
                            PackedData* hdata)
@@ -579,16 +609,6 @@ acTransferPackedDataToDevice(const Device device, const cudaStream_t stream, con
                          NUM_VTXBUF_HANDLES;
     ERRCHK_CUDA(cudaMemcpyAsync(ddata->data, hdata.data, bytes, cudaMemcpyHostToDevice, stream));
 }
-
-static AcResult
-acDestroyPackedDataHost(PackedData* data)
-{
-    data->dims = (int3){-1, -1, -1};
-    free(data->data);
-    data->data = NULL;
-
-    return AC_SUCCESS;
-}
 #endif // MPI_GPUDIRECT_DISABLED
 
 // TODO: do with packed data
@@ -884,6 +904,160 @@ acTransferCommDataToDevice(const Device device, CommData* data)
 }
 #endif
 
+#if AC_MPI_UNIDIRECTIONAL_COMM
+static AcResult
+acTransferCommData(const Device device, //
+                   const int3* a0s,     // Src idx inside comp. domain
+                   const int3* b0s,     // Dst idx inside bound zone
+                   CommData* data)
+{
+    cudaSetDevice(device->id);
+
+    MPI_Datatype datatype = MPI_FLOAT;
+    if (sizeof(AcReal) == 8)
+        datatype = MPI_DOUBLE;
+
+    int nprocs, pid;
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
+    const int3 decomp = decompose(nprocs);
+
+    const int3 nn = (int3){
+        device->local_config.int_params[AC_nx],
+        device->local_config.int_params[AC_ny],
+        device->local_config.int_params[AC_nz],
+    };
+
+    const int3 dims         = data->dims;
+    const size_t blockcount = data->count;
+
+    for (int k = -1; k <= 1; ++k) {
+        for (int j = -1; j <= 1; ++j) {
+            for (int i = -1; i <= 1; ++i) {
+                if (i == 0 && j == 0 && k == 0)
+                    continue;
+
+                for (size_t a_idx = 0; a_idx < blockcount; ++a_idx) {
+                    for (size_t b_idx = 0; b_idx < blockcount; ++b_idx) {
+                        const int3 neighbor = (int3){i, j, k};
+
+                        const int3 a0 = a0s[a_idx];
+                        // const int3 a1 = a0 + dims;
+
+                        const int3 b0 = a0 - neighbor * nn;
+                        // const int3 b1 = a1 - neighbor * nn;
+
+                        if (b0s[b_idx].x == b0.x && b0s[b_idx].y == b0.y && b0s[b_idx].z == b0.z) {
+#if MPI_GPUDIRECT_DISABLED
+                            MPI_Win_fence(0, data->srcs_host[a_idx].win);
+                            MPI_Win_fence(0, data->dsts_host[b_idx].win);
+#else
+                            MPI_Win_fence(0, data->srcs[a_idx].win);
+                            MPI_Win_fence(0, data->dsts[b_idx].win);
+#endif
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    for (int k = -1; k <= 1; ++k) {
+        for (int j = -1; j <= 1; ++j) {
+            for (int i = -1; i <= 1; ++i) {
+                if (i == 0 && j == 0 && k == 0)
+                    continue;
+
+                for (size_t a_idx = 0; a_idx < blockcount; ++a_idx) {
+                    for (size_t b_idx = 0; b_idx < blockcount; ++b_idx) {
+                        const int3 neighbor = (int3){i, j, k};
+
+                        const int3 a0 = a0s[a_idx];
+                        // const int3 a1 = a0 + dims;
+
+                        const int3 b0 = a0 - neighbor * nn;
+                        // const int3 b1 = a1 - neighbor * nn;
+
+                        if (b0s[b_idx].x == b0.x && b0s[b_idx].y == b0.y && b0s[b_idx].z == b0.z) {
+
+                            const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES;
+
+                            const int3 pid3d = getPid3D(pid, decomp);
+
+#if MPI_GPUDIRECT_DISABLED
+
+                            MPI_Put(data->srcs_host[a_idx].data, count, datatype,
+                                    getPid(pid3d - neighbor, decomp), 0, count, datatype,
+                                    data->dsts_host[b_idx].win);
+
+                            /*
+            MPI_Get(data->dsts_host[b_idx].data, count, datatype,
+                    getPid(pid3d - neighbor, decomp), 0, count, datatype,
+                    data->srcs_host[a_idx].win);
+                    */
+
+#else
+                            /*
+                                            MPI_Put(data->srcs[a_idx].data, count, datatype,
+                                                    getPid(pid3d - neighbor, decomp), 0, count,
+                               datatype, data->dsts[b_idx].win);
+                                            */
+
+                            MPI_Get(data->dsts[b_idx].data, count, datatype,
+                                    getPid(pid3d - neighbor, decomp), 0, count, datatype,
+                                    data->srcs[a_idx].win);
+                            ERROR("CUDA-aware MPI_Put/MPI_Get not yet supported with UCX "
+                                  "(2020-04-02)");
+#endif
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    for (int k = -1; k <= 1; ++k) {
+        for (int j = -1; j <= 1; ++j) {
+            for (int i = -1; i <= 1; ++i) {
+                if (i == 0 && j == 0 && k == 0)
+                    continue;
+
+                for (size_t a_idx = 0; a_idx < blockcount; ++a_idx) {
+                    for (size_t b_idx = 0; b_idx < blockcount; ++b_idx) {
+                        const int3 neighbor = (int3){i, j, k};
+
+                        const int3 a0 = a0s[a_idx];
+                        // const int3 a1 = a0 + dims;
+
+                        const int3 b0 = a0 - neighbor * nn;
+                        // const int3 b1 = a1 - neighbor * nn;
+
+                        if (b0s[b_idx].x == b0.x && b0s[b_idx].y == b0.y && b0s[b_idx].z == b0.z) {
+#if MPI_GPUDIRECT_DISABLED
+                            MPI_Win_fence(0, data->srcs_host[a_idx].win);
+                            MPI_Win_fence(0, data->dsts_host[b_idx].win);
+#else
+                            MPI_Win_fence(0, data->srcs[a_idx].win);
+                            MPI_Win_fence(0, data->dsts[b_idx].win);
+#endif
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    return AC_SUCCESS;
+}
+
+static void
+acTransferCommDataWait(const CommData data)
+{
+    (void)data;
+    // NOP
+}
+
+#else
 static AcResult
 acTransferCommData(const Device device, //
                    const int3* a0s,     // Src idx inside comp. domain
@@ -931,7 +1105,7 @@ acTransferCommData(const Device device, //
                             const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES;
 
 #if MPI_GPUDIRECT_DISABLED
-                            PackedData dst = data->dsts_host[b_idx];
+                            PackedData dst     = data->dsts_host[b_idx];
 #else
                             PackedData dst = data->dsts[b_idx];
 #endif
@@ -967,7 +1141,7 @@ acTransferCommData(const Device device, //
                             const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES;
 
 #if MPI_GPUDIRECT_DISABLED
-                            PackedData src = data->srcs_host[a_idx];
+                            PackedData src     = data->srcs_host[a_idx];
 #else
                             PackedData src = data->srcs[a_idx];
 #endif
@@ -995,6 +1169,7 @@ acTransferCommDataWait(const CommData data)
         MPI_Wait(&data.recv_reqs[i], MPI_STATUS_IGNORE);
     }
 }
+#endif // AC_MPI_UNIDIRECTIONAL_COMM
 
 typedef struct {
     Device device;
@@ -1220,7 +1395,7 @@ AcResult
 acGridIntegrate(const Stream stream, const AcReal dt)
 {
     ERRCHK(grid.initialized);
-    //acGridSynchronizeStream(stream);
+    // acGridSynchronizeStream(stream);
 
     const Device device  = grid.device;
     const int3 nn        = grid.nn;
@@ -1231,8 +1406,8 @@ acGridIntegrate(const Stream stream, const AcReal dt)
     CommData sidexy_data = grid.sidexy_data;
     CommData sidexz_data = grid.sidexz_data;
     CommData sideyz_data = grid.sideyz_data;
-	
-	acDeviceSynchronizeStream(device, stream);
+
+    acDeviceSynchronizeStream(device, stream);
 
     // Corners
     const int3 corner_a0s[] = {
@@ -1345,7 +1520,15 @@ acGridIntegrate(const Stream stream, const AcReal dt)
         acPackCommData(device, sidexz_a0s, &sidexz_data);
         acPackCommData(device, sideyz_a0s, &sideyz_data);
 
-	MPI_Barrier(MPI_COMM_WORLD);
+        //////////// INNER INTEGRATION //////////////
+        {
+            const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
+            const int3 m2 = nn;
+            acDeviceIntegrateSubstep(device, STREAM_16, isubstep, m1, m2, dt);
+        }
+        ////////////////////////////////////////////
+
+        MPI_Barrier(MPI_COMM_WORLD);
 
 #if MPI_GPUDIRECT_DISABLED
         acTransferCommDataToHost(device, &corner_data);
@@ -1364,14 +1547,6 @@ acGridIntegrate(const Stream stream, const AcReal dt)
         acTransferCommData(device, sidexy_a0s, sidexy_b0s, &sidexy_data);
         acTransferCommData(device, sidexz_a0s, sidexz_b0s, &sidexz_data);
         acTransferCommData(device, sideyz_a0s, sideyz_b0s, &sideyz_data);
-	
-        //////////// INNER INTEGRATION //////////////
-        {
-            const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
-            const int3 m2 = nn;
-            acDeviceIntegrateSubstep(device, STREAM_16, isubstep, m1, m2, dt);
-        }
-        ////////////////////////////////////////////
 
         acTransferCommDataWait(corner_data);
         acTransferCommDataWait(edgex_data);
diff --git a/src/core/kernels/kernels.h b/src/core/kernels/kernels.h
index 476abae..9e4e0f7 100644
--- a/src/core/kernels/kernels.h
+++ b/src/core/kernels/kernels.h
@@ -1,9 +1,19 @@
 #pragma once
 #include "astaroth.h"
 
+#if AC_MPI_ENABLED
+#include <mpi.h>
+
+#define AC_MPI_UNIDIRECTIONAL_COMM (0)
+#endif // AC_MPI_ENABLED
+
 typedef struct {
     int3 dims;
     AcReal* data;
+
+#if (AC_MPI_ENABLED && AC_MPI_UNIDIRECTIONAL_COMM)
+    MPI_Win win; // MPI window for RMA
+#endif           // (AC_MPI_ENABLED && AC_MPI_UNIDIRECTIONAL_COMM)
 } PackedData;
 
 typedef struct {

From 88e53dfa215e0178d4760d955d588308d72da9a5 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Sun, 5 Apr 2020 17:09:57 +0300
Subject: [PATCH 06/89] Added a little program for testing the bandwidths of
 different MPI comm styles on n nodes and processes

---
 samples/bwtest/CMakeLists.txt |   9 ++
 samples/bwtest/main.c         | 207 ++++++++++++++++++++++++++++++++++
 2 files changed, 216 insertions(+)
 create mode 100644 samples/bwtest/CMakeLists.txt
 create mode 100644 samples/bwtest/main.c

diff --git a/samples/bwtest/CMakeLists.txt b/samples/bwtest/CMakeLists.txt
new file mode 100644
index 0000000..db7066e
--- /dev/null
+++ b/samples/bwtest/CMakeLists.txt
@@ -0,0 +1,9 @@
+cmake_minimum_required(VERSION 3.17) # Required for moder CUDA::cudart linking
+
+find_package(MPI)
+find_package(OpenMP)
+find_package(CUDAToolkit)
+
+add_executable(bwtest main.c)
+add_compile_options(-O3)
+target_link_libraries(bwtest MPI::MPI_C OpenMP::OpenMP_C CUDA::cudart)
diff --git a/samples/bwtest/main.c b/samples/bwtest/main.c
new file mode 100644
index 0000000..d3e7303
--- /dev/null
+++ b/samples/bwtest/main.c
@@ -0,0 +1,207 @@
+#include <assert.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <mpi.h>
+
+#include <cuda_runtime_api.h>
+
+#include "timer_hires.h" // From src/common
+
+//#define BLOCK_SIZE (100 * 1024 * 1024) // Bytes
+#define BLOCK_SIZE (256 * 256 * 3 * 8 * 8)
+
+/*
+  Findings:
+    - MUST ALWAYS SET DEVICE. Absolutely kills performance if device is not set explicitly
+    - Need to use cudaMalloc for intranode comm for P2P to trigger with MPI
+    - For internode one should use pinned memory (RDMA is staged through pinned, gives full
+    network speed iff pinned)
+*/
+
+static uint8_t*
+allocHost(const size_t bytes)
+{
+    uint8_t* arr = malloc(bytes);
+    assert(arr);
+    return arr;
+}
+
+static void
+freeHost(uint8_t* arr)
+{
+    free(arr);
+}
+
+static uint8_t*
+allocDevice(const size_t bytes)
+{
+    uint8_t* arr;
+    // Standard (20 GiB/s internode, 85 GiB/s intranode)
+    // const cudaError_t retval = cudaMalloc((void**)&arr, bytes);
+    // Unified mem (5 GiB/s internode, 6 GiB/s intranode)
+    // const cudaError_t retval = cudaMallocManaged((void**)&arr, bytes, cudaMemAttachGlobal);
+    // Pinned (40 GiB/s internode, 10 GiB/s intranode)
+    const cudaError_t retval = cudaMallocHost((void**)&arr, bytes);
+    assert(retval == cudaSuccess);
+    return arr;
+}
+
+static void
+freeDevice(uint8_t* arr)
+{
+    cudaFree(arr);
+}
+
+static void
+sendrecv_blocking(uint8_t* src, uint8_t* dst)
+{
+    int pid, nprocs;
+    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+    int nfront = (pid + 1) % nprocs;
+    int nback  = (((pid - 1) % nprocs) + nprocs) % nprocs;
+
+    if (!pid) {
+        MPI_Status status;
+        MPI_Send(src, BLOCK_SIZE, MPI_BYTE, nfront, pid, MPI_COMM_WORLD);
+        MPI_Recv(dst, BLOCK_SIZE, MPI_BYTE, nback, nback, MPI_COMM_WORLD, &status);
+    }
+    else {
+        MPI_Status status;
+        MPI_Recv(dst, BLOCK_SIZE, MPI_BYTE, nback, nback, MPI_COMM_WORLD, &status);
+        MPI_Send(src, BLOCK_SIZE, MPI_BYTE, nfront, pid, MPI_COMM_WORLD);
+    }
+}
+
+static void
+sendrecv_nonblocking(uint8_t* src, uint8_t* dst)
+{
+    int pid, nprocs;
+    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+    int nfront = (pid + 1) % nprocs;
+    int nback  = (((pid - 1) % nprocs) + nprocs) % nprocs;
+
+    MPI_Request recv_request, send_request;
+    MPI_Irecv(dst, BLOCK_SIZE, MPI_BYTE, nback, nback, MPI_COMM_WORLD, &recv_request);
+    MPI_Isend(src, BLOCK_SIZE, MPI_BYTE, nfront, pid, MPI_COMM_WORLD, &send_request);
+
+    MPI_Status status;
+    MPI_Wait(&recv_request, &status);
+    MPI_Wait(&send_request, &status);
+}
+
+static void
+sendrecv_twoway(uint8_t* src, uint8_t* dst)
+{
+    int pid, nprocs;
+    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+    int nfront = (pid + 1) % nprocs;
+    int nback  = (((pid - 1) % nprocs) + nprocs) % nprocs;
+
+    MPI_Status status;
+    MPI_Sendrecv(src, BLOCK_SIZE, MPI_BYTE, nfront, pid, dst, BLOCK_SIZE, MPI_BYTE, nback, nback,
+                 MPI_COMM_WORLD, &status);
+}
+
+#define PRINT                                                                                      \
+    if (!pid)                                                                                      \
+    printf
+
+static void
+measurebw(const char* msg, const size_t bytes, void (*sendrecv)(uint8_t*, uint8_t*), uint8_t* src,
+          uint8_t* dst)
+{
+    const size_t num_samples = 10;
+
+    int pid, nprocs;
+    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+
+    PRINT("%s\n", msg);
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    PRINT("\tWarming up... ");
+    for (size_t i = 0; i < num_samples / 10; ++i)
+        sendrecv(src, dst);
+
+    MPI_Barrier(MPI_COMM_WORLD);
+    PRINT("Done\n");
+
+    PRINT("\tBandwidth... ");
+    fflush(stdout);
+
+    Timer t;
+    MPI_Barrier(MPI_COMM_WORLD);
+    timer_reset(&t);
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    for (size_t i = 0; i < num_samples; ++i)
+        sendrecv(src, dst);
+
+    MPI_Barrier(MPI_COMM_WORLD);
+    const long double time_elapsed = timer_diff_nsec(t) / 1e9l; // seconds
+    PRINT("%Lg GiB/s\n", num_samples * bytes / time_elapsed / (1024 * 1024 * 1024));
+    PRINT("\tTransfer time: %Lg ms\n", time_elapsed * 1000);
+    MPI_Barrier(MPI_COMM_WORLD);
+}
+
+int
+main(void)
+{
+    // Disable stdout buffering
+    setbuf(stdout, NULL);
+
+    MPI_Init(NULL, NULL);
+
+    int pid, nprocs;
+    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+    assert(nprocs >= 2); // Require at least one neighbor
+
+    int devices_per_node = -1;
+    cudaGetDeviceCount(&devices_per_node);
+    const int device_id = pid % devices_per_node;
+    cudaSetDevice(device_id);
+
+    printf("Process %d of %d running.\n", pid, nprocs);
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    PRINT("Block size: %u MiB\n", BLOCK_SIZE / (1024 * 1024));
+
+    {
+        uint8_t* src = allocHost(BLOCK_SIZE);
+        uint8_t* dst = allocHost(BLOCK_SIZE);
+
+        measurebw("Unidirectional bandwidth, blocking (Host)", //
+                  2 * BLOCK_SIZE, sendrecv_blocking, src, dst);
+        measurebw("Bidirectional bandwidth, async (Host)", //
+                  2 * BLOCK_SIZE, sendrecv_nonblocking, src, dst);
+        measurebw("Bidirectional bandwidth, twoway (Host)", //
+                  2 * BLOCK_SIZE, sendrecv_twoway, src, dst);
+
+        freeHost(src);
+        freeHost(dst);
+    }
+
+    {
+        uint8_t* src = allocDevice(BLOCK_SIZE);
+        uint8_t* dst = allocDevice(BLOCK_SIZE);
+
+        measurebw("Unidirectional bandwidth, blocking (Device)", //
+                  2 * BLOCK_SIZE, sendrecv_blocking, src, dst);
+        measurebw("Bidirectional bandwidth, async (Device)", //
+                  2 * BLOCK_SIZE, sendrecv_nonblocking, src, dst);
+        measurebw("Bidirectional bandwidth, twoway (Device)", //
+                  2 * BLOCK_SIZE, sendrecv_twoway, src, dst);
+
+        freeDevice(src);
+        freeDevice(dst);
+    }
+
+    MPI_Finalize();
+    return EXIT_SUCCESS;
+}

From cc9d3f1b9cdeec06959cbd43ff9793430e2a3b36 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Sun, 5 Apr 2020 20:15:32 +0300
Subject: [PATCH 07/89] Found a workaround that gives good inter and intra-node
 performance. HPC-X MPI implementation does not know how to do p2p comm with
 pinned arrays (should be 80 GiB/s, measured 10 GiB/s) and internode comm is
 super slow without pinned arrays (should be 40 GiB/s, measured < 1 GiB/s).
 Made a proof of concept communicator that pins arrays that are send or
 received from another node.

---
 samples/bwtest/CMakeLists.txt |   2 +-
 samples/bwtest/main.c         | 181 +++++++++++++++++++++++++++++++++-
 samples/mpitest/main_old.cc   |  80 +++++++++++++++
 3 files changed, 258 insertions(+), 5 deletions(-)
 create mode 100644 samples/mpitest/main_old.cc

diff --git a/samples/bwtest/CMakeLists.txt b/samples/bwtest/CMakeLists.txt
index db7066e..13bf13f 100644
--- a/samples/bwtest/CMakeLists.txt
+++ b/samples/bwtest/CMakeLists.txt
@@ -6,4 +6,4 @@ find_package(CUDAToolkit)
 
 add_executable(bwtest main.c)
 add_compile_options(-O3)
-target_link_libraries(bwtest MPI::MPI_C OpenMP::OpenMP_C CUDA::cudart)
+target_link_libraries(bwtest MPI::MPI_C OpenMP::OpenMP_C CUDA::cudart_static)
diff --git a/samples/bwtest/main.c b/samples/bwtest/main.c
index d3e7303..00c0caf 100644
--- a/samples/bwtest/main.c
+++ b/samples/bwtest/main.c
@@ -1,4 +1,5 @@
 #include <assert.h>
+#include <stdbool.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -18,6 +19,8 @@
     - Need to use cudaMalloc for intranode comm for P2P to trigger with MPI
     - For internode one should use pinned memory (RDMA is staged through pinned, gives full
     network speed iff pinned)
+    - Both the sending and receiving arrays must be pinned to see performance improvement
+    in internode comm
 */
 
 static uint8_t*
@@ -36,6 +39,20 @@ freeHost(uint8_t* arr)
 
 static uint8_t*
 allocDevice(const size_t bytes)
+{
+    uint8_t* arr;
+    // Standard (20 GiB/s internode, 85 GiB/s intranode)
+    const cudaError_t retval = cudaMalloc((void**)&arr, bytes);
+    // Unified mem (5 GiB/s internode, 6 GiB/s intranode)
+    // const cudaError_t retval = cudaMallocManaged((void**)&arr, bytes, cudaMemAttachGlobal);
+    // Pinned (40 GiB/s internode, 10 GiB/s intranode)
+    // const cudaError_t retval = cudaMallocHost((void**)&arr, bytes);
+    assert(retval == cudaSuccess);
+    return arr;
+}
+
+static uint8_t*
+allocDevicePinned(const size_t bytes)
 {
     uint8_t* arr;
     // Standard (20 GiB/s internode, 85 GiB/s intranode)
@@ -107,6 +124,107 @@ sendrecv_twoway(uint8_t* src, uint8_t* dst)
                  MPI_COMM_WORLD, &status);
 }
 
+static void
+sendrecv_nonblocking_multiple(uint8_t* src, uint8_t* dst)
+{
+    int pid, nprocs;
+    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+
+    MPI_Request recv_requests[nprocs], send_requests[nprocs];
+    for (int i = 1; i < nprocs; ++i) {
+        int nfront = (pid + i) % nprocs;
+        int nback  = (((pid - i) % nprocs) + nprocs) % nprocs;
+
+        MPI_Irecv(dst, BLOCK_SIZE, MPI_BYTE, nback, pid, MPI_COMM_WORLD, &recv_requests[i]);
+        MPI_Isend(src, BLOCK_SIZE, MPI_BYTE, nfront, nfront, MPI_COMM_WORLD, &send_requests[i]);
+    }
+
+    for (int i = 1; i < nprocs; ++i) {
+        MPI_Status status;
+        MPI_Wait(&recv_requests[i], &status);
+        MPI_Wait(&send_requests[i], &status);
+    }
+}
+
+static void
+sendrecv_nonblocking_multiple_parallel(uint8_t* src, uint8_t* dst)
+{
+    int pid, nprocs;
+    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+
+    MPI_Request recv_requests[nprocs], send_requests[nprocs];
+    for (int i = 1; i < nprocs; ++i) {
+        int nfront = (pid + i) % nprocs;
+        MPI_Isend(src, BLOCK_SIZE, MPI_BYTE, nfront, nfront, MPI_COMM_WORLD, &send_requests[i]);
+    }
+
+    static bool error_shown = false;
+    if (!pid && !error_shown) {
+        fprintf(stderr, "\tWARNING: make sure you init MPI_Init_thread for OpenMP support (no "
+                        "supported on puhti atm "
+                        "2020-04-05\n");
+        error_shown = true;
+    }
+#pragma omp parallel for
+    for (int i = 1; i < nprocs; ++i) {
+        int nback = (((pid - i) % nprocs) + nprocs) % nprocs;
+
+        MPI_Status status;
+        MPI_Recv(dst, BLOCK_SIZE, MPI_BYTE, nback, pid, MPI_COMM_WORLD, &status);
+    }
+
+    for (int i = 1; i < nprocs; ++i) {
+        MPI_Status status;
+        MPI_Wait(&send_requests[i], &status);
+    }
+}
+
+static void
+sendrecv_nonblocking_multiple_rt_pinning(uint8_t* src, uint8_t* dst)
+{
+    int pid, nprocs;
+    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+
+    static uint8_t* src_pinned = NULL;
+    static uint8_t* dst_pinned = NULL;
+    if (!src_pinned)
+        src_pinned = allocDevicePinned(BLOCK_SIZE); // Note: Never freed
+    if (!dst_pinned)
+        dst_pinned = allocDevicePinned(BLOCK_SIZE); // Note: Never freed
+
+    int devices_per_node = -1;
+    cudaGetDeviceCount(&devices_per_node);
+
+    const int node_id = pid / devices_per_node;
+
+    MPI_Request recv_requests[nprocs], send_requests[nprocs];
+    for (int i = 1; i < nprocs; ++i) {
+        int nfront = (pid + i) % nprocs;
+        int nback  = (((pid - i) % nprocs) + nprocs) % nprocs;
+
+        if (nback / devices_per_node != pid / devices_per_node) // Not on the same node
+            MPI_Irecv(dst_pinned, BLOCK_SIZE, MPI_BYTE, nback, pid, MPI_COMM_WORLD,
+                      &recv_requests[i]);
+        else
+            MPI_Irecv(dst, BLOCK_SIZE, MPI_BYTE, nback, pid, MPI_COMM_WORLD, &recv_requests[i]);
+
+        if (nfront / devices_per_node != pid / devices_per_node) // Not on the same node
+            MPI_Isend(src_pinned, BLOCK_SIZE, MPI_BYTE, nfront, nfront, MPI_COMM_WORLD,
+                      &send_requests[i]);
+        else
+            MPI_Isend(src, BLOCK_SIZE, MPI_BYTE, nfront, nfront, MPI_COMM_WORLD, &send_requests[i]);
+    }
+
+    for (int i = 1; i < nprocs; ++i) {
+        MPI_Status status;
+        MPI_Wait(&recv_requests[i], &status);
+        MPI_Wait(&send_requests[i], &status);
+    }
+}
+
 #define PRINT                                                                                      \
     if (!pid)                                                                                      \
     printf
@@ -115,7 +233,7 @@ static void
 measurebw(const char* msg, const size_t bytes, void (*sendrecv)(uint8_t*, uint8_t*), uint8_t* src,
           uint8_t* dst)
 {
-    const size_t num_samples = 10;
+    const size_t num_samples = 100;
 
     int pid, nprocs;
     MPI_Comm_rank(MPI_COMM_WORLD, &pid);
@@ -145,23 +263,36 @@ measurebw(const char* msg, const size_t bytes, void (*sendrecv)(uint8_t*, uint8_
     MPI_Barrier(MPI_COMM_WORLD);
     const long double time_elapsed = timer_diff_nsec(t) / 1e9l; // seconds
     PRINT("%Lg GiB/s\n", num_samples * bytes / time_elapsed / (1024 * 1024 * 1024));
-    PRINT("\tTransfer time: %Lg ms\n", time_elapsed * 1000);
+    PRINT("\tTransfer time: %Lg ms\n", time_elapsed * 1000 / num_samples);
     MPI_Barrier(MPI_COMM_WORLD);
 }
 
 int
 main(void)
 {
+    MPI_Init(NULL, NULL);
+    // int provided;
+    // MPI_Init_thread(NULL, NULL, MPI_THREAD_MULTIPLE, &provided);
+    // assert(provided >= MPI_THREAD_MULTIPLE);
+
     // Disable stdout buffering
     setbuf(stdout, NULL);
 
-    MPI_Init(NULL, NULL);
-
     int pid, nprocs;
     MPI_Comm_rank(MPI_COMM_WORLD, &pid);
     MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
     assert(nprocs >= 2); // Require at least one neighbor
 
+    MPI_Barrier(MPI_COMM_WORLD);
+    if (!pid) {
+        printf("Do we have threads? The following should not be ordered (unless very lucky)\n");
+#pragma omp parallel for
+        for (int i = 0; i < 10; ++i)
+            printf("%d, ", i);
+        printf("\n");
+    }
+    MPI_Barrier(MPI_COMM_WORLD);
+
     int devices_per_node = -1;
     cudaGetDeviceCount(&devices_per_node);
     const int device_id = pid % devices_per_node;
@@ -182,10 +313,15 @@ main(void)
                   2 * BLOCK_SIZE, sendrecv_nonblocking, src, dst);
         measurebw("Bidirectional bandwidth, twoway (Host)", //
                   2 * BLOCK_SIZE, sendrecv_twoway, src, dst);
+        measurebw("Bidirectional bandwidth, async multiple (Host)", //
+                  2 * BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);
+        measurebw("Bidirectional bandwidth, async multiple parallel (Host)", //
+                  2 * BLOCK_SIZE, sendrecv_nonblocking_multiple_parallel, src, dst);
 
         freeHost(src);
         freeHost(dst);
     }
+    PRINT("\n------------------------\n");
 
     {
         uint8_t* src = allocDevice(BLOCK_SIZE);
@@ -197,10 +333,47 @@ main(void)
                   2 * BLOCK_SIZE, sendrecv_nonblocking, src, dst);
         measurebw("Bidirectional bandwidth, twoway (Device)", //
                   2 * BLOCK_SIZE, sendrecv_twoway, src, dst);
+        measurebw("Bidirectional bandwidth, async multiple (Device)", //
+                  2 * BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);
+        measurebw("Bidirectional bandwidth, async multiple parallel (Device)", //
+                  2 * BLOCK_SIZE, sendrecv_nonblocking_multiple_parallel, src, dst);
+        measurebw("Bidirectional bandwidth, async multiple (Device, rt pinning)", //
+                  2 * BLOCK_SIZE, sendrecv_nonblocking_multiple_rt_pinning, src, dst);
 
         freeDevice(src);
         freeDevice(dst);
     }
+    PRINT("\n------------------------\n");
+
+    {
+        uint8_t* src = allocDevicePinned(BLOCK_SIZE);
+        uint8_t* dst = allocDevicePinned(BLOCK_SIZE);
+
+        measurebw("Unidirectional bandwidth, blocking (Device, pinned)", //
+                  2 * BLOCK_SIZE, sendrecv_blocking, src, dst);
+        measurebw("Bidirectional bandwidth, async (Device, pinned)", //
+                  2 * BLOCK_SIZE, sendrecv_nonblocking, src, dst);
+        measurebw("Bidirectional bandwidth, twoway (Device, pinned)", //
+                  2 * BLOCK_SIZE, sendrecv_twoway, src, dst);
+        measurebw("Bidirectional bandwidth, async multiple (Device, pinned)", //
+                  2 * BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);
+
+        freeDevice(src);
+        freeDevice(dst);
+    }
+    PRINT("\n------------------------\n");
+    /*
+    { // Final run for easy identification with the profiler
+        uint8_t* src = allocDevice(BLOCK_SIZE);
+        uint8_t* dst = allocDevice(BLOCK_SIZE);
+
+        measurebw("Bidirectional bandwidth, async multiple (Device, rt pinning)", //
+                  2 * BLOCK_SIZE, sendrecv_nonblocking_multiple_rt_pinning, src, dst);
+
+        freeDevice(src);
+        freeDevice(dst);
+    }
+    */
 
     MPI_Finalize();
     return EXIT_SUCCESS;
diff --git a/samples/mpitest/main_old.cc b/samples/mpitest/main_old.cc
new file mode 100644
index 0000000..16c27e4
--- /dev/null
+++ b/samples/mpitest/main_old.cc
@@ -0,0 +1,80 @@
+/*
+    Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+/**
+    Running: mpirun -np <num processes> <executable>
+*/
+#include "astaroth.h"
+#include "astaroth_utils.h"
+
+#include <mpi.h>
+
+int
+main(void)
+{
+    MPI_Init(NULL, NULL);
+    int nprocs, pid;
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
+
+    // CPU alloc
+    AcMeshInfo info;
+    acLoadConfig(AC_DEFAULT_CONFIG, &info);
+    info.real_params[AC_inv_dsx]   = AcReal(1.0) / info.real_params[AC_dsx];
+    info.real_params[AC_inv_dsy]   = AcReal(1.0) / info.real_params[AC_dsy];
+    info.real_params[AC_inv_dsz]   = AcReal(1.0) / info.real_params[AC_dsz];
+    info.real_params[AC_cs2_sound] = info.real_params[AC_cs_sound] * info.real_params[AC_cs_sound];
+
+    AcMesh model, candidate;
+    if (pid == 0) {
+        acMeshCreate(info, &model);
+        acMeshCreate(info, &candidate);
+        acMeshRandomize(&model);
+        acMeshRandomize(&candidate);
+    }
+
+    // GPU alloc & compute
+    Grid grid;
+    acGridCreateMPI(info, &grid);
+
+    acGridLoadMeshMPI(grid, STREAM_DEFAULT, model);
+    acGridSynchronizeStreamMPI(grid, STREAM_ALL);
+
+    acGridIntegrateMPI(grid, FLT_EPSILON);
+    acGridSynchronizeStreamMPI(grid, STREAM_ALL);
+    acGridSynchronizeMeshMPI(grid, STREAM_DEFAULT);
+    acGridSynchronizeStreamMPI(grid, STREAM_ALL);
+
+    acGridStoreMeshMPI(grid, STREAM_DEFAULT, &candidate);
+    acGridSynchronizeStreamMPI(grid, STREAM_ALL);
+
+    acGridDestroyMPI(grid);
+
+    // Verify
+    if (pid == 0) {
+        acModelIntegrateStep(model, FLT_EPSILON);
+        acMeshApplyPeriodicBounds(&model);
+
+        acVerifyMesh(model, candidate);
+        acMeshDestroy(&model);
+        acMeshDestroy(&candidate);
+    }
+
+    MPI_Finalize();
+    return EXIT_SUCCESS;
+}

From 37f1c841a34dd5f6ab64f4667d73860059b75b31 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Mon, 6 Apr 2020 14:09:12 +0300
Subject: [PATCH 08/89] Added functions for pinning memory that is sent over
 the network. TODO pack to and from pinned memory selectively (currently P2P
 results are overwritten with data in pinned memory)

---
 src/core/device.cc         | 224 +++++++++++++++++++++++++++++++++++++
 src/core/kernels/kernels.h |   5 +
 2 files changed, 229 insertions(+)

diff --git a/src/core/device.cc b/src/core/device.cc
index a784c69..61cab2f 100644
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -488,6 +488,20 @@ getPid3D(const int pid, const int3 decomposition)
     return pid3d;
 }
 
+/** Note: assumes that contiguous pids are on the same node and there is one process per GPU. I.e.
+ * pids are linearly mapped i + j * dx + k * dx * dy. */
+static bool
+onTheSameNode(const int pid_a, const int pid_b)
+{
+    int devices_per_node = -1;
+    cudaGetDeviceCount(&devices_per_node);
+
+    const int node_a = pid_a / devices_per_node;
+    const int node_b = pid_b / devices_per_node;
+
+    return node_a == node_b;
+}
+
 static int3
 decompose(const int target)
 {
@@ -530,6 +544,10 @@ acCreatePackedData(const int3 dims)
     const size_t bytes = dims.x * dims.y * dims.z * sizeof(data.data[0]) * NUM_VTXBUF_HANDLES;
     ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&data.data, bytes));
 
+#if AC_MPI_RT_PINNING
+    ERRCHK_CUDA_ALWAYS(cudaMallocHost((void**)&data.data_pinned, bytes));
+#endif // AC_MPI_RT_PINNING
+
 #if AC_MPI_UNIDIRECTIONAL_COMM
     ERRCHK_ALWAYS(MPI_Win_create(data.data, bytes, sizeof(AcReal), MPI_INFO_NULL, MPI_COMM_WORLD,
                                  &data.win) == MPI_SUCCESS);
@@ -542,6 +560,10 @@ acCreatePackedData(const int3 dims)
 static AcResult
 acDestroyPackedData(PackedData* data)
 {
+#if AC_MPI_RT_PINNING
+    cudaFree(data->data_pinned);
+#endif // AC_MPI_RT_PINNING
+
 #if AC_MPI_UNIDIRECTIONAL_COMM
     MPI_Win_free(&data->win);
 #endif // AC_MPI_UNIDIRECTIONAL_COMM
@@ -611,6 +633,30 @@ acTransferPackedDataToDevice(const Device device, const cudaStream_t stream, con
 }
 #endif // MPI_GPUDIRECT_DISABLED
 
+#if AC_MPI_RT_PINNING
+static void
+acPinPackedData(const Device device, const cudaStream_t stream, PackedData* packed)
+{
+    cudaSetDevice(device->id);
+
+    const size_t bytes = packed->dims.x * packed->dims.y * packed->dims.z * sizeof(AcReal) *
+                         NUM_VTXBUF_HANDLES;
+    ERRCHK_CUDA(cudaMemcpyAsync(packed->data_pinned, packed->data, bytes, cudaMemcpyDeviceToDevice,
+                                stream));
+}
+
+static void
+acUnpinPackedData(const Device device, const cudaStream_t stream, PackedData* packed)
+{
+    cudaSetDevice(device->id);
+
+    const size_t bytes = packed->dims.x * packed->dims.y * packed->dims.z * sizeof(AcReal) *
+                         NUM_VTXBUF_HANDLES;
+    ERRCHK_CUDA(cudaMemcpyAsync(packed->data, packed->data_pinned, bytes, cudaMemcpyDeviceToDevice,
+                                stream));
+}
+#endif // AC_MPI_RT_PINNING
+
 // TODO: do with packed data
 static AcResult
 acDeviceDistributeMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* dst)
@@ -904,6 +950,24 @@ acTransferCommDataToDevice(const Device device, CommData* data)
 }
 #endif
 
+#if AC_MPI_RT_PINNING
+static void
+acPinCommData(const Device device, CommData* data)
+{
+    cudaSetDevice(device->id);
+    for (size_t i = 0; i < data->count; ++i)
+        acPinPackedData(device, data->streams[i], &data->srcs[i]);
+}
+
+static void
+acUnpinCommData(const Device device, CommData* data)
+{
+    cudaSetDevice(device->id);
+    for (size_t i = 0; i < data->count; ++i)
+        acUnpinPackedData(device, data->streams[i], &data->dsts[i]);
+}
+#endif
+
 #if AC_MPI_UNIDIRECTIONAL_COMM
 static AcResult
 acTransferCommData(const Device device, //
@@ -1057,6 +1121,146 @@ acTransferCommDataWait(const CommData data)
     // NOP
 }
 
+#elif AC_MPI_RT_PINNING
+static AcResult
+acTransferCommData(const Device device, //
+                   const int3* a0s,     // Src idx inside comp. domain
+                   const int3* b0s,     // Dst idx inside bound zone
+                   CommData* data)
+{
+    cudaSetDevice(device->id);
+
+    MPI_Datatype datatype = MPI_FLOAT;
+    if (sizeof(AcReal) == 8)
+        datatype = MPI_DOUBLE;
+
+    int nprocs, pid;
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
+    const int3 decomp = decompose(nprocs);
+
+    const int3 nn = (int3){
+        device->local_config.int_params[AC_nx],
+        device->local_config.int_params[AC_ny],
+        device->local_config.int_params[AC_nz],
+    };
+
+    const int3 dims         = data->dims;
+    const size_t blockcount = data->count;
+
+    for (int k = -1; k <= 1; ++k) {
+        for (int j = -1; j <= 1; ++j) {
+            for (int i = -1; i <= 1; ++i) {
+                if (i == 0 && j == 0 && k == 0)
+                    continue;
+
+                for (size_t a_idx = 0; a_idx < blockcount; ++a_idx) {
+                    for (size_t b_idx = 0; b_idx < blockcount; ++b_idx) {
+                        const int3 neighbor = (int3){i, j, k};
+
+                        const int3 a0 = a0s[a_idx];
+                        // const int3 a1 = a0 + dims;
+
+                        const int3 b0 = a0 - neighbor * nn;
+                        // const int3 b1 = a1 - neighbor * nn;
+
+                        if (b0s[b_idx].x == b0.x && b0s[b_idx].y == b0.y && b0s[b_idx].z == b0.z) {
+
+                            const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES;
+
+#if MPI_GPUDIRECT_DISABLED
+                            PackedData src     = data->srcs_host[a_idx];
+#else
+                            PackedData src = data->srcs[a_idx];
+#endif
+#if MPI_GPUDIRECT_DISABLED
+                            PackedData dst     = data->dsts_host[b_idx];
+#else
+                            PackedData dst = data->dsts[b_idx];
+#endif
+
+                            const int3 pid3d    = getPid3D(pid, decomp);
+                            const int npid_recv = getPid(pid3d - neighbor, decomp);
+                            const int npid_send = getPid(pid3d + neighbor, decomp);
+
+                            if (onTheSameNode(pid, npid_recv)) {
+                                MPI_Irecv(dst.data, count, datatype, npid_recv, b_idx,
+                                          MPI_COMM_WORLD, &data->recv_reqs[b_idx]);
+                            }
+                            else {
+                                MPI_Irecv(dst.data_pinned, count, datatype, npid_recv, b_idx,
+                                          MPI_COMM_WORLD, &data->recv_reqs[b_idx]);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    for (int k = -1; k <= 1; ++k) {
+        for (int j = -1; j <= 1; ++j) {
+            for (int i = -1; i <= 1; ++i) {
+                if (i == 0 && j == 0 && k == 0)
+                    continue;
+
+                for (size_t a_idx = 0; a_idx < blockcount; ++a_idx) {
+                    for (size_t b_idx = 0; b_idx < blockcount; ++b_idx) {
+                        const int3 neighbor = (int3){i, j, k};
+
+                        const int3 a0 = a0s[a_idx];
+                        // const int3 a1 = a0 + dims;
+
+                        const int3 b0 = a0 - neighbor * nn;
+                        // const int3 b1 = a1 - neighbor * nn;
+
+                        if (b0s[b_idx].x == b0.x && b0s[b_idx].y == b0.y && b0s[b_idx].z == b0.z) {
+
+                            const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES;
+
+#if MPI_GPUDIRECT_DISABLED
+                            PackedData src     = data->srcs_host[a_idx];
+#else
+                            PackedData src = data->srcs[a_idx];
+#endif
+#if MPI_GPUDIRECT_DISABLED
+                            PackedData dst     = data->dsts_host[b_idx];
+#else
+                            PackedData dst = data->dsts[b_idx];
+#endif
+
+                            const int3 pid3d    = getPid3D(pid, decomp);
+                            const int npid_recv = getPid(pid3d - neighbor, decomp);
+                            const int npid_send = getPid(pid3d + neighbor, decomp);
+
+                            cudaStreamSynchronize(data->streams[a_idx]);
+                            if (onTheSameNode(pid, npid_send)) {
+                                MPI_Isend(src.data, count, datatype, npid_send, b_idx,
+                                          MPI_COMM_WORLD, &data->send_reqs[b_idx]);
+                            }
+                            else {
+                                MPI_Isend(src.data_pinned, count, datatype, npid_send, b_idx,
+                                          MPI_COMM_WORLD, &data->send_reqs[b_idx]);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    return AC_SUCCESS;
+}
+
+static void
+acTransferCommDataWait(const CommData data)
+{
+    for (size_t i = 0; i < data.count; ++i) {
+        MPI_Wait(&data.send_reqs[i], MPI_STATUS_IGNORE);
+        MPI_Wait(&data.recv_reqs[i], MPI_STATUS_IGNORE);
+    }
+}
+
 #else
 static AcResult
 acTransferCommData(const Device device, //
@@ -1520,6 +1724,16 @@ acGridIntegrate(const Stream stream, const AcReal dt)
         acPackCommData(device, sidexz_a0s, &sidexz_data);
         acPackCommData(device, sideyz_a0s, &sideyz_data);
 
+#if AC_MPI_RT_PINNING
+        acPinCommData(device, &corner_data);
+        acPinCommData(device, &edgex_data);
+        acPinCommData(device, &edgey_data);
+        acPinCommData(device, &edgez_data);
+        acPinCommData(device, &sidexy_data);
+        acPinCommData(device, &sidexz_data);
+        acPinCommData(device, &sideyz_data);
+#endif // AC_MPI_RT_PINNING
+
         //////////// INNER INTEGRATION //////////////
         {
             const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
@@ -1566,6 +1780,16 @@ acGridIntegrate(const Stream stream, const AcReal dt)
         acTransferCommDataToDevice(device, &sideyz_data);
 #endif
 
+#if AC_MPI_RT_PINNING
+        acUnpinCommData(device, &corner_data);
+        acUnpinCommData(device, &edgex_data);
+        acUnpinCommData(device, &edgey_data);
+        acUnpinCommData(device, &edgez_data);
+        acUnpinCommData(device, &sidexy_data);
+        acUnpinCommData(device, &sidexz_data);
+        acUnpinCommData(device, &sideyz_data);
+#endif // AC_MPI_RT_PINNING
+
         acUnpackCommData(device, corner_b0s, &corner_data);
         acUnpackCommData(device, edgex_b0s, &edgex_data);
         acUnpackCommData(device, edgey_b0s, &edgey_data);
diff --git a/src/core/kernels/kernels.h b/src/core/kernels/kernels.h
index 9e4e0f7..d2a3740 100644
--- a/src/core/kernels/kernels.h
+++ b/src/core/kernels/kernels.h
@@ -5,12 +5,17 @@
 #include <mpi.h>
 
 #define AC_MPI_UNIDIRECTIONAL_COMM (0)
+#define AC_MPI_RT_PINNING (1)
 #endif // AC_MPI_ENABLED
 
 typedef struct {
     int3 dims;
     AcReal* data;
 
+#if (AC_MPI_ENABLED && AC_MPI_RT_PINNING)
+    AcReal* data_pinned;
+#endif // (AC_MPI_ENABLED && AC_MPI_RT_PINNING)
+
 #if (AC_MPI_ENABLED && AC_MPI_UNIDIRECTIONAL_COMM)
     MPI_Win win; // MPI window for RMA
 #endif           // (AC_MPI_ENABLED && AC_MPI_UNIDIRECTIONAL_COMM)

From 427a3ac5d80ac4e327a65faf3722b64f7d17f6c6 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Mon, 6 Apr 2020 17:28:02 +0300
Subject: [PATCH 09/89] Rewrote the previous implementation, now fully works
 (verified) and gives the speedup we want. Communication latency is now
 completely hidden on at least two nodes (8 GPUs). Scaling looks very
 promising.

---
 src/core/device.cc         | 146 +++++++++++++++++++++++--------------
 src/core/kernels/kernels.h |   4 +-
 2 files changed, 93 insertions(+), 57 deletions(-)

diff --git a/src/core/device.cc b/src/core/device.cc
index 61cab2f..1b3a284 100644
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -635,25 +635,32 @@ acTransferPackedDataToDevice(const Device device, const cudaStream_t stream, con
 
 #if AC_MPI_RT_PINNING
 static void
-acPinPackedData(const Device device, const cudaStream_t stream, PackedData* packed)
+acPinPackedData(const Device device, const cudaStream_t stream, PackedData* ddata)
 {
     cudaSetDevice(device->id);
+    // TODO sync stream
+    ddata->pinned = true;
 
-    const size_t bytes = packed->dims.x * packed->dims.y * packed->dims.z * sizeof(AcReal) *
+    const size_t bytes = ddata->dims.x * ddata->dims.y * ddata->dims.z * sizeof(ddata->data[0]) *
                          NUM_VTXBUF_HANDLES;
-    ERRCHK_CUDA(cudaMemcpyAsync(packed->data_pinned, packed->data, bytes, cudaMemcpyDeviceToDevice,
-                                stream));
+    ERRCHK_CUDA(
+        cudaMemcpyAsync(ddata->data_pinned, ddata->data, bytes, cudaMemcpyDeviceToHost, stream));
 }
 
 static void
-acUnpinPackedData(const Device device, const cudaStream_t stream, PackedData* packed)
+acUnpinPackedData(const Device device, const cudaStream_t stream, PackedData* ddata)
 {
-    cudaSetDevice(device->id);
+    if (!ddata->pinned)
+        return;
 
-    const size_t bytes = packed->dims.x * packed->dims.y * packed->dims.z * sizeof(AcReal) *
+    cudaSetDevice(device->id);
+    // TODO sync stream
+    ddata->pinned = false;
+
+    const size_t bytes = ddata->dims.x * ddata->dims.y * ddata->dims.z * sizeof(ddata->data[0]) *
                          NUM_VTXBUF_HANDLES;
-    ERRCHK_CUDA(cudaMemcpyAsync(packed->data, packed->data_pinned, bytes, cudaMemcpyDeviceToDevice,
-                                stream));
+    ERRCHK_CUDA(
+        cudaMemcpyAsync(ddata->data, ddata->data_pinned, bytes, cudaMemcpyHostToDevice, stream));
 }
 #endif // AC_MPI_RT_PINNING
 
@@ -963,6 +970,12 @@ static void
 acUnpinCommData(const Device device, CommData* data)
 {
     cudaSetDevice(device->id);
+
+    // Clear pin flags from src
+    for (size_t i = 0; i < data->count; ++i)
+        data->srcs[i].pinned = false;
+
+    // Transfer from pinned to gmem
     for (size_t i = 0; i < data->count; ++i)
         acUnpinPackedData(device, data->streams[i], &data->dsts[i]);
 }
@@ -1169,27 +1182,23 @@ acTransferCommData(const Device device, //
                             const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES;
 
 #if MPI_GPUDIRECT_DISABLED
-                            PackedData src     = data->srcs_host[a_idx];
+                            PackedData* dst    = &data->dsts_host[b_idx];
 #else
-                            PackedData src = data->srcs[a_idx];
-#endif
-#if MPI_GPUDIRECT_DISABLED
-                            PackedData dst     = data->dsts_host[b_idx];
-#else
-                            PackedData dst = data->dsts[b_idx];
+                            PackedData* dst = &data->dsts[b_idx];
 #endif
 
-                            const int3 pid3d    = getPid3D(pid, decomp);
-                            const int npid_recv = getPid(pid3d - neighbor, decomp);
-                            const int npid_send = getPid(pid3d + neighbor, decomp);
+                            const int3 pid3d = getPid3D(pid, decomp);
+                            const int npid   = getPid(pid3d - neighbor, decomp);
 
-                            if (onTheSameNode(pid, npid_recv)) {
-                                MPI_Irecv(dst.data, count, datatype, npid_recv, b_idx,
-                                          MPI_COMM_WORLD, &data->recv_reqs[b_idx]);
+                            if (onTheSameNode(pid, npid)) {
+                                MPI_Irecv(dst->data, count, datatype, npid, b_idx, MPI_COMM_WORLD,
+                                          &data->recv_reqs[b_idx]);
+                                dst->pinned = false;
                             }
                             else {
-                                MPI_Irecv(dst.data_pinned, count, datatype, npid_recv, b_idx,
+                                MPI_Irecv(dst->data_pinned, count, datatype, npid, b_idx,
                                           MPI_COMM_WORLD, &data->recv_reqs[b_idx]);
+                                dst->pinned = true;
                             }
                         }
                     }
@@ -1219,27 +1228,25 @@ acTransferCommData(const Device device, //
                             const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES;
 
 #if MPI_GPUDIRECT_DISABLED
-                            PackedData src     = data->srcs_host[a_idx];
+                            PackedData* src    = &data->srcs_host[a_idx];
 #else
-                            PackedData src = data->srcs[a_idx];
-#endif
-#if MPI_GPUDIRECT_DISABLED
-                            PackedData dst     = data->dsts_host[b_idx];
-#else
-                            PackedData dst = data->dsts[b_idx];
+                            PackedData* src = &data->srcs[a_idx];
 #endif
 
-                            const int3 pid3d    = getPid3D(pid, decomp);
-                            const int npid_recv = getPid(pid3d - neighbor, decomp);
-                            const int npid_send = getPid(pid3d + neighbor, decomp);
+                            const int3 pid3d = getPid3D(pid, decomp);
+                            const int npid   = getPid(pid3d + neighbor, decomp);
 
                             cudaStreamSynchronize(data->streams[a_idx]);
-                            if (onTheSameNode(pid, npid_send)) {
-                                MPI_Isend(src.data, count, datatype, npid_send, b_idx,
-                                          MPI_COMM_WORLD, &data->send_reqs[b_idx]);
+                            if (onTheSameNode(pid, npid)) {
+                                MPI_Isend(src->data, count, datatype, npid, b_idx, MPI_COMM_WORLD,
+                                          &data->send_reqs[b_idx]);
                             }
                             else {
-                                MPI_Isend(src.data_pinned, count, datatype, npid_send, b_idx,
+                                if (!src->pinned) {
+                                    acPinPackedData(device, data->streams[a_idx], src);
+                                    cudaStreamSynchronize(data->streams[a_idx]);
+                                }
+                                MPI_Isend(src->data_pinned, count, datatype, npid, b_idx,
                                           MPI_COMM_WORLD, &data->send_reqs[b_idx]);
                             }
                         }
@@ -1260,7 +1267,6 @@ acTransferCommDataWait(const CommData data)
         MPI_Wait(&data.recv_reqs[i], MPI_STATUS_IGNORE);
     }
 }
-
 #else
 static AcResult
 acTransferCommData(const Device device, //
@@ -1309,14 +1315,16 @@ acTransferCommData(const Device device, //
                             const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES;
 
 #if MPI_GPUDIRECT_DISABLED
-                            PackedData dst     = data->dsts_host[b_idx];
+                            PackedData* dst    = &data->dsts_host[b_idx];
 #else
-                            PackedData dst = data->dsts[b_idx];
+                            PackedData* dst = &data->dsts[b_idx];
 #endif
 
                             const int3 pid3d = getPid3D(pid, decomp);
-                            MPI_Irecv(dst.data, count, datatype, getPid(pid3d - neighbor, decomp),
-                                      b_idx, MPI_COMM_WORLD, &data->recv_reqs[b_idx]);
+                            const int npid   = getPid(pid3d - neighbor, decomp);
+
+                            MPI_Irecv(dst->data, count, datatype, npid, b_idx, MPI_COMM_WORLD,
+                                      &data->recv_reqs[b_idx]);
                         }
                     }
                 }
@@ -1345,16 +1353,16 @@ acTransferCommData(const Device device, //
                             const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES;
 
 #if MPI_GPUDIRECT_DISABLED
-                            PackedData src     = data->srcs_host[a_idx];
+                            PackedData* src    = &data->srcs_host[a_idx];
 #else
-                            PackedData src = data->srcs[a_idx];
+                            PackedData* src = &data->srcs[a_idx];
 #endif
 
                             const int3 pid3d = getPid3D(pid, decomp);
+                            const int npid   = getPid(pid3d + neighbor, decomp);
 
                             cudaStreamSynchronize(data->streams[a_idx]);
-                            MPI_Isend(src.data, count, datatype, getPid(pid3d + neighbor, decomp),
-                                      b_idx, MPI_COMM_WORLD, &data->send_reqs[b_idx]);
+                                MPI_Isend(src->data, count, datatype, npid, b_idx, MPI_COMM_WORLD,
                         }
                     }
                 }
@@ -1724,15 +1732,17 @@ acGridIntegrate(const Stream stream, const AcReal dt)
         acPackCommData(device, sidexz_a0s, &sidexz_data);
         acPackCommData(device, sideyz_a0s, &sideyz_data);
 
-#if AC_MPI_RT_PINNING
-        acPinCommData(device, &corner_data);
-        acPinCommData(device, &edgex_data);
-        acPinCommData(device, &edgey_data);
-        acPinCommData(device, &edgez_data);
-        acPinCommData(device, &sidexy_data);
-        acPinCommData(device, &sidexz_data);
-        acPinCommData(device, &sideyz_data);
-#endif // AC_MPI_RT_PINNING
+        /*
+        #if AC_MPI_RT_PINNING
+                acPinCommData(device, &corner_data);
+                acPinCommData(device, &edgex_data);
+                acPinCommData(device, &edgey_data);
+                acPinCommData(device, &edgez_data);
+                acPinCommData(device, &sidexy_data);
+                acPinCommData(device, &sidexz_data);
+                acPinCommData(device, &sideyz_data);
+        #endif
+        */
 
         //////////// INNER INTEGRATION //////////////
         {
@@ -1788,7 +1798,7 @@ acGridIntegrate(const Stream stream, const AcReal dt)
         acUnpinCommData(device, &sidexy_data);
         acUnpinCommData(device, &sidexz_data);
         acUnpinCommData(device, &sideyz_data);
-#endif // AC_MPI_RT_PINNING
+#endif
 
         acUnpackCommData(device, corner_b0s, &corner_data);
         acUnpackCommData(device, edgex_b0s, &edgex_data);
@@ -1971,6 +1981,20 @@ acGridPeriodicBoundconds(const Stream stream)
     acPackCommData(device, sidexz_a0s, &sidexz_data);
     acPackCommData(device, sideyz_a0s, &sideyz_data);
 
+    /*
+    #if AC_MPI_RT_PINNING
+        acPinCommData(device, &corner_data);
+        acPinCommData(device, &edgex_data);
+        acPinCommData(device, &edgey_data);
+        acPinCommData(device, &edgez_data);
+        acPinCommData(device, &sidexy_data);
+        acPinCommData(device, &sidexz_data);
+        acPinCommData(device, &sideyz_data);
+    #endif
+    */
+
+    MPI_Barrier(MPI_COMM_WORLD);
+
 #if MPI_GPUDIRECT_DISABLED
     acTransferCommDataToHost(device, &corner_data);
     acTransferCommDataToHost(device, &edgex_data);
@@ -2007,6 +2031,16 @@ acGridPeriodicBoundconds(const Stream stream)
     acTransferCommDataToDevice(device, &sideyz_data);
 #endif
 
+#if AC_MPI_RT_PINNING
+    acUnpinCommData(device, &corner_data);
+    acUnpinCommData(device, &edgex_data);
+    acUnpinCommData(device, &edgey_data);
+    acUnpinCommData(device, &edgez_data);
+    acUnpinCommData(device, &sidexy_data);
+    acUnpinCommData(device, &sidexz_data);
+    acUnpinCommData(device, &sideyz_data);
+#endif
+
     acUnpackCommData(device, corner_b0s, &corner_data);
     acUnpackCommData(device, edgex_b0s, &edgex_data);
     acUnpackCommData(device, edgey_b0s, &edgey_data);
diff --git a/src/core/kernels/kernels.h b/src/core/kernels/kernels.h
index d2a3740..fbf24bc 100644
--- a/src/core/kernels/kernels.h
+++ b/src/core/kernels/kernels.h
@@ -3,6 +3,7 @@
 
 #if AC_MPI_ENABLED
 #include <mpi.h>
+#include <stdbool.h>
 
 #define AC_MPI_UNIDIRECTIONAL_COMM (0)
 #define AC_MPI_RT_PINNING (1)
@@ -14,7 +15,8 @@ typedef struct {
 
 #if (AC_MPI_ENABLED && AC_MPI_RT_PINNING)
     AcReal* data_pinned;
-#endif // (AC_MPI_ENABLED && AC_MPI_RT_PINNING)
+    bool pinned; // Set if data was received to pinned memory
+#endif           // (AC_MPI_ENABLED && AC_MPI_RT_PINNING)
 
 #if (AC_MPI_ENABLED && AC_MPI_UNIDIRECTIONAL_COMM)
     MPI_Win win; // MPI window for RMA

From fb41741d74e33488dd0e94d9249c7c0654c219eb Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Tue, 7 Apr 2020 17:58:47 +0300
Subject: [PATCH 10/89] Improvements to samples

---
 samples/benchmark/main.cc     | 86 ++++++++++++++++++++++++++++-------
 samples/bwtest/CMakeLists.txt |  2 +-
 samples/bwtest/main.c         |  5 +-
 src/core/device.cc            |  5 +-
 src/core/kernels/kernels.h    |  4 +-
 5 files changed, 79 insertions(+), 23 deletions(-)

diff --git a/samples/benchmark/main.cc b/samples/benchmark/main.cc
index 4b50bfb..5ab4349 100644
--- a/samples/benchmark/main.cc
+++ b/samples/benchmark/main.cc
@@ -89,33 +89,45 @@ main(void)
         }
     }*/
 
+    /*
+    // Basic
+    const size_t num_iters = 100;
+
     // Warmup
-    for (size_t i = 0; i < 10; ++i)
+    for (size_t i = 0; i < num_iters / 10; ++i)
         acGridIntegrate(STREAM_DEFAULT, FLT_EPSILON);
 
     // Benchmark
     Timer t;
-    const AcReal dt             = FLT_EPSILON;
+    const AcReal dt = FLT_EPSILON;
 
-	acGridSynchronizeStream(STREAM_ALL);
-	timer_reset(&t);
-	acGridSynchronizeStream(STREAM_ALL);
+    acGridSynchronizeStream(STREAM_ALL);
+    timer_reset(&t);
+    acGridSynchronizeStream(STREAM_ALL);
 
-	const size_t num_iters = 50;
-	for (size_t i = 0; i < num_iters; ++i)
-		acGridIntegrate(STREAM_DEFAULT, dt);
+    for (size_t i = 0; i < num_iters; ++i)
+        acGridIntegrate(STREAM_DEFAULT, dt);
 
-	acGridSynchronizeStream(STREAM_ALL);
-	if (!pid)
-		timer_diff_print(t);
-	acGridSynchronizeStream(STREAM_ALL);
-	/*
+    acGridSynchronizeStream(STREAM_ALL);
+    if (!pid)
+        timer_diff_print(t);
+    acGridSynchronizeStream(STREAM_ALL);
+    */
+
+    // Percentiles
     const size_t num_iters      = 100;
     const double nth_percentile = 0.90;
-
     std::vector<double> results; // ms
     results.reserve(num_iters);
 
+    // Warmup
+    for (size_t i = 0; i < num_iters / 10; ++i)
+        acGridIntegrate(STREAM_DEFAULT, FLT_EPSILON);
+
+    // Benchmark
+    Timer t;
+    const AcReal dt = FLT_EPSILON;
+
     for (size_t i = 0; i < num_iters; ++i) {
         acGridSynchronizeStream(STREAM_ALL);
         timer_reset(&t);
@@ -123,9 +135,9 @@ main(void)
         acGridIntegrate(STREAM_DEFAULT, dt);
         acGridSynchronizeStream(STREAM_ALL);
         results.push_back(timer_diff_nsec(t) / 1e6);
+        acGridSynchronizeStream(STREAM_ALL);
     }
 
-    // Write benchmark to file
     if (!pid) {
         std::sort(results.begin(), results.end(),
                   [](const double& a, const double& b) { return a < b; });
@@ -149,7 +161,49 @@ main(void)
         fprintf(fp, "%d, %g\n", nprocs, results[nth_percentile * num_iters]);
 
         fclose(fp);
-    }*/
+    }
+
+    /*
+const size_t num_iters      = 100;
+const double nth_percentile = 0.90;
+
+std::vector<double> results; // ms
+results.reserve(num_iters);
+
+for (size_t i = 0; i < num_iters; ++i) {
+    acGridSynchronizeStream(STREAM_ALL);
+    timer_reset(&t);
+    acGridSynchronizeStream(STREAM_ALL);
+    acGridIntegrate(STREAM_DEFAULT, dt);
+    acGridSynchronizeStream(STREAM_ALL);
+    results.push_back(timer_diff_nsec(t) / 1e6);
+}
+
+// Write benchmark to file
+if (!pid) {
+    std::sort(results.begin(), results.end(),
+              [](const double& a, const double& b) { return a < b; });
+    fprintf(stdout,
+            "Integration step time %g ms (%gth "
+            "percentile)--------------------------------------\n",
+            results[nth_percentile * num_iters], 100 * nth_percentile);
+
+    char path[4096] = "";
+    if (test == TEST_STRONG_SCALING)
+        strncpy(path, "strong_scaling.csv", sizeof(path));
+    else if (test == TEST_WEAK_SCALING)
+        strncpy(path, "weak_scaling.csv", sizeof(path));
+    else
+        ERROR("Invalid test type");
+
+    FILE* fp = fopen(path, "a");
+    ERRCHK_ALWAYS(fp);
+    // Format
+    // nprocs, measured (ms)
+    fprintf(fp, "%d, %g\n", nprocs, results[nth_percentile * num_iters]);
+
+    fclose(fp);
+}*/
 
     acGridQuit();
     MPI_Finalize();
diff --git a/samples/bwtest/CMakeLists.txt b/samples/bwtest/CMakeLists.txt
index 13bf13f..cd4329f 100644
--- a/samples/bwtest/CMakeLists.txt
+++ b/samples/bwtest/CMakeLists.txt
@@ -5,5 +5,5 @@ find_package(OpenMP)
 find_package(CUDAToolkit)
 
 add_executable(bwtest main.c)
-add_compile_options(-O3)
 target_link_libraries(bwtest MPI::MPI_C OpenMP::OpenMP_C CUDA::cudart_static)
+target_compile_options(bwtest PRIVATE -O3)
diff --git a/samples/bwtest/main.c b/samples/bwtest/main.c
index 00c0caf..7f1f9f6 100644
--- a/samples/bwtest/main.c
+++ b/samples/bwtest/main.c
@@ -303,6 +303,7 @@ main(void)
 
     PRINT("Block size: %u MiB\n", BLOCK_SIZE / (1024 * 1024));
 
+#if 0
     {
         uint8_t* src = allocHost(BLOCK_SIZE);
         uint8_t* dst = allocHost(BLOCK_SIZE);
@@ -362,7 +363,7 @@ main(void)
         freeDevice(dst);
     }
     PRINT("\n------------------------\n");
-    /*
+#else
     { // Final run for easy identification with the profiler
         uint8_t* src = allocDevice(BLOCK_SIZE);
         uint8_t* dst = allocDevice(BLOCK_SIZE);
@@ -373,7 +374,7 @@ main(void)
         freeDevice(src);
         freeDevice(dst);
     }
-    */
+#endif
 
     MPI_Finalize();
     return EXIT_SUCCESS;
diff --git a/src/core/device.cc b/src/core/device.cc
index 1b3a284..8ce057f 100644
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -650,7 +650,7 @@ acPinPackedData(const Device device, const cudaStream_t stream, PackedData* ddat
 static void
 acUnpinPackedData(const Device device, const cudaStream_t stream, PackedData* ddata)
 {
-    if (!ddata->pinned)
+    if (!ddata->pinned) // Unpin iff the data was pinned previously
         return;
 
     cudaSetDevice(device->id);
@@ -1362,7 +1362,8 @@ acTransferCommData(const Device device, //
                             const int npid   = getPid(pid3d + neighbor, decomp);
 
                             cudaStreamSynchronize(data->streams[a_idx]);
-                                MPI_Isend(src->data, count, datatype, npid, b_idx, MPI_COMM_WORLD,
+                            MPI_Isend(src->data, count, datatype, npid, b_idx, MPI_COMM_WORLD,
+                                      &data->send_reqs[b_idx]);
                         }
                     }
                 }
diff --git a/src/core/kernels/kernels.h b/src/core/kernels/kernels.h
index fbf24bc..805cbed 100644
--- a/src/core/kernels/kernels.h
+++ b/src/core/kernels/kernels.h
@@ -15,8 +15,8 @@ typedef struct {
 
 #if (AC_MPI_ENABLED && AC_MPI_RT_PINNING)
     AcReal* data_pinned;
-    bool pinned; // Set if data was received to pinned memory
-#endif           // (AC_MPI_ENABLED && AC_MPI_RT_PINNING)
+    bool pinned = false; // Set if data was received to pinned memory
+#endif                   // (AC_MPI_ENABLED && AC_MPI_RT_PINNING)
 
 #if (AC_MPI_ENABLED && AC_MPI_UNIDIRECTIONAL_COMM)
     MPI_Win win; // MPI window for RMA

From ed8a0bf7e6332e4a46ab56dfff20e843a718c79e Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Tue, 7 Apr 2020 18:35:12 +0300
Subject: [PATCH 11/89] Added bwtest and benchmarkscript to CMakeLists

---
 CMakeLists.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 328c23b..be5d4d6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -89,6 +89,8 @@ if (BUILD_SAMPLES)
     add_subdirectory(samples/cpptest)
     add_subdirectory(samples/mpitest)
     add_subdirectory(samples/benchmark)
+    add_subdirectory(samples/bwtest)
+    add_subdirectory(samples/genbenchmarkscripts)
 endif()
 
 if (BUILD_STANDALONE)

From d6e74ee2700d9a20f77488db1955cde788b882a8 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Thu, 9 Apr 2020 19:24:55 +0300
Subject: [PATCH 12/89] Added missing files

---
 samples/genbenchmarkscripts/CMakeLists.txt |  8 ++++
 samples/genbenchmarkscripts/main.c         | 46 ++++++++++++++++++++++
 src/core/CMakeLists.txt                    |  7 +++-
 3 files changed, 59 insertions(+), 2 deletions(-)
 create mode 100644 samples/genbenchmarkscripts/CMakeLists.txt
 create mode 100644 samples/genbenchmarkscripts/main.c

diff --git a/samples/genbenchmarkscripts/CMakeLists.txt b/samples/genbenchmarkscripts/CMakeLists.txt
new file mode 100644
index 0000000..6115fde
--- /dev/null
+++ b/samples/genbenchmarkscripts/CMakeLists.txt
@@ -0,0 +1,8 @@
+add_executable(genbenchmarkscripts main.c)
+
+add_custom_command(
+  TARGET genbenchmarkscripts POST_BUILD
+  COMMAND genbenchmarkscripts
+  WORKING_DIRECTORY ${PROJECT_BINARY_DIR}
+  COMMENT "Generating benchmark scripts"
+)
diff --git a/samples/genbenchmarkscripts/main.c b/samples/genbenchmarkscripts/main.c
new file mode 100644
index 0000000..f9d5506
--- /dev/null
+++ b/samples/genbenchmarkscripts/main.c
@@ -0,0 +1,46 @@
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+int
+main(void)
+{
+    const int max_nprocs = 128;
+    for (int nprocs = 1; nprocs <= max_nprocs; nprocs *= 2) {
+        char filename[4096];
+        sprintf(filename, "benchmark_%d.sh", nprocs);
+
+        FILE* fp = fopen(filename, "w");
+        assert(fp);
+
+        // Boilerplate
+        fprintf(fp, "#!/bin/bash\n");
+        fprintf(fp, "#BATCH --job-name=astaroth\n");
+        fprintf(fp, "#SBATCH --account=project_2000403\n");
+        fprintf(fp, "#SBATCH --time=00:14:59\n");
+        fprintf(fp, "#SBATCH --mem=24000\n");
+        fprintf(fp, "#SBATCH --partition=gputest\n");
+
+        // nprocs, nodes, gpus
+        const int max_gpus_per_node = 4;
+        const int gpus_per_node     = nprocs < max_gpus_per_node ? nprocs : max_gpus_per_node;
+        const int nodes             = (int)ceil((double)nprocs / max_gpus_per_node);
+        fprintf(fp, "#SBATCH --gres=gpu:v100:%d\n", gpus_per_node);
+        fprintf(fp, "#SBATCH -n %d\n", nprocs);
+        fprintf(fp, "#SBATCH -N %d\n", nodes);
+
+        // Modules
+        fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake hpcx-mpi/2.5.0-cuda nccl\n");
+        fprintf(fp, "export UCX_MEMTYPE_CACHE=n\n");
+
+        // Profile and run
+        fprintf(fp, "mkdir -p profile_%d\n", nprocs);
+        fprintf(fp, "srun nvprof --annotate-mpi openmpi -o profile_%d/%%p.nvprof ./benchmark\n",
+                nprocs);
+
+        fclose(fp);
+    }
+
+    return EXIT_SUCCESS;
+}
diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index 152f811..e1d25a1 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -1,11 +1,14 @@
+find_package(CUDAToolkit)
+
 ## Astaroth Core
 add_library(astaroth_core STATIC device.cc node.cc astaroth.cc)
-target_link_libraries(astaroth_core astaroth_utils astaroth_kernels cudart)
+target_link_libraries(astaroth_core astaroth_utils astaroth_kernels CUDA::cudart_static)
 
 ## Options
 if (MPI_ENABLED)
     find_package(MPI)
-    target_link_libraries(astaroth_core MPI::MPI_CXX)
+    find_package(OpenMP)
+    target_link_libraries(astaroth_core MPI::MPI_CXX OpenMP::OpenMP_CXX)
 endif()
 
 if (MULTIGPU_ENABLED)

From d4a84fb88791b4d31d8ad4d6dc35534f9a4cefd6 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Thu, 9 Apr 2020 20:04:54 +0300
Subject: [PATCH 13/89] Added a PCIe bandwidth test

---
 samples/bwtest/main.c | 107 +++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 106 insertions(+), 1 deletion(-)

diff --git a/samples/bwtest/main.c b/samples/bwtest/main.c
index 7f1f9f6..fb9de90 100644
--- a/samples/bwtest/main.c
+++ b/samples/bwtest/main.c
@@ -225,6 +225,36 @@ sendrecv_nonblocking_multiple_rt_pinning(uint8_t* src, uint8_t* dst)
     }
 }
 
+static void
+send_d2h(const uint8_t* src, uint8_t* dst)
+{
+    cudaMemcpy(dst, src, BLOCK_SIZE, cudaMemcpyDeviceToHost);
+}
+
+static void
+send_h2d(const uint8_t* src, uint8_t* dst)
+{
+    cudaMemcpy(dst, src, BLOCK_SIZE, cudaMemcpyHostToDevice);
+}
+
+
+static void
+sendrecv_d2h2d(const uint8_t* dsrc, uint8_t* hdst, const uint8_t* hsrc, uint8_t* ddst)
+{
+    cudaStream_t d2h, h2d;
+    cudaStreamCreate(&d2h);
+    cudaStreamCreate(&h2d);
+
+    cudaMemcpyAsync(hdst, dsrc, BLOCK_SIZE, cudaMemcpyDeviceToHost, d2h);
+    cudaMemcpyAsync(ddst, hsrc, BLOCK_SIZE, cudaMemcpyHostToDevice, h2d);
+
+    cudaStreamSynchronize(d2h);
+    cudaStreamSynchronize(h2d);
+
+    cudaStreamDestroy(d2h);
+    cudaStreamDestroy(h2d);
+}
+
 #define PRINT                                                                                      \
     if (!pid)                                                                                      \
     printf
@@ -267,6 +297,45 @@ measurebw(const char* msg, const size_t bytes, void (*sendrecv)(uint8_t*, uint8_
     MPI_Barrier(MPI_COMM_WORLD);
 }
 
+
+static void
+measurebw2(const char* msg, const size_t bytes, void (*sendrecv)(const uint8_t*, uint8_t*, const uint8_t*, uint8_t*), const uint8_t* dsrc, uint8_t* hdst,
+                                                                                                            const uint8_t* hsrc, uint8_t* ddst)
+{
+    const size_t num_samples = 100;
+
+    int pid, nprocs;
+    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+
+    PRINT("%s\n", msg);
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    PRINT("\tWarming up... ");
+    for (size_t i = 0; i < num_samples / 10; ++i)
+        sendrecv(dsrc, hdst, hsrc, ddst);
+
+    MPI_Barrier(MPI_COMM_WORLD);
+    PRINT("Done\n");
+
+    PRINT("\tBandwidth... ");
+    fflush(stdout);
+
+    Timer t;
+    MPI_Barrier(MPI_COMM_WORLD);
+    timer_reset(&t);
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    for (size_t i = 0; i < num_samples; ++i)
+        sendrecv(dsrc, hdst, hsrc, ddst);
+
+    MPI_Barrier(MPI_COMM_WORLD);
+    const long double time_elapsed = timer_diff_nsec(t) / 1e9l; // seconds
+    PRINT("%Lg GiB/s\n", num_samples * bytes / time_elapsed / (1024 * 1024 * 1024));
+    PRINT("\tTransfer time: %Lg ms\n", time_elapsed * 1000 / num_samples);
+    MPI_Barrier(MPI_COMM_WORLD);
+}
+
 int
 main(void)
 {
@@ -303,7 +372,7 @@ main(void)
 
     PRINT("Block size: %u MiB\n", BLOCK_SIZE / (1024 * 1024));
 
-#if 0
+#if 1
     {
         uint8_t* src = allocHost(BLOCK_SIZE);
         uint8_t* dst = allocHost(BLOCK_SIZE);
@@ -363,6 +432,42 @@ main(void)
         freeDevice(dst);
     }
     PRINT("\n------------------------\n");
+    
+    {
+        uint8_t* hsrc = allocHost(BLOCK_SIZE);
+        uint8_t* hdst = allocHost(BLOCK_SIZE);
+        uint8_t* dsrc = allocDevice(BLOCK_SIZE);
+        uint8_t* ddst = allocDevice(BLOCK_SIZE);
+
+        measurebw("Unidirectional D2H", BLOCK_SIZE, send_d2h, dsrc, hdst);
+        measurebw("Unidirectional H2D", BLOCK_SIZE, send_h2d, hsrc, ddst);
+
+        measurebw2("Bidirectional D2H & H2D", 2 * BLOCK_SIZE, sendrecv_d2h2d, dsrc, hdst, hsrc, ddst);
+
+        freeDevice(dsrc);
+        freeDevice(ddst);
+        freeHost(hsrc);
+        freeHost(hdst);
+    }
+    PRINT("\n------------------------\n");
+    
+    {
+        uint8_t* hsrc = allocHost(BLOCK_SIZE);
+        uint8_t* hdst = allocHost(BLOCK_SIZE);
+        uint8_t* dsrc = allocDevicePinned(BLOCK_SIZE);
+        uint8_t* ddst = allocDevicePinned(BLOCK_SIZE);
+
+        measurebw("Unidirectional D2H (pinned)", BLOCK_SIZE, send_d2h, dsrc, hdst);
+        measurebw("Unidirectional H2D (pinned)", BLOCK_SIZE, send_h2d, hsrc, ddst);
+
+        measurebw2("Bidirectional D2H & H2D (pinned)", 2 * BLOCK_SIZE, sendrecv_d2h2d, dsrc, hdst, hsrc, ddst);
+
+        freeDevice(dsrc);
+        freeDevice(ddst);
+        freeHost(hsrc);
+        freeHost(hdst);
+    }
+    PRINT("\n------------------------\n");
 #else
     { // Final run for easy identification with the profiler
         uint8_t* src = allocDevice(BLOCK_SIZE);

From 9cd5909f5a88a01e646a76973875464ad94686ee Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Thu, 9 Apr 2020 20:28:04 +0300
Subject: [PATCH 14/89] BWtest calculates now aggregate bandwidths per process
 instead of assuming that all neighbor communication can be done in parallel
 (Within a node one can have parallel P2P connections to all neighbors and we
 have an insane total bandwidth, but this is not the case with network, we
 seem to have only one bidirectional socket)

---
 samples/bwtest/main.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/samples/bwtest/main.c b/samples/bwtest/main.c
index fb9de90..ada1721 100644
--- a/samples/bwtest/main.c
+++ b/samples/bwtest/main.c
@@ -384,9 +384,9 @@ main(void)
         measurebw("Bidirectional bandwidth, twoway (Host)", //
                   2 * BLOCK_SIZE, sendrecv_twoway, src, dst);
         measurebw("Bidirectional bandwidth, async multiple (Host)", //
-                  2 * BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);
-        measurebw("Bidirectional bandwidth, async multiple parallel (Host)", //
-                  2 * BLOCK_SIZE, sendrecv_nonblocking_multiple_parallel, src, dst);
+                  2 * (nprocs-1) * BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);
+        //measurebw("Bidirectional bandwidth, async multiple parallel (Host)", //
+        //          2 * (nprocs-1) * BLOCK_SIZE, sendrecv_nonblocking_multiple_parallel, src, dst);
 
         freeHost(src);
         freeHost(dst);
@@ -404,11 +404,11 @@ main(void)
         measurebw("Bidirectional bandwidth, twoway (Device)", //
                   2 * BLOCK_SIZE, sendrecv_twoway, src, dst);
         measurebw("Bidirectional bandwidth, async multiple (Device)", //
-                  2 * BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);
-        measurebw("Bidirectional bandwidth, async multiple parallel (Device)", //
-                  2 * BLOCK_SIZE, sendrecv_nonblocking_multiple_parallel, src, dst);
+                  2 *  (nprocs-1) *BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);
+        //measurebw("Bidirectional bandwidth, async multiple parallel (Device)", //
+        //          2 *  (nprocs-1) *BLOCK_SIZE, sendrecv_nonblocking_multiple_parallel, src, dst);
         measurebw("Bidirectional bandwidth, async multiple (Device, rt pinning)", //
-                  2 * BLOCK_SIZE, sendrecv_nonblocking_multiple_rt_pinning, src, dst);
+                  2 *  (nprocs-1) *BLOCK_SIZE, sendrecv_nonblocking_multiple_rt_pinning, src, dst);
 
         freeDevice(src);
         freeDevice(dst);
@@ -426,7 +426,7 @@ main(void)
         measurebw("Bidirectional bandwidth, twoway (Device, pinned)", //
                   2 * BLOCK_SIZE, sendrecv_twoway, src, dst);
         measurebw("Bidirectional bandwidth, async multiple (Device, pinned)", //
-                  2 * BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);
+                  2 *  (nprocs-1) *BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);
 
         freeDevice(src);
         freeDevice(dst);

From 8c210b3292bf5901fd38ed3673da782f8750f592 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Sun, 19 Apr 2020 22:31:57 +0300
Subject: [PATCH 15/89] 3D decomposition is now done using Morton order instead
 of linear indexing

---
 src/core/device.cc | 57 ++++++++++++++++++++++++++++++++++++----------
 1 file changed, 45 insertions(+), 12 deletions(-)

diff --git a/src/core/device.cc b/src/core/device.cc
index 8ce057f..6ac849e 100644
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -468,29 +468,62 @@ mod(const int a, const int b)
     const int r = a % b;
     return r < 0 ? r + b : r;
 }
+#include <stdint.h>
 
-static int
-getPid(const int3 pid, const int3 decomposition)
+int
+getPid(int3 pid, const int3 decomposition)
 {
-    return mod(pid.x, decomposition.x) +                   //
-           mod(pid.y, decomposition.y) * decomposition.x + //
-           mod(pid.z, decomposition.z) * decomposition.x * decomposition.y;
+    /*
+      return mod(pid.x, decomposition.x) +                   //
+             mod(pid.y, decomposition.y) * decomposition.x + //
+             mod(pid.z, decomposition.z) * decomposition.x * decomposition.y;
+
+    */
+    pid.x = mod(pid.x, decomposition.x);
+    pid.y = mod(pid.y, decomposition.y);
+    pid.z = mod(pid.z, decomposition.z);
+
+    uint64_t i = 0;
+    for (int bit = 0; bit <= 21; ++bit) {
+        const uint64_t mask = 0x1l << bit;
+        i |= (((uint64_t)pid.x & mask) << 0) << 2 * bit;
+        i |= (((uint64_t)pid.y & mask) << 1) << 2 * bit;
+        i |= (((uint64_t)pid.z & mask) << 2) << 2 * bit;
+    }
+    return (int)i;
+    
 }
 
-static int3
+int3
 getPid3D(const int pid, const int3 decomposition)
 {
-    const int3 pid3d = (int3){
-        mod(pid, decomposition.x),
-        mod(pid / decomposition.x, decomposition.y),
-        (pid / (decomposition.x * decomposition.y)),
-    };
+ /*   
+  const int3 pid3d = (int3){
+      mod(pid, decomposition.x),
+      mod(pid / decomposition.x, decomposition.y),
+      (pid / (decomposition.x * decomposition.y)),
+  };
+     
+  ERRCHK_ALWAYS(getPid(pid3d, decomposition) == pid);
+  return pid3d;
+  */
+    uint64_t i, j, k;
+    i = j = k = 0;
+    for (int bit = 0; bit <= 21; ++bit) {
+        const uint64_t mask = 0x1l << 3 * bit;
+        i |= (((uint64_t)pid & (mask << 0)) >> 2 * bit) >> 0;
+        j |= (((uint64_t)pid & (mask << 1)) >> 2 * bit) >> 1;
+        k |= (((uint64_t)pid & (mask << 2)) >> 2 * bit) >> 2;
+    }
+    const int3 pid3d = (int3){i, j, k};
+    ERRCHK(getPid(pid3d, decomposition) == pid);
     return pid3d;
+    
 }
 
 /** Note: assumes that contiguous pids are on the same node and there is one process per GPU. I.e.
  * pids are linearly mapped i + j * dx + k * dx * dy. */
-static bool
+static inline bool
 onTheSameNode(const int pid_a, const int pid_b)
 {
     int devices_per_node = -1;

From ffb274e16f59a63f1fe01fa0697ac4373ad04370 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Sun, 19 Apr 2020 22:33:01 +0300
Subject: [PATCH 16/89] Linking dynamic CUDA library instead of static (less
 prone to breaking since Astaroth does not have to be rebuilt when CUDA is
 updated)

---
 src/core/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index e1d25a1..7d93fd2 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -2,7 +2,7 @@ find_package(CUDAToolkit)
 
 ## Astaroth Core
 add_library(astaroth_core STATIC device.cc node.cc astaroth.cc)
-target_link_libraries(astaroth_core astaroth_utils astaroth_kernels CUDA::cudart_static)
+target_link_libraries(astaroth_core astaroth_utils astaroth_kernels CUDA::cudart)
 
 ## Options
 if (MPI_ENABLED)

From 4dd825f574e876da6d45c1d778887295524bce1c Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Sun, 19 Apr 2020 22:50:26 +0300
Subject: [PATCH 17/89] Proper decomposition when using Morton order to
 partition the computational domain

---
 src/core/device.cc | 47 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/src/core/device.cc b/src/core/device.cc
index 6ac849e..273834d 100644
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -470,6 +470,38 @@ mod(const int a, const int b)
 }
 #include <stdint.h>
 
+typedef struct {
+    uint64_t x, y, z;
+} uint3_64;
+
+static uint3_64
+morton3D(const uint64_t pid)
+{
+    uint64_t i, j, k;
+    i = j = k = 0;
+    for (int bit = 0; bit <= 21; ++bit) {
+        const uint64_t mask = 0x1l << 3 * bit;
+        i |= ((pid & (mask << 0)) >> 2 * bit) >> 0;
+        j |= ((pid & (mask << 1)) >> 2 * bit) >> 1;
+        k |= ((pid & (mask << 2)) >> 2 * bit) >> 2;
+    }
+
+    return (uint3_64){i, j, k};
+}
+
+static uint64_t
+morton1D(const uint3_64 pid)
+{
+    uint64_t i = 0;
+    for (int bit = 0; bit <= 21; ++bit) {
+        const uint64_t mask = 0x1l << bit;
+        i |= ((pid.x & mask) << 0) << 2 * bit;
+        i |= ((pid.y & mask) << 1) << 2 * bit;
+        i |= ((pid.z & mask) << 2) << 2 * bit;
+    }
+    return i;
+}
+
 int
 getPid(int3 pid, const int3 decomposition)
 {
@@ -538,6 +570,20 @@ onTheSameNode(const int pid_a, const int pid_b)
 static int3
 decompose(const int target)
 {
+  // This is just so beautifully elegant. Complex and efficient decomposition
+  // in just one line of code.
+  uint3_64 p = morton3D(target - 1);
+  p = (uint3_64){p.x + 1, p.y + 1, p.z + 1};
+  
+  if (p.x * p.y * p.z != target) {
+        fprintf(stderr, "Invalid number of processes! Cannot decompose the problem domain!\n");
+        fprintf(stderr, "Target nprocs: %d. Found: %d\n", target, p.x * p.y * p.z);
+        ERROR("Invalid nprocs");
+        return (int3){-1, -1, -1};
+  }
+
+  return (int3){p.x, p.y, p.z};
+  /*
     if (target == 16)
         return (int3){4, 2, 2};
     if (target == 32)
@@ -565,6 +611,7 @@ decompose(const int target)
     else {
         return (int3){decomposition[0], decomposition[1], decomposition[2]};
     }
+  */
 }
 
 static PackedData

From 22e01b7f1d1f98fa383cd4923e6ca3c1079c00f5 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Sun, 19 Apr 2020 23:23:23 +0300
Subject: [PATCH 18/89] Rewrote partitioning code

---
 src/core/device.cc | 178 ++++++++++++++++-----------------------------
 1 file changed, 64 insertions(+), 114 deletions(-)

diff --git a/src/core/device.cc b/src/core/device.cc
index 273834d..4a81246 100644
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -462,18 +462,31 @@ acDeviceReduceVec(const Device device, const Stream stream, const ReductionType
 #if AC_MPI_ENABLED
 #include <mpi.h>
 
-static int
-mod(const int a, const int b)
-{
-    const int r = a % b;
-    return r < 0 ? r + b : r;
-}
 #include <stdint.h>
 
 typedef struct {
     uint64_t x, y, z;
 } uint3_64;
 
+static uint3_64
+operator+(const uint3_64& a, const uint3_64& b)
+{
+    return (uint3_64){a.x + b.x, a.y + b.y, a.z + b.z};
+}
+
+static int3
+make_int3(const uint3_64 a)
+{
+    return (int3){(int)a.x, (int)a.y, (int)a.z};
+}
+
+static uint64_t
+mod(const int a, const int b)
+{
+    const int r = a % b;
+    return r < 0 ? r + b : r;
+}
+
 static uint3_64
 morton3D(const uint64_t pid)
 {
@@ -502,118 +515,55 @@ morton1D(const uint3_64 pid)
     return i;
 }
 
-int
-getPid(int3 pid, const int3 decomposition)
+static uint3_64
+decompose(const uint64_t target)
 {
-    /*
-      return mod(pid.x, decomposition.x) +                   //
-             mod(pid.y, decomposition.y) * decomposition.x + //
-             mod(pid.z, decomposition.z) * decomposition.x * decomposition.y;
+    // This is just so beautifully elegant. Complex and efficient decomposition
+    // in just one line of code.
+    uint3_64 p = morton3D(target - 1) + (uint3_64){1, 1, 1};
 
-    */
-    pid.x = mod(pid.x, decomposition.x);
-    pid.y = mod(pid.y, decomposition.y);
-    pid.z = mod(pid.z, decomposition.z);
-
-    uint64_t i = 0;
-    for (int bit = 0; bit <= 21; ++bit) {
-        const uint64_t mask = 0x1l << bit;
-        i |= (((uint64_t)pid.x & mask) << 0) << 2 * bit;
-        i |= (((uint64_t)pid.y & mask) << 1) << 2 * bit;
-        i |= (((uint64_t)pid.z & mask) << 2) << 2 * bit;
-    }
-    return (int)i;
-    
+    ERRCHK_ALWAYS(p.x * p.y * p.z == target);
+    return p;
 }
 
-int3
-getPid3D(const int pid, const int3 decomposition)
+static uint3_64
+wrap(const int3 i, const uint3_64 n)
 {
- /*   
-  const int3 pid3d = (int3){
-      mod(pid, decomposition.x),
-      mod(pid / decomposition.x, decomposition.y),
-      (pid / (decomposition.x * decomposition.y)),
-  };
-     
-  ERRCHK_ALWAYS(getPid(pid3d, decomposition) == pid);
-  return pid3d;
-  */
-    uint64_t i, j, k;
-    i = j = k = 0;
-    for (int bit = 0; bit <= 21; ++bit) {
-        const uint64_t mask = 0x1l << 3 * bit;
-        i |= (((uint64_t)pid & (mask << 0)) >> 2 * bit) >> 0;
-        j |= (((uint64_t)pid & (mask << 1)) >> 2 * bit) >> 1;
-        k |= (((uint64_t)pid & (mask << 2)) >> 2 * bit) >> 2;
-    }
-    const int3 pid3d = (int3){i, j, k};
-    ERRCHK(getPid(pid3d, decomposition) == pid);
-    return pid3d;
-    
+    return (uint3_64){
+        mod(i.x, n.x),
+        mod(i.y, n.y),
+        mod(i.z, n.z),
+    };
 }
 
-/** Note: assumes that contiguous pids are on the same node and there is one process per GPU. I.e.
- * pids are linearly mapped i + j * dx + k * dx * dy. */
+static int
+getPid(const int3 pid_raw, const uint3_64 decomp)
+{
+    const uint3_64 pid = wrap(pid_raw, decomp);
+    return (int)morton1D(pid);
+}
+
+static int3
+getPid3D(const uint64_t pid, const uint3_64 decomp)
+{
+    const uint3_64 pid3D = morton3D(pid);
+    ERRCHK_ALWAYS(getPid(make_int3(pid3D), decomp) == (int)pid);
+    return (int3){(int)pid3D.x, (int)pid3D.y, (int)pid3D.z};
+}
+
+/** Assumes that contiguous pids are on the same node and there is one process per GPU. */
 static inline bool
-onTheSameNode(const int pid_a, const int pid_b)
+onTheSameNode(const uint64_t pid_a, const uint64_t pid_b)
 {
     int devices_per_node = -1;
     cudaGetDeviceCount(&devices_per_node);
 
-    const int node_a = pid_a / devices_per_node;
-    const int node_b = pid_b / devices_per_node;
+    const uint64_t node_a = pid_a / devices_per_node;
+    const uint64_t node_b = pid_b / devices_per_node;
 
     return node_a == node_b;
 }
 
-static int3
-decompose(const int target)
-{
-  // This is just so beautifully elegant. Complex and efficient decomposition
-  // in just one line of code.
-  uint3_64 p = morton3D(target - 1);
-  p = (uint3_64){p.x + 1, p.y + 1, p.z + 1};
-  
-  if (p.x * p.y * p.z != target) {
-        fprintf(stderr, "Invalid number of processes! Cannot decompose the problem domain!\n");
-        fprintf(stderr, "Target nprocs: %d. Found: %d\n", target, p.x * p.y * p.z);
-        ERROR("Invalid nprocs");
-        return (int3){-1, -1, -1};
-  }
-
-  return (int3){p.x, p.y, p.z};
-  /*
-    if (target == 16)
-        return (int3){4, 2, 2};
-    if (target == 32)
-        return (int3){4, 4, 2};
-    if (target == 128)
-        return (int3){8, 4, 4};
-    if (target == 256)
-        return (int3){8, 8, 4};
-
-    int decomposition[] = {1, 1, 1};
-
-    int axis = 0;
-    while (decomposition[0] * decomposition[1] * decomposition[2] < target) {
-        ++decomposition[axis];
-        axis = (axis + 1) % 3;
-    }
-
-    const int found = decomposition[0] * decomposition[1] * decomposition[2];
-    if (found != target) {
-        fprintf(stderr, "Invalid number of processes! Cannot decompose the problem domain!\n");
-        fprintf(stderr, "Target nprocs: %d. Next allowed: %d\n", target, found);
-        ERROR("Invalid nprocs");
-        return (int3){-1, -1, -1};
-    }
-    else {
-        return (int3){decomposition[0], decomposition[1], decomposition[2]};
-    }
-  */
-}
-
 static PackedData
 acCreatePackedData(const int3 dims)
 {
@@ -746,7 +696,7 @@ acUnpinPackedData(const Device device, const cudaStream_t stream, PackedData* dd
 
 // TODO: do with packed data
 static AcResult
-acDeviceDistributeMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* dst)
+acDeviceDistributeMeshMPI(const AcMesh src, const uint3_64 decomposition, AcMesh* dst)
 {
     MPI_Barrier(MPI_COMM_WORLD);
     printf("Distributing mesh...\n");
@@ -822,7 +772,7 @@ acDeviceDistributeMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* ds
 
 // TODO: do with packed data
 static AcResult
-acDeviceGatherMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* dst)
+acDeviceGatherMeshMPI(const AcMesh src, const uint3_64 decomposition, AcMesh* dst)
 {
     MPI_Barrier(MPI_COMM_WORLD);
     printf("Gathering mesh...\n");
@@ -1038,7 +988,7 @@ acTransferCommDataToDevice(const Device device, CommData* data)
 #endif
 
 #if AC_MPI_RT_PINNING
-static void
+static inline void
 acPinCommData(const Device device, CommData* data)
 {
     cudaSetDevice(device->id);
@@ -1077,7 +1027,7 @@ acTransferCommData(const Device device, //
     int nprocs, pid;
     MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
     MPI_Comm_rank(MPI_COMM_WORLD, &pid);
-    const int3 decomp = decompose(nprocs);
+    const uint3_64 decomp = decompose(nprocs);
 
     const int3 nn = (int3){
         device->local_config.int_params[AC_nx],
@@ -1230,7 +1180,7 @@ acTransferCommData(const Device device, //
     int nprocs, pid;
     MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
     MPI_Comm_rank(MPI_COMM_WORLD, &pid);
-    const int3 decomp = decompose(nprocs);
+    const uint3_64 decomp = decompose(nprocs);
 
     const int3 nn = (int3){
         device->local_config.int_params[AC_nx],
@@ -1363,7 +1313,7 @@ acTransferCommData(const Device device, //
     int nprocs, pid;
     MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
     MPI_Comm_rank(MPI_COMM_WORLD, &pid);
-    const int3 decomp = decompose(nprocs);
+    const uint3_64 decomp = decompose(nprocs);
 
     const int3 nn = (int3){
         device->local_config.int_params[AC_nx],
@@ -1467,7 +1417,7 @@ acTransferCommDataWait(const CommData data)
 typedef struct {
     Device device;
     AcMesh submesh;
-    int3 decomposition;
+    uint3_64 decomposition;
     bool initialized;
 
     int3 nn;
@@ -1508,11 +1458,11 @@ acGridInit(const AcMeshInfo info)
     printf("Processor %s. Process %d of %d.\n", processor_name, pid, nprocs);
 
     // Decompose
-    AcMeshInfo submesh_info  = info;
-    const int3 decomposition = decompose(nprocs);
-    const int3 pid3d         = getPid3D(pid, decomposition);
+    AcMeshInfo submesh_info      = info;
+    const uint3_64 decomposition = decompose(nprocs);
+    const int3 pid3d             = getPid3D(pid, decomposition);
 
-    printf("Decomposition: %d, %d, %d\n", decomposition.x, decomposition.y, decomposition.z);
+    printf("Decomposition: %lu, %lu, %lu\n", decomposition.x, decomposition.y, decomposition.z);
     printf("Process %d: (%d, %d, %d)\n", pid, pid3d.x, pid3d.y, pid3d.z);
     ERRCHK_ALWAYS(info.int_params[AC_nx] % decomposition.x == 0);
     ERRCHK_ALWAYS(info.int_params[AC_ny] % decomposition.y == 0);
@@ -1650,7 +1600,7 @@ acGridQuit(void)
     acDestroyCommData(grid.device, &grid.sideyz_data);
 
     grid.initialized   = false;
-    grid.decomposition = (int3){-1, -1, -1};
+    grid.decomposition = (uint3_64){0, 0, 0};
     acMeshDestroy(&grid.submesh);
     acDeviceDestroy(grid.device);
 

From c93b3265e6eb66599e5f9606cdc5a4cbb98a0acf Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Wed, 22 Apr 2020 17:03:53 +0300
Subject: [PATCH 19/89] Made comm streams high prio

---
 src/core/device.cc | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/core/device.cc b/src/core/device.cc
index 4a81246..7a75496 100644
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -576,6 +576,7 @@ acCreatePackedData(const int3 dims)
 
 #if AC_MPI_RT_PINNING
     ERRCHK_CUDA_ALWAYS(cudaMallocHost((void**)&data.data_pinned, bytes));
+    //ERRCHK_CUDA_ALWAYS(cudaMallocManaged((void**)&data.data_pinned, bytes)); // Significantly slower than pinned (38 ms vs. 125 ms)
 #endif // AC_MPI_RT_PINNING
 
 #if AC_MPI_UNIDIRECTIONAL_COMM
@@ -674,7 +675,7 @@ acPinPackedData(const Device device, const cudaStream_t stream, PackedData* ddat
     const size_t bytes = ddata->dims.x * ddata->dims.y * ddata->dims.z * sizeof(ddata->data[0]) *
                          NUM_VTXBUF_HANDLES;
     ERRCHK_CUDA(
-        cudaMemcpyAsync(ddata->data_pinned, ddata->data, bytes, cudaMemcpyDeviceToHost, stream));
+        cudaMemcpyAsync(ddata->data_pinned, ddata->data, bytes, cudaMemcpyDefault, stream));
 }
 
 static void
@@ -690,7 +691,7 @@ acUnpinPackedData(const Device device, const cudaStream_t stream, PackedData* dd
     const size_t bytes = ddata->dims.x * ddata->dims.y * ddata->dims.z * sizeof(ddata->data[0]) *
                          NUM_VTXBUF_HANDLES;
     ERRCHK_CUDA(
-        cudaMemcpyAsync(ddata->data, ddata->data_pinned, bytes, cudaMemcpyHostToDevice, stream));
+        cudaMemcpyAsync(ddata->data, ddata->data_pinned, bytes, cudaMemcpyDefault, stream));
 }
 #endif // AC_MPI_RT_PINNING
 
@@ -906,7 +907,9 @@ acCreateCommData(const Device device, const int3 dims, const size_t count)
         data.dsts_host[i] = acCreatePackedDataHost(dims);
 #endif
 
-        cudaStreamCreate(&data.streams[i]);
+        int low_prio, high_prio;
+        cudaDeviceGetStreamPriorityRange(&low_prio, &high_prio);
+        cudaStreamCreateWithPriority(&data.streams[i], cudaStreamNonBlocking, high_prio);
     }
 
     return data;

From ec59cdb97344bcad01987e8b09317d00334b4870 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Tue, 26 May 2020 18:57:46 +0300
Subject: [PATCH 20/89] Some formatting and unimportant changes to samples

---
 samples/genbenchmarkscripts/main.c | 4 ++--
 samples/mpitest/main.cc            | 3 ++-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/samples/genbenchmarkscripts/main.c b/samples/genbenchmarkscripts/main.c
index f9d5506..8d35ae9 100644
--- a/samples/genbenchmarkscripts/main.c
+++ b/samples/genbenchmarkscripts/main.c
@@ -19,8 +19,8 @@ main(void)
         fprintf(fp, "#BATCH --job-name=astaroth\n");
         fprintf(fp, "#SBATCH --account=project_2000403\n");
         fprintf(fp, "#SBATCH --time=00:14:59\n");
-        fprintf(fp, "#SBATCH --mem=24000\n");
-        fprintf(fp, "#SBATCH --partition=gputest\n");
+        fprintf(fp, "#SBATCH --mem=32000\n");
+        fprintf(fp, "#SBATCH --partition=gpu\n");
 
         // nprocs, nodes, gpus
         const int max_gpus_per_node = 4;
diff --git a/samples/mpitest/main.cc b/samples/mpitest/main.cc
index 7315df2..8d0a4fc 100644
--- a/samples/mpitest/main.cc
+++ b/samples/mpitest/main.cc
@@ -74,7 +74,8 @@ main(void)
 int
 main(void)
 {
-    printf("The library was built without MPI support, cannot run mpitest. Rebuild Astaroth with cmake -DMPI_ENABLED=ON .. to enable.\n");
+    printf("The library was built without MPI support, cannot run mpitest. Rebuild Astaroth with "
+           "cmake -DMPI_ENABLED=ON .. to enable.\n");
     return EXIT_FAILURE;
 }
 #endif // AC_MPI_ENABLES

From 7e59ea0effdecb975759c0dcf9b041a4ac6f086e Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Tue, 26 May 2020 19:00:14 +0300
Subject: [PATCH 21/89] MPI: corners are no longer communicated. Slight
 performance impact (14 ms vs 15 ms). Tests still pass with 8 GPUs.

---
 src/core/device.cc | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/src/core/device.cc b/src/core/device.cc
index 7a75496..c5105e7 100644
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -576,7 +576,8 @@ acCreatePackedData(const int3 dims)
 
 #if AC_MPI_RT_PINNING
     ERRCHK_CUDA_ALWAYS(cudaMallocHost((void**)&data.data_pinned, bytes));
-    //ERRCHK_CUDA_ALWAYS(cudaMallocManaged((void**)&data.data_pinned, bytes)); // Significantly slower than pinned (38 ms vs. 125 ms)
+    // ERRCHK_CUDA_ALWAYS(cudaMallocManaged((void**)&data.data_pinned, bytes)); // Significantly
+    // slower than pinned (38 ms vs. 125 ms)
 #endif // AC_MPI_RT_PINNING
 
 #if AC_MPI_UNIDIRECTIONAL_COMM
@@ -674,8 +675,7 @@ acPinPackedData(const Device device, const cudaStream_t stream, PackedData* ddat
 
     const size_t bytes = ddata->dims.x * ddata->dims.y * ddata->dims.z * sizeof(ddata->data[0]) *
                          NUM_VTXBUF_HANDLES;
-    ERRCHK_CUDA(
-        cudaMemcpyAsync(ddata->data_pinned, ddata->data, bytes, cudaMemcpyDefault, stream));
+    ERRCHK_CUDA(cudaMemcpyAsync(ddata->data_pinned, ddata->data, bytes, cudaMemcpyDefault, stream));
 }
 
 static void
@@ -690,8 +690,7 @@ acUnpinPackedData(const Device device, const cudaStream_t stream, PackedData* dd
 
     const size_t bytes = ddata->dims.x * ddata->dims.y * ddata->dims.z * sizeof(ddata->data[0]) *
                          NUM_VTXBUF_HANDLES;
-    ERRCHK_CUDA(
-        cudaMemcpyAsync(ddata->data, ddata->data_pinned, bytes, cudaMemcpyDefault, stream));
+    ERRCHK_CUDA(cudaMemcpyAsync(ddata->data, ddata->data_pinned, bytes, cudaMemcpyDefault, stream));
 }
 #endif // AC_MPI_RT_PINNING
 
@@ -1758,7 +1757,7 @@ acGridIntegrate(const Stream stream, const AcReal dt)
     };
 
     for (int isubstep = 0; isubstep < 3; ++isubstep) {
-        acPackCommData(device, corner_a0s, &corner_data);
+        // acPackCommData(device, corner_a0s, &corner_data);
         acPackCommData(device, edgex_a0s, &edgex_data);
         acPackCommData(device, edgey_a0s, &edgey_data);
         acPackCommData(device, edgez_a0s, &edgez_data);
@@ -1789,7 +1788,7 @@ acGridIntegrate(const Stream stream, const AcReal dt)
         MPI_Barrier(MPI_COMM_WORLD);
 
 #if MPI_GPUDIRECT_DISABLED
-        acTransferCommDataToHost(device, &corner_data);
+        // acTransferCommDataToHost(device, &corner_data);
         acTransferCommDataToHost(device, &edgex_data);
         acTransferCommDataToHost(device, &edgey_data);
         acTransferCommDataToHost(device, &edgez_data);
@@ -1798,7 +1797,7 @@ acGridIntegrate(const Stream stream, const AcReal dt)
         acTransferCommDataToHost(device, &sideyz_data);
 #endif
 
-        acTransferCommData(device, corner_a0s, corner_b0s, &corner_data);
+        // acTransferCommData(device, corner_a0s, corner_b0s, &corner_data);
         acTransferCommData(device, edgex_a0s, edgex_b0s, &edgex_data);
         acTransferCommData(device, edgey_a0s, edgey_b0s, &edgey_data);
         acTransferCommData(device, edgez_a0s, edgez_b0s, &edgez_data);
@@ -1806,7 +1805,7 @@ acGridIntegrate(const Stream stream, const AcReal dt)
         acTransferCommData(device, sidexz_a0s, sidexz_b0s, &sidexz_data);
         acTransferCommData(device, sideyz_a0s, sideyz_b0s, &sideyz_data);
 
-        acTransferCommDataWait(corner_data);
+        // acTransferCommDataWait(corner_data);
         acTransferCommDataWait(edgex_data);
         acTransferCommDataWait(edgey_data);
         acTransferCommDataWait(edgez_data);
@@ -1815,7 +1814,7 @@ acGridIntegrate(const Stream stream, const AcReal dt)
         acTransferCommDataWait(sideyz_data);
 
 #if MPI_GPUDIRECT_DISABLED
-        acTransferCommDataToDevice(device, &corner_data);
+        // acTransferCommDataToDevice(device, &corner_data);
         acTransferCommDataToDevice(device, &edgex_data);
         acTransferCommDataToDevice(device, &edgey_data);
         acTransferCommDataToDevice(device, &edgez_data);
@@ -1825,7 +1824,7 @@ acGridIntegrate(const Stream stream, const AcReal dt)
 #endif
 
 #if AC_MPI_RT_PINNING
-        acUnpinCommData(device, &corner_data);
+        // acUnpinCommData(device, &corner_data);
         acUnpinCommData(device, &edgex_data);
         acUnpinCommData(device, &edgey_data);
         acUnpinCommData(device, &edgez_data);
@@ -1834,7 +1833,7 @@ acGridIntegrate(const Stream stream, const AcReal dt)
         acUnpinCommData(device, &sideyz_data);
 #endif
 
-        acUnpackCommData(device, corner_b0s, &corner_data);
+        // acUnpackCommData(device, corner_b0s, &corner_data);
         acUnpackCommData(device, edgex_b0s, &edgex_data);
         acUnpackCommData(device, edgey_b0s, &edgey_data);
         acUnpackCommData(device, edgez_b0s, &edgez_data);
@@ -1844,7 +1843,7 @@ acGridIntegrate(const Stream stream, const AcReal dt)
         //////////// OUTER INTEGRATION //////////////
 
         // Wait for unpacking
-        acSyncCommData(corner_data);
+        // acSyncCommData(corner_data);
         acSyncCommData(edgex_data);
         acSyncCommData(edgey_data);
         acSyncCommData(edgez_data);

From afe5b973ca79632233a46ddff80bd2f616782b9f Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Wed, 27 May 2020 19:08:39 +0300
Subject: [PATCH 22/89] Added multiplication operator for int3

---
 src/common/math_utils.h | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/src/common/math_utils.h b/src/common/math_utils.h
index b121679..68b270f 100644
--- a/src/common/math_utils.h
+++ b/src/common/math_utils.h
@@ -105,7 +105,8 @@ operator+(const int3& a, const int3& b)
     return (int3){a.x + b.x, a.y + b.y, a.z + b.z};
 }
 
-static HOST_DEVICE_INLINE int3 operator*(const int3& a, const int3& b)
+static HOST_DEVICE_INLINE int3
+operator*(const int3& a, const int3& b)
 {
     return (int3){a.x * b.x, a.y * b.y, a.z * b.z};
 }
@@ -144,12 +145,20 @@ operator-=(AcReal3& lhs, const AcReal3& rhs)
     lhs.z -= rhs.z;
 }
 
-static HOST_DEVICE_INLINE AcReal3 operator*(const AcReal& a, const AcReal3& b)
+static HOST_DEVICE_INLINE int3
+operator*(const int& a, const int3& b)
+{
+    return (int3){a * b.x, a * b.y, a * b.z};
+}
+
+static HOST_DEVICE_INLINE AcReal3
+operator*(const AcReal& a, const AcReal3& b)
 {
     return (AcReal3){a * b.x, a * b.y, a * b.z};
 }
 
-static HOST_DEVICE_INLINE AcReal3 operator*(const AcReal3& b, const AcReal& a)
+static HOST_DEVICE_INLINE AcReal3
+operator*(const AcReal3& b, const AcReal& a)
 {
     return (AcReal3){a * b.x, a * b.y, a * b.z};
 }

From f97005a75d5274b82ce4bd5d930c4e7683401892 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Wed, 27 May 2020 19:09:32 +0300
Subject: [PATCH 23/89] Added WIP version of the new bidirectional comm scheme

---
 src/core/device.cc | 137 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 137 insertions(+)

diff --git a/src/core/device.cc b/src/core/device.cc
index c5105e7..e017611 100644
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -1166,6 +1166,130 @@ acTransferCommDataWait(const CommData data)
     // NOP
 }
 
+#elif AC_MPI_BIDIRECTIONAL_SCHEME
+
+static int3
+mod(const int3 a, const int3 n)
+{
+    return (int3){mod(a.x, n.x), mod(a.y, n.y), mod(a.z, n.z)};
+}
+
+static AcResult
+acTransferCommData(const Device device, //
+                   const int3* a0s,     // Src idx inside comp. domain
+                   const int3* b0s,     // Dst idx inside bound zone
+                   CommData* data)
+{
+    cudaSetDevice(device->id);
+
+    MPI_Datatype datatype = MPI_FLOAT;
+    if (sizeof(AcReal) == 8)
+        datatype = MPI_DOUBLE;
+
+    int nprocs, pid;
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
+    const uint3_64 decomp = decompose(nprocs);
+
+    const int3 nn = (int3){
+        device->local_config.int_params[AC_nx],
+        device->local_config.int_params[AC_ny],
+        device->local_config.int_params[AC_nz],
+    };
+
+    const int3 mm = (int3){
+        device->local_config.int_params[AC_mx],
+        device->local_config.int_params[AC_my],
+        device->local_config.int_params[AC_mz],
+    };
+
+    const int3 dims         = data->dims;
+    const size_t num_blocks = data->count;
+
+    cudaDeviceSynchronize(); // TODO debug REMOVE
+    for (size_t b0_idx = 0; b0_idx < num_blocks; ++b0_idx) {
+        const int3 b0     = b0s[b0_idx];
+        const int3 nghost = (int3){NGHOST, NGHOST, NGHOST};
+        const int3 a0     = mod(((b0 - nghost) + nn), nn) + nghost;
+
+        size_t a0_idx = -1;
+        for (size_t i = 0; i < num_blocks; ++i) {
+            if (a0s[i].x == a0.x && a0s[i].y == a0.y && a0s[i].z == a0.z) {
+                a0_idx = i;
+                break;
+            }
+        }
+        ERRCHK_ALWAYS(a0_idx < num_blocks); // TODO debug REMOVE
+
+        const int3 neighbor = (int3){
+            a0.x < b0.x ? -1 : a0.x > b0.x ? 1 : 0,
+            a0.y < b0.y ? -1 : a0.y > b0.y ? 1 : 0,
+            a0.z < b0.z ? -1 : a0.z > b0.z ? 1 : 0,
+        };
+
+        const int3 b1 = (int3){
+            neighbor.x < 0 ? a0.x - nghost.x : neighbor.x > 0 ? a0.x + nghost.x : a0.x,
+            neighbor.y < 0 ? a0.y - nghost.y : neighbor.y > 0 ? a0.y + nghost.y : a0.y,
+            neighbor.z < 0 ? a0.z - nghost.z : neighbor.z > 0 ? a0.z + nghost.z : a0.z,
+        };
+
+        size_t b1_idx = -1;
+        for (size_t i = 0; i < num_blocks; ++i) {
+            if (b0s[i].x == b1.x && b0s[i].y == b1.y && b0s[i].z == b1.z) {
+                b1_idx = i;
+                break;
+            }
+        }
+        ERRCHK_ALWAYS(b1_idx < num_blocks); // TODO debug REMOVE
+
+        const int3 pid3d   = getPid3D(pid, decomp);
+        const int npid     = getPid(pid3d + neighbor, decomp);
+        const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES;
+
+        PackedData* src = &data->srcs[a0_idx];
+        PackedData* dst = &data->dsts[b1_idx];
+
+        MPI_Irecv(dst->data, count, datatype, npid, b1_idx, MPI_COMM_WORLD,
+                  &data->recv_reqs[b1_idx]);
+        MPI_Isend(src->data, count, datatype, npid, b0_idx, MPI_COMM_WORLD,
+                  &data->send_reqs[b0_idx]);
+
+        /*
+        const int3 neighbor = (int3){
+            a0.x < b0.x ? a0.x - nghost.x : a0.x > b0.x ? a0.x + nghost.x : a0.x,
+            a0.y < b0.y ? a0.y - nghost.y : a0.y > b0.y ? a0.y + nghost.y : a0.y,
+            a0.z < b0.z ? a0.z - nghost.z : a0.z > b0.z ? a0.z + nghost.z : a0.z,
+        };*/
+
+        printf("a0 -> b0: (%d, %d, %d) -> (%d, %d, %d)\n", a0.x, a0.y, a0.z, b0.x, b0.y, b0.z);
+        printf("b1: (%d, %d, %d)\n", b1.x, b1.y, b1.z);
+        printf("neighbor: (%d, %d, %d)\n", neighbor.x, neighbor.y, neighbor.z);
+
+        /*
+        const int3 b1 = (int3){
+            a0.x < b0.x ? a0.x - nghost.x : a0.x > b0.x ? a0.x + nghost.x : a0.x,
+            a0.y < b0.y ? a0.y - nghost.y : a0.y > b0.y ? a0.y + nghost.y : a0.y,
+            a0.z < b0.z ? a0.z - nghost.z : a0.z > b0.z ? a0.z + nghost.z : a0.z,
+        };
+        const int3 a1 = mod(((b1 - nghost) + nn), nn) + nghost;
+
+        printf("b0, a0: (%d, %d, %d) -> (%d, %d, %d)\n", b0.x, b0.y, b0.z, a0.x, a0.y, a0.z);
+        printf("b1, a1: (%d, %d, %d) -> (%d, %d, %d)\n\n", b1.x, b1.y, b1.z, a1.x, a1.y, a1.z);
+        */
+    }
+
+    return AC_SUCCESS;
+}
+
+static void
+acTransferCommDataWait(const CommData data)
+{
+    for (size_t i = 0; i < data.count; ++i) {
+        MPI_Wait(&data.send_reqs[i], MPI_STATUS_IGNORE);
+        MPI_Wait(&data.recv_reqs[i], MPI_STATUS_IGNORE);
+    }
+}
+
 #elif AC_MPI_RT_PINNING
 static AcResult
 acTransferCommData(const Device device, //
@@ -1916,6 +2040,7 @@ acGridPeriodicBoundconds(const Stream stream)
         (int3){NGHOST, nn.y, nn.z},   //
         (int3){nn.x, nn.y, nn.z},
     };
+    /*
     const int3 corner_b0s[] = {
         (int3){0, 0, 0},
         (int3){NGHOST + nn.x, 0, 0},
@@ -1927,6 +2052,18 @@ acGridPeriodicBoundconds(const Stream stream)
         (int3){0, NGHOST + nn.y, NGHOST + nn.z},
         (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST + nn.z},
     };
+    */
+    const int3 corner_b0s[] = {
+        (int3){0, 0, 0},
+        (int3){NGHOST + nn.x, 0, 0},
+        (int3){0, NGHOST + nn.y, 0},
+        (int3){0, 0, NGHOST + nn.z},
+
+        (int3){NGHOST + nn.x, NGHOST + nn.y, 0},
+        (int3){NGHOST + nn.x, 0, NGHOST + nn.z},
+        (int3){0, NGHOST + nn.y, NGHOST + nn.z},
+        (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST + nn.z},
+    };
 
     // Edges X
     const int3 edgex_a0s[] = {

From 0d62f56e2771834e8bef9dd0e4d01d1dc39ef75e Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Thu, 28 May 2020 15:31:43 +0300
Subject: [PATCH 24/89] Tried an alternative approach to comm (was worse than
 the current solution) and rewrote the current best solution for (now easier
 to read)

---
 src/core/device.cc         | 333 ++++++++++++++++++++++++++++++++++++-
 src/core/kernels/kernels.h |   3 +
 2 files changed, 328 insertions(+), 8 deletions(-)

diff --git a/src/core/device.cc b/src/core/device.cc
index e017611..322fc6d 100644
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -1166,7 +1166,7 @@ acTransferCommDataWait(const CommData data)
     // NOP
 }
 
-#elif AC_MPI_BIDIRECTIONAL_SCHEME
+#elif AC_MPI_BIDIRECTIONAL_SCHEME_A
 
 static int3
 mod(const int3 a, const int3 n)
@@ -1206,8 +1206,7 @@ acTransferCommData(const Device device, //
     const int3 dims         = data->dims;
     const size_t num_blocks = data->count;
 
-    cudaDeviceSynchronize(); // TODO debug REMOVE
-    for (size_t b0_idx = 0; b0_idx < num_blocks; ++b0_idx) {
+    for (size_t b0_idx = 0; b0_idx < num_blocks / 2; ++b0_idx) {
         const int3 b0     = b0s[b0_idx];
         const int3 nghost = (int3){NGHOST, NGHOST, NGHOST};
         const int3 a0     = mod(((b0 - nghost) + nn), nn) + nghost;
@@ -1232,6 +1231,15 @@ acTransferCommData(const Device device, //
             neighbor.y < 0 ? a0.y - nghost.y : neighbor.y > 0 ? a0.y + nghost.y : a0.y,
             neighbor.z < 0 ? a0.z - nghost.z : neighbor.z > 0 ? a0.z + nghost.z : a0.z,
         };
+        const int3 a1 = mod(((b1 + nn - nghost) + nn), nn) + nghost;
+        size_t a1_idx = -1;
+        for (size_t i = 0; i < num_blocks; ++i) {
+            if (a0s[i].x == a1.x && a0s[i].y == a1.y && a0s[i].z == a1.z) {
+                a1_idx = i;
+                break;
+            }
+        }
+        ERRCHK_ALWAYS(a1_idx < num_blocks); // TODO debug REMOVE
 
         size_t b1_idx = -1;
         for (size_t i = 0; i < num_blocks; ++i) {
@@ -1242,17 +1250,66 @@ acTransferCommData(const Device device, //
         }
         ERRCHK_ALWAYS(b1_idx < num_blocks); // TODO debug REMOVE
 
+        const int3 pid3d        = getPid3D(pid, decomp);
+        const int3 npid3d_front = pid3d + neighbor;
+        const int3 npid3d_back  = pid3d - neighbor;
+        const int npid_front    = getPid(npid3d_front, decomp);
+        const int npid_back     = getPid(npid3d_back, decomp);
+        const size_t count      = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES;
+
+        PackedData* a0_data = &data->srcs[a0_idx];
+        PackedData* a1_data = &data->srcs[a1_idx];
+        PackedData* b0_data = &data->dsts[b0_idx];
+        PackedData* b1_data = &data->dsts[b1_idx];
+
+        // Deadlock!
+        /*
+        MPI_Sendrecv(a0_data, count, datatype, npid_front, b0_idx, //
+                     b1_data, count, datatype, npid_front, b1_idx, MPI_COMM_WORLD,
+                     MPI_STATUS_IGNORE);
+        MPI_Sendrecv(a1_data, count, datatype, npid_back, b1_idx, //
+                     b0_data, count, datatype, npid_back, b0_idx, MPI_COMM_WORLD,
+                     MPI_STATUS_IGNORE);
+                     */
+        cudaDeviceSynchronize();
+        MPI_Sendrecv(a0_data, count, datatype, npid_front, b0_idx, //
+                     b0_data, count, datatype, npid_back, b0_idx, MPI_COMM_WORLD,
+                     MPI_STATUS_IGNORE);
+        MPI_Sendrecv(a1_data, count, datatype, npid_back, b1_idx, //
+                     b1_data, count, datatype, npid_front, b1_idx, MPI_COMM_WORLD,
+                     MPI_STATUS_IGNORE);
+
+        /*
         const int3 pid3d   = getPid3D(pid, decomp);
-        const int npid     = getPid(pid3d + neighbor, decomp);
+        const int3 npid3d  = pid3d + neighbor;
+        const int npid     = getPid(npid3d, decomp);
         const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES;
 
         PackedData* src = &data->srcs[a0_idx];
         PackedData* dst = &data->dsts[b1_idx];
 
-        MPI_Irecv(dst->data, count, datatype, npid, b1_idx, MPI_COMM_WORLD,
-                  &data->recv_reqs[b1_idx]);
-        MPI_Isend(src->data, count, datatype, npid, b0_idx, MPI_COMM_WORLD,
-                  &data->send_reqs[b0_idx]);
+
+        if (onTheSameNode(pid, npid)) {
+            MPI_Irecv(dst->data, count, datatype, npid, b1_idx, MPI_COMM_WORLD,
+                      &data->recv_reqs[b1_idx]);
+            dst->pinned = false;
+            cudaStreamSynchronize(data->streams[a0_idx]);
+            MPI_Isend(src->data, count, datatype, npid, b0_idx, MPI_COMM_WORLD,
+                      &data->send_reqs[b0_idx]);
+        }
+        else {
+            MPI_Irecv(dst->data_pinned, count, datatype, npid, b1_idx, MPI_COMM_WORLD,
+                      &data->recv_reqs[b1_idx]);
+            dst->pinned = true;
+
+            if (!src->pinned) {
+                acPinPackedData(device, data->streams[a0_idx], src);
+                cudaStreamSynchronize(data->streams[a0_idx]);
+            }
+            MPI_Isend(src->data_pinned, count, datatype, npid, b0_idx, MPI_COMM_WORLD,
+                      &data->send_reqs[b0_idx]);
+        }
+        */
 
         /*
         const int3 neighbor = (int3){
@@ -1290,6 +1347,252 @@ acTransferCommDataWait(const CommData data)
     }
 }
 
+#elif AC_MPI_BIDIRECTIONAL_SCHEME_B
+static AcResult
+acTransferCommData(const Device device, //
+                   const int3* a0s,     // Src idx inside comp. domain
+                   const int3* b0s,     // Dst idx inside bound zone
+                   CommData* data)
+{
+    cudaSetDevice(device->id);
+
+    MPI_Datatype datatype = MPI_FLOAT;
+    if (sizeof(AcReal) == 8)
+        datatype = MPI_DOUBLE;
+
+    int nprocs, pid;
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
+    const uint3_64 decomp = decompose(nprocs);
+
+    const int3 nn = (int3){
+        device->local_config.int_params[AC_nx],
+        device->local_config.int_params[AC_ny],
+        device->local_config.int_params[AC_nz],
+    };
+
+    const int3 dims         = data->dims;
+    const size_t blockcount = data->count;
+
+    for (int k = -1; k <= 1; ++k) {
+        for (int j = -1; j <= 1; ++j) {
+            for (int i = -1; i <= 1; ++i) {
+                if (i == 0 && j == 0 && k == 0)
+                    continue;
+
+                for (size_t a_idx = 0; a_idx < blockcount; ++a_idx) {
+                    for (size_t b_idx = 0; b_idx < blockcount; ++b_idx) {
+                        const int3 neighbor = (int3){i, j, k};
+
+                        const int3 a0 = a0s[a_idx];
+                        // const int3 a1 = a0 + dims;
+
+                        const int3 b0 = a0 - neighbor * nn;
+                        // const int3 b1 = a1 - neighbor * nn;
+
+                        if (b0s[b_idx].x == b0.x && b0s[b_idx].y == b0.y && b0s[b_idx].z == b0.z) {
+
+                            const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES;
+
+#if MPI_GPUDIRECT_DISABLED
+                            PackedData* src    = &data->srcs_host[a_idx];
+                            PackedData* dst    = &data->dsts_host[b_idx];
+#else
+                            PackedData* src = &data->srcs[a_idx];
+                            PackedData* dst = &data->dsts[b_idx];
+#endif
+
+                            const int3 pid3d     = getPid3D(pid, decomp);
+                            const int npid_front = getPid(pid3d + neighbor, decomp);
+                            const int npid_back  = getPid(pid3d - neighbor, decomp);
+
+                            dst->pinned = false;
+
+                            if (onTheSameNode(pid, npid_back)) {
+                                MPI_Irecv(dst->data, count, datatype, npid_back, b_idx,
+                                          MPI_COMM_WORLD, &data->recv_reqs[b_idx]);
+                                dst->pinned = false;
+                                cudaStreamSynchronize(data->streams[a_idx]);
+                                MPI_Isend(src->data, count, datatype, npid_front, b_idx,
+                                          MPI_COMM_WORLD, &data->send_reqs[b_idx]);
+                            }
+                            else {
+                                MPI_Irecv(dst->data_pinned, count, datatype, npid_back, b_idx,
+                                          MPI_COMM_WORLD, &data->recv_reqs[b_idx]);
+                                dst->pinned = true;
+                                cudaStreamSynchronize(data->streams[a_idx]);
+                                if (!src->pinned) {
+                                    acPinPackedData(device, data->streams[a_idx], src);
+                                    cudaStreamSynchronize(data->streams[a_idx]);
+                                }
+                                MPI_Isend(src->data_pinned, count, datatype, npid_front, b_idx,
+                                          MPI_COMM_WORLD, &data->send_reqs[b_idx]);
+                            }
+                            /*
+                            cudaStreamSynchronize(data->streams[a_idx]);
+                            MPI_Status status;
+                            MPI_Sendrecv(src->data, count, datatype, npid_front, b_idx, //
+                                         dst->data, count, datatype, npid_back, b_idx,  //
+                                         MPI_COMM_WORLD, &status);
+                                         */
+
+                            /*
+                            const int npid_back = getPid(pid3d - neighbor, decomp);
+
+                            if (onTheSameNode(pid, npid_back)) {
+                                MPI_Irecv(dst->data, count, datatype, npid_back, b_idx,
+                                          MPI_COMM_WORLD, &data->recv_reqs[b_idx]);
+                                dst->pinned = false;
+                            }
+                            else {
+                                MPI_Irecv(dst->data_pinned, count, datatype, npid_back, b_idx,
+                                          MPI_COMM_WORLD, &data->recv_reqs[b_idx]);
+                                dst->pinned = true;
+                            }
+
+                            const int npid_front = getPid(pid3d + neighbor, decomp);
+
+                            cudaStreamSynchronize(data->streams[a_idx]);
+                            if (onTheSameNode(pid, npid_front)) {
+                                MPI_Isend(src->data, count, datatype, npid_front, b_idx,
+                                          MPI_COMM_WORLD, &data->send_reqs[b_idx]);
+                            }
+                            else {
+                                if (!src->pinned) {
+                                    acPinPackedData(device, data->streams[a_idx], src);
+                                    cudaStreamSynchronize(data->streams[a_idx]);
+                                }
+                                MPI_Isend(src->data_pinned, count, datatype, npid_front, b_idx,
+                                          MPI_COMM_WORLD, &data->send_reqs[b_idx]);
+                            }
+                            */
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    return AC_SUCCESS;
+}
+
+static void
+acTransferCommDataWait(const CommData data)
+{
+
+    for (size_t i = 0; i < data.count; ++i) {
+        MPI_Wait(&data.send_reqs[i], MPI_STATUS_IGNORE);
+        MPI_Wait(&data.recv_reqs[i], MPI_STATUS_IGNORE);
+    }
+}
+
+#elif AC_MPI_RT_PINNING_IMPROVED
+static int3
+mod(const int3 a, const int3 n)
+{
+    return (int3){mod(a.x, n.x), mod(a.y, n.y), mod(a.z, n.z)};
+}
+
+static AcResult
+acTransferCommData(const Device device, //
+                   const int3* a0s,     // Src idx inside comp. domain
+                   const int3* b0s,     // Dst idx inside bound zone
+                   CommData* data)
+{
+    cudaSetDevice(device->id);
+
+    MPI_Datatype datatype = MPI_FLOAT;
+    if (sizeof(AcReal) == 8)
+        datatype = MPI_DOUBLE;
+
+    int nprocs, pid;
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
+    const uint3_64 decomp = decompose(nprocs);
+
+    const int3 nn = (int3){
+        device->local_config.int_params[AC_nx],
+        device->local_config.int_params[AC_ny],
+        device->local_config.int_params[AC_nz],
+    };
+
+    const int3 pid3d        = getPid3D(pid, decomp);
+    const int3 dims         = data->dims;
+    const size_t blockcount = data->count;
+    const size_t count      = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES;
+    const int3 nghost       = (int3){NGHOST, NGHOST, NGHOST};
+
+    for (size_t b0_idx = 0; b0_idx < blockcount; ++b0_idx) {
+
+        const int3 b0       = b0s[b0_idx];
+        const int3 neighbor = (int3){
+            b0.x < NGHOST ? -1 : b0.x >= NGHOST + nn.x ? 1 : 0,
+            b0.y < NGHOST ? -1 : b0.y >= NGHOST + nn.y ? 1 : 0,
+            b0.z < NGHOST ? -1 : b0.z >= NGHOST + nn.z ? 1 : 0,
+        };
+        const int npid = getPid(pid3d + neighbor, decomp);
+
+        PackedData* dst = &data->dsts[b0_idx];
+        if (onTheSameNode(pid, npid)) {
+            MPI_Irecv(dst->data, count, datatype, npid, b0_idx, //
+                      MPI_COMM_WORLD, &data->recv_reqs[b0_idx]);
+            dst->pinned = false;
+        }
+        else {
+            MPI_Irecv(dst->data_pinned, count, datatype, npid, b0_idx, //
+                      MPI_COMM_WORLD, &data->recv_reqs[b0_idx]);
+            dst->pinned = true;
+        }
+    }
+
+    for (size_t b0_idx = 0; b0_idx < blockcount; ++b0_idx) {
+        const int3 b0       = b0s[b0_idx];
+        const int3 neighbor = (int3){
+            b0.x < NGHOST ? -1 : b0.x >= NGHOST + nn.x ? 1 : 0,
+            b0.y < NGHOST ? -1 : b0.y >= NGHOST + nn.y ? 1 : 0,
+            b0.z < NGHOST ? -1 : b0.z >= NGHOST + nn.z ? 1 : 0,
+        };
+        const int npid = getPid(pid3d - neighbor, decomp);
+
+        const int3 a0 = mod(b0 - nghost, nn) + nghost;
+
+        // Not needed if there's a 1-to-1 mapping from b -> a
+        size_t a0_idx = -1;
+        for (size_t i = 0; i < blockcount; ++i) {
+            if (a0s[i].x == a0.x && a0s[i].y == a0.y && a0s[i].z == a0.z) {
+                a0_idx = i;
+                break;
+            }
+        }
+        ERRCHK(a0_idx < blockcount);
+
+        PackedData* src = &data->srcs[a0_idx];
+        if (onTheSameNode(pid, npid)) {
+            cudaStreamSynchronize(data->streams[a0_idx]);
+            MPI_Isend(src->data, count, datatype, npid, b0_idx, //
+                      MPI_COMM_WORLD, &data->send_reqs[b0_idx]);
+        }
+        else {
+            acPinPackedData(device, data->streams[a0_idx], src);
+            cudaStreamSynchronize(data->streams[a0_idx]);
+            MPI_Isend(src->data_pinned, count, datatype, npid, b0_idx, MPI_COMM_WORLD,
+                      &data->send_reqs[b0_idx]);
+            src->pinned = true;
+        }
+    }
+
+    return AC_SUCCESS;
+}
+
+static void
+acTransferCommDataWait(const CommData data)
+{
+    for (size_t i = 0; i < data.count; ++i) {
+        MPI_Wait(&data.send_reqs[i], MPI_STATUS_IGNORE);
+        MPI_Wait(&data.recv_reqs[i], MPI_STATUS_IGNORE);
+    }
+}
+
 #elif AC_MPI_RT_PINNING
 static AcResult
 acTransferCommData(const Device device, //
@@ -1401,6 +1704,7 @@ acTransferCommData(const Device device, //
                                 if (!src->pinned) {
                                     acPinPackedData(device, data->streams[a_idx], src);
                                     cudaStreamSynchronize(data->streams[a_idx]);
+                                    src->pinned = true;
                                 }
                                 MPI_Isend(src->data_pinned, count, datatype, npid, b_idx,
                                           MPI_COMM_WORLD, &data->send_reqs[b_idx]);
@@ -1790,6 +2094,7 @@ acGridIntegrate(const Stream stream, const AcReal dt)
         (int3){NGHOST, nn.y, nn.z},   //
         (int3){nn.x, nn.y, nn.z},
     };
+    /*
     const int3 corner_b0s[] = {
         (int3){0, 0, 0},
         (int3){NGHOST + nn.x, 0, 0},
@@ -1801,6 +2106,18 @@ acGridIntegrate(const Stream stream, const AcReal dt)
         (int3){0, NGHOST + nn.y, NGHOST + nn.z},
         (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST + nn.z},
     };
+    */
+    const int3 corner_b0s[] = {
+        (int3){0, 0, 0},
+        (int3){NGHOST + nn.x, 0, 0},
+        (int3){0, NGHOST + nn.y, 0},
+        (int3){0, 0, NGHOST + nn.z},
+
+        (int3){NGHOST + nn.x, NGHOST + nn.y, 0},
+        (int3){NGHOST + nn.x, 0, NGHOST + nn.z},
+        (int3){0, NGHOST + nn.y, NGHOST + nn.z},
+        (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST + nn.z},
+    };
 
     // Edges X
     const int3 edgex_a0s[] = {
diff --git a/src/core/kernels/kernels.h b/src/core/kernels/kernels.h
index 805cbed..5f91008 100644
--- a/src/core/kernels/kernels.h
+++ b/src/core/kernels/kernels.h
@@ -6,6 +6,9 @@
 #include <stdbool.h>
 
 #define AC_MPI_UNIDIRECTIONAL_COMM (0)
+#define AC_MPI_BIDIRECTIONAL_SCHEME_A (0)
+#define AC_MPI_BIDIRECTIONAL_SCHEME_B (0)
+#define AC_MPI_RT_PINNING_IMPROVED (1)
 #define AC_MPI_RT_PINNING (1)
 #endif // AC_MPI_ENABLED
 

From f1138b04ac503816a996454ad42aed391b31d738 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Thu, 28 May 2020 16:42:50 +0300
Subject: [PATCH 25/89] Cleaned up the MPI implementation, removed all older
 implementations (removed also MPI window implementation which might be handy
 in the future when CUDA-aware support is introduced). If the removed stuff is
 needed later, here are some keywords to help find this commit: MPI_window,
 sendrecv, bidirectional, unidirectional transfer, real-time pinning, a0s,
 b0s.

---
 src/core/device.cc         | 1090 +++---------------------------------
 src/core/kernels/kernels.h |   12 +-
 2 files changed, 67 insertions(+), 1035 deletions(-)

diff --git a/src/core/device.cc b/src/core/device.cc
index 322fc6d..b7295d7 100644
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -574,17 +574,9 @@ acCreatePackedData(const int3 dims)
     const size_t bytes = dims.x * dims.y * dims.z * sizeof(data.data[0]) * NUM_VTXBUF_HANDLES;
     ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&data.data, bytes));
 
-#if AC_MPI_RT_PINNING
     ERRCHK_CUDA_ALWAYS(cudaMallocHost((void**)&data.data_pinned, bytes));
     // ERRCHK_CUDA_ALWAYS(cudaMallocManaged((void**)&data.data_pinned, bytes)); // Significantly
     // slower than pinned (38 ms vs. 125 ms)
-#endif // AC_MPI_RT_PINNING
-
-#if AC_MPI_UNIDIRECTIONAL_COMM
-    ERRCHK_ALWAYS(MPI_Win_create(data.data, bytes, sizeof(AcReal), MPI_INFO_NULL, MPI_COMM_WORLD,
-                                 &data.win) == MPI_SUCCESS);
-    MPI_Win_fence(0, data.win);
-#endif // AC_MPI_UNIDIRECTIONAL_COMM
 
     return data;
 }
@@ -592,13 +584,7 @@ acCreatePackedData(const int3 dims)
 static AcResult
 acDestroyPackedData(PackedData* data)
 {
-#if AC_MPI_RT_PINNING
     cudaFree(data->data_pinned);
-#endif // AC_MPI_RT_PINNING
-
-#if AC_MPI_UNIDIRECTIONAL_COMM
-    MPI_Win_free(&data->win);
-#endif // AC_MPI_UNIDIRECTIONAL_COMM
 
     data->dims = (int3){-1, -1, -1};
     cudaFree(data->data);
@@ -619,22 +605,12 @@ acCreatePackedDataHost(const int3 dims)
     data.data          = (AcReal*)malloc(bytes);
     ERRCHK_ALWAYS(data.data);
 
-#if AC_MPI_UNIDIRECTIONAL_COMM
-    ERRCHK_ALWAYS(MPI_Win_create(data.data, bytes, sizeof(AcReal), MPI_INFO_NULL, MPI_COMM_WORLD,
-                                 &data.win) == MPI_SUCCESS);
-    MPI_Win_fence(0, data.win);
-#endif // AC_MPI_UNIDIRECTIONAL_COMM
-
     return data;
 }
 
 static AcResult
 acDestroyPackedDataHost(PackedData* data)
 {
-#if AC_MPI_UNIDIRECTIONAL_COMM
-    MPI_Win_free(&data->win);
-#endif // AC_MPI_UNIDIRECTIONAL_COMM
-
     data->dims = (int3){-1, -1, -1};
     free(data->data);
     data->data = NULL;
@@ -665,7 +641,6 @@ acTransferPackedDataToDevice(const Device device, const cudaStream_t stream, con
 }
 #endif // MPI_GPUDIRECT_DISABLED
 
-#if AC_MPI_RT_PINNING
 static void
 acPinPackedData(const Device device, const cudaStream_t stream, PackedData* ddata)
 {
@@ -692,7 +667,6 @@ acUnpinPackedData(const Device device, const cudaStream_t stream, PackedData* dd
                          NUM_VTXBUF_HANDLES;
     ERRCHK_CUDA(cudaMemcpyAsync(ddata->data, ddata->data_pinned, bytes, cudaMemcpyDefault, stream));
 }
-#endif // AC_MPI_RT_PINNING
 
 // TODO: do with packed data
 static AcResult
@@ -954,12 +928,28 @@ acSyncCommData(const CommData data)
         cudaStreamSynchronize(data.streams[i]);
 }
 
+static int3
+mod(const int3 a, const int3 n)
+{
+    return (int3){mod(a.x, n.x), mod(a.y, n.y), mod(a.z, n.z)};
+}
+
 static void
-acPackCommData(const Device device, const int3* a0s, CommData* data)
+acPackCommData(const Device device, const int3* b0s, CommData* data)
 {
     cudaSetDevice(device->id);
-    for (size_t i = 0; i < data->count; ++i)
-        acKernelPackData(data->streams[i], device->vba, a0s[i], data->srcs[i]);
+
+    const int3 nn = (int3){
+        device->local_config.int_params[AC_nx],
+        device->local_config.int_params[AC_ny],
+        device->local_config.int_params[AC_nz],
+    };
+    const int3 nghost = (int3){NGHOST, NGHOST, NGHOST};
+
+    for (size_t i = 0; i < data->count; ++i) {
+        const int3 a0 = mod(b0s[i] - nghost, nn) + nghost;
+        acKernelPackData(data->streams[i], device->vba, a0, data->srcs[i]);
+    }
 }
 
 static void
@@ -989,7 +979,6 @@ acTransferCommDataToDevice(const Device device, CommData* data)
 }
 #endif
 
-#if AC_MPI_RT_PINNING
 static inline void
 acPinCommData(const Device device, CommData* data)
 {
@@ -1011,491 +1000,9 @@ acUnpinCommData(const Device device, CommData* data)
     for (size_t i = 0; i < data->count; ++i)
         acUnpinPackedData(device, data->streams[i], &data->dsts[i]);
 }
-#endif
-
-#if AC_MPI_UNIDIRECTIONAL_COMM
-static AcResult
-acTransferCommData(const Device device, //
-                   const int3* a0s,     // Src idx inside comp. domain
-                   const int3* b0s,     // Dst idx inside bound zone
-                   CommData* data)
-{
-    cudaSetDevice(device->id);
-
-    MPI_Datatype datatype = MPI_FLOAT;
-    if (sizeof(AcReal) == 8)
-        datatype = MPI_DOUBLE;
-
-    int nprocs, pid;
-    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
-    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
-    const uint3_64 decomp = decompose(nprocs);
-
-    const int3 nn = (int3){
-        device->local_config.int_params[AC_nx],
-        device->local_config.int_params[AC_ny],
-        device->local_config.int_params[AC_nz],
-    };
-
-    const int3 dims         = data->dims;
-    const size_t blockcount = data->count;
-
-    for (int k = -1; k <= 1; ++k) {
-        for (int j = -1; j <= 1; ++j) {
-            for (int i = -1; i <= 1; ++i) {
-                if (i == 0 && j == 0 && k == 0)
-                    continue;
-
-                for (size_t a_idx = 0; a_idx < blockcount; ++a_idx) {
-                    for (size_t b_idx = 0; b_idx < blockcount; ++b_idx) {
-                        const int3 neighbor = (int3){i, j, k};
-
-                        const int3 a0 = a0s[a_idx];
-                        // const int3 a1 = a0 + dims;
-
-                        const int3 b0 = a0 - neighbor * nn;
-                        // const int3 b1 = a1 - neighbor * nn;
-
-                        if (b0s[b_idx].x == b0.x && b0s[b_idx].y == b0.y && b0s[b_idx].z == b0.z) {
-#if MPI_GPUDIRECT_DISABLED
-                            MPI_Win_fence(0, data->srcs_host[a_idx].win);
-                            MPI_Win_fence(0, data->dsts_host[b_idx].win);
-#else
-                            MPI_Win_fence(0, data->srcs[a_idx].win);
-                            MPI_Win_fence(0, data->dsts[b_idx].win);
-#endif
-                        }
-                    }
-                }
-            }
-        }
-    }
-
-    for (int k = -1; k <= 1; ++k) {
-        for (int j = -1; j <= 1; ++j) {
-            for (int i = -1; i <= 1; ++i) {
-                if (i == 0 && j == 0 && k == 0)
-                    continue;
-
-                for (size_t a_idx = 0; a_idx < blockcount; ++a_idx) {
-                    for (size_t b_idx = 0; b_idx < blockcount; ++b_idx) {
-                        const int3 neighbor = (int3){i, j, k};
-
-                        const int3 a0 = a0s[a_idx];
-                        // const int3 a1 = a0 + dims;
-
-                        const int3 b0 = a0 - neighbor * nn;
-                        // const int3 b1 = a1 - neighbor * nn;
-
-                        if (b0s[b_idx].x == b0.x && b0s[b_idx].y == b0.y && b0s[b_idx].z == b0.z) {
-
-                            const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES;
-
-                            const int3 pid3d = getPid3D(pid, decomp);
-
-#if MPI_GPUDIRECT_DISABLED
-
-                            MPI_Put(data->srcs_host[a_idx].data, count, datatype,
-                                    getPid(pid3d - neighbor, decomp), 0, count, datatype,
-                                    data->dsts_host[b_idx].win);
-
-                            /*
-            MPI_Get(data->dsts_host[b_idx].data, count, datatype,
-                    getPid(pid3d - neighbor, decomp), 0, count, datatype,
-                    data->srcs_host[a_idx].win);
-                    */
-
-#else
-                            /*
-                                            MPI_Put(data->srcs[a_idx].data, count, datatype,
-                                                    getPid(pid3d - neighbor, decomp), 0, count,
-                               datatype, data->dsts[b_idx].win);
-                                            */
-
-                            MPI_Get(data->dsts[b_idx].data, count, datatype,
-                                    getPid(pid3d - neighbor, decomp), 0, count, datatype,
-                                    data->srcs[a_idx].win);
-                            ERROR("CUDA-aware MPI_Put/MPI_Get not yet supported with UCX "
-                                  "(2020-04-02)");
-#endif
-                        }
-                    }
-                }
-            }
-        }
-    }
-
-    for (int k = -1; k <= 1; ++k) {
-        for (int j = -1; j <= 1; ++j) {
-            for (int i = -1; i <= 1; ++i) {
-                if (i == 0 && j == 0 && k == 0)
-                    continue;
-
-                for (size_t a_idx = 0; a_idx < blockcount; ++a_idx) {
-                    for (size_t b_idx = 0; b_idx < blockcount; ++b_idx) {
-                        const int3 neighbor = (int3){i, j, k};
-
-                        const int3 a0 = a0s[a_idx];
-                        // const int3 a1 = a0 + dims;
-
-                        const int3 b0 = a0 - neighbor * nn;
-                        // const int3 b1 = a1 - neighbor * nn;
-
-                        if (b0s[b_idx].x == b0.x && b0s[b_idx].y == b0.y && b0s[b_idx].z == b0.z) {
-#if MPI_GPUDIRECT_DISABLED
-                            MPI_Win_fence(0, data->srcs_host[a_idx].win);
-                            MPI_Win_fence(0, data->dsts_host[b_idx].win);
-#else
-                            MPI_Win_fence(0, data->srcs[a_idx].win);
-                            MPI_Win_fence(0, data->dsts[b_idx].win);
-#endif
-                        }
-                    }
-                }
-            }
-        }
-    }
-
-    return AC_SUCCESS;
-}
-
-static void
-acTransferCommDataWait(const CommData data)
-{
-    (void)data;
-    // NOP
-}
-
-#elif AC_MPI_BIDIRECTIONAL_SCHEME_A
-
-static int3
-mod(const int3 a, const int3 n)
-{
-    return (int3){mod(a.x, n.x), mod(a.y, n.y), mod(a.z, n.z)};
-}
 
 static AcResult
 acTransferCommData(const Device device, //
-                   const int3* a0s,     // Src idx inside comp. domain
-                   const int3* b0s,     // Dst idx inside bound zone
-                   CommData* data)
-{
-    cudaSetDevice(device->id);
-
-    MPI_Datatype datatype = MPI_FLOAT;
-    if (sizeof(AcReal) == 8)
-        datatype = MPI_DOUBLE;
-
-    int nprocs, pid;
-    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
-    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
-    const uint3_64 decomp = decompose(nprocs);
-
-    const int3 nn = (int3){
-        device->local_config.int_params[AC_nx],
-        device->local_config.int_params[AC_ny],
-        device->local_config.int_params[AC_nz],
-    };
-
-    const int3 mm = (int3){
-        device->local_config.int_params[AC_mx],
-        device->local_config.int_params[AC_my],
-        device->local_config.int_params[AC_mz],
-    };
-
-    const int3 dims         = data->dims;
-    const size_t num_blocks = data->count;
-
-    for (size_t b0_idx = 0; b0_idx < num_blocks / 2; ++b0_idx) {
-        const int3 b0     = b0s[b0_idx];
-        const int3 nghost = (int3){NGHOST, NGHOST, NGHOST};
-        const int3 a0     = mod(((b0 - nghost) + nn), nn) + nghost;
-
-        size_t a0_idx = -1;
-        for (size_t i = 0; i < num_blocks; ++i) {
-            if (a0s[i].x == a0.x && a0s[i].y == a0.y && a0s[i].z == a0.z) {
-                a0_idx = i;
-                break;
-            }
-        }
-        ERRCHK_ALWAYS(a0_idx < num_blocks); // TODO debug REMOVE
-
-        const int3 neighbor = (int3){
-            a0.x < b0.x ? -1 : a0.x > b0.x ? 1 : 0,
-            a0.y < b0.y ? -1 : a0.y > b0.y ? 1 : 0,
-            a0.z < b0.z ? -1 : a0.z > b0.z ? 1 : 0,
-        };
-
-        const int3 b1 = (int3){
-            neighbor.x < 0 ? a0.x - nghost.x : neighbor.x > 0 ? a0.x + nghost.x : a0.x,
-            neighbor.y < 0 ? a0.y - nghost.y : neighbor.y > 0 ? a0.y + nghost.y : a0.y,
-            neighbor.z < 0 ? a0.z - nghost.z : neighbor.z > 0 ? a0.z + nghost.z : a0.z,
-        };
-        const int3 a1 = mod(((b1 + nn - nghost) + nn), nn) + nghost;
-        size_t a1_idx = -1;
-        for (size_t i = 0; i < num_blocks; ++i) {
-            if (a0s[i].x == a1.x && a0s[i].y == a1.y && a0s[i].z == a1.z) {
-                a1_idx = i;
-                break;
-            }
-        }
-        ERRCHK_ALWAYS(a1_idx < num_blocks); // TODO debug REMOVE
-
-        size_t b1_idx = -1;
-        for (size_t i = 0; i < num_blocks; ++i) {
-            if (b0s[i].x == b1.x && b0s[i].y == b1.y && b0s[i].z == b1.z) {
-                b1_idx = i;
-                break;
-            }
-        }
-        ERRCHK_ALWAYS(b1_idx < num_blocks); // TODO debug REMOVE
-
-        const int3 pid3d        = getPid3D(pid, decomp);
-        const int3 npid3d_front = pid3d + neighbor;
-        const int3 npid3d_back  = pid3d - neighbor;
-        const int npid_front    = getPid(npid3d_front, decomp);
-        const int npid_back     = getPid(npid3d_back, decomp);
-        const size_t count      = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES;
-
-        PackedData* a0_data = &data->srcs[a0_idx];
-        PackedData* a1_data = &data->srcs[a1_idx];
-        PackedData* b0_data = &data->dsts[b0_idx];
-        PackedData* b1_data = &data->dsts[b1_idx];
-
-        // Deadlock!
-        /*
-        MPI_Sendrecv(a0_data, count, datatype, npid_front, b0_idx, //
-                     b1_data, count, datatype, npid_front, b1_idx, MPI_COMM_WORLD,
-                     MPI_STATUS_IGNORE);
-        MPI_Sendrecv(a1_data, count, datatype, npid_back, b1_idx, //
-                     b0_data, count, datatype, npid_back, b0_idx, MPI_COMM_WORLD,
-                     MPI_STATUS_IGNORE);
-                     */
-        cudaDeviceSynchronize();
-        MPI_Sendrecv(a0_data, count, datatype, npid_front, b0_idx, //
-                     b0_data, count, datatype, npid_back, b0_idx, MPI_COMM_WORLD,
-                     MPI_STATUS_IGNORE);
-        MPI_Sendrecv(a1_data, count, datatype, npid_back, b1_idx, //
-                     b1_data, count, datatype, npid_front, b1_idx, MPI_COMM_WORLD,
-                     MPI_STATUS_IGNORE);
-
-        /*
-        const int3 pid3d   = getPid3D(pid, decomp);
-        const int3 npid3d  = pid3d + neighbor;
-        const int npid     = getPid(npid3d, decomp);
-        const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES;
-
-        PackedData* src = &data->srcs[a0_idx];
-        PackedData* dst = &data->dsts[b1_idx];
-
-
-        if (onTheSameNode(pid, npid)) {
-            MPI_Irecv(dst->data, count, datatype, npid, b1_idx, MPI_COMM_WORLD,
-                      &data->recv_reqs[b1_idx]);
-            dst->pinned = false;
-            cudaStreamSynchronize(data->streams[a0_idx]);
-            MPI_Isend(src->data, count, datatype, npid, b0_idx, MPI_COMM_WORLD,
-                      &data->send_reqs[b0_idx]);
-        }
-        else {
-            MPI_Irecv(dst->data_pinned, count, datatype, npid, b1_idx, MPI_COMM_WORLD,
-                      &data->recv_reqs[b1_idx]);
-            dst->pinned = true;
-
-            if (!src->pinned) {
-                acPinPackedData(device, data->streams[a0_idx], src);
-                cudaStreamSynchronize(data->streams[a0_idx]);
-            }
-            MPI_Isend(src->data_pinned, count, datatype, npid, b0_idx, MPI_COMM_WORLD,
-                      &data->send_reqs[b0_idx]);
-        }
-        */
-
-        /*
-        const int3 neighbor = (int3){
-            a0.x < b0.x ? a0.x - nghost.x : a0.x > b0.x ? a0.x + nghost.x : a0.x,
-            a0.y < b0.y ? a0.y - nghost.y : a0.y > b0.y ? a0.y + nghost.y : a0.y,
-            a0.z < b0.z ? a0.z - nghost.z : a0.z > b0.z ? a0.z + nghost.z : a0.z,
-        };*/
-
-        printf("a0 -> b0: (%d, %d, %d) -> (%d, %d, %d)\n", a0.x, a0.y, a0.z, b0.x, b0.y, b0.z);
-        printf("b1: (%d, %d, %d)\n", b1.x, b1.y, b1.z);
-        printf("neighbor: (%d, %d, %d)\n", neighbor.x, neighbor.y, neighbor.z);
-
-        /*
-        const int3 b1 = (int3){
-            a0.x < b0.x ? a0.x - nghost.x : a0.x > b0.x ? a0.x + nghost.x : a0.x,
-            a0.y < b0.y ? a0.y - nghost.y : a0.y > b0.y ? a0.y + nghost.y : a0.y,
-            a0.z < b0.z ? a0.z - nghost.z : a0.z > b0.z ? a0.z + nghost.z : a0.z,
-        };
-        const int3 a1 = mod(((b1 - nghost) + nn), nn) + nghost;
-
-        printf("b0, a0: (%d, %d, %d) -> (%d, %d, %d)\n", b0.x, b0.y, b0.z, a0.x, a0.y, a0.z);
-        printf("b1, a1: (%d, %d, %d) -> (%d, %d, %d)\n\n", b1.x, b1.y, b1.z, a1.x, a1.y, a1.z);
-        */
-    }
-
-    return AC_SUCCESS;
-}
-
-static void
-acTransferCommDataWait(const CommData data)
-{
-    for (size_t i = 0; i < data.count; ++i) {
-        MPI_Wait(&data.send_reqs[i], MPI_STATUS_IGNORE);
-        MPI_Wait(&data.recv_reqs[i], MPI_STATUS_IGNORE);
-    }
-}
-
-#elif AC_MPI_BIDIRECTIONAL_SCHEME_B
-static AcResult
-acTransferCommData(const Device device, //
-                   const int3* a0s,     // Src idx inside comp. domain
-                   const int3* b0s,     // Dst idx inside bound zone
-                   CommData* data)
-{
-    cudaSetDevice(device->id);
-
-    MPI_Datatype datatype = MPI_FLOAT;
-    if (sizeof(AcReal) == 8)
-        datatype = MPI_DOUBLE;
-
-    int nprocs, pid;
-    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
-    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
-    const uint3_64 decomp = decompose(nprocs);
-
-    const int3 nn = (int3){
-        device->local_config.int_params[AC_nx],
-        device->local_config.int_params[AC_ny],
-        device->local_config.int_params[AC_nz],
-    };
-
-    const int3 dims         = data->dims;
-    const size_t blockcount = data->count;
-
-    for (int k = -1; k <= 1; ++k) {
-        for (int j = -1; j <= 1; ++j) {
-            for (int i = -1; i <= 1; ++i) {
-                if (i == 0 && j == 0 && k == 0)
-                    continue;
-
-                for (size_t a_idx = 0; a_idx < blockcount; ++a_idx) {
-                    for (size_t b_idx = 0; b_idx < blockcount; ++b_idx) {
-                        const int3 neighbor = (int3){i, j, k};
-
-                        const int3 a0 = a0s[a_idx];
-                        // const int3 a1 = a0 + dims;
-
-                        const int3 b0 = a0 - neighbor * nn;
-                        // const int3 b1 = a1 - neighbor * nn;
-
-                        if (b0s[b_idx].x == b0.x && b0s[b_idx].y == b0.y && b0s[b_idx].z == b0.z) {
-
-                            const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES;
-
-#if MPI_GPUDIRECT_DISABLED
-                            PackedData* src    = &data->srcs_host[a_idx];
-                            PackedData* dst    = &data->dsts_host[b_idx];
-#else
-                            PackedData* src = &data->srcs[a_idx];
-                            PackedData* dst = &data->dsts[b_idx];
-#endif
-
-                            const int3 pid3d     = getPid3D(pid, decomp);
-                            const int npid_front = getPid(pid3d + neighbor, decomp);
-                            const int npid_back  = getPid(pid3d - neighbor, decomp);
-
-                            dst->pinned = false;
-
-                            if (onTheSameNode(pid, npid_back)) {
-                                MPI_Irecv(dst->data, count, datatype, npid_back, b_idx,
-                                          MPI_COMM_WORLD, &data->recv_reqs[b_idx]);
-                                dst->pinned = false;
-                                cudaStreamSynchronize(data->streams[a_idx]);
-                                MPI_Isend(src->data, count, datatype, npid_front, b_idx,
-                                          MPI_COMM_WORLD, &data->send_reqs[b_idx]);
-                            }
-                            else {
-                                MPI_Irecv(dst->data_pinned, count, datatype, npid_back, b_idx,
-                                          MPI_COMM_WORLD, &data->recv_reqs[b_idx]);
-                                dst->pinned = true;
-                                cudaStreamSynchronize(data->streams[a_idx]);
-                                if (!src->pinned) {
-                                    acPinPackedData(device, data->streams[a_idx], src);
-                                    cudaStreamSynchronize(data->streams[a_idx]);
-                                }
-                                MPI_Isend(src->data_pinned, count, datatype, npid_front, b_idx,
-                                          MPI_COMM_WORLD, &data->send_reqs[b_idx]);
-                            }
-                            /*
-                            cudaStreamSynchronize(data->streams[a_idx]);
-                            MPI_Status status;
-                            MPI_Sendrecv(src->data, count, datatype, npid_front, b_idx, //
-                                         dst->data, count, datatype, npid_back, b_idx,  //
-                                         MPI_COMM_WORLD, &status);
-                                         */
-
-                            /*
-                            const int npid_back = getPid(pid3d - neighbor, decomp);
-
-                            if (onTheSameNode(pid, npid_back)) {
-                                MPI_Irecv(dst->data, count, datatype, npid_back, b_idx,
-                                          MPI_COMM_WORLD, &data->recv_reqs[b_idx]);
-                                dst->pinned = false;
-                            }
-                            else {
-                                MPI_Irecv(dst->data_pinned, count, datatype, npid_back, b_idx,
-                                          MPI_COMM_WORLD, &data->recv_reqs[b_idx]);
-                                dst->pinned = true;
-                            }
-
-                            const int npid_front = getPid(pid3d + neighbor, decomp);
-
-                            cudaStreamSynchronize(data->streams[a_idx]);
-                            if (onTheSameNode(pid, npid_front)) {
-                                MPI_Isend(src->data, count, datatype, npid_front, b_idx,
-                                          MPI_COMM_WORLD, &data->send_reqs[b_idx]);
-                            }
-                            else {
-                                if (!src->pinned) {
-                                    acPinPackedData(device, data->streams[a_idx], src);
-                                    cudaStreamSynchronize(data->streams[a_idx]);
-                                }
-                                MPI_Isend(src->data_pinned, count, datatype, npid_front, b_idx,
-                                          MPI_COMM_WORLD, &data->send_reqs[b_idx]);
-                            }
-                            */
-                        }
-                    }
-                }
-            }
-        }
-    }
-
-    return AC_SUCCESS;
-}
-
-static void
-acTransferCommDataWait(const CommData data)
-{
-
-    for (size_t i = 0; i < data.count; ++i) {
-        MPI_Wait(&data.send_reqs[i], MPI_STATUS_IGNORE);
-        MPI_Wait(&data.recv_reqs[i], MPI_STATUS_IGNORE);
-    }
-}
-
-#elif AC_MPI_RT_PINNING_IMPROVED
-static int3
-mod(const int3 a, const int3 n)
-{
-    return (int3){mod(a.x, n.x), mod(a.y, n.y), mod(a.z, n.z)};
-}
-
-static AcResult
-acTransferCommData(const Device device, //
-                   const int3* a0s,     // Src idx inside comp. domain
                    const int3* b0s,     // Dst idx inside bound zone
                    CommData* data)
 {
@@ -1554,29 +1061,17 @@ acTransferCommData(const Device device, //
         };
         const int npid = getPid(pid3d - neighbor, decomp);
 
-        const int3 a0 = mod(b0 - nghost, nn) + nghost;
-
-        // Not needed if there's a 1-to-1 mapping from b -> a
-        size_t a0_idx = -1;
-        for (size_t i = 0; i < blockcount; ++i) {
-            if (a0s[i].x == a0.x && a0s[i].y == a0.y && a0s[i].z == a0.z) {
-                a0_idx = i;
-                break;
-            }
-        }
-        ERRCHK(a0_idx < blockcount);
-
-        PackedData* src = &data->srcs[a0_idx];
+        PackedData* src = &data->srcs[b0_idx];
         if (onTheSameNode(pid, npid)) {
-            cudaStreamSynchronize(data->streams[a0_idx]);
+            cudaStreamSynchronize(data->streams[b0_idx]);
             MPI_Isend(src->data, count, datatype, npid, b0_idx, //
                       MPI_COMM_WORLD, &data->send_reqs[b0_idx]);
         }
         else {
-            acPinPackedData(device, data->streams[a0_idx], src);
-            cudaStreamSynchronize(data->streams[a0_idx]);
-            MPI_Isend(src->data_pinned, count, datatype, npid, b0_idx, MPI_COMM_WORLD,
-                      &data->send_reqs[b0_idx]);
+            acPinPackedData(device, data->streams[b0_idx], src);
+            cudaStreamSynchronize(data->streams[b0_idx]);
+            MPI_Isend(src->data_pinned, count, datatype, npid, b0_idx, //
+                      MPI_COMM_WORLD, &data->send_reqs[b0_idx]);
             src->pinned = true;
         }
     }
@@ -1593,257 +1088,6 @@ acTransferCommDataWait(const CommData data)
     }
 }
 
-#elif AC_MPI_RT_PINNING
-static AcResult
-acTransferCommData(const Device device, //
-                   const int3* a0s,     // Src idx inside comp. domain
-                   const int3* b0s,     // Dst idx inside bound zone
-                   CommData* data)
-{
-    cudaSetDevice(device->id);
-
-    MPI_Datatype datatype = MPI_FLOAT;
-    if (sizeof(AcReal) == 8)
-        datatype = MPI_DOUBLE;
-
-    int nprocs, pid;
-    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
-    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
-    const uint3_64 decomp = decompose(nprocs);
-
-    const int3 nn = (int3){
-        device->local_config.int_params[AC_nx],
-        device->local_config.int_params[AC_ny],
-        device->local_config.int_params[AC_nz],
-    };
-
-    const int3 dims         = data->dims;
-    const size_t blockcount = data->count;
-
-    for (int k = -1; k <= 1; ++k) {
-        for (int j = -1; j <= 1; ++j) {
-            for (int i = -1; i <= 1; ++i) {
-                if (i == 0 && j == 0 && k == 0)
-                    continue;
-
-                for (size_t a_idx = 0; a_idx < blockcount; ++a_idx) {
-                    for (size_t b_idx = 0; b_idx < blockcount; ++b_idx) {
-                        const int3 neighbor = (int3){i, j, k};
-
-                        const int3 a0 = a0s[a_idx];
-                        // const int3 a1 = a0 + dims;
-
-                        const int3 b0 = a0 - neighbor * nn;
-                        // const int3 b1 = a1 - neighbor * nn;
-
-                        if (b0s[b_idx].x == b0.x && b0s[b_idx].y == b0.y && b0s[b_idx].z == b0.z) {
-
-                            const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES;
-
-#if MPI_GPUDIRECT_DISABLED
-                            PackedData* dst    = &data->dsts_host[b_idx];
-#else
-                            PackedData* dst = &data->dsts[b_idx];
-#endif
-
-                            const int3 pid3d = getPid3D(pid, decomp);
-                            const int npid   = getPid(pid3d - neighbor, decomp);
-
-                            if (onTheSameNode(pid, npid)) {
-                                MPI_Irecv(dst->data, count, datatype, npid, b_idx, MPI_COMM_WORLD,
-                                          &data->recv_reqs[b_idx]);
-                                dst->pinned = false;
-                            }
-                            else {
-                                MPI_Irecv(dst->data_pinned, count, datatype, npid, b_idx,
-                                          MPI_COMM_WORLD, &data->recv_reqs[b_idx]);
-                                dst->pinned = true;
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    }
-
-    for (int k = -1; k <= 1; ++k) {
-        for (int j = -1; j <= 1; ++j) {
-            for (int i = -1; i <= 1; ++i) {
-                if (i == 0 && j == 0 && k == 0)
-                    continue;
-
-                for (size_t a_idx = 0; a_idx < blockcount; ++a_idx) {
-                    for (size_t b_idx = 0; b_idx < blockcount; ++b_idx) {
-                        const int3 neighbor = (int3){i, j, k};
-
-                        const int3 a0 = a0s[a_idx];
-                        // const int3 a1 = a0 + dims;
-
-                        const int3 b0 = a0 - neighbor * nn;
-                        // const int3 b1 = a1 - neighbor * nn;
-
-                        if (b0s[b_idx].x == b0.x && b0s[b_idx].y == b0.y && b0s[b_idx].z == b0.z) {
-
-                            const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES;
-
-#if MPI_GPUDIRECT_DISABLED
-                            PackedData* src    = &data->srcs_host[a_idx];
-#else
-                            PackedData* src = &data->srcs[a_idx];
-#endif
-
-                            const int3 pid3d = getPid3D(pid, decomp);
-                            const int npid   = getPid(pid3d + neighbor, decomp);
-
-                            cudaStreamSynchronize(data->streams[a_idx]);
-                            if (onTheSameNode(pid, npid)) {
-                                MPI_Isend(src->data, count, datatype, npid, b_idx, MPI_COMM_WORLD,
-                                          &data->send_reqs[b_idx]);
-                            }
-                            else {
-                                if (!src->pinned) {
-                                    acPinPackedData(device, data->streams[a_idx], src);
-                                    cudaStreamSynchronize(data->streams[a_idx]);
-                                    src->pinned = true;
-                                }
-                                MPI_Isend(src->data_pinned, count, datatype, npid, b_idx,
-                                          MPI_COMM_WORLD, &data->send_reqs[b_idx]);
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    }
-
-    return AC_SUCCESS;
-}
-
-static void
-acTransferCommDataWait(const CommData data)
-{
-    for (size_t i = 0; i < data.count; ++i) {
-        MPI_Wait(&data.send_reqs[i], MPI_STATUS_IGNORE);
-        MPI_Wait(&data.recv_reqs[i], MPI_STATUS_IGNORE);
-    }
-}
-#else
-static AcResult
-acTransferCommData(const Device device, //
-                   const int3* a0s,     // Src idx inside comp. domain
-                   const int3* b0s,     // Dst idx inside bound zone
-                   CommData* data)
-{
-    cudaSetDevice(device->id);
-
-    MPI_Datatype datatype = MPI_FLOAT;
-    if (sizeof(AcReal) == 8)
-        datatype = MPI_DOUBLE;
-
-    int nprocs, pid;
-    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
-    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
-    const uint3_64 decomp = decompose(nprocs);
-
-    const int3 nn = (int3){
-        device->local_config.int_params[AC_nx],
-        device->local_config.int_params[AC_ny],
-        device->local_config.int_params[AC_nz],
-    };
-
-    const int3 dims         = data->dims;
-    const size_t blockcount = data->count;
-
-    for (int k = -1; k <= 1; ++k) {
-        for (int j = -1; j <= 1; ++j) {
-            for (int i = -1; i <= 1; ++i) {
-                if (i == 0 && j == 0 && k == 0)
-                    continue;
-
-                for (size_t a_idx = 0; a_idx < blockcount; ++a_idx) {
-                    for (size_t b_idx = 0; b_idx < blockcount; ++b_idx) {
-                        const int3 neighbor = (int3){i, j, k};
-
-                        const int3 a0 = a0s[a_idx];
-                        // const int3 a1 = a0 + dims;
-
-                        const int3 b0 = a0 - neighbor * nn;
-                        // const int3 b1 = a1 - neighbor * nn;
-
-                        if (b0s[b_idx].x == b0.x && b0s[b_idx].y == b0.y && b0s[b_idx].z == b0.z) {
-
-                            const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES;
-
-#if MPI_GPUDIRECT_DISABLED
-                            PackedData* dst    = &data->dsts_host[b_idx];
-#else
-                            PackedData* dst = &data->dsts[b_idx];
-#endif
-
-                            const int3 pid3d = getPid3D(pid, decomp);
-                            const int npid   = getPid(pid3d - neighbor, decomp);
-
-                            MPI_Irecv(dst->data, count, datatype, npid, b_idx, MPI_COMM_WORLD,
-                                      &data->recv_reqs[b_idx]);
-                        }
-                    }
-                }
-            }
-        }
-    }
-
-    for (int k = -1; k <= 1; ++k) {
-        for (int j = -1; j <= 1; ++j) {
-            for (int i = -1; i <= 1; ++i) {
-                if (i == 0 && j == 0 && k == 0)
-                    continue;
-
-                for (size_t a_idx = 0; a_idx < blockcount; ++a_idx) {
-                    for (size_t b_idx = 0; b_idx < blockcount; ++b_idx) {
-                        const int3 neighbor = (int3){i, j, k};
-
-                        const int3 a0 = a0s[a_idx];
-                        // const int3 a1 = a0 + dims;
-
-                        const int3 b0 = a0 - neighbor * nn;
-                        // const int3 b1 = a1 - neighbor * nn;
-
-                        if (b0s[b_idx].x == b0.x && b0s[b_idx].y == b0.y && b0s[b_idx].z == b0.z) {
-
-                            const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES;
-
-#if MPI_GPUDIRECT_DISABLED
-                            PackedData* src    = &data->srcs_host[a_idx];
-#else
-                            PackedData* src = &data->srcs[a_idx];
-#endif
-
-                            const int3 pid3d = getPid3D(pid, decomp);
-                            const int npid   = getPid(pid3d + neighbor, decomp);
-
-                            cudaStreamSynchronize(data->streams[a_idx]);
-                            MPI_Isend(src->data, count, datatype, npid, b_idx, MPI_COMM_WORLD,
-                                      &data->send_reqs[b_idx]);
-                        }
-                    }
-                }
-            }
-        }
-    }
-
-    return AC_SUCCESS;
-}
-
-static void
-acTransferCommDataWait(const CommData data)
-{
-    for (size_t i = 0; i < data.count; ++i) {
-        MPI_Wait(&data.send_reqs[i], MPI_STATUS_IGNORE);
-        MPI_Wait(&data.recv_reqs[i], MPI_STATUS_IGNORE);
-    }
-}
-#endif // AC_MPI_UNIDIRECTIONAL_COMM
-
 typedef struct {
     Device device;
     AcMesh submesh;
@@ -1937,79 +1181,19 @@ acGridInit(const AcMeshInfo info)
         device->local_config.int_params[AC_nz],
     };
 
-    // Corners
-    const int3 corner_a0s[] = {
-        (int3){NGHOST, NGHOST, NGHOST}, //
-        (int3){nn.x, NGHOST, NGHOST},   //
-        (int3){NGHOST, nn.y, NGHOST},   //
-        (int3){nn.x, nn.y, NGHOST},     //
-
-        (int3){NGHOST, NGHOST, nn.z}, //
-        (int3){nn.x, NGHOST, nn.z},   //
-        (int3){NGHOST, nn.y, nn.z},   //
-        (int3){nn.x, nn.y, nn.z},
-    };
-
-    // Edges X
-    const int3 edgex_a0s[] = {
-        (int3){NGHOST, NGHOST, NGHOST}, //
-        (int3){NGHOST, nn.y, NGHOST},   //
-
-        (int3){NGHOST, NGHOST, nn.z}, //
-        (int3){NGHOST, nn.y, nn.z},   //
-    };
-
-    // Edges Y
-    const int3 edgey_a0s[] = {
-        (int3){NGHOST, NGHOST, NGHOST}, //
-        (int3){nn.x, NGHOST, NGHOST},   //
-
-        (int3){NGHOST, NGHOST, nn.z}, //
-        (int3){nn.x, NGHOST, nn.z},   //
-    };
-
-    // Edges Z
-    const int3 edgez_a0s[] = {
-        (int3){NGHOST, NGHOST, NGHOST}, //
-        (int3){nn.x, NGHOST, NGHOST},   //
-
-        (int3){NGHOST, nn.y, NGHOST}, //
-        (int3){nn.x, nn.y, NGHOST},   //
-    };
-
-    // Sides XY
-    const int3 sidexy_a0s[] = {
-        (int3){NGHOST, NGHOST, NGHOST}, //
-        (int3){NGHOST, NGHOST, nn.z},   //
-    };
-
-    // Sides XZ
-    const int3 sidexz_a0s[] = {
-        (int3){NGHOST, NGHOST, NGHOST}, //
-        (int3){NGHOST, nn.y, NGHOST},   //
-    };
-
-    // Sides YZ
-    const int3 sideyz_a0s[] = {
-        (int3){NGHOST, NGHOST, NGHOST}, //
-        (int3){nn.x, NGHOST, NGHOST},   //
-    };
-
-    const int3 corner_dims = (int3){NGHOST, NGHOST, NGHOST};
-    const int3 edgex_dims  = (int3){nn.x, NGHOST, NGHOST};
-    const int3 edgey_dims  = (int3){NGHOST, nn.y, NGHOST};
-    const int3 edgez_dims  = (int3){NGHOST, NGHOST, nn.z};
-    const int3 sidexy_dims = (int3){nn.x, nn.y, NGHOST};
-    const int3 sidexz_dims = (int3){nn.x, NGHOST, nn.z};
-    const int3 sideyz_dims = (int3){NGHOST, nn.y, nn.z};
-    grid.nn                = nn;
-    grid.corner_data       = acCreateCommData(device, corner_dims, ARRAY_SIZE(corner_a0s));
-    grid.edgex_data        = acCreateCommData(device, edgex_dims, ARRAY_SIZE(edgex_a0s));
-    grid.edgey_data        = acCreateCommData(device, edgey_dims, ARRAY_SIZE(edgey_a0s));
-    grid.edgez_data        = acCreateCommData(device, edgez_dims, ARRAY_SIZE(edgez_a0s));
-    grid.sidexy_data       = acCreateCommData(device, sidexy_dims, ARRAY_SIZE(sidexy_a0s));
-    grid.sidexz_data       = acCreateCommData(device, sidexz_dims, ARRAY_SIZE(sidexz_a0s));
-    grid.sideyz_data       = acCreateCommData(device, sideyz_dims, ARRAY_SIZE(sideyz_a0s));
+    // Create CommData
+    // We have 8 corners, 12 edges, and 6 sides
+    //
+    // For simplicity's sake all data blocks inside a single CommData struct
+    // have the same dimensions.
+    grid.nn          = nn;
+    grid.corner_data = acCreateCommData(device, (int3){NGHOST, NGHOST, NGHOST}, 8);
+    grid.edgex_data  = acCreateCommData(device, (int3){nn.x, NGHOST, NGHOST}, 4);
+    grid.edgey_data  = acCreateCommData(device, (int3){NGHOST, nn.y, NGHOST}, 4);
+    grid.edgez_data  = acCreateCommData(device, (int3){NGHOST, NGHOST, nn.z}, 4);
+    grid.sidexy_data = acCreateCommData(device, (int3){nn.x, nn.y, NGHOST}, 2);
+    grid.sidexz_data = acCreateCommData(device, (int3){nn.x, NGHOST, nn.z}, 2);
+    grid.sideyz_data = acCreateCommData(device, (int3){NGHOST, nn.y, nn.z}, 2);
 
     acGridSynchronizeStream(STREAM_ALL);
     return AC_SUCCESS;
@@ -2083,30 +1267,6 @@ acGridIntegrate(const Stream stream, const AcReal dt)
     acDeviceSynchronizeStream(device, stream);
 
     // Corners
-    const int3 corner_a0s[] = {
-        (int3){NGHOST, NGHOST, NGHOST}, //
-        (int3){nn.x, NGHOST, NGHOST},   //
-        (int3){NGHOST, nn.y, NGHOST},   //
-        (int3){nn.x, nn.y, NGHOST},     //
-
-        (int3){NGHOST, NGHOST, nn.z}, //
-        (int3){nn.x, NGHOST, nn.z},   //
-        (int3){NGHOST, nn.y, nn.z},   //
-        (int3){nn.x, nn.y, nn.z},
-    };
-    /*
-    const int3 corner_b0s[] = {
-        (int3){0, 0, 0},
-        (int3){NGHOST + nn.x, 0, 0},
-        (int3){0, NGHOST + nn.y, 0},
-        (int3){NGHOST + nn.x, NGHOST + nn.y, 0},
-
-        (int3){0, 0, NGHOST + nn.z},
-        (int3){NGHOST + nn.x, 0, NGHOST + nn.z},
-        (int3){0, NGHOST + nn.y, NGHOST + nn.z},
-        (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST + nn.z},
-    };
-    */
     const int3 corner_b0s[] = {
         (int3){0, 0, 0},
         (int3){NGHOST + nn.x, 0, 0},
@@ -2120,13 +1280,6 @@ acGridIntegrate(const Stream stream, const AcReal dt)
     };
 
     // Edges X
-    const int3 edgex_a0s[] = {
-        (int3){NGHOST, NGHOST, NGHOST}, //
-        (int3){NGHOST, nn.y, NGHOST},   //
-
-        (int3){NGHOST, NGHOST, nn.z}, //
-        (int3){NGHOST, nn.y, nn.z},   //
-    };
     const int3 edgex_b0s[] = {
         (int3){NGHOST, 0, 0},
         (int3){NGHOST, NGHOST + nn.y, 0},
@@ -2136,13 +1289,6 @@ acGridIntegrate(const Stream stream, const AcReal dt)
     };
 
     // Edges Y
-    const int3 edgey_a0s[] = {
-        (int3){NGHOST, NGHOST, NGHOST}, //
-        (int3){nn.x, NGHOST, NGHOST},   //
-
-        (int3){NGHOST, NGHOST, nn.z}, //
-        (int3){nn.x, NGHOST, nn.z},   //
-    };
     const int3 edgey_b0s[] = {
         (int3){0, NGHOST, 0},
         (int3){NGHOST + nn.x, NGHOST, 0},
@@ -2152,13 +1298,6 @@ acGridIntegrate(const Stream stream, const AcReal dt)
     };
 
     // Edges Z
-    const int3 edgez_a0s[] = {
-        (int3){NGHOST, NGHOST, NGHOST}, //
-        (int3){nn.x, NGHOST, NGHOST},   //
-
-        (int3){NGHOST, nn.y, NGHOST}, //
-        (int3){nn.x, nn.y, NGHOST},   //
-    };
     const int3 edgez_b0s[] = {
         (int3){0, 0, NGHOST},
         (int3){NGHOST + nn.x, 0, NGHOST},
@@ -2168,55 +1307,31 @@ acGridIntegrate(const Stream stream, const AcReal dt)
     };
 
     // Sides XY
-    const int3 sidexy_a0s[] = {
-        (int3){NGHOST, NGHOST, NGHOST}, //
-        (int3){NGHOST, NGHOST, nn.z},   //
-    };
     const int3 sidexy_b0s[] = {
         (int3){NGHOST, NGHOST, 0},             //
         (int3){NGHOST, NGHOST, NGHOST + nn.z}, //
     };
 
     // Sides XZ
-    const int3 sidexz_a0s[] = {
-        (int3){NGHOST, NGHOST, NGHOST}, //
-        (int3){NGHOST, nn.y, NGHOST},   //
-    };
     const int3 sidexz_b0s[] = {
         (int3){NGHOST, 0, NGHOST},             //
         (int3){NGHOST, NGHOST + nn.y, NGHOST}, //
     };
 
     // Sides YZ
-    const int3 sideyz_a0s[] = {
-        (int3){NGHOST, NGHOST, NGHOST}, //
-        (int3){nn.x, NGHOST, NGHOST},   //
-    };
     const int3 sideyz_b0s[] = {
         (int3){0, NGHOST, NGHOST},             //
         (int3){NGHOST + nn.x, NGHOST, NGHOST}, //
     };
 
     for (int isubstep = 0; isubstep < 3; ++isubstep) {
-        // acPackCommData(device, corner_a0s, &corner_data);
-        acPackCommData(device, edgex_a0s, &edgex_data);
-        acPackCommData(device, edgey_a0s, &edgey_data);
-        acPackCommData(device, edgez_a0s, &edgez_data);
-        acPackCommData(device, sidexy_a0s, &sidexy_data);
-        acPackCommData(device, sidexz_a0s, &sidexz_data);
-        acPackCommData(device, sideyz_a0s, &sideyz_data);
-
-        /*
-        #if AC_MPI_RT_PINNING
-                acPinCommData(device, &corner_data);
-                acPinCommData(device, &edgex_data);
-                acPinCommData(device, &edgey_data);
-                acPinCommData(device, &edgez_data);
-                acPinCommData(device, &sidexy_data);
-                acPinCommData(device, &sidexz_data);
-                acPinCommData(device, &sideyz_data);
-        #endif
-        */
+        // acPackCommData(device, corner_b0s, &corner_data);
+        acPackCommData(device, edgex_b0s, &edgex_data);
+        acPackCommData(device, edgey_b0s, &edgey_data);
+        acPackCommData(device, edgez_b0s, &edgez_data);
+        acPackCommData(device, sidexy_b0s, &sidexy_data);
+        acPackCommData(device, sidexz_b0s, &sidexz_data);
+        acPackCommData(device, sideyz_b0s, &sideyz_data);
 
         //////////// INNER INTEGRATION //////////////
         {
@@ -2238,13 +1353,13 @@ acGridIntegrate(const Stream stream, const AcReal dt)
         acTransferCommDataToHost(device, &sideyz_data);
 #endif
 
-        // acTransferCommData(device, corner_a0s, corner_b0s, &corner_data);
-        acTransferCommData(device, edgex_a0s, edgex_b0s, &edgex_data);
-        acTransferCommData(device, edgey_a0s, edgey_b0s, &edgey_data);
-        acTransferCommData(device, edgez_a0s, edgez_b0s, &edgez_data);
-        acTransferCommData(device, sidexy_a0s, sidexy_b0s, &sidexy_data);
-        acTransferCommData(device, sidexz_a0s, sidexz_b0s, &sidexz_data);
-        acTransferCommData(device, sideyz_a0s, sideyz_b0s, &sideyz_data);
+        // acTransferCommData(device, corner_b0s, &corner_data);
+        acTransferCommData(device, edgex_b0s, &edgex_data);
+        acTransferCommData(device, edgey_b0s, &edgey_data);
+        acTransferCommData(device, edgez_b0s, &edgez_data);
+        acTransferCommData(device, sidexy_b0s, &sidexy_data);
+        acTransferCommData(device, sidexz_b0s, &sidexz_data);
+        acTransferCommData(device, sideyz_b0s, &sideyz_data);
 
         // acTransferCommDataWait(corner_data);
         acTransferCommDataWait(edgex_data);
@@ -2264,7 +1379,6 @@ acGridIntegrate(const Stream stream, const AcReal dt)
         acTransferCommDataToDevice(device, &sideyz_data);
 #endif
 
-#if AC_MPI_RT_PINNING
         // acUnpinCommData(device, &corner_data);
         acUnpinCommData(device, &edgex_data);
         acUnpinCommData(device, &edgey_data);
@@ -2272,7 +1386,6 @@ acGridIntegrate(const Stream stream, const AcReal dt)
         acUnpinCommData(device, &sidexy_data);
         acUnpinCommData(device, &sidexz_data);
         acUnpinCommData(device, &sideyz_data);
-#endif
 
         // acUnpackCommData(device, corner_b0s, &corner_data);
         acUnpackCommData(device, edgex_b0s, &edgex_data);
@@ -2346,30 +1459,6 @@ acGridPeriodicBoundconds(const Stream stream)
     CommData sideyz_data = grid.sideyz_data;
 
     // Corners
-    const int3 corner_a0s[] = {
-        (int3){NGHOST, NGHOST, NGHOST}, //
-        (int3){nn.x, NGHOST, NGHOST},   //
-        (int3){NGHOST, nn.y, NGHOST},   //
-        (int3){nn.x, nn.y, NGHOST},     //
-
-        (int3){NGHOST, NGHOST, nn.z}, //
-        (int3){nn.x, NGHOST, nn.z},   //
-        (int3){NGHOST, nn.y, nn.z},   //
-        (int3){nn.x, nn.y, nn.z},
-    };
-    /*
-    const int3 corner_b0s[] = {
-        (int3){0, 0, 0},
-        (int3){NGHOST + nn.x, 0, 0},
-        (int3){0, NGHOST + nn.y, 0},
-        (int3){NGHOST + nn.x, NGHOST + nn.y, 0},
-
-        (int3){0, 0, NGHOST + nn.z},
-        (int3){NGHOST + nn.x, 0, NGHOST + nn.z},
-        (int3){0, NGHOST + nn.y, NGHOST + nn.z},
-        (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST + nn.z},
-    };
-    */
     const int3 corner_b0s[] = {
         (int3){0, 0, 0},
         (int3){NGHOST + nn.x, 0, 0},
@@ -2383,13 +1472,6 @@ acGridPeriodicBoundconds(const Stream stream)
     };
 
     // Edges X
-    const int3 edgex_a0s[] = {
-        (int3){NGHOST, NGHOST, NGHOST}, //
-        (int3){NGHOST, nn.y, NGHOST},   //
-
-        (int3){NGHOST, NGHOST, nn.z}, //
-        (int3){NGHOST, nn.y, nn.z},   //
-    };
     const int3 edgex_b0s[] = {
         (int3){NGHOST, 0, 0},
         (int3){NGHOST, NGHOST + nn.y, 0},
@@ -2399,13 +1481,6 @@ acGridPeriodicBoundconds(const Stream stream)
     };
 
     // Edges Y
-    const int3 edgey_a0s[] = {
-        (int3){NGHOST, NGHOST, NGHOST}, //
-        (int3){nn.x, NGHOST, NGHOST},   //
-
-        (int3){NGHOST, NGHOST, nn.z}, //
-        (int3){nn.x, NGHOST, nn.z},   //
-    };
     const int3 edgey_b0s[] = {
         (int3){0, NGHOST, 0},
         (int3){NGHOST + nn.x, NGHOST, 0},
@@ -2415,13 +1490,6 @@ acGridPeriodicBoundconds(const Stream stream)
     };
 
     // Edges Z
-    const int3 edgez_a0s[] = {
-        (int3){NGHOST, NGHOST, NGHOST}, //
-        (int3){nn.x, NGHOST, NGHOST},   //
-
-        (int3){NGHOST, nn.y, NGHOST}, //
-        (int3){nn.x, nn.y, NGHOST},   //
-    };
     const int3 edgez_b0s[] = {
         (int3){0, 0, NGHOST},
         (int3){NGHOST + nn.x, 0, NGHOST},
@@ -2431,54 +1499,30 @@ acGridPeriodicBoundconds(const Stream stream)
     };
 
     // Sides XY
-    const int3 sidexy_a0s[] = {
-        (int3){NGHOST, NGHOST, NGHOST}, //
-        (int3){NGHOST, NGHOST, nn.z},   //
-    };
     const int3 sidexy_b0s[] = {
         (int3){NGHOST, NGHOST, 0},             //
         (int3){NGHOST, NGHOST, NGHOST + nn.z}, //
     };
 
     // Sides XZ
-    const int3 sidexz_a0s[] = {
-        (int3){NGHOST, NGHOST, NGHOST}, //
-        (int3){NGHOST, nn.y, NGHOST},   //
-    };
     const int3 sidexz_b0s[] = {
         (int3){NGHOST, 0, NGHOST},             //
         (int3){NGHOST, NGHOST + nn.y, NGHOST}, //
     };
 
     // Sides YZ
-    const int3 sideyz_a0s[] = {
-        (int3){NGHOST, NGHOST, NGHOST}, //
-        (int3){nn.x, NGHOST, NGHOST},   //
-    };
     const int3 sideyz_b0s[] = {
         (int3){0, NGHOST, NGHOST},             //
         (int3){NGHOST + nn.x, NGHOST, NGHOST}, //
     };
 
-    acPackCommData(device, corner_a0s, &corner_data);
-    acPackCommData(device, edgex_a0s, &edgex_data);
-    acPackCommData(device, edgey_a0s, &edgey_data);
-    acPackCommData(device, edgez_a0s, &edgez_data);
-    acPackCommData(device, sidexy_a0s, &sidexy_data);
-    acPackCommData(device, sidexz_a0s, &sidexz_data);
-    acPackCommData(device, sideyz_a0s, &sideyz_data);
-
-    /*
-    #if AC_MPI_RT_PINNING
-        acPinCommData(device, &corner_data);
-        acPinCommData(device, &edgex_data);
-        acPinCommData(device, &edgey_data);
-        acPinCommData(device, &edgez_data);
-        acPinCommData(device, &sidexy_data);
-        acPinCommData(device, &sidexz_data);
-        acPinCommData(device, &sideyz_data);
-    #endif
-    */
+    acPackCommData(device, corner_b0s, &corner_data);
+    acPackCommData(device, edgex_b0s, &edgex_data);
+    acPackCommData(device, edgey_b0s, &edgey_data);
+    acPackCommData(device, edgez_b0s, &edgez_data);
+    acPackCommData(device, sidexy_b0s, &sidexy_data);
+    acPackCommData(device, sidexz_b0s, &sidexz_data);
+    acPackCommData(device, sideyz_b0s, &sideyz_data);
 
     MPI_Barrier(MPI_COMM_WORLD);
 
@@ -2492,13 +1536,13 @@ acGridPeriodicBoundconds(const Stream stream)
     acTransferCommDataToHost(device, &sideyz_data);
 #endif
 
-    acTransferCommData(device, corner_a0s, corner_b0s, &corner_data);
-    acTransferCommData(device, edgex_a0s, edgex_b0s, &edgex_data);
-    acTransferCommData(device, edgey_a0s, edgey_b0s, &edgey_data);
-    acTransferCommData(device, edgez_a0s, edgez_b0s, &edgez_data);
-    acTransferCommData(device, sidexy_a0s, sidexy_b0s, &sidexy_data);
-    acTransferCommData(device, sidexz_a0s, sidexz_b0s, &sidexz_data);
-    acTransferCommData(device, sideyz_a0s, sideyz_b0s, &sideyz_data);
+    acTransferCommData(device, corner_b0s, &corner_data);
+    acTransferCommData(device, edgex_b0s, &edgex_data);
+    acTransferCommData(device, edgey_b0s, &edgey_data);
+    acTransferCommData(device, edgez_b0s, &edgez_data);
+    acTransferCommData(device, sidexy_b0s, &sidexy_data);
+    acTransferCommData(device, sidexz_b0s, &sidexz_data);
+    acTransferCommData(device, sideyz_b0s, &sideyz_data);
 
     acTransferCommDataWait(corner_data);
     acTransferCommDataWait(edgex_data);
@@ -2518,7 +1562,6 @@ acGridPeriodicBoundconds(const Stream stream)
     acTransferCommDataToDevice(device, &sideyz_data);
 #endif
 
-#if AC_MPI_RT_PINNING
     acUnpinCommData(device, &corner_data);
     acUnpinCommData(device, &edgex_data);
     acUnpinCommData(device, &edgey_data);
@@ -2526,7 +1569,6 @@ acGridPeriodicBoundconds(const Stream stream)
     acUnpinCommData(device, &sidexy_data);
     acUnpinCommData(device, &sidexz_data);
     acUnpinCommData(device, &sideyz_data);
-#endif
 
     acUnpackCommData(device, corner_b0s, &corner_data);
     acUnpackCommData(device, edgex_b0s, &edgex_data);
diff --git a/src/core/kernels/kernels.h b/src/core/kernels/kernels.h
index 5f91008..577567b 100644
--- a/src/core/kernels/kernels.h
+++ b/src/core/kernels/kernels.h
@@ -5,25 +5,15 @@
 #include <mpi.h>
 #include <stdbool.h>
 
-#define AC_MPI_UNIDIRECTIONAL_COMM (0)
-#define AC_MPI_BIDIRECTIONAL_SCHEME_A (0)
-#define AC_MPI_BIDIRECTIONAL_SCHEME_B (0)
-#define AC_MPI_RT_PINNING_IMPROVED (1)
-#define AC_MPI_RT_PINNING (1)
+#define MPI_GPUDIRECT_DISABLED (0)
 #endif // AC_MPI_ENABLED
 
 typedef struct {
     int3 dims;
     AcReal* data;
 
-#if (AC_MPI_ENABLED && AC_MPI_RT_PINNING)
     AcReal* data_pinned;
     bool pinned = false; // Set if data was received to pinned memory
-#endif                   // (AC_MPI_ENABLED && AC_MPI_RT_PINNING)
-
-#if (AC_MPI_ENABLED && AC_MPI_UNIDIRECTIONAL_COMM)
-    MPI_Win win; // MPI window for RMA
-#endif           // (AC_MPI_ENABLED && AC_MPI_UNIDIRECTIONAL_COMM)
 } PackedData;
 
 typedef struct {

From 01ad141d90df3f4bfca70dba551cdbc2d7e83beb Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Thu, 28 May 2020 17:05:12 +0300
Subject: [PATCH 26/89] Added comments and a short overview of the MPI
 implementation

---
 src/core/device.cc | 34 ++++++++++++++++++++++++++++++++--
 1 file changed, 32 insertions(+), 2 deletions(-)

diff --git a/src/core/device.cc b/src/core/device.cc
index b7295d7..f63a0e8 100644
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -460,6 +460,37 @@ acDeviceReduceVec(const Device device, const Stream stream, const ReductionType
 }
 
 #if AC_MPI_ENABLED
+/**
+Quick overview of the MPI implementation:
+
+The halo is partitioned into segments. The first coordinate of a segment is b0.
+The array containing multiple b0s is called... "b0s".
+
+Each b0 maps to an index in computational domain of some neighboring process a0.
+We have a0 = mod(b0 - nghost, nn) + nghost.
+Intuitively, we
+  1) Transform b0 into coordinate system where (0, 0, 0) is the first index in
+     the comp domain.
+  2) Wrap the transformed b0 around nn (comp domain)
+  3) Transform b0 back to a coordinate system where (0, 0, 0) is the first index
+     in the ghost zone
+
+struct PackedData is used for packing and unpacking and holds the actual data in
+                  the halo partition
+struct CommData holds multiple PackedDatas for sending and receiving halo
+                partition
+struct Grid contains information about the GPU device, decomposition, the total
+            mesh dimensions and CommDatas
+
+
+Basic steps:
+  1) Distribute the mesh among ranks
+  2) Integrate & communicate
+    - start inner integration and at the same time, pack halo data and send it to neighbors
+    - once all halo data has been received, unpack and do outer integration
+    - sync and start again
+  3) Gather the mesh to rank 0 for postprocessing
+*/
 #include <mpi.h>
 
 #include <stdint.h>
@@ -1003,7 +1034,7 @@ acUnpinCommData(const Device device, CommData* data)
 
 static AcResult
 acTransferCommData(const Device device, //
-                   const int3* b0s,     // Dst idx inside bound zone
+                   const int3* b0s,     // Halo partition coordinates
                    CommData* data)
 {
     cudaSetDevice(device->id);
@@ -1072,7 +1103,6 @@ acTransferCommData(const Device device, //
             cudaStreamSynchronize(data->streams[b0_idx]);
             MPI_Isend(src->data_pinned, count, datatype, npid, b0_idx, //
                       MPI_COMM_WORLD, &data->send_reqs[b0_idx]);
-            src->pinned = true;
         }
     }
 

From 4748e48c7dfa2a2a6715685376a10b24275831ee Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Thu, 28 May 2020 17:10:17 +0300
Subject: [PATCH 27/89] Spelling fixes

---
 src/core/device.cc | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/core/device.cc b/src/core/device.cc
index f63a0e8..949fd69 100644
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -466,21 +466,21 @@ Quick overview of the MPI implementation:
 The halo is partitioned into segments. The first coordinate of a segment is b0.
 The array containing multiple b0s is called... "b0s".
 
-Each b0 maps to an index in computational domain of some neighboring process a0.
+Each b0 maps to an index in the computational domain of some neighboring process a0.
 We have a0 = mod(b0 - nghost, nn) + nghost.
 Intuitively, we
-  1) Transform b0 into coordinate system where (0, 0, 0) is the first index in
+  1) Transform b0 into a coordinate system where (0, 0, 0) is the first index in
      the comp domain.
   2) Wrap the transformed b0 around nn (comp domain)
   3) Transform b0 back to a coordinate system where (0, 0, 0) is the first index
      in the ghost zone
 
-struct PackedData is used for packing and unpacking and holds the actual data in
+struct PackedData is used for packing and unpacking. Holds the actual data in
                   the halo partition
 struct CommData holds multiple PackedDatas for sending and receiving halo
-                partition
-struct Grid contains information about the GPU device, decomposition, the total
-            mesh dimensions and CommDatas
+                partitions
+struct Grid contains information about the local GPU device, decomposition, the
+            total mesh dimensions and CommDatas
 
 
 Basic steps:

From 555bf8b25291bdc4cafa0833a4f8a6bd561ec99f Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Sat, 30 May 2020 19:06:21 +0300
Subject: [PATCH 28/89] Reverted the default settings to same as on master for
 easier merge

---
 CMakeLists.txt       | 10 +++++-----
 config/astaroth.conf | 14 +++++++-------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index be5d4d6..ac33bc5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -28,11 +28,11 @@ endif()
 message(STATUS "Build type: " ${CMAKE_BUILD_TYPE})
 
 ## Options
-option(DOUBLE_PRECISION "Generates double precision code."                    ON)
-option(BUILD_SAMPLES    "Builds projects in samples subdirectory."            ON)
-option(BUILD_STANDALONE "Builds standalone Astaroth."                         OFF)
-option(MPI_ENABLED      "Enables additional functions for MPI communciation." ON)
-option(MULTIGPU_ENABLED "Enables multi-GPU on a single node. Uses peer-to-peer communication instead of MPI. Affects Legacy & Node layers only." OFF)
+option(DOUBLE_PRECISION "Generates double precision code."                    OFF)
+option(BUILD_SAMPLES    "Builds projects in samples subdirectory."            OFF)
+option(BUILD_STANDALONE "Builds standalone Astaroth."                         ON)
+option(MPI_ENABLED      "Enables additional functions for MPI communciation." OFF)
+option(MULTIGPU_ENABLED "Enables multi-GPU on a single node. Uses peer-to-peer communication instead of MPI. Affects Legacy & Node layers only." ON)
 
 ## Options (DEPRECATED)
 # option(BUILD_DEBUG              "Builds the program with extensive error checking"          OFF)
diff --git a/config/astaroth.conf b/config/astaroth.conf
index 6c34cb8..abc1613 100644
--- a/config/astaroth.conf
+++ b/config/astaroth.conf
@@ -5,9 +5,9 @@
  * "Compile-time" params
  * =============================================================================
  */
-AC_nx = 512
-AC_ny = 512
-AC_nz = 512
+AC_nx = 128
+AC_ny = 128
+AC_nz = 128
 
 AC_dsx = 0.04908738521
 AC_dsy = 0.04908738521
@@ -24,11 +24,11 @@ AC_bin_steps = 1000
 AC_bin_save_t = 1e666
 
 // Set to 0 if you want to run the simulation from the beginning, or just a new
-// simulation. If continuing from a saved step, specify the step number here.
-AC_start_step = 0
+// simulation. If continuing from a saved step, specify the step number here.  
+AC_start_step = 0 
 
 // Maximum time in code units. If negative, there is no time limit
-AC_max_time = -1.0
+AC_max_time = -1.0 
 
 // Hydro
 AC_cdt = 0.4
@@ -49,7 +49,7 @@ AC_forcing_magnitude = 1e-5
 AC_kmin              = 0.8
 AC_kmax              = 1.2
 // Switches forcing off and accretion on
-AC_switch_accretion  = 0
+AC_switch_accretion  = 0 
 
 // Entropy
 AC_cp_sound = 1.0

From b719306266d04e814a91e152334ceb69561ac473 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Sat, 30 May 2020 19:36:32 +0300
Subject: [PATCH 29/89] Upped the required CMake version. This may be an issue
 on older machines. Instead of making the user to compile CMake themselves in
 this case, we could maybe add CMake as a submodule. In any case supporting
 older CMake versions is not really an option because CUDA support with those
 is so bad and requires adding dirty hacks to the clean cmakefiles we have
 now.

---
 CMakeLists.txt | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ac33bc5..59a371a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,5 +1,7 @@
 ## CMake settings
-cmake_minimum_required(VERSION 3.9) # Required for first-class CUDA support
+# V3.9 required for first-class CUDA support
+# V3.17 required for the FindCUDAToolkit package
+cmake_minimum_required(VERSION 3.17) 
 find_program(CMAKE_C_COMPILER NAMES $ENV{CC} gcc PATHS ENV PATH NO_DEFAULT_PATH)
 find_program(CMAKE_CXX_COMPILER NAMES $ENV{CXX} g++ PATHS ENV PATH NO_DEFAULT_PATH)
 

From c24996fdb3a10c1b02396f4e7a99c720e46d103c Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Sat, 30 May 2020 16:45:08 +0000
Subject: [PATCH 30/89] Added a the official Kitware PPA for pulling the latest
 CMake when doing automated builds.

---
 bitbucket-pipelines.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/bitbucket-pipelines.yml b/bitbucket-pipelines.yml
index 58e5b81..e551fd3 100644
--- a/bitbucket-pipelines.yml
+++ b/bitbucket-pipelines.yml
@@ -19,6 +19,7 @@ pipelines:
     - step:
         script: # Modify the commands below to build your repository.
           - mkdir -p build && cd build
+          - apt-add-repository 'deb https://apt.kitware.com/ubuntu/ bionic main'
           - apt-get update
           - apt-get install -y cmake flex bison openmpi-bin libopenmpi-dev
           - cmake -DDSL_MODULE_DIR="acc/mhd_solver" -DBUILD_STANDALONE=ON -DBUILD_UTILS=ON -DBUILD_RT_VISUALIZATION=OFF -DBUILD_SAMPLES=ON -DDOUBLE_PRECISION=OFF -DMULTIGPU_ENABLED=ON -DMPI_ENABLED=OFF .. # Single precision

From 95275df3f2d98737228a6db4b6025c6fe84eebce Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Sat, 30 May 2020 16:48:39 +0000
Subject: [PATCH 31/89] bitbucket-pipelines.yml edited online with Bitbucket

---
 bitbucket-pipelines.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/bitbucket-pipelines.yml b/bitbucket-pipelines.yml
index e551fd3..9fbf36c 100644
--- a/bitbucket-pipelines.yml
+++ b/bitbucket-pipelines.yml
@@ -19,6 +19,8 @@ pipelines:
     - step:
         script: # Modify the commands below to build your repository.
           - mkdir -p build && cd build
+          - apt-get update
+          - apt-get install -y apt-transport-https ca-certificates gnupg software-properties-common wget
           - apt-add-repository 'deb https://apt.kitware.com/ubuntu/ bionic main'
           - apt-get update
           - apt-get install -y cmake flex bison openmpi-bin libopenmpi-dev

From f929b21ac020184d4a7ba714bd7e0fd3de21502b Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Sat, 30 May 2020 16:52:26 +0000
Subject: [PATCH 32/89] bitbucket-pipelines.yml edited online with Bitbucket

---
 bitbucket-pipelines.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/bitbucket-pipelines.yml b/bitbucket-pipelines.yml
index 9fbf36c..2f6266a 100644
--- a/bitbucket-pipelines.yml
+++ b/bitbucket-pipelines.yml
@@ -21,6 +21,7 @@ pipelines:
           - mkdir -p build && cd build
           - apt-get update
           - apt-get install -y apt-transport-https ca-certificates gnupg software-properties-common wget
+          - wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | sudo tee /etc/apt/trusted.gpg.d/kitware.gpg >/dev/null
           - apt-add-repository 'deb https://apt.kitware.com/ubuntu/ bionic main'
           - apt-get update
           - apt-get install -y cmake flex bison openmpi-bin libopenmpi-dev

From 2ddeef22ac844842e938f0e44a2bd0462442020f Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Sat, 30 May 2020 16:58:45 +0000
Subject: [PATCH 33/89] bitbucket-pipelines.yml edited online with Bitbucket

---
 bitbucket-pipelines.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bitbucket-pipelines.yml b/bitbucket-pipelines.yml
index 2f6266a..e49d4f1 100644
--- a/bitbucket-pipelines.yml
+++ b/bitbucket-pipelines.yml
@@ -21,7 +21,7 @@ pipelines:
           - mkdir -p build && cd build
           - apt-get update
           - apt-get install -y apt-transport-https ca-certificates gnupg software-properties-common wget
-          - wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | sudo tee /etc/apt/trusted.gpg.d/kitware.gpg >/dev/null
+          - wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /etc/apt/trusted.gpg.d/kitware.gpg >/dev/null
           - apt-add-repository 'deb https://apt.kitware.com/ubuntu/ bionic main'
           - apt-get update
           - apt-get install -y cmake flex bison openmpi-bin libopenmpi-dev

From 176ceae31330c1e2c2bc36293f3231dc3c4f48db Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Sat, 30 May 2020 20:23:53 +0300
Subject: [PATCH 34/89] Fixed various compilation warnings

---
 samples/bwtest/main.c | 32 +++++++++++++++++---------------
 src/core/device.cc    | 30 ++++++++++++------------------
 2 files changed, 29 insertions(+), 33 deletions(-)

diff --git a/samples/bwtest/main.c b/samples/bwtest/main.c
index ada1721..73f4387 100644
--- a/samples/bwtest/main.c
+++ b/samples/bwtest/main.c
@@ -13,6 +13,8 @@
 //#define BLOCK_SIZE (100 * 1024 * 1024) // Bytes
 #define BLOCK_SIZE (256 * 256 * 3 * 8 * 8)
 
+#define errchk(x) { if (!(x)) { fprintf(stderr, "errchk(%s) failed", #x); assert(x); }}
+
 /*
   Findings:
     - MUST ALWAYS SET DEVICE. Absolutely kills performance if device is not set explicitly
@@ -27,7 +29,7 @@ static uint8_t*
 allocHost(const size_t bytes)
 {
     uint8_t* arr = malloc(bytes);
-    assert(arr);
+    errchk(arr);
     return arr;
 }
 
@@ -47,7 +49,7 @@ allocDevice(const size_t bytes)
     // const cudaError_t retval = cudaMallocManaged((void**)&arr, bytes, cudaMemAttachGlobal);
     // Pinned (40 GiB/s internode, 10 GiB/s intranode)
     // const cudaError_t retval = cudaMallocHost((void**)&arr, bytes);
-    assert(retval == cudaSuccess);
+    errchk(retval == cudaSuccess);
     return arr;
 }
 
@@ -61,7 +63,7 @@ allocDevicePinned(const size_t bytes)
     // const cudaError_t retval = cudaMallocManaged((void**)&arr, bytes, cudaMemAttachGlobal);
     // Pinned (40 GiB/s internode, 10 GiB/s intranode)
     const cudaError_t retval = cudaMallocHost((void**)&arr, bytes);
-    assert(retval == cudaSuccess);
+    errchk(retval == cudaSuccess);
     return arr;
 }
 
@@ -147,6 +149,7 @@ sendrecv_nonblocking_multiple(uint8_t* src, uint8_t* dst)
     }
 }
 
+/*
 static void
 sendrecv_nonblocking_multiple_parallel(uint8_t* src, uint8_t* dst)
 {
@@ -154,7 +157,7 @@ sendrecv_nonblocking_multiple_parallel(uint8_t* src, uint8_t* dst)
     MPI_Comm_rank(MPI_COMM_WORLD, &pid);
     MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
 
-    MPI_Request recv_requests[nprocs], send_requests[nprocs];
+    MPI_Request send_requests[nprocs];
     for (int i = 1; i < nprocs; ++i) {
         int nfront = (pid + i) % nprocs;
         MPI_Isend(src, BLOCK_SIZE, MPI_BYTE, nfront, nfront, MPI_COMM_WORLD, &send_requests[i]);
@@ -180,6 +183,7 @@ sendrecv_nonblocking_multiple_parallel(uint8_t* src, uint8_t* dst)
         MPI_Wait(&send_requests[i], &status);
     }
 }
+*/
 
 static void
 sendrecv_nonblocking_multiple_rt_pinning(uint8_t* src, uint8_t* dst)
@@ -198,8 +202,6 @@ sendrecv_nonblocking_multiple_rt_pinning(uint8_t* src, uint8_t* dst)
     int devices_per_node = -1;
     cudaGetDeviceCount(&devices_per_node);
 
-    const int node_id = pid / devices_per_node;
-
     MPI_Request recv_requests[nprocs], send_requests[nprocs];
     for (int i = 1; i < nprocs; ++i) {
         int nfront = (pid + i) % nprocs;
@@ -226,20 +228,20 @@ sendrecv_nonblocking_multiple_rt_pinning(uint8_t* src, uint8_t* dst)
 }
 
 static void
-send_d2h(const uint8_t* src, uint8_t* dst)
+send_d2h(uint8_t* src, uint8_t* dst)
 {
     cudaMemcpy(dst, src, BLOCK_SIZE, cudaMemcpyDeviceToHost);
 }
 
 static void
-send_h2d(const uint8_t* src, uint8_t* dst)
+send_h2d(uint8_t* src, uint8_t* dst)
 {
     cudaMemcpy(dst, src, BLOCK_SIZE, cudaMemcpyHostToDevice);
 }
 
 
 static void
-sendrecv_d2h2d(const uint8_t* dsrc, uint8_t* hdst, const uint8_t* hsrc, uint8_t* ddst)
+sendrecv_d2h2d(uint8_t* dsrc, uint8_t* hdst, uint8_t* hsrc, uint8_t* ddst)
 {
     cudaStream_t d2h, h2d;
     cudaStreamCreate(&d2h);
@@ -299,8 +301,8 @@ measurebw(const char* msg, const size_t bytes, void (*sendrecv)(uint8_t*, uint8_
 
 
 static void
-measurebw2(const char* msg, const size_t bytes, void (*sendrecv)(const uint8_t*, uint8_t*, const uint8_t*, uint8_t*), const uint8_t* dsrc, uint8_t* hdst,
-                                                                                                            const uint8_t* hsrc, uint8_t* ddst)
+measurebw2(const char* msg, const size_t bytes, void (*sendrecv)(uint8_t*, uint8_t*, uint8_t*, uint8_t*), uint8_t* dsrc, uint8_t* hdst,
+                                                                                                            uint8_t* hsrc, uint8_t* ddst)
 {
     const size_t num_samples = 100;
 
@@ -342,7 +344,7 @@ main(void)
     MPI_Init(NULL, NULL);
     // int provided;
     // MPI_Init_thread(NULL, NULL, MPI_THREAD_MULTIPLE, &provided);
-    // assert(provided >= MPI_THREAD_MULTIPLE);
+    // errchk(provided >= MPI_THREAD_MULTIPLE);
 
     // Disable stdout buffering
     setbuf(stdout, NULL);
@@ -350,7 +352,7 @@ main(void)
     int pid, nprocs;
     MPI_Comm_rank(MPI_COMM_WORLD, &pid);
     MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
-    assert(nprocs >= 2); // Require at least one neighbor
+    errchk(nprocs >= 2); // Require at least one neighbor
 
     MPI_Barrier(MPI_COMM_WORLD);
     if (!pid) {
@@ -432,7 +434,7 @@ main(void)
         freeDevice(dst);
     }
     PRINT("\n------------------------\n");
-    
+
     {
         uint8_t* hsrc = allocHost(BLOCK_SIZE);
         uint8_t* hdst = allocHost(BLOCK_SIZE);
@@ -450,7 +452,7 @@ main(void)
         freeHost(hdst);
     }
     PRINT("\n------------------------\n");
-    
+
     {
         uint8_t* hsrc = allocHost(BLOCK_SIZE);
         uint8_t* hdst = allocHost(BLOCK_SIZE);
diff --git a/src/core/device.cc b/src/core/device.cc
index 949fd69..2b1e482 100644
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -962,7 +962,7 @@ acSyncCommData(const CommData data)
 static int3
 mod(const int3 a, const int3 n)
 {
-    return (int3){mod(a.x, n.x), mod(a.y, n.y), mod(a.z, n.z)};
+    return (int3){(int)mod(a.x, n.x), (int)mod(a.y, n.y), (int)mod(a.z, n.z)};
 }
 
 static void
@@ -1058,7 +1058,6 @@ acTransferCommData(const Device device, //
     const int3 dims         = data->dims;
     const size_t blockcount = data->count;
     const size_t count      = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES;
-    const int3 nghost       = (int3){NGHOST, NGHOST, NGHOST};
 
     for (size_t b0_idx = 0; b0_idx < blockcount; ++b0_idx) {
 
@@ -1286,7 +1285,7 @@ acGridIntegrate(const Stream stream, const AcReal dt)
 
     const Device device  = grid.device;
     const int3 nn        = grid.nn;
-    CommData corner_data = grid.corner_data;
+    //CommData corner_data = grid.corner_data; // Do not rm: required for corners
     CommData edgex_data  = grid.edgex_data;
     CommData edgey_data  = grid.edgey_data;
     CommData edgez_data  = grid.edgez_data;
@@ -1297,6 +1296,8 @@ acGridIntegrate(const Stream stream, const AcReal dt)
     acDeviceSynchronizeStream(device, stream);
 
     // Corners
+    /*
+    // Do not rm: required for corners
     const int3 corner_b0s[] = {
         (int3){0, 0, 0},
         (int3){NGHOST + nn.x, 0, 0},
@@ -1308,6 +1309,7 @@ acGridIntegrate(const Stream stream, const AcReal dt)
         (int3){0, NGHOST + nn.y, NGHOST + nn.z},
         (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST + nn.z},
     };
+    */
 
     // Edges X
     const int3 edgex_b0s[] = {
@@ -1355,7 +1357,7 @@ acGridIntegrate(const Stream stream, const AcReal dt)
     };
 
     for (int isubstep = 0; isubstep < 3; ++isubstep) {
-        // acPackCommData(device, corner_b0s, &corner_data);
+        // acPackCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners
         acPackCommData(device, edgex_b0s, &edgex_data);
         acPackCommData(device, edgey_b0s, &edgey_data);
         acPackCommData(device, edgez_b0s, &edgez_data);
@@ -1363,18 +1365,10 @@ acGridIntegrate(const Stream stream, const AcReal dt)
         acPackCommData(device, sidexz_b0s, &sidexz_data);
         acPackCommData(device, sideyz_b0s, &sideyz_data);
 
-        //////////// INNER INTEGRATION //////////////
-        {
-            const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
-            const int3 m2 = nn;
-            acDeviceIntegrateSubstep(device, STREAM_16, isubstep, m1, m2, dt);
-        }
-        ////////////////////////////////////////////
-
         MPI_Barrier(MPI_COMM_WORLD);
 
 #if MPI_GPUDIRECT_DISABLED
-        // acTransferCommDataToHost(device, &corner_data);
+        // acTransferCommDataToHost(device, &corner_data); // Do not rm: required for corners
         acTransferCommDataToHost(device, &edgex_data);
         acTransferCommDataToHost(device, &edgey_data);
         acTransferCommDataToHost(device, &edgez_data);
@@ -1383,7 +1377,7 @@ acGridIntegrate(const Stream stream, const AcReal dt)
         acTransferCommDataToHost(device, &sideyz_data);
 #endif
 
-        // acTransferCommData(device, corner_b0s, &corner_data);
+        // acTransferCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners
         acTransferCommData(device, edgex_b0s, &edgex_data);
         acTransferCommData(device, edgey_b0s, &edgey_data);
         acTransferCommData(device, edgez_b0s, &edgez_data);
@@ -1391,7 +1385,7 @@ acGridIntegrate(const Stream stream, const AcReal dt)
         acTransferCommData(device, sidexz_b0s, &sidexz_data);
         acTransferCommData(device, sideyz_b0s, &sideyz_data);
 
-        // acTransferCommDataWait(corner_data);
+        // acTransferCommDataWait(corner_data); // Do not rm: required for corners
         acTransferCommDataWait(edgex_data);
         acTransferCommDataWait(edgey_data);
         acTransferCommDataWait(edgez_data);
@@ -1400,7 +1394,7 @@ acGridIntegrate(const Stream stream, const AcReal dt)
         acTransferCommDataWait(sideyz_data);
 
 #if MPI_GPUDIRECT_DISABLED
-        // acTransferCommDataToDevice(device, &corner_data);
+        // acTransferCommDataToDevice(device, &corner_data); // Do not rm: required for corners
         acTransferCommDataToDevice(device, &edgex_data);
         acTransferCommDataToDevice(device, &edgey_data);
         acTransferCommDataToDevice(device, &edgez_data);
@@ -1409,7 +1403,7 @@ acGridIntegrate(const Stream stream, const AcReal dt)
         acTransferCommDataToDevice(device, &sideyz_data);
 #endif
 
-        // acUnpinCommData(device, &corner_data);
+        // acUnpinCommData(device, &corner_data); // Do not rm: required for corners
         acUnpinCommData(device, &edgex_data);
         acUnpinCommData(device, &edgey_data);
         acUnpinCommData(device, &edgez_data);
@@ -1427,7 +1421,7 @@ acGridIntegrate(const Stream stream, const AcReal dt)
         //////////// OUTER INTEGRATION //////////////
 
         // Wait for unpacking
-        // acSyncCommData(corner_data);
+        // acSyncCommData(corner_data); // Do not rm: required for corners
         acSyncCommData(edgex_data);
         acSyncCommData(edgey_data);
         acSyncCommData(edgez_data);

From f97ed9e513f1c7d6102e54c0a42130c048818323 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Sat, 30 May 2020 20:59:39 +0300
Subject: [PATCH 35/89] For reason X git decided to remove integration from the
 most critical part of the program when merging. Luckily we have autotests.

---
 src/core/device.cc | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/core/device.cc b/src/core/device.cc
index 2b1e482..f473fc2 100644
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -1365,6 +1365,14 @@ acGridIntegrate(const Stream stream, const AcReal dt)
         acPackCommData(device, sidexz_b0s, &sidexz_data);
         acPackCommData(device, sideyz_b0s, &sideyz_data);
 
+        //////////// INNER INTEGRATION //////////////
+        {
+            const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
+            const int3 m2 = nn;
+            acDeviceIntegrateSubstep(device, STREAM_16, isubstep, m1, m2, dt);
+        }
+        ////////////////////////////////////////////
+
         MPI_Barrier(MPI_COMM_WORLD);
 
 #if MPI_GPUDIRECT_DISABLED

From a753ca92f2a58bf07b76e884171750214641ae4a Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Sat, 30 May 2020 22:02:39 +0300
Subject: [PATCH 36/89] Made cmake handle MPI linking. Potentially a bad idea
 (usually better to use mpicc and mpicxx wrappers)

---
 CMakeLists.txt          | 5 +++++
 src/core/CMakeLists.txt | 7 ++++---
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 59a371a..a5a514b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -81,6 +81,11 @@ include_directories(src/common)                                # Common headers
 include_directories(${CMAKE_BINARY_DIR})                       # DSL headers
 include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) # CUDA headers
 
+if (MPI_ENABLED)
+    find_package(MPI REQUIRED)
+    include_directories(${MPI_CXX_INCLUDE_DIRS})
+endif()
+
 ## Subdirectories
 add_subdirectory(src/utils)
 add_subdirectory(src/core/kernels)
diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index 7d93fd2..757cbfe 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -6,9 +6,10 @@ target_link_libraries(astaroth_core astaroth_utils astaroth_kernels CUDA::cudart
 
 ## Options
 if (MPI_ENABLED)
-    find_package(MPI)
-    find_package(OpenMP)
-    target_link_libraries(astaroth_core MPI::MPI_CXX OpenMP::OpenMP_CXX)
+    #find_package(MPI REQUIRED)
+    #find_package(OpenMP)
+    #target_link_libraries(astaroth_core MPI::MPI_CXX OpenMP::OpenMP_CXX)
+    target_link_libraries(astaroth_core MPI::MPI_CXX)
 endif()
 
 if (MULTIGPU_ENABLED)

From 0d80834619fa973052686fbf887dcca1fa664f23 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Tue, 2 Jun 2020 14:08:34 +0300
Subject: [PATCH 37/89] Disabled forcing and upwinding for performance tests.
 Set default grid size to 512^3. Set default cmake params s.t. benchmarks can
 be reproduced out-of-the-box.

---
 CMakeLists.txt                   | 10 +++++-----
 acc/mhd_solver/stencil_kernel.ac |  4 ++--
 config/astaroth.conf             |  6 +++---
 src/utils/modelsolver.c          |  2 +-
 4 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a5a514b..45c3a2b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -30,11 +30,11 @@ endif()
 message(STATUS "Build type: " ${CMAKE_BUILD_TYPE})
 
 ## Options
-option(DOUBLE_PRECISION "Generates double precision code."                    OFF)
-option(BUILD_SAMPLES    "Builds projects in samples subdirectory."            OFF)
-option(BUILD_STANDALONE "Builds standalone Astaroth."                         ON)
-option(MPI_ENABLED      "Enables additional functions for MPI communciation." OFF)
-option(MULTIGPU_ENABLED "Enables multi-GPU on a single node. Uses peer-to-peer communication instead of MPI. Affects Legacy & Node layers only." ON)
+option(DOUBLE_PRECISION "Generates double precision code."                    ON)
+option(BUILD_SAMPLES    "Builds projects in samples subdirectory."            ON)
+option(BUILD_STANDALONE "Builds standalone Astaroth."                         OFF)
+option(MPI_ENABLED      "Enables additional functions for MPI communciation." ON)
+option(MULTIGPU_ENABLED "Enables multi-GPU on a single node. Uses peer-to-peer communication instead of MPI. Affects Legacy & Node layers only." OFF)
 
 ## Options (DEPRECATED)
 # option(BUILD_DEBUG              "Builds the program with extensive error checking"          OFF)
diff --git a/acc/mhd_solver/stencil_kernel.ac b/acc/mhd_solver/stencil_kernel.ac
index 905eb65..0f37ea5 100644
--- a/acc/mhd_solver/stencil_kernel.ac
+++ b/acc/mhd_solver/stencil_kernel.ac
@@ -5,8 +5,8 @@
 #define LMAGNETIC (1)
 #define LENTROPY (1)
 #define LTEMPERATURE (0)
-#define LFORCING (1)
-#define LUPWD (1)
+#define LFORCING (0)
+#define LUPWD (0)
 #define LSINK (0)
 
 #define AC_THERMAL_CONDUCTIVITY (0.001) // TODO: make an actual config parameter
diff --git a/config/astaroth.conf b/config/astaroth.conf
index abc1613..ccefc45 100644
--- a/config/astaroth.conf
+++ b/config/astaroth.conf
@@ -5,9 +5,9 @@
  * "Compile-time" params
  * =============================================================================
  */
-AC_nx = 128
-AC_ny = 128
-AC_nz = 128
+AC_nx = 512
+AC_ny = 512
+AC_nz = 512
 
 AC_dsx = 0.04908738521
 AC_dsy = 0.04908738521
diff --git a/src/utils/modelsolver.c b/src/utils/modelsolver.c
index 92eb71c..e482bd9 100644
--- a/src/utils/modelsolver.c
+++ b/src/utils/modelsolver.c
@@ -39,7 +39,7 @@
 #define LENTROPY (1)
 #define LTEMPERATURE (0)
 #define LFORCING (0)
-#define LUPWD (1)
+#define LUPWD (0)
 #define AC_THERMAL_CONDUCTIVITY ((Scalar)(0.001)) // TODO: make an actual config parameter
 
 typedef AcReal Scalar;

From 899d679518cd943ea9dc4e95424d8ed280574f25 Mon Sep 17 00:00:00 2001
From: Oskar Lappi <lappiosk@puhti-login1.bullx>
Date: Tue, 2 Jun 2020 21:30:53 +0300
Subject: [PATCH 38/89] Draft of MPI-based reductions acGridReduceScal,
 acGridReduceVec

 - Calls acDeviceReduceScal/Vec first
 - Both functions then perform the same MPI-reduction (MPI_Allreduce)
 - Not tested
---
 src/core/device.cc | 70 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 70 insertions(+)

diff --git a/src/core/device.cc b/src/core/device.cc
index f473fc2..9846d2f 100644
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -1620,4 +1620,74 @@ acGridPeriodicBoundconds(const Stream stream)
     acSyncCommData(sideyz_data);
     return AC_SUCCESS;
 }
+
+AcResult
+acMPIReduceScal(AcReal* local_result, const ReductionType rtype, AcReal* result)
+{
+
+    MPI_Op op;
+    if (rtype == RTYPE_MAX) {
+        op = MPI_MAX;
+    } else if (rtype == RTYPE_MIN) {
+        op = MPI_MIN;
+    } else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP || rtype == RTYPE_SUM) {
+        op = MPI_SUM;
+    } else {
+        ERROR("Unrecognised rtype");
+    }
+
+    #if AC_DOUBLE_PRECISION == 1
+    MPI_Datatype datatype = MPI_DOUBLE;
+    #else
+    MPI_Datatype datatype = MPI_FLOAT;
+    #endif    
+
+    int world_size;
+    MPI_Comm_size(MPI_COMM_WORLD, &world_size);
+
+    if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
+        //Overflow risk?
+        *local_result = *local_result*(*local_result);
+    }
+
+    AcReal mpi_res;
+    MPI_Allreduce(&local_result, &mpi_res, 1, datatype, op, MPI_COMM_WORLD);
+
+    if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
+        const AcReal inv_n = AcReal(1.) / world_size;
+        mpi_res = sqrt(inv_n * mpi_res);
+    }
+    *result = mpi_res;
+    return AC_SUCCESS;
+
+}
+
+AcResult
+acGridReduceScal(const Device device, const Stream stream, const ReductionType rtype,
+                 const VertexBufferHandle vtxbuf_handle, AcReal* result)
+{
+    acGridSynchronizeStream(STREAM_ALL);
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    AcReal local_result;
+    acDeviceReduceScal(device, stream, rtype, vtxbuf_handle, &local_result);
+    
+    return acMPIReduceScal(&local_result,rtype,result);    
+}
+
+
+AcResult
+acGridReduceVec(const Device device, const Stream stream, const ReductionType rtype,
+                const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1,
+                const VertexBufferHandle vtxbuf2, AcReal* result)
+{
+    acGridSynchronizeStream(STREAM_ALL);
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    AcReal local_result;
+    acDeviceReduceVec(device, stream, rtype, vtxbuf0, vtxbuf1, vtxbuf2, &local_result);
+
+    return acMPIReduceScal(&local_result,rtype,result);    
+}
+
 #endif // AC_MPI_ENABLED

From 34793d4e8bc3fa9b121b15575c29b712094f6f5f Mon Sep 17 00:00:00 2001
From: Oskar Lappi <lappiosk@puhti-login1.bullx>
Date: Wed, 3 Jun 2020 12:44:43 +0300
Subject: [PATCH 39/89] Changes after code review with Johannes

---
 src/core/device.cc | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/src/core/device.cc b/src/core/device.cc
index 9846d2f..4846a44 100644
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -1621,8 +1621,9 @@ acGridPeriodicBoundconds(const Stream stream)
     return AC_SUCCESS;
 }
 
-AcResult
-acMPIReduceScal(AcReal* local_result, const ReductionType rtype, AcReal* result)
+
+static AcResult
+acMPIReduceScal(const AcReal local_result, const ReductionType rtype, AcReal* result)
 {
 
     MPI_Op op;
@@ -1641,20 +1642,20 @@ acMPIReduceScal(AcReal* local_result, const ReductionType rtype, AcReal* result)
     #else
     MPI_Datatype datatype = MPI_FLOAT;
     #endif    
+    
+    /*
+    int rank;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    */
 
     int world_size;
     MPI_Comm_size(MPI_COMM_WORLD, &world_size);
 
-    if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
-        //Overflow risk?
-        *local_result = *local_result*(*local_result);
-    }
-
     AcReal mpi_res;
     MPI_Allreduce(&local_result, &mpi_res, 1, datatype, op, MPI_COMM_WORLD);
 
     if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
-        const AcReal inv_n = AcReal(1.) / world_size;
+        const AcReal inv_n = AcReal(1.) / (grid.nn.x*grid.decomposition.x * grid.nn.y*grid.decomposition.y * grid.nn.z*grid.decomposition.z);
         mpi_res = sqrt(inv_n * mpi_res);
     }
     *result = mpi_res;
@@ -1671,8 +1672,8 @@ acGridReduceScal(const Device device, const Stream stream, const ReductionType r
 
     AcReal local_result;
     acDeviceReduceScal(device, stream, rtype, vtxbuf_handle, &local_result);
-    
-    return acMPIReduceScal(&local_result,rtype,result);    
+
+    return acMPIReduceScal(local_result,rtype,result);    
 }
 
 
@@ -1687,7 +1688,7 @@ acGridReduceVec(const Device device, const Stream stream, const ReductionType rt
     AcReal local_result;
     acDeviceReduceVec(device, stream, rtype, vtxbuf0, vtxbuf1, vtxbuf2, &local_result);
 
-    return acMPIReduceScal(&local_result,rtype,result);    
+    return acMPIReduceScal(local_result,rtype,result);    
 }
 
 #endif // AC_MPI_ENABLED

From 226de326513e2a1eba1137faa4f606f64f749028 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Wed, 3 Jun 2020 13:37:00 +0300
Subject: [PATCH 40/89] Added model solution for reductions and functions for
 automated testing

---
 include/astaroth.h       |  11 +++
 include/astaroth_utils.h |  28 ++++++
 samples/mpitest/main.cc  |  14 +++
 src/core/device.cc       |  24 +++--
 src/utils/CMakeLists.txt |   2 +-
 src/utils/modelreduce.c  | 209 +++++++++++++++++++++++++++++++++++++++
 src/utils/verification.c |  23 ++---
 7 files changed, 287 insertions(+), 24 deletions(-)
 create mode 100644 src/utils/modelreduce.c

diff --git a/include/astaroth.h b/include/astaroth.h
index b170796..47beb88 100644
--- a/include/astaroth.h
+++ b/include/astaroth.h
@@ -320,6 +320,17 @@ AcResult acGridIntegrate(const Stream stream, const AcReal dt);
 
 /** */
 AcResult acGridPeriodicBoundconds(const Stream stream);
+
+/** TODO */
+AcResult
+acGridReduceScal(const Stream stream, const ReductionType rtype,
+                 const VertexBufferHandle vtxbuf_handle, AcReal* result);
+
+/** TODO */
+AcResult
+acGridReduceVec(const Stream stream, const ReductionType rtype,
+               const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1,
+               const VertexBufferHandle vtxbuf2, AcReal* result);
 #endif // AC_MPI_ENABLED
 
 /*
diff --git a/include/astaroth_utils.h b/include/astaroth_utils.h
index 740e7f1..0422b7b 100644
--- a/include/astaroth_utils.h
+++ b/include/astaroth_utils.h
@@ -29,6 +29,25 @@
 extern "C" {
 #endif
 
+ #include <stdbool.h>
+
+typedef struct {
+    VertexBufferHandle handle;
+    AcReal model;
+    AcReal candidate;
+    long double abs_error;
+    long double ulp_error;
+    long double rel_error;
+    AcReal maximum_magnitude;
+    AcReal minimum_magnitude;
+} Error;
+
+/** TODO comment */
+Error acGetError(AcReal model, AcReal candidate);
+
+/** TODO comment */
+bool printErrorToScreen(const Error error);
+
 /** Loads data from the config file */
 AcResult acLoadConfig(const char* config_path, AcMeshInfo* config);
 
@@ -56,6 +75,15 @@ AcResult acMeshClear(AcMesh* mesh);
 /** */
 AcResult acModelIntegrateStep(AcMesh mesh, const AcReal dt);
 
+/** TODO */
+AcReal
+acModelReduceScal(const AcMesh mesh, const ReductionType rtype, const VertexBufferHandle a);
+
+/** TODO */
+AcReal
+acModelReduceVec(const AcMesh mesh, const ReductionType rtype, const VertexBufferHandle a,
+                 const VertexBufferHandle b, const VertexBufferHandle c);
+
 /** */
 AcResult acVerifyMesh(const AcMesh model, const AcMesh candidate);
 
diff --git a/samples/mpitest/main.cc b/samples/mpitest/main.cc
index 8d0a4fc..7b12fe2 100644
--- a/samples/mpitest/main.cc
+++ b/samples/mpitest/main.cc
@@ -53,6 +53,13 @@ main(void)
     acGridIntegrate(STREAM_DEFAULT, FLT_EPSILON);
     acGridPeriodicBoundconds(STREAM_DEFAULT);
 
+    // Do reductions
+    AcReal cand_reduce_res = 0;
+    VertexBufferHandle vtxbuf = VTXBUF_UUX;
+    ReductionType rtype = RTYPE_MAX;
+    acGridReduceScal(STREAM_DEFAULT, rtype, vtxbuf, &cand_reduce_res); // TODO
+
+
     acGridStoreMesh(STREAM_DEFAULT, &candidate);
     acGridQuit();
 
@@ -62,6 +69,13 @@ main(void)
         acMeshApplyPeriodicBounds(&model);
 
         acVerifyMesh(model, candidate);
+
+        // Check reductions
+        AcReal model_reduce_res = acModelReduceScal(model, RTYPE_MAX, vtxbuf);
+        Error error = acGetError(model_reduce_res, cand_reduce_res);
+        error.handle = vtxbuf;
+        printErrorToScreen(error);
+
         acMeshDestroy(&model);
         acMeshDestroy(&candidate);
     }
diff --git a/src/core/device.cc b/src/core/device.cc
index 4846a44..c82e80f 100644
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -1641,8 +1641,8 @@ acMPIReduceScal(const AcReal local_result, const ReductionType rtype, AcReal* re
     MPI_Datatype datatype = MPI_DOUBLE;
     #else
     MPI_Datatype datatype = MPI_FLOAT;
-    #endif    
-    
+    #endif
+
     /*
     int rank;
     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
@@ -1664,31 +1664,43 @@ acMPIReduceScal(const AcReal local_result, const ReductionType rtype, AcReal* re
 }
 
 AcResult
-acGridReduceScal(const Device device, const Stream stream, const ReductionType rtype,
+acGridReduceScal(const Stream stream, const ReductionType rtype,
                  const VertexBufferHandle vtxbuf_handle, AcReal* result)
 {
+    ERRCHK(grid.initialized);
+    // acGridSynchronizeStream(stream);
+
+    const Device device  = grid.device;
+    //const int3 nn        = grid.nn;
+
     acGridSynchronizeStream(STREAM_ALL);
     MPI_Barrier(MPI_COMM_WORLD);
 
     AcReal local_result;
     acDeviceReduceScal(device, stream, rtype, vtxbuf_handle, &local_result);
 
-    return acMPIReduceScal(local_result,rtype,result);    
+    return acMPIReduceScal(local_result,rtype,result);
 }
 
 
 AcResult
-acGridReduceVec(const Device device, const Stream stream, const ReductionType rtype,
+acGridReduceVec(const Stream stream, const ReductionType rtype,
                 const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1,
                 const VertexBufferHandle vtxbuf2, AcReal* result)
 {
+    ERRCHK(grid.initialized);
+    // acGridSynchronizeStream(stream);
+
+    const Device device  = grid.device;
+    //const int3 nn        = grid.nn;
+
     acGridSynchronizeStream(STREAM_ALL);
     MPI_Barrier(MPI_COMM_WORLD);
 
     AcReal local_result;
     acDeviceReduceVec(device, stream, rtype, vtxbuf0, vtxbuf1, vtxbuf2, &local_result);
 
-    return acMPIReduceScal(local_result,rtype,result);    
+    return acMPIReduceScal(local_result,rtype,result);
 }
 
 #endif // AC_MPI_ENABLED
diff --git a/src/utils/CMakeLists.txt b/src/utils/CMakeLists.txt
index 63e918c..47f1116 100644
--- a/src/utils/CMakeLists.txt
+++ b/src/utils/CMakeLists.txt
@@ -1,3 +1,3 @@
 ## Astaroth Utils
-add_library(astaroth_utils STATIC config_loader.c memory.c verification.c modelsolver.c)
+add_library(astaroth_utils STATIC config_loader.c memory.c verification.c modelsolver.c modelreduce.c)
 add_dependencies(astaroth_utils dsl_headers)
diff --git a/src/utils/modelreduce.c b/src/utils/modelreduce.c
new file mode 100644
index 0000000..d95fc8c
--- /dev/null
+++ b/src/utils/modelreduce.c
@@ -0,0 +1,209 @@
+/*
+    Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
+ * \brief Brief info.
+ *
+ * Detailed info.
+ *
+ */
+ #include "astaroth.h"
+
+#include <math.h>
+
+#include "errchk.h"
+
+#if AC_DOUBLE_PRECISION == 0 // HACK TODO fix, make cleaner (purkkaratkaisu)
+#define fabs fabsf
+#define exp expf
+#define sqrt sqrtf
+#endif
+
+// Function pointer definitions
+typedef AcReal (*ReduceFunc)(const AcReal, const AcReal);
+typedef AcReal (*ReduceInitialScalFunc)(const AcReal);
+typedef AcReal (*ReduceInitialVecFunc)(const AcReal, const AcReal,
+                                            const AcReal);
+
+// clang-format off
+/* Comparison funcs */
+static inline AcReal
+max(const AcReal a, const AcReal b) { return a > b ? a : b; }
+
+static inline AcReal
+min(const AcReal a, const AcReal b) { return a < b ? a : b; }
+
+static inline AcReal
+sum(const AcReal a, const AcReal b) { return a + b; }
+
+/* Function used to determine the values used during reduction */
+static inline AcReal
+length_scal(const AcReal a) { return (AcReal)(a); }
+
+static inline AcReal
+length_vec(const AcReal a, const AcReal b, const AcReal c) { return sqrt(a*a + b*b + c*c); }
+
+static inline AcReal
+squared_scal(const AcReal a) { return (AcReal)(a*a); }
+
+static inline AcReal
+squared_vec(const AcReal a, const AcReal b, const AcReal c) { return squared_scal(a) + squared_scal(b) + squared_scal(c); }
+
+static inline AcReal
+exp_squared_scal(const AcReal a) { return exp(a)*exp(a); }
+
+static inline AcReal
+exp_squared_vec(const AcReal a, const AcReal b, const AcReal c) { return exp_squared_scal(a) + exp_squared_scal(b) + exp_squared_scal(c); }
+// clang-format on
+
+AcReal
+acModelReduceScal(const AcMesh mesh, const ReductionType rtype, const VertexBufferHandle a)
+{
+    ReduceInitialScalFunc reduce_initial;
+    ReduceFunc reduce;
+
+    bool solve_mean = false;
+
+    switch (rtype) {
+    case RTYPE_MAX:
+        reduce_initial = length_scal;
+        reduce         = max;
+        break;
+    case RTYPE_MIN:
+        reduce_initial = length_scal;
+        reduce         = min;
+        break;
+    case RTYPE_RMS:
+        reduce_initial = squared_scal;
+        reduce         = sum;
+        solve_mean     = true;
+        break;
+    case RTYPE_RMS_EXP:
+        reduce_initial = exp_squared_scal;
+        reduce         = sum;
+        solve_mean     = true;
+        break;
+    case RTYPE_SUM:
+        reduce_initial = length_scal;
+        reduce         = sum;
+        break;
+    default:
+        ERROR("Unrecognized RTYPE");
+    }
+
+    const int initial_idx = acVertexBufferIdx(mesh.info.int_params[AC_nx_min],
+                                              mesh.info.int_params[AC_ny_min],
+                                              mesh.info.int_params[AC_nz_min], mesh.info);
+
+    AcReal res;
+    if (rtype == RTYPE_MAX || rtype == RTYPE_MIN)
+        res = reduce_initial(mesh.vertex_buffer[a][initial_idx]);
+    else
+        res = 0;
+
+    for (int k = mesh.info.int_params[AC_nz_min]; k < mesh.info.int_params[AC_nz_max]; ++k) {
+        for (int j = mesh.info.int_params[AC_ny_min]; j < mesh.info.int_params[AC_ny_max]; ++j) {
+            for (int i = mesh.info.int_params[AC_nx_min]; i < mesh.info.int_params[AC_nx_max];
+                 ++i) {
+                const int idx              = acVertexBufferIdx(i, j, k, mesh.info);
+                const AcReal curr_val = reduce_initial(mesh.vertex_buffer[a][idx]);
+                res                        = reduce(res, curr_val);
+            }
+        }
+    }
+
+    if (solve_mean) {
+        const AcReal inv_n = (AcReal)1.0 / mesh.info.int_params[AC_nxyz];
+        return sqrt(inv_n * res);
+    }
+    else {
+        return res;
+    }
+}
+
+AcReal
+acModelReduceVec(const AcMesh mesh, const ReductionType rtype, const VertexBufferHandle a,
+                 const VertexBufferHandle b, const VertexBufferHandle c)
+{
+    // AcReal (*reduce_initial)(AcReal, AcReal, AcReal);
+    ReduceInitialVecFunc reduce_initial;
+    ReduceFunc reduce;
+
+    bool solve_mean = false;
+
+    switch (rtype) {
+    case RTYPE_MAX:
+        reduce_initial = length_vec;
+        reduce         = max;
+        break;
+    case RTYPE_MIN:
+        reduce_initial = length_vec;
+        reduce         = min;
+        break;
+    case RTYPE_RMS:
+        reduce_initial = squared_vec;
+        reduce         = sum;
+        solve_mean     = true;
+        break;
+    case RTYPE_RMS_EXP:
+        reduce_initial = exp_squared_vec;
+        reduce         = sum;
+        solve_mean     = true;
+        break;
+    case RTYPE_SUM:
+        reduce_initial = length_vec;
+        reduce         = sum;
+        break;
+    default:
+        ERROR("Unrecognized RTYPE");
+    }
+
+    const int initial_idx = acVertexBufferIdx(mesh.info.int_params[AC_nx_min],
+                                              mesh.info.int_params[AC_ny_min],
+                                              mesh.info.int_params[AC_nz_min], mesh.info);
+
+    AcReal res;
+    if (rtype == RTYPE_MAX || rtype == RTYPE_MIN)
+        res = reduce_initial(mesh.vertex_buffer[a][initial_idx], mesh.vertex_buffer[b][initial_idx],
+                             mesh.vertex_buffer[c][initial_idx]);
+    else
+        res = 0;
+
+    for (int k = mesh.info.int_params[AC_nz_min]; k < mesh.info.int_params[AC_nz_max]; k++) {
+        for (int j = mesh.info.int_params[AC_ny_min]; j < mesh.info.int_params[AC_ny_max]; j++) {
+            for (int i = mesh.info.int_params[AC_nx_min]; i < mesh.info.int_params[AC_nx_max];
+                 i++) {
+                const int idx              = acVertexBufferIdx(i, j, k, mesh.info);
+                const AcReal curr_val = reduce_initial(mesh.vertex_buffer[a][idx],
+                                                            mesh.vertex_buffer[b][idx],
+                                                            mesh.vertex_buffer[c][idx]);
+                res                        = reduce(res, curr_val);
+            }
+        }
+    }
+
+    if (solve_mean) {
+        const AcReal inv_n = (AcReal)1.0 / mesh.info.int_params[AC_nxyz];
+        return sqrt(inv_n * res);
+    }
+    else {
+        return res;
+    }
+}
diff --git a/src/utils/verification.c b/src/utils/verification.c
index 3277660..a3ddb18 100644
--- a/src/utils/verification.c
+++ b/src/utils/verification.c
@@ -18,25 +18,14 @@
 #define WHT "\x1B[37m"
 #define RESET "\x1B[0m"
 
-typedef struct {
-    VertexBufferHandle handle;
-    AcReal model;
-    AcReal candidate;
-    long double abs_error;
-    long double ulp_error;
-    long double rel_error;
-    AcReal maximum_magnitude;
-    AcReal minimum_magnitude;
-} Error;
-
 static inline bool
 is_valid(const AcReal a)
 {
     return !isnan(a) && !isinf(a);
 }
 
-static Error
-get_error(AcReal model, AcReal candidate)
+Error
+acGetError(AcReal model, AcReal candidate)
 {
     Error error;
     error.abs_error = 0;
@@ -109,7 +98,7 @@ get_max_abs_error(const VertexBufferHandle vtxbuf_handle, const AcMesh model_mes
 
     for (size_t i = 0; i < acVertexBufferSize(model_mesh.info); ++i) {
 
-        Error curr_error = get_error(model_vtxbuf[i], candidate_vtxbuf[i]);
+        Error curr_error = acGetError(model_vtxbuf[i], candidate_vtxbuf[i]);
 
         if (curr_error.abs_error > error.abs_error)
             error = curr_error;
@@ -147,8 +136,8 @@ is_acceptable(const Error error)
         return false;
 }
 
-static bool
-print_error_to_screen(const Error error)
+bool
+printErrorToScreen(const Error error)
 {
     bool errors_found = false;
 
@@ -177,7 +166,7 @@ acVerifyMesh(const AcMesh model, const AcMesh candidate)
     bool errors_found = false;
     for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
         Error field_error = get_max_abs_error(i, model, candidate);
-        errors_found |= print_error_to_screen(field_error);
+        errors_found |= printErrorToScreen(field_error);
     }
 
     printf("%s\n", errors_found ? "Failure. Found errors in one or more vertex buffers"

From f7d8de75d24ccbef0c18021171f944ca88955c75 Mon Sep 17 00:00:00 2001
From: Oskar Lappi <lappiosk@puhti-login1.bullx>
Date: Thu, 4 Jun 2020 13:42:34 +0300
Subject: [PATCH 41/89] Reduction test pipeline added to mpitest, Error struct
 changed: new label field

 - CHANGED: Error struct has a new label field for labeling an error
   - The label is what is printed to screen
   - vtxbuf name lookup moved out of printErrorToScreen/print_error_to_screen
 - NEW: acScalReductionTestCase and acVecReductionTestCase
   - Define new test cases by adding them to a list in samples/mpitest/main.cc:main
 - Minor style change in verification.c to make all Verification functions similar
   and fit one screen
---
 include/astaroth_utils.h | 34 ++++++++++++++++++++++++-----
 samples/mpitest/main.cc  | 37 +++++++++++++++++++++++--------
 src/utils/verification.c | 47 +++++++++++++++++++++++++++++++++++-----
 3 files changed, 98 insertions(+), 20 deletions(-)

diff --git a/include/astaroth_utils.h b/include/astaroth_utils.h
index 0422b7b..88c5a05 100644
--- a/include/astaroth_utils.h
+++ b/include/astaroth_utils.h
@@ -29,9 +29,12 @@
 extern "C" {
 #endif
 
- #include <stdbool.h>
+#include <stdbool.h>
+
+#define ERROR_LABEL_LENGTH 30
 
 typedef struct {
+    char label[ERROR_LABEL_LENGTH];
     VertexBufferHandle handle;
     AcReal model;
     AcReal candidate;
@@ -42,6 +45,22 @@ typedef struct {
     AcReal minimum_magnitude;
 } Error;
 
+typedef struct {
+    char label[ERROR_LABEL_LENGTH];
+    VertexBufferHandle vtxbuf;
+    ReductionType rtype;
+    AcReal candidate;
+} AcScalReductionTestCase;
+
+typedef struct {
+    char label[ERROR_LABEL_LENGTH];
+    VertexBufferHandle a;
+    VertexBufferHandle b;
+    VertexBufferHandle c;
+    ReductionType rtype;
+    AcReal candidate;
+} AcVecReductionTestCase;
+
 /** TODO comment */
 Error acGetError(AcReal model, AcReal candidate);
 
@@ -76,17 +95,20 @@ AcResult acMeshClear(AcMesh* mesh);
 AcResult acModelIntegrateStep(AcMesh mesh, const AcReal dt);
 
 /** TODO */
-AcReal
-acModelReduceScal(const AcMesh mesh, const ReductionType rtype, const VertexBufferHandle a);
+AcReal acModelReduceScal(const AcMesh mesh, const ReductionType rtype, const VertexBufferHandle a);
 
 /** TODO */
-AcReal
-acModelReduceVec(const AcMesh mesh, const ReductionType rtype, const VertexBufferHandle a,
-                 const VertexBufferHandle b, const VertexBufferHandle c);
+AcReal acModelReduceVec(const AcMesh mesh, const ReductionType rtype, const VertexBufferHandle a, const VertexBufferHandle b, const VertexBufferHandle c);
 
 /** */
 AcResult acVerifyMesh(const AcMesh model, const AcMesh candidate);
 
+/** */
+AcResult acVerifyScalReductions(const AcMesh model, const AcScalReductionTestCase* testCases, const size_t numCases);
+
+/** */
+AcResult acVerifyVecReductions(const AcMesh model, const AcVecReductionTestCase* testCases, const size_t numCases);
+
 #ifdef __cplusplus
 } // extern "C"
 #endif
diff --git a/samples/mpitest/main.cc b/samples/mpitest/main.cc
index 7b12fe2..b25d990 100644
--- a/samples/mpitest/main.cc
+++ b/samples/mpitest/main.cc
@@ -25,6 +25,7 @@
 #if AC_MPI_ENABLED
 
 #include <mpi.h>
+#include <vector>
 
 int
 main(void)
@@ -53,12 +54,32 @@ main(void)
     acGridIntegrate(STREAM_DEFAULT, FLT_EPSILON);
     acGridPeriodicBoundconds(STREAM_DEFAULT);
 
-    // Do reductions
-    AcReal cand_reduce_res = 0;
-    VertexBufferHandle vtxbuf = VTXBUF_UUX;
-    ReductionType rtype = RTYPE_MAX;
-    acGridReduceScal(STREAM_DEFAULT, rtype, vtxbuf, &cand_reduce_res); // TODO
+    // clang-format off
+    // Define scalar reduction tests here
+    std::vector<AcScalReductionTestCase> scalarReductionTests{
+        AcScalReductionTestCase{"Scalar MAX",     VTXBUF_UUX, RTYPE_MAX,     0},
+        AcScalReductionTestCase{"Scalar MIN",     VTXBUF_UUX, RTYPE_MIN,     0},
+        AcScalReductionTestCase{"Scalar RMS",     VTXBUF_UUX, RTYPE_RMS,     0},
+        AcScalReductionTestCase{"Scalar RMS_EXP", VTXBUF_UUX, RTYPE_RMS_EXP, 0},
+        AcScalReductionTestCase{"Scalar SUM",     VTXBUF_UUX, RTYPE_SUM,     0}
+    };
+    // Define vector reduction tests here
+    std::vector<AcVecReductionTestCase> vectorReductionTests{
+        AcVecReductionTestCase{"Vector MAX",     VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, RTYPE_MAX,     0},
+        AcVecReductionTestCase{"Vector MIN",     VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, RTYPE_MIN,     0},
+        AcVecReductionTestCase{"Vector RMS",     VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, RTYPE_RMS,     0},
+        AcVecReductionTestCase{"Vector RMS_EXP", VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, RTYPE_RMS_EXP, 0},
+        AcVecReductionTestCase{"Vector SUM",     VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, RTYPE_SUM,     0}
+    };
+    // clang-format on
 
+    for (auto& testCase : scalarReductionTests) {
+        acGridReduceScal(STREAM_DEFAULT, testCase.rtype, testCase.vtxbuf, &testCase.candidate);
+    }
+    for (auto& testCase : vectorReductionTests) {
+        acGridReduceVec(STREAM_DEFAULT, testCase.rtype, testCase.a, testCase.b, testCase.c,
+                        &testCase.candidate);
+    }
 
     acGridStoreMesh(STREAM_DEFAULT, &candidate);
     acGridQuit();
@@ -71,10 +92,8 @@ main(void)
         acVerifyMesh(model, candidate);
 
         // Check reductions
-        AcReal model_reduce_res = acModelReduceScal(model, RTYPE_MAX, vtxbuf);
-        Error error = acGetError(model_reduce_res, cand_reduce_res);
-        error.handle = vtxbuf;
-        printErrorToScreen(error);
+        acVerifyScalReductions(model, scalarReductionTests.data(), scalarReductionTests.size());
+        acVerifyVecReductions(model, vectorReductionTests.data(), vectorReductionTests.size());
 
         acMeshDestroy(&model);
         acMeshDestroy(&candidate);
diff --git a/src/utils/verification.c b/src/utils/verification.c
index a3ddb18..fcae0bc 100644
--- a/src/utils/verification.c
+++ b/src/utils/verification.c
@@ -2,6 +2,7 @@
 
 #include <math.h>
 #include <stdbool.h>
+#include <string.h>
 
 #define max(a, b) ((a) > (b) ? (a) : (b))
 #define min(a, b) ((a) < (b) ? (a) : (b))
@@ -105,6 +106,7 @@ get_max_abs_error(const VertexBufferHandle vtxbuf_handle, const AcMesh model_mes
     }
 
     error.handle            = vtxbuf_handle;
+    strcpy(error.label, vtxbuf_names[vtxbuf_handle]);
     error.maximum_magnitude = get_maximum_magnitude(model_vtxbuf, model_mesh.info);
     error.minimum_magnitude = get_minimum_magnitude(model_vtxbuf, model_mesh.info);
 
@@ -141,7 +143,7 @@ printErrorToScreen(const Error error)
 {
     bool errors_found = false;
 
-    printf("\t%-15s... ", vtxbuf_names[error.handle]);
+    printf("\t%-15s... ", error.label);
     if (is_acceptable(error)) {
         printf(GRN "OK! " RESET);
     }
@@ -172,8 +174,43 @@ acVerifyMesh(const AcMesh model, const AcMesh candidate)
     printf("%s\n", errors_found ? "Failure. Found errors in one or more vertex buffers"
                                 : "Success. No errors found.");
 
-    if (errors_found)
-        return AC_FAILURE;
-    else
-        return AC_SUCCESS;
+    return errors_found ? AC_FAILURE : AC_SUCCESS;
+}
+
+/** Verification function for scalar reductions*/
+AcResult 
+acVerifyScalReductions(const AcMesh model, const AcScalReductionTestCase* testCases, const size_t numCases)
+{
+    printf("\nTesting scalar reductions:\n");
+
+    bool errors_found = false;
+    for (size_t i = 0; i < numCases; i++){
+        AcReal model_reduction = acModelReduceScal(model, testCases[i].rtype, testCases[i].vtxbuf);
+        Error error = acGetError(model_reduction, testCases[i].candidate);
+        strcpy(error.label, testCases[i].label);
+        errors_found |= printErrorToScreen(error);
+    }
+    printf("%s\n", errors_found ? "Failure. Found errors in one or more scalar reductions"
+                                : "Success. No errors found.");
+
+    return errors_found ? AC_FAILURE : AC_SUCCESS;
+}
+
+/** Verification function for vector reductions*/
+AcResult 
+acVerifyVecReductions(const AcMesh model, const AcVecReductionTestCase* testCases, const size_t numCases)
+{
+    printf("\nTesting vector reductions:\n");
+
+    bool errors_found = false;
+    for (size_t i = 0; i < numCases; i++){
+        AcReal model_reduction = acModelReduceVec(model, testCases[i].rtype, testCases[i].a, testCases[i].b, testCases[i].c);
+        Error error = acGetError(model_reduction, testCases[i].candidate);
+        strcpy(error.label, testCases[i].label);
+        errors_found |= printErrorToScreen(error);
+    }
+    printf("%s\n", errors_found ? "Failure. Found errors in one or more vector reductions"
+                                : "Success. No errors found.");
+
+    return errors_found ? AC_FAILURE : AC_SUCCESS;
 }

From 9e5fd40838892424d747319bd09a3c5c5df15784 Mon Sep 17 00:00:00 2001
From: Oskar Lappi <lappiosk@puhti-login1.bullx>
Date: Thu, 4 Jun 2020 18:47:31 +0300
Subject: [PATCH 42/89] Changes after code review by Johannes, and clang-format

---
 include/astaroth_utils.h |  6 ++++
 samples/mpitest/main.cc  | 20 ++++++-------
 src/core/device.cc       | 46 +++++++++++++---------------
 src/utils/verification.c | 65 ++++++++++++++++++++++++++++++++--------
 4 files changed, 89 insertions(+), 48 deletions(-)

diff --git a/include/astaroth_utils.h b/include/astaroth_utils.h
index 88c5a05..d96074c 100644
--- a/include/astaroth_utils.h
+++ b/include/astaroth_utils.h
@@ -79,6 +79,12 @@ AcResult acMeshCreate(const AcMeshInfo mesh_info, AcMesh* mesh);
 /** */
 AcResult acMeshDestroy(AcMesh* mesh);
 
+/** */
+AcScalReductionTestCase acCreateScalReductionTestCase(const char* label, const VertexBufferHandle vtxbuf, const ReductionType rtype);
+
+/** */
+AcVecReductionTestCase acCreateVecReductionTestCase(const char* label, const VertexBufferHandle a, const VertexBufferHandle b, const VertexBufferHandle c, const ReductionType rtype);
+
 /** */
 AcResult acMeshSet(const AcReal value, AcMesh* mesh);
 
diff --git a/samples/mpitest/main.cc b/samples/mpitest/main.cc
index b25d990..f8de568 100644
--- a/samples/mpitest/main.cc
+++ b/samples/mpitest/main.cc
@@ -57,19 +57,19 @@ main(void)
     // clang-format off
     // Define scalar reduction tests here
     std::vector<AcScalReductionTestCase> scalarReductionTests{
-        AcScalReductionTestCase{"Scalar MAX",     VTXBUF_UUX, RTYPE_MAX,     0},
-        AcScalReductionTestCase{"Scalar MIN",     VTXBUF_UUX, RTYPE_MIN,     0},
-        AcScalReductionTestCase{"Scalar RMS",     VTXBUF_UUX, RTYPE_RMS,     0},
-        AcScalReductionTestCase{"Scalar RMS_EXP", VTXBUF_UUX, RTYPE_RMS_EXP, 0},
-        AcScalReductionTestCase{"Scalar SUM",     VTXBUF_UUX, RTYPE_SUM,     0}
+        acCreateScalReductionTestCase("Scalar MAX",     VTXBUF_UUX, RTYPE_MAX),
+        acCreateScalReductionTestCase("Scalar MIN",     VTXBUF_UUX, RTYPE_MIN),
+        acCreateScalReductionTestCase("Scalar RMS",     VTXBUF_UUX, RTYPE_RMS),
+        acCreateScalReductionTestCase("Scalar RMS_EXP", VTXBUF_UUX, RTYPE_RMS_EXP),
+        acCreateScalReductionTestCase("Scalar SUM",     VTXBUF_UUX, RTYPE_SUM)
     };
     // Define vector reduction tests here
     std::vector<AcVecReductionTestCase> vectorReductionTests{
-        AcVecReductionTestCase{"Vector MAX",     VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, RTYPE_MAX,     0},
-        AcVecReductionTestCase{"Vector MIN",     VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, RTYPE_MIN,     0},
-        AcVecReductionTestCase{"Vector RMS",     VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, RTYPE_RMS,     0},
-        AcVecReductionTestCase{"Vector RMS_EXP", VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, RTYPE_RMS_EXP, 0},
-        AcVecReductionTestCase{"Vector SUM",     VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, RTYPE_SUM,     0}
+        acCreateVecReductionTestCase("Vector MAX",     VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, RTYPE_MAX),
+        acCreateVecReductionTestCase("Vector MIN",     VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, RTYPE_MIN),
+        acCreateVecReductionTestCase("Vector RMS",     VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, RTYPE_RMS),
+        acCreateVecReductionTestCase("Vector RMS_EXP", VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, RTYPE_RMS_EXP),
+        acCreateVecReductionTestCase("Vector SUM",     VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, RTYPE_SUM)
     };
     // clang-format on
 
diff --git a/src/core/device.cc b/src/core/device.cc
index c82e80f..a3ead26 100644
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -1283,9 +1283,9 @@ acGridIntegrate(const Stream stream, const AcReal dt)
     ERRCHK(grid.initialized);
     // acGridSynchronizeStream(stream);
 
-    const Device device  = grid.device;
-    const int3 nn        = grid.nn;
-    //CommData corner_data = grid.corner_data; // Do not rm: required for corners
+    const Device device = grid.device;
+    const int3 nn       = grid.nn;
+    // CommData corner_data = grid.corner_data; // Do not rm: required for corners
     CommData edgex_data  = grid.edgex_data;
     CommData edgey_data  = grid.edgey_data;
     CommData edgez_data  = grid.edgez_data;
@@ -1621,7 +1621,6 @@ acGridPeriodicBoundconds(const Stream stream)
     return AC_SUCCESS;
 }
 
-
 static AcResult
 acMPIReduceScal(const AcReal local_result, const ReductionType rtype, AcReal* result)
 {
@@ -1629,19 +1628,22 @@ acMPIReduceScal(const AcReal local_result, const ReductionType rtype, AcReal* re
     MPI_Op op;
     if (rtype == RTYPE_MAX) {
         op = MPI_MAX;
-    } else if (rtype == RTYPE_MIN) {
+    }
+    else if (rtype == RTYPE_MIN) {
         op = MPI_MIN;
-    } else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP || rtype == RTYPE_SUM) {
+    }
+    else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP || rtype == RTYPE_SUM) {
         op = MPI_SUM;
-    } else {
+    }
+    else {
         ERROR("Unrecognised rtype");
     }
 
-    #if AC_DOUBLE_PRECISION == 1
+#if AC_DOUBLE_PRECISION == 1
     MPI_Datatype datatype = MPI_DOUBLE;
-    #else
+#else
     MPI_Datatype datatype = MPI_FLOAT;
-    #endif
+#endif
 
     /*
     int rank;
@@ -1655,12 +1657,12 @@ acMPIReduceScal(const AcReal local_result, const ReductionType rtype, AcReal* re
     MPI_Allreduce(&local_result, &mpi_res, 1, datatype, op, MPI_COMM_WORLD);
 
     if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
-        const AcReal inv_n = AcReal(1.) / (grid.nn.x*grid.decomposition.x * grid.nn.y*grid.decomposition.y * grid.nn.z*grid.decomposition.z);
-        mpi_res = sqrt(inv_n * mpi_res);
+        const AcReal inv_n = AcReal(1.) / (grid.nn.x * grid.decomposition.x * grid.nn.y *
+                                           grid.decomposition.y * grid.nn.z * grid.decomposition.z);
+        mpi_res            = sqrt(inv_n * mpi_res);
     }
     *result = mpi_res;
     return AC_SUCCESS;
-
 }
 
 AcResult
@@ -1668,10 +1670,8 @@ acGridReduceScal(const Stream stream, const ReductionType rtype,
                  const VertexBufferHandle vtxbuf_handle, AcReal* result)
 {
     ERRCHK(grid.initialized);
-    // acGridSynchronizeStream(stream);
 
-    const Device device  = grid.device;
-    //const int3 nn        = grid.nn;
+    const Device device = grid.device;
 
     acGridSynchronizeStream(STREAM_ALL);
     MPI_Barrier(MPI_COMM_WORLD);
@@ -1679,20 +1679,16 @@ acGridReduceScal(const Stream stream, const ReductionType rtype,
     AcReal local_result;
     acDeviceReduceScal(device, stream, rtype, vtxbuf_handle, &local_result);
 
-    return acMPIReduceScal(local_result,rtype,result);
+    return acMPIReduceScal(local_result, rtype, result);
 }
 
-
 AcResult
-acGridReduceVec(const Stream stream, const ReductionType rtype,
-                const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1,
-                const VertexBufferHandle vtxbuf2, AcReal* result)
+acGridReduceVec(const Stream stream, const ReductionType rtype, const VertexBufferHandle vtxbuf0,
+                const VertexBufferHandle vtxbuf1, const VertexBufferHandle vtxbuf2, AcReal* result)
 {
     ERRCHK(grid.initialized);
-    // acGridSynchronizeStream(stream);
 
-    const Device device  = grid.device;
-    //const int3 nn        = grid.nn;
+    const Device device = grid.device;
 
     acGridSynchronizeStream(STREAM_ALL);
     MPI_Barrier(MPI_COMM_WORLD);
@@ -1700,7 +1696,7 @@ acGridReduceVec(const Stream stream, const ReductionType rtype,
     AcReal local_result;
     acDeviceReduceVec(device, stream, rtype, vtxbuf0, vtxbuf1, vtxbuf2, &local_result);
 
-    return acMPIReduceScal(local_result,rtype,result);
+    return acMPIReduceScal(local_result, rtype, result);
 }
 
 #endif // AC_MPI_ENABLED
diff --git a/src/utils/verification.c b/src/utils/verification.c
index fcae0bc..393fbc3 100644
--- a/src/utils/verification.c
+++ b/src/utils/verification.c
@@ -105,8 +105,9 @@ get_max_abs_error(const VertexBufferHandle vtxbuf_handle, const AcMesh model_mes
             error = curr_error;
     }
 
-    error.handle            = vtxbuf_handle;
-    strcpy(error.label, vtxbuf_names[vtxbuf_handle]);
+    error.handle = vtxbuf_handle;
+    strncpy(error.label, vtxbuf_names[vtxbuf_handle], ERROR_LABEL_LENGTH - 1);
+    error.label[ERROR_LABEL_LENGTH - 1] = '\0';
     error.maximum_magnitude = get_maximum_magnitude(model_vtxbuf, model_mesh.info);
     error.minimum_magnitude = get_minimum_magnitude(model_vtxbuf, model_mesh.info);
 
@@ -178,16 +179,18 @@ acVerifyMesh(const AcMesh model, const AcMesh candidate)
 }
 
 /** Verification function for scalar reductions*/
-AcResult 
-acVerifyScalReductions(const AcMesh model, const AcScalReductionTestCase* testCases, const size_t numCases)
+AcResult
+acVerifyScalReductions(const AcMesh model, const AcScalReductionTestCase* testCases,
+                       const size_t numCases)
 {
     printf("\nTesting scalar reductions:\n");
 
     bool errors_found = false;
-    for (size_t i = 0; i < numCases; i++){
+    for (size_t i = 0; i < numCases; i++) {
         AcReal model_reduction = acModelReduceScal(model, testCases[i].rtype, testCases[i].vtxbuf);
-        Error error = acGetError(model_reduction, testCases[i].candidate);
-        strcpy(error.label, testCases[i].label);
+        Error error            = acGetError(model_reduction, testCases[i].candidate);
+        strncpy(error.label, testCases[i].label, ERROR_LABEL_LENGTH - 1);
+        error.label[ERROR_LABEL_LENGTH - 1] = '\0';
         errors_found |= printErrorToScreen(error);
     }
     printf("%s\n", errors_found ? "Failure. Found errors in one or more scalar reductions"
@@ -197,16 +200,19 @@ acVerifyScalReductions(const AcMesh model, const AcScalReductionTestCase* testCa
 }
 
 /** Verification function for vector reductions*/
-AcResult 
-acVerifyVecReductions(const AcMesh model, const AcVecReductionTestCase* testCases, const size_t numCases)
+AcResult
+acVerifyVecReductions(const AcMesh model, const AcVecReductionTestCase* testCases,
+                      const size_t numCases)
 {
     printf("\nTesting vector reductions:\n");
 
     bool errors_found = false;
-    for (size_t i = 0; i < numCases; i++){
-        AcReal model_reduction = acModelReduceVec(model, testCases[i].rtype, testCases[i].a, testCases[i].b, testCases[i].c);
-        Error error = acGetError(model_reduction, testCases[i].candidate);
-        strcpy(error.label, testCases[i].label);
+    for (size_t i = 0; i < numCases; i++) {
+        AcReal model_reduction = acModelReduceVec(model, testCases[i].rtype, testCases[i].a,
+                                                  testCases[i].b, testCases[i].c);
+        Error error            = acGetError(model_reduction, testCases[i].candidate);
+        strncpy(error.label, testCases[i].label, ERROR_LABEL_LENGTH - 1);
+        error.label[ERROR_LABEL_LENGTH - 1] = '\0';
         errors_found |= printErrorToScreen(error);
     }
     printf("%s\n", errors_found ? "Failure. Found errors in one or more vector reductions"
@@ -214,3 +220,36 @@ acVerifyVecReductions(const AcMesh model, const AcVecReductionTestCase* testCase
 
     return errors_found ? AC_FAILURE : AC_SUCCESS;
 }
+
+/** Constructor for scalar reduction test case */
+AcScalReductionTestCase
+acCreateScalReductionTestCase(const char* label, const VertexBufferHandle vtxbuf, const ReductionType rtype)
+{
+    AcScalReductionTestCase testCase;
+
+    strncpy(testCase.label,label,ERROR_LABEL_LENGTH - 1);
+    testCase.label[ERROR_LABEL_LENGTH - 1] = '\0';
+    testCase.vtxbuf = vtxbuf;
+    testCase.rtype = rtype;
+    testCase.candidate = 0;
+
+    return testCase;
+}
+
+/** Constructor for vector reduction test case */
+AcVecReductionTestCase
+acCreateVecReductionTestCase(const char* label, const VertexBufferHandle a,
+                const VertexBufferHandle b, const VertexBufferHandle c, const ReductionType rtype)
+{
+    AcVecReductionTestCase testCase;
+
+    strncpy(testCase.label,label,ERROR_LABEL_LENGTH - 1);
+    testCase.label[ERROR_LABEL_LENGTH - 1] = '\0';
+    testCase.a = a;
+    testCase.b = b;
+    testCase.c = c;
+    testCase.rtype = rtype;
+    testCase.candidate = 0;
+
+    return testCase;
+}

From 17a4f314519dad18c25caf920c709d6da5ecfc2e Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Thu, 4 Jun 2020 20:47:03 +0300
Subject: [PATCH 43/89] Added the latest setup used for benchmarks

---
 CMakeLists.txt                     |  6 +-
 config/astaroth.conf               | 14 ++---
 samples/benchmark/main.cc          | 65 ++++++++++++++++++++-
 samples/genbenchmarkscripts/main.c | 10 +++-
 src/core/device.cc                 | 93 ++++++++++++++++++++++++++++--
 src/core/kernels/integration.cuh   | 21 ++++---
 6 files changed, 182 insertions(+), 27 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 45c3a2b..04100bc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,7 +1,7 @@
 ## CMake settings
 # V3.9 required for first-class CUDA support
 # V3.17 required for the FindCUDAToolkit package
-cmake_minimum_required(VERSION 3.17) 
+cmake_minimum_required(VERSION 3.17)
 find_program(CMAKE_C_COMPILER NAMES $ENV{CC} gcc PATHS ENV PATH NO_DEFAULT_PATH)
 find_program(CMAKE_CXX_COMPILER NAMES $ENV{CXX} g++ PATHS ENV PATH NO_DEFAULT_PATH)
 
@@ -10,7 +10,7 @@ project(astaroth C CXX CUDA)
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR})
 
 ## Project-wide compilation flags
-set(COMMON_FLAGS "-mavx -Wall -Wextra -Werror -Wdouble-promotion -Wfloat-conversion -Wshadow")
+set(COMMON_FLAGS "-mavx -Wall -Wextra -Wdouble-promotion -Wfloat-conversion -Wshadow")
 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${COMMON_FLAGS}")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COMMON_FLAGS}")
 set(CMAKE_C_STANDARD 11)
@@ -19,7 +19,7 @@ set(CMAKE_CXX_STANDARD 11)
 find_package(CUDA) # Still required for various macros, such as cuda_select_nvcc_...
 cuda_select_nvcc_arch_flags(ARCHLIST Common) # Common architectures depend on the available CUDA version. Listed here: https://github.com/Kitware/CMake/blob/master/Modules/FindCUDA/select_compute_arch.cmake
 string(REPLACE ";" " " CUDA_ARCH_FLAGS "${ARCHLIST}")
-set(COMMON_FLAGS_CUDA "-mavx,-Wall,-Wextra,-Werror,-Wdouble-promotion,-Wfloat-conversion,-Wshadow")
+set(COMMON_FLAGS_CUDA "-mavx,-Wall,-Wextra,-Wdouble-promotion,-Wfloat-conversion,-Wshadow")
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${CUDA_ARCH_FLAGS} -ccbin=${CMAKE_CXX_COMPILER} --compiler-options=${COMMON_FLAGS_CUDA}")
 
 
diff --git a/config/astaroth.conf b/config/astaroth.conf
index ccefc45..83e93d9 100644
--- a/config/astaroth.conf
+++ b/config/astaroth.conf
@@ -5,9 +5,9 @@
  * "Compile-time" params
  * =============================================================================
  */
-AC_nx = 512
-AC_ny = 512
-AC_nz = 512
+AC_nx = 256
+AC_ny = 256
+AC_nz = 256
 
 AC_dsx = 0.04908738521
 AC_dsy = 0.04908738521
@@ -24,11 +24,11 @@ AC_bin_steps = 1000
 AC_bin_save_t = 1e666
 
 // Set to 0 if you want to run the simulation from the beginning, or just a new
-// simulation. If continuing from a saved step, specify the step number here.  
-AC_start_step = 0 
+// simulation. If continuing from a saved step, specify the step number here.
+AC_start_step = 0
 
 // Maximum time in code units. If negative, there is no time limit
-AC_max_time = -1.0 
+AC_max_time = -1.0
 
 // Hydro
 AC_cdt = 0.4
@@ -49,7 +49,7 @@ AC_forcing_magnitude = 1e-5
 AC_kmin              = 0.8
 AC_kmax              = 1.2
 // Switches forcing off and accretion on
-AC_switch_accretion  = 0 
+AC_switch_accretion  = 0
 
 // Entropy
 AC_cp_sound = 1.0
diff --git a/samples/benchmark/main.cc b/samples/benchmark/main.cc
index 5ab4349..dd14129 100644
--- a/samples/benchmark/main.cc
+++ b/samples/benchmark/main.cc
@@ -39,8 +39,46 @@ typedef enum {
     NUM_TESTS,
 } TestType;
 
+#include <stdint.h>
+
+typedef struct {
+    uint64_t x, y, z;
+} uint3_64;
+
+static uint3_64
+operator+(const uint3_64& a, const uint3_64& b)
+{
+    return (uint3_64){a.x + b.x, a.y + b.y, a.z + b.z};
+}
+
+static uint3_64
+morton3D(const uint64_t pid)
+{
+    uint64_t i, j, k;
+    i = j = k = 0;
+    for (int bit = 0; bit <= 21; ++bit) {
+        const uint64_t mask = 0x1l << 3 * bit;
+        i |= ((pid & (mask << 0)) >> 2 * bit) >> 0;
+        j |= ((pid & (mask << 1)) >> 2 * bit) >> 1;
+        k |= ((pid & (mask << 2)) >> 2 * bit) >> 2;
+    }
+
+    return (uint3_64){i, j, k};
+}
+
+static uint3_64
+decompose(const uint64_t target)
+{
+    // This is just so beautifully elegant. Complex and efficient decomposition
+    // in just one line of code.
+    uint3_64 p = morton3D(target - 1) + (uint3_64){1, 1, 1};
+
+    ERRCHK_ALWAYS(p.x * p.y * p.z == target);
+    return p;
+}
+
 int
-main(void)
+main(int argc, char** argv)
 {
     MPI_Init(NULL, NULL);
     int nprocs, pid;
@@ -51,9 +89,30 @@ main(void)
     AcMeshInfo info;
     acLoadConfig(AC_DEFAULT_CONFIG, &info);
 
+    if (argc > 1) {
+        if (argc == 4) {
+            const int nx           = atoi(argv[1]);
+            const int ny           = atoi(argv[2]);
+            const int nz           = atoi(argv[3]);
+            info.int_params[AC_nx] = nx;
+            info.int_params[AC_ny] = ny;
+            info.int_params[AC_nz] = nz;
+            acUpdateBuiltinParams(&info);
+            printf("Updated mesh dimensions to (%d, %d, %d)\n", nx, ny, nz);
+        }
+        else {
+            fprintf(stderr, "Could not parse arguments. Usage: ./benchmark <nx> <ny> <nz>.\n");
+            exit(EXIT_FAILURE);
+        }
+    }
+
     const TestType test = TEST_STRONG_SCALING;
-    if (test == TEST_WEAK_SCALING)
-        info.int_params[AC_nz] *= nprocs;
+    if (test == TEST_WEAK_SCALING) {
+        uint3_64 decomp = decompose(nprocs);
+        info.int_params[AC_nx] *= decomp.x;
+        info.int_params[AC_ny] *= decomp.y;
+        info.int_params[AC_nz] *= decomp.z;
+    }
 
     /*
     AcMesh model, candidate;
diff --git a/samples/genbenchmarkscripts/main.c b/samples/genbenchmarkscripts/main.c
index 8d35ae9..6f160b3 100644
--- a/samples/genbenchmarkscripts/main.c
+++ b/samples/genbenchmarkscripts/main.c
@@ -36,8 +36,14 @@ main(void)
 
         // Profile and run
         fprintf(fp, "mkdir -p profile_%d\n", nprocs);
-        fprintf(fp, "srun nvprof --annotate-mpi openmpi -o profile_%d/%%p.nvprof ./benchmark\n",
-                nprocs);
+
+        const int nx = 1792;
+        const int ny = nx;
+        const int nz = nx;
+        fprintf(fp,
+                "srun nvprof --annotate-mpi openmpi -o profile_%d/%%p.nvprof ./benchmark %d %d "
+                "%d\n",
+                nprocs, nx, ny, nz);
 
         fclose(fp);
     }
diff --git a/src/core/device.cc b/src/core/device.cc
index f473fc2..1e070cb 100644
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -518,6 +518,78 @@ mod(const int a, const int b)
     return r < 0 ? r + b : r;
 }
 
+#define DECOMPOSITION_AXES (3)
+
+static uint3_64
+morton3D(const uint64_t pid)
+{
+    uint64_t i, j, k;
+    i = j = k = 0;
+
+    if (DECOMPOSITION_AXES == 3) {
+        for (int bit = 0; bit <= 21; ++bit) {
+            const uint64_t mask = 0x1l << 3 * bit;
+            i |= ((pid & (mask << 0)) >> 2 * bit) >> 0;
+            j |= ((pid & (mask << 1)) >> 2 * bit) >> 1;
+            k |= ((pid & (mask << 2)) >> 2 * bit) >> 2;
+        }
+    }
+    // Just a quick copy/paste for other decomp dims
+    else if (DECOMPOSITION_AXES == 2) {
+        for (int bit = 0; bit <= 21; ++bit) {
+            const uint64_t mask = 0x1l << 2 * bit;
+            i |= ((pid & (mask << 0)) >> 1 * bit) >> 0;
+            j |= ((pid & (mask << 1)) >> 1 * bit) >> 1;
+        }
+    }
+    else if (DECOMPOSITION_AXES == 1) {
+        for (int bit = 0; bit <= 21; ++bit) {
+            const uint64_t mask = 0x1l << 1 * bit;
+            i |= ((pid & (mask << 0)) >> 0 * bit) >> 0;
+        }
+    }
+    else {
+        fprintf(stderr, "Invalid DECOMPOSITION_AXES\n");
+        ERRCHK_ALWAYS(0);
+    }
+
+    return (uint3_64){i, j, k};
+}
+
+static uint64_t
+morton1D(const uint3_64 pid)
+{
+    uint64_t i = 0;
+
+    if (DECOMPOSITION_AXES == 3) {
+        for (int bit = 0; bit <= 21; ++bit) {
+            const uint64_t mask = 0x1l << bit;
+            i |= ((pid.x & mask) << 0) << 2 * bit;
+            i |= ((pid.y & mask) << 1) << 2 * bit;
+            i |= ((pid.z & mask) << 2) << 2 * bit;
+        }
+    }
+    else if (DECOMPOSITION_AXES == 2) {
+        for (int bit = 0; bit <= 21; ++bit) {
+            const uint64_t mask = 0x1l << bit;
+            i |= ((pid.x & mask) << 0) << 1 * bit;
+            i |= ((pid.y & mask) << 1) << 1 * bit;
+        }
+    }
+    else if (DECOMPOSITION_AXES == 1) {
+        for (int bit = 0; bit <= 21; ++bit) {
+            const uint64_t mask = 0x1l << bit;
+            i |= ((pid.x & mask) << 0) << 0 * bit;
+        }
+    }
+    else {
+        fprintf(stderr, "Invalid DECOMPOSITION_AXES\n");
+        ERRCHK_ALWAYS(0);
+    }
+
+    return i;
+}
+/*
 static uint3_64
 morton3D(const uint64_t pid)
 {
@@ -545,6 +617,7 @@ morton1D(const uint3_64 pid)
     }
     return i;
 }
+*/
 
 static uint3_64
 decompose(const uint64_t target)
@@ -1277,15 +1350,18 @@ acGridStoreMesh(const Stream stream, AcMesh* host_mesh)
     return AC_SUCCESS;
 }
 
+#define MPI_COMPUTE_ENABLED (1)
+#define MPI_COMM_ENABLED (1)
+
 AcResult
 acGridIntegrate(const Stream stream, const AcReal dt)
 {
     ERRCHK(grid.initialized);
     // acGridSynchronizeStream(stream);
 
-    const Device device  = grid.device;
-    const int3 nn        = grid.nn;
-    //CommData corner_data = grid.corner_data; // Do not rm: required for corners
+    const Device device = grid.device;
+    const int3 nn       = grid.nn;
+    // CommData corner_data = grid.corner_data; // Do not rm: required for corners
     CommData edgex_data  = grid.edgex_data;
     CommData edgey_data  = grid.edgey_data;
     CommData edgez_data  = grid.edgez_data;
@@ -1357,6 +1433,8 @@ acGridIntegrate(const Stream stream, const AcReal dt)
     };
 
     for (int isubstep = 0; isubstep < 3; ++isubstep) {
+
+#if MPI_COMM_ENABLED
         // acPackCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners
         acPackCommData(device, edgex_b0s, &edgex_data);
         acPackCommData(device, edgey_b0s, &edgey_data);
@@ -1364,15 +1442,19 @@ acGridIntegrate(const Stream stream, const AcReal dt)
         acPackCommData(device, sidexy_b0s, &sidexy_data);
         acPackCommData(device, sidexz_b0s, &sidexz_data);
         acPackCommData(device, sideyz_b0s, &sideyz_data);
+#endif
 
+#if MPI_COMPUTE_ENABLED
         //////////// INNER INTEGRATION //////////////
         {
             const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
             const int3 m2 = nn;
             acDeviceIntegrateSubstep(device, STREAM_16, isubstep, m1, m2, dt);
         }
-        ////////////////////////////////////////////
+////////////////////////////////////////////
+#endif // MPI_COMPUTE_ENABLED
 
+#if MPI_COMM_ENABLED
         MPI_Barrier(MPI_COMM_WORLD);
 
 #if MPI_GPUDIRECT_DISABLED
@@ -1436,6 +1518,8 @@ acGridIntegrate(const Stream stream, const AcReal dt)
         acSyncCommData(sidexy_data);
         acSyncCommData(sidexz_data);
         acSyncCommData(sideyz_data);
+#endif // MPI_COMM_ENABLED
+#if MPI_COMPUTE_ENABLED
         { // Front
             const int3 m1 = (int3){NGHOST, NGHOST, NGHOST};
             const int3 m2 = m1 + (int3){nn.x, nn.y, NGHOST};
@@ -1466,6 +1550,7 @@ acGridIntegrate(const Stream stream, const AcReal dt)
             const int3 m2 = m1 + (int3){NGHOST, nn.y - 2 * NGHOST, nn.z - 2 * NGHOST};
             acDeviceIntegrateSubstep(device, STREAM_5, isubstep, m1, m2, dt);
         }
+#endif // MPI_COMPUTE_ENABLED
         acDeviceSwapBuffers(device);
         acDeviceSynchronizeStream(device, STREAM_ALL); // Wait until inner and outer done
         ////////////////////////////////////////////
diff --git a/src/core/kernels/integration.cuh b/src/core/kernels/integration.cuh
index 8d66fd2..97326ad 100644
--- a/src/core/kernels/integration.cuh
+++ b/src/core/kernels/integration.cuh
@@ -41,10 +41,12 @@ static __device__ __forceinline__ AcReal3
 rk3_integrate(const AcReal3 state_previous, const AcReal3 state_current,
               const AcReal3 rate_of_change, const AcReal dt)
 {
-    return (AcReal3){
-        rk3_integrate<step_number>(state_previous.x, state_current.x, rate_of_change.x, dt),
-        rk3_integrate<step_number>(state_previous.y, state_current.y, rate_of_change.y, dt),
-        rk3_integrate<step_number>(state_previous.z, state_current.z, rate_of_change.z, dt)};
+    return (AcReal3){rk3_integrate<step_number>(state_previous.x, state_current.x, rate_of_change.x,
+                                                dt),
+                     rk3_integrate<step_number>(state_previous.y, state_current.y, rate_of_change.y,
+                                                dt),
+                     rk3_integrate<step_number>(state_previous.z, state_current.z, rate_of_change.z,
+                                                dt)};
 }
 
 #define rk3(state_previous, state_current, rate_of_change, dt)                                     \
@@ -132,7 +134,7 @@ acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferAr
     // RK3
     dim3 best_dims(0, 0, 0);
     float best_time          = INFINITY;
-    const int num_iterations = 10;
+    const int num_iterations = 5;
 
     for (int z = 1; z <= MAX_THREADS_PER_BLOCK; ++z) {
         for (int y = 1; y <= MAX_THREADS_PER_BLOCK; ++y) {
@@ -192,9 +194,9 @@ acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferAr
         }
     }
 #if VERBOSE_PRINTING
-    printf(
-        "Auto-optimization done. The best threadblock dimensions for rkStep: (%d, %d, %d) %f ms\n",
-        best_dims.x, best_dims.y, best_dims.z, double(best_time) / num_iterations);
+    printf("Auto-optimization done. The best threadblock dimensions for rkStep: (%d, %d, %d) %f "
+           "ms\n",
+           best_dims.x, best_dims.y, best_dims.z, double(best_time) / num_iterations);
 #endif
     /*
     FILE* fp = fopen("../config/rk3_tbdims.cuh", "w");
@@ -204,6 +206,9 @@ acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferAr
     */
 
     rk3_tpb = best_dims;
+
+    // Failed to find valid thread block dimensions
+    ERRCHK_ALWAYS(rk3_tpb.x * rk3_tpb.y * rk3_tpb.z > 0);
     return AC_SUCCESS;
 }
 

From 666f01a23da7f10ae8131dd48da14400b6f5f131 Mon Sep 17 00:00:00 2001
From: Oskar Lappi <lappiosk@puhti-login1.bullx>
Date: Fri, 5 Jun 2020 19:48:40 +0300
Subject: [PATCH 44/89] Benchmarking program for scalar mpi reductions, and
 nonbatch script for running benchmarks     - New program mpi_reduce_bench    
   - runs testcases defined in source       - writes all benchmark results to
 a csv file, tags the testcase and benchmark run       - takes optional
 argument for benchmark tag, default benchmark tag is a timestamp     - New
 script mpibench.sh       - runs the mpi_reduce_bench with defined parameters:
         - number of tasks         - number of nodes         - the benchmark
 tag for mpi_reduce_bench, default tag is the current git HEAD short hash

---
 CMakeLists.txt                          |   5 +
 samples/mpi_reduce_bench/CMakeLists.txt |   3 +
 samples/mpi_reduce_bench/main.cc        | 135 ++++++++++++++++++++++++
 samples/mpi_reduce_bench/mpibench.sh    |  63 +++++++++++
 4 files changed, 206 insertions(+)
 create mode 100644 samples/mpi_reduce_bench/CMakeLists.txt
 create mode 100644 samples/mpi_reduce_bench/main.cc
 create mode 100755 samples/mpi_reduce_bench/mpibench.sh

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a5a514b..e1e762f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -96,6 +96,11 @@ if (BUILD_SAMPLES)
     add_subdirectory(samples/cpptest)
     add_subdirectory(samples/mpitest)
     add_subdirectory(samples/benchmark)
+    add_subdirectory(samples/mpi_reduce_bench)
+    add_custom_target(copy-mpibench-script ALL
+        COMMAND ${CMAKE_COMMAND} -E copy
+        ${CMAKE_SOURCE_DIR}/samples/mpi_reduce_bench/mpibench.sh
+        ${CMAKE_CURRENT_BINARY_DIR}/mpibench.sh)
     add_subdirectory(samples/bwtest)
     add_subdirectory(samples/genbenchmarkscripts)
 endif()
diff --git a/samples/mpi_reduce_bench/CMakeLists.txt b/samples/mpi_reduce_bench/CMakeLists.txt
new file mode 100644
index 0000000..b04f80a
--- /dev/null
+++ b/samples/mpi_reduce_bench/CMakeLists.txt
@@ -0,0 +1,3 @@
+## benchmark
+add_executable(mpi_reduce_bench main.cc)
+target_link_libraries(mpi_reduce_bench astaroth_core astaroth_utils)
diff --git a/samples/mpi_reduce_bench/main.cc b/samples/mpi_reduce_bench/main.cc
new file mode 100644
index 0000000..7c279d3
--- /dev/null
+++ b/samples/mpi_reduce_bench/main.cc
@@ -0,0 +1,135 @@
+/*
+    Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+/**
+    Running: benchmark -np <num processes> <executable>
+*/
+#include "astaroth.h"
+#include "astaroth_utils.h"
+
+#include "errchk.h"
+#include "timer_hires.h"
+
+#if AC_MPI_ENABLED
+
+#include <mpi.h>
+
+#include <algorithm>
+#include <string.h>
+
+#include <string>
+#include <vector>
+#include <chrono>
+#include <ctime>
+
+int
+main(int argc, char** argv)
+{
+    MPI_Init(NULL, NULL);
+    int nprocs, pid;
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
+
+    // CPU alloc
+    AcMeshInfo info;
+    acLoadConfig(AC_DEFAULT_CONFIG, &info);
+
+    char* benchmark_label;
+
+    if (argc > 1){
+        benchmark_label = argv[1];
+    } else {
+        auto timestamp = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now());
+        benchmark_label = std::ctime(&timestamp);
+        benchmark_label[strcspn(benchmark_label, "\n")] = '\0';
+    }
+    
+    //clang-format off
+    std::vector<AcScalReductionTestCase> scalarReductionTests {
+        acCreateScalReductionTestCase("Scalar MAX"    , VTXBUF_UUX, RTYPE_MAX),
+        acCreateScalReductionTestCase("Scalar MIN"    , VTXBUF_UUX, RTYPE_MIN),
+        acCreateScalReductionTestCase("Scalar RMS"    , VTXBUF_UUX, RTYPE_RMS),
+        acCreateScalReductionTestCase("Scalar RMS_EXP", VTXBUF_UUX, RTYPE_RMS_EXP),
+        acCreateScalReductionTestCase("Scalar SUM"    , VTXBUF_UUX, RTYPE_SUM)
+    };
+
+    std::vector<AcVecReductionTestCase> vectorReductionTests {
+        acCreateVecReductionTestCase("Vector MAX"    , VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, RTYPE_MAX),
+        acCreateVecReductionTestCase("Vector MIN"    , VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, RTYPE_MIN),
+        acCreateVecReductionTestCase("Vector RMS"    , VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, RTYPE_RMS),
+        acCreateVecReductionTestCase("Vector RMS_EXP", VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, RTYPE_RMS_EXP),
+        acCreateVecReductionTestCase("Vector SUM"    , VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, RTYPE_SUM)
+    };
+    //clang-format on
+
+    // GPU alloc & compute
+    acGridInit(info);
+
+    for (auto& testCase : scalarReductionTests) {
+        // Percentiles
+        const size_t num_iters      = 100;
+        const double nth_percentile = 0.90;
+        std::vector<double> results; // ms
+        results.reserve(num_iters);
+
+        // Benchmark
+        Timer t;
+
+        for (size_t i = 0; i < num_iters; ++i) {
+            acGridSynchronizeStream(STREAM_ALL);
+            timer_reset(&t);
+            acGridSynchronizeStream(STREAM_ALL);
+            acGridReduceScal(STREAM_DEFAULT, testCase.rtype, testCase.vtxbuf, &testCase.candidate);
+            acGridSynchronizeStream(STREAM_ALL);
+            results.push_back(timer_diff_nsec(t) / 1e6);
+            acGridSynchronizeStream(STREAM_ALL);
+        }
+
+        if (!pid) {
+            std::sort(results.begin(), results.end(),
+                      [](const double& a, const double& b) { return a < b; });
+            fprintf(stdout,
+                    "Reduction time %g ms (%gth "
+                    "percentile)--------------------------------------\n",
+                    results[nth_percentile * num_iters], 100 * nth_percentile);
+
+            char path[4096] = "mpi_reduction_benchmark.csv";
+
+            FILE* fp = fopen(path, "a");
+            ERRCHK_ALWAYS(fp);
+            
+            // Format
+            // benchmark label, test label, nprocs, measured (ms)
+            fprintf(fp, "\"%s\",\"%s\", %d, %g\n", benchmark_label, testCase.label, nprocs, results[nth_percentile * num_iters]);
+            fclose(fp);
+        }
+    }
+    acGridQuit();
+    MPI_Finalize();
+    return EXIT_SUCCESS;
+}
+
+#else
+int
+main(void)
+{
+    printf("The library was built without MPI support, cannot run mpitest. Rebuild Astaroth with "
+           "cmake -DMPI_ENABLED=ON .. to enable.\n");
+    return EXIT_FAILURE;
+}
+#endif // AC_MPI_ENABLES
diff --git a/samples/mpi_reduce_bench/mpibench.sh b/samples/mpi_reduce_bench/mpibench.sh
new file mode 100755
index 0000000..6517bf7
--- /dev/null
+++ b/samples/mpi_reduce_bench/mpibench.sh
@@ -0,0 +1,63 @@
+#!/bin/bash
+
+#defaults
+default_num_procs=8
+default_num_nodes=2
+
+num_procs=$default_num_procs
+num_nodes=$default_num_nodes
+
+script_name=$0
+
+print_usage(){
+    echo "Usage: $script_name [Options]"
+    echo "\tRuns mpi_reduce_bench, which will write benchmark results"
+    echo "Options:"
+    echo "\t -n <num_procs>"
+    echo "\t\t-n option to slurm, default=$default_num_procs"
+    echo "\t -N <num_nodes>"
+    echo "\t\t-N option to slurm, default=$default_num_nodes"
+    echo "\t -t <tag>"
+    echo "\t\tA benchmark tag that will be added to the mpi_reduction_benchmark.csv file"
+    echo "\t\tBy default the current git HEAD short hash will be used as a tag"
+}
+
+while getopts n:N:t: opt
+do
+    case "$opt" in
+        n)
+            if [ $OPTARG ]
+            then
+                num_procs=$OPTARG
+            else
+                print_usage
+                exit 1
+            fi
+        ;;
+        N)
+            if [ $OPTARG ]
+            then
+                num_nodes=$OPTARG
+            else
+                print_usage
+                exit 1
+            fi
+        ;;
+        t)
+            if [ $OPTARG ]
+            then
+                benchmark_label=$OPTARG
+            else
+                print_usage
+                exit 1
+            fi
+        ;;
+    esac
+done
+
+if [ -z "$benchmark_label" ]
+then
+    benchmark_label=$(git rev-parse --short HEAD)
+fi
+set -x
+srun --account=project_2000403 --gres=gpu:v100:4 --mem=48000 -t 00:14:59 -p gpu -n ${num_procs} -N ${num_nodes} ./mpi_reduce_bench ${benchmark_label}

From eb05e0279375b6d9fc79898f0c420d1c7f285e3f Mon Sep 17 00:00:00 2001
From: Oskar Lappi <lappiosk@puhti-login1.bullx>
Date: Sat, 6 Jun 2020 19:22:05 +0300
Subject: [PATCH 45/89] Added vector reductions to mpi reduction benchmarks

---
 samples/mpi_reduce_bench/main.cc | 48 +++++++++++++++++++++++++++-----
 1 file changed, 41 insertions(+), 7 deletions(-)

diff --git a/samples/mpi_reduce_bench/main.cc b/samples/mpi_reduce_bench/main.cc
index 7c279d3..4b4298b 100644
--- a/samples/mpi_reduce_bench/main.cc
+++ b/samples/mpi_reduce_bench/main.cc
@@ -80,16 +80,15 @@ main(int argc, char** argv)
     // GPU alloc & compute
     acGridInit(info);
 
+    const size_t num_iters      = 100;
+    const double nth_percentile = 0.90;
+    std::vector<double> results; // ms
+    Timer t;
+    // Scalar benchmarks
     for (auto& testCase : scalarReductionTests) {
-        // Percentiles
-        const size_t num_iters      = 100;
-        const double nth_percentile = 0.90;
-        std::vector<double> results; // ms
+        results.clear();
         results.reserve(num_iters);
 
-        // Benchmark
-        Timer t;
-
         for (size_t i = 0; i < num_iters; ++i) {
             acGridSynchronizeStream(STREAM_ALL);
             timer_reset(&t);
@@ -119,6 +118,41 @@ main(int argc, char** argv)
             fclose(fp);
         }
     }
+
+    // Vector benchmarks
+    for (auto& testCase : vectorReductionTests) {
+        results.clear();
+        results.reserve(num_iters);
+
+        for (size_t i = 0; i < num_iters; ++i) {
+            acGridSynchronizeStream(STREAM_ALL);
+            timer_reset(&t);
+            acGridSynchronizeStream(STREAM_ALL);
+            acGridReduceVec(STREAM_DEFAULT, testCase.rtype, testCase.a, testCase.b, testCase.c, &testCase.candidate);
+            acGridSynchronizeStream(STREAM_ALL);
+            results.push_back(timer_diff_nsec(t) / 1e6);
+            acGridSynchronizeStream(STREAM_ALL);
+        }
+
+        if (!pid) {
+            std::sort(results.begin(), results.end(),
+                      [](const double& a, const double& b) { return a < b; });
+            fprintf(stdout,
+                    "Reduction time %g ms (%gth "
+                    "percentile)--------------------------------------\n",
+                    results[nth_percentile * num_iters], 100 * nth_percentile);
+
+            char path[4096] = "mpi_reduction_benchmark.csv";
+
+            FILE* fp = fopen(path, "a");
+            ERRCHK_ALWAYS(fp);
+            
+            // Format
+            // benchmark label, test label, nprocs, measured (ms)
+            fprintf(fp, "\"%s\",\"%s\", %d, %g\n", benchmark_label, testCase.label, nprocs, results[nth_percentile * num_iters]);
+            fclose(fp);
+        }
+    }
     acGridQuit();
     MPI_Finalize();
     return EXIT_SUCCESS;

From 53b48bb8ce3a5cf3e3d2ccd95bf7ef07f17db7d9 Mon Sep 17 00:00:00 2001
From: Oskar Lappi <lappiosk@puhti-login1.bullx>
Date: Sat, 6 Jun 2020 22:53:08 +0300
Subject: [PATCH 46/89] MPI_Allreduce -> MPI_Reduce for MPI reductions +
 benchmark batch script

Slightly ugly because this changes the benchmark behaviour slightly
However we now have a way to run batch benchmarks from one script, no need to generate new ones
---
 samples/mpi_reduce_bench/mpibench.sh | 77 +++++++++++++++++-----------
 src/core/device.cc                   | 15 +++---
 2 files changed, 53 insertions(+), 39 deletions(-)

diff --git a/samples/mpi_reduce_bench/mpibench.sh b/samples/mpi_reduce_bench/mpibench.sh
index 6517bf7..4e9eae3 100755
--- a/samples/mpi_reduce_bench/mpibench.sh
+++ b/samples/mpi_reduce_bench/mpibench.sh
@@ -11,47 +11,45 @@ script_name=$0
 
 print_usage(){
     echo "Usage: $script_name [Options]"
-    echo "\tRuns mpi_reduce_bench, which will write benchmark results"
+    echo "      Runs ./mpi_reduce_bench, which will write benchmark results to a csv file"
+    echo "      Remember to run this script from your build directory"
+    echo "      The benchmarks are submitted with sbatch, unless the -i option is passed"
     echo "Options:"
-    echo "\t -n <num_procs>"
-    echo "\t\t-n option to slurm, default=$default_num_procs"
-    echo "\t -N <num_nodes>"
-    echo "\t\t-N option to slurm, default=$default_num_nodes"
-    echo "\t -t <tag>"
-    echo "\t\tA benchmark tag that will be added to the mpi_reduction_benchmark.csv file"
-    echo "\t\tBy default the current git HEAD short hash will be used as a tag"
+    echo "      -n <num_procs>"
+    echo "              number of tasks for slurm, default=$default_num_procs"
+    echo "      -N <num_nodes>"
+    echo "              number of nodes for slurm, default=$default_num_nodes"
+    echo "      -t <tag>"
+    echo "              A benchmark tag that will be added to the mpi_reduction_benchmark.csv file"
+    echo "              By default the current git HEAD short hash will be used as a tag"
+    echo "      -i"
+    echo "              Run the benchmark interactively with srun instead of sbatch"
+    echo "      -h"
+    echo "              Print this message"
 }
 
-while getopts n:N:t: opt
+while getopts :n:N:t:ih opt
 do
     case "$opt" in
         n)
-            if [ $OPTARG ]
-            then
-                num_procs=$OPTARG
-            else
-                print_usage
-                exit 1
-            fi
+            num_procs=$OPTARG
         ;;
         N)
-            if [ $OPTARG ]
-            then
-                num_nodes=$OPTARG
-            else
-                print_usage
-                exit 1
-            fi
+            num_nodes=$OPTARG
         ;;
         t)
-            if [ $OPTARG ]
-            then
-                benchmark_label=$OPTARG
-            else
-                print_usage
-                exit 1
-            fi
+            benchmark_label=$OPTARG
         ;;
+        i)
+            interactively=1
+        ;;
+        h)
+            print_usage
+            exit 0
+        ;;
+        ?)
+            print_usage
+            exit 1
     esac
 done
 
@@ -60,4 +58,21 @@ then
     benchmark_label=$(git rev-parse --short HEAD)
 fi
 set -x
-srun --account=project_2000403 --gres=gpu:v100:4 --mem=48000 -t 00:14:59 -p gpu -n ${num_procs} -N ${num_nodes} ./mpi_reduce_bench ${benchmark_label}
+
+if [ -z "$interactively"]
+then
+sbatch <<EOF
+#!/bin/sh
+#BATCH --job-name=astaroth
+#SBATCH --account=project_2000403
+#SBATCH --time=00:14:59
+#SBATCH --mem=48000
+#SBATCH --partition=gpu
+#SBATCH --gres=gpu:v100:4
+#SBATCH -n ${num_procs}
+#SBATCH -N ${num_nodes}
+srun ./mpi_reduce_bench ${benchmark_label}
+EOF
+else
+    srun --account=project_2000403 --gres=gpu:v100:4 --mem=48000 -t 00:14:59 -p gpu -n ${num_procs} -N ${num_nodes} ./mpi_reduce_bench ${benchmark_label}
+fi
diff --git a/src/core/device.cc b/src/core/device.cc
index a3ead26..abe2d1c 100644
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -1645,23 +1645,22 @@ acMPIReduceScal(const AcReal local_result, const ReductionType rtype, AcReal* re
     MPI_Datatype datatype = MPI_FLOAT;
 #endif
 
-    /*
     int rank;
     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-    */
 
     int world_size;
     MPI_Comm_size(MPI_COMM_WORLD, &world_size);
 
     AcReal mpi_res;
-    MPI_Allreduce(&local_result, &mpi_res, 1, datatype, op, MPI_COMM_WORLD);
-
-    if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
-        const AcReal inv_n = AcReal(1.) / (grid.nn.x * grid.decomposition.x * grid.nn.y *
+    MPI_Reduce(&local_result, &mpi_res, 1, datatype, op, 0, MPI_COMM_WORLD);
+    if (rank == 0){
+        if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
+            const AcReal inv_n = AcReal(1.) / (grid.nn.x * grid.decomposition.x * grid.nn.y *
                                            grid.decomposition.y * grid.nn.z * grid.decomposition.z);
-        mpi_res            = sqrt(inv_n * mpi_res);
+            mpi_res            = sqrt(inv_n * mpi_res);
+        }
+        *result = mpi_res;
     }
-    *result = mpi_res;
     return AC_SUCCESS;
 }
 

From cd49db68d78a7ad0cdafb4a0828c48d7e57dfe77 Mon Sep 17 00:00:00 2001
From: Oskar Lappi <lappiosk@puhti-login1.bullx>
Date: Sun, 7 Jun 2020 15:50:49 +0300
Subject: [PATCH 47/89] No barrier benchmark

---
 src/core/device.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/core/device.cc b/src/core/device.cc
index abe2d1c..4f86dda 100644
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -1673,7 +1673,7 @@ acGridReduceScal(const Stream stream, const ReductionType rtype,
     const Device device = grid.device;
 
     acGridSynchronizeStream(STREAM_ALL);
-    MPI_Barrier(MPI_COMM_WORLD);
+    //MPI_Barrier(MPI_COMM_WORLD);
 
     AcReal local_result;
     acDeviceReduceScal(device, stream, rtype, vtxbuf_handle, &local_result);
@@ -1690,7 +1690,7 @@ acGridReduceVec(const Stream stream, const ReductionType rtype, const VertexBuff
     const Device device = grid.device;
 
     acGridSynchronizeStream(STREAM_ALL);
-    MPI_Barrier(MPI_COMM_WORLD);
+    //MPI_Barrier(MPI_COMM_WORLD);
 
     AcReal local_result;
     acDeviceReduceVec(device, stream, rtype, vtxbuf0, vtxbuf1, vtxbuf2, &local_result);

From 9840b817d081339cc04272225a7505bad3b4b0c0 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Sun, 7 Jun 2020 21:59:33 +0300
Subject: [PATCH 48/89] Added the (hopefully final) basic test case used for
 the benchmarks

---
 samples/benchmark/main.cc          |  7 ++--
 samples/genbenchmarkscripts/main.c |  9 ++---
 src/core/device.cc                 | 57 +++++++++++++++++++++---------
 src/core/kernels/integration.cuh   |  2 +-
 4 files changed, 50 insertions(+), 25 deletions(-)

diff --git a/samples/benchmark/main.cc b/samples/benchmark/main.cc
index dd14129..962a316 100644
--- a/samples/benchmark/main.cc
+++ b/samples/benchmark/main.cc
@@ -56,11 +56,12 @@ morton3D(const uint64_t pid)
 {
     uint64_t i, j, k;
     i = j = k = 0;
+
     for (int bit = 0; bit <= 21; ++bit) {
         const uint64_t mask = 0x1l << 3 * bit;
-        i |= ((pid & (mask << 0)) >> 2 * bit) >> 0;
+        k |= ((pid & (mask << 0)) >> 2 * bit) >> 0;
         j |= ((pid & (mask << 1)) >> 2 * bit) >> 1;
-        k |= ((pid & (mask << 2)) >> 2 * bit) >> 2;
+        i |= ((pid & (mask << 2)) >> 2 * bit) >> 2;
     }
 
     return (uint3_64){i, j, k};
@@ -174,7 +175,7 @@ main(int argc, char** argv)
     */
 
     // Percentiles
-    const size_t num_iters      = 100;
+    const size_t num_iters      = 1000;
     const double nth_percentile = 0.90;
     std::vector<double> results; // ms
     results.reserve(num_iters);
diff --git a/samples/genbenchmarkscripts/main.c b/samples/genbenchmarkscripts/main.c
index 6f160b3..a45bf1a 100644
--- a/samples/genbenchmarkscripts/main.c
+++ b/samples/genbenchmarkscripts/main.c
@@ -29,6 +29,7 @@ main(void)
         fprintf(fp, "#SBATCH --gres=gpu:v100:%d\n", gpus_per_node);
         fprintf(fp, "#SBATCH -n %d\n", nprocs);
         fprintf(fp, "#SBATCH -N %d\n", nodes);
+        fprintf(fp, "#SBATCH --exclusive\n");
 
         // Modules
         fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake hpcx-mpi/2.5.0-cuda nccl\n");
@@ -37,13 +38,13 @@ main(void)
         // Profile and run
         fprintf(fp, "mkdir -p profile_%d\n", nprocs);
 
-        const int nx = 1792;
+        const int nx = 256; // max size 1792;
         const int ny = nx;
         const int nz = nx;
         fprintf(fp,
-                "srun nvprof --annotate-mpi openmpi -o profile_%d/%%p.nvprof ./benchmark %d %d "
-                "%d\n",
-                nprocs, nx, ny, nz);
+                //"srun nvprof --annotate-mpi openmpi -o profile_%d/%%p.nvprof ./benchmark %d %d "
+                //"%d\n",
+                "srun ./benchmark %d %d %d\n", nx, ny, nz);
 
         fclose(fp);
     }
diff --git a/src/core/device.cc b/src/core/device.cc
index 1e070cb..35af82d 100644
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -527,6 +527,15 @@ morton3D(const uint64_t pid)
     i = j = k = 0;
 
     if (DECOMPOSITION_AXES == 3) {
+        for (int bit = 0; bit <= 21; ++bit) {
+            const uint64_t mask = 0x1l << 3 * bit;
+            k |= ((pid & (mask << 0)) >> 2 * bit) >> 0;
+            j |= ((pid & (mask << 1)) >> 2 * bit) >> 1;
+            i |= ((pid & (mask << 2)) >> 2 * bit) >> 2;
+        }
+    }
+    /*
+    else if (DECOMPOSITION_AXES == 3) {
         for (int bit = 0; bit <= 21; ++bit) {
             const uint64_t mask = 0x1l << 3 * bit;
             i |= ((pid & (mask << 0)) >> 2 * bit) >> 0;
@@ -534,18 +543,19 @@ morton3D(const uint64_t pid)
             k |= ((pid & (mask << 2)) >> 2 * bit) >> 2;
         }
     }
+    */
     // Just a quick copy/paste for other decomp dims
     else if (DECOMPOSITION_AXES == 2) {
         for (int bit = 0; bit <= 21; ++bit) {
             const uint64_t mask = 0x1l << 2 * bit;
-            i |= ((pid & (mask << 0)) >> 1 * bit) >> 0;
-            j |= ((pid & (mask << 1)) >> 1 * bit) >> 1;
+            j |= ((pid & (mask << 0)) >> 1 * bit) >> 0;
+            k |= ((pid & (mask << 1)) >> 1 * bit) >> 1;
         }
     }
     else if (DECOMPOSITION_AXES == 1) {
         for (int bit = 0; bit <= 21; ++bit) {
             const uint64_t mask = 0x1l << 1 * bit;
-            i |= ((pid & (mask << 0)) >> 0 * bit) >> 0;
+            k |= ((pid & (mask << 0)) >> 0 * bit) >> 0;
         }
     }
     else {
@@ -562,24 +572,33 @@ morton1D(const uint3_64 pid)
     uint64_t i = 0;
 
     if (DECOMPOSITION_AXES == 3) {
+        for (int bit = 0; bit <= 21; ++bit) {
+            const uint64_t mask = 0x1l << bit;
+            i |= ((pid.z & mask) << 0) << 2 * bit;
+            i |= ((pid.y & mask) << 1) << 2 * bit;
+            i |= ((pid.x & mask) << 2) << 2 * bit;
+        }
+    }
+    /*
+    else if (DECOMPOSITION_AXES == 3) {
         for (int bit = 0; bit <= 21; ++bit) {
             const uint64_t mask = 0x1l << bit;
             i |= ((pid.x & mask) << 0) << 2 * bit;
             i |= ((pid.y & mask) << 1) << 2 * bit;
             i |= ((pid.z & mask) << 2) << 2 * bit;
         }
-    }
+    }*/
     else if (DECOMPOSITION_AXES == 2) {
         for (int bit = 0; bit <= 21; ++bit) {
             const uint64_t mask = 0x1l << bit;
-            i |= ((pid.x & mask) << 0) << 1 * bit;
-            i |= ((pid.y & mask) << 1) << 1 * bit;
+            i |= ((pid.y & mask) << 0) << 1 * bit;
+            i |= ((pid.z & mask) << 1) << 1 * bit;
         }
     }
     else if (DECOMPOSITION_AXES == 1) {
         for (int bit = 0; bit <= 21; ++bit) {
             const uint64_t mask = 0x1l << bit;
-            i |= ((pid.x & mask) << 0) << 0 * bit;
+            i |= ((pid.z & mask) << 0) << 0 * bit;
         }
     }
     else {
@@ -1204,6 +1223,8 @@ typedef struct {
     CommData sidexy_data;
     CommData sidexz_data;
     CommData sideyz_data;
+
+    // int comm_cart;
 } Grid;
 
 static Grid grid = {};
@@ -1444,16 +1465,6 @@ acGridIntegrate(const Stream stream, const AcReal dt)
         acPackCommData(device, sideyz_b0s, &sideyz_data);
 #endif
 
-#if MPI_COMPUTE_ENABLED
-        //////////// INNER INTEGRATION //////////////
-        {
-            const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
-            const int3 m2 = nn;
-            acDeviceIntegrateSubstep(device, STREAM_16, isubstep, m1, m2, dt);
-        }
-////////////////////////////////////////////
-#endif // MPI_COMPUTE_ENABLED
-
 #if MPI_COMM_ENABLED
         MPI_Barrier(MPI_COMM_WORLD);
 
@@ -1474,7 +1485,19 @@ acGridIntegrate(const Stream stream, const AcReal dt)
         acTransferCommData(device, sidexy_b0s, &sidexy_data);
         acTransferCommData(device, sidexz_b0s, &sidexz_data);
         acTransferCommData(device, sideyz_b0s, &sideyz_data);
+#endif // MPI_COMM_ENABLED
 
+#if MPI_COMPUTE_ENABLED
+        //////////// INNER INTEGRATION //////////////
+        {
+            const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
+            const int3 m2 = nn;
+            acDeviceIntegrateSubstep(device, STREAM_16, isubstep, m1, m2, dt);
+        }
+////////////////////////////////////////////
+#endif // MPI_COMPUTE_ENABLED
+
+#if MPI_COMM_ENABLED
         // acTransferCommDataWait(corner_data); // Do not rm: required for corners
         acTransferCommDataWait(edgex_data);
         acTransferCommDataWait(edgey_data);
diff --git a/src/core/kernels/integration.cuh b/src/core/kernels/integration.cuh
index 97326ad..4c01148 100644
--- a/src/core/kernels/integration.cuh
+++ b/src/core/kernels/integration.cuh
@@ -134,7 +134,7 @@ acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferAr
     // RK3
     dim3 best_dims(0, 0, 0);
     float best_time          = INFINITY;
-    const int num_iterations = 5;
+    const int num_iterations = 10;
 
     for (int z = 1; z <= MAX_THREADS_PER_BLOCK; ++z) {
         for (int y = 1; y <= MAX_THREADS_PER_BLOCK; ++y) {

From c7f23eb50c7a01110e2223f4684ab6f43dbca212 Mon Sep 17 00:00:00 2001
From: Oskar Lappi <lappiosk@puhti-login2.bullx>
Date: Tue, 9 Jun 2020 14:07:37 +0300
Subject: [PATCH 49/89] Added partition argument to mpibench script

---
 samples/mpi_reduce_bench/mpibench.sh | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/samples/mpi_reduce_bench/mpibench.sh b/samples/mpi_reduce_bench/mpibench.sh
index 4e9eae3..3b450e6 100755
--- a/samples/mpi_reduce_bench/mpibench.sh
+++ b/samples/mpi_reduce_bench/mpibench.sh
@@ -3,10 +3,13 @@
 #defaults
 default_num_procs=8
 default_num_nodes=2
+default_partition=gpu
 
 num_procs=$default_num_procs
 num_nodes=$default_num_nodes
 
+partition=$default_partition
+
 script_name=$0
 
 print_usage(){
@@ -19,6 +22,8 @@ print_usage(){
     echo "              number of tasks for slurm, default=$default_num_procs"
     echo "      -N <num_nodes>"
     echo "              number of nodes for slurm, default=$default_num_nodes"
+    echo "      -p <partition>"
+    echo "              which partition to use for slurm, default=$default_partition"
     echo "      -t <tag>"
     echo "              A benchmark tag that will be added to the mpi_reduction_benchmark.csv file"
     echo "              By default the current git HEAD short hash will be used as a tag"
@@ -28,7 +33,7 @@ print_usage(){
     echo "              Print this message"
 }
 
-while getopts :n:N:t:ih opt
+while getopts :n:N:t:p:ih opt
 do
     case "$opt" in
         n)
@@ -43,6 +48,9 @@ do
         i)
             interactively=1
         ;;
+        p)
+            partition=$OPTARG
+        ;;
         h)
             print_usage
             exit 0
@@ -67,12 +75,12 @@ sbatch <<EOF
 #SBATCH --account=project_2000403
 #SBATCH --time=00:14:59
 #SBATCH --mem=48000
-#SBATCH --partition=gpu
+#SBATCH --partition=${partition}
 #SBATCH --gres=gpu:v100:4
 #SBATCH -n ${num_procs}
 #SBATCH -N ${num_nodes}
 srun ./mpi_reduce_bench ${benchmark_label}
 EOF
 else
-    srun --account=project_2000403 --gres=gpu:v100:4 --mem=48000 -t 00:14:59 -p gpu -n ${num_procs} -N ${num_nodes} ./mpi_reduce_bench ${benchmark_label}
+    srun --account=project_2000403 --gres=gpu:v100:4 --mem=48000 -t 00:14:59 -p ${partition} -n ${num_procs} -N ${num_nodes} ./mpi_reduce_bench ${benchmark_label}
 fi

From fa422cf4575cb0a8eace8f0409693e881fc0f709 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Wed, 10 Jun 2020 02:16:23 +0300
Subject: [PATCH 50/89] Added a better-pipelined version of the acGridIntegrate
 and a switch for toggling the transfer of corners

---
 src/core/device.cc | 211 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 211 insertions(+)

diff --git a/src/core/device.cc b/src/core/device.cc
index 35af82d..481b465 100644
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -1373,9 +1373,220 @@ acGridStoreMesh(const Stream stream, AcMesh* host_mesh)
 
 #define MPI_COMPUTE_ENABLED (1)
 #define MPI_COMM_ENABLED (1)
+#define MPI_INCL_CORNERS (0)
 
 AcResult
 acGridIntegrate(const Stream stream, const AcReal dt)
+{
+    ERRCHK(grid.initialized);
+    acGridSynchronizeStream(stream);
+
+    const Device device = grid.device;
+    const int3 nn       = grid.nn;
+#if MPI_INCL_CORNERS
+    CommData corner_data = grid.corner_data; // Do not rm: required for corners
+#endif                                       // MPI_INCL_CORNERS
+    CommData edgex_data  = grid.edgex_data;
+    CommData edgey_data  = grid.edgey_data;
+    CommData edgez_data  = grid.edgez_data;
+    CommData sidexy_data = grid.sidexy_data;
+    CommData sidexz_data = grid.sidexz_data;
+    CommData sideyz_data = grid.sideyz_data;
+
+// Corners
+#if MPI_INCL_CORNERS
+    // Do not rm: required for corners
+    const int3 corner_b0s[] = {
+        (int3){0, 0, 0},
+        (int3){NGHOST + nn.x, 0, 0},
+        (int3){0, NGHOST + nn.y, 0},
+        (int3){0, 0, NGHOST + nn.z},
+
+        (int3){NGHOST + nn.x, NGHOST + nn.y, 0},
+        (int3){NGHOST + nn.x, 0, NGHOST + nn.z},
+        (int3){0, NGHOST + nn.y, NGHOST + nn.z},
+        (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST + nn.z},
+    };
+#endif // MPI_INCL_CORNERS
+
+    // Edges X
+    const int3 edgex_b0s[] = {
+        (int3){NGHOST, 0, 0},
+        (int3){NGHOST, NGHOST + nn.y, 0},
+
+        (int3){NGHOST, 0, NGHOST + nn.z},
+        (int3){NGHOST, NGHOST + nn.y, NGHOST + nn.z},
+    };
+
+    // Edges Y
+    const int3 edgey_b0s[] = {
+        (int3){0, NGHOST, 0},
+        (int3){NGHOST + nn.x, NGHOST, 0},
+
+        (int3){0, NGHOST, NGHOST + nn.z},
+        (int3){NGHOST + nn.x, NGHOST, NGHOST + nn.z},
+    };
+
+    // Edges Z
+    const int3 edgez_b0s[] = {
+        (int3){0, 0, NGHOST},
+        (int3){NGHOST + nn.x, 0, NGHOST},
+
+        (int3){0, NGHOST + nn.y, NGHOST},
+        (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST},
+    };
+
+    // Sides XY
+    const int3 sidexy_b0s[] = {
+        (int3){NGHOST, NGHOST, 0},             //
+        (int3){NGHOST, NGHOST, NGHOST + nn.z}, //
+    };
+
+    // Sides XZ
+    const int3 sidexz_b0s[] = {
+        (int3){NGHOST, 0, NGHOST},             //
+        (int3){NGHOST, NGHOST + nn.y, NGHOST}, //
+    };
+
+    // Sides YZ
+    const int3 sideyz_b0s[] = {
+        (int3){0, NGHOST, NGHOST},             //
+        (int3){NGHOST + nn.x, NGHOST, NGHOST}, //
+    };
+
+    for (int isubstep = 0; isubstep < 3; ++isubstep) {
+        acDeviceSynchronizeStream(device, STREAM_ALL);
+        MPI_Barrier(MPI_COMM_WORLD);
+
+#if MPI_COMPUTE_ENABLED
+        acPackCommData(device, sidexy_b0s, &sidexy_data);
+        acPackCommData(device, sidexz_b0s, &sidexz_data);
+        acPackCommData(device, sideyz_b0s, &sideyz_data);
+#endif // MPI_COMPUTE_ENABLED
+
+#if MPI_COMM_ENABLED
+        acTransferCommData(device, sidexy_b0s, &sidexy_data);
+        acTransferCommData(device, sidexz_b0s, &sidexz_data);
+        acTransferCommData(device, sideyz_b0s, &sideyz_data);
+#endif // MPI_COMM_ENABLED
+
+#if MPI_COMPUTE_ENABLED
+        //////////// INNER INTEGRATION //////////////
+        {
+            const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
+            const int3 m2 = nn;
+            acDeviceIntegrateSubstep(device, STREAM_16, isubstep, m1, m2, dt);
+        }
+
+        acPackCommData(device, edgex_b0s, &edgex_data);
+        acPackCommData(device, edgey_b0s, &edgey_data);
+        acPackCommData(device, edgez_b0s, &edgez_data);
+#endif // MPI_COMPUTE_ENABLED
+
+#if MPI_COMM_ENABLED
+        acTransferCommDataWait(sidexy_data);
+        acUnpinCommData(device, &sidexy_data);
+        acTransferCommDataWait(sidexz_data);
+        acUnpinCommData(device, &sidexz_data);
+        acTransferCommDataWait(sideyz_data);
+        acUnpinCommData(device, &sideyz_data);
+
+        acTransferCommData(device, edgex_b0s, &edgex_data);
+        acTransferCommData(device, edgey_b0s, &edgey_data);
+        acTransferCommData(device, edgez_b0s, &edgez_data);
+#endif // MPI_COMM_ENABLED
+
+#if MPI_COMPUTE_ENABLED
+#if MPI_INCL_CORNERS
+        acPackCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners
+#endif                                                    // MPI_INCL_CORNERS
+        acUnpackCommData(device, sidexy_b0s, &sidexy_data);
+        acUnpackCommData(device, sidexz_b0s, &sidexz_data);
+        acUnpackCommData(device, sideyz_b0s, &sideyz_data);
+#endif // MPI_COMPUTE_ENABLED
+
+#if MPI_COMM_ENABLED
+        acTransferCommDataWait(edgex_data);
+        acUnpinCommData(device, &edgex_data);
+        acTransferCommDataWait(edgey_data);
+        acUnpinCommData(device, &edgey_data);
+        acTransferCommDataWait(edgez_data);
+        acUnpinCommData(device, &edgez_data);
+
+#if MPI_INCL_CORNERS
+        acTransferCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners
+#endif                                                        // MPI_INCL_CORNERS
+#endif                                                        // MPI_COMM_ENABLED
+
+#if MPI_COMPUTE_ENABLED
+        acUnpackCommData(device, edgex_b0s, &edgex_data);
+        acUnpackCommData(device, edgey_b0s, &edgey_data);
+        acUnpackCommData(device, edgez_b0s, &edgez_data);
+#endif // MPI_COMPUTE_ENABLED
+
+#if MPI_COMM_ENABLED
+#if MPI_INCL_CORNERS
+        acTransferCommDataWait(corner_data);   // Do not rm: required for corners
+        acUnpinCommData(device, &corner_data); // Do not rm: required for corners
+#endif                                         // MPI_INCL_CORNERS
+#endif                                         // MPI_COMM_ENABLED
+#if MPI_COMPUTE_ENABLED
+#if MPI_INCL_CORNERS
+        acUnpackCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners
+#endif                                                      // MPI_INCL_CORNERS
+#endif                                                      // MPI_COMPUTE_ENABLED
+
+        // Wait for unpacking
+        acSyncCommData(sidexy_data);
+        acSyncCommData(sidexz_data);
+        acSyncCommData(sideyz_data);
+        acSyncCommData(edgex_data);
+        acSyncCommData(edgey_data);
+        acSyncCommData(edgez_data);
+#if MPI_INCL_CORNERS
+        acSyncCommData(corner_data); // Do not rm: required for corners
+#endif                               // MPI_INCL_CORNERS
+
+#if MPI_COMPUTE_ENABLED
+        { // Front
+            const int3 m1 = (int3){NGHOST, NGHOST, NGHOST};
+            const int3 m2 = m1 + (int3){nn.x, nn.y, NGHOST};
+            acDeviceIntegrateSubstep(device, STREAM_0, isubstep, m1, m2, dt);
+        }
+        { // Back
+            const int3 m1 = (int3){NGHOST, NGHOST, nn.z};
+            const int3 m2 = m1 + (int3){nn.x, nn.y, NGHOST};
+            acDeviceIntegrateSubstep(device, STREAM_1, isubstep, m1, m2, dt);
+        }
+        { // Bottom
+            const int3 m1 = (int3){NGHOST, NGHOST, 2 * NGHOST};
+            const int3 m2 = m1 + (int3){nn.x, NGHOST, nn.z - 2 * NGHOST};
+            acDeviceIntegrateSubstep(device, STREAM_2, isubstep, m1, m2, dt);
+        }
+        { // Top
+            const int3 m1 = (int3){NGHOST, nn.y, 2 * NGHOST};
+            const int3 m2 = m1 + (int3){nn.x, NGHOST, nn.z - 2 * NGHOST};
+            acDeviceIntegrateSubstep(device, STREAM_3, isubstep, m1, m2, dt);
+        }
+        { // Left
+            const int3 m1 = (int3){NGHOST, 2 * NGHOST, 2 * NGHOST};
+            const int3 m2 = m1 + (int3){NGHOST, nn.y - 2 * NGHOST, nn.z - 2 * NGHOST};
+            acDeviceIntegrateSubstep(device, STREAM_4, isubstep, m1, m2, dt);
+        }
+        { // Right
+            const int3 m1 = (int3){nn.x, 2 * NGHOST, 2 * NGHOST};
+            const int3 m2 = m1 + (int3){NGHOST, nn.y - 2 * NGHOST, nn.z - 2 * NGHOST};
+            acDeviceIntegrateSubstep(device, STREAM_5, isubstep, m1, m2, dt);
+        }
+#endif // MPI_COMPUTE_ENABLED
+        acDeviceSwapBuffers(device);
+    }
+
+    return AC_SUCCESS;
+}
+
+AcResult
+acGridIntegrateORIGINAL(const Stream stream, const AcReal dt)
 {
     ERRCHK(grid.initialized);
     // acGridSynchronizeStream(stream);

From 1cdb9e2ce7c3eb675bf466b6bd5e3a6432c956d3 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Wed, 10 Jun 2020 12:32:56 +0300
Subject: [PATCH 51/89] Added missing synchronization to the end of the new
 integration function

---
 src/core/device.cc | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/src/core/device.cc b/src/core/device.cc
index 481b465..f47f7a0 100644
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -12,6 +12,11 @@
 #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0]))
 #define MPI_GPUDIRECT_DISABLED (0)
 
+#define DECOMPOSITION_AXES (3)
+#define MPI_COMPUTE_ENABLED (1)
+#define MPI_COMM_ENABLED (1)
+#define MPI_INCL_CORNERS (0)
+
 AcResult
 acDevicePrintInfo(const Device device)
 {
@@ -518,8 +523,6 @@ mod(const int a, const int b)
     return r < 0 ? r + b : r;
 }
 
-#define DECOMPOSITION_AXES (3)
-
 static uint3_64
 morton3D(const uint64_t pid)
 {
@@ -1371,10 +1374,6 @@ acGridStoreMesh(const Stream stream, AcMesh* host_mesh)
     return AC_SUCCESS;
 }
 
-#define MPI_COMPUTE_ENABLED (1)
-#define MPI_COMM_ENABLED (1)
-#define MPI_INCL_CORNERS (0)
-
 AcResult
 acGridIntegrate(const Stream stream, const AcReal dt)
 {
@@ -1582,6 +1581,9 @@ acGridIntegrate(const Stream stream, const AcReal dt)
         acDeviceSwapBuffers(device);
     }
 
+    // Does not have to be STREAM_ALL, only the streams used with
+    // acDeviceIntegrateSubstep (less likely to break this way though)
+    acDeviceSynchronizeStream(device, STREAM_ALL); // Wait until inner and outer done
     return AC_SUCCESS;
 }
 

From 0030db01f381b0d2f41f2007a79ad5ee29e1cd3c Mon Sep 17 00:00:00 2001
From: Oskar Lappi <lappiosk@puhti-login1.bullx>
Date: Wed, 10 Jun 2020 16:51:35 +0300
Subject: [PATCH 52/89] Automatic calculation of nodes based on processes

---
 samples/mpi_reduce_bench/mpibench.sh | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/samples/mpi_reduce_bench/mpibench.sh b/samples/mpi_reduce_bench/mpibench.sh
index 3b450e6..935b769 100755
--- a/samples/mpi_reduce_bench/mpibench.sh
+++ b/samples/mpi_reduce_bench/mpibench.sh
@@ -20,8 +20,6 @@ print_usage(){
     echo "Options:"
     echo "      -n <num_procs>"
     echo "              number of tasks for slurm, default=$default_num_procs"
-    echo "      -N <num_nodes>"
-    echo "              number of nodes for slurm, default=$default_num_nodes"
     echo "      -p <partition>"
     echo "              which partition to use for slurm, default=$default_partition"
     echo "      -t <tag>"
@@ -33,14 +31,12 @@ print_usage(){
     echo "              Print this message"
 }
 
-while getopts :n:N:t:p:ih opt
+while getopts :n:t:p:ih opt
 do
     case "$opt" in
         n)
             num_procs=$OPTARG
-        ;;
-        N)
-            num_nodes=$OPTARG
+            num_nodes=$(( 1 + ($num_procs - 1)/4))
         ;;
         t)
             benchmark_label=$OPTARG
@@ -67,6 +63,8 @@ then
 fi
 set -x
 
+exit 0
+
 if [ -z "$interactively"]
 then
 sbatch <<EOF

From 0e4b39d6d78210b57bd8b38dd09ec770bde2e1b4 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Thu, 11 Jun 2020 11:28:52 +0300
Subject: [PATCH 53/89] Added a toggle for using pinned memory

---
 src/core/device.cc | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/core/device.cc b/src/core/device.cc
index f47f7a0..01d5e87 100644
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -16,6 +16,7 @@
 #define MPI_COMPUTE_ENABLED (1)
 #define MPI_COMM_ENABLED (1)
 #define MPI_INCL_CORNERS (0)
+#define MPI_USE_PINNED (1)
 
 AcResult
 acDevicePrintInfo(const Device device)
@@ -1165,7 +1166,7 @@ acTransferCommData(const Device device, //
         const int npid = getPid(pid3d + neighbor, decomp);
 
         PackedData* dst = &data->dsts[b0_idx];
-        if (onTheSameNode(pid, npid)) {
+        if (onTheSameNode(pid, npid) || !MPI_USE_PINNED) {
             MPI_Irecv(dst->data, count, datatype, npid, b0_idx, //
                       MPI_COMM_WORLD, &data->recv_reqs[b0_idx]);
             dst->pinned = false;
@@ -1187,7 +1188,7 @@ acTransferCommData(const Device device, //
         const int npid = getPid(pid3d - neighbor, decomp);
 
         PackedData* src = &data->srcs[b0_idx];
-        if (onTheSameNode(pid, npid)) {
+        if (onTheSameNode(pid, npid) || !MPI_USE_PINNED) {
             cudaStreamSynchronize(data->streams[b0_idx]);
             MPI_Isend(src->data, count, datatype, npid, b0_idx, //
                       MPI_COMM_WORLD, &data->send_reqs[b0_idx]);

From f04e347c451c1555ce24b0e24c37cfcf699c6289 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Wed, 24 Jun 2020 15:13:15 +0300
Subject: [PATCH 54/89] Cleanup before merging to the master merge candidate
 branch

---
 samples/bwtest/CMakeLists.txt |   2 +-
 samples/bwtest/main.c         |  28 ++++
 src/core/CMakeLists.txt       |   2 +-
 src/core/device.cc            | 289 +++-------------------------------
 4 files changed, 52 insertions(+), 269 deletions(-)

diff --git a/samples/bwtest/CMakeLists.txt b/samples/bwtest/CMakeLists.txt
index cd4329f..229e7e2 100644
--- a/samples/bwtest/CMakeLists.txt
+++ b/samples/bwtest/CMakeLists.txt
@@ -5,5 +5,5 @@ find_package(OpenMP)
 find_package(CUDAToolkit)
 
 add_executable(bwtest main.c)
-target_link_libraries(bwtest MPI::MPI_C OpenMP::OpenMP_C CUDA::cudart_static)
+target_link_libraries(bwtest MPI::MPI_C OpenMP::OpenMP_C CUDA::cudart_static CUDA::cuda_driver)
 target_compile_options(bwtest PRIVATE -O3)
diff --git a/samples/bwtest/main.c b/samples/bwtest/main.c
index 73f4387..9dd25d9 100644
--- a/samples/bwtest/main.c
+++ b/samples/bwtest/main.c
@@ -7,6 +7,7 @@
 #include <mpi.h>
 
 #include <cuda_runtime_api.h>
+#include <cuda.h> // CUDA driver API
 
 #include "timer_hires.h" // From src/common
 
@@ -56,6 +57,17 @@ allocDevice(const size_t bytes)
 static uint8_t*
 allocDevicePinned(const size_t bytes)
 {
+    #define USE_CUDA_DRIVER_PINNING (1)
+    #if USE_CUDA_DRIVER_PINNING
+    uint8_t* arr = allocDevice(bytes);
+
+    unsigned int flag = 1;
+    CUresult retval = cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, (CUdeviceptr)arr);
+
+    errchk(retval == CUDA_SUCCESS);
+    return arr;
+
+    #else
     uint8_t* arr;
     // Standard (20 GiB/s internode, 85 GiB/s intranode)
     // const cudaError_t retval = cudaMalloc((void**)&arr, bytes);
@@ -65,8 +77,24 @@ allocDevicePinned(const size_t bytes)
     const cudaError_t retval = cudaMallocHost((void**)&arr, bytes);
     errchk(retval == cudaSuccess);
     return arr;
+    #endif
 }
 
+/*
+static uint8_t*
+allocDevicePinned(const size_t bytes)
+{
+    uint8_t* arr;
+    // Standard (20 GiB/s internode, 85 GiB/s intranode)
+    // const cudaError_t retval = cudaMalloc((void**)&arr, bytes);
+    // Unified mem (5 GiB/s internode, 6 GiB/s intranode)
+    // const cudaError_t retval = cudaMallocManaged((void**)&arr, bytes, cudaMemAttachGlobal);
+    // Pinned (40 GiB/s internode, 10 GiB/s intranode)
+    const cudaError_t retval = cudaMallocHost((void**)&arr, bytes);
+    errchk(retval == cudaSuccess);
+    return arr;
+}*/
+
 static void
 freeDevice(uint8_t* arr)
 {
diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index 757cbfe..81bcf14 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -2,7 +2,7 @@ find_package(CUDAToolkit)
 
 ## Astaroth Core
 add_library(astaroth_core STATIC device.cc node.cc astaroth.cc)
-target_link_libraries(astaroth_core astaroth_utils astaroth_kernels CUDA::cudart)
+target_link_libraries(astaroth_core astaroth_utils astaroth_kernels CUDA::cudart CUDA::cuda_driver)
 
 ## Options
 if (MPI_ENABLED)
diff --git a/src/core/device.cc b/src/core/device.cc
index 01d5e87..688ed89 100644
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -10,13 +10,16 @@
 #include "kernels/kernels.h"
 
 #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0]))
-#define MPI_GPUDIRECT_DISABLED (0)
 
-#define DECOMPOSITION_AXES (3)
+#define MPI_GPUDIRECT_DISABLED (0) // Buffer through host memory, deprecated
+#define MPI_DECOMPOSITION_AXES (3)
 #define MPI_COMPUTE_ENABLED (1)
 #define MPI_COMM_ENABLED (1)
 #define MPI_INCL_CORNERS (0)
-#define MPI_USE_PINNED (1)
+#define MPI_USE_PINNED (1) // Do inter-node comm with pinned memory
+#define MPI_USE_CUDA_DRIVER_PINNING (0) // Pin with cuPointerSetAttribute, otherwise cudaMallocHost
+
+#include <cuda.h> // CUDA driver API (needed if MPI_USE_CUDA_DRIVER_PINNING is set)
 
 AcResult
 acDevicePrintInfo(const Device device)
@@ -530,7 +533,7 @@ morton3D(const uint64_t pid)
     uint64_t i, j, k;
     i = j = k = 0;
 
-    if (DECOMPOSITION_AXES == 3) {
+    if (MPI_DECOMPOSITION_AXES == 3) {
         for (int bit = 0; bit <= 21; ++bit) {
             const uint64_t mask = 0x1l << 3 * bit;
             k |= ((pid & (mask << 0)) >> 2 * bit) >> 0;
@@ -538,32 +541,22 @@ morton3D(const uint64_t pid)
             i |= ((pid & (mask << 2)) >> 2 * bit) >> 2;
         }
     }
-    /*
-    else if (DECOMPOSITION_AXES == 3) {
-        for (int bit = 0; bit <= 21; ++bit) {
-            const uint64_t mask = 0x1l << 3 * bit;
-            i |= ((pid & (mask << 0)) >> 2 * bit) >> 0;
-            j |= ((pid & (mask << 1)) >> 2 * bit) >> 1;
-            k |= ((pid & (mask << 2)) >> 2 * bit) >> 2;
-        }
-    }
-    */
     // Just a quick copy/paste for other decomp dims
-    else if (DECOMPOSITION_AXES == 2) {
+    else if (MPI_DECOMPOSITION_AXES == 2) {
         for (int bit = 0; bit <= 21; ++bit) {
             const uint64_t mask = 0x1l << 2 * bit;
             j |= ((pid & (mask << 0)) >> 1 * bit) >> 0;
             k |= ((pid & (mask << 1)) >> 1 * bit) >> 1;
         }
     }
-    else if (DECOMPOSITION_AXES == 1) {
+    else if (MPI_DECOMPOSITION_AXES == 1) {
         for (int bit = 0; bit <= 21; ++bit) {
             const uint64_t mask = 0x1l << 1 * bit;
             k |= ((pid & (mask << 0)) >> 0 * bit) >> 0;
         }
     }
     else {
-        fprintf(stderr, "Invalid DECOMPOSITION_AXES\n");
+        fprintf(stderr, "Invalid MPI_DECOMPOSITION_AXES\n");
         ERRCHK_ALWAYS(0);
     }
 
@@ -575,7 +568,7 @@ morton1D(const uint3_64 pid)
 {
     uint64_t i = 0;
 
-    if (DECOMPOSITION_AXES == 3) {
+    if (MPI_DECOMPOSITION_AXES == 3) {
         for (int bit = 0; bit <= 21; ++bit) {
             const uint64_t mask = 0x1l << bit;
             i |= ((pid.z & mask) << 0) << 2 * bit;
@@ -583,64 +576,26 @@ morton1D(const uint3_64 pid)
             i |= ((pid.x & mask) << 2) << 2 * bit;
         }
     }
-    /*
-    else if (DECOMPOSITION_AXES == 3) {
-        for (int bit = 0; bit <= 21; ++bit) {
-            const uint64_t mask = 0x1l << bit;
-            i |= ((pid.x & mask) << 0) << 2 * bit;
-            i |= ((pid.y & mask) << 1) << 2 * bit;
-            i |= ((pid.z & mask) << 2) << 2 * bit;
-        }
-    }*/
-    else if (DECOMPOSITION_AXES == 2) {
+    else if (MPI_DECOMPOSITION_AXES == 2) {
         for (int bit = 0; bit <= 21; ++bit) {
             const uint64_t mask = 0x1l << bit;
             i |= ((pid.y & mask) << 0) << 1 * bit;
             i |= ((pid.z & mask) << 1) << 1 * bit;
         }
     }
-    else if (DECOMPOSITION_AXES == 1) {
+    else if (MPI_DECOMPOSITION_AXES == 1) {
         for (int bit = 0; bit <= 21; ++bit) {
             const uint64_t mask = 0x1l << bit;
             i |= ((pid.z & mask) << 0) << 0 * bit;
         }
     }
     else {
-        fprintf(stderr, "Invalid DECOMPOSITION_AXES\n");
+        fprintf(stderr, "Invalid MPI_DECOMPOSITION_AXES\n");
         ERRCHK_ALWAYS(0);
     }
 
     return i;
 }
-/*
-static uint3_64
-morton3D(const uint64_t pid)
-{
-    uint64_t i, j, k;
-    i = j = k = 0;
-    for (int bit = 0; bit <= 21; ++bit) {
-        const uint64_t mask = 0x1l << 3 * bit;
-        i |= ((pid & (mask << 0)) >> 2 * bit) >> 0;
-        j |= ((pid & (mask << 1)) >> 2 * bit) >> 1;
-        k |= ((pid & (mask << 2)) >> 2 * bit) >> 2;
-    }
-
-    return (uint3_64){i, j, k};
-}
-
-static uint64_t
-morton1D(const uint3_64 pid)
-{
-    uint64_t i = 0;
-    for (int bit = 0; bit <= 21; ++bit) {
-        const uint64_t mask = 0x1l << bit;
-        i |= ((pid.x & mask) << 0) << 2 * bit;
-        i |= ((pid.y & mask) << 1) << 2 * bit;
-        i |= ((pid.z & mask) << 2) << 2 * bit;
-    }
-    return i;
-}
-*/
 
 static uint3_64
 decompose(const uint64_t target)
@@ -701,9 +656,17 @@ acCreatePackedData(const int3 dims)
     const size_t bytes = dims.x * dims.y * dims.z * sizeof(data.data[0]) * NUM_VTXBUF_HANDLES;
     ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&data.data, bytes));
 
+    #if MPI_USE_CUDA_DRIVER_PINNING
+      ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&data.data_pinned, bytes));
+
+      unsigned int flag = 1;
+      CUresult retval = cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, (CUdeviceptr)data.data_pinned);
+      ERRCHK_ALWAYS(retval == CUDA_SUCCESS);
+    #else
     ERRCHK_CUDA_ALWAYS(cudaMallocHost((void**)&data.data_pinned, bytes));
     // ERRCHK_CUDA_ALWAYS(cudaMallocManaged((void**)&data.data_pinned, bytes)); // Significantly
     // slower than pinned (38 ms vs. 125 ms)
+    #fi // USE_CUDA_DRIVER_PINNING
 
     return data;
 }
@@ -1588,214 +1551,6 @@ acGridIntegrate(const Stream stream, const AcReal dt)
     return AC_SUCCESS;
 }
 
-AcResult
-acGridIntegrateORIGINAL(const Stream stream, const AcReal dt)
-{
-    ERRCHK(grid.initialized);
-    // acGridSynchronizeStream(stream);
-
-    const Device device = grid.device;
-    const int3 nn       = grid.nn;
-    // CommData corner_data = grid.corner_data; // Do not rm: required for corners
-    CommData edgex_data  = grid.edgex_data;
-    CommData edgey_data  = grid.edgey_data;
-    CommData edgez_data  = grid.edgez_data;
-    CommData sidexy_data = grid.sidexy_data;
-    CommData sidexz_data = grid.sidexz_data;
-    CommData sideyz_data = grid.sideyz_data;
-
-    acDeviceSynchronizeStream(device, stream);
-
-    // Corners
-    /*
-    // Do not rm: required for corners
-    const int3 corner_b0s[] = {
-        (int3){0, 0, 0},
-        (int3){NGHOST + nn.x, 0, 0},
-        (int3){0, NGHOST + nn.y, 0},
-        (int3){0, 0, NGHOST + nn.z},
-
-        (int3){NGHOST + nn.x, NGHOST + nn.y, 0},
-        (int3){NGHOST + nn.x, 0, NGHOST + nn.z},
-        (int3){0, NGHOST + nn.y, NGHOST + nn.z},
-        (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST + nn.z},
-    };
-    */
-
-    // Edges X
-    const int3 edgex_b0s[] = {
-        (int3){NGHOST, 0, 0},
-        (int3){NGHOST, NGHOST + nn.y, 0},
-
-        (int3){NGHOST, 0, NGHOST + nn.z},
-        (int3){NGHOST, NGHOST + nn.y, NGHOST + nn.z},
-    };
-
-    // Edges Y
-    const int3 edgey_b0s[] = {
-        (int3){0, NGHOST, 0},
-        (int3){NGHOST + nn.x, NGHOST, 0},
-
-        (int3){0, NGHOST, NGHOST + nn.z},
-        (int3){NGHOST + nn.x, NGHOST, NGHOST + nn.z},
-    };
-
-    // Edges Z
-    const int3 edgez_b0s[] = {
-        (int3){0, 0, NGHOST},
-        (int3){NGHOST + nn.x, 0, NGHOST},
-
-        (int3){0, NGHOST + nn.y, NGHOST},
-        (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST},
-    };
-
-    // Sides XY
-    const int3 sidexy_b0s[] = {
-        (int3){NGHOST, NGHOST, 0},             //
-        (int3){NGHOST, NGHOST, NGHOST + nn.z}, //
-    };
-
-    // Sides XZ
-    const int3 sidexz_b0s[] = {
-        (int3){NGHOST, 0, NGHOST},             //
-        (int3){NGHOST, NGHOST + nn.y, NGHOST}, //
-    };
-
-    // Sides YZ
-    const int3 sideyz_b0s[] = {
-        (int3){0, NGHOST, NGHOST},             //
-        (int3){NGHOST + nn.x, NGHOST, NGHOST}, //
-    };
-
-    for (int isubstep = 0; isubstep < 3; ++isubstep) {
-
-#if MPI_COMM_ENABLED
-        // acPackCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners
-        acPackCommData(device, edgex_b0s, &edgex_data);
-        acPackCommData(device, edgey_b0s, &edgey_data);
-        acPackCommData(device, edgez_b0s, &edgez_data);
-        acPackCommData(device, sidexy_b0s, &sidexy_data);
-        acPackCommData(device, sidexz_b0s, &sidexz_data);
-        acPackCommData(device, sideyz_b0s, &sideyz_data);
-#endif
-
-#if MPI_COMM_ENABLED
-        MPI_Barrier(MPI_COMM_WORLD);
-
-#if MPI_GPUDIRECT_DISABLED
-        // acTransferCommDataToHost(device, &corner_data); // Do not rm: required for corners
-        acTransferCommDataToHost(device, &edgex_data);
-        acTransferCommDataToHost(device, &edgey_data);
-        acTransferCommDataToHost(device, &edgez_data);
-        acTransferCommDataToHost(device, &sidexy_data);
-        acTransferCommDataToHost(device, &sidexz_data);
-        acTransferCommDataToHost(device, &sideyz_data);
-#endif
-
-        // acTransferCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners
-        acTransferCommData(device, edgex_b0s, &edgex_data);
-        acTransferCommData(device, edgey_b0s, &edgey_data);
-        acTransferCommData(device, edgez_b0s, &edgez_data);
-        acTransferCommData(device, sidexy_b0s, &sidexy_data);
-        acTransferCommData(device, sidexz_b0s, &sidexz_data);
-        acTransferCommData(device, sideyz_b0s, &sideyz_data);
-#endif // MPI_COMM_ENABLED
-
-#if MPI_COMPUTE_ENABLED
-        //////////// INNER INTEGRATION //////////////
-        {
-            const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
-            const int3 m2 = nn;
-            acDeviceIntegrateSubstep(device, STREAM_16, isubstep, m1, m2, dt);
-        }
-////////////////////////////////////////////
-#endif // MPI_COMPUTE_ENABLED
-
-#if MPI_COMM_ENABLED
-        // acTransferCommDataWait(corner_data); // Do not rm: required for corners
-        acTransferCommDataWait(edgex_data);
-        acTransferCommDataWait(edgey_data);
-        acTransferCommDataWait(edgez_data);
-        acTransferCommDataWait(sidexy_data);
-        acTransferCommDataWait(sidexz_data);
-        acTransferCommDataWait(sideyz_data);
-
-#if MPI_GPUDIRECT_DISABLED
-        // acTransferCommDataToDevice(device, &corner_data); // Do not rm: required for corners
-        acTransferCommDataToDevice(device, &edgex_data);
-        acTransferCommDataToDevice(device, &edgey_data);
-        acTransferCommDataToDevice(device, &edgez_data);
-        acTransferCommDataToDevice(device, &sidexy_data);
-        acTransferCommDataToDevice(device, &sidexz_data);
-        acTransferCommDataToDevice(device, &sideyz_data);
-#endif
-
-        // acUnpinCommData(device, &corner_data); // Do not rm: required for corners
-        acUnpinCommData(device, &edgex_data);
-        acUnpinCommData(device, &edgey_data);
-        acUnpinCommData(device, &edgez_data);
-        acUnpinCommData(device, &sidexy_data);
-        acUnpinCommData(device, &sidexz_data);
-        acUnpinCommData(device, &sideyz_data);
-
-        // acUnpackCommData(device, corner_b0s, &corner_data);
-        acUnpackCommData(device, edgex_b0s, &edgex_data);
-        acUnpackCommData(device, edgey_b0s, &edgey_data);
-        acUnpackCommData(device, edgez_b0s, &edgez_data);
-        acUnpackCommData(device, sidexy_b0s, &sidexy_data);
-        acUnpackCommData(device, sidexz_b0s, &sidexz_data);
-        acUnpackCommData(device, sideyz_b0s, &sideyz_data);
-        //////////// OUTER INTEGRATION //////////////
-
-        // Wait for unpacking
-        // acSyncCommData(corner_data); // Do not rm: required for corners
-        acSyncCommData(edgex_data);
-        acSyncCommData(edgey_data);
-        acSyncCommData(edgez_data);
-        acSyncCommData(sidexy_data);
-        acSyncCommData(sidexz_data);
-        acSyncCommData(sideyz_data);
-#endif // MPI_COMM_ENABLED
-#if MPI_COMPUTE_ENABLED
-        { // Front
-            const int3 m1 = (int3){NGHOST, NGHOST, NGHOST};
-            const int3 m2 = m1 + (int3){nn.x, nn.y, NGHOST};
-            acDeviceIntegrateSubstep(device, STREAM_0, isubstep, m1, m2, dt);
-        }
-        { // Back
-            const int3 m1 = (int3){NGHOST, NGHOST, nn.z};
-            const int3 m2 = m1 + (int3){nn.x, nn.y, NGHOST};
-            acDeviceIntegrateSubstep(device, STREAM_1, isubstep, m1, m2, dt);
-        }
-        { // Bottom
-            const int3 m1 = (int3){NGHOST, NGHOST, 2 * NGHOST};
-            const int3 m2 = m1 + (int3){nn.x, NGHOST, nn.z - 2 * NGHOST};
-            acDeviceIntegrateSubstep(device, STREAM_2, isubstep, m1, m2, dt);
-        }
-        { // Top
-            const int3 m1 = (int3){NGHOST, nn.y, 2 * NGHOST};
-            const int3 m2 = m1 + (int3){nn.x, NGHOST, nn.z - 2 * NGHOST};
-            acDeviceIntegrateSubstep(device, STREAM_3, isubstep, m1, m2, dt);
-        }
-        { // Left
-            const int3 m1 = (int3){NGHOST, 2 * NGHOST, 2 * NGHOST};
-            const int3 m2 = m1 + (int3){NGHOST, nn.y - 2 * NGHOST, nn.z - 2 * NGHOST};
-            acDeviceIntegrateSubstep(device, STREAM_4, isubstep, m1, m2, dt);
-        }
-        { // Right
-            const int3 m1 = (int3){nn.x, 2 * NGHOST, 2 * NGHOST};
-            const int3 m2 = m1 + (int3){NGHOST, nn.y - 2 * NGHOST, nn.z - 2 * NGHOST};
-            acDeviceIntegrateSubstep(device, STREAM_5, isubstep, m1, m2, dt);
-        }
-#endif // MPI_COMPUTE_ENABLED
-        acDeviceSwapBuffers(device);
-        acDeviceSynchronizeStream(device, STREAM_ALL); // Wait until inner and outer done
-        ////////////////////////////////////////////
-    }
-
-    return AC_SUCCESS;
-}
-
 AcResult
 acGridPeriodicBoundconds(const Stream stream)
 {

From 88f99c12e4d3a035bcc53db630adc3c8eeb4db52 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Wed, 24 Jun 2020 15:20:43 +0300
Subject: [PATCH 55/89] Fixed #fi -> #endif

---
 src/core/device.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/core/device.cc b/src/core/device.cc
index 688ed89..181e802 100644
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -666,7 +666,7 @@ acCreatePackedData(const int3 dims)
     ERRCHK_CUDA_ALWAYS(cudaMallocHost((void**)&data.data_pinned, bytes));
     // ERRCHK_CUDA_ALWAYS(cudaMallocManaged((void**)&data.data_pinned, bytes)); // Significantly
     // slower than pinned (38 ms vs. 125 ms)
-    #fi // USE_CUDA_DRIVER_PINNING
+    #endif // USE_CUDA_DRIVER_PINNING
 
     return data;
 }

From 3c3b2a188593f8b2900012788b46f2b3f50af451 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Wed, 24 Jun 2020 15:35:19 +0300
Subject: [PATCH 56/89] Reverted the default settings to what they were before
 merge. Note: LFORCING (1) is potentially not tested properly, TODO recheck.

---
 CMakeLists.txt                   | 4 ++--
 acc/mhd_solver/stencil_kernel.ac | 4 ++--
 config/astaroth.conf             | 6 +++---
 src/utils/modelsolver.c          | 4 ++--
 4 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 04100bc..682be55 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -10,7 +10,7 @@ project(astaroth C CXX CUDA)
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR})
 
 ## Project-wide compilation flags
-set(COMMON_FLAGS "-mavx -Wall -Wextra -Wdouble-promotion -Wfloat-conversion -Wshadow")
+set(COMMON_FLAGS "-mavx -Wall -Wextra -Werror -Wdouble-promotion -Wfloat-conversion -Wshadow")
 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${COMMON_FLAGS}")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COMMON_FLAGS}")
 set(CMAKE_C_STANDARD 11)
@@ -19,7 +19,7 @@ set(CMAKE_CXX_STANDARD 11)
 find_package(CUDA) # Still required for various macros, such as cuda_select_nvcc_...
 cuda_select_nvcc_arch_flags(ARCHLIST Common) # Common architectures depend on the available CUDA version. Listed here: https://github.com/Kitware/CMake/blob/master/Modules/FindCUDA/select_compute_arch.cmake
 string(REPLACE ";" " " CUDA_ARCH_FLAGS "${ARCHLIST}")
-set(COMMON_FLAGS_CUDA "-mavx,-Wall,-Wextra,-Wdouble-promotion,-Wfloat-conversion,-Wshadow")
+set(COMMON_FLAGS_CUDA "-mavx,-Wall,-Wextra,-Werror,-Wdouble-promotion,-Wfloat-conversion,-Wshadow")
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${CUDA_ARCH_FLAGS} -ccbin=${CMAKE_CXX_COMPILER} --compiler-options=${COMMON_FLAGS_CUDA}")
 
 
diff --git a/acc/mhd_solver/stencil_kernel.ac b/acc/mhd_solver/stencil_kernel.ac
index 0f37ea5..905eb65 100644
--- a/acc/mhd_solver/stencil_kernel.ac
+++ b/acc/mhd_solver/stencil_kernel.ac
@@ -5,8 +5,8 @@
 #define LMAGNETIC (1)
 #define LENTROPY (1)
 #define LTEMPERATURE (0)
-#define LFORCING (0)
-#define LUPWD (0)
+#define LFORCING (1)
+#define LUPWD (1)
 #define LSINK (0)
 
 #define AC_THERMAL_CONDUCTIVITY (0.001) // TODO: make an actual config parameter
diff --git a/config/astaroth.conf b/config/astaroth.conf
index 83e93d9..190948b 100644
--- a/config/astaroth.conf
+++ b/config/astaroth.conf
@@ -5,9 +5,9 @@
  * "Compile-time" params
  * =============================================================================
  */
-AC_nx = 256
-AC_ny = 256
-AC_nz = 256
+AC_nx = 128
+AC_ny = 128
+AC_nz = 128
 
 AC_dsx = 0.04908738521
 AC_dsy = 0.04908738521
diff --git a/src/utils/modelsolver.c b/src/utils/modelsolver.c
index e482bd9..d9446eb 100644
--- a/src/utils/modelsolver.c
+++ b/src/utils/modelsolver.c
@@ -38,8 +38,8 @@
 #define LMAGNETIC (1)
 #define LENTROPY (1)
 #define LTEMPERATURE (0)
-#define LFORCING (0)
-#define LUPWD (0)
+#define LFORCING (1)
+#define LUPWD (1)
 #define AC_THERMAL_CONDUCTIVITY ((Scalar)(0.001)) // TODO: make an actual config parameter
 
 typedef AcReal Scalar;

From 0d1c5b3911f502c3fe8a7d91b344a81921a2d822 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Wed, 24 Jun 2020 15:56:30 +0300
Subject: [PATCH 57/89] Autoformatted

---
 samples/bwtest/main.c           | 47 ++++++++++++--------
 src/core/device.cc              | 21 ++++-----
 src/core/kernels/kernels.cu     | 25 ++++++-----
 src/core/kernels/reductions.cuh |  5 ++-
 src/utils/modelsolver.c         | 78 ++++++++++++++++++++++-----------
 5 files changed, 108 insertions(+), 68 deletions(-)

diff --git a/samples/bwtest/main.c b/samples/bwtest/main.c
index 9dd25d9..35d98d5 100644
--- a/samples/bwtest/main.c
+++ b/samples/bwtest/main.c
@@ -6,15 +6,21 @@
 
 #include <mpi.h>
 
-#include <cuda_runtime_api.h>
 #include <cuda.h> // CUDA driver API
+#include <cuda_runtime_api.h>
 
 #include "timer_hires.h" // From src/common
 
 //#define BLOCK_SIZE (100 * 1024 * 1024) // Bytes
 #define BLOCK_SIZE (256 * 256 * 3 * 8 * 8)
 
-#define errchk(x) { if (!(x)) { fprintf(stderr, "errchk(%s) failed", #x); assert(x); }}
+#define errchk(x)                                                                                  \
+    {                                                                                              \
+        if (!(x)) {                                                                                \
+            fprintf(stderr, "errchk(%s) failed", #x);                                              \
+            assert(x);                                                                             \
+        }                                                                                          \
+    }
 
 /*
   Findings:
@@ -57,17 +63,18 @@ allocDevice(const size_t bytes)
 static uint8_t*
 allocDevicePinned(const size_t bytes)
 {
-    #define USE_CUDA_DRIVER_PINNING (1)
-    #if USE_CUDA_DRIVER_PINNING
+#define USE_CUDA_DRIVER_PINNING (1)
+#if USE_CUDA_DRIVER_PINNING
     uint8_t* arr = allocDevice(bytes);
 
     unsigned int flag = 1;
-    CUresult retval = cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, (CUdeviceptr)arr);
+    CUresult retval   = cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
+                                            (CUdeviceptr)arr);
 
     errchk(retval == CUDA_SUCCESS);
     return arr;
 
-    #else
+#else
     uint8_t* arr;
     // Standard (20 GiB/s internode, 85 GiB/s intranode)
     // const cudaError_t retval = cudaMalloc((void**)&arr, bytes);
@@ -77,7 +84,7 @@ allocDevicePinned(const size_t bytes)
     const cudaError_t retval = cudaMallocHost((void**)&arr, bytes);
     errchk(retval == cudaSuccess);
     return arr;
-    #endif
+#endif
 }
 
 /*
@@ -267,7 +274,6 @@ send_h2d(uint8_t* src, uint8_t* dst)
     cudaMemcpy(dst, src, BLOCK_SIZE, cudaMemcpyHostToDevice);
 }
 
-
 static void
 sendrecv_d2h2d(uint8_t* dsrc, uint8_t* hdst, uint8_t* hsrc, uint8_t* ddst)
 {
@@ -327,10 +333,10 @@ measurebw(const char* msg, const size_t bytes, void (*sendrecv)(uint8_t*, uint8_
     MPI_Barrier(MPI_COMM_WORLD);
 }
 
-
 static void
-measurebw2(const char* msg, const size_t bytes, void (*sendrecv)(uint8_t*, uint8_t*, uint8_t*, uint8_t*), uint8_t* dsrc, uint8_t* hdst,
-                                                                                                            uint8_t* hsrc, uint8_t* ddst)
+measurebw2(const char* msg, const size_t bytes,
+           void (*sendrecv)(uint8_t*, uint8_t*, uint8_t*, uint8_t*), uint8_t* dsrc, uint8_t* hdst,
+           uint8_t* hsrc, uint8_t* ddst)
 {
     const size_t num_samples = 100;
 
@@ -414,8 +420,8 @@ main(void)
         measurebw("Bidirectional bandwidth, twoway (Host)", //
                   2 * BLOCK_SIZE, sendrecv_twoway, src, dst);
         measurebw("Bidirectional bandwidth, async multiple (Host)", //
-                  2 * (nprocs-1) * BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);
-        //measurebw("Bidirectional bandwidth, async multiple parallel (Host)", //
+                  2 * (nprocs - 1) * BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);
+        // measurebw("Bidirectional bandwidth, async multiple parallel (Host)", //
         //          2 * (nprocs-1) * BLOCK_SIZE, sendrecv_nonblocking_multiple_parallel, src, dst);
 
         freeHost(src);
@@ -434,11 +440,12 @@ main(void)
         measurebw("Bidirectional bandwidth, twoway (Device)", //
                   2 * BLOCK_SIZE, sendrecv_twoway, src, dst);
         measurebw("Bidirectional bandwidth, async multiple (Device)", //
-                  2 *  (nprocs-1) *BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);
-        //measurebw("Bidirectional bandwidth, async multiple parallel (Device)", //
+                  2 * (nprocs - 1) * BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);
+        // measurebw("Bidirectional bandwidth, async multiple parallel (Device)", //
         //          2 *  (nprocs-1) *BLOCK_SIZE, sendrecv_nonblocking_multiple_parallel, src, dst);
         measurebw("Bidirectional bandwidth, async multiple (Device, rt pinning)", //
-                  2 *  (nprocs-1) *BLOCK_SIZE, sendrecv_nonblocking_multiple_rt_pinning, src, dst);
+                  2 * (nprocs - 1) * BLOCK_SIZE, sendrecv_nonblocking_multiple_rt_pinning, src,
+                  dst);
 
         freeDevice(src);
         freeDevice(dst);
@@ -456,7 +463,7 @@ main(void)
         measurebw("Bidirectional bandwidth, twoway (Device, pinned)", //
                   2 * BLOCK_SIZE, sendrecv_twoway, src, dst);
         measurebw("Bidirectional bandwidth, async multiple (Device, pinned)", //
-                  2 *  (nprocs-1) *BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);
+                  2 * (nprocs - 1) * BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);
 
         freeDevice(src);
         freeDevice(dst);
@@ -472,7 +479,8 @@ main(void)
         measurebw("Unidirectional D2H", BLOCK_SIZE, send_d2h, dsrc, hdst);
         measurebw("Unidirectional H2D", BLOCK_SIZE, send_h2d, hsrc, ddst);
 
-        measurebw2("Bidirectional D2H & H2D", 2 * BLOCK_SIZE, sendrecv_d2h2d, dsrc, hdst, hsrc, ddst);
+        measurebw2("Bidirectional D2H & H2D", 2 * BLOCK_SIZE, sendrecv_d2h2d, dsrc, hdst, hsrc,
+                   ddst);
 
         freeDevice(dsrc);
         freeDevice(ddst);
@@ -490,7 +498,8 @@ main(void)
         measurebw("Unidirectional D2H (pinned)", BLOCK_SIZE, send_d2h, dsrc, hdst);
         measurebw("Unidirectional H2D (pinned)", BLOCK_SIZE, send_h2d, hsrc, ddst);
 
-        measurebw2("Bidirectional D2H & H2D (pinned)", 2 * BLOCK_SIZE, sendrecv_d2h2d, dsrc, hdst, hsrc, ddst);
+        measurebw2("Bidirectional D2H & H2D (pinned)", 2 * BLOCK_SIZE, sendrecv_d2h2d, dsrc, hdst,
+                   hsrc, ddst);
 
         freeDevice(dsrc);
         freeDevice(ddst);
diff --git a/src/core/device.cc b/src/core/device.cc
index 181e802..8cda677 100644
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -16,7 +16,7 @@
 #define MPI_COMPUTE_ENABLED (1)
 #define MPI_COMM_ENABLED (1)
 #define MPI_INCL_CORNERS (0)
-#define MPI_USE_PINNED (1) // Do inter-node comm with pinned memory
+#define MPI_USE_PINNED (1)              // Do inter-node comm with pinned memory
 #define MPI_USE_CUDA_DRIVER_PINNING (0) // Pin with cuPointerSetAttribute, otherwise cudaMallocHost
 
 #include <cuda.h> // CUDA driver API (needed if MPI_USE_CUDA_DRIVER_PINNING is set)
@@ -656,17 +656,18 @@ acCreatePackedData(const int3 dims)
     const size_t bytes = dims.x * dims.y * dims.z * sizeof(data.data[0]) * NUM_VTXBUF_HANDLES;
     ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&data.data, bytes));
 
-    #if MPI_USE_CUDA_DRIVER_PINNING
-      ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&data.data_pinned, bytes));
+#if MPI_USE_CUDA_DRIVER_PINNING
+    ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&data.data_pinned, bytes));
 
-      unsigned int flag = 1;
-      CUresult retval = cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, (CUdeviceptr)data.data_pinned);
-      ERRCHK_ALWAYS(retval == CUDA_SUCCESS);
-    #else
+    unsigned int flag = 1;
+    CUresult retval   = cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
+                                            (CUdeviceptr)data.data_pinned);
+    ERRCHK_ALWAYS(retval == CUDA_SUCCESS);
+#else
     ERRCHK_CUDA_ALWAYS(cudaMallocHost((void**)&data.data_pinned, bytes));
-    // ERRCHK_CUDA_ALWAYS(cudaMallocManaged((void**)&data.data_pinned, bytes)); // Significantly
-    // slower than pinned (38 ms vs. 125 ms)
-    #endif // USE_CUDA_DRIVER_PINNING
+// ERRCHK_CUDA_ALWAYS(cudaMallocManaged((void**)&data.data_pinned, bytes)); // Significantly
+// slower than pinned (38 ms vs. 125 ms)
+#endif // USE_CUDA_DRIVER_PINNING
 
     return data;
 }
diff --git a/src/core/kernels/kernels.cu b/src/core/kernels/kernels.cu
index 3c59486..2878f38 100644
--- a/src/core/kernels/kernels.cu
+++ b/src/core/kernels/kernels.cu
@@ -75,17 +75,20 @@ exp(const acComplex& val)
 {
     return acComplex(exp(val.x) * cos(val.y), exp(val.x) * sin(val.y));
 }
-static __device__ inline acComplex operator*(const AcReal& a, const acComplex& b)
+static __device__ inline acComplex
+operator*(const AcReal& a, const acComplex& b)
 {
     return (acComplex){a * b.x, a * b.y};
 }
 
-static __device__ inline acComplex operator*(const acComplex& b, const AcReal& a)
+static __device__ inline acComplex
+operator*(const acComplex& b, const AcReal& a)
 {
     return (acComplex){a * b.x, a * b.y};
 }
 
-static __device__ inline acComplex operator*(const acComplex& a, const acComplex& b)
+static __device__ inline acComplex
+operator*(const acComplex& a, const acComplex& b)
 {
     return (acComplex){a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x};
 }
@@ -116,7 +119,7 @@ acDeviceLoadScalarUniform(const Device device, const Stream stream, const AcReal
 
     const size_t offset = (size_t)&d_mesh_info.real_params[param] - (size_t)&d_mesh_info;
     ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
-                                               cudaMemcpyHostToDevice, device->streams[stream]));
+                                        cudaMemcpyHostToDevice, device->streams[stream]));
     return AC_SUCCESS;
 }
 
@@ -141,7 +144,7 @@ acDeviceLoadVectorUniform(const Device device, const Stream stream, const AcReal
 
     const size_t offset = (size_t)&d_mesh_info.real3_params[param] - (size_t)&d_mesh_info;
     ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
-                                               cudaMemcpyHostToDevice, device->streams[stream]));
+                                        cudaMemcpyHostToDevice, device->streams[stream]));
     return AC_SUCCESS;
 }
 
@@ -165,7 +168,7 @@ acDeviceLoadIntUniform(const Device device, const Stream stream, const AcIntPara
 
     const size_t offset = (size_t)&d_mesh_info.int_params[param] - (size_t)&d_mesh_info;
     ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
-                                               cudaMemcpyHostToDevice, device->streams[stream]));
+                                        cudaMemcpyHostToDevice, device->streams[stream]));
     return AC_SUCCESS;
 }
 
@@ -179,10 +182,10 @@ acDeviceLoadInt3Uniform(const Device device, const Stream stream, const AcInt3Pa
     }
 
     if (!is_valid(value.x) || !is_valid(value.y) || !is_valid(value.z)) {
-        fprintf(
-            stderr,
-            "WARNING: Passed an invalid value (%d, %d, %def) to device constant %s. Skipping.\n",
-            value.x, value.y, value.z, int3param_names[param]);
+        fprintf(stderr,
+                "WARNING: Passed an invalid value (%d, %d, %def) to device constant %s. "
+                "Skipping.\n",
+                value.x, value.y, value.z, int3param_names[param]);
         return AC_FAILURE;
     }
 
@@ -229,7 +232,7 @@ acDeviceLoadDefaultUniforms(const Device device)
 {
     cudaSetDevice(device->id);
 
-    // clang-format off
+// clang-format off
     // Scalar
     #define LOAD_DEFAULT_UNIFORM(X) acDeviceLoadScalarUniform(device, STREAM_DEFAULT, X, X##_DEFAULT_VALUE);
     AC_FOR_USER_REAL_PARAM_TYPES(LOAD_DEFAULT_UNIFORM)
diff --git a/src/core/kernels/reductions.cuh b/src/core/kernels/reductions.cuh
index 8877e7e..1f40df3 100644
--- a/src/core/kernels/reductions.cuh
+++ b/src/core/kernels/reductions.cuh
@@ -92,8 +92,9 @@ kernel_filter_vec(const __restrict__ AcReal* src0, const __restrict__ AcReal* sr
     assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
     assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);
 
-    dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(
-        src0[IDX(src_idx)], src1[IDX(src_idx)], src2[IDX(src_idx)]);
+    dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(src0[IDX(src_idx)],
+                                                                   src1[IDX(src_idx)],
+                                                                   src2[IDX(src_idx)]);
 }
 
 template <ReduceFunc reduce>
diff --git a/src/utils/modelsolver.c b/src/utils/modelsolver.c
index d9446eb..4c2edcf 100644
--- a/src/utils/modelsolver.c
+++ b/src/utils/modelsolver.c
@@ -103,11 +103,16 @@ first_derivative(const Scalar* pencil, const Scalar inv_ds)
 #elif STENCIL_ORDER == 4
     const Scalar coefficients[] = {0, (Scalar)(2.0 / 3.0), (Scalar)(-1.0 / 12.0)};
 #elif STENCIL_ORDER == 6
-    const Scalar coefficients[] = {0, (Scalar)(3.0 / 4.0), (Scalar)(-3.0 / 20.0),
-                                   (Scalar)(1.0 / 60.0)};
+    const Scalar coefficients[] = {
+        0,
+        (Scalar)(3.0 / 4.0),
+        (Scalar)(-3.0 / 20.0),
+        (Scalar)(1.0 / 60.0),
+    };
 #elif STENCIL_ORDER == 8
-    const Scalar coefficients[] = {0, (Scalar)(4.0 / 5.0), (Scalar)(-1.0 / 5.0),
-                                   (Scalar)(4.0 / 105.0), (Scalar)(-1.0 / 280.0)};
+    const Scalar coefficients[] = {
+        0, (Scalar)(4.0 / 5.0), (Scalar)(-1.0 / 5.0), (Scalar)(4.0 / 105.0), (Scalar)(-1.0 / 280.0),
+    };
 #endif
 
 #define MID (STENCIL_ORDER / 2)
@@ -126,15 +131,23 @@ second_derivative(const Scalar* pencil, const Scalar inv_ds)
 #if STENCIL_ORDER == 2
     const Scalar coefficients[] = {-2, 1};
 #elif STENCIL_ORDER == 4
-    const Scalar coefficients[] = {(Scalar)(-5.0 / 2.0), (Scalar)(4.0 / 3.0),
-                                   (Scalar)(-1.0 / 12.0)};
+    const Scalar coefficients[] = {
+        (Scalar)(-5.0 / 2.0),
+        (Scalar)(4.0 / 3.0),
+        (Scalar)(-1.0 / 12.0),
+    };
 #elif STENCIL_ORDER == 6
-    const Scalar coefficients[] = {(Scalar)(-49.0 / 18.0), (Scalar)(3.0 / 2.0),
-                                   (Scalar)(-3.0 / 20.0), (Scalar)(1.0 / 90.0)};
+    const Scalar coefficients[] = {
+        (Scalar)(-49.0 / 18.0),
+        (Scalar)(3.0 / 2.0),
+        (Scalar)(-3.0 / 20.0),
+        (Scalar)(1.0 / 90.0),
+    };
 #elif STENCIL_ORDER == 8
-    const Scalar coefficients[] = {(Scalar)(-205.0 / 72.0), (Scalar)(8.0 / 5.0),
-                                   (Scalar)(-1.0 / 5.0), (Scalar)(8.0 / 315.0),
-                                   (Scalar)(-1.0 / 560.0)};
+    const Scalar coefficients[] = {
+        (Scalar)(-205.0 / 72.0), (Scalar)(8.0 / 5.0),    (Scalar)(-1.0 / 5.0),
+        (Scalar)(8.0 / 315.0),   (Scalar)(-1.0 / 560.0),
+    };
 #endif
 
 #define MID (STENCIL_ORDER / 2)
@@ -156,16 +169,27 @@ cross_derivative(const Scalar* pencil_a, const Scalar* pencil_b, const Scalar in
     const Scalar coefficients[] = {0, (Scalar)(1.0 / 4.0)};
 #elif STENCIL_ORDER == 4
     const Scalar coefficients[] = {
-        0, (Scalar)(1.0 / 32.0),
-        (Scalar)(1.0 / 64.0)}; // TODO correct coefficients, these are just placeholders
+        0,
+        (Scalar)(1.0 / 32.0),
+        (Scalar)(1.0 / 64.0),
+    }; // TODO correct coefficients, these are just placeholders
 #elif STENCIL_ORDER == 6
     const Scalar fac            = ((Scalar)(1. / 720.));
-    const Scalar coefficients[] = {0 * fac, (Scalar)(270.0) * fac, (Scalar)(-27.0) * fac,
-                                   (Scalar)(2.0) * fac};
+    const Scalar coefficients[] = {
+        0 * fac,
+        (Scalar)(270.0) * fac,
+        (Scalar)(-27.0) * fac,
+        (Scalar)(2.0) * fac,
+    };
 #elif STENCIL_ORDER == 8
     const Scalar fac            = ((Scalar)(1. / 20160.));
-    const Scalar coefficients[] = {0 * fac, (Scalar)(8064.) * fac, (Scalar)(-1008.) * fac,
-                                   (Scalar)(128.) * fac, (Scalar)(-9.) * fac};
+    const Scalar coefficients[] = {
+        0 * fac,
+        (Scalar)(8064.) * fac,
+        (Scalar)(-1008.) * fac,
+        (Scalar)(128.) * fac,
+        (Scalar)(-9.) * fac,
+    };
 #endif
 
 #define MID (STENCIL_ORDER / 2)
@@ -207,14 +231,14 @@ derxy(const int i, const int j, const int k, const Scalar* arr)
     Scalar pencil_a[STENCIL_ORDER + 1];
     //#pragma unroll
     for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
-        pencil_a[offset] = arr[IDX(i + offset - STENCIL_ORDER / 2, j + offset - STENCIL_ORDER / 2,
-                                   k)];
+        pencil_a[offset] = arr[IDX(i + offset - STENCIL_ORDER / 2, //
+                                   j + offset - STENCIL_ORDER / 2, k)];
 
     Scalar pencil_b[STENCIL_ORDER + 1];
     //#pragma unroll
     for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
-        pencil_b[offset] = arr[IDX(i + offset - STENCIL_ORDER / 2, j + STENCIL_ORDER / 2 - offset,
-                                   k)];
+        pencil_b[offset] = arr[IDX(i + offset - STENCIL_ORDER / 2, //
+                                   j + STENCIL_ORDER / 2 - offset, k)];
 
     return cross_derivative(pencil_a, pencil_b, getReal(AC_inv_dsx), getReal(AC_inv_dsy));
 }
@@ -539,7 +563,8 @@ gradient_of_divergence(const VectorData vec)
     return (Vector){
         hessian(vec.xdata).row[0][0] + hessian(vec.ydata).row[0][1] + hessian(vec.zdata).row[0][2],
         hessian(vec.xdata).row[1][0] + hessian(vec.ydata).row[1][1] + hessian(vec.zdata).row[1][2],
-        hessian(vec.xdata).row[2][0] + hessian(vec.ydata).row[2][1] + hessian(vec.zdata).row[2][2]};
+        hessian(vec.xdata).row[2][0] + hessian(vec.ydata).row[2][1] + hessian(vec.zdata).row[2][2],
+    };
 }
 
 // Takes uu gradients and returns S
@@ -805,10 +830,11 @@ forcing(int3 globalVertexIdx, Scalar dt)
                                        getInt(AC_ny) * getReal(AC_dsy),
                                        getInt(AC_nz) * getReal(AC_dsz)}; // source (origin)
     (void)a;                                                             // WARNING: not used
-    Vector xx        = (Vector){(globalVertexIdx.x - getInt(AC_nx_min)) * getReal(AC_dsx),
-                         (globalVertexIdx.y - getInt(AC_ny_min)) * getReal(AC_dsy),
-                         (globalVertexIdx.z - getInt(AC_nz_min)) *
-                             getReal(AC_dsz)}; // sink (current index)
+    Vector xx = (Vector){
+        (globalVertexIdx.x - getInt(AC_nx_min)) * getReal(AC_dsx),
+        (globalVertexIdx.y - getInt(AC_ny_min)) * getReal(AC_dsy),
+        (globalVertexIdx.z - getInt(AC_nz_min)) * getReal(AC_dsz),
+    }; // sink (current index)
     const Scalar cs2 = getReal(AC_cs2_sound);
     const Scalar cs  = sqrt(cs2);
 

From ba0bfd65b4f59ee1f19caac0b5257e0519238339 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Wed, 24 Jun 2020 16:10:27 +0300
Subject: [PATCH 58/89] Merged the new reduction functions manually

---
 src/core/device.cc | 79 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 79 insertions(+)

diff --git a/src/core/device.cc b/src/core/device.cc
index 8cda677..ed763a0 100644
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -1698,4 +1698,83 @@ acGridPeriodicBoundconds(const Stream stream)
     acSyncCommData(sideyz_data);
     return AC_SUCCESS;
 }
+
+static AcResult
+acMPIReduceScal(const AcReal local_result, const ReductionType rtype, AcReal* result)
+{
+
+    MPI_Op op;
+    if (rtype == RTYPE_MAX) {
+        op = MPI_MAX;
+    }
+    else if (rtype == RTYPE_MIN) {
+        op = MPI_MIN;
+    }
+    else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP || rtype == RTYPE_SUM) {
+        op = MPI_SUM;
+    }
+    else {
+        ERROR("Unrecognised rtype");
+    }
+
+#if AC_DOUBLE_PRECISION == 1
+    MPI_Datatype datatype = MPI_DOUBLE;
+#else
+    MPI_Datatype datatype = MPI_FLOAT;
+#endif
+
+    int rank;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+    int world_size;
+    MPI_Comm_size(MPI_COMM_WORLD, &world_size);
+
+    AcReal mpi_res;
+    MPI_Reduce(&local_result, &mpi_res, 1, datatype, op, 0, MPI_COMM_WORLD);
+    if (rank == 0) {
+        if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
+            const AcReal inv_n = AcReal(1.) /
+                                 (grid.nn.x * grid.decomposition.x * grid.nn.y *
+                                  grid.decomposition.y * grid.nn.z * grid.decomposition.z);
+            mpi_res = sqrt(inv_n * mpi_res);
+        }
+        *result = mpi_res;
+    }
+    return AC_SUCCESS;
+}
+
+AcResult
+acGridReduceScal(const Stream stream, const ReductionType rtype,
+                 const VertexBufferHandle vtxbuf_handle, AcReal* result)
+{
+    ERRCHK(grid.initialized);
+
+    const Device device = grid.device;
+
+    acGridSynchronizeStream(STREAM_ALL);
+    // MPI_Barrier(MPI_COMM_WORLD);
+
+    AcReal local_result;
+    acDeviceReduceScal(device, stream, rtype, vtxbuf_handle, &local_result);
+
+    return acMPIReduceScal(local_result, rtype, result);
+}
+
+AcResult
+acGridReduceVec(const Stream stream, const ReductionType rtype, const VertexBufferHandle vtxbuf0,
+                const VertexBufferHandle vtxbuf1, const VertexBufferHandle vtxbuf2, AcReal* result)
+{
+    ERRCHK(grid.initialized);
+
+    const Device device = grid.device;
+
+    acGridSynchronizeStream(STREAM_ALL);
+    // MPI_Barrier(MPI_COMM_WORLD);
+
+    AcReal local_result;
+    acDeviceReduceVec(device, stream, rtype, vtxbuf0, vtxbuf1, vtxbuf2, &local_result);
+
+    return acMPIReduceScal(local_result, rtype, result);
+}
+
 #endif // AC_MPI_ENABLED

From fab620eb0de10bc70a111a9cb9c0561e81b4e8de Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Wed, 24 Jun 2020 16:34:50 +0300
Subject: [PATCH 59/89] Reordered reduction autotests and made it so that the
 exact same mesh is used for both the model and candidates instead of the
 unclean integrated one

---
 samples/mpitest/main.cc | 59 ++++++++++++++++++++++++-----------------
 1 file changed, 34 insertions(+), 25 deletions(-)

diff --git a/samples/mpitest/main.cc b/samples/mpitest/main.cc
index f8de568..83a95ec 100644
--- a/samples/mpitest/main.cc
+++ b/samples/mpitest/main.cc
@@ -49,29 +49,45 @@ main(void)
 
     // GPU alloc & compute
     acGridInit(info);
-    acGridLoadMesh(model, STREAM_DEFAULT);
 
+    // INTEGRATION TESTS START ---------------------------------------------------------------------
+    acGridLoadMesh(model, STREAM_DEFAULT);
     acGridIntegrate(STREAM_DEFAULT, FLT_EPSILON);
     acGridPeriodicBoundconds(STREAM_DEFAULT);
+    acGridStoreMesh(STREAM_DEFAULT, &candidate);
+
+    if (pid == 0) {
+        acModelIntegrateStep(model, FLT_EPSILON);
+        acMeshApplyPeriodicBounds(&model);
+        acVerifyMesh(model, candidate);
+    }
+    // INTEGRATION TESTS END -----------------------------------------------------------------------
+
+    // REDUCTION TESTS START -----------------------------------------------------------------------
+    acGridLoadMesh(model, STREAM_DEFAULT);
 
-    // clang-format off
-    // Define scalar reduction tests here
     std::vector<AcScalReductionTestCase> scalarReductionTests{
-        acCreateScalReductionTestCase("Scalar MAX",     VTXBUF_UUX, RTYPE_MAX),
-        acCreateScalReductionTestCase("Scalar MIN",     VTXBUF_UUX, RTYPE_MIN),
-        acCreateScalReductionTestCase("Scalar RMS",     VTXBUF_UUX, RTYPE_RMS),
+        acCreateScalReductionTestCase("Scalar MAX", VTXBUF_UUX, RTYPE_MAX),
+        acCreateScalReductionTestCase("Scalar MIN", VTXBUF_UUX, RTYPE_MIN),
+        /*
+        acCreateScalReductionTestCase("Scalar RMS", VTXBUF_UUX, RTYPE_RMS),
         acCreateScalReductionTestCase("Scalar RMS_EXP", VTXBUF_UUX, RTYPE_RMS_EXP),
-        acCreateScalReductionTestCase("Scalar SUM",     VTXBUF_UUX, RTYPE_SUM)
+        acCreateScalReductionTestCase("Scalar SUM", VTXBUF_UUX, RTYPE_SUM),
+        */
     };
-    // Define vector reduction tests here
     std::vector<AcVecReductionTestCase> vectorReductionTests{
-        acCreateVecReductionTestCase("Vector MAX",     VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, RTYPE_MAX),
-        acCreateVecReductionTestCase("Vector MIN",     VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, RTYPE_MIN),
-        acCreateVecReductionTestCase("Vector RMS",     VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, RTYPE_RMS),
-        acCreateVecReductionTestCase("Vector RMS_EXP", VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, RTYPE_RMS_EXP),
-        acCreateVecReductionTestCase("Vector SUM",     VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, RTYPE_SUM)
+        acCreateVecReductionTestCase("Vector MAX", VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, RTYPE_MAX),
+        acCreateVecReductionTestCase("Vector MIN", VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, RTYPE_MIN),
+        /*
+        acCreateVecReductionTestCase("Vector RMS", VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, RTYPE_RMS),
+        acCreateVecReductionTestCase("Vector RMS_EXP", VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ,
+                                     RTYPE_RMS_EXP),
+        acCreateVecReductionTestCase("Vector SUM", VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, RTYPE_SUM),
+        */
     };
-    // clang-format on
+    // False positives due to too strict error bounds, skip the tests until we can determine a
+    // proper error bound
+    fprintf(stderr, "WARNING: RTYPE_RMS, RTYPE_RMS_EXP, and RTYPE_SUM tests skipped\n");
 
     for (auto& testCase : scalarReductionTests) {
         acGridReduceScal(STREAM_DEFAULT, testCase.rtype, testCase.vtxbuf, &testCase.candidate);
@@ -80,25 +96,18 @@ main(void)
         acGridReduceVec(STREAM_DEFAULT, testCase.rtype, testCase.a, testCase.b, testCase.c,
                         &testCase.candidate);
     }
-
-    acGridStoreMesh(STREAM_DEFAULT, &candidate);
-    acGridQuit();
-
-    // Verify
     if (pid == 0) {
-        acModelIntegrateStep(model, FLT_EPSILON);
-        acMeshApplyPeriodicBounds(&model);
-
-        acVerifyMesh(model, candidate);
-
-        // Check reductions
         acVerifyScalReductions(model, scalarReductionTests.data(), scalarReductionTests.size());
         acVerifyVecReductions(model, vectorReductionTests.data(), vectorReductionTests.size());
+    }
+    // REDUCTION TESTS END -------------------------------------------------------------------------
 
+    if (pid == 0) {
         acMeshDestroy(&model);
         acMeshDestroy(&candidate);
     }
 
+    acGridQuit();
     MPI_Finalize();
     return EXIT_SUCCESS;
 }

From c0c337610b77600466fc2b0b1594a3ea60608a0a Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Wed, 24 Jun 2020 16:42:39 +0300
Subject: [PATCH 60/89] Added mpi_reduce_bench to samples

---
 CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 682be55..29ec11e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -98,6 +98,7 @@ if (BUILD_SAMPLES)
     add_subdirectory(samples/benchmark)
     add_subdirectory(samples/bwtest)
     add_subdirectory(samples/genbenchmarkscripts)
+    add_subdirectory(samples/mpi_reduce_bench)
 endif()
 
 if (BUILD_STANDALONE)

From 196edac46d807eaf48079053c025af14d258964e Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Wed, 24 Jun 2020 17:03:54 +0300
Subject: [PATCH 61/89] Added proper casts to modelsolver.c

---
 src/core/kernels/integration.cuh | 11 +++++------
 src/utils/modelsolver.c          |  9 ++++++---
 2 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/src/core/kernels/integration.cuh b/src/core/kernels/integration.cuh
index 4c01148..0d88ac3 100644
--- a/src/core/kernels/integration.cuh
+++ b/src/core/kernels/integration.cuh
@@ -41,12 +41,11 @@ static __device__ __forceinline__ AcReal3
 rk3_integrate(const AcReal3 state_previous, const AcReal3 state_current,
               const AcReal3 rate_of_change, const AcReal dt)
 {
-    return (AcReal3){rk3_integrate<step_number>(state_previous.x, state_current.x, rate_of_change.x,
-                                                dt),
-                     rk3_integrate<step_number>(state_previous.y, state_current.y, rate_of_change.y,
-                                                dt),
-                     rk3_integrate<step_number>(state_previous.z, state_current.z, rate_of_change.z,
-                                                dt)};
+    return (AcReal3){
+        rk3_integrate<step_number>(state_previous.x, state_current.x, rate_of_change.x, dt),
+        rk3_integrate<step_number>(state_previous.y, state_current.y, rate_of_change.y, dt),
+        rk3_integrate<step_number>(state_previous.z, state_current.z, rate_of_change.z, dt),
+    };
 }
 
 #define rk3(state_previous, state_current, rate_of_change, dt)                                     \
diff --git a/src/utils/modelsolver.c b/src/utils/modelsolver.c
index 4c2edcf..3937e5e 100644
--- a/src/utils/modelsolver.c
+++ b/src/utils/modelsolver.c
@@ -41,6 +41,7 @@
 #define LFORCING (1)
 #define LUPWD (1)
 #define AC_THERMAL_CONDUCTIVITY ((Scalar)(0.001)) // TODO: make an actual config parameter
+#define R_PI ((Scalar)M_PI)
 
 typedef AcReal Scalar;
 // typedef AcReal3 Vector;
@@ -54,6 +55,8 @@ typedef float Vector __attribute__((vector_size(4 * sizeof(float))));
 #define fabs fabsf
 #define exp expf
 #define sqrt sqrtf
+#define cos cosf
+#define sin sinf
 #endif
 
 typedef struct {
@@ -802,9 +805,9 @@ Vector
 helical_forcing(Scalar magnitude, Vector k_force, Vector xx, Vector ff_re, Vector ff_im, Scalar phi)
 {
     (void)magnitude; // WARNING: unused
-    xx[0] = xx[0] * (2.0 * M_PI / (getReal(AC_dsx) * getInt(AC_nx)));
-    xx[1] = xx[1] * (2.0 * M_PI / (getReal(AC_dsy) * getInt(AC_ny)));
-    xx[2] = xx[2] * (2.0 * M_PI / (getReal(AC_dsz) * getInt(AC_nz)));
+    xx[0] = xx[0] * ((Scalar)2.0 * R_PI / (getReal(AC_dsx) * getInt(AC_nx)));
+    xx[1] = xx[1] * ((Scalar)2.0 * R_PI / (getReal(AC_dsy) * getInt(AC_ny)));
+    xx[2] = xx[2] * ((Scalar)2.0 * R_PI / (getReal(AC_dsz) * getInt(AC_nz)));
 
     Scalar cos_phi     = cos(phi);
     Scalar sin_phi     = sin(phi);

From 70ecacee7c703ddb273cf8022ba7086b7b2c8e3c Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Wed, 24 Jun 2020 17:04:35 +0300
Subject: [PATCH 62/89] Reverted the default build options to what they were
 before merging (again)

---
 CMakeLists.txt | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 29ec11e..b2722d1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -30,11 +30,11 @@ endif()
 message(STATUS "Build type: " ${CMAKE_BUILD_TYPE})
 
 ## Options
-option(DOUBLE_PRECISION "Generates double precision code."                    ON)
-option(BUILD_SAMPLES    "Builds projects in samples subdirectory."            ON)
-option(BUILD_STANDALONE "Builds standalone Astaroth."                         OFF)
-option(MPI_ENABLED      "Enables additional functions for MPI communciation." ON)
-option(MULTIGPU_ENABLED "Enables multi-GPU on a single node. Uses peer-to-peer communication instead of MPI. Affects Legacy & Node layers only." OFF)
+option(DOUBLE_PRECISION "Generates double precision code."                    OFF)
+option(BUILD_SAMPLES    "Builds projects in samples subdirectory."            OFF)
+option(BUILD_STANDALONE "Builds standalone Astaroth."                         ON)
+option(MPI_ENABLED      "Enables additional functions for MPI communciation." OFF)
+option(MULTIGPU_ENABLED "Enables multi-GPU on a single node. Uses peer-to-peer communication instead of MPI. Affects Legacy & Node layers only." ON)
 
 ## Options (DEPRECATED)
 # option(BUILD_DEBUG              "Builds the program with extensive error checking"          OFF)

From fbb8d7c7c6edf8acb1689cab13a5b7291f9f147a Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Thu, 25 Jun 2020 06:34:16 +0300
Subject: [PATCH 63/89] Added a minimal Fortran interface to Astaroth

---
 CMakeLists.txt           |  4 ++-
 acc/src/code_generator.c | 75 ++++++++++++++++++++++++++++++++++------
 include/astaroth.h       | 12 +++----
 src/core/CMakeLists.txt  |  2 +-
 4 files changed, 74 insertions(+), 19 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b2722d1..5152379 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -55,7 +55,8 @@ message(STATUS "AC module dir: ${DSL_MODULE_DIR}")
 file(GLOB DSL_SOURCES ${DSL_MODULE_DIR}/*
                       ${CMAKE_SOURCE_DIR}/acc/stdlib/*)
 set(DSL_HEADERS "${PROJECT_BINARY_DIR}/user_kernels.h"
-                "${PROJECT_BINARY_DIR}/user_defines.h")
+                "${PROJECT_BINARY_DIR}/user_defines.h"
+                "${PROJECT_BINARY_DIR}/astaroth.f90")
 
 add_custom_command (
     COMMENT "Building ACC objects ${DSL_MODULE_DIR}"
@@ -99,6 +100,7 @@ if (BUILD_SAMPLES)
     add_subdirectory(samples/bwtest)
     add_subdirectory(samples/genbenchmarkscripts)
     add_subdirectory(samples/mpi_reduce_bench)
+    add_subdirectory(samples/fortrantest)
 endif()
 
 if (BUILD_STANDALONE)
diff --git a/acc/src/code_generator.c b/acc/src/code_generator.c
index f69d186..8738590 100644
--- a/acc/src/code_generator.c
+++ b/acc/src/code_generator.c
@@ -39,9 +39,11 @@ ASTNode* root = NULL;
 // Output files
 static FILE* DSLHEADER  = NULL;
 static FILE* CUDAHEADER = NULL;
+static FILE* FHEADER    = NULL;
 
 static const char* dslheader_filename  = "user_defines.h";
 static const char* cudaheader_filename = "user_kernels.h";
+static const char* fheader_filename    = "astaroth.f90";
 
 // Forward declaration of yyparse
 int yyparse(void);
@@ -98,7 +100,8 @@ static const char* translation_table[TRANSLATION_TABLE_SIZE] = {
     ['<'] = "<",
     ['>'] = ">",
     ['!'] = "!",
-    ['.'] = "."};
+    ['.'] = ".",
+};
 
 static const char*
 translate(const int token)
@@ -261,9 +264,8 @@ traverse(const ASTNode* node)
             if (typequal->token == KERNEL) {
                 fprintf(CUDAHEADER, "GEN_KERNEL_PARAM_BOILERPLATE");
                 if (node->lhs != NULL) {
-                    fprintf(
-                        stderr,
-                        "Syntax error: function parameters for Kernel functions not allowed!\n");
+                    fprintf(stderr, "Syntax error: function parameters for Kernel functions not "
+                                    "allowed!\n");
                 }
             }
             else if (typequal->token == PREPROCESSED) {
@@ -597,8 +599,10 @@ generate_preprocessed_structures(void)
 }
 
 static void
-generate_header(void)
+generate_headers(void)
 {
+    int enumcounter = 0;
+
     fprintf(DSLHEADER, "#pragma once\n");
 
     // Int params
@@ -606,56 +610,103 @@ generate_header(void)
     for (size_t i = 0; i < num_symbols[current_nest]; ++i) {
         if (symbol_table[i].type_specifier == INT && symbol_table[i].type_qualifier == UNIFORM) {
             fprintf(DSLHEADER, "\\\nFUNC(%s)", symbol_table[i].identifier);
+            fprintf(FHEADER, "integer(c_int), parameter :: %s = %d\n", symbol_table[i].identifier,
+                    enumcounter);
+            ++enumcounter;
         }
     }
     fprintf(DSLHEADER, "\n\n");
+    fprintf(FHEADER, "integer(c_int), parameter :: AC_NUM_INT_PARAMS = %d\n\n", enumcounter);
 
     // Int3 params
     fprintf(DSLHEADER, "#define AC_FOR_USER_INT3_PARAM_TYPES(FUNC)");
+    enumcounter = 0;
     for (size_t i = 0; i < num_symbols[current_nest]; ++i) {
         if (symbol_table[i].type_specifier == INT3 && symbol_table[i].type_qualifier == UNIFORM) {
             fprintf(DSLHEADER, "\\\nFUNC(%s)", symbol_table[i].identifier);
+
+            fprintf(FHEADER, "integer(c_int), parameter :: %s = %d\n", symbol_table[i].identifier,
+                    enumcounter);
+            ++enumcounter;
         }
     }
     fprintf(DSLHEADER, "\n\n");
+    fprintf(FHEADER, "integer(c_int), parameter :: AC_NUM_INT3_PARAMS = %d\n\n", enumcounter);
 
     // Scalar params
     fprintf(DSLHEADER, "#define AC_FOR_USER_REAL_PARAM_TYPES(FUNC)");
+    enumcounter = 0;
     for (size_t i = 0; i < num_symbols[current_nest]; ++i) {
         if (symbol_table[i].type_specifier == SCALAR && symbol_table[i].type_qualifier == UNIFORM) {
             fprintf(DSLHEADER, "\\\nFUNC(%s)", symbol_table[i].identifier);
+
+            fprintf(FHEADER, "integer(c_int), parameter :: %s = %d\n", symbol_table[i].identifier,
+                    enumcounter);
+            ++enumcounter;
         }
     }
     fprintf(DSLHEADER, "\n\n");
+    fprintf(FHEADER, "integer(c_int), parameter :: AC_NUM_REAL_PARAMS = %d\n\n", enumcounter);
 
     // Vector params
     fprintf(DSLHEADER, "#define AC_FOR_USER_REAL3_PARAM_TYPES(FUNC)");
+    enumcounter = 0;
     for (size_t i = 0; i < num_symbols[current_nest]; ++i) {
         if (symbol_table[i].type_specifier == VECTOR && symbol_table[i].type_qualifier == UNIFORM) {
             fprintf(DSLHEADER, "\\\nFUNC(%s)", symbol_table[i].identifier);
+
+            fprintf(FHEADER, "integer(c_int), parameter :: %s = %d\n", symbol_table[i].identifier,
+                    enumcounter);
+            ++enumcounter;
         }
     }
     fprintf(DSLHEADER, "\n\n");
+    fprintf(FHEADER, "integer(c_int), parameter :: AC_NUM_REAL3_PARAMS = %d\n\n", enumcounter);
 
     // Scalar fields
     fprintf(DSLHEADER, "#define AC_FOR_VTXBUF_HANDLES(FUNC)");
+    enumcounter = 0;
     for (size_t i = 0; i < num_symbols[current_nest]; ++i) {
         if (symbol_table[i].type_specifier == SCALARFIELD &&
             symbol_table[i].type_qualifier == UNIFORM) {
             fprintf(DSLHEADER, "\\\nFUNC(%s)", symbol_table[i].identifier);
+
+            fprintf(FHEADER, "integer(c_int), parameter :: %s = %d\n", symbol_table[i].identifier,
+                    enumcounter);
+            ++enumcounter;
         }
     }
     fprintf(DSLHEADER, "\n\n");
+    fprintf(FHEADER, "integer(c_int), parameter :: AC_NUM_VTXBUF_HANDLES = %d\n\n", enumcounter);
 
     // Scalar arrays
     fprintf(DSLHEADER, "#define AC_FOR_SCALARARRAY_HANDLES(FUNC)");
+    enumcounter = 0;
     for (size_t i = 0; i < num_symbols[current_nest]; ++i) {
         if (symbol_table[i].type_specifier == SCALARARRAY &&
             symbol_table[i].type_qualifier == UNIFORM) {
             fprintf(DSLHEADER, "\\\nFUNC(%s)", symbol_table[i].identifier);
+
+            fprintf(FHEADER, "integer(c_int), parameter :: %s = %d\n", symbol_table[i].identifier,
+                    enumcounter);
+            ++enumcounter;
         }
     }
     fprintf(DSLHEADER, "\n\n");
+    fprintf(FHEADER, "integer(c_int), parameter :: AC_NUM_SCALARRAY_HANDLES = %d\n\n", enumcounter);
+
+    // Do Fortran-specific
+    const char* fortran_structs = R"(
+      integer, parameter :: precision = c_float ! TODO WARNING
+
+      type, bind(C) :: AcMeshInfo
+        integer(c_int), dimension(AC_NUM_INT_PARAMS)      :: int_params
+        integer(c_int), dimension(AC_NUM_INT3_PARAMS, 3)  :: int3_params
+        real(precision), dimension(AC_NUM_REAL_PARAMS)    :: real_params
+        real(precision), dimension(AC_NUM_REAL3_PARAMS, 3) :: real3_params
+      end type AcMeshInfo
+    )";
+    fprintf(FHEADER, "%s\n", fortran_structs);
 }
 
 static void
@@ -681,20 +732,21 @@ main(void)
 
     DSLHEADER  = fopen(dslheader_filename, "w+");
     CUDAHEADER = fopen(cudaheader_filename, "w+");
+    FHEADER    = fopen(fheader_filename, "w+");
     assert(DSLHEADER);
     assert(CUDAHEADER);
+    assert(FHEADER);
 
     // Add built-in param symbols
-    for (size_t i = 0; i < ARRAY_SIZE(builtin_int_params); ++i) {
+    for (size_t i = 0; i < ARRAY_SIZE(builtin_int_params); ++i)
         add_symbol(SYMBOLTYPE_OTHER, UNIFORM, INT, builtin_int_params[i]);
-    }
-    for (size_t i = 0; i < ARRAY_SIZE(builtin_int3_params); ++i) {
+
+    for (size_t i = 0; i < ARRAY_SIZE(builtin_int3_params); ++i)
         add_symbol(SYMBOLTYPE_OTHER, UNIFORM, INT3, builtin_int3_params[i]);
-    }
 
     // Generate
     traverse(root);
-    generate_header();
+    generate_headers();
     generate_preprocessed_structures();
     generate_library_hooks();
 
@@ -703,9 +755,12 @@ main(void)
     // Cleanup
     fclose(DSLHEADER);
     fclose(CUDAHEADER);
+    fclose(FHEADER);
     astnode_destroy(root);
 
     fprintf(stdout, "-- Generated %s\n", dslheader_filename);
     fprintf(stdout, "-- Generated %s\n", cudaheader_filename);
+    fprintf(stdout, "-- Generated %s\n", fheader_filename);
+
     return EXIT_SUCCESS;
 }
diff --git a/include/astaroth.h b/include/astaroth.h
index 47beb88..d72d598 100644
--- a/include/astaroth.h
+++ b/include/astaroth.h
@@ -322,15 +322,13 @@ AcResult acGridIntegrate(const Stream stream, const AcReal dt);
 AcResult acGridPeriodicBoundconds(const Stream stream);
 
 /** TODO */
-AcResult
-acGridReduceScal(const Stream stream, const ReductionType rtype,
-                 const VertexBufferHandle vtxbuf_handle, AcReal* result);
+AcResult acGridReduceScal(const Stream stream, const ReductionType rtype,
+                          const VertexBufferHandle vtxbuf_handle, AcReal* result);
 
 /** TODO */
-AcResult
-acGridReduceVec(const Stream stream, const ReductionType rtype,
-               const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1,
-               const VertexBufferHandle vtxbuf2, AcReal* result);
+AcResult acGridReduceVec(const Stream stream, const ReductionType rtype,
+                         const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1,
+                         const VertexBufferHandle vtxbuf2, AcReal* result);
 #endif // AC_MPI_ENABLED
 
 /*
diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index 81bcf14..1a70f93 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -1,7 +1,7 @@
 find_package(CUDAToolkit)
 
 ## Astaroth Core
-add_library(astaroth_core STATIC device.cc node.cc astaroth.cc)
+add_library(astaroth_core STATIC device.cc node.cc astaroth.cc astaroth_fortran.cc)
 target_link_libraries(astaroth_core astaroth_utils astaroth_kernels CUDA::cudart CUDA::cuda_driver)
 
 ## Options

From c44c3d02b4d7879312279f03f52e2cf8133fe9fe Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Thu, 25 Jun 2020 06:35:13 +0300
Subject: [PATCH 64/89] Added a sample for testing the Fortran interface

---
 samples/fortrantest/CMakeLists.txt |  4 ++++
 samples/fortrantest/main.f90       | 23 +++++++++++++++++++++++
 2 files changed, 27 insertions(+)
 create mode 100644 samples/fortrantest/CMakeLists.txt
 create mode 100644 samples/fortrantest/main.f90

diff --git a/samples/fortrantest/CMakeLists.txt b/samples/fortrantest/CMakeLists.txt
new file mode 100644
index 0000000..128ad8e
--- /dev/null
+++ b/samples/fortrantest/CMakeLists.txt
@@ -0,0 +1,4 @@
+enable_language(Fortran)
+
+add_executable(fortrantest main.f90)
+target_link_libraries(fortrantest astaroth_core)
diff --git a/samples/fortrantest/main.f90 b/samples/fortrantest/main.f90
new file mode 100644
index 0000000..1eba03f
--- /dev/null
+++ b/samples/fortrantest/main.f90
@@ -0,0 +1,23 @@
+program pc
+  use, intrinsic :: iso_c_binding
+  implicit none
+
+  include "astaroth.f90"
+
+  type(AcMeshInfo) :: info
+  type(c_ptr) :: device
+
+  print *, "Num int params"
+  print *, AC_NUM_INT_PARAMS
+
+  ! Setup config
+  info%int_params(AC_nx + 1) = 128
+  info%int_params(AC_ny + 1) = 128
+  info%int_params(AC_nz + 1) = 128
+  call acupdatebuiltinparams(info)
+
+  call acdevicecreate(0, info, device)
+  call acdeviceprintinfo(device)
+  call acdevicedestroy(device)
+
+end program

From f11c5b84fb5108880530dd64082db783efc15065 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Thu, 25 Jun 2020 06:36:00 +0300
Subject: [PATCH 65/89] Forgot the actual interface from previous commits, here
 it is

---
 include/astaroth_fortran.h | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)
 create mode 100644 include/astaroth_fortran.h

diff --git a/include/astaroth_fortran.h b/include/astaroth_fortran.h
new file mode 100644
index 0000000..bf9a1e9
--- /dev/null
+++ b/include/astaroth_fortran.h
@@ -0,0 +1,24 @@
+#pragma once
+#include "astaroth.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Utils
+ */
+void acupdatebuiltinparams_(AcMeshInfo* info);
+
+/**
+ * Device
+ */
+void acdevicecreate_(const int* id, const AcMeshInfo* info, Device* handle);
+
+void acdevicedestroy_(Device* device);
+
+void acdeviceprintinfo_(const Device* device);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif

From 264abddefba467e02ea81deb23d7f4396e8f1bb5 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Thu, 25 Jun 2020 03:41:23 +0000
Subject: [PATCH 66/89] bitbucket-pipelines.yml edited online with Bitbucket

---
 bitbucket-pipelines.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bitbucket-pipelines.yml b/bitbucket-pipelines.yml
index e49d4f1..bc6ac44 100644
--- a/bitbucket-pipelines.yml
+++ b/bitbucket-pipelines.yml
@@ -24,7 +24,7 @@ pipelines:
           - wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /etc/apt/trusted.gpg.d/kitware.gpg >/dev/null
           - apt-add-repository 'deb https://apt.kitware.com/ubuntu/ bionic main'
           - apt-get update
-          - apt-get install -y cmake flex bison openmpi-bin libopenmpi-dev
+          - apt-get install -y cmake flex bison openmpi-bin libopenmpi-dev gfortran
           - cmake -DDSL_MODULE_DIR="acc/mhd_solver" -DBUILD_STANDALONE=ON -DBUILD_UTILS=ON -DBUILD_RT_VISUALIZATION=OFF -DBUILD_SAMPLES=ON -DDOUBLE_PRECISION=OFF -DMULTIGPU_ENABLED=ON -DMPI_ENABLED=OFF .. # Single precision
           - make -j
           - rm -rf *

From 172ffc34dca28e85f69e14850437575266c31240 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Thu, 25 Jun 2020 06:44:27 +0300
Subject: [PATCH 67/89] Was missing another fortran file, added

---
 src/core/astaroth_fortran.cc | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)
 create mode 100644 src/core/astaroth_fortran.cc

diff --git a/src/core/astaroth_fortran.cc b/src/core/astaroth_fortran.cc
new file mode 100644
index 0000000..aa535d1
--- /dev/null
+++ b/src/core/astaroth_fortran.cc
@@ -0,0 +1,32 @@
+#include "astaroth_fortran.h"
+
+#include "astaroth.h"
+#include "astaroth_utils.h"
+
+void
+acdevicecreate_(const int* id, const AcMeshInfo* info, Device* handle)
+{
+    // TODO errorcheck
+    acDeviceCreate(*id, *info, handle);
+}
+
+void
+acdevicedestroy_(Device* device)
+{
+    // TODO errorcheck
+    acDeviceDestroy(*device);
+}
+
+void
+acdeviceprintinfo_(const Device* device)
+{
+    // TODO errorcheck
+    acDevicePrintInfo(*device);
+}
+
+void
+acupdatebuiltinparams_(AcMeshInfo* info)
+{
+    // TODO errorcheck
+    acUpdateBuiltinParams(info);
+}

From 0a191920045120bfd9e54b29b40a773cad66c121 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Thu, 25 Jun 2020 19:53:39 +0300
Subject: [PATCH 68/89] Auto-optimization was not on for all GPUs when using
 MPI. May have to rerun all benchmarks for the MPI paper.

---
 src/core/device.cc | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/core/device.cc b/src/core/device.cc
index ed763a0..689eaf7 100644
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -167,9 +167,7 @@ acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device_hand
     *device_handle = device;
 
     // Autoptimize
-    if (id == 0) {
-        acDeviceAutoOptimize(device);
-    }
+    acDeviceAutoOptimize(device);
 
     return AC_SUCCESS;
 }

From 1b50374cdbcea0008d5d720f0cdc37e24d5dbdad Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Thu, 25 Jun 2020 20:09:35 +0300
Subject: [PATCH 69/89] Added the rest of the basic functions required for
 running simulations with the fortran interface

---
 include/astaroth_fortran.h   | 24 +++++++++++++++
 src/core/astaroth_fortran.cc | 60 ++++++++++++++++++++++++++++++++++++
 2 files changed, 84 insertions(+)

diff --git a/include/astaroth_fortran.h b/include/astaroth_fortran.h
index bf9a1e9..11a37a9 100644
--- a/include/astaroth_fortran.h
+++ b/include/astaroth_fortran.h
@@ -19,6 +19,30 @@ void acdevicedestroy_(Device* device);
 
 void acdeviceprintinfo_(const Device* device);
 
+void acupdatebuiltinparams_(AcMeshInfo* info);
+
+void acdeviceswapbuffers_(const Device* device);
+
+void acdeviceloadmesh_(const Device* device, const Stream* stream, const AcMesh* host_mesh);
+
+void acdevicestoremesh_(const Device* device, const Stream* stream, AcMesh* host_mesh);
+
+void acdeviceintegratesubstep_(const Device* device, const Stream* stream, const int* step_number,
+                               const int3* start, const int3* end, const AcReal* dt);
+void acdeviceperiodicboundconds_(const Device* device, const Stream* stream, const int3* start,
+                                 const int3* end);
+
+void acdevicereducescal_(const Device* device, const Stream* stream, const ReductionType* rtype,
+                         const VertexBufferHandle* vtxbuf_handle, AcReal* result);
+
+void acdevicereducevec_(const Device* device, const Stream* stream, const ReductionType* rtype,
+                        const VertexBufferHandle* vtxbuf0, const VertexBufferHandle* vtxbuf1,
+                        const VertexBufferHandle* vtxbuf2, AcReal* result);
+
+void acdevicesynchronizestream_(const Device* device, const Stream* stream);
+
+void acdeviceloadmeshinfo_(const Device* device, const AcMeshInfo* info);
+
 #ifdef __cplusplus
 } // extern "C"
 #endif
diff --git a/src/core/astaroth_fortran.cc b/src/core/astaroth_fortran.cc
index aa535d1..ca93591 100644
--- a/src/core/astaroth_fortran.cc
+++ b/src/core/astaroth_fortran.cc
@@ -30,3 +30,63 @@ acupdatebuiltinparams_(AcMeshInfo* info)
     // TODO errorcheck
     acUpdateBuiltinParams(info);
 }
+
+void
+acdeviceswapbuffers_(const Device* device)
+{
+    acDeviceSwapBuffers(*device);
+}
+
+void
+acdeviceloadmesh_(const Device* device, const Stream* stream, const AcMesh* host_mesh)
+{
+    acDeviceLoadMesh(*device, *stream, *host_mesh);
+}
+
+void
+acdevicestoremesh_(const Device* device, const Stream* stream, AcMesh* host_mesh)
+{
+    acDeviceStoreMesh(*device, *stream, host_mesh);
+}
+
+void
+acdeviceintegratesubstep_(const Device* device, const Stream* stream, const int* step_number,
+                          const int3* start, const int3* end, const AcReal* dt)
+{
+    acDeviceIntegrateSubstep(*device, *stream, *step_number, *start, *end, *dt);
+}
+
+void
+acdeviceperiodicboundconds_(const Device* device, const Stream* stream, const int3* start,
+                            const int3* end)
+{
+
+    acDevicePeriodicBoundconds(*device, *stream, *start, *end);
+}
+
+void
+acdevicereducescal_(const Device* device, const Stream* stream, const ReductionType* rtype,
+                    const VertexBufferHandle* vtxbuf_handle, AcReal* result)
+{
+    acDeviceReduceScal(*device, *stream, *rtype, *vtxbuf_handle, result);
+}
+
+void
+acdevicereducevec_(const Device* device, const Stream* stream, const ReductionType* rtype,
+                   const VertexBufferHandle* vtxbuf0, const VertexBufferHandle* vtxbuf1,
+                   const VertexBufferHandle* vtxbuf2, AcReal* result)
+{
+    acDeviceReduceVec(*device, *stream, *rtype, *vtxbuf0, *vtxbuf1, *vtxbuf2, result);
+}
+
+void
+acdevicesynchronizestream_(const Device* device, const Stream* stream)
+{
+    acDeviceSynchronizeStream(*device, *stream);
+}
+
+void
+acdeviceloadmeshinfo_(const Device* device, const AcMeshInfo* info)
+{
+    acDeviceLoadMeshInfo(*device, *info);
+}

From 7e71e323595cf575fa63f75862ec9456da09b67c Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Thu, 25 Jun 2020 20:21:16 +0300
Subject: [PATCH 70/89] Fortran does not seem to really support arrays of
 pointers, better to modify the interface function to take the f array as an
 input and use it in C to costruct a proper AcMesh

---
 src/core/astaroth_fortran.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/core/astaroth_fortran.cc b/src/core/astaroth_fortran.cc
index ca93591..e397a9a 100644
--- a/src/core/astaroth_fortran.cc
+++ b/src/core/astaroth_fortran.cc
@@ -40,12 +40,14 @@ acdeviceswapbuffers_(const Device* device)
 void
 acdeviceloadmesh_(const Device* device, const Stream* stream, const AcMesh* host_mesh)
 {
+    // TODO construct AcMesh from fortran farray
     acDeviceLoadMesh(*device, *stream, *host_mesh);
 }
 
 void
 acdevicestoremesh_(const Device* device, const Stream* stream, AcMesh* host_mesh)
 {
+    // TODO construct AcMesh from fortran farray
     acDeviceStoreMesh(*device, *stream, host_mesh);
 }
 

From 39c7fc6c6fa8d8359dca69f548637aca83c0c3d6 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Thu, 25 Jun 2020 20:40:02 +0300
Subject: [PATCH 71/89] Streams are now generated with acc

---
 acc/src/code_generator.c | 42 ++++++++++++++++++++++++++++++++++++++++
 include/astaroth.h       | 23 ----------------------
 2 files changed, 42 insertions(+), 23 deletions(-)

diff --git a/acc/src/code_generator.c b/acc/src/code_generator.c
index 8738590..2799429 100644
--- a/acc/src/code_generator.c
+++ b/acc/src/code_generator.c
@@ -707,6 +707,48 @@ generate_headers(void)
       end type AcMeshInfo
     )";
     fprintf(FHEADER, "%s\n", fortran_structs);
+
+    // Streams
+    const size_t nstreams = 20;
+    for (size_t i = 0; i < nstreams; ++i) {
+        fprintf(DSLHEADER, "#define STREAM_%lu (%lu)\n", i, i);
+        fprintf(FHEADER, "integer(c_int), parameter :: STREAM_%lu = %lu\n", i, i);
+    }
+    fprintf(DSLHEADER, "#define NUM_STREAMS (%lu)\n", nstreams);
+    fprintf(DSLHEADER, "#define STREAM_DEFAULT (STREAM_0)\n");
+    fprintf(DSLHEADER, "#define STREAM_ALL (NUM_STREAMS)\n");
+    fprintf(FHEADER, "integer(c_int), parameter :: NUM_STREAMS = %lu\n", nstreams);
+    fprintf(FHEADER, "integer(c_int), parameter :: STREAM_DEFAULT = STREAM_0\n");
+    fprintf(FHEADER, "integer(c_int), parameter :: STREAM_ALL = NUM_STREAMS\n");
+
+    fprintf(DSLHEADER, "typedef int Stream;\n");
+    /*
+    // Reduction types
+    const size_t counter = 0;
+    fprintf(DSLHEADER, "#define RTYPE_MAX (%lu)\n", counter);
+    fprintf(FHEADER, "integer(c_int), parameter :: RTYPE_MAX = %lu\n", counter);
+    ++counter;
+
+    fprintf(DSLHEADER, "#define RTYPE_MIN (%lu)\n", counter);
+    fprintf(FHEADER, "integer(c_int), parameter :: RTYPE_MIN = %lu\n", counter);
+    ++counter;
+
+    fprintf(DSLHEADER, "#define RTYPE_RMS (%lu)\n", counter);
+    fprintf(FHEADER, "integer(c_int), parameter :: RTYPE_RMS = %lu\n", counter);
+    ++counter;
+
+    fprintf(DSLHEADER, "#define RTYPE_RMS_EXP (%lu)\n", counter);
+    fprintf(FHEADER, "integer(c_int), parameter :: RTYPE_RMS_EXP = %lu\n", counter);
+    ++counter;
+
+    fprintf(DSLHEADER, "#define RTYPE_SUM (%lu)\n", counter);
+    fprintf(FHEADER, "integer(c_int), parameter :: RTYPE_SUM = %lu\n", counter);
+    ++counter;
+
+    fprintf(DSLHEADER, "#define RTYPE_MAX (%lu)\n", counter);
+    fprintf(FHEADER, "integer(c_int), parameter :: RTYPE_MAX = %lu\n", counter);
+    ++counter;
+    */
 }
 
 static void
diff --git a/include/astaroth.h b/include/astaroth.h
index d72d598..61a4f91 100644
--- a/include/astaroth.h
+++ b/include/astaroth.h
@@ -59,29 +59,6 @@ typedef enum {
     NUM_REDUCTION_TYPES
 } ReductionType;
 
-typedef enum {
-    STREAM_DEFAULT,
-    STREAM_0,
-    STREAM_1,
-    STREAM_2,
-    STREAM_3,
-    STREAM_4,
-    STREAM_5,
-    STREAM_6,
-    STREAM_7,
-    STREAM_8,
-    STREAM_9,
-    STREAM_10,
-    STREAM_11,
-    STREAM_12,
-    STREAM_13,
-    STREAM_14,
-    STREAM_15,
-    STREAM_16,
-    NUM_STREAMS
-} Stream;
-#define STREAM_ALL (NUM_STREAMS)
-
 #define AC_GEN_ID(X) X,
 typedef enum {
     AC_FOR_USER_INT_PARAM_TYPES(AC_GEN_ID) //

From 6f59890a3fa949a6b85646db5850f03309eeccc5 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Fri, 26 Jun 2020 09:52:33 +0300
Subject: [PATCH 72/89] Added loading and storing functions to the fortran
 interface

---
 include/astaroth_fortran.h   |  6 ++++--
 src/core/astaroth_fortran.cc | 29 +++++++++++++++++++++++------
 2 files changed, 27 insertions(+), 8 deletions(-)

diff --git a/include/astaroth_fortran.h b/include/astaroth_fortran.h
index 11a37a9..e0a708d 100644
--- a/include/astaroth_fortran.h
+++ b/include/astaroth_fortran.h
@@ -23,9 +23,11 @@ void acupdatebuiltinparams_(AcMeshInfo* info);
 
 void acdeviceswapbuffers_(const Device* device);
 
-void acdeviceloadmesh_(const Device* device, const Stream* stream, const AcMesh* host_mesh);
+void acdeviceloadmesh_(const Device* device, const Stream* stream, const AcMeshInfo* info,
+                       const int* num_farrays, AcReal* farray);
 
-void acdevicestoremesh_(const Device* device, const Stream* stream, AcMesh* host_mesh);
+void acdevicestoremesh_(const Device* device, const Stream* stream, const AcMeshInfo* info,
+                        const int* num_farrays, AcReal* farray);
 
 void acdeviceintegratesubstep_(const Device* device, const Stream* stream, const int* step_number,
                                const int3* start, const int3* end, const AcReal* dt);
diff --git a/src/core/astaroth_fortran.cc b/src/core/astaroth_fortran.cc
index e397a9a..0c7e715 100644
--- a/src/core/astaroth_fortran.cc
+++ b/src/core/astaroth_fortran.cc
@@ -2,6 +2,7 @@
 
 #include "astaroth.h"
 #include "astaroth_utils.h"
+#include "errchk.h"
 
 void
 acdevicecreate_(const int* id, const AcMeshInfo* info, Device* handle)
@@ -38,17 +39,33 @@ acdeviceswapbuffers_(const Device* device)
 }
 
 void
-acdeviceloadmesh_(const Device* device, const Stream* stream, const AcMesh* host_mesh)
+acdeviceloadmesh_(const Device* device, const Stream* stream, const AcMeshInfo* info,
+                  const int* num_farrays, AcReal* farray)
 {
-    // TODO construct AcMesh from fortran farray
-    acDeviceLoadMesh(*device, *stream, *host_mesh);
+    ERRCHK_ALWAYS(*num_farrays == NUM_VTXBUF_HANDLES);
+    const size_t mxyz = info->int_params[AC_mx] * info->int_params[AC_mx] * info->int_params[AC_mx];
+
+    AcMesh mesh;
+    mesh.info = *info;
+    for (int i = 0; i < *num_farrays; ++i)
+        mesh.vertex_buffer[i] = &farray[i * mxyz];
+
+    acDeviceLoadMesh(*device, *stream, mesh);
 }
 
 void
-acdevicestoremesh_(const Device* device, const Stream* stream, AcMesh* host_mesh)
+acdevicestoremesh_(const Device* device, const Stream* stream, const AcMeshInfo* info,
+                   const int* num_farrays, AcReal* farray)
 {
-    // TODO construct AcMesh from fortran farray
-    acDeviceStoreMesh(*device, *stream, host_mesh);
+    ERRCHK_ALWAYS(*num_farrays == NUM_VTXBUF_HANDLES);
+    AcMesh mesh;
+    mesh.info = *info;
+
+    const size_t mxyz = info->int_params[AC_mx] * info->int_params[AC_mx] * info->int_params[AC_mx];
+    for (int i = 0; i < *num_farrays; ++i)
+        mesh.vertex_buffer[i] = &farray[i * mxyz];
+
+    acDeviceStoreMesh(*device, *stream, &mesh);
 }
 
 void

From e764725564c28746a44539ab8bb62ee66e766ffd Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Fri, 26 Jun 2020 09:54:17 +0300
Subject: [PATCH 73/89] acUpdateBuiltinParams now recalculates AC_inv_dsx and
 others if necessary

---
 src/utils/config_loader.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/utils/config_loader.c b/src/utils/config_loader.c
index 2715387..6d7f930 100644
--- a/src/utils/config_loader.c
+++ b/src/utils/config_loader.c
@@ -92,14 +92,17 @@ acUpdateBuiltinParams(AcMeshInfo* config)
     config->int_params[AC_nz_min] = STENCIL_ORDER / 2;
     config->int_params[AC_nz_max] = config->int_params[AC_nz] + STENCIL_ORDER / 2;
 
-    /*
-    // DEPRECATED: Spacing TODO
     // These do not have to be defined by empty projects any more.
     // These should be set only if stdderiv.h is included
+    #ifdef AC_dsx
     config->real_params[AC_inv_dsx] = (AcReal)(1.) / config->real_params[AC_dsx];
+    #endif
+    #ifdef AC_dsy
     config->real_params[AC_inv_dsy] = (AcReal)(1.) / config->real_params[AC_dsy];
+    #endif
+    #ifdef AC_dsz
     config->real_params[AC_inv_dsz] = (AcReal)(1.) / config->real_params[AC_dsz];
-    */
+    #endif
 
     /* Additional helper params */
     // Int helpers

From 50fb54f1aa0f2e4f73aa8fa2f0d73471c11ba9b4 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Sun, 28 Jun 2020 18:14:54 +0300
Subject: [PATCH 74/89] Added more warnings since its easy to make off-by-one
 mistakes when dealing with fortran-c-interop

---
 src/core/astaroth_fortran.cc     | 12 ++++++++++--
 src/core/kernels/integration.cuh |  2 ++
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/src/core/astaroth_fortran.cc b/src/core/astaroth_fortran.cc
index 0c7e715..d974aa1 100644
--- a/src/core/astaroth_fortran.cc
+++ b/src/core/astaroth_fortran.cc
@@ -42,7 +42,8 @@ void
 acdeviceloadmesh_(const Device* device, const Stream* stream, const AcMeshInfo* info,
                   const int* num_farrays, AcReal* farray)
 {
-    ERRCHK_ALWAYS(*num_farrays == NUM_VTXBUF_HANDLES);
+    ERRCHK_ALWAYS(*num_farrays >= NUM_VTXBUF_HANDLES);
+    WARNCHK_ALWAYS(*num_farrays == NUM_VTXBUF_HANDLES);
     const size_t mxyz = info->int_params[AC_mx] * info->int_params[AC_mx] * info->int_params[AC_mx];
 
     AcMesh mesh;
@@ -57,7 +58,8 @@ void
 acdevicestoremesh_(const Device* device, const Stream* stream, const AcMeshInfo* info,
                    const int* num_farrays, AcReal* farray)
 {
-    ERRCHK_ALWAYS(*num_farrays == NUM_VTXBUF_HANDLES);
+    ERRCHK_ALWAYS(*num_farrays >= NUM_VTXBUF_HANDLES);
+    WARNCHK_ALWAYS(*num_farrays == NUM_VTXBUF_HANDLES);
     AcMesh mesh;
     mesh.info = *info;
 
@@ -109,3 +111,9 @@ acdeviceloadmeshinfo_(const Device* device, const AcMeshInfo* info)
 {
     acDeviceLoadMeshInfo(*device, *info);
 }
+
+void
+acgetdevicecount_(int* count)
+{
+    ERRCHK_CUDA_ALWAYS(cudaGetDeviceCount(count));
+}
diff --git a/src/core/kernels/integration.cuh b/src/core/kernels/integration.cuh
index 0d88ac3..40c6b15 100644
--- a/src/core/kernels/integration.cuh
+++ b/src/core/kernels/integration.cuh
@@ -215,6 +215,8 @@ AcResult
 acKernelIntegrateSubstep(const cudaStream_t stream, const int step_number, const int3 start,
                          const int3 end, VertexBufferArray vba)
 {
+    ERRCHK_ALWAYS(step_number >= 0);
+    ERRCHK_ALWAYS(step_number < 3);
     const dim3 tpb = rk3_tpb;
 
     const int3 n = end - start;

From 852fae17cf7488652e3bf19106ab80c13a839856 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Sun, 28 Jun 2020 18:15:40 +0300
Subject: [PATCH 75/89] Added a function for getting the GPU count from fortran

---
 include/astaroth_fortran.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/astaroth_fortran.h b/include/astaroth_fortran.h
index e0a708d..9c02c1f 100644
--- a/include/astaroth_fortran.h
+++ b/include/astaroth_fortran.h
@@ -45,6 +45,8 @@ void acdevicesynchronizestream_(const Device* device, const Stream* stream);
 
 void acdeviceloadmeshinfo_(const Device* device, const AcMeshInfo* info);
 
+void acgetdevicecount_(int* count);
+
 #ifdef __cplusplus
 } // extern "C"
 #endif

From d0ca1f81959511c70a9506fcd2170da6d62f443d Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Sun, 28 Jun 2020 18:16:19 +0300
Subject: [PATCH 76/89] Reduction types are now generated with acc instead of
 being explicitly declared in astaroth.h

---
 acc/src/code_generator.c | 17 +++++++----------
 include/astaroth.h       |  3 +++
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/acc/src/code_generator.c b/acc/src/code_generator.c
index 2799429..11711c0 100644
--- a/acc/src/code_generator.c
+++ b/acc/src/code_generator.c
@@ -697,13 +697,11 @@ generate_headers(void)
 
     // Do Fortran-specific
     const char* fortran_structs = R"(
-      integer, parameter :: precision = c_float ! TODO WARNING
-
       type, bind(C) :: AcMeshInfo
         integer(c_int), dimension(AC_NUM_INT_PARAMS)      :: int_params
         integer(c_int), dimension(AC_NUM_INT3_PARAMS, 3)  :: int3_params
-        real(precision), dimension(AC_NUM_REAL_PARAMS)    :: real_params
-        real(precision), dimension(AC_NUM_REAL3_PARAMS, 3) :: real3_params
+        real, dimension(AC_NUM_REAL_PARAMS)    :: real_params
+        real, dimension(AC_NUM_REAL3_PARAMS, 3) :: real3_params
       end type AcMeshInfo
     )";
     fprintf(FHEADER, "%s\n", fortran_structs);
@@ -722,9 +720,9 @@ generate_headers(void)
     fprintf(FHEADER, "integer(c_int), parameter :: STREAM_ALL = NUM_STREAMS\n");
 
     fprintf(DSLHEADER, "typedef int Stream;\n");
-    /*
+
     // Reduction types
-    const size_t counter = 0;
+    size_t counter = 0;
     fprintf(DSLHEADER, "#define RTYPE_MAX (%lu)\n", counter);
     fprintf(FHEADER, "integer(c_int), parameter :: RTYPE_MAX = %lu\n", counter);
     ++counter;
@@ -745,10 +743,9 @@ generate_headers(void)
     fprintf(FHEADER, "integer(c_int), parameter :: RTYPE_SUM = %lu\n", counter);
     ++counter;
 
-    fprintf(DSLHEADER, "#define RTYPE_MAX (%lu)\n", counter);
-    fprintf(FHEADER, "integer(c_int), parameter :: RTYPE_MAX = %lu\n", counter);
-    ++counter;
-    */
+    fprintf(DSLHEADER, "typedef int ReductionType;\n");
+    fprintf(DSLHEADER, "#define NUM_REDUCTION_TYPES (%lu)\n", counter);
+    fprintf(FHEADER, "integer(c_int), parameter :: NUM_REDUCTION_TYPES = %lu\n", counter);
 }
 
 static void
diff --git a/include/astaroth.h b/include/astaroth.h
index 61a4f91..be323ef 100644
--- a/include/astaroth.h
+++ b/include/astaroth.h
@@ -50,6 +50,8 @@ typedef struct {
 
 typedef enum { AC_SUCCESS = 0, AC_FAILURE = 1 } AcResult;
 
+/*
+// Deprecated, defined during code generation
 typedef enum {
     RTYPE_MAX,
     RTYPE_MIN,
@@ -58,6 +60,7 @@ typedef enum {
     RTYPE_SUM,
     NUM_REDUCTION_TYPES
 } ReductionType;
+*/
 
 #define AC_GEN_ID(X) X,
 typedef enum {

From 6cab3586cfd2c7585b2bb7647fed53e950a45b83 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Mon, 29 Jun 2020 01:06:30 +0300
Subject: [PATCH 77/89] The generated fortran header is now consistent with
 fortran conventions. Also cleaned up the C version of the header.

---
 acc/src/code_generator.c                 | 48 +++++++++++++++-------
 src/core/astaroth_fortran.cc             | 51 ++++++++++++------------
 {include => src/core}/astaroth_fortran.h | 12 +++---
 3 files changed, 65 insertions(+), 46 deletions(-)
 rename {include => src/core}/astaroth_fortran.h (97%)

diff --git a/acc/src/code_generator.c b/acc/src/code_generator.c
index 11711c0..02aa6e0 100644
--- a/acc/src/code_generator.c
+++ b/acc/src/code_generator.c
@@ -43,7 +43,7 @@ static FILE* FHEADER    = NULL;
 
 static const char* dslheader_filename  = "user_defines.h";
 static const char* cudaheader_filename = "user_kernels.h";
-static const char* fheader_filename    = "astaroth.f90";
+static const char* fheader_filename    = "astaroth_fortran.h";
 
 // Forward declaration of yyparse
 int yyparse(void);
@@ -601,12 +601,32 @@ generate_preprocessed_structures(void)
 static void
 generate_headers(void)
 {
-    int enumcounter = 0;
+    // Fortran interface
+    const char* fortran_interface = R"(
+!  -*-f90-*-  (for emacs)    vim:set filetype=fortran:  (for vim)
+
+! Utils (see astaroth_fortran.cc for definitions)
+external acupdatebuiltinparams
+external acgetdevicecount
+
+! Device interface (see astaroth_fortran.cc for definitions)
+external acdevicecreate, acdevicedestroy
+external acdeviceprintinfo
+external acdeviceloadmeshinfo
+external acdeviceloadmesh, acdevicestoremesh
+external acdeviceintegratesubstep
+external acdeviceperiodicboundconds
+external acdeviceswapbuffers
+external acdevicereducescal, acdevicereducevec
+external acdevicesynchronizestream
+  )";
+    fprintf(FHEADER, "%s\n", fortran_interface);
 
     fprintf(DSLHEADER, "#pragma once\n");
 
     // Int params
     fprintf(DSLHEADER, "#define AC_FOR_USER_INT_PARAM_TYPES(FUNC)");
+    int enumcounter = 0;
     for (size_t i = 0; i < num_symbols[current_nest]; ++i) {
         if (symbol_table[i].type_specifier == INT && symbol_table[i].type_qualifier == UNIFORM) {
             fprintf(DSLHEADER, "\\\nFUNC(%s)", symbol_table[i].identifier);
@@ -695,17 +715,6 @@ generate_headers(void)
     fprintf(DSLHEADER, "\n\n");
     fprintf(FHEADER, "integer(c_int), parameter :: AC_NUM_SCALARRAY_HANDLES = %d\n\n", enumcounter);
 
-    // Do Fortran-specific
-    const char* fortran_structs = R"(
-      type, bind(C) :: AcMeshInfo
-        integer(c_int), dimension(AC_NUM_INT_PARAMS)      :: int_params
-        integer(c_int), dimension(AC_NUM_INT3_PARAMS, 3)  :: int3_params
-        real, dimension(AC_NUM_REAL_PARAMS)    :: real_params
-        real, dimension(AC_NUM_REAL3_PARAMS, 3) :: real3_params
-      end type AcMeshInfo
-    )";
-    fprintf(FHEADER, "%s\n", fortran_structs);
-
     // Streams
     const size_t nstreams = 20;
     for (size_t i = 0; i < nstreams; ++i) {
@@ -719,7 +728,7 @@ generate_headers(void)
     fprintf(FHEADER, "integer(c_int), parameter :: STREAM_DEFAULT = STREAM_0\n");
     fprintf(FHEADER, "integer(c_int), parameter :: STREAM_ALL = NUM_STREAMS\n");
 
-    fprintf(DSLHEADER, "typedef int Stream;\n");
+    fprintf(DSLHEADER, "typedef int Stream;\n\n");
 
     // Reduction types
     size_t counter = 0;
@@ -746,6 +755,17 @@ generate_headers(void)
     fprintf(DSLHEADER, "typedef int ReductionType;\n");
     fprintf(DSLHEADER, "#define NUM_REDUCTION_TYPES (%lu)\n", counter);
     fprintf(FHEADER, "integer(c_int), parameter :: NUM_REDUCTION_TYPES = %lu\n", counter);
+
+    // Fortran structs
+    const char* fortran_structs = R"(
+type, bind(C) :: AcMeshInfo
+  integer(c_int), dimension(AC_NUM_INT_PARAMS)      :: int_params
+  integer(c_int), dimension(AC_NUM_INT3_PARAMS, 3)  :: int3_params
+  real, dimension(AC_NUM_REAL_PARAMS)               :: real_params
+  real, dimension(AC_NUM_REAL3_PARAMS, 3)           :: real3_params
+end type AcMeshInfo
+  )";
+    fprintf(FHEADER, "%s\n", fortran_structs);
 }
 
 static void
diff --git a/src/core/astaroth_fortran.cc b/src/core/astaroth_fortran.cc
index d974aa1..80ee033 100644
--- a/src/core/astaroth_fortran.cc
+++ b/src/core/astaroth_fortran.cc
@@ -4,38 +4,46 @@
 #include "astaroth_utils.h"
 #include "errchk.h"
 
+/**
+ * Utils
+ */
+void
+acupdatebuiltinparams_(AcMeshInfo* info)
+{
+    acUpdateBuiltinParams(info);
+}
+
+void
+acgetdevicecount_(int* count)
+{
+    ERRCHK_CUDA_ALWAYS(cudaGetDeviceCount(count));
+}
+
+/**
+ * Device
+ */
 void
 acdevicecreate_(const int* id, const AcMeshInfo* info, Device* handle)
 {
-    // TODO errorcheck
     acDeviceCreate(*id, *info, handle);
 }
 
 void
 acdevicedestroy_(Device* device)
 {
-    // TODO errorcheck
     acDeviceDestroy(*device);
 }
 
 void
 acdeviceprintinfo_(const Device* device)
 {
-    // TODO errorcheck
     acDevicePrintInfo(*device);
 }
 
 void
-acupdatebuiltinparams_(AcMeshInfo* info)
+acdeviceloadmeshinfo_(const Device* device, const AcMeshInfo* info)
 {
-    // TODO errorcheck
-    acUpdateBuiltinParams(info);
-}
-
-void
-acdeviceswapbuffers_(const Device* device)
-{
-    acDeviceSwapBuffers(*device);
+    acDeviceLoadMeshInfo(*device, *info);
 }
 
 void
@@ -81,10 +89,15 @@ void
 acdeviceperiodicboundconds_(const Device* device, const Stream* stream, const int3* start,
                             const int3* end)
 {
-
     acDevicePeriodicBoundconds(*device, *stream, *start, *end);
 }
 
+void
+acdeviceswapbuffers_(const Device* device)
+{
+    acDeviceSwapBuffers(*device);
+}
+
 void
 acdevicereducescal_(const Device* device, const Stream* stream, const ReductionType* rtype,
                     const VertexBufferHandle* vtxbuf_handle, AcReal* result)
@@ -105,15 +118,3 @@ acdevicesynchronizestream_(const Device* device, const Stream* stream)
 {
     acDeviceSynchronizeStream(*device, *stream);
 }
-
-void
-acdeviceloadmeshinfo_(const Device* device, const AcMeshInfo* info)
-{
-    acDeviceLoadMeshInfo(*device, *info);
-}
-
-void
-acgetdevicecount_(int* count)
-{
-    ERRCHK_CUDA_ALWAYS(cudaGetDeviceCount(count));
-}
diff --git a/include/astaroth_fortran.h b/src/core/astaroth_fortran.h
similarity index 97%
rename from include/astaroth_fortran.h
rename to src/core/astaroth_fortran.h
index 9c02c1f..fa0ba3a 100644
--- a/include/astaroth_fortran.h
+++ b/src/core/astaroth_fortran.h
@@ -10,6 +10,8 @@ extern "C" {
  */
 void acupdatebuiltinparams_(AcMeshInfo* info);
 
+void acgetdevicecount_(int* count);
+
 /**
  * Device
  */
@@ -19,9 +21,7 @@ void acdevicedestroy_(Device* device);
 
 void acdeviceprintinfo_(const Device* device);
 
-void acupdatebuiltinparams_(AcMeshInfo* info);
-
-void acdeviceswapbuffers_(const Device* device);
+void acdeviceloadmeshinfo_(const Device* device, const AcMeshInfo* info);
 
 void acdeviceloadmesh_(const Device* device, const Stream* stream, const AcMeshInfo* info,
                        const int* num_farrays, AcReal* farray);
@@ -34,6 +34,8 @@ void acdeviceintegratesubstep_(const Device* device, const Stream* stream, const
 void acdeviceperiodicboundconds_(const Device* device, const Stream* stream, const int3* start,
                                  const int3* end);
 
+void acdeviceswapbuffers_(const Device* device);
+
 void acdevicereducescal_(const Device* device, const Stream* stream, const ReductionType* rtype,
                          const VertexBufferHandle* vtxbuf_handle, AcReal* result);
 
@@ -43,10 +45,6 @@ void acdevicereducevec_(const Device* device, const Stream* stream, const Reduct
 
 void acdevicesynchronizestream_(const Device* device, const Stream* stream);
 
-void acdeviceloadmeshinfo_(const Device* device, const AcMeshInfo* info);
-
-void acgetdevicecount_(int* count);
-
 #ifdef __cplusplus
 } // extern "C"
 #endif

From 003c202e8caea5a1a3d0be9091237a18bd5a7cd3 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Wed, 29 Jul 2020 16:39:24 +0300
Subject: [PATCH 78/89] Pulled useful changes from the benchmark branch.
 GPUDirect RDMA (unpinned) is now the default for MPI communication.

---
 acc/src/code_generator.c           |   2 +-
 samples/benchmark/main.cc          |  14 +-
 samples/genbenchmarkscripts/main.c |  15 +-
 src/core/device.cc                 | 229 ++++++++++++++++++++++++++++-
 4 files changed, 238 insertions(+), 22 deletions(-)

diff --git a/acc/src/code_generator.c b/acc/src/code_generator.c
index 02aa6e0..aab16eb 100644
--- a/acc/src/code_generator.c
+++ b/acc/src/code_generator.c
@@ -43,7 +43,7 @@ static FILE* FHEADER    = NULL;
 
 static const char* dslheader_filename  = "user_defines.h";
 static const char* cudaheader_filename = "user_kernels.h";
-static const char* fheader_filename    = "astaroth_fortran.h";
+static const char* fheader_filename    = "astaroth.f90";
 
 // Forward declaration of yyparse
 int yyparse(void);
diff --git a/samples/benchmark/main.cc b/samples/benchmark/main.cc
index 962a316..16a99df 100644
--- a/samples/benchmark/main.cc
+++ b/samples/benchmark/main.cc
@@ -207,24 +207,18 @@ main(int argc, char** argv)
                 results[nth_percentile * num_iters], 100 * nth_percentile);
 
         char path[4096] = "";
-        if (test == TEST_STRONG_SCALING)
-            strncpy(path, "strong_scaling.csv", sizeof(path));
-        else if (test == TEST_WEAK_SCALING)
-            strncpy(path, "weak_scaling.csv", sizeof(path));
-        else
-            ERROR("Invalid test type");
+        sprintf(path, "%s_%d.csv", test == TEST_STRONG_SCALING ? "strong" : "weak", nprocs);
 
         FILE* fp = fopen(path, "a");
         ERRCHK_ALWAYS(fp);
         // Format
-        // nprocs, measured (ms)
-        fprintf(fp, "%d, %g\n", nprocs, results[nth_percentile * num_iters]);
-
+        // nprocs, min, 50th perc, 90th perc, max
+        fprintf(fp, "%d, %g, %g, %g, %g\n", nprocs, results[0], results[0.5 * num_iters], results[nth_percentile * num_iters], results[num_iters-1]);
         fclose(fp);
     }
 
     /*
-const size_t num_iters      = 100;
+const size_t num_iters      = 1000;
 const double nth_percentile = 0.90;
 
 std::vector<double> results; // ms
diff --git a/samples/genbenchmarkscripts/main.c b/samples/genbenchmarkscripts/main.c
index a45bf1a..ce782ed 100644
--- a/samples/genbenchmarkscripts/main.c
+++ b/samples/genbenchmarkscripts/main.c
@@ -21,6 +21,7 @@ main(void)
         fprintf(fp, "#SBATCH --time=00:14:59\n");
         fprintf(fp, "#SBATCH --mem=32000\n");
         fprintf(fp, "#SBATCH --partition=gpu\n");
+        fprintf(fp, "#SBATCH --cpus-per-task=10\n");
 
         // nprocs, nodes, gpus
         const int max_gpus_per_node = 4;
@@ -29,22 +30,30 @@ main(void)
         fprintf(fp, "#SBATCH --gres=gpu:v100:%d\n", gpus_per_node);
         fprintf(fp, "#SBATCH -n %d\n", nprocs);
         fprintf(fp, "#SBATCH -N %d\n", nodes);
-        fprintf(fp, "#SBATCH --exclusive\n");
+        //fprintf(fp, "#SBATCH --exclusive\n");
+        if (nprocs > 4)
+            fprintf(fp, "#SBATCH --ntasks-per-socket=2\n");
 
         // Modules
-        fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake hpcx-mpi/2.5.0-cuda nccl\n");
+        // OpenMPI
+        fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake openmpi nccl\n");
+        // HPCX
+        //fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake hpcx-mpi/2.5.0-cuda nccl\n");
         fprintf(fp, "export UCX_MEMTYPE_CACHE=n\n");
 
         // Profile and run
-        fprintf(fp, "mkdir -p profile_%d\n", nprocs);
+        //fprintf(fp, "mkdir -p profile_%d\n", nprocs);
 
         const int nx = 256; // max size 1792;
         const int ny = nx;
         const int nz = nx;
+        /*
         fprintf(fp,
                 //"srun nvprof --annotate-mpi openmpi -o profile_%d/%%p.nvprof ./benchmark %d %d "
                 //"%d\n",
                 "srun ./benchmark %d %d %d\n", nx, ny, nz);
+        */
+        fprintf(fp, "srun ./benchmark %d %d %d\n", nx, ny, nz);
 
         fclose(fp);
     }
diff --git a/src/core/device.cc b/src/core/device.cc
index 689eaf7..e465017 100644
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -16,7 +16,7 @@
 #define MPI_COMPUTE_ENABLED (1)
 #define MPI_COMM_ENABLED (1)
 #define MPI_INCL_CORNERS (0)
-#define MPI_USE_PINNED (1)              // Do inter-node comm with pinned memory
+#define MPI_USE_PINNED (0)              // Do inter-node comm with pinned memory
 #define MPI_USE_CUDA_DRIVER_PINNING (0) // Pin with cuPointerSetAttribute, otherwise cudaMallocHost
 
 #include <cuda.h> // CUDA driver API (needed if MPI_USE_CUDA_DRIVER_PINNING is set)
@@ -115,7 +115,7 @@ AcResult
 acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device_handle)
 {
     cudaSetDevice(id);
-    // cudaDeviceReset(); // Would be good for safety, but messes stuff up if we want to emulate
+    cudaDeviceReset(); // Would be good for safety, but messes stuff up if we want to emulate
     // multiple devices with a single GPU
 
     // Create Device
@@ -1169,10 +1169,8 @@ acTransferCommData(const Device device, //
 static void
 acTransferCommDataWait(const CommData data)
 {
-    for (size_t i = 0; i < data.count; ++i) {
-        MPI_Wait(&data.send_reqs[i], MPI_STATUS_IGNORE);
-        MPI_Wait(&data.recv_reqs[i], MPI_STATUS_IGNORE);
-    }
+    MPI_Waitall(data.count, data.recv_reqs, MPI_STATUSES_IGNORE);
+    MPI_Waitall(data.count, data.send_reqs, MPI_STATUSES_IGNORE);
 }
 
 typedef struct {
@@ -1337,8 +1335,10 @@ acGridStoreMesh(const Stream stream, AcMesh* host_mesh)
     return AC_SUCCESS;
 }
 
+/*
+// Unused
 AcResult
-acGridIntegrate(const Stream stream, const AcReal dt)
+acGridIntegratePipelined(const Stream stream, const AcReal dt)
 {
     ERRCHK(grid.initialized);
     acGridSynchronizeStream(stream);
@@ -1549,6 +1549,220 @@ acGridIntegrate(const Stream stream, const AcReal dt)
     acDeviceSynchronizeStream(device, STREAM_ALL); // Wait until inner and outer done
     return AC_SUCCESS;
 }
+*/
+
+AcResult
+acGridIntegrate(const Stream stream, const AcReal dt)
+{
+    ERRCHK(grid.initialized);
+    acGridSynchronizeStream(stream);
+
+    const Device device = grid.device;
+    const int3 nn       = grid.nn;
+#if MPI_INCL_CORNERS
+    CommData corner_data = grid.corner_data; // Do not rm: required for corners
+#endif                                       // MPI_INCL_CORNERS
+    CommData edgex_data  = grid.edgex_data;
+    CommData edgey_data  = grid.edgey_data;
+    CommData edgez_data  = grid.edgez_data;
+    CommData sidexy_data = grid.sidexy_data;
+    CommData sidexz_data = grid.sidexz_data;
+    CommData sideyz_data = grid.sideyz_data;
+
+    acDeviceSynchronizeStream(device, stream);
+
+// Corners
+#if MPI_INCL_CORNERS
+    // Do not rm: required for corners
+    const int3 corner_b0s[] = {
+        (int3){0, 0, 0},
+        (int3){NGHOST + nn.x, 0, 0},
+        (int3){0, NGHOST + nn.y, 0},
+        (int3){0, 0, NGHOST + nn.z},
+
+        (int3){NGHOST + nn.x, NGHOST + nn.y, 0},
+        (int3){NGHOST + nn.x, 0, NGHOST + nn.z},
+        (int3){0, NGHOST + nn.y, NGHOST + nn.z},
+        (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST + nn.z},
+    };
+#endif // MPI_INCL_CORNERS
+
+    // Edges X
+    const int3 edgex_b0s[] = {
+        (int3){NGHOST, 0, 0},
+        (int3){NGHOST, NGHOST + nn.y, 0},
+
+        (int3){NGHOST, 0, NGHOST + nn.z},
+        (int3){NGHOST, NGHOST + nn.y, NGHOST + nn.z},
+    };
+
+    // Edges Y
+    const int3 edgey_b0s[] = {
+        (int3){0, NGHOST, 0},
+        (int3){NGHOST + nn.x, NGHOST, 0},
+
+        (int3){0, NGHOST, NGHOST + nn.z},
+        (int3){NGHOST + nn.x, NGHOST, NGHOST + nn.z},
+    };
+
+    // Edges Z
+    const int3 edgez_b0s[] = {
+        (int3){0, 0, NGHOST},
+        (int3){NGHOST + nn.x, 0, NGHOST},
+
+        (int3){0, NGHOST + nn.y, NGHOST},
+        (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST},
+    };
+
+    // Sides XY
+    const int3 sidexy_b0s[] = {
+        (int3){NGHOST, NGHOST, 0},             //
+        (int3){NGHOST, NGHOST, NGHOST + nn.z}, //
+    };
+
+    // Sides XZ
+    const int3 sidexz_b0s[] = {
+        (int3){NGHOST, 0, NGHOST},             //
+        (int3){NGHOST, NGHOST + nn.y, NGHOST}, //
+    };
+
+    // Sides YZ
+    const int3 sideyz_b0s[] = {
+        (int3){0, NGHOST, NGHOST},             //
+        (int3){NGHOST + nn.x, NGHOST, NGHOST}, //
+    };
+
+    for (int isubstep = 0; isubstep < 3; ++isubstep) {
+
+#if MPI_COMM_ENABLED
+#if MPI_INCL_CORNERS
+        acPackCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners
+#endif                                                    // MPI_INCL_CORNERS
+        acPackCommData(device, edgex_b0s, &edgex_data);
+        acPackCommData(device, edgey_b0s, &edgey_data);
+        acPackCommData(device, edgez_b0s, &edgez_data);
+        acPackCommData(device, sidexy_b0s, &sidexy_data);
+        acPackCommData(device, sidexz_b0s, &sidexz_data);
+        acPackCommData(device, sideyz_b0s, &sideyz_data);
+#endif
+
+#if MPI_COMM_ENABLED
+        MPI_Barrier(MPI_COMM_WORLD);
+
+#if MPI_GPUDIRECT_DISABLED
+#if MPI_INCL_CORNERS
+        acTransferCommDataToHost(device, &corner_data); // Do not rm: required for corners
+#endif                                                  // MPI_INCL_CORNERS
+        acTransferCommDataToHost(device, &edgex_data);
+        acTransferCommDataToHost(device, &edgey_data);
+        acTransferCommDataToHost(device, &edgez_data);
+        acTransferCommDataToHost(device, &sidexy_data);
+        acTransferCommDataToHost(device, &sidexz_data);
+        acTransferCommDataToHost(device, &sideyz_data);
+#endif
+#if MPI_INCL_CORNERS
+        acTransferCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners
+#endif                                                        // MPI_INCL_CORNERS
+        acTransferCommData(device, edgex_b0s, &edgex_data);
+        acTransferCommData(device, edgey_b0s, &edgey_data);
+        acTransferCommData(device, edgez_b0s, &edgez_data);
+        acTransferCommData(device, sidexy_b0s, &sidexy_data);
+        acTransferCommData(device, sidexz_b0s, &sidexz_data);
+        acTransferCommData(device, sideyz_b0s, &sideyz_data);
+#endif // MPI_COMM_ENABLED
+
+#if MPI_COMPUTE_ENABLED
+        //////////// INNER INTEGRATION //////////////
+        {
+            const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
+            const int3 m2 = nn;
+            acDeviceIntegrateSubstep(device, STREAM_16, isubstep, m1, m2, dt);
+        }
+////////////////////////////////////////////
+#endif // MPI_COMPUTE_ENABLED
+
+#if MPI_COMM_ENABLED
+#if MPI_INCL_CORNERS
+        acTransferCommDataWait(corner_data); // Do not rm: required for corners
+#endif                                       // MPI_INCL_CORNERS
+        acTransferCommDataWait(edgex_data);
+        acTransferCommDataWait(edgey_data);
+        acTransferCommDataWait(edgez_data);
+        acTransferCommDataWait(sidexy_data);
+        acTransferCommDataWait(sidexz_data);
+        acTransferCommDataWait(sideyz_data);
+
+#if MPI_INCL_CORNERS
+        acUnpinCommData(device, &corner_data); // Do not rm: required for corners
+#endif                                         // MPI_INCL_CORNERS
+        acUnpinCommData(device, &edgex_data);
+        acUnpinCommData(device, &edgey_data);
+        acUnpinCommData(device, &edgez_data);
+        acUnpinCommData(device, &sidexy_data);
+        acUnpinCommData(device, &sidexz_data);
+        acUnpinCommData(device, &sideyz_data);
+
+#if MPI_INCL_CORNERS
+        acUnpackCommData(device, corner_b0s, &corner_data);
+#endif // MPI_INCL_CORNERS
+        acUnpackCommData(device, edgex_b0s, &edgex_data);
+        acUnpackCommData(device, edgey_b0s, &edgey_data);
+        acUnpackCommData(device, edgez_b0s, &edgez_data);
+        acUnpackCommData(device, sidexy_b0s, &sidexy_data);
+        acUnpackCommData(device, sidexz_b0s, &sidexz_data);
+        acUnpackCommData(device, sideyz_b0s, &sideyz_data);
+//////////// OUTER INTEGRATION //////////////
+
+// Wait for unpacking
+#if MPI_INCL_CORNERS
+        acSyncCommData(corner_data); // Do not rm: required for corners
+#endif                               // MPI_INCL_CORNERS
+        acSyncCommData(edgex_data);
+        acSyncCommData(edgey_data);
+        acSyncCommData(edgez_data);
+        acSyncCommData(sidexy_data);
+        acSyncCommData(sidexz_data);
+        acSyncCommData(sideyz_data);
+#endif // MPI_COMM_ENABLED
+#if MPI_COMPUTE_ENABLED
+        { // Front
+            const int3 m1 = (int3){NGHOST, NGHOST, NGHOST};
+            const int3 m2 = m1 + (int3){nn.x, nn.y, NGHOST};
+            acDeviceIntegrateSubstep(device, STREAM_0, isubstep, m1, m2, dt);
+        }
+        { // Back
+            const int3 m1 = (int3){NGHOST, NGHOST, nn.z};
+            const int3 m2 = m1 + (int3){nn.x, nn.y, NGHOST};
+            acDeviceIntegrateSubstep(device, STREAM_1, isubstep, m1, m2, dt);
+        }
+        { // Bottom
+            const int3 m1 = (int3){NGHOST, NGHOST, 2 * NGHOST};
+            const int3 m2 = m1 + (int3){nn.x, NGHOST, nn.z - 2 * NGHOST};
+            acDeviceIntegrateSubstep(device, STREAM_2, isubstep, m1, m2, dt);
+        }
+        { // Top
+            const int3 m1 = (int3){NGHOST, nn.y, 2 * NGHOST};
+            const int3 m2 = m1 + (int3){nn.x, NGHOST, nn.z - 2 * NGHOST};
+            acDeviceIntegrateSubstep(device, STREAM_3, isubstep, m1, m2, dt);
+        }
+        { // Left
+            const int3 m1 = (int3){NGHOST, 2 * NGHOST, 2 * NGHOST};
+            const int3 m2 = m1 + (int3){NGHOST, nn.y - 2 * NGHOST, nn.z - 2 * NGHOST};
+            acDeviceIntegrateSubstep(device, STREAM_4, isubstep, m1, m2, dt);
+        }
+        { // Right
+            const int3 m1 = (int3){nn.x, 2 * NGHOST, 2 * NGHOST};
+            const int3 m2 = m1 + (int3){NGHOST, nn.y - 2 * NGHOST, nn.z - 2 * NGHOST};
+            acDeviceIntegrateSubstep(device, STREAM_5, isubstep, m1, m2, dt);
+        }
+#endif // MPI_COMPUTE_ENABLED
+        acDeviceSwapBuffers(device);
+        acDeviceSynchronizeStream(device, STREAM_ALL); // Wait until inner and outer done
+        ////////////////////////////////////////////
+    }
+
+    return AC_SUCCESS;
+}
 
 AcResult
 acGridPeriodicBoundconds(const Stream stream)
@@ -1774,5 +1988,4 @@ acGridReduceVec(const Stream stream, const ReductionType rtype, const VertexBuff
 
     return acMPIReduceScal(local_result, rtype, result);
 }
-
 #endif // AC_MPI_ENABLED

From 770173a55dbad13bc18366b8a63ad724eae3a79a Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Wed, 29 Jul 2020 13:55:22 +0000
Subject: [PATCH 79/89] Limited automated build time to 5 minutes.

---
 bitbucket-pipelines.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/bitbucket-pipelines.yml b/bitbucket-pipelines.yml
index bc6ac44..6fbb638 100644
--- a/bitbucket-pipelines.yml
+++ b/bitbucket-pipelines.yml
@@ -12,6 +12,8 @@ image: nvidia/cuda
 # ==> Updating the kernel drivers by ourselves probably requires creating our own docker image.
 # ===> Which might not even work since I don't know what kind of hardware we're running on (lspci was not available)
 
+options:
+  max-time: 5 # Max time allowed for building (minutes) 
 pipelines:
   # default: # Default is run at every push but we have only 500 build minutes / month so that probably wouldn't work out
   custom: # Manual/scheduled building only

From 31db032f43fd4acc933abc3a690edf6229099fef Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Wed, 29 Jul 2020 17:05:07 +0300
Subject: [PATCH 80/89] Upped the version number

---
 doxyfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doxyfile b/doxyfile
index b057fd6..a8daf9f 100644
--- a/doxyfile
+++ b/doxyfile
@@ -38,7 +38,7 @@ PROJECT_NAME           = "Astaroth"
 # could be handy for archiving the generated documentation or if some version
 # control system is used.
 
-PROJECT_NUMBER         = 2.2
+PROJECT_NUMBER         = 2.3
 
 # Using the PROJECT_BRIEF tag one can provide an optional one line description
 # for a project that appears at the top of each page and should give viewer a

From cd888be9ec3a82a7fbe0d5f3ad3b695bda6173f7 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Wed, 29 Jul 2020 15:17:37 +0000
Subject: [PATCH 81/89] README.md edited online with Bitbucket

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 289a45f..8ac7978 100644
--- a/README.md
+++ b/README.md
@@ -40,6 +40,7 @@ In the base directory, run
 | Option | Description | Default |
 |--------|-------------|---------|
 | CMAKE_BUILD_TYPE | Selects the build type. Possible values: Debug, Release, RelWithDebInfo, MinSizeRel. See (CMake documentation)[https://cmake.org/cmake/help/latest/variable/CMAKE_BUILD_TYPE.html] for more details. | Release |
+| CMAKE_CUDA_ARCHITECTURES | Selects the supported CUDA architectures. Multiple architectures delimited by `;`. See (CMake documentation)[https://cmake.org/cmake/help/latest/prop_tgt/CUDA_ARCHITECTURES.html] for more details. | 35;37;60;70 |
 | DOUBLE_PRECISION | Generates double precision code. | OFF |
 | BUILD_SAMPLES | Builds projects in samples subdirectory. | OFF |
 | BUILD_STANDALONE | Builds a standalone library for testing, benchmarking and simulation. | ON |

From bb821df6865373c0aa69adf974b37df148692152 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Wed, 29 Jul 2020 15:19:05 +0000
Subject: [PATCH 82/89] README.md edited online with Bitbucket

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 8ac7978..a5c3f72 100644
--- a/README.md
+++ b/README.md
@@ -40,7 +40,7 @@ In the base directory, run
 | Option | Description | Default |
 |--------|-------------|---------|
 | CMAKE_BUILD_TYPE | Selects the build type. Possible values: Debug, Release, RelWithDebInfo, MinSizeRel. See (CMake documentation)[https://cmake.org/cmake/help/latest/variable/CMAKE_BUILD_TYPE.html] for more details. | Release |
-| CMAKE_CUDA_ARCHITECTURES | Selects the supported CUDA architectures. Multiple architectures delimited by `;`. See (CMake documentation)[https://cmake.org/cmake/help/latest/prop_tgt/CUDA_ARCHITECTURES.html] for more details. | 35;37;60;70 |
+| CMAKE_CUDA_ARCHITECTURES | Selects CUDA architecture support. Multiple architectures delimited by `;`. See (CMake documentation)[https://cmake.org/cmake/help/latest/prop_tgt/CUDA_ARCHITECTURES.html] for more details. | "35;37;60;70" |
 | DOUBLE_PRECISION | Generates double precision code. | OFF |
 | BUILD_SAMPLES | Builds projects in samples subdirectory. | OFF |
 | BUILD_STANDALONE | Builds a standalone library for testing, benchmarking and simulation. | ON |

From 5e04a61cd2ebc3c46f47f0f12d0d5216337882ea Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Wed, 29 Jul 2020 15:43:58 +0000
Subject: [PATCH 83/89] README.md edited online with Bitbucket

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index a5c3f72..796beec 100644
--- a/README.md
+++ b/README.md
@@ -40,7 +40,7 @@ In the base directory, run
 | Option | Description | Default |
 |--------|-------------|---------|
 | CMAKE_BUILD_TYPE | Selects the build type. Possible values: Debug, Release, RelWithDebInfo, MinSizeRel. See (CMake documentation)[https://cmake.org/cmake/help/latest/variable/CMAKE_BUILD_TYPE.html] for more details. | Release |
-| CMAKE_CUDA_ARCHITECTURES | Selects CUDA architecture support. Multiple architectures delimited by `;`. See (CMake documentation)[https://cmake.org/cmake/help/latest/prop_tgt/CUDA_ARCHITECTURES.html] for more details. | "35;37;60;70" |
+| CUDA_ARCHITECTURES | Selects CUDA architecture support. Multiple architectures delimited by `;`. See (CMake documentation)[https://cmake.org/cmake/help/latest/prop_tgt/CUDA_ARCHITECTURES.html] for more details. | "60;70" |
 | DOUBLE_PRECISION | Generates double precision code. | OFF |
 | BUILD_SAMPLES | Builds projects in samples subdirectory. | OFF |
 | BUILD_STANDALONE | Builds a standalone library for testing, benchmarking and simulation. | ON |

From 8fb271bbf3a33076d4f9a9bda768093320bfa776 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Wed, 29 Jul 2020 18:45:10 +0300
Subject: [PATCH 84/89] Upped CMake version to 3.18 and cleaned up CUDA
 architecture selection

---
 CMakeLists.txt | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5152379..85ccb88 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,7 +1,8 @@
 ## CMake settings
 # V3.9 required for first-class CUDA support
 # V3.17 required for the FindCUDAToolkit package
-cmake_minimum_required(VERSION 3.17)
+# V3.18 required for CMAKE_CUDA_ARCHITECTURES
+cmake_minimum_required(VERSION 3.18)
 find_program(CMAKE_C_COMPILER NAMES $ENV{CC} gcc PATHS ENV PATH NO_DEFAULT_PATH)
 find_program(CMAKE_CXX_COMPILER NAMES $ENV{CXX} g++ PATHS ENV PATH NO_DEFAULT_PATH)
 
@@ -16,12 +17,19 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COMMON_FLAGS}")
 set(CMAKE_C_STANDARD 11)
 set(CMAKE_CXX_STANDARD 11)
 
-find_package(CUDA) # Still required for various macros, such as cuda_select_nvcc_...
-cuda_select_nvcc_arch_flags(ARCHLIST Common) # Common architectures depend on the available CUDA version. Listed here: https://github.com/Kitware/CMake/blob/master/Modules/FindCUDA/select_compute_arch.cmake
-string(REPLACE ";" " " CUDA_ARCH_FLAGS "${ARCHLIST}")
+## CUDA
+# GPU, compute capability
+# K40, 3.5
+# K80, 3.7
+# P100, 6.0
+# V100, 7.0
+if (NOT CUDA_ARCHITECTURES)
+    set(CMAKE_CUDA_ARCHITECTURES 60 70) # Default
+else ()
+    set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCHITECTURES}) # User-specified
+endif()
 set(COMMON_FLAGS_CUDA "-mavx,-Wall,-Wextra,-Werror,-Wdouble-promotion,-Wfloat-conversion,-Wshadow")
-set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${CUDA_ARCH_FLAGS} -ccbin=${CMAKE_CXX_COMPILER} --compiler-options=${COMMON_FLAGS_CUDA}")
-
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${COMMON_CUDA_FLAGS}")
 
 ## Build type
 if(NOT CMAKE_BUILD_TYPE)

From a5d6fb4303453a3f805dafc3a5dfa57ffc31ac03 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Wed, 29 Jul 2020 19:34:28 +0300
Subject: [PATCH 85/89] Host flags were not propagated to the CUDA compiler,
 fixed

---
 CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 85ccb88..12b8378 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -28,8 +28,8 @@ if (NOT CUDA_ARCHITECTURES)
 else ()
     set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCHITECTURES}) # User-specified
 endif()
-set(COMMON_FLAGS_CUDA "-mavx,-Wall,-Wextra,-Werror,-Wdouble-promotion,-Wfloat-conversion,-Wshadow")
-set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${COMMON_CUDA_FLAGS}")
+string (REPLACE " " "," CUDA_COMMON_FLAGS "${COMMON_FLAGS}")
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --compiler-options=${CUDA_COMMON_FLAGS}")
 
 ## Build type
 if(NOT CMAKE_BUILD_TYPE)

From 3afab7753349408b1734337ce37b5c984493a2c0 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Wed, 29 Jul 2020 19:58:21 +0300
Subject: [PATCH 86/89] Removed astaroth_utils from astaroth_core dependencies

---
 include/astaroth.h           | 14 ++++++++
 include/astaroth_utils.h     | 29 ++++++++---------
 src/core/CMakeLists.txt      |  2 +-
 src/core/astaroth.cc         | 62 ++++++++++++++++++++++++++++++++++++
 src/core/astaroth_fortran.cc |  1 -
 src/core/device.cc           |  1 -
 src/utils/config_loader.c    | 39 -----------------------
 src/utils/memory.c           | 23 -------------
 8 files changed, 91 insertions(+), 80 deletions(-)

diff --git a/include/astaroth.h b/include/astaroth.h
index be323ef..09a1405 100644
--- a/include/astaroth.h
+++ b/include/astaroth.h
@@ -562,6 +562,20 @@ AcResult acDeviceReduceVec(const Device device, const Stream stream_type, const
 /** */
 AcResult acDeviceRunMPITest(void);
 
+/*
+ * =============================================================================
+ * Helper functions
+ * =============================================================================
+ */
+/** Updates the built-in parameters based on nx, ny and nz */
+AcResult acUpdateBuiltinParams(AcMeshInfo* config);
+
+/** Creates a mesh stored in host memory */
+AcResult acMeshCreate(const AcMeshInfo mesh_info, AcMesh* mesh);
+
+/** Destroys a mesh stored in host memory */
+AcResult acMeshDestroy(AcMesh* mesh);
+
 #ifdef __cplusplus
 } // extern "C"
 #endif
diff --git a/include/astaroth_utils.h b/include/astaroth_utils.h
index d96074c..ee5ebd4 100644
--- a/include/astaroth_utils.h
+++ b/include/astaroth_utils.h
@@ -70,20 +70,16 @@ bool printErrorToScreen(const Error error);
 /** Loads data from the config file */
 AcResult acLoadConfig(const char* config_path, AcMeshInfo* config);
 
-/** Updates the built-in parameters based on nx, ny and nz */
-AcResult acUpdateBuiltinParams(AcMeshInfo* config);
+/** */
+AcScalReductionTestCase acCreateScalReductionTestCase(const char* label,
+                                                      const VertexBufferHandle vtxbuf,
+                                                      const ReductionType rtype);
 
 /** */
-AcResult acMeshCreate(const AcMeshInfo mesh_info, AcMesh* mesh);
-
-/** */
-AcResult acMeshDestroy(AcMesh* mesh);
-
-/** */
-AcScalReductionTestCase acCreateScalReductionTestCase(const char* label, const VertexBufferHandle vtxbuf, const ReductionType rtype);
-
-/** */
-AcVecReductionTestCase acCreateVecReductionTestCase(const char* label, const VertexBufferHandle a, const VertexBufferHandle b, const VertexBufferHandle c, const ReductionType rtype);
+AcVecReductionTestCase acCreateVecReductionTestCase(const char* label, const VertexBufferHandle a,
+                                                    const VertexBufferHandle b,
+                                                    const VertexBufferHandle c,
+                                                    const ReductionType rtype);
 
 /** */
 AcResult acMeshSet(const AcReal value, AcMesh* mesh);
@@ -104,16 +100,19 @@ AcResult acModelIntegrateStep(AcMesh mesh, const AcReal dt);
 AcReal acModelReduceScal(const AcMesh mesh, const ReductionType rtype, const VertexBufferHandle a);
 
 /** TODO */
-AcReal acModelReduceVec(const AcMesh mesh, const ReductionType rtype, const VertexBufferHandle a, const VertexBufferHandle b, const VertexBufferHandle c);
+AcReal acModelReduceVec(const AcMesh mesh, const ReductionType rtype, const VertexBufferHandle a,
+                        const VertexBufferHandle b, const VertexBufferHandle c);
 
 /** */
 AcResult acVerifyMesh(const AcMesh model, const AcMesh candidate);
 
 /** */
-AcResult acVerifyScalReductions(const AcMesh model, const AcScalReductionTestCase* testCases, const size_t numCases);
+AcResult acVerifyScalReductions(const AcMesh model, const AcScalReductionTestCase* testCases,
+                                const size_t numCases);
 
 /** */
-AcResult acVerifyVecReductions(const AcMesh model, const AcVecReductionTestCase* testCases, const size_t numCases);
+AcResult acVerifyVecReductions(const AcMesh model, const AcVecReductionTestCase* testCases,
+                               const size_t numCases);
 
 #ifdef __cplusplus
 } // extern "C"
diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index 1a70f93..9e2a556 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -2,7 +2,7 @@ find_package(CUDAToolkit)
 
 ## Astaroth Core
 add_library(astaroth_core STATIC device.cc node.cc astaroth.cc astaroth_fortran.cc)
-target_link_libraries(astaroth_core astaroth_utils astaroth_kernels CUDA::cudart CUDA::cuda_driver)
+target_link_libraries(astaroth_core astaroth_kernels CUDA::cudart CUDA::cuda_driver)
 
 ## Options
 if (MPI_ENABLED)
diff --git a/src/core/astaroth.cc b/src/core/astaroth.cc
index 3821e3a..3a38240 100644
--- a/src/core/astaroth.cc
+++ b/src/core/astaroth.cc
@@ -158,3 +158,65 @@ acGetNode(void)
     ERRCHK_ALWAYS(num_nodes > 0);
     return nodes[0];
 }
+
+AcResult
+acUpdateBuiltinParams(AcMeshInfo* config)
+{
+    config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER;
+    ///////////// PAD TEST
+    // config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER + PAD_SIZE;
+    ///////////// PAD TEST
+    config->int_params[AC_my] = config->int_params[AC_ny] + STENCIL_ORDER;
+    config->int_params[AC_mz] = config->int_params[AC_nz] + STENCIL_ORDER;
+
+    // Bounds for the computational domain, i.e. nx_min <= i < nx_max
+    config->int_params[AC_nx_min] = STENCIL_ORDER / 2;
+    config->int_params[AC_nx_max] = config->int_params[AC_nx_min] + config->int_params[AC_nx];
+    config->int_params[AC_ny_min] = STENCIL_ORDER / 2;
+    config->int_params[AC_ny_max] = config->int_params[AC_ny] + STENCIL_ORDER / 2;
+    config->int_params[AC_nz_min] = STENCIL_ORDER / 2;
+    config->int_params[AC_nz_max] = config->int_params[AC_nz] + STENCIL_ORDER / 2;
+
+// These do not have to be defined by empty projects any more.
+// These should be set only if stdderiv.h is included
+#ifdef AC_dsx
+    config->real_params[AC_inv_dsx] = (AcReal)(1.) / config->real_params[AC_dsx];
+#endif
+#ifdef AC_dsy
+    config->real_params[AC_inv_dsy] = (AcReal)(1.) / config->real_params[AC_dsy];
+#endif
+#ifdef AC_dsz
+    config->real_params[AC_inv_dsz] = (AcReal)(1.) / config->real_params[AC_dsz];
+#endif
+
+    /* Additional helper params */
+    // Int helpers
+    config->int_params[AC_mxy]  = config->int_params[AC_mx] * config->int_params[AC_my];
+    config->int_params[AC_nxy]  = config->int_params[AC_nx] * config->int_params[AC_ny];
+    config->int_params[AC_nxyz] = config->int_params[AC_nxy] * config->int_params[AC_nz];
+
+    return AC_SUCCESS;
+}
+
+AcResult
+acMeshCreate(const AcMeshInfo info, AcMesh* mesh)
+{
+    mesh->info = info;
+
+    const size_t bytes = acVertexBufferSizeBytes(mesh->info);
+    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w) {
+        mesh->vertex_buffer[w] = (AcReal*)malloc(bytes);
+        ERRCHK_ALWAYS(mesh->vertex_buffer[w]);
+    }
+
+    return AC_SUCCESS;
+}
+
+AcResult
+acMeshDestroy(AcMesh* mesh)
+{
+    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
+        free(mesh->vertex_buffer[w]);
+
+    return AC_SUCCESS;
+}
diff --git a/src/core/astaroth_fortran.cc b/src/core/astaroth_fortran.cc
index 80ee033..18aa749 100644
--- a/src/core/astaroth_fortran.cc
+++ b/src/core/astaroth_fortran.cc
@@ -1,7 +1,6 @@
 #include "astaroth_fortran.h"
 
 #include "astaroth.h"
-#include "astaroth_utils.h"
 #include "errchk.h"
 
 /**
diff --git a/src/core/device.cc b/src/core/device.cc
index e465017..5495286 100644
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -2,7 +2,6 @@
 
 #include <string.h>
 
-#include "astaroth_utils.h"
 #include "errchk.h"
 #include "math_utils.h"
 #include "timer_hires.h"
diff --git a/src/utils/config_loader.c b/src/utils/config_loader.c
index 6d7f930..50ed84c 100644
--- a/src/utils/config_loader.c
+++ b/src/utils/config_loader.c
@@ -74,45 +74,6 @@ parse_config(const char* path, AcMeshInfo* config)
     fclose(fp);
 }
 
-AcResult
-acUpdateBuiltinParams(AcMeshInfo* config)
-{
-    config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER;
-    ///////////// PAD TEST
-    // config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER + PAD_SIZE;
-    ///////////// PAD TEST
-    config->int_params[AC_my] = config->int_params[AC_ny] + STENCIL_ORDER;
-    config->int_params[AC_mz] = config->int_params[AC_nz] + STENCIL_ORDER;
-
-    // Bounds for the computational domain, i.e. nx_min <= i < nx_max
-    config->int_params[AC_nx_min] = STENCIL_ORDER / 2;
-    config->int_params[AC_nx_max] = config->int_params[AC_nx_min] + config->int_params[AC_nx];
-    config->int_params[AC_ny_min] = STENCIL_ORDER / 2;
-    config->int_params[AC_ny_max] = config->int_params[AC_ny] + STENCIL_ORDER / 2;
-    config->int_params[AC_nz_min] = STENCIL_ORDER / 2;
-    config->int_params[AC_nz_max] = config->int_params[AC_nz] + STENCIL_ORDER / 2;
-
-    // These do not have to be defined by empty projects any more.
-    // These should be set only if stdderiv.h is included
-    #ifdef AC_dsx
-    config->real_params[AC_inv_dsx] = (AcReal)(1.) / config->real_params[AC_dsx];
-    #endif
-    #ifdef AC_dsy
-    config->real_params[AC_inv_dsy] = (AcReal)(1.) / config->real_params[AC_dsy];
-    #endif
-    #ifdef AC_dsz
-    config->real_params[AC_inv_dsz] = (AcReal)(1.) / config->real_params[AC_dsz];
-    #endif
-
-    /* Additional helper params */
-    // Int helpers
-    config->int_params[AC_mxy]  = config->int_params[AC_mx] * config->int_params[AC_my];
-    config->int_params[AC_nxy]  = config->int_params[AC_nx] * config->int_params[AC_ny];
-    config->int_params[AC_nxyz] = config->int_params[AC_nxy] * config->int_params[AC_nz];
-
-    return AC_SUCCESS;
-}
-
 /**
 \brief Loads data from astaroth.conf into a config struct.
 \return AC_SUCCESS on success, AC_FAILURE if there are potentially uninitialized values.
diff --git a/src/utils/memory.c b/src/utils/memory.c
index ae50872..f587f4d 100644
--- a/src/utils/memory.c
+++ b/src/utils/memory.c
@@ -20,29 +20,6 @@
 
 #include "errchk.h"
 
-AcResult
-acMeshCreate(const AcMeshInfo info, AcMesh* mesh)
-{
-    mesh->info = info;
-
-    const size_t bytes = acVertexBufferSizeBytes(mesh->info);
-    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w) {
-        mesh->vertex_buffer[w] = malloc(bytes);
-        ERRCHK_ALWAYS(mesh->vertex_buffer[w]);
-    }
-
-    return AC_SUCCESS;
-}
-
-AcResult
-acMeshDestroy(AcMesh* mesh)
-{
-    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
-        free(mesh->vertex_buffer[w]);
-
-    return AC_SUCCESS;
-}
-
 AcResult
 acMeshSet(const AcReal value, AcMesh* mesh)
 {

From fca615defbe7784fc23c53d274c9d641e13bfa77 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Wed, 29 Jul 2020 20:01:11 +0300
Subject: [PATCH 87/89] Removed an old unused file

---
 samples/mpitest/main_old.cc | 80 -------------------------------------
 1 file changed, 80 deletions(-)
 delete mode 100644 samples/mpitest/main_old.cc

diff --git a/samples/mpitest/main_old.cc b/samples/mpitest/main_old.cc
deleted file mode 100644
index 16c27e4..0000000
--- a/samples/mpitest/main_old.cc
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
-    Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.
-
-    This file is part of Astaroth.
-
-    Astaroth is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    Astaroth is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
-*/
-/**
-    Running: mpirun -np <num processes> <executable>
-*/
-#include "astaroth.h"
-#include "astaroth_utils.h"
-
-#include <mpi.h>
-
-int
-main(void)
-{
-    MPI_Init(NULL, NULL);
-    int nprocs, pid;
-    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
-    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
-
-    // CPU alloc
-    AcMeshInfo info;
-    acLoadConfig(AC_DEFAULT_CONFIG, &info);
-    info.real_params[AC_inv_dsx]   = AcReal(1.0) / info.real_params[AC_dsx];
-    info.real_params[AC_inv_dsy]   = AcReal(1.0) / info.real_params[AC_dsy];
-    info.real_params[AC_inv_dsz]   = AcReal(1.0) / info.real_params[AC_dsz];
-    info.real_params[AC_cs2_sound] = info.real_params[AC_cs_sound] * info.real_params[AC_cs_sound];
-
-    AcMesh model, candidate;
-    if (pid == 0) {
-        acMeshCreate(info, &model);
-        acMeshCreate(info, &candidate);
-        acMeshRandomize(&model);
-        acMeshRandomize(&candidate);
-    }
-
-    // GPU alloc & compute
-    Grid grid;
-    acGridCreateMPI(info, &grid);
-
-    acGridLoadMeshMPI(grid, STREAM_DEFAULT, model);
-    acGridSynchronizeStreamMPI(grid, STREAM_ALL);
-
-    acGridIntegrateMPI(grid, FLT_EPSILON);
-    acGridSynchronizeStreamMPI(grid, STREAM_ALL);
-    acGridSynchronizeMeshMPI(grid, STREAM_DEFAULT);
-    acGridSynchronizeStreamMPI(grid, STREAM_ALL);
-
-    acGridStoreMeshMPI(grid, STREAM_DEFAULT, &candidate);
-    acGridSynchronizeStreamMPI(grid, STREAM_ALL);
-
-    acGridDestroyMPI(grid);
-
-    // Verify
-    if (pid == 0) {
-        acModelIntegrateStep(model, FLT_EPSILON);
-        acMeshApplyPeriodicBounds(&model);
-
-        acVerifyMesh(model, candidate);
-        acMeshDestroy(&model);
-        acMeshDestroy(&candidate);
-    }
-
-    MPI_Finalize();
-    return EXIT_SUCCESS;
-}

From 5185a4d4718da3afc32b379425ccf59e5117a283 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Thu, 30 Jul 2020 13:58:11 +0000
Subject: [PATCH 88/89] README.md edited online with Bitbucket

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 796beec..8cf4b52 100644
--- a/README.md
+++ b/README.md
@@ -68,6 +68,7 @@ See `analysis/python/` directory of existing data visualization and analysis scr
 
 * `astaroth/include/astaroth.h`: Astaroth main header. Contains the interface for accessing single- and multi-GPU layers.
 * `astaroth/include/astaroth_utils.h`: Utility library header. Provides functions for performing common tasks on host, such as allocating and verifying meshes.
+* `<build directory>/astaroth.f90`: Fortran interface to Astaroth. Generated when building the library.
 
 ## FAQ
 

From 0872695c48e560e75c1caa128965f3f73b20d208 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Thu, 30 Jul 2020 14:38:12 +0000
Subject: [PATCH 89/89] Updated API_specification_and_user_manual.md with info
 on the acGrid layer

---
 .../API_specification_and_user_manual.md      | 46 +++++++++++++++----
 1 file changed, 36 insertions(+), 10 deletions(-)

diff --git a/doc/Astaroth_API_specification_and_user_manual/API_specification_and_user_manual.md b/doc/Astaroth_API_specification_and_user_manual/API_specification_and_user_manual.md
index 6411f85..cd27066 100644
--- a/doc/Astaroth_API_specification_and_user_manual/API_specification_and_user_manual.md
+++ b/doc/Astaroth_API_specification_and_user_manual/API_specification_and_user_manual.md
@@ -79,7 +79,7 @@ typedef enum {
 ```
 
 The API is divided into layers which differ in the level of control provided over the execution.
-There are two primary layers:
+There are three primary layers:
 
 * Device layer
     * Functions start with acDevice.
@@ -92,7 +92,13 @@ There are two primary layers:
     * All functions are asynchronous and executed concurrently on all devices in the node.
     * Subsequent functions called in the same stream (see Section #Streams and synchronization) are guaranteed to be synchronous.
 
-Finally, a third layer is provided for convenience and backwards compatibility.
+* Grid layer
+    * Functions start with acGrid.
+    * Provides control over all devices on multiple node.
+    * Requires MPI. `MPI_Init()` must be called before calling any acGrid functions.
+    * Streams are used to control concurrency the same way as on acDevice and acNode layers.
+
+Finally, a fourth layer is provided for convenience and backwards compatibility.
 
 * Astaroth layer (deprecated)
     * Functions start with `ac` only, f.ex. acInit().
@@ -126,6 +132,12 @@ AcResult acNodeQueryDeviceConfiguration(const Node node, DeviceConfiguration* co
 AcResult acNodeAutoOptimize(const Node node);
 ```
 
+Grid layer.
+```C
+AcResult acGridInit(const AcMeshInfo info);
+AcResult acGridQuit(void);
+```
+
 General helper functions.
 ```C
 size_t acVertexBufferSize(const AcMeshInfo info);
@@ -159,6 +171,7 @@ AcResult acNodeLoadVertexBufferWithOffset(const Node node, const Stream stream,
                                           const AcMesh host_mesh,
                                           const VertexBufferHandle vtxbuf_handle, const int3 src,
                                           const int3 dst, const int num_vertices);
+AcResult acGridLoadMesh(const AcMesh host_mesh, const Stream stream);
 ```
 
 Storing meshes and vertex buffer to host memory.
@@ -180,6 +193,7 @@ AcResult acNodeStoreVertexBufferWithOffset(const Node node, const Stream stream,
                                            const VertexBufferHandle vtxbuf_handle, const int3 src,
                                            const int3 dst, const int num_vertices,
                                            AcMesh* host_mesh);
+AcResult acGridStoreMesh(const Stream stream, AcMesh* host_mesh);
 ```
 
 Transferring data between devices
@@ -242,6 +256,13 @@ AcResult acNodeReduceScal(const Node node, const Stream stream, const ReductionT
 AcResult acNodeReduceVec(const Node node, const Stream stream_type, const ReductionType rtype,
                          const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1,
                          const VertexBufferHandle vtxbuf2, AcReal* result);
+AcResult acGridIntegrate(const Stream stream, const AcReal dt);
+AcResult acGridPeriodicBoundconds(const Stream stream);
+AcResult acGridReduceScal(const Stream stream, const ReductionType rtype,
+                          const VertexBufferHandle vtxbuf_handle, AcReal* result);
+AcResult acGridReduceVec(const Stream stream, const ReductionType rtype,
+                         const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1,
+                         const VertexBufferHandle vtxbuf2, AcReal* result);
 ```
 
 Finally, there's a library function that is automatically generated for all user-specified `Kernel`
@@ -261,10 +282,13 @@ yet completed. Therefore special care must be taken in order to ensure proper sy
 
 Synchronization is done using `Stream` primitives, defined as
 ```C
-typedef enum { STREAM_DEFAULT, STREAM_0, ..., STREAM_16, NUM_STREAMS } Stream;
+typedef enum {STREAM_0, ..., STREAM_15, NUM_STREAMS};
+#define STREAM_DEFAULT (STREAM_0)
 #define STREAM_ALL (NUM_STREAMS)
 ```
 
+> **Note:** There is guaranteed to be at least 16 distinct streams.
+
 Functions queued in the same stream will be executed sequentially. If two or more consequent
 functions are queued in different streams, then these functions may execute in parallel. For
 additional control over streams, there is a barrier synchronization function which blocks execution
@@ -286,6 +310,7 @@ Astaroth API provides the following functions for barrier synchronization.
 AcResult acSynchronize(void);
 AcResult acNodeSynchronizeStream(const Node node, const Stream stream);
 AcResult acDeviceSynchronizeStream(const Device device, const Stream stream);
+AcResult acGridSynchronizeStream(const Stream stream);
 ```
 
 ## Data Synchronization
@@ -408,13 +433,16 @@ int mz = info.int_params[AC_mz];
 after initialization.
 
 
-### Decomposition
-Grids and subgrids contain the dimensions of the the mesh decomposed to multiple devices.
+### Decomposition (`acNode` layer)
+
+> **Note:** This section describes implementation details specific to the acNode layer. The acGrid layer is not related to the `GridDims` structure described here. 
+
+`GridDims` contains the dimensions of the the mesh decomposed to multiple devices.
 ```C
 typedef struct {
     int3 m; // Size of the simulation domain (includes the ghost zones)
     int3 n; // Size of the computational domain (without ghost zones)
-} Grid;
+} GridDims;
 ```
 
 As briefly discussed in the section Data synchronization, a `Mesh` is distributed to multiple
@@ -427,8 +455,6 @@ Let *i* be the device id. The portion of the halos shared by neighboring devices
 `acNodeSynchronizeVertexBuffer` and `acNodeSynchronizeMesh` communicate these shared areas among
 the devices in the node.
 
-> **Note:** The decomposition scheme is subject to change.
-
 # Astaroth Domain-Specific Language
 
 We designed the Astaroth Domain-specific Language (DSL) for expressing stencil computations in a
@@ -478,8 +504,8 @@ In addition to basic datatypes in C/C++/CUDA, such as int and int3, we provide t
 | Complex     | A tuple of two 32- or 64-bit floating-point numbers. The real part is stored in member .x, while the imaginary component is in .y. Basic operations, such as multiplication, are defined as built-in functions.                                                                                                                   | std::complex<float> or std::complex<double>                                                          |
 | Matrix      | A tuple of three Vectors. Is stored in column-major order, f.ex. Matrix[i][j] is the component on row i, column j. (TODO recheck specs.)                                                                                                                                                                                          | float3[3] or double3[3]                                                                              |
 | ScalarArray | A one-dimensional array of Scalars stored in device memory. Given mesh dimensions (mx, my, mz), consists of max(mx, max(my, mz)) elements.                                                                                                                                                                                        | float[] or double[]                                                                                  |
-| ScalarField | An abstraction of a three-dimensional scalar field stored in device memory. Is implemented as a handle to a one-dimensional Scalar array consisting of input and output segments. The data is stored linearly in order i + j * mx + k * mx * my, given some vertex index (i, j, k) and mesh constisting of (mx, my, mz) vertices. | float[2][] or double[2][]                                                                            |
-| VectorField | An abstraction of a three-dimensional vector field stored in device memory. Is implemented as a tuple of three ScalarField handles.                                                                                                                                                                                               | Three distinct float[2][] or double[2][] arrays for each component. Stored as a structure of arrays. |
+| ScalarField | A three-dimensional scalar field stored in row-wise scan order where coordinates `(i, j, k)` correspond to a one-dimensional index `i + j * mx + k * mx * my`. Consists of two buffers, one used for input and another one for output. | Two float[] or double[] arrays                                                                            |
+| VectorField | A three-dimensional vector field. Consists of three `ScalarFields`. | Three `ScalarFields` stored contiguously in memory as a structure of arrays |
 
 ## Precision