From ed7cf3f5401d24f2b848792730c63a86da3883fd Mon Sep 17 00:00:00 2001
From: jpekkila
Date: Thu, 26 Mar 2020 15:02:37 +0200
Subject: [PATCH] Added a production-ready interface for multi-node runs with
 Astaroth using MPI

---
 include/astaroth.h |  56 +++-
 src/core/device.cc | 773 ++++++++++++++++++++++++++++++++++++-----------
 2 files changed, 661 insertions(+), 168 deletions(-)

diff --git a/include/astaroth.h b/include/astaroth.h
index af03634..b170796 100644
--- a/include/astaroth.h
+++ b/include/astaroth.h
@@ -116,10 +116,10 @@ typedef enum {
 #define _UNUSED __attribute__((unused)) // Does not give a warning if unused
 #define AC_GEN_STR(X) #X,
-static const char* intparam_names[] _UNUSED = {AC_FOR_USER_INT_PARAM_TYPES(AC_GEN_STR) "-end-"};
-static const char* int3param_names[] _UNUSED = {AC_FOR_USER_INT3_PARAM_TYPES(AC_GEN_STR) "-end-"};
-static const char* realparam_names[] _UNUSED = {AC_FOR_USER_REAL_PARAM_TYPES(AC_GEN_STR) "-end-"};
-static const char* real3param_names[] _UNUSED = {AC_FOR_USER_REAL3_PARAM_TYPES(AC_GEN_STR) "-end-"};
+static const char* intparam_names[] _UNUSED    = {AC_FOR_USER_INT_PARAM_TYPES(AC_GEN_STR) "-end-"};
+static const char* int3param_names[] _UNUSED   = {AC_FOR_USER_INT3_PARAM_TYPES(AC_GEN_STR) "-end-"};
+static const char* realparam_names[] _UNUSED   = {AC_FOR_USER_REAL_PARAM_TYPES(AC_GEN_STR) "-end-"};
+static const char* real3param_names[] _UNUSED  = {AC_FOR_USER_REAL3_PARAM_TYPES(AC_GEN_STR) "-end-"};
 static const char* scalararray_names[] _UNUSED = {AC_FOR_SCALARARRAY_HANDLES(AC_GEN_STR) "-end-"};
 static const char* vtxbuf_names[] _UNUSED = {AC_FOR_VTXBUF_HANDLES(AC_GEN_STR) "-end-"};
 #undef AC_GEN_STR
@@ -144,17 +144,20 @@ typedef struct device_s* Device; // Opaque pointer to device_s. Analogous to dis
 // Node
 typedef struct node_s* Node; // Opaque pointer to node_s.
 
+// Grid
+// typedef struct grid_s* Grid; // Opaque pointer to grid_s
+
 typedef struct {
     int3 m;
     int3 n;
-} Grid; // WARNING: Grid structure may be deprecated in future versions (TODO)
+} GridDims;
 
 typedef struct {
     int num_devices;
     Device* devices;
 
-    Grid grid;
-    Grid subgrid;
+    GridDims grid;
+    GridDims subgrid;
 } DeviceConfiguration;
 
 #ifdef __cplusplus
@@ -280,6 +283,45 @@ int acGetNumDevicesPerNode(void);
 /** */
 Node acGetNode(void);
 
+/*
+ * =============================================================================
+ * Grid interface
+ * =============================================================================
+ */
+#if AC_MPI_ENABLED
+/**
+Initializes all available devices.
+
+The program must be compiled and run with MPI.
+
+Exactly one MPI process must be allocated per GPU, i.e. each node must run as
+many processes as it has GPUs.
+
+Devices in the grid are configured based on the contents of AcMeshInfo.
+ */
+AcResult acGridInit(const AcMeshInfo info);
+
+/**
+Resets all devices on the current grid.
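+
+A typical lifecycle of the grid interface is sketched below. This is an
+illustrative sketch only, not normative documentation: it assumes the caller
+initializes and finalizes MPI itself and has loaded a valid AcMeshInfo, e.g.
+with acLoadConfig().
+
+    MPI_Init(NULL, NULL); // MPI must be running before acGridInit is called
+
+    AcMeshInfo info;
+    acLoadConfig(AC_DEFAULT_CONFIG, &info);
+
+    acGridInit(info);                             // one MPI process per GPU
+    acGridIntegrate(STREAM_DEFAULT, FLT_EPSILON); // one step (3 RK substeps)
+    acGridPeriodicBoundconds(STREAM_DEFAULT);
+    acGridQuit();                                 // frees devices and the submesh
+
+    MPI_Finalize(); // acGridQuit does not finalize MPI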
+ */
+AcResult acGridQuit(void);
+
+/** */
+AcResult acGridSynchronizeStream(const Stream stream);
+
+/** */
+AcResult acGridLoadMesh(const AcMesh host_mesh, const Stream stream);
+
+/** */
+AcResult acGridStoreMesh(const Stream stream, AcMesh* host_mesh);
+
+/** */
+AcResult acGridIntegrate(const Stream stream, const AcReal dt);
+
+/** */
+AcResult acGridPeriodicBoundconds(const Stream stream);
+#endif // AC_MPI_ENABLED
+
 /*
  * =============================================================================
  * Node interface
diff --git a/src/core/device.cc b/src/core/device.cc
index 631e287..af27a6e 100644
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -833,6 +833,13 @@ acDestroyCommData(const Device device, CommData* data)
     data->dims = (int3){-1, -1, -1};
 }
 
+static void
+acSyncCommData(const CommData data)
+{
+    for (size_t i = 0; i < data.count; ++i)
+        cudaStreamSynchronize(data.streams[i]);
+}
+
 static void
 acPackCommData(const Device device, const int3* a0s, CommData* data)
 {
@@ -980,6 +987,226 @@ acTransferCommDataWait(const CommData data)
     }
 }
 
+typedef struct {
+    Device device;
+    AcMesh submesh;
+    int3 decomposition;
+    bool initialized;
+
+    int3 nn;
+    CommData corner_data;
+    CommData edgex_data;
+    CommData edgey_data;
+    CommData edgez_data;
+    CommData sidexy_data;
+    CommData sidexz_data;
+    CommData sideyz_data;
+} Grid;
+
+static Grid grid = {};
+
+AcResult
+acGridSynchronizeStream(const Stream stream)
+{
+    ERRCHK(grid.initialized);
+
+    acDeviceSynchronizeStream(grid.device, stream);
+    MPI_Barrier(MPI_COMM_WORLD);
+    return AC_SUCCESS;
+}
+
+AcResult
+acGridInit(const AcMeshInfo info)
+{
+    ERRCHK(!grid.initialized);
+
+    // Check that MPI is initialized
+    int nprocs, pid;
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
+
+    char processor_name[MPI_MAX_PROCESSOR_NAME];
+    int name_len;
+    MPI_Get_processor_name(processor_name, &name_len);
+    printf("Processor %s. Process %d of %d.\n", processor_name, pid, nprocs);
+
+    // Decompose
+    AcMeshInfo submesh_info  = info;
+    const int3 decomposition = decompose(nprocs);
+    const int3 pid3d         = getPid3D(pid, decomposition);
+
+    printf("Decomposition: %d, %d, %d\n", decomposition.x, decomposition.y, decomposition.z);
+    printf("Process %d: (%d, %d, %d)\n", pid, pid3d.x, pid3d.y, pid3d.z);
+    ERRCHK_ALWAYS(info.int_params[AC_nx] % decomposition.x == 0);
+    ERRCHK_ALWAYS(info.int_params[AC_ny] % decomposition.y == 0);
+    ERRCHK_ALWAYS(info.int_params[AC_nz] % decomposition.z == 0);
+
+    const int submesh_nx = info.int_params[AC_nx] / decomposition.x;
+    const int submesh_ny = info.int_params[AC_ny] / decomposition.y;
+    const int submesh_nz = info.int_params[AC_nz] / decomposition.z;
+    submesh_info.int_params[AC_nx] = submesh_nx;
+    submesh_info.int_params[AC_ny] = submesh_ny;
+    submesh_info.int_params[AC_nz] = submesh_nz;
+    submesh_info.int3_params[AC_global_grid_n] = (int3){
+        info.int_params[AC_nx],
+        info.int_params[AC_ny],
+        info.int_params[AC_nz],
+    };
+    submesh_info.int3_params[AC_multigpu_offset] = pid3d *
+                                                   (int3){submesh_nx, submesh_ny, submesh_nz};
+    acUpdateBuiltinParams(&submesh_info);
+
+    // GPU alloc
+    int devices_per_node = -1;
+    cudaGetDeviceCount(&devices_per_node);
+
+    Device device;
+    acDeviceCreate(pid % devices_per_node, submesh_info, &device);
+
+    // CPU alloc
+    AcMesh submesh;
+    acMeshCreate(submesh_info, &submesh);
+
+    // Setup the global grid structure
+    grid.device        = device;
+    grid.submesh       = submesh;
+    grid.decomposition = decomposition;
+    grid.initialized   = true;
+
+    // Configure
+    const int3 nn = (int3){
+        device->local_config.int_params[AC_nx],
+        device->local_config.int_params[AC_ny],
+        device->local_config.int_params[AC_nz],
+    };
+
+    // Corners
+    const int3 corner_a0s[] = {
+        (int3){NGHOST, NGHOST, NGHOST}, //
+        (int3){nn.x, NGHOST, NGHOST},   //
+        (int3){NGHOST, nn.y, NGHOST},   //
+        (int3){nn.x, nn.y, NGHOST},     //
+
+        (int3){NGHOST, NGHOST, nn.z}, //
+        (int3){nn.x, NGHOST, nn.z},   //
+        (int3){NGHOST, nn.y, nn.z},   //
+        (int3){nn.x, nn.y, nn.z},
+    };
+
+    // Edges X
+    const int3 edgex_a0s[] = {
+        (int3){NGHOST, NGHOST, NGHOST}, //
+        (int3){NGHOST, nn.y, NGHOST},   //
+
+        (int3){NGHOST, NGHOST, nn.z}, //
+        (int3){NGHOST, nn.y, nn.z},   //
+    };
+
+    // Edges Y
+    const int3 edgey_a0s[] = {
+        (int3){NGHOST, NGHOST, NGHOST}, //
+        (int3){nn.x, NGHOST, NGHOST},   //
+
+        (int3){NGHOST, NGHOST, nn.z}, //
+        (int3){nn.x, NGHOST, nn.z},   //
+    };
+
+    // Edges Z
+    const int3 edgez_a0s[] = {
+        (int3){NGHOST, NGHOST, NGHOST}, //
+        (int3){nn.x, NGHOST, NGHOST},   //
+
+        (int3){NGHOST, nn.y, NGHOST}, //
+        (int3){nn.x, nn.y, NGHOST},   //
+    };
+
+    // Sides XY
+    const int3 sidexy_a0s[] = {
+        (int3){NGHOST, NGHOST, NGHOST}, //
+        (int3){NGHOST, NGHOST, nn.z},   //
+    };
+
+    // Sides XZ
+    const int3 sidexz_a0s[] = {
+        (int3){NGHOST, NGHOST, NGHOST}, //
+        (int3){NGHOST, nn.y, NGHOST},   //
+    };
+
+    // Sides YZ
+    const int3 sideyz_a0s[] = {
+        (int3){NGHOST, NGHOST, NGHOST}, //
+        (int3){nn.x, NGHOST, NGHOST},   //
+    };
+
+    const int3 corner_dims = (int3){NGHOST, NGHOST, NGHOST};
+    const int3 edgex_dims  = (int3){nn.x, NGHOST, NGHOST};
+    const int3 edgey_dims  = (int3){NGHOST, nn.y, NGHOST};
+    const int3 edgez_dims  = (int3){NGHOST, NGHOST, nn.z};
+    const int3 sidexy_dims = (int3){nn.x, nn.y, NGHOST};
+    const int3 sidexz_dims = (int3){nn.x, NGHOST, nn.z};
+    const int3 sideyz_dims = (int3){NGHOST, nn.y, nn.z};
+    grid.nn          = nn;
+    grid.corner_data = acCreateCommData(device, corner_dims, ARRAY_SIZE(corner_a0s));
+    grid.edgex_data  = acCreateCommData(device, edgex_dims, ARRAY_SIZE(edgex_a0s));
+    grid.edgey_data  = acCreateCommData(device, edgey_dims, ARRAY_SIZE(edgey_a0s));
+    grid.edgez_data  = acCreateCommData(device, edgez_dims, ARRAY_SIZE(edgez_a0s));
+    grid.sidexy_data = acCreateCommData(device, sidexy_dims, ARRAY_SIZE(sidexy_a0s));
+    grid.sidexz_data = acCreateCommData(device, sidexz_dims, ARRAY_SIZE(sidexz_a0s));
+    grid.sideyz_data = acCreateCommData(device, sideyz_dims, ARRAY_SIZE(sideyz_a0s));
+
+    acGridSynchronizeStream(STREAM_ALL);
+    return AC_SUCCESS;
+}
+
+AcResult
+acGridQuit(void)
+{
+    ERRCHK(grid.initialized);
+    acGridSynchronizeStream(STREAM_ALL);
+
+    acDestroyCommData(grid.device, &grid.corner_data);
+    acDestroyCommData(grid.device, &grid.edgex_data);
+    acDestroyCommData(grid.device, &grid.edgey_data);
+    acDestroyCommData(grid.device, &grid.edgez_data);
+    acDestroyCommData(grid.device, &grid.sidexy_data);
+    acDestroyCommData(grid.device, &grid.sidexz_data);
+    acDestroyCommData(grid.device, &grid.sideyz_data);
+
+    grid.initialized   = false;
+    grid.decomposition = (int3){-1, -1, -1};
+    acMeshDestroy(&grid.submesh);
+    acDeviceDestroy(grid.device);
+
+    acGridSynchronizeStream(STREAM_ALL);
+    return AC_SUCCESS;
+}
+
+AcResult
+acGridLoadMesh(const AcMesh host_mesh, const Stream stream)
+{
+    ERRCHK(grid.initialized);
+    acGridSynchronizeStream(stream);
+
+    acDeviceDistributeMeshMPI(host_mesh, grid.decomposition, &grid.submesh);
+    acDeviceLoadMesh(grid.device, stream, grid.submesh);
+
+    return AC_SUCCESS;
+}
+
+AcResult
+acGridStoreMesh(const Stream stream, AcMesh* host_mesh)
+{
+    ERRCHK(grid.initialized);
+    acGridSynchronizeStream(stream);
+
+    acDeviceStoreMesh(grid.device, stream, &grid.submesh);
+    acGridSynchronizeStream(stream);
+
+    acDeviceGatherMeshMPI(grid.submesh, grid.decomposition, host_mesh);
+
+    return AC_SUCCESS;
+}
+
 static AcResult
 acDeviceIntegrateMPI(const Device device, const AcReal dt)
 {
@@ -1518,186 +1745,410 @@ acDeviceCommunicateHalosMPI(const Device device)
     return AC_SUCCESS;
 }
 
-/*
-static int3
-findOptimalDecomposition(const int3 nn)
-{
-    int3 decomposition = (int3){1, 1, 1};
-}*/
-
 AcResult
-acDeviceRunMPITest(void)
+acGridIntegrate(const Stream stream, const AcReal dt)
 {
-    MPI_Init(NULL, NULL);
+    ERRCHK(grid.initialized);
+    acGridSynchronizeStream(stream);
 
-    int nprocs, pid;
-    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
-    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
+    const Device device  = grid.device;
+    const int3 nn        = grid.nn;
+    CommData corner_data = grid.corner_data;
+    CommData edgex_data  = grid.edgex_data;
+    CommData edgey_data  = grid.edgey_data;
+    CommData edgez_data  = grid.edgez_data;
+    CommData sidexy_data = grid.sidexy_data;
+    CommData sidexz_data = grid.sidexz_data;
+    CommData sideyz_data = grid.sideyz_data;
 
-    char processor_name[MPI_MAX_PROCESSOR_NAME];
-    int name_len;
-    MPI_Get_processor_name(processor_name, &name_len);
-    printf("Processor %s. Process %d of %d.\n", processor_name, pid, nprocs);
 
+    // Corners
+    const int3 corner_a0s[] = {
+        (int3){NGHOST, NGHOST, NGHOST}, //
+        (int3){nn.x, NGHOST, NGHOST},   //
+        (int3){NGHOST, nn.y, NGHOST},   //
+        (int3){nn.x, nn.y, NGHOST},     //
 
-    // Create model and candidate meshes
-    AcMeshInfo info;
-    acLoadConfig(AC_DEFAULT_CONFIG, &info);
-    info.real_params[AC_inv_dsx] = AcReal(1.0) / info.real_params[AC_dsx];
-    info.real_params[AC_inv_dsy] = AcReal(1.0) / info.real_params[AC_dsy];
-    info.real_params[AC_inv_dsz] = AcReal(1.0) / info.real_params[AC_dsz];
-    info.real_params[AC_cs2_sound] = info.real_params[AC_cs_sound] * info.real_params[AC_cs_sound];
-
-    AcMesh model, candidate;
-
-    // Master CPU
-    if (pid == 0) {
-        acMeshCreate(info, &model);
-        acMeshCreate(info, &candidate);
-
-        acMeshRandomize(&model);
-        acMeshRandomize(&candidate);
-    }
-
-    /// DECOMPOSITION & SUBMESH ///////////////////////////////////
-    AcMeshInfo submesh_info = info;
-    const int3 decomposition = decompose(nprocs);
-    const int3 pid3d = getPid3D(pid, decomposition);
-
-    printf("Decomposition: %d, %d, %d\n", decomposition.x, decomposition.y, decomposition.z);
-    printf("Process %d: (%d, %d, %d)\n", pid, pid3d.x, pid3d.y, pid3d.z);
-    ERRCHK_ALWAYS(info.int_params[AC_nx] % decomposition.x == 0);
-    ERRCHK_ALWAYS(info.int_params[AC_ny] % decomposition.y == 0);
-    ERRCHK_ALWAYS(info.int_params[AC_nz] % decomposition.z == 0);
-
-    const int submesh_nx = info.int_params[AC_nx] / decomposition.x;
-    const int submesh_ny = info.int_params[AC_ny] / decomposition.y;
-    const int submesh_nz = info.int_params[AC_nz] / decomposition.z;
-    submesh_info.int_params[AC_nx] = submesh_nx;
-    submesh_info.int_params[AC_ny] = submesh_ny;
-    submesh_info.int_params[AC_nz] = submesh_nz;
-    submesh_info.int3_params[AC_global_grid_n] = (int3){
-        info.int_params[AC_nx],
-        info.int_params[AC_ny],
-        info.int_params[AC_nz],
+        (int3){NGHOST, NGHOST, nn.z}, //
+        (int3){nn.x, NGHOST, nn.z},   //
+        (int3){NGHOST, nn.y, nn.z},   //
+        (int3){nn.x, nn.y, nn.z},
     };
-    submesh_info.int3_params[AC_multigpu_offset] = pid3d *
-                                                   (int3){submesh_nx, submesh_ny, submesh_nz};
-    acUpdateBuiltinParams(&submesh_info);
+    const int3 corner_b0s[] = {
+        (int3){0, 0, 0},
+        (int3){NGHOST + nn.x, 0, 0},
+        (int3){0, NGHOST + nn.y, 0},
+        (int3){NGHOST + nn.x, NGHOST + nn.y, 0},
 
-    AcMesh submesh;
-    acMeshCreate(submesh_info, &submesh);
-    acMeshRandomize(&submesh);
-    ////////////////////////////////////////////////////////////////
+        (int3){0, 0, NGHOST + nn.z},
+        (int3){NGHOST + nn.x, 0, NGHOST + nn.z},
+        (int3){0, NGHOST + nn.y, NGHOST + nn.z},
+        (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST + nn.z},
+    };
 
-    // GPU INIT ////////////////////////////////////////////////////
-    int devices_per_node = -1;
-    cudaGetDeviceCount(&devices_per_node);
+    // Edges X
+    const int3 edgex_a0s[] = {
+        (int3){NGHOST, NGHOST, NGHOST}, //
+        (int3){NGHOST, nn.y, NGHOST},   //
 
-    Device device;
-    acDeviceCreate(pid % devices_per_node, submesh_info, &device);
-    // TODO enable peer access
-    ////////////////////////////////////////////////////////////////
+        (int3){NGHOST, NGHOST, nn.z}, //
+        (int3){NGHOST, nn.y, nn.z},   //
+    };
+    const int3 edgex_b0s[] = {
+        (int3){NGHOST, 0, 0},
+        (int3){NGHOST, NGHOST + nn.y, 0},
 
-    // DISTRIBUTE & LOAD //////////////////////////////////////////
-    acDeviceDistributeMeshMPI(model, decomposition, &submesh);
-    acDeviceLoadMesh(device, STREAM_DEFAULT, submesh);
-    ///////////////////////////////////////////////////////////////
+        (int3){NGHOST, 0, NGHOST + nn.z},
+        (int3){NGHOST, NGHOST + nn.y, NGHOST + nn.z},
+    };
 
-    // SYNC //////////////////////////////////////////////////////
-    acDeviceSynchronizeStream(device, STREAM_ALL);
-    MPI_Barrier(MPI_COMM_WORLD);
-    //////////////////////////////////////////////////////////////
+    // Edges Y
+    const int3 edgey_a0s[] = {
+        (int3){NGHOST, NGHOST, NGHOST}, //
+        (int3){nn.x, NGHOST, NGHOST},   //
 
-    // TIMING START //////////////////////////////////////////////
-    acDeviceSynchronizeStream(device, STREAM_ALL);
-    MPI_Barrier(MPI_COMM_WORLD);
-    Timer t;
-    timer_reset(&t);
-    //////////////////////////////////////////////////////////////
+        (int3){NGHOST, NGHOST, nn.z}, //
+        (int3){nn.x, NGHOST, nn.z},   //
+    };
+    const int3 edgey_b0s[] = {
+        (int3){0, NGHOST, 0},
+        (int3){NGHOST + nn.x, NGHOST, 0},
 
-    // INTEGRATION & BOUNDCONDS////////////////////////////////////
-    // acDeviceCommunicateHalosMPI(device);
-    acDeviceIntegrateMPI(device, FLT_EPSILON);
-    acDeviceCommunicateHalosMPI(device);
-    ///////////////////////////////////////////////////////////////
+        (int3){0, NGHOST, NGHOST + nn.z},
+        (int3){NGHOST + nn.x, NGHOST, NGHOST + nn.z},
+    };
 
-    // TIMING END //////////////////////////////////////////////
-    acDeviceSynchronizeStream(device, STREAM_ALL);
-    MPI_Barrier(MPI_COMM_WORLD);
-    if (!pid) {
-        timer_diff_print(t);
+    // Edges Z
+    const int3 edgez_a0s[] = {
+        (int3){NGHOST, NGHOST, NGHOST}, //
+        (int3){nn.x, NGHOST, NGHOST},   //
+
+        (int3){NGHOST, nn.y, NGHOST}, //
+        (int3){nn.x, nn.y, NGHOST},   //
+    };
+    const int3 edgez_b0s[] = {
+        (int3){0, 0, NGHOST},
+        (int3){NGHOST + nn.x, 0, NGHOST},
+
+        (int3){0, NGHOST + nn.y, NGHOST},
+        (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST},
+    };
+
+    // Sides XY
+    const int3 sidexy_a0s[] = {
+        (int3){NGHOST, NGHOST, NGHOST}, //
+        (int3){NGHOST, NGHOST, nn.z},   //
+    };
+    const int3 sidexy_b0s[] = {
+        (int3){NGHOST, NGHOST, 0},             //
+        (int3){NGHOST, NGHOST, NGHOST + nn.z}, //
+    };
+
+    // Sides XZ
+    const int3 sidexz_a0s[] = {
+        (int3){NGHOST, NGHOST, NGHOST}, //
+        (int3){NGHOST, nn.y, NGHOST},   //
+    };
+    const int3 sidexz_b0s[] = {
+        (int3){NGHOST, 0, NGHOST},             //
+        (int3){NGHOST, NGHOST + nn.y, NGHOST}, //
+    };
+
+    // Sides YZ
+    const int3 sideyz_a0s[] = {
+        (int3){NGHOST, NGHOST, NGHOST}, //
+        (int3){nn.x, NGHOST, NGHOST},   //
+    };
+    const int3 sideyz_b0s[] = {
+        (int3){0, NGHOST, NGHOST},             //
+        (int3){NGHOST + nn.x, NGHOST, NGHOST}, //
+    };
+
+    for (int isubstep = 0; isubstep < 3; ++isubstep) {
+        acPackCommData(device, corner_a0s, &corner_data);
+        acPackCommData(device, edgex_a0s, &edgex_data);
+        acPackCommData(device, edgey_a0s, &edgey_data);
+        acPackCommData(device, edgez_a0s, &edgez_data);
+        acPackCommData(device, sidexy_a0s, &sidexy_data);
+        acPackCommData(device, sidexz_a0s, &sidexz_data);
+        acPackCommData(device, sideyz_a0s, &sideyz_data);
+
+#if MPI_GPUDIRECT_DISABLED
+        acTransferCommDataToHost(device, &corner_data);
+        acTransferCommDataToHost(device, &edgex_data);
+        acTransferCommDataToHost(device, &edgey_data);
+        acTransferCommDataToHost(device, &edgez_data);
+        acTransferCommDataToHost(device, &sidexy_data);
+        acTransferCommDataToHost(device, &sidexz_data);
+        acTransferCommDataToHost(device, &sideyz_data);
+#endif
+
+        //////////// INNER INTEGRATION //////////////
+        {
+            const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
+            const int3 m2 = nn;
+            acDeviceIntegrateSubstep(device, STREAM_16, isubstep, m1, m2, dt);
+        }
+        ////////////////////////////////////////////
+
+        acTransferCommData(device, corner_a0s, corner_b0s, &corner_data);
+        acTransferCommData(device, edgex_a0s, edgex_b0s, &edgex_data);
+        acTransferCommData(device, edgey_a0s, edgey_b0s, &edgey_data);
+        acTransferCommData(device, edgez_a0s, edgez_b0s, &edgez_data);
+        acTransferCommData(device, sidexy_a0s, sidexy_b0s, &sidexy_data);
+        acTransferCommData(device, sidexz_a0s, sidexz_b0s, &sidexz_data);
+        acTransferCommData(device, sideyz_a0s, sideyz_b0s, &sideyz_data);
+
+        acTransferCommDataWait(corner_data);
+        acTransferCommDataWait(edgex_data);
+        acTransferCommDataWait(edgey_data);
+        acTransferCommDataWait(edgez_data);
+        acTransferCommDataWait(sidexy_data);
+        acTransferCommDataWait(sidexz_data);
+        acTransferCommDataWait(sideyz_data);
+
+#if MPI_GPUDIRECT_DISABLED
+        acTransferCommDataToDevice(device, &corner_data);
+        acTransferCommDataToDevice(device, &edgex_data);
+        acTransferCommDataToDevice(device, &edgey_data);
+        acTransferCommDataToDevice(device, &edgez_data);
+        acTransferCommDataToDevice(device, &sidexy_data);
+        acTransferCommDataToDevice(device, &sidexz_data);
+        acTransferCommDataToDevice(device, &sideyz_data);
+#endif
+
+        acUnpackCommData(device, corner_b0s, &corner_data);
+        acUnpackCommData(device, edgex_b0s, &edgex_data);
+        acUnpackCommData(device, edgey_b0s, &edgey_data);
+        acUnpackCommData(device, edgez_b0s, &edgez_data);
+        acUnpackCommData(device, sidexy_b0s, &sidexy_data);
+        acUnpackCommData(device, sidexz_b0s, &sidexz_data);
+        acUnpackCommData(device, sideyz_b0s, &sideyz_data);
+        //////////// OUTER INTEGRATION //////////////
+
+        // Wait for unpacking
+        acSyncCommData(corner_data);
+        acSyncCommData(edgex_data);
+        acSyncCommData(edgey_data);
+        acSyncCommData(edgez_data);
+        acSyncCommData(sidexy_data);
+        acSyncCommData(sidexz_data);
+        acSyncCommData(sideyz_data);
+        { // Front
+            const int3 m1 = (int3){NGHOST, NGHOST, NGHOST};
+            const int3 m2 = m1 + (int3){nn.x, nn.y, NGHOST};
+            acDeviceIntegrateSubstep(device, STREAM_0, isubstep, m1, m2, dt);
+        }
+        { // Back
+            const int3 m1 = (int3){NGHOST, NGHOST, nn.z};
+            const int3 m2 = m1 + (int3){nn.x, nn.y, NGHOST};
+            acDeviceIntegrateSubstep(device, STREAM_1, isubstep, m1, m2, dt);
+        }
+        { // Bottom
+            const int3 m1 = (int3){NGHOST, NGHOST, 2 * NGHOST};
+            const int3 m2 = m1 + (int3){nn.x, NGHOST, nn.z - 2 * NGHOST};
+            acDeviceIntegrateSubstep(device, STREAM_2, isubstep, m1, m2, dt);
+        }
+        { // Top
+            const int3 m1 = (int3){NGHOST, nn.y, 2 * NGHOST};
+            const int3 m2 = m1 + (int3){nn.x, NGHOST, nn.z - 2 * NGHOST};
+            acDeviceIntegrateSubstep(device, STREAM_3, isubstep, m1, m2, dt);
+        }
+        { // Left
+            const int3 m1 = (int3){NGHOST, 2 * NGHOST, 2 * NGHOST};
+            const int3 m2 = m1 + (int3){NGHOST, nn.y - 2 * NGHOST, nn.z - 2 * NGHOST};
+            acDeviceIntegrateSubstep(device, STREAM_4, isubstep, m1, m2, dt);
+        }
+        { // Right
+            const int3 m1 = (int3){nn.x, 2 * NGHOST, 2 * NGHOST};
+            const int3 m2 = m1 + (int3){NGHOST, nn.y - 2 * NGHOST, nn.z - 2 * NGHOST};
+            acDeviceIntegrateSubstep(device, STREAM_5, isubstep, m1, m2, dt);
+        }
+        acDeviceSwapBuffers(device);
+        acDeviceSynchronizeStream(device, STREAM_ALL); // Wait until inner and outer done
+        ////////////////////////////////////////////
     }
-    MPI_Barrier(MPI_COMM_WORLD);
-    //////////////////////////////////////////////////////////////
 
-    // STORE & GATHER /////////////////////////////////////////////
-    MPI_Barrier(MPI_COMM_WORLD);
-    acDeviceStoreMesh(device, STREAM_DEFAULT, &submesh);
-    acDeviceSynchronizeStream(device, STREAM_DEFAULT);
-    acDeviceGatherMeshMPI(submesh, decomposition, &candidate);
-    //////////////////////////////////////////////////////////////
-
-    // VERIFY ////////////////////////////////////////////////////
-    if (pid == 0) {
-        acModelIntegrateStep(model, FLT_EPSILON);
-        acMeshApplyPeriodicBounds(&model);
-
-        acVerifyMesh(model, candidate);
-        acMeshDestroy(&model);
-        acMeshDestroy(&candidate);
-    }
-    //////////////////////////////////////////////////////////////
-
-    // DESTROY ///////////////////////////////////////////////////
-    acDeviceDestroy(device);
-    acMeshDestroy(&submesh);
-    MPI_Finalize();
-    //////////////////////////////////////////////////////////////
     return AC_SUCCESS;
 }
-#else
+
 AcResult
-acDeviceRunMPITest(void)
+acGridPeriodicBoundconds(const Stream stream)
 {
-    WARNING("MPI was not enabled but acDeviceRunMPITest() was called");
-    return AC_FAILURE;
+    ERRCHK(grid.initialized);
+    acGridSynchronizeStream(stream);
+
+    const Device device  = grid.device;
+    const int3 nn        = grid.nn;
+    CommData corner_data = grid.corner_data;
+    CommData edgex_data  = grid.edgex_data;
+    CommData edgey_data  = grid.edgey_data;
+    CommData edgez_data  = grid.edgez_data;
+    CommData sidexy_data = grid.sidexy_data;
+    CommData sidexz_data = grid.sidexz_data;
+    CommData sideyz_data = grid.sideyz_data;
+
+    // Corners
+    const int3 corner_a0s[] = {
+        (int3){NGHOST, NGHOST, NGHOST}, //
+        (int3){nn.x, NGHOST, NGHOST},   //
+        (int3){NGHOST, nn.y, NGHOST},   //
+        (int3){nn.x, nn.y, NGHOST},     //
+
+        (int3){NGHOST, NGHOST, nn.z}, //
+        (int3){nn.x, NGHOST, nn.z},   //
+        (int3){NGHOST, nn.y, nn.z},   //
+        (int3){nn.x, nn.y, nn.z},
+    };
+    const int3 corner_b0s[] = {
+        (int3){0, 0, 0},
+        (int3){NGHOST + nn.x, 0, 0},
+        (int3){0, NGHOST + nn.y, 0},
+        (int3){NGHOST + nn.x, NGHOST + nn.y, 0},
+
+        (int3){0, 0, NGHOST + nn.z},
+        (int3){NGHOST + nn.x, 0, NGHOST + nn.z},
+        (int3){0, NGHOST + nn.y, NGHOST + nn.z},
+        (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST + nn.z},
+    };
+
+    // Edges X
+    const int3 edgex_a0s[] = {
+        (int3){NGHOST, NGHOST, NGHOST}, //
+        (int3){NGHOST, nn.y, NGHOST},   //
+
+        (int3){NGHOST, NGHOST, nn.z}, //
+        (int3){NGHOST, nn.y, nn.z},   //
+    };
+    const int3 edgex_b0s[] = {
+        (int3){NGHOST, 0, 0},
+        (int3){NGHOST, NGHOST + nn.y, 0},
+
+        (int3){NGHOST, 0, NGHOST + nn.z},
+        (int3){NGHOST, NGHOST + nn.y, NGHOST + nn.z},
+    };
+
+    // Edges Y
+    const int3 edgey_a0s[] = {
+        (int3){NGHOST, NGHOST, NGHOST}, //
+        (int3){nn.x, NGHOST, NGHOST},   //
+
+        (int3){NGHOST, NGHOST, nn.z}, //
+        (int3){nn.x, NGHOST, nn.z},   //
+    };
+    const int3 edgey_b0s[] = {
+        (int3){0, NGHOST, 0},
+        (int3){NGHOST + nn.x, NGHOST, 0},
+
+        (int3){0, NGHOST, NGHOST + nn.z},
+        (int3){NGHOST + nn.x, NGHOST, NGHOST + nn.z},
+    };
+
+    // Edges Z
+    const int3 edgez_a0s[] = {
+        (int3){NGHOST, NGHOST, NGHOST}, //
+        (int3){nn.x, NGHOST, NGHOST},   //
+
+        (int3){NGHOST, nn.y, NGHOST}, //
+        (int3){nn.x, nn.y, NGHOST},   //
+    };
+    const int3 edgez_b0s[] = {
+        (int3){0, 0, NGHOST},
+        (int3){NGHOST + nn.x, 0, NGHOST},
+
+        (int3){0, NGHOST + nn.y, NGHOST},
+        (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST},
+    };
+
+    // Sides XY
+    const int3 sidexy_a0s[] = {
+        (int3){NGHOST, NGHOST, NGHOST}, //
+        (int3){NGHOST, NGHOST, nn.z},   //
+    };
+    const int3 sidexy_b0s[] = {
+        (int3){NGHOST, NGHOST, 0},             //
+        (int3){NGHOST, NGHOST, NGHOST + nn.z}, //
+    };
+
+    // Sides XZ
+    const int3 sidexz_a0s[] = {
+        (int3){NGHOST, NGHOST, NGHOST}, //
+        (int3){NGHOST, nn.y, NGHOST},   //
+    };
+    const int3 sidexz_b0s[] = {
+        (int3){NGHOST, 0, NGHOST},             //
+        (int3){NGHOST, NGHOST + nn.y, NGHOST}, //
+    };
+
+    // Sides YZ
+    const int3 sideyz_a0s[] = {
+        (int3){NGHOST, NGHOST, NGHOST}, //
+        (int3){nn.x, NGHOST, NGHOST},   //
+    };
+    const int3 sideyz_b0s[] = {
+        (int3){0, NGHOST, NGHOST},             //
+        (int3){NGHOST + nn.x, NGHOST, NGHOST}, //
+    };
+
+    acPackCommData(device, corner_a0s, &corner_data);
+    acPackCommData(device, edgex_a0s, &edgex_data);
+    acPackCommData(device, edgey_a0s, &edgey_data);
+    acPackCommData(device, edgez_a0s, &edgez_data);
+    acPackCommData(device, sidexy_a0s, &sidexy_data);
+    acPackCommData(device, sidexz_a0s, &sidexz_data);
+    acPackCommData(device, sideyz_a0s, &sideyz_data);
+
+#if MPI_GPUDIRECT_DISABLED
+    acTransferCommDataToHost(device, &corner_data);
+    acTransferCommDataToHost(device, &edgex_data);
+    acTransferCommDataToHost(device, &edgey_data);
+    acTransferCommDataToHost(device, &edgez_data);
+    acTransferCommDataToHost(device, &sidexy_data);
+    acTransferCommDataToHost(device, &sidexz_data);
+    acTransferCommDataToHost(device, &sideyz_data);
+#endif
+
+    acTransferCommData(device, corner_a0s, corner_b0s, &corner_data);
+    acTransferCommData(device, edgex_a0s, edgex_b0s, &edgex_data);
+    acTransferCommData(device, edgey_a0s, edgey_b0s, &edgey_data);
+    acTransferCommData(device, edgez_a0s, edgez_b0s, &edgez_data);
+    acTransferCommData(device, sidexy_a0s, sidexy_b0s, &sidexy_data);
+    acTransferCommData(device, sidexz_a0s, sidexz_b0s, &sidexz_data);
+    acTransferCommData(device, sideyz_a0s, sideyz_b0s, &sideyz_data);
+
+    acTransferCommDataWait(corner_data);
+    acTransferCommDataWait(edgex_data);
+    acTransferCommDataWait(edgey_data);
+    acTransferCommDataWait(edgez_data);
+    acTransferCommDataWait(sidexy_data);
+    acTransferCommDataWait(sidexz_data);
+    acTransferCommDataWait(sideyz_data);
+
+#if MPI_GPUDIRECT_DISABLED
+    acTransferCommDataToDevice(device, &corner_data);
+    acTransferCommDataToDevice(device, &edgex_data);
+    acTransferCommDataToDevice(device, &edgey_data);
+    acTransferCommDataToDevice(device, &edgez_data);
+    acTransferCommDataToDevice(device, &sidexy_data);
+    acTransferCommDataToDevice(device, &sidexz_data);
+    acTransferCommDataToDevice(device, &sideyz_data);
+#endif
+
+    acUnpackCommData(device, corner_b0s, &corner_data);
+    acUnpackCommData(device, edgex_b0s, &edgex_data);
+    acUnpackCommData(device, edgey_b0s, &edgey_data);
+    acUnpackCommData(device, edgez_b0s, &edgez_data);
+    acUnpackCommData(device, sidexy_b0s, &sidexy_data);
+    acUnpackCommData(device, sidexz_b0s, &sidexz_data);
+    acUnpackCommData(device, sideyz_b0s, &sideyz_data);
+
+    // Wait for unpacking
+    acSyncCommData(corner_data);
+    acSyncCommData(edgex_data);
+    acSyncCommData(edgey_data);
+    acSyncCommData(edgez_data);
+    acSyncCommData(sidexy_data);
+    acSyncCommData(sidexz_data);
+    acSyncCommData(sideyz_data);
+    return AC_SUCCESS;
 }
 #endif // AC_MPI_ENABLED
-
-/*
-struct grid_s {
-    Device device;
-};
-
-typedef grid_s* Grid;
-
-AcResult
-acGridInit(void)
-{
-    MPI_Init(NULL, NULL);
-
-    int nprocs, pid;
-    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
-    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
-
-    char processor_name[MPI_MAX_PROCESSOR_NAME];
-    int name_len;
-    MPI_Get_processor_name(processor_name, &name_len);
-    printf("Processor %s. Process %d of %d.\n", processor_name, pid, nprocs);
-}
-
-AcResult
-acGridLoad(const AcMesh mesh, Grid* grid)
-{
-}
-
-AcResult
-acGridStore(const Grid grid, AcMesh* mesh)
-{
-}
-
-AcResult
-acGridQuit(AcGrid& grid)
-{
-}
-*/
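
Usage sketch (illustrative only, not part of the patch): the new acGrid
interface replaces acDeviceRunMPITest() as the entry point for multi-node
runs. A minimal host program in the spirit of the removed test might look as
follows. It assumes, as the removed test did, that the full host mesh is
allocated on process 0 only and that acGridLoadMesh/acGridStoreMesh read and
write its contents on the root process alone, and that the global nn is
divisible by the process decomposition (checked by ERRCHK_ALWAYS in
acGridInit).

    #include <float.h> // FLT_EPSILON
    #include <mpi.h>

    #include "astaroth.h"

    int
    main(void)
    {
        MPI_Init(NULL, NULL); // MPI must be running before acGridInit

        int pid;
        MPI_Comm_rank(MPI_COMM_WORLD, &pid);

        AcMeshInfo info;
        acLoadConfig(AC_DEFAULT_CONFIG, &info);

        // Host mesh on the root process only; acGridLoadMesh distributes
        // it to the per-process submeshes.
        AcMesh mesh;
        if (pid == 0) {
            acMeshCreate(info, &mesh);
            acMeshRandomize(&mesh);
        }

        acGridInit(info); // one MPI process per GPU

        acGridLoadMesh(mesh, STREAM_DEFAULT);
        acGridIntegrate(STREAM_DEFAULT, FLT_EPSILON); // one full step (3 substeps)
        acGridPeriodicBoundconds(STREAM_DEFAULT);
        acGridStoreMesh(STREAM_DEFAULT, &mesh); // gathers back to process 0

        acGridQuit();

        if (pid == 0)
            acMeshDestroy(&mesh);

        MPI_Finalize(); // acGridQuit does not finalize MPI
        return 0;
    }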