From f1138b04ac503816a996454ad42aed391b31d738 Mon Sep 17 00:00:00 2001 From: jpekkila Date: Thu, 28 May 2020 16:42:50 +0300 Subject: [PATCH] Cleaned up the MPI implementation, removed all older implementations (removed also MPI window implementation which might be handy in the future when CUDA-aware support is introduced). If the removed stuff is needed later, here are some keywords to help find this commit: MPI_window, sendrecv, bidirectional, unidirectional transfer, real-time pinning, a0s, b0s. --- src/core/device.cc | 1090 +++--------------------------------- src/core/kernels/kernels.h | 12 +- 2 files changed, 67 insertions(+), 1035 deletions(-) diff --git a/src/core/device.cc b/src/core/device.cc index 322fc6d..b7295d7 100644 --- a/src/core/device.cc +++ b/src/core/device.cc @@ -574,17 +574,9 @@ acCreatePackedData(const int3 dims) const size_t bytes = dims.x * dims.y * dims.z * sizeof(data.data[0]) * NUM_VTXBUF_HANDLES; ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&data.data, bytes)); -#if AC_MPI_RT_PINNING ERRCHK_CUDA_ALWAYS(cudaMallocHost((void**)&data.data_pinned, bytes)); // ERRCHK_CUDA_ALWAYS(cudaMallocManaged((void**)&data.data_pinned, bytes)); // Significantly // slower than pinned (38 ms vs. 125 ms) -#endif // AC_MPI_RT_PINNING - -#if AC_MPI_UNIDIRECTIONAL_COMM - ERRCHK_ALWAYS(MPI_Win_create(data.data, bytes, sizeof(AcReal), MPI_INFO_NULL, MPI_COMM_WORLD, - &data.win) == MPI_SUCCESS); - MPI_Win_fence(0, data.win); -#endif // AC_MPI_UNIDIRECTIONAL_COMM return data; } @@ -592,13 +584,7 @@ acCreatePackedData(const int3 dims) static AcResult acDestroyPackedData(PackedData* data) { -#if AC_MPI_RT_PINNING cudaFree(data->data_pinned); -#endif // AC_MPI_RT_PINNING - -#if AC_MPI_UNIDIRECTIONAL_COMM - MPI_Win_free(&data->win); -#endif // AC_MPI_UNIDIRECTIONAL_COMM data->dims = (int3){-1, -1, -1}; cudaFree(data->data); @@ -619,22 +605,12 @@ acCreatePackedDataHost(const int3 dims) data.data = (AcReal*)malloc(bytes); ERRCHK_ALWAYS(data.data); -#if AC_MPI_UNIDIRECTIONAL_COMM - ERRCHK_ALWAYS(MPI_Win_create(data.data, bytes, sizeof(AcReal), MPI_INFO_NULL, MPI_COMM_WORLD, - &data.win) == MPI_SUCCESS); - MPI_Win_fence(0, data.win); -#endif // AC_MPI_UNIDIRECTIONAL_COMM - return data; } static AcResult acDestroyPackedDataHost(PackedData* data) { -#if AC_MPI_UNIDIRECTIONAL_COMM - MPI_Win_free(&data->win); -#endif // AC_MPI_UNIDIRECTIONAL_COMM - data->dims = (int3){-1, -1, -1}; free(data->data); data->data = NULL; @@ -665,7 +641,6 @@ acTransferPackedDataToDevice(const Device device, const cudaStream_t stream, con } #endif // MPI_GPUDIRECT_DISABLED -#if AC_MPI_RT_PINNING static void acPinPackedData(const Device device, const cudaStream_t stream, PackedData* ddata) { @@ -692,7 +667,6 @@ acUnpinPackedData(const Device device, const cudaStream_t stream, PackedData* dd NUM_VTXBUF_HANDLES; ERRCHK_CUDA(cudaMemcpyAsync(ddata->data, ddata->data_pinned, bytes, cudaMemcpyDefault, stream)); } -#endif // AC_MPI_RT_PINNING // TODO: do with packed data static AcResult @@ -954,12 +928,28 @@ acSyncCommData(const CommData data) cudaStreamSynchronize(data.streams[i]); } +static int3 +mod(const int3 a, const int3 n) +{ + return (int3){mod(a.x, n.x), mod(a.y, n.y), mod(a.z, n.z)}; +} + static void -acPackCommData(const Device device, const int3* a0s, CommData* data) +acPackCommData(const Device device, const int3* b0s, CommData* data) { cudaSetDevice(device->id); - for (size_t i = 0; i < data->count; ++i) - acKernelPackData(data->streams[i], device->vba, a0s[i], data->srcs[i]); + + const int3 nn = (int3){ + 
device->local_config.int_params[AC_nx], + device->local_config.int_params[AC_ny], + device->local_config.int_params[AC_nz], + }; + const int3 nghost = (int3){NGHOST, NGHOST, NGHOST}; + + for (size_t i = 0; i < data->count; ++i) { + const int3 a0 = mod(b0s[i] - nghost, nn) + nghost; + acKernelPackData(data->streams[i], device->vba, a0, data->srcs[i]); + } } static void @@ -989,7 +979,6 @@ acTransferCommDataToDevice(const Device device, CommData* data) } #endif -#if AC_MPI_RT_PINNING static inline void acPinCommData(const Device device, CommData* data) { @@ -1011,491 +1000,9 @@ acUnpinCommData(const Device device, CommData* data) for (size_t i = 0; i < data->count; ++i) acUnpinPackedData(device, data->streams[i], &data->dsts[i]); } -#endif - -#if AC_MPI_UNIDIRECTIONAL_COMM -static AcResult -acTransferCommData(const Device device, // - const int3* a0s, // Src idx inside comp. domain - const int3* b0s, // Dst idx inside bound zone - CommData* data) -{ - cudaSetDevice(device->id); - - MPI_Datatype datatype = MPI_FLOAT; - if (sizeof(AcReal) == 8) - datatype = MPI_DOUBLE; - - int nprocs, pid; - MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - MPI_Comm_rank(MPI_COMM_WORLD, &pid); - const uint3_64 decomp = decompose(nprocs); - - const int3 nn = (int3){ - device->local_config.int_params[AC_nx], - device->local_config.int_params[AC_ny], - device->local_config.int_params[AC_nz], - }; - - const int3 dims = data->dims; - const size_t blockcount = data->count; - - for (int k = -1; k <= 1; ++k) { - for (int j = -1; j <= 1; ++j) { - for (int i = -1; i <= 1; ++i) { - if (i == 0 && j == 0 && k == 0) - continue; - - for (size_t a_idx = 0; a_idx < blockcount; ++a_idx) { - for (size_t b_idx = 0; b_idx < blockcount; ++b_idx) { - const int3 neighbor = (int3){i, j, k}; - - const int3 a0 = a0s[a_idx]; - // const int3 a1 = a0 + dims; - - const int3 b0 = a0 - neighbor * nn; - // const int3 b1 = a1 - neighbor * nn; - - if (b0s[b_idx].x == b0.x && b0s[b_idx].y == b0.y && b0s[b_idx].z == b0.z) { -#if MPI_GPUDIRECT_DISABLED - MPI_Win_fence(0, data->srcs_host[a_idx].win); - MPI_Win_fence(0, data->dsts_host[b_idx].win); -#else - MPI_Win_fence(0, data->srcs[a_idx].win); - MPI_Win_fence(0, data->dsts[b_idx].win); -#endif - } - } - } - } - } - } - - for (int k = -1; k <= 1; ++k) { - for (int j = -1; j <= 1; ++j) { - for (int i = -1; i <= 1; ++i) { - if (i == 0 && j == 0 && k == 0) - continue; - - for (size_t a_idx = 0; a_idx < blockcount; ++a_idx) { - for (size_t b_idx = 0; b_idx < blockcount; ++b_idx) { - const int3 neighbor = (int3){i, j, k}; - - const int3 a0 = a0s[a_idx]; - // const int3 a1 = a0 + dims; - - const int3 b0 = a0 - neighbor * nn; - // const int3 b1 = a1 - neighbor * nn; - - if (b0s[b_idx].x == b0.x && b0s[b_idx].y == b0.y && b0s[b_idx].z == b0.z) { - - const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES; - - const int3 pid3d = getPid3D(pid, decomp); - -#if MPI_GPUDIRECT_DISABLED - - MPI_Put(data->srcs_host[a_idx].data, count, datatype, - getPid(pid3d - neighbor, decomp), 0, count, datatype, - data->dsts_host[b_idx].win); - - /* - MPI_Get(data->dsts_host[b_idx].data, count, datatype, - getPid(pid3d - neighbor, decomp), 0, count, datatype, - data->srcs_host[a_idx].win); - */ - -#else - /* - MPI_Put(data->srcs[a_idx].data, count, datatype, - getPid(pid3d - neighbor, decomp), 0, count, - datatype, data->dsts[b_idx].win); - */ - - MPI_Get(data->dsts[b_idx].data, count, datatype, - getPid(pid3d - neighbor, decomp), 0, count, datatype, - data->srcs[a_idx].win); - ERROR("CUDA-aware MPI_Put/MPI_Get not yet 
supported with UCX " - "(2020-04-02)"); -#endif - } - } - } - } - } - } - - for (int k = -1; k <= 1; ++k) { - for (int j = -1; j <= 1; ++j) { - for (int i = -1; i <= 1; ++i) { - if (i == 0 && j == 0 && k == 0) - continue; - - for (size_t a_idx = 0; a_idx < blockcount; ++a_idx) { - for (size_t b_idx = 0; b_idx < blockcount; ++b_idx) { - const int3 neighbor = (int3){i, j, k}; - - const int3 a0 = a0s[a_idx]; - // const int3 a1 = a0 + dims; - - const int3 b0 = a0 - neighbor * nn; - // const int3 b1 = a1 - neighbor * nn; - - if (b0s[b_idx].x == b0.x && b0s[b_idx].y == b0.y && b0s[b_idx].z == b0.z) { -#if MPI_GPUDIRECT_DISABLED - MPI_Win_fence(0, data->srcs_host[a_idx].win); - MPI_Win_fence(0, data->dsts_host[b_idx].win); -#else - MPI_Win_fence(0, data->srcs[a_idx].win); - MPI_Win_fence(0, data->dsts[b_idx].win); -#endif - } - } - } - } - } - } - - return AC_SUCCESS; -} - -static void -acTransferCommDataWait(const CommData data) -{ - (void)data; - // NOP -} - -#elif AC_MPI_BIDIRECTIONAL_SCHEME_A - -static int3 -mod(const int3 a, const int3 n) -{ - return (int3){mod(a.x, n.x), mod(a.y, n.y), mod(a.z, n.z)}; -} static AcResult acTransferCommData(const Device device, // - const int3* a0s, // Src idx inside comp. domain - const int3* b0s, // Dst idx inside bound zone - CommData* data) -{ - cudaSetDevice(device->id); - - MPI_Datatype datatype = MPI_FLOAT; - if (sizeof(AcReal) == 8) - datatype = MPI_DOUBLE; - - int nprocs, pid; - MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - MPI_Comm_rank(MPI_COMM_WORLD, &pid); - const uint3_64 decomp = decompose(nprocs); - - const int3 nn = (int3){ - device->local_config.int_params[AC_nx], - device->local_config.int_params[AC_ny], - device->local_config.int_params[AC_nz], - }; - - const int3 mm = (int3){ - device->local_config.int_params[AC_mx], - device->local_config.int_params[AC_my], - device->local_config.int_params[AC_mz], - }; - - const int3 dims = data->dims; - const size_t num_blocks = data->count; - - for (size_t b0_idx = 0; b0_idx < num_blocks / 2; ++b0_idx) { - const int3 b0 = b0s[b0_idx]; - const int3 nghost = (int3){NGHOST, NGHOST, NGHOST}; - const int3 a0 = mod(((b0 - nghost) + nn), nn) + nghost; - - size_t a0_idx = -1; - for (size_t i = 0; i < num_blocks; ++i) { - if (a0s[i].x == a0.x && a0s[i].y == a0.y && a0s[i].z == a0.z) { - a0_idx = i; - break; - } - } - ERRCHK_ALWAYS(a0_idx < num_blocks); // TODO debug REMOVE - - const int3 neighbor = (int3){ - a0.x < b0.x ? -1 : a0.x > b0.x ? 1 : 0, - a0.y < b0.y ? -1 : a0.y > b0.y ? 1 : 0, - a0.z < b0.z ? -1 : a0.z > b0.z ? 1 : 0, - }; - - const int3 b1 = (int3){ - neighbor.x < 0 ? a0.x - nghost.x : neighbor.x > 0 ? a0.x + nghost.x : a0.x, - neighbor.y < 0 ? a0.y - nghost.y : neighbor.y > 0 ? a0.y + nghost.y : a0.y, - neighbor.z < 0 ? a0.z - nghost.z : neighbor.z > 0 ? 
a0.z + nghost.z : a0.z, - }; - const int3 a1 = mod(((b1 + nn - nghost) + nn), nn) + nghost; - size_t a1_idx = -1; - for (size_t i = 0; i < num_blocks; ++i) { - if (a0s[i].x == a1.x && a0s[i].y == a1.y && a0s[i].z == a1.z) { - a1_idx = i; - break; - } - } - ERRCHK_ALWAYS(a1_idx < num_blocks); // TODO debug REMOVE - - size_t b1_idx = -1; - for (size_t i = 0; i < num_blocks; ++i) { - if (b0s[i].x == b1.x && b0s[i].y == b1.y && b0s[i].z == b1.z) { - b1_idx = i; - break; - } - } - ERRCHK_ALWAYS(b1_idx < num_blocks); // TODO debug REMOVE - - const int3 pid3d = getPid3D(pid, decomp); - const int3 npid3d_front = pid3d + neighbor; - const int3 npid3d_back = pid3d - neighbor; - const int npid_front = getPid(npid3d_front, decomp); - const int npid_back = getPid(npid3d_back, decomp); - const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES; - - PackedData* a0_data = &data->srcs[a0_idx]; - PackedData* a1_data = &data->srcs[a1_idx]; - PackedData* b0_data = &data->dsts[b0_idx]; - PackedData* b1_data = &data->dsts[b1_idx]; - - // Deadlock! - /* - MPI_Sendrecv(a0_data, count, datatype, npid_front, b0_idx, // - b1_data, count, datatype, npid_front, b1_idx, MPI_COMM_WORLD, - MPI_STATUS_IGNORE); - MPI_Sendrecv(a1_data, count, datatype, npid_back, b1_idx, // - b0_data, count, datatype, npid_back, b0_idx, MPI_COMM_WORLD, - MPI_STATUS_IGNORE); - */ - cudaDeviceSynchronize(); - MPI_Sendrecv(a0_data, count, datatype, npid_front, b0_idx, // - b0_data, count, datatype, npid_back, b0_idx, MPI_COMM_WORLD, - MPI_STATUS_IGNORE); - MPI_Sendrecv(a1_data, count, datatype, npid_back, b1_idx, // - b1_data, count, datatype, npid_front, b1_idx, MPI_COMM_WORLD, - MPI_STATUS_IGNORE); - - /* - const int3 pid3d = getPid3D(pid, decomp); - const int3 npid3d = pid3d + neighbor; - const int npid = getPid(npid3d, decomp); - const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES; - - PackedData* src = &data->srcs[a0_idx]; - PackedData* dst = &data->dsts[b1_idx]; - - - if (onTheSameNode(pid, npid)) { - MPI_Irecv(dst->data, count, datatype, npid, b1_idx, MPI_COMM_WORLD, - &data->recv_reqs[b1_idx]); - dst->pinned = false; - cudaStreamSynchronize(data->streams[a0_idx]); - MPI_Isend(src->data, count, datatype, npid, b0_idx, MPI_COMM_WORLD, - &data->send_reqs[b0_idx]); - } - else { - MPI_Irecv(dst->data_pinned, count, datatype, npid, b1_idx, MPI_COMM_WORLD, - &data->recv_reqs[b1_idx]); - dst->pinned = true; - - if (!src->pinned) { - acPinPackedData(device, data->streams[a0_idx], src); - cudaStreamSynchronize(data->streams[a0_idx]); - } - MPI_Isend(src->data_pinned, count, datatype, npid, b0_idx, MPI_COMM_WORLD, - &data->send_reqs[b0_idx]); - } - */ - - /* - const int3 neighbor = (int3){ - a0.x < b0.x ? a0.x - nghost.x : a0.x > b0.x ? a0.x + nghost.x : a0.x, - a0.y < b0.y ? a0.y - nghost.y : a0.y > b0.y ? a0.y + nghost.y : a0.y, - a0.z < b0.z ? a0.z - nghost.z : a0.z > b0.z ? a0.z + nghost.z : a0.z, - };*/ - - printf("a0 -> b0: (%d, %d, %d) -> (%d, %d, %d)\n", a0.x, a0.y, a0.z, b0.x, b0.y, b0.z); - printf("b1: (%d, %d, %d)\n", b1.x, b1.y, b1.z); - printf("neighbor: (%d, %d, %d)\n", neighbor.x, neighbor.y, neighbor.z); - - /* - const int3 b1 = (int3){ - a0.x < b0.x ? a0.x - nghost.x : a0.x > b0.x ? a0.x + nghost.x : a0.x, - a0.y < b0.y ? a0.y - nghost.y : a0.y > b0.y ? a0.y + nghost.y : a0.y, - a0.z < b0.z ? a0.z - nghost.z : a0.z > b0.z ? 
a0.z + nghost.z : a0.z, - }; - const int3 a1 = mod(((b1 - nghost) + nn), nn) + nghost; - - printf("b0, a0: (%d, %d, %d) -> (%d, %d, %d)\n", b0.x, b0.y, b0.z, a0.x, a0.y, a0.z); - printf("b1, a1: (%d, %d, %d) -> (%d, %d, %d)\n\n", b1.x, b1.y, b1.z, a1.x, a1.y, a1.z); - */ - } - - return AC_SUCCESS; -} - -static void -acTransferCommDataWait(const CommData data) -{ - for (size_t i = 0; i < data.count; ++i) { - MPI_Wait(&data.send_reqs[i], MPI_STATUS_IGNORE); - MPI_Wait(&data.recv_reqs[i], MPI_STATUS_IGNORE); - } -} - -#elif AC_MPI_BIDIRECTIONAL_SCHEME_B -static AcResult -acTransferCommData(const Device device, // - const int3* a0s, // Src idx inside comp. domain - const int3* b0s, // Dst idx inside bound zone - CommData* data) -{ - cudaSetDevice(device->id); - - MPI_Datatype datatype = MPI_FLOAT; - if (sizeof(AcReal) == 8) - datatype = MPI_DOUBLE; - - int nprocs, pid; - MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - MPI_Comm_rank(MPI_COMM_WORLD, &pid); - const uint3_64 decomp = decompose(nprocs); - - const int3 nn = (int3){ - device->local_config.int_params[AC_nx], - device->local_config.int_params[AC_ny], - device->local_config.int_params[AC_nz], - }; - - const int3 dims = data->dims; - const size_t blockcount = data->count; - - for (int k = -1; k <= 1; ++k) { - for (int j = -1; j <= 1; ++j) { - for (int i = -1; i <= 1; ++i) { - if (i == 0 && j == 0 && k == 0) - continue; - - for (size_t a_idx = 0; a_idx < blockcount; ++a_idx) { - for (size_t b_idx = 0; b_idx < blockcount; ++b_idx) { - const int3 neighbor = (int3){i, j, k}; - - const int3 a0 = a0s[a_idx]; - // const int3 a1 = a0 + dims; - - const int3 b0 = a0 - neighbor * nn; - // const int3 b1 = a1 - neighbor * nn; - - if (b0s[b_idx].x == b0.x && b0s[b_idx].y == b0.y && b0s[b_idx].z == b0.z) { - - const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES; - -#if MPI_GPUDIRECT_DISABLED - PackedData* src = &data->srcs_host[a_idx]; - PackedData* dst = &data->dsts_host[b_idx]; -#else - PackedData* src = &data->srcs[a_idx]; - PackedData* dst = &data->dsts[b_idx]; -#endif - - const int3 pid3d = getPid3D(pid, decomp); - const int npid_front = getPid(pid3d + neighbor, decomp); - const int npid_back = getPid(pid3d - neighbor, decomp); - - dst->pinned = false; - - if (onTheSameNode(pid, npid_back)) { - MPI_Irecv(dst->data, count, datatype, npid_back, b_idx, - MPI_COMM_WORLD, &data->recv_reqs[b_idx]); - dst->pinned = false; - cudaStreamSynchronize(data->streams[a_idx]); - MPI_Isend(src->data, count, datatype, npid_front, b_idx, - MPI_COMM_WORLD, &data->send_reqs[b_idx]); - } - else { - MPI_Irecv(dst->data_pinned, count, datatype, npid_back, b_idx, - MPI_COMM_WORLD, &data->recv_reqs[b_idx]); - dst->pinned = true; - cudaStreamSynchronize(data->streams[a_idx]); - if (!src->pinned) { - acPinPackedData(device, data->streams[a_idx], src); - cudaStreamSynchronize(data->streams[a_idx]); - } - MPI_Isend(src->data_pinned, count, datatype, npid_front, b_idx, - MPI_COMM_WORLD, &data->send_reqs[b_idx]); - } - /* - cudaStreamSynchronize(data->streams[a_idx]); - MPI_Status status; - MPI_Sendrecv(src->data, count, datatype, npid_front, b_idx, // - dst->data, count, datatype, npid_back, b_idx, // - MPI_COMM_WORLD, &status); - */ - - /* - const int npid_back = getPid(pid3d - neighbor, decomp); - - if (onTheSameNode(pid, npid_back)) { - MPI_Irecv(dst->data, count, datatype, npid_back, b_idx, - MPI_COMM_WORLD, &data->recv_reqs[b_idx]); - dst->pinned = false; - } - else { - MPI_Irecv(dst->data_pinned, count, datatype, npid_back, b_idx, - MPI_COMM_WORLD, 
&data->recv_reqs[b_idx]); - dst->pinned = true; - } - - const int npid_front = getPid(pid3d + neighbor, decomp); - - cudaStreamSynchronize(data->streams[a_idx]); - if (onTheSameNode(pid, npid_front)) { - MPI_Isend(src->data, count, datatype, npid_front, b_idx, - MPI_COMM_WORLD, &data->send_reqs[b_idx]); - } - else { - if (!src->pinned) { - acPinPackedData(device, data->streams[a_idx], src); - cudaStreamSynchronize(data->streams[a_idx]); - } - MPI_Isend(src->data_pinned, count, datatype, npid_front, b_idx, - MPI_COMM_WORLD, &data->send_reqs[b_idx]); - } - */ - } - } - } - } - } - } - - return AC_SUCCESS; -} - -static void -acTransferCommDataWait(const CommData data) -{ - - for (size_t i = 0; i < data.count; ++i) { - MPI_Wait(&data.send_reqs[i], MPI_STATUS_IGNORE); - MPI_Wait(&data.recv_reqs[i], MPI_STATUS_IGNORE); - } -} - -#elif AC_MPI_RT_PINNING_IMPROVED -static int3 -mod(const int3 a, const int3 n) -{ - return (int3){mod(a.x, n.x), mod(a.y, n.y), mod(a.z, n.z)}; -} - -static AcResult -acTransferCommData(const Device device, // - const int3* a0s, // Src idx inside comp. domain const int3* b0s, // Dst idx inside bound zone CommData* data) { @@ -1554,29 +1061,17 @@ acTransferCommData(const Device device, // }; const int npid = getPid(pid3d - neighbor, decomp); - const int3 a0 = mod(b0 - nghost, nn) + nghost; - - // Not needed if there's a 1-to-1 mapping from b -> a - size_t a0_idx = -1; - for (size_t i = 0; i < blockcount; ++i) { - if (a0s[i].x == a0.x && a0s[i].y == a0.y && a0s[i].z == a0.z) { - a0_idx = i; - break; - } - } - ERRCHK(a0_idx < blockcount); - - PackedData* src = &data->srcs[a0_idx]; + PackedData* src = &data->srcs[b0_idx]; if (onTheSameNode(pid, npid)) { - cudaStreamSynchronize(data->streams[a0_idx]); + cudaStreamSynchronize(data->streams[b0_idx]); MPI_Isend(src->data, count, datatype, npid, b0_idx, // MPI_COMM_WORLD, &data->send_reqs[b0_idx]); } else { - acPinPackedData(device, data->streams[a0_idx], src); - cudaStreamSynchronize(data->streams[a0_idx]); - MPI_Isend(src->data_pinned, count, datatype, npid, b0_idx, MPI_COMM_WORLD, - &data->send_reqs[b0_idx]); + acPinPackedData(device, data->streams[b0_idx], src); + cudaStreamSynchronize(data->streams[b0_idx]); + MPI_Isend(src->data_pinned, count, datatype, npid, b0_idx, // + MPI_COMM_WORLD, &data->send_reqs[b0_idx]); src->pinned = true; } } @@ -1593,257 +1088,6 @@ acTransferCommDataWait(const CommData data) } } -#elif AC_MPI_RT_PINNING -static AcResult -acTransferCommData(const Device device, // - const int3* a0s, // Src idx inside comp. 
domain - const int3* b0s, // Dst idx inside bound zone - CommData* data) -{ - cudaSetDevice(device->id); - - MPI_Datatype datatype = MPI_FLOAT; - if (sizeof(AcReal) == 8) - datatype = MPI_DOUBLE; - - int nprocs, pid; - MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - MPI_Comm_rank(MPI_COMM_WORLD, &pid); - const uint3_64 decomp = decompose(nprocs); - - const int3 nn = (int3){ - device->local_config.int_params[AC_nx], - device->local_config.int_params[AC_ny], - device->local_config.int_params[AC_nz], - }; - - const int3 dims = data->dims; - const size_t blockcount = data->count; - - for (int k = -1; k <= 1; ++k) { - for (int j = -1; j <= 1; ++j) { - for (int i = -1; i <= 1; ++i) { - if (i == 0 && j == 0 && k == 0) - continue; - - for (size_t a_idx = 0; a_idx < blockcount; ++a_idx) { - for (size_t b_idx = 0; b_idx < blockcount; ++b_idx) { - const int3 neighbor = (int3){i, j, k}; - - const int3 a0 = a0s[a_idx]; - // const int3 a1 = a0 + dims; - - const int3 b0 = a0 - neighbor * nn; - // const int3 b1 = a1 - neighbor * nn; - - if (b0s[b_idx].x == b0.x && b0s[b_idx].y == b0.y && b0s[b_idx].z == b0.z) { - - const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES; - -#if MPI_GPUDIRECT_DISABLED - PackedData* dst = &data->dsts_host[b_idx]; -#else - PackedData* dst = &data->dsts[b_idx]; -#endif - - const int3 pid3d = getPid3D(pid, decomp); - const int npid = getPid(pid3d - neighbor, decomp); - - if (onTheSameNode(pid, npid)) { - MPI_Irecv(dst->data, count, datatype, npid, b_idx, MPI_COMM_WORLD, - &data->recv_reqs[b_idx]); - dst->pinned = false; - } - else { - MPI_Irecv(dst->data_pinned, count, datatype, npid, b_idx, - MPI_COMM_WORLD, &data->recv_reqs[b_idx]); - dst->pinned = true; - } - } - } - } - } - } - } - - for (int k = -1; k <= 1; ++k) { - for (int j = -1; j <= 1; ++j) { - for (int i = -1; i <= 1; ++i) { - if (i == 0 && j == 0 && k == 0) - continue; - - for (size_t a_idx = 0; a_idx < blockcount; ++a_idx) { - for (size_t b_idx = 0; b_idx < blockcount; ++b_idx) { - const int3 neighbor = (int3){i, j, k}; - - const int3 a0 = a0s[a_idx]; - // const int3 a1 = a0 + dims; - - const int3 b0 = a0 - neighbor * nn; - // const int3 b1 = a1 - neighbor * nn; - - if (b0s[b_idx].x == b0.x && b0s[b_idx].y == b0.y && b0s[b_idx].z == b0.z) { - - const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES; - -#if MPI_GPUDIRECT_DISABLED - PackedData* src = &data->srcs_host[a_idx]; -#else - PackedData* src = &data->srcs[a_idx]; -#endif - - const int3 pid3d = getPid3D(pid, decomp); - const int npid = getPid(pid3d + neighbor, decomp); - - cudaStreamSynchronize(data->streams[a_idx]); - if (onTheSameNode(pid, npid)) { - MPI_Isend(src->data, count, datatype, npid, b_idx, MPI_COMM_WORLD, - &data->send_reqs[b_idx]); - } - else { - if (!src->pinned) { - acPinPackedData(device, data->streams[a_idx], src); - cudaStreamSynchronize(data->streams[a_idx]); - src->pinned = true; - } - MPI_Isend(src->data_pinned, count, datatype, npid, b_idx, - MPI_COMM_WORLD, &data->send_reqs[b_idx]); - } - } - } - } - } - } - } - - return AC_SUCCESS; -} - -static void -acTransferCommDataWait(const CommData data) -{ - for (size_t i = 0; i < data.count; ++i) { - MPI_Wait(&data.send_reqs[i], MPI_STATUS_IGNORE); - MPI_Wait(&data.recv_reqs[i], MPI_STATUS_IGNORE); - } -} -#else -static AcResult -acTransferCommData(const Device device, // - const int3* a0s, // Src idx inside comp. 
domain - const int3* b0s, // Dst idx inside bound zone - CommData* data) -{ - cudaSetDevice(device->id); - - MPI_Datatype datatype = MPI_FLOAT; - if (sizeof(AcReal) == 8) - datatype = MPI_DOUBLE; - - int nprocs, pid; - MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - MPI_Comm_rank(MPI_COMM_WORLD, &pid); - const uint3_64 decomp = decompose(nprocs); - - const int3 nn = (int3){ - device->local_config.int_params[AC_nx], - device->local_config.int_params[AC_ny], - device->local_config.int_params[AC_nz], - }; - - const int3 dims = data->dims; - const size_t blockcount = data->count; - - for (int k = -1; k <= 1; ++k) { - for (int j = -1; j <= 1; ++j) { - for (int i = -1; i <= 1; ++i) { - if (i == 0 && j == 0 && k == 0) - continue; - - for (size_t a_idx = 0; a_idx < blockcount; ++a_idx) { - for (size_t b_idx = 0; b_idx < blockcount; ++b_idx) { - const int3 neighbor = (int3){i, j, k}; - - const int3 a0 = a0s[a_idx]; - // const int3 a1 = a0 + dims; - - const int3 b0 = a0 - neighbor * nn; - // const int3 b1 = a1 - neighbor * nn; - - if (b0s[b_idx].x == b0.x && b0s[b_idx].y == b0.y && b0s[b_idx].z == b0.z) { - - const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES; - -#if MPI_GPUDIRECT_DISABLED - PackedData* dst = &data->dsts_host[b_idx]; -#else - PackedData* dst = &data->dsts[b_idx]; -#endif - - const int3 pid3d = getPid3D(pid, decomp); - const int npid = getPid(pid3d - neighbor, decomp); - - MPI_Irecv(dst->data, count, datatype, npid, b_idx, MPI_COMM_WORLD, - &data->recv_reqs[b_idx]); - } - } - } - } - } - } - - for (int k = -1; k <= 1; ++k) { - for (int j = -1; j <= 1; ++j) { - for (int i = -1; i <= 1; ++i) { - if (i == 0 && j == 0 && k == 0) - continue; - - for (size_t a_idx = 0; a_idx < blockcount; ++a_idx) { - for (size_t b_idx = 0; b_idx < blockcount; ++b_idx) { - const int3 neighbor = (int3){i, j, k}; - - const int3 a0 = a0s[a_idx]; - // const int3 a1 = a0 + dims; - - const int3 b0 = a0 - neighbor * nn; - // const int3 b1 = a1 - neighbor * nn; - - if (b0s[b_idx].x == b0.x && b0s[b_idx].y == b0.y && b0s[b_idx].z == b0.z) { - - const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES; - -#if MPI_GPUDIRECT_DISABLED - PackedData* src = &data->srcs_host[a_idx]; -#else - PackedData* src = &data->srcs[a_idx]; -#endif - - const int3 pid3d = getPid3D(pid, decomp); - const int npid = getPid(pid3d + neighbor, decomp); - - cudaStreamSynchronize(data->streams[a_idx]); - MPI_Isend(src->data, count, datatype, npid, b_idx, MPI_COMM_WORLD, - &data->send_reqs[b_idx]); - } - } - } - } - } - } - - return AC_SUCCESS; -} - -static void -acTransferCommDataWait(const CommData data) -{ - for (size_t i = 0; i < data.count; ++i) { - MPI_Wait(&data.send_reqs[i], MPI_STATUS_IGNORE); - MPI_Wait(&data.recv_reqs[i], MPI_STATUS_IGNORE); - } -} -#endif // AC_MPI_UNIDIRECTIONAL_COMM - typedef struct { Device device; AcMesh submesh; @@ -1937,79 +1181,19 @@ acGridInit(const AcMeshInfo info) device->local_config.int_params[AC_nz], }; - // Corners - const int3 corner_a0s[] = { - (int3){NGHOST, NGHOST, NGHOST}, // - (int3){nn.x, NGHOST, NGHOST}, // - (int3){NGHOST, nn.y, NGHOST}, // - (int3){nn.x, nn.y, NGHOST}, // - - (int3){NGHOST, NGHOST, nn.z}, // - (int3){nn.x, NGHOST, nn.z}, // - (int3){NGHOST, nn.y, nn.z}, // - (int3){nn.x, nn.y, nn.z}, - }; - - // Edges X - const int3 edgex_a0s[] = { - (int3){NGHOST, NGHOST, NGHOST}, // - (int3){NGHOST, nn.y, NGHOST}, // - - (int3){NGHOST, NGHOST, nn.z}, // - (int3){NGHOST, nn.y, nn.z}, // - }; - - // Edges Y - const int3 edgey_a0s[] = { - (int3){NGHOST, NGHOST, NGHOST}, // 
- (int3){nn.x, NGHOST, NGHOST}, // - - (int3){NGHOST, NGHOST, nn.z}, // - (int3){nn.x, NGHOST, nn.z}, // - }; - - // Edges Z - const int3 edgez_a0s[] = { - (int3){NGHOST, NGHOST, NGHOST}, // - (int3){nn.x, NGHOST, NGHOST}, // - - (int3){NGHOST, nn.y, NGHOST}, // - (int3){nn.x, nn.y, NGHOST}, // - }; - - // Sides XY - const int3 sidexy_a0s[] = { - (int3){NGHOST, NGHOST, NGHOST}, // - (int3){NGHOST, NGHOST, nn.z}, // - }; - - // Sides XZ - const int3 sidexz_a0s[] = { - (int3){NGHOST, NGHOST, NGHOST}, // - (int3){NGHOST, nn.y, NGHOST}, // - }; - - // Sides YZ - const int3 sideyz_a0s[] = { - (int3){NGHOST, NGHOST, NGHOST}, // - (int3){nn.x, NGHOST, NGHOST}, // - }; - - const int3 corner_dims = (int3){NGHOST, NGHOST, NGHOST}; - const int3 edgex_dims = (int3){nn.x, NGHOST, NGHOST}; - const int3 edgey_dims = (int3){NGHOST, nn.y, NGHOST}; - const int3 edgez_dims = (int3){NGHOST, NGHOST, nn.z}; - const int3 sidexy_dims = (int3){nn.x, nn.y, NGHOST}; - const int3 sidexz_dims = (int3){nn.x, NGHOST, nn.z}; - const int3 sideyz_dims = (int3){NGHOST, nn.y, nn.z}; - grid.nn = nn; - grid.corner_data = acCreateCommData(device, corner_dims, ARRAY_SIZE(corner_a0s)); - grid.edgex_data = acCreateCommData(device, edgex_dims, ARRAY_SIZE(edgex_a0s)); - grid.edgey_data = acCreateCommData(device, edgey_dims, ARRAY_SIZE(edgey_a0s)); - grid.edgez_data = acCreateCommData(device, edgez_dims, ARRAY_SIZE(edgez_a0s)); - grid.sidexy_data = acCreateCommData(device, sidexy_dims, ARRAY_SIZE(sidexy_a0s)); - grid.sidexz_data = acCreateCommData(device, sidexz_dims, ARRAY_SIZE(sidexz_a0s)); - grid.sideyz_data = acCreateCommData(device, sideyz_dims, ARRAY_SIZE(sideyz_a0s)); + // Create CommData + // We have 8 corners, 12 edges, and 6 sides + // + // For simplicity's sake all data blocks inside a single CommData struct + // have the same dimensions. 
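+    //
+    // The block counts below (8 corner blocks, 4 blocks per edge direction,
+    // 2 blocks per side direction) match the b0 offset lists used at the
+    // pack/transfer call sites; the corresponding interior offsets are
+    // derived there with a periodic mod(), so only the b0 lists are needed.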
+ grid.nn = nn; + grid.corner_data = acCreateCommData(device, (int3){NGHOST, NGHOST, NGHOST}, 8); + grid.edgex_data = acCreateCommData(device, (int3){nn.x, NGHOST, NGHOST}, 4); + grid.edgey_data = acCreateCommData(device, (int3){NGHOST, nn.y, NGHOST}, 4); + grid.edgez_data = acCreateCommData(device, (int3){NGHOST, NGHOST, nn.z}, 4); + grid.sidexy_data = acCreateCommData(device, (int3){nn.x, nn.y, NGHOST}, 2); + grid.sidexz_data = acCreateCommData(device, (int3){nn.x, NGHOST, nn.z}, 2); + grid.sideyz_data = acCreateCommData(device, (int3){NGHOST, nn.y, nn.z}, 2); acGridSynchronizeStream(STREAM_ALL); return AC_SUCCESS; @@ -2083,30 +1267,6 @@ acGridIntegrate(const Stream stream, const AcReal dt) acDeviceSynchronizeStream(device, stream); // Corners - const int3 corner_a0s[] = { - (int3){NGHOST, NGHOST, NGHOST}, // - (int3){nn.x, NGHOST, NGHOST}, // - (int3){NGHOST, nn.y, NGHOST}, // - (int3){nn.x, nn.y, NGHOST}, // - - (int3){NGHOST, NGHOST, nn.z}, // - (int3){nn.x, NGHOST, nn.z}, // - (int3){NGHOST, nn.y, nn.z}, // - (int3){nn.x, nn.y, nn.z}, - }; - /* - const int3 corner_b0s[] = { - (int3){0, 0, 0}, - (int3){NGHOST + nn.x, 0, 0}, - (int3){0, NGHOST + nn.y, 0}, - (int3){NGHOST + nn.x, NGHOST + nn.y, 0}, - - (int3){0, 0, NGHOST + nn.z}, - (int3){NGHOST + nn.x, 0, NGHOST + nn.z}, - (int3){0, NGHOST + nn.y, NGHOST + nn.z}, - (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST + nn.z}, - }; - */ const int3 corner_b0s[] = { (int3){0, 0, 0}, (int3){NGHOST + nn.x, 0, 0}, @@ -2120,13 +1280,6 @@ acGridIntegrate(const Stream stream, const AcReal dt) }; // Edges X - const int3 edgex_a0s[] = { - (int3){NGHOST, NGHOST, NGHOST}, // - (int3){NGHOST, nn.y, NGHOST}, // - - (int3){NGHOST, NGHOST, nn.z}, // - (int3){NGHOST, nn.y, nn.z}, // - }; const int3 edgex_b0s[] = { (int3){NGHOST, 0, 0}, (int3){NGHOST, NGHOST + nn.y, 0}, @@ -2136,13 +1289,6 @@ acGridIntegrate(const Stream stream, const AcReal dt) }; // Edges Y - const int3 edgey_a0s[] = { - (int3){NGHOST, NGHOST, NGHOST}, // - (int3){nn.x, NGHOST, NGHOST}, // - - (int3){NGHOST, NGHOST, nn.z}, // - (int3){nn.x, NGHOST, nn.z}, // - }; const int3 edgey_b0s[] = { (int3){0, NGHOST, 0}, (int3){NGHOST + nn.x, NGHOST, 0}, @@ -2152,13 +1298,6 @@ acGridIntegrate(const Stream stream, const AcReal dt) }; // Edges Z - const int3 edgez_a0s[] = { - (int3){NGHOST, NGHOST, NGHOST}, // - (int3){nn.x, NGHOST, NGHOST}, // - - (int3){NGHOST, nn.y, NGHOST}, // - (int3){nn.x, nn.y, NGHOST}, // - }; const int3 edgez_b0s[] = { (int3){0, 0, NGHOST}, (int3){NGHOST + nn.x, 0, NGHOST}, @@ -2168,55 +1307,31 @@ acGridIntegrate(const Stream stream, const AcReal dt) }; // Sides XY - const int3 sidexy_a0s[] = { - (int3){NGHOST, NGHOST, NGHOST}, // - (int3){NGHOST, NGHOST, nn.z}, // - }; const int3 sidexy_b0s[] = { (int3){NGHOST, NGHOST, 0}, // (int3){NGHOST, NGHOST, NGHOST + nn.z}, // }; // Sides XZ - const int3 sidexz_a0s[] = { - (int3){NGHOST, NGHOST, NGHOST}, // - (int3){NGHOST, nn.y, NGHOST}, // - }; const int3 sidexz_b0s[] = { (int3){NGHOST, 0, NGHOST}, // (int3){NGHOST, NGHOST + nn.y, NGHOST}, // }; // Sides YZ - const int3 sideyz_a0s[] = { - (int3){NGHOST, NGHOST, NGHOST}, // - (int3){nn.x, NGHOST, NGHOST}, // - }; const int3 sideyz_b0s[] = { (int3){0, NGHOST, NGHOST}, // (int3){NGHOST + nn.x, NGHOST, NGHOST}, // }; for (int isubstep = 0; isubstep < 3; ++isubstep) { - // acPackCommData(device, corner_a0s, &corner_data); - acPackCommData(device, edgex_a0s, &edgex_data); - acPackCommData(device, edgey_a0s, &edgey_data); - acPackCommData(device, edgez_a0s, &edgez_data); - 
acPackCommData(device, sidexy_a0s, &sidexy_data); - acPackCommData(device, sidexz_a0s, &sidexz_data); - acPackCommData(device, sideyz_a0s, &sideyz_data); - - /* - #if AC_MPI_RT_PINNING - acPinCommData(device, &corner_data); - acPinCommData(device, &edgex_data); - acPinCommData(device, &edgey_data); - acPinCommData(device, &edgez_data); - acPinCommData(device, &sidexy_data); - acPinCommData(device, &sidexz_data); - acPinCommData(device, &sideyz_data); - #endif - */ + // acPackCommData(device, corner_b0s, &corner_data); + acPackCommData(device, edgex_b0s, &edgex_data); + acPackCommData(device, edgey_b0s, &edgey_data); + acPackCommData(device, edgez_b0s, &edgez_data); + acPackCommData(device, sidexy_b0s, &sidexy_data); + acPackCommData(device, sidexz_b0s, &sidexz_data); + acPackCommData(device, sideyz_b0s, &sideyz_data); //////////// INNER INTEGRATION ////////////// { @@ -2238,13 +1353,13 @@ acGridIntegrate(const Stream stream, const AcReal dt) acTransferCommDataToHost(device, &sideyz_data); #endif - // acTransferCommData(device, corner_a0s, corner_b0s, &corner_data); - acTransferCommData(device, edgex_a0s, edgex_b0s, &edgex_data); - acTransferCommData(device, edgey_a0s, edgey_b0s, &edgey_data); - acTransferCommData(device, edgez_a0s, edgez_b0s, &edgez_data); - acTransferCommData(device, sidexy_a0s, sidexy_b0s, &sidexy_data); - acTransferCommData(device, sidexz_a0s, sidexz_b0s, &sidexz_data); - acTransferCommData(device, sideyz_a0s, sideyz_b0s, &sideyz_data); + // acTransferCommData(device, corner_b0s, &corner_data); + acTransferCommData(device, edgex_b0s, &edgex_data); + acTransferCommData(device, edgey_b0s, &edgey_data); + acTransferCommData(device, edgez_b0s, &edgez_data); + acTransferCommData(device, sidexy_b0s, &sidexy_data); + acTransferCommData(device, sidexz_b0s, &sidexz_data); + acTransferCommData(device, sideyz_b0s, &sideyz_data); // acTransferCommDataWait(corner_data); acTransferCommDataWait(edgex_data); @@ -2264,7 +1379,6 @@ acGridIntegrate(const Stream stream, const AcReal dt) acTransferCommDataToDevice(device, &sideyz_data); #endif -#if AC_MPI_RT_PINNING // acUnpinCommData(device, &corner_data); acUnpinCommData(device, &edgex_data); acUnpinCommData(device, &edgey_data); @@ -2272,7 +1386,6 @@ acGridIntegrate(const Stream stream, const AcReal dt) acUnpinCommData(device, &sidexy_data); acUnpinCommData(device, &sidexz_data); acUnpinCommData(device, &sideyz_data); -#endif // acUnpackCommData(device, corner_b0s, &corner_data); acUnpackCommData(device, edgex_b0s, &edgex_data); @@ -2346,30 +1459,6 @@ acGridPeriodicBoundconds(const Stream stream) CommData sideyz_data = grid.sideyz_data; // Corners - const int3 corner_a0s[] = { - (int3){NGHOST, NGHOST, NGHOST}, // - (int3){nn.x, NGHOST, NGHOST}, // - (int3){NGHOST, nn.y, NGHOST}, // - (int3){nn.x, nn.y, NGHOST}, // - - (int3){NGHOST, NGHOST, nn.z}, // - (int3){nn.x, NGHOST, nn.z}, // - (int3){NGHOST, nn.y, nn.z}, // - (int3){nn.x, nn.y, nn.z}, - }; - /* - const int3 corner_b0s[] = { - (int3){0, 0, 0}, - (int3){NGHOST + nn.x, 0, 0}, - (int3){0, NGHOST + nn.y, 0}, - (int3){NGHOST + nn.x, NGHOST + nn.y, 0}, - - (int3){0, 0, NGHOST + nn.z}, - (int3){NGHOST + nn.x, 0, NGHOST + nn.z}, - (int3){0, NGHOST + nn.y, NGHOST + nn.z}, - (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST + nn.z}, - }; - */ const int3 corner_b0s[] = { (int3){0, 0, 0}, (int3){NGHOST + nn.x, 0, 0}, @@ -2383,13 +1472,6 @@ acGridPeriodicBoundconds(const Stream stream) }; // Edges X - const int3 edgex_a0s[] = { - (int3){NGHOST, NGHOST, NGHOST}, // - (int3){NGHOST, nn.y, NGHOST}, // 
- - (int3){NGHOST, NGHOST, nn.z}, // - (int3){NGHOST, nn.y, nn.z}, // - }; const int3 edgex_b0s[] = { (int3){NGHOST, 0, 0}, (int3){NGHOST, NGHOST + nn.y, 0}, @@ -2399,13 +1481,6 @@ acGridPeriodicBoundconds(const Stream stream) }; // Edges Y - const int3 edgey_a0s[] = { - (int3){NGHOST, NGHOST, NGHOST}, // - (int3){nn.x, NGHOST, NGHOST}, // - - (int3){NGHOST, NGHOST, nn.z}, // - (int3){nn.x, NGHOST, nn.z}, // - }; const int3 edgey_b0s[] = { (int3){0, NGHOST, 0}, (int3){NGHOST + nn.x, NGHOST, 0}, @@ -2415,13 +1490,6 @@ acGridPeriodicBoundconds(const Stream stream) }; // Edges Z - const int3 edgez_a0s[] = { - (int3){NGHOST, NGHOST, NGHOST}, // - (int3){nn.x, NGHOST, NGHOST}, // - - (int3){NGHOST, nn.y, NGHOST}, // - (int3){nn.x, nn.y, NGHOST}, // - }; const int3 edgez_b0s[] = { (int3){0, 0, NGHOST}, (int3){NGHOST + nn.x, 0, NGHOST}, @@ -2431,54 +1499,30 @@ acGridPeriodicBoundconds(const Stream stream) }; // Sides XY - const int3 sidexy_a0s[] = { - (int3){NGHOST, NGHOST, NGHOST}, // - (int3){NGHOST, NGHOST, nn.z}, // - }; const int3 sidexy_b0s[] = { (int3){NGHOST, NGHOST, 0}, // (int3){NGHOST, NGHOST, NGHOST + nn.z}, // }; // Sides XZ - const int3 sidexz_a0s[] = { - (int3){NGHOST, NGHOST, NGHOST}, // - (int3){NGHOST, nn.y, NGHOST}, // - }; const int3 sidexz_b0s[] = { (int3){NGHOST, 0, NGHOST}, // (int3){NGHOST, NGHOST + nn.y, NGHOST}, // }; // Sides YZ - const int3 sideyz_a0s[] = { - (int3){NGHOST, NGHOST, NGHOST}, // - (int3){nn.x, NGHOST, NGHOST}, // - }; const int3 sideyz_b0s[] = { (int3){0, NGHOST, NGHOST}, // (int3){NGHOST + nn.x, NGHOST, NGHOST}, // }; - acPackCommData(device, corner_a0s, &corner_data); - acPackCommData(device, edgex_a0s, &edgex_data); - acPackCommData(device, edgey_a0s, &edgey_data); - acPackCommData(device, edgez_a0s, &edgez_data); - acPackCommData(device, sidexy_a0s, &sidexy_data); - acPackCommData(device, sidexz_a0s, &sidexz_data); - acPackCommData(device, sideyz_a0s, &sideyz_data); - - /* - #if AC_MPI_RT_PINNING - acPinCommData(device, &corner_data); - acPinCommData(device, &edgex_data); - acPinCommData(device, &edgey_data); - acPinCommData(device, &edgez_data); - acPinCommData(device, &sidexy_data); - acPinCommData(device, &sidexz_data); - acPinCommData(device, &sideyz_data); - #endif - */ + acPackCommData(device, corner_b0s, &corner_data); + acPackCommData(device, edgex_b0s, &edgex_data); + acPackCommData(device, edgey_b0s, &edgey_data); + acPackCommData(device, edgez_b0s, &edgez_data); + acPackCommData(device, sidexy_b0s, &sidexy_data); + acPackCommData(device, sidexz_b0s, &sidexz_data); + acPackCommData(device, sideyz_b0s, &sideyz_data); MPI_Barrier(MPI_COMM_WORLD); @@ -2492,13 +1536,13 @@ acGridPeriodicBoundconds(const Stream stream) acTransferCommDataToHost(device, &sideyz_data); #endif - acTransferCommData(device, corner_a0s, corner_b0s, &corner_data); - acTransferCommData(device, edgex_a0s, edgex_b0s, &edgex_data); - acTransferCommData(device, edgey_a0s, edgey_b0s, &edgey_data); - acTransferCommData(device, edgez_a0s, edgez_b0s, &edgez_data); - acTransferCommData(device, sidexy_a0s, sidexy_b0s, &sidexy_data); - acTransferCommData(device, sidexz_a0s, sidexz_b0s, &sidexz_data); - acTransferCommData(device, sideyz_a0s, sideyz_b0s, &sideyz_data); + acTransferCommData(device, corner_b0s, &corner_data); + acTransferCommData(device, edgex_b0s, &edgex_data); + acTransferCommData(device, edgey_b0s, &edgey_data); + acTransferCommData(device, edgez_b0s, &edgez_data); + acTransferCommData(device, sidexy_b0s, &sidexy_data); + acTransferCommData(device, sidexz_b0s, 
&sidexz_data); + acTransferCommData(device, sideyz_b0s, &sideyz_data); acTransferCommDataWait(corner_data); acTransferCommDataWait(edgex_data); @@ -2518,7 +1562,6 @@ acGridPeriodicBoundconds(const Stream stream) acTransferCommDataToDevice(device, &sideyz_data); #endif -#if AC_MPI_RT_PINNING acUnpinCommData(device, &corner_data); acUnpinCommData(device, &edgex_data); acUnpinCommData(device, &edgey_data); @@ -2526,7 +1569,6 @@ acGridPeriodicBoundconds(const Stream stream) acUnpinCommData(device, &sidexy_data); acUnpinCommData(device, &sidexz_data); acUnpinCommData(device, &sideyz_data); -#endif acUnpackCommData(device, corner_b0s, &corner_data); acUnpackCommData(device, edgex_b0s, &edgex_data); diff --git a/src/core/kernels/kernels.h b/src/core/kernels/kernels.h index 5f91008..577567b 100644 --- a/src/core/kernels/kernels.h +++ b/src/core/kernels/kernels.h @@ -5,25 +5,15 @@ #include #include -#define AC_MPI_UNIDIRECTIONAL_COMM (0) -#define AC_MPI_BIDIRECTIONAL_SCHEME_A (0) -#define AC_MPI_BIDIRECTIONAL_SCHEME_B (0) -#define AC_MPI_RT_PINNING_IMPROVED (1) -#define AC_MPI_RT_PINNING (1) +#define MPI_GPUDIRECT_DISABLED (0) #endif // AC_MPI_ENABLED typedef struct { int3 dims; AcReal* data; -#if (AC_MPI_ENABLED && AC_MPI_RT_PINNING) AcReal* data_pinned; bool pinned = false; // Set if data was received to pinned memory -#endif // (AC_MPI_ENABLED && AC_MPI_RT_PINNING) - -#if (AC_MPI_ENABLED && AC_MPI_UNIDIRECTIONAL_COMM) - MPI_Win win; // MPI window for RMA -#endif // (AC_MPI_ENABLED && AC_MPI_UNIDIRECTIONAL_COMM) } PackedData; typedef struct {
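Note on the retained communication scheme: after this cleanup the pack/transfer call sites pass only the halo-block offsets (b0s), and acPackCommData recovers each matching interior offset as a0 = mod(b0 - nghost, nn) + nghost. Below is a minimal standalone sketch of that mapping in plain C; the int3 layout, the NGHOST value, the grid size nn, and the mod1/mod3/halo_to_interior helpers are illustrative stand-ins rather than part of the patch.

/*
 * Standalone sketch of the b0 -> a0 halo-to-interior mapping used above.
 * Assumptions: int3 stands in for the CUDA int3, NGHOST = 3 is illustrative,
 * and mod1() mimics the non-negative scalar mod() helper in device.cc;
 * halo_to_interior() and the values in main() are hypothetical examples.
 */
#include <stdio.h>

typedef struct { int x, y, z; } int3;
#define NGHOST 3

/* always returns a value in [0, n), even for negative a */
static int mod1(int a, int n) { return ((a % n) + n) % n; }

static int3 mod3(const int3 a, const int3 n)
{
    return (int3){mod1(a.x, n.x), mod1(a.y, n.y), mod1(a.z, n.z)};
}

/* a0 = mod(b0 - nghost, nn) + nghost: the interior block that fills the halo
 * block at b0 under periodic wrapping (cf. acPackCommData). */
static int3 halo_to_interior(const int3 b0, const int3 nn)
{
    const int3 shifted = {b0.x - NGHOST, b0.y - NGHOST, b0.z - NGHOST};
    const int3 wrapped = mod3(shifted, nn);
    return (int3){wrapped.x + NGHOST, wrapped.y + NGHOST, wrapped.z + NGHOST};
}

int main(void)
{
    const int3 nn = {32, 32, 32};            /* example local grid size */
    const int3 b0 = {NGHOST + nn.x, 0, 0};   /* one of the corner b0 offsets */
    const int3 a0 = halo_to_interior(b0, nn);
    printf("b0 (%d,%d,%d) -> a0 (%d,%d,%d)\n", b0.x, b0.y, b0.z, a0.x, a0.y, a0.z);
    /* prints: b0 (35,0,0) -> a0 (3,32,32), the diagonally opposite interior corner block */
    return 0;
}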