From 0d62f56e2771834e8bef9dd0e4d01d1dc39ef75e Mon Sep 17 00:00:00 2001
From: jpekkila
Date: Thu, 28 May 2020 15:31:43 +0300
Subject: [PATCH] Tried an alternative approach to comm (was worse than the
 current solution) and rewrote the current best solution (now easier to read)

---
 src/core/device.cc         | 333 ++++++++++++++++++++++++++++++++++++-
 src/core/kernels/kernels.h |   3 +
 2 files changed, 328 insertions(+), 8 deletions(-)

diff --git a/src/core/device.cc b/src/core/device.cc
index e017611..322fc6d 100644
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -1166,7 +1166,7 @@ acTransferCommDataWait(const CommData data)
 // NOP
 }

-#elif AC_MPI_BIDIRECTIONAL_SCHEME
+#elif AC_MPI_BIDIRECTIONAL_SCHEME_A

 static int3
 mod(const int3 a, const int3 n)
@@ -1206,8 +1206,7 @@ acTransferCommData(const Device device, //
 const int3 dims = data->dims;
 const size_t num_blocks = data->count;

- cudaDeviceSynchronize(); // TODO debug REMOVE
- for (size_t b0_idx = 0; b0_idx < num_blocks; ++b0_idx) {
+ for (size_t b0_idx = 0; b0_idx < num_blocks / 2; ++b0_idx) {
 const int3 b0 = b0s[b0_idx];
 const int3 nghost = (int3){NGHOST, NGHOST, NGHOST};
 const int3 a0 = mod(((b0 - nghost) + nn), nn) + nghost;
@@ -1232,6 +1231,15 @@ acTransferCommData(const Device device, //
 neighbor.y < 0 ? a0.y - nghost.y : neighbor.y > 0 ? a0.y + nghost.y : a0.y,
 neighbor.z < 0 ? a0.z - nghost.z : neighbor.z > 0 ? a0.z + nghost.z : a0.z,
 };
+ const int3 a1 = mod(((b1 + nn - nghost) + nn), nn) + nghost;
+ size_t a1_idx = -1;
+ for (size_t i = 0; i < num_blocks; ++i) {
+ if (a0s[i].x == a1.x && a0s[i].y == a1.y && a0s[i].z == a1.z) {
+ a1_idx = i;
+ break;
+ }
+ }
+ ERRCHK_ALWAYS(a1_idx < num_blocks); // TODO debug REMOVE

 size_t b1_idx = -1;
 for (size_t i = 0; i < num_blocks; ++i) {
@@ -1242,17 +1250,66 @@ acTransferCommData(const Device device, //
 }
 ERRCHK_ALWAYS(b1_idx < num_blocks); // TODO debug REMOVE

+ const int3 pid3d = getPid3D(pid, decomp);
+ const int3 npid3d_front = pid3d + neighbor;
+ const int3 npid3d_back = pid3d - neighbor;
+ const int npid_front = getPid(npid3d_front, decomp);
+ const int npid_back = getPid(npid3d_back, decomp);
+ const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES;
+
+ PackedData* a0_data = &data->srcs[a0_idx];
+ PackedData* a1_data = &data->srcs[a1_idx];
+ PackedData* b0_data = &data->dsts[b0_idx];
+ PackedData* b1_data = &data->dsts[b1_idx];
+
+ // Deadlock!
+ /* + MPI_Sendrecv(a0_data, count, datatype, npid_front, b0_idx, // + b1_data, count, datatype, npid_front, b1_idx, MPI_COMM_WORLD, + MPI_STATUS_IGNORE); + MPI_Sendrecv(a1_data, count, datatype, npid_back, b1_idx, // + b0_data, count, datatype, npid_back, b0_idx, MPI_COMM_WORLD, + MPI_STATUS_IGNORE); + */ + cudaDeviceSynchronize(); + MPI_Sendrecv(a0_data, count, datatype, npid_front, b0_idx, // + b0_data, count, datatype, npid_back, b0_idx, MPI_COMM_WORLD, + MPI_STATUS_IGNORE); + MPI_Sendrecv(a1_data, count, datatype, npid_back, b1_idx, // + b1_data, count, datatype, npid_front, b1_idx, MPI_COMM_WORLD, + MPI_STATUS_IGNORE); + + /* const int3 pid3d = getPid3D(pid, decomp); - const int npid = getPid(pid3d + neighbor, decomp); + const int3 npid3d = pid3d + neighbor; + const int npid = getPid(npid3d, decomp); const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES; PackedData* src = &data->srcs[a0_idx]; PackedData* dst = &data->dsts[b1_idx]; - MPI_Irecv(dst->data, count, datatype, npid, b1_idx, MPI_COMM_WORLD, - &data->recv_reqs[b1_idx]); - MPI_Isend(src->data, count, datatype, npid, b0_idx, MPI_COMM_WORLD, - &data->send_reqs[b0_idx]); + + if (onTheSameNode(pid, npid)) { + MPI_Irecv(dst->data, count, datatype, npid, b1_idx, MPI_COMM_WORLD, + &data->recv_reqs[b1_idx]); + dst->pinned = false; + cudaStreamSynchronize(data->streams[a0_idx]); + MPI_Isend(src->data, count, datatype, npid, b0_idx, MPI_COMM_WORLD, + &data->send_reqs[b0_idx]); + } + else { + MPI_Irecv(dst->data_pinned, count, datatype, npid, b1_idx, MPI_COMM_WORLD, + &data->recv_reqs[b1_idx]); + dst->pinned = true; + + if (!src->pinned) { + acPinPackedData(device, data->streams[a0_idx], src); + cudaStreamSynchronize(data->streams[a0_idx]); + } + MPI_Isend(src->data_pinned, count, datatype, npid, b0_idx, MPI_COMM_WORLD, + &data->send_reqs[b0_idx]); + } + */ /* const int3 neighbor = (int3){ @@ -1290,6 +1347,252 @@ acTransferCommDataWait(const CommData data) } } +#elif AC_MPI_BIDIRECTIONAL_SCHEME_B +static AcResult +acTransferCommData(const Device device, // + const int3* a0s, // Src idx inside comp. 
domain + const int3* b0s, // Dst idx inside bound zone + CommData* data) +{ + cudaSetDevice(device->id); + + MPI_Datatype datatype = MPI_FLOAT; + if (sizeof(AcReal) == 8) + datatype = MPI_DOUBLE; + + int nprocs, pid; + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + MPI_Comm_rank(MPI_COMM_WORLD, &pid); + const uint3_64 decomp = decompose(nprocs); + + const int3 nn = (int3){ + device->local_config.int_params[AC_nx], + device->local_config.int_params[AC_ny], + device->local_config.int_params[AC_nz], + }; + + const int3 dims = data->dims; + const size_t blockcount = data->count; + + for (int k = -1; k <= 1; ++k) { + for (int j = -1; j <= 1; ++j) { + for (int i = -1; i <= 1; ++i) { + if (i == 0 && j == 0 && k == 0) + continue; + + for (size_t a_idx = 0; a_idx < blockcount; ++a_idx) { + for (size_t b_idx = 0; b_idx < blockcount; ++b_idx) { + const int3 neighbor = (int3){i, j, k}; + + const int3 a0 = a0s[a_idx]; + // const int3 a1 = a0 + dims; + + const int3 b0 = a0 - neighbor * nn; + // const int3 b1 = a1 - neighbor * nn; + + if (b0s[b_idx].x == b0.x && b0s[b_idx].y == b0.y && b0s[b_idx].z == b0.z) { + + const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES; + +#if MPI_GPUDIRECT_DISABLED + PackedData* src = &data->srcs_host[a_idx]; + PackedData* dst = &data->dsts_host[b_idx]; +#else + PackedData* src = &data->srcs[a_idx]; + PackedData* dst = &data->dsts[b_idx]; +#endif + + const int3 pid3d = getPid3D(pid, decomp); + const int npid_front = getPid(pid3d + neighbor, decomp); + const int npid_back = getPid(pid3d - neighbor, decomp); + + dst->pinned = false; + + if (onTheSameNode(pid, npid_back)) { + MPI_Irecv(dst->data, count, datatype, npid_back, b_idx, + MPI_COMM_WORLD, &data->recv_reqs[b_idx]); + dst->pinned = false; + cudaStreamSynchronize(data->streams[a_idx]); + MPI_Isend(src->data, count, datatype, npid_front, b_idx, + MPI_COMM_WORLD, &data->send_reqs[b_idx]); + } + else { + MPI_Irecv(dst->data_pinned, count, datatype, npid_back, b_idx, + MPI_COMM_WORLD, &data->recv_reqs[b_idx]); + dst->pinned = true; + cudaStreamSynchronize(data->streams[a_idx]); + if (!src->pinned) { + acPinPackedData(device, data->streams[a_idx], src); + cudaStreamSynchronize(data->streams[a_idx]); + } + MPI_Isend(src->data_pinned, count, datatype, npid_front, b_idx, + MPI_COMM_WORLD, &data->send_reqs[b_idx]); + } + /* + cudaStreamSynchronize(data->streams[a_idx]); + MPI_Status status; + MPI_Sendrecv(src->data, count, datatype, npid_front, b_idx, // + dst->data, count, datatype, npid_back, b_idx, // + MPI_COMM_WORLD, &status); + */ + + /* + const int npid_back = getPid(pid3d - neighbor, decomp); + + if (onTheSameNode(pid, npid_back)) { + MPI_Irecv(dst->data, count, datatype, npid_back, b_idx, + MPI_COMM_WORLD, &data->recv_reqs[b_idx]); + dst->pinned = false; + } + else { + MPI_Irecv(dst->data_pinned, count, datatype, npid_back, b_idx, + MPI_COMM_WORLD, &data->recv_reqs[b_idx]); + dst->pinned = true; + } + + const int npid_front = getPid(pid3d + neighbor, decomp); + + cudaStreamSynchronize(data->streams[a_idx]); + if (onTheSameNode(pid, npid_front)) { + MPI_Isend(src->data, count, datatype, npid_front, b_idx, + MPI_COMM_WORLD, &data->send_reqs[b_idx]); + } + else { + if (!src->pinned) { + acPinPackedData(device, data->streams[a_idx], src); + cudaStreamSynchronize(data->streams[a_idx]); + } + MPI_Isend(src->data_pinned, count, datatype, npid_front, b_idx, + MPI_COMM_WORLD, &data->send_reqs[b_idx]); + } + */ + } + } + } + } + } + } + + return AC_SUCCESS; +} + +static void +acTransferCommDataWait(const CommData data) 
+{ + + for (size_t i = 0; i < data.count; ++i) { + MPI_Wait(&data.send_reqs[i], MPI_STATUS_IGNORE); + MPI_Wait(&data.recv_reqs[i], MPI_STATUS_IGNORE); + } +} + +#elif AC_MPI_RT_PINNING_IMPROVED +static int3 +mod(const int3 a, const int3 n) +{ + return (int3){mod(a.x, n.x), mod(a.y, n.y), mod(a.z, n.z)}; +} + +static AcResult +acTransferCommData(const Device device, // + const int3* a0s, // Src idx inside comp. domain + const int3* b0s, // Dst idx inside bound zone + CommData* data) +{ + cudaSetDevice(device->id); + + MPI_Datatype datatype = MPI_FLOAT; + if (sizeof(AcReal) == 8) + datatype = MPI_DOUBLE; + + int nprocs, pid; + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + MPI_Comm_rank(MPI_COMM_WORLD, &pid); + const uint3_64 decomp = decompose(nprocs); + + const int3 nn = (int3){ + device->local_config.int_params[AC_nx], + device->local_config.int_params[AC_ny], + device->local_config.int_params[AC_nz], + }; + + const int3 pid3d = getPid3D(pid, decomp); + const int3 dims = data->dims; + const size_t blockcount = data->count; + const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES; + const int3 nghost = (int3){NGHOST, NGHOST, NGHOST}; + + for (size_t b0_idx = 0; b0_idx < blockcount; ++b0_idx) { + + const int3 b0 = b0s[b0_idx]; + const int3 neighbor = (int3){ + b0.x < NGHOST ? -1 : b0.x >= NGHOST + nn.x ? 1 : 0, + b0.y < NGHOST ? -1 : b0.y >= NGHOST + nn.y ? 1 : 0, + b0.z < NGHOST ? -1 : b0.z >= NGHOST + nn.z ? 1 : 0, + }; + const int npid = getPid(pid3d + neighbor, decomp); + + PackedData* dst = &data->dsts[b0_idx]; + if (onTheSameNode(pid, npid)) { + MPI_Irecv(dst->data, count, datatype, npid, b0_idx, // + MPI_COMM_WORLD, &data->recv_reqs[b0_idx]); + dst->pinned = false; + } + else { + MPI_Irecv(dst->data_pinned, count, datatype, npid, b0_idx, // + MPI_COMM_WORLD, &data->recv_reqs[b0_idx]); + dst->pinned = true; + } + } + + for (size_t b0_idx = 0; b0_idx < blockcount; ++b0_idx) { + const int3 b0 = b0s[b0_idx]; + const int3 neighbor = (int3){ + b0.x < NGHOST ? -1 : b0.x >= NGHOST + nn.x ? 1 : 0, + b0.y < NGHOST ? -1 : b0.y >= NGHOST + nn.y ? 1 : 0, + b0.z < NGHOST ? -1 : b0.z >= NGHOST + nn.z ? 
1 : 0,
+ };
+ const int npid = getPid(pid3d - neighbor, decomp);
+
+ const int3 a0 = mod(b0 - nghost, nn) + nghost;
+
+ // Not needed if there's a 1-to-1 mapping from b -> a
+ size_t a0_idx = -1;
+ for (size_t i = 0; i < blockcount; ++i) {
+ if (a0s[i].x == a0.x && a0s[i].y == a0.y && a0s[i].z == a0.z) {
+ a0_idx = i;
+ break;
+ }
+ }
+ ERRCHK(a0_idx < blockcount);
+
+ PackedData* src = &data->srcs[a0_idx];
+ if (onTheSameNode(pid, npid)) {
+ cudaStreamSynchronize(data->streams[a0_idx]);
+ MPI_Isend(src->data, count, datatype, npid, b0_idx, //
+ MPI_COMM_WORLD, &data->send_reqs[b0_idx]);
+ }
+ else {
+ acPinPackedData(device, data->streams[a0_idx], src);
+ cudaStreamSynchronize(data->streams[a0_idx]);
+ MPI_Isend(src->data_pinned, count, datatype, npid, b0_idx, MPI_COMM_WORLD,
+ &data->send_reqs[b0_idx]);
+ src->pinned = true;
+ }
+ }
+
+ return AC_SUCCESS;
+}
+
+static void
+acTransferCommDataWait(const CommData data)
+{
+ for (size_t i = 0; i < data.count; ++i) {
+ MPI_Wait(&data.send_reqs[i], MPI_STATUS_IGNORE);
+ MPI_Wait(&data.recv_reqs[i], MPI_STATUS_IGNORE);
+ }
+}
+
 #elif AC_MPI_RT_PINNING
 static AcResult
 acTransferCommData(const Device device, //
@@ -1401,6 +1704,7 @@ acTransferCommData(const Device device, //
 if (!src->pinned) {
 acPinPackedData(device, data->streams[a_idx], src);
 cudaStreamSynchronize(data->streams[a_idx]);
+ src->pinned = true;
 }
 MPI_Isend(src->data_pinned, count, datatype, npid, b_idx, MPI_COMM_WORLD,
 &data->send_reqs[b_idx]);
@@ -1790,6 +2094,7 @@ acGridIntegrate(const Stream stream, const AcReal dt)
 (int3){NGHOST, nn.y, nn.z}, //
 (int3){nn.x, nn.y, nn.z},
 };
+ /*
 const int3 corner_b0s[] = {
 (int3){0, 0, 0},
 (int3){NGHOST + nn.x, 0, 0},
 (int3){0, NGHOST + nn.y, 0},
 (int3){0, 0, NGHOST + nn.z},

 (int3){NGHOST + nn.x, NGHOST + nn.y, 0},
 (int3){NGHOST + nn.x, 0, NGHOST + nn.z},
 (int3){0, NGHOST + nn.y, NGHOST + nn.z},
 (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST + nn.z},
 };
+ */
+ const int3 corner_b0s[] = {
+ (int3){0, 0, 0},
+ (int3){NGHOST + nn.x, 0, 0},
+ (int3){0, NGHOST + nn.y, 0},
+ (int3){0, 0, NGHOST + nn.z},
+
+ (int3){NGHOST + nn.x, NGHOST + nn.y, 0},
+ (int3){NGHOST + nn.x, 0, NGHOST + nn.z},
+ (int3){0, NGHOST + nn.y, NGHOST + nn.z},
+ (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST + nn.z},
+ };

 // Edges X
 const int3 edgex_a0s[] = {

diff --git a/src/core/kernels/kernels.h b/src/core/kernels/kernels.h
index 805cbed..5f91008 100644
--- a/src/core/kernels/kernels.h
+++ b/src/core/kernels/kernels.h
@@ -6,6 +6,9 @@
 #include
 #define AC_MPI_UNIDIRECTIONAL_COMM (0)
+#define AC_MPI_BIDIRECTIONAL_SCHEME_A (0)
+#define AC_MPI_BIDIRECTIONAL_SCHEME_B (0)
+#define AC_MPI_RT_PINNING_IMPROVED (1)
 #define AC_MPI_RT_PINNING (1)
 #endif // AC_MPI_ENABLED
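
Reviewer note (not part of the patch): the "// Deadlock!" marker in the AC_MPI_BIDIRECTIONAL_SCHEME_A hunk refers to the commented-out MPI_Sendrecv pairing in which each call both sends to and receives from the same neighbor, with the receive tagged so that only the neighbor's second Sendrecv could satisfy it. The sketch below reduces the situation to a 1D periodic ring of ranks to show why that pairing hangs and why the pairing kept active in the patch (send toward one neighbor, receive from the opposite neighbor, same tag) cannot. All names in it (front, back, send_front, ...) are illustrative and do not come from Astaroth; only the call structure mirrors the patch.

/* deadlock_pairing_sketch.c -- standalone illustration, not part of Astaroth.
 * Build with mpicc; run with any number of ranks. */
#include <mpi.h>
#include <stdio.h>

int
main(int argc, char** argv)
{
    MPI_Init(&argc, &argv);

    int pid, nprocs;
    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    const int front = (pid + 1) % nprocs;          // neighbor in the +direction
    const int back  = (pid - 1 + nprocs) % nprocs; // neighbor in the -direction

    double send_front = pid, send_back = -pid;
    double recv_front = 0, recv_back = 0;

    /* Deadlocking pairing (the commented-out variant in the patch): the first
     * call sends to and receives from the *front* neighbor, but the receive's
     * tag (1) is only produced by that neighbor's *second* Sendrecv. Every
     * rank therefore blocks in its first call waiting for a message nobody
     * has sent yet -- a cyclic wait.
     *
     * MPI_Sendrecv(&send_front, 1, MPI_DOUBLE, front, 0,
     *              &recv_front, 1, MPI_DOUBLE, front, 1,
     *              MPI_COMM_WORLD, MPI_STATUS_IGNORE);
     * MPI_Sendrecv(&send_back, 1, MPI_DOUBLE, back, 1,
     *              &recv_back, 1, MPI_DOUBLE, back, 0,
     *              MPI_COMM_WORLD, MPI_STATUS_IGNORE);
     */

    /* Safe pairing (the active variant in the patch): each call sends toward
     * one neighbor while receiving from the opposite one with the same tag,
     * so call N on this rank is matched by call N on its neighbors. */
    MPI_Sendrecv(&send_front, 1, MPI_DOUBLE, front, 0, //
                 &recv_back, 1, MPI_DOUBLE, back, 0,   //
                 MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    MPI_Sendrecv(&send_back, 1, MPI_DOUBLE, back, 1,   //
                 &recv_front, 1, MPI_DOUBLE, front, 1, //
                 MPI_COMM_WORLD, MPI_STATUS_IGNORE);

    printf("rank %d: got %g from back, %g from front\n", pid, recv_back, recv_front);

    MPI_Finalize();
    return 0;
}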
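
A second note, also not part of the patch, on the kernels.h flags: device.cc selects exactly one communication scheme through an #if/#elif chain, and the branches appear in the order SCHEME_A, SCHEME_B, RT_PINNING_IMPROVED, RT_PINNING (visible in the hunks above). Defining both AC_MPI_RT_PINNING_IMPROVED and AC_MPI_RT_PINNING as (1) is therefore not a conflict: the first nonzero branch wins and the _IMPROVED implementation is the one that gets compiled. A minimal sketch of that selection follows; the assumption that the chain opens with AC_MPI_UNIDIRECTIONAL_COMM is inferred from kernels.h and is not visible in this diff.

/* flag_selection_sketch.c -- standalone illustration, not part of Astaroth. */
#include <stdio.h>

#define AC_MPI_UNIDIRECTIONAL_COMM (0)
#define AC_MPI_BIDIRECTIONAL_SCHEME_A (0)
#define AC_MPI_BIDIRECTIONAL_SCHEME_B (0)
#define AC_MPI_RT_PINNING_IMPROVED (1)
#define AC_MPI_RT_PINNING (1)

int
main(void)
{
#if AC_MPI_UNIDIRECTIONAL_COMM
    puts("unidirectional comm");
#elif AC_MPI_BIDIRECTIONAL_SCHEME_A
    puts("bidirectional scheme A");
#elif AC_MPI_BIDIRECTIONAL_SCHEME_B
    puts("bidirectional scheme B");
#elif AC_MPI_RT_PINNING_IMPROVED
    puts("RT pinning, improved"); /* this branch is compiled */
#elif AC_MPI_RT_PINNING
    puts("RT pinning");           /* shadowed while _IMPROVED is (1) */
#endif
    return 0;
}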