From fa422cf4575cb0a8eace8f0409693e881fc0f709 Mon Sep 17 00:00:00 2001
From: jpekkila
Date: Wed, 10 Jun 2020 02:16:23 +0300
Subject: [PATCH] Added a better-pipelined version of the acGridIntegrate and a
 switch for toggling the transfer of corners

---
 src/core/device.cc | 211 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 211 insertions(+)

diff --git a/src/core/device.cc b/src/core/device.cc
index 35af82d..481b465 100644
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -1373,9 +1373,220 @@ acGridStoreMesh(const Stream stream, AcMesh* host_mesh)
 
 #define MPI_COMPUTE_ENABLED (1)
 #define MPI_COMM_ENABLED (1)
+#define MPI_INCL_CORNERS (0)
 
 AcResult
 acGridIntegrate(const Stream stream, const AcReal dt)
+{
+    ERRCHK(grid.initialized);
+    acGridSynchronizeStream(stream);
+
+    const Device device = grid.device;
+    const int3 nn = grid.nn;
+#if MPI_INCL_CORNERS
+    CommData corner_data = grid.corner_data; // Do not rm: required for corners
+#endif // MPI_INCL_CORNERS
+    CommData edgex_data = grid.edgex_data;
+    CommData edgey_data = grid.edgey_data;
+    CommData edgez_data = grid.edgez_data;
+    CommData sidexy_data = grid.sidexy_data;
+    CommData sidexz_data = grid.sidexz_data;
+    CommData sideyz_data = grid.sideyz_data;
+
+// Corners
+#if MPI_INCL_CORNERS
+    // Do not rm: required for corners
+    const int3 corner_b0s[] = {
+        (int3){0, 0, 0},
+        (int3){NGHOST + nn.x, 0, 0},
+        (int3){0, NGHOST + nn.y, 0},
+        (int3){0, 0, NGHOST + nn.z},
+
+        (int3){NGHOST + nn.x, NGHOST + nn.y, 0},
+        (int3){NGHOST + nn.x, 0, NGHOST + nn.z},
+        (int3){0, NGHOST + nn.y, NGHOST + nn.z},
+        (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST + nn.z},
+    };
+#endif // MPI_INCL_CORNERS
+
+    // Edges X
+    const int3 edgex_b0s[] = {
+        (int3){NGHOST, 0, 0},
+        (int3){NGHOST, NGHOST + nn.y, 0},
+
+        (int3){NGHOST, 0, NGHOST + nn.z},
+        (int3){NGHOST, NGHOST + nn.y, NGHOST + nn.z},
+    };
+
+    // Edges Y
+    const int3 edgey_b0s[] = {
+        (int3){0, NGHOST, 0},
+        (int3){NGHOST + nn.x, NGHOST, 0},
+
+        (int3){0, NGHOST, NGHOST + nn.z},
+        (int3){NGHOST + nn.x, NGHOST, NGHOST + nn.z},
+    };
+
+    // Edges Z
+    const int3 edgez_b0s[] = {
+        (int3){0, 0, NGHOST},
+        (int3){NGHOST + nn.x, 0, NGHOST},
+
+        (int3){0, NGHOST + nn.y, NGHOST},
+        (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST},
+    };
+
+    // Sides XY
+    const int3 sidexy_b0s[] = {
+        (int3){NGHOST, NGHOST, 0},             //
+        (int3){NGHOST, NGHOST, NGHOST + nn.z}, //
+    };
+
+    // Sides XZ
+    const int3 sidexz_b0s[] = {
+        (int3){NGHOST, 0, NGHOST},             //
+        (int3){NGHOST, NGHOST + nn.y, NGHOST}, //
+    };
+
+    // Sides YZ
+    const int3 sideyz_b0s[] = {
+        (int3){0, NGHOST, NGHOST},             //
+        (int3){NGHOST + nn.x, NGHOST, NGHOST}, //
+    };
+
+    for (int isubstep = 0; isubstep < 3; ++isubstep) {
+        acDeviceSynchronizeStream(device, STREAM_ALL);
+        MPI_Barrier(MPI_COMM_WORLD);
+
+#if MPI_COMPUTE_ENABLED
+        acPackCommData(device, sidexy_b0s, &sidexy_data);
+        acPackCommData(device, sidexz_b0s, &sidexz_data);
+        acPackCommData(device, sideyz_b0s, &sideyz_data);
+#endif // MPI_COMPUTE_ENABLED
+
+#if MPI_COMM_ENABLED
+        acTransferCommData(device, sidexy_b0s, &sidexy_data);
+        acTransferCommData(device, sidexz_b0s, &sidexz_data);
+        acTransferCommData(device, sideyz_b0s, &sideyz_data);
+#endif // MPI_COMM_ENABLED
+
+#if MPI_COMPUTE_ENABLED
+        //////////// INNER INTEGRATION //////////////
+        {
+            const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
+            const int3 m2 = nn;
+            acDeviceIntegrateSubstep(device, STREAM_16, isubstep, m1, m2, dt);
+        }
+
+        acPackCommData(device, edgex_b0s, &edgex_data);
+        acPackCommData(device, edgey_b0s, &edgey_data);
+        acPackCommData(device, edgez_b0s, &edgez_data);
+#endif // MPI_COMPUTE_ENABLED
+
+#if MPI_COMM_ENABLED
+        acTransferCommDataWait(sidexy_data);
+        acUnpinCommData(device, &sidexy_data);
+        acTransferCommDataWait(sidexz_data);
+        acUnpinCommData(device, &sidexz_data);
+        acTransferCommDataWait(sideyz_data);
+        acUnpinCommData(device, &sideyz_data);
+
+        acTransferCommData(device, edgex_b0s, &edgex_data);
+        acTransferCommData(device, edgey_b0s, &edgey_data);
+        acTransferCommData(device, edgez_b0s, &edgez_data);
+#endif // MPI_COMM_ENABLED
+
+#if MPI_COMPUTE_ENABLED
+#if MPI_INCL_CORNERS
+        acPackCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners
+#endif // MPI_INCL_CORNERS
+        acUnpackCommData(device, sidexy_b0s, &sidexy_data);
+        acUnpackCommData(device, sidexz_b0s, &sidexz_data);
+        acUnpackCommData(device, sideyz_b0s, &sideyz_data);
+#endif // MPI_COMPUTE_ENABLED
+
+#if MPI_COMM_ENABLED
+        acTransferCommDataWait(edgex_data);
+        acUnpinCommData(device, &edgex_data);
+        acTransferCommDataWait(edgey_data);
+        acUnpinCommData(device, &edgey_data);
+        acTransferCommDataWait(edgez_data);
+        acUnpinCommData(device, &edgez_data);
+
+#if MPI_INCL_CORNERS
+        acTransferCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners
+#endif // MPI_INCL_CORNERS
+#endif // MPI_COMM_ENABLED
+
+#if MPI_COMPUTE_ENABLED
+        acUnpackCommData(device, edgex_b0s, &edgex_data);
+        acUnpackCommData(device, edgey_b0s, &edgey_data);
+        acUnpackCommData(device, edgez_b0s, &edgez_data);
+#endif // MPI_COMPUTE_ENABLED
+
+#if MPI_COMM_ENABLED
+#if MPI_INCL_CORNERS
+        acTransferCommDataWait(corner_data);   // Do not rm: required for corners
+        acUnpinCommData(device, &corner_data); // Do not rm: required for corners
+#endif // MPI_INCL_CORNERS
+#endif // MPI_COMM_ENABLED
+#if MPI_COMPUTE_ENABLED
+#if MPI_INCL_CORNERS
+        acUnpackCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners
+#endif // MPI_INCL_CORNERS
+#endif // MPI_COMPUTE_ENABLED
+
+        // Wait for unpacking
+        acSyncCommData(sidexy_data);
+        acSyncCommData(sidexz_data);
+        acSyncCommData(sideyz_data);
+        acSyncCommData(edgex_data);
+        acSyncCommData(edgey_data);
+        acSyncCommData(edgez_data);
+#if MPI_INCL_CORNERS
+        acSyncCommData(corner_data); // Do not rm: required for corners
+#endif // MPI_INCL_CORNERS
+
+#if MPI_COMPUTE_ENABLED
+        { // Front
+            const int3 m1 = (int3){NGHOST, NGHOST, NGHOST};
+            const int3 m2 = m1 + (int3){nn.x, nn.y, NGHOST};
+            acDeviceIntegrateSubstep(device, STREAM_0, isubstep, m1, m2, dt);
+        }
+        { // Back
+            const int3 m1 = (int3){NGHOST, NGHOST, nn.z};
+            const int3 m2 = m1 + (int3){nn.x, nn.y, NGHOST};
+            acDeviceIntegrateSubstep(device, STREAM_1, isubstep, m1, m2, dt);
+        }
+        { // Bottom
+            const int3 m1 = (int3){NGHOST, NGHOST, 2 * NGHOST};
+            const int3 m2 = m1 + (int3){nn.x, NGHOST, nn.z - 2 * NGHOST};
+            acDeviceIntegrateSubstep(device, STREAM_2, isubstep, m1, m2, dt);
+        }
+        { // Top
+            const int3 m1 = (int3){NGHOST, nn.y, 2 * NGHOST};
+            const int3 m2 = m1 + (int3){nn.x, NGHOST, nn.z - 2 * NGHOST};
+            acDeviceIntegrateSubstep(device, STREAM_3, isubstep, m1, m2, dt);
+        }
+        { // Left
+            const int3 m1 = (int3){NGHOST, 2 * NGHOST, 2 * NGHOST};
+            const int3 m2 = m1 + (int3){NGHOST, nn.y - 2 * NGHOST, nn.z - 2 * NGHOST};
+            acDeviceIntegrateSubstep(device, STREAM_4, isubstep, m1, m2, dt);
+        }
+        { // Right
+            const int3 m1 = (int3){nn.x, 2 * NGHOST, 2 * NGHOST};
+            const int3 m2 = m1 + (int3){NGHOST, nn.y - 2 * NGHOST, nn.z - 2 * NGHOST};
+            acDeviceIntegrateSubstep(device, STREAM_5, isubstep, m1, m2, dt);
+        }
+#endif // MPI_COMPUTE_ENABLED
+        acDeviceSwapBuffers(device);
+    }
+
+    return AC_SUCCESS;
+}
+
+AcResult
+acGridIntegrateORIGINAL(const Stream stream, const AcReal dt)
 {
     ERRCHK(grid.initialized);
     // acGridSynchronizeStream(stream);