From 1d81333ff7cd723547fa602d87dcf6194858df25 Mon Sep 17 00:00:00 2001
From: Johannes Pekkila
Date: Wed, 23 Oct 2019 12:07:23 +0200
Subject: [PATCH] More concurrent kernels and MPI comm

---
 src/core/device.cu | 102 ++++++++++++++++++++++++++-------------------
 1 file changed, 58 insertions(+), 44 deletions(-)

diff --git a/src/core/device.cu b/src/core/device.cu
index 5f83025..f473000 100644
--- a/src/core/device.cu
+++ b/src/core/device.cu
@@ -1995,14 +1995,18 @@ acDeviceBoundcondStepMPI_single_step_best(const Device device, AcMesh* submesh)
 // BEST USE THIS! (full integration step)
 static AcResult
-acDeviceBoundcondStepMPI(const Device device, AcMesh* submesh)
+acDeviceIntegrateStepMPI(const Device device, const AcReal dt, AcMesh* submesh)
 {
     const int mx = device->local_config.int_params[AC_mx];
     const int my = device->local_config.int_params[AC_my];
     const int mz = device->local_config.int_params[AC_mz];
+    const int nx = device->local_config.int_params[AC_nx];
+    const int ny = device->local_config.int_params[AC_ny];
+    const int nz = device->local_config.int_params[AC_nz];
     const size_t count = mx * my * NGHOST;

-    for (int isubstep = 0; isubtep < 3; ++isubstep) {
+    for (int isubstep = 0; isubstep < 3; ++isubstep) {
+        acDeviceSynchronizeStream(device, STREAM_ALL);
         // Local boundconds
         for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
             // Front plate local
             {
                 const int3 start = (int3){0, 0, NGHOST};
                 const int3 end = (int3){mx, my, 2 * NGHOST};
                 acDevicePeriodicBoundcondStep(device, (Stream)i, (VertexBufferHandle)i, start,
                                               end);
             }
             // Back plate local
             {
                 const int3 start = (int3){0, 0, mz - 2 * NGHOST};
                 const int3 end = (int3){mx, my, mz - NGHOST};
                 acDevicePeriodicBoundcondStep(device, (Stream)i, (VertexBufferHandle)i, start,
                                               end);
             }
         }
@@ -2018,21 +2022,15 @@ acDeviceBoundcondStepMPI(const Device device, AcMesh* submesh)
                 acDevicePeriodicBoundcondStep(device, (Stream)i, (VertexBufferHandle)i, start,
                                               end);
             }
         }
+#define INNER_BOUNDCOND_STREAM ((Stream)(NUM_STREAMS - 1))
         // Inner boundconds (while waiting)
         for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
             const int3 start = (int3){0, 0, 2 * NGHOST};
             const int3 end = (int3){mx, my, mz - 2 * NGHOST};
-            acDevicePeriodicBoundcondStep(device, (Stream)(NUM_STREAMS - 1), (VertexBufferHandle)i,
+            acDevicePeriodicBoundcondStep(device, INNER_BOUNDCOND_STREAM, (VertexBufferHandle)i,
                                           start, end);
         }
-        // Inner integration
-        {
-            ERRCHK(NUM_STREAMS - 2 >= 0);
-            const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
-            const int3 m2 = node->subgrid.n;
-            acDeviceIntegrateSubstep(devices, (Stream)(NUM_STREAMS - 2), isubstep, m1, m2, dt);
-        }

         // MPI
         MPI_Request recv_requests[2 * NUM_VTXBUF_HANDLES];
@@ -2088,51 +2086,60 @@ acDeviceBoundcondStepMPI(const Device device, AcMesh* submesh)
                                   i + NUM_VTXBUF_HANDLES, MPI_COMM_WORLD, &request);
             }
         }
+        // Inner integration
+        {
+            ERRCHK(NUM_STREAMS - 2 >= 0);
+            const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
+            const int3 m2 = (int3){mx, my, mz} - m1;
+            acDeviceIntegrateSubstep(device, (Stream)(NUM_STREAMS - 2), isubstep, m1, m2, dt);
+        }

         for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
             MPI_Status status;
             MPI_Wait(&recv_requests[i], &status);
             MPI_Wait(&recv_requests[i + NUM_VTXBUF_HANDLES], &status);
         }
+
+        acDeviceSynchronizeStream(device, INNER_BOUNDCOND_STREAM);
         // #pragma omp parallel for
-        for (int i = 0; i < node->num_devices; ++i) { // Front
+        { // Front
             const int3 m1 = (int3){NGHOST, NGHOST, NGHOST};
-            const int3 m2 = m1 + (int3){node->subgrid.n.x, node->subgrid.n.y, NGHOST};
-            acDeviceIntegrateSubstep(node->devices[i], STREAM_0, isubstep, m1, m2, dt);
+            const int3 m2 = m1 + (int3){nx, ny, NGHOST};
+            acDeviceIntegrateSubstep(device, STREAM_0, isubstep, m1, m2, dt);
         }
         // #pragma omp parallel for
-        for (int i = 0; i < node->num_devices; ++i) { // Back
-            const int3 m1 = (int3){NGHOST, NGHOST, node->subgrid.n.z};
-            const int3 m2 = m1 + (int3){node->subgrid.n.x, node->subgrid.n.y, NGHOST};
-            acDeviceIntegrateSubstep(node->devices[i], STREAM_1, isubstep, m1, m2, dt);
+        { // Back
+            const int3 m1 = (int3){NGHOST, NGHOST, nz};
+            const int3 m2 = m1 + (int3){nx, ny, NGHOST};
+            acDeviceIntegrateSubstep(device, STREAM_1, isubstep, m1, m2, dt);
         }
         // #pragma omp parallel for
-        for (int i = 0; i < node->num_devices; ++i) { // Bottom
+        { // Bottom
             const int3 m1 = (int3){NGHOST, NGHOST, 2 * NGHOST};
-            const int3 m2 = m1 + (int3){node->subgrid.n.x, NGHOST, node->subgrid.n.z - 2 * NGHOST};
-            acDeviceIntegrateSubstep(node->devices[i], STREAM_2, isubstep, m1, m2, dt);
+            const int3 m2 = m1 + (int3){nx, NGHOST, nz - 2 * NGHOST};
+            acDeviceIntegrateSubstep(device, STREAM_2, isubstep, m1, m2, dt);
         }
         // #pragma omp parallel for
-        for (int i = 0; i < node->num_devices; ++i) { // Top
-            const int3 m1 = (int3){NGHOST, node->subgrid.n.y, 2 * NGHOST};
-            const int3 m2 = m1 + (int3){node->subgrid.n.x, NGHOST, node->subgrid.n.z - 2 * NGHOST};
-            acDeviceIntegrateSubstep(node->devices[i], STREAM_3, isubstep, m1, m2, dt);
+        { // Top
+            const int3 m1 = (int3){NGHOST, ny, 2 * NGHOST};
+            const int3 m2 = m1 + (int3){nx, NGHOST, nz - 2 * NGHOST};
+            acDeviceIntegrateSubstep(device, STREAM_3, isubstep, m1, m2, dt);
         }
         // #pragma omp parallel for
-        for (int i = 0; i < node->num_devices; ++i) { // Left
+        { // Left
             const int3 m1 = (int3){NGHOST, 2 * NGHOST, 2 * NGHOST};
-            const int3 m2 = m1 + (int3){NGHOST, node->subgrid.n.y - 2 * NGHOST,
-                                        node->subgrid.n.z - 2 * NGHOST};
-            acDeviceIntegrateSubstep(node->devices[i], STREAM_4, isubstep, m1, m2, dt);
+            const int3 m2 = m1 + (int3){NGHOST, ny - 2 * NGHOST, nz - 2 * NGHOST};
+            acDeviceIntegrateSubstep(device, STREAM_4, isubstep, m1, m2, dt);
         }
         // #pragma omp parallel for
-        for (int i = 0; i < node->num_devices; ++i) { // Right
-            const int3 m1 = (int3){node->subgrid.n.x, 2 * NGHOST, 2 * NGHOST};
-            const int3 m2 = m1 + (int3){NGHOST, node->subgrid.n.y - 2 * NGHOST,
-                                        node->subgrid.n.z - 2 * NGHOST};
-            acDeviceIntegrateSubstep(node->devices[i], STREAM_5, isubstep, m1, m2, dt);
+        { // Right
+            const int3 m1 = (int3){nx, 2 * NGHOST, 2 * NGHOST};
+            const int3 m2 = m1 + (int3){NGHOST, ny - 2 * NGHOST, nz - 2 * NGHOST};
+            acDeviceIntegrateSubstep(device, STREAM_5, isubstep, m1, m2, dt);
         }
-        acNodeSwapBuffers(node);
+        MPI_Barrier(MPI_COMM_WORLD);
+        acDeviceSwapBuffers(device);
+        MPI_Barrier(MPI_COMM_WORLD);
     }

     return AC_SUCCESS;
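Note (illustrative sketch, not part of the patch): within each RK3 substep, acDeviceIntegrateStepMPI above orders the work as (1) launch the local z-plate boundary-condition kernels and post the non-blocking MPI halo exchange, (2) integrate the ghost-independent inner domain on its own stream while the messages are in flight, (3) wait for the receives and then integrate the six outer plates on STREAM_0..STREAM_5, and (4) swap buffers. The standalone, host-only C program below shows the same ordering for a 1-D decomposition with one ghost cell per side; every name in it (integrate_substep, NX, unew) is an assumption made for illustration and is not Astaroth API. Build with: mpicc halo_overlap.c -o halo_overlap

/*
 * halo_overlap.c: standalone sketch, NOT Astaroth code.
 * Illustrates the ordering used by acDeviceIntegrateStepMPI:
 *   1) post non-blocking halo receives/sends,
 *   2) update the interior points that need no ghost data,
 *   3) wait for the exchange, then update the ghost-dependent edge points,
 *   4) "swap buffers" by copying the result back.
 */
#include <mpi.h>
#include <stdio.h>

#define NX 8 /* interior points per rank (illustrative) */

static void
integrate_substep(double* u, int pid, int nprocs)
{
    const int left  = (pid - 1 + nprocs) % nprocs;
    const int right = (pid + 1) % nprocs;

    /* 1) Non-blocking halo exchange: fill ghosts u[0] and u[NX+1]. */
    MPI_Request reqs[4];
    MPI_Irecv(&u[0],      1, MPI_DOUBLE, left,  0, MPI_COMM_WORLD, &reqs[0]);
    MPI_Irecv(&u[NX + 1], 1, MPI_DOUBLE, right, 1, MPI_COMM_WORLD, &reqs[1]);
    MPI_Isend(&u[1],      1, MPI_DOUBLE, left,  1, MPI_COMM_WORLD, &reqs[2]);
    MPI_Isend(&u[NX],     1, MPI_DOUBLE, right, 0, MPI_COMM_WORLD, &reqs[3]);

    /* 2) Interior update while the exchange is in flight (the GPU analogue:
     *    launching the inner integration kernel on its own stream right
     *    after posting the MPI requests). */
    double unew[NX + 2];
    for (int i = 2; i <= NX - 1; ++i)
        unew[i] = 0.5 * (u[i - 1] + u[i + 1]);

    /* 3) Ghost-dependent edge points only after the halos have arrived
     *    (the GPU analogue: the per-plate STREAM_0..STREAM_5 launches). */
    MPI_Waitall(4, reqs, MPI_STATUSES_IGNORE);
    unew[1]  = 0.5 * (u[0] + u[2]);
    unew[NX] = 0.5 * (u[NX - 1] + u[NX + 1]);

    /* 4) Stand-in for acDeviceSwapBuffers. */
    for (int i = 1; i <= NX; ++i)
        u[i] = unew[i];
}

int
main(int argc, char** argv)
{
    MPI_Init(&argc, &argv);
    int pid, nprocs;
    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    double u[NX + 2]; /* one ghost cell on each side */
    for (int i = 0; i < NX + 2; ++i)
        u[i] = (double)pid;

    for (int isubstep = 0; isubstep < 3; ++isubstep) /* RK3-like loop */
        integrate_substep(u, pid, nprocs);

    printf("rank %d: u[1] = %g\n", pid, u[1]);

    MPI_Finalize();
    return 0;
}

The point of the ordering is that the wait on the receives is reached only after the interior work has been issued, so communication latency is hidden behind computation whenever the interior update is large enough.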
@@ -2313,10 +2320,12 @@ acDeviceRunMPITest(void)
     AcMeshInfo info;
     acLoadConfig(AC_DEFAULT_CONFIG, &info);

+    // Rewind here!
     const int nn = 512;
     info.int_params[AC_nx] = info.int_params[AC_ny] = info.int_params[AC_nz] = nn;
     acUpdateConfig(&info);

+    /*
     AcMesh model, candidate;

     // Master CPU
@@ -2326,7 +2335,7 @@ acDeviceRunMPITest(void)
         acMeshRandomize(&model);
         acMeshApplyPeriodicBounds(&model);
-    }
+    }*/

     assert(info.int_params[AC_nz] % num_processes == 0);

@@ -2345,8 +2354,9 @@ acDeviceRunMPITest(void)
     // Create submesh
     AcMesh submesh;
     acMeshCreate(submesh_info, &submesh);
+    acMeshRandomize(&submesh);

-    acDeviceDistributeMeshMPI(model, &submesh);
+    // acDeviceDistributeMeshMPI(model, &submesh);
     ////////////////////////////////////////////////////////////////////////////////////////////////

     Device device;
@@ -2355,16 +2365,18 @@ acDeviceRunMPITest(void)

     // Warmup
     acDeviceSynchronizeStream(device, STREAM_ALL);
-    for (int i = 0; i < 10; ++i) {
-        acDeviceBoundcondStepMPI(device, &submesh);
+    for (int i = 0; i < 2; ++i) {
+        // acDeviceBoundcondStepMPI(device, &submesh);
+        acDeviceIntegrateStepMPI(device, FLT_EPSILON, &submesh);
     }

     // Benchmark
-    const int num_iters = 100;
+    const int num_iters = 10;
     Timer total_time;
     timer_reset(&total_time);

     for (int i = 0; i < num_iters; ++i) {
-        acDeviceBoundcondStepMPI(device, &submesh);
+        // acDeviceBoundcondStepMPI(device, &submesh);
+        acDeviceIntegrateStepMPI(device, FLT_EPSILON, &submesh);
     }
     if (pid == 0) {
         const double ms_elapsed = timer_diff_nsec(total_time) / 1e6;
@@ -2377,16 +2389,18 @@ acDeviceRunMPITest(void)
     acDeviceStoreMesh(device, STREAM_DEFAULT, &submesh);
     acDeviceDestroy(device);
     ////////////////////////////////////////////////////////////////////////////////////////////////
-    acDeviceGatherMeshMPI(submesh, &candidate);
+    // acDeviceGatherMeshMPI(submesh, &candidate);

     acMeshDestroy(&submesh);
-
-    // Master CPU
-    if (pid == 0) {
+    /*
+    // Master CPU
+    if (pid == 0)
+    {
         acVerifyMesh(model, candidate);

         acMeshDestroy(&model);
         acMeshDestroy(&candidate);
     }
+    */

     MPI_Finalize();
     return AC_FAILURE;
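Note (illustrative sketch, not part of the patch): the benchmark above accumulates num_iters calls to acDeviceIntegrateStepMPI in a host-side Timer and prints the average only on rank 0. A common way to time such an MPI-parallel loop, sketched below with purely illustrative names, is to synchronize the ranks before starting the clock, time each rank locally with MPI_Wtime(), and reduce the maximum to rank 0, since the slowest rank is the figure that bounds overall throughput. Build with: mpicc bench_timing.c -o bench_timing

/*
 * bench_timing.c: standalone sketch with illustrative names, not Astaroth code.
 * Times an MPI-parallel loop and reports the slowest rank:
 * barrier, per-rank MPI_Wtime(), MPI_Reduce with MPI_MAX.
 */
#include <mpi.h>
#include <stdio.h>

int
main(int argc, char** argv)
{
    MPI_Init(&argc, &argv);
    int pid;
    MPI_Comm_rank(MPI_COMM_WORLD, &pid);

    const int num_iters = 10;

    MPI_Barrier(MPI_COMM_WORLD); /* start all ranks together */
    const double t0 = MPI_Wtime();

    for (int i = 0; i < num_iters; ++i) {
        /* Stand-in for one integration step; ranks are deliberately
         * imbalanced so that the max-reduction below matters. */
        volatile double work = 0.0;
        for (long j = 0; j < 1000000L * (1 + pid % 3); ++j)
            work += 1e-9;
    }

    const double local_ms = (MPI_Wtime() - t0) * 1e3;

    /* Report the slowest rank: that is what bounds throughput. */
    double max_ms = 0.0;
    MPI_Reduce(&local_ms, &max_ms, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);

    if (pid == 0)
        printf("Average per step: %g ms over %d iterations (slowest rank)\n",
               max_ms / num_iters, num_iters);

    MPI_Finalize();
    return 0;
}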