From 915e1c7c1464c80a3aed806cb3bd88c9f673c0af Mon Sep 17 00:00:00 2001
From: Johannes Pekkila
Date: Mon, 21 Oct 2019 15:50:53 +0200
Subject: [PATCH] Trying to overlap MPI communication with computation of
 boundary conditions. However, NVIDIA seemed to forget one important detail
 in the documentation for CUDA-aware MPI: it looks like CUDA streams are not
 supported with CUDA-aware MPI communication. So in the end the fastest
 solution might be to use old-school gpu->cpu->cpu->gpu MPI communication
 after all

---
 src/core/device.cu | 91 +++++++++++++++++++++++-----------------------
 1 file changed, 45 insertions(+), 46 deletions(-)

diff --git a/src/core/device.cu b/src/core/device.cu
index 6d40379..205cffa 100644
--- a/src/core/device.cu
+++ b/src/core/device.cu
@@ -192,8 +192,8 @@ acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device_hand
     }
 
     // Reductions
-    ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->reduce_scratchpad,
-                                  acVertexBufferCompdomainSizeBytes(device_config)));
+    ERRCHK_CUDA_ALWAYS(
+        cudaMalloc(&device->reduce_scratchpad, acVertexBufferCompdomainSizeBytes(device_config)));
     ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->reduce_result, sizeof(AcReal)));
 
 #if PACKED_DATA_TRANSFERS
@@ -860,9 +860,14 @@ acDeviceGatherMeshMPI(const AcMesh src, AcMesh* dst)
 
 /** NOTE: Assumes 1 process per GPU */
 static AcResult
-acDeviceCommunicateHalosMPI(const Device device)
+acDeviceCommunicateHalosMPI(const Device device, const int3 subgrid_m)
 {
-    //MPI_Barrier(MPI_COMM_WORLD);
+    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
+        const int3 start = (int3){0, 0, NGHOST};
+        const int3 end = (int3){subgrid_m.x, subgrid_m.y, subgrid_m.z - NGHOST};
+        acDevicePeriodicBoundcondStep(device, (Stream)i, (VertexBufferHandle)i, start, end);
+    }
+
     MPI_Datatype datatype = MPI_FLOAT;
     if (sizeof(AcReal) == 8)
         datatype = MPI_DOUBLE;
@@ -873,47 +878,46 @@ acDeviceCommunicateHalosMPI(const Device device)
     const size_t count = device->local_config.int_params[AC_mx] *
                          device->local_config.int_params[AC_my] * NGHOST;
 
-    MPI_Request recv_requests[2*NUM_VTXBUF_HANDLES];
+    MPI_Request recv_requests[2 * NUM_VTXBUF_HANDLES];
     for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
         { // Front
            // ...|ooooxxx|... -> xxx|ooooooo|...
            const size_t src_idx = acVertexBufferIdx(0, 0, device->local_config.int_params[AC_nz],
                                                     device->local_config);
            const size_t dst_idx = acVertexBufferIdx(0, 0, 0, device->local_config);
-           //const int send_pid = (pid + 1) % num_processes;
-           const int recv_pid = (pid + num_processes - 1) % num_processes;
+           // const int send_pid = (pid + 1) % num_processes;
+           const int recv_pid = (pid + num_processes - 1) % num_processes;
 
            MPI_Irecv(&device->vba.in[i][dst_idx], count, datatype, recv_pid, i, MPI_COMM_WORLD,
-                     &recv_requests[i]);
-
+                     &recv_requests[i]);
        }
        { // Back
            // ...|ooooooo|xxx <- ...|xxxoooo|...
            const size_t src_idx = acVertexBufferIdx(0, 0, NGHOST, device->local_config);
            const size_t dst_idx = acVertexBufferIdx(
                0, 0, NGHOST + device->local_config.int_params[AC_nz], device->local_config);
-           //const int send_pid = (pid + num_processes - 1) % num_processes;
+           // const int send_pid = (pid + num_processes - 1) % num_processes;
            const int recv_pid = (pid + 1) % num_processes;
 
-           MPI_Irecv(&device->vba.in[i][dst_idx], count, datatype, recv_pid, NUM_VTXBUF_HANDLES + i,
-                     MPI_COMM_WORLD, &recv_requests[NUM_VTXBUF_HANDLES + i]);
-
+           MPI_Irecv(&device->vba.in[i][dst_idx], count, datatype, recv_pid,
+                     NUM_VTXBUF_HANDLES + i, MPI_COMM_WORLD,
+                     &recv_requests[NUM_VTXBUF_HANDLES + i]);
        }
     }
-    acDeviceSynchronizeStream(device, STREAM_DEFAULT); // Ensure that local bounds are done before sending
+
     for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
+        acDeviceSynchronizeStream(device, (Stream)i);
        { // Front
            // ...|ooooxxx|... -> xxx|ooooooo|...
            const size_t src_idx = acVertexBufferIdx(0, 0, device->local_config.int_params[AC_nz],
                                                     device->local_config);
            const size_t dst_idx = acVertexBufferIdx(0, 0, 0, device->local_config);
            const int send_pid = (pid + 1) % num_processes;
-           //const int recv_pid = (pid + num_processes - 1) % num_processes;
+           // const int recv_pid = (pid + num_processes - 1) % num_processes;
 
-           MPI_Request request;
+           MPI_Request request;
            MPI_Isend(&device->vba.in[i][src_idx], count, datatype, send_pid, i, MPI_COMM_WORLD,
                      &request);
-
        }
        { // Back
            // ...|ooooooo|xxx <- ...|xxxoooo|...
@@ -921,16 +925,16 @@ acDeviceCommunicateHalosMPI(const Device device)
            const size_t dst_idx = acVertexBufferIdx(
                0, 0, NGHOST + device->local_config.int_params[AC_nz], device->local_config);
            const int send_pid = (pid + num_processes - 1) % num_processes;
-           //const int recv_pid = (pid + 1) % num_processes;
+           // const int recv_pid = (pid + 1) % num_processes;
 
            MPI_Request request;
            MPI_Isend(&device->vba.in[i][src_idx], count, datatype, send_pid, NUM_VTXBUF_HANDLES + i,
                      MPI_COMM_WORLD, &request);
        }
     }
-    MPI_Status recv_statuses[2*NUM_VTXBUF_HANDLES];
-    MPI_Waitall(2*NUM_VTXBUF_HANDLES, recv_requests, recv_statuses);
-
+    MPI_Status recv_statuses[2 * NUM_VTXBUF_HANDLES];
+    MPI_Waitall(2 * NUM_VTXBUF_HANDLES, recv_requests, recv_statuses);
+
     return AC_SUCCESS;
 }
 #if 0 // Ok basic working
@@ -977,7 +981,7 @@ acDeviceCommunicateHalosMPI(const Device device)
            MPI_Irecv(&device->vba.in[i][dst_idx], count, datatype, recv_pid,
                      NUM_VTXBUF_HANDLES + i, MPI_COMM_WORLD,
                      &recv_requests[NUM_VTXBUF_HANDLES + i]);
-            
+
            MPI_Request request;
            MPI_Isend(&device->vba.in[i][src_idx], count, datatype, send_pid,
                      NUM_VTXBUF_HANDLES + i, MPI_COMM_WORLD, &request);
@@ -985,7 +989,7 @@ acDeviceCommunicateHalosMPI(const Device device)
     }
     MPI_Status recv_statuses[2*NUM_VTXBUF_HANDLES];
     MPI_Waitall(2*NUM_VTXBUF_HANDLES, recv_requests, recv_statuses);
-    
+
     return AC_SUCCESS;
 }
 #endif
@@ -1173,9 +1177,9 @@ acDeviceRunMPITest(void)
 {
     int num_processes, pid;
     MPI_Init(NULL, NULL);
-    //int provided;
-    //MPI_Init_thread(NULL, NULL, MPI_THREAD_MULTIPLE, &provided); // Hybrid MP + MPI
-    //ERRCHK_ALWAYS(provided == MPI_THREAD_MULTIPLE);
+    // int provided;
+    // MPI_Init_thread(NULL, NULL, MPI_THREAD_MULTIPLE, &provided); // Hybrid MP + MPI
+    // ERRCHK_ALWAYS(provided == MPI_THREAD_MULTIPLE);
 
     MPI_Comm_size(MPI_COMM_WORLD, &num_processes);
     MPI_Comm_rank(MPI_COMM_WORLD, &pid);
@@ -1207,18 +1211,20 @@ acDeviceRunMPITest(void)
 #endif /* MPIX_CUDA_AWARE_SUPPORT */
     //////// Borrowing end
 
-    int direct = getenv("MPICH_RDMA_ENABLED_CUDA")==NULL?0:atoi(getenv
-    ("MPICH_RDMA_ENABLED_CUDA"));
-    if(direct != 1){
-        printf ("MPICH_RDMA_ENABLED_CUDA not enabled!\n");
-        exit (EXIT_FAILURE);
-    }
+    int direct = getenv("MPICH_RDMA_ENABLED_CUDA") == NULL
+                     ? 0
+                     : atoi(getenv("MPICH_RDMA_ENABLED_CUDA"));
+    if (direct != 1) {
+        printf("MPICH_RDMA_ENABLED_CUDA not enabled!\n");
+        exit(EXIT_FAILURE);
+    }
+
     // Create model and candidate meshes
     AcMeshInfo info;
     acLoadConfig(AC_DEFAULT_CONFIG, &info);
     const int nn = 256;
     info.int_params[AC_nx] = info.int_params[AC_ny] = nn;
-    info.int_params[AC_nz] = 4*512;
+    info.int_params[AC_nz] = 4 * 512;
     acUpdateConfig(&info);
 
     AcMesh model, candidate;
@@ -1264,7 +1270,6 @@ acDeviceRunMPITest(void)
 
     acDeviceCreate(0, submesh_info, &device);
     acDeviceLoadMesh(device, STREAM_DEFAULT, submesh);
-
     // Warmup
     acDeviceSynchronizeStream(device, STREAM_ALL);
     for (int i = 0; i < 10; ++i) {
@@ -1275,8 +1280,8 @@ acDeviceRunMPITest(void)
             acDevicePeriodicBoundconds(device, STREAM_DEFAULT, start, end);
         }
         acDeviceSynchronizeStream(device, STREAM_DEFAULT);
-        // Includes periodic bounds at first and last ghost zone
-        acDeviceCommunicateHalosMPI(device);
+        // Includes periodic bounds at first and last ghost zone
+        acDeviceCommunicateHalosMPI(device, subgrid_m);
     }
 
     ////////////////////////////// Timer start
@@ -1285,17 +1290,11 @@ acDeviceRunMPITest(void)
     timer_reset(&total_time);
     for (int i = 0; i < num_iters; ++i) {
         ///// Communication start
-        {
-            const int3 start = (int3){0, 0, NGHOST};
-            const int3 end = (int3){subgrid_m.x, subgrid_m.y, subgrid_m.z - NGHOST};
-            acDevicePeriodicBoundconds(device, STREAM_DEFAULT, start, end);
-        }
 #if 1 // GPU-GPU if CUDA-aware MPI, otherwise managed CPU-GPU-GPU-CPU
-        //acDeviceSynchronizeStream(device, STREAM_DEFAULT);
-        //MPI_Barrier(MPI_COMM_WORLD);
-        acDeviceCommunicateHalosMPI(
-            device); // Includes periodic bounds at first and last ghost zone
-        MPI_Barrier(MPI_COMM_WORLD);
+        // acDeviceSynchronizeStream(device, STREAM_DEFAULT);
+        // MPI_Barrier(MPI_COMM_WORLD);
+        acDeviceCommunicateHalosMPI(device, subgrid_m);
+        // Includes periodic bounds at first and last ghost zone
 #else // Explicit GPU-CPU-CPU-GPU
         acDeviceStoreMesh(device, STREAM_DEFAULT, &submesh);
         acHostCommunicateHalosMPI(&submesh);
@@ -1307,7 +1306,7 @@ acDeviceRunMPITest(void)
         const double ms_elapsed = timer_diff_nsec(total_time) / 1e6;
         printf("vertices: %d^3, iterations: %d\n", nn, num_iters);
         printf("Total time: %f ms\n", ms_elapsed);
-        printf("Time per step: %f ms\n", ms_elapsed / num_iters);
+        printf("Time per step: %f ms\n", ms_elapsed / num_iters);
     }
     ////////////////////////////// Timer end