Attempted to overlap MPI communication with the computation of boundary conditions. However, NVIDIA seems to have left one important detail out of the documentation for CUDA-aware MPI: it looks like CUDA streams are not supported with CUDA-aware MPI communication. So, in the end, the fastest solution might be to use old-school GPU->CPU->CPU->GPU MPI communication after all.

2019-10-21 15:50:53 +02:00
parent f120343110
commit 915e1c7c14
1 changed files with 45 additions and 46 deletions
--- a/src/core/device.cu
+++ b/src/core/device.cu
@@ -192,8 +192,8 @@ acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device_hand
    }

    // Reductions
-    ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->reduce_scratchpad,
-                                         acVertexBufferCompdomainSizeBytes(device_config)));
+    ERRCHK_CUDA_ALWAYS(
+        cudaMalloc(&device->reduce_scratchpad, acVertexBufferCompdomainSizeBytes(device_config)));
    ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->reduce_result, sizeof(AcReal)));

 #if PACKED_DATA_TRANSFERS
@@ -860,9 +860,14 @@ acDeviceGatherMeshMPI(const AcMesh src, AcMesh* dst)

 /** NOTE: Assumes 1 process per GPU */
 static AcResult
-acDeviceCommunicateHalosMPI(const Device device)
+acDeviceCommunicateHalosMPI(const Device device, const int3 subgrid_m)
 {
-    //MPI_Barrier(MPI_COMM_WORLD);
+    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
+        const int3 start = (int3){0, 0, NGHOST};
+        const int3 end   = (int3){subgrid_m.x, subgrid_m.y, subgrid_m.z - NGHOST};
+        acDevicePeriodicBoundcondStep(device, (Stream)i, (VertexBufferHandle)i, start, end);
+    }
+
    MPI_Datatype datatype = MPI_FLOAT;
    if (sizeof(AcReal) == 8)
        datatype = MPI_DOUBLE;
@@ -885,7 +890,6 @@ acDeviceCommunicateHalosMPI(const Device device)

            MPI_Irecv(&device->vba.in[i][dst_idx], count, datatype, recv_pid, i, MPI_COMM_WORLD,
                      &recv_requests[i]);
-
        }
        { // Back
            // ...|ooooooo|xxx <- ...|xxxoooo|...
@@ -895,13 +899,14 @@ acDeviceCommunicateHalosMPI(const Device device)
            // const int send_pid = (pid + num_processes - 1) % num_processes;
            const int recv_pid = (pid + 1) % num_processes;

-            MPI_Irecv(&device->vba.in[i][dst_idx], count, datatype, recv_pid, NUM_VTXBUF_HANDLES + i,
-                     MPI_COMM_WORLD, &recv_requests[NUM_VTXBUF_HANDLES + i]);
+            MPI_Irecv(&device->vba.in[i][dst_idx], count, datatype, recv_pid,
+                      NUM_VTXBUF_HANDLES + i, MPI_COMM_WORLD,
+                      &recv_requests[NUM_VTXBUF_HANDLES + i]);
+        }
+    }

-        }
-    }
-    acDeviceSynchronizeStream(device, STREAM_DEFAULT); // Ensure that local bounds are done before sending
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
+        acDeviceSynchronizeStream(device, (Stream)i);
        { // Front
            // ...|ooooxxx|... -> xxx|ooooooo|...
            const size_t src_idx = acVertexBufferIdx(0, 0, device->local_config.int_params[AC_nz],
@@ -913,7 +918,6 @@ acDeviceCommunicateHalosMPI(const Device device)
            MPI_Request request;
            MPI_Isend(&device->vba.in[i][src_idx], count, datatype, send_pid, i, MPI_COMM_WORLD,
                      &request);
-
        }
        { // Back
            // ...|ooooooo|xxx <- ...|xxxoooo|...
@@ -1207,7 +1211,9 @@ acDeviceRunMPITest(void)
 #endif /* MPIX_CUDA_AWARE_SUPPORT */
       //////// Borrowing end

-    int direct = getenv("MPICH_RDMA_ENABLED_CUDA")==NULL?0:atoi(getenv ("MPICH_RDMA_ENABLED_CUDA"));
+    int direct = getenv("MPICH_RDMA_ENABLED_CUDA") == NULL
+                     ? 0
+                     : atoi(getenv("MPICH_RDMA_ENABLED_CUDA"));
    if (direct != 1) {
        printf("MPICH_RDMA_ENABLED_CUDA not enabled!\n");
        exit(EXIT_FAILURE);
@@ -1264,7 +1270,6 @@ acDeviceRunMPITest(void)
    acDeviceCreate(0, submesh_info, &device);
    acDeviceLoadMesh(device, STREAM_DEFAULT, submesh);

-
    // Warmup
    acDeviceSynchronizeStream(device, STREAM_ALL);
    for (int i = 0; i < 10; ++i) {
@@ -1276,7 +1281,7 @@ acDeviceRunMPITest(void)
        }
        acDeviceSynchronizeStream(device, STREAM_DEFAULT);
        // Includes periodic bounds at first and last ghost zone
-        acDeviceCommunicateHalosMPI(device); 
+        acDeviceCommunicateHalosMPI(device, subgrid_m);
    }

    ////////////////////////////// Timer start
@@ -1285,17 +1290,11 @@ acDeviceRunMPITest(void)
    timer_reset(&total_time);
    for (int i = 0; i < num_iters; ++i) {
        ///// Communication start
-        {
-            const int3 start = (int3){0, 0, NGHOST};
-            const int3 end   = (int3){subgrid_m.x, subgrid_m.y, subgrid_m.z - NGHOST};
-            acDevicePeriodicBoundconds(device, STREAM_DEFAULT, start, end);
-        }
 #if 1 // GPU-GPU if CUDA-aware MPI, otherwise managed CPU-GPU-GPU-CPU
      // acDeviceSynchronizeStream(device, STREAM_DEFAULT);
      // MPI_Barrier(MPI_COMM_WORLD);
-        acDeviceCommunicateHalosMPI(
-            device); // Includes periodic bounds at first and last ghost zone
-        MPI_Barrier(MPI_COMM_WORLD);
+        acDeviceCommunicateHalosMPI(device, subgrid_m);
+        // Includes periodic bounds at first and last ghost zone
 #else // Explicit GPU-CPU-CPU-GPU
        acDeviceStoreMesh(device, STREAM_DEFAULT, &submesh);
        acHostCommunicateHalosMPI(&submesh);