From 7b475b6dee8f4824ff6ffd13cba30934806577a8 Mon Sep 17 00:00:00 2001
From: Johannes Pekkila
Date: Fri, 18 Oct 2019 11:50:22 +0200
Subject: [PATCH] Better MPI synchronization

---
 src/core/device.cu | 79 +++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 78 insertions(+), 1 deletion(-)

diff --git a/src/core/device.cu b/src/core/device.cu
index 849672c..6d40379 100644
--- a/src/core/device.cu
+++ b/src/core/device.cu
@@ -858,6 +858,82 @@ acDeviceGatherMeshMPI(const AcMesh src, AcMesh* dst)
     }
 }
 
+/** NOTE: Assumes 1 process per GPU and a 1D domain decomposition along z. */
+static AcResult
+acDeviceCommunicateHalosMPI(const Device device)
+{
+    // Match the MPI datatype to the precision AcReal was compiled with
+    const MPI_Datatype datatype = sizeof(AcReal) == 8 ? MPI_DOUBLE : MPI_FLOAT;
+
+    int pid, num_processes;
+    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
+    MPI_Comm_size(MPI_COMM_WORLD, &num_processes);
+
+    // One halo plate: a full xy-slab (incl. ghost zones in x and y), NGHOST deep
+    const size_t count = device->local_config.int_params[AC_mx] *
+                         device->local_config.int_params[AC_my] * NGHOST;
+
+    // Post all receives first so that every send below finds a matching receive
+    MPI_Request recv_requests[2 * NUM_VTXBUF_HANDLES];
+    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
+        { // Front
+            // ...|ooooxxx|... -> xxx|ooooooo|...
+            const size_t dst_idx = acVertexBufferIdx(0, 0, 0, device->local_config);
+            const int recv_pid   = (pid + num_processes - 1) % num_processes;
+
+            MPI_Irecv(&device->vba.in[i][dst_idx], count, datatype, recv_pid, i, MPI_COMM_WORLD,
+                      &recv_requests[i]);
+        }
+        { // Back
+            // ...|ooooooo|xxx <- ...|xxxoooo|...
+            const size_t dst_idx = acVertexBufferIdx(
+                0, 0, NGHOST + device->local_config.int_params[AC_nz], device->local_config);
+            const int recv_pid = (pid + 1) % num_processes;
+
+            MPI_Irecv(&device->vba.in[i][dst_idx], count, datatype, recv_pid,
+                      NUM_VTXBUF_HANDLES + i, MPI_COMM_WORLD,
+                      &recv_requests[NUM_VTXBUF_HANDLES + i]);
+        }
+    }
+    // Ensure that the local boundary conditions are done before sending
+    acDeviceSynchronizeStream(device, STREAM_DEFAULT);
+
+    // Keep the send requests so they can be completed below instead of leaking
+    MPI_Request send_requests[2 * NUM_VTXBUF_HANDLES];
+    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
+        { // Front
+            // ...|ooooxxx|... -> xxx|ooooooo|...
+            const size_t src_idx = acVertexBufferIdx(0, 0, device->local_config.int_params[AC_nz],
+                                                     device->local_config);
+            const int send_pid   = (pid + 1) % num_processes;
+
+            MPI_Isend(&device->vba.in[i][src_idx], count, datatype, send_pid, i, MPI_COMM_WORLD,
+                      &send_requests[i]);
+        }
+        { // Back
+            // ...|ooooooo|xxx <- ...|xxxoooo|...
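+            // The first NGHOST interior slabs fill the back ghost zone of the
+            // previous process (matches the Irecv tagged NUM_VTXBUF_HANDLES + i)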
+            const size_t src_idx = acVertexBufferIdx(0, 0, NGHOST, device->local_config);
+            const int send_pid   = (pid + num_processes - 1) % num_processes;
+
+            MPI_Isend(&device->vba.in[i][src_idx], count, datatype, send_pid,
+                      NUM_VTXBUF_HANDLES + i, MPI_COMM_WORLD,
+                      &send_requests[NUM_VTXBUF_HANDLES + i]);
+        }
+    }
+    // Wait until the ghost zones have arrived; completing the sends as well
+    // prevents the request handles from leaking
+    MPI_Status recv_statuses[2 * NUM_VTXBUF_HANDLES];
+    MPI_Waitall(2 * NUM_VTXBUF_HANDLES, recv_requests, recv_statuses);
+    MPI_Waitall(2 * NUM_VTXBUF_HANDLES, send_requests, MPI_STATUSES_IGNORE);
+
+    return AC_SUCCESS;
+}
+#if 0 // OK, basic working version
 /** NOTE: Assumes 1 process per GPU */
 static AcResult
 acDeviceCommunicateHalosMPI(const Device device)
@@ -912,6 +988,7 @@ acDeviceCommunicateHalosMPI(const Device device)
 
     return AC_SUCCESS;
 }
+#endif
 
 #if 0
 /** NOTE: Assumes 1 process per GPU */
@@ -1214,7 +1291,7 @@ acDeviceRunMPITest(void)
         acDevicePeriodicBoundconds(device, STREAM_DEFAULT, start, end);
     }
 #if 1 // GPU-GPU if CUDA-aware MPI, otherwise managed CPU-GPU-GPU-CPU
-    acDeviceSynchronizeStream(device, STREAM_DEFAULT);
+    //acDeviceSynchronizeStream(device, STREAM_DEFAULT); // done inside acDeviceCommunicateHalosMPI
     //MPI_Barrier(MPI_COMM_WORLD);
     acDeviceCommunicateHalosMPI(
         device); // Includes periodic bounds at first and last ghost zone
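
Note: acDeviceCommunicateHalosMPI above follows the usual nonblocking
ring-exchange recipe: post every MPI_Irecv first, synchronize the stream that
produces the halo data, post the MPI_Isends, then complete all requests with
MPI_Waitall. Below is a minimal, self-contained sketch of the same pattern in
plain MPI, reduced to one double per z-slab; the sizes NZ and NGHOST and the
buffer layout are hypothetical stand-ins, not the Astaroth API.

#include <mpi.h>

#define NGHOST 3 /* ghost zone depth (hypothetical) */
#define NZ 16    /* interior slabs per process (hypothetical) */

int
main(int argc, char** argv)
{
    MPI_Init(&argc, &argv);

    int pid, num_processes;
    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
    MPI_Comm_size(MPI_COMM_WORLD, &num_processes);

    /* Layout along z: [front ghosts | interior | back ghosts] */
    double buf[NGHOST + NZ + NGHOST];
    for (int z = 0; z < NGHOST + NZ + NGHOST; ++z)
        buf[z] = (double)pid; /* mark slabs with the owning rank */

    const int prev = (pid + num_processes - 1) % num_processes;
    const int next = (pid + 1) % num_processes;

    /* Post both receives before any send so a matching receive is always
       pending; tag 0 = front halo, tag 1 = back halo */
    MPI_Request reqs[4];
    MPI_Irecv(&buf[0], NGHOST, MPI_DOUBLE, prev, 0, MPI_COMM_WORLD, &reqs[0]);
    MPI_Irecv(&buf[NGHOST + NZ], NGHOST, MPI_DOUBLE, next, 1, MPI_COMM_WORLD, &reqs[1]);

    /* Last NGHOST interior slabs -> next rank's front ghost zone,
       first NGHOST interior slabs -> previous rank's back ghost zone */
    MPI_Isend(&buf[NZ], NGHOST, MPI_DOUBLE, next, 0, MPI_COMM_WORLD, &reqs[2]);
    MPI_Isend(&buf[NGHOST], NGHOST, MPI_DOUBLE, prev, 1, MPI_COMM_WORLD, &reqs[3]);

    /* Complete receives and sends; waiting on the sends too avoids
       leaking request handles */
    MPI_Waitall(4, reqs, MPI_STATUSES_IGNORE);

    MPI_Finalize();
    return 0;
}

Built with mpicc and run with e.g. mpirun -np 4, each rank ends with its front
ghost slabs holding the previous rank's values and its back ghost slabs holding
the next rank's values, i.e. the periodic coupling the patch relies on.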