From 01ad141d90df3f4bfca70dba551cdbc2d7e83beb Mon Sep 17 00:00:00 2001
From: jpekkila
Date: Thu, 28 May 2020 17:05:12 +0300
Subject: [PATCH] Added comments and a short overview of the MPI implementation

---
 src/core/device.cc | 34 ++++++++++++++++++++++++++++++++--
 1 file changed, 32 insertions(+), 2 deletions(-)

diff --git a/src/core/device.cc b/src/core/device.cc
index b7295d7..f63a0e8 100644
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -460,6 +460,37 @@ acDeviceReduceVec(const Device device, const Stream stream, const ReductionType
 }
 
 #if AC_MPI_ENABLED
+/**
+Quick overview of the MPI implementation:
+
+The halo is partitioned into segments. The first coordinate of a segment is
+b0. The array containing multiple b0s is called "b0s".
+
+Each b0 maps to an index a0 in the computational domain of some neighboring
+process. We have a0 = mod(b0 - nghost, nn) + nghost. Intuitively, we
+  1) Transform b0 into a coordinate system where (0, 0, 0) is the first
+     index in the computational domain.
+  2) Wrap the transformed b0 around nn (the computational domain).
+  3) Transform b0 back into a coordinate system where (0, 0, 0) is the first
+     index in the ghost zone.
+
+struct PackedData is used for packing and unpacking and holds the actual
+  data of a halo partition.
+struct CommData holds multiple PackedDatas for sending and receiving halo
+  partitions.
+struct Grid contains information about the GPU device, the decomposition,
+  the total mesh dimensions, and the CommDatas.
+
+Basic steps:
+  1) Distribute the mesh among the ranks.
+  2) Integrate & communicate:
+     - Start the inner integration and, at the same time, pack halo data
+       and send it to the neighbors.
+     - Once all halo data has been received, unpack it and do the outer
+       integration.
+     - Sync and start again.
+  3) Gather the mesh to rank 0 for postprocessing.
+*/
 #include <mpi.h>
 #include
 
@@ -1003,7 +1034,7 @@ acUnpinCommData(const Device device, CommData* data)
 
 static AcResult
 acTransferCommData(const Device device, //
-                   const int3* b0s,     // Dst idx inside bound zone
+                   const int3* b0s,     // Halo partition coordinates
                    CommData* data)
 {
     cudaSetDevice(device->id);
@@ -1072,7 +1103,6 @@ acTransferCommData(const Device device, //
             cudaStreamSynchronize(data->streams[b0_idx]);
             MPI_Isend(src->data_pinned, count, datatype, npid, b0_idx, //
                       MPI_COMM_WORLD, &data->send_reqs[b0_idx]);
-            src->pinned = true;
         }
     }
 
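
To make the mapping a0 = mod(b0 - nghost, nn) + nghost from the new comment
concrete, here is a minimal standalone sketch in C. The local int3 typedef
stands in for CUDA's int3, nghost is assumed to be the same on every axis,
and the helper names (mod_euclid, halo_to_comp_domain) are illustrative, not
part of device.cc:

typedef struct { int x, y, z; } int3; // Same layout as CUDA's int3

/* Euclidean modulo: the result is always in [0, n). C's % operator can
 * return a negative value when the left operand is negative, which happens
 * here for the lower halo, where b0 - nghost < 0. */
static int
mod_euclid(const int a, const int n)
{
    const int r = a % n;
    return r < 0 ? r + n : r;
}

/* Map a halo-segment coordinate b0 to the index a0 in the computational
 * domain of the neighboring process that owns the corresponding data:
 *   1) shift so that (0, 0, 0) is the first index of the comp domain,
 *   2) wrap around the comp domain dimensions nn,
 *   3) shift back so that (0, 0, 0) is the first index of the ghost zone. */
static int3
halo_to_comp_domain(const int3 b0, const int3 nn, const int nghost)
{
    const int3 a0 = {
        mod_euclid(b0.x - nghost, nn.x) + nghost,
        mod_euclid(b0.y - nghost, nn.y) + nghost,
        mod_euclid(b0.z - nghost, nn.z) + nghost,
    };
    return a0;
}

For example, with nn = (128, 128, 128) and nghost = 3, the halo coordinate
b0 = (0, 3, 130) maps to a0 = (128, 3, 130): the lower x-halo is filled from
the far edge of the neighbor's computational domain, while coordinates that
already lie inside the domain map to themselves.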
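
The division of labor between PackedData and CommData can be summarized with
a hypothetical layout. Only data_pinned, pinned, streams, and send_reqs are
visible in the diff itself; every other field here is an assumption for
illustration, and the real definitions in device.cc may differ. AcReal and
Device are the Astaroth scalar and device types, cudaStream_t and MPI_Request
come from the CUDA and MPI headers:

/* Hypothetical sketch, not the actual definitions. */
typedef struct {
    int3 dims;           /* Extent of one halo segment                     */
    AcReal* data;        /* Device buffer holding the packed segment       */
    AcReal* data_pinned; /* Pinned host buffer for MPI transfers (in diff) */
    bool pinned;         /* Data currently in the pinned buffer (in diff)  */
} PackedData;

typedef struct {
    size_t count;           /* Number of halo segments (one per b0)   */
    PackedData* srcs;       /* Segments packed for sending            */
    PackedData* dsts;       /* Segments received from the neighbors   */
    cudaStream_t* streams;  /* One stream per segment (in diff)       */
    MPI_Request* send_reqs; /* Outstanding sends (in diff)            */
    MPI_Request* recv_reqs; /* Outstanding receives                   */
} CommData;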
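
The "Basic steps" in the comment imply a per-step loop that overlaps the
inner integration with the halo exchange. Below is a sketch of step 2 under
stated assumptions: launch_inner_integration, launch_outer_integration,
pack_halo_segment, unpack_halo_segment, neighbor_rank, and counts are
illustrative stand-ins rather than the actual Astaroth API, and AcReal is
assumed to be double for the MPI datatype. Only the Isend/Irecv pattern, the
per-segment tag, and the stream synchronization mirror the diff:

static void
integration_step(const Device device, const Grid* grid, CommData* comm,
                 const int3* b0s, const int* counts)
{
    /* Async kernel on the interior: it does not depend on the halo. */
    launch_inner_integration(device);

    /* Overlap: pack and exchange each halo segment while the inner
     * integration kernel runs. The segment index doubles as the MPI tag,
     * as with b0_idx in acTransferCommData. */
    for (size_t i = 0; i < comm->count; ++i) {
        const int npid = neighbor_rank(grid, b0s[i]);
        MPI_Irecv(comm->dsts[i].data_pinned, counts[i], MPI_DOUBLE, npid,
                  (int)i, MPI_COMM_WORLD, &comm->recv_reqs[i]);
        pack_halo_segment(device, b0s[i], &comm->srcs[i]); /* On streams[i] */
        cudaStreamSynchronize(comm->streams[i]);           /* Pack done     */
        MPI_Isend(comm->srcs[i].data_pinned, counts[i], MPI_DOUBLE, npid,
                  (int)i, MPI_COMM_WORLD, &comm->send_reqs[i]);
    }

    /* The outer integration needs fresh halo data: wait, unpack, launch. */
    MPI_Waitall((int)comm->count, comm->recv_reqs, MPI_STATUSES_IGNORE);
    for (size_t i = 0; i < comm->count; ++i)
        unpack_halo_segment(device, b0s[i], &comm->dsts[i]);
    launch_outer_integration(device);

    /* Sync and start again. */
    MPI_Waitall((int)comm->count, comm->send_reqs, MPI_STATUSES_IGNORE);
    cudaDeviceSynchronize();
}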