Added comments and a short overview of the MPI implementation

2020-05-28 17:05:12 +03:00
parent f1138b04ac
commit 01ad141d90
1 changed files with 32 additions and 2 deletions
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -460,6 +460,37 @@ acDeviceReduceVec(const Device device, const Stream stream, const ReductionType
 }
 #if AC_MPI_ENABLED
 /**
 Quick overview of the MPI implementation:
 The halo is partitioned into segments. The first coordinate of a segment is b0.
 The array containing multiple b0s is called... "b0s".
 Each b0 maps to an index in computational domain of some neighboring process a0.
 We have a0 = mod(b0 - nghost, nn) + nghost.
 Intuitively, we
  1) Transform b0 into coordinate system where (0, 0, 0) is the first index in
     the comp domain.
  2) Wrap the transformed b0 around nn (comp domain)
  3) Transform b0 back to a coordinate system where (0, 0, 0) is the first index
     in the ghost zone
 struct PackedData is used for packing and unpacking and holds the actual data in
                  the halo partition
 struct CommData holds multiple PackedDatas for sending and receiving halo
                partition
 struct Grid contains information about the GPU device, decomposition, the total
            mesh dimensions and CommDatas
 Basic steps:
  1) Distribute the mesh among ranks
  2) Integrate & communicate
    - start inner integration and at the same time, pack halo data and send it to neighbors
    - once all halo data has been received, unpack and do outer integration
    - sync and start again
  3) Gather the mesh to rank 0 for postprocessing
 */
 #include <mpi.h>
 #include <stdint.h>
@@ -1003,7 +1034,7 @@ acUnpinCommData(const Device device, CommData* data)
 static AcResult
 acTransferCommData(const Device device, //
-                   const int3* b0s,     // Dst idx inside bound zone
+                   const int3* b0s,     // Halo partition coordinates
                   CommData* data)
 {
    cudaSetDevice(device->id);
@@ -1072,7 +1103,6 @@ acTransferCommData(const Device device, //
            cudaStreamSynchronize(data->streams[b0_idx]);
            MPI_Isend(src->data_pinned, count, datatype, npid, b0_idx, //
                      MPI_COMM_WORLD, &data->send_reqs[b0_idx]);
            src->pinned = true;
        }
    }