Added comments and a short overview of the MPI implementation
This commit is contained in:
@@ -460,6 +460,37 @@ acDeviceReduceVec(const Device device, const Stream stream, const ReductionType
|
|||||||
}
|
}
|
||||||
|
|
||||||
#if AC_MPI_ENABLED
|
#if AC_MPI_ENABLED
|
||||||
|
/**
|
||||||
|
Quick overview of the MPI implementation:
|
||||||
|
|
||||||
|
The halo is partitioned into segments. The first coordinate of a segment is b0.
|
||||||
|
The array containing multiple b0s is called... "b0s".
|
||||||
|
|
||||||
|
Each b0 maps to an index in computational domain of some neighboring process a0.
|
||||||
|
We have a0 = mod(b0 - nghost, nn) + nghost.
|
||||||
|
Intuitively, we
|
||||||
|
1) Transform b0 into coordinate system where (0, 0, 0) is the first index in
|
||||||
|
the comp domain.
|
||||||
|
2) Wrap the transformed b0 around nn (comp domain)
|
||||||
|
3) Transform b0 back to a coordinate system where (0, 0, 0) is the first index
|
||||||
|
in the ghost zone
|
||||||
|
|
||||||
|
struct PackedData is used for packing and unpacking and holds the actual data in
|
||||||
|
the halo partition
|
||||||
|
struct CommData holds multiple PackedDatas for sending and receiving halo
|
||||||
|
partition
|
||||||
|
struct Grid contains information about the GPU device, decomposition, the total
|
||||||
|
mesh dimensions and CommDatas
|
||||||
|
|
||||||
|
|
||||||
|
Basic steps:
|
||||||
|
1) Distribute the mesh among ranks
|
||||||
|
2) Integrate & communicate
|
||||||
|
- start inner integration and at the same time, pack halo data and send it to neighbors
|
||||||
|
- once all halo data has been received, unpack and do outer integration
|
||||||
|
- sync and start again
|
||||||
|
3) Gather the mesh to rank 0 for postprocessing
|
||||||
|
*/
|
||||||
#include <mpi.h>
|
#include <mpi.h>
|
||||||
|
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
@@ -1003,7 +1034,7 @@ acUnpinCommData(const Device device, CommData* data)
|
|||||||
|
|
||||||
static AcResult
|
static AcResult
|
||||||
acTransferCommData(const Device device, //
|
acTransferCommData(const Device device, //
|
||||||
const int3* b0s, // Dst idx inside bound zone
|
const int3* b0s, // Halo partition coordinates
|
||||||
CommData* data)
|
CommData* data)
|
||||||
{
|
{
|
||||||
cudaSetDevice(device->id);
|
cudaSetDevice(device->id);
|
||||||
@@ -1072,7 +1103,6 @@ acTransferCommData(const Device device, //
|
|||||||
cudaStreamSynchronize(data->streams[b0_idx]);
|
cudaStreamSynchronize(data->streams[b0_idx]);
|
||||||
MPI_Isend(src->data_pinned, count, datatype, npid, b0_idx, //
|
MPI_Isend(src->data_pinned, count, datatype, npid, b0_idx, //
|
||||||
MPI_COMM_WORLD, &data->send_reqs[b0_idx]);
|
MPI_COMM_WORLD, &data->send_reqs[b0_idx]);
|
||||||
src->pinned = true;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user