Rewrote the previous implementation. It now fully works (verified) and gives the speedup we want: communication latency is completely hidden when running on at least two nodes (8 GPUs). Scaling looks very promising.

commit 427a3ac5d8
parent 37f1c841a3
Author: jpekkila
Date:   2020-04-06 17:28:02 +03:00

2 changed files with 93 additions and 57 deletions
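The latency hiding claimed above comes from overlapping the MPI halo exchange with integration of the inner domain. A minimal sketch of the pattern using the helpers that appear in this diff — the names halo_a0s, halo_b0s, halo_data, integrateInnerDomain, and integrateOuterDomain are hypothetical simplifications, and the exact argument lists are approximate; the real acGridIntegrate handles corners, edges, and sides as separate CommData batches:

    // Sketch: overlap the MPI halo exchange with inner-domain compute.
    acPackCommData(device, halo_a0s, &halo_data);     // pack halo zones on the GPU
    acTransferCommData(device, halo_a0s, &halo_data); // post MPI_Isend/MPI_Irecv pairs
    integrateInnerDomain(device, dt);                 // compute while messages are in flight
    acTransferCommDataWait(halo_data);                // block only once halos are needed
    acUnpinCommData(device, &halo_data);              // stage received halos out of pinned memory
    acUnpackCommData(device, halo_b0s, &halo_data);
    integrateOuterDomain(device, dt);                 // halo-dependent region last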


@@ -635,25 +635,32 @@ acTransferPackedDataToDevice(const Device device, const cudaStream_t stream, con
 #if AC_MPI_RT_PINNING
 static void
-acPinPackedData(const Device device, const cudaStream_t stream, PackedData* packed)
+acPinPackedData(const Device device, const cudaStream_t stream, PackedData* ddata)
 {
     cudaSetDevice(device->id);
+    // TODO sync stream
+    ddata->pinned = true;
 
-    const size_t bytes = packed->dims.x * packed->dims.y * packed->dims.z * sizeof(AcReal) *
+    const size_t bytes = ddata->dims.x * ddata->dims.y * ddata->dims.z * sizeof(ddata->data[0]) *
                          NUM_VTXBUF_HANDLES;
-    ERRCHK_CUDA(cudaMemcpyAsync(packed->data_pinned, packed->data, bytes, cudaMemcpyDeviceToDevice,
-                                stream));
+    ERRCHK_CUDA(
+        cudaMemcpyAsync(ddata->data_pinned, ddata->data, bytes, cudaMemcpyDeviceToHost, stream));
 }
 
 static void
-acUnpinPackedData(const Device device, const cudaStream_t stream, PackedData* packed)
+acUnpinPackedData(const Device device, const cudaStream_t stream, PackedData* ddata)
 {
-    cudaSetDevice(device->id);
+    if (!ddata->pinned)
+        return;
 
+    cudaSetDevice(device->id);
+    // TODO sync stream
+    ddata->pinned = false;
 
-    const size_t bytes = packed->dims.x * packed->dims.y * packed->dims.z * sizeof(AcReal) *
+    const size_t bytes = ddata->dims.x * ddata->dims.y * ddata->dims.z * sizeof(ddata->data[0]) *
                          NUM_VTXBUF_HANDLES;
-    ERRCHK_CUDA(cudaMemcpyAsync(packed->data, packed->data_pinned, bytes, cudaMemcpyDeviceToDevice,
-                                stream));
+    ERRCHK_CUDA(
+        cudaMemcpyAsync(ddata->data, ddata->data_pinned, bytes, cudaMemcpyHostToDevice, stream));
 }
 #endif // AC_MPI_RT_PINNING
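With AC_MPI_RT_PINNING, "pinning" is a staging copy rather than a cudaHostRegister call: acPinPackedData moves the packed halo from device global memory into the pinned host buffer (cudaMemcpyDeviceToHost), acUnpinPackedData moves it back (cudaMemcpyHostToDevice), and the new pinned flag makes unpinning a no-op for buffers whose payload never left gmem. A minimal sketch of the receive-side round trip, assuming data_pinned is page-locked host memory and that p, req, count, datatype, npid, and tag are set up as elsewhere in this file:

    // Receive into the pinned staging buffer, then stage back to gmem.
    MPI_Irecv(p->data_pinned, count, datatype, npid, tag, MPI_COMM_WORLD, &req);
    p->pinned = true;                     // remember where the payload landed
    MPI_Wait(&req, MPI_STATUS_IGNORE);
    acUnpinPackedData(device, stream, p); // pinned host -> gmem; no-op if !p->pinned
    cudaStreamSynchronize(stream);        // the gmem copy must land before unpacking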
@@ -963,6 +970,12 @@ static void
 acUnpinCommData(const Device device, CommData* data)
 {
     cudaSetDevice(device->id);
 
+    // Clear pin flags from src
+    for (size_t i = 0; i < data->count; ++i)
+        data->srcs[i].pinned = false;
+
+    // Transfer from pinned to gmem
     for (size_t i = 0; i < data->count; ++i)
         acUnpinPackedData(device, data->streams[i], &data->dsts[i]);
 }
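The asymmetry in acUnpinCommData is deliberate: a source buffer that was pinned lazily on the send path still holds its authoritative payload in gmem (pinning copies out of gmem, see acPinPackedData above), so clearing the flag suffices and the redundant host-to-device copy-back is skipped. Destination buffers, by contrast, may hold freshly received halos in data_pinned, so they get the full unpin.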
@@ -1169,27 +1182,23 @@ acTransferCommData(const Device device, //
                 const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES;
 
 #if MPI_GPUDIRECT_DISABLED
-                PackedData src = data->srcs_host[a_idx];
+                PackedData* dst = &data->dsts_host[b_idx];
 #else
-                PackedData src = data->srcs[a_idx];
-#endif
-
-#if MPI_GPUDIRECT_DISABLED
-                PackedData dst = data->dsts_host[b_idx];
-#else
-                PackedData dst = data->dsts[b_idx];
+                PackedData* dst = &data->dsts[b_idx];
 #endif
 
-                const int3 pid3d     = getPid3D(pid, decomp);
-                const int npid_recv  = getPid(pid3d - neighbor, decomp);
-                const int npid_send  = getPid(pid3d + neighbor, decomp);
+                const int3 pid3d = getPid3D(pid, decomp);
+                const int npid   = getPid(pid3d - neighbor, decomp);
 
-                if (onTheSameNode(pid, npid_recv)) {
-                    MPI_Irecv(dst.data, count, datatype, npid_recv, b_idx,
-                              MPI_COMM_WORLD, &data->recv_reqs[b_idx]);
+                if (onTheSameNode(pid, npid)) {
+                    MPI_Irecv(dst->data, count, datatype, npid, b_idx, MPI_COMM_WORLD,
+                              &data->recv_reqs[b_idx]);
+                    dst->pinned = false;
                 }
                 else {
-                    MPI_Irecv(dst.data_pinned, count, datatype, npid_recv, b_idx,
+                    MPI_Irecv(dst->data_pinned, count, datatype, npid, b_idx,
                               MPI_COMM_WORLD, &data->recv_reqs[b_idx]);
+                    dst->pinned = true;
                 }
             }
         }
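Note the two flag updates: dst->pinned records which buffer the incoming message will land in, so that acUnpinPackedData later copies out of the right one. onTheSameNode itself is not part of this diff; a plausible implementation (an assumption, not necessarily the project's actual helper) derives a per-node color once via MPI_COMM_TYPE_SHARED and compares the colors of the two ranks:

    // Hypothetical helper: true iff world ranks a and b share a shared-memory node.
    static bool
    onTheSameNode(const int a, const int b)
    {
        static int* node_colors = NULL;
        if (!node_colors) {
            int nprocs, rank;
            MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
            MPI_Comm_rank(MPI_COMM_WORLD, &rank);

            MPI_Comm node_comm;
            MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,
                                &node_comm);

            // Every rank on a node adopts the node leader's world rank as its color
            int color = rank;
            MPI_Bcast(&color, 1, MPI_INT, 0, node_comm);

            node_colors = (int*)malloc(nprocs * sizeof(node_colors[0]));
            MPI_Allgather(&color, 1, MPI_INT, node_colors, 1, MPI_INT, MPI_COMM_WORLD);
            MPI_Comm_free(&node_comm);
        }
        return node_colors[a] == node_colors[b];
    }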
@@ -1219,27 +1228,25 @@ acTransferCommData(const Device device, //
                 const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES;
 
 #if MPI_GPUDIRECT_DISABLED
-                PackedData src = data->srcs_host[a_idx];
+                PackedData* src = &data->srcs_host[a_idx];
 #else
-                PackedData src = data->srcs[a_idx];
-#endif
-
-#if MPI_GPUDIRECT_DISABLED
-                PackedData dst = data->dsts_host[b_idx];
-#else
-                PackedData dst = data->dsts[b_idx];
+                PackedData* src = &data->srcs[a_idx];
 #endif
 
-                const int3 pid3d     = getPid3D(pid, decomp);
-                const int npid_recv  = getPid(pid3d - neighbor, decomp);
-                const int npid_send  = getPid(pid3d + neighbor, decomp);
+                const int3 pid3d = getPid3D(pid, decomp);
+                const int npid   = getPid(pid3d + neighbor, decomp);
 
                 cudaStreamSynchronize(data->streams[a_idx]);
-                if (onTheSameNode(pid, npid_send)) {
-                    MPI_Isend(src.data, count, datatype, npid_send, b_idx,
-                              MPI_COMM_WORLD, &data->send_reqs[b_idx]);
+                if (onTheSameNode(pid, npid)) {
+                    MPI_Isend(src->data, count, datatype, npid, b_idx, MPI_COMM_WORLD,
+                              &data->send_reqs[b_idx]);
                 }
                 else {
-                    MPI_Isend(src.data_pinned, count, datatype, npid_send, b_idx,
+                    if (!src->pinned) {
+                        acPinPackedData(device, data->streams[a_idx], src);
+                        cudaStreamSynchronize(data->streams[a_idx]);
+                    }
+                    MPI_Isend(src->data_pinned, count, datatype, npid, b_idx,
                               MPI_COMM_WORLD, &data->send_reqs[b_idx]);
                 }
             }
         }
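Pinning is now lazy: the device-to-host staging copy happens inside the send loop, and only for halos whose receiver lives on another node; on-node messages go straight from gmem over the CUDA-aware path. The extra cudaStreamSynchronize after acPinPackedData matters, since MPI_Isend reads data_pinned on the host immediately and the asynchronous staging copy must have completed by then.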
@@ -1260,7 +1267,6 @@ acTransferCommDataWait(const CommData data)
         MPI_Wait(&data.recv_reqs[i], MPI_STATUS_IGNORE);
     }
 }
-
 #else
 static AcResult
 acTransferCommData(const Device device, //
@@ -1309,14 +1315,16 @@ acTransferCommData(const Device device, //
                 const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES;
 
 #if MPI_GPUDIRECT_DISABLED
-                PackedData dst = data->dsts_host[b_idx];
+                PackedData* dst = &data->dsts_host[b_idx];
 #else
-                PackedData dst = data->dsts[b_idx];
+                PackedData* dst = &data->dsts[b_idx];
 #endif
 
                 const int3 pid3d = getPid3D(pid, decomp);
-                MPI_Irecv(dst.data, count, datatype, getPid(pid3d - neighbor, decomp),
-                          b_idx, MPI_COMM_WORLD, &data->recv_reqs[b_idx]);
+                const int npid   = getPid(pid3d - neighbor, decomp);
+
+                MPI_Irecv(dst->data, count, datatype, npid, b_idx, MPI_COMM_WORLD,
+                          &data->recv_reqs[b_idx]);
             }
         }
     }
@@ -1345,16 +1353,16 @@ acTransferCommData(const Device device, //
                 const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES;
 
 #if MPI_GPUDIRECT_DISABLED
-                PackedData src = data->srcs_host[a_idx];
+                PackedData* src = &data->srcs_host[a_idx];
 #else
-                PackedData src = data->srcs[a_idx];
+                PackedData* src = &data->srcs[a_idx];
 #endif
 
                 const int3 pid3d = getPid3D(pid, decomp);
+                const int npid   = getPid(pid3d + neighbor, decomp);
 
                 cudaStreamSynchronize(data->streams[a_idx]);
-                MPI_Isend(src.data, count, datatype, getPid(pid3d + neighbor, decomp),
-                          b_idx, MPI_COMM_WORLD, &data->send_reqs[b_idx]);
+                MPI_Isend(src->data, count, datatype, npid, b_idx, MPI_COMM_WORLD,
+                          &data->send_reqs[b_idx]);
             }
         }
     }
@@ -1724,15 +1732,17 @@ acGridIntegrate(const Stream stream, const AcReal dt)
     acPackCommData(device, sidexz_a0s, &sidexz_data);
     acPackCommData(device, sideyz_a0s, &sideyz_data);
 
-#if AC_MPI_RT_PINNING
-    acPinCommData(device, &corner_data);
-    acPinCommData(device, &edgex_data);
-    acPinCommData(device, &edgey_data);
-    acPinCommData(device, &edgez_data);
-    acPinCommData(device, &sidexy_data);
-    acPinCommData(device, &sidexz_data);
-    acPinCommData(device, &sideyz_data);
-#endif // AC_MPI_RT_PINNING
+    /*
+    #if AC_MPI_RT_PINNING
+        acPinCommData(device, &corner_data);
+        acPinCommData(device, &edgex_data);
+        acPinCommData(device, &edgey_data);
+        acPinCommData(device, &edgez_data);
+        acPinCommData(device, &sidexy_data);
+        acPinCommData(device, &sidexz_data);
+        acPinCommData(device, &sideyz_data);
+    #endif
+    */
 
     //////////// INNER INTEGRATION //////////////
     {
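The eager per-step pinning pass is disabled here (and below in acGridPeriodicBoundconds) rather than deleted: with lazy pinning now inside acTransferCommData, pinning every buffer up front would pay the device-to-host staging cost even for halos that never leave the node.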
@@ -1788,7 +1798,7 @@ acGridIntegrate(const Stream stream, const AcReal dt)
     acUnpinCommData(device, &sidexy_data);
     acUnpinCommData(device, &sidexz_data);
     acUnpinCommData(device, &sideyz_data);
-#endif // AC_MPI_RT_PINNING
+#endif
 
     acUnpackCommData(device, corner_b0s, &corner_data);
     acUnpackCommData(device, edgex_b0s, &edgex_data);
@@ -1971,6 +1981,20 @@ acGridPeriodicBoundconds(const Stream stream)
     acPackCommData(device, sidexz_a0s, &sidexz_data);
     acPackCommData(device, sideyz_a0s, &sideyz_data);
 
+    /*
+    #if AC_MPI_RT_PINNING
+        acPinCommData(device, &corner_data);
+        acPinCommData(device, &edgex_data);
+        acPinCommData(device, &edgey_data);
+        acPinCommData(device, &edgez_data);
+        acPinCommData(device, &sidexy_data);
+        acPinCommData(device, &sidexz_data);
+        acPinCommData(device, &sideyz_data);
+    #endif
+    */
+
     MPI_Barrier(MPI_COMM_WORLD);
 
 #if MPI_GPUDIRECT_DISABLED
     acTransferCommDataToHost(device, &corner_data);
     acTransferCommDataToHost(device, &edgex_data);
@@ -2007,6 +2031,16 @@ acGridPeriodicBoundconds(const Stream stream)
     acTransferCommDataToDevice(device, &sideyz_data);
 #endif
 
+#if AC_MPI_RT_PINNING
+    acUnpinCommData(device, &corner_data);
+    acUnpinCommData(device, &edgex_data);
+    acUnpinCommData(device, &edgey_data);
+    acUnpinCommData(device, &edgez_data);
+    acUnpinCommData(device, &sidexy_data);
+    acUnpinCommData(device, &sidexz_data);
+    acUnpinCommData(device, &sideyz_data);
+#endif
+
     acUnpackCommData(device, corner_b0s, &corner_data);
     acUnpackCommData(device, edgex_b0s, &edgex_data);
     acUnpackCommData(device, edgey_b0s, &edgey_data);


@@ -3,6 +3,7 @@
 #if AC_MPI_ENABLED
 #include <mpi.h>
+#include <stdbool.h>
 
 #define AC_MPI_UNIDIRECTIONAL_COMM (0)
 #define AC_MPI_RT_PINNING (1)
@@ -14,7 +15,8 @@ typedef struct {
 #if (AC_MPI_ENABLED && AC_MPI_RT_PINNING)
     AcReal* data_pinned;
-#endif // (AC_MPI_ENABLED && AC_MPI_RT_PINNING)
+    bool pinned; // Set if data was received to pinned memory
+#endif // (AC_MPI_ENABLED && AC_MPI_RT_PINNING)
 
 #if (AC_MPI_ENABLED && AC_MPI_UNIDIRECTIONAL_COMM)
     MPI_Win win; // MPI window for RMA
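This second file is the header defining PackedData. The commit does not show where data_pinned is allocated; for the asynchronous staging copies above to behave as intended, it would have to be page-locked host memory. A sketch of what the allocation might look like — acCreatePackedData, ERRCHK_CUDA_ALWAYS, and the use of cudaMallocHost are assumptions based on the surrounding code, not part of this commit:

    // Hypothetical allocation of a PackedData with a pinned staging buffer.
    PackedData
    acCreatePackedData(const int3 dims)
    {
        PackedData data = {};
        data.dims = dims;

        const size_t bytes = dims.x * dims.y * dims.z * sizeof(data.data[0]) *
                             NUM_VTXBUF_HANDLES;
        ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&data.data, bytes)); // device gmem
    #if AC_MPI_RT_PINNING
        // Page-locked host memory: a valid MPI buffer and fast async D2H/H2D copies
        ERRCHK_CUDA_ALWAYS(cudaMallocHost((void**)&data.data_pinned, bytes));
        data.pinned = false;
    #endif
        return data;
    }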