Fixed incorrect MPI distribute and gather. Now tested and working with 1, 2, and 4 GPUs.

This commit is contained in:
jpekkila
2020-01-16 19:12:32 +02:00
parent d7f56eeb67
commit 29b38d3b89


@@ -1096,11 +1096,322 @@ getPid(const int3 pid, const int3 decomposition)
mod(pid.z, decomposition.z) * decomposition.x * decomposition.y;
}
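// A minimal sketch of the rank -> 3D mapping that getPid3D (used below) is assumed to
// implement, matching the x-fastest, row-major ordering of getPid above. The name
// getPid3DSketch is hypothetical, for illustration only.
static int3
getPid3DSketch(const int pid, const int3 decomposition)
{
return (int3){
pid % decomposition.x,
(pid / decomposition.x) % decomposition.y,
pid / (decomposition.x * decomposition.y),
};
}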
static AcResult
acDeviceDistributeMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* dst)
{
MPI_Barrier(MPI_COMM_WORLD);
printf("Distributing mesh...\n");
fflush(stdout);
assert(dst);
MPI_Datatype datatype = MPI_FLOAT;
if (sizeof(AcReal) == 8)
datatype = MPI_DOUBLE;
int pid, nprocs;
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
// Submesh nn
const int3 nn = (int3){
dst->info.int_params[AC_nx],
dst->info.int_params[AC_ny],
dst->info.int_params[AC_nz],
};
// Submesh mm
const int3 mm = (int3){
dst->info.int_params[AC_mx],
dst->info.int_params[AC_my],
dst->info.int_params[AC_mz],
};
// Rank 0 copies its own subdomain directly (no MPI needed)
if (pid == 0) {
for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {
// For pencils
for (int k = NGHOST; k < NGHOST + nn.z; ++k) {
for (int j = NGHOST; j < NGHOST + nn.y; ++j) {
const int i = NGHOST;
const int count = nn.x;
const int src_idx = acVertexBufferIdx(i, j, k, src.info);
const int dst_idx = acVertexBufferIdx(i, j, k, dst->info);
memcpy(&dst->vertex_buffer[vtxbuf][dst_idx], //
&src.vertex_buffer[vtxbuf][src_idx], //
count * sizeof(src.vertex_buffer[vtxbuf][0]));
}
}
}
}
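// Distribute the remaining subdomains pencil by pencil: rank 0 sends each interior
// x-row of rank tgt_pid's block (offset by tgt_pid3d * nn in the full mesh), and every
// other rank receives it into the same local (i, j, k) position of its submesh. All
// messages use tag 0; the loops run in the same (vtxbuf, k, j) order on both sides, so
// MPI's per-pair message ordering keeps sends and receives matched.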
for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {
// For pencils
for (int k = NGHOST; k < NGHOST + nn.z; ++k) {
for (int j = NGHOST; j < NGHOST + nn.y; ++j) {
const int i = NGHOST;
const int count = nn.x;
if (pid != 0) {
const int dst_idx = acVertexBufferIdx(i, j, k, dst->info);
// Recv
MPI_Status status;
MPI_Recv(&dst->vertex_buffer[vtxbuf][dst_idx], count, datatype, 0, 0,
MPI_COMM_WORLD, &status);
}
else {
for (int tgt_pid = 1; tgt_pid < nprocs; ++tgt_pid) {
const int3 tgt_pid3d = getPid3D(tgt_pid, decomposition);
const int src_idx = acVertexBufferIdx(i + tgt_pid3d.x * nn.x, //
j + tgt_pid3d.y * nn.y, //
k + tgt_pid3d.z * nn.z, //
src.info);
// Send
MPI_Send(&src.vertex_buffer[vtxbuf][src_idx], count, datatype, tgt_pid, 0,
MPI_COMM_WORLD);
}
}
}
}
}
return AC_SUCCESS;
}
static AcResult
acDeviceGatherMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* dst)
{
MPI_Barrier(MPI_COMM_WORLD);
printf("Gathering mesh...\n");
fflush(stdout);
assert(dst);
MPI_Datatype datatype = MPI_FLOAT;
if (sizeof(AcReal) == 8)
datatype = MPI_DOUBLE;
int pid, nprocs;
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
// Submesh nn
const int3 nn = (int3){
src.info.int_params[AC_nx],
src.info.int_params[AC_ny],
src.info.int_params[AC_nz],
};
// Submesh mm
const int3 mm = (int3){
src.info.int_params[AC_mx],
src.info.int_params[AC_my],
src.info.int_params[AC_mz],
};
// Rank 0 copies its own subdomain directly (no MPI needed)
if (pid == 0) {
for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {
// For pencils
for (int k = NGHOST; k < NGHOST + nn.z; ++k) {
for (int j = NGHOST; j < NGHOST + nn.y; ++j) {
const int i = NGHOST;
const int count = nn.x;
const int src_idx = acVertexBufferIdx(i, j, k, src.info);
const int dst_idx = acVertexBufferIdx(i, j, k, dst->info);
memcpy(&dst->vertex_buffer[vtxbuf][dst_idx], //
&src.vertex_buffer[vtxbuf][src_idx], //
count * sizeof(src.vertex_buffer[vtxbuf][0]));
}
}
}
}
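// Gather is the mirror image: every non-root rank sends its interior x-rows to rank 0,
// which receives each one into the corresponding global position (offset by
// tgt_pid3d * nn) of the full mesh.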
for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {
// For pencils
for (int k = NGHOST; k < NGHOST + nn.z; ++k) {
for (int j = NGHOST; j < NGHOST + nn.y; ++j) {
const int i = NGHOST;
const int count = nn.x;
if (pid != 0) {
// Send
const int src_idx = acVertexBufferIdx(i, j, k, src.info);
MPI_Send(&src.vertex_buffer[vtxbuf][src_idx], count, datatype, 0, 0,
MPI_COMM_WORLD);
}
else {
for (int tgt_pid = 1; tgt_pid < nprocs; ++tgt_pid) {
const int3 tgt_pid3d = getPid3D(tgt_pid, decomposition);
const int dst_idx = acVertexBufferIdx(i + tgt_pid3d.x * nn.x, //
j + tgt_pid3d.y * nn.y, //
k + tgt_pid3d.z * nn.z, //
dst->info);
// Recv
MPI_Status status;
MPI_Recv(&dst->vertex_buffer[vtxbuf][dst_idx], count, datatype, tgt_pid, 0,
MPI_COMM_WORLD, &status);
}
}
}
}
}
return AC_SUCCESS;
}
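// Sketch of the intended call pattern (mirroring acDeviceRunMPITest further below,
// device setup and error handling omitted): rank 0 holds the full mesh, every rank
// holds one submesh.
//
//   acDeviceDistributeMeshMPI(model, decomposition, &submesh);   // full mesh -> submeshes
//   // ... load the submesh to the device, integrate, exchange halos ...
//   acDeviceGatherMeshMPI(submesh, decomposition, &candidate);   // submeshes -> full mesh
//   // rank 0 can then compare candidate against the model mesh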
/*
// Close to correct 3D distribute/gather, but deadlocks: most likely because every
// non-root rank posts a blocking MPI_Recv on every tgt_pid iteration instead of only
// its own, so most of those receives are never matched by a send from rank 0.
static AcResult
acDeviceDistributeMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* dst)
{
MPI_Barrier(MPI_COMM_WORLD);
printf("Distributing mesh...\n");
assert(dst);
MPI_Datatype datatype = MPI_FLOAT;
if (sizeof(AcReal) == 8)
datatype = MPI_DOUBLE;
int pid, nprocs;
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
// Submesh nn
const int3 nn = (int3){
dst->info.int_params[AC_nx],
dst->info.int_params[AC_ny],
dst->info.int_params[AC_nz],
};
// Submesh mm
const int3 mm = (int3){
dst->info.int_params[AC_mx],
dst->info.int_params[AC_my],
dst->info.int_params[AC_mz],
};
for (int tgt_pid = 0; tgt_pid < nprocs; ++tgt_pid) {
const int3 tgt_pid3d = getPid3D(tgt_pid, decomposition);
for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {
// For pencils
for (int k = NGHOST; k < NGHOST + nn.z; ++k) {
for (int j = NGHOST; j < NGHOST + nn.y; ++j) {
const int i = NGHOST;
const int count = nn.x;
const int src_idx = acVertexBufferIdx(i + tgt_pid3d.x * nn.x, //
j + tgt_pid3d.y * nn.y, //
k + tgt_pid3d.z * nn.z, //
src.info);
const int dst_idx = acVertexBufferIdx(i, j, k, dst->info);
if (pid == 0) {
// Send
if (pid == tgt_pid) {
// Send to self
memcpy(&dst->vertex_buffer[vtxbuf][dst_idx], //
&src.vertex_buffer[vtxbuf][src_idx], //
count * sizeof(src.vertex_buffer[i][0]));
}
else {
// Send to others
MPI_Send(&src.vertex_buffer[vtxbuf][src_idx], count, datatype, tgt_pid,
vtxbuf + (j + k * (NGHOST + nn.y)) * NUM_VTXBUF_HANDLES,
MPI_COMM_WORLD);
}
}
else {
// Recv
MPI_Status status;
MPI_Recv(&dst->vertex_buffer[vtxbuf][dst_idx], count, datatype, 0,
vtxbuf + (j + k * (NGHOST + nn.y)) * NUM_VTXBUF_HANDLES,
MPI_COMM_WORLD, &status);
}
}
}
}
}
return AC_SUCCESS;
}
static AcResult
acDeviceGatherMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* dst)
{
MPI_Barrier(MPI_COMM_WORLD);
printf("Distributing mesh...\n");
assert(dst);
MPI_Datatype datatype = MPI_FLOAT;
if (sizeof(AcReal) == 8)
datatype = MPI_DOUBLE;
int pid, nprocs;
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
// Submesh nn
const int3 nn = (int3){
dst->info.int_params[AC_nx],
dst->info.int_params[AC_ny],
dst->info.int_params[AC_nz],
};
// Submesh mm
const int3 mm = (int3){
dst->info.int_params[AC_mx],
dst->info.int_params[AC_my],
dst->info.int_params[AC_mz],
};
for (int tgt_pid = 0; tgt_pid < nprocs; ++tgt_pid) {
const int3 tgt_pid3d = getPid3D(tgt_pid, decomposition);
for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {
// For pencils
for (int k = NGHOST; k < NGHOST + nn.z; ++k) {
for (int j = NGHOST; j < NGHOST + nn.y; ++j) {
const int i = NGHOST;
const int count = nn.x;
const int src_idx = acVertexBufferIdx(i, j, k, src.info);
const int dst_idx = acVertexBufferIdx(i + tgt_pid3d.x * nn.x, //
j + tgt_pid3d.y * nn.y, //
k + tgt_pid3d.z * nn.z, //
dst->info);
if (pid == 0) {
// Send
if (pid == tgt_pid) {
// Send to self
memcpy(&dst->vertex_buffer[vtxbuf][dst_idx], //
&src.vertex_buffer[vtxbuf][src_idx], //
count * sizeof(src.vertex_buffer[i][0]));
}
else {
// Recv from others
MPI_Status status;
MPI_Recv(&dst->vertex_buffer[vtxbuf][dst_idx], count, datatype, tgt_pid,
vtxbuf, MPI_COMM_WORLD, &status);
}
}
else {
// Send to 0
MPI_Send(&src.vertex_buffer[vtxbuf][src_idx], count, datatype, 0, vtxbuf,
MPI_COMM_WORLD);
}
}
}
}
}
return AC_SUCCESS;
}
*/
/*
static void
acDeviceDistributeMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* dst)
{
MPI_Barrier(MPI_COMM_WORLD);
printf("Distributing mesh...\n");
assert(dst);
MPI_Datatype datatype = MPI_FLOAT;
if (sizeof(AcReal) == 8)
@@ -1110,33 +1421,31 @@ acDeviceDistributeMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* ds
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
MPI_Comm_size(MPI_COMM_WORLD, &num_processes);
// const size_t count = acVertexBufferSize(dst->info);
const int3 offset = (int3){
src.info.int_params[AC_nx] / decomposition.x,
src.info.int_params[AC_ny] / decomposition.y,
src.info.int_params[AC_nz] / decomposition.z,
};
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
if (pid == 0) {
// Communicate to self
assert(dst);
memcpy(&dst->vertex_buffer[i][0], //
&src.vertex_buffer[i][0], //
count * sizeof(src.vertex_buffer[i][0]));
// Communicate to others
for (int tgt = 1; tgt < num_processes; ++tgt) {
const int3 target_pid = getPid3D(tgt, decomposition);
const size_t src_idx = acVertexBufferIdx(target_pid.x * offset.x,
target_pid.y * offset.y,
target_pid.z * offset.z, src.info);
MPI_Send(&src.vertex_buffer[i][src_idx], count, datatype, tgt, 0, MPI_COMM_WORLD);
}
}
else {
assert(dst);
// Recv
const size_t dst_idx = 0;
@@ -1147,6 +1456,7 @@ acDeviceDistributeMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* ds
}
}
static void
acDeviceGatherMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* dst)
{
@@ -1199,6 +1509,7 @@ acDeviceGatherMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* dst)
}
}
}
*/
/*
static AcResult
@@ -2210,7 +2521,9 @@ acDeviceRunMPITest(void)
acMeshRandomize(&submesh);
#if BENCH_STRONG_SCALING
MPI_Barrier(MPI_COMM_WORLD);
acDeviceDistributeMeshMPI(model, decomposition, &submesh);
MPI_Barrier(MPI_COMM_WORLD);
#endif
// Init the GPU
@@ -2254,7 +2567,7 @@ acDeviceRunMPITest(void)
acDeviceCommunicateHalosMPI(device);
acDeviceSynchronizeStream(device, STREAM_ALL);
// acDeviceStoreMesh(device, STREAM_DEFAULT, &submesh); // TODO re-enable
acDeviceGatherMeshMPI(submesh, decomposition, &candidate);
if (pid == 0) {
// acModelIntegrateStep(model, FLT_EPSILON); // TODO