MPI distribute and gather were incorrect; fixed. Now tested to work with 1, 2, and 4 GPUs.
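For context, the round trip these functions support in acDeviceRunMPITest looks roughly like the sketch below. It is a minimal illustration only, not code from this commit: the wrapper runDistributeGatherRoundTrip and the comparison helper acVerifyMesh are hypothetical names, and the meshes and decomposition are assumed to be set up as in the test.

// Minimal sketch, not the actual test code. Assumes the AcMesh objects and the
// int3 decomposition are initialized as in acDeviceRunMPITest; acVerifyMesh is
// a hypothetical stand-in for whatever verification rank 0 performs.
static void
runDistributeGatherRoundTrip(const AcMesh model, const int3 decomposition,
                             AcMesh* submesh, AcMesh* candidate)
{
    int pid;
    MPI_Comm_rank(MPI_COMM_WORLD, &pid);

    // Rank 0 owns the full mesh; every rank receives its nx*ny*nz subdomain.
    acDeviceDistributeMeshMPI(model, decomposition, submesh);

    // ... per-device integration and halo exchange would happen here ...

    // Every rank sends its subdomain back; rank 0 reassembles the full mesh.
    acDeviceGatherMeshMPI(*submesh, decomposition, candidate);

    if (pid == 0)
        acVerifyMesh(model, *candidate); // hypothetical comparison on rank 0
}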
@@ -1096,11 +1096,322 @@ getPid(const int3 pid, const int3 decomposition)
            mod(pid.z, decomposition.z) * decomposition.x * decomposition.y;
 }
 
+static AcResult
+acDeviceDistributeMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* dst)
+{
+    MPI_Barrier(MPI_COMM_WORLD);
+    printf("Distributing mesh...\n");
+    fflush(stdout);
+    assert(dst);
+
+    MPI_Datatype datatype = MPI_FLOAT;
+    if (sizeof(AcReal) == 8)
+        datatype = MPI_DOUBLE;
+
+    int pid, nprocs;
+    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+
+    // Submesh nn
+    const int3 nn = (int3){
+        dst->info.int_params[AC_nx],
+        dst->info.int_params[AC_ny],
+        dst->info.int_params[AC_nz],
+    };
+
+    // Submesh mm
+    const int3 mm = (int3){
+        dst->info.int_params[AC_mx],
+        dst->info.int_params[AC_my],
+        dst->info.int_params[AC_mz],
+    };
+
+    // Send to self
+    if (pid == 0) {
+        for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {
+            // For pencils
+            for (int k = NGHOST; k < NGHOST + nn.z; ++k) {
+                for (int j = NGHOST; j < NGHOST + nn.y; ++j) {
+                    const int i = NGHOST;
+                    const int count = nn.x;
+                    const int src_idx = acVertexBufferIdx(i, j, k, src.info);
+                    const int dst_idx = acVertexBufferIdx(i, j, k, dst->info);
+                    memcpy(&dst->vertex_buffer[vtxbuf][dst_idx], //
+                           &src.vertex_buffer[vtxbuf][src_idx], //
+                           count * sizeof(src.vertex_buffer[i][0]));
+                }
+            }
+        }
+    }
+
+    for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {
+        // For pencils
+        for (int k = NGHOST; k < NGHOST + nn.z; ++k) {
+            for (int j = NGHOST; j < NGHOST + nn.y; ++j) {
+                const int i = NGHOST;
+                const int count = nn.x;
+
+                if (pid != 0) {
+                    const int dst_idx = acVertexBufferIdx(i, j, k, dst->info);
+                    // Recv
+                    MPI_Status status;
+                    MPI_Recv(&dst->vertex_buffer[vtxbuf][dst_idx], count, datatype, 0, 0,
+                             MPI_COMM_WORLD, &status);
+                }
+                else {
+                    for (int tgt_pid = 1; tgt_pid < nprocs; ++tgt_pid) {
+                        const int3 tgt_pid3d = getPid3D(tgt_pid, decomposition);
+                        const int src_idx = acVertexBufferIdx(i + tgt_pid3d.x * nn.x, //
+                                                              j + tgt_pid3d.y * nn.y, //
+                                                              k + tgt_pid3d.z * nn.z, //
+                                                              src.info);
+
+                        // Send
+                        MPI_Send(&src.vertex_buffer[vtxbuf][src_idx], count, datatype, tgt_pid, 0,
+                                 MPI_COMM_WORLD);
+                    }
+                }
+            }
+        }
+    }
+    return AC_SUCCESS;
+}
+
+static AcResult
+acDeviceGatherMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* dst)
+{
+    MPI_Barrier(MPI_COMM_WORLD);
+    printf("Gathering mesh...\n");
+    fflush(stdout);
+    assert(dst);
+
+    MPI_Datatype datatype = MPI_FLOAT;
+    if (sizeof(AcReal) == 8)
+        datatype = MPI_DOUBLE;
+
+    int pid, nprocs;
+    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+
+    // Submesh nn
+    const int3 nn = (int3){
+        src.info.int_params[AC_nx],
+        src.info.int_params[AC_ny],
+        src.info.int_params[AC_nz],
+    };
+
+    // Submesh mm
+    const int3 mm = (int3){
+        src.info.int_params[AC_mx],
+        src.info.int_params[AC_my],
+        src.info.int_params[AC_mz],
+    };
+
+    // Send to self
+    if (pid == 0) {
+        for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {
+            // For pencils
+            for (int k = NGHOST; k < NGHOST + nn.z; ++k) {
+                for (int j = NGHOST; j < NGHOST + nn.y; ++j) {
+                    const int i = NGHOST;
+                    const int count = nn.x;
+                    const int src_idx = acVertexBufferIdx(i, j, k, src.info);
+                    const int dst_idx = acVertexBufferIdx(i, j, k, dst->info);
+                    memcpy(&dst->vertex_buffer[vtxbuf][dst_idx], //
+                           &src.vertex_buffer[vtxbuf][src_idx], //
+                           count * sizeof(src.vertex_buffer[i][0]));
+                }
+            }
+        }
+    }
+
+    for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {
+        // For pencils
+        for (int k = NGHOST; k < NGHOST + nn.z; ++k) {
+            for (int j = NGHOST; j < NGHOST + nn.y; ++j) {
+                const int i = NGHOST;
+                const int count = nn.x;
+
+                if (pid != 0) {
+                    // Send
+                    const int src_idx = acVertexBufferIdx(i, j, k, src.info);
+                    MPI_Send(&src.vertex_buffer[vtxbuf][src_idx], count, datatype, 0, 0,
+                             MPI_COMM_WORLD);
+                }
+                else {
+                    for (int tgt_pid = 1; tgt_pid < nprocs; ++tgt_pid) {
+                        const int3 tgt_pid3d = getPid3D(tgt_pid, decomposition);
+                        const int dst_idx = acVertexBufferIdx(i + tgt_pid3d.x * nn.x, //
+                                                              j + tgt_pid3d.y * nn.y, //
+                                                              k + tgt_pid3d.z * nn.z, //
+                                                              dst->info);
+
+                        // Recv
+                        MPI_Status status;
+                        MPI_Recv(&dst->vertex_buffer[vtxbuf][dst_idx], count, datatype, tgt_pid, 0,
+                                 MPI_COMM_WORLD, &status);
+                    }
+                }
+            }
+        }
+    }
+    return AC_SUCCESS;
+}
+
+/*
+// Close to correct 3D distribute/gather but deadlocks
+static AcResult
+acDeviceDistributeMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* dst)
+{
+    MPI_Barrier(MPI_COMM_WORLD);
+    printf("Distributing mesh...\n");
+    assert(dst);
+
+    MPI_Datatype datatype = MPI_FLOAT;
+    if (sizeof(AcReal) == 8)
+        datatype = MPI_DOUBLE;
+
+    int pid, nprocs;
+    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+
+    // Submesh nn
+    const int3 nn = (int3){
+        dst->info.int_params[AC_nx],
+        dst->info.int_params[AC_ny],
+        dst->info.int_params[AC_nz],
+    };
+
+    // Submesh mm
+    const int3 mm = (int3){
+        dst->info.int_params[AC_mx],
+        dst->info.int_params[AC_my],
+        dst->info.int_params[AC_mz],
+    };
+
+    for (int tgt_pid = 0; tgt_pid < nprocs; ++tgt_pid) {
+        const int3 tgt_pid3d = getPid3D(tgt_pid, decomposition);
+        for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {
+
+            // For pencils
+            for (int k = NGHOST; k < NGHOST + nn.z; ++k) {
+                for (int j = NGHOST; j < NGHOST + nn.y; ++j) {
+                    const int i = NGHOST;
+                    const int count = nn.x;
+                    const int src_idx = acVertexBufferIdx(i + tgt_pid3d.x * nn.x, //
+                                                          j + tgt_pid3d.y * nn.y, //
+                                                          k + tgt_pid3d.z * nn.z, //
+                                                          src.info);
+                    const int dst_idx = acVertexBufferIdx(i, j, k, dst->info);
+
+                    if (pid == 0) {
+                        // Send
+                        if (pid == tgt_pid) {
+                            // Send to self
+                            memcpy(&dst->vertex_buffer[vtxbuf][dst_idx], //
+                                   &src.vertex_buffer[vtxbuf][src_idx], //
+                                   count * sizeof(src.vertex_buffer[i][0]));
+                        }
+                        else {
+                            // Send to others
+                            MPI_Send(&src.vertex_buffer[vtxbuf][src_idx], count, datatype, tgt_pid,
+                                     vtxbuf + (j + k * (NGHOST + nn.y)) * NUM_VTXBUF_HANDLES,
+                                     MPI_COMM_WORLD);
+                        }
+                    }
+                    else {
+                        // Recv
+                        MPI_Status status;
+                        MPI_Recv(&dst->vertex_buffer[vtxbuf][dst_idx], count, datatype, 0,
+                                 vtxbuf + (j + k * (NGHOST + nn.y)) * NUM_VTXBUF_HANDLES,
+                                 MPI_COMM_WORLD, &status);
+                    }
+                }
+            }
+        }
+    }
+    return AC_SUCCESS;
+}
+
+static AcResult
+acDeviceGatherMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* dst)
+{
+    MPI_Barrier(MPI_COMM_WORLD);
+    printf("Distributing mesh...\n");
+    assert(dst);
+
+    MPI_Datatype datatype = MPI_FLOAT;
+    if (sizeof(AcReal) == 8)
+        datatype = MPI_DOUBLE;
+
+    int pid, nprocs;
+    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+
+    // Submesh nn
+    const int3 nn = (int3){
+        dst->info.int_params[AC_nx],
+        dst->info.int_params[AC_ny],
+        dst->info.int_params[AC_nz],
+    };
+
+    // Submesh mm
+    const int3 mm = (int3){
+        dst->info.int_params[AC_mx],
+        dst->info.int_params[AC_my],
+        dst->info.int_params[AC_mz],
+    };
+
+    for (int tgt_pid = 0; tgt_pid < nprocs; ++tgt_pid) {
+        const int3 tgt_pid3d = getPid3D(tgt_pid, decomposition);
+        for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {
+
+            // For pencils
+            for (int k = NGHOST; k < NGHOST + nn.z; ++k) {
+                for (int j = NGHOST; j < NGHOST + nn.y; ++j) {
+                    const int i = NGHOST;
+                    const int count = nn.x;
+                    const int src_idx = acVertexBufferIdx(i, j, k, src.info);
+                    const int dst_idx = acVertexBufferIdx(i + tgt_pid3d.x * nn.x, //
+                                                          j + tgt_pid3d.y * nn.y, //
+                                                          k + tgt_pid3d.z * nn.z, //
+                                                          dst->info);
+
+                    if (pid == 0) {
+                        // Send
+                        if (pid == tgt_pid) {
+                            // Send to self
+                            memcpy(&dst->vertex_buffer[vtxbuf][dst_idx], //
+                                   &src.vertex_buffer[vtxbuf][src_idx], //
+                                   count * sizeof(src.vertex_buffer[i][0]));
+                        }
+                        else {
+                            // Recv from others
+                            MPI_Status status;
+                            MPI_Recv(&dst->vertex_buffer[vtxbuf][dst_idx], count, datatype, tgt_pid,
+                                     vtxbuf, MPI_COMM_WORLD, &status);
+                        }
+                    }
+                    else {
+                        // Send to 0
+                        MPI_Send(&src.vertex_buffer[vtxbuf][src_idx], count, datatype, 0, vtxbuf,
+                                 MPI_COMM_WORLD);
+                    }
+                }
+            }
+        }
+    }
+    return AC_SUCCESS;
+}
+*/
+
+/*
 static void
 acDeviceDistributeMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* dst)
 {
     MPI_Barrier(MPI_COMM_WORLD);
     printf("Distributing mesh...\n");
+    assert(dst);
 
     MPI_Datatype datatype = MPI_FLOAT;
     if (sizeof(AcReal) == 8)
@@ -1110,33 +1421,31 @@ acDeviceDistributeMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* ds
     MPI_Comm_rank(MPI_COMM_WORLD, &pid);
     MPI_Comm_size(MPI_COMM_WORLD, &num_processes);
 
-    const size_t count = acVertexBufferSize(dst->info);
+    // const size_t count = acVertexBufferSize(dst->info);
+    const int3 offset = (int3){
+        src.info.int_params[AC_nx] / decomposition.x,
+        src.info.int_params[AC_ny] / decomposition.y,
+        src.info.int_params[AC_nz] / decomposition.z,
+    };
     for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
 
         if (pid == 0) {
             // Communicate to self
-            assert(dst);
             memcpy(&dst->vertex_buffer[i][0], //
                    &src.vertex_buffer[i][0], //
                    count * sizeof(src.vertex_buffer[i][0]));
 
             // Communicate to others
-            for (int j = 1; j < num_processes; ++j) {
-                const int3 target_pid = getPid3D(j, decomposition);
-                const int3 offset = (int3){
-                    src.info.int_params[AC_nx] / decomposition.x,
-                    src.info.int_params[AC_ny] / decomposition.y,
-                    src.info.int_params[AC_nz] / decomposition.z,
-                };
-                const size_t src_idx = acVertexBufferIdx(target_pid.x * offset.x,
+            for (int tgt = 1; tgt < num_processes; ++tgt) {
+                const int3 target_pid = getPid3D(tgt, decomposition);
+                const size_t src_idx = acVertexBufferIdx(target_pid.x * offset.x,
                                                          target_pid.y * offset.y,
                                                          target_pid.z * offset.z, src.info);
 
-                MPI_Send(&src.vertex_buffer[i][src_idx], count, datatype, j, 0, MPI_COMM_WORLD);
+                MPI_Send(&src.vertex_buffer[i][src_idx], count, datatype, tgt, 0, MPI_COMM_WORLD);
             }
         }
         else {
-            assert(dst);
 
             // Recv
             const size_t dst_idx = 0;
@@ -1147,6 +1456,7 @@ acDeviceDistributeMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* ds
     }
 }
 
+
 static void
 acDeviceGatherMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* dst)
 {
@@ -1199,6 +1509,7 @@ acDeviceGatherMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* dst)
         }
     }
 }
+*/
 
 /*
 static AcResult
@@ -2210,7 +2521,9 @@ acDeviceRunMPITest(void)
     acMeshRandomize(&submesh);
 
 #if BENCH_STRONG_SCALING
+    MPI_Barrier(MPI_COMM_WORLD);
     acDeviceDistributeMeshMPI(model, decomposition, &submesh);
+    MPI_Barrier(MPI_COMM_WORLD);
 #endif
 
     // Init the GPU
@@ -2254,7 +2567,7 @@ acDeviceRunMPITest(void)
     acDeviceCommunicateHalosMPI(device);
     acDeviceSynchronizeStream(device, STREAM_ALL);
 
-    acDeviceStoreMesh(device, STREAM_DEFAULT, &submesh);
+    // acDeviceStoreMesh(device, STREAM_DEFAULT, &submesh); // TODO re-enable
     acDeviceGatherMeshMPI(submesh, decomposition, &candidate);
     if (pid == 0) {
         // acModelIntegrateStep(model, FLT_EPSILON); // TODO
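A side note on the indexing the new distribute/gather loops rely on: each rank maps to 3D block coordinates, and every pencil (x-row) copied in the loops above is offset by that block's origin in the global mesh. The self-contained sketch below mirrors that mapping with stand-in names and example sizes; int3_sketch, pid3d, and the 2x2x1 decomposition are illustrative assumptions, not the library's actual getPid3D/int3 definitions.

#include <stdio.h>

typedef struct { int x, y, z; } int3_sketch; /* stand-in for the library's int3 */

/* Illustrative (not the library's) rank -> 3D mapping, mirroring the row-major
 * ordering suggested by getPid/getPid3D in the diff above. */
static int3_sketch
pid3d(const int pid, const int3_sketch decomp)
{
    return (int3_sketch){ pid % decomp.x,
                          (pid / decomp.x) % decomp.y,
                          pid / (decomp.x * decomp.y) };
}

int
main(void)
{
    const int3_sketch decomp = { 2, 2, 1 };    /* e.g. 4 GPUs as a 2x2x1 grid */
    const int3_sketch nn     = { 32, 32, 32 }; /* example per-rank subdomain size */

    /* Each pencil copied by the distribute loop starts at the global offset
     * (i + p.x*nn.x, j + p.y*nn.y, k + p.z*nn.z) of the target rank p. */
    for (int pid = 0; pid < decomp.x * decomp.y * decomp.z; ++pid) {
        const int3_sketch p = pid3d(pid, decomp);
        printf("rank %d -> block origin (%d, %d, %d)\n",
               pid, p.x * nn.x, p.y * nn.y, p.z * nn.z);
    }
    return 0;
}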