MPI now compiles and runs on Puhti; the basic verification test with boundary transfers passes. It still prints an "UCX WARN object 0x2fa7780 was not returned to mpool ucp_requests" warning, though, which seems to indicate that not all asynchronous MPI calls were completed before MPI_Finalize.
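The warning is consistent with what the diff below does: each MPI_Isend gets a throwaway local MPI_Request that is never waited on, and the final MPI_Waitall covers only the first NUM_VTXBUF_HANDLES of the 2 * NUM_VTXBUF_HANDLES receive requests. For reference, a minimal self-contained sketch (not part of this commit; buffer sizes and names are made up) of the same kind of periodic 1D halo exchange, arranged so that every nonblocking request is completed before MPI_Finalize:

/* Sketch: periodic ring halo exchange with NGHOST-cell ghost zones.
 * All four requests (two receives, two sends) are completed with a single
 * MPI_Waitall, so nothing is left outstanding at MPI_Finalize. */
#include <mpi.h>
#include <stdlib.h>

#define NGHOST 3
#define NX 32 /* interior cells per rank; arbitrary for the sketch */

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);

    int pid, nprocs;
    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    /* Layout: NGHOST front ghosts | NX interior cells | NGHOST back ghosts.
     * The payload content does not matter for the communication pattern. */
    double *buf = calloc(NX + 2 * NGHOST, sizeof(double));

    const int prev = (pid + nprocs - 1) % nprocs;
    const int next = (pid + 1) % nprocs;

    MPI_Request reqs[4];
    /* Receive into the ghost zones */
    MPI_Irecv(&buf[0], NGHOST, MPI_DOUBLE, prev, 0, MPI_COMM_WORLD, &reqs[0]);
    MPI_Irecv(&buf[NX + NGHOST], NGHOST, MPI_DOUBLE, next, 1, MPI_COMM_WORLD, &reqs[1]);
    /* Send the outermost interior cells to the matching neighbors */
    MPI_Isend(&buf[NX], NGHOST, MPI_DOUBLE, next, 0, MPI_COMM_WORLD, &reqs[2]);
    MPI_Isend(&buf[NGHOST], NGHOST, MPI_DOUBLE, prev, 1, MPI_COMM_WORLD, &reqs[3]);

    /* Complete receives AND sends before shutting MPI down */
    MPI_Waitall(4, reqs, MPI_STATUSES_IGNORE);

    free(buf);
    MPI_Finalize();
    return 0;
}

Waiting on the send requests explicitly (or handing them to MPI_Request_free once that is safe) should make the UCX warning disappear.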

jpekkila
2019-12-05 16:15:37 +02:00
parent 9d70a29ae0
commit e47cfad6b5


@@ -748,7 +748,6 @@ acDeviceBoundStepMPI(const Device device)
     const int mz = device->local_config.int_params[AC_mz];
     const size_t count = mx * my * NGHOST;
-    for (int isubstep = 0; isubstep < 3; ++isubstep) {
     acDeviceSynchronizeStream(device, STREAM_ALL);
     // Local boundconds
     for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
@@ -771,8 +770,8 @@ acDeviceBoundStepMPI(const Device device)
         const int3 start = (int3){0, 0, 2 * NGHOST};
         const int3 end = (int3){mx, my, mz - 2 * NGHOST};
-        acDevicePeriodicBoundcondStep(device, INNER_BOUNDCOND_STREAM, (VertexBufferHandle)i,
-                                      start, end);
+        acDevicePeriodicBoundcondStep(device, INNER_BOUNDCOND_STREAM, (VertexBufferHandle)i, start,
+                                      end);
     }
     // MPI
@@ -810,8 +809,7 @@ acDeviceBoundStepMPI(const Device device)
         {
            // Send front
            // ...|ooooxxx|... -> xxx|ooooooo|...
-           const size_t src_idx = acVertexBufferIdx(0, 0, mz - 2 * NGHOST,
-                                                    device->local_config);
+           const size_t src_idx = acVertexBufferIdx(0, 0, mz - 2 * NGHOST, device->local_config);
            const int send_pid = (pid + 1) % num_processes;
            MPI_Request request;
@@ -828,18 +826,103 @@ acDeviceBoundStepMPI(const Device device)
                      i + NUM_VTXBUF_HANDLES, MPI_COMM_WORLD, &request);
        }
    }
-   for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
-       MPI_Status status;
-       MPI_Wait(&recv_requests[i], &status);
-       MPI_Wait(&recv_requests[i + NUM_VTXBUF_HANDLES], &status);
-   }
-   MPI_Barrier(MPI_COMM_WORLD);
-   acDeviceSwapBuffers(device);
-   MPI_Barrier(MPI_COMM_WORLD);
+   MPI_Waitall(NUM_VTXBUF_HANDLES, recv_requests, MPI_STATUSES_IGNORE);
+   return AC_SUCCESS;
+}
+
+/*
+// 1D decomp
+static AcResult
+acDeviceBoundStepMPI(const Device device)
+{
+   const int mx = device->local_config.int_params[AC_mx];
+   const int my = device->local_config.int_params[AC_my];
+   const int mz = device->local_config.int_params[AC_mz];
+   const size_t count = mx * my * NGHOST;
+   acDeviceSynchronizeStream(device, STREAM_ALL);
+   // Local boundconds
+   for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
+       // Front plate local
+       {
+           const int3 start = (int3){0, 0, NGHOST};
+           const int3 end = (int3){mx, my, 2 * NGHOST};
+           acDevicePeriodicBoundcondStep(device, (Stream)i, (VertexBufferHandle)i, start, end);
+       }
+       // Back plate local
+       {
+           const int3 start = (int3){0, 0, mz - 2 * NGHOST};
+           const int3 end = (int3){mx, my, mz - NGHOST};
+           acDevicePeriodicBoundcondStep(device, (Stream)i, (VertexBufferHandle)i, start, end);
+       }
+   }
+#define INNER_BOUNDCOND_STREAM ((Stream)(NUM_STREAMS - 1))
+   // Inner boundconds (while waiting)
+   for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
+       const int3 start = (int3){0, 0, 2 * NGHOST};
+       const int3 end = (int3){mx, my, mz - 2 * NGHOST};
+       acDevicePeriodicBoundcondStep(device, INNER_BOUNDCOND_STREAM, (VertexBufferHandle)i, start,
+                                     end);
+   }
+   // MPI
+   MPI_Request recv_requests[2 * NUM_VTXBUF_HANDLES];
+   MPI_Datatype datatype = MPI_FLOAT;
+   if (sizeof(AcReal) == 8)
+       datatype = MPI_DOUBLE;
+   int pid, num_processes;
+   MPI_Comm_rank(MPI_COMM_WORLD, &pid);
+   MPI_Comm_size(MPI_COMM_WORLD, &num_processes);
+   for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
+       { // Recv neighbor's front
+           // ...|ooooxxx|... -> xxx|ooooooo|...
+           const size_t dst_idx = acVertexBufferIdx(0, 0, 0, device->local_config);
+           const int recv_pid = (pid + num_processes - 1) % num_processes;
+           MPI_Irecv(&device->vba.in[i][dst_idx], count, datatype, recv_pid, i, MPI_COMM_WORLD,
+                     &recv_requests[i]);
+       }
+       { // Recv neighbor's back
+           // ...|ooooooo|xxx <- ...|xxxoooo|...
+           const size_t dst_idx = acVertexBufferIdx(0, 0, mz - NGHOST, device->local_config);
+           const int recv_pid = (pid + 1) % num_processes;
+           MPI_Irecv(&device->vba.in[i][dst_idx], count, datatype, recv_pid,
+                     NUM_VTXBUF_HANDLES + i, MPI_COMM_WORLD,
+                     &recv_requests[i + NUM_VTXBUF_HANDLES]);
+       }
+   }
+   for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
+       acDeviceSynchronizeStream(device, (Stream)i);
+       {
+           // Send front
+           // ...|ooooxxx|... -> xxx|ooooooo|...
+           const size_t src_idx = acVertexBufferIdx(0, 0, mz - 2 * NGHOST, device->local_config);
+           const int send_pid = (pid + 1) % num_processes;
+           MPI_Request request;
+           MPI_Isend(&device->vba.in[i][src_idx], count, datatype, send_pid, i, MPI_COMM_WORLD,
+                     &request);
+       }
+       { // Send back
+           // ...|ooooooo|xxx <- ...|xxxoooo|...
+           const size_t src_idx = acVertexBufferIdx(0, 0, NGHOST, device->local_config);
+           const int send_pid = (pid + num_processes - 1) % num_processes;
+           MPI_Request request;
+           MPI_Isend(&device->vba.in[i][src_idx], count, datatype, send_pid,
+                     i + NUM_VTXBUF_HANDLES, MPI_COMM_WORLD, &request);
+       }
+   }
+   MPI_Waitall(NUM_VTXBUF_HANDLES, recv_requests, MPI_STATUSES_IGNORE);
    return AC_SUCCESS;
 }
+*/
 
 // 1D decomp
 static AcResult
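Applied to the function above, one possible follow-up (a sketch only, not part of this commit: send_requests, front_idx and back_idx are hypothetical names, everything else is taken from the diff) would be to keep the send requests in an array and complete them, together with both halves of recv_requests, before returning:

    /* Sketch: track the MPI_Isend requests instead of discarding them. */
    MPI_Request send_requests[2 * NUM_VTXBUF_HANDLES];

    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
        acDeviceSynchronizeStream(device, (Stream)i);

        // Send front (tag i), as in the diff
        const size_t front_idx = acVertexBufferIdx(0, 0, mz - 2 * NGHOST, device->local_config);
        MPI_Isend(&device->vba.in[i][front_idx], count, datatype, (pid + 1) % num_processes, i,
                  MPI_COMM_WORLD, &send_requests[i]);

        // Send back (tag i + NUM_VTXBUF_HANDLES), as in the diff
        const size_t back_idx = acVertexBufferIdx(0, 0, NGHOST, device->local_config);
        MPI_Isend(&device->vba.in[i][back_idx], count, datatype,
                  (pid + num_processes - 1) % num_processes, i + NUM_VTXBUF_HANDLES,
                  MPI_COMM_WORLD, &send_requests[i + NUM_VTXBUF_HANDLES]);
    }

    // Complete all receives (both halves of recv_requests) and all sends
    MPI_Waitall(2 * NUM_VTXBUF_HANDLES, recv_requests, MPI_STATUSES_IGNORE);
    MPI_Waitall(2 * NUM_VTXBUF_HANDLES, send_requests, MPI_STATUSES_IGNORE);

This keeps the loop structure of the diff and only replaces the per-iteration throwaway MPI_Request with an array that outlives the loop.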