Trying to overlap MPI communication with the computation of boundary conditions. However, NVIDIA seems to have left one important detail out of the CUDA-aware MPI documentation: CUDA streams do not appear to be supported with CUDA-aware MPI communication. So in the end the fastest solution might be to fall back to old-school gpu->cpu->cpu->gpu MPI communication after all.
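For reference, a minimal sketch of what such a staged gpu->cpu->cpu->gpu exchange could look like with plain CUDA and non-CUDA-aware MPI. This is only an illustration under assumptions: the helper name halo_exchange_staged, the buffer/offset parameters and the 1D z-decomposition are hypothetical, not the actual acDeviceStoreMesh/acHostCommunicateHalosMPI path used in the test below.

    // Illustrative host-staged halo exchange (hypothetical helper, not part of Astaroth):
    // stage the outgoing halo on the host, exchange it with ordinary MPI,
    // then copy the incoming halo back to the device.
    #include <cuda_runtime.h>
    #include <mpi.h>
    #include <stddef.h>

    typedef double AcReal; // stand-in for the AcReal typedef in astaroth.h (float or double)

    static void
    halo_exchange_staged(AcReal* d_field,                // device buffer (mx * my * mz elements)
                         AcReal* h_send, AcReal* h_recv, // pinned host staging buffers
                         const size_t halo_count,        // mx * my * NGHOST
                         const size_t send_offset, const size_t recv_offset,
                         const int send_pid, const int recv_pid, const int tag)
    {
        const MPI_Datatype datatype = sizeof(AcReal) == 8 ? MPI_DOUBLE : MPI_FLOAT;

        // GPU -> CPU: copy the outgoing halo slab to the host
        cudaMemcpy(h_send, d_field + send_offset, halo_count * sizeof(AcReal),
                   cudaMemcpyDeviceToHost);

        // CPU -> CPU: exchange host buffers with ordinary (non-CUDA-aware) MPI
        MPI_Request reqs[2];
        MPI_Irecv(h_recv, (int)halo_count, datatype, recv_pid, tag, MPI_COMM_WORLD, &reqs[0]);
        MPI_Isend(h_send, (int)halo_count, datatype, send_pid, tag, MPI_COMM_WORLD, &reqs[1]);
        MPI_Waitall(2, reqs, MPI_STATUSES_IGNORE);

        // CPU -> GPU: copy the received halo slab back to the device
        cudaMemcpy(d_field + recv_offset, h_recv, halo_count * sizeof(AcReal),
                   cudaMemcpyHostToDevice);
    }

With pinned staging buffers (cudaMallocHost) and cudaMemcpyAsync on a dedicated stream, the copies could still be overlapped with the per-stream boundary kernels; the #else branch of the benchmark below takes the coarser route of storing the whole submesh to the host and exchanging it with acHostCommunicateHalosMPI.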

Johannes Pekkila
2019-10-21 15:50:53 +02:00
parent f120343110
commit 915e1c7c14


@@ -192,8 +192,8 @@ acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device_hand
     }
     // Reductions
-    ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->reduce_scratchpad,
-                                  acVertexBufferCompdomainSizeBytes(device_config)));
+    ERRCHK_CUDA_ALWAYS(
+        cudaMalloc(&device->reduce_scratchpad, acVertexBufferCompdomainSizeBytes(device_config)));
     ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->reduce_result, sizeof(AcReal)));
 #if PACKED_DATA_TRANSFERS
@@ -860,9 +860,14 @@ acDeviceGatherMeshMPI(const AcMesh src, AcMesh* dst)
 /** NOTE: Assumes 1 process per GPU */
 static AcResult
-acDeviceCommunicateHalosMPI(const Device device)
+acDeviceCommunicateHalosMPI(const Device device, const int3 subgrid_m)
 {
-    //MPI_Barrier(MPI_COMM_WORLD);
+    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
+        const int3 start = (int3){0, 0, NGHOST};
+        const int3 end = (int3){subgrid_m.x, subgrid_m.y, subgrid_m.z - NGHOST};
+        acDevicePeriodicBoundcondStep(device, (Stream)i, (VertexBufferHandle)i, start, end);
+    }
     MPI_Datatype datatype = MPI_FLOAT;
     if (sizeof(AcReal) == 8)
         datatype = MPI_DOUBLE;
@@ -873,47 +878,46 @@ acDeviceCommunicateHalosMPI(const Device device)
     const size_t count = device->local_config.int_params[AC_mx] *
                          device->local_config.int_params[AC_my] * NGHOST;
-    MPI_Request recv_requests[2*NUM_VTXBUF_HANDLES];
+    MPI_Request recv_requests[2 * NUM_VTXBUF_HANDLES];
     for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
         { // Front
             // ...|ooooxxx|... -> xxx|ooooooo|...
             const size_t src_idx = acVertexBufferIdx(0, 0, device->local_config.int_params[AC_nz],
                                                      device->local_config);
             const size_t dst_idx = acVertexBufferIdx(0, 0, 0, device->local_config);
-            //const int send_pid = (pid + 1) % num_processes;
+            // const int send_pid = (pid + 1) % num_processes;
             const int recv_pid = (pid + num_processes - 1) % num_processes;
             MPI_Irecv(&device->vba.in[i][dst_idx], count, datatype, recv_pid, i, MPI_COMM_WORLD,
                       &recv_requests[i]);
         }
         { // Back
             // ...|ooooooo|xxx <- ...|xxxoooo|...
             const size_t src_idx = acVertexBufferIdx(0, 0, NGHOST, device->local_config);
             const size_t dst_idx = acVertexBufferIdx(
                 0, 0, NGHOST + device->local_config.int_params[AC_nz], device->local_config);
-            //const int send_pid = (pid + num_processes - 1) % num_processes;
+            // const int send_pid = (pid + num_processes - 1) % num_processes;
             const int recv_pid = (pid + 1) % num_processes;
-            MPI_Irecv(&device->vba.in[i][dst_idx], count, datatype, recv_pid, NUM_VTXBUF_HANDLES + i,
-                      MPI_COMM_WORLD, &recv_requests[NUM_VTXBUF_HANDLES + i]);
+            MPI_Irecv(&device->vba.in[i][dst_idx], count, datatype, recv_pid,
+                      NUM_VTXBUF_HANDLES + i, MPI_COMM_WORLD,
+                      &recv_requests[NUM_VTXBUF_HANDLES + i]);
         }
     }
-    acDeviceSynchronizeStream(device, STREAM_DEFAULT);
+    // Ensure that local bounds are done before sending
     for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
+        acDeviceSynchronizeStream(device, (Stream)i);
         { // Front
             // ...|ooooxxx|... -> xxx|ooooooo|...
             const size_t src_idx = acVertexBufferIdx(0, 0, device->local_config.int_params[AC_nz],
                                                      device->local_config);
             const size_t dst_idx = acVertexBufferIdx(0, 0, 0, device->local_config);
             const int send_pid = (pid + 1) % num_processes;
-            //const int recv_pid = (pid + num_processes - 1) % num_processes;
+            // const int recv_pid = (pid + num_processes - 1) % num_processes;
             MPI_Request request;
             MPI_Isend(&device->vba.in[i][src_idx], count, datatype, send_pid, i, MPI_COMM_WORLD,
                       &request);
         }
         { // Back
             // ...|ooooooo|xxx <- ...|xxxoooo|...
@@ -921,16 +925,16 @@ acDeviceCommunicateHalosMPI(const Device device)
             const size_t dst_idx = acVertexBufferIdx(
                 0, 0, NGHOST + device->local_config.int_params[AC_nz], device->local_config);
             const int send_pid = (pid + num_processes - 1) % num_processes;
-            //const int recv_pid = (pid + 1) % num_processes;
+            // const int recv_pid = (pid + 1) % num_processes;
             MPI_Request request;
             MPI_Isend(&device->vba.in[i][src_idx], count, datatype, send_pid,
                       NUM_VTXBUF_HANDLES + i, MPI_COMM_WORLD, &request);
         }
     }
-    MPI_Status recv_statuses[2*NUM_VTXBUF_HANDLES];
-    MPI_Waitall(2*NUM_VTXBUF_HANDLES, recv_requests, recv_statuses);
+    MPI_Status recv_statuses[2 * NUM_VTXBUF_HANDLES];
+    MPI_Waitall(2 * NUM_VTXBUF_HANDLES, recv_requests, recv_statuses);
     return AC_SUCCESS;
 }
 #if 0 // Ok basic working
@@ -977,7 +981,7 @@ acDeviceCommunicateHalosMPI(const Device device)
             MPI_Irecv(&device->vba.in[i][dst_idx], count, datatype, recv_pid, NUM_VTXBUF_HANDLES + i,
                       MPI_COMM_WORLD, &recv_requests[NUM_VTXBUF_HANDLES + i]);
             MPI_Request request;
             MPI_Isend(&device->vba.in[i][src_idx], count, datatype, send_pid,
                       NUM_VTXBUF_HANDLES + i, MPI_COMM_WORLD, &request);
@@ -985,7 +989,7 @@ acDeviceCommunicateHalosMPI(const Device device)
         }
     }
     MPI_Status recv_statuses[2*NUM_VTXBUF_HANDLES];
     MPI_Waitall(2*NUM_VTXBUF_HANDLES, recv_requests, recv_statuses);
     return AC_SUCCESS;
 }
 #endif
@@ -1173,9 +1177,9 @@ acDeviceRunMPITest(void)
 {
     int num_processes, pid;
     MPI_Init(NULL, NULL);
-    //int provided;
-    //MPI_Init_thread(NULL, NULL, MPI_THREAD_MULTIPLE, &provided); // Hybrid MP + MPI
-    //ERRCHK_ALWAYS(provided == MPI_THREAD_MULTIPLE);
+    // int provided;
+    // MPI_Init_thread(NULL, NULL, MPI_THREAD_MULTIPLE, &provided); // Hybrid MP + MPI
+    // ERRCHK_ALWAYS(provided == MPI_THREAD_MULTIPLE);
     MPI_Comm_size(MPI_COMM_WORLD, &num_processes);
     MPI_Comm_rank(MPI_COMM_WORLD, &pid);
@@ -1207,18 +1211,20 @@ acDeviceRunMPITest(void)
 #endif /* MPIX_CUDA_AWARE_SUPPORT */
     //////// Borrowing end
-    int direct = getenv("MPICH_RDMA_ENABLED_CUDA")==NULL?0:atoi(getenv ("MPICH_RDMA_ENABLED_CUDA"));
-    if(direct != 1){
-        printf ("MPICH_RDMA_ENABLED_CUDA not enabled!\n");
-        exit (EXIT_FAILURE);
-    }
+    int direct = getenv("MPICH_RDMA_ENABLED_CUDA") == NULL
+                     ? 0
+                     : atoi(getenv("MPICH_RDMA_ENABLED_CUDA"));
+    if (direct != 1) {
+        printf("MPICH_RDMA_ENABLED_CUDA not enabled!\n");
+        exit(EXIT_FAILURE);
+    }
     // Create model and candidate meshes
     AcMeshInfo info;
     acLoadConfig(AC_DEFAULT_CONFIG, &info);
     const int nn = 256;
     info.int_params[AC_nx] = info.int_params[AC_ny] = nn;
-    info.int_params[AC_nz] = 4*512;
+    info.int_params[AC_nz] = 4 * 512;
     acUpdateConfig(&info);
     AcMesh model, candidate;
@@ -1264,7 +1270,6 @@ acDeviceRunMPITest(void)
     acDeviceCreate(0, submesh_info, &device);
     acDeviceLoadMesh(device, STREAM_DEFAULT, submesh);
     // Warmup
     acDeviceSynchronizeStream(device, STREAM_ALL);
     for (int i = 0; i < 10; ++i) {
@@ -1275,8 +1280,8 @@ acDeviceRunMPITest(void)
            acDevicePeriodicBoundconds(device, STREAM_DEFAULT, start, end);
        }
        acDeviceSynchronizeStream(device, STREAM_DEFAULT);
        // Includes periodic bounds at first and last ghost zone
-       acDeviceCommunicateHalosMPI(device);
+       acDeviceCommunicateHalosMPI(device, subgrid_m);
    }
    ////////////////////////////// Timer start
@@ -1285,17 +1290,11 @@ acDeviceRunMPITest(void)
     timer_reset(&total_time);
     for (int i = 0; i < num_iters; ++i) {
         ///// Communication start
-        {
-            const int3 start = (int3){0, 0, NGHOST};
-            const int3 end = (int3){subgrid_m.x, subgrid_m.y, subgrid_m.z - NGHOST};
-            acDevicePeriodicBoundconds(device, STREAM_DEFAULT, start, end);
-        }
 #if 1 // GPU-GPU if CUDA-aware MPI, otherwise managed CPU-GPU-GPU-CPU
-        //acDeviceSynchronizeStream(device, STREAM_DEFAULT);
-        //MPI_Barrier(MPI_COMM_WORLD);
-        acDeviceCommunicateHalosMPI(
-            device); // Includes periodic bounds at first and last ghost zone
-        MPI_Barrier(MPI_COMM_WORLD);
+        // acDeviceSynchronizeStream(device, STREAM_DEFAULT);
+        // MPI_Barrier(MPI_COMM_WORLD);
+        acDeviceCommunicateHalosMPI(device, subgrid_m);
+        // Includes periodic bounds at first and last ghost zone
 #else // Explicit GPU-CPU-CPU-GPU
         acDeviceStoreMesh(device, STREAM_DEFAULT, &submesh);
         acHostCommunicateHalosMPI(&submesh);
@@ -1307,7 +1306,7 @@ acDeviceRunMPITest(void)
     const double ms_elapsed = timer_diff_nsec(total_time) / 1e6;
     printf("vertices: %d^3, iterations: %d\n", nn, num_iters);
     printf("Total time: %f ms\n", ms_elapsed);
     printf("Time per step: %f ms\n", ms_elapsed / num_iters);
 }
 ////////////////////////////// Timer end