Multi-GPU optimizations: removed some unnecessary synchronization and divided the calculation of boundary conditions into local and global steps.

jpekkila
2019-07-05 18:21:44 +03:00
parent f1066a2c11
commit 5fdfdeca9e
3 changed files with 52 additions and 2 deletions
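For orientation: after this commit each integration substep refreshes the ghost zones in two phases instead of a single acBoundcondStep() call. The sequence below is reconstructed from the acIntegrate hunk further down; the trailing comments are editorial interpretation, and the role of acSynchronizeHalos() is inferred from its name rather than shown in this diff.

acIntegrateStep(isubstep, dt);   // boundaries must be initialized beforehand
acSwapBuffers();
acLocalBoundcondStep();          // per-device boundary kernels, no inter-device traffic
acSynchronizeStream(STREAM_ALL); // local boundaries must be complete before any exchange
acGlobalBoundcondStep();         // periodic front/back plate exchange between device 0 and num_devices - 1
acSynchronizeHalos();            // assumed: exchange of the remaining inter-device halo layers
acSynchronizeStream(STREAM_ALL);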

View File

@@ -457,6 +457,49 @@ acBoundcondStep(void)
return AC_SUCCESS;
}

AcResult
acLocalBoundcondStep(void)
{
if (num_devices == 1) {
boundcondStep(devices[0], STREAM_PRIMARY, (int3){0, 0, 0}, subgrid.m);
}
else {
// Local boundary conditions
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
const int3 d0 = (int3){0, 0, NGHOST}; // DECOMPOSITION OFFSET HERE
const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.n.z};
boundcondStep(devices[i], STREAM_PRIMARY, d0, d1);
}
}
return AC_SUCCESS;
}

AcResult
acGlobalBoundcondStep(void)
{
if (num_devices > 1) {
// With periodic boundary conditions we exchange the front and back plates of the
// grid. The exchange is done between the first and last device (0 and num_devices - 1).
const int num_vertices = subgrid.m.x * subgrid.m.y * NGHOST;
// ...|ooooxxx|... -> xxx|ooooooo|...
{
const int3 src = (int3){0, 0, subgrid.n.z};
const int3 dst = (int3){0, 0, 0};
copyMeshDeviceToDevice(devices[num_devices - 1], STREAM_PRIMARY, src, devices[0], dst,
num_vertices);
}
// ...|ooooooo|xxx <- ...|xxxoooo|...
{
const int3 src = (int3){0, 0, NGHOST};
const int3 dst = (int3){0, 0, NGHOST + subgrid.n.z};
copyMeshDeviceToDevice(devices[0], STREAM_PRIMARY, src, devices[num_devices - 1], dst,
num_vertices);
}
}
return AC_SUCCESS;
}

AcResult
acIntegrateStepWithOffset(const int& isubstep, const AcReal& dt, const int3& start, const int3& end)
{
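copyMeshDeviceToDevice itself is not part of this diff. As a rough illustration of what a plate copy between two GPUs can look like with the CUDA runtime, the sketch below uses cudaMemcpyPeerAsync; the DeviceSketch type, its field names, NUM_FIELDS and the x-fastest vertex-buffer layout are illustrative assumptions, not the Astaroth implementation.

#include <cuda_runtime.h>
#include <stddef.h>

typedef double AcReal;      // assumption: double-precision build
#define NUM_FIELDS 8        // placeholder for the actual number of vertex buffers

// Hypothetical per-device handle, for this sketch only.
typedef struct {
    int     id;                    // CUDA device id
    AcReal* vtxbuf[NUM_FIELDS];    // one linear buffer per field, m.x * m.y * m.z vertices
} DeviceSketch;

// Map an (x, y, z) vertex index to a linear offset, x running fastest.
static size_t
vertex_index(const int3 idx, const int3 m)
{
    return (size_t)idx.x + (size_t)idx.y * m.x + (size_t)idx.z * m.x * m.y;
}

// Copy num_vertices contiguous vertices of every field from src to dst.
// cudaMemcpyPeerAsync works with or without peer access enabled; without it the
// runtime stages the transfer through host memory.
static void
copy_plate(const DeviceSketch* src, const int3 src_idx, const DeviceSketch* dst,
           const int3 dst_idx, const int3 m, const int num_vertices, const cudaStream_t stream)
{
    const size_t src_off = vertex_index(src_idx, m);
    const size_t dst_off = vertex_index(dst_idx, m);
    for (int w = 0; w < NUM_FIELDS; ++w)
        cudaMemcpyPeerAsync(dst->vtxbuf[w] + dst_off, dst->id, src->vtxbuf[w] + src_off, src->id,
                            num_vertices * sizeof(AcReal), stream);
}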
@@ -495,7 +538,11 @@ acIntegrate(const AcReal& dt)
for (int isubstep = 0; isubstep < 3; ++isubstep) {
acIntegrateStep(isubstep, dt); // Note: boundaries must be initialized.
acSwapBuffers();
-        acBoundcondStep();
+        acLocalBoundcondStep();
+        acSynchronizeStream(STREAM_ALL);
+        acGlobalBoundcondStep();
+        acSynchronizeHalos();
+        acSynchronizeStream(STREAM_ALL);
}
return AC_SUCCESS;
}

View File

@@ -150,7 +150,7 @@ createDevice(const int id, const AcMeshInfo device_config, Device* device_handle
// Concurrency
for (int i = 0; i < NUM_STREAM_TYPES; ++i) {
-        cudaStreamCreate(&device->streams[i]);
+        cudaStreamCreateWithFlags(&device->streams[i], cudaStreamNonBlocking);
}
// Memory
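Replacing cudaStreamCreate with cudaStreamCreateWithFlags(..., cudaStreamNonBlocking) removes the implicit synchronization between these streams and the legacy default stream, so work issued on them can overlap with default-stream activity. A standalone illustration of the two creation calls (not Astaroth code):

#include <cuda_runtime.h>

int main(void)
{
    cudaStream_t blocking, non_blocking;

    // A regular ("blocking") stream is implicitly ordered against the legacy
    // default stream, which can serialize otherwise independent work.
    cudaStreamCreate(&blocking);

    // A non-blocking stream has no implicit ordering with the legacy default
    // stream; kernels and copies issued here can overlap with it.
    cudaStreamCreateWithFlags(&non_blocking, cudaStreamNonBlocking);

    cudaStreamDestroy(blocking);
    cudaStreamDestroy(non_blocking);
    return 0;
}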

View File

@@ -921,6 +921,7 @@ reduce_scal(const cudaStream_t stream, const ReductionType rtype, const int3& st
ERROR("Unrecognized rtype");
}
// clang-format on
cudaStreamSynchronize(stream);
AcReal result;
cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
return result;
@@ -971,6 +972,8 @@ reduce_vec(const cudaStream_t stream, const ReductionType rtype, const int3& sta
ERROR("Unrecognized rtype");
}
// clang-format on
cudaStreamSynchronize(stream);
AcReal result;
cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
return result;
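The explicit cudaStreamSynchronize(stream) is presumably needed because the blocking cudaMemcpy runs on the legacy default stream, and now that the device streams are created as non-blocking (see the createDevice change above) the copy is no longer implicitly ordered after the reduction kernel on the given stream. An equivalent pattern, sketched here with illustrative names rather than the Astaroth ones, keeps the transfer on the same stream and synchronizes once:

#include <cuda_runtime.h>

// Copy a single reduction result back to the host on the stream that produced it.
static double
fetch_reduce_result(const double* d_reduce_result, const cudaStream_t stream)
{
    double result;
    cudaMemcpyAsync(&result, d_reduce_result, sizeof(result), cudaMemcpyDeviceToHost, stream);
    cudaStreamSynchronize(stream); // the host value is valid only after this returns
    return result;
}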