Multi-GPU optimizations: removed some unnecessary synchronization and divided the calculation of boundary conditions to local and global steps.
This commit is contained in:
@@ -457,6 +457,49 @@ acBoundcondStep(void)
|
||||
return AC_SUCCESS;
|
||||
}
|
||||
|
||||
AcResult
|
||||
acLocalBoundcondStep(void)
|
||||
{
|
||||
if (num_devices == 1) {
|
||||
boundcondStep(devices[0], STREAM_PRIMARY, (int3){0, 0, 0}, subgrid.m);
|
||||
}
|
||||
else {
|
||||
// Local boundary conditions
|
||||
// #pragma omp parallel for
|
||||
for (int i = 0; i < num_devices; ++i) {
|
||||
const int3 d0 = (int3){0, 0, NGHOST}; // DECOMPOSITION OFFSET HERE
|
||||
const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.n.z};
|
||||
boundcondStep(devices[i], STREAM_PRIMARY, d0, d1);
|
||||
}
|
||||
}
|
||||
return AC_SUCCESS;
|
||||
}
|
||||
|
||||
AcResult
|
||||
acGlobalBoundcondStep(void)
|
||||
{
|
||||
if (num_devices > 1) {
|
||||
// With periodic boundary conditions we exchange the front and back plates of the
|
||||
// grid. The exchange is done between the first and last device (0 and num_devices - 1).
|
||||
const int num_vertices = subgrid.m.x * subgrid.m.y * NGHOST;
|
||||
// ...|ooooxxx|... -> xxx|ooooooo|...
|
||||
{
|
||||
const int3 src = (int3){0, 0, subgrid.n.z};
|
||||
const int3 dst = (int3){0, 0, 0};
|
||||
copyMeshDeviceToDevice(devices[num_devices - 1], STREAM_PRIMARY, src, devices[0], dst,
|
||||
num_vertices);
|
||||
}
|
||||
// ...|ooooooo|xxx <- ...|xxxoooo|...
|
||||
{
|
||||
const int3 src = (int3){0, 0, NGHOST};
|
||||
const int3 dst = (int3){0, 0, NGHOST + subgrid.n.z};
|
||||
copyMeshDeviceToDevice(devices[0], STREAM_PRIMARY, src, devices[num_devices - 1], dst,
|
||||
num_vertices);
|
||||
}
|
||||
}
|
||||
return AC_SUCCESS;
|
||||
}
|
||||
|
||||
AcResult
|
||||
acIntegrateStepWithOffset(const int& isubstep, const AcReal& dt, const int3& start, const int3& end)
|
||||
{
|
||||
@@ -495,7 +538,11 @@ acIntegrate(const AcReal& dt)
|
||||
for (int isubstep = 0; isubstep < 3; ++isubstep) {
|
||||
acIntegrateStep(isubstep, dt); // Note: boundaries must be initialized.
|
||||
acSwapBuffers();
|
||||
acBoundcondStep();
|
||||
acLocalBoundcondStep();
|
||||
acSynchronizeStream(STREAM_ALL);
|
||||
acGlobalBoundcondStep();
|
||||
acSynchronizeHalos();
|
||||
acSynchronizeStream(STREAM_ALL);
|
||||
}
|
||||
return AC_SUCCESS;
|
||||
}
|
||||
|
@@ -150,7 +150,7 @@ createDevice(const int id, const AcMeshInfo device_config, Device* device_handle
|
||||
|
||||
// Concurrency
|
||||
for (int i = 0; i < NUM_STREAM_TYPES; ++i) {
|
||||
cudaStreamCreate(&device->streams[i]);
|
||||
cudaStreamCreateWithFlags(&device->streams[i], cudaStreamNonBlocking);
|
||||
}
|
||||
|
||||
// Memory
|
||||
|
@@ -921,6 +921,7 @@ reduce_scal(const cudaStream_t stream, const ReductionType rtype, const int3& st
|
||||
ERROR("Unrecognized rtype");
|
||||
}
|
||||
// clang-format on
|
||||
cudaStreamSynchronize(stream);
|
||||
AcReal result;
|
||||
cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
|
||||
return result;
|
||||
@@ -971,6 +972,8 @@ reduce_vec(const cudaStream_t stream, const ReductionType rtype, const int3& sta
|
||||
ERROR("Unrecognized rtype");
|
||||
}
|
||||
// clang-format on
|
||||
|
||||
cudaStreamSynchronize(stream);
|
||||
AcReal result;
|
||||
cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
|
||||
return result;
|
||||
|
Reference in New Issue
Block a user