Multi-GPU optimizations: removed some unnecessary synchronization and divided the calculation of boundary conditions to local and global steps.
This commit is contained in:
@@ -457,6 +457,49 @@ acBoundcondStep(void)
|
|||||||
return AC_SUCCESS;
|
return AC_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
AcResult
|
||||||
|
acLocalBoundcondStep(void)
|
||||||
|
{
|
||||||
|
if (num_devices == 1) {
|
||||||
|
boundcondStep(devices[0], STREAM_PRIMARY, (int3){0, 0, 0}, subgrid.m);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
// Local boundary conditions
|
||||||
|
// #pragma omp parallel for
|
||||||
|
for (int i = 0; i < num_devices; ++i) {
|
||||||
|
const int3 d0 = (int3){0, 0, NGHOST}; // DECOMPOSITION OFFSET HERE
|
||||||
|
const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.n.z};
|
||||||
|
boundcondStep(devices[i], STREAM_PRIMARY, d0, d1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return AC_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
AcResult
|
||||||
|
acGlobalBoundcondStep(void)
|
||||||
|
{
|
||||||
|
if (num_devices > 1) {
|
||||||
|
// With periodic boundary conditions we exchange the front and back plates of the
|
||||||
|
// grid. The exchange is done between the first and last device (0 and num_devices - 1).
|
||||||
|
const int num_vertices = subgrid.m.x * subgrid.m.y * NGHOST;
|
||||||
|
// ...|ooooxxx|... -> xxx|ooooooo|...
|
||||||
|
{
|
||||||
|
const int3 src = (int3){0, 0, subgrid.n.z};
|
||||||
|
const int3 dst = (int3){0, 0, 0};
|
||||||
|
copyMeshDeviceToDevice(devices[num_devices - 1], STREAM_PRIMARY, src, devices[0], dst,
|
||||||
|
num_vertices);
|
||||||
|
}
|
||||||
|
// ...|ooooooo|xxx <- ...|xxxoooo|...
|
||||||
|
{
|
||||||
|
const int3 src = (int3){0, 0, NGHOST};
|
||||||
|
const int3 dst = (int3){0, 0, NGHOST + subgrid.n.z};
|
||||||
|
copyMeshDeviceToDevice(devices[0], STREAM_PRIMARY, src, devices[num_devices - 1], dst,
|
||||||
|
num_vertices);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return AC_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
AcResult
|
AcResult
|
||||||
acIntegrateStepWithOffset(const int& isubstep, const AcReal& dt, const int3& start, const int3& end)
|
acIntegrateStepWithOffset(const int& isubstep, const AcReal& dt, const int3& start, const int3& end)
|
||||||
{
|
{
|
||||||
@@ -495,7 +538,11 @@ acIntegrate(const AcReal& dt)
|
|||||||
for (int isubstep = 0; isubstep < 3; ++isubstep) {
|
for (int isubstep = 0; isubstep < 3; ++isubstep) {
|
||||||
acIntegrateStep(isubstep, dt); // Note: boundaries must be initialized.
|
acIntegrateStep(isubstep, dt); // Note: boundaries must be initialized.
|
||||||
acSwapBuffers();
|
acSwapBuffers();
|
||||||
acBoundcondStep();
|
acLocalBoundcondStep();
|
||||||
|
acSynchronizeStream(STREAM_ALL);
|
||||||
|
acGlobalBoundcondStep();
|
||||||
|
acSynchronizeHalos();
|
||||||
|
acSynchronizeStream(STREAM_ALL);
|
||||||
}
|
}
|
||||||
return AC_SUCCESS;
|
return AC_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -150,7 +150,7 @@ createDevice(const int id, const AcMeshInfo device_config, Device* device_handle
|
|||||||
|
|
||||||
// Concurrency
|
// Concurrency
|
||||||
for (int i = 0; i < NUM_STREAM_TYPES; ++i) {
|
for (int i = 0; i < NUM_STREAM_TYPES; ++i) {
|
||||||
cudaStreamCreate(&device->streams[i]);
|
cudaStreamCreateWithFlags(&device->streams[i], cudaStreamNonBlocking);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Memory
|
// Memory
|
||||||
|
|||||||
@@ -921,6 +921,7 @@ reduce_scal(const cudaStream_t stream, const ReductionType rtype, const int3& st
|
|||||||
ERROR("Unrecognized rtype");
|
ERROR("Unrecognized rtype");
|
||||||
}
|
}
|
||||||
// clang-format on
|
// clang-format on
|
||||||
|
cudaStreamSynchronize(stream);
|
||||||
AcReal result;
|
AcReal result;
|
||||||
cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
|
cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
|
||||||
return result;
|
return result;
|
||||||
@@ -971,6 +972,8 @@ reduce_vec(const cudaStream_t stream, const ReductionType rtype, const int3& sta
|
|||||||
ERROR("Unrecognized rtype");
|
ERROR("Unrecognized rtype");
|
||||||
}
|
}
|
||||||
// clang-format on
|
// clang-format on
|
||||||
|
|
||||||
|
cudaStreamSynchronize(stream);
|
||||||
AcReal result;
|
AcReal result;
|
||||||
cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
|
cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
|
||||||
return result;
|
return result;
|
||||||
|
|||||||
Reference in New Issue
Block a user