diff --git a/src/core/astaroth.cu b/src/core/astaroth.cu index 4d96a86..326419e 100644 --- a/src/core/astaroth.cu +++ b/src/core/astaroth.cu @@ -457,6 +457,49 @@ acBoundcondStep(void) return AC_SUCCESS; } +AcResult +acLocalBoundcondStep(void) +{ + if (num_devices == 1) { + boundcondStep(devices[0], STREAM_PRIMARY, (int3){0, 0, 0}, subgrid.m); + } + else { + // Local boundary conditions + // #pragma omp parallel for + for (int i = 0; i < num_devices; ++i) { + const int3 d0 = (int3){0, 0, NGHOST}; // DECOMPOSITION OFFSET HERE + const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.n.z}; + boundcondStep(devices[i], STREAM_PRIMARY, d0, d1); + } + } + return AC_SUCCESS; +} + +AcResult +acGlobalBoundcondStep(void) +{ + if (num_devices > 1) { + // With periodic boundary conditions we exchange the front and back plates of the + // grid. The exchange is done between the first and last device (0 and num_devices - 1). + const int num_vertices = subgrid.m.x * subgrid.m.y * NGHOST; + // ...|ooooxxx|... -> xxx|ooooooo|... + { + const int3 src = (int3){0, 0, subgrid.n.z}; + const int3 dst = (int3){0, 0, 0}; + copyMeshDeviceToDevice(devices[num_devices - 1], STREAM_PRIMARY, src, devices[0], dst, + num_vertices); + } + // ...|ooooooo|xxx <- ...|xxxoooo|... + { + const int3 src = (int3){0, 0, NGHOST}; + const int3 dst = (int3){0, 0, NGHOST + subgrid.n.z}; + copyMeshDeviceToDevice(devices[0], STREAM_PRIMARY, src, devices[num_devices - 1], dst, + num_vertices); + } + } + return AC_SUCCESS; +} + AcResult acIntegrateStepWithOffset(const int& isubstep, const AcReal& dt, const int3& start, const int3& end) { @@ -495,7 +538,11 @@ acIntegrate(const AcReal& dt) for (int isubstep = 0; isubstep < 3; ++isubstep) { acIntegrateStep(isubstep, dt); // Note: boundaries must be initialized. acSwapBuffers(); - acBoundcondStep(); + acLocalBoundcondStep(); + acSynchronizeStream(STREAM_ALL); + acGlobalBoundcondStep(); + acSynchronizeHalos(); + acSynchronizeStream(STREAM_ALL); } return AC_SUCCESS; } diff --git a/src/core/device.cu b/src/core/device.cu index ef72e21..a9834d8 100644 --- a/src/core/device.cu +++ b/src/core/device.cu @@ -150,7 +150,7 @@ createDevice(const int id, const AcMeshInfo device_config, Device* device_handle // Concurrency for (int i = 0; i < NUM_STREAM_TYPES; ++i) { - cudaStreamCreate(&device->streams[i]); + cudaStreamCreateWithFlags(&device->streams[i], cudaStreamNonBlocking); } // Memory diff --git a/src/core/kernels/kernels.cuh b/src/core/kernels/kernels.cuh index ca82c9e..19f4e47 100644 --- a/src/core/kernels/kernels.cuh +++ b/src/core/kernels/kernels.cuh @@ -921,6 +921,7 @@ reduce_scal(const cudaStream_t stream, const ReductionType rtype, const int3& st ERROR("Unrecognized rtype"); } // clang-format on + cudaStreamSynchronize(stream); AcReal result; cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost); return result; @@ -971,6 +972,8 @@ reduce_vec(const cudaStream_t stream, const ReductionType rtype, const int3& sta ERROR("Unrecognized rtype"); } // clang-format on + + cudaStreamSynchronize(stream); AcReal result; cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost); return result;