Multi-GPU optimizations: removed some unnecessary synchronization and split the calculation of boundary conditions into local and global steps.

2019-07-05 18:21:44 +03:00
parent f1066a2c11
commit 5fdfdeca9e
3 changed files with 52 additions and 2 deletions
--- a/src/core/astaroth.cu
+++ b/src/core/astaroth.cu
@@ -457,6 +457,49 @@ acBoundcondStep(void)
    return AC_SUCCESS;
 }

+AcResult
+acLocalBoundcondStep(void)
+{
+    if (num_devices == 1) {
+        boundcondStep(devices[0], STREAM_PRIMARY, (int3){0, 0, 0}, subgrid.m);
+    }
+    else {
+        // Local boundary conditions
+        // #pragma omp parallel for
+        for (int i = 0; i < num_devices; ++i) {
+            const int3 d0 = (int3){0, 0, NGHOST}; // DECOMPOSITION OFFSET HERE
+            const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.n.z};
+            boundcondStep(devices[i], STREAM_PRIMARY, d0, d1);
+        }
+    }
+    return AC_SUCCESS;
+}
+
+AcResult
+acGlobalBoundcondStep(void)
+{
+    if (num_devices > 1) {
+        // With periodic boundary conditions we exchange the front and back plates of the
+        // grid. The exchange is done between the first and last device (0 and num_devices - 1).
+        const int num_vertices = subgrid.m.x * subgrid.m.y * NGHOST;
+        // ...|ooooxxx|... -> xxx|ooooooo|...
+        {
+            const int3 src = (int3){0, 0, subgrid.n.z};
+            const int3 dst = (int3){0, 0, 0};
+            copyMeshDeviceToDevice(devices[num_devices - 1], STREAM_PRIMARY, src, devices[0], dst,
+                                   num_vertices);
+        }
+        // ...|ooooooo|xxx <- ...|xxxoooo|...
+        {
+            const int3 src = (int3){0, 0, NGHOST};
+            const int3 dst = (int3){0, 0, NGHOST + subgrid.n.z};
+            copyMeshDeviceToDevice(devices[0], STREAM_PRIMARY, src, devices[num_devices - 1], dst,
+                                   num_vertices);
+        }
+    }
+    return AC_SUCCESS;
+}
+
 AcResult
 acIntegrateStepWithOffset(const int& isubstep, const AcReal& dt, const int3& start, const int3& end)
 {
@@ -495,7 +538,11 @@ acIntegrate(const AcReal& dt)
    for (int isubstep = 0; isubstep < 3; ++isubstep) {
        acIntegrateStep(isubstep, dt); // Note: boundaries must be initialized.
        acSwapBuffers();
-        acBoundcondStep();
+        acLocalBoundcondStep();
+        acSynchronizeStream(STREAM_ALL);
+        acGlobalBoundcondStep();
+        acSynchronizeHalos();
+        acSynchronizeStream(STREAM_ALL);
    }
    return AC_SUCCESS;
 }
--- a/src/core/device.cu
+++ b/src/core/device.cu
@@ -150,7 +150,7 @@ createDevice(const int id, const AcMeshInfo device_config, Device* device_handle

    // Concurrency
    for (int i = 0; i < NUM_STREAM_TYPES; ++i) {
-        cudaStreamCreate(&device->streams[i]);
+        cudaStreamCreateWithFlags(&device->streams[i], cudaStreamNonBlocking);
    }

    // Memory
--- a/src/core/kernels/kernels.cuh
+++ b/src/core/kernels/kernels.cuh
@@ -921,6 +921,7 @@ reduce_scal(const cudaStream_t stream, const ReductionType rtype, const int3& st
        ERROR("Unrecognized rtype");
    }
    // clang-format on
+    cudaStreamSynchronize(stream);
    AcReal result;
    cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
    return result;
@@ -971,6 +972,8 @@ reduce_vec(const cudaStream_t stream, const ReductionType rtype, const int3& sta
        ERROR("Unrecognized rtype");
    }
    // clang-format on
+
+    cudaStreamSynchronize(stream);
    AcReal result;
    cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
    return result;