Multi-GPU optimizations: removed some unnecessary synchronization and divided the calculation of boundary conditions to local and global steps.

2019-07-05 18:21:44 +03:00
parent f1066a2c11
commit 5fdfdeca9e
3 changed files with 52 additions and 2 deletions
--- a/src/core/astaroth.cu
+++ b/src/core/astaroth.cu
@@ -457,6 +457,49 @@ acBoundcondStep(void)
    return AC_SUCCESS;
 }
 AcResult
 acLocalBoundcondStep(void)
 {
    if (num_devices == 1) {
        boundcondStep(devices[0], STREAM_PRIMARY, (int3){0, 0, 0}, subgrid.m);
    }
    else {
        // Local boundary conditions
        // #pragma omp parallel for
        for (int i = 0; i < num_devices; ++i) {
            const int3 d0 = (int3){0, 0, NGHOST}; // DECOMPOSITION OFFSET HERE
            const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.n.z};
            boundcondStep(devices[i], STREAM_PRIMARY, d0, d1);
        }
    }
    return AC_SUCCESS;
 }
 AcResult
 acGlobalBoundcondStep(void)
 {
    if (num_devices > 1) {
        // With periodic boundary conditions we exchange the front and back plates of the
        // grid. The exchange is done between the first and last device (0 and num_devices - 1).
        const int num_vertices = subgrid.m.x * subgrid.m.y * NGHOST;
        // ...|ooooxxx|... -> xxx|ooooooo|...
        {
            const int3 src = (int3){0, 0, subgrid.n.z};
            const int3 dst = (int3){0, 0, 0};
            copyMeshDeviceToDevice(devices[num_devices - 1], STREAM_PRIMARY, src, devices[0], dst,
                                   num_vertices);
        }
        // ...|ooooooo|xxx <- ...|xxxoooo|...
        {
            const int3 src = (int3){0, 0, NGHOST};
            const int3 dst = (int3){0, 0, NGHOST + subgrid.n.z};
            copyMeshDeviceToDevice(devices[0], STREAM_PRIMARY, src, devices[num_devices - 1], dst,
                                   num_vertices);
        }
    }
    return AC_SUCCESS;
 }
 AcResult
 acIntegrateStepWithOffset(const int& isubstep, const AcReal& dt, const int3& start, const int3& end)
 {
@@ -495,7 +538,11 @@ acIntegrate(const AcReal& dt)
    for (int isubstep = 0; isubstep < 3; ++isubstep) {
        acIntegrateStep(isubstep, dt); // Note: boundaries must be initialized.
        acSwapBuffers();
-        acBoundcondStep();
+        acLocalBoundcondStep();
        acSynchronizeStream(STREAM_ALL);
        acGlobalBoundcondStep();
        acSynchronizeHalos();
        acSynchronizeStream(STREAM_ALL);
    }
    return AC_SUCCESS;
 }
--- a/src/core/device.cu
+++ b/src/core/device.cu
@@ -150,7 +150,7 @@ createDevice(const int id, const AcMeshInfo device_config, Device* device_handle
    // Concurrency
    for (int i = 0; i < NUM_STREAM_TYPES; ++i) {
-        cudaStreamCreate(&device->streams[i]);
+        cudaStreamCreateWithFlags(&device->streams[i], cudaStreamNonBlocking);
    }
    // Memory
--- a/src/core/kernels/kernels.cuh
+++ b/src/core/kernels/kernels.cuh
@@ -921,6 +921,7 @@ reduce_scal(const cudaStream_t stream, const ReductionType rtype, const int3& st
        ERROR("Unrecognized rtype");
    }
    // clang-format on
    cudaStreamSynchronize(stream);
    AcReal result;
    cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
    return result;
@@ -971,6 +972,8 @@ reduce_vec(const cudaStream_t stream, const ReductionType rtype, const int3& sta
        ERROR("Unrecognized rtype");
    }
    // clang-format on
    cudaStreamSynchronize(stream);
    AcReal result;
    cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
    return result;