Reviewed the Astaroth interface. There is now a clear distinction between synchronous and asynchronous functions. For basic usage we provide a set of functions that are always safe to call (acIntegrate, acLoad, etc.), but which are therefore quite restricted: for example, the whole mesh must be loaded at once and computations cannot be executed concurrently on multiple GPUs. For more advanced users we provide asynchronous functions (such as acLoadWithOffset). Since we cannot know how the asynchronous functions are called (for example, when an integration step has fully completed and the halos of neighboring subgrids can safely be communicated between GPUs), the responsibility for synchronization is left to the user. The existing implementations currently use only the basic "safe" set of functions (except renderer.cc), so existing functionality is unchanged by these latest commits. Autotests also pass.

jpekkila
2019-07-09 18:42:00 +03:00
parent 1251f61570
commit 0bda016e17
7 changed files with 298 additions and 300 deletions
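The split is easiest to see in a short host-side sketch. Apart from the setup of info and mesh, only functions whose signatures appear in this diff are used; how the host mesh is allocated is not part of this commit and is assumed here. The two load calls are shown side by side purely to contrast the two paths, not as a sensible program.

// Sketch only: info is an AcMeshInfo and mesh a host-allocated AcMesh,
// both prepared by the application (allocation is not part of this commit).
acInit(info);

// Asynchronous path: the caller picks the region to transfer and decides
// when the transfer must have completed.
acLoadWithOffset(mesh, (int3){0, 0, 0}, AC_VTXBUF_SIZE(mesh.info));
acSynchronizeStream(STREAM_ALL); // explicit synchronization point

// Synchronous ("safe") path: acLoad and acIntegrate synchronize internally,
// so no stream handling is needed.
acLoad(mesh);
const AcReal dt = AcReal(1e-2);
acIntegrate(dt);
acStore(&mesh);

acQuit();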

View File

@@ -183,6 +183,61 @@ acCheckDeviceAvailability(void)
return AC_FAILURE;
}
AcResult
acSynchronizeStream(const StreamType stream)
{
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
synchronize(devices[i], stream);
}
return AC_SUCCESS;
}
static AcResult
synchronize_halos(void)
{
// Exchanges the halos of subgrids
// After this step, the data within the main grid ranging from
// (0, 0, NGHOST) -> (grid.m.x, grid.m.y, NGHOST + grid.n.z)
// has been synchronized and transferred to appropriate subgrids
// We loop only to num_devices - 1 since the front and back plates of the grid are not
// transferred; their contents depend on the boundary conditions.
// IMPORTANT NOTE: the boundary conditions must be applied before calling this function!
// I.e. the halos of subgrids must contain up-to-date data!
// #pragma omp parallel for
for (int i = 0; i < num_devices - 1; ++i) {
const int num_vertices = subgrid.m.x * subgrid.m.y * NGHOST;
// ...|ooooxxx|... -> xxx|ooooooo|...
{
const int3 src = (int3){0, 0, subgrid.n.z};
const int3 dst = (int3){0, 0, 0};
copyMeshDeviceToDevice(devices[i], STREAM_PRIMARY, src, devices[(i + 1) % num_devices],
dst, num_vertices);
}
// ...|ooooooo|xxx <- ...|xxxoooo|...
{
const int3 src = (int3){0, 0, NGHOST};
const int3 dst = (int3){0, 0, NGHOST + subgrid.n.z};
copyMeshDeviceToDevice(devices[(i + 1) % num_devices], STREAM_PRIMARY, src, devices[i],
dst, num_vertices);
}
}
return AC_SUCCESS;
}
AcResult
acSynchronizeMesh(void)
{
acSynchronizeStream(STREAM_ALL);
synchronize_halos();
acSynchronizeStream(STREAM_ALL);
return AC_SUCCESS;
}
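To make the index arithmetic above concrete, here is a hedged walk-through for a hypothetical two-device run with NGHOST = 3 and subgrid.n.z = 32 (so subgrid.m.z = 38); the numbers are illustrative and assume the plane-sized copies run over contiguous xy-plates with z slowest.
// Illustration only (assumed: 2 devices, NGHOST = 3, subgrid.n.z = 32):
// num_vertices = subgrid.m.x * subgrid.m.y * NGHOST  -> one xy-plate, 3 planes thick
//
// Forward copy, device i -> device i+1:
//   src = (0, 0, subgrid.n.z) = (0, 0, 32)  -> local z = 32..34, last computed planes of device i
//   dst = (0, 0, 0)                         -> local z =  0..2,  front halo of device i+1
//
// Backward copy, device i+1 -> device i:
//   src = (0, 0, NGHOST) = (0, 0, 3)                -> local z =  3..5,  first computed planes of device i+1
//   dst = (0, 0, NGHOST + subgrid.n.z) = (0, 0, 35) -> local z = 35..37, back halo of device i
//
// The outermost plates of the whole grid are not touched here; they depend on
// the boundary conditions and are handled by global_boundcondstep().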
AcResult
acInit(const AcMeshInfo& config)
{
@@ -235,18 +290,187 @@ acInit(const AcMeshInfo& config)
loadGlobalGrid(devices[i], grid);
printDeviceInfo(devices[i]);
}
acSynchronizeStream(STREAM_ALL);
return AC_SUCCESS;
}
AcResult
acQuit(void)
{
acSynchronizeStream(STREAM_ALL);
for (int i = 0; i < num_devices; ++i) {
destroyDevice(devices[i]);
}
return AC_SUCCESS;
}
AcResult
acIntegrateStepWithOffset(const int& isubstep, const AcReal& dt, const int3& start, const int3& end)
{
// See the beginning of the file for an explanation of the index mapping
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
// DECOMPOSITION OFFSET HERE
const int3 d0 = (int3){NGHOST, NGHOST, NGHOST + i * subgrid.n.z};
const int3 d1 = d0 + (int3){subgrid.n.x, subgrid.n.y, subgrid.n.z};
const int3 da = max(start, d0);
const int3 db = min(end, d1);
if (db.z >= da.z) {
const int3 da_local = da - (int3){0, 0, i * subgrid.n.z};
const int3 db_local = db - (int3){0, 0, i * subgrid.n.z};
rkStep(devices[i], STREAM_PRIMARY, isubstep, da_local, db_local, dt);
}
}
return AC_SUCCESS;
}
AcResult
acIntegrateStep(const int& isubstep, const AcReal& dt)
{
const int3 start = (int3){NGHOST, NGHOST, NGHOST};
const int3 end = start + grid.n;
acIntegrateStepWithOffset(isubstep, dt, start, end);
return AC_SUCCESS;
}
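As a hedged illustration of the clamping in acIntegrateStepWithOffset, assume 2 devices, NGHOST = 3 and subgrid.n = (128, 128, 32), so grid.n = (128, 128, 64); acIntegrateStep then requests start = (3, 3, 3) and end = (131, 131, 67).
// Illustration only (assumed: 2 devices, NGHOST = 3, subgrid.n = (128, 128, 32)):
// Device 0: d0 = (3, 3, 3),  d1 = (131, 131, 35)
//           da = max(start, d0) = (3, 3, 3),  db = min(end, d1) = (131, 131, 35)
//           local range after subtracting (0, 0, 0):  (3, 3, 3) .. (131, 131, 35)
// Device 1: d0 = (3, 3, 35), d1 = (131, 131, 67)
//           da = (3, 3, 35),                  db = (131, 131, 67)
//           local range after subtracting (0, 0, 32): (3, 3, 3) .. (131, 131, 35)
// Both devices end up integrating the same local index range, each over its own
// z-slab of the global grid. When the requested range does not overlap a
// device's slab, the db.z >= da.z check is meant to skip that device.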
static AcResult
local_boundcondstep(void)
{
if (num_devices == 1) {
boundcondStep(devices[0], STREAM_PRIMARY, (int3){0, 0, 0}, subgrid.m);
}
else {
// Local boundary conditions
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
const int3 d0 = (int3){0, 0, NGHOST}; // DECOMPOSITION OFFSET HERE
const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.n.z};
boundcondStep(devices[i], STREAM_PRIMARY, d0, d1);
}
}
return AC_SUCCESS;
}
static AcResult
global_boundcondstep(void)
{
if (num_devices > 1) {
// With periodic boundary conditions we exchange the front and back plates of the
// grid. The exchange is done between the first and last device (0 and num_devices - 1).
const int num_vertices = subgrid.m.x * subgrid.m.y * NGHOST;
// ...|ooooxxx|... -> xxx|ooooooo|...
{
const int3 src = (int3){0, 0, subgrid.n.z};
const int3 dst = (int3){0, 0, 0};
copyMeshDeviceToDevice(devices[num_devices - 1], STREAM_PRIMARY, src, devices[0], dst,
num_vertices);
}
// ...|ooooooo|xxx <- ...|xxxoooo|...
{
const int3 src = (int3){0, 0, NGHOST};
const int3 dst = (int3){0, 0, NGHOST + subgrid.n.z};
copyMeshDeviceToDevice(devices[0], STREAM_PRIMARY, src, devices[num_devices - 1], dst,
num_vertices);
}
}
return AC_SUCCESS;
}
AcResult
acBoundcondStep(void)
{
local_boundcondstep();
acSynchronizeStream(STREAM_ALL);
global_boundcondstep();
synchronize_halos();
acSynchronizeStream(STREAM_ALL);
return AC_SUCCESS;
}
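Continuing the same two-device illustration (NGHOST = 3, subgrid.n.z = 32), the ordering in acBoundcondStep can be read as follows; what boundcondStep does internally is not shown in this diff, so the first step is an assumption.
// Sketch of the ordering for the assumed 2-device case:
// 1. local_boundcondstep(): each device presumably fills its own ghost zones
//    for the z-planes it computes (local z = 3..34).
// 2. global_boundcondstep(): periodic wrap in z over the whole grid:
//    device 1, local z = 32..34 -> device 0, local z =  0..2  (front plate)
//    device 0, local z =  3..5  -> device 1, local z = 35..37 (back plate)
// 3. synchronize_halos(): interior halo exchange between neighbors, as in the
//    earlier sketch.
// The STREAM_ALL synchronization after step 1 ensures that steps 2 and 3 do not
// read planes that step 1 is still writing; the final synchronization ensures
// all copies have completed before acBoundcondStep returns.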
static AcResult
swap_buffers(void)
{
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
swapBuffers(devices[i]);
}
return AC_SUCCESS;
}
AcResult
acIntegrate(const AcReal& dt)
{
acSynchronizeStream(STREAM_ALL);
for (int isubstep = 0; isubstep < 3; ++isubstep) {
acIntegrateStep(isubstep, dt); // Note: boundaries must be initialized.
swap_buffers();
acBoundcondStep();
}
return AC_SUCCESS;
}
static AcReal
simple_final_reduce_scal(const ReductionType& rtype, const AcReal* results, const int& n)
{
AcReal res = results[0];
for (int i = 1; i < n; ++i) {
if (rtype == RTYPE_MAX) {
res = max(res, results[i]);
}
else if (rtype == RTYPE_MIN) {
res = min(res, results[i]);
}
else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
res = sum(res, results[i]);
}
else {
ERROR("Invalid rtype");
}
}
if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
const AcReal inv_n = AcReal(1.) / (grid.n.x * grid.n.y * grid.n.z);
res = sqrt(inv_n * res);
}
return res;
}
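A small numerical illustration of the combine step; the figures are made up, and the assumption that each device returns a partial sum of squares for RTYPE_RMS is inferred from the sqrt(inv_n * res) above rather than stated in this diff.
// Illustration only (assumed: 2 devices, grid.n = 128 x 128 x 64):
// RTYPE_MAX: results = { 1.7, 2.3 }      -> res = max(1.7, 2.3) = 2.3
// RTYPE_RMS: results = { 4.0e6, 6.0e6 }  // presumably per-device sums of squares
//            res = sqrt((4.0e6 + 6.0e6) / (128 * 128 * 64)) ≈ 3.09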
AcReal
acReduceScal(const ReductionType& rtype, const VertexBufferHandle& vtxbuffer_handle)
{
acSynchronizeStream(STREAM_ALL);
AcReal results[num_devices];
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
reduceScal(devices[i], STREAM_PRIMARY, rtype, vtxbuffer_handle, &results[i]);
}
return simple_final_reduce_scal(rtype, results, num_devices);
}
AcReal
acReduceVec(const ReductionType& rtype, const VertexBufferHandle& a, const VertexBufferHandle& b,
const VertexBufferHandle& c)
{
acSynchronizeStream(STREAM_ALL);
AcReal results[num_devices];
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
reduceVec(devices[i], STREAM_PRIMARY, rtype, a, b, c, &results[i]);
}
return simple_final_reduce_scal(rtype, results, num_devices);
}
AcResult
acLoadWithOffset(const AcMesh& host_mesh, const int3& src, const int num_vertices)
{
@@ -284,6 +508,14 @@ acLoadWithOffset(const AcMesh& host_mesh, const int3& src, const int num_vertice
return AC_SUCCESS;
}
AcResult
acLoad(const AcMesh& host_mesh)
{
acLoadWithOffset(host_mesh, (int3){0, 0, 0}, AC_VTXBUF_SIZE(host_mesh.info));
acSynchronizeStream(STREAM_ALL);
return AC_SUCCESS;
}
AcResult
acStoreWithOffset(const int3& src, const int num_vertices, AcMesh* host_mesh)
{
@@ -308,241 +540,14 @@ acStoreWithOffset(const int3& src, const int num_vertices, AcMesh* host_mesh)
return AC_SUCCESS;
}
// acCopyMeshToDevice
AcResult
acLoad(const AcMesh& host_mesh)
{
return acLoadWithOffset(host_mesh, (int3){0, 0, 0}, AC_VTXBUF_SIZE(host_mesh.info));
}
// acCopyMeshToHost
AcResult
acStore(AcMesh* host_mesh)
{
return acStoreWithOffset((int3){0, 0, 0}, AC_VTXBUF_SIZE(host_mesh->info), host_mesh);
}
static AcResult
acSwapBuffers(void)
{
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
swapBuffers(devices[i]);
}
return AC_SUCCESS;
}
static AcResult
acSynchronizeHalos(void)
{
// Exchanges the halos of subgrids
// After this step, the data within the main grid ranging from
// (0, 0, NGHOST) -> grid.m.x, grid.m.y, NGHOST + grid.n.z
// has been synchronized and transferred to appropriate subgrids
// We loop only to num_devices - 1 since the front and back plate of the grid is not
// transferred because their contents depend on the boundary conditions.
// IMPORTANT NOTE: the boundary conditions must be applied before calling this function!
// I.e. the halos of subgrids must contain up-to-date data!
// #pragma omp parallel for
for (int i = 0; i < num_devices - 1; ++i) {
const int num_vertices = subgrid.m.x * subgrid.m.y * NGHOST;
// ...|ooooxxx|... -> xxx|ooooooo|...
{
const int3 src = (int3){0, 0, subgrid.n.z};
const int3 dst = (int3){0, 0, 0};
copyMeshDeviceToDevice(devices[i], STREAM_PRIMARY, src, devices[(i + 1) % num_devices],
dst, num_vertices);
}
// ...|ooooooo|xxx <- ...|xxxoooo|...
{
const int3 src = (int3){0, 0, NGHOST};
const int3 dst = (int3){0, 0, NGHOST + subgrid.n.z};
copyMeshDeviceToDevice(devices[(i + 1) % num_devices], STREAM_PRIMARY, src, devices[i],
dst, num_vertices);
}
}
return AC_SUCCESS;
}
static AcResult
acSynchronizeStream(const StreamType stream)
{
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
synchronize(devices[i], stream);
}
return AC_SUCCESS;
}
AcResult
acSynchronize(void)
{
acSynchronizeStream(STREAM_ALL);
acSynchronizeHalos();
acSynchronizeStream(STREAM_ALL);
return AC_SUCCESS;
}
static AcResult
acLocalBoundcondStep(void)
{
if (num_devices == 1) {
boundcondStep(devices[0], STREAM_PRIMARY, (int3){0, 0, 0}, subgrid.m);
}
else {
// Local boundary conditions
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
const int3 d0 = (int3){0, 0, NGHOST}; // DECOMPOSITION OFFSET HERE
const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.n.z};
boundcondStep(devices[i], STREAM_PRIMARY, d0, d1);
}
}
return AC_SUCCESS;
}
static AcResult
acGlobalBoundcondStep(void)
{
if (num_devices > 1) {
// With periodic boundary conditions we exchange the front and back plates of the
// grid. The exchange is done between the first and last device (0 and num_devices - 1).
const int num_vertices = subgrid.m.x * subgrid.m.y * NGHOST;
// ...|ooooxxx|... -> xxx|ooooooo|...
{
const int3 src = (int3){0, 0, subgrid.n.z};
const int3 dst = (int3){0, 0, 0};
copyMeshDeviceToDevice(devices[num_devices - 1], STREAM_PRIMARY, src, devices[0], dst,
num_vertices);
}
// ...|ooooooo|xxx <- ...|xxxoooo|...
{
const int3 src = (int3){0, 0, NGHOST};
const int3 dst = (int3){0, 0, NGHOST + subgrid.n.z};
copyMeshDeviceToDevice(devices[0], STREAM_PRIMARY, src, devices[num_devices - 1], dst,
num_vertices);
}
}
return AC_SUCCESS;
}
AcResult
acBoundcondStep(void)
{
acLocalBoundcondStep();
acSynchronizeStream(STREAM_ALL);
acGlobalBoundcondStep();
acSynchronizeHalos();
acSynchronizeStream(STREAM_ALL);
return AC_SUCCESS;
}
AcResult
acIntegrateStepWithOffset(const int& isubstep, const AcReal& dt, const int3& start, const int3& end)
{
// See the beginning of the file for an explanation of the index mapping
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
// DECOMPOSITION OFFSET HERE
const int3 d0 = (int3){NGHOST, NGHOST, NGHOST + i * subgrid.n.z};
const int3 d1 = d0 + (int3){subgrid.n.x, subgrid.n.y, subgrid.n.z};
const int3 da = max(start, d0);
const int3 db = min(end, d1);
if (db.z >= da.z) {
const int3 da_local = da - (int3){0, 0, i * subgrid.n.z};
const int3 db_local = db - (int3){0, 0, i * subgrid.n.z};
rkStep(devices[i], STREAM_PRIMARY, isubstep, da_local, db_local, dt);
}
}
return AC_SUCCESS;
}
AcResult
acIntegrateStep(const int& isubstep, const AcReal& dt)
{
const int3 start = (int3){NGHOST, NGHOST, NGHOST};
const int3 end = start + grid.n;
acIntegrateStepWithOffset(isubstep, dt, start, end);
return AC_SUCCESS;
}
AcResult
acIntegrate(const AcReal& dt)
{
for (int isubstep = 0; isubstep < 3; ++isubstep) {
acIntegrateStep(isubstep, dt); // Note: boundaries must be initialized.
acSwapBuffers();
acLocalBoundcondStep();
acSynchronizeStream(STREAM_ALL);
acGlobalBoundcondStep();
acSynchronizeHalos();
acSynchronizeStream(STREAM_ALL);
}
return AC_SUCCESS;
}
static AcReal
simple_final_reduce_scal(const ReductionType& rtype, const AcReal* results, const int& n)
{
AcReal res = results[0];
for (int i = 1; i < n; ++i) {
if (rtype == RTYPE_MAX) {
res = max(res, results[i]);
}
else if (rtype == RTYPE_MIN) {
res = min(res, results[i]);
}
else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
res = sum(res, results[i]);
}
else {
ERROR("Invalid rtype");
}
}
if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
const AcReal inv_n = AcReal(1.) / (grid.n.x * grid.n.y * grid.n.z);
res = sqrt(inv_n * res);
}
return res;
}
AcReal
acReduceScal(const ReductionType& rtype, const VertexBufferHandle& vtxbuffer_handle)
{
AcReal results[num_devices];
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
reduceScal(devices[i], STREAM_PRIMARY, rtype, vtxbuffer_handle, &results[i]);
}
return simple_final_reduce_scal(rtype, results, num_devices);
}
AcReal
acReduceVec(const ReductionType& rtype, const VertexBufferHandle& a, const VertexBufferHandle& b,
const VertexBufferHandle& c)
{
AcReal results[num_devices];
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
reduceVec(devices[i], STREAM_PRIMARY, rtype, a, b, c, &results[i]);
}
return simple_final_reduce_scal(rtype, results, num_devices);
}
AcResult
acLoadDeviceConstant(const AcRealParam param, const AcReal value)
{
@@ -552,9 +557,3 @@ acLoadDeviceConstant(const AcRealParam param, const AcReal value)
}
return AC_SUCCESS;
}
AcResult
acAutoOptimize(void)
{
return autoOptimize(devices[0]);
}

View File

@@ -27,15 +27,6 @@
#pragma once
#include "astaroth.h"
// clang-format off
typedef enum {
STREAM_PRIMARY,
STREAM_SECONDARY,
NUM_STREAM_TYPES,
STREAM_ALL
} StreamType;
// clang-format on
typedef struct {
int3 m;
int3 n;

View File

@@ -413,8 +413,6 @@ check_rk3(const AcMeshInfo& mesh_info)
const AcReal dt = AcReal(1e-2); // Use a small constant timestep to avoid instabilities
acIntegrate(dt);
acBoundcondStep();
acSynchronize();
model_rk3(dt, model_mesh);
boundconds(model_mesh->info, model_mesh);
@@ -712,8 +710,6 @@ run_autotest(void)
// Device integration step
acIntegrate(dt);
acBoundcondStep();
acSynchronize();
acStore(candidate_mesh);
// Check fields

View File

@@ -122,7 +122,6 @@ run_benchmark(void)
// Warmup
for (int i = 0; i < 10; ++i) {
acIntegrate(0);
acSynchronize();
}
Timer t;
@@ -135,8 +134,6 @@ run_benchmark(void)
#else // GEN_BENCHMARK_FULL
acIntegrate(dt);
#endif
acSynchronize();
const double ms_elapsed = timer_diff_nsec(t) / 1e6;
results.push_back(ms_elapsed);
}

View File

@@ -406,7 +406,7 @@ run_renderer(void)
const int num_vertices = mesh->info.int_params[AC_mxy];
const int3 dst = (int3){0, 0, k_slice};
acStoreWithOffset(dst, num_vertices, mesh);
acSynchronize();
acSynchronizeStream(STREAM_ALL);
renderer_draw(*mesh); // Bottleneck is here
printf("Step #%d, dt: %f\n", steps, double(dt));
timer_reset(&frame_timer);

View File

@@ -205,7 +205,6 @@ run_simulation(void)
write_mesh_info(&mesh_info);
print_diagnostics(0, AcReal(.0), t_step, diag_file);
acSynchronize();
acStore(mesh);
save_mesh(*mesh, 0, t_step);
@@ -275,8 +274,6 @@ run_simulation(void)
acSynchronize();
acStore(mesh);
*/
acSynchronize();
acStore(mesh);
save_mesh(*mesh, i, t_step);