Reviewed the Astaroth interface. Now there's a clear distinction between synchronous and asynchronous functions. For basic usage, we provide a set of functions that are always safe to call (acIntegrate, acLoad, etc), but because of this, must be quite restricted in the sense that f.ex. the whole mesh must be loaded at once and computations cannot be executed concurrently on multiple GPUs. For more advanced users we provide asynchronous functions (such as acLoadWithOffset). Since we cannot know how the asynchronous functions are called (for example, when the integration step has been fully completed and the halos of neighboring subgrids can be safely communicated between GPUs), the responsibility of synchronization must be left to the user. In the existing implementations we currently use only the basic "safe" set of functions (except in renderer.cc), so the existing functionality has not been changed with these latests commits. Autotests also pass.

2019-07-09 18:42:00 +03:00
parent 1251f61570
commit 0bda016e17
7 changed files with 298 additions and 300 deletions
--- a/include/astaroth.h
+++ b/include/astaroth.h
@@ -212,71 +212,89 @@ typedef struct {

 /*
 * =============================================================================
- * Astaroth interface
+ * Astaroth interface: Basic functions. Synchronous.
 * =============================================================================
 */
+typedef enum {
+    STREAM_PRIMARY,   //
+    STREAM_SECONDARY, //
+    NUM_STREAM_TYPES, //
+    STREAM_ALL
+} StreamType;

 /** Checks whether there are any CUDA devices available. Returns AC_SUCCESS if there is 1 or more,
-    AC_FAILURE otherwise. */
+ * AC_FAILURE otherwise. */
 AcResult acCheckDeviceAvailability(void);

+/** Synchronizes the stream shared by all GPUs in the node. Synchronizes all streams if STREAM_ALL
+ * passed as a parameter */
+AcResult acSynchronizeStream(const StreamType stream);
+
+/** Synchronizes the mesh distributed across multiple GPUs. Must be called if the data in the halos
+ * of neighboring GPUs has been modified by an asynchronous function, f.ex. acBoundcondStep() */
+AcResult acSynchronizeMesh(void);
+
 /** Starting point of all GPU computation. Handles the allocation and
 initialization of *all memory needed on all GPUs in the node*. In other words,
 setups everything GPU-side so that calling any other GPU interface function
 afterwards does not result in illegal memory accesses. */
 AcResult acInit(const AcMeshInfo& mesh_info);

-/** Splits the host_mesh and distributes it among the GPUs in the node */
-AcResult acLoad(const AcMesh& host_mesh);
-AcResult acLoadWithOffset(const AcMesh& host_mesh, const int3& start, const int num_vertices);
-
-/** Does all three steps of the RK3 integration and computes the boundary
-conditions when necessary. Note that the boundary conditions are not applied
-after the final integration step.
-The result can be fetched to CPU memory with acStore(). */
-AcResult acIntegrate(const AcReal& dt);
-
-/** Performs a single RK3 step without computing boundary conditions. */
-AcResult acIntegrateStep(const int& isubstep, const AcReal& dt);
-
-/** Performs a single RK3 step without computing boundary conditions.
-    Operates on a three-dimensional cuboid, where start and end are the
-    opposite corners. */
-AcResult acIntegrateStepWithOffset(const int& isubstep, const AcReal& dt, const int3& start,
-                                   const int3& end);
-
-/** Applies boundary conditions on the GPU meshs and communicates the
- ghost zones among GPUs if necessary */
-AcResult acBoundcondStep(void);
-
-/** Performs a scalar reduction on all GPUs in the node and returns the result.
- */
-AcReal acReduceScal(const ReductionType& rtype, const VertexBufferHandle& a);
-
-/** Performs a vector reduction on all GPUs in the node and returns the result.
- */
-AcReal acReduceVec(const ReductionType& rtype, const VertexBufferHandle& a,
-                   const VertexBufferHandle& b, const VertexBufferHandle& c);
-
-/** Stores the mesh distributed among GPUs of the node back to a single host
- * mesh */
-AcResult acStore(AcMesh* host_mesh);
-AcResult acStoreWithOffset(const int3& start, const int num_vertices, AcMesh* host_mesh);
-
 /** Frees all GPU allocations and resets all devices in the node. Should be
 * called at exit. */
 AcResult acQuit(void);

-/** Synchronizes all devices. All calls to Astaroth are asynchronous by default
-    unless otherwise stated. */
-AcResult acSynchronize(void);
+/** Does all three substeps of the RK3 integration and computes the boundary
+conditions when necessary. The result is synchronized and the boundary conditions are applied
+after the final substep, after which the result can be fetched to CPU memory with acStore. */
+AcResult acIntegrate(const AcReal& dt);

-/** Loads a parameter to the constant memory of all devices */
+/** Performs a scalar reduction on all GPUs in the node and returns the result. Operates on the
+ * whole computational domain, which must be up to date and synchronized before calling
+ * acReduceScal.
+ */
+AcReal acReduceScal(const ReductionType& rtype, const VertexBufferHandle& a);
+
+/** Performs a vector reduction on all GPUs in the node and returns the result. Operates on the
+ * whole computational domain, which must be up to date and synchronized before calling
+ * acReduceVec.
+ */
+AcReal acReduceVec(const ReductionType& rtype, const VertexBufferHandle& a,
+                   const VertexBufferHandle& b, const VertexBufferHandle& c);
+
+/** Distributes the host mesh among the GPUs in the node. Synchronous. */
+AcResult acLoad(const AcMesh& host_mesh);
+
+/** Gathers the mesh stored across GPUs in the node and stores it back to host memory. Synchronous.
+ */
+AcResult acStore(AcMesh* host_mesh);
+
+/*
+ * =============================================================================
+ * Astaroth interface: Advanced functions. Asynchronous.
+ * =============================================================================
+ */
+/** Loads a parameter to the constant memory of all GPUs in the node. Asynchronous. */
 AcResult acLoadDeviceConstant(const AcRealParam param, const AcReal value);

-/** Auto-optimizes the library. This function is free from side-effects: the input vertex buffer is
- * guaranteed not be modified.*/
-AcResult acAutoOptimize(void);
+/** Splits a subset of the host_mesh and distributes it among the GPUs in the node. Asynchronous. */
+AcResult acLoadWithOffset(const AcMesh& host_mesh, const int3& start, const int num_vertices);
+
+/** Gathers a subset of the data distributed among the GPUs in the node and stores the mesh back to
+ * CPU memory. Asynchronous.
+ */
+AcResult acStoreWithOffset(const int3& start, const int num_vertices, AcMesh* host_mesh);
+
+/** Performs a single RK3 step without computing boundary conditions. Asynchronous.*/
+AcResult acIntegrateStep(const int& isubstep, const AcReal& dt);
+
+/** Performs a single RK3 step on a subset of the mesh without computing the boundary conditions.
+ * Asynchronous.*/
+AcResult acIntegrateStepWithOffset(const int& isubstep, const AcReal& dt, const int3& start,
+                                   const int3& end);
+
+/** Performs the boundary condition step on the GPUs in the node. Asynchronous. */
+AcResult acBoundcondStep(void);

 /* End extern "C" */
 #ifdef __cplusplus
--- a/src/core/astaroth.cu
+++ b/src/core/astaroth.cu
@@ -183,6 +183,61 @@ acCheckDeviceAvailability(void)
        return AC_FAILURE;
 }

+AcResult
+acSynchronizeStream(const StreamType stream)
+{
+    // #pragma omp parallel for
+    for (int i = 0; i < num_devices; ++i) {
+        synchronize(devices[i], stream);
+    }
+
+    return AC_SUCCESS;
+}
+
+static AcResult
+synchronize_halos(void)
+{
+    // Exchanges the halos of subgrids
+    // After this step, the data within the main grid ranging from
+    // (0, 0, NGHOST) -> grid.m.x, grid.m.y, NGHOST + grid.n.z
+    // has been synchronized and transferred to appropriate subgrids
+
+    // We loop only to num_devices - 1 since the front and back plate of the grid is not
+    // transferred because their contents depend on the boundary conditions.
+
+    // IMPORTANT NOTE: the boundary conditions must be applied before calling this function!
+    // I.e. the halos of subgrids must contain up-to-date data!
+    // #pragma omp parallel for
+    for (int i = 0; i < num_devices - 1; ++i) {
+        const int num_vertices = subgrid.m.x * subgrid.m.y * NGHOST;
+        // ...|ooooxxx|... -> xxx|ooooooo|...
+        {
+            const int3 src = (int3){0, 0, subgrid.n.z};
+            const int3 dst = (int3){0, 0, 0};
+            copyMeshDeviceToDevice(devices[i], STREAM_PRIMARY, src, devices[(i + 1) % num_devices],
+                                   dst, num_vertices);
+        }
+        // ...|ooooooo|xxx <- ...|xxxoooo|...
+        {
+            const int3 src = (int3){0, 0, NGHOST};
+            const int3 dst = (int3){0, 0, NGHOST + subgrid.n.z};
+            copyMeshDeviceToDevice(devices[(i + 1) % num_devices], STREAM_PRIMARY, src, devices[i],
+                                   dst, num_vertices);
+        }
+    }
+    return AC_SUCCESS;
+}
+
+AcResult
+acSynchronizeMesh(void)
+{
+    acSynchronizeStream(STREAM_ALL);
+    synchronize_halos();
+    acSynchronizeStream(STREAM_ALL);
+
+    return AC_SUCCESS;
+}
+
 AcResult
 acInit(const AcMeshInfo& config)
 {
@@ -235,18 +290,187 @@ acInit(const AcMeshInfo& config)
        loadGlobalGrid(devices[i], grid);
        printDeviceInfo(devices[i]);
    }
+
+    acSynchronizeStream(STREAM_ALL);
    return AC_SUCCESS;
 }

 AcResult
 acQuit(void)
 {
+    acSynchronizeStream(STREAM_ALL);
+
    for (int i = 0; i < num_devices; ++i) {
        destroyDevice(devices[i]);
    }
    return AC_SUCCESS;
 }

+AcResult
+acIntegrateStepWithOffset(const int& isubstep, const AcReal& dt, const int3& start, const int3& end)
+{
+    // See the beginning of the file for an explanation of the index mapping
+    // #pragma omp parallel for
+    for (int i = 0; i < num_devices; ++i) {
+        // DECOMPOSITION OFFSET HERE
+        const int3 d0 = (int3){NGHOST, NGHOST, NGHOST + i * subgrid.n.z};
+        const int3 d1 = d0 + (int3){subgrid.n.x, subgrid.n.y, subgrid.n.z};
+
+        const int3 da = max(start, d0);
+        const int3 db = min(end, d1);
+
+        if (db.z >= da.z) {
+            const int3 da_local = da - (int3){0, 0, i * subgrid.n.z};
+            const int3 db_local = db - (int3){0, 0, i * subgrid.n.z};
+            rkStep(devices[i], STREAM_PRIMARY, isubstep, da_local, db_local, dt);
+        }
+    }
+    return AC_SUCCESS;
+}
+
+AcResult
+acIntegrateStep(const int& isubstep, const AcReal& dt)
+{
+    const int3 start = (int3){NGHOST, NGHOST, NGHOST};
+    const int3 end   = start + grid.n;
+    acIntegrateStepWithOffset(isubstep, dt, start, end);
+
+    return AC_SUCCESS;
+}
+
+static AcResult
+local_boundcondstep(void)
+{
+    if (num_devices == 1) {
+        boundcondStep(devices[0], STREAM_PRIMARY, (int3){0, 0, 0}, subgrid.m);
+    }
+    else {
+        // Local boundary conditions
+        // #pragma omp parallel for
+        for (int i = 0; i < num_devices; ++i) {
+            const int3 d0 = (int3){0, 0, NGHOST}; // DECOMPOSITION OFFSET HERE
+            const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.n.z};
+            boundcondStep(devices[i], STREAM_PRIMARY, d0, d1);
+        }
+    }
+    return AC_SUCCESS;
+}
+
+static AcResult
+global_boundcondstep(void)
+{
+    if (num_devices > 1) {
+        // With periodic boundary conditions we exchange the front and back plates of the
+        // grid. The exchange is done between the first and last device (0 and num_devices - 1).
+        const int num_vertices = subgrid.m.x * subgrid.m.y * NGHOST;
+        // ...|ooooxxx|... -> xxx|ooooooo|...
+        {
+            const int3 src = (int3){0, 0, subgrid.n.z};
+            const int3 dst = (int3){0, 0, 0};
+            copyMeshDeviceToDevice(devices[num_devices - 1], STREAM_PRIMARY, src, devices[0], dst,
+                                   num_vertices);
+        }
+        // ...|ooooooo|xxx <- ...|xxxoooo|...
+        {
+            const int3 src = (int3){0, 0, NGHOST};
+            const int3 dst = (int3){0, 0, NGHOST + subgrid.n.z};
+            copyMeshDeviceToDevice(devices[0], STREAM_PRIMARY, src, devices[num_devices - 1], dst,
+                                   num_vertices);
+        }
+    }
+    return AC_SUCCESS;
+}
+
+AcResult
+acBoundcondStep(void)
+{
+    local_boundcondstep();
+    acSynchronizeStream(STREAM_ALL);
+    global_boundcondstep();
+    synchronize_halos();
+    acSynchronizeStream(STREAM_ALL);
+
+    return AC_SUCCESS;
+}
+
+static AcResult
+swap_buffers(void)
+{
+    // #pragma omp parallel for
+    for (int i = 0; i < num_devices; ++i) {
+        swapBuffers(devices[i]);
+    }
+    return AC_SUCCESS;
+}
+
+AcResult
+acIntegrate(const AcReal& dt)
+{
+    acSynchronizeStream(STREAM_ALL);
+    for (int isubstep = 0; isubstep < 3; ++isubstep) {
+        acIntegrateStep(isubstep, dt); // Note: boundaries must be initialized.
+        swap_buffers();
+        acBoundcondStep();
+    }
+    return AC_SUCCESS;
+}
+
+static AcReal
+simple_final_reduce_scal(const ReductionType& rtype, const AcReal* results, const int& n)
+{
+    AcReal res = results[0];
+    for (int i = 1; i < n; ++i) {
+        if (rtype == RTYPE_MAX) {
+            res = max(res, results[i]);
+        }
+        else if (rtype == RTYPE_MIN) {
+            res = min(res, results[i]);
+        }
+        else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
+            res = sum(res, results[i]);
+        }
+        else {
+            ERROR("Invalid rtype");
+        }
+    }
+
+    if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
+        const AcReal inv_n = AcReal(1.) / (grid.n.x * grid.n.y * grid.n.z);
+        res                = sqrt(inv_n * res);
+    }
+
+    return res;
+}
+
+AcReal
+acReduceScal(const ReductionType& rtype, const VertexBufferHandle& vtxbuffer_handle)
+{
+    acSynchronizeStream(STREAM_ALL);
+
+    AcReal results[num_devices];
+    // #pragma omp parallel for
+    for (int i = 0; i < num_devices; ++i) {
+        reduceScal(devices[i], STREAM_PRIMARY, rtype, vtxbuffer_handle, &results[i]);
+    }
+
+    return simple_final_reduce_scal(rtype, results, num_devices);
+}
+
+AcReal
+acReduceVec(const ReductionType& rtype, const VertexBufferHandle& a, const VertexBufferHandle& b,
+            const VertexBufferHandle& c)
+{
+    acSynchronizeStream(STREAM_ALL);
+
+    AcReal results[num_devices];
+    // #pragma omp parallel for
+    for (int i = 0; i < num_devices; ++i) {
+        reduceVec(devices[i], STREAM_PRIMARY, rtype, a, b, c, &results[i]);
+    }
+
+    return simple_final_reduce_scal(rtype, results, num_devices);
+}
+
 AcResult
 acLoadWithOffset(const AcMesh& host_mesh, const int3& src, const int num_vertices)
 {
@@ -284,6 +508,14 @@ acLoadWithOffset(const AcMesh& host_mesh, const int3& src, const int num_vertice
    return AC_SUCCESS;
 }

+AcResult
+acLoad(const AcMesh& host_mesh)
+{
+    acLoadWithOffset(host_mesh, (int3){0, 0, 0}, AC_VTXBUF_SIZE(host_mesh.info));
+    acSynchronizeStream(STREAM_ALL);
+    return AC_SUCCESS;
+}
+
 AcResult
 acStoreWithOffset(const int3& src, const int num_vertices, AcMesh* host_mesh)
 {
@@ -308,241 +540,14 @@ acStoreWithOffset(const int3& src, const int num_vertices, AcMesh* host_mesh)
    return AC_SUCCESS;
 }

-// acCopyMeshToDevice
-AcResult
-acLoad(const AcMesh& host_mesh)
-{
-    return acLoadWithOffset(host_mesh, (int3){0, 0, 0}, AC_VTXBUF_SIZE(host_mesh.info));
-}
-
-// acCopyMeshToHost
 AcResult
 acStore(AcMesh* host_mesh)
 {
-    return acStoreWithOffset((int3){0, 0, 0}, AC_VTXBUF_SIZE(host_mesh->info), host_mesh);
-}
-
-static AcResult
-acSwapBuffers(void)
-{
-    // #pragma omp parallel for
-    for (int i = 0; i < num_devices; ++i) {
-        swapBuffers(devices[i]);
-    }
-    return AC_SUCCESS;
-}
-
-static AcResult
-acSynchronizeHalos(void)
-{
-    // Exchanges the halos of subgrids
-    // After this step, the data within the main grid ranging from
-    // (0, 0, NGHOST) -> grid.m.x, grid.m.y, NGHOST + grid.n.z
-    // has been synchronized and transferred to appropriate subgrids
-
-    // We loop only to num_devices - 1 since the front and back plate of the grid is not
-    // transferred because their contents depend on the boundary conditions.
-
-    // IMPORTANT NOTE: the boundary conditions must be applied before calling this function!
-    // I.e. the halos of subgrids must contain up-to-date data!
-    // #pragma omp parallel for
-    for (int i = 0; i < num_devices - 1; ++i) {
-        const int num_vertices = subgrid.m.x * subgrid.m.y * NGHOST;
-        // ...|ooooxxx|... -> xxx|ooooooo|...
-        {
-            const int3 src = (int3){0, 0, subgrid.n.z};
-            const int3 dst = (int3){0, 0, 0};
-            copyMeshDeviceToDevice(devices[i], STREAM_PRIMARY, src, devices[(i + 1) % num_devices],
-                                   dst, num_vertices);
-        }
-        // ...|ooooooo|xxx <- ...|xxxoooo|...
-        {
-            const int3 src = (int3){0, 0, NGHOST};
-            const int3 dst = (int3){0, 0, NGHOST + subgrid.n.z};
-            copyMeshDeviceToDevice(devices[(i + 1) % num_devices], STREAM_PRIMARY, src, devices[i],
-                                   dst, num_vertices);
-        }
-    }
-    return AC_SUCCESS;
-}
-
-static AcResult
-acSynchronizeStream(const StreamType stream)
-{
-    // #pragma omp parallel for
-    for (int i = 0; i < num_devices; ++i) {
-        synchronize(devices[i], stream);
-    }
-
-    return AC_SUCCESS;
-}
-
-AcResult
-acSynchronize(void)
-{
+    acStoreWithOffset((int3){0, 0, 0}, AC_VTXBUF_SIZE(host_mesh->info), host_mesh);
    acSynchronizeStream(STREAM_ALL);
-    acSynchronizeHalos();
-    acSynchronizeStream(STREAM_ALL);
-
    return AC_SUCCESS;
 }

-static AcResult
-acLocalBoundcondStep(void)
-{
-    if (num_devices == 1) {
-        boundcondStep(devices[0], STREAM_PRIMARY, (int3){0, 0, 0}, subgrid.m);
-    }
-    else {
-        // Local boundary conditions
-        // #pragma omp parallel for
-        for (int i = 0; i < num_devices; ++i) {
-            const int3 d0 = (int3){0, 0, NGHOST}; // DECOMPOSITION OFFSET HERE
-            const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.n.z};
-            boundcondStep(devices[i], STREAM_PRIMARY, d0, d1);
-        }
-    }
-    return AC_SUCCESS;
-}
-
-static AcResult
-acGlobalBoundcondStep(void)
-{
-    if (num_devices > 1) {
-        // With periodic boundary conditions we exchange the front and back plates of the
-        // grid. The exchange is done between the first and last device (0 and num_devices - 1).
-        const int num_vertices = subgrid.m.x * subgrid.m.y * NGHOST;
-        // ...|ooooxxx|... -> xxx|ooooooo|...
-        {
-            const int3 src = (int3){0, 0, subgrid.n.z};
-            const int3 dst = (int3){0, 0, 0};
-            copyMeshDeviceToDevice(devices[num_devices - 1], STREAM_PRIMARY, src, devices[0], dst,
-                                   num_vertices);
-        }
-        // ...|ooooooo|xxx <- ...|xxxoooo|...
-        {
-            const int3 src = (int3){0, 0, NGHOST};
-            const int3 dst = (int3){0, 0, NGHOST + subgrid.n.z};
-            copyMeshDeviceToDevice(devices[0], STREAM_PRIMARY, src, devices[num_devices - 1], dst,
-                                   num_vertices);
-        }
-    }
-    return AC_SUCCESS;
-}
-
-AcResult
-acBoundcondStep(void)
-{
-    acLocalBoundcondStep();
-    acSynchronizeStream(STREAM_ALL);
-    acGlobalBoundcondStep();
-    acSynchronizeHalos();
-    acSynchronizeStream(STREAM_ALL);
-
-    return AC_SUCCESS;
-}
-
-AcResult
-acIntegrateStepWithOffset(const int& isubstep, const AcReal& dt, const int3& start, const int3& end)
-{
-    // See the beginning of the file for an explanation of the index mapping
-    // #pragma omp parallel for
-    for (int i = 0; i < num_devices; ++i) {
-        // DECOMPOSITION OFFSET HERE
-        const int3 d0 = (int3){NGHOST, NGHOST, NGHOST + i * subgrid.n.z};
-        const int3 d1 = d0 + (int3){subgrid.n.x, subgrid.n.y, subgrid.n.z};
-
-        const int3 da = max(start, d0);
-        const int3 db = min(end, d1);
-
-        if (db.z >= da.z) {
-            const int3 da_local = da - (int3){0, 0, i * subgrid.n.z};
-            const int3 db_local = db - (int3){0, 0, i * subgrid.n.z};
-            rkStep(devices[i], STREAM_PRIMARY, isubstep, da_local, db_local, dt);
-        }
-    }
-    return AC_SUCCESS;
-}
-
-AcResult
-acIntegrateStep(const int& isubstep, const AcReal& dt)
-{
-    const int3 start = (int3){NGHOST, NGHOST, NGHOST};
-    const int3 end   = start + grid.n;
-    acIntegrateStepWithOffset(isubstep, dt, start, end);
-
-    return AC_SUCCESS;
-}
-
-AcResult
-acIntegrate(const AcReal& dt)
-{
-    for (int isubstep = 0; isubstep < 3; ++isubstep) {
-        acIntegrateStep(isubstep, dt); // Note: boundaries must be initialized.
-        acSwapBuffers();
-        acLocalBoundcondStep();
-        acSynchronizeStream(STREAM_ALL);
-        acGlobalBoundcondStep();
-        acSynchronizeHalos();
-        acSynchronizeStream(STREAM_ALL);
-    }
-    return AC_SUCCESS;
-}
-
-static AcReal
-simple_final_reduce_scal(const ReductionType& rtype, const AcReal* results, const int& n)
-{
-    AcReal res = results[0];
-    for (int i = 1; i < n; ++i) {
-        if (rtype == RTYPE_MAX) {
-            res = max(res, results[i]);
-        }
-        else if (rtype == RTYPE_MIN) {
-            res = min(res, results[i]);
-        }
-        else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
-            res = sum(res, results[i]);
-        }
-        else {
-            ERROR("Invalid rtype");
-        }
-    }
-
-    if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
-        const AcReal inv_n = AcReal(1.) / (grid.n.x * grid.n.y * grid.n.z);
-        res                = sqrt(inv_n * res);
-    }
-
-    return res;
-}
-
-AcReal
-acReduceScal(const ReductionType& rtype, const VertexBufferHandle& vtxbuffer_handle)
-{
-    AcReal results[num_devices];
-
-    // #pragma omp parallel for
-    for (int i = 0; i < num_devices; ++i) {
-        reduceScal(devices[i], STREAM_PRIMARY, rtype, vtxbuffer_handle, &results[i]);
-    }
-
-    return simple_final_reduce_scal(rtype, results, num_devices);
-}
-
-AcReal
-acReduceVec(const ReductionType& rtype, const VertexBufferHandle& a, const VertexBufferHandle& b,
-            const VertexBufferHandle& c)
-{
-    AcReal results[num_devices];
-
-    // #pragma omp parallel for
-    for (int i = 0; i < num_devices; ++i) {
-        reduceVec(devices[i], STREAM_PRIMARY, rtype, a, b, c, &results[i]);
-    }
-
-    return simple_final_reduce_scal(rtype, results, num_devices);
-}
-
 AcResult
 acLoadDeviceConstant(const AcRealParam param, const AcReal value)
 {
@@ -552,9 +557,3 @@ acLoadDeviceConstant(const AcRealParam param, const AcReal value)
    }
    return AC_SUCCESS;
 }
-
-AcResult
-acAutoOptimize(void)
-{
-    return autoOptimize(devices[0]);
-}
--- a/src/core/device.cuh
+++ b/src/core/device.cuh
@@ -27,15 +27,6 @@
 #pragma once
 #include "astaroth.h"

-// clang-format off
-typedef enum {
-    STREAM_PRIMARY,
-    STREAM_SECONDARY,
-    NUM_STREAM_TYPES,
-    STREAM_ALL
-} StreamType;
-// clang-format on
-
 typedef struct {
    int3 m;
    int3 n;
--- a/src/standalone/autotest.cc
+++ b/src/standalone/autotest.cc
@@ -413,8 +413,6 @@ check_rk3(const AcMeshInfo& mesh_info)
            const AcReal dt = AcReal(1e-2); // Use a small constant timestep to avoid instabilities

            acIntegrate(dt);
-            acBoundcondStep();
-            acSynchronize();

            model_rk3(dt, model_mesh);
            boundconds(model_mesh->info, model_mesh);
@@ -712,8 +710,6 @@ run_autotest(void)

                // Device integration step
                acIntegrate(dt);
-                acBoundcondStep();
-                acSynchronize();
                acStore(candidate_mesh);

                // Check fields
--- a/src/standalone/benchmark.cc
+++ b/src/standalone/benchmark.cc
@@ -122,7 +122,6 @@ run_benchmark(void)
        // Warmup
        for (int i = 0; i < 10; ++i) {
            acIntegrate(0);
-            acSynchronize();
        }

        Timer t;
@@ -135,8 +134,6 @@ run_benchmark(void)
 #else // GEN_BENCHMARK_FULL
            acIntegrate(dt);
 #endif
-            acSynchronize();
-
            const double ms_elapsed = timer_diff_nsec(t) / 1e6;
            results.push_back(ms_elapsed);
        }
--- a/src/standalone/renderer.cc
+++ b/src/standalone/renderer.cc
@@ -406,7 +406,7 @@ run_renderer(void)
            const int num_vertices = mesh->info.int_params[AC_mxy];
            const int3 dst         = (int3){0, 0, k_slice};
            acStoreWithOffset(dst, num_vertices, mesh);
-            acSynchronize();
+            acSynchronizeStream(STREAM_ALL);
            renderer_draw(*mesh); // Bottleneck is here
            printf("Step #%d, dt: %f\n", steps, double(dt));
            timer_reset(&frame_timer);
--- a/src/standalone/simulation.cc
+++ b/src/standalone/simulation.cc
@@ -205,7 +205,6 @@ run_simulation(void)
    write_mesh_info(&mesh_info);
    print_diagnostics(0, AcReal(.0), t_step, diag_file);

-    acSynchronize();
    acStore(mesh);
    save_mesh(*mesh, 0, t_step);

@@ -275,8 +274,6 @@ run_simulation(void)
                acSynchronize();
                acStore(mesh);
            */
-
-            acSynchronize();
            acStore(mesh);

            save_mesh(*mesh, i, t_step);