diff --git a/include/astaroth.h b/include/astaroth.h index 9c91448..f3c0fb0 100644 --- a/include/astaroth.h +++ b/include/astaroth.h @@ -212,71 +212,89 @@ typedef struct { /* * ============================================================================= - * Astaroth interface + * Astaroth interface: Basic functions. Synchronous. * ============================================================================= */ +typedef enum { + STREAM_PRIMARY, // + STREAM_SECONDARY, // + NUM_STREAM_TYPES, // + STREAM_ALL +} StreamType; /** Checks whether there are any CUDA devices available. Returns AC_SUCCESS if there is 1 or more, - AC_FAILURE otherwise. */ + * AC_FAILURE otherwise. */ AcResult acCheckDeviceAvailability(void); +/** Synchronizes the stream shared by all GPUs in the node. Synchronizes all streams if STREAM_ALL + * passed as a parameter */ +AcResult acSynchronizeStream(const StreamType stream); + +/** Synchronizes the mesh distributed across multiple GPUs. Must be called if the data in the halos + * of neighboring GPUs has been modified by an asynchronous function, f.ex. acBoundcondStep() */ +AcResult acSynchronizeMesh(void); + /** Starting point of all GPU computation. Handles the allocation and initialization of *all memory needed on all GPUs in the node*. In other words, setups everything GPU-side so that calling any other GPU interface function afterwards does not result in illegal memory accesses. */ AcResult acInit(const AcMeshInfo& mesh_info); -/** Splits the host_mesh and distributes it among the GPUs in the node */ -AcResult acLoad(const AcMesh& host_mesh); -AcResult acLoadWithOffset(const AcMesh& host_mesh, const int3& start, const int num_vertices); - -/** Does all three steps of the RK3 integration and computes the boundary -conditions when necessary. Note that the boundary conditions are not applied -after the final integration step. -The result can be fetched to CPU memory with acStore(). */ -AcResult acIntegrate(const AcReal& dt); - -/** Performs a single RK3 step without computing boundary conditions. */ -AcResult acIntegrateStep(const int& isubstep, const AcReal& dt); - -/** Performs a single RK3 step without computing boundary conditions. - Operates on a three-dimensional cuboid, where start and end are the - opposite corners. */ -AcResult acIntegrateStepWithOffset(const int& isubstep, const AcReal& dt, const int3& start, - const int3& end); - -/** Applies boundary conditions on the GPU meshs and communicates the - ghost zones among GPUs if necessary */ -AcResult acBoundcondStep(void); - -/** Performs a scalar reduction on all GPUs in the node and returns the result. - */ -AcReal acReduceScal(const ReductionType& rtype, const VertexBufferHandle& a); - -/** Performs a vector reduction on all GPUs in the node and returns the result. - */ -AcReal acReduceVec(const ReductionType& rtype, const VertexBufferHandle& a, - const VertexBufferHandle& b, const VertexBufferHandle& c); - -/** Stores the mesh distributed among GPUs of the node back to a single host - * mesh */ -AcResult acStore(AcMesh* host_mesh); -AcResult acStoreWithOffset(const int3& start, const int num_vertices, AcMesh* host_mesh); - /** Frees all GPU allocations and resets all devices in the node. Should be * called at exit. */ AcResult acQuit(void); -/** Synchronizes all devices. All calls to Astaroth are asynchronous by default - unless otherwise stated. */ -AcResult acSynchronize(void); +/** Does all three substeps of the RK3 integration and computes the boundary +conditions when necessary. 
The result is synchronized and the boundary conditions are applied +after the final substep, after which the result can be fetched to CPU memory with acStore. */ +AcResult acIntegrate(const AcReal& dt); -/** Loads a parameter to the constant memory of all devices */ +/** Performs a scalar reduction on all GPUs in the node and returns the result. Operates on the + * whole computational domain, which must be up to date and synchronized before calling + * acReduceScal. + */ +AcReal acReduceScal(const ReductionType& rtype, const VertexBufferHandle& a); + +/** Performs a vector reduction on all GPUs in the node and returns the result. Operates on the + * whole computational domain, which must be up to date and synchronized before calling + * acReduceVec. + */ +AcReal acReduceVec(const ReductionType& rtype, const VertexBufferHandle& a, + const VertexBufferHandle& b, const VertexBufferHandle& c); + +/** Distributes the host mesh among the GPUs in the node. Synchronous. */ +AcResult acLoad(const AcMesh& host_mesh); + +/** Gathers the mesh stored across GPUs in the node and stores it back to host memory. Synchronous. + */ +AcResult acStore(AcMesh* host_mesh); + +/* + * ============================================================================= + * Astaroth interface: Advanced functions. Asynchronous. + * ============================================================================= + */ +/** Loads a parameter to the constant memory of all GPUs in the node. Asynchronous. */ AcResult acLoadDeviceConstant(const AcRealParam param, const AcReal value); -/** Auto-optimizes the library. This function is free from side-effects: the input vertex buffer is - * guaranteed not be modified.*/ -AcResult acAutoOptimize(void); +/** Splits a subset of the host_mesh and distributes it among the GPUs in the node. Asynchronous. */ +AcResult acLoadWithOffset(const AcMesh& host_mesh, const int3& start, const int num_vertices); + +/** Gathers a subset of the data distributed among the GPUs in the node and stores the mesh back to + * CPU memory. Asynchronous. + */ +AcResult acStoreWithOffset(const int3& start, const int num_vertices, AcMesh* host_mesh); + +/** Performs a single RK3 step without computing boundary conditions. Asynchronous.*/ +AcResult acIntegrateStep(const int& isubstep, const AcReal& dt); + +/** Performs a single RK3 step on a subset of the mesh without computing the boundary conditions. + * Asynchronous.*/ +AcResult acIntegrateStepWithOffset(const int& isubstep, const AcReal& dt, const int3& start, + const int3& end); + +/** Performs the boundary condition step on the GPUs in the node. Asynchronous. 
*/ +AcResult acBoundcondStep(void); /* End extern "C" */ #ifdef __cplusplus diff --git a/src/core/astaroth.cu b/src/core/astaroth.cu index 78f552c..3f22820 100644 --- a/src/core/astaroth.cu +++ b/src/core/astaroth.cu @@ -183,6 +183,61 @@ acCheckDeviceAvailability(void) return AC_FAILURE; } +AcResult +acSynchronizeStream(const StreamType stream) +{ + // #pragma omp parallel for + for (int i = 0; i < num_devices; ++i) { + synchronize(devices[i], stream); + } + + return AC_SUCCESS; +} + +static AcResult +synchronize_halos(void) +{ + // Exchanges the halos of subgrids + // After this step, the data within the main grid ranging from + // (0, 0, NGHOST) -> grid.m.x, grid.m.y, NGHOST + grid.n.z + // has been synchronized and transferred to appropriate subgrids + + // We loop only to num_devices - 1 since the front and back plate of the grid is not + // transferred because their contents depend on the boundary conditions. + + // IMPORTANT NOTE: the boundary conditions must be applied before calling this function! + // I.e. the halos of subgrids must contain up-to-date data! + // #pragma omp parallel for + for (int i = 0; i < num_devices - 1; ++i) { + const int num_vertices = subgrid.m.x * subgrid.m.y * NGHOST; + // ...|ooooxxx|... -> xxx|ooooooo|... + { + const int3 src = (int3){0, 0, subgrid.n.z}; + const int3 dst = (int3){0, 0, 0}; + copyMeshDeviceToDevice(devices[i], STREAM_PRIMARY, src, devices[(i + 1) % num_devices], + dst, num_vertices); + } + // ...|ooooooo|xxx <- ...|xxxoooo|... + { + const int3 src = (int3){0, 0, NGHOST}; + const int3 dst = (int3){0, 0, NGHOST + subgrid.n.z}; + copyMeshDeviceToDevice(devices[(i + 1) % num_devices], STREAM_PRIMARY, src, devices[i], + dst, num_vertices); + } + } + return AC_SUCCESS; +} + +AcResult +acSynchronizeMesh(void) +{ + acSynchronizeStream(STREAM_ALL); + synchronize_halos(); + acSynchronizeStream(STREAM_ALL); + + return AC_SUCCESS; +} + AcResult acInit(const AcMeshInfo& config) { @@ -235,18 +290,187 @@ acInit(const AcMeshInfo& config) loadGlobalGrid(devices[i], grid); printDeviceInfo(devices[i]); } + + acSynchronizeStream(STREAM_ALL); return AC_SUCCESS; } AcResult acQuit(void) { + acSynchronizeStream(STREAM_ALL); + for (int i = 0; i < num_devices; ++i) { destroyDevice(devices[i]); } return AC_SUCCESS; } +AcResult +acIntegrateStepWithOffset(const int& isubstep, const AcReal& dt, const int3& start, const int3& end) +{ + // See the beginning of the file for an explanation of the index mapping + // #pragma omp parallel for + for (int i = 0; i < num_devices; ++i) { + // DECOMPOSITION OFFSET HERE + const int3 d0 = (int3){NGHOST, NGHOST, NGHOST + i * subgrid.n.z}; + const int3 d1 = d0 + (int3){subgrid.n.x, subgrid.n.y, subgrid.n.z}; + + const int3 da = max(start, d0); + const int3 db = min(end, d1); + + if (db.z >= da.z) { + const int3 da_local = da - (int3){0, 0, i * subgrid.n.z}; + const int3 db_local = db - (int3){0, 0, i * subgrid.n.z}; + rkStep(devices[i], STREAM_PRIMARY, isubstep, da_local, db_local, dt); + } + } + return AC_SUCCESS; +} + +AcResult +acIntegrateStep(const int& isubstep, const AcReal& dt) +{ + const int3 start = (int3){NGHOST, NGHOST, NGHOST}; + const int3 end = start + grid.n; + acIntegrateStepWithOffset(isubstep, dt, start, end); + + return AC_SUCCESS; +} + +static AcResult +local_boundcondstep(void) +{ + if (num_devices == 1) { + boundcondStep(devices[0], STREAM_PRIMARY, (int3){0, 0, 0}, subgrid.m); + } + else { + // Local boundary conditions + // #pragma omp parallel for + for (int i = 0; i < num_devices; ++i) { + const int3 d0 = 
(int3){0, 0, NGHOST}; // DECOMPOSITION OFFSET HERE + const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.n.z}; + boundcondStep(devices[i], STREAM_PRIMARY, d0, d1); + } + } + return AC_SUCCESS; +} + +static AcResult +global_boundcondstep(void) +{ + if (num_devices > 1) { + // With periodic boundary conditions we exchange the front and back plates of the + // grid. The exchange is done between the first and last device (0 and num_devices - 1). + const int num_vertices = subgrid.m.x * subgrid.m.y * NGHOST; + // ...|ooooxxx|... -> xxx|ooooooo|... + { + const int3 src = (int3){0, 0, subgrid.n.z}; + const int3 dst = (int3){0, 0, 0}; + copyMeshDeviceToDevice(devices[num_devices - 1], STREAM_PRIMARY, src, devices[0], dst, + num_vertices); + } + // ...|ooooooo|xxx <- ...|xxxoooo|... + { + const int3 src = (int3){0, 0, NGHOST}; + const int3 dst = (int3){0, 0, NGHOST + subgrid.n.z}; + copyMeshDeviceToDevice(devices[0], STREAM_PRIMARY, src, devices[num_devices - 1], dst, + num_vertices); + } + } + return AC_SUCCESS; +} + +AcResult +acBoundcondStep(void) +{ + local_boundcondstep(); + acSynchronizeStream(STREAM_ALL); + global_boundcondstep(); + synchronize_halos(); + acSynchronizeStream(STREAM_ALL); + + return AC_SUCCESS; +} + +static AcResult +swap_buffers(void) +{ + // #pragma omp parallel for + for (int i = 0; i < num_devices; ++i) { + swapBuffers(devices[i]); + } + return AC_SUCCESS; +} + +AcResult +acIntegrate(const AcReal& dt) +{ + acSynchronizeStream(STREAM_ALL); + for (int isubstep = 0; isubstep < 3; ++isubstep) { + acIntegrateStep(isubstep, dt); // Note: boundaries must be initialized. + swap_buffers(); + acBoundcondStep(); + } + return AC_SUCCESS; +} + +static AcReal +simple_final_reduce_scal(const ReductionType& rtype, const AcReal* results, const int& n) +{ + AcReal res = results[0]; + for (int i = 1; i < n; ++i) { + if (rtype == RTYPE_MAX) { + res = max(res, results[i]); + } + else if (rtype == RTYPE_MIN) { + res = min(res, results[i]); + } + else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) { + res = sum(res, results[i]); + } + else { + ERROR("Invalid rtype"); + } + } + + if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) { + const AcReal inv_n = AcReal(1.) 
/ (grid.n.x * grid.n.y * grid.n.z); + res = sqrt(inv_n * res); + } + + return res; +} + +AcReal +acReduceScal(const ReductionType& rtype, const VertexBufferHandle& vtxbuffer_handle) +{ + acSynchronizeStream(STREAM_ALL); + + AcReal results[num_devices]; + // #pragma omp parallel for + for (int i = 0; i < num_devices; ++i) { + reduceScal(devices[i], STREAM_PRIMARY, rtype, vtxbuffer_handle, &results[i]); + } + + return simple_final_reduce_scal(rtype, results, num_devices); +} + +AcReal +acReduceVec(const ReductionType& rtype, const VertexBufferHandle& a, const VertexBufferHandle& b, + const VertexBufferHandle& c) +{ + acSynchronizeStream(STREAM_ALL); + + AcReal results[num_devices]; + // #pragma omp parallel for + for (int i = 0; i < num_devices; ++i) { + reduceVec(devices[i], STREAM_PRIMARY, rtype, a, b, c, &results[i]); + } + + return simple_final_reduce_scal(rtype, results, num_devices); +} + AcResult acLoadWithOffset(const AcMesh& host_mesh, const int3& src, const int num_vertices) { @@ -284,6 +508,14 @@ acLoadWithOffset(const AcMesh& host_mesh, const int3& src, const int num_vertice return AC_SUCCESS; } +AcResult +acLoad(const AcMesh& host_mesh) +{ + acLoadWithOffset(host_mesh, (int3){0, 0, 0}, AC_VTXBUF_SIZE(host_mesh.info)); + acSynchronizeStream(STREAM_ALL); + return AC_SUCCESS; +} + AcResult acStoreWithOffset(const int3& src, const int num_vertices, AcMesh* host_mesh) { @@ -308,241 +540,14 @@ acStoreWithOffset(const int3& src, const int num_vertices, AcMesh* host_mesh) return AC_SUCCESS; } -// acCopyMeshToDevice -AcResult -acLoad(const AcMesh& host_mesh) -{ - return acLoadWithOffset(host_mesh, (int3){0, 0, 0}, AC_VTXBUF_SIZE(host_mesh.info)); -} - -// acCopyMeshToHost AcResult acStore(AcMesh* host_mesh) { - return acStoreWithOffset((int3){0, 0, 0}, AC_VTXBUF_SIZE(host_mesh->info), host_mesh); -} - -static AcResult -acSwapBuffers(void) -{ - // #pragma omp parallel for - for (int i = 0; i < num_devices; ++i) { - swapBuffers(devices[i]); - } - return AC_SUCCESS; -} - -static AcResult -acSynchronizeHalos(void) -{ - // Exchanges the halos of subgrids - // After this step, the data within the main grid ranging from - // (0, 0, NGHOST) -> grid.m.x, grid.m.y, NGHOST + grid.n.z - // has been synchronized and transferred to appropriate subgrids - - // We loop only to num_devices - 1 since the front and back plate of the grid is not - // transferred because their contents depend on the boundary conditions. - - // IMPORTANT NOTE: the boundary conditions must be applied before calling this function! - // I.e. the halos of subgrids must contain up-to-date data! - // #pragma omp parallel for - for (int i = 0; i < num_devices - 1; ++i) { - const int num_vertices = subgrid.m.x * subgrid.m.y * NGHOST; - // ...|ooooxxx|... -> xxx|ooooooo|... - { - const int3 src = (int3){0, 0, subgrid.n.z}; - const int3 dst = (int3){0, 0, 0}; - copyMeshDeviceToDevice(devices[i], STREAM_PRIMARY, src, devices[(i + 1) % num_devices], - dst, num_vertices); - } - // ...|ooooooo|xxx <- ...|xxxoooo|... 
- { - const int3 src = (int3){0, 0, NGHOST}; - const int3 dst = (int3){0, 0, NGHOST + subgrid.n.z}; - copyMeshDeviceToDevice(devices[(i + 1) % num_devices], STREAM_PRIMARY, src, devices[i], - dst, num_vertices); - } - } - return AC_SUCCESS; -} - -static AcResult -acSynchronizeStream(const StreamType stream) -{ - // #pragma omp parallel for - for (int i = 0; i < num_devices; ++i) { - synchronize(devices[i], stream); - } - - return AC_SUCCESS; -} - -AcResult -acSynchronize(void) -{ + acStoreWithOffset((int3){0, 0, 0}, AC_VTXBUF_SIZE(host_mesh->info), host_mesh); acSynchronizeStream(STREAM_ALL); - acSynchronizeHalos(); - acSynchronizeStream(STREAM_ALL); - return AC_SUCCESS; } -static AcResult -acLocalBoundcondStep(void) -{ - if (num_devices == 1) { - boundcondStep(devices[0], STREAM_PRIMARY, (int3){0, 0, 0}, subgrid.m); - } - else { - // Local boundary conditions - // #pragma omp parallel for - for (int i = 0; i < num_devices; ++i) { - const int3 d0 = (int3){0, 0, NGHOST}; // DECOMPOSITION OFFSET HERE - const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.n.z}; - boundcondStep(devices[i], STREAM_PRIMARY, d0, d1); - } - } - return AC_SUCCESS; -} - -static AcResult -acGlobalBoundcondStep(void) -{ - if (num_devices > 1) { - // With periodic boundary conditions we exchange the front and back plates of the - // grid. The exchange is done between the first and last device (0 and num_devices - 1). - const int num_vertices = subgrid.m.x * subgrid.m.y * NGHOST; - // ...|ooooxxx|... -> xxx|ooooooo|... - { - const int3 src = (int3){0, 0, subgrid.n.z}; - const int3 dst = (int3){0, 0, 0}; - copyMeshDeviceToDevice(devices[num_devices - 1], STREAM_PRIMARY, src, devices[0], dst, - num_vertices); - } - // ...|ooooooo|xxx <- ...|xxxoooo|... - { - const int3 src = (int3){0, 0, NGHOST}; - const int3 dst = (int3){0, 0, NGHOST + subgrid.n.z}; - copyMeshDeviceToDevice(devices[0], STREAM_PRIMARY, src, devices[num_devices - 1], dst, - num_vertices); - } - } - return AC_SUCCESS; -} - -AcResult -acBoundcondStep(void) -{ - acLocalBoundcondStep(); - acSynchronizeStream(STREAM_ALL); - acGlobalBoundcondStep(); - acSynchronizeHalos(); - acSynchronizeStream(STREAM_ALL); - - return AC_SUCCESS; -} - -AcResult -acIntegrateStepWithOffset(const int& isubstep, const AcReal& dt, const int3& start, const int3& end) -{ - // See the beginning of the file for an explanation of the index mapping - // #pragma omp parallel for - for (int i = 0; i < num_devices; ++i) { - // DECOMPOSITION OFFSET HERE - const int3 d0 = (int3){NGHOST, NGHOST, NGHOST + i * subgrid.n.z}; - const int3 d1 = d0 + (int3){subgrid.n.x, subgrid.n.y, subgrid.n.z}; - - const int3 da = max(start, d0); - const int3 db = min(end, d1); - - if (db.z >= da.z) { - const int3 da_local = da - (int3){0, 0, i * subgrid.n.z}; - const int3 db_local = db - (int3){0, 0, i * subgrid.n.z}; - rkStep(devices[i], STREAM_PRIMARY, isubstep, da_local, db_local, dt); - } - } - return AC_SUCCESS; -} - -AcResult -acIntegrateStep(const int& isubstep, const AcReal& dt) -{ - const int3 start = (int3){NGHOST, NGHOST, NGHOST}; - const int3 end = start + grid.n; - acIntegrateStepWithOffset(isubstep, dt, start, end); - - return AC_SUCCESS; -} - -AcResult -acIntegrate(const AcReal& dt) -{ - for (int isubstep = 0; isubstep < 3; ++isubstep) { - acIntegrateStep(isubstep, dt); // Note: boundaries must be initialized. 
- acSwapBuffers(); - acLocalBoundcondStep(); - acSynchronizeStream(STREAM_ALL); - acGlobalBoundcondStep(); - acSynchronizeHalos(); - acSynchronizeStream(STREAM_ALL); - } - return AC_SUCCESS; -} - -static AcReal -simple_final_reduce_scal(const ReductionType& rtype, const AcReal* results, const int& n) -{ - AcReal res = results[0]; - for (int i = 1; i < n; ++i) { - if (rtype == RTYPE_MAX) { - res = max(res, results[i]); - } - else if (rtype == RTYPE_MIN) { - res = min(res, results[i]); - } - else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) { - res = sum(res, results[i]); - } - else { - ERROR("Invalid rtype"); - } - } - - if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) { - const AcReal inv_n = AcReal(1.) / (grid.n.x * grid.n.y * grid.n.z); - res = sqrt(inv_n * res); - } - - return res; -} - -AcReal -acReduceScal(const ReductionType& rtype, const VertexBufferHandle& vtxbuffer_handle) -{ - AcReal results[num_devices]; - - // #pragma omp parallel for - for (int i = 0; i < num_devices; ++i) { - reduceScal(devices[i], STREAM_PRIMARY, rtype, vtxbuffer_handle, &results[i]); - } - - return simple_final_reduce_scal(rtype, results, num_devices); -} - -AcReal -acReduceVec(const ReductionType& rtype, const VertexBufferHandle& a, const VertexBufferHandle& b, - const VertexBufferHandle& c) -{ - AcReal results[num_devices]; - - // #pragma omp parallel for - for (int i = 0; i < num_devices; ++i) { - reduceVec(devices[i], STREAM_PRIMARY, rtype, a, b, c, &results[i]); - } - - return simple_final_reduce_scal(rtype, results, num_devices); -} - AcResult acLoadDeviceConstant(const AcRealParam param, const AcReal value) { @@ -552,9 +557,3 @@ acLoadDeviceConstant(const AcRealParam param, const AcReal value) } return AC_SUCCESS; } - -AcResult -acAutoOptimize(void) -{ - return autoOptimize(devices[0]); -} diff --git a/src/core/device.cuh b/src/core/device.cuh index 228f4e9..bd2a308 100644 --- a/src/core/device.cuh +++ b/src/core/device.cuh @@ -27,15 +27,6 @@ #pragma once #include "astaroth.h" -// clang-format off -typedef enum { - STREAM_PRIMARY, - STREAM_SECONDARY, - NUM_STREAM_TYPES, - STREAM_ALL -} StreamType; -// clang-format on - typedef struct { int3 m; int3 n; diff --git a/src/standalone/autotest.cc b/src/standalone/autotest.cc index 6c1855e..bb8b55f 100644 --- a/src/standalone/autotest.cc +++ b/src/standalone/autotest.cc @@ -413,8 +413,6 @@ check_rk3(const AcMeshInfo& mesh_info) const AcReal dt = AcReal(1e-2); // Use a small constant timestep to avoid instabilities acIntegrate(dt); - acBoundcondStep(); - acSynchronize(); model_rk3(dt, model_mesh); boundconds(model_mesh->info, model_mesh); @@ -712,8 +710,6 @@ run_autotest(void) // Device integration step acIntegrate(dt); - acBoundcondStep(); - acSynchronize(); acStore(candidate_mesh); // Check fields diff --git a/src/standalone/benchmark.cc b/src/standalone/benchmark.cc index 03acd63..10ee0b4 100644 --- a/src/standalone/benchmark.cc +++ b/src/standalone/benchmark.cc @@ -122,7 +122,6 @@ run_benchmark(void) // Warmup for (int i = 0; i < 10; ++i) { acIntegrate(0); - acSynchronize(); } Timer t; @@ -135,8 +134,6 @@ run_benchmark(void) #else // GEN_BENCHMARK_FULL acIntegrate(dt); #endif - acSynchronize(); - const double ms_elapsed = timer_diff_nsec(t) / 1e6; results.push_back(ms_elapsed); } diff --git a/src/standalone/renderer.cc b/src/standalone/renderer.cc index 4497001..8587226 100644 --- a/src/standalone/renderer.cc +++ b/src/standalone/renderer.cc @@ -406,7 +406,7 @@ run_renderer(void) const int num_vertices = mesh->info.int_params[AC_mxy]; const int3 
dst = (int3){0, 0, k_slice}; acStoreWithOffset(dst, num_vertices, mesh); - acSynchronize(); + acSynchronizeStream(STREAM_ALL); renderer_draw(*mesh); // Bottleneck is here printf("Step #%d, dt: %f\n", steps, double(dt)); timer_reset(&frame_timer); diff --git a/src/standalone/simulation.cc b/src/standalone/simulation.cc index 69a74c9..133f691 100644 --- a/src/standalone/simulation.cc +++ b/src/standalone/simulation.cc @@ -205,7 +205,6 @@ run_simulation(void) write_mesh_info(&mesh_info); print_diagnostics(0, AcReal(.0), t_step, diag_file); - acSynchronize(); acStore(mesh); save_mesh(*mesh, 0, t_step); @@ -275,8 +274,6 @@ run_simulation(void) acSynchronize(); acStore(mesh); */ - - acSynchronize(); acStore(mesh); save_mesh(*mesh, i, t_step);
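Usage sketch of the reorganized synchronous interface: make_mesh_info(), mesh_create(), and mesh_destroy() below are hypothetical host-side helpers standing in for whatever the caller uses to configure and allocate an AcMesh, and error handling is omitted.

#include "astaroth.h"

// Hypothetical host-side helpers (not part of this interface): configure an
// AcMeshInfo and allocate/free the corresponding host-side AcMesh.
AcMeshInfo make_mesh_info(void);
AcMesh*    mesh_create(const AcMeshInfo& info);
void       mesh_destroy(AcMesh* mesh);

int
main(void)
{
    if (acCheckDeviceAvailability() != AC_SUCCESS)
        return 1; // no CUDA devices available on this node

    const AcMeshInfo info = make_mesh_info();
    AcMesh* mesh          = mesh_create(info);

    acInit(info);  // allocates and initializes everything GPU-side
    acLoad(*mesh); // distributes the host mesh among the GPUs; synchronous

    const AcReal dt = AcReal(1e-2);
    for (int step = 0; step < 100; ++step)
        acIntegrate(dt); // all three RK3 substeps; boundary conditions applied after the final one

    acStore(mesh); // gathers the distributed mesh back into host memory; synchronous
    acQuit();      // frees all GPU allocations and resets the devices

    mesh_destroy(mesh);
    return 0;
}

The basic functions above are synchronous, so this path needs no explicit acSynchronizeStream() or acSynchronizeMesh() calls.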
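With the advanced asynchronous functions, the caller owns the synchronization points. A sketch mirroring the renderer.cc hunk above, which fetches a single xy-plane of the distributed mesh back to the host before drawing it (renderer_draw is the standalone renderer's drawing routine, k_slice the z-index of the plane):

static void
fetch_and_draw_slice(AcMesh* mesh, const int k_slice)
{
    const int  num_vertices = mesh->info.int_params[AC_mxy]; // vertices in one xy-plane
    const int3 dst          = (int3){0, 0, k_slice};

    acStoreWithOffset(dst, num_vertices, mesh); // asynchronous: only enqueues the transfer
    acSynchronizeStream(STREAM_ALL);            // block until the plane has landed in host memory
    renderer_draw(*mesh);                       // now safe to read the fetched slice
}

Likewise, if an asynchronous call such as acBoundcondStep() has modified the halos of neighboring GPUs, acSynchronizeMesh() must be called before that data is used.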
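The node-level decomposition is one-dimensional along z: device i owns the global z-planes NGHOST + i*subgrid.n.z through NGHOST + (i+1)*subgrid.n.z - 1 of the computational domain, which is the range acIntegrateStepWithOffset clamps each device's work against. A sketch of the corresponding global-to-local index conversion, assuming that layout:

// Map a global z-index inside the computational domain to the device that owns
// the plane and to the same index in that device's local coordinates.
// Assumes the 1D z-decomposition used above: device i owns global z in
// [NGHOST + i*subgrid_nz, NGHOST + (i+1)*subgrid_nz).
static void
global_to_local_z(const int global_z, const int subgrid_nz, int* device_id, int* local_z)
{
    *device_id = (global_z - NGHOST) / subgrid_nz;   // which subgrid owns this plane
    *local_z   = global_z - *device_id * subgrid_nz; // same plane in local coordinates
}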