diff --git a/include/astaroth.h b/include/astaroth.h
index 7e8a3a8..848740b 100644
--- a/include/astaroth.h
+++ b/include/astaroth.h
@@ -215,8 +215,7 @@ typedef struct {
  * =============================================================================
  */
 typedef enum {
-    STREAM_PRIMARY,   //
-    STREAM_SECONDARY, //
+    STREAM_DEFAULT,
     NUM_STREAM_TYPES, //
     STREAM_ALL
 } StreamType;
@@ -275,25 +274,35 @@ AcResult acStore(AcMesh* host_mesh);
  */
 /** Loads a parameter to the constant memory of all GPUs in the node. Asynchronous. */
 AcResult acLoadDeviceConstant(const AcRealParam param, const AcReal value);
+AcResult acLoadDeviceConstantAsync(const AcRealParam param, const AcReal value,
+                                   const StreamType stream);
 
 /** Splits a subset of the host_mesh and distributes it among the GPUs in the node. Asynchronous. */
 AcResult acLoadWithOffset(const AcMesh& host_mesh, const int3& start, const int num_vertices);
+AcResult acLoadWithOffsetAsync(const AcMesh& host_mesh, const int3& start, const int num_vertices,
+                               const StreamType stream);
 
 /** Gathers a subset of the data distributed among the GPUs in the node and stores the mesh back to
  * CPU memory. Asynchronous.
  */
 AcResult acStoreWithOffset(const int3& start, const int num_vertices, AcMesh* host_mesh);
+AcResult acStoreWithOffsetAsync(const int3& start, const int num_vertices, AcMesh* host_mesh,
+                                const StreamType stream);
 
 /** Performs a single RK3 step without computing boundary conditions. Asynchronous.*/
 AcResult acIntegrateStep(const int& isubstep, const AcReal& dt);
+AcResult acIntegrateStepAsync(const int& isubstep, const AcReal& dt, const StreamType stream);
 
 /** Performs a single RK3 step on a subset of the mesh without computing the boundary conditions.
  * Asynchronous.*/
 AcResult acIntegrateStepWithOffset(const int& isubstep, const AcReal& dt, const int3& start,
                                    const int3& end);
+AcResult acIntegrateStepWithOffsetAsync(const int& isubstep, const AcReal& dt, const int3& start,
+                                        const int3& end, const StreamType stream);
 
 /** Performs the boundary condition step on the GPUs in the node. Asynchronous. */
 AcResult acBoundcondStep(void);
+AcResult acBoundcondStepAsync(const StreamType stream);
 
 /* End extern "C" */
 #ifdef __cplusplus
diff --git a/src/core/astaroth.cu b/src/core/astaroth.cu
index 5decb78..8cfcd2c 100644
--- a/src/core/astaroth.cu
+++ b/src/core/astaroth.cu
@@ -195,7 +195,7 @@ acSynchronizeStream(const StreamType stream)
 }
 
 static AcResult
-synchronize_halos(void)
+synchronize_halos(const StreamType stream)
 {
     // Exchanges the halos of subgrids
     // After this step, the data within the main grid ranging from
@@ -214,15 +214,15 @@ synchronize_halos(void)
         {
             const int3 src = (int3){0, 0, subgrid.n.z};
             const int3 dst = (int3){0, 0, 0};
-            copyMeshDeviceToDevice(devices[i], STREAM_PRIMARY, src, devices[(i + 1) % num_devices],
-                                   dst, num_vertices);
+            copyMeshDeviceToDevice(devices[i], stream, src, devices[(i + 1) % num_devices], dst,
+                                   num_vertices);
         }
         // ...|ooooooo|xxx <- ...|xxxoooo|...
         {
             const int3 src = (int3){0, 0, NGHOST};
             const int3 dst = (int3){0, 0, NGHOST + subgrid.n.z};
-            copyMeshDeviceToDevice(devices[(i + 1) % num_devices], STREAM_PRIMARY, src, devices[i],
-                                   dst, num_vertices);
+            copyMeshDeviceToDevice(devices[(i + 1) % num_devices], stream, src, devices[i], dst,
+                                   num_vertices);
         }
     }
     return AC_SUCCESS;
@@ -232,7 +232,7 @@ AcResult
 acSynchronizeMesh(void)
 {
     acSynchronizeStream(STREAM_ALL);
-    synchronize_halos();
+    synchronize_halos(STREAM_DEFAULT);
     acSynchronizeStream(STREAM_ALL);
 
     return AC_SUCCESS;
@@ -307,7 +307,8 @@ acQuit(void)
 }
 
 AcResult
-acIntegrateStepWithOffset(const int& isubstep, const AcReal& dt, const int3& start, const int3& end)
+acIntegrateStepWithOffsetAsync(const int& isubstep, const AcReal& dt, const int3& start,
+                               const int3& end, const StreamType stream)
 {
     // See the beginning of the file for an explanation of the index mapping
     // #pragma omp parallel for
@@ -322,27 +323,37 @@ acIntegrateStepWithOffset(const int& isubstep, const AcReal& dt, const int3& sta
         if (db.z >= da.z) {
             const int3 da_local = da - (int3){0, 0, i * subgrid.n.z};
             const int3 db_local = db - (int3){0, 0, i * subgrid.n.z};
-            rkStep(devices[i], STREAM_PRIMARY, isubstep, da_local, db_local, dt);
+            rkStep(devices[i], stream, isubstep, da_local, db_local, dt);
         }
     }
     return AC_SUCCESS;
 }
 
 AcResult
-acIntegrateStep(const int& isubstep, const AcReal& dt)
+acIntegrateStepWithOffset(const int& isubstep, const AcReal& dt, const int3& start, const int3& end)
+{
+    return acIntegrateStepWithOffsetAsync(isubstep, dt, start, end, STREAM_DEFAULT);
+}
+
+AcResult
+acIntegrateStepAsync(const int& isubstep, const AcReal& dt, const StreamType stream)
 {
     const int3 start = (int3){NGHOST, NGHOST, NGHOST};
     const int3 end   = start + grid.n;
-    acIntegrateStepWithOffset(isubstep, dt, start, end);
+    return acIntegrateStepWithOffsetAsync(isubstep, dt, start, end, stream);
+}
 
-    return AC_SUCCESS;
+AcResult
+acIntegrateStep(const int& isubstep, const AcReal& dt)
+{
+    return acIntegrateStepAsync(isubstep, dt, STREAM_DEFAULT);
 }
 
 static AcResult
-local_boundcondstep(void)
+local_boundcondstep(const StreamType stream)
 {
     if (num_devices == 1) {
-        boundcondStep(devices[0], STREAM_PRIMARY, (int3){0, 0, 0}, subgrid.m);
+        boundcondStep(devices[0], stream, (int3){0, 0, 0}, subgrid.m);
     }
     else {
         // Local boundary conditions
@@ -350,14 +361,14 @@ local_boundcondstep(void)
         for (int i = 0; i < num_devices; ++i) {
             const int3 d0 = (int3){0, 0, NGHOST}; // DECOMPOSITION OFFSET HERE
             const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.n.z};
-            boundcondStep(devices[i], STREAM_PRIMARY, d0, d1);
+            boundcondStep(devices[i], stream, d0, d1);
         }
     }
     return AC_SUCCESS;
 }
 
 static AcResult
-global_boundcondstep(void)
+global_boundcondstep(const StreamType stream)
 {
     if (num_devices > 1) {
         // With periodic boundary conditions we exchange the front and back plates of the
@@ -367,30 +378,37 @@ global_boundcondstep(void)
         {
             const int3 src = (int3){0, 0, subgrid.n.z};
             const int3 dst = (int3){0, 0, 0};
-            copyMeshDeviceToDevice(devices[num_devices - 1], STREAM_PRIMARY, src, devices[0], dst,
+            copyMeshDeviceToDevice(devices[num_devices - 1], stream, src, devices[0], dst,
                                    num_vertices);
         }
         // ...|ooooooo|xxx <- ...|xxxoooo|...
         {
             const int3 src = (int3){0, 0, NGHOST};
             const int3 dst = (int3){0, 0, NGHOST + subgrid.n.z};
-            copyMeshDeviceToDevice(devices[0], STREAM_PRIMARY, src, devices[num_devices - 1], dst,
+            copyMeshDeviceToDevice(devices[0], stream, src, devices[num_devices - 1], dst,
                                    num_vertices);
         }
     }
     return AC_SUCCESS;
 }
 
+AcResult
+acBoundcondStepAsync(const StreamType stream)
+{
+    ERRCHK_ALWAYS(stream < NUM_STREAM_TYPES);
+
+    local_boundcondstep(stream);
+    acSynchronizeStream(stream);
+    global_boundcondstep(stream);
+    synchronize_halos(stream);
+    acSynchronizeStream(stream);
+    return AC_SUCCESS;
+}
+
 AcResult
 acBoundcondStep(void)
 {
-    local_boundcondstep();
-    acSynchronizeStream(STREAM_ALL);
-    global_boundcondstep();
-    synchronize_halos();
-    acSynchronizeStream(STREAM_ALL);
-
-    return AC_SUCCESS;
+    return acBoundcondStepAsync(STREAM_DEFAULT);
 }
 
 static AcResult
@@ -450,7 +468,7 @@ acReduceScal(const ReductionType& rtype, const VertexBufferHandle& vtxbuffer_han
     AcReal results[num_devices];
     // #pragma omp parallel for
     for (int i = 0; i < num_devices; ++i) {
-        reduceScal(devices[i], STREAM_PRIMARY, rtype, vtxbuffer_handle, &results[i]);
+        reduceScal(devices[i], STREAM_DEFAULT, rtype, vtxbuffer_handle, &results[i]);
     }
 
     return simple_final_reduce_scal(rtype, results, num_devices);
@@ -465,14 +483,15 @@ acReduceVec(const ReductionType& rtype, const VertexBufferHandle& a, const Verte
     AcReal results[num_devices];
     // #pragma omp parallel for
     for (int i = 0; i < num_devices; ++i) {
-        reduceVec(devices[i], STREAM_PRIMARY, rtype, a, b, c, &results[i]);
+        reduceVec(devices[i], STREAM_DEFAULT, rtype, a, b, c, &results[i]);
     }
 
     return simple_final_reduce_scal(rtype, results, num_devices);
 }
 
 AcResult
-acLoadWithOffset(const AcMesh& host_mesh, const int3& src, const int num_vertices)
+acLoadWithOffsetAsync(const AcMesh& host_mesh, const int3& src, const int num_vertices,
+                      const StreamType stream)
 {
     // See the beginning of the file for an explanation of the index mapping
     // #pragma omp parallel for
@@ -501,13 +520,19 @@ acLoadWithOffset(const AcMesh& host_mesh, const int3& src, const int num_vertice
             const int3 da_local = (int3){da.x, da.y, da.z - i * grid.n.z / num_devices};
             // printf("\t\tcopy %d cells to local index ", copy_cells); printInt3(da_local);
             // printf("\n");
-            copyMeshToDevice(devices[i], STREAM_PRIMARY, host_mesh, da, da_local, copy_cells);
+            copyMeshToDevice(devices[i], stream, host_mesh, da, da_local, copy_cells);
         }
         // printf("\n");
     }
     return AC_SUCCESS;
 }
 
+AcResult
+acLoadWithOffset(const AcMesh& host_mesh, const int3& src, const int num_vertices)
+{
+    return acLoadWithOffsetAsync(host_mesh, src, num_vertices, STREAM_DEFAULT);
+}
+
 AcResult
 acLoad(const AcMesh& host_mesh)
 {
@@ -517,7 +542,8 @@ acLoad(const AcMesh& host_mesh)
 }
 
 AcResult
-acStoreWithOffset(const int3& src, const int num_vertices, AcMesh* host_mesh)
+acStoreWithOffsetAsync(const int3& src, const int num_vertices, AcMesh* host_mesh,
+                       const StreamType stream)
 {
     // See the beginning of the file for an explanation of the index mapping
     // #pragma omp parallel for
@@ -534,12 +560,18 @@ acStoreWithOffset(const int3& src, const int num_vertices, AcMesh* host_mesh)
             const int copy_cells = gridIdx(subgrid, db) - gridIdx(subgrid, da);
             // DECOMPOSITION OFFSET HERE
             const int3 da_local = (int3){da.x, da.y, da.z - i * grid.n.z / num_devices};
-            copyMeshToHost(devices[i], STREAM_PRIMARY, da_local, da, copy_cells, host_mesh);
+            copyMeshToHost(devices[i], stream, da_local, da, copy_cells, host_mesh);
         }
     }
     return AC_SUCCESS;
 }
 
+AcResult
+acStoreWithOffset(const int3& src, const int num_vertices, AcMesh* host_mesh)
+{
+    return acStoreWithOffsetAsync(src, num_vertices, host_mesh, STREAM_DEFAULT);
+}
+
 AcResult
 acStore(AcMesh* host_mesh)
 {
@@ -549,12 +581,17 @@ acStore(AcMesh* host_mesh)
 }
 
 AcResult
-acLoadDeviceConstant(const AcRealParam param, const AcReal value)
+acLoadDeviceConstantAsync(const AcRealParam param, const AcReal value, const StreamType stream)
 {
-    const StreamType stream = STREAM_PRIMARY;
     // #pragma omp parallel for
     for (int i = 0; i < num_devices; ++i) {
         loadDeviceConstant(devices[i], stream, param, value);
     }
     return AC_SUCCESS;
 }
+
+AcResult
+acLoadDeviceConstant(const AcRealParam param, const AcReal value)
+{
+    return acLoadDeviceConstantAsync(param, value, STREAM_DEFAULT);
+}