Merge branch 'master' into sink_20190723

Conflicts:
	acc/mhd_solver/stencil_defines.h
	src/standalone/simulation.cc
JackHsu
2019-08-08 12:17:28 +08:00
21 changed files with 2034 additions and 1207 deletions


@@ -21,9 +21,12 @@ der6x_upwd(in Scalar vertex)
return (Scalar){ Scalar(1.0/60.0)*inv_ds* (
- Scalar(20.0)* vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z]
+ Scalar(15.0)*(vertex[vertexIdx.x+1, vertexIdx.y, vertexIdx.z] + vertex[vertexIdx.x-1, vertexIdx.y, vertexIdx.z])
- Scalar( 6.0)*(vertex[vertexIdx.x+2, vertexIdx.y, vertexIdx.z] + vertex[vertexIdx.x-2, vertexIdx.y, vertexIdx.z])
+ vertex[vertexIdx.x+3, vertexIdx.y, vertexIdx.z] + vertex[vertexIdx.x-3, vertexIdx.y, vertexIdx.z])};
+ Scalar(15.0)*(vertex[vertexIdx.x+1, vertexIdx.y, vertexIdx.z]
+ vertex[vertexIdx.x-1, vertexIdx.y, vertexIdx.z])
- Scalar( 6.0)*(vertex[vertexIdx.x+2, vertexIdx.y, vertexIdx.z]
+ vertex[vertexIdx.x-2, vertexIdx.y, vertexIdx.z])
+ vertex[vertexIdx.x+3, vertexIdx.y, vertexIdx.z]
+ vertex[vertexIdx.x-3, vertexIdx.y, vertexIdx.z])};
}
Preprocessed Scalar
@@ -33,9 +36,12 @@ der6y_upwd(in Scalar vertex)
return (Scalar){ Scalar(1.0/60.0)*inv_ds* (
-Scalar( 20.0)* vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z]
+Scalar( 15.0)*(vertex[vertexIdx.x, vertexIdx.y+1, vertexIdx.z] + vertex[vertexIdx.x, vertexIdx.y-1, vertexIdx.z])
-Scalar( 6.0)*(vertex[vertexIdx.x, vertexIdx.y+2, vertexIdx.z] + vertex[vertexIdx.x, vertexIdx.y-2, vertexIdx.z])
+ vertex[vertexIdx.x, vertexIdx.y+3, vertexIdx.z] + vertex[vertexIdx.x, vertexIdx.y-3, vertexIdx.z])};
+Scalar( 15.0)*(vertex[vertexIdx.x, vertexIdx.y+1, vertexIdx.z]
+ vertex[vertexIdx.x, vertexIdx.y-1, vertexIdx.z])
-Scalar( 6.0)*(vertex[vertexIdx.x, vertexIdx.y+2, vertexIdx.z]
+ vertex[vertexIdx.x, vertexIdx.y-2, vertexIdx.z])
+ vertex[vertexIdx.x, vertexIdx.y+3, vertexIdx.z]
+ vertex[vertexIdx.x, vertexIdx.y-3, vertexIdx.z])};
}
Preprocessed Scalar
@@ -45,9 +51,12 @@ der6z_upwd(in Scalar vertex)
return (Scalar){ Scalar(1.0/60.0)*inv_ds* (
-Scalar( 20.0)* vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z]
+Scalar( 15.0)*(vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z+1] + vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z-1])
-Scalar( 6.0)*(vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z+2] + vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z-2])
+ vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z+3] + vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z-3])};
+Scalar( 15.0)*(vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z+1]
+ vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z-1])
-Scalar( 6.0)*(vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z+2]
+ vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z-2])
+ vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z+3]
+ vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z-3])};
}
#endif
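For reference, the change in all three hunks above appears to be purely a re-wrap of the same expression; the stencil coefficients are unchanged. A sketch of the formula the code implements (my reading of the diff, not part of the commit):

\[ \mathrm{der6\_upwd}(f)_i = \frac{1}{60\,\Delta s}\Bigl(-20\,f_i + 15\,(f_{i+1}+f_{i-1}) - 6\,(f_{i+2}+f_{i-2}) + f_{i+3} + f_{i-3}\Bigr) \]

with \(\Delta s\) the grid spacing (inv_ds = \(1/\Delta s\)) and the offsets taken along x, y, or z in the respective function.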


@@ -309,23 +309,26 @@ helical_forcing(Scalar magnitude, Vector k_force, Vector xx, Vector ff_re, Vecto
// JP: This looks wrong:
// 1) Should it be dsx * nx instead of dsx * ny?
// 2) Should you also use globalGrid.n instead of the local n?
// MV: You are right. Made a quick fix. I did not see the error because the multi-GPU domain is split
// in the z direction, not the y direction.
// 3) Also final point: can we do this with vectors/quaternions instead?
// Trigonometric functions are much more expensive and inaccurate.
// MV: Good idea. Not an immediate priority.
// Fun related article:
// https://randomascii.wordpress.com/2014/10/09/intel-underestimates-error-bounds-by-1-3-quintillion/
xx.x = xx.x*(2.0*M_PI/(dsx*(DCONST_INT(AC_ny_max) - DCONST_INT(AC_ny_min))));
xx.y = xx.y*(2.0*M_PI/(dsy*(DCONST_INT(AC_ny_max) - DCONST_INT(AC_ny_min))));
xx.z = xx.z*(2.0*M_PI/(dsz*(DCONST_INT(AC_ny_max) - DCONST_INT(AC_ny_min))));
xx.x = xx.x*(2.0*M_PI/(dsx*globalGridN.x));
xx.y = xx.y*(2.0*M_PI/(dsy*globalGridN.y));
xx.z = xx.z*(2.0*M_PI/(dsz*globalGridN.z));
Scalar cos_phi = cos(phi);
Scalar sin_phi = sin(phi);
Scalar cos_k_dox_x = cos(dot(k_force, xx));
Scalar sin_k_dox_x = sin(dot(k_force, xx));
Scalar cos_k_dot_x = cos(dot(k_force, xx));
Scalar sin_k_dot_x = sin(dot(k_force, xx));
// The phase affects only the x-component
//Scalar real_comp = cos_k_dox_x;
//Scalar imag_comp = sin_k_dox_x;
Scalar real_comp_phase = cos_k_dox_x*cos_phi - sin_k_dox_x*sin_phi;
Scalar imag_comp_phase = cos_k_dox_x*sin_phi + sin_k_dox_x*cos_phi;
//Scalar real_comp = cos_k_dot_x;
//Scalar imag_comp = sin_k_dot_x;
Scalar real_comp_phase = cos_k_dot_x*cos_phi - sin_k_dot_x*sin_phi;
Scalar imag_comp_phase = cos_k_dot_x*sin_phi + sin_k_dot_x*cos_phi;
Vector force = (Vector){ ff_re.x*real_comp_phase - ff_im.x*imag_comp_phase,
@@ -338,9 +341,9 @@ helical_forcing(Scalar magnitude, Vector k_force, Vector xx, Vector ff_re, Vecto
Vector
forcing(int3 globalVertexIdx, Scalar dt)
{
Vector a = Scalar(.5) * (Vector){globalGrid.n.x * dsx,
globalGrid.n.y * dsy,
globalGrid.n.z * dsz}; // source (origin)
Vector a = Scalar(.5) * (Vector){globalGridN.x * dsx,
globalGridN.y * dsy,
globalGridN.z * dsz}; // source (origin)
Vector xx = (Vector){(globalVertexIdx.x - nx_min) * dsx,
(globalVertexIdx.y - ny_min) * dsy,
(globalVertexIdx.z - nz_min) * dsz}; // sink (current index)
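The renamed cos_k_dot_x/sin_k_dot_x variables and the phase terms above are just the angle-addition identity; a sketch of the relation the code appears to evaluate (not part of the diff):

\[ \mathrm{real\_comp\_phase} = \cos(\mathbf{k}\cdot\mathbf{x})\cos\phi - \sin(\mathbf{k}\cdot\mathbf{x})\sin\phi = \cos(\mathbf{k}\cdot\mathbf{x}+\phi), \qquad \mathrm{imag\_comp\_phase} = \sin(\mathbf{k}\cdot\mathbf{x}+\phi). \]

This is also why MV's vector/quaternion suggestion is plausible: the pair (real, imag) is just the unit complex number \(e^{i\,\mathbf{k}\cdot\mathbf{x}}\) rotated by the phase \(\phi\).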

doc/manual/.gitignore (vendored, new file, 2 additions)

@@ -0,0 +1,2 @@
*.html
*.pdf


@@ -24,95 +24,62 @@ extern "C" {
#include "astaroth_defines.h"
/** Checks whether there are any CUDA devices available. Returns AC_SUCCESS if there is 1 or more,
* AC_FAILURE otherwise. */
AcResult acCheckDeviceAvailability(void);
#include "astaroth_device.h"
#include "astaroth_node.h"
/** Synchronizes the stream shared by all GPUs in the node. Synchronizes all streams if STREAM_ALL
* is passed as a parameter. */
AcResult acSynchronizeStream(const StreamType stream);
/*
#include "astaroth_grid.h"
#define acInit(x) acGridInit(x)
#define acQuit() acGridQuit()
#define acLoad(x) acGridLoadMesh(STREAM_DEFAULT, x)
#define acReduceScal(x, y) acGridReduceScal(STREAM_DEFAULT, x, y)
#define acReduceVec(x, y, z, w) acGridReduceVec(STREAM_DEFAULT, x, y, z, w)
#define acBoundcondStep() acGridPeriodicBoundcondStep(STREAM_DEFAULT)
#define acIntegrate(x) acGridIntegrateStep(STREAM_DEFAULT, x)
#define acStore(x) acGridStoreMesh(STREAM_DEFAULT, x)
#define acSynchronizeStream(x) acGridSynchronizeStream(x)
#define acLoadDeviceConstant(x, y) acGridLoadConstant(STREAM_DEFAULT, x, y)
*/
/** Synchronizes the mesh distributed across multiple GPUs. Must be called if the data in the halos
* of neighboring GPUs has been modified by an asynchronous function, f.ex. acBoundcondStep() */
AcResult acSynchronizeMesh(void);
/** Starting point of all GPU computation. Handles the allocation and
initialization of *all memory needed on all GPUs in the node*. In other words,
sets up everything GPU-side so that calling any other GPU interface function
afterwards does not result in illegal memory accesses. */
/** Allocates all memory and initializes the devices visible to the caller. Should be
* called before any other function in this interface. */
AcResult acInit(const AcMeshInfo mesh_info);
/** Frees all GPU allocations and resets all devices in the node. Should be
* called at exit. */
AcResult acQuit(void);
/** Does all three substeps of the RK3 integration and computes the boundary
conditions when necessary. The result is synchronized and the boundary conditions are applied
after the final substep, after which the result can be fetched to CPU memory with acStore. */
/** Synchronizes a specific stream. All streams are synchronized if STREAM_ALL is passed as a
* parameter*/
AcResult acSynchronizeStream(const Stream stream);
/** Loads a constant to the memories of the devices visible to the caller */
AcResult acLoadDeviceConstant(const AcRealParam param, const AcReal value);
/** Loads an AcMesh to the devices visible to the caller */
AcResult acLoad(const AcMesh host_mesh);
/** Stores the AcMesh distributed among the devices visible to the caller back to the host*/
AcResult acStore(AcMesh* host_mesh);
/** Performs Runge-Kutta 3 integration. Note: Boundary conditions are not applied after the final
* substep and the user is responsible for calling acBoundcondStep before reading the data. */
AcResult acIntegrate(const AcReal dt);
/** Performs a scalar reduction on all GPUs in the node and returns the result. Operates on the
* whole computational domain, which must be up to date and synchronized before calling
* acReduceScal.
*/
AcReal acReduceScal(const ReductionType rtype, const VertexBufferHandle a);
/** Applies periodic boundary conditions for the Mesh distributed among the devices visible to the
* caller*/
AcResult acBoundcondStep(void);
/** Performs a vector reduction on all GPUs in the node and returns the result. Operates on the
* whole computational domain, which must be up to date and synchronized before calling
* acReduceVec.
*/
/** Does a scalar reduction with the data stored in some vertex buffer */
AcReal acReduceScal(const ReductionType rtype, const VertexBufferHandle vtxbuf_handle);
/** Does a vector reduction with vertex buffers where the vector components are (a, b, c) */
AcReal acReduceVec(const ReductionType rtype, const VertexBufferHandle a,
const VertexBufferHandle b, const VertexBufferHandle c);
/** Distributes the host mesh among the GPUs in the node. Synchronous. */
AcResult acLoad(const AcMesh host_mesh);
/** Gathers the mesh stored across GPUs in the node and stores it back to host memory. Synchronous.
*/
AcResult acStore(AcMesh* host_mesh);
/*
* =============================================================================
* Astaroth interface: Advanced functions. Asynchronous.
* =============================================================================
*/
/** Loads a parameter to the constant memory of all GPUs in the node. Asynchronous. */
AcResult acLoadDeviceConstant(const AcRealParam param, const AcReal value);
AcResult acLoadDeviceConstantAsync(const AcRealParam param, const AcReal value,
const StreamType stream);
/** Splits a subset of the host_mesh and distributes it among the GPUs in the node. Asynchronous. */
AcResult acLoadWithOffset(const AcMesh host_mesh, const int3 start, const int num_vertices);
AcResult acLoadWithOffsetAsync(const AcMesh host_mesh, const int3 start, const int num_vertices,
const StreamType stream);
/** Gathers a subset of the data distributed among the GPUs in the node and stores the mesh back to
* CPU memory. Asynchronous.
*/
AcResult acStoreWithOffset(const int3 start, const int num_vertices, AcMesh* host_mesh);
AcResult acStoreWithOffsetAsync(const int3 start, const int num_vertices, AcMesh* host_mesh,
const StreamType stream);
/** Performs a single RK3 step without computing boundary conditions. Asynchronous.*/
AcResult acIntegrateStep(const int isubstep, const AcReal dt);
AcResult acIntegrateStepAsync(const int isubstep, const AcReal dt, const StreamType stream);
/** Performs a single RK3 step on a subset of the mesh without computing the boundary conditions.
* Asynchronous.*/
AcResult acIntegrateStepWithOffset(const int isubstep, const AcReal dt, const int3 start,
const int3 end);
AcResult acIntegrateStepWithOffsetAsync(const int isubstep, const AcReal dt, const int3 start,
const int3 end, const StreamType stream);
/** Performs the boundary condition step on the GPUs in the node. Asynchronous. */
AcResult acBoundcondStep(void);
AcResult acBoundcondStepAsync(const StreamType stream);
/*
* =============================================================================
* Revised interface
* =============================================================================
/** Stores a subset of the mesh stored across the devices visible to the caller back to host memory.
*/
AcResult acStoreWithOffset(const int3 dst, const size_t num_vertices, AcMesh* host_mesh);
#ifdef __cplusplus
} // extern "C"
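A minimal usage sketch of the revised high-level interface declared above, based on the doc comments in this hunk; the call sequence and the helper function are my assumptions, not code from this commit:

#include "astaroth.h"

/* Runs one integration step on a prepared host mesh. AcMeshInfo/AcMesh setup is
 * assumed to be done by the caller. */
static void run_step(const AcMeshInfo info, AcMesh* mesh, const AcReal dt)
{
    acInit(info);                    /* allocates and initializes all visible devices */
    acLoad(*mesh);                   /* distributes the host mesh among the devices */
    acIntegrate(dt);                 /* RK3; boundconds are NOT applied after the final substep */
    acBoundcondStep();               /* so apply periodic boundary conditions before reading */
    acSynchronizeStream(STREAM_ALL); /* wait for all device work to finish */
    acStore(mesh);                   /* gather the result back to host memory */
    acQuit();
}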


@@ -86,7 +86,9 @@ typedef struct {
FUNC(AC_nxy),\
FUNC(AC_nxyz),\
#define AC_FOR_BUILTIN_INT3_PARAM_TYPES(FUNC)
#define AC_FOR_BUILTIN_INT3_PARAM_TYPES(FUNC)\
FUNC(AC_global_grid_n),\
FUNC(AC_multigpu_offset),
#define AC_FOR_BUILTIN_REAL_PARAM_TYPES(FUNC)
@@ -106,9 +108,26 @@ typedef enum {
typedef enum {
STREAM_DEFAULT,
NUM_STREAM_TYPES, //
STREAM_ALL
} StreamType;
STREAM_0,
STREAM_1,
STREAM_2,
STREAM_3,
STREAM_4,
STREAM_5,
STREAM_6,
STREAM_7,
STREAM_8,
STREAM_9,
STREAM_10,
STREAM_11,
STREAM_12,
STREAM_13,
STREAM_14,
STREAM_15,
STREAM_16,
NUM_STREAM_TYPES
} Stream;
#define STREAM_ALL (NUM_STREAM_TYPES)
#define AC_GEN_ID(X) X
typedef enum {

include/astaroth_device.h (new file, 127 additions)

@@ -0,0 +1,127 @@
/*
Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae.
This file is part of Astaroth.
Astaroth is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Astaroth is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#ifdef __cplusplus
extern "C" {
#endif
#include "astaroth_defines.h"
typedef struct device_s* Device; // Opaque pointer to device_s. Analogous to dispatchable handles
// in Vulkan, f.ex. VkDevice
/** */
AcResult acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device);
/** */
AcResult acDeviceDestroy(Device device);
/** */
AcResult acDevicePrintInfo(const Device device);
/** */
AcResult acDeviceAutoOptimize(const Device device);
/** */
AcResult acDeviceSynchronizeStream(const Device device, const Stream stream);
/** */
AcResult acDeviceSwapBuffers(const Device device);
/** */
AcResult acDeviceLoadConstant(const Device device, const Stream stream, const AcRealParam param,
const AcReal value);
/** */
AcResult acDeviceLoadVertexBufferWithOffset(const Device device, const Stream stream,
const AcMesh host_mesh,
const VertexBufferHandle vtxbuf_handle, const int3 src,
const int3 dst, const int num_vertices);
/** Deprecated */
AcResult acDeviceLoadMeshWithOffset(const Device device, const Stream stream,
const AcMesh host_mesh, const int3 src, const int3 dst,
const int num_vertices);
/** */
AcResult acDeviceLoadVertexBuffer(const Device device, const Stream stream, const AcMesh host_mesh,
const VertexBufferHandle vtxbuf_handle);
/** Deprecated */
AcResult acDeviceLoadMesh(const Device device, const Stream stream, const AcMesh host_mesh);
/** */
AcResult acDeviceStoreVertexBufferWithOffset(const Device device, const Stream stream,
const VertexBufferHandle vtxbuf_handle, const int3 src,
const int3 dst, const int num_vertices,
AcMesh* host_mesh);
/** Deprecated */
AcResult acDeviceStoreMeshWithOffset(const Device device, const Stream stream, const int3 src,
const int3 dst, const int num_vertices, AcMesh* host_mesh);
/** */
AcResult acDeviceStoreVertexBuffer(const Device device, const Stream stream,
const VertexBufferHandle vtxbuf_handle, AcMesh* host_mesh);
/** Deprecated */
AcResult acDeviceStoreMesh(const Device device, const Stream stream, AcMesh* host_mesh);
/** */
AcResult acDeviceTransferVertexBufferWithOffset(const Device src_device, const Stream stream,
const VertexBufferHandle vtxbuf_handle,
const int3 src, const int3 dst,
const int num_vertices, Device dst_device);
/** Deprecated */
AcResult acDeviceTransferMeshWithOffset(const Device src_device, const Stream stream,
const int3 src, const int3 dst, const int num_vertices,
Device* dst_device);
/** */
AcResult acDeviceTransferVertexBuffer(const Device src_device, const Stream stream,
const VertexBufferHandle vtxbuf_handle, Device dst_device);
/** Deprecated */
AcResult acDeviceTransferMesh(const Device src_device, const Stream stream, Device dst_device);
/** */
AcResult acDeviceIntegrateSubstep(const Device device, const Stream stream, const int step_number,
const int3 start, const int3 end, const AcReal dt);
/** */
AcResult acDevicePeriodicBoundcondStep(const Device device, const Stream stream,
const VertexBufferHandle vtxbuf_handle, const int3 start,
const int3 end);
/** */
AcResult acDevicePeriodicBoundconds(const Device device, const Stream stream, const int3 start,
const int3 end);
/** */
AcResult acDeviceReduceScal(const Device device, const Stream stream, const ReductionType rtype,
const VertexBufferHandle vtxbuf_handle, AcReal* result);
/** */
AcResult acDeviceReduceVec(const Device device, const Stream stream_type, const ReductionType rtype,
const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1,
const VertexBufferHandle vtxbuf2, AcReal* result);
#ifdef __cplusplus
} // extern "C"
#endif
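For per-GPU control, the Device handle above is used directly; a minimal sketch assuming the declarations in this header (the sequence and the demo function are my assumptions, not code from this commit):

#include "astaroth_device.h"

/* Single-substep round trip on one GPU. start/end bound the computational domain
 * and are assumed to be provided by the caller. */
static void device_demo(const AcMeshInfo config, AcMesh host_mesh,
                        const int3 start, const int3 end, const AcReal dt)
{
    Device device;
    acDeviceCreate(0, config, &device);                  /* GPU id 0 */
    acDeviceLoadMesh(device, STREAM_DEFAULT, host_mesh); /* marked deprecated above */
    acDeviceIntegrateSubstep(device, STREAM_DEFAULT, 0, start, end, dt);
    acDeviceSwapBuffers(device);                         /* in/out vertex buffers are double-buffered */
    acDeviceSynchronizeStream(device, STREAM_ALL);
    acDeviceStoreMesh(device, STREAM_DEFAULT, &host_mesh);
    acDeviceDestroy(device);
}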

include/astaroth_grid.h (new file, 99 additions)

@@ -0,0 +1,99 @@
/*
Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae.
This file is part of Astaroth.
Astaroth is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Astaroth is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#ifdef __cplusplus
extern "C" {
#endif
#include "astaroth_defines.h"
/** */
AcResult acGridInit(const AcMeshInfo node_config);
/** */
AcResult acGridQuit(void);
/** */
AcResult acGridSynchronizeStream(const Stream stream);
/** */
AcResult acGridSwapBuffers(void);
/** */
AcResult acGridLoadConstant(const Stream stream, const AcRealParam param, const AcReal value);
/** */
AcResult acGridLoadVertexBufferWithOffset(const Stream stream, const AcMesh host_mesh,
const VertexBufferHandle vtxbuf_handle, const int3 src,
const int3 dst, const int num_vertices);
/** */
AcResult acGridLoadMeshWithOffset(const Stream stream, const AcMesh host_mesh, const int3 src,
const int3 dst, const int num_vertices);
/** */
AcResult acGridLoadVertexBuffer(const Stream stream, const AcMesh host_mesh,
const VertexBufferHandle vtxbuf_handle);
/** */
AcResult acGridLoadMesh(const Stream stream, const AcMesh host_mesh);
/** */
AcResult acGridStoreVertexBufferWithOffset(const Stream stream,
const VertexBufferHandle vtxbuf_handle, const int3 src,
const int3 dst, const int num_vertices,
AcMesh* host_mesh);
/** */
AcResult acGridStoreMeshWithOffset(const Stream stream, const int3 src, const int3 dst,
const int num_vertices, AcMesh* host_mesh);
/** */
AcResult acGridStoreVertexBuffer(const Stream stream, const VertexBufferHandle vtxbuf_handle,
AcMesh* host_mesh);
/** */
AcResult acGridStoreMesh(const Stream stream, AcMesh* host_mesh);
/** */
AcResult acGridIntegrateSubstep(const Stream stream, const int step_number, const int3 start,
const int3 end, const AcReal dt);
/** */
AcResult acGridIntegrateStep(const Stream stream, const AcReal dt);
/** */
AcResult acGridPeriodicBoundcondStep(const Stream stream);
/** */
/* acGridReduceScal(const Stream stream, const ReductionType rtype,
const VertexBufferHandle vtxbuf_handle, AcReal* result); */
AcReal acGridReduceScal(const Stream stream, const ReductionType rtype,
const VertexBufferHandle vtxbuf_handle);
/** */
/*AcResult acGridReduceVec(const Stream stream, const ReductionType rtype,
const VertexBufferHandle vec0, const VertexBufferHandle vec1,
const VertexBufferHandle vec2, AcReal* result);*/
AcReal acGridReduceVec(const Stream stream, const ReductionType rtype,
const VertexBufferHandle vec0, const VertexBufferHandle vec1,
const VertexBufferHandle vec2);
#ifdef __cplusplus
} // extern "C"
#endif

include/astaroth_node.h (new file, 124 additions)

@@ -0,0 +1,124 @@
/*
Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae.
This file is part of Astaroth.
Astaroth is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Astaroth is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#ifdef __cplusplus
extern "C" {
#endif
#include "astaroth_defines.h"
typedef struct node_s* Node; // Opaque pointer to node_s.
typedef struct {
} DeviceConfiguration;
/** */
AcResult acNodeCreate(const int id, const AcMeshInfo node_config, Node* node);
/** */
AcResult acNodeDestroy(Node node);
/** */
AcResult acNodePrintInfo(const Node node);
/** */
AcResult acNodeQueryDeviceConfiguration(const Node node, DeviceConfiguration* config);
/** */
AcResult acNodeAutoOptimize(const Node node);
/** */
AcResult acNodeSynchronizeStream(const Node node, const Stream stream);
/** Deprecated ? */
AcResult acNodeSynchronizeVertexBuffer(const Node node, const Stream stream,
const VertexBufferHandle vtxbuf_handle); // Not in Device
/** */
AcResult acNodeSynchronizeMesh(const Node node, const Stream stream); // Not in Device
/** */
AcResult acNodeSwapBuffers(const Node node);
/** */
AcResult acNodeLoadConstant(const Node node, const Stream stream, const AcRealParam param,
const AcReal value);
/** Deprecated ? Might be useful though if the user wants to load only one vtxbuf. But in this case
* the user should supply an AcReal* instead of a vtxbuf_handle */
AcResult acNodeLoadVertexBufferWithOffset(const Node node, const Stream stream,
const AcMesh host_mesh,
const VertexBufferHandle vtxbuf_handle, const int3 src,
const int3 dst, const int num_vertices);
/** */
AcResult acNodeLoadMeshWithOffset(const Node node, const Stream stream, const AcMesh host_mesh,
const int3 src, const int3 dst, const int num_vertices);
/** Deprecated ? */
AcResult acNodeLoadVertexBuffer(const Node node, const Stream stream, const AcMesh host_mesh,
const VertexBufferHandle vtxbuf_handle);
/** */
AcResult acNodeLoadMesh(const Node node, const Stream stream, const AcMesh host_mesh);
/** Deprecated ? */
AcResult acNodeStoreVertexBufferWithOffset(const Node node, const Stream stream,
const VertexBufferHandle vtxbuf_handle, const int3 src,
const int3 dst, const int num_vertices,
AcMesh* host_mesh);
/** */
AcResult acNodeStoreMeshWithOffset(const Node node, const Stream stream, const int3 src,
const int3 dst, const int num_vertices, AcMesh* host_mesh);
/** Deprecated ? */
AcResult acNodeStoreVertexBuffer(const Node node, const Stream stream,
const VertexBufferHandle vtxbuf_handle, AcMesh* host_mesh);
/** */
AcResult acNodeStoreMesh(const Node node, const Stream stream, AcMesh* host_mesh);
/** */
AcResult acNodeIntegrateSubstep(const Node node, const Stream stream, const int step_number,
const int3 start, const int3 end, const AcReal dt);
/** */
AcResult acNodeIntegrate(const Node node, const AcReal dt);
/** */
AcResult acNodePeriodicBoundcondStep(const Node node, const Stream stream,
const VertexBufferHandle vtxbuf_handle);
/** */
AcResult acNodePeriodicBoundconds(const Node node, const Stream stream);
/** */
AcResult acNodeReduceScal(const Node node, const Stream stream, const ReductionType rtype,
const VertexBufferHandle vtxbuf_handle, AcReal* result);
/** */
AcResult acNodeReduceVec(const Node node, const Stream stream_type, const ReductionType rtype,
const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1,
const VertexBufferHandle vtxbuf2, AcReal* result);
#ifdef __cplusplus
} // extern "C"
#endif


@@ -1,51 +0,0 @@
#!/bin/bash
# Run this in your build directory (cd build && ../scripts/auto_optimize.sh)
# Generates a ${BENCHMARK_FILE} which contains the threadblock dims and other
# constants used in the integration in addition to the time used.
MAX_THREADS=1024 # Max size of the thread block, depends on hardware
BENCHMARK_FILE="benchmark.out"
TBCONFCREATOR_SRC_PATH="../scripts/gen_rk3_threadblockconf.c"
TBCONFFILE_DST_PATH="../src/core/kernels"
C_COMPILER_NAME="gcc"
rm ${BENCHMARK_FILE}
for (( tz=2; tz<=8; tz*=2))
do
for (( ty=1; ty<=1; ty+=1))
do
for (( tx=16; tx<=64; tx*=2))
do
if ( (${tx}*${ty}*${tz}) > ${MAX_THREADS})
then break
fi
for (( launch_bound=1; launch_bound<=8; launch_bound*=2))
do
for (( elems_per_thread=1; elems_per_thread<=128; elems_per_thread*=2))
do
# Generate the threadblock configuration
${C_COMPILER_NAME} ${TBCONFCREATOR_SRC_PATH} -o gen_rk3_threadblockconf
./gen_rk3_threadblockconf ${tx} ${ty} ${tz} ${elems_per_thread} ${launch_bound}
rm gen_rk3_threadblockconf
mv rk3_threadblock.conf ${TBCONFFILE_DST_PATH}
# Compile and run the test build
cmake -DBUILD_DEBUG=OFF -DDOUBLE_PRECISION=OFF -DAUTO_OPTIMIZE=ON .. && make -j
#if ./ac_run -t; then
# echo Success
./ac_run -b
#else
# echo fail!
#fi
done
done
done
done
done


@@ -5,10 +5,13 @@
# source ./sourceme.sh
# autotest.sh
#
# If you need slurm or to pass something before ./ac_run, set the variable
# If you need slurm or to pass something before ./ac_run, export the variable
# SRUN_COMMAND before calling this script.
#
# F.ex. on Taito SRUN_COMMAND="srun --gres=gpu:k80:4 --mem=24000 -t 00:14:59 -p gputest --cpus-per-task 1 -n 1"
# export SRUN_COMMAND
# autotest.sh
echo "SRUN_COMMAND: " ${SRUN_COMMAND}
results=()


@@ -3,7 +3,8 @@
########################################
## Find packages
find_package(CUDA 9 REQUIRED)
find_package(CUDA REQUIRED)
# find_package(OpenMP REQUIRED)
## Architecture and optimization flags
set(CUDA_ARCH_FLAGS -gencode arch=compute_37,code=sm_37
@@ -12,7 +13,8 @@ set(CUDA_ARCH_FLAGS -gencode arch=compute_37,code=sm_37
-gencode arch=compute_61,code=sm_61
-lineinfo
-ftz=true # Flush denormalized floats to zero
-std=c++11)
-std=c++11
)# --compiler-options ${OpenMP_CXX_FLAGS})
#--maxrregcount=255
# -Xptxas -dlcm=ca opt-in to cache all global loads to L1/texture cache
# =cg to opt out
@@ -32,6 +34,6 @@ else ()
endif ()
## Create and link the library
include_directories(.)
cuda_add_library(astaroth_core STATIC astaroth.cu device.cu)
cuda_add_library(astaroth_core STATIC astaroth.cu device.cu node.cu)
target_include_directories(astaroth_core PRIVATE .)
target_link_libraries(astaroth_core m)


@@ -16,119 +16,8 @@
You should have received a copy of the GNU General Public License
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* @file
* \brief Multi-GPU implementation.
*
%JP: The old way of computing boundary conditions conflicts with the
way we have to do things with multiple GPUs.
The older approach relied on unified memory, which represented the whole
memory area as one huge mesh instead of several smaller ones. However, unified memory
in its current state is meant more for quick prototyping when performance is not an issue.
Getting the CUDA driver to migrate data intelligently across GPUs is much more difficult
than managing the memory explicitly.
In this new approach, I have simplified the multi- and single-GPU layers significantly.
Quick rundown:
New struct: Grid. There are two global variables, "grid" and "subgrid", which
contain the extents of the whole simulation domain and the decomposed grids,
respectively. To simplify things, we require that each GPU is assigned the same amount of
work; therefore each GPU in the node is assigned a "subgrid.m"-sized block of data to
work with.
The whole simulation domain is decomposed with respect to the z dimension.
For example, if the grid contains (nx, ny, nz) vertices, then the subgrids
contain (nx, ny, nz / num_devices) vertices.
A local index (i, j, k) in some subgrid can be mapped to the global grid with
global idx = (i, j, k + device_id * subgrid.n.z)
Terminology:
- Single-GPU function: a function defined on the single-GPU layer (device.cu)
Changes required to this commented code block:
- The thread block dimensions (tpb) are no longer passed to the kernel here but in
device.cu instead. Same holds for any complex index calculations. Instead, the local
coordinates should be passed as an int3 type without having to consider how the data is
actually laid out in device memory
- The unified memory buffer no longer exists (d_buffer). Instead, we have an opaque
handle of type "Device" which should be passed to single-GPU functions. In this file, all
devices are stored in a global array "devices[num_devices]".
- Every single-GPU function is executed asynchronously by default such that we
can optimize Astaroth by executing memory transactions concurrently with
computation. Therefore a StreamType should be passed as a parameter to single-GPU functions.
Refresher: CUDA function calls are non-blocking when a stream is explicitly passed
as a parameter and commands executing in different streams can be processed
in parallel/concurrently.
Note on periodic boundaries (might be helpful when implementing other boundary conditions):
With multiple GPUs, periodic boundary conditions applied on indices ranging from
(0, 0, STENCIL_ORDER/2) to (subgrid.m.x, subgrid.m.y, subgrid.m.z -
STENCIL_ORDER/2)
on a single device are "local", in the sense that they can be computed without
having to exchange data with neighboring GPUs. Special care is needed only for transferring
the data to the front and back plates outside this range. In the solution we use
here, we solve the local boundaries first, and then just exchange the front and back plates
in a "ring", like so
device_id
(n) <-> 0 <-> 1 <-> ... <-> n <-> (0)
### Throughout this file we use the following notation and names for various index offsets
Global coordinates: coordinates with respect to the global grid (static Grid grid)
Local coordinates: coordinates with respect to the local subgrid (static Subgrid subgrid)
s0, s1: source indices in global coordinates
d0, d1: destination indices in global coordinates
da = max(s0, d0);
db = min(s1, d1);
These are used in at least
acLoad()
acStore()
acSynchronizeHalos()
Here we decompose the host mesh and distribute it among the GPUs in
the node.
The host mesh is a huge contiguous block of data. Its dimensions are given by
the global variable named "grid". A "grid" is decomposed into "subgrids",
one for each GPU. Here we check which parts of the range s0...s1 maps
to the memory space stored by some GPU, ranging d0...d1, and transfer
the data if needed.
The index mapping is inherently quite involved, but here's a picture which
hopefully helps make sense out of all this.
Grid
|----num_vertices---|
xxx|....................................................|xxx
^ ^ ^ ^
d0 d1 s0 (src) s1
Subgrid
xxx|.............|xxx
^ ^
d0 d1
^ ^
db da
*
*/
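// Editor's sketch (not part of this commit): the index mapping described in the
// comment above, in code form, under the stated z-decomposition. max/min on int3
// are assumed to come from math_utils.h as used further below.
static int3 local_to_global(const int3 local, const int device_id, const Grid subgrid)
{
    return (int3){local.x, local.y, local.z + device_id * subgrid.n.z};
}

static void overlap(const int3 s0, const int3 s1, const int3 d0, const int3 d1, int3* da, int3* db)
{
    *da = max(s0, d0); /* first vertex of the range owned by this device        */
    *db = min(s1, d1); /* one-past-last; data is copied only if db.z >= da.z    */
}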
// #include "astaroth_defines.h"
#include "astaroth.h"
#include "errchk.h"
#include "device.cuh"
#include "math_utils.h" // sum for reductions
// #include "standalone/config_loader.h" // update_config
#define AC_GEN_STR(X) #X
const char* intparam_names[] = {AC_FOR_BUILTIN_INT_PARAM_TYPES(AC_GEN_STR) //
@@ -142,532 +31,80 @@ const char* real3param_names[] = {AC_FOR_BUILTIN_REAL3_PARAM_TYPES(AC_GEN_STR) /
const char* vtxbuf_names[] = {AC_FOR_VTXBUF_HANDLES(AC_GEN_STR)};
#undef AC_GEN_STR
static const int MAX_NUM_DEVICES = 32;
static int num_devices = 0;
static Device devices[MAX_NUM_DEVICES] = {};
static Grid grid; // A grid consists of num_devices subgrids
static Grid subgrid;
static int
gridIdx(const Grid grid, const int3 idx)
{
return idx.x + idx.y * grid.m.x + idx.z * grid.m.x * grid.m.y;
}
static int3
gridIdx3d(const Grid grid, const int idx)
{
return (int3){idx % grid.m.x, (idx % (grid.m.x * grid.m.y)) / grid.m.x,
idx / (grid.m.x * grid.m.y)};
}
static void
printInt3(const int3 vec)
{
printf("(%d, %d, %d)", vec.x, vec.y, vec.z);
}
static inline void
print(const AcMeshInfo config)
{
for (int i = 0; i < NUM_INT_PARAMS; ++i)
printf("[%s]: %d\n", intparam_names[i], config.int_params[i]);
for (int i = 0; i < NUM_REAL_PARAMS; ++i)
printf("[%s]: %g\n", realparam_names[i], double(config.real_params[i]));
}
static void
update_builtin_params(AcMeshInfo* config)
{
config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER;
///////////// PAD TEST
// config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER + PAD_SIZE;
///////////// PAD TEST
config->int_params[AC_my] = config->int_params[AC_ny] + STENCIL_ORDER;
config->int_params[AC_mz] = config->int_params[AC_nz] + STENCIL_ORDER;
// Bounds for the computational domain, i.e. nx_min <= i < nx_max
config->int_params[AC_nx_min] = NGHOST;
config->int_params[AC_nx_max] = config->int_params[AC_nx_min] + config->int_params[AC_nx];
config->int_params[AC_ny_min] = NGHOST;
config->int_params[AC_ny_max] = config->int_params[AC_ny] + NGHOST;
config->int_params[AC_nz_min] = NGHOST;
config->int_params[AC_nz_max] = config->int_params[AC_nz] + NGHOST;
/* Additional helper params */
// Int helpers
config->int_params[AC_mxy] = config->int_params[AC_mx] * config->int_params[AC_my];
config->int_params[AC_nxy] = config->int_params[AC_nx] * config->int_params[AC_ny];
config->int_params[AC_nxyz] = config->int_params[AC_nxy] * config->int_params[AC_nz];
#if VERBOSE_PRINTING // Defined in astaroth.h
printf("###############################################################\n");
printf("Config dimensions recalculated:\n");
print(*config);
printf("###############################################################\n");
#endif
}
static Grid
createGrid(const AcMeshInfo config)
{
Grid grid;
grid.m = (int3){config.int_params[AC_mx], config.int_params[AC_my], config.int_params[AC_mz]};
grid.n = (int3){config.int_params[AC_nx], config.int_params[AC_ny], config.int_params[AC_nz]};
return grid;
}
static const int num_nodes = 1;
static Node nodes[num_nodes];
AcResult
acCheckDeviceAvailability(void)
acInit(const AcMeshInfo mesh_info)
{
int device_count; // Separate from num_devices to avoid side effects
ERRCHK_CUDA_ALWAYS(cudaGetDeviceCount(&device_count));
if (device_count > 0)
return AC_SUCCESS;
else
return AC_FAILURE;
}
AcResult
acSynchronizeStream(const StreamType stream)
{
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
synchronize(devices[i], stream);
}
return AC_SUCCESS;
}
static AcResult
synchronize_halos(const StreamType stream)
{
// Exchanges the halos of subgrids
// After this step, the data within the main grid ranging from
// (0, 0, NGHOST) -> (grid.m.x, grid.m.y, NGHOST + grid.n.z)
// has been synchronized and transferred to appropriate subgrids
// We loop only to num_devices - 1 since the front and back plates of the grid are not
// transferred because their contents depend on the boundary conditions.
// IMPORTANT NOTE: the boundary conditions must be applied before calling this function!
// I.e. the halos of subgrids must contain up-to-date data!
// #pragma omp parallel for
for (int i = 0; i < num_devices - 1; ++i) {
const int num_vertices = subgrid.m.x * subgrid.m.y * NGHOST;
// ...|ooooxxx|... -> xxx|ooooooo|...
{
const int3 src = (int3){0, 0, subgrid.n.z};
const int3 dst = (int3){0, 0, 0};
copyMeshDeviceToDevice(devices[i], stream, src, devices[(i + 1) % num_devices], dst,
num_vertices);
}
// ...|ooooooo|xxx <- ...|xxxoooo|...
{
const int3 src = (int3){0, 0, NGHOST};
const int3 dst = (int3){0, 0, NGHOST + subgrid.n.z};
copyMeshDeviceToDevice(devices[(i + 1) % num_devices], stream, src, devices[i], dst,
num_vertices);
}
}
return AC_SUCCESS;
}
AcResult
acSynchronizeMesh(void)
{
acSynchronizeStream(STREAM_ALL);
synchronize_halos(STREAM_DEFAULT);
acSynchronizeStream(STREAM_ALL);
return AC_SUCCESS;
}
AcResult
acInit(const AcMeshInfo config)
{
// Get num_devices
ERRCHK_CUDA_ALWAYS(cudaGetDeviceCount(&num_devices));
if (num_devices < 1) {
ERROR("No CUDA devices found!");
return AC_FAILURE;
}
if (num_devices > MAX_NUM_DEVICES) {
WARNING("More devices found than MAX_NUM_DEVICES. Using only MAX_NUM_DEVICES");
num_devices = MAX_NUM_DEVICES;
}
if (!AC_MULTIGPU_ENABLED) {
WARNING("MULTIGPU_ENABLED was false. Using only one device");
num_devices = 1; // Use only one device if multi-GPU is not enabled
}
// Check that AC_nz is divisible by num_devices. This makes decomposing the
// problem domain across multiple GPUs much easier since we do not have to worry
// about remainders
ERRCHK_ALWAYS(config.int_params[AC_nz] % num_devices == 0);
// Decompose the problem domain
// The main grid
grid = createGrid(config);
// Subgrids
AcMeshInfo subgrid_config = config;
subgrid_config.int_params[AC_nz] /= num_devices;
update_builtin_params(&subgrid_config);
subgrid = createGrid(subgrid_config);
// Periodic boundary conditions become weird if the system can "fold onto itself".
ERRCHK_ALWAYS(subgrid.n.x >= STENCIL_ORDER);
ERRCHK_ALWAYS(subgrid.n.y >= STENCIL_ORDER);
ERRCHK_ALWAYS(subgrid.n.z >= STENCIL_ORDER);
#if VERBOSE_PRINTING
// clang-format off
printf("Grid m "); printInt3(grid.m); printf("\n");
printf("Grid n "); printInt3(grid.n); printf("\n");
printf("Subrid m "); printInt3(subgrid.m); printf("\n");
printf("Subrid n "); printInt3(subgrid.n); printf("\n");
// clang-format on
#endif
// Initialize the devices
for (int i = 0; i < num_devices; ++i) {
createDevice(i, subgrid_config, &devices[i]);
loadGlobalGrid(devices[i], grid);
printDeviceInfo(devices[i]);
}
// Enable peer access
for (int i = 0; i < num_devices; ++i) {
const int front = (i + 1) % num_devices;
const int back = (i - 1 + num_devices) % num_devices;
int can_access_front, can_access_back;
cudaDeviceCanAccessPeer(&can_access_front, i, front);
cudaDeviceCanAccessPeer(&can_access_back, i, back);
#if VERBOSE_PRINTING
printf(
"Trying to enable peer access from %d to %d (can access: %d) and %d (can access: %d)\n",
i, front, can_access_front, back, can_access_back);
#endif
cudaSetDevice(i);
if (can_access_front) {
ERRCHK_CUDA_ALWAYS(cudaDeviceEnablePeerAccess(front, 0));
}
if (can_access_back) {
ERRCHK_CUDA_ALWAYS(cudaDeviceEnablePeerAccess(back, 0));
}
}
acSynchronizeStream(STREAM_ALL);
return AC_SUCCESS;
return acNodeCreate(0, mesh_info, &nodes[0]);
}
AcResult
acQuit(void)
{
acSynchronizeStream(STREAM_ALL);
for (int i = 0; i < num_devices; ++i) {
destroyDevice(devices[i]);
}
return AC_SUCCESS;
return acNodeDestroy(nodes[0]);
}
AcResult
acIntegrateStepWithOffsetAsync(const int isubstep, const AcReal dt, const int3 start,
const int3 end, const StreamType stream)
acSynchronizeStream(const Stream stream)
{
// See the beginning of the file for an explanation of the index mapping
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
// DECOMPOSITION OFFSET HERE
const int3 d0 = (int3){NGHOST, NGHOST, NGHOST + i * subgrid.n.z};
const int3 d1 = d0 + (int3){subgrid.n.x, subgrid.n.y, subgrid.n.z};
const int3 da = max(start, d0);
const int3 db = min(end, d1);
if (db.z >= da.z) {
const int3 da_local = da - (int3){0, 0, i * subgrid.n.z};
const int3 db_local = db - (int3){0, 0, i * subgrid.n.z};
rkStep(devices[i], stream, isubstep, da_local, db_local, dt);
}
}
return AC_SUCCESS;
return acNodeSynchronizeStream(nodes[0], stream);
}
AcResult
acIntegrateStepWithOffset(const int isubstep, const AcReal dt, const int3 start, const int3 end)
acLoadDeviceConstant(const AcRealParam param, const AcReal value)
{
return acIntegrateStepWithOffsetAsync(isubstep, dt, start, end, STREAM_DEFAULT);
return acNodeLoadConstant(nodes[0], STREAM_DEFAULT, param, value);
}
AcResult
acIntegrateStepAsync(const int isubstep, const AcReal dt, const StreamType stream)
acLoad(const AcMesh host_mesh)
{
const int3 start = (int3){NGHOST, NGHOST, NGHOST};
const int3 end = start + grid.n;
return acIntegrateStepWithOffsetAsync(isubstep, dt, start, end, stream);
return acNodeLoadMesh(nodes[0], STREAM_DEFAULT, host_mesh);
}
AcResult
acIntegrateStep(const int isubstep, const AcReal dt)
acStore(AcMesh* host_mesh)
{
return acIntegrateStepAsync(isubstep, dt, STREAM_DEFAULT);
}
static AcResult
local_boundcondstep(const StreamType stream)
{
if (num_devices == 1) {
boundcondStep(devices[0], stream, (int3){0, 0, 0}, subgrid.m);
}
else {
// Local boundary conditions
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
const int3 d0 = (int3){0, 0, NGHOST}; // DECOMPOSITION OFFSET HERE
const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.n.z};
boundcondStep(devices[i], stream, d0, d1);
}
}
return AC_SUCCESS;
}
static AcResult
global_boundcondstep(const StreamType stream)
{
if (num_devices > 1) {
// With periodic boundary conditions we exchange the front and back plates of the
// grid. The exchange is done between the first and last device (0 and num_devices - 1).
const int num_vertices = subgrid.m.x * subgrid.m.y * NGHOST;
// ...|ooooxxx|... -> xxx|ooooooo|...
{
const int3 src = (int3){0, 0, subgrid.n.z};
const int3 dst = (int3){0, 0, 0};
copyMeshDeviceToDevice(devices[num_devices - 1], stream, src, devices[0], dst,
num_vertices);
}
// ...|ooooooo|xxx <- ...|xxxoooo|...
{
const int3 src = (int3){0, 0, NGHOST};
const int3 dst = (int3){0, 0, NGHOST + subgrid.n.z};
copyMeshDeviceToDevice(devices[0], stream, src, devices[num_devices - 1], dst,
num_vertices);
}
}
return AC_SUCCESS;
}
AcResult
acBoundcondStepAsync(const StreamType stream)
{
ERRCHK_ALWAYS(stream < NUM_STREAM_TYPES);
local_boundcondstep(stream);
acSynchronizeStream(stream);
global_boundcondstep(stream);
synchronize_halos(stream);
acSynchronizeStream(stream);
return AC_SUCCESS;
}
AcResult
acBoundcondStep(void)
{
return acBoundcondStepAsync(STREAM_DEFAULT);
}
static AcResult
swap_buffers(void)
{
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
swapBuffers(devices[i]);
}
return AC_SUCCESS;
return acNodeStoreMesh(nodes[0], STREAM_DEFAULT, host_mesh);
}
AcResult
acIntegrate(const AcReal dt)
{
acSynchronizeStream(STREAM_ALL);
for (int isubstep = 0; isubstep < 3; ++isubstep) {
acIntegrateStep(isubstep, dt); // Note: boundaries must be initialized.
swap_buffers();
acBoundcondStep();
}
return AC_SUCCESS;
/*
acNodeIntegrate(nodes[0], dt);
return acBoundcondStep();
*/
return acNodeIntegrate(nodes[0], dt);
}
static AcReal
simple_final_reduce_scal(const ReductionType rtype, const AcReal* results, const int n)
AcResult
acBoundcondStep(void)
{
AcReal res = results[0];
for (int i = 1; i < n; ++i) {
if (rtype == RTYPE_MAX) {
res = max(res, results[i]);
}
else if (rtype == RTYPE_MIN) {
res = min(res, results[i]);
}
else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP || rtype == RTYPE_SUM) {
res = sum(res, results[i]);
}
else {
ERROR("Invalid rtype");
}
}
if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
const AcReal inv_n = AcReal(1.) / (grid.n.x * grid.n.y * grid.n.z);
res = sqrt(inv_n * res);
}
return res;
return acNodePeriodicBoundconds(nodes[0], STREAM_DEFAULT);
}
AcReal
acReduceScal(const ReductionType rtype, const VertexBufferHandle vtxbuffer_handle)
acReduceScal(const ReductionType rtype, const VertexBufferHandle vtxbuf_handle)
{
acSynchronizeStream(STREAM_ALL);
AcReal results[num_devices];
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
reduceScal(devices[i], STREAM_DEFAULT, rtype, vtxbuffer_handle, &results[i]);
}
return simple_final_reduce_scal(rtype, results, num_devices);
AcReal result;
acNodeReduceScal(nodes[0], STREAM_DEFAULT, rtype, vtxbuf_handle, &result);
return result;
}
AcReal
acReduceVec(const ReductionType rtype, const VertexBufferHandle a, const VertexBufferHandle b,
const VertexBufferHandle c)
{
acSynchronizeStream(STREAM_ALL);
AcReal results[num_devices];
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
reduceVec(devices[i], STREAM_DEFAULT, rtype, a, b, c, &results[i]);
}
return simple_final_reduce_scal(rtype, results, num_devices);
AcReal result;
acNodeReduceVec(nodes[0], STREAM_DEFAULT, rtype, a, b, c, &result);
return result;
}
AcResult
acLoadWithOffsetAsync(const AcMesh host_mesh, const int3 src, const int num_vertices,
const StreamType stream)
acStoreWithOffset(const int3 dst, const size_t num_vertices, AcMesh* host_mesh)
{
// See the beginning of the file for an explanation of the index mapping
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
const int3 d0 = (int3){0, 0, i * subgrid.n.z}; // DECOMPOSITION OFFSET HERE
const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.m.z};
const int3 s0 = src;
const int3 s1 = gridIdx3d(grid, gridIdx(grid, s0) + num_vertices);
const int3 da = max(s0, d0);
const int3 db = min(s1, d1);
/*
printf("Device %d\n", i);
printf("\ts0: "); printInt3(s0); printf("\n");
printf("\td0: "); printInt3(d0); printf("\n");
printf("\tda: "); printInt3(da); printf("\n");
printf("\tdb: "); printInt3(db); printf("\n");
printf("\td1: "); printInt3(d1); printf("\n");
printf("\ts1: "); printInt3(s1); printf("\n");
printf("\t-> %s to device %d\n", db.z >= da.z ? "Copy" : "Do not copy", i);
*/
if (db.z >= da.z) {
const int copy_cells = gridIdx(subgrid, db) - gridIdx(subgrid, da);
// DECOMPOSITION OFFSET HERE
const int3 da_local = (int3){da.x, da.y, da.z - i * grid.n.z / num_devices};
// printf("\t\tcopy %d cells to local index ", copy_cells); printInt3(da_local);
// printf("\n");
copyMeshToDevice(devices[i], stream, host_mesh, da, da_local, copy_cells);
}
// printf("\n");
}
return AC_SUCCESS;
return acNodeStoreMeshWithOffset(nodes[0], STREAM_DEFAULT, dst, dst, num_vertices, host_mesh);
}
AcResult
acLoadWithOffset(const AcMesh host_mesh, const int3 src, const int num_vertices)
{
return acLoadWithOffsetAsync(host_mesh, src, num_vertices, STREAM_DEFAULT);
}
AcResult
acLoad(const AcMesh host_mesh)
{
acLoadWithOffset(host_mesh, (int3){0, 0, 0}, acVertexBufferSize(host_mesh.info));
acSynchronizeStream(STREAM_ALL);
return AC_SUCCESS;
}
AcResult
acStoreWithOffsetAsync(const int3 src, const int num_vertices, AcMesh* host_mesh,
const StreamType stream)
{
// See the beginning of the file for an explanation of the index mapping
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
const int3 d0 = (int3){0, 0, i * subgrid.n.z}; // DECOMPOSITION OFFSET HERE
const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.m.z};
const int3 s0 = src;
const int3 s1 = gridIdx3d(grid, gridIdx(grid, s0) + num_vertices);
const int3 da = max(s0, d0);
const int3 db = min(s1, d1);
if (db.z >= da.z) {
const int copy_cells = gridIdx(subgrid, db) - gridIdx(subgrid, da);
// DECOMPOSITION OFFSET HERE
const int3 da_local = (int3){da.x, da.y, da.z - i * grid.n.z / num_devices};
copyMeshToHost(devices[i], stream, da_local, da, copy_cells, host_mesh);
}
}
return AC_SUCCESS;
}
AcResult
acStoreWithOffset(const int3 src, const int num_vertices, AcMesh* host_mesh)
{
return acStoreWithOffsetAsync(src, num_vertices, host_mesh, STREAM_DEFAULT);
}
AcResult
acStore(AcMesh* host_mesh)
{
acStoreWithOffset((int3){0, 0, 0}, acVertexBufferSize(host_mesh->info), host_mesh);
acSynchronizeStream(STREAM_ALL);
return AC_SUCCESS;
}
AcResult
acLoadDeviceConstantAsync(const AcRealParam param, const AcReal value, const StreamType stream)
{
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
loadDeviceConstant(devices[i], stream, param, value);
}
return AC_SUCCESS;
}
AcResult
acLoadDeviceConstant(const AcRealParam param, const AcReal value)
{
return acLoadDeviceConstantAsync(param, value, STREAM_DEFAULT);
}
/*
* =============================================================================
* Revised interface
* =============================================================================
*/


@@ -24,7 +24,7 @@
* Detailed info.
*
*/
#include "device.cuh"
#include "astaroth_device.h"
#include "errchk.h"
@@ -40,19 +40,19 @@ typedef struct {
} VertexBufferArray;
__constant__ AcMeshInfo d_mesh_info;
__constant__ int3 d_multigpu_offset;
__constant__ Grid globalGrid;
#define DCONST_INT(X) (d_mesh_info.int_params[X])
#define DCONST_INT3(X) (d_mesh_info.int3_params[X])
#define DCONST_REAL(X) (d_mesh_info.real_params[X])
#define DCONST_REAL3(X) (d_mesh_info.real3_params[X])
#define DEVICE_VTXBUF_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_mx) + (k)*DCONST_INT(AC_mxy))
#define DEVICE_1D_COMPDOMAIN_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_nx) + (k)*DCONST_INT(AC_nxy))
#define globalGridN (d_mesh_info.int3_params[AC_global_grid_n])
#define d_multigpu_offset (d_mesh_info.int3_params[AC_multigpu_offset])
#include "kernels/boundconds.cuh"
#include "kernels/integration.cuh"
#include "kernels/reductions.cuh"
static dim3 rk3_tpb = (dim3){32, 1, 4};
static dim3 rk3_tpb(32, 1, 4);
#if PACKED_DATA_TRANSFERS // Defined in device.cuh
// #include "kernels/pack_unpack.cuh"
@@ -76,8 +76,96 @@ struct device_s {
#endif
};
// clang-format off
static __global__ void dummy_kernel(void) {}
// clang-format on
AcResult
printDeviceInfo(const Device device)
acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device_handle)
{
cudaSetDevice(id);
cudaDeviceReset();
// Create Device
struct device_s* device = (struct device_s*)malloc(sizeof(*device));
ERRCHK_ALWAYS(device);
device->id = id;
device->local_config = device_config;
// Check that the code was compiled for the proper GPU architecture
printf("Trying to run a dummy kernel. If this fails, make sure that your\n"
"device supports the CUDA architecture you are compiling for.\n"
"Running dummy kernel... ");
fflush(stdout);
dummy_kernel<<<1, 1>>>();
ERRCHK_CUDA_KERNEL_ALWAYS();
printf("Success!\n");
// Concurrency
for (int i = 0; i < NUM_STREAM_TYPES; ++i) {
cudaStreamCreateWithPriority(&device->streams[i], cudaStreamNonBlocking, 0);
}
// Memory
const size_t vba_size_bytes = acVertexBufferSizeBytes(device_config);
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.in[i], vba_size_bytes));
ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.out[i], vba_size_bytes));
}
ERRCHK_CUDA_ALWAYS(
cudaMalloc(&device->reduce_scratchpad, acVertexBufferCompdomainSizeBytes(device_config)));
ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->reduce_result, sizeof(AcReal)));
#if PACKED_DATA_TRANSFERS
// Allocate data required for packed transfers here (cudaMalloc)
#endif
// Device constants
ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_mesh_info, &device_config, sizeof(device_config), 0,
cudaMemcpyHostToDevice));
printf("Created device %d (%p)\n", device->id, device);
*device_handle = device;
// Auto-optimize
if (id == 0) {
acDeviceAutoOptimize(device);
}
return AC_SUCCESS;
}
AcResult
acDeviceDestroy(Device device)
{
cudaSetDevice(device->id);
printf("Destroying device %d (%p)\n", device->id, device);
// Memory
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
cudaFree(device->vba.in[i]);
cudaFree(device->vba.out[i]);
}
cudaFree(device->reduce_scratchpad);
cudaFree(device->reduce_result);
#if PACKED_DATA_TRANSFERS
// Free data required for packed transfers here (cudaFree)
#endif
// Concurrency
for (int i = 0; i < NUM_STREAM_TYPES; ++i) {
cudaStreamDestroy(device->streams[i]);
}
// Destroy Device
free(device);
return AC_SUCCESS;
}
AcResult
acDevicePrintInfo(const Device device)
{
const int device_id = device->id;
@@ -136,305 +224,8 @@ printDeviceInfo(const Device device)
return AC_SUCCESS;
}
static __global__ void
dummy_kernel(void)
{
}
AcResult
createDevice(const int id, const AcMeshInfo device_config, Device* device_handle)
{
cudaSetDevice(id);
cudaDeviceReset();
// Create Device
struct device_s* device = (struct device_s*)malloc(sizeof(*device));
ERRCHK_ALWAYS(device);
device->id = id;
device->local_config = device_config;
// Check that the code was compiled for the proper GPU architecture
printf("Trying to run a dummy kernel. If this fails, make sure that your\n"
"device supports the CUDA architecture you are compiling for.\n"
"Running dummy kernel... ");
fflush(stdout);
dummy_kernel<<<1, 1>>>();
ERRCHK_CUDA_KERNEL_ALWAYS();
printf("Success!\n");
// Concurrency
for (int i = 0; i < NUM_STREAM_TYPES; ++i) {
cudaStreamCreateWithPriority(&device->streams[i], cudaStreamNonBlocking, 0);
}
// Memory
const size_t vba_size_bytes = acVertexBufferSizeBytes(device_config);
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.in[i], vba_size_bytes));
ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.out[i], vba_size_bytes));
}
ERRCHK_CUDA_ALWAYS(
cudaMalloc(&device->reduce_scratchpad, acVertexBufferCompdomainSizeBytes(device_config)));
ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->reduce_result, sizeof(AcReal)));
#if PACKED_DATA_TRANSFERS
// Allocate data required for packed transfers here (cudaMalloc)
#endif
// Device constants
ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_mesh_info, &device_config, sizeof(device_config), 0,
cudaMemcpyHostToDevice));
// Multi-GPU offset. This is used to compute globalVertexIdx.
// Might be better to calculate this in astaroth.cu instead of here, s.t.
// everything related to the decomposition is limited to the multi-GPU layer
const int3 multigpu_offset = (int3){0, 0, device->id * device->local_config.int_params[AC_nz]};
ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_multigpu_offset, &multigpu_offset,
sizeof(multigpu_offset), 0, cudaMemcpyHostToDevice));
printf("Created device %d (%p)\n", device->id, device);
*device_handle = device;
// Auto-optimize
if (id == 0)
autoOptimize(device);
return AC_SUCCESS;
}
AcResult
destroyDevice(Device device)
{
cudaSetDevice(device->id);
printf("Destroying device %d (%p)\n", device->id, device);
// Memory
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
cudaFree(device->vba.in[i]);
cudaFree(device->vba.out[i]);
}
cudaFree(device->reduce_scratchpad);
cudaFree(device->reduce_result);
#if PACKED_DATA_TRANSFERS
// Free data required for packed transfers here (cudaFree)
#endif
// Concurrency
for (int i = 0; i < NUM_STREAM_TYPES; ++i) {
cudaStreamDestroy(device->streams[i]);
}
// Destroy Device
free(device);
return AC_SUCCESS;
}
AcResult
boundcondStep(const Device device, const StreamType stream_type, const int3& start, const int3& end)
{
cudaSetDevice(device->id);
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
periodic_boundconds(device->streams[stream_type], start, end, device->vba.in[i]);
}
return AC_SUCCESS;
}
AcResult
reduceScal(const Device device, const StreamType stream_type, const ReductionType rtype,
const VertexBufferHandle vtxbuf_handle, AcReal* result)
{
cudaSetDevice(device->id);
const int3 start = (int3){device->local_config.int_params[AC_nx_min],
device->local_config.int_params[AC_ny_min],
device->local_config.int_params[AC_nz_min]};
const int3 end = (int3){device->local_config.int_params[AC_nx_max],
device->local_config.int_params[AC_ny_max],
device->local_config.int_params[AC_nz_max]};
*result = reduce_scal(device->streams[stream_type], rtype, start, end,
device->vba.in[vtxbuf_handle], device->reduce_scratchpad,
device->reduce_result);
return AC_SUCCESS;
}
AcResult
reduceVec(const Device device, const StreamType stream_type, const ReductionType rtype,
const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1,
const VertexBufferHandle vtxbuf2, AcReal* result)
{
cudaSetDevice(device->id);
const int3 start = (int3){device->local_config.int_params[AC_nx_min],
device->local_config.int_params[AC_ny_min],
device->local_config.int_params[AC_nz_min]};
const int3 end = (int3){device->local_config.int_params[AC_nx_max],
device->local_config.int_params[AC_ny_max],
device->local_config.int_params[AC_nz_max]};
*result = reduce_vec(device->streams[stream_type], rtype, start, end, device->vba.in[vtxbuf0],
device->vba.in[vtxbuf1], device->vba.in[vtxbuf2],
device->reduce_scratchpad, device->reduce_result);
return AC_SUCCESS;
}
AcResult
rkStep(const Device device, const StreamType stream_type, const int step_number, const int3& start,
const int3& end, const AcReal dt)
{
cudaSetDevice(device->id);
// const dim3 tpb(32, 1, 4);
const dim3 tpb = rk3_tpb;
const int3 n = end - start;
const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), //
(unsigned int)ceil(n.y / AcReal(tpb.y)), //
(unsigned int)ceil(n.z / AcReal(tpb.z)));
if (step_number == 0)
solve<0><<<bpg, tpb, 0, device->streams[stream_type]>>>(start, end, device->vba, dt);
else if (step_number == 1)
solve<1><<<bpg, tpb, 0, device->streams[stream_type]>>>(start, end, device->vba, dt);
else
solve<2><<<bpg, tpb, 0, device->streams[stream_type]>>>(start, end, device->vba, dt);
ERRCHK_CUDA_KERNEL();
return AC_SUCCESS;
}
AcResult
synchronize(const Device device, const StreamType stream_type)
{
cudaSetDevice(device->id);
if (stream_type == STREAM_ALL) {
cudaDeviceSynchronize();
}
else {
cudaStreamSynchronize(device->streams[stream_type]);
}
return AC_SUCCESS;
}
static AcResult
loadWithOffset(const Device device, const StreamType stream_type, const AcReal* src,
const size_t bytes, AcReal* dst)
{
cudaSetDevice(device->id);
ERRCHK_CUDA(
cudaMemcpyAsync(dst, src, bytes, cudaMemcpyHostToDevice, device->streams[stream_type]));
return AC_SUCCESS;
}
static AcResult
storeWithOffset(const Device device, const StreamType stream_type, const AcReal* src,
const size_t bytes, AcReal* dst)
{
cudaSetDevice(device->id);
ERRCHK_CUDA(
cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDeviceToHost, device->streams[stream_type]));
return AC_SUCCESS;
}
AcResult
copyMeshToDevice(const Device device, const StreamType stream_type, const AcMesh& host_mesh,
const int3& src, const int3& dst, const int num_vertices)
{
const size_t src_idx = acVertexBufferIdx(src.x, src.y, src.z, host_mesh.info);
const size_t dst_idx = acVertexBufferIdx(dst.x, dst.y, dst.z, device->local_config);
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
loadWithOffset(device, stream_type, &host_mesh.vertex_buffer[i][src_idx],
num_vertices * sizeof(AcReal), &device->vba.in[i][dst_idx]);
}
return AC_SUCCESS;
}
AcResult
copyMeshToHost(const Device device, const StreamType stream_type, const int3& src, const int3& dst,
const int num_vertices, AcMesh* host_mesh)
{
const size_t src_idx = acVertexBufferIdx(src.x, src.y, src.z, device->local_config);
const size_t dst_idx = acVertexBufferIdx(dst.x, dst.y, dst.z, host_mesh->info);
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
storeWithOffset(device, stream_type, &device->vba.in[i][src_idx],
num_vertices * sizeof(AcReal), &host_mesh->vertex_buffer[i][dst_idx]);
}
return AC_SUCCESS;
}
AcResult
copyMeshDeviceToDevice(const Device src_device, const StreamType stream_type, const int3& src,
Device dst_device, const int3& dst, const int num_vertices)
{
cudaSetDevice(src_device->id);
const size_t src_idx = acVertexBufferIdx(src.x, src.y, src.z, src_device->local_config);
const size_t dst_idx = acVertexBufferIdx(dst.x, dst.y, dst.z, dst_device->local_config);
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
ERRCHK_CUDA(cudaMemcpyPeerAsync(&dst_device->vba.in[i][dst_idx], dst_device->id,
&src_device->vba.in[i][src_idx], src_device->id,
sizeof(src_device->vba.in[i][0]) * num_vertices,
src_device->streams[stream_type]));
}
return AC_SUCCESS;
}
AcResult
swapBuffers(const Device device)
{
cudaSetDevice(device->id);
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
AcReal* tmp = device->vba.in[i];
device->vba.in[i] = device->vba.out[i];
device->vba.out[i] = tmp;
}
return AC_SUCCESS;
}
AcResult
loadDeviceConstant(const Device device, const StreamType stream_type, const AcIntParam param,
const int value)
{
cudaSetDevice(device->id);
// CUDA 10 apparently creates only a single name for a device constant (d_mesh_info here)
// and members such as d_mesh_info.int_params[] cannot be accessed directly.
// Therefore we have to obfuscate the code a bit and compute the offset address before
// invoking cudaMemcpyToSymbol.
const size_t offset = (size_t)&d_mesh_info.int_params[param] - (size_t)&d_mesh_info;
ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
cudaMemcpyHostToDevice,
device->streams[stream_type]));
return AC_SUCCESS;
}
AcResult
loadDeviceConstant(const Device device, const StreamType stream_type, const AcRealParam param,
const AcReal value)
{
cudaSetDevice(device->id);
const size_t offset = (size_t)&d_mesh_info.real_params[param] - (size_t)&d_mesh_info;
ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
cudaMemcpyHostToDevice,
device->streams[stream_type]));
return AC_SUCCESS;
}
AcResult
loadGlobalGrid(const Device device, const Grid grid)
{
cudaSetDevice(device->id);
ERRCHK_CUDA_ALWAYS(
cudaMemcpyToSymbol(globalGrid, &grid, sizeof(grid), 0, cudaMemcpyHostToDevice));
return AC_SUCCESS;
}
AcResult
autoOptimize(const Device device)
acDeviceAutoOptimize(const Device device)
{
cudaSetDevice(device->id);
@@ -513,12 +304,303 @@ autoOptimize(const Device device)
return AC_SUCCESS;
}
AcResult
acDeviceSynchronizeStream(const Device device, const Stream stream)
{
cudaSetDevice(device->id);
if (stream == STREAM_ALL) {
cudaDeviceSynchronize();
}
else {
cudaStreamSynchronize(device->streams[stream]);
}
return AC_SUCCESS;
}
AcResult
acDeviceSwapBuffers(const Device device)
{
cudaSetDevice(device->id);
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
AcReal* tmp = device->vba.in[i];
device->vba.in[i] = device->vba.out[i];
device->vba.out[i] = tmp;
}
return AC_SUCCESS;
}
AcResult
acDeviceLoadConstant(const Device device, const Stream stream, const AcRealParam param,
const AcReal value)
{
cudaSetDevice(device->id);
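// As in loadDeviceConstant above: CUDA exposes d_mesh_info only as a single device symbol,
// so we compute the member offset on the host and copy into the symbol at that offset.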
const size_t offset = (size_t)&d_mesh_info.real_params[param] - (size_t)&d_mesh_info;
ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
cudaMemcpyHostToDevice, device->streams[stream]));
return AC_SUCCESS;
}
AcResult
acDeviceLoadVertexBufferWithOffset(const Device device, const Stream stream, const AcMesh host_mesh,
const VertexBufferHandle vtxbuf_handle, const int3 src,
const int3 dst, const int num_vertices)
{
cudaSetDevice(device->id);
const size_t src_idx = acVertexBufferIdx(src.x, src.y, src.z, host_mesh.info);
const size_t dst_idx = acVertexBufferIdx(dst.x, dst.y, dst.z, device->local_config);
const AcReal* src_ptr = &host_mesh.vertex_buffer[vtxbuf_handle][src_idx];
AcReal* dst_ptr = &device->vba.in[vtxbuf_handle][dst_idx];
const size_t bytes = num_vertices * sizeof(src_ptr[0]);
ERRCHK_CUDA( //
cudaMemcpyAsync(dst_ptr, src_ptr, bytes, cudaMemcpyHostToDevice, device->streams[stream]) //
);
return AC_SUCCESS;
}
AcResult
acDeviceLoadMeshWithOffset(const Device device, const Stream stream, const AcMesh host_mesh,
const int3 src, const int3 dst, const int num_vertices)
{
WARNING("This function is deprecated");
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
acDeviceLoadVertexBufferWithOffset(device, stream, host_mesh, (VertexBufferHandle)i, src,
dst, num_vertices);
}
return AC_SUCCESS;
}
AcResult
acDeviceLoadVertexBuffer(const Device device, const Stream stream, const AcMesh host_mesh,
const VertexBufferHandle vtxbuf_handle)
{
const int3 src = (int3){0, 0, 0};
const int3 dst = src;
const size_t num_vertices = acVertexBufferSize(device->local_config);
acDeviceLoadVertexBufferWithOffset(device, stream, host_mesh, vtxbuf_handle, src, dst,
num_vertices);
return AC_SUCCESS;
}
AcResult
acDeviceLoadMesh(const Device device, const Stream stream, const AcMesh host_mesh)
{
WARNING("This function is deprecated");
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
acDeviceLoadVertexBuffer(device, stream, host_mesh, (VertexBufferHandle)i);
}
return AC_SUCCESS;
}
AcResult
acDeviceStoreVertexBufferWithOffset(const Device device, const Stream stream,
const VertexBufferHandle vtxbuf_handle, const int3 src,
const int3 dst, const int num_vertices, AcMesh* host_mesh)
{
cudaSetDevice(device->id);
const size_t src_idx = acVertexBufferIdx(src.x, src.y, src.z, device->local_config);
const size_t dst_idx = acVertexBufferIdx(dst.x, dst.y, dst.z, host_mesh->info);
const AcReal* src_ptr = &device->vba.in[vtxbuf_handle][src_idx];
AcReal* dst_ptr = &host_mesh->vertex_buffer[vtxbuf_handle][dst_idx];
const size_t bytes = num_vertices * sizeof(src_ptr[0]);
ERRCHK_CUDA( //
cudaMemcpyAsync(dst_ptr, src_ptr, bytes, cudaMemcpyDeviceToHost, device->streams[stream]) //
);
return AC_SUCCESS;
}
AcResult
acDeviceStoreMeshWithOffset(const Device device, const Stream stream, const int3 src,
const int3 dst, const int num_vertices, AcMesh* host_mesh)
{
WARNING("This function is deprecated");
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
acDeviceStoreVertexBufferWithOffset(device, stream, (VertexBufferHandle)i, src, dst,
num_vertices, host_mesh);
}
return AC_SUCCESS;
}
AcResult
acDeviceStoreVertexBuffer(const Device device, const Stream stream,
const VertexBufferHandle vtxbuf_handle, AcMesh* host_mesh)
{
int3 src = (int3){0, 0, 0};
int3 dst = src;
const size_t num_vertices = acVertexBufferSize(device->local_config);
acDeviceStoreVertexBufferWithOffset(device, stream, vtxbuf_handle, src, dst, num_vertices,
host_mesh);
return AC_SUCCESS;
}
AcResult
acDeviceStoreMesh(const Device device, const Stream stream, AcMesh* host_mesh)
{
WARNING("This function is deprecated");
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
acDeviceStoreVertexBuffer(device, stream, (VertexBufferHandle)i, host_mesh);
}
return AC_SUCCESS;
}
AcResult
acDeviceTransferVertexBufferWithOffset(const Device src_device, const Stream stream,
const VertexBufferHandle vtxbuf_handle, const int3 src,
const int3 dst, const int num_vertices, Device dst_device)
{
cudaSetDevice(src_device->id);
const size_t src_idx = acVertexBufferIdx(src.x, src.y, src.z, src_device->local_config);
const size_t dst_idx = acVertexBufferIdx(dst.x, dst.y, dst.z, dst_device->local_config);
const AcReal* src_ptr = &src_device->vba.in[vtxbuf_handle][src_idx];
AcReal* dst_ptr = &dst_device->vba.in[vtxbuf_handle][dst_idx];
const size_t bytes = num_vertices * sizeof(src_ptr[0]);
ERRCHK_CUDA(cudaMemcpyPeerAsync(dst_ptr, dst_device->id, src_ptr, src_device->id, bytes,
src_device->streams[stream]));
return AC_SUCCESS;
}
AcResult
acDeviceTransferMeshWithOffset(const Device src_device, const Stream stream, const int3 src,
const int3 dst, const int num_vertices, Device dst_device)
{
WARNING("This function is deprecated");
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
acDeviceTransferVertexBufferWithOffset(src_device, stream, (VertexBufferHandle)i, src, dst,
num_vertices, dst_device);
}
return AC_SUCCESS;
}
AcResult
acDeviceTransferVertexBuffer(const Device src_device, const Stream stream,
const VertexBufferHandle vtxbuf_handle, Device dst_device)
{
int3 src = (int3){0, 0, 0};
int3 dst = src;
const size_t num_vertices = acVertexBufferSize(src_device->local_config);
acDeviceTransferVertexBufferWithOffset(src_device, stream, vtxbuf_handle, src, dst,
num_vertices, dst_device);
return AC_SUCCESS;
}
AcResult
acDeviceTransferMesh(const Device src_device, const Stream stream, Device dst_device)
{
WARNING("This function is deprecated");
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
acDeviceTransferVertexBuffer(src_device, stream, (VertexBufferHandle)i, dst_device);
}
return AC_SUCCESS;
}
AcResult
acDeviceIntegrateSubstep(const Device device, const Stream stream, const int step_number,
const int3 start, const int3 end, const AcReal dt)
{
cudaSetDevice(device->id);
const dim3 tpb = rk3_tpb;
const int3 n = end - start;
const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), //
(unsigned int)ceil(n.y / AcReal(tpb.y)), //
(unsigned int)ceil(n.z / AcReal(tpb.z)));
if (step_number == 0)
solve<0><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba, dt);
else if (step_number == 1)
solve<1><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba, dt);
else
solve<2><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba, dt);
ERRCHK_CUDA_KERNEL();
return AC_SUCCESS;
}
AcResult
acDevicePeriodicBoundcondStep(const Device device, const Stream stream_type,
const VertexBufferHandle vtxbuf_handle, const int3 start,
const int3 end)
{
cudaSetDevice(device->id);
const cudaStream_t stream = device->streams[stream_type];
const dim3 tpb(8, 2, 8);
const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x),
(unsigned int)ceil((end.y - start.y) / (float)tpb.y),
(unsigned int)ceil((end.z - start.z) / (float)tpb.z));
kernel_periodic_boundconds<<<bpg, tpb, 0, stream>>>(start, end, device->vba.in[vtxbuf_handle]);
ERRCHK_CUDA_KERNEL();
return AC_SUCCESS;
}
AcResult
acDevicePeriodicBoundconds(const Device device, const Stream stream, const int3 start,
const int3 end)
{
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
acDevicePeriodicBoundcondStep(device, stream, (VertexBufferHandle)i, start, end);
}
return AC_SUCCESS;
}
AcResult
acDeviceReduceScal(const Device device, const Stream stream, const ReductionType rtype,
const VertexBufferHandle vtxbuf_handle, AcReal* result)
{
cudaSetDevice(device->id);
const int3 start = (int3){device->local_config.int_params[AC_nx_min],
device->local_config.int_params[AC_ny_min],
device->local_config.int_params[AC_nz_min]};
const int3 end = (int3){device->local_config.int_params[AC_nx_max],
device->local_config.int_params[AC_ny_max],
device->local_config.int_params[AC_nz_max]};
*result = reduce_scal(device->streams[stream], rtype, start, end, device->vba.in[vtxbuf_handle],
device->reduce_scratchpad, device->reduce_result);
return AC_SUCCESS;
}
AcResult
acDeviceReduceVec(const Device device, const Stream stream, const ReductionType rtype,
const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1,
const VertexBufferHandle vtxbuf2, AcReal* result)
{
cudaSetDevice(device->id);
const int3 start = (int3){device->local_config.int_params[AC_nx_min],
device->local_config.int_params[AC_ny_min],
device->local_config.int_params[AC_nz_min]};
const int3 end = (int3){device->local_config.int_params[AC_nx_max],
device->local_config.int_params[AC_ny_max],
device->local_config.int_params[AC_nz_max]};
*result = reduce_vec(device->streams[stream], rtype, start, end, device->vba.in[vtxbuf0],
device->vba.in[vtxbuf1], device->vba.in[vtxbuf2],
device->reduce_scratchpad, device->reduce_result);
return AC_SUCCESS;
}
#if PACKED_DATA_TRANSFERS
// Functions for calling packed data transfers
#endif
/*
* =============================================================================
* Revised interface
* =============================================================================
*/

View File

@@ -1,107 +0,0 @@
/*
Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae.
This file is part of Astaroth.
Astaroth is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Astaroth is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* @file
* \brief Brief info.
*
* Detailed info.
*
*/
#pragma once
#include "astaroth.h"
typedef struct {
int3 m;
int3 n;
} Grid;
typedef struct device_s* Device; // Opaque pointer to device_s. Analogous to dispatchable handles
// in Vulkan, f.ex. VkDevice
/** */
AcResult printDeviceInfo(const Device device);
/** */
AcResult createDevice(const int id, const AcMeshInfo device_config, Device* device);
/** */
AcResult destroyDevice(Device device);
/** */
AcResult boundcondStep(const Device device, const StreamType stream_type, const int3& start,
const int3& end);
/** */
AcResult reduceScal(const Device device, const StreamType stream_type, const ReductionType rtype,
const VertexBufferHandle vtxbuf_handle, AcReal* result);
/** */
AcResult reduceVec(const Device device, const StreamType stream_type, const ReductionType rtype,
const VertexBufferHandle vec0, const VertexBufferHandle vec1,
const VertexBufferHandle vec2, AcReal* result);
/** */
AcResult rkStep(const Device device, const StreamType stream_type, const int step_number,
const int3& start, const int3& end, const AcReal dt);
/** Synchronizes the device with respect to stream_type. If STREAM_ALL is given as
a StreamType, the function synchronizes all streams on the device. */
AcResult synchronize(const Device device, const StreamType stream_type);
/** */
AcResult copyMeshToDevice(const Device device, const StreamType stream_type,
const AcMesh& host_mesh, const int3& src, const int3& dst,
const int num_vertices);
/** */
AcResult copyMeshToHost(const Device device, const StreamType stream_type, const int3& src,
const int3& dst, const int num_vertices, AcMesh* host_mesh);
/** */
AcResult copyMeshDeviceToDevice(const Device src, const StreamType stream_type, const int3& src_idx,
Device dst, const int3& dst_idx, const int num_vertices);
/** Swaps the input/output buffers used in computations */
AcResult swapBuffers(const Device device);
/** */
AcResult loadDeviceConstant(const Device device, const StreamType stream_type,
const AcIntParam param, const int value);
/** */
AcResult loadDeviceConstant(const Device device, const StreamType stream_type,
const AcRealParam param, const AcReal value);
/** */
AcResult loadGlobalGrid(const Device device, const Grid grid);
/** */
AcResult autoOptimize(const Device device);
// #define PACKED_DATA_TRANSFERS (1) %JP: placeholder for optimized ghost zone packing and transfers
#if PACKED_DATA_TRANSFERS
// Declarations used for packed data transfers
#endif
/*
* =============================================================================
* Revised interface
* =============================================================================
*/

208
src/core/grid.cu Normal file
View File

@@ -0,0 +1,208 @@
/*
Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae.
This file is part of Astaroth.
Astaroth is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Astaroth is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
#include "astaroth_grid.h"
#include "astaroth_node.h"
const size_t MAX_NUM_NODES = 32;
size_t num_nodes = 0;
static Node nodes[MAX_NUM_NODES];
/** */
AcResult
acGridInit(const AcMeshInfo node_config)
{
acNodeCreate(0, node_config, &nodes[0]);
++num_nodes;
WARNING("Proper multinode not yet implemented");
return AC_FAILURE;
}
/** */
AcResult
acGridQuit(void)
{
acNodeDestroy(nodes[0]);
--num_nodes;
WARNING("Proper multinode not yet implemented");
return AC_FAILURE;
}
/** */
AcResult
acGridSynchronizeStream(const Stream stream)
{
for (int i = 0; i < num_nodes; ++i) {
acNodeSynchronizeStream(nodes[i], stream);
}
WARNING("Proper multinode not yet implemented");
return AC_FAILURE;
}
/** */
AcResult
acGridSwapBuffers(void)
{
for (int i = 0; i < num_nodes; ++i) {
acNodeSwapBuffers(nodes[i]);
}
WARNING("Proper multinode not yet implemented");
return AC_FAILURE;
}
/** */
AcResult
acGridLoadConstant(const Stream stream, const AcRealParam param, const AcReal value)
{
for (int i = 0; i < num_nodes; ++i) {
acNodeLoadConstant(nodes[i], stream, param, value);
}
WARNING("Proper multinode not yet implemented");
return AC_FAILURE;
}
/** */
AcResult
acGridLoadVertexBufferWithOffset(const Stream stream, const AcMesh host_mesh,
const VertexBufferHandle vtxbuf_handle, const int3 src,
const int3 dst, const int num_vertices)
{
for (int i = 0; i < num_nodes; ++i) {
acNodeLoadVertexBufferWithOffset(nodes[i], stream, host_mesh, vtxbuf_handle, src, dst, num_vertices);
}
WARNING("Proper multinode not yet implemented");
return AC_FAILURE;
}
/** */
AcResult
acGridLoadMeshWithOffset(const Stream stream, const AcMesh host_mesh, const int3 src,
const int3 dst, const int num_vertices)
{
for (int i = 0; i < num_nodes; ++i) {
acNodeLoadMeshWithOffset(nodes[i], stream, host_mesh, src, dst, num_vertices);
}
WARNING("Proper multinode not yet implemented");
return AC_FAILURE;
}
/** */
AcResult
acGridLoadVertexBuffer(const Stream stream, const AcMesh host_mesh,
const VertexBufferHandle vtxbuf_handle)
{
for (int i = 0; i < num_nodes; ++i) {
acNodeLoadVertexBuffer(nodes[i], stream, host_mesh, vtxbuf_handle);
}
WARNING("Proper multinode not yet implemented");
return AC_FAILURE;
}
/** */
AcResult
acGridLoadMesh(const Stream stream, const AcMesh host_mesh)
{
WARNING("Not implemented");
return AC_FAILURE;
}
/** */
AcResult
acGridStoreVertexBufferWithOffset(const Stream stream, const VertexBufferHandle vtxbuf_handle,
const int3 src, const int3 dst, const int num_vertices,
AcMesh* host_mesh)
{
WARNING("Not implemented");
return AC_FAILURE;
}
/** */
AcResult
acGridStoreMeshWithOffset(const Stream stream, const int3 src, const int3 dst,
const int num_vertices, AcMesh* host_mesh)
{
WARNING("Not implemented");
return AC_FAILURE;
}
/** */
AcResult
acGridStoreVertexBuffer(const Stream stream, const VertexBufferHandle vtxbuf_handle,
AcMesh* host_mesh)
{
WARNING("Not implemented");
return AC_FAILURE;
}
/** */
AcResult
acGridStoreMesh(const Stream stream, AcMesh* host_mesh)
{
WARNING("Not implemented");
return AC_FAILURE;
}
/** */
AcResult
acGridIntegrateSubstep(const Stream stream, const int step_number, const int3 start, const int3 end,
const AcReal dt)
{
WARNING("Not implemented");
return AC_FAILURE;
}
/** */
AcResult
acGridIntegrateStep(const Stream stream, const AcReal dt)
{
WARNING("Not implemented");
return AC_FAILURE;
}
/** */
AcResult
acGridPeriodicBoundcondStep(const Stream stream)
{
WARNING("Not implemented");
return AC_FAILURE;
}
#if 0
/** */
AcResult acGridReduceScal(const Stream stream, const ReductionType rtype,
const VertexBufferHandle vtxbuf_handle, AcReal* result);
/** */
AcResult acGridReduceVec(const Stream stream, const ReductionType rtype,
const VertexBufferHandle vec0, const VertexBufferHandle vec1,
const VertexBufferHandle vec2, AcReal* result);
#endif
/** */
AcResult
acGridReduceScal(const Stream stream, const ReductionType rtype,
const VertexBufferHandle vtxbuf_handle, AcReal* result)
{
WARNING("Not implemented");
return AC_FAILURE;
}
/** */
AcResult
acGridReduceVec(const Stream stream, const ReductionType rtype, const VertexBufferHandle vec0,
const VertexBufferHandle vec1, const VertexBufferHandle vec2, AcReal* result)
{
WARNING("Not implemented");
return AC_FAILURE;
}

818
src/core/node.cu Normal file
View File

@@ -0,0 +1,818 @@
/*
Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae.
This file is part of Astaroth.
Astaroth is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Astaroth is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* @file
* \brief Multi-GPU implementation.
*
%JP: The old way for computing boundary conditions conflicts with the
way we have to do things with multiple GPUs.
The older approach relied on unified memory, which represented the whole
memory area as one huge mesh instead of several smaller ones. However, unified memory
in its current state is meant more for quick prototyping when performance is not an issue.
Getting the CUDA driver to migrate data intelligently across GPUs is much more difficult
than managing the memory explicitly.
In this new approach, I have simplified the multi- and single-GPU layers significantly.
Quick rundown:
New struct: Grid. There are two global variables, "grid" and "subgrid", which
contain the extents of the whole simulation domain and the decomposed grids,
respectively. To simplify things, we require that each GPU is assigned the same amount of
work; therefore each GPU in the node is assigned a "subgrid.m"-sized block of data to
work with.
The whole simulation domain is decomposed with respect to the z dimension.
For example, if the grid contains (nx, ny, nz) vertices, then the subgrids
contain (nx, ny, nz / num_devices) vertices.
A local index (i, j, k) in some subgrid can be mapped to the global grid with
global idx = (i, j, k + device_id * subgrid.n.z)
Terminology:
- Single-GPU function: a function defined on the single-GPU layer (device.cu)
Changes required to this commented code block:
- The thread block dimensions (tpb) are no longer passed to the kernel here but in
device.cu instead. Same holds for any complex index calculations. Instead, the local
coordinates should be passed as an int3 type without having to consider how the data is
actually laid out in device memory
- The unified memory buffer no longer exists (d_buffer). Instead, we have an opaque
handle of type "Device" which should be passed to single-GPU functions. In this file, all
devices are stored in a global array "devices[num_devices]".
- Every single-GPU function is executed asynchronously by default such that we
can optimize Astaroth by executing memory transactions concurrently with
computation. Therefore a StreamType should be passed as a parameter to single-GPU functions.
Refresher: CUDA function calls are non-blocking when a stream is explicitly passed
as a parameter and commands executing in different streams can be processed
in parallel/concurrently.
Note on periodic boundaries (might be helpful when implementing other boundary conditions):
With multiple GPUs, periodic boundary conditions applied on indices ranging from
(0, 0, STENCIL_ORDER/2) to (subgrid.m.x, subgrid.m.y, subgrid.m.z -
STENCIL_ORDER/2)
on a single device are "local", in the sense that they can be computed without
having to exchange data with neighboring GPUs. Special care is needed only for transferring
the data to the front and back plates outside this range. In the solution we use
here, we solve the local boundaries first, and then just exchange the front and back plates
in a "ring", like so
device_id
(n) <-> 0 <-> 1 <-> ... <-> n <-> (0)
### Throughout this file we use the following notation and names for various index offsets
Global coordinates: coordinates with respect to the global grid (static Grid grid)
Local coordinates: coordinates with respect to the local subgrid (static Subgrid subgrid)
s0, s1: source indices in global coordinates
d0, d1: destination indices in global coordinates
da = max(s0, d0);
db = min(s1, d1);
These are used in at least
acLoad()
acStore()
acSynchronizeHalos()
Here we decompose the host mesh and distribute it among the GPUs in
the node.
The host mesh is a huge contiguous block of data. Its dimensions are given by
the global variable named "grid". A "grid" is decomposed into "subgrids",
one for each GPU. Here we check which parts of the range s0...s1 map
to the memory space stored by some GPU, ranging d0...d1, and transfer
the data if needed.
The index mapping is inherently quite involved, but here's a picture which
hopefully helps make sense out of all this.
Grid
|----num_vertices---|
xxx|....................................................|xxx
^ ^ ^ ^
d0 d1 s0 (src) s1
Subgrid
xxx|.............|xxx
^ ^
d0 d1
^ ^
db da
*
*/
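/* Illustrative example of the decomposition described above (numbers chosen purely for
   illustration): with grid.n = (128, 128, 128) and num_devices = 4, each subgrid has
   n = (128, 128, 32). A local index (i, j, k) on device 2 then maps to the global index
   (i, j, k + 2 * 32) = (i, j, k + 64). */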
#include "astaroth_node.h"
#include "astaroth_device.h"
#include "errchk.h"
#include "math_utils.h" // sum for reductions
static const int MAX_NUM_DEVICES = 32;
typedef struct {
int3 m;
int3 n;
} Grid;
struct node_s {
int id;
int num_devices;
Device devices[MAX_NUM_DEVICES];
Grid grid;
Grid subgrid;
AcMeshInfo config;
};
static int
gridIdx(const Grid grid, const int3 idx)
{
return idx.x + idx.y * grid.m.x + idx.z * grid.m.x * grid.m.y;
}
static int3
gridIdx3d(const Grid grid, const int idx)
{
return (int3){idx % grid.m.x, (idx % (grid.m.x * grid.m.y)) / grid.m.x,
idx / (grid.m.x * grid.m.y)};
}
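// Example (illustrative only): for grid.m = (8, 8, 8), gridIdx(grid, (int3){1, 2, 3}) ==
// 1 + 2*8 + 3*8*8 == 209, and gridIdx3d(grid, 209) == (1, 2, 3).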
static void
printInt3(const int3 vec)
{
printf("(%d, %d, %d)", vec.x, vec.y, vec.z);
}
static inline void
print(const AcMeshInfo config)
{
for (int i = 0; i < NUM_INT_PARAMS; ++i)
printf("[%s]: %d\n", intparam_names[i], config.int_params[i]);
for (int i = 0; i < NUM_REAL_PARAMS; ++i)
printf("[%s]: %g\n", realparam_names[i], double(config.real_params[i]));
}
static void
update_builtin_params(AcMeshInfo* config)
{
config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER;
///////////// PAD TEST
// config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER + PAD_SIZE;
///////////// PAD TEST
config->int_params[AC_my] = config->int_params[AC_ny] + STENCIL_ORDER;
config->int_params[AC_mz] = config->int_params[AC_nz] + STENCIL_ORDER;
// Bounds for the computational domain, i.e. nx_min <= i < nx_max
config->int_params[AC_nx_min] = NGHOST;
config->int_params[AC_nx_max] = config->int_params[AC_nx_min] + config->int_params[AC_nx];
config->int_params[AC_ny_min] = NGHOST;
config->int_params[AC_ny_max] = config->int_params[AC_ny] + NGHOST;
config->int_params[AC_nz_min] = NGHOST;
config->int_params[AC_nz_max] = config->int_params[AC_nz] + NGHOST;
/* Additional helper params */
// Int helpers
config->int_params[AC_mxy] = config->int_params[AC_mx] * config->int_params[AC_my];
config->int_params[AC_nxy] = config->int_params[AC_nx] * config->int_params[AC_ny];
config->int_params[AC_nxyz] = config->int_params[AC_nxy] * config->int_params[AC_nz];
}
static Grid
createGrid(const AcMeshInfo config)
{
Grid grid;
grid.m = (int3){config.int_params[AC_mx], config.int_params[AC_my], config.int_params[AC_mz]};
grid.n = (int3){config.int_params[AC_nx], config.int_params[AC_ny], config.int_params[AC_nz]};
return grid;
}
AcResult
acNodeCreate(const int id, const AcMeshInfo node_config, Node* node_handle)
{
struct node_s* node = (struct node_s*)malloc(sizeof(*node));
node->id = id;
node->config = node_config;
// Get node->num_devices
ERRCHK_CUDA_ALWAYS(cudaGetDeviceCount(&node->num_devices));
if (node->num_devices < 1) {
ERROR("No CUDA devices found!");
return AC_FAILURE;
}
if (node->num_devices > MAX_NUM_DEVICES) {
WARNING("More devices found than MAX_NUM_DEVICES. Using only MAX_NUM_DEVICES");
node->num_devices = MAX_NUM_DEVICES;
}
if (!AC_MULTIGPU_ENABLED) {
WARNING("MULTIGPU_ENABLED was false. Using only one device");
node->num_devices = 1; // Use only one device if multi-GPU is not enabled
}
// Check that node->num_devices is divisible with AC_nz. This makes decomposing the
// problem domain to multiple GPUs much easier since we do not have to worry
// about remainders
ERRCHK_ALWAYS(node->config.int_params[AC_nz] % node->num_devices == 0);
// Decompose the problem domain
// The main grid
node->grid = createGrid(node->config);
// Subgrids
AcMeshInfo subgrid_config = node->config;
subgrid_config.int_params[AC_nz] /= node->num_devices;
update_builtin_params(&subgrid_config);
#if VERBOSE_PRINTING // Defined in astaroth.h
printf("###############################################################\n");
printf("Config dimensions recalculated:\n");
print(subgrid_config);
printf("###############################################################\n");
#endif
node->subgrid = createGrid(subgrid_config);
// Periodic boundary conditions become weird if the system can "fold unto itself".
ERRCHK_ALWAYS(node->subgrid.n.x >= STENCIL_ORDER);
ERRCHK_ALWAYS(node->subgrid.n.y >= STENCIL_ORDER);
ERRCHK_ALWAYS(node->subgrid.n.z >= STENCIL_ORDER);
#if VERBOSE_PRINTING
// clang-format off
printf("Grid m "); printInt3(node->grid.m); printf("\n");
printf("Grid n "); printInt3(node->grid.n); printf("\n");
printf("Subrid m "); printInt3(node->subgrid.m); printf("\n");
printf("Subrid n "); printInt3(node->subgrid.n); printf("\n");
// clang-format on
#endif
// Initialize the devices
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) {
const int3 multinode_offset = (int3){0, 0, 0}; // Placeholder
const int3 multigpu_offset = (int3){0, 0, i * node->subgrid.n.z};
subgrid_config.int3_params[AC_global_grid_n] = node->grid.n;
subgrid_config.int3_params[AC_multigpu_offset] = multinode_offset + multigpu_offset;
acDeviceCreate(i, subgrid_config, &node->devices[i]);
acDevicePrintInfo(node->devices[i]);
}
// Enable peer access
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) {
const int front = (i + 1) % node->num_devices;
const int back = (i - 1 + node->num_devices) % node->num_devices;
int can_access_front, can_access_back;
cudaDeviceCanAccessPeer(&can_access_front, i, front);
cudaDeviceCanAccessPeer(&can_access_back, i, back);
#if VERBOSE_PRINTING
printf(
"Trying to enable peer access from %d to %d (can access: %d) and %d (can access: %d)\n",
i, front, can_access_front, back, can_access_back);
#endif
cudaSetDevice(i);
if (can_access_front) {
ERRCHK_CUDA_ALWAYS(cudaDeviceEnablePeerAccess(front, 0));
}
if (can_access_back) {
ERRCHK_CUDA_ALWAYS(cudaDeviceEnablePeerAccess(back, 0));
}
}
acNodeSynchronizeStream(node, STREAM_ALL);
*node_handle = node;
return AC_SUCCESS;
}
AcResult
acNodeDestroy(Node node)
{
acNodeSynchronizeStream(node, STREAM_ALL);
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) {
acDeviceDestroy(node->devices[i]);
}
free(node);
return AC_SUCCESS;
}
AcResult
acNodePrintInfo(const Node node)
{
(void)node;
WARNING("Not implemented");
return AC_FAILURE;
}
AcResult
acNodeQueryDeviceConfiguration(const Node node, DeviceConfiguration* config)
{
(void)node;
(void)config;
WARNING("Not implemented");
return AC_FAILURE;
}
AcResult
acNodeAutoOptimize(const Node node)
{
(void)node;
WARNING("Not implemented");
return AC_FAILURE;
}
AcResult
acNodeSynchronizeStream(const Node node, const Stream stream)
{
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) {
acDeviceSynchronizeStream(node->devices[i], stream);
}
return AC_SUCCESS;
}
AcResult
acNodeSynchronizeVertexBuffer(const Node node, const Stream stream,
const VertexBufferHandle vtxbuf_handle)
{
acNodeSynchronizeStream(node, stream);
// Exchanges the halos of subgrids
// After this step, the data within the main grid ranging from
// (0, 0, NGHOST) -> grid.m.x, grid.m.y, NGHOST + grid.n.z
// has been synchronized and transferred to appropriate subgrids
// We loop only to node->num_devices - 1 since the front and back plates of the grid are not
// transferred because their contents depend on the boundary conditions.
// IMPORTANT NOTE: the boundary conditions must be applied before
// calling this function! I.e. the halos of subgrids must contain
// up-to-date data!
const size_t num_vertices = node->subgrid.m.x * node->subgrid.m.y * NGHOST;
// #pragma omp parallel for
for (int i = 0; i < node->num_devices - 1; ++i) {
// ...|ooooxxx|... -> xxx|ooooooo|...
const int3 src = (int3){0, 0, node->subgrid.n.z};
const int3 dst = (int3){0, 0, 0};
const Device src_device = node->devices[i];
Device dst_device = node->devices[i + 1];
acDeviceTransferVertexBufferWithOffset(src_device, stream, vtxbuf_handle, src, dst,
num_vertices, dst_device);
}
// #pragma omp parallel for
for (int i = 1; i < node->num_devices; ++i) {
// ...|ooooooo|xxx <- ...|xxxoooo|...
const int3 src = (int3){0, 0, NGHOST};
const int3 dst = (int3){0, 0, NGHOST + node->subgrid.n.z};
const Device src_device = node->devices[i];
Device dst_device = node->devices[i - 1];
acDeviceTransferVertexBufferWithOffset(src_device, stream, vtxbuf_handle, src, dst,
num_vertices, dst_device);
}
return AC_SUCCESS;
}
AcResult
acNodeSynchronizeMesh(const Node node, const Stream stream)
{
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
acNodeSynchronizeVertexBuffer(node, stream, (VertexBufferHandle)i);
}
return AC_SUCCESS;
}
AcResult
acNodeSwapBuffers(const Node node)
{
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) {
acDeviceSwapBuffers(node->devices[i]);
}
return AC_SUCCESS;
}
AcResult
acNodeLoadConstant(const Node node, const Stream stream, const AcRealParam param,
const AcReal value)
{
acNodeSynchronizeStream(node, stream);
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) {
acDeviceLoadConstant(node->devices[i], stream, param, value);
}
return AC_SUCCESS;
}
AcResult
acNodeLoadVertexBufferWithOffset(const Node node, const Stream stream, const AcMesh host_mesh,
const VertexBufferHandle vtxbuf_handle, const int3 src,
const int3 dst, const int num_vertices)
{
acNodeSynchronizeStream(node, stream);
// See the beginning of the file for an explanation of the index mapping
// // #pragma omp parallel for
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) {
const int3 d0 = (int3){0, 0, i * node->subgrid.n.z}; // DECOMPOSITION OFFSET HERE
const int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y, d0.z + node->subgrid.m.z};
const int3 s0 = src; // dst; // TODO fix
(void)dst; // TODO fix
const int3 s1 = gridIdx3d(node->grid, gridIdx(node->grid, s0) + num_vertices);
const int3 da = max(s0, d0);
const int3 db = min(s1, d1);
/*
printf("Device %d\n", i);
printf("\ts0: "); printInt3(s0); printf("\n");
printf("\td0: "); printInt3(d0); printf("\n");
printf("\tda: "); printInt3(da); printf("\n");
printf("\tdb: "); printInt3(db); printf("\n");
printf("\td1: "); printInt3(d1); printf("\n");
printf("\ts1: "); printInt3(s1); printf("\n");
printf("\t-> %s to device %d\n", db.z >= da.z ? "Copy" : "Do not copy", i);
*/
if (db.z >= da.z) {
const int copy_cells = gridIdx(node->subgrid, db) - gridIdx(node->subgrid, da);
// DECOMPOSITION OFFSET HERE
const int3 da_global = da; // src + da - dst; // TODO fix
const int3 da_local = (int3){da.x, da.y, da.z - i * node->grid.n.z / node->num_devices};
// printf("\t\tcopy %d cells to local index ", copy_cells); printInt3(da_local);
// printf("\n");
acDeviceLoadVertexBufferWithOffset(node->devices[i], stream, host_mesh, vtxbuf_handle,
da_global, da_local, copy_cells);
}
// printf("\n");
}
return AC_SUCCESS;
}
AcResult
acNodeLoadMeshWithOffset(const Node node, const Stream stream, const AcMesh host_mesh,
const int3 src, const int3 dst, const int num_vertices)
{
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
acNodeLoadVertexBufferWithOffset(node, stream, host_mesh, (VertexBufferHandle)i, src, dst,
num_vertices);
}
return AC_SUCCESS;
}
AcResult
acNodeLoadVertexBuffer(const Node node, const Stream stream, const AcMesh host_mesh,
const VertexBufferHandle vtxbuf_handle)
{
const int3 src = (int3){0, 0, 0};
const int3 dst = src;
const size_t num_vertices = acVertexBufferSize(host_mesh.info);
acNodeLoadVertexBufferWithOffset(node, stream, host_mesh, vtxbuf_handle, src, dst,
num_vertices);
return AC_SUCCESS;
}
AcResult
acNodeLoadMesh(const Node node, const Stream stream, const AcMesh host_mesh)
{
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
acNodeLoadVertexBuffer(node, stream, host_mesh, (VertexBufferHandle)i);
}
return AC_SUCCESS;
}
AcResult
acNodeStoreVertexBufferWithOffset(const Node node, const Stream stream,
const VertexBufferHandle vtxbuf_handle, const int3 src,
const int3 dst, const int num_vertices, AcMesh* host_mesh)
{
acNodeSynchronizeStream(node, stream);
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) {
const int3 d0 = (int3){0, 0, i * node->subgrid.n.z}; // DECOMPOSITION OFFSET HERE
const int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y, d0.z + node->subgrid.m.z};
const int3 s0 = src; // TODO fix
(void)dst; // TODO fix
const int3 s1 = gridIdx3d(node->grid, gridIdx(node->grid, s0) + num_vertices);
const int3 da = max(s0, d0);
const int3 db = min(s1, d1);
if (db.z >= da.z) {
const int copy_cells = gridIdx(node->subgrid, db) - gridIdx(node->subgrid, da);
// DECOMPOSITION OFFSET HERE
const int3 da_local = (int3){da.x, da.y, da.z - i * node->grid.n.z / node->num_devices};
const int3 da_global = da; // dst + da - src; // TODO fix
acDeviceStoreVertexBufferWithOffset(node->devices[i], stream, vtxbuf_handle, da_local,
da_global, copy_cells, host_mesh);
}
}
return AC_SUCCESS;
}
AcResult
acNodeStoreMeshWithOffset(const Node node, const Stream stream, const int3 src, const int3 dst,
const int num_vertices, AcMesh* host_mesh)
{
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
acNodeStoreVertexBufferWithOffset(node, stream, (VertexBufferHandle)i, src, dst,
num_vertices, host_mesh);
}
return AC_SUCCESS;
}
AcResult
acNodeStoreVertexBuffer(const Node node, const Stream stream,
const VertexBufferHandle vtxbuf_handle, AcMesh* host_mesh)
{
const int3 src = (int3){0, 0, 0};
const int3 dst = src;
const size_t num_vertices = acVertexBufferSize(host_mesh->info);
acNodeStoreVertexBufferWithOffset(node, stream, vtxbuf_handle, src, dst, num_vertices,
host_mesh);
return AC_SUCCESS;
}
AcResult
acNodeStoreMesh(const Node node, const Stream stream, AcMesh* host_mesh)
{
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
acNodeStoreVertexBuffer(node, stream, (VertexBufferHandle)i, host_mesh);
}
return AC_SUCCESS;
}
AcResult
acNodeIntegrateSubstep(const Node node, const Stream stream, const int isubstep, const int3 start,
const int3 end, const AcReal dt)
{
acNodeSynchronizeStream(node, stream);
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) {
// DECOMPOSITION OFFSET HERE
const int3 d0 = (int3){NGHOST, NGHOST, NGHOST + i * node->subgrid.n.z};
const int3 d1 = d0 + (int3){node->subgrid.n.x, node->subgrid.n.y, node->subgrid.n.z};
const int3 da = max(start, d0);
const int3 db = min(end, d1);
if (db.z >= da.z) {
const int3 da_local = da - (int3){0, 0, i * node->subgrid.n.z};
const int3 db_local = db - (int3){0, 0, i * node->subgrid.n.z};
acDeviceIntegrateSubstep(node->devices[i], stream, isubstep, da_local, db_local, dt);
}
}
return AC_SUCCESS;
}
static AcResult
local_boundcondstep(const Node node, const Stream stream, const VertexBufferHandle vtxbuf)
{
acNodeSynchronizeStream(node, stream);
if (node->num_devices > 1) {
// Local boundary conditions
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) {
const int3 d0 = (int3){0, 0, NGHOST}; // DECOMPOSITION OFFSET HERE
const int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y, d0.z + node->subgrid.n.z};
acDevicePeriodicBoundcondStep(node->devices[i], stream, vtxbuf, d0, d1);
}
}
else {
acDevicePeriodicBoundcondStep(node->devices[0], stream, vtxbuf, (int3){0, 0, 0},
node->subgrid.m);
}
return AC_SUCCESS;
}
static AcResult
global_boundcondstep(const Node node, const Stream stream, const VertexBufferHandle vtxbuf_handle)
{
acNodeSynchronizeStream(node, stream);
if (node->num_devices > 1) {
const size_t num_vertices = node->subgrid.m.x * node->subgrid.m.y * NGHOST;
{
// ...|ooooxxx|... -> xxx|ooooooo|...
const int3 src = (int3){0, 0, node->subgrid.n.z};
const int3 dst = (int3){0, 0, 0};
const Device src_device = node->devices[node->num_devices - 1];
Device dst_device = node->devices[0];
acDeviceTransferVertexBufferWithOffset(src_device, stream, vtxbuf_handle, src, dst,
num_vertices, dst_device);
}
{
// ...|ooooooo|xxx <- ...|xxxoooo|...
const int3 src = (int3){0, 0, NGHOST};
const int3 dst = (int3){0, 0, NGHOST + node->subgrid.n.z};
const Device src_device = node->devices[0];
Device dst_device = node->devices[node->num_devices - 1];
acDeviceTransferVertexBufferWithOffset(src_device, stream, vtxbuf_handle, src, dst,
num_vertices, dst_device);
}
}
return AC_SUCCESS;
}
AcResult
acNodeIntegrate(const Node node, const AcReal dt)
{
acNodeSynchronizeStream(node, STREAM_ALL);
// xxx|OOO OOOOOOOOO OOO|xxx
// ^ ^ ^ ^
// n0 n1 n2 n3
// const int3 n0 = (int3){NGHOST, NGHOST, NGHOST};
// const int3 n1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
// const int3 n2 = node->grid.n;
// const int3 n3 = n0 + node->grid.n;
for (int isubstep = 0; isubstep < 3; ++isubstep) {
acNodeSynchronizeStream(node, STREAM_ALL);
for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {
local_boundcondstep(node, (Stream)vtxbuf, (VertexBufferHandle)vtxbuf);
}
acNodeSynchronizeStream(node, STREAM_ALL);
// Inner inner
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) {
const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
const int3 m2 = node->subgrid.n;
acDeviceIntegrateSubstep(node->devices[i], STREAM_16, isubstep, m1, m2, dt);
}
for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {
acNodeSynchronizeVertexBuffer(node, (Stream)vtxbuf, (VertexBufferHandle)vtxbuf);
global_boundcondstep(node, (Stream)vtxbuf, (VertexBufferHandle)vtxbuf);
}
for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {
acNodeSynchronizeStream(node, (Stream)vtxbuf);
}
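// The six segments below tile the outer shell of the computational domain (thickness NGHOST
// in each direction), which depends on the ghost-zone data exchanged and synchronized above,
// whereas the inner region could already be integrated concurrently with that exchange.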
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) { // Front
const int3 m1 = (int3){NGHOST, NGHOST, NGHOST};
const int3 m2 = m1 + (int3){node->subgrid.n.x, node->subgrid.n.y, NGHOST};
acDeviceIntegrateSubstep(node->devices[i], STREAM_0, isubstep, m1, m2, dt);
}
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) { // Back
const int3 m1 = (int3){NGHOST, NGHOST, node->subgrid.n.z};
const int3 m2 = m1 + (int3){node->subgrid.n.x, node->subgrid.n.y, NGHOST};
acDeviceIntegrateSubstep(node->devices[i], STREAM_1, isubstep, m1, m2, dt);
}
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) { // Bottom
const int3 m1 = (int3){NGHOST, NGHOST, 2 * NGHOST};
const int3 m2 = m1 + (int3){node->subgrid.n.x, NGHOST, node->subgrid.n.z - 2 * NGHOST};
acDeviceIntegrateSubstep(node->devices[i], STREAM_2, isubstep, m1, m2, dt);
}
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) { // Top
const int3 m1 = (int3){NGHOST, node->subgrid.n.y, 2 * NGHOST};
const int3 m2 = m1 + (int3){node->subgrid.n.x, NGHOST, node->subgrid.n.z - 2 * NGHOST};
acDeviceIntegrateSubstep(node->devices[i], STREAM_3, isubstep, m1, m2, dt);
}
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) { // Left
const int3 m1 = (int3){NGHOST, 2 * NGHOST, 2 * NGHOST};
const int3 m2 = m1 + (int3){NGHOST, node->subgrid.n.y - 2 * NGHOST,
node->subgrid.n.z - 2 * NGHOST};
acDeviceIntegrateSubstep(node->devices[i], STREAM_4, isubstep, m1, m2, dt);
}
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) { // Right
const int3 m1 = (int3){node->subgrid.n.x, 2 * NGHOST, 2 * NGHOST};
const int3 m2 = m1 + (int3){NGHOST, node->subgrid.n.y - 2 * NGHOST,
node->subgrid.n.z - 2 * NGHOST};
acDeviceIntegrateSubstep(node->devices[i], STREAM_5, isubstep, m1, m2, dt);
}
acNodeSwapBuffers(node);
}
acNodeSynchronizeStream(node, STREAM_ALL);
return AC_SUCCESS;
}
AcResult
acNodePeriodicBoundcondStep(const Node node, const Stream stream,
const VertexBufferHandle vtxbuf_handle)
{
local_boundcondstep(node, stream, vtxbuf_handle);
acNodeSynchronizeVertexBuffer(node, stream, vtxbuf_handle);
// TODO NOTE GLOBAL BOUNDCONDS NOT DONE HERE IF MORE THAN 1 NODE
global_boundcondstep(node, stream, vtxbuf_handle);
// WARNING("Global boundconds should not be done here with multinode");
return AC_SUCCESS;
}
AcResult
acNodePeriodicBoundconds(const Node node, const Stream stream)
{
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
acNodePeriodicBoundcondStep(node, stream, (VertexBufferHandle)i);
}
return AC_SUCCESS;
}
static AcReal
simple_final_reduce_scal(const Node node, const ReductionType& rtype, const AcReal* results,
const int& n)
{
AcReal res = results[0];
for (int i = 1; i < n; ++i) {
if (rtype == RTYPE_MAX) {
res = max(res, results[i]);
}
else if (rtype == RTYPE_MIN) {
res = min(res, results[i]);
}
else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP || rtype == RTYPE_SUM) {
res = sum(res, results[i]);
}
else {
ERROR("Invalid rtype");
}
}
if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
const AcReal inv_n = AcReal(1.) / (node->grid.n.x * node->grid.n.y * node->grid.n.z);
res = sqrt(inv_n * res);
}
return res;
}
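// Illustrative example (numbers for illustration only, assuming each device returns its partial
// sum of squares for RTYPE_RMS): with two devices reporting 6.0 and 10.0 over a global grid of
// 4 points, the combined result is sqrt((6.0 + 10.0) / 4) == 2.0.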
AcResult
acNodeReduceScal(const Node node, const Stream stream, const ReductionType rtype,
const VertexBufferHandle vtxbuf_handle, AcReal* result)
{
acNodeSynchronizeStream(node, STREAM_ALL);
AcReal results[node->num_devices];
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) {
acDeviceReduceScal(node->devices[i], stream, rtype, vtxbuf_handle, &results[i]);
}
*result = simple_final_reduce_scal(node, rtype, results, node->num_devices);
return AC_SUCCESS;
}
AcResult
acNodeReduceVec(const Node node, const Stream stream, const ReductionType rtype,
const VertexBufferHandle a, const VertexBufferHandle b, const VertexBufferHandle c,
AcReal* result)
{
acNodeSynchronizeStream(node, STREAM_ALL);
AcReal results[node->num_devices];
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) {
acDeviceReduceVec(node->devices[i], stream, rtype, a, b, c, &results[i]);
}
*result = simple_final_reduce_scal(node, rtype, results, node->num_devices);
return AC_SUCCESS;
}

View File

@@ -29,13 +29,13 @@
#include <stdio.h>
#include "config_loader.h"
#include "src/core/math_utils.h"
#include "model/host_forcing.h"
#include "model/host_memory.h"
#include "model/host_timestep.h"
#include "model/model_boundconds.h"
#include "model/model_reduce.h"
#include "model/model_rk3.h"
#include "src/core/math_utils.h"
#include "src/core/errchk.h"
@@ -431,8 +431,9 @@ check_rk3(const AcMeshInfo& mesh_info)
acIntegrate(dt);
model_rk3(dt, model_mesh);
boundconds(model_mesh->info, model_mesh);
}
boundconds(model_mesh->info, model_mesh);
acBoundcondStep();
acStore(gpu_mesh);
bool is_acceptable = verify_meshes(*model_mesh, *gpu_mesh);
@@ -726,6 +727,7 @@ run_autotest(void)
// Device integration step
acIntegrate(dt);
acBoundcondStep();
acStore(candidate_mesh);
// Check fields

View File

@@ -197,7 +197,6 @@ run_benchmark(void)
double(results[int(NTH_PERCENTILE * NUM_ITERS)]), int(NTH_PERCENTILE * 100),
mesh_info.int_params[AC_nx]);
acStore(mesh);
acQuit();
acmesh_destroy(mesh);
}
@@ -269,7 +268,6 @@ run_benchmark(void)
write_result_to_file(wallclock * 1e3f / steps);
#endif
acStore(mesh);
acQuit();
acmesh_destroy(mesh);

View File

@@ -43,6 +43,9 @@ typedef struct {
ModelScalar value;
ModelVector gradient;
ModelMatrix hessian;
#if LUPWD
ModelVector upwind;
#endif
} ModelScalarData;
typedef struct {
@@ -273,6 +276,53 @@ derzz(const int i, const int j, const int k, const ModelScalar* arr)
return second_derivative(pencil, get(AC_inv_dsz));
}
#if LUPWD
static inline ModelScalar
der6x_upwd(const int i, const int j, const int k, const ModelScalar* arr)
{
ModelScalar inv_ds = get(AC_inv_dsx);
return ModelScalar(1.0/60.0)*inv_ds* (
-ModelScalar( 20.0)* arr[IDX(i, j, k)]
+ModelScalar( 15.0)*(arr[IDX(i+1, j, k)]
+ arr[IDX(i-1, j, k)])
-ModelScalar( 6.0)*(arr[IDX(i+2, j, k)]
+ arr[IDX(i-2, j, k)])
+ arr[IDX(i+3, j, k)]
+ arr[IDX(i-3, j, k)]);
}
static inline ModelScalar
der6y_upwd(const int i, const int j, const int k, const ModelScalar* arr)
{
ModelScalar inv_ds = get(AC_inv_dsy);
return ModelScalar(1.0/60.0)*inv_ds* (
-ModelScalar( 20.0)* arr[IDX(i, j, k)]
+ModelScalar( 15.0)*(arr[IDX(i, j+1, k)]
+ arr[IDX(i, j-1, k)])
-ModelScalar( 6.0)*(arr[IDX(i, j+2, k)]
+ arr[IDX(i, j-2, k)])
+ arr[IDX(i, j+3, k)]
+ arr[IDX(i, j-3, k)]);
}
static inline ModelScalar
der6z_upwd(const int i, const int j, const int k, const ModelScalar* arr)
{
ModelScalar inv_ds = get(AC_inv_dsz);
return ModelScalar(1.0/60.0)*inv_ds* (
-ModelScalar( 20.0)* arr[IDX(i, j, k )]
+ModelScalar( 15.0)*(arr[IDX(i, j, k+1)]
+ arr[IDX(i, j, k-1)])
-ModelScalar( 6.0)*(arr[IDX(i, j, k+2)]
+ arr[IDX(i, j, k-2)])
+ arr[IDX(i, j, k+3)]
+ arr[IDX(i, j, k-3)]);
}
#endif
static inline ModelScalar
compute_value(const int i, const int j, const int k, const ModelScalar* arr)
{
@@ -285,6 +335,14 @@ compute_gradient(const int i, const int j, const int k, const ModelScalar* arr)
return (ModelVector){derx(i, j, k, arr), dery(i, j, k, arr), derz(i, j, k, arr)};
}
#if LUPWD
static inline ModelVector
compute_upwind(const int i, const int j, const int k, const ModelScalar* arr)
{
return (ModelVector){der6x_upwd(i, j, k, arr), der6y_upwd(i, j, k, arr), der6z_upwd(i, j, k, arr)};
}
#endif
static inline ModelMatrix
compute_hessian(const int i, const int j, const int k, const ModelScalar* arr)
{
@@ -309,6 +367,10 @@ read_data(const int i, const int j, const int k, ModelScalar* buf[], const int h
// diagonals with all arrays
data.hessian = compute_hessian(i, j, k, buf[handle]);
#if LUPWD
data.upwind = compute_upwind(i, j, k, buf[handle]);
#endif
return data;
}
@@ -354,6 +416,8 @@ gradients(const ModelVectorData& data)
return (ModelMatrix){gradient(data.x), gradient(data.y), gradient(data.z)};
}
/*
* =============================================================================
* Level 0.3 (Built-in functions available during the Stencil Processing Stage)
@@ -502,10 +566,27 @@ contract(const ModelMatrix& mat)
* Stencil Processing Stage (equations)
* =============================================================================
*/
#if LUPWD
ModelScalar
upwd_der6(const ModelVectorData& uu, const ModelScalarData& lnrho)
{
ModelScalar uux = fabs(value(uu).x);
ModelScalar uuy = fabs(value(uu).y);
ModelScalar uuz = fabs(value(uu).z);
return uux*lnrho.upwind.x + uuy*lnrho.upwind.y + uuz*lnrho.upwind.z;
}
#endif
static inline ModelScalar
continuity(const ModelVectorData& uu, const ModelScalarData& lnrho)
{
return -dot(value(uu), gradient(lnrho)) - divergence(uu);
return -dot(value(uu), gradient(lnrho))
#if LUPWD
//This is a corrective hyperdiffusion term for upwinding.
+ upwd_der6(uu, lnrho)
#endif
- divergence(uu);
}
static inline ModelScalar
@@ -679,23 +760,24 @@ helical_forcing(ModelScalar magnitude, ModelVector k_force, ModelVector xx, Mode
ModelVector ff_im, ModelScalar phi)
{
(void)magnitude; // WARNING: unused
xx.x = xx.x * (2.0l * M_PI / (get(AC_dsx) * (get(AC_ny_max) - get(AC_ny_min))));
xx.y = xx.y * (2.0l * M_PI / (get(AC_dsy) * (get(AC_ny_max) - get(AC_ny_min))));
xx.z = xx.z * (2.0l * M_PI / (get(AC_dsz) * (get(AC_ny_max) - get(AC_ny_min))));
xx.x = xx.x*(2.0*M_PI/(get(AC_dsx)*get(AC_nx)));
xx.y = xx.y*(2.0*M_PI/(get(AC_dsy)*get(AC_ny)));
xx.z = xx.z*(2.0*M_PI/(get(AC_dsz)*get(AC_nz)));
ModelScalar cosl_phi = cosl(phi);
ModelScalar sinl_phi = sinl(phi);
ModelScalar cosl_k_dox_x = cosl(dot(k_force, xx));
ModelScalar sinl_k_dox_x = sinl(dot(k_force, xx));
ModelScalar cos_phi = cosl(phi);
ModelScalar sin_phi = sinl(phi);
ModelScalar cos_k_dot_x = cosl(dot(k_force, xx));
ModelScalar sin_k_dot_x = sinl(dot(k_force, xx));
// Phase affects only the x-component
// ModelScalar real_comp = cosl_k_dox_x;
// ModelScalar imag_comp = sinl_k_dox_x;
ModelScalar real_comp_phase = cosl_k_dox_x * cosl_phi - sinl_k_dox_x * sinl_phi;
ModelScalar imag_comp_phase = cosl_k_dox_x * sinl_phi + sinl_k_dox_x * cosl_phi;
//Scalar real_comp = cos_k_dot_x;
//Scalar imag_comp = sin_k_dot_x;
ModelScalar real_comp_phase = cos_k_dot_x*cos_phi - sin_k_dot_x*sin_phi;
ModelScalar imag_comp_phase = cos_k_dot_x*sin_phi + sin_k_dot_x*cos_phi;
ModelVector force = (ModelVector){ff_re.x * real_comp_phase - ff_im.x * imag_comp_phase,
ff_re.y * real_comp_phase - ff_im.y * imag_comp_phase,
ff_re.z * real_comp_phase - ff_im.z * imag_comp_phase};
ModelVector force = (ModelVector){ ff_re.x*real_comp_phase - ff_im.x*imag_comp_phase,
ff_re.y*real_comp_phase - ff_im.y*imag_comp_phase,
ff_re.z*real_comp_phase - ff_im.z*imag_comp_phase};
return force;
}

View File

@@ -32,13 +32,13 @@
#include <string.h> // memcpy
#include "config_loader.h"
#include "src/core/errchk.h"
#include "src/core/math_utils.h"
#include "model/host_forcing.h"
#include "model/host_memory.h"
#include "model/host_timestep.h"
#include "model/model_reduce.h"
#include "model/model_rk3.h"
#include "src/core/errchk.h"
#include "src/core/math_utils.h"
#include "timer_hires.h"
// Window
@@ -106,6 +106,7 @@ renderer_init(const int& mx, const int& my)
// Setup window
window = SDL_CreateWindow("Astaroth", SDL_WINDOWPOS_UNDEFINED, SDL_WINDOWPOS_UNDEFINED,
window_width, window_height, SDL_WINDOW_SHOWN);
ERRCHK_ALWAYS(window);
// Setup SDL renderer
renderer = SDL_CreateRenderer(window, -1, SDL_RENDERER_ACCELERATED);
@@ -409,9 +410,10 @@ run_renderer(void)
/* Render */
const float timer_diff_sec = timer_diff_nsec(frame_timer) / 1e9f;
if (timer_diff_sec >= desired_frame_time) {
// acStore(mesh);
const int num_vertices = mesh->info.int_params[AC_mxy];
const int3 dst = (int3){0, 0, k_slice};
acBoundcondStep();
// acStore(mesh);
acStoreWithOffset(dst, num_vertices, mesh);
acSynchronizeStream(STREAM_ALL);
renderer_draw(*mesh); // Bottleneck is here
@@ -421,7 +423,6 @@ run_renderer(void)
}
printf("Wallclock time %f s\n", double(timer_diff_nsec(wallclock) / 1e9f));
acStore(mesh);
acQuit();
acmesh_destroy(mesh);

View File

@@ -27,13 +27,13 @@
#include "run.h"
#include "config_loader.h"
#include "src/core/errchk.h"
#include "src/core/math_utils.h"
#include "model/host_forcing.h"
#include "model/host_memory.h"
#include "model/host_timestep.h"
#include "model/model_reduce.h"
#include "model/model_rk3.h"
#include "src/core/errchk.h"
#include "src/core/math_utils.h"
#include "timer_hires.h"
#include <string.h>
@@ -216,6 +216,7 @@ run_simulation(void)
write_mesh_info(&mesh_info);
print_diagnostics(0, AcReal(.0), t_step, diag_file);
acBoundcondStep();
acStore(mesh);
save_mesh(*mesh, 0, t_step);
@@ -229,13 +230,13 @@ run_simulation(void)
/* initialize random seed: */
srand(312256655);
//TODO_SINK. init_sink_particle()
// TODO_SINK. init_sink_particle()
// Initialize the basic variables of the sink particle to a suitable initial value.
// 1. Location of the particle
// 2. Mass of the particle
// (3. Velocity of the particle)
// This at the level of Host in this case.
// acUpdate_sink_particle() will do the similar trick to the device.
// This at the level of Host in this case.
// acUpdate_sink_particle() will do the similar trick to the device.
/* Step the simulation */
for (int i = 1; i < max_steps; ++i) {
@@ -247,28 +248,28 @@ run_simulation(void)
loadForcingParamsToDevice(forcing_params);
#endif
//TODO_SINK acUpdate_sink_particle()
// Update properties of the sing particle for acIntegrate(). Essentially:
// 1. Location of the particle
// 2. Mass of the particle
// (3. Velocity of the particle)
// 3. Velocity of the particle)
// These can be used for calculating he gravitational field.
// This is my first comment! by Jack
// This is my second comment! by Jack
acIntegrate(dt);
//TODO_SINK acAdvect_sink_particle()
// THIS IS OPTIONAL. We will start from unmoving particle.
// 1. Calculate the equation of motion for the sink particle.
// NOTE: Might require embedding with acIntegrate(dt).
// TODO_SINK acAdvect_sink_particle()
// THIS IS OPTIONAL. We will start from unmoving particle.
// 1. Calculate the equation of motion for the sink particle.
// NOTE: Might require embedding with acIntegrate(dt).
//TODO_SINK acAccrete_sink_particle()
// Calculate accretion of the sink particle from the surrounding medium
// 1. Transfer density into sink particle mass
// 2. Transfer momentum into sink particle
// (OPTIONAL: Affection the motion of the particle)
// NOTE: Might require embedding with acIntegrate(dt).
// This is the hardest part. Please see Lee et al. ApJ 783 (2014) for reference.
// TODO_SINK acAccrete_sink_particle()
// Calculate accretion of the sink particle from the surrounding medium
// 1. Transfer density into sink particle mass
// 2. Transfer momentum into sink particle
// (OPTIONAL: Affection the motion of the particle)
// NOTE: Might require embedding with acIntegrate(dt).
// This is the hardest part. Please see Lee et al. ApJ 783 (2014) for reference.
t_step += dt;
@@ -311,9 +312,10 @@ run_simulation(void)
assumed to be asynchronous, so the meshes must be also synchronized
before transferring the data to the CPU. Like so:
acSynchronize();
acBoundcondStep();
acStore(mesh);
*/
acBoundcondStep();
acStore(mesh);
save_mesh(*mesh, i, t_step);