From efd9d54fef90f52dcdf9c2a449799b6d5f1165e6 Mon Sep 17 00:00:00 2001 From: jpekkila Date: Tue, 30 Jul 2019 14:34:44 +0300 Subject: [PATCH 01/37] Stashing WIP changes (interface revision) s.t. I can continue work on a different machine --- include/astaroth.h | 92 +-------------------------- include/astaroth_defines.h | 6 +- include/astaroth_device.h | 124 +++++++++++++++++++++++++++++++++++++ include/astaroth_grid.h | 107 ++++++++++++++++++++++++++++++++ include/astaroth_node.h | 123 ++++++++++++++++++++++++++++++++++++ src/core/astaroth.cu | 2 + src/core/device.cuh | 107 -------------------------------- src/core/grid.cu | 0 src/core/node.cu | 22 +++++++ 9 files changed, 382 insertions(+), 201 deletions(-) create mode 100644 include/astaroth_device.h create mode 100644 include/astaroth_grid.h create mode 100644 include/astaroth_node.h delete mode 100644 src/core/device.cuh create mode 100644 src/core/grid.cu create mode 100644 src/core/node.cu diff --git a/include/astaroth.h b/include/astaroth.h index 52ba3d0..1aef514 100644 --- a/include/astaroth.h +++ b/include/astaroth.h @@ -24,95 +24,9 @@ extern "C" { #include "astaroth_defines.h" -/** Checks whether there are any CUDA devices available. Returns AC_SUCCESS if there is 1 or more, - * AC_FAILURE otherwise. */ -AcResult acCheckDeviceAvailability(void); - -/** Synchronizes the stream shared by all GPUs in the node. Synchronizes all streams if STREAM_ALL - * passed as a parameter */ -AcResult acSynchronizeStream(const StreamType stream); - -/** Synchronizes the mesh distributed across multiple GPUs. Must be called if the data in the halos - * of neighboring GPUs has been modified by an asynchronous function, f.ex. acBoundcondStep() */ -AcResult acSynchronizeMesh(void); - -/** Starting point of all GPU computation. Handles the allocation and -initialization of *all memory needed on all GPUs in the node*. 
In other words, -setups everything GPU-side so that calling any other GPU interface function -afterwards does not result in illegal memory accesses. */ -AcResult acInit(const AcMeshInfo mesh_info); - -/** Frees all GPU allocations and resets all devices in the node. Should be - * called at exit. */ -AcResult acQuit(void); - -/** Does all three substeps of the RK3 integration and computes the boundary -conditions when necessary. The result is synchronized and the boundary conditions are applied -after the final substep, after which the result can be fetched to CPU memory with acStore. */ -AcResult acIntegrate(const AcReal dt); - -/** Performs a scalar reduction on all GPUs in the node and returns the result. Operates on the - * whole computational domain, which must be up to date and synchronized before calling - * acReduceScal. - */ -AcReal acReduceScal(const ReductionType rtype, const VertexBufferHandle a); - -/** Performs a vector reduction on all GPUs in the node and returns the result. Operates on the - * whole computational domain, which must be up to date and synchronized before calling - * acReduceVec. - */ -AcReal acReduceVec(const ReductionType rtype, const VertexBufferHandle a, - const VertexBufferHandle b, const VertexBufferHandle c); - -/** Distributes the host mesh among the GPUs in the node. Synchronous. */ -AcResult acLoad(const AcMesh host_mesh); - -/** Gathers the mesh stored across GPUs in the node and stores it back to host memory. Synchronous. - */ -AcResult acStore(AcMesh* host_mesh); - -/* - * ============================================================================= - * Astaroth interface: Advanced functions. Asynchronous. - * ============================================================================= - */ -/** Loads a parameter to the constant memory of all GPUs in the node. Asynchronous. 
*/ -AcResult acLoadDeviceConstant(const AcRealParam param, const AcReal value); -AcResult acLoadDeviceConstantAsync(const AcRealParam param, const AcReal value, - const StreamType stream); - -/** Splits a subset of the host_mesh and distributes it among the GPUs in the node. Asynchronous. */ -AcResult acLoadWithOffset(const AcMesh host_mesh, const int3 start, const int num_vertices); -AcResult acLoadWithOffsetAsync(const AcMesh host_mesh, const int3 start, const int num_vertices, - const StreamType stream); - -/** Gathers a subset of the data distributed among the GPUs in the node and stores the mesh back to - * CPU memory. Asynchronous. - */ -AcResult acStoreWithOffset(const int3 start, const int num_vertices, AcMesh* host_mesh); -AcResult acStoreWithOffsetAsync(const int3 start, const int num_vertices, AcMesh* host_mesh, - const StreamType stream); - -/** Performs a single RK3 step without computing boundary conditions. Asynchronous.*/ -AcResult acIntegrateStep(const int isubstep, const AcReal dt); -AcResult acIntegrateStepAsync(const int isubstep, const AcReal dt, const StreamType stream); - -/** Performs a single RK3 step on a subset of the mesh without computing the boundary conditions. - * Asynchronous.*/ -AcResult acIntegrateStepWithOffset(const int isubstep, const AcReal dt, const int3 start, - const int3 end); -AcResult acIntegrateStepWithOffsetAsync(const int isubstep, const AcReal dt, const int3 start, - const int3 end, const StreamType stream); - -/** Performs the boundary condition step on the GPUs in the node. Asynchronous. 
*/ -AcResult acBoundcondStep(void); -AcResult acBoundcondStepAsync(const StreamType stream); - -/* - * ============================================================================= - * Revised interface - * ============================================================================= - */ +#include "astaroth_device.h" +#include "astaroth_grid.h" +#include "astaroth_node.h" #ifdef __cplusplus } // extern "C" diff --git a/include/astaroth_defines.h b/include/astaroth_defines.h index 73a25d9..7aa9751 100644 --- a/include/astaroth_defines.h +++ b/include/astaroth_defines.h @@ -104,11 +104,7 @@ typedef enum { NUM_REDUCTION_TYPES } ReductionType; -typedef enum { - STREAM_DEFAULT, - NUM_STREAM_TYPES, // - STREAM_ALL -} StreamType; +typedef enum { STREAM_DEFAULT, NUM_STREAM_TYPES } Stream; #define AC_GEN_ID(X) X typedef enum { diff --git a/include/astaroth_device.h b/include/astaroth_device.h new file mode 100644 index 0000000..d98ffcb --- /dev/null +++ b/include/astaroth_device.h @@ -0,0 +1,124 @@ +/* + Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae. + + This file is part of Astaroth. + + Astaroth is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Astaroth is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Astaroth. If not, see . +*/ +#pragma once + +#ifdef __cplusplus +extern "C" { +#endif + +#include "astaroth_defines.h" + +typedef struct device_s* Device; // Opaque pointer to device_s. Analogous to dispatchable handles + // in Vulkan, f.ex. 
VkDevice + +/** */ +AcResult acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device); + +/** */ +AcResult acDeviceDestroy(Device device); + +/** */ +AcResult acDevicePrintInfo(const Device device); + +/** */ +AcResult acDeviceAutoOptimize(const Device device); + +/** */ +AcResult acDeviceSynchronizeStream(const Device device, const Stream stream); + +/** */ +AcResult acDeviceSwapBuffers(const Device device); + +/** */ +AcResult acDeviceLoadConstant(const Device device, const Stream stream, const AcRealParam param, + const AcReal value); + +/** */ +AcResult acDeviceLoadVertexBufferWithOffset(const Device device, const Stream stream, + const AcMesh host_mesh, + const VertexBufferHandle vtxbuf_handle, const int3 src, + const int3 dst, const int num_vertices); + +/** */ +AcResult acDeviceLoadMeshWithOffset(const Device device, const Stream stream, + const AcMesh host_mesh, const int3 src, const int3 dst, + const int num_vertices); + +/** */ +AcResult acDeviceLoadVertexBuffer(const Device device, const Stream stream, const AcMesh host_mesh, + const VertexBufferHandle vtxbuf_handle); + +/** */ +AcResult acDeviceLoadMesh(const Device device, const Stream stream, const AcMesh host_mesh); + +/** */ +AcResult acDeviceStoreVertexBufferWithOffset(const Device device, const Stream stream, + const VertexBufferHandle vtxbuf_handle, const int3 src, + const int3 dst, const int num_vertices, + AcMesh* host_mesh); + +/** */ +AcResult acDeviceStoreMeshWithOffset(const Device device, const Stream stream, const int3 src, + const int3 dst, const int num_vertices, AcMesh* host_mesh); + +/** */ +AcResult acDeviceStoreVertexBuffer(const Device device, const Stream stream, + const VertexBufferHandle vtxbuf_handle, AcMesh* host_mesh); + +/** */ +AcResult acDeviceStoreMesh(const Device device, const Stream stream, AcMesh* host_mesh); + +/** */ +AcResult acDeviceTransferVertexBufferWithOffset(const Device src_device, const Stream stream, + const VertexBufferHandle 
vtxbuf_handle, + const int3 src, const int3 dst, + const int num_vertices, Device* dst_device); + +/** */ +AcResult acDeviceTransferMeshWithOffset(const Device src_device, const Stream stream, + const int3 src, const int3 dst, const int num_vertices, + Device* dst_device); + +/** */ +AcResult acDeviceTransferVertexBuffer(const Device src_device, const Stream stream, + const VertexBufferHandle vtxbuf_handle, Device* dst_device); + +/** */ +AcResult acDeviceTransferMesh(const Device src_device, const Stream stream, Device* dst_device); + +/** */ +AcResult acDeviceIntegrateSubstep(const Device device, const StreamType stream_type, + const int step_number, const int3 start, const int3 end, + const AcReal dt); +/** */ +AcResult acDevicePeriodicBoundcondStep(const Device device, const StreamType stream_type, + const int3 start, const int3 end); +/** */ +AcResult acDeviceReduceScal(const Device device, const StreamType stream_type, + const ReductionType rtype, const VertexBufferHandle vtxbuf_handle, + AcReal* result); +/** */ +AcResult acDeviceReduceVec(const Device device, const StreamType stream_type, + const ReductionType rtype, const VertexBufferHandle vec0, + const VertexBufferHandle vec1, const VertexBufferHandle vec2, + AcReal* result); + +#ifdef __cplusplus +} // extern "C" +#endif diff --git a/include/astaroth_grid.h b/include/astaroth_grid.h new file mode 100644 index 0000000..4200343 --- /dev/null +++ b/include/astaroth_grid.h @@ -0,0 +1,107 @@ +/* + Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae. + + This file is part of Astaroth. + + Astaroth is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + Astaroth is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Astaroth. If not, see . +*/ +#pragma once + +#ifdef __cplusplus +extern "C" { +#endif + +#include "astaroth_defines.h" + +/** */ +AcResult acGridInit(const AcMeshInfo node_config); + +/** */ +AcResult acGridQuit(void); + +/** */ +AcResult acGridSynchronize(void); + +/** */ +AcResult acGridSwapBuffers(void); + +/** */ +AcResult acGridLoadConstant(const Stream stream, const AcRealParam param, const AcReal value); + +/** */ +AcResult acGridLoadVertexBufferWithOffset(const Stream stream, const AcMesh host_mesh, + const VertexBufferHandle vtxbuf_handle, const int3 src, + const int3 dst, const int num_vertices); + +/** */ +AcResult acGridLoadMeshWithOffset(const Stream stream, const AcMesh host_mesh, const int3 src, + const int3 dst, const int num_vertices); + +/** */ +AcResult acGridLoadVertexBuffer(const Stream stream, const AcMesh host_mesh, + const VertexBufferHandle vtxbuf_handle); + +/** */ +AcResult acGridLoadMesh(const Stream stream, const AcMesh host_mesh); + +/** */ +AcResult acGridStoreVertexBufferWithOffset(const Stream stream, + const VertexBufferHandle vtxbuf_handle, const int3 src, + const int3 dst, const int num_vertices, + AcMesh* host_mesh); + +/** */ +AcResult acGridStoreMeshWithOffset(const Stream stream, const int3 src, const int3 dst, + const int num_vertices, AcMesh* host_mesh); + +/** */ +AcResult acGridStoreVertexBuffer(const Stream stream, const VertexBufferHandle vtxbuf_handle, + AcMesh* host_mesh); + +/** */ +AcResult acGridStoreMesh(const Stream stream, AcMesh* host_mesh); + +/** */ +AcResult acGridTransferVertexBufferWithOffset(const Stream stream, + const VertexBufferHandle vtxbuf_handle, + const int3 src, const int3 dst, + 
const int num_vertices); + +/** */ +AcResult acGridTransferMeshWithOffset(const Stream stream, const int3 src, const int3 dst, + const int num_vertices); + +/** */ +AcResult acGridTransferVertexBuffer(const Stream stream, const VertexBufferHandle vtxbuf_handle); + +/** */ +AcResult acGridTransferMesh(const Stream stream); + +/** */ +AcResult acGridIntegrateSubstep(const StreamType stream_type, const int step_number, + const int3 start, const int3 end, const AcReal dt); +/** */ +AcResult acGridPeriodicBoundcondStep(const StreamType stream_type, const int3 start, + const int3 end); +/** */ +AcResult acGridReduceScal(const StreamType stream_type, const ReductionType rtype, + const VertexBufferHandle vtxbuf_handle, AcReal* result); +/** */ +AcResult acGridReduceVec(const StreamType stream_type, const ReductionType rtype, + const VertexBufferHandle vec0, const VertexBufferHandle vec1, + const VertexBufferHandle vec2, AcReal* result); + +#ifdef __cplusplus +} // extern "C" +#endif diff --git a/include/astaroth_node.h b/include/astaroth_node.h new file mode 100644 index 0000000..83e0a45 --- /dev/null +++ b/include/astaroth_node.h @@ -0,0 +1,123 @@ +/* + Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae. + + This file is part of Astaroth. + + Astaroth is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Astaroth is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Astaroth. If not, see . 
+*/ +#pragma once + +#ifdef __cplusplus +extern "C" { +#endif + +#include "astaroth_defines.h" + +typedef struct node_s* Node; + +/** */ +AcResult acNodeCreate(const AcMeshInfo node_config, Node* node); + +/** */ +AcResult acNodeDestroy(Node node); + +/** */ +AcResult acNodeQueryDeviceConfiguration(const Node node, DeviceConfiguration* config); + +/** */ +AcResult acNodeSynchronizeStream(const Node node, const Stream stream); + +/** */ +AcResult acNodeSynchronizeVertexBuffer(const Node node, const Stream stream, + const VertexBufferHandle vtxbuf_handle); + +/** */ +AcResult acNodeSynchronizeMesh(const Node node, const Stream stream); + +/** */ +AcResult acNodeSwapBuffers(const Node node); + +/** */ +AcResult acNodeLoadConstant(const Node node, const Stream stream, const AcRealParam param, + const AcReal value); + +/** */ +AcResult acNodeLoadVertexBufferWithOffset(const Node node, const Stream stream, + const AcMesh host_mesh, + const VertexBufferHandle vtxbuf_handle, const int3 src, + const int3 dst, const int num_vertices); + +/** */ +AcResult acNodeLoadMeshWithOffset(const Node node, const Stream stream, const AcMesh host_mesh, + const int3 src, const int3 dst, const int num_vertices); + +/** */ +AcResult acNodeLoadVertexBuffer(const Node node, const Stream stream, const AcMesh host_mesh, + const VertexBufferHandle vtxbuf_handle); + +/** */ +AcResult acNodeLoadMesh(const Node node, const Stream stream, const AcMesh host_mesh); + +/** */ +AcResult acNodeStoreVertexBufferWithOffset(const Node node, const Stream stream, + const VertexBufferHandle vtxbuf_handle, const int3 src, + const int3 dst, const int num_vertices, + AcMesh* host_mesh); + +/** */ +AcResult acNodeStoreMeshWithOffset(const Node node, const Stream stream, const int3 src, + const int3 dst, const int num_vertices, AcMesh* host_mesh); + +/** */ +AcResult acNodeStoreVertexBuffer(const Node node, const Stream stream, + const VertexBufferHandle vtxbuf_handle, AcMesh* host_mesh); + +/** */ +AcResult 
acNodeStoreMesh(const Node node, const Stream stream, AcMesh* host_mesh); + +/** */ +AcResult acNodeTransferVertexBufferWithOffset(const Node src_node, const Stream stream, + const VertexBufferHandle vtxbuf_handle, + const int3 src, const int3 dst, + const int num_vertices, Node* dst_node); + +/** */ +AcResult acNodeTransferMeshWithOffset(const Node src_node, const Stream stream, const int3 src, + const int3 dst, const int num_vertices, Node* dst_node); + +/** */ +AcResult acNodeTransferVertexBuffer(const Node src_node, const Stream stream, + const VertexBufferHandle vtxbuf_handle, Node* dst_node); + +/** */ +AcResult acNodeTransferMesh(const Node src_node, const Stream stream, Node* dst_node); + +/** */ +AcResult acNodeIntegrateSubstep(const Node node, const StreamType stream_type, + const int step_number, const int3 start, const int3 end, + const AcReal dt); +/** */ +AcResult acNodePeriodicBoundcondStep(const Node node, const StreamType stream_type, + const int3 start, const int3 end); +/** */ +AcResult acNodeReduceScal(const Node node, const StreamType stream_type, const ReductionType rtype, + const VertexBufferHandle vtxbuf_handle, AcReal* result); +/** */ +AcResult acNodeReduceVec(const Node node, const StreamType stream_type, const ReductionType rtype, + const VertexBufferHandle vec0, const VertexBufferHandle vec1, + const VertexBufferHandle vec2, AcReal* result); + +#ifdef __cplusplus +} // extern "C" +#endif diff --git a/src/core/astaroth.cu b/src/core/astaroth.cu index 736956b..1cb9f1e 100644 --- a/src/core/astaroth.cu +++ b/src/core/astaroth.cu @@ -671,3 +671,5 @@ acLoadDeviceConstant(const AcRealParam param, const AcReal value) * Revised interface * ============================================================================= */ +======= +>>>>>>> Stashed changes diff --git a/src/core/device.cuh b/src/core/device.cuh deleted file mode 100644 index 7f1fad4..0000000 --- a/src/core/device.cuh +++ /dev/null @@ -1,107 +0,0 @@ -/* - Copyright (C) 2014-2019, 
Johannes Pekkilae, Miikka Vaeisalae. - - This file is part of Astaroth. - - Astaroth is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - Astaroth is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with Astaroth. If not, see . -*/ - -/** - * @file - * \brief Brief info. - * - * Detailed info. - * - */ -#pragma once -#include "astaroth.h" - -typedef struct { - int3 m; - int3 n; -} Grid; - -typedef struct device_s* Device; // Opaque pointer to device_s. Analogous to dispatchable handles - // in Vulkan, f.ex. VkDevice - -/** */ -AcResult printDeviceInfo(const Device device); - -/** */ -AcResult createDevice(const int id, const AcMeshInfo device_config, Device* device); - -/** */ -AcResult destroyDevice(Device device); - -/** */ -AcResult boundcondStep(const Device device, const StreamType stream_type, const int3& start, - const int3& end); - -/** */ -AcResult reduceScal(const Device device, const StreamType stream_type, const ReductionType rtype, - const VertexBufferHandle vtxbuf_handle, AcReal* result); - -/** */ -AcResult reduceVec(const Device device, const StreamType stream_type, const ReductionType rtype, - const VertexBufferHandle vec0, const VertexBufferHandle vec1, - const VertexBufferHandle vec2, AcReal* result); - -/** */ -AcResult rkStep(const Device device, const StreamType stream_type, const int step_number, - const int3& start, const int3& end, const AcReal dt); - -/** Sychronizes the device with respect to stream_type. If STREAM_ALL is given as - a StreamType, the function synchronizes all streams on the device. 
*/ -AcResult synchronize(const Device device, const StreamType stream_type); - -/** */ -AcResult copyMeshToDevice(const Device device, const StreamType stream_type, - const AcMesh& host_mesh, const int3& src, const int3& dst, - const int num_vertices); - -/** */ -AcResult copyMeshToHost(const Device device, const StreamType stream_type, const int3& src, - const int3& dst, const int num_vertices, AcMesh* host_mesh); - -/** */ -AcResult copyMeshDeviceToDevice(const Device src, const StreamType stream_type, const int3& src_idx, - Device dst, const int3& dst_idx, const int num_vertices); - -/** Swaps the input/output buffers used in computations */ -AcResult swapBuffers(const Device device); - -/** */ -AcResult loadDeviceConstant(const Device device, const StreamType stream_type, - const AcIntParam param, const int value); - -/** */ -AcResult loadDeviceConstant(const Device device, const StreamType stream_type, - const AcRealParam param, const AcReal value); - -/** */ -AcResult loadGlobalGrid(const Device device, const Grid grid); - -/** */ -AcResult autoOptimize(const Device device); - -// #define PACKED_DATA_TRANSFERS (1) %JP: placeholder for optimized ghost zone packing and transfers -#if PACKED_DATA_TRANSFERS -// Declarations used for packed data transfers -#endif - -/* - * ============================================================================= - * Revised interface - * ============================================================================= - */ diff --git a/src/core/grid.cu b/src/core/grid.cu new file mode 100644 index 0000000..e69de29 diff --git a/src/core/node.cu b/src/core/node.cu new file mode 100644 index 0000000..c3bfc05 --- /dev/null +++ b/src/core/node.cu @@ -0,0 +1,22 @@ +/* + Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae. + + This file is part of Astaroth. 
+ + Astaroth is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Astaroth is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Astaroth. If not, see . +*/ +// #include "astaroth_node.h" + +struct node_s { +}; From 5be775dbff99e516c17efdb78a3f8b423f97d039 Mon Sep 17 00:00:00 2001 From: jpekkila Date: Wed, 31 Jul 2019 17:48:48 +0300 Subject: [PATCH 02/37] Various intermediate changes --- include/astaroth_device.h | 21 +- include/astaroth_grid.h | 11 +- include/astaroth_node.h | 13 +- src/core/CMakeLists.txt | 2 +- src/core/astaroth.cu | 658 +------------------------------------- src/core/device.cu | 502 +---------------------------- src/core/grid.cu | 19 ++ src/core/node.cu | 2 +- 8 files changed, 43 insertions(+), 1185 deletions(-) diff --git a/include/astaroth_device.h b/include/astaroth_device.h index d98ffcb..48c8db2 100644 --- a/include/astaroth_device.h +++ b/include/astaroth_device.h @@ -103,21 +103,18 @@ AcResult acDeviceTransferVertexBuffer(const Device src_device, const Stream stre AcResult acDeviceTransferMesh(const Device src_device, const Stream stream, Device* dst_device); /** */ -AcResult acDeviceIntegrateSubstep(const Device device, const StreamType stream_type, - const int step_number, const int3 start, const int3 end, - const AcReal dt); +AcResult acDeviceIntegrateSubstep(const Device device, const Stream stream, const int step_number, + const int3 start, const int3 end, const AcReal dt); /** */ -AcResult acDevicePeriodicBoundcondStep(const Device device, const StreamType stream_type, - const int3 start, const int3 
end); +AcResult acDevicePeriodicBoundcondStep(const Device device, const Stream stream, const int3 start, + const int3 end); /** */ -AcResult acDeviceReduceScal(const Device device, const StreamType stream_type, - const ReductionType rtype, const VertexBufferHandle vtxbuf_handle, - AcReal* result); +AcResult acDeviceReduceScal(const Device device, const Stream stream, const ReductionType rtype, + const VertexBufferHandle vtxbuf_handle, AcReal* result); /** */ -AcResult acDeviceReduceVec(const Device device, const StreamType stream_type, - const ReductionType rtype, const VertexBufferHandle vec0, - const VertexBufferHandle vec1, const VertexBufferHandle vec2, - AcReal* result); +AcResult acDeviceReduceVec(const Device device, const Stream stream, const ReductionType rtype, + const VertexBufferHandle vec0, const VertexBufferHandle vec1, + const VertexBufferHandle vec2, AcReal* result); #ifdef __cplusplus } // extern "C" diff --git a/include/astaroth_grid.h b/include/astaroth_grid.h index 4200343..7dfe323 100644 --- a/include/astaroth_grid.h +++ b/include/astaroth_grid.h @@ -89,16 +89,15 @@ AcResult acGridTransferVertexBuffer(const Stream stream, const VertexBufferHandl AcResult acGridTransferMesh(const Stream stream); /** */ -AcResult acGridIntegrateSubstep(const StreamType stream_type, const int step_number, - const int3 start, const int3 end, const AcReal dt); +AcResult acGridIntegrateSubstep(const Stream stream, const int step_number, const int3 start, + const int3 end, const AcReal dt); /** */ -AcResult acGridPeriodicBoundcondStep(const StreamType stream_type, const int3 start, - const int3 end); +AcResult acGridPeriodicBoundcondStep(const Stream stream, const int3 start, const int3 end); /** */ -AcResult acGridReduceScal(const StreamType stream_type, const ReductionType rtype, +AcResult acGridReduceScal(const Stream stream, const ReductionType rtype, const VertexBufferHandle vtxbuf_handle, AcReal* result); /** */ -AcResult acGridReduceVec(const StreamType 
stream_type, const ReductionType rtype, +AcResult acGridReduceVec(const Stream stream, const ReductionType rtype, const VertexBufferHandle vec0, const VertexBufferHandle vec1, const VertexBufferHandle vec2, AcReal* result); diff --git a/include/astaroth_node.h b/include/astaroth_node.h index 83e0a45..216db25 100644 --- a/include/astaroth_node.h +++ b/include/astaroth_node.h @@ -104,17 +104,16 @@ AcResult acNodeTransferVertexBuffer(const Node src_node, const Stream stream, AcResult acNodeTransferMesh(const Node src_node, const Stream stream, Node* dst_node); /** */ -AcResult acNodeIntegrateSubstep(const Node node, const StreamType stream_type, - const int step_number, const int3 start, const int3 end, - const AcReal dt); +AcResult acNodeIntegrateSubstep(const Node node, const Stream stream, const int step_number, + const int3 start, const int3 end, const AcReal dt); /** */ -AcResult acNodePeriodicBoundcondStep(const Node node, const StreamType stream_type, - const int3 start, const int3 end); +AcResult acNodePeriodicBoundcondStep(const Node node, const Stream stream, const int3 start, + const int3 end); /** */ -AcResult acNodeReduceScal(const Node node, const StreamType stream_type, const ReductionType rtype, +AcResult acNodeReduceScal(const Node node, const Stream stream, const ReductionType rtype, const VertexBufferHandle vtxbuf_handle, AcReal* result); /** */ -AcResult acNodeReduceVec(const Node node, const StreamType stream_type, const ReductionType rtype, +AcResult acNodeReduceVec(const Node node, const Stream stream, const ReductionType rtype, const VertexBufferHandle vec0, const VertexBufferHandle vec1, const VertexBufferHandle vec2, AcReal* result); diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index 5cbc271..6054444 100644 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -3,7 +3,7 @@ ######################################## ## Find packages -find_package(CUDA 9 REQUIRED) +find_package(CUDA REQUIRED) ## Architecture and 
optimization flags set(CUDA_ARCH_FLAGS -gencode arch=compute_37,code=sm_37 diff --git a/src/core/astaroth.cu b/src/core/astaroth.cu index 1cb9f1e..1ef56d4 100644 --- a/src/core/astaroth.cu +++ b/src/core/astaroth.cu @@ -16,660 +16,4 @@ You should have received a copy of the GNU General Public License along with Astaroth. If not, see . */ - -/** - * @file - * \brief Multi-GPU implementation. - * - %JP: The old way for computing boundary conditions conflicts with the - way we have to do things with multiple GPUs. - - The older approach relied on unified memory, which represented the whole - memory area as one huge mesh instead of several smaller ones. However, unified memory - in its current state is more meant for quick prototyping when performance is not an issue. - Getting the CUDA driver to migrate data intelligently across GPUs is much more difficult - than when managing the memory explicitly. - - In this new approach, I have simplified the multi- and single-GPU layers significantly. - Quick rundown: - New struct: Grid. There are two global variables, "grid" and "subgrid", which - contain the extents of the whole simulation domain and the decomposed grids, - respectively. To simplify thing, we require that each GPU is assigned the same amount of - work, therefore each GPU in the node is assigned and "subgrid.m" -sized block of data to - work with. - - The whole simulation domain is decomposed with respect to the z dimension. - For example, if the grid contains (nx, ny, nz) vertices, then the subgrids - contain (nx, ny, nz / num_devices) vertices. - - An local index (i, j, k) in some subgrid can be mapped to the global grid with - global idx = (i, j, k + device_id * subgrid.n.z) - - Terminology: - - Single-GPU function: a function defined on the single-GPU layer (device.cu) - - Changes required to this commented code block: - - The thread block dimensions (tpb) are no longer passed to the kernel here but in - device.cu instead. 
Same holds for any complex index calculations. Instead, the local - coordinates should be passed as an int3 type without having to consider how the data is - actually laid out in device memory - - The unified memory buffer no longer exists (d_buffer). Instead, we have an opaque - handle of type "Device" which should be passed to single-GPU functions. In this file, all - devices are stored in a global array "devices[num_devices]". - - Every single-GPU function is executed asynchronously by default such that we - can optimize Astaroth by executing memory transactions concurrently with - computation. Therefore a StreamType should be passed as a parameter to single-GPU functions. - Refresher: CUDA function calls are non-blocking when a stream is explicitly passed - as a parameter and commands executing in different streams can be processed - in parallel/concurrently. - - - Note on periodic boundaries (might be helpful when implementing other boundary conditions): - - With multiple GPUs, periodic boundary conditions applied on indices ranging from - - (0, 0, STENCIL_ORDER/2) to (subgrid.m.x, subgrid.m.y, subgrid.m.z - - STENCIL_ORDER/2) - - on a single device are "local", in the sense that they can be computed without - having to exchange data with neighboring GPUs. Special care is needed only for transferring - the data to the fron and back plates outside this range. In the solution we use - here, we solve the local boundaries first, and then just exchange the front and back plates - in a "ring", like so - device_id - (n) <-> 0 <-> 1 <-> ... 
<-> n <-> (0) - -### Throughout this file we use the following notation and names for various index offsets - - Global coordinates: coordinates with respect to the global grid (static Grid grid) - Local coordinates: coordinates with respect to the local subgrid (static Subgrid subgrid) - - s0, s1: source indices in global coordinates - d0, d1: destination indices in global coordinates - da = max(s0, d0); - db = min(s1, d1); - - These are used in at least - acLoad() - acStore() - acSynchronizeHalos() - - Here we decompose the host mesh and distribute it among the GPUs in - the node. - - The host mesh is a huge contiguous block of data. Its dimensions are given by - the global variable named "grid". A "grid" is decomposed into "subgrids", - one for each GPU. Here we check which parts of the range s0...s1 maps - to the memory space stored by some GPU, ranging d0...d1, and transfer - the data if needed. - - The index mapping is inherently quite involved, but here's a picture which - hopefully helps make sense out of all this. 
- - - Grid - |----num_vertices---| - xxx|....................................................|xxx - ^ ^ ^ ^ - d0 d1 s0 (src) s1 - - Subgrid - - xxx|.............|xxx - ^ ^ - d0 d1 - - ^ ^ - db da - * - */ -#include "astaroth.h" -#include "errchk.h" - -#include "device.cuh" -#include "math_utils.h" // sum for reductions -// #include "standalone/config_loader.h" // update_config - -#define AC_GEN_STR(X) #X -const char* intparam_names[] = {AC_FOR_BUILTIN_INT_PARAM_TYPES(AC_GEN_STR) // - AC_FOR_USER_INT_PARAM_TYPES(AC_GEN_STR)}; -const char* int3param_names[] = {AC_FOR_BUILTIN_INT3_PARAM_TYPES(AC_GEN_STR) // - AC_FOR_USER_INT3_PARAM_TYPES(AC_GEN_STR)}; -const char* realparam_names[] = {AC_FOR_BUILTIN_REAL_PARAM_TYPES(AC_GEN_STR) // - AC_FOR_USER_REAL_PARAM_TYPES(AC_GEN_STR)}; -const char* real3param_names[] = {AC_FOR_BUILTIN_REAL3_PARAM_TYPES(AC_GEN_STR) // - AC_FOR_USER_REAL3_PARAM_TYPES(AC_GEN_STR)}; -const char* vtxbuf_names[] = {AC_FOR_VTXBUF_HANDLES(AC_GEN_STR)}; -#undef AC_GEN_STR - -static const int MAX_NUM_DEVICES = 32; -static int num_devices = 0; -static Device devices[MAX_NUM_DEVICES] = {}; - -static Grid grid; // A grid consists of num_devices subgrids -static Grid subgrid; - -static int -gridIdx(const Grid grid, const int3 idx) -{ - return idx.x + idx.y * grid.m.x + idx.z * grid.m.x * grid.m.y; -} - -static int3 -gridIdx3d(const Grid grid, const int idx) -{ - return (int3){idx % grid.m.x, (idx % (grid.m.x * grid.m.y)) / grid.m.x, - idx / (grid.m.x * grid.m.y)}; -} - -static void -printInt3(const int3 vec) -{ - printf("(%d, %d, %d)", vec.x, vec.y, vec.z); -} - -static inline void -print(const AcMeshInfo config) -{ - for (int i = 0; i < NUM_INT_PARAMS; ++i) - printf("[%s]: %d\n", intparam_names[i], config.int_params[i]); - for (int i = 0; i < NUM_REAL_PARAMS; ++i) - printf("[%s]: %g\n", realparam_names[i], double(config.real_params[i])); -} - -static void -update_builtin_params(AcMeshInfo* config) -{ - config->int_params[AC_mx] = config->int_params[AC_nx] + 
STENCIL_ORDER; - ///////////// PAD TEST - // config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER + PAD_SIZE; - ///////////// PAD TEST - config->int_params[AC_my] = config->int_params[AC_ny] + STENCIL_ORDER; - config->int_params[AC_mz] = config->int_params[AC_nz] + STENCIL_ORDER; - - // Bounds for the computational domain, i.e. nx_min <= i < nx_max - config->int_params[AC_nx_min] = NGHOST; - config->int_params[AC_nx_max] = config->int_params[AC_nx_min] + config->int_params[AC_nx]; - config->int_params[AC_ny_min] = NGHOST; - config->int_params[AC_ny_max] = config->int_params[AC_ny] + NGHOST; - config->int_params[AC_nz_min] = NGHOST; - config->int_params[AC_nz_max] = config->int_params[AC_nz] + NGHOST; - - /* Additional helper params */ - // Int helpers - config->int_params[AC_mxy] = config->int_params[AC_mx] * config->int_params[AC_my]; - config->int_params[AC_nxy] = config->int_params[AC_nx] * config->int_params[AC_ny]; - config->int_params[AC_nxyz] = config->int_params[AC_nxy] * config->int_params[AC_nz]; - -#if VERBOSE_PRINTING // Defined in astaroth.h - printf("###############################################################\n"); - printf("Config dimensions recalculated:\n"); - print(*config); - printf("###############################################################\n"); -#endif -} - -static Grid -createGrid(const AcMeshInfo config) -{ - Grid grid; - - grid.m = (int3){config.int_params[AC_mx], config.int_params[AC_my], config.int_params[AC_mz]}; - grid.n = (int3){config.int_params[AC_nx], config.int_params[AC_ny], config.int_params[AC_nz]}; - - return grid; -} - -AcResult -acCheckDeviceAvailability(void) -{ - int device_count; // Separate from num_devices to avoid side effects - ERRCHK_CUDA_ALWAYS(cudaGetDeviceCount(&device_count)); - if (device_count > 0) - return AC_SUCCESS; - else - return AC_FAILURE; -} - -AcResult -acSynchronizeStream(const StreamType stream) -{ - // #pragma omp parallel for - for (int i = 0; i < num_devices; ++i) { - 
synchronize(devices[i], stream); - } - - return AC_SUCCESS; -} - -static AcResult -synchronize_halos(const StreamType stream) -{ - // Exchanges the halos of subgrids - // After this step, the data within the main grid ranging from - // (0, 0, NGHOST) -> grid.m.x, grid.m.y, NGHOST + grid.n.z - // has been synchronized and transferred to appropriate subgrids - - // We loop only to num_devices - 1 since the front and back plate of the grid is not - // transferred because their contents depend on the boundary conditions. - - // IMPORTANT NOTE: the boundary conditions must be applied before calling this function! - // I.e. the halos of subgrids must contain up-to-date data! - // #pragma omp parallel for - for (int i = 0; i < num_devices - 1; ++i) { - const int num_vertices = subgrid.m.x * subgrid.m.y * NGHOST; - // ...|ooooxxx|... -> xxx|ooooooo|... - { - const int3 src = (int3){0, 0, subgrid.n.z}; - const int3 dst = (int3){0, 0, 0}; - copyMeshDeviceToDevice(devices[i], stream, src, devices[(i + 1) % num_devices], dst, - num_vertices); - } - // ...|ooooooo|xxx <- ...|xxxoooo|... - { - const int3 src = (int3){0, 0, NGHOST}; - const int3 dst = (int3){0, 0, NGHOST + subgrid.n.z}; - copyMeshDeviceToDevice(devices[(i + 1) % num_devices], stream, src, devices[i], dst, - num_vertices); - } - } - return AC_SUCCESS; -} - -AcResult -acSynchronizeMesh(void) -{ - acSynchronizeStream(STREAM_ALL); - synchronize_halos(STREAM_DEFAULT); - acSynchronizeStream(STREAM_ALL); - - return AC_SUCCESS; -} - -AcResult -acInit(const AcMeshInfo config) -{ - // Get num_devices - ERRCHK_CUDA_ALWAYS(cudaGetDeviceCount(&num_devices)); - if (num_devices < 1) { - ERROR("No CUDA devices found!"); - return AC_FAILURE; - } - if (num_devices > MAX_NUM_DEVICES) { - WARNING("More devices found than MAX_NUM_DEVICES. Using only MAX_NUM_DEVICES"); - num_devices = MAX_NUM_DEVICES; - } - if (!AC_MULTIGPU_ENABLED) { - WARNING("MULTIGPU_ENABLED was false. 
Using only one device"); - num_devices = 1; // Use only one device if multi-GPU is not enabled - } - // Check that num_devices is divisible with AC_nz. This makes decomposing the - // problem domain to multiple GPUs much easier since we do not have to worry - // about remainders - ERRCHK_ALWAYS(config.int_params[AC_nz] % num_devices == 0); - - // Decompose the problem domain - // The main grid - grid = createGrid(config); - - // Subgrids - AcMeshInfo subgrid_config = config; - subgrid_config.int_params[AC_nz] /= num_devices; - update_builtin_params(&subgrid_config); - subgrid = createGrid(subgrid_config); - - // Periodic boundary conditions become weird if the system can "fold unto itself". - ERRCHK_ALWAYS(subgrid.n.x >= STENCIL_ORDER); - ERRCHK_ALWAYS(subgrid.n.y >= STENCIL_ORDER); - ERRCHK_ALWAYS(subgrid.n.z >= STENCIL_ORDER); - -#if VERBOSE_PRINTING - // clang-format off - printf("Grid m "); printInt3(grid.m); printf("\n"); - printf("Grid n "); printInt3(grid.n); printf("\n"); - printf("Subrid m "); printInt3(subgrid.m); printf("\n"); - printf("Subrid n "); printInt3(subgrid.n); printf("\n"); - // clang-format on -#endif - - // Initialize the devices - for (int i = 0; i < num_devices; ++i) { - createDevice(i, subgrid_config, &devices[i]); - loadGlobalGrid(devices[i], grid); - printDeviceInfo(devices[i]); - } - - // Enable peer access - for (int i = 0; i < num_devices; ++i) { - const int front = (i + 1) % num_devices; - const int back = (i - 1 + num_devices) % num_devices; - - int can_access_front, can_access_back; - cudaDeviceCanAccessPeer(&can_access_front, i, front); - cudaDeviceCanAccessPeer(&can_access_back, i, back); -#if VERBOSE_PRINTING - printf( - "Trying to enable peer access from %d to %d (can access: %d) and %d (can access: %d)\n", - i, front, can_access_front, back, can_access_back); -#endif - - cudaSetDevice(i); - if (can_access_front) { - ERRCHK_CUDA_ALWAYS(cudaDeviceEnablePeerAccess(front, 0)); - } - if (can_access_back) { - 
ERRCHK_CUDA_ALWAYS(cudaDeviceEnablePeerAccess(back, 0)); - } - } - - acSynchronizeStream(STREAM_ALL); - return AC_SUCCESS; -} - -AcResult -acQuit(void) -{ - acSynchronizeStream(STREAM_ALL); - - for (int i = 0; i < num_devices; ++i) { - destroyDevice(devices[i]); - } - return AC_SUCCESS; -} - -AcResult -acIntegrateStepWithOffsetAsync(const int isubstep, const AcReal dt, const int3 start, - const int3 end, const StreamType stream) -{ - // See the beginning of the file for an explanation of the index mapping - // #pragma omp parallel for - for (int i = 0; i < num_devices; ++i) { - // DECOMPOSITION OFFSET HERE - const int3 d0 = (int3){NGHOST, NGHOST, NGHOST + i * subgrid.n.z}; - const int3 d1 = d0 + (int3){subgrid.n.x, subgrid.n.y, subgrid.n.z}; - - const int3 da = max(start, d0); - const int3 db = min(end, d1); - - if (db.z >= da.z) { - const int3 da_local = da - (int3){0, 0, i * subgrid.n.z}; - const int3 db_local = db - (int3){0, 0, i * subgrid.n.z}; - rkStep(devices[i], stream, isubstep, da_local, db_local, dt); - } - } - return AC_SUCCESS; -} - -AcResult -acIntegrateStepWithOffset(const int isubstep, const AcReal dt, const int3 start, const int3 end) -{ - return acIntegrateStepWithOffsetAsync(isubstep, dt, start, end, STREAM_DEFAULT); -} - -AcResult -acIntegrateStepAsync(const int isubstep, const AcReal dt, const StreamType stream) -{ - const int3 start = (int3){NGHOST, NGHOST, NGHOST}; - const int3 end = start + grid.n; - return acIntegrateStepWithOffsetAsync(isubstep, dt, start, end, stream); -} - -AcResult -acIntegrateStep(const int isubstep, const AcReal dt) -{ - return acIntegrateStepAsync(isubstep, dt, STREAM_DEFAULT); -} - -static AcResult -local_boundcondstep(const StreamType stream) -{ - if (num_devices == 1) { - boundcondStep(devices[0], stream, (int3){0, 0, 0}, subgrid.m); - } - else { - // Local boundary conditions - // #pragma omp parallel for - for (int i = 0; i < num_devices; ++i) { - const int3 d0 = (int3){0, 0, NGHOST}; // DECOMPOSITION OFFSET 
HERE - const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.n.z}; - boundcondStep(devices[i], stream, d0, d1); - } - } - return AC_SUCCESS; -} - -static AcResult -global_boundcondstep(const StreamType stream) -{ - if (num_devices > 1) { - // With periodic boundary conditions we exchange the front and back plates of the - // grid. The exchange is done between the first and last device (0 and num_devices - 1). - const int num_vertices = subgrid.m.x * subgrid.m.y * NGHOST; - // ...|ooooxxx|... -> xxx|ooooooo|... - { - const int3 src = (int3){0, 0, subgrid.n.z}; - const int3 dst = (int3){0, 0, 0}; - copyMeshDeviceToDevice(devices[num_devices - 1], stream, src, devices[0], dst, - num_vertices); - } - // ...|ooooooo|xxx <- ...|xxxoooo|... - { - const int3 src = (int3){0, 0, NGHOST}; - const int3 dst = (int3){0, 0, NGHOST + subgrid.n.z}; - copyMeshDeviceToDevice(devices[0], stream, src, devices[num_devices - 1], dst, - num_vertices); - } - } - return AC_SUCCESS; -} - -AcResult -acBoundcondStepAsync(const StreamType stream) -{ - ERRCHK_ALWAYS(stream < NUM_STREAM_TYPES); - - local_boundcondstep(stream); - acSynchronizeStream(stream); - global_boundcondstep(stream); - synchronize_halos(stream); - acSynchronizeStream(stream); - return AC_SUCCESS; -} - -AcResult -acBoundcondStep(void) -{ - return acBoundcondStepAsync(STREAM_DEFAULT); -} - -static AcResult -swap_buffers(void) -{ - // #pragma omp parallel for - for (int i = 0; i < num_devices; ++i) { - swapBuffers(devices[i]); - } - return AC_SUCCESS; -} - -AcResult -acIntegrate(const AcReal dt) -{ - acSynchronizeStream(STREAM_ALL); - for (int isubstep = 0; isubstep < 3; ++isubstep) { - acIntegrateStep(isubstep, dt); // Note: boundaries must be initialized. 
- swap_buffers(); - acBoundcondStep(); - } - return AC_SUCCESS; -} - -static AcReal -simple_final_reduce_scal(const ReductionType rtype, const AcReal* results, const int n) -{ - AcReal res = results[0]; - for (int i = 1; i < n; ++i) { - if (rtype == RTYPE_MAX) { - res = max(res, results[i]); - } - else if (rtype == RTYPE_MIN) { - res = min(res, results[i]); - } - else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP || rtype == RTYPE_SUM) { - res = sum(res, results[i]); - } - else { - ERROR("Invalid rtype"); - } - } - - if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) { - const AcReal inv_n = AcReal(1.) / (grid.n.x * grid.n.y * grid.n.z); - res = sqrt(inv_n * res); - } - return res; -} - -AcReal -acReduceScal(const ReductionType rtype, const VertexBufferHandle vtxbuffer_handle) -{ - acSynchronizeStream(STREAM_ALL); - - AcReal results[num_devices]; - // #pragma omp parallel for - for (int i = 0; i < num_devices; ++i) { - reduceScal(devices[i], STREAM_DEFAULT, rtype, vtxbuffer_handle, &results[i]); - } - - return simple_final_reduce_scal(rtype, results, num_devices); -} - -AcReal -acReduceVec(const ReductionType rtype, const VertexBufferHandle a, const VertexBufferHandle b, - const VertexBufferHandle c) -{ - acSynchronizeStream(STREAM_ALL); - - AcReal results[num_devices]; - // #pragma omp parallel for - for (int i = 0; i < num_devices; ++i) { - reduceVec(devices[i], STREAM_DEFAULT, rtype, a, b, c, &results[i]); - } - - return simple_final_reduce_scal(rtype, results, num_devices); -} - -AcResult -acLoadWithOffsetAsync(const AcMesh host_mesh, const int3 src, const int num_vertices, - const StreamType stream) -{ - // See the beginning of the file for an explanation of the index mapping - // #pragma omp parallel for - for (int i = 0; i < num_devices; ++i) { - const int3 d0 = (int3){0, 0, i * subgrid.n.z}; // DECOMPOSITION OFFSET HERE - const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.m.z}; - - const int3 s0 = src; - const int3 s1 = gridIdx3d(grid, 
gridIdx(grid, s0) + num_vertices); - - const int3 da = max(s0, d0); - const int3 db = min(s1, d1); - /* - printf("Device %d\n", i); - printf("\ts0: "); printInt3(s0); printf("\n"); - printf("\td0: "); printInt3(d0); printf("\n"); - printf("\tda: "); printInt3(da); printf("\n"); - printf("\tdb: "); printInt3(db); printf("\n"); - printf("\td1: "); printInt3(d1); printf("\n"); - printf("\ts1: "); printInt3(s1); printf("\n"); - printf("\t-> %s to device %d\n", db.z >= da.z ? "Copy" : "Do not copy", i); - */ - if (db.z >= da.z) { - const int copy_cells = gridIdx(subgrid, db) - gridIdx(subgrid, da); - // DECOMPOSITION OFFSET HERE - const int3 da_local = (int3){da.x, da.y, da.z - i * grid.n.z / num_devices}; - // printf("\t\tcopy %d cells to local index ", copy_cells); printInt3(da_local); - // printf("\n"); - copyMeshToDevice(devices[i], stream, host_mesh, da, da_local, copy_cells); - } - // printf("\n"); - } - return AC_SUCCESS; -} - -AcResult -acLoadWithOffset(const AcMesh host_mesh, const int3 src, const int num_vertices) -{ - return acLoadWithOffsetAsync(host_mesh, src, num_vertices, STREAM_DEFAULT); -} - -AcResult -acLoad(const AcMesh host_mesh) -{ - acLoadWithOffset(host_mesh, (int3){0, 0, 0}, acVertexBufferSize(host_mesh.info)); - acSynchronizeStream(STREAM_ALL); - return AC_SUCCESS; -} - -AcResult -acStoreWithOffsetAsync(const int3 src, const int num_vertices, AcMesh* host_mesh, - const StreamType stream) -{ - // See the beginning of the file for an explanation of the index mapping - // #pragma omp parallel for - for (int i = 0; i < num_devices; ++i) { - const int3 d0 = (int3){0, 0, i * subgrid.n.z}; // DECOMPOSITION OFFSET HERE - const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.m.z}; - - const int3 s0 = src; - const int3 s1 = gridIdx3d(grid, gridIdx(grid, s0) + num_vertices); - - const int3 da = max(s0, d0); - const int3 db = min(s1, d1); - if (db.z >= da.z) { - const int copy_cells = gridIdx(subgrid, db) - gridIdx(subgrid, da); - // DECOMPOSITION 
OFFSET HERE - const int3 da_local = (int3){da.x, da.y, da.z - i * grid.n.z / num_devices}; - copyMeshToHost(devices[i], stream, da_local, da, copy_cells, host_mesh); - } - } - return AC_SUCCESS; -} - -AcResult -acStoreWithOffset(const int3 src, const int num_vertices, AcMesh* host_mesh) -{ - return acStoreWithOffsetAsync(src, num_vertices, host_mesh, STREAM_DEFAULT); -} - -AcResult -acStore(AcMesh* host_mesh) -{ - acStoreWithOffset((int3){0, 0, 0}, acVertexBufferSize(host_mesh->info), host_mesh); - acSynchronizeStream(STREAM_ALL); - return AC_SUCCESS; -} - -AcResult -acLoadDeviceConstantAsync(const AcRealParam param, const AcReal value, const StreamType stream) -{ - // #pragma omp parallel for - for (int i = 0; i < num_devices; ++i) { - loadDeviceConstant(devices[i], stream, param, value); - } - return AC_SUCCESS; -} - -AcResult -acLoadDeviceConstant(const AcRealParam param, const AcReal value) -{ - return acLoadDeviceConstantAsync(param, value, STREAM_DEFAULT); -} - -/* - * ============================================================================= - * Revised interface - * ============================================================================= - */ -======= ->>>>>>> Stashed changes +#include "astaroth_defines.h" diff --git a/src/core/device.cu b/src/core/device.cu index c046d18..5d7dd69 100644 --- a/src/core/device.cu +++ b/src/core/device.cu @@ -16,507 +16,7 @@ You should have received a copy of the GNU General Public License along with Astaroth. If not, see . */ - -/** - * @file - * \brief Brief info. - * - * Detailed info. 
- * - */ -#include "device.cuh" - -#include "errchk.h" - -// Device info -#define REGISTERS_PER_THREAD (255) -#define MAX_REGISTERS_PER_BLOCK (65536) -#define MAX_THREADS_PER_BLOCK (1024) -#define WARP_SIZE (32) - -typedef struct { - AcReal* in[NUM_VTXBUF_HANDLES]; - AcReal* out[NUM_VTXBUF_HANDLES]; -} VertexBufferArray; - -__constant__ AcMeshInfo d_mesh_info; -__constant__ int3 d_multigpu_offset; -__constant__ Grid globalGrid; -#define DCONST_INT(X) (d_mesh_info.int_params[X]) -#define DCONST_INT3(X) (d_mesh_info.int3_params[X]) -#define DCONST_REAL(X) (d_mesh_info.real_params[X]) -#define DCONST_REAL3(X) (d_mesh_info.real3_params[X]) -#define DEVICE_VTXBUF_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_mx) + (k)*DCONST_INT(AC_mxy)) -#define DEVICE_1D_COMPDOMAIN_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_nx) + (k)*DCONST_INT(AC_nxy)) -#include "kernels/kernels.cuh" - -static dim3 rk3_tpb = (dim3){32, 1, 4}; - -#if PACKED_DATA_TRANSFERS // Defined in device.cuh -// #include "kernels/pack_unpack.cuh" -#endif +#include "astaroth_device.h" struct device_s { - int id; - AcMeshInfo local_config; - - // Concurrency - cudaStream_t streams[NUM_STREAM_TYPES]; - - // Memory - VertexBufferArray vba; - AcReal* reduce_scratchpad; - AcReal* reduce_result; - -#if PACKED_DATA_TRANSFERS -// Declare memory for buffers needed for packed data transfers here -// AcReal* data_packing_buffer; -#endif }; - -AcResult -printDeviceInfo(const Device device) -{ - const int device_id = device->id; - - cudaDeviceProp props; - cudaGetDeviceProperties(&props, device_id); - printf("--------------------------------------------------\n"); - printf("Device Number: %d\n", device_id); - const size_t bus_id_max_len = 128; - char bus_id[bus_id_max_len]; - cudaDeviceGetPCIBusId(bus_id, bus_id_max_len, device_id); - printf(" PCI bus ID: %s\n", bus_id); - printf(" Device name: %s\n", props.name); - printf(" Compute capability: %d.%d\n", props.major, props.minor); - - // Compute - printf(" Compute\n"); - printf(" Clock 
rate (GHz): %g\n", props.clockRate / 1e6); // KHz -> GHz - printf(" Stream processors: %d\n", props.multiProcessorCount); - printf(" SP to DP flops performance ratio: %d:1\n", props.singleToDoublePrecisionPerfRatio); - printf( - " Compute mode: %d\n", - (int)props - .computeMode); // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g7eb25f5413a962faad0956d92bae10d0 - // Memory - printf(" Global memory\n"); - printf(" Memory Clock Rate (MHz): %d\n", props.memoryClockRate / (1000)); - printf(" Memory Bus Width (bits): %d\n", props.memoryBusWidth); - printf(" Peak Memory Bandwidth (GiB/s): %f\n", - 2 * (props.memoryClockRate * 1e3) * props.memoryBusWidth / (8. * 1024. * 1024. * 1024.)); - printf(" ECC enabled: %d\n", props.ECCEnabled); - - // Memory usage - size_t free_bytes, total_bytes; - cudaMemGetInfo(&free_bytes, &total_bytes); - const size_t used_bytes = total_bytes - free_bytes; - printf(" Total global mem: %.2f GiB\n", props.totalGlobalMem / (1024.0 * 1024 * 1024)); - printf(" Gmem used (GiB): %.2f\n", used_bytes / (1024.0 * 1024 * 1024)); - printf(" Gmem memory free (GiB): %.2f\n", free_bytes / (1024.0 * 1024 * 1024)); - printf(" Gmem memory total (GiB): %.2f\n", total_bytes / (1024.0 * 1024 * 1024)); - printf(" Caches\n"); - printf(" Local L1 cache supported: %d\n", props.localL1CacheSupported); - printf(" Global L1 cache supported: %d\n", props.globalL1CacheSupported); - printf(" L2 size: %d KiB\n", props.l2CacheSize / (1024)); - // MV: props.totalConstMem and props.sharedMemPerBlock cause assembler error - // MV: while compiling in TIARA gp cluster. Therefore commeted out. - //!! printf(" Total const mem: %ld KiB\n", props.totalConstMem / (1024)); - //!! printf(" Shared mem per block: %ld KiB\n", props.sharedMemPerBlock / (1024)); - printf(" Other\n"); - printf(" Warp size: %d\n", props.warpSize); - // printf(" Single to double perf. 
ratio: %dx\n", - // props.singleToDoublePrecisionPerfRatio); //Not supported with older CUDA - // versions - printf(" Stream priorities supported: %d\n", props.streamPrioritiesSupported); - printf("--------------------------------------------------\n"); - - return AC_SUCCESS; -} - -static __global__ void -dummy_kernel(void) -{ -} - -AcResult -createDevice(const int id, const AcMeshInfo device_config, Device* device_handle) -{ - cudaSetDevice(id); - cudaDeviceReset(); - - // Create Device - struct device_s* device = (struct device_s*)malloc(sizeof(*device)); - ERRCHK_ALWAYS(device); - - device->id = id; - device->local_config = device_config; - - // Check that the code was compiled for the proper GPU architecture - printf("Trying to run a dummy kernel. If this fails, make sure that your\n" - "device supports the CUDA architecture you are compiling for.\n" - "Running dummy kernel... "); - fflush(stdout); - dummy_kernel<<<1, 1>>>(); - ERRCHK_CUDA_KERNEL_ALWAYS(); - printf("Success!\n"); - - // Concurrency - for (int i = 0; i < NUM_STREAM_TYPES; ++i) { - cudaStreamCreateWithPriority(&device->streams[i], cudaStreamNonBlocking, 0); - } - - // Memory - const size_t vba_size_bytes = acVertexBufferSizeBytes(device_config); - for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { - ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.in[i], vba_size_bytes)); - ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.out[i], vba_size_bytes)); - } - ERRCHK_CUDA_ALWAYS( - cudaMalloc(&device->reduce_scratchpad, acVertexBufferCompdomainSizeBytes(device_config))); - ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->reduce_result, sizeof(AcReal))); - -#if PACKED_DATA_TRANSFERS -// Allocate data required for packed transfers here (cudaMalloc) -#endif - - // Device constants - ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_mesh_info, &device_config, sizeof(device_config), 0, - cudaMemcpyHostToDevice)); - - // Multi-GPU offset. This is used to compute globalVertexIdx. 
- // Might be better to calculate this in astaroth.cu instead of here, s.t. - // everything related to the decomposition is limited to the multi-GPU layer - const int3 multigpu_offset = (int3){0, 0, device->id * device->local_config.int_params[AC_nz]}; - ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_multigpu_offset, &multigpu_offset, - sizeof(multigpu_offset), 0, cudaMemcpyHostToDevice)); - - printf("Created device %d (%p)\n", device->id, device); - *device_handle = device; - - // Autoptimize - if (id == 0) - autoOptimize(device); - - return AC_SUCCESS; -} - -AcResult -destroyDevice(Device device) -{ - cudaSetDevice(device->id); - printf("Destroying device %d (%p)\n", device->id, device); - - // Memory - for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { - cudaFree(device->vba.in[i]); - cudaFree(device->vba.out[i]); - } - cudaFree(device->reduce_scratchpad); - cudaFree(device->reduce_result); - -#if PACKED_DATA_TRANSFERS -// Free data required for packed tranfers here (cudaFree) -#endif - - // Concurrency - for (int i = 0; i < NUM_STREAM_TYPES; ++i) { - cudaStreamDestroy(device->streams[i]); - } - - // Destroy Device - free(device); - return AC_SUCCESS; -} - -AcResult -boundcondStep(const Device device, const StreamType stream_type, const int3& start, const int3& end) -{ - cudaSetDevice(device->id); - for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { - periodic_boundconds(device->streams[stream_type], start, end, device->vba.in[i]); - } - return AC_SUCCESS; -} - -AcResult -reduceScal(const Device device, const StreamType stream_type, const ReductionType rtype, - const VertexBufferHandle vtxbuf_handle, AcReal* result) -{ - cudaSetDevice(device->id); - - const int3 start = (int3){device->local_config.int_params[AC_nx_min], - device->local_config.int_params[AC_ny_min], - device->local_config.int_params[AC_nz_min]}; - - const int3 end = (int3){device->local_config.int_params[AC_nx_max], - device->local_config.int_params[AC_ny_max], - device->local_config.int_params[AC_nz_max]}; - - 
*result = reduce_scal(device->streams[stream_type], rtype, start, end, - device->vba.in[vtxbuf_handle], device->reduce_scratchpad, - device->reduce_result); - return AC_SUCCESS; -} - -AcResult -reduceVec(const Device device, const StreamType stream_type, const ReductionType rtype, - const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1, - const VertexBufferHandle vtxbuf2, AcReal* result) -{ - cudaSetDevice(device->id); - - const int3 start = (int3){device->local_config.int_params[AC_nx_min], - device->local_config.int_params[AC_ny_min], - device->local_config.int_params[AC_nz_min]}; - - const int3 end = (int3){device->local_config.int_params[AC_nx_max], - device->local_config.int_params[AC_ny_max], - device->local_config.int_params[AC_nz_max]}; - - *result = reduce_vec(device->streams[stream_type], rtype, start, end, device->vba.in[vtxbuf0], - device->vba.in[vtxbuf1], device->vba.in[vtxbuf2], - device->reduce_scratchpad, device->reduce_result); - return AC_SUCCESS; -} - -AcResult -rkStep(const Device device, const StreamType stream_type, const int step_number, const int3& start, - const int3& end, const AcReal dt) -{ - cudaSetDevice(device->id); - - // const dim3 tpb(32, 1, 4); - const dim3 tpb = rk3_tpb; - - const int3 n = end - start; - const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), // - (unsigned int)ceil(n.y / AcReal(tpb.y)), // - (unsigned int)ceil(n.z / AcReal(tpb.z))); - - if (step_number == 0) - solve<0><<streams[stream_type]>>>(start, end, device->vba, dt); - else if (step_number == 1) - solve<1><<streams[stream_type]>>>(start, end, device->vba, dt); - else - solve<2><<streams[stream_type]>>>(start, end, device->vba, dt); - - ERRCHK_CUDA_KERNEL(); - - return AC_SUCCESS; -} - -AcResult -synchronize(const Device device, const StreamType stream_type) -{ - cudaSetDevice(device->id); - if (stream_type == STREAM_ALL) { - cudaDeviceSynchronize(); - } - else { - cudaStreamSynchronize(device->streams[stream_type]); - } - return AC_SUCCESS; -} 
- -static AcResult -loadWithOffset(const Device device, const StreamType stream_type, const AcReal* src, - const size_t bytes, AcReal* dst) -{ - cudaSetDevice(device->id); - ERRCHK_CUDA( - cudaMemcpyAsync(dst, src, bytes, cudaMemcpyHostToDevice, device->streams[stream_type])); - return AC_SUCCESS; -} - -static AcResult -storeWithOffset(const Device device, const StreamType stream_type, const AcReal* src, - const size_t bytes, AcReal* dst) -{ - cudaSetDevice(device->id); - ERRCHK_CUDA( - cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDeviceToHost, device->streams[stream_type])); - return AC_SUCCESS; -} - -AcResult -copyMeshToDevice(const Device device, const StreamType stream_type, const AcMesh& host_mesh, - const int3& src, const int3& dst, const int num_vertices) -{ - const size_t src_idx = acVertexBufferIdx(src.x, src.y, src.z, host_mesh.info); - const size_t dst_idx = acVertexBufferIdx(dst.x, dst.y, dst.z, device->local_config); - for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { - loadWithOffset(device, stream_type, &host_mesh.vertex_buffer[i][src_idx], - num_vertices * sizeof(AcReal), &device->vba.in[i][dst_idx]); - } - return AC_SUCCESS; -} - -AcResult -copyMeshToHost(const Device device, const StreamType stream_type, const int3& src, const int3& dst, - const int num_vertices, AcMesh* host_mesh) -{ - const size_t src_idx = acVertexBufferIdx(src.x, src.y, src.z, device->local_config); - const size_t dst_idx = acVertexBufferIdx(dst.x, dst.y, dst.z, host_mesh->info); - for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { - storeWithOffset(device, stream_type, &device->vba.in[i][src_idx], - num_vertices * sizeof(AcReal), &host_mesh->vertex_buffer[i][dst_idx]); - } - return AC_SUCCESS; -} - -AcResult -copyMeshDeviceToDevice(const Device src_device, const StreamType stream_type, const int3& src, - Device dst_device, const int3& dst, const int num_vertices) -{ - cudaSetDevice(src_device->id); - const size_t src_idx = acVertexBufferIdx(src.x, src.y, src.z, 
src_device->local_config); - const size_t dst_idx = acVertexBufferIdx(dst.x, dst.y, dst.z, dst_device->local_config); - - for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { - ERRCHK_CUDA(cudaMemcpyPeerAsync(&dst_device->vba.in[i][dst_idx], dst_device->id, - &src_device->vba.in[i][src_idx], src_device->id, - sizeof(src_device->vba.in[i][0]) * num_vertices, - src_device->streams[stream_type])); - } - return AC_SUCCESS; -} - -AcResult -swapBuffers(const Device device) -{ - cudaSetDevice(device->id); - for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { - AcReal* tmp = device->vba.in[i]; - device->vba.in[i] = device->vba.out[i]; - device->vba.out[i] = tmp; - } - return AC_SUCCESS; -} - -AcResult -loadDeviceConstant(const Device device, const StreamType stream_type, const AcIntParam param, - const int value) -{ - cudaSetDevice(device->id); - // CUDA 10 apparently creates only a single name for a device constant (d_mesh_info here) - // and something like d_mesh_info.real_params[] cannot be directly accessed. - // Therefore we have to obfuscate the code a bit and compute the offset address before - // invoking cudaMemcpyToSymbol. 
- const size_t offset = (size_t)&d_mesh_info.int_params[param] - (size_t)&d_mesh_info; - ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset, - cudaMemcpyHostToDevice, - device->streams[stream_type])); - return AC_SUCCESS; -} - -AcResult -loadDeviceConstant(const Device device, const StreamType stream_type, const AcRealParam param, - const AcReal value) -{ - cudaSetDevice(device->id); - const size_t offset = (size_t)&d_mesh_info.real_params[param] - (size_t)&d_mesh_info; - ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset, - cudaMemcpyHostToDevice, - device->streams[stream_type])); - return AC_SUCCESS; -} - -AcResult -loadGlobalGrid(const Device device, const Grid grid) -{ - cudaSetDevice(device->id); - ERRCHK_CUDA_ALWAYS( - cudaMemcpyToSymbol(globalGrid, &grid, sizeof(grid), 0, cudaMemcpyHostToDevice)); - return AC_SUCCESS; -} - -AcResult -autoOptimize(const Device device) -{ - cudaSetDevice(device->id); - - // RK3 - const int3 start = (int3){NGHOST, NGHOST, NGHOST}; - const int3 end = start + (int3){device->local_config.int_params[AC_nx], // - device->local_config.int_params[AC_ny], // - device->local_config.int_params[AC_nz]}; - - dim3 best_dims(0, 0, 0); - float best_time = INFINITY; - const int num_iterations = 10; - - for (int z = 1; z <= MAX_THREADS_PER_BLOCK; ++z) { - for (int y = 1; y <= MAX_THREADS_PER_BLOCK; ++y) { - for (int x = WARP_SIZE; x <= MAX_THREADS_PER_BLOCK; x += WARP_SIZE) { - - if (x > end.x - start.x || y > end.y - start.y || z > end.z - start.z) - break; - if (x * y * z > MAX_THREADS_PER_BLOCK) - break; - - if (x * y * z * REGISTERS_PER_THREAD > MAX_REGISTERS_PER_BLOCK) - break; - - if (((x * y * z) % WARP_SIZE) != 0) - continue; - - const dim3 tpb(x, y, z); - const int3 n = end - start; - const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), // - (unsigned int)ceil(n.y / AcReal(tpb.y)), // - (unsigned int)ceil(n.z / AcReal(tpb.z))); - - cudaDeviceSynchronize(); - if 
(cudaGetLastError() != cudaSuccess) // resets the error if any - continue; - - // printf("(%d, %d, %d)\n", x, y, z); - - cudaEvent_t tstart, tstop; - cudaEventCreate(&tstart); - cudaEventCreate(&tstop); - - cudaEventRecord(tstart); // ---------------------------------------- Timing start - - for (int i = 0; i < num_iterations; ++i) - solve<2><<>>(start, end, device->vba, FLT_EPSILON); - - cudaEventRecord(tstop); // ----------------------------------------- Timing end - cudaEventSynchronize(tstop); - float milliseconds = 0; - cudaEventElapsedTime(&milliseconds, tstart, tstop); - - ERRCHK_CUDA_KERNEL_ALWAYS(); - if (milliseconds < best_time) { - best_time = milliseconds; - best_dims = tpb; - } - } - } - } -#if VERBOSE_PRINTING - printf( - "Auto-optimization done. The best threadblock dimensions for rkStep: (%d, %d, %d) %f ms\n", - best_dims.x, best_dims.y, best_dims.z, double(best_time) / num_iterations); -#endif - /* - FILE* fp = fopen("../config/rk3_tbdims.cuh", "w"); - ERRCHK(fp); - fprintf(fp, "%d, %d, %d\n", best_dims.x, best_dims.y, best_dims.z); - fclose(fp); - */ - - rk3_tpb = best_dims; - return AC_SUCCESS; -} - -#if PACKED_DATA_TRANSFERS -// Functions for calling packed data transfers -#endif - -/* - * ============================================================================= - * Revised interface - * ============================================================================= - */ diff --git a/src/core/grid.cu b/src/core/grid.cu index e69de29..83c7453 100644 --- a/src/core/grid.cu +++ b/src/core/grid.cu @@ -0,0 +1,19 @@ +/* + Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae. + + This file is part of Astaroth. + + Astaroth is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + Astaroth is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Astaroth. If not, see . +*/ +#include "astaroth_grid.h" diff --git a/src/core/node.cu b/src/core/node.cu index c3bfc05..1c3d5cf 100644 --- a/src/core/node.cu +++ b/src/core/node.cu @@ -16,7 +16,7 @@ You should have received a copy of the GNU General Public License along with Astaroth. If not, see . */ -// #include "astaroth_node.h" +#include "astaroth_node.h" struct node_s { }; From 49026bd26b92c891fb47d50ff15ba1b1887b9f94 Mon Sep 17 00:00:00 2001 From: jpekkila Date: Wed, 31 Jul 2019 18:46:41 +0300 Subject: [PATCH 03/37] Revised device interface done --- include/astaroth_defines.h | 1 + include/astaroth_device.h | 16 +- src/core/device.cu | 597 ++++++++++++++++++++++++++++++++++- src/core/kernels/kernels.cuh | 12 - 4 files changed, 608 insertions(+), 18 deletions(-) diff --git a/include/astaroth_defines.h b/include/astaroth_defines.h index 7aa9751..1441967 100644 --- a/include/astaroth_defines.h +++ b/include/astaroth_defines.h @@ -105,6 +105,7 @@ typedef enum { } ReductionType; typedef enum { STREAM_DEFAULT, NUM_STREAM_TYPES } Stream; +#define STREAM_ALL (-1) #define AC_GEN_ID(X) X typedef enum { diff --git a/include/astaroth_device.h b/include/astaroth_device.h index 48c8db2..929b591 100644 --- a/include/astaroth_device.h +++ b/include/astaroth_device.h @@ -88,7 +88,7 @@ AcResult acDeviceStoreMesh(const Device device, const Stream stream, AcMesh* hos AcResult acDeviceTransferVertexBufferWithOffset(const Device src_device, const Stream stream, const VertexBufferHandle vtxbuf_handle, const int3 src, const int3 dst, - const int num_vertices, Device* dst_device); + const int num_vertices, Device dst_device); /** */ AcResult 
acDeviceTransferMeshWithOffset(const Device src_device, const Stream stream, @@ -106,15 +106,21 @@ AcResult acDeviceTransferMesh(const Device src_device, const Stream stream, Devi AcResult acDeviceIntegrateSubstep(const Device device, const Stream stream, const int step_number, const int3 start, const int3 end, const AcReal dt); /** */ -AcResult acDevicePeriodicBoundcondStep(const Device device, const Stream stream, const int3 start, +AcResult acDevicePeriodicBoundcondStep(const Device device, const Stream stream, + const VertexBufferHandle vtxbuf_handle, const int3 start, const int3 end); + +/** */ +AcResult acDevicePeriodicBoundconds(const Device device, const Stream stream, const int3 start, + const int3 end); + /** */ AcResult acDeviceReduceScal(const Device device, const Stream stream, const ReductionType rtype, const VertexBufferHandle vtxbuf_handle, AcReal* result); /** */ -AcResult acDeviceReduceVec(const Device device, const Stream stream, const ReductionType rtype, - const VertexBufferHandle vec0, const VertexBufferHandle vec1, - const VertexBufferHandle vec2, AcReal* result); +AcResult acDeviceReduceVec(const Device device, const Stream stream_type, const ReductionType rtype, + const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1, + const VertexBufferHandle vtxbuf2, AcReal* result); #ifdef __cplusplus } // extern "C" diff --git a/src/core/device.cu b/src/core/device.cu index 5d7dd69..dbee2cc 100644 --- a/src/core/device.cu +++ b/src/core/device.cu @@ -16,7 +16,602 @@ You should have received a copy of the GNU General Public License along with Astaroth. If not, see . */ -#include "astaroth_device.h" + +/** + * @file + * \brief Brief info. + * + * Detailed info. 
+ * + */ +#include "astaroth_device.cuh" + +#include "errchk.h" + +// Device info +#define REGISTERS_PER_THREAD (255) +#define MAX_REGISTERS_PER_BLOCK (65536) +#define MAX_THREADS_PER_BLOCK (1024) +#define WARP_SIZE (32) + +typedef struct { + AcReal* in[NUM_VTXBUF_HANDLES]; + AcReal* out[NUM_VTXBUF_HANDLES]; +} VertexBufferArray; + +__constant__ AcMeshInfo d_mesh_info; +__constant__ int3 d_multigpu_offset; +__constant__ Grid globalGrid; +#define DCONST_INT(X) (d_mesh_info.int_params[X]) +#define DCONST_INT3(X) (d_mesh_info.int3_params[X]) +#define DCONST_REAL(X) (d_mesh_info.real_params[X]) +#define DCONST_REAL3(X) (d_mesh_info.real3_params[X]) +#define DEVICE_VTXBUF_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_mx) + (k)*DCONST_INT(AC_mxy)) +#define DEVICE_1D_COMPDOMAIN_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_nx) + (k)*DCONST_INT(AC_nxy)) +#include "kernels/kernels.cuh" + +static dim3 rk3_tpb = (dim3){32, 1, 4}; + +#if PACKED_DATA_TRANSFERS // Defined in device.cuh +// #include "kernels/pack_unpack.cuh" +#endif struct device_s { + int id; + AcMeshInfo local_config; + + // Concurrency + cudaStream_t streams[NUM_STREAM_TYPES]; + + // Memory + VertexBufferArray vba; + AcReal* reduce_scratchpad; + AcReal* reduce_result; + +#if PACKED_DATA_TRANSFERS +// Declare memory for buffers needed for packed data transfers here +// AcReal* data_packing_buffer; +#endif }; + +static __global__ void +dummy_kernel(void) +{ +} + +AcResult +acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device_handle) +{ + cudaSetDevice(id); + cudaDeviceReset(); + + // Create Device + struct device_s* device = (struct device_s*)malloc(sizeof(*device)); + ERRCHK_ALWAYS(device); + + device->id = id; + device->local_config = device_config; + + // Check that the code was compiled for the proper GPU architecture + printf("Trying to run a dummy kernel. If this fails, make sure that your\n" + "device supports the CUDA architecture you are compiling for.\n" + "Running dummy kernel... 
"); + fflush(stdout); + dummy_kernel<<<1, 1>>>(); + ERRCHK_CUDA_KERNEL_ALWAYS(); + printf("Success!\n"); + + // Concurrency + for (int i = 0; i < NUM_STREAM_TYPES; ++i) { + cudaStreamCreateWithPriority(&device->streams[i], cudaStreamNonBlocking, 0); + } + + // Memory + const size_t vba_size_bytes = acVertexBufferSizeBytes(device_config); + for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { + ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.in[i], vba_size_bytes)); + ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.out[i], vba_size_bytes)); + } + ERRCHK_CUDA_ALWAYS( + cudaMalloc(&device->reduce_scratchpad, acVertexBufferCompdomainSizeBytes(device_config))); + ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->reduce_result, sizeof(AcReal))); + +#if PACKED_DATA_TRANSFERS +// Allocate data required for packed transfers here (cudaMalloc) +#endif + + // Device constants + ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_mesh_info, &device_config, sizeof(device_config), 0, + cudaMemcpyHostToDevice)); + + // Multi-GPU offset. This is used to compute globalVertexIdx. + // Might be better to calculate this in astaroth.cu instead of here, s.t. 
+ // everything related to the decomposition is limited to the multi-GPU layer + const int3 multigpu_offset = (int3){0, 0, device->id * device->local_config.int_params[AC_nz]}; + ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_multigpu_offset, &multigpu_offset, + sizeof(multigpu_offset), 0, cudaMemcpyHostToDevice)); + + printf("Created device %d (%p)\n", device->id, device); + *device_handle = device; + + // Autoptimize + if (id == 0) { + autoOptimize(device); + } + + return AC_SUCCESS; +} + +AcResult +acDeviceDestroy(Device device) +{ + cudaSetDevice(device->id); + printf("Destroying device %d (%p)\n", device->id, device); + + // Memory + for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { + cudaFree(device->vba.in[i]); + cudaFree(device->vba.out[i]); + } + cudaFree(device->reduce_scratchpad); + cudaFree(device->reduce_result); + +#if PACKED_DATA_TRANSFERS +// Free data required for packed tranfers here (cudaFree) +#endif + + // Concurrency + for (int i = 0; i < NUM_STREAM_TYPES; ++i) { + cudaStreamDestroy(device->streams[i]); + } + + // Destroy Device + free(device); + return AC_SUCCESS; +} + +AcResult +acDevicePrintInfo(const Device device) +{ + const int device_id = device->id; + + cudaDeviceProp props; + cudaGetDeviceProperties(&props, device_id); + printf("--------------------------------------------------\n"); + printf("Device Number: %d\n", device_id); + const size_t bus_id_max_len = 128; + char bus_id[bus_id_max_len]; + cudaDeviceGetPCIBusId(bus_id, bus_id_max_len, device_id); + printf(" PCI bus ID: %s\n", bus_id); + printf(" Device name: %s\n", props.name); + printf(" Compute capability: %d.%d\n", props.major, props.minor); + + // Compute + printf(" Compute\n"); + printf(" Clock rate (GHz): %g\n", props.clockRate / 1e6); // KHz -> GHz + printf(" Stream processors: %d\n", props.multiProcessorCount); + printf(" SP to DP flops performance ratio: %d:1\n", props.singleToDoublePrecisionPerfRatio); + printf( + " Compute mode: %d\n", + (int)props + .computeMode); // 
https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g7eb25f5413a962faad0956d92bae10d0 + // Memory + printf(" Global memory\n"); + printf(" Memory Clock Rate (MHz): %d\n", props.memoryClockRate / (1000)); + printf(" Memory Bus Width (bits): %d\n", props.memoryBusWidth); + printf(" Peak Memory Bandwidth (GiB/s): %f\n", + 2 * (props.memoryClockRate * 1e3) * props.memoryBusWidth / (8. * 1024. * 1024. * 1024.)); + printf(" ECC enabled: %d\n", props.ECCEnabled); + + // Memory usage + size_t free_bytes, total_bytes; + cudaMemGetInfo(&free_bytes, &total_bytes); + const size_t used_bytes = total_bytes - free_bytes; + printf(" Total global mem: %.2f GiB\n", props.totalGlobalMem / (1024.0 * 1024 * 1024)); + printf(" Gmem used (GiB): %.2f\n", used_bytes / (1024.0 * 1024 * 1024)); + printf(" Gmem memory free (GiB): %.2f\n", free_bytes / (1024.0 * 1024 * 1024)); + printf(" Gmem memory total (GiB): %.2f\n", total_bytes / (1024.0 * 1024 * 1024)); + printf(" Caches\n"); + printf(" Local L1 cache supported: %d\n", props.localL1CacheSupported); + printf(" Global L1 cache supported: %d\n", props.globalL1CacheSupported); + printf(" L2 size: %d KiB\n", props.l2CacheSize / (1024)); + // MV: props.totalConstMem and props.sharedMemPerBlock cause assembler error + // MV: while compiling in TIARA gp cluster. Therefore commeted out. + //!! printf(" Total const mem: %ld KiB\n", props.totalConstMem / (1024)); + //!! printf(" Shared mem per block: %ld KiB\n", props.sharedMemPerBlock / (1024)); + printf(" Other\n"); + printf(" Warp size: %d\n", props.warpSize); + // printf(" Single to double perf. 
ratio: %dx\n", + // props.singleToDoublePrecisionPerfRatio); //Not supported with older CUDA + // versions + printf(" Stream priorities supported: %d\n", props.streamPrioritiesSupported); + printf("--------------------------------------------------\n"); + + return AC_SUCCESS; +} + +AcResult +autoOptimize(const Device device) +{ + cudaSetDevice(device->id); + + // RK3 + const int3 start = (int3){NGHOST, NGHOST, NGHOST}; + const int3 end = start + (int3){device->local_config.int_params[AC_nx], // + device->local_config.int_params[AC_ny], // + device->local_config.int_params[AC_nz]}; + + dim3 best_dims(0, 0, 0); + float best_time = INFINITY; + const int num_iterations = 10; + + for (int z = 1; z <= MAX_THREADS_PER_BLOCK; ++z) { + for (int y = 1; y <= MAX_THREADS_PER_BLOCK; ++y) { + for (int x = WARP_SIZE; x <= MAX_THREADS_PER_BLOCK; x += WARP_SIZE) { + + if (x > end.x - start.x || y > end.y - start.y || z > end.z - start.z) + break; + if (x * y * z > MAX_THREADS_PER_BLOCK) + break; + + if (x * y * z * REGISTERS_PER_THREAD > MAX_REGISTERS_PER_BLOCK) + break; + + if (((x * y * z) % WARP_SIZE) != 0) + continue; + + const dim3 tpb(x, y, z); + const int3 n = end - start; + const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), // + (unsigned int)ceil(n.y / AcReal(tpb.y)), // + (unsigned int)ceil(n.z / AcReal(tpb.z))); + + cudaDeviceSynchronize(); + if (cudaGetLastError() != cudaSuccess) // resets the error if any + continue; + + // printf("(%d, %d, %d)\n", x, y, z); + + cudaEvent_t tstart, tstop; + cudaEventCreate(&tstart); + cudaEventCreate(&tstop); + + cudaEventRecord(tstart); // ---------------------------------------- Timing start + + for (int i = 0; i < num_iterations; ++i) + solve<2><<>>(start, end, device->vba, FLT_EPSILON); + + cudaEventRecord(tstop); // ----------------------------------------- Timing end + cudaEventSynchronize(tstop); + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, tstart, tstop); + + ERRCHK_CUDA_KERNEL_ALWAYS(); + if 
(milliseconds < best_time) { + best_time = milliseconds; + best_dims = tpb; + } + } + } + } +#if VERBOSE_PRINTING + printf( + "Auto-optimization done. The best threadblock dimensions for rkStep: (%d, %d, %d) %f ms\n", + best_dims.x, best_dims.y, best_dims.z, double(best_time) / num_iterations); +#endif + /* + FILE* fp = fopen("../config/rk3_tbdims.cuh", "w"); + ERRCHK(fp); + fprintf(fp, "%d, %d, %d\n", best_dims.x, best_dims.y, best_dims.z); + fclose(fp); + */ + + rk3_tpb = best_dims; + return AC_SUCCESS; +} + +AcResult +acDeviceSynchronizeStream(const Device device, const Stream stream) +{ + cudaSetDevice(device->id); + if (stream == STREAM_ALL) { + cudaDeviceSynchronize(); + } + else { + cudaStreamSynchronize(device->streams[stream]); + } + return AC_SUCCESS; +} + +AcResult +acDeviceSwapBuffers(const Device device) +{ + cudaSetDevice(device->id); + for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { + AcReal* tmp = device->vba.in[i]; + device->vba.in[i] = device->vba.out[i]; + device->vba.out[i] = tmp; + } + return AC_SUCCESS; +} + +AcResult +acDeviceLoadConstant(const Device device, const Stream stream, const AcRealParam param, + const AcReal value) +{ + cudaSetDevice(device->id); + const size_t offset = (size_t)&d_mesh_info.real_params[param] - (size_t)&d_mesh_info; + ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset, + cudaMemcpyHostToDevice, device->streams[stream])); + return AC_SUCCESS; +} + +static AcResult +load_with_offset(const Device device, const Stream stream, const AcReal* src, const size_t bytes, + AcReal* dst) +{ + cudaSetDevice(device->id); + ERRCHK_CUDA( // + cudaMemcpyAsync(dst, src, bytes, cudaMemcpyHostToDevice, device->streams[stream]) // + ); + return AC_SUCCESS; +} + +AcResult +acDeviceLoadVertexBufferWithOffset(const Device device, const Stream stream, const AcMeshhost_mesh, + const VertexBufferHandle vtxbuf_handle, const int3src, + const int3dst, const int num_vertices) +{ + cudaSetDevice(device->id); + const 
size_t src_idx = acVertexBufferIdx(src.x, src.y, src.z, host_mesh.info); + const size_t dst_idx = acVertexBufferIdx(dst.x, dst.y, dst.z, device->local_config); + load_with_offset(device, stream, &host_mesh.vertex_buffer[vtxbuf_handle][src_idx], + num_vertices * sizeof(AcReal), &device->vba.in[vtxbuf_handle][dst_idx]); + + return AC_SUCCESS; +} + +AcResult +acDeviceLoadMeshWithOffset(const Device device, const Stream stream, const AcMeshhost_mesh, + const int3src, const int3dst, const int num_vertices) +{ + for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { + acDeviceLoadVertexBufferWithOffset(device, stream, host_mesh, (VertexBufferHandle)i, src, + dst, num_vertices); + } + return AC_SUCCESS; +} + +AcResult +acDeviceLoadVertexBuffer(const Device device, const Stream stream, const AcMesh host_mesh, + const VertexBufferHandle vtxbuf_handle) +{ + int3 src = (int3){0, 0, 0}; + int3 dst = src; + const size_t num_vertices = acVertexBufferSize(device->local_config); + acLoadVertexBufferWithOffset(device, stream, host_mesh, vtxbuf_handle, src, dst, num_vertices); + + return AC_SUCCESS; +} + +AcResult +acDeviceLoadMesh(const Device device, const Stream stream, const AcMesh host_mesh) +{ + for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { + acDeviceLoadVertexBuffer(device, stream, host_mesh, (VertexBufferHandle)i); + } + + return AC_SUCCESS; +} + +static AcResult +store_with_offset(const Device device, const Stream stream, const AcReal* src, const size_t bytes, + AcReal* dst) +{ + cudaSetDevice(device->id); + ERRCHK_CUDA( // + cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDeviceToHost, device->streams[stream]) // + ); + return AC_SUCCESS; +} + +AcResult +acDeviceStoreVertexBufferWithOffset(const Device device, const Stream stream, + const VertexBufferHandle vtxbuf_handle, const int3 src, + const int3 dst, const int num_vertices, AcMesh* host_mesh) +{ + cudaSetDevice(device->id); + const size_t src_idx = acVertexBufferIdx(src.x, src.y, src.z, device->local_config); + const size_t 
dst_idx = acVertexBufferIdx(dst.x, dst.y, dst.z, host_mesh->info); + store_with_offset(device, stream, &device->vba.in[vtxbuf_handle][src_idx], + num_vertices * sizeof(AcReal), + &host_mesh->vertex_buffer[vtxbuf_handle][dst_idx]); + + return AC_SUCCESS; +} + +AcResult +acDeviceStoreMeshWithOffset(const Device device, const Stream stream, const int3 src, + const int3 dst, const int num_vertices, AcMesh* host_mesh) +{ + for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { + acDeviceStoreVertexBufferWithOffset(device, stream, (VertexBufferHandle)i, src, dst, + num_vertices, host_mesh); + } + + return AC_SUCCESS; +} + +AcResult +acDeviceStoreVertexBuffer(const Device device, const Stream stream, + const VertexBufferHandle vtxbuf_handle, AcMesh* host_mesh) +{ + int3 src = (int3){0, 0, 0}; + int3 dst = src; + const size_t num_vertices = acVertexBufferSize(device->local_config); + + acDeviceStoreVertexBufferWithOffset(device, stream, vtxbuf_handle, src, dst, num_vertices, + host_mesh); + + return AC_SUCCESS; +} + +AcResult +acDeviceStoreMesh(const Device device, const Stream stream, AcMesh* host_mesh) +{ + for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { + acDeviceStoreVertexBuffer(device, stream, (VertexBufferHandle)i, host_mesh); + } + + return AC_SUCCESS; +} + +AcResult +acDeviceTransferVertexBufferWithOffset(const Device src_device, const Stream stream, + const VertexBufferHandle vtxbuf_handle, const int3src, + const int3dst, const int num_vertices, Device dst_device) +{ + cudaSetDevice(src_device->id); + const size_t src_idx = acVertexBufferIdx(src.x, src.y, src.z, src_device->local_config); + const size_t dst_idx = acVertexBufferIdx(dst.x, dst.y, dst.z, dst_device->local_config); + + ERRCHK_CUDA( // + cudaMemcpyPeerAsync(&dst_device->vba.in[vtxbuf_handle][dst_idx], dst_device->id, + &src_device->vba.in[vtxbuf_handle][src_idx], src_device->id, + sizeof(src_device->vba.in[vtxbuf_handle][0]) * num_vertices, + src_device->streams[stream]) // + ); + return AC_SUCCESS; +} + 
+AcResult +acDeviceTransferMeshWithOffset(const Device src_device, const Stream stream, const int3src, + const int3dst, const int num_vertices, Device dst_device) +{ + for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { + acDeviceTransferVertexBufferWithOffset(src_device, stream, vtxbuf_handle, src, dst, + num_vertices, dst_device); + } + return AC_SUCCESS; +} + +AcResult +acDeviceTransferVertexBuffer(const Device src_device, const Stream stream, + const VertexBufferHandle vtxbuf_handle, Device dst_device) +{ + int3 src = (int3){0, 0, 0}; + int3 dst = src; + const size_t num_vertices = acVertexBufferSize(device->local_config); + + acDeviceTransferVertexBufferWithOffset(src_device, stream, vtxbuf_handle, src, dst, + num_vertices, dst_device); + return AC_SUCCESS; +} + +AcResult +acDeviceTransferMesh(const Device src_device, const Stream stream, Device* dst_device) +{ + for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { + acDeviceTransferVertexBuffer(src_device, stream, vtxbuf_handle, (VertexBufferHandle)i, src, + dst, num_vertices, dst_device); + } +} + +AcResult +acDeviceIntegrateSubstep(const Device device, const Stream stream, const int step_number, + const int3 start, const int3 end, const AcReal dt) +{ + cudaSetDevice(device->id); + const cudaStream_t stream = device->streams[stream]; + + const dim3 tpb = rk3_tpb; + + const int3 n = end - start; + const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), // + (unsigned int)ceil(n.y / AcReal(tpb.y)), // + (unsigned int)ceil(n.z / AcReal(tpb.z))); + + if (step_number == 0) + solve<0><<>>(start, end, device->vba, dt); + else if (step_number == 1) + solve<1><<>>(start, end, device->vba, dt); + else + solve<2><<>>(start, end, device->vba, dt); + + ERRCHK_CUDA_KERNEL(); + + return AC_SUCCESS; +} + +AcResult +acDevicePeriodicBoundcondStep(const Device device, const Stream stream, + const VertexBufferHandle vtxbuf_handle, const int3 start, + const int3 end) +{ + cudaSetDevice(device->id); + const cudaStream_t stream = 
device->streams[stream]; + + const dim3 tpb(8, 2, 8); + const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x), + (unsigned int)ceil((end.y - start.y) / (float)tpb.y), + (unsigned int)ceil((end.z - start.z) / (float)tpb.z)); + + kernel_periodic_boundconds<<>>(start, end, device->vba.in[vtxbuf_handle]); + ERRCHK_CUDA_KERNEL(); + + return AC_SUCCESS; +} + +AcResult +acDevicePeriodicBoundconds(const Device device, const Stream stream, const int3 start, + const int3 end) +{ + for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { + acDevicePeriodicBoundcondStep(device, stream, (VertexBufferHandle)i, start, end); + } + return AC_SUCCESS; +} + +AcResult +acDeviceReduceScal(const Device device, const Stream stream, const ReductionType rtype, + const VertexBufferHandle vtxbuf_handle, AcReal* result) +{ + cudaSetDevice(device->id); + + const int3 start = (int3){device->local_config.int_params[AC_nx_min], + device->local_config.int_params[AC_ny_min], + device->local_config.int_params[AC_nz_min]}; + + const int3 end = (int3){device->local_config.int_params[AC_nx_max], + device->local_config.int_params[AC_ny_max], + device->local_config.int_params[AC_nz_max]}; + + *result = reduce_scal(device->streams[stream], rtype, start, end, device->vba.in[vtxbuf_handle], + device->reduce_scratchpad, device->reduce_result); + return AC_SUCCESS; +} + +AcResult +acDeviceReduceVec(const Device device, const Stream stream, const ReductionType rtype, + const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1, + const VertexBufferHandle vtxbuf2, AcReal* result) +{ + cudaSetDevice(device->id); + + const int3 start = (int3){device->local_config.int_params[AC_nx_min], + device->local_config.int_params[AC_ny_min], + device->local_config.int_params[AC_nz_min]}; + + const int3 end = (int3){device->local_config.int_params[AC_nx_max], + device->local_config.int_params[AC_ny_max], + device->local_config.int_params[AC_nz_max]}; + + *result = reduce_vec(device->streams[stream], rtype, start, 
end, device->vba.in[vtxbuf0], + device->vba.in[vtxbuf1], device->vba.in[vtxbuf2], + device->reduce_scratchpad, device->reduce_result); + return AC_SUCCESS; +} + +#if PACKED_DATA_TRANSFERS +// Functions for calling packed data transfers +#endif diff --git a/src/core/kernels/kernels.cuh b/src/core/kernels/kernels.cuh index e8e4042..7a4e54b 100644 --- a/src/core/kernels/kernels.cuh +++ b/src/core/kernels/kernels.cuh @@ -74,18 +74,6 @@ kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vtxbuf) vtxbuf[dst_idx] = vtxbuf[src_idx]; } -void -periodic_boundconds(const cudaStream_t stream, const int3& start, const int3& end, AcReal* vtxbuf) -{ - const dim3 tpb(8, 2, 8); - const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x), - (unsigned int)ceil((end.y - start.y) / (float)tpb.y), - (unsigned int)ceil((end.z - start.z) / (float)tpb.z)); - - kernel_periodic_boundconds<<>>(start, end, vtxbuf); - ERRCHK_CUDA_KERNEL(); -} - /////////////////////////////////////////////////////////////////////////////////////////////////// #include From 9b7f4277fce6cbafc500280af3aeedd12a2f6407 Mon Sep 17 00:00:00 2001 From: jpekkila Date: Wed, 31 Jul 2019 19:07:26 +0300 Subject: [PATCH 04/37] Fixed errors in device.cu --- src/core/device.cu | 106 ++++++++++++++++++++------------------------- 1 file changed, 47 insertions(+), 59 deletions(-) diff --git a/src/core/device.cu b/src/core/device.cu index dbee2cc..042a1f8 100644 --- a/src/core/device.cu +++ b/src/core/device.cu @@ -24,7 +24,7 @@ * Detailed info. 
* */ -#include "astaroth_device.cuh" +#include "astaroth_device.h" #include "errchk.h" @@ -41,7 +41,6 @@ typedef struct { __constant__ AcMeshInfo d_mesh_info; __constant__ int3 d_multigpu_offset; -__constant__ Grid globalGrid; #define DCONST_INT(X) (d_mesh_info.int_params[X]) #define DCONST_INT3(X) (d_mesh_info.int3_params[X]) #define DCONST_REAL(X) (d_mesh_info.real_params[X]) @@ -50,7 +49,7 @@ __constant__ Grid globalGrid; #define DEVICE_1D_COMPDOMAIN_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_nx) + (k)*DCONST_INT(AC_nxy)) #include "kernels/kernels.cuh" -static dim3 rk3_tpb = (dim3){32, 1, 4}; +static dim3 rk3_tpb(32, 1, 4); #if PACKED_DATA_TRANSFERS // Defined in device.cuh // #include "kernels/pack_unpack.cuh" @@ -136,7 +135,7 @@ acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device_hand // Autoptimize if (id == 0) { - autoOptimize(device); + acDeviceAutoOptimize(device); } return AC_SUCCESS; @@ -346,34 +345,29 @@ acDeviceLoadConstant(const Device device, const Stream stream, const AcRealParam return AC_SUCCESS; } -static AcResult -load_with_offset(const Device device, const Stream stream, const AcReal* src, const size_t bytes, - AcReal* dst) -{ - cudaSetDevice(device->id); - ERRCHK_CUDA( // - cudaMemcpyAsync(dst, src, bytes, cudaMemcpyHostToDevice, device->streams[stream]) // - ); - return AC_SUCCESS; -} - AcResult -acDeviceLoadVertexBufferWithOffset(const Device device, const Stream stream, const AcMeshhost_mesh, - const VertexBufferHandle vtxbuf_handle, const int3src, - const int3dst, const int num_vertices) +acDeviceLoadVertexBufferWithOffset(const Device device, const Stream stream, const AcMesh host_mesh, + const VertexBufferHandle vtxbuf_handle, const int3 src, + const int3 dst, const int num_vertices) { cudaSetDevice(device->id); const size_t src_idx = acVertexBufferIdx(src.x, src.y, src.z, host_mesh.info); const size_t dst_idx = acVertexBufferIdx(dst.x, dst.y, dst.z, device->local_config); - load_with_offset(device, stream, 
&host_mesh.vertex_buffer[vtxbuf_handle][src_idx], - num_vertices * sizeof(AcReal), &device->vba.in[vtxbuf_handle][dst_idx]); + + const AcReal* src_ptr = &host_mesh.vertex_buffer[vtxbuf_handle][src_idx]; + AcReal* dst_ptr = &device->vba.in[vtxbuf_handle][dst_idx]; + const size_t bytes = num_vertices * sizeof(src_ptr[0]); + + ERRCHK_CUDA( // + cudaMemcpyAsync(dst_ptr, src_ptr, bytes, cudaMemcpyHostToDevice, device->streams[stream]) // + ); return AC_SUCCESS; } AcResult -acDeviceLoadMeshWithOffset(const Device device, const Stream stream, const AcMeshhost_mesh, - const int3src, const int3dst, const int num_vertices) +acDeviceLoadMeshWithOffset(const Device device, const Stream stream, const AcMesh host_mesh, + const int3 src, const int3 dst, const int num_vertices) { for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { acDeviceLoadVertexBufferWithOffset(device, stream, host_mesh, (VertexBufferHandle)i, src, @@ -386,10 +380,11 @@ AcResult acDeviceLoadVertexBuffer(const Device device, const Stream stream, const AcMesh host_mesh, const VertexBufferHandle vtxbuf_handle) { - int3 src = (int3){0, 0, 0}; - int3 dst = src; + const int3 src = (int3){0, 0, 0}; + const int3 dst = src; const size_t num_vertices = acVertexBufferSize(device->local_config); - acLoadVertexBufferWithOffset(device, stream, host_mesh, vtxbuf_handle, src, dst, num_vertices); + acDeviceLoadVertexBufferWithOffset(device, stream, host_mesh, vtxbuf_handle, src, dst, + num_vertices); return AC_SUCCESS; } @@ -404,17 +399,6 @@ acDeviceLoadMesh(const Device device, const Stream stream, const AcMesh host_mes return AC_SUCCESS; } -static AcResult -store_with_offset(const Device device, const Stream stream, const AcReal* src, const size_t bytes, - AcReal* dst) -{ - cudaSetDevice(device->id); - ERRCHK_CUDA( // - cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDeviceToHost, device->streams[stream]) // - ); - return AC_SUCCESS; -} - AcResult acDeviceStoreVertexBufferWithOffset(const Device device, const Stream stream, const 
VertexBufferHandle vtxbuf_handle, const int3 src, @@ -423,9 +407,14 @@ acDeviceStoreVertexBufferWithOffset(const Device device, const Stream stream, cudaSetDevice(device->id); const size_t src_idx = acVertexBufferIdx(src.x, src.y, src.z, device->local_config); const size_t dst_idx = acVertexBufferIdx(dst.x, dst.y, dst.z, host_mesh->info); - store_with_offset(device, stream, &device->vba.in[vtxbuf_handle][src_idx], - num_vertices * sizeof(AcReal), - &host_mesh->vertex_buffer[vtxbuf_handle][dst_idx]); + + const AcReal* src_ptr = &device->vba.in[vtxbuf_handle][dst_idx]; + AcReal* dst_ptr = &host_mesh->vertex_buffer[vtxbuf_handle][src_idx]; + const size_t bytes = num_vertices * sizeof(src_ptr[0]); + + ERRCHK_CUDA( // + cudaMemcpyAsync(dst_ptr, src_ptr, bytes, cudaMemcpyDeviceToHost, device->streams[stream]) // + ); return AC_SUCCESS; } @@ -468,28 +457,28 @@ acDeviceStoreMesh(const Device device, const Stream stream, AcMesh* host_mesh) AcResult acDeviceTransferVertexBufferWithOffset(const Device src_device, const Stream stream, - const VertexBufferHandle vtxbuf_handle, const int3src, - const int3dst, const int num_vertices, Device dst_device) + const VertexBufferHandle vtxbuf_handle, const int3 src, + const int3 dst, const int num_vertices, Device dst_device) { cudaSetDevice(src_device->id); const size_t src_idx = acVertexBufferIdx(src.x, src.y, src.z, src_device->local_config); const size_t dst_idx = acVertexBufferIdx(dst.x, dst.y, dst.z, dst_device->local_config); - ERRCHK_CUDA( // - cudaMemcpyPeerAsync(&dst_device->vba.in[vtxbuf_handle][dst_idx], dst_device->id, - &src_device->vba.in[vtxbuf_handle][src_idx], src_device->id, - sizeof(src_device->vba.in[vtxbuf_handle][0]) * num_vertices, - src_device->streams[stream]) // - ); + const AcReal* src_ptr = &src_device->vba.in[vtxbuf_handle][src_idx]; + AcReal* dst_ptr = &dst_device->vba.in[vtxbuf_handle][dst_idx]; + const size_t bytes = num_vertices * sizeof(src_ptr[0]); + + ERRCHK_CUDA(cudaMemcpyPeerAsync(dst_ptr, 
dst_device->id, src_ptr, src_device->id, bytes, + src_device->streams[stream])); return AC_SUCCESS; } AcResult -acDeviceTransferMeshWithOffset(const Device src_device, const Stream stream, const int3src, - const int3dst, const int num_vertices, Device dst_device) +acDeviceTransferMeshWithOffset(const Device src_device, const Stream stream, const int3 src, + const int3 dst, const int num_vertices, Device dst_device) { for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { - acDeviceTransferVertexBufferWithOffset(src_device, stream, vtxbuf_handle, src, dst, + acDeviceTransferVertexBufferWithOffset(src_device, stream, (VertexBufferHandle)i, src, dst, num_vertices, dst_device); } return AC_SUCCESS; @@ -501,7 +490,7 @@ acDeviceTransferVertexBuffer(const Device src_device, const Stream stream, { int3 src = (int3){0, 0, 0}; int3 dst = src; - const size_t num_vertices = acVertexBufferSize(device->local_config); + const size_t num_vertices = acVertexBufferSize(src_device->local_config); acDeviceTransferVertexBufferWithOffset(src_device, stream, vtxbuf_handle, src, dst, num_vertices, dst_device); @@ -512,9 +501,9 @@ AcResult acDeviceTransferMesh(const Device src_device, const Stream stream, Device* dst_device) { for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { - acDeviceTransferVertexBuffer(src_device, stream, vtxbuf_handle, (VertexBufferHandle)i, src, - dst, num_vertices, dst_device); + acDeviceTransferVertexBuffer(src_device, stream, (VertexBufferHandle)i, dst_device); } + return AC_SUCCESS; } AcResult @@ -522,7 +511,6 @@ acDeviceIntegrateSubstep(const Device device, const Stream stream, const int ste const int3 start, const int3 end, const AcReal dt) { cudaSetDevice(device->id); - const cudaStream_t stream = device->streams[stream]; const dim3 tpb = rk3_tpb; @@ -532,11 +520,11 @@ acDeviceIntegrateSubstep(const Device device, const Stream stream, const int ste (unsigned int)ceil(n.z / AcReal(tpb.z))); if (step_number == 0) - solve<0><<>>(start, end, device->vba, dt); + 
solve<0><<streams[stream]>>>(start, end, device->vba, dt); else if (step_number == 1) - solve<1><<>>(start, end, device->vba, dt); + solve<1><<streams[stream]>>>(start, end, device->vba, dt); else - solve<2><<>>(start, end, device->vba, dt); + solve<2><<streams[stream]>>>(start, end, device->vba, dt); ERRCHK_CUDA_KERNEL(); @@ -544,12 +532,12 @@ acDeviceIntegrateSubstep(const Device device, const Stream stream, const int ste } AcResult -acDevicePeriodicBoundcondStep(const Device device, const Stream stream, +acDevicePeriodicBoundcondStep(const Device device, const Stream stream_type, const VertexBufferHandle vtxbuf_handle, const int3 start, const int3 end) { cudaSetDevice(device->id); - const cudaStream_t stream = device->streams[stream]; + const cudaStream_t stream = device->streams[stream_type]; const dim3 tpb(8, 2, 8); const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x), From 0a5d02517299e2bd9aabbef7f292097012329247 Mon Sep 17 00:00:00 2001 From: jpekkila Date: Wed, 31 Jul 2019 19:08:16 +0300 Subject: [PATCH 05/37] Formatting --- src/core/device.cu | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/core/device.cu b/src/core/device.cu index 042a1f8..b662301 100644 --- a/src/core/device.cu +++ b/src/core/device.cu @@ -73,10 +73,9 @@ struct device_s { #endif }; -static __global__ void -dummy_kernel(void) -{ -} +// clang-format off +static __global__ void dummy_kernel(void) {} +// clang-format on AcResult acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device_handle) From fb0610c1ba575aabecdf5fbb79a371955b610992 Mon Sep 17 00:00:00 2001 From: jpekkila Date: Wed, 31 Jul 2019 20:04:39 +0300 Subject: [PATCH 06/37] Intermediate changes to the revised node interface --- include/astaroth_device.h | 12 +- include/astaroth_node.h | 34 ++- src/core/device.cu | 6 + src/core/node.cu | 465 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 502 insertions(+), 15 deletions(-) diff --git a/include/astaroth_device.h 
b/include/astaroth_device.h index 929b591..75ec693 100644 --- a/include/astaroth_device.h +++ b/include/astaroth_device.h @@ -55,7 +55,7 @@ AcResult acDeviceLoadVertexBufferWithOffset(const Device device, const Stream st const VertexBufferHandle vtxbuf_handle, const int3 src, const int3 dst, const int num_vertices); -/** */ +/** Deprecated */ AcResult acDeviceLoadMeshWithOffset(const Device device, const Stream stream, const AcMesh host_mesh, const int3 src, const int3 dst, const int num_vertices); @@ -64,7 +64,7 @@ AcResult acDeviceLoadMeshWithOffset(const Device device, const Stream stream, AcResult acDeviceLoadVertexBuffer(const Device device, const Stream stream, const AcMesh host_mesh, const VertexBufferHandle vtxbuf_handle); -/** */ +/** Deprecated */ AcResult acDeviceLoadMesh(const Device device, const Stream stream, const AcMesh host_mesh); /** */ @@ -73,7 +73,7 @@ AcResult acDeviceStoreVertexBufferWithOffset(const Device device, const Stream s const int3 dst, const int num_vertices, AcMesh* host_mesh); -/** */ +/** Deprecated */ AcResult acDeviceStoreMeshWithOffset(const Device device, const Stream stream, const int3 src, const int3 dst, const int num_vertices, AcMesh* host_mesh); @@ -81,7 +81,7 @@ AcResult acDeviceStoreMeshWithOffset(const Device device, const Stream stream, c AcResult acDeviceStoreVertexBuffer(const Device device, const Stream stream, const VertexBufferHandle vtxbuf_handle, AcMesh* host_mesh); -/** */ +/** Deprecated */ AcResult acDeviceStoreMesh(const Device device, const Stream stream, AcMesh* host_mesh); /** */ @@ -90,7 +90,7 @@ AcResult acDeviceTransferVertexBufferWithOffset(const Device src_device, const S const int3 src, const int3 dst, const int num_vertices, Device dst_device); -/** */ +/** Deprecated */ AcResult acDeviceTransferMeshWithOffset(const Device src_device, const Stream stream, const int3 src, const int3 dst, const int num_vertices, Device* dst_device); @@ -99,7 +99,7 @@ AcResult acDeviceTransferMeshWithOffset(const 
Device src_device, const Stream st AcResult acDeviceTransferVertexBuffer(const Device src_device, const Stream stream, const VertexBufferHandle vtxbuf_handle, Device* dst_device); -/** */ +/** Deprecated */ AcResult acDeviceTransferMesh(const Device src_device, const Stream stream, Device* dst_device); /** */ diff --git a/include/astaroth_node.h b/include/astaroth_node.h index 216db25..0e18da1 100644 --- a/include/astaroth_node.h +++ b/include/astaroth_node.h @@ -24,26 +24,36 @@ extern "C" { #include "astaroth_defines.h" -typedef struct node_s* Node; +typedef struct node_s* Node; // Opaque pointer to node_s. + +typedef struct { + +} DeviceConfiguration; /** */ -AcResult acNodeCreate(const AcMeshInfo node_config, Node* node); +AcResult acNodeCreate(const int id, const AcMeshInfo node_config, Node* node); /** */ AcResult acNodeDestroy(Node node); +/** */ +AcResult acNodePrintInfo(const Node node); + /** */ AcResult acNodeQueryDeviceConfiguration(const Node node, DeviceConfiguration* config); +/** */ +AcResult acNodeAutoOptimize(const Node node); + /** */ AcResult acNodeSynchronizeStream(const Node node, const Stream stream); /** */ AcResult acNodeSynchronizeVertexBuffer(const Node node, const Stream stream, - const VertexBufferHandle vtxbuf_handle); + const VertexBufferHandle vtxbuf_handle); // Not in Device /** */ -AcResult acNodeSynchronizeMesh(const Node node, const Stream stream); +AcResult acNodeSynchronizeMesh(const Node node, const Stream stream); // Not in Device /** */ AcResult acNodeSwapBuffers(const Node node); @@ -90,7 +100,7 @@ AcResult acNodeStoreMesh(const Node node, const Stream stream, AcMesh* host_mesh AcResult acNodeTransferVertexBufferWithOffset(const Node src_node, const Stream stream, const VertexBufferHandle vtxbuf_handle, const int3 src, const int3 dst, - const int num_vertices, Node* dst_node); + const int num_vertices, Node dst_node); /** */ AcResult acNodeTransferMeshWithOffset(const Node src_node, const Stream stream, const int3 src, @@ 
-107,15 +117,21 @@ AcResult acNodeTransferMesh(const Node src_node, const Stream stream, Node* dst_ AcResult acNodeIntegrateSubstep(const Node node, const Stream stream, const int step_number, const int3 start, const int3 end, const AcReal dt); /** */ -AcResult acNodePeriodicBoundcondStep(const Node node, const Stream stream, const int3 start, +AcResult acNodePeriodicBoundcondStep(const Node node, const Stream stream, + const VertexBufferHandle vtxbuf_handle, const int3 start, const int3 end); + +/** */ +AcResult acNodePeriodicBoundconds(const Node node, const Stream stream, const int3 start, + const int3 end); + /** */ AcResult acNodeReduceScal(const Node node, const Stream stream, const ReductionType rtype, const VertexBufferHandle vtxbuf_handle, AcReal* result); /** */ -AcResult acNodeReduceVec(const Node node, const Stream stream, const ReductionType rtype, - const VertexBufferHandle vec0, const VertexBufferHandle vec1, - const VertexBufferHandle vec2, AcReal* result); +AcResult acNodeReduceVec(const Node node, const Stream stream_type, const ReductionType rtype, + const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1, + const VertexBufferHandle vtxbuf2, AcReal* result); #ifdef __cplusplus } // extern "C" diff --git a/src/core/device.cu b/src/core/device.cu index b662301..d643ad1 100644 --- a/src/core/device.cu +++ b/src/core/device.cu @@ -368,6 +368,7 @@ AcResult acDeviceLoadMeshWithOffset(const Device device, const Stream stream, const AcMesh host_mesh, const int3 src, const int3 dst, const int num_vertices) { + WARNING("This function is deprecated"); for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { acDeviceLoadVertexBufferWithOffset(device, stream, host_mesh, (VertexBufferHandle)i, src, dst, num_vertices); @@ -391,6 +392,7 @@ acDeviceLoadVertexBuffer(const Device device, const Stream stream, const AcMesh AcResult acDeviceLoadMesh(const Device device, const Stream stream, const AcMesh host_mesh) { + WARNING("This function is deprecated"); for (int 
i = 0; i < NUM_VTXBUF_HANDLES; ++i) { acDeviceLoadVertexBuffer(device, stream, host_mesh, (VertexBufferHandle)i); } @@ -422,6 +424,7 @@ AcResult acDeviceStoreMeshWithOffset(const Device device, const Stream stream, const int3 src, const int3 dst, const int num_vertices, AcMesh* host_mesh) { + WARNING("This function is deprecated"); for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { acDeviceStoreVertexBufferWithOffset(device, stream, (VertexBufferHandle)i, src, dst, num_vertices, host_mesh); @@ -447,6 +450,7 @@ acDeviceStoreVertexBuffer(const Device device, const Stream stream, AcResult acDeviceStoreMesh(const Device device, const Stream stream, AcMesh* host_mesh) { + WARNING("This function is deprecated"); for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { acDeviceStoreVertexBuffer(device, stream, (VertexBufferHandle)i, host_mesh); } @@ -476,6 +480,7 @@ AcResult acDeviceTransferMeshWithOffset(const Device src_device, const Stream stream, const int3 src, const int3 dst, const int num_vertices, Device dst_device) { + WARNING("This function is deprecated"); for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { acDeviceTransferVertexBufferWithOffset(src_device, stream, (VertexBufferHandle)i, src, dst, num_vertices, dst_device); @@ -499,6 +504,7 @@ acDeviceTransferVertexBuffer(const Device src_device, const Stream stream, AcResult acDeviceTransferMesh(const Device src_device, const Stream stream, Device* dst_device) { + WARNING("This function is deprecated"); for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { acDeviceTransferVertexBuffer(src_device, stream, (VertexBufferHandle)i, dst_device); } diff --git a/src/core/node.cu b/src/core/node.cu index 1c3d5cf..296f235 100644 --- a/src/core/node.cu +++ b/src/core/node.cu @@ -18,5 +18,470 @@ */ #include "astaroth_node.h" +#include "astaroth_device.h" +#include "errchk.h" +#include "math_utils.h" // sum for reductions + +#define AC_GEN_STR(X) #X +const char* intparam_names[] = {AC_FOR_BUILTIN_INT_PARAM_TYPES(AC_GEN_STR) // + 
AC_FOR_USER_INT_PARAM_TYPES(AC_GEN_STR)}; +const char* int3param_names[] = {AC_FOR_BUILTIN_INT3_PARAM_TYPES(AC_GEN_STR) // + AC_FOR_USER_INT3_PARAM_TYPES(AC_GEN_STR)}; +const char* realparam_names[] = {AC_FOR_BUILTIN_REAL_PARAM_TYPES(AC_GEN_STR) // + AC_FOR_USER_REAL_PARAM_TYPES(AC_GEN_STR)}; +const char* real3param_names[] = {AC_FOR_BUILTIN_REAL3_PARAM_TYPES(AC_GEN_STR) // + AC_FOR_USER_REAL3_PARAM_TYPES(AC_GEN_STR)}; +const char* vtxbuf_names[] = {AC_FOR_VTXBUF_HANDLES(AC_GEN_STR)}; +#undef AC_GEN_STR + +static const int MAX_NUM_DEVICES = 32; +static Node node = NULL; + struct node_s { + int id; + + int num_devices; + Device devices[MAX_NUM_DEVICES]; + + Grid grid; + Grid subgrid; }; + +static int +gridIdx(const Grid grid, const int3 idx) +{ + return idx.x + idx.y * grid.m.x + idx.z * grid.m.x * grid.m.y; +} + +static int3 +gridIdx3d(const Grid grid, const int idx) +{ + return (int3){idx % grid.m.x, (idx % (grid.m.x * grid.m.y)) / grid.m.x, + idx / (grid.m.x * grid.m.y)}; +} + +static void +printInt3(const int3 vec) +{ + printf("(%d, %d, %d)", vec.x, vec.y, vec.z); +} + +static inline void +print(const AcMeshInfo config) +{ + for (int i = 0; i < NUM_INT_PARAMS; ++i) + printf("[%s]: %d\n", intparam_names[i], config.int_params[i]); + for (int i = 0; i < NUM_REAL_PARAMS; ++i) + printf("[%s]: %g\n", realparam_names[i], double(config.real_params[i])); +} + +static void +update_builtin_params(AcMeshInfo* config) +{ + config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER; + ///////////// PAD TEST + // config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER + PAD_SIZE; + ///////////// PAD TEST + config->int_params[AC_my] = config->int_params[AC_ny] + STENCIL_ORDER; + config->int_params[AC_mz] = config->int_params[AC_nz] + STENCIL_ORDER; + + // Bounds for the computational domain, i.e. 
nx_min <= i < nx_max + config->int_params[AC_nx_min] = NGHOST; + config->int_params[AC_nx_max] = config->int_params[AC_nx_min] + config->int_params[AC_nx]; + config->int_params[AC_ny_min] = NGHOST; + config->int_params[AC_ny_max] = config->int_params[AC_ny] + NGHOST; + config->int_params[AC_nz_min] = NGHOST; + config->int_params[AC_nz_max] = config->int_params[AC_nz] + NGHOST; + + /* Additional helper params */ + // Int helpers + config->int_params[AC_mxy] = config->int_params[AC_mx] * config->int_params[AC_my]; + config->int_params[AC_nxy] = config->int_params[AC_nx] * config->int_params[AC_ny]; + config->int_params[AC_nxyz] = config->int_params[AC_nxy] * config->int_params[AC_nz]; + +#if VERBOSE_PRINTING // Defined in astaroth.h + printf("###############################################################\n"); + printf("Config dimensions recalculated:\n"); + print(*config); + printf("###############################################################\n"); +#endif +} + +static Grid +createGrid(const AcMeshInfo config) +{ + Grid grid; + + grid.m = (int3){config.int_params[AC_mx], config.int_params[AC_my], config.int_params[AC_mz]}; + grid.n = (int3){config.int_params[AC_nx], config.int_params[AC_ny], config.int_params[AC_nz]}; + + return grid; +} + +AcResult +acNodeCreate(const int id, const AcMeshInfo node_config, Node* node_handle) +{ + struct node_s* node = (struct device_s*)malloc(sizeof(*node)); + + // Get num_devices + ERRCHK_CUDA_ALWAYS(cudaGetDeviceCount(&num_devices)); + if (num_devices < 1) { + ERROR("No CUDA devices found!"); + return AC_FAILURE; + } + if (num_devices > MAX_NUM_DEVICES) { + WARNING("More devices found than MAX_NUM_DEVICES. Using only MAX_NUM_DEVICES"); + num_devices = MAX_NUM_DEVICES; + } + if (!AC_MULTIGPU_ENABLED) { + WARNING("MULTIGPU_ENABLED was false. Using only one device"); + num_devices = 1; // Use only one device if multi-GPU is not enabled + } + // Check that num_devices is divisible with AC_nz. 
This makes decomposing the + // problem domain to multiple GPUs much easier since we do not have to worry + // about remainders + ERRCHK_ALWAYS(config.int_params[AC_nz] % num_devices == 0); + + // Decompose the problem domain + // The main grid + grid = createGrid(config); + + // Subgrids + AcMeshInfo subgrid_config = config; + subgrid_config.int_params[AC_nz] /= num_devices; + update_builtin_params(&subgrid_config); + subgrid = createGrid(subgrid_config); + + // Periodic boundary conditions become weird if the system can "fold onto itself". + ERRCHK_ALWAYS(subgrid.n.x >= STENCIL_ORDER); + ERRCHK_ALWAYS(subgrid.n.y >= STENCIL_ORDER); + ERRCHK_ALWAYS(subgrid.n.z >= STENCIL_ORDER); + +#if VERBOSE_PRINTING + // clang-format off + printf("Grid m "); printInt3(grid.m); printf("\n"); + printf("Grid n "); printInt3(grid.n); printf("\n"); + printf("Subgrid m "); printInt3(subgrid.m); printf("\n"); + printf("Subgrid n "); printInt3(subgrid.n); printf("\n"); + // clang-format on +#endif + + // Initialize the devices + for (int i = 0; i < num_devices; ++i) { + createDevice(i, subgrid_config, &devices[i]); + printDeviceInfo(devices[i]); + } + + // Enable peer access + for (int i = 0; i < num_devices; ++i) { + const int front = (i + 1) % num_devices; + const int back = (i - 1 + num_devices) % num_devices; + + int can_access_front, can_access_back; + cudaDeviceCanAccessPeer(&can_access_front, i, front); + cudaDeviceCanAccessPeer(&can_access_back, i, back); +#if VERBOSE_PRINTING + printf( + "Trying to enable peer access from %d to %d (can access: %d) and %d (can access: %d)\n", + i, front, can_access_front, back, can_access_back); +#endif + + cudaSetDevice(i); + if (can_access_front) { + ERRCHK_CUDA_ALWAYS(cudaDeviceEnablePeerAccess(front, 0)); + } + if (can_access_back) { + ERRCHK_CUDA_ALWAYS(cudaDeviceEnablePeerAccess(back, 0)); + } + } + acNodeSynchronizeStream(STREAM_ALL); + + *node_handle = node; + return AC_SUCCESS; +} + +AcResult +acNodeDestroy(Node node) +{ + 
acSynchronizeStream(STREAM_ALL); + + for (int i = 0; i < num_devices; ++i) { + destroyDevice(devices[i]); + } + free(node); + + return AC_SUCCESS; +} + +AcResult +acNodePrintInfo(const Node node) +{ + WARNING("Not implemented"); + return AC_FAILURE; +} + +AcResult +acNodeQueryDeviceConfiguration(const Node node, DeviceConfiguration* config) +{ + WARNING("Not implemented"); + return AC_FAILURE; +} + +AcResult +acNodeAutoOptimize(const Node node) +{ + WARNING("Not implemented"); + return AC_FAILURE; +} + +AcResult +acNodeSynchronizeStream(const Node node, const Stream stream) +{ + for (int i = 0; i < num_devices; ++i) { + synchronize(devices[i], stream); + } + + return AC_SUCCESS; +} + +AcResult +acNodeSynchronizeVertexBuffer(const Node node, const Stream stream, + const VertexBufferHandle vtxbuf_handle) +{ + // Exchanges the halos of subgrids + // After this step, the data within the main grid ranging from + // (0, 0, NGHOST) -> grid.m.x, grid.m.y, NGHOST + grid.n.z + // has been synchronized and transferred to appropriate subgrids + + // We loop only to num_devices - 1 since the front and back plate of the grid is not + // transferred because their contents depend on the boundary conditions. + + // IMPORTANT NOTE: the boundary conditions must be applied before calling this function! + // I.e. the halos of subgrids must contain up-to-date data! + + const size_t num_vertices = subgrid.m.x * subgrid.m.y * NGHOST; + + for (int i = 0; i < num_devices - 1; ++i) { + // ...|ooooxxx|... -> xxx|ooooooo|... + const int3 src = (int3){0, 0, subgrid.n.z}; + const int3 dst = (int3){0, 0, 0}; + + const Device src_device = devices[i]; + Device dst_device = devices[i + 1]; + + acDeviceTransferVertexBufferWithOffset(src_device, stream, vtxbuf_handle, src, dst, + num_vertices, dst_device); + } + for (int i = 1; i < num_devices; ++i) { + // ...|ooooooo|xxx <- ...|xxxoooo|... 
+ const int3 src = (int3){0, 0, NGHOST}; + const int3 dst = (int3){0, 0, NGHOST + subgrid.n.z}; + + const Device src_device = devices[i]; + Device dst_device = devices[i - 1]; + + acDeviceTransferVertexBufferWithOffset(src_device, stream, vtxbuf_handle, src, dst, + num_vertices, dst_device); + } + return AC_SUCCESS; +} + +AcResult +acNodeSynchronizeMesh(const Node node, const Stream stream) +{ + for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { + acNodeSynchronizeVertexBuffer(node, stream, (VertexBufferHandle)i); + } + + return AC_SUCCESS; +} + +AcResult +acNodeSwapBuffers(const Node node) +{ + for (int i = 0; i < num_devices; ++i) { + acDeviceSwapBuffers(devices[i]); + } + return AC_SUCCESS; +} + +AcResult +acNodeLoadConstant(const Node node, const Stream stream, const AcRealParam param, + const AcReal value) +{ + for (int i = 0; i < num_devices; ++i) { + acDeviceLoadConstant(devices[i], stream, param, value); + } + return AC_SUCCESS; +} + +AcResult +acNodeLoadVertexBufferWithOffset(const Node node, const Stream stream, const AcMesh host_mesh, + const VertexBufferHandle vtxbuf_handle, const int3 src, + const int3 dst, const int num_vertices) +{ + // See the beginning of the file for an explanation of the index mapping + // #pragma omp parallel for + for (int i = 0; i < num_devices; ++i) { + const int3 d0 = (int3){0, 0, i * subgrid.n.z}; // DECOMPOSITION OFFSET HERE + const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.m.z}; + + const int3 s0 = src; + const int3 s1 = gridIdx3d(grid, gridIdx(grid, s0) + num_vertices); + + const int3 da = max(s0, d0); + const int3 db = min(s1, d1); + /* + printf("Device %d\n", i); + printf("\ts0: "); printInt3(s0); printf("\n"); + printf("\td0: "); printInt3(d0); printf("\n"); + printf("\tda: "); printInt3(da); printf("\n"); + printf("\tdb: "); printInt3(db); printf("\n"); + printf("\td1: "); printInt3(d1); printf("\n"); + printf("\ts1: "); printInt3(s1); printf("\n"); + printf("\t-> %s to device %d\n", db.z >= da.z ? 
"Copy" : "Do not copy", i); + */ + if (db.z >= da.z) { + const int copy_cells = gridIdx(subgrid, db) - gridIdx(subgrid, da); + // DECOMPOSITION OFFSET HERE + const int3 da_local = (int3){da.x, da.y, da.z - i * grid.n.z / num_devices}; + // printf("\t\tcopy %d cells to local index ", copy_cells); printInt3(da_local); + // printf("\n"); + acDeviceLoadVertexBufferWithOffset(devices[i], stream, host_mesh, vtxbuf_handle, da, + da_local, copy_cells); + } + // printf("\n"); + } + return AC_SUCCESS; +} + +AcResult +acNodeLoadMeshWithOffset(const Node node, const Stream stream, const AcMesh host_mesh, + const int3 src, const int3 dst, const int num_vertices) +{ + for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { + acNodeLoadVertexBufferWithOffset(node, stream, host_mesh, (VertexBufferHandle)i, src, dst, + num_vertices); + } + return AC_SUCCESS; +} + +AcResult +acNodeLoadVertexBuffer(const Node node, const Stream stream, const AcMesh host_mesh, + const VertexBufferHandle vtxbuf_handle) +{ + WARNING("Not implemented"); + return AC_FAILURE; +} + +AcResult +acNodeLoadMesh(const Node node, const Stream stream, const AcMesh host_mesh) +{ + WARNING("Not implemented"); + return AC_FAILURE; +} + +AcResult +acNodeStoreVertexBufferWithOffset(const Node node, const Stream stream, + const VertexBufferHandle vtxbuf_handle, const int3 src, + const int3 dst, const int num_vertices, AcMesh* host_mesh) +{ + WARNING("Not implemented"); + return AC_FAILURE; +} + +AcResult +acNodeStoreMeshWithOffset(const Node node, const Stream stream, const int3 src, const int3 dst, + const int num_vertices, AcMesh* host_mesh) +{ + WARNING("Not implemented"); + return AC_FAILURE; +} + +AcResult +acNodeStoreVertexBuffer(const Node node, const Stream stream, + const VertexBufferHandle vtxbuf_handle, AcMesh* host_mesh) +{ + WARNING("Not implemented"); + return AC_FAILURE; +} + +AcResult +acNodeStoreMesh(const Node node, const Stream stream, AcMesh* host_mesh) +{ + WARNING("Not implemented"); + return AC_FAILURE; +} 
+ +AcResult +acNodeTransferVertexBufferWithOffset(const Node src_node, const Stream stream, + const VertexBufferHandle vtxbuf_handle, const int3 src, + const int3 dst, const int num_vertices, Node dst_node) +{ + WARNING("Not implemented"); + return AC_FAILURE; +} + +AcResult +acNodeTransferMeshWithOffset(const Node src_node, const Stream stream, const int3 src, + const int3 dst, const int num_vertices, Node* dst_node) +{ + WARNING("Not implemented"); + return AC_FAILURE; +} + +AcResult +acNodeTransferVertexBuffer(const Node src_node, const Stream stream, + const VertexBufferHandle vtxbuf_handle, Node* dst_node) +{ + WARNING("Not implemented"); + return AC_FAILURE; +} + +AcResult +acNodeTransferMesh(const Node src_node, const Stream stream, Node* dst_node) +{ + WARNING("Not implemented"); + return AC_FAILURE; +} + +AcResult +acNodeIntegrateSubstep(const Node node, const Stream stream, const int step_number, + const int3 start, const int3 end, const AcReal dt) +{ + WARNING("Not implemented"); + return AC_FAILURE; +} + +AcResult +acNodePeriodicBoundcondStep(const Node node, const Stream stream, + const VertexBufferHandle vtxbuf_handle, const int3 start, + const int3 end) +{ + WARNING("Not implemented"); + return AC_FAILURE; +} + +AcResult +acNodePeriodicBoundconds(const Node node, const Stream stream, const int3 start, const int3 end) +{ + WARNING("Not implemented"); + return AC_FAILURE; +} + +AcResult +acNodeReduceScal(const Node node, const Stream stream, const ReductionType rtype, + const VertexBufferHandle vtxbuf_handle, AcReal* result) +{ + WARNING("Not implemented"); + return AC_FAILURE; +} + +AcResult +acNodeReduceVec(const Node node, const Stream stream_type, const ReductionType rtype, + const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1, + const VertexBufferHandle vtxbuf2, AcReal* result) +{ + WARNING("Not implemented"); + return AC_FAILURE; +} From 328b809efe8f30e87c50dfe7bd99b9b1908faa11 Mon Sep 17 00:00:00 2001 From: jpekkila Date: Thu, 1 
Aug 2019 14:04:11 +0300 Subject: [PATCH 07/37] Added the revised node interface --- include/astaroth_node.h | 42 +++----- src/core/node.cu | 231 ++++++++++++++++++++++++++++++---------- 2 files changed, 188 insertions(+), 85 deletions(-) diff --git a/include/astaroth_node.h b/include/astaroth_node.h index 0e18da1..6b2d626 100644 --- a/include/astaroth_node.h +++ b/include/astaroth_node.h @@ -48,7 +48,7 @@ AcResult acNodeAutoOptimize(const Node node); /** */ AcResult acNodeSynchronizeStream(const Node node, const Stream stream); -/** */ +/** Deprecated ? */ AcResult acNodeSynchronizeVertexBuffer(const Node node, const Stream stream, const VertexBufferHandle vtxbuf_handle); // Not in Device @@ -62,7 +62,8 @@ AcResult acNodeSwapBuffers(const Node node); AcResult acNodeLoadConstant(const Node node, const Stream stream, const AcRealParam param, const AcReal value); -/** */ +/** Deprecated ? Might be useful though if the user wants to load only one vtxbuf. But in this case + * the user should supply a AcReal* instead of vtxbuf_handle */ AcResult acNodeLoadVertexBufferWithOffset(const Node node, const Stream stream, const AcMesh host_mesh, const VertexBufferHandle vtxbuf_handle, const int3 src, @@ -72,14 +73,14 @@ AcResult acNodeLoadVertexBufferWithOffset(const Node node, const Stream stream, AcResult acNodeLoadMeshWithOffset(const Node node, const Stream stream, const AcMesh host_mesh, const int3 src, const int3 dst, const int num_vertices); -/** */ +/** Deprecated ? */ AcResult acNodeLoadVertexBuffer(const Node node, const Stream stream, const AcMesh host_mesh, const VertexBufferHandle vtxbuf_handle); /** */ AcResult acNodeLoadMesh(const Node node, const Stream stream, const AcMesh host_mesh); -/** */ +/** Deprecated ? 
*/ AcResult acNodeStoreVertexBufferWithOffset(const Node node, const Stream stream, const VertexBufferHandle vtxbuf_handle, const int3 src, const int3 dst, const int num_vertices, @@ -89,41 +90,26 @@ AcResult acNodeStoreVertexBufferWithOffset(const Node node, const Stream stream, AcResult acNodeStoreMeshWithOffset(const Node node, const Stream stream, const int3 src, const int3 dst, const int num_vertices, AcMesh* host_mesh); -/** */ +/** Deprecated ? */ AcResult acNodeStoreVertexBuffer(const Node node, const Stream stream, const VertexBufferHandle vtxbuf_handle, AcMesh* host_mesh); /** */ AcResult acNodeStoreMesh(const Node node, const Stream stream, AcMesh* host_mesh); -/** */ -AcResult acNodeTransferVertexBufferWithOffset(const Node src_node, const Stream stream, - const VertexBufferHandle vtxbuf_handle, - const int3 src, const int3 dst, - const int num_vertices, Node dst_node); - -/** */ -AcResult acNodeTransferMeshWithOffset(const Node src_node, const Stream stream, const int3 src, - const int3 dst, const int num_vertices, Node* dst_node); - -/** */ -AcResult acNodeTransferVertexBuffer(const Node src_node, const Stream stream, - const VertexBufferHandle vtxbuf_handle, Node* dst_node); - -/** */ -AcResult acNodeTransferMesh(const Node src_node, const Stream stream, Node* dst_node); - /** */ AcResult acNodeIntegrateSubstep(const Node node, const Stream stream, const int step_number, const int3 start, const int3 end, const AcReal dt); -/** */ -AcResult acNodePeriodicBoundcondStep(const Node node, const Stream stream, - const VertexBufferHandle vtxbuf_handle, const int3 start, - const int3 end); /** */ -AcResult acNodePeriodicBoundconds(const Node node, const Stream stream, const int3 start, - const int3 end); +AcResult acNodeIntegrate(const Node node, const AcReal dt); + +/** */ +AcResult acNodePeriodicBoundcondStep(const Node node, const Stream stream, + const VertexBufferHandle vtxbuf_handle); + +/** */ +AcResult acNodePeriodicBoundconds(const Node node, const 
Stream stream); /** */ AcResult acNodeReduceScal(const Node node, const Stream stream, const ReductionType rtype, diff --git a/src/core/node.cu b/src/core/node.cu index 296f235..2fbc008 100644 --- a/src/core/node.cu +++ b/src/core/node.cu @@ -370,15 +370,22 @@ AcResult acNodeLoadVertexBuffer(const Node node, const Stream stream, const AcMesh host_mesh, const VertexBufferHandle vtxbuf_handle) { - WARNING("Not implemented"); - return AC_FAILURE; + const int3 src = (int3){0, 0, 0}; + const int3 dst = src; + const size_t num_vertices = acVertexBufferSize(host_mesh.info); + + acNodeLoadVertexBufferWithOffset(node, stream, host_mesh, vtxbuf_handle, src, dst, + num_vertices); + return AC_SUCCESS; } AcResult acNodeLoadMesh(const Node node, const Stream stream, const AcMesh host_mesh) { - WARNING("Not implemented"); - return AC_FAILURE; + for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { + acNodeLoadVertexBuffer(node, stream, host_mesh, (VertexBufferHandle)i); + } + return AC_SUCCESS; } AcResult @@ -386,95 +393,199 @@ acNodeStoreVertexBufferWithOffset(const Node node, const Stream stream, const VertexBufferHandle vtxbuf_handle, const int3 src, const int3 dst, const int num_vertices, AcMesh* host_mesh) { - WARNING("Not implemented"); - return AC_FAILURE; + for (int i = 0; i < num_devices; ++i) { + const int3 d0 = (int3){0, 0, i * subgrid.n.z}; // DECOMPOSITION OFFSET HERE + const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.m.z}; + + const int3 s0 = src; + const int3 s1 = gridIdx3d(grid, gridIdx(grid, s0) + num_vertices); + + const int3 da = max(s0, d0); + const int3 db = min(s1, d1); + if (db.z >= da.z) { + const int copy_cells = gridIdx(subgrid, db) - gridIdx(subgrid, da); + // DECOMPOSITION OFFSET HERE + const int3 da_local = (int3){da.x, da.y, da.z - i * grid.n.z / num_devices}; + acDeviceStoreVertexBufferWithOffset(devices[i], stream, vtxbuf_handle, da_local, da, + copy_cells, host_mesh); + } + } + return AC_SUCCESS; } AcResult acNodeStoreMeshWithOffset(const 
Node node, const Stream stream, const int3 src, const int3 dst, const int num_vertices, AcMesh* host_mesh) { - WARNING("Not implemented"); - return AC_FAILURE; + for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { + acNodeStoreVertexBufferWithOffset(node, stream, (VertexBufferHandle)i, src, dst, + num_vertices, host_mesh); + } + return AC_SUCCESS; } AcResult acNodeStoreVertexBuffer(const Node node, const Stream stream, const VertexBufferHandle vtxbuf_handle, AcMesh* host_mesh) { - WARNING("Not implemented"); - return AC_FAILURE; + const int3 src = (int3){0, 0, 0}; + const int3 dst = src; + const size_t num_vertices = acVertexBufferSize(host_mesh.info); + + acNodeStoreVertexBufferWithOffset(node, stream, vtxbuf_handle, src, dst, num_vertices, + host_mesh); + + return AC_SUCCESS; } AcResult acNodeStoreMesh(const Node node, const Stream stream, AcMesh* host_mesh) { - WARNING("Not implemented"); - return AC_FAILURE; -} - -AcResult -acNodeTransferVertexBufferWithOffset(const Node src_node, const Stream stream, - const VertexBufferHandle vtxbuf_handle, const int3 src, - const int3 dst, const int num_vertices, Node dst_node) -{ - WARNING("Not implemented"); - return AC_FAILURE; -} - -AcResult -acNodeTransferMeshWithOffset(const Node src_node, const Stream stream, const int3 src, - const int3 dst, const int num_vertices, Node* dst_node) -{ - WARNING("Not implemented"); - return AC_FAILURE; -} - -AcResult -acNodeTransferVertexBuffer(const Node src_node, const Stream stream, - const VertexBufferHandle vtxbuf_handle, Node* dst_node) -{ - WARNING("Not implemented"); - return AC_FAILURE; -} - -AcResult -acNodeTransferMesh(const Node src_node, const Stream stream, Node* dst_node) -{ - WARNING("Not implemented"); - return AC_FAILURE; + for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { + acNodeStoreVertexBuffer(node, stream, (VertexBufferHandle)i, host_mesh); + } + return AC_SUCCESS; } AcResult acNodeIntegrateSubstep(const Node node, const Stream stream, const int step_number, const int3 
start, const int3 end, const AcReal dt) { - WARNING("Not implemented"); - return AC_FAILURE; + for (int i = 0; i < num_devices; ++i) { + // DECOMPOSITION OFFSET HERE + const int3 d0 = (int3){NGHOST, NGHOST, NGHOST + i * subgrid.n.z}; + const int3 d1 = d0 + (int3){subgrid.n.x, subgrid.n.y, subgrid.n.z}; + + const int3 da = max(start, d0); + const int3 db = min(end, d1); + + if (db.z >= da.z) { + const int3 da_local = da - (int3){0, 0, i * subgrid.n.z}; + const int3 db_local = db - (int3){0, 0, i * subgrid.n.z}; + acDeviceIntegrateSubstep(devices[i], stream, isubstep, da_local, db_local, dt); + } + } + return AC_SUCCESS; +} + +AcResult +acNodeIntegrate(const Node node, const AcReal dt) +{ + acNodeSynchronizeStream(STREAM_ALL); + + WARNING("Not implemented\n"); + + acNodeSynchronizeStream(STREAM_ALL); + return AC_SUCCESS; +} + +static AcResult +local_boundcondstep(const Node node, const StreamType stream, const VertexBufferHandle vtxbuf) +{ + if (num_devices == 1) { + acDeviceBoundcondStep(devices[0], stream, vtxbuf, (int3){0, 0, 0}, subgrid.m); + } + else { + // Local boundary conditions + for (int i = 0; i < num_devices; ++i) { + const int3 d0 = (int3){0, 0, NGHOST}; // DECOMPOSITION OFFSET HERE + const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.n.z}; + acDeviceBoundcondStep(devices[i], stream, vtxbuf, d0, d1); + } + } + return AC_SUCCESS; +} + +static AcResult +global_boundcondstep(const Node node, const StreamType stream, const VertexBufferHandle vtxbuf) +{ + if (num_devices > 1) { + const size_t num_vertices = subgrid.m.x * subgrid.m.y * NGHOST; + { + // ...|ooooxxx|... -> xxx|ooooooo|... + const int3 src = (int3){0, 0, subgrid.n.z}; + const int3 dst = (int3){0, 0, 0}; + + const Device src_device = devices[num_devices - 1]; + Device dst_device = devices[0]; + + acDeviceTransferVertexBufferWithOffset(src_device, stream, vtxbuf_handle, src, dst, + num_vertices, dst_device); + } + { + // ...|ooooooo|xxx <- ...|xxxoooo|... 
+ const int3 src = (int3){0, 0, NGHOST}; + const int3 dst = (int3){0, 0, NGHOST + subgrid.n.z}; + + const Device src_device = devices[0]; + Device dst_device = devices[num_devices - 1]; + + acDeviceTransferVertexBufferWithOffset(src_device, stream, vtxbuf_handle, src, dst, + num_vertices, dst_device); + } + } + return AC_SUCCESS; } AcResult acNodePeriodicBoundcondStep(const Node node, const Stream stream, - const VertexBufferHandle vtxbuf_handle, const int3 start, - const int3 end) + const VertexBufferHandle vtxbuf_handle) { - WARNING("Not implemented"); - return AC_FAILURE; + local_boundcondstep(node, stream, vtxbuf_handle); + global_boundcondstep(node, stream, vtxbuf_handle); + acNodeSynchronizeVertexBuffer(node, stream, vtxbuf_handle); + + return AC_SUCCESS; } AcResult -acNodePeriodicBoundconds(const Node node, const Stream stream, const int3 start, const int3 end) +acNodePeriodicBoundconds(const Node node, const Stream stream) { - WARNING("Not implemented"); - return AC_FAILURE; + for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { + acNodePeriodicBoundcondStep(node, stream, (VertexBufferHandle)i); + } + return AC_SUCCESS; +} + +static AcReal +simple_final_reduce_scal(const ReductionType& rtype, const AcReal* results, const int& n) +{ + AcReal res = results[0]; + for (int i = 1; i < n; ++i) { + if (rtype == RTYPE_MAX) { + res = max(res, results[i]); + } + else if (rtype == RTYPE_MIN) { + res = min(res, results[i]); + } + else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) { + res = sum(res, results[i]); + } + else { + ERROR("Invalid rtype"); + } + } + + if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) { + const AcReal inv_n = AcReal(1.) 
/ (grid.n.x * grid.n.y * grid.n.z); + res = sqrt(inv_n * res); + } + + return res; } AcResult acNodeReduceScal(const Node node, const Stream stream, const ReductionType rtype, const VertexBufferHandle vtxbuf_handle, AcReal* result) { - WARNING("Not implemented"); - return AC_FAILURE; + acSynchronizeStream(STREAM_ALL); + + AcReal results[num_devices]; + for (int i = 0; i < num_devices; ++i) { + acDeviceReduceScal(devices[i], STREAM_DEFAULT, rtype, vtxbuffer_handle, &results[i]); + } + + return simple_final_reduce_scal(rtype, results, num_devices); } AcResult @@ -482,6 +593,12 @@ acNodeReduceVec(const Node node, const Stream stream_type, const ReductionType r const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1, const VertexBufferHandle vtxbuf2, AcReal* result) { - WARNING("Not implemented"); - return AC_FAILURE; + acSynchronizeStream(STREAM_ALL); + + AcReal results[num_devices]; + for (int i = 0; i < num_devices; ++i) { + acDeviceReduceScal(devices[i], STREAM_DEFAULT, rtype, a, b, c, &results[i]); + } + + return simple_final_reduce_scal(rtype, results, num_devices); } From 2b6bf10ae699569067614aaf4cb5cc7e11df77d2 Mon Sep 17 00:00:00 2001 From: jpekkila Date: Thu, 1 Aug 2019 18:37:36 +0300 Subject: [PATCH 08/37] Dummy implementation of the Grid interface --- include/astaroth.h | 11 +++ include/astaroth_defines.h | 2 +- include/astaroth_grid.h | 37 ++++---- src/core/astaroth.cu | 12 +++ src/core/grid.cu | 169 +++++++++++++++++++++++++++++++++++++ src/core/node.cu | 25 ++---- 6 files changed, 216 insertions(+), 40 deletions(-) diff --git a/include/astaroth.h b/include/astaroth.h index 1aef514..9befcee 100644 --- a/include/astaroth.h +++ b/include/astaroth.h @@ -28,6 +28,17 @@ extern "C" { #include "astaroth_grid.h" #include "astaroth_node.h" +#define acInit(x) acGridInit(x) +#define acQuit() acGridQuit() +#define acLoad(x) acGridLoadMesh(STREAM_DEFAULT, x) +#define acReduceScal(x, y) acGridReduceScal(STREAM_DEFAULT, x, y) +#define acReduceVec(x, y, z, w) 
acGridReduceVec(STREAM_DEFAULT, x, y, z, w) +#define acBoundcondStep() acGridPeriodicBoundcondStep(STREAM_DEFAULT) +#define acIntegrate(x) acGridIntegrateStep(STREAM_DEFAULT, x) +#define acStore(x) acGridStoreMesh(STREAM_DEFAULT, x) +#define acSynchronizeStream(x) acGridSynchronizeStream(x) +#define acLoadDeviceConstant(x, y) acGridLoadConstant(STREAM_DEFAULT, x, y) + #ifdef __cplusplus } // extern "C" #endif diff --git a/include/astaroth_defines.h b/include/astaroth_defines.h index 1441967..0d03b70 100644 --- a/include/astaroth_defines.h +++ b/include/astaroth_defines.h @@ -105,7 +105,7 @@ typedef enum { } ReductionType; typedef enum { STREAM_DEFAULT, NUM_STREAM_TYPES } Stream; -#define STREAM_ALL (-1) +#define STREAM_ALL (NUM_STREAM_TYPES) #define AC_GEN_ID(X) X typedef enum { diff --git a/include/astaroth_grid.h b/include/astaroth_grid.h index 7dfe323..3fb5f8f 100644 --- a/include/astaroth_grid.h +++ b/include/astaroth_grid.h @@ -31,7 +31,7 @@ AcResult acGridInit(const AcMeshInfo node_config); AcResult acGridQuit(void); /** */ -AcResult acGridSynchronize(void); +AcResult acGridSynchronizeStream(const Stream stream); /** */ AcResult acGridSwapBuffers(void); @@ -72,34 +72,27 @@ AcResult acGridStoreVertexBuffer(const Stream stream, const VertexBufferHandle v /** */ AcResult acGridStoreMesh(const Stream stream, AcMesh* host_mesh); -/** */ -AcResult acGridTransferVertexBufferWithOffset(const Stream stream, - const VertexBufferHandle vtxbuf_handle, - const int3 src, const int3 dst, - const int num_vertices); - -/** */ -AcResult acGridTransferMeshWithOffset(const Stream stream, const int3 src, const int3 dst, - const int num_vertices); - -/** */ -AcResult acGridTransferVertexBuffer(const Stream stream, const VertexBufferHandle vtxbuf_handle); - -/** */ -AcResult acGridTransferMesh(const Stream stream); - /** */ AcResult acGridIntegrateSubstep(const Stream stream, const int step_number, const int3 start, const int3 end, const AcReal dt); + /** */ -AcResult 
acGridPeriodicBoundcondStep(const Stream stream, const int3 start, const int3 end); +AcResult acGridIntegrateStep(const Stream stream, const AcReal dt); + /** */ -AcResult acGridReduceScal(const Stream stream, const ReductionType rtype, - const VertexBufferHandle vtxbuf_handle, AcReal* result); +AcResult acGridPeriodicBoundcondStep(const Stream stream); /** */ -AcResult acGridReduceVec(const Stream stream, const ReductionType rtype, +/* acGridReduceScal(const Stream stream, const ReductionType rtype, + const VertexBufferHandle vtxbuf_handle, AcReal* result); */ +AcReal acGridReduceScal(const Stream stream, const ReductionType rtype, + const VertexBufferHandle vtxbuf_handle); +/** */ +/*AcResult acGridReduceVec(const Stream stream, const ReductionType rtype, const VertexBufferHandle vec0, const VertexBufferHandle vec1, - const VertexBufferHandle vec2, AcReal* result); + const VertexBufferHandle vec2, AcReal* result);*/ +AcReal acGridReduceVec(const Stream stream, const ReductionType rtype, + const VertexBufferHandle vec0, const VertexBufferHandle vec1, + const VertexBufferHandle vec2); #ifdef __cplusplus } // extern "C" diff --git a/src/core/astaroth.cu b/src/core/astaroth.cu index 1ef56d4..5de6d24 100644 --- a/src/core/astaroth.cu +++ b/src/core/astaroth.cu @@ -17,3 +17,15 @@ along with Astaroth. If not, see . 
*/ #include "astaroth_defines.h" + +#define AC_GEN_STR(X) #X +const char* intparam_names[] = {AC_FOR_BUILTIN_INT_PARAM_TYPES(AC_GEN_STR) // + AC_FOR_USER_INT_PARAM_TYPES(AC_GEN_STR)}; +const char* int3param_names[] = {AC_FOR_BUILTIN_INT3_PARAM_TYPES(AC_GEN_STR) // + AC_FOR_USER_INT3_PARAM_TYPES(AC_GEN_STR)}; +const char* realparam_names[] = {AC_FOR_BUILTIN_REAL_PARAM_TYPES(AC_GEN_STR) // + AC_FOR_USER_REAL_PARAM_TYPES(AC_GEN_STR)}; +const char* real3param_names[] = {AC_FOR_BUILTIN_REAL3_PARAM_TYPES(AC_GEN_STR) // + AC_FOR_USER_REAL3_PARAM_TYPES(AC_GEN_STR)}; +const char* vtxbuf_names[] = {AC_FOR_VTXBUF_HANDLES(AC_GEN_STR)}; +#undef AC_GEN_STR diff --git a/src/core/grid.cu b/src/core/grid.cu index 83c7453..7a05d22 100644 --- a/src/core/grid.cu +++ b/src/core/grid.cu @@ -17,3 +17,172 @@ along with Astaroth. If not, see . */ #include "astaroth_grid.h" + +#include "astaroth_node.h" + +const size_t MAX_NUM_NODES = 32; +size_t num_nodes = 0; +static Node nodes[MAX_NUM_NODES]; + +/** */ +AcResult +acGridInit(const AcMeshInfo node_config) +{ + acNodeCreate(0, node_config, &nodes[0]); + ++num_nodes; + return AC_FAILURE; +} + +/** */ +AcResult +acGridQuit(void) +{ + acNodeDestroy(nodes[0]); + --num_nodes; + return AC_FAILURE; +} + +/** */ +AcResult +acGridSynchronizeStream(const Stream stream) +{ + WARNING("Not implemented"); + return AC_FAILURE; +} + +/** */ +AcResult +acGridSwapBuffers(void) +{ + WARNING("Not implemented"); + return AC_FAILURE; +} + +/** */ +AcResult +acGridLoadConstant(const Stream stream, const AcRealParam param, const AcReal value) +{ + WARNING("Not implemented"); + return AC_FAILURE; +} + +/** */ +AcResult +acGridLoadVertexBufferWithOffset(const Stream stream, const AcMesh host_mesh, + const VertexBufferHandle vtxbuf_handle, const int3 src, + const int3 dst, const int num_vertices) +{ + WARNING("Not implemented"); + return AC_FAILURE; +} + +/** */ +AcResult +acGridLoadMeshWithOffset(const Stream stream, const AcMesh host_mesh, const int3 src, + const 
int3 dst, const int num_vertices) +{ + WARNING("Not implemented"); + return AC_FAILURE; +} + +/** */ +AcResult +acGridLoadVertexBuffer(const Stream stream, const AcMesh host_mesh, + const VertexBufferHandle vtxbuf_handle) +{ + WARNING("Not implemented"); + return AC_FAILURE; +} + +/** */ +AcResult +acGridLoadMesh(const Stream stream, const AcMesh host_mesh) +{ + WARNING("Not implemented"); + return AC_FAILURE; +} + +/** */ +AcResult +acGridStoreVertexBufferWithOffset(const Stream stream, const VertexBufferHandle vtxbuf_handle, + const int3 src, const int3 dst, const int num_vertices, + AcMesh* host_mesh) +{ + WARNING("Not implemented"); + return AC_FAILURE; +} + +/** */ +AcResult +acGridStoreMeshWithOffset(const Stream stream, const int3 src, const int3 dst, + const int num_vertices, AcMesh* host_mesh) +{ + WARNING("Not implemented"); + return AC_FAILURE; +} + +/** */ +AcResult +acGridStoreVertexBuffer(const Stream stream, const VertexBufferHandle vtxbuf_handle, + AcMesh* host_mesh) +{ + WARNING("Not implemented"); + return AC_FAILURE; +} + +/** */ +AcResult +acGridStoreMesh(const Stream stream, AcMesh* host_mesh) +{ + WARNING("Not implemented"); + return AC_FAILURE; +} + +/** */ +AcResult +acGridIntegrateSubstep(const Stream stream, const int step_number, const int3 start, const int3 end, + const AcReal dt) +{ + WARNING("Not implemented"); + return AC_FAILURE; +} + +/** */ +AcResult +acGridIntegrateStep(const Stream stream, const AcReal dt) +{ + WARNING("Not implemented"); + return AC_FAILURE; +} + +/** */ +AcResult +acGridPeriodicBoundcondStep(const Stream stream) +{ + WARNING("Not implemented"); + return AC_FAILURE; +} + +#if 0 +/** */ +AcResult acGridReduceScal(const Stream stream, const ReductionType rtype, + const VertexBufferHandle vtxbuf_handle, AcReal* result); +/** */ +AcResult acGridReduceVec(const Stream stream, const ReductionType rtype, + const VertexBufferHandle vec0, const VertexBufferHandle vec1, + const VertexBufferHandle vec2, AcReal* result); 
+#endif +/** */ +AcResult +acGridReduceScal(const Stream stream, const ReductionType rtype, + const VertexBufferHandle vtxbuf_handle, AcReal* result) +{ + return 0; +} +/** */ +AcResult +acGridReduceVec(const Stream stream, const ReductionType rtype, const VertexBufferHandle vec0, + const VertexBufferHandle vec1, const VertexBufferHandle vec2, AcReal* result) +{ + return 0; +} diff --git a/src/core/node.cu b/src/core/node.cu index 2fbc008..8d4b48a 100644 --- a/src/core/node.cu +++ b/src/core/node.cu @@ -22,18 +22,6 @@ #include "errchk.h" #include "math_utils.h" // sum for reductions -#define AC_GEN_STR(X) #X -const char* intparam_names[] = {AC_FOR_BUILTIN_INT_PARAM_TYPES(AC_GEN_STR) // - AC_FOR_USER_INT_PARAM_TYPES(AC_GEN_STR)}; -const char* int3param_names[] = {AC_FOR_BUILTIN_INT3_PARAM_TYPES(AC_GEN_STR) // - AC_FOR_USER_INT3_PARAM_TYPES(AC_GEN_STR)}; -const char* realparam_names[] = {AC_FOR_BUILTIN_REAL_PARAM_TYPES(AC_GEN_STR) // - AC_FOR_USER_REAL_PARAM_TYPES(AC_GEN_STR)}; -const char* real3param_names[] = {AC_FOR_BUILTIN_REAL3_PARAM_TYPES(AC_GEN_STR) // - AC_FOR_USER_REAL3_PARAM_TYPES(AC_GEN_STR)}; -const char* vtxbuf_names[] = {AC_FOR_VTXBUF_HANDLES(AC_GEN_STR)}; -#undef AC_GEN_STR - static const int MAX_NUM_DEVICES = 32; static Node node = NULL; @@ -482,10 +470,7 @@ acNodeIntegrate(const Node node, const AcReal dt) static AcResult local_boundcondstep(const Node node, const StreamType stream, const VertexBufferHandle vtxbuf) { - if (num_devices == 1) { - acDeviceBoundcondStep(devices[0], stream, vtxbuf, (int3){0, 0, 0}, subgrid.m); - } - else { + if (num_devices > 1) { // Local boundary conditions for (int i = 0; i < num_devices; ++i) { const int3 d0 = (int3){0, 0, NGHOST}; // DECOMPOSITION OFFSET HERE @@ -493,6 +478,9 @@ local_boundcondstep(const Node node, const StreamType stream, const VertexBuffer acDeviceBoundcondStep(devices[i], stream, vtxbuf, d0, d1); } } + else { + acDeviceBoundcondStep(devices[0], stream, vtxbuf, (int3){0, 0, 0}, subgrid.m); + } 
return AC_SUCCESS; } @@ -532,9 +520,12 @@ acNodePeriodicBoundcondStep(const Node node, const Stream stream, const VertexBufferHandle vtxbuf_handle) { local_boundcondstep(node, stream, vtxbuf_handle); - global_boundcondstep(node, stream, vtxbuf_handle); acNodeSynchronizeVertexBuffer(node, stream, vtxbuf_handle); + // TODO NOTE GLOBAL BOUNDCONDS NOT DONE HERE IF MORE THAN 1 NODE + global_boundcondstep(node, stream, vtxbuf_handle); + WARNING("Global boundconds should not be done here with multinode"); + return AC_SUCCESS; } From 567ad614651f4eacc617cb052b03c5a800a5f17e Mon Sep 17 00:00:00 2001 From: jpekkila Date: Fri, 2 Aug 2019 13:54:54 +0300 Subject: [PATCH 09/37] Multinode MPI implementation should be done later in its own branch. The focus of this branch is to revise the node and device layers. Commented out references to the Grid layer. --- include/astaroth.h | 25 +++++++++++++++- src/core/astaroth.cu | 71 +++++++++++++++++++++++++++++++++++++++++++- src/core/grid.cu | 32 ++++++++++++++++---- 3 files changed, 120 insertions(+), 8 deletions(-) diff --git a/include/astaroth.h b/include/astaroth.h index 9befcee..fbfdb05 100644 --- a/include/astaroth.h +++ b/include/astaroth.h @@ -25,9 +25,10 @@ extern "C" { #include "astaroth_defines.h" #include "astaroth_device.h" -#include "astaroth_grid.h" #include "astaroth_node.h" +/* +#include "astaroth_grid.h" #define acInit(x) acGridInit(x) #define acQuit() acGridQuit() #define acLoad(x) acGridLoadMesh(STREAM_DEFAULT, x) @@ -38,6 +39,28 @@ extern "C" { #define acStore(x) acGridStoreMesh(STREAM_DEFAULT, x) #define acSynchronizeStream(x) acGridSynchronizeStream(x) #define acLoadDeviceConstant(x, y) acGridLoadConstant(STREAM_DEFAULT, x, y) +*/ + +AcResult acInit(const AcMeshInfo mesh_info); + +AcResult acQuit(void); + +AcResult acSynchronizeStream(const Stream stream); + +AcResult acLoadDeviceConstant(const AcRealParam param, const AcReal value); + +AcResult acLoad(const AcMesh host_mesh); + +AcResult acStore(AcMesh* 
host_mesh); + +AcResult acIntegrate(const AcReal dt); + +AcResult acBoundcondStep(void); + +AcReal acReduceScal(const ReductionType rtype, const VertexBufferHandle vtxbuf_handle); + +AcReal acReduceVec(const ReductionType rtype, const VertexBufferHandle a, + const VertexBufferHandle b, const VertexBufferHandle c); #ifdef __cplusplus } // extern "C" diff --git a/src/core/astaroth.cu b/src/core/astaroth.cu index 5de6d24..8ea0d68 100644 --- a/src/core/astaroth.cu +++ b/src/core/astaroth.cu @@ -16,7 +16,8 @@ You should have received a copy of the GNU General Public License along with Astaroth. If not, see . */ -#include "astaroth_defines.h" +// #include "astaroth_defines.h" +#include "astaroth.h" #define AC_GEN_STR(X) #X const char* intparam_names[] = {AC_FOR_BUILTIN_INT_PARAM_TYPES(AC_GEN_STR) // @@ -29,3 +30,71 @@ const char* real3param_names[] = {AC_FOR_BUILTIN_REAL3_PARAM_TYPES(AC_GEN_STR) / AC_FOR_USER_REAL3_PARAM_TYPES(AC_GEN_STR)}; const char* vtxbuf_names[] = {AC_FOR_VTXBUF_HANDLES(AC_GEN_STR)}; #undef AC_GEN_STR + +static const int num_nodes = 1; +static Node nodes[num_nodes]; + +AcResult +acInit(const AcMeshInfo mesh_info) +{ + return acNodeCreate(0, mesh_info, &nodes[0]); +} + +AcResult +acQuit(void) +{ + return acNodeDestroy(nodes[0]); +} + +AcResult +acSynchronizeStream(const Stream stream) +{ + return acNodeSynchronizeStream(nodes[0], stream); +} + +AcResult +acLoadDeviceConstant(const AcRealParam param, const AcReal value) +{ + return acNodeLoadConstant(nodes[0], STREAM_DEFAULT, param, value); +} + +AcResult +acLoad(const AcMesh host_mesh) +{ + return acNodeLoadMesh(nodes[0], STREAM_DEFAULT, host_mesh); +} + +AcResult +acStore(AcMesh* host_mesh) +{ + return acNodeStoreMesh(nodes[0], STREAM_DEFAULT, host_mesh); +} + +AcResult +acIntegrate(const AcReal dt) +{ + return acNodeIntegrate(nodes[0], dt); +} + +AcResult +acBoundcondStep(void) +{ + return acNodePeriodicBoundconds(nodes[0], STREAM_DEFAULT); +} + +AcReal +acReduceScal(const ReductionType rtype, 
const VertexBufferHandle vtxbuf_handle) +{ + AcReal result; + acNodeReduceScal(nodes[0], STREAM_DEFAULT, rtype, vtxbuf_handle, &result); + return result; +} + +AcReal +acReduceVec(const ReductionType rtype, const VertexBufferHandle a, const VertexBufferHandle b, + const VertexBufferHandle c) +{ + AcReal result; + acNodeReduceVec(nodes[0], STREAM_DEFAULT, rtype, a, b, c, &result); + return result; +} diff --git a/src/core/grid.cu b/src/core/grid.cu index 7a05d22..359fe00 100644 --- a/src/core/grid.cu +++ b/src/core/grid.cu @@ -30,6 +30,7 @@ acGridInit(const AcMeshInfo node_config) { acNodeCreate(0, node_config, &nodes[0]); ++num_nodes; + WARNING("Proper multinode not yet implemented"); return AC_FAILURE; } @@ -39,6 +40,7 @@ acGridQuit(void) { acNodeDestroy(nodes[0]); --num_nodes; + WARNING("Proper multinode not yet implemented"); return AC_FAILURE; } @@ -46,7 +48,10 @@ acGridQuit(void) AcResult acGridSynchronizeStream(const Stream stream) { - WARNING("Not implemented"); + for (int i = 0; i < num_nodes; ++i) { + acNodeSynchronizeStream(nodes[i], stream); + } + WARNING("Proper multinode not yet implemented"); return AC_FAILURE; } @@ -54,7 +59,10 @@ acGridSynchronizeStream(const Stream stream) AcResult acGridSwapBuffers(void) { - WARNING("Not implemented"); + for (int i = 0; i < num_nodes; ++i) { + acNodeSwapBuffers(nodes[i]); + } + WARNING("Proper multinode not yet implemented"); return AC_FAILURE; } @@ -62,7 +70,10 @@ acGridSwapBuffers(void) AcResult acGridLoadConstant(const Stream stream, const AcRealParam param, const AcReal value) { - WARNING("Not implemented"); + for (int i = 0; i < num_nodes; ++i) { + acNodeLoadConstant(node, stream, param, value); + } + WARNING("Proper multinode not yet implemented"); return AC_FAILURE; } @@ -72,7 +83,10 @@ acGridLoadVertexBufferWithOffset(const Stream stream, const AcMesh host_mesh, const VertexBufferHandle vtxbuf_handle, const int3 src, const int3 dst, const int num_vertices) { - WARNING("Not implemented"); + for (int i = 0; 
i < num_nodes; ++i) { + acNodeLoadVertexBufferWithOffset(node, stream, host_mesh, vtxbuf_handle) + } + WARNING("Proper multinode not yet implemented"); return AC_FAILURE; } @@ -81,7 +95,10 @@ AcResult acGridLoadMeshWithOffset(const Stream stream, const AcMesh host_mesh, const int3 src, const int3 dst, const int num_vertices) { - WARNING("Not implemented"); + for (int i = 0; i < num_nodes; ++i) { + acNodeLoadMeshWithOffset(nodes[i], stream, host_mesh, src, dst, num_vertices); + } + WARNING("Proper multinode not yet implemented"); return AC_FAILURE; } @@ -90,7 +107,10 @@ AcResult acGridLoadVertexBuffer(const Stream stream, const AcMesh host_mesh, const VertexBufferHandle vtxbuf_handle) { - WARNING("Not implemented"); + for (int i = 0; i < num_nodes; ++i) { + acNodeLoadVertexBuffer(node, stream, host_mesh, vtxbuf_handle); + } + WARNING("Proper multinode not yet implemented"); return AC_FAILURE; } From 5f2378e91b5595c858dcbb9cea7e7df89aa5b8c6 Mon Sep 17 00:00:00 2001 From: jpekkila Date: Fri, 2 Aug 2019 15:15:18 +0300 Subject: [PATCH 10/37] Now compiles (does not work though) --- include/astaroth_device.h | 4 +- src/core/CMakeLists.txt | 2 +- src/core/device.cu | 4 +- src/core/node.cu | 273 +++++++++++++++++++++----------------- 4 files changed, 158 insertions(+), 125 deletions(-) diff --git a/include/astaroth_device.h b/include/astaroth_device.h index 75ec693..442b1b1 100644 --- a/include/astaroth_device.h +++ b/include/astaroth_device.h @@ -97,10 +97,10 @@ AcResult acDeviceTransferMeshWithOffset(const Device src_device, const Stream st /** */ AcResult acDeviceTransferVertexBuffer(const Device src_device, const Stream stream, - const VertexBufferHandle vtxbuf_handle, Device* dst_device); + const VertexBufferHandle vtxbuf_handle, Device dst_device); /** Deprecated */ -AcResult acDeviceTransferMesh(const Device src_device, const Stream stream, Device* dst_device); +AcResult acDeviceTransferMesh(const Device src_device, const Stream stream, Device dst_device); /** */ 
AcResult acDeviceIntegrateSubstep(const Device device, const Stream stream, const int step_number, diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index 6054444..0aad51d 100644 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -34,5 +34,5 @@ endif () ## Create and link the library include_directories(.) -cuda_add_library(astaroth_core STATIC astaroth.cu device.cu) +cuda_add_library(astaroth_core STATIC astaroth.cu device.cu node.cu) target_link_libraries(astaroth_core m) diff --git a/src/core/device.cu b/src/core/device.cu index d643ad1..f5bd295 100644 --- a/src/core/device.cu +++ b/src/core/device.cu @@ -229,7 +229,7 @@ acDevicePrintInfo(const Device device) } AcResult -autoOptimize(const Device device) +acDeviceAutoOptimize(const Device device) { cudaSetDevice(device->id); @@ -502,7 +502,7 @@ acDeviceTransferVertexBuffer(const Device src_device, const Stream stream, } AcResult -acDeviceTransferMesh(const Device src_device, const Stream stream, Device* dst_device) +acDeviceTransferMesh(const Device src_device, const Stream stream, Device dst_device) { WARNING("This function is deprecated"); for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { diff --git a/src/core/node.cu b/src/core/node.cu index 8d4b48a..23c79b3 100644 --- a/src/core/node.cu +++ b/src/core/node.cu @@ -23,7 +23,11 @@ #include "math_utils.h" // sum for reductions static const int MAX_NUM_DEVICES = 32; -static Node node = NULL; + +typedef struct { + int3 m; + int3 n; +} Grid; struct node_s { int id; @@ -33,6 +37,8 @@ struct node_s { Grid grid; Grid subgrid; + + AcMeshInfo config; }; static int @@ -55,12 +61,12 @@ printInt3(const int3 vec) } static inline void -print(const AcMeshInfo config) +print(const Node node) { for (int i = 0; i < NUM_INT_PARAMS; ++i) - printf("[%s]: %d\n", intparam_names[i], config.int_params[i]); + printf("[%s]: %d\n", intparam_names[i], node->config.int_params[i]); for (int i = 0; i < NUM_REAL_PARAMS; ++i) - printf("[%s]: %g\n", realparam_names[i], 
double(config.real_params[i])); + printf("[%s]: %g\n", realparam_names[i], double(node->config.real_params[i])); } static void @@ -86,13 +92,6 @@ update_builtin_params(AcMeshInfo* config) config->int_params[AC_mxy] = config->int_params[AC_mx] * config->int_params[AC_my]; config->int_params[AC_nxy] = config->int_params[AC_nx] * config->int_params[AC_ny]; config->int_params[AC_nxyz] = config->int_params[AC_nxy] * config->int_params[AC_nz]; - -#if VERBOSE_PRINTING // Defined in astaroth.h - printf("###############################################################\n"); - printf("Config dimensions recalculated:\n"); - print(*config); - printf("###############################################################\n"); -#endif } static Grid @@ -109,61 +108,69 @@ createGrid(const AcMeshInfo config) AcResult acNodeCreate(const int id, const AcMeshInfo node_config, Node* node_handle) { - struct node_s* node = (struct device_s*)malloc(sizeof(*node)); + struct node_s* node = (struct node_s*)malloc(sizeof(*node)); + node->id = id; + node->config = node_config; - // Get num_devices - ERRCHK_CUDA_ALWAYS(cudaGetDeviceCount(&num_devices)); - if (num_devices < 1) { + // Get node->num_devices + ERRCHK_CUDA_ALWAYS(cudaGetDeviceCount(&node->num_devices)); + if (node->num_devices < 1) { ERROR("No CUDA devices found!"); return AC_FAILURE; } - if (num_devices > MAX_NUM_DEVICES) { + if (node->num_devices > MAX_NUM_DEVICES) { WARNING("More devices found than MAX_NUM_DEVICES. Using only MAX_NUM_DEVICES"); - num_devices = MAX_NUM_DEVICES; + node->num_devices = MAX_NUM_DEVICES; } if (!AC_MULTIGPU_ENABLED) { WARNING("MULTIGPU_ENABLED was false. Using only one device"); - num_devices = 1; // Use only one device if multi-GPU is not enabled + node->num_devices = 1; // Use only one device if multi-GPU is not enabled } - // Check that num_devices is divisible with AC_nz. This makes decomposing the + // Check that node->num_devices is divisible with AC_nz. 
This makes decomposing the // problem domain to multiple GPUs much easier since we do not have to worry // about remainders - ERRCHK_ALWAYS(config.int_params[AC_nz] % num_devices == 0); + ERRCHK_ALWAYS(node->config.int_params[AC_nz] % node->num_devices == 0); // Decompose the problem domain // The main grid - grid = createGrid(config); + node->grid = createGrid(node->config); // Subgrids - AcMeshInfo subgrid_config = config; - subgrid_config.int_params[AC_nz] /= num_devices; + AcMeshInfo subgrid_config = node->config; + subgrid_config.int_params[AC_nz] /= node->num_devices; update_builtin_params(&subgrid_config); - subgrid = createGrid(subgrid_config); +#if VERBOSE_PRINTING // Defined in astaroth.h + printf("###############################################################\n"); + printf("Config dimensions recalculated:\n"); + print(node); + printf("###############################################################\n"); +#endif + node->subgrid = createGrid(subgrid_config); // Periodic boundary conditions become weird if the system can "fold unto itself". 
- ERRCHK_ALWAYS(subgrid.n.x >= STENCIL_ORDER); - ERRCHK_ALWAYS(subgrid.n.y >= STENCIL_ORDER); - ERRCHK_ALWAYS(subgrid.n.z >= STENCIL_ORDER); + ERRCHK_ALWAYS(node->subgrid.n.x >= STENCIL_ORDER); + ERRCHK_ALWAYS(node->subgrid.n.y >= STENCIL_ORDER); + ERRCHK_ALWAYS(node->subgrid.n.z >= STENCIL_ORDER); #if VERBOSE_PRINTING // clang-format off - printf("Grid m "); printInt3(grid.m); printf("\n"); - printf("Grid n "); printInt3(grid.n); printf("\n"); - printf("Subrid m "); printInt3(subgrid.m); printf("\n"); - printf("Subrid n "); printInt3(subgrid.n); printf("\n"); + printf("Grid m "); printInt3(node->grid.m); printf("\n"); + printf("Grid n "); printInt3(node->grid.n); printf("\n"); + printf("Subrid m "); printInt3(node->subgrid.m); printf("\n"); + printf("Subrid n "); printInt3(node->subgrid.n); printf("\n"); // clang-format on #endif // Initialize the devices - for (int i = 0; i < num_devices; ++i) { - createDevice(i, subgrid_config, &devices[i]); - printDeviceInfo(devices[i]); + for (int i = 0; i < node->num_devices; ++i) { + acDeviceCreate(i, subgrid_config, &node->devices[i]); + acDevicePrintInfo(node->devices[i]); } // Enable peer access - for (int i = 0; i < num_devices; ++i) { - const int front = (i + 1) % num_devices; - const int back = (i - 1 + num_devices) % num_devices; + for (int i = 0; i < node->num_devices; ++i) { + const int front = (i + 1) % node->num_devices; + const int back = (i - 1 + node->num_devices) % node->num_devices; int can_access_front, can_access_back; cudaDeviceCanAccessPeer(&can_access_front, i, front); @@ -182,7 +189,7 @@ acNodeCreate(const int id, const AcMeshInfo node_config, Node* node_handle) ERRCHK_CUDA_ALWAYS(cudaDeviceEnablePeerAccess(back, 0)); } } - acNodeSynchronizeStream(STREAM_ALL); + acNodeSynchronizeStream(node, STREAM_ALL); *node_handle = node; return AC_SUCCESS; @@ -191,10 +198,10 @@ acNodeCreate(const int id, const AcMeshInfo node_config, Node* node_handle) AcResult acNodeDestroy(Node node) { - 
acSynchronizeStream(STREAM_ALL); + acNodeSynchronizeStream(node, STREAM_ALL); - for (int i = 0; i < num_devices; ++i) { - destroyDevice(devices[i]); + for (int i = 0; i < node->num_devices; ++i) { + acDeviceDestroy(node->devices[i]); } free(node); @@ -204,6 +211,7 @@ acNodeDestroy(Node node) AcResult acNodePrintInfo(const Node node) { + (void)node; WARNING("Not implemented"); return AC_FAILURE; } @@ -211,6 +219,8 @@ acNodePrintInfo(const Node node) AcResult acNodeQueryDeviceConfiguration(const Node node, DeviceConfiguration* config) { + (void)node; + (void)config; WARNING("Not implemented"); return AC_FAILURE; } @@ -218,6 +228,7 @@ acNodeQueryDeviceConfiguration(const Node node, DeviceConfiguration* config) AcResult acNodeAutoOptimize(const Node node) { + (void)node; WARNING("Not implemented"); return AC_FAILURE; } @@ -225,8 +236,8 @@ acNodeAutoOptimize(const Node node) AcResult acNodeSynchronizeStream(const Node node, const Stream stream) { - for (int i = 0; i < num_devices; ++i) { - synchronize(devices[i], stream); + for (int i = 0; i < node->num_devices; ++i) { + acDeviceSynchronizeStream(node->devices[i], stream); } return AC_SUCCESS; @@ -236,37 +247,39 @@ AcResult acNodeSynchronizeVertexBuffer(const Node node, const Stream stream, const VertexBufferHandle vtxbuf_handle) { + acNodeSynchronizeStream(node, stream); // Exchanges the halos of subgrids // After this step, the data within the main grid ranging from // (0, 0, NGHOST) -> grid.m.x, grid.m.y, NGHOST + grid.n.z // has been synchronized and transferred to appropriate subgrids - // We loop only to num_devices - 1 since the front and back plate of the grid is not + // We loop only to node->num_devices - 1 since the front and back plate of the grid is not // transferred because their contents depend on the boundary conditions. - // IMPORTANT NOTE: the boundary conditions must be applied before calling this function! - // I.e. the halos of subgrids must contain up-to-date data! 
+    // IMPORTANT NOTE: the boundary conditions must be applied before
+    // calling this function! I.e. the halos of subgrids must contain
+    // up-to-date data!
 
-    const size_t num_vertices = subgrid.m.x * subgrid.m.y * NGHOST;
+    const size_t num_vertices = node->subgrid.m.x * node->subgrid.m.y * NGHOST;
 
-    for (int i = 0; i < num_devices - 1; ++i) {
+    for (int i = 0; i < node->num_devices - 1; ++i) {
         // ...|ooooxxx|... -> xxx|ooooooo|...
-        const int3 src = (int3){0, 0, subgrid.n.z};
+        const int3 src = (int3){0, 0, node->subgrid.n.z};
         const int3 dst = (int3){0, 0, 0};
 
-        const Device src_device = devices[i];
-        Device dst_device = devices[i + 1];
+        const Device src_device = node->devices[i];
+        Device dst_device = node->devices[i + 1];
 
         acDeviceTransferVertexBufferWithOffset(src_device, stream, vtxbuf_handle, src, dst,
                                                num_vertices, dst_device);
     }
 
-    for (int i = 1; i < num_devices; ++i) {
+    for (int i = 1; i < node->num_devices; ++i) {
         // ...|ooooooo|xxx <- ...|xxxoooo|...
const int3 src = (int3){0, 0, NGHOST}; - const int3 dst = (int3){0, 0, NGHOST + subgrid.n.z}; + const int3 dst = (int3){0, 0, NGHOST + node->subgrid.n.z}; - const Device src_device = devices[i]; - Device dst_device = devices[i - 1]; + const Device src_device = node->devices[i]; + Device dst_device = node->devices[i - 1]; acDeviceTransferVertexBufferWithOffset(src_device, stream, vtxbuf_handle, src, dst, num_vertices, dst_device); @@ -287,8 +300,8 @@ acNodeSynchronizeMesh(const Node node, const Stream stream) AcResult acNodeSwapBuffers(const Node node) { - for (int i = 0; i < num_devices; ++i) { - acDeviceSwapBuffers(devices[i]); + for (int i = 0; i < node->num_devices; ++i) { + acDeviceSwapBuffers(node->devices[i]); } return AC_SUCCESS; } @@ -297,8 +310,9 @@ AcResult acNodeLoadConstant(const Node node, const Stream stream, const AcRealParam param, const AcReal value) { - for (int i = 0; i < num_devices; ++i) { - acDeviceLoadConstant(devices[i], stream, param, value); + acNodeSynchronizeStream(node, stream); + for (int i = 0; i < node->num_devices; ++i) { + acDeviceLoadConstant(node->devices[i], stream, param, value); } return AC_SUCCESS; } @@ -308,14 +322,15 @@ acNodeLoadVertexBufferWithOffset(const Node node, const Stream stream, const AcM const VertexBufferHandle vtxbuf_handle, const int3 src, const int3 dst, const int num_vertices) { + acNodeSynchronizeStream(node, stream); // See the beginning of the file for an explanation of the index mapping // #pragma omp parallel for - for (int i = 0; i < num_devices; ++i) { - const int3 d0 = (int3){0, 0, i * subgrid.n.z}; // DECOMPOSITION OFFSET HERE - const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.m.z}; + for (int i = 0; i < node->num_devices; ++i) { + const int3 d0 = (int3){0, 0, i * node->subgrid.n.z}; // DECOMPOSITION OFFSET HERE + const int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y, d0.z + node->subgrid.m.z}; - const int3 s0 = src; - const int3 s1 = gridIdx3d(grid, gridIdx(grid, s0) + 
num_vertices); + const int3 s0 = dst; + const int3 s1 = gridIdx3d(node->grid, gridIdx(node->grid, s0) + num_vertices); const int3 da = max(s0, d0); const int3 db = min(s1, d1); @@ -330,13 +345,14 @@ acNodeLoadVertexBufferWithOffset(const Node node, const Stream stream, const AcM printf("\t-> %s to device %d\n", db.z >= da.z ? "Copy" : "Do not copy", i); */ if (db.z >= da.z) { - const int copy_cells = gridIdx(subgrid, db) - gridIdx(subgrid, da); + const int copy_cells = gridIdx(node->subgrid, db) - gridIdx(node->subgrid, da); // DECOMPOSITION OFFSET HERE - const int3 da_local = (int3){da.x, da.y, da.z - i * grid.n.z / num_devices}; + const int3 da_global = src + da - dst; + const int3 da_local = (int3){da.x, da.y, da.z - i * node->grid.n.z / node->num_devices}; // printf("\t\tcopy %d cells to local index ", copy_cells); printInt3(da_local); // printf("\n"); - acDeviceLoadVertexBufferWithOffset(devices[i], stream, host_mesh, vtxbuf_handle, da, - da_local, copy_cells); + acDeviceLoadVertexBufferWithOffset(node->devices[i], stream, host_mesh, vtxbuf_handle, + da_global, da_local, copy_cells); } // printf("\n"); } @@ -381,21 +397,23 @@ acNodeStoreVertexBufferWithOffset(const Node node, const Stream stream, const VertexBufferHandle vtxbuf_handle, const int3 src, const int3 dst, const int num_vertices, AcMesh* host_mesh) { - for (int i = 0; i < num_devices; ++i) { - const int3 d0 = (int3){0, 0, i * subgrid.n.z}; // DECOMPOSITION OFFSET HERE - const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.m.z}; + acNodeSynchronizeStream(node, stream); + for (int i = 0; i < node->num_devices; ++i) { + const int3 d0 = (int3){0, 0, i * node->subgrid.n.z}; // DECOMPOSITION OFFSET HERE + const int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y, d0.z + node->subgrid.m.z}; const int3 s0 = src; - const int3 s1 = gridIdx3d(grid, gridIdx(grid, s0) + num_vertices); + const int3 s1 = gridIdx3d(node->grid, gridIdx(node->grid, s0) + num_vertices); const int3 da = max(s0, d0); const 
int3 db = min(s1, d1); if (db.z >= da.z) { - const int copy_cells = gridIdx(subgrid, db) - gridIdx(subgrid, da); + const int copy_cells = gridIdx(node->subgrid, db) - gridIdx(node->subgrid, da); // DECOMPOSITION OFFSET HERE - const int3 da_local = (int3){da.x, da.y, da.z - i * grid.n.z / num_devices}; - acDeviceStoreVertexBufferWithOffset(devices[i], stream, vtxbuf_handle, da_local, da, - copy_cells, host_mesh); + const int3 da_local = (int3){da.x, da.y, da.z - i * node->grid.n.z / node->num_devices}; + const int3 da_global = dst + da - src; + acDeviceStoreVertexBufferWithOffset(node->devices[i], stream, vtxbuf_handle, da_local, + da_global, copy_cells, host_mesh); } } return AC_SUCCESS; @@ -418,7 +436,7 @@ acNodeStoreVertexBuffer(const Node node, const Stream stream, { const int3 src = (int3){0, 0, 0}; const int3 dst = src; - const size_t num_vertices = acVertexBufferSize(host_mesh.info); + const size_t num_vertices = acVertexBufferSize(host_mesh->info); acNodeStoreVertexBufferWithOffset(node, stream, vtxbuf_handle, src, dst, num_vertices, host_mesh); @@ -436,21 +454,23 @@ acNodeStoreMesh(const Node node, const Stream stream, AcMesh* host_mesh) } AcResult -acNodeIntegrateSubstep(const Node node, const Stream stream, const int step_number, - const int3 start, const int3 end, const AcReal dt) +acNodeIntegrateSubstep(const Node node, const Stream stream, const int isubstep, const int3 start, + const int3 end, const AcReal dt) { - for (int i = 0; i < num_devices; ++i) { + acNodeSynchronizeStream(node, stream); + + for (int i = 0; i < node->num_devices; ++i) { // DECOMPOSITION OFFSET HERE - const int3 d0 = (int3){NGHOST, NGHOST, NGHOST + i * subgrid.n.z}; - const int3 d1 = d0 + (int3){subgrid.n.x, subgrid.n.y, subgrid.n.z}; + const int3 d0 = (int3){NGHOST, NGHOST, NGHOST + i * node->subgrid.n.z}; + const int3 d1 = d0 + (int3){node->subgrid.n.x, node->subgrid.n.y, node->subgrid.n.z}; const int3 da = max(start, d0); const int3 db = min(end, d1); if (db.z >= da.z) { - 
const int3 da_local = da - (int3){0, 0, i * subgrid.n.z}; - const int3 db_local = db - (int3){0, 0, i * subgrid.n.z}; - acDeviceIntegrateSubstep(devices[i], stream, isubstep, da_local, db_local, dt); + const int3 da_local = da - (int3){0, 0, i * node->subgrid.n.z}; + const int3 db_local = db - (int3){0, 0, i * node->subgrid.n.z}; + acDeviceIntegrateSubstep(node->devices[i], stream, isubstep, da_local, db_local, dt); } } return AC_SUCCESS; @@ -459,43 +479,54 @@ acNodeIntegrateSubstep(const Node node, const Stream stream, const int step_numb AcResult acNodeIntegrate(const Node node, const AcReal dt) { - acNodeSynchronizeStream(STREAM_ALL); + acNodeSynchronizeStream(node, STREAM_ALL); - WARNING("Not implementad\n"); + for (int isubstep = 0; isubstep < 3; ++isubstep) { + acNodePeriodicBoundconds(node, STREAM_DEFAULT); + const int3 start = (int3){NGHOST, NGHOST, NGHOST}; + const int3 end = start + node->grid.n; + acNodeIntegrateSubstep(node, STREAM_DEFAULT, isubstep, start, end, dt); + acNodeSwapBuffers(node); + } - acNodeSynchronizeStream(STREAM_ALL); + acNodeSynchronizeStream(node, STREAM_ALL); return AC_SUCCESS; } static AcResult -local_boundcondstep(const Node node, const StreamType stream, const VertexBufferHandle vtxbuf) +local_boundcondstep(const Node node, const Stream stream, const VertexBufferHandle vtxbuf) { - if (num_devices > 1) { + acNodeSynchronizeStream(node, stream); + + if (node->num_devices > 1) { // Local boundary conditions - for (int i = 0; i < num_devices; ++i) { + for (int i = 0; i < node->num_devices; ++i) { const int3 d0 = (int3){0, 0, NGHOST}; // DECOMPOSITION OFFSET HERE - const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.n.z}; - acDeviceBoundcondStep(devices[i], stream, vtxbuf, d0, d1); + const int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y, d0.z + node->subgrid.n.z}; + acDevicePeriodicBoundcondStep(node->devices[i], stream, vtxbuf, d0, d1); } } else { - acDeviceBoundcondStep(devices[0], stream, vtxbuf, (int3){0, 0, 0}, 
subgrid.m); + acDevicePeriodicBoundcondStep(node->devices[0], stream, vtxbuf, (int3){0, 0, 0}, + node->subgrid.m); } return AC_SUCCESS; } static AcResult -global_boundcondstep(const Node node, const StreamType stream, const VertexBufferHandle vtxbuf) +global_boundcondstep(const Node node, const Stream stream, const VertexBufferHandle vtxbuf_handle) { - if (num_devices > 1) { - const size_t num_vertices = subgrid.m.x * subgrid.m.y * NGHOST; + acNodeSynchronizeStream(node, stream); + + if (node->num_devices > 1) { + const size_t num_vertices = node->subgrid.m.x * node->subgrid.m.y * NGHOST; { // ...|ooooxxx|... -> xxx|ooooooo|... - const int3 src = (int3){0, 0, subgrid.n.z}; + const int3 src = (int3){0, 0, node->subgrid.n.z}; const int3 dst = (int3){0, 0, 0}; - const Device src_device = devices[num_devices - 1]; - Device dst_device = devices[0]; + const Device src_device = node->devices[node->num_devices - 1]; + Device dst_device = node->devices[0]; acDeviceTransferVertexBufferWithOffset(src_device, stream, vtxbuf_handle, src, dst, num_vertices, dst_device); @@ -503,10 +534,10 @@ global_boundcondstep(const Node node, const StreamType stream, const VertexBuffe { // ...|ooooooo|xxx <- ...|xxxoooo|... 
const int3 src = (int3){0, 0, NGHOST}; - const int3 dst = (int3){0, 0, NGHOST + subgrid.n.z}; + const int3 dst = (int3){0, 0, NGHOST + node->subgrid.n.z}; - const Device src_device = devices[0]; - Device dst_device = devices[num_devices - 1]; + const Device src_device = node->devices[0]; + Device dst_device = node->devices[node->num_devices - 1]; acDeviceTransferVertexBufferWithOffset(src_device, stream, vtxbuf_handle, src, dst, num_vertices, dst_device); @@ -539,7 +570,8 @@ acNodePeriodicBoundconds(const Node node, const Stream stream) } static AcReal -simple_final_reduce_scal(const ReductionType& rtype, const AcReal* results, const int& n) +simple_final_reduce_scal(const Node node, const ReductionType& rtype, const AcReal* results, + const int& n) { AcReal res = results[0]; for (int i = 1; i < n; ++i) { @@ -549,7 +581,7 @@ simple_final_reduce_scal(const ReductionType& rtype, const AcReal* results, cons else if (rtype == RTYPE_MIN) { res = min(res, results[i]); } - else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) { + else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP || rtype == RTYPE_SUM) { res = sum(res, results[i]); } else { @@ -558,10 +590,9 @@ simple_final_reduce_scal(const ReductionType& rtype, const AcReal* results, cons } if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) { - const AcReal inv_n = AcReal(1.) / (grid.n.x * grid.n.y * grid.n.z); + const AcReal inv_n = AcReal(1.) 
/ (node->grid.n.x * node->grid.n.y * node->grid.n.z); res = sqrt(inv_n * res); } - return res; } @@ -569,27 +600,29 @@ AcResult acNodeReduceScal(const Node node, const Stream stream, const ReductionType rtype, const VertexBufferHandle vtxbuf_handle, AcReal* result) { - acSynchronizeStream(STREAM_ALL); + acNodeSynchronizeStream(node, STREAM_ALL); - AcReal results[num_devices]; - for (int i = 0; i < num_devices; ++i) { - acDeviceReduceScal(devices[i], STREAM_DEFAULT, rtype, vtxbuffer_handle, &results[i]); + AcReal results[node->num_devices]; + for (int i = 0; i < node->num_devices; ++i) { + acDeviceReduceScal(node->devices[i], stream, rtype, vtxbuf_handle, &results[i]); } - return simple_final_reduce_scal(rtype, results, num_devices); + *result = simple_final_reduce_scal(node, rtype, results, node->num_devices); + return AC_SUCCESS; } AcResult -acNodeReduceVec(const Node node, const Stream stream_type, const ReductionType rtype, - const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1, - const VertexBufferHandle vtxbuf2, AcReal* result) +acNodeReduceVec(const Node node, const Stream stream, const ReductionType rtype, + const VertexBufferHandle a, const VertexBufferHandle b, const VertexBufferHandle c, + AcReal* result) { - acSynchronizeStream(STREAM_ALL); + acNodeSynchronizeStream(node, STREAM_ALL); - AcReal results[num_devices]; - for (int i = 0; i < num_devices; ++i) { - acDeviceReduceScal(devices[i], STREAM_DEFAULT, rtype, a, b, c, &results[i]); + AcReal results[node->num_devices]; + for (int i = 0; i < node->num_devices; ++i) { + acDeviceReduceVec(node->devices[i], stream, rtype, a, b, c, &results[i]); } - return simple_final_reduce_scal(rtype, results, num_devices); + *result = simple_final_reduce_scal(node, rtype, results, node->num_devices); + return AC_SUCCESS; } From 6dfd03664d3e4c5fd1a871b541ab7d0c494724ae Mon Sep 17 00:00:00 2001 From: jpekkila Date: Fri, 2 Aug 2019 15:31:24 +0300 Subject: [PATCH 11/37] Still does not work. 
I'm starting to think that instead of this one huge revision, we should modify the existing interface step-by-step. --- src/core/node.cu | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/src/core/node.cu b/src/core/node.cu index 23c79b3..e647547 100644 --- a/src/core/node.cu +++ b/src/core/node.cu @@ -61,12 +61,12 @@ printInt3(const int3 vec) } static inline void -print(const Node node) +print(const AcMeshInfo config) { for (int i = 0; i < NUM_INT_PARAMS; ++i) - printf("[%s]: %d\n", intparam_names[i], node->config.int_params[i]); + printf("[%s]: %d\n", intparam_names[i], config.int_params[i]); for (int i = 0; i < NUM_REAL_PARAMS; ++i) - printf("[%s]: %g\n", realparam_names[i], double(node->config.real_params[i])); + printf("[%s]: %g\n", realparam_names[i], double(config.real_params[i])); } static void @@ -142,7 +142,7 @@ acNodeCreate(const int id, const AcMeshInfo node_config, Node* node_handle) #if VERBOSE_PRINTING // Defined in astaroth.h printf("###############################################################\n"); printf("Config dimensions recalculated:\n"); - print(node); + print(subgrid_config); printf("###############################################################\n"); #endif node->subgrid = createGrid(subgrid_config); @@ -329,7 +329,8 @@ acNodeLoadVertexBufferWithOffset(const Node node, const Stream stream, const AcM const int3 d0 = (int3){0, 0, i * node->subgrid.n.z}; // DECOMPOSITION OFFSET HERE const int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y, d0.z + node->subgrid.m.z}; - const int3 s0 = dst; + const int3 s0 = src; // dst; // TODO fix + (void)dst; // TODO fix const int3 s1 = gridIdx3d(node->grid, gridIdx(node->grid, s0) + num_vertices); const int3 da = max(s0, d0); @@ -347,7 +348,7 @@ acNodeLoadVertexBufferWithOffset(const Node node, const Stream stream, const AcM if (db.z >= da.z) { const int copy_cells = gridIdx(node->subgrid, db) - gridIdx(node->subgrid, da); // DECOMPOSITION OFFSET HERE - const 
int3 da_global = src + da - dst; + const int3 da_global = da; // src + da - dst; // TODO fix const int3 da_local = (int3){da.x, da.y, da.z - i * node->grid.n.z / node->num_devices}; // printf("\t\tcopy %d cells to local index ", copy_cells); printInt3(da_local); // printf("\n"); @@ -402,7 +403,8 @@ acNodeStoreVertexBufferWithOffset(const Node node, const Stream stream, const int3 d0 = (int3){0, 0, i * node->subgrid.n.z}; // DECOMPOSITION OFFSET HERE const int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y, d0.z + node->subgrid.m.z}; - const int3 s0 = src; + const int3 s0 = src; // TODO fix + (void)dst; // TODO fix const int3 s1 = gridIdx3d(node->grid, gridIdx(node->grid, s0) + num_vertices); const int3 da = max(s0, d0); @@ -411,7 +413,7 @@ acNodeStoreVertexBufferWithOffset(const Node node, const Stream stream, const int copy_cells = gridIdx(node->subgrid, db) - gridIdx(node->subgrid, da); // DECOMPOSITION OFFSET HERE const int3 da_local = (int3){da.x, da.y, da.z - i * node->grid.n.z / node->num_devices}; - const int3 da_global = dst + da - src; + const int3 da_global = da; // dst + da - src; // TODO fix acDeviceStoreVertexBufferWithOffset(node->devices[i], stream, vtxbuf_handle, da_local, da_global, copy_cells, host_mesh); } @@ -483,12 +485,14 @@ acNodeIntegrate(const Node node, const AcReal dt) for (int isubstep = 0; isubstep < 3; ++isubstep) { acNodePeriodicBoundconds(node, STREAM_DEFAULT); + acNodeSynchronizeStream(node, STREAM_DEFAULT); // DEBUG const int3 start = (int3){NGHOST, NGHOST, NGHOST}; const int3 end = start + node->grid.n; acNodeIntegrateSubstep(node, STREAM_DEFAULT, isubstep, start, end, dt); acNodeSwapBuffers(node); + acNodeSynchronizeStream(node, STREAM_DEFAULT); // DEBUG } - + acNodePeriodicBoundconds(node, STREAM_DEFAULT); // DEBUG acNodeSynchronizeStream(node, STREAM_ALL); return AC_SUCCESS; } @@ -555,7 +559,7 @@ acNodePeriodicBoundcondStep(const Node node, const Stream stream, // TODO NOTE GLOBAL BOUNDCONDS NOT DONE HERE IF MORE THAN 1 NODE 
global_boundcondstep(node, stream, vtxbuf_handle); - WARNING("Global boundconds should not be done here with multinode"); + // WARNING("Global boundconds should not be done here with multinode"); return AC_SUCCESS; } From 5232d987c12c5897ff3ade36acd2a6cb62d50cee Mon Sep 17 00:00:00 2001 From: jpekkila Date: Mon, 5 Aug 2019 16:18:22 +0300 Subject: [PATCH 12/37] Added acStoreWithOffset to the revised interface --- include/astaroth.h | 2 ++ src/core/astaroth.cu | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/include/astaroth.h b/include/astaroth.h index fbfdb05..94d7e17 100644 --- a/include/astaroth.h +++ b/include/astaroth.h @@ -62,6 +62,8 @@ AcReal acReduceScal(const ReductionType rtype, const VertexBufferHandle vtxbuf_h AcReal acReduceVec(const ReductionType rtype, const VertexBufferHandle a, const VertexBufferHandle b, const VertexBufferHandle c); +AcResult acStoreWithOffset(const int3 dst, const size_t num_vertices, AcMesh* host_mesh); + #ifdef __cplusplus } // extern "C" #endif diff --git a/src/core/astaroth.cu b/src/core/astaroth.cu index 8ea0d68..d47748c 100644 --- a/src/core/astaroth.cu +++ b/src/core/astaroth.cu @@ -98,3 +98,9 @@ acReduceVec(const ReductionType rtype, const VertexBufferHandle a, const VertexB acNodeReduceVec(nodes[0], STREAM_DEFAULT, rtype, a, b, c, &result); return result; } + +AcResult +acStoreWithOffset(const int3 dst, const size_t num_vertices, AcMesh* host_mesh) +{ + return acNodeStoreMeshWithOffset(nodes[0], STREAM_DEFAULT, dst, dst, num_vertices, host_mesh); +} From fa6e1116cb921cd4a249782e6b79b94fe3226f1e Mon Sep 17 00:00:00 2001 From: jpekkila Date: Mon, 5 Aug 2019 17:26:05 +0300 Subject: [PATCH 13/37] The interface revision now actually works. The issue was incorrect order of src and dst indices when storing the mesh. 
--- src/core/device.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/core/device.cu b/src/core/device.cu index f5bd295..9846e85 100644 --- a/src/core/device.cu +++ b/src/core/device.cu @@ -409,8 +409,8 @@ acDeviceStoreVertexBufferWithOffset(const Device device, const Stream stream, const size_t src_idx = acVertexBufferIdx(src.x, src.y, src.z, device->local_config); const size_t dst_idx = acVertexBufferIdx(dst.x, dst.y, dst.z, host_mesh->info); - const AcReal* src_ptr = &device->vba.in[vtxbuf_handle][dst_idx]; - AcReal* dst_ptr = &host_mesh->vertex_buffer[vtxbuf_handle][src_idx]; + const AcReal* src_ptr = &device->vba.in[vtxbuf_handle][src_idx]; + AcReal* dst_ptr = &host_mesh->vertex_buffer[vtxbuf_handle][dst_idx]; const size_t bytes = num_vertices * sizeof(src_ptr[0]); ERRCHK_CUDA( // From b1176a0c067ea0253c3b92a4f2f896000b8b0b2b Mon Sep 17 00:00:00 2001 From: jpekkila Date: Mon, 5 Aug 2019 18:24:55 +0300 Subject: [PATCH 14/37] Added a copyright text to ctest --- src/ctest/main.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/ctest/main.c b/src/ctest/main.c index a718b7d..5f31f17 100644 --- a/src/ctest/main.c +++ b/src/ctest/main.c @@ -1,3 +1,21 @@ +/* + Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae. + + This file is part of Astaroth. + + Astaroth is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Astaroth is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Astaroth. If not, see . 
+*/ #include #include From b21d2040da6ea86b10d9428425ad2c04e445bd33 Mon Sep 17 00:00:00 2001 From: jpekkila Date: Mon, 5 Aug 2019 18:26:12 +0300 Subject: [PATCH 15/37] Added a test for building an MPI project. Building for the MPI and C API tests is now also disabled by default. --- CMakeLists.txt | 7 +++++- src/mpitest/CMakeLists.txt | 12 +++++++++ src/mpitest/README.txt | 1 + src/mpitest/main.c | 51 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 70 insertions(+), 1 deletion(-) create mode 100644 src/mpitest/CMakeLists.txt create mode 100644 src/mpitest/README.txt create mode 100644 src/mpitest/main.c diff --git a/CMakeLists.txt b/CMakeLists.txt index 04a73d8..4cf61b3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23,7 +23,8 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}) option(BUILD_DEBUG "Builds the program with extensive error checking" OFF) option(BUILD_STANDALONE "Builds the standalone Astaroth" ON) option(BUILD_RT_VISUALIZATION "Builds the module for real-time visualization using SDL2" OFF) -option(BUILD_C_API_TEST "Builds a C program to test whether the API is conformant" ON) +option(BUILD_C_API_TEST "Builds a C program to test whether the API is conformant" OFF) +option(BUILD_MPI_TEST "Builds a C program to test whether MPI works" OFF) option(DOUBLE_PRECISION "Generates double precision code" OFF) option(MULTIGPU_ENABLED "If enabled, uses all the available GPUs" ON) option(ALTER_CONF "If enabled, loads astaroth.conf from the build directory" OFF) @@ -60,3 +61,7 @@ endif () if (BUILD_C_API_TEST) add_subdirectory(src/ctest) endif () + +if (BUILD_MPI_TEST) + add_subdirectory(src/mpitest) +endif () diff --git a/src/mpitest/CMakeLists.txt b/src/mpitest/CMakeLists.txt new file mode 100644 index 0000000..c64105d --- /dev/null +++ b/src/mpitest/CMakeLists.txt @@ -0,0 +1,12 @@ +############################################## +## CMakeLists.txt for the MPI test ## +############################################## + +set(CMAKE_C_STANDARD 11) 
+set(CMAKE_C_STANDARD_REQUIRED ON) + +find_package(MPI REQUIRED) + +add_executable(mpitest main.c) +target_include_directories(mpitest PRIVATE ${MPI_C_INCLUDE_PATH}) +target_link_libraries(mpitest PRIVATE ${MPI_C_LIBRARIES} astaroth_core) diff --git a/src/mpitest/README.txt b/src/mpitest/README.txt new file mode 100644 index 0000000..1547b08 --- /dev/null +++ b/src/mpitest/README.txt @@ -0,0 +1 @@ +This directory is used to test MPI with Astaroth. diff --git a/src/mpitest/main.c b/src/mpitest/main.c new file mode 100644 index 0000000..a07522e --- /dev/null +++ b/src/mpitest/main.c @@ -0,0 +1,51 @@ +/* + Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae. + + This file is part of Astaroth. + + Astaroth is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Astaroth is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Astaroth. If not, see . +*/ +#include +#include + +#include "astaroth.h" + +#include + +int +main(void) +{ + MPI_Init(NULL, NULL); + + int num_processes, process_id; + MPI_Comm_size(MPI_COMM_WORLD, &num_processes); + MPI_Comm_rank(MPI_COMM_WORLD, &process_id); + + char processor_name[MPI_MAX_PROCESSOR_NAME]; + int name_len; + MPI_Get_processor_name(processor_name, &name_len); + printf("Processor %s. 
Process %d of %d.\n", processor_name, process_id, num_processes); + + AcMeshInfo info = { + .int_params[AC_nx] = 128, + .int_params[AC_ny] = 64, + .int_params[AC_nz] = 32, + }; + acInit(info); + acIntegrate(0.1f); + acQuit(); + + MPI_Finalize(); + return EXIT_SUCCESS; +} From 8df49370c8c1175e3a8f067defcda31659ed08dd Mon Sep 17 00:00:00 2001 From: jpekkila Date: Mon, 5 Aug 2019 19:08:05 +0300 Subject: [PATCH 16/37] Cleanup --- src/core/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index 0aad51d..1a58820 100644 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -33,6 +33,6 @@ else () endif () ## Create and link the library -include_directories(.) cuda_add_library(astaroth_core STATIC astaroth.cu device.cu node.cu) +target_include_directories(astaroth_core PRIVATE .) target_link_libraries(astaroth_core m) From b73c2675e8fa023d4e8d278c8111dc80ec98998e Mon Sep 17 00:00:00 2001 From: jpekkila Date: Mon, 5 Aug 2019 20:10:13 +0300 Subject: [PATCH 17/37] Added the optimized implementation of acNodeIntegrate where boundconds are done before integration instead of after --- include/astaroth_defines.h | 22 ++++++- src/core/astaroth.cu | 3 +- src/core/node.cu | 124 +++++++++++++++++++++++++++++++------ 3 files changed, 128 insertions(+), 21 deletions(-) diff --git a/include/astaroth_defines.h b/include/astaroth_defines.h index 0d03b70..808dd1f 100644 --- a/include/astaroth_defines.h +++ b/include/astaroth_defines.h @@ -104,7 +104,27 @@ typedef enum { NUM_REDUCTION_TYPES } ReductionType; -typedef enum { STREAM_DEFAULT, NUM_STREAM_TYPES } Stream; +typedef enum { + STREAM_DEFAULT, + STREAM_0, + STREAM_1, + STREAM_2, + STREAM_3, + STREAM_4, + STREAM_5, + STREAM_6, + STREAM_7, + STREAM_8, + STREAM_9, + STREAM_10, + STREAM_11, + STREAM_12, + STREAM_13, + STREAM_14, + STREAM_15, + STREAM_16, + NUM_STREAM_TYPES +} Stream; #define STREAM_ALL (NUM_STREAM_TYPES) #define AC_GEN_ID(X) X diff 
--git a/src/core/astaroth.cu b/src/core/astaroth.cu index d47748c..f4a8beb 100644 --- a/src/core/astaroth.cu +++ b/src/core/astaroth.cu @@ -73,7 +73,8 @@ acStore(AcMesh* host_mesh) AcResult acIntegrate(const AcReal dt) { - return acNodeIntegrate(nodes[0], dt); + acNodeIntegrate(nodes[0], dt); + return acBoundcondStep(); } AcResult diff --git a/src/core/node.cu b/src/core/node.cu index e647547..95133f1 100644 --- a/src/core/node.cu +++ b/src/core/node.cu @@ -478,25 +478,6 @@ acNodeIntegrateSubstep(const Node node, const Stream stream, const int isubstep, return AC_SUCCESS; } -AcResult -acNodeIntegrate(const Node node, const AcReal dt) -{ - acNodeSynchronizeStream(node, STREAM_ALL); - - for (int isubstep = 0; isubstep < 3; ++isubstep) { - acNodePeriodicBoundconds(node, STREAM_DEFAULT); - acNodeSynchronizeStream(node, STREAM_DEFAULT); // DEBUG - const int3 start = (int3){NGHOST, NGHOST, NGHOST}; - const int3 end = start + node->grid.n; - acNodeIntegrateSubstep(node, STREAM_DEFAULT, isubstep, start, end, dt); - acNodeSwapBuffers(node); - acNodeSynchronizeStream(node, STREAM_DEFAULT); // DEBUG - } - acNodePeriodicBoundconds(node, STREAM_DEFAULT); // DEBUG - acNodeSynchronizeStream(node, STREAM_ALL); - return AC_SUCCESS; -} - static AcResult local_boundcondstep(const Node node, const Stream stream, const VertexBufferHandle vtxbuf) { @@ -550,6 +531,111 @@ global_boundcondstep(const Node node, const Stream stream, const VertexBufferHan return AC_SUCCESS; } +AcResult +acNodeIntegrate(const Node node, const AcReal dt) +{ + acNodeSynchronizeStream(node, STREAM_ALL); + // xxx|OOO OOOOOOOOO OOO|xxx + // ^ ^ ^ ^ + // n0 n1 n2 n3 + const int3 n0 = (int3){NGHOST, NGHOST, NGHOST}; + const int3 n1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST}; + const int3 n2 = node->grid.n; + const int3 n3 = n0 + node->grid.n; + + for (int isubstep = 0; isubstep < 3; ++isubstep) { + acNodeSynchronizeStream(node, STREAM_ALL); + for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) { + 
local_boundcondstep(node, (Stream)vtxbuf, (VertexBufferHandle)vtxbuf); + } + acNodeSynchronizeStream(node, STREAM_ALL); + // Inner inner + for (int i = 0; i < node->num_devices; ++i) { + const int3 m1 = n1 + (int3){0, 0, i * node->subgrid.n.z}; + const int3 m2 = m1 + node->subgrid.n - (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST}; + acNodeIntegrateSubstep(node, STREAM_16, isubstep, m1, m2, dt); + } + for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) { + const int num_vertices = node->subgrid.m.x * node->subgrid.m.y * NGHOST; + for (int device_id = 0; device_id < node->num_devices; ++device_id) { + // ...|ooooxxx|... -> xxx|ooooooo|... + { + const int3 src = (int3){0, 0, node->subgrid.n.z}; + const int3 dst = (int3){0, 0, 0}; + acDeviceTransferVertexBufferWithOffset( + node->devices[device_id], (Stream)vtxbuf, (VertexBufferHandle)vtxbuf, src, + dst, num_vertices, node->devices[(device_id + 1) % node->num_devices]); + } + // ...|ooooooo|xxx <- ...|xxxoooo|... + { + const int3 src = (int3){0, 0, NGHOST}; + const int3 dst = (int3){0, 0, NGHOST + node->subgrid.n.z}; + acDeviceTransferVertexBufferWithOffset( + node->devices[device_id], (Stream)vtxbuf, (VertexBufferHandle)vtxbuf, src, + dst, num_vertices, + node->devices[(device_id - 1 + node->num_devices) % node->num_devices]); + } + } + } + for (int vtxbuf = 0; vtxbuf < 2 * NUM_VTXBUF_HANDLES; ++vtxbuf) { + acNodeSynchronizeStream(node, (Stream)vtxbuf); + } + // Inner outer + for (int i = 0; i < node->num_devices - 1; ++i) { + const int3 m1 = n1 + (int3){0, 0, (i + 1) * node->subgrid.n.z - 2 * NGHOST}; + const int3 m2 = m1 + (int3){node->subgrid.n.x - 2 * NGHOST, + node->subgrid.n.y - 2 * NGHOST, 2 * NGHOST}; + acNodeIntegrateSubstep(node, STREAM_0, isubstep, m1, m2, dt); + } + // Outer + // Front + { + const int3 m1 = (int3){n0.x, n0.y, n0.z}; + const int3 m2 = (int3){n3.x, n3.y, n1.z}; + acNodeIntegrateSubstep(node, STREAM_1, isubstep, m1, m2, dt); + } + + // Back + { + const int3 m1 = (int3){n0.x, n0.y, n2.z}; 
+ const int3 m2 = (int3){n3.x, n3.y, n3.z}; + acNodeIntegrateSubstep(node, STREAM_2, isubstep, m1, m2, dt); + } + + // Top + { + const int3 m1 = (int3){n0.x, n0.y, n1.z}; + const int3 m2 = (int3){n3.x, n1.y, n2.z}; + acNodeIntegrateSubstep(node, STREAM_3, isubstep, m1, m2, dt); + } + + // Bottom + { + const int3 m1 = (int3){n0.x, n2.y, n1.z}; + const int3 m2 = (int3){n3.x, n3.y, n2.z}; + acNodeIntegrateSubstep(node, STREAM_4, isubstep, m1, m2, dt); + } + + // Left + { + const int3 m1 = (int3){n0.x, n1.y, n1.z}; + const int3 m2 = (int3){n1.x, n2.y, n2.z}; + acNodeIntegrateSubstep(node, STREAM_5, isubstep, m1, m2, dt); + } + + // Right + { + const int3 m1 = (int3){n2.x, n1.y, n1.z}; + const int3 m2 = (int3){n3.x, n2.y, n2.z}; + acNodeIntegrateSubstep(node, STREAM_6, isubstep, m1, m2, dt); + } + + acNodeSwapBuffers(node); + } + acNodeSynchronizeStream(node, STREAM_ALL); + return AC_SUCCESS; +} + AcResult acNodePeriodicBoundcondStep(const Node node, const Stream stream, const VertexBufferHandle vtxbuf_handle) From 1dd997552876d8f01fea6441b2787de41cbfa854 Mon Sep 17 00:00:00 2001 From: jpekkila Date: Tue, 6 Aug 2019 15:44:51 +0300 Subject: [PATCH 18/37] Formatting --- src/core/node.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/core/node.cu b/src/core/node.cu index 95133f1..6a37f95 100644 --- a/src/core/node.cu +++ b/src/core/node.cu @@ -152,14 +152,14 @@ acNodeCreate(const int id, const AcMeshInfo node_config, Node* node_handle) ERRCHK_ALWAYS(node->subgrid.n.y >= STENCIL_ORDER); ERRCHK_ALWAYS(node->subgrid.n.z >= STENCIL_ORDER); -#if VERBOSE_PRINTING // clang-format off + #if VERBOSE_PRINTING printf("Grid m "); printInt3(node->grid.m); printf("\n"); printf("Grid n "); printInt3(node->grid.n); printf("\n"); printf("Subrid m "); printInt3(node->subgrid.m); printf("\n"); printf("Subrid n "); printInt3(node->subgrid.n); printf("\n"); + #endif // clang-format on -#endif // Initialize the devices for (int i = 0; i < node->num_devices; ++i) { 
From 812b5e170edb06e0e5e45a77c4ed307a191952b6 Mon Sep 17 00:00:00 2001 From: jpekkila Date: Tue, 6 Aug 2019 16:32:36 +0300 Subject: [PATCH 19/37] Added some error checking to rendering --- src/standalone/renderer.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/standalone/renderer.cc b/src/standalone/renderer.cc index 6fd1913..dd95a9b 100644 --- a/src/standalone/renderer.cc +++ b/src/standalone/renderer.cc @@ -32,12 +32,12 @@ #include // memcpy #include "config_loader.h" -#include "src/core/errchk.h" -#include "src/core/math_utils.h" #include "model/host_memory.h" #include "model/host_timestep.h" #include "model/model_reduce.h" #include "model/model_rk3.h" +#include "src/core/errchk.h" +#include "src/core/math_utils.h" #include "timer_hires.h" // Window @@ -105,6 +105,7 @@ renderer_init(const int& mx, const int& my) // Setup window window = SDL_CreateWindow("Astaroth", SDL_WINDOWPOS_UNDEFINED, SDL_WINDOWPOS_UNDEFINED, window_width, window_height, SDL_WINDOW_SHOWN); + ERRCHK_ALWAYS(window); // Setup SDL renderer renderer = SDL_CreateRenderer(window, -1, SDL_RENDERER_ACCELERATED); From 37268476838081e74f1deaea85d669eecad4296f Mon Sep 17 00:00:00 2001 From: jpekkila Date: Tue, 6 Aug 2019 16:39:15 +0300 Subject: [PATCH 20/37] Made globalGridN and d_multigpu_offsets built-in parameters. Note the renaming from globalGrid.n to globalGridN. 
--- acc/mhd_solver/stencil_defines.h | 2 +- acc/mhd_solver/stencil_process.sps | 6 +++--- include/astaroth_defines.h | 4 +++- src/core/device.cu | 10 ++-------- src/core/node.cu | 9 +++++++-- 5 files changed, 16 insertions(+), 15 deletions(-) diff --git a/acc/mhd_solver/stencil_defines.h b/acc/mhd_solver/stencil_defines.h index 0ab07c2..5124820 100644 --- a/acc/mhd_solver/stencil_defines.h +++ b/acc/mhd_solver/stencil_defines.h @@ -30,7 +30,7 @@ #define LMAGNETIC (1) #define LENTROPY (1) #define LTEMPERATURE (0) -#define LFORCING (0) +#define LFORCING (1) #define LUPWD (0) #define AC_THERMAL_CONDUCTIVITY (AcReal(0.001)) // TODO: make an actual config parameter diff --git a/acc/mhd_solver/stencil_process.sps b/acc/mhd_solver/stencil_process.sps index eb0fc0c..af30d79 100644 --- a/acc/mhd_solver/stencil_process.sps +++ b/acc/mhd_solver/stencil_process.sps @@ -248,9 +248,9 @@ helical_forcing(Scalar magnitude, Vector k_force, Vector xx, Vector ff_re, Vecto Vector forcing(int3 globalVertexIdx, Scalar dt) { - Vector a = Scalar(.5) * (Vector){globalGrid.n.x * dsx, - globalGrid.n.y * dsy, - globalGrid.n.z * dsz}; // source (origin) + Vector a = Scalar(.5) * (Vector){globalGridN.x * dsx, + globalGridN.y * dsy, + globalGridN.z * dsz}; // source (origin) Vector xx = (Vector){(globalVertexIdx.x - nx_min) * dsx, (globalVertexIdx.y - ny_min) * dsy, (globalVertexIdx.z - nz_min) * dsz}; // sink (current index) diff --git a/include/astaroth_defines.h b/include/astaroth_defines.h index 808dd1f..345dd5d 100644 --- a/include/astaroth_defines.h +++ b/include/astaroth_defines.h @@ -86,7 +86,9 @@ typedef struct { FUNC(AC_nxy),\ FUNC(AC_nxyz),\ -#define AC_FOR_BUILTIN_INT3_PARAM_TYPES(FUNC) +#define AC_FOR_BUILTIN_INT3_PARAM_TYPES(FUNC)\ + FUNC(AC_global_grid_n),\ + FUNC(AC_multigpu_offset), #define AC_FOR_BUILTIN_REAL_PARAM_TYPES(FUNC) diff --git a/src/core/device.cu b/src/core/device.cu index 9846e85..a21ab4d 100644 --- a/src/core/device.cu +++ b/src/core/device.cu @@ -40,13 +40,14 @@ 
typedef struct { } VertexBufferArray; __constant__ AcMeshInfo d_mesh_info; -__constant__ int3 d_multigpu_offset; #define DCONST_INT(X) (d_mesh_info.int_params[X]) #define DCONST_INT3(X) (d_mesh_info.int3_params[X]) #define DCONST_REAL(X) (d_mesh_info.real_params[X]) #define DCONST_REAL3(X) (d_mesh_info.real3_params[X]) #define DEVICE_VTXBUF_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_mx) + (k)*DCONST_INT(AC_mxy)) #define DEVICE_1D_COMPDOMAIN_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_nx) + (k)*DCONST_INT(AC_nxy)) +#define globalGridN (d_mesh_info.int3_params[AC_global_grid_n]) +#define d_multigpu_offset (d_mesh_info.int3_params[AC_multigpu_offset]) #include "kernels/kernels.cuh" static dim3 rk3_tpb(32, 1, 4); @@ -122,13 +123,6 @@ acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device_hand ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_mesh_info, &device_config, sizeof(device_config), 0, cudaMemcpyHostToDevice)); - // Multi-GPU offset. This is used to compute globalVertexIdx. - // Might be better to calculate this in astaroth.cu instead of here, s.t. 
- // everything related to the decomposition is limited to the multi-GPU layer - const int3 multigpu_offset = (int3){0, 0, device->id * device->local_config.int_params[AC_nz]}; - ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_multigpu_offset, &multigpu_offset, - sizeof(multigpu_offset), 0, cudaMemcpyHostToDevice)); - printf("Created device %d (%p)\n", device->id, device); *device_handle = device; diff --git a/src/core/node.cu b/src/core/node.cu index 6a37f95..5166fa1 100644 --- a/src/core/node.cu +++ b/src/core/node.cu @@ -152,17 +152,22 @@ acNodeCreate(const int id, const AcMeshInfo node_config, Node* node_handle) ERRCHK_ALWAYS(node->subgrid.n.y >= STENCIL_ORDER); ERRCHK_ALWAYS(node->subgrid.n.z >= STENCIL_ORDER); +#if VERBOSE_PRINTING // clang-format off - #if VERBOSE_PRINTING printf("Grid m "); printInt3(node->grid.m); printf("\n"); printf("Grid n "); printInt3(node->grid.n); printf("\n"); printf("Subrid m "); printInt3(node->subgrid.m); printf("\n"); printf("Subrid n "); printInt3(node->subgrid.n); printf("\n"); - #endif // clang-format on +#endif // Initialize the devices for (int i = 0; i < node->num_devices; ++i) { + const int3 multinode_offset = (int3){0, 0, 0}; // Placeholder + const int3 multigpu_offset = (int3){0, 0, i * node->subgrid.n.z}; + subgrid_config.int3_params[AC_global_grid_n] = node->grid.n; + subgrid_config.int3_params[AC_multigpu_offset] = multinode_offset + multigpu_offset; + acDeviceCreate(i, subgrid_config, &node->devices[i]); acDevicePrintInfo(node->devices[i]); } From 9af5ba21563ad765238314f2353caea115b5b1aa Mon Sep 17 00:00:00 2001 From: Miikka Vaisala Date: Wed, 7 Aug 2019 11:11:27 +0800 Subject: [PATCH 21/37] Copied elements in the DSL form. Needs to be adapted at the next stage. 
--- acc/mhd_solver/stencil_assembly.sas | 27 ++++++++---- src/standalone/model/model_rk3.cc | 68 ++++++++++++++++++++++++++++- 2 files changed, 85 insertions(+), 10 deletions(-) diff --git a/acc/mhd_solver/stencil_assembly.sas b/acc/mhd_solver/stencil_assembly.sas index 3451722..53b5a45 100644 --- a/acc/mhd_solver/stencil_assembly.sas +++ b/acc/mhd_solver/stencil_assembly.sas @@ -21,9 +21,12 @@ der6x_upwd(in Scalar vertex) return (Scalar){ Scalar(1.0/60.0)*inv_ds* ( - Scalar(20.0)* vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z] - + Scalar(15.0)*(vertex[vertexIdx.x+1, vertexIdx.y, vertexIdx.z] + vertex[vertexIdx.x-1, vertexIdx.y, vertexIdx.z]) - - Scalar( 6.0)*(vertex[vertexIdx.x+2, vertexIdx.y, vertexIdx.z] + vertex[vertexIdx.x-2, vertexIdx.y, vertexIdx.z]) - + vertex[vertexIdx.x+3, vertexIdx.y, vertexIdx.z] + vertex[vertexIdx.x-3, vertexIdx.y, vertexIdx.z])}; + + Scalar(15.0)*(vertex[vertexIdx.x+1, vertexIdx.y, vertexIdx.z] + + vertex[vertexIdx.x-1, vertexIdx.y, vertexIdx.z]) + - Scalar( 6.0)*(vertex[vertexIdx.x+2, vertexIdx.y, vertexIdx.z] + + vertex[vertexIdx.x-2, vertexIdx.y, vertexIdx.z]) + + vertex[vertexIdx.x+3, vertexIdx.y, vertexIdx.z] + + vertex[vertexIdx.x-3, vertexIdx.y, vertexIdx.z])}; } Preprocessed Scalar @@ -33,9 +36,12 @@ der6y_upwd(in Scalar vertex) return (Scalar){ Scalar(1.0/60.0)*inv_ds* ( -Scalar( 20.0)* vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z] - +Scalar( 15.0)*(vertex[vertexIdx.x, vertexIdx.y+1, vertexIdx.z] + vertex[vertexIdx.x, vertexIdx.y-1, vertexIdx.z]) - -Scalar( 6.0)*(vertex[vertexIdx.x, vertexIdx.y+2, vertexIdx.z] + vertex[vertexIdx.x, vertexIdx.y-2, vertexIdx.z]) - + vertex[vertexIdx.x, vertexIdx.y+3, vertexIdx.z] + vertex[vertexIdx.x, vertexIdx.y-3, vertexIdx.z])}; + +Scalar( 15.0)*(vertex[vertexIdx.x, vertexIdx.y+1, vertexIdx.z] + + vertex[vertexIdx.x, vertexIdx.y-1, vertexIdx.z]) + -Scalar( 6.0)*(vertex[vertexIdx.x, vertexIdx.y+2, vertexIdx.z] + + vertex[vertexIdx.x, vertexIdx.y-2, vertexIdx.z]) + + vertex[vertexIdx.x, 
vertexIdx.y+3, vertexIdx.z] + + vertex[vertexIdx.x, vertexIdx.y-3, vertexIdx.z])}; } Preprocessed Scalar @@ -45,9 +51,12 @@ der6z_upwd(in Scalar vertex) return (Scalar){ Scalar(1.0/60.0)*inv_ds* ( -Scalar( 20.0)* vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z] - +Scalar( 15.0)*(vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z+1] + vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z-1]) - -Scalar( 6.0)*(vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z+2] + vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z-2]) - + vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z+3] + vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z-3])}; + +Scalar( 15.0)*(vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z+1] + + vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z-1]) + -Scalar( 6.0)*(vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z+2] + + vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z-2]) + + vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z+3] + + vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z-3])}; } #endif diff --git a/src/standalone/model/model_rk3.cc b/src/standalone/model/model_rk3.cc index bb6dbc2..356f6ad 100644 --- a/src/standalone/model/model_rk3.cc +++ b/src/standalone/model/model_rk3.cc @@ -354,6 +354,56 @@ gradients(const ModelVectorData& data) return (ModelMatrix){gradient(data.x), gradient(data.y), gradient(data.z)}; } +#if LUPWD + +Preprocessed Scalar +der6x_upwd(in Scalar vertex) +{ + Scalar inv_ds = DCONST_REAL(AC_inv_dsx); + + return (Scalar){ Scalar(1.0/60.0)*inv_ds* ( + - Scalar(20.0)* vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z] + + Scalar(15.0)*(vertex[vertexIdx.x+1, vertexIdx.y, vertexIdx.z] + + vertex[vertexIdx.x-1, vertexIdx.y, vertexIdx.z]) + - Scalar( 6.0)*(vertex[vertexIdx.x+2, vertexIdx.y, vertexIdx.z] + + vertex[vertexIdx.x-2, vertexIdx.y, vertexIdx.z]) + + vertex[vertexIdx.x+3, vertexIdx.y, vertexIdx.z] + + vertex[vertexIdx.x-3, vertexIdx.y, vertexIdx.z])}; +} + +Preprocessed Scalar +der6y_upwd(in Scalar vertex) +{ + Scalar inv_ds = DCONST_REAL(AC_inv_dsy); + + return (Scalar){ Scalar(1.0/60.0)*inv_ds* ( + 
-Scalar( 20.0)* vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z] + +Scalar( 15.0)*(vertex[vertexIdx.x, vertexIdx.y+1, vertexIdx.z] + + vertex[vertexIdx.x, vertexIdx.y-1, vertexIdx.z]) + -Scalar( 6.0)*(vertex[vertexIdx.x, vertexIdx.y+2, vertexIdx.z] + + vertex[vertexIdx.x, vertexIdx.y-2, vertexIdx.z]) + + vertex[vertexIdx.x, vertexIdx.y+3, vertexIdx.z] + + vertex[vertexIdx.x, vertexIdx.y-3, vertexIdx.z])}; +} + +Preprocessed Scalar +der6z_upwd(in Scalar vertex) +{ + Scalar inv_ds = DCONST_REAL(AC_inv_dsz); + + return (Scalar){ Scalar(1.0/60.0)*inv_ds* ( + -Scalar( 20.0)* vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z] + +Scalar( 15.0)*(vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z+1] + + vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z-1]) + -Scalar( 6.0)*(vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z+2] + + vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z-2]) + + vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z+3] + + vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z-3])}; +} + +#endif + + /* * ============================================================================= * Level 0.3 (Built-in functions available during the Stencil Processing Stage) @@ -450,6 +500,17 @@ curl(const ModelVectorData& vec) gradient(vec.y).x - gradient(vec.x).y}; } +#if LUPWD +Scalar +upwd_der6(in Vector uu, in Scalar lnrho) +{ + Scalar uux = fabs(value(uu).x); + Scalar uuy = fabs(value(uu).y); + Scalar uuz = fabs(value(uu).z); + return (Scalar){uux*der6x_upwd(lnrho) + uuy*der6y_upwd(lnrho) + uuz*der6z_upwd(lnrho)}; +} +#endif + static inline ModelVector gradient_of_divergence(const ModelVectorData& vec) { @@ -505,7 +566,12 @@ contract(const ModelMatrix& mat) static inline ModelScalar continuity(const ModelVectorData& uu, const ModelScalarData& lnrho) { - return -dot(value(uu), gradient(lnrho)) - divergence(uu); + return -dot(value(uu), gradient(lnrho)) +#if LUPWD + //This is a corrective hyperdiffusion term for upwinding. 
+ + upwd_der6(uu, lnrho) +#endif + - divergence(uu); } static inline ModelScalar From 15f0fa6aa552b0f7c18248ba2a9eabcec23707a8 Mon Sep 17 00:00:00 2001 From: Miikka Vaisala Date: Wed, 7 Aug 2019 14:07:45 +0800 Subject: [PATCH 22/37] Useful .gitignore added --- doc/manual/.gitignore | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 doc/manual/.gitignore diff --git a/doc/manual/.gitignore b/doc/manual/.gitignore new file mode 100644 index 0000000..23f832b --- /dev/null +++ b/doc/manual/.gitignore @@ -0,0 +1,2 @@ +*.html +*.pdf From 7cc524b78b2807f520dc4ad2c4f92329a1a83209 Mon Sep 17 00:00:00 2001 From: Miikka Vaisala Date: Wed, 7 Aug 2019 14:57:51 +0800 Subject: [PATCH 23/37] Adapting for autotest but i, j, k indexing is confusing. --- acc/mhd_solver/stencil_defines.h | 4 +- src/standalone/model/model_rk3.cc | 118 +++++++++++++++--------------- 2 files changed, 61 insertions(+), 61 deletions(-) diff --git a/acc/mhd_solver/stencil_defines.h b/acc/mhd_solver/stencil_defines.h index 5124820..ea4c102 100644 --- a/acc/mhd_solver/stencil_defines.h +++ b/acc/mhd_solver/stencil_defines.h @@ -30,8 +30,8 @@ #define LMAGNETIC (1) #define LENTROPY (1) #define LTEMPERATURE (0) -#define LFORCING (1) -#define LUPWD (0) +#define LFORCING (0) +#define LUPWD (1) #define AC_THERMAL_CONDUCTIVITY (AcReal(0.001)) // TODO: make an actual config parameter diff --git a/src/standalone/model/model_rk3.cc b/src/standalone/model/model_rk3.cc index 356f6ad..3471cd1 100644 --- a/src/standalone/model/model_rk3.cc +++ b/src/standalone/model/model_rk3.cc @@ -273,6 +273,53 @@ derzz(const int i, const int j, const int k, const ModelScalar* arr) return second_derivative(pencil, get(AC_inv_dsz)); } +#if LUPWD +static inline ModelScalar +der6x_upwd(const int i, const int j, const int k, const ModelScalar* arr) +{ + ModelScalar inv_ds = get(AC_inv_dsx); + + return ModelScalar(1.0/60.0)*inv_ds* ( + -ModelScalar( 20.0)* arr[IDX(i, j, k)] + +ModelScalar( 15.0)*(arr[IDX(i+1, j, k)] + + arr[IDX(i-1, j, 
k)]) + -ModelScalar( 6.0)*(arr[IDX(i+2, j, k)] + + arr[IDX(i-2, j, k)]) + + arr[IDX(i+3, j, k)] + + arr[IDX(i-3, j, k)]); +} + +static inline ModelScalar +der6y_upwd(const int i, const int j, const int k, const ModelScalar* arr) +{ + ModelScalar inv_ds = get(AC_inv_dsy); + + return ModelScalar(1.0/60.0)*inv_ds* ( + -ModelScalar( 20.0)* arr[IDX(i, j, k)] + +ModelScalar( 15.0)*(arr[IDX(i, j+1, k)] + + arr[IDX(i, j-1, k)]) + -ModelScalar( 6.0)*(arr[IDX(i, j+2, k)] + + arr[IDX(i, j-2, k)]) + + arr[IDX(i, j+3, k)] + + arr[IDX(i, j-3, k)]); +} + +static inline ModelScalar +der6z_upwd(const int i, const int j, const int k, const ModelScalar* arr) +{ + ModelScalar inv_ds = get(AC_inv_dsz); + + return ModelScalar(1.0/60.0)*inv_ds* ( + -ModelScalar( 20.0)* arr[IDX(i, j, k )] + +ModelScalar( 15.0)*(arr[IDX(i, j, k+1)] + + arr[IDX(i, j, k-1)]) + -ModelScalar( 6.0)*(arr[IDX(i, j, k+2)] + + arr[IDX(i, j, k-2)]) + + arr[IDX(i, j, k+3)] + + arr[IDX(i, j, k-3)]); +} +#endif + static inline ModelScalar compute_value(const int i, const int j, const int k, const ModelScalar* arr) { @@ -354,54 +401,6 @@ gradients(const ModelVectorData& data) return (ModelMatrix){gradient(data.x), gradient(data.y), gradient(data.z)}; } -#if LUPWD - -Preprocessed Scalar -der6x_upwd(in Scalar vertex) -{ - Scalar inv_ds = DCONST_REAL(AC_inv_dsx); - - return (Scalar){ Scalar(1.0/60.0)*inv_ds* ( - - Scalar(20.0)* vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z] - + Scalar(15.0)*(vertex[vertexIdx.x+1, vertexIdx.y, vertexIdx.z] - + vertex[vertexIdx.x-1, vertexIdx.y, vertexIdx.z]) - - Scalar( 6.0)*(vertex[vertexIdx.x+2, vertexIdx.y, vertexIdx.z] - + vertex[vertexIdx.x-2, vertexIdx.y, vertexIdx.z]) - + vertex[vertexIdx.x+3, vertexIdx.y, vertexIdx.z] - + vertex[vertexIdx.x-3, vertexIdx.y, vertexIdx.z])}; -} - -Preprocessed Scalar -der6y_upwd(in Scalar vertex) -{ - Scalar inv_ds = DCONST_REAL(AC_inv_dsy); - - return (Scalar){ Scalar(1.0/60.0)*inv_ds* ( - -Scalar( 20.0)* vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z] - 
+Scalar( 15.0)*(vertex[vertexIdx.x, vertexIdx.y+1, vertexIdx.z] - + vertex[vertexIdx.x, vertexIdx.y-1, vertexIdx.z]) - -Scalar( 6.0)*(vertex[vertexIdx.x, vertexIdx.y+2, vertexIdx.z] - + vertex[vertexIdx.x, vertexIdx.y-2, vertexIdx.z]) - + vertex[vertexIdx.x, vertexIdx.y+3, vertexIdx.z] - + vertex[vertexIdx.x, vertexIdx.y-3, vertexIdx.z])}; -} - -Preprocessed Scalar -der6z_upwd(in Scalar vertex) -{ - Scalar inv_ds = DCONST_REAL(AC_inv_dsz); - - return (Scalar){ Scalar(1.0/60.0)*inv_ds* ( - -Scalar( 20.0)* vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z] - +Scalar( 15.0)*(vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z+1] - + vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z-1]) - -Scalar( 6.0)*(vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z+2] - + vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z-2]) - + vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z+3] - + vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z-3])}; -} - -#endif /* @@ -500,17 +499,6 @@ curl(const ModelVectorData& vec) gradient(vec.y).x - gradient(vec.x).y}; } -#if LUPWD -Scalar -upwd_der6(in Vector uu, in Scalar lnrho) -{ - Scalar uux = fabs(value(uu).x); - Scalar uuy = fabs(value(uu).y); - Scalar uuz = fabs(value(uu).z); - return (Scalar){uux*der6x_upwd(lnrho) + uuy*der6y_upwd(lnrho) + uuz*der6z_upwd(lnrho)}; -} -#endif - static inline ModelVector gradient_of_divergence(const ModelVectorData& vec) { @@ -563,6 +551,18 @@ contract(const ModelMatrix& mat) * Stencil Processing Stage (equations) * ============================================================================= */ + +#if LUPWD +ModelScalar +upwd_der6(const ModelVectorData& uu, const ModelScalarData& lnrho) +{ + ModelScalar uux = fabs(value(uu).x); + ModelScalar uuy = fabs(value(uu).y); + ModelScalar uuz = fabs(value(uu).z); + return uux*der6x_upwd(lnrho) + uuy*der6y_upwd(lnrho) + uuz*der6z_upwd(lnrho); +} +#endif + static inline ModelScalar continuity(const ModelVectorData& uu, const ModelScalarData& lnrho) { From 7e6361a92abde5a9fcdf8413309baa33f04fcb7d Mon Sep 17 
00:00:00 2001 From: Miikka Vaisala Date: Wed, 7 Aug 2019 16:04:48 +0800 Subject: [PATCH 24/37] Forcing hotfix. Will need more investigation before scientific runs. Now just something to correct the obvious bug. --- acc/mhd_solver/stencil_process.sps | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/acc/mhd_solver/stencil_process.sps b/acc/mhd_solver/stencil_process.sps index c3ade38..2adb5ba 100644 --- a/acc/mhd_solver/stencil_process.sps +++ b/acc/mhd_solver/stencil_process.sps @@ -222,13 +222,16 @@ helical_forcing(Scalar magnitude, Vector k_force, Vector xx, Vector ff_re, Vecto // JP: This looks wrong: // 1) Should it be dsx * nx instead of dsx * ny? // 2) Should you also use globalGrid.n instead of the local n? + // MV: You are rigth. Made a quickfix. I did not see the error because multigpu is split + // in z direction not y direction. // 3) Also final point: can we do this with vectors/quaternions instead? // Tringonometric functions are much more expensive and inaccurate/ + // MV: Good idea. No an immediate priority. // Fun related article: // https://randomascii.wordpress.com/2014/10/09/intel-underestimates-error-bounds-by-1-3-quintillion/ - xx.x = xx.x*(2.0*M_PI/(dsx*(DCONST_INT(AC_ny_max) - DCONST_INT(AC_ny_min)))); - xx.y = xx.y*(2.0*M_PI/(dsy*(DCONST_INT(AC_ny_max) - DCONST_INT(AC_ny_min)))); - xx.z = xx.z*(2.0*M_PI/(dsz*(DCONST_INT(AC_ny_max) - DCONST_INT(AC_ny_min)))); + xx.x = xx.x*(2.0*M_PI/(dsx*globalGrid.n.x)); + xx.y = xx.y*(2.0*M_PI/(dsy*globalGrid.n.y)); + xx.z = xx.z*(2.0*M_PI/(dsz*globalGrid.n.z)); Scalar cos_phi = cos(phi); Scalar sin_phi = sin(phi); From 738b2abaf3d9ea0b394cbf89ab3101e71e6cb3e7 Mon Sep 17 00:00:00 2001 From: Miikka Vaisala Date: Wed, 7 Aug 2019 18:57:53 +0800 Subject: [PATCH 25/37] Supposedly working autotest for upwinding. 
--- src/standalone/model/model_rk3.cc | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/src/standalone/model/model_rk3.cc b/src/standalone/model/model_rk3.cc index 3471cd1..b51f285 100644 --- a/src/standalone/model/model_rk3.cc +++ b/src/standalone/model/model_rk3.cc @@ -43,6 +43,9 @@ typedef struct { ModelScalar value; ModelVector gradient; ModelMatrix hessian; +#if LUPWD + ModelVector upwind; +#endif } ModelScalarData; typedef struct { @@ -332,6 +335,12 @@ compute_gradient(const int i, const int j, const int k, const ModelScalar* arr) return (ModelVector){derx(i, j, k, arr), dery(i, j, k, arr), derz(i, j, k, arr)}; } +static inline ModelVector +compute_upwind(const int i, const int j, const int k, const ModelScalar* arr) +{ + return (ModelVector){der6x_upwd(i, j, k, arr), der6y_upwd(i, j, k, arr), der6z_upwd(i, j, k, arr)}; +} + static inline ModelMatrix compute_hessian(const int i, const int j, const int k, const ModelScalar* arr) { @@ -356,6 +365,10 @@ read_data(const int i, const int j, const int k, ModelScalar* buf[], const int h // diagonals with all arrays data.hessian = compute_hessian(i, j, k, buf[handle]); +#if LUPWD + data.upwind = compute_upwind(i, j, k, buf[handle]); +#endif + return data; } @@ -559,7 +572,7 @@ upwd_der6(const ModelVectorData& uu, const ModelScalarData& lnrho) ModelScalar uux = fabs(value(uu).x); ModelScalar uuy = fabs(value(uu).y); ModelScalar uuz = fabs(value(uu).z); - return uux*der6x_upwd(lnrho) + uuy*der6y_upwd(lnrho) + uuz*der6z_upwd(lnrho); + return uux*lnrho.upwind.x + uuy*lnrho.upwind.y + uuz*lnrho.upwind.z; } #endif From 7fdbd76aa21118949ddea7d34a0f0e0f76d59da8 Mon Sep 17 00:00:00 2001 From: Miikka Vaisala Date: Wed, 7 Aug 2019 19:05:14 +0800 Subject: [PATCH 26/37] The default stencil_defines.h setting for merge. 
--- acc/mhd_solver/stencil_defines.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/acc/mhd_solver/stencil_defines.h b/acc/mhd_solver/stencil_defines.h index ea4c102..6db725d 100644 --- a/acc/mhd_solver/stencil_defines.h +++ b/acc/mhd_solver/stencil_defines.h @@ -30,8 +30,9 @@ #define LMAGNETIC (1) #define LENTROPY (1) #define LTEMPERATURE (0) -#define LFORCING (0) -#define LUPWD (1) +#define LFORCING (1) +#define LUPWD (0) + #define AC_THERMAL_CONDUCTIVITY (AcReal(0.001)) // TODO: make an actual config parameter From 0bb568642fd8997cf5a2b130fb4986581f562ab0 Mon Sep 17 00:00:00 2001 From: Miikka Vaisala Date: Wed, 7 Aug 2019 19:10:39 +0800 Subject: [PATCH 27/37] Still one bug --- src/standalone/model/model_rk3.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/standalone/model/model_rk3.cc b/src/standalone/model/model_rk3.cc index b51f285..7eb962f 100644 --- a/src/standalone/model/model_rk3.cc +++ b/src/standalone/model/model_rk3.cc @@ -335,11 +335,13 @@ compute_gradient(const int i, const int j, const int k, const ModelScalar* arr) return (ModelVector){derx(i, j, k, arr), dery(i, j, k, arr), derz(i, j, k, arr)}; } +#if LUPWD static inline ModelVector compute_upwind(const int i, const int j, const int k, const ModelScalar* arr) { return (ModelVector){der6x_upwd(i, j, k, arr), der6y_upwd(i, j, k, arr), der6z_upwd(i, j, k, arr)}; } +#endif static inline ModelMatrix compute_hessian(const int i, const int j, const int k, const ModelScalar* arr) From b61617ee0ff3e2cb80d45f11ef24dd727404207e Mon Sep 17 00:00:00 2001 From: jpekkila Date: Wed, 7 Aug 2019 15:53:32 +0300 Subject: [PATCH 28/37] Enabled upwinding by default and updated the model helical forcing with the hotfixed changes from earlier commits. Autotests kinda pass (we get 1 failure but this is likely due to inaccuracies of the trigonometric functions used in helical forcing. The error is very close to the acceptable error bound). 
--- acc/mhd_solver/stencil_defines.h | 3 +- src/standalone/model/model_rk3.cc | 51 ++++++++++++++++--------------- 2 files changed, 27 insertions(+), 27 deletions(-) diff --git a/acc/mhd_solver/stencil_defines.h b/acc/mhd_solver/stencil_defines.h index 6db725d..b4bc622 100644 --- a/acc/mhd_solver/stencil_defines.h +++ b/acc/mhd_solver/stencil_defines.h @@ -31,8 +31,7 @@ #define LENTROPY (1) #define LTEMPERATURE (0) #define LFORCING (1) -#define LUPWD (0) - +#define LUPWD (1) #define AC_THERMAL_CONDUCTIVITY (AcReal(0.001)) // TODO: make an actual config parameter diff --git a/src/standalone/model/model_rk3.cc b/src/standalone/model/model_rk3.cc index 7eb962f..1084794 100644 --- a/src/standalone/model/model_rk3.cc +++ b/src/standalone/model/model_rk3.cc @@ -44,7 +44,7 @@ typedef struct { ModelVector gradient; ModelMatrix hessian; #if LUPWD - ModelVector upwind; + ModelVector upwind; #endif } ModelScalarData; @@ -284,11 +284,11 @@ der6x_upwd(const int i, const int j, const int k, const ModelScalar* arr) return ModelScalar(1.0/60.0)*inv_ds* ( -ModelScalar( 20.0)* arr[IDX(i, j, k)] - +ModelScalar( 15.0)*(arr[IDX(i+1, j, k)] + +ModelScalar( 15.0)*(arr[IDX(i+1, j, k)] + arr[IDX(i-1, j, k)]) - -ModelScalar( 6.0)*(arr[IDX(i+2, j, k)] + -ModelScalar( 6.0)*(arr[IDX(i+2, j, k)] + arr[IDX(i-2, j, k)]) - + arr[IDX(i+3, j, k)] + + arr[IDX(i+3, j, k)] + arr[IDX(i-3, j, k)]); } @@ -299,11 +299,11 @@ der6y_upwd(const int i, const int j, const int k, const ModelScalar* arr) return ModelScalar(1.0/60.0)*inv_ds* ( -ModelScalar( 20.0)* arr[IDX(i, j, k)] - +ModelScalar( 15.0)*(arr[IDX(i, j+1, k)] + +ModelScalar( 15.0)*(arr[IDX(i, j+1, k)] + arr[IDX(i, j-1, k)]) - -ModelScalar( 6.0)*(arr[IDX(i, j+2, k)] + -ModelScalar( 6.0)*(arr[IDX(i, j+2, k)] + arr[IDX(i, j-2, k)]) - + arr[IDX(i, j+3, k)] + + arr[IDX(i, j+3, k)] + arr[IDX(i, j-3, k)]); } @@ -314,11 +314,11 @@ der6z_upwd(const int i, const int j, const int k, const ModelScalar* arr) return ModelScalar(1.0/60.0)*inv_ds* ( -ModelScalar( 
20.0)* arr[IDX(i, j, k )] - +ModelScalar( 15.0)*(arr[IDX(i, j, k+1)] + +ModelScalar( 15.0)*(arr[IDX(i, j, k+1)] + arr[IDX(i, j, k-1)]) - -ModelScalar( 6.0)*(arr[IDX(i, j, k+2)] + -ModelScalar( 6.0)*(arr[IDX(i, j, k+2)] + arr[IDX(i, j, k-2)]) - + arr[IDX(i, j, k+3)] + + arr[IDX(i, j, k+3)] + arr[IDX(i, j, k-3)]); } #endif @@ -581,7 +581,7 @@ upwd_der6(const ModelVectorData& uu, const ModelScalarData& lnrho) static inline ModelScalar continuity(const ModelVectorData& uu, const ModelScalarData& lnrho) { - return -dot(value(uu), gradient(lnrho)) + return -dot(value(uu), gradient(lnrho)) #if LUPWD //This is a corrective hyperdiffusion term for upwinding. + upwd_der6(uu, lnrho) @@ -760,23 +760,24 @@ helical_forcing(ModelScalar magnitude, ModelVector k_force, ModelVector xx, Mode ModelVector ff_im, ModelScalar phi) { (void)magnitude; // WARNING: unused - xx.x = xx.x * (2.0l * M_PI / (get(AC_dsx) * (get(AC_ny_max) - get(AC_ny_min)))); - xx.y = xx.y * (2.0l * M_PI / (get(AC_dsy) * (get(AC_ny_max) - get(AC_ny_min)))); - xx.z = xx.z * (2.0l * M_PI / (get(AC_dsz) * (get(AC_ny_max) - get(AC_ny_min)))); + xx.x = xx.x*(2.0*M_PI/(get(AC_dsx)*get(AC_nx))); + xx.y = xx.y*(2.0*M_PI/(get(AC_dsy)*get(AC_ny))); + xx.z = xx.z*(2.0*M_PI/(get(AC_dsz)*get(AC_nz))); - ModelScalar cosl_phi = cosl(phi); - ModelScalar sinl_phi = sinl(phi); - ModelScalar cosl_k_dox_x = cosl(dot(k_force, xx)); - ModelScalar sinl_k_dox_x = sinl(dot(k_force, xx)); + ModelScalar cos_phi = cosl(phi); + ModelScalar sin_phi = sinl(phi); + ModelScalar cos_k_dox_x = cosl(dot(k_force, xx)); + ModelScalar sin_k_dox_x = sinl(dot(k_force, xx)); // Phase affect only the x-component - // ModelScalar real_comp = cosl_k_dox_x; - // ModelScalar imag_comp = sinl_k_dox_x; - ModelScalar real_comp_phase = cosl_k_dox_x * cosl_phi - sinl_k_dox_x * sinl_phi; - ModelScalar imag_comp_phase = cosl_k_dox_x * sinl_phi + sinl_k_dox_x * cosl_phi; + //Scalar real_comp = cos_k_dox_x; + //Scalar imag_comp = sin_k_dox_x; + ModelScalar 
real_comp_phase = cos_k_dox_x*cos_phi - sin_k_dox_x*sin_phi; + ModelScalar imag_comp_phase = cos_k_dox_x*sin_phi + sin_k_dox_x*cos_phi; - ModelVector force = (ModelVector){ff_re.x * real_comp_phase - ff_im.x * imag_comp_phase, - ff_re.y * real_comp_phase - ff_im.y * imag_comp_phase, - ff_re.z * real_comp_phase - ff_im.z * imag_comp_phase}; + + ModelVector force = (ModelVector){ ff_re.x*real_comp_phase - ff_im.x*imag_comp_phase, + ff_re.y*real_comp_phase - ff_im.y*imag_comp_phase, + ff_re.z*real_comp_phase - ff_im.z*imag_comp_phase}; return force; } From e2f5cced1e52b33c598b4ebb3646490136247227 Mon Sep 17 00:00:00 2001 From: jpekkila Date: Wed, 7 Aug 2019 16:08:03 +0300 Subject: [PATCH 29/37] Renamed dox -> dot --- acc/mhd_solver/stencil_process.sps | 18 +++++++++--------- src/standalone/model/model_rk3.cc | 12 ++++++------ 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/acc/mhd_solver/stencil_process.sps b/acc/mhd_solver/stencil_process.sps index 2adb5ba..cc692fe 100644 --- a/acc/mhd_solver/stencil_process.sps +++ b/acc/mhd_solver/stencil_process.sps @@ -222,11 +222,11 @@ helical_forcing(Scalar magnitude, Vector k_force, Vector xx, Vector ff_re, Vecto // JP: This looks wrong: // 1) Should it be dsx * nx instead of dsx * ny? // 2) Should you also use globalGrid.n instead of the local n? - // MV: You are rigth. Made a quickfix. I did not see the error because multigpu is split - // in z direction not y direction. + // MV: You are rigth. Made a quickfix. I did not see the error because multigpu is split + // in z direction not y direction. // 3) Also final point: can we do this with vectors/quaternions instead? // Tringonometric functions are much more expensive and inaccurate/ - // MV: Good idea. No an immediate priority. + // MV: Good idea. No an immediate priority. 
// Fun related article: // https://randomascii.wordpress.com/2014/10/09/intel-underestimates-error-bounds-by-1-3-quintillion/ xx.x = xx.x*(2.0*M_PI/(dsx*globalGrid.n.x)); @@ -235,13 +235,13 @@ helical_forcing(Scalar magnitude, Vector k_force, Vector xx, Vector ff_re, Vecto Scalar cos_phi = cos(phi); Scalar sin_phi = sin(phi); - Scalar cos_k_dox_x = cos(dot(k_force, xx)); - Scalar sin_k_dox_x = sin(dot(k_force, xx)); + Scalar cos_k_dot_x = cos(dot(k_force, xx)); + Scalar sin_k_dot_x = sin(dot(k_force, xx)); // Phase affect only the x-component - //Scalar real_comp = cos_k_dox_x; - //Scalar imag_comp = sin_k_dox_x; - Scalar real_comp_phase = cos_k_dox_x*cos_phi - sin_k_dox_x*sin_phi; - Scalar imag_comp_phase = cos_k_dox_x*sin_phi + sin_k_dox_x*cos_phi; + //Scalar real_comp = cos_k_dot_x; + //Scalar imag_comp = sin_k_dot_x; + Scalar real_comp_phase = cos_k_dot_x*cos_phi - sin_k_dot_x*sin_phi; + Scalar imag_comp_phase = cos_k_dot_x*sin_phi + sin_k_dot_x*cos_phi; Vector force = (Vector){ ff_re.x*real_comp_phase - ff_im.x*imag_comp_phase, diff --git a/src/standalone/model/model_rk3.cc b/src/standalone/model/model_rk3.cc index 1084794..3900f0c 100644 --- a/src/standalone/model/model_rk3.cc +++ b/src/standalone/model/model_rk3.cc @@ -766,13 +766,13 @@ helical_forcing(ModelScalar magnitude, ModelVector k_force, ModelVector xx, Mode ModelScalar cos_phi = cosl(phi); ModelScalar sin_phi = sinl(phi); - ModelScalar cos_k_dox_x = cosl(dot(k_force, xx)); - ModelScalar sin_k_dox_x = sinl(dot(k_force, xx)); + ModelScalar cos_k_dot_x = cosl(dot(k_force, xx)); + ModelScalar sin_k_dot_x = sinl(dot(k_force, xx)); // Phase affect only the x-component - //Scalar real_comp = cos_k_dox_x; - //Scalar imag_comp = sin_k_dox_x; - ModelScalar real_comp_phase = cos_k_dox_x*cos_phi - sin_k_dox_x*sin_phi; - ModelScalar imag_comp_phase = cos_k_dox_x*sin_phi + sin_k_dox_x*cos_phi; + //Scalar real_comp = cos_k_dot_x; + //Scalar imag_comp = sin_k_dot_x; + ModelScalar real_comp_phase = 
cos_k_dot_x*cos_phi - sin_k_dot_x*sin_phi; + ModelScalar imag_comp_phase = cos_k_dot_x*sin_phi + sin_k_dot_x*cos_phi; ModelVector force = (ModelVector){ ff_re.x*real_comp_phase - ff_im.x*imag_comp_phase, From c38218da3baf2a93127106cf78c0df3bc1862db8 Mon Sep 17 00:00:00 2001 From: jpekkila Date: Wed, 7 Aug 2019 16:22:55 +0300 Subject: [PATCH 30/37] SRUN_COMMAND has to be exported for it to be available in autotest.sh --- scripts/autotest.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/autotest.sh b/scripts/autotest.sh index 49f0b57..f8e20a5 100755 --- a/scripts/autotest.sh +++ b/scripts/autotest.sh @@ -5,10 +5,13 @@ # source ./sourceme.sh # autotest.sh # -# If you need slurm or to pass something before ./ac_run, set the variable +# If you need slurm or to pass something before ./ac_run, export the variable # SRUN_COMMAND before calling this script. # # F.ex. on Taito SRUN_COMMAND="srun --gres=gpu:k80:4 --mem=24000 -t 00:14:59 -p gputest --cpus-per-task 1 -n 1" +# export SRUN_COMMAND +# autotest.sh +echo "SRUN_COMMAND: " ${SRUN_COMMAND} results=() From 5432d20b1b15060c6a82a81b3cb2036c0b467bfe Mon Sep 17 00:00:00 2001 From: jpekkila Date: Wed, 7 Aug 2019 16:25:03 +0300 Subject: [PATCH 31/37] Removed a depreceated auto-optimization script. The optimization is done during acInit() at runtime instead --- scripts/auto_optimize.sh | 51 ---------------------------------------- 1 file changed, 51 deletions(-) delete mode 100755 scripts/auto_optimize.sh diff --git a/scripts/auto_optimize.sh b/scripts/auto_optimize.sh deleted file mode 100755 index c86fa63..0000000 --- a/scripts/auto_optimize.sh +++ /dev/null @@ -1,51 +0,0 @@ -#!/bin/bash - -# Run this in your build directory (cd build && ../scripts/auto_optimize.sh) -# Generates a ${BENCHMARK_FILE} which contains the threadblock dims and other -# constants used in the integration in addition to the time used. 
- -MAX_THREADS=1024 # Max size of the thread block, depends on hardware - -BENCHMARK_FILE="benchmark.out" -TBCONFCREATOR_SRC_PATH="../scripts/gen_rk3_threadblockconf.c" -TBCONFFILE_DST_PATH="../src/core/kernels" - -C_COMPILER_NAME="gcc" - -rm ${BENCHMARK_FILE} - -for (( tz=2; tz<=8; tz*=2)) -do -for (( ty=1; ty<=1; ty+=1)) -do -for (( tx=16; tx<=64; tx*=2)) -do - -if ( (${tx}*${ty}*${tz}) > ${MAX_THREADS}) -then break -fi - -for (( launch_bound=1; launch_bound<=8; launch_bound*=2)) -do -for (( elems_per_thread=1; elems_per_thread<=128; elems_per_thread*=2)) -do - # Generate the threadblock configuration - ${C_COMPILER_NAME} ${TBCONFCREATOR_SRC_PATH} -o gen_rk3_threadblockconf - ./gen_rk3_threadblockconf ${tx} ${ty} ${tz} ${elems_per_thread} ${launch_bound} - rm gen_rk3_threadblockconf - mv rk3_threadblock.conf ${TBCONFFILE_DST_PATH} - - # Compile and run the test build - cmake -DBUILD_DEBUG=OFF -DDOUBLE_PRECISION=OFF -DAUTO_OPTIMIZE=ON .. && make -j - #if ./ac_run -t; then - # echo Success - ./ac_run -b - #else - # echo fail! - #fi -done -done -done -done -done - From fd94b6321dd2a6f6fd2fce06fb74d8d8c077fc86 Mon Sep 17 00:00:00 2001 From: jpekkila Date: Wed, 7 Aug 2019 18:16:34 +0300 Subject: [PATCH 32/37] Renamed globalGrid.n to globalGridN --- acc/mhd_solver/stencil_process.sps | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/acc/mhd_solver/stencil_process.sps b/acc/mhd_solver/stencil_process.sps index 1f6f190..813f51a 100644 --- a/acc/mhd_solver/stencil_process.sps +++ b/acc/mhd_solver/stencil_process.sps @@ -229,9 +229,9 @@ helical_forcing(Scalar magnitude, Vector k_force, Vector xx, Vector ff_re, Vecto // MV: Good idea. No an immediate priority. 
// Fun related article: // https://randomascii.wordpress.com/2014/10/09/intel-underestimates-error-bounds-by-1-3-quintillion/ - xx.x = xx.x*(2.0*M_PI/(dsx*globalGrid.n.x)); - xx.y = xx.y*(2.0*M_PI/(dsy*globalGrid.n.y)); - xx.z = xx.z*(2.0*M_PI/(dsz*globalGrid.n.z)); + xx.x = xx.x*(2.0*M_PI/(dsx*globalGridN.x)); + xx.y = xx.y*(2.0*M_PI/(dsy*globalGridN.y)); + xx.z = xx.z*(2.0*M_PI/(dsz*globalGridN.z)); Scalar cos_phi = cos(phi); Scalar sin_phi = sin(phi); From c2bd5ae3e68deeb8146b936acbec230e95e5c68e Mon Sep 17 00:00:00 2001 From: jpekkila Date: Wed, 7 Aug 2019 18:17:03 +0300 Subject: [PATCH 33/37] Simplified the optimized multi-GPU integration function --- src/core/node.cu | 105 ++++++++++++++++------------------------------- 1 file changed, 35 insertions(+), 70 deletions(-) diff --git a/src/core/node.cu b/src/core/node.cu index 5166fa1..5739cae 100644 --- a/src/core/node.cu +++ b/src/core/node.cu @@ -545,8 +545,8 @@ acNodeIntegrate(const Node node, const AcReal dt) // n0 n1 n2 n3 const int3 n0 = (int3){NGHOST, NGHOST, NGHOST}; const int3 n1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST}; - const int3 n2 = node->grid.n; - const int3 n3 = n0 + node->grid.n; + // const int3 n2 = node->grid.n; + // const int3 n3 = n0 + node->grid.n; for (int isubstep = 0; isubstep < 3; ++isubstep) { acNodeSynchronizeStream(node, STREAM_ALL); @@ -556,85 +556,50 @@ acNodeIntegrate(const Node node, const AcReal dt) acNodeSynchronizeStream(node, STREAM_ALL); // Inner inner for (int i = 0; i < node->num_devices; ++i) { - const int3 m1 = n1 + (int3){0, 0, i * node->subgrid.n.z}; - const int3 m2 = m1 + node->subgrid.n - (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST}; - acNodeIntegrateSubstep(node, STREAM_16, isubstep, m1, m2, dt); + const int3 m1 = n1; + const int3 m2 = node->subgrid.n; + acDeviceIntegrateSubstep(node->devices[i], STREAM_16, isubstep, m1, m2, dt); } for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) { - const int num_vertices = node->subgrid.m.x * node->subgrid.m.y * 
NGHOST; - for (int device_id = 0; device_id < node->num_devices; ++device_id) { - // ...|ooooxxx|... -> xxx|ooooooo|... - { - const int3 src = (int3){0, 0, node->subgrid.n.z}; - const int3 dst = (int3){0, 0, 0}; - acDeviceTransferVertexBufferWithOffset( - node->devices[device_id], (Stream)vtxbuf, (VertexBufferHandle)vtxbuf, src, - dst, num_vertices, node->devices[(device_id + 1) % node->num_devices]); - } - // ...|ooooooo|xxx <- ...|xxxoooo|... - { - const int3 src = (int3){0, 0, NGHOST}; - const int3 dst = (int3){0, 0, NGHOST + node->subgrid.n.z}; - acDeviceTransferVertexBufferWithOffset( - node->devices[device_id], (Stream)vtxbuf, (VertexBufferHandle)vtxbuf, src, - dst, num_vertices, - node->devices[(device_id - 1 + node->num_devices) % node->num_devices]); - } - } + acNodeSynchronizeVertexBuffer(node, (Stream)vtxbuf, (VertexBufferHandle)vtxbuf); + global_boundcondstep(node, (Stream)vtxbuf, (VertexBufferHandle)vtxbuf); } - for (int vtxbuf = 0; vtxbuf < 2 * NUM_VTXBUF_HANDLES; ++vtxbuf) { + for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) { acNodeSynchronizeStream(node, (Stream)vtxbuf); } - // Inner outer - for (int i = 0; i < node->num_devices - 1; ++i) { - const int3 m1 = n1 + (int3){0, 0, (i + 1) * node->subgrid.n.z - 2 * NGHOST}; - const int3 m2 = m1 + (int3){node->subgrid.n.x - 2 * NGHOST, - node->subgrid.n.y - 2 * NGHOST, 2 * NGHOST}; - acNodeIntegrateSubstep(node, STREAM_0, isubstep, m1, m2, dt); - } - // Outer - // Front - { - const int3 m1 = (int3){n0.x, n0.y, n0.z}; - const int3 m2 = (int3){n3.x, n3.y, n1.z}; - acNodeIntegrateSubstep(node, STREAM_1, isubstep, m1, m2, dt); - } - // Back - { - const int3 m1 = (int3){n0.x, n0.y, n2.z}; - const int3 m2 = (int3){n3.x, n3.y, n3.z}; - acNodeIntegrateSubstep(node, STREAM_2, isubstep, m1, m2, dt); + for (int i = 0; i < node->num_devices; ++i) { // Front + const int3 m1 = (int3){NGHOST, NGHOST, NGHOST}; + const int3 m2 = m1 + (int3){node->subgrid.n.x, node->subgrid.n.y, NGHOST}; + 
acDeviceIntegrateSubstep(node->devices[i], STREAM_0, isubstep, m1, m2, dt); } - - // Top - { - const int3 m1 = (int3){n0.x, n0.y, n1.z}; - const int3 m2 = (int3){n3.x, n1.y, n2.z}; - acNodeIntegrateSubstep(node, STREAM_3, isubstep, m1, m2, dt); + for (int i = 0; i < node->num_devices; ++i) { // Back + const int3 m1 = (int3){NGHOST, NGHOST, node->subgrid.n.z}; + const int3 m2 = m1 + (int3){node->subgrid.n.x, node->subgrid.n.y, NGHOST}; + acDeviceIntegrateSubstep(node->devices[i], STREAM_1, isubstep, m1, m2, dt); } - - // Bottom - { - const int3 m1 = (int3){n0.x, n2.y, n1.z}; - const int3 m2 = (int3){n3.x, n3.y, n2.z}; - acNodeIntegrateSubstep(node, STREAM_4, isubstep, m1, m2, dt); + for (int i = 0; i < node->num_devices; ++i) { // Bottom + const int3 m1 = (int3){NGHOST, NGHOST, 2 * NGHOST}; + const int3 m2 = m1 + (int3){node->subgrid.n.x, NGHOST, node->subgrid.n.z - 2 * NGHOST}; + acDeviceIntegrateSubstep(node->devices[i], STREAM_2, isubstep, m1, m2, dt); } - - // Left - { - const int3 m1 = (int3){n0.x, n1.y, n1.z}; - const int3 m2 = (int3){n1.x, n2.y, n2.z}; - acNodeIntegrateSubstep(node, STREAM_5, isubstep, m1, m2, dt); + for (int i = 0; i < node->num_devices; ++i) { // Top + const int3 m1 = (int3){NGHOST, node->subgrid.n.y, 2 * NGHOST}; + const int3 m2 = m1 + (int3){node->subgrid.n.x, NGHOST, node->subgrid.n.z - 2 * NGHOST}; + acDeviceIntegrateSubstep(node->devices[i], STREAM_3, isubstep, m1, m2, dt); } - - // Right - { - const int3 m1 = (int3){n2.x, n1.y, n1.z}; - const int3 m2 = (int3){n3.x, n2.y, n2.z}; - acNodeIntegrateSubstep(node, STREAM_6, isubstep, m1, m2, dt); + for (int i = 0; i < node->num_devices; ++i) { // Left + const int3 m1 = (int3){NGHOST, 2 * NGHOST, 2 * NGHOST}; + const int3 m2 = m1 + (int3){NGHOST, node->subgrid.n.y - 2 * NGHOST, + node->subgrid.n.z - 2 * NGHOST}; + acDeviceIntegrateSubstep(node->devices[i], STREAM_4, isubstep, m1, m2, dt); + } + for (int i = 0; i < node->num_devices; ++i) { // Right + const int3 m1 = (int3){node->subgrid.n.x, 
2 * NGHOST, 2 * NGHOST}; + const int3 m2 = m1 + (int3){NGHOST, node->subgrid.n.y - 2 * NGHOST, + node->subgrid.n.z - 2 * NGHOST}; + acDeviceIntegrateSubstep(node->devices[i], STREAM_5, isubstep, m1, m2, dt); } - acNodeSwapBuffers(node); } acNodeSynchronizeStream(node, STREAM_ALL); From 1525e0603f25c2c067bfabd2dbeb94504c656285 Mon Sep 17 00:00:00 2001 From: jpekkila Date: Wed, 7 Aug 2019 19:08:52 +0300 Subject: [PATCH 34/37] Added some preliminary pragma omps and verified that acIntegrate works as it should. --- src/core/CMakeLists.txt | 4 +++- src/core/astaroth.cu | 3 +++ src/core/node.cu | 29 ++++++++++++++++++++++++++--- src/standalone/autotest.cc | 5 +++-- 4 files changed, 35 insertions(+), 6 deletions(-) diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index 96d0202..642c7f1 100644 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -4,6 +4,7 @@ ## Find packages find_package(CUDA REQUIRED) +# find_package(OpenMP REQUIRED) ## Architecture and optimization flags set(CUDA_ARCH_FLAGS -gencode arch=compute_37,code=sm_37 @@ -12,7 +13,8 @@ set(CUDA_ARCH_FLAGS -gencode arch=compute_37,code=sm_37 -gencode arch=compute_61,code=sm_61 -lineinfo -ftz=true # Flush denormalized floats to zero - -std=c++11) + -std=c++11 + )# --compiler-options ${OpenMP_CXX_FLAGS}) #--maxrregcount=255 # -Xptxas -dlcm=ca opt-in to cache all global loads to L1/texture cache # =cg to opt out diff --git a/src/core/astaroth.cu b/src/core/astaroth.cu index f4a8beb..7393fea 100644 --- a/src/core/astaroth.cu +++ b/src/core/astaroth.cu @@ -73,8 +73,11 @@ acStore(AcMesh* host_mesh) AcResult acIntegrate(const AcReal dt) { + /* acNodeIntegrate(nodes[0], dt); return acBoundcondStep(); + */ + return acNodeIntegrate(nodes[0], dt); } AcResult diff --git a/src/core/node.cu b/src/core/node.cu index 5739cae..fa8a35d 100644 --- a/src/core/node.cu +++ b/src/core/node.cu @@ -162,6 +162,7 @@ acNodeCreate(const int id, const AcMeshInfo node_config, Node* node_handle) #endif // Initialize the 
devices + // #pragma omp parallel for for (int i = 0; i < node->num_devices; ++i) { const int3 multinode_offset = (int3){0, 0, 0}; // Placeholder const int3 multigpu_offset = (int3){0, 0, i * node->subgrid.n.z}; @@ -173,6 +174,7 @@ acNodeCreate(const int id, const AcMeshInfo node_config, Node* node_handle) } // Enable peer access + // #pragma omp parallel for for (int i = 0; i < node->num_devices; ++i) { const int front = (i + 1) % node->num_devices; const int back = (i - 1 + node->num_devices) % node->num_devices; @@ -205,6 +207,7 @@ acNodeDestroy(Node node) { acNodeSynchronizeStream(node, STREAM_ALL); + // #pragma omp parallel for for (int i = 0; i < node->num_devices; ++i) { acDeviceDestroy(node->devices[i]); } @@ -241,6 +244,7 @@ acNodeAutoOptimize(const Node node) AcResult acNodeSynchronizeStream(const Node node, const Stream stream) { + // #pragma omp parallel for for (int i = 0; i < node->num_devices; ++i) { acDeviceSynchronizeStream(node->devices[i], stream); } @@ -267,6 +271,7 @@ acNodeSynchronizeVertexBuffer(const Node node, const Stream stream, const size_t num_vertices = node->subgrid.m.x * node->subgrid.m.y * NGHOST; + // #pragma omp parallel for for (int i = 0; i < node->num_devices - 1; ++i) { // ...|ooooxxx|... -> xxx|ooooooo|... const int3 src = (int3){0, 0, node->subgrid.n.z}; @@ -278,6 +283,7 @@ acNodeSynchronizeVertexBuffer(const Node node, const Stream stream, acDeviceTransferVertexBufferWithOffset(src_device, stream, vtxbuf_handle, src, dst, num_vertices, dst_device); } + // #pragma omp parallel for for (int i = 1; i < node->num_devices; ++i) { // ...|ooooooo|xxx <- ...|xxxoooo|... 
const int3 src = (int3){0, 0, NGHOST}; @@ -305,6 +311,7 @@ acNodeSynchronizeMesh(const Node node, const Stream stream) AcResult acNodeSwapBuffers(const Node node) { + // #pragma omp parallel for for (int i = 0; i < node->num_devices; ++i) { acDeviceSwapBuffers(node->devices[i]); } @@ -316,6 +323,7 @@ acNodeLoadConstant(const Node node, const Stream stream, const AcRealParam param const AcReal value) { acNodeSynchronizeStream(node, stream); + // #pragma omp parallel for for (int i = 0; i < node->num_devices; ++i) { acDeviceLoadConstant(node->devices[i], stream, param, value); } @@ -329,6 +337,7 @@ acNodeLoadVertexBufferWithOffset(const Node node, const Stream stream, const AcM { acNodeSynchronizeStream(node, stream); // See the beginning of the file for an explanation of the index mapping + // // #pragma omp parallel for // #pragma omp parallel for for (int i = 0; i < node->num_devices; ++i) { const int3 d0 = (int3){0, 0, i * node->subgrid.n.z}; // DECOMPOSITION OFFSET HERE @@ -404,6 +413,7 @@ acNodeStoreVertexBufferWithOffset(const Node node, const Stream stream, const int3 dst, const int num_vertices, AcMesh* host_mesh) { acNodeSynchronizeStream(node, stream); + // #pragma omp parallel for for (int i = 0; i < node->num_devices; ++i) { const int3 d0 = (int3){0, 0, i * node->subgrid.n.z}; // DECOMPOSITION OFFSET HERE const int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y, d0.z + node->subgrid.m.z}; @@ -466,6 +476,7 @@ acNodeIntegrateSubstep(const Node node, const Stream stream, const int isubstep, { acNodeSynchronizeStream(node, stream); + // #pragma omp parallel for for (int i = 0; i < node->num_devices; ++i) { // DECOMPOSITION OFFSET HERE const int3 d0 = (int3){NGHOST, NGHOST, NGHOST + i * node->subgrid.n.z}; @@ -490,6 +501,7 @@ local_boundcondstep(const Node node, const Stream stream, const VertexBufferHand if (node->num_devices > 1) { // Local boundary conditions + // #pragma omp parallel for for (int i = 0; i < node->num_devices; ++i) { const int3 d0 = 
(int3){0, 0, NGHOST}; // DECOMPOSITION OFFSET HERE const int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y, d0.z + node->subgrid.n.z}; @@ -543,8 +555,8 @@ acNodeIntegrate(const Node node, const AcReal dt) // xxx|OOO OOOOOOOOO OOO|xxx // ^ ^ ^ ^ // n0 n1 n2 n3 - const int3 n0 = (int3){NGHOST, NGHOST, NGHOST}; - const int3 n1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST}; + // const int3 n0 = (int3){NGHOST, NGHOST, NGHOST}; + // const int3 n1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST}; // const int3 n2 = node->grid.n; // const int3 n3 = n0 + node->grid.n; @@ -554,12 +566,15 @@ acNodeIntegrate(const Node node, const AcReal dt) local_boundcondstep(node, (Stream)vtxbuf, (VertexBufferHandle)vtxbuf); } acNodeSynchronizeStream(node, STREAM_ALL); + // Inner inner + // #pragma omp parallel for for (int i = 0; i < node->num_devices; ++i) { - const int3 m1 = n1; + const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST}; const int3 m2 = node->subgrid.n; acDeviceIntegrateSubstep(node->devices[i], STREAM_16, isubstep, m1, m2, dt); } + for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) { acNodeSynchronizeVertexBuffer(node, (Stream)vtxbuf, (VertexBufferHandle)vtxbuf); global_boundcondstep(node, (Stream)vtxbuf, (VertexBufferHandle)vtxbuf); @@ -568,32 +583,38 @@ acNodeIntegrate(const Node node, const AcReal dt) acNodeSynchronizeStream(node, (Stream)vtxbuf); } + // #pragma omp parallel for for (int i = 0; i < node->num_devices; ++i) { // Front const int3 m1 = (int3){NGHOST, NGHOST, NGHOST}; const int3 m2 = m1 + (int3){node->subgrid.n.x, node->subgrid.n.y, NGHOST}; acDeviceIntegrateSubstep(node->devices[i], STREAM_0, isubstep, m1, m2, dt); } + // #pragma omp parallel for for (int i = 0; i < node->num_devices; ++i) { // Back const int3 m1 = (int3){NGHOST, NGHOST, node->subgrid.n.z}; const int3 m2 = m1 + (int3){node->subgrid.n.x, node->subgrid.n.y, NGHOST}; acDeviceIntegrateSubstep(node->devices[i], STREAM_1, isubstep, m1, m2, dt); } + // #pragma omp parallel for for 
(int i = 0; i < node->num_devices; ++i) { // Bottom const int3 m1 = (int3){NGHOST, NGHOST, 2 * NGHOST}; const int3 m2 = m1 + (int3){node->subgrid.n.x, NGHOST, node->subgrid.n.z - 2 * NGHOST}; acDeviceIntegrateSubstep(node->devices[i], STREAM_2, isubstep, m1, m2, dt); } + // #pragma omp parallel for for (int i = 0; i < node->num_devices; ++i) { // Top const int3 m1 = (int3){NGHOST, node->subgrid.n.y, 2 * NGHOST}; const int3 m2 = m1 + (int3){node->subgrid.n.x, NGHOST, node->subgrid.n.z - 2 * NGHOST}; acDeviceIntegrateSubstep(node->devices[i], STREAM_3, isubstep, m1, m2, dt); } + // #pragma omp parallel for for (int i = 0; i < node->num_devices; ++i) { // Left const int3 m1 = (int3){NGHOST, 2 * NGHOST, 2 * NGHOST}; const int3 m2 = m1 + (int3){NGHOST, node->subgrid.n.y - 2 * NGHOST, node->subgrid.n.z - 2 * NGHOST}; acDeviceIntegrateSubstep(node->devices[i], STREAM_4, isubstep, m1, m2, dt); } + // #pragma omp parallel for for (int i = 0; i < node->num_devices; ++i) { // Right const int3 m1 = (int3){node->subgrid.n.x, 2 * NGHOST, 2 * NGHOST}; const int3 m2 = m1 + (int3){NGHOST, node->subgrid.n.y - 2 * NGHOST, @@ -663,6 +684,7 @@ acNodeReduceScal(const Node node, const Stream stream, const ReductionType rtype acNodeSynchronizeStream(node, STREAM_ALL); AcReal results[node->num_devices]; + // #pragma omp parallel for for (int i = 0; i < node->num_devices; ++i) { acDeviceReduceScal(node->devices[i], stream, rtype, vtxbuf_handle, &results[i]); } @@ -679,6 +701,7 @@ acNodeReduceVec(const Node node, const Stream stream, const ReductionType rtype, acNodeSynchronizeStream(node, STREAM_ALL); AcReal results[node->num_devices]; + // #pragma omp parallel for for (int i = 0; i < node->num_devices; ++i) { acDeviceReduceVec(node->devices[i], stream, rtype, a, b, c, &results[i]); } diff --git a/src/standalone/autotest.cc b/src/standalone/autotest.cc index e65a783..bf8ca65 100644 --- a/src/standalone/autotest.cc +++ b/src/standalone/autotest.cc @@ -29,13 +29,13 @@ #include #include 
"config_loader.h" -#include "src/core/math_utils.h" #include "model/host_forcing.h" #include "model/host_memory.h" #include "model/host_timestep.h" #include "model/model_boundconds.h" #include "model/model_reduce.h" #include "model/model_rk3.h" +#include "src/core/math_utils.h" #include "src/core/errchk.h" @@ -431,8 +431,9 @@ check_rk3(const AcMeshInfo& mesh_info) acIntegrate(dt); model_rk3(dt, model_mesh); - boundconds(model_mesh->info, model_mesh); } + boundconds(model_mesh->info, model_mesh); + acBoundcondStep(); acStore(gpu_mesh); bool is_acceptable = verify_meshes(*model_mesh, *gpu_mesh); From d8eb2723b4326f74d239a4c9c2bf47abc5263b62 Mon Sep 17 00:00:00 2001 From: jpekkila Date: Wed, 7 Aug 2019 19:10:04 +0300 Subject: [PATCH 35/37] Added an acBoundconds() call before acStore in autotest.cc --- src/standalone/autotest.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/standalone/autotest.cc b/src/standalone/autotest.cc index bf8ca65..f8a5312 100644 --- a/src/standalone/autotest.cc +++ b/src/standalone/autotest.cc @@ -727,6 +727,7 @@ run_autotest(void) // Device integration step acIntegrate(dt); + acBoundcondStep(); acStore(candidate_mesh); // Check fields From 2b3f9d75aff47eb3108eb1187cb90e97ef16ceff Mon Sep 17 00:00:00 2001 From: jpekkila Date: Wed, 7 Aug 2019 19:20:40 +0300 Subject: [PATCH 36/37] Ensured that acBoundcondStep is called everywhere in the program before acStore --- src/standalone/benchmark.cc | 2 -- src/standalone/renderer.cc | 8 +++--- src/standalone/simulation.cc | 48 +++++++++++++++++++----------------- 3 files changed, 29 insertions(+), 29 deletions(-) diff --git a/src/standalone/benchmark.cc b/src/standalone/benchmark.cc index 24f7f15..1a6a919 100644 --- a/src/standalone/benchmark.cc +++ b/src/standalone/benchmark.cc @@ -197,7 +197,6 @@ run_benchmark(void) double(results[int(NTH_PERCENTILE * NUM_ITERS)]), int(NTH_PERCENTILE * 100), mesh_info.int_params[AC_nx]); - acStore(mesh); acQuit(); acmesh_destroy(mesh); } @@ -269,7 +268,6 @@ 
run_benchmark(void) write_result_to_file(wallclock * 1e3f / steps); #endif - acStore(mesh); acQuit(); acmesh_destroy(mesh); diff --git a/src/standalone/renderer.cc b/src/standalone/renderer.cc index 57292c1..2e8945f 100644 --- a/src/standalone/renderer.cc +++ b/src/standalone/renderer.cc @@ -32,13 +32,13 @@ #include // memcpy #include "config_loader.h" -#include "src/core/errchk.h" -#include "src/core/math_utils.h" #include "model/host_forcing.h" #include "model/host_memory.h" #include "model/host_timestep.h" #include "model/model_reduce.h" #include "model/model_rk3.h" +#include "src/core/errchk.h" +#include "src/core/math_utils.h" #include "timer_hires.h" // Window @@ -410,9 +410,10 @@ run_renderer(void) /* Render */ const float timer_diff_sec = timer_diff_nsec(frame_timer) / 1e9f; if (timer_diff_sec >= desired_frame_time) { - // acStore(mesh); const int num_vertices = mesh->info.int_params[AC_mxy]; const int3 dst = (int3){0, 0, k_slice}; + acBoundcondStep(); + // acStore(mesh); acStoreWithOffset(dst, num_vertices, mesh); acSynchronizeStream(STREAM_ALL); renderer_draw(*mesh); // Bottleneck is here @@ -422,7 +423,6 @@ run_renderer(void) } printf("Wallclock time %f s\n", double(timer_diff_nsec(wallclock) / 1e9f)); - acStore(mesh); acQuit(); acmesh_destroy(mesh); diff --git a/src/standalone/simulation.cc b/src/standalone/simulation.cc index ebd527e..51a9a76 100644 --- a/src/standalone/simulation.cc +++ b/src/standalone/simulation.cc @@ -27,13 +27,13 @@ #include "run.h" #include "config_loader.h" -#include "src/core/errchk.h" -#include "src/core/math_utils.h" #include "model/host_forcing.h" #include "model/host_memory.h" #include "model/host_timestep.h" #include "model/model_reduce.h" #include "model/model_rk3.h" +#include "src/core/errchk.h" +#include "src/core/math_utils.h" #include "timer_hires.h" #include @@ -205,6 +205,7 @@ run_simulation(void) write_mesh_info(&mesh_info); print_diagnostics(0, AcReal(.0), t_step, diag_file); + acBoundcondStep(); acStore(mesh); 
save_mesh(*mesh, 0, t_step); @@ -218,13 +219,13 @@ run_simulation(void) /* initialize random seed: */ srand(312256655); - //TODO_SINK. init_sink_particle() + // TODO_SINK. init_sink_particle() // Initialize the basic variables of the sink particle to a suitable initial value. // 1. Location of the particle // 2. Mass of the particle // (3. Velocity of the particle) - // This at the level of Host in this case. - // acUpdate_sink_particle() will do the similar trick to the device. + // This at the level of Host in this case. + // acUpdate_sink_particle() will do the similar trick to the device. /* Step the simulation */ for (int i = 1; i < max_steps; ++i) { @@ -236,26 +237,26 @@ run_simulation(void) loadForcingParamsToDevice(forcing_params); #endif - //TODO_SINK acUpdate_sink_particle() - // Update properties of the sing particle for acIntegrate(). Essentially: - // 1. Location of the particle - // 2. Mass of the particle - // (3. Velocity of the particle) - // These can be used for calculating he gravitational field. + // TODO_SINK acUpdate_sink_particle() + // Update properties of the sing particle for acIntegrate(). Essentially: + // 1. Location of the particle + // 2. Mass of the particle + // (3. Velocity of the particle) + // These can be used for calculating he gravitational field. acIntegrate(dt); - //TODO_SINK acAdvect_sink_particle() - // THIS IS OPTIONAL. We will start from unmoving particle. - // 1. Calculate the equation of motion for the sink particle. - // NOTE: Might require embedding with acIntegrate(dt). + // TODO_SINK acAdvect_sink_particle() + // THIS IS OPTIONAL. We will start from unmoving particle. + // 1. Calculate the equation of motion for the sink particle. + // NOTE: Might require embedding with acIntegrate(dt). - //TODO_SINK acAccrete_sink_particle() - // Calculate accretion of the sink particle from the surrounding medium - // 1. Transfer density into sink particle mass - // 2. 
Transfer momentum into sink particle - // (OPTIONAL: Affection the motion of the particle) - // NOTE: Might require embedding with acIntegrate(dt). - // This is the hardest part. Please see Lee et al. ApJ 783 (2014) for reference. + // TODO_SINK acAccrete_sink_particle() + // Calculate accretion of the sink particle from the surrounding medium + // 1. Transfer density into sink particle mass + // 2. Transfer momentum into sink particle + // (OPTIONAL: Affection the motion of the particle) + // NOTE: Might require embedding with acIntegrate(dt). + // This is the hardest part. Please see Lee et al. ApJ 783 (2014) for reference. t_step += dt; @@ -298,9 +299,10 @@ run_simulation(void) assumed to be asynchronous, so the meshes must be also synchronized before transferring the data to the CPU. Like so: - acSynchronize(); + acBoundcondStep(); acStore(mesh); */ + acBoundcondStep(); acStore(mesh); save_mesh(*mesh, i, t_step); From 322cdce52ceb474541bc90e0c74dc144e72dfed9 Mon Sep 17 00:00:00 2001 From: jpekkila Date: Wed, 7 Aug 2019 20:05:54 +0300 Subject: [PATCH 37/37] Added some new comments + some helpful old comments from a time before the interface revision --- include/astaroth.h | 17 +++++++ src/core/node.cu | 107 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 124 insertions(+) diff --git a/include/astaroth.h b/include/astaroth.h index 94d7e17..43438f7 100644 --- a/include/astaroth.h +++ b/include/astaroth.h @@ -41,27 +41,44 @@ extern "C" { #define acLoadDeviceConstant(x, y) acGridLoadConstant(STREAM_DEFAULT, x, y) */ +/** Allocates all memory and initializes the devices visible to the caller. Should be + * called before any other function in this interface. */ AcResult acInit(const AcMeshInfo mesh_info); +/** Frees all GPU allocations and resets all devices in the node. Should be + * called at exit. */ AcResult acQuit(void); +/** Synchronizes a specific stream. 
All streams are synchronized if STREAM_ALL is passed as a + * parameter*/ AcResult acSynchronizeStream(const Stream stream); +/** Loads a constant to the memories of the devices visible to the caller */ AcResult acLoadDeviceConstant(const AcRealParam param, const AcReal value); +/** Loads an AcMesh to the devices visible to the caller */ AcResult acLoad(const AcMesh host_mesh); +/** Stores the AcMesh distributed among the devices visible to the caller back to the host*/ AcResult acStore(AcMesh* host_mesh); +/** Performs Runge-Kutta 3 integration. Note: Boundary conditions are not applied after the final + * substep and the user is responsible for calling acBoundcondStep before reading the data. */ AcResult acIntegrate(const AcReal dt); +/** Applies periodic boundary conditions for the Mesh distributed among the devices visible to the + * caller*/ AcResult acBoundcondStep(void); +/** Does a scalar reduction with the data stored in some vertex buffer */ AcReal acReduceScal(const ReductionType rtype, const VertexBufferHandle vtxbuf_handle); +/** Does a vector reduction with vertex buffers where the vector components are (a, b, c) */ AcReal acReduceVec(const ReductionType rtype, const VertexBufferHandle a, const VertexBufferHandle b, const VertexBufferHandle c); +/** Stores a subset of the mesh stored across the devices visible to the caller back to host memory. + */ AcResult acStoreWithOffset(const int3 dst, const size_t num_vertices, AcMesh* host_mesh); #ifdef __cplusplus diff --git a/src/core/node.cu b/src/core/node.cu index fa8a35d..f9a4aa9 100644 --- a/src/core/node.cu +++ b/src/core/node.cu @@ -16,6 +16,113 @@ You should have received a copy of the GNU General Public License along with Astaroth. If not, see . */ + +/** + * @file + * \brief Multi-GPU implementation. + * + %JP: The old way for computing boundary conditions conflicts with the + way we have to do things with multiple GPUs. 
+ + The older approach relied on unified memory, which represented the whole + memory area as one huge mesh instead of several smaller ones. However, unified memory + in its current state is more meant for quick prototyping when performance is not an issue. + Getting the CUDA driver to migrate data intelligently across GPUs is much more difficult + than when managing the memory explicitly. + + In this new approach, I have simplified the multi- and single-GPU layers significantly. + Quick rundown: + New struct: Grid. There are two global variables, "grid" and "subgrid", which + contain the extents of the whole simulation domain and the decomposed grids, + respectively. To simplify thing, we require that each GPU is assigned the same amount of + work, therefore each GPU in the node is assigned and "subgrid.m" -sized block of data to + work with. + + The whole simulation domain is decomposed with respect to the z dimension. + For example, if the grid contains (nx, ny, nz) vertices, then the subgrids + contain (nx, ny, nz / num_devices) vertices. + + An local index (i, j, k) in some subgrid can be mapped to the global grid with + global idx = (i, j, k + device_id * subgrid.n.z) + + Terminology: + - Single-GPU function: a function defined on the single-GPU layer (device.cu) + + Changes required to this commented code block: + - The thread block dimensions (tpb) are no longer passed to the kernel here but in + device.cu instead. Same holds for any complex index calculations. Instead, the local + coordinates should be passed as an int3 type without having to consider how the data is + actually laid out in device memory + - The unified memory buffer no longer exists (d_buffer). Instead, we have an opaque + handle of type "Device" which should be passed to single-GPU functions. In this file, all + devices are stored in a global array "devices[num_devices]". 
+ - Every single-GPU function is executed asynchronously by default such that we + can optimize Astaroth by executing memory transactions concurrently with + computation. Therefore a StreamType should be passed as a parameter to single-GPU functions. + Refresher: CUDA function calls are non-blocking when a stream is explicitly passed + as a parameter and commands executing in different streams can be processed + in parallel/concurrently. + + + Note on periodic boundaries (might be helpful when implementing other boundary conditions): + + With multiple GPUs, periodic boundary conditions applied on indices ranging from + + (0, 0, STENCIL_ORDER/2) to (subgrid.m.x, subgrid.m.y, subgrid.m.z - + STENCIL_ORDER/2) + + on a single device are "local", in the sense that they can be computed without + having to exchange data with neighboring GPUs. Special care is needed only for transferring + the data to the fron and back plates outside this range. In the solution we use + here, we solve the local boundaries first, and then just exchange the front and back plates + in a "ring", like so + device_id + (n) <-> 0 <-> 1 <-> ... <-> n <-> (0) + +### Throughout this file we use the following notation and names for various index offsets + + Global coordinates: coordinates with respect to the global grid (static Grid grid) + Local coordinates: coordinates with respect to the local subgrid (static Subgrid subgrid) + + s0, s1: source indices in global coordinates + d0, d1: destination indices in global coordinates + da = max(s0, d0); + db = min(s1, d1); + + These are used in at least + acLoad() + acStore() + acSynchronizeHalos() + + Here we decompose the host mesh and distribute it among the GPUs in + the node. + + The host mesh is a huge contiguous block of data. Its dimensions are given by + the global variable named "grid". A "grid" is decomposed into "subgrids", + one for each GPU. 
Here we check which parts of the range s0...s1 maps + to the memory space stored by some GPU, ranging d0...d1, and transfer + the data if needed. + + The index mapping is inherently quite involved, but here's a picture which + hopefully helps make sense out of all this. + + + Grid + |----num_vertices---| + xxx|....................................................|xxx + ^ ^ ^ ^ + d0 d1 s0 (src) s1 + + Subgrid + + xxx|.............|xxx + ^ ^ + d0 d1 + + ^ ^ + db da + * + */ #include "astaroth_node.h" #include "astaroth_device.h"