From 36fea7056096bd3daf6940848527d87ae0aeb4b6 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@aalto.fi>
Date: Thu, 15 Aug 2019 11:04:22 +0300
Subject: [PATCH 01/27] Moved basic built-in functions for vector operations to
 math_utils.h from integration.cuh so that they are shared with the CPU and
 GPU

---
 src/core/kernels/integration.cuh     | 61 +----------------
 src/core/math_utils.h                | 99 +++++++++++++++++++++-------
 src/standalone/autotest.cc           |  6 ++
 src/standalone/model/host_forcing.cc |  3 +-
 src/standalone/model/host_forcing.h  |  8 +--
 src/standalone/model/modelmesh.h     |  1 +
 6 files changed, 92 insertions(+), 86 deletions(-)
diff --git a/src/core/kernels/integration.cuh b/src/core/kernels/integration.cuh
index 50db9ae..58063a6 100644
--- a/src/core/kernels/integration.cuh
+++ b/src/core/kernels/integration.cuh
@@ -26,6 +26,8 @@
  */
 #pragma once
 
+#include "math_utils.h"
+
 #include <assert.h>
 
 static __device__ __forceinline__ int
@@ -321,65 +323,6 @@ read_data(const int i, const int j, const int k,
  * =============================================================================
  */
 
-static __host__ __device__ __forceinline__ AcReal3
-operator-(const AcReal3& a, const AcReal3& b)
-{
-    return (AcReal3){a.x - b.x, a.y - b.y, a.z - b.z};
-}
-
-static __host__ __device__ __forceinline__ AcReal3
-operator+(const AcReal3& a, const AcReal3& b)
-{
-    return (AcReal3){a.x + b.x, a.y + b.y, a.z + b.z};
-}
-
-static __host__ __device__ __forceinline__ AcReal3
-operator-(const AcReal3& a)
-{
-    return (AcReal3){-a.x, -a.y, -a.z};
-}
-
-static __host__ __device__ __forceinline__ AcReal3 operator*(const AcReal a, const AcReal3& b)
-{
-    return (AcReal3){a * b.x, a * b.y, a * b.z};
-}
-
-static __host__ __device__ __forceinline__ AcReal
-dot(const AcReal3& a, const AcReal3& b)
-{
-    return a.x * b.x + a.y * b.y + a.z * b.z;
-}
-
-static __host__ __device__ __forceinline__ AcReal3
-mul(const AcMatrix& aa, const AcReal3& x)
-{
-    return (AcReal3){dot(aa.row[0], x), dot(aa.row[1], x), dot(aa.row[2], x)};
-}
-
-static __host__ __device__ __forceinline__ AcReal3
-cross(const AcReal3& a, const AcReal3& b)
-{
-    AcReal3 c;
-
-    c.x = a.y * b.z - a.z * b.y;
-    c.y = a.z * b.x - a.x * b.z;
-    c.z = a.x * b.y - a.y * b.x;
-
-    return c;
-}
-
-static __host__ __device__ __forceinline__ bool
-is_valid(const AcReal& a)
-{
-    return !isnan(a) && !isinf(a);
-}
-
-static __host__ __device__ __forceinline__ bool
-is_valid(const AcReal3& a)
-{
-    return is_valid(a.x) && is_valid(a.y) && is_valid(a.z);
-}
-
 /*
  * =============================================================================
  * Level 1 (Stencil Processing Stage)
diff --git a/src/core/math_utils.h b/src/core/math_utils.h
index 1f1bc56..5ebd42c 100644
--- a/src/core/math_utils.h
+++ b/src/core/math_utils.h
@@ -64,16 +64,6 @@ sum(const T& a, const T& b)
     return a + b;
 }
 
-template <class T>
-static inline const T
-is_valid(const T& val)
-{
-    if (isnan(val) || isinf(val))
-        return false;
-    else
-        return true;
-}
-
 template <class T>
 static inline const T
 clamp(const T& val, const T& min, const T& max)
@@ -87,20 +77,85 @@ randr()
     return AcReal(rand()) / AcReal(RAND_MAX);
 }
 
-static inline int3
-operator+(const int3& a, const int3& b)
-{
-    return (int3){a.x + b.x, a.y + b.y, a.z + b.z};
-}
-
-static inline int3
-operator-(const int3& a, const int3& b)
-{
-    return (int3){a.x - b.x, a.y - b.y, a.z - b.z};
-}
-
 static inline bool
 is_power_of_two(const unsigned val)
 {
     return val && !(val & (val - 1));
 }
+
+#ifdef __CUDACC__
+#define HOST_DEVICE_INLINE __host__ __device__ __forceinline__
+#else
+#define HOST_DEVICE_INLINE inline
+#endif // __CUDACC__
+
+static HOST_DEVICE_INLINE AcReal3
+operator+(const AcReal3& a, const AcReal3& b)
+{
+    return (AcReal3){a.x + b.x, a.y + b.y, a.z + b.z};
+}
+
+static HOST_DEVICE_INLINE int3
+operator+(const int3& a, const int3& b)
+{
+    return (int3){a.x + b.x, a.y + b.y, a.z + b.z};
+}
+
+static HOST_DEVICE_INLINE AcReal3
+operator-(const AcReal3& a, const AcReal3& b)
+{
+    return (AcReal3){a.x - b.x, a.y - b.y, a.z - b.z};
+}
+
+static HOST_DEVICE_INLINE int3
+operator-(const int3& a, const int3& b)
+{
+    return (int3){a.x - b.x, a.y - b.y, a.z - b.z};
+}
+
+static HOST_DEVICE_INLINE AcReal3
+operator-(const AcReal3& a)
+{
+    return (AcReal3){-a.x, -a.y, -a.z};
+}
+
+static HOST_DEVICE_INLINE AcReal3 operator*(const AcReal& a, const AcReal3& b)
+{
+    return (AcReal3){a * b.x, a * b.y, a * b.z};
+}
+
+static HOST_DEVICE_INLINE AcReal
+dot(const AcReal3& a, const AcReal3& b)
+{
+    return a.x * b.x + a.y * b.y + a.z * b.z;
+}
+
+static HOST_DEVICE_INLINE AcReal3
+mul(const AcMatrix& aa, const AcReal3& x)
+{
+    return (AcReal3){dot(aa.row[0], x), dot(aa.row[1], x), dot(aa.row[2], x)};
+}
+
+static HOST_DEVICE_INLINE AcReal3
+cross(const AcReal3& a, const AcReal3& b)
+{
+    AcReal3 c;
+
+    c.x = a.y * b.z - a.z * b.y;
+    c.y = a.z * b.x - a.x * b.z;
+    c.z = a.x * b.y - a.y * b.x;
+
+    return c;
+}
+
+static HOST_DEVICE_INLINE bool
+is_valid(const AcReal a)
+{
+    return !isnan(a) && !isinf(a);
+}
+
+static HOST_DEVICE_INLINE bool
+is_valid(const AcReal3& a)
+{
+    return is_valid(a.x) && is_valid(a.y) && is_valid(a.z);
+}
diff --git a/src/standalone/autotest.cc b/src/standalone/autotest.cc
index f8a5312..f484b30 100644
--- a/src/standalone/autotest.cc
+++ b/src/standalone/autotest.cc
@@ -75,6 +75,12 @@ static const InitType test_cases[] = {INIT_TYPE_RANDOM, INIT_TYPE_XWAVE,
                                       INIT_TYPE_GAUSSIAN_RADIAL_EXPL, INIT_TYPE_ABC_FLOW};
 // #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0]))
 
+static inline bool
+is_valid(const ModelScalar a)
+{
+    return !isnan(a) && !isinf(a);
+}
+
 #if TEST_TYPE ==                                                                                   \
     QUICK_TEST // REGULAR TEST START HERE
                // --------------------------------------------------------------------------------------------------------------
diff --git a/src/standalone/model/host_forcing.cc b/src/standalone/model/host_forcing.cc
index a5cedef..bd68d89 100644
--- a/src/standalone/model/host_forcing.cc
+++ b/src/standalone/model/host_forcing.cc
@@ -26,7 +26,8 @@
  */
 #include "host_forcing.h"
 
-#include "src/core/math_utils.h"
+// #include "src/core/math_utils.h"
+#include "math.h"
 
 // The is a wrapper for genering random numbers with a chosen system.
 AcReal
diff --git a/src/standalone/model/host_forcing.h b/src/standalone/model/host_forcing.h
index 383c663..4f00464 100644
--- a/src/standalone/model/host_forcing.h
+++ b/src/standalone/model/host_forcing.h
@@ -32,13 +32,13 @@
 
 AcReal get_random_number_01();
 
-AcReal3 cross(const AcReal3& a, const AcReal3& b);
+// AcReal3 cross(const AcReal3& a, const AcReal3& b);
 
-AcReal dot(const AcReal3& a, const AcReal3& b);
+// AcReal dot(const AcReal3& a, const AcReal3& b);
 
-AcReal3 vec_norm(const AcReal3& a);
+// AcReal3 vec_norm(const AcReal3& a);
 
-AcReal3 vec_multi_scal(const AcReal scal, const AcReal3& a);
+// AcReal3 vec_multi_scal(const AcReal scal, const AcReal3& a);
 
 AcReal3 helical_forcing_k_generator(const AcReal kmax, const AcReal kmin);
 
diff --git a/src/standalone/model/modelmesh.h b/src/standalone/model/modelmesh.h
index 85680a6..370e7a0 100644
--- a/src/standalone/model/modelmesh.h
+++ b/src/standalone/model/modelmesh.h
@@ -27,6 +27,7 @@
 #pragma once
 #include "astaroth.h"
 
+#include "math.h"
 
 typedef long double ModelScalar;
 

From aa45ce04de9099319b5c813440ac307930302feb Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@aalto.fi>
Date: Thu, 15 Aug 2019 11:09:40 +0300
Subject: [PATCH 02/27] Made the linear algebra functions used in forcing.cc
 static to avoid collisions with the functions defined in math_utils.h

---
 src/standalone/model/host_forcing.cc | 8 ++++----
 src/standalone/model/host_forcing.h  | 8 --------
 2 files changed, 4 insertions(+), 12 deletions(-)

diff --git a/src/standalone/model/host_forcing.cc b/src/standalone/model/host_forcing.cc
index bd68d89..bdc9d9f 100644
--- a/src/standalone/model/host_forcing.cc
+++ b/src/standalone/model/host_forcing.cc
@@ -37,7 +37,7 @@ get_random_number_01()
     return AcReal(rand()) / AcReal(RAND_MAX);
 }
 
-AcReal3
+static AcReal3
 cross(const AcReal3& a, const AcReal3& b)
 {
     AcReal3 c;
@@ -49,13 +49,13 @@ cross(const AcReal3& a, const AcReal3& b)
     return c;
 }
 
-AcReal
+static AcReal
 dot(const AcReal3& a, const AcReal3& b)
 {
     return a.x * b.x + a.y * b.y + a.z * b.z;
 }
 
-AcReal3
+static AcReal3
 vec_norm(const AcReal3& a)
 {
     AcReal3 c;
@@ -68,7 +68,7 @@ vec_norm(const AcReal3& a)
     return c;
 }
 
-AcReal3
+static AcReal3
 vec_multi_scal(const AcReal scal, const AcReal3& a)
 {
     AcReal3 c;
diff --git a/src/standalone/model/host_forcing.h b/src/standalone/model/host_forcing.h
index 4f00464..519c2f0 100644
--- a/src/standalone/model/host_forcing.h
+++ b/src/standalone/model/host_forcing.h
@@ -32,14 +32,6 @@
 
 AcReal get_random_number_01();
 
-// AcReal3 cross(const AcReal3& a, const AcReal3& b);
-
-// AcReal dot(const AcReal3& a, const AcReal3& b);
-
-// AcReal3 vec_norm(const AcReal3& a);
-
-// AcReal3 vec_multi_scal(const AcReal scal, const AcReal3& a);
-
 AcReal3 helical_forcing_k_generator(const AcReal kmax, const AcReal kmin);
 
 void helical_forcing_e_generator(AcReal3* e_force, const AcReal3 k_force);

From 6d4d53342e0eda78b54ea7cfe8431547bf92d530 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@aalto.fi>
Date: Thu, 15 Aug 2019 11:14:52 +0300
Subject: [PATCH 03/27] Removed old comments

---
 src/core/device.cu | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/core/device.cu b/src/core/device.cu
index 9690278..a19dee0 100644
--- a/src/core/device.cu
+++ b/src/core/device.cu
@@ -64,10 +64,6 @@ DCONST(const AcReal3Param param)
 #define DCONST_INT3(x) DCONST(x)
 #define DCONST_REAL(x) DCONST(x)
 #define DCONST_REAL3(x) DCONST(x)
-//#define DCONST_INT(X) (d_mesh_info.int_params[X])
-//#define DCONST_INT3(X) (d_mesh_info.int3_params[X])
-//#define DCONST_REAL(X) (d_mesh_info.real_params[X])
-//#define DCONST_REAL3(X) (d_mesh_info.real3_params[X])
 #define DEVICE_VTXBUF_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_mx) + (k)*DCONST_INT(AC_mxy))
 #define DEVICE_1D_COMPDOMAIN_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_nx) + (k)*DCONST_INT(AC_nxy))
 #define globalGridN (d_mesh_info.int3_params[AC_global_grid_n])

From e89897985eb9d4d58bb392f0c92169f12763f690 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@aalto.fi>
Date: Mon, 19 Aug 2019 14:02:30 +0300
Subject: [PATCH 04/27] Battled with math.h and cmath. We probably should move
 from C standard libraries to C++ ones internally (in places which are not
 visible via the interface)

---
 src/core/kernels/integration.cuh     | 2 +-
 src/core/math_utils.h                | 8 ++++----
 src/standalone/model/host_forcing.cc | 3 ++-
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/src/core/kernels/integration.cuh b/src/core/kernels/integration.cuh
index 58063a6..1b62d1e 100644
--- a/src/core/kernels/integration.cuh
+++ b/src/core/kernels/integration.cuh
@@ -26,7 +26,7 @@
  */
 #pragma once
 
-#include "math_utils.h"
+#include "src/core/math_utils.h"
 
 #include <assert.h>
 
diff --git a/src/core/math_utils.h b/src/core/math_utils.h
index 5ebd42c..4d41e4e 100644
--- a/src/core/math_utils.h
+++ b/src/core/math_utils.h
@@ -25,10 +25,10 @@
  *
  */
 #pragma once
-#include <cmath>
-using namespace std; // Potentially bad practice to declare namespace std here
-// #include <math.h>   // isnan, isinf // Overloads incorrect/bugged with GCC <= 6.0
-// #include <tgmath.h> // Even this does not work
+//#include <cmath>
+// using namespace std; // Potentially bad practice to declare namespace std here
+#include <math.h> // isnan, isinf // Overloads incorrect/bugged with GCC <= 6.0
+//#include <tgmath.h> // Even this does not work
 #include <stdlib.h> // rand
 
 template <class T>
diff --git a/src/standalone/model/host_forcing.cc b/src/standalone/model/host_forcing.cc
index bdc9d9f..1f39296 100644
--- a/src/standalone/model/host_forcing.cc
+++ b/src/standalone/model/host_forcing.cc
@@ -27,7 +27,8 @@
 #include "host_forcing.h"
 
 // #include "src/core/math_utils.h"
-#include "math.h"
+#include <cmath>
+using namespace std;
 
 // The is a wrapper for genering random numbers with a chosen system.
 AcReal

From 598799d7c347adb626bfcaf5e4fb5a8efdce8235 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@aalto.fi>
Date: Mon, 19 Aug 2019 15:14:00 +0300
Subject: [PATCH 05/27] Added a new function to the device interface:
 acDeviceLoadMeshInfo

---
 include/astaroth_device.h |  4 ++++
 src/core/device.cu        | 12 ++++++++++--
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/include/astaroth_device.h b/include/astaroth_device.h
index 442b1b1..95cbb83 100644
--- a/include/astaroth_device.h
+++ b/include/astaroth_device.h
@@ -49,6 +49,10 @@ AcResult acDeviceSwapBuffers(const Device device);
 AcResult acDeviceLoadConstant(const Device device, const Stream stream, const AcRealParam param,
                               const AcReal value);
 
+/** */
+AcResult acDeviceLoadMeshInfo(const Device device, const Stream stream,
+                              const AcMeshInfo device_config);
+
 /** */
 AcResult acDeviceLoadVertexBufferWithOffset(const Device device, const Stream stream,
                                             const AcMesh host_mesh,
diff --git a/src/core/device.cu b/src/core/device.cu
index a19dee0..0482f93 100644
--- a/src/core/device.cu
+++ b/src/core/device.cu
@@ -149,8 +149,7 @@ acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device_hand
 #endif
 
     // Device constants
-    ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_mesh_info, &device_config, sizeof(device_config), 0,
-                                          cudaMemcpyHostToDevice));
+    acDeviceLoadMeshInfo(device, STREAM_DEFAULT, device_config);
 
     printf("Created device %d (%p)\n", device->id, device);
     *device_handle = device;
@@ -367,6 +366,15 @@ acDeviceLoadConstant(const Device device, const Stream stream, const AcRealParam
     return AC_SUCCESS;
 }
 
+AcResult
+acDeviceLoadMeshInfo(const Device device, const Stream stream, const AcMeshInfo device_config)
+{
+    cudaSetDevice(device->id);
+    ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbolAsync(d_mesh_info, &device_config, sizeof(device_config),
+                                               0, cudaMemcpyHostToDevice, device->streams[stream]));
+    return AC_SUCCESS;
+}
+
 AcResult
 acDeviceLoadVertexBufferWithOffset(const Device device, const Stream stream, const AcMesh host_mesh,
                                    const VertexBufferHandle vtxbuf_handle, const int3 src,

From 41805dcb6885c813e69455934af4612654292765 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@aalto.fi>
Date: Mon, 19 Aug 2019 15:17:51 +0300
Subject: [PATCH 06/27] Added some error checking for the case where user
 supplies an incomplete meshinfo to acDeviceLoadMeshInfo

---
 src/core/device.cu | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/core/device.cu b/src/core/device.cu
index 0482f93..b50478a 100644
--- a/src/core/device.cu
+++ b/src/core/device.cu
@@ -370,6 +370,13 @@ AcResult
 acDeviceLoadMeshInfo(const Device device, const Stream stream, const AcMeshInfo device_config)
 {
     cudaSetDevice(device->id);
+
+    ERRCHK_ALWAYS(device_config.int_params[AC_nx] == device->local_config.int_params[AC_nx]);
+    ERRCHK_ALWAYS(device_config.int_params[AC_ny] == device->local_config.int_params[AC_ny]);
+    ERRCHK_ALWAYS(device_config.int_params[AC_nz] == device->local_config.int_params[AC_nz]);
+    ERRCHK_ALWAYS(device_config.int_params[AC_multigpu_offset] ==
+                  device->local_config.int_params[AC_multigpu_offset]);
+
     ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbolAsync(d_mesh_info, &device_config, sizeof(device_config),
                                                0, cudaMemcpyHostToDevice, device->streams[stream]));
     return AC_SUCCESS;

From 787363226b5510c06990ee5ac66939f7f09c9304 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@aalto.fi>
Date: Mon, 19 Aug 2019 15:28:16 +0300
Subject: [PATCH 07/27] Added functions for loading int, int3, scalar and
 vector constants to the device layer (acDeviceLoad...Constant)

---
 include/astaroth_device.h | 16 ++++++++++++++--
 src/core/device.cu        | 37 +++++++++++++++++++++++++++++++++++--
 src/core/node.cu          |  2 +-
 3 files changed, 50 insertions(+), 5 deletions(-)

diff --git a/include/astaroth_device.h b/include/astaroth_device.h
index 95cbb83..1d7926f 100644
--- a/include/astaroth_device.h
+++ b/include/astaroth_device.h
@@ -46,8 +46,20 @@ AcResult acDeviceSynchronizeStream(const Device device, const Stream stream);
 AcResult acDeviceSwapBuffers(const Device device);
 
 /** */
-AcResult acDeviceLoadConstant(const Device device, const Stream stream, const AcRealParam param,
-                              const AcReal value);
+AcResult acDeviceLoadScalarConstant(const Device device, const Stream stream,
+                                    const AcRealParam param, const AcReal value);
+
+/** */
+AcResult acDeviceLoadVectorConstant(const Device device, const Stream stream,
+                                    const AcReal3Param param, const AcReal3 value);
+
+/** */
+AcResult acDeviceLoadIntConstant(const Device device, const Stream stream, const AcIntParam param,
+                                 const int value);
+
+/** */
+AcResult acDeviceLoadInt3Constant(const Device device, const Stream stream, const AcInt3Param param,
+                                  const int3 value);
 
 /** */
 AcResult acDeviceLoadMeshInfo(const Device device, const Stream stream,
diff --git a/src/core/device.cu b/src/core/device.cu
index b50478a..8b89a78 100644
--- a/src/core/device.cu
+++ b/src/core/device.cu
@@ -356,8 +356,8 @@ acDeviceSwapBuffers(const Device device)
 }
 
 AcResult
-acDeviceLoadConstant(const Device device, const Stream stream, const AcRealParam param,
-                     const AcReal value)
+acDeviceLoadScalarConstant(const Device device, const Stream stream, const AcRealParam param,
+                           const AcReal value)
 {
     cudaSetDevice(device->id);
     const size_t offset = (size_t)&d_mesh_info.real_params[param] - (size_t)&d_mesh_info;
@@ -366,6 +366,39 @@ acDeviceLoadConstant(const Device device, const Stream stream, const AcRealParam
     return AC_SUCCESS;
 }
 
+AcResult
+acDeviceLoadVectorConstant(const Device device, const Stream stream, const AcReal3Param param,
+                           const AcReal3 value)
+{
+    cudaSetDevice(device->id);
+    const size_t offset = (size_t)&d_mesh_info.real3_params[param] - (size_t)&d_mesh_info;
+    ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
+                                        cudaMemcpyHostToDevice, device->streams[stream]));
+    return AC_SUCCESS;
+}
+
+AcResult
+acDeviceLoadIntConstant(const Device device, const Stream stream, const AcIntParam param,
+                        const int value)
+{
+    cudaSetDevice(device->id);
+    const size_t offset = (size_t)&d_mesh_info.int_params[param] - (size_t)&d_mesh_info;
+    ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
+                                        cudaMemcpyHostToDevice, device->streams[stream]));
+    return AC_SUCCESS;
+}
+
+AcResult
+acDeviceLoadInt3Constant(const Device device, const Stream stream, const AcInt3Param param,
+                         const int3 value)
+{
+    cudaSetDevice(device->id);
+    const size_t offset = (size_t)&d_mesh_info.int3_params[param] - (size_t)&d_mesh_info;
+    ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
+                                        cudaMemcpyHostToDevice, device->streams[stream]));
+    return AC_SUCCESS;
+}
+
 AcResult
 acDeviceLoadMeshInfo(const Device device, const Stream stream, const AcMeshInfo device_config)
 {
diff --git a/src/core/node.cu b/src/core/node.cu
index 6fa80f2..9950dd4 100644
--- a/src/core/node.cu
+++ b/src/core/node.cu
@@ -429,7 +429,7 @@ acNodeLoadConstant(const Node node, const Stream stream, const AcRealParam param
     acNodeSynchronizeStream(node, stream);
     // #pragma omp parallel for
     for (int i = 0; i < node->num_devices; ++i) {
-        acDeviceLoadConstant(node->devices[i], stream, param, value);
+        acDeviceLoadScalarConstant(node->devices[i], stream, param, value);
     }
     return AC_SUCCESS;
 }

From b316e51267537f4d4a83bd549ebe2ab10634e2a9 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@aalto.fi>
Date: Mon, 19 Aug 2019 16:16:26 +0300
Subject: [PATCH 08/27] Added preliminary code for generating C headers with
 the DSL

---
 acc/src/code_generator.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/acc/src/code_generator.c b/acc/src/code_generator.c
index bd05ea0..f629b58 100644
--- a/acc/src/code_generator.c
+++ b/acc/src/code_generator.c
@@ -36,7 +36,8 @@
 ASTNode* root = NULL;
 
 static const char inout_name_prefix[] = "handle_";
-static bool doing_stencil_assembly    = true;
+typedef enum { STENCIL_ASSEMBLY, STENCIL_PROCESS, STENCIL_HEADER } CompilationType;
+static CompilationType compilation_type;
 
 /*
  * =============================================================================
@@ -212,11 +213,14 @@ translate_latest_symbol(void)
     // FUNCTION PARAMETER
     else if (symbol->type == SYMBOLTYPE_FUNCTION_PARAMETER) {
         if (symbol->type_qualifier == IN || symbol->type_qualifier == OUT) {
-            if (doing_stencil_assembly)
+            if (compilation_type == STENCIL_ASSEMBLY)
                 printf("const __restrict__ %s* %s", translate(symbol->type_specifier),
                        symbol->identifier);
-            else
+            else if (compilation_type == STENCIL_PROCESS)
                 printf("const %sData& %s", translate(symbol->type_specifier), symbol->identifier);
+            else
+                printf("Invalid compilation type %d, IN and OUT qualifiers not supported\n",
+                       compilation_type);
         }
         else {
             print_symbol(handle);
@@ -550,9 +554,11 @@ main(int argc, char** argv)
 {
     if (argc == 2) {
         if (!strcmp(argv[1], "-sas"))
-            doing_stencil_assembly = true;
+            compilation_type = STENCIL_ASSEMBLY;
         else if (!strcmp(argv[1], "-sps"))
-            doing_stencil_assembly = false;
+            compilation_type = STENCIL_PROCESS;
+        else if (!strcmp(argv[1], "-hh"))
+            compilation_type = STENCIL_HEADER;
         else
             printf("Unknown flag %s. Generating stencil assembly.\n", argv[1]);
     }
@@ -576,7 +582,7 @@ main(int argc, char** argv)
 
     // Traverse
     traverse(root);
-    if (doing_stencil_assembly)
+    if (compilation_type == STENCIL_ASSEMBLY)
         generate_preprocessed_structures();
 
     // print_symbol_table();

From c98b74563c7f5c1cb94c3aae453a72bce3faed3d Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@aalto.fi>
Date: Mon, 19 Aug 2019 16:18:24 +0300
Subject: [PATCH 09/27] Added a comment

---
 acc/src/code_generator.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/acc/src/code_generator.c b/acc/src/code_generator.c
index f629b58..2de30d7 100644
--- a/acc/src/code_generator.c
+++ b/acc/src/code_generator.c
@@ -566,8 +566,8 @@ main(int argc, char** argv)
         printf("Usage: ./acc [flags]\n"
                "Flags:\n"
                "\t-sas - Generates code for the stencil assembly stage\n"
-               "\t-sps - Generates code for the stencil processing "
-               "stage\n");
+               "\t-sps - Generates code for the stencil processing stage\n"
+               "\t-hh  - Generates stencil definitions from a header file\n");
         printf("\n");
         return EXIT_FAILURE;
     }

From 0208d55e4ebc50973ea7fc5b991edb1f46f935bf Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@aalto.fi>
Date: Mon, 19 Aug 2019 16:40:47 +0300
Subject: [PATCH 10/27] Moved STENCIL_ORDER and NGHOST out of user-defined
 parameter as these are actually internal defines used to configure the
 built-in functions. Additionally, renamed all explicitly declared uniforms
 from dsx -> AC_dsx in the DSL in preparation for having clear connection
 between DSL uniforms and the library parameter handles created by the user
 (AcRealParam etc)

---
 acc/mhd_solver/.gitignore          |   5 ++
 acc/mhd_solver/stencil_process.sps | 105 +++++++++++------------------
 acc/src/code_generator.c           |   6 ++
 include/astaroth_defines.h         |   2 +
 src/core/kernels/stencil_header.hh |  33 +++++++++
 5 files changed, 87 insertions(+), 64 deletions(-)
 create mode 100644 acc/mhd_solver/.gitignore
 create mode 100644 src/core/kernels/stencil_header.hh

diff --git a/acc/mhd_solver/.gitignore b/acc/mhd_solver/.gitignore
new file mode 100644
index 0000000..bc4b7d8
--- /dev/null
+++ b/acc/mhd_solver/.gitignore
@@ -0,0 +1,5 @@
+build
+testbin
+
+# Except this file
+!.gitignore
diff --git a/acc/mhd_solver/stencil_process.sps b/acc/mhd_solver/stencil_process.sps
index ee8656e..18141f7 100644
--- a/acc/mhd_solver/stencil_process.sps
+++ b/acc/mhd_solver/stencil_process.sps
@@ -1,27 +1,4 @@
-// Declare uniforms (i.e. device constants)
-uniform Scalar cs2_sound;
-uniform Scalar nu_visc;
-uniform Scalar cp_sound;
-uniform Scalar cv_sound;
-uniform Scalar mu0;
-uniform Scalar eta;
-uniform Scalar gamma;
-uniform Scalar zeta;
-
-uniform Scalar dsx;
-uniform Scalar dsy;
-uniform Scalar dsz;
-
-uniform Scalar lnT0;
-uniform Scalar lnrho0;
-
-uniform int nx_min;
-uniform int ny_min;
-uniform int nz_min;
-uniform int nx;
-uniform int ny;
-uniform int nz;
-
+#include "stencil_header.hh"
 
 
 Vector
@@ -61,8 +38,8 @@ continuity(in VectorField uu, in ScalarField lnrho) {
 Vector
 momentum(in VectorField uu, in ScalarField lnrho, in ScalarField ss, in VectorField aa) {
     const Matrix S = stress_tensor(uu);
-    const Scalar cs2 = cs2_sound * exp(gamma * value(ss) / cp_sound + (gamma - 1) * (value(lnrho) - lnrho0));
-    const Vector  j = (Scalar(1.) / mu0) * (gradient_of_divergence(aa) - laplace_vec(aa)); // Current density
+    const Scalar cs2 = AC_cs2_sound * exp(AC_gamma * value(ss) / AC_cp_sound + (AC_gamma - 1) * (value(lnrho) - AC_lnrho0));
+    const Vector  j = (Scalar(1.) / AC_mu0) * (gradient_of_divergence(aa) - laplace_vec(aa)); // Current density
     const Vector B = curl(aa);
     //TODO: DOES INTHERMAL VERSTION INCLUDE THE MAGNETIC FIELD?
     const Scalar inv_rho = Scalar(1.) / exp(value(lnrho));
@@ -70,14 +47,14 @@ momentum(in VectorField uu, in ScalarField lnrho, in ScalarField ss, in VectorFi
     // Regex replace CPU constants with get\(AC_([a-zA-Z_0-9]*)\)
     // \1
     const Vector mom = - mul(gradients(uu), value(uu))
-                                                       - cs2 * ((Scalar(1.) / cp_sound) * gradient(ss) + gradient(lnrho))
+                                                       - cs2 * ((Scalar(1.) / AC_cp_sound) * gradient(ss) + gradient(lnrho))
                                                        + inv_rho * cross(j, B)
-                                                       + nu_visc * (
+                                                       + AC_nu_visc * (
                                                             laplace_vec(uu)
                                                         + Scalar(1. / 3.) * gradient_of_divergence(uu)
                                                         + Scalar(2.) * mul(S, gradient(lnrho))
                                                         )
-                                                        + zeta * gradient_of_divergence(uu);
+                                                        + AC_zeta * gradient_of_divergence(uu);
     return mom;
 }
 #elif LTEMPERATURE
@@ -87,13 +64,13 @@ momentum(in VectorField uu, in ScalarField lnrho, in ScalarField tt) {
 
   const Matrix S = stress_tensor(uu);
 
-    const Vector pressure_term = (cp_sound - cv_sound) * (gradient(tt) + value(tt) * gradient(lnrho));
+    const Vector pressure_term = (AC_cp_sound - AC_cv_sound) * (gradient(tt) + value(tt) * gradient(lnrho));
 
   mom = -mul(gradients(uu), value(uu)) -
     pressure_term +
-    nu_visc *
+    AC_nu_visc *
     (laplace_vec(uu) + Scalar(1. / 3.) * gradient_of_divergence(uu) +
-      Scalar(2.) * mul(S, gradient(lnrho))) + zeta * gradient_of_divergence(uu);
+      Scalar(2.) * mul(S, gradient(lnrho))) + AC_zeta * gradient_of_divergence(uu);
 
   #if LGRAVITY
   mom = mom - (Vector){0, 0, -10.0};
@@ -111,10 +88,10 @@ momentum(in VectorField uu, in ScalarField lnrho) {
     // Isothermal: we have constant speed of sound
 
   mom = -mul(gradients(uu), value(uu)) -
-    cs2_sound * gradient(lnrho) +
-    nu_visc *
+    AC_cs2_sound * gradient(lnrho) +
+    AC_nu_visc *
     (laplace_vec(uu) + Scalar(1. / 3.) * gradient_of_divergence(uu) +
-      Scalar(2.) * mul(S, gradient(lnrho))) + zeta * gradient_of_divergence(uu);
+      Scalar(2.) * mul(S, gradient(lnrho))) + AC_zeta * gradient_of_divergence(uu);
 
   #if LGRAVITY
   mom = mom - (Vector){0, 0, -10.0};
@@ -130,13 +107,13 @@ induction(in VectorField uu, in VectorField aa) {
   // Note: We do (-nabla^2 A + nabla(nabla dot A)) instead of (nabla x (nabla
   // x A)) in order to avoid taking the first derivative twice (did the math,
   // yes this actually works. See pg.28 in arXiv:astro-ph/0109497)
-  // u cross B - ETA * mu0 * (mu0^-1 * [- laplace A + grad div A ])
+  // u cross B - AC_eta * AC_mu0 * (AC_mu0^-1 * [- laplace A + grad div A ])
   const Vector B = curl(aa);
   const Vector grad_div = gradient_of_divergence(aa);
   const Vector lap = laplace_vec(aa);
 
-  // Note, mu0 is cancelled out
-  const Vector ind = cross(value(uu), B) - eta * (grad_div - lap);
+  // Note, AC_mu0 is cancelled out
+  const Vector ind = cross(value(uu), B) - AC_eta * (grad_div - lap);
 
   return ind;
 }
@@ -145,27 +122,27 @@ induction(in VectorField uu, in VectorField aa) {
 #if LENTROPY
 Scalar
 lnT( in ScalarField ss, in ScalarField lnrho) {
-  const Scalar lnT = lnT0 + gamma * value(ss) / cp_sound +
-    (gamma - Scalar(1.)) * (value(lnrho) - lnrho0);
+  const Scalar lnT = AC_lnT0 + AC_gamma * value(ss) / AC_cp_sound +
+    (AC_gamma - Scalar(1.)) * (value(lnrho) - AC_lnrho0);
   return lnT;
 }
 
 // Nabla dot (K nabla T) / (rho T)
 Scalar
 heat_conduction( in ScalarField ss, in ScalarField lnrho) {
-  const Scalar inv_cp_sound = AcReal(1.) / cp_sound;
+  const Scalar inv_AC_cp_sound = AcReal(1.) / AC_cp_sound;
 
   const Vector grad_ln_chi = - gradient(lnrho);
 
-  const Scalar first_term = gamma * inv_cp_sound * laplace(ss) +
-    (gamma - AcReal(1.)) * laplace(lnrho);
-  const Vector second_term = gamma * inv_cp_sound * gradient(ss) +
-    (gamma - AcReal(1.)) * gradient(lnrho);
-  const Vector third_term = gamma * (inv_cp_sound * gradient(ss) +
+  const Scalar first_term = AC_gamma * inv_AC_cp_sound * laplace(ss) +
+    (AC_gamma - AcReal(1.)) * laplace(lnrho);
+  const Vector second_term = AC_gamma * inv_AC_cp_sound * gradient(ss) +
+    (AC_gamma - AcReal(1.)) * gradient(lnrho);
+  const Vector third_term = AC_gamma * (inv_AC_cp_sound * gradient(ss) +
     gradient(lnrho)) + grad_ln_chi;
 
-  const Scalar chi = AC_THERMAL_CONDUCTIVITY / (exp(value(lnrho)) * cp_sound);
-  return cp_sound * chi * (first_term + dot(second_term, third_term));
+  const Scalar chi = AC_THERMAL_CONDUCTIVITY / (exp(value(lnrho)) * AC_cp_sound);
+  return AC_cp_sound * chi * (first_term + dot(second_term, third_term));
 }
 
 Scalar
@@ -177,11 +154,11 @@ Scalar
 entropy(in ScalarField ss, in VectorField uu, in ScalarField lnrho, in VectorField aa) {
     const Matrix S = stress_tensor(uu);
     const Scalar inv_pT = Scalar(1.) / (exp(value(lnrho)) * exp(lnT(ss, lnrho)));
-    const Vector  j = (Scalar(1.) / mu0) * (gradient_of_divergence(aa) - laplace_vec(aa)); // Current density
+    const Vector  j = (Scalar(1.) / AC_mu0) * (gradient_of_divergence(aa) - laplace_vec(aa)); // Current density
     const Scalar RHS = H_CONST - C_CONST
-                                                + eta * (mu0) * dot(j, j)
-                                                + Scalar(2.) * exp(value(lnrho)) * nu_visc * contract(S)
-                                                + zeta * exp(value(lnrho)) * divergence(uu) * divergence(uu);
+                                                + AC_eta * (AC_mu0) * dot(j, j)
+                                                + Scalar(2.) * exp(value(lnrho)) * AC_nu_visc * contract(S)
+                                                + AC_zeta * exp(value(lnrho)) * divergence(uu) * divergence(uu);
 
     return - dot(value(uu), gradient(ss))
                   + inv_pT * RHS
@@ -195,7 +172,7 @@ heat_transfer(in VectorField uu, in ScalarField lnrho, in ScalarField tt)
 {
     const Matrix S = stress_tensor(uu);
     const Scalar heat_diffusivity_k = 0.0008; //8e-4;
-    return -dot(value(uu), gradient(tt)) + heat_diffusivity_k * laplace(tt) + heat_diffusivity_k * dot(gradient(lnrho), gradient(tt)) + nu_visc * contract(S) * (Scalar(1.) / cv_sound) - (gamma - 1) * value(tt) * divergence(uu);
+    return -dot(value(uu), gradient(tt)) + heat_diffusivity_k * laplace(tt) + heat_diffusivity_k * dot(gradient(lnrho), gradient(tt)) + AC_nu_visc * contract(S) * (Scalar(1.) / AC_cv_sound) - (AC_gamma - 1) * value(tt) * divergence(uu);
 }
 #endif
 
@@ -220,7 +197,7 @@ Vector
 helical_forcing(Scalar magnitude, Vector k_force, Vector xx, Vector ff_re, Vector ff_im, Scalar phi)
 {
     // JP: This looks wrong:
-    //    1) Should it be dsx * nx instead of dsx * ny?
+    //    1) Should it be AC_dsx * AC_nx instead of AC_dsx * AC_ny?
     //    2) Should you also use globalGrid.n instead of the local n?
     //    MV: You are rigth. Made a quickfix. I did not see the error  because multigpu is split
     //        in z direction not y direction.
@@ -229,9 +206,9 @@ helical_forcing(Scalar magnitude, Vector k_force, Vector xx, Vector ff_re, Vecto
     //    MV: Good idea. No an immediate priority.
     // Fun related article:
     // https://randomascii.wordpress.com/2014/10/09/intel-underestimates-error-bounds-by-1-3-quintillion/
-    xx.x = xx.x*(2.0*M_PI/(dsx*globalGridN.x));
-    xx.y = xx.y*(2.0*M_PI/(dsy*globalGridN.y));
-    xx.z = xx.z*(2.0*M_PI/(dsz*globalGridN.z));
+    xx.x = xx.x*(2.0*M_PI/(AC_dsx*globalGridN.x));
+    xx.y = xx.y*(2.0*M_PI/(AC_dsy*globalGridN.y));
+    xx.z = xx.z*(2.0*M_PI/(AC_dsz*globalGridN.z));
 
     Scalar cos_phi = cos(phi);
     Scalar sin_phi = sin(phi);
@@ -254,13 +231,13 @@ helical_forcing(Scalar magnitude, Vector k_force, Vector xx, Vector ff_re, Vecto
 Vector
 forcing(int3 globalVertexIdx, Scalar dt)
 {
-    Vector a = Scalar(.5) * (Vector){globalGridN.x * dsx,
-                                     globalGridN.y * dsy,
-                                     globalGridN.z * dsz}; // source (origin)
-    Vector xx = (Vector){(globalVertexIdx.x - nx_min) * dsx,
-                        (globalVertexIdx.y - ny_min) * dsy,
-                        (globalVertexIdx.z - nz_min) * dsz}; // sink (current index)
-    const Scalar cs2 = cs2_sound;
+    Vector a = Scalar(.5) * (Vector){globalGridN.x * AC_dsx,
+                                     globalGridN.y * AC_dsy,
+                                     globalGridN.z * AC_dsz}; // source (origin)
+    Vector xx = (Vector){(globalVertexIdx.x - AC_nx_min) * AC_dsx,
+                        (globalVertexIdx.y - AC_ny_min) * AC_dsy,
+                        (globalVertexIdx.z - AC_nz_min) * AC_dsz}; // sink (current index)
+    const Scalar cs2 = AC_cs2_sound;
     const Scalar cs = sqrt(cs2);
 
     //Placeholders until determined properly
diff --git a/acc/src/code_generator.c b/acc/src/code_generator.c
index 2de30d7..01cf1d1 100644
--- a/acc/src/code_generator.c
+++ b/acc/src/code_generator.c
@@ -228,6 +228,9 @@ translate_latest_symbol(void)
     }
     // UNIFORM
     else if (symbol->type_qualifier == UNIFORM) {
+        // if (compilation_type != STENCIL_HEADER) {
+        //    printf("ERROR: %s can only be used in stencil headers\n", translation_table[UNIFORM]);
+        //}
         /* Do nothing */
     }
     // IN / OUT
@@ -373,6 +376,8 @@ traverse(const ASTNode* node)
             //    printf("%s%s", inout_name_prefix, symbol->identifier);
             //}
             if (symbol->type_qualifier == UNIFORM) {
+                printf("DCONST(%s) ", symbol->identifier);
+                /*
                 if (symbol->type_specifier == SCALAR)
                     printf("DCONST_REAL(AC_%s) ", symbol->identifier);
                 else if (symbol->type_specifier == INT)
@@ -380,6 +385,7 @@ traverse(const ASTNode* node)
                 else
                     printf("INVALID UNIFORM type specifier %s with %s\n",
                            translate(symbol->type_specifier), symbol->identifier);
+                           */
             }
             else {
                 // Do a regular translation
diff --git a/include/astaroth_defines.h b/include/astaroth_defines.h
index ee5d70c..96ab3fd 100644
--- a/include/astaroth_defines.h
+++ b/include/astaroth_defines.h
@@ -45,6 +45,8 @@ typedef struct {
 #endif // __CUDACC__
 
 // Library flags
+#define STENCIL_ORDER (6)
+#define NGHOST (STENCIL_ORDER / 2)
 #define VERBOSE_PRINTING (1)
 
 // Built-in types and parameters
diff --git a/src/core/kernels/stencil_header.hh b/src/core/kernels/stencil_header.hh
new file mode 100644
index 0000000..798e3cc
--- /dev/null
+++ b/src/core/kernels/stencil_header.hh
@@ -0,0 +1,33 @@
+#define LDENSITY (1)
+#define LHYDRO (1)
+#define LMAGNETIC (1)
+#define LENTROPY (1)
+#define LTEMPERATURE (0)
+#define LFORCING (1)
+#define LUPWD (1)
+
+#define AC_THERMAL_CONDUCTIVITY (AcReal(0.001)) // TODO: make an actual config parameter
+
+// Declare uniforms (i.e. device constants)
+uniform Scalar AC_cs2_sound;
+uniform Scalar AC_nu_visc;
+uniform Scalar AC_cp_sound;
+uniform Scalar AC_cv_sound;
+uniform Scalar AC_mu0;
+uniform Scalar AC_eta;
+uniform Scalar AC_gamma;
+uniform Scalar AC_zeta;
+
+uniform Scalar AC_dsx;
+uniform Scalar AC_dsy;
+uniform Scalar AC_dsz;
+
+uniform Scalar AC_lnT0;
+uniform Scalar AC_lnrho0;
+
+uniform int AC_nx_min;
+uniform int AC_ny_min;
+uniform int AC_nz_min;
+uniform int AC_nx;
+uniform int AC_ny;
+uniform int AC_nz;

From bcdd827a4f7de3d77ecc8798d81f28b208529658 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@aalto.fi>
Date: Mon, 19 Aug 2019 17:05:56 +0300
Subject: [PATCH 11/27] Added a proper declarations for all user-specified
 uniform. Note: built-in uniforms are not correctly translated into CUDA

---
 acc/mhd_solver/stencil_process.sps |  18 ++---
 src/core/kernels/stencil_header.hh | 122 +++++++++++++++++++++++++----
 2 files changed, 115 insertions(+), 25 deletions(-)

diff --git a/acc/mhd_solver/stencil_process.sps b/acc/mhd_solver/stencil_process.sps
index 18141f7..a2219b6 100644
--- a/acc/mhd_solver/stencil_process.sps
+++ b/acc/mhd_solver/stencil_process.sps
@@ -234,18 +234,18 @@ forcing(int3 globalVertexIdx, Scalar dt)
     Vector a = Scalar(.5) * (Vector){globalGridN.x * AC_dsx,
                                      globalGridN.y * AC_dsy,
                                      globalGridN.z * AC_dsz}; // source (origin)
-    Vector xx = (Vector){(globalVertexIdx.x - AC_nx_min) * AC_dsx,
-                        (globalVertexIdx.y - AC_ny_min) * AC_dsy,
-                        (globalVertexIdx.z - AC_nz_min) * AC_dsz}; // sink (current index)
+    Vector xx = (Vector){(globalVertexIdx.x - DCONST(AC_nx_min)) * AC_dsx,
+                        (globalVertexIdx.y - DCONST(AC_ny_min)) * AC_dsy,
+                        (globalVertexIdx.z - DCONST(AC_nz_min)) * AC_dsz}; // sink (current index)
     const Scalar cs2 = AC_cs2_sound;
     const Scalar cs = sqrt(cs2);
 
     //Placeholders until determined properly
-    Scalar magnitude = DCONST_REAL(AC_forcing_magnitude);
-    Scalar phase     = DCONST_REAL(AC_forcing_phase);
-    Vector k_force   = (Vector){  DCONST_REAL(AC_k_forcex),   DCONST_REAL(AC_k_forcey),   DCONST_REAL(AC_k_forcez)};
-    Vector ff_re     = (Vector){DCONST_REAL(AC_ff_hel_rex), DCONST_REAL(AC_ff_hel_rey), DCONST_REAL(AC_ff_hel_rez)};
-    Vector ff_im     = (Vector){DCONST_REAL(AC_ff_hel_imx), DCONST_REAL(AC_ff_hel_imy), DCONST_REAL(AC_ff_hel_imz)};
+    Scalar magnitude = AC_forcing_magnitude;
+    Scalar phase     = AC_forcing_phase;
+    Vector k_force   = (Vector){  AC_k_forcex,   AC_k_forcey,   AC_k_forcez};
+    Vector ff_re     = (Vector){AC_ff_hel_rex, AC_ff_hel_rey, AC_ff_hel_rez};
+    Vector ff_im     = (Vector){AC_ff_hel_imx, AC_ff_hel_imy, AC_ff_hel_imz};
 
 
     //Determine that forcing funtion type at this point.
@@ -254,7 +254,7 @@ forcing(int3 globalVertexIdx, Scalar dt)
     Vector force   = helical_forcing(magnitude, k_force, xx, ff_re,ff_im, phase);
 
     //Scaling N = magnitude*cs*sqrt(k*cs/dt)  * dt
-    const Scalar NN = cs*sqrt(DCONST_REAL(AC_kaver)*cs);
+    const Scalar NN = cs*sqrt(AC_kaver*cs);
     //MV: Like in the Pencil Code. I don't understandf the logic here.
     force.x = sqrt(dt)*NN*force.x;
     force.y = sqrt(dt)*NN*force.y;
diff --git a/src/core/kernels/stencil_header.hh b/src/core/kernels/stencil_header.hh
index 798e3cc..f7de1ba 100644
--- a/src/core/kernels/stencil_header.hh
+++ b/src/core/kernels/stencil_header.hh
@@ -8,26 +8,116 @@
 
 #define AC_THERMAL_CONDUCTIVITY (AcReal(0.001)) // TODO: make an actual config parameter
 
-// Declare uniforms (i.e. device constants)
-uniform Scalar AC_cs2_sound;
-uniform Scalar AC_nu_visc;
-uniform Scalar AC_cp_sound;
-uniform Scalar AC_cv_sound;
-uniform Scalar AC_mu0;
-uniform Scalar AC_eta;
-uniform Scalar AC_gamma;
-uniform Scalar AC_zeta;
+// Int params
+uniform int AC_max_steps;
+uniform int AC_save_steps;
+uniform int AC_bin_steps;
+uniform int AC_bc_type;
 
+// Real params
+// Spacing
 uniform Scalar AC_dsx;
 uniform Scalar AC_dsy;
 uniform Scalar AC_dsz;
-
+uniform Scalar AC_dsmin;
+// physical grid
+uniform Scalar AC_xlen;
+uniform Scalar AC_ylen;
+uniform Scalar AC_zlen;
+uniform Scalar AC_xorig;
+uniform Scalar AC_yorig;
+uniform Scalar AC_zorig;
+// Physical units
+uniform Scalar AC_unit_density;
+uniform Scalar AC_unit_velocity;
+uniform Scalar AC_unit_length;
+// properties of gravitating star
+uniform Scalar AC_star_pos_x;
+uniform Scalar AC_star_pos_y;
+uniform Scalar AC_star_pos_z;
+uniform Scalar AC_M_star;
+//  Run params
+uniform Scalar AC_cdt;
+uniform Scalar AC_cdtv;
+uniform Scalar AC_cdts;
+uniform Scalar AC_nu_visc;
+uniform Scalar AC_cs_sound;
+uniform Scalar AC_eta;
+uniform Scalar AC_mu0;
+uniform Scalar AC_cp_sound;
+uniform Scalar AC_gamma;
+uniform Scalar AC_cv_sound;
 uniform Scalar AC_lnT0;
 uniform Scalar AC_lnrho0;
+uniform Scalar AC_zeta;
+uniform Scalar AC_trans;
+//  Other
+uniform Scalar AC_bin_save_t;
+//  Initial condition params
+uniform Scalar AC_ampl_lnrho;
+uniform Scalar AC_ampl_uu;
+uniform Scalar AC_angl_uu;
+uniform Scalar AC_lnrho_edge;
+uniform Scalar AC_lnrho_out;
+//  Forcing parameters. User configured.
+uniform Scalar AC_forcing_magnitude;
+uniform Scalar AC_relhel;
+uniform Scalar AC_kmin;
+uniform Scalar AC_kmax;
+//  Forcing parameters. Set by the generator.
+uniform Scalar AC_forcing_phase;
+uniform Scalar AC_k_forcex;
+uniform Scalar AC_k_forcey;
+uniform Scalar AC_k_forcez;
+uniform Scalar AC_kaver;
+uniform Scalar AC_ff_hel_rex;
+uniform Scalar AC_ff_hel_rey;
+uniform Scalar AC_ff_hel_rez;
+uniform Scalar AC_ff_hel_imx;
+uniform Scalar AC_ff_hel_imy;
+uniform Scalar AC_ff_hel_imz;
+//  Additional helper params  //  (deduced from other params do not set these directly!)
+uniform Scalar AC_G_CONST;
+uniform Scalar AC_GM_star;
+uniform Scalar AC_sq2GM_star;
+uniform Scalar AC_cs2_sound;
+uniform Scalar AC_inv_dsx;
+uniform Scalar AC_inv_dsy;
+uniform Scalar AC_inv_dsz;
 
-uniform int AC_nx_min;
-uniform int AC_ny_min;
-uniform int AC_nz_min;
-uniform int AC_nx;
-uniform int AC_ny;
-uniform int AC_nz;
+/*
+ * =============================================================================
+ * User-defined vertex buffers
+ * =============================================================================
+ */
+// clang-format off
+#if LENTROPY
+#define AC_FOR_VTXBUF_HANDLES(FUNC) \
+    FUNC(VTXBUF_LNRHO), \
+    FUNC(VTXBUF_UUX), \
+    FUNC(VTXBUF_UUY), \
+    FUNC(VTXBUF_UUZ), \
+    FUNC(VTXBUF_AX), \
+    FUNC(VTXBUF_AY), \
+    FUNC(VTXBUF_AZ), \
+    FUNC(VTXBUF_ENTROPY),
+#elif LMAGNETIC
+#define AC_FOR_VTXBUF_HANDLES(FUNC) \
+    FUNC(VTXBUF_LNRHO), \
+    FUNC(VTXBUF_UUX), \
+    FUNC(VTXBUF_UUY), \
+    FUNC(VTXBUF_UUZ), \
+    FUNC(VTXBUF_AX), \
+    FUNC(VTXBUF_AY), \
+    FUNC(VTXBUF_AZ),
+#elif LHYDRO
+#define AC_FOR_VTXBUF_HANDLES(FUNC) \
+    FUNC(VTXBUF_LNRHO), \
+    FUNC(VTXBUF_UUX), \
+    FUNC(VTXBUF_UUY), \
+    FUNC(VTXBUF_UUZ),
+#else
+#define AC_FOR_VTXBUF_HANDLES(FUNC) \
+    FUNC(VTXBUF_LNRHO),
+#endif
+// clang-format on

From d801ebdd41e5c893f96f5513f65f25df4194cbb0 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@aalto.fi>
Date: Mon, 19 Aug 2019 17:35:03 +0300
Subject: [PATCH 12/27] Now parameters and vertexbuffers (fields) can be
 declared with the DSL only. TODO: translation from the DSL header to C

---
 src/core/device.cu                 |  5 ++++
 src/core/kernels/stencil_header.hh | 46 +++++++++++++-----------------
 2 files changed, 25 insertions(+), 26 deletions(-)

diff --git a/src/core/device.cu b/src/core/device.cu
index 8b89a78..a3db98a 100644
--- a/src/core/device.cu
+++ b/src/core/device.cu
@@ -60,6 +60,11 @@ DCONST(const AcReal3Param param)
 {
     return d_mesh_info.real3_params[param];
 }
+constexpr VertexBufferHandle
+DCONST(const VertexBufferHandle handle)
+{
+    return handle;
+}
 #define DCONST_INT(x) DCONST(x)
 #define DCONST_INT3(x) DCONST(x)
 #define DCONST_REAL(x) DCONST(x)
diff --git a/src/core/kernels/stencil_header.hh b/src/core/kernels/stencil_header.hh
index f7de1ba..31667fc 100644
--- a/src/core/kernels/stencil_header.hh
+++ b/src/core/kernels/stencil_header.hh
@@ -90,34 +90,28 @@ uniform Scalar AC_inv_dsz;
  * User-defined vertex buffers
  * =============================================================================
  */
-// clang-format off
 #if LENTROPY
-#define AC_FOR_VTXBUF_HANDLES(FUNC) \
-    FUNC(VTXBUF_LNRHO), \
-    FUNC(VTXBUF_UUX), \
-    FUNC(VTXBUF_UUY), \
-    FUNC(VTXBUF_UUZ), \
-    FUNC(VTXBUF_AX), \
-    FUNC(VTXBUF_AY), \
-    FUNC(VTXBUF_AZ), \
-    FUNC(VTXBUF_ENTROPY),
+uniform ScalarField VTXBUF_LNRHO;
+uniform ScalarField VTXBUF_UUX;
+uniform ScalarField VTXBUF_UUY;
+uniform ScalarField VTXBUF_UUZ;
+uniform ScalarField VTXBUF_AX;
+uniform ScalarField VTXBUF_AY;
+uniform ScalarField VTXBUF_AZ;
+uniform ScalarField VTXBUF_ENTROPY;
 #elif LMAGNETIC
-#define AC_FOR_VTXBUF_HANDLES(FUNC) \
-    FUNC(VTXBUF_LNRHO), \
-    FUNC(VTXBUF_UUX), \
-    FUNC(VTXBUF_UUY), \
-    FUNC(VTXBUF_UUZ), \
-    FUNC(VTXBUF_AX), \
-    FUNC(VTXBUF_AY), \
-    FUNC(VTXBUF_AZ),
+uniform ScalarField VTXBUF_LNRHO;
+uniform ScalarField VTXBUF_UUX;
+uniform ScalarField VTXBUF_UUY;
+uniform ScalarField VTXBUF_UUZ;
+uniform ScalarField VTXBUF_AX;
+uniform ScalarField VTXBUF_AY;
+uniform ScalarField VTXBUF_AZ;
 #elif LHYDRO
-#define AC_FOR_VTXBUF_HANDLES(FUNC) \
-    FUNC(VTXBUF_LNRHO), \
-    FUNC(VTXBUF_UUX), \
-    FUNC(VTXBUF_UUY), \
-    FUNC(VTXBUF_UUZ),
+uniform ScalarField VTXBUF_LNRHO;
+uniform ScalarField VTXBUF_UUX;
+uniform ScalarField VTXBUF_UUY;
+uniform ScalarField VTXBUF_UUZ;
 #else
-#define AC_FOR_VTXBUF_HANDLES(FUNC) \
-    FUNC(VTXBUF_LNRHO),
+uniform ScalarField VTXBUF_LNRHO;
 #endif
-// clang-format on

From 51cf1f1068466d865e69228d2602a5099c6d6734 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@aalto.fi>
Date: Mon, 19 Aug 2019 18:19:28 +0300
Subject: [PATCH 13/27] The C header is now generated from the DSL, stashing
 the changes just to be sure since I might overwrite something when updating
 the compilation scripts to work with this new scheme

---
 acc/compile.sh                                |  12 +-
 acc/mhd_solver/stencil_defines.h              | 163 ------------------
 .../mhd_solver/stencil_definition.sdh         |   0
 acc/mhd_solver/stencil_process.sps            |   2 +-
 acc/src/acc.l                                 |   2 +-
 acc/src/acc.y                                 |   3 +-
 acc/src/code_generator.c                      |  79 ++++++++-
 config/astaroth.conf                          |   2 +-
 scripts/compile_acc.sh                        |  22 ++-
 9 files changed, 99 insertions(+), 186 deletions(-)
 delete mode 100644 acc/mhd_solver/stencil_defines.h
 rename src/core/kernels/stencil_header.hh => acc/mhd_solver/stencil_definition.sdh (100%)

diff --git a/acc/compile.sh b/acc/compile.sh
index 7f7c143..0fe6b4c 100755
--- a/acc/compile.sh
+++ b/acc/compile.sh
@@ -8,17 +8,21 @@ FILENAME="${FULL_NAME%.*}"
 EXTENSION="${FULL_NAME##*.}"
 
 if [ "${EXTENSION}" = "sas" ]; then
-    echo "Generating stencil assembly stage ${FILENAME}.sas -> stencil_assembly.cuh"
     COMPILE_FLAGS="-sas" # Generate stencil assembly stage
     CUH_FILENAME="stencil_assembly.cuh"
+    echo "Generating stencil assembly stage ${FILENAME}.sas -> ${CUH_FILENAME}"
 elif [ "${EXTENSION}" = "sps" ]; then
-    echo "Generating stencil processing stage:  ${FILENAME}.sps -> stencil_process.cuh"
     COMPILE_FLAGS="-sps" # Generate stencil processing stage
     CUH_FILENAME="stencil_process.cuh"
+    echo "Generating stencil processing stage:  ${FILENAME}.sps -> ${CUH_FILENAME}"
+elif [ "${EXTENSION}" = "sdh" ]; then
+    COMPILE_FLAGS="-sdh" # Generate stencil definition header
+    CUH_FILENAME="stencil_defines.h"
+    echo "Generating stencil definition header:  ${FILENAME}.sdh -> ${CUH_FILENAME}"
 else
     echo "Error: unknown extension" ${EXTENSION} "of file" ${FULL_NAME}
-    echo "Extension should be either .sas or .sps"
+    echo "Extension should be either .sas, .sps or .sdh"
     exit
 fi
 
-${ACC_DIR}/preprocess.sh $2 $1 | ${ACC_DIR}/build/acc ${COMPILE_FLAGS} > ${CUH_FILENAME}
+${ACC_DIR}/preprocess.sh $1 | ${ACC_DIR}/build/acc ${COMPILE_FLAGS} > ${CUH_FILENAME}
diff --git a/acc/mhd_solver/stencil_defines.h b/acc/mhd_solver/stencil_defines.h
deleted file mode 100644
index b4bc622..0000000
--- a/acc/mhd_solver/stencil_defines.h
+++ /dev/null
@@ -1,163 +0,0 @@
-/*
-    Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae.
-
-    This file is part of Astaroth.
-
-    Astaroth is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    Astaroth is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
-*/
-#pragma once
-
-/*
- * =============================================================================
- * Logical switches
- * =============================================================================
- */
-#define STENCIL_ORDER (6)
-#define NGHOST (STENCIL_ORDER / 2)
-#define LDENSITY (1)
-#define LHYDRO (1)
-#define LMAGNETIC (1)
-#define LENTROPY (1)
-#define LTEMPERATURE (0)
-#define LFORCING (1)
-#define LUPWD (1)
-
-#define AC_THERMAL_CONDUCTIVITY (AcReal(0.001)) // TODO: make an actual config parameter
-
-/*
- * =============================================================================
- * User-defined parameters
- * =============================================================================
- */
-// clang-format off
-#define AC_FOR_USER_INT_PARAM_TYPES(FUNC)\
-        /* Other */\
-        FUNC(AC_max_steps), \
-        FUNC(AC_save_steps), \
-        FUNC(AC_bin_steps), \
-        FUNC(AC_bc_type),
-
-#define AC_FOR_USER_INT3_PARAM_TYPES(FUNC)
-
-#define AC_FOR_USER_REAL_PARAM_TYPES(FUNC)\
-        /* cparams */\
-        FUNC(AC_dsx), \
-        FUNC(AC_dsy), \
-        FUNC(AC_dsz), \
-        FUNC(AC_dsmin), \
-        /* physical grid*/\
-        FUNC(AC_xlen), \
-        FUNC(AC_ylen), \
-        FUNC(AC_zlen), \
-        FUNC(AC_xorig), \
-        FUNC(AC_yorig), \
-        FUNC(AC_zorig), \
-        /*Physical units*/\
-        FUNC(AC_unit_density),\
-        FUNC(AC_unit_velocity),\
-        FUNC(AC_unit_length),\
-        /* properties of gravitating star*/\
-        FUNC(AC_star_pos_x),\
-        FUNC(AC_star_pos_y),\
-        FUNC(AC_star_pos_z),\
-        FUNC(AC_M_star),\
-        /* Run params */\
-        FUNC(AC_cdt), \
-        FUNC(AC_cdtv), \
-        FUNC(AC_cdts), \
-        FUNC(AC_nu_visc), \
-        FUNC(AC_cs_sound), \
-        FUNC(AC_eta), \
-        FUNC(AC_mu0), \
-        FUNC(AC_cp_sound), \
-        FUNC(AC_gamma), \
-        FUNC(AC_cv_sound), \
-        FUNC(AC_lnT0), \
-        FUNC(AC_lnrho0), \
-        FUNC(AC_zeta), \
-        FUNC(AC_trans),\
-        /* Other */\
-        FUNC(AC_bin_save_t), \
-        /* Initial condition params */\
-        FUNC(AC_ampl_lnrho), \
-        FUNC(AC_ampl_uu), \
-        FUNC(AC_angl_uu), \
-        FUNC(AC_lnrho_edge),\
-        FUNC(AC_lnrho_out),\
-	/* Forcing parameters. User configured. */\
-        FUNC(AC_forcing_magnitude),\
-        FUNC(AC_relhel), \
-        FUNC(AC_kmin), \
-        FUNC(AC_kmax), \
-	/* Forcing parameters. Set by the generator. */\
-        FUNC(AC_forcing_phase),\
-        FUNC(AC_k_forcex),\
-        FUNC(AC_k_forcey),\
-        FUNC(AC_k_forcez),\
-        FUNC(AC_kaver),\
-        FUNC(AC_ff_hel_rex),\
-        FUNC(AC_ff_hel_rey),\
-        FUNC(AC_ff_hel_rez),\
-        FUNC(AC_ff_hel_imx),\
-        FUNC(AC_ff_hel_imy),\
-        FUNC(AC_ff_hel_imz),\
-        /* Additional helper params */\
-        /* (deduced from other params do not set these directly!) */\
-        FUNC(AC_G_CONST),\
-        FUNC(AC_GM_star),\
-        FUNC(AC_sq2GM_star),\
-        FUNC(AC_cs2_sound), \
-        FUNC(AC_inv_dsx), \
-        FUNC(AC_inv_dsy), \
-        FUNC(AC_inv_dsz),
-
-#define AC_FOR_USER_REAL3_PARAM_TYPES(FUNC)
-// clang-format on
-
-/*
- * =============================================================================
- * User-defined vertex buffers
- * =============================================================================
- */
-// clang-format off
-#if LENTROPY
-#define AC_FOR_VTXBUF_HANDLES(FUNC) \
-    FUNC(VTXBUF_LNRHO), \
-    FUNC(VTXBUF_UUX), \
-    FUNC(VTXBUF_UUY), \
-    FUNC(VTXBUF_UUZ), \
-    FUNC(VTXBUF_AX), \
-    FUNC(VTXBUF_AY), \
-    FUNC(VTXBUF_AZ), \
-    FUNC(VTXBUF_ENTROPY),
-#elif LMAGNETIC
-#define AC_FOR_VTXBUF_HANDLES(FUNC) \
-    FUNC(VTXBUF_LNRHO), \
-    FUNC(VTXBUF_UUX), \
-    FUNC(VTXBUF_UUY), \
-    FUNC(VTXBUF_UUZ), \
-    FUNC(VTXBUF_AX), \
-    FUNC(VTXBUF_AY), \
-    FUNC(VTXBUF_AZ),
-#elif LHYDRO
-#define AC_FOR_VTXBUF_HANDLES(FUNC) \
-    FUNC(VTXBUF_LNRHO), \
-    FUNC(VTXBUF_UUX), \
-    FUNC(VTXBUF_UUY), \
-    FUNC(VTXBUF_UUZ),
-#else
-#define AC_FOR_VTXBUF_HANDLES(FUNC) \
-    FUNC(VTXBUF_LNRHO),
-#endif
-// clang-format on
diff --git a/src/core/kernels/stencil_header.hh b/acc/mhd_solver/stencil_definition.sdh
similarity index 100%
rename from src/core/kernels/stencil_header.hh
rename to acc/mhd_solver/stencil_definition.sdh
diff --git a/acc/mhd_solver/stencil_process.sps b/acc/mhd_solver/stencil_process.sps
index a2219b6..420193e 100644
--- a/acc/mhd_solver/stencil_process.sps
+++ b/acc/mhd_solver/stencil_process.sps
@@ -1,4 +1,4 @@
-#include "stencil_header.hh"
+#include "stencil_definition.sdh"
 
 
 Vector
diff --git a/acc/src/acc.l b/acc/src/acc.l
index b180ae8..76104d8 100644
--- a/acc/src/acc.l
+++ b/acc/src/acc.l
@@ -15,7 +15,7 @@ L [a-zA-Z_]
 "void"      { return VOID; } /* Rest of the types inherited from C */
 "int"       { return INT; }
 "int3"      { return INT3; }
-"ScalarField" { return SCALAR; }
+"ScalarField" { return SCALARFIELD; }
 "VectorField" { return VECTOR; }
 
 "Kernel"       { return KERNEL; } /* Function specifiers */
diff --git a/acc/src/acc.y b/acc/src/acc.y
index 7138b1b..0bd1d19 100644
--- a/acc/src/acc.y
+++ b/acc/src/acc.y
@@ -16,7 +16,7 @@ int yyget_lineno();
 %token CONSTANT IN OUT UNIFORM
 %token IDENTIFIER NUMBER
 %token RETURN
-%token SCALAR VECTOR MATRIX
+%token SCALAR VECTOR MATRIX SCALARFIELD
 %token VOID INT INT3
 %token IF ELSE FOR WHILE ELIF
 %token LEQU LAND LOR LLEQU
@@ -209,6 +209,7 @@ type_specifier: VOID
               | SCALAR                                                                  { $$ = astnode_create(NODE_TYPE_SPECIFIER, NULL, NULL); $$->token = SCALAR;  }
               | VECTOR                                                                  { $$ = astnode_create(NODE_TYPE_SPECIFIER, NULL, NULL); $$->token = VECTOR;  }
               | MATRIX                                                                  { $$ = astnode_create(NODE_TYPE_SPECIFIER, NULL, NULL); $$->token = MATRIX;  }
+              | SCALARFIELD                                                             { $$ = astnode_create(NODE_TYPE_SPECIFIER, NULL, NULL); $$->token = SCALARFIELD;  }
               ;
 
 identifier: IDENTIFIER                                                                  { $$ = astnode_create(NODE_IDENTIFIER, NULL, NULL); astnode_set_buffer(yytext, $$); }
diff --git a/acc/src/code_generator.c b/acc/src/code_generator.c
index 01cf1d1..c31ad86 100644
--- a/acc/src/code_generator.c
+++ b/acc/src/code_generator.c
@@ -54,12 +54,13 @@ static const char* translation_table[TRANSLATION_TABLE_SIZE] = {
     [WHILE] = "while",
     [FOR]   = "for",
     // Type specifiers
-    [VOID]   = "void",
-    [INT]    = "int",
-    [INT3]   = "int3",
-    [SCALAR] = "AcReal",
-    [VECTOR] = "AcReal3",
-    [MATRIX] = "AcMatrix",
+    [VOID]        = "void",
+    [INT]         = "int",
+    [INT3]        = "int3",
+    [SCALAR]      = "AcReal",
+    [VECTOR]      = "AcReal3",
+    [MATRIX]      = "AcMatrix",
+    [SCALARFIELD] = "AcReal",
     // Type qualifiers
     [KERNEL] = "template <int step_number>  static "
                "__global__", //__launch_bounds__(RK_THREADBLOCK_SIZE,
@@ -555,6 +556,68 @@ generate_preprocessed_structures(void)
     ");
 }
 
+static void
+generate_header(void)
+{
+    printf("\n#pragma once\n");
+
+    // Int params
+    printf("#define AC_FOR_USER_INT_PARAM_TYPES(FUNC)");
+    for (int i = 0; i < num_symbols; ++i) {
+        if (symbol_table[i].type_specifier == INT) {
+            printf("\\\nFUNC(%s),", symbol_table[i].identifier);
+        }
+    }
+    printf("\n\n");
+
+    // Int3 params
+    printf("#define AC_FOR_USER_INT3_PARAM_TYPES(FUNC)");
+    for (int i = 0; i < num_symbols; ++i) {
+        if (symbol_table[i].type_specifier == INT3) {
+            printf("\\\nFUNC(%s),", symbol_table[i].identifier);
+        }
+    }
+    printf("\n\n");
+
+    // Scalar params
+    printf("#define AC_FOR_USER_REAL_PARAM_TYPES(FUNC)");
+    for (int i = 0; i < num_symbols; ++i) {
+        if (symbol_table[i].type_specifier == SCALAR) {
+            printf("\\\nFUNC(%s),", symbol_table[i].identifier);
+        }
+    }
+    printf("\n\n");
+
+    // Vector params
+    printf("#define AC_FOR_USER_REAL3_PARAM_TYPES(FUNC)");
+    for (int i = 0; i < num_symbols; ++i) {
+        if (symbol_table[i].type_specifier == VECTOR) {
+            printf("\\\nFUNC(%s),", symbol_table[i].identifier);
+        }
+    }
+    printf("\n\n");
+
+    // Scalar fields
+    printf("#define AC_FOR_VTXBUF_HANDLES(FUNC)");
+    for (int i = 0; i < num_symbols; ++i) {
+        if (symbol_table[i].type_specifier == SCALARFIELD) {
+            printf("\\\nFUNC(%s),", symbol_table[i].identifier);
+        }
+    }
+    printf("\n\n");
+
+    /*
+    printf("\n");
+    printf("typedef struct {\n");
+    for (int i = 0; i < num_symbols; ++i) {
+        if (symbol_table[i].type_qualifier == PREPROCESSED)
+            printf("%s %s;\n", translate(symbol_table[i].type_specifier),
+                   symbol_table[i].identifier);
+    }
+    printf("} %sData;\n", translate(SCALAR));
+    */
+}
+
 int
 main(int argc, char** argv)
 {
@@ -563,7 +626,7 @@ main(int argc, char** argv)
             compilation_type = STENCIL_ASSEMBLY;
         else if (!strcmp(argv[1], "-sps"))
             compilation_type = STENCIL_PROCESS;
-        else if (!strcmp(argv[1], "-hh"))
+        else if (!strcmp(argv[1], "-sdh"))
             compilation_type = STENCIL_HEADER;
         else
             printf("Unknown flag %s. Generating stencil assembly.\n", argv[1]);
@@ -590,6 +653,8 @@ main(int argc, char** argv)
     traverse(root);
     if (compilation_type == STENCIL_ASSEMBLY)
         generate_preprocessed_structures();
+    else if (compilation_type == STENCIL_HEADER)
+        generate_header();
 
     // print_symbol_table();
 
diff --git a/config/astaroth.conf b/config/astaroth.conf
index 32f50a3..944364b 100644
--- a/config/astaroth.conf
+++ b/config/astaroth.conf
@@ -7,7 +7,7 @@
  */
 AC_nx = 128
 AC_ny = 128
-AC_nz = 128
+AC_nz = 6
 
 AC_dsx = 0.04908738521
 AC_dsy = 0.04908738521
diff --git a/scripts/compile_acc.sh b/scripts/compile_acc.sh
index 8648079..4649bf5 100755
--- a/scripts/compile_acc.sh
+++ b/scripts/compile_acc.sh
@@ -8,17 +8,18 @@ fi
 
 KERNEL_DIR=${AC_HOME}"/src/core/kernels"
 ACC_DIR=${AC_HOME}"/acc"
-ACC_DEFAULT_HEADER="mhd_solver/stencil_defines.h"
 ACC_DEFAULT_SAS="mhd_solver/stencil_assembly.sas"
 ACC_DEFAULT_SPS="mhd_solver/stencil_process.sps"
+ACC_DEFAULT_HEADER="mhd_solver/stencil_definition.sdh"
+ACC_DEFAULT_INCLUDE_DIR="mhd_solver"
 
 ${ACC_DIR}/clean.sh
 ${ACC_DIR}/build_acc.sh
 
-
-ACC_HEADER=${ACC_DEFAULT_HEADER}
 ACC_SAS=${ACC_DEFAULT_SAS}
 ACC_SPS=${ACC_DEFAULT_SPS}
+ACC_HEADER=${ACC_DEFAULT_HEADER}
+ACC_INCLUDE=${ACC_DEFAULT_INCLUDE_DIR}
 
 while [ "$#" -gt 0 ]
 do
@@ -56,9 +57,14 @@ echo "Header file:" ${ACC_DIR}/${ACC_HEADER}
 echo "Assembly file: ${ACC_DIR}/${ACC_SAS}"
 echo "Process file: ${ACC_DIR}/${ACC_SPS}"
 
-cd ${KERNEL_DIR}
-${ACC_DIR}/compile.sh ${ACC_DIR}/${ACC_SAS} ${ACC_DIR}/${ACC_HEADER}
-${ACC_DIR}/compile.sh ${ACC_DIR}/${ACC_SPS} ${ACC_DIR}/${ACC_HEADER}
+cd ${ACC_DIR}/${ACC_INCLUDE_DIR}
+${ACC_DIR}/compile.sh ${ACC_DIR}/${ACC_SAS}
+${ACC_DIR}/compile.sh ${ACC_DIR}/${ACC_SPS}
+${ACC_DIR}/compile.sh ${ACC_DIR}/${ACC_HEADER}
 
-echo "Linking: " ${ACC_DIR}/${ACC_HEADER} " -> " ${AC_HOME}/include/stencil_defines.h
-ln -sf ${ACC_DIR}/${ACC_HEADER} ${AC_HOME}/include/stencil_defines.h
+#mv ${ACC_SAS} ${AC_HOME}/src/core/kernels
+#mv ${ACC_SPS} ${AC_HOME}/src/core/kernels
+#mv ${ACC_HEADER} ${AC_HOME}/include
+
+#echo "Linking: " ${ACC_DIR}/${ACC_HEADER} " -> " ${AC_HOME}/include/stencil_defines.h
+#ln -sf ${ACC_DIR}/${ACC_HEADER} ${AC_HOME}/include/stencil_defines.h

From 5b7408eb55d90f11998ff9d8b9cbe04dec228241 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@aalto.fi>
Date: Mon, 19 Aug 2019 18:43:16 +0300
Subject: [PATCH 14/27] User config param overhaul complete, works. If I
 haven't missed anything, all fields and user parameters, and everything
 related to simulation can now be declared with the DSL. The only thing that
 you need to do is to fill the declared symbols with data, like with OpenGL
 and GLSL.

---
 acc/mhd_solver/stencil_assembly.sas |  8 +++++---
 acc/src/code_generator.c            |  5 +++--
 scripts/compile_acc.sh              | 15 +++++++++------
 src/standalone/model/model_rk3.cc   | 10 ++++++++++
 4 files changed, 27 insertions(+), 11 deletions(-)

diff --git a/acc/mhd_solver/stencil_assembly.sas b/acc/mhd_solver/stencil_assembly.sas
index 5c4f14a..e30d500 100644
--- a/acc/mhd_solver/stencil_assembly.sas
+++ b/acc/mhd_solver/stencil_assembly.sas
@@ -1,3 +1,5 @@
+#include "stencil_definition.sdh"
+
 Preprocessed Scalar
 value(in ScalarField vertex)
 {
@@ -17,7 +19,7 @@ gradient(in ScalarField vertex)
 Preprocessed Scalar
 der6x_upwd(in ScalarField vertex)
 {
-    Scalar inv_ds = DCONST_REAL(AC_inv_dsx);
+    Scalar inv_ds = AC_inv_dsx;
 
     return (Scalar){  Scalar(1.0/60.0)*inv_ds* (
                     - Scalar(20.0)* vertex[vertexIdx.x,   vertexIdx.y, vertexIdx.z]
@@ -32,7 +34,7 @@ der6x_upwd(in ScalarField vertex)
 Preprocessed Scalar
 der6y_upwd(in ScalarField vertex)
 {
-    Scalar inv_ds = DCONST_REAL(AC_inv_dsy);
+    Scalar inv_ds = AC_inv_dsy;
 
     return (Scalar){ Scalar(1.0/60.0)*inv_ds* (
                     -Scalar( 20.0)* vertex[vertexIdx.x,   vertexIdx.y, vertexIdx.z]
@@ -47,7 +49,7 @@ der6y_upwd(in ScalarField vertex)
 Preprocessed Scalar
 der6z_upwd(in ScalarField vertex)
 {
-    Scalar inv_ds = DCONST_REAL(AC_inv_dsz);
+    Scalar inv_ds = AC_inv_dsz;
 
     return (Scalar){ Scalar(1.0/60.0)*inv_ds* (
                     -Scalar( 20.0)* vertex[vertexIdx.x,   vertexIdx.y, vertexIdx.z]
diff --git a/acc/src/code_generator.c b/acc/src/code_generator.c
index c31ad86..e560a70 100644
--- a/acc/src/code_generator.c
+++ b/acc/src/code_generator.c
@@ -238,8 +238,9 @@ translate_latest_symbol(void)
     else if (symbol->type != SYMBOLTYPE_FUNCTION_PARAMETER &&
              (symbol->type_qualifier == IN || symbol->type_qualifier == OUT)) {
 
-        printf("static __device__ const %s %s%s", symbol->type_specifier == SCALAR ? "int" : "int3",
-               inout_name_prefix, symbol_table[handle].identifier);
+        printf("static __device__ const %s %s%s",
+               symbol->type_specifier == SCALARFIELD ? "int" : "int3", inout_name_prefix,
+               symbol_table[handle].identifier);
         if (symbol->type_specifier == VECTOR)
             printf(" = make_int3");
     }
diff --git a/scripts/compile_acc.sh b/scripts/compile_acc.sh
index 4649bf5..7ebbd4c 100755
--- a/scripts/compile_acc.sh
+++ b/scripts/compile_acc.sh
@@ -19,7 +19,7 @@ ${ACC_DIR}/build_acc.sh
 ACC_SAS=${ACC_DEFAULT_SAS}
 ACC_SPS=${ACC_DEFAULT_SPS}
 ACC_HEADER=${ACC_DEFAULT_HEADER}
-ACC_INCLUDE=${ACC_DEFAULT_INCLUDE_DIR}
+ACC_INCLUDE_DIR=${ACC_DEFAULT_INCLUDE_DIR}
 
 while [ "$#" -gt 0 ]
 do
@@ -58,13 +58,16 @@ echo "Assembly file: ${ACC_DIR}/${ACC_SAS}"
 echo "Process file: ${ACC_DIR}/${ACC_SPS}"
 
 cd ${ACC_DIR}/${ACC_INCLUDE_DIR}
+echo ${PWD}
 ${ACC_DIR}/compile.sh ${ACC_DIR}/${ACC_SAS}
 ${ACC_DIR}/compile.sh ${ACC_DIR}/${ACC_SPS}
 ${ACC_DIR}/compile.sh ${ACC_DIR}/${ACC_HEADER}
 
-#mv ${ACC_SAS} ${AC_HOME}/src/core/kernels
-#mv ${ACC_SPS} ${AC_HOME}/src/core/kernels
-#mv ${ACC_HEADER} ${AC_HOME}/include
+echo "Moving stencil_assembly.cuh -> ${AC_HOME}/src/core/kernels"
+mv stencil_assembly.cuh ${AC_HOME}/src/core/kernels
 
-#echo "Linking: " ${ACC_DIR}/${ACC_HEADER} " -> " ${AC_HOME}/include/stencil_defines.h
-#ln -sf ${ACC_DIR}/${ACC_HEADER} ${AC_HOME}/include/stencil_defines.h
+echo "Moving stencil_process.cuh -> ${AC_HOME}/src/core/kernels"
+mv stencil_process.cuh ${AC_HOME}/src/core/kernels
+
+echo "Moving stencil_defines.cuh -> ${AC_HOME}/include"
+mv stencil_defines.h ${AC_HOME}/include
diff --git a/src/standalone/model/model_rk3.cc b/src/standalone/model/model_rk3.cc
index 756e3a7..5fab4b4 100644
--- a/src/standalone/model/model_rk3.cc
+++ b/src/standalone/model/model_rk3.cc
@@ -31,6 +31,16 @@
 #include "host_memory.h"
 #include "model_boundconds.h"
 
+// Standalone flags
+#define LDENSITY (1)
+#define LHYDRO (1)
+#define LMAGNETIC (1)
+#define LENTROPY (1)
+#define LTEMPERATURE (0)
+#define LFORCING (1)
+#define LUPWD (1)
+#define AC_THERMAL_CONDUCTIVITY (AcReal(0.001)) // TODO: make an actual config parameter
+
 typedef struct {
     ModelScalar x, y, z;
 } ModelVector;

From 73d393e4190d8802fe03295cfd0adad655bf3f92 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@aalto.fi>
Date: Tue, 20 Aug 2019 18:40:38 +0300
Subject: [PATCH 15/27] Changed order for linking the MPI library to work
 around cmake error on CMP0004

---
 src/mpitest/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mpitest/CMakeLists.txt b/src/mpitest/CMakeLists.txt
index c64105d..ed19b4d 100644
--- a/src/mpitest/CMakeLists.txt
+++ b/src/mpitest/CMakeLists.txt
@@ -9,4 +9,4 @@ find_package(MPI REQUIRED)
 
 add_executable(mpitest main.c)
 target_include_directories(mpitest PRIVATE ${MPI_C_INCLUDE_PATH})
-target_link_libraries(mpitest PRIVATE ${MPI_C_LIBRARIES} astaroth_core)
+target_link_libraries(mpitest PRIVATE astaroth_core ${MPI_C_LIBRARIES})

From 5d2b658fb0680531e885fbb42ee8786af3f06c99 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@aalto.fi>
Date: Tue, 20 Aug 2019 18:41:26 +0300
Subject: [PATCH 16/27] Autoformatted the DSL files

---
 acc/mhd_solver/stencil_assembly.sas |  59 +++---
 acc/mhd_solver/stencil_process.sps  | 293 ++++++++++++++--------------
 2 files changed, 178 insertions(+), 174 deletions(-)

diff --git a/acc/mhd_solver/stencil_assembly.sas b/acc/mhd_solver/stencil_assembly.sas
index e30d500..85886e6 100644
--- a/acc/mhd_solver/stencil_assembly.sas
+++ b/acc/mhd_solver/stencil_assembly.sas
@@ -9,9 +9,7 @@ value(in ScalarField vertex)
 Preprocessed Vector
 gradient(in ScalarField vertex)
 {
-    return (Vector){derx(vertexIdx, vertex),
-                    dery(vertexIdx, vertex),
-                    derz(vertexIdx, vertex)};
+    return (Vector){derx(vertexIdx, vertex), dery(vertexIdx, vertex), derz(vertexIdx, vertex)};
 }
 
 #if LUPWD
@@ -21,14 +19,14 @@ der6x_upwd(in ScalarField vertex)
 {
     Scalar inv_ds = AC_inv_dsx;
 
-    return (Scalar){  Scalar(1.0/60.0)*inv_ds* (
-                    - Scalar(20.0)* vertex[vertexIdx.x,   vertexIdx.y, vertexIdx.z]
-                    + Scalar(15.0)*(vertex[vertexIdx.x+1, vertexIdx.y, vertexIdx.z]
-                    +               vertex[vertexIdx.x-1, vertexIdx.y, vertexIdx.z])
-                    - Scalar( 6.0)*(vertex[vertexIdx.x+2, vertexIdx.y, vertexIdx.z]
-                    +               vertex[vertexIdx.x-2, vertexIdx.y, vertexIdx.z])
-                    +               vertex[vertexIdx.x+3, vertexIdx.y, vertexIdx.z]
-                    +               vertex[vertexIdx.x-3, vertexIdx.y, vertexIdx.z])};
+    return (Scalar){Scalar(1.0 / 60.0) * inv_ds *
+                    (-Scalar(20.0) * vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z] +
+                     Scalar(15.0) * (vertex[vertexIdx.x + 1, vertexIdx.y, vertexIdx.z] +
+                                     vertex[vertexIdx.x - 1, vertexIdx.y, vertexIdx.z]) -
+                     Scalar(6.0) * (vertex[vertexIdx.x + 2, vertexIdx.y, vertexIdx.z] +
+                                    vertex[vertexIdx.x - 2, vertexIdx.y, vertexIdx.z]) +
+                     vertex[vertexIdx.x + 3, vertexIdx.y, vertexIdx.z] +
+                     vertex[vertexIdx.x - 3, vertexIdx.y, vertexIdx.z])};
 }
 
 Preprocessed Scalar
@@ -36,14 +34,14 @@ der6y_upwd(in ScalarField vertex)
 {
     Scalar inv_ds = AC_inv_dsy;
 
-    return (Scalar){ Scalar(1.0/60.0)*inv_ds* (
-                    -Scalar( 20.0)* vertex[vertexIdx.x,   vertexIdx.y, vertexIdx.z]
-                    +Scalar( 15.0)*(vertex[vertexIdx.x, vertexIdx.y+1, vertexIdx.z]
-                    +               vertex[vertexIdx.x, vertexIdx.y-1, vertexIdx.z])
-                    -Scalar(  6.0)*(vertex[vertexIdx.x, vertexIdx.y+2, vertexIdx.z]
-                    +               vertex[vertexIdx.x, vertexIdx.y-2, vertexIdx.z])
-                    +               vertex[vertexIdx.x, vertexIdx.y+3, vertexIdx.z]
-                    +               vertex[vertexIdx.x, vertexIdx.y-3, vertexIdx.z])};
+    return (Scalar){Scalar(1.0 / 60.0) * inv_ds *
+                    (-Scalar(20.0) * vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z] +
+                     Scalar(15.0) * (vertex[vertexIdx.x, vertexIdx.y + 1, vertexIdx.z] +
+                                     vertex[vertexIdx.x, vertexIdx.y - 1, vertexIdx.z]) -
+                     Scalar(6.0) * (vertex[vertexIdx.x, vertexIdx.y + 2, vertexIdx.z] +
+                                    vertex[vertexIdx.x, vertexIdx.y - 2, vertexIdx.z]) +
+                     vertex[vertexIdx.x, vertexIdx.y + 3, vertexIdx.z] +
+                     vertex[vertexIdx.x, vertexIdx.y - 3, vertexIdx.z])};
 }
 
 Preprocessed Scalar
@@ -51,14 +49,14 @@ der6z_upwd(in ScalarField vertex)
 {
     Scalar inv_ds = AC_inv_dsz;
 
-    return (Scalar){ Scalar(1.0/60.0)*inv_ds* (
-                    -Scalar( 20.0)* vertex[vertexIdx.x,   vertexIdx.y, vertexIdx.z]
-                    +Scalar( 15.0)*(vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z+1]
-                    +               vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z-1])
-                    -Scalar(  6.0)*(vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z+2]
-                    +               vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z-2])
-                    +               vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z+3]
-                    +               vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z-3])};
+    return (Scalar){Scalar(1.0 / 60.0) * inv_ds *
+                    (-Scalar(20.0) * vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z] +
+                     Scalar(15.0) * (vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z + 1] +
+                                     vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z - 1]) -
+                     Scalar(6.0) * (vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z + 2] +
+                                    vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z - 2]) +
+                     vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z + 3] +
+                     vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z - 3])};
 }
 
 #endif
@@ -68,9 +66,10 @@ hessian(in ScalarField vertex)
 {
     Matrix hessian;
 
-    hessian.row[0] = (Vector){derxx(vertexIdx, vertex), derxy(vertexIdx, vertex), derxz(vertexIdx, vertex)};
-    hessian.row[1] = (Vector){hessian.row[0].y,       deryy(vertexIdx, vertex), deryz(vertexIdx, vertex)};
-    hessian.row[2] = (Vector){hessian.row[0].z,       hessian.row[1].z,       derzz(vertexIdx, vertex)};
+    hessian.row[0] = (Vector){derxx(vertexIdx, vertex), derxy(vertexIdx, vertex),
+                              derxz(vertexIdx, vertex)};
+    hessian.row[1] = (Vector){hessian.row[0].y, deryy(vertexIdx, vertex), deryz(vertexIdx, vertex)};
+    hessian.row[2] = (Vector){hessian.row[0].z, hessian.row[1].z, derzz(vertexIdx, vertex)};
 
     return hessian;
 }
diff --git a/acc/mhd_solver/stencil_process.sps b/acc/mhd_solver/stencil_process.sps
index 420193e..e605b9f 100644
--- a/acc/mhd_solver/stencil_process.sps
+++ b/acc/mhd_solver/stencil_process.sps
@@ -1,6 +1,5 @@
 #include "stencil_definition.sdh"
 
-
 Vector
 value(in VectorField uu)
 {
@@ -14,7 +13,7 @@ upwd_der6(in VectorField uu, in ScalarField lnrho)
     Scalar uux = fabs(value(uu).x);
     Scalar uuy = fabs(value(uu).y);
     Scalar uuz = fabs(value(uu).z);
-    return (Scalar){uux*der6x_upwd(lnrho) + uuy*der6y_upwd(lnrho) + uuz*der6z_upwd(lnrho)};
+    return (Scalar){uux * der6x_upwd(lnrho) + uuy * der6y_upwd(lnrho) + uuz * der6z_upwd(lnrho)};
 }
 #endif
 
@@ -25,10 +24,11 @@ gradients(in VectorField uu)
 }
 
 Scalar
-continuity(in VectorField uu, in ScalarField lnrho) {
+continuity(in VectorField uu, in ScalarField lnrho)
+{
     return -dot(value(uu), gradient(lnrho))
 #if LUPWD
-           //This is a corrective hyperdiffusion term for upwinding.
+           // This is a corrective hyperdiffusion term for upwinding.
            + upwd_der6(uu, lnrho)
 #endif
            - divergence(uu);
@@ -36,133 +36,136 @@ continuity(in VectorField uu, in ScalarField lnrho) {
 
 #if LENTROPY
 Vector
-momentum(in VectorField uu, in ScalarField lnrho, in ScalarField ss, in VectorField aa) {
-    const Matrix S = stress_tensor(uu);
-    const Scalar cs2 = AC_cs2_sound * exp(AC_gamma * value(ss) / AC_cp_sound + (AC_gamma - 1) * (value(lnrho) - AC_lnrho0));
-    const Vector  j = (Scalar(1.) / AC_mu0) * (gradient_of_divergence(aa) - laplace_vec(aa)); // Current density
+momentum(in VectorField uu, in ScalarField lnrho, in ScalarField ss, in VectorField aa)
+{
+    const Matrix S   = stress_tensor(uu);
+    const Scalar cs2 = AC_cs2_sound * exp(AC_gamma * value(ss) / AC_cp_sound +
+                                          (AC_gamma - 1) * (value(lnrho) - AC_lnrho0));
+    const Vector j   = (Scalar(1.) / AC_mu0) *
+                     (gradient_of_divergence(aa) - laplace_vec(aa)); // Current density
     const Vector B = curl(aa);
-    //TODO: DOES INTHERMAL VERSTION INCLUDE THE MAGNETIC FIELD?
+    // TODO: DOES INTHERMAL VERSTION INCLUDE THE MAGNETIC FIELD?
     const Scalar inv_rho = Scalar(1.) / exp(value(lnrho));
 
     // Regex replace CPU constants with get\(AC_([a-zA-Z_0-9]*)\)
     // \1
-    const Vector mom = - mul(gradients(uu), value(uu))
-                                                       - cs2 * ((Scalar(1.) / AC_cp_sound) * gradient(ss) + gradient(lnrho))
-                                                       + inv_rho * cross(j, B)
-                                                       + AC_nu_visc * (
-                                                            laplace_vec(uu)
-                                                        + Scalar(1. / 3.) * gradient_of_divergence(uu)
-                                                        + Scalar(2.) * mul(S, gradient(lnrho))
-                                                        )
-                                                        + AC_zeta * gradient_of_divergence(uu);
+    const Vector mom = -mul(gradients(uu), value(uu)) -
+                       cs2 * ((Scalar(1.) / AC_cp_sound) * gradient(ss) + gradient(lnrho)) +
+                       inv_rho * cross(j, B) +
+                       AC_nu_visc *
+                           (laplace_vec(uu) + Scalar(1. / 3.) * gradient_of_divergence(uu) +
+                            Scalar(2.) * mul(S, gradient(lnrho))) +
+                       AC_zeta * gradient_of_divergence(uu);
     return mom;
 }
 #elif LTEMPERATURE
 Vector
-momentum(in VectorField uu, in ScalarField lnrho, in ScalarField tt) {
-  Vector mom;
+momentum(in VectorField uu, in ScalarField lnrho, in ScalarField tt)
+{
+    Vector mom;
 
-  const Matrix S = stress_tensor(uu);
+    const Matrix S = stress_tensor(uu);
 
-    const Vector pressure_term = (AC_cp_sound - AC_cv_sound) * (gradient(tt) + value(tt) * gradient(lnrho));
+    const Vector pressure_term = (AC_cp_sound - AC_cv_sound) *
+                                 (gradient(tt) + value(tt) * gradient(lnrho));
 
-  mom = -mul(gradients(uu), value(uu)) -
-    pressure_term +
-    AC_nu_visc *
-    (laplace_vec(uu) + Scalar(1. / 3.) * gradient_of_divergence(uu) +
-      Scalar(2.) * mul(S, gradient(lnrho))) + AC_zeta * gradient_of_divergence(uu);
+    mom = -mul(gradients(uu), value(uu)) - pressure_term +
+          AC_nu_visc * (laplace_vec(uu) + Scalar(1. / 3.) * gradient_of_divergence(uu) +
+                        Scalar(2.) * mul(S, gradient(lnrho))) +
+          AC_zeta * gradient_of_divergence(uu);
 
-  #if LGRAVITY
-  mom = mom - (Vector){0, 0, -10.0};
-  #endif
+#if LGRAVITY
+    mom = mom - (Vector){0, 0, -10.0};
+#endif
 
-  return mom;
+    return mom;
 }
 #else
 Vector
-momentum(in VectorField uu, in ScalarField lnrho) {
-  Vector mom;
+momentum(in VectorField uu, in ScalarField lnrho)
+{
+    Vector mom;
 
-  const Matrix S = stress_tensor(uu);
+    const Matrix S = stress_tensor(uu);
 
     // Isothermal: we have constant speed of sound
 
-  mom = -mul(gradients(uu), value(uu)) -
-    AC_cs2_sound * gradient(lnrho) +
-    AC_nu_visc *
-    (laplace_vec(uu) + Scalar(1. / 3.) * gradient_of_divergence(uu) +
-      Scalar(2.) * mul(S, gradient(lnrho))) + AC_zeta * gradient_of_divergence(uu);
+    mom = -mul(gradients(uu), value(uu)) - AC_cs2_sound * gradient(lnrho) +
+          AC_nu_visc * (laplace_vec(uu) + Scalar(1. / 3.) * gradient_of_divergence(uu) +
+                        Scalar(2.) * mul(S, gradient(lnrho))) +
+          AC_zeta * gradient_of_divergence(uu);
 
-  #if LGRAVITY
-  mom = mom - (Vector){0, 0, -10.0};
-  #endif
+#if LGRAVITY
+    mom = mom - (Vector){0, 0, -10.0};
+#endif
 
-  return mom;
+    return mom;
 }
 #endif
 
-
 Vector
-induction(in VectorField uu, in VectorField aa) {
-  // Note: We do (-nabla^2 A + nabla(nabla dot A)) instead of (nabla x (nabla
-  // x A)) in order to avoid taking the first derivative twice (did the math,
-  // yes this actually works. See pg.28 in arXiv:astro-ph/0109497)
-  // u cross B - AC_eta * AC_mu0 * (AC_mu0^-1 * [- laplace A + grad div A ])
-  const Vector B = curl(aa);
-  const Vector grad_div = gradient_of_divergence(aa);
-  const Vector lap = laplace_vec(aa);
+induction(in VectorField uu, in VectorField aa)
+{
+    // Note: We do (-nabla^2 A + nabla(nabla dot A)) instead of (nabla x (nabla
+    // x A)) in order to avoid taking the first derivative twice (did the math,
+    // yes this actually works. See pg.28 in arXiv:astro-ph/0109497)
+    // u cross B - AC_eta * AC_mu0 * (AC_mu0^-1 * [- laplace A + grad div A ])
+    const Vector B        = curl(aa);
+    const Vector grad_div = gradient_of_divergence(aa);
+    const Vector lap      = laplace_vec(aa);
 
-  // Note, AC_mu0 is cancelled out
-  const Vector ind = cross(value(uu), B) - AC_eta * (grad_div - lap);
+    // Note, AC_mu0 is cancelled out
+    const Vector ind = cross(value(uu), B) - AC_eta * (grad_div - lap);
 
-  return ind;
+    return ind;
 }
 
-
 #if LENTROPY
 Scalar
-lnT( in ScalarField ss, in ScalarField lnrho) {
-  const Scalar lnT = AC_lnT0 + AC_gamma * value(ss) / AC_cp_sound +
-    (AC_gamma - Scalar(1.)) * (value(lnrho) - AC_lnrho0);
-  return lnT;
+lnT(in ScalarField ss, in ScalarField lnrho)
+{
+    const Scalar lnT = AC_lnT0 + AC_gamma * value(ss) / AC_cp_sound +
+                       (AC_gamma - Scalar(1.)) * (value(lnrho) - AC_lnrho0);
+    return lnT;
 }
 
 // Nabla dot (K nabla T) / (rho T)
 Scalar
-heat_conduction( in ScalarField ss, in ScalarField lnrho) {
-  const Scalar inv_AC_cp_sound = AcReal(1.) / AC_cp_sound;
+heat_conduction(in ScalarField ss, in ScalarField lnrho)
+{
+    const Scalar inv_AC_cp_sound = AcReal(1.) / AC_cp_sound;
 
-  const Vector grad_ln_chi = - gradient(lnrho);
+    const Vector grad_ln_chi = -gradient(lnrho);
 
-  const Scalar first_term = AC_gamma * inv_AC_cp_sound * laplace(ss) +
-    (AC_gamma - AcReal(1.)) * laplace(lnrho);
-  const Vector second_term = AC_gamma * inv_AC_cp_sound * gradient(ss) +
-    (AC_gamma - AcReal(1.)) * gradient(lnrho);
-  const Vector third_term = AC_gamma * (inv_AC_cp_sound * gradient(ss) +
-    gradient(lnrho)) + grad_ln_chi;
+    const Scalar first_term = AC_gamma * inv_AC_cp_sound * laplace(ss) +
+                              (AC_gamma - AcReal(1.)) * laplace(lnrho);
+    const Vector second_term = AC_gamma * inv_AC_cp_sound * gradient(ss) +
+                               (AC_gamma - AcReal(1.)) * gradient(lnrho);
+    const Vector third_term = AC_gamma * (inv_AC_cp_sound * gradient(ss) + gradient(lnrho)) +
+                              grad_ln_chi;
 
-  const Scalar chi = AC_THERMAL_CONDUCTIVITY / (exp(value(lnrho)) * AC_cp_sound);
-  return AC_cp_sound * chi * (first_term + dot(second_term, third_term));
+    const Scalar chi = AC_THERMAL_CONDUCTIVITY / (exp(value(lnrho)) * AC_cp_sound);
+    return AC_cp_sound * chi * (first_term + dot(second_term, third_term));
 }
 
 Scalar
-heating(const int i, const int j, const int k) {
-  return 1;
+heating(const int i, const int j, const int k)
+{
+    return 1;
 }
 
 Scalar
-entropy(in ScalarField ss, in VectorField uu, in ScalarField lnrho, in VectorField aa) {
-    const Matrix S = stress_tensor(uu);
+entropy(in ScalarField ss, in VectorField uu, in ScalarField lnrho, in VectorField aa)
+{
+    const Matrix S      = stress_tensor(uu);
     const Scalar inv_pT = Scalar(1.) / (exp(value(lnrho)) * exp(lnT(ss, lnrho)));
-    const Vector  j = (Scalar(1.) / AC_mu0) * (gradient_of_divergence(aa) - laplace_vec(aa)); // Current density
-    const Scalar RHS = H_CONST - C_CONST
-                                                + AC_eta * (AC_mu0) * dot(j, j)
-                                                + Scalar(2.) * exp(value(lnrho)) * AC_nu_visc * contract(S)
-                                                + AC_zeta * exp(value(lnrho)) * divergence(uu) * divergence(uu);
+    const Vector j      = (Scalar(1.) / AC_mu0) *
+                     (gradient_of_divergence(aa) - laplace_vec(aa)); // Current density
+    const Scalar RHS = H_CONST - C_CONST + AC_eta * (AC_mu0)*dot(j, j) +
+                       Scalar(2.) * exp(value(lnrho)) * AC_nu_visc * contract(S) +
+                       AC_zeta * exp(value(lnrho)) * divergence(uu) * divergence(uu);
 
-    return - dot(value(uu), gradient(ss))
-                  + inv_pT * RHS
-                  + heat_conduction(ss, lnrho);
+    return -dot(value(uu), gradient(ss)) + inv_pT * RHS + heat_conduction(ss, lnrho);
 }
 #endif
 
@@ -170,14 +173,15 @@ entropy(in ScalarField ss, in VectorField uu, in ScalarField lnrho, in VectorFie
 Scalar
 heat_transfer(in VectorField uu, in ScalarField lnrho, in ScalarField tt)
 {
-    const Matrix S = stress_tensor(uu);
-    const Scalar heat_diffusivity_k = 0.0008; //8e-4;
-    return -dot(value(uu), gradient(tt)) + heat_diffusivity_k * laplace(tt) + heat_diffusivity_k * dot(gradient(lnrho), gradient(tt)) + AC_nu_visc * contract(S) * (Scalar(1.) / AC_cv_sound) - (AC_gamma - 1) * value(tt) * divergence(uu);
+    const Matrix S                  = stress_tensor(uu);
+    const Scalar heat_diffusivity_k = 0.0008; // 8e-4;
+    return -dot(value(uu), gradient(tt)) + heat_diffusivity_k * laplace(tt) +
+           heat_diffusivity_k * dot(gradient(lnrho), gradient(tt)) +
+           AC_nu_visc * contract(S) * (Scalar(1.) / AC_cv_sound) -
+           (AC_gamma - 1) * value(tt) * divergence(uu);
 }
 #endif
 
-
-
 #if LFORCING
 Vector
 simple_vortex_forcing(Vector a, Vector b, Scalar magnitude)
@@ -191,8 +195,8 @@ simple_outward_flow_forcing(Vector a, Vector b, Scalar magnitude)
     return magnitude * (1 / length(b - a)) * normalized(b - a); // Outward flow
 }
 
-
-// The Pencil Code forcing_hel_noshear(), manual Eq. 222, inspired forcing function with adjustable helicity
+// The Pencil Code forcing_hel_noshear(), manual Eq. 222, inspired forcing function with adjustable
+// helicity
 Vector
 helical_forcing(Scalar magnitude, Vector k_force, Vector xx, Vector ff_re, Vector ff_im, Scalar phi)
 {
@@ -206,24 +210,23 @@ helical_forcing(Scalar magnitude, Vector k_force, Vector xx, Vector ff_re, Vecto
     //    MV: Good idea. No an immediate priority.
     // Fun related article:
     // https://randomascii.wordpress.com/2014/10/09/intel-underestimates-error-bounds-by-1-3-quintillion/
-    xx.x = xx.x*(2.0*M_PI/(AC_dsx*globalGridN.x));
-    xx.y = xx.y*(2.0*M_PI/(AC_dsy*globalGridN.y));
-    xx.z = xx.z*(2.0*M_PI/(AC_dsz*globalGridN.z));
+    xx.x = xx.x * (2.0 * M_PI / (AC_dsx * globalGridN.x));
+    xx.y = xx.y * (2.0 * M_PI / (AC_dsy * globalGridN.y));
+    xx.z = xx.z * (2.0 * M_PI / (AC_dsz * globalGridN.z));
 
-    Scalar cos_phi = cos(phi);
-    Scalar sin_phi = sin(phi);
-    Scalar cos_k_dot_x     = cos(dot(k_force, xx));
-    Scalar sin_k_dot_x     = sin(dot(k_force, xx));
+    Scalar cos_phi     = cos(phi);
+    Scalar sin_phi     = sin(phi);
+    Scalar cos_k_dot_x = cos(dot(k_force, xx));
+    Scalar sin_k_dot_x = sin(dot(k_force, xx));
     // Phase affect only the x-component
-    //Scalar real_comp       = cos_k_dot_x;
-    //Scalar imag_comp       = sin_k_dot_x;
-    Scalar real_comp_phase = cos_k_dot_x*cos_phi - sin_k_dot_x*sin_phi;
-    Scalar imag_comp_phase = cos_k_dot_x*sin_phi + sin_k_dot_x*cos_phi;
+    // Scalar real_comp       = cos_k_dot_x;
+    // Scalar imag_comp       = sin_k_dot_x;
+    Scalar real_comp_phase = cos_k_dot_x * cos_phi - sin_k_dot_x * sin_phi;
+    Scalar imag_comp_phase = cos_k_dot_x * sin_phi + sin_k_dot_x * cos_phi;
 
-
-    Vector force = (Vector){ ff_re.x*real_comp_phase - ff_im.x*imag_comp_phase,
-                             ff_re.y*real_comp_phase - ff_im.y*imag_comp_phase,
-                             ff_re.z*real_comp_phase - ff_im.z*imag_comp_phase};
+    Vector force = (Vector){ff_re.x * real_comp_phase - ff_im.x * imag_comp_phase,
+                            ff_re.y * real_comp_phase - ff_im.y * imag_comp_phase,
+                            ff_re.z * real_comp_phase - ff_im.z * imag_comp_phase};
 
     return force;
 }
@@ -231,37 +234,39 @@ helical_forcing(Scalar magnitude, Vector k_force, Vector xx, Vector ff_re, Vecto
 Vector
 forcing(int3 globalVertexIdx, Scalar dt)
 {
-    Vector a = Scalar(.5) * (Vector){globalGridN.x * AC_dsx,
-                                     globalGridN.y * AC_dsy,
+    Vector a         = Scalar(.5) * (Vector){globalGridN.x * AC_dsx, globalGridN.y * AC_dsy,
                                      globalGridN.z * AC_dsz}; // source (origin)
-    Vector xx = (Vector){(globalVertexIdx.x - DCONST(AC_nx_min)) * AC_dsx,
-                        (globalVertexIdx.y - DCONST(AC_ny_min)) * AC_dsy,
-                        (globalVertexIdx.z - DCONST(AC_nz_min)) * AC_dsz}; // sink (current index)
+    Vector xx        = (Vector){(globalVertexIdx.x - DCONST(AC_nx_min)) * AC_dsx,
+                         (globalVertexIdx.y - DCONST(AC_ny_min)) * AC_dsy,
+                         (globalVertexIdx.z - DCONST(AC_nz_min)) * AC_dsz}; // sink (current index)
     const Scalar cs2 = AC_cs2_sound;
-    const Scalar cs = sqrt(cs2);
+    const Scalar cs  = sqrt(cs2);
 
-    //Placeholders until determined properly
+    // Placeholders until determined properly
     Scalar magnitude = AC_forcing_magnitude;
     Scalar phase     = AC_forcing_phase;
-    Vector k_force   = (Vector){  AC_k_forcex,   AC_k_forcey,   AC_k_forcez};
+    Vector k_force   = (Vector){AC_k_forcex, AC_k_forcey, AC_k_forcez};
     Vector ff_re     = (Vector){AC_ff_hel_rex, AC_ff_hel_rey, AC_ff_hel_rez};
     Vector ff_im     = (Vector){AC_ff_hel_imx, AC_ff_hel_imy, AC_ff_hel_imz};
 
+    // Determine that forcing funtion type at this point.
+    // Vector force = simple_vortex_forcing(a, xx, magnitude);
+    // Vector force = simple_outward_flow_forcing(a, xx, magnitude);
+    Vector force = helical_forcing(magnitude, k_force, xx, ff_re, ff_im, phase);
 
-    //Determine that forcing funtion type at this point.
-    //Vector force = simple_vortex_forcing(a, xx, magnitude);
-    //Vector force = simple_outward_flow_forcing(a, xx, magnitude);
-    Vector force   = helical_forcing(magnitude, k_force, xx, ff_re,ff_im, phase);
+    // Scaling N = magnitude*cs*sqrt(k*cs/dt)  * dt
+    const Scalar NN = cs * sqrt(AC_kaver * cs);
+    // MV: Like in the Pencil Code. I don't understandf the logic here.
+    force.x = sqrt(dt) * NN * force.x;
+    force.y = sqrt(dt) * NN * force.y;
+    force.z = sqrt(dt) * NN * force.z;
 
-    //Scaling N = magnitude*cs*sqrt(k*cs/dt)  * dt
-    const Scalar NN = cs*sqrt(AC_kaver*cs);
-    //MV: Like in the Pencil Code. I don't understandf the logic here.
-    force.x = sqrt(dt)*NN*force.x;
-    force.y = sqrt(dt)*NN*force.y;
-    force.z = sqrt(dt)*NN*force.z;
-
-    if (is_valid(force)) { return force; }
-    else                 { return (Vector){0, 0, 0}; }
+    if (is_valid(force)) {
+        return force;
+    }
+    else {
+        return (Vector){0, 0, 0};
+    }
 }
 #endif // LFORCING
 
@@ -271,12 +276,11 @@ in ScalarField lnrho(VTXBUF_LNRHO);
 out ScalarField out_lnrho(VTXBUF_LNRHO);
 
 in VectorField uu(VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
-out VectorField out_uu(VTXBUF_UUX,VTXBUF_UUY,VTXBUF_UUZ);
-
+out VectorField out_uu(VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
 
 #if LMAGNETIC
-in VectorField aa(VTXBUF_AX,VTXBUF_AY,VTXBUF_AZ);
-out VectorField out_aa(VTXBUF_AX,VTXBUF_AY,VTXBUF_AZ);
+in VectorField aa(VTXBUF_AX, VTXBUF_AY, VTXBUF_AZ);
+out VectorField out_aa(VTXBUF_AX, VTXBUF_AY, VTXBUF_AZ);
 #endif
 
 #if LENTROPY
@@ -290,26 +294,27 @@ out ScalarField out_tt(VTXBUF_TEMPERATURE);
 #endif
 
 Kernel void
-solve(Scalar dt) {
+solve(Scalar dt)
+{
     out_lnrho = rk3(out_lnrho, lnrho, continuity(uu, lnrho), dt);
 
-    #if LMAGNETIC
+#if LMAGNETIC
     out_aa = rk3(out_aa, aa, induction(uu, aa), dt);
-    #endif
+#endif
 
-    #if LENTROPY
-        out_uu  = rk3(out_uu, uu, momentum(uu, lnrho, ss, aa), dt);
-        out_ss  = rk3(out_ss, ss, entropy(ss, uu, lnrho, aa), dt);
-    #elif LTEMPERATURE
-        out_uu = rk3(out_uu, uu, momentum(uu, lnrho, tt), dt);
-        out_tt = rk3(out_tt, tt, heat_transfer(uu, lnrho, tt), dt);
-    #else
-        out_uu = rk3(out_uu, uu, momentum(uu, lnrho), dt);
-    #endif
+#if LENTROPY
+    out_uu = rk3(out_uu, uu, momentum(uu, lnrho, ss, aa), dt);
+    out_ss = rk3(out_ss, ss, entropy(ss, uu, lnrho, aa), dt);
+#elif LTEMPERATURE
+    out_uu = rk3(out_uu, uu, momentum(uu, lnrho, tt), dt);
+    out_tt = rk3(out_tt, tt, heat_transfer(uu, lnrho, tt), dt);
+#else
+    out_uu = rk3(out_uu, uu, momentum(uu, lnrho), dt);
+#endif
 
-    #if LFORCING
+#if LFORCING
     if (step_number == 2) {
         out_uu = out_uu + forcing(globalVertexIdx, dt);
     }
-    #endif
+#endif
 }

From 39dcda4a040f33797aeff2f28a62a3eff9ef4096 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@aalto.fi>
Date: Wed, 21 Aug 2019 14:28:46 +0300
Subject: [PATCH 17/27] Made warnings about unused functions go away (this is
 intended functionality and not all programs will use all types of device
 constants, thus unnecessary warning)

---
 src/core/device.cu | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/core/device.cu b/src/core/device.cu
index a3db98a..15f0c1e 100644
--- a/src/core/device.cu
+++ b/src/core/device.cu
@@ -40,22 +40,22 @@ typedef struct {
 } VertexBufferArray;
 
 __constant__ AcMeshInfo d_mesh_info;
-static inline int __device__
+static int __device__ __forceinline__
 DCONST(const AcIntParam param)
 {
     return d_mesh_info.int_params[param];
 }
-static inline int3 __device__
+static int3 __device__ __forceinline__
 DCONST(const AcInt3Param param)
 {
     return d_mesh_info.int3_params[param];
 }
-static inline AcReal __device__
+static AcReal __device__ __forceinline__
 DCONST(const AcRealParam param)
 {
     return d_mesh_info.real_params[param];
 }
-static inline AcReal3 __device__
+static AcReal3 __device__ __forceinline__
 DCONST(const AcReal3Param param)
 {
     return d_mesh_info.real3_params[param];
@@ -108,7 +108,7 @@ struct device_s {
 };
 
 // clang-format off
-static __global__ void dummy_kernel(void) {}
+static __global__ void dummy_kernel(void) { DCONST((AcIntParam)0); DCONST((AcInt3Param)0); DCONST((AcRealParam)0); DCONST((AcReal3Param)0); }
 // clang-format on
 
 AcResult

From 5867ff4b3e47a99863969a0a52264bd3c6f8c06d Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@aalto.fi>
Date: Wed, 21 Aug 2019 16:16:12 +0300
Subject: [PATCH 18/27] Stashing MPItest changes

---
 src/mpitest/CMakeLists.txt |   4 +-
 src/mpitest/main.c         | 148 +++++++++++++++++++++++++++++++++++--
 2 files changed, 142 insertions(+), 10 deletions(-)

diff --git a/src/mpitest/CMakeLists.txt b/src/mpitest/CMakeLists.txt
index ed19b4d..d6c5309 100644
--- a/src/mpitest/CMakeLists.txt
+++ b/src/mpitest/CMakeLists.txt
@@ -8,5 +8,5 @@ set(CMAKE_C_STANDARD_REQUIRED ON)
 find_package(MPI REQUIRED)
 
 add_executable(mpitest main.c)
-target_include_directories(mpitest PRIVATE ${MPI_C_INCLUDE_PATH})
-target_link_libraries(mpitest PRIVATE astaroth_core ${MPI_C_LIBRARIES})
+target_include_directories(mpitest PRIVATE ${CMAKE_SOURCE_DIR}/src/standalone ${MPI_C_INCLUDE_PATH})
+target_link_libraries(mpitest astaroth_core astaroth_standalone ${MPI_C_LIBRARIES})
diff --git a/src/mpitest/main.c b/src/mpitest/main.c
index a07522e..2ca34d3 100644
--- a/src/mpitest/main.c
+++ b/src/mpitest/main.c
@@ -16,13 +16,120 @@
     You should have received a copy of the GNU General Public License
     along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
 */
+/**
+    Running: mpirun -np <num processes> <executable>
+*/
+#undef NDEBUG // Assert always
+#include <assert.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <string.h>
 
 #include "astaroth.h"
+#include "autotest.h"
 
 #include <mpi.h>
 
+// From Astaroth Standalone
+#include "config_loader.h"
+#include "model/host_memory.h"
+
+static void
+distribute_mesh(const AcMesh* src, AcMesh* dst)
+{
+    MPI_Datatype datatype = MPI_FLOAT;
+    if (sizeof(AcReal) == 8)
+        datatype = MPI_DOUBLE;
+
+    int process_id, num_processes;
+    MPI_Comm_rank(MPI_COMM_WORLD, &process_id);
+    MPI_Comm_size(MPI_COMM_WORLD, &num_processes);
+
+    const size_t count = acVertexBufferSize(dst->info);
+    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
+
+        // Communicate to self
+        if (process_id == 0) {
+            assert(src);
+            assert(dst);
+            memcpy(&dst->vertex_buffer[i][0], //
+                   &src->vertex_buffer[i][0], //
+                   count * sizeof(src->vertex_buffer[i][0]));
+        }
+        // Communicate to others
+        for (int j = 1; j < num_processes; ++j) {
+            if (process_id == 0) {
+                assert(src);
+
+                // Send
+                // TODO RECHECK THESE j INDICES
+                const size_t src_idx = j * dst->info.int_params[AC_mx] *
+                                       dst->info.int_params[AC_my] * src->info.int_params[AC_nz] /
+                                       num_processes;
+
+                MPI_Send(&src->vertex_buffer[i][src_idx], count, datatype, j, 0, MPI_COMM_WORLD);
+            }
+            else {
+                assert(dst);
+
+                // Recv
+                const size_t dst_idx = 0;
+                MPI_Status status;
+                MPI_Recv(&dst->vertex_buffer[i][dst_idx], count, datatype, 0, 0, MPI_COMM_WORLD,
+                         &status);
+            }
+        }
+    }
+}
+
+static void
+gather_mesh(const AcMesh* src, AcMesh* dst)
+{
+    MPI_Datatype datatype = MPI_FLOAT;
+    if (sizeof(AcReal) == 8)
+        datatype = MPI_DOUBLE;
+
+    int process_id, num_processes;
+    MPI_Comm_rank(MPI_COMM_WORLD, &process_id);
+    MPI_Comm_size(MPI_COMM_WORLD, &num_processes);
+
+    size_t count = acVertexBufferSize(src->info);
+
+    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
+        // Communicate to self
+        if (process_id == 0) {
+            assert(src);
+            assert(dst);
+            memcpy(&dst->vertex_buffer[i][0], //
+                   &src->vertex_buffer[i][0], //
+                   count * sizeof(AcReal));
+        }
+
+        // Communicate to others
+        for (int j = 1; j < num_processes; ++j) {
+            if (process_id == 0) {
+                // Recv
+                // const size_t dst_idx = j * acVertexBufferCompdomainSize(dst->info);
+                const size_t dst_idx = j * dst->info.int_params[AC_mx] *
+                                       dst->info.int_params[AC_my] * dst->info.int_params[AC_nz] /
+                                       num_processes;
+
+                assert(dst_idx + count <= acVertexBufferSize(dst->info));
+                MPI_Status status;
+                MPI_Recv(&dst->vertex_buffer[i][dst_idx], count, datatype, j, 0, MPI_COMM_WORLD,
+                         &status);
+            }
+            else {
+                // Send
+                const size_t src_idx = 0;
+
+                assert(src_idx + count <= acVertexBufferSize(src->info));
+                MPI_Send(&src->vertex_buffer[i][src_idx], count, datatype, 0, 0, MPI_COMM_WORLD);
+            }
+        }
+    }
+}
+
 int
 main(void)
 {
@@ -37,14 +144,39 @@ main(void)
     MPI_Get_processor_name(processor_name, &name_len);
     printf("Processor %s. Process %d of %d.\n", processor_name, process_id, num_processes);
 
-    AcMeshInfo info = {
-        .int_params[AC_nx] = 128,
-        .int_params[AC_ny] = 64,
-        .int_params[AC_nz] = 32,
-    };
-    acInit(info);
-    acIntegrate(0.1f);
-    acQuit();
+    AcMeshInfo mesh_info;
+    load_config(&mesh_info);
+    update_config(&mesh_info);
+
+    AcMesh* main_mesh     = NULL;
+    ModelMesh* model_mesh = NULL;
+    if (process_id == 0) {
+        main_mesh = acmesh_create(mesh_info);
+        acmesh_init_to(INIT_TYPE_RANDOM, main_mesh);
+        model_mesh = modelmesh_create(mesh_info);
+        acmesh_to_modelmesh(*main_mesh, model_mesh);
+    }
+
+    AcMeshInfo submesh_info = mesh_info;
+    submesh_info.int_params[AC_nz] /= num_processes;
+    update_config(&submesh_info);
+
+    AcMesh* submesh = acmesh_create(submesh_info);
+
+    /////////////////////
+    distribute_mesh(main_mesh, submesh);
+    gather_mesh(submesh, main_mesh);
+    /////////////////////////
+    // Autotest
+    bool is_acceptable = verify_meshes(*model_mesh, *main_mesh);
+    /////
+
+    acmesh_destroy(submesh);
+
+    if (process_id == 0) {
+        modelmesh_destroy(model_mesh);
+        acmesh_destroy(main_mesh);
+    }
 
     MPI_Finalize();
     return EXIT_SUCCESS;

From d52e002e5d1756317927ff10d9774ce25bce561e Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@aalto.fi>
Date: Wed, 21 Aug 2019 16:18:48 +0300
Subject: [PATCH 19/27] Made Astaroth Standalone a library component (still
 works as before but can be included in other projects which need f.ex.
 autotesting)

---
 src/standalone/CMakeLists.txt | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/standalone/CMakeLists.txt b/src/standalone/CMakeLists.txt
index ea1d04c..0a61ede 100644
--- a/src/standalone/CMakeLists.txt
+++ b/src/standalone/CMakeLists.txt
@@ -25,10 +25,11 @@ add_compile_options(-pipe ${OpenMP_CXX_FLAGS})
 add_compile_options(-Wall -Wextra -Werror -Wdouble-promotion -Wfloat-conversion)# -Wshadow)
 
 ## Compile and link
-add_library(astaroth_standalone ${SOURCES})
+add_library(astaroth_standalone STATIC ${SOURCES})
+target_link_libraries(astaroth_standalone PRIVATE astaroth_core "${OpenMP_CXX_FLAGS}" ${SDL2_LIBRARY})
 
 add_executable(ac_run main.cc)
-target_link_libraries(ac_run PRIVATE astaroth_standalone astaroth_core "${OpenMP_CXX_FLAGS}" ${SDL2_LIBRARY})
+target_link_libraries(ac_run PRIVATE astaroth_standalone)
 
 # Define the config directory
 if (ALTER_CONF)

From f6040f89dc807b5bfb32518ad9b309cb25802cc7 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@aalto.fi>
Date: Wed, 21 Aug 2019 16:24:48 +0300
Subject: [PATCH 20/27] Added acPrintMeshInfo for printing all mesh parameters

---
 include/astaroth_defines.h      |  3 +++
 src/core/astaroth.cu            | 15 +++++++++++++++
 src/standalone/config_loader.cc | 11 +----------
 3 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/include/astaroth_defines.h b/include/astaroth_defines.h
index ee5d70c..0d340a0 100644
--- a/include/astaroth_defines.h
+++ b/include/astaroth_defines.h
@@ -215,6 +215,9 @@ acVertexBufferIdx(const int i, const int j, const int k, const AcMeshInfo info)
            k * info.int_params[AC_mx] * info.int_params[AC_my];
 }
 
+/** Prints all parameters inside AcMeshInfo */
+void acPrintMeshInfo(const AcMeshInfo config);
+
 /*
 static inline int
 acGetParam(const AcMeshInfo info, const AcIntParam param)
diff --git a/src/core/astaroth.cu b/src/core/astaroth.cu
index ea4ecaa..11956f7 100644
--- a/src/core/astaroth.cu
+++ b/src/core/astaroth.cu
@@ -36,6 +36,21 @@ const char* vtxbuf_names[]     = {AC_FOR_VTXBUF_HANDLES(AC_GEN_STR)};
 static const int num_nodes = 1;
 static Node nodes[num_nodes];
 
+void
+acPrintMeshInfo(const AcMeshInfo config)
+{
+    for (int i = 0; i < NUM_INT_PARAMS; ++i)
+        printf("[%s]: %d\n", intparam_names[i], config.int_params[i]);
+    for (int i = 0; i < NUM_INT3_PARAMS; ++i)
+        printf("[%s]: (%d, %d, %d)\n", int3param_names[i], config.int3_params[i].x,
+               config.int3_params[i].y, config.int3_params[i].z);
+    for (int i = 0; i < NUM_REAL_PARAMS; ++i)
+        printf("[%s]: %g\n", realparam_names[i], double(config.real_params[i]));
+    for (int i = 0; i < NUM_REAL3_PARAMS; ++i)
+        printf("[%s]: (%g, %g, %g)\n", real3param_names[i], double(config.real3_params[i].x),
+               double(config.real3_params[i].y), double(config.real3_params[i].z));
+}
+
 AcResult
 acInit(const AcMeshInfo mesh_info)
 {
diff --git a/src/standalone/config_loader.cc b/src/standalone/config_loader.cc
index eeb223e..36fb83a 100644
--- a/src/standalone/config_loader.cc
+++ b/src/standalone/config_loader.cc
@@ -34,15 +34,6 @@
 #include "src/core/errchk.h"
 #include "src/core/math_utils.h"
 
-static inline void
-print(const AcMeshInfo& config)
-{
-    for (int i = 0; i < NUM_INT_PARAMS; ++i)
-        printf("[%s]: %d\n", intparam_names[i], config.int_params[i]);
-    for (int i = 0; i < NUM_REAL_PARAMS; ++i)
-        printf("[%s]: %g\n", realparam_names[i], double(config.real_params[i]));
-}
-
 /**
  \brief Find the index of the keyword in names
  \return Index in range 0...n if the keyword is in names. -1 if the keyword was
@@ -155,7 +146,7 @@ update_config(AcMeshInfo* config)
 #if VERBOSE_PRINTING // Defined in astaroth.h
     printf("###############################################################\n");
     printf("Config dimensions recalculated:\n");
-    print(*config);
+    acPrintMeshInfo(*config);
     printf("###############################################################\n");
 #endif
 }

From 9fa39b66fc142da695caa33562e1f6972c9456a1 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@aalto.fi>
Date: Mon, 26 Aug 2019 18:33:50 +0300
Subject: [PATCH 21/27] Reverted an accidentally modified astaroth.conf to the
 same version as in master

---
 config/astaroth.conf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/config/astaroth.conf b/config/astaroth.conf
index 944364b..32f50a3 100644
--- a/config/astaroth.conf
+++ b/config/astaroth.conf
@@ -7,7 +7,7 @@
  */
 AC_nx = 128
 AC_ny = 128
-AC_nz = 6
+AC_nz = 128
 
 AC_dsx = 0.04908738521
 AC_dsy = 0.04908738521

From 0616f8938519baca840dc213b6e2c1a5175e3245 Mon Sep 17 00:00:00 2001
From: Miikka Vaisala <mvaisala@asiaa.sinica.edu.tw>
Date: Tue, 27 Aug 2019 11:06:37 +0800
Subject: [PATCH 22/27] Could be used in some documentation demonstrating
 domain decomposition.

---
 doc/domain_decomposition.svg | 391 +++++++++++++++++++++++++++++++++++
 1 file changed, 391 insertions(+)
 create mode 100644 doc/domain_decomposition.svg

diff --git a/doc/domain_decomposition.svg b/doc/domain_decomposition.svg
new file mode 100644
index 0000000..70527eb
--- /dev/null
+++ b/doc/domain_decomposition.svg
@@ -0,0 +1,391 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Created with Inkscape (http://www.inkscape.org/) -->
+
+<svg
+   xmlns:dc="http://purl.org/dc/elements/1.1/"
+   xmlns:cc="http://creativecommons.org/ns#"
+   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+   xmlns:svg="http://www.w3.org/2000/svg"
+   xmlns="http://www.w3.org/2000/svg"
+   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+   width="76.050636mm"
+   height="85.851082mm"
+   viewBox="0 0 76.050636 85.851082"
+   version="1.1"
+   id="svg8"
+   inkscape:version="0.92.3 (2405546, 2018-03-11)"
+   sodipodi:docname="domain_decomposition.svg"
+   inkscape:export-filename="/home/mvaisala/astaroth/doc/domain_decomposition.svg.png"
+   inkscape:export-xdpi="233.7916"
+   inkscape:export-ydpi="233.7916">
+  <defs
+     id="defs2">
+    <inkscape:perspective
+       sodipodi:type="inkscape:persp3d"
+       inkscape:vp_x="-34.999999 : 849.45444 : 1"
+       inkscape:vp_y="0 : 3779.5274 : 0"
+       inkscape:vp_z="758.70081 : 849.45444 : 1"
+       inkscape:persp3d-origin="361.85039 : 662.36784 : 1"
+       id="perspective3883" />
+    <inkscape:perspective
+       sodipodi:type="inkscape:persp3d"
+       inkscape:vp_x="-28.760416 : 819.37885 : 1"
+       inkscape:vp_y="0 : 3779.5274 : 0"
+       inkscape:vp_z="764.94038 : 819.37885 : 1"
+       inkscape:persp3d-origin="368.08997 : 632.29225 : 1"
+       id="perspective3817" />
+    <inkscape:perspective
+       sodipodi:type="inkscape:persp3d"
+       inkscape:vp_x="139.37224 : 246.52176 : 1"
+       inkscape:vp_y="90.281989 : 563.44101 : 1"
+       inkscape:vp_z="193.82452 : 233.87492 : 1"
+       inkscape:persp3d-origin="90.281989 : 216.71184 : 1"
+       id="perspective3715" />
+  </defs>
+  <sodipodi:namedview
+     id="base"
+     pagecolor="#ffffff"
+     bordercolor="#666666"
+     borderopacity="1.0"
+     inkscape:pageopacity="0.0"
+     inkscape:pageshadow="2"
+     inkscape:zoom="1.4"
+     inkscape:cx="65.802138"
+     inkscape:cy="225.49817"
+     inkscape:document-units="mm"
+     inkscape:current-layer="layer1"
+     showgrid="true"
+     inkscape:snap-grids="true"
+     fit-margin-top="0"
+     fit-margin-left="0"
+     fit-margin-right="0"
+     fit-margin-bottom="0"
+     inkscape:window-width="1920"
+     inkscape:window-height="1025"
+     inkscape:window-x="0"
+     inkscape:window-y="27"
+     inkscape:window-maximized="1">
+    <inkscape:grid
+       type="xygrid"
+       id="grid3923"
+       originx="-51.915482"
+       originy="-200.5164" />
+  </sodipodi:namedview>
+  <metadata
+     id="metadata5">
+    <rdf:RDF>
+      <cc:Work
+         rdf:about="">
+        <dc:format>image/svg+xml</dc:format>
+        <dc:type
+           rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+        <dc:title></dc:title>
+      </cc:Work>
+    </rdf:RDF>
+  </metadata>
+  <g
+     inkscape:label="Layer 1"
+     inkscape:groupmode="layer"
+     id="layer1"
+     transform="translate(-51.915483,-10.632538)">
+    <g
+       inkscape:corner7="-0.091655035 : 0.066875827 : 0.25 : 1"
+       inkscape:corner0="0.2311558 : 0.078514233 : 0 : 1"
+       inkscape:perspectiveID="#perspective3817"
+       id="g3815"
+       sodipodi:type="inkscape:box3d">
+      <path
+         points="345.49477,-34.871325 425.05445,-25.867869 425.05445,-55.566078 345.49477,-70.600089 "
+         d="m 345.49477,-70.600089 v 35.728764 l 79.55968,9.003456 v -29.698209 z"
+         inkscape:box3dsidetype="6"
+         style="fill:#353564;fill-rule:evenodd;stroke:none;stroke-linejoin:round"
+         id="path3803"
+         sodipodi:type="inkscape:box3dside" />
+      <path
+         points="537.05691,-76.198873 537.05691,-38.224283 425.05445,-25.867869 425.05445,-55.566078 "
+         d="m 425.05445,-55.566078 112.00246,-20.632795 v 37.97459 l -112.00246,12.356414 z"
+         inkscape:box3dsidetype="11"
+         style="fill:#e9e9ff;fill-rule:evenodd;stroke:none;stroke-linejoin:round"
+         id="path3805"
+         sodipodi:type="inkscape:box3dside" />
+      <path
+         points="460.049,-102.2544 537.05691,-76.198873 425.05445,-55.566078 345.49477,-70.600089 "
+         d="M 345.49477,-70.600089 460.049,-102.2544 l 77.00791,26.055527 -112.00246,20.632795 z"
+         inkscape:box3dsidetype="5"
+         style="fill:#4d4d9f;fill-rule:evenodd;stroke:none;stroke-linejoin:round"
+         id="path3807"
+         sodipodi:type="inkscape:box3dside" />
+      <path
+         points="460.049,-53.828225 537.05691,-38.224283 425.05445,-25.867869 345.49477,-34.871325 "
+         d="m 345.49477,-34.871325 114.55423,-18.9569 77.00791,15.603942 -112.00246,12.356414 z"
+         inkscape:box3dsidetype="13"
+         style="fill:#afafde;fill-rule:evenodd;stroke:none;stroke-linejoin:round"
+         id="path3809"
+         sodipodi:type="inkscape:box3dside" />
+      <path
+         points="460.049,-53.828225 537.05691,-38.224283 537.05691,-76.198873 460.049,-102.2544 "
+         d="m 460.049,-102.2544 v 48.426175 l 77.00791,15.603942 v -37.97459 z"
+         inkscape:box3dsidetype="14"
+         style="fill:#d7d7ff;fill-rule:evenodd;stroke:none;stroke-linejoin:round"
+         id="path3811"
+         sodipodi:type="inkscape:box3dside" />
+      <path
+         points="460.049,-102.2544 460.049,-53.828225 345.49477,-34.871325 345.49477,-70.600089 "
+         d="M 345.49477,-70.600089 460.049,-102.2544 v 48.426175 l -114.55423,18.9569 z"
+         inkscape:box3dsidetype="3"
+         style="fill:#8686bf;fill-rule:evenodd;stroke:none;stroke-linejoin:round"
+         id="path3813"
+         sodipodi:type="inkscape:box3dside" />
+    </g>
+    <g
+       inkscape:corner7="-0.067801071 : 0.080724505 : 0.25 : 1"
+       inkscape:corner0="0.29810879 : 0.093471507 : 0 : 1"
+       inkscape:perspectiveID="#perspective3883"
+       id="g3881"
+       sodipodi:type="inkscape:box3d">
+      <path
+         points="322.62977,-102.51699 401.43343,-87.835826 401.43343,-118.95615 322.62977,-139.63071 "
+         d="m 322.62977,-139.63071 v 37.11372 l 78.80366,14.681164 v -31.120324 z"
+         inkscape:box3dsidetype="6"
+         style="fill:#353564;fill-rule:evenodd;stroke:none;stroke-linejoin:round"
+         id="path3869"
+         sodipodi:type="inkscape:box3dside" />
+      <path
+         points="520.44801,-152.18313 520.44801,-111.43056 401.43343,-87.835826 401.43343,-118.95615 "
+         d="m 401.43343,-118.95615 119.01458,-33.22698 v 40.75257 l -119.01458,23.594734 z"
+         inkscape:box3dsidetype="11"
+         style="fill:#e9e9ff;fill-rule:evenodd;stroke:none;stroke-linejoin:round"
+         id="path3871"
+         sodipodi:type="inkscape:box3dside" />
+      <path
+         points="442.62977,-189.88386 520.44801,-152.18313 401.43343,-118.95615 322.62977,-139.63071 "
+         d="m 322.62977,-139.63071 120,-50.25315 77.81824,37.70073 -119.01458,33.22698 z"
+         inkscape:box3dsidetype="5"
+         style="fill:#4d4d9f;fill-rule:evenodd;stroke:none;stroke-linejoin:round"
+         id="path3873"
+         sodipodi:type="inkscape:box3dside" />
+      <path
+         points="442.62977,-138.20214 520.44801,-111.43056 401.43343,-87.835826 322.62977,-102.51699 "
+         d="m 322.62977,-102.51699 120,-35.68515 77.81824,26.77158 -119.01458,23.594734 z"
+         inkscape:box3dsidetype="13"
+         style="fill:#afafde;fill-rule:evenodd;stroke:none;stroke-linejoin:round"
+         id="path3875"
+         sodipodi:type="inkscape:box3dside" />
+      <path
+         points="442.62977,-138.20214 520.44801,-111.43056 520.44801,-152.18313 442.62977,-189.88386 "
+         d="m 442.62977,-189.88386 v 51.68172 l 77.81824,26.77158 v -40.75257 z"
+         inkscape:box3dsidetype="14"
+         style="fill:#d7d7ff;fill-rule:evenodd;stroke:none;stroke-linejoin:round"
+         id="path3877"
+         sodipodi:type="inkscape:box3dside" />
+      <path
+         points="442.62977,-189.88386 442.62977,-138.20214 322.62977,-102.51699 322.62977,-139.63071 "
+         d="m 322.62977,-139.63071 120,-50.25315 v 51.68172 l -120,35.68515 z"
+         inkscape:box3dsidetype="3"
+         style="fill:#8686bf;fill-rule:evenodd;stroke:none;stroke-linejoin:round"
+         id="path3879"
+         sodipodi:type="inkscape:box3dside" />
+    </g>
+    <g
+       id="g4791"
+       transform="translate(13.229163,-23.812503)"
+       style="fill:#ffffff;fill-opacity:1">
+      <rect
+         y="87.979164"
+         x="63.5"
+         height="10.583336"
+         width="42.333332"
+         id="rect3925"
+         style="fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.5;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" />
+      <rect
+         y="77.395836"
+         x="63.500004"
+         height="10.583337"
+         width="42.333332"
+         id="rect3925-2"
+         style="fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.5;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" />
+      <rect
+         y="66.8125"
+         x="63.499996"
+         height="10.583337"
+         width="42.333332"
+         id="rect3925-0"
+         style="fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.5;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" />
+      <rect
+         y="56.229168"
+         x="63.499996"
+         height="10.583337"
+         width="42.333332"
+         id="rect3925-23"
+         style="fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.5;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" />
+    </g>
+    <path
+       style="fill:none;stroke:#000000;stroke-width:0.5;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
+       d="m 60.854159,48.291665 15.875,-15.875"
+       id="path4862"
+       inkscape:connector-curvature="0" />
+    <path
+       style="fill:none;stroke:#000000;stroke-width:0.5;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
+       d="m 103.18749,48.291666 15.875,-15.875001"
+       id="path4862-1"
+       inkscape:connector-curvature="0" />
+    <path
+       style="fill:none;stroke:#000000;stroke-width:0.5;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
+       d="m 103.18749,90.624998 15.875,-15.875"
+       id="path4862-2"
+       inkscape:connector-curvature="0" />
+    <path
+       style="fill:none;stroke:#000000;stroke-width:0.5;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
+       d="m 103.18749,80.041668 15.875,-15.875"
+       id="path4862-9"
+       inkscape:connector-curvature="0" />
+    <path
+       style="fill:none;stroke:#000000;stroke-width:0.5;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
+       d="m 103.18749,69.458328 15.875,-15.875"
+       id="path4862-3"
+       inkscape:connector-curvature="0" />
+    <path
+       style="fill:none;stroke:#000000;stroke-width:0.5;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
+       d="m 103.18749,58.874998 15.875,-15.874999"
+       id="path4862-19"
+       inkscape:connector-curvature="0" />
+    <path
+       style="fill:none;stroke:#000000;stroke-width:0.5;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
+       d="m 60.854158,58.874998 15.875001,-15.875"
+       id="path4862-4"
+       inkscape:connector-curvature="0" />
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:10.58333302px;line-height:1.25;font-family:'URW Gothic L';-inkscape-font-specification:'URW Gothic L';letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.26458332"
+       x="90.045952"
+       y="15.407738"
+       id="text4926"><tspan
+         sodipodi:role="line"
+         x="90.926521"
+         y="15.407738"
+         style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:6.3499999px;line-height:10.58333302px;font-family:'URW Gothic L';-inkscape-font-specification:'URW Gothic L';text-align:center;text-anchor:middle;stroke-width:0.26458332"
+         id="tspan4928">Astaroth multi-GPU node </tspan><tspan
+         sodipodi:role="line"
+         x="90.045952"
+         y="28.636904"
+         style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:6.3499999px;line-height:10.58333302px;font-family:'URW Gothic L';-inkscape-font-specification:'URW Gothic L';text-align:center;text-anchor:middle;stroke-width:0.26458332"
+         id="tspan4930">domain decomposition</tspan></text>
+  </g>
+  <g
+     inkscape:groupmode="layer"
+     id="layer2"
+     inkscape:label="Layer 2"
+     transform="translate(-51.915483,-10.632538)">
+    <g
+       id="g4791-8"
+       transform="translate(-2.6458431,-7.9375045)"
+       style="fill:#ffffff;fill-opacity:1">
+      <rect
+         y="87.979164"
+         x="63.5"
+         height="10.583336"
+         width="42.333332"
+         id="rect3925-9"
+         style="fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.5;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" />
+      <rect
+         y="77.395836"
+         x="63.500004"
+         height="10.583337"
+         width="42.333332"
+         id="rect3925-2-7"
+         style="fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.5;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" />
+      <rect
+         y="66.8125"
+         x="63.499996"
+         height="10.583337"
+         width="42.333332"
+         id="rect3925-0-3"
+         style="fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.5;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" />
+      <rect
+         y="56.229168"
+         x="63.499996"
+         height="10.583337"
+         width="42.333332"
+         id="rect3925-23-6"
+         style="fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.5;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" />
+    </g>
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:10.58333302px;line-height:1.25;font-family:'URW Gothic L';-inkscape-font-specification:'URW Gothic L';letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.26458332"
+       x="76.351189"
+       y="55.28421"
+       id="text4926-2"><tspan
+         sodipodi:role="line"
+         x="76.351189"
+         y="55.28421"
+         style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:4.23333311px;font-family:'URW Gothic L';-inkscape-font-specification:'URW Gothic L';stroke-width:0.26458332"
+         id="tspan4930-0">GPU 0</tspan></text>
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:10.58333302px;line-height:1.25;font-family:'URW Gothic L';-inkscape-font-specification:'URW Gothic L';letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.26458332"
+       x="75.973213"
+       y="66.245522"
+       id="text4926-2-9"><tspan
+         sodipodi:role="line"
+         x="75.973213"
+         y="66.245522"
+         style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:4.23333311px;font-family:'URW Gothic L';-inkscape-font-specification:'URW Gothic L';stroke-width:0.26458332"
+         id="tspan4930-0-9">GPU 1</tspan></text>
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:10.58333302px;line-height:1.25;font-family:'URW Gothic L';-inkscape-font-specification:'URW Gothic L';letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.26458332"
+       x="75.784225"
+       y="76.072899"
+       id="text4926-2-4"><tspan
+         sodipodi:role="line"
+         x="75.784225"
+         y="76.072899"
+         style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:4.23333311px;font-family:'URW Gothic L';-inkscape-font-specification:'URW Gothic L';stroke-width:0.26458332"
+         id="tspan4930-0-5">GPU 2</tspan></text>
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:10.58333302px;line-height:1.25;font-family:'URW Gothic L';-inkscape-font-specification:'URW Gothic L';letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.26458332"
+       x="75.973213"
+       y="86.656235"
+       id="text4926-2-1"><tspan
+         sodipodi:role="line"
+         x="75.973213"
+         y="86.656235"
+         style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:4.23333311px;font-family:'URW Gothic L';-inkscape-font-specification:'URW Gothic L';stroke-width:0.26458332"
+         id="tspan4930-0-0">GPU 3</tspan></text>
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:10.58333302px;line-height:1.25;font-family:'URW Gothic L';-inkscape-font-specification:'URW Gothic L';letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.26458332"
+       x="51.59375"
+       y="70.970222"
+       id="text4926-2-3"><tspan
+         sodipodi:role="line"
+         x="51.59375"
+         y="70.970222"
+         style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:4.23333311px;font-family:'URW Gothic L';-inkscape-font-specification:'URW Gothic L';stroke-width:0.26458332"
+         id="tspan4930-0-7">Nz</tspan></text>
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:10.58333302px;line-height:1.25;font-family:'URW Gothic L';-inkscape-font-specification:'URW Gothic L';letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.26458332"
+       x="79.563988"
+       y="96.48362"
+       id="text4926-2-3-8"><tspan
+         sodipodi:role="line"
+         x="79.563988"
+         y="96.48362"
+         style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:4.23333311px;font-family:'URW Gothic L';-inkscape-font-specification:'URW Gothic L';stroke-width:0.26458332"
+         id="tspan4930-0-7-8">Nx</tspan></text>
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:10.58333302px;line-height:1.25;font-family:'URW Gothic L';-inkscape-font-specification:'URW Gothic L';letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.26458332"
+       x="114.71577"
+       y="87.223206"
+       id="text4926-2-3-6"><tspan
+         sodipodi:role="line"
+         x="114.71577"
+         y="87.223206"
+         style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:4.23333311px;font-family:'URW Gothic L';-inkscape-font-specification:'URW Gothic L';stroke-width:0.26458332"
+         id="tspan4930-0-7-0">Ny</tspan></text>
+  </g>
+</svg>

From 20138263f41bbd1a5ce377cb91a8f7bd061d0f22 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@aalto.fi>
Date: Tue, 27 Aug 2019 17:36:33 +0300
Subject: [PATCH 23/27] The previous attempt
 (dsl_feature_completeness_2019-08-23) to enable arbitrary kernel functions
 was a failure: we get significant performance loss (25-100%) if step_number
 is not passed as a template parameter to the integration kernel. Apparently
 the CUDA compiler cannot perform some optimizations if there is a if/else
 construct in a performance-critical part which cannot be evaluated at compile
 time. This branch keeps step_number as a template parameter but takes rest of
 the user parameters as uniforms (dt is no longer passed as a function
 parameter but as an uniform with the DSL instead).

---
 acc/mhd_solver/stencil_definition.sdh |  1 +
 acc/mhd_solver/stencil_process.sps    |  3 ++-
 acc/src/code_generator.c              | 18 ++++++++++++------
 src/core/device.cu                    | 10 ++++++----
 4 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/acc/mhd_solver/stencil_definition.sdh b/acc/mhd_solver/stencil_definition.sdh
index 31667fc..c4324e5 100644
--- a/acc/mhd_solver/stencil_definition.sdh
+++ b/acc/mhd_solver/stencil_definition.sdh
@@ -15,6 +15,7 @@ uniform int AC_bin_steps;
 uniform int AC_bc_type;
 
 // Real params
+uniform Scalar AC_dt;
 // Spacing
 uniform Scalar AC_dsx;
 uniform Scalar AC_dsy;
diff --git a/acc/mhd_solver/stencil_process.sps b/acc/mhd_solver/stencil_process.sps
index e605b9f..0db62e0 100644
--- a/acc/mhd_solver/stencil_process.sps
+++ b/acc/mhd_solver/stencil_process.sps
@@ -294,8 +294,9 @@ out ScalarField out_tt(VTXBUF_TEMPERATURE);
 #endif
 
 Kernel void
-solve(Scalar dt)
+solve()
 {
+    Scalar dt = AC_dt;
     out_lnrho = rk3(out_lnrho, lnrho, continuity(uu, lnrho), dt);
 
 #if LMAGNETIC
diff --git a/acc/src/code_generator.c b/acc/src/code_generator.c
index e560a70..b486a6d 100644
--- a/acc/src/code_generator.c
+++ b/acc/src/code_generator.c
@@ -62,9 +62,9 @@ static const char* translation_table[TRANSLATION_TABLE_SIZE] = {
     [MATRIX]      = "AcMatrix",
     [SCALARFIELD] = "AcReal",
     // Type qualifiers
-    [KERNEL] = "template <int step_number>  static "
-               "__global__", //__launch_bounds__(RK_THREADBLOCK_SIZE,
-                             // RK_LAUNCH_BOUND_MIN_BLOCKS),
+    [KERNEL] = "template <int step_number>  static __global__",
+    //__launch_bounds__(RK_THREADBLOCK_SIZE,
+    // RK_LAUNCH_BOUND_MIN_BLOCKS),
     [PREPROCESSED] = "static __device__ "
                      "__forceinline__",
     [CONSTANT] = "const",
@@ -318,9 +318,15 @@ traverse(const ASTNode* node)
         inside_kernel = true;
 
     // Kernel parameter boilerplate
-    const char* kernel_parameter_boilerplate = "GEN_KERNEL_PARAM_BOILERPLATE, ";
-    if (inside_kernel && node->type == NODE_FUNCTION_PARAMETER_DECLARATION)
-        printf("%s ", kernel_parameter_boilerplate);
+    const char* kernel_parameter_boilerplate = "GEN_KERNEL_PARAM_BOILERPLATE";
+    if (inside_kernel && node->type == NODE_FUNCTION_PARAMETER_DECLARATION) {
+        printf("%s", kernel_parameter_boilerplate);
+
+        if (node->lhs != NULL) {
+            printf("Compilation error: function parameters for Kernel functions not allowed!\n");
+            exit(EXIT_FAILURE);
+        }
+    }
 
     // Kernel builtin variables boilerplate (read input/output arrays and setup
     // indices)
diff --git a/src/core/device.cu b/src/core/device.cu
index 15f0c1e..12d4087 100644
--- a/src/core/device.cu
+++ b/src/core/device.cu
@@ -303,8 +303,9 @@ acDeviceAutoOptimize(const Device device)
 
                 cudaEventRecord(tstart); // ---------------------------------------- Timing start
 
+                acDeviceLoadScalarConstant(device, STREAM_DEFAULT, AC_dt, FLT_EPSILON);
                 for (int i = 0; i < num_iterations; ++i)
-                    solve<2><<<bpg, tpb>>>(start, end, device->vba, FLT_EPSILON);
+                    solve<2><<<bpg, tpb>>>(start, end, device->vba);
 
                 cudaEventRecord(tstop); // ----------------------------------------- Timing end
                 cudaEventSynchronize(tstop);
@@ -600,12 +601,13 @@ acDeviceIntegrateSubstep(const Device device, const Stream stream, const int ste
                    (unsigned int)ceil(n.y / AcReal(tpb.y)), //
                    (unsigned int)ceil(n.z / AcReal(tpb.z)));
 
+    acDeviceLoadScalarConstant(device, stream, AC_dt, dt);
     if (step_number == 0)
-        solve<0><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba, dt);
+        solve<0><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba);
     else if (step_number == 1)
-        solve<1><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba, dt);
+        solve<1><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba);
     else
-        solve<2><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba, dt);
+        solve<2><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba);
 
     ERRCHK_CUDA_KERNEL();
 

From 230230ead94d741f9eb86e93dbf32c83b91d738b Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@aalto.fi>
Date: Tue, 27 Aug 2019 18:15:30 +0300
Subject: [PATCH 24/27] Added a script for preprocessing the device files.
 Useful for inspecting whether the DSL code is generated correctly.

---
 scripts/preprocess_device_files.sh | 1 +
 1 file changed, 1 insertion(+)
 create mode 100755 scripts/preprocess_device_files.sh

diff --git a/scripts/preprocess_device_files.sh b/scripts/preprocess_device_files.sh
new file mode 100755
index 0000000..0ef61f9
--- /dev/null
+++ b/scripts/preprocess_device_files.sh
@@ -0,0 +1 @@
+nvcc -E ../src/core/device.cu -I ../include -I ../ > preprocessed_device_files.pp

From 6ea02fa28e2086a9e59ff00c54d6fe9397dac4d8 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@aalto.fi>
Date: Tue, 27 Aug 2019 18:19:20 +0300
Subject: [PATCH 25/27] DSL now 'feature complete' with respect to what I had
 in mind before the summer. Users can now create multiple kernels and the
 library functions are generated automatically for them. The generated library
 functions are of the form acDeviceKernel_<name> and acNodeKernel_<name>. More
 features are needed though. The next features to be added at some point are
 1D and 2D device constant arrays in order to support profiles for f.ex.
 forcing.

---
 acc/src/code_generator.c         | 13 +++++++++
 src/core/device.cu               | 36 +++++++++++------------
 src/core/kernels/integration.cuh | 50 ++++++++++++++++++++++++++++++++
 3 files changed, 81 insertions(+), 18 deletions(-)

diff --git a/acc/src/code_generator.c b/acc/src/code_generator.c
index b486a6d..864d2ec 100644
--- a/acc/src/code_generator.c
+++ b/acc/src/code_generator.c
@@ -625,6 +625,17 @@ generate_header(void)
     */
 }
 
+static void
+generate_library_hooks(void)
+{
+    for (int i = 0; i < num_symbols; ++i) {
+        if (symbol_table[i].type_qualifier == KERNEL) {
+            printf("GEN_DEVICE_FUNC_HOOK(%s)\n", symbol_table[i].identifier);
+            // printf("GEN_NODE_FUNC_HOOK(%s)\n", symbol_table[i].identifier);
+        }
+    }
+}
+
 int
 main(int argc, char** argv)
 {
@@ -662,6 +673,8 @@ main(int argc, char** argv)
         generate_preprocessed_structures();
     else if (compilation_type == STENCIL_HEADER)
         generate_header();
+    else if (compilation_type == STENCIL_PROCESS)
+        generate_library_hooks();
 
     // print_symbol_table();
 
diff --git a/src/core/device.cu b/src/core/device.cu
index 12d4087..78e82d7 100644
--- a/src/core/device.cu
+++ b/src/core/device.cu
@@ -39,6 +39,24 @@ typedef struct {
     AcReal* out[NUM_VTXBUF_HANDLES];
 } VertexBufferArray;
 
+struct device_s {
+    int id;
+    AcMeshInfo local_config;
+
+    // Concurrency
+    cudaStream_t streams[NUM_STREAM_TYPES];
+
+    // Memory
+    VertexBufferArray vba;
+    AcReal* reduce_scratchpad;
+    AcReal* reduce_result;
+
+#if PACKED_DATA_TRANSFERS
+// Declare memory for buffers needed for packed data transfers here
+// AcReal* data_packing_buffer;
+#endif
+};
+
 __constant__ AcMeshInfo d_mesh_info;
 static int __device__ __forceinline__
 DCONST(const AcIntParam param)
@@ -89,24 +107,6 @@ static dim3 rk3_tpb(32, 1, 4);
 // #include "kernels/pack_unpack.cuh"
 #endif
 
-struct device_s {
-    int id;
-    AcMeshInfo local_config;
-
-    // Concurrency
-    cudaStream_t streams[NUM_STREAM_TYPES];
-
-    // Memory
-    VertexBufferArray vba;
-    AcReal* reduce_scratchpad;
-    AcReal* reduce_result;
-
-#if PACKED_DATA_TRANSFERS
-// Declare memory for buffers needed for packed data transfers here
-// AcReal* data_packing_buffer;
-#endif
-};
-
 // clang-format off
 static __global__ void dummy_kernel(void) { DCONST((AcIntParam)0); DCONST((AcInt3Param)0); DCONST((AcRealParam)0); DCONST((AcReal3Param)0); }
 // clang-format on
diff --git a/src/core/kernels/integration.cuh b/src/core/kernels/integration.cuh
index 1b62d1e..0a06c2f 100644
--- a/src/core/kernels/integration.cuh
+++ b/src/core/kernels/integration.cuh
@@ -585,4 +585,54 @@ read_out(const int idx, AcReal* __restrict__ field[], const int3 handle)
                                                                                                    \
     const int idx = IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z);
 
+// clang-format off
+#define GEN_DEVICE_FUNC_HOOK(identifier)                                                           \
+    template <int step_number>                                                                     \
+    AcResult acDeviceKernel_##identifier(const Device device, const Stream stream,                 \
+                                         const int3 start, const int3 end)                         \
+    {                                                                                              \
+        cudaSetDevice(device->id);                                                                 \
+                                                                                                   \
+        const dim3 tpb(32, 1, 4);                                                                  \
+                                                                                                   \
+        const int3 n = end - start;                                                                \
+        const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)),                                    \
+                       (unsigned int)ceil(n.y / AcReal(tpb.y)),                                    \
+                       (unsigned int)ceil(n.z / AcReal(tpb.z)));                                   \
+                                                                                                   \
+        identifier<step_number>                                                                    \
+            <<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba);                   \
+        ERRCHK_CUDA_KERNEL();                                                                      \
+                                                                                                   \
+        return AC_SUCCESS;                                                                         \
+    }
+
+/*
+#define GEN_NODE_FUNC_HOOK(identifier)                                                             \
+    template <int step_number>                                                                     \
+    AcResult acNodeKernel_##identifier(const Node node, const Stream stream, const int3 start,     \
+                                       const int3 end)                                             \
+    {                                                                                              \
+        acNodeSynchronizeStream(node, stream);                                                     \
+                                                                                                   \
+        for (int i = 0; i < node->num_devices; ++i) {                                              \
+                                                                                                   \
+            const int3 d0 = (int3){NGHOST, NGHOST, NGHOST + i * node->subgrid.n.z};                \
+            const int3 d1 = d0 + (int3){node->subgrid.n.x, node->subgrid.n.y, node->subgrid.n.z};  \
+                                                                                                   \
+            const int3 da = max(start, d0);                                                        \
+            const int3 db = min(end, d1);                                                          \
+                                                                                                   \
+            if (db.z >= da.z) {                                                                    \
+                const int3 da_local = da - (int3){0, 0, i * node->subgrid.n.z};                    \
+                const int3 db_local = db - (int3){0, 0, i * node->subgrid.n.z};                    \
+                acDeviceKernel_ #identifier(node->devices[i], stream, isubstep, da_local,          \
+                                            db_local, dt);                                         \
+            }                                                                                      \
+        }                                                                                          \
+        return AC_SUCCESS;                                                                         \
+    }
+    */
+// clang-format on
+
 #include "stencil_process.cuh"

From f77ec836c75554281e2d94ec9dc57cdccc30f67d Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@aalto.fi>
Date: Wed, 28 Aug 2019 21:06:05 +0300
Subject: [PATCH 26/27] Added a WIP API, DSL specification and user manual

---
 .../API_specification_and_user_manual.md      | 66 +++++++++++++++++++
 1 file changed, 66 insertions(+)
 create mode 100644 doc/Astaroth_API_specification_and_user_manual/API_specification_and_user_manual.md

diff --git a/doc/Astaroth_API_specification_and_user_manual/API_specification_and_user_manual.md b/doc/Astaroth_API_specification_and_user_manual/API_specification_and_user_manual.md
new file mode 100644
index 0000000..76701fe
--- /dev/null
+++ b/doc/Astaroth_API_specification_and_user_manual/API_specification_and_user_manual.md
@@ -0,0 +1,66 @@
+# Astaroth specification and user manual
+
+Copyright (C) 2014-2019, Johannes Pekkila, Miikka Vaisala.
+
+	   Astaroth is free software: you can redistribute it and/or modify
+	   it under the terms of the GNU General Public License as published by
+	   the Free Software Foundation, either version 3 of the License, or
+	   (at your option) any later version.
+	   Astaroth is free software: you can redistribute it and/or modify
+	   it under the terms of the GNU General Public License as published by
+	   the Free Software Foundation, either version 3 of the License, or
+	   (at your option) any later version.
+
+	   Astaroth is distributed in the hope that it will be useful,
+	   but WITHOUT ANY WARRANTY; without even the implied warranty of
+	   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+	   GNU General Public License for more details.
+
+	   You should have received a copy of the GNU General Public License
+	   along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+
+
+# Introduction and background
+
+Astaroth is a collection of tools for utilizing multiple graphics processing units (GPUs) efficiently in three-dimensional stencil computations. This document specifies the Astaroth application-programming interface (API) and domain-specific language (DSL).
+
+Astaroth has been designed for the demands in computational sciences, where large stencils are often used to attain sufficient accuracy. The majority of previous work focuses on stencil computations with low-order stencils for which several efficient algorithms have been proposed, whereas work on high-order stencils is more limited. In addition, in computational physics multiple fields interact with each other, such as the velocity and magnetic fields of electrically conducting fluids. Such computations are especially challenging to solve efficiently because of the problem's relatively low operational intensity and the small caches provided by GPUs. Efficient methods for computations with several coupled fields and large stencils have not been addressed sufficiently in prior work.
+
+With Astaroth, we have taken inspiration of image processing and graphics pipelines which rely on holding intermediate data in caches for the duration of computations, and extended the idea to work efficiently also with large three-dimensional stencils and an arbitrary number of coupled fields. As programming GPUs efficiently is relatively verbose and requires deep knowledge of the underlying hardware and execution model, we have created a high-level domain-specific language for expressing a wide range of tasks in computational sciences and provide a source-to-source compiler for translating stencil problems expressed in our language into efficient CUDA kernels.
+
+The kernels generated from the Astaroth DSL are embedded in the Astaroth Core library, which is usable via the Astaroth API. While the Astaroth library is written in C++/CUDA, the API conforms to the C99 standard.
+
+
+# Publications
+
+> J. Pekkilä, M. S. Väisälä, M. Käpylä, P. J. Käpylä, and O. Anjum, “Methods for compressible fluid simulation on GPUs using high-order finite differences, ”Computer Physics Communications, vol. 217, pp. 11–22, Aug. 2017.
+
+> M. S. Väisälä, Magnetic Phenomena of the Interstellar Medium in Theory and Observation. PhD thesis, University of Helsinki, Finland, 2017.
+
+> J. Pekkilä, Astaroth: A Library for Stencil Computations on Graphics Processing Units. Master's thesis, Aalto University School of Science, Espoo, Finland, 2019.
+
+The library, API and DSL described in this document were introduced in (Pekkilä, 2019). Consider referring to it if you use the latest version of Astaroth in your work.
+
+
+# Astaroth API
+
+## Devices and nodes
+
+## Meshes
+
+## Streams and synchronization
+
+## Interface
+
+
+# Astaroth DSL
+
+## Uniforms
+
+## Vertex buffers
+
+### Input and output buffers
+
+## Built-in variables and functions
+
+

From bd549f5d2888659e6f5c3029583f606d473b2b06 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@aalto.fi>
Date: Thu, 29 Aug 2019 15:14:40 +0300
Subject: [PATCH 27/27] Revised the publications section in the API & DSL
 specification

---
 .../API_specification_and_user_manual.md                 | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/doc/Astaroth_API_specification_and_user_manual/API_specification_and_user_manual.md b/doc/Astaroth_API_specification_and_user_manual/API_specification_and_user_manual.md
index 76701fe..b77b00b 100644
--- a/doc/Astaroth_API_specification_and_user_manual/API_specification_and_user_manual.md
+++ b/doc/Astaroth_API_specification_and_user_manual/API_specification_and_user_manual.md
@@ -33,13 +33,14 @@ The kernels generated from the Astaroth DSL are embedded in the Astaroth Core li
 
 # Publications
 
-> J. Pekkilä, M. S. Väisälä, M. Käpylä, P. J. Käpylä, and O. Anjum, “Methods for compressible fluid simulation on GPUs using high-order finite differences, ”Computer Physics Communications, vol. 217, pp. 11–22, Aug. 2017.
-
-> M. S. Väisälä, Magnetic Phenomena of the Interstellar Medium in Theory and Observation. PhD thesis, University of Helsinki, Finland, 2017.
+The foundational work was done in (Väisälä, Pekkilä, 2017) and the library, API and DSL described in this document were introduced in (Pekkilä, 2019). We kindly wish the users of Astaroth to cite to these publications in their work.
 
 > J. Pekkilä, Astaroth: A Library for Stencil Computations on Graphics Processing Units. Master's thesis, Aalto University School of Science, Espoo, Finland, 2019.
 
-The library, API and DSL described in this document were introduced in (Pekkilä, 2019). Consider referring to it if you use the latest version of Astaroth in your work.
+> M. S. Väisälä, Magnetic Phenomena of the Interstellar Medium in Theory and Observation. PhD thesis, University of Helsinki, Finland, 2017.
+
+> J. Pekkilä, M. S. Väisälä, M. Käpylä, P. J. Käpylä, and O. Anjum, “Methods for compressible fluid simulation on GPUs using high-order finite differences, ”Computer Physics Communications, vol. 217, pp. 11–22, Aug. 2017.
+
 
 
 # Astaroth API