From 18df9e5579bce4bcb9222aac379ff92d392e9f45 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@aalto.fi>
Date: Mon, 2 Sep 2019 20:15:27 +0300
Subject: [PATCH 01/12] Added a parameter for passing a custom include dir to
 compile_acc

---
 scripts/compile_acc.sh | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/scripts/compile_acc.sh b/scripts/compile_acc.sh
index 7ebbd4c..a91d244 100755
--- a/scripts/compile_acc.sh
+++ b/scripts/compile_acc.sh
@@ -30,6 +30,12 @@ do
 			echo "compile_acc.sh -a custom_setup/custom_assembly.sas -p custom_setup/custom_process.sps --header custom_setup/custom_header.h"
 			exit 0
 			;;
+		-I|--include)
+			shift
+                        ACC_INCLUDE_DIR=${1}
+			shift
+                        echo "CUSTOM include dir!"
+			;;
 		--header)
 			shift
                         ACC_HEADER=${1}

From 9e57aba9b711bda2c4f4324214c1cdd9043eda97 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@aalto.fi>
Date: Mon, 2 Sep 2019 21:26:57 +0300
Subject: [PATCH 02/12] New feature: ScalarArray. ScalarArrays are read-only 1D
 arrays containing max(mx, max(my, mz)) elements. ScalarArray is a new type of
 uniform and can be used for storing f.ex. forcing profiles. The DSL now also
 supports complex numbers and some basic arithmetic (exp, multiplication)

---
 acc/src/acc.l                    |  2 ++
 acc/src/acc.y                    |  6 +++--
 acc/src/code_generator.c         | 30 +++++++++++++----------
 include/astaroth_defines.h       |  6 +++++
 src/core/astaroth.cu             | 11 +++++----
 src/core/device.cu               | 42 ++++++++++++++++++++++++++++++++
 src/core/kernels/integration.cuh | 10 ++++----
 src/core/math_utils.h            |  5 ++++
 8 files changed, 87 insertions(+), 25 deletions(-)

diff --git a/acc/src/acc.l b/acc/src/acc.l
index 76104d8..c0eaab9 100644
--- a/acc/src/acc.l
+++ b/acc/src/acc.l
@@ -15,8 +15,10 @@ L [a-zA-Z_]
 "void"      { return VOID; } /* Rest of the types inherited from C */
 "int"       { return INT; }
 "int3"      { return INT3; }
+"Complex"   { return COMPLEX; }
 "ScalarField" { return SCALARFIELD; }
 "VectorField" { return VECTOR; }
+"ScalarArray" { return SCALARARRAY; }
 
 "Kernel"       { return KERNEL; } /* Function specifiers */
 "Preprocessed" { return PREPROCESSED; }
diff --git a/acc/src/acc.y b/acc/src/acc.y
index 0bd1d19..103b802 100644
--- a/acc/src/acc.y
+++ b/acc/src/acc.y
@@ -16,8 +16,8 @@ int yyget_lineno();
 %token CONSTANT IN OUT UNIFORM
 %token IDENTIFIER NUMBER
 %token RETURN
-%token SCALAR VECTOR MATRIX SCALARFIELD
-%token VOID INT INT3
+%token SCALAR VECTOR MATRIX SCALARFIELD SCALARARRAY
+%token VOID INT INT3 COMPLEX
 %token IF ELSE FOR WHILE ELIF
 %token LEQU LAND LOR LLEQU
 %token KERNEL PREPROCESSED
@@ -210,6 +210,8 @@ type_specifier: VOID
               | VECTOR                                                                  { $$ = astnode_create(NODE_TYPE_SPECIFIER, NULL, NULL); $$->token = VECTOR;  }
               | MATRIX                                                                  { $$ = astnode_create(NODE_TYPE_SPECIFIER, NULL, NULL); $$->token = MATRIX;  }
               | SCALARFIELD                                                             { $$ = astnode_create(NODE_TYPE_SPECIFIER, NULL, NULL); $$->token = SCALARFIELD;  }
+              | SCALARARRAY                                                             { $$ = astnode_create(NODE_TYPE_SPECIFIER, NULL, NULL); $$->token = SCALARARRAY;  }
+              | COMPLEX                                                                 { $$ = astnode_create(NODE_TYPE_SPECIFIER, NULL, NULL); $$->token = COMPLEX;  }
               ;
 
 identifier: IDENTIFIER                                                                  { $$ = astnode_create(NODE_IDENTIFIER, NULL, NULL); astnode_set_buffer(yytext, $$); }
diff --git a/acc/src/code_generator.c b/acc/src/code_generator.c
index 864d2ec..b961f77 100644
--- a/acc/src/code_generator.c
+++ b/acc/src/code_generator.c
@@ -61,6 +61,8 @@ static const char* translation_table[TRANSLATION_TABLE_SIZE] = {
     [VECTOR]      = "AcReal3",
     [MATRIX]      = "AcMatrix",
     [SCALARFIELD] = "AcReal",
+    [SCALARARRAY] = "const AcReal* __restrict__",
+    [COMPLEX]     = "acComplex",
     // Type qualifiers
     [KERNEL] = "template <int step_number>  static __global__",
     //__launch_bounds__(RK_THREADBLOCK_SIZE,
@@ -380,20 +382,13 @@ traverse(const ASTNode* node)
         if (handle >= 0) { // The variable exists in the symbol table
             const Symbol* symbol = &symbol_table[handle];
 
-            // if (symbol->type_qualifier == OUT) {
-            //    printf("%s%s", inout_name_prefix, symbol->identifier);
-            //}
             if (symbol->type_qualifier == UNIFORM) {
-                printf("DCONST(%s) ", symbol->identifier);
-                /*
-                if (symbol->type_specifier == SCALAR)
-                    printf("DCONST_REAL(AC_%s) ", symbol->identifier);
-                else if (symbol->type_specifier == INT)
-                    printf("DCONST_INT(AC_%s) ", symbol->identifier);
-                else
-                    printf("INVALID UNIFORM type specifier %s with %s\n",
-                           translate(symbol->type_specifier), symbol->identifier);
-                           */
+                if (inside_kernel && symbol->type_specifier == SCALARARRAY) {
+                    printf("buffer.profiles[%s] ", symbol->identifier);
+                }
+                else {
+                    printf("DCONST(%s) ", symbol->identifier);
+                }
             }
             else {
                 // Do a regular translation
@@ -613,6 +608,15 @@ generate_header(void)
     }
     printf("\n\n");
 
+    // Scalar arrays
+    printf("#define AC_FOR_SCALARARRAY_HANDLES(FUNC)");
+    for (int i = 0; i < num_symbols; ++i) {
+        if (symbol_table[i].type_specifier == SCALARARRAY) {
+            printf("\\\nFUNC(%s),", symbol_table[i].identifier);
+        }
+    }
+    printf("\n\n");
+
     /*
     printf("\n");
     printf("typedef struct {\n");
diff --git a/include/astaroth_defines.h b/include/astaroth_defines.h
index 58525ba..1c7ba1e 100644
--- a/include/astaroth_defines.h
+++ b/include/astaroth_defines.h
@@ -156,6 +156,11 @@ typedef enum {
     NUM_REAL3_PARAMS
 } AcReal3Param;
 
+typedef enum {
+    AC_FOR_SCALARARRAY_HANDLES(AC_GEN_ID) //
+    NUM_SCALARARRAY_HANDLES
+} ScalarArrayHandle;
+
 typedef enum {
     AC_FOR_VTXBUF_HANDLES(AC_GEN_ID) //
     NUM_VTXBUF_HANDLES
@@ -166,6 +171,7 @@ extern const char* intparam_names[];
 extern const char* int3param_names[];
 extern const char* realparam_names[];
 extern const char* real3param_names[];
+extern const char* scalararray_names[];
 extern const char* vtxbuf_names[];
 
 typedef struct {
diff --git a/src/core/astaroth.cu b/src/core/astaroth.cu
index 11956f7..4ab34a9 100644
--- a/src/core/astaroth.cu
+++ b/src/core/astaroth.cu
@@ -22,15 +22,16 @@
 #include "math_utils.h" // int3 + int3
 
 #define AC_GEN_STR(X) #X
-const char* intparam_names[]   = {AC_FOR_BUILTIN_INT_PARAM_TYPES(AC_GEN_STR) //
+const char* intparam_names[]    = {AC_FOR_BUILTIN_INT_PARAM_TYPES(AC_GEN_STR) //
                                 AC_FOR_USER_INT_PARAM_TYPES(AC_GEN_STR)};
-const char* int3param_names[]  = {AC_FOR_BUILTIN_INT3_PARAM_TYPES(AC_GEN_STR) //
+const char* int3param_names[]   = {AC_FOR_BUILTIN_INT3_PARAM_TYPES(AC_GEN_STR) //
                                  AC_FOR_USER_INT3_PARAM_TYPES(AC_GEN_STR)};
-const char* realparam_names[]  = {AC_FOR_BUILTIN_REAL_PARAM_TYPES(AC_GEN_STR) //
+const char* realparam_names[]   = {AC_FOR_BUILTIN_REAL_PARAM_TYPES(AC_GEN_STR) //
                                  AC_FOR_USER_REAL_PARAM_TYPES(AC_GEN_STR)};
-const char* real3param_names[] = {AC_FOR_BUILTIN_REAL3_PARAM_TYPES(AC_GEN_STR) //
+const char* real3param_names[]  = {AC_FOR_BUILTIN_REAL3_PARAM_TYPES(AC_GEN_STR) //
                                   AC_FOR_USER_REAL3_PARAM_TYPES(AC_GEN_STR)};
-const char* vtxbuf_names[]     = {AC_FOR_VTXBUF_HANDLES(AC_GEN_STR)};
+const char* scalararray_names[] = {AC_FOR_SCALARARRAY_HANDLES(AC_GEN_STR)};
+const char* vtxbuf_names[]      = {AC_FOR_VTXBUF_HANDLES(AC_GEN_STR)};
 #undef AC_GEN_STR
 
 static const int num_nodes = 1;
diff --git a/src/core/device.cu b/src/core/device.cu
index 78e82d7..1faaa5e 100644
--- a/src/core/device.cu
+++ b/src/core/device.cu
@@ -37,6 +37,8 @@
 typedef struct {
     AcReal* in[NUM_VTXBUF_HANDLES];
     AcReal* out[NUM_VTXBUF_HANDLES];
+
+    AcReal* profiles[NUM_SCALARARRAY_HANDLES];
 } VertexBufferArray;
 
 struct device_s {
@@ -97,6 +99,32 @@ DCONST(const VertexBufferHandle handle)
 //#define globalMeshN_min // Placeholder
 #define d_multigpu_offset (d_mesh_info.int3_params[AC_multigpu_offset])
 //#define d_multinode_offset (d_mesh_info.int3_params[AC_multinode_offset]) // Placeholder
+//#include <thrust/complex.h>
+// using namespace thrust;
+#include <cuComplex.h>
+#if AC_DOUBLE_PRECISION == 1
+typedef cuDoubleComplex acComplex;
+#define acComplex(x, y) make_cuDoubleComplex(x, y)
+#else
+typedef cuFloatComplex acComplex;
+#define acComplex(x, y) make_cuFloatComplex(x, y)
+#endif
+static __device__ inline acComplex
+exp(const acComplex& val)
+{
+    return acComplex(exp(val.x) * cos(val.y), exp(val.x) * sin(val.y));
+}
+static __device__ inline acComplex operator*(const AcReal& a, const acComplex& b)
+{
+    return (acComplex){a * b.x, a * b.y};
+}
+
+static __device__ inline acComplex operator*(const acComplex& a, const acComplex& b)
+{
+    return (acComplex){a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x};
+}
+//#include <complex>
+
 #include "kernels/boundconds.cuh"
 #include "kernels/integration.cuh"
 #include "kernels/reductions.cuh"
@@ -140,11 +168,21 @@ acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device_hand
     }
 
     // Memory
+    // VBA in/out
     const size_t vba_size_bytes = acVertexBufferSizeBytes(device_config);
     for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
         ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.in[i], vba_size_bytes));
         ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.out[i], vba_size_bytes));
     }
+    // VBA Profiles
+    const size_t profile_size_bytes = sizeof(AcReal) * max(device_config.int_params[AC_mx],
+                                                           max(device_config.int_params[AC_my],
+                                                               device_config.int_params[AC_mz]));
+    for (int i = 0; i < NUM_SCALARARRAY_HANDLES; ++i) {
+        ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.profiles[i], profile_size_bytes));
+    }
+
+    // Reductions
     ERRCHK_CUDA_ALWAYS(
         cudaMalloc(&device->reduce_scratchpad, acVertexBufferCompdomainSizeBytes(device_config)));
     ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->reduce_result, sizeof(AcReal)));
@@ -178,6 +216,10 @@ acDeviceDestroy(Device device)
         cudaFree(device->vba.in[i]);
         cudaFree(device->vba.out[i]);
     }
+    for (int i = 0; i < NUM_SCALARARRAY_HANDLES; ++i) {
+        cudaFree(device->vba.profiles[i]);
+    }
+
     cudaFree(device->reduce_scratchpad);
     cudaFree(device->reduce_result);
 
diff --git a/src/core/kernels/integration.cuh b/src/core/kernels/integration.cuh
index 0a06c2f..e1c723a 100644
--- a/src/core/kernels/integration.cuh
+++ b/src/core/kernels/integration.cuh
@@ -70,11 +70,11 @@ create_rotz(const AcReal radians)
 #define cos __cosf
 #define exp __expf
 */
-#define sin sinf
-#define cos cosf
-#define exp expf
-#define rsqrt rsqrtf // hardware reciprocal sqrt
-#endif               // AC_DOUBLE_PRECISION == 0
+//#define sin sinf
+//#define cos cosf
+//#define exp expf
+//#define rsqrt rsqrtf // hardware reciprocal sqrt
+#endif // AC_DOUBLE_PRECISION == 0
 
 /*
  * =============================================================================
diff --git a/src/core/math_utils.h b/src/core/math_utils.h
index 4d41e4e..a7ea2e2 100644
--- a/src/core/math_utils.h
+++ b/src/core/math_utils.h
@@ -124,6 +124,11 @@ static HOST_DEVICE_INLINE AcReal3 operator*(const AcReal& a, const AcReal3& b)
     return (AcReal3){a * b.x, a * b.y, a * b.z};
 }
 
+static HOST_DEVICE_INLINE AcReal3 operator*(const AcReal3& b, const AcReal& a)
+{
+    return (AcReal3){a * b.x, a * b.y, a * b.z};
+}
+
 static HOST_DEVICE_INLINE AcReal
 dot(const AcReal3& a, const AcReal3& b)
 {

From cdb504e77231b177cd75fa1dc545f74e59c62a1b Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@aalto.fi>
Date: Mon, 2 Sep 2019 21:29:07 +0300
Subject: [PATCH 03/12] Added a proof-of-concept helical forcing which uses the
 newly introduced ScalarArrays for reading profiles. Not extensively tested.

---
 acc/pc_mhd_solver/.gitignore             |   5 +
 acc/pc_mhd_solver/stencil_assembly.sas   |  75 +++++
 acc/pc_mhd_solver/stencil_definition.sdh | 159 ++++++++++
 acc/pc_mhd_solver/stencil_process.sps    | 358 +++++++++++++++++++++++
 4 files changed, 597 insertions(+)
 create mode 100644 acc/pc_mhd_solver/.gitignore
 create mode 100644 acc/pc_mhd_solver/stencil_assembly.sas
 create mode 100644 acc/pc_mhd_solver/stencil_definition.sdh
 create mode 100644 acc/pc_mhd_solver/stencil_process.sps

diff --git a/acc/pc_mhd_solver/.gitignore b/acc/pc_mhd_solver/.gitignore
new file mode 100644
index 0000000..bc4b7d8
--- /dev/null
+++ b/acc/pc_mhd_solver/.gitignore
@@ -0,0 +1,5 @@
+build
+testbin
+
+# Except this file
+!.gitignore
diff --git a/acc/pc_mhd_solver/stencil_assembly.sas b/acc/pc_mhd_solver/stencil_assembly.sas
new file mode 100644
index 0000000..85886e6
--- /dev/null
+++ b/acc/pc_mhd_solver/stencil_assembly.sas
@@ -0,0 +1,75 @@
+#include "stencil_definition.sdh"
+
+Preprocessed Scalar
+value(in ScalarField vertex)
+{
+    return vertex[vertexIdx];
+}
+
+Preprocessed Vector
+gradient(in ScalarField vertex)
+{
+    return (Vector){derx(vertexIdx, vertex), dery(vertexIdx, vertex), derz(vertexIdx, vertex)};
+}
+
+#if LUPWD
+
+Preprocessed Scalar
+der6x_upwd(in ScalarField vertex)
+{
+    Scalar inv_ds = AC_inv_dsx;
+
+    return (Scalar){Scalar(1.0 / 60.0) * inv_ds *
+                    (-Scalar(20.0) * vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z] +
+                     Scalar(15.0) * (vertex[vertexIdx.x + 1, vertexIdx.y, vertexIdx.z] +
+                                     vertex[vertexIdx.x - 1, vertexIdx.y, vertexIdx.z]) -
+                     Scalar(6.0) * (vertex[vertexIdx.x + 2, vertexIdx.y, vertexIdx.z] +
+                                    vertex[vertexIdx.x - 2, vertexIdx.y, vertexIdx.z]) +
+                     vertex[vertexIdx.x + 3, vertexIdx.y, vertexIdx.z] +
+                     vertex[vertexIdx.x - 3, vertexIdx.y, vertexIdx.z])};
+}
+
+Preprocessed Scalar
+der6y_upwd(in ScalarField vertex)
+{
+    Scalar inv_ds = AC_inv_dsy;
+
+    return (Scalar){Scalar(1.0 / 60.0) * inv_ds *
+                    (-Scalar(20.0) * vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z] +
+                     Scalar(15.0) * (vertex[vertexIdx.x, vertexIdx.y + 1, vertexIdx.z] +
+                                     vertex[vertexIdx.x, vertexIdx.y - 1, vertexIdx.z]) -
+                     Scalar(6.0) * (vertex[vertexIdx.x, vertexIdx.y + 2, vertexIdx.z] +
+                                    vertex[vertexIdx.x, vertexIdx.y - 2, vertexIdx.z]) +
+                     vertex[vertexIdx.x, vertexIdx.y + 3, vertexIdx.z] +
+                     vertex[vertexIdx.x, vertexIdx.y - 3, vertexIdx.z])};
+}
+
+Preprocessed Scalar
+der6z_upwd(in ScalarField vertex)
+{
+    Scalar inv_ds = AC_inv_dsz;
+
+    return (Scalar){Scalar(1.0 / 60.0) * inv_ds *
+                    (-Scalar(20.0) * vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z] +
+                     Scalar(15.0) * (vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z + 1] +
+                                     vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z - 1]) -
+                     Scalar(6.0) * (vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z + 2] +
+                                    vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z - 2]) +
+                     vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z + 3] +
+                     vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z - 3])};
+}
+
+#endif
+
+Preprocessed Matrix
+hessian(in ScalarField vertex)
+{
+    Matrix hessian;
+
+    hessian.row[0] = (Vector){derxx(vertexIdx, vertex), derxy(vertexIdx, vertex),
+                              derxz(vertexIdx, vertex)};
+    hessian.row[1] = (Vector){hessian.row[0].y, deryy(vertexIdx, vertex), deryz(vertexIdx, vertex)};
+    hessian.row[2] = (Vector){hessian.row[0].z, hessian.row[1].z, derzz(vertexIdx, vertex)};
+
+    return hessian;
+}
diff --git a/acc/pc_mhd_solver/stencil_definition.sdh b/acc/pc_mhd_solver/stencil_definition.sdh
new file mode 100644
index 0000000..f29b220
--- /dev/null
+++ b/acc/pc_mhd_solver/stencil_definition.sdh
@@ -0,0 +1,159 @@
+#define LDENSITY (1)
+#define LHYDRO (1)
+#define LMAGNETIC (1)
+#define LENTROPY (1)
+#define LTEMPERATURE (0)
+#define LFORCING (1)
+#define LUPWD (1)
+
+#define AC_THERMAL_CONDUCTIVITY (AcReal(0.001)) // TODO: make an actual config parameter
+
+// Int params
+uniform int AC_max_steps;
+uniform int AC_save_steps;
+uniform int AC_bin_steps;
+uniform int AC_bc_type;
+
+// Real params
+uniform Scalar AC_dt;
+// Spacing
+uniform Scalar AC_dsx;
+uniform Scalar AC_dsy;
+uniform Scalar AC_dsz;
+uniform Scalar AC_dsmin;
+// physical grid
+uniform Scalar AC_xlen;
+uniform Scalar AC_ylen;
+uniform Scalar AC_zlen;
+uniform Scalar AC_xorig;
+uniform Scalar AC_yorig;
+uniform Scalar AC_zorig;
+// Physical units
+uniform Scalar AC_unit_density;
+uniform Scalar AC_unit_velocity;
+uniform Scalar AC_unit_length;
+// properties of gravitating star
+uniform Scalar AC_star_pos_x;
+uniform Scalar AC_star_pos_y;
+uniform Scalar AC_star_pos_z;
+uniform Scalar AC_M_star;
+//  Run params
+uniform Scalar AC_cdt;
+uniform Scalar AC_cdtv;
+uniform Scalar AC_cdts;
+uniform Scalar AC_nu_visc;
+uniform Scalar AC_cs_sound;
+uniform Scalar AC_eta;
+uniform Scalar AC_mu0;
+uniform Scalar AC_cp_sound;
+uniform Scalar AC_gamma;
+uniform Scalar AC_cv_sound;
+uniform Scalar AC_lnT0;
+uniform Scalar AC_lnrho0;
+uniform Scalar AC_zeta;
+uniform Scalar AC_trans;
+//  Other
+uniform Scalar AC_bin_save_t;
+//  Initial condition params
+uniform Scalar AC_ampl_lnrho;
+uniform Scalar AC_ampl_uu;
+uniform Scalar AC_angl_uu;
+uniform Scalar AC_lnrho_edge;
+uniform Scalar AC_lnrho_out;
+//  Forcing parameters. User configured.
+uniform Scalar AC_forcing_magnitude;
+uniform Scalar AC_relhel;
+uniform Scalar AC_kmin;
+uniform Scalar AC_kmax;
+//  Forcing parameters. Set by the generator.
+uniform Scalar AC_forcing_phase;
+uniform Scalar AC_k_forcex;
+uniform Scalar AC_k_forcey;
+uniform Scalar AC_k_forcez;
+uniform Scalar AC_kaver;
+uniform Scalar AC_ff_hel_rex;
+uniform Scalar AC_ff_hel_rey;
+uniform Scalar AC_ff_hel_rez;
+uniform Scalar AC_ff_hel_imx;
+uniform Scalar AC_ff_hel_imy;
+uniform Scalar AC_ff_hel_imz;
+//  Additional helper params  //  (deduced from other params do not set these directly!)
+uniform Scalar AC_G_CONST;
+uniform Scalar AC_GM_star;
+uniform Scalar AC_sq2GM_star;
+uniform Scalar AC_cs2_sound;
+uniform Scalar AC_inv_dsx;
+uniform Scalar AC_inv_dsy;
+uniform Scalar AC_inv_dsz;
+
+// PC-style helical forcing with profiles
+uniform Vector AC_kk;
+uniform Vector AC_coef1;
+uniform Vector AC_coef2;
+uniform Vector AC_coef3;
+uniform Vector AC_fda;
+
+uniform Scalar AC_phase;
+uniform Scalar AC_fact;
+uniform Scalar AC_k1_ff;
+
+uniform ScalarArray AC_profx;
+uniform ScalarArray AC_profy;
+uniform ScalarArray AC_profz;
+uniform ScalarArray AC_profx_hel;
+uniform ScalarArray AC_profy_hel;
+uniform ScalarArray AC_profz_hel;
+
+uniform int AC_iforcing_zsym;
+
+uniform ScalarArray AC_0;
+uniform ScalarArray AC_1;
+uniform ScalarArray AC_2;
+uniform ScalarArray AC_3;
+uniform ScalarArray AC_4;
+uniform ScalarArray AC_5;
+uniform ScalarArray AC_6;
+uniform ScalarArray AC_7;
+uniform ScalarArray AC_8;
+uniform ScalarArray AC_9;
+uniform ScalarArray AC_10;
+uniform ScalarArray AC_11;
+uniform ScalarArray AC_12;
+uniform ScalarArray AC_13;
+uniform ScalarArray AC_14;
+uniform ScalarArray AC_15;
+uniform ScalarArray AC_16;
+uniform ScalarArray AC_17;
+uniform ScalarArray AC_18;
+uniform ScalarArray AC_19;
+
+/*
+ * =============================================================================
+ * User-defined vertex buffers
+ * =============================================================================
+ */
+#if LENTROPY
+uniform ScalarField VTXBUF_LNRHO;
+uniform ScalarField VTXBUF_UUX;
+uniform ScalarField VTXBUF_UUY;
+uniform ScalarField VTXBUF_UUZ;
+uniform ScalarField VTXBUF_AX;
+uniform ScalarField VTXBUF_AY;
+uniform ScalarField VTXBUF_AZ;
+uniform ScalarField VTXBUF_ENTROPY;
+#elif LMAGNETIC
+uniform ScalarField VTXBUF_LNRHO;
+uniform ScalarField VTXBUF_UUX;
+uniform ScalarField VTXBUF_UUY;
+uniform ScalarField VTXBUF_UUZ;
+uniform ScalarField VTXBUF_AX;
+uniform ScalarField VTXBUF_AY;
+uniform ScalarField VTXBUF_AZ;
+#elif LHYDRO
+uniform ScalarField VTXBUF_LNRHO;
+uniform ScalarField VTXBUF_UUX;
+uniform ScalarField VTXBUF_UUY;
+uniform ScalarField VTXBUF_UUZ;
+#else
+uniform ScalarField VTXBUF_LNRHO;
+#endif
diff --git a/acc/pc_mhd_solver/stencil_process.sps b/acc/pc_mhd_solver/stencil_process.sps
new file mode 100644
index 0000000..db1c41e
--- /dev/null
+++ b/acc/pc_mhd_solver/stencil_process.sps
@@ -0,0 +1,358 @@
+#include "stencil_definition.sdh"
+
+Vector
+value(in VectorField uu)
+{
+    return (Vector){value(uu.x), value(uu.y), value(uu.z)};
+}
+
+#if LUPWD
+Scalar
+upwd_der6(in VectorField uu, in ScalarField lnrho)
+{
+    Scalar uux = fabs(value(uu).x);
+    Scalar uuy = fabs(value(uu).y);
+    Scalar uuz = fabs(value(uu).z);
+    return (Scalar){uux * der6x_upwd(lnrho) + uuy * der6y_upwd(lnrho) + uuz * der6z_upwd(lnrho)};
+}
+#endif
+
+Matrix
+gradients(in VectorField uu)
+{
+    return (Matrix){gradient(uu.x), gradient(uu.y), gradient(uu.z)};
+}
+
+Scalar
+continuity(in VectorField uu, in ScalarField lnrho)
+{
+    return -dot(value(uu), gradient(lnrho))
+#if LUPWD
+           // This is a corrective hyperdiffusion term for upwinding.
+           + upwd_der6(uu, lnrho)
+#endif
+           - divergence(uu);
+}
+
+#if LENTROPY
+Vector
+momentum(in VectorField uu, in ScalarField lnrho, in ScalarField ss, in VectorField aa)
+{
+    const Matrix S   = stress_tensor(uu);
+    const Scalar cs2 = AC_cs2_sound * exp(AC_gamma * value(ss) / AC_cp_sound +
+                                          (AC_gamma - 1) * (value(lnrho) - AC_lnrho0));
+    const Vector j   = (Scalar(1.) / AC_mu0) *
+                     (gradient_of_divergence(aa) - laplace_vec(aa)); // Current density
+    const Vector B = curl(aa);
+    // TODO: DOES INTHERMAL VERSTION INCLUDE THE MAGNETIC FIELD?
+    const Scalar inv_rho = Scalar(1.) / exp(value(lnrho));
+
+    // Regex replace CPU constants with get\(AC_([a-zA-Z_0-9]*)\)
+    // \1
+    const Vector mom = -mul(gradients(uu), value(uu)) -
+                       cs2 * ((Scalar(1.) / AC_cp_sound) * gradient(ss) + gradient(lnrho)) +
+                       inv_rho * cross(j, B) +
+                       AC_nu_visc *
+                           (laplace_vec(uu) + Scalar(1. / 3.) * gradient_of_divergence(uu) +
+                            Scalar(2.) * mul(S, gradient(lnrho))) +
+                       AC_zeta * gradient_of_divergence(uu);
+    return mom;
+}
+#elif LTEMPERATURE
+Vector
+momentum(in VectorField uu, in ScalarField lnrho, in ScalarField tt)
+{
+    Vector mom;
+
+    const Matrix S = stress_tensor(uu);
+
+    const Vector pressure_term = (AC_cp_sound - AC_cv_sound) *
+                                 (gradient(tt) + value(tt) * gradient(lnrho));
+
+    mom = -mul(gradients(uu), value(uu)) - pressure_term +
+          AC_nu_visc * (laplace_vec(uu) + Scalar(1. / 3.) * gradient_of_divergence(uu) +
+                        Scalar(2.) * mul(S, gradient(lnrho))) +
+          AC_zeta * gradient_of_divergence(uu);
+
+#if LGRAVITY
+    mom = mom - (Vector){0, 0, -10.0};
+#endif
+
+    return mom;
+}
+#else
+Vector
+momentum(in VectorField uu, in ScalarField lnrho)
+{
+    Vector mom;
+
+    const Matrix S = stress_tensor(uu);
+
+    // Isothermal: we have constant speed of sound
+
+    mom = -mul(gradients(uu), value(uu)) - AC_cs2_sound * gradient(lnrho) +
+          AC_nu_visc * (laplace_vec(uu) + Scalar(1. / 3.) * gradient_of_divergence(uu) +
+                        Scalar(2.) * mul(S, gradient(lnrho))) +
+          AC_zeta * gradient_of_divergence(uu);
+
+#if LGRAVITY
+    mom = mom - (Vector){0, 0, -10.0};
+#endif
+
+    return mom;
+}
+#endif
+
+Vector
+induction(in VectorField uu, in VectorField aa)
+{
+    // Note: We do (-nabla^2 A + nabla(nabla dot A)) instead of (nabla x (nabla
+    // x A)) in order to avoid taking the first derivative twice (did the math,
+    // yes this actually works. See pg.28 in arXiv:astro-ph/0109497)
+    // u cross B - AC_eta * AC_mu0 * (AC_mu0^-1 * [- laplace A + grad div A ])
+    const Vector B        = curl(aa);
+    const Vector grad_div = gradient_of_divergence(aa);
+    const Vector lap      = laplace_vec(aa);
+
+    // Note, AC_mu0 is cancelled out
+    const Vector ind = cross(value(uu), B) - AC_eta * (grad_div - lap);
+
+    return ind;
+}
+
+#if LENTROPY
+Scalar
+lnT(in ScalarField ss, in ScalarField lnrho)
+{
+    const Scalar lnT = AC_lnT0 + AC_gamma * value(ss) / AC_cp_sound +
+                       (AC_gamma - Scalar(1.)) * (value(lnrho) - AC_lnrho0);
+    return lnT;
+}
+
+// Nabla dot (K nabla T) / (rho T)
+Scalar
+heat_conduction(in ScalarField ss, in ScalarField lnrho)
+{
+    const Scalar inv_AC_cp_sound = AcReal(1.) / AC_cp_sound;
+
+    const Vector grad_ln_chi = -gradient(lnrho);
+
+    const Scalar first_term = AC_gamma * inv_AC_cp_sound * laplace(ss) +
+                              (AC_gamma - AcReal(1.)) * laplace(lnrho);
+    const Vector second_term = AC_gamma * inv_AC_cp_sound * gradient(ss) +
+                               (AC_gamma - AcReal(1.)) * gradient(lnrho);
+    const Vector third_term = AC_gamma * (inv_AC_cp_sound * gradient(ss) + gradient(lnrho)) +
+                              grad_ln_chi;
+
+    const Scalar chi = AC_THERMAL_CONDUCTIVITY / (exp(value(lnrho)) * AC_cp_sound);
+    return AC_cp_sound * chi * (first_term + dot(second_term, third_term));
+}
+
+Scalar
+heating(const int i, const int j, const int k)
+{
+    return 1;
+}
+
+Scalar
+entropy(in ScalarField ss, in VectorField uu, in ScalarField lnrho, in VectorField aa)
+{
+    const Matrix S      = stress_tensor(uu);
+    const Scalar inv_pT = Scalar(1.) / (exp(value(lnrho)) * exp(lnT(ss, lnrho)));
+    const Vector j      = (Scalar(1.) / AC_mu0) *
+                     (gradient_of_divergence(aa) - laplace_vec(aa)); // Current density
+    const Scalar RHS = H_CONST - C_CONST + AC_eta * (AC_mu0)*dot(j, j) +
+                       Scalar(2.) * exp(value(lnrho)) * AC_nu_visc * contract(S) +
+                       AC_zeta * exp(value(lnrho)) * divergence(uu) * divergence(uu);
+
+    return -dot(value(uu), gradient(ss)) + inv_pT * RHS + heat_conduction(ss, lnrho);
+}
+#endif
+
+#if LTEMPERATURE
+Scalar
+heat_transfer(in VectorField uu, in ScalarField lnrho, in ScalarField tt)
+{
+    const Matrix S                  = stress_tensor(uu);
+    const Scalar heat_diffusivity_k = 0.0008; // 8e-4;
+    return -dot(value(uu), gradient(tt)) + heat_diffusivity_k * laplace(tt) +
+           heat_diffusivity_k * dot(gradient(lnrho), gradient(tt)) +
+           AC_nu_visc * contract(S) * (Scalar(1.) / AC_cv_sound) -
+           (AC_gamma - 1) * value(tt) * divergence(uu);
+}
+#endif
+
+#if LFORCING
+Vector
+simple_vortex_forcing(Vector a, Vector b, Scalar magnitude)
+{
+    return magnitude * cross(normalized(b - a), (Vector){0, 0, 1}); // Vortex
+}
+
+Vector
+simple_outward_flow_forcing(Vector a, Vector b, Scalar magnitude)
+{
+    return magnitude * (1 / length(b - a)) * normalized(b - a); // Outward flow
+}
+
+// The Pencil Code forcing_hel_noshear(), manual Eq. 222, inspired forcing function with adjustable
+// helicity
+Vector
+helical_forcing(Scalar magnitude, Vector k_force, Vector xx, Vector ff_re, Vector ff_im, Scalar phi)
+{
+    // JP: This looks wrong:
+    //    1) Should it be AC_dsx * AC_nx instead of AC_dsx * AC_ny?
+    //    2) Should you also use globalGrid.n instead of the local n?
+    //    MV: You are rigth. Made a quickfix. I did not see the error  because multigpu is split
+    //        in z direction not y direction.
+    //    3) Also final point: can we do this with vectors/quaternions instead?
+    //       Tringonometric functions are much more expensive and inaccurate/
+    //    MV: Good idea. No an immediate priority.
+    // Fun related article:
+    // https://randomascii.wordpress.com/2014/10/09/intel-underestimates-error-bounds-by-1-3-quintillion/
+    xx.x = xx.x * (2.0 * M_PI / (AC_dsx * globalGridN.x));
+    xx.y = xx.y * (2.0 * M_PI / (AC_dsy * globalGridN.y));
+    xx.z = xx.z * (2.0 * M_PI / (AC_dsz * globalGridN.z));
+
+    Scalar cos_phi     = cos(phi);
+    Scalar sin_phi     = sin(phi);
+    Scalar cos_k_dot_x = cos(dot(k_force, xx));
+    Scalar sin_k_dot_x = sin(dot(k_force, xx));
+    // Phase affect only the x-component
+    // Scalar real_comp       = cos_k_dot_x;
+    // Scalar imag_comp       = sin_k_dot_x;
+    Scalar real_comp_phase = cos_k_dot_x * cos_phi - sin_k_dot_x * sin_phi;
+    Scalar imag_comp_phase = cos_k_dot_x * sin_phi + sin_k_dot_x * cos_phi;
+
+    Vector force = (Vector){ff_re.x * real_comp_phase - ff_im.x * imag_comp_phase,
+                            ff_re.y * real_comp_phase - ff_im.y * imag_comp_phase,
+                            ff_re.z * real_comp_phase - ff_im.z * imag_comp_phase};
+
+    return force;
+}
+
+Vector
+forcing_OLD(int3 globalVertexIdx, Scalar dt)
+{
+    Vector a         = Scalar(.5) * (Vector){globalGridN.x * AC_dsx, globalGridN.y * AC_dsy,
+                                     globalGridN.z * AC_dsz}; // source (origin)
+    Vector xx        = (Vector){(globalVertexIdx.x - DCONST(AC_nx_min)) * AC_dsx,
+                         (globalVertexIdx.y - DCONST(AC_ny_min)) * AC_dsy,
+                         (globalVertexIdx.z - DCONST(AC_nz_min)) * AC_dsz}; // sink (current index)
+    const Scalar cs2 = AC_cs2_sound;
+    const Scalar cs  = sqrt(cs2);
+
+    // Placeholders until determined properly
+    Scalar magnitude = AC_forcing_magnitude;
+    Scalar phase     = AC_forcing_phase;
+    Vector k_force   = (Vector){AC_k_forcex, AC_k_forcey, AC_k_forcez};
+    Vector ff_re     = (Vector){AC_ff_hel_rex, AC_ff_hel_rey, AC_ff_hel_rez};
+    Vector ff_im     = (Vector){AC_ff_hel_imx, AC_ff_hel_imy, AC_ff_hel_imz};
+
+    // Determine that forcing funtion type at this point.
+    // Vector force = simple_vortex_forcing(a, xx, magnitude);
+    // Vector force = simple_outward_flow_forcing(a, xx, magnitude);
+    Vector force = helical_forcing(magnitude, k_force, xx, ff_re, ff_im, phase);
+
+    // Scaling N = magnitude*cs*sqrt(k*cs/dt)  * dt
+    const Scalar NN = cs * sqrt(AC_kaver * cs);
+    // MV: Like in the Pencil Code. I don't understandf the logic here.
+    force.x = sqrt(dt) * NN * force.x;
+    force.y = sqrt(dt) * NN * force.y;
+    force.z = sqrt(dt) * NN * force.z;
+
+    if (is_valid(force)) {
+        return force;
+    }
+    else {
+        return (Vector){0, 0, 0};
+    }
+}
+
+// PC-style helical forcing with support for profiles
+Vector
+forcing(int3 vertexIdx, int3 globalVertexIdx, Scalar dt, ScalarArray profx, ScalarArray profy,
+        ScalarArray profz, ScalarArray profx_hel, ScalarArray profy_hel, ScalarArray profz_hel)
+{
+    Vector pos = (Vector){(globalVertexIdx.x - DCONST(AC_nx_min)) * AC_dsx,
+                          (globalVertexIdx.y - DCONST(AC_ny_min)) * AC_dsy,
+                          (globalVertexIdx.z - DCONST(AC_nz_min)) * AC_dsz};
+
+    Complex fx = AC_fact * exp(Complex(0, AC_kk.x * AC_k1_ff * pos.z + AC_phase));
+    Complex fy = exp(Complex(0, AC_kk.y * AC_k1_ff * pos.y));
+
+    Complex fz;
+    if (AC_iforcing_zsym == 0) {
+        fz = exp(Complex(0., AC_kk.z * AC_k1_ff * pos.z));
+    }
+    else if (AC_iforcing_zsym == 1) {
+        fz = Complex(cos(AC_kk.z * AC_k1_ff * pos.z), 0);
+    }
+    else if (AC_iforcing_zsym == -1) {
+        fz = Complex(sin(AC_kk.z * AC_k1_ff * pos.z), 0);
+    }
+    else {
+        // Failure
+    }
+
+    Complex fxyz = fx * fy * fz;
+
+    // TODO recheck indices
+    Scalar force_ampl    = profx[vertexIdx.x - NGHOST] * profy[vertexIdx.y] * profz[vertexIdx.z];
+    Scalar prof_hel_ampl = profx_hel[vertexIdx.x - NGHOST] * profy_hel[vertexIdx.y] *
+                           profz_hel[vertexIdx.z];
+
+    return force_ampl * AC_fda * (Complex(AC_coef1.x, prof_hel_ampl * AC_coef2.x) * fxyz).x;
+}
+#endif // LFORCING
+
+// Declare input and output arrays using locations specified in the
+// array enum in astaroth.h
+in ScalarField lnrho(VTXBUF_LNRHO);
+out ScalarField out_lnrho(VTXBUF_LNRHO);
+
+in VectorField uu(VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
+out VectorField out_uu(VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
+
+#if LMAGNETIC
+in VectorField aa(VTXBUF_AX, VTXBUF_AY, VTXBUF_AZ);
+out VectorField out_aa(VTXBUF_AX, VTXBUF_AY, VTXBUF_AZ);
+#endif
+
+#if LENTROPY
+in ScalarField ss(VTXBUF_ENTROPY);
+out ScalarField out_ss(VTXBUF_ENTROPY);
+#endif
+
+#if LTEMPERATURE
+in ScalarField tt(VTXBUF_TEMPERATURE);
+out ScalarField out_tt(VTXBUF_TEMPERATURE);
+#endif
+
+Kernel void
+solve()
+{
+    Scalar dt = AC_dt;
+    out_lnrho = rk3(out_lnrho, lnrho, continuity(uu, lnrho), dt);
+
+#if LMAGNETIC
+    out_aa = rk3(out_aa, aa, induction(uu, aa), dt);
+#endif
+
+#if LENTROPY
+    out_uu = rk3(out_uu, uu, momentum(uu, lnrho, ss, aa), dt);
+    out_ss = rk3(out_ss, ss, entropy(ss, uu, lnrho, aa), dt);
+#elif LTEMPERATURE
+    out_uu = rk3(out_uu, uu, momentum(uu, lnrho, tt), dt);
+    out_tt = rk3(out_tt, tt, heat_transfer(uu, lnrho, tt), dt);
+#else
+    out_uu = rk3(out_uu, uu, momentum(uu, lnrho), dt);
+#endif
+
+#if LFORCING
+    if (step_number == 2) {
+        out_uu = out_uu + forcing(vertexIdx, globalVertexIdx, dt, AC_profx, AC_profy, AC_profz,
+                                  AC_profx_hel, AC_profy_hel, AC_profz_hel);
+    }
+#endif
+}

From 263a1d23a304c1753515df1d0b0e8c3ee17d3926 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@aalto.fi>
Date: Thu, 5 Sep 2019 16:35:08 +0300
Subject: [PATCH 04/12] Added a function for loading ScalarArrays to the GPU

---
 include/astaroth_device.h |  5 +++++
 src/core/device.cu        | 10 ++++++++++
 2 files changed, 15 insertions(+)

diff --git a/include/astaroth_device.h b/include/astaroth_device.h
index 1d7926f..7a800c2 100644
--- a/include/astaroth_device.h
+++ b/include/astaroth_device.h
@@ -61,6 +61,11 @@ AcResult acDeviceLoadIntConstant(const Device device, const Stream stream, const
 AcResult acDeviceLoadInt3Constant(const Device device, const Stream stream, const AcInt3Param param,
                                   const int3 value);
 
+/** */
+AcResult acDeviceLoadScalarArray(const Device device, const Stream stream,
+                                 const ScalarArrayHandle handle, const AcReal* data,
+                                 const size_t num);
+
 /** */
 AcResult acDeviceLoadMeshInfo(const Device device, const Stream stream,
                               const AcMeshInfo device_config);
diff --git a/src/core/device.cu b/src/core/device.cu
index 1faaa5e..33229b8 100644
--- a/src/core/device.cu
+++ b/src/core/device.cu
@@ -447,6 +447,16 @@ acDeviceLoadInt3Constant(const Device device, const Stream stream, const AcInt3P
     return AC_SUCCESS;
 }
 
+AcResult
+acDeviceLoadScalarArray(const Device device, const Stream stream, const ScalarArrayHandle handle,
+                        const AcReal* data, const size_t num)
+{
+    cudaSetDevice(device->id);
+    ERRCHK_CUDA(cudaMemcpyAsync(device->vba.profiles[handle], data, sizeof(data[0]) * num,
+                                cudaMemcpyHostToDevice, device->streams[stream]));
+    return AC_SUCCESS;
+}
+
 AcResult
 acDeviceLoadMeshInfo(const Device device, const Stream stream, const AcMeshInfo device_config)
 {

From 71695b7c12ebbdffb9275a9c9870909e8155c95a Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@aalto.fi>
Date: Thu, 5 Sep 2019 19:55:16 +0300
Subject: [PATCH 05/12] Added WIP stuff to the API specification

---
 .../API_specification_and_user_manual.md      | 304 +++++++++++++++++-
 1 file changed, 302 insertions(+), 2 deletions(-)

diff --git a/doc/Astaroth_API_specification_and_user_manual/API_specification_and_user_manual.md b/doc/Astaroth_API_specification_and_user_manual/API_specification_and_user_manual.md
index b77b00b..b0e364e 100644
--- a/doc/Astaroth_API_specification_and_user_manual/API_specification_and_user_manual.md
+++ b/doc/Astaroth_API_specification_and_user_manual/API_specification_and_user_manual.md
@@ -45,23 +45,323 @@ The foundational work was done in (Väisälä, Pekkilä, 2017) and the library,
 
 # Astaroth API
 
-## Devices and nodes
+The Astroth application-programming interface (API) provides the means for controlling execution of user-defined and built-in functions on multiple graphics processing units. Functions in the API are prefixed with lower case ```ac```, while structures and data types are prefixed with capitalized ```Ac```. Compile-time constants, such as definitions and enumerations, have the prefix ```AC_```. All of the API functions return an AcResult value indicating either success or failure. The return codes are
+```C
+typedef enum { 
+    AC_SUCCESS = 0, 
+    AC_FAILURE = 1
+} AcResult;
+```
+
+The API is divided into layers which differ in the level of control provided over the execution. There are two primary layers:
+
+* Device layer
+    > Functions start with acDevice*.
+    > Provides control over a single GPU.
+    > All functions are asynchronous.
+
+* Node layer
+    > Functions start with acNode*.
+    > Provides control over multiple devices in a single node.
+    > All functions are asynchronous and executed concurrently on all devices in the node.
+    > Subsequent functions called in the same stream (see Section #Streams and synchronization) are guaranteed to be synchronous.
+
+Finally, a third layer is provided for convenience and backwards compatibility. 
+
+* Astaroth layer (deprecated)
+    > Functions start with ac* without Node or Device, f.ex. acInit().
+    > Provided for backwards compatibility.
+    > Essentially a wrapper for the Node layer. 
+    > All functions are guaranteed to be synchronous.
+
+There are also several helper functions defined in `include/astaroth_defines.h`, which can be used for, say, determining the size or performing index calculations within the simulation domain.
+
+
+## List of Astaroth API functions
+
+Here's a non-exhaustive list of astaroth API functions. For more info and an up-to-date list, see the corresponding header files in `include/astaroth_defines.h`, `include/astaroth.h`, `include/astaroth_node.h`, `include/astaroth_device.h`.
+
+### Initialization, quitting and helper functions
+
+Device layer.
+```C
+AcResult acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device);
+AcResult acDeviceDestroy(Device device);
+AcResult acDevicePrintInfo(const Device device);
+AcResult acDeviceAutoOptimize(const Device device);
+```
+
+Node layer.
+```C
+AcResult acNodeCreate(const int id, const AcMeshInfo node_config, Node* node);
+AcResult acNodeDestroy(Node node);
+AcResult acNodePrintInfo(const Node node);
+AcResult acNodeQueryDeviceConfiguration(const Node node, DeviceConfiguration* config);
+AcResult acNodeAutoOptimize(const Node node);
+```
+
+General helper functions.
+```C
+size_t acVertexBufferSize(const AcMeshInfo info);
+size_t acVertexBufferSizeBytes(const AcMeshInfo info);
+size_t acVertexBufferCompdomainSize(const AcMeshInfo info);
+size_t acVertexBufferCompdomainSizeBytes(const AcMeshInfo info);
+size_t acVertexBufferIdx(const int i, const int j, const int k, const AcMeshInfo info);
+```
+
+### Loading and storing
+
+Loading meshes and vertex buffers to device memory.
+```C
+AcResult acDeviceLoadMesh(const Device device, const Stream stream, const AcMesh host_mesh);
+AcResult acDeviceLoadMeshWithOffset(const Device device, const Stream stream,
+                                    const AcMesh host_mesh, const int3 src, const int3 dst,
+                                    const int num_vertices);
+AcResult acDeviceLoadVertexBuffer(const Device device, const Stream stream, const AcMesh host_mesh,
+                                  const VertexBufferHandle vtxbuf_handle);
+AcResult acDeviceLoadVertexBufferWithOffset(const Device device, const Stream stream,
+                                            const AcMesh host_mesh,
+                                            const VertexBufferHandle vtxbuf_handle, const int3 src,
+                                            const int3 dst, const int num_vertices);
+
+AcResult acNodeLoadMesh(const Node node, const Stream stream, const AcMesh host_mesh);
+AcResult acNodeLoadMeshWithOffset(const Node node, const Stream stream, const AcMesh host_mesh,
+                                  const int3 src, const int3 dst, const int num_vertices);
+AcResult acNodeLoadVertexBuffer(const Node node, const Stream stream, const AcMesh host_mesh,
+                                const VertexBufferHandle vtxbuf_handle);
+AcResult acNodeLoadVertexBufferWithOffset(const Node node, const Stream stream,
+                                          const AcMesh host_mesh,
+                                          const VertexBufferHandle vtxbuf_handle, const int3 src,
+                                          const int3 dst, const int num_vertices);
+```
+
+Storing meshes and vertex buffer to host memory.
+```C
+AcResult acDeviceStoreMesh(const Device device, const Stream stream, AcMesh* host_mesh);
+AcResult acDeviceStoreMeshWithOffset(const Device device, const Stream stream, const int3 src,
+                                     const int3 dst, const int num_vertices, AcMesh* host_mesh);
+AcResult acDeviceStoreVertexBuffer(const Device device, const Stream stream,
+                                   const VertexBufferHandle vtxbuf_handle, AcMesh* host_mesh);
+AcResult acDeviceStoreMeshWithOffset(const Device device, const Stream stream, const int3 src,
+                                     const int3 dst, const int num_vertices, AcMesh* host_mesh);
+
+AcResult acNodeStoreMesh(const Node node, const Stream stream, AcMesh* host_mesh);
+AcResult acNodeStoreMeshWithOffset(const Node node, const Stream stream, const int3 src,
+                                   const int3 dst, const int num_vertices, AcMesh* host_mesh);
+AcResult acNodeStoreVertexBuffer(const Node node, const Stream stream,
+                                 const VertexBufferHandle vtxbuf_handle, AcMesh* host_mesh);
+AcResult acNodeStoreVertexBufferWithOffset(const Node node, const Stream stream,
+                                           const VertexBufferHandle vtxbuf_handle, const int3 src,
+                                           const int3 dst, const int num_vertices,
+                                           AcMesh* host_mesh);
+```
+
+Transferring data between devices
+```C
+AcResult acDeviceTransferMesh(const Device src_device, const Stream stream, Device dst_device);
+AcResult acDeviceTransferMeshWithOffset(const Device src_device, const Stream stream,
+                                        const int3 src, const int3 dst, const int num_vertices,
+                                        Device* dst_device);
+AcResult acDeviceTransferVertexBuffer(const Device src_device, const Stream stream,
+                                      const VertexBufferHandle vtxbuf_handle, Device dst_device);
+AcResult acDeviceTransferVertexBufferWithOffset(const Device src_device, const Stream stream,
+                                                const VertexBufferHandle vtxbuf_handle,
+                                                const int3 src, const int3 dst,
+                                                const int num_vertices, Device dst_device);
+```
+
+Loading uniforms (device constants)
+```C
+AcResult acDeviceLoadScalarConstant(const Device device, const Stream stream,
+                                    const AcRealParam param, const AcReal value);
+AcResult acDeviceLoadVectorConstant(const Device device, const Stream stream,
+                                    const AcReal3Param param, const AcReal3 value);
+AcResult acDeviceLoadIntConstant(const Device device, const Stream stream, const AcIntParam param,
+                                 const int value);
+AcResult acDeviceLoadInt3Constant(const Device device, const Stream stream, const AcInt3Param param,
+                                  const int3 value);
+AcResult acDeviceLoadScalarArray(const Device device, const Stream stream,
+                                 const ScalarArrayHandle handle, const AcReal* data,
+                                 const size_t num);
+AcResult acDeviceLoadMeshInfo(const Device device, const Stream stream,
+                              const AcMeshInfo device_config);
+```
+
+
+### Computation
+
+```C
+AcResult acDeviceIntegrateSubstep(const Device device, const Stream stream, const int step_number,
+                                  const int3 start, const int3 end, const AcReal dt);
+AcResult acDevicePeriodicBoundcondStep(const Device device, const Stream stream,
+                                       const VertexBufferHandle vtxbuf_handle, const int3 start,
+                                       const int3 end);
+AcResult acDevicePeriodicBoundconds(const Device device, const Stream stream, const int3 start,
+                                    const int3 end);
+AcResult acDeviceReduceScal(const Device device, const Stream stream, const ReductionType rtype,
+                            const VertexBufferHandle vtxbuf_handle, AcReal* result);
+AcResult acDeviceReduceVec(const Device device, const Stream stream_type, const ReductionType rtype,
+                           const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1,
+                           const VertexBufferHandle vtxbuf2, AcReal* result);
+
+AcResult acNodeIntegrateSubstep(const Node node, const Stream stream, const int step_number,
+                                const int3 start, const int3 end, const AcReal dt);
+AcResult acNodeIntegrate(const Node node, const AcReal dt);
+AcResult acNodePeriodicBoundcondStep(const Node node, const Stream stream,
+                                     const VertexBufferHandle vtxbuf_handle);
+AcResult acNodePeriodicBoundconds(const Node node, const Stream stream);
+AcResult acNodeReduceScal(const Node node, const Stream stream, const ReductionType rtype,
+                          const VertexBufferHandle vtxbuf_handle, AcReal* result);
+AcResult acNodeReduceVec(const Node node, const Stream stream_type, const ReductionType rtype,
+                         const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1,
+                         const VertexBufferHandle vtxbuf2, AcReal* result);
+```
+
+### Synchronization
+
+The functions for synchronizing with certain stream and swapping input and output buffers are
+defined as follows. Reductions and boundary conditions declared in the previous subsection operate
+in-place on the input array, while integration places the result in the output buffer.
+Therefore ac*SwapBuffers() should always be called after an integration substep has been completed.
+
+The node layer introduces two new functions in addition to synchronization and swapping functions,
+acNodeSynchronizeVertexBuffer and acNodeSynchronizeMesh. These functions communicate the overlapping
+vertices between neighboring devices.
+Note that part of the overlapping ghost zone is also communicated when synchronizing vertex buffers
+and meshes. Therefore the data in this part of the ghost zone must be up to date before calling
+the communication functions.
+
+```C
+AcResult acDeviceSynchronizeStream(const Device device, const Stream stream);
+AcResult acDeviceSwapBuffers(const Device device);
+
+AcResult acNodeSynchronizeStream(const Node node, const Stream stream);
+AcResult acNodeSwapBuffers(const Node node);
+AcResult acNodeSynchronizeVertexBuffer(const Node node, const Stream stream,
+                                       const VertexBufferHandle vtxbuf_handle);
+AcResult acNodeSynchronizeMesh(const Node node, const Stream stream);
+```
+
+## Devices
+
+The data type Device is a handle to some single device and is used in the Device layer functions to specify which device should execute the function. A device is created and destroyed with the following interface functions.
+```C
+AcResult acDeviceCreate(const int device_id, const AcMeshInfo device_config, Device* device);
+
+AcResult acDeviceDestroy(Device device);
+```
+
+## Nodes
+
+The data type Node is a handle to some node, which consists of multiple devices. The Node handle is used to specify which node the Node layer functions should operate in. A node is created and destroyed with the following interface functions.
+```C
+AcResult acNodeCreate(const int id, const AcMeshInfo node_config, Node* node);
+
+AcResult acNodeDestroy(Node node);
+```
+
+The function acNodeCreate calls acDeviceCreate for all devices that are visible from the current process. After a node has been created, the devices in it can be retrived with the function
+```C
+AcResult acNodeQueryDeviceConfiguration(const Node node, DeviceConfiguration* config);
+```
+where DeviceConfiguration is defined as
+```C
+typedef struct {
+    int num_devices;
+    Device* devices; // Todo make this predefined s.t. the user/us do not have to alloc/free
+
+    Grid grid;
+    Grid subgrid;
+} DeviceConfiguration;
+```
 
 ## Meshes
 
+Meshes are the primary structures for passing information to the library and kernels. The definition
+of a mesh is declared as
+```C
+typedef struct {
+    int int_params[NUM_INT_PARAMS];
+    int3 int3_params[NUM_INT3_PARAMS];
+    AcReal real_params[NUM_REAL_PARAMS];
+    AcReal3 real3_params[NUM_REAL3_PARAMS];
+} AcMeshInfo;
+
+typedef struct {
+    AcReal* vertex_buffer[NUM_VTXBUF_HANDLES];
+    AcMeshInfo info;
+} AcMesh;
+```
+
+Several built-in parameters, such as the dimensions of the mesh, and all user-defined parameters
+are defined in the `AcMeshInfo` structure. Before passing AcMeshInfo to an initialization function,
+such as `acDeviceCreate()`, the built-in parameters `AC_nx, AC_ny, AC_nz` must be set. These
+parameters define the dimensions of the computational domain of the mesh. For example,
+```C
+AcMeshInfo info;
+info.int_params[AC_nx] = 128;
+info.int_params[AC_ny] = 64;
+info.int_params[AC_nz] = 32;
+
+Device device;
+acDeviceCreate(0, info, &device);
+```
+
+AcMesh is used in loading and storing data from host to device and vice versa. Before calling for
+example `acDeviceLoadMesh()`, one must ensure that all `NUM_VTXBUF_HANDLES` are pointers to valid
+arrays in host memory consisting of `mx * my * mz` elements and stored in order
+`i + j * mx + k * mx * my`, where `i`, `j` and `k` correspond to `x`, `y` and `z` coordinate
+indices, respectively. The mesh dimensions can be queried with 
+```C
+int mx = info.int_params[AC_mx];
+int my = info.int_params[AC_my];
+int mz = info.int_params[AC_mz];
+```
+after initialization.
+
+
+
+
+### Decomposition
+Grids and subgrids contain the dimensions of the the mesh decomposed to multiple devices.
+```C
+typedef struct {
+    int3 m; // Size of the simulation domain (includes the ghost zones)
+    int3 n; // Size of the computational domain (without ghost zones)
+} Grid;
+```
+Each device is assigned a block of data 
+
+
 ## Streams and synchronization
 
-## Interface
+## Integration, reductions and boundary conditions
 
 
 # Astaroth DSL
 
 ## Uniforms
 
+### Control flow and implementing switches
+// Runtime constants are as fast as compile-time constants as long as
+// 1) They are not placed in tight loops, especially those that inlcude global memory accesses, that could be unrolled
+// 2) They are not multiplied with each other
+// 3) At least 32 neighboring threads in the x-axis access the same constant
+
+// Safe and efficient to use as switches
+
 ## Vertex buffers
 
 ### Input and output buffers
 
 ## Built-in variables and functions
 
+## Functions
+
+### Kernel
+
+### Preprocessed
+
+
+
 

From 53230c9b61ca004c5eec794e3f2a43a91d5b40ee Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@aalto.fi>
Date: Thu, 5 Sep 2019 19:56:04 +0300
Subject: [PATCH 06/12] Added errorchecking and more flexibility the the new
 acDeviceLoadScalarArray function

---
 include/astaroth_device.h | 4 ++--
 src/core/device.cu        | 9 +++++++--
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/include/astaroth_device.h b/include/astaroth_device.h
index 7a800c2..6ce7cda 100644
--- a/include/astaroth_device.h
+++ b/include/astaroth_device.h
@@ -63,8 +63,8 @@ AcResult acDeviceLoadInt3Constant(const Device device, const Stream stream, cons
 
 /** */
 AcResult acDeviceLoadScalarArray(const Device device, const Stream stream,
-                                 const ScalarArrayHandle handle, const AcReal* data,
-                                 const size_t num);
+                                 const ScalarArrayHandle handle, const size_t start,
+                                 const AcReal* data, const size_t num);
 
 /** */
 AcResult acDeviceLoadMeshInfo(const Device device, const Stream stream,
diff --git a/src/core/device.cu b/src/core/device.cu
index 33229b8..b5d3ebb 100644
--- a/src/core/device.cu
+++ b/src/core/device.cu
@@ -449,10 +449,15 @@ acDeviceLoadInt3Constant(const Device device, const Stream stream, const AcInt3P
 
 AcResult
 acDeviceLoadScalarArray(const Device device, const Stream stream, const ScalarArrayHandle handle,
-                        const AcReal* data, const size_t num)
+                        const size_t start, const AcReal* data, const size_t num)
 {
     cudaSetDevice(device->id);
-    ERRCHK_CUDA(cudaMemcpyAsync(device->vba.profiles[handle], data, sizeof(data[0]) * num,
+
+    ERRCHK(start + num <= max(device->local_config.int_params[AC_mx],
+                              max(device->local_config.int_params[AC_my],
+                                  device->local_config.int_params[AC_mz])));
+
+    ERRCHK_CUDA(cudaMemcpyAsync(&device->vba.profiles[handle][start], data, sizeof(data[0]) * num,
                                 cudaMemcpyHostToDevice, device->streams[stream]));
     return AC_SUCCESS;
 }

From 021e5f3774da7bd50441ab81968cb815c5eb2ce4 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@aalto.fi>
Date: Thu, 12 Sep 2019 15:48:38 +0300
Subject: [PATCH 07/12] Renamed NUM_STREAM_TYPES -> NUM_STREAMS

---
 include/astaroth_defines.h | 4 ++--
 src/core/device.cu         | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/include/astaroth_defines.h b/include/astaroth_defines.h
index 1c7ba1e..ad5d39b 100644
--- a/include/astaroth_defines.h
+++ b/include/astaroth_defines.h
@@ -127,9 +127,9 @@ typedef enum {
     STREAM_14,
     STREAM_15,
     STREAM_16,
-    NUM_STREAM_TYPES
+    NUM_STREAMS
 } Stream;
-#define STREAM_ALL (NUM_STREAM_TYPES)
+#define STREAM_ALL (NUM_STREAMS)
 
 #define AC_GEN_ID(X) X
 typedef enum {
diff --git a/src/core/device.cu b/src/core/device.cu
index b5d3ebb..9ab7df4 100644
--- a/src/core/device.cu
+++ b/src/core/device.cu
@@ -46,7 +46,7 @@ struct device_s {
     AcMeshInfo local_config;
 
     // Concurrency
-    cudaStream_t streams[NUM_STREAM_TYPES];
+    cudaStream_t streams[NUM_STREAMS];
 
     // Memory
     VertexBufferArray vba;
@@ -163,7 +163,7 @@ acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device_hand
     printf("Success!\n");
 
     // Concurrency
-    for (int i = 0; i < NUM_STREAM_TYPES; ++i) {
+    for (int i = 0; i < NUM_STREAMS; ++i) {
         cudaStreamCreateWithPriority(&device->streams[i], cudaStreamNonBlocking, 0);
     }
 
@@ -228,7 +228,7 @@ acDeviceDestroy(Device device)
 #endif
 
     // Concurrency
-    for (int i = 0; i < NUM_STREAM_TYPES; ++i) {
+    for (int i = 0; i < NUM_STREAMS; ++i) {
         cudaStreamDestroy(device->streams[i]);
     }
 

From e8745e282a5f22424ea33f83ff08cc77e3ce7c9c Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@aalto.fi>
Date: Thu, 12 Sep 2019 19:34:43 +0300
Subject: [PATCH 08/12] Added the library API specification

---
 .../API_specification_and_user_manual.md      | 124 ++++++++++++++----
 1 file changed, 98 insertions(+), 26 deletions(-)

diff --git a/doc/Astaroth_API_specification_and_user_manual/API_specification_and_user_manual.md b/doc/Astaroth_API_specification_and_user_manual/API_specification_and_user_manual.md
index b0e364e..a928d44 100644
--- a/doc/Astaroth_API_specification_and_user_manual/API_specification_and_user_manual.md
+++ b/doc/Astaroth_API_specification_and_user_manual/API_specification_and_user_manual.md
@@ -217,46 +217,81 @@ AcResult acNodeReduceVec(const Node node, const Stream stream_type, const Reduct
                          const VertexBufferHandle vtxbuf2, AcReal* result);
 ```
 
-### Synchronization
+### Stream synchronization
 
-The functions for synchronizing with certain stream and swapping input and output buffers are
-defined as follows. Reductions and boundary conditions declared in the previous subsection operate
-in-place on the input array, while integration places the result in the output buffer.
-Therefore ac*SwapBuffers() should always be called after an integration substep has been completed.
-
-The node layer introduces two new functions in addition to synchronization and swapping functions,
-acNodeSynchronizeVertexBuffer and acNodeSynchronizeMesh. These functions communicate the overlapping
-vertices between neighboring devices.
-Note that part of the overlapping ghost zone is also communicated when synchronizing vertex buffers
-and meshes. Therefore the data in this part of the ghost zone must be up to date before calling
-the communication functions.
+All library functions that take a `Stream` as a parameter are asynchronous. When calling these functions,
+the control returns immediately back to the host even if the called device function has not yet completed.
+Therefore special care must be taken in order to ensure proper synchronization.
 
+Synchronization is done using `Stream` primitives, defined as
 ```C
-AcResult acDeviceSynchronizeStream(const Device device, const Stream stream);
-AcResult acDeviceSwapBuffers(const Device device);
+typedef enum { STREAM_DEFAULT, STREAM_0, ..., STREAM_16, NUM_STREAMS } Stream;
+#define STREAM_ALL (NUM_STREAMS)
+```. 
 
-AcResult acNodeSynchronizeStream(const Node node, const Stream stream);
-AcResult acNodeSwapBuffers(const Node node);
+Functions queued in the same stream will be executed sequentially. If two or more consequent
+functions are queued in different streams, then these functions may execute in parallel. Finally,
+there is a barrier synchronization function, which blocks until all functions in some stream have
+completed. The Astaroth API provides barrier synchronization with functions `acDeviceSynchronize` and
+`acNodeSynchronize`. All streams can be synchronized at once by passing the alias `STREAM_ALL` to
+the synchronization function.
+
+Usage of streams is demonstrated with the following example.
+```C
+funcA(STREAM_0);
+funcB(STREAM_0); // Blocks until funcA has completed
+funcC(STREAM_1); // May execute in parallel with funcB
+synchronizeStream(STREAM_ALL); // Blocks until functions in all streams have completed
+funcD(STREAM_2); // Is started when command returns from synchronizeStream()
+```
+
+### Data synchronization
+
+Stream synchronization works in the same fashion on node and device layers. However, on the node
+layer one has to take in account that a portion of the mesh is shared between devices and that the
+data is always up to date.
+
+In stencil computations, the mesh is surrounded by a halo, where data is only used for updating grid
+points near the boundaries of the simulation domain. A portion of this halo is shared by neighboring
+devices. As there is no way of knowing when the user has completed operations on the mesh, the data
+communication between neighboring devices must be explicitly triggered. For this purpose, we provide
+the functions
+```C
+AcResult acNodeSynchronizeMesh(const Node node, const Stream stream);
 AcResult acNodeSynchronizeVertexBuffer(const Node node, const Stream stream,
                                        const VertexBufferHandle vtxbuf_handle);
-AcResult acNodeSynchronizeMesh(const Node node, const Stream stream);
+
 ```
 
+> **NOTE**: Local halos must be up to date before synchronizing the data. Local halos are the grid points outside the computational domain which are used only by a single device. The mesh is distributed to multiple devices by blocking along the z axis. If there are *n* devices and the z-dimension of the computational domain is *nz*, then each device is assigned *nz / n* two-dimensional planes. For example with two devices, the data block that has to be up to date ranges from *(0, 0, nz)* to *(mx, my, nz + 2 * NGHOST)* 
+
+### Input and output buffers
+
+The mesh is duplicated to input and output buffers for performance reasons. The input buffers are
+read-only in user-specified compute kernels, which allows us to read them via the texture cache optimized
+for spatially local memory accesses. The results of compute kernels are written into the output buffers.
+
+Since we allow the user to operate on subsets of the computational domain in user-specified kernels, we have no way to know when the output buffers are complete and can be swapped. Therefore the user should explicitly state when the input and output buffer should be swapped. This is done via the API calls
+```C
+AcResult acDeviceSwapBuffers(const Device device);
+AcResult acNodeSwapBuffers(const Node node);
+```
+> All functions provided with the API operate on input buffers and ensure that the complete result
+is available in the input buffer when the function has completed. User-specified kernels are exceptions and write the result to output buffers. Therefore buffers have to be swapped only after calling user-specified kernels.
+
 ## Devices
 
-The data type Device is a handle to some single device and is used in the Device layer functions to specify which device should execute the function. A device is created and destroyed with the following interface functions.
+`Device` is a handle to some single device and is used in device layer functions to specify which device should execute the function. A `Device` is created and destroyed with the following interface functions.
 ```C
 AcResult acDeviceCreate(const int device_id, const AcMeshInfo device_config, Device* device);
-
 AcResult acDeviceDestroy(Device device);
 ```
 
 ## Nodes
 
-The data type Node is a handle to some node, which consists of multiple devices. The Node handle is used to specify which node the Node layer functions should operate in. A node is created and destroyed with the following interface functions.
+`Node` is a handle to some compute node which consists of multiple devices. The `Node` handle is used to specify which node the node layer functions should operate in. A node is created and destroyed with the following interface functions.
 ```C
 AcResult acNodeCreate(const int id, const AcMeshInfo node_config, Node* node);
-
 AcResult acNodeDestroy(Node node);
 ```
 
@@ -278,7 +313,7 @@ typedef struct {
 ## Meshes
 
 Meshes are the primary structures for passing information to the library and kernels. The definition
-of a mesh is declared as
+of a `Mesh` is declared as
 ```C
 typedef struct {
     int int_params[NUM_INT_PARAMS];
@@ -320,8 +355,6 @@ int mz = info.int_params[AC_mz];
 after initialization.
 
 
-
-
 ### Decomposition
 Grids and subgrids contain the dimensions of the the mesh decomposed to multiple devices.
 ```C
@@ -330,13 +363,52 @@ typedef struct {
     int3 n; // Size of the computational domain (without ghost zones)
 } Grid;
 ```
-Each device is assigned a block of data 
 
+As briefly discussed in the section Data synchronization, a `Mesh` is distributed to multiple devices
+by blocking the data along the *z*-axis. Given the mesh dimensions *(mx, my, mz)*, its computational
+domain *(nx, ny, nz)* and *n* number of devices, then each device is assigned a mesh of size
+*(mx, my, 2 * NGHOST + nz/n)* and a computational domain of size *(nx, ny, nz/n)*.
 
-## Streams and synchronization
+Let *i* be the device id. The portion of the halos shared by neighboring devices is then *(0, 0, i * nz/n)* - *(mx, my, 2 * NGHOST + i * nz/n)*. The functions `acNodeSynchronizeVertexBuffer` and `acNodeSynchronizeMesh` communicate these shared areas among the devices in the node.
 
 ## Integration, reductions and boundary conditions
 
+The library provides the following functions for integration, reductions and computing periodic boundary conditions.
+```C
+AcResult acDeviceIntegrateSubstep(const Device device, const Stream stream, const int step_number,
+                                  const int3 start, const int3 end, const AcReal dt);
+AcResult acDevicePeriodicBoundcondStep(const Device device, const Stream stream,
+                                       const VertexBufferHandle vtxbuf_handle, const int3 start,
+                                       const int3 end);
+AcResult acDevicePeriodicBoundconds(const Device device, const Stream stream, const int3 start,
+                                    const int3 end);
+AcResult acDeviceReduceScal(const Device device, const Stream stream, const ReductionType rtype,
+                            const VertexBufferHandle vtxbuf_handle, AcReal* result);
+AcResult acDeviceReduceVec(const Device device, const Stream stream_type, const ReductionType rtype,
+                           const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1,
+                           const VertexBufferHandle vtxbuf2, AcReal* result);
+
+AcResult acNodeIntegrateSubstep(const Node node, const Stream stream, const int step_number,
+                                const int3 start, const int3 end, const AcReal dt);
+AcResult acNodeIntegrate(const Node node, const AcReal dt);
+AcResult acNodePeriodicBoundcondStep(const Node node, const Stream stream,
+                                     const VertexBufferHandle vtxbuf_handle);
+AcResult acNodePeriodicBoundconds(const Node node, const Stream stream);
+AcResult acNodeReduceScal(const Node node, const Stream stream, const ReductionType rtype,
+                          const VertexBufferHandle vtxbuf_handle, AcReal* result);
+AcResult acNodeReduceVec(const Node node, const Stream stream_type, const ReductionType rtype,
+                         const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1,
+                         const VertexBufferHandle vtxbuf2, AcReal* result);
+```
+
+Finally, there's a library function that is automatically generated for all user-specified `Kernel`
+functions written with the Astaroth DSL,
+```C
+AcResult acDeviceKernel_##identifier(const Device device, const Stream stream,
+                                     const int3 start, const int3 end);
+```
+Where `##identifier` is replaced with the name of the user-specified kernel. For example, a device
+function `Kernel solve()` can be called with `acDeviceKernel_solve()` via the API.
 
 # Astaroth DSL
 

From bfd00f12d15fb8b22388d9b43247d98b600727bb Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Thu, 12 Sep 2019 16:39:26 +0000
Subject: [PATCH 09/12] API_specification_and_user_manual.md edited online with
 Bitbucket. Syntax fixes.

---
 .../API_specification_and_user_manual.md                   | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/doc/Astaroth_API_specification_and_user_manual/API_specification_and_user_manual.md b/doc/Astaroth_API_specification_and_user_manual/API_specification_and_user_manual.md
index a928d44..45e5930 100644
--- a/doc/Astaroth_API_specification_and_user_manual/API_specification_and_user_manual.md
+++ b/doc/Astaroth_API_specification_and_user_manual/API_specification_and_user_manual.md
@@ -1,4 +1,4 @@
-# Astaroth specification and user manual
+a# Astaroth specification and user manual
 
 Copyright (C) 2014-2019, Johannes Pekkila, Miikka Vaisala.
 
@@ -227,7 +227,7 @@ Synchronization is done using `Stream` primitives, defined as
 ```C
 typedef enum { STREAM_DEFAULT, STREAM_0, ..., STREAM_16, NUM_STREAMS } Stream;
 #define STREAM_ALL (NUM_STREAMS)
-```. 
+```
 
 Functions queued in the same stream will be executed sequentially. If two or more consequent
 functions are queued in different streams, then these functions may execute in parallel. Finally,
@@ -276,8 +276,7 @@ Since we allow the user to operate on subsets of the computational domain in use
 AcResult acDeviceSwapBuffers(const Device device);
 AcResult acNodeSwapBuffers(const Node node);
 ```
-> All functions provided with the API operate on input buffers and ensure that the complete result
-is available in the input buffer when the function has completed. User-specified kernels are exceptions and write the result to output buffers. Therefore buffers have to be swapped only after calling user-specified kernels.
+> **NOTE**: All functions provided with the API operate on input buffers and ensure that the complete result is available in the input buffer when the function has completed. User-specified kernels are exceptions and write the result to output buffers. Therefore buffers have to be swapped only after calling user-specified kernels.
 
 ## Devices
 

From e351902dc0c2f527b286e94a8bf6496f78f36c8e Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Thu, 12 Sep 2019 16:41:11 +0000
Subject: [PATCH 10/12] Fixed a failed fix.

---
 .../API_specification_and_user_manual.md                        | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/Astaroth_API_specification_and_user_manual/API_specification_and_user_manual.md b/doc/Astaroth_API_specification_and_user_manual/API_specification_and_user_manual.md
index 45e5930..4ee06dc 100644
--- a/doc/Astaroth_API_specification_and_user_manual/API_specification_and_user_manual.md
+++ b/doc/Astaroth_API_specification_and_user_manual/API_specification_and_user_manual.md
@@ -1,4 +1,4 @@
-a# Astaroth specification and user manual
+# Astaroth specification and user manual
 
 Copyright (C) 2014-2019, Johannes Pekkila, Miikka Vaisala.
 

From 4ce51ea60e668063000255a8dede8303ba190e69 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@aalto.fi>
Date: Thu, 12 Sep 2019 20:11:21 +0300
Subject: [PATCH 11/12] Now the generated CUDA header files are completely
 local (placed in the build directory) instead of depending on some predefined
 directory structure. This allows users to swap between build directories
 without having to recompile.

---
 CMakeLists.txt         |  1 +
 scripts/compile_acc.sh | 13 +++++++------
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4cf61b3..774bfa1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -50,6 +50,7 @@ endif ()
 ## Include directories
 include_directories(.)
 include_directories(include)
+include_directories(${CMAKE_BINARY_DIR})
 
 ## Subdirectories
 add_subdirectory(src/core) # The core library
diff --git a/scripts/compile_acc.sh b/scripts/compile_acc.sh
index a91d244..640e129 100755
--- a/scripts/compile_acc.sh
+++ b/scripts/compile_acc.sh
@@ -5,6 +5,7 @@ then
        echo "ASTAROTH_HOME environment variable not set, run \"source ./sourceme.sh\" in Astaroth home directory"
        exit 1
 fi
+OUTPUT_DIR=${PWD}
 
 KERNEL_DIR=${AC_HOME}"/src/core/kernels"
 ACC_DIR=${AC_HOME}"/acc"
@@ -69,11 +70,11 @@ ${ACC_DIR}/compile.sh ${ACC_DIR}/${ACC_SAS}
 ${ACC_DIR}/compile.sh ${ACC_DIR}/${ACC_SPS}
 ${ACC_DIR}/compile.sh ${ACC_DIR}/${ACC_HEADER}
 
-echo "Moving stencil_assembly.cuh -> ${AC_HOME}/src/core/kernels"
-mv stencil_assembly.cuh ${AC_HOME}/src/core/kernels
+echo "Moving stencil_assembly.cuh -> ${OUTPUT_DIR}"
+mv stencil_assembly.cuh ${OUTPUT_DIR}
 
-echo "Moving stencil_process.cuh -> ${AC_HOME}/src/core/kernels"
-mv stencil_process.cuh ${AC_HOME}/src/core/kernels
+echo "Moving stencil_process.cuh -> ${OUTPUT_DIR}"
+mv stencil_process.cuh ${OUTPUT_DIR}
 
-echo "Moving stencil_defines.cuh -> ${AC_HOME}/include"
-mv stencil_defines.h ${AC_HOME}/include
+echo "Moving stencil_defines.cuh -> ${OUTPUT_DIR}"
+mv stencil_defines.h ${OUTPUT_DIR}

From 55e4357d77195383b3f1eed31f04e360dfa2fe4f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Miikka=20V=C3=A4is=C3=A4l=C3=A4?=
 <mvaisala@asiaa.sinica.edu.tw>
Date: Mon, 16 Sep 2019 02:16:15 +0000
Subject: [PATCH 12/12]  compile_acc.sh edited online with Bitbucket. 
 Corrections to --help.

---
 scripts/compile_acc.sh | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/scripts/compile_acc.sh b/scripts/compile_acc.sh
index 640e129..efbcc71 100755
--- a/scripts/compile_acc.sh
+++ b/scripts/compile_acc.sh
@@ -26,7 +26,9 @@ while [ "$#" -gt 0 ]
 do
 	case $1 in
 		-h|--help)
-			echo "You can set a custom files for DSL under the path $AC_HOME/"
+			echo "This script will help to compile DSL to CUDA code."
+			echo "The resulting kernels will be stored to $OUTPUT_DIR."
+			echo "You can set a custom files for DSL under the path $AC_DIR/"
 			echo "Example:"
 			echo "compile_acc.sh -a custom_setup/custom_assembly.sas -p custom_setup/custom_process.sps --header custom_setup/custom_header.h"
 			exit 0