Added Astaroth 2.0

This commit is contained in:
jpekkila
2019-06-14 14:18:35 +03:00
parent 4e4f84c8ff
commit 0e48766a68
87 changed files with 18058 additions and 1 deletion

70
src/core/CMakeLists.txt Normal file

@@ -0,0 +1,70 @@
########################################
## CMakeLists.txt for Astaroth Core ##
########################################
#----------------------Find CUDA-----------------------------------------------#
find_package(CUDA)
if (NOT CUDA_FOUND)
# find_package(CUDA REQUIRED) gives a confusing error message if it fails,
# therefore we print the reason here explicitly
message(FATAL_ERROR "CUDA not found")
endif()
#----------------------CUDA settings-------------------------------------------#
set(CUDA_SEPARABLE_COMPILATION ON)
set(CUDA_PROPAGATE_HOST_FLAGS ON)
# CUDA_BUILD_CUBIN requires that we're compiling for only one architecture
# set(CUDA_BUILD_CUBIN ON)
#----------------------Setup CUDA compilation flags----------------------------#
# Generate code for the supported GPU architectures (Kepler K80, Maxwell and Pascal)
set(CUDA_ARCH_FLAGS -gencode arch=compute_37,code=sm_37
-gencode arch=compute_50,code=sm_50
-gencode arch=compute_60,code=sm_60
-gencode arch=compute_61,code=sm_61
-lineinfo
--maxrregcount=255
-ftz=true
-std=c++11) # ftz=true: flush denormalized floats to zero
# -Xptxas -dlcm=ca opt-in to cache all global loads to L1/texture cache
# =cg to opt out
# Additional CUDA optimization flags
if (CMAKE_BUILD_TYPE MATCHES RELEASE)
# Doesn't set any additional flags, see CUDA_NVCC_FLAGS_DEBUG below on how
# to add more
set(CUDA_NVCC_FLAGS_RELEASE ${CUDA_NVCC_FLAGS_RELEASE})
endif()
# Additional CUDA debug flags
if (CMAKE_BUILD_TYPE MATCHES DEBUG)
# The debug flags must be set inside this if clause, since either CMake 3.5
# or nvcc 7.5 is buggy:
# CMake converts these into empty strings when doing a RELEASE build, but nvcc
# 7.5 fails to parse empty flags.
set(CUDA_NVCC_FLAGS_DEBUG ${CUDA_NVCC_FLAGS_DEBUG};
--device-debug;
--generate-line-info;
--ptxas-options=-v)
endif()
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};${CUDA_ARCH_FLAGS}")
message("CUDA_NVCC_FLAGS: " ${CUDA_NVCC_FLAGS})
#------------------Compile and create a static library-------------------------#
file(GLOB CUDA_SOURCES "*.cu" "kernels/*.cu")
# Use -fPIC if -fpic not supported. Some quick non-scientific tests:
# Without fpic: 4.94 user, 4.04 system, 0:09.88 elapsed
# With fpic: 4.96 user, 4.02 system, 0:09.90 elapsed
# With fPIC: 4.94 user, 4.05 system, 0:10.23 elapsed
CUDA_ADD_LIBRARY(astaroth_core STATIC ${CUDA_SOURCES} OPTIONS --compiler-options "-fpic")
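# A minimal consumption sketch (not part of this file; "ac_run" and main.cu are
# hypothetical): a parent CMakeLists.txt could link against the library with e.g.
#   cuda_add_executable(ac_run main.cu)
#   target_link_libraries(ac_run astaroth_core)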

451
src/core/astaroth.cu Normal file

@@ -0,0 +1,451 @@
/*
Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
This file is part of Astaroth.
Astaroth is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Astaroth is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* @file
* \brief Multi-GPU implementation.
*
* Detailed info.
*
*/
#include "astaroth.h"
#include "errchk.h"
#include "device.cuh"
#include "math_utils.h" // sum for reductions
#include "standalone/config_loader.h" // update_config
const char* intparam_names[] = {AC_FOR_INT_PARAM_TYPES(AC_GEN_STR)};
const char* realparam_names[] = {AC_FOR_REAL_PARAM_TYPES(AC_GEN_STR)};
const char* vtxbuf_names[] = {AC_FOR_VTXBUF_HANDLES(AC_GEN_STR)};
static const int MAX_NUM_DEVICES = 32;
static int num_devices = 1;
static Device devices[MAX_NUM_DEVICES] = {};
typedef struct {
int3 m;
int3 n;
} Grid;
static Grid
createGrid(const AcMeshInfo& config)
{
Grid grid;
grid.m = (int3) {
config.int_params[AC_mx],
config.int_params[AC_my],
config.int_params[AC_mz]
};
grid.n = (int3) {
config.int_params[AC_nx],
config.int_params[AC_ny],
config.int_params[AC_nz]
};
return grid;
}
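// Note: n spans the computational domain and m the full extent including ghost
// zones; presumably m = n + STENCIL_ORDER (a ghost zone of STENCIL_ORDER/2 cells
// on each side), as set up by update_config() in the standalone config loader.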
static Grid grid; // A grid consists of num_devices subgrids
static Grid subgrid;
static int
gridIdx(const Grid& grid, const int i, const int j, const int k)
{
return i + j * grid.m.x + k * grid.m.x * grid.m.y;
}
static int3
gridIdx3d(const Grid& grid, const int idx)
{
return (int3){idx % grid.m.x,
(idx % (grid.m.x * grid.m.y)) / grid.m.x,
idx / (grid.m.x * grid.m.y)};
}
void
printInt3(const int3 vec)
{
printf("(%d, %d, %d)", vec.x, vec.y, vec.z);
}
AcResult
acInit(const AcMeshInfo& config)
{
// Check devices
cudaGetDeviceCount(&num_devices);
if (num_devices < 1) {
ERROR("No CUDA devices found!");
return AC_FAILURE;
}
if (num_devices > MAX_NUM_DEVICES) {
WARNING("More devices found than MAX_NUM_DEVICES. Using only MAX_NUM_DEVICES");
num_devices = MAX_NUM_DEVICES;
}
if (!AC_MULTIGPU_ENABLED) {
WARNING("MULTIGPU_ENABLED was false. Using only one device");
num_devices = 1; // Use only one device if multi-GPU is not enabled
}
// Check that AC_nz is divisible by num_devices. This makes decomposing the
// problem domain across multiple GPUs much easier since we do not have to worry
// about remainders
ERRCHK_ALWAYS(config.int_params[AC_nz] % num_devices == 0);
// Decompose the problem domain
// The main grid
grid = createGrid(config);
// Subgrids
AcMeshInfo subgrid_config = config;
subgrid_config.int_params[AC_nz] /= num_devices;
update_config(&subgrid_config);
subgrid = createGrid(subgrid_config);
// Periodic boundary conditions become weird if the system can "fold onto itself".
ERRCHK_ALWAYS(subgrid.n.x >= STENCIL_ORDER);
ERRCHK_ALWAYS(subgrid.n.y >= STENCIL_ORDER);
ERRCHK_ALWAYS(subgrid.n.z >= STENCIL_ORDER);
printf("Grid m "); printInt3(grid.m); printf("\n");
printf("Grid n "); printInt3(grid.n); printf("\n");
printf("Subrid m "); printInt3(subgrid.m); printf("\n");
printf("Subrid n "); printInt3(subgrid.n); printf("\n");
// Initialize the devices
for (int i = 0; i < num_devices; ++i) {
createDevice(i, subgrid_config, &devices[i]);
printDeviceInfo(devices[i]);
}
return AC_SUCCESS;
}
AcResult
acQuit(void)
{
for (int i = 0; i < num_devices; ++i) {
destroyDevice(devices[i]);
}
return AC_SUCCESS;
}
int
gridIdxx(const Grid grid, const int3 idx)
{
return gridIdx(grid, idx.x, idx.y, idx.z);
}
AcResult
acLoadWithOffset(const AcMesh& host_mesh, const int3& src, const int num_vertices)
{
/*
Here we decompose the host mesh and distribute it among the GPUs in
the node.
The host mesh is a huge contiguous block of data. Its dimensions are given by
the global variable named "grid". A "grid" is decomposed into "subgrids",
one for each GPU. Here we check which parts of the range s0...s1 map
to the memory space stored by some GPU, spanning d0...d1, and transfer
the data if needed.
The index mapping is inherently quite involved, but here's a picture which
hopefully helps make sense out of all this.
Grid
|----num_vertices---|
xxx|....................................................|xxx
^ ^ ^ ^
d0 d1 s0 (src) s1
Subgrid
xxx|.............|xxx
^ ^
d0 d1
^ ^
db da
*/
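/*
A worked example with hypothetical numbers (assuming m = n + STENCIL_ORDER, see
the note at createGrid()): with grid.n = (128, 128, 128), STENCIL_ORDER = 6 and
num_devices = 2, we get grid.m = (134, 134, 134) and subgrid.m = (134, 134, 70).
For device 1, d0 = (0, 0, 64) and d1 = (134, 134, 134). Loading the full mesh
(src = (0, 0, 0), num_vertices = 134^3) gives s1 = (0, 0, 134), hence
da = (0, 0, 64), db = (0, 0, 134), copy_cells = 70 * 134 * 134 and
da_local = (0, 0, 64 - 1 * 128 / 2) = (0, 0, 0).
*/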
for (int i = 0; i < num_devices; ++i) {
const int3 d0 = (int3){0, 0, i * subgrid.n.z}; // DECOMPOSITION OFFSET HERE
const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.m.z};
const int3 s0 = src;
const int3 s1 = gridIdx3d(grid, gridIdx(grid, s0.x, s0.y, s0.z) + num_vertices);
const int3 da = (int3){max(s0.x, d0.x), max(s0.y, d0.y), max(s0.z, d0.z)};
const int3 db = (int3){min(s1.x, d1.x), min(s1.y, d1.y), min(s1.z, d1.z)};
/*
printf("Device %d\n", i);
printf("\ts0: "); printInt3(s0); printf("\n");
printf("\td0: "); printInt3(d0); printf("\n");
printf("\tda: "); printInt3(da); printf("\n");
printf("\tdb: "); printInt3(db); printf("\n");
printf("\td1: "); printInt3(d1); printf("\n");
printf("\ts1: "); printInt3(s1); printf("\n");
printf("\t-> %s to device %d\n", db.z >= da.z ? "Copy" : "Do not copy", i);
*/
if (db.z >= da.z) {
const int copy_cells = gridIdxx(subgrid, db) - gridIdxx(subgrid, da);
const int3 da_local = (int3){da.x, da.y, da.z - i * grid.n.z / num_devices}; // DECOMPOSITION OFFSET HERE
// printf("\t\tcopy %d cells to local index ", copy_cells); printInt3(da_local); printf("\n");
copyMeshToDevice(devices[i], STREAM_PRIMARY, host_mesh, da, da_local, copy_cells);
}
printf("\n");
}
return AC_SUCCESS;
}
AcResult
acStoreWithOffset(const int3& src, const int num_vertices, AcMesh* host_mesh)
{
// See acLoadWithOffset() for an explanation of the index mapping
for (int i = 0; i < num_devices; ++i) {
const int3 d0 = (int3){0, 0, i * subgrid.n.z}; // DECOMPOSITION OFFSET HERE
const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.m.z};
const int3 s0 = src;
const int3 s1 = gridIdx3d(grid, gridIdx(grid, s0.x, s0.y, s0.z) + num_vertices);
const int3 da = (int3){max(s0.x, d0.x), max(s0.y, d0.y), max(s0.z, d0.z)};
const int3 db = (int3){min(s1.x, d1.x), min(s1.y, d1.y), min(s1.z, d1.z)};
/*
printf("Device %d\n", i);
printf("\ts0: "); printInt3(s0); printf("\n");
printf("\td0: "); printInt3(d0); printf("\n");
printf("\tda: "); printInt3(da); printf("\n");
printf("\tdb: "); printInt3(db); printf("\n");
printf("\td1: "); printInt3(d1); printf("\n");
printf("\ts1: "); printInt3(s1); printf("\n");
printf("\t-> %s to device %d\n", db.z >= da.z ? "Copy" : "Do not copy", i);
*/
if (db.z >= da.z) {
const int copy_cells = gridIdxx(subgrid, db) - gridIdxx(subgrid, da);
const int3 da_local = (int3){da.x, da.y, da.z - i * grid.n.z / num_devices}; // DECOMPOSITION OFFSET HERE
// printf("\t\tcopy %d cells from local index ", copy_cells); printInt3(da_local); printf("\n");
copyMeshToHost(devices[i], STREAM_PRIMARY, da_local, da, copy_cells, host_mesh);
}
printf("\n");
}
return AC_SUCCESS;
}
// acCopyMeshToDevice
AcResult
acLoad(const AcMesh& host_mesh)
{
return acLoadWithOffset(host_mesh, (int3){0, 0, 0}, AC_VTXBUF_SIZE(host_mesh.info));
}
// acCopyMeshToHost
AcResult
acStore(AcMesh* host_mesh)
{
return acStoreWithOffset((int3){0, 0, 0}, AC_VTXBUF_SIZE(host_mesh->info), host_mesh);
}
AcResult
acIntegrateStep(const int& isubstep, const AcReal& dt)
{
const int3 start = (int3){STENCIL_ORDER/2, STENCIL_ORDER/2, STENCIL_ORDER/2};
const int3 end = (int3){STENCIL_ORDER/2 + subgrid.n.x,
STENCIL_ORDER/2 + subgrid.n.y,
STENCIL_ORDER/2 + subgrid.n.z};
for (int i = 0; i < num_devices; ++i) {
rkStep(devices[i], STREAM_PRIMARY, isubstep, start, end, dt);
}
return AC_SUCCESS;
}
AcResult
acBoundcondStep(void)
{
acSynchronize();
if (num_devices == 1) {
boundcondStep(devices[0], STREAM_PRIMARY,
(int3){0, 0, 0}, (int3){subgrid.m.x, subgrid.m.y, subgrid.m.z});
} else {
// Local boundary conditions
for (int i = 0; i < num_devices; ++i) {
const int3 d0 = (int3){0, 0, STENCIL_ORDER/2}; // DECOMPOSITION OFFSET HERE
const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.n.z};
boundcondStep(devices[i], STREAM_PRIMARY, d0, d1);
}
/*
// ===MIIKKANOTE START==========================================
%JP: The old way for computing boundary conditions conflicts with the
way we have to do things with multiple GPUs.
The older approach relied on unified memory, which represented the whole
memory area as one huge mesh instead of several smaller ones. However, unified memory
in its current state is meant more for quick prototyping when performance is not an issue.
Getting the CUDA driver to migrate data intelligently across GPUs is much harder than
managing the memory explicitly.
In this new approach, I have simplified the multi- and single-GPU layers significantly.
Quick rundown:
New struct: Grid. There are two global variables, "grid" and "subgrid", which
contain the extents of the whole simulation domain and the decomposed grids, respectively.
To simplify things, we require that each GPU is assigned the same amount of work;
therefore each GPU in the node is assigned a "subgrid.m"-sized block of data
to work with.
The whole simulation domain is decomposed with respect to the z dimension.
For example, if the grid contains (nx, ny, nz) vertices, then the subgrids
contain (nx, ny, nz / num_devices) vertices.
A local index (i, j, k) in some subgrid can be mapped to the global grid with
global idx = (i, j, k + device_id * subgrid.n.z)
Terminology:
- Single-GPU function: a function defined on the single-GPU layer (device.cu)
Changes required to this commented code block:
- The thread block dimensions (tpb) are no longer passed to the kernel here; they are set
in device.cu instead. The same holds for any complex index calculations: the local
coordinates should be passed as an int3 type without having to consider how the data is
actually laid out in device memory
- The unified memory buffer no longer exists (d_buffer). Instead, we have an opaque handle
of type "Device" which should be passed to single-GPU functions. In this file, all devices
are stored in a global array "devices[num_devices]".
- Every single-GPU function is executed asynchronously by default such that we
can optimize Astaroth by executing memory transactions concurrently with computation.
Therefore a StreamType should be passed as a parameter to single-GPU functions.
Refresher: CUDA function calls are non-blocking when a stream is explicitly passed
as a parameter and commands executing in different streams can be processed
in parallel/concurrently.
Note on periodic boundaries (might be helpful when implementing other boundary conditions):
With multiple GPUs, periodic boundary conditions applied on indices ranging from
(0, 0, STENCIL_ORDER/2) to (subgrid.m.x, subgrid.m.y, subgrid.m.z - STENCIL_ORDER/2)
on a single device are "local", in the sense that they can be computed without having
to exchange data with neighboring GPUs. Special care is needed only for transferring
the data to the front and back plates outside this range. In the solution we use here,
we solve the local boundaries first, and then just exchange the front and back plates
in a "ring", like so
device_id
(n) <-> 0 <-> 1 <-> ... <-> n <-> (0)
// ======MIIKKANOTE END==========================================
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MIIKKANOTE: This code block was essentially
moved into device.cu, function boundCondStep()
In astaroth.cu, we use acBoundcondStep()
just to distribute the work and manage
communication between GPUs.
printf("Boundconds best dims (%d, %d, %d) %f ms\n", best_dims.x, best_dims.y, best_dims.z, double(best_time) / NUM_ITERATIONS);
exit(0);
#else
const int depth = (int)ceil(mesh_info.int_params[AC_mz]/(float)num_devices);
const int3 start = (int3){0, 0, device_id * depth};
const int3 end = (int3){mesh_info.int_params[AC_mx],
mesh_info.int_params[AC_my],
min((device_id+1) * depth, mesh_info.int_params[AC_mz])};
const dim3 tpb(8,2,8);
// TODO uses the default stream currently
if (mesh_info.int_params[AC_bc_type] == 666) { // TODO MAKE A BETTER SWITCH
wedge_boundconds(0, tpb, start, end, d_buffer);
} else {
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i)
periodic_boundconds(0, tpb, start, end, d_buffer.in[i]);
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
*/
// Exchange halos
for (int i = 0; i < num_devices; ++i) {
const int num_vertices = subgrid.m.x * subgrid.m.y * STENCIL_ORDER/2;
// ...|ooooxxx|... -> xxx|ooooooo|...
{
const int3 src = (int3) {0, 0, subgrid.n.z};
const int3 dst = (int3) {0, 0, 0};
copyMeshDeviceToDevice(devices[i], STREAM_PRIMARY, src, devices[(i+1) % num_devices], dst, num_vertices);
}
// ...|ooooooo|xxx <- ...|xxxoooo|...
{
const int3 src = (int3) {0, 0, STENCIL_ORDER/2};
const int3 dst = (int3) {0, 0, STENCIL_ORDER/2 + subgrid.n.z};
copyMeshDeviceToDevice(devices[(i+1) % num_devices], STREAM_PRIMARY, src, devices[i], dst, num_vertices);
}
}
}
acSynchronize();
return AC_SUCCESS;
}
static AcResult
acSwapBuffers(void)
{
for (int i = 0; i < num_devices; ++i) {
swapBuffers(devices[i]);
}
return AC_SUCCESS;
}
AcResult
acIntegrate(const AcReal& dt)
{
for (int isubstep = 0; isubstep < 3; ++isubstep) {
acBoundcondStep();
acIntegrateStep(isubstep, dt);
acSwapBuffers();
}
return AC_SUCCESS;
}
AcReal
acReduceScal(const ReductionType& rtype,
const VertexBufferHandle& vtxbuffer_handle)
{
// TODO
return 0;
}
AcReal
acReduceVec(const ReductionType& rtype, const VertexBufferHandle& a,
const VertexBufferHandle& b, const VertexBufferHandle& c)
{
// TODO
return 0;
}
AcResult
acSynchronize(void)
{
for (int i = 0; i < num_devices; ++i) {
synchronize(devices[i], STREAM_ALL);
}
return AC_SUCCESS;
}

309
src/core/device.cu Normal file

@@ -0,0 +1,309 @@
/*
Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
This file is part of Astaroth.
Astaroth is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Astaroth is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* @file
* \brief Brief info.
*
* Detailed info.
*
*/
#include "device.cuh"
#include "errchk.h"
typedef struct {
AcReal* in[NUM_VTXBUF_HANDLES];
AcReal* out[NUM_VTXBUF_HANDLES];
} VertexBufferArray;
__constant__ AcMeshInfo d_mesh_info;
#define DCONST_INT(X) (d_mesh_info.int_params[X])
#define DCONST_REAL(X) (d_mesh_info.real_params[X])
#define DEVICE_VTXBUF_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_mx) + (k)*DCONST_INT(AC_mxy))
#include "kernels/kernels.cuh"
struct device_s {
int id;
AcMeshInfo local_config;
// Concurrency
cudaStream_t streams[NUM_STREAM_TYPES];
// Memory
VertexBufferArray vba;
AcReal* reduce_scratchpad;
AcReal* reduce_result;
};
AcResult
printDeviceInfo(const Device device)
{
const int device_id = device->id;
cudaDeviceProp props;
cudaGetDeviceProperties(&props, device_id);
printf("--------------------------------------------------\n");
printf("Device Number: %d\n", device_id);
const size_t bus_id_max_len = 128;
char bus_id[bus_id_max_len];
cudaDeviceGetPCIBusId(bus_id, bus_id_max_len, device_id);
printf(" PCI bus ID: %s\n", bus_id);
printf(" Device name: %s\n", props.name);
printf(" Compute capability: %d.%d\n", props.major, props.minor);
// Compute
printf(" Compute\n");
printf(" Clock rate (GHz): %g\n", props.clockRate / 1e6); // KHz -> GHz
printf(" Stream processors: %d\n", props.multiProcessorCount);
printf(" SP to DP flops performance ratio: %d:1\n", props.singleToDoublePrecisionPerfRatio);
printf(" Compute mode: %d\n", (int)props.computeMode); // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g7eb25f5413a962faad0956d92bae10d0
// Memory
printf(" Global memory\n");
printf(" Memory Clock Rate (MHz): %d\n", props.memoryClockRate / (1000));
printf(" Memory Bus Width (bits): %d\n", props.memoryBusWidth);
printf(" Peak Memory Bandwidth (GiB/s): %f\n",
2 * (props.memoryClockRate * 1e3) * props.memoryBusWidth /
(8. * 1024. * 1024. * 1024.));
printf(" ECC enabled: %d\n", props.ECCEnabled);
// Memory usage
size_t free_bytes, total_bytes;
cudaMemGetInfo(&free_bytes, &total_bytes);
const size_t used_bytes = total_bytes - free_bytes;
printf(" Total global mem: %.2f GiB\n",
props.totalGlobalMem / (1024.0 * 1024 * 1024));
printf(" Gmem used (GiB): %.2f\n", used_bytes / (1024.0 * 1024 * 1024));
printf(" Gmem memory free (GiB): %.2f\n",
free_bytes / (1024.0 * 1024 * 1024));
printf(" Gmem memory total (GiB): %.2f\n",
total_bytes / (1024.0 * 1024 * 1024));
printf(" Caches\n");
printf(" Local L1 cache supported: %d\n", props.localL1CacheSupported);
printf(" Global L1 cache supported: %d\n", props.globalL1CacheSupported);
printf(" L2 size: %d KiB\n", props.l2CacheSize / (1024));
printf(" Total const mem: %ld KiB\n", props.totalConstMem / (1024));
printf(" Shared mem per block: %ld KiB\n",
props.sharedMemPerBlock / (1024));
printf(" Other\n");
printf(" Warp size: %d\n", props.warpSize);
// printf(" Single to double perf. ratio: %dx\n",
// props.singleToDoublePrecisionPerfRatio); //Not supported with older CUDA
// versions
printf(" Stream priorities supported: %d\n",
props.streamPrioritiesSupported);
printf("--------------------------------------------------\n");
return AC_SUCCESS;
}
static __global__ void dummy_kernel(void) {}
AcResult
createDevice(const int id, const AcMeshInfo device_config, Device* device_handle)
{
cudaSetDevice(id);
cudaDeviceReset();
// Create Device
struct device_s* device = (struct device_s*) malloc(sizeof(*device));
ERRCHK_ALWAYS(device);
device->id = id;
device->local_config = device_config;
// Check that the code was compiled for the proper GPU architecture
printf("Trying to run a dummy kernel. If this fails, make sure that your\n"
"device supports the CUDA architecture you are compiling for.\n"
"Running dummy kernel... ");
fflush(stdout);
dummy_kernel<<<1, 1>>>();
ERRCHK_CUDA_KERNEL_ALWAYS();
printf("Success!\n");
// Concurrency
for (int i = 0; i < NUM_STREAM_TYPES; ++i) {
cudaStreamCreate(&device->streams[i]);
}
// Memory
const size_t vba_size_bytes = AC_VTXBUF_SIZE_BYTES(device_config);
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.in[i], vba_size_bytes));
ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.out[i], vba_size_bytes));
}
ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->reduce_scratchpad,
AC_VTXBUF_COMPDOMAIN_SIZE_BYTES(device_config)));
ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->reduce_result, sizeof(AcReal)));
// Device constants
ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_mesh_info, &device_config, sizeof(device_config), 0,
cudaMemcpyHostToDevice));
printf("Created device %d (%p)\n", device->id, device);
*device_handle = device;
return AC_SUCCESS;
}
AcResult
destroyDevice(Device device)
{
cudaSetDevice(device->id);
printf("Destroying device %d (%p)\n", device->id, device);
// Memory
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
cudaFree(device->vba.in[i]);
cudaFree(device->vba.out[i]);
}
cudaFree(device->reduce_scratchpad);
cudaFree(device->reduce_result);
// Concurrency
for (int i = 0; i < NUM_STREAM_TYPES; ++i)
cudaStreamDestroy(device->streams[i]);
// Destroy Device
free(device);
return AC_SUCCESS;
}
AcResult
boundcondStep(const Device device, const StreamType stream_type, const int3& start, const int3& end)
{
cudaSetDevice(device->id);
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
periodic_boundconds(device->streams[stream_type], start, end, device->vba.in[i]);
}
return AC_SUCCESS;
}
AcResult
reduceScal(const Device device)
{
cudaSetDevice(device->id);
return AC_SUCCESS;
}
AcResult
reduceVec(const Device device)
{
cudaSetDevice(device->id);
return AC_SUCCESS;
}
AcResult
rkStep(const Device device, const StreamType stream_type, const int step_number,
const int3& start, const int3& end, const AcReal dt)
{
cudaSetDevice(device->id);
rk3_step_async(device->streams[stream_type], step_number, start, end, dt, &device->vba);
return AC_SUCCESS;
}
AcResult
synchronize(const Device device, const StreamType stream_type)
{
cudaSetDevice(device->id);
if (stream_type == STREAM_ALL) {
cudaDeviceSynchronize();
} else {
cudaStreamSynchronize(device->streams[stream_type]);
}
return AC_SUCCESS;
}
static AcResult
loadWithOffset(const Device device, const StreamType stream_type,
const AcReal* src, const size_t bytes, AcReal* dst)
{
cudaSetDevice(device->id);
ERRCHK_CUDA(cudaMemcpyAsync(dst, src, bytes, cudaMemcpyHostToDevice,
device->streams[stream_type]));
return AC_SUCCESS;
}
static AcResult
storeWithOffset(const Device device, const StreamType stream_type,
const AcReal* src, const size_t bytes, AcReal* dst)
{
cudaSetDevice(device->id);
ERRCHK_CUDA(cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDeviceToHost,
device->streams[stream_type]));
return AC_SUCCESS;
}
AcResult
copyMeshToDevice(const Device device, const StreamType stream_type,
const AcMesh& host_mesh, const int3& src, const int3& dst,
const int num_vertices)
{
const size_t src_idx = AC_VTXBUF_IDX(src.x, src.y, src.z, host_mesh.info);
const size_t dst_idx = AC_VTXBUF_IDX(dst.x, dst.y, dst.z, device->local_config);
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
loadWithOffset(device, stream_type, &host_mesh.vertex_buffer[i][src_idx], num_vertices * sizeof(AcReal),
&device->vba.in[i][dst_idx]);
}
return AC_SUCCESS;
}
AcResult
copyMeshToHost(const Device device, const StreamType stream_type,
const int3& src, const int3& dst, const int num_vertices,
AcMesh* host_mesh)
{
const size_t src_idx = AC_VTXBUF_IDX(src.x, src.y, src.z, device->local_config);
const size_t dst_idx = AC_VTXBUF_IDX(dst.x, dst.y, dst.z, host_mesh->info);
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
storeWithOffset(device, stream_type, &device->vba.in[i][src_idx],
num_vertices * sizeof(AcReal),
&host_mesh->vertex_buffer[i][dst_idx]);
}
return AC_SUCCESS;
}
AcResult
copyMeshDeviceToDevice(const Device src_device, const StreamType stream_type,
const int3& src, Device dst_device, const int3& dst,
const int num_vertices)
{
cudaSetDevice(src_device->id);
const size_t src_idx = AC_VTXBUF_IDX(src.x, src.y, src.z, src_device->local_config);
const size_t dst_idx = AC_VTXBUF_IDX(dst.x, dst.y, dst.z, dst_device->local_config);
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
ERRCHK_CUDA(cudaMemcpyPeerAsync(&dst_device->vba.in[i][dst_idx], dst_device->id,
&src_device->vba.in[i][src_idx], src_device->id,
sizeof(src_device->vba.in[i][0]) * num_vertices,
src_device->streams[stream_type]));
}
return AC_SUCCESS;
}
AcResult
swapBuffers(const Device device)
{
cudaSetDevice(device->id);
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
AcReal* tmp = device->vba.in[i];
device->vba.in[i] = device->vba.out[i];
device->vba.out[i] = tmp;
}
return AC_SUCCESS;
}

82
src/core/device.cuh Normal file

@@ -0,0 +1,82 @@
/*
Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
This file is part of Astaroth.
Astaroth is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Astaroth is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* @file
* \brief Brief info.
*
* Detailed info.
*
*/
#pragma once
#include "astaroth.h"
typedef enum {
STREAM_PRIMARY,
STREAM_SECONDARY,
NUM_STREAM_TYPES,
STREAM_ALL
} StreamType;
typedef struct device_s* Device; // Opaque pointer to device_s. Analogous to dispatchable handles
// in Vulkan, f.ex. VkDevice
/** */
AcResult printDeviceInfo(const Device device);
/** */
AcResult createDevice(const int id, const AcMeshInfo device_config, Device* device);
/** */
AcResult destroyDevice(Device device);
/** */
AcResult boundcondStep(const Device device, const StreamType stream_type,
const int3& start, const int3& end);
/** */
AcResult reduceScal(const Device device);
/** */
AcResult reduceVec(const Device device);
/** */
AcResult rkStep(const Device device, const StreamType stream_type, const int step_number,
const int3& start, const int3& end, const AcReal dt);
/** Synchronizes the device with respect to stream_type. If STREAM_ALL is given as
a StreamType, the function synchronizes all streams on the device. */
AcResult synchronize(const Device device, const StreamType stream_type);
/** */
AcResult copyMeshToDevice(const Device device, const StreamType stream_type,
const AcMesh& host_mesh, const int3& src, const int3& dst,
const int num_vertices);
/** */
AcResult copyMeshToHost(const Device device, const StreamType stream_type,
const int3& src, const int3& dst, const int num_vertices,
AcMesh* host_mesh);
/** */
AcResult copyMeshDeviceToDevice(const Device src, const StreamType stream_type, const int3& src_idx,
Device dst, const int3& dst_idx, const int num_vertices);
/** Swaps the input/output buffers used in computations */
AcResult swapBuffers(const Device device);
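/*
A minimal usage sketch (illustrative only; the actual orchestration lives in
astaroth.cu). Here "cfg", "start", "end" and "dt" are hypothetical placeholders:

    Device device;
    createDevice(0, cfg, &device);
    boundcondStep(device, STREAM_PRIMARY, (int3){0, 0, 0},
                  (int3){cfg.int_params[AC_mx], cfg.int_params[AC_my], cfg.int_params[AC_mz]});
    rkStep(device, STREAM_PRIMARY, 0, start, end, dt);
    synchronize(device, STREAM_ALL);
    swapBuffers(device);
    destroyDevice(device);
*/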

112
src/core/errchk.h Normal file

@@ -0,0 +1,112 @@
/*
Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
This file is part of Astaroth.
Astaroth is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Astaroth is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* @file
* \brief Brief info.
*
* Detailed info.
*
*/
#pragma once
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
// clang-format off
/*
* =============================================================================
* General error checking
* =============================================================================
*/
#define ERROR(str) \
{ \
time_t t; time(&t); \
fprintf(stderr, "%s", ctime(&t)); \
fprintf(stderr, "\tError in file %s line %d: %s\n", \
__FILE__, __LINE__, str); \
fflush(stderr); \
exit(EXIT_FAILURE); \
abort(); \
}
#define WARNING(str) \
{ \
time_t t; time(&t); \
fprintf(stderr, "%s", ctime(&t)); \
fprintf(stderr, "\tWarning in file %s line %d: %s\n", \
__FILE__, __LINE__, str); \
fflush(stderr); \
}
// DO NOT REMOVE BRACKETS AROUND RETVAL. F.ex. if (!a < b) vs if (!(a < b)).
#define ERRCHK(retval) { if (!(retval)) ERROR(#retval " was false"); }
#define WARNCHK(retval) { if (!(retval)) WARNING(#retval " was false"); }
#define ERRCHK_ALWAYS(retval) { if (!(retval)) ERROR(#retval " was false"); }
/*
* =============================================================================
* CUDA-specific error checking
* =============================================================================
*/
#ifdef __CUDACC__
static inline void
cuda_assert(cudaError_t code, const char* file, int line, bool abort = true)
{
if (code != cudaSuccess) {
time_t t; time(&t);
fprintf(stderr, "%s", ctime(&t));
fprintf(stderr, "\tCUDA error in file %s line %d: %s\n",
file, line, cudaGetErrorString(code));
fflush(stderr);
if (abort)
exit(code);
}
}
#ifdef NDEBUG
#undef ERRCHK
#undef WARNCHK
#define ERRCHK(params)
#define WARNCHK(params)
#define ERRCHK_CUDA(params) params;
#define WARNCHK_CUDA(params) params;
#define ERRCHK_CUDA_KERNEL() {}
#else
#define ERRCHK_CUDA(params) { cuda_assert((params), __FILE__, __LINE__); }
#define WARNCHK_CUDA(params) { cuda_assert((params), __FILE__, __LINE__, false); }
#define ERRCHK_CUDA_KERNEL() \
{ \
ERRCHK_CUDA(cudaPeekAtLastError()); \
ERRCHK_CUDA(cudaDeviceSynchronize()); \
}
#endif
#endif
#define ERRCHK_CUDA_ALWAYS(params) { cuda_assert((params), __FILE__, __LINE__); }
#define ERRCHK_CUDA_KERNEL_ALWAYS() \
{ \
ERRCHK_CUDA_ALWAYS(cudaPeekAtLastError()); \
ERRCHK_CUDA_ALWAYS(cudaDeviceSynchronize()); \
}
// clang-format on
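// A brief usage sketch (illustrative; "ptr" and "bytes" are placeholders):
//   ERRCHK_ALWAYS(config.int_params[AC_nz] % num_devices == 0);
//   ERRCHK_CUDA_ALWAYS(cudaMalloc(&ptr, bytes));
//   my_kernel<<<bpg, tpb>>>(...); ERRCHK_CUDA_KERNEL();
// The *_ALWAYS variants stay active even when NDEBUG is defined, whereas the plain
// ERRCHK/ERRCHK_CUDA variants are compiled out in NDEBUG builds (see above).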

2
src/core/kernels/.gitignore vendored Normal file

@@ -0,0 +1,2 @@
# Ignore the generated headers
stencil_process.cuh stencil_assembly.cuh

File diff suppressed because it is too large


@@ -0,0 +1,794 @@
/*
Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
This file is part of Astaroth.
Astaroth is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Astaroth is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* @file
* \brief Brief info.
*
* Detailed info.
*
*/
#pragma once
__global__ void
kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vertex_buffer)
{
const int i_dst = start.x + threadIdx.x + blockIdx.x * blockDim.x;
const int j_dst = start.y + threadIdx.y + blockIdx.y * blockDim.y;
const int k_dst = start.z + threadIdx.z + blockIdx.z * blockDim.z;
// If within the start-end range (this allows threadblock dims that are not
// divisible by end - start)
if (i_dst >= end.x || j_dst >= end.y || k_dst >= end.z)
return;
//if (i_dst >= DCONST_INT(AC_mx) || j_dst >= DCONST_INT(AC_my) || k_dst >= DCONST_INT(AC_mz))
// return;
// If destination index is inside the computational domain, return since
// the boundary conditions are only applied to the ghost zones
if (i_dst >= DCONST_INT(AC_nx_min) && i_dst < DCONST_INT(AC_nx_max) &&
j_dst >= DCONST_INT(AC_ny_min) && j_dst < DCONST_INT(AC_ny_max) &&
k_dst >= DCONST_INT(AC_nz_min) && k_dst < DCONST_INT(AC_nz_max))
return;
// Find the source index
// Map to nx, ny, nz coordinates
int i_src = i_dst - DCONST_INT(AC_nx_min);
int j_src = j_dst - DCONST_INT(AC_ny_min);
int k_src = k_dst - DCONST_INT(AC_nz_min);
// Translate (s.t. the index is always positive)
i_src += DCONST_INT(AC_nx);
j_src += DCONST_INT(AC_ny);
k_src += DCONST_INT(AC_nz);
// Wrap
i_src %= DCONST_INT(AC_nx);
j_src %= DCONST_INT(AC_ny);
k_src %= DCONST_INT(AC_nz);
// Map to mx, my, mz coordinates
i_src += DCONST_INT(AC_nx_min);
j_src += DCONST_INT(AC_ny_min);
k_src += DCONST_INT(AC_nz_min);
const int src_idx = DEVICE_VTXBUF_IDX(i_src, j_src, k_src);
const int dst_idx = DEVICE_VTXBUF_IDX(i_dst, j_dst, k_dst);
vertex_buffer[dst_idx] = vertex_buffer[src_idx];
}
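/*
A worked example with hypothetical numbers: with AC_nx = 128 and STENCIL_ORDER = 6
(so AC_nx_min = 3, AC_nx_max = 131, AC_mx = 134), a ghost cell at i_dst = 1 maps to
i_src = ((1 - 3) + 128) % 128 + 3 = 129, i.e. the periodic image near the opposite
edge of the computational domain.
*/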
void
periodic_boundconds(const cudaStream_t stream, const int3& start, const int3& end, AcReal* vertex_buffer)
{
const dim3 tpb(8,2,8);
const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x),
(unsigned int)ceil((end.y - start.y) / (float)tpb.y),
(unsigned int)ceil((end.z - start.z) / (float)tpb.z));
kernel_periodic_boundconds<<<bpg, tpb, 0, stream>>>(start, end, vertex_buffer);
ERRCHK_CUDA_KERNEL();
}
///////////////////////////////////////////////////////////////////////////////////////////////////
#include <assert.h>
static __device__ __forceinline__ int
IDX(const int i)
{
return i;
}
static __device__ __forceinline__ int
IDX(const int i, const int j, const int k)
{
return DEVICE_VTXBUF_IDX(i, j, k);
}
static __device__ __forceinline__ int
IDX(const int3 idx)
{
return DEVICE_VTXBUF_IDX(idx.x, idx.y, idx.z);
}
static __forceinline__ AcMatrix
create_rotz(const AcReal radians)
{
AcMatrix mat;
mat.row[0] = (AcReal3){cos(radians), -sin(radians), 0};
mat.row[1] = (AcReal3){sin(radians), cos(radians), 0};
mat.row[2] = (AcReal3){0, 0, 0};
return mat;
}
#if AC_DOUBLE_PRECISION == 0
#define sin __sinf
#define cos __cosf
#define exp __expf
#define rsqrt rsqrtf // hardware reciprocal sqrt
#endif // AC_DOUBLE_PRECISION == 0
/*
typedef struct {
int i, j, k;
} int3;*/
/*
* =============================================================================
* Level 0 (Input Assembly Stage)
* =============================================================================
*/
/*
* =============================================================================
* Level 0.1 (Read stencil elements and solve derivatives)
* =============================================================================
*/
static __device__ __forceinline__ AcReal
first_derivative(const AcReal* __restrict__ pencil, const AcReal inv_ds)
{
#if STENCIL_ORDER == 2
const AcReal coefficients[] = {0, 1.0 / 2.0};
#elif STENCIL_ORDER == 4
const AcReal coefficients[] = {0, 2.0 / 3.0, -1.0 / 12.0};
#elif STENCIL_ORDER == 6
const AcReal coefficients[] = {0, 3.0 / 4.0, -3.0 / 20.0, 1.0 / 60.0};
#elif STENCIL_ORDER == 8
const AcReal coefficients[] = {0, 4.0 / 5.0, -1.0 / 5.0, 4.0 / 105.0,
-1.0 / 280.0};
#endif
#define MID (STENCIL_ORDER / 2)
AcReal res = 0;
#pragma unroll
for (int i = 1; i <= MID; ++i)
res += coefficients[i] * (pencil[MID + i] - pencil[MID - i]);
return res * inv_ds;
}
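/*
For reference, with STENCIL_ORDER == 6 the loop above evaluates the standard
6th-order central difference
    f' ~= (3/4 * (f[+1] - f[-1]) - 3/20 * (f[+2] - f[-2]) + 1/60 * (f[+3] - f[-3])) * inv_ds,
where f[i] denotes pencil[MID + i].
*/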
static __device__ __forceinline__ AcReal
second_derivative(const AcReal* __restrict__ pencil, const AcReal inv_ds)
{
#if STENCIL_ORDER == 2
const AcReal coefficients[] = {-2., 1.};
#elif STENCIL_ORDER == 4
const AcReal coefficients[] = {-5.0/2.0, 4.0/3.0, -1.0/12.0};
#elif STENCIL_ORDER == 6
const AcReal coefficients[] = {-49.0 / 18.0, 3.0 / 2.0, -3.0 / 20.0,
1.0 / 90.0};
#elif STENCIL_ORDER == 8
const AcReal coefficients[] = {-205.0 / 72.0, 8.0 / 5.0, -1.0 / 5.0,
8.0 / 315.0, -1.0 / 560.0};
#endif
#define MID (STENCIL_ORDER / 2)
AcReal res = coefficients[0] * pencil[MID];
#pragma unroll
for (int i = 1; i <= MID; ++i)
res += coefficients[i] * (pencil[MID + i] + pencil[MID - i]);
return res * inv_ds * inv_ds;
}
/** inv_ds: inverted mesh spacing f.ex. 1. / mesh.int_params[AC_dsx] */
static __device__ __forceinline__ AcReal
cross_derivative(const AcReal* __restrict__ pencil_a,
const AcReal* __restrict__ pencil_b, const AcReal inv_ds_a,
const AcReal inv_ds_b)
{
#if STENCIL_ORDER == 2
const AcReal coefficients[] = {0, 1.0 / 4.0};
#elif STENCIL_ORDER == 4
const AcReal coefficients[] = {0, 1.0 / 32.0, 1.0 / 64.0}; // TODO correct coefficients, these are just placeholders
#elif STENCIL_ORDER == 6
const AcReal fac = (1. / 720.);
const AcReal coefficients[] = {0.0 * fac, 270.0 * fac, -27.0 * fac,
2.0 * fac};
#elif STENCIL_ORDER == 8
const AcReal fac = (1. / 20160.);
const AcReal coefficients[] = {0.0 * fac, 8064. * fac, -1008. * fac,
128. * fac, -9. * fac};
#endif
#define MID (STENCIL_ORDER / 2)
AcReal res = AcReal(0.);
#pragma unroll
for (int i = 1; i <= MID; ++i) {
res += coefficients[i] * (pencil_a[MID + i] + pencil_a[MID - i] -
pencil_b[MID + i] - pencil_b[MID - i]);
}
return res * inv_ds_a * inv_ds_b;
}
static __device__ __forceinline__ AcReal
derx(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
AcReal pencil[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y, vertexIdx.z)];
return first_derivative(pencil, DCONST_REAL(AC_inv_dsx));
}
static __device__ __forceinline__ AcReal
derxx(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
AcReal pencil[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y, vertexIdx.z)];
return second_derivative(pencil, DCONST_REAL(AC_inv_dsx));
}
static __device__ __forceinline__ AcReal
derxy(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
AcReal pencil_a[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil_a[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2,
vertexIdx.y + offset - STENCIL_ORDER / 2, vertexIdx.z)];
AcReal pencil_b[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil_b[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2,
vertexIdx.y + STENCIL_ORDER / 2 - offset, vertexIdx.z)];
return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsx),
DCONST_REAL(AC_inv_dsy));
}
static __device__ __forceinline__ AcReal
derxz(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
AcReal pencil_a[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil_a[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y,
vertexIdx.z + offset - STENCIL_ORDER / 2)];
AcReal pencil_b[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil_b[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y,
vertexIdx.z + STENCIL_ORDER / 2 - offset)];
return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsx),
DCONST_REAL(AC_inv_dsz));
}
static __device__ __forceinline__ AcReal
dery(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
AcReal pencil[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2, vertexIdx.z)];
return first_derivative(pencil, DCONST_REAL(AC_inv_dsy));
}
static __device__ __forceinline__ AcReal
deryy(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
AcReal pencil[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2, vertexIdx.z)];
return second_derivative(pencil, DCONST_REAL(AC_inv_dsy));
}
static __device__ __forceinline__ AcReal
deryz(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
AcReal pencil_a[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil_a[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2,
vertexIdx.z + offset - STENCIL_ORDER / 2)];
AcReal pencil_b[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil_b[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2,
vertexIdx.z + STENCIL_ORDER / 2 - offset)];
return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsy),
DCONST_REAL(AC_inv_dsz));
}
static __device__ __forceinline__ AcReal
derz(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
AcReal pencil[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z + offset - STENCIL_ORDER / 2)];
return first_derivative(pencil, DCONST_REAL(AC_inv_dsz));
}
static __device__ __forceinline__ AcReal
derzz(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
AcReal pencil[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z + offset - STENCIL_ORDER / 2)];
return second_derivative(pencil, DCONST_REAL(AC_inv_dsz));
}
/*
* =============================================================================
* Level 0.2 (Caching functions)
* =============================================================================
*/
#include "stencil_assembly.cuh"
/*
typedef struct {
AcRealData x;
AcRealData y;
AcRealData z;
} AcReal3Data;
static __device__ __forceinline__ AcReal3Data
read_data(const int i, const int j, const int k,
AcReal* __restrict__ buf[], const int3& handle)
{
AcReal3Data data;
data.x = read_data(i, j, k, buf, handle.x);
data.y = read_data(i, j, k, buf, handle.y);
data.z = read_data(i, j, k, buf, handle.z);
return data;
}
*/
/*
* =============================================================================
* Level 0.3 (Built-in functions available during the Stencil Processing Stage)
* =============================================================================
*/
static __host__ __device__ __forceinline__ AcReal3
operator-(const AcReal3& a, const AcReal3& b)
{
return (AcReal3){a.x - b.x, a.y - b.y, a.z - b.z};
}
static __host__ __device__ __forceinline__ AcReal3
operator+(const AcReal3& a, const AcReal3& b)
{
return (AcReal3){a.x + b.x, a.y + b.y, a.z + b.z};
}
static __host__ __device__ __forceinline__ AcReal3
operator-(const AcReal3& a)
{
return (AcReal3){-a.x, -a.y, -a.z};
}
static __host__ __device__ __forceinline__ AcReal3
operator*(const AcReal a, const AcReal3& b)
{
return (AcReal3){a * b.x, a * b.y, a * b.z};
}
static __host__ __device__ __forceinline__ AcReal
dot(const AcReal3& a, const AcReal3& b)
{
return a.x * b.x + a.y * b.y + a.z * b.z;
}
static __host__ __device__ __forceinline__ AcReal3
mul(const AcMatrix& aa, const AcReal3& x)
{
return (AcReal3){dot(aa.row[0], x), dot(aa.row[1], x), dot(aa.row[2], x)};
}
static __host__ __device__ __forceinline__ AcReal3
cross(const AcReal3& a, const AcReal3& b)
{
AcReal3 c;
c.x = a.y * b.z - a.z * b.y;
c.y = a.z * b.x - a.x * b.z;
c.z = a.x * b.y - a.y * b.x;
return c;
}
static __host__ __device__ __forceinline__ bool
is_valid(const AcReal a)
{
return !isnan(a) && !isinf(a);
}
static __host__ __device__ __forceinline__ bool
is_valid(const AcReal3& a)
{
return is_valid(a.x) && is_valid(a.y) && is_valid(a.z);
}
/*
* =============================================================================
* Level 1 (Stencil Processing Stage)
* =============================================================================
*/
/*
* =============================================================================
* Level 1.1 (Terms)
* =============================================================================
*/
static __device__ __forceinline__ AcReal
laplace(const AcRealData& data)
{
return hessian(data).row[0].x + hessian(data).row[1].y + hessian(data).row[2].z;
}
static __device__ __forceinline__ AcReal
divergence(const AcReal3Data& vec)
{
return gradient(vec.x).x + gradient(vec.y).y + gradient(vec.z).z;
}
static __device__ __forceinline__ AcReal3
laplace_vec(const AcReal3Data& vec)
{
return (AcReal3){laplace(vec.x), laplace(vec.y), laplace(vec.z)};
}
static __device__ __forceinline__ AcReal3
curl(const AcReal3Data& vec)
{
return (AcReal3){gradient(vec.z).y - gradient(vec.y).z,
gradient(vec.x).z - gradient(vec.z).x,
gradient(vec.y).x - gradient(vec.x).y};
}
static __device__ __forceinline__ AcReal3
gradient_of_divergence(const AcReal3Data& vec)
{
return (AcReal3){hessian(vec.x).row[0].x + hessian(vec.y).row[0].y + hessian(vec.z).row[0].z,
hessian(vec.x).row[1].x + hessian(vec.y).row[1].y + hessian(vec.z).row[1].z,
hessian(vec.x).row[2].x + hessian(vec.y).row[2].y + hessian(vec.z).row[2].z};
}
// Takes uu gradients and returns S
static __device__ __forceinline__ AcMatrix
stress_tensor(const AcReal3Data& vec)
{
AcMatrix S;
S.row[0].x = AcReal(2. / 3.) * gradient(vec.x).x -
AcReal(1. / 3.) * (gradient(vec.y).y + gradient(vec.z).z);
S.row[0].y = AcReal(1. / 2.) * (gradient(vec.x).y + gradient(vec.y).x);
S.row[0].z = AcReal(1. / 2.) * (gradient(vec.x).z + gradient(vec.z).x);
S.row[1].y = AcReal(2. / 3.) * gradient(vec.y).y -
AcReal(1. / 3.) * (gradient(vec.x).x + gradient(vec.z).z);
S.row[1].z = AcReal(1. / 2.) * (gradient(vec.y).z + gradient(vec.z).y);
S.row[2].z = AcReal(2. / 3.) * gradient(vec.z).z -
AcReal(1. / 3.) * (gradient(vec.x).x + gradient(vec.y).y);
S.row[1].x = S.row[0].y;
S.row[2].x = S.row[0].z;
S.row[2].y = S.row[1].z;
return S;
}
static __device__ __forceinline__ AcReal
contract(const AcMatrix& mat)
{
AcReal res = 0;
#pragma unroll
for (int i = 0; i < 3; ++i)
res += dot(mat.row[i], mat.row[i]);
return res;
}
/*
* =============================================================================
* Level 1.2 (Equations)
* =============================================================================
*/
static __device__ __forceinline__ AcReal
length(const AcReal3& vec)
{
return sqrt(vec.x * vec.x + vec.y * vec.y + vec.z * vec.z);
}
static __device__ __forceinline__ AcReal
reciprocal_len(const AcReal3& vec)
{
return rsqrt(vec.x * vec.x + vec.y * vec.y + vec.z * vec.z);
}
static __device__ __forceinline__ AcReal3
normalized(const AcReal3& vec)
{
const AcReal inv_len = reciprocal_len(vec);
return inv_len * vec;
}
// Sinusoidal forcing
// https://arxiv.org/pdf/1704.04676.pdf
__constant__ AcReal3 forcing_vec;
__constant__ AcReal forcing_phi;
static __device__ __forceinline__ AcReal3
forcing(const int i, const int j, const int k)
{
#define DOMAIN_SIZE_X (DCONST_INT(AC_nx) * DCONST_REAL(AC_dsx))
#define DOMAIN_SIZE_Y (DCONST_INT(AC_ny) * DCONST_REAL(AC_dsy))
#define DOMAIN_SIZE_Z (DCONST_INT(AC_nz) * DCONST_REAL(AC_dsz))
const AcReal3 k_vec = (AcReal3){(i - DCONST_INT(AC_nx_min)) * DCONST_REAL(AC_dsx) - AcReal(.5) * DOMAIN_SIZE_X,
(j - DCONST_INT(AC_ny_min)) * DCONST_REAL(AC_dsy) - AcReal(.5) * DOMAIN_SIZE_Y,
(k - DCONST_INT(AC_nz_min)) * DCONST_REAL(AC_dsz) - AcReal(.5) * DOMAIN_SIZE_Z};
AcReal inv_len = reciprocal_len(k_vec);
if (isnan(inv_len) || isinf(inv_len))
inv_len = 0;
if (inv_len > 2) // hack to make it cool
inv_len = 2;
const AcReal k_dot_x = dot(k_vec, forcing_vec);
const AcReal waves = cos(k_dot_x)*cos(forcing_phi) - sin(k_dot_x) * sin(forcing_phi);
return inv_len * inv_len * waves * forcing_vec;
}
// Note: LNT0 and LNRHO0 must be set very carefully: if their magnitude differs from other values in the mesh, then we will inherently lose precision
#define LNT0 (AcReal(0.0))
#define LNRHO0 (AcReal(0.0))
#define H_CONST (AcReal(0.0))
#define C_CONST (AcReal(0.0))
template <int step_number>
static __device__ __forceinline__ AcReal
rk3_integrate(const AcReal state_previous, const AcReal state_current,
const AcReal rate_of_change, const AcReal dt)
{
// Williamson (1980)
const AcReal alpha[] = {0, AcReal(.0), AcReal(-5. / 9.), AcReal(-153. / 128.)};
const AcReal beta[] = {0, AcReal(1. / 3.), AcReal(15. / 16.),
AcReal(8. / 15.)};
// Note the indexing: +1 to avoid an unnecessary "out-of-bounds" compiler warning
// (the step_number == 0 instantiation would otherwise reference beta[-1] in its
// unreachable branches)
switch (step_number) {
case 0:
return state_current + beta[step_number + 1] * rate_of_change * dt;
case 1: // Fallthrough
case 2:
return state_current +
beta[step_number + 1] *
(alpha[step_number + 1] * (AcReal(1.) / beta[step_number]) *
(state_current - state_previous) +
rate_of_change * dt);
default:
return NAN;
}
}
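/*
Sketch of the reasoning (Williamson's 2N-storage scheme): the textbook form is
    w_i = alpha_i * w_{i-1} + dt * f(u_{i-1}),    u_i = u_{i-1} + beta_i * w_i.
Since u_{i-1} - u_{i-2} = beta_{i-1} * w_{i-1}, the stored increment can be
recovered as w_{i-1} = (state_current - state_previous) / beta[step_number],
which is exactly the (1 / beta[step_number]) factor above; no separate w buffer
is needed.
*/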
/*
template <int step_number>
static __device__ __forceinline__ AcReal
rk3_integrate_scal(const AcReal state_previous, const AcReal state_current,
const AcReal rate_of_change, const AcReal dt)
{
// Williamson (1980)
const AcReal alpha[] = {AcReal(.0), AcReal(-5. / 9.), AcReal(-153. / 128.)};
const AcReal beta[] = {AcReal(1. / 3.), AcReal(15. / 16.),
AcReal(8. / 15.)};
switch (step_number) {
case 0:
return state_current + beta[step_number] * rate_of_change * dt;
case 1: // Fallthrough
case 2:
return state_current +
beta[step_number] *
(alpha[step_number] * (AcReal(1.) / beta[step_number - 1]) *
(state_current - state_previous) +
rate_of_change * dt);
default:
return NAN;
}
}
*/
template <int step_number>
static __device__ __forceinline__ AcReal3
rk3_integrate(const AcReal3 state_previous, const AcReal3 state_current,
const AcReal3 rate_of_change, const AcReal dt)
{
return (AcReal3) { rk3_integrate<step_number>(state_previous.x, state_current.x, rate_of_change.x, dt),
rk3_integrate<step_number>(state_previous.y, state_current.y, rate_of_change.y, dt),
rk3_integrate<step_number>(state_previous.z, state_current.z, rate_of_change.z, dt)};
}
#define rk3(state_previous, state_current, rate_of_change, dt)\
rk3_integrate<step_number>(state_previous, value(state_current), rate_of_change, dt)
/*
template <int step_number>
static __device__ __forceinline__ AcReal
rk3_integrate(const int idx, const AcReal out, const int handle,
const AcRealData& in_cached, const AcReal rate_of_change, const AcReal dt)
{
return rk3_integrate_scal<step_number>(out, value(in_cached), rate_of_change, dt);
}
template <int step_number>
static __device__ __forceinline__ AcReal3
rk3_integrate(const int idx, const AcReal3 out, const int3& handle,
const AcReal3Data& in_cached, const AcReal3& rate_of_change, const AcReal dt)
{
return (AcReal3) {
rk3_integrate<step_number>(idx, out, handle.x, in_cached.x, rate_of_change.x, dt),
rk3_integrate<step_number>(idx, out, handle.y, in_cached.y, rate_of_change.y, dt),
rk3_integrate<step_number>(idx, out, handle.z, in_cached.z, rate_of_change.z, dt)
};
}
#define RK3(handle, in_cached, rate_of_change, dt) \
rk3_integrate<step_number>(idx, buffer.out, handle, in_cached, rate_of_change, dt)
*/
/*
* =============================================================================
* Level 1.3 (Kernels)
* =============================================================================
*/
static __device__ void
write(AcReal* __restrict__ out[], const int handle, const int idx, const AcReal value)
{
out[handle][idx] = value;
}
static __device__ void
write(AcReal* __restrict__ out[], const int3 vec, const int idx, const AcReal3 value)
{
write(out, vec.x, idx, value.x);
write(out, vec.y, idx, value.y);
write(out, vec.z, idx, value.z);
}
static __device__ AcReal
read_out(const int idx, AcReal* __restrict__ field[], const int handle)
{
return field[handle][idx];
}
static __device__ AcReal3
read_out(const int idx, AcReal* __restrict__ field[], const int3 handle)
{
return (AcReal3) { read_out(idx, field, handle.x),
read_out(idx, field, handle.y),
read_out(idx, field, handle.z) };
}
#define WRITE_OUT(handle, value) (write(buffer.out, handle, idx, value))
#define READ(handle) (read_data(vertexIdx, buffer.in, handle))
#define READ_OUT(handle) (read_out(idx, buffer.out, handle))
// also write for clarity here, not for the DSL
//#define WRITE(HANDLE) (write(idx, ...)) s.t. we don't have to insert boilerplate in the middle of the function
#define GEN_KERNEL_PARAM_BOILERPLATE \
const int3 start, const int3 end, VertexBufferArray buffer
#define GEN_KERNEL_BUILTIN_VARIABLES_BOILERPLATE() \
const int3 vertexIdx = (int3){threadIdx.x + blockIdx.x * blockDim.x + start.x,\
threadIdx.y + blockIdx.y * blockDim.y + start.y,\
threadIdx.z + blockIdx.z * blockDim.z + start.z};\
if (vertexIdx.x >= end.x || vertexIdx.y >= end.y || vertexIdx.z >= end.z)\
return;\
\
\
assert(vertexIdx.x < DCONST_INT(AC_nx_max) && vertexIdx.y < DCONST_INT(AC_ny_max) &&\
vertexIdx.z < DCONST_INT(AC_nz_max));\
\
assert(vertexIdx.x >= DCONST_INT(AC_nx_min) && vertexIdx.y >= DCONST_INT(AC_ny_min) &&\
vertexIdx.z >= DCONST_INT(AC_nz_min));\
\
const int idx = IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z);
#include "stencil_process.cuh"
/*
* =============================================================================
* Level 2 (Host calls)
* =============================================================================
*/
static AcReal
randf(void)
{
return AcReal(rand()) / AcReal(RAND_MAX);
}
AcResult
rk3_step_async(const cudaStream_t stream, const int& step_number, const int3& start, const int3& end,
const AcReal dt, VertexBufferArray* buffer)
{
const dim3 tpb(32, 1, 4);
/////////////////// Forcing
#if LFORCING
const AcReal ff_scale = AcReal(.2);
static AcReal3 ff = ff_scale * (AcReal3){1, 0, 0};
const AcReal radians = randf() * AcReal(2*M_PI) / 360 / 8;
const AcMatrix rotz = create_rotz(radians);
ff = mul(rotz, ff);
cudaMemcpyToSymbolAsync(forcing_vec, &ff, sizeof(ff), 0, cudaMemcpyHostToDevice, stream);
const AcReal ff_phi = AcReal(M_PI);//AcReal(2 * M_PI) * randf();
cudaMemcpyToSymbolAsync(forcing_phi, &ff_phi, sizeof(ff_phi), 0, cudaMemcpyHostToDevice, stream);
#endif // LFORCING
//////////////////////////
const int nx = end.x - start.x;
const int ny = end.y - start.y;
const int nz = end.z - start.z;
const dim3 bpg(
(unsigned int)ceil(nx / AcReal(tpb.x)),
(unsigned int)ceil(ny / AcReal(tpb.y)),
(unsigned int)ceil(nz / AcReal(tpb.z)));
if (step_number == 0)
solve<0><<<bpg, tpb, 0, stream>>>(start, end, *buffer, dt);
else if (step_number == 1)
solve<1><<<bpg, tpb, 0, stream>>>(start, end, *buffer, dt);
else
solve<2><<<bpg, tpb, 0, stream>>>(start, end, *buffer, dt);
ERRCHK_CUDA_KERNEL();
return AC_SUCCESS;
}

338
src/core/kernels/reduce.cuh Normal file

@@ -0,0 +1,338 @@
/*
Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
This file is part of Astaroth.
Astaroth is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Astaroth is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* @file
* \brief Brief info.
*
* Detailed info.
*
*/
#pragma once
#include "device_globals.cuh"
#include "src/core/errchk.h"
#include "src/core/math_utils.h"
// Function pointer definitions
typedef AcReal (*ReduceFunc)(const AcReal&, const AcReal&);
typedef AcReal (*ReduceInitialScalFunc)(const AcReal&);
typedef AcReal (*ReduceInitialVecFunc)(const AcReal&, const AcReal&,
const AcReal&);
// clang-format off
/* Comparison funcs */
__device__ inline AcReal
_device_max(const AcReal& a, const AcReal& b) { return a > b ? a : b; }
__device__ inline AcReal
_device_min(const AcReal& a, const AcReal& b) { return a < b ? a : b; }
__device__ inline AcReal
_device_sum(const AcReal& a, const AcReal& b) { return a + b; }
/* Function used to determine the values used during reduction */
__device__ inline AcReal
_device_length_scal(const AcReal& a) { return AcReal(a); }
__device__ inline AcReal
_device_squared_scal(const AcReal& a) { return (AcReal)(a*a); }
__device__ inline AcReal
_device_exp_squared_scal(const AcReal& a) { return exp(a)*exp(a); }
__device__ inline AcReal
_device_length_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return sqrt(a*a + b*b + c*c); }
__device__ inline AcReal
_device_squared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return _device_squared_scal(a) + _device_squared_scal(b) + _device_squared_scal(c); }
__device__ inline AcReal
_device_exp_squared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return _device_exp_squared_scal(a) + _device_exp_squared_scal(b) + _device_exp_squared_scal(c); }
// clang-format on
__device__ inline bool
oob(const int& i, const int& j, const int& k)
{
if (i >= d_mesh_info.int_params[AC_nx] ||
j >= d_mesh_info.int_params[AC_ny] ||
k >= d_mesh_info.int_params[AC_nz])
return true;
else
return false;
}
template <ReduceInitialScalFunc reduce_initial>
__global__ void
_kernel_reduce_scal(const __restrict__ AcReal* src, AcReal* dst)
{
const int i = threadIdx.x + blockIdx.x * blockDim.x;
const int j = threadIdx.y + blockIdx.y * blockDim.y;
const int k = threadIdx.z + blockIdx.z * blockDim.z;
if (oob(i, j, k))
return;
const int src_idx = DEVICE_VTXBUF_IDX(
i + d_mesh_info.int_params[AC_nx_min],
j + d_mesh_info.int_params[AC_ny_min],
k + d_mesh_info.int_params[AC_nz_min]);
const int dst_idx = DEVICE_1D_COMPDOMAIN_IDX(i, j, k);
dst[dst_idx] = reduce_initial(src[src_idx]);
}
template <ReduceInitialVecFunc reduce_initial>
__global__ void
_kernel_reduce_vec(const __restrict__ AcReal* src_a,
const __restrict__ AcReal* src_b,
const __restrict__ AcReal* src_c, AcReal* dst)
{
const int i = threadIdx.x + blockIdx.x * blockDim.x;
const int j = threadIdx.y + blockIdx.y * blockDim.y;
const int k = threadIdx.z + blockIdx.z * blockDim.z;
if (oob(i, j, k))
return;
const int src_idx = DEVICE_VTXBUF_IDX(
i + d_mesh_info.int_params[AC_nx_min],
j + d_mesh_info.int_params[AC_ny_min],
k + d_mesh_info.int_params[AC_nz_min]);
const int dst_idx = DEVICE_1D_COMPDOMAIN_IDX(i, j, k);
dst[dst_idx] = reduce_initial(src_a[src_idx], src_b[src_idx],
src_c[src_idx]);
}
///////////////////////////////////////////////////////////////////////////////
#define BLOCK_SIZE (1024)
#define ELEMS_PER_THREAD (32)
template <ReduceFunc reduce>
__global__ void
_kernel_reduce(AcReal* src, AcReal* result)
{
const int idx = threadIdx.x + blockIdx.x * BLOCK_SIZE * ELEMS_PER_THREAD;
const int scratchpad_size = DCONST_INT(AC_nxyz);
if (idx >= scratchpad_size)
return;
__shared__ AcReal smem[BLOCK_SIZE];
AcReal tmp = src[idx];
for (int i = 1; i < ELEMS_PER_THREAD; ++i) {
const int src_idx = idx + i * BLOCK_SIZE;
if (src_idx >= scratchpad_size) {
// This check is for safety: stop before reading uninitialized values
// beyond the mesh boundaries, which would immediately propagate NANs
// into the result
break;
}
tmp = reduce(tmp, src[src_idx]);
}
smem[threadIdx.x] = tmp;
__syncthreads();
int offset = BLOCK_SIZE / 2;
while (offset > 0) {
if (threadIdx.x < offset) {
tmp = reduce(tmp, smem[threadIdx.x + offset]);
smem[threadIdx.x] = tmp;
}
offset /= 2;
__syncthreads();
}
if (threadIdx.x == 0)
src[idx] = tmp;
}
template <ReduceFunc reduce>
__global__ void
_kernel_reduce_block(const __restrict__ AcReal* src, AcReal* result)
{
const int scratchpad_size = DCONST_INT(AC_nxyz);
const int idx = threadIdx.x + blockIdx.x * BLOCK_SIZE * ELEMS_PER_THREAD;
AcReal tmp = src[idx];
const int block_offset = BLOCK_SIZE * ELEMS_PER_THREAD;
for (int i = 1; idx + i * block_offset < scratchpad_size; ++i)
tmp = reduce(tmp, src[idx + i * block_offset]);
*result = tmp;
}
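/*
 * Minimal illustrative sketch of a host-side reference reduction that mirrors
 * the two-stage device pipeline above; it can be used to validate
 * _kernel_reduce / _kernel_reduce_block output against a copied-back
 * scratchpad. The function name is an assumption; max, min or sum from
 * src/core/math_utils.h can be passed as the reduce operator.
 */
template <class ReduceOp>
static inline AcReal
host_reference_reduce(const AcReal* data, const int count, ReduceOp reduce)
{
    // Sequential fold over the same values the kernels reduce in parallel
    AcReal result = data[0];
    for (int i = 1; i < count; ++i)
        result = reduce(result, data[i]);
    return result;
}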
//////////////////////////////////////////////////////////////////////////////
AcReal
_reduce_scal(const cudaStream_t stream,
const ReductionType& rtype, const int& nx, const int& ny,
const int& nz, const AcReal* vertex_buffer,
AcReal* reduce_scratchpad, AcReal* reduce_result)
{
bool solve_mean = false;
const dim3 tpb(32, 4, 1);
const dim3 bpg(int(ceil(AcReal(nx) / tpb.x)), int(ceil(AcReal(ny) / tpb.y)),
int(ceil(AcReal(nz) / tpb.z)));
const int scratchpad_size = nx * ny * nz;
const int bpg2 = (unsigned int)ceil(AcReal(scratchpad_size) /
AcReal(ELEMS_PER_THREAD * BLOCK_SIZE));
switch (rtype) {
case RTYPE_MAX:
_kernel_reduce_scal<_device_length_scal>
<<<bpg, tpb, 0, stream>>>(vertex_buffer, reduce_scratchpad);
_kernel_reduce<_device_max>
<<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
_kernel_reduce_block<_device_max>
<<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
break;
case RTYPE_MIN:
_kernel_reduce_scal<_device_length_scal>
<<<bpg, tpb, 0, stream>>>(vertex_buffer, reduce_scratchpad);
_kernel_reduce<_device_min>
<<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
_kernel_reduce_block<_device_min>
<<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
break;
case RTYPE_RMS:
_kernel_reduce_scal<_device_squared_scal>
<<<bpg, tpb, 0, stream>>>(vertex_buffer, reduce_scratchpad);
_kernel_reduce<_device_sum>
<<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
_kernel_reduce_block<_device_sum>
<<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
solve_mean = true;
break;
case RTYPE_RMS_EXP:
_kernel_reduce_scal<_device_exp_squared_scal>
<<<bpg, tpb, 0, stream>>>(vertex_buffer, reduce_scratchpad);
_kernel_reduce<_device_sum>
<<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
_kernel_reduce_block<_device_sum>
<<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
solve_mean = true;
break;
default:
ERROR("Unrecognized RTYPE");
}
AcReal result;
cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
if (solve_mean) {
const AcReal inv_n = AcReal(1.0) / (nx * ny * nz);
return inv_n * result;
}
else {
return result;
}
}
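/*
 * Minimal illustrative example of a call site for _reduce_scal. The buffer
 * names, mesh dimensions and the chosen reduction type are assumptions; in
 * practice they come from the device and mesh configuration.
 */
static AcReal
example_scal_reduction(const cudaStream_t stream, const AcReal* d_vertex_buffer,
                       AcReal* d_scratchpad, AcReal* d_result)
{
    const int nx = 128, ny = 128, nz = 128; // assumed computational-domain size
    // Maximum over the computational domain of the given vertex buffer
    return _reduce_scal(stream, RTYPE_MAX, nx, ny, nz, d_vertex_buffer,
                        d_scratchpad, d_result);
}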
AcReal
_reduce_vec(const cudaStream_t stream,
const ReductionType& rtype, const int& nx, const int& ny,
const int& nz, const AcReal* vertex_buffer_a,
const AcReal* vertex_buffer_b, const AcReal* vertex_buffer_c,
AcReal* reduce_scratchpad, AcReal* reduce_result)
{
bool solve_mean = false;
const dim3 tpb(32, 4, 1);
const dim3 bpg(int(ceil(float(nx) / tpb.x)),
int(ceil(float(ny) / tpb.y)),
int(ceil(float(nz) / tpb.z)));
const int scratchpad_size = nx * ny * nz;
const int bpg2 = (unsigned int)ceil(float(scratchpad_size) /
float(ELEMS_PER_THREAD * BLOCK_SIZE));
// "Features" of this quick & efficient reduction:
// Block size must be smaller than the computational domain size
// (otherwise we would have do some additional bounds checking in the
// second half of _kernel_reduce, which gets quite confusing)
// Also the BLOCK_SIZE must be a multiple of two s.t. we can easily split
// the work without worrying too much about the array bounds.
ERRCHK(BLOCK_SIZE <= scratchpad_size);
ERRCHK(!(BLOCK_SIZE % 2));
// NOTE! Also does not work properly with non-power of two mesh dimension
// Issue is with "smem[BLOCK_SIZE];". If you init smem to NANs, you can
// see that uninitialized smem values are used in the comparison
ERRCHK(is_power_of_two(nx));
ERRCHK(is_power_of_two(ny));
ERRCHK(is_power_of_two(nz));
switch (rtype) {
case RTYPE_MAX:
_kernel_reduce_vec<_device_length_vec>
<<<bpg, tpb, 0, stream>>>(vertex_buffer_a, vertex_buffer_b, vertex_buffer_c,
reduce_scratchpad);
_kernel_reduce<_device_max>
<<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
_kernel_reduce_block<_device_max>
<<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
break;
case RTYPE_MIN:
_kernel_reduce_vec<_device_length_vec>
<<<bpg, tpb, 0, stream>>>(vertex_buffer_a, vertex_buffer_b, vertex_buffer_c,
reduce_scratchpad);
_kernel_reduce<_device_min>
<<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
_kernel_reduce_block<_device_min>
<<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
break;
case RTYPE_RMS:
_kernel_reduce_vec<_device_squared_vec>
<<<bpg, tpb, 0, stream>>>(vertex_buffer_a, vertex_buffer_b, vertex_buffer_c,
reduce_scratchpad);
_kernel_reduce<_device_sum>
<<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
_kernel_reduce_block<_device_sum>
<<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
solve_mean = true;
break;
case RTYPE_RMS_EXP:
_kernel_reduce_vec<_device_exp_squared_vec>
<<<bpg, tpb, 0, stream>>>(vertex_buffer_a, vertex_buffer_b, vertex_buffer_c,
reduce_scratchpad);
_kernel_reduce<_device_sum>
<<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
_kernel_reduce_block<_device_sum>
<<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
solve_mean = true;
break;
default:
ERROR("Unrecognized RTYPE");
}
AcReal result;
cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
if (solve_mean) {
const AcReal inv_n = AcReal(1.0) / (nx * ny * nz);
return inv_n * result;
}
else {
return result;
}
}

742
src/core/kernels/rk3.cuh Normal file
View File

@@ -0,0 +1,742 @@
/*
Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
This file is part of Astaroth.
Astaroth is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Astaroth is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* @file
* \brief Implementation of the integration pipeline
*
*
*
*/
#pragma once
#include "device_globals.cuh"
#include <assert.h>
/*
#define RK_THREADS_X (32)
#define RK_THREADS_Y (1)
#define RK_THREADS_Z (4)
#define RK_LAUNCH_BOUND_MIN_BLOCKS (4)
#define RK_THREADBLOCK_SIZE (RK_THREADS_X * RK_THREADS_Y * RK_THREADS_Z)
*/
static __device__ __forceinline__ int
IDX(const int i)
{
return i;
}
static __device__ __forceinline__ int
IDX(const int i, const int j, const int k)
{
return DEVICE_VTXBUF_IDX(i, j, k);
}
static __device__ __forceinline__ int
IDX(const int3 idx)
{
return DEVICE_VTXBUF_IDX(idx.x, idx.y, idx.z);
}
static __forceinline__ AcMatrix
create_rotz(const AcReal radians)
{
AcMatrix mat;
mat.row[0] = (AcReal3){cos(radians), -sin(radians), 0};
mat.row[1] = (AcReal3){sin(radians), cos(radians), 0};
mat.row[2] = (AcReal3){0, 0, 1}; // rotation about the z axis leaves the z component unchanged
return mat;
}
#if AC_DOUBLE_PRECISION == 0
#define sin __sinf
#define cos __cosf
#define exp __expf
#define rsqrt rsqrtf // hardware reciprocal sqrt
#endif // AC_DOUBLE_PRECISION == 0
/*
typedef struct {
int i, j, k;
} int3;*/
/*
* =============================================================================
* Level 0 (Input Assembly Stage)
* =============================================================================
*/
/*
* =============================================================================
* Level 0.1 (Read stencil elements and solve derivatives)
* =============================================================================
*/
static __device__ __forceinline__ AcReal
first_derivative(const AcReal* __restrict__ pencil, const AcReal inv_ds)
{
#if STENCIL_ORDER == 2
const AcReal coefficients[] = {0, 1.0 / 2.0};
#elif STENCIL_ORDER == 4
const AcReal coefficients[] = {0, 2.0 / 3.0, -1.0 / 12.0};
#elif STENCIL_ORDER == 6
const AcReal coefficients[] = {0, 3.0 / 4.0, -3.0 / 20.0, 1.0 / 60.0};
#elif STENCIL_ORDER == 8
const AcReal coefficients[] = {0, 4.0 / 5.0, -1.0 / 5.0, 4.0 / 105.0,
-1.0 / 280.0};
#endif
#define MID (STENCIL_ORDER / 2)
AcReal res = 0;
#pragma unroll
for (int i = 1; i <= MID; ++i)
res += coefficients[i] * (pencil[MID + i] - pencil[MID - i]);
return res * inv_ds;
}
static __device__ __forceinline__ AcReal
second_derivative(const AcReal* __restrict__ pencil, const AcReal inv_ds)
{
#if STENCIL_ORDER == 2
const AcReal coefficients[] = {-2., 1.};
#elif STENCIL_ORDER == 4
const AcReal coefficients[] = {-5.0/2.0, 4.0/3.0, -1.0/12.0};
#elif STENCIL_ORDER == 6
const AcReal coefficients[] = {-49.0 / 18.0, 3.0 / 2.0, -3.0 / 20.0,
1.0 / 90.0};
#elif STENCIL_ORDER == 8
const AcReal coefficients[] = {-205.0 / 72.0, 8.0 / 5.0, -1.0 / 5.0,
8.0 / 315.0, -1.0 / 560.0};
#endif
#define MID (STENCIL_ORDER / 2)
AcReal res = coefficients[0] * pencil[MID];
#pragma unroll
for (int i = 1; i <= MID; ++i)
res += coefficients[i] * (pencil[MID + i] + pencil[MID - i]);
return res * inv_ds * inv_ds;
}
/** inv_ds: inverted mesh spacing f.ex. 1. / mesh.int_params[AC_dsx] */
static __device__ __forceinline__ AcReal
cross_derivative(const AcReal* __restrict__ pencil_a,
const AcReal* __restrict__ pencil_b, const AcReal inv_ds_a,
const AcReal inv_ds_b)
{
#if STENCIL_ORDER == 2
const AcReal coefficients[] = {0, 1.0 / 4.0};
#elif STENCIL_ORDER == 4
const AcReal coefficients[] = {0, 1.0 / 32.0, 1.0 / 64.0}; // TODO correct coefficients, these are just placeholders
#elif STENCIL_ORDER == 6
const AcReal fac = (1. / 720.);
const AcReal coefficients[] = {0.0 * fac, 270.0 * fac, -27.0 * fac,
2.0 * fac};
#elif STENCIL_ORDER == 8
const AcReal fac = (1. / 20160.);
const AcReal coefficients[] = {0.0 * fac, 8064. * fac, -1008. * fac,
128. * fac, -9. * fac};
#endif
#define MID (STENCIL_ORDER / 2)
AcReal res = AcReal(0.);
#pragma unroll
for (int i = 1; i <= MID; ++i) {
res += coefficients[i] * (pencil_a[MID + i] + pencil_a[MID - i] -
pencil_b[MID + i] - pencil_b[MID - i]);
}
return res * inv_ds_a * inv_ds_b;
}
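/*
 * Worked example of the weights above: for STENCIL_ORDER == 6, first_derivative()
 * evaluates the standard 6th-order central difference
 *
 *   f'(x) ~ ( 3/4*(f[+1] - f[-1]) - 3/20*(f[+2] - f[-2]) + 1/60*(f[+3] - f[-3]) ) * inv_ds
 *
 * with pencil[MID +- i] supplying the f[+-i] samples.
 */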
static __device__ __forceinline__ AcReal
derx(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
AcReal pencil[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y, vertexIdx.z)];
return first_derivative(pencil, DCONST_REAL(AC_inv_dsx));
}
static __device__ __forceinline__ AcReal
derxx(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
AcReal pencil[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y, vertexIdx.z)];
return second_derivative(pencil, DCONST_REAL(AC_inv_dsx));
}
static __device__ __forceinline__ AcReal
derxy(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
AcReal pencil_a[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil_a[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2,
vertexIdx.y + offset - STENCIL_ORDER / 2, vertexIdx.z)];
AcReal pencil_b[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil_b[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2,
vertexIdx.y + STENCIL_ORDER / 2 - offset, vertexIdx.z)];
return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsx),
DCONST_REAL(AC_inv_dsy));
}
static __device__ __forceinline__ AcReal
derxz(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
AcReal pencil_a[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil_a[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y,
vertexIdx.z + offset - STENCIL_ORDER / 2)];
AcReal pencil_b[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil_b[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y,
vertexIdx.z + STENCIL_ORDER / 2 - offset)];
return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsx),
DCONST_REAL(AC_inv_dsz));
}
static __device__ __forceinline__ AcReal
dery(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
AcReal pencil[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2, vertexIdx.z)];
return first_derivative(pencil, DCONST_REAL(AC_inv_dsy));
}
static __device__ __forceinline__ AcReal
deryy(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
AcReal pencil[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2, vertexIdx.z)];
return second_derivative(pencil, DCONST_REAL(AC_inv_dsy));
}
static __device__ __forceinline__ AcReal
deryz(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
AcReal pencil_a[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil_a[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2,
vertexIdx.z + offset - STENCIL_ORDER / 2)];
AcReal pencil_b[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil_b[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2,
vertexIdx.z + STENCIL_ORDER / 2 - offset)];
return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsy),
DCONST_REAL(AC_inv_dsz));
}
static __device__ __forceinline__ AcReal
derz(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
AcReal pencil[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z + offset - STENCIL_ORDER / 2)];
return first_derivative(pencil, DCONST_REAL(AC_inv_dsz));
}
static __device__ __forceinline__ AcReal
derzz(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
AcReal pencil[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z + offset - STENCIL_ORDER / 2)];
return second_derivative(pencil, DCONST_REAL(AC_inv_dsz));
}
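/*
 * Note: the helpers above compose directly; e.g. the Laplacian of a raw vertex
 * buffer at vertexIdx is derxx(vertexIdx, arr) + deryy(vertexIdx, arr) +
 * derzz(vertexIdx, arr). The cached path included below is intended to compute
 * the same quantities from pre-gathered stencil data.
 */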
/*
* =============================================================================
* Level 0.2 (Caching functions)
* =============================================================================
*/
#include "stencil_assembly.cuh"
/*
typedef struct {
AcRealData x;
AcRealData y;
AcRealData z;
} AcReal3Data;
static __device__ __forceinline__ AcReal3Data
read_data(const int i, const int j, const int k,
AcReal* __restrict__ buf[], const int3& handle)
{
AcReal3Data data;
data.x = read_data(i, j, k, buf, handle.x);
data.y = read_data(i, j, k, buf, handle.y);
data.z = read_data(i, j, k, buf, handle.z);
return data;
}
*/
/*
* =============================================================================
* Level 0.3 (Built-in functions available during the Stencil Processing Stage)
* =============================================================================
*/
static __host__ __device__ __forceinline__ AcReal3
operator-(const AcReal3& a, const AcReal3& b)
{
return (AcReal3){a.x - b.x, a.y - b.y, a.z - b.z};
}
static __host__ __device__ __forceinline__ AcReal3
operator+(const AcReal3& a, const AcReal3& b)
{
return (AcReal3){a.x + b.x, a.y + b.y, a.z + b.z};
}
static __host__ __device__ __forceinline__ AcReal3
operator-(const AcReal3& a)
{
return (AcReal3){-a.x, -a.y, -a.z};
}
static __host__ __device__ __forceinline__ AcReal3
operator*(const AcReal a, const AcReal3& b)
{
return (AcReal3){a * b.x, a * b.y, a * b.z};
}
static __host__ __device__ __forceinline__ AcReal
dot(const AcReal3& a, const AcReal3& b)
{
return a.x * b.x + a.y * b.y + a.z * b.z;
}
static __host__ __device__ __forceinline__ AcReal3
mul(const AcMatrix& aa, const AcReal3& x)
{
return (AcReal3){dot(aa.row[0], x), dot(aa.row[1], x), dot(aa.row[2], x)};
}
static __host__ __device__ __forceinline__ AcReal3
cross(const AcReal3& a, const AcReal3& b)
{
AcReal3 c;
c.x = a.y * b.z - a.z * b.y;
c.y = a.z * b.x - a.x * b.z;
c.z = a.x * b.y - a.y * b.x;
return c;
}
static __host__ __device__ __forceinline__ bool
is_valid(const AcReal a)
{
return !isnan(a) && !isinf(a);
}
static __host__ __device__ __forceinline__ bool
is_valid(const AcReal3& a)
{
return is_valid(a.x) && is_valid(a.y) && is_valid(a.z);
}
/*
* =============================================================================
* Level 1 (Stencil Processing Stage)
* =============================================================================
*/
/*
* =============================================================================
* Level 1.1 (Terms)
* =============================================================================
*/
static __device__ __forceinline__ AcReal
laplace(const AcRealData& data)
{
return hessian(data).row[0].x + hessian(data).row[1].y + hessian(data).row[2].z;
}
static __device__ __forceinline__ AcReal
divergence(const AcReal3Data& vec)
{
return gradient(vec.x).x + gradient(vec.y).y + gradient(vec.z).z;
}
static __device__ __forceinline__ AcReal3
laplace_vec(const AcReal3Data& vec)
{
return (AcReal3){laplace(vec.x), laplace(vec.y), laplace(vec.z)};
}
static __device__ __forceinline__ AcReal3
curl(const AcReal3Data& vec)
{
return (AcReal3){gradient(vec.z).y - gradient(vec.y).z,
gradient(vec.x).z - gradient(vec.z).x,
gradient(vec.y).x - gradient(vec.x).y};
}
static __device__ __forceinline__ AcReal3
gradient_of_divergence(const AcReal3Data& vec)
{
return (AcReal3){hessian(vec.x).row[0].x + hessian(vec.y).row[0].y + hessian(vec.z).row[0].z,
hessian(vec.x).row[1].x + hessian(vec.y).row[1].y + hessian(vec.z).row[1].z,
hessian(vec.x).row[2].x + hessian(vec.y).row[2].y + hessian(vec.z).row[2].z};
}
// Takes uu gradients and returns S
static __device__ __forceinline__ AcMatrix
stress_tensor(const AcReal3Data& vec)
{
AcMatrix S;
S.row[0].x = AcReal(2. / 3.) * gradient(vec.x).x -
AcReal(1. / 3.) * (gradient(vec.y).y + gradient(vec.z).z);
S.row[0].y = AcReal(1. / 2.) * (gradient(vec.x).y + gradient(vec.y).x);
S.row[0].z = AcReal(1. / 2.) * (gradient(vec.x).z + gradient(vec.z).x);
S.row[1].y = AcReal(2. / 3.) * gradient(vec.y).y -
AcReal(1. / 3.) * (gradient(vec.x).x + gradient(vec.z).z);
S.row[1].z = AcReal(1. / 2.) * (gradient(vec.y).z + gradient(vec.z).y);
S.row[2].z = AcReal(2. / 3.) * gradient(vec.z).z -
AcReal(1. / 3.) * (gradient(vec.x).x + gradient(vec.y).y);
S.row[1].x = S.row[0].y;
S.row[2].x = S.row[0].z;
S.row[2].y = S.row[1].z;
return S;
}
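/*
 * For reference, the matrix assembled above is the traceless rate-of-strain
 * tensor
 *
 *   S_ij = (1/2) * (d_i u_j + d_j u_i) - (1/3) * delta_ij * (div u),
 *
 * which is symmetric, so only the upper triangle is computed and mirrored.
 */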
static __device__ __forceinline__ AcReal
contract(const AcMatrix& mat)
{
AcReal res = 0;
#pragma unroll
for (int i = 0; i < 3; ++i)
res += dot(mat.row[i], mat.row[i]);
return res;
}
/*
* =============================================================================
* Level 1.2 (Equations)
* =============================================================================
*/
static __device__ __forceinline__ AcReal
length(const AcReal3& vec)
{
return sqrt(vec.x * vec.x + vec.y * vec.y + vec.z * vec.z);
}
static __device__ __forceinline__ AcReal
reciprocal_len(const AcReal3& vec)
{
return rsqrt(vec.x * vec.x + vec.y * vec.y + vec.z * vec.z);
}
static __device__ __forceinline__ AcReal3
normalized(const AcReal3& vec)
{
const AcReal inv_len = reciprocal_len(vec);
return inv_len * vec;
}
// Sinusoidal forcing
// https://arxiv.org/pdf/1704.04676.pdf
__constant__ AcReal3 forcing_vec;
__constant__ AcReal forcing_phi;
static __device__ __forceinline__ AcReal3
forcing(const int i, const int j, const int k)
{
#define DOMAIN_SIZE_X (DCONST_INT(AC_nx) * DCONST_REAL(AC_dsx))
#define DOMAIN_SIZE_Y (DCONST_INT(AC_ny) * DCONST_REAL(AC_dsy))
#define DOMAIN_SIZE_Z (DCONST_INT(AC_nz) * DCONST_REAL(AC_dsz))
const AcReal3 k_vec = (AcReal3){(i - DCONST_INT(AC_nx_min)) * DCONST_REAL(AC_dsx) - AcReal(.5) * DOMAIN_SIZE_X,
(j - DCONST_INT(AC_ny_min)) * DCONST_REAL(AC_dsy) - AcReal(.5) * DOMAIN_SIZE_Y,
(k - DCONST_INT(AC_nz_min)) * DCONST_REAL(AC_dsz) - AcReal(.5) * DOMAIN_SIZE_Z};
AcReal inv_len = reciprocal_len(k_vec);
if (isnan(inv_len) || isinf(inv_len))
inv_len = 0;
if (inv_len > 2) // hack to make it cool
inv_len = 2;
const AcReal k_dot_x = dot(k_vec, forcing_vec);
const AcReal waves = cos(k_dot_x)*cos(forcing_phi) - sin(k_dot_x) * sin(forcing_phi);
return inv_len * inv_len * waves * forcing_vec;
}
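/*
 * For reference: with f = forcing_vec, phi = forcing_phi and x_c the domain
 * centre, the expression above evaluates to
 *
 *   F(x) = f * cos(f . (x - x_c) + phi) / |x - x_c|^2,
 *
 * where 1/|x - x_c| is set to 0 at the centre and capped at 2 ("hack to make it cool").
 */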
// Note: LNT0 and LNRHO0 must be set very carefully: if their magnitude differs from the other values in the mesh, then we will inherently lose precision
#define LNT0 (AcReal(0.0))
#define LNRHO0 (AcReal(0.0))
#define H_CONST (AcReal(0.0))
#define C_CONST (AcReal(0.0))
template <int step_number>
static __device__ __forceinline__ AcReal
rk3_integrate(const AcReal state_previous, const AcReal state_current,
const AcReal rate_of_change, const AcReal dt)
{
// Williamson (1980)
const AcReal alpha[] = {0, AcReal(.0), AcReal(-5. / 9.), AcReal(-153. / 128.)};
const AcReal beta[] = {0, AcReal(1. / 3.), AcReal(15. / 16.),
AcReal(8. / 15.)};
// Note the indexing: the arrays are offset by +1 to avoid an unnecessary "out-of-bounds"
// compiler warning about beta[step_number - 1], which is only accessed when step_number >= 1
switch (step_number) {
case 0:
return state_current + beta[step_number + 1] * rate_of_change * dt;
case 1: // Fallthrough
case 2:
return state_current +
beta[step_number + 1] *
(alpha[step_number + 1] * (AcReal(1.) / beta[step_number]) *
(state_current - state_previous) +
rate_of_change * dt);
default:
return NAN;
}
}
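/*
 * Minimal illustrative sketch of how the three substeps chain together for a
 * scalar ODE dy/dt = -y; the function name is an assumption. In the real
 * pipeline the substeps are applied per vertex by the generated solve<>
 * kernels, with the previous state kept in the output buffers.
 */
static __device__ __forceinline__ AcReal
example_rk3_full_step(const AcReal y0, const AcReal dt)
{
    // Substep 0: state_previous is not used by the step-0 branch
    const AcReal y1 = rk3_integrate<0>(y0, y0, -y0, dt);
    // Substeps 1 and 2 reconstruct the Williamson register from (current - previous)
    const AcReal y2 = rk3_integrate<1>(y0, y1, -y1, dt);
    return rk3_integrate<2>(y1, y2, -y2, dt);
}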
/*
template <int step_number>
static __device__ __forceinline__ AcReal
rk3_integrate_scal(const AcReal state_previous, const AcReal state_current,
const AcReal rate_of_change, const AcReal dt)
{
// Williamson (1980)
const AcReal alpha[] = {AcReal(.0), AcReal(-5. / 9.), AcReal(-153. / 128.)};
const AcReal beta[] = {AcReal(1. / 3.), AcReal(15. / 16.),
AcReal(8. / 15.)};
switch (step_number) {
case 0:
return state_current + beta[step_number] * rate_of_change * dt;
case 1: // Fallthrough
case 2:
return state_current +
beta[step_number] *
(alpha[step_number] * (AcReal(1.) / beta[step_number - 1]) *
(state_current - state_previous) +
rate_of_change * dt);
default:
return NAN;
}
}
*/
template <int step_number>
static __device__ __forceinline__ AcReal3
rk3_integrate(const AcReal3 state_previous, const AcReal3 state_current,
const AcReal3 rate_of_change, const AcReal dt)
{
return (AcReal3) { rk3_integrate<step_number>(state_previous.x, state_current.x, rate_of_change.x, dt),
rk3_integrate<step_number>(state_previous.y, state_current.y, rate_of_change.y, dt),
rk3_integrate<step_number>(state_previous.z, state_current.z, rate_of_change.z, dt)};
}
#define rk3(state_previous, state_current, rate_of_change, dt)\
rk3_integrate<step_number>(state_previous, value(state_current), rate_of_change, dt)
/*
template <int step_number>
static __device__ __forceinline__ AcReal
rk3_integrate(const int idx, const AcReal out, const int handle,
const AcRealData& in_cached, const AcReal rate_of_change, const AcReal dt)
{
return rk3_integrate_scal<step_number>(out, value(in_cached), rate_of_change, dt);
}
template <int step_number>
static __device__ __forceinline__ AcReal3
rk3_integrate(const int idx, const AcReal3 out, const int3& handle,
const AcReal3Data& in_cached, const AcReal3& rate_of_change, const AcReal dt)
{
return (AcReal3) {
rk3_integrate<step_number>(idx, out, handle.x, in_cached.x, rate_of_change.x, dt),
rk3_integrate<step_number>(idx, out, handle.y, in_cached.y, rate_of_change.y, dt),
rk3_integrate<step_number>(idx, out, handle.z, in_cached.z, rate_of_change.z, dt)
};
}
#define RK3(handle, in_cached, rate_of_change, dt) \
rk3_integrate<step_number>(idx, buffer.out, handle, in_cached, rate_of_change, dt)
*/
/*
* =============================================================================
* Level 1.3 (Kernels)
* =============================================================================
*/
static __device__ void
write(AcReal* __restrict__ out[], const int handle, const int idx, const AcReal value)
{
out[handle][idx] = value;
}
static __device__ void
write(AcReal* __restrict__ out[], const int3 vec, const int idx, const AcReal3 value)
{
write(out, vec.x, idx, value.x);
write(out, vec.y, idx, value.y);
write(out, vec.z, idx, value.z);
}
static __device__ AcReal
read_out(const int idx, AcReal* __restrict__ field[], const int handle)
{
return field[handle][idx];
}
static __device__ AcReal3
read_out(const int idx, AcReal* __restrict__ field[], const int3 handle)
{
return (AcReal3) { read_out(idx, field, handle.x),
read_out(idx, field, handle.y),
read_out(idx, field, handle.z) };
}
#define WRITE_OUT(handle, value) (write(buffer.out, handle, idx, value))
#define READ(handle) (read_data(vertexIdx, buffer.in, handle))
#define READ_OUT(handle) (read_out(idx, buffer.out, handle))
// WRITE is also defined here for clarity; it is not part of the DSL
//#define WRITE(HANDLE) (write(idx, ...)) s.t. we don't have to insert boilerplate in the middle of the function
#define GEN_KERNEL_PARAM_BOILERPLATE \
const int3 start, const int3 end, VertexBufferArray buffer
#define GEN_KERNEL_BUILTIN_VARIABLES_BOILERPLATE() \
const int3 vertexIdx = (int3){threadIdx.x + blockIdx.x * blockDim.x + start.x,\
threadIdx.y + blockIdx.y * blockDim.y + start.y,\
threadIdx.z + blockIdx.z * blockDim.z + start.z};\
if (vertexIdx.x >= end.x || vertexIdx.y >= end.y || vertexIdx.z >= end.z)\
return;\
\
\
assert(vertexIdx.x < DCONST_INT(AC_nx_max) && vertexIdx.y < DCONST_INT(AC_ny_max) &&\
vertexIdx.z < DCONST_INT(AC_nz_max));\
\
assert(vertexIdx.x >= DCONST_INT(AC_nx_min) && vertexIdx.y >= DCONST_INT(AC_ny_min) &&\
vertexIdx.z >= DCONST_INT(AC_nz_min));\
\
const int idx = IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z);
#include "stencil_process.cuh"
/*
* =============================================================================
* Level 2 (Host calls)
* =============================================================================
*/
static AcReal
randf(void)
{
return AcReal(rand()) / AcReal(RAND_MAX);
}
AcResult
rk3_step_async(const cudaStream_t stream, const dim3& tpb,
const int3& start, const int3& end, const int& step_number,
const AcReal dt, const AcMeshInfo& /*mesh_info*/,
VertexBufferArray* buffer)
{
/////////////////// Forcing
#if LFORCING
const AcReal ff_scale = AcReal(.2);
static AcReal3 ff = ff_scale * (AcReal3){1, 0, 0};
const AcReal radians = randf() * AcReal(2*M_PI) / 360 / 8;
const AcMatrix rotz = create_rotz(radians);
ff = mul(rotz, ff);
cudaMemcpyToSymbolAsync(forcing_vec, &ff, sizeof(ff), 0, cudaMemcpyHostToDevice, stream);
const AcReal ff_phi = AcReal(M_PI);//AcReal(2 * M_PI) * randf();
cudaMemcpyToSymbolAsync(forcing_phi, &ff_phi, sizeof(ff_phi), 0, cudaMemcpyHostToDevice, stream);
#endif // LFORCING
//////////////////////////
const int nx = end.x - start.x;
const int ny = end.y - start.y;
const int nz = end.z - start.z;
const dim3 bpg(
(unsigned int)ceil(nx / AcReal(tpb.x)),
(unsigned int)ceil(ny / AcReal(tpb.y)),
(unsigned int)ceil(nz / AcReal(tpb.z)));
if (step_number == 0)
solve<0><<<bpg, tpb, 0, stream>>>(start, end, *buffer, dt);
else if (step_number == 1)
solve<1><<<bpg, tpb, 0, stream>>>(start, end, *buffer, dt);
else
solve<2><<<bpg, tpb, 0, stream>>>(start, end, *buffer, dt);
ERRCHK_CUDA_KERNEL();
return AC_SUCCESS;
}

91
src/core/math_utils.h Normal file
View File

@@ -0,0 +1,91 @@
/*
Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
This file is part of Astaroth.
Astaroth is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Astaroth is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* @file
* \brief Small host-side math utilities.
*
* Provides min/max/sum/clamp, NAN/INF validity checks, int3 operators and a power-of-two test.
*
*/
#pragma once
#include <math.h> // isnan, isinf
#include <stdlib.h> // rand
template <class T>
static inline const T
max(const T& a, const T& b)
{
return a > b ? a : b;
}
template <class T>
static inline const T
min(const T& a, const T& b)
{
return a < b ? a : b;
}
template <class T>
static inline const T
sum(const T& a, const T& b)
{
return a + b;
}
template <class T>
static inline bool
is_valid(const T& val)
{
if (isnan(val) || isinf(val))
return false;
else
return true;
}
template <class T>
static inline const T
clamp(const T& val, const T& min, const T& max)
{
return val < min ? min : val > max ? max : val;
}
static inline AcReal
randr()
{
return AcReal(rand()) / AcReal(RAND_MAX);
}
static inline int3
operator+(const int3& a, const int3& b)
{
return (int3){a.x + b.x, a.y + b.y, a.z + b.z};
}
static inline int3
operator-(const int3& a, const int3& b)
{
return (int3){a.x - b.x, a.y - b.y, a.z - b.z};
}
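// A nonzero value is a power of two iff it has exactly one bit set:
// val & (val - 1) clears the lowest set bit, so the AND is zero only in that case.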
static inline bool
is_power_of_two(const unsigned val)
{
return val && !(val & (val - 1));
}