Added Astaroth 2.0

2019-06-14 14:18:35 +03:00
parent 4e4f84c8ff
commit 0e48766a68
87 changed files with 18058 additions and 1 deletions
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -0,0 +1,70 @@
+########################################
+##  CMakeLists.txt for Astaroth Core  ##
+########################################
+
+#----------------------Find CUDA-----------------------------------------------#
+
+find_package(CUDA)
+if (NOT CUDA_FOUND)
+    # find_package(CUDA REQUIRED) gives a confusing error message if it fails,
+    # therefore we print the reason here explicitly
+    message(FATAL_ERROR "CUDA not found")
+endif()
+
+
+#----------------------CUDA settings-------------------------------------------#
+
+set(CUDA_SEPARABLE_COMPILATION ON)
+set(CUDA_PROPAGATE_HOST_FLAGS ON)
+
+# CUDA_BUILD_CUBIN requires that we're compiling for only one architecture
+# set(CUDA_BUILD_CUBIN ON)
+
+
+#----------------------Setup CUDA compilation flags----------------------------#
+
+# Generate code for compute capabilities 3.7, 5.0, 6.0 and 6.1
+set(CUDA_ARCH_FLAGS -gencode arch=compute_37,code=sm_37
+                    -gencode arch=compute_50,code=sm_50 
+                    -gencode arch=compute_60,code=sm_60 
+                    -gencode arch=compute_61,code=sm_61 
+                    -lineinfo 
+                    --maxrregcount=255
+                    -ftz=true 
+                    -std=c++11) # -ftz=true flushes denormalized floats to zero
+# -Xptxas -dlcm=ca opt-in to cache all global loads to L1/texture cache
+# =cg to opt out
+
+# Additional CUDA optimization flags
+if (CMAKE_BUILD_TYPE MATCHES RELEASE)
+    # Doesn't set any additional flags, see CUDA_NVCC_FLAGS_DEBUG below on how
+    # to add more
+    set(CUDA_NVCC_FLAGS_RELEASE ${CUDA_NVCC_FLAGS_RELEASE})
+endif()
+
+# Additional CUDA debug flags
+if (CMAKE_BUILD_TYPE MATCHES DEBUG)
+    # The debug flags must be set inside this if clause, since either CMake 3.5
+    # or nvcc 7.5 is bugged:
+    # CMake converts these into empty strings when doing RELEASE build, but nvcc
+    # 7.5 fails to parse empty flags.
+    set(CUDA_NVCC_FLAGS_DEBUG ${CUDA_NVCC_FLAGS_DEBUG};
+                               --device-debug;
+                               --generate-line-info;
+                               --ptxas-options=-v)
+endif()
+
+set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};${CUDA_ARCH_FLAGS}")
+
+
+message("CUDA_NVCC_FLAGS: " ${CUDA_NVCC_FLAGS})
+
+
+#------------------Compile and create a static library-------------------------#
+file(GLOB CUDA_SOURCES "*.cu" "kernels/*.cu")
+
+# Use -fPIC if -fpic is not supported. Some quick non-scientific build timings:
+# Without fpic: 4.94 user, 4.04 system, 0:09.88 elapsed
+# With fpic: 4.96 user, 4.02 system, 0:09.90 elapsed
+# With fPIC: 4.94 user, 4.05 system, 0:10.23 elapsed
+CUDA_ADD_LIBRARY(astaroth_core STATIC ${CUDA_SOURCES} OPTIONS --compiler-options "-fpic")
--- a/src/core/astaroth.cu
+++ b/src/core/astaroth.cu
@@ -0,0 +1,451 @@
+/*
+    Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
+ * \brief Multi-GPU implementation.
+ *
+ * Detailed info.
+ *
+ */
+#include "astaroth.h"
+#include "errchk.h"
+
+#include "device.cuh"
+#include "math_utils.h" // sum for reductions
+#include "standalone/config_loader.h" // update_config
+
+const char* intparam_names[]      = {AC_FOR_INT_PARAM_TYPES(AC_GEN_STR)}; // NOTE(review): presumably one name string per integer parameter, produced by the AC_* macros -- confirm in astaroth.h
+const char* realparam_names[]     = {AC_FOR_REAL_PARAM_TYPES(AC_GEN_STR)}; // NOTE(review): presumably the same for real parameters -- confirm
+const char* vtxbuf_names[]        = {AC_FOR_VTXBUF_HANDLES(AC_GEN_STR)}; // NOTE(review): presumably the same for vertex buffer handles -- confirm
+
+
+static const int MAX_NUM_DEVICES = 32; // Capacity of devices[]; acInit() clamps num_devices to this value
+static int num_devices = 1; // Number of devices in use; overwritten by acInit()
+static Device devices[MAX_NUM_DEVICES] = {}; // Device handles; entries [0, num_devices) are created in acInit()
+
+typedef struct {
+    int3 m; // Extents read from AC_mx, AC_my, AC_mz (see createGrid()); used as the index pitch in gridIdx()
+    int3 n; // Extents read from AC_nx, AC_ny, AC_nz (see createGrid())
+} Grid;
+
+/**
+ * Builds a Grid from the mesh dimensions stored in config.
+ * m is read from AC_mx/AC_my/AC_mz and n from AC_nx/AC_ny/AC_nz.
+ */
+static Grid
+createGrid(const AcMeshInfo& config)
+{
+    Grid result;
+
+    result.m.x = config.int_params[AC_mx];
+    result.m.y = config.int_params[AC_my];
+    result.m.z = config.int_params[AC_mz];
+
+    result.n.x = config.int_params[AC_nx];
+    result.n.y = config.int_params[AC_ny];
+    result.n.z = config.int_params[AC_nz];
+
+    return result;
+}
+
+static Grid grid; // A grid consists of num_devices subgrids. Set in acInit() from the full mesh config
+static Grid subgrid; // Set in acInit() from the config whose AC_nz has been divided by num_devices
+
+/**
+ * Flattens the coordinates (i, j, k) into a linear index, using grid.m.x as
+ * the row pitch and grid.m.x * grid.m.y as the plane pitch.
+ */
+static int
+gridIdx(const Grid& grid, const int i, const int j, const int k)
+{
+    const int x_offset = i;
+    const int y_offset = j * grid.m.x;
+    const int z_offset = k * grid.m.x * grid.m.y;
+    return x_offset + y_offset + z_offset;
+}
+
+/**
+ * Inverse of gridIdx(): splits the linear index idx into (x, y, z)
+ * coordinates using the pitches given by grid.m.
+ */
+static int3
+gridIdx3d(const Grid& grid, const int idx)
+{
+    const int row_pitch   = grid.m.x;
+    const int plane_pitch = grid.m.x * grid.m.y;
+
+    int3 coords;
+    coords.x = idx % row_pitch;
+    coords.y = (idx % plane_pitch) / row_pitch;
+    coords.z = idx / plane_pitch;
+    return coords;
+}
+
+/** Prints vec to stdout in the form "(x, y, z)" without a trailing newline. */
+void
+printInt3(const int3 vec)
+{
+    const int x = vec.x;
+    const int y = vec.y;
+    const int z = vec.z;
+    printf("(%d, %d, %d)", x, y, z);
+}
+
+/**
+ * Initializes the multi-GPU layer.
+ *
+ * Queries the number of CUDA devices, clamps it to MAX_NUM_DEVICES (or to 1
+ * when AC_MULTIGPU_ENABLED is false), decomposes the domain described by
+ * config along z into num_devices equally sized subgrids and creates one
+ * Device per subgrid.
+ *
+ * @param config Mesh configuration of the whole simulation domain.
+ * @return AC_FAILURE if the device count cannot be queried or no device is
+ *         available, AC_SUCCESS otherwise.
+ */
+AcResult
+acInit(const AcMeshInfo& config)
+{
+    // Check devices. A failed query is treated the same as "no devices": the
+    // value of num_devices cannot be trusted in that case.
+    const cudaError_t count_status = cudaGetDeviceCount(&num_devices);
+    if (count_status != cudaSuccess || num_devices < 1) {
+        num_devices = 0; // Keeps the per-device loops in this file from touching unset handles
+        ERROR("No CUDA devices found!");
+        return AC_FAILURE;
+    }
+    if (num_devices > MAX_NUM_DEVICES) {
+        WARNING("More devices found than MAX_NUM_DEVICES. Using only MAX_NUM_DEVICES");
+        num_devices = MAX_NUM_DEVICES;
+    }
+    if (!AC_MULTIGPU_ENABLED) {
+        WARNING("MULTIGPU_ENABLED was false. Using only one device");
+        num_devices = 1; // Use only one device if multi-GPU is not enabled
+    }
+    // Check that AC_nz is divisible by num_devices. This makes decomposing the
+    // problem domain to multiple GPUs much easier since we do not have to worry
+    // about remainders
+    ERRCHK_ALWAYS(config.int_params[AC_nz] % num_devices == 0);
+
+    // Decompose the problem domain
+    // The main grid
+    grid = createGrid(config);
+
+    // Subgrids: same as the main grid except that nz is split evenly among the
+    // devices. update_config() is called so that values derived from nz are
+    // refreshed for the smaller domain.
+    AcMeshInfo subgrid_config = config;
+    subgrid_config.int_params[AC_nz] /= num_devices;
+    update_config(&subgrid_config);
+    subgrid = createGrid(subgrid_config);
+
+    // Periodic boundary conditions become weird if the system can "fold unto itself".
+    ERRCHK_ALWAYS(subgrid.n.x >= STENCIL_ORDER);
+    ERRCHK_ALWAYS(subgrid.n.y >= STENCIL_ORDER);
+    ERRCHK_ALWAYS(subgrid.n.z >= STENCIL_ORDER);
+
+    printf("Grid m "); printInt3(grid.m); printf("\n");
+    printf("Grid n "); printInt3(grid.n); printf("\n");
+    printf("Subgrid m "); printInt3(subgrid.m); printf("\n");
+    printf("Subgrid n "); printInt3(subgrid.n); printf("\n");
+
+    // Initialize the devices
+    for (int i = 0; i < num_devices; ++i) {
+        const AcResult created = createDevice(i, subgrid_config, &devices[i]);
+        ERRCHK_ALWAYS(created == AC_SUCCESS);
+        printDeviceInfo(devices[i]);
+    }
+    return AC_SUCCESS;
+}
+
+/**
+ * Destroys devices[0] .. devices[num_devices - 1] in ascending order.
+ * Always returns AC_SUCCESS.
+ */
+AcResult
+acQuit(void)
+{
+    int device_idx = 0;
+    while (device_idx < num_devices) {
+        destroyDevice(devices[device_idx]);
+        ++device_idx;
+    }
+    return AC_SUCCESS;
+}
+
+/**
+ * Same mapping as gridIdx(), but takes the coordinates packed in an int3:
+ * idx.x + idx.y * grid.m.x + idx.z * grid.m.x * grid.m.y.
+ */
+int
+gridIdxx(const Grid grid, const int3 idx)
+{
+    const int x_offset = idx.x;
+    const int y_offset = idx.y * grid.m.x;
+    const int z_offset = idx.z * grid.m.x * grid.m.y;
+    return x_offset + y_offset + z_offset;
+}
+
+/**
+ * Copies num_vertices vertices of every vertex buffer, starting from the
+ * global coordinates src, from host_mesh to the devices whose subgrid
+ * overlaps that range. The copies are issued on STREAM_PRIMARY.
+ *
+ * @param host_mesh    Source mesh on the host.
+ * @param src          Global coordinates of the first vertex to copy.
+ * @param num_vertices Number of consecutive vertices to copy.
+ * @return AC_SUCCESS.
+ */
+AcResult
+acLoadWithOffset(const AcMesh& host_mesh, const int3& src, const int num_vertices)
+{
+    /*
+    Here we decompose the host mesh and distribute it among the GPUs in
+    the node.
+
+    The host mesh is a huge contiguous block of data. Its dimensions are given by
+    the global variable named "grid". A "grid" is decomposed into "subgrids",
+    one for each GPU. Here we check which parts of the range s0...s1 maps
+    to the memory space stored by some GPU, ranging d0...d1, and transfer
+    the data if needed.
+
+    The index mapping is inherently quite involved, but here's a picture which
+    hopefully helps make sense out of all this.
+
+
+    Grid
+                                     |----num_vertices---|
+    xxx|....................................................|xxx
+             ^                   ^   ^                   ^
+            d0                  d1  s0 (src)            s1
+
+    Subgrid
+
+             xxx|.............|xxx
+             ^                   ^
+            d0                  d1
+
+                                 ^   ^
+                                db  da
+
+    */
+    // The source range is the same for every device, so compute it once
+    const int3 s0 = src;
+    const int3 s1 = gridIdx3d(grid, gridIdx(grid, s0.x, s0.y, s0.z) + num_vertices);
+
+    for (int i = 0; i < num_devices; ++i) {
+        const int3 d0 = (int3){0, 0, i * subgrid.n.z}; // DECOMPOSITION OFFSET HERE
+        const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.m.z};
+
+        // Intersection of the source range and the range held by device i
+        const int3 da = (int3){max(s0.x, d0.x), max(s0.y, d0.y), max(s0.z, d0.z)};
+        const int3 db = (int3){min(s1.x, d1.x), min(s1.y, d1.y), min(s1.z, d1.z)};
+        /*
+        printf("Device %d\n", i);
+        printf("\ts0: "); printInt3(s0); printf("\n");
+        printf("\td0: "); printInt3(d0); printf("\n");
+        printf("\tda: "); printInt3(da); printf("\n");
+        printf("\tdb: "); printInt3(db); printf("\n");
+        printf("\td1: "); printInt3(d1); printf("\n");
+        printf("\ts1: "); printInt3(s1); printf("\n");
+        printf("\t-> %s to device %d\n", db.z >= da.z ? "Copy" : "Do not copy", i);
+        */
+        if (db.z >= da.z) {
+            const int copy_cells = gridIdxx(subgrid, db) - gridIdxx(subgrid, da);
+            // A non-positive count must not reach the copy: it is multiplied by
+            // sizeof(AcReal) there and a negative value would wrap around to a
+            // huge byte count.
+            if (copy_cells > 0) {
+                const int3 da_local = (int3){da.x, da.y, da.z - i * grid.n.z / num_devices}; // DECOMPOSITION OFFSET HERE
+                // printf("\t\tcopy %d cells to local index ", copy_cells); printInt3(da_local); printf("\n");
+                copyMeshToDevice(devices[i], STREAM_PRIMARY, host_mesh, da, da_local, copy_cells);
+            }
+        }
+    }
+    return AC_SUCCESS;
+}
+
+/**
+ * Copies num_vertices vertices of every vertex buffer, starting from the
+ * global coordinates src, from the devices whose subgrid overlaps that range
+ * back to host_mesh. The copies are issued on STREAM_PRIMARY.
+ *
+ * For each device i the range [d0, d1) it holds is intersected with the
+ * requested range [s0, s1); the intersection [da, db) is what gets copied.
+ *
+ * @param src          Global coordinates of the first vertex to copy.
+ * @param num_vertices Number of consecutive vertices to copy.
+ * @param host_mesh    Destination mesh on the host.
+ * @return AC_SUCCESS.
+ */
+AcResult
+acStoreWithOffset(const int3& src, const int num_vertices, AcMesh* host_mesh)
+{
+    // The requested range is the same for every device, so compute it once
+    const int3 s0 = src;
+    const int3 s1 = gridIdx3d(grid, gridIdx(grid, s0.x, s0.y, s0.z) + num_vertices);
+
+    for (int i = 0; i < num_devices; ++i) {
+        const int3 d0 = (int3){0, 0, i * subgrid.n.z}; // DECOMPOSITION OFFSET HERE
+        const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.m.z};
+
+        const int3 da = (int3){max(s0.x, d0.x), max(s0.y, d0.y), max(s0.z, d0.z)};
+        const int3 db = (int3){min(s1.x, d1.x), min(s1.y, d1.y), min(s1.z, d1.z)};
+        /*
+        printf("Device %d\n", i);
+        printf("\ts0: "); printInt3(s0); printf("\n");
+        printf("\td0: "); printInt3(d0); printf("\n");
+        printf("\tda: "); printInt3(da); printf("\n");
+        printf("\tdb: "); printInt3(db); printf("\n");
+        printf("\td1: "); printInt3(d1); printf("\n");
+        printf("\ts1: "); printInt3(s1); printf("\n");
+        printf("\t-> %s to device %d\n", db.z >= da.z ? "Copy" : "Do not copy", i);
+        */
+        if (db.z >= da.z) {
+            const int copy_cells = gridIdxx(subgrid, db) - gridIdxx(subgrid, da);
+            // A non-positive count must not reach the copy: it is multiplied by
+            // sizeof(AcReal) there and a negative value would wrap around to a
+            // huge byte count.
+            if (copy_cells > 0) {
+                const int3 da_local = (int3){da.x, da.y, da.z - i * grid.n.z / num_devices}; // DECOMPOSITION OFFSET HERE
+                // printf("\t\tcopy %d cells from local index ", copy_cells); printInt3(da_local); printf("\n");
+                copyMeshToHost(devices[i], STREAM_PRIMARY, da_local, da, copy_cells, host_mesh);
+            }
+        }
+    }
+    return AC_SUCCESS;
+}
+
+// acCopyMeshToDevice
+/**
+ * Loads AC_VTXBUF_SIZE(host_mesh.info) vertices, starting from the origin,
+ * from host_mesh to the devices via acLoadWithOffset().
+ */
+AcResult
+acLoad(const AcMesh& host_mesh)
+{
+    const int3 origin      = (int3){0, 0, 0};
+    const int num_vertices = AC_VTXBUF_SIZE(host_mesh.info);
+    return acLoadWithOffset(host_mesh, origin, num_vertices);
+}
+
+// acCopyMeshToHost
+/**
+ * Stores AC_VTXBUF_SIZE(host_mesh->info) vertices, starting from the origin,
+ * from the devices to host_mesh via acStoreWithOffset().
+ */
+AcResult
+acStore(AcMesh* host_mesh)
+{
+    const int3 origin      = (int3){0, 0, 0};
+    const int num_vertices = AC_VTXBUF_SIZE(host_mesh->info);
+    return acStoreWithOffset(origin, num_vertices, host_mesh);
+}
+
+/**
+ * Issues substep isubstep of the integration on every device, on
+ * STREAM_PRIMARY. The range handed to rkStep() starts STENCIL_ORDER/2
+ * vertices into each dimension and spans subgrid.n vertices.
+ * Always returns AC_SUCCESS.
+ */
+AcResult
+acIntegrateStep(const int& isubstep, const AcReal& dt)
+{
+    const int offset = STENCIL_ORDER/2;
+
+    const int3 start = (int3){offset, offset, offset};
+    const int3 end   = (int3){offset + subgrid.n.x,
+                              offset + subgrid.n.y,
+                              offset + subgrid.n.z};
+
+    int device_idx = 0;
+    while (device_idx < num_devices) {
+        rkStep(devices[device_idx], STREAM_PRIMARY, isubstep, start, end, dt);
+        ++device_idx;
+    }
+
+    return AC_SUCCESS;
+}
+
+AcResult
+acBoundcondStep(void) // Applies boundary conditions on all devices and, with more than one device, exchanges the z plates between neighbours
+{
+    acSynchronize(); // Wait for all previously issued work on every device
+    if (num_devices == 1) {
+        boundcondStep(devices[0], STREAM_PRIMARY,
+                      (int3){0, 0, 0}, (int3){subgrid.m.x, subgrid.m.y, subgrid.m.z});
+    } else {
+        // Local boundary conditions: the z range skips the first STENCIL_ORDER/2 planes and spans subgrid.n.z planes
+        for (int i = 0; i < num_devices; ++i) {
+            const int3 d0 = (int3){0, 0, STENCIL_ORDER/2}; // DECOMPOSITION OFFSET HERE
+            const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.n.z};
+            boundcondStep(devices[i], STREAM_PRIMARY, d0, d1);
+        }
+
+/*
+// ===MIIKKANOTE START==========================================
+%JP: The old way for computing boundary conditions conflicts with the
+way we have to do things with multiple GPUs.
+
+The older approach relied on unified memory, which represented the whole
+memory area as one huge mesh instead of several smaller ones. However, unified memory
+in its current state is more meant for quick prototyping when performance is not an issue.
+Getting the CUDA driver to migrate data intelligently across GPUs is much more difficult than
+when managing the memory explicitly.
+
+In this new approach, I have simplified the multi- and single-GPU layers significantly.
+Quick rundown:
+	New struct: Grid. There are two global variables, "grid" and "subgrid", which
+	contain the extents of the whole simulation domain and the decomposed grids, respectively.
+	To simplify things, we require that each GPU is assigned the same amount of work,
+	therefore each GPU in the node is assigned a "subgrid.m"-sized block of data
+	to work with.
+
+	The whole simulation domain is decomposed with respect to the z dimension.
+	For example, if the grid contains (nx, ny, nz) vertices, then the subgrids
+	contain (nx, ny, nz / num_devices) vertices.
+
+	A local index (i, j, k) in some subgrid can be mapped to the global grid with
+		global idx = (i, j, k + device_id * subgrid.n.z)
+
+Terminology:
+	- Single-GPU function: a function defined on the single-GPU layer (device.cu)
+
+Changes required to this commented code block:
+	- The thread block dimensions (tpb) are no longer passed to the kernel here but in device.cu
+	  instead. Same holds for any complex index calculations. Instead, the local coordinates
+	  should be passed as an int3 type without having to consider how the data is actually
+	  laid out in device memory
+	- The unified memory buffer no longer exists (d_buffer). Instead, we have an opaque handle
+	  of type "Device" which should be passed to single-GPU functions. In this file, all devices
+	  are stored in a global array "devices[num_devices]".
+	- Every single-GPU function is executed asynchronously by default such that we
+	  can optimize Astaroth by executing memory transactions concurrently with computation.
+	  Therefore a StreamType should be passed as a parameter to single-GPU functions.
+	  Refresher: CUDA function calls are non-blocking when a stream is explicitly passed
+	  as a parameter and commands executing in different streams can be processed
+	  in parallel/concurrently.
+
+
+Note on periodic boundaries (might be helpful when implementing other boundary conditions):
+
+	With multiple GPUs, periodic boundary conditions applied on indices ranging from
+
+		(0, 0, STENCIL_ORDER/2) to (subgrid.m.x, subgrid.m.y, subgrid.m.z - STENCIL_ORDER/2)
+
+	on a single device are "local", in the sense that they can be computed without having
+	to exchange data with neighboring GPUs. Special care is needed only for transferring
+	the data to the front and back plates outside this range. In the solution we use here,
+	we solve the local boundaries first, and then just exchange the front and back plates
+	in a "ring", like so
+				device_id
+		    (n) <-> 0 <-> 1 <-> ... <-> n <-> (0)
+
+
+// ======MIIKKANOTE END==========================================
+
+<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MIIKKANOTE: This code block was essentially
+                                                          moved into device.cu, function boundCondStep()
+                                                          In astaroth.cu, we use acBoundcondStep()
+                                                          just to distribute the work and manage
+                                                          communication between GPUs.
+
+    printf("Boundconds best dims (%d, %d, %d) %f ms\n", best_dims.x, best_dims.y, best_dims.z, double(best_time) / NUM_ITERATIONS);
+
+    exit(0);
+    #else
+
+
+        const int depth = (int)ceil(mesh_info.int_params[AC_mz]/(float)num_devices);
+
+        const int3 start = (int3){0, 0, device_id * depth};
+        const int3 end = (int3){mesh_info.int_params[AC_mx],
+                                mesh_info.int_params[AC_my],
+                                min((device_id+1) * depth, mesh_info.int_params[AC_mz])};
+
+        const dim3 tpb(8,2,8);
+
+        // TODO uses the default stream currently
+        if (mesh_info.int_params[AC_bc_type] == 666) { // TODO MAKE A BETTER SWITCH
+            wedge_boundconds(0, tpb, start, end, d_buffer);
+        } else {
+            for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i)
+                periodic_boundconds(0, tpb, start, end, d_buffer.in[i]);
+<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+*/
+        // Exchange halos: each device trades STENCIL_ORDER/2 z planes with device (i+1) % num_devices
+        for (int i = 0; i < num_devices; ++i) {
+            const int num_vertices = subgrid.m.x * subgrid.m.y * STENCIL_ORDER/2;
+            // ...|ooooxxx|... -> xxx|ooooooo|...
+            {
+                const int3 src = (int3) {0, 0, subgrid.n.z};
+                const int3 dst = (int3) {0, 0, 0};
+                copyMeshDeviceToDevice(devices[i], STREAM_PRIMARY, src, devices[(i+1) % num_devices], dst, num_vertices);
+            }
+            // ...|ooooooo|xxx <- ...|xxxoooo|...
+            {
+                const int3 src = (int3) {0, 0, STENCIL_ORDER/2};
+                const int3 dst = (int3) {0, 0, STENCIL_ORDER/2 + subgrid.n.z};
+                copyMeshDeviceToDevice(devices[(i+1) % num_devices], STREAM_PRIMARY, src, devices[i], dst, num_vertices);
+            }
+        }
+    }
+    acSynchronize(); // Make sure the boundary work issued above has finished before returning
+    return AC_SUCCESS;
+}
+
+/** Calls swapBuffers() on every device in use. Always returns AC_SUCCESS. */
+static AcResult
+acSwapBuffers(void)
+{
+    int device_idx = 0;
+    while (device_idx < num_devices) {
+        swapBuffers(devices[device_idx]);
+        ++device_idx;
+    }
+    return AC_SUCCESS;
+}
+
+/**
+ * Advances the simulation by running three substeps. Each substep applies the
+ * boundary conditions, integrates with step length dt and then swaps the
+ * buffers. Always returns AC_SUCCESS.
+ */
+AcResult
+acIntegrate(const AcReal& dt)
+{
+    const int num_substeps = 3;
+
+    int isubstep = 0;
+    while (isubstep < num_substeps) {
+        acBoundcondStep();
+        acIntegrateStep(isubstep, dt);
+        acSwapBuffers();
+        ++isubstep;
+    }
+    return AC_SUCCESS;
+}
+
+/** Not implemented yet: ignores its arguments and returns 0. */
+AcReal
+acReduceScal(const ReductionType& rtype,
+             const VertexBufferHandle& vtxbuffer_handle)
+{
+    // TODO
+    const AcReal placeholder = 0;
+    return placeholder;
+}
+
+/** Not implemented yet: ignores its arguments and returns 0. */
+AcReal
+acReduceVec(const ReductionType& rtype, const VertexBufferHandle& a,
+            const VertexBufferHandle& b, const VertexBufferHandle& c)
+{
+    // TODO
+    const AcReal placeholder = 0;
+    return placeholder;
+}
+
+/**
+ * Synchronizes every device in use with respect to STREAM_ALL.
+ * Always returns AC_SUCCESS.
+ */
+AcResult
+acSynchronize(void)
+{
+    int device_idx = 0;
+    while (device_idx < num_devices) {
+        synchronize(devices[device_idx], STREAM_ALL);
+        ++device_idx;
+    }
+
+    return AC_SUCCESS;
+}
--- a/src/core/device.cu
+++ b/src/core/device.cu
@@ -0,0 +1,309 @@
+/*
+    Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
+ * \brief Brief info.
+ *
+ * Detailed info.
+ *
+ */
+#include "device.cuh"
+
+#include "errchk.h"
+
+typedef struct {
+    AcReal* in[NUM_VTXBUF_HANDLES]; // One pointer per vertex buffer handle, allocated with cudaMalloc in createDevice(); source/target of the mesh copy functions
+    AcReal* out[NUM_VTXBUF_HANDLES]; // Counterpart of in[]; swapBuffers() exchanges in[i] and out[i]
+} VertexBufferArray;
+
+__constant__ AcMeshInfo d_mesh_info; // Written with cudaMemcpyToSymbol in createDevice()
+#define DCONST_INT(X)  (d_mesh_info.int_params[X]) // Integer parameter X read from d_mesh_info
+#define DCONST_REAL(X) (d_mesh_info.real_params[X]) // Real parameter X read from d_mesh_info
+#define DEVICE_VTXBUF_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_mx) + (k)*DCONST_INT(AC_mxy)) // Linear index of (i, j, k) with pitches AC_mx and AC_mxy
+#include "kernels/kernels.cuh"
+
+struct device_s {
+    int id; // CUDA device id; passed to cudaSetDevice() by the functions in this file
+    AcMeshInfo local_config; // Copy of the config given to createDevice(); used to compute indices into the local buffers
+
+    // Concurrency: one CUDA stream per StreamType below NUM_STREAM_TYPES, created in createDevice()
+    cudaStream_t streams[NUM_STREAM_TYPES];
+
+    // Memory: everything below is allocated with cudaMalloc in createDevice() and released in destroyDevice()
+    VertexBufferArray vba;
+    AcReal* reduce_scratchpad;
+    AcReal* reduce_result;
+};
+
+/**
+ * Prints the properties of the CUDA device behind `device` to stdout:
+ * identification, compute capabilities, memory figures and cache sizes.
+ *
+ * The device is made current with cudaSetDevice() first, because
+ * cudaMemGetInfo() reports the memory usage of the current device and would
+ * otherwise describe whichever device happened to be selected by the caller.
+ *
+ * @param device Handle of the device to describe.
+ * @return AC_FAILURE if the device cannot be selected or its properties
+ *         cannot be queried, AC_SUCCESS otherwise.
+ */
+AcResult
+printDeviceInfo(const Device device)
+{
+    const int device_id = device->id;
+
+    cudaDeviceProp props;
+    if (cudaSetDevice(device_id) != cudaSuccess ||
+        cudaGetDeviceProperties(&props, device_id) != cudaSuccess) {
+        // Without valid properties everything printed below would be garbage
+        return AC_FAILURE;
+    }
+    printf("--------------------------------------------------\n");
+    printf("Device Number: %d\n", device_id);
+    const size_t bus_id_max_len = 128;
+    char bus_id[bus_id_max_len];
+    cudaDeviceGetPCIBusId(bus_id, bus_id_max_len, device_id);
+    printf("  PCI bus ID: %s\n", bus_id);
+    printf("    Device name: %s\n", props.name);
+    printf("    Compute capability: %d.%d\n", props.major, props.minor);
+
+    // Compute
+    printf("  Compute\n");
+    printf("    Clock rate (GHz): %g\n", props.clockRate / 1e6); // KHz -> GHz
+    printf("    Stream processors: %d\n", props.multiProcessorCount);
+    printf("    SP to DP flops performance ratio: %d:1\n", props.singleToDoublePrecisionPerfRatio);
+    printf("    Compute mode: %d\n", (int)props.computeMode); // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g7eb25f5413a962faad0956d92bae10d0
+    // Memory
+    printf("  Global memory\n");
+    printf("    Memory Clock Rate (MHz): %d\n", props.memoryClockRate / (1000));
+    printf("    Memory Bus Width (bits): %d\n", props.memoryBusWidth);
+    printf("    Peak Memory Bandwidth (GiB/s): %f\n",
+           2 * (props.memoryClockRate * 1e3) * props.memoryBusWidth /
+               (8. * 1024. * 1024. * 1024.));
+    printf("    ECC enabled: %d\n", props.ECCEnabled);
+    // Memory usage of the current device, i.e. device_id (selected above)
+    size_t free_bytes, total_bytes;
+    cudaMemGetInfo(&free_bytes, &total_bytes);
+    const size_t used_bytes = total_bytes - free_bytes;
+    printf("    Total global mem: %.2f GiB\n",
+           props.totalGlobalMem / (1024.0 * 1024 * 1024));
+    printf("    Gmem used (GiB): %.2f\n", used_bytes / (1024.0 * 1024 * 1024));
+    printf("    Gmem memory free (GiB): %.2f\n",
+           free_bytes / (1024.0 * 1024 * 1024));
+    printf("    Gmem memory total (GiB): %.2f\n",
+           total_bytes / (1024.0 * 1024 * 1024));
+    printf("  Caches\n");
+    printf("    Local L1 cache supported: %d\n", props.localL1CacheSupported);
+    printf("    Global L1 cache supported: %d\n", props.globalL1CacheSupported);
+    printf("    L2 size: %d KiB\n", props.l2CacheSize / (1024));
+    // totalConstMem and sharedMemPerBlock are size_t, hence %zu
+    printf("    Total const mem: %zu KiB\n", props.totalConstMem / (1024));
+    printf("    Shared mem per block: %zu KiB\n",
+           props.sharedMemPerBlock / (1024));
+    printf("  Other\n");
+    printf("    Warp size: %d\n", props.warpSize);
+    // printf("    Single to double perf. ratio: %dx\n",
+    // props.singleToDoublePrecisionPerfRatio); //Not supported with older CUDA
+    // versions
+    printf("    Stream priorities supported: %d\n",
+           props.streamPrioritiesSupported);
+    printf("--------------------------------------------------\n");
+
+    return AC_SUCCESS;
+}
+
+/** Empty kernel that createDevice() launches to verify that the binary contains code the device can execute. */
+static __global__ void dummy_kernel(void) {}
+
+/**
+ * Creates a device handle for the CUDA device `id`.
+ *
+ * Selects and resets the CUDA device, allocates the host-side handle, runs a
+ * dummy kernel as an architecture check, creates NUM_STREAM_TYPES streams,
+ * allocates the input/output vertex buffers and the reduction buffers, and
+ * uploads device_config to the constant-memory symbol d_mesh_info.
+ *
+ * All failures are reported through the ERRCHK*_ALWAYS macros.
+ *
+ * @param id            CUDA device id.
+ * @param device_config Mesh configuration of the subdomain held by this device.
+ * @param device_handle Output; receives the newly created handle. Must not be NULL.
+ * @return AC_SUCCESS.
+ */
+AcResult
+createDevice(const int id, const AcMeshInfo device_config, Device* device_handle)
+{
+    ERRCHK_ALWAYS(device_handle);
+
+    ERRCHK_CUDA_ALWAYS(cudaSetDevice(id));
+    ERRCHK_CUDA_ALWAYS(cudaDeviceReset());
+
+    // Create Device
+    struct device_s* device = (struct device_s*) malloc(sizeof(*device));
+    ERRCHK_ALWAYS(device);
+
+    device->id = id;
+    device->local_config = device_config;
+
+    // Check that the code was compiled for the proper GPU architecture
+    printf("Trying to run a dummy kernel. If this fails, make sure that your\n"
+           "device supports the CUDA architecture you are compiling for.\n"
+           "Running dummy kernel... ");
+    fflush(stdout);
+    dummy_kernel<<<1, 1>>>();
+    ERRCHK_CUDA_KERNEL_ALWAYS();
+    printf("Success!\n");
+
+    // Concurrency
+    for (int i = 0; i < NUM_STREAM_TYPES; ++i) {
+        ERRCHK_CUDA_ALWAYS(cudaStreamCreate(&device->streams[i]));
+    }
+
+    // Memory
+    const size_t vba_size_bytes = AC_VTXBUF_SIZE_BYTES(device_config);
+    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
+        ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.in[i], vba_size_bytes));
+        ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.out[i], vba_size_bytes));
+    }
+    ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->reduce_scratchpad,
+                                  AC_VTXBUF_COMPDOMAIN_SIZE_BYTES(device_config)));
+    ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->reduce_result, sizeof(AcReal)));
+
+    // Device constants
+    ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_mesh_info, &device_config, sizeof(device_config), 0,
+                                          cudaMemcpyHostToDevice));
+
+    // %p expects a void pointer
+    printf("Created device %d (%p)\n", device->id, (void*)device);
+    *device_handle = device;
+    return AC_SUCCESS;
+}
+
+/**
+ * Releases everything owned by `device`: the vertex buffers, the reduction
+ * buffers, the streams and finally the handle itself.
+ * Always returns AC_SUCCESS.
+ */
+AcResult
+destroyDevice(Device device)
+{
+    cudaSetDevice(device->id);
+    printf("Destroying device %d (%p)\n", device->id, device);
+
+    // Device memory
+    int handle = 0;
+    while (handle < NUM_VTXBUF_HANDLES) {
+        cudaFree(device->vba.in[handle]);
+        cudaFree(device->vba.out[handle]);
+        ++handle;
+    }
+    cudaFree(device->reduce_scratchpad);
+    cudaFree(device->reduce_result);
+
+    // Streams
+    int stream_idx = 0;
+    while (stream_idx < NUM_STREAM_TYPES) {
+        cudaStreamDestroy(device->streams[stream_idx]);
+        ++stream_idx;
+    }
+
+    // The handle itself
+    free(device);
+    return AC_SUCCESS;
+}
+
+/**
+ * Calls periodic_boundconds() for the input buffer of every vertex buffer
+ * handle of `device`, on the stream selected by stream_type, with the range
+ * given by start and end. Always returns AC_SUCCESS.
+ */
+AcResult
+boundcondStep(const Device device, const StreamType stream_type, const int3& start, const int3& end)
+{
+    cudaSetDevice(device->id);
+
+    int handle = 0;
+    while (handle < NUM_VTXBUF_HANDLES) {
+        periodic_boundconds(device->streams[stream_type], start, end, device->vba.in[handle]);
+        ++handle;
+    }
+    return AC_SUCCESS;
+}
+
+/** Currently only selects the CUDA device of `device`; no reduction is performed. Returns AC_SUCCESS. */
+AcResult
+reduceScal(const Device device)
+{
+    const int device_id = device->id;
+    cudaSetDevice(device_id);
+    return AC_SUCCESS;
+}
+
+/** Currently only selects the CUDA device of `device`; no reduction is performed. Returns AC_SUCCESS. */
+AcResult
+reduceVec(const Device device)
+{
+    const int device_id = device->id;
+    cudaSetDevice(device_id);
+    return AC_SUCCESS;
+}
+
+/**
+ * Selects the CUDA device of `device` and hands substep step_number over to
+ * rk3_step_async() on the stream selected by stream_type, for the range given
+ * by start and end with step length dt. Always returns AC_SUCCESS.
+ */
+AcResult
+rkStep(const Device device, const StreamType stream_type, const int step_number,
+       const int3& start, const int3& end, const AcReal dt)
+{
+    cudaSetDevice(device->id);
+
+    VertexBufferArray* buffers = &device->vba;
+    rk3_step_async(device->streams[stream_type], step_number, start, end, dt, buffers);
+
+    return AC_SUCCESS;
+}
+
+/**
+ * Blocks until the work of `device` has finished: the whole device when
+ * stream_type is STREAM_ALL, otherwise only the stream selected by
+ * stream_type. Always returns AC_SUCCESS.
+ */
+AcResult
+synchronize(const Device device, const StreamType stream_type)
+{
+    cudaSetDevice(device->id);
+
+    const bool single_stream = (stream_type != STREAM_ALL);
+    if (single_stream) {
+        cudaStreamSynchronize(device->streams[stream_type]);
+        return AC_SUCCESS;
+    }
+
+    cudaDeviceSynchronize();
+    return AC_SUCCESS;
+}
+
+/**
+ * Issues an asynchronous host-to-device copy of `bytes` bytes from src to dst
+ * on the stream of `device` selected by stream_type. Returns AC_SUCCESS.
+ */
+static AcResult
+loadWithOffset(const Device device, const StreamType stream_type,
+               const AcReal* src, const size_t bytes, AcReal* dst)
+{
+    cudaSetDevice(device->id);
+
+    cudaStream_t stream = device->streams[stream_type];
+    ERRCHK_CUDA(cudaMemcpyAsync(dst, src, bytes, cudaMemcpyHostToDevice, stream));
+
+    return AC_SUCCESS;
+}
+
+/**
+ * Issues an asynchronous device-to-host copy of `bytes` bytes from src to dst
+ * on the stream of `device` selected by stream_type. Returns AC_SUCCESS.
+ */
+static AcResult
+storeWithOffset(const Device device, const StreamType stream_type,
+                const AcReal* src, const size_t bytes, AcReal* dst)
+{
+    cudaSetDevice(device->id);
+
+    cudaStream_t stream = device->streams[stream_type];
+    ERRCHK_CUDA(cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDeviceToHost, stream));
+
+    return AC_SUCCESS;
+}
+
+/**
+ * Copies num_vertices vertices of every vertex buffer from host_mesh to the
+ * input buffers of `device`. src is converted to a linear index with the host
+ * mesh info and dst with the local config of the device.
+ * Always returns AC_SUCCESS.
+ */
+AcResult
+copyMeshToDevice(const Device device, const StreamType stream_type,
+                 const AcMesh& host_mesh, const int3& src, const int3& dst,
+                 const int num_vertices)
+{
+    const size_t host_offset   = AC_VTXBUF_IDX(src.x, src.y, src.z, host_mesh.info);
+    const size_t device_offset = AC_VTXBUF_IDX(dst.x, dst.y, dst.z, device->local_config);
+    const size_t num_bytes     = num_vertices * sizeof(AcReal);
+
+    int handle = 0;
+    while (handle < NUM_VTXBUF_HANDLES) {
+        loadWithOffset(device, stream_type, &host_mesh.vertex_buffer[handle][host_offset],
+                       num_bytes, &device->vba.in[handle][device_offset]);
+        ++handle;
+    }
+    return AC_SUCCESS;
+}
+
+/**
+ * Copies num_vertices vertices of every vertex buffer from the input buffers
+ * of `device` to host_mesh. src is converted to a linear index with the local
+ * config of the device and dst with the host mesh info.
+ * Always returns AC_SUCCESS.
+ */
+AcResult
+copyMeshToHost(const Device device, const StreamType stream_type,
+               const int3& src, const int3& dst, const int num_vertices,
+               AcMesh* host_mesh)
+{
+    const size_t device_offset = AC_VTXBUF_IDX(src.x, src.y, src.z, device->local_config);
+    const size_t host_offset   = AC_VTXBUF_IDX(dst.x, dst.y, dst.z, host_mesh->info);
+    const size_t num_bytes     = num_vertices * sizeof(AcReal);
+
+    int handle = 0;
+    while (handle < NUM_VTXBUF_HANDLES) {
+        storeWithOffset(device, stream_type, &device->vba.in[handle][device_offset],
+                        num_bytes, &host_mesh->vertex_buffer[handle][host_offset]);
+        ++handle;
+    }
+    return AC_SUCCESS;
+}
+
+/**
+ * Copies num_vertices vertices of every vertex buffer from the input buffers
+ * of src_device to the input buffers of dst_device with cudaMemcpyPeerAsync,
+ * on the stream of src_device selected by stream_type. src and dst are
+ * converted to linear indices with the local config of the respective device.
+ * Always returns AC_SUCCESS.
+ */
+AcResult
+copyMeshDeviceToDevice(const Device src_device, const StreamType stream_type,
+                       const int3& src, Device dst_device, const int3& dst,
+                       const int num_vertices)
+{
+    cudaSetDevice(src_device->id);
+
+    const size_t src_offset = AC_VTXBUF_IDX(src.x, src.y, src.z, src_device->local_config);
+    const size_t dst_offset = AC_VTXBUF_IDX(dst.x, dst.y, dst.z, dst_device->local_config);
+    const size_t num_bytes  = sizeof(src_device->vba.in[0][0]) * num_vertices;
+
+    int handle = 0;
+    while (handle < NUM_VTXBUF_HANDLES) {
+        ERRCHK_CUDA(cudaMemcpyPeerAsync(&dst_device->vba.in[handle][dst_offset], dst_device->id,
+                                        &src_device->vba.in[handle][src_offset], src_device->id,
+                                        num_bytes, src_device->streams[stream_type]));
+        ++handle;
+    }
+    return AC_SUCCESS;
+}
+
+
+/**
+ * Exchanges the input and output pointer of every vertex buffer handle of
+ * `device`. Only the host-side pointers are swapped; no data is copied.
+ * Always returns AC_SUCCESS.
+ */
+AcResult
+swapBuffers(const Device device)
+{
+    cudaSetDevice(device->id);
+
+    int handle = 0;
+    while (handle < NUM_VTXBUF_HANDLES) {
+        AcReal* const previous_in = device->vba.in[handle];
+        device->vba.in[handle]    = device->vba.out[handle];
+        device->vba.out[handle]   = previous_in;
+        ++handle;
+    }
+    return AC_SUCCESS;
+}
--- a/src/core/device.cuh
+++ b/src/core/device.cuh
@@ -0,0 +1,82 @@
+/*
+    Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
+ * \brief Brief info.
+ *
+ * Detailed info.
+ *
+ */
+#pragma once
+#include "astaroth.h"
+
+/**
+ * Selects which stream of a device an operation is issued on.
+ *
+ * NUM_STREAM_TYPES is placed directly after the last real stream, so it
+ * evaluates to the number of enumerators listed before it. STREAM_ALL follows
+ * it and is therefore not included in that count; it is the value accepted by
+ * synchronize() to mean "all streams on the device".
+ */
+typedef enum {
+  STREAM_PRIMARY,
+  STREAM_SECONDARY,
+  NUM_STREAM_TYPES,
+  STREAM_ALL
+} StreamType;
+
+typedef struct device_s* Device; // Opaque pointer to device_s. Analogous to dispatchable handles
+                                 // in Vulkan, f.ex. VkDevice
+
+/** */
+AcResult printDeviceInfo(const Device device);
+
+/** */
+AcResult createDevice(const int id, const AcMeshInfo device_config, Device* device);
+
+/** */
+AcResult destroyDevice(Device device);
+
+/** */
+AcResult boundcondStep(const Device device, const StreamType stream_type,
+                       const int3& start, const int3& end);
+
+/** */
+AcResult reduceScal(const Device device);
+
+/** */
+AcResult reduceVec(const Device device);
+
+/** */
+AcResult rkStep(const Device device, const StreamType stream_type, const int step_number,
+                const int3& start, const int3& end, const AcReal dt);
+
+/** Synchronizes the device with respect to stream_type. If STREAM_ALL is given as
+    the StreamType, the function synchronizes all streams on the device. */
+AcResult synchronize(const Device device, const StreamType stream_type);
+
+/** */
+AcResult copyMeshToDevice(const Device device, const StreamType stream_type,
+                          const AcMesh& host_mesh, const int3& src, const int3& dst,
+                          const int num_vertices);
+
+/** Copies num_vertices consecutive elements of each vertex buffer from the device's
+    input buffers, starting at mesh coordinate src (device indexing), into host_mesh
+    starting at mesh coordinate dst (host_mesh indexing). */
+AcResult copyMeshToHost(const Device device, const StreamType stream_type,
+                        const int3& src, const int3& dst, const int num_vertices,
+                        AcMesh* host_mesh);
+
+/** Enqueues an asynchronous peer copy of num_vertices consecutive elements of each
+    vertex buffer from the input buffers of src (starting at mesh coordinate src_idx)
+    to the input buffers of dst (starting at mesh coordinate dst_idx). The copy is
+    issued on the stream of src selected by stream_type. */
+AcResult copyMeshDeviceToDevice(const Device src, const StreamType stream_type, const int3& src_idx,
+                                Device dst, const int3& dst_idx, const int num_vertices);
+
+/** Swaps the input/output buffers used in computations */
+AcResult swapBuffers(const Device device);
--- a/src/core/errchk.h
+++ b/src/core/errchk.h
@@ -0,0 +1,112 @@
+/*
+    Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
+ * \brief Brief info.
+ *
+ * Detailed info.
+ *
+ */
+#pragma once
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+// clang-format off
+/*
+ * =============================================================================
+ * General error checking
+ * =============================================================================
+ */
+/*
+ * ERROR(str): prints the current time followed by "Error in file <file> line
+ * <line>: <str>" to stderr, flushes stderr and terminates the process with
+ * exit(EXIT_FAILURE). The trailing abort() is never reached because exit()
+ * does not return.
+ * The macro expands to a brace-enclosed block, not a single statement, so take
+ * care when using it as the body of an if that has an else branch.
+ */
+#define ERROR(str) \
+{ \
+    time_t t; time(&t); \
+    fprintf(stderr, "%s", ctime(&t)); \
+    fprintf(stderr, "\tError in file %s line %d: %s\n", \
+                    __FILE__, __LINE__, str); \
+    fflush(stderr); \
+    exit(EXIT_FAILURE); \
+    abort(); \
+}
+
+/*
+ * WARNING(str): prints the current time followed by "Warning in file <file>
+ * line <line>: <str>" to stderr and flushes stderr. Execution continues
+ * afterwards. Like ERROR, it expands to a brace-enclosed block.
+ */
+#define WARNING(str) \
+{ \
+    time_t t; time(&t); \
+    fprintf(stderr, "%s", ctime(&t)); \
+    fprintf(stderr, "\tWarning in file %s line %d: %s\n", \
+                    __FILE__, __LINE__, str); \
+    fflush(stderr); \
+}
+
+// ERRCHK(retval):        calls ERROR (terminates) if retval evaluates to false.
+// WARNCHK(retval):       calls WARNING (continues) if retval evaluates to false.
+// ERRCHK_ALWAYS(retval): same as ERRCHK, but it is never redefined further down,
+//                        so it stays active in every build configuration.
+// Note that ERRCHK and WARNCHK are replaced by empty macros only when both
+// __CUDACC__ and NDEBUG are defined (see the CUDA-specific section below).
+// DO NOT REMOVE BRACKETS AROUND RETVAL. F.ex. if (!a < b) vs if (!(a < b)).
+#define ERRCHK(retval)  { if (!(retval)) ERROR(#retval " was false"); }
+#define WARNCHK(retval) { if (!(retval)) WARNING(#retval " was false"); }
+#define ERRCHK_ALWAYS(retval) { if (!(retval)) ERROR(#retval " was false"); }
+
+/*
+ * =============================================================================
+ * CUDA-specific error checking
+ * =============================================================================
+ */
+#ifdef __CUDACC__
+/**
+ * Reports a failed CUDA runtime call.
+ *
+ * Does nothing when code == cudaSuccess. Otherwise prints the current time and
+ * "CUDA error in file <file> line <line>: <error string>" to stderr and flushes
+ * it. If abort is true (the default), the process then exits with code as its
+ * exit status; if false, execution continues.
+ *
+ * @param code   Return value of the CUDA call being checked.
+ * @param file   Source file of the call site (normally __FILE__).
+ * @param line   Source line of the call site (normally __LINE__).
+ * @param abort  Whether to terminate the process on failure.
+ */
+static inline void
+cuda_assert(cudaError_t code, const char* file, int line, bool abort = true)
+{
+    if (code == cudaSuccess)
+        return;
+
+    // NOTE: the line-continuation backslashes that used to trail these
+    // statements were leftovers from a macro version of this code; in a regular
+    // function they only splice lines together and are an editing hazard.
+    time_t t;
+    time(&t);
+    fprintf(stderr, "%s", ctime(&t));
+    fprintf(stderr, "\tCUDA error in file %s line %d: %s\n",
+                    file, line, cudaGetErrorString(code));
+    fflush(stderr);
+
+    if (abort)
+        exit(code);
+}
+
+// Release builds (NDEBUG): the generic ERRCHK/WARNCHK become empty macros, so
+// their argument expression is NOT evaluated at all. The CUDA variants still
+// evaluate their argument (the CUDA call is executed) but discard the result,
+// and ERRCHK_CUDA_KERNEL() does nothing.
+// Debug builds: ERRCHK_CUDA terminates on failure, WARNCHK_CUDA only prints, and
+// ERRCHK_CUDA_KERNEL() checks the last error and then synchronizes the whole
+// device, which serializes otherwise asynchronous work.
+#ifdef NDEBUG
+    #undef ERRCHK
+    #undef WARNCHK
+    #define ERRCHK(params)
+    #define WARNCHK(params)
+    #define ERRCHK_CUDA(params) params;
+    #define WARNCHK_CUDA(params) params;
+    #define ERRCHK_CUDA_KERNEL() {}
+#else
+    #define ERRCHK_CUDA(params) { cuda_assert((params), __FILE__, __LINE__); }
+    #define WARNCHK_CUDA(params) { cuda_assert((params), __FILE__, __LINE__, false); }
+
+    #define ERRCHK_CUDA_KERNEL()                                               \
+    {                                                                          \
+        ERRCHK_CUDA(cudaPeekAtLastError());                                    \
+        ERRCHK_CUDA(cudaDeviceSynchronize());                                  \
+    }
+    #endif
+
+#endif
+
+// Variants that stay active regardless of NDEBUG.
+// They are defined outside the __CUDACC__ guard but expand to cuda_assert(),
+// which is only declared when __CUDACC__ is defined, so they can only be used
+// in translation units compiled as CUDA.
+#define ERRCHK_CUDA_ALWAYS(params) { cuda_assert((params), __FILE__, __LINE__); }
+
+// Checks the last recorded CUDA error and then blocks until the device is idle.
+#define ERRCHK_CUDA_KERNEL_ALWAYS()                                               \
+{                                                                          \
+    ERRCHK_CUDA_ALWAYS(cudaPeekAtLastError());                                    \
+    ERRCHK_CUDA_ALWAYS(cudaDeviceSynchronize());                                  \
+}
+// clang-format on
--- a/src/core/kernels/.gitignore
+++ b/src/core/kernels/.gitignore
@@ -0,0 +1,2 @@
+# Ignore the generated headers
+stencil_process.cuh stencil_assembly.cuh
--- a/src/core/kernels/boundconds.cuh
+++ b/src/core/kernels/boundconds.cuh
--- a/src/core/kernels/kernels.cuh
+++ b/src/core/kernels/kernels.cuh
@@ -0,0 +1,794 @@
+/*
+    Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
+ * \brief Brief info.
+ *
+ * Detailed info.
+ *
+ */
+ #pragma once
+
+/**
+ * Applies periodic boundary conditions to one vertex buffer.
+ *
+ * Each thread handles the cell at start + (global thread index). Threads whose
+ * cell lies outside [start, end) or inside the computational domain
+ * [AC_n*_min, AC_n*_max) exit immediately. Every remaining cell is overwritten
+ * with the value of the cell obtained by wrapping its coordinates modulo
+ * (AC_nx, AC_ny, AC_nz) back into the computational domain.
+ */
+__global__ void
+kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vertex_buffer)
+{
+    const int dst_x = start.x + threadIdx.x + blockIdx.x * blockDim.x;
+    const int dst_y = start.y + threadIdx.y + blockIdx.y * blockDim.y;
+    const int dst_z = start.z + threadIdx.z + blockIdx.z * blockDim.z;
+
+    // Surplus threads: the grid is rounded up, so it may overshoot the range
+    const bool past_end = dst_x >= end.x || dst_y >= end.y || dst_z >= end.z;
+    if (past_end)
+        return;
+
+    // Only ghost-zone cells are written; cells of the computational domain are skipped
+    const bool inside_x = dst_x >= DCONST_INT(AC_nx_min) && dst_x < DCONST_INT(AC_nx_max);
+    const bool inside_y = dst_y >= DCONST_INT(AC_ny_min) && dst_y < DCONST_INT(AC_ny_max);
+    const bool inside_z = dst_z >= DCONST_INT(AC_nz_min) && dst_z < DCONST_INT(AC_nz_max);
+    if (inside_x && inside_y && inside_z)
+        return;
+
+    // Shift to domain-local coordinates, add one period so that the modulo
+    // operand is not negative, wrap, and shift back to buffer coordinates.
+    const int src_x = (dst_x - DCONST_INT(AC_nx_min) + DCONST_INT(AC_nx)) % DCONST_INT(AC_nx) +
+                      DCONST_INT(AC_nx_min);
+    const int src_y = (dst_y - DCONST_INT(AC_ny_min) + DCONST_INT(AC_ny)) % DCONST_INT(AC_ny) +
+                      DCONST_INT(AC_ny_min);
+    const int src_z = (dst_z - DCONST_INT(AC_nz_min) + DCONST_INT(AC_nz)) % DCONST_INT(AC_nz) +
+                      DCONST_INT(AC_nz_min);
+
+    const int read_idx        = DEVICE_VTXBUF_IDX(src_x, src_y, src_z);
+    const int write_idx       = DEVICE_VTXBUF_IDX(dst_x, dst_y, dst_z);
+    vertex_buffer[write_idx]  = vertex_buffer[read_idx];
+}
+
+/**
+ * Launches kernel_periodic_boundconds on `stream` for the index range
+ * [start, end) of vertex_buffer, using 8x2x8 thread blocks and enough blocks
+ * per axis to cover the range (rounded up). The launch is followed by
+ * ERRCHK_CUDA_KERNEL().
+ */
+void
+periodic_boundconds(const cudaStream_t stream, const int3& start, const int3& end, AcReal* vertex_buffer)
+{
+    const dim3 threads_per_block(8, 2, 8);
+
+    const unsigned int blocks_x = (unsigned int)ceil((end.x - start.x) / (float)threads_per_block.x);
+    const unsigned int blocks_y = (unsigned int)ceil((end.y - start.y) / (float)threads_per_block.y);
+    const unsigned int blocks_z = (unsigned int)ceil((end.z - start.z) / (float)threads_per_block.z);
+    const dim3 blocks_per_grid(blocks_x, blocks_y, blocks_z);
+
+    kernel_periodic_boundconds<<<blocks_per_grid, threads_per_block, 0, stream>>>(start, end,
+                                                                                   vertex_buffer);
+    ERRCHK_CUDA_KERNEL();
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+#include <assert.h>
+
+
+/** Identity overload: the argument already is a linear index. */
+static __device__ __forceinline__ int
+IDX(const int i)
+{
+    const int linear = i;
+    return linear;
+}
+
+/** Linear vertex buffer index of the cell (i, j, k), via DEVICE_VTXBUF_IDX. */
+static __device__ __forceinline__ int
+IDX(const int i, const int j, const int k)
+{
+    const int linear = DEVICE_VTXBUF_IDX(i, j, k);
+    return linear;
+}
+
+/** Linear vertex buffer index of the cell (idx.x, idx.y, idx.z). */
+static __device__ __forceinline__ int
+IDX(const int3 idx)
+{
+    return IDX(idx.x, idx.y, idx.z);
+}
+
+/**
+ * Builds the 3x3 matrix that rotates a vector by `radians` about the z axis:
+ *
+ *     | cos  -sin   0 |
+ *     | sin   cos   0 |
+ *     |  0     0    1 |
+ *
+ * Host-only function. It is defined before sin/cos are remapped to device
+ * intrinsics further down in this file, so it uses the regular math functions.
+ */
+static __forceinline__ AcMatrix
+create_rotz(const AcReal radians)
+{
+    AcMatrix mat;
+
+    mat.row[0] = (AcReal3){cos(radians), -sin(radians), 0};
+    mat.row[1] = (AcReal3){sin(radians), cos(radians), 0};
+    // The z component must pass through unchanged. The previous all-zero row
+    // made the matrix singular and silently zeroed the z component of any
+    // vector it was applied to.
+    mat.row[2] = (AcReal3){0, 0, 1};
+
+    return mat;
+}
+
+
+// In single precision, remap the math functions used below to their float
+// intrinsics/variants. These are plain preprocessor macros, so EVERY later use
+// of the names sin, cos, exp and rsqrt in this translation unit is rewritten,
+// not only the ones in device code.
+#if AC_DOUBLE_PRECISION == 0
+#define sin __sinf
+#define cos __cosf
+#define exp __expf
+#define rsqrt rsqrtf // hardware reciprocal sqrt
+#endif // AC_DOUBLE_PRECISION == 0
+
+
+/*
+typedef struct {
+    int i, j, k;
+} int3;*/
+
+/*
+ * =============================================================================
+ * Level 0 (Input Assembly Stage)
+ * =============================================================================
+ */
+
+/*
+ * =============================================================================
+ * Level 0.1 (Read stencil elements and solve derivatives)
+ * =============================================================================
+ */
+/**
+ * Central finite-difference approximation of the first derivative.
+ *
+ * pencil holds STENCIL_ORDER + 1 samples along one line, with the evaluation
+ * point at index MID = STENCIL_ORDER / 2. inv_ds is the reciprocal of the
+ * sample spacing. The weights are selected at compile time by STENCIL_ORDER
+ * (2, 4, 6 or 8); index 0 of the table is unused.
+ */
+static __device__ __forceinline__ AcReal
+first_derivative(const AcReal* __restrict__ pencil, const AcReal inv_ds)
+{
+#if STENCIL_ORDER == 2
+    const AcReal weights[] = {0, 1.0 / 2.0};
+#elif STENCIL_ORDER == 4
+    const AcReal weights[] = {0, 2.0 / 3.0, -1.0 / 12.0};
+#elif STENCIL_ORDER == 6
+    const AcReal weights[] = {0, 3.0 / 4.0, -3.0 / 20.0, 1.0 / 60.0};
+#elif STENCIL_ORDER == 8
+    const AcReal weights[] = {0, 4.0 / 5.0, -1.0 / 5.0, 4.0 / 105.0,
+                              -1.0 / 280.0};
+#endif
+
+    #define MID (STENCIL_ORDER / 2)
+    AcReal sum = 0;
+
+#pragma unroll
+    for (int dist = 1; dist <= MID; ++dist) {
+        // Antisymmetric pair at distance dist from the centre
+        const AcReal difference = pencil[MID + dist] - pencil[MID - dist];
+        sum += weights[dist] * difference;
+    }
+
+    return sum * inv_ds;
+}
+
+/**
+ * Central finite-difference approximation of the second derivative.
+ *
+ * pencil holds STENCIL_ORDER + 1 samples along one line, with the evaluation
+ * point at index MID = STENCIL_ORDER / 2. inv_ds is the reciprocal of the
+ * sample spacing; the result is scaled by inv_ds squared. weights[0] applies to
+ * the centre sample, weights[d] to the symmetric pair at distance d.
+ */
+static __device__ __forceinline__ AcReal
+second_derivative(const AcReal* __restrict__ pencil, const AcReal inv_ds)
+{
+#if STENCIL_ORDER == 2
+    const AcReal weights[] = {-2., 1.};
+#elif STENCIL_ORDER == 4
+    const AcReal weights[] = {-5.0/2.0, 4.0/3.0, -1.0/12.0};
+#elif STENCIL_ORDER == 6
+    const AcReal weights[] = {-49.0 / 18.0, 3.0 / 2.0, -3.0 / 20.0,
+                              1.0 / 90.0};
+#elif STENCIL_ORDER == 8
+    const AcReal weights[] = {-205.0 / 72.0, 8.0 / 5.0, -1.0 / 5.0,
+                              8.0 / 315.0, -1.0 / 560.0};
+#endif
+
+    #define MID (STENCIL_ORDER / 2)
+    AcReal sum = weights[0] * pencil[MID];
+
+#pragma unroll
+    for (int dist = 1; dist <= MID; ++dist) {
+        // Symmetric pair at distance dist from the centre
+        const AcReal pair = pencil[MID + dist] + pencil[MID - dist];
+        sum += weights[dist] * pair;
+    }
+
+    return sum * inv_ds * inv_ds;
+}
+
+/**
+ * Central finite-difference approximation of a mixed second derivative
+ * d^2 f / (da db), computed from two diagonal pencils.
+ *
+ * pencil_a samples f along the diagonal where both coordinates increase
+ * together, pencil_b along the diagonal where one coordinate increases while
+ * the other decreases. Both hold STENCIL_ORDER + 1 samples centred on index
+ * MID = STENCIL_ORDER / 2.
+ *
+ * inv_ds_a, inv_ds_b: reciprocal mesh spacings along the two axes.
+ *
+ * The weight at distance d equals one quarter of the corresponding
+ * second-derivative weight, because
+ *   f(+d,+d) + f(-d,-d) - f(+d,-d) - f(-d,+d) = 4 d^2 ds_a ds_b f_ab + O(ds^4).
+ */
+static __device__ __forceinline__ AcReal
+cross_derivative(const AcReal* __restrict__ pencil_a,
+                 const AcReal* __restrict__ pencil_b, const AcReal inv_ds_a,
+                 const AcReal inv_ds_b)
+{
+#if STENCIL_ORDER == 2
+    const AcReal coefficients[] = {0, 1.0 / 4.0};
+#elif STENCIL_ORDER == 4
+    // (4/3) / 4 and (-1/12) / 4. These replace the earlier placeholder values,
+    // which did not satisfy the consistency condition sum(4 d^2 c_d) == 1.
+    const AcReal coefficients[] = {0, 1.0 / 3.0, -1.0 / 48.0};
+#elif STENCIL_ORDER == 6
+    const AcReal fac            = (1. / 720.);
+    const AcReal coefficients[] = {0.0 * fac, 270.0 * fac, -27.0 * fac,
+                                   2.0 * fac};
+#elif STENCIL_ORDER == 8
+    const AcReal fac            = (1. / 20160.);
+    const AcReal coefficients[] = {0.0 * fac, 8064. * fac, -1008. * fac,
+                                   128. * fac, -9. * fac};
+#endif
+
+    #define MID (STENCIL_ORDER / 2)
+    AcReal res    = AcReal(0.);
+
+    #pragma unroll
+    for (int i = 1; i <= MID; ++i) {
+        res += coefficients[i] * (pencil_a[MID + i] + pencil_a[MID - i] -
+                                  pencil_b[MID + i] - pencil_b[MID - i]);
+    }
+    return res * inv_ds_a * inv_ds_b;
+}
+
+/** First derivative of arr along x at vertexIdx, from a pencil of STENCIL_ORDER + 1 samples. */
+static __device__ __forceinline__ AcReal
+derx(const int3 vertexIdx, const AcReal* __restrict__ arr)
+{
+    const int radius = STENCIL_ORDER / 2;
+
+    AcReal samples[STENCIL_ORDER + 1];
+#pragma unroll
+    for (int n = 0; n <= STENCIL_ORDER; ++n)
+        samples[n] = arr[IDX(vertexIdx.x + n - radius, vertexIdx.y, vertexIdx.z)];
+
+    return first_derivative(samples, DCONST_REAL(AC_inv_dsx));
+}
+
+/** Second derivative of arr along x at vertexIdx, from a pencil of STENCIL_ORDER + 1 samples. */
+static __device__ __forceinline__ AcReal
+derxx(const int3 vertexIdx, const AcReal* __restrict__ arr)
+{
+    const int radius = STENCIL_ORDER / 2;
+
+    AcReal samples[STENCIL_ORDER + 1];
+#pragma unroll
+    for (int n = 0; n <= STENCIL_ORDER; ++n)
+        samples[n] = arr[IDX(vertexIdx.x + n - radius, vertexIdx.y, vertexIdx.z)];
+
+    return second_derivative(samples, DCONST_REAL(AC_inv_dsx));
+}
+
+/**
+ * Mixed xy derivative of arr at vertexIdx. Gathers one pencil along the
+ * diagonal (x and y increase together) and one along the anti-diagonal
+ * (x increases, y decreases) and passes both to cross_derivative().
+ */
+static __device__ __forceinline__ AcReal
+derxy(const int3 vertexIdx, const AcReal* __restrict__ arr)
+{
+    const int radius = STENCIL_ORDER / 2;
+
+    AcReal diagonal[STENCIL_ORDER + 1];
+#pragma unroll
+    for (int n = 0; n <= STENCIL_ORDER; ++n)
+        diagonal[n] = arr[IDX(vertexIdx.x + n - radius,
+                              vertexIdx.y + n - radius, vertexIdx.z)];
+
+    AcReal antidiagonal[STENCIL_ORDER + 1];
+#pragma unroll
+    for (int n = 0; n <= STENCIL_ORDER; ++n)
+        antidiagonal[n] = arr[IDX(vertexIdx.x + n - radius,
+                                  vertexIdx.y + radius - n, vertexIdx.z)];
+
+    return cross_derivative(diagonal, antidiagonal, DCONST_REAL(AC_inv_dsx),
+                            DCONST_REAL(AC_inv_dsy));
+}
+
+/**
+ * Mixed xz derivative of arr at vertexIdx. Gathers one pencil along the
+ * diagonal (x and z increase together) and one along the anti-diagonal
+ * (x increases, z decreases) and passes both to cross_derivative().
+ */
+static __device__ __forceinline__ AcReal
+derxz(const int3 vertexIdx, const AcReal* __restrict__ arr)
+{
+    const int radius = STENCIL_ORDER / 2;
+
+    AcReal diagonal[STENCIL_ORDER + 1];
+#pragma unroll
+    for (int n = 0; n <= STENCIL_ORDER; ++n)
+        diagonal[n] = arr[IDX(vertexIdx.x + n - radius, vertexIdx.y,
+                              vertexIdx.z + n - radius)];
+
+    AcReal antidiagonal[STENCIL_ORDER + 1];
+#pragma unroll
+    for (int n = 0; n <= STENCIL_ORDER; ++n)
+        antidiagonal[n] = arr[IDX(vertexIdx.x + n - radius, vertexIdx.y,
+                                  vertexIdx.z + radius - n)];
+
+    return cross_derivative(diagonal, antidiagonal, DCONST_REAL(AC_inv_dsx),
+                            DCONST_REAL(AC_inv_dsz));
+}
+
+/** First derivative of arr along y at vertexIdx, from a pencil of STENCIL_ORDER + 1 samples. */
+static __device__ __forceinline__ AcReal
+dery(const int3 vertexIdx, const AcReal* __restrict__ arr)
+{
+    const int radius = STENCIL_ORDER / 2;
+
+    AcReal samples[STENCIL_ORDER + 1];
+#pragma unroll
+    for (int n = 0; n <= STENCIL_ORDER; ++n)
+        samples[n] = arr[IDX(vertexIdx.x, vertexIdx.y + n - radius, vertexIdx.z)];
+
+    return first_derivative(samples, DCONST_REAL(AC_inv_dsy));
+}
+
+/** Second derivative of arr along y at vertexIdx, from a pencil of STENCIL_ORDER + 1 samples. */
+static __device__ __forceinline__ AcReal
+deryy(const int3 vertexIdx, const AcReal* __restrict__ arr)
+{
+    const int radius = STENCIL_ORDER / 2;
+
+    AcReal samples[STENCIL_ORDER + 1];
+#pragma unroll
+    for (int n = 0; n <= STENCIL_ORDER; ++n)
+        samples[n] = arr[IDX(vertexIdx.x, vertexIdx.y + n - radius, vertexIdx.z)];
+
+    return second_derivative(samples, DCONST_REAL(AC_inv_dsy));
+}
+
+/**
+ * Mixed yz derivative of arr at vertexIdx. Gathers one pencil along the
+ * diagonal (y and z increase together) and one along the anti-diagonal
+ * (y increases, z decreases) and passes both to cross_derivative().
+ */
+static __device__ __forceinline__ AcReal
+deryz(const int3 vertexIdx, const AcReal* __restrict__ arr)
+{
+    const int radius = STENCIL_ORDER / 2;
+
+    AcReal diagonal[STENCIL_ORDER + 1];
+#pragma unroll
+    for (int n = 0; n <= STENCIL_ORDER; ++n)
+        diagonal[n] = arr[IDX(vertexIdx.x, vertexIdx.y + n - radius,
+                              vertexIdx.z + n - radius)];
+
+    AcReal antidiagonal[STENCIL_ORDER + 1];
+#pragma unroll
+    for (int n = 0; n <= STENCIL_ORDER; ++n)
+        antidiagonal[n] = arr[IDX(vertexIdx.x, vertexIdx.y + n - radius,
+                                  vertexIdx.z + radius - n)];
+
+    return cross_derivative(diagonal, antidiagonal, DCONST_REAL(AC_inv_dsy),
+                            DCONST_REAL(AC_inv_dsz));
+}
+
+/** First derivative of arr along z at vertexIdx, from a pencil of STENCIL_ORDER + 1 samples. */
+static __device__ __forceinline__ AcReal
+derz(const int3 vertexIdx, const AcReal* __restrict__ arr)
+{
+    const int radius = STENCIL_ORDER / 2;
+
+    AcReal samples[STENCIL_ORDER + 1];
+#pragma unroll
+    for (int n = 0; n <= STENCIL_ORDER; ++n)
+        samples[n] = arr[IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z + n - radius)];
+
+    return first_derivative(samples, DCONST_REAL(AC_inv_dsz));
+}
+
+/** Second derivative of arr along z at vertexIdx, from a pencil of STENCIL_ORDER + 1 samples. */
+static __device__ __forceinline__ AcReal
+derzz(const int3 vertexIdx, const AcReal* __restrict__ arr)
+{
+    const int radius = STENCIL_ORDER / 2;
+
+    AcReal samples[STENCIL_ORDER + 1];
+#pragma unroll
+    for (int n = 0; n <= STENCIL_ORDER; ++n)
+        samples[n] = arr[IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z + n - radius)];
+
+    return second_derivative(samples, DCONST_REAL(AC_inv_dsz));
+}
+
+/*
+ * =============================================================================
+ * Level 0.2 (Caching functions)
+ * =============================================================================
+ */
+
+#include "stencil_assembly.cuh"
+
+/*
+typedef struct {
+    AcRealData x;
+    AcRealData y;
+    AcRealData z;
+} AcReal3Data;
+
+static __device__ __forceinline__ AcReal3Data
+read_data(const int i, const int j, const int k,
+          AcReal* __restrict__ buf[], const int3& handle)
+{
+    AcReal3Data data;
+
+    data.x = read_data(i, j, k, buf, handle.x);
+    data.y = read_data(i, j, k, buf, handle.y);
+    data.z = read_data(i, j, k, buf, handle.z);
+
+    return data;
+}
+*/
+
+/*
+ * =============================================================================
+ * Level 0.3 (Built-in functions available during the Stencil Processing Stage)
+ * =============================================================================
+ */
+
+/** Component-wise difference a - b. */
+static __host__ __device__ __forceinline__ AcReal3
+operator-(const AcReal3& a, const AcReal3& b)
+{
+    AcReal3 difference;
+    difference.x = a.x - b.x;
+    difference.y = a.y - b.y;
+    difference.z = a.z - b.z;
+    return difference;
+}
+
+/** Component-wise sum a + b. */
+static __host__ __device__ __forceinline__ AcReal3
+operator+(const AcReal3& a, const AcReal3& b)
+{
+    AcReal3 sum;
+    sum.x = a.x + b.x;
+    sum.y = a.y + b.y;
+    sum.z = a.z + b.z;
+    return sum;
+}
+
+/** Component-wise negation of a. */
+static __host__ __device__ __forceinline__ AcReal3
+operator-(const AcReal3& a)
+{
+    AcReal3 negated;
+    negated.x = -a.x;
+    negated.y = -a.y;
+    negated.z = -a.z;
+    return negated;
+}
+
+/** Scales every component of b by the scalar a. */
+static __host__  __device__ __forceinline__ AcReal3
+operator*(const AcReal a, const AcReal3& b)
+{
+    AcReal3 scaled;
+    scaled.x = a * b.x;
+    scaled.y = a * b.y;
+    scaled.z = a * b.z;
+    return scaled;
+}
+
+/** Dot product of a and b. */
+static __host__ __device__ __forceinline__ AcReal
+dot(const AcReal3& a, const AcReal3& b)
+{
+    const AcReal product = a.x * b.x + a.y * b.y + a.z * b.z;
+    return product;
+}
+
+/** Matrix-vector product aa * x: each result component is the dot product of one row with x. */
+static __host__ __device__ __forceinline__ AcReal3
+mul(const AcMatrix& aa, const AcReal3& x)
+{
+    AcReal3 product;
+    product.x = dot(aa.row[0], x);
+    product.y = dot(aa.row[1], x);
+    product.z = dot(aa.row[2], x);
+    return product;
+}
+
+/** Cross product a x b. */
+static __host__ __device__ __forceinline__ AcReal3
+cross(const AcReal3& a, const AcReal3& b)
+{
+    return (AcReal3){a.y * b.z - a.z * b.y,
+                     a.z * b.x - a.x * b.z,
+                     a.x * b.y - a.y * b.x};
+}
+
+/** True when a is neither NaN nor infinite. */
+static __host__ __device__ __forceinline__ bool
+is_valid(const AcReal a)
+{
+    if (isnan(a))
+        return false;
+
+    return !isinf(a);
+}
+
+/** True when every component of a is neither NaN nor infinite. */
+static __host__ __device__ __forceinline__ bool
+is_valid(const AcReal3& a)
+{
+    if (!is_valid(a.x))
+        return false;
+    if (!is_valid(a.y))
+        return false;
+
+    return is_valid(a.z);
+}
+
+
+/*
+ * =============================================================================
+ * Level 1 (Stencil Processing Stage)
+ * =============================================================================
+ */
+
+/*
+ * =============================================================================
+ * Level 1.1 (Terms)
+ * =============================================================================
+ */
+/** Laplacian of a scalar field: the sum of the diagonal entries of hessian(data). */
+static __device__ __forceinline__ AcReal
+laplace(const AcRealData& data)
+{
+    const AcReal d2_dx2 = hessian(data).row[0].x;
+    const AcReal d2_dy2 = hessian(data).row[1].y;
+    const AcReal d2_dz2 = hessian(data).row[2].z;
+
+    return d2_dx2 + d2_dy2 + d2_dz2;
+}
+
+/** Divergence of a vector field: d(vec.x)/dx + d(vec.y)/dy + d(vec.z)/dz. */
+static __device__ __forceinline__ AcReal
+divergence(const AcReal3Data& vec)
+{
+    const AcReal dvx_dx = gradient(vec.x).x;
+    const AcReal dvy_dy = gradient(vec.y).y;
+    const AcReal dvz_dz = gradient(vec.z).z;
+
+    return dvx_dx + dvy_dy + dvz_dz;
+}
+
+/** Component-wise Laplacian of a vector field. */
+static __device__ __forceinline__ AcReal3
+laplace_vec(const AcReal3Data& vec)
+{
+    AcReal3 result;
+    result.x = laplace(vec.x);
+    result.y = laplace(vec.y);
+    result.z = laplace(vec.z);
+    return result;
+}
+
+/** Curl of a vector field, assembled from the gradients of its components. */
+static __device__ __forceinline__ AcReal3
+curl(const AcReal3Data& vec)
+{
+    AcReal3 result;
+    result.x = gradient(vec.z).y - gradient(vec.y).z;
+    result.y = gradient(vec.x).z - gradient(vec.z).x;
+    result.z = gradient(vec.y).x - gradient(vec.x).y;
+    return result;
+}
+
+/**
+ * Gradient of the divergence of a vector field. Component i is the sum over j
+ * of the (i, j) entry of the Hessian of vec's j-th component.
+ */
+static __device__ __forceinline__ AcReal3
+gradient_of_divergence(const AcReal3Data& vec)
+{
+    AcReal3 result;
+    result.x = hessian(vec.x).row[0].x + hessian(vec.y).row[0].y + hessian(vec.z).row[0].z;
+    result.y = hessian(vec.x).row[1].x + hessian(vec.y).row[1].y + hessian(vec.z).row[1].z;
+    result.z = hessian(vec.x).row[2].x + hessian(vec.y).row[2].y + hessian(vec.z).row[2].z;
+    return result;
+}
+
+// Builds the symmetric matrix S from the gradients of the components of vec:
+//   off-diagonal: S_ij = 1/2 * (d_j vec_i + d_i vec_j)
+//   diagonal:     S_ii = 2/3 * d_i vec_i - 1/3 * (sum of the other two d_j vec_j)
+static __device__ __forceinline__ AcMatrix
+stress_tensor(const AcReal3Data& vec)
+{
+    AcMatrix stress;
+
+    // Diagonal entries
+    stress.row[0].x = AcReal(2. / 3.) * gradient(vec.x).x -
+                      AcReal(1. / 3.) * (gradient(vec.y).y + gradient(vec.z).z);
+    stress.row[1].y = AcReal(2. / 3.) * gradient(vec.y).y -
+                      AcReal(1. / 3.) * (gradient(vec.x).x + gradient(vec.z).z);
+    stress.row[2].z = AcReal(2. / 3.) * gradient(vec.z).z -
+                      AcReal(1. / 3.) * (gradient(vec.x).x + gradient(vec.y).y);
+
+    // Upper triangle
+    stress.row[0].y = AcReal(1. / 2.) * (gradient(vec.x).y + gradient(vec.y).x);
+    stress.row[0].z = AcReal(1. / 2.) * (gradient(vec.x).z + gradient(vec.z).x);
+    stress.row[1].z = AcReal(1. / 2.) * (gradient(vec.y).z + gradient(vec.z).y);
+
+    // Lower triangle mirrors the upper one
+    stress.row[1].x = stress.row[0].y;
+    stress.row[2].x = stress.row[0].z;
+    stress.row[2].y = stress.row[1].z;
+
+    return stress;
+}
+
+/** Sum of the squares of all nine entries of mat (each row dotted with itself). */
+static __device__ __forceinline__ AcReal
+contract(const AcMatrix& mat)
+{
+    AcReal sum = 0;
+
+    int r = 0;
+    #pragma unroll
+    while (r < 3) {
+        sum += dot(mat.row[r], mat.row[r]);
+        ++r;
+    }
+
+    return sum;
+}
+
+/*
+ * =============================================================================
+ * Level 1.2 (Equations)
+ * =============================================================================
+ */
+/** Euclidean length of vec. */
+static __device__ __forceinline__ AcReal
+length(const AcReal3& vec)
+{
+    const AcReal squared_length = vec.x * vec.x + vec.y * vec.y + vec.z * vec.z;
+    return sqrt(squared_length);
+}
+
+/** Reciprocal of the Euclidean length of vec, computed with rsqrt. */
+static __device__ __forceinline__ AcReal
+reciprocal_len(const AcReal3& vec)
+{
+    const AcReal squared_length = vec.x * vec.x + vec.y * vec.y + vec.z * vec.z;
+    return rsqrt(squared_length);
+}
+
+/** Returns vec scaled by the reciprocal of its length. */
+static __device__ __forceinline__ AcReal3
+normalized(const AcReal3& vec)
+{
+    return reciprocal_len(vec) * vec;
+}
+
+// Sinusoidal forcing
+// https://arxiv.org/pdf/1704.04676.pdf
+__constant__ AcReal3 forcing_vec;
+__constant__ AcReal forcing_phi;
+/**
+ * Evaluates the forcing vector at the mesh cell (i, j, k).
+ *
+ * k_vec is the position of the cell relative to the centre of the
+ * computational domain: the index offset from AC_n*_min times the grid spacing,
+ * minus half the domain size. The result is
+ *     inv_len^2 * waves * forcing_vec
+ * where inv_len = 1 / |k_vec| and waves is built from forcing_vec and
+ * forcing_phi, both read from constant memory.
+ */
+static __device__ __forceinline__ AcReal3
+forcing(const int i, const int j, const int k)
+{
+    // Extent of the computational domain along each axis: cell count times spacing
+    #define DOMAIN_SIZE_X (DCONST_INT(AC_nx) * DCONST_REAL(AC_dsx))
+    #define DOMAIN_SIZE_Y (DCONST_INT(AC_ny) * DCONST_REAL(AC_dsy))
+    #define DOMAIN_SIZE_Z (DCONST_INT(AC_nz) * DCONST_REAL(AC_dsz))
+    const AcReal3 k_vec = (AcReal3){(i - DCONST_INT(AC_nx_min)) * DCONST_REAL(AC_dsx) - AcReal(.5) * DOMAIN_SIZE_X,
+                                    (j - DCONST_INT(AC_ny_min)) * DCONST_REAL(AC_dsy) - AcReal(.5) * DOMAIN_SIZE_Y,
+                                    (k - DCONST_INT(AC_nz_min)) * DCONST_REAL(AC_dsz) - AcReal(.5) * DOMAIN_SIZE_Z};
+    AcReal inv_len = reciprocal_len(k_vec);
+    // A zero-length k_vec yields a non-finite reciprocal; treat it as no contribution
+    if (isnan(inv_len) || isinf(inv_len))
+        inv_len = 0;
+    // Clamp to at most 2 so cells close to the centre do not dominate
+    if (inv_len > 2) // hack to make it cool
+        inv_len = 2;
+    const AcReal k_dot_x = dot(k_vec, forcing_vec);
+
+    // Angle-addition identity: this equals cos(k_dot_x + forcing_phi).
+    // In single precision sin/cos are remapped to fast intrinsics in this file.
+    const AcReal waves = cos(k_dot_x)*cos(forcing_phi) - sin(k_dot_x) * sin(forcing_phi);
+
+    return inv_len * inv_len * waves * forcing_vec;
+}
+
+
+// Note: LNT0 and LNRHO0 must be set very carefully: if their magnitude differs from that of the other values in the mesh, we will inherently lose precision
+#define LNT0 (AcReal(0.0))
+#define LNRHO0 (AcReal(0.0))
+
+#define H_CONST (AcReal(0.0))
+#define C_CONST (AcReal(0.0))
+
+
+
+/**
+ * Advances one scalar by a single substep of the three-stage low-storage
+ * Runge-Kutta scheme, with the substep selected at compile time.
+ *
+ * With a = alpha[step_number + 1], b = beta[step_number + 1] and
+ * b_prev = beta[step_number]:
+ *   step 0:     state_current + b * rate_of_change * dt
+ *   step 1, 2:  state_current + b * (a / b_prev * (state_current - state_previous)
+ *                                    + rate_of_change * dt)
+ *   otherwise:  NAN
+ *
+ * Both tables carry a dummy entry at index 0, so the coefficient for substep s
+ * is stored at index s + 1.
+ */
+template <int step_number>
+static __device__ __forceinline__ AcReal
+rk3_integrate(const AcReal state_previous, const AcReal state_current,
+              const AcReal rate_of_change, const AcReal dt)
+{
+    // Williamson (1980)
+    const AcReal alpha[] = {0, AcReal(.0), AcReal(-5. / 9.), AcReal(-153. / 128.)};
+    const AcReal beta[]  = {0, AcReal(1. / 3.), AcReal(15. / 16.),
+                           AcReal(8. / 15.)};
+
+
+    // Note the indexing: +1 to avoid an unnecessary warning about "out-of-bounds"
+    // access (when accessing beta[step_number-1] even when step_number >= 1)
+    switch (step_number) {
+        case 0:
+            return state_current + beta[step_number + 1] * rate_of_change * dt;
+        case 1: // Fallthrough
+        case 2:
+            return state_current +
+               beta[step_number + 1] *
+                   (alpha[step_number + 1] * (AcReal(1.) / beta[step_number]) *
+                        (state_current - state_previous) +
+                    rate_of_change * dt);
+        default:
+            return NAN;
+    }
+}
+/*
+template <int step_number>
+static __device__ __forceinline__ AcReal
+rk3_integrate_scal(const AcReal state_previous, const AcReal state_current,
+              const AcReal rate_of_change, const AcReal dt)
+{
+    // Williamson (1980)
+    const AcReal alpha[] = {AcReal(.0), AcReal(-5. / 9.), AcReal(-153. / 128.)};
+    const AcReal beta[]  = {AcReal(1. / 3.), AcReal(15. / 16.),
+                           AcReal(8. / 15.)};
+
+
+    switch (step_number) {
+        case 0:
+            return state_current + beta[step_number] * rate_of_change * dt;
+        case 1: // Fallthrough
+        case 2:
+            return state_current +
+               beta[step_number] *
+                   (alpha[step_number] * (AcReal(1.) / beta[step_number - 1]) *
+                        (state_current - state_previous) +
+                    rate_of_change * dt);
+        default:
+            return NAN;
+    }
+}
+*/
+
+/** Vector overload: applies the scalar rk3_integrate<step_number> to each component. */
+template <int step_number>
+static __device__ __forceinline__ AcReal3
+rk3_integrate(const AcReal3 state_previous, const AcReal3 state_current,
+              const AcReal3 rate_of_change, const AcReal dt)
+{
+    AcReal3 next;
+    next.x = rk3_integrate<step_number>(state_previous.x, state_current.x, rate_of_change.x, dt);
+    next.y = rk3_integrate<step_number>(state_previous.y, state_current.y, rate_of_change.y, dt);
+    next.z = rk3_integrate<step_number>(state_previous.z, state_current.z, rate_of_change.z, dt);
+    return next;
+}
+
+// Convenience wrapper around rk3_integrate. It relies on two names being visible
+// at the point of expansion: the template parameter `step_number` and a
+// function `value()` applicable to state_current.
+#define rk3(state_previous, state_current, rate_of_change, dt)\
+rk3_integrate<step_number>(state_previous, value(state_current), rate_of_change, dt)
+
+/*
+template <int step_number>
+static __device__ __forceinline__ AcReal
+rk3_integrate(const int idx, const AcReal out, const int handle,
+              const AcRealData& in_cached, const AcReal rate_of_change, const AcReal dt)
+{
+    return rk3_integrate_scal<step_number>(out, value(in_cached), rate_of_change, dt);
+}
+
+template <int step_number>
+static __device__ __forceinline__ AcReal3
+rk3_integrate(const int idx, const AcReal3 out, const int3& handle,
+                  const AcReal3Data& in_cached, const AcReal3& rate_of_change, const AcReal dt)
+{
+    return (AcReal3) {
+        rk3_integrate<step_number>(idx, out, handle.x, in_cached.x, rate_of_change.x, dt),
+        rk3_integrate<step_number>(idx, out, handle.y, in_cached.y, rate_of_change.y, dt),
+        rk3_integrate<step_number>(idx, out, handle.z, in_cached.z, rate_of_change.z, dt)
+    };
+}
+
+#define RK3(handle, in_cached, rate_of_change, dt) \
+rk3_integrate<step_number>(idx, buffer.out, handle, in_cached, rate_of_change, dt)
+*/
+
+/*
+ * =============================================================================
+ * Level 1.3 (Kernels)
+ * =============================================================================
+ */
+
+/** Stores value at linear index idx of the buffer out[handle]. */
+static __device__ void
+write(AcReal* __restrict__ out[], const int handle, const int idx, const AcReal value)
+{
+    AcReal* const target = out[handle];
+    target[idx]          = value;
+}
+
+/** Stores the components of value at index idx of the buffers out[vec.x], out[vec.y], out[vec.z]. */
+static __device__ void
+write(AcReal* __restrict__ out[], const int3 vec, const int idx, const AcReal3 value)
+{
+    out[vec.x][idx] = value.x;
+    out[vec.y][idx] = value.y;
+    out[vec.z][idx] = value.z;
+}
+
+/** Loads the element at linear index idx of the buffer field[handle]. */
+static __device__ AcReal
+read_out(const int idx, AcReal* __restrict__ field[], const int handle)
+{
+    const AcReal* const source = field[handle];
+    return source[idx];
+}
+
+/** Loads one element each from field[handle.x], field[handle.y], field[handle.z] at index idx. */
+static __device__ AcReal3
+read_out(const int idx, AcReal* __restrict__ field[], const int3 handle)
+{
+    AcReal3 loaded;
+    loaded.x = read_out(idx, field, handle.x);
+    loaded.y = read_out(idx, field, handle.y);
+    loaded.z = read_out(idx, field, handle.z);
+    return loaded;
+}
+
+// Shorthands for use inside kernels. They expect the names `buffer`, `idx` and
+// `vertexIdx` to be in scope at the point of expansion (see the boilerplate
+// macros below, which introduce them).
+//   WRITE_OUT: stores value into buffer.out at idx
+//   READ:      reads from buffer.in at vertexIdx through read_data()
+//   READ_OUT:  loads from buffer.out at idx
+#define WRITE_OUT(handle, value) (write(buffer.out, handle, idx, value))
+#define READ(handle) (read_data(vertexIdx, buffer.in, handle))
+#define READ_OUT(handle) (read_out(idx, buffer.out, handle))
+
+// A WRITE helper could also be defined here for clarity (not for the DSL),
+//#define WRITE(HANDLE) (write(idx, ...)) s.t. we don't have to insert boilerplate in the middle of the function
+
+// Parameter list shared by the generated kernels: the index range [start, end)
+// to process and the vertex buffer array to operate on.
+#define GEN_KERNEL_PARAM_BOILERPLATE \
+        const int3 start, const int3 end, VertexBufferArray buffer
+
+// Prologue shared by the generated kernels. It
+//   1. computes vertexIdx = start + global thread index,
+//   2. returns early for threads at or beyond `end` on any axis,
+//   3. asserts that vertexIdx lies inside [AC_n*_min, AC_n*_max) on every axis,
+//   4. defines idx as the linear buffer index of vertexIdx.
+// Because of the early return it must be expanded at function scope of a
+// kernel returning void.
+#define GEN_KERNEL_BUILTIN_VARIABLES_BOILERPLATE() \
+        const int3 vertexIdx = (int3){threadIdx.x + blockIdx.x * blockDim.x + start.x,\
+                                                            threadIdx.y + blockIdx.y * blockDim.y + start.y,\
+                                                            threadIdx.z + blockIdx.z * blockDim.z + start.z};\
+        if (vertexIdx.x >= end.x || vertexIdx.y >= end.y || vertexIdx.z >= end.z)\
+            return;\
+\
+\
+        assert(vertexIdx.x < DCONST_INT(AC_nx_max) && vertexIdx.y < DCONST_INT(AC_ny_max) &&\
+               vertexIdx.z < DCONST_INT(AC_nz_max));\
+\
+        assert(vertexIdx.x >= DCONST_INT(AC_nx_min) && vertexIdx.y >= DCONST_INT(AC_ny_min) &&\
+               vertexIdx.z >= DCONST_INT(AC_nz_min));\
+\
+        const int idx          = IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z);
+
+#include "stencil_process.cuh"
+
+/*
+ * =============================================================================
+ * Level 2 (Host calls)
+ * =============================================================================
+ */
+
+/** Returns rand() scaled by RAND_MAX, i.e. a pseudo-random value in [0, 1]. */
+static AcReal
+randf(void)
+{
+    const AcReal sample  = AcReal(rand());
+    const AcReal maximum = AcReal(RAND_MAX);
+    return sample / maximum;
+}
+
+/**
+ * Launches one Runge-Kutta substep on `stream` for the index range [start, end).
+ *
+ * Uses 32x1x4 thread blocks and rounds the number of blocks per axis up so the
+ * whole range is covered. step_number 0 and 1 launch solve<0> and solve<1>;
+ * every other value launches solve<2>.
+ *
+ * When LFORCING is enabled, the forcing vector is first rotated about the z
+ * axis by a small random angle (the rotated vector persists between calls in a
+ * function-local static, so this function is not reentrant) and uploaded to
+ * constant memory together with the forcing phase.
+ *
+ * Always returns AC_SUCCESS; CUDA failures are reported through ERRCHK_CUDA and
+ * ERRCHK_CUDA_KERNEL.
+ */
+AcResult
+rk3_step_async(const cudaStream_t stream, const int& step_number, const int3& start, const int3& end,
+               const AcReal dt, VertexBufferArray* buffer)
+{
+    const dim3 tpb(32, 1, 4);
+    /////////////////// Forcing
+    #if LFORCING
+    const AcReal ff_scale = AcReal(.2);
+    static AcReal3 ff = ff_scale * (AcReal3){1, 0, 0};
+    const AcReal radians = randf() * AcReal(2*M_PI) / 360 / 8;
+    const AcMatrix rotz = create_rotz(radians);
+    ff = mul(rotz, ff);
+    // Check the uploads: a failed copy would otherwise leave stale forcing
+    // parameters on the device without any diagnostic pointing here.
+    ERRCHK_CUDA(cudaMemcpyToSymbolAsync(forcing_vec, &ff, sizeof(ff), 0, cudaMemcpyHostToDevice,
+                                        stream));
+
+    const AcReal ff_phi = AcReal(M_PI);//AcReal(2 * M_PI) * randf();
+    ERRCHK_CUDA(cudaMemcpyToSymbolAsync(forcing_phi, &ff_phi, sizeof(ff_phi), 0,
+                                        cudaMemcpyHostToDevice, stream));
+    #endif // LFORCING
+    //////////////////////////
+
+    const int nx = end.x - start.x;
+    const int ny = end.y - start.y;
+    const int nz = end.z - start.z;
+
+    // Round up so that ranges not divisible by the block size are fully covered
+    const dim3 bpg(
+        (unsigned int)ceil(nx / AcReal(tpb.x)),
+        (unsigned int)ceil(ny / AcReal(tpb.y)),
+        (unsigned int)ceil(nz / AcReal(tpb.z)));
+
+
+    if (step_number == 0)
+        solve<0><<<bpg, tpb, 0, stream>>>(start, end, *buffer, dt);
+    else if (step_number == 1)
+        solve<1><<<bpg, tpb, 0, stream>>>(start, end, *buffer, dt);
+    else
+        solve<2><<<bpg, tpb, 0, stream>>>(start, end, *buffer, dt);
+
+    ERRCHK_CUDA_KERNEL();
+    return AC_SUCCESS;
+}
--- a/src/core/kernels/reduce.cuh
+++ b/src/core/kernels/reduce.cuh
@@ -0,0 +1,338 @@
+/*
+    Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
+ * \brief Brief info.
+ *
+ * Detailed info.
+ *
+ */
+#pragma once
+#include "device_globals.cuh"
+
+#include "src/core/errchk.h"
+#include "src/core/math_utils.h"
+
+// Function pointer definitions
+typedef AcReal (*ReduceFunc)(const AcReal&, const AcReal&);
+typedef AcReal (*ReduceInitialScalFunc)(const AcReal&);
+typedef AcReal (*ReduceInitialVecFunc)(const AcReal&, const AcReal&,
+                                       const AcReal&);
+
+// clang-format off
+/* Comparison funcs */
+// Returns a when a > b, otherwise b
+__device__ inline AcReal
+_device_max(const AcReal& a, const AcReal& b)
+{
+    if (a > b)
+        return a;
+    return b;
+}
+
+// Returns a when a < b, otherwise b
+__device__ inline AcReal
+_device_min(const AcReal& a, const AcReal& b)
+{
+    if (a < b)
+        return a;
+    return b;
+}
+
+// Returns the sum a + b
+__device__ inline AcReal
+_device_sum(const AcReal& a, const AcReal& b)
+{
+    const AcReal total = a + b;
+    return total;
+}
+
+/* Function used to determine the values used during reduction */
+// Scalar pass-through: the value itself is what gets reduced
+__device__ inline AcReal
+_device_length_scal(const AcReal& a)
+{
+    const AcReal value = AcReal(a);
+    return value;
+}
+
+// Returns a squared
+__device__ inline AcReal
+_device_squared_scal(const AcReal& a)
+{
+    const AcReal squared = (AcReal)(a * a);
+    return squared;
+}
+
+// Returns exp(a) squared
+__device__ inline AcReal
+_device_exp_squared_scal(const AcReal& a)
+{
+    const AcReal lhs = exp(a);
+    const AcReal rhs = exp(a);
+    return lhs * rhs;
+}
+
+// Euclidean length of the vector (a, b, c)
+__device__ inline AcReal
+_device_length_vec(const AcReal& a, const AcReal& b, const AcReal& c)
+{
+    const AcReal squared_sum = a*a + b*b + c*c;
+    return sqrt(squared_sum);
+}
+
+// Sum of the squares of the three components
+__device__ inline AcReal
+_device_squared_vec(const AcReal& a, const AcReal& b, const AcReal& c)
+{
+    const AcReal aa = _device_squared_scal(a);
+    const AcReal bb = _device_squared_scal(b);
+    const AcReal cc = _device_squared_scal(c);
+    return aa + bb + cc;
+}
+
+// Sum of exp(component) squared over the three components
+__device__ inline AcReal
+_device_exp_squared_vec(const AcReal& a, const AcReal& b, const AcReal& c)
+{
+    const AcReal ea = _device_exp_squared_scal(a);
+    const AcReal eb = _device_exp_squared_scal(b);
+    const AcReal ec = _device_exp_squared_scal(c);
+    return ea + eb + ec;
+}
+// clang-format on
+
+// Returns true when any of (i, j, k) is at or beyond the corresponding
+// AC_nx / AC_ny / AC_nz entry of d_mesh_info. Only the upper bound is tested.
+__device__ inline bool
+oob(const int& i, const int& j, const int& k)
+{
+    const bool past_x = i >= d_mesh_info.int_params[AC_nx];
+    const bool past_y = j >= d_mesh_info.int_params[AC_ny];
+    const bool past_z = k >= d_mesh_info.int_params[AC_nz];
+
+    return past_x || past_y || past_z;
+}
+
+/**
+ * First reduction stage for scalar fields: one thread per vertex applies
+ * `reduce_initial` to its value in `src` and stores the result in `dst`.
+ *
+ * (i, j, k) is the thread's 3D global index. The source element is addressed
+ * with DEVICE_VTXBUF_IDX after adding the AC_n{x,y,z}_min offsets; the
+ * destination uses DEVICE_1D_COMPDOMAIN_IDX. Threads for which oob() is true
+ * write nothing.
+ */
+template <ReduceInitialScalFunc reduce_initial>
+__global__ void
+_kernel_reduce_scal(const __restrict__ AcReal* src, AcReal* dst)
+{
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    const int j = blockIdx.y * blockDim.y + threadIdx.y;
+    const int k = blockIdx.z * blockDim.z + threadIdx.z;
+
+    if (!oob(i, j, k)) {
+        const int read_at = DEVICE_VTXBUF_IDX(
+            i + d_mesh_info.int_params[AC_nx_min],
+            j + d_mesh_info.int_params[AC_ny_min],
+            k + d_mesh_info.int_params[AC_nz_min]);
+        const int write_at = DEVICE_1D_COMPDOMAIN_IDX(i, j, k);
+
+        dst[write_at] = reduce_initial(src[read_at]);
+    }
+}
+
+/**
+ * First reduction stage for vector fields: one thread per vertex combines the
+ * three components found at the same index of `src_a`, `src_b` and `src_c`
+ * with `reduce_initial` and stores the scalar result in `dst`.
+ *
+ * Indexing matches the scalar variant: the sources are addressed with
+ * DEVICE_VTXBUF_IDX after adding the AC_n{x,y,z}_min offsets, the destination
+ * with DEVICE_1D_COMPDOMAIN_IDX. Threads for which oob() is true write
+ * nothing.
+ */
+template <ReduceInitialVecFunc reduce_initial>
+__global__ void
+_kernel_reduce_vec(const __restrict__ AcReal* src_a,
+                   const __restrict__ AcReal* src_b,
+                   const __restrict__ AcReal* src_c, AcReal* dst)
+{
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    const int j = blockIdx.y * blockDim.y + threadIdx.y;
+    const int k = blockIdx.z * blockDim.z + threadIdx.z;
+
+    if (!oob(i, j, k)) {
+        const int read_at = DEVICE_VTXBUF_IDX(
+            i + d_mesh_info.int_params[AC_nx_min],
+            j + d_mesh_info.int_params[AC_ny_min],
+            k + d_mesh_info.int_params[AC_nz_min]);
+        const int write_at = DEVICE_1D_COMPDOMAIN_IDX(i, j, k);
+
+        const AcReal comp_a = src_a[read_at];
+        const AcReal comp_b = src_b[read_at];
+        const AcReal comp_c = src_c[read_at];
+        dst[write_at] = reduce_initial(comp_a, comp_b, comp_c);
+    }
+}
+
+///////////////////////////////////////////////////////////////////////////////
+#define BLOCK_SIZE (1024)
+#define ELEMS_PER_THREAD (32)
+
+/**
+ * Second reduction stage: reduces the scratchpad `src` in place, one partial
+ * result per thread block.
+ *
+ * Launch layout: 1D blocks of BLOCK_SIZE threads (the shared array is sized
+ * for exactly that). Block b owns the BLOCK_SIZE * ELEMS_PER_THREAD elements
+ * starting at b * BLOCK_SIZE * ELEMS_PER_THREAD; each thread first folds up to
+ * ELEMS_PER_THREAD elements strided by BLOCK_SIZE, then the block combines
+ * the per-thread partials through shared memory. Thread 0 stores the block's
+ * result at the first element of the block's range in `src`.
+ *
+ * Elements at or beyond DCONST_INT(AC_nxyz) are never read, and threads that
+ * own no element neither contribute to the result nor skip a barrier, so the
+ * scratchpad size does not have to be a multiple of the block's range.
+ *
+ * `result` is not used by this kernel.
+ */
+template <ReduceFunc reduce>
+__global__ void
+_kernel_reduce(AcReal* src, AcReal* result)
+{
+    const int idx = threadIdx.x + blockIdx.x * BLOCK_SIZE * ELEMS_PER_THREAD;
+    const int scratchpad_size = DCONST_INT(AC_nxyz);
+
+    __shared__ AcReal smem[BLOCK_SIZE];
+
+    // Threads [0, num_valid) of this block own at least one element. The
+    // remaining threads stay alive so that every thread reaches each barrier.
+    const int block_start = blockIdx.x * BLOCK_SIZE * ELEMS_PER_THREAD;
+    int num_valid         = scratchpad_size - block_start;
+    if (num_valid > BLOCK_SIZE)
+        num_valid = BLOCK_SIZE;
+    const int lane      = threadIdx.x;
+    const bool has_data = lane < num_valid;
+
+    AcReal tmp = AcReal(0); // Placeholder, only meaningful when has_data
+    if (has_data) {
+        tmp = src[idx];
+
+        for (int i = 1; i < ELEMS_PER_THREAD; ++i) {
+            const int src_idx = idx + i * BLOCK_SIZE;
+            // Indices only grow with i: stop at the end of the scratchpad
+            if (src_idx >= scratchpad_size)
+                break;
+            tmp = reduce(tmp, src[src_idx]);
+        }
+
+        smem[lane] = tmp;
+    }
+    __syncthreads();
+
+    // Tree reduction over the valid partials. The second condition keeps
+    // shared-memory slots that were never written out of the result.
+    for (int offset = BLOCK_SIZE / 2; offset > 0; offset /= 2) {
+        if (lane < offset && lane + offset < num_valid) {
+            tmp        = reduce(tmp, smem[lane + offset]);
+            smem[lane] = tmp;
+        }
+        __syncthreads();
+    }
+
+    if (lane == 0 && has_data)
+        src[idx] = tmp;
+}
+
+/**
+ * Final reduction stage: folds the per-block partial results that
+ * _kernel_reduce left in `src` and writes the outcome to `*result`.
+ *
+ * Starting from the thread's own index, every element spaced
+ * BLOCK_SIZE * ELEMS_PER_THREAD apart is combined until the index reaches
+ * DCONST_INT(AC_nxyz). The launches in this file use a single thread.
+ */
+template <ReduceFunc reduce>
+__global__ void
+_kernel_reduce_block(const __restrict__ AcReal* src, AcReal* result)
+{
+    const int stride = BLOCK_SIZE * ELEMS_PER_THREAD;
+    const int limit  = DCONST_INT(AC_nxyz);
+    const int first  = threadIdx.x + blockIdx.x * BLOCK_SIZE * ELEMS_PER_THREAD;
+
+    AcReal acc = src[first];
+    for (int pos = first + stride; pos < limit; pos += stride)
+        acc = reduce(acc, src[pos]);
+
+    *result = acc;
+}
+//////////////////////////////////////////////////////////////////////////////
+
+/**
+ * Reduces one scalar vertex buffer and returns the result on the host.
+ *
+ * Three kernels are enqueued on `stream`: a per-vertex transform into
+ * `reduce_scratchpad`, an in-place per-block reduction of the scratchpad, and
+ * a single-thread pass that stores the final value in `reduce_result`. The
+ * value is then copied back in stream order and `stream` is synchronized, so
+ * the call blocks until the result is available.
+ *
+ * nx, ny, nz only determine the launch dimensions and the mean divisor; the
+ * kernels bound-check against the device-side mesh constants.
+ *
+ * For RTYPE_RMS and RTYPE_RMS_EXP the returned value is the sum divided by
+ * nx * ny * nz; no square root is taken here.
+ */
+AcReal
+_reduce_scal(const cudaStream_t stream,
+             const ReductionType& rtype, const int& nx, const int& ny,
+             const int& nz, const AcReal* vertex_buffer,
+             AcReal* reduce_scratchpad, AcReal* reduce_result)
+{
+    bool solve_mean = false;
+
+    // Integer ceil-division: exact for any size, unlike rounding through a
+    // floating-point type
+    const dim3 tpb(32, 4, 1);
+    const dim3 bpg((nx + tpb.x - 1) / tpb.x,
+                   (ny + tpb.y - 1) / tpb.y,
+                   (nz + tpb.z - 1) / tpb.z);
+
+    const int scratchpad_size = nx * ny * nz;
+    const int elems_per_block = ELEMS_PER_THREAD * BLOCK_SIZE;
+    const int bpg2 = (scratchpad_size + elems_per_block - 1) / elems_per_block;
+
+    switch (rtype) {
+    case RTYPE_MAX:
+        _kernel_reduce_scal<_device_length_scal>
+            <<<bpg, tpb, 0, stream>>>(vertex_buffer, reduce_scratchpad);
+        _kernel_reduce<_device_max>
+            <<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
+        _kernel_reduce_block<_device_max>
+            <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
+        break;
+    case RTYPE_MIN:
+        _kernel_reduce_scal<_device_length_scal>
+            <<<bpg, tpb, 0, stream>>>(vertex_buffer, reduce_scratchpad);
+        _kernel_reduce<_device_min>
+            <<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
+        _kernel_reduce_block<_device_min>
+            <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
+        break;
+    case RTYPE_RMS:
+        _kernel_reduce_scal<_device_squared_scal>
+            <<<bpg, tpb, 0, stream>>>(vertex_buffer, reduce_scratchpad);
+        _kernel_reduce<_device_sum>
+            <<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
+        _kernel_reduce_block<_device_sum>
+            <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
+        solve_mean = true;
+        break;
+    case RTYPE_RMS_EXP:
+        _kernel_reduce_scal<_device_exp_squared_scal>
+            <<<bpg, tpb, 0, stream>>>(vertex_buffer, reduce_scratchpad);
+        _kernel_reduce<_device_sum>
+            <<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
+        _kernel_reduce_block<_device_sum>
+            <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
+        solve_mean = true;
+        break;
+    default:
+        ERROR("Unrecognized RTYPE");
+    }
+
+    // Fetch the result in stream order and wait for it. The CUDA calls are
+    // made outside ERRCHK so that they run even if the macro expands to
+    // nothing.
+    AcReal result;
+    const cudaError_t copy_status = cudaMemcpyAsync(&result, reduce_result, sizeof(AcReal),
+                                                    cudaMemcpyDeviceToHost, stream);
+    const cudaError_t sync_status = cudaStreamSynchronize(stream);
+    ERRCHK(copy_status == cudaSuccess);
+    ERRCHK(sync_status == cudaSuccess);
+    (void)copy_status;
+    (void)sync_status;
+
+    if (solve_mean) {
+        const AcReal inv_n = AcReal(1.0) / (nx * ny * nz);
+        return inv_n * result;
+    }
+    else {
+        return result;
+    }
+}
+
+/**
+ * Reduces a three-component vector field and returns the result on the host.
+ *
+ * Three kernels are enqueued on `stream`: a per-vertex transform of the three
+ * component buffers into `reduce_scratchpad`, an in-place per-block reduction
+ * of the scratchpad, and a single-thread pass that stores the final value in
+ * `reduce_result`. The value is then copied back in stream order and `stream`
+ * is synchronized, so the call blocks until the result is available.
+ *
+ * nx, ny, nz determine the launch dimensions, the precondition checks and the
+ * mean divisor; the kernels bound-check against the device-side mesh
+ * constants.
+ *
+ * For RTYPE_RMS and RTYPE_RMS_EXP the returned value is the sum divided by
+ * nx * ny * nz; no square root is taken here.
+ */
+AcReal
+_reduce_vec(const cudaStream_t stream,
+            const ReductionType& rtype, const int& nx, const int& ny,
+            const int& nz, const AcReal* vertex_buffer_a,
+            const AcReal* vertex_buffer_b, const AcReal* vertex_buffer_c,
+            AcReal* reduce_scratchpad, AcReal* reduce_result)
+{
+    bool solve_mean = false;
+
+    // Integer ceil-division: exact for any size, unlike rounding through float
+    const dim3 tpb(32, 4, 1);
+    const dim3 bpg((nx + tpb.x - 1) / tpb.x,
+                   (ny + tpb.y - 1) / tpb.y,
+                   (nz + tpb.z - 1) / tpb.z);
+
+    const int scratchpad_size = nx * ny * nz;
+    const int elems_per_block = ELEMS_PER_THREAD * BLOCK_SIZE;
+    const int bpg2 = (scratchpad_size + elems_per_block - 1) / elems_per_block;
+
+    // "Features" of this quick & efficient reduction:
+    // Block size must be smaller than the computational domain size
+    // (otherwise we would have do some additional bounds checking in the
+    // second half of _kernel_reduce, which gets quite confusing)
+    // Also the BLOCK_SIZE must be a multiple of two s.t. we can easily split
+    // the work without worrying too much about the array bounds.
+    ERRCHK(BLOCK_SIZE <= scratchpad_size);
+    ERRCHK(!(BLOCK_SIZE % 2));
+    // NOTE! Also does not work properly with non-power of two mesh dimension
+    // Issue is with "smem[BLOCK_SIZE];". If you init smem to NANs, you can
+    // see that uninitialized smem values are used in the comparison
+    ERRCHK(is_power_of_two(nx));
+    ERRCHK(is_power_of_two(ny));
+    ERRCHK(is_power_of_two(nz));
+
+    switch (rtype) {
+    case RTYPE_MAX:
+        _kernel_reduce_vec<_device_length_vec>
+            <<<bpg, tpb, 0, stream>>>(vertex_buffer_a, vertex_buffer_b, vertex_buffer_c,
+                           reduce_scratchpad);
+        _kernel_reduce<_device_max>
+            <<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
+        _kernel_reduce_block<_device_max>
+            <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
+        break;
+    case RTYPE_MIN:
+        _kernel_reduce_vec<_device_length_vec>
+            <<<bpg, tpb, 0, stream>>>(vertex_buffer_a, vertex_buffer_b, vertex_buffer_c,
+                           reduce_scratchpad);
+        _kernel_reduce<_device_min>
+            <<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
+        _kernel_reduce_block<_device_min>
+            <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
+        break;
+    case RTYPE_RMS:
+        _kernel_reduce_vec<_device_squared_vec>
+            <<<bpg, tpb, 0, stream>>>(vertex_buffer_a, vertex_buffer_b, vertex_buffer_c,
+                           reduce_scratchpad);
+        _kernel_reduce<_device_sum>
+            <<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
+        _kernel_reduce_block<_device_sum>
+            <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
+        solve_mean = true;
+        break;
+    case RTYPE_RMS_EXP:
+        _kernel_reduce_vec<_device_exp_squared_vec>
+            <<<bpg, tpb, 0, stream>>>(vertex_buffer_a, vertex_buffer_b, vertex_buffer_c,
+                           reduce_scratchpad);
+        _kernel_reduce<_device_sum>
+            <<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
+        _kernel_reduce_block<_device_sum>
+            <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
+        solve_mean = true;
+        break;
+    default:
+        ERROR("Unrecognized RTYPE");
+    }
+
+    // Fetch the result in stream order and wait for it. The CUDA calls are
+    // made outside ERRCHK so that they run even if the macro expands to
+    // nothing.
+    AcReal result;
+    const cudaError_t copy_status = cudaMemcpyAsync(&result, reduce_result, sizeof(AcReal),
+                                                    cudaMemcpyDeviceToHost, stream);
+    const cudaError_t sync_status = cudaStreamSynchronize(stream);
+    ERRCHK(copy_status == cudaSuccess);
+    ERRCHK(sync_status == cudaSuccess);
+    (void)copy_status;
+    (void)sync_status;
+
+    if (solve_mean) {
+        const AcReal inv_n = AcReal(1.0) / (nx * ny * nz);
+        return inv_n * result;
+    }
+    else {
+        return result;
+    }
+}
--- a/src/core/kernels/rk3.cuh
+++ b/src/core/kernels/rk3.cuh
@@ -0,0 +1,742 @@
+/*
+    Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
+ * \brief Implementation of the integration pipeline
+ *
+ *
+ *
+ */
+#pragma once
+#include "device_globals.cuh"
+
+#include <assert.h>
+
+/*
+#define RK_THREADS_X (32)
+#define RK_THREADS_Y (1)
+#define RK_THREADS_Z (4)
+#define RK_LAUNCH_BOUND_MIN_BLOCKS (4)
+#define RK_THREADBLOCK_SIZE (RK_THREADS_X * RK_THREADS_Y * RK_THREADS_Z)
+*/
+
+// Already-linear index: returned unchanged
+static __device__ __forceinline__ int
+IDX(const int i)
+{
+    const int linear = i;
+    return linear;
+}
+
+// 3D vertex coordinates -> linear index via DEVICE_VTXBUF_IDX
+static __device__ __forceinline__ int
+IDX(const int i, const int j, const int k)
+{
+    const int linear = DEVICE_VTXBUF_IDX(i, j, k);
+    return linear;
+}
+
+// Same mapping as the three-argument overload, taking the coordinates as int3
+static __device__ __forceinline__ int
+IDX(const int3 idx)
+{
+    const int linear = DEVICE_VTXBUF_IDX(idx.x, idx.y, idx.z);
+    return linear;
+}
+
+/**
+ * Builds the matrix of a rotation by `radians` about the z axis:
+ *
+ *     | cos -sin  0 |
+ *     | sin  cos  0 |
+ *     |  0    0   1 |
+ *
+ * The last row is (0, 0, 1) so that the z component of a rotated vector is
+ * preserved rather than zeroed. Host-only function.
+ */
+static __forceinline__ AcMatrix
+create_rotz(const AcReal radians)
+{
+    AcMatrix mat;
+
+    mat.row[0] = (AcReal3){cos(radians), -sin(radians), 0};
+    mat.row[1] = (AcReal3){sin(radians), cos(radians), 0};
+    mat.row[2] = (AcReal3){0, 0, 1};
+
+    return mat;
+}
+
+
+#if AC_DOUBLE_PRECISION == 0
+#define sin __sinf
+#define cos __cosf
+#define exp __expf
+#define rsqrt rsqrtf // hardware reciprocal sqrt
+#endif // AC_DOUBLE_PRECISION == 0
+
+
+/*
+typedef struct {
+    int i, j, k;
+} int3;*/
+
+/*
+ * =============================================================================
+ * Level 0 (Input Assembly Stage)
+ * =============================================================================
+ */
+
+/*
+ * =============================================================================
+ * Level 0.1 (Read stencil elements and solve derivatives)
+ * =============================================================================
+ */
+/**
+ * Central-difference first derivative along a pencil of STENCIL_ORDER + 1
+ * samples whose centre sits at pencil[STENCIL_ORDER / 2].
+ *
+ * Each symmetric pair contributes coefficient * (forward - backward); the sum
+ * is scaled by `inv_ds`, which callers in this file set to a reciprocal mesh
+ * spacing.
+ */
+static __device__ __forceinline__ AcReal
+first_derivative(const AcReal* __restrict__ pencil, const AcReal inv_ds)
+{
+#if STENCIL_ORDER == 2
+    const AcReal coefficients[] = {0, 1.0 / 2.0};
+#elif STENCIL_ORDER == 4
+    const AcReal coefficients[] = {0, 2.0 / 3.0, -1.0 / 12.0};
+#elif STENCIL_ORDER == 6
+    const AcReal coefficients[] = {0, 3.0 / 4.0, -3.0 / 20.0, 1.0 / 60.0};
+#elif STENCIL_ORDER == 8
+    const AcReal coefficients[] = {0, 4.0 / 5.0, -1.0 / 5.0, 4.0 / 105.0,
+                                   -1.0 / 280.0};
+#endif
+
+    #define MID (STENCIL_ORDER / 2)
+    AcReal acc = 0;
+
+#pragma unroll
+    for (int dist = 1; dist <= MID; ++dist) {
+        const AcReal forward  = pencil[MID + dist];
+        const AcReal backward = pencil[MID - dist];
+        acc += coefficients[dist] * (forward - backward);
+    }
+
+    return acc * inv_ds;
+}
+
+/**
+ * Central-difference second derivative along a pencil of STENCIL_ORDER + 1
+ * samples whose centre sits at pencil[STENCIL_ORDER / 2].
+ *
+ * The centre sample is weighted by coefficients[0]; each symmetric pair
+ * contributes coefficient * (forward + backward). The sum is scaled by
+ * `inv_ds` twice.
+ */
+static __device__ __forceinline__ AcReal
+second_derivative(const AcReal* __restrict__ pencil, const AcReal inv_ds)
+{
+#if STENCIL_ORDER == 2
+    const AcReal coefficients[] = {-2., 1.};
+#elif STENCIL_ORDER == 4
+    const AcReal coefficients[] = {-5.0/2.0, 4.0/3.0, -1.0/12.0};
+#elif STENCIL_ORDER == 6
+    const AcReal coefficients[] = {-49.0 / 18.0, 3.0 / 2.0, -3.0 / 20.0,
+                                   1.0 / 90.0};
+#elif STENCIL_ORDER == 8
+    const AcReal coefficients[] = {-205.0 / 72.0, 8.0 / 5.0, -1.0 / 5.0,
+                                   8.0 / 315.0, -1.0 / 560.0};
+#endif
+
+    #define MID (STENCIL_ORDER / 2)
+    AcReal acc = coefficients[0] * pencil[MID];
+
+#pragma unroll
+    for (int dist = 1; dist <= MID; ++dist) {
+        const AcReal forward  = pencil[MID + dist];
+        const AcReal backward = pencil[MID - dist];
+        acc += coefficients[dist] * (forward + backward);
+    }
+
+    return acc * inv_ds * inv_ds;
+}
+
+/**
+ * Mixed second derivative from two diagonal pencils of STENCIL_ORDER + 1
+ * samples each, both centred at index STENCIL_ORDER / 2.
+ *
+ * Callers in this file fill `pencil_a` by stepping both axes in the same
+ * direction and `pencil_b` by stepping them in opposite directions. For each
+ * distance i from the centre the contribution is
+ *
+ *     coefficients[i] * (a[+i] + a[-i] - b[+i] - b[-i])
+ *
+ * whose leading Taylor term is 4 * i^2 * ds_a * ds_b * d2f/(da db). The
+ * coefficients therefore satisfy sum(4 * i^2 * c_i) = 1 and equal the
+ * second-derivative coefficients of the same order divided by four.
+ *
+ * inv_ds_a, inv_ds_b: reciprocal mesh spacings along the two axes (callers
+ * pass e.g. DCONST_REAL(AC_inv_dsx) and DCONST_REAL(AC_inv_dsy)).
+ */
+static __device__ __forceinline__ AcReal
+cross_derivative(const AcReal* __restrict__ pencil_a,
+                 const AcReal* __restrict__ pencil_b, const AcReal inv_ds_a,
+                 const AcReal inv_ds_b)
+{
+#if STENCIL_ORDER == 2
+    const AcReal coefficients[] = {0, 1.0 / 4.0};
+#elif STENCIL_ORDER == 4
+    // (4/3) / 4 and (-1/12) / 4: the i^4 error terms cancel (1/3 - 16/48 = 0)
+    const AcReal coefficients[] = {0, 1.0 / 3.0, -1.0 / 48.0};
+#elif STENCIL_ORDER == 6
+    const AcReal fac            = (1. / 720.);
+    const AcReal coefficients[] = {0.0 * fac, 270.0 * fac, -27.0 * fac,
+                                   2.0 * fac};
+#elif STENCIL_ORDER == 8
+    const AcReal fac            = (1. / 20160.);
+    const AcReal coefficients[] = {0.0 * fac, 8064. * fac, -1008. * fac,
+                                   128. * fac, -9. * fac};
+#endif
+
+    #define MID (STENCIL_ORDER / 2)
+    AcReal res    = AcReal(0.);
+
+    #pragma unroll
+    for (int i = 1; i <= MID; ++i) {
+        res += coefficients[i] * (pencil_a[MID + i] + pencil_a[MID - i] -
+                                  pencil_b[MID + i] - pencil_b[MID - i]);
+    }
+    return res * inv_ds_a * inv_ds_b;
+}
+
+// First derivative of `arr` along x at vertexIdx: gathers the x pencil
+// centred on the vertex and hands it to first_derivative().
+static __device__ __forceinline__ AcReal
+derx(const int3 vertexIdx, const AcReal* __restrict__ arr)
+{
+    const int half = STENCIL_ORDER / 2;
+    AcReal samples[STENCIL_ORDER + 1];
+
+#pragma unroll
+    for (int n = 0; n <= STENCIL_ORDER; ++n)
+        samples[n] = arr[IDX(vertexIdx.x + n - half, vertexIdx.y, vertexIdx.z)];
+
+    return first_derivative(samples, DCONST_REAL(AC_inv_dsx));
+}
+
+// Second derivative of `arr` along x at vertexIdx: gathers the x pencil
+// centred on the vertex and hands it to second_derivative().
+static __device__ __forceinline__ AcReal
+derxx(const int3 vertexIdx, const AcReal* __restrict__ arr)
+{
+    const int half = STENCIL_ORDER / 2;
+    AcReal samples[STENCIL_ORDER + 1];
+
+#pragma unroll
+    for (int n = 0; n <= STENCIL_ORDER; ++n)
+        samples[n] = arr[IDX(vertexIdx.x + n - half, vertexIdx.y, vertexIdx.z)];
+
+    return second_derivative(samples, DCONST_REAL(AC_inv_dsx));
+}
+
+// Mixed xy derivative of `arr` at vertexIdx. The first pencil steps x and y
+// together, the second steps x forward while y steps backward; both go to
+// cross_derivative().
+static __device__ __forceinline__ AcReal
+derxy(const int3 vertexIdx, const AcReal* __restrict__ arr)
+{
+    const int half = STENCIL_ORDER / 2;
+
+    AcReal diag[STENCIL_ORDER + 1];
+#pragma unroll
+    for (int n = 0; n <= STENCIL_ORDER; ++n)
+        diag[n] = arr[IDX(vertexIdx.x + n - half,
+                          vertexIdx.y + n - half, vertexIdx.z)];
+
+    AcReal anti_diag[STENCIL_ORDER + 1];
+#pragma unroll
+    for (int n = 0; n <= STENCIL_ORDER; ++n)
+        anti_diag[n] = arr[IDX(vertexIdx.x + n - half,
+                               vertexIdx.y + half - n, vertexIdx.z)];
+
+    return cross_derivative(diag, anti_diag, DCONST_REAL(AC_inv_dsx),
+                            DCONST_REAL(AC_inv_dsy));
+}
+
+// Mixed xz derivative of `arr` at vertexIdx. The first pencil steps x and z
+// together, the second steps x forward while z steps backward; both go to
+// cross_derivative().
+static __device__ __forceinline__ AcReal
+derxz(const int3 vertexIdx, const AcReal* __restrict__ arr)
+{
+    const int half = STENCIL_ORDER / 2;
+
+    AcReal diag[STENCIL_ORDER + 1];
+#pragma unroll
+    for (int n = 0; n <= STENCIL_ORDER; ++n)
+        diag[n] = arr[IDX(vertexIdx.x + n - half, vertexIdx.y,
+                          vertexIdx.z + n - half)];
+
+    AcReal anti_diag[STENCIL_ORDER + 1];
+#pragma unroll
+    for (int n = 0; n <= STENCIL_ORDER; ++n)
+        anti_diag[n] = arr[IDX(vertexIdx.x + n - half, vertexIdx.y,
+                               vertexIdx.z + half - n)];
+
+    return cross_derivative(diag, anti_diag, DCONST_REAL(AC_inv_dsx),
+                            DCONST_REAL(AC_inv_dsz));
+}
+
+// First derivative of `arr` along y at vertexIdx: gathers the y pencil
+// centred on the vertex and hands it to first_derivative().
+static __device__ __forceinline__ AcReal
+dery(const int3 vertexIdx, const AcReal* __restrict__ arr)
+{
+    const int half = STENCIL_ORDER / 2;
+    AcReal samples[STENCIL_ORDER + 1];
+
+#pragma unroll
+    for (int n = 0; n <= STENCIL_ORDER; ++n)
+        samples[n] = arr[IDX(vertexIdx.x, vertexIdx.y + n - half, vertexIdx.z)];
+
+    return first_derivative(samples, DCONST_REAL(AC_inv_dsy));
+}
+
+// Second derivative of `arr` along y at vertexIdx: gathers the y pencil
+// centred on the vertex and hands it to second_derivative().
+static __device__ __forceinline__ AcReal
+deryy(const int3 vertexIdx, const AcReal* __restrict__ arr)
+{
+    const int half = STENCIL_ORDER / 2;
+    AcReal samples[STENCIL_ORDER + 1];
+
+#pragma unroll
+    for (int n = 0; n <= STENCIL_ORDER; ++n)
+        samples[n] = arr[IDX(vertexIdx.x, vertexIdx.y + n - half, vertexIdx.z)];
+
+    return second_derivative(samples, DCONST_REAL(AC_inv_dsy));
+}
+
+// Mixed yz derivative of `arr` at vertexIdx. The first pencil steps y and z
+// together, the second steps y forward while z steps backward; both go to
+// cross_derivative().
+static __device__ __forceinline__ AcReal
+deryz(const int3 vertexIdx, const AcReal* __restrict__ arr)
+{
+    const int half = STENCIL_ORDER / 2;
+
+    AcReal diag[STENCIL_ORDER + 1];
+#pragma unroll
+    for (int n = 0; n <= STENCIL_ORDER; ++n)
+        diag[n] = arr[IDX(vertexIdx.x, vertexIdx.y + n - half,
+                          vertexIdx.z + n - half)];
+
+    AcReal anti_diag[STENCIL_ORDER + 1];
+#pragma unroll
+    for (int n = 0; n <= STENCIL_ORDER; ++n)
+        anti_diag[n] = arr[IDX(vertexIdx.x, vertexIdx.y + n - half,
+                               vertexIdx.z + half - n)];
+
+    return cross_derivative(diag, anti_diag, DCONST_REAL(AC_inv_dsy),
+                            DCONST_REAL(AC_inv_dsz));
+}
+
+// First derivative of `arr` along z at vertexIdx: gathers the z pencil
+// centred on the vertex and hands it to first_derivative().
+static __device__ __forceinline__ AcReal
+derz(const int3 vertexIdx, const AcReal* __restrict__ arr)
+{
+    const int half = STENCIL_ORDER / 2;
+    AcReal samples[STENCIL_ORDER + 1];
+
+#pragma unroll
+    for (int n = 0; n <= STENCIL_ORDER; ++n)
+        samples[n] = arr[IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z + n - half)];
+
+    return first_derivative(samples, DCONST_REAL(AC_inv_dsz));
+}
+
+// Second derivative of `arr` along z at vertexIdx: gathers the z pencil
+// centred on the vertex and hands it to second_derivative().
+static __device__ __forceinline__ AcReal
+derzz(const int3 vertexIdx, const AcReal* __restrict__ arr)
+{
+    const int half = STENCIL_ORDER / 2;
+    AcReal samples[STENCIL_ORDER + 1];
+
+#pragma unroll
+    for (int n = 0; n <= STENCIL_ORDER; ++n)
+        samples[n] = arr[IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z + n - half)];
+
+    return second_derivative(samples, DCONST_REAL(AC_inv_dsz));
+}
+
+/*
+ * =============================================================================
+ * Level 0.2 (Caching functions)
+ * =============================================================================
+ */
+
+#include "stencil_assembly.cuh"
+
+/*
+typedef struct {
+    AcRealData x;
+    AcRealData y;
+    AcRealData z;
+} AcReal3Data;
+
+static __device__ __forceinline__ AcReal3Data
+read_data(const int i, const int j, const int k,
+          AcReal* __restrict__ buf[], const int3& handle)
+{
+    AcReal3Data data;
+
+    data.x = read_data(i, j, k, buf, handle.x);
+    data.y = read_data(i, j, k, buf, handle.y);
+    data.z = read_data(i, j, k, buf, handle.z);
+
+    return data;
+}
+*/
+
+/*
+ * =============================================================================
+ * Level 0.3 (Built-in functions available during the Stencil Processing Stage)
+ * =============================================================================
+ */
+
+// Component-wise difference a - b
+static __host__ __device__ __forceinline__ AcReal3
+operator-(const AcReal3& a, const AcReal3& b)
+{
+    AcReal3 diff;
+
+    diff.x = a.x - b.x;
+    diff.y = a.y - b.y;
+    diff.z = a.z - b.z;
+
+    return diff;
+}
+
+// Component-wise sum a + b
+static __host__ __device__ __forceinline__ AcReal3
+operator+(const AcReal3& a, const AcReal3& b)
+{
+    AcReal3 sum;
+
+    sum.x = a.x + b.x;
+    sum.y = a.y + b.y;
+    sum.z = a.z + b.z;
+
+    return sum;
+}
+
+// Component-wise negation
+static __host__ __device__ __forceinline__ AcReal3
+operator-(const AcReal3& a)
+{
+    AcReal3 negated;
+
+    negated.x = -a.x;
+    negated.y = -a.y;
+    negated.z = -a.z;
+
+    return negated;
+}
+
+// Scales every component of b by the scalar a
+static __host__  __device__ __forceinline__ AcReal3
+operator*(const AcReal a, const AcReal3& b)
+{
+    AcReal3 scaled;
+
+    scaled.x = a * b.x;
+    scaled.y = a * b.y;
+    scaled.z = a * b.z;
+
+    return scaled;
+}
+
+// Dot product of a and b, accumulated in x, y, z order
+static __host__ __device__ __forceinline__ AcReal
+dot(const AcReal3& a, const AcReal3& b)
+{
+    const AcReal product = a.x * b.x + a.y * b.y + a.z * b.z;
+    return product;
+}
+
+// Matrix-vector product: component n of the result is dot(aa.row[n], x)
+static __host__ __device__ __forceinline__ AcReal3
+mul(const AcMatrix& aa, const AcReal3& x)
+{
+    AcReal3 product;
+
+    product.x = dot(aa.row[0], x);
+    product.y = dot(aa.row[1], x);
+    product.z = dot(aa.row[2], x);
+
+    return product;
+}
+
+// Cross product a x b
+static __host__ __device__ __forceinline__ AcReal3
+cross(const AcReal3& a, const AcReal3& b)
+{
+    return (AcReal3){a.y * b.z - a.z * b.y,
+                     a.z * b.x - a.x * b.z,
+                     a.x * b.y - a.y * b.x};
+}
+
+// True when a is neither NaN nor infinite
+static __host__ __device__ __forceinline__ bool
+is_valid(const AcReal a)
+{
+    if (isnan(a))
+        return false;
+    if (isinf(a))
+        return false;
+    return true;
+}
+
+// True when all three components pass the scalar is_valid() check
+static __host__ __device__ __forceinline__ bool
+is_valid(const AcReal3& a)
+{
+    if (!is_valid(a.x))
+        return false;
+    if (!is_valid(a.y))
+        return false;
+    return is_valid(a.z);
+}
+
+
+/*
+ * =============================================================================
+ * Level 1 (Stencil Processing Stage)
+ * =============================================================================
+ */
+
+/*
+ * =============================================================================
+ * Level 1.1 (Terms)
+ * =============================================================================
+ */
+// Laplacian of a scalar field: the sum of the diagonal entries of hessian(data)
+static __device__ __forceinline__ AcReal
+laplace(const AcRealData& data)
+{
+    const AcReal d_xx = hessian(data).row[0].x;
+    const AcReal d_yy = hessian(data).row[1].y;
+    const AcReal d_zz = hessian(data).row[2].z;
+
+    return d_xx + d_yy + d_zz;
+}
+
+// Divergence of a vector field: d(vec.x)/dx + d(vec.y)/dy + d(vec.z)/dz,
+// each term taken from gradient() of the matching component
+static __device__ __forceinline__ AcReal
+divergence(const AcReal3Data& vec)
+{
+    const AcReal dx_x = gradient(vec.x).x;
+    const AcReal dy_y = gradient(vec.y).y;
+    const AcReal dz_z = gradient(vec.z).z;
+
+    return dx_x + dy_y + dz_z;
+}
+
+// Component-wise Laplacian of a vector field
+static __device__ __forceinline__ AcReal3
+laplace_vec(const AcReal3Data& vec)
+{
+    AcReal3 lap;
+
+    lap.x = laplace(vec.x);
+    lap.y = laplace(vec.y);
+    lap.z = laplace(vec.z);
+
+    return lap;
+}
+
+// Curl of a vector field, assembled from gradient() of each component
+static __device__ __forceinline__ AcReal3
+curl(const AcReal3Data& vec)
+{
+    AcReal3 rot;
+
+    rot.x = gradient(vec.z).y - gradient(vec.y).z;
+    rot.y = gradient(vec.x).z - gradient(vec.z).x;
+    rot.z = gradient(vec.y).x - gradient(vec.x).y;
+
+    return rot;
+}
+
+// Gradient of the divergence of a vector field. Component n of the result
+// sums, over the three field components, the entries of row n of each
+// component's hessian(): column x for vec.x, y for vec.y, z for vec.z.
+static __device__ __forceinline__ AcReal3
+gradient_of_divergence(const AcReal3Data& vec)
+{
+    AcReal3 grad_div;
+
+    grad_div.x = hessian(vec.x).row[0].x + hessian(vec.y).row[0].y + hessian(vec.z).row[0].z;
+    grad_div.y = hessian(vec.x).row[1].x + hessian(vec.y).row[1].y + hessian(vec.z).row[1].z;
+    grad_div.z = hessian(vec.x).row[2].x + hessian(vec.y).row[2].y + hessian(vec.z).row[2].z;
+
+    return grad_div;
+}
+
+// Builds the symmetric matrix S from the gradients of the three components of
+// `vec`: each diagonal entry is 2/3 of its own derivative minus 1/3 of the
+// other two diagonal derivatives, each off-diagonal entry is half the sum of
+// the two matching cross derivatives.
+static __device__ __forceinline__ AcMatrix
+stress_tensor(const AcReal3Data& vec)
+{
+    // d<component>_<axis>: derivative of that component along that axis
+    const AcReal dx_x = gradient(vec.x).x;
+    const AcReal dx_y = gradient(vec.x).y;
+    const AcReal dx_z = gradient(vec.x).z;
+    const AcReal dy_x = gradient(vec.y).x;
+    const AcReal dy_y = gradient(vec.y).y;
+    const AcReal dy_z = gradient(vec.y).z;
+    const AcReal dz_x = gradient(vec.z).x;
+    const AcReal dz_y = gradient(vec.z).y;
+    const AcReal dz_z = gradient(vec.z).z;
+
+    AcMatrix S;
+
+    // Diagonal
+    S.row[0].x = AcReal(2. / 3.) * dx_x - AcReal(1. / 3.) * (dy_y + dz_z);
+    S.row[1].y = AcReal(2. / 3.) * dy_y - AcReal(1. / 3.) * (dx_x + dz_z);
+    S.row[2].z = AcReal(2. / 3.) * dz_z - AcReal(1. / 3.) * (dx_x + dy_y);
+
+    // Upper triangle
+    S.row[0].y = AcReal(1. / 2.) * (dx_y + dy_x);
+    S.row[0].z = AcReal(1. / 2.) * (dx_z + dz_x);
+    S.row[1].z = AcReal(1. / 2.) * (dy_z + dz_y);
+
+    // Lower triangle mirrors the upper one
+    S.row[1].x = S.row[0].y;
+    S.row[2].x = S.row[0].z;
+    S.row[2].y = S.row[1].z;
+
+    return S;
+}
+
+// Sum of the squares of all nine matrix entries, accumulated row by row
+static __device__ __forceinline__ AcReal
+contract(const AcMatrix& mat)
+{
+    AcReal total = 0;
+
+    total += dot(mat.row[0], mat.row[0]);
+    total += dot(mat.row[1], mat.row[1]);
+    total += dot(mat.row[2], mat.row[2]);
+
+    return total;
+}
+
+/*
+ * =============================================================================
+ * Level 1.2 (Equations)
+ * =============================================================================
+ */
+// Euclidean length of vec
+static __device__ __forceinline__ AcReal
+length(const AcReal3& vec)
+{
+    const AcReal squared_len = vec.x * vec.x + vec.y * vec.y + vec.z * vec.z;
+    return sqrt(squared_len);
+}
+
+// Reciprocal of the Euclidean length of vec, computed with rsqrt()
+static __device__ __forceinline__ AcReal
+reciprocal_len(const AcReal3& vec)
+{
+    const AcReal squared_len = vec.x * vec.x + vec.y * vec.y + vec.z * vec.z;
+    return rsqrt(squared_len);
+}
+
+// Scales vec by the reciprocal of its length. No special handling of a
+// zero-length input is done here.
+static __device__ __forceinline__ AcReal3
+normalized(const AcReal3& vec)
+{
+    const AcReal scale = reciprocal_len(vec);
+
+    return (AcReal3){scale * vec.x, scale * vec.y, scale * vec.z};
+}
+
+// Sinusoidal forcing
+// https://arxiv.org/pdf/1704.04676.pdf
+//
+// forcing_vec and forcing_phi are uploaded from the host before the kernel
+// launch. The function returns forcing_vec scaled by a wave term and by the
+// clamped squared reciprocal distance of vertex (i, j, k) from the centre of
+// the domain.
+__constant__ AcReal3 forcing_vec;
+__constant__ AcReal forcing_phi;
+static __device__ __forceinline__ AcReal3
+forcing(const int i, const int j, const int k)
+{
+    #define DOMAIN_SIZE_X (DCONST_INT(AC_nx) * DCONST_REAL(AC_dsx))
+    #define DOMAIN_SIZE_Y (DCONST_INT(AC_ny) * DCONST_REAL(AC_dsy))
+    #define DOMAIN_SIZE_Z (DCONST_INT(AC_nz) * DCONST_REAL(AC_dsz))
+
+    // Position of the vertex relative to the centre of the domain
+    AcReal3 k_vec;
+    k_vec.x = (i - DCONST_INT(AC_nx_min)) * DCONST_REAL(AC_dsx) - AcReal(.5) * DOMAIN_SIZE_X;
+    k_vec.y = (j - DCONST_INT(AC_ny_min)) * DCONST_REAL(AC_dsy) - AcReal(.5) * DOMAIN_SIZE_Y;
+    k_vec.z = (k - DCONST_INT(AC_nz_min)) * DCONST_REAL(AC_dsz) - AcReal(.5) * DOMAIN_SIZE_Z;
+
+    // Invalid reciprocals become 0, anything above 2 is clamped to 2
+    AcReal inv_len = reciprocal_len(k_vec);
+    if (isnan(inv_len) || isinf(inv_len))
+        inv_len = 0;
+    else if (inv_len > 2) // hack to make it cool
+        inv_len = 2;
+
+    const AcReal phase = dot(k_vec, forcing_vec);
+
+    // Expanded form of cos(phase + forcing_phi)
+    const AcReal waves = cos(phase)*cos(forcing_phi) - sin(phase) * sin(forcing_phi);
+
+    return inv_len * inv_len * waves * forcing_vec;
+}
+
+
+// Note: LNT0 and LNRHO0 must be set very carefully: if the magnitude is different that other values in the mesh, then we will inherently lose precision
+#define LNT0 (AcReal(0.0))
+#define LNRHO0 (AcReal(0.0))
+
+#define H_CONST (AcReal(0.0))
+#define C_CONST (AcReal(0.0))
+
+
+
+/**
+ * One substep of the low-storage third-order Runge-Kutta scheme
+ * (Williamson 1980) for a single scalar.
+ *
+ * Substep 0 advances state_current by beta * rate_of_change * dt. Substeps 1
+ * and 2 additionally reuse the previous increment, recovered as
+ * (state_current - state_previous) divided by the previous substep's beta and
+ * weighted by alpha. Any other step_number yields NAN.
+ */
+template <int step_number>
+static __device__ __forceinline__ AcReal
+rk3_integrate(const AcReal state_previous, const AcReal state_current,
+              const AcReal rate_of_change, const AcReal dt)
+{
+    // Williamson (1980)
+    // Both tables carry a leading 0 so that substep s reads entry s + 1 and
+    // the previous substep's beta is entry s; this avoids a spurious
+    // "out-of-bounds" warning for index step_number - 1.
+    const AcReal alpha[] = {0, AcReal(.0), AcReal(-5. / 9.), AcReal(-153. / 128.)};
+    const AcReal beta[]  = {0, AcReal(1. / 3.), AcReal(15. / 16.),
+                           AcReal(8. / 15.)};
+
+    if (step_number == 0)
+        return state_current + beta[step_number + 1] * rate_of_change * dt;
+
+    if (step_number == 1 || step_number == 2) {
+        const AcReal inv_prev_beta = AcReal(1.) / beta[step_number];
+        return state_current +
+               beta[step_number + 1] *
+                   (alpha[step_number + 1] * inv_prev_beta *
+                        (state_current - state_previous) +
+                    rate_of_change * dt);
+    }
+
+    return NAN;
+}
+/*
+template <int step_number>
+static __device__ __forceinline__ AcReal
+rk3_integrate_scal(const AcReal state_previous, const AcReal state_current,
+              const AcReal rate_of_change, const AcReal dt)
+{
+    // Williamson (1980)
+    const AcReal alpha[] = {AcReal(.0), AcReal(-5. / 9.), AcReal(-153. / 128.)};
+    const AcReal beta[]  = {AcReal(1. / 3.), AcReal(15. / 16.),
+                           AcReal(8. / 15.)};
+
+
+    switch (step_number) {
+        case 0:
+            return state_current + beta[step_number] * rate_of_change * dt;
+        case 1: // Fallthrough
+        case 2:
+            return state_current +
+               beta[step_number] *
+                   (alpha[step_number] * (AcReal(1.) / beta[step_number - 1]) *
+                        (state_current - state_previous) +
+                    rate_of_change * dt);
+        default:
+            return NAN;
+    }
+}
+*/
+
+/**
+ * Three-component overload: applies the scalar rk3_integrate<step_number>
+ * independently to the x, y and z members, sharing the same dt.
+ */
+template <int step_number>
+static __device__ __forceinline__ AcReal3
+rk3_integrate(const AcReal3 state_previous, const AcReal3 state_current,
+              const AcReal3 rate_of_change, const AcReal dt)
+{
+    AcReal3 result;
+    result.x = rk3_integrate<step_number>(state_previous.x, state_current.x, rate_of_change.x, dt);
+    result.y = rk3_integrate<step_number>(state_previous.y, state_current.y, rate_of_change.y, dt);
+    result.z = rk3_integrate<step_number>(state_previous.z, state_current.z, rate_of_change.z, dt);
+    return result;
+}
+
+// Shorthand for rk3_integrate<step_number>(...). The current state is passed
+// through value(), which is defined elsewhere. The expansion site must have
+// `step_number` in scope (e.g. as a template parameter).
+#define rk3(state_previous, state_current, rate_of_change, dt)\
+rk3_integrate<step_number>(state_previous, value(state_current), rate_of_change, dt)
+
+/*
+template <int step_number>
+static __device__ __forceinline__ AcReal
+rk3_integrate(const int idx, const AcReal out, const int handle,
+              const AcRealData& in_cached, const AcReal rate_of_change, const AcReal dt)
+{
+    return rk3_integrate_scal<step_number>(out, value(in_cached), rate_of_change, dt);
+}
+
+template <int step_number>
+static __device__ __forceinline__ AcReal3
+rk3_integrate(const int idx, const AcReal3 out, const int3& handle,
+                  const AcReal3Data& in_cached, const AcReal3& rate_of_change, const AcReal dt)
+{
+    return (AcReal3) {
+        rk3_integrate<step_number>(idx, out, handle.x, in_cached.x, rate_of_change.x, dt),
+        rk3_integrate<step_number>(idx, out, handle.y, in_cached.y, rate_of_change.y, dt),
+        rk3_integrate<step_number>(idx, out, handle.z, in_cached.z, rate_of_change.z, dt)
+    };
+}
+
+#define RK3(handle, in_cached, rate_of_change, dt) \
+rk3_integrate<step_number>(idx, buffer.out, handle, in_cached, rate_of_change, dt)
+*/
+
+/*
+ * =============================================================================
+ * Level 1.3 (Kernels)
+ * =============================================================================
+ */
+
+/** Stores `value` into element `idx` of the buffer `out[handle]`. */
+static __device__ void
+write(AcReal* __restrict__ out[], const int handle, const int idx, const AcReal value)
+{
+    AcReal* const target = out[handle];
+    target[idx]          = value;
+}
+
+/**
+ * Three-component store: writes value.x/y/z into element `idx` of the buffers
+ * selected by vec.x/y/z respectively.
+ */
+static __device__ void
+write(AcReal* __restrict__ out[], const int3 vec, const int idx, const AcReal3 value)
+{
+    out[vec.x][idx] = value.x;
+    out[vec.y][idx] = value.y;
+    out[vec.z][idx] = value.z;
+}
+
+/** Returns element `idx` of the buffer `field[handle]`. */
+static __device__ AcReal
+read_out(const int idx, AcReal* __restrict__ field[], const int handle)
+{
+    const AcReal* const source = field[handle];
+    return source[idx];
+}
+
+/**
+ * Three-component load: gathers element `idx` from the buffers selected by
+ * handle.x/y/z into the x/y/z members of the result.
+ */
+static __device__ AcReal3
+read_out(const int idx, AcReal* __restrict__ field[], const int3 handle)
+{
+    AcReal3 result;
+    result.x = read_out(idx, field, handle.x);
+    result.y = read_out(idx, field, handle.y);
+    result.z = read_out(idx, field, handle.z);
+    return result;
+}
+
+// Accessor shorthands for kernel bodies. They rely on names being in scope at
+// the expansion site: `buffer` (with members .in/.out), `idx` and `vertexIdx`.
+//   WRITE_OUT(handle, value): store value at idx in buffer.out via write()
+//   READ(handle):             read_data() (defined elsewhere) on buffer.in at vertexIdx
+//   READ_OUT(handle):         load the element at idx from buffer.out via read_out()
+#define WRITE_OUT(handle, value) (write(buffer.out, handle, idx, value))
+#define READ(handle) (read_data(vertexIdx, buffer.in, handle))
+#define READ_OUT(handle) (read_out(idx, buffer.out, handle))
+
+// A WRITE macro could also be defined here for clarity (not for the DSL),
+//#define WRITE(HANDLE) (write(idx, ...)) s.t. we don't have to insert boilerplate in the middle of the function
+
+// Parameter list fragment for generated kernels: the index range given by
+// `start` and `end` (`end` is treated as exclusive by
+// GEN_KERNEL_BUILTIN_VARIABLES_BOILERPLATE) and the vertex buffer array.
+#define GEN_KERNEL_PARAM_BOILERPLATE \
+        const int3 start, const int3 end, VertexBufferArray buffer
+
+// Kernel prologue. Expects `start` and `end` to be in scope (see
+// GEN_KERNEL_PARAM_BOILERPLATE) and defines:
+//   vertexIdx: this thread's 3D index, i.e. the global thread index offset by start
+//   idx:       IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z)
+// Threads whose vertexIdx reaches `end` in any dimension return immediately.
+// The asserts check vertexIdx against the AC_n{x,y,z}_min/max device constants.
+#define GEN_KERNEL_BUILTIN_VARIABLES_BOILERPLATE() \
+        const int3 vertexIdx = (int3){threadIdx.x + blockIdx.x * blockDim.x + start.x,\
+                                                            threadIdx.y + blockIdx.y * blockDim.y + start.y,\
+                                                            threadIdx.z + blockIdx.z * blockDim.z + start.z};\
+        if (vertexIdx.x >= end.x || vertexIdx.y >= end.y || vertexIdx.z >= end.z)\
+            return;\
+\
+\
+        assert(vertexIdx.x < DCONST_INT(AC_nx_max) && vertexIdx.y < DCONST_INT(AC_ny_max) &&\
+               vertexIdx.z < DCONST_INT(AC_nz_max));\
+\
+        assert(vertexIdx.x >= DCONST_INT(AC_nx_min) && vertexIdx.y >= DCONST_INT(AC_ny_min) &&\
+               vertexIdx.z >= DCONST_INT(AC_nz_min));\
+\
+        const int idx          = IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z);
+
+#include "stencil_process.cuh"
+
+/*
+ * =============================================================================
+ * Level 2 (Host calls)
+ * =============================================================================
+ */
+
+/** Returns rand() scaled by RAND_MAX, i.e. a value in [0, 1]. */
+static AcReal
+randf(void)
+{
+    const AcReal sample = AcReal(rand());
+    const AcReal scale  = AcReal(RAND_MAX);
+    return sample / scale;
+}
+
+/**
+ * Enqueues one RK3 substep on `stream` for the index range [start, end).
+ *
+ * @param stream      Stream the kernel (and the forcing uploads) are issued to.
+ * @param tpb         Threads per block.
+ * @param start       First vertex index of the range.
+ * @param end         One past the last vertex index of the range.
+ * @param step_number Substep selector: 0 and 1 launch solve<0> and solve<1>;
+ *                    every other value launches solve<2>.
+ * @param dt          Timestep forwarded to the kernel.
+ * @param buffer      Vertex buffer array, passed to the kernel by value.
+ * @return AC_SUCCESS (launch errors are handled by ERRCHK_CUDA_KERNEL).
+ *
+ * When LFORCING is enabled, each call also rotates the static forcing vector
+ * about z by a random angle and uploads it, together with a constant phase,
+ * to the device symbols forcing_vec and forcing_phi.
+ */
+AcResult
+rk3_step_async(const cudaStream_t stream, const dim3& tpb,
+               const int3& start, const int3& end, const int& step_number,
+               const AcReal dt, const AcMeshInfo& /*mesh_info*/,
+               VertexBufferArray* buffer)
+{
+    /////////////////// Forcing
+    #if LFORCING
+    const AcReal ff_scale = AcReal(.2);
+    static AcReal3 ff = ff_scale * (AcReal3){1, 0, 0};
+    const AcReal radians = randf() * AcReal(2*M_PI) / 360 / 8;
+    const AcMatrix rotz = create_rotz(radians);
+    ff = mul(rotz, ff);
+    cudaMemcpyToSymbolAsync(forcing_vec, &ff, sizeof(ff), 0, cudaMemcpyHostToDevice, stream);
+
+    const AcReal ff_phi = AcReal(M_PI);//AcReal(2 * M_PI) * randf();
+    cudaMemcpyToSymbolAsync(forcing_phi, &ff_phi, sizeof(ff_phi), 0, cudaMemcpyHostToDevice, stream);
+    #endif // LFORCING
+    //////////////////////////
+
+    const int nx = end.x - start.x;
+    const int ny = end.y - start.y;
+    const int nz = end.z - start.z;
+
+    // Blocks per grid via integer ceiling division. The floating-point
+    // ceil(n / AcReal(tpb)) used previously is inexact in single precision
+    // once the extent exceeds the mantissa range.
+    const dim3 bpg(
+        ((unsigned int)nx + tpb.x - 1) / tpb.x,
+        ((unsigned int)ny + tpb.y - 1) / tpb.y,
+        ((unsigned int)nz + tpb.z - 1) / tpb.z);
+
+
+    if (step_number == 0)
+        solve<0><<<bpg, tpb, 0, stream>>>(start, end, *buffer, dt);
+    else if (step_number == 1)
+        solve<1><<<bpg, tpb, 0, stream>>>(start, end, *buffer, dt);
+    else
+        solve<2><<<bpg, tpb, 0, stream>>>(start, end, *buffer, dt);
+
+    ERRCHK_CUDA_KERNEL();
+    return AC_SUCCESS;
+}
--- a/src/core/math_utils.h
+++ b/src/core/math_utils.h
@@ -0,0 +1,91 @@
+/*
+    Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
+ * \brief Brief info.
+ *
+ * Detailed info.
+ *
+ */
+#pragma once
+#include <math.h>   // isnan, isinf
+#include <stdlib.h> // rand
+
+/** Returns a when `a > b` holds, otherwise b. */
+template <class T>
+static inline const T
+max(const T& a, const T& b)
+{
+    if (a > b)
+        return a;
+    return b;
+}
+
+/** Returns a when `a < b` holds, otherwise b. */
+template <class T>
+static inline const T
+min(const T& a, const T& b)
+{
+    if (a < b)
+        return a;
+    return b;
+}
+
+/** Returns a + b. */
+template <class T>
+static inline const T
+sum(const T& a, const T& b)
+{
+    const T total = a + b;
+    return total;
+}
+
+/**
+ * Returns true when val is neither NaN nor infinite.
+ *
+ * The result type is bool: this is a predicate, and the former `const T`
+ * return type forced the truth value through the value type (e.g. 1.0f).
+ * bool converts implicitly to arithmetic types, so existing callers that
+ * test the result in a condition are unaffected.
+ */
+template <class T>
+static inline bool
+is_valid(const T& val)
+{
+    return !(isnan(val) || isinf(val));
+}
+
+/**
+ * Limits val to the interval [min, max]: returns min when val < min, max when
+ * val > max, and val otherwise. The lower bound is tested first.
+ */
+template <class T>
+static inline const T
+clamp(const T& val, const T& min, const T& max)
+{
+    if (val < min)
+        return min;
+    if (val > max)
+        return max;
+    return val;
+}
+
+/** Returns rand() scaled by RAND_MAX, i.e. a value in [0, 1]. */
+static inline AcReal
+randr()
+{
+    const AcReal sample = AcReal(rand());
+    const AcReal scale  = AcReal(RAND_MAX);
+    return sample / scale;
+}
+
+/** Component-wise sum of two int3 values. */
+static inline int3
+operator+(const int3& a, const int3& b)
+{
+    int3 result;
+    result.x = a.x + b.x;
+    result.y = a.y + b.y;
+    result.z = a.z + b.z;
+    return result;
+}
+
+/** Component-wise difference (a - b) of two int3 values. */
+static inline int3
+operator-(const int3& a, const int3& b)
+{
+    int3 result;
+    result.x = a.x - b.x;
+    result.y = a.y - b.y;
+    result.z = a.z - b.z;
+    return result;
+}
+
+/** Returns true when val is a non-zero power of two (exactly one bit set). */
+static inline bool
+is_power_of_two(const unsigned val)
+{
+    if (val == 0)
+        return false;
+
+    // Clearing the lowest set bit leaves zero only if it was the sole bit
+    return (val & (val - 1)) == 0;
+}
--- a/src/standalone/CMakeLists.txt
+++ b/src/standalone/CMakeLists.txt
@@ -0,0 +1,10 @@
+#####################################
+##  CMakeLists.txt for standalone  ##
+#####################################
+
+file (GLOB SOURCES "*.cc" "model/*.cc")
+
+add_library(astaroth_standalone STATIC ${SOURCES})
+target_include_directories(astaroth_standalone PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
+#target_compile_definitions(astaroth_standalone PRIVATE CONFIG_PATH=\"${CMAKE_SOURCE_DIR}/config/\")
+target_compile_definitions(astaroth_standalone PRIVATE CONFIG_PATH=\"${ASTAROTH_CONF_PATH}\")
--- a/src/standalone/autotest.cc
+++ b/src/standalone/autotest.cc
@@ -0,0 +1,732 @@
+/*
+   Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
+
+   This file is part of Astaroth.
+
+   Astaroth is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   Astaroth is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * @file
+ * \brief Brief info.
+ *
+ * Detailed info.
+ *
+ */
+#include "run.h"
+
+#include <stdio.h>
+
+#include "config_loader.h"
+#include "core/math_utils.h"
+#include "model/host_memory.h"
+#include "model/host_timestep.h"
+#include "model/model_boundconds.h"
+#include "model/model_reduce.h"
+#include "model/model_rk3.h"
+
+#include "core/errchk.h"
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+
+// Defines for colored output
+#define RED "\x1B[31m"
+#define GRN "\x1B[32m"
+#define YEL "\x1B[33m"
+#define BLU "\x1B[34m"
+#define MAG "\x1B[35m"
+#define CYN "\x1B[36m"
+#define WHT "\x1B[37m"
+#define RESET "\x1B[0m"
+
+#define GEN_TEST_RESULT (1) // Generate a test file always during testing
+
+// Integer triple; used for the mesh dimensions listed in test_dims.
+typedef struct {
+	int x;
+	int y;
+	int z;
+} vec3i;
+
+// Triple of AcReal values.
+typedef struct {
+	AcReal x;
+	AcReal y;
+	AcReal z;
+} vec3r;
+
+
+// Record of one model/candidate comparison, as filled in by verify_meshes.
+typedef struct {
+	ModelScalar model;   // Reference value from the host model
+	AcReal candidate;    // Value under test
+	ModelScalar error;   // Error between the two (absolute or relative, depending on use)
+} ErrorInfo;
+
+#define QUICK_TEST (0)
+#define THOROUGH_TEST (1)
+#define TEST_TYPE QUICK_TEST
+
+static const InitType test_cases[] = {INIT_TYPE_RANDOM, INIT_TYPE_XWAVE, INIT_TYPE_GAUSSIAN_RADIAL_EXPL, INIT_TYPE_ABC_FLOW};
+// #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0]))
+
+#if TEST_TYPE == QUICK_TEST // REGULAR TEST START HERE --------------------------------------------------------------------------------------------------------------
+/** Returns |candidate - model|. */
+static inline ModelScalar
+get_absolute_error(const ModelScalar& model, const AcReal& candidate)
+{
+	const ModelScalar difference = candidate - model;
+	return fabsl(difference);
+}
+
+/**
+ * Largest absolute error that is still accepted: range * AC_REAL_EPSILON.
+ *
+ * This is an upper limit which assumes that both the minimum and the maximum
+ * value take part in a calculation (which inherently leads to cancellation).
+ * An error above it indicates a genuine defect; an error below it is
+ * indistinguishable from the inherent inaccuracy of floating-point arithmetic.
+ */
+static inline ModelScalar
+get_acceptable_absolute_error(const ModelScalar& range)
+{
+	const ModelScalar limit = range * AC_REAL_EPSILON;
+	return limit;
+}
+
+	static inline ModelScalar
+get_acceptable_relative_error(void)
+{
+	return 30; // machine epsilons
+}
+
+/**
+ * Returns the relative error |1 - candidate / model| as a multiple of
+ * AC_REAL_EPSILON. See Sect. "Relative Error and Ulps" in "What Every Computer
+ * Scientist Should Know About Floating-Point Arithmetic", Goldberg (1991).
+ *
+ * There is no special handling for a model value of zero; verify() falls back
+ * on the absolute error when the relative error is not acceptable.
+ * TODO: a near-zero-aware comparison, see Knuth section 4.2.2 pages 217-218.
+ */
+static inline ModelScalar
+get_relative_error(const ModelScalar& model, const AcReal& candidate)
+{
+	const ModelScalar error = fabsl(1.0l - candidate / model);
+	return error / AC_REAL_EPSILON;
+}
+
+/**
+ * Decides whether cand agrees with model.
+ *
+ * Non-finite values on either side fail immediately. Otherwise the pair is
+ * accepted when the relative error is below the acceptable relative error or,
+ * failing that, when the absolute error is below the acceptable absolute
+ * error for the given data range.
+ */
+static bool
+verify(const ModelScalar& model, const AcReal& cand, const ModelScalar& range)
+{
+	const bool both_valid = is_valid(model) && is_valid(cand);
+	if (!both_valid)
+		return false;
+
+	return get_relative_error(model, cand) < get_acceptable_relative_error() ||
+	       get_absolute_error(model, cand) < get_acceptable_absolute_error(range);
+}
+
+/**
+ * Returns the spread (largest maximum minus smallest minimum) over the first
+ * three vertex buffers of mesh, as computed with model_reduce_scal.
+ */
+static ModelScalar
+get_reduction_range(const ModelMesh& mesh)
+{
+	ERRCHK(NUM_VTXBUF_HANDLES >= 3);
+
+	ModelScalar maxima[3];
+	for (int w = 0; w < 3; ++w)
+		maxima[w] = model_reduce_scal(mesh, RTYPE_MAX, VertexBufferHandle(w));
+	const ModelScalar max_scal = max(maxima[0], max(maxima[1], maxima[2]));
+
+	ModelScalar minima[3];
+	for (int w = 0; w < 3; ++w)
+		minima[w] = model_reduce_scal(mesh, RTYPE_MIN, VertexBufferHandle(w));
+	const ModelScalar min_scal = min(minima[0], min(minima[1], minima[2]));
+
+	return max_scal - min_scal;
+}
+
+/**
+ * Prints a detailed report for one model/candidate pair: the values, the data
+ * range, both error measures with their acceptance limits, and the verdict of
+ * verify().
+ */
+static void
+print_debug_info(const ModelScalar& model, const AcReal& candidate,
+		const ModelScalar& range)
+{
+	const ModelScalar abs_err     = get_absolute_error(model, candidate);
+	const ModelScalar abs_err_max = get_acceptable_absolute_error(range);
+	const ModelScalar rel_err     = get_relative_error(model, candidate);
+	const ModelScalar rel_err_max = get_acceptable_relative_error();
+
+	printf("MeshPointInfo\n");
+	printf("\tModel: %e\n", double(model));
+	printf("\tCandidate: %e\n", double(candidate));
+	printf("\tRange: %e\n", double(range));
+
+	printf("\tAbsolute error: %Le (max acceptable: %Le)\n", abs_err, abs_err_max);
+	printf("\tRelative error: %Le (max acceptable: %Le)\n", rel_err, rel_err_max);
+	printf("\tIs acceptable: %d\n", verify(model, candidate, range));
+}
+
+/**
+ * Prints a one-line, colored OK/FAIL verdict for a model/candidate pair,
+ * labeled with `name`, followed by the relative error (in machine epsilons)
+ * and the absolute error.
+ */
+static void
+print_result(const ModelScalar& model, const AcReal& candidate,
+		const ModelScalar& range, const char* name = "???")
+{
+	const ModelScalar rel_err = get_relative_error(model, candidate);
+	const ModelScalar abs_err = get_absolute_error(model, candidate);
+	const bool passed         = verify(model, candidate, range);
+
+	printf("\t%-12s... ", name);
+	if (passed)
+		printf(GRN "OK! " RESET);
+	else
+		printf(RED "FAIL! " RESET);
+
+	printf("(relative error: %.3Lg \u03B5, absolute error: %Lg)\n", rel_err, abs_err);
+}
+
+/**
+ * Compares the device reductions (acReduceScal, acReduceVec) against the host
+ * model reductions (model_reduce_scal, model_reduce_vec).
+ *
+ * For every initial condition in test_cases and every reduction type, the
+ * UUX buffer (scalar) and the UUX/UUY/UUZ buffers (vector) are reduced on
+ * both sides and the results are checked with verify().
+ *
+ * @param config Mesh configuration used to create the host and device meshes.
+ * @return Number of reductions whose result was not acceptable.
+ */
+	static int
+check_reductions(const AcMeshInfo& config)
+{
+	printf("Testing reductions\n");
+	int num_failures = 0;
+
+	// Init CPU meshes
+	AcMesh* mesh = acmesh_create(config);
+	ModelMesh* modelmesh = modelmesh_create(config);
+
+	// Init GPU meshes
+	acInit(config);
+
+	for (unsigned int i = 0; i < ARRAY_SIZE(test_cases); ++i) {
+        const InitType itype = test_cases[i];
+        printf("Checking %s...\n", init_type_names[InitType(itype)]);
+
+		// Init the mesh and figure out the acceptable range for error
+		acmesh_init_to(InitType(itype), mesh);
+
+		// The range is computed from the host model copy of the initial data
+		acmesh_to_modelmesh(*mesh, modelmesh);
+		const ModelScalar range = get_reduction_range(*modelmesh);
+
+		// Upload the same initial data to the device
+		acLoad(*mesh);
+
+		for (int rtype = 0; rtype < NUM_REDUCTION_TYPES; ++rtype) {
+			const VertexBufferHandle ftype = VTXBUF_UUX;
+
+			// Scal
+			ModelScalar model = model_reduce_scal(*modelmesh, ReductionType(rtype),
+					VertexBufferHandle(ftype));
+			AcReal candidate  = acReduceScal(ReductionType(rtype),
+					VertexBufferHandle(ftype));
+			print_result(model, candidate, range, "UUX scal");
+
+			bool is_acceptable = verify(model, candidate, range);
+			if (!is_acceptable) {
+				++num_failures;
+
+				// Print debug info
+				printf("Scalar reduction type %d FAIL\n", rtype);
+				print_debug_info(model, candidate, range);
+			}
+
+			// Vec
+			model = model_reduce_vec(*modelmesh, ReductionType(rtype), VTXBUF_UUX,
+					VTXBUF_UUY, VTXBUF_UUZ);
+			candidate = acReduceVec(ReductionType(rtype), VTXBUF_UUX,
+					VTXBUF_UUY, VTXBUF_UUZ);
+			print_result(model, candidate, range, "UUXYZ vec");
+
+			is_acceptable = verify(model, candidate, range);
+			if (!is_acceptable) {
+				++num_failures;
+
+				// Print debug info
+				printf("Vector reduction type %d FAIL\n", rtype);
+				print_debug_info(model, candidate, range);
+			}
+		}
+
+		printf("Acceptable relative error: < %Lg \u03B5, absolute error < %Lg\n", get_acceptable_relative_error(), get_acceptable_absolute_error(range));
+	}
+	// Tear down the device state before releasing the host meshes
+	acQuit();
+	modelmesh_destroy(modelmesh);
+	acmesh_destroy(mesh);
+
+	return num_failures;
+}
+
+/** Finds the maximum and minimum in all meshes and computes the range.
+ * Note! Potentially dangerous if all meshes do not interact with each other.
+ * Otherwise the range may be too high.
+ */
+/**
+ * Returns |max - min| where max and min are taken over all vertex buffers of
+ * the model mesh (via model_reduce_scal).
+ */
+static ModelScalar
+get_data_range(const ModelMesh& model)
+{
+	ModelScalar global_max = -INFINITY;
+	ModelScalar global_min = INFINITY;
+
+	for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w) {
+		const VertexBufferHandle handle = VertexBufferHandle(w);
+		const ModelScalar local_max = model_reduce_scal(model, RTYPE_MAX, handle);
+		const ModelScalar local_min = model_reduce_scal(model, RTYPE_MIN, handle);
+
+		global_max = local_max > global_max ? local_max : global_max;
+		global_min = local_min < global_min ? local_min : global_min;
+	}
+
+	return fabsl(global_max - global_min);
+}
+
+// #define GEN_TEST_RESULT
+#if GEN_TEST_RESULT == 1
+static FILE* test_result = NULL;
+#endif
+
+/**
+ * Compares every element of every vertex buffer of `candidate` against
+ * `model` using verify(), with the acceptable absolute error derived from
+ * get_data_range(model).
+ *
+ * For each failing element the 3D index and debug info are printed. For each
+ * vertex buffer, the element with the largest absolute error is reported with
+ * print_result(). When GEN_TEST_RESULT == 1, the largest absolute error over
+ * all buffers and its relative error are also written to the file
+ * `test_result`, which must already be open.
+ *
+ * @return true if all elements were acceptable, false otherwise.
+ */
+	static bool
+verify_meshes(const ModelMesh& model, const AcMesh& candidate)
+{
+	bool retval = true;
+
+#if GEN_TEST_RESULT == 1
+	// Largest absolute error seen over all vertex buffers
+	ErrorInfo err = ErrorInfo();
+#endif
+
+	const ModelScalar range = get_data_range(model);
+	for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w) {
+		const size_t n = AC_VTXBUF_SIZE(model.info);
+
+		// Maximum errors
+		ErrorInfo max_abs_error = ErrorInfo();
+		ErrorInfo max_rel_error = ErrorInfo();
+
+		for (size_t i = 0; i < n; ++i) {
+			const ModelScalar model_val = model.vertex_buffer[VertexBufferHandle(w)][i];
+			const AcReal cand_val = candidate.vertex_buffer[VertexBufferHandle(w)][i];
+
+			if (!verify(model_val, cand_val, range)) {
+				// Recover the 3D index from the linear index (x varies fastest,
+				// with row length AC_mx and plane size AC_mx * AC_my)
+				const int i0 = i % model.info.int_params[AC_mx];
+				const int j0 = ((i % (model.info.int_params[AC_mx] *
+								model.info.int_params[AC_my])) /
+						model.info.int_params[AC_mx]);
+				const int k0 = i / (model.info.int_params[AC_mx] *
+						model.info.int_params[AC_my]);
+				printf("Index (%d, %d, %d)\n", i0, j0, k0);
+				print_debug_info(model_val, cand_val, range);
+				retval = false;
+			}
+
+			const ModelScalar abs_error = get_absolute_error(model_val,
+					cand_val);
+			if (abs_error > max_abs_error.error) {
+				max_abs_error.error     = abs_error;
+				max_abs_error.model     = model_val;
+				max_abs_error.candidate = cand_val;
+			}
+
+			const ModelScalar rel_error = get_relative_error(model_val, cand_val);
+			if (rel_error > max_rel_error.error) {
+				max_rel_error.error     = rel_error;
+				max_rel_error.model     = model_val;
+				max_rel_error.candidate = cand_val;
+			}
+
+#if GEN_TEST_RESULT == 1
+			if (abs_error > err.error) {
+				err.error = abs_error;
+				err.model = model_val;
+				err.candidate = cand_val;
+			}
+#endif
+		}
+		// Report the element with the largest absolute error in this buffer
+		//print_result(max_rel_error.model, max_rel_error.candidate, range, vtxbuf_names[VertexBufferHandle(w)]);
+		print_result(max_abs_error.model, max_abs_error.candidate, range, vtxbuf_names[VertexBufferHandle(w)]);
+	}
+
+#if GEN_TEST_RESULT == 1
+	const ModelScalar rel_err = get_relative_error(err.model, err.candidate);
+	const ModelScalar abs_err = get_absolute_error(err.model, err.candidate);
+	fprintf(test_result, "%.3Lg & %.3Lg\n", abs_err, rel_err);
+#endif
+
+	printf("Acceptable relative error: < %Lg \u03B5, absolute error < %Lg\n", get_acceptable_relative_error(), get_acceptable_absolute_error(range));
+
+	return retval;
+}
+
+/**
+ * Runs the device integrator and the host model integrator side by side and
+ * compares the resulting meshes.
+ *
+ * For every initial condition in test_cases, the same data is loaded on both
+ * sides, boundary conditions are applied, num_iterations integration steps
+ * with a constant timestep are taken, and the device result is stored back
+ * and checked with verify_meshes().
+ *
+ * @param mesh_info Mesh configuration used to create the meshes.
+ * @return Number of initial conditions for which the meshes did not match.
+ */
+int
+check_rk3(const AcMeshInfo& mesh_info)
+{
+	const int num_iterations = 1; // Note: should work up to at least 15 steps
+	printf("Testing RK3 (running %d steps before checking the result)\n",
+			num_iterations);
+
+	// Host-side meshes: one receives the device result, one holds the model
+	AcMesh* gpu_mesh      = acmesh_create(mesh_info);
+	ModelMesh* model_mesh = modelmesh_create(mesh_info);
+
+	// Device-side state
+	acInit(mesh_info);
+
+	// Use a small constant timestep to avoid instabilities (instead of one
+	// derived from the maximum velocity with host_timestep)
+	const AcReal dt = AcReal(1e-2);
+
+	int num_failures = 0;
+	const unsigned int num_cases = ARRAY_SIZE(test_cases);
+	for (unsigned int case_idx = 0; case_idx < num_cases; ++case_idx) {
+		const InitType itype = test_cases[case_idx];
+		printf("Checking %s...\n", init_type_names[InitType(itype)]);
+
+		// Same initial data on the device and in the host model
+		acmesh_init_to(InitType(itype), gpu_mesh);
+		acLoad(*gpu_mesh);
+		acmesh_to_modelmesh(*gpu_mesh, model_mesh);
+
+		acBoundcondStep();
+		boundconds(model_mesh->info, model_mesh);
+
+		for (int step = 0; step < num_iterations; ++step) {
+			// Device
+			acIntegrate(dt);
+			acBoundcondStep();
+			acSynchronize();
+
+			// Host model
+			model_rk3(dt, model_mesh);
+			boundconds(model_mesh->info, model_mesh);
+		}
+		acStore(gpu_mesh);
+
+		if (!verify_meshes(*model_mesh, *gpu_mesh))
+			++num_failures;
+	}
+
+	acQuit();
+	acmesh_destroy(gpu_mesh);
+	modelmesh_destroy(model_mesh);
+
+	return num_failures;
+}
+
+/**
+ * Entry point of the quick autotest.
+ *
+ * Loads the configuration, then runs check_rk3() for each mesh size in
+ * test_dims and prints the total number of failures. The reduction tests are
+ * currently commented out. When GEN_TEST_RESULT == 1, a result file named
+ * "<double|float>_fullstep_testresult.out" is (re)created and filled by
+ * verify_meshes().
+ *
+ * @return EXIT_FAILURE if any check failed, EXIT_SUCCESS otherwise.
+ */
+	int
+run_autotest(void)
+{
+#if GEN_TEST_RESULT == 1
+	char testresult_path[256];
+	sprintf(testresult_path, "%s_fullstep_testresult.out", AC_DOUBLE_PRECISION ? "double" : "float");
+
+	test_result = fopen(testresult_path, "w");
+	ERRCHK(test_result);
+
+	fprintf(test_result, "n, max abs error, corresponding rel error\n");
+#endif
+
+	/* Parse configs */
+	AcMeshInfo config;
+	load_config(&config);
+
+	if (STENCIL_ORDER > 6)
+		printf("WARNING!!! If the stencil order is larger than the computational domain some vertices may be done twice (f.ex. doing inner and outer domains separately and some of the front/back/left/right/etc slabs collide). The mesh must be large enough s.t. this doesn't happen.");
+	/*
+	   const vec3i test_dims[] = {              //
+	   {15, 11, 13}, //
+	   {17, 61, 127}, //
+	   {511, 17, 16},  //
+	   {64, 64, 8},  //
+	   {32, 32, 64}, //
+	   {64, 32, 32}, //
+	   {128, 64, 32}};
+	 */
+	// Mesh sizes (nx, ny, nz) to test
+	const vec3i test_dims[] = {{512, 16, 32},  //
+		{64, 64, 32},  //
+		{32, 32, 64}, //
+		{64, 32, 32}, //
+		{128, 64, 32}};
+
+	//const vec3i test_dims[] = {{256,256,256}};
+	//const vec3i test_dims[] = {{256,256,256}};
+	//const vec3i test_dims[] = {{32, 32, 32}};
+
+	int num_failures = 0;
+	/*for (size_t i = 0; i < ARRAY_SIZE(test_dims); ++i) {
+		config.int_params[AC_nx] = test_dims[i].x;
+		config.int_params[AC_ny] = test_dims[i].y;
+		config.int_params[AC_nz] = test_dims[i].z;
+		update_config(&config);
+
+		printf("Testing mesh (%d, %d, %d):\n", //
+				test_dims[i].x, test_dims[i].y, test_dims[i].z);
+
+		num_failures += check_reductions(config);
+		fflush(stdout);
+	}*/ // TODO uncomment
+
+	for (size_t i = 0; i < ARRAY_SIZE(test_dims); ++i) {
+		// Override the mesh size of the loaded config and refresh it
+		config.int_params[AC_nx] = test_dims[i].x;
+		config.int_params[AC_ny] = test_dims[i].y;
+		config.int_params[AC_nz] = test_dims[i].z;
+		update_config(&config);
+
+		printf("Testing mesh (%d, %d, %d):\n", //
+				test_dims[i].x, test_dims[i].y, test_dims[i].z);
+
+		num_failures += check_rk3(config);
+		fflush(stdout);
+	}
+
+	printf("\n--------Testing done---------\n");
+	printf("Failures found: %d\n", num_failures);
+
+#if GEN_TEST_RESULT == 1
+	fflush(test_result);
+	fclose(test_result);
+#endif
+
+	if (num_failures > 0)
+		return EXIT_FAILURE;
+	else
+		return EXIT_SUCCESS;
+}
+
+#elif TEST_TYPE == THOROUGH_TEST // GEN TEST FILE START HERE --------------------------------------------------------------------------------------------------------------
+// Error measures for one model/candidate pair (see get_error), optionally
+// accompanied by the magnitude range of the model field they were taken from.
+typedef struct {
+	ModelScalar model;             // Reference value from the host model
+	AcReal candidate;              // Value under test
+	ModelScalar abs_error;         // |model - candidate|
+	ModelScalar ulp_error;         // abs_error in units in the last place of model
+	ModelScalar rel_error;         // Relative error in multiples of the machine epsilon
+	ModelScalar maximum_magnitude; // Largest |value| of the model field, -1 if not calculated
+	ModelScalar minimum_magnitude; // Smallest |value| of the model field, -1 if not calculated
+} Error;
+
+/**
+ * Computes the error measures between a model value and a candidate value.
+ *
+ * - Exact match: all errors are 0.
+ * - Either value NaN or infinite: all errors are INFINITY.
+ * - Otherwise: abs_error = |model - candidate|, ulp_error = abs_error divided
+ *   by the unit in the last place of `model` at AcReal precision, and
+ *   rel_error = |1 - candidate / model| in multiples of the machine epsilon.
+ *
+ * maximum_magnitude and minimum_magnitude are not calculated here and are set
+ * to -1, the same "not calculated" marker used by get_max_abs_error_mesh.
+ * They were previously left uninitialized, although callers hand the result
+ * straight to print_error_to_file, which prints both fields.
+ */
+Error get_error(ModelScalar model, AcReal candidate)
+{
+	Error error;
+	error.model     = model;
+	error.candidate = candidate;
+	error.abs_error = 0;
+	error.ulp_error = 0;
+	error.rel_error = 0;
+	error.maximum_magnitude = -1; // Not calculated.
+	error.minimum_magnitude = -1; // Not calculated.
+
+	if (error.model == error.candidate || fabsl(model - candidate) == 0) { // If exact
+		error.abs_error = 0;
+		error.rel_error = 0;
+		error.ulp_error = 0;
+	} else if (!is_valid(error.model) || !is_valid(error.candidate)) {
+		error.abs_error = INFINITY;
+		error.rel_error = INFINITY;
+		error.ulp_error = INFINITY;
+	} else {
+		const int base = 2;
+		const int p = sizeof(AcReal) == 4 ? 24 : 53; // Bits in the significand
+
+		// Binary exponent of the model value
+		const ModelScalar e = floorl(logl(fabsl(error.model)) / logl(2));
+
+		const ModelScalar ulp = powl(base, e - (p-1));
+		const ModelScalar machine_epsilon = 0.5 * powl(base, -(p-1));
+		error.abs_error = fabsl(model - candidate);
+		error.ulp_error	= error.abs_error / ulp;
+		error.rel_error = fabsl(1.0l - candidate / model) / machine_epsilon;
+	}
+
+	return error;
+}
+
+/**
+ * Returns the error record of the element with the largest absolute error
+ * over all vertex buffers of the two meshes. The magnitude fields of the
+ * result are set to -1 (not calculated).
+ */
+Error get_max_abs_error_mesh(const ModelMesh& model_mesh, const AcMesh& candidate_mesh)
+{
+	Error worst;
+	worst.abs_error = -1; // Below any computed absolute error
+
+	for (size_t handle = 0; handle < NUM_VTXBUF_HANDLES; ++handle) {
+		const size_t count = AC_VTXBUF_SIZE(model_mesh.info);
+		for (size_t elem = 0; elem < count; ++elem) {
+			const Error current = get_error(model_mesh.vertex_buffer[handle][elem],
+			                                candidate_mesh.vertex_buffer[handle][elem]);
+			if (current.abs_error > worst.abs_error)
+				worst = current;
+		}
+	}
+
+	worst.maximum_magnitude = -1; // Not calculated.
+	worst.minimum_magnitude = -1; // Not calculated.
+
+	return worst;
+}
+
+/**
+ * Returns the largest absolute value among the AC_VTXBUF_SIZE(info) elements
+ * of field, or -INFINITY when there are no elements.
+ */
+static ModelScalar
+get_maximum_magnitude(const ModelScalar* field, const AcMeshInfo info)
+{
+	ModelScalar maximum = -INFINITY;
+
+	for (size_t i = 0; i < AC_VTXBUF_SIZE(info); ++i) {
+		const ModelScalar magnitude = fabsl(field[i]);
+		// Keep the running value only while it is strictly greater
+		if (!(maximum > magnitude))
+			maximum = magnitude;
+	}
+
+	return maximum;
+}
+
+
+/**
+ * Returns the smallest absolute value among the AC_VTXBUF_SIZE(info) elements
+ * of field, or INFINITY when there are no elements.
+ */
+static ModelScalar
+get_minimum_magnitude(const ModelScalar* field, const AcMeshInfo info)
+{
+	ModelScalar minimum = INFINITY;
+
+	for (size_t i = 0; i < AC_VTXBUF_SIZE(info); ++i) {
+		const ModelScalar magnitude = fabsl(field[i]);
+		// Keep the running value only while it is strictly smaller
+		if (!(minimum < magnitude))
+			minimum = magnitude;
+	}
+
+	return minimum;
+}
+
+/**
+ * Returns the error record of the element with the largest absolute error
+ * within a single vertex buffer, together with the largest and smallest
+ * magnitude found in the model's copy of that buffer.
+ */
+Error get_max_abs_error_vtxbuf(const VertexBufferHandle vtxbuf_handle, const ModelMesh& model_mesh, const AcMesh& candidate_mesh)
+{
+	ModelScalar* model_vtxbuf = model_mesh.vertex_buffer[vtxbuf_handle];
+	AcReal* candidate_vtxbuf  = candidate_mesh.vertex_buffer[vtxbuf_handle];
+
+	Error worst;
+	worst.abs_error = -1; // Below any computed absolute error
+
+	const size_t count = AC_VTXBUF_SIZE(model_mesh.info);
+	for (size_t elem = 0; elem < count; ++elem) {
+		const Error current = get_error(model_vtxbuf[elem], candidate_vtxbuf[elem]);
+		if (current.abs_error > worst.abs_error)
+			worst = current;
+	}
+
+	worst.maximum_magnitude = get_maximum_magnitude(model_vtxbuf, model_mesh.info);
+	worst.minimum_magnitude = get_minimum_magnitude(model_vtxbuf, model_mesh.info);
+
+	return worst;
+}
+
+/**
+ * Appends one line "n, ulp error, abs error, rel error, max magnitude,
+ * min magnitude" to the file at `path`.
+ *
+ * If the file cannot be opened, a warning is printed to stderr and nothing is
+ * written; previously the NULL stream was passed on to fprintf and fclose.
+ */
+void
+print_error_to_file(const char* path, const int n, const Error error)
+{
+    FILE* file = fopen(path, "a");
+    if (file == NULL) {
+        fprintf(stderr, "Warning: could not open '%s' for appending, result for n = %d not written\n", path, n);
+        return;
+    }
+
+    fprintf(file, "%d, %Lg, %Lg, %Lg, %Lg, %Lg\n", n, error.ulp_error, error.abs_error, error.rel_error, error.maximum_magnitude, error.minimum_magnitude);
+    //fprintf(file, "%d, %Lg, %Lg, %Lg, %Lg, %Lg\n", n, error.maximum_magnitude, error.minimum_magnitude, error.abs_error, error.ulp_error, error.rel_error);
+    fclose(file);
+}
+
+#define MAX_PATH_LEN (256)
+
+/**
+ * Entry point of the thorough autotest.
+ *
+ * For cubic meshes with n = N_MIN, 2 * N_MIN, ..., N_MAX points per side and
+ * for every initial condition in test_cases, the following device results
+ * are compared with the host model and the errors are written out with
+ * print_error_to_file():
+ *   - boundary conditions,
+ *   - scalar max reduction of UUX,
+ *   - vector max reduction of UUX/UUY/UUZ,
+ *   - each vertex buffer after one integration step.
+ * Output file names are prefixed with "double" or "float" depending on
+ * AC_DOUBLE_PRECISION.
+ *
+ * @return Always 0.
+ */
+int run_autotest(void)
+{
+
+#define N_MIN (32)
+#define N_MAX (512)
+	for (int n = N_MIN; n <= N_MAX; n += N_MIN) {
+		// Load the config and override the mesh size with n^3
+		AcMeshInfo config;
+		load_config(&config);
+		config.int_params[AC_nx] = config.int_params[AC_ny] = config.int_params[AC_nz] = n;
+		update_config(&config);
+
+		// Init host
+		AcMesh* candidate_mesh = acmesh_create(config);
+		ModelMesh* model_mesh = modelmesh_create(config);
+
+		// Init device
+		acInit(config);
+
+		// Check all initial conditions
+        for (int i = 0; i < ARRAY_SIZE(test_cases); ++i) {
+            const InitType init_type = test_cases[i];
+			acmesh_init_to((InitType)init_type, candidate_mesh);
+			acmesh_to_modelmesh(*candidate_mesh, model_mesh);   // Load to Host
+			acLoad(*candidate_mesh);                             // Load to Device
+
+			boundconds(model_mesh->info, model_mesh);
+			acBoundcondStep();
+
+            { // Check boundconds
+                acStore(candidate_mesh);
+                Error boundcond_error = get_max_abs_error_mesh(*model_mesh, *candidate_mesh);
+                char boundcond_path[MAX_PATH_LEN];
+                sprintf(boundcond_path, "%s_boundcond_%s.testresult", AC_DOUBLE_PRECISION ? "double" : "float", init_type_names[(InitType)init_type]);
+                print_error_to_file(boundcond_path, n, boundcond_error);
+            }
+
+            { // Check scalar max reduction
+                ModelScalar model = model_reduce_scal(*model_mesh, (ReductionType)RTYPE_MAX, VTXBUF_UUX);
+                AcReal candidate = acReduceScal((ReductionType)RTYPE_MAX, VTXBUF_UUX);
+                Error scalar_reduce_error = get_error(model, candidate);
+                char scalar_reduce_path[MAX_PATH_LEN];
+                sprintf(scalar_reduce_path, "%s_scalar_reduce_%s.testresult",  AC_DOUBLE_PRECISION ? "double" : "float", init_type_names[(InitType)init_type]);
+                print_error_to_file(scalar_reduce_path, n, scalar_reduce_error);
+            }
+
+            { // Check vector max reduction
+                ModelScalar model = model_reduce_vec(*model_mesh, (ReductionType)RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
+                AcReal candidate = acReduceVec((ReductionType)RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
+                Error vector_reduce_error = get_error(model, candidate);
+                char vector_reduce_path[MAX_PATH_LEN];
+                sprintf(vector_reduce_path, "%s_vector_reduce_%s.testresult",  AC_DOUBLE_PRECISION ? "double" : "float", init_type_names[(InitType)init_type]);
+                print_error_to_file(vector_reduce_path, n, vector_reduce_error);
+            }
+
+            // Time advance
+            {
+                // The timestep is derived from the host model's maximum velocity
+                const AcReal umax =  (AcReal)model_reduce_vec(*model_mesh, (ReductionType)RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
+                const AcReal dt = host_timestep(umax, config);
+
+                // Host integration step
+                model_rk3(dt, model_mesh);
+                boundconds(config, model_mesh);
+
+                // Device integration step
+                acIntegrate(dt);
+                acBoundcondStep();
+                acSynchronize();
+                acStore(candidate_mesh);
+
+                // Check fields
+                for (int vtxbuf_handle = 0; vtxbuf_handle < NUM_VTXBUF_HANDLES; ++vtxbuf_handle) {
+                    Error field_error = get_max_abs_error_vtxbuf((VertexBufferHandle)vtxbuf_handle, *model_mesh, *candidate_mesh);
+
+			printf("model %Lg, cand %Lg, abs %Lg, rel %Lg\n", (ModelScalar)field_error.model, (ModelScalar)field_error.candidate, (ModelScalar)field_error.abs_error, (ModelScalar)field_error.rel_error);
+
+                    char field_path[MAX_PATH_LEN];
+                    sprintf(field_path, "%s_integrationstep_%s_%s.testresult", AC_DOUBLE_PRECISION ? "double" : "float", init_type_names[(InitType)init_type], vtxbuf_names[(VertexBufferHandle)vtxbuf_handle]);
+                    print_error_to_file(field_path, n, field_error);
+                }
+            }
+		}
+
+		// Deallocate host
+		acmesh_destroy(candidate_mesh);
+		modelmesh_destroy(model_mesh);
+
+		// Deallocate device
+		acQuit();
+	}
+
+	return 0;
+}
+#endif
--- a/src/standalone/benchmark.cc
+++ b/src/standalone/benchmark.cc
@@ -0,0 +1,300 @@
+/*
+    Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
+ * \brief Brief info.
+ *
+ * Detailed info.
+ *
+ */
+#include "run.h"
+
+#include <stdlib.h> // EXIT_SUCCESS
+
+#include "config_loader.h"
+#include "model/host_memory.h"
+#include "model/host_timestep.h"
+#include "model/model_reduce.h"
+#include "model/model_rk3.h"
+#include "timer_hires.h"
+
+#include <vector>
+#include <algorithm>
+#include <math.h>
+#include "src/core/errchk.h"
+
+/** Ordering predicate handed to std::sort: true when a orders strictly before b. */
+static bool
+smaller_than(const double& a, const double& b)
+{
+    return b > a;
+}
+
+/**
+ * Appends one row "n, min, max, median, perc" to the file at path.
+ *
+ * \return EXIT_SUCCESS when the file could be opened in append mode, EXIT_FAILURE otherwise.
+ */
+static int
+write_runningtimes(const char* path, const int n, const double min, const double max, const double median, const double perc)
+{
+    FILE* out = fopen(path, "a");
+    if (out == NULL)
+        return EXIT_FAILURE;
+
+    fprintf(out, "%d, %f, %f, %f, %f\n", n, min, max, median, perc);
+    fclose(out);
+    return EXIT_SUCCESS;
+}
+
+/**
+ * Writes 100 lines to the file at path; line i holds the sample found at position
+ * floor(i / 100 * num_iters) of results. results is expected to be sorted in ascending order
+ * by the caller, which makes line i the i-th percentile.
+ *
+ * The sample position is clamped to the last element, so a vector holding fewer than num_iters
+ * samples is never read out of bounds.
+ *
+ * \return EXIT_SUCCESS on success. EXIT_FAILURE if results is empty, num_iters is not positive
+ *         (no file is created in these cases) or the file could not be opened.
+ */
+static int
+write_percentiles(const char* path, const int num_iters, const std::vector<double>& results)
+{
+    if (results.empty() || num_iters <= 0)
+        return EXIT_FAILURE;
+
+    FILE* fp = fopen(path, "w");
+    if (fp == NULL)
+        return EXIT_FAILURE;
+
+    const size_t last = results.size() - 1;
+    for (int i = 0; i < 100; ++i) {
+        const size_t idx = (size_t)((i / 100.) * num_iters);
+        fprintf(fp, "%f\n", results[idx < last ? idx : last]);
+    }
+    fclose(fp);
+    return EXIT_SUCCESS;
+}
+
+/**
+ * Times the device integration on cubic meshes of n^3 cells for
+ * n = N_STEP_SIZE, 2 * N_STEP_SIZE, ..., MAX_MESH_DIM and writes the results to disk.
+ *
+ * For every n the mesh is initialized with INIT_TYPE_ABC_FLOW, ten untimed warmup steps are
+ * run and NUM_ITERS steps are timed individually. Depending on GEN_BENCHMARK_RK3 a timed step
+ * is either a single acIntegrateStep() or a full acIntegrate() call. The sorted timings give
+ *   - one summary row (min, max, median, NTH_PERCENTILE) appended to
+ *     "<precision>_<mode>_runningtimes.out", and
+ *   - the percentile table "<n>_<precision>_<mode>_percentiles.out".
+ *
+ * \return EXIT_SUCCESS (0) when every result file was written, EXIT_FAILURE otherwise. The
+ *         device and host meshes are released even when writing a result file fails.
+ */
+int
+run_benchmark(void)
+{
+    char runningtime_path[256];
+    snprintf(runningtime_path, sizeof(runningtime_path), "%s_%s_runningtimes.out", AC_DOUBLE_PRECISION ? "double" : "float", GEN_BENCHMARK_RK3 ? "rk3substep" : "fullstep");
+
+    // Start a fresh summary file containing only the column header
+    FILE* fp = fopen(runningtime_path, "w");
+    if (fp == NULL)
+        return EXIT_FAILURE;
+    fprintf(fp, "n, min, max, median, perc\n");
+    fclose(fp);
+
+    int status = EXIT_SUCCESS;
+
+    #define N_STEP_SIZE (128)
+    #define MAX_MESH_DIM (128)
+    #define NUM_ITERS (100)
+    for (int n = N_STEP_SIZE; n <= MAX_MESH_DIM; n += N_STEP_SIZE) {
+        /* Parse configs */
+        // NOTE(review): load_config() already calls update_config(), and update_config()
+        // rescales AC_M_star in place, so the second call below rescales it again - confirm
+        // that this is harmless for the benchmark.
+        AcMeshInfo mesh_info;
+        load_config(&mesh_info);
+        mesh_info.int_params[AC_nx] = n;
+        mesh_info.int_params[AC_ny] = mesh_info.int_params[AC_nx];
+        mesh_info.int_params[AC_nz] = mesh_info.int_params[AC_nx];
+        update_config(&mesh_info);
+
+        AcMesh* mesh = acmesh_create(mesh_info);
+        acmesh_init_to(INIT_TYPE_ABC_FLOW, mesh);
+
+        acInit(mesh_info);
+        acLoad(*mesh);
+
+        std::vector<double> results;
+        results.reserve(NUM_ITERS);
+
+        // Warmup
+        for (int i = 0; i < 10; ++i) {
+            acIntegrate(0);
+            acSynchronize();
+        }
+
+        Timer t;
+        for (int i = 0; i < NUM_ITERS; ++i) {
+
+            timer_reset(&t);
+            #if GEN_BENCHMARK_RK3 == 1
+            acIntegrateStep(2, FLT_EPSILON);
+            #else // GEN_BENCHMARK_FULL
+            //const AcReal umax = acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
+            const AcReal dt   = AcReal(1e-2); // TODO adaptive timestep //host_timestep(umax, mesh_info);
+            acIntegrate(dt);
+            #endif
+            // The timed interval ends only after the device has finished the step
+            acSynchronize();
+
+            const double ms_elapsed = timer_diff_nsec(t) / 1e6;
+            results.push_back(ms_elapsed);
+        }
+
+        #define NTH_PERCENTILE (0.95)
+        std::sort(results.begin(), results.end(), smaller_than);
+        const double fastest    = results[0];
+        const double slowest    = results[results.size()-1];
+        const double median     = results[int(0.5 * NUM_ITERS)];
+        const double percentile = results[int(NTH_PERCENTILE * NUM_ITERS)];
+
+        if (write_runningtimes(runningtime_path, n, fastest, slowest, median, percentile) != EXIT_SUCCESS) {
+            fprintf(stderr, "Failed to write benchmark results to %s\n", runningtime_path);
+            status = EXIT_FAILURE;
+        }
+
+        char percentile_path[256];
+        snprintf(percentile_path, sizeof(percentile_path), "%d_%s_%s_percentiles.out", n, AC_DOUBLE_PRECISION ? "double" : "float", GEN_BENCHMARK_RK3 ? "rk3substep" : "fullstep");
+        if (write_percentiles(percentile_path, NUM_ITERS, results) != EXIT_SUCCESS) {
+            fprintf(stderr, "Failed to write benchmark results to %s\n", percentile_path);
+            status = EXIT_FAILURE;
+        }
+
+        printf("%s running time %g ms, (%dth percentile, nx = %d) \n", GEN_BENCHMARK_RK3 ? "RK3 step" : "Fullstep", double(percentile), int(NTH_PERCENTILE * 100), mesh_info.int_params[AC_nx]);
+
+        acStore(mesh);
+        acQuit();
+        acmesh_destroy(mesh);
+    }
+
+    return status;
+}
+
+/*
+
+#if AUTO_OPTIMIZE
+const char* benchmark_path = "benchmark.out";
+
+#include "core/kernels/rk3_threadblock.conf"
+static int
+write_result_to_file(const float& ms_per_step)
+{
+    FILE* fp;
+    fp = fopen(benchmark_path, "a");
+
+    if (fp != NULL) {
+        fprintf(fp,
+                "(%d, %d, %d), %d elems per thread, launch bound %d, %f ms\n",
+                RK_THREADS_X, RK_THREADS_Y, RK_THREADS_Z, RK_ELEMS_PER_THREAD,
+                RK_LAUNCH_BOUND_MIN_BLOCKS, double(ms_per_step));
+        fclose(fp);
+        return EXIT_SUCCESS;
+    }
+    return EXIT_FAILURE;
+}
+#endif
+
+#if GENERATE_BENCHMARK_DATA != 1
+int
+run_benchmark(void)
+{
+    // Parse configs
+    AcMeshInfo mesh_info;
+    load_config(&mesh_info);
+    mesh_info.int_params[AC_nx] = 128;
+    mesh_info.int_params[AC_ny] = mesh_info.int_params[AC_nx];
+    mesh_info.int_params[AC_nz] = mesh_info.int_params[AC_nx];
+    update_config(&mesh_info);
+
+    AcMesh* mesh = acmesh_create(mesh_info);
+    acmesh_init_to(INIT_TYPE_ABC_FLOW, mesh);
+
+    acInit(mesh_info);
+    acLoad(*mesh);
+
+    Timer t;
+    timer_reset(&t);
+
+    int steps           = 0;
+    const int num_steps = 100;
+    while (steps < num_steps) {
+        // Advance the simulation
+        const AcReal umax = acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY,
+                                        VTXBUF_UUZ);
+        const AcReal dt   = host_timestep(umax, mesh_info);
+        acIntegrate(dt);
+        ++steps;
+    }
+    acSynchronize();
+    const float wallclock = timer_diff_nsec(t) / 1e9f;
+    printf("%d steps. Wallclock time %f s per step\n", steps,
+           double(wallclock) / num_steps);
+    #if AUTO_OPTIMIZE
+    write_result_to_file(wallclock * 1e3f / steps);
+    #endif
+
+    acStore(mesh);
+    acQuit();
+    acmesh_destroy(mesh);
+
+    return 0;
+}
+
+#else //////////////////////////////////////////////////////////////////////////GENERATE_BENCHMARK_DATA
+
+
+
+
+int
+run_benchmark(void)
+{
+    const char path[] = "result.out";
+    FILE* fp;
+    fp = fopen(path, "w");
+
+    if (fp != NULL) {
+        fprintf(fp, "n, min, max, median, perc\n");
+        fclose(fp);
+    } else {
+        return EXIT_FAILURE;
+    }
+
+    #define N_STEP_SIZE (256)
+    #define MAX_MESH_DIM (256)
+    #define NUM_ITERS (1000)
+    for (int n = N_STEP_SIZE; n <= MAX_MESH_DIM; n += N_STEP_SIZE) {
+        // Parse configs
+        AcMeshInfo mesh_info;
+        load_config(&mesh_info);
+        mesh_info.int_params[AC_nx] = n;
+        mesh_info.int_params[AC_ny] = mesh_info.int_params[AC_nx];
+        mesh_info.int_params[AC_nz] = mesh_info.int_params[AC_nx];
+        update_config(&mesh_info);
+
+        AcMesh* mesh = acmesh_create(mesh_info);
+        acmesh_init_to(INIT_TYPE_ABC_FLOW, mesh);
+
+        acInit(mesh_info);
+        acLoad(*mesh);
+
+        std::vector<double> results;
+        results.reserve(NUM_ITERS);
+
+
+        // Warmup
+        for (int i = 0; i < 10; ++i) {
+            acIntegrate(0);
+            acSynchronize();
+        }
+
+        Timer t;
+
+        const AcReal dt = AcReal(1e-5);
+        for (int i = 0; i < NUM_ITERS; ++i) {
+
+            timer_reset(&t);
+            //acIntegrate(dt);
+            acIntegrateStep(2, dt);
+            acSynchronize();
+
+            const double ms_elapsed = timer_diff_nsec(t) / 1e6;
+            results.push_back(ms_elapsed);
+        }
+
+
+
+        #define NTH_PERCENTILE (0.95)
+        std::sort(results.begin(), results.end(), smaller_than);
+        write_result(n, results[0], results[results.size()-1], results[int(0.5 * NUM_ITERS)], results[int(NTH_PERCENTILE * NUM_ITERS)]);
+        write_percentiles(n, NUM_ITERS, results);
+    }
+
+    return 0;
+}
+#endif
+*/
--- a/src/standalone/config_loader.cc
+++ b/src/standalone/config_loader.cc
@@ -0,0 +1,194 @@
+/*
+    Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
+ * \brief Brief info.
+ *
+ * Detailed info.
+ *
+ */
+#include "config_loader.h"
+
+#include <limits.h> // UINT_MAX
+#include <stdint.h> // uint8_t, uint32_t
+#include <stdio.h>  // print
+#include <string.h> // memset
+
+#include "core/errchk.h"
+#include "core/math_utils.h"
+
+/** Prints every parameter of config as "[name]: value", integer parameters first. */
+static inline void
+print(const AcMeshInfo& config)
+{
+    int param = 0;
+    while (param < NUM_INT_PARAM_TYPES) {
+        printf("[%s]: %d\n", intparam_names[param], config.int_params[param]);
+        ++param;
+    }
+
+    param = 0;
+    while (param < NUM_REAL_PARAM_TYPES) {
+        // Reals are widened to double so that %g is valid for either precision
+        printf("[%s]: %g\n", realparam_names[param], double(config.real_params[param]));
+        ++param;
+    }
+}
+
+/**
+ \brief Looks up keyword among the first n entries of names.
+ \return Index in range 0...n-1 of the first entry equal to keyword, or -1 if
+ no entry matches.
+ */
+static int
+find_str(const char keyword[], const char* names[], const int& n)
+{
+    int found = -1;
+    for (int i = 0; i < n && found < 0; ++i) {
+        if (strcmp(keyword, names[i]) == 0)
+            found = i;
+    }
+    return found;
+}
+
+/**
+ * Reads "keyword = value" pairs from the file at path and stores every value whose keyword
+ * is listed in intparam_names or realparam_names into the matching slot of config.
+ * Pairs with an unknown keyword and input that does not match the pattern are skipped.
+ *
+ * The path is printed before the result of fopen is passed to ERRCHK.
+ */
+static void
+parse_config(const char* path, AcMeshInfo* config)
+{
+    FILE* fp;
+    fp = fopen(path, "r");
+    // For knowing which .conf file will be used 
+    printf("Config file path: \n %s \n ", path);
+    ERRCHK(fp != NULL);
+
+    // The field widths in the format string below must stay at BUF_SIZE - 1 so that an
+    // overlong token is split instead of overflowing the buffers
+    const size_t BUF_SIZE = 128;
+    char keyword[BUF_SIZE];
+    char value[BUF_SIZE];
+    int items_matched;
+    while ((items_matched = fscanf(fp, "%127s = %127s", keyword, value)) != EOF) {
+
+        // Not a complete "keyword = value" pair, try again from the next token
+        if (items_matched < 2)
+            continue;
+
+        int idx = -1;
+        if ((idx = find_str(keyword, intparam_names, NUM_INT_PARAM_TYPES)) >= 0)
+            config->int_params[idx] = atoi(value);
+        else if ((idx = find_str(keyword, realparam_names,
+                                 NUM_REAL_PARAM_TYPES)) >= 0)
+            config->real_params[idx] = AcReal(atof(value));
+    }
+
+    fclose(fp);
+}
+
+/**
+ * Recomputes the derived entries of config from the primary values stored in it:
+ * padded mesh dimensions, bounds of the computational domain, inverse grid spacings,
+ * domain lengths and origin, element counts, sound speed helpers and the star parameters
+ * expressed in code units. Finally prints the whole config.
+ *
+ * NOTE(review): AC_M_star is read and then overwritten with its rescaled value, so every
+ * additional call rescales it once more (load_config() already calls this function once).
+ */
+void
+update_config(AcMeshInfo* config)
+{
+    int* const ip    = config->int_params;
+    AcReal* const rp = config->real_params;
+
+    // Mesh dimensions including the ghost zones
+    ip[AC_mx] = ip[AC_nx] + STENCIL_ORDER;
+    ///////////// PAD TEST
+    //ip[AC_mx] = ip[AC_nx] + STENCIL_ORDER + PAD_SIZE;
+    ///////////// PAD TEST
+    ip[AC_my] = ip[AC_ny] + STENCIL_ORDER;
+    ip[AC_mz] = ip[AC_nz] + STENCIL_ORDER;
+
+    // Bounds for the computational domain, i.e. nx_min <= i < nx_max
+    const int first_cell = STENCIL_ORDER / 2;
+    ip[AC_nx_min] = first_cell;
+    ip[AC_ny_min] = first_cell;
+    ip[AC_nz_min] = first_cell;
+    ip[AC_nx_max] = first_cell + ip[AC_nx];
+    ip[AC_ny_max] = first_cell + ip[AC_ny];
+    ip[AC_nz_max] = first_cell + ip[AC_nz];
+
+    // Spacing
+    const AcReal one = AcReal(1.);
+    rp[AC_inv_dsx] = one / rp[AC_dsx];
+    rp[AC_inv_dsy] = one / rp[AC_dsy];
+    rp[AC_inv_dsz] = one / rp[AC_dsz];
+    rp[AC_dsmin]   = min(rp[AC_dsx], min(rp[AC_dsy], rp[AC_dsz]));
+
+    // Real grid coordinates (defined for the grid WITH the ghost zones)
+    rp[AC_xlen] = rp[AC_dsx] * ip[AC_mx];
+    rp[AC_ylen] = rp[AC_dsy] * ip[AC_my];
+    rp[AC_zlen] = rp[AC_dsz] * ip[AC_mz];
+
+    const AcReal half = AcReal(.5);
+    rp[AC_xorig] = half * rp[AC_xlen];
+    rp[AC_yorig] = half * rp[AC_ylen];
+    rp[AC_zorig] = half * rp[AC_zlen];
+
+    /* Additional helper params */
+    // Int helpers
+    ip[AC_mxy]  = ip[AC_mx] * ip[AC_my];
+    ip[AC_nxy]  = ip[AC_nx] * ip[AC_ny];
+    ip[AC_nxyz] = ip[AC_nxy] * ip[AC_nz];
+
+    // Real helpers
+    rp[AC_cs2_sound] = rp[AC_cs_sound] * rp[AC_cs_sound];
+    rp[AC_cv_sound]  = rp[AC_cp_sound] / rp[AC_gamma];
+
+    // TODO define in a separate module
+    const AcReal G_CONST_CGS = AcReal(6.674e-8); // Gravitational constant, CGS units
+    const AcReal M_sun       = AcReal(1.989e33); // Solar mass in grams
+
+    const AcReal unit_length   = rp[AC_unit_length];
+    const AcReal unit_density  = rp[AC_unit_density];
+    const AcReal unit_velocity = rp[AC_unit_velocity];
+
+    // Star mass: scaled by M_sun, then divided by unit_length^3 * unit_density
+    rp[AC_M_star] = (rp[AC_M_star] * M_sun) /
+                    ((unit_length * unit_length * unit_length) * unit_density);
+
+    rp[AC_G_CONST] = G_CONST_CGS /
+                     ((unit_velocity * unit_velocity) / (unit_density * unit_length));
+
+    rp[AC_GM_star]    = rp[AC_M_star] * rp[AC_G_CONST];
+    rp[AC_sq2GM_star] = AcReal(sqrt(AcReal(2) * rp[AC_GM_star]));
+
+    const bool print_config = true;
+    if (print_config) {
+        const char* const rule = "###############################################################"
+                                 "\n";
+        printf("%s", rule);
+        printf("Config dimensions recalculated:\n");
+        print(*config);
+        printf("%s", rule);
+    }
+}
+
+/**
+\brief Loads data from astaroth.conf into a config struct and recalculates the
+derived parameters with update_config().
+\return 0 on success, -1 if there are potentially uninitialized values. A single
+warning is printed in the latter case.
+*/
+int
+load_config(AcMeshInfo* config)
+{
+    // Fill the struct with 0xFF bytes so that words which are never assigned below keep a
+    // recognizable pattern. memset converts its int argument to unsigned char.
+    memset(config, 0xFF, sizeof(*config));
+
+    parse_config(CONFIG_PATH "astaroth.conf", config);
+    update_config(config);
+
+    // sizeof(config) must be a multiple of 4 bytes for this to work
+    ERRCHK(sizeof(*config) % sizeof(uint32_t) == 0);
+
+    const unsigned char* bytes = (const unsigned char*)config;
+    for (size_t offset = 0; offset < sizeof(*config); offset += sizeof(uint32_t)) {
+        // Copy the word out instead of reading the struct through a uint32_t pointer
+        uint32_t word;
+        memcpy(&word, bytes + offset, sizeof(word));
+
+        if (word == (uint32_t)0xFFFFFFFF) {
+            WARNING("Some config values may be uninitialized. "
+                    "See that all are defined in astaroth.conf\n");
+            // One warning is enough, the remaining words cannot change the result
+            return -1;
+        }
+    }
+    return 0;
+}
--- a/src/standalone/config_loader.h
+++ b/src/standalone/config_loader.h
@@ -0,0 +1,34 @@
+/*
+    Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
+ * \brief Functions for loading and updating AcMeshInfo.
+ *
+ */
+#pragma once
+#include "astaroth.h"
+
+/** Loads data from the config file into config and recalculates the derived parameters with
+ * update_config(). Returns 0 on success and -1 if some values may have been left
+ * uninitialized. */
+int load_config(AcMeshInfo* config);
+
+/** Recalculates the int and real parameters which are derived from other entries of the
+ * struct (for example the padded mesh dimensions obtained from nx, ny and nz). Must be called
+ * after modifying the config struct or otherwise contents of the struct will be incorrect */
+void update_config(AcMeshInfo* config);
--- a/src/standalone/main.cc
+++ b/src/standalone/main.cc
@@ -0,0 +1,94 @@
+/*
+    Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
+ * \brief Brief info.
+ *
+ * Detailed info.
+ *
+ */
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "core/errchk.h"
+#include "run.h"
+
+// When write_log_to_a_file is true, main() redirects stderr to <errorlog_name> in the
+// current working directory and registers errorlog_quit(), which echoes the contents of
+// that file to stdout, to run at exit
+static const bool write_log_to_a_file = false;
+static const char* errorlog_name      = "error.log";
+
+/** Reopens stderr as the file errorlog_name (truncating it); a failure is reported with perror. */
+static void
+errorlog_init(void)
+{
+    if (freopen(errorlog_name, "w", stderr) == NULL)
+        perror("Error redirecting stderr to a file");
+}
+
+/**
+ * Closes stderr and copies the contents of the file errorlog_name to stdout.
+ * Registered with atexit() by main() when logging to a file is enabled.
+ */
+static void
+errorlog_quit(void)
+{
+    fclose(stderr);
+
+    // Print contents of the latest errorlog to screen
+    FILE* fp = fopen(errorlog_name, "r");
+    if (fp) {
+        for (int c = getc(fp); c != EOF; c = getc(fp))
+            putchar(c);
+        fclose(fp);
+    }
+    else {
+        // stderr has been closed above, so the failure is reported on stdout instead of
+        // through perror()
+        printf("Error opening error log: %s\n", strerror(errno));
+    }
+}
+
+/**
+ * Entry point. Echoes the command line arguments and dispatches on them:
+ *   no option  -> run_renderer()
+ *   -t         -> run_autotest()
+ *   -b         -> run_benchmark()
+ *   -s         -> run_simulation()
+ * Anything else produces a warning and EXIT_FAILURE.
+ */
+int
+main(int argc, char* argv[])
+{
+    if (write_log_to_a_file) {
+        errorlog_init();
+        atexit(errorlog_quit);
+    }
+
+    printf("Args: \n");
+    for (int arg = 0; arg < argc; ++arg)
+        printf("%d: %s\n", arg, argv[arg]);
+
+    // Without options the renderer is started
+    if (argc == 1)
+        return run_renderer();
+
+    if (argc != 2) {
+        WARNING("Too many options given");
+        return EXIT_FAILURE;
+    }
+
+    const char* option = argv[1];
+    if (!strcmp(option, "-t"))
+        return run_autotest();
+    if (!strcmp(option, "-b"))
+        return run_benchmark();
+    if (!strcmp(option, "-s"))
+        return run_simulation();
+
+    WARNING("Unrecognized option");
+    return EXIT_FAILURE;
+}
--- a/src/standalone/model/host_memory.cc
+++ b/src/standalone/model/host_memory.cc
@@ -0,0 +1,737 @@
+/*
+    Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
+ * \brief Brief info.
+ *
+ * Detailed info.
+ *
+ */
+#include "host_memory.h"
+
+#include <math.h>
+
+#include "core/errchk.h"
+
+// Table of strings produced by expanding AC_FOR_INIT_TYPES with AC_GEN_STR; elsewhere it is
+// indexed with an InitType value
+const char* init_type_names[] = {AC_FOR_INIT_TYPES(AC_GEN_STR)};
+
+// Half of (number of computational cells * grid spacing) along each axis. The macros expand
+// to expressions that use a variable named `mesh`, which must be in scope at the point of use.
+#define XORIG (AcReal(.5) * mesh->info.int_params[AC_nx] * mesh->info.real_params[AC_dsx])
+#define YORIG (AcReal(.5) * mesh->info.int_params[AC_ny] * mesh->info.real_params[AC_dsy])
+#define ZORIG (AcReal(.5) * mesh->info.int_params[AC_nz] * mesh->info.real_params[AC_dsz])
+
+/*
+#include <stdint.h>
+static uint64_t ac_rand_next = 1;
+
+static int32_t
+ac_rand(void)
+{
+	ac_rand_next = ac_rand_next * 1103515245 + 12345;
+	return (uint32_t)(ac_rand_next/65536) % 32768;
+}
+
+static void
+ac_srand(const uint32_t seed)
+{
+	ac_rand_next = seed;	
+}
+*/
+
+/**
+ * Allocates a host mesh: the AcMesh struct itself plus one buffer of
+ * AC_VTXBUF_SIZE_BYTES(mesh_info) bytes for each of the NUM_VTXBUF_HANDLES vertex buffers.
+ * mesh_info is copied into the mesh. The buffer contents are left uninitialized.
+ *
+ * Every allocation is verified with ERRCHK before it is used.
+ *
+ * \return Pointer to the new mesh.
+ */
+AcMesh*
+acmesh_create(const AcMeshInfo& mesh_info)
+{
+    AcMesh* mesh = (AcMesh*)malloc(sizeof(*mesh));
+    // The struct is written to right below, so a failed allocation must be caught first
+    ERRCHK(mesh != NULL);
+    mesh->info   = mesh_info;
+
+    const size_t bytes = AC_VTXBUF_SIZE_BYTES(mesh->info);
+    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
+        mesh->vertex_buffer[VertexBufferHandle(i)] = (AcReal*)malloc(bytes);
+        ERRCHK(mesh->vertex_buffer[VertexBufferHandle(i)] != NULL);
+    }
+
+    return mesh;
+}
+
+/** Assigns val to each of the AC_VTXBUF_SIZE(mesh->info) elements of the vertex buffer key. */
+static void
+vertex_buffer_set(const VertexBufferHandle& key, const AcReal& val,
+                  AcMesh* mesh)
+{
+    AcReal* const field = mesh->vertex_buffer[key];
+    const int count     = AC_VTXBUF_SIZE(mesh->info);
+
+    int elem = 0;
+    while (elem < count) {
+        field[elem] = val;
+        ++elem;
+    }
+}
+
+
+/** Fills every vertex buffer of the mesh with 1. Zero is avoided as the default because some
+    fields are supposed to be > 0 and the results would vary widely, which leads to loss of
+    precision in the computations */
+void
+acmesh_clear(AcMesh* mesh)
+{
+    const AcReal default_value = 1; // Init all fields to 1 by default.
+
+    int handle = 0;
+    while (handle < NUM_VTXBUF_HANDLES) {
+        vertex_buffer_set(VertexBufferHandle(handle), default_value, mesh);
+        ++handle;
+    }
+}
+
+/** Returns rand() divided by RAND_MAX, i.e. a pseudorandom real number in [0, 1]. */
+static AcReal
+randr(void)
+{
+    const AcReal sample = AcReal(rand());
+    return sample / AcReal(RAND_MAX);
+}
+
+
+/**
+ * Sets VTXBUF_LNRHO to a constant value (currently 0) in every cell of the mesh, ghost zones
+ * included.
+ *
+ * The step-function initial condition this function is named after is disabled. For reference,
+ * it was computed per cell as
+ *     zz        = DZ * k - zorig
+ *     cosz_wave = ampl_pert * cos(k_pert * ((zz / zmax) * two_pi))
+ *     xx        = DX * i - xorig + cosz_wave
+ *     tanhprof  = 0.5 * ((rho2 + rho1) + (rho2 - rho1) * tanh(xx / trans))
+ *     lnrho     = log(tanhprof)
+ * with rho1 = exp(lnrho1), rho2 = exp(lnrho2), lnrho1 = -1, zmax = DZ * (nz_max - nz_min).
+ */
+void
+lnrho_step(AcMesh* mesh)
+{
+    const int mx = mesh->info.int_params[AC_mx];
+    const int my = mesh->info.int_params[AC_my];
+    const int mz = mesh->info.int_params[AC_mz];
+
+    const AcReal lnrho2 = (AcReal) 0.0; // TODO mesh->info.real_params[AC_lnrho2];
+
+    AcReal* const lnrho = mesh->vertex_buffer[VTXBUF_LNRHO];
+
+    for (int k = 0; k < mz; ++k) {
+        for (int j = 0; j < my; ++j) {
+            // Index of the first cell of row (j, k)
+            const int row_start = j * mx + k * mx * my;
+            for (int i = 0; i < mx; ++i)
+                lnrho[row_start + i] = lnrho2;
+        }
+    }
+}
+
+// Initial condition for the infalling vedge in the pseudodisk model.
+//
+// Writes the velocity field of every cell (ghost zones included) from a tanh profile in z:
+//     zz  = dsz * k - zorig
+//     UUX = -ampl_uu * cos(angl_uu) * |tanh(zz / trans)|
+//     UUY = 0
+//     UUZ = -ampl_uu * sin(angl_uu) * tanh(zz / trans)
+// The density is not modified; an earlier random density variation
+//     lnrho = log(rho + (range*rho) * (randr() - AcReal(-0.5)))
+// is disabled.
+void
+inflow_vedge(AcMesh* mesh)
+{
+    const int mx = mesh->info.int_params[AC_mx];
+    const int my = mesh->info.int_params[AC_my];
+    const int mz = mesh->info.int_params[AC_mz];
+
+    const double dz    = mesh->info.real_params[AC_dsz];
+    const double zorig = mesh->info.real_params[AC_zorig];
+    const double trans = mesh->info.real_params[AC_trans];
+
+    const double amplitude = mesh->info.real_params[AC_ampl_uu];
+    const double angle     = mesh->info.real_params[AC_angl_uu];
+
+    // Velocity scales, identical for every cell
+    const double ux_scale = -amplitude*cos(angle);
+    const double uz_scale = -amplitude*sin(angle);
+
+    AcReal* const uux = mesh->vertex_buffer[VTXBUF_UUX];
+    AcReal* const uuy = mesh->vertex_buffer[VTXBUF_UUY];
+    AcReal* const uuz = mesh->vertex_buffer[VTXBUF_UUZ];
+
+    for (int k = 0; k < mz; ++k) {
+        // The profile depends on the z index only
+        const double zz      = dz * double(k) - zorig;
+        const double profile = tanh(zz/trans);
+        const AcReal ux      = AcReal(ux_scale*fabs(profile));
+        const AcReal uz      = AcReal(uz_scale*profile);
+
+        for (int j = 0; j < my; ++j) {
+            for (int i = 0; i < mx; ++i) {
+                const int idx = i + j * mx + k * mx * my;
+                uux[idx] = ux;
+                uuy[idx] = AcReal(0.0);
+                uuz[idx] = uz;
+            }
+        }
+    }
+}
+
+// Initial condition for the infalling vedge in the pseudodisk model, free fall variant.
+//
+// For every cell (ghost zones included) the distance RR to the star in the xz-plane gives the
+// free fall speed sq2GM_star / sqrt(RR), directed towards the star. The resulting (u_x, u_z)
+// is rotated by angl_uu, with the opposite sense of rotation below the star (delz < 0), and
+// stored in UUX/UUZ. UUY is set to zero.
+//
+// NOTE(review): RR is 0 for a cell located exactly at the star position, which makes the
+// divisions below produce non-finite velocities there.
+void
+inflow_vedge_freefall(AcMesh* mesh)
+{
+    const int mx = mesh->info.int_params[AC_mx];
+    const int my = mesh->info.int_params[AC_my];
+    const int mz = mesh->info.int_params[AC_mz];
+
+    const double dx = mesh->info.real_params[AC_dsx];
+    const double dz = mesh->info.real_params[AC_dsz];
+
+    const double xorig = mesh->info.real_params[AC_xorig];
+    const double zorig = mesh->info.real_params[AC_zorig];
+
+    const double star_x = mesh->info.real_params[AC_star_pos_x];
+    const double star_z = mesh->info.real_params[AC_star_pos_z];
+
+    const double sq2gm = mesh->info.real_params[AC_sq2GM_star];
+    const double angle = mesh->info.real_params[AC_angl_uu];
+    const double cos_a = cos(angle);
+    const double sin_a = sin(angle);
+
+    // TODO: Figure out whether the damping fabs(tanh(zz/trans)) is needed. Now a placeholder.
+    const double tanhz = 1.0;
+
+    AcReal* const uux = mesh->vertex_buffer[VTXBUF_UUX];
+    AcReal* const uuy = mesh->vertex_buffer[VTXBUF_UUY];
+    AcReal* const uuz = mesh->vertex_buffer[VTXBUF_UUZ];
+
+    for (int k = 0; k < mz; ++k) {
+        const double zz   = dz * double(k) - zorig;
+        const double delz = zz - star_z;
+
+        for (int j = 0; j < my; ++j) {
+            for (int i = 0; i < mx; ++i) {
+                const int idx = i + j * mx + k * mx * my;
+
+                const double xx   = dx * double(i) - xorig;
+                const double delx = xx - star_x;
+
+                const double RR     = sqrt(delx*delx + delz*delz);
+                const double veltot = SQ2GM_PLACEHOLDER;
+
+                // Normal velocity components
+                const double u_x = - veltot*(delx/RR);
+                const double u_z = - veltot*(delz/RR);
+
+                // Here including an angle tilt due to the pseudodisk
+                double ux_tilted, uz_tilted;
+                if (delz >= 0.0) {
+                    ux_tilted = u_x*cos_a - u_z*sin_a;
+                    uz_tilted = u_x*sin_a + u_z*cos_a;
+                } else {
+                    ux_tilted = u_x*cos_a + u_z*sin_a;
+                    uz_tilted = -u_x*sin_a + u_z*cos_a;
+                }
+
+                uux[idx] = AcReal(ux_tilted*tanhz);
+                uuy[idx] = AcReal(0.0);
+                uuz[idx] = AcReal(uz_tilted*tanhz);
+            }
+        }
+    }
+}
+
+/**
+ * Initial condition for the x-direction free-fall setup.
+ *
+ * Every vertex of the mx * my * mz buffers gets zero velocity and the uniform
+ * logarithmic density AC_ampl_lnrho (the run starts from a steady state). The
+ * free-fall speed AC_sq2GM_star / sqrt(|x - AC_star_pos_x|) is evaluated only
+ * as a sanity check: a diagnostic line is printed for every vertex where it
+ * is infinite, which happens when x coincides with the star position.
+ *
+ * @param mesh Mesh whose UUX, UUY, UUZ and LNRHO buffers are overwritten.
+ */
+void
+inflow_freefall_x(AcMesh* mesh)
+{
+    const int mx = mesh->info.int_params[AC_mx];
+    const int my = mesh->info.int_params[AC_my];
+    const int mz = mesh->info.int_params[AC_mz];
+
+    const double DX         = mesh->info.real_params[AC_dsx];
+    const double SQ2GM      = mesh->info.real_params[AC_sq2GM_star];
+    const double xorig      = mesh->info.real_params[AC_xorig];
+    const double star_pos_x = mesh->info.real_params[AC_star_pos_x];
+    const double ampl_lnrho = mesh->info.real_params[AC_ampl_lnrho];
+
+    for (int k = 0; k < mz; k++) {
+        for (int j = 0; j < my; j++) {
+            for (int i = 0; i < mx; i++) {
+                const int idx = i + j * mx + k * mx * my;
+
+                // Distance from the star along x
+                const double xx   = DX * double(i) - xorig;
+                const double delx = xx - star_pos_x;
+                const double RR   = fabs(delx);
+
+                // Free fall velocity; only used for the sanity check below
+                const double veltot = SQ2GM / sqrt(RR);
+
+                // isinf() is only guaranteed to return non-zero for infinities,
+                // so test its truth value instead of comparing against 1
+                if (isinf(veltot))
+                    printf("xx %e star_pos_x %e delx %e RR %e veltot %e\n", xx, star_pos_x, delx, RR, veltot);
+
+                // Starting with steady state. For an actual free-fall start,
+                // UUX would be set to -veltot instead.
+                mesh->vertex_buffer[VTXBUF_UUX][idx] = AcReal(0.0);
+                mesh->vertex_buffer[VTXBUF_UUY][idx] = AcReal(0.0);
+                mesh->vertex_buffer[VTXBUF_UUZ][idx] = AcReal(0.0);
+
+                mesh->vertex_buffer[VTXBUF_LNRHO][idx] = AcReal(ampl_lnrho);
+            }
+        }
+    }
+}
+
+
+
+/**
+ * Initializes the velocity field to a radially outward "explosion" whose
+ * speed follows a Gaussian shell profile
+ *
+ *     u_r(r) = AC_ampl_uu * exp(-(r - UU_SHELL_R)^2 / (2 * WIDTH_UU^2)).
+ *
+ * The radial speed is converted to Cartesian components through the spherical
+ * angles theta (polar, [0, PI]) and phi (azimuth, [0, 2*PI]). Only vertices
+ * with indices in [n*_min, n*_max) are written; the rest of the velocity
+ * buffers and all other buffers are left untouched. The velocity is set to
+ * zero at r = 0, where the direction is undefined.
+ *
+ * Out-of-range angles are reported with printf but do not abort.
+ *
+ * @param mesh Mesh whose UUX, UUY and UUZ buffers are written.
+ */
+void
+gaussian_radial_explosion(AcMesh* mesh)
+{
+    AcReal* uu_x = mesh->vertex_buffer[VTXBUF_UUX];
+    AcReal* uu_y = mesh->vertex_buffer[VTXBUF_UUY];
+    AcReal* uu_z = mesh->vertex_buffer[VTXBUF_UUZ];
+
+    const int mx = mesh->info.int_params[AC_mx];
+    const int my = mesh->info.int_params[AC_my];
+
+    const int nx_min = mesh->info.int_params[AC_nx_min];
+    const int nx_max = mesh->info.int_params[AC_nx_max];
+    const int ny_min = mesh->info.int_params[AC_ny_min];
+    const int ny_max = mesh->info.int_params[AC_ny_max];
+    const int nz_min = mesh->info.int_params[AC_nz_min];
+    const int nz_max = mesh->info.int_params[AC_nz_max];
+
+    const double DX = mesh->info.real_params[AC_dsx];
+    const double DY = mesh->info.real_params[AC_dsy];
+    const double DZ = mesh->info.real_params[AC_dsz];
+
+    // NOTE(review): the small shift presumably keeps grid points from landing
+    // exactly on the coordinate origin -- confirm
+    const double xorig = double(XORIG) - 0.000001;
+    const double yorig = double(YORIG) - 0.000001;
+    const double zorig = double(ZORIG) - 0.000001;
+
+    // Centre of the explosion
+    const double INIT_LOC_UU_X = 0.0;
+    const double INIT_LOC_UU_Y = 0.0;
+    const double INIT_LOC_UU_Z = 0.0;
+
+    const double AMPL_UU    = mesh->info.real_params[AC_ampl_uu];
+    const double UU_SHELL_R = 0.8; // TODO: Parametrize the peak location.
+    const double WIDTH_UU   = 0.2;
+
+    for (int k = nz_min; k < nz_max; k++) {
+        for (int j = ny_min; j < ny_max; j++) {
+            for (int i = nx_min; i < nx_max; i++) {
+                const int idx = i + j * mx + k * mx * my;
+
+                // Coordinates relative to the centre of the explosion
+                const double xx = DX * (i - nx_min) - xorig - INIT_LOC_UU_X;
+                const double yy = DY * (j - ny_min) - yorig - INIT_LOC_UU_Y;
+                const double zz = DZ * (k - nz_min) - zorig - INIT_LOC_UU_Z;
+
+                const double rr2 = pow(xx, 2.0) + pow(yy, 2.0) + pow(zz, 2.0);
+                const double rr  = sqrt(rr2);
+
+                double theta     = 0.0;
+                double phi       = 0.0;
+                double uu_radial = 0.0; // TODO: There will be a discontinuity
+                                        // in the origin... Should the shape of
+                                        // the distribution be different?
+
+                if (rr > 0.0) {
+                    // Polar angle theta, range [0, PI]
+                    if (zz >= 0.0) {
+                        theta = acos(zz / rr);
+                        if (theta > M_PI / 2.0 || theta < 0.0) {
+                            printf("Explosion THETA WRONG: zz = %.3f, rr = "
+                                   "%.3f, theta = %.3e/PI, M_PI = %.3e\n",
+                                   zz, rr, theta / M_PI, M_PI);
+                        }
+                    }
+                    else {
+                        const double zz_abs = -zz; // Needs a positive value for acos
+                        theta               = M_PI - acos(zz_abs / rr);
+                        // Lower hemisphere: theta must lie in [PI/2, PI]
+                        if (theta < M_PI / 2.0 || theta > M_PI) {
+                            printf("Explosion THETA WRONG: zz = %.3f, rr = "
+                                   "%.3f, theta = %.3e/PI, M_PI = %.3e\n",
+                                   zz, rr, theta / M_PI, M_PI);
+                        }
+                    }
+
+                    // Azimuth phi, range [0, 2*PI]. Resolved quadrant by
+                    // quadrant with atan() on non-negative arguments.
+                    if (xx != 0.0) {
+                        if (xx < 0.0 && yy >= 0.0) {
+                            //-+
+                            const double xx_abs = -xx; // Needs a positive value for atan
+                            phi                 = M_PI - atan(yy / xx_abs);
+                            if (phi < (M_PI / 2.0) || phi > M_PI) {
+                                printf("Explosion PHI WRONG -+: xx = %.3f, yy "
+                                       "= %.3f, phi = %.3e/PI, M_PI = %.3e\n",
+                                       xx, yy, phi / M_PI, M_PI);
+                            }
+                        }
+                        else if (xx > 0.0 && yy < 0.0) {
+                            //+-
+                            const double yy_abs = -yy;
+                            phi                 = 2.0 * M_PI - atan(yy_abs / xx);
+                            if (phi < (3.0 * M_PI) / 2.0 ||
+                                phi > (2.0 * M_PI + 1e-6)) {
+                                printf("Explosion PHI WRONG +-: xx = %.3f, yy "
+                                       "= %.3f, phi = %.3e/PI, M_PI = %.3e\n",
+                                       xx, yy, phi / M_PI, M_PI);
+                            }
+                        }
+                        else if (xx < 0.0 && yy < 0.0) {
+                            //--
+                            const double yy_abs = -yy;
+                            const double xx_abs = -xx;
+                            phi                 = M_PI + atan(yy_abs / xx_abs);
+                            if (phi < M_PI ||
+                                phi > ((3.0 * M_PI) / 2.0 + 1e-6)) {
+                                printf("Explosion PHI WRONG --: xx = %.3f, yy "
+                                       "= %.3f, xx_abs = %.3f, yy_abs = %.3f, "
+                                       "phi = %.3e, (3.0*M_PI)/2.0 = %.3e\n",
+                                       xx, yy, xx_abs, yy_abs, phi,
+                                       (3.0 * M_PI) / 2.0);
+                            }
+                        }
+                        else {
+                            //++
+                            phi = atan(yy / xx);
+                            if (phi < 0 || phi > M_PI / 2.0) {
+                                printf("Explosion PHI WRONG ++: xx = %.3f, yy = "
+                                       "%.3f, phi = %.3e, M_PI/2.0 = %.3e\n",
+                                       xx, yy, phi, M_PI / 2.0);
+                            }
+                        }
+                    }
+                    else { // To avoid div by zero with atan
+                        if (yy > 0.0) {
+                            phi = M_PI / 2.0;
+                        }
+                        else if (yy < 0.0) {
+                            phi = (3.0 * M_PI) / 2.0;
+                        }
+                        else {
+                            phi = 0.0; // On the z axis the azimuth is arbitrary
+                        }
+                    }
+
+                    // Gaussian shell centred on r = UU_SHELL_R
+                    uu_radial = AMPL_UU * exp(-pow((rr - UU_SHELL_R), 2.0) /
+                                              (2.0 * pow(WIDTH_UU, 2.0)));
+                }
+
+                // Determine the Cartesian velocity components
+                uu_x[idx] = AcReal(uu_radial * sin(theta) * cos(phi));
+                uu_y[idx] = AcReal(uu_radial * sin(theta) * sin(phi));
+                uu_z[idx] = AcReal(uu_radial * cos(theta));
+            }
+        }
+    }
+}
+
+/**
+ * Fills the mesh with the initial condition selected by init_type and prints
+ * the global minimum and maximum over all vertex buffers.
+ *
+ * The random number generator is reseeded with a fixed seed on every call, so
+ * the random components are reproducible. Several initial conditions call
+ * this function recursively with INIT_TYPE_RANDOM to obtain a low-amplitude
+ * random background before overwriting selected buffers.
+ *
+ * @param init_type Initial condition to apply. ERROR() is invoked for values
+ *                  that are not handled.
+ * @param mesh      Mesh to initialize; all vertex buffers may be overwritten.
+ */
+void
+acmesh_init_to(const InitType& init_type, AcMesh* mesh)
+{
+    srand(123456789);
+
+    const int n = AC_VTXBUF_SIZE(mesh->info);
+
+    const int mx = mesh->info.int_params[AC_mx];
+    const int my = mesh->info.int_params[AC_my];
+    const int mz = mesh->info.int_params[AC_mz];
+
+    const int nx_min = mesh->info.int_params[AC_nx_min];
+    const int nx_max = mesh->info.int_params[AC_nx_max];
+    const int ny_min = mesh->info.int_params[AC_ny_min];
+    const int ny_max = mesh->info.int_params[AC_ny_max];
+    const int nz_min = mesh->info.int_params[AC_nz_min];
+    const int nz_max = mesh->info.int_params[AC_nz_max];
+
+    switch (init_type) {
+    case INIT_TYPE_RANDOM: {
+        acmesh_clear(mesh);
+        // Every buffer gets values of the form 2 * range * randr() - range
+        const AcReal range = AcReal(0.01);
+        for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
+            for (int i = 0; i < n; ++i)
+                mesh->vertex_buffer[w][i] = 2 * range * randr() - range;
+
+        break;
+    }
+    case INIT_TYPE_GAUSSIAN_RADIAL_EXPL:
+        acmesh_clear(mesh);
+        //acmesh_init_to(INIT_TYPE_RANDOM, mesh);
+        gaussian_radial_explosion(mesh);
+
+        break;
+    case INIT_TYPE_XWAVE:
+        acmesh_clear(mesh);
+        acmesh_init_to(INIT_TYPE_RANDOM, mesh);
+        for (int k = 0; k < mz; k++) {
+            for (int j = 0; j < my; j++) {
+                for (int i = 0; i < mx; i++) {
+                    int idx = i + j * mx + k * mx * my;
+                    // NOTE(review): the wave varies with the y index j but is
+                    // scaled by mx -- confirm this is intended
+                    mesh->vertex_buffer[VTXBUF_UUX][idx] = 2*AcReal(sin(j * AcReal(M_PI) / mx)) - 1;
+                }
+            }
+        }
+        break;
+    case INIT_TYPE_VEDGE:
+        acmesh_clear(mesh);
+        inflow_vedge_freefall(mesh);
+        break;
+    case INIT_TYPE_VEDGEX:
+        acmesh_clear(mesh);
+        inflow_freefall_x(mesh);
+        break;
+    case INIT_TYPE_RAYLEIGH_TAYLOR:
+        acmesh_clear(mesh);
+        inflow_freefall_x(mesh);
+        lnrho_step(mesh);
+        break;
+    case INIT_TYPE_ABC_FLOW: {
+        acmesh_clear(mesh);
+        acmesh_init_to(INIT_TYPE_RANDOM, mesh);
+        for (int k = nz_min; k < nz_max; k++) {
+            for (int j = ny_min; j < ny_max; j++) {
+                for (int i = nx_min; i < nx_max; i++) {
+                    const int idx = i + j * mx + k * mx * my;
+
+                    // Coordinates are taken without a half-cell offset
+                    const AcReal xx = (i - nx_min) * mesh->info.real_params[AC_dsx] - XORIG;
+                    const AcReal yy = (j - ny_min) * mesh->info.real_params[AC_dsy] - YORIG;
+                    const AcReal zz = (k - nz_min) * mesh->info.real_params[AC_dsz] - ZORIG;
+
+                    const AcReal ampl_uu = 0.5;
+                    const AcReal ABC_A   = 1.;
+                    const AcReal ABC_B   = 1.;
+                    const AcReal ABC_C   = 1.;
+                    const AcReal kx_uu   = 8.;
+                    const AcReal ky_uu   = 8.;
+                    const AcReal kz_uu   = 8.;
+
+                    mesh->vertex_buffer[VTXBUF_UUX][idx] = ampl_uu * (ABC_A * (AcReal)sin(kz_uu * zz) + ABC_C * (AcReal)cos(ky_uu * yy));
+                    mesh->vertex_buffer[VTXBUF_UUY][idx] = ampl_uu * (ABC_B * (AcReal)sin(kx_uu * xx) + ABC_A * (AcReal)cos(kz_uu * zz));
+                    mesh->vertex_buffer[VTXBUF_UUZ][idx] = ampl_uu * (ABC_C * (AcReal)sin(ky_uu * yy) + ABC_B * (AcReal)cos(kx_uu * xx));
+                }
+            }
+        }
+        break;
+    }
+    case INIT_TYPE_RAYLEIGH_BENARD: {
+        acmesh_init_to(INIT_TYPE_RANDOM, mesh);
+        #if LTEMPERATURE
+        vertex_buffer_set(VTXBUF_LNRHO, 1, mesh);
+        // Temperature grows linearly with the z index from 0.1 towards 1.0
+        const AcReal range = AcReal(0.9);
+        for (int k = nz_min; k < nz_max; k++) {
+            for (int j = ny_min; j < ny_max; j++) {
+                for (int i = nx_min; i < nx_max; i++) {
+                    const int idx = i + j * mx + k * mx * my;
+                    mesh->vertex_buffer[VTXBUF_TEMPERATURE][idx] = (range * (k - nz_min)) / mesh->info.int_params[AC_nz] + 0.1;
+                }
+            }
+        }
+        #else
+        WARNING("INIT_TYPE_RAYLEIGH_BENARD called even though VTXBUF_TEMPERATURE is not used");
+        #endif
+        break;
+    }
+    default:
+        ERROR("Unknown init_type");
+    }
+
+    // Report the value range over all buffers. The sentinels must be large in
+    // magnitude on both sides, otherwise an all-negative mesh would report a
+    // wrong maximum.
+    AcReal max_val = AcReal(-1e32);
+    AcReal min_val = AcReal(1e32);
+    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w) {
+        for (int i = 0; i < n; ++i) {
+            if (mesh->vertex_buffer[w][i] < min_val)
+                min_val = mesh->vertex_buffer[w][i];
+            if (mesh->vertex_buffer[w][i] > max_val)
+                max_val = mesh->vertex_buffer[w][i];
+        }
+    }
+    printf("MAX: %f MIN %f\n", double(max_val), double(min_val));
+
+    // Normalization of the grid to [-1, 1] is currently disabled
+    /*
+    const AcReal inv_range = AcReal(1.) / fabs(max_val - min_val);
+    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w) {
+        for (int i = 0; i < n; ++i) {
+            mesh->vertex_buffer[w][i] = 2*inv_range*(mesh->vertex_buffer[w][i] - min_val) - 1;
+        }
+    }
+    */
+}
+
+/**
+ * Releases a mesh: frees every vertex buffer and then the mesh structure
+ * itself. The pointer must not be used afterwards.
+ *
+ * @param mesh Mesh to destroy. Passing NULL is a no-op, mirroring free().
+ */
+void
+acmesh_destroy(AcMesh* mesh)
+{
+    // Guard against dereferencing a null mesh below
+    if (!mesh)
+        return;
+
+    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i)
+        free(mesh->vertex_buffer[VertexBufferHandle(i)]);
+
+    free(mesh);
+}
+
+
+/**
+ * Allocates a model mesh for the given mesh configuration.
+ *
+ * The mesh structure and one buffer of AC_VTXBUF_SIZE(mesh_info) elements per
+ * vertex buffer handle are allocated with malloc(); the buffer contents are
+ * left uninitialized. Allocation failures are caught with ERRCHK.
+ *
+ * @param mesh_info Configuration copied into the new mesh.
+ * @return The new mesh; release it with modelmesh_destroy().
+ */
+ModelMesh*
+modelmesh_create(const AcMeshInfo& mesh_info)
+{
+    ModelMesh* mesh = (ModelMesh*)malloc(sizeof(*mesh));
+    // Check before the first dereference, like the per-buffer allocations below
+    ERRCHK(mesh != NULL);
+    mesh->info = mesh_info;
+
+    const size_t bytes = AC_VTXBUF_SIZE(mesh->info) * sizeof(mesh->vertex_buffer[0][0]);
+    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
+        mesh->vertex_buffer[VertexBufferHandle(i)] = (ModelScalar*)malloc(bytes);
+        ERRCHK(mesh->vertex_buffer[VertexBufferHandle(i)] != NULL);
+    }
+
+    return mesh;
+}
+
+/**
+ * Releases a model mesh: frees every vertex buffer and then the mesh
+ * structure itself. The pointer must not be used afterwards.
+ *
+ * @param mesh Mesh to destroy. Passing NULL is a no-op, mirroring free().
+ */
+void
+modelmesh_destroy(ModelMesh* mesh)
+{
+    // Guard against dereferencing a null mesh below
+    if (!mesh)
+        return;
+
+    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i)
+        free(mesh->vertex_buffer[VertexBufferHandle(i)]);
+
+    free(mesh);
+}
+
+#include <string.h> // memcpy
+/**
+ * Copies an AcMesh into an already allocated ModelMesh.
+ *
+ * The mesh info is copied bytewise and every vertex buffer element is
+ * converted from AcReal to ModelScalar.
+ *
+ * @param acmesh    Source mesh.
+ * @param modelmesh Destination mesh; its buffers must hold at least
+ *                  AC_VTXBUF_SIZE(acmesh.info) elements each.
+ */
+void
+acmesh_to_modelmesh(const AcMesh& acmesh, ModelMesh* modelmesh)
+{
+    // The info structs are copied as raw bytes, so their sizes must agree
+    ERRCHK(sizeof(acmesh.info) == sizeof(modelmesh->info));
+    memcpy(&modelmesh->info, &acmesh.info, sizeof(acmesh.info));
+
+    const size_t num_vertices = AC_VTXBUF_SIZE(acmesh.info);
+    for (int handle = 0; handle < NUM_VTXBUF_HANDLES; ++handle) {
+        const AcReal* src = acmesh.vertex_buffer[handle];
+        ModelScalar* dst  = modelmesh->vertex_buffer[handle];
+
+        for (size_t vertex = 0; vertex < num_vertices; ++vertex)
+            dst[vertex] = (ModelScalar)src[vertex];
+    }
+}
+
+/**
+ * Copies a ModelMesh into an already allocated AcMesh.
+ *
+ * The mesh info is copied bytewise and every vertex buffer element is
+ * converted from ModelScalar to AcReal.
+ *
+ * @param modelmesh Source mesh.
+ * @param acmesh    Destination mesh; its buffers must hold at least
+ *                  AC_VTXBUF_SIZE(modelmesh.info) elements each.
+ */
+void
+modelmesh_to_acmesh(const ModelMesh& modelmesh, AcMesh* acmesh)
+{
+    // The info structs are copied as raw bytes, so their sizes must agree
+    ERRCHK(sizeof(acmesh->info) == sizeof(modelmesh.info));
+    memcpy(&acmesh->info, &modelmesh.info, sizeof(modelmesh.info));
+
+    const size_t num_vertices = AC_VTXBUF_SIZE(modelmesh.info);
+    for (int handle = 0; handle < NUM_VTXBUF_HANDLES; ++handle) {
+        const ModelScalar* src = modelmesh.vertex_buffer[handle];
+        AcReal* dst            = acmesh->vertex_buffer[handle];
+
+        for (size_t vertex = 0; vertex < num_vertices; ++vertex)
+            dst[vertex] = (AcReal)src[vertex];
+    }
+}
--- a/src/standalone/model/host_memory.h
+++ b/src/standalone/model/host_memory.h
@@ -0,0 +1,58 @@
+/*
+    Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
+ * \brief Host-side mesh allocation, initialization and conversion.
+ *
+ * Declares the available initial conditions (InitType), the functions that
+ * create, initialize and destroy an AcMesh, and the helpers that convert
+ * between AcMesh and ModelMesh.
+ *
+ */
+#pragma once
+#include "astaroth.h"
+#include "modelmesh.h"
+/* X-macro listing every initial condition: FUNC is applied to each name and
+ * the results are separated by commas. */
+// clang-format off
+#define AC_FOR_INIT_TYPES(FUNC)\
+        FUNC(INIT_TYPE_RANDOM), \
+        FUNC(INIT_TYPE_XWAVE), \
+        FUNC(INIT_TYPE_GAUSSIAN_RADIAL_EXPL), \
+        FUNC(INIT_TYPE_ABC_FLOW) , \
+        FUNC(INIT_TYPE_VEDGE), \
+        FUNC(INIT_TYPE_VEDGEX), \
+        FUNC(INIT_TYPE_RAYLEIGH_TAYLOR), \
+        FUNC(INIT_TYPE_RAYLEIGH_BENARD)
+// clang-format on
+/* Initial condition selector. AC_GEN_ID is defined elsewhere; presumably it
+ * expands to the bare identifier, making NUM_INIT_TYPES the number of initial
+ * conditions -- TODO confirm. */
+typedef enum { AC_FOR_INIT_TYPES(AC_GEN_ID), NUM_INIT_TYPES } InitType;
+
+extern const char* init_type_names[]; // Defined in host_memory.cc
+/* Creates a mesh for the given configuration. Defined in host_memory.cc;
+ * presumably the result must be released with acmesh_destroy() -- TODO
+ * confirm. */
+AcMesh* acmesh_create(const AcMeshInfo& mesh_info);
+/* Clears the mesh. acmesh_init_to() calls this before applying an initial
+ * condition. */
+void acmesh_clear(AcMesh* mesh);
+/* Fills the mesh with the initial condition selected by type and prints the
+ * minimum and maximum over all vertex buffers. */
+void acmesh_init_to(const InitType& type, AcMesh* mesh);
+/* Frees every vertex buffer of the mesh and then the mesh itself. */
+void acmesh_destroy(AcMesh* mesh);
+/* ModelMesh counterparts: allocation, deallocation and element-wise
+ * conversion to and from AcMesh (the mesh info is copied as well). */
+ModelMesh* modelmesh_create(const AcMeshInfo& mesh_info);
+void modelmesh_destroy(ModelMesh* mesh);
+void acmesh_to_modelmesh(const AcMesh& acmesh, ModelMesh* modelmesh);
+void modelmesh_to_acmesh(const ModelMesh& model, AcMesh* acmesh);
--- a/src/standalone/model/host_timestep.cc
+++ b/src/standalone/model/host_timestep.cc
@@ -0,0 +1,63 @@
+/*
+    Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
+ * \brief Brief info.
+ *
+ * Detailed info.
+ *
+ */
+#include "host_timestep.h"
+
+#include "core/math_utils.h"
+
+static AcReal timescale = AcReal(1.0);
+
+/**
+ * Computes the timestep on the host.
+ *
+ * The result is the smaller of an advective limit based on umax and the sound
+ * speed, and a diffusive limit based on the largest of nu_visc, eta, gamma and
+ * chi, multiplied by the global timescale (see set_timescale()). All
+ * intermediate arithmetic is done in long double.
+ * See Pencil Code user manual p. 38 (timestep section).
+ *
+ * @param umax      Maximum velocity; its absolute value is used.
+ * @param mesh_info Mesh configuration providing cdt, cdtv, cs2_sound,
+ *                  nu_visc, eta, gamma and dsmin.
+ * @return The timestep converted to AcReal.
+ */
+AcReal
+host_timestep(const AcReal& umax, const AcMeshInfo& mesh_info)
+{
+    // Courant coefficients and the smallest grid spacing
+    const long double cdt   = mesh_info.real_params[AC_cdt];
+    const long double cdtv  = mesh_info.real_params[AC_cdtv];
+    const long double dsmin = mesh_info.real_params[AC_dsmin];
+
+    // Physical parameters
+    const long double cs2_sound = mesh_info.real_params[AC_cs2_sound];
+    const long double nu_visc   = mesh_info.real_params[AC_nu_visc];
+    const long double eta       = mesh_info.real_params[AC_eta];
+    const long double gamma     = mesh_info.real_params[AC_gamma];
+    const long double chi       = 0; // mesh_info.real_params[AC_chi]; // TODO not calculated
+
+    // Advective limit: grid spacing over the fastest signal speed
+    const long double signal_speed = fabsl(umax) + sqrtl(cs2_sound + 0.0l);
+    const long double advective_dt = cdt * dsmin / signal_speed;
+
+    // Diffusive limit
+    // TODO NOTE: comment the +1 out to get scientifically accurate results
+    const long double max_diffusivity = max(max(nu_visc, eta), max(gamma, chi));
+    const long double diffusive_dt    = cdtv * dsmin * dsmin / max_diffusivity + 1;
+
+    const long double dt = min(advective_dt, diffusive_dt);
+    return AcReal(timescale) * AcReal(dt);
+}
+
+/**
+ * Sets the factor by which host_timestep() multiplies its result.
+ *
+ * @param scale New value of the file-local timescale (initially 1.0).
+ */
+void
+set_timescale(const AcReal scale)
+{
+    timescale = scale;
+}
--- a/src/standalone/model/host_timestep.h
+++ b/src/standalone/model/host_timestep.h
@@ -0,0 +1,32 @@
+/*
+    Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
+ * \brief Host-side timestep computation.
+ *
+ * Declares host_timestep(), which derives the timestep from the maximum
+ * velocity and the mesh parameters, and set_timescale(), which scales it.
+ *
+ */
+#pragma once
+#include "astaroth.h"
+/* Returns the timestep: the smaller of the advective and diffusive limits,
+ * multiplied by the factor set with set_timescale(). */
+AcReal host_timestep(const AcReal& umax, const AcMeshInfo& mesh_info);
+/* Sets the factor applied to the result of host_timestep(). */
+void set_timescale(const AcReal scale);
--- a/src/standalone/model/model_boundconds.cc
+++ b/src/standalone/model/model_boundconds.cc
@@ -0,0 +1,487 @@
+/*
+    Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
+ * \brief Brief info.
+ *
+ * Detailed info.
+ *
+ */
+#include "model_boundconds.h"
+
+#include "core/errchk.h"
+
+
+/**
+ * Applies periodic boundary conditions to every vertex buffer of the mesh.
+ *
+ * Each vertex outside the computational domain [n*_min, n*_max) receives the
+ * value of the vertex inside the domain that it maps to when the indices are
+ * wrapped periodically along x, y and z. Vertices inside the domain are not
+ * modified. The vertex buffers are processed in parallel with OpenMP.
+ *
+ * @param mesh_info Mesh configuration providing the extents and index bounds.
+ * @param mesh      Mesh whose ghost zones are updated in place.
+ */
+void
+boundconds(const AcMeshInfo& mesh_info, ModelMesh* mesh)
+{
+    // Extents of the whole mesh
+    const int mx = mesh_info.int_params[AC_mx];
+    const int my = mesh_info.int_params[AC_my];
+    const int mz = mesh_info.int_params[AC_mz];
+
+    // Extents of the computational domain
+    const int nx = mesh_info.int_params[AC_nx];
+    const int ny = mesh_info.int_params[AC_ny];
+    const int nz = mesh_info.int_params[AC_nz];
+
+    // Bounds of the computational domain; the upper bounds are exclusive
+    const int nx_min = mesh_info.int_params[AC_nx_min];
+    const int ny_min = mesh_info.int_params[AC_ny_min];
+    const int nz_min = mesh_info.int_params[AC_nz_min];
+    const int nx_max = mesh_info.int_params[AC_nx_max];
+    const int ny_max = mesh_info.int_params[AC_ny_max];
+    const int nz_max = mesh_info.int_params[AC_nz_max];
+
+    #pragma omp parallel for
+    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w) {
+        for (int k = 0; k < mz; ++k) {
+            const bool k_inside = k >= nz_min && k < nz_max;
+
+            for (int j = 0; j < my; ++j) {
+                const bool j_inside = j >= ny_min && j < ny_max;
+
+                for (int i = 0; i < mx; ++i) {
+                    const bool i_inside = i >= nx_min && i < nx_max;
+
+                    // The boundary conditions are only applied to the ghost
+                    // zones, so skip destinations inside the domain
+                    if (i_inside && j_inside && k_inside)
+                        continue;
+
+                    // Source index: shift to domain-local coordinates, add the
+                    // domain extent so that the value is positive, wrap, and
+                    // shift back to mesh coordinates
+                    const int i_src = nx_min + (i - nx_min + nx) % nx;
+                    const int j_src = ny_min + (j - ny_min + ny) % ny;
+                    const int k_src = nz_min + (k - nz_min + nz) % nz;
+
+                    const size_t src_idx = AC_VTXBUF_IDX(i_src, j_src, k_src, mesh_info);
+                    const size_t dst_idx = AC_VTXBUF_IDX(i, j, k, mesh_info);
+                    ERRCHK(src_idx < AC_VTXBUF_SIZE(mesh_info));
+                    ERRCHK(dst_idx < AC_VTXBUF_SIZE(mesh_info));
+                    mesh->vertex_buffer[w][dst_idx] = mesh->vertex_buffer[w][src_idx];
+                }
+            }
+        }
+    }
+}
+
+#if 0
+void
+boundconds(const AcMeshInfo& mesh_info, ModelMesh* mesh)
+{
+    const int mx = mesh_info.int_params[AC_mx];
+    const int my = mesh_info.int_params[AC_my];
+    const int mz = mesh_info.int_params[AC_mz];
+
+    // Volatile here suppresses the warning about strict-overflow (i.e. compiler
+    // wanted to optimize these loops by assuming that kxb etc never overflow)
+    // However we do not need the performance improvement (~1-3%) and it's
+    // not either good to
+    //	a) get useless warnings originating from here
+    //	b) disable the warnings completely
+    volatile const int kxb = mesh_info.int_params[AC_nx_min];
+    volatile const int kyb = mesh_info.int_params[AC_ny_min];
+    volatile const int kzb = mesh_info.int_params[AC_nz_min];
+
+    // The old kxt was inclusive, but our mx_max is exclusive
+    volatile const int kxt = mesh_info.int_params[AC_nx_max] - 1;
+    volatile const int kyt = mesh_info.int_params[AC_ny_max] - 1;
+    volatile const int kzt = mesh_info.int_params[AC_nz_max] - 1;
+    const int bound[3]     = {0, 0, 0};
+
+    // Periodic boundary conditions
+    if (bound[0] == 0) {
+        for (int k = kzb; k <= kzt; k++) {
+            for (int j = kyb; j <= kyt; j++) {
+                for (int i = kxb; i <= kxb + 2; i++) {
+                    const int inds = i + j * mx + k * mx * my;
+                    const int indr = (kxt + i - 2) + j * mx + k * mx * my;
+                    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
+                        mesh->vertex_buffer[w]
+                                           [indr] = mesh->vertex_buffer[w]
+                                                                       [inds];
+                }
+                for (int i = kxt - 2; i <= kxt; i++) {
+                    const int inds = i + j * mx + k * mx * my;
+                    const int indr = (i - kxt + 2) + j * mx + k * mx * my;
+                    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
+                        mesh->vertex_buffer[w]
+                                           [indr] = mesh->vertex_buffer[w]
+                                                                       [inds];
+                }
+            }
+        }
+    }
+    if (bound[1] == 0) {
+        for (int k = kzb; k <= kzt; k++) {
+            for (int i = kxb; i <= kxt; i++) {
+                for (int j = kyb; j <= kyb + 2; j++) {
+                    const int inds = i + j * mx + k * mx * my;
+                    const int indr = i + (kyt + j - 2) * mx + k * mx * my;
+                    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
+                        mesh->vertex_buffer[w]
+                                           [indr] = mesh->vertex_buffer[w]
+                                                                       [inds];
+                }
+                for (int j = kyt - 2; j <= kyt; j++) {
+                    const int inds = i + j * mx + k * mx * my;
+                    const int indr = i + (j - kyt + 2) * mx + k * mx * my;
+                    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
+                        mesh->vertex_buffer[w]
+                                           [indr] = mesh->vertex_buffer[w]
+                                                                       [inds];
+                }
+            }
+        }
+    }
+
+    if (bound[2] == 0) {
+        for (int i = kxb; i <= kxt; i++) {
+            for (int j = kyb; j <= kyt; j++) {
+                for (int k = kzb; k <= kzb + 2; k++) {
+                    const int inds = i + j * mx + k * mx * my;
+                    const int indr = i + j * mx + (kzt + k - 2) * mx * my;
+                    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
+                        mesh->vertex_buffer[w]
+                                           [indr] = mesh->vertex_buffer[w]
+                                                                       [inds];
+                }
+                for (int k = kzt - 2; k <= kzt; k++) {
+                    const int inds = i + j * mx + k * mx * my;
+                    const int indr = i + j * mx + (k - kzt + 2) * mx * my;
+                    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
+                        mesh->vertex_buffer[w]
+                                           [indr] = mesh->vertex_buffer[w]
+                                                                       [inds];
+                }
+            }
+        }
+    }
+
+    // Copy the corners in the fully periodic case
+    if (bound[0] == 0 && bound[1] == 0 && bound[2] == 0) {
+        // Source corner: x=0, y=0, z=0
+        for (int i = kxb; i <= kxb + 2; i++) {
+            for (int j = kyb; j <= kyb + 2; j++) {
+                for (int k = kzb; k <= kzb + 2; k++) {
+                    const int inds = i + j * mx + k * mx * my;
+                    const int indr = (i + mx - STENCIL_ORDER) + (j + my - STENCIL_ORDER) * mx +
+                                     (k + mz - STENCIL_ORDER) * mx * my;
+                    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
+                        mesh->vertex_buffer[w]
+                                           [indr] = mesh->vertex_buffer[w]
+                                                                       [inds];
+                }
+            }
+        }
+        // Source corner: x=1, y=0, z=0
+        for (int i = kxt - 2; i <= kxt; i++) {
+            for (int j = kyb; j <= kyb + 2; j++) {
+                for (int k = kzb; k <= kzb + 2; k++) {
+                    const int inds = i + j * mx + k * mx * my;
+                    const int indr = (i - mx + STENCIL_ORDER) + (j + my - STENCIL_ORDER) * mx +
+                                     (k + mz - STENCIL_ORDER) * mx * my;
+                    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
+                        mesh->vertex_buffer[w]
+                                           [indr] = mesh->vertex_buffer[w]
+                                                                       [inds];
+                }
+            }
+        }
+        // Source corner: x=0, y=1, z=0
+        for (int i = kxb; i <= kxb + 2; i++) {
+            for (int j = kyt - 2; j <= kyt; j++) {
+                for (int k = kzb; k <= kzb + 2; k++) {
+                    const int inds = i + j * mx + k * mx * my;
+                    const int indr = (i + mx - STENCIL_ORDER) + (j - my + STENCIL_ORDER) * mx +
+                                     (k + mz - STENCIL_ORDER) * mx * my;
+                    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
+                        mesh->vertex_buffer[w]
+                                           [indr] = mesh->vertex_buffer[w]
+                                                                       [inds];
+                }
+            }
+        }
+        // Source corner: x=0, y=0, z=1
+        for (int i = kxb; i <= kxb + 2; i++) {
+            for (int j = kyb; j <= kyb + 2; j++) {
+                for (int k = kzt - 2; k <= kzt; k++) {
+                    const int inds = i + j * mx + k * mx * my;
+                    const int indr = (i + mx - STENCIL_ORDER) + (j + my - STENCIL_ORDER) * mx +
+                                     (k - mz + STENCIL_ORDER) * mx * my;
+                    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
+                        mesh->vertex_buffer[w]
+                                           [indr] = mesh->vertex_buffer[w]
+                                                                       [inds];
+                }
+            }
+        }
+        // Source corner: x=1, y=1, z=0
+        for (int i = kxt - 2; i <= kxt; i++) {
+            for (int j = kyt - 2; j <= kyt; j++) {
+                for (int k = kzb; k <= kzb + 2; k++) {
+                    const int inds = i + j * mx + k * mx * my;
+                    const int indr = (i - mx + STENCIL_ORDER) + (j - my + STENCIL_ORDER) * mx +
+                                     (k + mz - STENCIL_ORDER) * mx * my;
+                    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
+                        mesh->vertex_buffer[w]
+                                           [indr] = mesh->vertex_buffer[w]
+                                                                       [inds];
+                }
+            }
+        }
+        // Source corner: x=1, y=0, z=1
+        for (int i = kxt - 2; i <= kxt; i++) {
+            for (int j = kyb; j <= kyb + 2; j++) {
+                for (int k = kzt - 2; k <= kzt; k++) {
+                    const int inds = i + j * mx + k * mx * my;
+                    const int indr = (i - mx + STENCIL_ORDER) + (j + my - STENCIL_ORDER) * mx +
+                                     (k - mz + STENCIL_ORDER) * mx * my;
+                    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
+                        mesh->vertex_buffer[w]
+                                           [indr] = mesh->vertex_buffer[w]
+                                                                       [inds];
+                }
+            }
+        }
+        // Source corner: x=0, y=1, z=1
+        for (int i = kxb; i <= kxb + 2; i++) {
+            for (int j = kyt - 2; j <= kyt; j++) {
+                for (int k = kzt - 2; k <= kzt; k++) {
+                    const int inds = i + j * mx + k * mx * my;
+                    const int indr = (i + mx - STENCIL_ORDER) + (j - my + STENCIL_ORDER) * mx +
+                                     (k - mz + STENCIL_ORDER) * mx * my;
+                    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
+                        mesh->vertex_buffer[w]
+                                           [indr] = mesh->vertex_buffer[w]
+                                                                       [inds];
+                }
+            }
+        }
+        // Source corner: x=1, y=1, z=1
+        for (int i = kxt - 2; i <= kxt; i++) {
+            for (int j = kyt - 2; j <= kyt; j++) {
+                for (int k = kzt - 2; k <= kzt; k++) {
+                    const int inds = i + j * mx + k * mx * my;
+                    const int indr = (i - mx + STENCIL_ORDER) + (j - my + STENCIL_ORDER) * mx +
+                                     (k - mz + STENCIL_ORDER) * mx * my;
+                    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
+                        mesh->vertex_buffer[w]
+                                           [indr] = mesh->vertex_buffer[w]
+                                                                       [inds];
+                }
+            }
+        }
+    }
+    else {
+        ERROR("ONLY FULLY PERIODIC WORKS WITH CORNERS SO FAR! \n");
+    }
+
+    // Copy the edges in the fully periodic case
+    if (bound[0] == 0 && bound[1] == 0 && bound[2] == 0) {
+        // Source edge: x = 0, y = 0
+        for (int i = kxb; i <= kxb + 2; i++) {
+            for (int j = kyb; j <= kyb + 2; j++) {
+                for (int k = kzb; k <= kzt; k++) {
+                    const int inds = i + j * mx + k * mx * my;
+                    const int indr = (i + mx - STENCIL_ORDER) + (j + my - STENCIL_ORDER) * mx +
+                                     k * mx * my;
+                    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
+                        mesh->vertex_buffer[w]
+                                           [indr] = mesh->vertex_buffer[w]
+                                                                       [inds];
+                }
+            }
+        }
+        // Source edge: x = 1, y = 0
+        for (int i = kxt - 2; i <= kxt; i++) {
+            for (int j = kyb; j <= kyb + 2; j++) {
+                for (int k = kzb; k <= kzt; k++) {
+                    const int inds = i + j * mx + k * mx * my;
+                    const int indr = (i - mx + STENCIL_ORDER) + (j + my - STENCIL_ORDER) * mx +
+                                     k * mx * my;
+                    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
+                        mesh->vertex_buffer[w]
+                                           [indr] = mesh->vertex_buffer[w]
+                                                                       [inds];
+                }
+            }
+        }
+        // Source edge: x = 0, y = 1
+        for (int i = kxb; i <= kxb + 2; i++) {
+            for (int j = kyt - 2; j <= kyt; j++) {
+                for (int k = kzb; k <= kzt; k++) {
+                    const int inds = i + j * mx + k * mx * my;
+                    const int indr = (i + mx - STENCIL_ORDER) + (j - my + STENCIL_ORDER) * mx +
+                                     k * mx * my;
+                    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
+                        mesh->vertex_buffer[w]
+                                           [indr] = mesh->vertex_buffer[w]
+                                                                       [inds];
+                }
+            }
+        }
+        // Source edge: x = 1, y = 1
+        for (int i = kxt - 2; i <= kxt; i++) {
+            for (int j = kyt - 2; j <= kyt; j++) {
+                for (int k = kzb; k <= kzt; k++) {
+                    const int inds = i + j * mx + k * mx * my;
+                    const int indr = (i - mx + STENCIL_ORDER) + (j - my + STENCIL_ORDER) * mx +
+                                     k * mx * my;
+                    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
+                        mesh->vertex_buffer[w]
+                                           [indr] = mesh->vertex_buffer[w]
+                                                                       [inds];
+                }
+            }
+        }
+        // Source edge: x = 0, z = 0
+        for (int i = kxb; i <= kxb + 2; i++) {
+            for (int j = kyb; j <= kyt; j++) {
+                for (int k = kzb; k <= kzb + 2; k++) {
+                    const int inds = i + j * mx + k * mx * my;
+                    const int indr = (i + mx - STENCIL_ORDER) + j * mx +
+                                     (k + mz - STENCIL_ORDER) * mx * my;
+                    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
+                        mesh->vertex_buffer[w]
+                                           [indr] = mesh->vertex_buffer[w]
+                                                                       [inds];
+                }
+            }
+        }
+        // Source edge: x = 1, z = 0
+        for (int i = kxt - 2; i <= kxt; i++) {
+            for (int j = kyb; j <= kyt; j++) {
+                for (int k = kzb; k <= kzb + 2; k++) {
+                    const int inds = i + j * mx + k * mx * my;
+                    const int indr = (i - mx + STENCIL_ORDER) + j * mx +
+                                     (k + mz - STENCIL_ORDER) * mx * my;
+                    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
+                        mesh->vertex_buffer[w]
+                                           [indr] = mesh->vertex_buffer[w]
+                                                                       [inds];
+                }
+            }
+        }
+        // Source edge: x = 0, z = 1
+        for (int i = kxb; i <= kxb + 2; i++) {
+            for (int j = kyb; j <= kyt; j++) {
+                for (int k = kzt - 2; k <= kzt; k++) {
+                    const int inds = i + j * mx + k * mx * my;
+                    const int indr = (i + mx - STENCIL_ORDER) + j * mx +
+                                     (k - mz + STENCIL_ORDER) * mx * my;
+                    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
+                        mesh->vertex_buffer[w]
+                                           [indr] = mesh->vertex_buffer[w]
+                                                                       [inds];
+                }
+            }
+        }
+        // Source edge: x = 1, z = 1
+        for (int i = kxt - 2; i <= kxt; i++) {
+            for (int j = kyb; j <= kyt; j++) {
+                for (int k = kzt - 2; k <= kzt; k++) {
+                    const int inds = i + j * mx + k * mx * my;
+                    const int indr = (i - mx + STENCIL_ORDER) + j * mx +
+                                     (k - mz + STENCIL_ORDER) * mx * my;
+                    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
+                        mesh->vertex_buffer[w]
+                                           [indr] = mesh->vertex_buffer[w]
+                                                                       [inds];
+                }
+            }
+        }
+        // Source edge: y = 0, z = 0
+        for (int i = kxb; i <= kxt; i++) {
+            for (int j = kyb; j <= kyb + 2; j++) {
+                for (int k = kzb; k <= kzb + 2; k++) {
+                    const int inds = i + j * mx + k * mx * my;
+                    const int indr = i + (j + my - STENCIL_ORDER) * mx +
+                                     (k + mz - STENCIL_ORDER) * mx * my;
+                    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
+                        mesh->vertex_buffer[w]
+                                           [indr] = mesh->vertex_buffer[w]
+                                                                       [inds];
+                }
+            }
+        }
+        // Source edge: y = 1, z = 0
+        for (int i = kxb; i <= kxt; i++) {
+            for (int j = kyt - 2; j <= kyt; j++) {
+                for (int k = kzb; k <= kzb + 2; k++) {
+                    const int inds = i + j * mx + k * mx * my;
+                    const int indr = i + (j - my + STENCIL_ORDER) * mx +
+                                     (k + mz - STENCIL_ORDER) * mx * my;
+                    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
+                        mesh->vertex_buffer[w]
+                                           [indr] = mesh->vertex_buffer[w]
+                                                                       [inds];
+                }
+            }
+        }
+        // Source edge: y = 0, z = 1
+        for (int i = kxb; i <= kxt; i++) {
+            for (int j = kyb; j <= kyb + 2; j++) {
+                for (int k = kzt - 2; k <= kzt; k++) {
+                    const int inds = i + j * mx + k * mx * my;
+                    const int indr = i + (j + my - STENCIL_ORDER) * mx +
+                                     (k - mz + STENCIL_ORDER) * mx * my;
+                    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
+                        mesh->vertex_buffer[w]
+                                           [indr] = mesh->vertex_buffer[w]
+                                                                       [inds];
+                }
+            }
+        }
+        // Source edge: y = 1, z = 1
+        for (int i = kxb; i <= kxt; i++) {
+            for (int j = kyt - 2; j <= kyt; j++) {
+                for (int k = kzt - 2; k <= kzt; k++) {
+                    const int inds = i + j * mx + k * mx * my;
+                    const int indr = i + (j - my + STENCIL_ORDER) * mx +
+                                     (k - mz + STENCIL_ORDER) * mx * my;
+                    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
+                        mesh->vertex_buffer[w]
+                                           [indr] = mesh->vertex_buffer[w]
+                                                                       [inds];
+                }
+            }
+        }
+    }
+    else {
+        ERROR("ONLY FULLY PERIODIC WORKS WITH EDGES SO FAR! \n");
+    }
+}
+#endif
--- a/src/standalone/model/model_boundconds.h
+++ b/src/standalone/model/model_boundconds.h
@@ -0,0 +1,31 @@
+/*
+    Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
+ * \brief Boundary condition interface of the standalone model.
+ *
+ * Declares boundconds(), which operates on a ModelMesh.
+ *
+ */
+#pragma once
+#include "astaroth.h"
+#include "modelmesh.h"
+
+/**
+ * Applies the boundary conditions to the model mesh.
+ *
+ * @param mesh_info Mesh configuration, passed by const reference.
+ * @param mesh      Model mesh to operate on; passed as a non-const pointer.
+ *
+ * NOTE(review): the implementation is not part of this header. Presumably the
+ * ghost zones of `mesh` are filled in place according to the boundary types
+ * stored in `mesh_info` -- confirm against the definition.
+ */
+void boundconds(const AcMeshInfo& mesh_info, ModelMesh* mesh);
--- a/src/standalone/model/model_diff.h
+++ b/src/standalone/model/model_diff.h
@@ -0,0 +1,353 @@
+/*
+    Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
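+/**
+ * @file
+ * \brief Finite-difference operators of the standalone model.
+ *
+ * Provides first, second and mixed derivatives evaluated in MODEL_REAL
+ * precision on stencils reaching three points in each direction, together
+ * with the gradient, divergence, Laplacian and viscosity terms built on them.
+ *
+ */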
+#pragma once
+#include "core/errchk.h"
+
+// Scalar type used for all arithmetic in this header.
+typedef long double MODEL_REAL;
+
+// Coordinate axis selector, used as the template argument of der_scal and
+// der2_scal. NUM_AXIS_TYPES is the number of axes.
+typedef enum { AXIS_X, AXIS_Y, AXIS_Z, NUM_AXIS_TYPES } AxisType;
+
+/**
+ * First derivative of `scal` along `axis` at grid point (i, j, k).
+ *
+ * Evaluates the central difference
+ *   ((f[+3] - f[-3]) - 9 (f[+2] - f[-2]) + 45 (f[+1] - f[-1])) / (60 ds),
+ * where f[n] is the sample n points away along `axis` and ds is the matching
+ * grid spacing (AC_dsx, AC_dsy or AC_dsz) read from `mesh_info`.
+ *
+ * The caller must make sure that three points on either side of (i, j, k)
+ * along `axis` are addressable in `scal`.
+ */
+template <AxisType axis>
+static inline MODEL_REAL
+der_scal(const int& i, const int& j, const int& k, const AcMeshInfo& mesh_info,
+         const MODEL_REAL* scal)
+{
+    // Samples at offsets -3, -2, -1 (m*) and +1, +2, +3 (p*) along the axis
+    MODEL_REAL m3, m2, m1, p1, p2, p3;
+    MODEL_REAL spacing;
+
+    if (axis == AXIS_X) {
+        m3      = scal[AC_VTXBUF_IDX(i - 3, j, k, mesh_info)];
+        m2      = scal[AC_VTXBUF_IDX(i - 2, j, k, mesh_info)];
+        m1      = scal[AC_VTXBUF_IDX(i - 1, j, k, mesh_info)];
+        p1      = scal[AC_VTXBUF_IDX(i + 1, j, k, mesh_info)];
+        p2      = scal[AC_VTXBUF_IDX(i + 2, j, k, mesh_info)];
+        p3      = scal[AC_VTXBUF_IDX(i + 3, j, k, mesh_info)];
+        spacing = mesh_info.real_params[AC_dsx];
+    }
+    else if (axis == AXIS_Y) {
+        m3      = scal[AC_VTXBUF_IDX(i, j - 3, k, mesh_info)];
+        m2      = scal[AC_VTXBUF_IDX(i, j - 2, k, mesh_info)];
+        m1      = scal[AC_VTXBUF_IDX(i, j - 1, k, mesh_info)];
+        p1      = scal[AC_VTXBUF_IDX(i, j + 1, k, mesh_info)];
+        p2      = scal[AC_VTXBUF_IDX(i, j + 2, k, mesh_info)];
+        p3      = scal[AC_VTXBUF_IDX(i, j + 3, k, mesh_info)];
+        spacing = mesh_info.real_params[AC_dsy];
+    }
+    else if (axis == AXIS_Z) {
+        m3      = scal[AC_VTXBUF_IDX(i, j, k - 3, mesh_info)];
+        m2      = scal[AC_VTXBUF_IDX(i, j, k - 2, mesh_info)];
+        m1      = scal[AC_VTXBUF_IDX(i, j, k - 1, mesh_info)];
+        p1      = scal[AC_VTXBUF_IDX(i, j, k + 1, mesh_info)];
+        p2      = scal[AC_VTXBUF_IDX(i, j, k + 2, mesh_info)];
+        p3      = scal[AC_VTXBUF_IDX(i, j, k + 3, mesh_info)];
+        spacing = mesh_info.real_params[AC_dsz];
+    }
+    else {
+        ERROR("Unknown axis type");
+    }
+
+    const MODEL_REAL numerator = (p3 - m3) + MODEL_REAL(-9.) * (p2 - m2) +
+                                 MODEL_REAL(45.) * (p1 - m1);
+    return numerator / (MODEL_REAL(60.) * spacing);
+}
+
+/**
+ * Second derivative of `scal` along `axis` at grid point (i, j, k).
+ *
+ * Evaluates the central difference
+ *   (2 (f[-3] + f[+3]) - 27 (f[-2] + f[+2]) + 270 (f[-1] + f[+1]) - 490 f[0])
+ *   / (180 ds^2),
+ * where f[n] is the sample n points away along `axis` and ds is the matching
+ * grid spacing (AC_dsx, AC_dsy or AC_dsz) read from `mesh_info`.
+ *
+ * The caller must make sure that three points on either side of (i, j, k)
+ * along `axis` are addressable in `scal`.
+ */
+template <AxisType axis>
+static inline MODEL_REAL
+der2_scal(const int& i, const int& j, const int& k, const AcMeshInfo& mesh_info,
+          const MODEL_REAL* scal)
+{
+    // Samples at offsets -3, -2, -1 (m*) and +1, +2, +3 (p*) along the axis
+    MODEL_REAL m3, m2, m1, p1, p2, p3;
+    MODEL_REAL spacing;
+
+    // The centre sample is shared by all axes
+    const MODEL_REAL centre = scal[AC_VTXBUF_IDX(i, j, k, mesh_info)];
+
+    if (axis == AXIS_X) {
+        m3      = scal[AC_VTXBUF_IDX(i - 3, j, k, mesh_info)];
+        m2      = scal[AC_VTXBUF_IDX(i - 2, j, k, mesh_info)];
+        m1      = scal[AC_VTXBUF_IDX(i - 1, j, k, mesh_info)];
+        p1      = scal[AC_VTXBUF_IDX(i + 1, j, k, mesh_info)];
+        p2      = scal[AC_VTXBUF_IDX(i + 2, j, k, mesh_info)];
+        p3      = scal[AC_VTXBUF_IDX(i + 3, j, k, mesh_info)];
+        spacing = mesh_info.real_params[AC_dsx];
+    }
+    else if (axis == AXIS_Y) {
+        m3      = scal[AC_VTXBUF_IDX(i, j - 3, k, mesh_info)];
+        m2      = scal[AC_VTXBUF_IDX(i, j - 2, k, mesh_info)];
+        m1      = scal[AC_VTXBUF_IDX(i, j - 1, k, mesh_info)];
+        p1      = scal[AC_VTXBUF_IDX(i, j + 1, k, mesh_info)];
+        p2      = scal[AC_VTXBUF_IDX(i, j + 2, k, mesh_info)];
+        p3      = scal[AC_VTXBUF_IDX(i, j + 3, k, mesh_info)];
+        spacing = mesh_info.real_params[AC_dsy];
+    }
+    else if (axis == AXIS_Z) {
+        m3      = scal[AC_VTXBUF_IDX(i, j, k - 3, mesh_info)];
+        m2      = scal[AC_VTXBUF_IDX(i, j, k - 2, mesh_info)];
+        m1      = scal[AC_VTXBUF_IDX(i, j, k - 1, mesh_info)];
+        p1      = scal[AC_VTXBUF_IDX(i, j, k + 1, mesh_info)];
+        p2      = scal[AC_VTXBUF_IDX(i, j, k + 2, mesh_info)];
+        p3      = scal[AC_VTXBUF_IDX(i, j, k + 3, mesh_info)];
+        spacing = mesh_info.real_params[AC_dsz];
+    }
+    else {
+        ERROR("Unknown axis type");
+    }
+
+    const MODEL_REAL numerator = MODEL_REAL(2.) * (m3 + p3) + MODEL_REAL(-27.) * (m2 + p2) +
+                                 MODEL_REAL(270.) * (m1 + p1) + MODEL_REAL(-490.) * centre;
+    return numerator / (MODEL_REAL(180.) * spacing * spacing);
+}
+
+/**
+ * Laplacian of `scal` at grid point (i, j, k): the sum of the second
+ * derivatives along x, y and z as computed by der2_scal.
+ */
+static MODEL_REAL
+laplace_scal(const int& i, const int& j, const int& k,
+             const AcMeshInfo& mesh_info, const MODEL_REAL* scal)
+{
+    const MODEL_REAL d2_dx2 = der2_scal<AXIS_X>(i, j, k, mesh_info, scal);
+    const MODEL_REAL d2_dy2 = der2_scal<AXIS_Y>(i, j, k, mesh_info, scal);
+    const MODEL_REAL d2_dz2 = der2_scal<AXIS_Z>(i, j, k, mesh_info, scal);
+
+    return d2_dx2 + d2_dy2 + d2_dz2;
+}
+
+/**
+ * Component-wise Laplacian of the vector field (vec_x, vec_y, vec_z) at grid
+ * point (i, j, k). The results are written to *laplace_x, *laplace_y and
+ * *laplace_z, in that order.
+ */
+static void
+laplace_vec(const int& i, const int& j, const int& k,
+            const AcMeshInfo& mesh_info, const MODEL_REAL* vec_x,
+            const MODEL_REAL* vec_y, const MODEL_REAL* vec_z, MODEL_REAL* laplace_x,
+            MODEL_REAL* laplace_y, MODEL_REAL* laplace_z)
+{
+    const MODEL_REAL* const components[3] = {vec_x, vec_y, vec_z};
+    MODEL_REAL* const results[3]          = {laplace_x, laplace_y, laplace_z};
+
+    for (int c = 0; c < 3; ++c)
+        *results[c] = laplace_scal(i, j, k, mesh_info, components[c]);
+}
+
+/**
+ * Divergence of the vector field (vec_x, vec_y, vec_z) at grid point
+ * (i, j, k): d(vec_x)/dx + d(vec_y)/dy + d(vec_z)/dz, each term computed by
+ * der_scal.
+ */
+static MODEL_REAL
+div_vec(const int& i, const int& j, const int& k, const AcMeshInfo& mesh_info,
+        const MODEL_REAL* vec_x, const MODEL_REAL* vec_y, const MODEL_REAL* vec_z)
+{
+    const MODEL_REAL dvx_dx = der_scal<AXIS_X>(i, j, k, mesh_info, vec_x);
+    const MODEL_REAL dvy_dy = der_scal<AXIS_Y>(i, j, k, mesh_info, vec_y);
+    const MODEL_REAL dvz_dz = der_scal<AXIS_Z>(i, j, k, mesh_info, vec_z);
+
+    return dvx_dx + dvy_dy + dvz_dz;
+}
+
+/**
+ * Gradient of `scal` at grid point (i, j, k). The x, y and z components are
+ * computed by der_scal and written to *res_x, *res_y and *res_z, in that
+ * order.
+ */
+static void
+grad(const int& i, const int& j, const int& k, const AcMeshInfo& mesh_info,
+     const MODEL_REAL* scal, MODEL_REAL* res_x, MODEL_REAL* res_y, MODEL_REAL* res_z)
+{
+    const MODEL_REAL ddx = der_scal<AXIS_X>(i, j, k, mesh_info, scal);
+    *res_x = ddx;
+
+    const MODEL_REAL ddy = der_scal<AXIS_Y>(i, j, k, mesh_info, scal);
+    *res_y = ddy;
+
+    const MODEL_REAL ddz = der_scal<AXIS_Z>(i, j, k, mesh_info, scal);
+    *res_z = ddz;
+}
+
+/**
+ * Directional derivative (vec . nabla) scal at grid point (i, j, k): the dot
+ * product of the local value of (vec_x, vec_y, vec_z) with the gradient of
+ * `scal`.
+ */
+static MODEL_REAL
+vec_dot_nabla_scal(const int& i, const int& j, const int& k,
+                   const AcMeshInfo& mesh_info, const MODEL_REAL* vec_x,
+                   const MODEL_REAL* vec_y, const MODEL_REAL* vec_z, const MODEL_REAL* scal)
+{
+    MODEL_REAL grad_x, grad_y, grad_z;
+    grad(i, j, k, mesh_info, scal, &grad_x, &grad_y, &grad_z);
+
+    // Linear index of the grid point itself
+    const int centre = AC_VTXBUF_IDX(i, j, k, mesh_info);
+
+    return vec_x[centre] * grad_x + vec_y[centre] * grad_y +
+           vec_z[centre] * grad_z;
+}
+
+/*
+ * =============================================================================
+ * Viscosity
+ * =============================================================================
+ */
+// Selects the pair of axes of the mixed second derivative computed by
+// dernm_scal.
+typedef enum { DERNM_XY, DERNM_YZ, DERNM_XZ } DernmType;
+
+/**
+ * Mixed second derivative of `scal` at grid point (i, j, k) with respect to
+ * the two axes selected by `dernm` (DERNM_XY gives d^2/(dx dy), and so on).
+ *
+ * With f[a,b] the sample offset by a points along the first axis and b points
+ * along the second axis, the stencil is
+ *   1 / (720 ds_a ds_b) * sum_n c_n (f[+n,+n] - f[-n,+n] + f[-n,-n] - f[+n,-n]),
+ *   n = 1, 2, 3, with c_1 = 270, c_2 = -27, c_3 = 2.
+ * The two mixed-sign samples enter with the same sign, so their relative order
+ * does not affect the result.
+ *
+ * The caller must make sure that three points on either side of (i, j, k)
+ * along both axes are addressable in `scal`.
+ */
+template <DernmType dernm>
+static MODEL_REAL
+dernm_scal(const int& i, const int& j, const int& k,
+           const AcMeshInfo& mesh_info, const MODEL_REAL* scal)
+{
+
+    MODEL_REAL fac;
+
+    const MODEL_REAL dsx = mesh_info.real_params[AC_dsx];
+    const MODEL_REAL dsy = mesh_info.real_params[AC_dsy];
+    const MODEL_REAL dsz = mesh_info.real_params[AC_dsz];
+
+    // Divide in MODEL_REAL precision: MODEL_REAL(1. / 720.) would perform the
+    // division in double and only then widen the already rounded result.
+    const MODEL_REAL inv720 = MODEL_REAL(1.) / MODEL_REAL(720.);
+
+    MODEL_REAL f_p1_p1, f_m1_p1, f_m1_m1, f_p1_m1;
+    MODEL_REAL f_p2_p2, f_m2_p2, f_m2_m2, f_p2_m2;
+    MODEL_REAL f_p3_p3, f_m3_p3, f_m3_m3, f_p3_m3;
+
+    switch (dernm) {
+    case DERNM_XY:
+        fac     = inv720 * (MODEL_REAL(1.) / dsx) * (MODEL_REAL(1.) / dsy);
+        f_p1_p1 = scal[AC_VTXBUF_IDX(i + 1, j + 1, k, mesh_info)];
+        f_m1_p1 = scal[AC_VTXBUF_IDX(i - 1, j + 1, k, mesh_info)];
+        f_m1_m1 = scal[AC_VTXBUF_IDX(i - 1, j - 1, k, mesh_info)];
+        f_p1_m1 = scal[AC_VTXBUF_IDX(i + 1, j - 1, k, mesh_info)];
+
+        f_p2_p2 = scal[AC_VTXBUF_IDX(i + 2, j + 2, k, mesh_info)];
+        f_m2_p2 = scal[AC_VTXBUF_IDX(i - 2, j + 2, k, mesh_info)];
+        f_m2_m2 = scal[AC_VTXBUF_IDX(i - 2, j - 2, k, mesh_info)];
+        f_p2_m2 = scal[AC_VTXBUF_IDX(i + 2, j - 2, k, mesh_info)];
+
+        f_p3_p3 = scal[AC_VTXBUF_IDX(i + 3, j + 3, k, mesh_info)];
+        f_m3_p3 = scal[AC_VTXBUF_IDX(i - 3, j + 3, k, mesh_info)];
+        f_m3_m3 = scal[AC_VTXBUF_IDX(i - 3, j - 3, k, mesh_info)];
+        f_p3_m3 = scal[AC_VTXBUF_IDX(i + 3, j - 3, k, mesh_info)];
+        break;
+    case DERNM_YZ:
+        fac     = inv720 * (MODEL_REAL(1.) / dsy) * (MODEL_REAL(1.) / dsz);
+        f_p1_p1 = scal[AC_VTXBUF_IDX(i, j + 1, k + 1, mesh_info)];
+        f_m1_p1 = scal[AC_VTXBUF_IDX(i, j - 1, k + 1, mesh_info)];
+        f_m1_m1 = scal[AC_VTXBUF_IDX(i, j - 1, k - 1, mesh_info)];
+        f_p1_m1 = scal[AC_VTXBUF_IDX(i, j + 1, k - 1, mesh_info)];
+
+        f_p2_p2 = scal[AC_VTXBUF_IDX(i, j + 2, k + 2, mesh_info)];
+        f_m2_p2 = scal[AC_VTXBUF_IDX(i, j - 2, k + 2, mesh_info)];
+        f_m2_m2 = scal[AC_VTXBUF_IDX(i, j - 2, k - 2, mesh_info)];
+        f_p2_m2 = scal[AC_VTXBUF_IDX(i, j + 2, k - 2, mesh_info)];
+
+        f_p3_p3 = scal[AC_VTXBUF_IDX(i, j + 3, k + 3, mesh_info)];
+        f_m3_p3 = scal[AC_VTXBUF_IDX(i, j - 3, k + 3, mesh_info)];
+        f_m3_m3 = scal[AC_VTXBUF_IDX(i, j - 3, k - 3, mesh_info)];
+        f_p3_m3 = scal[AC_VTXBUF_IDX(i, j + 3, k - 3, mesh_info)];
+        break;
+    case DERNM_XZ:
+        fac     = inv720 * (MODEL_REAL(1.) / dsx) * (MODEL_REAL(1.) / dsz);
+        f_p1_p1 = scal[AC_VTXBUF_IDX(i + 1, j, k + 1, mesh_info)];
+        f_m1_p1 = scal[AC_VTXBUF_IDX(i - 1, j, k + 1, mesh_info)];
+        f_m1_m1 = scal[AC_VTXBUF_IDX(i - 1, j, k - 1, mesh_info)];
+        f_p1_m1 = scal[AC_VTXBUF_IDX(i + 1, j, k - 1, mesh_info)];
+
+        f_p2_p2 = scal[AC_VTXBUF_IDX(i + 2, j, k + 2, mesh_info)];
+        f_m2_p2 = scal[AC_VTXBUF_IDX(i - 2, j, k + 2, mesh_info)];
+        f_m2_m2 = scal[AC_VTXBUF_IDX(i - 2, j, k - 2, mesh_info)];
+        f_p2_m2 = scal[AC_VTXBUF_IDX(i + 2, j, k - 2, mesh_info)];
+
+        f_p3_p3 = scal[AC_VTXBUF_IDX(i + 3, j, k + 3, mesh_info)];
+        f_m3_p3 = scal[AC_VTXBUF_IDX(i - 3, j, k + 3, mesh_info)];
+        f_m3_m3 = scal[AC_VTXBUF_IDX(i - 3, j, k - 3, mesh_info)];
+        f_p3_m3 = scal[AC_VTXBUF_IDX(i + 3, j, k - 3, mesh_info)];
+        break;
+    default:
+        ERROR("Invalid dernm type");
+    }
+    return fac * (MODEL_REAL(270.) * (f_p1_p1 - f_m1_p1 + f_m1_m1 - f_p1_m1) -
+                  MODEL_REAL(27.) * (f_p2_p2 - f_m2_p2 + f_m2_m2 - f_p2_m2) +
+                  MODEL_REAL(2.) * (f_p3_p3 - f_m3_p3 + f_m3_m3 - f_p3_m3));
+}
+
+/**
+ * Gradient of the divergence of the vector field (vec_x, vec_y, vec_z) at
+ * grid point (i, j, k).
+ *
+ * Each component is the sum of one pure second derivative (der2_scal) and two
+ * mixed second derivatives (dernm_scal). The results are written to *gdvx,
+ * *gdvy and *gdvz, in that order.
+ */
+static void
+grad_div_vec(const int& i, const int& j, const int& k,
+             const AcMeshInfo& mesh_info, const MODEL_REAL* vec_x,
+             const MODEL_REAL* vec_y, const MODEL_REAL* vec_z, MODEL_REAL* gdvx,
+             MODEL_REAL* gdvy, MODEL_REAL* gdvz)
+{
+    // x component: d/dx (div vec)
+    const MODEL_REAL dxx_vx = der2_scal<AXIS_X>(i, j, k, mesh_info, vec_x);
+    const MODEL_REAL dxy_vy = dernm_scal<DERNM_XY>(i, j, k, mesh_info, vec_y);
+    const MODEL_REAL dxz_vz = dernm_scal<DERNM_XZ>(i, j, k, mesh_info, vec_z);
+    *gdvx = dxx_vx + dxy_vy + dxz_vz;
+
+    // y component: d/dy (div vec)
+    const MODEL_REAL dxy_vx = dernm_scal<DERNM_XY>(i, j, k, mesh_info, vec_x);
+    const MODEL_REAL dyy_vy = der2_scal<AXIS_Y>(i, j, k, mesh_info, vec_y);
+    const MODEL_REAL dyz_vz = dernm_scal<DERNM_YZ>(i, j, k, mesh_info, vec_z);
+    *gdvy = dxy_vx + dyy_vy + dyz_vz;
+
+    // z component: d/dz (div vec)
+    const MODEL_REAL dxz_vx = dernm_scal<DERNM_XZ>(i, j, k, mesh_info, vec_x);
+    const MODEL_REAL dyz_vy = dernm_scal<DERNM_YZ>(i, j, k, mesh_info, vec_y);
+    const MODEL_REAL dzz_vz = der2_scal<AXIS_Z>(i, j, k, mesh_info, vec_z);
+    *gdvz = dxz_vx + dyz_vy + dzz_vz;
+}
+
+/**
+ * Product of the traceless rate-of-strain tensor S of the vector field
+ * (vec_x, vec_y, vec_z) with the gradient of `lnrho` at grid point (i, j, k).
+ *
+ * S is symmetric with
+ *   S_ii = 2/3 d_i v_i - 1/3 (sum of the other two diagonal derivatives),
+ *   S_ij = 1/2 (d_j v_i + d_i v_j) for i != j.
+ * The three components of S . grad(lnrho) are written to *sgrhox, *sgrhoy and
+ * *sgrhoz after all derivatives have been evaluated.
+ */
+static void
+S_grad_lnrho(const int& i, const int& j, const int& k,
+             const AcMeshInfo& mesh_info, const MODEL_REAL* vec_x,
+             const MODEL_REAL* vec_y, const MODEL_REAL* vec_z, const MODEL_REAL* lnrho,
+             MODEL_REAL* sgrhox, MODEL_REAL* sgrhoy, MODEL_REAL* sgrhoz)
+{
+    // Divide in MODEL_REAL precision: MODEL_REAL(2. / 3.) would perform the
+    // division in double and only then widen the already rounded result.
+    const MODEL_REAL c23 = MODEL_REAL(2.) / MODEL_REAL(3.);
+    const MODEL_REAL c13 = MODEL_REAL(1.) / MODEL_REAL(3.);
+
+    // Velocity-gradient matrix, each entry evaluated exactly once
+    // (d<a>_v<b> is the derivative of component b along axis a)
+    const MODEL_REAL dx_vx = der_scal<AXIS_X>(i, j, k, mesh_info, vec_x);
+    const MODEL_REAL dy_vx = der_scal<AXIS_Y>(i, j, k, mesh_info, vec_x);
+    const MODEL_REAL dz_vx = der_scal<AXIS_Z>(i, j, k, mesh_info, vec_x);
+
+    const MODEL_REAL dx_vy = der_scal<AXIS_X>(i, j, k, mesh_info, vec_y);
+    const MODEL_REAL dy_vy = der_scal<AXIS_Y>(i, j, k, mesh_info, vec_y);
+    const MODEL_REAL dz_vy = der_scal<AXIS_Z>(i, j, k, mesh_info, vec_y);
+
+    const MODEL_REAL dx_vz = der_scal<AXIS_X>(i, j, k, mesh_info, vec_z);
+    const MODEL_REAL dy_vz = der_scal<AXIS_Y>(i, j, k, mesh_info, vec_z);
+    const MODEL_REAL dz_vz = der_scal<AXIS_Z>(i, j, k, mesh_info, vec_z);
+
+    // Diagonal of S
+    const MODEL_REAL Sxx = c23 * dx_vx - c13 * (dy_vy + dz_vz);
+    const MODEL_REAL Syy = c23 * dy_vy - c13 * (dx_vx + dz_vz);
+    // The z diagonal uses d(vec_z)/dz, mirroring Sxx and Syy
+    const MODEL_REAL Szz = c23 * dz_vz - c13 * (dx_vx + dy_vy);
+
+    // Off-diagonal entries; S is symmetric so only three are needed
+    const MODEL_REAL Sxy = MODEL_REAL(.5) * (dy_vx + dx_vy);
+    const MODEL_REAL Sxz = MODEL_REAL(.5) * (dz_vx + dx_vz);
+    const MODEL_REAL Syz = MODEL_REAL(.5) * (dz_vy + dy_vz);
+
+    const MODEL_REAL Syx = Sxy;
+    const MODEL_REAL Szx = Sxz;
+    const MODEL_REAL Szy = Syz;
+
+    // Grad lnrho
+    MODEL_REAL glnx, glny, glnz;
+    grad(i, j, k, mesh_info, lnrho, &glnx, &glny, &glnz);
+
+    *sgrhox = Sxx * glnx + Sxy * glny + Sxz * glnz;
+    *sgrhoy = Syx * glnx + Syy * glny + Syz * glnz;
+    *sgrhoz = Szx * glnx + Szy * glny + Szz * glnz;
+}
+
+/**
+ * Viscous term for constant kinematic viscosity at grid point (i, j, k).
+ *
+ * For each component the result is
+ *   nu * (laplace(vec) + 1/3 grad(div vec) + 2 S . grad(scal))
+ *   + zeta * grad(div vec),
+ * with nu = AC_nu_visc and zeta = AC_zeta read from `mesh_info`, and
+ * S . grad(scal) as computed by S_grad_lnrho (which receives `scal` as its
+ * lnrho argument). The components are written to *visc_x, *visc_y and *visc_z.
+ */
+static void
+nu_const(const int& i, const int& j, const int& k, const AcMeshInfo& mesh_info,
+         const MODEL_REAL* vec_x, const MODEL_REAL* vec_y, const MODEL_REAL* vec_z,
+         const MODEL_REAL* scal, MODEL_REAL* visc_x, MODEL_REAL* visc_y, MODEL_REAL* visc_z)
+{
+    MODEL_REAL lx, ly, lz;
+    laplace_vec(i, j, k, mesh_info, vec_x, vec_y, vec_z, &lx, &ly, &lz);
+    // lx = ly = lz = .0f;
+
+    MODEL_REAL gx, gy, gz;
+    grad_div_vec(i, j, k, mesh_info, vec_x, vec_y, vec_z, &gx, &gy, &gz);
+    // gx = gy =gz = .0f;
+
+    MODEL_REAL sgrhox, sgrhoy, sgrhoz;
+    S_grad_lnrho(i, j, k, mesh_info, vec_x, vec_y, vec_z, scal, &sgrhox,
+                 &sgrhoy, &sgrhoz);
+    // sgrhox = sgrhoy = sgrhoz = .0f;
+
+    // Divide in MODEL_REAL precision: MODEL_REAL(1. / 3.) would perform the
+    // division in double and only then widen the already rounded result.
+    const MODEL_REAL c13  = MODEL_REAL(1.) / MODEL_REAL(3.);
+    const MODEL_REAL nu   = mesh_info.real_params[AC_nu_visc];
+    const MODEL_REAL zeta = mesh_info.real_params[AC_zeta];
+
+    *visc_x = nu * (lx + c13 * gx + MODEL_REAL(2.) * sgrhox) + zeta * gx;
+    *visc_y = nu * (ly + c13 * gy + MODEL_REAL(2.) * sgrhoy) + zeta * gy;
+    *visc_z = nu * (lz + c13 * gz + MODEL_REAL(2.) * sgrhoz) + zeta * gz;
+}
--- a/src/standalone/model/model_reduce.cc
+++ b/src/standalone/model/model_reduce.cc
@@ -0,0 +1,203 @@
+/*
+    Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
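+/**
+ * @file
+ * \brief Host-side reductions over the vertex buffers of a ModelMesh.
+ *
+ * Implements model_reduce_scal() and model_reduce_vec(), which compute the
+ * maximum, minimum, root mean square and exponential root mean square of a
+ * scalar field or of a three-component vector field.
+ *
+ */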
+#include "model_reduce.h"
+
+#include <math.h>
+
+#include "core/errchk.h"
+
+// Function pointer definitions
+// ReduceFunc: binary operation that folds the running result with one new
+// value (max, min or sum below)
+typedef ModelScalar (*ReduceFunc)(const ModelScalar&, const ModelScalar&);
+// ReduceInitialScalFunc: maps one scalar field value to the value that enters
+// the reduction
+typedef ModelScalar (*ReduceInitialScalFunc)(const ModelScalar&);
+// ReduceInitialVecFunc: maps the three components of a vector field value to
+// the value that enters the reduction
+typedef ModelScalar (*ReduceInitialVecFunc)(const ModelScalar&, const ModelScalar&,
+                                            const ModelScalar&);
+
+// clang-format off
+/* Binary operators used to fold two values into one during a reduction */
+
+// Returns a when a > b, otherwise b
+static inline ModelScalar
+max(const ModelScalar& a, const ModelScalar& b)
+{
+    if (a > b)
+        return a;
+    return b;
+}
+
+// Returns a when a < b, otherwise b
+static inline ModelScalar
+min(const ModelScalar& a, const ModelScalar& b)
+{
+    if (a < b)
+        return a;
+    return b;
+}
+
+// Returns a + b
+static inline ModelScalar
+sum(const ModelScalar& a, const ModelScalar& b)
+{
+    return a + b;
+}
+
+/* Per-vertex transforms applied to the field value(s) before they are reduced */
+
+// Scalar case: the value itself (no absolute value is taken)
+static inline ModelScalar
+length(const ModelScalar& a)
+{
+    return (ModelScalar)(a);
+}
+
+// Vector case: Euclidean norm of (a, b, c)
+static inline ModelScalar
+length(const ModelScalar& a, const ModelScalar& b, const ModelScalar& c)
+{
+    return sqrtl(a*a + b*b + c*c);
+}
+
+// Scalar case: a^2
+static inline ModelScalar
+squared(const ModelScalar& a)
+{
+    return (ModelScalar)(a*a);
+}
+
+// Vector case: a^2 + b^2 + c^2
+static inline ModelScalar
+squared(const ModelScalar& a, const ModelScalar& b, const ModelScalar& c)
+{
+    return squared(a) + squared(b) + squared(c);
+}
+
+// Scalar case: exp(a)^2
+static inline ModelScalar
+exp_squared(const ModelScalar& a)
+{
+    const long double exp_a = expl(a);
+    return exp_a * exp_a;
+}
+
+// Vector case: exp(a)^2 + exp(b)^2 + exp(c)^2
+static inline ModelScalar
+exp_squared(const ModelScalar& a, const ModelScalar& b, const ModelScalar& c)
+{
+    return exp_squared(a) + exp_squared(b) + exp_squared(c);
+}
+// clang-format on
+
+/**
+ * Reduces the scalar field `a` of `mesh` to a single value on the host.
+ *
+ * Visits every vertex whose indices lie in
+ * [AC_nx_min, AC_nx_max) x [AC_ny_min, AC_ny_max) x [AC_nz_min, AC_nz_max),
+ * as read from mesh.info.int_params.
+ *
+ *   RTYPE_MAX      largest field value
+ *   RTYPE_MIN      smallest field value
+ *   RTYPE_RMS      sqrt(sum(value^2) / AC_nxyz)
+ *   RTYPE_RMS_EXP  sqrt(sum(exp(value)^2) / AC_nxyz)
+ *
+ * Any other rtype is reported with ERROR("Unrecognized RTYPE").
+ */
+ModelScalar
+model_reduce_scal(const ModelMesh& mesh, const ReductionType& rtype,
+                  const VertexBufferHandle& a)
+{
+    ReduceInitialScalFunc to_value; // Per-vertex transform
+    ReduceFunc combine;             // Folds the running result with a new value
+    bool is_rms = false;
+
+    if (rtype == RTYPE_MAX) {
+        to_value = length;
+        combine  = max;
+    }
+    else if (rtype == RTYPE_MIN) {
+        to_value = length;
+        combine  = min;
+    }
+    else if (rtype == RTYPE_RMS) {
+        to_value = squared;
+        combine  = sum;
+        is_rms   = true;
+    }
+    else if (rtype == RTYPE_RMS_EXP) {
+        to_value = exp_squared;
+        combine  = sum;
+        is_rms   = true;
+    }
+    else {
+        ERROR("Unrecognized RTYPE");
+    }
+
+    // Iteration bounds, read once
+    const int x_begin = mesh.info.int_params[AC_nx_min];
+    const int y_begin = mesh.info.int_params[AC_ny_min];
+    const int z_begin = mesh.info.int_params[AC_nz_min];
+    const int x_end   = mesh.info.int_params[AC_nx_max];
+    const int y_end   = mesh.info.int_params[AC_ny_max];
+    const int z_end   = mesh.info.int_params[AC_nz_max];
+
+    // Max/min start from the first visited vertex, the sums start from zero
+    const int first_idx = AC_VTXBUF_IDX(x_begin, y_begin, z_begin, mesh.info);
+    ModelScalar acc     = .0f;
+    if (rtype == RTYPE_MAX || rtype == RTYPE_MIN)
+        acc = to_value(mesh.vertex_buffer[a][first_idx]);
+
+    for (int z = z_begin; z < z_end; ++z) {
+        for (int y = y_begin; y < y_end; ++y) {
+            for (int x = x_begin; x < x_end; ++x) {
+                const int vtx = AC_VTXBUF_IDX(x, y, z, mesh.info);
+                acc = combine(acc, to_value(mesh.vertex_buffer[a][vtx]));
+            }
+        }
+    }
+
+    if (!is_rms)
+        return acc;
+
+    // Root mean square: divide the accumulated sum by AC_nxyz, take the root
+    const ModelScalar inv_n = 1.0l / mesh.info.int_params[AC_nxyz];
+    return sqrtl(inv_n * acc);
+}
+
+/**
+ * Reduces the vector field with components (a, b, c) of `mesh` to a single
+ * value on the host.
+ *
+ * Visits every vertex whose indices lie in
+ * [AC_nx_min, AC_nx_max) x [AC_ny_min, AC_ny_max) x [AC_nz_min, AC_nz_max),
+ * as read from mesh.info.int_params.
+ *
+ *   RTYPE_MAX      largest Euclidean norm of (a, b, c)
+ *   RTYPE_MIN      smallest Euclidean norm of (a, b, c)
+ *   RTYPE_RMS      sqrt(sum(a^2 + b^2 + c^2) / AC_nxyz)
+ *   RTYPE_RMS_EXP  sqrt(sum(exp(a)^2 + exp(b)^2 + exp(c)^2) / AC_nxyz)
+ *
+ * Any other rtype is reported with ERROR("Unrecognized RTYPE").
+ */
+ModelScalar
+model_reduce_vec(const ModelMesh& mesh, const ReductionType& rtype,
+                 const VertexBufferHandle& a, const VertexBufferHandle& b,
+                 const VertexBufferHandle& c)
+{
+    ReduceInitialVecFunc to_value; // Per-vertex transform of the 3 components
+    ReduceFunc combine;            // Folds the running result with a new value
+    bool is_rms = false;
+
+    if (rtype == RTYPE_MAX) {
+        to_value = length;
+        combine  = max;
+    }
+    else if (rtype == RTYPE_MIN) {
+        to_value = length;
+        combine  = min;
+    }
+    else if (rtype == RTYPE_RMS) {
+        to_value = squared;
+        combine  = sum;
+        is_rms   = true;
+    }
+    else if (rtype == RTYPE_RMS_EXP) {
+        to_value = exp_squared;
+        combine  = sum;
+        is_rms   = true;
+    }
+    else {
+        ERROR("Unrecognized RTYPE");
+    }
+
+    // Iteration bounds, read once
+    const int x_begin = mesh.info.int_params[AC_nx_min];
+    const int y_begin = mesh.info.int_params[AC_ny_min];
+    const int z_begin = mesh.info.int_params[AC_nz_min];
+    const int x_end   = mesh.info.int_params[AC_nx_max];
+    const int y_end   = mesh.info.int_params[AC_ny_max];
+    const int z_end   = mesh.info.int_params[AC_nz_max];
+
+    // Max/min start from the first visited vertex, the sums start from zero
+    const int first_idx = AC_VTXBUF_IDX(x_begin, y_begin, z_begin, mesh.info);
+    ModelScalar acc     = 0;
+    if (rtype == RTYPE_MAX || rtype == RTYPE_MIN)
+        acc = to_value(mesh.vertex_buffer[a][first_idx],
+                       mesh.vertex_buffer[b][first_idx],
+                       mesh.vertex_buffer[c][first_idx]);
+
+    for (int z = z_begin; z < z_end; z++) {
+        for (int y = y_begin; y < y_end; y++) {
+            for (int x = x_begin; x < x_end; x++) {
+                const int vtx = AC_VTXBUF_IDX(x, y, z, mesh.info);
+                acc = combine(acc, to_value(mesh.vertex_buffer[a][vtx],
+                                            mesh.vertex_buffer[b][vtx],
+                                            mesh.vertex_buffer[c][vtx]));
+            }
+        }
+    }
+
+    if (!is_rms)
+        return acc;
+
+    // Root mean square: divide the accumulated sum by AC_nxyz, take the root
+    const ModelScalar inv_n = 1.0l / mesh.info.int_params[AC_nxyz];
+    return sqrtl(inv_n * acc);
+}
--- a/src/standalone/model/model_reduce.h
+++ b/src/standalone/model/model_reduce.h
@@ -0,0 +1,37 @@
+/*
+    Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
+ * \brief Brief info.
+ *
+ * Detailed info.
+ *
+ */
+#pragma once
+#include "astaroth.h"
+#include "modelmesh.h"
+
+/**
+ * Reduces the scalar vertex buffer `a` of `mesh` according to `rtype`
+ * (RTYPE_MAX, RTYPE_MIN, RTYPE_RMS or RTYPE_RMS_EXP) and returns the result.
+ */
+ModelScalar model_reduce_scal(const ModelMesh& mesh, const ReductionType& rtype,
+                              const VertexBufferHandle& a);
+
+/**
+ * Reduces the vector field whose components are the vertex buffers `a`, `b`
+ * and `c` of `mesh` according to `rtype` (RTYPE_MAX, RTYPE_MIN, RTYPE_RMS or
+ * RTYPE_RMS_EXP) and returns the result.
+ */
+ModelScalar model_reduce_vec(const ModelMesh& mesh, const ReductionType& rtype,
+                             const VertexBufferHandle& a,
+                             const VertexBufferHandle& b,
+                             const VertexBufferHandle& c);
--- a/src/standalone/model/model_rk3.cc
+++ b/src/standalone/model/model_rk3.cc
--- a/src/standalone/model/model_rk3.h
+++ b/src/standalone/model/model_rk3.h
@@ -0,0 +1,33 @@
+/*
+    Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
+ * \brief Brief info.
+ *
+ * Detailed info.
+ *
+ */
+#pragma once
+#include "astaroth.h"
+#include "modelmesh.h"
+
+// Integrates `mesh` in place over the time step `dt`.
+// NOTE(review): presumably runs all RK3 substeps; the implementation is not
+// shown here - confirm against model_rk3.cc.
+void model_rk3(const ModelScalar dt, ModelMesh* mesh);
+
+// Performs the single substep `step_number` of the integration of `mesh`.
+// NOTE(review): the valid range of step_number is not visible here - confirm
+// against model_rk3.cc.
+void model_rk3_step(const int step_number, const ModelScalar dt, ModelMesh* mesh);
--- a/src/standalone/model/modelmesh.h
+++ b/src/standalone/model/modelmesh.h
@@ -0,0 +1,36 @@
+/*
+    Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
+ * \brief Brief info.
+ *
+ * Detailed info.
+ *
+ */
+#pragma once
+#include "astaroth.h"
+
+
+// Floating-point type used by the host-side model implementation
+typedef long double ModelScalar;
+
+// Host-side mesh of the model implementation
+typedef struct {
+    // One data pointer per vertex buffer handle
+    ModelScalar* vertex_buffer[NUM_VTXBUF_HANDLES];
+    // Mesh parameters (the int_params / real_params tables)
+    AcMeshInfo info;
+} ModelMesh;
--- a/src/standalone/renderer.cc
+++ b/src/standalone/renderer.cc
@@ -0,0 +1,447 @@
+/*
+    Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
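+/**
+ * @file
+ * \brief SDL-based real-time renderer for the standalone build.
+ *
+ * run_renderer() steps the simulation and draws one z-slice of every vertex
+ * buffer as a tile in an SDL window. The arrow keys and page up/down move and
+ * zoom the camera, i/k change the slice, space switches the initial condition
+ * and escape quits.
+ *
+ */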
+#include "run.h"
+
+#include <SDL.h>    // Note: using local version in src/3rdparty dir
+#include <math.h>   // ceil
+#include <string.h> // memcpy
+
+#include "config_loader.h"
+#include "core/errchk.h"
+#include "core/math_utils.h"
+#include "model/host_memory.h"
+#include "model/host_timestep.h"
+#include "model/model_reduce.h"
+#include "model/model_rk3.h"
+#include "timer_hires.h"
+
+// Window
+// `renderer` and `window` are created in renderer_init() and destroyed in
+// renderer_quit(). Note that `renderer` is not static, unlike `window`.
+SDL_Renderer* renderer      = NULL;
+static SDL_Window* window   = NULL;
+// Requested window size; overwritten with the actual size in renderer_init()
+static int window_width     = 800;
+static int window_height    = 600;
+static const int window_bpp = 32; // Bits per pixel
+
+// Surfaces
+// One surface per vertex buffer, created in renderer_init()
+SDL_Surface* surfaces[NUM_VTXBUF_HANDLES];
+// Surface dimensions in pixels; -1 until renderer_init() sets them
+static int datasurface_width  = -1;
+static int datasurface_height = -1;
+// Index of the z-slice that is drawn, and the number of slices it wraps at.
+// Both are set in run_renderer(); k_slice is changed with the i and k keys.
+static int k_slice = 0;
+static int k_slice_max = 0;
+
+// Colors
+// Background colour used when the renderer is cleared
+static SDL_Color color_bg = (SDL_Color){30, 30, 35, 255};
+// Tile layout: one tile per vertex buffer plus one tile for the vector drawn
+// by draw_vertex_buffer_vec()
+static const int num_tiles = NUM_VTXBUF_HANDLES + 1;
+static const int tiles_per_row = 3;
+
+/*
+ * =============================================================================
+ * Camera
+ * =============================================================================
+ */
+/*
+Disabled local definition of float2. float2 is still used below, so it has to
+be provided by one of the included headers.
+
+typedef struct {
+   float x, y;
+} float2;
+*/
+// Rectangle produced by project_ortho(): position (x, y) and extents (w, h)
+typedef struct {
+    float x, y, w, h;
+} vec4;
+
+// 2D camera: the position that is mapped to the window center, and a zoom factor
+typedef struct {
+    float2 pos;
+    float scale;
+} Camera;
+
+// The single camera instance; repositioned in renderer_init() and moved in
+// check_input()
+static Camera camera = (Camera){(float2){.0f, .0f}, 1.f};
+
+/*
+ * Maps the position `pos` and the extents `bbox` to window coordinates using
+ * the global camera: the position is taken relative to camera.pos, scaled by
+ * camera.scale and offset by half of the window dimensions `wdims`. The
+ * extents are only scaled.
+ */
+static inline vec4
+project_ortho(const float2& pos, const float2& bbox, const float2& wdims)
+{
+    vec4 projected;
+    projected.x = camera.scale * (pos.x - camera.pos.x) + 0.5f * wdims.x;
+    projected.y = camera.scale * (pos.y - camera.pos.y) + 0.5f * wdims.y;
+    projected.w = camera.scale * bbox.x;
+    projected.h = camera.scale * bbox.y;
+    return projected;
+}
+
+/*
+ * =============================================================================
+ * Renderer
+ * =============================================================================
+ */
+
+/*
+ * Initializes SDL video and events, creates the window, the SDL renderer and
+ * one RGBA surface per vertex buffer, and fits the camera to the tile grid.
+ *
+ * mx, my: width and height in pixels of each data surface.
+ *
+ * Returns 0. A failing SDL call is reported with SDL_GetError() on stderr and
+ * then trips ERRCHK.
+ * NOTE(review): ERRCHK comes from core/errchk.h and is assumed to abort on a
+ * false condition - confirm.
+ */
+static int
+renderer_init(const int& mx, const int& my)
+{
+    // Init video
+    const int init_status = SDL_InitSubSystem(SDL_INIT_VIDEO | SDL_INIT_EVENTS);
+    if (init_status != 0)
+        fprintf(stderr, "SDL_InitSubSystem failed: %s\n", SDL_GetError());
+    ERRCHK(init_status == 0);
+
+    // Setup window
+    window = SDL_CreateWindow("Astaroth", SDL_WINDOWPOS_UNDEFINED,
+                              SDL_WINDOWPOS_UNDEFINED, window_width,
+                              window_height, SDL_WINDOW_SHOWN);
+    if (!window)
+        fprintf(stderr, "SDL_CreateWindow failed: %s\n", SDL_GetError());
+    ERRCHK(window != NULL);
+
+    // Setup SDL renderer
+    renderer = SDL_CreateRenderer(window, -1, SDL_RENDERER_ACCELERATED);
+    if (!renderer)
+        fprintf(stderr, "SDL_CreateRenderer failed: %s\n", SDL_GetError());
+    ERRCHK(renderer != NULL);
+    //SDL_SetWindowFullscreen(window, SDL_WINDOW_FULLSCREEN_DESKTOP);
+    SDL_GetWindowSize(window, &window_width, &window_height);
+
+    SDL_SetHint(SDL_HINT_RENDER_SCALE_QUALITY, "1"); // Linear filtering
+
+    datasurface_width  = mx;
+    datasurface_height = my;
+    // vec drawing uses the surface of the first component, no memory issues here
+    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
+        surfaces[i] = SDL_CreateRGBSurfaceWithFormat(
+            0, datasurface_width, datasurface_height, window_bpp,
+            SDL_PIXELFORMAT_RGBA8888);
+        if (!surfaces[i])
+            fprintf(stderr, "SDL_CreateRGBSurfaceWithFormat failed: %s\n", SDL_GetError());
+        ERRCHK(surfaces[i] != NULL);
+    }
+
+    // Tiles are placed on row (tile / tiles_per_row), so the number of rows is
+    // the rounded-up quotient. A truncating division would leave a partially
+    // filled last row out of the camera fit.
+    const int num_rows = (num_tiles + tiles_per_row - 1) / tiles_per_row;
+
+    // Center the camera on the tile grid and zoom so that the grid fits
+    camera.pos = (float2){.5f * tiles_per_row * datasurface_width - .5f * datasurface_width,
+                          -.5f * num_rows * datasurface_height + .5f * datasurface_height};
+    camera.scale = min(window_width / float(datasurface_width * tiles_per_row),
+                       window_height / float(datasurface_height * num_rows));
+
+    SDL_RendererInfo renderer_info;
+    SDL_GetRendererInfo(renderer, &renderer_info);
+    printf("SDL renderer max texture dims: (%d, %d)\n", renderer_info.max_texture_width, renderer_info.max_texture_height);
+    return 0;
+}
+
+/*
+ * Writes `color` to the pixel in column i, row j of `surface`. The pixel data
+ * is addressed as 32-bit values with surface->w pixels per row. No bounds
+ * checking is done. Returns 0.
+ */
+static int
+set_pixel(const int& i, const int& j, const uint32_t& color,
+          SDL_Surface* surface)
+{
+    const int offset          = i + j * surface->w;
+    uint32_t* const pixeldata = (uint32_t*)surface->pixels;
+
+    pixeldata[offset] = color;
+    return 0;
+}
+
+/*
+ * Draws slice k_slice of the vertex buffer `vertex_buffer` of `mesh` into the
+ * surface of that buffer and renders the surface as tile number `tile`.
+ *
+ * Each value v is mapped to the shade 255 * |v - mid| / range, clamped to
+ * [0, 255]. The value range is currently fixed to [0, 1]; the reductions that
+ * would determine it from the data are disabled. The shade is written to
+ * colour channel (tile % 3), the other colour channels are 0 and alpha is 255.
+ *
+ * Returns 0.
+ */
+static int
+draw_vertex_buffer(const AcMesh& mesh, const VertexBufferHandle& vertex_buffer,
+                   const int& tile)
+{
+    const float xoffset = (tile % tiles_per_row) * datasurface_width;
+    const float yoffset = - (tile / tiles_per_row) * datasurface_height;
+
+    /*
+    const float max = float(model_reduce_scal(mesh, RTYPE_MAX, vertex_buffer));
+    const float min = float(model_reduce_scal(mesh, RTYPE_MIN, vertex_buffer));
+    */
+    const float max = 1.f;//float(acReduceScal(RTYPE_MAX, vertex_buffer));
+    const float min = 0.f;//float(acReduceScal(RTYPE_MIN, vertex_buffer));
+    const float range = fabsf(max - min);
+    const float mid   = max - .5f * range;
+
+    const int k = k_slice; //mesh.info.int_params[AC_mz] / 2;
+
+    for (int j = 0; j < mesh.info.int_params[AC_my]; ++j) {
+        for (int i = 0; i < mesh.info.int_params[AC_mx]; ++i) {
+            ERRCHK(i < datasurface_width && j < datasurface_height);
+
+            const int idx      = AC_VTXBUF_IDX(i, j, k, mesh.info);
+            const float scaled = 255.f *
+                                 (fabsf(float(mesh.vertex_buffer[vertex_buffer][idx]) - mid)) /
+                                 range;
+
+            // The field is not limited to the fixed [min, max] range, and
+            // converting an out-of-range or NaN float to uint8_t is undefined
+            // behaviour. Clamp to [0, 255] first; NaN ends up as 0.
+            uint8_t shade = 0;
+            if (scaled >= 255.f)
+                shade = 255;
+            else if (scaled > 0.f)
+                shade = (uint8_t)scaled;
+
+            uint8_t color[4]            = {0, 0, 0, 255};
+            color[tile % 3]             = shade;
+            const uint32_t mapped_color = SDL_MapRGBA(
+                surfaces[vertex_buffer]->format, color[0], color[1], color[2],
+                color[3]);
+            set_pixel(i, j, mapped_color, surfaces[vertex_buffer]);
+        }
+    }
+
+    // Project the tile to window coordinates. bbox holds half extents, and the
+    // y axis is flipped because SDL's origin is the top-left corner.
+    const float2 pos   = (float2){xoffset, yoffset};
+    const float2 bbox  = (float2){.5f * datasurface_width,
+                                 .5f * datasurface_height};
+    const float2 wsize = (float2){float(window_width), float(window_height)};
+    const vec4 rectf   = project_ortho(pos, bbox, wsize);
+    SDL_Rect rect      = (SDL_Rect){
+        int(rectf.x - rectf.w), int(wsize.y - rectf.y - rectf.h),
+        int(ceil(2.f * rectf.w)), int(ceil(2.f * rectf.h))};
+
+    // A temporary texture is created from the surface for every draw
+    SDL_Texture* tex = SDL_CreateTextureFromSurface(renderer,
+                                                    surfaces[vertex_buffer]);
+    SDL_RenderCopy(renderer, tex, NULL, &rect);
+    SDL_DestroyTexture(tex);
+
+    return 0;
+}
+
+/*
+ * Draws slice k_slice of a three-component vector field into the surface of
+ * its first component and renders that surface as tile number `tile`.
+ *
+ * The components vertex_buffer_a, _b and _c are mapped to the red, green and
+ * blue channels. Each value v is mapped to 255 * |v - mid| / range, clamped to
+ * [0, 255], where the range spans the smallest minimum and the largest maximum
+ * that acReduceScal reports for the three components. When the range is zero
+ * (uniform field) all channels are 0. Alpha is 255.
+ *
+ * Returns 0.
+ */
+static int
+draw_vertex_buffer_vec(const AcMesh& mesh,
+                       const VertexBufferHandle& vertex_buffer_a,
+                       const VertexBufferHandle& vertex_buffer_b,
+                       const VertexBufferHandle& vertex_buffer_c,
+                       const int& tile)
+{
+    const float xoffset = (tile % tiles_per_row) * datasurface_width;
+    const float yoffset = - (tile / tiles_per_row) * datasurface_height;
+
+    /*
+    const float maxx = float(
+        max(model_reduce_scal(mesh, RTYPE_MAX, vertex_buffer_a),
+            max(model_reduce_scal(mesh, RTYPE_MAX, vertex_buffer_b),
+                model_reduce_scal(mesh, RTYPE_MAX, vertex_buffer_c))));
+    const float minn = float(
+        min(model_reduce_scal(mesh, RTYPE_MIN, vertex_buffer_a),
+            min(model_reduce_scal(mesh, RTYPE_MIN, vertex_buffer_b),
+                model_reduce_scal(mesh, RTYPE_MIN, vertex_buffer_c))));
+    */
+    const float maxx = float(
+        max(acReduceScal(RTYPE_MAX, vertex_buffer_a),
+            max(acReduceScal(RTYPE_MAX, vertex_buffer_b),
+                acReduceScal(RTYPE_MAX, vertex_buffer_c))));
+    const float minn = float(
+        min(acReduceScal(RTYPE_MIN, vertex_buffer_a),
+            min(acReduceScal(RTYPE_MIN, vertex_buffer_b),
+                acReduceScal(RTYPE_MIN, vertex_buffer_c))));
+    const float range = fabsf(maxx - minn);
+    const float mid   = maxx - .5f * range;
+
+    // Red, green and blue are taken from components a, b and c
+    const VertexBufferHandle components[3] = {vertex_buffer_a, vertex_buffer_b,
+                                              vertex_buffer_c};
+
+    const int k = k_slice; //mesh.info.int_params[AC_mz] / 2;
+    for (int j = 0; j < mesh.info.int_params[AC_my]; ++j) {
+        for (int i = 0; i < mesh.info.int_params[AC_mx]; ++i) {
+            ERRCHK(i < datasurface_width && j < datasurface_height);
+
+            const int idx = AC_VTXBUF_IDX(i, j, k, mesh.info);
+
+            uint8_t rgb[3] = {0, 0, 0};
+            for (int c = 0; c < 3; ++c) {
+                // A uniform field gives range == 0; dividing by it would
+                // produce NaN, so those pixels stay at 0
+                if (!(range > 0.f))
+                    continue;
+
+                const float scaled = 255.f *
+                                     (fabsf(float(mesh.vertex_buffer[components[c]][idx]) - mid)) /
+                                     range;
+
+                // Converting an out-of-range or NaN float to uint8_t is
+                // undefined behaviour, so clamp to [0, 255]; NaN ends up as 0
+                if (scaled >= 255.f)
+                    rgb[c] = 255;
+                else if (scaled > 0.f)
+                    rgb[c] = (uint8_t)scaled;
+            }
+
+            const uint32_t mapped_color = SDL_MapRGBA(
+                surfaces[vertex_buffer_a]->format, rgb[0], rgb[1], rgb[2], 255);
+            set_pixel(i, j, mapped_color, surfaces[vertex_buffer_a]);
+        }
+    }
+
+    // Project the tile to window coordinates. bbox holds half extents, and the
+    // y axis is flipped because SDL's origin is the top-left corner.
+    const float2 pos   = (float2){xoffset, yoffset};
+    const float2 bbox  = (float2){.5f * datasurface_width,
+                                 .5f * datasurface_height};
+    const float2 wsize = (float2){float(window_width), float(window_height)};
+    const vec4 rectf   = project_ortho(pos, bbox, wsize);
+    SDL_Rect rect      = (SDL_Rect){
+        int(rectf.x - rectf.w), int(wsize.y - rectf.y - rectf.h),
+        int(ceil(2.f * rectf.w)), int(ceil(2.f * rectf.h))};
+
+    // A temporary texture is created from the surface for every draw
+    SDL_Texture* tex = SDL_CreateTextureFromSurface(renderer,
+                                                    surfaces[vertex_buffer_a]);
+    SDL_RenderCopy(renderer, tex, NULL, &rect);
+    SDL_DestroyTexture(tex);
+
+    return 0;
+}
+
+/*
+ * Draws one frame: one tile per vertex buffer, followed by one tile showing
+ * the vector (VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ). The frame is presented and
+ * the renderer cleared to the background colour. Finally the maximum and
+ * minimum reported by acReduceScal are printed for every vertex buffer.
+ *
+ * Returns 0.
+ */
+static int
+renderer_draw(const AcMesh& mesh)
+{
+    // Scalar tiles: the tile index equals the vertex buffer index
+    for (int handle = 0; handle < NUM_VTXBUF_HANDLES; ++handle)
+        draw_vertex_buffer(mesh, VertexBufferHandle(handle), handle);
+
+    // The vector tile takes the slot after the last scalar tile
+    draw_vertex_buffer_vec(mesh, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ,
+                           NUM_VTXBUF_HANDLES);
+
+    // Present the finished frame, then clear for the next one
+    SDL_RenderPresent(renderer);
+    SDL_SetRenderDrawColor(renderer, color_bg.r, color_bg.g, color_bg.b,
+                           color_bg.a);
+    SDL_RenderClear(renderer);
+
+    // Print the extrema of each field
+    int handle = 0;
+    while (handle < NUM_VTXBUF_HANDLES) {
+        const VertexBufferHandle vtxbuf = VertexBufferHandle(handle);
+        const double field_max = (double)acReduceScal(RTYPE_MAX, vtxbuf);
+        const double field_min = (double)acReduceScal(RTYPE_MIN, vtxbuf);
+        printf("\t%s umax %e, min %e\n", vtxbuf_names[vtxbuf], field_max,
+               field_min);
+        ++handle;
+    }
+    printf("\n");
+
+    return 0;
+}
+
+/*
+ * Releases everything renderer_init() created: the per-buffer surfaces, the
+ * SDL renderer and the window. The global handles are reset to NULL and SDL is
+ * shut down. Returns 0.
+ */
+static int
+renderer_quit(void)
+{
+    int handle = 0;
+    while (handle < NUM_VTXBUF_HANDLES) {
+        SDL_FreeSurface(surfaces[handle]);
+        ++handle;
+    }
+
+    SDL_DestroyRenderer(renderer);
+    SDL_DestroyWindow(window);
+    renderer = NULL;
+    window   = NULL;
+
+    SDL_Quit();
+
+    return 0;
+}
+
+// Initial condition that is currently selected; the space key cycles it
+static int init_type = INIT_TYPE_GAUSSIAN_RADIAL_EXPL;
+
+/*
+ * Processes all pending SDL events.
+ *
+ *   window close, escape : return false
+ *   space                : switch to the next initial condition, reinitialize
+ *                          `mesh` and load it with acLoad
+ *   i / k                : move to the next / previous z-slice, wrapping at
+ *                          k_slice_max
+ *
+ * Returns true when no quit request was seen.
+ */
+static bool
+running(AcMesh* mesh)
+{
+    SDL_Event event;
+    while (SDL_PollEvent(&event)) {
+        if (event.type == SDL_QUIT)
+            return false;
+
+        if (event.type != SDL_KEYDOWN)
+            continue;
+
+        switch (event.key.keysym.sym) {
+        case SDLK_ESCAPE:
+            return false;
+        case SDLK_SPACE:
+            init_type = (init_type + 1) % NUM_INIT_TYPES;
+            acmesh_init_to(InitType(init_type), mesh);
+            acLoad(*mesh);
+            break;
+        case SDLK_i:
+            k_slice = (k_slice + 1) % k_slice_max;
+            printf("k_slice %d\n", k_slice);
+            break;
+        case SDLK_k:
+            // Adding k_slice_max keeps the left operand of % non-negative
+            k_slice = (k_slice - 1 + k_slice_max) % k_slice_max;
+            printf("k_slice %d\n", k_slice);
+            break;
+        default:
+            break;
+        }
+    }
+    return true;
+}
+
+/*
+ * Applies the keys that are currently held down. `dt` is the time since the
+ * previous call in seconds.
+ *
+ *   arrow keys          : move the camera
+ *   page up / page down : zoom in / out
+ *   comma / period      : set the timescale to 0.1 / 1.0
+ */
+static void
+check_input(const float& dt)
+{
+    const uint8_t* held = (uint8_t*)SDL_GetKeyboardState(NULL);
+
+    // The translation rate is divided by the zoom level as it was on entry
+    const float pan_rate  = 1000.f / camera.scale;
+    const float zoom_rate = 1.0001f;
+
+    /* Translation */
+    if (held[SDL_SCANCODE_UP])
+        camera.pos.y += pan_rate * dt;
+    if (held[SDL_SCANCODE_DOWN])
+        camera.pos.y -= pan_rate * dt;
+    if (held[SDL_SCANCODE_LEFT])
+        camera.pos.x -= pan_rate * dt;
+    if (held[SDL_SCANCODE_RIGHT])
+        camera.pos.x += pan_rate * dt;
+
+    /* Zoom, proportional to the current scale */
+    if (held[SDL_SCANCODE_PAGEUP])
+        camera.scale += camera.scale * zoom_rate * dt;
+    if (held[SDL_SCANCODE_PAGEDOWN])
+        camera.scale -= camera.scale * zoom_rate * dt;
+
+    /* Simulation timescale */
+    if (held[SDL_SCANCODE_COMMA])
+        set_timescale(AcReal(.1));
+    if (held[SDL_SCANCODE_PERIOD])
+        set_timescale(AcReal(1.));
+}
+
+/*
+ * Entry point of the interactive renderer.
+ *
+ * Loads the configuration, initializes SDL and the mesh, and then loops until
+ * running() reports a quit request. Each iteration handles input and advances
+ * the simulation by one step. At most 60 times per second the current z-slice
+ * is stored to the host mesh and drawn.
+ *
+ * Returns 0.
+ */
+int
+run_renderer(void)
+{
+    /* Parse configs */
+    AcMeshInfo mesh_info;
+    load_config(&mesh_info);
+    renderer_init(mesh_info.int_params[AC_mx], mesh_info.int_params[AC_my]);
+
+    AcMesh* mesh = acmesh_create(mesh_info);
+    acmesh_init_to(InitType(init_type), mesh);
+
+    acInit(mesh_info);
+    acLoad(*mesh);
+
+    Timer frame_timer;
+    timer_reset(&frame_timer);
+
+    Timer wallclock;
+    timer_reset(&wallclock);
+
+    Timer io_timer;
+    timer_reset(&io_timer);
+
+    const float desired_frame_time = 1.f / 60.f;
+    int steps                      = 0;
+    // Start from the middle slice; the i and k keys wrap at k_slice_max
+    k_slice                        = mesh->info.int_params[AC_mz] / 2;
+    k_slice_max                    = mesh->info.int_params[AC_mz];
+    while (running(mesh)) {
+
+        /* Input */
+        check_input(timer_diff_nsec(io_timer) / 1e9f);
+        timer_reset(&io_timer);
+
+/* Step the simulation */
+#if 1
+        const AcReal umax = acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY,
+                                        VTXBUF_UUZ);
+        const AcReal dt   = host_timestep(umax, mesh_info);
+        acIntegrate(dt);
+#else
+        ModelMesh* model_mesh = modelmesh_create(mesh->info);
+        // Copy the current state into the model mesh before reducing it, so
+        // that umax and dt are computed from the actual velocity field
+        acmesh_to_modelmesh(*mesh, model_mesh);
+        const AcReal umax = AcReal(model_reduce_vec(*model_mesh, RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ));
+        const AcReal dt   = host_timestep(umax, mesh_info);
+        model_rk3(dt, model_mesh);
+        modelmesh_to_acmesh(*model_mesh, mesh);
+        modelmesh_destroy(model_mesh);
+        acLoad(*mesh); // Just a quick hack s.t. we do not have to add an
+                       // additional if to the render part
+#endif
+
+        ++steps;
+
+        /* Render */
+        const float timer_diff_sec = timer_diff_nsec(frame_timer) / 1e9f;
+        if (timer_diff_sec >= desired_frame_time) {
+            //acStore(mesh);
+            // Only the slice that is drawn is stored: AC_mxy vertices starting
+            // at (0, 0, k_slice)
+            const int num_vertices = mesh->info.int_params[AC_mxy];
+            const int3 dst         = (int3){0, 0, k_slice};
+            acStoreWithOffset(dst, num_vertices, mesh);
+            acSynchronize();
+            renderer_draw(*mesh); // Bottleneck is here
+            printf("Step #%d, dt: %f\n", steps, double(dt));
+            timer_reset(&frame_timer);
+        }
+    }
+    printf("Wallclock time %f s\n", double(timer_diff_nsec(wallclock) / 1e9f));
+
+    acStore(mesh);
+    acQuit();
+    acmesh_destroy(mesh);
+
+    renderer_quit();
+
+    return 0;
+}
--- a/src/standalone/run.h
+++ b/src/standalone/run.h
@@ -0,0 +1,35 @@
+/*
+    Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
+ * \brief Brief info.
+ *
+ * Detailed info.
+ *
+ */
+#pragma once
+
+// Entry points of the standalone run modes.
+// NOTE(review): only run_renderer() is visible in this chunk, where it always
+// returns 0; confirm the return conventions of the other three.
+
+int run_autotest(void);
+
+int run_simulation(void);
+
+int run_benchmark(void);
+
+// Interactive SDL renderer; runs until the window is closed or escape is pressed
+int run_renderer(void);
--- a/src/standalone/simulation.cc
+++ b/src/standalone/simulation.cc
@@ -0,0 +1,339 @@
+/*
+    Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
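+ * \brief Standalone simulation run mode.
+ *
+ * Implements run_simulation() together with its helpers for writing the mesh
+ * info, the diagnostics time series and the binary mesh snapshots.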
+ *
+ */
+#include "run.h"
+
+#include "config_loader.h"
+#include "core/errchk.h"
+#include "core/math_utils.h"
+#include "model/host_memory.h"
+#include "model/host_timestep.h"
+#include "model/model_reduce.h"
+#include "model/model_rk3.h"
+#include "timer_hires.h"
+
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+/*
+// DEPRECATED: TODO remove
+static inline void
+print_diagnostics(const AcMesh& mesh, const int& step, const AcReal& dt)
+{
+    const int max_name_width = 16;
+    printf("Step %d, dt %e s\n", step, double(dt));
+    printf("  %*s: min %.3e,\trms %.3e,\tmax %.3e\n", max_name_width, "uu total",
+    double(model_reduce_vec(mesh, RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ)),
+    double(model_reduce_vec(mesh, RTYPE_MIN, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ)),
+    double(model_reduce_vec(mesh, RTYPE_RMS, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ)));
+
+    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
+        printf("  %*s: min %.3e,\trms %.3e,\tmax %.3e\n", max_name_width, vtxbuf_names[i],
+        double(model_reduce_scal(mesh, RTYPE_MAX, VertexBufferHandle(i))),
+        double(model_reduce_scal(mesh, RTYPE_MIN, VertexBufferHandle(i))),
+        double(model_reduce_scal(mesh, RTYPE_RMS, VertexBufferHandle(i))));
+    }
+}
+*/
+
+// Writes the mesh settings into separate ASCII files. This is done to guarantee
+// that the data-specific information is stored next to the output data, even
+// though in principle the same values are available in astaroth.conf.
+//
+// Two files are (re)created in the current working directory:
+//   purge.sh       - shell script that removes the generated output files
+//   mesh_info.list - one "<type> <name> <value>" line per written parameter
+//
+// config: mesh info whose int_params and real_params are written out.
+//
+// If a file cannot be opened, the failure is reported with perror() and that
+// file is skipped instead of writing through a NULL FILE pointer.
+static inline
+void write_mesh_info(const AcMeshInfo* config)
+{
+    FILE* infotxt;
+
+    infotxt = fopen("purge.sh", "w");
+    if (infotxt) {
+        fprintf(infotxt, "#!/bin/bash\n");
+        fprintf(infotxt, "rm *.list *.mesh *.ts purge.sh\n");
+        fclose(infotxt);
+    }
+    else {
+        perror("write_mesh_info: could not open purge.sh");
+    }
+
+    infotxt = fopen("mesh_info.list", "w");
+    if (!infotxt) {
+        perror("write_mesh_info: could not open mesh_info.list");
+        return;
+    }
+
+    // Total grid dimensions
+    fprintf(infotxt, "int  AC_mx        %i \n", config->int_params[AC_mx]);
+    fprintf(infotxt, "int  AC_my        %i \n", config->int_params[AC_my]);
+    fprintf(infotxt, "int  AC_mz        %i \n", config->int_params[AC_mz]);
+
+    // Bounds for the computational domain, i.e. nx_min <= i < nx_max
+    fprintf(infotxt, "int  AC_nx_min    %i \n", config->int_params[AC_nx_min]);
+    fprintf(infotxt, "int  AC_nx_max    %i \n", config->int_params[AC_nx_max]);
+    fprintf(infotxt, "int  AC_ny_min    %i \n", config->int_params[AC_ny_min]);
+    fprintf(infotxt, "int  AC_ny_max    %i \n", config->int_params[AC_ny_max]);
+    fprintf(infotxt, "int  AC_nz_min    %i \n", config->int_params[AC_nz_min]);
+    fprintf(infotxt, "int  AC_nz_max    %i \n", config->int_params[AC_nz_max]);
+
+    // Spacing
+    fprintf(infotxt, "real AC_inv_dsx   %e \n", (double)config->real_params[AC_inv_dsx]);
+    fprintf(infotxt, "real AC_inv_dsy   %e \n", (double)config->real_params[AC_inv_dsy]);
+    fprintf(infotxt, "real AC_inv_dsz   %e \n", (double)config->real_params[AC_inv_dsz]);
+    fprintf(infotxt, "real AC_dsmin     %e \n", (double)config->real_params[AC_dsmin  ]);
+
+    /* Additional helper params */
+    // Int helpers
+    fprintf(infotxt, "int  AC_mxy       %i \n", config->int_params[AC_mxy ]);
+    fprintf(infotxt, "int  AC_nxy       %i \n", config->int_params[AC_nxy ]);
+    fprintf(infotxt, "int  AC_nxyz      %i \n", config->int_params[AC_nxyz]);
+
+    // Real helpers
+    fprintf(infotxt, "real AC_cs2_sound %e \n", (double)config->real_params[AC_cs2_sound]);
+    fprintf(infotxt, "real AC_cv_sound  %e \n", (double)config->real_params[AC_cv_sound ]);
+
+    fclose(infotxt);
+}
+
+
+// Writes a run state into a set of C binary files, one file per vertex buffer,
+// named "<buffer name>_<step>.mesh". For the sake of accuracy, all floating
+// point numbers are saved in long double precision regardless of the choice of
+// precision during runtime.
+//
+// File layout: the time stamp t_step, followed by the
+// AC_VTXBUF_SIZE(save_mesh.info) values of the buffer, each stored as one
+// long double.
+//
+// save_mesh: mesh whose vertex buffers are written.
+// step:      step number, used as part of the file names.
+// t_step:    simulation time written at the start of every file.
+//
+// A buffer whose file name does not fit or whose file cannot be opened is
+// reported on stderr and skipped; short writes are reported as well.
+static inline void
+save_mesh(const AcMesh &save_mesh, const int step,
+          const AcReal t_step)
+{
+    const size_t n = AC_VTXBUF_SIZE(save_mesh.info);
+
+    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w) {
+        // Build the file name with a bounded write: the previous
+        // sprintf/strcat chain could overflow its fixed-size buffers.
+        char bin_filename[80];
+        const int len = snprintf(bin_filename, sizeof(bin_filename), "%s_%d.mesh",
+                                 vtxbuf_names[w], step);
+        if (len < 0 || size_t(len) >= sizeof(bin_filename)) {
+            fprintf(stderr, "save_mesh: file name for buffer %d at step %d is too long, "
+                            "skipping\n", w, step);
+            continue;
+        }
+
+        printf("Savefile %s \n", bin_filename);
+
+        FILE* save_ptr = fopen(bin_filename, "wb");
+        if (!save_ptr) {
+            perror(bin_filename);
+            continue;
+        }
+
+        // Start file with time stamp
+        long double write_long_buf = (long double) t_step;
+        size_t num_written = fwrite(&write_long_buf, sizeof(long double), 1, save_ptr);
+
+        // Grid data
+        for (size_t i = 0; i < n; ++i) {
+            write_long_buf = (long double) save_mesh.vertex_buffer[VertexBufferHandle(w)][i];
+            num_written += fwrite(&write_long_buf, sizeof(long double), 1, save_ptr);
+        }
+
+        if (num_written != n + 1)
+            fprintf(stderr, "save_mesh: incomplete write to %s\n", bin_filename);
+
+        if (fclose(save_ptr) != 0)
+            perror(bin_filename);
+    }
+}
+
+
+
+// Prints the diagnostic values to stdout and appends the same values as one
+// row to the ASCII file diag_file.
+//
+// The row starts with step, t_step and dt, followed by min, rms and max of the
+// velocity vector field and then min, rms and max of every vertex buffer
+// handled as a scalar field.
+static inline void
+print_diagnostics(const int step, const AcReal dt, const AcReal t_step, FILE *diag_file)
+{
+    const int name_width = 16;
+
+    // Velocity vector field. The reductions are issued in the order max, min,
+    // rms, while the output is written in the order min, rms, max.
+    const AcReal uu_max = acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
+    const AcReal uu_min = acReduceVec(RTYPE_MIN, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
+    const AcReal uu_rms = acReduceVec(RTYPE_RMS, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
+
+    printf("Step %d, t_step %.3e, dt %e s\n", step, double(t_step), double(dt));
+    printf("  %*s: min %.3e,\trms %.3e,\tmax %.3e\n", name_width, "uu total",
+           double(uu_min), double(uu_rms), double(uu_max));
+    fprintf(diag_file, "%d %e %e %e %e %e ", step, double(t_step), double(dt),
+            double(uu_min), double(uu_rms), double(uu_max));
+
+    // Every vertex buffer as a scalar field
+    for (int handle = 0; handle < NUM_VTXBUF_HANDLES; ++handle) {
+        const VertexBufferHandle vtxbuf = VertexBufferHandle(handle);
+
+        const AcReal scal_max = acReduceScal(RTYPE_MAX, vtxbuf);
+        const AcReal scal_min = acReduceScal(RTYPE_MIN, vtxbuf);
+        const AcReal scal_rms = acReduceScal(RTYPE_RMS, vtxbuf);
+
+        printf("  %*s: min %.3e,\trms %.3e,\tmax %.3e\n", name_width, vtxbuf_names[handle],
+               double(scal_min), double(scal_rms), double(scal_max));
+        fprintf(diag_file, "%e %e %e ", double(scal_min), double(scal_rms), double(scal_max));
+    }
+
+    // Terminate the row
+    fprintf(diag_file, "\n");
+}
+
+    /* 
+        MV NOTE: At the moment I have no clear idea how to calculate magnetic
+        diagnostic variables from grid. Vector potential measures have a limited
+        value. TODO: Smart way to get brms, bmin and bmax.
+    */ 
+
+/*
+    Runs the simulation.
+
+    Loads the configuration, creates and initializes the mesh, loads it to the
+    device and steps the simulation for i = 1 .. AC_max_steps - 1. Along the way
+        - a title row and the diagnostics of step 0 are appended to
+          timeseries.ts, followed by one row every AC_save_steps steps,
+        - the mesh info is written with write_mesh_info(),
+        - a binary snapshot is written at step 0, every AC_bin_steps steps, and
+          whenever the simulation time reaches the next multiple of
+          AC_bin_save_t.
+
+    A step interval of 0 disables the corresponding step-based output instead
+    of dividing by zero.
+
+    Returns 0 on success and 1 if timeseries.ts could not be opened.
+*/
+int
+run_simulation(void)
+{
+    /* Parse configs */
+    AcMeshInfo mesh_info;
+    load_config(&mesh_info);
+
+    AcMesh* mesh = acmesh_create(mesh_info);
+    acmesh_init_to(INIT_TYPE_GAUSSIAN_RADIAL_EXPL, mesh);
+
+    acInit(mesh_info);
+    acLoad(*mesh);
+
+    FILE* diag_file = fopen("timeseries.ts", "a");
+    if (!diag_file) {
+        // Without the time series file every later fprintf would write through
+        // a NULL pointer, so release what has been set up and bail out.
+        perror("run_simulation: could not open timeseries.ts");
+        acQuit();
+        acmesh_destroy(mesh);
+        return 1;
+    }
+
+    // TODO Get time from earlier state.
+    AcReal t_step = 0.0;
+
+    // Generate the title row.
+    fprintf(diag_file, "step  t_step  dt  uu_total_min  uu_total_rms  uu_total_max  ");
+    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
+        fprintf(diag_file, "%s_min  %s_rms  %s_max  ", vtxbuf_names[i], vtxbuf_names[i], vtxbuf_names[i]);
+    }
+
+    fprintf(diag_file, "\n");
+
+    write_mesh_info(&mesh_info);
+    print_diagnostics(0, AcReal(.0), t_step, diag_file);
+
+    acSynchronize();
+    acStore(mesh);
+    save_mesh(*mesh, 0, t_step);
+
+    const int max_steps      = mesh_info.int_params[AC_max_steps];
+    const int save_steps     = mesh_info.int_params[AC_save_steps];
+    const int bin_save_steps = mesh_info.int_params[AC_bin_steps];
+
+    AcReal bin_save_t = mesh_info.real_params[AC_bin_save_t];
+    AcReal bin_crit_t = bin_save_t;
+
+    /* Step the simulation */
+    for (int i = 1; i < max_steps; ++i) {
+        const AcReal umax = acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY,
+                                        VTXBUF_UUZ);
+        const AcReal dt   = host_timestep(umax, mesh_info);
+        acIntegrate(dt);
+
+        t_step += dt;
+
+        /* Print diagnostics */
+        // The != 0 test guards the modulo against a zero interval.
+        if (save_steps != 0 && (i % save_steps) == 0) {
+
+            /*
+                print_diagnostics() writes out both std.out printout from the
+                results and saves the diagnostics into a table for ascii file
+                timeseries.ts.
+            */
+
+            print_diagnostics(i, dt, t_step, diag_file);
+
+            /*
+                We might also want an XY-average calculating function,
+                which can be very useful when observing behaviour of turbulent
+                simulations. (TODO)
+            */
+
+        }
+
+        /* Save the simulation state as a binary snapshot */
+        const bool save_by_step = bin_save_steps != 0 && (i % bin_save_steps) == 0;
+        if (save_by_step || t_step >= bin_crit_t) {
+
+            /*
+                This branch saves the data into simple C binaries which can be
+                used for analysing the data snapshots closely.
+
+                Saving simulation state should happen in a separate stage. We do
+                not want to save it as often as diagnostics. The file format
+                should IDEALLY be HDF5, which has become a well supported,
+                portable and reliable data format when it comes to HPC
+                applications. However, implementing it will have to wait for
+                the simpler approach to function. (TODO?)
+            */
+
+            /*
+                The updated mesh will be located on the GPU. Also all calls
+                to the astaroth interface (functions beginning with ac*) are
+                assumed to be asynchronous, so the meshes must be also synchronized
+                before transferring the data to the CPU.
+            */
+
+            acSynchronize();
+            acStore(mesh);
+
+            save_mesh(*mesh, i, t_step);
+
+            bin_crit_t += bin_save_t;
+
+        }
+
+    }
+
+    // TODO: save the final snapshot (acSynchronize, acStore, save_mesh).
+
+    acQuit();
+    acmesh_destroy(mesh);
+
+    fclose(diag_file);
+
+    return 0;
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
--- a/src/standalone/timer_hires.h
+++ b/src/standalone/timer_hires.h
@@ -0,0 +1,64 @@
+/*
+    Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+  @file
+ \brief High-resolution timer.
+
+    Usage:
+        Timer t;
+        timer_reset(&t);
+        timer_diff_nsec(t);
+
+    If there are issues, try compiling with -std=gnu11 -lrt
+ */
+#pragma once
+#include <stdio.h> // perror
+#include <time.h>
+
+typedef struct timespec Timer;
+// Contains at least the following members:
+// time_t tv_sec;
+// long tv_nsec;
+
+static inline int
+timer_reset(Timer* t)
+{
+    const int retval = clock_gettime(CLOCK_REALTIME, t);
+    if (retval == -1)
+        perror("clock_gettime failure");
+
+    return retval;
+}
+
+/**
+ * Returns the difference in nanoseconds between a fresh timer_reset() reading
+ * and the reading stored in start.
+ */
+static inline long
+timer_diff_nsec(const Timer start)
+{
+    const long nsec_per_sec = 1000000000l;
+
+    Timer now;
+    timer_reset(&now);
+
+    return (now.tv_sec - start.tv_sec) * nsec_per_sec +
+           (now.tv_nsec - start.tv_nsec);
+}
+
+/**
+ * Prints the time elapsed since the reading stored in t, in milliseconds.
+ */
+static inline void
+timer_diff_print(const Timer t)
+{
+    const double elapsed_ms = timer_diff_nsec(t) / 1e6;
+    printf("Time elapsed: %g ms\n", elapsed_ms);
+}