Added Astaroth 2.0

2019-06-14 14:18:35 +03:00
parent 4e4f84c8ff
commit 0e48766a68
87 changed files with 18058 additions and 1 deletions
--- a/3rdparty/setup_dependencies.sh
+++ b/3rdparty/setup_dependencies.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+INITIAL_DIR=$(pwd)
+
+
+# Fetch SDL2
+git clone https://github.com/davidsiaw/SDL2.git
+cd SDL2
+git pull
+mkdir build
+cd build && cmake .. && make -j
+
+# See https://github.com/davidsiaw/SDL2/blob/master/docs/README-linux.md
+# if there are isses with building
+
+
+# Done
+cd $INITIAL_DIR
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -0,0 +1,172 @@
+#
+# CMakeLists.txt for generating the makefile for Astaroth.
+#   Usage: mkdir build && cd build && cmake <optional flags> ..
+#
+#   For example: cmake -DDOUBLE_PRECISION=ON ..
+#
+#   If you want to see the exact flags used during compilation, run
+#   "make -j VERBOSE=1"
+#
+# Make sure your machine satisfies the system requirements:
+# https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements
+
+#-------------------General---------------------------------------------------#
+
+# cmake_minimum_required() must be called before project(): it sets the policy
+# defaults that project() and the compiler detection rely on.
+cmake_minimum_required (VERSION 3.5.1) # Need >= 3.8 for first-class CUDA support
+project(ASTAROTH_2.0 CXX)
+set (CMAKE_CXX_STANDARD 98)
+cmake_policy (SET CMP0023 NEW)
+
+
+#-------------------Set user options with default values---------------------#
+
+#Usage f.ex. cmake -DBUILD_DEBUG=ON ..
+option(BUILD_DEBUG "Builds the program with extensive error checking" OFF)
+option(BUILD_STANDALONE "Builds standalone Astaroth" ON)
+option(DOUBLE_PRECISION "Generates double precision code" OFF)
+option(TIARA_CLUSTER "Special settings for compilation TIARA GPU cluster" OFF)
+option(MULTIGPU_ENABLED "If enabled, uses all the available GPUs" ON)
+option(ALTER_CONF "If enabled, loads astaroth.conf from the build directory" OFF)
+
+#-------------------Determine build type--------------------------------------#
+
+#Available types (case-sensitive):
+#RELEASE         (best performance)
+#DEBUG           (w/ debug information, non-concurrent kernels)
+if (BUILD_DEBUG)
+    set(CMAKE_BUILD_TYPE DEBUG)
+else ()
+    set(CMAKE_BUILD_TYPE RELEASE)
+endif()
+message(STATUS "Build type: " ${CMAKE_BUILD_TYPE})
+
+
+#----------------------Find packages------------------------------------------#
+
+# C++ compiler info (compiler path and compiler ID)
+message(STATUS "CMAKE_CXX_COMPILER: " ${CMAKE_CXX_COMPILER})
+message(STATUS "CMAKE_CXX_COMPILER_ID: " ${CMAKE_CXX_COMPILER_ID})
+
+# SDL 2
+set(SDL2_INCLUDE_DIR ${CMAKE_SOURCE_DIR}/3rdparty/SDL2/include/)
+set(SDL2_LIBRARY_DIR ${CMAKE_SOURCE_DIR}/3rdparty/SDL2/build/)
+set(SDL2_LIBRARY "SDL2")
+include_directories(${SDL2_INCLUDE_DIR})
+link_directories(${SDL2_LIBRARY_DIR})
+
+# CUDA
+find_package(CUDA)
+if (NOT CUDA_FOUND)
+    # find_package(CUDA REQUIRED) gives a confusing error message if it fails,
+    # therefore we print the reason here explicitly
+    message(FATAL_ERROR "CUDA not found")
+endif()
+include_directories(${CUDA_INCLUDE_DIRS})
+
+# OpenMP
+find_package(OpenMP)
+if (NOT OPENMP_FOUND)
+    message(WARNING "OpenMP not found. All host-side concurrency disabled \
+                    (lower performance).")
+else ()
+    set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+endif()
+
+#----------------------Compilation settings-----------------------------------#
+
+#Debug and verification
+#set(CMAKE_VERBOSE_MAKEFILE OFF)
+#set(CXX_VERBOSE_BUILD OFF)
+#set(CUDA_VERBOSE_BUILD OFF)
+#include(CTest)
+#add_test(ac_test ac_run)
+#find_program(MEMORYCHECK_COMMAND valgrind)
+#set(MEMORYCHECK_COMMAND_OPTIONS "--trace-children=yes --leak-check=full" )
+
+
+#----------------------Setup defines------------------------------------------#
+
+if (DOUBLE_PRECISION)
+	add_definitions(-DAC_DOUBLE_PRECISION=1)
+else()
+    add_definitions(-DAC_DOUBLE_PRECISION=0)
+endif()
+
+# A full integration step is benchmarked by default, enable this option
+# (cmake -DGEN_BENCHMARK_RK3=ON ..) to benchmark RK3 only.
+# Declared with option() so that it is documented and visible in the CMake cache.
+option(GEN_BENCHMARK_RK3 "Benchmarks RK3 only instead of a full integration step" OFF)
+if (GEN_BENCHMARK_RK3)
+    add_definitions(-DGEN_BENCHMARK_RK3=1)
+else()
+    add_definitions(-DGEN_BENCHMARK_RK3=0)
+endif()
+
+if (MULTIGPU_ENABLED)
+    add_definitions(-DAC_MULTIGPU_ENABLED=1)
+else()
+    add_definitions(-DAC_MULTIGPU_ENABLED=0)
+endif()
+
+#-----------------------TIARA specific options--------------------------------#
+#OLD#set (CXX_FLAGS_TIARA "-I/software/opt/cuda/9.0/include/")
+# %JP: NOTE! This should not be needed anymore because the command
+#      find_package(CUDA) above should find and include this directory automatically
+#USE THIS:
+if (TIARA_CLUSTER)
+	set (CXX_FLAGS_TIARA "-mno-bmi2")
+endif()
+
+#----------------------Setup CXX compilation flags----------------------------#
+set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}\
+                             -O2 -march=native -pipe")
+
+set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}\
+                             -O0 -g")
+
+set (CXX_FLAGS_WARNING "-Wall -Wextra -Werror -Wno-error=unused-parameter\
+                        -Wno-error=unused-function -Wno-error=unknown-pragmas")
+
+# Also warn about implicit conversions if the compiler supports it.
+# The variable name is passed unexpanded: if() dereferences it itself, so the
+# condition stays well-formed even when the compiler ID is empty.
+if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+    set (CXX_FLAGS_WARNING "${CXX_FLAGS_WARNING} -Wdouble-promotion -Wfloat-conversion")
+endif()
+
+# Other flags. -D_FORCE_INLINES is a workaround to some CUDA/C++ "feature"
+# which botches the compilation ("memcpy was not declared in this scope")
+# (Not required with cc >= 3.0)
+#set(CXX_FLAGS_ETC "-D_FORCE_INLINES")
+
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}\
+                    ${CXX_FLAGS_WARNING}\
+                    ${CXX_FLAGS_ETC}\
+                    ${CXX_FLAGS_TIARA}") # %JP: CXX_FLAGS_TIARA should not be needed,
+					 #      see comments in "TIARA specific options"
+
+message("CXX_FLAGS: " ${CMAKE_CXX_FLAGS})
+
+
+#----------------------Setup core subdirectories------------------------------#
+
+#Include root directory (.) so that the following modules can include their
+#parent dir (f.ex. #include "common/stuff.h" instead of "../common/stuff")
+include_directories(.)
+include_directories(include)
+include_directories(src)
+
+# CUDA sources
+add_subdirectory(src/core)
+
+#----------------------Link---------------------------------------------------#
+
+if (BUILD_STANDALONE)
+    #Define the config directory
+    if (ALTER_CONF)
+        set(ASTAROTH_CONF_PATH "${CMAKE_BINARY_DIR}/")
+    else()
+        set(ASTAROTH_CONF_PATH "${CMAKE_SOURCE_DIR}/config/")
+    endif()
+
+    #Add additional subdirectories
+    add_subdirectory (src/standalone)
+    cuda_add_executable(ac_run src/standalone/main.cc)
+    target_link_libraries(ac_run astaroth_standalone astaroth_core ${SDL2_LIBRARY})
+endif()
--- a/LICENCE.txt
+++ b/LICENCE.txt
@@ -0,0 +1,18 @@
+/*
+    Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
--- a/README.md
+++ b/README.md
@@ -1,2 +1,118 @@
-# Astaroth

+# Astaroth - A Multi-GPU library for generic stencil computations
+
+Astaroth is a single-node multi-GPU library for multiphysics and other problems, which involve stencil computations in a discrete mesh. It's licenced under the terms of the GNU General Public Licence, version 3, or later (see [LICENCE.txt](https://bitbucket.org/miikkavaisala/astaroth-code/src/master/astaroth_2.0/LICENCE.txt)). Astaroth ships with a domain-specific language, that can be used to translate high-level representation of the stencil computations into a heavily inlined GPU pipeline.
+
+## System requirements
+
+NVIDIA GPU with >= 3.0 compute capability. See https://en.wikipedia.org/wiki/CUDA#GPUs_supported.
+
+## Building (3rd party libraries)
+
+1. `cd 3rdparty`
+1. `./setup_dependencies.sh` Note: this may take some time.
+
+## Building (Astaroth 2.0)
+
+1. `cd astaroth_2.0/build`
+1. `cmake -DDOUBLE_PRECISION=OFF -DBUILD_DEBUG=OFF ..` (Use `cmake -D CMAKE_C_COMPILER=icc -D CMAKE_CXX_COMPILER=icpc -DDOUBLE_PRECISION=OFF -DBUILD_DEBUG=OFF ..` if compiling on TIARA)
+1. `../scripts/compile_acc.sh && make -j`
+1. `./ac_run <options>`
+
+If you encounter issues, recheck that the 3rd party libraries were successfully built during the previous step.
+
+### Available options
+
+- `-s` simulation
+- `-b` benchmark
+- `-t` automated test (NOTE! This is expected to fail with the default configuration as there's no CPU model solution for forcing/entropy)
+
+By default, the program does a real-time visualization of the simulation domain. The camera and the initial conditions can be controlled by `arrow keys`, `pgup`, `pgdown` and `spacebar`.
+
+## Generating documentation
+
+Run `doxygen doxyfile` in astaroth_2.0 directory. The generated files can be found in `doc/doxygen`. The main page of the documentation will be at `doc/doxygen/astaroth_doc_html/index.html`.
+
+## Formatting
+
+If you have clang-format, you may run `scripts/fix_style.sh`. This script will recursively fix style of all the source files down from the current working directory. The script will ask for a confirmation before making any changes. 
+
+## Directory structure
+
+## Coding style.
+
+### In a nutshell
+- Use [K&R indentation style](https://en.wikipedia.org/wiki/Indentation_style#K&R_style) and 4 space tabs. 
+- Line width is 100 characters
+- Start function names after a linebreak in source files. 
+- [Be generous with `const` type qualifiers](https://isocpp.org/wiki/faq/const-correctness). 
+- When in doubt, see [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html).
+
+### Header example:
+```cpp
+// Licence notice and doxygen description here
+#pragma once
+#include "avoid_including_headers_here.h"
+
+/** Doxygen comments */
+void global_function(void);
+```
+
+
+### Source example:
+```cpp
+#include "parent_header.h"
+
+#include <standard_library_headers.h>
+
+#include "other_headers.h"
+#include "more_headers.h"
+
+typedef struct {
+	int data;
+} SomeStruct;
+
+static inline int small_function(const SomeStruct& stuff) { return stuff.data; }
+
+// Pass constant structs always by reference (&) and use const type qualifier.
+// Modified structs are always passed as pointers (*), never as references.
+// Constant parameters should be on the left-hand side, while non-consts go to the right.
+static void
+local_function(const SomeStruct& constant_struct, SomeStruct* modified_struct)
+{
+	modified_struct->data = constant_struct.data;
+}
+
+void
+global_function(void)
+{
+	return;
+}
+```
+## Miikka's compilation notes
+
+Modules used when compiling:
+
+  * intel/2016                         
+  * hdf5/1.8.16_openmpi_1.10.2_ic16.0   
+  * cmake/3.9.5
+  * openmpi/1.10.2_ic16.0               
+  * gcc/5.3.0
+  * cuda/9.0
+
+Requires this gcc flag to compile: `-mno-bmi2` Otherwise you get assembler error! 
+
+For stencil pre-processing `flex` and particularly `libfl` is required for `acc/code_generator.c` to compile. 
+
+CUDA version 9.2 or newer is required.
+
+Comment out cudaGetDeviceCount(&num_devices) in astaroth.cu 
+
+OLD: `astaroth_2.0/acc/build.sh` only works when each line is written individually. (**solution needed**)
+
+  
+(**These are here because I don't dare to delete them yet** OLD: Intel compiler does not get correct flags with cmake on default settings. 
+This worked with 1.0: `cmake -D CMAKE_C_COMPILER=icc -D CMAKE_CXX_COMPILER=icpc -DDOUBLE_PRECISION=OFF -DBUILD_DEBUG=OFF ..` 
+but not this time. There is an issue with enabling C++11 and defining the compiler flags correctly for nvcc.
+
+OLD: I need to put `-I/software/opt/cuda/9.0/include` into the ../CMakeLists.txt so that it compiles. )
--- a/acc/.gitignore
+++ b/acc/.gitignore
@@ -0,0 +1,5 @@
+build
+testbin
+
+# Except this file
+!.gitignore
--- a/acc/README.md
+++ b/acc/README.md
@@ -0,0 +1,42 @@
+# Dependencies
+## Debian/Ubuntu
+`apt install flex bison build-essential`
+
+# Usage
+* `./build_acc.sh # Builds the ASPL compiler (acc)`
+* `./compile.sh <.sps or .sas source> # Compiles the given stage into CUDA`
+* `./test.sh # Tries to compile the sample stages`
+* `./clean.sh # Removes the directories generated by build_acc.sh and test.sh`
+
+## Example
+
+- `./compile.sh src/stencil_assembly.sas # Generates stencil_assembly.cuh`
+- `./compile.sh src/stencil_process.sps # Generates stencil_process.cuh`
+
+# What happens under the hood
+
+The compiler is made of a scanner (flex), parser (bison), implementation of the abstract syntax tree (AST) and a code generator.
+The language is defined by tokens and grammars found in acc.l and acc.y. These files are given as input to flex and bison, which generate the scanning and parsing stages for the compiler. The resulting AST is defined in ast.h. Finally, we traverse the generated AST with our code generator, generating CUDA code.
+
+## ACC compilation stages
+
+### In short: 
+* Preprocess .ac
+* Compile preprocessed .ac to .cuh
+* Compile .cuh
+
+### More detailed:
+0. A Parser is generated: bison --verbose -d acc.y
+0. A Scanner is generated: flex acc.l
+0. The compiler is built: gcc -std=gnu11 code_generator.c acc.tab.c lex.yy.c -lfl
+0. Source files (.sps and .sas) are preprocessed using the GCC preprocessor and cleaned from any residual directives which would be useful when compiling the code further with GCC. We do not need those when compiling with ACC, and they are not recognized by our grammar.
+0. Either the stencil processing stage (.sps) or the stencil assembly stage (.sas) are generated by passing the preprocessed file to acc. This emits the final CUDA code.
+0. Compilation is continued with the NVIDIA CUDA compiler
+
+### Even more detailed:
+The NVIDIA CUDA compiler compiles .cuh to .fatbin, which is embedded into a C++ binary containing host code of the program. A fatbin contains .cubin files, which contain the configuration of the GPU and the kernels in a streaming assembly code (.sass). We could also compile for a virtual architecture (.ptx) instead of the actual hardware-specific machine code (.cubin) by passing -code=compute_XX flag to nvcc, which would compile cuda sources at runtime (just-in-time compilation, JIT) when creating the CUDA context. However, we always know which architecture we want to run the code on and JIT compilation would just increase the time it takes to launch the program.
+
+nvcc -DAC_DOUBLE_PRECISION=1 -ptx --relocatable-device-code true -O3 -std=c++11 --maxrregcount=255 -ftz=true -gencode arch=compute_60,code=sm_60 device.cu -I ../../include -I ../../
+nvcc -DAC_DOUBLE_PRECISION=1 -cubin --relocatable-device-code true -O3 -std=c++11 --maxrregcount=255 -ftz=true -gencode arch=compute_60,code=sm_60 device.cu -I ../../include -I ../../
+cuobjdump --dump-sass device.cubin > device.sass
+
--- a/acc/build_acc.sh
+++ b/acc/build_acc.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+cd `dirname $0` # Only operate in the same directory with this script
+
+COMPILER_NAME="acc"
+
+SRC_DIR=${PWD}/src
+BUILD_DIR=${PWD}/build
+
+echo "Created" ${BUILD_DIR}
+
+mkdir -p ${BUILD_DIR}
+cd ${BUILD_DIR}
+
+echo ${BASE_DIR}
+echo ${SRC_DIR}
+echo ${BUILD_DIR}
+
+# Generate Bison headers
+bison --verbose -d ${SRC_DIR}/${COMPILER_NAME}.y
+
+## Generate Flex sources and headers
+flex ${SRC_DIR}/${COMPILER_NAME}.l
+
+## Compile the ASPL compiler
+gcc -std=gnu11 ${SRC_DIR}/code_generator.c ${COMPILER_NAME}.tab.c lex.yy.c -lfl -I ${BUILD_DIR} -I ${SRC_DIR} -o ${COMPILER_NAME}
--- a/acc/clean.sh
+++ b/acc/clean.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+# Removes the directories generated by build_acc.sh and test.sh.
+# Abort if the directory change fails so that rm -rf can never run elsewhere.
+cd "$(dirname "$0")" || exit 1 # Only operate in the same directory with this script
+
+rm -rf build testbin
+
--- a/acc/compile.sh
+++ b/acc/compile.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+# Usage ./compile <source file>
+
+ACC_DIR=`dirname $0`
+
+FULL_NAME=$(basename -- $1)
+FILENAME="${FULL_NAME%.*}"
+EXTENSION="${FULL_NAME##*.}"
+
+if [ "${EXTENSION}" = "sas" ]; then
+    echo "Generating stencil assembly stage ${FILENAME}.sas -> stencil_assembly.cuh"
+    COMPILE_FLAGS="-sas" # Generate stencil assembly stage
+    CUH_FILENAME="stencil_assembly.cuh"
+elif [ "${EXTENSION}" = "sps" ]; then
+    echo "Generating stencil processing stage:  ${FILENAME}.sps -> stencil_process.cuh"
+    COMPILE_FLAGS="-sps" # Generate stencil processing stage
+    CUH_FILENAME="stencil_process.cuh"
+else
+    echo "Error: unknown extension" ${EXTENSION} "of file" ${FULL_NAME}
+    echo "Extension should be either .sas or .sps"
+    exit
+fi
+
+${ACC_DIR}/preprocess.sh $1 | ${ACC_DIR}/build/acc ${COMPILE_FLAGS} > ${CUH_FILENAME}
--- a/acc/mhd_solver/stencil_assembly.sas
+++ b/acc/mhd_solver/stencil_assembly.sas
@@ -0,0 +1,26 @@
+
+// Reads `vertex` at the current vertexIdx.
+Preprocessed Scalar
+value(in Scalar vertex)
+{
+    return vertex[vertexIdx];
+}
+
+// Builds a Vector from derx, dery and derz evaluated for `vertex` at vertexIdx
+// (presumably the first derivatives along x, y and z; they are defined elsewhere).
+Preprocessed Vector
+gradient(in Scalar vertex)
+{
+    return (Vector){derx(vertexIdx, vertex),
+                    dery(vertexIdx, vertex),
+                    derz(vertexIdx, vertex)};
+}
+
+// Assembles a 3x3 Matrix row by row from derxx ... derzz evaluated at vertexIdx.
+// Only the diagonal and the upper triangle are evaluated; the entries below the
+// diagonal are copied from row[0].y, row[0].z and row[1].z, i.e. the matrix is
+// treated as symmetric.
+Preprocessed Matrix
+hessian(in Scalar vertex)
+{
+    Matrix hessian;
+
+    hessian.row[0] = (Vector){derxx(vertexIdx, vertex), derxy(vertexIdx, vertex), derxz(vertexIdx, vertex)};
+    hessian.row[1] = (Vector){hessian.row[0].y,       deryy(vertexIdx, vertex), deryz(vertexIdx, vertex)};
+    hessian.row[2] = (Vector){hessian.row[0].z,       hessian.row[1].z,       derzz(vertexIdx, vertex)};
+
+    return hessian;
+}
--- a/acc/mhd_solver/stencil_process.sps
+++ b/acc/mhd_solver/stencil_process.sps
@@ -0,0 +1,265 @@
+#define LINDUCTION (1)
+#define LENTROPY (1)
+#define LTEMPERATURE (0)
+#define LGRAVITY (0)
+
+
+// Declare uniforms (i.e. device constants)
+uniform Scalar cs2_sound;
+uniform Scalar nu_visc;
+uniform Scalar cp_sound;
+uniform Scalar cv_sound;
+uniform Scalar mu0;
+uniform Scalar eta;
+uniform Scalar gamma;
+uniform Scalar zeta;
+
+uniform int nx_min;
+uniform int ny_min;
+uniform int nz_min;
+uniform int nx;
+uniform int ny;
+uniform int nz;
+
+// Component-wise value(): applies the Scalar value() to uu.x, uu.y and uu.z.
+Vector
+value(in Vector uu)
+{
+    return (Vector){value(uu.x), value(uu.y), value(uu.z)};
+}
+
+// Returns a Matrix built from gradient(uu.x), gradient(uu.y) and gradient(uu.z),
+// in that order.
+Matrix
+gradients(in Vector uu)
+{
+    return (Matrix){gradient(uu.x), gradient(uu.y), gradient(uu.z)};
+}
+
+// Rate of change of lnrho passed to rk3() in solve():
+// -dot(value(uu), gradient(lnrho)) - divergence(uu)
+Scalar
+continuity(in Vector uu, in Scalar lnrho) {
+    return -dot(value(uu), gradient(lnrho)) - divergence(uu);
+}
+
+#if LENTROPY
+Vector
+momentum(in Vector uu, in Scalar lnrho, in Scalar ss, in Vector aa) {
+    const Matrix S = stress_tensor(uu);
+    const Scalar cs2 = cs2_sound * exp(gamma * value(ss) / cp_sound + (gamma - 1) * (value(lnrho) - LNRHO0));
+    const Vector  j = (Scalar(1.) / mu0) * (gradient_of_divergence(aa) - laplace_vec(aa)); // Current density
+    const Vector B = curl(aa);
+    const Scalar inv_rho = Scalar(1.) / exp(value(lnrho));
+
+    // Regex replace CPU constants with get\(AC_([a-zA-Z_0-9]*)\)
+    // \1
+    const Vector mom = - mul(gradients(uu), value(uu)) 
+                                                       - cs2 * ((Scalar(1.) / cp_sound) * gradient(ss) + gradient(lnrho))
+                                                       + inv_rho * cross(j, B)
+                                                       + nu_visc * (
+                                                            laplace_vec(uu) 
+                                                        + Scalar(1. / 3.) * gradient_of_divergence(uu) 
+                                                        + Scalar(2.) * mul(S, gradient(lnrho))
+                                                        )
+                                                        + zeta * gradient_of_divergence(uu);
+    return mom;
+}
+#elif LTEMPERATURE
+Vector
+momentum(in Vector uu, in Scalar lnrho, in Scalar tt) {
+  Vector mom;
+
+  const Matrix S = stress_tensor(uu);
+
+    const Vector pressure_term = (cp_sound - cv_sound) * (gradient(tt) + value(tt) * gradient(lnrho));
+
+  mom = -mul(gradients(uu), value(uu)) -
+    pressure_term +
+    nu_visc *
+    (laplace_vec(uu) + Scalar(1. / 3.) * gradient_of_divergence(uu) +
+      Scalar(2.) * mul(S, gradient(lnrho))) + zeta * gradient_of_divergence(uu);
+
+  #if LGRAVITY
+  mom = mom - (Vector){0, 0, -10.0};
+  #endif
+
+  return mom;
+}
+#else
+Vector
+momentum(in Vector uu, in Scalar lnrho) {
+  Vector mom;
+
+  const Matrix S = stress_tensor(uu);
+
+    // Isothermal: we have constant speed of sound
+
+  mom = -mul(gradients(uu), value(uu)) -
+    cs2_sound * gradient(lnrho) +
+    nu_visc *
+    (laplace_vec(uu) + Scalar(1. / 3.) * gradient_of_divergence(uu) +
+      Scalar(2.) * mul(S, gradient(lnrho))) + zeta * gradient_of_divergence(uu);
+
+  #if LGRAVITY
+  mom = mom - (Vector){0, 0, -10.0};
+  #endif
+
+  return mom;
+}
+#endif
+
+
+// Rate of change of aa passed to rk3() in solve():
+// cross(value(uu), curl(aa)) - eta * (gradient_of_divergence(aa) - laplace_vec(aa))
+Vector
+induction(in Vector uu, in Vector aa) {
+  // Note: We do (-nabla^2 A + nabla(nabla dot A)) instead of (nabla x (nabla
+  // x A)) in order to avoid taking the first derivative twice (did the math,
+  // yes this actually works. See pg.28 in arXiv:astro-ph/0109497)
+  // u cross B - ETA * mu0 * (mu0^-1 * [- laplace A + grad div A ])
+  const Vector B = curl(aa);
+  const Vector grad_div = gradient_of_divergence(aa);
+  const Vector lap = laplace_vec(aa);
+
+  // Note, mu0 is cancelled out
+  const Vector ind = cross(value(uu), B) - eta * (grad_div - lap);
+
+  return ind;
+}
+
+
+#if LENTROPY
+// Logarithmic temperature computed from ss and lnrho at the current vertex:
+// LNT0 + gamma * ss / cp_sound + (gamma - 1) * (lnrho - LNRHO0).
+// LNT0 and LNRHO0 are not defined in this file.
+Scalar
+lnT( in Scalar ss, in Scalar lnrho) {
+  const Scalar lnT = LNT0 + gamma * value(ss) / cp_sound +
+    (gamma - Scalar(1.)) * (value(lnrho) - LNRHO0);
+  return lnT;
+}
+
+// Nabla dot (K nabla T) / (rho T)
+// Heat conduction term that entropy() adds to its result. Evaluates
+// cp_sound * chi * (first_term + dot(second_term, third_term)).
+Scalar
+heat_conduction( in Scalar ss, in Scalar lnrho) {
+  const Scalar inv_cp_sound = AcReal(1.) / cp_sound;
+
+  // chi (below) is AC_THERMAL_CONDUCTIVITY / (rho * cp_sound), so grad ln chi
+  // reduces to -grad lnrho -- assuming AC_THERMAL_CONDUCTIVITY and cp_sound are
+  // spatially constant (TODO confirm)
+  const Vector grad_ln_chi = - gradient(lnrho);
+
+  const Scalar first_term = gamma * inv_cp_sound * laplace(ss) +
+    (gamma - AcReal(1.)) * laplace(lnrho);
+  const Vector second_term = gamma * inv_cp_sound * gradient(ss) +
+    (gamma - AcReal(1.)) * gradient(lnrho);
+  const Vector third_term = gamma * (inv_cp_sound * gradient(ss) +
+    gradient(lnrho)) + grad_ln_chi;
+
+  // exp(value(lnrho)) converts the logarithmic density back to rho
+  const Scalar chi = AC_THERMAL_CONDUCTIVITY / (exp(value(lnrho)) * cp_sound);
+  return cp_sound * chi * (first_term + dot(second_term, third_term));
+}
+
+// Placeholder: ignores i, j and k and always returns 1.
+// NOTE(review): nothing in this file calls heating(); entropy() uses H_CONST instead.
+Scalar
+heating(const int i, const int j, const int k) {
+  return 1;
+}
+
+Scalar
+entropy(in Scalar ss, in Vector uu, in Scalar lnrho, in Vector aa) {
+    const Matrix S = stress_tensor(uu);
+    const Scalar inv_pT = Scalar(1.) / (exp(value(lnrho)) * exp(lnT(ss, lnrho)));
+    const Vector  j = (Scalar(1.) / mu0) * (gradient_of_divergence(aa) - laplace_vec(aa)); // Current density
+    const Scalar RHS = H_CONST - C_CONST
+                                                + eta * (mu0) * dot(j, j) 
+                                                + Scalar(2.) * exp(value(lnrho)) * nu_visc * contract(S)
+                                                + zeta * exp(value(lnrho)) * divergence(uu) * divergence(uu);
+
+    return - dot(value(uu), gradient(ss))
+                  + inv_pT * RHS
+                  + heat_conduction(ss, lnrho);
+}
+#endif
+
+#if LTEMPERATURE
+Scalar
+heat_transfer(in Vector uu, in Scalar lnrho, in Scalar tt)
+{
+    const Matrix S = stress_tensor(uu);
+    const Scalar heat_diffusivity_k = 0.0008; //8e-4;
+    return -dot(value(uu), gradient(tt)) + heat_diffusivity_k * laplace(tt) + heat_diffusivity_k * dot(gradient(lnrho), gradient(tt)) + nu_visc * contract(S) * (Scalar(1.) / cv_sound) - (gamma - 1) * value(tt) * divergence(uu);
+}
+#endif
+
+// Declare input and output arrays using locations specified in the
+// array enum in astaroth.h
+in Scalar lnrho = VTXBUF_LNRHO;
+out Scalar out_lnrho = VTXBUF_LNRHO;
+
+in Vector uu = (int3) {VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ};
+out Vector out_uu = (int3) {VTXBUF_UUX,VTXBUF_UUY,VTXBUF_UUZ};
+
+
+#if LINDUCTION
+in Vector aa = (int3) {VTXBUF_AX,VTXBUF_AY,VTXBUF_AZ};
+out Vector out_aa = (int3) {VTXBUF_AX,VTXBUF_AY,VTXBUF_AZ};
+#endif
+
+#if LENTROPY
+in Scalar ss = VTXBUF_ENTROPY;
+out Scalar out_ss = VTXBUF_ENTROPY;
+#endif
+
+#if LTEMPERATURE
+in Scalar tt = VTXBUF_TEMPERATURE;
+out Scalar out_tt = VTXBUF_TEMPERATURE;
+#endif
+
+Kernel void
+solve(Scalar dt) {
+    out_lnrho = rk3(out_lnrho, lnrho, continuity(uu, lnrho), dt);
+
+    #if LINDUCTION
+    out_aa = rk3(out_aa, aa, induction(uu, aa), dt);
+    #endif
+
+    #if LENTROPY
+        out_uu = rk3(out_uu, uu, momentum(uu, lnrho, ss, aa), dt);
+        out_ss  = rk3(out_ss, ss, entropy(ss, uu, lnrho, aa), dt);
+    #elif LTEMPERATURE
+        out_uu =rk3(out_uu, uu, momentum(uu, lnrho, tt), dt);
+        out_tt = rk3(out_tt, tt, heat_transfer(uu, lnrho, tt), dt);
+    #else
+        out_uu = rk3(out_uu, uu, momentum(uu, lnrho), dt);
+    #endif    
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
--- a/acc/preprocess.sh
+++ b/acc/preprocess.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+# Preprocesses the given file using GCC. This script is usually automatically called in
+# ./compile.sh, but may be called also individually for debugging purposes.
+# "${@}" is quoted so that file names containing spaces reach gcc as a single argument.
+# The sed call removes everything from a '#' to the end of the line.
+gcc -E -x c "${@}" | sed "s/#.*//g"
--- a/acc/pseudodisk/stencil_process_gravx.sps
+++ b/acc/pseudodisk/stencil_process_gravx.sps
@@ -0,0 +1,228 @@
+#define LINDUCTION (1)
+#define LENTROPY (1)
+
+
+// Declare uniforms (i.e. device constants)
+uniform Scalar cs2_sound;
+uniform Scalar nu_visc;
+uniform Scalar cp_sound;
+uniform Scalar mu0;
+uniform Scalar eta;
+uniform Scalar gamma;
+uniform Scalar chi;
+uniform Scalar zeta;
+
+uniform int nx_min;
+uniform int ny_min;
+uniform int nz_min;
+uniform int nx;
+uniform int ny;
+uniform int nz;
+
+uniform Scalar xorig;
+uniform Scalar yorig;
+uniform Scalar zorig;
+
+//Star position
+uniform Scalar star_pos_x;
+uniform Scalar star_pos_z;
+uniform Scalar GM_star;
+
+//Needed for gravity
+uniform Scalar dsx;
+uniform Scalar dsy;
+uniform Scalar dsz;
+uniform Scalar inv_dsx;
+uniform Scalar inv_dsy;
+uniform Scalar inv_dsz;
+
+Scalar 
+distance_x(Vector a, Vector b) 
+{ 
+    return sqrt(dot(a-b, a-b)); 
+}
+
+Vector
+value(in Vector uu)
+{
+    return (Vector){value(uu.x), value(uu.y), value(uu.z)};
+}
+
+Matrix
+gradients(in Vector uu)
+{
+    return (Matrix){gradient(uu.x), gradient(uu.y), gradient(uu.z)};
+}
+
+Scalar
+continuity(in Vector uu, in Scalar lnrho) {
+    return -dot(value(uu), gradient(lnrho)) - divergence(uu);
+}
+
+// Gravitation in the negative x-direction.
+// Returns (-GM_star / RR^2, 0, 0), where RR is the x-distance between the vertex and
+// star_pos_x. The y and z components of star_pos equal those of the vertex and are
+// never read.
+// NOTE(review): the x-component has the same sign on both sides of star_pos_x and
+// RR == 0 divides by zero -- presumably the star lies outside the domain; confirm.
+Vector 
+grav_force_line(const int3 vertexIdx)
+{
+    Vector vertex_pos = (Vector){dsx * vertexIdx.x - xorig, dsy * vertexIdx.y - yorig, dsz * vertexIdx.z - zorig};
+    Vector star_pos   = (Vector){star_pos_x,                dsy * vertexIdx.y - yorig, dsz * vertexIdx.z - zorig};
+
+    const Scalar RR = vertex_pos.x - star_pos.x;
+
+    const Scalar G_force_abs = GM_star / (RR*RR); // Force per unit mass;
+
+    Vector G_force = (Vector){ - G_force_abs,
+                                 AcReal(0.0),
+                                 AcReal(0.0)};
+
+    return G_force;
+}
+
+#if LENTROPY
+Vector
+momentum(in Vector uu, in Scalar lnrho, in Scalar ss, in Vector aa, const int3 vertexIdx) {
+  Vector mom;
+
+  const Matrix S = stress_tensor(uu);
+
+  mom = -mul(gradients(uu), value(uu)) -
+    cs2_sound * gradient(lnrho) +
+    nu_visc *
+    (laplace_vec(uu) + Scalar(1. / 3.) * gradient_of_divergence(uu) +
+      Scalar(2.) * mul(S, gradient(lnrho))) + zeta * gradient_of_divergence(uu);
+
+  mom = mom - cs2_sound * (Scalar(1.) / cp_sound) * gradient(ss);
+
+  const Vector grad_div = gradient_of_divergence(aa);
+  const Vector lap = laplace_vec(aa);
+  const Vector j = (Scalar(1.) / mu0) * (grad_div - lap);
+  const Vector B = curl(aa);
+  mom = mom + (Scalar(1.) / exp(value(lnrho))) * cross(j, B);
+
+  mom = mom + grav_force_line(vertexIdx);
+
+  return mom;
+}
+#else
+Vector
+momentum(in Vector uu, in Scalar lnrho, const int3 vertexIdx) {
+  Vector mom;
+
+  const Matrix S = stress_tensor(uu);
+
+  mom = -mul(gradients(uu), value(uu)) -
+    cs2_sound * gradient(lnrho) +
+    nu_visc *
+    (laplace_vec(uu) + Scalar(1. / 3.) * gradient_of_divergence(uu) +
+      Scalar(2.) * mul(S, gradient(lnrho))) + zeta * gradient_of_divergence(uu);
+
+  mom = mom + grav_force_line(vertexIdx);
+
+  return mom;
+}
+#endif
+
+
+Vector
+induction(in Vector uu, in Vector aa) {
+  // Note: We do (-nabla^2 A + nabla(nabla dot A)) instead of (nabla x (nabla
+  // x A)) in order to avoid taking the first derivative twice (did the math,
+  // yes this actually works. See pg.28 in arXiv:astro-ph/0109497)
+  // u cross B - ETA * mu0 * (mu0^-1 * [- laplace A + grad div A ])
+  const Vector B = curl(aa);
+  const Vector grad_div = gradient_of_divergence(aa);
+  const Vector lap = laplace_vec(aa);
+
+  // Note, mu0 is cancelled out
+  const Vector ind = cross(value(uu), B) - eta * (grad_div - lap);
+
+  return ind;
+}
+
+
+#if LENTROPY
+// Logarithmic temperature computed from ss and lnrho at the current vertex:
+// LNT0 + gamma * ss / cp_sound + (gamma - 1) * (lnrho - LNRHO0)
+Scalar
+lnT( in Scalar ss, in Scalar lnrho) {
+  // The entropy term carries the factor gamma (ss / cv with cv = cp / gamma), as in
+  // the lnT() of acc/mhd_solver/stencil_process.sps
+  const Scalar lnT = LNT0 + gamma * value(ss) / cp_sound +
+    (gamma - AcReal(1.)) * (value(lnrho) - LNRHO0);
+  return lnT;
+}
+
+// Nabla dot (K nabla T) / (rho T)
+Scalar
+heat_conduction( in Scalar ss, in Scalar lnrho) {
+  const Scalar inv_cp_sound = AcReal(1.) / cp_sound;
+
+  const Vector grad_ln_chi = (Vector) {
+    0,
+    0,
+    0
+  }; // TODO not used
+
+  const Scalar first_term = gamma * inv_cp_sound * laplace(ss) +
+    (gamma - AcReal(1.)) * laplace(lnrho);
+  const Vector second_term = gamma * inv_cp_sound * gradient(ss) +
+    (gamma - AcReal(1.)) * gradient(lnrho);
+  const Vector third_term = gamma * (inv_cp_sound * gradient(ss) +
+    gradient(lnrho)) + grad_ln_chi;
+
+  return cp_sound * chi * (first_term + dot(second_term, third_term));
+}
+
+Scalar
+heating(const int i, const int j, const int k) {
+  return 1;
+}
+
+// Rate of change of ss passed to RK3() in solve(): advection of ss, the heating and
+// cooling terms scaled by 1 / (rho * T), and heat conduction.
+Scalar
+entropy(in Scalar ss, in Vector uu, in Scalar lnrho, in Vector aa) {
+    const Matrix S = stress_tensor(uu);
+
+    // Current density: j = nabla x nabla x A / mu0 = (nabla(nabla dot A) - nabla^2(A)) / mu0,
+    // the same expression momentum() uses
+    const Vector j = (Scalar(1.) / mu0) * (gradient_of_divergence(aa) - laplace_vec(aa));
+
+    // 1 / (rho * T): density and temperature are multiplied, not added
+    const Scalar inv_pT = AcReal(1.) / (exp(value(lnrho)) * exp(lnT(ss, lnrho)));
+
+    return -dot(value(uu), gradient(ss)) +
+      inv_pT * (H_CONST - C_CONST +
+        eta * mu0 * dot(j, j) +
+        AcReal(2.) * exp(value(lnrho)) * nu_visc * contract(S) +
+        zeta * exp(value(lnrho)) * divergence(uu) * divergence(uu)
+      ) + heat_conduction(ss, lnrho);
+}
+#endif
+
+// Declare input and output arrays using locations specified in the
+// array enum in astaroth.h
+in Scalar lnrho = VTXBUF_LNRHO;
+out Scalar out_lnrho = VTXBUF_LNRHO;
+
+in Vector uu = (int3) {VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ};
+out Vector out_uu = (int3) {VTXBUF_UUX,VTXBUF_UUY,VTXBUF_UUZ};
+
+
+#if LINDUCTION
+in Vector aa = (int3) {VTXBUF_AX,VTXBUF_AY,VTXBUF_AZ};
+out Vector out_aa = (int3) {VTXBUF_AX,VTXBUF_AY,VTXBUF_AZ};
+#endif
+
+#if LENTROPY
+in Scalar ss = VTXBUF_ENTROPY;
+out Scalar out_ss = VTXBUF_ENTROPY;
+#endif
+
+Kernel void
+solve(Scalar dt) {
+    WRITE(out_lnrho, RK3(out_lnrho, lnrho, continuity(uu, lnrho), dt));
+
+    #if LINDUCTION
+        WRITE(out_aa,    RK3(out_aa, aa, induction(uu, aa), dt));
+    #endif
+
+
+    #if LENTROPY
+        WRITE(out_uu, RK3(out_uu, uu, momentum(uu, lnrho, ss, aa, vertexIdx), dt));
+        WRITE(out_ss,    RK3(out_ss, ss, entropy(ss, uu, lnrho, aa), dt));
+    #else
+        WRITE(out_uu, RK3(out_uu, uu, momentum(uu, lnrho, vertexIdx), dt));
+    #endif
+}
--- a/acc/pseudodisk/stencil_process_isotherm_gravx.sps
+++ b/acc/pseudodisk/stencil_process_isotherm_gravx.sps
@@ -0,0 +1,169 @@
+
+// Declare uniforms (i.e. device constants)
+uniform Scalar cs2_sound;
+uniform Scalar nu_visc;
+uniform Scalar cp_sound;
+uniform Scalar mu0;
+uniform Scalar eta;
+uniform Scalar gamma;
+uniform Scalar chi;
+uniform Scalar zeta;
+
+uniform Scalar xorig;
+uniform Scalar yorig;
+uniform Scalar zorig;
+
+//Star position
+uniform Scalar star_pos_x;
+uniform Scalar star_pos_z;
+uniform Scalar GM_star;
+
+uniform int nx_min;
+uniform int ny_min;
+uniform int nz_min;
+uniform int nx;
+uniform int ny;
+uniform int nz;
+
+//Needed for gravity
+uniform Scalar dsx;
+uniform Scalar dsy;
+uniform Scalar dsz;
+uniform Scalar inv_dsx;
+uniform Scalar inv_dsy;
+uniform Scalar inv_dsz;
+
+// Euclidean distance between a and b: sqrt((a-b) . (a-b)).
+// NOTE(review): no function in this file calls distance_x - confirm it is
+// still needed.
+Scalar 
+distance_x(Vector a, Vector b) 
+{ 
+    return sqrt(dot(a-b, a-b)); 
+}
+
+// Vector-field overload of value(): applies value() to each component of uu
+// and packs the three results into a Vector, in x, y, z order.
+Vector
+value(in Vector uu)
+{
+    return (Vector){value(uu.x), value(uu.y), value(uu.z)};
+}
+
+// Builds a Matrix from the gradients of the three components of uu:
+// gradient(uu.x), gradient(uu.y), gradient(uu.z), in that order.
+Matrix
+gradients(in Vector uu)
+{
+    return (Matrix){gradient(uu.x), gradient(uu.y), gradient(uu.z)};
+}
+
+// Right-hand side of the continuity equation in log-density form:
+//   -u . grad(lnrho) - div(u)
+Scalar
+continuity(in Vector uu, in Scalar lnrho) {
+    return -dot(value(uu), gradient(lnrho)) - divergence(uu);
+}
+
+
+// Gravity that depends only on the x-offset between the vertex and the star.
+// The returned force has an x-component only; y and z are zero.
+// NOTE(review): the x-component is -GM_star/RR^2 for either sign of RR, so
+// the force points towards -x on both sides of star_pos_x - confirm that the
+// star is meant to lie at lower x than every vertex.
+// NOTE(review): RR == 0 (vertex at star_pos_x) is not guarded and divides
+// by zero.
+Vector 
+grav_force_line(const int3 vertexIdx)
+{
+    // Vertex position computed from the grid spacings and the origin offsets
+    Vector vertex_pos = (Vector){dsx * vertexIdx.x - xorig, dsy * vertexIdx.y - yorig, dsz * vertexIdx.z - zorig};
+    // Only star_pos.x is read below; the y and z entries copy the vertex position
+    Vector star_pos   = (Vector){star_pos_x,                dsy * vertexIdx.y - yorig, dsz * vertexIdx.z - zorig};
+
+    // Signed x-offset from the star to the vertex
+    const Scalar RR = vertex_pos.x - star_pos.x;
+
+    const Scalar G_force_abs = GM_star / (RR*RR); // Force per unit mass;
+
+    Vector G_force = (Vector){ - G_force_abs,
+                                 AcReal(0.0),
+                                 AcReal(0.0)};
+
+    return G_force;
+}
+
+
+// Right-hand side of the momentum equation (no entropy or magnetic terms):
+//   -(grad u) u - cs2_sound * grad(lnrho)
+//   + nu_visc * (laplace(u) + 1/3 grad(div u) + 2 S grad(lnrho))
+//   + zeta * grad(div u)
+//   + grav_force_line(vertexIdx)
+// S is the result of stress_tensor(uu).
+Vector
+momentum(in Vector uu, in Scalar lnrho, const int3 vertexIdx) {
+  Vector mom;
+
+  const Matrix S = stress_tensor(uu);
+
+  mom = -mul(gradients(uu), value(uu)) -
+    cs2_sound * gradient(lnrho) +
+    nu_visc *
+    (laplace_vec(uu) + Scalar(1. / 3.) * gradient_of_divergence(uu) +
+      Scalar(2.) * mul(S, gradient(lnrho))) + zeta * gradient_of_divergence(uu) 
+      + grav_force_line(vertexIdx);
+  
+
+  return mom;
+}
+
+// Right-hand side of the induction equation for the vector potential A:
+//   u x B - eta * (grad(div A) - laplace(A)),  with B = curl(A)
+// grad(div A) - laplace(A) stands in for curl(curl(A)) so that no first
+// derivative is applied twice (see pg. 28 in arXiv:astro-ph/0109497).
+// The factors mu0 and 1/mu0 of the resistive term cancel.
+Vector
+induction(in Vector uu, in Vector aa) {
+  const Vector curl_of_curl = gradient_of_divergence(aa) - laplace_vec(aa);
+  const Vector u_cross_B = cross(value(uu), curl(aa));
+
+  return u_cross_B - eta * curl_of_curl;
+}
+
+// Declare input and output arrays using locations specified in the
+// array enum in astaroth.h
+in Scalar lnrho = VTXBUF_LNRHO;
+out Scalar out_lnrho = VTXBUF_LNRHO;
+
+in Vector uu = (int3) {VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ};
+out Vector out_uu = (int3) {VTXBUF_UUX,VTXBUF_UUY,VTXBUF_UUZ};
+
+#if LINDUCTION
+in Vector aa = (int3) {VTXBUF_AX,VTXBUF_AY,VTXBUF_AZ};
+out Vector out_aa = (int3) {VTXBUF_AX,VTXBUF_AY,VTXBUF_AZ};
+#endif
+
+// Kernel entry point. For every declared field it evaluates the matching
+// right-hand side, passes it through RK3() together with the input and
+// output handles and dt, and stores the result with WRITE().
+// NOTE(review): LINDUCTION is not defined in this file - confirm it is
+// provided by the build, otherwise the induction update is compiled out.
+Kernel void
+solve(Scalar dt) {
+  // Log-density
+  WRITE(out_lnrho, RK3(out_lnrho, lnrho, continuity(uu, lnrho), dt));
+
+  // Vector potential (only with induction enabled)
+  #if LINDUCTION
+  WRITE(out_aa,    RK3(out_aa, aa, induction(uu, aa), dt));
+  #endif
+
+  // Velocity
+  WRITE(out_uu, RK3(out_uu, uu, momentum(uu, lnrho, vertexIdx), dt));
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
--- a/acc/pseudodisk/stencil_process_isotherm_linegrav.sps
+++ b/acc/pseudodisk/stencil_process_isotherm_linegrav.sps
@@ -0,0 +1,174 @@
+
+// Declare uniforms (i.e. device constants)
+uniform Scalar cs2_sound;
+uniform Scalar nu_visc;
+uniform Scalar cp_sound;
+uniform Scalar mu0;
+uniform Scalar eta;
+uniform Scalar gamma;
+uniform Scalar chi;
+uniform Scalar zeta;
+
+uniform Scalar xorig;
+uniform Scalar yorig;
+uniform Scalar zorig;
+
+//Star position
+uniform Scalar star_pos_x;
+uniform Scalar star_pos_z;
+uniform Scalar GM_star;
+
+uniform int nx_min;
+uniform int ny_min;
+uniform int nz_min;
+uniform int nx;
+uniform int ny;
+uniform int nz;
+
+//Needed for gravity
+uniform Scalar dsx;
+uniform Scalar dsy;
+uniform Scalar dsz;
+uniform Scalar inv_dsx;
+uniform Scalar inv_dsy;
+uniform Scalar inv_dsz;
+
+// Euclidean distance between a and b: sqrt((a-b) . (a-b)).
+Scalar 
+distance(Vector a, Vector b) 
+{ 
+    return sqrt(dot(a-b, a-b)); 
+}
+
+// Vector-field overload of value(): applies value() to each component of uu
+// and packs the three results into a Vector, in x, y, z order.
+Vector
+value(in Vector uu)
+{
+    return (Vector){value(uu.x), value(uu.y), value(uu.z)};
+}
+
+// Builds a Matrix from the gradients of the three components of uu:
+// gradient(uu.x), gradient(uu.y), gradient(uu.z), in that order.
+Matrix
+gradients(in Vector uu)
+{
+    return (Matrix){gradient(uu.x), gradient(uu.y), gradient(uu.z)};
+}
+
+// Right-hand side of the continuity equation in log-density form:
+//   -u . grad(lnrho) - div(u)
+Scalar
+continuity(in Vector uu, in Scalar lnrho) {
+    return -dot(value(uu), gradient(lnrho)) - divergence(uu);
+}
+
+
+// "Line-like" gravity with no y-component.
+// The star position is given the same y coordinate as the vertex, so the
+// distance RR only measures the offset in the x-z plane. The force per unit
+// mass has magnitude GM_star / RR^2 and points from the vertex towards
+// (star_pos_x, star_pos_z).
+// NOTE(review): RR == 0 (vertex on the line) is not guarded and divides by
+// zero.
+Vector 
+grav_force_line(const int3 vertexIdx)
+{
+    // Vertex position computed from the grid spacings and the origin offsets
+    Vector vertex_pos = (Vector){dsx * vertexIdx.x - xorig, dsy * vertexIdx.y - yorig, dsz * vertexIdx.z - zorig};
+    // Earlier variant that also shifted the star by the origin offsets:
+    //Vector star_pos   = (Vector){star_pos_x      - xorig, dsy * vertexIdx.y - yorig, star_pos_z      - zorig};
+    Vector star_pos   = (Vector){star_pos_x,                dsy * vertexIdx.y - yorig, star_pos_z};
+    //LIKE THIS: Vector star_pos = (Vector){star_pos_x, 0.0, star_pos_z};
+
+    const Scalar RR = distance(star_pos, vertex_pos);
+
+    const Scalar G_force_abs   = GM_star / (RR*RR); // Force per unit mass;
+    //const Scalar G_force_abs = 1.0; // Simple temp. test;
+
+    // Magnitude times the unit vector from the vertex towards the star
+    Vector G_force = (Vector){ - G_force_abs*((vertex_pos.x-star_pos.x)/RR),
+                                 AcReal(0.0),
+                               - G_force_abs*((vertex_pos.z-star_pos.z)/RR)};
+
+    //printf("G_force %e %e %e", G_force_abs.x, G_force_abs.y, G_force_abs.z)
+
+    return G_force;
+}
+
+
+// Right-hand side of the momentum equation (no entropy or magnetic terms):
+//   -(grad u) u - cs2_sound * grad(lnrho)
+//   + nu_visc * (laplace(u) + 1/3 grad(div u) + 2 S grad(lnrho))
+//   + zeta * grad(div u)
+//   + grav_force_line(vertexIdx)
+// S is the result of stress_tensor(uu).
+Vector
+momentum(in Vector uu, in Scalar lnrho, const int3 vertexIdx) {
+  Vector mom;
+
+  const Matrix S = stress_tensor(uu);
+
+  mom = -mul(gradients(uu), value(uu)) -
+    cs2_sound * gradient(lnrho) +
+    nu_visc *
+    (laplace_vec(uu) + Scalar(1. / 3.) * gradient_of_divergence(uu) +
+      Scalar(2.) * mul(S, gradient(lnrho))) + zeta * gradient_of_divergence(uu) 
+      + grav_force_line(vertexIdx);
+  
+
+  return mom;
+}
+
+// Right-hand side of the induction equation for the vector potential A:
+//   u x B - eta * (grad(div A) - laplace(A)),  with B = curl(A)
+// grad(div A) - laplace(A) stands in for curl(curl(A)) so that no first
+// derivative is applied twice (see pg. 28 in arXiv:astro-ph/0109497).
+// The factors mu0 and 1/mu0 of the resistive term cancel.
+Vector
+induction(in Vector uu, in Vector aa) {
+  const Vector curl_of_curl = gradient_of_divergence(aa) - laplace_vec(aa);
+  const Vector u_cross_B = cross(value(uu), curl(aa));
+
+  return u_cross_B - eta * curl_of_curl;
+}
+
+// Declare input and output arrays using locations specified in the
+// array enum in astaroth.h
+in Scalar lnrho = VTXBUF_LNRHO;
+out Scalar out_lnrho = VTXBUF_LNRHO;
+
+in Vector uu = (int3) {VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ};
+out Vector out_uu = (int3) {VTXBUF_UUX,VTXBUF_UUY,VTXBUF_UUZ};
+
+#if LINDUCTION
+in Vector aa = (int3) {VTXBUF_AX,VTXBUF_AY,VTXBUF_AZ};
+out Vector out_aa = (int3) {VTXBUF_AX,VTXBUF_AY,VTXBUF_AZ};
+#endif
+
+// Kernel entry point. For every declared field it evaluates the matching
+// right-hand side, passes it through RK3() together with the input and
+// output handles and dt, and stores the result with WRITE().
+// NOTE(review): LINDUCTION is not defined in this file - confirm it is
+// provided by the build, otherwise the induction update is compiled out.
+Kernel void
+solve(Scalar dt) {
+  // Log-density
+  WRITE(out_lnrho, RK3(out_lnrho, lnrho, continuity(uu, lnrho), dt));
+
+  // Vector potential (only with induction enabled)
+  #if LINDUCTION
+  WRITE(out_aa,    RK3(out_aa, aa, induction(uu, aa), dt));
+  #endif
+
+  // Velocity
+  WRITE(out_uu, RK3(out_uu, uu, momentum(uu, lnrho, vertexIdx), dt));
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
--- a/acc/pseudodisk/stencil_process_linegrav.sps
+++ b/acc/pseudodisk/stencil_process_linegrav.sps
@@ -0,0 +1,233 @@
+#define LINDUCTION (1)
+#define LENTROPY (1)
+
+
+// Declare uniforms (i.e. device constants)
+uniform Scalar cs2_sound;
+uniform Scalar nu_visc;
+uniform Scalar cp_sound;
+uniform Scalar mu0;
+uniform Scalar eta;
+uniform Scalar gamma;
+uniform Scalar chi;
+uniform Scalar zeta;
+
+uniform int nx_min;
+uniform int ny_min;
+uniform int nz_min;
+uniform int nx;
+uniform int ny;
+uniform int nz;
+
+uniform Scalar xorig;
+uniform Scalar yorig;
+uniform Scalar zorig;
+
+//Star position
+uniform Scalar star_pos_x;
+uniform Scalar star_pos_z;
+uniform Scalar GM_star;
+
+//Needed for gravity
+uniform Scalar dsx;
+uniform Scalar dsy;
+uniform Scalar dsz;
+uniform Scalar inv_dsx;
+uniform Scalar inv_dsy;
+uniform Scalar inv_dsz;
+
+// Euclidean distance between a and b: sqrt((a-b) . (a-b)).
+Scalar 
+distance_x(Vector a, Vector b) 
+{ 
+    return sqrt(dot(a-b, a-b)); 
+}
+
+// Vector-field overload of value(): applies value() to each component of uu
+// and packs the three results into a Vector, in x, y, z order.
+Vector
+value(in Vector uu)
+{
+    return (Vector){value(uu.x), value(uu.y), value(uu.z)};
+}
+
+// Builds a Matrix from the gradients of the three components of uu:
+// gradient(uu.x), gradient(uu.y), gradient(uu.z), in that order.
+Matrix
+gradients(in Vector uu)
+{
+    return (Matrix){gradient(uu.x), gradient(uu.y), gradient(uu.z)};
+}
+
+// Right-hand side of the continuity equation in log-density form:
+//   -u . grad(lnrho) - div(u)
+Scalar
+continuity(in Vector uu, in Scalar lnrho) {
+    return -dot(value(uu), gradient(lnrho)) - divergence(uu);
+}
+
+// "Line-like" gravity with no y-component.
+// The star position is given the same y coordinate as the vertex, so the
+// distance RR only measures the offset in the x-z plane. The force per unit
+// mass has magnitude GM_star / RR^2 and points from the vertex towards
+// (star_pos_x, star_pos_z).
+// NOTE(review): RR == 0 (vertex on the line) is not guarded and divides by
+// zero.
+Vector 
+grav_force_line(const int3 vertexIdx)
+{
+    // Vertex position computed from the grid spacings and the origin offsets
+    Vector vertex_pos = (Vector){dsx * vertexIdx.x - xorig, dsy * vertexIdx.y - yorig, dsz * vertexIdx.z - zorig};
+    // Earlier variant that also shifted the star by the origin offsets:
+    //Vector star_pos   = (Vector){star_pos_x      - xorig, dsy * vertexIdx.y - yorig, star_pos_z      - zorig};
+    Vector star_pos   = (Vector){star_pos_x,                dsy * vertexIdx.y - yorig, star_pos_z};
+
+    // distance_x() is the Euclidean distance helper defined in this file;
+    // no function named distance() is defined here.
+    const Scalar RR = distance_x(star_pos, vertex_pos);
+
+    const Scalar G_force_abs = GM_star / (RR*RR); // Force per unit mass;
+
+    // Magnitude times the unit vector from the vertex towards the star
+    Vector G_force = (Vector){ - G_force_abs*((vertex_pos.x-star_pos.x)/RR),
+                                 AcReal(0.0),
+                               - G_force_abs*((vertex_pos.z-star_pos.z)/RR)};
+
+    return G_force;
+}
+
+#if LENTROPY
+// Right-hand side of the momentum equation, entropy build:
+//   -(grad u) u - cs2_sound * grad(lnrho)
+//   + nu_visc * (laplace(u) + 1/3 grad(div u) + 2 S grad(lnrho))
+//   + zeta * grad(div u)
+//   - cs2_sound / cp_sound * grad(ss)
+//   + (j x B) / rho
+//   + grav_force_line(vertexIdx)
+// with j = (grad(div A) - laplace(A)) / mu0, B = curl(A), rho = exp(lnrho)
+// and S the result of stress_tensor(uu).
+Vector
+momentum(in Vector uu, in Scalar lnrho, in Scalar ss, in Vector aa, const int3 vertexIdx) {
+  Vector mom;
+
+  const Matrix S = stress_tensor(uu);
+
+  // Advection, pressure gradient from density, viscous and bulk terms
+  mom = -mul(gradients(uu), value(uu)) -
+    cs2_sound * gradient(lnrho) +
+    nu_visc *
+    (laplace_vec(uu) + Scalar(1. / 3.) * gradient_of_divergence(uu) +
+      Scalar(2.) * mul(S, gradient(lnrho))) + zeta * gradient_of_divergence(uu);
+
+  // Entropy contribution to the pressure gradient
+  mom = mom - cs2_sound * (Scalar(1.) / cp_sound) * gradient(ss);
+
+  // Lorentz force per unit mass
+  const Vector grad_div = gradient_of_divergence(aa);
+  const Vector lap = laplace_vec(aa);
+  const Vector j = (Scalar(1.) / mu0) * (grad_div - lap);
+  const Vector B = curl(aa);
+  mom = mom + (Scalar(1.) / exp(value(lnrho))) * cross(j, B);
+
+  mom = mom + grav_force_line(vertexIdx);
+
+  return mom;
+}
+#else
+// Right-hand side of the momentum equation without entropy and magnetic
+// terms: advection, pressure gradient, viscous and bulk terms, and gravity.
+Vector
+momentum(in Vector uu, in Scalar lnrho, const int3 vertexIdx) {
+  Vector mom;
+
+  const Matrix S = stress_tensor(uu);
+
+  mom = -mul(gradients(uu), value(uu)) -
+    cs2_sound * gradient(lnrho) +
+    nu_visc *
+    (laplace_vec(uu) + Scalar(1. / 3.) * gradient_of_divergence(uu) +
+      Scalar(2.) * mul(S, gradient(lnrho))) + zeta * gradient_of_divergence(uu);
+
+  mom = mom + grav_force_line(vertexIdx);
+
+  return mom;
+}
+#endif
+
+
+// Right-hand side of the induction equation for the vector potential A:
+//   u x B - eta * (grad(div A) - laplace(A)),  with B = curl(A)
+// grad(div A) - laplace(A) stands in for curl(curl(A)) so that no first
+// derivative is applied twice (see pg. 28 in arXiv:astro-ph/0109497).
+// The factors mu0 and 1/mu0 of the resistive term cancel.
+Vector
+induction(in Vector uu, in Vector aa) {
+  const Vector curl_of_curl = gradient_of_divergence(aa) - laplace_vec(aa);
+  const Vector u_cross_B = cross(value(uu), curl(aa));
+
+  return u_cross_B - eta * curl_of_curl;
+}
+
+
+#if LENTROPY
+// Logarithmic temperature from entropy and log-density:
+//   lnT = LNT0 + gamma * ss / cp_sound + (gamma - 1) * (lnrho - LNRHO0)
+// The entropy term carries the factor gamma (ss / cv = gamma * ss / cp),
+// matching the gamma * grad(ss) / cp_sound gradients in heat_conduction().
+Scalar
+lnT( in Scalar ss, in Scalar lnrho) {
+  const Scalar lnT = LNT0 + gamma * value(ss) / cp_sound +
+    (gamma - AcReal(1.)) * (value(lnrho) - LNRHO0);
+  return lnT;
+}
+
+// Nabla dot (K nabla T) / (rho T)
+// Evaluated as cp_sound * chi * (first_term + second_term . third_term),
+// with all three terms written out in terms of ss and lnrho.
+Scalar
+heat_conduction( in Scalar ss, in Scalar lnrho) {
+  const Scalar inv_cp_sound = AcReal(1.) / cp_sound;
+
+  // Gradient of ln(chi); fixed to zero here
+  const Vector grad_ln_chi = (Vector) {
+    0,
+    0,
+    0
+  }; // TODO not used
+
+  // gamma/cp * laplace(ss) + (gamma - 1) * laplace(lnrho)
+  const Scalar first_term = gamma * inv_cp_sound * laplace(ss) +
+    (gamma - AcReal(1.)) * laplace(lnrho);
+  // gamma/cp * grad(ss) + (gamma - 1) * grad(lnrho)
+  const Vector second_term = gamma * inv_cp_sound * gradient(ss) +
+    (gamma - AcReal(1.)) * gradient(lnrho);
+  // gamma * (grad(ss)/cp + grad(lnrho)) + grad(ln chi)
+  const Vector third_term = gamma * (inv_cp_sound * gradient(ss) +
+    gradient(lnrho)) + grad_ln_chi;
+
+  return cp_sound * chi * (first_term + dot(second_term, third_term));
+}
+
+// Placeholder heating term: returns the constant 1.
+// The index arguments i, j and k are accepted but not read.
+// NOTE(review): entropy() below uses H_CONST instead of calling this
+// function - confirm whether heating() is still needed.
+Scalar
+heating(const int i, const int j, const int k) {
+  return 1;
+}
+
+// Right-hand side of the entropy equation:
+//   -u . grad(ss)
+//   + 1/(rho*T) * (H_CONST - C_CONST + eta*mu0*j^2
+//                  + 2*rho*nu_visc*contract(S) + zeta*rho*(div u)^2)
+//   + heat_conduction(ss, lnrho)
+// ss, uu, lnrho and aa are the entropy, velocity, log-density and vector
+// potential fields. rho is exp(lnrho) and T is exp(lnT(ss, lnrho)).
+Scalar
+entropy(in Scalar ss, in Vector uu, in Scalar lnrho, in Vector aa) {
+    const Matrix S = stress_tensor(uu);
+
+    // Current density, same definition as in momentum():
+    // j = nabla x nabla x A / mu0 = (nabla(nabla dot A) - nabla^2(A)) / mu0
+    const Vector j = (Scalar(1.) / mu0) *
+                     (gradient_of_divergence(aa) - laplace_vec(aa));
+
+    // 1 / (rho * T): density and temperature are multiplied, not added
+    const Scalar inv_pT = AcReal(1.) / (exp(value(lnrho)) * exp(lnT(ss, lnrho)));
+
+    return -dot(value(uu), gradient(ss)) +
+      inv_pT * (H_CONST - C_CONST +
+        eta * mu0 * dot(j, j) +
+        AcReal(2.) * exp(value(lnrho)) * nu_visc * contract(S) +
+        zeta * exp(value(lnrho)) * divergence(uu) * divergence(uu)
+      ) + heat_conduction(ss, lnrho);
+}
+#endif
+
+// Declare input and output arrays using locations specified in the
+// array enum in astaroth.h
+in Scalar lnrho = VTXBUF_LNRHO;
+out Scalar out_lnrho = VTXBUF_LNRHO;
+
+in Vector uu = (int3) {VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ};
+out Vector out_uu = (int3) {VTXBUF_UUX,VTXBUF_UUY,VTXBUF_UUZ};
+
+
+#if LINDUCTION
+in Vector aa = (int3) {VTXBUF_AX,VTXBUF_AY,VTXBUF_AZ};
+out Vector out_aa = (int3) {VTXBUF_AX,VTXBUF_AY,VTXBUF_AZ};
+#endif
+
+#if LENTROPY
+in Scalar ss = VTXBUF_ENTROPY;
+out Scalar out_ss = VTXBUF_ENTROPY;
+#endif
+
+// Kernel entry point. For every declared field it evaluates the matching
+// right-hand side, passes it through RK3() together with the input and
+// output handles and dt, and stores the result with WRITE().
+// Which fields are updated is selected by LINDUCTION and LENTROPY, both
+// defined at the top of this file.
+Kernel void
+solve(Scalar dt) {
+    // Log-density
+    WRITE(out_lnrho, RK3(out_lnrho, lnrho, continuity(uu, lnrho), dt));
+
+    // Vector potential (only with induction enabled)
+    #if LINDUCTION
+        WRITE(out_aa,    RK3(out_aa, aa, induction(uu, aa), dt));
+    #endif
+
+
+    // Velocity; the entropy build uses the five-argument momentum() and
+    // additionally updates the entropy field
+    #if LENTROPY
+        WRITE(out_uu, RK3(out_uu, uu, momentum(uu, lnrho, ss, aa, vertexIdx), dt));
+        WRITE(out_ss,    RK3(out_ss, ss, entropy(ss, uu, lnrho, aa), dt));
+    #else
+        WRITE(out_uu, RK3(out_uu, uu, momentum(uu, lnrho, vertexIdx), dt));
+    #endif
+}
--- a/acc/samples/common_header.h
+++ b/acc/samples/common_header.h
@@ -0,0 +1,422 @@
+/*
+    Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
+ * \brief Brief info.
+ *
+ * Provides an interface to Astaroth. Contains all the necessary configuration
+ * structs and functions for running the code on multiple GPUs.
+ *
+ * All interface functions declared here (such as acInit()) operate all GPUs
+ * available in the node under the hood, and the user does not need any
+ * information about the decomposition, synchronization or such to use these
+ * functions.
+ *
+ */
+#pragma once
+
+/* Prevent name mangling */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <float.h>        // FLT_EPSILON, etc
+#include <stdlib.h>       // size_t
+#include <vector_types.h> // CUDA vector types (float4, etc)
+
+
+/*
+ * =============================================================================
+ * Flags for auto-optimization
+ * =============================================================================
+ */
+#define AUTO_OPTIMIZE (0) // DEPRECATED TODO remove
+#define BOUNDCONDS_OPTIMIZE (0)
+#define GENERATE_BENCHMARK_DATA (0)
+
+// Device info
+#define REGISTERS_PER_THREAD (255)
+#define MAX_REGISTERS_PER_BLOCK (65536)
+#define MAX_THREADS_PER_BLOCK (1024)
+#define MAX_TB_DIM (MAX_THREADS_PER_BLOCK)
+#define NUM_ITERATIONS (10)
+#define WARP_SIZE (32)
+
+
+/*
+ * =============================================================================
+ * Compile-time constants used during simulation (user definable)
+ * =============================================================================
+ */
+#define STENCIL_ORDER (6)
+
+///////////// PAD TEST
+// NOTE: works only with nx is divisible by 32
+//#define PAD_LEAD (32 - STENCIL_ORDER/2)
+//#define PAD_SIZE (32 - STENCIL_ORDER)
+///////////// PAD TEST
+
+// The L-prefix is inherited from the old Astaroth.
+// MV: L marks a logical switch variable, i.e. something with a true or false value.
+#define LFORCING (0) // Note: forcing is disabled currently in the files generated by acc (compiler of our DSL)
+#define LINDUCTION (1)
+#define LENTROPY (1)
+#define LTEMPERATURE (0)
+
+#define AC_THERMAL_CONDUCTIVITY (AcReal(0.001)) // TODO: make an actual config parameter
+
+/*
+ * =============================================================================
+ * Identifiers used to construct the parameter lists for AcMeshInfo
+ * (IntParamType and RealParamType)
+ * (user definable)
+ * =============================================================================
+ */
+// clang-format off
+#define AC_FOR_INT_PARAM_TYPES(FUNC)\
+        /* cparams */\
+        FUNC(AC_nx), \
+        FUNC(AC_ny), \
+        FUNC(AC_nz), \
+        FUNC(AC_mx), \
+        FUNC(AC_my), \
+        FUNC(AC_mz), \
+        FUNC(AC_nx_min), \
+        FUNC(AC_ny_min), \
+        FUNC(AC_nz_min), \
+        FUNC(AC_nx_max), \
+        FUNC(AC_ny_max), \
+        FUNC(AC_nz_max), \
+        /* Other */\
+        FUNC(AC_max_steps), \
+        FUNC(AC_save_steps), \
+        FUNC(AC_bin_steps), \
+        FUNC(AC_bc_type), \
+        /* Additional */\
+        FUNC(AC_mxy),\
+        FUNC(AC_nxy),\
+        FUNC(AC_nxyz)
+#define AC_FOR_REAL_PARAM_TYPES(FUNC)\
+        /* cparams */\
+        FUNC(AC_dsx), \
+        FUNC(AC_dsy), \
+        FUNC(AC_dsz), \
+        FUNC(AC_dsmin), \
+        /* physical grid*/\
+        FUNC(AC_xlen), \
+        FUNC(AC_ylen), \
+        FUNC(AC_zlen), \
+        FUNC(AC_xorig), \
+        FUNC(AC_yorig), \
+        FUNC(AC_zorig), \
+        /*Physical units*/\
+        FUNC(AC_unit_density),\
+        FUNC(AC_unit_velocity),\
+        FUNC(AC_unit_length),\
+        /* properties of gravitating star*/\
+        FUNC(AC_star_pos_x),\
+        FUNC(AC_star_pos_y),\
+        FUNC(AC_star_pos_z),\
+        FUNC(AC_M_star),\
+        /* Run params */\
+        FUNC(AC_cdt), \
+        FUNC(AC_cdtv), \
+        FUNC(AC_cdts), \
+        FUNC(AC_nu_visc), \
+        FUNC(AC_cs_sound), \
+        FUNC(AC_eta), \
+        FUNC(AC_mu0), \
+        FUNC(AC_relhel), \
+        FUNC(AC_cp_sound), \
+        FUNC(AC_gamma), \
+        FUNC(AC_cv_sound), \
+        FUNC(AC_lnT0), \
+        FUNC(AC_lnrho0), \
+        FUNC(AC_zeta), \
+        FUNC(AC_trans),\
+        /* Other */\
+        FUNC(AC_bin_save_t), \
+        /* Initial condition params */\
+        FUNC(AC_ampl_lnrho), \
+        FUNC(AC_ampl_uu), \
+        FUNC(AC_angl_uu), \
+        FUNC(AC_lnrho_edge),\
+        FUNC(AC_lnrho_out),\
+        /* Additional helper params */\
+        /* (deduced from other params do not set these directly!) */\
+        FUNC(AC_G_CONST),\
+        FUNC(AC_GM_star),\
+        FUNC(AC_sq2GM_star),\
+        FUNC(AC_cs2_sound), \
+        FUNC(AC_inv_dsx), \
+        FUNC(AC_inv_dsy), \
+        FUNC(AC_inv_dsz)
+// clang-format on
+
+/*
+ * =============================================================================
+ * Identifiers for VertexBufferHandle
+ * (i.e. the arrays used to construct AcMesh)
+ * (user definable)
+ * =============================================================================
+ */
+// clang-format off
+/* Vertex buffer handles of the hydrodynamic fields. Every entry, including
+ * the last one, ends with a comma because AC_FOR_VTXBUF_HANDLES concatenates
+ * this list with the other handle lists. The macro ends on the VTXBUF_UUZ
+ * line: there is no trailing line continuation, so nothing that follows can
+ * be spliced into the definition by accident. */
+#define AC_FOR_HYDRO_VTXBUF_HANDLES(FUNC)\
+        FUNC(VTXBUF_LNRHO), \
+        FUNC(VTXBUF_UUX), \
+        FUNC(VTXBUF_UUY), \
+        FUNC(VTXBUF_UUZ),
+/* Disabled handle: FUNC(VTXBUF_DYE), */
+
+#if LINDUCTION
+#define AC_FOR_INDUCTION_VTXBUF_HANDLES(FUNC)\
+        FUNC(VTXBUF_AX), \
+        FUNC(VTXBUF_AY), \
+        FUNC(VTXBUF_AZ),
+#else
+#define AC_FOR_INDUCTION_VTXBUF_HANDLES(FUNC)
+#endif
+
+#if LENTROPY
+#define AC_FOR_ENTROPY_VTXBUF_HANDLES(FUNC)\
+        FUNC(VTXBUF_ENTROPY),
+#else
+#define AC_FOR_ENTROPY_VTXBUF_HANDLES(FUNC)
+#endif
+
+#if LTEMPERATURE
+#define AC_FOR_TEMPERATURE_VTXBUF_HANDLES(FUNC)\
+        FUNC(VTXBUF_TEMPERATURE),
+#else
+#define AC_FOR_TEMPERATURE_VTXBUF_HANDLES(FUNC)
+#endif
+
+#define AC_FOR_VTXBUF_HANDLES(FUNC)\
+        AC_FOR_HYDRO_VTXBUF_HANDLES(FUNC)\
+        AC_FOR_INDUCTION_VTXBUF_HANDLES(FUNC)\
+        AC_FOR_ENTROPY_VTXBUF_HANDLES(FUNC)\
+        AC_FOR_TEMPERATURE_VTXBUF_HANDLES(FUNC)
+// clang-format on
+
+/*
+ * =============================================================================
+ * Single/double precision switch
+ * =============================================================================
+ */
+/* AcReal/AcReal3 are double/double3 when AC_DOUBLE_PRECISION is defined as 1
+ * and float/float3 otherwise. An undefined AC_DOUBLE_PRECISION evaluates to
+ * 0 in the #if and therefore selects single precision. The AC_REAL_* limits
+ * map to the matching <float.h> constants. */
+#if AC_DOUBLE_PRECISION == 1
+typedef double AcReal;
+typedef double3 AcReal3;
+#define AC_REAL_MAX (DBL_MAX)
+#define AC_REAL_MIN (DBL_MIN)
+#define AC_REAL_EPSILON (DBL_EPSILON)
+#else
+typedef float AcReal;
+typedef float3 AcReal3;
+#define AC_REAL_MAX (FLT_MAX)
+#define AC_REAL_MIN (FLT_MIN)
+#define AC_REAL_EPSILON (FLT_EPSILON)
+#endif
+
+typedef struct {
+    AcReal3 row[3];
+} AcMatrix;
+
+/*
+ * =============================================================================
+ * Helper macros
+ * =============================================================================
+ */
+#define AC_GEN_ID(X) X
+#define AC_GEN_STR(X) #X
+
+/*
+ * =============================================================================
+ * Error codes
+ * =============================================================================
+ */
+typedef enum { AC_SUCCESS = 0, AC_FAILURE = 1 } AcResult;
+
+/*
+ * =============================================================================
+ * Reduction types
+ * =============================================================================
+ */
+typedef enum {
+    RTYPE_MAX,
+    RTYPE_MIN,
+    RTYPE_RMS,
+    RTYPE_RMS_EXP,
+    NUM_REDUCTION_TYPES
+} ReductionType;
+
+/*
+ * =============================================================================
+ * Definitions for the enums and structs for AcMeshInfo (DO NOT TOUCH)
+ * =============================================================================
+ */
+typedef enum {
+    AC_FOR_INT_PARAM_TYPES(AC_GEN_ID),
+    NUM_INT_PARAM_TYPES
+} AcIntParam;
+
+typedef enum {
+    AC_FOR_REAL_PARAM_TYPES(AC_GEN_ID),
+    NUM_REAL_PARAM_TYPES
+} AcRealParam;
+
+extern const char* intparam_names[];  // Defined in astaroth.cu
+extern const char* realparam_names[]; // Defined in astaroth.cu
+
+typedef struct {
+    int int_params[NUM_INT_PARAM_TYPES];
+    AcReal real_params[NUM_REAL_PARAM_TYPES];
+} AcMeshInfo;
+
+/*
+ * =============================================================================
+ * Definitions for the enums and structs for AcMesh (DO NOT TOUCH)
+ * =============================================================================
+ */
+typedef enum {
+    AC_FOR_VTXBUF_HANDLES(AC_GEN_ID) NUM_VTXBUF_HANDLES
+} VertexBufferHandle;
+
+extern const char* vtxbuf_names[]; // Defined in astaroth.cu
+
+/*
+typedef struct {
+    AcReal* data;
+} VertexBuffer;
+*/
+
+// NOTE: there is no particular benefit in declaring AcMesh a class, since
+// a library user may already have allocated memory for the vertex_buffers.
+// A class would then allocate memory a second time when the user starts
+// filling it with data. => It is better to treat AcMesh as a
+// payload-only struct.
+typedef struct {
+    AcReal* vertex_buffer[NUM_VTXBUF_HANDLES];
+    AcMeshInfo info;
+} AcMesh;
+
+/* Size and index helpers for one vertex buffer. The macro argument is
+ * parenthesised so that any expression yielding an AcMeshInfo can be passed. */
+
+/* Number of vertices in the full mesh (mx * my * mz). Each factor is widened
+ * to size_t before multiplying so the product cannot overflow int. */
+#define AC_VTXBUF_SIZE(mesh_info)                                              \
+    ((size_t)(mesh_info).int_params[AC_mx] *                                   \
+     (size_t)(mesh_info).int_params[AC_my] *                                   \
+     (size_t)(mesh_info).int_params[AC_mz])
+
+/* Size of the full mesh in bytes. */
+#define AC_VTXBUF_SIZE_BYTES(mesh_info)                                        \
+    (sizeof(AcReal) * AC_VTXBUF_SIZE(mesh_info))
+
+/* Number of vertices in the computational domain (nx * ny * nz).
+ * The result keeps its original int type for existing callers. */
+#define AC_VTXBUF_COMPDOMAIN_SIZE(mesh_info)                                   \
+    ((mesh_info).int_params[AC_nx] * (mesh_info).int_params[AC_ny] *           \
+     (mesh_info).int_params[AC_nz])
+
+/* Size of the computational domain in bytes, computed in size_t. */
+#define AC_VTXBUF_COMPDOMAIN_SIZE_BYTES(mesh_info)                             \
+    (sizeof(AcReal) * (size_t)(mesh_info).int_params[AC_nx] *                  \
+     (size_t)(mesh_info).int_params[AC_ny] *                                   \
+     (size_t)(mesh_info).int_params[AC_nz])
+
+/* Linear index of vertex (i, j, k), with i running fastest. */
+#define AC_VTXBUF_IDX(i, j, k, mesh_info)                                      \
+    ((i) + (j)*(mesh_info).int_params[AC_mx] +                                 \
+     (k)*(mesh_info).int_params[AC_mx] * (mesh_info).int_params[AC_my])
+
+/*
+ * =============================================================================
+ * Astaroth interface
+ * =============================================================================
+ */
+/** Starting point of all GPU computation. Handles the allocation and
+initialization of *all memory needed on all GPUs in the node*. In other words,
+sets up everything GPU-side so that calling any other GPU interface function
+afterwards does not result in illegal memory accesses. */
+AcResult acInit(const AcMeshInfo& mesh_info);
+
+/** Splits the host_mesh and distributes it among the GPUs in the node */
+AcResult acLoad(const AcMesh& host_mesh);
+AcResult acLoadWithOffset(const AcMesh& host_mesh, const int3& start, const int num_vertices);
+
+/** Does all three steps of the RK3 integration and computes the boundary
+conditions when necessary. Note that the boundary conditions are not applied
+after the final integration step.
+The result can be fetched to CPU memory with acStore(). */
+AcResult acIntegrate(const AcReal& dt);
+
+/** Performs a single RK3 step without computing boundary conditions. */
+AcResult acIntegrateStep(const int& isubstep, const AcReal& dt);
+
+/** Applies boundary conditions on the GPU meshes and communicates the
+ ghost zones among GPUs if necessary */
+AcResult acBoundcondStep(void);
+
+/** Performs a scalar reduction on all GPUs in the node and returns the result.
+ */
+AcReal acReduceScal(const ReductionType& rtype, const VertexBufferHandle& a);
+
+/** Performs a vector reduction on all GPUs in the node and returns the result.
+ */
+AcReal acReduceVec(const ReductionType& rtype, const VertexBufferHandle& a,
+                   const VertexBufferHandle& b, const VertexBufferHandle& c);
+
+/** Stores the mesh distributed among GPUs of the node back to a single host
+ * mesh */
+AcResult acStore(AcMesh* host_mesh);
+AcResult acStoreWithOffset(const int3& start, const int num_vertices, AcMesh* host_mesh);
+
+/** Frees all GPU allocations and resets all devices in the node. Should be
+ * called at exit. */
+AcResult acQuit(void);
+
+/** Synchronizes all devices. All calls to Astaroth are asynchronous by default
+    unless otherwise stated. */
+AcResult acSynchronize(void);
+
+/* End extern "C" */
+#ifdef __cplusplus
+}
+#endif
+
+/*
+ * =============================================================================
+ * Notes
+ * =============================================================================
+ */
+/*
+typedef enum {
+    VTX_BUF_LNRHO,
+    VTX_BUF_UUX,
+    VTX_BUF_UUY,
+    VTX_BUF_UUZ,
+    NUM_VERTEX_BUFFER_HANDLES
+} VertexBufferHandle
+
+// LNRHO etc
+typedef struct {
+    AcReal* data;
+} VertexBuffer;
+
+// Host
+typedef struct {
+    VertexBuffer vertex_buffers[NUM_VERTEX_BUFFER_HANDLES];
+    MeshInfo info;
+} Mesh;
+
+// Device
+typedef struct {
+    VertexBuffer in[NUM_VERTEX_BUFFER_HANDLES];
+    VertexBuffer out[NUM_VERTEX_BUFFER_HANDLES];
+} VertexBufferArray;
+*/
--- a/acc/samples/sample_stencil_assembly.sas
+++ b/acc/samples/sample_stencil_assembly.sas
@@ -0,0 +1,49 @@
+// TODO comments and reformatting
+
+//Scalar
+//dostuff(in Scalar uux)
+//{
+//   return uux[vertexIdx.x, vertexIdx.y, vertexIdx.z];
+//}
+
+// stencil_assembly.in
+// Sample preprocessed stencil: a weighted sum of uux at the vertex and its
+// two neighbours along x, using the fixed weights {1, 2, 3}.
+Preprocessed Scalar
+some_exotic_stencil_computation(in Scalar uux)
+{
+    //#if STENCIL_ORDER == 2
+    //    const Scalar coefficients[] = {1, 1, 1};
+    //#else if STENCIL_ORDER == 4
+    //    const Scalar coefficients[] = {....};
+    //#endif
+
+    // Index of the vertex being processed
+    int i = vertexIdx.x;
+    int j = vertexIdx.y;
+    int k = vertexIdx.z;
+    const Scalar coefficients[] = {1, 2, 3};
+
+    // Weights apply to the left neighbour, the centre and the right neighbour
+    return coefficients[0] * uux[i-1, j, k] + 
+           coefficients[1] * uux[i, j, k] + 
+           coefficients[2] * uux[i+1, j, k];
+}
+
+// stencil_process.in
+//in Scalar uux_in = VTXBUF_UUX;
+//out Scalar uux_out = VTXBUF_UUX;
+
+
+//Kernel
+//solve(Scalar dt)
+//{
+//    uux_out = some_exotic_stencil(uux_in);
+//}
+
+
+
+
+
+
+
+
+
+
+
--- a/acc/samples/sample_stencil_process.sps
+++ b/acc/samples/sample_stencil_process.sps
@@ -0,0 +1,149 @@
+// TODO comments and reformatting
+
+uniform Scalar dsx;
+uniform Scalar dsy;
+uniform Scalar dsz;
+
+uniform Scalar GM_star;
+// Other uniforms types than Scalar or int not yet supported
+
+// BUILTIN
+//Scalar dot(...){}
+
+// BUILTIN
+//Scalar distance(Vector a, Vector b) { return sqrt(dot(a, b)); }
+
+// BUILTIN
+// Scalar first_derivative(Scalar pencil[], Scalar inv_ds) { return pencil[3] * inv_ds; }
+
+// Sums the first STENCIL_ORDER+1 entries of pencil and scales the sum by
+// inv_ds.
+// NOTE(review): no finite-difference coefficients are applied, so this looks
+// like a placeholder rather than an actual derivative - confirm.
+Scalar first_derivative(Scalar pencil[], Scalar inv_ds)
+{
+    Scalar res = 0;
+    for (int i = 0; i < STENCIL_ORDER+1; ++i) {
+        res = res + pencil[i];
+    }
+    return inv_ds * res;
+}
+
+// Euclidean distance between the points a and b: the square root of the
+// summed squared component differences.
+Scalar distance(Vector a, Vector b)
+{
+    const Scalar dx = a.x - b.x;
+    const Scalar dy = a.y - b.y;
+    const Scalar dz = a.z - b.z;
+    return sqrt(dx * dx + dy * dy + dz * dz);
+}
+
+// Returns GM_star divided by the result of distance() between a star fixed
+// at the origin and the vertex at (dsx * i, dsy * j, dsz * k).
+// NOTE(review): the result is not finite whenever distance() returns 0.
+Scalar
+gravity_potential(int i, int j, int k)
+{
+    Vector star_pos = (Vector){0, 0, 0};
+    Vector vertex_pos = (Vector){dsx * i, dsy * j, dsz * k};
+    return GM_star / distance(star_pos, vertex_pos);
+}
+
+// Samples gravity_potential() along x around vertex (i, j, k) and passes the
+// samples to first_derivative() with the inverse grid spacing 1 / dsx.
+Scalar
+gradx_gravity_potential(int i, int j, int k)
+{
+    // The pencil holds STENCIL_ORDER + 1 samples centred on the vertex, so
+    // the offsets run over [-STENCIL_ORDER/2, STENCIL_ORDER/2] and every
+    // write stays inside the array.
+    Scalar pencil[STENCIL_ORDER + 1];
+    for (int offset = -STENCIL_ORDER/2; offset <= STENCIL_ORDER/2; ++offset) {
+        pencil[offset + STENCIL_ORDER/2] = gravity_potential(i + offset, j, k);
+    }
+
+    Scalar inv_ds = Scalar(1.) / dsx;
+    return first_derivative(pencil, inv_ds);
+}
+
+// Samples gravity_potential() along y around vertex (i, j, k) and passes the
+// samples to first_derivative() with the inverse grid spacing 1 / dsy.
+Scalar
+grady_gravity_potential(int i, int j, int k)
+{
+    // The pencil holds STENCIL_ORDER + 1 samples centred on the vertex, so
+    // the offsets run over [-STENCIL_ORDER/2, STENCIL_ORDER/2] and every
+    // write stays inside the array.
+    Scalar pencil[STENCIL_ORDER + 1];
+    for (int offset = -STENCIL_ORDER/2; offset <= STENCIL_ORDER/2; ++offset) {
+        pencil[offset + STENCIL_ORDER/2] = gravity_potential(i, j + offset, k);
+    }
+
+    Scalar inv_ds = Scalar(1.) / dsy;
+    return first_derivative(pencil, inv_ds);
+}
+
+// Samples gravity_potential() along z around vertex (i, j, k) and passes the
+// samples to first_derivative() with the inverse grid spacing 1 / dsz.
+Scalar
+gradz_gravity_potential(int i, int j, int k)
+{
+    // The pencil holds STENCIL_ORDER + 1 samples centred on the vertex, so
+    // the offsets run over [-STENCIL_ORDER/2, STENCIL_ORDER/2] and every
+    // write stays inside the array.
+    Scalar pencil[STENCIL_ORDER + 1];
+    for (int offset = -STENCIL_ORDER/2; offset <= STENCIL_ORDER/2; ++offset) {
+        pencil[offset + STENCIL_ORDER/2] = gravity_potential(i, j, k + offset);
+    }
+
+    Scalar inv_ds = Scalar(1.) / dsz;
+    return first_derivative(pencil, inv_ds);
+}
+
+// Sample momentum term: packs the x, y and z results of the
+// grad*_gravity_potential() helpers at vertex (i, j, k) into a Vector.
+// The velocity field uu is accepted but not read.
+Vector
+momentum(int i, int j, int k, in Vector uu)
+{
+    const Scalar ddx = gradx_gravity_potential(i, j, k);
+    const Scalar ddy = grady_gravity_potential(i, j, k);
+    const Scalar ddz = gradz_gravity_potential(i, j, k);
+
+    return (Vector){ddx, ddy, ddz};
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
--- a/acc/src/acc.l
+++ b/acc/src/acc.l
@@ -0,0 +1,56 @@
+%option yylineno
+
+D [0-9]
+L [a-zA-Z_]
+
+%{
+/*
+ * Lexer of the acc DSL.
+ *
+ * Keywords and multi-character operators are returned as the named tokens
+ * declared in acc.y; single-character punctuation and operators are returned
+ * as their own character code. Flex picks the longest match and, on a tie,
+ * the rule listed first, so keywords such as "in" do not swallow the start of
+ * longer identifiers such as "inv_ds".
+ */
+#include "acc.tab.h"
+%}
+
+%%
+
+"Scalar"    { return SCALAR; } /* Builtin types */
+"Vector"    { return VECTOR; }
+"Matrix"    { return MATRIX; }
+"void"      { return VOID; } /* Rest of the types inherited from C */
+"int"       { return INT; }
+"int3"      { return INT3; }
+
+"Kernel"       { return KERNEL; } /* Function specifiers */
+"Preprocessed" { return PREPROCESSED; }
+
+"const"     { return CONSTANT; }
+"in"        { return IN; } /* Device func storage specifiers */
+"out"       { return OUT; }
+"uniform"   { return UNIFORM; }
+
+"else if"   { return ELIF; }
+"if"        { return IF; }
+"else"      { return ELSE; }
+"for"       { return FOR; }
+"while"     { return WHILE; }
+
+"return"    { return RETURN; }
+
+{D}+"."?{D}*[flud]? { return NUMBER; } /* Literals */
+"."{D}+[flud]?      { return NUMBER; }
+{L}({L}|{D})*       { return IDENTIFIER; }
+\"[^\"\n]*\"        { return IDENTIFIER; } /* String: ends at the first closing quote */
+
+"=="                { return LEQU; }/* Logic operations */
+"&&"                { return LAND; }
+"||"                { return LOR; }
+"<="                { return LLEQU; }
+
+"++"                { return INPLACE_INC; }
+"--"                { return INPLACE_DEC; }
+
+[-+*/;=\[\]{}(),\.<>!] { return yytext[0]; } /* Characters; '!' is the grammar's unary not */
+
+
+"//".*              { /* Skip regular comments */ }
+[ \t\n\v\r]+        { /* Ignore whitespace, tabs and newlines */ }
+.                   { /* stdout carries the generated code, so report on stderr */ fprintf(stderr, "unrecognized char %d: [%c]\n", *yytext, *yytext); }
+
+
+%%
--- a/acc/src/acc.y
+++ b/acc/src/acc.y
@@ -0,0 +1,234 @@
+%{
+/*
+ * Parser of the acc DSL. Every rule action builds an ASTNode (see ast.h), so
+ * the semantic value type of all symbols is ASTNode*.
+ */
+#include <stdio.h>
+#include <string.h>
+
+#include "ast.h"
+
+extern char* yytext; // Text of the token most recently matched by the lexer
+
+int yylex();
+int yyerror(const char* str);
+int yyget_lineno();
+
+#define YYSTYPE ASTNode* // Sets the default type
+%}
+
+/* Named tokens returned by the lexer (acc.l). Single-character tokens are */
+/* returned as their character code and need no declaration here.          */
+%token CONSTANT IN OUT UNIFORM
+%token IDENTIFIER NUMBER
+%token RETURN
+%token SCALAR VECTOR MATRIX
+%token VOID INT INT3
+%token IF ELSE FOR WHILE ELIF
+%token LEQU LAND LOR LLEQU
+%token KERNEL PREPROCESSED 
+%token INPLACE_INC INPLACE_DEC
+
+%%
+
+/*
+ * Start symbol. The finished program subtree is stored as the left child of
+ * the global `root` node.
+ * NOTE(review): `root` is dereferenced here, so it presumably has to be
+ * allocated by the caller before yyparse() runs -- confirm.
+ *
+ * A program is a left-recursive sequence of function definitions and global
+ * assignments/declarations; the ';' of the latter two is kept as a postfix.
+ */
+root: program { root->lhs = $1; }
+    ;
+
+program: /* Empty*/                                                                     { $$ = astnode_create(NODE_UNKNOWN, NULL, NULL); }
+       | program function_definition                                                    { $$ = astnode_create(NODE_UNKNOWN, $1, $2); }
+       | program assignment ';'        /* Global definition */                          { $$ = astnode_create(NODE_UNKNOWN, $1, $2); $$->postfix = ';'; }
+       | program declaration ';'       /* Global declaration */                         { $$ = astnode_create(NODE_UNKNOWN, $1, $2); $$->postfix = ';'; }
+       ;
+
+/*
+ * =============================================================================
+ * Functions
+ * =============================================================================
+ */
+
+/*
+ * A function definition is a declaration (type + name) followed by a
+ * parenthesised, possibly empty, parameter list and a compound statement.
+ * The parentheses are recorded as prefix/postfix of the parameter node.
+ */
+function_definition: function_declaration compound_statement                            { $$ = astnode_create(NODE_FUNCTION_DEFINITION, $1, $2); }
+                   ;
+
+function_declaration: declaration function_parameter_declaration                        { $$ = astnode_create(NODE_FUNCTION_DECLARATION, $1, $2); }
+                    ;
+
+function_parameter_declaration: '(' ')'                                                 { $$ = astnode_create(NODE_FUNCTION_PARAMETER_DECLARATION, NULL, NULL);  $$->prefix = '('; $$->postfix = ')'; }
+                              | '(' declaration_list ')'                                { $$ = astnode_create(NODE_FUNCTION_PARAMETER_DECLARATION, $2, NULL);    $$->prefix = '('; $$->postfix = ')'; }
+                              ;
+
+/*
+ * =============================================================================
+ * Statement
+ * =============================================================================
+ */
+/*
+ * Statements: selection (if / else if / else), iteration (while / for) and
+ * ';'-terminated executable statements (declaration, assignment, expression,
+ * return). Keywords and punctuation are not stored as child nodes; they are
+ * recorded in the prefix/infix/postfix fields of the node built by the rule.
+ * Bodies of if/else/while/for must be compound statements ('{' ... '}').
+ */
+statement_list: statement                                                               { $$ = astnode_create(NODE_UNKNOWN, $1, NULL); }
+              | statement_list statement                                                { $$ = astnode_create(NODE_UNKNOWN, $1, $2); }
+              ;
+
+compound_statement: '{' '}'                                                             { $$ = astnode_create(NODE_COMPOUND_STATEMENT, NULL, NULL); $$->prefix = '{'; $$->postfix = '}'; }
+                  | '{' statement_list '}'                                              { $$ = astnode_create(NODE_COMPOUND_STATEMENT, $2, NULL);   $$->prefix = '{'; $$->postfix = '}'; }
+                  ;
+
+statement: selection_statement                                                          { $$ = astnode_create(NODE_UNKNOWN, $1, NULL); }
+         | iteration_statement                                                          { $$ = astnode_create(NODE_UNKNOWN, $1, NULL); }
+         | exec_statement ';'                                                           { $$ = astnode_create(NODE_UNKNOWN, $1, NULL); $$->postfix = ';'; }
+         ;
+
+selection_statement: IF expression else_selection_statement                             { $$ = astnode_create(NODE_UNKNOWN, $2, $3); $$->prefix = IF; }
+                   ;
+
+else_selection_statement: compound_statement                                            { $$ = astnode_create(NODE_UNKNOWN, $1, NULL); }
+                        | compound_statement elif_selection_statement                   { $$ = astnode_create(NODE_UNKNOWN, $1, $2); }                        
+                        | compound_statement ELSE compound_statement                    { $$ = astnode_create(NODE_UNKNOWN, $1, $3); $$->infix = ELSE; }
+                        ;
+
+elif_selection_statement: ELIF expression else_selection_statement                      { $$ = astnode_create(NODE_UNKNOWN, $2, $3); $$->prefix = ELIF; }   
+                        ;
+
+iteration_statement: WHILE expression compound_statement                                { $$ = astnode_create(NODE_UNKNOWN, $2, $3); $$->prefix = WHILE; }
+                   | FOR for_expression compound_statement                              { $$ = astnode_create(NODE_UNKNOWN, $2, $3); $$->prefix = FOR; }
+                   ;
+
+for_expression: '(' for_init_param for_other_params ')'                                 { $$ = astnode_create(NODE_UNKNOWN, $2, $3); $$->prefix = '('; $$->postfix = ')'; }
+              ;
+
+for_init_param: expression ';'                                                          { $$ = astnode_create(NODE_UNKNOWN, $1, NULL); $$->postfix = ';'; }
+              | assignment ';'                                                          { $$ = astnode_create(NODE_UNKNOWN, $1, NULL); $$->postfix = ';'; }
+              ;
+
+for_other_params: expression ';'                                                        { $$ = astnode_create(NODE_UNKNOWN, $1, NULL); $$->postfix = ';'; }
+                | expression ';' expression                                             { $$ = astnode_create(NODE_UNKNOWN, $1, $3); $$->infix = ';'; }
+                ;
+
+exec_statement: declaration                                                             { $$ = astnode_create(NODE_UNKNOWN, $1, NULL); }
+              | assignment                                                              { $$ = astnode_create(NODE_UNKNOWN, $1, NULL); }
+              | expression                                                              { $$ = astnode_create(NODE_UNKNOWN, $1, NULL); }
+              | return return_statement                                                 { $$ = astnode_create(NODE_UNKNOWN, $1, $2); }
+              ;
+
+assignment: declaration '=' expression                                                  { $$ = astnode_create(NODE_UNKNOWN, $1, $3); $$->infix = '='; }
+          | expression '=' expression                                                   { $$ = astnode_create(NODE_UNKNOWN, $1, $3); $$->infix = '='; }
+          ; 
+
+return_statement: /* Empty */                                                           { $$ = astnode_create(NODE_UNKNOWN, NULL, NULL); }
+                | expression                                                            { $$ = astnode_create(NODE_UNKNOWN, $1, NULL); }
+                ;
+
+/*
+ * =============================================================================
+ * Declaration
+ * =============================================================================
+ */
+
+/*
+ * Declarations: an optional single type qualifier, a type specifier and
+ * either a plain identifier or an array declarator (with or without a size
+ * expression). Declaration lists are comma separated.
+ */
+declaration_list: declaration                                                           { $$ = astnode_create(NODE_UNKNOWN, $1, NULL); }
+                | declaration_list ',' declaration                                      { $$ = astnode_create(NODE_UNKNOWN, $1, $3); $$->infix = ','; }
+                ;
+
+declaration: type_declaration identifier                                                { $$ = astnode_create(NODE_DECLARATION, $1, $2); } // Note: accepts only one type qualifier. Good or not?
+           | type_declaration array_declaration                                         { $$ = astnode_create(NODE_DECLARATION, $1, $2); }
+           ;
+
+array_declaration: identifier '[' ']'                                                   { $$ = astnode_create(NODE_UNKNOWN, $1, NULL); $$->infix = '['; $$->postfix = ']'; }
+                 | identifier '[' expression ']'                                        { $$ = astnode_create(NODE_UNKNOWN, $1, $3); $$->infix = '['; $$->postfix = ']'; }
+                 ;
+
+type_declaration: type_specifier                                                        { $$ = astnode_create(NODE_UNKNOWN, $1, NULL); }                 
+                | type_qualifier type_specifier                                         { $$ = astnode_create(NODE_UNKNOWN, $1, $2); }
+                ;
+
+/*
+ * =============================================================================
+ * Expressions
+ * =============================================================================
+ */
+/*
+ * Expressions. Binary expressions are parsed as a flat, left-recursive chain
+ * "unary (operator unary)*": the grammar declares no operator precedence and
+ * the operands and operators are kept in source order.
+ * Postfix forms cover subscripts, brace-enclosed lists with an optional
+ * "(type)" cast in front, function calls, "type(args)" casts and member
+ * access.
+ */
+expression_list: expression                                                             { $$ = astnode_create(NODE_UNKNOWN, $1, NULL); }
+               | expression_list ',' expression                                         { $$ = astnode_create(NODE_UNKNOWN, $1, $3); $$->infix = ','; }
+               ;
+
+expression: unary_expression                                                            { $$ = astnode_create(NODE_UNKNOWN, $1, NULL); }
+          | expression binary_expression                                                { $$ = astnode_create(NODE_UNKNOWN, $1, $2); }
+          ;
+
+binary_expression: binary_operator unary_expression                                     { $$ = astnode_create(NODE_UNKNOWN, $1, $2); }
+                 ;
+
+unary_expression: postfix_expression                                                    { $$ = astnode_create(NODE_UNKNOWN, $1, NULL); }
+                | unary_operator postfix_expression                                     { $$ = astnode_create(NODE_UNKNOWN, $1, $2); }
+                ;
+
+postfix_expression: primary_expression                                                  { $$ = astnode_create(NODE_UNKNOWN, $1, NULL); }
+                  | postfix_expression '[' expression_list ']' /* Subscript */          { $$ = astnode_create(NODE_MULTIDIM_SUBSCRIPT_EXPRESSION, $1, $3);    $$->infix = '['; $$->postfix = ']'; }
+                  | cast_expression '{' expression_list '}'    /* Array */              { $$ = astnode_create(NODE_UNKNOWN, $1, $3); $$->infix = '{'; $$->postfix = '}'; }
+                  | postfix_expression '(' ')'                 /* Function call */      { $$ = astnode_create(NODE_UNKNOWN, $1, NULL);  $$->infix = '('; $$->postfix = ')'; }
+                  | postfix_expression '(' expression_list ')' /* Function call */      { $$ = astnode_create(NODE_UNKNOWN, $1, $3);    $$->infix = '('; $$->postfix = ')'; }
+                  | type_specifier '(' expression_list ')'     /* Cast */               { $$ = astnode_create(NODE_UNKNOWN, $1, $3);  $$->infix = '('; $$->postfix = ')'; }
+                  | postfix_expression '.' identifier          /* Member access */      { $$ = astnode_create(NODE_UNKNOWN, $1, $3);    $$->infix = '.'; }
+                  ;
+
+cast_expression: /* Empty: implicit cast */                                             { $$ = astnode_create(NODE_UNKNOWN, NULL, NULL); }
+               | '(' type_specifier ')'                                                 { $$ = astnode_create(NODE_UNKNOWN, $2, NULL); $$->prefix = '('; $$->postfix = ')'; }
+               ;
+
+primary_expression: identifier                                                          { $$ = astnode_create(NODE_UNKNOWN, $1, NULL); }
+                  | number                                                              { $$ = astnode_create(NODE_UNKNOWN, $1, NULL); }
+                  | '(' expression ')'                                                  { $$ = astnode_create(NODE_UNKNOWN, $2, NULL); $$->prefix = '('; $$->postfix = ')'; }
+                  ;
+
+
+
+/*
+ * =============================================================================
+ * Terminals
+ * =============================================================================
+ */
+
+/*
+ * Leaf nodes. Single-character operators store yytext[0] in `infix`,
+ * multi-character operators, identifiers, numbers and `return` store a copy of
+ * yytext in the node buffer, and keywords store their token code in `token`.
+ * NOTE(review): these actions read the global yytext, so they rely on the
+ * reduction running before the lexer has matched the next token -- confirm
+ * this holds for the generated parser.
+ */
+binary_operator: '+'                                                                    { $$ = astnode_create(NODE_UNKNOWN, NULL, NULL); $$->infix = yytext[0]; }
+               | '-'                                                                    { $$ = astnode_create(NODE_UNKNOWN, NULL, NULL); $$->infix = yytext[0]; }
+               | '/'                                                                    { $$ = astnode_create(NODE_UNKNOWN, NULL, NULL); $$->infix = yytext[0]; }
+               | '*'                                                                    { $$ = astnode_create(NODE_UNKNOWN, NULL, NULL); $$->infix = yytext[0]; }
+               | '<'                                                                    { $$ = astnode_create(NODE_UNKNOWN, NULL, NULL); $$->infix = yytext[0]; }
+               | '>'                                                                    { $$ = astnode_create(NODE_UNKNOWN, NULL, NULL); $$->infix = yytext[0]; }           
+               | LEQU                                                                   { $$ = astnode_create(NODE_UNKNOWN, NULL, NULL); astnode_set_buffer(yytext, $$); }
+               | LAND                                                                   { $$ = astnode_create(NODE_UNKNOWN, NULL, NULL); astnode_set_buffer(yytext, $$); }
+               | LOR                                                                    { $$ = astnode_create(NODE_UNKNOWN, NULL, NULL); astnode_set_buffer(yytext, $$); }
+               | LLEQU                                                                  { $$ = astnode_create(NODE_UNKNOWN, NULL, NULL); astnode_set_buffer(yytext, $$); }
+               ;
+
+unary_operator: '-' /* C-style casts are disallowed, would otherwise be defined here */ { $$ = astnode_create(NODE_UNKNOWN, NULL, NULL); $$->infix = yytext[0]; }
+              | '!'                                                                     { $$ = astnode_create(NODE_UNKNOWN, NULL, NULL); $$->infix = yytext[0]; }
+              | INPLACE_INC                                                             { $$ = astnode_create(NODE_UNKNOWN, NULL, NULL); $$->token = INPLACE_INC; }
+              | INPLACE_DEC                                                             { $$ = astnode_create(NODE_UNKNOWN, NULL, NULL); $$->token = INPLACE_DEC; }
+              ;
+
+type_qualifier: KERNEL                                                                  { $$ = astnode_create(NODE_TYPE_QUALIFIER, NULL, NULL); $$->token = KERNEL; }
+              | PREPROCESSED                                                            { $$ = astnode_create(NODE_TYPE_QUALIFIER, NULL, NULL); $$->token = PREPROCESSED; }
+              | CONSTANT                                                                { $$ = astnode_create(NODE_TYPE_QUALIFIER, NULL, NULL); $$->token = CONSTANT; }
+              | IN                                                                      { $$ = astnode_create(NODE_TYPE_QUALIFIER, NULL, NULL); $$->token = IN; }
+              | OUT                                                                     { $$ = astnode_create(NODE_TYPE_QUALIFIER, NULL, NULL); $$->token = OUT; }
+              | UNIFORM                                                                 { $$ = astnode_create(NODE_TYPE_QUALIFIER, NULL, NULL); $$->token = UNIFORM; }
+              ;
+
+type_specifier: VOID                                                                    { $$ = astnode_create(NODE_TYPE_SPECIFIER, NULL, NULL); $$->token = VOID; }
+              | INT                                                                     { $$ = astnode_create(NODE_TYPE_SPECIFIER, NULL, NULL); $$->token = INT; }
+              | INT3                                                                    { $$ = astnode_create(NODE_TYPE_SPECIFIER, NULL, NULL); $$->token = INT3; }
+              | SCALAR                                                                  { $$ = astnode_create(NODE_TYPE_SPECIFIER, NULL, NULL); $$->token = SCALAR;  }
+              | VECTOR                                                                  { $$ = astnode_create(NODE_TYPE_SPECIFIER, NULL, NULL); $$->token = VECTOR;  }
+              | MATRIX                                                                  { $$ = astnode_create(NODE_TYPE_SPECIFIER, NULL, NULL); $$->token = MATRIX;  }
+              ;
+
+identifier: IDENTIFIER                                                                  { $$ = astnode_create(NODE_IDENTIFIER, NULL, NULL); astnode_set_buffer(yytext, $$); }
+          ;
+
+number: NUMBER                                                                          { $$ = astnode_create(NODE_UNKNOWN, NULL, NULL); astnode_set_buffer(yytext, $$); }
+      ;
+
+return: RETURN                                                                          { $$ = astnode_create(NODE_UNKNOWN, NULL, NULL); astnode_set_buffer(yytext, $$); }
+      ;
+
+%%
+
+/* Debug helper: prints the lexer's current token text (yytext) and a newline to stdout. */
+void
+print(void)
+{
+    printf("%s\n", yytext);
+}
+
+/*
+ * Error callback invoked by the generated parser.
+ *
+ * Writes the parser's message, the current input line and the offending token
+ * (first character code and full text) to stderr.
+ *
+ * Returns 0. The function is declared to return int, so it must return a
+ * value on every path; falling off the end is undefined behaviour as soon as
+ * a caller reads the result.
+ */
+int
+yyerror(const char* str)
+{
+    fprintf(stderr, "%s on line %d when processing char %d: [%s]\n", str, yyget_lineno(), *yytext, yytext);
+    return 0;
+}
--- a/acc/src/ast.h
+++ b/acc/src/ast.h
@@ -0,0 +1,126 @@
+/*
+    Nodes for the Abstract Syntax Tree
+
+    Statement: syntactic unit that expresses some action.
+    May have internal components, expressions, which are evaluated
+
+    Statements: return value
+                block
+*/
+#include <stdlib.h>
+#include <assert.h>
+
+#define BUFFER_SIZE (4096)
+
+// Expanders for FOR_NODE_TYPES: GEN_ID yields the bare identifier (used to
+// build the NodeType enum), GEN_STR yields the identifier as a string literal.
+#define GEN_ID(X) X
+#define GEN_STR(X) #X
+
+// X-macro listing every AST node type exactly once. FUNC is applied to each
+// name and the results are comma separated, so the list can be expanded into
+// an enum body or an array initializer.
+#define FOR_NODE_TYPES(FUNC) \
+    FUNC(NODE_UNKNOWN), \
+    FUNC(NODE_DEFINITION), \
+    FUNC(NODE_GLOBAL_DEFINITION), \
+    FUNC(NODE_DECLARATION), \
+    FUNC(NODE_TYPE_QUALIFIER), \
+    FUNC(NODE_TYPE_SPECIFIER), \
+    FUNC(NODE_IDENTIFIER), \
+    FUNC(NODE_FUNCTION_DEFINITION), \
+    FUNC(NODE_FUNCTION_DECLARATION), \
+    FUNC(NODE_COMPOUND_STATEMENT), \
+    FUNC(NODE_FUNCTION_PARAMETER_DECLARATION), \
+    FUNC(NODE_MULTIDIM_SUBSCRIPT_EXPRESSION)
+
+/* 
+// Recreating strdup is not needed when using the GNU compiler.
+// Let's also just say that anything but the GNU
+// compiler is NOT supported, since there are also
+// some gcc-specific calls in the files generated 
+// by flex and being completely compiler-independent is
+// not a priority right now
+#ifndef strdup 
+static inline char*
+strdup(const char* in)
+{
+    const size_t len = strlen(in) + 1;
+    char* out = malloc(len);
+
+    if (out) {
+        memcpy(out, in, len);
+        return out;
+    } else {
+        return NULL;
+    }
+}
+#endif
+*/
+
+// Kinds of AST nodes, generated from FOR_NODE_TYPES. NUM_NODE_TYPES is not a
+// node kind; it equals the number of kinds listed before it.
+typedef enum {
+    FOR_NODE_TYPES(GEN_ID),
+    NUM_NODE_TYPES
+} NodeType;
+
+// Binary AST node. Children are owned by the node (astnode_destroy frees them
+// recursively together with `buffer`).
+typedef struct astnode_s {
+    int id;                 // Sequence number assigned by astnode_create
+    struct astnode_s* lhs;  // Left child (NULL if absent)
+    struct astnode_s* rhs;  // Right child (NULL if absent)
+    NodeType type;          // Type of the AST node
+    char* buffer;           // Identifiers and other strings (NULL by default)
+
+    int token;              // Type of a terminal (that is not a simple char)
+    int prefix;            // Tokens emitted before, between and after the
+    int infix;             // children. Storing them here means the grammar
+    int postfix;           // need not be split into max two-child rules
+} ASTNode;
+
+
+/*
+ * Allocates and initializes a new AST node.
+ *
+ * type     - kind of the node
+ * lhs, rhs - children, either may be NULL; the node stores the pointers as-is
+ *
+ * Returns the new node with a unique, monotonically increasing id. All token
+ * fields (token, prefix, infix, postfix) are 0 and buffer is NULL.
+ * Aborts the program if the allocation fails.
+ */
+static inline ASTNode*
+astnode_create(const NodeType type, ASTNode* lhs, ASTNode* rhs)
+{
+    // calloc zero-fills the node so that no field is left indeterminate
+    ASTNode* node = calloc(1, sizeof(node[0]));
+    if (!node)
+        abort();
+
+    static int id_counter = 0;
+    node->id     = id_counter++;
+    node->type   = type;
+    node->lhs    = lhs;
+    node->rhs    = rhs;
+    node->buffer = NULL;
+
+    // `token` must be cleared as well: only some grammar actions assign it
+    node->token  = 0;
+    node->prefix = node->infix = node->postfix = 0;
+
+    return node;
+}
+
+/*
+ * Stores a private copy of `buffer` in `node`.
+ *
+ * A string attached by an earlier call is released first, so calling this
+ * repeatedly on the same node does not leak. The copy is freed by
+ * astnode_destroy.
+ */
+static inline void
+astnode_set_buffer(const char* buffer, ASTNode* node)
+{
+    free(node->buffer); // free(NULL) is a no-op
+    node->buffer = strdup(buffer);
+}
+
+/*
+ * Recursively frees `node`, its children and its string buffer.
+ * Passing NULL is allowed and does nothing, mirroring free().
+ */
+static inline void
+astnode_destroy(ASTNode* node)
+{
+    if (!node)
+        return;
+
+    // The NULL guard above makes the recursive calls safe for absent children
+    astnode_destroy(node->lhs);
+    astnode_destroy(node->rhs);
+    free(node->buffer);
+    free(node);
+}
+
+
+extern ASTNode* root;
+
+/*
+typedef enum {
+    SCOPE_BLOCK
+} ScopeType;
+
+typedef struct symbol_s {
+    int type_specifier;
+    char* identifier;
+    int scope;
+    struct symbol_s* next;
+} Symbol;
+
+extern ASTNode* symbol_table;
+*/
--- a/acc/src/code_generator.c
+++ b/acc/src/code_generator.c
@@ -0,0 +1,569 @@
+/*
+    Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
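+/**
+ * @file
+ * \brief Code generator of the acc DSL compiler.
+ *
+ * Walks the AST built by the parser (acc.y) and prints the translated source
+ * to stdout, using a token translation table and a flat symbol table.
+ *
+ */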
+
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "acc.tab.h"
+#include "ast.h"
+
+// Root of the AST; the parser attaches the parsed program to root->lhs.
+ASTNode* root = NULL;
+
+// Prefix prepended to the names of global in/out symbols in the generated code
+static const char inout_name_prefix[] = "handle_";
+// Selects how in/out function parameters are emitted (see
+// translate_latest_symbol): as raw pointers when true, as "...Data&" otherwise
+static bool doing_stencil_assembly    = true;
+
+/*
+ * =============================================================================
+ * Translation
+ * =============================================================================
+ */
+// Maps a token code to the text emitted for it. The table is indexed both by
+// the parser's named token codes and by plain character codes; entries that
+// are not listed are NULL.
+#define TRANSLATION_TABLE_SIZE (1024)
+static const char* translation_table[TRANSLATION_TABLE_SIZE] = {
+    [0] = NULL,
+    // Control flow
+    [IF]    = "if",
+    [ELSE]  = "else",
+    [ELIF]  = "else if",
+    [WHILE] = "while",
+    [FOR]   = "for",
+    // Type specifiers
+    [VOID]   = "void",
+    [INT]    = "int",
+    [INT3]   = "int3",
+    [SCALAR] = "AcReal",
+    [VECTOR] = "AcReal3",
+    [MATRIX] = "AcMatrix",
+    // Type qualifiers
+    [KERNEL] = "template <int step_number>  static "
+               "__global__", //__launch_bounds__(RK_THREADBLOCK_SIZE,
+                             // RK_LAUNCH_BOUND_MIN_BLOCKS),
+    [PREPROCESSED] = "static __device__ "
+                     "__forceinline__",
+    [CONSTANT] = "const",
+    [IN]       = "in",
+    [OUT]      = "out",
+    [UNIFORM]  = "uniform",
+    // ETC
+    [INPLACE_INC] = "++",
+    [INPLACE_DEC] = "--",
+    // Single-character tokens, indexed by their character code
+    [','] = ",",
+    [';'] = ";\n",
+    ['('] = "(",
+    [')'] = ")",
+    ['['] = "[",
+    [']'] = "]",
+    ['{'] = "{\n",
+    ['}'] = "}\n",
+    ['='] = "=",
+    ['+'] = "+",
+    ['-'] = "-",
+    ['/'] = "/",
+    ['*'] = "*",
+    ['<'] = "<",
+    ['>'] = ">",
+    ['!'] = "!",
+    ['.'] = "."};
+
+/*
+ * Returns the output text for `token`, or NULL for token 0 ("no token").
+ *
+ * `token` must lie in [0, TRANSLATION_TABLE_SIZE). A non-zero token without a
+ * table entry is a generator bug: it is reported and then trips an assert.
+ * The report goes to stderr because stdout carries the generated code and,
+ * being buffered, may lose the message when the assert aborts.
+ */
+static const char*
+translate(const int token)
+{
+    assert(token >= 0);
+    assert(token < TRANSLATION_TABLE_SIZE);
+
+    const char* const text = translation_table[token];
+    if (token > 0) {
+        if (!text)
+            fprintf(stderr, "ERROR: unidentified token %d\n", token);
+        assert(text);
+    }
+
+    return text;
+}
+
+/*
+ * =============================================================================
+ * Symbols
+ * =============================================================================
+ */
+// Category of a symbol table entry
+typedef enum {
+    SYMBOLTYPE_FUNCTION,
+    SYMBOLTYPE_FUNCTION_PARAMETER,
+    SYMBOLTYPE_OTHER,
+    NUM_SYMBOLTYPES
+} SymbolType;
+
+// Size of Symbol::identifier, including the terminating '\0'
+#define MAX_ID_LEN (128)
+typedef struct {
+    SymbolType type;
+    int type_qualifier;          // Token code of the qualifier (0 if none)
+    int type_specifier;          // Token code of the type
+    char identifier[MAX_ID_LEN]; // Name of the symbol
+} Symbol;
+
+// Flat, unordered symbol table; entries [0, num_symbols) are in use
+#define SYMBOL_TABLE_SIZE (4096)
+static Symbol symbol_table[SYMBOL_TABLE_SIZE] = {};
+static int num_symbols                        = 0;
+
+/*
+ * Linear search of the symbol table by name.
+ *
+ * Returns the index of the first entry whose identifier equals `identifier`,
+ * or -1 if there is none or if `identifier` is NULL.
+ */
+static int
+symboltable_lookup(const char* identifier)
+{
+    if (identifier) {
+        int idx = 0;
+        while (idx < num_symbols) {
+            if (!strcmp(identifier, symbol_table[idx].identifier))
+                return idx;
+            ++idx;
+        }
+    }
+
+    return -1;
+}
+
+/*
+ * Appends a symbol to the symbol table.
+ *
+ * type       - category of the symbol
+ * tqualifier - token code of the type qualifier (0 if none)
+ * tspecifier - token code of the type specifier
+ * id         - name of the symbol; must be shorter than MAX_ID_LEN
+ *
+ * Exits with an error message if `id` does not fit into Symbol::identifier,
+ * instead of letting strcpy write past the end of the fixed-size buffer.
+ */
+static void
+add_symbol(const SymbolType type, const int tqualifier, const int tspecifier, const char* id)
+{
+    assert(num_symbols < SYMBOL_TABLE_SIZE);
+    assert(id);
+
+    if (strlen(id) >= MAX_ID_LEN) {
+        fprintf(stderr, "ERROR: identifier '%s' is longer than %d characters\n", id,
+                MAX_ID_LEN - 1);
+        exit(EXIT_FAILURE);
+    }
+
+    Symbol* symbol         = &symbol_table[num_symbols];
+    symbol->type           = type;
+    symbol->type_qualifier = tqualifier;
+    symbol->type_specifier = tspecifier;
+    strcpy(symbol->identifier, id); // Length verified above
+
+    ++num_symbols;
+}
+
+/*
+ * Removes the symbol at `handle` by overwriting it with the last entry and
+ * shrinking the table by one. The order of the remaining symbols is not
+ * preserved.
+ */
+static void
+rm_symbol(const int handle)
+{
+    assert(handle >= 0 && handle < num_symbols);
+
+    const int last = num_symbols - 1;
+    if (handle != last)
+        symbol_table[handle] = symbol_table[last];
+    num_symbols = last;
+}
+
+/*
+ * Prints the translated qualifier, the translated type and the identifier of
+ * the symbol at `handle`, each followed by a space. Fields without a
+ * translation (NULL) are skipped.
+ *
+ * `handle` must be a valid table index; negative values are rejected too,
+ * since they would otherwise index in front of the table.
+ */
+static void
+print_symbol(const int handle)
+{
+    assert(handle >= 0);
+    assert(handle < SYMBOL_TABLE_SIZE);
+
+    const char* fields[]    = {translate(symbol_table[handle].type_qualifier),
+                            translate(symbol_table[handle].type_specifier),
+                            symbol_table[handle].identifier};
+    const size_t num_fields = sizeof(fields) / sizeof(fields[0]);
+
+    // size_t index avoids a signed/unsigned comparison with num_fields
+    for (size_t i = 0; i < num_fields; ++i)
+        if (fields[i])
+            printf("%s ", fields[i]);
+}
+
+/*
+ * Prints the generated-code form of the most recently added symbol.
+ *
+ * The output depends on the symbol category and its qualifier:
+ *  - functions: a decorator line followed by the (possibly prefixed) name
+ *  - function parameters: in/out parameters become pointers or "...Data&"
+ *    references depending on doing_stencil_assembly, others are printed as-is
+ *  - uniforms: nothing is printed
+ *  - global in/out symbols: a "handle_"-prefixed constant
+ *  - everything else: printed as-is
+ *
+ * The symbol table must not be empty.
+ */
+static void
+translate_latest_symbol(void)
+{
+    const int handle = num_symbols - 1;
+    assert(handle >= 0); // num_symbols == 0 would index in front of the table
+    assert(handle < SYMBOL_TABLE_SIZE);
+
+    const Symbol* symbol = &symbol_table[handle];
+    const bool is_inout  = symbol->type_qualifier == IN || symbol->type_qualifier == OUT;
+
+    // FUNCTION
+    if (symbol->type == SYMBOLTYPE_FUNCTION) {
+        // KERNEL FUNCTION
+        if (symbol->type_qualifier == KERNEL) {
+            printf("%s %s\n%s", translate(symbol->type_qualifier),
+                   translate(symbol->type_specifier), symbol->identifier);
+        }
+        // PREPROCESSED FUNCTION
+        else if (symbol->type_qualifier == PREPROCESSED) {
+            printf("%s %s\npreprocessed_%s", translate(symbol->type_qualifier),
+                   translate(symbol->type_specifier), symbol->identifier);
+        }
+        // OTHER FUNCTION
+        else {
+            const char* regular_function_decorator = "static __device__ "
+                                                     "__forceinline__";
+            const char* qualifier                  = translate(symbol->type_qualifier);
+            printf("%s %s %s\n%s", regular_function_decorator, qualifier ? qualifier : "",
+                   translate(symbol->type_specifier), symbol->identifier);
+        }
+    }
+    // FUNCTION PARAMETER
+    else if (symbol->type == SYMBOLTYPE_FUNCTION_PARAMETER) {
+        if (is_inout) {
+            if (doing_stencil_assembly)
+                printf("const __restrict__ %s* %s", translate(symbol->type_specifier),
+                       symbol->identifier);
+            else
+                printf("const %sData& %s", translate(symbol->type_specifier), symbol->identifier);
+        }
+        else {
+            print_symbol(handle);
+        }
+    }
+    // UNIFORM
+    else if (symbol->type_qualifier == UNIFORM) {
+        /* Do nothing */
+    }
+    // IN / OUT (function parameters were already handled above)
+    else if (is_inout) {
+        const char* inout_type_qualifier = "static __device__ const auto";
+        printf("%s %s%s", inout_type_qualifier, inout_name_prefix, symbol->identifier);
+    }
+    // OTHER
+    else {
+        print_symbol(handle);
+    }
+}
+
+/*
+ * Debug helper: prints one line per symbol in the form
+ * "<index>: <qualifier> <type> <identifier> (<category>)".
+ */
+static void
+print_symbol_table(void)
+{
+    for (int handle = 0; handle < num_symbols; ++handle) {
+        printf("%d: ", handle);
+
+        // Qualifier, type and identifier, each followed by a space
+        print_symbol(handle);
+
+        switch (symbol_table[handle].type) {
+        case SYMBOLTYPE_FUNCTION:
+            printf("(function)");
+            break;
+        case SYMBOLTYPE_FUNCTION_PARAMETER:
+            printf("(function parameter)");
+            break;
+        default:
+            printf("(other)");
+            break;
+        }
+        printf("\n");
+    }
+}
+
+/*
+ * =============================================================================
+ * State
+ * =============================================================================
+ */
+// Flags describing which construct the AST traversal is currently inside of.
+// They are set by traverse() when it enters a node of the matching type.
+static bool inside_declaration                    = false;
+static bool inside_function_declaration           = false;
+static bool inside_function_parameter_declaration = false;
+
+// Set when a Kernel / Preprocessed type qualifier has been encountered
+static bool inside_kernel       = false;
+static bool inside_preprocessed = false;
+
+// NOTE(review): not used in the visible part of the file; presumably the
+// symbol table index at which the current scope begins -- confirm.
+static int scope_start = 0;
+
+/*
+ * =============================================================================
+ * AST traversal
+ * =============================================================================
+ */
+
+/** Prints the translated token and the raw text buffer of a node, if present. */
+static void
+print_token_and_buffer(const ASTNode* node)
+{
+    const char* token = translate(node->token);
+    if (token)
+        printf("%s ", token);
+    if (node->buffer)
+        printf("%s ", node->buffer);
+}
+
+/**
+ * Recursively translates the subtree rooted at `node` and prints the generated
+ * code to stdout.
+ *
+ * Each node is handled in the order: prefix, left child, infix + token/buffer,
+ * right child, postfix. Along the way the function
+ *   - maintains the file-level inside_* flags that describe the enclosing
+ *     context,
+ *   - injects the kernel / preprocessed boilerplate macros and parameters,
+ *   - rewrites references to UNIFORM symbols into DCONST_* accesses, and
+ *   - registers every declaration in the symbol table, removing the symbols
+ *     added after the latest function declaration once a function definition
+ *     has been fully visited.
+ *
+ * Output is suppressed while inside a declaration; the declaration is instead
+ * emitted via translate_latest_symbol() after its subtree has been visited.
+ *
+ * @param node Root of the subtree to translate. Must not be NULL.
+ */
+static void
+traverse(const ASTNode* node)
+{
+    // Prefix logic %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+    if (node->type == NODE_FUNCTION_DECLARATION)
+        inside_function_declaration = true;
+    if (node->type == NODE_FUNCTION_PARAMETER_DECLARATION)
+        inside_function_parameter_declaration = true;
+    if (node->type == NODE_DECLARATION)
+        inside_declaration = true;
+
+    if (!inside_declaration && translate(node->prefix))
+        printf("%s", translate(node->prefix));
+
+    // BOILERPLATE START////////////////////////////////////////////////////////
+    if (node->type == NODE_TYPE_QUALIFIER && node->token == KERNEL)
+        inside_kernel = true;
+
+    // Kernel parameter boilerplate
+    const char* kernel_parameter_boilerplate = "GEN_KERNEL_PARAM_BOILERPLATE, ";
+    if (inside_kernel && node->type == NODE_FUNCTION_PARAMETER_DECLARATION)
+        printf("%s ", kernel_parameter_boilerplate);
+
+    // Kernel builtin variables boilerplate (read input/output arrays and setup
+    // indices)
+    // NOTE(review): this fires for every compound statement visited while
+    // inside_kernel is set, so nested blocks would presumably get the
+    // boilerplate (and the writeback below) as well -- confirm against the
+    // grammar.
+    const char* kernel_builtin_variables_boilerplate = "GEN_KERNEL_BUILTIN_VARIABLES_"
+                                                       "BOILERPLATE();";
+    if (inside_kernel && node->type == NODE_COMPOUND_STATEMENT) {
+        printf("%s ", kernel_builtin_variables_boilerplate);
+
+        for (int i = 0; i < num_symbols; ++i) {
+            const Symbol* symbol = &symbol_table[i];
+            if (symbol->type_qualifier == IN) {
+                printf("const %sData %s = READ(%s%s);\n", translate(symbol->type_specifier),
+                       symbol->identifier, inout_name_prefix, symbol->identifier);
+            }
+            else if (symbol->type_qualifier == OUT) {
+                printf("%s %s = READ_OUT(%s%s);", translate(symbol->type_specifier),
+                       symbol->identifier, inout_name_prefix, symbol->identifier);
+            }
+        }
+    }
+
+    // Preprocessed parameter boilerplate
+    if (node->type == NODE_TYPE_QUALIFIER && node->token == PREPROCESSED)
+        inside_preprocessed = true;
+    static const char
+        preprocessed_parameter_boilerplate[] = "const int3 vertexIdx, ";
+    if (inside_preprocessed && node->type == NODE_FUNCTION_PARAMETER_DECLARATION)
+        printf("%s ", preprocessed_parameter_boilerplate);
+    // BOILERPLATE END////////////////////////////////////////////////////////
+
+    // Enter LHS
+    if (node->lhs)
+        traverse(node->lhs);
+
+    // Infix logic  %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+    if (!inside_declaration && translate(node->infix))
+        printf("%s ", translate(node->infix));
+
+    if (node->type == NODE_FUNCTION_DECLARATION)
+        inside_function_declaration = false;
+
+    // If the node is a subscript expression and the expression list inside it is not empty
+    if (node->type == NODE_MULTIDIM_SUBSCRIPT_EXPRESSION && node->rhs)
+        printf("IDX(");
+
+    // Translate the node itself
+    if (!inside_declaration) {
+        const int handle           = symboltable_lookup(node->buffer);
+        const Symbol* const symbol = handle >= 0 ? &symbol_table[handle] : NULL;
+
+        if (symbol && symbol->type_qualifier == UNIFORM) {
+            // Uniforms are read through the DCONST_* accessors
+            if (symbol->type_specifier == SCALAR)
+                printf("DCONST_REAL(AC_%s) ", symbol->identifier);
+            else if (symbol->type_specifier == INT)
+                printf("DCONST_INT(AC_%s) ", symbol->identifier);
+            else
+                printf("INVALID UNIFORM type specifier %s with %s\n",
+                       translate(symbol->type_specifier), symbol->identifier);
+        }
+        else {
+            // Unknown identifiers and non-uniform symbols are printed as-is
+            print_token_and_buffer(node);
+        }
+    }
+
+    if (node->type == NODE_FUNCTION_DECLARATION) {
+        scope_start = num_symbols;
+    }
+
+    // Enter RHS
+    if (node->rhs)
+        traverse(node->rhs);
+
+    // Postfix logic  %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+    // If the node is a subscript expression and the expression list inside it is not empty
+    if (node->type == NODE_MULTIDIM_SUBSCRIPT_EXPRESSION && node->rhs)
+        printf(")"); // Closing bracket of IDX()
+
+    // Generate writeback boilerplate for OUT fields
+    if (inside_kernel && node->type == NODE_COMPOUND_STATEMENT) {
+        for (int i = 0; i < num_symbols; ++i) {
+            if (symbol_table[i].type_qualifier == OUT) {
+                printf("WRITE_OUT(%s%s, %s);\n", inout_name_prefix, symbol_table[i].identifier,
+                       symbol_table[i].identifier);
+            }
+        }
+    }
+
+    if (!inside_declaration && translate(node->postfix))
+        printf("%s", translate(node->postfix));
+
+    if (node->type == NODE_DECLARATION) {
+        inside_declaration = false;
+
+        // Pick the type qualifier/specifier tokens from the children of the
+        // left subtree (0 if absent)
+        int tqual = 0;
+        int tspec = 0;
+        if (node->lhs && node->lhs->lhs) {
+            if (node->lhs->lhs->type == NODE_TYPE_QUALIFIER)
+                tqual = node->lhs->lhs->token;
+            else if (node->lhs->lhs->type == NODE_TYPE_SPECIFIER)
+                tspec = node->lhs->lhs->token;
+        }
+        if (node->lhs && node->lhs->rhs) {
+            if (node->lhs->rhs->type == NODE_TYPE_SPECIFIER)
+                tspec = node->lhs->rhs->token;
+        }
+
+        // Determine symbol type
+        SymbolType symboltype = SYMBOLTYPE_OTHER;
+        if (inside_function_declaration)
+            symboltype = SYMBOLTYPE_FUNCTION;
+        else if (inside_function_parameter_declaration)
+            symboltype = SYMBOLTYPE_FUNCTION_PARAMETER;
+
+        // Determine identifier. The declarator pointers are checked before
+        // use so that a malformed tree yields a diagnostic instead of a crash.
+        const ASTNode* declarator = node->rhs;
+        if (!declarator) {
+            fprintf(stderr, "acc: declaration without a declarator, skipped\n");
+        }
+        else if (declarator->type == NODE_IDENTIFIER) {
+            add_symbol(symboltype, tqual, tspec, declarator->buffer); // Ordinary
+            translate_latest_symbol();
+        }
+        else if (declarator->lhs) {
+            add_symbol(symboltype, tqual, tspec, declarator->lhs->buffer); // Array
+            translate_latest_symbol();
+            // Traverse the expression once again, this time with
+            // "inside_declaration" flag off. translate() may return NULL, which
+            // must not be handed to a %s conversion.
+            const char* infix = translate(declarator->infix);
+            if (infix)
+                printf("%s ", infix);
+            if (declarator->rhs)
+                traverse(declarator->rhs);
+            const char* postfix = translate(declarator->postfix);
+            if (postfix)
+                printf("%s ", postfix);
+        }
+        else {
+            fprintf(stderr, "acc: array declaration without an identifier, skipped\n");
+        }
+    }
+
+    if (node->type == NODE_FUNCTION_PARAMETER_DECLARATION)
+        inside_function_parameter_declaration = false;
+
+    if (node->type == NODE_FUNCTION_DEFINITION) {
+        // Drop the symbols that were added after the function declaration
+        while (num_symbols > scope_start)
+            rm_symbol(num_symbols - 1);
+
+        inside_kernel       = false;
+        inside_preprocessed = false;
+    }
+}
+
+// TODO: these should use the generic type names SCALAR and VECTOR
+/**
+ * Prints the helper code built around the PREPROCESSED symbols to stdout:
+ *   1. a struct with one member per PREPROCESSED symbol,
+ *   2. a read_data() function that fills the struct by calling
+ *      preprocessed_<identifier>() for each member,
+ *   3. one accessor function per member, and
+ *   4. a three-component variant of the struct with its own read_data().
+ */
+static void
+generate_preprocessed_structures(void)
+{
+    // 1. Struct declaration
+    printf("\n");
+    printf("typedef struct {\n");
+    for (int idx = 0; idx < num_symbols; ++idx) {
+        const Symbol* sym = &symbol_table[idx];
+        if (sym->type_qualifier != PREPROCESSED)
+            continue;
+        printf("%s %s;\n", translate(sym->type_specifier), sym->identifier);
+    }
+    printf("} %sData;\n", translate(SCALAR));
+
+    // 2. Function that fills the struct
+    printf("static __device__ __forceinline__ AcRealData\
+            read_data(const int3 vertexIdx,\
+            AcReal* __restrict__ buf[], const int handle)\
+            {\n\
+                %sData data;\n",
+           translate(SCALAR));
+
+    for (int idx = 0; idx < num_symbols; ++idx) {
+        const Symbol* sym = &symbol_table[idx];
+        if (sym->type_qualifier != PREPROCESSED)
+            continue;
+        printf("data.%s = preprocessed_%s(vertexIdx, buf[handle]);\n", sym->identifier,
+               sym->identifier);
+    }
+    printf("return data;\n");
+    printf("}\n");
+
+    // 3. Accessor for each member of the struct
+    for (int idx = 0; idx < num_symbols; ++idx) {
+        const Symbol* sym = &symbol_table[idx];
+        if (sym->type_qualifier != PREPROCESSED)
+            continue;
+        printf("static __device__ __forceinline__ %s\
+                    %s(const AcRealData& data)\
+                    {\n\
+                        return data.%s;\
+                    }\n",
+               translate(sym->type_specifier), sym->identifier, sym->identifier);
+    }
+
+    // 4. Syntactic sugar: a vector version of the struct
+    printf("\
+        typedef struct {\
+            AcRealData x;\
+            AcRealData y;\
+            AcRealData z;\
+        } AcReal3Data;\
+        \
+        static __device__ __forceinline__ AcReal3Data\
+        read_data(const int3 vertexIdx,\
+                  AcReal* __restrict__ buf[], const int3& handle)\
+        {\
+            AcReal3Data data;\
+        \
+            data.x = read_data(vertexIdx, buf, handle.x);\
+            data.y = read_data(vertexIdx, buf, handle.y);\
+            data.z = read_data(vertexIdx, buf, handle.z);\
+        \
+            return data;\
+        }\
+    ");
+}
+
+/**
+ * Entry point of the code generator.
+ *
+ * Expects exactly one flag:
+ *   -sas  generate code for the stencil assembly stage
+ *   -sps  generate code for the stencil processing stage
+ * An unrecognised flag falls back to stencil assembly with a warning.
+ *
+ * The input is parsed with yyparse() and the translated code is printed to
+ * stdout. Diagnostics are written to stderr so that they do not end up in the
+ * generated code.
+ *
+ * @return EXIT_SUCCESS on success, EXIT_FAILURE on a usage or parse error.
+ */
+int
+main(int argc, char** argv)
+{
+    if (argc != 2) {
+        fprintf(stderr, "Usage: ./acc [flags]\n"
+                        "Flags:\n"
+                        "\t-sas - Generates code for the stencil assembly stage\n"
+                        "\t-sps - Generates code for the stencil processing "
+                        "stage\n");
+        fprintf(stderr, "\n");
+        return EXIT_FAILURE;
+    }
+
+    if (!strcmp(argv[1], "-sas")) {
+        doing_stencil_assembly = true;
+    }
+    else if (!strcmp(argv[1], "-sps")) {
+        doing_stencil_assembly = false;
+    }
+    else {
+        // The warning promises stencil assembly, so select it explicitly
+        // instead of relying on the initial value of the flag
+        fprintf(stderr, "Unknown flag %s. Generating stencil assembly.\n", argv[1]);
+        doing_stencil_assembly = true;
+    }
+
+    root = astnode_create(NODE_UNKNOWN, NULL, NULL);
+
+    const int retval = yyparse();
+    if (retval) {
+        fprintf(stderr, "COMPILATION FAILED\n");
+        return EXIT_FAILURE;
+    }
+
+    // Traverse
+    traverse(root);
+    if (doing_stencil_assembly)
+        generate_preprocessed_structures();
+
+    // print_symbol_table();
+
+    // Cleanup
+    astnode_destroy(root);
+    return EXIT_SUCCESS;
+}
--- a/acc/test_grammar.sh
+++ b/acc/test_grammar.sh
@@ -0,0 +1,48 @@
+#!/bin/bash
+# Grammar smoke test: runs build_acc.sh, compiles the two sample stencil files
+# with compile.sh, wraps the generated headers in a minimal CUDA source file
+# and compiles and runs that file with nvcc.
+
+# Only operate in the same directory with this script. The path is quoted so
+# that directories containing spaces work, and the script stops if the
+# directory cannot be entered.
+cd "$(dirname "$0")" || exit 1
+
+./build_acc.sh
+
+mkdir -p testbin
+./compile.sh samples/sample_stencil_process.sps
+./compile.sh samples/sample_stencil_assembly.sas
+
+mv stencil_process.cuh testbin/
+mv stencil_assembly.cuh testbin/
+
+# The three %s placeholders are filled with the header names given after the
+# format string
+printf "
+#include <stdio.h>
+#include <stdlib.h>
+#include \"%s\" // i.e. astaroth.h
+
+__constant__ AcMeshInfo d_mesh_info;
+#define DCONST_INT(X)  (d_mesh_info.int_params[X])
+#define DCONST_REAL(X) (d_mesh_info.real_params[X])
+#define DEVICE_VTXBUF_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_mx) + (k)*DCONST_INT(AC_mxy))
+
+
+static __device__ __forceinline__ int
+IDX(const int i)
+{
+    return i;
+}
+
+static __device__ __forceinline__ int
+IDX(const int i, const int j, const int k)
+{
+    return DEVICE_VTXBUF_IDX(i, j, k);
+}
+
+static __device__ __forceinline__ int
+IDX(const int3 idx)
+{
+    return DEVICE_VTXBUF_IDX(idx.x, idx.y, idx.z);
+}
+
+#include \"%s\"
+#include \"%s\"
+int main(void) { printf(\"Grammar check complete.\\\nAll tests passed.\\\n\"); return EXIT_SUCCESS; }
+" common_header.h stencil_assembly.cuh stencil_process.cuh >testbin/test.cu
+
+cd testbin || exit 1
+nvcc -std=c++11 test.cu -I ../samples -o test && ./test
--- a/analysis/python/.gitignore
+++ b/analysis/python/.gitignore
@@ -0,0 +1 @@
+*.png
--- a/analysis/python/README.md
+++ b/analysis/python/README.md
@@ -0,0 +1,7 @@
+# Python directory
+
+This directory is for Python scripts connected to data visualization and analysis.
+
+The content of this directory should be structured so that it is always callable
+with `import astar`. More task-specific scripts should be written elsewhere,
+depending on the user's convenience.
--- a/analysis/python/add_to_pythonpath.sh
+++ b/analysis/python/add_to_pythonpath.sh
@@ -0,0 +1,3 @@
+# Appends the current directory to PYTHONPATH; source it so the export persists.
+# The separator is only added when PYTHONPATH is already set and non-empty.
+export PYTHONPATH="${PYTHONPATH:+${PYTHONPATH}:}${PWD}/"
--- a/analysis/python/astar/init.py
+++ b/analysis/python/astar/init.py
@@ -0,0 +1,24 @@
+'''
+    Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+'''
+
+# Developers note. We require Python 3 approach to have 
+# compatibility towards the future.
+
+import numpy as np 
+import pylab as plt
--- a/analysis/python/astar/data/init.py
+++ b/analysis/python/astar/data/init.py
@@ -0,0 +1,21 @@
+'''
+    Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+'''
+
+
+from . import read
--- a/analysis/python/astar/data/read.py
+++ b/analysis/python/astar/data/read.py
@@ -0,0 +1,142 @@
+'''
+    Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+'''
+
+# This module is for reading data.
+
+import numpy as np
+
+def read_bin(fname, fdir, fnum, minfo, numtype=np.longdouble):
+    '''Read in a floating point array'''
+    filename = fdir + fname + '_' + fnum + '.mesh'
+    datas = np.DataSource()
+    read_ok = datas.exists(filename)
+    if read_ok:
+        print(filename)
+        array = np.fromfile(filename, dtype=numtype)
+
+        timestamp = array[0]
+
+        array = np.reshape(array[1:], (minfo.contents['AC_mx'], 
+                                   minfo.contents['AC_my'], 
+                                   minfo.contents['AC_mz']), order='F')
+    else:
+        array = None
+        timestamp = None
+     
+    return array, timestamp, read_ok 
+
+def read_meshtxt(fdir, fname):
+    with open(fdir+fname) as f:
+        filetext = f.read().splitlines()
+
+    contents = {}  
+
+    for line in filetext:
+        line = line.split()
+        if line[0] == 'int':
+            contents[line[1]] = np.int(line[2])
+        elif line[0] == 'real':
+            contents[line[1]] = np.float(line[2])
+        else: 
+            print('ERROR: ' + line[0] +' no recognized!')
+
+    return contents
+
+class MeshInfo():
+    '''Object that contains all mesh info'''
+
+    def __init__(self, fdir):
+        self.contents = read_meshtxt(fdir, 'mesh_info.list') 
+
+class Mesh:
+    '''Class tha contains all 3d mesh data'''
+
+    def __init__(self, fnum, fdir=""):
+        fnum = str(fnum)
+        self.framenum = fnum.zfill(10)
+
+        self.minfo = MeshInfo(fdir)
+
+        self.lnrho, self.timestamp, self.ok = read_bin('VTXBUF_LNRHO', fdir, fnum, self.minfo)
+
+        if self.ok:
+
+            self.ss, timestamp, ok = read_bin('VTXBUF_ENTROPY', fdir, fnum, self.minfo)
+ 
+            #TODO Generalize is a dict. Do not hardcode!  
+            uux, timestamp, ok = read_bin('VTXBUF_UUX', fdir, fnum, self.minfo)
+            uuy, timestamp, ok = read_bin('VTXBUF_UUY', fdir, fnum, self.minfo) 
+            uuz, timestamp, ok = read_bin('VTXBUF_UUZ', fdir, fnum, self.minfo)
+            self.uu = (uux, uuy, uuz)
+            uux = []
+            uuy = [] 
+            uuz = []
+ 
+            aax, timestamp, ok = read_bin('VTXBUF_AX', fdir, fnum, self.minfo)
+            aay, timestamp, ok = read_bin('VTXBUF_AY', fdir, fnum, self.minfo) 
+            aaz, timestamp, ok = read_bin('VTXBUF_AZ', fdir, fnum, self.minfo)
+            self.aa = (aax, aay, aaz)
+            aax = []
+            aay = [] 
+            aaz = []
+
+            self.xx =  self.minfo.contents['AC_inv_dsx']*np.arange(self.minfo.contents['AC_mx'])
+            self.yy =  self.minfo.contents['AC_inv_dsy']*np.arange(self.minfo.contents['AC_my'])
+            self.zz =  self.minfo.contents['AC_inv_dsz']*np.arange(self.minfo.contents['AC_mz'])
+
+            self.xmid = int(self.minfo.contents['AC_mx']/2)
+            self.ymid = int(self.minfo.contents['AC_my']/2)
+            self.zmid = int(self.minfo.contents['AC_mz']/2)
+
+
+def parse_ts(fdir, fname):
+    with open(fdir+fname) as f:
+        filetext = f.read().splitlines()
+
+    var = {}  
+
+    line = filetext[0].split()
+    for i in range(len(line)):
+        line[i] = line[i].replace('VTXBUF_', "")
+        line[i] = line[i].replace('UU', "uu")
+        line[i] = line[i].replace('_total', "tot")
+        line[i] = line[i].replace('A', "aa")
+        line[i] = line[i].replace('LNRHO', "lnrho")
+        line[i] = line[i].replace('X', "x")
+        line[i] = line[i].replace('Y', "y")
+        line[i] = line[i].replace('Z', "z")
+
+    tsdata = np.loadtxt(fdir+fname,skiprows=1)
+
+    for i in range(len(line)):
+        var[line[i]] = tsdata[:,i]
+
+    var['step'] = np.int64(var['step'])
+
+    print("HERE ARE ALL KEYS FOR TS DATA:")
+    print(var.keys())
+   
+    return var
+
+class TimeSeries:
+    '''Class for time series data'''
+
+    def __init__(self, fdir="", fname="timeseries.ts"):
+
+        self.var = parse_ts(fdir, fname)
--- a/analysis/python/astar/visual/init.py
+++ b/analysis/python/astar/visual/init.py
@@ -0,0 +1,21 @@
+'''
+    Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+'''
+
+
+from . import slices
--- a/analysis/python/astar/visual/slices.py
+++ b/analysis/python/astar/visual/slices.py
@@ -0,0 +1,92 @@
+
+'''
+    Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+'''
+import pylab as plt 
+import numpy as np 
+import matplotlib.gridspec as gridspec
+import matplotlib.colors as colors
+
+CM_INFERNO = plt.get_cmap('inferno')
+
+def plot_3(mesh, input_grid, title = '', fname = 'default', bitmap=False, slicetype = 'middle', colrange=None, colormap=CM_INFERNO , contourplot=False):
+    fig = plt.figure(figsize=(8, 8))
+    grid = gridspec.GridSpec(2, 3, wspace=0.4, hspace=0.4, width_ratios=[1,1, 0.15])
+    ax00   = fig.add_subplot( grid[0,0] )
+    ax10   = fig.add_subplot( grid[0,1] )
+    ax11   = fig.add_subplot( grid[1,1] )
+    axcbar = fig.add_subplot( grid[:,2] )
+
+    print(mesh.minfo.contents.keys())
+
+    if slicetype == 'middle':
+        yz_slice = input_grid[mesh.xmid, :, :]
+        xz_slice = input_grid[:, mesh.ymid, :]
+        xy_slice = input_grid[:, :, mesh.zmid]
+        if colrange==None:
+            plotnorm = colors.Normalize(vmin=input_grid.min(),vmax=input_grid.max()) 
+        else:
+            plotnorm = colors.Normalize(vmin=colrange[0],vmax=colrange[1]) 
+    elif slicetype == 'sum':
+        yz_slice = np.sum(input_grid, axis=0) 
+        xz_slice = np.sum(input_grid, axis=1) 
+        xy_slice = np.sum(input_grid, axis=2) 
+        cmin = np.amin([yz_slice.min(), xz_slice.min(), xy_slice.min()])
+        cmax = np.amax([yz_slice.max(), xz_slice.max(), xy_slice.max()])
+        if colrange==None:
+            plotnorm = colors.Normalize(vmin=cmin,vmax=cmax) 
+        else:
+            plotnorm = colors.Normalize(vmin=colrange[0],vmax=colrange[1]) 
+        
+    
+    yy, zz = np.meshgrid(mesh.yy, mesh.zz, indexing='ij')
+    if contourplot:
+        map1 = ax00.contourf(yy, zz, yz_slice, norm=plotnorm, cmap=colormap, nlev=10)
+    else:
+        map1 = ax00.pcolormesh(yy, zz, yz_slice, norm=plotnorm, cmap=colormap)
+    ax00.set_xlabel('y')
+    ax00.set_ylabel('z')
+    ax00.set_title('%s t = %.4e' % (title, mesh.timestamp) )    
+    ax00.set_aspect('equal')
+    
+    xx, zz = np.meshgrid(mesh.xx, mesh.zz, indexing='ij')
+    if contourplot:
+        ax10.contourf(xx, zz, xz_slice, norm=plotnorm, cmap=colormap, nlev=10)
+    else:
+        ax10.pcolormesh(xx, zz, xz_slice, norm=plotnorm, cmap=colormap)
+    ax10.set_xlabel('x')
+    ax10.set_ylabel('z')
+    ax10.set_aspect('equal')
+    
+    xx, yy = np.meshgrid(mesh.xx, mesh.yy, indexing='ij')
+    if contourplot:
+        ax11.contourf(xx, yy, xy_slice, norm=plotnorm, cmap=colormap, nlev=10)
+    else:
+        ax11.pcolormesh(xx, yy, xy_slice, norm=plotnorm, cmap=colormap)
+    ax11.set_xlabel('x')
+    ax11.set_ylabel('y')
+    ax11.set_aspect('equal')
+    
+    cbar = plt.colorbar(map1, cax=axcbar) 
+
+    if bitmap:
+        plt.savefig('%s_%s.png' % (fname, mesh.framenum))
+        print('Saved %s_%s.png' % (fname, mesh.framenum))
+        plt.close(fig)
+         
+ 
--- a/analysis/python/calc/convert.sh
+++ b/analysis/python/calc/convert.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+# Usage: convert.sh <prefix>   Joins <prefix>_*.png into <prefix>_<date>.gif
+#gm convert -delay 40 colden_*.png colden.gif
+
+DATE=$(date '+%Y_%m_%d_%H_%M')
+
+echo "$DATE"
+
+gm convert -delay 15 "$1"_*.png "${1}_${DATE}.gif"
--- a/analysis/python/calc/galli_shu_plotter.py
+++ b/analysis/python/calc/galli_shu_plotter.py
@@ -0,0 +1,835 @@
+'''
+    Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+'''
+import numpy as np
+import pylab as plt
+import scipy as scp
+
+import matplotlib.colors as colors
+
+G_newton = 6.674e-8 #cm**3 g**-1 s**-2  
+
+# Time to convert to physical quantities
+yr  = 3.154e+7 #s 
+kyr = 1000.0*yr
+km = 1e5 #cm
+AU = 1.496e+13 #cm
+Msun = 1.98847e33 #g
+
+#cs0 = 20000.0   #cs cm/s "a" in Shu notation
+cs0 = 35000.0   #cs cm/s "a" in Shu notation
+B0  = 30e-6   #G 
+ksii = 11.3 # 
+
+#GS Eq. 10
+ttm = 9.03e12*(cs0/35000.0)/(B0/30e-6) 
+
+
+CM_INFERNO = plt.cm.get_cmap('inferno')
+
+
+
+
+
+
+def P_harmonics(theta, J=666):
+    #Vector spherical harmonics in e_r direction
+    if J == 0: 
+        P = np.ones_like(theta)  # 1.0 
+    elif J == 2:
+        cos_theta = np.cos(theta)
+        P = (1.0/2.0)*(3.0*(cos_theta**2.0) - 1.0)
+    else:
+        P = 0.0
+  
+    #print("P_2", P) 
+    return P 
+    
+
+def B_harmonics(theta, J=666):
+    #Vector spherical harmonics in e_theta direction
+    #print("B_harmonics theta", theta)
+    if J == 2:
+        sin_theta = np.abs(np.sin(theta))
+        cos_theta = np.cos(theta)
+        #B = -(3.0/np.sqrt(6.0))*cos_theta*sin_theta #Morse & Feshbach 1953 book
+        B = -3.0*cos_theta*sin_theta #GS93 Appendix B
+    else:
+        B = 0.0*theta
+
+    #print("B_harmonics", B)
+   
+    return B 
+
+def get_tau(tt): 
+    return tt/ttm
+
+def get_SHU77_potential(xx_point):
+    #Copied here again for convenience
+    m0 = 0.975 #Shu 77 core reduced mass
+    xx_SHU_table   = np.array([ 0.05,  0.10,  0.15,  0.20,  0.25, 
+                          0.30,  0.35,  0.40,  0.45,  0.50, 
+                          0.55,  0.60,  0.65,  0.70,  0.75, 
+                          0.80,  0.85,  0.90,  0.95,  1.00]) 
+    
+    mm_SHU77_table = np.array([0.981, 0.993,  1.01,  1.03,  1.05, 
+                          1.08,  1.12,  1.16,  1.20,  1.25, 
+                          1.30,  1.36,  1.42,  1.49,  1.56, 
+                          1.64,  1.72,  1.81,  1.90,  2.00]) 
+ 
+    xx = xx_SHU_table[  np.where(xx_SHU <= xx_point)]
+    mm = mm_SHU77_table[np.where(xx_SHU <= xx_point)]
+
+    psi = - m0/xx_point + np.trapz(mm/(xx**2.0), xx)
+
+    return psi
+
+
+def psi2(xx_SHU, mm_term, pp_term, J=666):
+    #GS93 Eq. 113
+    if J == 0: 
+        psi2 = - mm_term/xx_SHU + pp_term
+    elif J == 2:
+        psi2 = - mm_term/(xx_SHU**3.0) + (xx_SHU**2.0)*pp_term 
+    else:
+        psi2 = 0.0
+
+    #print('psi2', psi2, 'J', J, 'mm_term', mm_term, 'xx_SHU', xx_SHU, 'pp_term', pp_term)
+
+    return psi2
+
+# Calculate the directional parameter
+def dv_dx(xx,vv, alpha):
+    EE = alpha*(xx-vv) - 2.0/xx 
+    HH = (xx-vv)**2.0 - 1.0
+    return (EE/HH)*(xx-vv)
+
+def dalpha_dx(xx,vv, alpha):
+    EE = alpha*(alpha - (2.0/xx)*(xx-vv))
+    HH = (xx-vv)**2.0 - 1.0
+    return (EE/HH)*(xx-vv)
+
+def dpsi_dx(xx, mm):
+    return mm/(xx**2.0)
+
+def dmm_dx(xx, alpha):
+    return (xx**2.0)*alpha
+
+def dphi_dx(xx, alpha, mm, theta):
+    ff_zero_der = 0.5*mm*dmm_dx(xx, alpha)
+    sin_theta = np.sin(theta)
+    return ff_zero_der*(sin_theta*2.0) 
+
+
+def deltaspace(theta, tau):
+    #Assuming J= 0, 2 only
+    v0 = -2.222e-1
+    v2 = 2.177e-1
+    deltaJ2 = -(1.0/3.0)*((v0+2.0/3.0)*P_harmonics(theta, J=0) + (v2 - 2.0/3.0)*P_harmonics(theta, J=2))
+    delta   = 1 + (tau**2.0)*deltaJ2 
+    return delta
+
+def delta2(theta, tau):
+    #Assuming J= 0, 2 only
+    return deltaspace(theta, tau)**2.0
+
+def yy_transform(xx_SHU, alpha_SHU77, alpha_mono_GS93, alpha_quad_GS93):
+    
+    
+
+    return alpha_mono_GS93, alpha_quad_GS93 
+
+# Calculating the perturbation stage
+def alpha_perturb(tau, xx_SHU, vv_SHU77, alpha_SHU77, alpha_mono_GS93, alpha_quad_GS93, theta):
+    #Assuming J= 0, 2 only
+    directional = xx_SHU*dalpha_dx(xx_SHU, vv_SHU77, alpha_SHU77)*delta2(theta, tau)
+    directional = 0.0 # 
+    alpha       = alpha_mono_GS93*P_harmonics(theta, J=0) + alpha_quad_GS93*P_harmonics(theta, J=2) + directional
+    return alpha
+
+def vv_perturb(tau, xx_SHU, vv_SHU77, alpha_SHU77, vv_ww_mono_GS93, vv_ww_quad_GS93, theta):
+    #Assuming J= 0, 2 only
+    directional = xx_SHU*dv_dx(xx_SHU, vv_SHU77, alpha_SHU77)*delta2(theta, tau)
+    directional = 0.0 # 
+    vv_mono  = vv_ww_mono_GS93[0]
+    vv_quad  = vv_ww_quad_GS93[0]
+    ww_mono  = vv_ww_mono_GS93[1]
+    ww_quad  = vv_ww_quad_GS93[1]
+    #print('vv_mono, vv_quad, ww_mono, ww_quad', vv_mono, vv_quad, ww_mono, ww_quad)
+    vv_r     = vv_mono*P_harmonics(theta, J=0) + vv_quad*P_harmonics(theta, J=2) + directional ## vv
+    vv_theta = ww_mono*B_harmonics(theta, J=0) + ww_quad*B_harmonics(theta, J=2) + directional ## ww
+    #print("vv_r, vv_theta", vv_r, vv_theta)
+    vv       = np.array([vv_r, vv_theta])
+    return vv
+
+def psi_perturb(tau, xx_SHU, mm_SHU77, mm_pp_mono_GS93, mm_pp_quad_GS93, theta):
+    #Assuming J= 0, 2 only
+    directional = xx_SHU*dpsi_dx(xx_SHU, mm_SHU77)*delta2(theta, tau)
+    directional = 0.0 # 
+    mm_mono  = mm_pp_mono_GS93[0]
+    mm_quad  = mm_pp_quad_GS93[0]
+    pp_mono  = mm_pp_mono_GS93[1]
+    pp_quad  = mm_pp_quad_GS93[1]
+
+    #print('mm_pp_mono_GS93', mm_pp_mono_GS93)
+    #print('mm_mono', mm_mono)
+    
+    psi      =   psi2(xx_SHU, mm_mono, pp_mono, J=0)*P_harmonics(theta, J=0) \
+               + psi2(xx_SHU, mm_quad, pp_quad, J=0)*P_harmonics(theta, J=2) \
+               + directional
+    
+    #print('psi_perturb', psi)
+ 
+    return psi
+
+def phi_vecpot_second_order(tau, xx_SHU, mm_SHU77, alpha_SHU77, FF_DD_mono_GS93, FF_DD_quad_GS93, theta):
+    directional = xx_SHU*dphi_dx(xx_SHU, alpha_SHU77, mm_SHU77, theta)*delta2(theta, tau)
+    directional = 0.0 # 
+    sin_theta = np.sin(theta)
+    #print(FF_DD_mono_GS93)
+    #print(FF_DD_quad_GS93)
+    #print(ksii, P_harmonics(theta, J=0), P_harmonics(theta, J=2))
+    mono_term = (FF_DD_mono_GS93[0] + (1.0/ksii)*FF_DD_mono_GS93[1])
+    quad_term = (FF_DD_quad_GS93[0] + (1.0/ksii)*FF_DD_quad_GS93[1])
+    phi_vecpot_second = (sin_theta**2.0)*( mono_term*P_harmonics(theta, J=0) \
+                                           + quad_term*P_harmonics(theta, J=2) ) \
+                                           + directional
+    return phi_vecpot_second
+
+def phi_vecpot_zero_order(xx_SHU, mm_SHU77, theta):
+    ff_zero = 0.25*(mm_SHU77**2.0)
+    sin_theta = np.sin(theta)
+    phi_vecpot_zero = ff_zero*(sin_theta*2.0)
+    return phi_vecpot_zero
+
+
+# Combining the perturbation stage.
+def alpha_xvec_tau(tau, xx_SHU, vv_SHU77, alpha_SHU77, alpha_mono_GS93, alpha_quad_GS93, theta):
+    alpha = alpha_SHU77 + (tau**2.0)*alpha_perturb(tau, xx_SHU, vv_SHU77, alpha_SHU77, alpha_mono_GS93, alpha_quad_GS93, theta)
+    return alpha
+
+def vv_xvec_tau(tau, xx_SHU, vv_SHU77, alpha_SHU77, vv_ww_mono_GS93, vv_ww_quad_GS93, theta):
+    vv = (tau**2.0)*vv_perturb(tau, xx_SHU, vv_SHU77, alpha_SHU77, vv_ww_mono_GS93, vv_ww_quad_GS93, theta)
+    #print("BF",vv, vv_ww_mono_GS93, vv_ww_quad_GS93) 
+    vv[0] = vv_SHU77 + vv[0]
+    vv[1] = 0.0      + vv[1]   #No poloidal velocity in Shu77
+    #print("AF",vv)
+    return vv 
+
+def psi_xvec_tau(tau, xx_SHU, mm_SHU77, mm_pp_mono, mm_pp_quad, theta):
+    #print("psi_xvec_tau --- tau, xx_SHU, mm_SHU7, mm_pp_mono, mm_pp_quad, theta", tau, xx_SHU, mm_SHU77, mm_pp_mono, mm_pp_quad, theta)
+    psi = (tau**2.0)*psi_perturb(tau, xx_SHU, mm_SHU77, mm_pp_mono, mm_pp_quad, theta)
+    psi77 = get_SHU77_potential(xx_SHU)
+    #print('psi77', psi77)
+    psi = psi77 + psi  
+    #print('psi_xvec_tau', psi)
+    return psi 
+
+
+def phi_vecpot_xvec_tau(tau, xx_SHU, mm_SHU77, alpha_SHU77, FF_DD_mono_GS93, FF_DD_quad_GS93, theta):
+    phi_vecpot_second = (tau**2.0)*phi_vecpot_second_order(tau, xx_SHU, mm_SHU77, alpha_SHU77, FF_DD_mono_GS93, FF_DD_quad_GS93, theta)
+    phi_vecpot_zero = phi_vecpot_zero_order(xx_SHU, mm_SHU77, theta)
+    phi_vecpot = phi_vecpot_zero + phi_vecpot_second 
+    return phi_vecpot 
+
+#Physical unit converion stage
+def rho_rt(tt, xx_SHU, vv_SHU77, alpha_SHU77, alpha_mono_GS93, alpha_quad_GS93, theta):
+    tau = get_tau(tt)
+    alpha_xvec = alpha_xvec_tau(tau, xx_SHU, vv_SHU77, alpha_SHU77, alpha_mono_GS93, alpha_quad_GS93, theta)
+    rho = (1.0/(4.0*np.pi*G_newton*(tt**2.0))) * alpha_xvec
+    return rho, alpha_xvec
+
+def uu_rt(tt, xx_SHU, vv_SHU77, alpha_SHU77, vv_ww_mono_GS93, vv_ww_quad_GS93, theta):
+    tau = get_tau(tt)
+    vv_xvec = vv_xvec_tau(tau, xx_SHU, vv_SHU77, alpha_SHU77, vv_ww_mono_GS93, vv_ww_quad_GS93, theta)
+    uu = cs0*vv_xvec
+    return uu, vv_xvec
+
+def grav_psi_rt(tt, xx_SHU, mm_SHU77, mm_pp_mono, mm_pp_quad, theta):
+    tau = get_tau(tt)
+    #print("tt , xx_SHU, mm_SHU77, mm_pp_mono, mm_pp_quad, theta", tt, xx_SHU, mm_SHU77, mm_pp_mono, mm_pp_quad, theta)
+    psi_xvec = psi_xvec_tau(tau, xx_SHU, mm_SHU77, mm_pp_mono, mm_pp_quad, theta)
+    Vpot     = (cs0**2.0)*psi_xvec
+    return Vpot, psi_xvec
+
+def vectorpot_rt(tt, xx_SHU, mm_SHU77, alpha_SHU77, FF_DD_mono_GS93, FF_DD_quad_GS93, theta):
+    tau = get_tau(tt)
+    phi_vecpot_xvec = phi_vecpot_xvec_tau(tau, xx_SHU, mm_SHU77, alpha_SHU77, FF_DD_mono_GS93, FF_DD_quad_GS93, theta)
+    Phi_flux = np.pi*B0*((cs0*tt)**2.0)*phi_vecpot_xvec
+    return Phi_flux, phi_vecpot_xvec
+
+
+
+###def match_xx(xx_rad, xx_SHU):
+###    xx_buffer = np.empty_like(xx_rad)
+###    stride = np.abs(xx_SHU[1] - xx_SHU[0])
+###    for xx in xx_SHU:
+###        #where  xx - stride <  xx_rad < xx + stride   -> xx_rad[i] = xx 
+###        #loc = np.where((xx_rad <= (xx + stride) and xx_rad > (xx - stride) ))
+###        loc = np.where(xx_rad <= (xx + stride) )
+###        print(loc)
+
+
+def get_shu_index(xx, xx_SHU):
+    stride = np.abs(xx_SHU[1] - xx_SHU[0])/2.0
+
+    #ishu = np.where((xx_SHU <= (xx + stride)) & (xx_SHU > (xx - stride)))[0]    
+
+
+    #TODO Now a purkka version. Do better. 
+    # Can be improve by taking the treatment of the actual low and high x cases. 
+    if (xx > xx_SHU[xx_SHU.size-1]):
+        ishu = xx_SHU.size-1 
+    elif (xx < xx_SHU[0]):
+        ishu = 0
+    else:
+        ishu = np.where((xx_SHU <= (xx + stride)) & (xx_SHU > (xx - stride)))[0]
+        #print("get_shu_index", ishu, ishu.size)
+        ishu = ishu[0]
+        #print("get_shu_index", ishu, ishu.size)
+
+    #print(ishu, xx_SHU[ishu], xx)
+
+    return ishu
+
+def plot_figure(tt, xx_horizontal_corners, xx_vertical_corners, xx_horizontal, xx_vertical, xxvar, physvar, 
+                vv_hor=np.array(None), vv_ver=np.array(None), uu_hor=np.array(None), uu_ver=np.array(None), 
+                title1=r"\alpha", title2=r"\rho", filetitle='density',
+                var_min=[None, None], var_max=[None, None], colmap=CM_INFERNO, normtype='log', 
+                streamlines = 0, contourplot = 0):
+
+    if var_min[0] != None:
+        if normtype == 'log':
+            mynorm1 = colors.LogNorm( vmin=var_min[0], vmax=var_max[0] )
+            mynorm2 = colors.LogNorm( vmin=var_min[1], vmax=var_max[1] )
+        else:
+            mynorm1 = colors.Normalize( vmin=var_min[0], vmax=var_max[0] )
+            mynorm2 = colors.Normalize( vmin=var_min[1], vmax=var_max[1] )
+    else:
+        mynorm1 = colors.Normalize( )
+        mynorm2 = colors.Normalize( )
+
+    if contourplot: 
+        if normtype =='cdensity':
+            numbers = np.arange(0, 20, dtype=np.float64)
+            contourlevs = 1e-20*(np.sqrt(2.0)**numbers)
+            contournorm = colors.LogNorm( vmin=contourlevs.min(), vmax=contourlevs.max() )
+        elif normtype =='cflux':
+            contourlevs = np.linspace(1.0, 1e31, num=20)
+            contournorm = colors.Normalize( vmin=contourlevs.min(), vmax=contourlevs.max() )
+        else: 
+            contourlevs = np.linspace(physvar.min(), physvar.max(), num=10)
+            contournorm = colors.Normalize( vmin=contourlevs.min(), vmax=contourlevs.max() )
+
+
+    ##rr_horizontal_corners = xx_horizontal_corners*(cs0*tt)/AU
+    ##rr_vertical_corners   = xx_vertical_corners*  (cs0*tt)/AU
+    ##rr_horizontal         = xx_horizontal*(cs0*tt)/AU
+    ##rr_vertical           = xx_vertical*  (cs0*tt)/AU
+
+    rr_horizontal_corners = xx_horizontal_corners*(cs0*tt)/1e17
+    rr_vertical_corners   = xx_vertical_corners*  (cs0*tt)/1e17
+    rr_horizontal         = xx_horizontal*(cs0*tt)/1e17
+    rr_vertical           = xx_vertical*  (cs0*tt)/1e17
+
+
+
+    figa, axa = plt.subplots(nrows=1, ncols=2, figsize=(16,6))
+    if contourplot:
+        mapa = axa[0].contourf(xx_horizontal, xx_vertical, xxvar, cmap=colmap, norm=mynorm1)
+        maprho = axa[1].contourf(rr_horizontal, rr_vertical, physvar, contourlevs, cmap=colmap, norm=contournorm)
+    else: 
+        mapa = axa[0].pcolormesh(xx_horizontal_corners, xx_vertical_corners, xxvar, cmap=colmap, norm=mynorm1 )
+        maprho = axa[1].pcolormesh(rr_horizontal_corners, rr_vertical_corners, physvar, cmap=colmap, norm=mynorm2)
+
+    #mapa = axa[0].contourf(xx_horizontal, xx_vertical, alpha, cmap=CM_INFERNO, norm=colors.LogNorm(vmin=0.1, vmax=50.0))
+    #maprho = axa[1].contourf(xx_horizontal*(cs0*tt)/AU, xx_vertical*(cs0*tt)/AU, rho, cmap=CM_INFERNO, norm=colors.LogNorm(vmin=1e15, vmax=1e20))
+
+    if vv_hor.any() != None:
+        if streamlines:
+            #vv_tot = np.sqrt(vv_hor**2.0 + vv_ver**2.0)
+            #vv_tot = np.log(vv_tot/vv_tot.max())
+            axa[0].streamplot(xx_horizontal, xx_vertical, vv_hor, vv_ver, color  = 'k')
+            axa[1].streamplot(rr_horizontal, rr_vertical, uu_hor, uu_ver, color = 'k' )
+        else:
+            axa[0].quiver(xx_horizontal, xx_vertical, vv_hor, vv_ver, pivot = 'middle')
+            axa[1].quiver(rr_horizontal, rr_vertical, uu_hor, uu_ver, pivot = 'middle')
+
+    fig.colorbar(mapa, ax=axa[0])
+    fig.colorbar(maprho, ax=axa[1])
+
+    tau    = get_tau(tt)
+    tt_kyr = tt/kyr
+    axa[0].set_title(r'$%s(x, \tau = %.3f)$ ' % (title1, tau))
+    axa[1].set_title(r'$%s(r, t = %.3f \mathrm{kyr})$ ' % (title2, tt_kyr))
+
+    axa[0].set_xlabel('x')
+    axa[0].set_ylabel('x')
+    #axa[1].set_xlabel('r (AU)')
+    #axa[1].set_ylabel('r (AU)')
+    axa[1].set_xlabel(r'r ($10^{17}$ cm)')
+    axa[1].set_ylabel(r'r ($10^{17}$ cm)' )
+
+    ##axa[1].set_xlim(0.0, 3e17/AU)
+    ##axa[1].set_ylim(0.0, 3e17/AU)
+    axa[1].set_xlim(0.0, 3.0)
+    axa[1].set_ylim(0.0, 3.0)
+
+    axa[0].set_aspect('equal', 'datalim')
+    #axa[1].set_aspect('equal', 'datalim')
+
+    figfile = '%s_%s.png' % (filetitle, str(numslice).zfill(6))
+    print(figfile)
+    figa.savefig(figfile)
+    plt.close(figa)
+
+
+
+xx_SHU      =  np.array([ 0.05,  0.10,  0.15,  0.20,  0.25, 
+                          0.30,  0.35,  0.40,  0.45,  0.50, 
+                          0.55,  0.60,  0.65,  0.70,  0.75, 
+                          0.80,  0.85,  0.90,  0.95,  1.00]) 
+
+alpha_SHU77 =  np.array([ 71.5,  27.8,  16.4,  11.5,  8.76, 
+                          7.09,  5.95,  5.14,  4.52,  4.04, 
+                          3.66,  3.35,  3.08,  2.86,  2.67, 
+                          2.50,  2.35,  2.22,  2.10,  2.00]) 
+
+vv_SHU77    = -np.array([ 5.44,  3.47,  2.58,  2.05,  1.68, 
+                          1.40,  1.18,  1.01, 0.861, 0.735, 
+                         0.625, 0.528, 0.442, 0.363, 0.291, 
+                         0.225, 0.163, 0.106, 0.051,  0.00]) 
+
+mm_SHU77    =  np.array([0.981, 0.993,  1.01,  1.03,  1.05, 
+                          1.08,  1.12,  1.16,  1.20,  1.25, 
+                          1.30,  1.36,  1.42,  1.49,  1.56, 
+                          1.64,  1.72,  1.81,  1.90,  2.00]) 
+
+
+
+
+#GS Table 1 
+
+alpha_mono_GS93 = np.array([    6.304,     2.600,     1.652,     1.156,  9.005e-1, 
+                             7.314e-1,  6.084e-1,  5.084e-1,  4.256e-1,  3.517e-1, 
+                             2.829e-1,  2.172e-1,  1.488e-1,  8.091e-2,  8.360e-3, 
+                            -6.826e-2, -1.512e-1, -2.406e-1, -3.382e-1, -4.444e-1]) 
+
+vv_ww_mono_GS93 = np.array([[4.372e-1,  3.335e-1,  2.390e-1,  1.918e-1,  1.522e-1,
+                             1.226e-1,  9.579e-2,  7.103e-2,  4.828e-2,  2.640e-2, 
+                             5.058e-3, -1.588e-2, -3.791e-2, -5.975e-2, -8.293e-2,
+                            -1.071e-1, -1.330e-1, -1.605e-1, -1.902e-1, -2.222e-1],
+                           [      0.0,       0.0,       0.0,       0.0,       0.0,
+                                  0.0,       0.0,       0.0,       0.0,       0.0,
+                                  0.0,       0.0,       0.0,       0.0,       0.0, 
+                                  0.0,       0.0,       0.0,       0.0,       0.0]])
+
+mm_pp_mono_GS93 = np.array([[8.634e-4, 1.959e-3, 3.560e-3, 5.661e-3, 8.235e-3,
+                             1.130e-2, 1.482e-2, 1.873e-2, 2.293e-2, 2.730e-2,
+                             3.166e-2, 3.579e-2, 3.935e-2, 4.196e-2, 4.312e-2,
+                             4.221e-2, 3.847e-2, 3.097e-2, 1.859e-2,      0.0],
+                           [      0.0,      0.0,      0.0,      0.0,      0.0,
+                                  0.0,      0.0,      0.0,      0.0,      0.0,
+                                  0.0,      0.0,      0.0,      0.0,      0.0,
+                                  0.0,      0.0,      0.0,      0.0,      0.0]])
+
+
+FF_DD_mono_GS93 = np.array([[   -1.130, -3.275e-1, -1.355e-1, -6.415e-2, -2.889e-2, #F
+                             -8.387e-3,  5.358e-3,  1.534e-2,  2.303e-2,  2.931e-2,
+                              3.454e-2,  3.888e-2,  4.225e-2,  4.442e-2,  4.504e-2,
+                              4.358e-2,  3.935e-2,  3.146e-2,  1.881e-2,      0.0],
+                           [  -1.246e1,    -3.168,    -1.141, -5.740e-1, -3.178e-1, #D
+                             -1.878e-1, -1.049e-1, -4.547e-2,  3.393e-4,  3.924e-2,
+                              7.431e-2,  1.070e-1,  1.376e-1,  1.650e-1,  1.867e-1,
+                              1.992e-1,  1.966e-1,  1.708e-1,  1.103e-1,       0.0]])
+
+
+
+#GS Table 2
+
+alpha_quad_GS93 = np.array([ -1.096e3, -1.191e2,  -3.148e1,  -1.158e1,    -5.105, 
+                               -2.456,   -1.217, -5.889e-1, -2.569e-1, -7.024e-2, 
+                             3.790e-2, 1.042e-1,  1.505e-1,  1.845e-1,  2.163e-1, 
+                             2.492e-1, 2.865e-1,  3.302e-1,  3.823e-1,  4.437e-1])
+
+vv_ww_quad_GS93 = np.array([[  -2.581,    -1.533, -8.072e-1, -5.666e-1, -3.905e-1, #v
+                            -2.790e-1, -1.928e-1, -1.254e-1, -7.156e-2, -2.614e-2, 
+                             1.267e-2,  4.650e-2,  7.724e-2,  1.042e-1,  1.288e-1,
+                             1.510e-1,  1.711e-1,  1.889e-1,  2.045e-1,  2.177e-1],
+                           [   -2.085,    -4.890,    -1.811, -8.842e-1, -4.816e-1, #w
+                            -2.807e-1, -1.628e-1, -8.779e-2, -3.852e-2, -4.481e-3,
+                             1.928e-2,  3.578e-2,  4.683e-2,  5.306e-2,  5.512e-2, 
+                             5.312e-2,  4.704e-2,  3.670e-2,  2.179e-2,  1.898e-3]])
+
+mm_pp_quad_GS93 = np.array([[-3.860e-5, -1.541e-4, -3.044e-4, -4.847e-4, -6.831e-4, #m
+                             -8.874e-4, -1.083e-3, -1.253e-3, -1.385e-3, -1.462e-3,
+                             -1.470e-3, -1.389e-3, -1.191e-3, -8.405e-4, -2.841e-4,
+                              5.579e-4,  1.800e-3,  3.609e-3,  6.218e-3,  9.951e-3],
+                            [ -7.539e1,    -7.275,    -1.730, -5.586e-1, -1.999e-1, #p
+                             -6.591e-1, -1.062e-2,  1.294e-2,  2.267e-2,  2.600e-2,
+                              2.625e-2,  2.500e-2,  2.294e-2,  2.046e-2,  1.769e-2,
+                              1.469e-2,  1.146e-2,  7.941e-3,  4.102e-3, -1.214e-4]])
+
+FF_DD_quad_GS93 = np.array([[   -2.253, -6.517e-1, -2.722e-1, -1.345e-1, -6.993e-2, #F
+                             -3.593e-2, -1.660e-2, -5.864e-3, -6.809e-4,  8.213e-4,
+                             -3.086e-4, -3.338e-3, -7.681e-3, -1.272e-2, -1.778e-2,
+                             -2.191e-2, -2.392e-2, -2.219e-2, -1.457e-2,  1.729e-3],
+                            [ -2.484e1,    -6.258,    -2.221,    -1.102, -6.127e-1, #D
+                             -3.645e-1, -2.213e-1, -1.297e-1, -7.020e-2, -1.112e-2,
+                             -2.139e-3, -1.615e-2,  2.744e-2,  3.252e-2,  3.269e-2,
+                              2.839e-2,  2.104e-2,  1.199e-2,  3.732e-3,       0.0]])
+
+
+tt = 0.3*ttm
+theta = 0.5*np.pi
+
+
+# Drop the last tabulated point (x = 1.00) from the arrays listed here.
+# NOTE(review): mm_SHU77, mm_pp_*_GS93 and FF_DD_*_GS93 are not trimmed in this
+# part of the script and so keep one more column than xx_SHU -- confirm intended.
+xx_SHU          = xx_SHU[:-1]  
+vv_SHU77        = vv_SHU77[:-1]
+alpha_SHU77     = alpha_SHU77[:-1]
+
+alpha_mono_GS93 = alpha_mono_GS93[:-1]
+alpha_quad_GS93 = alpha_quad_GS93[:-1]
+
+# Two-row tables: trim each row, then stack the rows back into a 2-D array.
+vv_ww_mono_GS93 = np.array([vv_ww_mono_GS93[0][:-1], vv_ww_mono_GS93[1][:-1]])
+vv_ww_quad_GS93 = np.array([vv_ww_quad_GS93[0][:-1], vv_ww_quad_GS93[1][:-1]])
+
+
+rho, alpha_xvec = rho_rt(tt, xx_SHU, vv_SHU77, alpha_SHU77, alpha_mono_GS93, alpha_quad_GS93, theta)
+
+rr = xx_SHU*cs0*tt 
+
+np.set_printoptions(linewidth=200)
+
+print(rho.shape)
+print(xx_SHU.shape)
+
+print(rho)
+print(xx_SHU)
+
+print(vv_ww_mono_GS93)
+print(vv_ww_quad_GS93)
+print(vv_ww_quad_GS93[0])
+print(vv_ww_quad_GS93[1])
+
+#plt.figure()
+#plt.plot(rr, rho)
+#
+#plt.figure()
+#plt.plot(xx_SHU, alpha_xvec, label = "GS93")
+#plt.plot(xx_SHU, alpha_SHU77, label = "Shu77")
+#plt.legend()
+
+
+#alpha_mono_yy, alpha_quad_yy, alpha_mono_yy = yy_transform(xx_SHU, alpha_SHU77, alpha_mono_GS93, alpha_quad_GS93)
+
+
+plt.figure()
+plt.plot(xx_SHU, alpha_SHU77, label=r"$\alpha^{(0)}$")
+plt.plot(xx_SHU, alpha_mono_GS93, label=r"$\alpha^{(2)}_0$")
+plt.plot(xx_SHU, alpha_quad_GS93, label=r"$\alpha^{(2)}_2$")
+plt.ylim([-5.0,5.0])
+plt.legend()
+plt.show()
+
+
+'''
+ii = 0
+theta_axis = np.linspace(0.0, np.pi)
+xx_theta = np.array([])
+
+print("PIIP")
+
+
+plt.figure()
+for ii in range(0,xx_SHU.size):
+    alpha_theta  = np.array([])
+    alpha_shuref = np.array([])
+    for theta in theta_axis: 
+        rho, alpha_xvec = rho_rt(tt, xx_SHU[ii], vv_SHU77[ii], alpha_SHU77[ii], alpha_mono_GS93[ii], alpha_quad_GS93[ii])
+        alpha_theta  = np.append(alpha_theta, alpha_xvec)
+        alpha_shuref = np.append(alpha_shuref, alpha_SHU77[ii])
+
+    plt.plot(alpha_theta, theta_axis, label = "GS93")
+    #plt.plot(alpha_shuref, theta_axis, label = "GS93")
+'''
+
+
+#Interpolate a mesh. 
+
+# One-dimensional axis for the mesh: the tabulated coordinates with 0.0 prepended.
+xx_SHU_GRID = np.insert(xx_SHU, 0, 0.0)
+print(xx_SHU_GRID)
+
+# Square mesh built from the same axis in both directions.
+xx_horizontal, xx_vertical = np.meshgrid(xx_SHU_GRID, xx_SHU_GRID,  indexing='xy') 
+# Angle of each mesh point measured from the vertical axis (arctan2(horizontal, vertical)).
+theta = np.arctan2(xx_horizontal, xx_vertical)
+
+#Take pcolormesh coordinate system into account, which marks corners instead of centre points. 
+# Grid spacing, taken from the first two points along the horizontal axis.
+dxx = np.abs(xx_horizontal[0,1] - xx_horizontal[0,0])
+    
+print(dxx)
+# Shift by half a cell so each value is drawn around its centre point.
+xx_horizontal_corners = xx_horizontal - dxx/2.0
+xx_vertical_corners   = xx_vertical - dxx/2.0 
+
+# Distance of each mesh point from the origin.
+xx_rad = np.sqrt(xx_horizontal**2.0 +  xx_vertical**2.0)
+
+
+
+
+fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(16,4))
+        
+map1 = ax[0].pcolormesh(xx_horizontal_corners, xx_vertical_corners, theta)
+map2 = ax[1].pcolormesh(xx_horizontal_corners, xx_vertical_corners, xx_rad)
+
+ax[0].set_title(r"$\theta$")
+ax[1].set_title(r"$x_\mathrm{rad}$")
+
+fig.colorbar(map1, ax=ax[0])
+fig.colorbar(map2, ax=ax[1])
+
+ax[0].set_aspect('equal', 'datalim')
+ax[1].set_aspect('equal', 'datalim')
+
+
+
+
+Pfig, Pax = plt.subplots(nrows=1, ncols=3, figsize=(16,4))
+
+print("P_harmonics(theta, J=0)", P_harmonics(theta, J=0))
+
+Pmap1 = Pax[0].pcolormesh(xx_horizontal_corners, xx_vertical_corners, P_harmonics(theta, J=0))
+Pmap2 = Pax[1].pcolormesh(xx_horizontal_corners, xx_vertical_corners, P_harmonics(theta, J=2))
+Pmap3 = Pax[2].pcolormesh(xx_horizontal_corners, xx_vertical_corners, deltaspace(theta, 0.5))
+
+Pax[0].set_title(r"$P_0(\theta)$")
+Pax[1].set_title(r"$P_2(\theta)$")
+Pax[2].set_title(r"$\Delta(\theta, \tau = 0.5)$")
+
+
+Pfig.colorbar(Pmap1, ax=Pax[0])
+Pfig.colorbar(Pmap2, ax=Pax[1])
+Pfig.colorbar(Pmap3, ax=Pax[2])
+
+Pax[0].set_aspect('equal', 'datalim')
+Pax[1].set_aspect('equal', 'datalim')
+Pax[2].set_aspect('equal', 'datalim')
+
+
+
+
+Bfig, Bax = plt.subplots(nrows=1, ncols=2, figsize=(16,4))
+
+print("B_harmonics(theta, J=0)", B_harmonics(theta, J=0))
+
+Bmap1 = Bax[0].pcolormesh(xx_horizontal_corners, xx_vertical_corners, B_harmonics(theta, J=0))
+Bmap2 = Bax[1].pcolormesh(xx_horizontal_corners, xx_vertical_corners, B_harmonics(theta, J=2))
+
+Bax[0].set_title(r"$B_0(\theta)$")
+Bax[1].set_title(r"$B_2(\theta)$")
+
+Bfig.colorbar(Bmap1, ax=Bax[0])
+Bfig.colorbar(Bmap2, ax=Bax[1])
+
+Bax[0].set_aspect('equal', 'datalim')
+Bax[1].set_aspect('equal', 'datalim')
+
+
+plt.show()
+
+
+
+##xx_horizontal_corners = np.append(xx_horizontal_corners, (np.amax(xx_horizontal_corners)+dxx)*np.ones((xx_horizontal_corners.shape[1],1)), axis=1)
+
+print(xx_horizontal_corners[-1,:])
+print(xx_horizontal_corners)
+
+##xx_horizontal_corners = np.vstack((xx_horizontal_corners, xx_horizontal_corners[-1,:]))
+##print(xx_horizontal_corners)
+
+##xx_vertical_corners   = np.append(xx_vertical_corners,   (np.amax(xx_vertical_corners)+dxx)*np.ones((1,xx_vertical_corners.shape[0])),   axis=0)
+
+print(xx_vertical_corners[:, -1])
+print(xx_vertical_corners)
+##xx_vertical_corners   =  np.hstack((xx_vertical_corners, xx_vertical_corners[:,-1])) 
+print(xx_vertical_corners)
+
+numslice = 0
+frametot = 201
+#frametot = 101
+#frametot = 11
+for tt in np.linspace(0.1, ttm, num=frametot):
+    
+    alpha      = np.empty_like(xx_rad)
+    alpha77    = np.empty_like(xx_rad)
+    rho        = np.empty_like(xx_rad)
+
+    vv_rad     = np.empty_like(xx_rad)
+    vv_pol     = np.empty_like(xx_rad)
+    uu_rad     = np.empty_like(xx_rad)
+    uu_pol     = np.empty_like(xx_rad)
+
+    psi        = np.empty_like(xx_rad)
+    Vpot       = np.empty_like(xx_rad)
+
+    Delta      = np.empty_like(xx_rad)
+
+    Phi_flux = np.empty_like(xx_rad)
+    phi_vecpot     = np.empty_like(xx_rad)
+
+
+    alpha_2_J  = np.empty_like(xx_rad)
+
+    for ii in range(xx_SHU_GRID.size):
+        for kk in range(xx_SHU_GRID.size):
+            xx    = xx_rad[ii,kk]
+            th    = theta[ii,kk]
+            ishu  = get_shu_index(xx, xx_SHU)
+            rho[ii, kk], alpha[ii, kk] = rho_rt(tt, xx_SHU[ishu],
+                                                vv_SHU77[ishu],
+                                                alpha_SHU77[ishu],
+                                                alpha_mono_GS93[ishu],
+                                                alpha_quad_GS93[ishu], th)
+            alpha77[ii, kk] = alpha_SHU77[ishu]
+
+            vv_ww_mono_point = vv_ww_mono_GS93[:, ishu]
+            vv_ww_quad_point = vv_ww_quad_GS93[:, ishu]
+            uu_dump, vv_dump =  uu_rt(tt, xx_SHU[ishu], vv_SHU77[ishu], alpha_SHU77[ishu], vv_ww_mono_point, vv_ww_quad_point, th)
+            vv_rad[ii, kk]  = vv_dump[0] 
+            vv_pol[ii, kk]  = vv_dump[1] 
+            uu_rad[ii, kk] = uu_dump[0] 
+            uu_pol[ii, kk] = uu_dump[1] 
+
+            mm_pp_mono_point = mm_pp_mono_GS93[:, ishu]
+            mm_pp_quad_point = mm_pp_quad_GS93[:, ishu]
+            Vpot[ii, kk], psi[ii, kk] = grav_psi_rt(tt, xx_SHU[ishu], mm_SHU77[ishu], mm_pp_mono_point, mm_pp_quad_point, th)
+
+            Phi_flux[ii, kk], phi_vecpot[ii, kk] = vectorpot_rt(tt, xx_SHU[ishu], mm_SHU77[ishu], alpha_SHU77[ishu], 
+                                                                FF_DD_mono_GS93[:, ishu], 
+                                                                FF_DD_quad_GS93[:, ishu], th)
+
+            Delta[ii, kk] = deltaspace(th, get_tau(tt))
+            alpha_2_J[ii, kk] = alpha_mono_GS93[ishu]*P_harmonics(th, J=0) + alpha_quad_GS93[ishu]*P_harmonics(th, J=2) 
+
+
+    vv_hor =   vv_pol*np.cos(theta) + vv_rad*np.sin(theta)
+    vv_ver = - vv_pol*np.sin(theta) + vv_rad*np.cos(theta)
+    uu_hor =   uu_pol*np.cos(theta) + uu_rad*np.sin(theta)
+    uu_ver = - uu_pol*np.sin(theta) + uu_rad*np.cos(theta)
+
+
+    rho77 = alpha77 * (1.0/(4.0*np.pi*G_newton)*tt) #TODO WRONG COEFFS!!! 
+
+
+    #Apply mask
+    
+    rad_mask = 0.2
+
+   
+    alpha = np.ma.masked_where(xx_rad < rad_mask, alpha)
+    rho   = np.ma.masked_where(xx_rad < rad_mask, rho)
+
+    vv_rad = np.ma.masked_where(xx_rad < rad_mask, vv_rad) 
+    uu_rad = np.ma.masked_where(xx_rad < rad_mask, uu_rad) 
+    vv_pol = np.ma.masked_where(xx_rad < rad_mask, vv_pol) 
+    uu_pol = np.ma.masked_where(xx_rad < rad_mask, uu_pol) 
+
+    vv_hor = np.ma.masked_where(xx_rad < rad_mask, vv_hor)
+    vv_ver = np.ma.masked_where(xx_rad < rad_mask, vv_ver)
+    uu_hor = np.ma.masked_where(xx_rad < rad_mask, uu_hor)
+    uu_ver = np.ma.masked_where(xx_rad < rad_mask, uu_ver)
+
+    psi  = np.ma.masked_where(xx_rad < rad_mask, psi )
+    Vpot = np.ma.masked_where(xx_rad < rad_mask, Vpot)
+
+    phi_vecpot = np.ma.masked_where(xx_rad < rad_mask, phi_vecpot)
+    Phi_flux   = np.ma.masked_where(xx_rad < rad_mask, Phi_flux  )
+
+    alpha_2_J = np.ma.masked_where(xx_rad < rad_mask, alpha_2_J)
+    Delta     = np.ma.masked_where(xx_rad < rad_mask, Delta    )
+
+    plot_figure(tt, xx_horizontal_corners, xx_vertical_corners, xx_horizontal, xx_vertical, alpha, rho, 
+                vv_hor=vv_hor, vv_ver=vv_ver, uu_hor=uu_hor, uu_ver=uu_ver,
+                title1=r"\alpha", title2=r"\rho", filetitle='GS93density',
+                streamlines = 1, contourplot=1, 
+                var_min=[0.00, 1e15], var_max=[16, 1e21], 
+                normtype = 'cdensity')
+
+    plot_figure(tt, xx_horizontal_corners, xx_vertical_corners, xx_horizontal, xx_vertical, alpha77, rho77, 
+                #var_min=[0.00, 0], var_max=[16, 1e20], 
+                title1=r"\alpha", title2=r"\rho", filetitle='S77density')
+
+    plot_figure(tt, xx_horizontal_corners, xx_vertical_corners, xx_horizontal, xx_vertical, vv_rad, uu_rad, 
+                vv_hor=vv_hor, vv_ver=vv_ver, uu_hor=uu_hor, uu_ver=uu_ver,
+                title1=r"v_r", title2=r"u_r", filetitle='GS93velocity_rad',
+                var_min=[-2.5, -2.5*cs0], var_max=[0.0, 0.0*cs0], 
+                normtype = 'lin')
+
+    plot_figure(tt, xx_horizontal_corners, xx_vertical_corners, xx_horizontal, xx_vertical, vv_pol, uu_pol, 
+                vv_hor=vv_hor, vv_ver=vv_ver, uu_hor=uu_hor, uu_ver=uu_ver,
+                title1=r"v_\theta", title2=r"u_\theta", filetitle='GS93velocity_pol',
+                var_min=[0.0, 0.0*cs0], var_max=[0.5, 0.5*cs0], 
+                normtype = 'lin')
+
+    
+    plot_figure(tt, xx_horizontal_corners, xx_vertical_corners, xx_horizontal, xx_vertical, psi, Vpot, 
+                vv_hor=vv_hor, vv_ver=vv_ver, uu_hor=uu_hor, uu_ver=uu_ver,
+                title1=r"\psi", title2=r"V_\mathrm{pot}", filetitle='GS93gravpot',
+                var_min=[12.0, 12.0*(cs0**2.0)], var_max=[21.0, 21.0*(cs0**2.0)], 
+                normtype = 'lin')
+
+    plot_figure(tt, xx_horizontal_corners, xx_vertical_corners, xx_horizontal, xx_vertical, phi_vecpot, Phi_flux, 
+                title1=r"\phi", title2=r"\Phi_\mathrm{flux}", filetitle='GS93vecpot',
+                vv_hor=vv_hor, vv_ver=vv_ver, uu_hor=uu_hor, uu_ver=uu_ver,
+                streamlines = 1, contourplot=1,
+                normtype = 'cflux')
+
+    plot_figure(tt, xx_horizontal_corners, xx_vertical_corners, xx_horizontal, xx_vertical, np.sqrt(vv_hor**2.0 + vv_ver**2.0), np.sqrt(uu_hor**2.0 + uu_ver**2.0), 
+                title1=r"|v|", title2=r"|u| (cm/s)", filetitle='GS93vel2',
+                var_min=[0.0, 0.0*cs0], var_max=[2.5, 2.5*cs0], 
+                vv_hor=vv_hor, vv_ver=vv_ver, uu_hor=uu_hor, uu_ver=uu_ver,
+                streamlines = 1,  
+                normtype = 'lin')
+
+    
+    ##plot_figure(tt, xx_horizontal_corners, xx_vertical_corners, xx_horizontal, xx_vertical, Delta, Delta,  
+    ##            title1=r"\Delta", title2=r"\Delta", filetitle='Delta',
+    ##            normtype = 'lin')
+
+    ##plot_figure(tt, xx_horizontal_corners, xx_vertical_corners, xx_horizontal, xx_vertical, alpha_2_J, alpha_2_J,  
+    ##            title1=r"\sum \alpha^{(2)}_J", title2=r"\sum \alpha^{(2)}_J", filetitle='alpha_2_J', 
+    ##            normtype = 'lin')
+
+    numslice += 1 
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
--- a/analysis/python/calc/purge.sh
+++ b/analysis/python/calc/purge.sh
@@ -0,0 +1 @@
+rm *.png
--- a/analysis/python/calc/shu_selfsim.py
+++ b/analysis/python/calc/shu_selfsim.py
@@ -0,0 +1,279 @@
+'''
+    Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+'''
+import numpy as np
+import pylab as plt
+
+G_newton = 6.674e-8 #cm**3 g**-1 s**-2  
+
+def dv_dx(xx,vv, alpha):
+    EE = alpha*(xx-vv) - 2.0/xx 
+    HH = (xx-vv)**2.0 - 1.0
+    return (EE/HH)*(xx-vv)
+
+def dalpha_dx(xx,vv, alpha):
+    EE = alpha*(alpha - (2.0/xx)*(xx-vv))
+    HH = (xx-vv)**2.0 - 1.0
+    return (EE/HH)*(xx-vv)
+
+###def dv_dx(xx,vv, alpha):
+###    return 2.0*(xx-vv)
+###
+###def dalpha_dx(xx,vv, alpha):
+###    return -1.0*(xx-vv)
+
+def get_m(xx, vv, alpha): 
+    mm = xx**2.0 * alpha * (xx-vv)
+    return mm 
+
+def alpha_to_rho(alpha, tt):
+    rho = alpha/(4.0*np.pi*G_newton*(tt**2.0))
+    return rho
+
+def vv_to_uu(vv, cs0):
+    uu = cs0*vv
+    return uu
+
+def mm_to_MM(mm, tt, cs0):
+    MM = (((cs0**3.0)*tt)/G_newton)*mm
+    return MM
+
+def euler(xx_step, xx, vv, alpha, mm, target):
+    diff = target - xx[-1]  
+    if diff >= 0:         
+        while xx[-1] <= target:
+            vv_step    = vv[-1]    + xx_step*dv_dx(xx[-1], vv[-1], alpha[-1])
+            alpha_step = alpha[-1] + xx_step*dalpha_dx(xx[-1], vv[-1], alpha[-1])
+        
+            xx = np.append(xx, xx[-1]+xx_step)
+            alpha = np.append(alpha, alpha_step)
+            vv = np.append(vv, vv_step)
+            mm_step    = get_m(xx[-1], vv[-1], alpha[-1])
+            mm = np.append(mm, mm_step)
+    else: 
+        while xx[-1] <= target:
+            vv_step    = vv[-1]    + xx_step*dv_dx(xx[-1], vv[-1], alpha[-1])
+            alpha_step = alpha[-1] + xx_step*dalpha_dx(xx[-1], vv[-1], alpha[-1])
+        
+            xx = np.append(xx, xx[-1]+xx_step)
+            alpha = np.append(alpha, alpha_step)
+            vv = np.append(vv, vv_step)
+            mm_step    = get_m(xx[-1], vv[-1], alpha[-1])
+            mm = np.append(mm, mm_step)
+    return xx, vv, alpha, mm
+
+def RK4_step(vv, xx, alpha, xx_step): 
+    vv1    =     xx_step*dv_dx(xx[-1], vv[-1], alpha[-1]) 
+    alpha1 = xx_step*dalpha_dx(xx[-1], vv[-1], alpha[-1])
+    
+    vv2 =        xx_step*dv_dx(xx[-1]+xx_step/2.0, vv[-1]+vv1/2.0, alpha[-1]+alpha1/2.0)
+    alpha2 = xx_step*dalpha_dx(xx[-1]+xx_step/2.0, vv[-1]+vv1/2.0, alpha[-1]+alpha1/2.0)
+    
+    vv3 =        xx_step*dv_dx(xx[-1]+xx_step/2.0, vv[-1]+vv2/2.0, alpha[-1]+alpha2/2.0)
+    alpha3 = xx_step*dalpha_dx(xx[-1]+xx_step/2.0, vv[-1]+vv2/2.0, alpha[-1]+alpha2/2.0)
+    
+    vv4 =        xx_step*dv_dx(xx[-1]+xx_step, vv[-1]+vv3, alpha[-1]+alpha3)
+    alpha4 = xx_step*dalpha_dx(xx[-1]+xx_step, vv[-1]+vv3, alpha[-1]+alpha3)
+    
+    vv_step    = vv[-1]    + (1.0/6.0)*(vv1 + 2.0*vv2 + 2.0*vv3 + vv4) 
+    alpha_step = alpha[-1] + (1.0/6.0)*(alpha1 + 2.0*alpha2 + 2.0*alpha3 + alpha4)
+
+    return vv_step, alpha_step
+
+def RK4(xx_step, xx, vv, alpha, mm, target, epsilon):
+    #Runge-Kutta RK4
+    diff = target - xx[-1]  
+    #if diff < 0: 
+
+    if diff >= 0:         
+        while xx[-1] <= target:
+            if (np.abs(xx[-1] - vv[-1] - 1.0) > epsilon):
+                vv_step, alpha_step = RK4_step(vv, xx, alpha, xx_step)
+                print( vv_step, alpha_step)
+            else: 
+                vv_step    = vv[-1]
+                alpha_step = alpha[-1]
+                print("PIIP") 
+
+            #print(np.abs(xx[-1] - vv[-1]), epsilon)
+ 
+            xx = np.append(xx, xx[-1]+xx_step)
+            alpha = np.append(alpha, alpha_step)
+            vv = np.append(vv, vv_step)
+            mm_step    = get_m(xx[-1], vv[-1], alpha[-1])
+            mm = np.append(mm, mm_step)
+    else:         
+        while xx[-1] >= target:
+            if (np.abs(xx[-1] - vv[-1] - 1.0) > epsilon):
+                vv_step, alpha_step = RK4_step(vv, xx, alpha, xx_step)
+                print( vv_step, alpha_step)
+            else: 
+                vv_step    = vv[-1]
+                alpha_step = alpha[-1]
+                print("PIIP") 
+
+            #print(np.abs(xx[-1] - vv[-1]), epsilon)
+ 
+            xx = np.append(xx, xx[-1]+xx_step)
+            alpha = np.append(alpha, alpha_step)
+            vv = np.append(vv, vv_step)
+            mm_step    = get_m(xx[-1], vv[-1], alpha[-1])
+            mm = np.append(mm, mm_step)
+            
+
+    return xx, vv, alpha, mm
+
+# From Shu 1977 TABLE II
+
+xx_SHU    =  np.array([0.05 , 0.10 , 0.15 , 0.20 , 0.25 , 0.30 , 0.35 , 0.40 , 0.45 ,
+		       0.50 , 0.55 , 0.60 , 0.65 , 0.70 , 0.75 , 0.80 , 0.85 ,
+                       0.90 , 0.95 , 1.00]) 
+alpha_SHU =  np.array([71.5 , 27.8 , 16.4 , 11.5 , 8.76 , 7.09 , 5.95 , 5.14 , 4.52 ,
+		       4.04 , 3.66 , 3.35 , 3.08 , 2.86 , 2.67 , 2.50 , 2.35 ,
+                       2.22 , 2.10 , 2.00]) 
+vv_SHU    = -np.array([5.44 , 3.47 , 2.58 , 2.05 , 1.68 , 1.40 , 1.18 , 1.01 , 0.861,
+		       0.735, 0.625, 0.528, 0.442, 0.363, 0.291, 0.225, 0.163,
+                       0.106, 0.051, 0.00]) 
+mm_SHU    =  np.array([0.981, 0.993, 1.01 , 1.03 , 1.05 , 1.08 , 1.12 , 1.16 , 1.20 ,
+		       1.25 , 1.30 , 1.36 , 1.42 , 1.49 , 1.56 , 1.64 , 1.72 ,
+                       1.81 , 1.90 , 2.00]) 
+
+
+##From Shu (1977)
+#AA = [  2.0,  2.2,  2.4,  2.6,  2.8,  3.0,  3.2,  3.4,  3.6,  3.8, 4.0]
+#m0 = [0.975, 1.45, 1.88, 2.31, 2.74, 3.18, 3.63, 4.10, 4.58, 5.08, 5.58]
+#AA = np.array(AA)
+#m0 = np.array(m0)
+
+#xx0    = xx_SHU[1] 
+#alpha0 = alpha_SHU[1] 
+#vv0    = vv_SHU[1]
+#xx_step = 0.005
+#target = 1.0
+
+xx0    = xx_SHU[-3] 
+alpha0 = alpha_SHU[-3] 
+vv0    = vv_SHU[-3]
+target = 0.05
+xx_step = -0.005
+xx_step = -0.001
+             
+print(get_m(xx0, alpha0, vv0))
+
+xx = np.array([])
+alpha = np.array([])
+vv = np.array([])
+mm = np.array([])
+
+xx = np.append(xx, xx0)
+alpha = np.append(alpha, alpha0)
+vv = np.append(vv, vv0)
+mm = np.append(mm, get_m(xx0, alpha0, vv0))
+
+print(xx, alpha, vv, mm)
+
+
+xx_EUL, vv_EUL, alpha_EUL, mm_EUL = euler(xx_step, xx, vv, alpha, mm, target)
+xx_RK , vv_RK , alpha_RK , mm_RK  = RK4(xx_step, xx, vv, alpha, mm, target, epsilon = 0.000001)
+
+mm_EUL = get_m(xx_EUL, alpha_EUL, vv_EUL)
+mm_RK  = get_m(xx_RK , alpha_RK , vv_RK )
+mm_SHU = get_m(xx_SHU, alpha_SHU, vv_SHU)
+
+# Plotting time
+ 
+figQ, axQ = plt.subplots(nrows=2, ncols=2, sharex=True)
+
+axQ[0,0].plot(xx_EUL, alpha_EUL, label=r'$\alpha$ (Euler)', linewidth = 3.0)
+axQ[0,0].plot(xx_RK , alpha_RK , label=r'$\alpha$ (RK4)', linewidth = 3.0)
+axQ[0,0].plot(xx_SHU, alpha_SHU, 'd', label=r'$\alpha$ (Shu)', linewidth = 3.0)
+axQ[0,0].set_xlabel(r'x')
+axQ[0,0].set_ylabel(r'$\alpha$')
+axQ[0,0].legend()
+
+axQ[0,1].plot(xx_EUL, np.abs(vv_EUL), label='v (Euler)', linewidth = 3.0)
+axQ[0,1].plot(xx_RK , np.abs(vv_RK ), label='v (RK4)', linewidth = 3.0)
+axQ[0,1].plot(xx_SHU, np.abs(vv_SHU),'d', label='v (Shu)', linewidth = 3.0)
+axQ[0,1].set_xlabel(r'x')
+axQ[0,1].set_ylabel(r'-v')
+axQ[0,1].legend()
+
+axQ[1,0].plot(xx_EUL, mm_EUL, label='m (Euler)', linewidth = 3.0)
+axQ[1,0].plot(xx_RK , mm_RK , label='m (RK4)', linewidth = 3.0)
+axQ[1,0].plot(xx_SHU , mm_SHU , 'd', label='m (Shu)', linewidth = 3.0)
+axQ[1,0].set_xlabel(r'x')
+axQ[1,0].set_ylabel(r'm')
+axQ[1,0].legend()
+
+
+axQ[1,1].plot(xx_EUL, xx_EUL-vv_EUL, label='x-v (Euler)', linewidth = 3.0)
+axQ[1,1].plot(xx_RK , xx_RK -vv_RK , label='x-v (RK4)', linewidth = 3.0)
+axQ[1,1].plot(xx_SHU, xx_SHU-vv_SHU, 'd', label='x-v (Shu)', linewidth = 3.0)
+axQ[1,1].set_xlabel(r'x')
+axQ[1,1].set_ylabel(r'x-v')
+axQ[1,1].legend()
+
+# Time to convert to physical quantities
+yr  = 3.154e+7 #s 
+kyr = 1000.0*yr
+km = 1e5 #cm
+AU = 1.496e+13 #cm
+Msun = 1.98847e33 #g
+
+cs0 = 20000   #cs cm/s "a" in Shu notation
+
+tt_list = np.linspace(10*kyr, 20.0*kyr, num=4)
+mm = get_m(xx_RK, vv_RK, alpha_RK) 
+
+
+fig, ax = plt.subplots(nrows=1, ncols=3, sharex=True)
+
+for tt in tt_list:
+    rho = alpha_to_rho(alpha_RK, tt)
+    RR = xx_RK*(cs0*tt)
+    time = r'%.2f $\mathrm{kyr}$' % (tt/kyr) 
+    
+    ax[0].plot(RR/AU, rho, label= r'$\rho$, t = ' + time, linewidth = 3.0)
+    ax[0].set_xlabel(r'R (AU)')
+    ax[0].set_ylabel(r'$\rho$ (g/cm$^3$)')
+    ax[0].set_xscale('log')
+    ax[0].set_yscale('log')
+    ax[0].legend()
+
+    uu = vv_to_uu(vv_RK, cs0)
+
+    ax[1].plot(RR/AU, -uu/km, label= r'$u$, t = ' + time, linewidth = 3.0)
+    ax[1].set_xlabel(r'R (AU)')
+    ax[1].set_ylabel(r'-$u$ (km/s)')
+    ax[1].set_yscale('log')
+    ax[1].legend()
+
+    MM = mm_to_MM(mm, tt, cs0)
+
+    ax[2].plot(RR/AU, MM/Msun, label= r'$M$, t = ' + time, linewidth = 3.0)
+    ax[2].set_xlabel(r'R (AU)')
+    ax[2].set_ylabel(r'$M$ ($M_\odot}$)')
+    ax[2].legend()
+
+   
+
+plt.show()
+
+
+
+
--- a/analysis/python/purgepng.sh
+++ b/analysis/python/purgepng.sh
@@ -0,0 +1 @@
+rm *.png
--- a/analysis/python/samples/README.md
+++ b/analysis/python/samples/README.md
@@ -0,0 +1,3 @@
+# Analysis script samples
+
+This directory is for sample scripts useable for data analysis and visualization. 
--- a/analysis/python/samples/lnrhobound.py
+++ b/analysis/python/samples/lnrhobound.py
@@ -0,0 +1,41 @@
+import pylab as plt
+import numpy as np 
+
+
+def do_bound(coeff):
+    """Fill the three entries below index 3 of a 7-element float32 buffer.
+
+    Each entry starts as coeff*exp(value at index 3), is multiplied by coeff
+    once more per index of distance from index 3, and is finally replaced by
+    its natural logarithm. Intermediate states are printed along the way.
+
+    Returns the index array (0..6) and the filled buffer.
+    """
+    EDGE = 3
+    buf = np.zeros(7, dtype=np.float32)
+    positions = np.arange(buf.size)
+
+    for ghost in range(EDGE):
+        # Seed from the edge value, in linear (non-logarithmic) form.
+        buf[ghost] = coeff*np.exp(buf[EDGE])
+        print("initial", buf)
+
+        # One extra factor of coeff for every step away from the edge.
+        for _ in range(abs(EDGE - ghost)):
+            buf[ghost] = coeff*buf[ghost]
+            print("looped", buf[ghost])
+
+        # Store the result back in logarithmic form.
+        buf[ghost] = np.log(buf[ghost])
+        print("final", buf)
+
+    return positions, buf
+
+
+AC_dsx = 0.04908738521
+coeff1 = 1.0 - AC_dsx/(25.0*AC_dsx)
+coeff2 = 1.0 - AC_dsx/(100.0*AC_dsx)
+
+
+plt.figure()
+xx, yy = do_bound(coeff1)
+plt.plot(xx, yy)
+
+plt.figure()
+xx, yy = do_bound(coeff2)
+plt.plot(xx, yy)
+
+plt.show()
+
--- a/analysis/python/samples/readtest.py
+++ b/analysis/python/samples/readtest.py
@@ -0,0 +1,260 @@
+'''
+    Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+'''
+import astar.data as ad
+import astar.visual as vis
+import pylab as plt 
+import numpy as np 
+import sys
+
+##mesh = ad.read.Mesh(500, fdir="/tiara/home/mvaisala/astaroth-code/astaroth_2.0/build/")
+##
+##print(np.shape(mesh.uu))
+##print(np.shape(mesh.lnrho))
+##
+##uu_tot = np.sqrt(mesh.uu[0]**2.0 + mesh.uu[1]**2.0 + mesh.uu[2]**2.0)
+##vis.slices.plot_3(mesh, uu_tot, title = r'$|u|$', bitmap = True, fname = 'uutot')
+##
+##vis.slices.plot_3(mesh, mesh.lnrho, title = r'$\ln \rho$', bitmap = True, fname = 'lnrho')
+##
+##print(mesh.minfo.contents)
+
+
+AC_unit_density  =  1e-17
+AC_unit_velocity = 1e5
+AC_unit_length   = 1.496e+13
+
+
+print("sys.argv", sys.argv)
+
+#meshdir = "/tiara/home/mvaisala/astaroth-code/astaroth_2.0/build/"
+meshdir  = "/tiara/ara/data/mvaisala/tmp/astaroth-code/astaroth_2.0/build/"
+#meshdir = "/tiara/ara/data/mvaisala/asth_testbed_double/"
+
+if "xtopbound" in sys.argv: 
+    for i in range(0, 171):
+        mesh = ad.read.Mesh(i, fdir=meshdir) 
+        if mesh.ok:
+            np.set_printoptions(precision=4, linewidth=150)
+            uu_tot = np.sqrt(mesh.uu[0]**2.0 + mesh.uu[1]**2.0 + mesh.uu[2]**2.0)
+            print(mesh.lnrho.shape)
+            print(range((mesh.lnrho.shape[0]-7),mesh.lnrho.shape[0]))
+            print('lnrho', i, mesh.lnrho[(mesh.lnrho.shape[0]-7):mesh.lnrho.shape[0], 20, 100]) 
+            print('uux', i, mesh.uu[0][(mesh.lnrho.shape[0]-7):mesh.lnrho.shape[0], 20, 100]) 
+            print('uuy', i, mesh.uu[1][(mesh.lnrho.shape[0]-7):mesh.lnrho.shape[0], 20, 100]) 
+            print('uuz', i, mesh.uu[2][(mesh.lnrho.shape[0]-7):mesh.lnrho.shape[0], 20, 100]) 
+            print('uu_tot', i, uu_tot[    (mesh.lnrho.shape[0]-7):mesh.lnrho.shape[0], 20, 100]) 
+    
+
+if "single" in sys.argv:
+    mesh = ad.read.Mesh(1, fdir=meshdir)
+    print(mesh.lnrho.shape)
+    
+    print( mesh.lnrho[1, 50, 100], 0.0)
+    print( mesh.lnrho[197, 50, 100], 0.0)
+    print( mesh.lnrho[100, 50, 1], 0.0)
+    print( mesh.lnrho[100, 50, 197], 0.0)
+    print( mesh.lnrho[100, 1, 100], "periodic")
+    print( mesh.lnrho[100, 101, 00], "periodic")
+
+    angle = 0.78
+    UUXX = -0.25 * np.cos(angle)
+    zorig = 4.85965
+    zz = [0.0490874*1.0 - zorig,  0.0490874*100.0 - zorig, 0.0490874*197.0 - zorig]
+    print (zz) 
+    zz = np.array(zz)
+    UUZZ = - 0.25*np.sin(angle)*np.tanh(zz/0.2)
+    #plt.plot(np.linspace(-5.0, 5.0, num=100),- (0.25*np.sin(angle))*np.tanh(np.linspace(-5.0, 5.0, num=100)/0.2)) 
+    #plt.show()
+    print("---- UUX")
+    print( mesh.uu[0][1, 50, 100], 0.0)
+    print( mesh.uu[0][197, 50, 100], UUXX)
+    print( mesh.uu[0][100, 50, 1], UUXX)
+    print( mesh.uu[0][100, 50, 197], UUXX)
+    print( mesh.uu[0][100, 1, 100], "periodic")
+    print( mesh.uu[0][100, 101, 00], "periodic")
+    print("---- UUY")
+    print( mesh.uu[1][1, 50, 100], 0.0)
+    print( mesh.uu[1][197, 50, 100], 0.0)
+    print( mesh.uu[1][100, 50, 1], 0.0)
+    print( mesh.uu[1][100, 50, 197], 0.0)
+    print( mesh.uu[1][100, 1, 100], "periodic")
+    print( mesh.uu[1][100, 101, 00], "periodic")
+    print("---- UUZ")
+    print( mesh.uu[2][1, 50, 100], 0.0)
+    print( mesh.uu[2][197, 50, 100], UUZZ[1])
+    print( mesh.uu[2][100, 50, 1],   UUZZ[0])
+    print( mesh.uu[2][100, 50, 197], UUZZ[2])
+    print( mesh.uu[2][100, 1, 100], "periodic")
+    print( mesh.uu[2][100, 101, 00], "periodic")
+
+if 'xline' in sys.argv:
+    # Line cuts of each velocity component through snapshot 0.
+    # The other checks in this script index the arrays as [x, y, z], so the
+    # cut along the middle axis is labelled "y" and the cut along the first
+    # axis "x" (the original labels were swapped).
+    mesh = ad.read.Mesh(0, fdir=meshdir)
+    for comp in range(3):
+        plt.figure()
+        plt.plot(mesh.uu[comp][100, 50, :] , label="z")
+        plt.plot(mesh.uu[comp][100, :, 100], label="y")
+        plt.plot(mesh.uu[comp][:, 50, 100] , label="x")
+        plt.legend()
+
+        if comp == 0:
+            # Extra figure: first component along the last axis at index 197
+            # of the first axis.
+            plt.figure()
+            plt.plot(mesh.uu[0][197, 50, :] , label="z edge")
+            plt.legend()  # the label was set but never displayed before
+
+    plt.show()
+
+if 'check' in sys.argv:
+    mesh = ad.read.Mesh(0, fdir=meshdir)
+    vis.slices.plot_3(mesh, mesh.lnrho, title = r'$\ln \rho$', bitmap = False, fname = 'lnrho', contourplot = True)
+    plt.show()
+
+
+
+if 'diff' in sys.argv:
+    mesh0 = ad.read.Mesh(1, fdir=meshdir)
+    mesh1 = ad.read.Mesh(2, fdir=meshdir)
+    vis.slices.plot_3(mesh1, mesh1.lnrho - mesh0.lnrho, title = r'$\ln \rho$', bitmap = True, fname = 'lnrho')
+    vis.slices.plot_3(mesh1, mesh1.uu[0] - mesh0.uu[0], title = r'$u_x$',      bitmap = True, fname = 'uux')
+    vis.slices.plot_3(mesh1, mesh1.uu[1] - mesh0.uu[1], title = r'$u_y$',      bitmap = True, fname = 'uuy')
+    vis.slices.plot_3(mesh1, mesh1.uu[2] - mesh0.uu[2], title = r'$u_z$',      bitmap = True, fname = 'uuz')
+
+if '1d' in sys.argv:
+    plt.figure()
+    for i in range(0, 100001, 1000):
+        mesh = ad.read.Mesh(i, fdir=meshdir) 
+        if mesh.ok:
+
+            if 'lnrho' in sys.argv:
+                plt.plot(mesh.lnrho[:, 20, 100], label=i)
+            elif 'uux' in sys.argv:
+                plt.plot(mesh.uu[0][:, 20, 100], label=i)
+            elif 'uuy' in sys.argv:
+                plt.plot(mesh.uu[1][:, 20, 100], label=i)
+            elif 'uuz' in sys.argv:
+                plt.plot(mesh.uu[2][:, 20, 100], label=i)
+            elif 'uutot' in sys.argv:
+                uu_tot = np.sqrt(mesh.uu[0]**2.0 + mesh.uu[1]**2.0 + mesh.uu[2]**2.0)
+                plt.plot(uu_tot[:, 20, 100], label=i)
+ 
+            plt.legend()
+
+    plt.show()
+
+
+if 'sl' in sys.argv:
+    maxfiles = 200002
+    stride = 10000
+    for i in range(0, maxfiles, stride):
+        mesh = ad.read.Mesh(i, fdir=meshdir) 
+        print(" %i / %i" % (i, maxfiles))
+        if mesh.ok:
+            uu_tot = np.sqrt(mesh.uu[0]**2.0 + mesh.uu[1]**2.0 + mesh.uu[2]**2.0)
+
+            if 'lim' in sys.argv:
+                vis.slices.plot_3(mesh, mesh.lnrho,         title = r'$\ln \rho$', bitmap = True, fname = 'lnrho', colrange=[-0.02, 0.0])
+                vis.slices.plot_3(mesh, np.exp(mesh.lnrho), title = r'$\rho$', bitmap = True, fname = 'rho', colrange=[0.97, 1.0])
+                vis.slices.plot_3(mesh, mesh.uu[0],         title = r'$u_x$', bitmap = True, fname = 'uux', colrange=[-0.002, 0.002])
+                vis.slices.plot_3(mesh, mesh.uu[1],         title = r'$u_y$', bitmap = True, fname = 'uuy', colrange=[-1.0e-20, 1.0e-20])
+                vis.slices.plot_3(mesh, mesh.uu[2],         title = r'$u_z$', bitmap = True, fname = 'uuz', colrange=[-0.002, 0.002])
+                vis.slices.plot_3(mesh, np.exp(mesh.lnrho), title = r'$N_\mathrm{col}$', bitmap = True, fname = 'colden', slicetype = 'sum', colrange=[0.0, 100.0])
+                vis.slices.plot_3(mesh, uu_tot,             title = r'$|u|$', bitmap = True, fname = 'uutot', colrange=[0.00, 0.004])
+            else: 
+                vis.slices.plot_3(mesh, mesh.lnrho,         title = r'$\ln \rho$', bitmap = True, fname = 'lnrho')
+                vis.slices.plot_3(mesh, np.exp(mesh.lnrho), title = r'$\rho$', bitmap = True, fname = 'rho')
+                #vis.slices.plot_3(mesh, mesh.ss, title = r'$s$', bitmap = True, fname = 'ss')
+                vis.slices.plot_3(mesh, mesh.uu[0],         title = r'$u_x$', bitmap = True, fname = 'uux')
+                vis.slices.plot_3(mesh, mesh.uu[1],         title = r'$u_y$', bitmap = True, fname = 'uuy')
+                vis.slices.plot_3(mesh, mesh.uu[2],         title = r'$u_z$', bitmap = True, fname = 'uuz')
+                vis.slices.plot_3(mesh, np.exp(mesh.lnrho), title = r'$N_\mathrm{col}$', bitmap = True, fname = 'colden', slicetype = 'sum')
+                vis.slices.plot_3(mesh, uu_tot,             title = r'$|u|$', bitmap = True, fname = 'uutot')
+    
+    
+
+if 'ts' in sys.argv:
+    # Plot the rms/min/max time series of every diagnostic quantity, one
+    # figure per quantity, against the time step.
+    ts = ad.read.TimeSeries(fdir=meshdir)
+
+    end_rm = -1 #-35#-40  # slice end: -1 leaves out the last sample
+    xaxis = 't_step'
+
+    for quantity in ('lnrho', 'uutot', 'uux', 'uuy', 'uuz'):
+        plt.figure()
+        for stat in ('rms', 'min', 'max'):
+            yaxis = '%s_%s' % (quantity, stat)
+            plt.plot(ts.var[xaxis][:end_rm], ts.var[yaxis][:end_rm], label=yaxis)
+        plt.xlabel(xaxis)
+        plt.legend()
+
+    plt.show()
+
+
--- a/config/astaroth.conf
+++ b/config/astaroth.conf
@@ -0,0 +1,54 @@
+
+
+/*
+ * =============================================================================
+ * "Compile-time" params
+ * =============================================================================
+ */
+AC_nx = 192
+AC_ny = 120
+AC_nz = 7
+
+AC_dsx = 0.04908738521
+AC_dsy = 0.04908738521
+AC_dsz = 0.04908738521
+
+/*
+ * =============================================================================
+ * Run-time params
+ * =============================================================================
+ */
+AC_max_steps = 1001 
+AC_save_steps = 10
+AC_bin_steps = 1000
+AC_bin_save_t = 1e666
+
+// Hydro
+AC_cdt = 0.4
+AC_cdtv = 0.3
+AC_cdts = 1.0
+AC_nu_visc  = 5e-3
+AC_cs_sound = 1.0
+AC_zeta = 0.01
+
+// Magnetic
+AC_eta = 5e-3
+AC_mu0 = 1.4
+AC_chi = 0.0001
+
+// Forcing
+AC_relhel = 0.0
+
+// Entropy
+AC_cp_sound = 1.0
+AC_gamma = 0.5
+AC_lnT0 = 1.2
+AC_lnrho0 = 1.3
+
+/*
+ * =============================================================================
+ * Initial conditions
+ * =============================================================================
+ */
+AC_ampl_lnrho = 0.0
+AC_ampl_uu = 1.0
--- a/config/astaroth_pseudodisk.conf
+++ b/config/astaroth_pseudodisk.conf
@@ -0,0 +1,121 @@
+
+
+/*
+ * =============================================================================
+ * "Compile-time" params
+ * =============================================================================
+ */
+AC_nx = 192
+AC_ny = 48
+AC_nz = 192
+
+AC_dsx = 0.04908738521
+AC_dsy = 0.04908738521
+AC_dsz = 0.04908738521
+
+/*
+ * =============================================================================
+ * Run-time params
+ * =============================================================================
+ */
+//AC_max_steps = 16001
+//AC_save_steps = 50
+//AC_bin_steps = 16000
+
+//AC_max_steps = 1001
+//AC_save_steps = 10
+//AC_bin_steps = 1000
+
+//AC_max_steps = 11
+//AC_save_steps = 1
+//AC_bin_steps = 1
+
+//AC_max_steps = 4
+//AC_save_steps = 1
+//AC_bin_steps = 1
+
+//AC_max_steps = 1201
+//AC_save_steps = 10
+//AC_bin_steps = 1200
+//AC_bin_save_t = 5.0 
+
+
+//AC_max_steps = 50001
+//AC_save_steps = 100
+//AC_bin_steps = 10000
+
+AC_max_steps = 100001
+AC_save_steps = 500
+AC_bin_steps = 20000
+
+AC_bin_save_t = 2300000.0
+
+// Hydro
+AC_cdt = 0.4
+AC_cdtv = 0.3
+AC_cdts = 1.0
+//GOOD VISC Re_mesh = 3 
+//AC_nu_visc  = 3.0e-3
+AC_nu_visc  = 1.0e-3
+AC_cs_sound = 0.2
+AC_zeta = 1.0e-3
+
+// Magnetic
+AC_eta = 5e-3
+AC_mu0 = 1.4
+AC_chi = 0.0001
+
+// Forcing
+AC_relhel = 0.0
+
+// Entropy
+// cp arbitrary
+AC_cp_sound = 1.0
+// 5/3 adiabatic process
+AC_gamma = 1.66
+AC_lnT0 = 1.0
+AC_lnrho0 = 0.0
+
+
+// Boundary condition. Defined by arbitrary int. 
+AC_bc_type = 666 
+//AC_bc_type = 121 
+AC_trans = 0.6
+
+
+//Physical units (cgs) 
+// Based on Shu 1977 model calculations with t = 20 kyr, R = 500 AU
+// g/cm^3
+AC_unit_density =  1e-17 
+// cm/s
+// Now 1 km/s
+//AC_unit_velocity = 1e5 
+AC_unit_velocity = 1.0 
+// cm
+// Now 1 AU 
+AC_unit_length = 1.496e+13 
+
+//Properties of gravitating star*
+AC_star_pos_x = -500.0 
+//AC_star_pos_x = -10.0 
+AC_star_pos_y = 0.0 
+AC_star_pos_z = 0.0 
+//In M_sun 
+//AC_M_star = 0.05
+AC_M_star = 0.5
+//AC_M_star = 0.0
+
+/*
+ * =============================================================================
+ * Initial conditions
+ * =============================================================================
+ */
+AC_ampl_lnrho = 0.0
+AC_lnrho_edge = -1.0
+AC_lnrho_out  = 0.0
+//original
+//AC_ampl_uu = 0.25
+//For gravity test
+AC_ampl_uu = 0.0
+AC_angl_uu = 0.0
+//AC_angl_uu = 0.35
--- a/doc/doxygen/.gitignore
+++ b/doc/doxygen/.gitignore
@@ -0,0 +1,4 @@
+# Ignore everything in this directory
+*
+# Except this file
+!.gitignore
--- a/doc/manual/manual.md
+++ b/doc/manual/manual.md
@@ -0,0 +1,131 @@
+
+*Miikka Vaisala: This is just something I have started to write up to make sense of Astaroth 2.0. It starts from notes that are personally important for understanding the code, and will be refined as my understanding improves.*
+
+# Astaroth manual
+
+## Compilation
+
+See the `README.md`. At the moment, let us keep certain things in one place.
+
+## Simulation instructions
+
+At the moment it is only possible to build and run in the `astaroth_2.0/build/` directory. Possibility to add separate run directories will be included later.
+
+### Choosing physics
+
+Runtime settings can be adjusted from `astaroth_2.0/include/astaroth.h` and `astaroth_2.0/config/astaroth.conf`.
+
+However, the physics switches LENTROPY, LFORCING etc. do not work at the moment. There has been an issue with making the pre-processor compatible with the Astaroth domain-specific language in Astaroth 2.0. Therefore, all features are enabled by default.
+
+To get the switcher working now, rename `astaroth_2.0/src/core/kernels/rk3handtuned.cuh` -> `rk3.cuh`. (**MV:** Not yet tested.)
+
+How to use?
+
+What kind of runtime settings?
+
+### Setting initial conditions
+
+Where can we effectively choose the initial condition?
+
+### Launching a run
+
+`./ac_run -s` assuming you are doing a normal simulation. Basic code for this invocation can be found in the source file `astaroth_2.0/src/standalone/simulation.cc`.
+
+Please note that launching `./ac_run -t` will *fail if entropy and forcing are in use*. The test is mainly for finding parallelization bugs. (In principle, if the hydro stuff and induction work, so will forcing and entropy.)
+
+### Diagnostic variables
+
+What is calculated?
+
+Where it is saved?
+
+### Simulation data
+
+Saving output binaries is not enabled yet.
+
+**MV:** I am planning to implement HDF5 format for the data. **TOP PRIORITY**.
+
+#### Notes about data structures
+
+- Configuration parameters have prefix `AC_`, such as `AC_dsx`.
+
+- All configurations are stored in the struct `AcMeshInfo`, containing tables `int_params` and `real_params`. **NOTE:** `int_params` and `real_params` require diligence. If you call e.g. `int_params[AC_dsx]`, the result will be something unexpected. So far it has not been possible to automate error checking for this.
+
+
+- All mesh data is stored to the struct `AcMesh`, containing both configuration values and vertex data (`lnrho`, `uux`, etc.)
+
+- All essential structs, macros and enumerators are found in astaroth.h for better reference.
+
+- In case there are changes in the data layout, it is better to use the macro `AC_VTXBUF_IDX(i, j, k, mesh_info)`, which transforms indices from 3D to 1D. Therefore there is no need to write `i + j * mesh_info.int_params[AC_mx] + ...`, which would hurt code readability.
+
+- AcReal is the generic floating point real number type used everywhere in the code. Currently it can be either `float` or `double`. Possibly in the future `half` or `long double` could also become available.
+
+Sample code:
+
+```cpp
+AcMeshInfo mesh_info;
+// Loads data from astaroth.conf into the AcMeshInfo struct
+load_config(&mesh_info);
+
+// Allocates data on the host for the AcMesh struct using information found in mesh_info.
+AcMesh* mesh = acmesh_create(mesh_info);
+
+// Initializes mesh to InitType (specified in standalone/model/host_memory.h)
+acmesh_init_to(INIT_TYPE_GAUSSIAN_RADIAL_EXPL, mesh); 
+
+// Allocates data on the device for the AcMesh struct
+acInit(mesh_info); 
+
+acLoad(*mesh); // Loads the mesh to the device
+
+
+const AcReal dt = 1.f;
+
+// Synchronizes previous device commands
+acSynchronize(); 
+
+// Does a full rk3 integration step on the device
+acIntegrate(dt); 
+
+acSynchronize();
+
+// Store data from device to host mesh
+acStore(mesh); 
+
+printf("nx: %d, dsx %f\n", 
+        mesh->info.int_params[AC_nx], 
+        double(mesh->info.real_params[AC_dsx]));
+printf("First vertex of the computational domain: %f\n",        
+double(mesh->vertex_buffer[VTXBUF_LNRHO][AC_VTXBUF_IDX(3, 3, 3, mesh_info)]));
+
+```
+
+
+### Reading data
+
+Depends on the output format. With HDF5 should be simple enough.
+
+[Jupyter notebook](http://jupyter.org/) visualization?
+
+Do we want to use [YT?](https://yt-project.org/)
+
+### Live rendering
+
+MV: Cool, but does not work for remote cluster so far. A GPU workstation is required.
+
+## Multi-GPU
+
+At the moment multi-GPU is not included in Astaroth 2.0. However, it has been implemented in 1.0 (`astaroth_1.0/src/gpu/cuda/cuda_generic.cu`) and could essentially be ported by copy-pasting to `astaroth_2.0/src/core/astaroth.cu` once we have a clear idea of how to run things with a single GPU. Could be done overnight in principle.
+
+
+## Profiling
+
+The built-in benchmark is currently unreliable for an unknown reason. Please use [nvprof and nvvp](https://docs.nvidia.com/cuda/profiler-users-guide/index.html) for precise profiling. Also, NVIDIA suggests their [Nsight Systems](https://developer.nvidia.com/nsight-systems).
+
+
+
+## ETC
+
+**Note** `auto_optimize.sh` does not currently work, but it aims to tune thread block dimensions automatically.
+
+
--- a/2427
+++ b/2427
--- a/include/astaroth.h
+++ b/include/astaroth.h
@@ -0,0 +1,422 @@
+/*
+    Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
+ * \brief Brief info.
+ *
+ * Provides an interface to Astaroth. Contains all the necessary configuration
+ * structs and functions for running the code on multiple GPUs.
+ *
+ * All interface functions declared here (such as acInit()) operate all GPUs
+ * available in the node under the hood, and the user does not need any
+ * information about the decomposition, synchronization or such to use these
+ * functions.
+ *
+ */
+#pragma once
+
+#include <float.h>        // FLT_EPSILON, etc
+#include <stdlib.h>       // size_t
+#include <vector_types.h> // CUDA vector types (float4, etc)
+
+/* Prevent name mangling. Opened after the includes so that system and CUDA
+ * headers are not pulled into the extern "C" linkage block. */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/*
+ * =============================================================================
+ * Flags for auto-optimization
+ * =============================================================================
+ */
+#define AUTO_OPTIMIZE (0) // DEPRECATED TODO remove
+#define BOUNDCONDS_OPTIMIZE (0)
+#define GENERATE_BENCHMARK_DATA (0)
+
+// Device info
+#define REGISTERS_PER_THREAD (255)
+#define MAX_REGISTERS_PER_BLOCK (65536)
+#define MAX_THREADS_PER_BLOCK (1024)
+#define MAX_TB_DIM (MAX_THREADS_PER_BLOCK)
+#define NUM_ITERATIONS (10)
+#define WARP_SIZE (32)
+
+
+/*
+ * =============================================================================
+ * Compile-time constants used during simulation (user definable)
+ * =============================================================================
+ */
+#define STENCIL_ORDER (6)
+
+///////////// PAD TEST
+// NOTE: works only with nx is divisible by 32
+//#define PAD_LEAD (32 - STENCIL_ORDER/2)
+//#define PAD_SIZE (32 - STENCIL_ORDER)
+///////////// PAD TEST
+
+// L-prefix inherited from the old Astaroth, no idea what it means
+// MV: L means a Logical switch variable, something having a true or false value.
+#define LFORCING (0) // Note: forcing is disabled currently in the files generated by acc (compiler of our DSL)
+#define LINDUCTION (1)
+#define LENTROPY (1)
+#define LTEMPERATURE (0)
+
+#define AC_THERMAL_CONDUCTIVITY (AcReal(0.001)) // TODO: make an actual config parameter
+
+/*
+ * =============================================================================
+ * Identifiers used to construct the parameter lists for AcMeshInfo
+ * (IntParamType and RealParamType)
+ * (user definable)
+ * =============================================================================
+ */
+// clang-format off
+#define AC_FOR_INT_PARAM_TYPES(FUNC)\
+        /* cparams */\
+        FUNC(AC_nx), \
+        FUNC(AC_ny), \
+        FUNC(AC_nz), \
+        FUNC(AC_mx), \
+        FUNC(AC_my), \
+        FUNC(AC_mz), \
+        FUNC(AC_nx_min), \
+        FUNC(AC_ny_min), \
+        FUNC(AC_nz_min), \
+        FUNC(AC_nx_max), \
+        FUNC(AC_ny_max), \
+        FUNC(AC_nz_max), \
+        /* Other */\
+        FUNC(AC_max_steps), \
+        FUNC(AC_save_steps), \
+        FUNC(AC_bin_steps), \
+        FUNC(AC_bc_type), \
+        /* Additional */\
+        FUNC(AC_mxy),\
+        FUNC(AC_nxy),\
+        FUNC(AC_nxyz)
+#define AC_FOR_REAL_PARAM_TYPES(FUNC)\
+        /* cparams */\
+        FUNC(AC_dsx), \
+        FUNC(AC_dsy), \
+        FUNC(AC_dsz), \
+        FUNC(AC_dsmin), \
+        /* physical grid*/\
+        FUNC(AC_xlen), \
+        FUNC(AC_ylen), \
+        FUNC(AC_zlen), \
+        FUNC(AC_xorig), \
+        FUNC(AC_yorig), \
+        FUNC(AC_zorig), \
+        /*Physical units*/\
+        FUNC(AC_unit_density),\
+        FUNC(AC_unit_velocity),\
+        FUNC(AC_unit_length),\
+        /* properties of gravitating star*/\
+        FUNC(AC_star_pos_x),\
+        FUNC(AC_star_pos_y),\
+        FUNC(AC_star_pos_z),\
+        FUNC(AC_M_star),\
+        /* Run params */\
+        FUNC(AC_cdt), \
+        FUNC(AC_cdtv), \
+        FUNC(AC_cdts), \
+        FUNC(AC_nu_visc), \
+        FUNC(AC_cs_sound), \
+        FUNC(AC_eta), \
+        FUNC(AC_mu0), \
+        FUNC(AC_relhel), \
+        FUNC(AC_cp_sound), \
+        FUNC(AC_gamma), \
+        FUNC(AC_cv_sound), \
+        FUNC(AC_lnT0), \
+        FUNC(AC_lnrho0), \
+        FUNC(AC_zeta), \
+        FUNC(AC_trans),\
+        /* Other */\
+        FUNC(AC_bin_save_t), \
+        /* Initial condition params */\
+        FUNC(AC_ampl_lnrho), \
+        FUNC(AC_ampl_uu), \
+        FUNC(AC_angl_uu), \
+        FUNC(AC_lnrho_edge),\
+        FUNC(AC_lnrho_out),\
+        /* Additional helper params */\
+        /* (deduced from other params do not set these directly!) */\
+        FUNC(AC_G_CONST),\
+        FUNC(AC_GM_star),\
+        FUNC(AC_sq2GM_star),\
+        FUNC(AC_cs2_sound), \
+        FUNC(AC_inv_dsx), \
+        FUNC(AC_inv_dsy), \
+        FUNC(AC_inv_dsz)
+// clang-format on
+
+/*
+ * =============================================================================
+ * Identifiers for VertexBufferHandle
+ * (i.e. the arrays used to construct AcMesh)
+ * (user definable)
+ * =============================================================================
+ */
+// clang-format off
+#define AC_FOR_HYDRO_VTXBUF_HANDLES(FUNC)\
+        FUNC(VTXBUF_LNRHO), \
+        FUNC(VTXBUF_UUX), \
+        FUNC(VTXBUF_UUY), \
+        FUNC(VTXBUF_UUZ), \
+        // FUNC(VTXBUF_DYE),
+
+#if LINDUCTION
+#define AC_FOR_INDUCTION_VTXBUF_HANDLES(FUNC)\
+        FUNC(VTXBUF_AX), \
+        FUNC(VTXBUF_AY), \
+        FUNC(VTXBUF_AZ),
+#else
+#define AC_FOR_INDUCTION_VTXBUF_HANDLES(FUNC)
+#endif
+
+#if LENTROPY
+#define AC_FOR_ENTROPY_VTXBUF_HANDLES(FUNC)\
+        FUNC(VTXBUF_ENTROPY),
+#else
+#define AC_FOR_ENTROPY_VTXBUF_HANDLES(FUNC)
+#endif
+
+#if LTEMPERATURE
+#define AC_FOR_TEMPERATURE_VTXBUF_HANDLES(FUNC)\
+        FUNC(VTXBUF_TEMPERATURE),
+#else
+#define AC_FOR_TEMPERATURE_VTXBUF_HANDLES(FUNC)
+#endif
+
+#define AC_FOR_VTXBUF_HANDLES(FUNC)\
+        AC_FOR_HYDRO_VTXBUF_HANDLES(FUNC)\
+        AC_FOR_INDUCTION_VTXBUF_HANDLES(FUNC)\
+        AC_FOR_ENTROPY_VTXBUF_HANDLES(FUNC)\
+        AC_FOR_TEMPERATURE_VTXBUF_HANDLES(FUNC)
+// clang-format on
+
+/*
+ * =============================================================================
+ * Single/double precision switch
+ * =============================================================================
+ */
+#if AC_DOUBLE_PRECISION == 1
+typedef double AcReal;
+typedef double3 AcReal3;
+#define AC_REAL_MAX (DBL_MAX)
+#define AC_REAL_MIN (DBL_MIN)
+#define AC_REAL_EPSILON (DBL_EPSILON)
+#else
+typedef float AcReal;
+typedef float3 AcReal3;
+#define AC_REAL_MAX (FLT_MAX)
+#define AC_REAL_MIN (FLT_MIN)
+#define AC_REAL_EPSILON (FLT_EPSILON)
+#endif
+
+typedef struct {
+    AcReal3 row[3];
+} AcMatrix;
+
+/*
+ * =============================================================================
+ * Helper macros
+ * =============================================================================
+ */
+#define AC_GEN_ID(X) X
+#define AC_GEN_STR(X) #X
+
+/*
+ * =============================================================================
+ * Error codes
+ * =============================================================================
+ */
+typedef enum { AC_SUCCESS = 0, AC_FAILURE = 1 } AcResult;
+
+/*
+ * =============================================================================
+ * Reduction types
+ * =============================================================================
+ */
+typedef enum {
+    RTYPE_MAX,
+    RTYPE_MIN,
+    RTYPE_RMS,
+    RTYPE_RMS_EXP,
+    NUM_REDUCTION_TYPES
+} ReductionType;
+
+/*
+ * =============================================================================
+ * Definitions for the enums and structs for AcMeshInfo (DO NOT TOUCH)
+ * =============================================================================
+ */
+typedef enum {
+    AC_FOR_INT_PARAM_TYPES(AC_GEN_ID),
+    NUM_INT_PARAM_TYPES
+} AcIntParam;
+
+typedef enum {
+    AC_FOR_REAL_PARAM_TYPES(AC_GEN_ID),
+    NUM_REAL_PARAM_TYPES
+} AcRealParam;
+
+extern const char* intparam_names[];  // Defined in astaroth.cu
+extern const char* realparam_names[]; // Defined in astaroth.cu
+
+typedef struct {
+    int int_params[NUM_INT_PARAM_TYPES];
+    AcReal real_params[NUM_REAL_PARAM_TYPES];
+} AcMeshInfo;
+
+/*
+ * =============================================================================
+ * Definitions for the enums and structs for AcMesh (DO NOT TOUCH)
+ * =============================================================================
+ */
+typedef enum {
+    AC_FOR_VTXBUF_HANDLES(AC_GEN_ID) NUM_VTXBUF_HANDLES
+} VertexBufferHandle;
+
+extern const char* vtxbuf_names[]; // Defined in astaroth.cu
+
+/*
+typedef struct {
+    AcReal* data;
+} VertexBuffer;
+*/
+
+// NOTE: there's no particular benefit declaring AcMesh a class, since
+// a library user may already have allocated memory for the vertex_buffers.
+// But then we would allocate memory again when the user wants to start
+// filling the class with data. => Its better to consider AcMesh as a
+// payload-only struct
+typedef struct {
+    AcReal* vertex_buffer[NUM_VTXBUF_HANDLES];
+    AcMeshInfo info;
+} AcMesh;
+
+// Number of vertices in one vertex buffer, from the AC_mx/AC_my/AC_mz
+// dimensions. Each factor is widened to size_t before multiplying so that the
+// product cannot overflow int on large meshes.
+#define AC_VTXBUF_SIZE(mesh_info)                                              \
+    ((size_t)(mesh_info).int_params[AC_mx] *                                   \
+     (size_t)(mesh_info).int_params[AC_my] *                                   \
+     (size_t)(mesh_info).int_params[AC_mz])
+
+// Size of one vertex buffer in bytes.
+#define AC_VTXBUF_SIZE_BYTES(mesh_info)                                        \
+    (sizeof(AcReal) * AC_VTXBUF_SIZE(mesh_info))
+
+// Number of vertices given by the AC_nx/AC_ny/AC_nz dimensions.
+// NOTE: evaluated in int arithmetic (result type kept for existing callers).
+#define AC_VTXBUF_COMPDOMAIN_SIZE(mesh_info)                                   \
+    ((mesh_info).int_params[AC_nx] * (mesh_info).int_params[AC_ny] *           \
+     (mesh_info).int_params[AC_nz])
+
+// Size in bytes of AC_VTXBUF_COMPDOMAIN_SIZE vertices.
+#define AC_VTXBUF_COMPDOMAIN_SIZE_BYTES(mesh_info)                             \
+    (sizeof(AcReal) * AC_VTXBUF_COMPDOMAIN_SIZE(mesh_info))
+
+// Maps the 3D index (i, j, k) to a 1D buffer offset with i varying fastest:
+// i + j*mx + k*mx*my. The mesh_info argument is parenthesised so that
+// expressions such as mesh->info or *ptr can be passed safely.
+#define AC_VTXBUF_IDX(i, j, k, mesh_info)                                      \
+    ((i) + (j)*(mesh_info).int_params[AC_mx] +                                 \
+     (k)*(mesh_info).int_params[AC_mx] * (mesh_info).int_params[AC_my])
+
+/*
+ * =============================================================================
+ * Astaroth interface
+ * =============================================================================
+ */
+/** Starting point of all GPU computation. Handles the allocation and
+initialization of *all memory needed on all GPUs in the node*. In other words,
+setups everything GPU-side so that calling any other GPU interface function
+afterwards does not result in illegal memory accesses. */
+AcResult acInit(const AcMeshInfo& mesh_info);
+
+/** Splits the host_mesh and distributes it among the GPUs in the node */
+AcResult acLoad(const AcMesh& host_mesh);
+AcResult acLoadWithOffset(const AcMesh& host_mesh, const int3& start, const int num_vertices);
+
+/** Does all three steps of the RK3 integration and computes the boundary
+conditions when necessary. Note that the boundary conditions are not applied
+after the final integration step.
+The result can be fetched to CPU memory with acStore(). */
+AcResult acIntegrate(const AcReal& dt);
+
+/** Performs a single RK3 step without computing boundary conditions. */
+AcResult acIntegrateStep(const int& isubstep, const AcReal& dt);
+
+/** Applies boundary conditions on the GPU meshs and communicates the
+ ghost zones among GPUs if necessary */
+AcResult acBoundcondStep(void);
+
+/** Performs a scalar reduction on all GPUs in the node and returns the result.
+ */
+AcReal acReduceScal(const ReductionType& rtype, const VertexBufferHandle& a);
+
+/** Performs a vector reduction on all GPUs in the node and returns the result.
+ */
+AcReal acReduceVec(const ReductionType& rtype, const VertexBufferHandle& a,
+                   const VertexBufferHandle& b, const VertexBufferHandle& c);
+
+/** Stores the mesh distributed among GPUs of the node back to a single host
+ * mesh */
+AcResult acStore(AcMesh* host_mesh);
+AcResult acStoreWithOffset(const int3& start, const int num_vertices, AcMesh* host_mesh);
+
+/** Frees all GPU allocations and resets all devices in the node. Should be
+ * called at exit. */
+AcResult acQuit(void);
+
+/** Synchronizes all devices. All calls to Astaroth are asynchronous by default
+    unless otherwise stated. */
+AcResult acSynchronize(void);
+
+/* End extern "C" */
+#ifdef __cplusplus
+}
+#endif
+
+/*
+ * =============================================================================
+ * Notes
+ * =============================================================================
+ */
+/*
+typedef enum {
+    VTX_BUF_LNRHO,
+    VTX_BUF_UUX,
+    VTX_BUF_UUY,
+    VTX_BUF_UUZ,
+    NUM_VERTEX_BUFFER_HANDLES
+} VertexBufferHandle
+
+// LNRHO etc
+typedef struct {
+    AcReal* data;
+} VertexBuffer;
+
+// Host
+typedef struct {
+    VertexBuffer vertex_buffers[NUM_VERTEX_BUFFER_HANDLES];
+    MeshInfo info;
+} Mesh;
+
+// Device
+typedef struct {
+    VertexBuffer in[NUM_VERTEX_BUFFER_HANDLES];
+    VertexBuffer out[NUM_VERTEX_BUFFER_HANDLES];
+} VertexBufferArray;
+*/
--- a/scripts/ac_mkbuilddir.sh
+++ b/scripts/ac_mkbuilddir.sh
@@ -0,0 +1,81 @@
+#!/bin/bash
+if [ -z $AC_HOME ]
+then
+       echo "ASTAROTH_HOME environment variable not set, run \"source ./sourceme.sh\" in Astaroth home directory"
+       exit 1
+fi
+
+
+TIARA_SETUP_DEFAULT=""
+DOUBLE_DEFAULT="OFF"
+DEBUG_MODE_DEFAULT="OFF"
+BUILD_DIR_DEFAULT=${AC_HOME}/build/
+ALTER_CONF_DEFAULT="OFF"
+
+BUILD_DIR=${BUILD_DIR_DEFAULT}
+TIARA_SETUP=${TIARA_SETUP_DEFAULT}
+DOUBLE=${DOUBLE_DEFAULT}
+DEBUG_MODE=${DEBUG_MODE_DEFAULT}
+ALTER_CONF=${ALTER_CONF_DEFAULT}
+
+while [ "$#" -gt 0 ]
+do
+	case $1 in  
+		-h|--help)
+			echo "You can set up a build directory separe of the ASTAROTH_HOME"
+			echo "Available flags:"
+			echo "-b, --buildir [PATH] : Set build directory"
+			echo "-t,--tiara : Use TIARA cluster setting for cmake"
+			echo "-d, --double : Compile with double precision"
+			echo "-e, --debug: : Compile in debug mode"
+			echo "Example:"
+			echo "ac_mkbuilddir.sh -b my_build_dir/"
+			exit 0
+			;;
+		-b|--buildir)
+			shift
+                        BUILD_DIR=${1}
+			shift
+                        echo "Setting up build directory..."
+			ALTER_CONF="ON"
+			;;
+		-t|--tiara)
+			shift
+                        TIARA_SETUP="-D CMAKE_C_COMPILER=icc -D CMAKE_CXX_COMPILER=icpc"
+                        echo "Using TIARA cluster compiler settings"
+			;;
+		-d|--double)
+			shift
+                        DOUBLE="ON"
+                        echo "Double precision"
+			;;
+		-e|--debug)
+			shift
+                        DEBUG_MODE="ON"
+                        echo "Debug mode compilation"
+			;;
+		*)
+			break
+	esac
+done
+
+echo "Creating build directory: ${BUILD_DIR}"
+
+mkdir ${BUILD_DIR}
+
+cd ${BUILD_DIR}
+
+#Set up the astaroth.conf to be define and customized in the build directory to
+#not always alter the default use i.e. for unit test etc. 
+#Assumed by default if you do this thing anyway.
+echo "cp ${AC_HOME}/config/astaroth.conf ${PWD}"
+cp ${AC_HOME}/config/astaroth.conf .
+
+CONF_DIR="-D ASTAROTH_CONF_PATH=${PWD}"
+
+
+#cmake -D CMAKE_C_COMPILER=icc -D CMAKE_CXX_COMPILER=icpc -DDOUBLE_PRECISION=OFF -DBUILD_DEBUG=OFF ${AC_HOME}
+
+echo "cmake ${TIARA_SETUP} ${CONF_DIR} -DDOUBLE_PRECISION=${DOUBLE} -DBUILD_DEBUG=${DEBUG_MODE} -DALTER_CONF=${ALTER_CONF} ${AC_HOME}"
+
+cmake ${TIARA_SETUP} ${CONF_DIR} -DDOUBLE_PRECISION=${DOUBLE} -DBUILD_DEBUG=${DEBUG_MODE} -DALTER_CONF=${ALTER_CONF} ${AC_HOME}
--- a/scripts/auto_optimize.sh
+++ b/scripts/auto_optimize.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+
+# Run this in your build directory (cd build && ../scripts/auto_optimize.sh)
+# Generates a ${BENCHMARK_FILE} which contains the threadblock dims and other
+# constants used in the integration in addition to the time used.
+
+MAX_THREADS=1024 # Max size of the thread block, depends on hardware
+
+BENCHMARK_FILE="benchmark.out"
+TBCONFCREATOR_SRC_PATH="../scripts/gen_rk3_threadblockconf.c"
+TBCONFFILE_DST_PATH="../src/core/kernels"
+
+C_COMPILER_NAME="gcc"
+
+# -f: a missing benchmark file from a previous run is not an error
+rm -f "${BENCHMARK_FILE}"
+
+# The generator does not depend on the loop variables, build it only once
+"${C_COMPILER_NAME}" "${TBCONFCREATOR_SRC_PATH}" -o gen_rk3_threadblockconf || exit 1
+
+for (( tz=2; tz<=8; tz*=2))
+do
+for (( ty=1; ty<=1; ty+=1))
+do
+for (( tx=16; tx<=64; tx*=2))
+do
+
+# Arithmetic comparison. A plain "( (a*b*c) > N )" would instead run a subshell
+# and redirect its output into a file named N.
+if (( tx * ty * tz > MAX_THREADS ))
+then break
+fi
+
+for (( launch_bound=1; launch_bound<=8; launch_bound*=2))
+do
+for (( elems_per_thread=1; elems_per_thread<=128; elems_per_thread*=2))
+do
+    # Generate the threadblock configuration
+    ./gen_rk3_threadblockconf "${tx}" "${ty}" "${tz}" "${elems_per_thread}" "${launch_bound}"
+    mv rk3_threadblock.conf "${TBCONFFILE_DST_PATH}"
+
+    # Compile and run the test build. The benchmark is skipped if the build
+    # fails so that a stale binary is never measured.
+    #if ./ac_run -t; then
+    #    echo Success
+    cmake -DBUILD_DEBUG=OFF -DDOUBLE_PRECISION=OFF -DAUTO_OPTIMIZE=ON .. && make -j && ./ac_run -b
+    #else
+    #    echo fail!
+    #fi
+done
+done
+done
+done
+done
+
+rm -f gen_rk3_threadblockconf
+
--- a/scripts/buildtest.sh
+++ b/scripts/buildtest.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+cmake -DCUDA_BUILD_LEGACY=OFF -DDOUBLE_PRECISION=ON .. && make -j && valgrind --leak-check=full --show-leak-kinds=all ./ac_run -t && make clean &&\
+cmake -DCUDA_BUILD_LEGACY=OFF -DDOUBLE_PRECISION=OFF .. && make -j && valgrind --leak-check=full --show-leak-kinds=all ./ac_run -t
--- a/scripts/compile_acc.sh
+++ b/scripts/compile_acc.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+#!/bin/bash
+if [ -z $AC_HOME ]
+then
+       echo "ASTAROTH_HOME environment variable not set, run \"source ./sourceme.sh\" in Astaroth home directory"
+       exit 1
+fi
+
+KERNEL_DIR=${AC_HOME}"/src/core/kernels"
+ACC_DIR=${AC_HOME}"/acc"
+ACC_DEFAULT_SAS="mhd_solver/stencil_assembly.sas"
+ACC_DEFAULT_SPS="mhd_solver/stencil_process.sps"
+
+${ACC_DIR}/clean.sh
+${ACC_DIR}/build_acc.sh
+
+
+ACC_SAS=${ACC_DEFAULT_SAS}
+ACC_SPS=${ACC_DEFAULT_SPS}
+
+while [ "$#" -gt 0 ]
+do
+	case $1 in  
+		-h|--help)
+			echo "You can set a custom files for DSL under the path $AC_HOME/"
+			echo "Example:"
+			echo "compile_acc.sh -a custom_setup/custom_assembly.sas -p custom_setup/custom_process.sps"
+			exit 0
+			;;
+		-a|--assembly)
+			shift
+                        ACC_SAS=${1}
+			shift
+                        echo "CUSTOM Assembly file!"
+			;;
+		-p|--process)
+			shift
+                        ACC_SPS=${1}
+			shift
+			echo "CUSTOM Process file!"
+			;;
+		*)
+			break
+	esac
+done
+
+echo "Assembly file: ${ACC_DIR}/${ACC_SAS}"
+echo "Process file: ${ACC_DIR}/${ACC_SPS}"
+
+cd ${KERNEL_DIR}
+${ACC_DIR}/compile.sh ${ACC_DIR}/${ACC_SAS}
+${ACC_DIR}/compile.sh ${ACC_DIR}/${ACC_SPS}
--- a/scripts/fix_style.sh
+++ b/scripts/fix_style.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+if [[ $1 == "DO" && $2 == "IT!" ]]; then
+    find -name \*.h -o -name \*.cc -o -name \*.cu -o -name \*.cuh | xargs clang-format-6.0 -i -style=file
+    echo "It is done."
+else
+    find -name \*.h -o -name \*.cc -o -name \*.cu -o -name \*.cuh
+    echo "I'm going to try to fix the style of these files."
+    echo "If you're absolutely sure, give \"DO IT!\" (without quotes) as a parameter."
+fi
--- a/scripts/gen_rk3_threadblockconf.c
+++ b/scripts/gen_rk3_threadblockconf.c
@@ -0,0 +1,60 @@
+/*
+    Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+/**
+ * @file
+ * \brief Generates a threadblock config file for RK3 using the given parameters.
+ *
+ *
+ */
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <assert.h>
+
+const char* rk3_threadblockconf_path = "rk3_threadblock.conf";
+
+/**
+ * Writes the given threadblock parameters as preprocessor defines into the
+ * file named by rk3_threadblockconf_path, overwriting any previous contents.
+ *
+ * Returns EXIT_SUCCESS if the file was opened, fully written and closed
+ * without errors, EXIT_FAILURE otherwise.
+ */
+int
+write_to_file(int threads_x, int threads_y, int threads_z, int elems_per_thread, int launch_bound)
+{
+    FILE* fp = fopen(rk3_threadblockconf_path, "w");
+    if (fp == NULL)
+        return EXIT_FAILURE;
+
+    // fprintf returns a negative value on failure
+    const int write_failed =
+        fprintf(fp, "#define RK_THREADS_X (%d)\n", threads_x) < 0 ||
+        fprintf(fp, "#define RK_THREADS_Y (%d)\n", threads_y) < 0 ||
+        fprintf(fp, "#define RK_THREADS_Z (%d)\n", threads_z) < 0 ||
+        fprintf(fp, "#define RK_ELEMS_PER_THREAD (%d)\n", elems_per_thread) < 0 ||
+        fprintf(fp, "#define RK_LAUNCH_BOUND_MIN_BLOCKS (%d)\n", launch_bound) < 0;
+
+    // Buffered data is flushed in fclose, so its result must be checked as well.
+    // The file is always closed, even if a write failed.
+    const int close_failed = fclose(fp) != 0;
+
+    return (write_failed || close_failed) ? EXIT_FAILURE : EXIT_SUCCESS;
+}
+
+// Takes five integer arguments and writes them into the threadblock config file
+// in the order
+// RK_THREADS_X, RK_THREADS_Y, RK_THREADS_Z, RK_ELEMS_PER_THREAD, RK_LAUNCH_BOUND_MIN_BLOCKS
+// Returns EXIT_FAILURE if the argument count is wrong or the file cannot be written.
+int
+main(int argc, char* argv[])
+{
+    // Checked explicitly instead of with assert: assert is compiled out with
+    // NDEBUG, after which argv would be read out of bounds
+    if (argc != 6) {
+        fprintf(stderr,
+                "Usage: %s threads_x threads_y threads_z elems_per_thread launch_bound\n",
+                argc > 0 ? argv[0] : "gen_rk3_threadblockconf");
+        return EXIT_FAILURE;
+    }
+
+    return write_to_file(atoi(argv[1]), atoi(argv[2]), atoi(argv[3]), atoi(argv[4]),
+                         atoi(argv[5]));
+}
--- a/scripts/generate_doc.sh
+++ b/scripts/generate_doc.sh
@@ -0,0 +1,2 @@
+#!/bin/bash
+doxygen doxyfile
--- a/sourceme.sh
+++ b/sourceme.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+export AC_HOME=$PWD
+export PATH=${PATH}:$AC_HOME/scripts/
+
+echo $AC_HOME
+echo $PATH
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -0,0 +1,70 @@
+########################################
+##  CMakeLists.txt for Astaroth Core  ##
+########################################
+
+#----------------------Find CUDA-----------------------------------------------#
+
+find_package(CUDA)
+if (NOT CUDA_FOUND)
+    # find_package(CUDA REQUIRED) gives a confusing error message if it fails,
+    # therefore we print the reason here explicitly
+    message(FATAL_ERROR "CUDA not found")
+endif()
+
+
+#----------------------CUDA settings-------------------------------------------#
+
+set(CUDA_SEPARABLE_COMPILATION ON)
+set(CUDA_PROPAGATE_HOST_FLAGS ON)
+
+# CUDA_BUILD_CUBIN requires that we're compiling for only one architecture
+# set(CUDA_BUILD_CUBIN ON)
+
+
+#----------------------Setup CUDA compilation flags----------------------------#
+
+# Generate code for Kepler (sm_37), Maxwell (sm_50) and Pascal (sm_60, sm_61).
+# -ftz=true flushes denormalized floats to zero.
+set(CUDA_ARCH_FLAGS -gencode arch=compute_37,code=sm_37
+                    -gencode arch=compute_50,code=sm_50
+                    -gencode arch=compute_60,code=sm_60
+                    -gencode arch=compute_61,code=sm_61
+                    -lineinfo
+                    --maxrregcount=255
+                    -ftz=true
+                    -std=c++11)
+# -Xptxas -dlcm=ca opt-in to cache all global loads to L1/texture cache
+# =cg to opt out
+
+# Additional CUDA optimization flags
+if (CMAKE_BUILD_TYPE MATCHES RELEASE)
+    # Doesn't set any additional flags, see CUDA_NVCC_FLAGS_DEBUG below on how
+    # to add more
+    set(CUDA_NVCC_FLAGS_RELEASE ${CUDA_NVCC_FLAGS_RELEASE})
+endif()
+
+# Additional CUDA debug flags
+if (CMAKE_BUILD_TYPE MATCHES DEBUG)
+    # The debug flags must be set inside this if clause, since either CMake 3.5
+    # or nvcc 7.5 is bugged:
+    # CMake converts these into empty strings when doing RELEASE build, but nvcc
+    # 7.5 fails to parse empty flags.
+    list(APPEND CUDA_NVCC_FLAGS_DEBUG
+         --device-debug
+         --generate-line-info
+         --ptxas-options=-v)
+endif()
+
+# list(APPEND) does not create an empty leading element when CUDA_NVCC_FLAGS is
+# initially empty, unlike the string form "${CUDA_NVCC_FLAGS};..."
+list(APPEND CUDA_NVCC_FLAGS ${CUDA_ARCH_FLAGS})
+
+
+# Quoted so that the flags are printed as one ;-separated list instead of being
+# concatenated without separators
+message("CUDA_NVCC_FLAGS: ${CUDA_NVCC_FLAGS}")
+
+
+#------------------Compile and create a static library-------------------------#
+file(GLOB CUDA_SOURCES "*.cu" "kernels/*.cu")
+
+# Use -fPIC if -fpic not supported. Some quick non-scientific tests:
+# Without fpic: 4.94 user, 4.04 system, 0:09.88 elapsed
+# With fpic: 4.96 user, 4.02 system, 0:09.90 elapsed
+# With fPIC: 4.94 user, 4.05 system, 0:10.23 elapsed
+CUDA_ADD_LIBRARY(astaroth_core STATIC ${CUDA_SOURCES} OPTIONS --compiler-options "-fpic")
--- a/src/core/astaroth.cu
+++ b/src/core/astaroth.cu
@@ -0,0 +1,451 @@
+/*
+    Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
+ * \brief Multi-GPU implementation.
+ *
+ * Detailed info.
+ *
+ */
+#include "astaroth.h"
+#include "errchk.h"
+
+#include "device.cuh"
+#include "math_utils.h" // sum for reductions
+#include "standalone/config_loader.h" // update_config
+
+const char* intparam_names[]      = {AC_FOR_INT_PARAM_TYPES(AC_GEN_STR)};
+const char* realparam_names[]     = {AC_FOR_REAL_PARAM_TYPES(AC_GEN_STR)};
+const char* vtxbuf_names[]        = {AC_FOR_VTXBUF_HANDLES(AC_GEN_STR)};
+
+
+static const int MAX_NUM_DEVICES = 32;
+static int num_devices = 1;
+static Device devices[MAX_NUM_DEVICES] = {};
+
+typedef struct {
+    int3 m;
+    int3 n;
+} Grid;
+
/** Builds a Grid from the AC_m{x,y,z} and AC_n{x,y,z} integer parameters of
+ * the given configuration. */
+static Grid
+createGrid(const AcMeshInfo& config)
+{
+    Grid result;
+
+    // Dimensions read from the AC_m* parameters
+    result.m.x = config.int_params[AC_mx];
+    result.m.y = config.int_params[AC_my];
+    result.m.z = config.int_params[AC_mz];
+
+    // Dimensions read from the AC_n* parameters
+    result.n.x = config.int_params[AC_nx];
+    result.n.y = config.int_params[AC_ny];
+    result.n.z = config.int_params[AC_nz];
+
+    return result;
+}
+
+static Grid grid; // A grid consists of num_devices subgrids
+static Grid subgrid;
+
+/** Maps the 3D index (i, j, k) to a linear index in a grid whose row length is
+ * grid.m.x and whose plane size is grid.m.x * grid.m.y. */
+static int
+gridIdx(const Grid& grid, const int i, const int j, const int k)
+{
+    const int row_stride   = grid.m.x;
+    const int plane_stride = grid.m.x * grid.m.y;
+
+    return i + j * row_stride + k * plane_stride;
+}
+
+/** Inverse of gridIdx(): splits a linear index into its (x, y, z) components
+ * using the m dimensions of the grid. */
+static int3
+gridIdx3d(const Grid& grid, const int idx)
+{
+    const int plane_size = grid.m.x * grid.m.y;
+
+    int3 pos;
+    pos.x = idx % grid.m.x;
+    pos.y = (idx % plane_size) / grid.m.x;
+    pos.z = idx / plane_size;
+    return pos;
+}
+
+/** Prints the vector to stdout as "(x, y, z)" without a trailing newline. */
+void
+printInt3(const int3 vec)
+{
+    const int x = vec.x;
+    const int y = vec.y;
+    const int z = vec.z;
+
+    printf("(%d, %d, %d)", x, y, z);
+}
+
+/**
+ * Detects the available CUDA devices, decomposes the domain given by config
+ * along the z axis into one subgrid per device and creates the devices.
+ *
+ * Returns AC_FAILURE if the device count cannot be queried or no device is
+ * found, AC_SUCCESS otherwise. AC_nz must be divisible by the number of
+ * devices used and every subgrid dimension must be at least STENCIL_ORDER
+ * (enforced with ERRCHK_ALWAYS).
+ */
+AcResult
+acInit(const AcMeshInfo& config)
+{
+    // Check devices. The result of the query is checked explicitly: if the call
+    // fails, the device count cannot be trusted.
+    int device_count = 0;
+    const cudaError_t count_status = cudaGetDeviceCount(&device_count);
+    if (count_status != cudaSuccess || device_count < 1) {
+        num_devices = 0; // No device is initialized, nothing to sync or destroy
+        ERROR("No CUDA devices found!");
+        return AC_FAILURE;
+    }
+    num_devices = device_count;
+
+    if (num_devices > MAX_NUM_DEVICES) {
+        WARNING("More devices found than MAX_NUM_DEVICES. Using only MAX_NUM_DEVICES");
+        num_devices = MAX_NUM_DEVICES;
+    }
+    if (!AC_MULTIGPU_ENABLED) {
+        WARNING("MULTIGPU_ENABLED was false. Using only one device");
+        num_devices = 1; // Use only one device if multi-GPU is not enabled
+    }
+    // Check that AC_nz is divisible by num_devices. This makes decomposing the
+    // problem domain to multiple GPUs much easier since we do not have to worry
+    // about remainders
+    ERRCHK_ALWAYS(config.int_params[AC_nz] % num_devices == 0);
+
+    // Decompose the problem domain
+    // The main grid
+    grid = createGrid(config);
+
+    // Subgrids
+    AcMeshInfo subgrid_config = config;
+    subgrid_config.int_params[AC_nz] /= num_devices;
+    update_config(&subgrid_config);
+    subgrid = createGrid(subgrid_config);
+
+    // Periodic boundary conditions become weird if the system can "fold unto itself".
+    ERRCHK_ALWAYS(subgrid.n.x >= STENCIL_ORDER);
+    ERRCHK_ALWAYS(subgrid.n.y >= STENCIL_ORDER);
+    ERRCHK_ALWAYS(subgrid.n.z >= STENCIL_ORDER);
+
+    printf("Grid m "); printInt3(grid.m); printf("\n");
+    printf("Grid n "); printInt3(grid.n); printf("\n");
+    printf("Subgrid m "); printInt3(subgrid.m); printf("\n");
+    printf("Subgrid n "); printInt3(subgrid.n); printf("\n");
+
+    // Initialize the devices
+    for (int i = 0; i < num_devices; ++i) {
+        createDevice(i, subgrid_config, &devices[i]);
+        printDeviceInfo(devices[i]);
+    }
+    return AC_SUCCESS;
+}
+
+/** Calls destroyDevice() on each of the num_devices devices in use, in
+ * ascending order. Always returns AC_SUCCESS. */
+AcResult
+acQuit(void)
+{
+    int device_idx = 0;
+    while (device_idx < num_devices) {
+        destroyDevice(devices[device_idx]);
+        ++device_idx;
+    }
+    return AC_SUCCESS;
+}
+
+/** Same mapping as gridIdx(), but takes the 3D index packed in an int3. */
+int
+gridIdxx(const Grid grid, const int3 idx)
+{
+    const int row_offset   = idx.y * grid.m.x;
+    const int plane_offset = idx.z * grid.m.x * grid.m.y;
+
+    return idx.x + row_offset + plane_offset;
+}
+
+/**
+ * Copies num_vertices vertices of host_mesh, starting from the 3D index src,
+ * to the devices whose subgrid overlaps that range. Always returns AC_SUCCESS.
+ */
+AcResult
+acLoadWithOffset(const AcMesh& host_mesh, const int3& src, const int num_vertices)
+{
+    /*
+    Here we decompose the host mesh and distribute it among the GPUs in
+    the node.
+
+    The host mesh is a huge contiguous block of data. Its dimensions are given by
+    the global variable named "grid". A "grid" is decomposed into "subgrids",
+    one for each GPU. Here we check which parts of the range s0...s1 maps
+    to the memory space stored by some GPU, ranging d0...d1, and transfer
+    the data if needed.
+
+    The index mapping is inherently quite involved, but here's a picture which
+    hopefully helps make sense out of all this.
+
+
+    Grid
+                                     |----num_vertices---|
+    xxx|....................................................|xxx
+             ^                   ^   ^                   ^
+            d0                  d1  s0 (src)            s1
+
+    Subgrid
+
+             xxx|.............|xxx
+             ^                   ^
+            d0                  d1
+
+                                 ^   ^
+                                db  da
+
+    */
+    // The source range does not depend on the device, compute it only once
+    const int3 s0 = src;
+    const int3 s1 = gridIdx3d(grid, gridIdx(grid, s0.x, s0.y, s0.z) + num_vertices);
+
+    for (int i = 0; i < num_devices; ++i) {
+        const int3 d0 = (int3){0, 0, i * subgrid.n.z}; // DECOMPOSITION OFFSET HERE
+        const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.m.z};
+
+        const int3 da = (int3){max(s0.x, d0.x), max(s0.y, d0.y), max(s0.z, d0.z)};
+        const int3 db = (int3){min(s1.x, d1.x), min(s1.y, d1.y), min(s1.z, d1.z)};
+        /*
+        printf("Device %d\n", i);
+        printf("\ts0: "); printInt3(s0); printf("\n");
+        printf("\td0: "); printInt3(d0); printf("\n");
+        printf("\tda: "); printInt3(da); printf("\n");
+        printf("\tdb: "); printInt3(db); printf("\n");
+        printf("\td1: "); printInt3(d1); printf("\n");
+        printf("\ts1: "); printInt3(s1); printf("\n");
+        printf("\t-> %s to device %d\n", db.z >= da.z ? "Copy" : "Do not copy", i);
+        */
+        if (db.z >= da.z) {
+            const int copy_cells = gridIdxx(subgrid, db) - gridIdxx(subgrid, da);
+            const int3 da_local = (int3){da.x, da.y, da.z - i * grid.n.z / num_devices}; // DECOMPOSITION OFFSET HERE
+            // printf("\t\tcopy %d cells to local index ", copy_cells); printInt3(da_local); printf("\n");
+            copyMeshToDevice(devices[i], STREAM_PRIMARY, host_mesh, da, da_local, copy_cells);
+        }
+    }
+    return AC_SUCCESS;
+}
+
+/**
+ * Copies num_vertices vertices, starting from the 3D index src, from the
+ * devices whose subgrid overlaps that range back to host_mesh.
+ * Always returns AC_SUCCESS.
+ */
+AcResult
+acStoreWithOffset(const int3& src, const int num_vertices, AcMesh* host_mesh)
+{
+    // See acLoadWithOffset() for an explanation of the index mapping
+
+    // The source range does not depend on the device, compute it only once
+    const int3 s0 = src;
+    const int3 s1 = gridIdx3d(grid, gridIdx(grid, s0.x, s0.y, s0.z) + num_vertices);
+
+    for (int i = 0; i < num_devices; ++i) {
+        const int3 d0 = (int3){0, 0, i * subgrid.n.z}; // DECOMPOSITION OFFSET HERE
+        const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.m.z};
+
+        const int3 da = (int3){max(s0.x, d0.x), max(s0.y, d0.y), max(s0.z, d0.z)};
+        const int3 db = (int3){min(s1.x, d1.x), min(s1.y, d1.y), min(s1.z, d1.z)};
+        /*
+        printf("Device %d\n", i);
+        printf("\ts0: "); printInt3(s0); printf("\n");
+        printf("\td0: "); printInt3(d0); printf("\n");
+        printf("\tda: "); printInt3(da); printf("\n");
+        printf("\tdb: "); printInt3(db); printf("\n");
+        printf("\td1: "); printInt3(d1); printf("\n");
+        printf("\ts1: "); printInt3(s1); printf("\n");
+        printf("\t-> %s to device %d\n", db.z >= da.z ? "Copy" : "Do not copy", i);
+        */
+        if (db.z >= da.z) {
+            const int copy_cells = gridIdxx(subgrid, db) - gridIdxx(subgrid, da);
+            const int3 da_local = (int3){da.x, da.y, da.z - i * grid.n.z / num_devices}; // DECOMPOSITION OFFSET HERE
+            // printf("\t\tcopy %d cells from local index ", copy_cells); printInt3(da_local); printf("\n");
+            copyMeshToHost(devices[i], STREAM_PRIMARY, da_local, da, copy_cells, host_mesh);
+        }
+    }
+    return AC_SUCCESS;
+}
+
+// acCopyMeshToDevice
+/** Loads the whole host mesh: forwards to acLoadWithOffset() with a zero
+ * offset and AC_VTXBUF_SIZE(host_mesh.info) vertices. */
+AcResult
+acLoad(const AcMesh& host_mesh)
+{
+    const int3 origin      = (int3){0, 0, 0};
+    const int num_vertices = AC_VTXBUF_SIZE(host_mesh.info);
+
+    return acLoadWithOffset(host_mesh, origin, num_vertices);
+}
+
+// acCopyMeshToHost
+/** Stores the whole mesh: forwards to acStoreWithOffset() with a zero offset
+ * and AC_VTXBUF_SIZE(host_mesh->info) vertices. */
+AcResult
+acStore(AcMesh* host_mesh)
+{
+    const int3 origin      = (int3){0, 0, 0};
+    const int num_vertices = AC_VTXBUF_SIZE(host_mesh->info);
+
+    return acStoreWithOffset(origin, num_vertices, host_mesh);
+}
+
+/** Launches rkStep() for substep isubstep on every device in use. The range
+ * passed to rkStep() starts STENCIL_ORDER/2 vertices in from the origin and
+ * spans subgrid.n vertices along each axis. Always returns AC_SUCCESS. */
+AcResult
+acIntegrateStep(const int& isubstep, const AcReal& dt)
+{
+    const int offset = STENCIL_ORDER/2;
+
+    const int3 start = (int3){offset, offset, offset};
+    const int3 end   = (int3){offset + subgrid.n.x,
+                              offset + subgrid.n.y,
+                              offset + subgrid.n.z};
+
+    int device_idx = 0;
+    while (device_idx < num_devices) {
+        rkStep(devices[device_idx], STREAM_PRIMARY, isubstep, start, end, dt);
+        ++device_idx;
+    }
+
+    return AC_SUCCESS;
+}
+
+/**
+ * Applies the boundary conditions on all devices in use.
+ *
+ * With a single device, boundcondStep() is called on the range from (0, 0, 0)
+ * to subgrid.m. With several devices, boundcondStep() is first called on each
+ * device for the z range [STENCIL_ORDER/2, STENCIL_ORDER/2 + subgrid.n.z) and
+ * then STENCIL_ORDER/2 xy-planes are copied in both directions between
+ * devices i and (i+1) % num_devices. acSynchronize() is called at the start
+ * and at the end. Always returns AC_SUCCESS.
+ */
+AcResult
+acBoundcondStep(void)
+{
+    acSynchronize();
+    if (num_devices == 1) {
+        boundcondStep(devices[0], STREAM_PRIMARY,
+                      (int3){0, 0, 0}, (int3){subgrid.m.x, subgrid.m.y, subgrid.m.z});
+    } else {
+        // Local boundary conditions
+        for (int i = 0; i < num_devices; ++i) {
+            const int3 d0 = (int3){0, 0, STENCIL_ORDER/2}; // DECOMPOSITION OFFSET HERE
+            const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.n.z};
+            boundcondStep(devices[i], STREAM_PRIMARY, d0, d1);
+        }
+
+/*
+// ===MIIKKANOTE START==========================================
+%JP: The old way for computing boundary conditions conflicts with the
+way we have to do things with multiple GPUs.
+
+The older approach relied on unified memory, which represented the whole
+memory area as one huge mesh instead of several smaller ones. However, unified memory
+in its current state is more meant for quick prototyping when performance is not an issue.
+Getting the CUDA driver to migrate data intelligently across GPUs is much more difficult than
+when managing the memory explicitly.
+
+In this new approach, I have simplified the multi- and single-GPU layers significantly.
+Quick rundown:
+    New struct: Grid. There are two global variables, "grid" and "subgrid", which
+    contain the extents of the whole simulation domain and the decomposed grids, respectively.
+    To simplify things, we require that each GPU is assigned the same amount of work,
+    therefore each GPU in the node is assigned a "subgrid.m" -sized block of data
+    to work with.
+
+    The whole simulation domain is decomposed with respect to the z dimension.
+    For example, if the grid contains (nx, ny, nz) vertices, then the subgrids
+    contain (nx, ny, nz / num_devices) vertices.
+
+    A local index (i, j, k) in some subgrid can be mapped to the global grid with
+        global idx = (i, j, k + device_id * subgrid.n.z)
+
+Terminology:
+    - Single-GPU function: a function defined on the single-GPU layer (device.cu)
+
+Changes required to this commented code block:
+    - The thread block dimensions (tpb) are no longer passed to the kernel here but in device.cu
+      instead. Same holds for any complex index calculations. Instead, the local coordinates
+      should be passed as an int3 type without having to consider how the data is actually
+      laid out in device memory
+    - The unified memory buffer no longer exists (d_buffer). Instead, we have an opaque handle
+      of type "Device" which should be passed to single-GPU functions. In this file, all devices
+      are stored in a global array "devices[num_devices]".
+    - Every single-GPU function is executed asynchronously by default such that we
+      can optimize Astaroth by executing memory transactions concurrently with computation.
+      Therefore a StreamType should be passed as a parameter to single-GPU functions.
+      Refresher: CUDA function calls are non-blocking when a stream is explicitly passed
+      as a parameter and commands executing in different streams can be processed
+      in parallel/concurrently.
+
+
+Note on periodic boundaries (might be helpful when implementing other boundary conditions):
+
+    With multiple GPUs, periodic boundary conditions applied on indices ranging from
+
+        (0, 0, STENCIL_ORDER/2) to (subgrid.m.x, subgrid.m.y, subgrid.m.z - STENCIL_ORDER/2)
+
+    on a single device are "local", in the sense that they can be computed without having
+    to exchange data with neighboring GPUs. Special care is needed only for transferring
+    the data to the front and back plates outside this range. In the solution we use here,
+    we solve the local boundaries first, and then just exchange the front and back plates
+    in a "ring", like so
+                device_id
+            (n) <-> 0 <-> 1 <-> ... <-> n <-> (0)
+
+
+// ======MIIKKANOTE END==========================================
+
+<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MIIKKANOTE: This code block was essentially
+                                                          moved into device.cu, function boundCondStep()
+                                                          In astaroth.cu, we use acBoundcondStep()
+                                                          just to distribute the work and manage
+                                                          communication between GPUs.
+
+    printf("Boundconds best dims (%d, %d, %d) %f ms\n", best_dims.x, best_dims.y, best_dims.z, double(best_time) / NUM_ITERATIONS);
+
+    exit(0);
+    #else
+
+
+        const int depth = (int)ceil(mesh_info.int_params[AC_mz]/(float)num_devices);
+
+        const int3 start = (int3){0, 0, device_id * depth};
+        const int3 end = (int3){mesh_info.int_params[AC_mx],
+                                mesh_info.int_params[AC_my],
+                                min((device_id+1) * depth, mesh_info.int_params[AC_mz])};
+
+        const dim3 tpb(8,2,8);
+
+        // TODO uses the default stream currently
+        if (mesh_info.int_params[AC_bc_type] == 666) { // TODO MAKE A BETTER SWITCH
+            wedge_boundconds(0, tpb, start, end, d_buffer);
+        } else {
+            for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i)
+                periodic_boundconds(0, tpb, start, end, d_buffer.in[i]);
+<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+*/
+        // Exchange halos: STENCIL_ORDER/2 xy-planes are copied in each direction
+        // between device i and its successor (i+1) % num_devices
+        for (int i = 0; i < num_devices; ++i) {
+            const int num_vertices = subgrid.m.x * subgrid.m.y * STENCIL_ORDER/2;
+            // Planes starting at z = subgrid.n.z of device i are copied to z = 0 of
+            // the next device
+            // ...|ooooxxx|... -> xxx|ooooooo|...
+            {
+                const int3 src = (int3) {0, 0, subgrid.n.z};
+                const int3 dst = (int3) {0, 0, 0};
+                copyMeshDeviceToDevice(devices[i], STREAM_PRIMARY, src, devices[(i+1) % num_devices], dst, num_vertices);
+            }
+            // Planes starting at z = STENCIL_ORDER/2 of the next device are copied to
+            // z = STENCIL_ORDER/2 + subgrid.n.z of device i
+            // ...|ooooooo|xxx <- ...|xxxoooo|...
+            {
+                const int3 src = (int3) {0, 0, STENCIL_ORDER/2};
+                const int3 dst = (int3) {0, 0, STENCIL_ORDER/2 + subgrid.n.z};
+                copyMeshDeviceToDevice(devices[(i+1) % num_devices], STREAM_PRIMARY, src, devices[i], dst, num_vertices);
+            }
+        }
+    }
+    acSynchronize();
+    return AC_SUCCESS;
+}
+
+/** Calls swapBuffers() on every device in use. Always returns AC_SUCCESS. */
+static AcResult
+acSwapBuffers(void)
+{
+    int device_idx = 0;
+    while (device_idx < num_devices) {
+        swapBuffers(devices[device_idx]);
+        ++device_idx;
+    }
+    return AC_SUCCESS;
+}
+
+/** Runs three substeps. Each substep applies the boundary conditions,
+ * performs the integration substep and then swaps the buffers.
+ * Always returns AC_SUCCESS. */
+AcResult
+acIntegrate(const AcReal& dt)
+{
+    const int num_substeps = 3;
+
+    int isubstep = 0;
+    while (isubstep < num_substeps) {
+        acBoundcondStep();
+        acIntegrateStep(isubstep, dt);
+        acSwapBuffers();
+        ++isubstep;
+    }
+    return AC_SUCCESS;
+}
+
+/** Placeholder for the scalar reduction: not implemented yet, ignores its
+ * arguments and returns 0. */
+AcReal
+acReduceScal(const ReductionType& rtype,
+             const VertexBufferHandle& vtxbuffer_handle)
+{
+    // TODO
+    (void)rtype;            // Unused until the reduction is implemented
+    (void)vtxbuffer_handle; // Unused until the reduction is implemented
+
+    const AcReal result = 0;
+    return result;
+}
+
+/** Placeholder for the vector reduction: not implemented yet, ignores its
+ * arguments and returns 0. */
+AcReal
+acReduceVec(const ReductionType& rtype, const VertexBufferHandle& a,
+            const VertexBufferHandle& b, const VertexBufferHandle& c)
+{
+    // TODO
+    (void)rtype; // Unused until the reduction is implemented
+    (void)a;
+    (void)b;
+    (void)c;
+
+    const AcReal result = 0;
+    return result;
+}
+
+/** Calls synchronize() with STREAM_ALL on every device in use.
+ * Always returns AC_SUCCESS. */
+AcResult
+acSynchronize(void)
+{
+    int device_idx = 0;
+    while (device_idx < num_devices) {
+        synchronize(devices[device_idx], STREAM_ALL);
+        ++device_idx;
+    }
+
+    return AC_SUCCESS;
+}
--- a/src/core/device.cu
+++ b/src/core/device.cu
@@ -0,0 +1,309 @@
+/*
+    Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
+ * \brief Brief info.
+ *
+ * Detailed info.
+ *
+ */
+#include "device.cuh"
+
+#include "errchk.h"
+
+/**
+ * Pair of buffer sets, one pointer per vertex buffer handle.
+ * createDevice() allocates every entry with cudaMalloc, and swapBuffers()
+ * exchanges in[i] with out[i] for each handle.
+ */
+typedef struct {
+    AcReal* in[NUM_VTXBUF_HANDLES];  // Buffers that boundary conditions and mesh copies operate on
+    AcReal* out[NUM_VTXBUF_HANDLES]; // Counterpart buffers exchanged with `in` by swapBuffers()
+} VertexBufferArray;
+
+// Mesh configuration in CUDA constant memory. createDevice() fills it with
+// cudaMemcpyToSymbol from the device_config it is given.
+__constant__ AcMeshInfo d_mesh_info;
+// Device-side accessors for the integer and real parameters of d_mesh_info.
+#define DCONST_INT(X)  (d_mesh_info.int_params[X])
+#define DCONST_REAL(X) (d_mesh_info.real_params[X])
+// Linear index of vertex (i, j, k): i varies fastest, rows are AC_mx apart and
+// planes are AC_mxy apart.
+#define DEVICE_VTXBUF_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_mx) + (k)*DCONST_INT(AC_mxy))
+// Included after the macros above because the kernels use them.
+#include "kernels/kernels.cuh"
+
+/**
+ * State kept for one GPU. Instances are created by createDevice() and
+ * released by destroyDevice(); users hold them through the opaque `Device`
+ * handle.
+ */
+struct device_s {
+    int id;                  // CUDA device ordinal passed to cudaSetDevice()
+    AcMeshInfo local_config; // Copy of the configuration given to createDevice()
+
+    // Concurrency
+    cudaStream_t streams[NUM_STREAM_TYPES]; // One stream per StreamType below NUM_STREAM_TYPES
+
+    // Memory
+    VertexBufferArray vba;     // Device-side input/output vertex buffers
+    AcReal* reduce_scratchpad; // Device allocation of AC_VTXBUF_COMPDOMAIN_SIZE_BYTES bytes
+    AcReal* reduce_result;     // Device allocation holding a single AcReal
+};
+
+/**
+ * Prints the hardware properties and the current memory usage of `device`
+ * to stdout.
+ *
+ * The device is made current first, because cudaMemGetInfo() reports on the
+ * current device rather than taking a device id. Without this the memory
+ * figures could belong to a different GPU than the one named in the header.
+ *
+ * Always returns AC_SUCCESS; the return codes of the CUDA queries are not
+ * checked.
+ */
+AcResult
+printDeviceInfo(const Device device)
+{
+    const int device_id = device->id;
+    cudaSetDevice(device_id);
+
+    cudaDeviceProp props;
+    cudaGetDeviceProperties(&props, device_id);
+    printf("--------------------------------------------------\n");
+    printf("Device Number: %d\n", device_id);
+    const size_t bus_id_max_len = 128;
+    char bus_id[bus_id_max_len];
+    cudaDeviceGetPCIBusId(bus_id, bus_id_max_len, device_id);
+    printf("  PCI bus ID: %s\n", bus_id);
+    printf("    Device name: %s\n", props.name);
+    printf("    Compute capability: %d.%d\n", props.major, props.minor);
+
+    // Compute
+    printf("  Compute\n");
+    printf("    Clock rate (GHz): %g\n", props.clockRate / 1e6); // KHz -> GHz
+    printf("    Stream processors: %d\n", props.multiProcessorCount);
+    printf("    SP to DP flops performance ratio: %d:1\n", props.singleToDoublePrecisionPerfRatio);
+    printf("    Compute mode: %d\n", (int)props.computeMode); // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g7eb25f5413a962faad0956d92bae10d0
+    // Memory
+    printf("  Global memory\n");
+    printf("    Memory Clock Rate (MHz): %d\n", props.memoryClockRate / (1000));
+    printf("    Memory Bus Width (bits): %d\n", props.memoryBusWidth);
+    // Factor 2: double data rate. Clock is converted KHz -> Hz, bits -> bytes -> GiB.
+    printf("    Peak Memory Bandwidth (GiB/s): %f\n",
+           2 * (props.memoryClockRate * 1e3) * props.memoryBusWidth /
+               (8. * 1024. * 1024. * 1024.));
+    printf("    ECC enabled: %d\n", props.ECCEnabled);
+    // Memory usage (of the device selected above)
+    size_t free_bytes, total_bytes;
+    cudaMemGetInfo(&free_bytes, &total_bytes);
+    const size_t used_bytes = total_bytes - free_bytes;
+    printf("    Total global mem: %.2f GiB\n",
+           props.totalGlobalMem / (1024.0 * 1024 * 1024));
+    printf("    Gmem used (GiB): %.2f\n", used_bytes / (1024.0 * 1024 * 1024));
+    printf("    Gmem memory free (GiB): %.2f\n",
+           free_bytes / (1024.0 * 1024 * 1024));
+    printf("    Gmem memory total (GiB): %.2f\n",
+           total_bytes / (1024.0 * 1024 * 1024));
+    printf("  Caches\n");
+    printf("    Local L1 cache supported: %d\n", props.localL1CacheSupported);
+    printf("    Global L1 cache supported: %d\n", props.globalL1CacheSupported);
+    printf("    L2 size: %d KiB\n", props.l2CacheSize / (1024));
+    // totalConstMem and sharedMemPerBlock are size_t: print them through an
+    // explicit unsigned long cast so the format specifier matches the argument.
+    printf("    Total const mem: %lu KiB\n",
+           (unsigned long)(props.totalConstMem / (1024)));
+    printf("    Shared mem per block: %lu KiB\n",
+           (unsigned long)(props.sharedMemPerBlock / (1024)));
+    printf("  Other\n");
+    printf("    Warp size: %d\n", props.warpSize);
+    printf("    Stream priorities supported: %d\n",
+           props.streamPrioritiesSupported);
+    printf("--------------------------------------------------\n");
+
+    return AC_SUCCESS;
+}
+
+// Empty kernel. createDevice() launches it once to check that a kernel built
+// for the compiled architecture can run on the selected device.
+static __global__ void dummy_kernel(void) {}
+
+/**
+ * Creates and initializes the state for CUDA device `id`.
+ *
+ * Steps: select and reset the device, allocate the host-side handle, launch
+ * a dummy kernel to verify the compiled architecture, create one stream per
+ * stream type, allocate the vertex buffers and reduction buffers, and upload
+ * `device_config` to constant memory.
+ *
+ * Every CUDA call is checked with the *_ALWAYS macros, so any failure during
+ * setup terminates the program instead of surfacing later as an unrelated
+ * error.
+ *
+ * @param id             CUDA device ordinal.
+ * @param device_config  Mesh configuration; stored in the handle and copied
+ *                       to d_mesh_info.
+ * @param device_handle  Receives the newly created handle.
+ * @return AC_SUCCESS.
+ */
+AcResult
+createDevice(const int id, const AcMeshInfo device_config, Device* device_handle)
+{
+    ERRCHK_CUDA_ALWAYS(cudaSetDevice(id));
+    ERRCHK_CUDA_ALWAYS(cudaDeviceReset());
+
+    // Create Device
+    struct device_s* device = (struct device_s*) malloc(sizeof(*device));
+    ERRCHK_ALWAYS(device);
+
+    device->id = id;
+    device->local_config = device_config;
+
+    // Check that the code was compiled for the proper GPU architecture
+    printf("Trying to run a dummy kernel. If this fails, make sure that your\n"
+           "device supports the CUDA architecture you are compiling for.\n"
+           "Running dummy kernel... ");
+    fflush(stdout);
+    dummy_kernel<<<1, 1>>>();
+    ERRCHK_CUDA_KERNEL_ALWAYS();
+    printf("Success!\n");
+
+    // Concurrency
+    for (int i = 0; i < NUM_STREAM_TYPES; ++i) {
+        ERRCHK_CUDA_ALWAYS(cudaStreamCreate(&device->streams[i]));
+    }
+
+    // Memory
+    const size_t vba_size_bytes = AC_VTXBUF_SIZE_BYTES(device_config);
+    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
+        ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.in[i], vba_size_bytes));
+        ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.out[i], vba_size_bytes));
+    }
+    ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->reduce_scratchpad,
+                                  AC_VTXBUF_COMPDOMAIN_SIZE_BYTES(device_config)));
+    ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->reduce_result, sizeof(AcReal)));
+
+    // Device constants
+    ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_mesh_info, &device_config, sizeof(device_config), 0,
+                                          cudaMemcpyHostToDevice));
+
+    // %p requires a void* argument
+    printf("Created device %d (%p)\n", device->id, (void*)device);
+    *device_handle = device;
+    return AC_SUCCESS;
+}
+
+/**
+ * Releases everything createDevice() set up for `device`: the vertex
+ * buffers, the reduction buffers, the streams and finally the handle itself.
+ * The handle must not be used afterwards. Always returns AC_SUCCESS.
+ */
+AcResult
+destroyDevice(Device device)
+{
+    cudaSetDevice(device->id);
+    printf("Destroying device %d (%p)\n", device->id, device);
+
+    // Device memory
+    int handle = 0;
+    while (handle < NUM_VTXBUF_HANDLES) {
+        cudaFree(device->vba.in[handle]);
+        cudaFree(device->vba.out[handle]);
+        ++handle;
+    }
+    cudaFree(device->reduce_scratchpad);
+    cudaFree(device->reduce_result);
+
+    // Streams
+    int stream_idx = 0;
+    while (stream_idx < NUM_STREAM_TYPES) {
+        cudaStreamDestroy(device->streams[stream_idx]);
+        ++stream_idx;
+    }
+
+    // Host-side handle
+    free(device);
+    return AC_SUCCESS;
+}
+
+/**
+ * Calls periodic_boundconds() over the index range given by `start` and
+ * `end` for the input buffer of every vertex buffer handle, using the
+ * stream selected by `stream_type`. Always returns AC_SUCCESS.
+ */
+AcResult
+boundcondStep(const Device device, const StreamType stream_type, const int3& start, const int3& end)
+{
+    cudaSetDevice(device->id);
+
+    const cudaStream_t stream = device->streams[stream_type];
+    int handle = 0;
+    while (handle < NUM_VTXBUF_HANDLES) {
+        periodic_boundconds(stream, start, end, device->vba.in[handle]);
+        ++handle;
+    }
+    return AC_SUCCESS;
+}
+
+/** Placeholder for a scalar reduction: it only makes `device` the current
+    CUDA device and returns AC_SUCCESS. */
+AcResult
+reduceScal(const Device device)
+{
+    const int device_id = device->id;
+    cudaSetDevice(device_id);
+    return AC_SUCCESS;
+}
+
+/** Placeholder for a vector reduction: it only makes `device` the current
+    CUDA device and returns AC_SUCCESS. */
+AcResult
+reduceVec(const Device device)
+{
+    const int device_id = device->id;
+    cudaSetDevice(device_id);
+    return AC_SUCCESS;
+}
+
+/**
+ * Makes `device` current and forwards the substep `step_number`, the index
+ * range given by `start` and `end`, and `dt` to rk3_step_async() on the
+ * stream selected by `stream_type`. Always returns AC_SUCCESS.
+ */
+AcResult
+rkStep(const Device device, const StreamType stream_type, const int step_number,
+       const int3& start, const int3& end, const AcReal dt)
+{
+    cudaSetDevice(device->id);
+
+    const cudaStream_t stream       = device->streams[stream_type];
+    VertexBufferArray* const buffers = &device->vba;
+    rk3_step_async(stream, step_number, start, end, dt, buffers);
+
+    return AC_SUCCESS;
+}
+
+/**
+ * Blocks the host until work on `device` has finished: the whole device for
+ * STREAM_ALL, otherwise only the stream selected by `stream_type`.
+ * Always returns AC_SUCCESS.
+ */
+AcResult
+synchronize(const Device device, const StreamType stream_type)
+{
+    cudaSetDevice(device->id);
+
+    // STREAM_ALL is not an index into device->streams, so handle it first
+    if (stream_type == STREAM_ALL) {
+        cudaDeviceSynchronize();
+        return AC_SUCCESS;
+    }
+
+    cudaStreamSynchronize(device->streams[stream_type]);
+    return AC_SUCCESS;
+}
+
+/**
+ * Enqueues an asynchronous host-to-device copy of `bytes` bytes from `src`
+ * to `dst` on the stream selected by `stream_type`. Always returns
+ * AC_SUCCESS.
+ */
+static AcResult
+loadWithOffset(const Device device, const StreamType stream_type,
+               const AcReal* src, const size_t bytes, AcReal* dst)
+{
+    cudaSetDevice(device->id);
+
+    const cudaStream_t stream = device->streams[stream_type];
+    ERRCHK_CUDA(cudaMemcpyAsync(dst, src, bytes, cudaMemcpyHostToDevice, stream));
+
+    return AC_SUCCESS;
+}
+
+/**
+ * Enqueues an asynchronous device-to-host copy of `bytes` bytes from `src`
+ * to `dst` on the stream selected by `stream_type`. Always returns
+ * AC_SUCCESS.
+ */
+static AcResult
+storeWithOffset(const Device device, const StreamType stream_type,
+                const AcReal* src, const size_t bytes, AcReal* dst)
+{
+    cudaSetDevice(device->id);
+
+    const cudaStream_t stream = device->streams[stream_type];
+    ERRCHK_CUDA(cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDeviceToHost, stream));
+
+    return AC_SUCCESS;
+}
+
+/**
+ * For every vertex buffer, copies `num_vertices` consecutive values from
+ * `host_mesh`, starting at vertex `src`, into the device input buffer,
+ * starting at vertex `dst`. The source offset is computed from
+ * host_mesh.info and the destination offset from the device's local
+ * configuration. Always returns AC_SUCCESS.
+ */
+AcResult
+copyMeshToDevice(const Device device, const StreamType stream_type,
+                 const AcMesh& host_mesh, const int3& src, const int3& dst,
+                 const int num_vertices)
+{
+    const size_t host_offset   = AC_VTXBUF_IDX(src.x, src.y, src.z, host_mesh.info);
+    const size_t device_offset = AC_VTXBUF_IDX(dst.x, dst.y, dst.z, device->local_config);
+    const size_t num_bytes     = num_vertices * sizeof(AcReal);
+
+    for (int handle = 0; handle < NUM_VTXBUF_HANDLES; ++handle) {
+        const AcReal* host_ptr = &host_mesh.vertex_buffer[handle][host_offset];
+        AcReal* device_ptr     = &device->vba.in[handle][device_offset];
+        loadWithOffset(device, stream_type, host_ptr, num_bytes, device_ptr);
+    }
+    return AC_SUCCESS;
+}
+
+/**
+ * For every vertex buffer, copies `num_vertices` consecutive values from the
+ * device input buffer, starting at vertex `src`, into `host_mesh`, starting
+ * at vertex `dst`. The source offset is computed from the device's local
+ * configuration and the destination offset from host_mesh->info.
+ * Always returns AC_SUCCESS.
+ */
+AcResult
+copyMeshToHost(const Device device, const StreamType stream_type,
+               const int3& src, const int3& dst, const int num_vertices,
+               AcMesh* host_mesh)
+{
+    const size_t device_offset = AC_VTXBUF_IDX(src.x, src.y, src.z, device->local_config);
+    const size_t host_offset   = AC_VTXBUF_IDX(dst.x, dst.y, dst.z, host_mesh->info);
+    const size_t num_bytes     = num_vertices * sizeof(AcReal);
+
+    for (int handle = 0; handle < NUM_VTXBUF_HANDLES; ++handle) {
+        const AcReal* device_ptr = &device->vba.in[handle][device_offset];
+        AcReal* host_ptr         = &host_mesh->vertex_buffer[handle][host_offset];
+        storeWithOffset(device, stream_type, device_ptr, num_bytes, host_ptr);
+    }
+    return AC_SUCCESS;
+}
+
+/**
+ * For every vertex buffer, enqueues a peer-to-peer copy of `num_vertices`
+ * consecutive values from the input buffer of `src_device` (starting at
+ * vertex `src`) to the input buffer of `dst_device` (starting at vertex
+ * `dst`). The copies are issued on the source device's stream selected by
+ * `stream_type`. Always returns AC_SUCCESS.
+ */
+AcResult
+copyMeshDeviceToDevice(const Device src_device, const StreamType stream_type,
+                       const int3& src, Device dst_device, const int3& dst,
+                       const int num_vertices)
+{
+    cudaSetDevice(src_device->id);
+
+    const size_t from_offset  = AC_VTXBUF_IDX(src.x, src.y, src.z, src_device->local_config);
+    const size_t to_offset    = AC_VTXBUF_IDX(dst.x, dst.y, dst.z, dst_device->local_config);
+    const size_t num_bytes    = sizeof(AcReal) * num_vertices;
+    const cudaStream_t stream = src_device->streams[stream_type];
+
+    for (int handle = 0; handle < NUM_VTXBUF_HANDLES; ++handle) {
+        AcReal* to_ptr         = &dst_device->vba.in[handle][to_offset];
+        const AcReal* from_ptr = &src_device->vba.in[handle][from_offset];
+        ERRCHK_CUDA(cudaMemcpyPeerAsync(to_ptr, dst_device->id, from_ptr, src_device->id,
+                                        num_bytes, stream));
+    }
+    return AC_SUCCESS;
+}
+
+
+/**
+ * Exchanges the input and output pointer of every vertex buffer of `device`.
+ * Only the pointers stored in the handle change; no data is copied.
+ * Always returns AC_SUCCESS.
+ */
+AcResult
+swapBuffers(const Device device)
+{
+    cudaSetDevice(device->id);
+
+    int handle = 0;
+    while (handle < NUM_VTXBUF_HANDLES) {
+        AcReal* const previous_in = device->vba.in[handle];
+        device->vba.in[handle]    = device->vba.out[handle];
+        device->vba.out[handle]   = previous_in;
+        ++handle;
+    }
+    return AC_SUCCESS;
+}
--- a/src/core/device.cuh
+++ b/src/core/device.cuh
@@ -0,0 +1,82 @@
+/*
+    Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
+ * \brief Brief info.
+ *
+ * Detailed info.
+ *
+ */
+#pragma once
+#include "astaroth.h"
+
+/**
+ * Selects one of the CUDA streams owned by a Device.
+ * STREAM_PRIMARY and STREAM_SECONDARY are the only values that index a real
+ * stream.
+ */
+typedef enum {
+  STREAM_PRIMARY,
+  STREAM_SECONDARY,
+  // Number of real streams; used as the length of the per-device stream array.
+  NUM_STREAM_TYPES,
+  // Sentinel understood by synchronize() as "all streams". It lies beyond
+  // NUM_STREAM_TYPES, so it must not be used to index the stream array.
+  STREAM_ALL
+} StreamType;
+
+// Opaque handle to the per-GPU state. struct device_s is only declared here,
+// so code including this header cannot access its members.
+typedef struct device_s* Device; // Opaque pointer to device_s. Analogous to dispatchable handles
+                                 // in Vulkan, f.ex. VkDevice
+
+/** Prints the hardware properties and memory usage of the device to stdout. */
+AcResult printDeviceInfo(const Device device);
+
+/** Selects and resets CUDA device `id`, allocates its streams and buffers,
+    uploads `device_config` to constant memory and stores the new handle in
+    `*device`. */
+AcResult createDevice(const int id, const AcMeshInfo device_config, Device* device);
+
+/** Frees the buffers, streams and handle created by createDevice(). The handle
+    must not be used afterwards. */
+AcResult destroyDevice(Device device);
+
+/** Applies periodic boundary conditions to the input buffer of every vertex
+    buffer, for destination indices from `start` (inclusive) to `end`
+    (exclusive), on the given stream. */
+AcResult boundcondStep(const Device device, const StreamType stream_type,
+                       const int3& start, const int3& end);
+
+/** Not implemented yet: only makes the device current. */
+AcResult reduceScal(const Device device);
+
+/** Not implemented yet: only makes the device current. */
+AcResult reduceVec(const Device device);
+
+/** Enqueues substep `step_number` of the integration over the index range
+    given by `start` and `end` with timestep `dt` on the given stream. */
+AcResult rkStep(const Device device, const StreamType stream_type, const int step_number,
+                const int3& start, const int3& end, const AcReal dt);
+
+/** Synchronizes the device with respect to stream_type. If STREAM_ALL is given as
+    a StreamType, the function synchronizes all streams on the device. */
+AcResult synchronize(const Device device, const StreamType stream_type);
+
+/** For every vertex buffer, asynchronously copies `num_vertices` consecutive
+    values from `host_mesh` (starting at vertex `src`) to the device input
+    buffer (starting at vertex `dst`). */
+AcResult copyMeshToDevice(const Device device, const StreamType stream_type,
+                          const AcMesh& host_mesh, const int3& src, const int3& dst,
+                          const int num_vertices);
+
+/** For every vertex buffer, asynchronously copies `num_vertices` consecutive
+    values from the device input buffer (starting at vertex `src`) to
+    `host_mesh` (starting at vertex `dst`). */
+AcResult copyMeshToHost(const Device device, const StreamType stream_type,
+                        const int3& src, const int3& dst, const int num_vertices,
+                        AcMesh* host_mesh);
+
+/** For every vertex buffer, asynchronously copies `num_vertices` consecutive
+    values from the input buffer of `src` (starting at vertex `src_idx`) to the
+    input buffer of `dst` (starting at vertex `dst_idx`). The copy is enqueued
+    on the stream of the source device. */
+AcResult copyMeshDeviceToDevice(const Device src, const StreamType stream_type, const int3& src_idx,
+                                Device dst, const int3& dst_idx, const int num_vertices);
+
+/** Swaps the input/output buffers used in computations */
+AcResult swapBuffers(const Device device);
--- a/src/core/errchk.h
+++ b/src/core/errchk.h
@@ -0,0 +1,112 @@
+/*
+    Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
+ * \brief Brief info.
+ *
+ * Detailed info.
+ *
+ */
+#pragma once
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+// clang-format off
+/*
+ * =============================================================================
+ * General error checking
+ * =============================================================================
+ */
+// Prints a timestamp and "Error in file F line L: str" to stderr, flushes, and
+// terminates the process with EXIT_FAILURE. The abort() after exit() is never
+// reached. The macro expands to a braced block rather than do { } while (0),
+// so `if (c) ERROR("x"); else ...` does not compile.
+#define ERROR(str) \
+{ \
+    time_t t; time(&t); \
+    fprintf(stderr, "%s", ctime(&t)); \
+    fprintf(stderr, "\tError in file %s line %d: %s\n", \
+                    __FILE__, __LINE__, str); \
+    fflush(stderr); \
+    exit(EXIT_FAILURE); \
+    abort(); \
+}
+
+// Prints a timestamp and "Warning in file F line L: str" to stderr and flushes.
+// Execution continues afterwards. Expands to a braced block, like ERROR.
+#define WARNING(str) \
+{ \
+    time_t t; time(&t); \
+    fprintf(stderr, "%s", ctime(&t)); \
+    fprintf(stderr, "\tWarning in file %s line %d: %s\n", \
+                    __FILE__, __LINE__, str); \
+    fflush(stderr); \
+}
+
+// Condition checks: ERRCHK terminates via ERROR and WARNCHK reports via WARNING
+// when `retval` evaluates to false. ERRCHK and WARNCHK are redefined to nothing
+// further down when NDEBUG is defined in a CUDA compilation; ERRCHK_ALWAYS is
+// never redefined and therefore stays active.
+// DO NOT REMOVE BRACKETS AROUND RETVAL. F.ex. if (!a < b) vs if (!(a < b)).
+#define ERRCHK(retval)  { if (!(retval)) ERROR(#retval " was false"); }
+#define WARNCHK(retval) { if (!(retval)) WARNING(#retval " was false"); }
+#define ERRCHK_ALWAYS(retval) { if (!(retval)) ERROR(#retval " was false"); }
+
+/*
+ * =============================================================================
+ * CUDA-specific error checking
+ * =============================================================================
+ */
+#ifdef __CUDACC__
+/**
+ * Reports a failed CUDA call.
+ *
+ * Does nothing when `code` is cudaSuccess. Otherwise prints a timestamp and
+ * the CUDA error string together with `file` and `line` to stderr. If
+ * `abort` is true, the process then exits with `code` as its exit status.
+ *
+ * @param code   Return value of the CUDA call being checked.
+ * @param file   Source file of the call site (normally __FILE__).
+ * @param line   Source line of the call site (normally __LINE__).
+ * @param abort  Whether to terminate the process on failure.
+ */
+static inline void
+cuda_assert(cudaError_t code, const char* file, int line, bool abort = true)
+{
+    if (code == cudaSuccess)
+        return;
+
+    // This is a regular function, not a macro body, so no line continuations
+    // are needed here.
+    time_t t;
+    time(&t);
+    fprintf(stderr, "%s", ctime(&t));
+    fprintf(stderr, "\tCUDA error in file %s line %d: %s\n",
+                    file, line, cudaGetErrorString(code));
+    fflush(stderr);
+
+    if (abort)
+        exit(code);
+}
+
+// Debug/release switch for the non-ALWAYS checks. This conditional sits inside
+// the __CUDACC__ section, so it only takes effect in CUDA compilations.
+#ifdef NDEBUG
+    // Release: condition checks compile to nothing, and the CUDA checks still
+    // evaluate their argument but ignore its result.
+    #undef ERRCHK
+    #undef WARNCHK
+    #define ERRCHK(params)
+    #define WARNCHK(params)
+    #define ERRCHK_CUDA(params) params;
+    #define WARNCHK_CUDA(params) params;
+    #define ERRCHK_CUDA_KERNEL() {}
+#else
+    // Debug: ERRCHK_CUDA exits on failure, WARNCHK_CUDA only reports it.
+    #define ERRCHK_CUDA(params) { cuda_assert((params), __FILE__, __LINE__); }
+    #define WARNCHK_CUDA(params) { cuda_assert((params), __FILE__, __LINE__, false); }
+
+    // Checks the last launch error, then synchronizes the device so that errors
+    // raised while the kernel runs are caught as well.
+    #define ERRCHK_CUDA_KERNEL()                                               \
+    {                                                                          \
+        ERRCHK_CUDA(cudaPeekAtLastError());                                    \
+        ERRCHK_CUDA(cudaDeviceSynchronize());                                  \
+    }
+    #endif
+
+#endif
+
+// Variants of the CUDA checks that stay active regardless of NDEBUG.
+// NOTE: they expand to cuda_assert(), which is only defined when __CUDACC__ is
+// set, so they can only be used in files compiled as CUDA.
+#define ERRCHK_CUDA_ALWAYS(params) { cuda_assert((params), __FILE__, __LINE__); }
+
+// Checks the last kernel launch and then synchronizes the device so that
+// errors raised during kernel execution are reported too.
+#define ERRCHK_CUDA_KERNEL_ALWAYS()                                               \
+{                                                                          \
+    ERRCHK_CUDA_ALWAYS(cudaPeekAtLastError());                                    \
+    ERRCHK_CUDA_ALWAYS(cudaDeviceSynchronize());                                  \
+}
+// clang-format on
--- a/src/core/kernels/.gitignore
+++ b/src/core/kernels/.gitignore
@@ -0,0 +1,2 @@
+# Ignore the generated headers (gitignore takes one pattern per line; two names
+# on one line form a single pattern containing a space, which matches neither)
+stencil_process.cuh
+stencil_assembly.cuh
--- a/src/core/kernels/boundconds.cuh
+++ b/src/core/kernels/boundconds.cuh
--- a/src/core/kernels/kernels.cuh
+++ b/src/core/kernels/kernels.cuh
@@ -0,0 +1,794 @@
+/*
+    Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
+ * \brief Brief info.
+ *
+ * Detailed info.
+ *
+ */
+ #pragma once
+
+/**
+ * Fills ghost-zone vertices of `vertex_buffer` with periodically wrapped
+ * values from the computational domain.
+ *
+ * Each thread handles one destination vertex in the range from `start`
+ * (inclusive) to `end` (exclusive). Vertices inside the computational domain
+ * are left untouched.
+ */
+__global__ void
+kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vertex_buffer)
+{
+    // Destination vertex of this thread
+    const int dst_i = start.x + threadIdx.x + blockIdx.x * blockDim.x;
+    const int dst_j = start.y + threadIdx.y + blockIdx.y * blockDim.y;
+    const int dst_k = start.z + threadIdx.z + blockIdx.z * blockDim.z;
+
+    // Threads past the end of the range have nothing to do (the launch grid
+    // is rounded up to whole thread blocks)
+    const bool out_of_range = dst_i >= end.x || dst_j >= end.y || dst_k >= end.z;
+    if (out_of_range)
+        return;
+
+    // Boundary conditions only apply to the ghost zones, so skip vertices
+    // that lie inside the computational domain
+    const bool inside_x = dst_i >= DCONST_INT(AC_nx_min) && dst_i < DCONST_INT(AC_nx_max);
+    const bool inside_y = dst_j >= DCONST_INT(AC_ny_min) && dst_j < DCONST_INT(AC_ny_max);
+    const bool inside_z = dst_k >= DCONST_INT(AC_nz_min) && dst_k < DCONST_INT(AC_nz_max);
+    if (inside_x && inside_y && inside_z)
+        return;
+
+    // Source vertex: shift into domain-local coordinates, add one period so
+    // the value is non-negative, wrap, and shift back to buffer coordinates
+    const int src_i = (dst_i - DCONST_INT(AC_nx_min) + DCONST_INT(AC_nx)) % DCONST_INT(AC_nx) +
+                      DCONST_INT(AC_nx_min);
+    const int src_j = (dst_j - DCONST_INT(AC_ny_min) + DCONST_INT(AC_ny)) % DCONST_INT(AC_ny) +
+                      DCONST_INT(AC_ny_min);
+    const int src_k = (dst_k - DCONST_INT(AC_nz_min) + DCONST_INT(AC_nz)) % DCONST_INT(AC_nz) +
+                      DCONST_INT(AC_nz_min);
+
+    const int from = DEVICE_VTXBUF_IDX(src_i, src_j, src_k);
+    const int to   = DEVICE_VTXBUF_IDX(dst_i, dst_j, dst_k);
+    vertex_buffer[to] = vertex_buffer[from];
+}
+
+/**
+ * Launches kernel_periodic_boundconds on `stream` for the index range from
+ * `start` to `end`, using 8x2x8 thread blocks and enough blocks to cover
+ * the range in each dimension.
+ */
+void
+periodic_boundconds(const cudaStream_t stream, const int3& start, const int3& end, AcReal* vertex_buffer)
+{
+    const dim3 threads_per_block(8, 2, 8);
+
+    // Blocks per dimension: range length divided by block size, rounded up
+    const unsigned int blocks_x = (unsigned int)ceil((end.x - start.x) / (float)threads_per_block.x);
+    const unsigned int blocks_y = (unsigned int)ceil((end.y - start.y) / (float)threads_per_block.y);
+    const unsigned int blocks_z = (unsigned int)ceil((end.z - start.z) / (float)threads_per_block.z);
+    const dim3 blocks_per_grid(blocks_x, blocks_y, blocks_z);
+
+    kernel_periodic_boundconds<<<blocks_per_grid, threads_per_block, 0, stream>>>(start, end,
+                                                                                  vertex_buffer);
+    ERRCHK_CUDA_KERNEL();
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+#include <assert.h>
+
+
+/** One-argument overload: the argument already is a linear index and is
+    returned unchanged. */
+static __device__ __forceinline__ int
+IDX(const int i)
+{
+    const int linear_idx = i;
+    return linear_idx;
+}
+
+/** Linear vertex buffer index of the vertex at coordinates (i, j, k). */
+static __device__ __forceinline__ int
+IDX(const int i, const int j, const int k)
+{
+    const int linear_idx = DEVICE_VTXBUF_IDX(i, j, k);
+    return linear_idx;
+}
+
+/** Linear vertex buffer index of the vertex at coordinates
+    (idx.x, idx.y, idx.z). */
+static __device__ __forceinline__ int
+IDX(const int3 idx)
+{
+    const int linear_idx = DEVICE_VTXBUF_IDX(idx.x, idx.y, idx.z);
+    return linear_idx;
+}
+
+/**
+ * Builds the matrix of a counter-clockwise rotation by `radians` about the
+ * z axis:
+ *
+ *     | cos  -sin   0 |
+ *     | sin   cos   0 |
+ *     |  0     0    1 |
+ *
+ * The last row must be (0, 0, 1) so that the z component of a rotated vector
+ * is preserved. With an all-zero last row the matrix is singular and
+ * mul(mat, v) always yields z = 0.
+ */
+static __forceinline__ AcMatrix
+create_rotz(const AcReal radians)
+{
+    AcMatrix mat;
+
+    mat.row[0] = (AcReal3){cos(radians), -sin(radians), 0};
+    mat.row[1] = (AcReal3){sin(radians), cos(radians), 0};
+    mat.row[2] = (AcReal3){0, 0, 1};
+
+    return mat;
+}
+
+
+// In single-precision builds, textually replace sin/cos/exp/rsqrt in all code
+// that follows with the float CUDA functions named below. Code that appears
+// before this point in the translation unit is not affected.
+// NOTE(review): __sinf/__cosf/__expf are presumably the reduced-accuracy
+// intrinsics — confirm that the accuracy loss is acceptable.
+#if AC_DOUBLE_PRECISION == 0
+#define sin __sinf
+#define cos __cosf
+#define exp __expf
+#define rsqrt rsqrtf // hardware reciprocal sqrt
+#endif // AC_DOUBLE_PRECISION == 0
+
+
+/*
+typedef struct {
+    int i, j, k;
+} int3;*/
+
+/*
+ * =============================================================================
+ * Level 0 (Input Assembly Stage)
+ * =============================================================================
+ */
+
+/*
+ * =============================================================================
+ * Level 0.1 (Read stencil elements and solve derivatives)
+ * =============================================================================
+ */
+/**
+ * Central finite-difference approximation of the first derivative at the
+ * middle element of `pencil`.
+ *
+ * @param pencil  STENCIL_ORDER + 1 consecutive samples; the point of interest
+ *                is at index STENCIL_ORDER / 2.
+ * @param inv_ds  Inverse of the sample spacing.
+ */
+static __device__ __forceinline__ AcReal
+first_derivative(const AcReal* __restrict__ pencil, const AcReal inv_ds)
+{
+    // weights[k] multiplies the antisymmetric pair at distance k from the middle
+#if STENCIL_ORDER == 2
+    const AcReal weights[] = {0, 1.0 / 2.0};
+#elif STENCIL_ORDER == 4
+    const AcReal weights[] = {0, 2.0 / 3.0, -1.0 / 12.0};
+#elif STENCIL_ORDER == 6
+    const AcReal weights[] = {0, 3.0 / 4.0, -3.0 / 20.0, 1.0 / 60.0};
+#elif STENCIL_ORDER == 8
+    const AcReal weights[] = {0, 4.0 / 5.0, -1.0 / 5.0, 4.0 / 105.0,
+                              -1.0 / 280.0};
+#endif
+
+    #define MID (STENCIL_ORDER / 2)
+    AcReal sum = 0;
+
+#pragma unroll
+    for (int k = 1; k <= MID; ++k) {
+        const AcReal ahead  = pencil[MID + k];
+        const AcReal behind = pencil[MID - k];
+        sum += weights[k] * (ahead - behind);
+    }
+
+    return sum * inv_ds;
+}
+
+/**
+ * Central finite-difference approximation of the second derivative at the
+ * middle element of `pencil`.
+ *
+ * @param pencil  STENCIL_ORDER + 1 consecutive samples; the point of interest
+ *                is at index STENCIL_ORDER / 2.
+ * @param inv_ds  Inverse of the sample spacing (applied twice).
+ */
+static __device__ __forceinline__ AcReal
+second_derivative(const AcReal* __restrict__ pencil, const AcReal inv_ds)
+{
+    // weights[0] multiplies the middle sample, weights[k] the symmetric pair
+    // at distance k from it
+#if STENCIL_ORDER == 2
+    const AcReal weights[] = {-2., 1.};
+#elif STENCIL_ORDER == 4
+    const AcReal weights[] = {-5.0/2.0, 4.0/3.0, -1.0/12.0};
+#elif STENCIL_ORDER == 6
+    const AcReal weights[] = {-49.0 / 18.0, 3.0 / 2.0, -3.0 / 20.0,
+                              1.0 / 90.0};
+#elif STENCIL_ORDER == 8
+    const AcReal weights[] = {-205.0 / 72.0, 8.0 / 5.0, -1.0 / 5.0,
+                              8.0 / 315.0, -1.0 / 560.0};
+#endif
+
+    #define MID (STENCIL_ORDER / 2)
+    AcReal sum = weights[0] * pencil[MID];
+
+#pragma unroll
+    for (int k = 1; k <= MID; ++k) {
+        const AcReal ahead  = pencil[MID + k];
+        const AcReal behind = pencil[MID - k];
+        sum += weights[k] * (ahead + behind);
+    }
+
+    return sum * inv_ds * inv_ds;
+}
+
+/**
+ * Finite-difference approximation of a mixed second derivative, computed
+ * from samples along the two diagonals through the point of interest.
+ *
+ * @param pencil_a  STENCIL_ORDER + 1 samples along the diagonal on which both
+ *                  coordinates increase together.
+ * @param pencil_b  STENCIL_ORDER + 1 samples along the diagonal on which one
+ *                  coordinate increases while the other decreases.
+ * @param inv_ds_a  Inverted mesh spacing of the first direction,
+ *                  f.ex. 1. / mesh.int_params[AC_dsx]
+ * @param inv_ds_b  Inverted mesh spacing of the second direction.
+ *
+ * The four diagonal samples at distance i combine to
+ *     f(+i,+i) + f(-i,-i) - f(+i,-i) - f(-i,+i) = 4 i^2 h_a h_b f_ab + O(h^4),
+ * so the coefficients c_i must satisfy sum_i 4 i^2 c_i = 1. They equal the
+ * first-derivative coefficients a_i divided by 2 i. For STENCIL_ORDER == 4
+ * this gives c_1 = (2/3)/2 = 1/3 and c_2 = (-1/12)/4 = -1/48, which also
+ * cancels the leading error term (sum_i i^4 c_i = 0).
+ */
+static __device__ __forceinline__ AcReal
+cross_derivative(const AcReal* __restrict__ pencil_a,
+                 const AcReal* __restrict__ pencil_b, const AcReal inv_ds_a,
+                 const AcReal inv_ds_b)
+{
+#if STENCIL_ORDER == 2
+    const AcReal coefficients[] = {0, 1.0 / 4.0};
+#elif STENCIL_ORDER == 4
+    const AcReal coefficients[] = {0, 1.0 / 3.0, -1.0 / 48.0};
+#elif STENCIL_ORDER == 6
+    const AcReal fac            = (1. / 720.);
+    const AcReal coefficients[] = {0.0 * fac, 270.0 * fac, -27.0 * fac,
+                                   2.0 * fac};
+#elif STENCIL_ORDER == 8
+    const AcReal fac            = (1. / 20160.);
+    const AcReal coefficients[] = {0.0 * fac, 8064. * fac, -1008. * fac,
+                                   128. * fac, -9. * fac};
+#endif
+
+    #define MID (STENCIL_ORDER / 2)
+    AcReal res    = AcReal(0.);
+
+    #pragma unroll
+    for (int i = 1; i <= MID; ++i) {
+        res += coefficients[i] * (pencil_a[MID + i] + pencil_a[MID - i] -
+                                  pencil_b[MID + i] - pencil_b[MID - i]);
+    }
+    return res * inv_ds_a * inv_ds_b;
+}
+
+/** First derivative of `arr` in the x direction at `vertexIdx`. */
+static __device__ __forceinline__ AcReal
+derx(const int3 vertexIdx, const AcReal* __restrict__ arr)
+{
+    const int radius = STENCIL_ORDER / 2;
+
+    // Gather the samples along x, centered on vertexIdx
+    AcReal samples[STENCIL_ORDER + 1];
+#pragma unroll
+    for (int d = -radius; d <= radius; ++d)
+        samples[d + radius] = arr[IDX(vertexIdx.x + d, vertexIdx.y, vertexIdx.z)];
+
+    return first_derivative(samples, DCONST_REAL(AC_inv_dsx));
+}
+
+/** Second derivative of `arr` in the x direction at `vertexIdx`. */
+static __device__ __forceinline__ AcReal
+derxx(const int3 vertexIdx, const AcReal* __restrict__ arr)
+{
+    const int radius = STENCIL_ORDER / 2;
+
+    // Gather the samples along x, centered on vertexIdx
+    AcReal samples[STENCIL_ORDER + 1];
+#pragma unroll
+    for (int d = -radius; d <= radius; ++d)
+        samples[d + radius] = arr[IDX(vertexIdx.x + d, vertexIdx.y, vertexIdx.z)];
+
+    return second_derivative(samples, DCONST_REAL(AC_inv_dsx));
+}
+
+/** Mixed xy derivative of `arr` at `vertexIdx`, computed from the two
+    diagonals of the xy plane through that vertex. */
+static __device__ __forceinline__ AcReal
+derxy(const int3 vertexIdx, const AcReal* __restrict__ arr)
+{
+    const int radius = STENCIL_ORDER / 2;
+
+    // Diagonal on which x and y increase together
+    AcReal diag[STENCIL_ORDER + 1];
+#pragma unroll
+    for (int d = -radius; d <= radius; ++d)
+        diag[d + radius] = arr[IDX(vertexIdx.x + d, vertexIdx.y + d, vertexIdx.z)];
+
+    // Diagonal on which x increases while y decreases
+    AcReal antidiag[STENCIL_ORDER + 1];
+#pragma unroll
+    for (int d = -radius; d <= radius; ++d)
+        antidiag[d + radius] = arr[IDX(vertexIdx.x + d, vertexIdx.y - d, vertexIdx.z)];
+
+    return cross_derivative(diag, antidiag, DCONST_REAL(AC_inv_dsx),
+                            DCONST_REAL(AC_inv_dsy));
+}
+
+/** Mixed xz derivative of `arr` at `vertexIdx`, computed from the two
+    diagonals of the xz plane through that vertex. */
+static __device__ __forceinline__ AcReal
+derxz(const int3 vertexIdx, const AcReal* __restrict__ arr)
+{
+    const int radius = STENCIL_ORDER / 2;
+
+    // Diagonal on which x and z increase together
+    AcReal diag[STENCIL_ORDER + 1];
+#pragma unroll
+    for (int d = -radius; d <= radius; ++d)
+        diag[d + radius] = arr[IDX(vertexIdx.x + d, vertexIdx.y, vertexIdx.z + d)];
+
+    // Diagonal on which x increases while z decreases
+    AcReal antidiag[STENCIL_ORDER + 1];
+#pragma unroll
+    for (int d = -radius; d <= radius; ++d)
+        antidiag[d + radius] = arr[IDX(vertexIdx.x + d, vertexIdx.y, vertexIdx.z - d)];
+
+    return cross_derivative(diag, antidiag, DCONST_REAL(AC_inv_dsx),
+                            DCONST_REAL(AC_inv_dsz));
+}
+
+/** First derivative of `arr` in the y direction at `vertexIdx`. */
+static __device__ __forceinline__ AcReal
+dery(const int3 vertexIdx, const AcReal* __restrict__ arr)
+{
+    const int radius = STENCIL_ORDER / 2;
+
+    // Gather the samples along y, centered on vertexIdx
+    AcReal samples[STENCIL_ORDER + 1];
+#pragma unroll
+    for (int d = -radius; d <= radius; ++d)
+        samples[d + radius] = arr[IDX(vertexIdx.x, vertexIdx.y + d, vertexIdx.z)];
+
+    return first_derivative(samples, DCONST_REAL(AC_inv_dsy));
+}
+
+/** Second derivative of `arr` in the y direction at `vertexIdx`. */
+static __device__ __forceinline__ AcReal
+deryy(const int3 vertexIdx, const AcReal* __restrict__ arr)
+{
+    const int radius = STENCIL_ORDER / 2;
+
+    // Gather the samples along y, centered on vertexIdx
+    AcReal samples[STENCIL_ORDER + 1];
+#pragma unroll
+    for (int d = -radius; d <= radius; ++d)
+        samples[d + radius] = arr[IDX(vertexIdx.x, vertexIdx.y + d, vertexIdx.z)];
+
+    return second_derivative(samples, DCONST_REAL(AC_inv_dsy));
+}
+
+/** Mixed yz derivative of `arr` at `vertexIdx`, computed from the two
+    diagonals of the yz plane through that vertex. */
+static __device__ __forceinline__ AcReal
+deryz(const int3 vertexIdx, const AcReal* __restrict__ arr)
+{
+    const int radius = STENCIL_ORDER / 2;
+
+    // Diagonal on which y and z increase together
+    AcReal diag[STENCIL_ORDER + 1];
+#pragma unroll
+    for (int d = -radius; d <= radius; ++d)
+        diag[d + radius] = arr[IDX(vertexIdx.x, vertexIdx.y + d, vertexIdx.z + d)];
+
+    // Diagonal on which y increases while z decreases
+    AcReal antidiag[STENCIL_ORDER + 1];
+#pragma unroll
+    for (int d = -radius; d <= radius; ++d)
+        antidiag[d + radius] = arr[IDX(vertexIdx.x, vertexIdx.y + d, vertexIdx.z - d)];
+
+    return cross_derivative(diag, antidiag, DCONST_REAL(AC_inv_dsy),
+                            DCONST_REAL(AC_inv_dsz));
+}
+
+/** First derivative of `arr` in the z direction at `vertexIdx`. */
+static __device__ __forceinline__ AcReal
+derz(const int3 vertexIdx, const AcReal* __restrict__ arr)
+{
+    const int radius = STENCIL_ORDER / 2;
+
+    // Gather the samples along z, centered on vertexIdx
+    AcReal samples[STENCIL_ORDER + 1];
+#pragma unroll
+    for (int d = -radius; d <= radius; ++d)
+        samples[d + radius] = arr[IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z + d)];
+
+    return first_derivative(samples, DCONST_REAL(AC_inv_dsz));
+}
+
+/** Second derivative of `arr` in the z direction at `vertexIdx`. */
+static __device__ __forceinline__ AcReal
+derzz(const int3 vertexIdx, const AcReal* __restrict__ arr)
+{
+    const int radius = STENCIL_ORDER / 2;
+
+    // Gather the samples along z, centered on vertexIdx
+    AcReal samples[STENCIL_ORDER + 1];
+#pragma unroll
+    for (int d = -radius; d <= radius; ++d)
+        samples[d + radius] = arr[IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z + d)];
+
+    return second_derivative(samples, DCONST_REAL(AC_inv_dsz));
+}
+
+/*
+ * =============================================================================
+ * Level 0.2 (Caching functions)
+ * =============================================================================
+ */
+
+#include "stencil_assembly.cuh"
+
+/*
+typedef struct {
+    AcRealData x;
+    AcRealData y;
+    AcRealData z;
+} AcReal3Data;
+
+static __device__ __forceinline__ AcReal3Data
+read_data(const int i, const int j, const int k,
+          AcReal* __restrict__ buf[], const int3& handle)
+{
+    AcReal3Data data;
+
+    data.x = read_data(i, j, k, buf, handle.x);
+    data.y = read_data(i, j, k, buf, handle.y);
+    data.z = read_data(i, j, k, buf, handle.z);
+
+    return data;
+}
+*/
+
+/*
+ * =============================================================================
+ * Level 0.3 (Built-in functions available during the Stencil Processing Stage)
+ * =============================================================================
+ */
+
+/** Component-wise difference a - b. */
+static __host__ __device__ __forceinline__ AcReal3
+operator-(const AcReal3& a, const AcReal3& b)
+{
+    AcReal3 diff;
+
+    diff.x = a.x - b.x;
+    diff.y = a.y - b.y;
+    diff.z = a.z - b.z;
+
+    return diff;
+}
+
+/** Component-wise sum a + b. */
+static __host__ __device__ __forceinline__ AcReal3
+operator+(const AcReal3& a, const AcReal3& b)
+{
+    AcReal3 sum;
+
+    sum.x = a.x + b.x;
+    sum.y = a.y + b.y;
+    sum.z = a.z + b.z;
+
+    return sum;
+}
+
+/** Component-wise negation of a. */
+static __host__ __device__ __forceinline__ AcReal3
+operator-(const AcReal3& a)
+{
+    AcReal3 negated;
+
+    negated.x = -a.x;
+    negated.y = -a.y;
+    negated.z = -a.z;
+
+    return negated;
+}
+
+/** Scales every component of the vector b by the scalar a. */
+static __host__  __device__ __forceinline__ AcReal3
+operator*(const AcReal a, const AcReal3& b)
+{
+    AcReal3 scaled;
+
+    scaled.x = a * b.x;
+    scaled.y = a * b.y;
+    scaled.z = a * b.z;
+
+    return scaled;
+}
+
+// Dot product of a and b, accumulated in x, y, z order.
+static __host__ __device__ __forceinline__ AcReal
+dot(const AcReal3& a, const AcReal3& b)
+{
+    AcReal acc = a.x * b.x;
+    acc += a.y * b.y;
+    acc += a.z * b.z;
+
+    return acc;
+}
+
+// Matrix-vector product aa * x: each output component is the dot product of
+// the corresponding matrix row with x.
+static __host__ __device__ __forceinline__ AcReal3
+mul(const AcMatrix& aa, const AcReal3& x)
+{
+    AcReal3 product;
+
+    product.x = dot(aa.row[0], x);
+    product.y = dot(aa.row[1], x);
+    product.z = dot(aa.row[2], x);
+
+    return product;
+}
+
+// Cross product a x b.
+static __host__ __device__ __forceinline__ AcReal3
+cross(const AcReal3& a, const AcReal3& b)
+{
+    return (AcReal3){a.y * b.z - a.z * b.y,
+                     a.z * b.x - a.x * b.z,
+                     a.x * b.y - a.y * b.x};
+}
+
+// True when a is neither NaN nor infinite.
+static __host__ __device__ __forceinline__ bool
+is_valid(const AcReal a)
+{
+    if (isnan(a))
+        return false;
+
+    return !isinf(a);
+}
+
+// True when all three components of a pass the scalar is_valid() check.
+static __host__ __device__ __forceinline__ bool
+is_valid(const AcReal3& a)
+{
+    if (!is_valid(a.x))
+        return false;
+    if (!is_valid(a.y))
+        return false;
+
+    return is_valid(a.z);
+}
+
+
+/*
+ * =============================================================================
+ * Level 1 (Stencil Processing Stage)
+ * =============================================================================
+ */
+
+/*
+ * =============================================================================
+ * Level 1.1 (Terms)
+ * =============================================================================
+ */
+// Laplacian of a scalar field: the sum of the diagonal entries of hessian(data).
+static __device__ __forceinline__ AcReal
+laplace(const AcRealData& data)
+{
+    const AcReal d2_xx = hessian(data).row[0].x;
+    const AcReal d2_yy = hessian(data).row[1].y;
+    const AcReal d2_zz = hessian(data).row[2].z;
+
+    return d2_xx + d2_yy + d2_zz;
+}
+
+// Divergence of a vector field: d(vec.x)/dx + d(vec.y)/dy + d(vec.z)/dz,
+// each term taken from gradient() of the corresponding component.
+static __device__ __forceinline__ AcReal
+divergence(const AcReal3Data& vec)
+{
+    const AcReal dvx_dx = gradient(vec.x).x;
+    const AcReal dvy_dy = gradient(vec.y).y;
+    const AcReal dvz_dz = gradient(vec.z).z;
+
+    return dvx_dx + dvy_dy + dvz_dz;
+}
+
+// Applies the scalar laplace() to each component of a vector field.
+static __device__ __forceinline__ AcReal3
+laplace_vec(const AcReal3Data& vec)
+{
+    AcReal3 result;
+
+    result.x = laplace(vec.x);
+    result.y = laplace(vec.y);
+    result.z = laplace(vec.z);
+
+    return result;
+}
+
+// Curl of a vector field, assembled from the off-diagonal first derivatives
+// returned by gradient() for each component.
+static __device__ __forceinline__ AcReal3
+curl(const AcReal3Data& vec)
+{
+    AcReal3 rot;
+
+    rot.x = gradient(vec.z).y - gradient(vec.y).z;
+    rot.y = gradient(vec.x).z - gradient(vec.z).x;
+    rot.z = gradient(vec.y).x - gradient(vec.x).y;
+
+    return rot;
+}
+
+// Gradient of the divergence of a vector field. Component i sums entry
+// (row i, column j) of the Hessian of the j-th field component over j.
+static __device__ __forceinline__ AcReal3
+gradient_of_divergence(const AcReal3Data& vec)
+{
+    AcReal3 grad_div;
+
+    grad_div.x = hessian(vec.x).row[0].x + hessian(vec.y).row[0].y + hessian(vec.z).row[0].z;
+    grad_div.y = hessian(vec.x).row[1].x + hessian(vec.y).row[1].y + hessian(vec.z).row[1].z;
+    grad_div.z = hessian(vec.x).row[2].x + hessian(vec.y).row[2].y + hessian(vec.z).row[2].z;
+
+    return grad_div;
+}
+
+// Builds the symmetric matrix S from the first derivatives of the vector field:
+// diagonal entries are 2/3 of the matching derivative minus 1/3 of the other
+// two diagonal derivatives; off-diagonal entries are the average of the two
+// mirrored cross derivatives.
+static __device__ __forceinline__ AcMatrix
+stress_tensor(const AcReal3Data& vec)
+{
+    // d(component)/d(direction), read once from gradient()
+    const AcReal ux_x = gradient(vec.x).x;
+    const AcReal ux_y = gradient(vec.x).y;
+    const AcReal ux_z = gradient(vec.x).z;
+    const AcReal uy_x = gradient(vec.y).x;
+    const AcReal uy_y = gradient(vec.y).y;
+    const AcReal uy_z = gradient(vec.y).z;
+    const AcReal uz_x = gradient(vec.z).x;
+    const AcReal uz_y = gradient(vec.z).y;
+    const AcReal uz_z = gradient(vec.z).z;
+
+    // Off-diagonal entries, shared by the mirrored positions
+    const AcReal s_xy = AcReal(1. / 2.) * (ux_y + uy_x);
+    const AcReal s_xz = AcReal(1. / 2.) * (ux_z + uz_x);
+    const AcReal s_yz = AcReal(1. / 2.) * (uy_z + uz_y);
+
+    AcMatrix S;
+
+    S.row[0].x = AcReal(2. / 3.) * ux_x - AcReal(1. / 3.) * (uy_y + uz_z);
+    S.row[0].y = s_xy;
+    S.row[0].z = s_xz;
+
+    S.row[1].x = s_xy;
+    S.row[1].y = AcReal(2. / 3.) * uy_y - AcReal(1. / 3.) * (ux_x + uz_z);
+    S.row[1].z = s_yz;
+
+    S.row[2].x = s_xz;
+    S.row[2].y = s_yz;
+    S.row[2].z = AcReal(2. / 3.) * uz_z - AcReal(1. / 3.) * (ux_x + uy_y);
+
+    return S;
+}
+
+// Sum of the squares of all nine matrix entries, accumulated row by row.
+static __device__ __forceinline__ AcReal
+contract(const AcMatrix& mat)
+{
+    AcReal sum_of_squares = 0;
+
+    #pragma unroll
+    for (int r = 0; r < 3; ++r) {
+        const AcReal3& current_row = mat.row[r];
+        sum_of_squares += dot(current_row, current_row);
+    }
+
+    return sum_of_squares;
+}
+
+/*
+ * =============================================================================
+ * Level 1.2 (Equations)
+ * =============================================================================
+ */
+// Euclidean length of vec.
+static __device__ __forceinline__ AcReal
+length(const AcReal3& vec)
+{
+    const AcReal squared_len = dot(vec, vec);
+    return sqrt(squared_len);
+}
+
+// Reciprocal of the Euclidean length of vec, computed with rsqrt().
+// The result is not checked here; callers must handle a zero-length input.
+static __device__ __forceinline__ AcReal
+reciprocal_len(const AcReal3& vec)
+{
+    const AcReal squared_len = dot(vec, vec);
+    return rsqrt(squared_len);
+}
+
+// Returns vec scaled by the reciprocal of its length.
+static __device__ __forceinline__ AcReal3
+normalized(const AcReal3& vec)
+{
+    return reciprocal_len(vec) * vec;
+}
+
+// Sinusoidal forcing
+// https://arxiv.org/pdf/1704.04676.pdf
+//
+// forcing_vec and forcing_phi live in constant memory and are uploaded by the
+// host in rk3_step_async() when LFORCING is enabled.
+__constant__ AcReal3 forcing_vec;
+__constant__ AcReal forcing_phi;
+
+// Forcing term at vertex (i, j, k): forcing_vec scaled by a cosine wave and by
+// the squared, clamped reciprocal distance of the vertex from the domain centre.
+static __device__ __forceinline__ AcReal3
+forcing(const int i, const int j, const int k)
+{
+    // NOTE: these macros are not #undef'd and stay defined for the rest of the
+    // translation unit.
+    #define DOMAIN_SIZE_X (DCONST_INT(AC_nx) * DCONST_REAL(AC_dsx))
+    #define DOMAIN_SIZE_Y (DCONST_INT(AC_ny) * DCONST_REAL(AC_dsy))
+    #define DOMAIN_SIZE_Z (DCONST_INT(AC_nz) * DCONST_REAL(AC_dsz))
+
+    // Vertex position relative to the centre of the computational domain
+    AcReal3 centered_pos;
+    centered_pos.x = (i - DCONST_INT(AC_nx_min)) * DCONST_REAL(AC_dsx) - AcReal(.5) * DOMAIN_SIZE_X;
+    centered_pos.y = (j - DCONST_INT(AC_ny_min)) * DCONST_REAL(AC_dsy) - AcReal(.5) * DOMAIN_SIZE_Y;
+    centered_pos.z = (k - DCONST_INT(AC_nz_min)) * DCONST_REAL(AC_dsz) - AcReal(.5) * DOMAIN_SIZE_Z;
+
+    // Non-finite values (e.g. at the exact centre) become 0; everything else
+    // is capped at 2.
+    AcReal inv_dist = reciprocal_len(centered_pos);
+    if (isnan(inv_dist) || isinf(inv_dist))
+        inv_dist = 0;
+    else if (inv_dist > 2) // hack to make it cool
+        inv_dist = 2;
+
+    const AcReal phase = dot(centered_pos, forcing_vec);
+
+    // Angle-sum identity: equals cos(phase + forcing_phi)
+    const AcReal waves = cos(phase)*cos(forcing_phi) - sin(phase) * sin(forcing_phi);
+
+    return inv_dist * inv_dist * waves * forcing_vec;
+}
+
+
+// Note: LNT0 and LNRHO0 must be set very carefully: if their magnitude differs
+// from that of the other values in the mesh, we will inherently lose precision.
+// NOTE(review): none of these four constants is referenced in this part of the
+// file; presumably they are consumed by the included stencil_process.cuh — confirm.
+#define LNT0 (AcReal(0.0))
+#define LNRHO0 (AcReal(0.0))
+
+#define H_CONST (AcReal(0.0))
+#define C_CONST (AcReal(0.0))
+
+
+
+/*
+ * One substep of the low-storage RK3 scheme of Williamson (1980) for a scalar.
+ *
+ * step_number 0 returns state_current + beta * rate_of_change * dt.
+ * step_number 1 and 2 additionally feed back the previous substep through
+ * (state_current - state_previous) scaled by alpha / beta_of_previous_step.
+ * Any other step_number returns NAN.
+ */
+template <int step_number>
+static __device__ __forceinline__ AcReal
+rk3_integrate(const AcReal state_previous, const AcReal state_current,
+              const AcReal rate_of_change, const AcReal dt)
+{
+    // Coefficient tables carry a dummy leading 0 so that the entry for substep
+    // s sits at index s + 1. This avoids an unnecessary "out-of-bounds" warning
+    // for the beta[step_number - 1] access when step_number is 0.
+    const AcReal alpha[] = {0, AcReal(.0), AcReal(-5. / 9.), AcReal(-153. / 128.)};
+    const AcReal beta[]  = {0, AcReal(1. / 3.), AcReal(15. / 16.),
+                           AcReal(8. / 15.)};
+
+    if (step_number == 0)
+        return state_current + beta[step_number + 1] * rate_of_change * dt;
+
+    if (step_number == 1 || step_number == 2) {
+        return state_current +
+           beta[step_number + 1] *
+               (alpha[step_number + 1] * (AcReal(1.) / beta[step_number]) *
+                    (state_current - state_previous) +
+                rate_of_change * dt);
+    }
+
+    return NAN;
+}
+/*
+template <int step_number>
+static __device__ __forceinline__ AcReal
+rk3_integrate_scal(const AcReal state_previous, const AcReal state_current,
+              const AcReal rate_of_change, const AcReal dt)
+{
+    // Williamson (1980)
+    const AcReal alpha[] = {AcReal(.0), AcReal(-5. / 9.), AcReal(-153. / 128.)};
+    const AcReal beta[]  = {AcReal(1. / 3.), AcReal(15. / 16.),
+                           AcReal(8. / 15.)};
+
+
+    switch (step_number) {
+        case 0:
+            return state_current + beta[step_number] * rate_of_change * dt;
+        case 1: // Fallthrough
+        case 2:
+            return state_current +
+               beta[step_number] *
+                   (alpha[step_number] * (AcReal(1.) / beta[step_number - 1]) *
+                        (state_current - state_previous) +
+                    rate_of_change * dt);
+        default:
+            return NAN;
+    }
+}
+*/
+
+// Vector overload: applies the scalar rk3_integrate<step_number>() to the
+// x, y and z components independently.
+template <int step_number>
+static __device__ __forceinline__ AcReal3
+rk3_integrate(const AcReal3 state_previous, const AcReal3 state_current,
+              const AcReal3 rate_of_change, const AcReal dt)
+{
+    AcReal3 next;
+
+    next.x = rk3_integrate<step_number>(state_previous.x, state_current.x, rate_of_change.x, dt);
+    next.y = rk3_integrate<step_number>(state_previous.y, state_current.y, rate_of_change.y, dt);
+    next.z = rk3_integrate<step_number>(state_previous.z, state_current.z, rate_of_change.z, dt);
+
+    return next;
+}
+
+// Shorthand for rk3_integrate<step_number>(): expects a compile-time
+// `step_number` to be in scope at the expansion site and passes
+// `state_current` through value() before integrating.
+#define rk3(state_previous, state_current, rate_of_change, dt)\
+rk3_integrate<step_number>(state_previous, value(state_current), rate_of_change, dt)
+
+/*
+template <int step_number>
+static __device__ __forceinline__ AcReal
+rk3_integrate(const int idx, const AcReal out, const int handle,
+              const AcRealData& in_cached, const AcReal rate_of_change, const AcReal dt)
+{
+    return rk3_integrate_scal<step_number>(out, value(in_cached), rate_of_change, dt);
+}
+
+template <int step_number>
+static __device__ __forceinline__ AcReal3
+rk3_integrate(const int idx, const AcReal3 out, const int3& handle,
+                  const AcReal3Data& in_cached, const AcReal3& rate_of_change, const AcReal dt)
+{
+    return (AcReal3) {
+        rk3_integrate<step_number>(idx, out, handle.x, in_cached.x, rate_of_change.x, dt),
+        rk3_integrate<step_number>(idx, out, handle.y, in_cached.y, rate_of_change.y, dt),
+        rk3_integrate<step_number>(idx, out, handle.z, in_cached.z, rate_of_change.z, dt)
+    };
+}
+
+#define RK3(handle, in_cached, rate_of_change, dt) \
+rk3_integrate<step_number>(idx, buffer.out, handle, in_cached, rate_of_change, dt)
+*/
+
+/*
+ * =============================================================================
+ * Level 1.3 (Kernels)
+ * =============================================================================
+ */
+
+// Stores `value` at position `idx` of the output buffer selected by `handle`.
+static __device__ void
+write(AcReal* __restrict__ out[], const int handle, const int idx, const AcReal value)
+{
+    AcReal* const target = out[handle];
+    target[idx] = value;
+}
+
+// Vector overload: stores the x, y, z components of `value` at `idx` of the
+// three buffers whose handles are given by vec.x, vec.y, vec.z.
+static __device__ void
+write(AcReal* __restrict__ out[], const int3 vec, const int idx, const AcReal3 value)
+{
+    out[vec.x][idx] = value.x;
+    out[vec.y][idx] = value.y;
+    out[vec.z][idx] = value.z;
+}
+
+// Loads the element at `idx` from the buffer selected by `handle`.
+static __device__ AcReal
+read_out(const int idx, AcReal* __restrict__ field[], const int handle)
+{
+    const AcReal* const source = field[handle];
+    return source[idx];
+}
+
+// Vector overload: loads the element at `idx` from the three buffers selected
+// by handle.x, handle.y, handle.z.
+static __device__ AcReal3
+read_out(const int idx, AcReal* __restrict__ field[], const int3 handle)
+{
+    AcReal3 loaded;
+
+    loaded.x = read_out(idx, field, handle.x);
+    loaded.y = read_out(idx, field, handle.y);
+    loaded.z = read_out(idx, field, handle.z);
+
+    return loaded;
+}
+
+// Convenience wrappers for kernel bodies. They rely on `buffer`, `idx` and
+// `vertexIdx` being in scope, i.e. on the GEN_KERNEL_* boilerplate below.
+//   WRITE_OUT: store `value` into buffer.out at the current vertex.
+//   READ:      read_data() from buffer.in at the current vertex.
+//   READ_OUT:  load from buffer.out at the current vertex.
+#define WRITE_OUT(handle, value) (write(buffer.out, handle, idx, value))
+#define READ(handle) (read_data(vertexIdx, buffer.in, handle))
+#define READ_OUT(handle) (read_out(idx, buffer.out, handle))
+
+// also write for clarity here also, not for the DSL
+//#define WRITE(HANDLE) (write(idx, ...)) s.t. we don't have to insert boilerplat in the mid of the function
+
+// Parameter list shared by generated kernels: the half-open vertex range
+// [start, end) to process and the vertex buffer array (passed by value).
+#define GEN_KERNEL_PARAM_BOILERPLATE \
+        const int3 start, const int3 end, VertexBufferArray buffer
+
+// Prologue shared by generated kernels. Defines `vertexIdx` (global thread
+// index offset by `start`), returns early for threads at or beyond `end`,
+// asserts that the vertex lies inside [AC_n*_min, AC_n*_max) and defines `idx`,
+// the linear buffer index of the vertex.
+#define GEN_KERNEL_BUILTIN_VARIABLES_BOILERPLATE() \
+        const int3 vertexIdx = (int3){threadIdx.x + blockIdx.x * blockDim.x + start.x,\
+                                                            threadIdx.y + blockIdx.y * blockDim.y + start.y,\
+                                                            threadIdx.z + blockIdx.z * blockDim.z + start.z};\
+        if (vertexIdx.x >= end.x || vertexIdx.y >= end.y || vertexIdx.z >= end.z)\
+            return;\
+\
+\
+        assert(vertexIdx.x < DCONST_INT(AC_nx_max) && vertexIdx.y < DCONST_INT(AC_ny_max) &&\
+               vertexIdx.z < DCONST_INT(AC_nz_max));\
+\
+        assert(vertexIdx.x >= DCONST_INT(AC_nx_min) && vertexIdx.y >= DCONST_INT(AC_ny_min) &&\
+               vertexIdx.z >= DCONST_INT(AC_nz_min));\
+\
+        const int idx          = IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z);
+
+#include "stencil_process.cuh"
+
+/*
+ * =============================================================================
+ * Level 2 (Host calls)
+ * =============================================================================
+ */
+
+// Pseudo-random value in [0, 1], drawn from the C library rand().
+static AcReal
+randf(void)
+{
+    const AcReal sample  = AcReal(rand());
+    const AcReal max_val = AcReal(RAND_MAX);
+
+    return sample / max_val;
+}
+
+/*
+ * Enqueues one RK3 substep on `stream` for the vertices in [start, end).
+ *
+ * When LFORCING is enabled, the host-side forcing vector is first rotated by a
+ * small random angle about z and uploaded, together with the forcing phase, to
+ * the forcing_vec / forcing_phi constant-memory symbols.
+ *
+ * step_number 0 and 1 launch solve<0> and solve<1>; every other value launches
+ * solve<2>. Returns AC_SUCCESS after the launch error check.
+ */
+AcResult
+rk3_step_async(const cudaStream_t stream, const int& step_number, const int3& start, const int3& end,
+               const AcReal dt, VertexBufferArray* buffer)
+{
+    /////////////////// Forcing
+    #if LFORCING
+    const AcReal ff_scale = AcReal(.2);
+    // Static: the forcing direction persists and keeps rotating across calls
+    static AcReal3 ff = ff_scale * (AcReal3){1, 0, 0};
+    const AcReal radians = randf() * AcReal(2*M_PI) / 360 / 8;
+    ff = mul(create_rotz(radians), ff);
+    cudaMemcpyToSymbolAsync(forcing_vec, &ff, sizeof(ff), 0, cudaMemcpyHostToDevice, stream);
+
+    const AcReal ff_phi = AcReal(M_PI);//AcReal(2 * M_PI) * randf();
+    cudaMemcpyToSymbolAsync(forcing_phi, &ff_phi, sizeof(ff_phi), 0, cudaMemcpyHostToDevice, stream);
+    #endif // LFORCING
+    //////////////////////////
+
+    // Launch geometry: enough blocks to cover the requested range
+    const dim3 tpb(32, 1, 4);
+
+    const int extent_x = end.x - start.x;
+    const int extent_y = end.y - start.y;
+    const int extent_z = end.z - start.z;
+
+    const dim3 bpg(
+        (unsigned int)ceil(extent_x / AcReal(tpb.x)),
+        (unsigned int)ceil(extent_y / AcReal(tpb.y)),
+        (unsigned int)ceil(extent_z / AcReal(tpb.z)));
+
+    switch (step_number) {
+    case 0:
+        solve<0><<<bpg, tpb, 0, stream>>>(start, end, *buffer, dt);
+        break;
+    case 1:
+        solve<1><<<bpg, tpb, 0, stream>>>(start, end, *buffer, dt);
+        break;
+    default:
+        solve<2><<<bpg, tpb, 0, stream>>>(start, end, *buffer, dt);
+        break;
+    }
+
+    ERRCHK_CUDA_KERNEL();
+    return AC_SUCCESS;
+}
--- a/src/core/kernels/reduce.cuh
+++ b/src/core/kernels/reduce.cuh
@@ -0,0 +1,338 @@
+/*
+    Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
+ * \brief Brief info.
+ *
+ * Detailed info.
+ *
+ */
+#pragma once
+#include "device_globals.cuh"
+
+#include "src/core/errchk.h"
+#include "src/core/math_utils.h"
+
+// Function pointer definitions
+// ReduceFunc: binary combiner, template argument of _kernel_reduce and
+// _kernel_reduce_block.
+typedef AcReal (*ReduceFunc)(const AcReal&, const AcReal&);
+// ReduceInitialScalFunc: per-element map applied to a scalar field by
+// _kernel_reduce_scal before the reduction.
+typedef AcReal (*ReduceInitialScalFunc)(const AcReal&);
+// ReduceInitialVecFunc: per-element map applied to the three components of a
+// vector field by _kernel_reduce_vec before the reduction.
+typedef AcReal (*ReduceInitialVecFunc)(const AcReal&, const AcReal&,
+                                       const AcReal&);
+
+// clang-format off
+/* Comparison funcs */
+// Returns a when a > b, otherwise b.
+__device__ inline AcReal
+_device_max(const AcReal& a, const AcReal& b)
+{
+    if (a > b)
+        return a;
+    return b;
+}
+
+// Returns a when a < b, otherwise b.
+__device__ inline AcReal
+_device_min(const AcReal& a, const AcReal& b)
+{
+    if (a < b)
+        return a;
+    return b;
+}
+
+// Returns a + b.
+__device__ inline AcReal
+_device_sum(const AcReal& a, const AcReal& b)
+{
+    const AcReal total = a + b;
+    return total;
+}
+
+/* Functions that map mesh values to the values fed into the reduction */
+
+// Identity: the scalar is reduced as-is.
+__device__ inline AcReal
+_device_length_scal(const AcReal& a)
+{
+    const AcReal unchanged = AcReal(a);
+    return unchanged;
+}
+
+// Square of the scalar.
+__device__ inline AcReal
+_device_squared_scal(const AcReal& a)
+{
+    const AcReal squared = (AcReal)(a*a);
+    return squared;
+}
+
+// Square of exp(a).
+__device__ inline AcReal
+_device_exp_squared_scal(const AcReal& a)
+{
+    return exp(a)*exp(a);
+}
+
+// Euclidean length of the vector (a, b, c).
+__device__ inline AcReal
+_device_length_vec(const AcReal& a, const AcReal& b, const AcReal& c)
+{
+    const AcReal squared_len = a*a + b*b + c*c;
+    return sqrt(squared_len);
+}
+
+// Sum of the squared components.
+__device__ inline AcReal
+_device_squared_vec(const AcReal& a, const AcReal& b, const AcReal& c)
+{
+    const AcReal sq_a = _device_squared_scal(a);
+    const AcReal sq_b = _device_squared_scal(b);
+    const AcReal sq_c = _device_squared_scal(c);
+    return sq_a + sq_b + sq_c;
+}
+
+// Sum of the squared exponentials of the components.
+__device__ inline AcReal
+_device_exp_squared_vec(const AcReal& a, const AcReal& b, const AcReal& c)
+{
+    const AcReal exp_sq_a = _device_exp_squared_scal(a);
+    const AcReal exp_sq_b = _device_exp_squared_scal(b);
+    const AcReal exp_sq_c = _device_exp_squared_scal(c);
+    return exp_sq_a + exp_sq_b + exp_sq_c;
+}
+// clang-format on
+
+// True when (i, j, k) lies at or beyond the AC_nx / AC_ny / AC_nz extents
+// stored in d_mesh_info. Negative indices are not checked.
+__device__ inline bool
+oob(const int& i, const int& j, const int& k)
+{
+    return i >= d_mesh_info.int_params[AC_nx] ||
+           j >= d_mesh_info.int_params[AC_ny] ||
+           k >= d_mesh_info.int_params[AC_nz];
+}
+
+/*
+ * First reduction stage for scalar fields. Each thread handles one vertex of
+ * the computational domain: it reads the vertex from `src` (offset by the
+ * AC_n*_min indices), applies `reduce_initial` and stores the result at the
+ * matching computational-domain index of `dst`.
+ */
+template <ReduceInitialScalFunc reduce_initial>
+__global__ void
+_kernel_reduce_scal(const __restrict__ AcReal* src, AcReal* dst)
+{
+    const int ix = threadIdx.x + blockIdx.x * blockDim.x;
+    const int iy = threadIdx.y + blockIdx.y * blockDim.y;
+    const int iz = threadIdx.z + blockIdx.z * blockDim.z;
+
+    // Only threads that map onto the computational domain do any work
+    if (!oob(ix, iy, iz)) {
+        const int read_pos = DEVICE_VTXBUF_IDX(
+            ix + d_mesh_info.int_params[AC_nx_min],
+            iy + d_mesh_info.int_params[AC_ny_min],
+            iz + d_mesh_info.int_params[AC_nz_min]);
+        const int write_pos = DEVICE_1D_COMPDOMAIN_IDX(ix, iy, iz);
+
+        dst[write_pos] = reduce_initial(src[read_pos]);
+    }
+}
+
+/*
+ * First reduction stage for vector fields. Each thread handles one vertex of
+ * the computational domain: it reads the three components from `src_a`,
+ * `src_b`, `src_c` (offset by the AC_n*_min indices), combines them with
+ * `reduce_initial` and stores the result at the matching computational-domain
+ * index of `dst`.
+ */
+template <ReduceInitialVecFunc reduce_initial>
+__global__ void
+_kernel_reduce_vec(const __restrict__ AcReal* src_a,
+                   const __restrict__ AcReal* src_b,
+                   const __restrict__ AcReal* src_c, AcReal* dst)
+{
+    const int ix = threadIdx.x + blockIdx.x * blockDim.x;
+    const int iy = threadIdx.y + blockIdx.y * blockDim.y;
+    const int iz = threadIdx.z + blockIdx.z * blockDim.z;
+
+    // Only threads that map onto the computational domain do any work
+    if (!oob(ix, iy, iz)) {
+        const int read_pos = DEVICE_VTXBUF_IDX(
+            ix + d_mesh_info.int_params[AC_nx_min],
+            iy + d_mesh_info.int_params[AC_ny_min],
+            iz + d_mesh_info.int_params[AC_nz_min]);
+        const int write_pos = DEVICE_1D_COMPDOMAIN_IDX(ix, iy, iz);
+
+        dst[write_pos] = reduce_initial(src_a[read_pos], src_b[read_pos],
+                                        src_c[read_pos]);
+    }
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// BLOCK_SIZE: threads per block used to launch _kernel_reduce; also the size of
+// its shared-memory buffer.
+#define BLOCK_SIZE (1024)
+// ELEMS_PER_THREAD: number of scratchpad elements (BLOCK_SIZE apart) that each
+// _kernel_reduce thread folds before the shared-memory reduction. One block
+// therefore covers BLOCK_SIZE * ELEMS_PER_THREAD elements.
+#define ELEMS_PER_THREAD (32)
+
+/*
+ * Second reduction stage. Each block folds a chunk of
+ * BLOCK_SIZE * ELEMS_PER_THREAD scratchpad elements into a single value and
+ * writes it, in place, to the first element of that chunk.
+ *
+ *   src    : scratchpad of AC_nxyz elements, read and partially overwritten.
+ *   result : unused by this kernel (kept for a uniform kernel signature).
+ *
+ * Must be launched with BLOCK_SIZE threads per block. Elements beyond the end
+ * of the scratchpad are never read, and every thread of a block reaches every
+ * __syncthreads(), so the kernel is also valid when the scratchpad size is not
+ * a multiple of the chunk size.
+ */
+template <ReduceFunc reduce>
+__global__ void
+_kernel_reduce(AcReal* src, AcReal* result)
+{
+    const int block_start     = blockIdx.x * BLOCK_SIZE * ELEMS_PER_THREAD;
+    const int idx             = threadIdx.x + block_start;
+    const int scratchpad_size = DCONST_INT(AC_nxyz);
+
+    // Number of threads in this block that own at least one element. These
+    // threads are contiguous: [0, num_valid).
+    const int remaining = scratchpad_size - block_start;
+    const int num_valid = remaining < BLOCK_SIZE ? remaining : BLOCK_SIZE;
+    const bool has_data = idx < scratchpad_size;
+
+    __shared__ AcReal smem[BLOCK_SIZE];
+
+    // Per-thread serial fold over this thread's strided elements. Threads
+    // without data keep a dummy value that never enters the reduction.
+    AcReal tmp = AcReal(0);
+    if (has_data) {
+        tmp = src[idx];
+        for (int i = 1; i < ELEMS_PER_THREAD; ++i) {
+            const int src_idx = idx + i * BLOCK_SIZE;
+            if (src_idx >= scratchpad_size)
+                break; // Past the end of the scratchpad: nothing left to fold
+            tmp = reduce(tmp, src[src_idx]);
+        }
+        smem[threadIdx.x] = tmp;
+    }
+    __syncthreads();
+
+    // Tree reduction in shared memory. A partner slot is only read when it
+    // belongs to a thread that had data, so uninitialized smem is never used.
+    for (int offset = BLOCK_SIZE / 2; offset > 0; offset /= 2) {
+        if (threadIdx.x < offset && threadIdx.x + offset < num_valid) {
+            tmp               = reduce(tmp, smem[threadIdx.x + offset]);
+            smem[threadIdx.x] = tmp;
+        }
+        __syncthreads();
+    }
+
+    if (threadIdx.x == 0 && has_data)
+        src[idx] = tmp;
+}
+
+/*
+ * Final reduction stage. Starting from its own index, the thread folds every
+ * (BLOCK_SIZE * ELEMS_PER_THREAD)-th element of `src` below AC_nxyz, i.e. the
+ * per-chunk partial results left by _kernel_reduce, and stores the outcome in
+ * `*result`. The host code launches it with a single thread.
+ */
+template <ReduceFunc reduce>
+__global__ void
+_kernel_reduce_block(const __restrict__ AcReal* src, AcReal* result)
+{
+    const int limit  = DCONST_INT(AC_nxyz);
+    const int stride = BLOCK_SIZE * ELEMS_PER_THREAD;
+    const int first  = threadIdx.x + blockIdx.x * BLOCK_SIZE * ELEMS_PER_THREAD;
+
+    AcReal acc = src[first];
+
+    int pos = first + stride;
+    while (pos < limit) {
+        acc = reduce(acc, src[pos]);
+        pos += stride;
+    }
+
+    *result = acc;
+}
+//////////////////////////////////////////////////////////////////////////////
+
+/*
+ * Enqueues the three reduction stages for a scalar field on `stream`:
+ * the per-vertex map (`reduce_initial`), the per-chunk fold and the final fold
+ * of the chunk results into `reduce_result` (both using `reduce`).
+ */
+template <ReduceInitialScalFunc reduce_initial, ReduceFunc reduce>
+static void
+_launch_reduce_scal(const cudaStream_t stream, const dim3& bpg, const dim3& tpb,
+                    const int bpg2, const AcReal* vertex_buffer,
+                    AcReal* reduce_scratchpad, AcReal* reduce_result)
+{
+    _kernel_reduce_scal<reduce_initial>
+        <<<bpg, tpb, 0, stream>>>(vertex_buffer, reduce_scratchpad);
+    _kernel_reduce<reduce>
+        <<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
+    _kernel_reduce_block<reduce>
+        <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
+}
+
+/*
+ * Reduces a scalar vertex buffer over an nx * ny * nz computational domain.
+ *
+ *   rtype             : RTYPE_MAX / RTYPE_MIN reduce the raw values;
+ *                       RTYPE_RMS / RTYPE_RMS_EXP sum the squares (of exp for
+ *                       the latter) and divide by the number of vertices.
+ *   reduce_scratchpad : device buffer, one element per vertex of the domain.
+ *   reduce_result     : device buffer receiving the single result.
+ *
+ * Returns the result after a blocking device-to-host copy. For the RMS types
+ * the returned value is the mean of the squares; no square root is taken here.
+ *
+ * NOTE(review): unlike _reduce_vec, this function does not assert the
+ * power-of-two / minimum-size constraints on the mesh — confirm whether the
+ * callers guarantee them.
+ */
+AcReal
+_reduce_scal(const cudaStream_t stream,
+             const ReductionType& rtype, const int& nx, const int& ny,
+             const int& nz, const AcReal* vertex_buffer,
+             AcReal* reduce_scratchpad, AcReal* reduce_result)
+{
+    bool solve_mean = false;
+
+    // Integer ceiling division: exact for any mesh size, unlike a
+    // floating-point ceil() which loses precision for large counts.
+    const dim3 tpb(32, 4, 1);
+    const dim3 bpg((nx + tpb.x - 1) / tpb.x,
+                   (ny + tpb.y - 1) / tpb.y,
+                   (nz + tpb.z - 1) / tpb.z);
+
+    const int scratchpad_size = nx * ny * nz;
+    const int chunk_size      = ELEMS_PER_THREAD * BLOCK_SIZE;
+    const int bpg2            = scratchpad_size / chunk_size +
+                                (scratchpad_size % chunk_size != 0 ? 1 : 0);
+
+    switch (rtype) {
+    case RTYPE_MAX:
+        _launch_reduce_scal<_device_length_scal, _device_max>(
+            stream, bpg, tpb, bpg2, vertex_buffer, reduce_scratchpad, reduce_result);
+        break;
+    case RTYPE_MIN:
+        _launch_reduce_scal<_device_length_scal, _device_min>(
+            stream, bpg, tpb, bpg2, vertex_buffer, reduce_scratchpad, reduce_result);
+        break;
+    case RTYPE_RMS:
+        _launch_reduce_scal<_device_squared_scal, _device_sum>(
+            stream, bpg, tpb, bpg2, vertex_buffer, reduce_scratchpad, reduce_result);
+        solve_mean = true;
+        break;
+    case RTYPE_RMS_EXP:
+        _launch_reduce_scal<_device_exp_squared_scal, _device_sum>(
+            stream, bpg, tpb, bpg2, vertex_buffer, reduce_scratchpad, reduce_result);
+        solve_mean = true;
+        break;
+    default:
+        ERROR("Unrecognized RTYPE");
+    }
+
+    AcReal result;
+    cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
+    if (solve_mean) {
+        const AcReal inv_n = AcReal(1.0) / (nx * ny * nz);
+        return inv_n * result;
+    }
+    else {
+        return result;
+    }
+}
+
+/*
+ * Enqueues the three reduction stages for a vector field on `stream`:
+ * the per-vertex map (`reduce_initial`), the per-chunk fold and the final fold
+ * of the chunk results into `reduce_result` (both using `reduce`).
+ */
+template <ReduceInitialVecFunc reduce_initial, ReduceFunc reduce>
+static void
+_launch_reduce_vec(const cudaStream_t stream, const dim3& bpg, const dim3& tpb,
+                   const int bpg2, const AcReal* vertex_buffer_a,
+                   const AcReal* vertex_buffer_b, const AcReal* vertex_buffer_c,
+                   AcReal* reduce_scratchpad, AcReal* reduce_result)
+{
+    _kernel_reduce_vec<reduce_initial>
+        <<<bpg, tpb, 0, stream>>>(vertex_buffer_a, vertex_buffer_b, vertex_buffer_c,
+                                  reduce_scratchpad);
+    _kernel_reduce<reduce>
+        <<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
+    _kernel_reduce_block<reduce>
+        <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
+}
+
+/*
+ * Reduces a three-component vector field over an nx * ny * nz computational
+ * domain.
+ *
+ *   rtype             : RTYPE_MAX / RTYPE_MIN reduce the vector lengths;
+ *                       RTYPE_RMS / RTYPE_RMS_EXP sum the squared components
+ *                       (of exp for the latter) and divide by the number of
+ *                       vertices.
+ *   reduce_scratchpad : device buffer, one element per vertex of the domain.
+ *   reduce_result     : device buffer receiving the single result.
+ *
+ * Returns the result after a blocking device-to-host copy. For the RMS types
+ * the returned value is the mean of the squares; no square root is taken here.
+ */
+AcReal
+_reduce_vec(const cudaStream_t stream,
+            const ReductionType& rtype, const int& nx, const int& ny,
+            const int& nz, const AcReal* vertex_buffer_a,
+            const AcReal* vertex_buffer_b, const AcReal* vertex_buffer_c,
+            AcReal* reduce_scratchpad, AcReal* reduce_result)
+{
+    bool solve_mean = false;
+
+    // Integer ceiling division: exact for any mesh size, unlike a float
+    // ceil() which cannot represent every int above 2^24.
+    const dim3 tpb(32, 4, 1);
+    const dim3 bpg((nx + tpb.x - 1) / tpb.x,
+                   (ny + tpb.y - 1) / tpb.y,
+                   (nz + tpb.z - 1) / tpb.z);
+
+    const int scratchpad_size = nx * ny * nz;
+    const int chunk_size      = ELEMS_PER_THREAD * BLOCK_SIZE;
+    const int bpg2            = scratchpad_size / chunk_size +
+                                (scratchpad_size % chunk_size != 0 ? 1 : 0);
+
+    // "Features" of this quick & efficient reduction:
+    // Block size must be smaller than the computational domain size
+    // (otherwise we would have do some additional bounds checking in the
+    // second half of _kernel_reduce, which gets quite confusing)
+    // Also the BLOCK_SIZE must be a multiple of two s.t. we can easily split
+    // the work without worrying too much about the array bounds.
+    ERRCHK(BLOCK_SIZE <= scratchpad_size);
+    ERRCHK(!(BLOCK_SIZE % 2));
+    // NOTE! Also does not work properly with non-power of two mesh dimension
+    // Issue is with "smem[BLOCK_SIZE];". If you init smem to NANs, you can
+    // see that uninitialized smem values are used in the comparison
+    ERRCHK(is_power_of_two(nx));
+    ERRCHK(is_power_of_two(ny));
+    ERRCHK(is_power_of_two(nz));
+
+    switch (rtype) {
+    case RTYPE_MAX:
+        _launch_reduce_vec<_device_length_vec, _device_max>(
+            stream, bpg, tpb, bpg2, vertex_buffer_a, vertex_buffer_b, vertex_buffer_c,
+            reduce_scratchpad, reduce_result);
+        break;
+    case RTYPE_MIN:
+        _launch_reduce_vec<_device_length_vec, _device_min>(
+            stream, bpg, tpb, bpg2, vertex_buffer_a, vertex_buffer_b, vertex_buffer_c,
+            reduce_scratchpad, reduce_result);
+        break;
+    case RTYPE_RMS:
+        _launch_reduce_vec<_device_squared_vec, _device_sum>(
+            stream, bpg, tpb, bpg2, vertex_buffer_a, vertex_buffer_b, vertex_buffer_c,
+            reduce_scratchpad, reduce_result);
+        solve_mean = true;
+        break;
+    case RTYPE_RMS_EXP:
+        _launch_reduce_vec<_device_exp_squared_vec, _device_sum>(
+            stream, bpg, tpb, bpg2, vertex_buffer_a, vertex_buffer_b, vertex_buffer_c,
+            reduce_scratchpad, reduce_result);
+        solve_mean = true;
+        break;
+    default:
+        ERROR("Unrecognized RTYPE");
+    }
+
+    AcReal result;
+    cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
+    if (solve_mean) {
+        const AcReal inv_n = AcReal(1.0) / (nx * ny * nz);
+        return inv_n * result;
+    }
+    else {
+        return result;
+    }
+}
--- a/src/core/kernels/rk3.cuh
+++ b/src/core/kernels/rk3.cuh
@@ -0,0 +1,742 @@
+/*
+    Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
+ * \brief Implementation of the integration pipeline
+ *
+ *
+ *
+ */
+#pragma once
+#include "device_globals.cuh"
+
+#include <assert.h>
+
+/*
+#define RK_THREADS_X (32)
+#define RK_THREADS_Y (1)
+#define RK_THREADS_Z (4)
+#define RK_LAUNCH_BOUND_MIN_BLOCKS (4)
+#define RK_THREADBLOCK_SIZE (RK_THREADS_X * RK_THREADS_Y * RK_THREADS_Z)
+*/
+
+/** Identity overload: an already-linear index is returned unchanged. */
+static __device__ __forceinline__ int
+IDX(const int i)
+{
+    const int linear_idx = i;
+    return linear_idx;
+}
+
+/** Flattens the vertex coordinates (i, j, k) with DEVICE_VTXBUF_IDX. */
+static __device__ __forceinline__ int
+IDX(const int i, const int j, const int k)
+{
+    const int linear_idx = DEVICE_VTXBUF_IDX(i, j, k);
+    return linear_idx;
+}
+
+/** Same as the three-argument overload, taking the coordinates as an int3. */
+static __device__ __forceinline__ int
+IDX(const int3 idx)
+{
+    return IDX(idx.x, idx.y, idx.z);
+}
+
+/**
+ * Builds the 3x3 matrix that rotates a vector by `radians` about the z axis
+ * (counter-clockwise in the xy-plane for positive angles).
+ *
+ * Host-side helper. It is defined above the single-precision sin/cos macro
+ * remapping further down in this file, so it uses the standard math functions.
+ *
+ * The last row is (0, 0, 1): a rotation about z must leave the z component
+ * untouched. (It was previously all zeros, which silently discarded z.)
+ */
+static __forceinline__ AcMatrix
+create_rotz(const AcReal radians)
+{
+    const AcReal c = cos(radians);
+    const AcReal s = sin(radians);
+
+    AcMatrix mat;
+
+    mat.row[0] = (AcReal3){c, -s, 0};
+    mat.row[1] = (AcReal3){s, c, 0};
+    mat.row[2] = (AcReal3){0, 0, 1};
+
+    return mat;
+}
+
+
+// Single-precision builds only: textually remap the generic math names to
+// single-precision CUDA functions. These are preprocessor macros, so every
+// later use of the tokens sin/cos/exp/rsqrt in this translation unit is
+// affected; create_rotz is defined above this point and is not.
+// NOTE(review): __sinf/__cosf/__expf are the fast, reduced-accuracy device
+// intrinsics according to the CUDA Math API - confirm that trade-off is wanted.
+#if AC_DOUBLE_PRECISION == 0
+#define sin __sinf
+#define cos __cosf
+#define exp __expf
+#define rsqrt rsqrtf // hardware reciprocal sqrt
+#endif // AC_DOUBLE_PRECISION == 0
+
+
+/*
+typedef struct {
+    int i, j, k;
+} int3;*/
+
+/*
+ * =============================================================================
+ * Level 0 (Input Assembly Stage)
+ * =============================================================================
+ */
+
+/*
+ * =============================================================================
+ * Level 0.1 (Read stencil elements and solve derivatives)
+ * =============================================================================
+ */
+/**
+ * Central-difference first derivative evaluated at the middle of `pencil`.
+ *
+ * pencil: STENCIL_ORDER + 1 samples along one line; the evaluation point is
+ *         element STENCIL_ORDER / 2.
+ * inv_ds: factor the weighted sum is multiplied with (callers pass the
+ *         inverse mesh spacing of the axis).
+ *
+ * weights[d] multiplies the antisymmetric pair at distance d from the centre;
+ * weights[0] is unused by the loop.
+ */
+static __device__ __forceinline__ AcReal
+first_derivative(const AcReal* __restrict__ pencil, const AcReal inv_ds)
+{
+#if STENCIL_ORDER == 2
+    const AcReal weights[] = {0, 1.0 / 2.0};
+#elif STENCIL_ORDER == 4
+    const AcReal weights[] = {0, 2.0 / 3.0, -1.0 / 12.0};
+#elif STENCIL_ORDER == 6
+    const AcReal weights[] = {0, 3.0 / 4.0, -3.0 / 20.0, 1.0 / 60.0};
+#elif STENCIL_ORDER == 8
+    const AcReal weights[] = {0, 4.0 / 5.0, -1.0 / 5.0, 4.0 / 105.0,
+                              -1.0 / 280.0};
+#endif
+
+    // Index of the centre sample; stays defined for the rest of the file.
+    #define MID (STENCIL_ORDER / 2)
+    AcReal acc = 0;
+
+#pragma unroll
+    for (int dist = 1; dist <= MID; ++dist) {
+        const AcReal ahead  = pencil[MID + dist];
+        const AcReal behind = pencil[MID - dist];
+        acc += weights[dist] * (ahead - behind);
+    }
+
+    return acc * inv_ds;
+}
+
+/**
+ * Central-difference second derivative evaluated at the middle of `pencil`.
+ *
+ * pencil: STENCIL_ORDER + 1 samples along one line; the evaluation point is
+ *         element STENCIL_ORDER / 2.
+ * inv_ds: factor applied twice to the weighted sum (callers pass the inverse
+ *         mesh spacing of the axis).
+ *
+ * weights[0] multiplies the centre sample, weights[d] the symmetric pair at
+ * distance d.
+ */
+static __device__ __forceinline__ AcReal
+second_derivative(const AcReal* __restrict__ pencil, const AcReal inv_ds)
+{
+#if STENCIL_ORDER == 2
+    const AcReal weights[] = {-2., 1.};
+#elif STENCIL_ORDER == 4
+    const AcReal weights[] = {-5.0/2.0, 4.0/3.0, -1.0/12.0};
+#elif STENCIL_ORDER == 6
+    const AcReal weights[] = {-49.0 / 18.0, 3.0 / 2.0, -3.0 / 20.0,
+                              1.0 / 90.0};
+#elif STENCIL_ORDER == 8
+    const AcReal weights[] = {-205.0 / 72.0, 8.0 / 5.0, -1.0 / 5.0,
+                              8.0 / 315.0, -1.0 / 560.0};
+#endif
+
+    // Index of the centre sample; stays defined for the rest of the file.
+    #define MID (STENCIL_ORDER / 2)
+    AcReal acc = weights[0] * pencil[MID];
+
+#pragma unroll
+    for (int dist = 1; dist <= MID; ++dist) {
+        const AcReal pair_sum = pencil[MID + dist] + pencil[MID - dist];
+        acc += weights[dist] * pair_sum;
+    }
+
+    return acc * inv_ds * inv_ds;
+}
+
+/**
+ * Mixed second derivative d2f/(da db) from two diagonal pencils.
+ *
+ * pencil_a: STENCIL_ORDER + 1 samples along the diagonal on which both
+ *           coordinates increase together (see derxy/derxz/deryz).
+ * pencil_b: STENCIL_ORDER + 1 samples along the diagonal on which the second
+ *           coordinate decreases while the first increases.
+ * inv_ds_a, inv_ds_b: inverse mesh spacings of the two axes, e.g. 1 / dsx.
+ *
+ * The second difference along pencil_a approximates f_aa + 2 f_ab + f_bb and
+ * the one along pencil_b approximates f_aa - 2 f_ab + f_bb (in grid units),
+ * so their difference is 4 f_ab. Each weight is therefore the matching
+ * second_derivative weight divided by four; the centre sample cancels.
+ *
+ * The order-4 row used to hold placeholder values; it now follows the same
+ * rule: (4/3)/4 = 1/3 and (-1/12)/4 = -1/48.
+ */
+static __device__ __forceinline__ AcReal
+cross_derivative(const AcReal* __restrict__ pencil_a,
+                 const AcReal* __restrict__ pencil_b, const AcReal inv_ds_a,
+                 const AcReal inv_ds_b)
+{
+#if STENCIL_ORDER == 2
+    const AcReal coefficients[] = {0, 1.0 / 4.0};
+#elif STENCIL_ORDER == 4
+    const AcReal coefficients[] = {0, 1.0 / 3.0, -1.0 / 48.0};
+#elif STENCIL_ORDER == 6
+    const AcReal fac            = (1. / 720.);
+    const AcReal coefficients[] = {0.0 * fac, 270.0 * fac, -27.0 * fac,
+                                   2.0 * fac};
+#elif STENCIL_ORDER == 8
+    const AcReal fac            = (1. / 20160.);
+    const AcReal coefficients[] = {0.0 * fac, 8064. * fac, -1008. * fac,
+                                   128. * fac, -9. * fac};
+#endif
+
+    // Index of the centre sample; stays defined for the rest of the file.
+    #define MID (STENCIL_ORDER / 2)
+    AcReal res    = AcReal(0.);
+
+    #pragma unroll
+    for (int i = 1; i <= MID; ++i) {
+        res += coefficients[i] * (pencil_a[MID + i] + pencil_a[MID - i] -
+                                  pencil_b[MID + i] - pencil_b[MID - i]);
+    }
+    return res * inv_ds_a * inv_ds_b;
+}
+
+/** First derivative of `arr` along x at `vertexIdx`. */
+static __device__ __forceinline__ AcReal
+derx(const int3 vertexIdx, const AcReal* __restrict__ arr)
+{
+    // x coordinate of the first sample of the pencil.
+    const int x_first = vertexIdx.x - STENCIL_ORDER / 2;
+
+    AcReal samples[STENCIL_ORDER + 1];
+#pragma unroll
+    for (int n = 0; n <= STENCIL_ORDER; ++n)
+        samples[n] = arr[IDX(x_first + n, vertexIdx.y, vertexIdx.z)];
+
+    return first_derivative(samples, DCONST_REAL(AC_inv_dsx));
+}
+
+/** Second derivative of `arr` along x at `vertexIdx`. */
+static __device__ __forceinline__ AcReal
+derxx(const int3 vertexIdx, const AcReal* __restrict__ arr)
+{
+    // x coordinate of the first sample of the pencil.
+    const int x_first = vertexIdx.x - STENCIL_ORDER / 2;
+
+    AcReal samples[STENCIL_ORDER + 1];
+#pragma unroll
+    for (int n = 0; n <= STENCIL_ORDER; ++n)
+        samples[n] = arr[IDX(x_first + n, vertexIdx.y, vertexIdx.z)];
+
+    return second_derivative(samples, DCONST_REAL(AC_inv_dsx));
+}
+
+/**
+ * Mixed derivative d2/(dx dy) of `arr` at `vertexIdx`, sampled along the two
+ * diagonals of the xy-plane that pass through the vertex.
+ */
+static __device__ __forceinline__ AcReal
+derxy(const int3 vertexIdx, const AcReal* __restrict__ arr)
+{
+    const int x_first = vertexIdx.x - STENCIL_ORDER / 2;
+    const int y_first = vertexIdx.y - STENCIL_ORDER / 2;
+    const int y_last  = vertexIdx.y + STENCIL_ORDER / 2;
+
+    // Diagonal on which x and y grow together.
+    AcReal diag_rising[STENCIL_ORDER + 1];
+#pragma unroll
+    for (int n = 0; n <= STENCIL_ORDER; ++n)
+        diag_rising[n] = arr[IDX(x_first + n, y_first + n, vertexIdx.z)];
+
+    // Diagonal on which y shrinks while x grows.
+    AcReal diag_falling[STENCIL_ORDER + 1];
+#pragma unroll
+    for (int n = 0; n <= STENCIL_ORDER; ++n)
+        diag_falling[n] = arr[IDX(x_first + n, y_last - n, vertexIdx.z)];
+
+    return cross_derivative(diag_rising, diag_falling, DCONST_REAL(AC_inv_dsx),
+                            DCONST_REAL(AC_inv_dsy));
+}
+
+/**
+ * Mixed derivative d2/(dx dz) of `arr` at `vertexIdx`, sampled along the two
+ * diagonals of the xz-plane that pass through the vertex.
+ */
+static __device__ __forceinline__ AcReal
+derxz(const int3 vertexIdx, const AcReal* __restrict__ arr)
+{
+    const int x_first = vertexIdx.x - STENCIL_ORDER / 2;
+    const int z_first = vertexIdx.z - STENCIL_ORDER / 2;
+    const int z_last  = vertexIdx.z + STENCIL_ORDER / 2;
+
+    // Diagonal on which x and z grow together.
+    AcReal diag_rising[STENCIL_ORDER + 1];
+#pragma unroll
+    for (int n = 0; n <= STENCIL_ORDER; ++n)
+        diag_rising[n] = arr[IDX(x_first + n, vertexIdx.y, z_first + n)];
+
+    // Diagonal on which z shrinks while x grows.
+    AcReal diag_falling[STENCIL_ORDER + 1];
+#pragma unroll
+    for (int n = 0; n <= STENCIL_ORDER; ++n)
+        diag_falling[n] = arr[IDX(x_first + n, vertexIdx.y, z_last - n)];
+
+    return cross_derivative(diag_rising, diag_falling, DCONST_REAL(AC_inv_dsx),
+                            DCONST_REAL(AC_inv_dsz));
+}
+
+/** First derivative of `arr` along y at `vertexIdx`. */
+static __device__ __forceinline__ AcReal
+dery(const int3 vertexIdx, const AcReal* __restrict__ arr)
+{
+    // y coordinate of the first sample of the pencil.
+    const int y_first = vertexIdx.y - STENCIL_ORDER / 2;
+
+    AcReal samples[STENCIL_ORDER + 1];
+#pragma unroll
+    for (int n = 0; n <= STENCIL_ORDER; ++n)
+        samples[n] = arr[IDX(vertexIdx.x, y_first + n, vertexIdx.z)];
+
+    return first_derivative(samples, DCONST_REAL(AC_inv_dsy));
+}
+
+/** Second derivative of `arr` along y at `vertexIdx`. */
+static __device__ __forceinline__ AcReal
+deryy(const int3 vertexIdx, const AcReal* __restrict__ arr)
+{
+    // y coordinate of the first sample of the pencil.
+    const int y_first = vertexIdx.y - STENCIL_ORDER / 2;
+
+    AcReal samples[STENCIL_ORDER + 1];
+#pragma unroll
+    for (int n = 0; n <= STENCIL_ORDER; ++n)
+        samples[n] = arr[IDX(vertexIdx.x, y_first + n, vertexIdx.z)];
+
+    return second_derivative(samples, DCONST_REAL(AC_inv_dsy));
+}
+
+/**
+ * Mixed derivative d2/(dy dz) of `arr` at `vertexIdx`, sampled along the two
+ * diagonals of the yz-plane that pass through the vertex.
+ */
+static __device__ __forceinline__ AcReal
+deryz(const int3 vertexIdx, const AcReal* __restrict__ arr)
+{
+    const int y_first = vertexIdx.y - STENCIL_ORDER / 2;
+    const int z_first = vertexIdx.z - STENCIL_ORDER / 2;
+    const int z_last  = vertexIdx.z + STENCIL_ORDER / 2;
+
+    // Diagonal on which y and z grow together.
+    AcReal diag_rising[STENCIL_ORDER + 1];
+#pragma unroll
+    for (int n = 0; n <= STENCIL_ORDER; ++n)
+        diag_rising[n] = arr[IDX(vertexIdx.x, y_first + n, z_first + n)];
+
+    // Diagonal on which z shrinks while y grows.
+    AcReal diag_falling[STENCIL_ORDER + 1];
+#pragma unroll
+    for (int n = 0; n <= STENCIL_ORDER; ++n)
+        diag_falling[n] = arr[IDX(vertexIdx.x, y_first + n, z_last - n)];
+
+    return cross_derivative(diag_rising, diag_falling, DCONST_REAL(AC_inv_dsy),
+                            DCONST_REAL(AC_inv_dsz));
+}
+
+/** First derivative of `arr` along z at `vertexIdx`. */
+static __device__ __forceinline__ AcReal
+derz(const int3 vertexIdx, const AcReal* __restrict__ arr)
+{
+    // z coordinate of the first sample of the pencil.
+    const int z_first = vertexIdx.z - STENCIL_ORDER / 2;
+
+    AcReal samples[STENCIL_ORDER + 1];
+#pragma unroll
+    for (int n = 0; n <= STENCIL_ORDER; ++n)
+        samples[n] = arr[IDX(vertexIdx.x, vertexIdx.y, z_first + n)];
+
+    return first_derivative(samples, DCONST_REAL(AC_inv_dsz));
+}
+
+/** Second derivative of `arr` along z at `vertexIdx`. */
+static __device__ __forceinline__ AcReal
+derzz(const int3 vertexIdx, const AcReal* __restrict__ arr)
+{
+    // z coordinate of the first sample of the pencil.
+    const int z_first = vertexIdx.z - STENCIL_ORDER / 2;
+
+    AcReal samples[STENCIL_ORDER + 1];
+#pragma unroll
+    for (int n = 0; n <= STENCIL_ORDER; ++n)
+        samples[n] = arr[IDX(vertexIdx.x, vertexIdx.y, z_first + n)];
+
+    return second_derivative(samples, DCONST_REAL(AC_inv_dsz));
+}
+
+/*
+ * =============================================================================
+ * Level 0.2 (Caching functions)
+ * =============================================================================
+ */
+
+#include "stencil_assembly.cuh"
+
+/*
+typedef struct {
+    AcRealData x;
+    AcRealData y;
+    AcRealData z;
+} AcReal3Data;
+
+static __device__ __forceinline__ AcReal3Data
+read_data(const int i, const int j, const int k,
+          AcReal* __restrict__ buf[], const int3& handle)
+{
+    AcReal3Data data;
+
+    data.x = read_data(i, j, k, buf, handle.x);
+    data.y = read_data(i, j, k, buf, handle.y);
+    data.z = read_data(i, j, k, buf, handle.z);
+
+    return data;
+}
+*/
+
+/*
+ * =============================================================================
+ * Level 0.3 (Built-in functions available during the Stencil Processing Stage)
+ * =============================================================================
+ */
+
+/** Component-wise difference a - b. */
+static __host__ __device__ __forceinline__ AcReal3
+operator-(const AcReal3& a, const AcReal3& b)
+{
+    AcReal3 diff;
+    diff.x = a.x - b.x;
+    diff.y = a.y - b.y;
+    diff.z = a.z - b.z;
+    return diff;
+}
+
+/** Component-wise sum a + b. */
+static __host__ __device__ __forceinline__ AcReal3
+operator+(const AcReal3& a, const AcReal3& b)
+{
+    AcReal3 total;
+    total.x = a.x + b.x;
+    total.y = a.y + b.y;
+    total.z = a.z + b.z;
+    return total;
+}
+
+/** Component-wise negation. */
+static __host__ __device__ __forceinline__ AcReal3
+operator-(const AcReal3& a)
+{
+    AcReal3 negated;
+    negated.x = -a.x;
+    negated.y = -a.y;
+    negated.z = -a.z;
+    return negated;
+}
+
+/** Scalar-times-vector product; the scalar is the left operand. */
+static __host__  __device__ __forceinline__ AcReal3
+operator*(const AcReal a, const AcReal3& b)
+{
+    AcReal3 scaled;
+    scaled.x = a * b.x;
+    scaled.y = a * b.y;
+    scaled.z = a * b.z;
+    return scaled;
+}
+
+/** Euclidean inner product of a and b. */
+static __host__ __device__ __forceinline__ AcReal
+dot(const AcReal3& a, const AcReal3& b)
+{
+    const AcReal inner = a.x * b.x + a.y * b.y + a.z * b.z;
+    return inner;
+}
+
+/** Matrix-vector product aa * x: each component is dot(row, x). */
+static __host__ __device__ __forceinline__ AcReal3
+mul(const AcMatrix& aa, const AcReal3& x)
+{
+    AcReal3 product;
+    product.x = dot(aa.row[0], x);
+    product.y = dot(aa.row[1], x);
+    product.z = dot(aa.row[2], x);
+    return product;
+}
+
+/** Vector (cross) product a x b. */
+static __host__ __device__ __forceinline__ AcReal3
+cross(const AcReal3& a, const AcReal3& b)
+{
+    return (AcReal3){a.y * b.z - a.z * b.y,
+                     a.z * b.x - a.x * b.z,
+                     a.x * b.y - a.y * b.x};
+}
+
+/** True when `a` is neither NaN nor infinite. */
+static __host__ __device__ __forceinline__ bool
+is_valid(const AcReal a)
+{
+    if (isnan(a))
+        return false;
+    if (isinf(a))
+        return false;
+    return true;
+}
+
+/** True when all three components pass the scalar is_valid check. */
+static __host__ __device__ __forceinline__ bool
+is_valid(const AcReal3& a)
+{
+    if (!is_valid(a.x))
+        return false;
+    if (!is_valid(a.y))
+        return false;
+    return is_valid(a.z);
+}
+
+
+/*
+ * =============================================================================
+ * Level 1 (Stencil Processing Stage)
+ * =============================================================================
+ */
+
+/*
+ * =============================================================================
+ * Level 1.1 (Terms)
+ * =============================================================================
+ */
+/** Laplacian of a scalar field: the sum of the diagonal of hessian(data). */
+static __device__ __forceinline__ AcReal
+laplace(const AcRealData& data)
+{
+    const AcReal d2_dx2 = hessian(data).row[0].x;
+    const AcReal d2_dy2 = hessian(data).row[1].y;
+    const AcReal d2_dz2 = hessian(data).row[2].z;
+
+    return d2_dx2 + d2_dy2 + d2_dz2;
+}
+
+/** Divergence of a vector field: d(vec.x)/dx + d(vec.y)/dy + d(vec.z)/dz. */
+static __device__ __forceinline__ AcReal
+divergence(const AcReal3Data& vec)
+{
+    const AcReal dvx_dx = gradient(vec.x).x;
+    const AcReal dvy_dy = gradient(vec.y).y;
+    const AcReal dvz_dz = gradient(vec.z).z;
+
+    return dvx_dx + dvy_dy + dvz_dz;
+}
+
+/** Vector Laplacian: laplace() applied to each component separately. */
+static __device__ __forceinline__ AcReal3
+laplace_vec(const AcReal3Data& vec)
+{
+    AcReal3 lap;
+    lap.x = laplace(vec.x);
+    lap.y = laplace(vec.y);
+    lap.z = laplace(vec.z);
+    return lap;
+}
+
+/** Curl of a vector field, assembled from the component gradients. */
+static __device__ __forceinline__ AcReal3
+curl(const AcReal3Data& vec)
+{
+    AcReal3 rot;
+    rot.x = gradient(vec.z).y - gradient(vec.y).z;
+    rot.y = gradient(vec.x).z - gradient(vec.z).x;
+    rot.z = gradient(vec.y).x - gradient(vec.x).y;
+    return rot;
+}
+
+/**
+ * Gradient of the divergence of a vector field.
+ * Component r sums, over the three field components c, the entry of
+ * hessian(vec.c) found in row r at the column that matches c.
+ */
+static __device__ __forceinline__ AcReal3
+gradient_of_divergence(const AcReal3Data& vec)
+{
+    AcReal3 grad_div;
+
+    grad_div.x = hessian(vec.x).row[0].x + hessian(vec.y).row[0].y + hessian(vec.z).row[0].z;
+    grad_div.y = hessian(vec.x).row[1].x + hessian(vec.y).row[1].y + hessian(vec.z).row[1].z;
+    grad_div.z = hessian(vec.x).row[2].x + hessian(vec.y).row[2].y + hessian(vec.z).row[2].z;
+
+    return grad_div;
+}
+
+// Builds the symmetric matrix S from the gradients of the three components of
+// `vec`: off-diagonal entries are the averaged cross gradients, diagonal
+// entries are 2/3 of the matching gradient minus 1/3 of the other two.
+static __device__ __forceinline__ AcMatrix
+stress_tensor(const AcReal3Data& vec)
+{
+    // All nine first derivatives, named d<component>_d<axis>.
+    const AcReal dx_dx = gradient(vec.x).x;
+    const AcReal dx_dy = gradient(vec.x).y;
+    const AcReal dx_dz = gradient(vec.x).z;
+    const AcReal dy_dx = gradient(vec.y).x;
+    const AcReal dy_dy = gradient(vec.y).y;
+    const AcReal dy_dz = gradient(vec.y).z;
+    const AcReal dz_dx = gradient(vec.z).x;
+    const AcReal dz_dy = gradient(vec.z).y;
+    const AcReal dz_dz = gradient(vec.z).z;
+
+    AcMatrix S;
+
+    // Diagonal.
+    S.row[0].x = AcReal(2. / 3.) * dx_dx - AcReal(1. / 3.) * (dy_dy + dz_dz);
+    S.row[1].y = AcReal(2. / 3.) * dy_dy - AcReal(1. / 3.) * (dx_dx + dz_dz);
+    S.row[2].z = AcReal(2. / 3.) * dz_dz - AcReal(1. / 3.) * (dx_dx + dy_dy);
+
+    // Upper triangle.
+    S.row[0].y = AcReal(1. / 2.) * (dx_dy + dy_dx);
+    S.row[0].z = AcReal(1. / 2.) * (dx_dz + dz_dx);
+    S.row[1].z = AcReal(1. / 2.) * (dy_dz + dz_dy);
+
+    // Lower triangle mirrors the upper one.
+    S.row[1].x = S.row[0].y;
+    S.row[2].x = S.row[0].z;
+    S.row[2].y = S.row[1].z;
+
+    return S;
+}
+
+/** Sum of the squares of all nine matrix entries, accumulated row by row. */
+static __device__ __forceinline__ AcReal
+contract(const AcMatrix& mat)
+{
+    AcReal total = 0;
+
+    total += dot(mat.row[0], mat.row[0]);
+    total += dot(mat.row[1], mat.row[1]);
+    total += dot(mat.row[2], mat.row[2]);
+
+    return total;
+}
+
+/*
+ * =============================================================================
+ * Level 1.2 (Equations)
+ * =============================================================================
+ */
+/** Euclidean length of `vec`. */
+static __device__ __forceinline__ AcReal
+length(const AcReal3& vec)
+{
+    return sqrt(dot(vec, vec));
+}
+
+/** Reciprocal of the Euclidean length of `vec`, computed with rsqrt. */
+static __device__ __forceinline__ AcReal
+reciprocal_len(const AcReal3& vec)
+{
+    return rsqrt(dot(vec, vec));
+}
+
+/** `vec` scaled by reciprocal_len(vec); no special handling of a zero vector. */
+static __device__ __forceinline__ AcReal3
+normalized(const AcReal3& vec)
+{
+    return reciprocal_len(vec) * vec;
+}
+
+// Sinusoidal forcing
+// https://arxiv.org/pdf/1704.04676.pdf
+//
+// forcing_vec and forcing_phi are __constant__ symbols; rk3_step_async uploads
+// them with cudaMemcpyToSymbolAsync when LFORCING is enabled.
+__constant__ AcReal3 forcing_vec;
+__constant__ AcReal forcing_phi;
+// Returns the forcing vector for the vertex (i, j, k): forcing_vec scaled by a
+// cosine wave and by the clamped inverse squared distance computed below.
+static __device__ __forceinline__ AcReal3
+forcing(const int i, const int j, const int k)
+{
+    // Extents: point count times spacing per axis. These macros stay defined
+    // after this function.
+    #define DOMAIN_SIZE_X (DCONST_INT(AC_nx) * DCONST_REAL(AC_dsx))
+    #define DOMAIN_SIZE_Y (DCONST_INT(AC_ny) * DCONST_REAL(AC_dsy))
+    #define DOMAIN_SIZE_Z (DCONST_INT(AC_nz) * DCONST_REAL(AC_dsz))
+    // Despite its name, k_vec is built from the vertex indices: the offset
+    // from the *_min index times the spacing, minus half the extent.
+    const AcReal3 k_vec = (AcReal3){(i - DCONST_INT(AC_nx_min)) * DCONST_REAL(AC_dsx) - AcReal(.5) * DOMAIN_SIZE_X,
+                                    (j - DCONST_INT(AC_ny_min)) * DCONST_REAL(AC_dsy) - AcReal(.5) * DOMAIN_SIZE_Y,
+                                    (k - DCONST_INT(AC_nz_min)) * DCONST_REAL(AC_dsz) - AcReal(.5) * DOMAIN_SIZE_Z};
+    AcReal inv_len = reciprocal_len(k_vec);
+    // A non-finite reciprocal (e.g. from a zero-length k_vec) contributes nothing.
+    if (isnan(inv_len) || isinf(inv_len))
+        inv_len = 0;
+    // Ad-hoc cap: limits inv_len to 2, so the squared factor below is at most 4.
+    if (inv_len > 2) // hack to make it cool
+        inv_len = 2;
+    const AcReal k_dot_x = dot(k_vec, forcing_vec);
+
+    // Expanded angle-sum form of cos(k_dot_x + forcing_phi).
+    const AcReal waves = cos(k_dot_x)*cos(forcing_phi) - sin(k_dot_x) * sin(forcing_phi);
+
+    return inv_len * inv_len * waves * forcing_vec;
+}
+
+
+// Note: LNT0 and LNRHO0 must be set very carefully: if their magnitude differs
+// from that of the other values in the mesh, precision is inherently lost.
+#define LNT0 (AcReal(0.0))
+#define LNRHO0 (AcReal(0.0))
+
+// NOTE(review): the meaning of H_CONST and C_CONST is not visible in this
+// file; both are currently zero - confirm against the code that uses them.
+#define H_CONST (AcReal(0.0))
+#define C_CONST (AcReal(0.0))
+
+
+
+/**
+ * One substep of the low-storage third-order Runge-Kutta scheme for a scalar.
+ *
+ * step_number:    compile-time substep index, 0, 1 or 2.
+ * state_previous: value from before the preceding substep (unused in step 0).
+ * state_current:  value the substep advances from.
+ * rate_of_change: right-hand side evaluated for state_current.
+ * dt:             time step.
+ *
+ * Returns the advanced value, or NAN for any other step_number.
+ *
+ * Steps 1 and 2 recover the carried increment of the previous substep as
+ * (state_current - state_previous) / beta[step_number] rather than storing it.
+ */
+template <int step_number>
+static __device__ __forceinline__ AcReal
+rk3_integrate(const AcReal state_previous, const AcReal state_current,
+              const AcReal rate_of_change, const AcReal dt)
+{
+    // Williamson (1980)
+    // Both tables carry a dummy element 0, so the coefficients of substep s
+    // sit at index s + 1.
+    const AcReal alpha[] = {0, AcReal(.0), AcReal(-5. / 9.), AcReal(-153. / 128.)};
+    const AcReal beta[]  = {0, AcReal(1. / 3.), AcReal(15. / 16.),
+                           AcReal(8. / 15.)};
+
+
+    // Note the indexing: the +1 shift exists so that the "previous beta" read
+    // below is beta[step_number] instead of beta[step_number - 1]; the latter
+    // triggers a spurious out-of-bounds warning when step_number is 0, even
+    // though that branch is never taken for step 0.
+    switch (step_number) {
+        case 0:
+            // First substep: plain update, no carried increment.
+            return state_current + beta[step_number + 1] * rate_of_change * dt;
+        case 1: // Fallthrough
+        case 2:
+            return state_current +
+               beta[step_number + 1] *
+                   (alpha[step_number + 1] * (AcReal(1.) / beta[step_number]) *
+                        (state_current - state_previous) +
+                    rate_of_change * dt);
+        default:
+            return NAN;
+    }
+}
+/*
+template <int step_number>
+static __device__ __forceinline__ AcReal
+rk3_integrate_scal(const AcReal state_previous, const AcReal state_current,
+              const AcReal rate_of_change, const AcReal dt)
+{
+    // Williamson (1980)
+    const AcReal alpha[] = {AcReal(.0), AcReal(-5. / 9.), AcReal(-153. / 128.)};
+    const AcReal beta[]  = {AcReal(1. / 3.), AcReal(15. / 16.),
+                           AcReal(8. / 15.)};
+
+
+    switch (step_number) {
+        case 0:
+            return state_current + beta[step_number] * rate_of_change * dt;
+        case 1: // Fallthrough
+        case 2:
+            return state_current +
+               beta[step_number] *
+                   (alpha[step_number] * (AcReal(1.) / beta[step_number - 1]) *
+                        (state_current - state_previous) +
+                    rate_of_change * dt);
+        default:
+            return NAN;
+    }
+}
+*/
+
+/**
+ * Vector overload: runs the scalar rk3_integrate<step_number> on the x, y and
+ * z components independently and packs the three results.
+ */
+template <int step_number>
+static __device__ __forceinline__ AcReal3
+rk3_integrate(const AcReal3 state_previous, const AcReal3 state_current,
+              const AcReal3 rate_of_change, const AcReal dt)
+{
+    AcReal3 advanced;
+
+    advanced.x = rk3_integrate<step_number>(state_previous.x, state_current.x,
+                                            rate_of_change.x, dt);
+    advanced.y = rk3_integrate<step_number>(state_previous.y, state_current.y,
+                                            rate_of_change.y, dt);
+    advanced.z = rk3_integrate<step_number>(state_previous.z, state_current.z,
+                                            rate_of_change.z, dt);
+
+    return advanced;
+}
+
+// Shorthand for rk3_integrate<step_number>(...). It expands in place, so it is
+// only usable where a compile-time `step_number` is in scope, and it passes
+// `state_current` through value() before integrating.
+#define rk3(state_previous, state_current, rate_of_change, dt)\
+rk3_integrate<step_number>(state_previous, value(state_current), rate_of_change, dt)
+
+/*
+template <int step_number>
+static __device__ __forceinline__ AcReal
+rk3_integrate(const int idx, const AcReal out, const int handle,
+              const AcRealData& in_cached, const AcReal rate_of_change, const AcReal dt)
+{
+    return rk3_integrate_scal<step_number>(out, value(in_cached), rate_of_change, dt);
+}
+
+template <int step_number>
+static __device__ __forceinline__ AcReal3
+rk3_integrate(const int idx, const AcReal3 out, const int3& handle,
+                  const AcReal3Data& in_cached, const AcReal3& rate_of_change, const AcReal dt)
+{
+    return (AcReal3) {
+        rk3_integrate<step_number>(idx, out, handle.x, in_cached.x, rate_of_change.x, dt),
+        rk3_integrate<step_number>(idx, out, handle.y, in_cached.y, rate_of_change.y, dt),
+        rk3_integrate<step_number>(idx, out, handle.z, in_cached.z, rate_of_change.z, dt)
+    };
+}
+
+#define RK3(handle, in_cached, rate_of_change, dt) \
+rk3_integrate<step_number>(idx, buffer.out, handle, in_cached, rate_of_change, dt)
+*/
+
+/*
+ * =============================================================================
+ * Level 1.3 (Kernels)
+ * =============================================================================
+ */
+
+/** Stores `value` at element `idx` of the buffer selected by `handle`. */
+static __device__ void
+write(AcReal* __restrict__ out[], const int handle, const int idx, const AcReal value)
+{
+    AcReal* const target = out[handle];
+    target[idx] = value;
+}
+
+/**
+ * Vector overload: stores the x, y and z components of `value` into the three
+ * buffers named by vec.x, vec.y and vec.z, all at element `idx`.
+ */
+static __device__ void
+write(AcReal* __restrict__ out[], const int3 vec, const int idx, const AcReal3 value)
+{
+    out[vec.x][idx] = value.x;
+    out[vec.y][idx] = value.y;
+    out[vec.z][idx] = value.z;
+}
+
+/** Loads element `idx` of the buffer selected by `handle`. */
+static __device__ AcReal
+read_out(const int idx, AcReal* __restrict__ field[], const int handle)
+{
+    const AcReal* const source = field[handle];
+    return source[idx];
+}
+
+/**
+ * Vector overload: loads element `idx` from the three buffers named by
+ * handle.x, handle.y and handle.z and packs them into an AcReal3.
+ */
+static __device__ AcReal3
+read_out(const int idx, AcReal* __restrict__ field[], const int3 handle)
+{
+    AcReal3 loaded;
+    loaded.x = read_out(idx, field, handle.x);
+    loaded.y = read_out(idx, field, handle.y);
+    loaded.z = read_out(idx, field, handle.z);
+    return loaded;
+}
+
+// Accessor shorthands. They expand in place and rely on `buffer`, `idx` and
+// `vertexIdx` being in scope, which the two GEN_KERNEL_* macros below provide.
+#define WRITE_OUT(handle, value) (write(buffer.out, handle, idx, value))
+#define READ(handle) (read_data(vertexIdx, buffer.in, handle))
+#define READ_OUT(handle) (read_out(idx, buffer.out, handle))
+
+// TODO: consider a plain WRITE(handle) helper for hand-written code (not the
+// DSL) so that boilerplate need not be inserted in the middle of a function.
+//#define WRITE(HANDLE) (write(idx, ...))
+
+// Parameter list shared by generated kernels: the index range [start, end)
+// and the vertex buffers.
+#define GEN_KERNEL_PARAM_BOILERPLATE \
+        const int3 start, const int3 end, VertexBufferArray buffer
+
+// Kernel prologue: derives vertexIdx from the thread/block indices offset by
+// `start`, returns early for threads at or beyond `end`, asserts that the
+// vertex lies inside [AC_n*_min, AC_n*_max) and defines the linear index `idx`.
+#define GEN_KERNEL_BUILTIN_VARIABLES_BOILERPLATE() \
+        const int3 vertexIdx = (int3){threadIdx.x + blockIdx.x * blockDim.x + start.x,\
+                                                            threadIdx.y + blockIdx.y * blockDim.y + start.y,\
+                                                            threadIdx.z + blockIdx.z * blockDim.z + start.z};\
+        if (vertexIdx.x >= end.x || vertexIdx.y >= end.y || vertexIdx.z >= end.z)\
+            return;\
+\
+\
+        assert(vertexIdx.x < DCONST_INT(AC_nx_max) && vertexIdx.y < DCONST_INT(AC_ny_max) &&\
+               vertexIdx.z < DCONST_INT(AC_nz_max));\
+\
+        assert(vertexIdx.x >= DCONST_INT(AC_nx_min) && vertexIdx.y >= DCONST_INT(AC_ny_min) &&\
+               vertexIdx.z >= DCONST_INT(AC_nz_min));\
+\
+        const int idx          = IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z);
+
+#include "stencil_process.cuh"
+
+/*
+ * =============================================================================
+ * Level 2 (Host calls)
+ * =============================================================================
+ */
+
+/** Pseudo-random number in [0, 1] obtained from rand() / RAND_MAX. */
+static AcReal
+randf(void)
+{
+    const AcReal numerator   = AcReal(rand());
+    const AcReal denominator = AcReal(RAND_MAX);
+    return numerator / denominator;
+}
+
+/**
+ * Enqueues one RK3 substep on `stream`.
+ *
+ * stream:      CUDA stream that receives the uploads and the kernel launch.
+ * tpb:         threads per block.
+ * start, end:  vertex index range [start, end) handed to the kernel.
+ * step_number: 0 and 1 select solve<0> and solve<1>; any other value selects
+ *              solve<2>.
+ * dt:          time step forwarded to the kernel.
+ * buffer:      vertex buffers; passed to the kernel by value.
+ *
+ * With LFORCING enabled, the host-side forcing vector is first rotated about
+ * z by a small random angle and uploaded together with the forcing phase.
+ * The vector is a function-local static, so it persists between calls.
+ *
+ * Returns AC_SUCCESS after ERRCHK_CUDA_KERNEL() has run.
+ */
+AcResult
+rk3_step_async(const cudaStream_t stream, const dim3& tpb,
+               const int3& start, const int3& end, const int& step_number,
+               const AcReal dt, const AcMeshInfo& /*mesh_info*/,
+               VertexBufferArray* buffer)
+{
+    #if LFORCING
+    // Forcing direction: starts as 0.2 * (1, 0, 0) and is rotated a little
+    // further on every call.
+    const AcReal ff_scale = AcReal(.2);
+    static AcReal3 ff = ff_scale * (AcReal3){1, 0, 0};
+    const AcReal radians = randf() * AcReal(2*M_PI) / 360 / 8;
+    ff = mul(create_rotz(radians), ff);
+    cudaMemcpyToSymbolAsync(forcing_vec, &ff, sizeof(ff), 0, cudaMemcpyHostToDevice, stream);
+
+    // Forcing phase: fixed at pi (a random phase, AcReal(2 * M_PI) * randf(),
+    // is the disabled alternative).
+    const AcReal ff_phi = AcReal(M_PI);
+    cudaMemcpyToSymbolAsync(forcing_phi, &ff_phi, sizeof(ff_phi), 0, cudaMemcpyHostToDevice, stream);
+    #endif // LFORCING
+
+    // Blocks per grid: the extent of each axis divided by the block size,
+    // rounded up.
+    const unsigned int blocks_x = (unsigned int)ceil((end.x - start.x) / AcReal(tpb.x));
+    const unsigned int blocks_y = (unsigned int)ceil((end.y - start.y) / AcReal(tpb.y));
+    const unsigned int blocks_z = (unsigned int)ceil((end.z - start.z) / AcReal(tpb.z));
+    const dim3 bpg(blocks_x, blocks_y, blocks_z);
+
+    // The substep is a template argument of the kernel, so dispatch on it here.
+    switch (step_number) {
+    case 0:
+        solve<0><<<bpg, tpb, 0, stream>>>(start, end, *buffer, dt);
+        break;
+    case 1:
+        solve<1><<<bpg, tpb, 0, stream>>>(start, end, *buffer, dt);
+        break;
+    default:
+        solve<2><<<bpg, tpb, 0, stream>>>(start, end, *buffer, dt);
+        break;
+    }
+
+    ERRCHK_CUDA_KERNEL();
+    return AC_SUCCESS;
+}
--- a/src/core/math_utils.h
+++ b/src/core/math_utils.h
@@ -0,0 +1,91 @@
+/*
+    Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
+ * \brief Brief info.
+ *
+ * Detailed info.
+ *
+ */
+#pragma once
+#include <math.h>   // isnan, isinf
+#include <stdlib.h> // rand
+
+/** Returns the larger of a and b; b when they compare equal. */
+template <class T>
+static inline const T
+max(const T& a, const T& b)
+{
+    if (a > b)
+        return a;
+    return b;
+}
+
+/** Returns the smaller of a and b; b when they compare equal. */
+template <class T>
+static inline const T
+min(const T& a, const T& b)
+{
+    if (a < b)
+        return a;
+    return b;
+}
+
+/** Returns a + b; the binary-function form of addition. */
+template <class T>
+static inline const T
+sum(const T& a, const T& b)
+{
+    const T total = a + b;
+    return total;
+}
+
+/**
+ * Returns true when `val` is a finite number, false when it is NaN or
+ * infinite.
+ *
+ * The result is a truth value, so the return type is bool. It was previously
+ * `const T`, which converted true/false to T (e.g. 1.0f / 0.0f).
+ */
+template <class T>
+static inline bool
+is_valid(const T& val)
+{
+    return !(isnan(val) || isinf(val));
+}
+
+/**
+ * Limits `val` to the range [min, max]: returns min when val is below it,
+ * max when val is above it, and val otherwise. The lower bound is tested
+ * first.
+ */
+template <class T>
+static inline const T
+clamp(const T& val, const T& min, const T& max)
+{
+    if (val < min)
+        return min;
+    if (val > max)
+        return max;
+    return val;
+}
+
+/** Pseudo-random number in [0, 1] obtained from rand() / RAND_MAX. */
+static inline AcReal
+randr()
+{
+    const AcReal numerator   = AcReal(rand());
+    const AcReal denominator = AcReal(RAND_MAX);
+    return numerator / denominator;
+}
+
+/** Component-wise sum of two int3 values. */
+static inline int3
+operator+(const int3& a, const int3& b)
+{
+    int3 total;
+    total.x = a.x + b.x;
+    total.y = a.y + b.y;
+    total.z = a.z + b.z;
+    return total;
+}
+
+/** Component-wise difference of two int3 values. */
+static inline int3
+operator-(const int3& a, const int3& b)
+{
+    int3 diff;
+    diff.x = a.x - b.x;
+    diff.y = a.y - b.y;
+    diff.z = a.z - b.z;
+    return diff;
+}
+
+/**
+ * True when exactly one bit of `val` is set. Zero is not a power of two.
+ * Clearing the lowest set bit with val & (val - 1) leaves zero only when
+ * there was a single bit.
+ */
+static inline bool
+is_power_of_two(const unsigned val)
+{
+    if (val == 0)
+        return false;
+
+    const unsigned without_lowest_bit = val & (val - 1);
+    return without_lowest_bit == 0;
+}
--- a/src/standalone/CMakeLists.txt
+++ b/src/standalone/CMakeLists.txt
@@ -0,0 +1,10 @@
+#####################################################
+##  CMakeLists.txt for the astaroth_standalone lib ##
+#####################################################
+
+# NOTE: globbed sources are only rescanned at configure time; re-run cmake after adding files.
+file(GLOB SOURCES "*.cc" "model/*.cc")
+add_library(astaroth_standalone STATIC ${SOURCES})
+target_include_directories(astaroth_standalone PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}")
+# CONFIG_PATH becomes a C string literal; passed as one quoted argument so paths with spaces survive.
+target_compile_definitions(astaroth_standalone PRIVATE "CONFIG_PATH=\"${ASTAROTH_CONF_PATH}\"")
--- a/src/standalone/autotest.cc
+++ b/src/standalone/autotest.cc
@@ -0,0 +1,732 @@
+/*
+   Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
+
+   This file is part of Astaroth.
+
+   Astaroth is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   Astaroth is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * @file
+ * \brief Brief info.
+ *
+ * Detailed info.
+ *
+ */
+#include "run.h"
+
+#include <stdio.h>
+
+#include "config_loader.h"
+#include "core/math_utils.h"
+#include "model/host_memory.h"
+#include "model/host_timestep.h"
+#include "model/model_boundconds.h"
+#include "model/model_reduce.h"
+#include "model/model_rk3.h"
+
+#include "core/errchk.h"
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) // Element count of a true array (gives a wrong result for a pointer)
+
+// Escape sequences for colored terminal output. RESET ("\x1B[0m") is printed after a colored string.
+#define RED "\x1B[31m"
+#define GRN "\x1B[32m"
+#define YEL "\x1B[33m"
+#define BLU "\x1B[34m"
+#define MAG "\x1B[35m"
+#define CYN "\x1B[36m"
+#define WHT "\x1B[37m"
+#define RESET "\x1B[0m"
+
+#define GEN_TEST_RESULT (1) // When 1, verify_meshes also writes the largest error it found to the test_result file
+
+typedef struct { // Integer 3-vector; run_autotest uses it for the mesh dimensions in test_dims
+	int x, y, z;
+} vec3i;
+
+typedef struct { // Real-valued 3-vector
+	AcReal x, y, z;
+} vec3r;
+
+
+typedef struct { // A model/candidate value pair together with the error measured between them
+	ModelScalar model; // Reference value
+	AcReal candidate; // Value under test
+	ModelScalar error; // Error of candidate w.r.t. model; whether it is absolute or relative is up to the code filling it in
+} ErrorInfo;
+
+#define QUICK_TEST (0) // Selects the first implementation below (#if TEST_TYPE == QUICK_TEST)
+#define THOROUGH_TEST (1) // Selects the second implementation below, which appends errors to *.testresult files
+#define TEST_TYPE QUICK_TEST // The variant that is compiled
+
+static const InitType test_cases[] = {INIT_TYPE_RANDOM, INIT_TYPE_XWAVE, INIT_TYPE_GAUSSIAN_RADIAL_EXPL, INIT_TYPE_ABC_FLOW}; // Initial conditions every test loops over
+// #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0]))
+
+#if TEST_TYPE == QUICK_TEST // REGULAR TEST START HERE --------------------------------------------------------------------------------------------------------------
+	static inline ModelScalar
+get_absolute_error(const ModelScalar& model, const AcReal& candidate)
+{
+	return fabsl(candidate - model);
+}
+
+/**
+ * Returns the largest absolute error that is still accepted for data spanning
+ * the given range: range * AC_REAL_EPSILON.
+ *
+ * This is an upper limit which assumes that both the minimum and the maximum
+ * value take part in a calculation (which inherently leads to cancellation).
+ * AFAIK an error above this limit means that something is definitely wrong
+ * with the code; below it, the error is indistinguishable from the inherent
+ * inaccuracies of floating-point arithmetic.
+ */
+static inline ModelScalar
+get_acceptable_absolute_error(const ModelScalar& range)
+{
+	const ModelScalar tolerance = range * AC_REAL_EPSILON;
+	return tolerance;
+}
+
+/**
+ * Returns the largest accepted relative error, expressed in machine epsilons
+ * (the same unit get_relative_error() reports).
+ */
+static inline ModelScalar
+get_acceptable_relative_error(void)
+{
+	const ModelScalar max_machine_epsilons = 30;
+	return max_machine_epsilons;
+}
+
+/**
+ * Returns the relative error of candidate with respect to model as a multiple
+ * of the machine epsilon AC_REAL_EPSILON.
+ *
+ * See Sect. "Relative Error and Ulps" in "What Every Computer Scientist Should
+ * Know About Floating-Point Arithmetic" by David Goldberg (1991).
+ *
+ * @param model     Reference value.
+ * @param candidate Value under test.
+ * @return |1 - candidate / model| / AC_REAL_EPSILON, or 0 when the two values
+ *         compare equal. A zero model with a non-zero candidate yields
+ *         infinity.
+ */
+static inline ModelScalar
+get_relative_error(const ModelScalar& model, const AcReal& candidate)
+{
+	// Exact agreement has no error. Handling it up front matters when both
+	// values are zero: 0 / 0 is NaN, and NaN fails every comparison, so on an
+	// all-zero mesh (where the absolute tolerance is also 0) verify() would
+	// reject a perfect match.
+	if (model == candidate)
+		return 0;
+
+	const ModelScalar error = fabsl(1.0l - candidate / model);
+	return error / AC_REAL_EPSILON;
+}
+
+/**
+ * Decides whether a candidate value agrees with the model value.
+ *
+ * The pair is rejected if either value is NaN or infinite. Otherwise it is
+ * accepted if the relative error OR the absolute error is below its
+ * acceptable limit.
+ *
+ * @param range Data range used for the absolute tolerance.
+ */
+static bool
+verify(const ModelScalar& model, const AcReal& cand, const ModelScalar& range)
+{
+	if (!(is_valid(model) && is_valid(cand)))
+		return false;
+
+	const bool relative_ok = get_relative_error(model, cand) < get_acceptable_relative_error();
+	if (relative_ok)
+		return true;
+
+	return get_absolute_error(model, cand) < get_acceptable_absolute_error(range);
+}
+
+/**
+ * Returns (largest maximum - smallest minimum) over vertex buffers 0, 1 and 2
+ * of the model mesh. Requires at least three vertex buffer handles.
+ */
+static ModelScalar
+get_reduction_range(const ModelMesh& mesh)
+{
+	const int num_buffers = 3;
+	ERRCHK(NUM_VTXBUF_HANDLES >= 3);
+
+	ModelScalar maxima[num_buffers];
+	for (int handle = 0; handle < num_buffers; ++handle)
+		maxima[handle] = model_reduce_scal(mesh, RTYPE_MAX, VertexBufferHandle(handle));
+	const ModelScalar max_scal = max(maxima[0], max(maxima[1], maxima[2]));
+
+	ModelScalar minima[num_buffers];
+	for (int handle = 0; handle < num_buffers; ++handle)
+		minima[handle] = model_reduce_scal(mesh, RTYPE_MIN, VertexBufferHandle(handle));
+	const ModelScalar min_scal = min(minima[0], min(minima[1], minima[2]));
+
+	return max_scal - min_scal;
+}
+
+/**
+ * Prints the model value, the candidate value, the data range, both error
+ * measures with their acceptable limits, and the verdict of verify().
+ */
+static void
+print_debug_info(const ModelScalar& model, const AcReal& candidate,
+		const ModelScalar& range)
+{
+	const ModelScalar abs_err     = get_absolute_error(model, candidate);
+	const ModelScalar abs_err_max = get_acceptable_absolute_error(range);
+	const ModelScalar rel_err     = get_relative_error(model, candidate);
+	const ModelScalar rel_err_max = get_acceptable_relative_error();
+	const bool acceptable         = verify(model, candidate, range);
+
+	printf("MeshPointInfo\n");
+	printf("\tModel: %e\n", double(model));
+	printf("\tCandidate: %e\n", double(candidate));
+	printf("\tRange: %e\n", double(range));
+
+	printf("\tAbsolute error: %Le (max acceptable: %Le)\n", abs_err, abs_err_max);
+	printf("\tRelative error: %Le (max acceptable: %Le)\n", rel_err, rel_err_max);
+	printf("\tIs acceptable: %d\n", acceptable);
+}
+
+/**
+ * Prints a one-line verdict for a model/candidate pair: the test name, a
+ * colored OK/FAIL tag from verify(), and the relative error (in machine
+ * epsilons) and absolute error.
+ *
+ * @param name Label printed in front of the verdict.
+ */
+static void
+print_result(const ModelScalar& model, const AcReal& candidate,
+		const ModelScalar& range, const char* name = "???")
+{
+	const ModelScalar rel_err = get_relative_error(model, candidate);
+	const ModelScalar abs_err = get_absolute_error(model, candidate);
+	const bool passed         = verify(model, candidate, range);
+
+	printf("\t%-12s... ", name);
+	if (passed)
+		printf(GRN "OK! " RESET);
+	else
+		printf(RED "FAIL! " RESET);
+
+	printf("(relative error: %.3Lg \u03B5, absolute error: %Lg)\n", rel_err, abs_err);
+}
+
+/**
+ * Compares the scalar (VTXBUF_UUX) and vector (VTXBUF_UUX/UUY/UUZ) reductions
+ * returned by acReduceScal/acReduceVec with those of the model
+ * implementation, for every entry of test_cases and every reduction type.
+ *
+ * @param config Mesh configuration used to create the meshes and passed to acInit.
+ * @return Number of reductions that verify() rejected.
+ */
+static int
+check_reductions(const AcMeshInfo& config)
+{
+	printf("Testing reductions\n");
+
+	// Host-side meshes
+	AcMesh* mesh         = acmesh_create(config);
+	ModelMesh* modelmesh = modelmesh_create(config);
+
+	// Library side
+	acInit(config);
+
+	int num_failures = 0;
+	for (unsigned int case_idx = 0; case_idx < ARRAY_SIZE(test_cases); ++case_idx) {
+		const InitType itype = test_cases[case_idx];
+		printf("Checking %s...\n", init_type_names[itype]);
+
+		// Initialize the mesh and derive the error tolerance from the data
+		acmesh_init_to(itype, mesh);
+		acmesh_to_modelmesh(*mesh, modelmesh);
+		const ModelScalar range = get_reduction_range(*modelmesh);
+
+		acLoad(*mesh);
+
+		for (int rtype = 0; rtype < NUM_REDUCTION_TYPES; ++rtype) {
+			const ReductionType reduction = ReductionType(rtype);
+
+			// Scalar reduction
+			const ModelScalar scal_model = model_reduce_scal(*modelmesh, reduction, VTXBUF_UUX);
+			const AcReal scal_candidate  = acReduceScal(reduction, VTXBUF_UUX);
+			print_result(scal_model, scal_candidate, range, "UUX scal");
+
+			if (!verify(scal_model, scal_candidate, range)) {
+				++num_failures;
+				printf("Scalar reduction type %d FAIL\n", rtype);
+				print_debug_info(scal_model, scal_candidate, range);
+			}
+
+			// Vector reduction
+			const ModelScalar vec_model = model_reduce_vec(*modelmesh, reduction, VTXBUF_UUX,
+					VTXBUF_UUY, VTXBUF_UUZ);
+			const AcReal vec_candidate  = acReduceVec(reduction, VTXBUF_UUX, VTXBUF_UUY,
+					VTXBUF_UUZ);
+			print_result(vec_model, vec_candidate, range, "UUXYZ vec");
+
+			if (!verify(vec_model, vec_candidate, range)) {
+				++num_failures;
+				printf("Vector reduction type %d FAIL\n", rtype);
+				print_debug_info(vec_model, vec_candidate, range);
+			}
+		}
+
+		printf("Acceptable relative error: < %Lg \u03B5, absolute error < %Lg\n", get_acceptable_relative_error(), get_acceptable_absolute_error(range));
+	}
+
+	acQuit();
+	modelmesh_destroy(modelmesh);
+	acmesh_destroy(mesh);
+
+	return num_failures;
+}
+
+/**
+ * Returns |max - min|, where max and min are taken over the values of all
+ * vertex buffers of the model mesh.
+ *
+ * Note! Potentially dangerous if the buffers do not all interact with each
+ * other: the range (and with it the accepted absolute error) may then be too
+ * large.
+ */
+static ModelScalar
+get_data_range(const ModelMesh& model)
+{
+	ModelScalar highest = -INFINITY;
+	ModelScalar lowest  = INFINITY;
+
+	for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w) {
+		const VertexBufferHandle handle = VertexBufferHandle(w);
+		const ModelScalar buffer_max    = model_reduce_scal(model, RTYPE_MAX, handle);
+		const ModelScalar buffer_min    = model_reduce_scal(model, RTYPE_MIN, handle);
+
+		highest = max(buffer_max, highest);
+		lowest  = min(buffer_min, lowest);
+	}
+
+	return fabsl(highest - lowest);
+}
+
+// #define GEN_TEST_RESULT
+#if GEN_TEST_RESULT == 1
+static FILE* test_result = NULL; // Result file: opened and closed by run_autotest, written to by verify_meshes
+#endif
+
+/**
+ * Compares every value of every vertex buffer of the candidate mesh with the
+ * model mesh.
+ *
+ * Each value rejected by verify() is reported with its (i, j, k) index and
+ * debug info. For each vertex buffer, the pair with the largest absolute error
+ * is summarized with print_result(). If GEN_TEST_RESULT is 1, the largest
+ * absolute error over all buffers and its relative error are written to
+ * test_result, which must already be open.
+ *
+ * @return true if verify() accepted every value, false otherwise.
+ */
+static bool
+verify_meshes(const ModelMesh& model, const AcMesh& candidate)
+{
+#if GEN_TEST_RESULT == 1
+	ErrorInfo worst_overall = ErrorInfo();
+#endif
+
+	const ModelScalar range = get_data_range(model);
+	const int mx            = model.info.int_params[AC_mx];
+	const int my            = model.info.int_params[AC_my];
+
+	bool all_accepted = true;
+	for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w) {
+		const VertexBufferHandle handle = VertexBufferHandle(w);
+		const size_t num_vertices       = AC_VTXBUF_SIZE(model.info);
+
+		// Largest errors found in this buffer
+		ErrorInfo worst_abs = ErrorInfo();
+		ErrorInfo worst_rel = ErrorInfo();
+
+		for (size_t idx = 0; idx < num_vertices; ++idx) {
+			const ModelScalar model_val = model.vertex_buffer[handle][idx];
+			const AcReal cand_val       = candidate.vertex_buffer[handle][idx];
+
+			if (!verify(model_val, cand_val, range)) {
+				// Unflatten idx = i0 + j0 * mx + k0 * mx * my
+				const int i0 = idx % mx;
+				const int j0 = ((idx % (mx * my)) / mx);
+				const int k0 = idx / (mx * my);
+				printf("Index (%d, %d, %d)\n", i0, j0, k0);
+				print_debug_info(model_val, cand_val, range);
+				all_accepted = false;
+			}
+
+			const ModelScalar abs_error = get_absolute_error(model_val, cand_val);
+			if (abs_error > worst_abs.error) {
+				worst_abs.model     = model_val;
+				worst_abs.candidate = cand_val;
+				worst_abs.error     = abs_error;
+			}
+
+			const ModelScalar rel_error = get_relative_error(model_val, cand_val);
+			if (rel_error > worst_rel.error) {
+				worst_rel.model     = model_val;
+				worst_rel.candidate = cand_val;
+				worst_rel.error     = rel_error;
+			}
+
+#if GEN_TEST_RESULT == 1
+			if (abs_error > worst_overall.error) {
+				worst_overall.model     = model_val;
+				worst_overall.candidate = cand_val;
+				worst_overall.error     = abs_error;
+			}
+#endif
+		}
+		print_result(worst_abs.model, worst_abs.candidate, range, vtxbuf_names[handle]);
+	}
+
+#if GEN_TEST_RESULT == 1
+	const ModelScalar rel_err = get_relative_error(worst_overall.model, worst_overall.candidate);
+	const ModelScalar abs_err = get_absolute_error(worst_overall.model, worst_overall.candidate);
+	fprintf(test_result, "%.3Lg & %.3Lg\n", abs_err, rel_err);
+#endif
+
+	printf("Acceptable relative error: < %Lg \u03B5, absolute error < %Lg\n", get_acceptable_relative_error(), get_acceptable_absolute_error(range));
+
+	return all_accepted;
+}
+
+/**
+ * Runs the same integration steps through acIntegrate() and through the model
+ * implementation (model_rk3) for every entry of test_cases, then compares the
+ * resulting meshes with verify_meshes().
+ *
+ * @param mesh_info Mesh configuration used to create the meshes and passed to acInit.
+ * @return Number of test cases whose meshes did not agree.
+ */
+int
+check_rk3(const AcMeshInfo& mesh_info)
+{
+	const int num_iterations = 1; // Note: should work up to at least 15 steps
+	printf("Testing RK3 (running %d steps before checking the result)\n",
+			num_iterations);
+
+	// Host-side meshes
+	AcMesh* gpu_mesh      = acmesh_create(mesh_info);
+	ModelMesh* model_mesh = modelmesh_create(mesh_info);
+
+	// Library side
+	acInit(mesh_info);
+
+	int num_failures = 0;
+	for (unsigned int case_idx = 0; case_idx < ARRAY_SIZE(test_cases); ++case_idx) {
+		const InitType itype = test_cases[case_idx];
+		printf("Checking %s...\n", init_type_names[itype]);
+
+		// Give both implementations the same initial state
+		acmesh_init_to(itype, gpu_mesh);
+		acLoad(*gpu_mesh);
+		acmesh_to_modelmesh(*gpu_mesh, model_mesh);
+
+		acBoundcondStep();
+		boundconds(model_mesh->info, model_mesh);
+
+		for (int step = 0; step < num_iterations; ++step) {
+			// Use a small constant timestep to avoid instabilities
+			const AcReal dt = AcReal(1e-2);
+
+			acIntegrate(dt);
+			acBoundcondStep();
+			acSynchronize();
+
+			model_rk3(dt, model_mesh);
+			boundconds(model_mesh->info, model_mesh);
+		}
+		acStore(gpu_mesh);
+
+		if (!verify_meshes(*model_mesh, *gpu_mesh))
+			++num_failures;
+	}
+
+	acQuit();
+	acmesh_destroy(gpu_mesh);
+	modelmesh_destroy(model_mesh);
+
+	return num_failures;
+}
+
+/**
+ * Entry point of the quick autotest: runs check_rk3() on a set of mesh
+ * dimensions and reports the number of failures.
+ *
+ * If GEN_TEST_RESULT is 1, a "<double|float>_fullstep_testresult.out" file is
+ * created in the working directory and filled in by verify_meshes().
+ *
+ * @return EXIT_SUCCESS if no failures were found, EXIT_FAILURE otherwise.
+ */
+int
+run_autotest(void)
+{
+#if GEN_TEST_RESULT == 1
+	char testresult_path[256];
+	sprintf(testresult_path, "%s_fullstep_testresult.out", AC_DOUBLE_PRECISION ? "double" : "float");
+
+	test_result = fopen(testresult_path, "w");
+	ERRCHK(test_result);
+
+	fprintf(test_result, "n, max abs error, corresponding rel error\n");
+#endif
+
+	/* Parse configs */
+	AcMeshInfo config;
+	load_config(&config);
+
+	// The message ends with a newline so that it does not run into the
+	// "Testing mesh" line printed right after it.
+	if (STENCIL_ORDER > 6)
+		printf("WARNING!!! If the stencil order is larger than the computational domain some vertices may be done twice (f.ex. doing inner and outer domains separately and some of the front/back/left/right/etc slabs collide). The mesh must be large enough s.t. this doesn't happen.\n");
+
+	const vec3i test_dims[] = {{512, 16, 32},  //
+		{64, 64, 32},  //
+		{32, 32, 64}, //
+		{64, 32, 32}, //
+		{128, 64, 32}};
+
+	int num_failures = 0;
+
+	// TODO: also run check_reductions(config) for every entry of test_dims
+	// (currently disabled).
+
+	for (size_t i = 0; i < ARRAY_SIZE(test_dims); ++i) {
+		config.int_params[AC_nx] = test_dims[i].x;
+		config.int_params[AC_ny] = test_dims[i].y;
+		config.int_params[AC_nz] = test_dims[i].z;
+		update_config(&config);
+
+		printf("Testing mesh (%d, %d, %d):\n", //
+				test_dims[i].x, test_dims[i].y, test_dims[i].z);
+
+		num_failures += check_rk3(config);
+		fflush(stdout);
+	}
+
+	printf("\n--------Testing done---------\n");
+	printf("Failures found: %d\n", num_failures);
+
+#if GEN_TEST_RESULT == 1
+	fflush(test_result);
+	fclose(test_result);
+#endif
+
+	return num_failures > 0 ? EXIT_FAILURE : EXIT_SUCCESS;
+}
+
+#elif TEST_TYPE == THOROUGH_TEST // GEN TEST FILE START HERE --------------------------------------------------------------------------------------------------------------
+typedef struct { // Error of one candidate value with respect to its model value, as filled in by get_error()
+	ModelScalar model; // Reference value
+	AcReal candidate; // Value under test
+	ModelScalar abs_error; // |model - candidate|
+	ModelScalar ulp_error; // abs_error divided by the ulp computed from the model value
+	ModelScalar rel_error; // |1 - candidate / model| divided by the machine epsilon
+	ModelScalar maximum_magnitude; // Largest |value| of the model vertex buffer (-1 where it is not calculated)
+	ModelScalar minimum_magnitude; // Smallest |value| of the model vertex buffer (-1 where it is not calculated)
+} Error;
+
+/**
+ * Measures the error of a candidate value with respect to a model value.
+ *
+ * @param model     Reference value.
+ * @param candidate Value under test.
+ * @return Error with:
+ *         - all three errors 0 if the values are exactly equal,
+ *         - all three errors INFINITY if either value is NaN or infinite,
+ *         - otherwise the absolute error, the error in ulps of the model
+ *           value, and the relative error in machine epsilons.
+ *         maximum_magnitude and minimum_magnitude are set to -1 ("not
+ *         calculated"), since a single value pair has no field to take them
+ *         from.
+ */
+Error get_error(ModelScalar model, AcReal candidate)
+{
+	Error error;
+	error.model     = model;
+	error.candidate = candidate;
+
+	// Callers print every field of the result (see print_error_to_file), so
+	// none of them may be left uninitialized.
+	error.maximum_magnitude = -1; // Not calculated.
+	error.minimum_magnitude = -1; // Not calculated.
+
+	if (error.model == error.candidate || fabsl(model - candidate) == 0) { // If exact
+		error.abs_error = 0;
+		error.rel_error = 0;
+		error.ulp_error = 0;
+	} else if (!is_valid(error.model) || !is_valid(error.candidate)) {
+		error.abs_error = INFINITY;
+		error.rel_error = INFINITY;
+		error.ulp_error = INFINITY;
+	} else {
+		const int base = 2;
+		const int p    = sizeof(AcReal) == 4 ? 24 : 53; // Bits in the significand
+
+		// Binary exponent of the model value
+		const ModelScalar e = floorl(logl(fabsl(error.model)) / logl(2));
+
+		const ModelScalar ulp             = powl(base, e - (p-1));
+		const ModelScalar machine_epsilon = 0.5 * powl(base, -(p-1));
+		error.abs_error = fabsl(model - candidate);
+		error.ulp_error = error.abs_error / ulp;
+		error.rel_error = fabsl(1.0l - candidate / model) / machine_epsilon;
+	}
+
+	return error;
+}
+
+/**
+ * Returns the get_error() result with the largest absolute error found over
+ * all vertex buffers of the two meshes. The magnitude fields of the result are
+ * set to -1 (not calculated).
+ */
+Error get_max_abs_error_mesh(const ModelMesh& model_mesh, const AcMesh& candidate_mesh)
+{
+	Error worst;
+	worst.abs_error = -1; // Below any real error, so the first comparison always wins
+
+	for (size_t handle = 0; handle < NUM_VTXBUF_HANDLES; ++handle) {
+		for (size_t idx = 0; idx < AC_VTXBUF_SIZE(model_mesh.info); ++idx) {
+			const Error current = get_error(model_mesh.vertex_buffer[handle][idx],
+					candidate_mesh.vertex_buffer[handle][idx]);
+			if (current.abs_error > worst.abs_error)
+				worst = current;
+		}
+	}
+
+	worst.maximum_magnitude = -1; // Not calculated.
+	worst.minimum_magnitude = -1; // Not calculated.
+
+	return worst;
+}
+
+/**
+ * Returns the largest absolute value among the first AC_VTXBUF_SIZE(info)
+ * elements of field (-INFINITY if there are none).
+ */
+static ModelScalar
+get_maximum_magnitude(const ModelScalar* field, const AcMeshInfo info)
+{
+	ModelScalar maximum = -INFINITY;
+
+	for (size_t idx = 0; idx < AC_VTXBUF_SIZE(info); ++idx) {
+		const ModelScalar magnitude = fabsl(field[idx]);
+		if (!(maximum > magnitude))
+			maximum = magnitude;
+	}
+
+	return maximum;
+}
+
+
+/**
+ * Returns the smallest absolute value among the first AC_VTXBUF_SIZE(info)
+ * elements of field (INFINITY if there are none).
+ */
+static ModelScalar
+get_minimum_magnitude(const ModelScalar* field, const AcMeshInfo info)
+{
+	ModelScalar minimum = INFINITY;
+
+	for (size_t idx = 0; idx < AC_VTXBUF_SIZE(info); ++idx) {
+		const ModelScalar magnitude = fabsl(field[idx]);
+		if (!(minimum < magnitude))
+			minimum = magnitude;
+	}
+
+	return minimum;
+}
+
+/**
+ * Returns the get_error() result with the largest absolute error found in one
+ * vertex buffer, together with the largest and smallest absolute value of the
+ * model data in that buffer.
+ *
+ * @param vtxbuf_handle Vertex buffer to compare.
+ */
+Error get_max_abs_error_vtxbuf(const VertexBufferHandle vtxbuf_handle, const ModelMesh& model_mesh, const AcMesh& candidate_mesh)
+{
+	ModelScalar* model_vtxbuf = model_mesh.vertex_buffer[vtxbuf_handle];
+	AcReal* candidate_vtxbuf  = candidate_mesh.vertex_buffer[vtxbuf_handle];
+
+	Error worst;
+	worst.abs_error = -1; // Below any real error, so the first comparison always wins
+
+	for (size_t idx = 0; idx < AC_VTXBUF_SIZE(model_mesh.info); ++idx) {
+		const Error current = get_error(model_vtxbuf[idx], candidate_vtxbuf[idx]);
+		if (current.abs_error > worst.abs_error)
+			worst = current;
+	}
+
+	worst.maximum_magnitude = get_maximum_magnitude(model_vtxbuf, model_mesh.info);
+	worst.minimum_magnitude = get_minimum_magnitude(model_vtxbuf, model_mesh.info);
+
+	return worst;
+}
+
+/**
+ * Appends one line "n, ulp error, abs error, rel error, maximum magnitude,
+ * minimum magnitude" to the file at path.
+ *
+ * @param path  File to append to; created if it does not exist.
+ * @param n     Mesh dimension the error belongs to.
+ * @param error Error to record.
+ *
+ * The file must be openable for appending; this is checked with ERRCHK, as
+ * for the other files opened by this program.
+ */
+void
+print_error_to_file(const char* path, const int n, const Error error)
+{
+    FILE* file = fopen(path, "a");
+    ERRCHK(file != NULL); // fprintf/fclose on a NULL stream is undefined behavior
+
+    fprintf(file, "%d, %Lg, %Lg, %Lg, %Lg, %Lg\n", n, error.ulp_error, error.abs_error, error.rel_error, error.maximum_magnitude, error.minimum_magnitude);
+    fclose(file);
+}
+
+#define MAX_PATH_LEN (256)
+
+/**
+ * Entry point of the thorough autotest.
+ *
+ * For cubic meshes of side N_MIN, 2 * N_MIN, ..., N_MAX and for every entry of
+ * test_cases, the boundary conditions, the scalar and vector max reductions
+ * and one integration step are computed both with the ac* functions and with
+ * the model implementation. The error of each comparison is appended to a
+ * "<double|float>_<check>_<init type>[_<vertex buffer>].testresult" file in the
+ * working directory.
+ *
+ * @return Always 0.
+ */
+int run_autotest(void)
+{
+#define N_MIN (32)
+#define N_MAX (512)
+	const char* precision = AC_DOUBLE_PRECISION ? "double" : "float";
+
+	for (int n = N_MIN; n <= N_MAX; n += N_MIN) {
+		AcMeshInfo config;
+		load_config(&config);
+		config.int_params[AC_nz] = n;
+		config.int_params[AC_ny] = n;
+		config.int_params[AC_nx] = n;
+		update_config(&config);
+
+		// Init host
+		AcMesh* candidate_mesh = acmesh_create(config);
+		ModelMesh* model_mesh  = modelmesh_create(config);
+
+		// Init device
+		acInit(config);
+
+		// Check all initial conditions
+		for (int case_idx = 0; case_idx < ARRAY_SIZE(test_cases); ++case_idx) {
+			const InitType init_type = test_cases[case_idx];
+			char path[MAX_PATH_LEN];
+
+			acmesh_init_to(init_type, candidate_mesh);
+			acmesh_to_modelmesh(*candidate_mesh, model_mesh); // Load to Host
+			acLoad(*candidate_mesh);                           // Load to Device
+
+			boundconds(model_mesh->info, model_mesh);
+			acBoundcondStep();
+
+			// Check boundconds
+			acStore(candidate_mesh);
+			const Error boundcond_error = get_max_abs_error_mesh(*model_mesh, *candidate_mesh);
+			sprintf(path, "%s_boundcond_%s.testresult", precision, init_type_names[init_type]);
+			print_error_to_file(path, n, boundcond_error);
+
+			// Check scalar max reduction
+			const ModelScalar scal_model      = model_reduce_scal(*model_mesh, RTYPE_MAX, VTXBUF_UUX);
+			const AcReal scal_candidate       = acReduceScal(RTYPE_MAX, VTXBUF_UUX);
+			const Error scalar_reduce_error   = get_error(scal_model, scal_candidate);
+			sprintf(path, "%s_scalar_reduce_%s.testresult", precision, init_type_names[init_type]);
+			print_error_to_file(path, n, scalar_reduce_error);
+
+			// Check vector max reduction
+			const ModelScalar vec_model     = model_reduce_vec(*model_mesh, RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
+			const AcReal vec_candidate      = acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
+			const Error vector_reduce_error = get_error(vec_model, vec_candidate);
+			sprintf(path, "%s_vector_reduce_%s.testresult", precision, init_type_names[init_type]);
+			print_error_to_file(path, n, vector_reduce_error);
+
+			// Time advance
+			const AcReal umax = (AcReal)model_reduce_vec(*model_mesh, RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
+			const AcReal dt   = host_timestep(umax, config);
+
+			// Host integration step
+			model_rk3(dt, model_mesh);
+			boundconds(config, model_mesh);
+
+			// Device integration step
+			acIntegrate(dt);
+			acBoundcondStep();
+			acSynchronize();
+			acStore(candidate_mesh);
+
+			// Check fields
+			for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w) {
+				const VertexBufferHandle handle = (VertexBufferHandle)w;
+				const Error field_error         = get_max_abs_error_vtxbuf(handle, *model_mesh, *candidate_mesh);
+
+				printf("model %Lg, cand %Lg, abs %Lg, rel %Lg\n", (ModelScalar)field_error.model, (ModelScalar)field_error.candidate, (ModelScalar)field_error.abs_error, (ModelScalar)field_error.rel_error);
+
+				sprintf(path, "%s_integrationstep_%s_%s.testresult", precision, init_type_names[init_type], vtxbuf_names[handle]);
+				print_error_to_file(path, n, field_error);
+			}
+		}
+
+		// Deallocate host
+		acmesh_destroy(candidate_mesh);
+		modelmesh_destroy(model_mesh);
+
+		// Deallocate device
+		acQuit();
+	}
+
+	return 0;
+}
+#endif
--- a/src/standalone/benchmark.cc
+++ b/src/standalone/benchmark.cc
@@ -0,0 +1,300 @@
+/*
+    Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
+ * \brief Brief info.
+ *
+ * Detailed info.
+ *
+ */
+#include "run.h"
+
+#include <stdlib.h> // EXIT_SUCCESS
+
+#include "config_loader.h"
+#include "model/host_memory.h"
+#include "model/host_timestep.h"
+#include "model/model_reduce.h"
+#include "model/model_rk3.h"
+#include "timer_hires.h"
+
+#include <vector>
+#include <algorithm>
+#include <math.h>
+#include "src/core/errchk.h"
+
+/** Strict "less than" ordering for doubles; used as the comparator of std::sort. */
+static bool
+smaller_than(const double& a, const double& b)
+{
+    return b > a;
+}
+
+/**
+ * Appends the line "n, min, max, median, perc" to the file at path.
+ *
+ * @return EXIT_SUCCESS if the line was written, EXIT_FAILURE if the file could
+ *         not be opened.
+ */
+static int
+write_runningtimes(const char* path, const int n, const double min, const double max, const double median, const double perc)
+{
+    FILE* fp = fopen(path, "a");
+    if (fp == NULL)
+        return EXIT_FAILURE;
+
+    fprintf(fp, "%d, %f, %f, %f, %f\n", n, min, max, median, perc);
+    fclose(fp);
+    return EXIT_SUCCESS;
+}
+
+/**
+ * Writes the 0th, 1st, ..., 99th percentile of results to the file at path,
+ * one value per line. Any existing file is overwritten.
+ *
+ * @param path      Output file.
+ * @param num_iters Number of samples to take the percentiles from.
+ * @param results   Samples, sorted in ascending order by the caller.
+ * @return EXIT_SUCCESS if the file was written, EXIT_FAILURE if results is
+ *         empty or the file could not be opened.
+ */
+static int
+write_percentiles(const char* path, const int num_iters, const std::vector<double>& results)
+{
+    if (results.empty())
+        return EXIT_FAILURE; // Nothing to take percentiles from
+
+    FILE* fp = fopen(path, "w");
+    if (fp == NULL)
+        return EXIT_FAILURE;
+
+    const size_t last = results.size() - 1;
+    for (int i = 0; i < 100; ++i) {
+        // Integer arithmetic on purpose: the floating-point form
+        // (i / 100.) * num_iters truncates to the wrong sample for some i
+        // (e.g. 29 / 100. * 100 == 28.999999999999996 -> 28).
+        size_t idx = num_iters > 0 ? (size_t(i) * size_t(num_iters)) / 100 : 0;
+        if (idx > last)
+            idx = last; // Never read past the end if num_iters exceeds the sample count
+
+        fprintf(fp, "%f\n", results[idx]);
+    }
+    fclose(fp);
+    return EXIT_SUCCESS;
+}
+
+/**
+ * Measures the running time of one integration call (acIntegrateStep if
+ * GEN_BENCHMARK_RK3 is 1, acIntegrate otherwise) for cubic meshes of side
+ * N_STEP_SIZE, 2 * N_STEP_SIZE, ..., MAX_MESH_DIM.
+ *
+ * Each mesh size is timed NUM_ITERS times after 10 warmup calls. The minimum,
+ * maximum, median and NTH_PERCENTILE running times in milliseconds are
+ * appended to "<precision>_<mode>_runningtimes.out", and all percentiles are
+ * written to "<n>_<precision>_<mode>_percentiles.out".
+ *
+ * @return 0 on completion, EXIT_FAILURE if the running time file cannot be
+ *         created.
+ */
+int
+run_benchmark(void)
+{
+    #define N_STEP_SIZE (128)
+    #define MAX_MESH_DIM (128)
+    #define NUM_ITERS (100)
+    #define NTH_PERCENTILE (0.95)
+    const char* precision = AC_DOUBLE_PRECISION ? "double" : "float";
+    const char* mode      = GEN_BENCHMARK_RK3 ? "rk3substep" : "fullstep";
+
+    char runningtime_path[256];
+    sprintf(runningtime_path, "%s_%s_runningtimes.out", precision, mode);
+
+    // Start a fresh result file with a header line
+    FILE* fp = fopen(runningtime_path, "w");
+    if (fp == NULL)
+        return EXIT_FAILURE;
+    fprintf(fp, "n, min, max, median, perc\n");
+    fclose(fp);
+
+    for (int n = N_STEP_SIZE; n <= MAX_MESH_DIM; n += N_STEP_SIZE) {
+        /* Parse configs */
+        AcMeshInfo mesh_info;
+        load_config(&mesh_info);
+        mesh_info.int_params[AC_nx] = n;
+        mesh_info.int_params[AC_ny] = n;
+        mesh_info.int_params[AC_nz] = n;
+        update_config(&mesh_info);
+
+        AcMesh* mesh = acmesh_create(mesh_info);
+        acmesh_init_to(INIT_TYPE_ABC_FLOW, mesh);
+
+        acInit(mesh_info);
+        acLoad(*mesh);
+
+        // Warmup
+        for (int warmup = 0; warmup < 10; ++warmup) {
+            acIntegrate(0);
+            acSynchronize();
+        }
+
+        std::vector<double> results;
+        results.reserve(NUM_ITERS);
+
+        Timer t;
+        for (int iter = 0; iter < NUM_ITERS; ++iter) {
+            timer_reset(&t);
+            #if GEN_BENCHMARK_RK3 == 1
+            acIntegrateStep(2, FLT_EPSILON);
+            #else // GEN_BENCHMARK_FULL
+            const AcReal dt = AcReal(1e-2); // TODO adaptive timestep //host_timestep(umax, mesh_info);
+            acIntegrate(dt);
+            #endif
+            acSynchronize();
+
+            results.push_back(timer_diff_nsec(t) / 1e6); // ns -> ms
+        }
+
+        std::sort(results.begin(), results.end(), smaller_than);
+
+        const double fastest    = results[0];
+        const double slowest    = results[results.size()-1];
+        const double median     = results[int(0.5 * NUM_ITERS)];
+        const double percentile = results[int(NTH_PERCENTILE * NUM_ITERS)];
+        write_runningtimes(runningtime_path, n, fastest, slowest, median, percentile);
+
+        char percentile_path[256];
+        sprintf(percentile_path, "%d_%s_%s_percentiles.out", n, precision, mode);
+        write_percentiles(percentile_path, NUM_ITERS, results);
+
+        printf("%s running time %g ms, (%dth percentile, nx = %d) \n", GEN_BENCHMARK_RK3 ? "RK3 step" : "Fullstep", double(percentile), int(NTH_PERCENTILE * 100), mesh_info.int_params[AC_nx]);
+
+        acStore(mesh);
+        acQuit();
+        acmesh_destroy(mesh);
+    }
+
+    return 0;
+}
+
+/*
+
+#if AUTO_OPTIMIZE
+const char* benchmark_path = "benchmark.out";
+
+#include "core/kernels/rk3_threadblock.conf"
+static int
+write_result_to_file(const float& ms_per_step)
+{
+    FILE* fp;
+    fp = fopen(benchmark_path, "a");
+
+    if (fp != NULL) {
+        fprintf(fp,
+                "(%d, %d, %d), %d elems per thread, launch bound %d, %f ms\n",
+                RK_THREADS_X, RK_THREADS_Y, RK_THREADS_Z, RK_ELEMS_PER_THREAD,
+                RK_LAUNCH_BOUND_MIN_BLOCKS, double(ms_per_step));
+        fclose(fp);
+        return EXIT_SUCCESS;
+    }
+    return EXIT_FAILURE;
+}
+#endif
+
+#if GENERATE_BENCHMARK_DATA != 1
+int
+run_benchmark(void)
+{
+    // Parse configs
+    AcMeshInfo mesh_info;
+    load_config(&mesh_info);
+    mesh_info.int_params[AC_nx] = 128;
+    mesh_info.int_params[AC_ny] = mesh_info.int_params[AC_nx];
+    mesh_info.int_params[AC_nz] = mesh_info.int_params[AC_nx];
+    update_config(&mesh_info);
+
+    AcMesh* mesh = acmesh_create(mesh_info);
+    acmesh_init_to(INIT_TYPE_ABC_FLOW, mesh);
+
+    acInit(mesh_info);
+    acLoad(*mesh);
+
+    Timer t;
+    timer_reset(&t);
+
+    int steps           = 0;
+    const int num_steps = 100;
+    while (steps < num_steps) {
+        // Advance the simulation
+        const AcReal umax = acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY,
+                                        VTXBUF_UUZ);
+        const AcReal dt   = host_timestep(umax, mesh_info);
+        acIntegrate(dt);
+        ++steps;
+    }
+    acSynchronize();
+    const float wallclock = timer_diff_nsec(t) / 1e9f;
+    printf("%d steps. Wallclock time %f s per step\n", steps,
+           double(wallclock) / num_steps);
+    #if AUTO_OPTIMIZE
+    write_result_to_file(wallclock * 1e3f / steps);
+    #endif
+
+    acStore(mesh);
+    acQuit();
+    acmesh_destroy(mesh);
+
+    return 0;
+}
+
+#else //////////////////////////////////////////////////////////////////////////GENERATE_BENCHMARK_DATA
+
+
+
+
+int
+run_benchmark(void)
+{
+    const char path[] = "result.out";
+    FILE* fp;
+    fp = fopen(path, "w");
+
+    if (fp != NULL) {
+        fprintf(fp, "n, min, max, median, perc\n");
+        fclose(fp);
+    } else {
+        return EXIT_FAILURE;
+    }
+
+    #define N_STEP_SIZE (256)
+    #define MAX_MESH_DIM (256)
+    #define NUM_ITERS (1000)
+    for (int n = N_STEP_SIZE; n <= MAX_MESH_DIM; n += N_STEP_SIZE) {
+        // Parse configs
+        AcMeshInfo mesh_info;
+        load_config(&mesh_info);
+        mesh_info.int_params[AC_nx] = n;
+        mesh_info.int_params[AC_ny] = mesh_info.int_params[AC_nx];
+        mesh_info.int_params[AC_nz] = mesh_info.int_params[AC_nx];
+        update_config(&mesh_info);
+
+        AcMesh* mesh = acmesh_create(mesh_info);
+        acmesh_init_to(INIT_TYPE_ABC_FLOW, mesh);
+
+        acInit(mesh_info);
+        acLoad(*mesh);
+
+        std::vector<double> results;
+        results.reserve(NUM_ITERS);
+
+
+        // Warmup
+        for (int i = 0; i < 10; ++i) {
+            acIntegrate(0);
+            acSynchronize();
+        }
+
+        Timer t;
+
+        const AcReal dt = AcReal(1e-5);
+        for (int i = 0; i < NUM_ITERS; ++i) {
+
+            timer_reset(&t);
+            //acIntegrate(dt);
+            acIntegrateStep(2, dt);
+            acSynchronize();
+
+            const double ms_elapsed = timer_diff_nsec(t) / 1e6;
+            results.push_back(ms_elapsed);
+        }
+
+
+
+        #define NTH_PERCENTILE (0.95)
+        std::sort(results.begin(), results.end(), smaller_than);
+        write_result(n, results[0], results[results.size()-1], results[int(0.5 * NUM_ITERS)], results[int(NTH_PERCENTILE * NUM_ITERS)]);
+        write_percentiles(n, NUM_ITERS, results);
+    }
+
+    return 0;
+}
+#endif
+*/
--- a/src/standalone/config_loader.cc
+++ b/src/standalone/config_loader.cc
@@ -0,0 +1,194 @@
+/*
+    Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
+ * \brief Loading of astaroth.conf into an AcMeshInfo struct.
+ *
+ * Parses the configuration file and computes the derived mesh parameters.
+ *
+ */
+#include "config_loader.h"
+
+#include <limits.h> // UINT_MAX
+#include <stdint.h> // uint8_t, uint32_t
+#include <stdio.h>  // print
+#include <string.h> // memset
+
+#include "core/errchk.h"
+#include "core/math_utils.h"
+
+/** Prints every integer and real parameter of config to stdout, one
+ *  "[name]: value" line per parameter. */
+static inline void
+print(const AcMeshInfo& config)
+{
+    // Integer parameters first, in index order
+    int param = 0;
+    while (param < NUM_INT_PARAM_TYPES) {
+        printf("[%s]: %d\n", intparam_names[param], config.int_params[param]);
+        ++param;
+    }
+
+    // Then the real parameters, widened to double for printf's %g
+    for (param = 0; param < NUM_REAL_PARAM_TYPES; ++param) {
+        const double value = double(config.real_params[param]);
+        printf("[%s]: %g\n", realparam_names[param], value);
+    }
+}
+
+/**
+ \brief Find the index of the keyword in names
+ \param keyword Null-terminated string to search for.
+ \param names   Array of n null-terminated candidate strings.
+ \param n       Number of entries in names.
+ \return Index in range 0...n-1 of the first exact match. -1 if the keyword
+ was not found.
+ */
+static int
+find_str(const char keyword[], const char* names[], const int& n)
+{
+    int match = -1;
+    // Stop scanning as soon as the first match has been recorded
+    for (int pos = 0; pos < n && match < 0; ++pos) {
+        if (strcmp(keyword, names[pos]) == 0)
+            match = pos;
+    }
+    return match;
+}
+
+/**
+ \brief Reads "keyword = value" pairs from the file at path into config.
+
+ Each keyword is looked up first in intparam_names and then in
+ realparam_names; the matching entry of config is overwritten with the value
+ parsed by atoi() or atof(). Unknown keywords and input that does not match
+ the "keyword = value" pattern are skipped. The result of fopen() is checked
+ with ERRCHK.
+
+ \param path   Path of the configuration file.
+ \param config Struct whose int_params/real_params receive the parsed values.
+ */
+static void
+parse_config(const char* path, AcMeshInfo* config)
+{
+    FILE* fp;
+    fp = fopen(path, "r");
+    // For knowing which .conf file will be used
+    printf("Config file path: \n %s \n ", path);
+    ERRCHK(fp != NULL);
+
+    // NOTE: the field widths in the fscanf format below must stay at
+    // BUF_SIZE - 1. Without them an overlong token in the config file would
+    // be written past the end of keyword/value.
+    const size_t BUF_SIZE = 128;
+    char keyword[BUF_SIZE];
+    char value[BUF_SIZE];
+    int items_matched;
+    while ((items_matched = fscanf(fp, "%127s = %127s", keyword, value)) !=
+           EOF) {
+
+        // Incomplete pair: drop it and resynchronize on the next token
+        if (items_matched < 2)
+            continue;
+
+        int idx = -1;
+        if ((idx = find_str(keyword, intparam_names, NUM_INT_PARAM_TYPES)) >= 0)
+            config->int_params[idx] = atoi(value);
+        else if ((idx = find_str(keyword, realparam_names,
+                                 NUM_REAL_PARAM_TYPES)) >= 0)
+            config->real_params[idx] = AcReal(atof(value));
+    }
+
+    fclose(fp);
+}
+
+/**
+\brief Recomputes the derived entries of config from its primary parameters.
+
+Derived from nx, ny, nz, the grid spacings and the unit/physics parameters:
+ - mx, my, mz and the [n*_min, n*_max) bounds of the computational domain
+ - inverse spacings, dsmin, domain lengths and origins
+ - mxy, nxy, nxyz
+ - cs2_sound, cv_sound, M_star, G_CONST, GM_star, sq2GM_star
+
+The resulting config is always printed to stdout.
+
+NOTE(review): AC_M_star is converted from solar masses to code units in
+place, so calling this function more than once on the same struct applies
+the conversion repeatedly. Fixing that needs a separate parameter holding the
+unconverted mass.
+*/
+void
+update_config(AcMeshInfo* config)
+{
+    config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER;
+    ///////////// PAD TEST
+    //config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER + PAD_SIZE;
+    ///////////// PAD TEST
+    config->int_params[AC_my] = config->int_params[AC_ny] + STENCIL_ORDER;
+    config->int_params[AC_mz] = config->int_params[AC_nz] + STENCIL_ORDER;
+
+    // Bounds for the computational domain, i.e. nx_min <= i < nx_max
+    config->int_params[AC_nx_min] = STENCIL_ORDER / 2;
+    config->int_params[AC_nx_max] = config->int_params[AC_nx_min] +
+                                    config->int_params[AC_nx];
+    config->int_params[AC_ny_min] = STENCIL_ORDER / 2;
+    config->int_params[AC_ny_max] = config->int_params[AC_ny] +
+                                    STENCIL_ORDER / 2;
+    config->int_params[AC_nz_min] = STENCIL_ORDER / 2;
+    config->int_params[AC_nz_max] = config->int_params[AC_nz] +
+                                    STENCIL_ORDER / 2;
+
+    // Spacing
+    config->real_params[AC_inv_dsx] = AcReal(1.) / config->real_params[AC_dsx];
+    config->real_params[AC_inv_dsy] = AcReal(1.) / config->real_params[AC_dsy];
+    config->real_params[AC_inv_dsz] = AcReal(1.) / config->real_params[AC_dsz];
+    config->real_params[AC_dsmin] = min(config->real_params[AC_dsx], min(config->real_params[AC_dsy], config->real_params[AC_dsz]));
+
+    // Real grid coordinates (DEFINED FOR THE GRID WITH THE GHOST ZONES)
+    config->real_params[AC_xlen] = config->real_params[AC_dsx]*config->int_params[AC_mx];
+    config->real_params[AC_ylen] = config->real_params[AC_dsy]*config->int_params[AC_my];
+    config->real_params[AC_zlen] = config->real_params[AC_dsz]*config->int_params[AC_mz];
+
+    config->real_params[AC_xorig] = AcReal(.5) * config->real_params[AC_xlen];
+    config->real_params[AC_yorig] = AcReal(.5) * config->real_params[AC_ylen];
+    config->real_params[AC_zorig] = AcReal(.5) * config->real_params[AC_zlen];
+
+    /* Additional helper params */
+    // Int helpers
+    config->int_params[AC_mxy] = config->int_params[AC_mx] *
+                                 config->int_params[AC_my];
+    config->int_params[AC_nxy] = config->int_params[AC_nx] *
+                                 config->int_params[AC_ny];
+    config->int_params[AC_nxyz] = config->int_params[AC_nxy] *
+                                  config->int_params[AC_nz];
+
+    // Real helpers
+    config->real_params[AC_cs2_sound] = config->real_params[AC_cs_sound] *
+                                        config->real_params[AC_cs_sound];
+
+    config->real_params[AC_cv_sound] = config->real_params[AC_cp_sound] / config->real_params[AC_gamma];
+
+    AcReal G_CONST_CGS = AcReal(6.674e-8); // cm^3 g^-1 s^-2 (CGS) //TODO define in a separate module
+    AcReal M_sun       = AcReal(1.989e33);  // g solar mass
+
+    // Mass in code units: grams divided by unit_length^3 * unit_density
+    config->real_params[AC_M_star] = config->real_params[AC_M_star]*M_sun /
+                                     ( (config->real_params[AC_unit_length]*
+                                        config->real_params[AC_unit_length]*
+                                        config->real_params[AC_unit_length]) *
+                                        config->real_params[AC_unit_density] ) ;
+
+    // G has the dimension velocity^2 / (density * length^2), so that is the
+    // code unit it must be divided by. Using only one power of unit_length
+    // leaves G (and hence GM_star) with a residual dimension of 1/length.
+    config->real_params[AC_G_CONST] = G_CONST_CGS /
+                                      ( (config->real_params[AC_unit_velocity]*config->real_params[AC_unit_velocity]) /
+                                        (config->real_params[AC_unit_density] *
+                                         config->real_params[AC_unit_length]  *
+                                         config->real_params[AC_unit_length]) ) ;
+
+    config->real_params[AC_GM_star]  = config->real_params[AC_M_star]*config->real_params[AC_G_CONST];
+    config->real_params[AC_sq2GM_star]  = AcReal(sqrt(AcReal(2)*config->real_params[AC_GM_star]));
+
+
+    const bool print_config = true;
+    if (print_config) {
+        printf("###############################################################"
+               "\n");
+        printf("Config dimensions recalculated:\n");
+        print(*config);
+        printf("###############################################################"
+               "\n");
+    }
+}
+
+/**
+\brief Loads data from astaroth.conf into a config struct.
+
+The struct is first filled with 0xFF bytes so that entries which are neither
+read from the file nor derived by update_config() can be detected afterwards.
+A parameter whose 32-bit representation is legitimately 0xFFFFFFFF (e.g. an
+int set to -1) cannot be told apart from an unset one and also triggers the
+warning.
+
+\return 0 on success, -1 if there are potentially uninitialized values.
+*/
+int
+load_config(AcMeshInfo* config)
+{
+    int retval = 0;
+    // memset reads the second parameter as a byte even though it says int in
+    // the function declaration
+    memset(config, (uint8_t)0xFF, sizeof(*config));
+
+    parse_config(CONFIG_PATH "astaroth.conf", config);
+    update_config(config);
+
+    // sizeof(config) must be a multiple of 4 bytes for this to work
+    ERRCHK(sizeof(*config) % sizeof(uint32_t) == 0);
+
+    // Inspect the struct one 32-bit word at a time. The words are copied out
+    // with memcpy instead of being read through a casted pointer.
+    const unsigned char* bytes = (const unsigned char*)config;
+    const size_t num_words     = sizeof(*config) / sizeof(uint32_t);
+    for (size_t i = 0; i < num_words; ++i) {
+        uint32_t word;
+        memcpy(&word, bytes + i * sizeof(word), sizeof(word));
+        if (word == (uint32_t)0xFFFFFFFF) {
+            WARNING("Some config values may be uninitialized. "
+                    "See that all are defined in astaroth.conf\n");
+            retval = -1;
+            break; // One warning is enough, do not repeat it for every word
+        }
+    }
+    return retval;
+}
--- a/src/standalone/config_loader.h
+++ b/src/standalone/config_loader.h
@@ -0,0 +1,34 @@
+/*
+    Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
+ * \brief Functions for loading and updating AcMeshInfo.
+ *
+ */
+#pragma once
+#include "astaroth.h"
+
+/** Loads data from the config file into config and recomputes the derived
+ * parameters with update_config(). Returns 0 on success and -1 if some values
+ * may have been left uninitialized. */
+int load_config(AcMeshInfo* config);
+
+/** Recalculates the parameters which are derived from other entries of the
+ * config struct (for example mx, my and mz from nx, ny and nz). Must be
+ * called after modifying the config struct or otherwise contents of the
+ * struct will be incorrect */
+void update_config(AcMeshInfo* config);
--- a/src/standalone/main.cc
+++ b/src/standalone/main.cc
@@ -0,0 +1,94 @@
+/*
+    Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
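+ * \brief Entry point of the standalone Astaroth program.
+ *
+ * Selects the renderer, autotest, benchmark or simulation mode from the command-line option.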
+ *
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "core/errchk.h"
+#include "run.h"
+
+// Write all errors from stderr to an <errorlog_name> in the current working
+// directory
+static const bool write_log_to_a_file = false;
+static const char* errorlog_name      = "error.log";
+
+/** Redirects stderr to the file named by errorlog_name, opened in "w" mode.
+ *  A failed redirection is reported with perror(). */
+static void
+errorlog_init(void)
+{
+    // Log errors to a file
+    if (freopen(errorlog_name, "w", stderr) == NULL)
+        perror("Error redirecting stderr to a file");
+}
+
+/** Closes stderr and echoes the contents of the error log file to stdout.
+ *  Registered with atexit() in main() when logging to a file is enabled. */
+static void
+errorlog_quit(void)
+{
+    fclose(stderr);
+
+    // Print contents of the latest errorlog to screen
+    FILE* fp = fopen(errorlog_name, "r");
+    if (fp) {
+        for (int c = getc(fp); c != EOF; c = getc(fp))
+            putchar(c);
+        fclose(fp);
+    }
+    else {
+        // stderr was closed above, so perror() must not be used any more.
+        // Report the failure on stdout instead.
+        printf("Error opening error log %s\n", errorlog_name);
+    }
+}
+
+/**
+ * Program entry point. Echoes the arguments and dispatches on them:
+ *   no option -> run_renderer()
+ *   -t        -> run_autotest()
+ *   -b        -> run_benchmark()
+ *   -s        -> run_simulation()
+ * Any other invocation emits a warning and returns EXIT_FAILURE.
+ */
+int
+main(int argc, char* argv[])
+{
+    if (write_log_to_a_file) {
+        errorlog_init();
+        atexit(errorlog_quit);
+    }
+
+    printf("Args: \n");
+    int arg = 0;
+    while (arg < argc) {
+        printf("%d: %s\n", arg, argv[arg]);
+        ++arg;
+    }
+
+    switch (argc) {
+    case 1:
+        return run_renderer();
+    case 2: {
+        const char* option = argv[1];
+        if (!strcmp(option, "-t"))
+            return run_autotest();
+        if (!strcmp(option, "-b"))
+            return run_benchmark();
+        if (!strcmp(option, "-s"))
+            return run_simulation();
+
+        WARNING("Unrecognized option");
+        break;
+    }
+    default:
+        WARNING("Too many options given");
+        break;
+    }
+
+    return EXIT_FAILURE;
+}
--- a/src/standalone/model/host_memory.cc
+++ b/src/standalone/model/host_memory.cc
@@ -0,0 +1,737 @@
+/*
+    Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
+ * \brief Host-side mesh allocation and initial conditions.
+ *
+ * Creates AcMesh structs and fills their vertex buffers with initial data.
+ *
+ */
+#include "host_memory.h"
+
+#include <math.h>
+
+#include "core/errchk.h"
+
+// String table generated by expanding AC_FOR_INIT_TYPES with AC_GEN_STR
+// (presumably one name per InitType value; see the macro definitions).
+const char* init_type_names[] = {AC_FOR_INIT_TYPES(AC_GEN_STR)};
+
+// Half of the extent n * ds of the computational domain along each axis.
+// These macros read a variable named "mesh" (pointer to AcMesh) that must be
+// in scope at the point of use.
+#define XORIG (AcReal(.5) * mesh->info.int_params[AC_nx] * mesh->info.real_params[AC_dsx])
+#define YORIG (AcReal(.5) * mesh->info.int_params[AC_ny] * mesh->info.real_params[AC_dsy])
+#define ZORIG (AcReal(.5) * mesh->info.int_params[AC_nz] * mesh->info.real_params[AC_dsz])
+
+/*
+#include <stdint.h>
+static uint64_t ac_rand_next = 1;
+
+static int32_t
+ac_rand(void)
+{
+	ac_rand_next = ac_rand_next * 1103515245 + 12345;
+	return (uint32_t)(ac_rand_next/65536) % 32768;
+}
+
+static void
+ac_srand(const uint32_t seed)
+{
+	ac_rand_next = seed;	
+}
+*/
+
+/**
+ * Allocates an AcMesh on the host together with one vertex buffer of
+ * AC_VTXBUF_SIZE_BYTES(mesh_info) bytes per vertex buffer handle.
+ *
+ * The buffer contents are left uninitialized. Every allocation is checked
+ * with ERRCHK.
+ *
+ * \param mesh_info Mesh parameters, copied into the info field of the mesh.
+ * \return Pointer to the newly allocated mesh.
+ */
+AcMesh*
+acmesh_create(const AcMeshInfo& mesh_info)
+{
+    AcMesh* mesh = (AcMesh*)malloc(sizeof(*mesh));
+    // Check before the first dereference below
+    ERRCHK(mesh != NULL);
+    mesh->info = mesh_info;
+
+    const size_t bytes = AC_VTXBUF_SIZE_BYTES(mesh->info);
+    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
+        mesh->vertex_buffer[VertexBufferHandle(i)] = (AcReal*)malloc(bytes);
+        ERRCHK(mesh->vertex_buffer[VertexBufferHandle(i)] != NULL);
+    }
+
+    return mesh;
+}
+
+/** Writes val into all AC_VTXBUF_SIZE(mesh->info) elements of the vertex
+ *  buffer selected by key. */
+static void
+vertex_buffer_set(const VertexBufferHandle& key, const AcReal& val,
+                  AcMesh* mesh)
+{
+    const int count = AC_VTXBUF_SIZE(mesh->info);
+    int cell        = 0;
+    while (cell < count) {
+        mesh->vertex_buffer[key][cell] = val;
+        ++cell;
+    }
+}
+
+
+/** Resets every vertex buffer of the mesh to 1 rather than 0. Setting the mesh to zero is
+    problematic because some fields are supposed to be > 0 and the results would vary widely,
+    which leads to loss of precision in the computations */
+void
+acmesh_clear(AcMesh* mesh)
+{
+    // Init all fields to 1 by default.
+    int handle = 0;
+    while (handle < NUM_VTXBUF_HANDLES) {
+        vertex_buffer_set(VertexBufferHandle(handle), 1, mesh);
+        ++handle;
+    }
+}
+
+/** Returns the next rand() value divided by RAND_MAX, i.e. a pseudo-random
+ *  number in the range [0, 1]. */
+static AcReal
+randr(void)
+{
+    const AcReal sample = AcReal(rand());
+    const AcReal scale  = AcReal(RAND_MAX);
+    return sample / scale;
+}
+
+
+/**
+ * Sets VTXBUF_LNRHO to a uniform value (currently 0) over the full
+ * mx * my * mz index range of the mesh.
+ *
+ * The step profile this function is named after is disabled at the moment.
+ * It was of the form
+ *     tanhprof = 0.5 * ((rho2 + rho1) + (rho2 - rho1) * tanh(xx / trans))
+ * with xx = DX * i - xorig + cosz_wave, where cosz_wave is a cosine
+ * perturbation in z, and the field was set to log(tanhprof).
+ */
+void
+lnrho_step(AcMesh* mesh)
+{
+    const int mx = mesh->info.int_params[AC_mx];
+    const int my = mesh->info.int_params[AC_my];
+    const int mz = mesh->info.int_params[AC_mz];
+
+    // Background value written everywhere
+    const AcReal lnrho2 = (AcReal) 0.0; // TODO mesh->info.real_params[AC_lnrho2];
+
+    for (int k = 0; k < mz; k++) {
+        for (int j = 0; j < my; j++) {
+            for (int i = 0; i < mx; i++) {
+                const int idx = i + j * mx + k * mx * my;
+                // Step function initial condition is commented out:
+                // the field would be log(tanhprof) instead of lnrho2
+                mesh->vertex_buffer[VTXBUF_LNRHO][idx] = lnrho2;
+            }
+        }
+    }
+}
+
+// This is the initial condition type for the infalling vedge in the pseudodisk
+// model.
+//
+// The velocity written here depends only on the z index k:
+//   UUX = -AMPL_UU * cos(ANGL_UU) * |tanh(zz / trans)|
+//   UUY = 0
+//   UUZ = -AMPL_UU * sin(ANGL_UU) *  tanh(zz / trans)
+// with zz = DZ * k - zorig. All mx * my * mz cells are written. A random
+// density variation was experimented with here as well but is disabled.
+void
+inflow_vedge(AcMesh* mesh)
+{
+    const int mx = mesh->info.int_params[AC_mx];
+    const int my = mesh->info.int_params[AC_my];
+    const int mz = mesh->info.int_params[AC_mz];
+
+    const double DZ = mesh->info.real_params[AC_dsz];
+
+    const double AMPL_UU = mesh->info.real_params[AC_ampl_uu];
+    const double ANGL_UU = mesh->info.real_params[AC_angl_uu];
+
+    const double zorig = mesh->info.real_params[AC_zorig];
+    double trans       = mesh->info.real_params[AC_trans];
+
+    for (int k = 0; k < mz; k++) {
+        // Both velocity components are constant within a z-plane
+        const double zz      = DZ * double(k) - zorig;
+        const AcReal plane_x = AcReal(-AMPL_UU*cos(ANGL_UU)*fabs(tanh(zz/trans)));
+        const AcReal plane_z = AcReal(-AMPL_UU*sin(ANGL_UU)*tanh(zz/trans));
+
+        for (int j = 0; j < my; j++) {
+            for (int i = 0; i < mx; i++) {
+                const int idx = i + j * mx + k * mx * my;
+                mesh->vertex_buffer[VTXBUF_UUX][idx] = plane_x;
+                mesh->vertex_buffer[VTXBUF_UUY][idx] = AcReal(0.0);
+                mesh->vertex_buffer[VTXBUF_UUZ][idx] = plane_z;
+            }
+        }
+    }
+}
+
+// This is the initial condition type for the infalling vedge in the pseudodisk
+// model.
+//
+// Every cell gets a velocity of magnitude SQ2GM / sqrt(RR) pointing towards
+// the star position in the xz-plane, where RR is the distance to the star.
+// The vector is then rotated by ANGL_UU, with the opposite sense of rotation
+// below the star (delz < 0). UUY is set to zero.
+//
+// NOTE(review): RR is zero for a cell that coincides with the star position,
+// in which case the divisions below produce non-finite velocities.
+void
+inflow_vedge_freefall(AcMesh* mesh)
+{
+    const AcMeshInfo& info = mesh->info;
+
+    const int mx = info.int_params[AC_mx];
+    const int my = info.int_params[AC_my];
+    const int mz = info.int_params[AC_mz];
+
+    const double DX = info.real_params[AC_dsx];
+    const double DZ = info.real_params[AC_dsz];
+
+    const double ANGL_UU = info.real_params[AC_angl_uu];
+    const double SQ2GM   = info.real_params[AC_sq2GM_star];
+
+    const double xorig = info.real_params[AC_xorig];
+    const double zorig = info.real_params[AC_zorig];
+
+    const double star_pos_x = info.real_params[AC_star_pos_x];
+    const double star_pos_z = info.real_params[AC_star_pos_z];
+
+    //TODO: Figure out isthis needed. Now a placeholder.
+    //tanhz = fabs(tanh(zz/trans));
+    const double tanhz = 1.0;
+
+    for (int k = 0; k < mz; k++) {
+        const double zz   = DZ * double(k) - zorig;
+        const double delz = zz - star_pos_z;
+
+        for (int j = 0; j < my; j++) {
+            for (int i = 0; i < mx; i++) {
+                const int idx     = i + j * mx + k * mx * my;
+                const double xx   = DX * double(i) - xorig;
+                const double delx = xx - star_pos_x;
+
+                const double RR     = sqrt(delx*delx + delz*delz);
+                const double veltot = SQ2GM/sqrt(RR); //Free fall velocity
+
+                //Normal velocity components
+                const double u_x = - veltot*(delx/RR);
+                const double u_z = - veltot*(delz/RR);
+
+                //Here including an angel tilt due to pseudodisk
+                double tilted_x, tilted_z;
+                if (delz >= 0.0) {
+                    tilted_x = (u_x*cos(ANGL_UU) - u_z*sin(ANGL_UU))*tanhz;
+                    tilted_z = (u_x*sin(ANGL_UU) + u_z*cos(ANGL_UU))*tanhz;
+                } else {
+                    tilted_x = (u_x*cos(ANGL_UU) + u_z*sin(ANGL_UU))*tanhz;
+                    tilted_z = (-u_x*sin(ANGL_UU) + u_z*cos(ANGL_UU))*tanhz;
+                }
+
+                mesh->vertex_buffer[VTXBUF_UUX][idx] = AcReal(tilted_x);
+                mesh->vertex_buffer[VTXBUF_UUY][idx] = AcReal(0.0);
+                mesh->vertex_buffer[VTXBUF_UUZ][idx] = AcReal(tilted_z);
+            }
+        }
+    }
+}
+
+// Only x-direction free fall
+//
+// Currently starts from a steady state: the velocity is set to zero and
+// VTXBUF_LNRHO to AC_ampl_lnrho in every cell. The free-fall speed
+// SQ2GM / sqrt(|x - star_pos_x|) is still evaluated, but only to print a
+// diagnostic line when it is infinite, i.e. when a cell lies exactly on the
+// star position.
+void
+inflow_freefall_x(AcMesh* mesh)
+{
+    const int mx = mesh->info.int_params[AC_mx];
+    const int my = mesh->info.int_params[AC_my];
+    const int mz = mesh->info.int_params[AC_mz];
+
+    const double DX    = mesh->info.real_params[AC_dsx];
+
+    const double SQ2GM = mesh->info.real_params[AC_sq2GM_star];
+
+    const double xorig = mesh->info.real_params[AC_xorig];
+    double xx, RR;
+    double delx;
+    double veltot;
+
+    const double star_pos_x = mesh->info.real_params[AC_star_pos_x];
+
+    const double ampl_lnrho = mesh->info.real_params[AC_ampl_lnrho];
+
+    for (int k = 0; k < mz; k++) {
+        for (int j = 0; j < my; j++) {
+            for (int i = 0; i < mx; i++) {
+                int idx = i + j * mx + k * mx * my;
+                xx = DX * double(i) - xorig;
+
+                delx = xx - star_pos_x;
+
+                RR = fabs(delx);
+
+                veltot = SQ2GM/sqrt(RR); //Free fall velocity
+
+                // Test the truth value of isinf(): depending on the library
+                // it returns a bool or a signed int, so comparing against 1
+                // is fragile. The printed value is delx and is labelled so.
+                if (isinf(veltot)) printf("xx %e star_pos_x %e delx %e RR %e veltot %e\n",xx, star_pos_x, delx, RR, veltot);
+
+                //Freefall condition (disabled): UUX = -veltot, UUY = UUZ = 0
+
+                //Starting with steady state
+                mesh->vertex_buffer[VTXBUF_UUX][idx] = 0.0;
+                mesh->vertex_buffer[VTXBUF_UUY][idx] = 0.0;
+                mesh->vertex_buffer[VTXBUF_UUZ][idx] = 0.0;
+
+                mesh->vertex_buffer[VTXBUF_LNRHO][idx] = AcReal(ampl_lnrho);
+            }
+        }
+    }
+}
+
+
+
+/**
+ * Initializes the velocity field to a radially outward "explosion".
+ *
+ * For every cell of the computational domain [n*_min, n*_max) the position
+ * relative to the origin (XORIG, YORIG, ZORIG) is converted to spherical
+ * angles theta and phi, and the velocity is set to
+ *     uu_radial * (sin(theta)cos(phi), sin(theta)sin(phi), cos(theta))
+ * with the Gaussian shell profile
+ *     uu_radial = AMPL_UU * exp(-(rr - UU_SHELL_R)^2 / (2 * WIDTH_UU^2)).
+ * Cells outside the computational domain are not written.
+ *
+ * The angle computations print a diagnostic line whenever an angle falls
+ * outside the range expected for its quadrant.
+ */
+void
+gaussian_radial_explosion(AcMesh* mesh)
+{
+    AcReal* uu_x = mesh->vertex_buffer[VTXBUF_UUX];
+    AcReal* uu_y = mesh->vertex_buffer[VTXBUF_UUY];
+    AcReal* uu_z = mesh->vertex_buffer[VTXBUF_UUZ];
+
+    const int mx = mesh->info.int_params[AC_mx];
+    const int my = mesh->info.int_params[AC_my];
+
+    const int nx_min = mesh->info.int_params[AC_nx_min];
+    const int nx_max = mesh->info.int_params[AC_nx_max];
+    const int ny_min = mesh->info.int_params[AC_ny_min];
+    const int ny_max = mesh->info.int_params[AC_ny_max];
+    const int nz_min = mesh->info.int_params[AC_nz_min];
+    const int nz_max = mesh->info.int_params[AC_nz_max];
+
+    const double DX    = mesh->info.real_params[AC_dsx];
+    const double DY    = mesh->info.real_params[AC_dsy];
+    const double DZ    = mesh->info.real_params[AC_dsz];
+
+    // The origin is shifted by 1e-6; presumably so that no grid point lands
+    // exactly on it (rr == 0) -- TODO confirm
+    const double xorig = double(XORIG) - 0.000001; 
+    const double yorig = double(YORIG) - 0.000001;
+    const double zorig = double(ZORIG) - 0.000001;
+
+    // Centre of the explosion relative to the origin
+    const double INIT_LOC_UU_X = 0.0;
+    const double INIT_LOC_UU_Y = 0.0;
+    const double INIT_LOC_UU_Z = 0.0;
+
+    const double AMPL_UU    = mesh->info.real_params[AC_ampl_uu];
+    const double UU_SHELL_R = 0.8; // Radius at which the Gaussian peaks
+    const double WIDTH_UU   = 0.2; // Standard deviation of the Gaussian
+
+    // Outward explosion with gaussian initial velocity profile.
+    // theta and phi are declared outside the loops: when rr == 0 they keep
+    // the values of the previously visited cell (uu_radial is 0 there).
+    int idx;
+    double xx, yy, zz, rr2, rr, theta = 0.0, phi = 0.0;
+    double uu_radial;
+
+    // double theta_old = 0.0;
+
+    for (int k = nz_min; k < nz_max; k++) {
+        for (int j = ny_min; j < ny_max; j++) {
+            for (int i = nx_min; i < nx_max; i++) {
+                // Calculate the value of velocity in a particular radius.
+                idx = i + j * mx + k * mx * my;
+                // Determine the coordinates
+                xx = DX * (i - nx_min) - xorig;
+                xx = xx - INIT_LOC_UU_X;
+
+                yy = DY * (j - ny_min) - yorig;
+                yy = yy - INIT_LOC_UU_Y;
+
+                zz = DZ * (k - nz_min) - zorig;
+                zz = zz - INIT_LOC_UU_Z;
+
+                rr2 = pow(xx, 2.0) + pow(yy, 2.0) + pow(zz, 2.0);
+                rr  = sqrt(rr2);
+
+                // Origin is different!
+                double xx_abs, yy_abs, zz_abs;
+                if (rr > 0.0) {
+                    // theta range [0, PI]
+                    if (zz >= 0.0) {
+                        // Upper half space: theta in [0, PI/2]
+                        theta = acos(zz / rr);
+                        if (theta > M_PI / 2.0 || theta < 0.0) {
+                            printf("Explosion THETA WRONG: zz = %.3f, rr = "
+                                   "%.3f, theta = %.3e/PI, M_PI = %.3e\n",
+                                   zz, rr, theta / M_PI, M_PI);
+                        }
+                    }
+                    else {
+                        // Lower half space: theta in (PI/2, PI]
+                        zz_abs = -zz; // Needs a positive value for acos
+                        theta  = M_PI - acos(zz_abs / rr);
+                        if (theta < M_PI / 2.0 || theta > 2 * M_PI) {
+                            printf("Explosion THETA WRONG: zz = %.3f, rr = "
+                                   "%.3f, theta = %.3e/PI, M_PI = %.3e\n",
+                                   zz, rr, theta / M_PI, M_PI);
+                        }
+                    }
+
+                    // phi range [0, 2*PI]
+                    // Handled quadrant by quadrant; the comment in each
+                    // branch gives the signs of (xx, yy)
+                    if (xx != 0.0) {
+                        if (xx < 0.0 && yy >= 0.0) {
+                            //-+
+                            xx_abs = -xx; // Needs a positive value for atan
+                            phi    = M_PI - atan(yy / xx_abs);
+                            if (phi < (M_PI / 2.0) || phi > M_PI) {
+                                printf("Explosion PHI WRONG -+: xx = %.3f, yy "
+                                       "= %.3f, phi = %.3e/PI, M_PI = %.3e\n",
+                                       xx, yy, phi / M_PI, M_PI);
+                            }
+                        }
+                        else if (xx > 0.0 && yy < 0.0) {
+                            //+-
+                            yy_abs = -yy;
+                            phi    = 2.0 * M_PI - atan(yy_abs / xx);
+                            if (phi < (3.0 * M_PI) / 2.0 ||
+                                phi > (2.0 * M_PI + 1e-6)) {
+                                printf("Explosion PHI WRONG +-: xx = %.3f, yy "
+                                       "= %.3f, phi = %.3e/PI, M_PI = %.3e\n",
+                                       xx, yy, phi / M_PI, M_PI);
+                            }
+                        }
+                        else if (xx < 0.0 && yy < 0.0) {
+                            //--
+                            yy_abs = -yy;
+                            xx_abs = -xx;
+                            phi    = M_PI + atan(yy_abs / xx_abs);
+                            if (phi < M_PI ||
+                                phi > ((3.0 * M_PI) / 2.0 + 1e-6)) {
+                                printf("Explosion PHI WRONG --: xx = %.3f, yy "
+                                       "= %.3f, xx_abs = %.3f, yy_abs = %.3f, "
+                                       "phi = %.3e, (3.0*M_PI)/2.0 = %.3e\n",
+                                       xx, yy, xx_abs, yy_abs, phi,
+                                       (3.0 * M_PI) / 2.0);
+                            }
+                        }
+                        else {
+                            //++
+                            // NOTE(review): the message below is labelled
+                            // "--" and prints 3*PI/2 although this is the ++
+                            // quadrant with the expected range [0, PI/2]
+                            phi = atan(yy / xx);
+                            if (phi < 0 || phi > M_PI / 2.0) {
+                                printf(
+                                    "Explosion PHI WRONG --: xx = %.3f, yy = "
+                                    "%.3f, phi = %.3e, (3.0*M_PI)/2.0 = %.3e\n",
+                                    xx, yy, phi, (3.0 * M_PI) / 2.0);
+                            }
+                        }
+                    }
+                    else { // To avoid div by zero with atan
+                        if (yy > 0.0) {
+                            phi = M_PI / 2.0;
+                        }
+                        else if (yy < 0.0) {
+                            phi = (3.0 * M_PI) / 2.0;
+                        }
+                        else {
+                            phi = 0.0;
+                        }
+                    }
+
+                    // Set zero for explicit safekeeping
+                    if (xx == 0.0 && yy == 0.0) {
+                        phi = 0.0;
+                    }
+
+                    // Gaussian velocity
+                    // Earlier variants of the profile:
+                    //   uu_radial = AMPL_UU*exp(-rr2 / (2.0*pow(WIDTH_UU, 2.0)));
+                    //   uu_radial = AMPL_UU*exp(-pow((rr - 4.0*WIDTH_UU),2.0)
+                    //                           / (2.0*pow(WIDTH_UU, 2.0)));
+                    // The current one peaks away from the exact centre.
+                    // TODO: Parametrize the peak location.
+                    uu_radial = AMPL_UU * exp(-pow((rr - UU_SHELL_R), 2.0) /
+                                              (2.0 * pow(WIDTH_UU, 2.0)));
+                }
+                else {
+                    uu_radial = 0.0; // TODO: There will be a discontinuity in
+                                     // the origin... Should the shape of the
+                                     // distribution be different?
+                }
+
+                // Determine the Cartesian velocity components
+                uu_x[idx] = AcReal(uu_radial * sin(theta) * cos(phi));
+                uu_y[idx] = AcReal(uu_radial * sin(theta) * sin(phi));
+                uu_z[idx] = AcReal(uu_radial * cos(theta));
+
+                // Temporary diagnostic output (TODO: Remove after not needed)
+                // if (theta > theta_old) {
+                // if (theta > M_PI || theta < 0.0 || phi < 0.0 || phi > 2*M_PI)
+                // {
+                /*	printf("Explosion: xx = %.3f, yy = %.3f, zz = %.3f, rr =
+                   %.3f, phi = %.3e/PI, theta = %.3e/PI\n, M_PI = %.3e", xx, yy,
+                   zz, rr, phi/M_PI, theta/M_PI, M_PI); printf(" uu_radial =
+                   %.3e, uu_x[%i] = %.3e, uu_y[%i] = %.3e, uu_z[%i] = %.3e \n",
+                                uu_radial, idx, uu_x[idx], idx, uu_y[idx], idx,
+                   uu_z[idx]); theta_old = theta;
+                */
+            }
+        }
+    }
+}
+
+/**
+ * Fills the vertex buffers of a host mesh with the requested initial condition
+ * and prints the global minimum and maximum over all vertex buffers.
+ *
+ * The C random generator is reseeded with a fixed seed on every call. Several
+ * cases call this function recursively with INIT_TYPE_RANDOM to obtain a noisy
+ * background; each such nested call reseeds again and prints its own MAX/MIN
+ * line before the outer call prints the final one.
+ *
+ * @param init_type  Initial condition to generate. An unknown value is
+ *                   reported through ERROR().
+ * @param mesh       Mesh whose vertex buffers are overwritten. Must have been
+ *                   allocated for the dimensions stored in mesh->info.
+ */
+void
+acmesh_init_to(const InitType& init_type, AcMesh* mesh)
+{
+    // Fixed seed: repeated calls start from the same random sequence
+    srand(123456789);
+
+    // Number of elements in one vertex buffer
+    const int n = AC_VTXBUF_SIZE(mesh->info);
+
+    // Full mesh dimensions, used for linear indexing (i + j * mx + k * mx * my)
+    const int mx = mesh->info.int_params[AC_mx];
+    const int my = mesh->info.int_params[AC_my];
+    const int mz = mesh->info.int_params[AC_mz];
+
+    // Index bounds used by the cases that only fill part of the mesh
+    // (lower bound inclusive, upper bound exclusive in the loops below)
+    const int nx_min = mesh->info.int_params[AC_nx_min];
+    const int nx_max = mesh->info.int_params[AC_nx_max];
+    const int ny_min = mesh->info.int_params[AC_ny_min];
+    const int ny_max = mesh->info.int_params[AC_ny_max];
+    const int nz_min = mesh->info.int_params[AC_nz_min];
+    const int nz_max = mesh->info.int_params[AC_nz_max];
+
+    switch (init_type) {
+    case INIT_TYPE_RANDOM: {
+        // Every element of every buffer is set to 2 * range * randr() - range
+        acmesh_clear(mesh);
+        const AcReal range = AcReal(0.01);
+        for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
+            for (int i = 0; i < n; ++i)
+                mesh->vertex_buffer[w][i] = 2 * range * randr() - range;
+
+        break;
+    }
+    case INIT_TYPE_GAUSSIAN_RADIAL_EXPL:
+        acmesh_clear(mesh);
+        //acmesh_init_to(INIT_TYPE_RANDOM, mesh);
+        gaussian_radial_explosion(mesh);
+
+        break;
+    case INIT_TYPE_XWAVE:
+        // Random background with UUX overwritten over the whole mesh
+        acmesh_clear(mesh);
+        acmesh_init_to(INIT_TYPE_RANDOM, mesh);
+        for (int k = 0; k < mz; k++) {
+            for (int j = 0; j < my; j++) {
+                for (int i = 0; i < mx; i++) {
+                    int idx = i + j * mx + k * mx * my;
+                    // NOTE(review): the profile depends on the j index but is
+                    // scaled by mx - confirm that this is intended
+                    mesh->vertex_buffer[VTXBUF_UUX][idx] = 2*AcReal(sin(j * AcReal(M_PI) / mx)) - 1;
+                }
+            }
+        }
+        break;
+    case INIT_TYPE_VEDGE:
+        acmesh_clear(mesh);
+        inflow_vedge_freefall(mesh);
+        break;
+    case INIT_TYPE_VEDGEX:
+        acmesh_clear(mesh);
+        inflow_freefall_x(mesh);
+        break;
+    case INIT_TYPE_RAYLEIGH_TAYLOR:
+        acmesh_clear(mesh);
+        inflow_freefall_x(mesh);
+        lnrho_step(mesh);
+        break;
+    case INIT_TYPE_ABC_FLOW: {
+        // Random background with the velocity components overwritten in the
+        // [n*_min, n*_max) index range
+        acmesh_clear(mesh);
+        acmesh_init_to(INIT_TYPE_RANDOM, mesh);
+        for (int k = nz_min; k < nz_max; k++) {
+            for (int j = ny_min; j < ny_max; j++) {
+                for (int i = nx_min; i < nx_max; i++) {
+                    const int idx = i + j * mx + k * mx * my;
+
+                    /*
+                    const double xx = double(
+                        mesh->info.real_params[AC_dsx] *
+                            (i - mesh->info.int_params[AC_nx_min]) -
+                        XORIG + AcReal(.5) * mesh->info.real_params[AC_dsx]);
+                    const double yy = double(
+                        mesh->info.real_params[AC_dsy] *
+                            (j - mesh->info.int_params[AC_ny_min]) -
+                        YORIG + AcReal(.5) * mesh->info.real_params[AC_dsy]);
+                    const double zz = double(
+                        mesh->info.real_params[AC_dsz] *
+                            (k - mesh->info.int_params[AC_nz_min]) -
+                        ZORIG + AcReal(.5) * mesh->info.real_params[AC_dsz]);
+                    */
+
+                    // Coordinates: index offset times grid spacing, minus the origin
+                    const AcReal xx = (i - nx_min) * mesh->info.real_params[AC_dsx] - XORIG;
+                    const AcReal yy = (j - ny_min) * mesh->info.real_params[AC_dsy] - YORIG;
+                    const AcReal zz = (k - nz_min) * mesh->info.real_params[AC_dsz] - ZORIG;
+
+                    // Amplitude, ABC coefficients and wavenumbers (hard-coded)
+                    const AcReal ampl_uu = 0.5;
+                    const AcReal ABC_A   = 1.;
+                    const AcReal ABC_B   = 1.;
+                    const AcReal ABC_C   = 1.;
+                    const AcReal kx_uu   = 8.;
+                    const AcReal ky_uu   = 8.;
+                    const AcReal kz_uu   = 8.;
+
+                    mesh->vertex_buffer[VTXBUF_UUX][idx] = ampl_uu * (ABC_A * (AcReal)sin(kz_uu * zz) + ABC_C * (AcReal)cos(ky_uu * yy));
+                    mesh->vertex_buffer[VTXBUF_UUY][idx] = ampl_uu * (ABC_B * (AcReal)sin(kx_uu * xx) + ABC_A * (AcReal)cos(kz_uu * zz));
+                    mesh->vertex_buffer[VTXBUF_UUZ][idx] = ampl_uu * (ABC_C * (AcReal)sin(ky_uu * yy) + ABC_B * (AcReal)cos(kx_uu * xx));
+                }
+            }
+        }
+        break;
+    }
+    case INIT_TYPE_RAYLEIGH_BENARD: {
+        acmesh_init_to(INIT_TYPE_RANDOM, mesh);
+        #if LTEMPERATURE
+        vertex_buffer_set(VTXBUF_LNRHO, 1, mesh);
+        // Temperature grows linearly with the k index, starting from 0.1 at
+        // k == nz_min with a total slope of `range` over AC_nz points
+        const AcReal range = AcReal(0.9);
+        for (int k = nz_min; k < nz_max; k++) {
+            for (int j = ny_min; j < ny_max; j++) {
+                for (int i = nx_min; i < nx_max; i++) {
+                    const int idx = i + j * mx + k * mx * my;
+                    mesh->vertex_buffer[VTXBUF_TEMPERATURE][idx] = (range * (k - nz_min)) / mesh->info.int_params[AC_nz] + 0.1;
+                }
+            }
+        }
+        #else
+        WARNING("INIT_TYPE_RAYLEIGH_BERNARD called even though VTXBUF_TEMPERATURE is not used");
+        #endif
+        break;
+    }
+    default:
+        ERROR("Unknown init_type");
+    }
+
+    // Find the global extrema over all vertex buffers for the diagnostic print.
+    // The maximum must start from a large negative value (-1e32, not -1e-32),
+    // otherwise a mesh containing only negative values reports a wrong maximum.
+    AcReal max_val = AcReal(-1e32);
+    AcReal min_val = AcReal(1e32);
+    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w) {
+        for (int i = 0; i < n; ++i) {
+            if (mesh->vertex_buffer[w][i] < min_val)
+                min_val = mesh->vertex_buffer[w][i];
+            if (mesh->vertex_buffer[w][i] > max_val)
+                max_val = mesh->vertex_buffer[w][i];
+        }
+    }
+    printf("MAX: %f MIN %f\n", double(max_val), double(min_val));
+    // Normalization of the grid to [-1, 1] is currently disabled
+    /*
+    const AcReal inv_range = AcReal(1.) / fabs(max_val - min_val);
+    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w) {
+        for (int i = 0; i < n; ++i) {
+            mesh->vertex_buffer[w][i] = 2*inv_range*(mesh->vertex_buffer[w][i] - min_val) - 1;
+        }
+    }
+    */
+}
+
+/**
+ * Frees every vertex buffer of the mesh and then the mesh structure itself.
+ *
+ * @param mesh  Mesh to release. Passing NULL is a no-op, mirroring free().
+ *              The pointer must not be used after this call.
+ */
+void
+acmesh_destroy(AcMesh* mesh)
+{
+    // Guard before touching mesh->vertex_buffer: free(NULL) is harmless, but
+    // dereferencing a null mesh is not
+    if (mesh == NULL)
+        return;
+
+    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i)
+        free(mesh->vertex_buffer[VertexBufferHandle(i)]);
+
+    free(mesh);
+}
+
+
+/**
+ * Allocates a ModelMesh together with one vertex buffer per vertex buffer
+ * handle, each sized for AC_VTXBUF_SIZE(mesh_info) elements.
+ *
+ * The buffers are obtained with malloc and are therefore left uninitialized.
+ * Every allocation is verified with ERRCHK.
+ *
+ * @param mesh_info  Mesh configuration; copied into the returned mesh.
+ * @return           The newly allocated mesh. Release it with
+ *                   modelmesh_destroy().
+ */
+ModelMesh*
+modelmesh_create(const AcMeshInfo& mesh_info)
+{
+    ModelMesh* mesh = (ModelMesh*)malloc(sizeof(*mesh));
+    // Check before the first dereference, consistently with the buffer
+    // allocations below
+    ERRCHK(mesh != NULL);
+    mesh->info = mesh_info;
+
+    const size_t bytes = AC_VTXBUF_SIZE(mesh->info) * sizeof(mesh->vertex_buffer[0][0]);
+    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
+        mesh->vertex_buffer[VertexBufferHandle(i)] = (ModelScalar*)malloc(bytes);
+        ERRCHK(mesh->vertex_buffer[VertexBufferHandle(i)] != NULL);
+    }
+
+    return mesh;
+}
+
+/**
+ * Frees every vertex buffer of the model mesh and then the mesh structure
+ * itself.
+ *
+ * @param mesh  Mesh to release. Passing NULL is a no-op, mirroring free().
+ *              The pointer must not be used after this call.
+ */
+void
+modelmesh_destroy(ModelMesh* mesh)
+{
+    // Guard before touching mesh->vertex_buffer: free(NULL) is harmless, but
+    // dereferencing a null mesh is not
+    if (mesh == NULL)
+        return;
+
+    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i)
+        free(mesh->vertex_buffer[VertexBufferHandle(i)]);
+
+    free(mesh);
+}
+
+#include <string.h> // memcpy
+/**
+ * Copies an AcMesh into an already allocated ModelMesh.
+ *
+ * The mesh info is copied bytewise and every vertex buffer element is
+ * converted to ModelScalar.
+ *
+ * @param acmesh     Source mesh.
+ * @param modelmesh  Destination mesh; its buffers must hold at least
+ *                   AC_VTXBUF_SIZE(acmesh.info) elements each.
+ */
+void
+acmesh_to_modelmesh(const AcMesh& acmesh, ModelMesh* modelmesh)
+{
+    // The bytewise copy below is only valid if both info structs have the same size
+    ERRCHK(sizeof(acmesh.info) == sizeof(modelmesh->info));
+    memcpy(&modelmesh->info, &acmesh.info, sizeof(acmesh.info));
+
+    const size_t num_elems = AC_VTXBUF_SIZE(acmesh.info);
+    for (int handle = 0; handle < NUM_VTXBUF_HANDLES; ++handle) {
+        for (size_t elem = 0; elem < num_elems; ++elem) {
+            modelmesh->vertex_buffer[handle][elem] = (ModelScalar)acmesh.vertex_buffer[handle][elem];
+        }
+    }
+}
+
+/**
+ * Copies a ModelMesh into an already allocated AcMesh.
+ *
+ * The mesh info is copied bytewise and every vertex buffer element is
+ * converted to AcReal.
+ *
+ * @param modelmesh  Source mesh.
+ * @param acmesh     Destination mesh; its buffers must hold at least
+ *                   AC_VTXBUF_SIZE(modelmesh.info) elements each.
+ */
+void
+modelmesh_to_acmesh(const ModelMesh& modelmesh, AcMesh* acmesh)
+{
+    // The bytewise copy below is only valid if both info structs have the same size
+    ERRCHK(sizeof(acmesh->info) == sizeof(modelmesh.info));
+    memcpy(&acmesh->info, &modelmesh.info, sizeof(modelmesh.info));
+
+    const size_t num_elems = AC_VTXBUF_SIZE(modelmesh.info);
+    for (int handle = 0; handle < NUM_VTXBUF_HANDLES; ++handle) {
+        for (size_t elem = 0; elem < num_elems; ++elem) {
+            acmesh->vertex_buffer[handle][elem] = (AcReal)modelmesh.vertex_buffer[handle][elem];
+        }
+    }
+}
--- a/src/standalone/model/host_memory.h
+++ b/src/standalone/model/host_memory.h
@@ -0,0 +1,58 @@
+/*
+    Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
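+ * \brief Creation, initialization and conversion of host-side meshes.
+ *
+ * Declares the available initial condition types (InitType) and the functions
+ * for creating, clearing, initializing and destroying AcMesh and ModelMesh
+ * structures, and for converting between the two.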
+ *
+ */
+#pragma once
+#include "astaroth.h"
+#include "modelmesh.h"
+
+// List of all initial condition types. FUNC is applied to each identifier and
+// the results are separated by commas, so the same list can generate both the
+// InitType enum below and other per-type tables.
+// clang-format off
+#define AC_FOR_INIT_TYPES(FUNC)\
+        FUNC(INIT_TYPE_RANDOM), \
+        FUNC(INIT_TYPE_XWAVE), \
+        FUNC(INIT_TYPE_GAUSSIAN_RADIAL_EXPL), \
+        FUNC(INIT_TYPE_ABC_FLOW) , \
+        FUNC(INIT_TYPE_VEDGE), \
+        FUNC(INIT_TYPE_VEDGEX), \
+        FUNC(INIT_TYPE_RAYLEIGH_TAYLOR), \
+        FUNC(INIT_TYPE_RAYLEIGH_BENARD)
+// clang-format on
+
+// Enumeration of the initial condition types; NUM_INIT_TYPES is the last
+// enumerator. AC_GEN_ID is defined outside this header (presumably it expands
+// to the bare identifier - confirm).
+typedef enum { AC_FOR_INIT_TYPES(AC_GEN_ID), NUM_INIT_TYPES } InitType;
+
+// Names of the init types
+// NOTE(review): presumably indexed by InitType - confirm in host_memory.cc
+extern const char* init_type_names[]; // Defined in host_memory.cc
+
+// Creates an AcMesh for the given mesh configuration.
+AcMesh* acmesh_create(const AcMeshInfo& mesh_info);
+
+// Clears the contents of the mesh (called by acmesh_init_to before a mesh is
+// filled).
+void acmesh_clear(AcMesh* mesh);
+
+// Fills the vertex buffers of the mesh with the initial condition `type`.
+void acmesh_init_to(const InitType& type, AcMesh* mesh);
+
+// Frees the vertex buffers of the mesh and the mesh itself.
+void acmesh_destroy(AcMesh* mesh);
+
+// Allocates a ModelMesh and its vertex buffers for the given configuration.
+ModelMesh* modelmesh_create(const AcMeshInfo& mesh_info);
+// Frees the vertex buffers of the model mesh and the mesh itself.
+void modelmesh_destroy(ModelMesh* mesh);
+// Copies the info and all vertex buffers of `acmesh` into `modelmesh`,
+// converting each element to ModelScalar.
+void acmesh_to_modelmesh(const AcMesh& acmesh, ModelMesh* modelmesh);
+// Copies the info and all vertex buffers of `model` into `acmesh`,
+// converting each element to AcReal.
+void modelmesh_to_acmesh(const ModelMesh& model, AcMesh* acmesh);
--- a/src/standalone/model/host_timestep.cc
+++ b/src/standalone/model/host_timestep.cc
@@ -0,0 +1,63 @@
+/*
+    Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
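+ * \brief Host-side computation of the simulation timestep.
+ *
+ * Computes the timestep from the maximum velocity and the mesh parameters.
+ * The result can be scaled globally with set_timescale().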
+ *
+ */
+#include "host_timestep.h"
+
+#include "core/math_utils.h"
+
+// Global multiplier applied to every timestep returned by host_timestep().
+// Starts at 1 and is changed with set_timescale().
+static AcReal timescale = AcReal(1.0);
+
+/**
+ * Computes the timestep on the host.
+ *
+ * The result is the smaller of an advective limit and a diffusive limit,
+ * multiplied by the global timescale. All intermediate arithmetic is done in
+ * long double. See the Pencil Code user manual p. 38 (timestep section).
+ *
+ * @param umax       Maximum velocity; its absolute value is used.
+ * @param mesh_info  Source of the real parameters AC_cdt, AC_cdtv,
+ *                   AC_cs2_sound, AC_nu_visc, AC_eta, AC_gamma and AC_dsmin.
+ * @return           timescale * min(advective limit, diffusive limit).
+ */
+AcReal
+host_timestep(const AcReal& umax, const AcMeshInfo& mesh_info)
+{
+    // Safety factors and the smallest grid spacing
+    const long double cdt   = mesh_info.real_params[AC_cdt];
+    const long double cdtv  = mesh_info.real_params[AC_cdtv];
+    const long double dsmin = mesh_info.real_params[AC_dsmin];
+
+    // Squared sound speed
+    const long double cs2_sound = mesh_info.real_params[AC_cs2_sound];
+
+    // Coefficients entering the diffusive limit
+    const long double nu_visc = mesh_info.real_params[AC_nu_visc];
+    const long double eta     = mesh_info.real_params[AC_eta];
+    const long double gamma   = mesh_info.real_params[AC_gamma];
+    const long double chi     = 0; // mesh_info.real_params[AC_chi]; // TODO not calculated
+
+    // Advective limit: cdt * dsmin / (|umax| + sound speed)
+    const long double signal_speed = fabsl(umax) + sqrtl(cs2_sound + 0.0l);
+    const long double advective_dt = cdt * dsmin / signal_speed;
+
+    // Diffusive limit: cdtv * dsmin^2 / (largest coefficient).
+    // TODO NOTE: remove the "+ 1" to get scientifically accurate results
+    const long double largest_coeff = max(max(nu_visc, eta), max(gamma, chi));
+    const long double diffusive_dt  = cdtv * dsmin * dsmin / largest_coeff + 1;
+
+    const long double limiting_dt = min(advective_dt, diffusive_dt);
+    return AcReal(timescale) * AcReal(limiting_dt);
+}
+
+/**
+ * Sets the global factor by which host_timestep() multiplies its result.
+ *
+ * @param scale  New value of the file-local `timescale` variable.
+ */
+void
+set_timescale(const AcReal scale)
+{
+    timescale = scale;
+}
--- a/src/standalone/model/host_timestep.h
+++ b/src/standalone/model/host_timestep.h
@@ -0,0 +1,32 @@
+/*
+    Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
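+ * \brief Host-side computation of the simulation timestep.
+ *
+ * Declares host_timestep() and set_timescale(), which sets a global scaling
+ * factor for the returned timestep.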
+ *
+ */
+#pragma once
+#include "astaroth.h"
+
+// Returns the timestep computed from the maximum velocity `umax` and the
+// parameters in `mesh_info`, multiplied by the factor set with set_timescale().
+AcReal host_timestep(const AcReal& umax, const AcMeshInfo& mesh_info);
+
+// Sets the global factor applied to the result of host_timestep().
+void set_timescale(const AcReal scale);
--- a/src/standalone/model/model_boundconds.cc
+++ b/src/standalone/model/model_boundconds.cc
@@ -0,0 +1,487 @@
+/*
+    Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
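+ * \brief Boundary conditions for the host-side model mesh.
+ *
+ * Fills the ghost zones of a ModelMesh by periodic wrap-around from the
+ * computational domain.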
+ *
+ */
+#include "model_boundconds.h"
+
+#include "core/errchk.h"
+
+
+/**
+ * Applies periodic boundary conditions to every vertex buffer of the mesh.
+ *
+ * Each point outside the computational domain [n*_min, n*_max) is overwritten
+ * with the value of the domain point obtained by wrapping its indices
+ * periodically with periods AC_nx, AC_ny and AC_nz. Points inside the
+ * computational domain are left untouched. The loop over vertex buffers is
+ * parallelized with OpenMP.
+ *
+ * @param mesh_info  Mesh dimensions and computational domain bounds.
+ * @param mesh       Mesh whose ghost zones are filled in place.
+ */
+void
+boundconds(const AcMeshInfo& mesh_info, ModelMesh* mesh)
+{
+    // Full extent of the mesh, ghost zones included
+    const int mx = mesh_info.int_params[AC_mx];
+    const int my = mesh_info.int_params[AC_my];
+    const int mz = mesh_info.int_params[AC_mz];
+
+    // Periods of the wrap-around
+    const int nx = mesh_info.int_params[AC_nx];
+    const int ny = mesh_info.int_params[AC_ny];
+    const int nz = mesh_info.int_params[AC_nz];
+
+    // Computational domain: lower bounds inclusive, upper bounds exclusive
+    const int nx_min = mesh_info.int_params[AC_nx_min];
+    const int ny_min = mesh_info.int_params[AC_ny_min];
+    const int nz_min = mesh_info.int_params[AC_nz_min];
+    const int nx_max = mesh_info.int_params[AC_nx_max];
+    const int ny_max = mesh_info.int_params[AC_ny_max];
+    const int nz_max = mesh_info.int_params[AC_nz_max];
+
+    #pragma omp parallel for
+    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w) {
+        for (int k_dst = 0; k_dst < mz; ++k_dst) {
+            for (int j_dst = 0; j_dst < my; ++j_dst) {
+                for (int i_dst = 0; i_dst < mx; ++i_dst) {
+
+                    // Boundary conditions only apply to the ghost zones:
+                    // skip destinations inside the computational domain
+                    const bool inside_x = nx_min <= i_dst && i_dst < nx_max;
+                    const bool inside_y = ny_min <= j_dst && j_dst < ny_max;
+                    const bool inside_z = nz_min <= k_dst && k_dst < nz_max;
+                    if (inside_x && inside_y && inside_z)
+                        continue;
+
+                    // Source index: shift to domain-local coordinates, add one
+                    // period so that the value is non-negative, wrap, and
+                    // shift back to mesh coordinates
+                    const int i_src = nx_min + (i_dst - nx_min + nx) % nx;
+                    const int j_src = ny_min + (j_dst - ny_min + ny) % ny;
+                    const int k_src = nz_min + (k_dst - nz_min + nz) % nz;
+
+                    const size_t src_idx = AC_VTXBUF_IDX(i_src, j_src, k_src, mesh_info);
+                    const size_t dst_idx = AC_VTXBUF_IDX(i_dst, j_dst, k_dst, mesh_info);
+                    ERRCHK(src_idx < AC_VTXBUF_SIZE(mesh_info));
+                    ERRCHK(dst_idx < AC_VTXBUF_SIZE(mesh_info));
+                    mesh->vertex_buffer[w][dst_idx] = mesh->vertex_buffer[w][src_idx];
+                }
+            }
+        }
+    }
+}
+
+#if 0
+void
+boundconds(const AcMeshInfo& mesh_info, ModelMesh* mesh)
+{
+    const int mx = mesh_info.int_params[AC_mx];
+    const int my = mesh_info.int_params[AC_my];
+    const int mz = mesh_info.int_params[AC_mz];
+
+    // Volatile here suppresses the warning about strict-overflow (i.e. compiler
+    // wanted to optimize these loops by assuming that kxb etc never overflow)
+    // However we do not need the performance improvement (~1-3%) and it's
+    // not either good to
+    //	a) get useless warnings originating from here
+    //	b) disable the warnings completely
+    volatile const int kxb = mesh_info.int_params[AC_nx_min];
+    volatile const int kyb = mesh_info.int_params[AC_ny_min];
+    volatile const int kzb = mesh_info.int_params[AC_nz_min];
+
+    // The old kxt was inclusive, but our mx_max is exclusive
+    volatile const int kxt = mesh_info.int_params[AC_nx_max] - 1;
+    volatile const int kyt = mesh_info.int_params[AC_ny_max] - 1;
+    volatile const int kzt = mesh_info.int_params[AC_nz_max] - 1;
+    const int bound[3]     = {0, 0, 0};
+
+    // Periodic boundary conditions
+    if (bound[0] == 0) {
+        for (int k = kzb; k <= kzt; k++) {
+            for (int j = kyb; j <= kyt; j++) {
+                for (int i = kxb; i <= kxb + 2; i++) {
+                    const int inds = i + j * mx + k * mx * my;
+                    const int indr = (kxt + i - 2) + j * mx + k * mx * my;
+                    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
+                        mesh->vertex_buffer[w]
+                                           [indr] = mesh->vertex_buffer[w]
+                                                                       [inds];
+                }
+                for (int i = kxt - 2; i <= kxt; i++) {
+                    const int inds = i + j * mx + k * mx * my;
+                    const int indr = (i - kxt + 2) + j * mx + k * mx * my;
+                    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
+                        mesh->vertex_buffer[w]
+                                           [indr] = mesh->vertex_buffer[w]
+                                                                       [inds];
+                }
+            }
+        }
+    }
+    if (bound[1] == 0) {
+        for (int k = kzb; k <= kzt; k++) {
+            for (int i = kxb; i <= kxt; i++) {
+                for (int j = kyb; j <= kyb + 2; j++) {
+                    const int inds = i + j * mx + k * mx * my;
+                    const int indr = i + (kyt + j - 2) * mx + k * mx * my;
+                    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
+                        mesh->vertex_buffer[w]
+                                           [indr] = mesh->vertex_buffer[w]
+                                                                       [inds];
+                }
+                for (int j = kyt - 2; j <= kyt; j++) {
+                    const int inds = i + j * mx + k * mx * my;
+                    const int indr = i + (j - kyt + 2) * mx + k * mx * my;
+                    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
+                        mesh->vertex_buffer[w]
+                                           [indr] = mesh->vertex_buffer[w]
+                                                                       [inds];
+                }
+            }
+        }
+    }
+
+    if (bound[2] == 0) {
+        for (int i = kxb; i <= kxt; i++) {
+            for (int j = kyb; j <= kyt; j++) {
+                for (int k = kzb; k <= kzb + 2; k++) {
+                    const int inds = i + j * mx + k * mx * my;
+                    const int indr = i + j * mx + (kzt + k - 2) * mx * my;
+                    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
+                        mesh->vertex_buffer[w]
+                                           [indr] = mesh->vertex_buffer[w]
+                                                                       [inds];
+                }
+                for (int k = kzt - 2; k <= kzt; k++) {
+                    const int inds = i + j * mx + k * mx * my;
+                    const int indr = i + j * mx + (k - kzt + 2) * mx * my;
+                    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
+                        mesh->vertex_buffer[w]
+                                           [indr] = mesh->vertex_buffer[w]
+                                                                       [inds];
+                }
+            }
+        }
+    }
+
+    // Copy the corners in the fully periodic case
+    if (bound[0] == 0 && bound[1] == 0 && bound[2] == 0) {
+        // Source corner: x=0, y=0, z=0
+        for (int i = kxb; i <= kxb + 2; i++) {
+            for (int j = kyb; j <= kyb + 2; j++) {
+                for (int k = kzb; k <= kzb + 2; k++) {
+                    const int inds = i + j * mx + k * mx * my;
+                    const int indr = (i + mx - STENCIL_ORDER) + (j + my - STENCIL_ORDER) * mx +
+                                     (k + mz - STENCIL_ORDER) * mx * my;
+                    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
+                        mesh->vertex_buffer[w]
+                                           [indr] = mesh->vertex_buffer[w]
+                                                                       [inds];
+                }
+            }
+        }
+        // Source corner: x=1, y=0, z=0
+        for (int i = kxt - 2; i <= kxt; i++) {
+            for (int j = kyb; j <= kyb + 2; j++) {
+                for (int k = kzb; k <= kzb + 2; k++) {
+                    const int inds = i + j * mx + k * mx * my;
+                    const int indr = (i - mx + STENCIL_ORDER) + (j + my - STENCIL_ORDER) * mx +
+                                     (k + mz - STENCIL_ORDER) * mx * my;
+                    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
+                        mesh->vertex_buffer[w]
+                                           [indr] = mesh->vertex_buffer[w]
+                                                                       [inds];
+                }
+            }
+        }
+        // Source corner: x=0, y=1, z=0
+        for (int i = kxb; i <= kxb + 2; i++) {
+            for (int j = kyt - 2; j <= kyt; j++) {
+                for (int k = kzb; k <= kzb + 2; k++) {
+                    const int inds = i + j * mx + k * mx * my;
+                    const int indr = (i + mx - STENCIL_ORDER) + (j - my + STENCIL_ORDER) * mx +
+                                     (k + mz - STENCIL_ORDER) * mx * my;
+                    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
+                        mesh->vertex_buffer[w]
+                                           [indr] = mesh->vertex_buffer[w]
+                                                                       [inds];
+                }
+            }
+        }
+        // Source corner: x=0, y=0, z=1
+        for (int i = kxb; i <= kxb + 2; i++) {
+            for (int j = kyb; j <= kyb + 2; j++) {
+                for (int k = kzt - 2; k <= kzt; k++) {
+                    const int inds = i + j * mx + k * mx * my;
+                    const int indr = (i + mx - STENCIL_ORDER) + (j + my - STENCIL_ORDER) * mx +
+                                     (k - mz + STENCIL_ORDER) * mx * my;
+                    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
+                        mesh->vertex_buffer[w]
+                                           [indr] = mesh->vertex_buffer[w]
+                                                                       [inds];
+                }
+            }
+        }
+        // Source corner: x=1, y=1, z=0
+        for (int i = kxt - 2; i <= kxt; i++) {
+            for (int j = kyt - 2; j <= kyt; j++) {
+                for (int k = kzb; k <= kzb + 2; k++) {
+                    const int inds = i + j * mx + k * mx * my;
+                    const int indr = (i - mx + STENCIL_ORDER) + (j - my + STENCIL_ORDER) * mx +
+                                     (k + mz - STENCIL_ORDER) * mx * my;
+                    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
+                        mesh->vertex_buffer[w]
+                                           [indr] = mesh->vertex_buffer[w]
+                                                                       [inds];
+                }
+            }
+        }
+        // Source corner: x=1, y=0, z=1
+        for (int i = kxt - 2; i <= kxt; i++) {
+            for (int j = kyb; j <= kyb + 2; j++) {
+                for (int k = kzt - 2; k <= kzt; k++) {
+                    const int inds = i + j * mx + k * mx * my;
+                    const int indr = (i - mx + STENCIL_ORDER) + (j + my - STENCIL_ORDER) * mx +
+                                     (k - mz + STENCIL_ORDER) * mx * my;
+                    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
+                        mesh->vertex_buffer[w]
+                                           [indr] = mesh->vertex_buffer[w]
+                                                                       [inds];
+                }
+            }
+        }
+        // Source corner: x=0, y=1, z=1
+        for (int i = kxb; i <= kxb + 2; i++) {
+            for (int j = kyt - 2; j <= kyt; j++) {
+                for (int k = kzt - 2; k <= kzt; k++) {
+                    const int inds = i + j * mx + k * mx * my;
+                    const int indr = (i + mx - STENCIL_ORDER) + (j - my + STENCIL_ORDER) * mx +
+                                     (k - mz + STENCIL_ORDER) * mx * my;
+                    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
+                        mesh->vertex_buffer[w]
+                                           [indr] = mesh->vertex_buffer[w]
+                                                                       [inds];
+                }
+            }
+        }
+        // Source corner: x=1, y=1, z=1
+        for (int i = kxt - 2; i <= kxt; i++) {
+            for (int j = kyt - 2; j <= kyt; j++) {
+                for (int k = kzt - 2; k <= kzt; k++) {
+                    const int inds = i + j * mx + k * mx * my;
+                    const int indr = (i - mx + STENCIL_ORDER) + (j - my + STENCIL_ORDER) * mx +
+                                     (k - mz + STENCIL_ORDER) * mx * my;
+                    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
+                        mesh->vertex_buffer[w]
+                                           [indr] = mesh->vertex_buffer[w]
+                                                                       [inds];
+                }
+            }
+        }
+    }
+    else {
+        ERROR("ONLY FULLY PERIODIC WORKS WITH CORNERS SO FAR! \n");
+    }
+
+    // Copy the edges in the fully periodic case
+    if (bound[0] == 0 && bound[1] == 0 && bound[2] == 0) {
+        // Source edge: x = 0, y = 0
+        for (int i = kxb; i <= kxb + 2; i++) {
+            for (int j = kyb; j <= kyb + 2; j++) {
+                for (int k = kzb; k <= kzt; k++) {
+                    const int inds = i + j * mx + k * mx * my;
+                    const int indr = (i + mx - STENCIL_ORDER) + (j + my - STENCIL_ORDER) * mx +
+                                     k * mx * my;
+                    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
+                        mesh->vertex_buffer[w]
+                                           [indr] = mesh->vertex_buffer[w]
+                                                                       [inds];
+                }
+            }
+        }
+        // Source edge: x = 1, y = 0
+        for (int i = kxt - 2; i <= kxt; i++) {
+            for (int j = kyb; j <= kyb + 2; j++) {
+                for (int k = kzb; k <= kzt; k++) {
+                    const int inds = i + j * mx + k * mx * my;
+                    const int indr = (i - mx + STENCIL_ORDER) + (j + my - STENCIL_ORDER) * mx +
+                                     k * mx * my;
+                    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
+                        mesh->vertex_buffer[w]
+                                           [indr] = mesh->vertex_buffer[w]
+                                                                       [inds];
+                }
+            }
+        }
+        // Source edge: x = 0, y = 1
+        for (int i = kxb; i <= kxb + 2; i++) {
+            for (int j = kyt - 2; j <= kyt; j++) {
+                for (int k = kzb; k <= kzt; k++) {
+                    const int inds = i + j * mx + k * mx * my;
+                    const int indr = (i + mx - STENCIL_ORDER) + (j - my + STENCIL_ORDER) * mx +
+                                     k * mx * my;
+                    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
+                        mesh->vertex_buffer[w]
+                                           [indr] = mesh->vertex_buffer[w]
+                                                                       [inds];
+                }
+            }
+        }
+        // Source edge: x = 1, y = 1
+        for (int i = kxt - 2; i <= kxt; i++) {
+            for (int j = kyt - 2; j <= kyt; j++) {
+                for (int k = kzb; k <= kzt; k++) {
+                    const int inds = i + j * mx + k * mx * my;
+                    const int indr = (i - mx + STENCIL_ORDER) + (j - my + STENCIL_ORDER) * mx +
+                                     k * mx * my;
+                    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
+                        mesh->vertex_buffer[w]
+                                           [indr] = mesh->vertex_buffer[w]
+                                                                       [inds];
+                }
+            }
+        }
+        // Source edge: x = 0, z = 0
+        for (int i = kxb; i <= kxb + 2; i++) {
+            for (int j = kyb; j <= kyt; j++) {
+                for (int k = kzb; k <= kzb + 2; k++) {
+                    const int inds = i + j * mx + k * mx * my;
+                    const int indr = (i + mx - STENCIL_ORDER) + j * mx +
+                                     (k + mz - STENCIL_ORDER) * mx * my;
+                    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
+                        mesh->vertex_buffer[w]
+                                           [indr] = mesh->vertex_buffer[w]
+                                                                       [inds];
+                }
+            }
+        }
+        // Source edge: x = 1, z = 0
+        for (int i = kxt - 2; i <= kxt; i++) {
+            for (int j = kyb; j <= kyt; j++) {
+                for (int k = kzb; k <= kzb + 2; k++) {
+                    const int inds = i + j * mx + k * mx * my;
+                    const int indr = (i - mx + STENCIL_ORDER) + j * mx +
+                                     (k + mz - STENCIL_ORDER) * mx * my;
+                    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
+                        mesh->vertex_buffer[w]
+                                           [indr] = mesh->vertex_buffer[w]
+                                                                       [inds];
+                }
+            }
+        }
+        // Source edge: x = 0, z = 1
+        for (int i = kxb; i <= kxb + 2; i++) {
+            for (int j = kyb; j <= kyt; j++) {
+                for (int k = kzt - 2; k <= kzt; k++) {
+                    const int inds = i + j * mx + k * mx * my;
+                    const int indr = (i + mx - STENCIL_ORDER) + j * mx +
+                                     (k - mz + STENCIL_ORDER) * mx * my;
+                    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
+                        mesh->vertex_buffer[w]
+                                           [indr] = mesh->vertex_buffer[w]
+                                                                       [inds];
+                }
+            }
+        }
+        // Source edge: x = 1, z = 1
+        for (int i = kxt - 2; i <= kxt; i++) {
+            for (int j = kyb; j <= kyt; j++) {
+                for (int k = kzt - 2; k <= kzt; k++) {
+                    const int inds = i + j * mx + k * mx * my;
+                    const int indr = (i - mx + STENCIL_ORDER) + j * mx +
+                                     (k - mz + STENCIL_ORDER) * mx * my;
+                    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
+                        mesh->vertex_buffer[w]
+                                           [indr] = mesh->vertex_buffer[w]
+                                                                       [inds];
+                }
+            }
+        }
+        // Source edge: y = 0, z = 0
+        for (int i = kxb; i <= kxt; i++) {
+            for (int j = kyb; j <= kyb + 2; j++) {
+                for (int k = kzb; k <= kzb + 2; k++) {
+                    const int inds = i + j * mx + k * mx * my;
+                    const int indr = i + (j + my - STENCIL_ORDER) * mx +
+                                     (k + mz - STENCIL_ORDER) * mx * my;
+                    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
+                        mesh->vertex_buffer[w]
+                                           [indr] = mesh->vertex_buffer[w]
+                                                                       [inds];
+                }
+            }
+        }
+        // Source edge: y = 1, z = 0
+        for (int i = kxb; i <= kxt; i++) {
+            for (int j = kyt - 2; j <= kyt; j++) {
+                for (int k = kzb; k <= kzb + 2; k++) {
+                    const int inds = i + j * mx + k * mx * my;
+                    const int indr = i + (j - my + STENCIL_ORDER) * mx +
+                                     (k + mz - STENCIL_ORDER) * mx * my;
+                    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
+                        mesh->vertex_buffer[w]
+                                           [indr] = mesh->vertex_buffer[w]
+                                                                       [inds];
+                }
+            }
+        }
+        // Source edge: y = 0, z = 1
+        for (int i = kxb; i <= kxt; i++) {
+            for (int j = kyb; j <= kyb + 2; j++) {
+                for (int k = kzt - 2; k <= kzt; k++) {
+                    const int inds = i + j * mx + k * mx * my;
+                    const int indr = i + (j + my - STENCIL_ORDER) * mx +
+                                     (k - mz + STENCIL_ORDER) * mx * my;
+                    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
+                        mesh->vertex_buffer[w]
+                                           [indr] = mesh->vertex_buffer[w]
+                                                                       [inds];
+                }
+            }
+        }
+        // Source edge: y = 1, z = 1
+        for (int i = kxb; i <= kxt; i++) {
+            for (int j = kyt - 2; j <= kyt; j++) {
+                for (int k = kzt - 2; k <= kzt; k++) {
+                    const int inds = i + j * mx + k * mx * my;
+                    const int indr = i + (j - my + STENCIL_ORDER) * mx +
+                                     (k - mz + STENCIL_ORDER) * mx * my;
+                    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
+                        mesh->vertex_buffer[w]
+                                           [indr] = mesh->vertex_buffer[w]
+                                                                       [inds];
+                }
+            }
+        }
+    }
+    else {
+        ERROR("ONLY FULLY PERIODIC WORKS WITH EDGES SO FAR! \n");
+    }
+}
+#endif
--- a/src/standalone/model/model_boundconds.h
+++ b/src/standalone/model/model_boundconds.h
@@ -0,0 +1,31 @@
+/*
+    Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
+ * \brief Brief info.
+ *
+ * Detailed info.
+ *
+ */
+#pragma once
+#include "astaroth.h"
+#include "modelmesh.h"
+
+// NOTE(review): the definition is not visible from this header (presumably
+// model_boundconds.cc). Presumably applies the boundary conditions selected
+// in mesh_info to the vertex buffers of mesh in place - confirm against the
+// definition.
+void boundconds(const AcMeshInfo& mesh_info, ModelMesh* mesh);
--- a/src/standalone/model/model_diff.h
+++ b/src/standalone/model/model_diff.h
@@ -0,0 +1,353 @@
+/*
+    Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
+ * \brief Brief info.
+ *
+ * Detailed info.
+ *
+ */
+#pragma once
+#include "core/errchk.h"
+
+// Scalar type used by the model differencing routines in this header.
+typedef long double MODEL_REAL;
+
+// Coordinate axis selector, used as the template argument of der_scal and
+// der2_scal. NUM_AXIS_TYPES is the number of axes, not a valid axis.
+typedef enum { AXIS_X, AXIS_Y, AXIS_Z, NUM_AXIS_TYPES } AxisType;
+
+/**
+ * First derivative of scal along the template axis at vertex (i, j, k).
+ *
+ * Samples the three neighbours on either side of the vertex along the chosen
+ * axis and returns
+ *   ((f(+3) - f(-3)) - 9 (f(+2) - f(-2)) + 45 (f(+1) - f(-1))) / (60 ds),
+ * where ds is the AC_dsx, AC_dsy or AC_dsz entry of mesh_info.real_params
+ * that matches the axis. The caller must make sure that indices up to three
+ * vertices away from (i, j, k) along that axis are valid for scal.
+ */
+template <AxisType axis>
+static inline MODEL_REAL
+der_scal(const int& i, const int& j, const int& k, const AcMeshInfo& mesh_info,
+         const MODEL_REAL* scal)
+{
+    // Unit step along the differentiation axis and the matching grid spacing
+    int di = 0, dj = 0, dk = 0;
+    MODEL_REAL spacing;
+
+    switch (axis) {
+    case AXIS_X:
+        di      = 1;
+        spacing = mesh_info.real_params[AC_dsx];
+        break;
+    case AXIS_Y:
+        dj      = 1;
+        spacing = mesh_info.real_params[AC_dsy];
+        break;
+    case AXIS_Z:
+        dk      = 1;
+        spacing = mesh_info.real_params[AC_dsz];
+        break;
+    default:
+        ERROR("Unknown axis type");
+    }
+
+    // Neighbours at offsets -3..-1 ("below") and +1..+3 ("above") on the axis
+    const MODEL_REAL below3 = scal[AC_VTXBUF_IDX(i - 3 * di, j - 3 * dj, k - 3 * dk, mesh_info)];
+    const MODEL_REAL below2 = scal[AC_VTXBUF_IDX(i - 2 * di, j - 2 * dj, k - 2 * dk, mesh_info)];
+    const MODEL_REAL below1 = scal[AC_VTXBUF_IDX(i - di, j - dj, k - dk, mesh_info)];
+    const MODEL_REAL above1 = scal[AC_VTXBUF_IDX(i + di, j + dj, k + dk, mesh_info)];
+    const MODEL_REAL above2 = scal[AC_VTXBUF_IDX(i + 2 * di, j + 2 * dj, k + 2 * dk, mesh_info)];
+    const MODEL_REAL above3 = scal[AC_VTXBUF_IDX(i + 3 * di, j + 3 * dj, k + 3 * dk, mesh_info)];
+
+    const MODEL_REAL numerator = (above3 - below3) +
+                                 MODEL_REAL(-9.) * (above2 - below2) +
+                                 MODEL_REAL(45.) * (above1 - below1);
+    return numerator / (MODEL_REAL(60.) * spacing);
+}
+
+/**
+ * Second derivative of scal along the template axis at vertex (i, j, k).
+ *
+ * Combines the centre value f(0) with the three neighbours on either side
+ * along the chosen axis:
+ *   (2 (f(-3) + f(+3)) - 27 (f(-2) + f(+2)) + 270 (f(-1) + f(+1)) - 490 f(0))
+ *   / (180 ds^2),
+ * where ds is the AC_dsx, AC_dsy or AC_dsz entry of mesh_info.real_params
+ * that matches the axis. The caller must make sure that indices up to three
+ * vertices away from (i, j, k) along that axis are valid for scal.
+ */
+template <AxisType axis>
+static inline MODEL_REAL
+der2_scal(const int& i, const int& j, const int& k, const AcMeshInfo& mesh_info,
+          const MODEL_REAL* scal)
+{
+    const MODEL_REAL centre = scal[AC_VTXBUF_IDX(i, j, k, mesh_info)];
+
+    // Unit step along the differentiation axis and the matching grid spacing
+    int di = 0, dj = 0, dk = 0;
+    MODEL_REAL spacing;
+
+    switch (axis) {
+    case AXIS_X:
+        di      = 1;
+        spacing = mesh_info.real_params[AC_dsx];
+        break;
+    case AXIS_Y:
+        dj      = 1;
+        spacing = mesh_info.real_params[AC_dsy];
+        break;
+    case AXIS_Z:
+        dk      = 1;
+        spacing = mesh_info.real_params[AC_dsz];
+        break;
+    default:
+        ERROR("Unknown axis type");
+    }
+
+    // Neighbours at offsets -3..-1 ("below") and +1..+3 ("above") on the axis
+    const MODEL_REAL below3 = scal[AC_VTXBUF_IDX(i - 3 * di, j - 3 * dj, k - 3 * dk, mesh_info)];
+    const MODEL_REAL below2 = scal[AC_VTXBUF_IDX(i - 2 * di, j - 2 * dj, k - 2 * dk, mesh_info)];
+    const MODEL_REAL below1 = scal[AC_VTXBUF_IDX(i - di, j - dj, k - dk, mesh_info)];
+    const MODEL_REAL above1 = scal[AC_VTXBUF_IDX(i + di, j + dj, k + dk, mesh_info)];
+    const MODEL_REAL above2 = scal[AC_VTXBUF_IDX(i + 2 * di, j + 2 * dj, k + 2 * dk, mesh_info)];
+    const MODEL_REAL above3 = scal[AC_VTXBUF_IDX(i + 3 * di, j + 3 * dj, k + 3 * dk, mesh_info)];
+
+    const MODEL_REAL numerator = MODEL_REAL(2.) * (below3 + above3) +
+                                 MODEL_REAL(-27.) * (below2 + above2) +
+                                 MODEL_REAL(270.) * (below1 + above1) +
+                                 MODEL_REAL(-490.) * centre;
+    return numerator / (MODEL_REAL(180.) * spacing * spacing);
+}
+
+/**
+ * Laplacian of scal at vertex (i, j, k): the sum of the second derivatives
+ * (der2_scal) along x, y and z.
+ */
+static MODEL_REAL
+laplace_scal(const int& i, const int& j, const int& k,
+             const AcMeshInfo& mesh_info, const MODEL_REAL* scal)
+{
+    const MODEL_REAL d2_dx2 = der2_scal<AXIS_X>(i, j, k, mesh_info, scal);
+    const MODEL_REAL d2_dy2 = der2_scal<AXIS_Y>(i, j, k, mesh_info, scal);
+    const MODEL_REAL d2_dz2 = der2_scal<AXIS_Z>(i, j, k, mesh_info, scal);
+
+    return d2_dx2 + d2_dy2 + d2_dz2;
+}
+
+/**
+ * Component-wise Laplacian of the vector field (vec_x, vec_y, vec_z) at
+ * vertex (i, j, k). The results are stored through laplace_x, laplace_y and
+ * laplace_z, in that order.
+ */
+static void
+laplace_vec(const int& i, const int& j, const int& k,
+            const AcMeshInfo& mesh_info, const MODEL_REAL* vec_x,
+            const MODEL_REAL* vec_y, const MODEL_REAL* vec_z, MODEL_REAL* laplace_x,
+            MODEL_REAL* laplace_y, MODEL_REAL* laplace_z)
+{
+    const MODEL_REAL* const components[3] = {vec_x, vec_y, vec_z};
+    MODEL_REAL* const results[3]          = {laplace_x, laplace_y, laplace_z};
+
+    // Each result is stored before the next component is evaluated
+    for (int c = 0; c < 3; ++c)
+        *results[c] = laplace_scal(i, j, k, mesh_info, components[c]);
+}
+
+/**
+ * Divergence of the vector field (vec_x, vec_y, vec_z) at vertex (i, j, k):
+ * d(vec_x)/dx + d(vec_y)/dy + d(vec_z)/dz, each term computed with der_scal.
+ */
+static MODEL_REAL
+div_vec(const int& i, const int& j, const int& k, const AcMeshInfo& mesh_info,
+        const MODEL_REAL* vec_x, const MODEL_REAL* vec_y, const MODEL_REAL* vec_z)
+{
+    const MODEL_REAL dvx_dx = der_scal<AXIS_X>(i, j, k, mesh_info, vec_x);
+    const MODEL_REAL dvy_dy = der_scal<AXIS_Y>(i, j, k, mesh_info, vec_y);
+    const MODEL_REAL dvz_dz = der_scal<AXIS_Z>(i, j, k, mesh_info, vec_z);
+
+    return dvx_dx + dvy_dy + dvz_dz;
+}
+
+/**
+ * Gradient of scal at vertex (i, j, k). The x, y and z derivatives (der_scal)
+ * are stored through res_x, res_y and res_z, in that order.
+ */
+static void
+grad(const int& i, const int& j, const int& k, const AcMeshInfo& mesh_info,
+     const MODEL_REAL* scal, MODEL_REAL* res_x, MODEL_REAL* res_y, MODEL_REAL* res_z)
+{
+    // Each component is stored before the next derivative is evaluated
+    const MODEL_REAL ddx = der_scal<AXIS_X>(i, j, k, mesh_info, scal);
+    res_x[0]             = ddx;
+
+    const MODEL_REAL ddy = der_scal<AXIS_Y>(i, j, k, mesh_info, scal);
+    res_y[0]             = ddy;
+
+    const MODEL_REAL ddz = der_scal<AXIS_Z>(i, j, k, mesh_info, scal);
+    res_z[0]             = ddz;
+}
+
+/**
+ * Directional derivative (vec . nabla) scal at vertex (i, j, k): the dot
+ * product of the vector (vec_x, vec_y, vec_z) sampled at the vertex with the
+ * gradient of scal at the same vertex.
+ */
+static MODEL_REAL
+vec_dot_nabla_scal(const int& i, const int& j, const int& k,
+                   const AcMeshInfo& mesh_info, const MODEL_REAL* vec_x,
+                   const MODEL_REAL* vec_y, const MODEL_REAL* vec_z, const MODEL_REAL* scal)
+{
+    const int vertex = AC_VTXBUF_IDX(i, j, k, mesh_info);
+
+    MODEL_REAL gradient[3];
+    grad(i, j, k, mesh_info, scal, &gradient[0], &gradient[1], &gradient[2]);
+
+    const MODEL_REAL term_x = vec_x[vertex] * gradient[0];
+    const MODEL_REAL term_y = vec_y[vertex] * gradient[1];
+    const MODEL_REAL term_z = vec_z[vertex] * gradient[2];
+
+    return term_x + term_y + term_z;
+}
+
+/*
+ * =============================================================================
+ * Viscosity
+ * =============================================================================
+ */
+// Selects the coordinate plane (pair of axes) of the mixed second derivative
+// computed by dernm_scal.
+typedef enum { DERNM_XY, DERNM_YZ, DERNM_XZ } DernmType;
+
+/**
+ * Mixed second derivative of scal at vertex (i, j, k) in the plane selected
+ * by the template argument (DERNM_XY, DERNM_YZ or DERNM_XZ).
+ *
+ * The stencil uses the twelve diagonal neighbours at offsets (+-m, +-m),
+ * m = 1, 2, 3, in that plane. In the f_<a>_<b> names, <a> is the offset along
+ * the first axis of the pair and <b> the offset along the second one
+ * (p = plus, m = minus). The result is
+ *   (270 D1 - 27 D2 + 2 D3) / (720 * ds_first * ds_second),
+ * with Dm = f(+m,+m) - f(-m,+m) + f(-m,-m) - f(+m,-m) and ds_* the grid
+ * spacings (AC_dsx, AC_dsy, AC_dsz) of the two axes.
+ *
+ * The caller must make sure that indices up to three vertices away from
+ * (i, j, k) along both axes of the plane are valid for scal.
+ */
+template <DernmType dernm>
+static MODEL_REAL
+dernm_scal(const int& i, const int& j, const int& k,
+           const AcMeshInfo& mesh_info, const MODEL_REAL* scal)
+{
+
+    MODEL_REAL fac;
+
+    const MODEL_REAL dsx = mesh_info.real_params[AC_dsx];
+    const MODEL_REAL dsy = mesh_info.real_params[AC_dsy];
+    const MODEL_REAL dsz = mesh_info.real_params[AC_dsz];
+
+    // The division is done in MODEL_REAL: writing MODEL_REAL(1. / 720.) would
+    // round the quotient to double before widening it.
+    const MODEL_REAL inv_720 = MODEL_REAL(1.) / MODEL_REAL(720.);
+
+    MODEL_REAL f_p1_p1, f_m1_p1, f_m1_m1, f_p1_m1;
+    MODEL_REAL f_p2_p2, f_m2_p2, f_m2_m2, f_p2_m2;
+    MODEL_REAL f_p3_p3, f_m3_p3, f_m3_m3, f_p3_m3;
+
+    switch (dernm) {
+    case DERNM_XY:
+        fac     = inv_720 * (MODEL_REAL(1.) / dsx) * (MODEL_REAL(1.) / dsy);
+        f_p1_p1 = scal[AC_VTXBUF_IDX(i + 1, j + 1, k, mesh_info)];
+        f_m1_p1 = scal[AC_VTXBUF_IDX(i - 1, j + 1, k, mesh_info)];
+        f_m1_m1 = scal[AC_VTXBUF_IDX(i - 1, j - 1, k, mesh_info)];
+        f_p1_m1 = scal[AC_VTXBUF_IDX(i + 1, j - 1, k, mesh_info)];
+
+        f_p2_p2 = scal[AC_VTXBUF_IDX(i + 2, j + 2, k, mesh_info)];
+        f_m2_p2 = scal[AC_VTXBUF_IDX(i - 2, j + 2, k, mesh_info)];
+        f_m2_m2 = scal[AC_VTXBUF_IDX(i - 2, j - 2, k, mesh_info)];
+        f_p2_m2 = scal[AC_VTXBUF_IDX(i + 2, j - 2, k, mesh_info)];
+
+        f_p3_p3 = scal[AC_VTXBUF_IDX(i + 3, j + 3, k, mesh_info)];
+        f_m3_p3 = scal[AC_VTXBUF_IDX(i - 3, j + 3, k, mesh_info)];
+        f_m3_m3 = scal[AC_VTXBUF_IDX(i - 3, j - 3, k, mesh_info)];
+        f_p3_m3 = scal[AC_VTXBUF_IDX(i + 3, j - 3, k, mesh_info)];
+        break;
+    case DERNM_YZ:
+        // NOTE this is a bit different from the old one, second is j+1k-1
+        // instead of j-1,k+1. Both (-,+) and (+,-) samples enter the stencil
+        // below with the same (negative) sign, so their order does not
+        // affect which terms are combined.
+        fac     = inv_720 * (MODEL_REAL(1.) / dsy) * (MODEL_REAL(1.) / dsz);
+        f_p1_p1 = scal[AC_VTXBUF_IDX(i, j + 1, k + 1, mesh_info)];
+        f_m1_p1 = scal[AC_VTXBUF_IDX(i, j - 1, k + 1, mesh_info)];
+        f_m1_m1 = scal[AC_VTXBUF_IDX(i, j - 1, k - 1, mesh_info)];
+        f_p1_m1 = scal[AC_VTXBUF_IDX(i, j + 1, k - 1, mesh_info)];
+
+        f_p2_p2 = scal[AC_VTXBUF_IDX(i, j + 2, k + 2, mesh_info)];
+        f_m2_p2 = scal[AC_VTXBUF_IDX(i, j - 2, k + 2, mesh_info)];
+        f_m2_m2 = scal[AC_VTXBUF_IDX(i, j - 2, k - 2, mesh_info)];
+        f_p2_m2 = scal[AC_VTXBUF_IDX(i, j + 2, k - 2, mesh_info)];
+
+        f_p3_p3 = scal[AC_VTXBUF_IDX(i, j + 3, k + 3, mesh_info)];
+        f_m3_p3 = scal[AC_VTXBUF_IDX(i, j - 3, k + 3, mesh_info)];
+        f_m3_m3 = scal[AC_VTXBUF_IDX(i, j - 3, k - 3, mesh_info)];
+        f_p3_m3 = scal[AC_VTXBUF_IDX(i, j + 3, k - 3, mesh_info)];
+        break;
+    case DERNM_XZ:
+        fac     = inv_720 * (MODEL_REAL(1.) / dsx) * (MODEL_REAL(1.) / dsz);
+        f_p1_p1 = scal[AC_VTXBUF_IDX(i + 1, j, k + 1, mesh_info)];
+        f_m1_p1 = scal[AC_VTXBUF_IDX(i - 1, j, k + 1, mesh_info)];
+        f_m1_m1 = scal[AC_VTXBUF_IDX(i - 1, j, k - 1, mesh_info)];
+        f_p1_m1 = scal[AC_VTXBUF_IDX(i + 1, j, k - 1, mesh_info)];
+
+        f_p2_p2 = scal[AC_VTXBUF_IDX(i + 2, j, k + 2, mesh_info)];
+        f_m2_p2 = scal[AC_VTXBUF_IDX(i - 2, j, k + 2, mesh_info)];
+        f_m2_m2 = scal[AC_VTXBUF_IDX(i - 2, j, k - 2, mesh_info)];
+        f_p2_m2 = scal[AC_VTXBUF_IDX(i + 2, j, k - 2, mesh_info)];
+
+        f_p3_p3 = scal[AC_VTXBUF_IDX(i + 3, j, k + 3, mesh_info)];
+        f_m3_p3 = scal[AC_VTXBUF_IDX(i - 3, j, k + 3, mesh_info)];
+        f_m3_m3 = scal[AC_VTXBUF_IDX(i - 3, j, k - 3, mesh_info)];
+        f_p3_m3 = scal[AC_VTXBUF_IDX(i + 3, j, k - 3, mesh_info)];
+        break;
+    default:
+        ERROR("Invalid dernm type");
+    }
+    return fac * (MODEL_REAL(270.) * (f_p1_p1 - f_m1_p1 + f_m1_m1 - f_p1_m1) -
+                  MODEL_REAL(27.) * (f_p2_p2 - f_m2_p2 + f_m2_m2 - f_p2_m2) +
+                  MODEL_REAL(2.) * (f_p3_p3 - f_m3_p3 + f_m3_m3 - f_p3_m3));
+}
+
+/**
+ * Gradient of the divergence of the vector field (vec_x, vec_y, vec_z) at
+ * vertex (i, j, k). Each output component is the sum of one pure second
+ * derivative (der2_scal) and two mixed second derivatives (dernm_scal); the
+ * results are stored through gdvx, gdvy and gdvz, in that order.
+ */
+static void
+grad_div_vec(const int& i, const int& j, const int& k,
+             const AcMeshInfo& mesh_info, const MODEL_REAL* vec_x,
+             const MODEL_REAL* vec_y, const MODEL_REAL* vec_z, MODEL_REAL* gdvx,
+             MODEL_REAL* gdvy, MODEL_REAL* gdvz)
+{
+    // x component: d/dx (d vec_x/dx + d vec_y/dy + d vec_z/dz)
+    const MODEL_REAL x_from_vx = der2_scal<AXIS_X>(i, j, k, mesh_info, vec_x);
+    const MODEL_REAL x_from_vy = dernm_scal<DERNM_XY>(i, j, k, mesh_info, vec_y);
+    const MODEL_REAL x_from_vz = dernm_scal<DERNM_XZ>(i, j, k, mesh_info, vec_z);
+    *gdvx                      = x_from_vx + x_from_vy + x_from_vz;
+
+    // y component: d/dy (d vec_x/dx + d vec_y/dy + d vec_z/dz)
+    const MODEL_REAL y_from_vx = dernm_scal<DERNM_XY>(i, j, k, mesh_info, vec_x);
+    const MODEL_REAL y_from_vy = der2_scal<AXIS_Y>(i, j, k, mesh_info, vec_y);
+    const MODEL_REAL y_from_vz = dernm_scal<DERNM_YZ>(i, j, k, mesh_info, vec_z);
+    *gdvy                      = y_from_vx + y_from_vy + y_from_vz;
+
+    // z component: d/dz (d vec_x/dx + d vec_y/dy + d vec_z/dz)
+    const MODEL_REAL z_from_vx = dernm_scal<DERNM_XZ>(i, j, k, mesh_info, vec_x);
+    const MODEL_REAL z_from_vy = dernm_scal<DERNM_YZ>(i, j, k, mesh_info, vec_y);
+    const MODEL_REAL z_from_vz = der2_scal<AXIS_Z>(i, j, k, mesh_info, vec_z);
+    *gdvz                      = z_from_vx + z_from_vy + z_from_vz;
+}
+
+/**
+ * Computes S . grad(lnrho) at vertex (i, j, k), where S is the symmetric,
+ * traceless tensor built from the gradients of (vec_x, vec_y, vec_z):
+ *   S_ab = 1/2 (d_a vec_b + d_b vec_a) - 1/3 delta_ab div(vec)
+ * (the traceless rate-of-strain tensor if vec is a velocity field).
+ *
+ * The three components of the product are stored through sgrhox, sgrhoy and
+ * sgrhoz. All derivatives are taken with der_scal, so the caller must make
+ * sure that indices up to three vertices away from (i, j, k) along every
+ * axis are valid for the input arrays.
+ */
+static void
+S_grad_lnrho(const int& i, const int& j, const int& k,
+             const AcMeshInfo& mesh_info, const MODEL_REAL* vec_x,
+             const MODEL_REAL* vec_y, const MODEL_REAL* vec_z, const MODEL_REAL* lnrho,
+             MODEL_REAL* sgrhox, MODEL_REAL* sgrhoy, MODEL_REAL* sgrhoz)
+{
+    // Divide in MODEL_REAL: MODEL_REAL(2. / 3.) would round the quotient to
+    // double before widening it.
+    const MODEL_REAL c23 = MODEL_REAL(2.) / MODEL_REAL(3.);
+    const MODEL_REAL c13 = MODEL_REAL(1.) / MODEL_REAL(3.);
+
+    // Gradient components of the vector field, each evaluated once:
+    // dvA_dB = d(vec_A)/dB
+    const MODEL_REAL dvx_dx = der_scal<AXIS_X>(i, j, k, mesh_info, vec_x);
+    const MODEL_REAL dvx_dy = der_scal<AXIS_Y>(i, j, k, mesh_info, vec_x);
+    const MODEL_REAL dvx_dz = der_scal<AXIS_Z>(i, j, k, mesh_info, vec_x);
+
+    const MODEL_REAL dvy_dx = der_scal<AXIS_X>(i, j, k, mesh_info, vec_y);
+    const MODEL_REAL dvy_dy = der_scal<AXIS_Y>(i, j, k, mesh_info, vec_y);
+    const MODEL_REAL dvy_dz = der_scal<AXIS_Z>(i, j, k, mesh_info, vec_y);
+
+    const MODEL_REAL dvz_dx = der_scal<AXIS_X>(i, j, k, mesh_info, vec_z);
+    const MODEL_REAL dvz_dy = der_scal<AXIS_Y>(i, j, k, mesh_info, vec_z);
+    const MODEL_REAL dvz_dz = der_scal<AXIS_Z>(i, j, k, mesh_info, vec_z);
+
+    // Diagonal entries: 2/3 of the own-axis derivative minus 1/3 of the other
+    // two. Szz uses d(vec_z)/dz, following the same pattern as Sxx and Syy.
+    const MODEL_REAL Sxx = c23 * dvx_dx - c13 * (dvy_dy + dvz_dz);
+    const MODEL_REAL Syy = c23 * dvy_dy - c13 * (dvx_dx + dvz_dz);
+    const MODEL_REAL Szz = c23 * dvz_dz - c13 * (dvx_dx + dvy_dy);
+
+    // Off-diagonal entries (the tensor is symmetric)
+    const MODEL_REAL Sxy = MODEL_REAL(.5) * (dvx_dy + dvy_dx);
+    const MODEL_REAL Sxz = MODEL_REAL(.5) * (dvx_dz + dvz_dx);
+    const MODEL_REAL Syz = MODEL_REAL(.5) * (dvy_dz + dvz_dy);
+
+    const MODEL_REAL Syx = Sxy;
+    const MODEL_REAL Szx = Sxz;
+    const MODEL_REAL Szy = Syz;
+
+    // Grad lnrho
+    MODEL_REAL glnx, glny, glnz;
+    grad(i, j, k, mesh_info, lnrho, &glnx, &glny, &glnz);
+
+    *sgrhox = Sxx * glnx + Sxy * glny + Sxz * glnz;
+    *sgrhoy = Syx * glnx + Syy * glny + Syz * glnz;
+    *sgrhoz = Szx * glnx + Szy * glny + Szz * glnz;
+}
+
+/**
+ * Viscous term at vertex (i, j, k) for the vector field (vec_x, vec_y, vec_z)
+ * and the scalar field scal (passed to S_grad_lnrho as lnrho):
+ *
+ *   visc = nu * (laplace(vec) + 1/3 grad(div(vec)) + 2 S.grad(scal))
+ *        + zeta * grad(div(vec)),
+ *
+ * with nu = mesh_info.real_params[AC_nu_visc] and
+ * zeta = mesh_info.real_params[AC_zeta]. The three components are stored
+ * through visc_x, visc_y and visc_z after all inputs have been evaluated.
+ */
+static void
+nu_const(const int& i, const int& j, const int& k, const AcMeshInfo& mesh_info,
+         const MODEL_REAL* vec_x, const MODEL_REAL* vec_y, const MODEL_REAL* vec_z,
+         const MODEL_REAL* scal, MODEL_REAL* visc_x, MODEL_REAL* visc_y, MODEL_REAL* visc_z)
+{
+    MODEL_REAL lx, ly, lz;
+    laplace_vec(i, j, k, mesh_info, vec_x, vec_y, vec_z, &lx, &ly, &lz);
+
+    MODEL_REAL gx, gy, gz;
+    grad_div_vec(i, j, k, mesh_info, vec_x, vec_y, vec_z, &gx, &gy, &gz);
+
+    MODEL_REAL sgrhox, sgrhoy, sgrhoz;
+    S_grad_lnrho(i, j, k, mesh_info, vec_x, vec_y, vec_z, scal, &sgrhox,
+                 &sgrhoy, &sgrhoz);
+
+    // Divide in MODEL_REAL: MODEL_REAL(1. / 3.) would round the quotient to
+    // double before widening it.
+    const MODEL_REAL c13 = MODEL_REAL(1.) / MODEL_REAL(3.);
+
+    const MODEL_REAL nu   = mesh_info.real_params[AC_nu_visc];
+    const MODEL_REAL zeta = mesh_info.real_params[AC_zeta];
+
+    *visc_x = nu * (lx + c13 * gx + MODEL_REAL(2.) * sgrhox) + zeta * gx;
+    *visc_y = nu * (ly + c13 * gy + MODEL_REAL(2.) * sgrhoy) + zeta * gy;
+    *visc_z = nu * (lz + c13 * gz + MODEL_REAL(2.) * sgrhoz) + zeta * gz;
+}
--- a/src/standalone/model/model_reduce.cc
+++ b/src/standalone/model/model_reduce.cc
@@ -0,0 +1,203 @@
+/*
+    Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
+ * \brief Brief info.
+ *
+ * Detailed info.
+ *
+ */
+#include "model_reduce.h"
+
+#include <math.h>
+
+#include "core/errchk.h"
+
+// Function pointer definitions
+// ReduceFunc combines the running result with the next mapped value.
+// ReduceInitialScalFunc / ReduceInitialVecFunc map one scalar value / one
+// three-component vector to the value that is fed into the reduction.
+typedef ModelScalar (*ReduceFunc)(const ModelScalar&, const ModelScalar&);
+typedef ModelScalar (*ReduceInitialScalFunc)(const ModelScalar&);
+typedef ModelScalar (*ReduceInitialVecFunc)(const ModelScalar&, const ModelScalar&,
+                                            const ModelScalar&);
+
+// clang-format off
+/* Comparison funcs */
+
+/** Returns a if a > b, otherwise b. */
+static inline ModelScalar
+max(const ModelScalar& a, const ModelScalar& b)
+{
+    if (a > b)
+        return a;
+    return b;
+}
+
+/** Returns a if a < b, otherwise b. */
+static inline ModelScalar
+min(const ModelScalar& a, const ModelScalar& b)
+{
+    if (a < b)
+        return a;
+    return b;
+}
+
+/** Returns a + b. */
+static inline ModelScalar
+sum(const ModelScalar& a, const ModelScalar& b)
+{
+    const ModelScalar total = a + b;
+    return total;
+}
+
+/* Function used to determine the values used during reduction */
+
+/** Scalar case: the value itself, unchanged. */
+static inline ModelScalar
+length(const ModelScalar& a)
+{
+    return a;
+}
+
+/** Vector case: sqrt(a^2 + b^2 + c^2). */
+static inline ModelScalar
+length(const ModelScalar& a, const ModelScalar& b, const ModelScalar& c)
+{
+    const ModelScalar norm2 = a*a + b*b + c*c;
+    return sqrtl(norm2);
+}
+
+/** Scalar case: a^2. */
+static inline ModelScalar
+squared(const ModelScalar& a)
+{
+    const ModelScalar sq = a*a;
+    return sq;
+}
+
+/** Vector case: a^2 + b^2 + c^2. */
+static inline ModelScalar
+squared(const ModelScalar& a, const ModelScalar& b, const ModelScalar& c)
+{
+    const ModelScalar aa = squared(a);
+    const ModelScalar bb = squared(b);
+    const ModelScalar cc = squared(c);
+    return aa + bb + cc;
+}
+
+/** Scalar case: exp(a)^2. */
+static inline ModelScalar
+exp_squared(const ModelScalar& a)
+{
+    const ModelScalar e = expl(a);
+    return e*e;
+}
+
+/** Vector case: exp(a)^2 + exp(b)^2 + exp(c)^2. */
+static inline ModelScalar
+exp_squared(const ModelScalar& a, const ModelScalar& b, const ModelScalar& c)
+{
+    const ModelScalar ea2 = exp_squared(a);
+    const ModelScalar eb2 = exp_squared(b);
+    const ModelScalar ec2 = exp_squared(c);
+    return ea2 + eb2 + ec2;
+}
+// clang-format on
+
+/**
+ * Reduces vertex buffer a of mesh to a single scalar.
+ *
+ * The reduction visits the vertices with AC_nx_min <= i < AC_nx_max (and
+ * likewise for j and k), the bounds being read from mesh.info.int_params.
+ *   RTYPE_MAX / RTYPE_MIN : largest / smallest value
+ *   RTYPE_RMS             : sqrt(sum(value^2) / AC_nxyz)
+ *   RTYPE_RMS_EXP         : sqrt(sum(exp(value)^2) / AC_nxyz)
+ * Any other rtype is reported through ERROR.
+ */
+ModelScalar
+model_reduce_scal(const ModelMesh& mesh, const ReductionType& rtype,
+                  const VertexBufferHandle& a)
+{
+    ReduceInitialScalFunc map_value; // Applied to every vertex value
+    ReduceFunc combine;              // Folds the mapped values together
+    bool take_root_mean = false;     // True for the RMS-type reductions
+
+    switch (rtype) {
+    case RTYPE_MAX:
+        map_value = length;
+        combine   = max;
+        break;
+    case RTYPE_MIN:
+        map_value = length;
+        combine   = min;
+        break;
+    case RTYPE_RMS:
+        map_value      = squared;
+        combine        = sum;
+        take_root_mean = true;
+        break;
+    case RTYPE_RMS_EXP:
+        map_value      = exp_squared;
+        combine        = sum;
+        take_root_mean = true;
+        break;
+    default:
+        ERROR("Unrecognized RTYPE");
+    }
+
+    const int i_begin = mesh.info.int_params[AC_nx_min];
+    const int j_begin = mesh.info.int_params[AC_ny_min];
+    const int k_begin = mesh.info.int_params[AC_nz_min];
+    const int i_end   = mesh.info.int_params[AC_nx_max];
+    const int j_end   = mesh.info.int_params[AC_ny_max];
+    const int k_end   = mesh.info.int_params[AC_nz_max];
+
+    const ModelScalar* values = mesh.vertex_buffer[a];
+
+    // Max and min start from the first visited vertex, the sums from zero
+    const int first_idx    = AC_VTXBUF_IDX(i_begin, j_begin, k_begin, mesh.info);
+    const bool is_extremum = (rtype == RTYPE_MAX) || (rtype == RTYPE_MIN);
+
+    ModelScalar acc = .0f;
+    if (is_extremum)
+        acc = map_value(values[first_idx]);
+
+    for (int k = k_begin; k < k_end; ++k) {
+        for (int j = j_begin; j < j_end; ++j) {
+            for (int i = i_begin; i < i_end; ++i) {
+                const int idx = AC_VTXBUF_IDX(i, j, k, mesh.info);
+                acc           = combine(acc, map_value(values[idx]));
+            }
+        }
+    }
+
+    if (!take_root_mean)
+        return acc;
+
+    const ModelScalar inv_n = 1.0l / mesh.info.int_params[AC_nxyz];
+    return sqrtl(inv_n * acc);
+}
+
+/**
+ * Reduces the vector field stored in vertex buffers (a, b, c) of mesh to a
+ * single scalar.
+ *
+ * The reduction visits the vertices with AC_nx_min <= i < AC_nx_max (and
+ * likewise for j and k), the bounds being read from mesh.info.int_params.
+ *   RTYPE_MAX / RTYPE_MIN : largest / smallest vector length
+ *   RTYPE_RMS             : sqrt(sum(a^2 + b^2 + c^2) / AC_nxyz)
+ *   RTYPE_RMS_EXP         : sqrt(sum(exp(a)^2 + exp(b)^2 + exp(c)^2) / AC_nxyz)
+ * Any other rtype is reported through ERROR.
+ */
+ModelScalar
+model_reduce_vec(const ModelMesh& mesh, const ReductionType& rtype,
+                 const VertexBufferHandle& a, const VertexBufferHandle& b,
+                 const VertexBufferHandle& c)
+{
+    ReduceInitialVecFunc map_value; // Applied to every vertex vector
+    ReduceFunc combine;             // Folds the mapped values together
+    bool take_root_mean = false;    // True for the RMS-type reductions
+
+    switch (rtype) {
+    case RTYPE_MAX:
+        map_value = length;
+        combine   = max;
+        break;
+    case RTYPE_MIN:
+        map_value = length;
+        combine   = min;
+        break;
+    case RTYPE_RMS:
+        map_value      = squared;
+        combine        = sum;
+        take_root_mean = true;
+        break;
+    case RTYPE_RMS_EXP:
+        map_value      = exp_squared;
+        combine        = sum;
+        take_root_mean = true;
+        break;
+    default:
+        ERROR("Unrecognized RTYPE");
+    }
+
+    const int i_begin = mesh.info.int_params[AC_nx_min];
+    const int j_begin = mesh.info.int_params[AC_ny_min];
+    const int k_begin = mesh.info.int_params[AC_nz_min];
+    const int i_end   = mesh.info.int_params[AC_nx_max];
+    const int j_end   = mesh.info.int_params[AC_ny_max];
+    const int k_end   = mesh.info.int_params[AC_nz_max];
+
+    const ModelScalar* comp_a = mesh.vertex_buffer[a];
+    const ModelScalar* comp_b = mesh.vertex_buffer[b];
+    const ModelScalar* comp_c = mesh.vertex_buffer[c];
+
+    // Max and min start from the first visited vertex, the sums from zero
+    const int first_idx    = AC_VTXBUF_IDX(i_begin, j_begin, k_begin, mesh.info);
+    const bool is_extremum = (rtype == RTYPE_MAX) || (rtype == RTYPE_MIN);
+
+    ModelScalar acc = 0;
+    if (is_extremum)
+        acc = map_value(comp_a[first_idx], comp_b[first_idx], comp_c[first_idx]);
+
+    for (int k = k_begin; k < k_end; k++) {
+        for (int j = j_begin; j < j_end; j++) {
+            for (int i = i_begin; i < i_end; i++) {
+                const int idx = AC_VTXBUF_IDX(i, j, k, mesh.info);
+                const ModelScalar mapped = map_value(comp_a[idx], comp_b[idx],
+                                                     comp_c[idx]);
+                acc = combine(acc, mapped);
+            }
+        }
+    }
+
+    if (!take_root_mean)
+        return acc;
+
+    const ModelScalar inv_n = 1.0l / mesh.info.int_params[AC_nxyz];
+    return sqrtl(inv_n * acc);
+}
--- a/src/standalone/model/model_reduce.h
+++ b/src/standalone/model/model_reduce.h
@@ -0,0 +1,37 @@
+/*
+    Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
+ * \brief Brief info.
+ *
+ * Detailed info.
+ *
+ */
+#pragma once
+#include "astaroth.h"
+#include "modelmesh.h"
+
+/**
+ * Reduces vertex buffer a of mesh to one scalar according to rtype:
+ * RTYPE_MAX / RTYPE_MIN give the largest / smallest value, RTYPE_RMS the root
+ * mean square of the values and RTYPE_RMS_EXP the root mean square of their
+ * exponentials. Defined in model_reduce.cc.
+ */
+ModelScalar model_reduce_scal(const ModelMesh& mesh, const ReductionType& rtype,
+                              const VertexBufferHandle& a);
+
+/**
+ * Reduces the vector field held in vertex buffers (a, b, c) of mesh to one
+ * scalar according to rtype: RTYPE_MAX / RTYPE_MIN give the largest /
+ * smallest vector length, RTYPE_RMS the square root of the mean of
+ * a^2 + b^2 + c^2 and RTYPE_RMS_EXP the square root of the mean of
+ * exp(a)^2 + exp(b)^2 + exp(c)^2. Defined in model_reduce.cc.
+ */
+ModelScalar model_reduce_vec(const ModelMesh& mesh, const ReductionType& rtype,
+                             const VertexBufferHandle& a,
+                             const VertexBufferHandle& b,
+                             const VertexBufferHandle& c);
--- a/src/standalone/model/model_rk3.cc
+++ b/src/standalone/model/model_rk3.cc
--- a/src/standalone/model/model_rk3.h
+++ b/src/standalone/model/model_rk3.h
@@ -0,0 +1,33 @@
+/*
+    Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
+ * \brief Brief info.
+ *
+ * Detailed info.
+ *
+ */
+#pragma once
+#include "astaroth.h"
+#include "modelmesh.h"
+
+// NOTE(review): the definition is not visible from this header. Presumably
+// advances mesh in place by one full timestep of length dt - confirm against
+// model_rk3.cc.
+void model_rk3(const ModelScalar dt, ModelMesh* mesh);
+
+// NOTE(review): the definition is not visible from this header. Presumably
+// performs the single integration substep selected by step_number for a
+// timestep of length dt - confirm the valid range of step_number against
+// model_rk3.cc.
+void model_rk3_step(const int step_number, const ModelScalar dt, ModelMesh* mesh);
--- a/src/standalone/model/modelmesh.h
+++ b/src/standalone/model/modelmesh.h
@@ -0,0 +1,36 @@
+/*
+    Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
+ * \brief Brief info.
+ *
+ * Detailed info.
+ *
+ */
+#pragma once
+#include "astaroth.h"
+
+
+/** Scalar type used by the model code: long double. */
+typedef long double ModelScalar;
+
+/**
+ * Mesh representation used by the model code.
+ *
+ * Mirrors the layout of a mesh with one data pointer per vertex buffer handle,
+ * but stores the values as ModelScalar.
+ */
+typedef struct {
+    // One pointer per vertex buffer handle; the buffers hold ModelScalar values
+    ModelScalar* vertex_buffer[NUM_VTXBUF_HANDLES];
+    // Mesh configuration (int/real parameters) associated with the buffers
+    AcMeshInfo info;
+} ModelMesh;
--- a/src/standalone/renderer.cc
+++ b/src/standalone/renderer.cc
@@ -0,0 +1,447 @@
+/*
+    Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
+ * \brief Brief info.
+ *
+ * Detailed info.
+ *
+ */
+#include "run.h"
+
+#include <SDL.h>    // Note: using local version in src/3rdparty dir
+#include <math.h>   // ceil
+#include <string.h> // memcpy
+
+#include "config_loader.h"
+#include "core/errchk.h"
+#include "core/math_utils.h"
+#include "model/host_memory.h"
+#include "model/host_timestep.h"
+#include "model/model_reduce.h"
+#include "model/model_rk3.h"
+#include "timer_hires.h"
+
+// Window
+// SDL renderer and window handles; created in renderer_init() and destroyed in
+// renderer_quit(). Note that `renderer` has external linkage.
+SDL_Renderer* renderer      = NULL;
+static SDL_Window* window   = NULL;
+// Requested window size; overwritten with the actual size in renderer_init()
+static int window_width     = 800;
+static int window_height    = 600;
+static const int window_bpp = 32; // Bits per pixel
+
+// Surfaces
+// One RGBA surface per vertex buffer; note that `surfaces` has external linkage
+SDL_Surface* surfaces[NUM_VTXBUF_HANDLES];
+// Surface dimensions in pixels; -1 until renderer_init() sets them to (mx, my)
+static int datasurface_width  = -1;
+static int datasurface_height = -1;
+// Index of the z-slice that is drawn, and the number of available slices.
+// Both are set in run_renderer() before the main loop starts.
+static int k_slice = 0;
+static int k_slice_max = 0;
+
+// Colors
+// Background color used when clearing the window
+static SDL_Color color_bg = (SDL_Color){30, 30, 35, 255};
+// One tile per vertex buffer plus one extra tile for the combined vector plot
+static const int num_tiles = NUM_VTXBUF_HANDLES + 1;
+// Number of tiles laid out side by side before starting a new row
+static const int tiles_per_row = 3;
+
+/*
+ * =============================================================================
+ * Camera
+ * =============================================================================
+ */
+/*
+typedef struct {
+   float x, y;
+} float2;
+*/
+/** Rectangle with float components: position (x, y) and extent (w, h). */
+typedef struct {
+    float x, y, w, h;
+} vec4;
+
+/** 2D camera: the position it looks at and a uniform scale (zoom) factor. */
+typedef struct {
+    float2 pos;
+    float scale;
+} Camera;
+
+// The single camera used by project_ortho(); starts at the origin with scale 1
+// and is repositioned in renderer_init() and by the keyboard in check_input().
+static Camera camera = (Camera){(float2){.0f, .0f}, 1.f};
+
+/**
+ * Projects a position and a bounding box into window coordinates using the
+ * global camera.
+ *
+ * The position is translated by the camera position, scaled by the camera
+ * scale and shifted by half of the window dimensions; the bounding box is
+ * only scaled.
+ *
+ * @param pos   Position to project.
+ * @param bbox  Extents to project.
+ * @param wdims Window dimensions.
+ * @return Projected position in (x, y) and projected extents in (w, h).
+ */
+static inline vec4
+project_ortho(const float2& pos, const float2& bbox, const float2& wdims)
+{
+    const float zoom = camera.scale;
+
+    vec4 projected;
+    projected.x = zoom * (pos.x - camera.pos.x) + 0.5f * wdims.x;
+    projected.y = zoom * (pos.y - camera.pos.y) + 0.5f * wdims.y;
+    projected.w = zoom * bbox.x;
+    projected.h = zoom * bbox.y;
+
+    return projected;
+}
+
+/*
+ * =============================================================================
+ * Renderer
+ * =============================================================================
+ */
+
+/**
+ * Initializes SDL, the window, the SDL renderer, the per-buffer surfaces and
+ * the camera.
+ *
+ * The camera is centered on the tile grid and scaled such that all tiles fit
+ * into the window. The number of tile rows is rounded up so that a partially
+ * filled last row is taken into account.
+ *
+ * @param mx Width of a data surface in pixels (mesh width).
+ * @param my Height of a data surface in pixels (mesh height).
+ * @return 0 on success, -1 if SDL fails to create any of the resources (the
+ *         SDL error message is printed to stderr).
+ */
+static int
+renderer_init(const int& mx, const int& my)
+{
+    // Init video
+    if (SDL_InitSubSystem(SDL_INIT_VIDEO | SDL_INIT_EVENTS) != 0) {
+        fprintf(stderr, "SDL_InitSubSystem failed: %s\n", SDL_GetError());
+        return -1;
+    }
+
+    // Setup window
+    window = SDL_CreateWindow("Astaroth", SDL_WINDOWPOS_UNDEFINED,
+                              SDL_WINDOWPOS_UNDEFINED, window_width,
+                              window_height, SDL_WINDOW_SHOWN);
+    if (!window) {
+        fprintf(stderr, "SDL_CreateWindow failed: %s\n", SDL_GetError());
+        return -1;
+    }
+
+    // Setup SDL renderer
+    renderer = SDL_CreateRenderer(window, -1, SDL_RENDERER_ACCELERATED);
+    if (!renderer) {
+        fprintf(stderr, "SDL_CreateRenderer failed: %s\n", SDL_GetError());
+        return -1;
+    }
+    // The window manager may have given us a different size than requested
+    SDL_GetWindowSize(window, &window_width, &window_height);
+
+    SDL_SetHint(SDL_HINT_RENDER_SCALE_QUALITY, "1"); // Linear filtering
+
+    datasurface_width  = mx;
+    datasurface_height = my;
+    // vec drawing uses the surface of the first component, no memory issues here
+    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
+        surfaces[i] = SDL_CreateRGBSurfaceWithFormat(
+            0, datasurface_width, datasurface_height, window_bpp,
+            SDL_PIXELFORMAT_RGBA8888);
+        if (!surfaces[i]) {
+            fprintf(stderr, "SDL_CreateRGBSurfaceWithFormat failed: %s\n",
+                    SDL_GetError());
+            return -1;
+        }
+    }
+
+    // Round up: a partially filled last row still occupies a full row
+    const int num_rows = (num_tiles + tiles_per_row - 1) / tiles_per_row;
+
+    camera.pos = (float2){.5f * tiles_per_row * datasurface_width - .5f * datasurface_width,
+                          -.5f * num_rows * datasurface_height + .5f * datasurface_height};
+    camera.scale = min(window_width / float(datasurface_width * tiles_per_row),
+                       window_height / float(datasurface_height * num_rows));
+
+    SDL_RendererInfo renderer_info;
+    SDL_GetRendererInfo(renderer, &renderer_info);
+    printf("SDL renderer max texture dims: (%d, %d)\n", renderer_info.max_texture_width, renderer_info.max_texture_height);
+    return 0;
+}
+
+/**
+ * Writes one 32-bit color value into a surface.
+ *
+ * The pixel is addressed as i + j * surface->w, i.e. the surface pitch is not
+ * consulted and no bounds checking is done.
+ *
+ * @param i       Column of the pixel.
+ * @param j       Row of the pixel.
+ * @param color   Color value, already mapped to the surface format.
+ * @param surface Surface to write to.
+ * @return Always 0.
+ */
+static int
+set_pixel(const int& i, const int& j, const uint32_t& color,
+          SDL_Surface* surface)
+{
+    const int offset       = i + j * surface->w;
+    uint32_t* const texels = static_cast<uint32_t*>(surface->pixels);
+
+    texels[offset] = color;
+    return 0;
+}
+
+/**
+ * Draws slice k_slice of one vertex buffer as a single-channel tile.
+ *
+ * Each value is mapped to an intensity 255 * |value - mid| / range using a
+ * fixed value range of [0, 1]. The intensity is clamped to [0, 255] before it
+ * is converted to 8 bits, because converting an out-of-range (or NaN) float
+ * to an integer type is undefined behaviour. The channel that receives the
+ * intensity is selected with tile % 3.
+ *
+ * @param mesh          Host mesh holding the data to draw.
+ * @param vertex_buffer Handle of the buffer to draw; also selects the surface.
+ * @param tile          Index of the tile (position in the tile grid).
+ * @return 0 on success, -1 if the texture could not be created.
+ */
+static int
+draw_vertex_buffer(const AcMesh& mesh, const VertexBufferHandle& vertex_buffer,
+                   const int& tile)
+{
+    const float xoffset = (tile % tiles_per_row) * datasurface_width;
+    const float yoffset = - (tile / tiles_per_row) * datasurface_height;
+
+    // The value range is fixed instead of being reduced from the data
+    const float value_max = 1.f;
+    const float value_min = 0.f;
+    const float range     = fabsf(value_max - value_min);
+    const float mid       = value_max - .5f * range;
+
+    const int k = k_slice;
+
+    for (int j = 0; j < mesh.info.int_params[AC_my]; ++j) {
+        for (int i = 0; i < mesh.info.int_params[AC_mx]; ++i) {
+            ERRCHK(i < datasurface_width && j < datasurface_height);
+
+            const int idx = AC_VTXBUF_IDX(i, j, k, mesh.info);
+            float intensity =
+                255.f *
+                (fabsf(float(mesh.vertex_buffer[vertex_buffer][idx]) - mid)) /
+                range;
+            // Clamp; the negated comparison also maps NaN to 0
+            if (!(intensity >= 0.f))
+                intensity = 0.f;
+            else if (intensity > 255.f)
+                intensity = 255.f;
+            const uint8_t shade = (uint8_t)intensity;
+
+            uint8_t color[4]            = {0, 0, 0, 255};
+            color[tile % 3]             = shade;
+            const uint32_t mapped_color = SDL_MapRGBA(
+                surfaces[vertex_buffer]->format, color[0], color[1], color[2],
+                color[3]);
+            set_pixel(i, j, mapped_color, surfaces[vertex_buffer]);
+        }
+    }
+
+    // pos is the tile center and bbox the half extents, see the rect below
+    const float2 pos   = (float2){xoffset, yoffset};
+    const float2 bbox  = (float2){.5f * datasurface_width,
+                                 .5f * datasurface_height};
+    const float2 wsize = (float2){float(window_width), float(window_height)};
+    const vec4 rectf   = project_ortho(pos, bbox, wsize);
+    // Flip y: SDL has its origin in the top-left corner
+    SDL_Rect rect      = (SDL_Rect){
+        int(rectf.x - rectf.w), int(wsize.y - rectf.y - rectf.h),
+        int(ceil(2.f * rectf.w)), int(ceil(2.f * rectf.h))};
+
+    SDL_Texture* tex = SDL_CreateTextureFromSurface(renderer,
+                                                    surfaces[vertex_buffer]);
+    if (!tex) {
+        fprintf(stderr, "SDL_CreateTextureFromSurface failed: %s\n",
+                SDL_GetError());
+        return -1;
+    }
+    SDL_RenderCopy(renderer, tex, NULL, &rect);
+    SDL_DestroyTexture(tex);
+
+    return 0;
+}
+
+/**
+ * Draws slice k_slice of three vertex buffers as one RGB tile.
+ *
+ * The value range is obtained with acReduceScal() over all three buffers.
+ * Each component is mapped to 255 * |value - mid| / range and written to the
+ * red, green and blue channel, respectively. If the range is zero (all values
+ * equal) the tile is drawn black instead of dividing by zero, and intensities
+ * are clamped to [0, 255] before the conversion to 8 bits.
+ *
+ * The surface of vertex_buffer_a is used as scratch space for the tile.
+ *
+ * @param mesh            Host mesh holding the data to draw.
+ * @param vertex_buffer_a Handle of the buffer shown in the red channel.
+ * @param vertex_buffer_b Handle of the buffer shown in the green channel.
+ * @param vertex_buffer_c Handle of the buffer shown in the blue channel.
+ * @param tile            Index of the tile (position in the tile grid).
+ * @return 0 on success, -1 if the texture could not be created.
+ */
+static int
+draw_vertex_buffer_vec(const AcMesh& mesh,
+                       const VertexBufferHandle& vertex_buffer_a,
+                       const VertexBufferHandle& vertex_buffer_b,
+                       const VertexBufferHandle& vertex_buffer_c,
+                       const int& tile)
+{
+    const float xoffset = (tile % tiles_per_row) * datasurface_width;
+    const float yoffset = - (tile / tiles_per_row) * datasurface_height;
+
+    const float maxx = float(
+        max(acReduceScal(RTYPE_MAX, vertex_buffer_a),
+            max(acReduceScal(RTYPE_MAX, vertex_buffer_b),
+                acReduceScal(RTYPE_MAX, vertex_buffer_c))));
+    const float minn = float(
+        min(acReduceScal(RTYPE_MIN, vertex_buffer_a),
+            min(acReduceScal(RTYPE_MIN, vertex_buffer_b),
+                acReduceScal(RTYPE_MIN, vertex_buffer_c))));
+    const float range = fabsf(maxx - minn);
+    const float mid   = maxx - .5f * range;
+
+    const VertexBufferHandle components[3] = {vertex_buffer_a, vertex_buffer_b,
+                                              vertex_buffer_c};
+
+    const int k = k_slice;
+    for (int j = 0; j < mesh.info.int_params[AC_my]; ++j) {
+        for (int i = 0; i < mesh.info.int_params[AC_mx]; ++i) {
+            ERRCHK(i < datasurface_width && j < datasurface_height);
+
+            const int idx = AC_VTXBUF_IDX(i, j, k, mesh.info);
+
+            uint8_t rgb[3];
+            for (int c = 0; c < 3; ++c) {
+                float intensity = 0.f;
+                if (range > 0.f) // Uniform field: avoid 0/0
+                    intensity = 255.f *
+                                (fabsf(float(mesh.vertex_buffer[components[c]][idx]) - mid)) /
+                                range;
+                // Clamp; the negated comparison also maps NaN to 0
+                if (!(intensity >= 0.f))
+                    intensity = 0.f;
+                else if (intensity > 255.f)
+                    intensity = 255.f;
+                rgb[c] = (uint8_t)intensity;
+            }
+
+            const uint32_t mapped_color = SDL_MapRGBA(
+                surfaces[vertex_buffer_a]->format, rgb[0], rgb[1], rgb[2], 255);
+            set_pixel(i, j, mapped_color, surfaces[vertex_buffer_a]);
+        }
+    }
+
+    // pos is the tile center and bbox the half extents, see the rect below
+    const float2 pos   = (float2){xoffset, yoffset};
+    const float2 bbox  = (float2){.5f * datasurface_width,
+                                 .5f * datasurface_height};
+    const float2 wsize = (float2){float(window_width), float(window_height)};
+    const vec4 rectf   = project_ortho(pos, bbox, wsize);
+    // Flip y: SDL has its origin in the top-left corner
+    SDL_Rect rect      = (SDL_Rect){
+        int(rectf.x - rectf.w), int(wsize.y - rectf.y - rectf.h),
+        int(ceil(2.f * rectf.w)), int(ceil(2.f * rectf.h))};
+
+    SDL_Texture* tex = SDL_CreateTextureFromSurface(renderer,
+                                                    surfaces[vertex_buffer_a]);
+    if (!tex) {
+        fprintf(stderr, "SDL_CreateTextureFromSurface failed: %s\n",
+                SDL_GetError());
+        return -1;
+    }
+    SDL_RenderCopy(renderer, tex, NULL, &rect);
+    SDL_DestroyTexture(tex);
+
+    return 0;
+}
+
+/**
+ * Draws one tile per vertex buffer plus the combined velocity tile, presents
+ * the frame, clears the back buffer with the background color and prints the
+ * maximum and minimum of every vertex buffer to stdout.
+ *
+ * @param mesh Host mesh holding the data to draw.
+ * @return Always 0.
+ */
+static int
+renderer_draw(const AcMesh& mesh)
+{
+    // Scalar tiles, one per vertex buffer
+    for (int tile = 0; tile < NUM_VTXBUF_HANDLES; ++tile) {
+        const VertexBufferHandle handle = VertexBufferHandle(tile);
+        draw_vertex_buffer(mesh, handle, tile);
+    }
+
+    // The velocity components combined into the last tile
+    const int vec_tile = NUM_VTXBUF_HANDLES;
+    draw_vertex_buffer_vec(mesh, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, vec_tile);
+
+    // Drawing done, present and prepare a cleared buffer for the next frame
+    SDL_RenderPresent(renderer);
+    SDL_SetRenderDrawColor(renderer, color_bg.r, color_bg.g, color_bg.b,
+                           color_bg.a);
+    SDL_RenderClear(renderer);
+
+    // Report the extrema of every vertex buffer
+    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w) {
+        const VertexBufferHandle handle = VertexBufferHandle(w);
+        printf("\t%s umax %e, min %e\n", vtxbuf_names[handle],
+               (double)acReduceScal(RTYPE_MAX, handle),
+               (double)acReduceScal(RTYPE_MIN, handle));
+    }
+    printf("\n");
+
+    return 0;
+}
+
+/**
+ * Releases all resources acquired in renderer_init(): the per-buffer
+ * surfaces, the SDL renderer and the window, and finally shuts SDL down.
+ *
+ * @return Always 0.
+ */
+static int
+renderer_quit(void)
+{
+    int idx = 0;
+    while (idx < NUM_VTXBUF_HANDLES) {
+        SDL_FreeSurface(surfaces[idx]);
+        ++idx;
+    }
+
+    SDL_DestroyRenderer(renderer);
+    renderer = NULL;
+
+    SDL_DestroyWindow(window);
+    window = NULL;
+
+    SDL_Quit();
+    return 0;
+}
+
+// Initial condition currently in use; advanced by the space key in running()
+// and used for the first initialization in run_renderer().
+static int init_type = INIT_TYPE_GAUSSIAN_RADIAL_EXPL;
+
+/**
+ * Processes all pending SDL events.
+ *
+ * Key bindings:
+ *   Escape - quit
+ *   Space  - switch to the next initial condition and upload the mesh
+ *   i / k  - move the drawn slice up / down (wraps around)
+ *
+ * @param mesh Host mesh that is re-initialized when the space key is pressed.
+ * @return false if the application should quit, true otherwise.
+ */
+static bool
+running(AcMesh* mesh)
+{
+    SDL_Event event;
+    while (SDL_PollEvent(&event)) {
+        if (event.type == SDL_QUIT)
+            return false;
+
+        if (event.type != SDL_KEYDOWN)
+            continue;
+
+        switch (event.key.keysym.sym) {
+        case SDLK_ESCAPE:
+            return false;
+        case SDLK_SPACE:
+            init_type = (init_type + 1) % NUM_INIT_TYPES;
+            acmesh_init_to(InitType(init_type), mesh);
+            acLoad(*mesh);
+            break;
+        case SDLK_i:
+            k_slice = (k_slice + 1) % k_slice_max;
+            printf("k_slice %d\n", k_slice);
+            break;
+        case SDLK_k:
+            // Add k_slice_max before the modulo to keep the result non-negative
+            k_slice = (k_slice - 1 + k_slice_max) % k_slice_max;
+            printf("k_slice %d\n", k_slice);
+            break;
+        default:
+            break;
+        }
+    }
+    return true;
+}
+
+/**
+ * Polls the keyboard state and applies continuous input.
+ *
+ * Arrow keys translate the camera, page up/down change the camera scale and
+ * comma/period set the timescale to 0.1 and 1, respectively. The translation
+ * rate is divided by the camera scale, so panning speed on screen does not
+ * depend on the zoom level.
+ *
+ * @param dt Time since the previous call, used to scale the camera movement.
+ */
+static void
+check_input(const float& dt)
+{
+    const uint8_t* keys = (uint8_t*)SDL_GetKeyboardState(NULL);
+
+    /* Camera movement */
+    const float pan_rate  = 1000.f / camera.scale;
+    const float zoom_rate = 1.0001f;
+
+    if (keys[SDL_SCANCODE_UP])
+        camera.pos.y += pan_rate * dt;
+    if (keys[SDL_SCANCODE_DOWN])
+        camera.pos.y -= pan_rate * dt;
+    if (keys[SDL_SCANCODE_LEFT])
+        camera.pos.x -= pan_rate * dt;
+    if (keys[SDL_SCANCODE_RIGHT])
+        camera.pos.x += pan_rate * dt;
+
+    /* Camera zoom, proportional to the current scale */
+    if (keys[SDL_SCANCODE_PAGEUP])
+        camera.scale += camera.scale * zoom_rate * dt;
+    if (keys[SDL_SCANCODE_PAGEDOWN])
+        camera.scale -= camera.scale * zoom_rate * dt;
+
+    /* Simulation speed */
+    if (keys[SDL_SCANCODE_COMMA])
+        set_timescale(AcReal(.1));
+    if (keys[SDL_SCANCODE_PERIOD])
+        set_timescale(AcReal(1.));
+}
+
+/**
+ * Runs the simulation with real-time rendering until the user quits.
+ *
+ * Each iteration of the main loop handles input, advances the simulation by
+ * one timestep and, if at least 1/60 s has passed since the previous frame,
+ * copies the currently selected slice to the host and draws it.
+ *
+ * @return 0 on success, 1 if the renderer could not be initialized.
+ */
+int
+run_renderer(void)
+{
+    /* Parse configs */
+    AcMeshInfo mesh_info;
+    load_config(&mesh_info);
+    if (renderer_init(mesh_info.int_params[AC_mx],
+                      mesh_info.int_params[AC_my]) != 0)
+        return 1;
+
+    AcMesh* mesh = acmesh_create(mesh_info);
+    acmesh_init_to(InitType(init_type), mesh);
+
+    acInit(mesh_info);
+    acLoad(*mesh);
+
+    Timer frame_timer;
+    timer_reset(&frame_timer);
+
+    Timer wallclock;
+    timer_reset(&wallclock);
+
+    Timer io_timer;
+    timer_reset(&io_timer);
+
+    const float desired_frame_time = 1.f / 60.f;
+    int steps                      = 0;
+    k_slice                        = mesh->info.int_params[AC_mz] / 2;
+    k_slice_max                    = mesh->info.int_params[AC_mz];
+    while (running(mesh)) {
+
+        /* Input */
+        check_input(timer_diff_nsec(io_timer) / 1e9f);
+        timer_reset(&io_timer);
+
+/* Step the simulation */
+#if 1
+        const AcReal umax = acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY,
+                                        VTXBUF_UUZ);
+        const AcReal dt   = host_timestep(umax, mesh_info);
+        acIntegrate(dt);
+#else
+        // Host model path: the model mesh must be filled from the host mesh
+        // before it is reduced, otherwise umax is computed from a mesh that
+        // does not yet contain the data.
+        ModelMesh* model_mesh = modelmesh_create(mesh->info);
+        acmesh_to_modelmesh(*mesh, model_mesh);
+        const AcReal umax = AcReal(model_reduce_vec(*model_mesh, RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ));
+        const AcReal dt   = host_timestep(umax, mesh_info);
+        model_rk3(dt, model_mesh);
+        modelmesh_to_acmesh(*model_mesh, mesh);
+        modelmesh_destroy(model_mesh);
+        acLoad(*mesh); // Just a quick hack s.t. we do not have to add an
+                       // additional if to the render part
+#endif
+
+        ++steps;
+
+        /* Render */
+        const float timer_diff_sec = timer_diff_nsec(frame_timer) / 1e9f;
+        if (timer_diff_sec >= desired_frame_time) {
+            // Only the slice that is drawn is transferred to the host:
+            // AC_mxy vertices starting from (0, 0, k_slice)
+            const int num_vertices = mesh->info.int_params[AC_mxy];
+            const int3 dst         = (int3){0, 0, k_slice};
+            acStoreWithOffset(dst, num_vertices, mesh);
+            acSynchronize();
+            renderer_draw(*mesh); // Bottleneck is here
+            printf("Step #%d, dt: %f\n", steps, double(dt));
+            timer_reset(&frame_timer);
+        }
+    }
+    printf("Wallclock time %f s\n", double(timer_diff_nsec(wallclock) / 1e9f));
+
+    acStore(mesh);
+    acQuit();
+    acmesh_destroy(mesh);
+
+    renderer_quit();
+
+    return 0;
+}
--- a/src/standalone/run.h
+++ b/src/standalone/run.h
@@ -0,0 +1,35 @@
+/*
+    Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
+ * \brief Brief info.
+ *
+ * Detailed info.
+ *
+ */
+#pragma once
+
+// Entry points of the standalone run modes. Each takes no arguments and
+// returns an int status; run_simulation() and run_renderer() return 0 after a
+// completed run.
+
+/** Entry point of the autotest mode (implementation not visible from this header). */
+int run_autotest(void);
+
+/** Entry point of the simulation mode that writes diagnostics and snapshots to disk. */
+int run_simulation(void);
+
+/** Entry point of the benchmark mode (implementation not visible from this header). */
+int run_benchmark(void);
+
+/** Entry point of the interactive mode that renders the simulation with SDL. */
+int run_renderer(void);
--- a/src/standalone/simulation.cc
+++ b/src/standalone/simulation.cc
@@ -0,0 +1,339 @@
+/*
+    Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
+ * \brief Brief info.
+ *
+ * Detailed info.
+ *
+ */
+#include "run.h"
+
+#include "config_loader.h"
+#include "core/errchk.h"
+#include "core/math_utils.h"
+#include "model/host_memory.h"
+#include "model/host_timestep.h"
+#include "model/model_reduce.h"
+#include "model/model_rk3.h"
+#include "timer_hires.h"
+
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+/*
+// DEPRECATED: TODO remove
+static inline void
+print_diagnostics(const AcMesh& mesh, const int& step, const AcReal& dt)
+{
+    const int max_name_width = 16;
+    printf("Step %d, dt %e s\n", step, double(dt));
+    printf("  %*s: min %.3e,\trms %.3e,\tmax %.3e\n", max_name_width, "uu total",
+    double(model_reduce_vec(mesh, RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ)),
+    double(model_reduce_vec(mesh, RTYPE_MIN, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ)),
+    double(model_reduce_vec(mesh, RTYPE_RMS, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ)));
+
+    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
+        printf("  %*s: min %.3e,\trms %.3e,\tmax %.3e\n", max_name_width, vtxbuf_names[i],
+        double(model_reduce_scal(mesh, RTYPE_MAX, VertexBufferHandle(i))),
+        double(model_reduce_scal(mesh, RTYPE_MIN, VertexBufferHandle(i))),
+        double(model_reduce_scal(mesh, RTYPE_RMS, VertexBufferHandle(i))));
+    }
+}
+*/
+
+/**
+ * Writes the mesh configuration into the ASCII file mesh_info.list and
+ * creates the helper script purge.sh for removing the output files.
+ *
+ * This is done to guarantee that the settings used for producing the data are
+ * stored next to the data, even though in principle the same information is
+ * available in astaroth.conf.
+ *
+ * If a file cannot be opened, an error is printed with perror() and that file
+ * is skipped.
+ *
+ * @param config Mesh configuration to write.
+ */
+static inline 
+void write_mesh_info(const AcMeshInfo* config)
+{
+    FILE* purge_script = fopen("purge.sh","w");
+    if (purge_script) {
+        fprintf(purge_script, "#!/bin/bash\n");
+        fprintf(purge_script, "rm *.list *.mesh *.ts purge.sh\n");
+        fclose(purge_script);
+    }
+    else {
+        perror("write_mesh_info: cannot open purge.sh");
+    }
+
+    FILE* infotxt = fopen("mesh_info.list","w");
+    if (!infotxt) {
+        perror("write_mesh_info: cannot open mesh_info.list");
+        return;
+    }
+
+    //Total grid dimensions
+    fprintf(infotxt, "int  AC_mx        %i \n", config->int_params[AC_mx]);
+    fprintf(infotxt, "int  AC_my        %i \n", config->int_params[AC_my]);
+    fprintf(infotxt, "int  AC_mz        %i \n", config->int_params[AC_mz]);
+
+    // Bounds for the computational domain, i.e. nx_min <= i < nx_max
+    fprintf(infotxt, "int  AC_nx_min    %i \n", config->int_params[AC_nx_min]);
+    fprintf(infotxt, "int  AC_nx_max    %i \n", config->int_params[AC_nx_max]);
+    fprintf(infotxt, "int  AC_ny_min    %i \n", config->int_params[AC_ny_min]);
+    fprintf(infotxt, "int  AC_ny_max    %i \n", config->int_params[AC_ny_max]);
+    fprintf(infotxt, "int  AC_nz_min    %i \n", config->int_params[AC_nz_min]);
+    fprintf(infotxt, "int  AC_nz_max    %i \n", config->int_params[AC_nz_max]);
+
+    // Spacing
+    fprintf(infotxt, "real AC_inv_dsx   %e \n", (double)config->real_params[AC_inv_dsx]);
+    fprintf(infotxt, "real AC_inv_dsy   %e \n", (double)config->real_params[AC_inv_dsy]);
+    fprintf(infotxt, "real AC_inv_dsz   %e \n", (double)config->real_params[AC_inv_dsz]);
+    fprintf(infotxt, "real AC_dsmin     %e \n", (double)config->real_params[AC_dsmin  ]);
+
+    /* Additional helper params */
+    // Int helpers
+    fprintf(infotxt, "int  AC_mxy       %i \n", config->int_params[AC_mxy ]);
+    fprintf(infotxt, "int  AC_nxy       %i \n", config->int_params[AC_nxy ]);
+    fprintf(infotxt, "int  AC_nxyz      %i \n", config->int_params[AC_nxyz]);
+
+    // Real helpers
+    fprintf(infotxt, "real AC_cs2_sound %e \n", (double)config->real_params[AC_cs2_sound]);
+    fprintf(infotxt, "real AC_cv_sound  %e \n", (double)config->real_params[AC_cv_sound ]);
+
+    fclose(infotxt);
+}
+
+
+/**
+ * Writes the state of the mesh into a set of binary files, one file per
+ * vertex buffer, named "<buffer name>_<step>.mesh".
+ *
+ * Each file starts with the simulation time followed by the values of the
+ * vertex buffer. For the sake of accuracy, all floating-point numbers are
+ * written as long double regardless of the precision used at runtime.
+ *
+ * A buffer whose file name does not fit the name buffer or whose file cannot
+ * be opened is skipped with an error message; a failed write is reported.
+ *
+ * @param save_mesh Host mesh to write.
+ * @param step      Step number, used in the file names.
+ * @param t_step    Simulation time written at the start of each file.
+ */
+static inline void
+save_mesh(const AcMesh &save_mesh, const int step, 
+          const AcReal t_step)
+{
+    const size_t n = AC_VTXBUF_SIZE(save_mesh.info);
+
+    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w) {
+        // Build the file name in one bounded call instead of sprintf + strcat
+        char bin_filename[80];
+        const int len = snprintf(bin_filename, sizeof(bin_filename),
+                                 "%s_%d.mesh", vtxbuf_names[w], step);
+        if (len < 0 || (size_t)len >= sizeof(bin_filename)) {
+            fprintf(stderr, "save_mesh: file name for buffer %d is too long\n", w);
+            continue;
+        }
+
+        printf("Savefile %s \n", bin_filename);
+
+        FILE* save_ptr = fopen(bin_filename,"wb");
+        if (!save_ptr) {
+            perror(bin_filename);
+            continue;
+        }
+
+        //Start file with time stamp
+        long double write_long_buf = (long double) t_step;
+        bool write_ok = fwrite(&write_long_buf, sizeof(long double), 1, save_ptr) == 1;
+
+        //Grid data
+        for (size_t i = 0; i < n && write_ok; ++i) {
+            write_long_buf = (long double) save_mesh.vertex_buffer[VertexBufferHandle(w)][i];
+            write_ok = fwrite(&write_long_buf, sizeof(long double), 1, save_ptr) == 1;
+        }
+
+        // fclose flushes buffered data, so it can also reveal a failed write
+        if (fclose(save_ptr) != 0)
+            write_ok = false;
+
+        if (!write_ok)
+            fprintf(stderr, "save_mesh: writing %s failed\n", bin_filename);
+    }
+
+}
+
+
+
+/**
+ * Prints diagnostics to stdout and appends them as one row to diag_file.
+ *
+ * The row consists of the step number, the simulation time, the timestep and
+ * the min, rms and max of the velocity field, followed by the min, rms and
+ * max of every vertex buffer. The values come from acReduceVec() and
+ * acReduceScal().
+ *
+ * @param step      Step number.
+ * @param dt        Timestep used for this step.
+ * @param t_step    Simulation time.
+ * @param diag_file Open file the row is appended to.
+ */
+static inline void
+print_diagnostics(const int step, const AcReal dt, const AcReal t_step, FILE *diag_file)
+{
+    const int name_width = 16;
+
+    // Velocity field, reduced as a vector
+    const AcReal uu_max = acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
+    const AcReal uu_min = acReduceVec(RTYPE_MIN, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
+    const AcReal uu_rms = acReduceVec(RTYPE_RMS, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
+
+    // Note the order of the columns: min, rms, max
+    printf("Step %d, t_step %.3e, dt %e s\n", step, double(t_step), double(dt));
+    printf("  %*s: min %.3e,\trms %.3e,\tmax %.3e\n", name_width, "uu total",
+           double(uu_min), double(uu_rms), double(uu_max));
+    fprintf(diag_file, "%d %e %e %e %e %e ", step, double(t_step), double(dt),
+            double(uu_min), double(uu_rms), double(uu_max));
+
+    // Every vertex buffer, reduced as a scalar
+    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w) {
+        const VertexBufferHandle handle = VertexBufferHandle(w);
+
+        const AcReal val_max = acReduceScal(RTYPE_MAX, handle);
+        const AcReal val_min = acReduceScal(RTYPE_MIN, handle);
+        const AcReal val_rms = acReduceScal(RTYPE_RMS, handle);
+
+        printf("  %*s: min %.3e,\trms %.3e,\tmax %.3e\n", name_width, vtxbuf_names[w],
+               double(val_min), double(val_rms), double(val_max));
+        fprintf(diag_file, "%e %e %e ", double(val_min), double(val_rms), double(val_max));
+    }
+
+    fprintf(diag_file, "\n");
+}
+
+    /*
+        MV NOTE: At the moment it is unclear how to calculate magnetic diagnostic
+        variables from the grid. Vector potential measures are of limited value.
+        TODO: Find a smart way to get brms, bmin and bmax.
+    */
+
+/**
+ * Runs the simulation without rendering.
+ *
+ * Loads the configuration, initializes the mesh, and integrates for
+ * AC_max_steps steps. Diagnostics are printed and appended to timeseries.ts
+ * every AC_save_steps steps, and binary snapshots are written every
+ * AC_bin_steps steps or whenever the simulation time passes the next multiple
+ * of AC_bin_save_t. A non-positive AC_save_steps or AC_bin_steps disables the
+ * corresponding step-based output instead of dividing by zero.
+ *
+ * @return 0 on success, 1 if timeseries.ts could not be opened.
+ */
+int
+run_simulation(void)
+{
+    /* Parse configs */
+    AcMeshInfo mesh_info;
+    load_config(&mesh_info);
+
+    AcMesh* mesh = acmesh_create(mesh_info);
+    acmesh_init_to(INIT_TYPE_GAUSSIAN_RADIAL_EXPL, mesh);
+
+    acInit(mesh_info);
+    acLoad(*mesh);
+
+    FILE* diag_file = fopen("timeseries.ts", "a");
+    if (!diag_file) {
+        perror("run_simulation: cannot open timeseries.ts");
+        acQuit();
+        acmesh_destroy(mesh);
+        return 1;
+    }
+
+    // TODO Get time from earlier state.
+    AcReal t_step = 0.0;
+
+    // Generate the title row.
+    fprintf(diag_file, "step  t_step  dt  uu_total_min  uu_total_rms  uu_total_max  ");
+    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
+        fprintf(diag_file, "%s_min  %s_rms  %s_max  ", vtxbuf_names[i], vtxbuf_names[i], vtxbuf_names[i]);
+    }
+
+    fprintf(diag_file, "\n");
+
+    write_mesh_info(&mesh_info);
+    print_diagnostics(0, AcReal(.0), t_step, diag_file);
+
+    // Save the initial state
+    acSynchronize();
+    acStore(mesh);
+    save_mesh(*mesh, 0, t_step);
+
+    const int max_steps = mesh_info.int_params[AC_max_steps];
+    const int save_steps = mesh_info.int_params[AC_save_steps];
+    const int bin_save_steps = mesh_info.int_params[AC_bin_steps];
+
+    // Snapshots are also written whenever t_step passes bin_crit_t
+    AcReal bin_save_t = mesh_info.real_params[AC_bin_save_t];
+    AcReal bin_crit_t = bin_save_t;
+
+    /* Step the simulation */
+    for (int i = 1; i < max_steps; ++i) {
+        const AcReal umax = acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY,
+                                        VTXBUF_UUZ);
+        const AcReal dt   = host_timestep(umax, mesh_info);
+        acIntegrate(dt);
+
+        t_step += dt; 
+
+        /* Print diagnostics */
+        if (save_steps > 0 && (i % save_steps) == 0) {
+
+            /*
+                print_diagnostics() both prints the results to stdout and
+                appends them as a row to the ASCII file timeseries.ts.
+            */
+
+            print_diagnostics(i, dt, t_step, diag_file);
+
+            /*
+                We might also want a function for calculating XY-averages,
+                which can be very useful when observing the behaviour of
+                turbulent simulations. (TODO)
+            */
+
+        }
+
+        /* Save the simulation state */
+        if ((bin_save_steps > 0 && (i % bin_save_steps) == 0) || t_step >= bin_crit_t) {
+
+            /*
+                The data is saved into simple C binaries which can be used
+                for analysing the snapshots closely.
+
+                Saving the simulation state is a separate stage because we do
+                not want to save it as often as the diagnostics. The file
+                format should IDEALLY be HDF5, which has become a well
+                supported, portable and reliable data format for HPC
+                applications. However, implementing it has to wait until the
+                simpler approach works. (TODO?)
+            */
+
+            /*
+                The updated mesh is located on the GPU. All calls to the
+                Astaroth interface (functions beginning with ac*) are assumed
+                to be asynchronous, so the devices must be synchronized before
+                transferring the data to the CPU.
+            */
+
+            acSynchronize();
+            acStore(mesh);
+
+            save_mesh(*mesh, i, t_step);
+
+            bin_crit_t += bin_save_t; 
+
+        }
+
+    }
+
+    acQuit();
+    acmesh_destroy(mesh);
+
+    fclose(diag_file);
+
+    return 0;
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
--- a/src/standalone/timer_hires.h
+++ b/src/standalone/timer_hires.h
@@ -0,0 +1,64 @@
+/*
+    Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+  @file
+ \brief High-resolution timer.
+
+    Usage:
+        Timer t;
+        timer_reset(&t);
+        timer_diff_nsec(t);
+
+    If there are issues, try compiling with -std=gnu11 -lrt
+ */
+#pragma once
+#include <stdio.h> // perror
+#include <time.h>
+
+// Timer is an alias for struct timespec from <time.h>; a Timer value holds the
+// time stamp taken by the most recent timer_reset() call.
+typedef struct timespec Timer;
+// Contains at least the following members:
+// time_t tv_sec;   (whole seconds)
+// long tv_nsec;    (nanoseconds)
+
+/**
+ * Stores the current time in the timer.
+ *
+ * The monotonic clock is used because the timer is meant for measuring
+ * intervals: unlike CLOCK_REALTIME it is not affected by adjustments of the
+ * system time, so differences between two time stamps do not jump or become
+ * negative.
+ *
+ * @param t Timer to reset.
+ * @return The return value of clock_gettime(): 0 on success, -1 on failure
+ *         (in which case an error message is printed with perror()).
+ */
+static inline int
+timer_reset(Timer* t)
+{
+    const int retval = clock_gettime(CLOCK_MONOTONIC, t);
+    if (retval == -1)
+        perror("clock_gettime failure");
+
+    return retval;
+}
+
+/**
+ * Returns the time elapsed since the timer was last reset.
+ *
+ * Takes a new time stamp with timer_reset() and subtracts the given one from
+ * it. The timer itself is passed by value and is not modified.
+ *
+ * @param start Time stamp to measure from.
+ * @return Elapsed time in nanoseconds.
+ */
+static inline long
+timer_diff_nsec(const Timer start)
+{
+    Timer now;
+    timer_reset(&now);
+
+    // Seconds converted to nanoseconds plus the nanosecond remainder
+    return (now.tv_sec - start.tv_sec) * 1000000000l +
+           (now.tv_nsec - start.tv_nsec);
+}
+
+/**
+ * Prints the time elapsed since the timer was last reset to stdout.
+ *
+ * @param t Time stamp to measure from.
+ */
+static inline void
+timer_diff_print(const Timer t)
+{
+    // Nanoseconds to milliseconds
+    const double elapsed_ms = timer_diff_nsec(t) / 1e6;
+    printf("Time elapsed: %g ms\n", elapsed_ms);
+}