From 0e48766a6853a3ab963ac5d2e33db12632111e19 Mon Sep 17 00:00:00 2001 From: jpekkila Date: Fri, 14 Jun 2019 14:18:35 +0300 Subject: [PATCH] Added Astaroth 2.0 --- 3rdparty/setup_dependencies.sh | 17 + CMakeLists.txt | 172 ++ LICENCE.txt | 18 + README.md | 118 +- acc/.gitignore | 5 + acc/README.md | 42 + acc/build_acc.sh | 25 + acc/clean.sh | 5 + acc/compile.sh | 24 + acc/mhd_solver/stencil_assembly.sas | 26 + acc/mhd_solver/stencil_process.sps | 265 ++ acc/preprocess.sh | 4 + acc/pseudodisk/stencil_process_gravx.sps | 228 ++ .../stencil_process_isotherm_gravx.sps | 169 ++ .../stencil_process_isotherm_linegrav.sps | 174 ++ acc/pseudodisk/stencil_process_linegrav.sps | 233 ++ acc/samples/common_header.h | 422 +++ acc/samples/sample_stencil_assembly.sas | 49 + acc/samples/sample_stencil_process.sps | 149 + acc/src/acc.l | 56 + acc/src/acc.y | 234 ++ acc/src/ast.h | 126 + acc/src/code_generator.c | 569 ++++ acc/test_grammar.sh | 48 + analysis/python/.gitignore | 1 + analysis/python/README.md | 7 + analysis/python/add_to_pythonpath.sh | 3 + analysis/python/astar/__init__.py | 24 + analysis/python/astar/data/__init__.py | 21 + analysis/python/astar/data/read.py | 142 + analysis/python/astar/visual/__init__.py | 21 + analysis/python/astar/visual/slices.py | 92 + analysis/python/calc/convert.sh | 9 + analysis/python/calc/galli_shu_plotter.py | 835 ++++++ analysis/python/calc/purge.sh | 1 + analysis/python/calc/shu_selfsim.py | 279 ++ analysis/python/purgepng.sh | 1 + analysis/python/samples/README.md | 3 + analysis/python/samples/lnrhobound.py | 41 + analysis/python/samples/readtest.py | 260 ++ config/astaroth.conf | 54 + config/astaroth_pseudodisk.conf | 121 + doc/doxygen/.gitignore | 4 + doc/manual/manual.md | 131 + doxyfile | 2427 +++++++++++++++++ include/astaroth.h | 422 +++ scripts/ac_mkbuilddir.sh | 81 + scripts/auto_optimize.sh | 51 + scripts/buildtest.sh | 3 + scripts/compile_acc.sh | 52 + scripts/fix_style.sh | 9 + scripts/gen_rk3_threadblockconf.c | 60 + 
scripts/generate_doc.sh | 2 + sourceme.sh | 7 + src/core/CMakeLists.txt | 70 + src/core/astaroth.cu | 451 +++ src/core/device.cu | 309 +++ src/core/device.cuh | 82 + src/core/errchk.h | 112 + src/core/kernels/.gitignore | 2 + src/core/kernels/boundconds.cuh | 1363 +++++++++ src/core/kernels/kernels.cuh | 794 ++++++ src/core/kernels/reduce.cuh | 338 +++ src/core/kernels/rk3.cuh | 742 +++++ src/core/math_utils.h | 91 + src/standalone/CMakeLists.txt | 10 + src/standalone/autotest.cc | 732 +++++ src/standalone/benchmark.cc | 300 ++ src/standalone/config_loader.cc | 194 ++ src/standalone/config_loader.h | 34 + src/standalone/main.cc | 94 + src/standalone/model/host_memory.cc | 737 +++++ src/standalone/model/host_memory.h | 58 + src/standalone/model/host_timestep.cc | 63 + src/standalone/model/host_timestep.h | 32 + src/standalone/model/model_boundconds.cc | 487 ++++ src/standalone/model/model_boundconds.h | 31 + src/standalone/model/model_diff.h | 353 +++ src/standalone/model/model_reduce.cc | 203 ++ src/standalone/model/model_reduce.h | 37 + src/standalone/model/model_rk3.cc | 1044 +++++++ src/standalone/model/model_rk3.h | 33 + src/standalone/model/modelmesh.h | 36 + src/standalone/renderer.cc | 447 +++ src/standalone/run.h | 35 + src/standalone/simulation.cc | 339 +++ src/standalone/timer_hires.h | 64 + 87 files changed, 18058 insertions(+), 1 deletion(-) create mode 100755 3rdparty/setup_dependencies.sh create mode 100644 CMakeLists.txt create mode 100644 LICENCE.txt create mode 100644 acc/.gitignore create mode 100644 acc/README.md create mode 100755 acc/build_acc.sh create mode 100755 acc/clean.sh create mode 100755 acc/compile.sh create mode 100644 acc/mhd_solver/stencil_assembly.sas create mode 100644 acc/mhd_solver/stencil_process.sps create mode 100755 acc/preprocess.sh create mode 100644 acc/pseudodisk/stencil_process_gravx.sps create mode 100644 acc/pseudodisk/stencil_process_isotherm_gravx.sps create mode 100644 
acc/pseudodisk/stencil_process_isotherm_linegrav.sps create mode 100644 acc/pseudodisk/stencil_process_linegrav.sps create mode 100644 acc/samples/common_header.h create mode 100644 acc/samples/sample_stencil_assembly.sas create mode 100644 acc/samples/sample_stencil_process.sps create mode 100644 acc/src/acc.l create mode 100644 acc/src/acc.y create mode 100644 acc/src/ast.h create mode 100644 acc/src/code_generator.c create mode 100755 acc/test_grammar.sh create mode 100644 analysis/python/.gitignore create mode 100644 analysis/python/README.md create mode 100644 analysis/python/add_to_pythonpath.sh create mode 100644 analysis/python/astar/__init__.py create mode 100644 analysis/python/astar/data/__init__.py create mode 100644 analysis/python/astar/data/read.py create mode 100644 analysis/python/astar/visual/__init__.py create mode 100644 analysis/python/astar/visual/slices.py create mode 100755 analysis/python/calc/convert.sh create mode 100644 analysis/python/calc/galli_shu_plotter.py create mode 100755 analysis/python/calc/purge.sh create mode 100644 analysis/python/calc/shu_selfsim.py create mode 100755 analysis/python/purgepng.sh create mode 100644 analysis/python/samples/README.md create mode 100644 analysis/python/samples/lnrhobound.py create mode 100644 analysis/python/samples/readtest.py create mode 100644 config/astaroth.conf create mode 100644 config/astaroth_pseudodisk.conf create mode 100644 doc/doxygen/.gitignore create mode 100644 doc/manual/manual.md create mode 100644 doxyfile create mode 100644 include/astaroth.h create mode 100755 scripts/ac_mkbuilddir.sh create mode 100755 scripts/auto_optimize.sh create mode 100755 scripts/buildtest.sh create mode 100755 scripts/compile_acc.sh create mode 100755 scripts/fix_style.sh create mode 100644 scripts/gen_rk3_threadblockconf.c create mode 100755 scripts/generate_doc.sh create mode 100644 sourceme.sh create mode 100644 src/core/CMakeLists.txt create mode 100644 src/core/astaroth.cu create mode 100644 
src/core/device.cu create mode 100644 src/core/device.cuh create mode 100644 src/core/errchk.h create mode 100644 src/core/kernels/.gitignore create mode 100644 src/core/kernels/boundconds.cuh create mode 100644 src/core/kernels/kernels.cuh create mode 100644 src/core/kernels/reduce.cuh create mode 100644 src/core/kernels/rk3.cuh create mode 100644 src/core/math_utils.h create mode 100644 src/standalone/CMakeLists.txt create mode 100644 src/standalone/autotest.cc create mode 100644 src/standalone/benchmark.cc create mode 100644 src/standalone/config_loader.cc create mode 100644 src/standalone/config_loader.h create mode 100644 src/standalone/main.cc create mode 100644 src/standalone/model/host_memory.cc create mode 100644 src/standalone/model/host_memory.h create mode 100644 src/standalone/model/host_timestep.cc create mode 100644 src/standalone/model/host_timestep.h create mode 100644 src/standalone/model/model_boundconds.cc create mode 100644 src/standalone/model/model_boundconds.h create mode 100644 src/standalone/model/model_diff.h create mode 100644 src/standalone/model/model_reduce.cc create mode 100644 src/standalone/model/model_reduce.h create mode 100644 src/standalone/model/model_rk3.cc create mode 100644 src/standalone/model/model_rk3.h create mode 100644 src/standalone/model/modelmesh.h create mode 100644 src/standalone/renderer.cc create mode 100644 src/standalone/run.h create mode 100644 src/standalone/simulation.cc create mode 100644 src/standalone/timer_hires.h diff --git a/3rdparty/setup_dependencies.sh b/3rdparty/setup_dependencies.sh new file mode 100755 index 0000000..3eec468 --- /dev/null +++ b/3rdparty/setup_dependencies.sh @@ -0,0 +1,17 @@ +#!/bin/bash +INITIAL_DIR=$(pwd) + + +# Fetch SDL2 +git clone https://github.com/davidsiaw/SDL2.git +cd SDL2 +git pull +mkdir build +cd build && cmake .. 
&& make -j + +# See https://github.com/davidsiaw/SDL2/blob/master/docs/README-linux.md +# if there are isses with building + + +# Done +cd $INITIAL_DIR diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..4b20ce8 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,172 @@ +# +# CMakeLists.txt for generating the makefile for Astaroth. +# Usage: mkdir build && cd build && cmake .. +# +# For example: cmake -DDOUBLE_PRECISION=ON .. +# +# If you want to see the exact flags used during compilation, run +# "make -j VERBOSE=1" +# +# Make sure your machine satisfies the system requirements: +# https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements + +#-------------------General---------------------------------------------------# + +project(ASTAROTH_2.0 CXX) +set (CMAKE_CXX_STANDARD 98) +cmake_minimum_required (VERSION 3.5.1) # Need >= 3.8 for first-class CUDA support +cmake_policy (SET CMP0023 NEW) + + +#-------------------Set user options with default values---------------------# + +#Usage f.ex. cmake -DBUILD_DEBUG=ON .. 
+option(BUILD_DEBUG "Builds the program with extensive error checking" OFF) +option(BUILD_STANDALONE "Builds standalone Astaroth" ON) +option(DOUBLE_PRECISION "Generates double precision code" OFF) +option(TIARA_CLUSTER "Special settings for compilation TIARA GPU cluster" OFF) +option(MULTIGPU_ENABLED "If enabled, uses all the available GPUs" ON) +option(ALTER_CONF "If enabled, loads astaroth.conf from the build directory" OFF) + +#-------------------Determine build type--------------------------------------# + +#Available types (case-sensitive): +#RELEASE (best performance) +#DEBUG (w/ debug information, non-concurrent kernels) +if (BUILD_DEBUG) + set(CMAKE_BUILD_TYPE DEBUG) +else () + set(CMAKE_BUILD_TYPE RELEASE) +endif() +message(STATUS "Build type: " ${CMAKE_BUILD_TYPE}) + + +#----------------------Find packages------------------------------------------# + +# C++ compiler info +message(STATUS "CMAKE_CXX_COMPILER: " ${CMAKE_CXX_COMPILER}) +message(STATUS "CMAKE_CXX_COMPILER: " ${CMAKE_CXX_COMPILER_ID}) + +# SDL 2 +set(SDL2_INCLUDE_DIR ${CMAKE_SOURCE_DIR}/3rdparty/SDL2/include/) +set(SDL2_LIBRARY_DIR ${CMAKE_SOURCE_DIR}/3rdparty/SDL2/build/) +set(SDL2_LIBRARY "SDL2") +include_directories(${SDL2_INCLUDE_DIR}) +link_directories(${SDL2_LIBRARY_DIR}) + +# CUDA +find_package(CUDA) +if (NOT CUDA_FOUND) + # find_package(CUDA REQUIRED) gives a confusing error message if it fails, + # therefore we print the reason here explicitly + message(FATAL_ERROR "CUDA not found") +endif() +include_directories(${CUDA_INCLUDE_DIRS}) + +# OpenMP +find_package(OpenMP) +if (NOT OPENMP_FOUND) + message(WARNING "OpenMP not found. 
All host-side concurrency disabled \ + (lower performance).") +else () + set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") +endif() + +#----------------------Compilation settings-----------------------------------# + +#Debug and verification +#set(CMAKE_VERBOSE_MAKEFILE OFF) +#set(CXX_VERBOSE_BUILD OFF) +#set(CUDA_VERBOSE_BUILD OFF) +#include(CTest) +#add_test(ac_test ac_run) +#find_program(MEMORYCHECK_COMMAND valgrind) +#set(MEMORYCHECK_COMMAND_OPTIONS "--trace-children=yes --leak-check=full" ) + + +#----------------------Setup defines------------------------------------------# + +if (DOUBLE_PRECISION) + add_definitions(-DAC_DOUBLE_PRECISION=1) +else() + add_definitions(-DAC_DOUBLE_PRECISION=0) +endif() + +# A full integration step is benchmarked by default, use this flag to override and +# benchmark RK3 only +if (GEN_BENCHMARK_RK3) + add_definitions(-DGEN_BENCHMARK_RK3=1) +else() + add_definitions(-DGEN_BENCHMARK_RK3=0) +endif() + +if (MULTIGPU_ENABLED) + add_definitions(-DAC_MULTIGPU_ENABLED=1) +else() + add_definitions(-DAC_MULTIGPU_ENABLED=0) +endif() + +#-----------------------TIARA specific options--------------------------------# +#OLD#set (CXX_FLAGS_TIARA "-I/software/opt/cuda/9.0/include/") +# %JP: NOTE! 
This should not be needed anymore because the command +# find_package(CUDA) above should find and include this directory automatically +#USE THIS: +if (TIARA_CLUSTER) + set (CXX_FLAGS_TIARA "-mno-bmi2") +endif() + +#----------------------Setup CXX compilation flags----------------------------# +set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}\ + -O2 -march=native -pipe") + +set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}\ + -O0 -g") + +set (CXX_FLAGS_WARNING "-Wall -Wextra -Werror -Wno-error=unused-parameter\ + -Wno-error=unused-function -Wno-error=unknown-pragmas") + +# Also warn about implicit conversions if the compiler supports it +if (${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU") + set (CXX_FLAGS_WARNING "${CXX_FLAGS_WARNING} -Wdouble-promotion -Wfloat-conversion") +endif() + +# Other flags. -D_FORCE_INLINES is a workaround to some CUDA/C++ "feature" +# which botches the compilation ("memcpy was not declared in this scope") +# (Not required with cc >= 3.0) +#set(CXX_FLAGS_ETC "-D_FORCE_INLINES") + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}\ + ${CXX_FLAGS_WARNING}\ + ${CXX_FLAGS_ETC}\ + ${CXX_FLAGS_TIARA}") # %JP: CXX_FLAGS_TIARA should not be needed, + # see comments in "TIARA specific options" + +message("CXX_FLAGS: " ${CMAKE_CXX_FLAGS}) + + +#----------------------Setup core subdirectories------------------------------# + +#Include root directory (.) so that the following modules can include their +#parent dir (f.ex. #include "common/stuff.h" instead of "../common/stuff") +include_directories(.) 
+include_directories(include) +include_directories(src) + +# CUDA sources +add_subdirectory(src/core) + +#----------------------Link---------------------------------------------------# + +if (BUILD_STANDALONE) + #Define the config directory + if (ALTER_CONF) + set(ASTAROTH_CONF_PATH "${CMAKE_BINARY_DIR}/") + else() + set(ASTAROTH_CONF_PATH "${CMAKE_SOURCE_DIR}/config/") + endif() + + #Add additional subdirectories + add_subdirectory (src/standalone) + cuda_add_executable(ac_run src/standalone/main.cc) + target_link_libraries(ac_run astaroth_standalone astaroth_core ${SDL2_LIBRARY}) +endif() diff --git a/LICENCE.txt b/LICENCE.txt new file mode 100644 index 0000000..a36573e --- /dev/null +++ b/LICENCE.txt @@ -0,0 +1,18 @@ +/* + Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae. + + This file is part of Astaroth. + + Astaroth is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Astaroth is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Astaroth. If not, see . +*/ diff --git a/README.md b/README.md index 462dc46..c358335 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,118 @@ -# Astaroth +# Astaroth - A Multi-GPU library for generic stencil computations + +Astaroth is a single-node multi-GPU library for multiphysics and other problems, which involve stencil computations in a discrete mesh. It's licenced under the terms of the GNU General Public Licence, version 3, or later (see [LICENCE.txt](https://bitbucket.org/miikkavaisala/astaroth-code/src/master/astaroth_2.0/LICENCE.txt)). 
Astaroth ships with a domain-specific language, that can be used to translate high-level representation of the stencil computations into a heavily inlined GPU pipeline. + +## System requirements + +NVIDIA GPU with >= 3.0 compute capability. See https://en.wikipedia.org/wiki/CUDA#GPUs_supported. + +## Building (3rd party libraries) + +1. `cd 3rdparty` +1. `./setup_dependencies.sh` Note: this may take some time. + +## Building (Astaroth 2.0) + +1. `cd astaroth_2.0/build` +1. `cmake -DDOUBLE_PRECISION=OFF -DBUILD_DEBUG=OFF ..` (Use `cmake -D CMAKE_C_COMPILER=icc -D CMAKE_CXX_COMPILER=icpc -DDOUBLE_PRECISION=OFF -DBUILD_DEBUG=OFF ..` if compiling on TIARA) +1. `../scripts/compile_acc.sh && make -j` +1. `./ac_run ` + +If you encounter issues, recheck that the 3rd party libraries were successfully built during the previous step. + +### Available options + +- `-s` simulation +- `-b` benchmark +- `-t` automated test (NOTE! This is expected to fail with the default configuration as there's no CPU model solution for forcing/entropy) + +By default, the program does a real-time visualization of the simulation domain. The camera and the initial conditions can be controller by `arrow keys`, `pgup`, `pgdown` and `spacebar`. + +## Generating documentation + +Run `doxygen doxyfile` in astaroth_2.0 directory. The generated files can be found in `doc/doxygen`. The main page of the documentation will be at `dox/doxygen/astaroth_doc_html/index.html`. + +## Formatting + +If you have clang-format, you may run `scripts/fix_style.sh`. This script will recursively fix style of all the source files down from the current working directory. The script will ask for a confirmation before making any changes. + +## Directory structure + +## Coding style. + +### In a nutshell +- Use [K&R indentation style](https://en.wikipedia.org/wiki/Indentation_style#K&R_style) and 4 space tabs. +- Line width is 100 characters +- Start function names after a linebreak in source files. 
+- [Be generous with `const` type qualifiers](https://isocpp.org/wiki/faq/const-correctness). +- When in doubt, see [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html). + +### Header example: +```cpp +// Licence notice and doxygen description here +#pragma once +#include "avoid_including_headers_here.h" + +/** Doxygen comments */ +void global_function(void); +``` + + +### Source example: +```cpp +#include "parent_header.h" + +#include + +#include "other_headers.h" +#include "more_headers.h" + +typedef struct { + int data; +} SomeStruct; + +static inline int small_function(const SomeStruct& stuff) { return stuff.data; } + +// Pass constant structs always by reference (&) and use const type qualifier. +// Modified structs are always passed as pointers (*), never as references. +// Constant parameters should be on the left-hand side, while non-consts go to the right. +static void +local_function(const SomeStruct& constant_struct, SomeStruct* modified_struct) +{ + modified_struct->data = constant_struct.data; +} + +void +global_function(void) +{ + return; +} +``` +## Miikka's compilation notes + +Modules Modules usen when compiling when compiling + + * intel/2016 + * hdf5/1.8.16_openmpi_1.10.2_ic16.0 + * cmake/3.9.5 + * openmpi/1.10.2_ic16.0 + * gcc/5.3.0 + * cuda/9.0 + +Requires this gcc flag to compile: `-mno-bmi2` Otherwise you get assembler error! + +For stencil pre-processing `flex` and particularly `libfl` is required for `acc/code_generator.c` to compile. + +Need CUDA version 9.2 or above version. + +Comment out cudaGetDeviceCount(&num_devices) in astaroth.cu + +OLD: `astaroth_2.0/acc/build.sh` only work when each line is written individually. (**solution needed**) + + +(**These are here because I don't dare to delete them yet** OLD: Intel compiler does not get correct flags with cmake on default settings. 
+This worked with 1.0: `cmake -D CMAKE_C_COMPILER=icc -D CMAKE_CXX_COMPILER=icpc -DDOUBLE_PRECISION=OFF -DBUILD_DEBUG=OFF ..` +but not this time. Issue with calling c+11 + definin compiler flags correctly in nvcc. + +OLD: I need to put `-I/software/opt/cuda/9.0/include` into the ../CMakeLists.txt so that it compiles. ) diff --git a/acc/.gitignore b/acc/.gitignore new file mode 100644 index 0000000..bc4b7d8 --- /dev/null +++ b/acc/.gitignore @@ -0,0 +1,5 @@ +build +testbin + +# Except this file +!.gitignore diff --git a/acc/README.md b/acc/README.md new file mode 100644 index 0000000..6197fed --- /dev/null +++ b/acc/README.md @@ -0,0 +1,42 @@ +# Dependencies +## Debian/Ubuntu +`apt install flex bison build-essential` + +# Usage +* `./build_acc.sh # Builds the ASPL compiler (acc)` +* `./compile.sh <.sps or .sas source> # Compiles the given stage into CUDA` +* `./test.sh # Tries to compile the sample stages` +* `./clean.sh # Removed directories generated by build_acc.sh and test.sh` + +## Example + +- `./compile.sh src/stencil_assembly.sas # Generates stencil_assembly.cuh` +- `./compile.sh src/stencil_process.sps # Generates stencil_process.cuh` + +# What happens under the hood + +The compiler is made of a scanner (flex), parser (bison), implementation of the abstract syntax tree (AST) and a code generator. +The language is defined by tokens and grammars found in acc.l and acc.y. These files are given as input to flex and bison, which generate the scanning and parsing stages for the compiler. The resulting AST is defined in ast.h. Finally, we traverse the generated AST with our code generator, generating CUDA code. + +## ACC compilation stages + +### In short: +* Preprocess .ac +* Compile preprocessed .ac to .cuh +* Compile .cuh + +### More detailed: +0. A Parser is generated: bison --verbose -d acc.y +0. A Scanner is generated: flex acc.l +0. The compiler is built: gcc -std=gnu11 code_generator.c acc.tab.c lex.yy.c -lfl +0. 
Source files (.sps and .sas) are preprocessed using the GCC preprocessor and cleaned from any residual directives which would be useful when compiling the code further with GCC. We do not need those when compiling with ACC and are not recognized by our grammar. +0. Either the stencil processing stage (.sps) or the stencil assembly stage (.sas) are generated by passing the preprocessed file to acc. This emits the final CUDA code. +0. Compilation is continued with the NVIDIA CUDA compiler + +### Even more detailed: +The NVIDIA CUDA compiler compiles .cuh to .fatbin, which is embedded into a C++ binary containig host code of the program. A fatbin contains .cubin files, which contain the configuration of the GPU and the kernels in a streaming assembly code (.sass). We could also compile for a virtual architecture (.ptx) instead of the actual hardware-specific machine code (.cubin) by passing -code=compute_XX flag to nvcc, which would compile cuda sources at runtime (just-in-time compilation, JIT) when creating the CUDA context. However, we alway know which architecture we want to run the code on and JIT compilation would just increase the time to takes to launch the program. 
+ +nvcc -DAC_DOUBLE_PRECISION=1 -ptx --relocatable-device-code true -O3 -std=c++11 --maxrregcount=255 -ftz=true -gencode arch=compute_60,code=sm_60 device.cu -I ../../include -I ../../ +nvcc -DAC_DOUBLE_PRECISION=1 -cubin --relocatable-device-code true -O3 -std=c++11 --maxrregcount=255 -ftz=true -gencode arch=compute_60,code=sm_60 device.cu -I ../../include -I ../../ +cuobjdump --dump-sass device.cubin > device.sass + diff --git a/acc/build_acc.sh b/acc/build_acc.sh new file mode 100755 index 0000000..ed275d5 --- /dev/null +++ b/acc/build_acc.sh @@ -0,0 +1,25 @@ +#!/bin/bash +cd `dirname $0` # Only operate in the same directory with this script + +COMPILER_NAME="acc" + +SRC_DIR=${PWD}/src +BUILD_DIR=${PWD}/build + +echo "Created" ${BUILD_DIR} + +mkdir -p ${BUILD_DIR} +cd ${BUILD_DIR} + +echo ${BASE_DIR} +echo ${SRC_DIR} +echo ${BUILD_DIR} + +# Generate Bison headers +bison --verbose -d ${SRC_DIR}/${COMPILER_NAME}.y + +## Generate Flex sources and headers +flex ${SRC_DIR}/${COMPILER_NAME}.l + +## Compile the ASPL compiler +gcc -std=gnu11 ${SRC_DIR}/code_generator.c ${COMPILER_NAME}.tab.c lex.yy.c -lfl -I ${BUILD_DIR} -I ${SRC_DIR} -o ${COMPILER_NAME} diff --git a/acc/clean.sh b/acc/clean.sh new file mode 100755 index 0000000..ad012c4 --- /dev/null +++ b/acc/clean.sh @@ -0,0 +1,5 @@ +#!/bin/bash +cd `dirname $0` # Only operate in the same directory with this script + +rm -rf build testbin + diff --git a/acc/compile.sh b/acc/compile.sh new file mode 100755 index 0000000..55831cd --- /dev/null +++ b/acc/compile.sh @@ -0,0 +1,24 @@ +#!/bin/bash +# Usage ./compile + +ACC_DIR=`dirname $0` + +FULL_NAME=$(basename -- $1) +FILENAME="${FULL_NAME%.*}" +EXTENSION="${FULL_NAME##*.}" + +if [ "${EXTENSION}" = "sas" ]; then + echo "Generating stencil assembly stage ${FILENAME}.sas -> stencil_assembly.cuh" + COMPILE_FLAGS="-sas" # Generate stencil assembly stage + CUH_FILENAME="stencil_assembly.cuh" +elif [ "${EXTENSION}" = "sps" ]; then + echo "Generating stencil processing stage: 
${FILENAME}.sps -> stencil_process.cuh" + COMPILE_FLAGS="-sps" # Generate stencil processing stage + CUH_FILENAME="stencil_process.cuh" +else + echo "Error: unknown extension" ${EXTENSION} "of file" ${FULL_NAME} + echo "Extension should be either .sas or .sps" + exit +fi + +${ACC_DIR}/preprocess.sh $1 | ${ACC_DIR}/build/acc ${COMPILE_FLAGS} > ${CUH_FILENAME} diff --git a/acc/mhd_solver/stencil_assembly.sas b/acc/mhd_solver/stencil_assembly.sas new file mode 100644 index 0000000..f9025f9 --- /dev/null +++ b/acc/mhd_solver/stencil_assembly.sas @@ -0,0 +1,26 @@ + +Preprocessed Scalar +value(in Scalar vertex) +{ + return vertex[vertexIdx]; +} + +Preprocessed Vector +gradient(in Scalar vertex) +{ + return (Vector){derx(vertexIdx, vertex), + dery(vertexIdx, vertex), + derz(vertexIdx, vertex)}; +} + +Preprocessed Matrix +hessian(in Scalar vertex) +{ + Matrix hessian; + + hessian.row[0] = (Vector){derxx(vertexIdx, vertex), derxy(vertexIdx, vertex), derxz(vertexIdx, vertex)}; + hessian.row[1] = (Vector){hessian.row[0].y, deryy(vertexIdx, vertex), deryz(vertexIdx, vertex)}; + hessian.row[2] = (Vector){hessian.row[0].z, hessian.row[1].z, derzz(vertexIdx, vertex)}; + + return hessian; +} diff --git a/acc/mhd_solver/stencil_process.sps b/acc/mhd_solver/stencil_process.sps new file mode 100644 index 0000000..520ceb5 --- /dev/null +++ b/acc/mhd_solver/stencil_process.sps @@ -0,0 +1,265 @@ +#define LINDUCTION (1) +#define LENTROPY (1) +#define LTEMPERATURE (0) +#define LGRAVITY (0) + + +// Declare uniforms (i.e. 
device constants) +uniform Scalar cs2_sound; +uniform Scalar nu_visc; +uniform Scalar cp_sound; +uniform Scalar cv_sound; +uniform Scalar mu0; +uniform Scalar eta; +uniform Scalar gamma; +uniform Scalar zeta; + +uniform int nx_min; +uniform int ny_min; +uniform int nz_min; +uniform int nx; +uniform int ny; +uniform int nz; + +Vector +value(in Vector uu) +{ + return (Vector){value(uu.x), value(uu.y), value(uu.z)}; +} + +Matrix +gradients(in Vector uu) +{ + return (Matrix){gradient(uu.x), gradient(uu.y), gradient(uu.z)}; +} + +Scalar +continuity(in Vector uu, in Scalar lnrho) { + return -dot(value(uu), gradient(lnrho)) - divergence(uu); +} + +#if LENTROPY +Vector +momentum(in Vector uu, in Scalar lnrho, in Scalar ss, in Vector aa) { + const Matrix S = stress_tensor(uu); + const Scalar cs2 = cs2_sound * exp(gamma * value(ss) / cp_sound + (gamma - 1) * (value(lnrho) - LNRHO0)); + const Vector j = (Scalar(1.) / mu0) * (gradient_of_divergence(aa) - laplace_vec(aa)); // Current density + const Vector B = curl(aa); + const Scalar inv_rho = Scalar(1.) / exp(value(lnrho)); + + // Regex replace CPU constants with get\(AC_([a-zA-Z_0-9]*)\) + // \1 + const Vector mom = - mul(gradients(uu), value(uu)) + - cs2 * ((Scalar(1.) / cp_sound) * gradient(ss) + gradient(lnrho)) + + inv_rho * cross(j, B) + + nu_visc * ( + laplace_vec(uu) + + Scalar(1. / 3.) * gradient_of_divergence(uu) + + Scalar(2.) * mul(S, gradient(lnrho)) + ) + + zeta * gradient_of_divergence(uu); + return mom; +} +#elif LTEMPERATURE +Vector +momentum(in Vector uu, in Scalar lnrho, in Scalar tt) { + Vector mom; + + const Matrix S = stress_tensor(uu); + + const Vector pressure_term = (cp_sound - cv_sound) * (gradient(tt) + value(tt) * gradient(lnrho)); + + mom = -mul(gradients(uu), value(uu)) - + pressure_term + + nu_visc * + (laplace_vec(uu) + Scalar(1. / 3.) * gradient_of_divergence(uu) + + Scalar(2.) 
* mul(S, gradient(lnrho))) + zeta * gradient_of_divergence(uu); + + #if LGRAVITY + mom = mom - (Vector){0, 0, -10.0}; + #endif + + return mom; +} +#else +Vector +momentum(in Vector uu, in Scalar lnrho) { + Vector mom; + + const Matrix S = stress_tensor(uu); + + // Isothermal: we have constant speed of sound + + mom = -mul(gradients(uu), value(uu)) - + cs2_sound * gradient(lnrho) + + nu_visc * + (laplace_vec(uu) + Scalar(1. / 3.) * gradient_of_divergence(uu) + + Scalar(2.) * mul(S, gradient(lnrho))) + zeta * gradient_of_divergence(uu); + + #if LGRAVITY + mom = mom - (Vector){0, 0, -10.0}; + #endif + + return mom; +} +#endif + + +Vector +induction(in Vector uu, in Vector aa) { + // Note: We do (-nabla^2 A + nabla(nabla dot A)) instead of (nabla x (nabla + // x A)) in order to avoid taking the first derivative twice (did the math, + // yes this actually works. See pg.28 in arXiv:astro-ph/0109497) + // u cross B - ETA * mu0 * (mu0^-1 * [- laplace A + grad div A ]) + const Vector B = curl(aa); + const Vector grad_div = gradient_of_divergence(aa); + const Vector lap = laplace_vec(aa); + + // Note, mu0 is cancelled out + const Vector ind = cross(value(uu), B) - eta * (grad_div - lap); + + return ind; +} + + +#if LENTROPY +Scalar +lnT( in Scalar ss, in Scalar lnrho) { + const Scalar lnT = LNT0 + gamma * value(ss) / cp_sound + + (gamma - Scalar(1.)) * (value(lnrho) - LNRHO0); + return lnT; +} + +// Nabla dot (K nabla T) / (rho T) +Scalar +heat_conduction( in Scalar ss, in Scalar lnrho) { + const Scalar inv_cp_sound = AcReal(1.) 
/ cp_sound; + + const Vector grad_ln_chi = - gradient(lnrho); + + const Scalar first_term = gamma * inv_cp_sound * laplace(ss) + + (gamma - AcReal(1.)) * laplace(lnrho); + const Vector second_term = gamma * inv_cp_sound * gradient(ss) + + (gamma - AcReal(1.)) * gradient(lnrho); + const Vector third_term = gamma * (inv_cp_sound * gradient(ss) + + gradient(lnrho)) + grad_ln_chi; + + const Scalar chi = AC_THERMAL_CONDUCTIVITY / (exp(value(lnrho)) * cp_sound); + return cp_sound * chi * (first_term + dot(second_term, third_term)); +} + +Scalar +heating(const int i, const int j, const int k) { + return 1; +} + +Scalar +entropy(in Scalar ss, in Vector uu, in Scalar lnrho, in Vector aa) { + const Matrix S = stress_tensor(uu); + const Scalar inv_pT = Scalar(1.) / (exp(value(lnrho)) * exp(lnT(ss, lnrho))); + const Vector j = (Scalar(1.) / mu0) * (gradient_of_divergence(aa) - laplace_vec(aa)); // Current density + const Scalar RHS = H_CONST - C_CONST + + eta * (mu0) * dot(j, j) + + Scalar(2.) * exp(value(lnrho)) * nu_visc * contract(S) + + zeta * exp(value(lnrho)) * divergence(uu) * divergence(uu); + + return - dot(value(uu), gradient(ss)) + + inv_pT * RHS + + heat_conduction(ss, lnrho); +} +#endif + +#if LTEMPERATURE +Scalar +heat_transfer(in Vector uu, in Scalar lnrho, in Scalar tt) +{ + const Matrix S = stress_tensor(uu); + const Scalar heat_diffusivity_k = 0.0008; //8e-4; + return -dot(value(uu), gradient(tt)) + heat_diffusivity_k * laplace(tt) + heat_diffusivity_k * dot(gradient(lnrho), gradient(tt)) + nu_visc * contract(S) * (Scalar(1.) 
/ cv_sound) - (gamma - 1) * value(tt) * divergence(uu); +} +#endif + +// Declare input and output arrays using locations specified in the +// array enum in astaroth.h +in Scalar lnrho = VTXBUF_LNRHO; +out Scalar out_lnrho = VTXBUF_LNRHO; + +in Vector uu = (int3) {VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ}; +out Vector out_uu = (int3) {VTXBUF_UUX,VTXBUF_UUY,VTXBUF_UUZ}; + + +#if LINDUCTION +in Vector aa = (int3) {VTXBUF_AX,VTXBUF_AY,VTXBUF_AZ}; +out Vector out_aa = (int3) {VTXBUF_AX,VTXBUF_AY,VTXBUF_AZ}; +#endif + +#if LENTROPY +in Scalar ss = VTXBUF_ENTROPY; +out Scalar out_ss = VTXBUF_ENTROPY; +#endif + +#if LTEMPERATURE +in Scalar tt = VTXBUF_TEMPERATURE; +out Scalar out_tt = VTXBUF_TEMPERATURE; +#endif + +Kernel void +solve(Scalar dt) { + out_lnrho = rk3(out_lnrho, lnrho, continuity(uu, lnrho), dt); + + #if LINDUCTION + out_aa = rk3(out_aa, aa, induction(uu, aa), dt); + #endif + + #if LENTROPY + out_uu = rk3(out_uu, uu, momentum(uu, lnrho, ss, aa), dt); + out_ss = rk3(out_ss, ss, entropy(ss, uu, lnrho, aa), dt); + #elif LTEMPERATURE + out_uu =rk3(out_uu, uu, momentum(uu, lnrho, tt), dt); + out_tt = rk3(out_tt, tt, heat_transfer(uu, lnrho, tt), dt); + #else + out_uu = rk3(out_uu, uu, momentum(uu, lnrho), dt); + #endif +} + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/acc/preprocess.sh b/acc/preprocess.sh new file mode 100755 index 0000000..0ce6fbc --- /dev/null +++ b/acc/preprocess.sh @@ -0,0 +1,4 @@ +#!/bin/bash +# Preprocesses the give file using GCC. This script is usually automatically called in +# ./compile.sh, but may be called also individually for debugging purposes. +gcc -E -x c ${@} | sed "s/#.*//g" diff --git a/acc/pseudodisk/stencil_process_gravx.sps b/acc/pseudodisk/stencil_process_gravx.sps new file mode 100644 index 0000000..32d980a --- /dev/null +++ b/acc/pseudodisk/stencil_process_gravx.sps @@ -0,0 +1,228 @@ +#define LINDUCTION (1) +#define LENTROPY (1) + + +// Declare uniforms (i.e. 
device constants) +uniform Scalar cs2_sound; +uniform Scalar nu_visc; +uniform Scalar cp_sound; +uniform Scalar mu0; +uniform Scalar eta; +uniform Scalar gamma; +uniform Scalar chi; +uniform Scalar zeta; + +uniform int nx_min; +uniform int ny_min; +uniform int nz_min; +uniform int nx; +uniform int ny; +uniform int nz; + +uniform Scalar xorig; +uniform Scalar yorig; +uniform Scalar zorig; + +//Star position +uniform Scalar star_pos_x; +uniform Scalar star_pos_z; +uniform Scalar GM_star; + +//Needed for gravity +uniform Scalar dsx; +uniform Scalar dsy; +uniform Scalar dsz; +uniform Scalar inv_dsx; +uniform Scalar inv_dsy; +uniform Scalar inv_dsz; + +Scalar +distance_x(Vector a, Vector b) +{ + return sqrt(dot(a-b, a-b)); +} + +Vector +value(in Vector uu) +{ + return (Vector){value(uu.x), value(uu.y), value(uu.z)}; +} + +Matrix +gradients(in Vector uu) +{ + return (Matrix){gradient(uu.x), gradient(uu.y), gradient(uu.z)}; +} + +Scalar +continuity(in Vector uu, in Scalar lnrho) { + return -dot(value(uu), gradient(lnrho)) - divergence(uu); +} + +// Gravitation for in negative x-direction. +Vector +grav_force_line(const int3 vertexIdx) +{ + Vector vertex_pos = (Vector){dsx * vertexIdx.x - xorig, dsy * vertexIdx.y - yorig, dsz * vertexIdx.z - zorig}; + Vector star_pos = (Vector){star_pos_x, dsy * vertexIdx.y - yorig, dsz * vertexIdx.z - zorig}; + + const Scalar RR = vertex_pos.x - star_pos.x; + + const Scalar G_force_abs = GM_star / (RR*RR); // Force per unit mass; + + Vector G_force = (Vector){ - G_force_abs, + AcReal(0.0), + AcReal(0.0)}; + + return G_force; +} + +#if LENTROPY +Vector +momentum(in Vector uu, in Scalar lnrho, in Scalar ss, in Vector aa, const int3 vertexIdx) { + Vector mom; + + const Matrix S = stress_tensor(uu); + + mom = -mul(gradients(uu), value(uu)) - + cs2_sound * gradient(lnrho) + + nu_visc * + (laplace_vec(uu) + Scalar(1. / 3.) * gradient_of_divergence(uu) + + Scalar(2.) 
* mul(S, gradient(lnrho))) + zeta * gradient_of_divergence(uu); + + mom = mom - cs2_sound * (Scalar(1.) / cp_sound) * gradient(ss); + + const Vector grad_div = gradient_of_divergence(aa); + const Vector lap = laplace_vec(aa); + const Vector j = (Scalar(1.) / mu0) * (grad_div - lap); + const Vector B = curl(aa); + mom = mom + (Scalar(1.) / exp(value(lnrho))) * cross(j, B); + + mom = mom + grav_force_line(vertexIdx); + + return mom; +} +#else +Vector +momentum(in Vector uu, in Scalar lnrho, const int3 vertexIdx) { + Vector mom; + + const Matrix S = stress_tensor(uu); + + mom = -mul(gradients(uu), value(uu)) - + cs2_sound * gradient(lnrho) + + nu_visc * + (laplace_vec(uu) + Scalar(1. / 3.) * gradient_of_divergence(uu) + + Scalar(2.) * mul(S, gradient(lnrho))) + zeta * gradient_of_divergence(uu); + + mom = mom + grav_force_line(vertexIdx); + + return mom; +} +#endif + + +Vector +induction(in Vector uu, in Vector aa) { + // Note: We do (-nabla^2 A + nabla(nabla dot A)) instead of (nabla x (nabla + // x A)) in order to avoid taking the first derivative twice (did the math, + // yes this actually works. See pg.28 in arXiv:astro-ph/0109497) + // u cross B - ETA * mu0 * (mu0^-1 * [- laplace A + grad div A ]) + const Vector B = curl(aa); + const Vector grad_div = gradient_of_divergence(aa); + const Vector lap = laplace_vec(aa); + + // Note, mu0 is cancelled out + const Vector ind = cross(value(uu), B) - eta * (grad_div - lap); + + return ind; +} + + +#if LENTROPY +Scalar +lnT( in Scalar ss, in Scalar lnrho) { + const Scalar lnT = LNT0 + value(ss) / cp_sound + + (gamma - AcReal(1.)) * (value(lnrho) - LNRHO0); + return lnT; +} + +// Nabla dot (K nabla T) / (rho T) +Scalar +heat_conduction( in Scalar ss, in Scalar lnrho) { + const Scalar inv_cp_sound = AcReal(1.) 
/ cp_sound; + + const Vector grad_ln_chi = (Vector) { + 0, + 0, + 0 + }; // TODO not used + + const Scalar first_term = gamma * inv_cp_sound * laplace(ss) + + (gamma - AcReal(1.)) * laplace(lnrho); + const Vector second_term = gamma * inv_cp_sound * gradient(ss) + + (gamma - AcReal(1.)) * gradient(lnrho); + const Vector third_term = gamma * (inv_cp_sound * gradient(ss) + + gradient(lnrho)) + grad_ln_chi; + + return cp_sound * chi * (first_term + dot(second_term, third_term)); +} + +Scalar +heating(const int i, const int j, const int k) { + return 1; +} + +Scalar +entropy(in Scalar ss, in Vector uu, in Scalar lnrho, in Vector aa) { + const Matrix S = stress_tensor(uu); + + // nabla x nabla x A / mu0 = nabla(nabla dot A) - nabla^2(A) + const Vector j = gradient_of_divergence(aa) - laplace_vec(aa); + + const Scalar inv_pT = AcReal(1.) / (exp(value(lnrho)) + exp(lnT(ss, lnrho))); + + return -dot(value(uu), gradient(ss)) + + inv_pT * (H_CONST - C_CONST + + eta * mu0 * dot(j, j) + + AcReal(2.) 
* exp(value(lnrho)) * nu_visc * contract(S) + + zeta * exp(value(lnrho)) * divergence(uu) * divergence(uu) + ) + heat_conduction(ss, lnrho); +} +#endif + +// Declare input and output arrays using locations specified in the +// array enum in astaroth.h +in Scalar lnrho = VTXBUF_LNRHO; +out Scalar out_lnrho = VTXBUF_LNRHO; + +in Vector uu = (int3) {VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ}; +out Vector out_uu = (int3) {VTXBUF_UUX,VTXBUF_UUY,VTXBUF_UUZ}; + + +#if LINDUCTION +in Vector aa = (int3) {VTXBUF_AX,VTXBUF_AY,VTXBUF_AZ}; +out Vector out_aa = (int3) {VTXBUF_AX,VTXBUF_AY,VTXBUF_AZ}; +#endif + +#if LENTROPY +in Scalar ss = VTXBUF_ENTROPY; +out Scalar out_ss = VTXBUF_ENTROPY; +#endif + +Kernel void +solve(Scalar dt) { + WRITE(out_lnrho, RK3(out_lnrho, lnrho, continuity(uu, lnrho), dt)); + + #if LINDUCTION + WRITE(out_aa, RK3(out_aa, aa, induction(uu, aa), dt)); + #endif + + + #if LENTROPY + WRITE(out_uu, RK3(out_uu, uu, momentum(uu, lnrho, ss, aa, vertexIdx), dt)); + WRITE(out_ss, RK3(out_ss, ss, entropy(ss, uu, lnrho, aa), dt)); + #else + WRITE(out_uu, RK3(out_uu, uu, momentum(uu, lnrho, vertexIdx), dt)); + #endif +} diff --git a/acc/pseudodisk/stencil_process_isotherm_gravx.sps b/acc/pseudodisk/stencil_process_isotherm_gravx.sps new file mode 100644 index 0000000..f79b7ff --- /dev/null +++ b/acc/pseudodisk/stencil_process_isotherm_gravx.sps @@ -0,0 +1,169 @@ + +// Declare uniforms (i.e. 
device constants) +uniform Scalar cs2_sound; +uniform Scalar nu_visc; +uniform Scalar cp_sound; +uniform Scalar mu0; +uniform Scalar eta; +uniform Scalar gamma; +uniform Scalar chi; +uniform Scalar zeta; + +uniform Scalar xorig; +uniform Scalar yorig; +uniform Scalar zorig; + +//Star position +uniform Scalar star_pos_x; +uniform Scalar star_pos_z; +uniform Scalar GM_star; + +uniform int nx_min; +uniform int ny_min; +uniform int nz_min; +uniform int nx; +uniform int ny; +uniform int nz; + +//Needed for gravity +uniform Scalar dsx; +uniform Scalar dsy; +uniform Scalar dsz; +uniform Scalar inv_dsx; +uniform Scalar inv_dsy; +uniform Scalar inv_dsz; + +Scalar +distance_x(Vector a, Vector b) +{ + return sqrt(dot(a-b, a-b)); +} + +Vector +value(in Vector uu) +{ + return (Vector){value(uu.x), value(uu.y), value(uu.z)}; +} + +Matrix +gradients(in Vector uu) +{ + return (Matrix){gradient(uu.x), gradient(uu.y), gradient(uu.z)}; +} + +Scalar +continuity(in Vector uu, in Scalar lnrho) { + return -dot(value(uu), gradient(lnrho)) - divergence(uu); +} + + +// "Line-like" gravity with no y-component +Vector +grav_force_line(const int3 vertexIdx) +{ + Vector vertex_pos = (Vector){dsx * vertexIdx.x - xorig, dsy * vertexIdx.y - yorig, dsz * vertexIdx.z - zorig}; + Vector star_pos = (Vector){star_pos_x, dsy * vertexIdx.y - yorig, dsz * vertexIdx.z - zorig}; + + const Scalar RR = vertex_pos.x - star_pos.x; + + const Scalar G_force_abs = GM_star / (RR*RR); // Force per unit mass; + + Vector G_force = (Vector){ - G_force_abs, + AcReal(0.0), + AcReal(0.0)}; + + return G_force; +} + + +Vector +momentum(in Vector uu, in Scalar lnrho, const int3 vertexIdx) { + Vector mom; + + const Matrix S = stress_tensor(uu); + + mom = -mul(gradients(uu), value(uu)) - + cs2_sound * gradient(lnrho) + + nu_visc * + (laplace_vec(uu) + Scalar(1. / 3.) * gradient_of_divergence(uu) + + Scalar(2.) 
* mul(S, gradient(lnrho))) + zeta * gradient_of_divergence(uu) + + grav_force_line(vertexIdx); + + + return mom; +} + +Vector +induction(in Vector uu, in Vector aa) { + // Note: We do (-nabla^2 A + nabla(nabla dot A)) instead of (nabla x (nabla + // x A)) in order to avoid taking the first derivative twice (did the math, + // yes this actually works. See pg.28 in arXiv:astro-ph/0109497) + // u cross B - ETA * mu0 * (mu0^-1 * [- laplace A + grad div A ]) + const Vector B = curl(aa); + const Vector grad_div = gradient_of_divergence(aa); + const Vector lap = laplace_vec(aa); + + // Note, mu0 is cancelled out + const Vector ind = cross(value(uu), B) - eta * (grad_div - lap); + + return ind; +} + +// Declare input and output arrays using locations specified in the +// array enum in astaroth.h +in Scalar lnrho = VTXBUF_LNRHO; +out Scalar out_lnrho = VTXBUF_LNRHO; + +in Vector uu = (int3) {VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ}; +out Vector out_uu = (int3) {VTXBUF_UUX,VTXBUF_UUY,VTXBUF_UUZ}; + +#if LINDUCTION +in Vector aa = (int3) {VTXBUF_AX,VTXBUF_AY,VTXBUF_AZ}; +out Vector out_aa = (int3) {VTXBUF_AX,VTXBUF_AY,VTXBUF_AZ}; +#endif + +Kernel void +solve(Scalar dt) { + WRITE(out_lnrho, RK3(out_lnrho, lnrho, continuity(uu, lnrho), dt)); + + #if LINDUCTION + WRITE(out_aa, RK3(out_aa, aa, induction(uu, aa), dt)); + #endif + + WRITE(out_uu, RK3(out_uu, uu, momentum(uu, lnrho, vertexIdx), dt)); +} + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/acc/pseudodisk/stencil_process_isotherm_linegrav.sps b/acc/pseudodisk/stencil_process_isotherm_linegrav.sps new file mode 100644 index 0000000..a2b83f1 --- /dev/null +++ b/acc/pseudodisk/stencil_process_isotherm_linegrav.sps @@ -0,0 +1,174 @@ + +// Declare uniforms (i.e. 
device constants) +uniform Scalar cs2_sound; +uniform Scalar nu_visc; +uniform Scalar cp_sound; +uniform Scalar mu0; +uniform Scalar eta; +uniform Scalar gamma; +uniform Scalar chi; +uniform Scalar zeta; + +uniform Scalar xorig; +uniform Scalar yorig; +uniform Scalar zorig; + +//Star position +uniform Scalar star_pos_x; +uniform Scalar star_pos_z; +uniform Scalar GM_star; + +uniform int nx_min; +uniform int ny_min; +uniform int nz_min; +uniform int nx; +uniform int ny; +uniform int nz; + +//Needed for gravity +uniform Scalar dsx; +uniform Scalar dsy; +uniform Scalar dsz; +uniform Scalar inv_dsx; +uniform Scalar inv_dsy; +uniform Scalar inv_dsz; + +Scalar +distance(Vector a, Vector b) +{ + return sqrt(dot(a-b, a-b)); +} + +Vector +value(in Vector uu) +{ + return (Vector){value(uu.x), value(uu.y), value(uu.z)}; +} + +Matrix +gradients(in Vector uu) +{ + return (Matrix){gradient(uu.x), gradient(uu.y), gradient(uu.z)}; +} + +Scalar +continuity(in Vector uu, in Scalar lnrho) { + return -dot(value(uu), gradient(lnrho)) - divergence(uu); +} + + +// "Line-like" gravity with no y-component +Vector +grav_force_line(const int3 vertexIdx) +{ + Vector vertex_pos = (Vector){dsx * vertexIdx.x - xorig, dsy * vertexIdx.y - yorig, dsz * vertexIdx.z - zorig}; + //Vector star_pos = (Vector){star_pos_x - xorig, dsy * vertexIdx.y - yorig, star_pos_z - zorig}; + Vector star_pos = (Vector){star_pos_x, dsy * vertexIdx.y - yorig, star_pos_z}; + //LIKE THIS: Vector star_pos = (Vector){star_pos_x, 0.0, star_pos_z}; + + const Scalar RR = distance(star_pos, vertex_pos); + + const Scalar G_force_abs = GM_star / (RR*RR); // Force per unit mass; + //const Scalar G_force_abs = 1.0; // Simple temp. 
test; + + Vector G_force = (Vector){ - G_force_abs*((vertex_pos.x-star_pos.x)/RR), + AcReal(0.0), + - G_force_abs*((vertex_pos.z-star_pos.z)/RR)}; + + //printf("G_force %e %e %e", G_force_abs.x, G_force_abs.y, G_force_abs.z) + + return G_force; +} + + +Vector +momentum(in Vector uu, in Scalar lnrho, const int3 vertexIdx) { + Vector mom; + + const Matrix S = stress_tensor(uu); + + mom = -mul(gradients(uu), value(uu)) - + cs2_sound * gradient(lnrho) + + nu_visc * + (laplace_vec(uu) + Scalar(1. / 3.) * gradient_of_divergence(uu) + + Scalar(2.) * mul(S, gradient(lnrho))) + zeta * gradient_of_divergence(uu) + + grav_force_line(vertexIdx); + + + return mom; +} + +Vector +induction(in Vector uu, in Vector aa) { + // Note: We do (-nabla^2 A + nabla(nabla dot A)) instead of (nabla x (nabla + // x A)) in order to avoid taking the first derivative twice (did the math, + // yes this actually works. See pg.28 in arXiv:astro-ph/0109497) + // u cross B - ETA * mu0 * (mu0^-1 * [- laplace A + grad div A ]) + const Vector B = curl(aa); + const Vector grad_div = gradient_of_divergence(aa); + const Vector lap = laplace_vec(aa); + + // Note, mu0 is cancelled out + const Vector ind = cross(value(uu), B) - eta * (grad_div - lap); + + return ind; +} + +// Declare input and output arrays using locations specified in the +// array enum in astaroth.h +in Scalar lnrho = VTXBUF_LNRHO; +out Scalar out_lnrho = VTXBUF_LNRHO; + +in Vector uu = (int3) {VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ}; +out Vector out_uu = (int3) {VTXBUF_UUX,VTXBUF_UUY,VTXBUF_UUZ}; + +#if LINDUCTION +in Vector aa = (int3) {VTXBUF_AX,VTXBUF_AY,VTXBUF_AZ}; +out Vector out_aa = (int3) {VTXBUF_AX,VTXBUF_AY,VTXBUF_AZ}; +#endif + +Kernel void +solve(Scalar dt) { + WRITE(out_lnrho, RK3(out_lnrho, lnrho, continuity(uu, lnrho), dt)); + + #if LINDUCTION + WRITE(out_aa, RK3(out_aa, aa, induction(uu, aa), dt)); + #endif + + WRITE(out_uu, RK3(out_uu, uu, momentum(uu, lnrho, vertexIdx), dt)); +} + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + diff --git a/acc/pseudodisk/stencil_process_linegrav.sps b/acc/pseudodisk/stencil_process_linegrav.sps new file mode 100644 index 0000000..ecc6c99 --- /dev/null +++ b/acc/pseudodisk/stencil_process_linegrav.sps @@ -0,0 +1,233 @@ +#define LINDUCTION (1) +#define LENTROPY (1) + + +// Declare uniforms (i.e. device constants) +uniform Scalar cs2_sound; +uniform Scalar nu_visc; +uniform Scalar cp_sound; +uniform Scalar mu0; +uniform Scalar eta; +uniform Scalar gamma; +uniform Scalar chi; +uniform Scalar zeta; + +uniform int nx_min; +uniform int ny_min; +uniform int nz_min; +uniform int nx; +uniform int ny; +uniform int nz; + +uniform Scalar xorig; +uniform Scalar yorig; +uniform Scalar zorig; + +//Star position +uniform Scalar star_pos_x; +uniform Scalar star_pos_z; +uniform Scalar GM_star; + +//Needed for gravity +uniform Scalar dsx; +uniform Scalar dsy; +uniform Scalar dsz; +uniform Scalar inv_dsx; +uniform Scalar inv_dsy; +uniform Scalar inv_dsz; + +Scalar +distance_x(Vector a, Vector b) +{ + return sqrt(dot(a-b, a-b)); +} + +Vector +value(in Vector uu) +{ + return (Vector){value(uu.x), value(uu.y), value(uu.z)}; +} + +Matrix +gradients(in Vector uu) +{ + return (Matrix){gradient(uu.x), gradient(uu.y), gradient(uu.z)}; +} + +Scalar +continuity(in Vector uu, in Scalar lnrho) { + return -dot(value(uu), gradient(lnrho)) - divergence(uu); +} + +// "Line-like" gravity with no y-component +Vector +grav_force_line(const int3 vertexIdx) +{ + Vector vertex_pos = (Vector){dsx * vertexIdx.x - xorig, dsy * vertexIdx.y - yorig, dsz * vertexIdx.z - zorig}; + //Vector star_pos = (Vector){star_pos_x - xorig, dsy * vertexIdx.y - yorig, star_pos_z - zorig}; + Vector star_pos = (Vector){star_pos_x, dsy * vertexIdx.y - yorig, star_pos_z}; + //LIKE THIS: Vector star_pos = (Vector){star_pos_x, 0.0, star_pos_z}; + + const Scalar RR = distance(star_pos, vertex_pos); + + const Scalar G_force_abs = GM_star / (RR*RR); // Force per unit mass; + //const Scalar G_force_abs = 
1.0; // Simple temp. test; + + Vector G_force = (Vector){ - G_force_abs*((vertex_pos.x-star_pos.x)/RR), + AcReal(0.0), + - G_force_abs*((vertex_pos.z-star_pos.z)/RR)}; + + //printf("G_force %e %e %e", G_force_abs.x, G_force_abs.y, G_force_abs.z) + + return G_force; +} + +#if LENTROPY +Vector +momentum(in Vector uu, in Scalar lnrho, in Scalar ss, in Vector aa, const int3 vertexIdx) { + Vector mom; + + const Matrix S = stress_tensor(uu); + + mom = -mul(gradients(uu), value(uu)) - + cs2_sound * gradient(lnrho) + + nu_visc * + (laplace_vec(uu) + Scalar(1. / 3.) * gradient_of_divergence(uu) + + Scalar(2.) * mul(S, gradient(lnrho))) + zeta * gradient_of_divergence(uu); + + mom = mom - cs2_sound * (Scalar(1.) / cp_sound) * gradient(ss); + + const Vector grad_div = gradient_of_divergence(aa); + const Vector lap = laplace_vec(aa); + const Vector j = (Scalar(1.) / mu0) * (grad_div - lap); + const Vector B = curl(aa); + mom = mom + (Scalar(1.) / exp(value(lnrho))) * cross(j, B); + + mom = mom + grav_force_line(vertexIdx); + + return mom; +} +#else +Vector +momentum(in Vector uu, in Scalar lnrho, const int3 vertexIdx) { + Vector mom; + + const Matrix S = stress_tensor(uu); + + mom = -mul(gradients(uu), value(uu)) - + cs2_sound * gradient(lnrho) + + nu_visc * + (laplace_vec(uu) + Scalar(1. / 3.) * gradient_of_divergence(uu) + + Scalar(2.) * mul(S, gradient(lnrho))) + zeta * gradient_of_divergence(uu); + + mom = mom + grav_force_line(vertexIdx); + + return mom; +} +#endif + + +Vector +induction(in Vector uu, in Vector aa) { + // Note: We do (-nabla^2 A + nabla(nabla dot A)) instead of (nabla x (nabla + // x A)) in order to avoid taking the first derivative twice (did the math, + // yes this actually works. 
See pg.28 in arXiv:astro-ph/0109497) + // u cross B - ETA * mu0 * (mu0^-1 * [- laplace A + grad div A ]) + const Vector B = curl(aa); + const Vector grad_div = gradient_of_divergence(aa); + const Vector lap = laplace_vec(aa); + + // Note, mu0 is cancelled out + const Vector ind = cross(value(uu), B) - eta * (grad_div - lap); + + return ind; +} + + +#if LENTROPY +Scalar +lnT( in Scalar ss, in Scalar lnrho) { + const Scalar lnT = LNT0 + value(ss) / cp_sound + + (gamma - AcReal(1.)) * (value(lnrho) - LNRHO0); + return lnT; +} + +// Nabla dot (K nabla T) / (rho T) +Scalar +heat_conduction( in Scalar ss, in Scalar lnrho) { + const Scalar inv_cp_sound = AcReal(1.) / cp_sound; + + const Vector grad_ln_chi = (Vector) { + 0, + 0, + 0 + }; // TODO not used + + const Scalar first_term = gamma * inv_cp_sound * laplace(ss) + + (gamma - AcReal(1.)) * laplace(lnrho); + const Vector second_term = gamma * inv_cp_sound * gradient(ss) + + (gamma - AcReal(1.)) * gradient(lnrho); + const Vector third_term = gamma * (inv_cp_sound * gradient(ss) + + gradient(lnrho)) + grad_ln_chi; + + return cp_sound * chi * (first_term + dot(second_term, third_term)); +} + +Scalar +heating(const int i, const int j, const int k) { + return 1; +} + +Scalar +entropy(in Scalar ss, in Vector uu, in Scalar lnrho, in Vector aa) { + const Matrix S = stress_tensor(uu); + + // nabla x nabla x A / mu0 = nabla(nabla dot A) - nabla^2(A) + const Vector j = gradient_of_divergence(aa) - laplace_vec(aa); + + const Scalar inv_pT = AcReal(1.) / (exp(value(lnrho)) + exp(lnT(ss, lnrho))); + + return -dot(value(uu), gradient(ss)) + + inv_pT * (H_CONST - C_CONST + + eta * mu0 * dot(j, j) + + AcReal(2.) 
* exp(value(lnrho)) * nu_visc * contract(S) + + zeta * exp(value(lnrho)) * divergence(uu) * divergence(uu) + ) + heat_conduction(ss, lnrho); +} +#endif + +// Declare input and output arrays using locations specified in the +// array enum in astaroth.h +in Scalar lnrho = VTXBUF_LNRHO; +out Scalar out_lnrho = VTXBUF_LNRHO; + +in Vector uu = (int3) {VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ}; +out Vector out_uu = (int3) {VTXBUF_UUX,VTXBUF_UUY,VTXBUF_UUZ}; + + +#if LINDUCTION +in Vector aa = (int3) {VTXBUF_AX,VTXBUF_AY,VTXBUF_AZ}; +out Vector out_aa = (int3) {VTXBUF_AX,VTXBUF_AY,VTXBUF_AZ}; +#endif + +#if LENTROPY +in Scalar ss = VTXBUF_ENTROPY; +out Scalar out_ss = VTXBUF_ENTROPY; +#endif + +Kernel void +solve(Scalar dt) { + WRITE(out_lnrho, RK3(out_lnrho, lnrho, continuity(uu, lnrho), dt)); + + #if LINDUCTION + WRITE(out_aa, RK3(out_aa, aa, induction(uu, aa), dt)); + #endif + + + #if LENTROPY + WRITE(out_uu, RK3(out_uu, uu, momentum(uu, lnrho, ss, aa, vertexIdx), dt)); + WRITE(out_ss, RK3(out_ss, ss, entropy(ss, uu, lnrho, aa), dt)); + #else + WRITE(out_uu, RK3(out_uu, uu, momentum(uu, lnrho, vertexIdx), dt)); + #endif +} diff --git a/acc/samples/common_header.h b/acc/samples/common_header.h new file mode 100644 index 0000000..14eed0c --- /dev/null +++ b/acc/samples/common_header.h @@ -0,0 +1,422 @@ +/* + Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae. + + This file is part of Astaroth. + + Astaroth is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Astaroth is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Astaroth. 
If not, see . +*/ + +/** + * @file + * \brief Brief info. + * + * Provides an interface to Astaroth. Contains all the necessary configuration + * structs and functions for running the code on multiple GPUs. + * + * All interface functions declared here (such as acInit()) operate all GPUs + * available in the node under the hood, and the user does not need any + * information about the decomposition, synchronization or such to use these + * functions. + * + */ +#pragma once + +/* Prevent name mangling */ +#ifdef __cplusplus +extern "C" { +#endif + +#include // FLT_EPSILON, etc +#include // size_t +#include // CUDA vector types (float4, etc) + + +/* + * ============================================================================= + * Flags for auto-optimization + * ============================================================================= + */ +#define AUTO_OPTIMIZE (0) // DEPRECATED TODO remove +#define BOUNDCONDS_OPTIMIZE (0) +#define GENERATE_BENCHMARK_DATA (0) + +// Device info +#define REGISTERS_PER_THREAD (255) +#define MAX_REGISTERS_PER_BLOCK (65536) +#define MAX_THREADS_PER_BLOCK (1024) +#define MAX_TB_DIM (MAX_THREADS_PER_BLOCK) +#define NUM_ITERATIONS (10) +#define WARP_SIZE (32) + + +/* + * ============================================================================= + * Compile-time constants used during simulation (user definable) + * ============================================================================= + */ +#define STENCIL_ORDER (6) + +///////////// PAD TEST +// NOTE: works only with nx is divisible by 32 +//#define PAD_LEAD (32 - STENCIL_ORDER/2) +//#define PAD_SIZE (32 - STENCIL_ORDER) +///////////// PAD TEST + +// L-prefix inherited from the old Astaroth, no idea what it means +// MV: L means a Logical switch variale, something having true of false value. 
+#define LFORCING (0) // Note: forcing is disabled currently in the files generated by acc (compiler of our DSL) +#define LINDUCTION (1) +#define LENTROPY (1) +#define LTEMPERATURE (0) + +#define AC_THERMAL_CONDUCTIVITY (AcReal(0.001)) // TODO: make an actual config parameter + +/* + * ============================================================================= + * Identifiers used to construct the parameter lists for AcMeshInfo + * (IntParamType and RealParamType) + * (user definable) + * ============================================================================= + */ +// clang-format off +#define AC_FOR_INT_PARAM_TYPES(FUNC)\ + /* cparams */\ + FUNC(AC_nx), \ + FUNC(AC_ny), \ + FUNC(AC_nz), \ + FUNC(AC_mx), \ + FUNC(AC_my), \ + FUNC(AC_mz), \ + FUNC(AC_nx_min), \ + FUNC(AC_ny_min), \ + FUNC(AC_nz_min), \ + FUNC(AC_nx_max), \ + FUNC(AC_ny_max), \ + FUNC(AC_nz_max), \ + /* Other */\ + FUNC(AC_max_steps), \ + FUNC(AC_save_steps), \ + FUNC(AC_bin_steps), \ + FUNC(AC_bc_type), \ + /* Additional */\ + FUNC(AC_mxy),\ + FUNC(AC_nxy),\ + FUNC(AC_nxyz) +#define AC_FOR_REAL_PARAM_TYPES(FUNC)\ + /* cparams */\ + FUNC(AC_dsx), \ + FUNC(AC_dsy), \ + FUNC(AC_dsz), \ + FUNC(AC_dsmin), \ + /* physical grid*/\ + FUNC(AC_xlen), \ + FUNC(AC_ylen), \ + FUNC(AC_zlen), \ + FUNC(AC_xorig), \ + FUNC(AC_yorig), \ + FUNC(AC_zorig), \ + /*Physical units*/\ + FUNC(AC_unit_density),\ + FUNC(AC_unit_velocity),\ + FUNC(AC_unit_length),\ + /* properties of gravitating star*/\ + FUNC(AC_star_pos_x),\ + FUNC(AC_star_pos_y),\ + FUNC(AC_star_pos_z),\ + FUNC(AC_M_star),\ + /* Run params */\ + FUNC(AC_cdt), \ + FUNC(AC_cdtv), \ + FUNC(AC_cdts), \ + FUNC(AC_nu_visc), \ + FUNC(AC_cs_sound), \ + FUNC(AC_eta), \ + FUNC(AC_mu0), \ + FUNC(AC_relhel), \ + FUNC(AC_cp_sound), \ + FUNC(AC_gamma), \ + FUNC(AC_cv_sound), \ + FUNC(AC_lnT0), \ + FUNC(AC_lnrho0), \ + FUNC(AC_zeta), \ + FUNC(AC_trans),\ + /* Other */\ + FUNC(AC_bin_save_t), \ + /* Initial condition params */\ + FUNC(AC_ampl_lnrho), \ + 
FUNC(AC_ampl_uu), \ + FUNC(AC_angl_uu), \ + FUNC(AC_lnrho_edge),\ + FUNC(AC_lnrho_out),\ + /* Additional helper params */\ + /* (deduced from other params do not set these directly!) */\ + FUNC(AC_G_CONST),\ + FUNC(AC_GM_star),\ + FUNC(AC_sq2GM_star),\ + FUNC(AC_cs2_sound), \ + FUNC(AC_inv_dsx), \ + FUNC(AC_inv_dsy), \ + FUNC(AC_inv_dsz) +// clang-format on + +/* + * ============================================================================= + * Identifiers for VertexBufferHandle + * (i.e. the arrays used to construct AcMesh) + * (user definable) + * ============================================================================= + */ +// clang-format off +#define AC_FOR_HYDRO_VTXBUF_HANDLES(FUNC)\ + FUNC(VTXBUF_LNRHO), \ + FUNC(VTXBUF_UUX), \ + FUNC(VTXBUF_UUY), \ + FUNC(VTXBUF_UUZ), \ + // FUNC(VTXBUF_DYE), + +#if LINDUCTION +#define AC_FOR_INDUCTION_VTXBUF_HANDLES(FUNC)\ + FUNC(VTXBUF_AX), \ + FUNC(VTXBUF_AY), \ + FUNC(VTXBUF_AZ), +#else +#define AC_FOR_INDUCTION_VTXBUF_HANDLES(FUNC) +#endif + +#if LENTROPY +#define AC_FOR_ENTROPY_VTXBUF_HANDLES(FUNC)\ + FUNC(VTXBUF_ENTROPY), +#else +#define AC_FOR_ENTROPY_VTXBUF_HANDLES(FUNC) +#endif + +#if LTEMPERATURE +#define AC_FOR_TEMPERATURE_VTXBUF_HANDLES(FUNC)\ + FUNC(VTXBUF_TEMPERATURE), +#else +#define AC_FOR_TEMPERATURE_VTXBUF_HANDLES(FUNC) +#endif + +#define AC_FOR_VTXBUF_HANDLES(FUNC)\ + AC_FOR_HYDRO_VTXBUF_HANDLES(FUNC)\ + AC_FOR_INDUCTION_VTXBUF_HANDLES(FUNC)\ + AC_FOR_ENTROPY_VTXBUF_HANDLES(FUNC)\ + AC_FOR_TEMPERATURE_VTXBUF_HANDLES(FUNC) +// clang-format on + +/* + * ============================================================================= + * Single/double precision switch + * ============================================================================= + */ +#if AC_DOUBLE_PRECISION == 1 +typedef double AcReal; +typedef double3 AcReal3; +#define AC_REAL_MAX (DBL_MAX) +#define AC_REAL_MIN (DBL_MIN) +#define AC_REAL_EPSILON (DBL_EPSILON) +#else +typedef float AcReal; +typedef float3 AcReal3; +#define 
AC_REAL_MAX (FLT_MAX) +#define AC_REAL_MIN (FLT_MIN) +#define AC_REAL_EPSILON (FLT_EPSILON) +#endif + +typedef struct { + AcReal3 row[3]; +} AcMatrix; + +/* + * ============================================================================= + * Helper macros + * ============================================================================= + */ +#define AC_GEN_ID(X) X +#define AC_GEN_STR(X) #X + +/* + * ============================================================================= + * Error codes + * ============================================================================= + */ +typedef enum { AC_SUCCESS = 0, AC_FAILURE = 1 } AcResult; + +/* + * ============================================================================= + * Reduction types + * ============================================================================= + */ +typedef enum { + RTYPE_MAX, + RTYPE_MIN, + RTYPE_RMS, + RTYPE_RMS_EXP, + NUM_REDUCTION_TYPES +} ReductionType; + +/* + * ============================================================================= + * Definitions for the enums and structs for AcMeshInfo (DO NOT TOUCH) + * ============================================================================= + */ +typedef enum { + AC_FOR_INT_PARAM_TYPES(AC_GEN_ID), + NUM_INT_PARAM_TYPES +} AcIntParam; + +typedef enum { + AC_FOR_REAL_PARAM_TYPES(AC_GEN_ID), + NUM_REAL_PARAM_TYPES +} AcRealParam; + +extern const char* intparam_names[]; // Defined in astaroth.cu +extern const char* realparam_names[]; // Defined in astaroth.cu + +typedef struct { + int int_params[NUM_INT_PARAM_TYPES]; + AcReal real_params[NUM_REAL_PARAM_TYPES]; +} AcMeshInfo; + +/* + * ============================================================================= + * Definitions for the enums and structs for AcMesh (DO NOT TOUCH) + * ============================================================================= + */ +typedef enum { + AC_FOR_VTXBUF_HANDLES(AC_GEN_ID) NUM_VTXBUF_HANDLES +} VertexBufferHandle; + +extern const char* 
vtxbuf_names[]; // Defined in astaroth.cu + +/* +typedef struct { + AcReal* data; +} VertexBuffer; +*/ + +// NOTE: there's no particular benefit declaring AcMesh a class, since +// a library user may already have allocated memory for the vertex_buffers. +// But then we would allocate memory again when the user wants to start +// filling the class with data. => Its better to consider AcMesh as a +// payload-only struct +typedef struct { + AcReal* vertex_buffer[NUM_VTXBUF_HANDLES]; + AcMeshInfo info; +} AcMesh; + +#define AC_VTXBUF_SIZE(mesh_info) \ + ((size_t)(mesh_info.int_params[AC_mx] * mesh_info.int_params[AC_my] * \ + mesh_info.int_params[AC_mz])) + +#define AC_VTXBUF_SIZE_BYTES(mesh_info) \ + (sizeof(AcReal) * AC_VTXBUF_SIZE(mesh_info)) + +#define AC_VTXBUF_COMPDOMAIN_SIZE(mesh_info) \ + (mesh_info.int_params[AC_nx] * mesh_info.int_params[AC_ny] * \ + mesh_info.int_params[AC_nz]) + +#define AC_VTXBUF_COMPDOMAIN_SIZE_BYTES(mesh_info) \ + (sizeof(AcReal) * AC_VTXBUF_COMPDOMAIN_SIZE(mesh_info)) + +#define AC_VTXBUF_IDX(i, j, k, mesh_info) \ + ((i) + (j)*mesh_info.int_params[AC_mx] + \ + (k)*mesh_info.int_params[AC_mx] * mesh_info.int_params[AC_my]) + +/* + * ============================================================================= + * Astaroth interface + * ============================================================================= + */ +/** Starting point of all GPU computation. Handles the allocation and +initialization of *all memory needed on all GPUs in the node*. In other words, +setups everything GPU-side so that calling any other GPU interface function +afterwards does not result in illegal memory accesses. 
*/ +AcResult acInit(const AcMeshInfo& mesh_info); + +/** Splits the host_mesh and distributes it among the GPUs in the node */ +AcResult acLoad(const AcMesh& host_mesh); +AcResult acLoadWithOffset(const AcMesh& host_mesh, const int3& start, const int num_vertices); + +/** Does all three steps of the RK3 integration and computes the boundary +conditions when necessary. Note that the boundary conditions are not applied +after the final integration step. +The result can be fetched to CPU memory with acStore(). */ +AcResult acIntegrate(const AcReal& dt); + +/** Performs a single RK3 step without computing boundary conditions. */ +AcResult acIntegrateStep(const int& isubstep, const AcReal& dt); + +/** Applies boundary conditions on the GPU meshs and communicates the + ghost zones among GPUs if necessary */ +AcResult acBoundcondStep(void); + +/** Performs a scalar reduction on all GPUs in the node and returns the result. + */ +AcReal acReduceScal(const ReductionType& rtype, const VertexBufferHandle& a); + +/** Performs a vector reduction on all GPUs in the node and returns the result. + */ +AcReal acReduceVec(const ReductionType& rtype, const VertexBufferHandle& a, + const VertexBufferHandle& b, const VertexBufferHandle& c); + +/** Stores the mesh distributed among GPUs of the node back to a single host + * mesh */ +AcResult acStore(AcMesh* host_mesh); +AcResult acStoreWithOffset(const int3& start, const int num_vertices, AcMesh* host_mesh); + +/** Frees all GPU allocations and resets all devices in the node. Should be + * called at exit. */ +AcResult acQuit(void); + +/** Synchronizes all devices. All calls to Astaroth are asynchronous by default + unless otherwise stated. 
*/ +AcResult acSynchronize(void); + +/* End extern "C" */ +#ifdef __cplusplus +} +#endif + +/* + * ============================================================================= + * Notes + * ============================================================================= + */ +/* +typedef enum { + VTX_BUF_LNRHO, + VTX_BUF_UUX, + VTX_BUF_UUY, + VTX_BUF_UUZ, + NUM_VERTEX_BUFFER_HANDLES +} VertexBufferHandle + +// LNRHO etc +typedef struct { + AcReal* data; +} VertexBuffer; + +// Host +typedef struct { + VertexBuffer vertex_buffers[NUM_VERTEX_BUFFER_HANDLES]; + MeshInfo info; +} Mesh; + +// Device +typedef struct { + VertexBuffer in[NUM_VERTEX_BUFFER_HANDLES]; + VertexBuffer out[NUM_VERTEX_BUFFER_HANDLES]; +} VertexBufferArray; +*/ diff --git a/acc/samples/sample_stencil_assembly.sas b/acc/samples/sample_stencil_assembly.sas new file mode 100644 index 0000000..4ddd64c --- /dev/null +++ b/acc/samples/sample_stencil_assembly.sas @@ -0,0 +1,49 @@ +// TODO comments and reformatting + +//Scalar +//dostuff(in Scalar uux) +//{ +// return uux[vertexIdx.x, vertexIdx.y, vertexIdx.z]; +//} + +// stencil_assembly.in +Preprocessed Scalar +some_exotic_stencil_computation(in Scalar uux) +{ + //#if STENCIL_ORDER == 2 + // const Scalar coefficients[] = {1, 1, 1}; + //#else if STENCIL_ORDER == 4 + // const Scalar coefficients[] = {....}; + //#endif + + int i = vertexIdx.x; + int j = vertexIdx.y; + int k = vertexIdx.z; + const Scalar coefficients[] = {1, 2, 3}; + + return coefficients[0] * uux[i-1, j, k] + + coefficients[1] * uux[i, j, k] + + coefficients[2] * uux[i+1, j, k]; +} + +// stencil_process.in +//in Scalar uux_in = VTXBUF_UUX; +//out Scalar uux_out = VTXBUF_UUX; + + +//Kernel +//solve(Scalar dt) +//{ +// uux_out = some_exotic_stencil(uux_in); +//} + + + + + + + + + + + diff --git a/acc/samples/sample_stencil_process.sps b/acc/samples/sample_stencil_process.sps new file mode 100644 index 0000000..219e40e --- /dev/null +++ b/acc/samples/sample_stencil_process.sps @@ -0,0 +1,149 @@ 
+// TODO comments and reformatting + +uniform Scalar dsx; +uniform Scalar dsy; +uniform Scalar dsz; + +uniform Scalar GM_star; +// Other uniforms types than Scalar or int not yet supported + +// BUILTIN +//Scalar dot(...){} + +// BUILTIN +//Scalar distance(Vector a, Vector b) { return sqrt(dot(a, b)); } + +// BUILTIN +// Scalar first_derivative(Scalar pencil[], Scalar inv_ds) { return pencil[3] * inv_ds; } + +Scalar first_derivative(Scalar pencil[], Scalar inv_ds) +{ + Scalar res = 0; + for (int i = 0; i < STENCIL_ORDER+1; ++i) { + res = res + pencil[i]; + } + return inv_ds * res; +} + +Scalar distance(Vector a, Vector b) +{ + return sqrt(a.x * b.x + a.y * b.y + a.z * b.z); +} + +Scalar +gravity_potential(int i, int j, int k) +{ + Vector star_pos = (Vector){0, 0, 0}; + Vector vertex_pos = (Vector){dsx * i, dsy * j, dsz * k}; + return GM_star / distance(star_pos, vertex_pos); +} + +Scalar +gradx_gravity_potential(int i, int j, int k) +{ + Scalar pencil[STENCIL_ORDER + 1]; + for (int offset = -STENCIL_ORDER; offset <= STENCIL_ORDER; ++offset) { + pencil[offset+STENCIL_ORDER] = gravity_potential(i + offset, j, k); + } + + Scalar inv_ds = Scalar(1.) / dsx; + return first_derivative(pencil, inv_ds); +} + +Scalar +grady_gravity_potential(int i, int j, int k) +{ + Scalar pencil[STENCIL_ORDER + 1]; + for (int offset = -STENCIL_ORDER; offset <= STENCIL_ORDER; ++offset) { + pencil[offset+STENCIL_ORDER] = gravity_potential(i, j + offset, k); + } + + Scalar inv_ds = Scalar(1.) / dsy; + return first_derivative(pencil, inv_ds); +} + +Scalar +gradz_gravity_potential(int i, int j, int k) +{ + Scalar pencil[STENCIL_ORDER + 1]; + for (int offset = -STENCIL_ORDER; offset <= STENCIL_ORDER; ++offset) { + pencil[offset+STENCIL_ORDER] = gravity_potential(i, j, k + offset); + } + + Scalar inv_ds = Scalar(1.) 
/ dsz; + return first_derivative(pencil, inv_ds); +} + +Vector +momentum(int i, int j, int k, in Vector uu) +{ + + Vector gravity_potential = (Vector){gradx_gravity_potential(i, j, k), + grady_gravity_potential(i, j, k), + gradz_gravity_potential(i, j, k)}; + + + return gravity_potential; +} + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/acc/src/acc.l b/acc/src/acc.l new file mode 100644 index 0000000..e68fe8b --- /dev/null +++ b/acc/src/acc.l @@ -0,0 +1,56 @@ +%option yylineno + +D [0-9] +L [a-zA-Z_] + +%{ +#include "acc.tab.h" +%} + +%% + +"Scalar" { return SCALAR; } /* Builtin types */ +"Vector" { return VECTOR; } +"Matrix" { return MATRIX; } +"void" { return VOID; } /* Rest of the types inherited from C */ +"int" { return INT; } +"int3" { return INT3; } + +"Kernel" { return KERNEL; } /* Function specifiers */ +"Preprocessed" { return PREPROCESSED; } + +"const" { return CONSTANT; } +"in" { return IN; } /* Device func storage specifiers */ +"out" { return OUT; } +"uniform" { return UNIFORM; } + +"else if" { return ELIF; } +"if" { return IF; } +"else" { return ELSE; } +"for" { return FOR; } +"while" { return WHILE; } + +"return" { return RETURN; } + +{D}+"."?{D}*[flud]? { return NUMBER; } /* Literals */ +"."{D}+[flud]? { return NUMBER; } +{L}({L}|{D})* { return IDENTIFIER; } +\"(.)*\" { return IDENTIFIER; } /* String */ + +"==" { return LEQU; }/* Logic operations */ +"&&" { return LAND; } +"||" { return LOR; } +"<=" { return LLEQU; } + +"++" { return INPLACE_INC; } +"--" { return INPLACE_DEC; } + +[-+*/;=\[\]{}(),\.<>] { return yytext[0]; } /* Characters */ + + +"//".* { /* Skip regular comments */ } +[ \t\n\v\r]+ { /* Ignore whitespace, tabs and newlines */ } +. 
{ printf("unrecognized char %d: [%c]\n", *yytext, *yytext); }
+
+
+%%
diff --git a/acc/src/acc.y b/acc/src/acc.y
new file mode 100644
index 0000000..db49225
--- /dev/null
+++ b/acc/src/acc.y
@@ -0,0 +1,234 @@
+%{
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "ast.h"
+
+extern char* yytext;
+
+int yylex();
+int yyerror(const char* str);
+int yyget_lineno();
+
+#define YYSTYPE ASTNode* // Sets the default type
+%}
+
+%token CONSTANT IN OUT UNIFORM
+%token IDENTIFIER NUMBER
+%token RETURN
+%token SCALAR VECTOR MATRIX
+%token VOID INT INT3
+%token IF ELSE FOR WHILE ELIF
+%token LEQU LAND LOR LLEQU
+%token KERNEL PREPROCESSED
+%token INPLACE_INC INPLACE_DEC
+
+%%
+
+root: program { root->lhs = $1; }
+    ;
+
+program: /* Empty*/ { $$ = astnode_create(NODE_UNKNOWN, NULL, NULL); }
+    | program function_definition { $$ = astnode_create(NODE_UNKNOWN, $1, $2); }
+    | program assignment ';' /* Global definition */ { $$ = astnode_create(NODE_UNKNOWN, $1, $2); $$->postfix = ';'; }
+    | program declaration ';' /* Global declaration */ { $$ = astnode_create(NODE_UNKNOWN, $1, $2); $$->postfix = ';'; }
+    ;
+
+/*
+ * =============================================================================
+ * Functions
+ * =============================================================================
+ */
+
+function_definition: function_declaration compound_statement { $$ = astnode_create(NODE_FUNCTION_DEFINITION, $1, $2); }
+    ;
+
+function_declaration: declaration function_parameter_declaration { $$ = astnode_create(NODE_FUNCTION_DECLARATION, $1, $2); }
+    ;
+
+function_parameter_declaration: '(' ')' { $$ = astnode_create(NODE_FUNCTION_PARAMETER_DECLARATION, NULL, NULL); $$->prefix = '('; $$->postfix = ')'; }
+    | '(' declaration_list ')' { $$ = astnode_create(NODE_FUNCTION_PARAMETER_DECLARATION, $2, NULL); $$->prefix = '('; $$->postfix = ')'; }
+    ;
+
+/*
+ * =============================================================================
+ * Statement
+ *
============================================================================= + */ +statement_list: statement { $$ = astnode_create(NODE_UNKNOWN, $1, NULL); } + | statement_list statement { $$ = astnode_create(NODE_UNKNOWN, $1, $2); } + ; + +compound_statement: '{' '}' { $$ = astnode_create(NODE_COMPOUND_STATEMENT, NULL, NULL); $$->prefix = '{'; $$->postfix = '}'; } + | '{' statement_list '}' { $$ = astnode_create(NODE_COMPOUND_STATEMENT, $2, NULL); $$->prefix = '{'; $$->postfix = '}'; } + ; + +statement: selection_statement { $$ = astnode_create(NODE_UNKNOWN, $1, NULL); } + | iteration_statement { $$ = astnode_create(NODE_UNKNOWN, $1, NULL); } + | exec_statement ';' { $$ = astnode_create(NODE_UNKNOWN, $1, NULL); $$->postfix = ';'; } + ; + +selection_statement: IF expression else_selection_statement { $$ = astnode_create(NODE_UNKNOWN, $2, $3); $$->prefix = IF; } + ; + +else_selection_statement: compound_statement { $$ = astnode_create(NODE_UNKNOWN, $1, NULL); } + | compound_statement elif_selection_statement { $$ = astnode_create(NODE_UNKNOWN, $1, $2); } + | compound_statement ELSE compound_statement { $$ = astnode_create(NODE_UNKNOWN, $1, $3); $$->infix = ELSE; } + ; + +elif_selection_statement: ELIF expression else_selection_statement { $$ = astnode_create(NODE_UNKNOWN, $2, $3); $$->prefix = ELIF; } + ; + +iteration_statement: WHILE expression compound_statement { $$ = astnode_create(NODE_UNKNOWN, $2, $3); $$->prefix = WHILE; } + | FOR for_expression compound_statement { $$ = astnode_create(NODE_UNKNOWN, $2, $3); $$->prefix = FOR; } + ; + +for_expression: '(' for_init_param for_other_params ')' { $$ = astnode_create(NODE_UNKNOWN, $2, $3); $$->prefix = '('; $$->postfix = ')'; } + ; + +for_init_param: expression ';' { $$ = astnode_create(NODE_UNKNOWN, $1, NULL); $$->postfix = ';'; } + | assignment ';' { $$ = astnode_create(NODE_UNKNOWN, $1, NULL); $$->postfix = ';'; } + ; + +for_other_params: expression ';' { $$ = astnode_create(NODE_UNKNOWN, $1, NULL); $$->postfix 
= ';'; } + | expression ';' expression { $$ = astnode_create(NODE_UNKNOWN, $1, $3); $$->infix = ';'; } + ; + +exec_statement: declaration { $$ = astnode_create(NODE_UNKNOWN, $1, NULL); } + | assignment { $$ = astnode_create(NODE_UNKNOWN, $1, NULL); } + | expression { $$ = astnode_create(NODE_UNKNOWN, $1, NULL); } + | return return_statement { $$ = astnode_create(NODE_UNKNOWN, $1, $2); } + ; + +assignment: declaration '=' expression { $$ = astnode_create(NODE_UNKNOWN, $1, $3); $$->infix = '='; } + | expression '=' expression { $$ = astnode_create(NODE_UNKNOWN, $1, $3); $$->infix = '='; } + ; + +return_statement: /* Empty */ { $$ = astnode_create(NODE_UNKNOWN, NULL, NULL); } + | expression { $$ = astnode_create(NODE_UNKNOWN, $1, NULL); } + ; + +/* + * ============================================================================= + * Declaration + * ============================================================================= + */ + +declaration_list: declaration { $$ = astnode_create(NODE_UNKNOWN, $1, NULL); } + | declaration_list ',' declaration { $$ = astnode_create(NODE_UNKNOWN, $1, $3); $$->infix = ','; } + ; + +declaration: type_declaration identifier { $$ = astnode_create(NODE_DECLARATION, $1, $2); } // Note: accepts only one type qualifier. Good or not? 
+ | type_declaration array_declaration { $$ = astnode_create(NODE_DECLARATION, $1, $2); } + ; + +array_declaration: identifier '[' ']' { $$ = astnode_create(NODE_UNKNOWN, $1, NULL); $$->infix = '['; $$->postfix = ']'; } + | identifier '[' expression ']' { $$ = astnode_create(NODE_UNKNOWN, $1, $3); $$->infix = '['; $$->postfix = ']'; } + ; + +type_declaration: type_specifier { $$ = astnode_create(NODE_UNKNOWN, $1, NULL); } + | type_qualifier type_specifier { $$ = astnode_create(NODE_UNKNOWN, $1, $2); } + ; + +/* + * ============================================================================= + * Expressions + * ============================================================================= + */ +expression_list: expression { $$ = astnode_create(NODE_UNKNOWN, $1, NULL); } + | expression_list ',' expression { $$ = astnode_create(NODE_UNKNOWN, $1, $3); $$->infix = ','; } + ; + +expression: unary_expression { $$ = astnode_create(NODE_UNKNOWN, $1, NULL); } + | expression binary_expression { $$ = astnode_create(NODE_UNKNOWN, $1, $2); } + ; + +binary_expression: binary_operator unary_expression { $$ = astnode_create(NODE_UNKNOWN, $1, $2); } + ; + +unary_expression: postfix_expression { $$ = astnode_create(NODE_UNKNOWN, $1, NULL); } + | unary_operator postfix_expression { $$ = astnode_create(NODE_UNKNOWN, $1, $2); } + ; + +postfix_expression: primary_expression { $$ = astnode_create(NODE_UNKNOWN, $1, NULL); } + | postfix_expression '[' expression_list ']' /* Subscript */ { $$ = astnode_create(NODE_MULTIDIM_SUBSCRIPT_EXPRESSION, $1, $3); $$->infix = '['; $$->postfix = ']'; } + | cast_expression '{' expression_list '}' /* Array */ { $$ = astnode_create(NODE_UNKNOWN, $1, $3); $$->infix = '{'; $$->postfix = '}'; } + | postfix_expression '(' ')' /* Function call */ { $$ = astnode_create(NODE_UNKNOWN, $1, NULL); $$->infix = '('; $$->postfix = ')'; } + | postfix_expression '(' expression_list ')' /* Function call */ { $$ = astnode_create(NODE_UNKNOWN, $1, $3); $$->infix = '('; 
$$->postfix = ')'; } + | type_specifier '(' expression_list ')' /* Cast */ { $$ = astnode_create(NODE_UNKNOWN, $1, $3); $$->infix = '('; $$->postfix = ')'; } + | postfix_expression '.' identifier /* Member access */ { $$ = astnode_create(NODE_UNKNOWN, $1, $3); $$->infix = '.'; } + ; + +cast_expression: /* Empty: implicit cast */ { $$ = astnode_create(NODE_UNKNOWN, NULL, NULL); } + | '(' type_specifier ')' { $$ = astnode_create(NODE_UNKNOWN, $2, NULL); $$->prefix = '('; $$->postfix = ')'; } + ; + +primary_expression: identifier { $$ = astnode_create(NODE_UNKNOWN, $1, NULL); } + | number { $$ = astnode_create(NODE_UNKNOWN, $1, NULL); } + | '(' expression ')' { $$ = astnode_create(NODE_UNKNOWN, $2, NULL); $$->prefix = '('; $$->postfix = ')'; } + ; + + + +/* + * ============================================================================= + * Terminals + * ============================================================================= + */ + +binary_operator: '+' { $$ = astnode_create(NODE_UNKNOWN, NULL, NULL); $$->infix = yytext[0]; } + | '-' { $$ = astnode_create(NODE_UNKNOWN, NULL, NULL); $$->infix = yytext[0]; } + | '/' { $$ = astnode_create(NODE_UNKNOWN, NULL, NULL); $$->infix = yytext[0]; } + | '*' { $$ = astnode_create(NODE_UNKNOWN, NULL, NULL); $$->infix = yytext[0]; } + | '<' { $$ = astnode_create(NODE_UNKNOWN, NULL, NULL); $$->infix = yytext[0]; } + | '>' { $$ = astnode_create(NODE_UNKNOWN, NULL, NULL); $$->infix = yytext[0]; } + | LEQU { $$ = astnode_create(NODE_UNKNOWN, NULL, NULL); astnode_set_buffer(yytext, $$); } + | LAND { $$ = astnode_create(NODE_UNKNOWN, NULL, NULL); astnode_set_buffer(yytext, $$); } + | LOR { $$ = astnode_create(NODE_UNKNOWN, NULL, NULL); astnode_set_buffer(yytext, $$); } + | LLEQU { $$ = astnode_create(NODE_UNKNOWN, NULL, NULL); astnode_set_buffer(yytext, $$); } + ; + +unary_operator: '-' /* C-style casts are disallowed, would otherwise be defined here */ { $$ = astnode_create(NODE_UNKNOWN, NULL, NULL); $$->infix = yytext[0]; } + | 
'!' { $$ = astnode_create(NODE_UNKNOWN, NULL, NULL); $$->infix = yytext[0]; } + | INPLACE_INC { $$ = astnode_create(NODE_UNKNOWN, NULL, NULL); $$->token = INPLACE_INC; } + | INPLACE_DEC { $$ = astnode_create(NODE_UNKNOWN, NULL, NULL); $$->token = INPLACE_DEC; } + ; + +type_qualifier: KERNEL { $$ = astnode_create(NODE_TYPE_QUALIFIER, NULL, NULL); $$->token = KERNEL; } + | PREPROCESSED { $$ = astnode_create(NODE_TYPE_QUALIFIER, NULL, NULL); $$->token = PREPROCESSED; } + | CONSTANT { $$ = astnode_create(NODE_TYPE_QUALIFIER, NULL, NULL); $$->token = CONSTANT; } + | IN { $$ = astnode_create(NODE_TYPE_QUALIFIER, NULL, NULL); $$->token = IN; } + | OUT { $$ = astnode_create(NODE_TYPE_QUALIFIER, NULL, NULL); $$->token = OUT; } + | UNIFORM { $$ = astnode_create(NODE_TYPE_QUALIFIER, NULL, NULL); $$->token = UNIFORM; } + ; + +type_specifier: VOID { $$ = astnode_create(NODE_TYPE_SPECIFIER, NULL, NULL); $$->token = VOID; } + | INT { $$ = astnode_create(NODE_TYPE_SPECIFIER, NULL, NULL); $$->token = INT; } + | INT3 { $$ = astnode_create(NODE_TYPE_SPECIFIER, NULL, NULL); $$->token = INT3; } + | SCALAR { $$ = astnode_create(NODE_TYPE_SPECIFIER, NULL, NULL); $$->token = SCALAR; } + | VECTOR { $$ = astnode_create(NODE_TYPE_SPECIFIER, NULL, NULL); $$->token = VECTOR; } + | MATRIX { $$ = astnode_create(NODE_TYPE_SPECIFIER, NULL, NULL); $$->token = MATRIX; } + ; + +identifier: IDENTIFIER { $$ = astnode_create(NODE_IDENTIFIER, NULL, NULL); astnode_set_buffer(yytext, $$); } + ; + +number: NUMBER { $$ = astnode_create(NODE_UNKNOWN, NULL, NULL); astnode_set_buffer(yytext, $$); } + ; + +return: RETURN { $$ = astnode_create(NODE_UNKNOWN, NULL, NULL); astnode_set_buffer(yytext, $$); } + ; + +%% + +void +print(void) +{ + printf("%s\n", yytext); +} + +int +yyerror(const char* str) +{ + fprintf(stderr, "%s on line %d when processing char %d: [%s]\n", str, yyget_lineno(), *yytext, yytext); +} diff --git a/acc/src/ast.h b/acc/src/ast.h new file mode 100644 index 0000000..830a8c1 --- /dev/null +++ 
b/acc/src/ast.h
@@ -0,0 +1,126 @@
+/*
+    Nodes for the Abstract Syntax Tree
+
+    Statement: syntactic unit that expresses some action.
+    May have internal components, expressions, which are evaluated
+
+    Statements: return value
+    block
+*/
+#include <stdlib.h>
+#include <string.h>
+
+#define BUFFER_SIZE (4096)
+
+#define GEN_ID(X) X
+#define GEN_STR(X) #X
+
+#define FOR_NODE_TYPES(FUNC) \
+    FUNC(NODE_UNKNOWN), \
+    FUNC(NODE_DEFINITION), \
+    FUNC(NODE_GLOBAL_DEFINITION), \
+    FUNC(NODE_DECLARATION), \
+    FUNC(NODE_TYPE_QUALIFIER), \
+    FUNC(NODE_TYPE_SPECIFIER), \
+    FUNC(NODE_IDENTIFIER), \
+    FUNC(NODE_FUNCTION_DEFINITION), \
+    FUNC(NODE_FUNCTION_DECLARATION), \
+    FUNC(NODE_COMPOUND_STATEMENT), \
+    FUNC(NODE_FUNCTION_PARAMETER_DECLARATION), \
+    FUNC(NODE_MULTIDIM_SUBSCRIPT_EXPRESSION)
+
+/*
+// Recreating strdup is not needed when using the GNU compiler.
+// Let's also just say that anything but the GNU
+// compiler is NOT supported, since there are also
+// some gcc-specific calls in the files generated
+// by flex and being completely compiler-independent is
+// not a priority right now
+#ifndef strdup
+static inline char*
+strdup(const char* in)
+{
+    const size_t len = strlen(in) + 1;
+    char* out = malloc(len);
+
+    if (out) {
+        memcpy(out, in, len);
+        return out;
+    } else {
+        return NULL;
+    }
+}
+#endif
+*/
+
+typedef enum {
+    FOR_NODE_TYPES(GEN_ID),
+    NUM_NODE_TYPES
+} NodeType;
+
+typedef struct astnode_s {
+    int id;
+    struct astnode_s* lhs;
+    struct astnode_s* rhs;
+    NodeType type; // Type of the AST node
+    char* buffer;  // Identifiers and other strings (empty by default)
+
+    int token;   // Type of a terminal (that is not a simple char)
+    int prefix;  // Tokens. Also makes the grammar since we don't have
+    int infix;   // to divide it into max two-child rules
+    int postfix; // (which makes it much harder to read)
+} ASTNode;
+
+
+static inline ASTNode*
+astnode_create(const NodeType type, ASTNode* lhs, ASTNode* rhs)
+{
+    ASTNode* node = malloc(sizeof(node[0]));
+
+    static int id_counter = 0;
+    node->id = id_counter++;
+    node->type = type;
+    node->lhs = lhs;
+    node->rhs = rhs;
+    node->buffer = NULL;
+
+    node->prefix = node->infix = node->postfix = 0;
+
+    return node;
+}
+
+static inline void
+astnode_set_buffer(const char* buffer, ASTNode* node)
+{
+    node->buffer = strdup(buffer);
+}
+
+static inline void
+astnode_destroy(ASTNode* node)
+{
+    if (node->lhs)
+        astnode_destroy(node->lhs);
+    if (node->rhs)
+        astnode_destroy(node->rhs);
+    if (node->buffer)
+        free(node->buffer);
+    free(node);
+}
+
+
+extern ASTNode* root;
+
+/*
+typedef enum {
+    SCOPE_BLOCK
+} ScopeType;
+
+typedef struct symbol_s {
+    int type_specifier;
+    char* identifier;
+    int scope;
+    struct symbol_s* next;
+} Symbol;
+
+extern ASTNode* symbol_table;
+*/
diff --git a/acc/src/code_generator.c b/acc/src/code_generator.c
new file mode 100644
index 0000000..66d4c9d
--- /dev/null
+++ b/acc/src/code_generator.c
@@ -0,0 +1,569 @@
+/*
+    Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
+ * \brief Brief info.
+ *
+ * Detailed info.
+ *
+ */
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "acc.tab.h"
+#include "ast.h"
+
+ASTNode* root = NULL;
+
+static const char inout_name_prefix[] = "handle_";
+static bool doing_stencil_assembly = true;
+
+/*
+ * =============================================================================
+ * Translation
+ * =============================================================================
+ */
+#define TRANSLATION_TABLE_SIZE (1024)
+static const char* translation_table[TRANSLATION_TABLE_SIZE] = {
+    [0] = NULL,
+    // Control flow
+    [IF] = "if",
+    [ELSE] = "else",
+    [ELIF] = "else if",
+    [WHILE] = "while",
+    [FOR] = "for",
+    // Type specifiers
+    [VOID] = "void",
+    [INT] = "int",
+    [INT3] = "int3",
+    [SCALAR] = "AcReal",
+    [VECTOR] = "AcReal3",
+    [MATRIX] = "AcMatrix",
+    // Type qualifiers
+    [KERNEL] = "template <int step_number> static "
+               "__global__", //__launch_bounds__(RK_THREADBLOCK_SIZE,
+                             // RK_LAUNCH_BOUND_MIN_BLOCKS),
+    [PREPROCESSED] = "static __device__ "
+                     "__forceinline__",
+    [CONSTANT] = "const",
+    [IN] = "in",
+    [OUT] = "out",
+    [UNIFORM] = "uniform",
+    // ETC
+    [INPLACE_INC] = "++",
+    [INPLACE_DEC] = "--",
+    // Unary
+    [','] = ",",
+    [';'] = ";\n",
+    ['('] = "(",
+    [')'] = ")",
+    ['['] = "[",
+    [']'] = "]",
+    ['{'] = "{\n",
+    ['}'] = "}\n",
+    ['='] = "=",
+    ['+'] = "+",
+    ['-'] = "-",
+    ['/'] = "/",
+    ['*'] = "*",
+    ['<'] = "<",
+    ['>'] = ">",
+    ['!'] = "!",
+    ['.'] = "."};
+
+static const char*
+translate(const int token)
+{
+    assert(token >= 0);
+    assert(token < TRANSLATION_TABLE_SIZE);
+    if (token > 0) {
+        if (!translation_table[token])
+            printf("ERROR: unidentified token %d\n", token);
+        assert(translation_table[token]);
+    }
+
+    return translation_table[token];
+}
+
+/*
+ * =============================================================================
+ * Symbols
+ * =============================================================================
+ */
+typedef enum {
+    SYMBOLTYPE_FUNCTION,
+    SYMBOLTYPE_FUNCTION_PARAMETER,
+
SYMBOLTYPE_OTHER, + NUM_SYMBOLTYPES +} SymbolType; + +#define MAX_ID_LEN (128) +typedef struct { + SymbolType type; + int type_qualifier; + int type_specifier; + char identifier[MAX_ID_LEN]; +} Symbol; + +#define SYMBOL_TABLE_SIZE (4096) +static Symbol symbol_table[SYMBOL_TABLE_SIZE] = {}; +static int num_symbols = 0; + +static int +symboltable_lookup(const char* identifier) +{ + if (!identifier) + return -1; + + for (int i = 0; i < num_symbols; ++i) + if (strcmp(identifier, symbol_table[i].identifier) == 0) + return i; + + return -1; +} + +static void +add_symbol(const SymbolType type, const int tqualifier, const int tspecifier, const char* id) +{ + assert(num_symbols < SYMBOL_TABLE_SIZE); + + symbol_table[num_symbols].type = type; + symbol_table[num_symbols].type_qualifier = tqualifier; + symbol_table[num_symbols].type_specifier = tspecifier; + strcpy(symbol_table[num_symbols].identifier, id); + + ++num_symbols; +} + +static void +rm_symbol(const int handle) +{ + assert(handle >= 0 && handle < num_symbols); + + if (&symbol_table[handle] != &symbol_table[num_symbols - 1]) + memcpy(&symbol_table[handle], &symbol_table[num_symbols - 1], sizeof(Symbol)); + --num_symbols; +} + +static void +print_symbol(const int handle) +{ + assert(handle < SYMBOL_TABLE_SIZE); + + const char* fields[] = {translate(symbol_table[handle].type_qualifier), + translate(symbol_table[handle].type_specifier), + symbol_table[handle].identifier}; + const size_t num_fields = sizeof(fields) / sizeof(fields[0]); + + for (int i = 0; i < num_fields; ++i) + if (fields[i]) + printf("%s ", fields[i]); +} + +static void +translate_latest_symbol(void) +{ + const int handle = num_symbols - 1; + assert(handle < SYMBOL_TABLE_SIZE); + + Symbol* symbol = &symbol_table[handle]; + + // FUNCTION + if (symbol->type == SYMBOLTYPE_FUNCTION) { + // KERNEL FUNCTION + if (symbol->type_qualifier == KERNEL) { + printf("%s %s\n%s", translate(symbol->type_qualifier), + translate(symbol->type_specifier), 
symbol->identifier); + } + // PREPROCESSED FUNCTION + else if (symbol->type_qualifier == PREPROCESSED) { + printf("%s %s\npreprocessed_%s", translate(symbol->type_qualifier), + translate(symbol->type_specifier), symbol->identifier); + } + // OTHER FUNCTION + else { + const char* regular_function_decorator = "static __device__ " + "__forceinline__"; + printf("%s %s %s\n%s", regular_function_decorator, + translate(symbol->type_qualifier) ? translate(symbol->type_qualifier) : "", + translate(symbol->type_specifier), symbol->identifier); + } + } + // FUNCTION PARAMETER + else if (symbol->type == SYMBOLTYPE_FUNCTION_PARAMETER) { + if (symbol->type_qualifier == IN || symbol->type_qualifier == OUT) { + if (doing_stencil_assembly) + printf("const __restrict__ %s* %s", translate(symbol->type_specifier), + symbol->identifier); + else + printf("const %sData& %s", translate(symbol->type_specifier), symbol->identifier); + } + else { + print_symbol(handle); + } + } + // UNIFORM + else if (symbol->type_qualifier == UNIFORM) { + /* Do nothing */ + } + // IN / OUT + else if (symbol->type != SYMBOLTYPE_FUNCTION_PARAMETER && + (symbol->type_qualifier == IN || symbol->type_qualifier == OUT)) { + const char* inout_type_qualifier = "static __device__ const auto"; + printf("%s %s%s", inout_type_qualifier, inout_name_prefix, symbol_table[handle].identifier); + } + // OTHER + else { + print_symbol(handle); + } +} + +static void +print_symbol_table(void) +{ + for (int i = 0; i < num_symbols; ++i) { + printf("%d: ", i); + const char* fields[] = {translate(symbol_table[i].type_qualifier), + translate(symbol_table[i].type_specifier), + symbol_table[i].identifier}; + const size_t num_fields = sizeof(fields) / sizeof(fields[0]); + + for (int i = 0; i < num_fields; ++i) + if (fields[i]) + printf("%s ", fields[i]); + + if (symbol_table[i].type == SYMBOLTYPE_FUNCTION) + printf("(function)"); + else if (symbol_table[i].type == SYMBOLTYPE_FUNCTION_PARAMETER) + printf("(function parameter)"); + else + 
printf("(other)"); + printf("\n"); + } +} + +/* + * ============================================================================= + * State + * ============================================================================= + */ +static bool inside_declaration = false; +static bool inside_function_declaration = false; +static bool inside_function_parameter_declaration = false; + +static bool inside_kernel = false; +static bool inside_preprocessed = false; + +static int scope_start = 0; + +/* + * ============================================================================= + * AST traversal + * ============================================================================= + */ + +static void +traverse(const ASTNode* node) +{ + // Prefix logic %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + if (node->type == NODE_FUNCTION_DECLARATION) + inside_function_declaration = true; + if (node->type == NODE_FUNCTION_PARAMETER_DECLARATION) + inside_function_parameter_declaration = true; + if (node->type == NODE_DECLARATION) + inside_declaration = true; + + if (!inside_declaration && translate(node->prefix)) + printf("%s", translate(node->prefix)); + + // BOILERPLATE START//////////////////////////////////////////////////////// + if (node->type == NODE_TYPE_QUALIFIER && node->token == KERNEL) + inside_kernel = true; + + // Kernel parameter boilerplate + const char* kernel_parameter_boilerplate = "GEN_KERNEL_PARAM_BOILERPLATE, "; + if (inside_kernel && node->type == NODE_FUNCTION_PARAMETER_DECLARATION) + printf("%s ", kernel_parameter_boilerplate); + + // Kernel builtin variables boilerplate (read input/output arrays and setup + // indices) + const char* kernel_builtin_variables_boilerplate = "GEN_KERNEL_BUILTIN_VARIABLES_" + "BOILERPLATE();"; + if (inside_kernel && node->type == NODE_COMPOUND_STATEMENT) { + printf("%s ", kernel_builtin_variables_boilerplate); + + for (int i = 0; i < num_symbols; ++i) { + if (symbol_table[i].type_qualifier == IN) { + printf("const 
%sData %s = READ(%s%s);\n", translate(symbol_table[i].type_specifier), + symbol_table[i].identifier, inout_name_prefix, symbol_table[i].identifier); + } else if (symbol_table[i].type_qualifier == OUT) { + printf("%s %s = READ_OUT(%s%s);", translate(symbol_table[i].type_specifier), symbol_table[i].identifier, inout_name_prefix, symbol_table[i].identifier); + //printf("%s %s = buffer.out[%s%s][IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z)];\n", translate(symbol_table[i].type_specifier), symbol_table[i].identifier, inout_name_prefix, symbol_table[i].identifier); + } + } + } + + // Preprocessed parameter boilerplate + if (node->type == NODE_TYPE_QUALIFIER && node->token == PREPROCESSED) + inside_preprocessed = true; + static const char + preprocessed_parameter_boilerplate[] = "const int3 vertexIdx, "; + if (inside_preprocessed && node->type == NODE_FUNCTION_PARAMETER_DECLARATION) + printf("%s ", preprocessed_parameter_boilerplate); + // BOILERPLATE END//////////////////////////////////////////////////////// + + // Enter LHS + if (node->lhs) + traverse(node->lhs); + + // Infix logic %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + if (!inside_declaration && translate(node->infix)) + printf("%s ", translate(node->infix)); + + if (node->type == NODE_FUNCTION_DECLARATION) + inside_function_declaration = false; + + + // If the node is a subscript expression and the expression list inside it is not empty + if (node->type == NODE_MULTIDIM_SUBSCRIPT_EXPRESSION && node->rhs) + printf("IDX("); + + // Do a regular translation + if (!inside_declaration) { + const int handle = symboltable_lookup(node->buffer); + if (handle >= 0) { // The variable exists in the symbol table + const Symbol* symbol = &symbol_table[handle]; + + //if (symbol->type_qualifier == OUT) { + // printf("%s%s", inout_name_prefix, symbol->identifier); + //} + if (symbol->type_qualifier == UNIFORM) { + if (symbol->type_specifier == SCALAR) + printf("DCONST_REAL(AC_%s) ", symbol->identifier); + else if 
(symbol->type_specifier == INT) + printf("DCONST_INT(AC_%s) ", symbol->identifier); + else + printf("INVALID UNIFORM type specifier %s with %s\n", + translate(symbol->type_specifier), symbol->identifier); + } + else { + // Do a regular translation + if (translate(node->token)) + printf("%s ", translate(node->token)); + if (node->buffer) + printf("%s ", node->buffer); + } + } + else { + // Do a regular translation + if (translate(node->token)) + printf("%s ", translate(node->token)); + if (node->buffer) + printf("%s ", node->buffer); + } + } + + if (node->type == NODE_FUNCTION_DECLARATION) { + scope_start = num_symbols; + } + + // Enter RHS + if (node->rhs) + traverse(node->rhs); + + // Postfix logic %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + // If the node is a subscript expression and the expression list inside it is not empty + if (node->type == NODE_MULTIDIM_SUBSCRIPT_EXPRESSION && node->rhs) + printf(")"); // Closing bracket of IDX() + + // Generate writeback boilerplate for OUT fields + if (inside_kernel && node->type == NODE_COMPOUND_STATEMENT) { + for (int i = 0; i < num_symbols; ++i) { + if (symbol_table[i].type_qualifier == OUT) { + printf("WRITE_OUT(%s%s, %s);\n", inout_name_prefix, symbol_table[i].identifier, symbol_table[i].identifier); + //printf("buffer.out[%s%s][IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z)] = %s;\n", inout_name_prefix, symbol_table[i].identifier, symbol_table[i].identifier); + } + } + } + + if (!inside_declaration && translate(node->postfix)) + printf("%s", translate(node->postfix)); + + if (node->type == NODE_DECLARATION) { + inside_declaration = false; + + int tqual = 0; + int tspec = 0; + if (node->lhs && node->lhs->lhs) { + if (node->lhs->lhs->type == NODE_TYPE_QUALIFIER) + tqual = node->lhs->lhs->token; + else if (node->lhs->lhs->type == NODE_TYPE_SPECIFIER) + tspec = node->lhs->lhs->token; + } + if (node->lhs && node->lhs->rhs) { + if (node->lhs->rhs->type == NODE_TYPE_SPECIFIER) + tspec = node->lhs->rhs->token; 
+ } + + // Determine symbol type + SymbolType symboltype = SYMBOLTYPE_OTHER; + if (inside_function_declaration) + symboltype = SYMBOLTYPE_FUNCTION; + else if (inside_function_parameter_declaration) + symboltype = SYMBOLTYPE_FUNCTION_PARAMETER; + + // Determine identifier + if (node->rhs->type == NODE_IDENTIFIER) { + add_symbol(symboltype, tqual, tspec, node->rhs->buffer); // Ordinary + translate_latest_symbol(); + } + else { + add_symbol(symboltype, tqual, tspec, + node->rhs->lhs->buffer); // Array + translate_latest_symbol(); + // Traverse the expression once again, this time with + // "inside_declaration" flag off + printf("%s ", translate(node->rhs->infix)); + if (node->rhs->rhs) + traverse(node->rhs->rhs); + printf("%s ", translate(node->rhs->postfix)); + } + } + + if (node->type == NODE_FUNCTION_PARAMETER_DECLARATION) + inside_function_parameter_declaration = false; + + if (node->type == NODE_FUNCTION_DEFINITION) { + while (num_symbols > scope_start) + rm_symbol(num_symbols - 1); + + inside_kernel = false; + inside_preprocessed = false; + } +} + +// TODO: these should use the generic type names SCALAR and VECTOR +static void +generate_preprocessed_structures(void) +{ + // PREPROCESSED DATA STRUCT + printf("\n"); + printf("typedef struct {\n"); + for (int i = 0; i < num_symbols; ++i) { + if (symbol_table[i].type_qualifier == PREPROCESSED) + printf("%s %s;\n", translate(symbol_table[i].type_specifier), + symbol_table[i].identifier); + } + printf("} %sData;\n", translate(SCALAR)); + + // FILLING THE DATA STRUCT + printf("static __device__ __forceinline__ AcRealData\ + read_data(const int3 vertexIdx,\ + AcReal* __restrict__ buf[], const int handle)\ + {\n\ + %sData data;\n", + translate(SCALAR)); + + for (int i = 0; i < num_symbols; ++i) { + if (symbol_table[i].type_qualifier == PREPROCESSED) + printf("data.%s = preprocessed_%s(vertexIdx, buf[handle]);\n", symbol_table[i].identifier, + symbol_table[i].identifier); + } + printf("return data;\n"); + printf("}\n"); + 
+ // FUNCTIONS FOR ACCESSING MEMBERS OF THE PREPROCESSED STRUCT + for (int i = 0; i < num_symbols; ++i) { + if (symbol_table[i].type_qualifier == PREPROCESSED) + printf("static __device__ __forceinline__ %s\ + %s(const AcRealData& data)\ + {\n\ + return data.%s;\ + }\n", + translate(symbol_table[i].type_specifier), symbol_table[i].identifier, + symbol_table[i].identifier); + } + + // Syntactic sugar: generate also a Vector data struct + printf("\ + typedef struct {\ + AcRealData x;\ + AcRealData y;\ + AcRealData z;\ + } AcReal3Data;\ + \ + static __device__ __forceinline__ AcReal3Data\ + read_data(const int3 vertexIdx,\ + AcReal* __restrict__ buf[], const int3& handle)\ + {\ + AcReal3Data data;\ + \ + data.x = read_data(vertexIdx, buf, handle.x);\ + data.y = read_data(vertexIdx, buf, handle.y);\ + data.z = read_data(vertexIdx, buf, handle.z);\ + \ + return data;\ + }\ + "); +} + +int +main(int argc, char** argv) +{ + if (argc == 2) { + if (!strcmp(argv[1], "-sas")) + doing_stencil_assembly = true; + else if (!strcmp(argv[1], "-sps")) + doing_stencil_assembly = false; + else + printf("Unknown flag %s. 
Generating stencil assembly.\n", argv[1]); + } + else { + printf("Usage: ./acc [flags]\n" + "Flags:\n" + "\t-sas - Generates code for the stencil assembly stage\n" + "\t-sps - Generates code for the stencil processing " + "stage\n"); + printf("\n"); + return EXIT_FAILURE; + } + + root = astnode_create(NODE_UNKNOWN, NULL, NULL); + + const int retval = yyparse(); + if (retval) { + printf("COMPILATION FAILED\n"); + return EXIT_FAILURE; + } + + // Traverse + traverse(root); + if (doing_stencil_assembly) + generate_preprocessed_structures(); + + // print_symbol_table(); + + // Cleanup + astnode_destroy(root); + // printf("COMPILATION SUCCESS\n"); +} diff --git a/acc/test_grammar.sh b/acc/test_grammar.sh new file mode 100755 index 0000000..ee579de --- /dev/null +++ b/acc/test_grammar.sh @@ -0,0 +1,48 @@ +#!/bin/bash +cd `dirname $0` # Only operate in the same directory with this script + +./build_acc.sh + +mkdir -p testbin +./compile.sh samples/sample_stencil_process.sps +./compile.sh samples/sample_stencil_assembly.sas + +mv stencil_process.cuh testbin/ +mv stencil_assembly.cuh testbin/ + +printf " +#include +#include +#include \"%s\" // i.e. 
astaroth.h + +__constant__ AcMeshInfo d_mesh_info; +#define DCONST_INT(X) (d_mesh_info.int_params[X]) +#define DCONST_REAL(X) (d_mesh_info.real_params[X]) +#define DEVICE_VTXBUF_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_mx) + (k)*DCONST_INT(AC_mxy)) + + +static __device__ __forceinline__ int +IDX(const int i) +{ + return i; +} + +static __device__ __forceinline__ int +IDX(const int i, const int j, const int k) +{ + return DEVICE_VTXBUF_IDX(i, j, k); +} + +static __device__ __forceinline__ int +IDX(const int3 idx) +{ + return DEVICE_VTXBUF_IDX(idx.x, idx.y, idx.z); +} + +#include \"%s\" +#include \"%s\" +int main(void) { printf(\"Grammar check complete.\\\nAll tests passed.\\\n\"); return EXIT_SUCCESS; } +" common_header.h stencil_assembly.cuh stencil_process.cuh >testbin/test.cu + +cd testbin +nvcc -std=c++11 test.cu -I ../samples -o test && ./test diff --git a/analysis/python/.gitignore b/analysis/python/.gitignore new file mode 100644 index 0000000..e33609d --- /dev/null +++ b/analysis/python/.gitignore @@ -0,0 +1 @@ +*.png diff --git a/analysis/python/README.md b/analysis/python/README.md new file mode 100644 index 0000000..865f23d --- /dev/null +++ b/analysis/python/README.md @@ -0,0 +1,7 @@ +# Python directory + +This directory is for Python script connected to data visualization and analysis. + +Content of this directory should be structured so that it is always callable by +`import astar` more task related scips should be written elsewhere, depending +the user's convenience. 
diff --git a/analysis/python/add_to_pythonpath.sh b/analysis/python/add_to_pythonpath.sh new file mode 100644 index 0000000..331b862 --- /dev/null +++ b/analysis/python/add_to_pythonpath.sh @@ -0,0 +1,3 @@ + + +export PYTHONPATH=${PYTHONPATH}:$PWD/ diff --git a/analysis/python/astar/__init__.py b/analysis/python/astar/__init__.py new file mode 100644 index 0000000..42c4a5b --- /dev/null +++ b/analysis/python/astar/__init__.py @@ -0,0 +1,24 @@ +''' + Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae. + + This file is part of Astaroth. + + Astaroth is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Astaroth is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Astaroth. If not, see . +''' + +# Developers note. We require Python 3 approach to have +# compatibility towards the future. + +import numpy as np +import pylab as plt diff --git a/analysis/python/astar/data/__init__.py b/analysis/python/astar/data/__init__.py new file mode 100644 index 0000000..0d767d2 --- /dev/null +++ b/analysis/python/astar/data/__init__.py @@ -0,0 +1,21 @@ +''' + Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae. + + This file is part of Astaroth. + + Astaroth is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + Astaroth is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Astaroth. If not, see . +''' + + +from . import read diff --git a/analysis/python/astar/data/read.py b/analysis/python/astar/data/read.py new file mode 100644 index 0000000..17a7d05 --- /dev/null +++ b/analysis/python/astar/data/read.py @@ -0,0 +1,142 @@ +''' + Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae. + + This file is part of Astaroth. + + Astaroth is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Astaroth is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Astaroth. If not, see . +''' + +# This module is for reading data. 
+ +import numpy as np + +def read_bin(fname, fdir, fnum, minfo, numtype=np.longdouble): + '''Read in a floating point array''' + filename = fdir + fname + '_' + fnum + '.mesh' + datas = np.DataSource() + read_ok = datas.exists(filename) + if read_ok: + print(filename) + array = np.fromfile(filename, dtype=numtype) + + timestamp = array[0] + + array = np.reshape(array[1:], (minfo.contents['AC_mx'], + minfo.contents['AC_my'], + minfo.contents['AC_mz']), order='F') + else: + array = None + timestamp = None + + return array, timestamp, read_ok + +def read_meshtxt(fdir, fname): + with open(fdir+fname) as f: + filetext = f.read().splitlines() + + contents = {} + + for line in filetext: + line = line.split() + if line[0] == 'int': + contents[line[1]] = np.int(line[2]) + elif line[0] == 'real': + contents[line[1]] = np.float(line[2]) + else: + print('ERROR: ' + line[0] +' no recognized!') + + return contents + +class MeshInfo(): + '''Object that contains all mesh info''' + + def __init__(self, fdir): + self.contents = read_meshtxt(fdir, 'mesh_info.list') + +class Mesh: + '''Class tha contains all 3d mesh data''' + + def __init__(self, fnum, fdir=""): + fnum = str(fnum) + self.framenum = fnum.zfill(10) + + self.minfo = MeshInfo(fdir) + + self.lnrho, self.timestamp, self.ok = read_bin('VTXBUF_LNRHO', fdir, fnum, self.minfo) + + if self.ok: + + self.ss, timestamp, ok = read_bin('VTXBUF_ENTROPY', fdir, fnum, self.minfo) + + #TODO Generalize is a dict. Do not hardcode! 
+ uux, timestamp, ok = read_bin('VTXBUF_UUX', fdir, fnum, self.minfo) + uuy, timestamp, ok = read_bin('VTXBUF_UUY', fdir, fnum, self.minfo) + uuz, timestamp, ok = read_bin('VTXBUF_UUZ', fdir, fnum, self.minfo) + self.uu = (uux, uuy, uuz) + uux = [] + uuy = [] + uuz = [] + + aax, timestamp, ok = read_bin('VTXBUF_AX', fdir, fnum, self.minfo) + aay, timestamp, ok = read_bin('VTXBUF_AY', fdir, fnum, self.minfo) + aaz, timestamp, ok = read_bin('VTXBUF_AZ', fdir, fnum, self.minfo) + self.aa = (aax, aay, aaz) + aax = [] + aay = [] + aaz = [] + + self.xx = self.minfo.contents['AC_inv_dsx']*np.arange(self.minfo.contents['AC_mx']) + self.yy = self.minfo.contents['AC_inv_dsy']*np.arange(self.minfo.contents['AC_my']) + self.zz = self.minfo.contents['AC_inv_dsz']*np.arange(self.minfo.contents['AC_mz']) + + self.xmid = int(self.minfo.contents['AC_mx']/2) + self.ymid = int(self.minfo.contents['AC_my']/2) + self.zmid = int(self.minfo.contents['AC_mz']/2) + + +def parse_ts(fdir, fname): + with open(fdir+fname) as f: + filetext = f.read().splitlines() + + var = {} + + line = filetext[0].split() + for i in range(len(line)): + line[i] = line[i].replace('VTXBUF_', "") + line[i] = line[i].replace('UU', "uu") + line[i] = line[i].replace('_total', "tot") + line[i] = line[i].replace('A', "aa") + line[i] = line[i].replace('LNRHO', "lnrho") + line[i] = line[i].replace('X', "x") + line[i] = line[i].replace('Y', "y") + line[i] = line[i].replace('Z', "z") + + tsdata = np.loadtxt(fdir+fname,skiprows=1) + + for i in range(len(line)): + var[line[i]] = tsdata[:,i] + + var['step'] = np.int64(var['step']) + + print("HERE ARE ALL KEYS FOR TS DATA:") + print(var.keys()) + + return var + +class TimeSeries: + '''Class for time series data''' + + def __init__(self, fdir="", fname="timeseries.ts"): + + self.var = parse_ts(fdir, fname) diff --git a/analysis/python/astar/visual/__init__.py b/analysis/python/astar/visual/__init__.py new file mode 100644 index 0000000..44eca95 --- /dev/null +++ 
b/analysis/python/astar/visual/__init__.py @@ -0,0 +1,21 @@ +''' + Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae. + + This file is part of Astaroth. + + Astaroth is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Astaroth is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Astaroth. If not, see . +''' + + +from . import slices diff --git a/analysis/python/astar/visual/slices.py b/analysis/python/astar/visual/slices.py new file mode 100644 index 0000000..6afa71a --- /dev/null +++ b/analysis/python/astar/visual/slices.py @@ -0,0 +1,92 @@ + +''' + Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae. + + This file is part of Astaroth. + + Astaroth is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Astaroth is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Astaroth. If not, see . 
+''' +import pylab as plt +import numpy as np +import matplotlib.gridspec as gridspec +import matplotlib.colors as colors + +CM_INFERNO = plt.get_cmap('inferno') + +def plot_3(mesh, input_grid, title = '', fname = 'default', bitmap=False, slicetype = 'middle', colrange=None, colormap=CM_INFERNO , contourplot=False): + fig = plt.figure(figsize=(8, 8)) + grid = gridspec.GridSpec(2, 3, wspace=0.4, hspace=0.4, width_ratios=[1,1, 0.15]) + ax00 = fig.add_subplot( grid[0,0] ) + ax10 = fig.add_subplot( grid[0,1] ) + ax11 = fig.add_subplot( grid[1,1] ) + axcbar = fig.add_subplot( grid[:,2] ) + + print(mesh.minfo.contents.keys()) + + if slicetype == 'middle': + yz_slice = input_grid[mesh.xmid, :, :] + xz_slice = input_grid[:, mesh.ymid, :] + xy_slice = input_grid[:, :, mesh.zmid] + if colrange==None: + plotnorm = colors.Normalize(vmin=input_grid.min(),vmax=input_grid.max()) + else: + plotnorm = colors.Normalize(vmin=colrange[0],vmax=colrange[1]) + elif slicetype == 'sum': + yz_slice = np.sum(input_grid, axis=0) + xz_slice = np.sum(input_grid, axis=1) + xy_slice = np.sum(input_grid, axis=2) + cmin = np.amin([yz_slice.min(), xz_slice.min(), xy_slice.min()]) + cmax = np.amax([yz_slice.max(), xz_slice.max(), xy_slice.max()]) + if colrange==None: + plotnorm = colors.Normalize(vmin=cmin,vmax=cmax) + else: + plotnorm = colors.Normalize(vmin=colrange[0],vmax=colrange[1]) + + + yy, zz = np.meshgrid(mesh.yy, mesh.zz, indexing='ij') + if contourplot: + map1 = ax00.contourf(yy, zz, yz_slice, norm=plotnorm, cmap=colormap, nlev=10) + else: + map1 = ax00.pcolormesh(yy, zz, yz_slice, norm=plotnorm, cmap=colormap) + ax00.set_xlabel('y') + ax00.set_ylabel('z') + ax00.set_title('%s t = %.4e' % (title, mesh.timestamp) ) + ax00.set_aspect('equal') + + xx, zz = np.meshgrid(mesh.xx, mesh.zz, indexing='ij') + if contourplot: + ax10.contourf(xx, zz, xz_slice, norm=plotnorm, cmap=colormap, nlev=10) + else: + ax10.pcolormesh(xx, zz, xz_slice, norm=plotnorm, cmap=colormap) + ax10.set_xlabel('x') + 
ax10.set_ylabel('z') + ax10.set_aspect('equal') + + xx, yy = np.meshgrid(mesh.xx, mesh.yy, indexing='ij') + if contourplot: + ax11.contourf(xx, yy, xy_slice, norm=plotnorm, cmap=colormap, nlev=10) + else: + ax11.pcolormesh(xx, yy, xy_slice, norm=plotnorm, cmap=colormap) + ax11.set_xlabel('x') + ax11.set_ylabel('y') + ax11.set_aspect('equal') + + cbar = plt.colorbar(map1, cax=axcbar) + + if bitmap: + plt.savefig('%s_%s.png' % (fname, mesh.framenum)) + print('Saved %s_%s.png' % (fname, mesh.framenum)) + plt.close(fig) + + diff --git a/analysis/python/calc/convert.sh b/analysis/python/calc/convert.sh new file mode 100755 index 0000000..c444664 --- /dev/null +++ b/analysis/python/calc/convert.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +#gm convert -delay 40 colden_*.png colden.gif + +DATE=`date '+%Y_%m_%d_%H_%M'` + +echo $DATE + +gm convert -delay 15 $1_*.png $1_$DATE.gif diff --git a/analysis/python/calc/galli_shu_plotter.py b/analysis/python/calc/galli_shu_plotter.py new file mode 100644 index 0000000..38a5248 --- /dev/null +++ b/analysis/python/calc/galli_shu_plotter.py @@ -0,0 +1,835 @@ +''' + Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae. + + This file is part of Astaroth. + + Astaroth is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Astaroth is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Astaroth. If not, see . 
+''' +import numpy as np +import pylab as plt +import scipy as scp + +import matplotlib.colors as colors + +G_newton = 6.674e-8 #cm**3 g**-1 s**-2 + +# Time to convert to physical quantities +yr = 3.154e+7 #s +kyr = 1000.0*yr +km = 1e5 #cm +AU = 1.496e+13 #cm +Msun = 1.98847e33 #g + +#cs0 = 20000.0 #cs cm/s "a" in Shu notation +cs0 = 35000.0 #cs cm/s "a" in Shu notation +B0 = 30e-6 #G +ksii = 11.3 # + +#GS Eq. 10 +ttm = 9.03e12*(cs0/35000.0)/(B0/30e-6) + + +CM_INFERNO = plt.cm.get_cmap('inferno') + + + + + + +def P_harmonics(theta, J=666): + #Vector spherical harmonics in e_r direction + if J == 0: + P = np.ones_like(theta) # 1.0 + elif J == 2: + cos_theta = np.cos(theta) + P = (1.0/2.0)*(3.0*(cos_theta**2.0) - 1.0) + else: + P = 0.0 + + #print("P_2", P) + return P + + +def B_harmonics(theta, J=666): + #Vector spherical harmonics in e_theta direction + #print("B_harmonics theta", theta) + if J == 2: + sin_theta = np.abs(np.sin(theta)) + cos_theta = np.cos(theta) + #B = -(3.0/np.sqrt(6.0))*cos_theta*sin_theta #Morse & Feshbach 1953 book + B = -3.0*cos_theta*sin_theta #GS93 Appendix B + else: + B = 0.0*theta + + #print("B_harmonics", B) + + return B + +def get_tau(tt): + return tt/ttm + +def get_SHU77_potential(xx_point): + #Copied here again for convenience + m0 = 0.975 #Shu 77 core reduced mass + xx_SHU_table = np.array([ 0.05, 0.10, 0.15, 0.20, 0.25, + 0.30, 0.35, 0.40, 0.45, 0.50, + 0.55, 0.60, 0.65, 0.70, 0.75, + 0.80, 0.85, 0.90, 0.95, 1.00]) + + mm_SHU77_table = np.array([0.981, 0.993, 1.01, 1.03, 1.05, + 1.08, 1.12, 1.16, 1.20, 1.25, + 1.30, 1.36, 1.42, 1.49, 1.56, + 1.64, 1.72, 1.81, 1.90, 2.00]) + + xx = xx_SHU_table[ np.where(xx_SHU <= xx_point)] + mm = mm_SHU77_table[np.where(xx_SHU <= xx_point)] + + psi = - m0/xx_point + np.trapz(mm/(xx**2.0), xx) + + return psi + + +def psi2(xx_SHU, mm_term, pp_term, J=666): + #GS93 Eq. 
113 + if J == 0: + psi2 = - mm_term/xx_SHU + pp_term + elif J == 2: + psi2 = - mm_term/(xx_SHU**3.0) + (xx_SHU**2.0)*pp_term + else: + psi2 = 0.0 + + #print('psi2', psi2, 'J', J, 'mm_term', mm_term, 'xx_SHU', xx_SHU, 'pp_term', pp_term) + + return psi2 + +# Calculate the directional parameter +def dv_dx(xx,vv, alpha): + EE = alpha*(xx-vv) - 2.0/xx + HH = (xx-vv)**2.0 - 1.0 + return (EE/HH)*(xx-vv) + +def dalpha_dx(xx,vv, alpha): + EE = alpha*(alpha - (2.0/xx)*(xx-vv)) + HH = (xx-vv)**2.0 - 1.0 + return (EE/HH)*(xx-vv) + +def dpsi_dx(xx, mm): + return mm/(xx**2.0) + +def dmm_dx(xx, alpha): + return (xx**2.0)*alpha + +def dphi_dx(xx, alpha, mm, theta): + ff_zero_der = 0.5*mm*dmm_dx(xx, alpha) + sin_theta = np.sin(theta) + return ff_zero_der*(sin_theta*2.0) + + +def deltaspace(theta, tau): + #Assuming J= 0, 2 only + v0 = -2.222e-1 + v2 = 2.177e-1 + deltaJ2 = -(1.0/3.0)*((v0+2.0/3.0)*P_harmonics(theta, J=0) + (v2 - 2.0/3.0)*P_harmonics(theta, J=2)) + delta = 1 + (tau**2.0)*deltaJ2 + return delta + +def delta2(theta, tau): + #Assuming J= 0, 2 only + return deltaspace(theta, tau)**2.0 + +def yy_transform(xx_SHU, alpha_SHU77, alpha_mono_GS93, alpha_quad_GS93): + + + + return alpha_mono_GS93, alpha_quad_GS93 + +# Calculating the perturbation stage +def alpha_perturb(tau, xx_SHU, vv_SHU77, alpha_SHU77, alpha_mono_GS93, alpha_quad_GS93, theta): + #Assuming J= 0, 2 only + directional = xx_SHU*dalpha_dx(xx_SHU, vv_SHU77, alpha_SHU77)*delta2(theta, tau) + directional = 0.0 # + alpha = alpha_mono_GS93*P_harmonics(theta, J=0) + alpha_quad_GS93*P_harmonics(theta, J=2) + directional + return alpha + +def vv_perturb(tau, xx_SHU, vv_SHU77, alpha_SHU77, vv_ww_mono_GS93, vv_ww_quad_GS93, theta): + #Assuming J= 0, 2 only + directional = xx_SHU*dv_dx(xx_SHU, vv_SHU77, alpha_SHU77)*delta2(theta, tau) + directional = 0.0 # + vv_mono = vv_ww_mono_GS93[0] + vv_quad = vv_ww_quad_GS93[0] + ww_mono = vv_ww_mono_GS93[1] + ww_quad = vv_ww_quad_GS93[1] + #print('vv_mono, vv_quad, ww_mono, 
ww_quad', vv_mono, vv_quad, ww_mono, ww_quad) + vv_r = vv_mono*P_harmonics(theta, J=0) + vv_quad*P_harmonics(theta, J=2) + directional ## vv + vv_theta = ww_mono*B_harmonics(theta, J=0) + ww_quad*B_harmonics(theta, J=2) + directional ## ww + #print("vv_r, vv_theta", vv_r, vv_theta) + vv = np.array([vv_r, vv_theta]) + return vv + +def psi_perturb(tau, xx_SHU, mm_SHU77, mm_pp_mono_GS93, mm_pp_quad_GS93, theta): + #Assuming J= 0, 2 only + directional = xx_SHU*dpsi_dx(xx_SHU, mm_SHU77)*delta2(theta, tau) + directional = 0.0 # + mm_mono = mm_pp_mono_GS93[0] + mm_quad = mm_pp_quad_GS93[0] + pp_mono = mm_pp_mono_GS93[1] + pp_quad = mm_pp_quad_GS93[1] + + #print('mm_pp_mono_GS93', mm_pp_mono_GS93) + #print('mm_mono', mm_mono) + + psi = psi2(xx_SHU, mm_mono, pp_mono, J=0)*P_harmonics(theta, J=0) \ + + psi2(xx_SHU, mm_quad, pp_quad, J=0)*P_harmonics(theta, J=2) \ + + directional + + #print('psi_perturb', psi) + + return psi + +def phi_vecpot_second_order(tau, xx_SHU, mm_SHU77, alpha_SHU77, FF_DD_mono_GS93, FF_DD_quad_GS93, theta): + directional = xx_SHU*dphi_dx(xx_SHU, alpha_SHU77, mm_SHU77, theta)*delta2(theta, tau) + directional = 0.0 # + sin_theta = np.sin(theta) + #print(FF_DD_mono_GS93) + #print(FF_DD_quad_GS93) + #print(ksii, P_harmonics(theta, J=0), P_harmonics(theta, J=2)) + mono_term = (FF_DD_mono_GS93[0] + (1.0/ksii)*FF_DD_mono_GS93[1]) + quad_term = (FF_DD_quad_GS93[0] + (1.0/ksii)*FF_DD_quad_GS93[1]) + phi_vecpot_second = (sin_theta**2.0)*( mono_term*P_harmonics(theta, J=0) \ + + quad_term*P_harmonics(theta, J=2) ) \ + + directional + return phi_vecpot_second + +def phi_vecpot_zero_order(xx_SHU, mm_SHU77, theta): + ff_zero = 0.25*(mm_SHU77**2.0) + sin_theta = np.sin(theta) + phi_vecpot_zero = ff_zero*(sin_theta*2.0) + return phi_vecpot_zero + + +# Combining the perturbation stage. 
+def alpha_xvec_tau(tau, xx_SHU, vv_SHU77, alpha_SHU77, alpha_mono_GS93, alpha_quad_GS93, theta): + alpha = alpha_SHU77 + (tau**2.0)*alpha_perturb(tau, xx_SHU, vv_SHU77, alpha_SHU77, alpha_mono_GS93, alpha_quad_GS93, theta) + return alpha + +def vv_xvec_tau(tau, xx_SHU, vv_SHU77, alpha_SHU77, vv_ww_mono_GS93, vv_ww_quad_GS93, theta): + vv = (tau**2.0)*vv_perturb(tau, xx_SHU, vv_SHU77, alpha_SHU77, vv_ww_mono_GS93, vv_ww_quad_GS93, theta) + #print("BF",vv, vv_ww_mono_GS93, vv_ww_quad_GS93) + vv[0] = vv_SHU77 + vv[0] + vv[1] = 0.0 + vv[1] #No poloidal velocity in Shu77 + #print("AF",vv) + return vv + +def psi_xvec_tau(tau, xx_SHU, mm_SHU77, mm_pp_mono, mm_pp_quad, theta): + #print("psi_xvec_tau --- tau, xx_SHU, mm_SHU7, mm_pp_mono, mm_pp_quad, theta", tau, xx_SHU, mm_SHU77, mm_pp_mono, mm_pp_quad, theta) + psi = (tau**2.0)*psi_perturb(tau, xx_SHU, mm_SHU77, mm_pp_mono, mm_pp_quad, theta) + psi77 = get_SHU77_potential(xx_SHU) + #print('psi77', psi77) + psi = psi77 + psi + #print('psi_xvec_tau', psi) + return psi + + +def phi_vecpot_xvec_tau(tau, xx_SHU, mm_SHU77, alpha_SHU77, FF_DD_mono_GS93, FF_DD_quad_GS93, theta): + phi_vecpot_second = (tau**2.0)*phi_vecpot_second_order(tau, xx_SHU, mm_SHU77, alpha_SHU77, FF_DD_mono_GS93, FF_DD_quad_GS93, theta) + phi_vecpot_zero = phi_vecpot_zero_order(xx_SHU, mm_SHU77, theta) + phi_vecpot = phi_vecpot_zero + phi_vecpot_second + return phi_vecpot + +#Physical unit converion stage +def rho_rt(tt, xx_SHU, vv_SHU77, alpha_SHU77, alpha_mono_GS93, alpha_quad_GS93, theta): + tau = get_tau(tt) + alpha_xvec = alpha_xvec_tau(tau, xx_SHU, vv_SHU77, alpha_SHU77, alpha_mono_GS93, alpha_quad_GS93, theta) + rho = (1.0/(4.0*np.pi*G_newton*(tt**2.0))) * alpha_xvec + return rho, alpha_xvec + +def uu_rt(tt, xx_SHU, vv_SHU77, alpha_SHU77, vv_ww_mono_GS93, vv_ww_quad_GS93, theta): + tau = get_tau(tt) + vv_xvec = vv_xvec_tau(tau, xx_SHU, vv_SHU77, alpha_SHU77, vv_ww_mono_GS93, vv_ww_quad_GS93, theta) + uu = cs0*vv_xvec + return uu, vv_xvec + +def 
grav_psi_rt(tt, xx_SHU, mm_SHU77, mm_pp_mono, mm_pp_quad, theta): + tau = get_tau(tt) + #print("tt , xx_SHU, mm_SHU77, mm_pp_mono, mm_pp_quad, theta", tt, xx_SHU, mm_SHU77, mm_pp_mono, mm_pp_quad, theta) + psi_xvec = psi_xvec_tau(tau, xx_SHU, mm_SHU77, mm_pp_mono, mm_pp_quad, theta) + Vpot = (cs0**2.0)*psi_xvec + return Vpot, psi_xvec + +def vectorpot_rt(tt, xx_SHU, mm_SHU77, alpha_SHU77, FF_DD_mono_GS93, FF_DD_quad_GS93, theta): + tau = get_tau(tt) + phi_vecpot_xvec = phi_vecpot_xvec_tau(tau, xx_SHU, mm_SHU77, alpha_SHU77, FF_DD_mono_GS93, FF_DD_quad_GS93, theta) + Phi_flux = np.pi*B0*((cs0*tt)**2.0)*phi_vecpot_xvec + return Phi_flux, phi_vecpot_xvec + + + +###def match_xx(xx_rad, xx_SHU): +### xx_buffer = np.empty_like(xx_rad) +### stride = np.abs(xx_SHU[1] - xx_SHU[0]) +### for xx in xx_SHU: +### #where xx - stride < xx_rad < xx + stride -> xx_rad[i] = xx +### #loc = np.where((xx_rad <= (xx + stride) and xx_rad > (xx - stride) )) +### loc = np.where(xx_rad <= (xx + stride) ) +### print(loc) + + +def get_shu_index(xx, xx_SHU): + stride = np.abs(xx_SHU[1] - xx_SHU[0])/2.0 + + #ishu = np.where((xx_SHU <= (xx + stride)) & (xx_SHU > (xx - stride)))[0] + + + #TODO Now a purkka version. Do better. + # Can be improve by taking the treatment of the actual low and high x cases. 
+ if (xx > xx_SHU[xx_SHU.size-1]): + ishu = xx_SHU.size-1 + elif (xx < xx_SHU[0]): + ishu = 0 + else: + ishu = np.where((xx_SHU <= (xx + stride)) & (xx_SHU > (xx - stride)))[0] + #print("get_shu_index", ishu, ishu.size) + ishu = ishu[0] + #print("get_shu_index", ishu, ishu.size) + + #print(ishu, xx_SHU[ishu], xx) + + return ishu + +def plot_figure(tt, xx_horizontal_corners, xx_vertical_corners, xx_horizontal, xx_vertical, xxvar, physvar, + vv_hor=np.array(None), vv_ver=np.array(None), uu_hor=np.array(None), uu_ver=np.array(None), + title1=r"\alpha", title2=r"\rho", filetitle='density', + var_min=[None, None], var_max=[None, None], colmap=CM_INFERNO, normtype='log', + streamlines = 0, contourplot = 0): + + if var_min[0] != None: + if normtype == 'log': + mynorm1 = colors.LogNorm( vmin=var_min[0], vmax=var_max[0] ) + mynorm2 = colors.LogNorm( vmin=var_min[1], vmax=var_max[1] ) + else: + mynorm1 = colors.Normalize( vmin=var_min[0], vmax=var_max[0] ) + mynorm2 = colors.Normalize( vmin=var_min[1], vmax=var_max[1] ) + else: + mynorm1 = colors.Normalize( ) + mynorm2 = colors.Normalize( ) + + if contourplot: + if normtype =='cdensity': + numbers = np.arange(0, 20, dtype=np.float64) + contourlevs = 1e-20*(np.sqrt(2.0)**numbers) + contournorm = colors.LogNorm( vmin=contourlevs.min(), vmax=contourlevs.max() ) + elif normtype =='cflux': + contourlevs = np.linspace(1.0, 1e31, num=20) + contournorm = colors.Normalize( vmin=contourlevs.min(), vmax=contourlevs.max() ) + else: + contourlevs = np.linspace(physvar.min(), physvar.max(), num=10) + contournorm = colors.Normalize( vmin=contourlevs.min(), vmax=contourlevs.max() ) + + + ##rr_horizontal_corners = xx_horizontal_corners*(cs0*tt)/AU + ##rr_vertical_corners = xx_vertical_corners* (cs0*tt)/AU + ##rr_horizontal = xx_horizontal*(cs0*tt)/AU + ##rr_vertical = xx_vertical* (cs0*tt)/AU + + rr_horizontal_corners = xx_horizontal_corners*(cs0*tt)/1e17 + rr_vertical_corners = xx_vertical_corners* (cs0*tt)/1e17 + rr_horizontal = 
xx_horizontal*(cs0*tt)/1e17 + rr_vertical = xx_vertical* (cs0*tt)/1e17 + + + + figa, axa = plt.subplots(nrows=1, ncols=2, figsize=(16,6)) + if contourplot: + mapa = axa[0].contourf(xx_horizontal, xx_vertical, xxvar, cmap=colmap, norm=mynorm1) + maprho = axa[1].contourf(rr_horizontal, rr_vertical, physvar, contourlevs, cmap=colmap, norm=contournorm) + else: + mapa = axa[0].pcolormesh(xx_horizontal_corners, xx_vertical_corners, xxvar, cmap=colmap, norm=mynorm1 ) + maprho = axa[1].pcolormesh(rr_horizontal_corners, rr_vertical_corners, physvar, cmap=colmap, norm=mynorm2) + + #mapa = axa[0].contourf(xx_horizontal, xx_vertical, alpha, cmap=CM_INFERNO, norm=colors.LogNorm(vmin=0.1, vmax=50.0)) + #maprho = axa[1].contourf(xx_horizontal*(cs0*tt)/AU, xx_vertical*(cs0*tt)/AU, rho, cmap=CM_INFERNO, norm=colors.LogNorm(vmin=1e15, vmax=1e20)) + + if vv_hor.any() != None: + if streamlines: + #vv_tot = np.sqrt(vv_hor**2.0 + vv_ver**2.0) + #vv_tot = np.log(vv_tot/vv_tot.max()) + axa[0].streamplot(xx_horizontal, xx_vertical, vv_hor, vv_ver, color = 'k') + axa[1].streamplot(rr_horizontal, rr_vertical, uu_hor, uu_ver, color = 'k' ) + else: + axa[0].quiver(xx_horizontal, xx_vertical, vv_hor, vv_ver, pivot = 'middle') + axa[1].quiver(rr_horizontal, rr_vertical, uu_hor, uu_ver, pivot = 'middle') + + fig.colorbar(mapa, ax=axa[0]) + fig.colorbar(maprho, ax=axa[1]) + + tau = get_tau(tt) + tt_kyr = tt/kyr + axa[0].set_title(r'$%s(x, \tau = %.3f)$ ' % (title1, tau)) + axa[1].set_title(r'$%s(r, t = %.3f \mathrm{kyr})$ ' % (title2, tt_kyr)) + + axa[0].set_xlabel('x') + axa[0].set_ylabel('x') + #axa[1].set_xlabel('r (AU)') + #axa[1].set_ylabel('r (AU)') + axa[1].set_xlabel(r'r ($10^{17}$ cm)') + axa[1].set_ylabel(r'r ($10^{17}$ cm)' ) + + ##axa[1].set_xlim(0.0, 3e17/AU) + ##axa[1].set_ylim(0.0, 3e17/AU) + axa[1].set_xlim(0.0, 3.0) + axa[1].set_ylim(0.0, 3.0) + + axa[0].set_aspect('equal', 'datalim') + #axa[1].set_aspect('equal', 'datalim') + + figfile = '%s_%s.png' % (filetitle, 
str(numslice).zfill(6)) + print(figfile) + figa.savefig(figfile) + plt.close(figa) + + + +xx_SHU = np.array([ 0.05, 0.10, 0.15, 0.20, 0.25, + 0.30, 0.35, 0.40, 0.45, 0.50, + 0.55, 0.60, 0.65, 0.70, 0.75, + 0.80, 0.85, 0.90, 0.95, 1.00]) + +alpha_SHU77 = np.array([ 71.5, 27.8, 16.4, 11.5, 8.76, + 7.09, 5.95, 5.14, 4.52, 4.04, + 3.66, 3.35, 3.08, 2.86, 2.67, + 2.50, 2.35, 2.22, 2.10, 2.00]) + +vv_SHU77 = -np.array([ 5.44, 3.47, 2.58, 2.05, 1.68, + 1.40, 1.18, 1.01, 0.861, 0.735, + 0.625, 0.528, 0.442, 0.363, 0.291, + 0.225, 0.163, 0.106, 0.051, 0.00]) + +mm_SHU77 = np.array([0.981, 0.993, 1.01, 1.03, 1.05, + 1.08, 1.12, 1.16, 1.20, 1.25, + 1.30, 1.36, 1.42, 1.49, 1.56, + 1.64, 1.72, 1.81, 1.90, 2.00]) + + + + +#GS Table 1 + +alpha_mono_GS93 = np.array([ 6.304, 2.600, 1.652, 1.156, 9.005e-1, + 7.314e-1, 6.084e-1, 5.084e-1, 4.256e-1, 3.517e-1, + 2.829e-1, 2.172e-1, 1.488e-1, 8.091e-2, 8.360e-3, + -6.826e-2, -1.512e-1, -2.406e-1, -3.382e-1, -4.444e-1]) + +vv_ww_mono_GS93 = np.array([[4.372e-1, 3.335e-1, 2.390e-1, 1.918e-1, 1.522e-1, + 1.226e-1, 9.579e-2, 7.103e-2, 4.828e-2, 2.640e-2, + 5.058e-3, -1.588e-2, -3.791e-2, -5.975e-2, -8.293e-2, + -1.071e-1, -1.330e-1, -1.605e-1, -1.902e-1, -2.222e-1], + [ 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0]]) + +mm_pp_mono_GS93 = np.array([[8.634e-4, 1.959e-3, 3.560e-3, 5.661e-3, 8.235e-3, + 1.130e-2, 1.482e-2, 1.873e-2, 2.293e-2, 2.730e-2, + 3.166e-2, 3.579e-2, 3.935e-2, 4.196e-2, 4.312e-2, + 4.221e-2, 3.847e-2, 3.097e-2, 1.859e-2, 0.0], + [ 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0]]) + + +FF_DD_mono_GS93 = np.array([[ -1.130, -3.275e-1, -1.355e-1, -6.415e-2, -2.889e-2, #F + -8.387e-3, 5.358e-3, 1.534e-2, 2.303e-2, 2.931e-2, + 3.454e-2, 3.888e-2, 4.225e-2, 4.442e-2, 4.504e-2, + 4.358e-2, 3.935e-2, 3.146e-2, 1.881e-2, 0.0], + [ -1.246e1, -3.168, -1.141, -5.740e-1, -3.178e-1, #D + -1.878e-1, -1.049e-1, 
-4.547e-2, 3.393e-4, 3.924e-2, + 7.431e-2, 1.070e-1, 1.376e-1, 1.650e-1, 1.867e-1, + 1.992e-1, 1.966e-1, 1.708e-1, 1.103e-1, 0.0]]) + + + +#GS Table 2 + +alpha_quad_GS93 = np.array([ -1.096e3, -1.191e2, -3.148e1, -1.158e1, -5.105, + -2.456, -1.217, -5.889e-1, -2.569e-1, -7.024e-2, + 3.790e-2, 1.042e-1, 1.505e-1, 1.845e-1, 2.163e-1, + 2.492e-1, 2.865e-1, 3.302e-1, 3.823e-1, 4.437e-1]) + +vv_ww_quad_GS93 = np.array([[ -2.581, -1.533, -8.072e-1, -5.666e-1, -3.905e-1, #v + -2.790e-1, -1.928e-1, -1.254e-1, -7.156e-2, -2.614e-2, + 1.267e-2, 4.650e-2, 7.724e-2, 1.042e-1, 1.288e-1, + 1.510e-1, 1.711e-1, 1.889e-1, 2.045e-1, 2.177e-1], + [ -2.085, -4.890, -1.811, -8.842e-1, -4.816e-1, #w + -2.807e-1, -1.628e-1, -8.779e-2, -3.852e-2, -4.481e-3, + 1.928e-2, 3.578e-2, 4.683e-2, 5.306e-2, 5.512e-2, + 5.312e-2, 4.704e-2, 3.670e-2, 2.179e-2, 1.898e-3]]) + +mm_pp_quad_GS93 = np.array([[-3.860e-5, -1.541e-4, -3.044e-4, -4.847e-4, -6.831e-4, #m + -8.874e-4, -1.083e-3, -1.253e-3, -1.385e-3, -1.462e-3, + -1.470e-3, -1.389e-3, -1.191e-3, -8.405e-4, -2.841e-4, + 5.579e-4, 1.800e-3, 3.609e-3, 6.218e-3, 9.951e-3], + [ -7.539e1, -7.275, -1.730, -5.586e-1, -1.999e-1, #p + -6.591e-1, -1.062e-2, 1.294e-2, 2.267e-2, 2.600e-2, + 2.625e-2, 2.500e-2, 2.294e-2, 2.046e-2, 1.769e-2, + 1.469e-2, 1.146e-2, 7.941e-3, 4.102e-3, -1.214e-4]]) + +FF_DD_quad_GS93 = np.array([[ -2.253, -6.517e-1, -2.722e-1, -1.345e-1, -6.993e-2, #F + -3.593e-2, -1.660e-2, -5.864e-3, -6.809e-4, 8.213e-4, + -3.086e-4, -3.338e-3, -7.681e-3, -1.272e-2, -1.778e-2, + -2.191e-2, -2.392e-2, -2.219e-2, -1.457e-2, 1.729e-3], + [ -2.484e1, -6.258, -2.221, -1.102, -6.127e-1, #D + -3.645e-1, -2.213e-1, -1.297e-1, -7.020e-2, -1.112e-2, + -2.139e-3, -1.615e-2, 2.744e-2, 3.252e-2, 3.269e-2, + 2.839e-2, 2.104e-2, 1.199e-2, 3.732e-3, 0.0]]) + + +tt = 0.3*ttm +theta = 0.5*np.pi + + +xx_SHU = xx_SHU[:-1] +vv_SHU77 = vv_SHU77[:-1] +alpha_SHU77 = alpha_SHU77[:-1] + +alpha_mono_GS93 = alpha_mono_GS93[:-1] +alpha_quad_GS93 = alpha_quad_GS93[:-1] + 
+vv_ww_mono_GS93 = np.array([vv_ww_mono_GS93[0][:-1], vv_ww_mono_GS93[1][:-1]]) +vv_ww_quad_GS93 = np.array([vv_ww_quad_GS93[0][:-1], vv_ww_quad_GS93[1][:-1]]) + + +rho, alpha_xvec = rho_rt(tt, xx_SHU, vv_SHU77, alpha_SHU77, alpha_mono_GS93, alpha_quad_GS93, theta) + +rr = xx_SHU*cs0*tt + +np.set_printoptions(linewidth=200) + +print(rho.shape) +print(xx_SHU.shape) + +print(rho) +print(xx_SHU) + +print(vv_ww_mono_GS93) +print(vv_ww_quad_GS93) +print(vv_ww_quad_GS93[0]) +print(vv_ww_quad_GS93[1]) + +#plt.figure() +#plt.plot(rr, rho) +# +#plt.figure() +#plt.plot(xx_SHU, alpha_xvec, label = "GS93") +#plt.plot(xx_SHU, alpha_SHU77, label = "Shu77") +#plt.legend() + + +#alpha_mono_yy, alpha_quad_yy, alpha_mono_yy = yy_transform(xx_SHU, alpha_SHU77, alpha_mono_GS93, alpha_quad_GS93) + + +plt.figure() +plt.plot(xx_SHU, alpha_SHU77, label=r"$\alpha^{(0)}$") +plt.plot(xx_SHU, alpha_mono_GS93, label=r"$\alpha^{(2)}_0$") +plt.plot(xx_SHU, alpha_quad_GS93, label=r"$\alpha^{(2)}_2$") +plt.ylim([-5.0,5.0]) +plt.legend() +plt.show() + + +''' +ii = 0 +theta_axis = np.linspace(0.0, np.pi) +xx_theta = np.array([]) + +print("PIIP") + + +plt.figure() +for ii in range(0,xx_SHU.size): + alpha_theta = np.array([]) + alpha_shuref = np.array([]) + for theta in theta_axis: + rho, alpha_xvec = rho_rt(tt, xx_SHU[ii], vv_SHU77[ii], alpha_SHU77[ii], alpha_mono_GS93[ii], alpha_quad_GS93[ii]) + alpha_theta = np.append(alpha_theta, alpha_xvec) + alpha_shuref = np.append(alpha_shuref, alpha_SHU77[ii]) + + plt.plot(alpha_theta, theta_axis, label = "GS93") + #plt.plot(alpha_shuref, theta_axis, label = "GS93") +''' + + +#Interpolate a mesh. + +xx_SHU_GRID = np.insert(xx_SHU, 0, 0.0) +print(xx_SHU_GRID) + +xx_horizontal, xx_vertical = np.meshgrid(xx_SHU_GRID, xx_SHU_GRID, indexing='xy') +theta = np.arctan2(xx_horizontal, xx_vertical) + +#Take pcolormesh coordinate system into account, which marks corners instead of centre points. 
+dxx = np.abs(xx_horizontal[0,1] - xx_horizontal[0,0]) + +print(dxx) +xx_horizontal_corners = xx_horizontal - dxx/2.0 +xx_vertical_corners = xx_vertical - dxx/2.0 + +xx_rad = np.sqrt(xx_horizontal**2.0 + xx_vertical**2.0) + + + + +fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(16,4)) + +map1 = ax[0].pcolormesh(xx_horizontal_corners, xx_vertical_corners, theta) +map2 = ax[1].pcolormesh(xx_horizontal_corners, xx_vertical_corners, xx_rad) + +ax[0].set_title(r"$\theta$") +ax[1].set_title(r"$x_\mathrm{rad}$") + +fig.colorbar(map1, ax=ax[0]) +fig.colorbar(map2, ax=ax[1]) + +ax[0].set_aspect('equal', 'datalim') +ax[1].set_aspect('equal', 'datalim') + + + + +Pfig, Pax = plt.subplots(nrows=1, ncols=3, figsize=(16,4)) + +print("P_harmonics(theta, J=0)", P_harmonics(theta, J=0)) + +Pmap1 = Pax[0].pcolormesh(xx_horizontal_corners, xx_vertical_corners, P_harmonics(theta, J=0)) +Pmap2 = Pax[1].pcolormesh(xx_horizontal_corners, xx_vertical_corners, P_harmonics(theta, J=2)) +Pmap3 = Pax[2].pcolormesh(xx_horizontal_corners, xx_vertical_corners, deltaspace(theta, 0.5)) + +Pax[0].set_title(r"$P_0(\theta)$") +Pax[1].set_title(r"$P_2(\theta)$") +Pax[2].set_title(r"$\Delta(\theta, \tau = 0.5)$") + + +Pfig.colorbar(Pmap1, ax=Pax[0]) +Pfig.colorbar(Pmap2, ax=Pax[1]) +Pfig.colorbar(Pmap3, ax=Pax[2]) + +Pax[0].set_aspect('equal', 'datalim') +Pax[1].set_aspect('equal', 'datalim') +Pax[2].set_aspect('equal', 'datalim') + + + + +Bfig, Bax = plt.subplots(nrows=1, ncols=2, figsize=(16,4)) + +print("B_harmonics(theta, J=0)", B_harmonics(theta, J=0)) + +Bmap1 = Bax[0].pcolormesh(xx_horizontal_corners, xx_vertical_corners, B_harmonics(theta, J=0)) +Bmap2 = Bax[1].pcolormesh(xx_horizontal_corners, xx_vertical_corners, B_harmonics(theta, J=2)) + +Bax[0].set_title(r"$B_0(\theta)$") +Bax[1].set_title(r"$B_2(\theta)$") + +Bfig.colorbar(Bmap1, ax=Bax[0]) +Bfig.colorbar(Bmap2, ax=Bax[1]) + +Bax[0].set_aspect('equal', 'datalim') +Bax[1].set_aspect('equal', 'datalim') + + +plt.show() + + + 
+##xx_horizontal_corners = np.append(xx_horizontal_corners, (np.amax(xx_horizontal_corners)+dxx)*np.ones((xx_horizontal_corners.shape[1],1)), axis=1) + +print(xx_horizontal_corners[-1,:]) +print(xx_horizontal_corners) + +##xx_horizontal_corners = np.vstack((xx_horizontal_corners, xx_horizontal_corners[-1,:])) +##print(xx_horizontal_corners) + +##xx_vertical_corners = np.append(xx_vertical_corners, (np.amax(xx_vertical_corners)+dxx)*np.ones((1,xx_vertical_corners.shape[0])), axis=0) + +print(xx_vertical_corners[:, -1]) +print(xx_vertical_corners) +##xx_vertical_corners = np.hstack((xx_vertical_corners, xx_vertical_corners[:,-1])) +print(xx_vertical_corners) + +numslice = 0 +frametot = 201 +#frametot = 101 +#frametot = 11 +for tt in np.linspace(0.1, ttm, num=frametot): + + alpha = np.empty_like(xx_rad) + alpha77 = np.empty_like(xx_rad) + rho = np.empty_like(xx_rad) + + vv_rad = np.empty_like(xx_rad) + vv_pol = np.empty_like(xx_rad) + uu_rad = np.empty_like(xx_rad) + uu_pol = np.empty_like(xx_rad) + + psi = np.empty_like(xx_rad) + Vpot = np.empty_like(xx_rad) + + Delta = np.empty_like(xx_rad) + + Phi_flux = np.empty_like(xx_rad) + phi_vecpot = np.empty_like(xx_rad) + + + alpha_2_J = np.empty_like(xx_rad) + + for ii in range(xx_SHU_GRID.size): + for kk in range(xx_SHU_GRID.size): + xx = xx_rad[ii,kk] + th = theta[ii,kk] + ishu = get_shu_index(xx, xx_SHU) + rho[ii, kk], alpha[ii, kk] = rho_rt(tt, xx_SHU[ishu], + vv_SHU77[ishu], + alpha_SHU77[ishu], + alpha_mono_GS93[ishu], + alpha_quad_GS93[ishu], th) + alpha77[ii, kk] = alpha_SHU77[ishu] + + vv_ww_mono_point = vv_ww_mono_GS93[:, ishu] + vv_ww_quad_point = vv_ww_quad_GS93[:, ishu] + uu_dump, vv_dump = uu_rt(tt, xx_SHU[ishu], vv_SHU77[ishu], alpha_SHU77[ishu], vv_ww_mono_point, vv_ww_quad_point, th) + vv_rad[ii, kk] = vv_dump[0] + vv_pol[ii, kk] = vv_dump[1] + uu_rad[ii, kk] = uu_dump[0] + uu_pol[ii, kk] = uu_dump[1] + + mm_pp_mono_point = mm_pp_mono_GS93[:, ishu] + mm_pp_quad_point = mm_pp_quad_GS93[:, ishu] + Vpot[ii, 
kk], psi[ii, kk] = grav_psi_rt(tt, xx_SHU[ishu], mm_SHU77[ishu], mm_pp_mono_point, mm_pp_quad_point, th) + + Phi_flux[ii, kk], phi_vecpot[ii, kk] = vectorpot_rt(tt, xx_SHU[ishu], mm_SHU77[ishu], alpha_SHU77[ishu], + FF_DD_mono_GS93[:, ishu], + FF_DD_quad_GS93[:, ishu], th) + + Delta[ii, kk] = deltaspace(th, get_tau(tt)) + alpha_2_J[ii, kk] = alpha_mono_GS93[ishu]*P_harmonics(th, J=0) + alpha_quad_GS93[ishu]*P_harmonics(th, J=2) + + + vv_hor = vv_pol*np.cos(theta) + vv_rad*np.sin(theta) + vv_ver = - vv_pol*np.sin(theta) + vv_rad*np.cos(theta) + uu_hor = uu_pol*np.cos(theta) + uu_rad*np.sin(theta) + uu_ver = - uu_pol*np.sin(theta) + uu_rad*np.cos(theta) + + + rho77 = alpha77 * (1.0/(4.0*np.pi*G_newton)*tt) #TODO WRONG COEFFS!!! + + + #Apply mask + + rad_mask = 0.2 + + + alpha = np.ma.masked_where(xx_rad < rad_mask, alpha) + rho = np.ma.masked_where(xx_rad < rad_mask, rho) + + vv_rad = np.ma.masked_where(xx_rad < rad_mask, vv_rad) + uu_rad = np.ma.masked_where(xx_rad < rad_mask, uu_rad) + vv_pol = np.ma.masked_where(xx_rad < rad_mask, vv_pol) + uu_pol = np.ma.masked_where(xx_rad < rad_mask, uu_pol) + + vv_hor = np.ma.masked_where(xx_rad < rad_mask, vv_hor) + vv_ver = np.ma.masked_where(xx_rad < rad_mask, vv_ver) + uu_hor = np.ma.masked_where(xx_rad < rad_mask, uu_hor) + uu_ver = np.ma.masked_where(xx_rad < rad_mask, uu_ver) + + psi = np.ma.masked_where(xx_rad < rad_mask, psi ) + Vpot = np.ma.masked_where(xx_rad < rad_mask, Vpot) + + phi_vecpot = np.ma.masked_where(xx_rad < rad_mask, phi_vecpot) + Phi_flux = np.ma.masked_where(xx_rad < rad_mask, Phi_flux ) + + alpha_2_J = np.ma.masked_where(xx_rad < rad_mask, alpha_2_J) + Delta = np.ma.masked_where(xx_rad < rad_mask, Delta ) + + plot_figure(tt, xx_horizontal_corners, xx_vertical_corners, xx_horizontal, xx_vertical, alpha, rho, + vv_hor=vv_hor, vv_ver=vv_ver, uu_hor=uu_hor, uu_ver=uu_ver, + title1=r"\alpha", title2=r"\rho", filetitle='GS93density', + streamlines = 1, contourplot=1, + var_min=[0.00, 1e15], var_max=[16, 
1e21], + normtype = 'cdensity') + + plot_figure(tt, xx_horizontal_corners, xx_vertical_corners, xx_horizontal, xx_vertical, alpha77, rho77, + #var_min=[0.00, 0], var_max=[16, 1e20], + title1=r"\alpha", title2=r"\rho", filetitle='S77density') + + plot_figure(tt, xx_horizontal_corners, xx_vertical_corners, xx_horizontal, xx_vertical, vv_rad, uu_rad, + vv_hor=vv_hor, vv_ver=vv_ver, uu_hor=uu_hor, uu_ver=uu_ver, + title1=r"v_r", title2=r"u_r", filetitle='GS93velocity_rad', + var_min=[-2.5, -2.5*cs0], var_max=[0.0, 0.0*cs0], + normtype = 'lin') + + plot_figure(tt, xx_horizontal_corners, xx_vertical_corners, xx_horizontal, xx_vertical, vv_pol, uu_pol, + vv_hor=vv_hor, vv_ver=vv_ver, uu_hor=uu_hor, uu_ver=uu_ver, + title1=r"v_\theta", title2=r"u_\theta", filetitle='GS93velocity_pol', + var_min=[0.0, 0.0*cs0], var_max=[0.5, 0.5*cs0], + normtype = 'lin') + + + plot_figure(tt, xx_horizontal_corners, xx_vertical_corners, xx_horizontal, xx_vertical, psi, Vpot, + vv_hor=vv_hor, vv_ver=vv_ver, uu_hor=uu_hor, uu_ver=uu_ver, + title1=r"\psi", title2=r"V_\mathrm{pot}", filetitle='GS93gravpot', + var_min=[12.0, 12.0*(cs0**2.0)], var_max=[21.0, 21.0*(cs0**2.0)], + normtype = 'lin') + + plot_figure(tt, xx_horizontal_corners, xx_vertical_corners, xx_horizontal, xx_vertical, phi_vecpot, Phi_flux, + title1=r"\phi", title2=r"\Phi_\mathrm{flux}", filetitle='GS93vecpot', + vv_hor=vv_hor, vv_ver=vv_ver, uu_hor=uu_hor, uu_ver=uu_ver, + streamlines = 1, contourplot=1, + normtype = 'cflux') + + plot_figure(tt, xx_horizontal_corners, xx_vertical_corners, xx_horizontal, xx_vertical, np.sqrt(vv_hor**2.0 + vv_ver**2.0), np.sqrt(uu_hor**2.0 + uu_ver**2.0), + title1=r"|v|", title2=r"|u| (cm/s)", filetitle='GS93vel2', + var_min=[0.0, 0.0*cs0], var_max=[2.5, 2.5*cs0], + vv_hor=vv_hor, vv_ver=vv_ver, uu_hor=uu_hor, uu_ver=uu_ver, + streamlines = 1, + normtype = 'lin') + + + ##plot_figure(tt, xx_horizontal_corners, xx_vertical_corners, xx_horizontal, xx_vertical, Delta, Delta, + ## title1=r"\Delta", 
title2=r"\Delta", filetitle='Delta', + ## normtype = 'lin') + + ##plot_figure(tt, xx_horizontal_corners, xx_vertical_corners, xx_horizontal, xx_vertical, alpha_2_J, alpha_2_J, + ## title1=r"\sum \alpha^{(2)}_J", title2=r"\sum \alpha^{(2)}_J", filetitle='alpha_2_J', + ## normtype = 'lin') + + numslice += 1 + + + + + + + + + + + + + + + + + + + diff --git a/analysis/python/calc/purge.sh b/analysis/python/calc/purge.sh new file mode 100755 index 0000000..8723972 --- /dev/null +++ b/analysis/python/calc/purge.sh @@ -0,0 +1 @@ +rm *.png diff --git a/analysis/python/calc/shu_selfsim.py b/analysis/python/calc/shu_selfsim.py new file mode 100644 index 0000000..357aa63 --- /dev/null +++ b/analysis/python/calc/shu_selfsim.py @@ -0,0 +1,279 @@ +''' + Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae. + + This file is part of Astaroth. + + Astaroth is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Astaroth is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Astaroth. If not, see . 
+''' +import numpy as np +import pylab as plt + +G_newton = 6.674e-8 #cm**3 g**-1 s**-2 + +def dv_dx(xx,vv, alpha): + EE = alpha*(xx-vv) - 2.0/xx + HH = (xx-vv)**2.0 - 1.0 + return (EE/HH)*(xx-vv) + +def dalpha_dx(xx,vv, alpha): + EE = alpha*(alpha - (2.0/xx)*(xx-vv)) + HH = (xx-vv)**2.0 - 1.0 + return (EE/HH)*(xx-vv) + +###def dv_dx(xx,vv, alpha): +### return 2.0*(xx-vv) +### +###def dalpha_dx(xx,vv, alpha): +### return -1.0*(xx-vv) + +def get_m(xx, vv, alpha): + mm = xx**2.0 * alpha * (xx-vv) + return mm + +def alpha_to_rho(alpha, tt): + rho = alpha/(4.0*np.pi*G_newton*(tt**2.0)) + return rho + +def vv_to_uu(vv, cs0): + uu = cs0*vv + return uu + +def mm_to_MM(mm, tt, cs0): + MM = (((cs0**3.0)*tt)/G_newton)*mm + return MM + +def euler(xx_step, xx, vv, alpha, mm, target): + diff = target - xx[-1] + if diff >= 0: + while xx[-1] <= target: + vv_step = vv[-1] + xx_step*dv_dx(xx[-1], vv[-1], alpha[-1]) + alpha_step = alpha[-1] + xx_step*dalpha_dx(xx[-1], vv[-1], alpha[-1]) + + xx = np.append(xx, xx[-1]+xx_step) + alpha = np.append(alpha, alpha_step) + vv = np.append(vv, vv_step) + mm_step = get_m(xx[-1], vv[-1], alpha[-1]) + mm = np.append(mm, mm_step) + else: + while xx[-1] <= target: + vv_step = vv[-1] + xx_step*dv_dx(xx[-1], vv[-1], alpha[-1]) + alpha_step = alpha[-1] + xx_step*dalpha_dx(xx[-1], vv[-1], alpha[-1]) + + xx = np.append(xx, xx[-1]+xx_step) + alpha = np.append(alpha, alpha_step) + vv = np.append(vv, vv_step) + mm_step = get_m(xx[-1], vv[-1], alpha[-1]) + mm = np.append(mm, mm_step) + return xx, vv, alpha, mm + +def RK4_step(vv, xx, alpha, xx_step): + vv1 = xx_step*dv_dx(xx[-1], vv[-1], alpha[-1]) + alpha1 = xx_step*dalpha_dx(xx[-1], vv[-1], alpha[-1]) + + vv2 = xx_step*dv_dx(xx[-1]+xx_step/2.0, vv[-1]+vv1/2.0, alpha[-1]+alpha1/2.0) + alpha2 = xx_step*dalpha_dx(xx[-1]+xx_step/2.0, vv[-1]+vv1/2.0, alpha[-1]+alpha1/2.0) + + vv3 = xx_step*dv_dx(xx[-1]+xx_step/2.0, vv[-1]+vv2/2.0, alpha[-1]+alpha2/2.0) + alpha3 = xx_step*dalpha_dx(xx[-1]+xx_step/2.0, 
vv[-1]+vv2/2.0, alpha[-1]+alpha2/2.0) + + vv4 = xx_step*dv_dx(xx[-1]+xx_step, vv[-1]+vv3, alpha[-1]+alpha3) + alpha4 = xx_step*dalpha_dx(xx[-1]+xx_step, vv[-1]+vv3, alpha[-1]+alpha3) + + vv_step = vv[-1] + (1.0/6.0)*(vv1 + 2.0*vv2 + 2.0*vv3 + vv4) + alpha_step = alpha[-1] + (1.0/6.0)*(alpha1 + 2.0*alpha2 + 2.0*alpha3 + alpha4) + + return vv_step, alpha_step + +def RK4(xx_step, xx, vv, alpha, mm, target, epsilon): + #Runge-Kutta RK4 + diff = target - xx[-1] + #if diff < 0: + + if diff >= 0: + while xx[-1] <= target: + if (np.abs(xx[-1] - vv[-1] - 1.0) > epsilon): + vv_step, alpha_step = RK4_step(vv, xx, alpha, xx_step) + print( vv_step, alpha_step) + else: + vv_step = vv[-1] + alpha_step = alpha[-1] + print("PIIP") + + #print(np.abs(xx[-1] - vv[-1]), epsilon) + + xx = np.append(xx, xx[-1]+xx_step) + alpha = np.append(alpha, alpha_step) + vv = np.append(vv, vv_step) + mm_step = get_m(xx[-1], vv[-1], alpha[-1]) + mm = np.append(mm, mm_step) + else: + while xx[-1] >= target: + if (np.abs(xx[-1] - vv[-1] - 1.0) > epsilon): + vv_step, alpha_step = RK4_step(vv, xx, alpha, xx_step) + print( vv_step, alpha_step) + else: + vv_step = vv[-1] + alpha_step = alpha[-1] + print("PIIP") + + #print(np.abs(xx[-1] - vv[-1]), epsilon) + + xx = np.append(xx, xx[-1]+xx_step) + alpha = np.append(alpha, alpha_step) + vv = np.append(vv, vv_step) + mm_step = get_m(xx[-1], vv[-1], alpha[-1]) + mm = np.append(mm, mm_step) + + + return xx, vv, alpha, mm + +# From Shu 1977 TABLE II + +xx_SHU = np.array([0.05 , 0.10 , 0.15 , 0.20 , 0.25 , 0.30 , 0.35 , 0.40 , 0.45 , + 0.50 , 0.55 , 0.60 , 0.65 , 0.70 , 0.75 , 0.80 , 0.85 , + 0.90 , 0.95 , 1.00]) +alpha_SHU = np.array([71.5 , 27.8 , 16.4 , 11.5 , 8.76 , 7.09 , 5.95 , 5.14 , 4.52 , + 4.04 , 3.66 , 3.35 , 3.08 , 2.86 , 2.67 , 2.50 , 2.35 , + 2.22 , 2.10 , 2.00]) +vv_SHU = -np.array([5.44 , 3.47 , 2.58 , 2.05 , 1.68 , 1.40 , 1.18 , 1.01 , 0.861, + 0.735, 0.625, 0.528, 0.442, 0.363, 0.291, 0.225, 0.163, + 0.106, 0.051, 0.00]) +mm_SHU = 
np.array([0.981, 0.993, 1.01 , 1.03 , 1.05 , 1.08 , 1.12 , 1.16 , 1.20 , + 1.25 , 1.30 , 1.36 , 1.42 , 1.49 , 1.56 , 1.64 , 1.72 , + 1.81 , 1.90 , 2.00]) + + +##From Shu (1977) +#AA = [ 2.0, 2.2, 2.4, 2.6, 2.8, 3.0, 3.2, 3.4, 3.6, 3.8, 4.0] +#m0 = [0.975, 1.45, 1.88, 2.31, 2.74, 3.18, 3.63, 4.10, 4.58, 5.08, 5.58] +#AA = np.array(AA) +#m0 = np.array(m0) + +#xx0 = xx_SHU[1] +#alpha0 = alpha_SHU[1] +#vv0 = vv_SHU[1] +#xx_step = 0.005 +#target = 1.0 + +xx0 = xx_SHU[-3] +alpha0 = alpha_SHU[-3] +vv0 = vv_SHU[-3] +target = 0.05 +xx_step = -0.005 +xx_step = -0.001 + +print(get_m(xx0, alpha0, vv0)) + +xx = np.array([]) +alpha = np.array([]) +vv = np.array([]) +mm = np.array([]) + +xx = np.append(xx, xx0) +alpha = np.append(alpha, alpha0) +vv = np.append(vv, vv0) +mm = np.append(mm, get_m(xx0, alpha0, vv0)) + +print(xx, alpha, vv, mm) + + +xx_EUL, vv_EUL, alpha_EUL, mm_EUL = euler(xx_step, xx, vv, alpha, mm, target) +xx_RK , vv_RK , alpha_RK , mm_RK = RK4(xx_step, xx, vv, alpha, mm, target, epsilon = 0.000001) + +mm_EUL = get_m(xx_EUL, alpha_EUL, vv_EUL) +mm_RK = get_m(xx_RK , alpha_RK , vv_RK ) +mm_SHU = get_m(xx_SHU, alpha_SHU, vv_SHU) + +# Plotting time + +figQ, axQ = plt.subplots(nrows=2, ncols=2, sharex=True) + +axQ[0,0].plot(xx_EUL, alpha_EUL, label=r'$\alpha$ (Euler)', linewidth = 3.0) +axQ[0,0].plot(xx_RK , alpha_RK , label=r'$\alpha$ (RK4)', linewidth = 3.0) +axQ[0,0].plot(xx_SHU, alpha_SHU, 'd', label=r'$\alpha$ (Shu)', linewidth = 3.0) +axQ[0,0].set_xlabel(r'x') +axQ[0,0].set_ylabel(r'$\alpha$') +axQ[0,0].legend() + +axQ[0,1].plot(xx_EUL, np.abs(vv_EUL), label='v (Euler)', linewidth = 3.0) +axQ[0,1].plot(xx_RK , np.abs(vv_RK ), label='v (RK4)', linewidth = 3.0) +axQ[0,1].plot(xx_SHU, np.abs(vv_SHU),'d', label='v (Shu)', linewidth = 3.0) +axQ[0,1].set_xlabel(r'x') +axQ[0,1].set_ylabel(r'-v') +axQ[0,1].legend() + +axQ[1,0].plot(xx_EUL, mm_EUL, label='m (Euler)', linewidth = 3.0) +axQ[1,0].plot(xx_RK , mm_RK , label='m (RK4)', linewidth = 3.0) +axQ[1,0].plot(xx_SHU 
, mm_SHU , 'd', label='m (Shu)', linewidth = 3.0) +axQ[1,0].set_xlabel(r'x') +axQ[1,0].set_ylabel(r'm') +axQ[1,0].legend() + + +axQ[1,1].plot(xx_EUL, xx_EUL-vv_EUL, label='x-v (Euler)', linewidth = 3.0) +axQ[1,1].plot(xx_RK , xx_RK -vv_RK , label='x-v (RK4)', linewidth = 3.0) +axQ[1,1].plot(xx_SHU, xx_SHU-vv_SHU, 'd', label='x-v (Shu)', linewidth = 3.0) +axQ[1,1].set_xlabel(r'x') +axQ[1,1].set_ylabel(r'x-v') +axQ[1,1].legend() + +# Time to convert to physical quantities +yr = 3.154e+7 #s +kyr = 1000.0*yr +km = 1e5 #cm +AU = 1.496e+13 #cm +Msun = 1.98847e33 #g + +cs0 = 20000 #cs cm/s "a" in Shu notation + +tt_list = np.linspace(10*kyr, 20.0*kyr, num=4) +mm = get_m(xx_RK, vv_RK, alpha_RK) + + +fig, ax = plt.subplots(nrows=1, ncols=3, sharex=True) + +for tt in tt_list: + rho = alpha_to_rho(alpha_RK, tt) + RR = xx_RK*(cs0*tt) + time = r'%.2f $\mathrm{kyr}$' % (tt/kyr) + + ax[0].plot(RR/AU, rho, label= r'$\rho$, t = ' + time, linewidth = 3.0) + ax[0].set_xlabel(r'R (AU)') + ax[0].set_ylabel(r'$\rho$ (g/cm$^3$)') + ax[0].set_xscale('log') + ax[0].set_yscale('log') + ax[0].legend() + + uu = vv_to_uu(vv_RK, cs0) + + ax[1].plot(RR/AU, -uu/km, label= r'$u$, t = ' + time, linewidth = 3.0) + ax[1].set_xlabel(r'R (AU)') + ax[1].set_ylabel(r'-$u$ (km/s)') + ax[1].set_yscale('log') + ax[1].legend() + + MM = mm_to_MM(mm, tt, cs0) + + ax[2].plot(RR/AU, MM/Msun, label= r'$M$, t = ' + time, linewidth = 3.0) + ax[2].set_xlabel(r'R (AU)') + ax[2].set_ylabel(r'$M$ ($M_\odot}$)') + ax[2].legend() + + + +plt.show() + + + + diff --git a/analysis/python/purgepng.sh b/analysis/python/purgepng.sh new file mode 100755 index 0000000..8723972 --- /dev/null +++ b/analysis/python/purgepng.sh @@ -0,0 +1 @@ +rm *.png diff --git a/analysis/python/samples/README.md b/analysis/python/samples/README.md new file mode 100644 index 0000000..bed7a71 --- /dev/null +++ b/analysis/python/samples/README.md @@ -0,0 +1,3 @@ +# Analysis script samples + +This directory is for sample scripts useable for data 
analysis and visualization. diff --git a/analysis/python/samples/lnrhobound.py b/analysis/python/samples/lnrhobound.py new file mode 100644 index 0000000..2400bad --- /dev/null +++ b/analysis/python/samples/lnrhobound.py @@ -0,0 +1,41 @@ +import pylab as plt +import numpy as np + + +def do_bound(coeff): + vertex_buffer = np.zeros(7, dtype=np.float32) + xx = np.arange(vertex_buffer.size) + + edge_idx = 3 + + for dst_idx in range(3): + i_diff = abs(edge_idx - dst_idx) + vertex_buffer[dst_idx] = coeff*np.exp(vertex_buffer[edge_idx]) + + print("initial",vertex_buffer) + + for i in range(i_diff): + vertex_buffer[dst_idx] = coeff*vertex_buffer[dst_idx] + print("looped", vertex_buffer[dst_idx]) + + vertex_buffer[dst_idx] = np.log(vertex_buffer[dst_idx]); + print("final",vertex_buffer) + + return xx, vertex_buffer + + +AC_dsx = 0.04908738521 +coeff1 = 1.0 - AC_dsx/(25.0*AC_dsx) +coeff2 = 1.0 - AC_dsx/(100.0*AC_dsx) + + +plt.figure() +xx, yy = do_bound(coeff1) +plt.plot(xx, yy) + +plt.figure() +xx, yy = do_bound(coeff2) +plt.plot(xx, yy) + +plt.show() + diff --git a/analysis/python/samples/readtest.py b/analysis/python/samples/readtest.py new file mode 100644 index 0000000..ad1d0b1 --- /dev/null +++ b/analysis/python/samples/readtest.py @@ -0,0 +1,260 @@ +''' + Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae. + + This file is part of Astaroth. + + Astaroth is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Astaroth is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Astaroth. If not, see . 
+''' +import astar.data as ad +import astar.visual as vis +import pylab as plt +import numpy as np +import sys + +##mesh = ad.read.Mesh(500, fdir="/tiara/home/mvaisala/astaroth-code/astaroth_2.0/build/") +## +##print(np.shape(mesh.uu)) +##print(np.shape(mesh.lnrho)) +## +##uu_tot = np.sqrt(mesh.uu[0]**2.0 + mesh.uu[1]**2.0 + mesh.uu[2]**2.0) +##vis.slices.plot_3(mesh, uu_tot, title = r'$|u|$', bitmap = True, fname = 'uutot') +## +##vis.slices.plot_3(mesh, mesh.lnrho, title = r'$\ln \rho$', bitmap = True, fname = 'lnrho') +## +##print(mesh.minfo.contents) + + +AC_unit_density = 1e-17 +AC_unit_velocity = 1e5 +AC_unit_length = 1.496e+13 + + +print("sys.argv", sys.argv) + +#meshdir = "/tiara/home/mvaisala/astaroth-code/astaroth_2.0/build/" +meshdir = "/tiara/ara/data/mvaisala/tmp/astaroth-code/astaroth_2.0/build/" +#meshdir = "/tiara/ara/data/mvaisala/asth_testbed_double/" + +if "xtopbound" in sys.argv: + for i in range(0, 171): + mesh = ad.read.Mesh(i, fdir=meshdir) + if mesh.ok: + np.set_printoptions(precision=4, linewidth=150) + uu_tot = np.sqrt(mesh.uu[0]**2.0 + mesh.uu[1]**2.0 + mesh.uu[2]**2.0) + print(mesh.lnrho.shape) + print(range((mesh.lnrho.shape[0]-7),mesh.lnrho.shape[0])) + print('lnrho', i, mesh.lnrho[(mesh.lnrho.shape[0]-7):mesh.lnrho.shape[0], 20, 100]) + print('uux', i, mesh.uu[0][(mesh.lnrho.shape[0]-7):mesh.lnrho.shape[0], 20, 100]) + print('uuy', i, mesh.uu[1][(mesh.lnrho.shape[0]-7):mesh.lnrho.shape[0], 20, 100]) + print('uuz', i, mesh.uu[2][(mesh.lnrho.shape[0]-7):mesh.lnrho.shape[0], 20, 100]) + print('uu_tot', i, uu_tot[ (mesh.lnrho.shape[0]-7):mesh.lnrho.shape[0], 20, 100]) + + +if "single" in sys.argv: + mesh = ad.read.Mesh(1, fdir=meshdir) + print(mesh.lnrho.shape) + + print( mesh.lnrho[1, 50, 100], 0.0) + print( mesh.lnrho[197, 50, 100], 0.0) + print( mesh.lnrho[100, 50, 1], 0.0) + print( mesh.lnrho[100, 50, 197], 0.0) + print( mesh.lnrho[100, 1, 100], "periodic") + print( mesh.lnrho[100, 101, 00], "periodic") + + angle = 0.78 + UUXX = -0.25 
* np.cos(angle) + zorig = 4.85965 + zz = [0.0490874*1.0 - zorig, 0.0490874*100.0 - zorig, 0.0490874*197.0 - zorig] + print (zz) + zz = np.array(zz) + UUZZ = - 0.25*np.sin(angle)*np.tanh(zz/0.2) + #plt.plot(np.linspace(-5.0, 5.0, num=100),- (0.25*np.sin(angle))*np.tanh(np.linspace(-5.0, 5.0, num=100)/0.2)) + #plt.show() + print("---- UUX") + print( mesh.uu[0][1, 50, 100], 0.0) + print( mesh.uu[0][197, 50, 100], UUXX) + print( mesh.uu[0][100, 50, 1], UUXX) + print( mesh.uu[0][100, 50, 197], UUXX) + print( mesh.uu[0][100, 1, 100], "periodic") + print( mesh.uu[0][100, 101, 00], "periodic") + print("---- UUY") + print( mesh.uu[1][1, 50, 100], 0.0) + print( mesh.uu[1][197, 50, 100], 0.0) + print( mesh.uu[1][100, 50, 1], 0.0) + print( mesh.uu[1][100, 50, 197], 0.0) + print( mesh.uu[1][100, 1, 100], "periodic") + print( mesh.uu[1][100, 101, 00], "periodic") + print("---- UUZ") + print( mesh.uu[2][1, 50, 100], 0.0) + print( mesh.uu[2][197, 50, 100], UUZZ[1]) + print( mesh.uu[2][100, 50, 1], UUZZ[0]) + print( mesh.uu[2][100, 50, 197], UUZZ[2]) + print( mesh.uu[2][100, 1, 100], "periodic") + print( mesh.uu[2][100, 101, 00], "periodic") + +if 'xline' in sys.argv: + mesh = ad.read.Mesh(0, fdir=meshdir) + plt.figure() + plt.plot(mesh.uu[0][100, 50, :] , label="z") + plt.plot(mesh.uu[0][100, :, 100], label="x") + plt.plot(mesh.uu[0][:, 50, 100] , label="y") + plt.legend() + + plt.figure() + plt.plot(mesh.uu[0][197, 50, :] , label="z edge") + + plt.figure() + plt.plot(mesh.uu[1][100, 50, :] , label="z") + plt.plot(mesh.uu[1][100, :, 100], label="x") + plt.plot(mesh.uu[1][:, 50, 100] , label="y") + plt.legend() + + plt.figure() + plt.plot(mesh.uu[2][100, 50, :] , label="z") + plt.plot(mesh.uu[2][100, :, 100], label="x") + plt.plot(mesh.uu[2][:, 50, 100] , label="y") + plt.legend() + plt.show() + +if 'check' in sys.argv: + mesh = ad.read.Mesh(0, fdir=meshdir) + vis.slices.plot_3(mesh, mesh.lnrho, title = r'$\ln \rho$', bitmap = False, fname = 'lnrho', contourplot = True) + 
plt.show() + + + +if 'diff' in sys.argv: + mesh0 = ad.read.Mesh(1, fdir=meshdir) + mesh1 = ad.read.Mesh(2, fdir=meshdir) + vis.slices.plot_3(mesh1, mesh1.lnrho - mesh0.lnrho, title = r'$\ln \rho$', bitmap = True, fname = 'lnrho') + vis.slices.plot_3(mesh1, mesh1.uu[0] - mesh0.uu[0], title = r'$u_x$', bitmap = True, fname = 'uux') + vis.slices.plot_3(mesh1, mesh1.uu[1] - mesh0.uu[1], title = r'$u_y$', bitmap = True, fname = 'uuy') + vis.slices.plot_3(mesh1, mesh1.uu[2] - mesh0.uu[2], title = r'$u_z$', bitmap = True, fname = 'uuz') + +if '1d' in sys.argv: + plt.figure() + for i in range(0, 100001, 1000): + mesh = ad.read.Mesh(i, fdir=meshdir) + if mesh.ok: + + if 'lnrho' in sys.argv: + plt.plot(mesh.lnrho[:, 20, 100], label=i) + elif 'uux' in sys.argv: + plt.plot(mesh.uu[0][:, 20, 100], label=i) + elif 'uuy' in sys.argv: + plt.plot(mesh.uu[1][:, 20, 100], label=i) + elif 'uuz' in sys.argv: + plt.plot(mesh.uu[2][:, 20, 100], label=i) + elif 'uutot' in sys.argv: + uu_tot = np.sqrt(mesh.uu[0]**2.0 + mesh.uu[1]**2.0 + mesh.uu[2]**2.0) + plt.plot(uu_tot[:, 20, 100], label=i) + + plt.legend() + + plt.show() + + +if 'sl' in sys.argv: + maxfiles = 200002 + stride = 10000 + for i in range(0, maxfiles, stride): + mesh = ad.read.Mesh(i, fdir=meshdir) + print(" %i / %i" % (i, maxfiles)) + if mesh.ok: + uu_tot = np.sqrt(mesh.uu[0]**2.0 + mesh.uu[1]**2.0 + mesh.uu[2]**2.0) + + if 'lim' in sys.argv: + vis.slices.plot_3(mesh, mesh.lnrho, title = r'$\ln \rho$', bitmap = True, fname = 'lnrho', colrange=[-0.02, 0.0]) + vis.slices.plot_3(mesh, np.exp(mesh.lnrho), title = r'$\rho$', bitmap = True, fname = 'rho', colrange=[0.97, 1.0]) + vis.slices.plot_3(mesh, mesh.uu[0], title = r'$u_x$', bitmap = True, fname = 'uux', colrange=[-0.002, 0.002]) + vis.slices.plot_3(mesh, mesh.uu[1], title = r'$u_y$', bitmap = True, fname = 'uuy', colrange=[-1.0e-20, 1.0e-20]) + vis.slices.plot_3(mesh, mesh.uu[2], title = r'$u_z$', bitmap = True, fname = 'uuz', colrange=[-0.002, 0.002]) + 
vis.slices.plot_3(mesh, np.exp(mesh.lnrho), title = r'$N_\mathrm{col}$', bitmap = True, fname = 'colden', slicetype = 'sum', colrange=[0.0, 100.0]) + vis.slices.plot_3(mesh, uu_tot, title = r'$|u|$', bitmap = True, fname = 'uutot', colrange=[0.00, 0.004]) + else: + vis.slices.plot_3(mesh, mesh.lnrho, title = r'$\ln \rho$', bitmap = True, fname = 'lnrho') + vis.slices.plot_3(mesh, np.exp(mesh.lnrho), title = r'$\rho$', bitmap = True, fname = 'rho') + #vis.slices.plot_3(mesh, mesh.ss, title = r'$s$', bitmap = True, fname = 'ss') + vis.slices.plot_3(mesh, mesh.uu[0], title = r'$u_x$', bitmap = True, fname = 'uux') + vis.slices.plot_3(mesh, mesh.uu[1], title = r'$u_y$', bitmap = True, fname = 'uuy') + vis.slices.plot_3(mesh, mesh.uu[2], title = r'$u_z$', bitmap = True, fname = 'uuz') + vis.slices.plot_3(mesh, np.exp(mesh.lnrho), title = r'$N_\mathrm{col}$', bitmap = True, fname = 'colden', slicetype = 'sum') + vis.slices.plot_3(mesh, uu_tot, title = r'$|u|$', bitmap = True, fname = 'uutot') + + + +if 'ts' in sys.argv: + ts = ad.read.TimeSeries(fdir=meshdir) + + end_rm = -1 #-35#-40 + + plt.figure() + xaxis = 't_step' + yaxis1 = 'lnrho_rms' + yaxis2 = 'lnrho_min' + yaxis3 = 'lnrho_max' + plt.plot(ts.var[xaxis][:end_rm], ts.var[yaxis1][:end_rm], label=yaxis1) + plt.plot(ts.var[xaxis][:end_rm], ts.var[yaxis2][:end_rm], label=yaxis2) + plt.plot(ts.var[xaxis][:end_rm], ts.var[yaxis3][:end_rm], label=yaxis3) + plt.xlabel(xaxis) + plt.legend() + + plt.figure() + xaxis = 't_step' + yaxis1 = 'uutot_rms' + yaxis2 = 'uutot_min' + yaxis3 = 'uutot_max' + plt.plot(ts.var[xaxis][:end_rm], ts.var[yaxis1][:end_rm], label=yaxis1) + plt.plot(ts.var[xaxis][:end_rm], ts.var[yaxis2][:end_rm], label=yaxis2) + plt.plot(ts.var[xaxis][:end_rm], ts.var[yaxis3][:end_rm], label=yaxis3) + plt.xlabel(xaxis) + plt.legend() + + plt.figure() + xaxis = 't_step' + yaxis1 = 'uux_rms' + yaxis2 = 'uux_min' + yaxis3 = 'uux_max' + plt.plot(ts.var[xaxis][:end_rm], ts.var[yaxis1][:end_rm], label=yaxis1) + 
plt.plot(ts.var[xaxis][:end_rm], ts.var[yaxis2][:end_rm], label=yaxis2) + plt.plot(ts.var[xaxis][:end_rm], ts.var[yaxis3][:end_rm], label=yaxis3) + plt.xlabel(xaxis) + plt.legend() + + plt.figure() + xaxis = 't_step' + yaxis1 = 'uuy_rms' + yaxis2 = 'uuy_min' + yaxis3 = 'uuy_max' + plt.plot(ts.var[xaxis][:end_rm], ts.var[yaxis1][:end_rm], label=yaxis1) + plt.plot(ts.var[xaxis][:end_rm], ts.var[yaxis2][:end_rm], label=yaxis2) + plt.plot(ts.var[xaxis][:end_rm], ts.var[yaxis3][:end_rm], label=yaxis3) + plt.xlabel(xaxis) + plt.legend() + + plt.figure() + xaxis = 't_step' + yaxis1 = 'uuz_rms' + yaxis2 = 'uuz_min' + yaxis3 = 'uuz_max' + plt.plot(ts.var[xaxis][:end_rm], ts.var[yaxis1][:end_rm], label=yaxis1) + plt.plot(ts.var[xaxis][:end_rm], ts.var[yaxis2][:end_rm], label=yaxis2) + plt.plot(ts.var[xaxis][:end_rm], ts.var[yaxis3][:end_rm], label=yaxis3) + plt.xlabel(xaxis) + plt.legend() + + + plt.show() + + diff --git a/config/astaroth.conf b/config/astaroth.conf new file mode 100644 index 0000000..5100bf6 --- /dev/null +++ b/config/astaroth.conf @@ -0,0 +1,54 @@ + + +/* + * ============================================================================= + * "Compile-time" params + * ============================================================================= + */ +AC_nx = 192 +AC_ny = 120 +AC_nz = 7 + +AC_dsx = 0.04908738521 +AC_dsy = 0.04908738521 +AC_dsz = 0.04908738521 + +/* + * ============================================================================= + * Run-time params + * ============================================================================= + */ +AC_max_steps = 1001 +AC_save_steps = 10 +AC_bin_steps = 1000 +AC_bin_save_t = 1e666 + +// Hydro +AC_cdt = 0.4 +AC_cdtv = 0.3 +AC_cdts = 1.0 +AC_nu_visc = 5e-3 +AC_cs_sound = 1.0 +AC_zeta = 0.01 + +// Magnetic +AC_eta = 5e-3 +AC_mu0 = 1.4 +AC_chi = 0.0001 + +// Forcing +AC_relhel = 0.0 + +// Entropy +AC_cp_sound = 1.0 +AC_gamma = 0.5 +AC_lnT0 = 1.2 +AC_lnrho0 = 1.3 + +/* + * 
============================================================================= + * Initial conditions + * ============================================================================= + */ +AC_ampl_lnrho = 0.0 +AC_ampl_uu = 1.0 diff --git a/config/astaroth_pseudodisk.conf b/config/astaroth_pseudodisk.conf new file mode 100644 index 0000000..4cfde41 --- /dev/null +++ b/config/astaroth_pseudodisk.conf @@ -0,0 +1,121 @@ + + +/* + * ============================================================================= + * "Compile-time" params + * ============================================================================= + */ +AC_nx = 192 +AC_ny = 48 +AC_nz = 192 + +AC_dsx = 0.04908738521 +AC_dsy = 0.04908738521 +AC_dsz = 0.04908738521 + +/* + * ============================================================================= + * Run-time params + * ============================================================================= + */ +//AC_max_steps = 16001 +//AC_save_steps = 50 +//AC_bin_steps = 16000 + +//AC_max_steps = 1001 +//AC_save_steps = 10 +//AC_bin_steps = 1000 + +//AC_max_steps = 11 +//AC_save_steps = 1 +//AC_bin_steps = 1 + +//AC_max_steps = 4 +//AC_save_steps = 1 +//AC_bin_steps = 1 + +//AC_max_steps = 1201 +//AC_save_steps = 10 +//AC_bin_steps = 1200 +//AC_bin_save_t = 5.0 + + +//AC_max_steps = 50001 +//AC_save_steps = 100 +//AC_bin_steps = 10000 + +AC_max_steps = 100001 +AC_save_steps = 500 +AC_bin_steps = 20000 + +AC_bin_save_t = 2300000.0 + +// Hydro +AC_cdt = 0.4 +AC_cdtv = 0.3 +AC_cdts = 1.0 +//GOOD VISC Re_mesh = 3 +//AC_nu_visc = 3.0e-3 +AC_nu_visc = 1.0e-3 +AC_cs_sound = 0.2 +AC_zeta = 1.0e-3 + +// Magnetic +AC_eta = 5e-3 +AC_mu0 = 1.4 +AC_chi = 0.0001 + +// Forcing +AC_relhel = 0.0 + +// Entropy +// cp arbitrary +AC_cp_sound = 1.0 +// 5/3 adiabatic process +AC_gamma = 1.66 +AC_lnT0 = 1.0 +AC_lnrho0 = 0.0 + + +// Boundary condition. Defined by arbitrary int. 
+AC_bc_type = 666 +//AC_bc_type = 121 +AC_trans = 0.6 + + +//Physical units (cgs) +// Based on Shu 1977 model calculations with t = 20 kyr, R = 500 AU +// g/cm^3 +AC_unit_density = 1e-17 +// cm/s +// Now 1 km/s +//AC_unit_velocity = 1e5 +AC_unit_velocity = 1.0 +// cm +// Now 1 AU +AC_unit_length = 1.496e+13 + +//Properties of gravitating star* +AC_star_pos_x = -500.0 +//AC_star_pos_x = -10.0 +AC_star_pos_y = 0.0 +AC_star_pos_z = 0.0 +//In M_sun +//AC_M_star = 0.05 +AC_M_star = 0.5 +//AC_M_star = 0.0 + +/* + * ============================================================================= + * Initial conditions + * ============================================================================= + */ +AC_ampl_lnrho = 0.0 +AC_lnrho_edge = -1.0 +AC_lnrho_out = 0.0 +//original +//AC_ampl_uu = 0.25 +//For gravity test +AC_ampl_uu = 0.0 +AC_angl_uu = 0.0 +//AC_angl_uu = 0.35 diff --git a/doc/doxygen/.gitignore b/doc/doxygen/.gitignore new file mode 100644 index 0000000..5e7d273 --- /dev/null +++ b/doc/doxygen/.gitignore @@ -0,0 +1,4 @@ +# Ignore everything in this directory +* +# Except this file +!.gitignore diff --git a/doc/manual/manual.md b/doc/manual/manual.md new file mode 100644 index 0000000..007b1bc --- /dev/null +++ b/doc/manual/manual.md @@ -0,0 +1,131 @@ + +*Miikka Vaisala: This is just something I have started to write up to make sense of Astaroth 2.0. Starting from personally important notes to understand the code. Will be refined as my understanding improves.* + +# Astaroth manual + +## Compilation + +See the `README.md`. At the moment, let us keep certain things in one place. + +## Simulation instructions + +At the moment it is only possible to build and run in the `astaroth_2.0/build/` directory. Possibility to add separate run directories will be included later. + +### Choosing physics + +Runtime settings can be adjusted from `astaroth_2.0/include/astaroth.h` and `astaroth_2.0/config/astaroth.conf`. + +However, physics switches LENTROPY, LFORCING etc.
do not work at the moment. There has been an issue getting the pre-processor compatible with the Astaroth domain-specific language in Astaroth 2.0. Therefore, all features are online by default. + +To get the switcher working now, rename `astaroth_2.0/src/core/kernels/rk3handtuned.cuh` -> `rk3.cuh`. (**MV:** Not yet tested.) + +How to use? + +What kind of runtime settings? + +### Setting initial conditions + +Where can we effectively choose the initial condition? + +### Launching a run + +`./ac_run -s` assuming you are doing a normal simulation. Basic code for this invocation can be found in the source file `astaroth_2.0/src/standalone/simulation.cc`. + +Please note that launching `./ac_run -t` will *fail if entropy and forcing are in use*. The test is mainly for finding parallelization bugs. (In principle if hydro stuff and induction work, so will forcing and entropy.) + +### Diagnostic variables + +What is calculated? + +Where is it saved? + +### Simulation data + +Saving output binaries is not enabled yet. + +**MV:** I am planning to implement HDF5 format for the data. **TOP PRIORITY**. + +#### Notes about data structures + +- Configuration parameters have prefix `AC_`, such as `AC_dsx`. + +- All configurations are stored in the struct `AcMeshInfo`, containing tables `int_params` and `real_params`. **NOTE:** `int_params` and `real_params` require diligence. If you call e.g. `int_params[AC_dsx]`, the result will be something unexpected. So far it has not been possible to automate error checking for this. + + +- All mesh data is stored in the struct `AcMesh`, containing both configuration values and vertex data (`lnrho`, `uux`, etc.) + +- All essential structs, macros and enumerators are found in astaroth.h for better reference. + +- In case there are changes in the data layout, it is better to use the macro `AC_VTXBUF_IDX(i, j, k, mesh_info)` which transforms indices from 3D to 1D.
Therefore there is no need to start writing `i + j * mesh_info.int_params[AC_mx] + ...` which would affect the code readability. + +- AcReal is the generic floating point real number type used everywhere in the code. Currently it can be either `float` or `double`. Possibly in the future also `half` or `long double` could become available. + +Sample code: + +```cpp +AcMeshInfo mesh_info; +// Loads data from astaroth.conf into the AcMeshInfo struct +load_config(&mesh_info); + +// Allocates data on the host for the AcMesh struct using information found in mesh_info. +AcMesh* mesh = acmesh_create(mesh_info); + +// Initializes mesh to InitType (specified in standalone/model/host_memory.h) +acmesh_init_to(INIT_TYPE_GAUSSIAN_RADIAL_EXPL, mesh); + +// Allocates data on the device for the AcMesh struct +acInit(mesh_info); + +acLoad(*mesh); // Loads the mesh to the device + + +const AcReal dt = 1.f; + +// Synchronizes previous device commands +acSynchronize(); + +// Does a full rk3 integration step on the device +acIntegrate(dt); + +acSynchronize(); + +// Store data from device to host mesh +acStore(mesh); + +printf("nx: %d, dsx %f\n", + mesh->info.int_params[AC_nx], + double(mesh->info.real_params[AC_dsx])); +printf("First vertex of the computational domain: %f\n", +double(mesh->vertex_buffer[VTXBUF_LNRHO][AC_VTXBUF_IDX(3, 3, 3, mesh_info)])); + +``` + + +### Reading data + +Depends on the output format. With HDF5 should be simple enough. + +[Jupyter notebook](http://jupyter.org/) visualization? + +Do we want to use [YT?](https://yt-project.org/) + +### Live rendering + +MV: Cool, but does not work for remote cluster so far. A GPU workstation is required. + +## Multi-GPU + +At the moment multi-GPU is not included in Astaroth 2.0. However, it has been implemented in 1.0 (`astaroth_1.0/src/gpu/cuda/cuda_generic.cu`) and could be essentially ported by copy-pasting to `astaroth_2.0/src/core/astaroth.cu` after we have a clear idea how to run things with a single GPU. Could be done overnight in principle.
+ + ## Profiling + + The built-in benchmark is currently unreliable for an unknown reason. Please use [nvprof and nvvp](https://docs.nvidia.com/cuda/profiler-users-guide/index.html) for precise profiling. Also, NVIDIA suggests their [Nsight Systems](https://developer.nvidia.com/nsight-systems). + + + +## ETC + +**Note:** `auto_optimize.sh` does not currently work, but it aims to tune thread block dimensions automatically. + + diff --git a/doxyfile b/doxyfile new file mode 100644 index 0000000..7bab478 --- /dev/null +++ b/doxyfile @@ -0,0 +1,2427 @@ +# Doxyfile 1.8.11 + +# This file describes the settings to be used by the documentation system +# doxygen (www.doxygen.org) for a project. +# +# All text after a double hash (##) is considered a comment and is placed in +# front of the TAG it is preceding. +# +# All text after a single hash (#) is considered a comment and will be ignored. +# The format is: +# TAG = value [value, ...] +# For lists, items can also be appended using: +# TAG += value [value, ...] +# Values that contain spaces should be placed between quotes (\" \"). + +#--------------------------------------------------------------------------- +# Project related configuration options +#--------------------------------------------------------------------------- + +# This tag specifies the encoding used for all characters in the config file +# that follow. The default is UTF-8 which is also the encoding used for all text +# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv +# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv +# for the list of possible encodings. +# The default value is: UTF-8. + +DOXYFILE_ENCODING = UTF-8 + +# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by +# double-quotes, unless you are using Doxywizard) that should identify the +# project for which the documentation is generated.
This name is used in the +# title of most generated pages and in a few other places. +# The default value is: My Project. + +PROJECT_NAME = "Astaroth" + +# The PROJECT_NUMBER tag can be used to enter a project or revision number. This +# could be handy for archiving the generated documentation or if some version +# control system is used. + +PROJECT_NUMBER = + +# Using the PROJECT_BRIEF tag one can provide an optional one line description +# for a project that appears at the top of each page and should give viewer a +# quick idea about the purpose of the project. Keep the description short. + +PROJECT_BRIEF = + +# With the PROJECT_LOGO tag one can specify a logo or an icon that is included +# in the documentation. The maximum height of the logo should not exceed 55 +# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy +# the logo to the output directory. + +PROJECT_LOGO = + +# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path +# into which the generated documentation will be written. If a relative path is +# entered, it will be relative to the location where doxygen was started. If +# left blank the current directory will be used. + +OUTPUT_DIRECTORY = + +# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- +# directories (in 2 levels) under the output directory of each output format and +# will distribute the generated files over these directories. Enabling this +# option can be useful when feeding doxygen a huge amount of source files, where +# putting all generated files in the same directory would otherwise causes +# performance problems for the file system. +# The default value is: NO. + +CREATE_SUBDIRS = NO + +# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII +# characters to appear in the names of generated files. If set to NO, non-ASCII +# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode +# U+3044. +# The default value is: NO. 
+ +ALLOW_UNICODE_NAMES = NO + +# The OUTPUT_LANGUAGE tag is used to specify the language in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all constant output in the proper language. +# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese, +# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States), +# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian, +# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages), +# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian, +# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian, +# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish, +# Ukrainian and Vietnamese. +# The default value is: English. + +OUTPUT_LANGUAGE = English + +# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member +# descriptions after the members that are listed in the file and class +# documentation (similar to Javadoc). Set to NO to disable this. +# The default value is: YES. + +BRIEF_MEMBER_DESC = YES + +# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief +# description of a member or function before the detailed description +# +# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the +# brief descriptions will be completely suppressed. +# The default value is: YES. + +REPEAT_BRIEF = YES + +# This tag implements a quasi-intelligent brief description abbreviator that is +# used to form the text in various listings. Each string in this list, if found +# as the leading text of the brief description, will be stripped from the text +# and the result, after processing the whole list, is used as the annotated +# text. Otherwise, the brief description is used as-is. 
If left blank, the +# following values are used ($name is automatically replaced with the name of +# the entity):The $name class, The $name widget, The $name file, is, provides, +# specifies, contains, represents, a, an and the. + +ABBREVIATE_BRIEF = + +# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then +# doxygen will generate a detailed section even if there is only a brief +# description. +# The default value is: NO. + +ALWAYS_DETAILED_SEC = NO + +# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all +# inherited members of a class in the documentation of that class as if those +# members were ordinary class members. Constructors, destructors and assignment +# operators of the base classes will not be shown. +# The default value is: NO. + +INLINE_INHERITED_MEMB = NO + +# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path +# before files name in the file list and in the header files. If set to NO the +# shortest path that makes the file name unique will be used +# The default value is: YES. + +FULL_PATH_NAMES = YES + +# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. +# Stripping is only done if one of the specified strings matches the left-hand +# part of the path. The tag can be used to show relative paths in the file list. +# If left blank the directory from which doxygen is run is used as the path to +# strip. +# +# Note that you can specify absolute paths here, but also relative paths, which +# will be relative from the directory where doxygen is started. +# This tag requires that the tag FULL_PATH_NAMES is set to YES. + +STRIP_FROM_PATH = + +# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the +# path mentioned in the documentation of a class, which tells the reader which +# header file to include in order to use a class. If left blank only the name of +# the header file containing the class definition is used. 
Otherwise one should +# specify the list of include paths that are normally passed to the compiler +# using the -I flag. + +STRIP_FROM_INC_PATH = + +# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but +# less readable) file names. This can be useful is your file systems doesn't +# support long names like on DOS, Mac, or CD-ROM. +# The default value is: NO. + +SHORT_NAMES = NO + +# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the +# first line (until the first dot) of a Javadoc-style comment as the brief +# description. If set to NO, the Javadoc-style will behave just like regular Qt- +# style comments (thus requiring an explicit @brief command for a brief +# description.) +# The default value is: NO. + +JAVADOC_AUTOBRIEF = NO + +# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first +# line (until the first dot) of a Qt-style comment as the brief description. If +# set to NO, the Qt-style will behave just like regular Qt-style comments (thus +# requiring an explicit \brief command for a brief description.) +# The default value is: NO. + +QT_AUTOBRIEF = NO + +# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a +# multi-line C++ special comment block (i.e. a block of //! or /// comments) as +# a brief description. This used to be the default behavior. The new default is +# to treat a multi-line C++ comment block as a detailed description. Set this +# tag to YES if you prefer the old behavior instead. +# +# Note that setting this tag to YES also means that rational rose comments are +# not recognized any more. +# The default value is: NO. + +MULTILINE_CPP_IS_BRIEF = NO + +# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the +# documentation from any documented member that it re-implements. +# The default value is: YES. 
+ +INHERIT_DOCS = YES + +# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new +# page for each member. If set to NO, the documentation of a member will be part +# of the file/class/namespace that contains it. +# The default value is: NO. + +SEPARATE_MEMBER_PAGES = NO + +# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen +# uses this value to replace tabs by spaces in code fragments. +# Minimum value: 1, maximum value: 16, default value: 4. + +TAB_SIZE = 4 + +# This tag can be used to specify a number of aliases that act as commands in +# the documentation. An alias has the form: +# name=value +# For example adding +# "sideeffect=@par Side Effects:\n" +# will allow you to put the command \sideeffect (or @sideeffect) in the +# documentation, which will result in a user-defined paragraph with heading +# "Side Effects:". You can put \n's in the value part of an alias to insert +# newlines. + +ALIASES = + +# This tag can be used to specify a number of word-keyword mappings (TCL only). +# A mapping has the form "name=value". For example adding "class=itcl::class" +# will allow you to use the command class in the itcl::class meaning. + +TCL_SUBST = + +# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources +# only. Doxygen will then generate output that is more tailored for C. For +# instance, some of the names that are used will be different. The list of all +# members will be omitted, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_FOR_C = NO + +# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or +# Python sources only. Doxygen will then generate output that is more tailored +# for that language. For instance, namespaces will be presented as packages, +# qualified scopes will look different, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_JAVA = NO + +# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran +# sources. 
Doxygen will then generate output that is tailored for Fortran. +# The default value is: NO. + +OPTIMIZE_FOR_FORTRAN = NO + +# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL +# sources. Doxygen will then generate output that is tailored for VHDL. +# The default value is: NO. + +OPTIMIZE_OUTPUT_VHDL = NO + +# Doxygen selects the parser to use depending on the extension of the files it +# parses. With this tag you can assign which parser to use for a given +# extension. Doxygen has a built-in mapping, but you can override or extend it +# using this tag. The format is ext=language, where ext is a file extension, and +# language is one of the parsers supported by doxygen: IDL, Java, Javascript, +# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran: +# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran: +# Fortran. In the later case the parser tries to guess whether the code is fixed +# or free formatted code, this is the default for Fortran type files), VHDL. For +# instance to make doxygen treat .inc files as Fortran files (default is PHP), +# and .f files as C (default is Fortran), use: inc=Fortran f=C. +# +# Note: For files without extension you can use no_extension as a placeholder. +# +# Note that for custom extensions you also need to set FILE_PATTERNS otherwise +# the files are not read by doxygen. + +EXTENSION_MAPPING = cu=c++ cuh=c++ + +# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments +# according to the Markdown format, which allows for more readable +# documentation. See http://daringfireball.net/projects/markdown/ for details. +# The output of markdown processing is further processed by doxygen, so you can +# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in +# case of backward compatibilities issues. +# The default value is: YES. 
+ +MARKDOWN_SUPPORT = YES + +# When enabled doxygen tries to link words that correspond to documented +# classes, or namespaces to their corresponding documentation. Such a link can +# be prevented in individual cases by putting a % sign in front of the word or +# globally by setting AUTOLINK_SUPPORT to NO. +# The default value is: YES. + +AUTOLINK_SUPPORT = YES + +# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want +# to include (a tag file for) the STL sources as input, then you should set this +# tag to YES in order to let doxygen match functions declarations and +# definitions whose arguments contain STL classes (e.g. func(std::string); +# versus func(std::string) {}). This also make the inheritance and collaboration +# diagrams that involve STL classes more complete and accurate. +# The default value is: NO. + +BUILTIN_STL_SUPPORT = NO + +# If you use Microsoft's C++/CLI language, you should set this option to YES to +# enable parsing support. +# The default value is: NO. + +CPP_CLI_SUPPORT = NO + +# Set the SIP_SUPPORT tag to YES if your project consists of sip (see: +# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen +# will parse them like normal C++ but will assume all classes use public instead +# of private inheritance when no explicit protection keyword is present. +# The default value is: NO. + +SIP_SUPPORT = NO + +# For Microsoft's IDL there are propget and propput attributes to indicate +# getter and setter methods for a property. Setting this option to YES will make +# doxygen to replace the get and set methods by a property in the documentation. +# This will only work if the methods are indeed getting or setting a simple +# type. If this is not the case, or you want to show the methods anyway, you +# should set this option to NO. +# The default value is: YES. 
+ +IDL_PROPERTY_SUPPORT = YES + +# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC +# tag is set to YES then doxygen will reuse the documentation of the first +# member in the group (if any) for the other members of the group. By default +# all members of a group must be documented explicitly. +# The default value is: NO. + +DISTRIBUTE_GROUP_DOC = NO + +# If one adds a struct or class to a group and this option is enabled, then also +# any nested class or struct is added to the same group. By default this option +# is disabled and one has to add nested compounds explicitly via \ingroup. +# The default value is: NO. + +GROUP_NESTED_COMPOUNDS = NO + +# Set the SUBGROUPING tag to YES to allow class member groups of the same type +# (for instance a group of public functions) to be put as a subgroup of that +# type (e.g. under the Public Functions section). Set it to NO to prevent +# subgrouping. Alternatively, this can be done per class using the +# \nosubgrouping command. +# The default value is: YES. + +SUBGROUPING = YES + +# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions +# are shown inside the group in which they are included (e.g. using \ingroup) +# instead of on a separate page (for HTML and Man pages) or section (for LaTeX +# and RTF). +# +# Note that this feature does not work in combination with +# SEPARATE_MEMBER_PAGES. +# The default value is: NO. + +INLINE_GROUPED_CLASSES = NO + +# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions +# with only public data fields or simple typedef fields will be shown inline in +# the documentation of the scope in which they are defined (i.e. file, +# namespace, or group documentation), provided this scope is documented. If set +# to NO, structs, classes, and unions are shown on a separate page (for HTML and +# Man pages) or section (for LaTeX and RTF). +# The default value is: NO. 
+ +INLINE_SIMPLE_STRUCTS = NO + +# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or +# enum is documented as struct, union, or enum with the name of the typedef. So +# typedef struct TypeS {} TypeT, will appear in the documentation as a struct +# with name TypeT. When disabled the typedef will appear as a member of a file, +# namespace, or class. And the struct will be named TypeS. This can typically be +# useful for C code in case the coding convention dictates that all compound +# types are typedef'ed and only the typedef is referenced, never the tag name. +# The default value is: NO. + +TYPEDEF_HIDES_STRUCT = NO + +# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This +# cache is used to resolve symbols given their name and scope. Since this can be +# an expensive process and often the same symbol appears multiple times in the +# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small +# doxygen will become slower. If the cache is too large, memory is wasted. The +# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range +# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536 +# symbols. At the end of a run doxygen will report the cache usage and suggest +# the optimal cache size from a speed point of view. +# Minimum value: 0, maximum value: 9, default value: 0. + +LOOKUP_CACHE_SIZE = 0 + +#--------------------------------------------------------------------------- +# Build related configuration options +#--------------------------------------------------------------------------- + +# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in +# documentation are documented, even if no documentation was available. Private +# class members and static file members will be hidden unless the +# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES. 
+# Note: This will also disable the warnings about undocumented members that are +# normally produced when WARNINGS is set to YES. +# The default value is: NO. + +EXTRACT_ALL = YES + +# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will +# be included in the documentation. +# The default value is: NO. + +EXTRACT_PRIVATE = NO + +# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal +# scope will be included in the documentation. +# The default value is: NO. + +EXTRACT_PACKAGE = NO + +# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be +# included in the documentation. +# The default value is: NO. + +EXTRACT_STATIC = NO + +# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined +# locally in source files will be included in the documentation. If set to NO, +# only classes defined in header files are included. Does not have any effect +# for Java sources. +# The default value is: YES. + +EXTRACT_LOCAL_CLASSES = YES + +# This flag is only useful for Objective-C code. If set to YES, local methods, +# which are defined in the implementation section but not in the interface are +# included in the documentation. If set to NO, only methods in the interface are +# included. +# The default value is: NO. + +EXTRACT_LOCAL_METHODS = NO + +# If this flag is set to YES, the members of anonymous namespaces will be +# extracted and appear in the documentation as a namespace called +# 'anonymous_namespace{file}', where file will be replaced with the base name of +# the file that contains the anonymous namespace. By default anonymous namespace +# are hidden. +# The default value is: NO. + +EXTRACT_ANON_NSPACES = NO + +# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all +# undocumented members inside documented classes or files. If set to NO these +# members will be included in the various overviews, but no documentation +# section is generated. 
This option has no effect if EXTRACT_ALL is enabled. +# The default value is: NO. + +HIDE_UNDOC_MEMBERS = NO + +# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all +# undocumented classes that are normally visible in the class hierarchy. If set +# to NO, these classes will be included in the various overviews. This option +# has no effect if EXTRACT_ALL is enabled. +# The default value is: NO. + +HIDE_UNDOC_CLASSES = NO + +# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend +# (class|struct|union) declarations. If set to NO, these declarations will be +# included in the documentation. +# The default value is: NO. + +HIDE_FRIEND_COMPOUNDS = NO + +# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any +# documentation blocks found inside the body of a function. If set to NO, these +# blocks will be appended to the function's detailed documentation block. +# The default value is: NO. + +HIDE_IN_BODY_DOCS = NO + +# The INTERNAL_DOCS tag determines if documentation that is typed after a +# \internal command is included. If the tag is set to NO then the documentation +# will be excluded. Set it to YES to include the internal documentation. +# The default value is: NO. + +INTERNAL_DOCS = NO + +# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file +# names in lower-case letters. If set to YES, upper-case letters are also +# allowed. This is useful if you have classes or files whose names only differ +# in case and if your file system supports case sensitive file names. Windows +# and Mac users are advised to set this option to NO. +# The default value is: system dependent. + +CASE_SENSE_NAMES = YES + +# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with +# their full class and namespace scopes in the documentation. If set to YES, the +# scope will be hidden. +# The default value is: NO. 
+ +HIDE_SCOPE_NAMES = NO + +# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will +# append additional text to a page's title, such as Class Reference. If set to +# YES the compound reference will be hidden. +# The default value is: NO. + +HIDE_COMPOUND_REFERENCE= NO + +# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of +# the files that are included by a file in the documentation of that file. +# The default value is: YES. + +SHOW_INCLUDE_FILES = YES + +# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each +# grouped member an include statement to the documentation, telling the reader +# which file to include in order to use the member. +# The default value is: NO. + +SHOW_GROUPED_MEMB_INC = NO + +# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include +# files with double quotes in the documentation rather than with sharp brackets. +# The default value is: NO. + +FORCE_LOCAL_INCLUDES = NO + +# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the +# documentation for inline members. +# The default value is: YES. + +INLINE_INFO = YES + +# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the +# (detailed) documentation of file and class members alphabetically by member +# name. If set to NO, the members will appear in declaration order. +# The default value is: YES. + +SORT_MEMBER_DOCS = YES + +# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief +# descriptions of file, namespace and class members alphabetically by member +# name. If set to NO, the members will appear in declaration order. Note that +# this will also influence the order of the classes in the class list. +# The default value is: NO. 
+ +SORT_BRIEF_DOCS = NO + +# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the +# (brief and detailed) documentation of class members so that constructors and +# destructors are listed first. If set to NO the constructors will appear in the +# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS. +# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief +# member documentation. +# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting +# detailed member documentation. +# The default value is: NO. + +SORT_MEMBERS_CTORS_1ST = NO + +# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy +# of group names into alphabetical order. If set to NO the group names will +# appear in their defined order. +# The default value is: NO. + +SORT_GROUP_NAMES = NO + +# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by +# fully-qualified names, including namespaces. If set to NO, the class list will +# be sorted only by class name, not including the namespace part. +# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. +# Note: This option applies only to the class list, not to the alphabetical +# list. +# The default value is: NO. + +SORT_BY_SCOPE_NAME = NO + +# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper +# type resolution of all parameters of a function it will reject a match between +# the prototype and the implementation of a member function even if there is +# only one candidate or it is obvious which candidate to choose by doing a +# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still +# accept a match between prototype and implementation in such cases. +# The default value is: NO. + +STRICT_PROTO_MATCHING = NO + +# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo +# list. This list is created by putting \todo commands in the documentation. 
+# The default value is: YES. + +GENERATE_TODOLIST = YES + +# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test +# list. This list is created by putting \test commands in the documentation. +# The default value is: YES. + +GENERATE_TESTLIST = YES + +# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug +# list. This list is created by putting \bug commands in the documentation. +# The default value is: YES. + +GENERATE_BUGLIST = YES + +# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO) +# the deprecated list. This list is created by putting \deprecated commands in +# the documentation. +# The default value is: YES. + +GENERATE_DEPRECATEDLIST= YES + +# The ENABLED_SECTIONS tag can be used to enable conditional documentation +# sections, marked by \if ... \endif and \cond +# ... \endcond blocks. + +ENABLED_SECTIONS = + +# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the +# initial value of a variable or macro / define can have for it to appear in the +# documentation. If the initializer consists of more lines than specified here +# it will be hidden. Use a value of 0 to hide initializers completely. The +# appearance of the value of individual variables and macros / defines can be +# controlled using \showinitializer or \hideinitializer command in the +# documentation regardless of this setting. +# Minimum value: 0, maximum value: 10000, default value: 30. + +MAX_INITIALIZER_LINES = 30 + +# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at +# the bottom of the documentation of classes and structs. If set to YES, the +# list will mention the files that were used to generate the documentation. +# The default value is: YES. + +SHOW_USED_FILES = YES + +# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This +# will remove the Files entry from the Quick Index and from the Folder Tree View +# (if specified). 
+# The default value is: YES. + +SHOW_FILES = YES + +# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces +# page. This will remove the Namespaces entry from the Quick Index and from the +# Folder Tree View (if specified). +# The default value is: YES. + +SHOW_NAMESPACES = YES + +# The FILE_VERSION_FILTER tag can be used to specify a program or script that +# doxygen should invoke to get the current version for each file (typically from +# the version control system). Doxygen will invoke the program by executing (via +# popen()) the command command input-file, where command is the value of the +# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided +# by doxygen. Whatever the program writes to standard output is used as the file +# version. For an example see the documentation. + +FILE_VERSION_FILTER = + +# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed +# by doxygen. The layout file controls the global structure of the generated +# output files in an output format independent way. To create the layout file +# that represents doxygen's defaults, run doxygen with the -l option. You can +# optionally specify a file name after the option, if omitted DoxygenLayout.xml +# will be used as the name of the layout file. +# +# Note that if you run doxygen from a directory containing a file called +# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE +# tag is left empty. + +LAYOUT_FILE = + +# The CITE_BIB_FILES tag can be used to specify one or more bib files containing +# the reference definitions. This must be a list of .bib files. The .bib +# extension is automatically appended if omitted. This requires the bibtex tool +# to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info. +# For LaTeX the style of the bibliography can be controlled using +# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the +# search path. 
See also \cite for info how to create references. + +CITE_BIB_FILES = + +#--------------------------------------------------------------------------- +# Configuration options related to warning and progress messages +#--------------------------------------------------------------------------- + +# The QUIET tag can be used to turn on/off the messages that are generated to +# standard output by doxygen. If QUIET is set to YES this implies that the +# messages are off. +# The default value is: NO. + +QUIET = NO + +# The WARNINGS tag can be used to turn on/off the warning messages that are +# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES +# this implies that the warnings are on. +# +# Tip: Turn warnings on while writing the documentation. +# The default value is: YES. + +WARNINGS = YES + +# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate +# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag +# will automatically be disabled. +# The default value is: YES. + +WARN_IF_UNDOCUMENTED = YES + +# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for +# potential errors in the documentation, such as not documenting some parameters +# in a documented function, or documenting parameters that don't exist or using +# markup commands wrongly. +# The default value is: YES. + +WARN_IF_DOC_ERROR = YES + +# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that +# are documented, but have no documentation for their parameters or return +# value. If set to NO, doxygen will only warn about wrong or incomplete +# parameter documentation, but not about the absence of documentation. +# The default value is: NO. + +WARN_NO_PARAMDOC = NO + +# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when +# a warning is encountered. +# The default value is: NO. 
+ +WARN_AS_ERROR = NO + +# The WARN_FORMAT tag determines the format of the warning messages that doxygen +# can produce. The string should contain the $file, $line, and $text tags, which +# will be replaced by the file and line number from which the warning originated +# and the warning text. Optionally the format may contain $version, which will +# be replaced by the version of the file (if it could be obtained via +# FILE_VERSION_FILTER) +# The default value is: $file:$line: $text. + +WARN_FORMAT = "$file:$line: $text" + +# The WARN_LOGFILE tag can be used to specify a file to which warning and error +# messages should be written. If left blank the output is written to standard +# error (stderr). + +WARN_LOGFILE = doc/doxygen/doxygen_warnings.log + +#--------------------------------------------------------------------------- +# Configuration options related to the input files +#--------------------------------------------------------------------------- + +# The INPUT tag is used to specify the files and/or directories that contain +# documented source files. You may enter file names like myfile.cpp or +# directories like /usr/src/myproject. Separate the files or directories with +# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING +# Note: If this tag is empty the current directory is searched. + +INPUT = src include + +# This tag can be used to specify the character encoding of the source files +# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses +# libiconv (or the iconv built into libc) for the transcoding. See the libiconv +# documentation (see: http://www.gnu.org/software/libiconv) for the list of +# possible encodings. +# The default value is: UTF-8. + +INPUT_ENCODING = UTF-8 + +# If the value of the INPUT tag contains directories, you can use the +# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and +# *.h) to filter out the source-files in the directories. 
+# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# read by doxygen. +# +# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, +# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, +# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, +# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.pyw, *.f90, *.f, *.for, *.tcl, +# *.vhd, *.vhdl, *.ucf, *.qsf, *.as and *.js. + +FILE_PATTERNS = *.cc *.h *.cu *.cuh + +# The RECURSIVE tag can be used to specify whether or not subdirectories should +# be searched for input files as well. +# The default value is: NO. + +RECURSIVE = YES + +# The EXCLUDE tag can be used to specify files and/or directories that should be +# excluded from the INPUT source files. This way you can easily exclude a +# subdirectory from a directory tree whose root is specified with the INPUT tag. +# +# Note that relative paths are relative to the directory from which doxygen is +# run. + +EXCLUDE = + +# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or +# directories that are symbolic links (a Unix file system feature) are excluded +# from the input. +# The default value is: NO. + +EXCLUDE_SYMLINKS = NO + +# If the value of the INPUT tag contains directories, you can use the +# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude +# certain files from those directories. +# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories for example use the pattern */test/* + +EXCLUDE_PATTERNS = + +# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names +# (namespaces, classes, functions, etc.) that should be excluded from the +# output. The symbol name can be a fully qualified name, a word, or if the +# wildcard * is used, a substring. 
Examples: ANamespace, AClass, +# AClass::ANamespace, ANamespace::*Test +# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories use the pattern */test/* + +EXCLUDE_SYMBOLS = + +# The EXAMPLE_PATH tag can be used to specify one or more files or directories +# that contain example code fragments that are included (see the \include +# command). + +EXAMPLE_PATH = + +# If the value of the EXAMPLE_PATH tag contains directories, you can use the +# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and +# *.h) to filter out the source-files in the directories. If left blank all +# files are included. + +EXAMPLE_PATTERNS = + +# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be +# searched for input files to be used with the \include or \dontinclude commands +# irrespective of the value of the RECURSIVE tag. +# The default value is: NO. + +EXAMPLE_RECURSIVE = NO + +# The IMAGE_PATH tag can be used to specify one or more files or directories +# that contain images that are to be included in the documentation (see the +# \image command). + +IMAGE_PATH = + +# The INPUT_FILTER tag can be used to specify a program that doxygen should +# invoke to filter for each input file. Doxygen will invoke the filter program +# by executing (via popen()) the command: +# +# <filter> <input-file> +# +# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the +# name of an input file. Doxygen will then use the output that the filter +# program writes to standard output. If FILTER_PATTERNS is specified, this tag +# will be ignored. +# +# Note that the filter must not add or remove lines; it is applied before the +# code is scanned, but not when the output code is generated. If lines are added +# or removed, the anchors will not be placed correctly. 
+# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# properly processed by doxygen. + +INPUT_FILTER = + +# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern +# basis. Doxygen will compare the file name with each pattern and apply the +# filter if there is a match. The filters are a list of the form: pattern=filter +# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how +# filters are used. If the FILTER_PATTERNS tag is empty or if none of the +# patterns match the file name, INPUT_FILTER is applied. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# properly processed by doxygen. + +FILTER_PATTERNS = + +# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using +# INPUT_FILTER) will also be used to filter the input files that are used for +# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES). +# The default value is: NO. + +FILTER_SOURCE_FILES = NO + +# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file +# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and +# it is also possible to disable source filtering for a specific pattern using +# *.ext= (so without naming a filter). +# This tag requires that the tag FILTER_SOURCE_FILES is set to YES. + +FILTER_SOURCE_PATTERNS = + +# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that +# is part of the input, its contents will be placed on the main page +# (index.html). This can be useful if you have a project on for instance GitHub +# and want to reuse the introduction page also for the doxygen output. 
+ +USE_MDFILE_AS_MAINPAGE = + +#--------------------------------------------------------------------------- +# Configuration options related to source browsing +#--------------------------------------------------------------------------- + +# If the SOURCE_BROWSER tag is set to YES then a list of source files will be +# generated. Documented entities will be cross-referenced with these sources. +# +# Note: To get rid of all source code in the generated output, make sure that +# also VERBATIM_HEADERS is set to NO. +# The default value is: NO. + +SOURCE_BROWSER = NO + +# Setting the INLINE_SOURCES tag to YES will include the body of functions, +# classes and enums directly into the documentation. +# The default value is: NO. + +INLINE_SOURCES = NO + +# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any +# special comment blocks from generated source code fragments. Normal C, C++ and +# Fortran comments will always remain visible. +# The default value is: YES. + +STRIP_CODE_COMMENTS = YES + +# If the REFERENCED_BY_RELATION tag is set to YES then for each documented +# function all documented functions referencing it will be listed. +# The default value is: NO. + +REFERENCED_BY_RELATION = NO + +# If the REFERENCES_RELATION tag is set to YES then for each documented function +# all documented entities called/used by that function will be listed. +# The default value is: NO. + +REFERENCES_RELATION = NO + +# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set +# to YES then the hyperlinks from functions in REFERENCES_RELATION and +# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will +# link to the documentation. +# The default value is: YES. 
+ +REFERENCES_LINK_SOURCE = YES + +# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the +# source code will show a tooltip with additional information such as prototype, +# brief description and links to the definition and documentation. Since this +# will make the HTML file larger and loading of large files a bit slower, you +# can opt to disable this feature. +# The default value is: YES. +# This tag requires that the tag SOURCE_BROWSER is set to YES. + +SOURCE_TOOLTIPS = YES + +# If the USE_HTAGS tag is set to YES then the references to source code will +# point to the HTML generated by the htags(1) tool instead of doxygen built-in +# source browser. The htags tool is part of GNU's global source tagging system +# (see http://www.gnu.org/software/global/global.html). You will need version +# 4.8.6 or higher. +# +# To use it do the following: +# - Install the latest version of global +# - Enable SOURCE_BROWSER and USE_HTAGS in the config file +# - Make sure the INPUT points to the root of the source tree +# - Run doxygen as normal +# +# Doxygen will invoke htags (and that will in turn invoke gtags), so these +# tools must be available from the command line (i.e. in the search path). +# +# The result: instead of the source browser generated by doxygen, the links to +# source code will now point to the output of htags. +# The default value is: NO. +# This tag requires that the tag SOURCE_BROWSER is set to YES. + +USE_HTAGS = NO + +# If the VERBATIM_HEADERS tag is set to YES then doxygen will generate a +# verbatim copy of the header file for each class for which an include is +# specified. Set to NO to disable this. +# See also: Section \class. +# The default value is: YES. + +VERBATIM_HEADERS = YES + +# If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the +# clang parser (see: http://clang.llvm.org/) for more accurate parsing at the +# cost of reduced performance. 
This can be particularly helpful with template +# rich C++ code for which doxygen's built-in parser lacks the necessary type +# information. +# Note: The availability of this option depends on whether or not doxygen was +# generated with the -Duse-libclang=ON option for CMake. +# The default value is: NO. + +CLANG_ASSISTED_PARSING = NO + +# If clang assisted parsing is enabled you can provide the compiler with command +# line options that you would normally use when invoking the compiler. Note that +# the include paths will already be set by doxygen for the files and directories +# specified with INPUT and INCLUDE_PATH. +# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. + +CLANG_OPTIONS = + +#--------------------------------------------------------------------------- +# Configuration options related to the alphabetical class index +#--------------------------------------------------------------------------- + +# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all +# compounds will be generated. Enable this if the project contains a lot of +# classes, structs, unions or interfaces. +# The default value is: YES. + +ALPHABETICAL_INDEX = YES + +# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in +# which the alphabetical index list will be split. +# Minimum value: 1, maximum value: 20, default value: 5. +# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. + +COLS_IN_ALPHA_INDEX = 5 + +# In case all classes in a project start with a common prefix, all classes will +# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag +# can be used to specify a prefix (or a list of prefixes) that should be ignored +# while generating the index headers. +# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. 
+ +IGNORE_PREFIX = + +#--------------------------------------------------------------------------- +# Configuration options related to the HTML output +#--------------------------------------------------------------------------- + +# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output +# The default value is: YES. + +GENERATE_HTML = YES + +# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a +# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of +# it. +# The default directory is: html. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_OUTPUT = doc/doxygen/astaroth_doc_html + +# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each +# generated HTML page (for example: .htm, .php, .asp). +# The default value is: .html. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FILE_EXTENSION = .html + +# The HTML_HEADER tag can be used to specify a user-defined HTML header file for +# each generated HTML page. If the tag is left blank doxygen will generate a +# standard header. +# +# To get valid HTML the header file that includes any scripts and style sheets +# that doxygen needs, which is dependent on the configuration options used (e.g. +# the setting GENERATE_TREEVIEW). It is highly recommended to start with a +# default header using +# doxygen -w html new_header.html new_footer.html new_stylesheet.css +# YourConfigFile +# and then modify the file new_header.html. See also section "Doxygen usage" +# for information on how to generate the default header that doxygen normally +# uses. +# Note: The header is subject to change so you typically have to regenerate the +# default header when upgrading to a newer version of doxygen. For a description +# of the possible markers and block names see the documentation. +# This tag requires that the tag GENERATE_HTML is set to YES. 
+ +HTML_HEADER = + +# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each +# generated HTML page. If the tag is left blank doxygen will generate a standard +# footer. See HTML_HEADER for more information on how to generate a default +# footer and what special commands can be used inside the footer. See also +# section "Doxygen usage" for information on how to generate the default footer +# that doxygen normally uses. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FOOTER = + +# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style +# sheet that is used by each HTML page. It can be used to fine-tune the look of +# the HTML output. If left blank doxygen will generate a default style sheet. +# See also section "Doxygen usage" for information on how to generate the style +# sheet that doxygen normally uses. +# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as +# it is more robust and this tag (HTML_STYLESHEET) will in the future become +# obsolete. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_STYLESHEET = + +# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined +# cascading style sheets that are included after the standard style sheets +# created by doxygen. Using this option one can overrule certain style aspects. +# This is preferred over using HTML_STYLESHEET since it does not replace the +# standard style sheet and is therefore more robust against future updates. +# Doxygen will copy the style sheet files to the output directory. +# Note: The order of the extra style sheet files is of importance (e.g. the last +# style sheet in the list overrules the setting of the previous ones in the +# list). For an example see the documentation. +# This tag requires that the tag GENERATE_HTML is set to YES. 
+ +HTML_EXTRA_STYLESHEET = + +# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or +# other source files which should be copied to the HTML output directory. Note +# that these files will be copied to the base HTML output directory. Use the +# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these +# files. In the HTML_STYLESHEET file, use the file name only. Also note that the +# files will be copied as-is; there are no commands or markers available. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_EXTRA_FILES = + +# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen +# will adjust the colors in the style sheet and background images according to +# this color. Hue is specified as an angle on a colorwheel, see +# http://en.wikipedia.org/wiki/Hue for more information. For instance the value +# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 +# purple, and 360 is red again. +# Minimum value: 0, maximum value: 359, default value: 220. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_HUE = 220 + +# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors +# in the HTML output. For a value of 0 the output will use grayscales only. A +# value of 255 will produce the most vivid colors. +# Minimum value: 0, maximum value: 255, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_SAT = 100 + +# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the +# luminance component of the colors in the HTML output. Values below 100 +# gradually make the output lighter, whereas values above 100 make the output +# darker. The value divided by 100 is the actual gamma applied, so 80 represents +# a gamma of 0.8, The value 220 represents a gamma of 2.2, and 100 does not +# change the gamma. +# Minimum value: 40, maximum value: 240, default value: 80. 
+# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_GAMMA = 80 + +# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML +# page will contain the date and time when the page was generated. Setting this +# to YES can help to show when doxygen was last run and thus if the +# documentation is up to date. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_TIMESTAMP = NO + +# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML +# documentation will contain sections that can be hidden and shown after the +# page has loaded. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_DYNAMIC_SECTIONS = NO + +# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries +# shown in the various tree structured indices initially; the user can expand +# and collapse entries dynamically later on. Doxygen will expand the tree to +# such a level that at most the specified number of entries are visible (unless +# a fully collapsed tree already exceeds this amount). So setting the number of +# entries 1 will produce a full collapsed tree by default. 0 is a special value +# representing an infinite number of entries and will result in a full expanded +# tree by default. +# Minimum value: 0, maximum value: 9999, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_INDEX_NUM_ENTRIES = 100 + +# If the GENERATE_DOCSET tag is set to YES, additional index files will be +# generated that can be used as input for Apple's Xcode 3 integrated development +# environment (see: http://developer.apple.com/tools/xcode/), introduced with +# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a +# Makefile in the HTML output directory. 
Running make will produce the docset in +# that directory and running make install will install the docset in +# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at +# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html +# for more information. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_DOCSET = NO + +# This tag determines the name of the docset feed. A documentation feed provides +# an umbrella under which multiple documentation sets from a single provider +# (such as a company or product suite) can be grouped. +# The default value is: Doxygen generated docs. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_FEEDNAME = "Doxygen generated docs" + +# This tag specifies a string that should uniquely identify the documentation +# set bundle. This should be a reverse domain-name style string, e.g. +# com.mycompany.MyDocSet. Doxygen will append .docset to the name. +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_BUNDLE_ID = org.doxygen.Project + +# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify +# the documentation publisher. This should be a reverse domain-name style +# string, e.g. com.mycompany.MyDocSet.documentation. +# The default value is: org.doxygen.Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_PUBLISHER_ID = org.doxygen.Publisher + +# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher. +# The default value is: Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_PUBLISHER_NAME = Publisher + +# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three +# additional HTML index files: index.hhp, index.hhc, and index.hhk. 
The +# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop +# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on +# Windows. +# +# The HTML Help Workshop contains a compiler that can convert all HTML output +# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML +# files are now used as the Windows 98 help format, and will replace the old +# Windows help format (.hlp) on all Windows platforms in the future. Compressed +# HTML files also contain an index, a table of contents, and you can search for +# words in the documentation. The HTML workshop also contains a viewer for +# compressed HTML files. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_HTMLHELP = NO + +# The CHM_FILE tag can be used to specify the file name of the resulting .chm +# file. You can add a path in front of the file if the result should not be +# written to the html output directory. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +CHM_FILE = + +# The HHC_LOCATION tag can be used to specify the location (absolute path +# including file name) of the HTML help compiler (hhc.exe). If non-empty, +# doxygen will try to run the HTML help compiler on the generated index.hhp. +# The file has to be specified with full path. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +HHC_LOCATION = + +# The GENERATE_CHI flag controls if a separate .chi index file is generated +# (YES) or that it should be included in the master .chm file (NO). +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +GENERATE_CHI = NO + +# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc) +# and project file content. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. 
+ +CHM_INDEX_ENCODING = + +# The BINARY_TOC flag controls whether a binary table of contents is generated +# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it +# enables the Previous and Next buttons. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +BINARY_TOC = NO + +# The TOC_EXPAND flag can be set to YES to add extra items for group members to +# the table of contents of the HTML help documentation and to the tree view. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +TOC_EXPAND = NO + +# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and +# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that +# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help +# (.qch) of the generated HTML documentation. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_QHP = NO + +# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify +# the file name of the resulting .qch file. The path specified is relative to +# the HTML output folder. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QCH_FILE = + +# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help +# Project output. For more information please see Qt Help Project / Namespace +# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace). +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_NAMESPACE = org.doxygen.Project + +# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt +# Help Project output. For more information please see Qt Help Project / Virtual +# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual- +# folders). +# The default value is: doc. +# This tag requires that the tag GENERATE_QHP is set to YES. 
+ +QHP_VIRTUAL_FOLDER = doc + +# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom +# filter to add. For more information please see Qt Help Project / Custom +# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- +# filters). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_NAME = + +# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the +# custom filter to add. For more information please see Qt Help Project / Custom +# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- +# filters). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_ATTRS = + +# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this +# project's filter section matches. Qt Help Project / Filter Attributes (see: +# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_SECT_FILTER_ATTRS = + +# The QHG_LOCATION tag can be used to specify the location of Qt's +# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the +# generated .qhp file. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHG_LOCATION = + +# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be +# generated, together with the HTML files, they form an Eclipse help plugin. To +# install this plugin and make it available under the help contents menu in +# Eclipse, the contents of the directory containing the HTML and XML files needs +# to be copied into the plugins directory of eclipse. The name of the directory +# within the plugins directory should be the same as the ECLIPSE_DOC_ID value. +# After copying Eclipse needs to be restarted before the help appears. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. 
+ +GENERATE_ECLIPSEHELP = NO + +# A unique identifier for the Eclipse help plugin. When installing the plugin +# the directory name containing the HTML and XML files should also have this +# name. Each documentation set should have its own identifier. +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES. + +ECLIPSE_DOC_ID = org.doxygen.Project + +# If you want full control over the layout of the generated HTML pages it might +# be necessary to disable the index and replace it with your own. The +# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top +# of each HTML page. A value of NO enables the index and the value YES disables +# it. Since the tabs in the index contain the same information as the navigation +# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +DISABLE_INDEX = NO + +# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index +# structure should be generated to display hierarchical information. If the tag +# value is set to YES, a side panel will be generated containing a tree-like +# index structure (just like the one that is generated for HTML Help). For this +# to work a browser that supports JavaScript, DHTML, CSS and frames is required +# (i.e. any modern browser). Windows users are probably better off using the +# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can +# further fine-tune the look of the index. As an example, the default style +# sheet generated by doxygen has an example that shows how to put an image at +# the root of the tree instead of the PROJECT_NAME. Since the tree basically has +# the same information as the tab index, you could consider setting +# DISABLE_INDEX to YES when enabling this option. +# The default value is: NO. 
+# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_TREEVIEW = NO + +# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that +# doxygen will group on one line in the generated HTML documentation. +# +# Note that a value of 0 will completely suppress the enum values from appearing +# in the overview section. +# Minimum value: 0, maximum value: 20, default value: 4. +# This tag requires that the tag GENERATE_HTML is set to YES. + +ENUM_VALUES_PER_LINE = 4 + +# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used +# to set the initial width (in pixels) of the frame in which the tree is shown. +# Minimum value: 0, maximum value: 1500, default value: 250. +# This tag requires that the tag GENERATE_HTML is set to YES. + +TREEVIEW_WIDTH = 250 + +# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to +# external symbols imported via tag files in a separate window. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +EXT_LINKS_IN_WINDOW = NO + +# Use this tag to change the font size of LaTeX formulas included as images in +# the HTML documentation. When you change the font size after a successful +# doxygen run you need to manually remove any form_*.png images from the HTML +# output directory to force them to be regenerated. +# Minimum value: 8, maximum value: 50, default value: 10. +# This tag requires that the tag GENERATE_HTML is set to YES. + +FORMULA_FONTSIZE = 10 + +# Use the FORMULA_TRANSPARENT tag to determine whether or not the images +# generated for formulas are transparent PNGs. Transparent PNGs are not +# supported properly for IE 6.0, but are supported on all modern browsers. +# +# Note that when changing this option you need to delete any form_*.png files in +# the HTML output directory before the changes have effect. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. 
+ +FORMULA_TRANSPARENT = YES + +# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see +# http://www.mathjax.org) which uses client side Javascript for the rendering +# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX +# installed or if you want the formulas to look prettier in the HTML output. When +# enabled you may also need to install MathJax separately and configure the path +# to it using the MATHJAX_RELPATH option. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +USE_MATHJAX = NO + +# When MathJax is enabled you can set the default output format to be used for +# the MathJax output. See the MathJax site (see: +# http://docs.mathjax.org/en/latest/output.html) for more details. +# Possible values are: HTML-CSS (which is slower, but has the best +# compatibility), NativeMML (i.e. MathML) and SVG. +# The default value is: HTML-CSS. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_FORMAT = HTML-CSS + +# When MathJax is enabled you need to specify the location relative to the HTML +# output directory using the MATHJAX_RELPATH option. The destination directory +# should contain the MathJax.js script. For instance, if the mathjax directory +# is located at the same level as the HTML output directory, then +# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax +# Content Delivery Network so you can quickly see the result without installing +# MathJax. However, it is strongly recommended to install a local copy of +# MathJax from http://www.mathjax.org before deployment. +# The default value is: http://cdn.mathjax.org/mathjax/latest. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest + +# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax +# extension names that should be enabled during MathJax rendering. 
For example +# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_EXTENSIONS = + +# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces +# of code that will be used on startup of the MathJax code. See the MathJax site +# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an +# example see the documentation. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_CODEFILE = + +# When the SEARCHENGINE tag is enabled doxygen will generate a search box for +# the HTML output. The underlying search engine uses javascript and DHTML and +# should work on any modern browser. Note that when using HTML help +# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET) +# there is already a search function so this one should typically be disabled. +# For large projects the javascript based search engine can be slow, then +# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to +# search using the keyboard; to jump to the search box use <access key> + S +# (what the <access key> is depends on the OS and browser, but it is typically +# <CTRL>, <ALT>/