From 78fbcc090d4ae4b3f42a993977a6cb3e68551307 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Thu, 23 Jan 2020 20:06:20 +0200
Subject: [PATCH] Reordered src/core to have better division to host and device
 code (this is more likely to work when compiling with mpicxx). Disabled
 separate compilation of CUDA kernels as this complicates compilation and is a
 source of many cmake/cuda bugs. As a downside, GPU code takes longer to
 compile.

---
 src/core/CMakeLists.txt                       |   42 +-
 src/core/astaroth.cc                          |   33 +-
 src/core/device.cc                            |  331 ++--
 src/core/errchk.h                             |  113 --
 src/core/grid.cc                              |  194 ---
 src/core/kernels/CMakeLists.txt               |    3 +
 src/core/kernels/boundconds.cu                |   91 --
 src/core/kernels/boundconds.cuh               |   92 +-
 src/core/kernels/common.cuh                   |  127 --
 .../deprecated/boundconds_PLACEHOLDER.cuh     | 1363 -----------------
 .../kernels/deprecated/reduce_PLACEHOLDER.cuh |  338 ----
 .../kernels/deprecated/rk3_PLACEHOLDER.cuh    |  742 ---------
 src/core/kernels/integration.cu               |  293 ----
 src/core/kernels/integration.cuh              |  287 +++-
 src/core/kernels/kernels.cu                   |  177 +++
 src/core/kernels/kernels.h                    |   82 +
 src/core/kernels/packing.cu                   |  120 --
 src/core/kernels/packing.cuh                  |  124 +-
 src/core/kernels/reductions.cu                |  284 ----
 src/core/kernels/reductions.cuh               |  284 +++-
 src/core/math_utils.h                         |  187 ---
 src/core/node.cc                              |    6 +-
 22 files changed, 1008 insertions(+), 4305 deletions(-)
 delete mode 100644 src/core/errchk.h
 delete mode 100644 src/core/grid.cc
 create mode 100644 src/core/kernels/CMakeLists.txt
 delete mode 100644 src/core/kernels/boundconds.cu
 delete mode 100644 src/core/kernels/common.cuh
 delete mode 100644 src/core/kernels/deprecated/boundconds_PLACEHOLDER.cuh
 delete mode 100644 src/core/kernels/deprecated/reduce_PLACEHOLDER.cuh
 delete mode 100644 src/core/kernels/deprecated/rk3_PLACEHOLDER.cuh
 delete mode 100644 src/core/kernels/integration.cu
 create mode 100644 src/core/kernels/kernels.cu
 create mode 100644 src/core/kernels/kernels.h
 delete mode 100644 src/core/kernels/packing.cu
 delete mode 100644 src/core/kernels/reductions.cu
 delete mode 100644 src/core/math_utils.h

diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index 80ba471..391784d 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -1,38 +1,4 @@
-########################################
-##  CMakeLists.txt for Astaroth Core  ##
-########################################
-
-# Require C++11
-set(CMAKE_CXX_STANDARD 11)
-set(CMAKE_CXX_STANDARD_REQUIRED ON)
-set(CMAKE_CXX_FLAGS "-Wall -Wextra -Werror -Wdouble-promotion -Wfloat-conversion -Wshadow")
-set(CMAKE_CXX_FLAGS_DEBUG "-g")
-set(CMAKE_CXX_FLAGS_RELEASE "-O3")
-
-# Require CUDA
-find_package(CUDA REQUIRED)
-set(CMAKE_CUDA_FLAGS "-gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70")
-
-# Compile kernels
-add_library(astaroth_kernels STATIC kernels/boundconds.cu kernels/integration.cu kernels/reductions.cu kernels/packing.cu)
-target_include_directories(astaroth_kernels PRIVATE .)
-target_compile_features(astaroth_kernels PRIVATE cxx_std_11)
-set_target_properties(astaroth_kernels PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
-add_dependencies(astaroth_kernels dsl_headers)
-
-# Compile core
-add_library(astaroth_core STATIC astaroth.cc node.cc device.cc)
-target_include_directories(astaroth_core PRIVATE . ${CUDA_INCLUDE_DIRS})
-target_link_libraries(astaroth_core m astaroth_kernels ${CUDA_LIBRARIES})
-target_compile_definitions(astaroth_core PRIVATE AC_USE_CUDA_RUNTIME_API)
-set_target_properties(astaroth_core PROPERTIES POSITION_INDEPENDENT_CODE ON)
-add_dependencies(astaroth_kernels dsl_headers)
-
-if (MULTIGPU_ENABLED)
-    target_compile_definitions(astaroth_core PRIVATE AC_MULTIGPU_ENABLED=1)
-endif ()
-
-if (MPI_ENABLED)
-    target_link_libraries(astaroth_core astaroth_utils)
-    target_compile_definitions(astaroth_core PRIVATE AC_MPI_ENABLED=1)
-endif ()
+## Astaroth Core
+add_library(astaroth_core STATIC device.cc node.cc astaroth.cc)
+target_include_directories(astaroth_core PRIVATE ${CUDA_INCLUDE_DIRS})
+target_link_libraries(astaroth_core astaroth_utils astaroth_kernels cudart)
diff --git a/src/core/astaroth.cc b/src/core/astaroth.cc
index b152f5d..3821e3a 100644
--- a/src/core/astaroth.cc
+++ b/src/core/astaroth.cc
@@ -19,40 +19,11 @@
 #include "astaroth.h"
 
 #include "errchk.h"
-#include "math_utils.h" // int3 + int3
-
-#define AC_GEN_STR(X) #X
-const char* intparam_names[]    = {AC_FOR_BUILTIN_INT_PARAM_TYPES(AC_GEN_STR) //
-                                AC_FOR_USER_INT_PARAM_TYPES(AC_GEN_STR)};
-const char* int3param_names[]   = {AC_FOR_BUILTIN_INT3_PARAM_TYPES(AC_GEN_STR) //
-                                 AC_FOR_USER_INT3_PARAM_TYPES(AC_GEN_STR)};
-const char* realparam_names[]   = {AC_FOR_BUILTIN_REAL_PARAM_TYPES(AC_GEN_STR) //
-                                 AC_FOR_USER_REAL_PARAM_TYPES(AC_GEN_STR)};
-const char* real3param_names[]  = {AC_FOR_BUILTIN_REAL3_PARAM_TYPES(AC_GEN_STR) //
-                                  AC_FOR_USER_REAL3_PARAM_TYPES(AC_GEN_STR)};
-const char* scalararray_names[] = {AC_FOR_SCALARARRAY_HANDLES(AC_GEN_STR)};
-const char* vtxbuf_names[]      = {AC_FOR_VTXBUF_HANDLES(AC_GEN_STR)};
-#undef AC_GEN_STR
+#include "math_utils.h"
 
 static const int max_num_nodes   = 1;
 static Node nodes[max_num_nodes] = {0};
-
-static int num_nodes = 0;
-
-void
-acPrintMeshInfo(const AcMeshInfo config)
-{
-    for (int i = 0; i < NUM_INT_PARAMS; ++i)
-        printf("[%s]: %d\n", intparam_names[i], config.int_params[i]);
-    for (int i = 0; i < NUM_INT3_PARAMS; ++i)
-        printf("[%s]: (%d, %d, %d)\n", int3param_names[i], config.int3_params[i].x,
-               config.int3_params[i].y, config.int3_params[i].z);
-    for (int i = 0; i < NUM_REAL_PARAMS; ++i)
-        printf("[%s]: %g\n", realparam_names[i], double(config.real_params[i]));
-    for (int i = 0; i < NUM_REAL3_PARAMS; ++i)
-        printf("[%s]: (%g, %g, %g)\n", real3param_names[i], double(config.real3_params[i].x),
-               double(config.real3_params[i].y), double(config.real3_params[i].z));
-}
+static int num_nodes             = 0;
 
 AcResult
 acInit(const AcMeshInfo mesh_info)
diff --git a/src/core/device.cc b/src/core/device.cc
index 2aeca87..33b241f 100644
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -1,60 +1,106 @@
-/*
-    Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.
+#include "astaroth.h"
 
-    This file is part of Astaroth.
-
-    Astaroth is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    Astaroth is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
-*/
-
-/**
- * @file
- * \brief Brief info.
- *
- * Detailed info.
- *
- */
-#include "astaroth_device.h"
-
-#include <string.h> // memcpy
+#include <mpi.h>
+#include <string.h>
 
+#include "astaroth_utils.h"
 #include "errchk.h"
 #include "math_utils.h"
+#include "timer_hires.h"
 
-#include "kernels/common.cuh"
+#include "kernels/kernels.h"
 
 #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0]))
 
-struct device_s {
-    int id;
-    AcMeshInfo local_config;
+AcResult
+acDevicePrintInfo(const Device device)
+{
+    const int device_id = device->id;
 
-    // Concurrency
-    cudaStream_t streams[NUM_STREAMS];
+    cudaDeviceProp props;
+    cudaGetDeviceProperties(&props, device_id);
+    printf("--------------------------------------------------\n");
+    printf("Device Number: %d\n", device_id);
+    const size_t bus_id_max_len = 128;
+    char bus_id[bus_id_max_len];
+    cudaDeviceGetPCIBusId(bus_id, bus_id_max_len, device_id);
+    printf("  PCI bus ID: %s\n", bus_id);
+    printf("    Device name: %s\n", props.name);
+    printf("    Compute capability: %d.%d\n", props.major, props.minor);
 
+    // Compute
+    printf("  Compute\n");
+    printf("    Clock rate (GHz): %g\n", props.clockRate / 1e6); // KHz -> GHz
+    printf("    Stream processors: %d\n", props.multiProcessorCount);
+    printf("    SP to DP flops performance ratio: %d:1\n", props.singleToDoublePrecisionPerfRatio);
+    printf(
+        "    Compute mode: %d\n",
+        (int)props
+            .computeMode); // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g7eb25f5413a962faad0956d92bae10d0
     // Memory
-    VertexBufferArray vba;
-    AcReal* reduce_scratchpad;
-    AcReal* reduce_result;
-};
+    printf("  Global memory\n");
+    printf("    Memory Clock Rate (MHz): %d\n", props.memoryClockRate / (1000));
+    printf("    Memory Bus Width (bits): %d\n", props.memoryBusWidth);
+    printf("    Peak Memory Bandwidth (GiB/s): %f\n",
+           2 * (props.memoryClockRate * 1e3) * props.memoryBusWidth / (8. * 1024. * 1024. * 1024.));
+    printf("    ECC enabled: %d\n", props.ECCEnabled);
 
-#include "kernels/boundconds.cuh"
-#include "kernels/integration.cuh"
-#include "kernels/reductions.cuh"
+    // Memory usage
+    size_t free_bytes, total_bytes;
+    cudaMemGetInfo(&free_bytes, &total_bytes);
+    const size_t used_bytes = total_bytes - free_bytes;
+    printf("    Total global mem: %.2f GiB\n", props.totalGlobalMem / (1024.0 * 1024 * 1024));
+    printf("    Gmem used (GiB): %.2f\n", used_bytes / (1024.0 * 1024 * 1024));
+    printf("    Gmem memory free (GiB): %.2f\n", free_bytes / (1024.0 * 1024 * 1024));
+    printf("    Gmem memory total (GiB): %.2f\n", total_bytes / (1024.0 * 1024 * 1024));
+    printf("  Caches\n");
+    printf("    Local L1 cache supported: %d\n", props.localL1CacheSupported);
+    printf("    Global L1 cache supported: %d\n", props.globalL1CacheSupported);
+    printf("    L2 size: %d KiB\n", props.l2CacheSize / (1024));
+    // MV: props.totalConstMem and props.sharedMemPerBlock cause assembler error
+    // MV: while compiling in TIARA gp cluster. Therefore commeted out.
+    //!!    printf("    Total const mem: %ld KiB\n", props.totalConstMem / (1024));
+    //!!    printf("    Shared mem per block: %ld KiB\n", props.sharedMemPerBlock / (1024));
+    printf("  Other\n");
+    printf("    Warp size: %d\n", props.warpSize);
+    // printf("    Single to double perf. ratio: %dx\n",
+    // props.singleToDoublePrecisionPerfRatio); //Not supported with older CUDA
+    // versions
+    printf("    Stream priorities supported: %d\n", props.streamPrioritiesSupported);
+    printf("--------------------------------------------------\n");
 
-#if PACKED_DATA_TRANSFERS // TODO DEPRECATED, see AC_MPI_ENABLED instead
-// #include "kernels/pack_unpack.cuh"
-#endif
+    return AC_SUCCESS;
+}
+
+AcResult
+acDeviceAutoOptimize(const Device device)
+{
+    cudaSetDevice(device->id);
+    const int3 start = (int3){
+        device->local_config.int_params[AC_nx_min],
+        device->local_config.int_params[AC_ny_min],
+        device->local_config.int_params[AC_nz_min],
+    };
+    const int3 end = (int3){
+        device->local_config.int_params[AC_nx_max],
+        device->local_config.int_params[AC_ny_max],
+        device->local_config.int_params[AC_nz_max],
+    };
+    return acKernelAutoOptimizeIntegration(start, end, device->vba);
+}
+
+AcResult
+acDeviceSynchronizeStream(const Device device, const Stream stream)
+{
+    cudaSetDevice(device->id);
+    if (stream == STREAM_ALL) {
+        cudaDeviceSynchronize();
+    }
+    else {
+        cudaStreamSynchronize(device->streams[stream]);
+    }
+    return AC_SUCCESS;
+}
 
 AcResult
 acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device_handle)
@@ -147,96 +193,6 @@ acDeviceDestroy(Device device)
     return AC_SUCCESS;
 }
 
-AcResult
-acDevicePrintInfo(const Device device)
-{
-    const int device_id = device->id;
-
-    cudaDeviceProp props;
-    cudaGetDeviceProperties(&props, device_id);
-    printf("--------------------------------------------------\n");
-    printf("Device Number: %d\n", device_id);
-    const size_t bus_id_max_len = 128;
-    char bus_id[bus_id_max_len];
-    cudaDeviceGetPCIBusId(bus_id, bus_id_max_len, device_id);
-    printf("  PCI bus ID: %s\n", bus_id);
-    printf("    Device name: %s\n", props.name);
-    printf("    Compute capability: %d.%d\n", props.major, props.minor);
-
-    // Compute
-    printf("  Compute\n");
-    printf("    Clock rate (GHz): %g\n", props.clockRate / 1e6); // KHz -> GHz
-    printf("    Stream processors: %d\n", props.multiProcessorCount);
-    printf("    SP to DP flops performance ratio: %d:1\n", props.singleToDoublePrecisionPerfRatio);
-    printf(
-        "    Compute mode: %d\n",
-        (int)props
-            .computeMode); // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g7eb25f5413a962faad0956d92bae10d0
-    // Memory
-    printf("  Global memory\n");
-    printf("    Memory Clock Rate (MHz): %d\n", props.memoryClockRate / (1000));
-    printf("    Memory Bus Width (bits): %d\n", props.memoryBusWidth);
-    printf("    Peak Memory Bandwidth (GiB/s): %f\n",
-           2 * (props.memoryClockRate * 1e3) * props.memoryBusWidth / (8. * 1024. * 1024. * 1024.));
-    printf("    ECC enabled: %d\n", props.ECCEnabled);
-
-    // Memory usage
-    size_t free_bytes, total_bytes;
-    cudaMemGetInfo(&free_bytes, &total_bytes);
-    const size_t used_bytes = total_bytes - free_bytes;
-    printf("    Total global mem: %.2f GiB\n", props.totalGlobalMem / (1024.0 * 1024 * 1024));
-    printf("    Gmem used (GiB): %.2f\n", used_bytes / (1024.0 * 1024 * 1024));
-    printf("    Gmem memory free (GiB): %.2f\n", free_bytes / (1024.0 * 1024 * 1024));
-    printf("    Gmem memory total (GiB): %.2f\n", total_bytes / (1024.0 * 1024 * 1024));
-    printf("  Caches\n");
-    printf("    Local L1 cache supported: %d\n", props.localL1CacheSupported);
-    printf("    Global L1 cache supported: %d\n", props.globalL1CacheSupported);
-    printf("    L2 size: %d KiB\n", props.l2CacheSize / (1024));
-    // MV: props.totalConstMem and props.sharedMemPerBlock cause assembler error
-    // MV: while compiling in TIARA gp cluster. Therefore commeted out.
-    //!!    printf("    Total const mem: %ld KiB\n", props.totalConstMem / (1024));
-    //!!    printf("    Shared mem per block: %ld KiB\n", props.sharedMemPerBlock / (1024));
-    printf("  Other\n");
-    printf("    Warp size: %d\n", props.warpSize);
-    // printf("    Single to double perf. ratio: %dx\n",
-    // props.singleToDoublePrecisionPerfRatio); //Not supported with older CUDA
-    // versions
-    printf("    Stream priorities supported: %d\n", props.streamPrioritiesSupported);
-    printf("--------------------------------------------------\n");
-
-    return AC_SUCCESS;
-}
-
-AcResult
-acDeviceAutoOptimize(const Device device)
-{
-    cudaSetDevice(device->id);
-    const int3 start = (int3){
-        device->local_config.int_params[AC_nx_min],
-        device->local_config.int_params[AC_ny_min],
-        device->local_config.int_params[AC_nz_min],
-    };
-    const int3 end = (int3){
-        device->local_config.int_params[AC_nx_max],
-        device->local_config.int_params[AC_ny_max],
-        device->local_config.int_params[AC_nz_max],
-    };
-    return acKernelAutoOptimizeIntegration(start, end, device->vba);
-}
-
-AcResult
-acDeviceSynchronizeStream(const Device device, const Stream stream)
-{
-    cudaSetDevice(device->id);
-    if (stream == STREAM_ALL) {
-        cudaDeviceSynchronize();
-    }
-    else {
-        cudaStreamSynchronize(device->streams[stream]);
-    }
-    return AC_SUCCESS;
-}
-
 AcResult
 acDeviceSwapBuffers(const Device device)
 {
@@ -249,66 +205,6 @@ acDeviceSwapBuffers(const Device device)
     return AC_SUCCESS;
 }
 
-AcResult
-acDeviceLoadScalarUniform(const Device device, const Stream stream, const AcRealParam param,
-                          const AcReal value)
-{
-    cudaSetDevice(device->id);
-
-    if (param >= NUM_REAL_PARAMS)
-        return AC_FAILURE;
-
-    const size_t offset = (size_t)&d_mesh_info.real_params[param] - (size_t)&d_mesh_info;
-    ERRCHK_CUDA(cudaMemcpyToSymbolAsync(&d_mesh_info, &value, sizeof(value), offset,
-                                        cudaMemcpyHostToDevice, device->streams[stream]));
-    return AC_SUCCESS;
-}
-
-AcResult
-acDeviceLoadVectorUniform(const Device device, const Stream stream, const AcReal3Param param,
-                          const AcReal3 value)
-{
-    cudaSetDevice(device->id);
-
-    if (param >= NUM_REAL3_PARAMS || !NUM_REAL3_PARAMS)
-        return AC_FAILURE;
-
-    const size_t offset = (size_t)&d_mesh_info.real3_params[param] - (size_t)&d_mesh_info;
-    ERRCHK_CUDA(cudaMemcpyToSymbolAsync(&d_mesh_info, &value, sizeof(value), offset,
-                                        cudaMemcpyHostToDevice, device->streams[stream]));
-    return AC_SUCCESS;
-}
-
-AcResult
-acDeviceLoadIntUniform(const Device device, const Stream stream, const AcIntParam param,
-                       const int value)
-{
-    cudaSetDevice(device->id);
-
-    if (param >= NUM_INT_PARAMS)
-        return AC_FAILURE;
-
-    const size_t offset = (size_t)&d_mesh_info.int_params[param] - (size_t)&d_mesh_info;
-    ERRCHK_CUDA(cudaMemcpyToSymbolAsync(&d_mesh_info, &value, sizeof(value), offset,
-                                        cudaMemcpyHostToDevice, device->streams[stream]));
-    return AC_SUCCESS;
-}
-
-AcResult
-acDeviceLoadInt3Uniform(const Device device, const Stream stream, const AcInt3Param param,
-                        const int3 value)
-{
-    cudaSetDevice(device->id);
-
-    if (param >= NUM_INT3_PARAMS)
-        return AC_FAILURE;
-
-    const size_t offset = (size_t)&d_mesh_info.int3_params[param] - (size_t)&d_mesh_info;
-    ERRCHK_CUDA(cudaMemcpyToSymbolAsync(&d_mesh_info, &value, sizeof(value), offset,
-                                        cudaMemcpyHostToDevice, device->streams[stream]));
-    return AC_SUCCESS;
-}
-
 AcResult
 acDeviceLoadScalarArray(const Device device, const Stream stream, const ScalarArrayHandle handle,
                         const size_t start, const AcReal* data, const size_t num)
@@ -327,22 +223,6 @@ acDeviceLoadScalarArray(const Device device, const Stream stream, const ScalarAr
     return AC_SUCCESS;
 }
 
-AcResult
-acDeviceLoadMeshInfo(const Device device, const Stream stream, const AcMeshInfo device_config)
-{
-    cudaSetDevice(device->id);
-
-    ERRCHK_ALWAYS(device_config.int_params[AC_nx] == device->local_config.int_params[AC_nx]);
-    ERRCHK_ALWAYS(device_config.int_params[AC_ny] == device->local_config.int_params[AC_ny]);
-    ERRCHK_ALWAYS(device_config.int_params[AC_nz] == device->local_config.int_params[AC_nz]);
-    ERRCHK_ALWAYS(device_config.int_params[AC_multigpu_offset] ==
-                  device->local_config.int_params[AC_multigpu_offset]);
-
-    ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbolAsync(&d_mesh_info, &device_config, sizeof(device_config),
-                                               0, cudaMemcpyHostToDevice, device->streams[stream]));
-    return AC_SUCCESS;
-}
-
 AcResult
 acDeviceLoadVertexBufferWithOffset(const Device device, const Stream stream, const AcMesh host_mesh,
                                    const VertexBufferHandle vtxbuf_handle, const int3 src,
@@ -578,31 +458,6 @@ acDeviceReduceVec(const Device device, const Stream stream, const ReductionType
     return AC_SUCCESS;
 }
 
-#if PACKED_DATA_TRANSFERS // TODO DEPRECATED, see AC_MPI_ENABLED instead
-// Functions for calling packed data transfers
-#endif
-
-#if AC_MPI_ENABLED == 0
-AcResult
-acDeviceRunMPITest(void)
-{
-    WARNING("MPI was not enabled but acDeviceRunMPITest() was called");
-    return AC_FAILURE;
-}
-#else // MPI_ENABLED ///////////////////////////////////////////////////////////////////////////////
-#include <mpi.h>
-
-// Kernels
-#include "kernels/packing.cuh"
-
-// From Astaroth Utils
-#include "src/utils/config_loader.h"
-#include "src/utils/memory.h"
-#include "src/utils/timer_hires.h"
-#include "src/utils/verification.h"
-
-#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0]))
-
 static int
 mod(const int a, const int b)
 {
@@ -881,6 +736,8 @@ acDeviceGatherMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* dst)
     return AC_SUCCESS;
 }
 
+/*
+// Deprecated
 static AcResult
 acDeviceCommunicateBlocksMPI(const Device device,        //
                              const int3* a0s,            // Src idx inside comp. domain
@@ -977,6 +834,7 @@ acDeviceCommunicateBlocksMPI(const Device device,        //
 
     return AC_SUCCESS;
 }
+*/
 
 typedef struct {
     PackedData* srcs;
@@ -1558,7 +1416,7 @@ acDeviceRunMPITest(void)
     };
     submesh_info.int3_params[AC_multigpu_offset] = (int3){-1, -1, -1}; // TODO
     WARNING("AC_multigpu_offset not yet implemented");
-    acUpdateConfig(&submesh_info);
+    acUpdateBuiltinParams(&submesh_info);
     ERRCHK_ALWAYS(is_valid(submesh_info.real_params[AC_inv_dsx]));
     ERRCHK_ALWAYS(is_valid(submesh_info.real_params[AC_cs2_sound]));
 
@@ -1630,4 +1488,3 @@ acDeviceRunMPITest(void)
     //////////////////////////////////////////////////////////////
     return AC_SUCCESS;
 }
-#endif // MPI_ENABLED //////////////////////////////////////////////////////////////////////////////
diff --git a/src/core/errchk.h b/src/core/errchk.h
deleted file mode 100644
index 85106a9..0000000
--- a/src/core/errchk.h
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
-    Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.
-
-    This file is part of Astaroth.
-
-    Astaroth is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    Astaroth is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
-*/
-
-/**
- * @file
- * \brief Brief info.
- *
- * Detailed info.
- *
- */
-#pragma once
-#include <stdio.h>
-#include <stdlib.h>
-#include <time.h>
-
-// clang-format off
-/*
- * =============================================================================
- * General error checking
- * =============================================================================
- */
-#define ERROR(str) \
-{ \
-    time_t t; time(&t); \
-    fprintf(stderr, "%s", ctime(&t)); \
-    fprintf(stderr, "\tError in file %s line %d: %s\n", \
-                    __FILE__, __LINE__, str); \
-    fflush(stderr); \
-    exit(EXIT_FAILURE); \
-    abort(); \
-}
-
-#define WARNING(str) \
-{ \
-    time_t t; time(&t); \
-    fprintf(stderr, "%s", ctime(&t)); \
-    fprintf(stderr, "\tWarning in file %s line %d: %s\n", \
-                    __FILE__, __LINE__, str); \
-    fflush(stderr); \
-}
-
-// DO NOT REMOVE BRACKETS AROUND RETVAL. F.ex. if (!a < b) vs if (!(a < b)).
-#define ERRCHK(retval)  { if (!(retval)) ERROR(#retval " was false"); }
-#define WARNCHK(retval) { if (!(retval)) WARNING(#retval " was false"); }
-#define ERRCHK_ALWAYS(retval) { if (!(retval)) ERROR(#retval " was false"); }
-
-/*
- * =============================================================================
- * CUDA-specific error checking
- * =============================================================================
- */
-#if defined(AC_USE_CUDA_RUNTIME_API) || defined(__CUDACC__)
-static inline void
-cuda_assert(cudaError_t code, const char* file, int line, bool abort = true)
-{
-    if (code != cudaSuccess) {
-        time_t t; time(&t); \
-        fprintf(stderr, "%s", ctime(&t)); \
-        fprintf(stderr, "\tCUDA error in file %s line %d: %s\n", \
-                        file, line, cudaGetErrorString(code)); \
-        fflush(stderr); \
-
-        if (abort)
-            exit(code);
-    }
-}
-
-#ifdef NDEBUG
-    #undef ERRCHK
-    #undef WARNCHK
-    #define ERRCHK(params)
-    #define WARNCHK(params)
-    #define ERRCHK_CUDA(params) params
-    #define WARNCHK_CUDA(params) params
-    #define ERRCHK_CUDA_KERNEL() {}
-#else
-    #define ERRCHK_CUDA(params) { cuda_assert((params), __FILE__, __LINE__); }
-    #define WARNCHK_CUDA(params) { cuda_assert((params), __FILE__, __LINE__, false); }
-
-    #define ERRCHK_CUDA_KERNEL()                                               \
-    {                                                                          \
-        ERRCHK_CUDA(cudaPeekAtLastError());                                    \
-        ERRCHK_CUDA(cudaDeviceSynchronize());                                  \
-    }
-#endif
-
-#define ERRCHK_CUDA_ALWAYS(params) { cuda_assert((params), __FILE__, __LINE__); }
-
-#define ERRCHK_CUDA_KERNEL_ALWAYS()                                               \
-{                                                                          \
-    ERRCHK_CUDA_ALWAYS(cudaPeekAtLastError());                                    \
-    ERRCHK_CUDA_ALWAYS(cudaDeviceSynchronize());                                  \
-}
-
-#define WARNCHK_CUDA_ALWAYS(params) { cuda_assert((params), __FILE__, __LINE__, false); }
-// clang-format on
-#endif // AC_USE_CUDA_RUNTIME_API
diff --git a/src/core/grid.cc b/src/core/grid.cc
deleted file mode 100644
index 3e06aa9..0000000
--- a/src/core/grid.cc
+++ /dev/null
@@ -1,194 +0,0 @@
-/*
-    Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.
-
-    This file is part of Astaroth.
-
-    Astaroth is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    Astaroth is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
-*/
-#include "astaroth_grid.h"
-
-#include "astaroth_device.h"
-#include "errchk.h"
-
-/** */
-AcResult
-acGridInit(const AcMeshInfo info)
-{
-    int num_processes, pid;
-    MPI_Init(NULL, NULL);
-    MPI_Comm_size(MPI_COMM_WORLD, &num_processes);
-    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
-
-    char processor_name[MPI_MAX_PROCESSOR_NAME];
-    int name_len;
-    MPI_Get_processor_name(processor_name, &name_len);
-    printf("Processor %s. Process %d of %d.\n", processor_name, pid, num_processes);
-
-    AcMesh submesh;
-    return AC_FAILURE;
-}
-
-/** */
-AcResult
-acGridQuit(void)
-{
-    WARNING("Proper multinode not yet implemented");
-    return AC_FAILURE;
-}
-
-#if 0
-/** */
-AcResult
-acGridSynchronizeStream(const Stream stream)
-{
-    WARNING("Proper multinode not yet implemented");
-    return AC_FAILURE;
-}
-
-/** */
-AcResult
-acGridSwapBuffers(void)
-{
-    WARNING("Proper multinode not yet implemented");
-    return AC_FAILURE;
-}
-
-/** */
-AcResult
-acGridLoadConstant(const Stream stream, const AcRealParam param, const AcReal value)
-{
-    WARNING("Proper multinode not yet implemented");
-    return AC_FAILURE;
-}
-
-/** */
-AcResult
-acGridLoadVertexBufferWithOffset(const Stream stream, const AcMesh host_mesh,
-                                 const VertexBufferHandle vtxbuf_handle, const int3 src,
-                                 const int3 dst, const int num_vertices)
-{
-    WARNING("Proper multinode not yet implemented");
-    return AC_FAILURE;
-}
-
-/** */
-AcResult
-acGridLoadMeshWithOffset(const Stream stream, const AcMesh host_mesh, const int3 src,
-                         const int3 dst, const int num_vertices)
-{
-    WARNING("Proper multinode not yet implemented");
-    return AC_FAILURE;
-}
-
-/** */
-AcResult
-acGridLoadVertexBuffer(const Stream stream, const AcMesh host_mesh,
-                       const VertexBufferHandle vtxbuf_handle)
-{
-    WARNING("Proper multinode not yet implemented");
-    return AC_FAILURE;
-}
-
-#endif
-/** */
-AcResult
-acGridLoadMesh(const Stream stream, const AcMesh host_mesh)
-{
-    (void)stream;
-    (void)host_mesh;
-    WARNING("Not implemented");
-    return AC_FAILURE;
-}
-#if 0
-
-/** */
-AcResult
-acGridStoreVertexBufferWithOffset(const Stream stream, const VertexBufferHandle vtxbuf_handle,
-                                  const int3 src, const int3 dst, const int num_vertices,
-                                  AcMesh* host_mesh)
-{
-    WARNING("Not implemented");
-    return AC_FAILURE;
-}
-
-/** */
-AcResult
-acGridStoreMeshWithOffset(const Stream stream, const int3 src, const int3 dst,
-                          const int num_vertices, AcMesh* host_mesh)
-{
-    WARNING("Not implemented");
-    return AC_FAILURE;
-}
-
-/** */
-AcResult
-acGridStoreVertexBuffer(const Stream stream, const VertexBufferHandle vtxbuf_handle,
-                        AcMesh* host_mesh)
-{
-    WARNING("Not implemented");
-    return AC_FAILURE;
-}
-
-#endif
-/** */
-AcResult
-acGridStoreMesh(const Stream stream, AcMesh* host_mesh)
-{
-    (void)stream;
-    (void)host_mesh;
-    WARNING("Not implemented");
-    return AC_FAILURE;
-}
-#if 0
-
-/** */
-AcResult
-acGridIntegrateSubstep(const Stream stream, const int step_number, const int3 start, const int3 end,
-                       const AcReal dt)
-{
-    WARNING("Not implemented");
-    return AC_FAILURE;
-}
-
-/** */
-AcResult
-acGridIntegrateStep(const Stream stream, const AcReal dt)
-{
-    WARNING("Not implemented");
-    return AC_FAILURE;
-}
-
-/** */
-AcResult
-acGridPeriodicBoundcondStep(const Stream stream)
-{
-    WARNING("Not implemented");
-    return AC_FAILURE;
-}
-
-/** */
-AcResult
-acGridReduceScal(const Stream stream, const ReductionType rtype,
-                 const VertexBufferHandle vtxbuf_handle, AcReal* result)
-{
-    return AC_FAILURE;
-}
-/** */
-AcResult
-acGridReduceVec(const Stream stream, const ReductionType rtype, const VertexBufferHandle vec0,
-                const VertexBufferHandle vec1, const VertexBufferHandle vec2, AcReal* result)
-{
-    return AC_FAILURE;
-}
-#endif
diff --git a/src/core/kernels/CMakeLists.txt b/src/core/kernels/CMakeLists.txt
new file mode 100644
index 0000000..120acd2
--- /dev/null
+++ b/src/core/kernels/CMakeLists.txt
@@ -0,0 +1,3 @@
+## Astaroth Kernels
+add_library(astaroth_kernels STATIC kernels.cu)
+add_dependencies(astaroth_kernels dsl_headers)
diff --git a/src/core/kernels/boundconds.cu b/src/core/kernels/boundconds.cu
deleted file mode 100644
index 1bb8c91..0000000
--- a/src/core/kernels/boundconds.cu
+++ /dev/null
@@ -1,91 +0,0 @@
-/*
-    Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.
-
-    This file is part of Astaroth.
-
-    Astaroth is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    Astaroth is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
-*/
-
-/**
- * @file
- * \brief Brief info.
- *
- * Detailed info.
- *
- */
-#include "boundconds.cuh"
-#include "common.cuh"
-
-#include "src/core/errchk.h"
-
-__global__ void
-kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vtxbuf)
-{
-    const int i_dst = start.x + threadIdx.x + blockIdx.x * blockDim.x;
-    const int j_dst = start.y + threadIdx.y + blockIdx.y * blockDim.y;
-    const int k_dst = start.z + threadIdx.z + blockIdx.z * blockDim.z;
-
-    // If within the start-end range (this allows threadblock dims that are not
-    // divisible by end - start)
-    if (i_dst >= end.x || j_dst >= end.y || k_dst >= end.z)
-        return;
-
-    // if (i_dst >= DCONST(AC_mx) || j_dst >= DCONST(AC_my) || k_dst >= DCONST(AC_mz))
-    //    return;
-
-    // If destination index is inside the computational domain, return since
-    // the boundary conditions are only applied to the ghost zones
-    if (i_dst >= DCONST(AC_nx_min) && i_dst < DCONST(AC_nx_max) && j_dst >= DCONST(AC_ny_min) &&
-        j_dst < DCONST(AC_ny_max) && k_dst >= DCONST(AC_nz_min) && k_dst < DCONST(AC_nz_max))
-        return;
-
-    // Find the source index
-    // Map to nx, ny, nz coordinates
-    int i_src = i_dst - DCONST(AC_nx_min);
-    int j_src = j_dst - DCONST(AC_ny_min);
-    int k_src = k_dst - DCONST(AC_nz_min);
-
-    // Translate (s.t. the index is always positive)
-    i_src += DCONST(AC_nx);
-    j_src += DCONST(AC_ny);
-    k_src += DCONST(AC_nz);
-
-    // Wrap
-    i_src %= DCONST(AC_nx);
-    j_src %= DCONST(AC_ny);
-    k_src %= DCONST(AC_nz);
-
-    // Map to mx, my, mz coordinates
-    i_src += DCONST(AC_nx_min);
-    j_src += DCONST(AC_ny_min);
-    k_src += DCONST(AC_nz_min);
-
-    const int src_idx = DEVICE_VTXBUF_IDX(i_src, j_src, k_src);
-    const int dst_idx = DEVICE_VTXBUF_IDX(i_dst, j_dst, k_dst);
-    vtxbuf[dst_idx]   = vtxbuf[src_idx];
-}
-
-AcResult
-acKernelPeriodicBoundconds(const cudaStream_t stream, const int3 start, const int3 end,
-                           AcReal* vtxbuf)
-{
-    const dim3 tpb(8, 2, 8);
-    const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x),
-                   (unsigned int)ceil((end.y - start.y) / (float)tpb.y),
-                   (unsigned int)ceil((end.z - start.z) / (float)tpb.z));
-
-    kernel_periodic_boundconds<<<bpg, tpb, 0, stream>>>(start, end, vtxbuf);
-    ERRCHK_CUDA_KERNEL();
-    return AC_SUCCESS;
-}
diff --git a/src/core/kernels/boundconds.cuh b/src/core/kernels/boundconds.cuh
index f4f1670..6a31d58 100644
--- a/src/core/kernels/boundconds.cuh
+++ b/src/core/kernels/boundconds.cuh
@@ -1,40 +1,62 @@
-/*
-    Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.
-
-    This file is part of Astaroth.
-
-    Astaroth is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    Astaroth is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
-*/
-
-/**
- * @file
- * \brief Brief info.
- *
- * Detailed info.
- *
- */
 #pragma once
 
-#include "astaroth.h"
+static __global__ void
+kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vtxbuf)
+{
+    const int i_dst = start.x + threadIdx.x + blockIdx.x * blockDim.x;
+    const int j_dst = start.y + threadIdx.y + blockIdx.y * blockDim.y;
+    const int k_dst = start.z + threadIdx.z + blockIdx.z * blockDim.z;
 
-#ifdef __cplusplus
-extern "C" {
-#endif
+    // If within the start-end range (this allows threadblock dims that are not
+    // divisible by end - start)
+    if (i_dst >= end.x || j_dst >= end.y || k_dst >= end.z)
+        return;
 
-AcResult acKernelPeriodicBoundconds(const cudaStream_t stream, const int3 start, const int3 end,
-                                    AcReal* vtxbuf);
+    // if (i_dst >= DCONST(AC_mx) || j_dst >= DCONST(AC_my) || k_dst >= DCONST(AC_mz))
+    //    return;
 
-#ifdef __cplusplus
-} // extern "C"
-#endif
+    // If destination index is inside the computational domain, return since
+    // the boundary conditions are only applied to the ghost zones
+    if (i_dst >= DCONST(AC_nx_min) && i_dst < DCONST(AC_nx_max) && j_dst >= DCONST(AC_ny_min) &&
+        j_dst < DCONST(AC_ny_max) && k_dst >= DCONST(AC_nz_min) && k_dst < DCONST(AC_nz_max))
+        return;
+
+    // Find the source index
+    // Map to nx, ny, nz coordinates
+    int i_src = i_dst - DCONST(AC_nx_min);
+    int j_src = j_dst - DCONST(AC_ny_min);
+    int k_src = k_dst - DCONST(AC_nz_min);
+
+    // Translate (s.t. the index is always positive)
+    i_src += DCONST(AC_nx);
+    j_src += DCONST(AC_ny);
+    k_src += DCONST(AC_nz);
+
+    // Wrap
+    i_src %= DCONST(AC_nx);
+    j_src %= DCONST(AC_ny);
+    k_src %= DCONST(AC_nz);
+
+    // Map to mx, my, mz coordinates
+    i_src += DCONST(AC_nx_min);
+    j_src += DCONST(AC_ny_min);
+    k_src += DCONST(AC_nz_min);
+
+    const int src_idx = DEVICE_VTXBUF_IDX(i_src, j_src, k_src);
+    const int dst_idx = DEVICE_VTXBUF_IDX(i_dst, j_dst, k_dst);
+    vtxbuf[dst_idx]   = vtxbuf[src_idx];
+}
+
+AcResult
+acKernelPeriodicBoundconds(const cudaStream_t stream, const int3 start, const int3 end,
+                           AcReal* vtxbuf)
+{
+    const dim3 tpb(8, 2, 8);
+    const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x),
+                   (unsigned int)ceil((end.y - start.y) / (float)tpb.y),
+                   (unsigned int)ceil((end.z - start.z) / (float)tpb.z));
+
+    kernel_periodic_boundconds<<<bpg, tpb, 0, stream>>>(start, end, vtxbuf);
+    ERRCHK_CUDA_KERNEL();
+    return AC_SUCCESS;
+}
diff --git a/src/core/kernels/common.cuh b/src/core/kernels/common.cuh
deleted file mode 100644
index 11f8649..0000000
--- a/src/core/kernels/common.cuh
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
-    Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.
-
-    This file is part of Astaroth.
-
-    Astaroth is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    Astaroth is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
-*/
-
-/**
- * @file
- * \brief Brief info.
- *
- * Detailed info.
- *
- */
-#pragma once
-#include "astaroth.h"
-
-extern __constant__ AcMeshInfo d_mesh_info;
-
-typedef struct {
-    AcReal* in[NUM_VTXBUF_HANDLES];
-    AcReal* out[NUM_VTXBUF_HANDLES];
-
-    AcReal* profiles[NUM_SCALARARRAY_HANDLES];
-} VertexBufferArray;
-
-static int __device__ __forceinline__
-DCONST(const AcIntParam param)
-{
-    return d_mesh_info.int_params[param];
-}
-static int3 __device__ __forceinline__
-DCONST(const AcInt3Param param)
-{
-    return d_mesh_info.int3_params[param];
-}
-static AcReal __device__ __forceinline__
-DCONST(const AcRealParam param)
-{
-    return d_mesh_info.real_params[param];
-}
-static AcReal3 __device__ __forceinline__
-DCONST(const AcReal3Param param)
-{
-    return d_mesh_info.real3_params[param];
-}
-static __device__ constexpr VertexBufferHandle
-DCONST(const VertexBufferHandle handle)
-{
-    return handle;
-}
-#define DEVICE_VTXBUF_IDX(i, j, k) ((i) + (j)*DCONST(AC_mx) + (k)*DCONST(AC_mxy))
-#define DEVICE_1D_COMPDOMAIN_IDX(i, j, k) ((i) + (j)*DCONST(AC_nx) + (k)*DCONST(AC_nxy))
-#define globalGridN (d_mesh_info.int3_params[AC_global_grid_n])
-//#define globalMeshM // Placeholder
-//#define localMeshN // Placeholder
-//#define localMeshM // Placeholder
-//#define localMeshN_min // Placeholder
-//#define globalMeshN_min // Placeholder
-#define d_multigpu_offset (d_mesh_info.int3_params[AC_multigpu_offset])
-//#define d_multinode_offset (d_mesh_info.int3_params[AC_multinode_offset]) // Placeholder
-
-static __device__ constexpr int
-IDX(const int i)
-{
-    return i;
-}
-
-static __device__ __forceinline__ int
-IDX(const int i, const int j, const int k)
-{
-    return DEVICE_VTXBUF_IDX(i, j, k);
-}
-
-static __device__ __forceinline__ int
-IDX(const int3 idx)
-{
-    return DEVICE_VTXBUF_IDX(idx.x, idx.y, idx.z);
-}
-
-//#include <thrust/complex.h>
-// using namespace thrust;
-
-#include <cuComplex.h>
-#if AC_DOUBLE_PRECISION == 1
-typedef cuDoubleComplex acComplex;
-#define acComplex(x, y) make_cuDoubleComplex(x, y)
-#else
-typedef cuFloatComplex acComplex;
-#define acComplex(x, y) make_cuFloatComplex(x, y)
-#define exp(x) expf(x)
-#define sin(x) sinf(x)
-#define cos(x) cosf(x)
-#define sqrt(x) sqrtf(x)
-#endif
-static __device__ inline acComplex
-exp(const acComplex& val)
-{
-    return acComplex(exp(val.x) * cos(val.y), exp(val.x) * sin(val.y));
-}
-static __device__ inline acComplex operator*(const AcReal& a, const acComplex& b)
-{
-    return (acComplex){a * b.x, a * b.y};
-}
-
-static __device__ inline acComplex operator*(const acComplex& b, const AcReal& a)
-{
-    return (acComplex){a * b.x, a * b.y};
-}
-
-static __device__ inline acComplex operator*(const acComplex& a, const acComplex& b)
-{
-    return (acComplex){a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x};
-}
-//#include <complex>
diff --git a/src/core/kernels/deprecated/boundconds_PLACEHOLDER.cuh b/src/core/kernels/deprecated/boundconds_PLACEHOLDER.cuh
deleted file mode 100644
index 3a93db5..0000000
--- a/src/core/kernels/deprecated/boundconds_PLACEHOLDER.cuh
+++ /dev/null
@@ -1,1363 +0,0 @@
-/*
-    Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.
-
-    This file is part of Astaroth.
-
-    Astaroth is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    Astaroth is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
-*/
-
-/**
- * @file
- * \brief Brief info.
- *
- * Detailed info.
- *
- */
-#pragma once
-#include "device_globals.cuh"
-
-
-//TODO MV: MAKE A BETTER SWITCH
-#define B_VELTYPE 666
-//#define B_VELTYPE 1
-
-////////////////////////////////////
-// Define the destination indices //
-////////////////////////////////////
-
-
-// Get the standard coordinate indices. 
-__device__ void
-get_dst_index(const int3 start, int* i_dst, int* j_dst, int* k_dst, int* dst_idx)
-{
-     *i_dst = start.x + threadIdx.x + blockIdx.x * blockDim.x;
-     *j_dst = start.y + threadIdx.y + blockIdx.y * blockDim.y;
-     *k_dst = start.z + threadIdx.z + blockIdx.z * blockDim.z;
-     *dst_idx      = DEVICE_VTXBUF_IDX(*i_dst, *j_dst, *k_dst);
-
-     //printf("*i_dst = %i, *j_dst = %i, *k_dst = %i, *dst_idx = %i \n", *i_dst, *j_dst, *k_dst, *dst_idx);
-}
-
-
-__device__ void
-_sym_indexing_xz(int* edge_idx, int* src_idx, 
-                 const int i_dst, const int j_dst, const int k_dst)
-{
-
-    int i_edge, j_edge, k_edge;
-    int i_diff, k_diff;
-    int i_src,  k_src ;
-    int is_ztop = 0, is_zbot = 0;
-    int is_xtop = 0, is_xbot = 0;
-
-    if (i_dst  < DCONST(AC_nx_min)){
-        i_edge = DCONST(AC_nx_min);
-        is_xbot = 1;
-    } else if (i_dst >=  DCONST(AC_nx_max)){
-        i_edge = DCONST(AC_nx_max)-1;
-        is_xtop = 1;
-    } else {
-        i_edge = i_dst;
-    }
-
-    j_edge = j_dst;
-
-    if (k_dst < DCONST(AC_nz_min)) {
-        k_edge = DCONST(AC_nz_min);
-        is_zbot = 1;
-    } else if (k_dst >=  DCONST(AC_nz_max)) {
-        k_edge = DCONST(AC_nz_max)-1;
-        is_ztop = 1;
-    } else {
-        k_edge = k_dst;
-    }
-
-    *edge_idx = DEVICE_VTXBUF_IDX(i_edge, j_edge, k_edge);
-
-    //TODO: problematic on the corners!!! 
-    if (is_xtop == 1 || is_xbot == 1) {
-        i_diff = i_edge - i_dst;
-        i_src  = i_edge + i_diff;
-        //OK 
-        //printf("i_edge %i, j_edge %i, k_edge %i, i_dst %i, j_dst %i, k_dst %i \n", i_edge, j_edge, k_edge, i_dst, j_dst, k_dst);
-        *src_idx = DEVICE_VTXBUF_IDX(i_src, j_dst, k_dst);
-    } else if (is_ztop == 1 || is_zbot == 1) {
-        k_diff = k_edge - k_dst;
-        k_src  = k_edge + k_diff;
-        *src_idx = DEVICE_VTXBUF_IDX(i_dst, j_dst, k_src);
-    } else {
-        *src_idx = NULL;
-    }
-}
-
-
-//////////////////////////////////////
-// Choose surface value points only //
-//////////////////////////////////////
-
-// Choose negative x-boundary SURFACE values.  
-__device__ int
-choose_negxbound_point(const int i_dst, const int j_dst, const int k_dst)
-{
-    if (i_dst == DCONST(AC_nx_min))
-        return 0;
-
-    return 1;
-}
-
-// Choose positive x-boundary SURFACE values.  
-__device__ int
-choose_posxbound_point(const int i_dst, const int j_dst, const int k_dst)
-{
-    if (i_dst == (DCONST(AC_nx_max)-1))
-        return 0;
-
-    return 1;
-}
-
-// Choose negative z-boundary SURFACE values.  
-__device__ int
-choose_negzbound_point(const int i_dst, const int j_dst, const int k_dst)
-{
-    if (k_dst == DCONST(AC_nz_min))
-        return 0;
-
-    return 1;
-}
-
-// Choose positive z-boundary SURFACE values.  
-__device__ int
-choose_poszbound_point(const int i_dst, const int j_dst, const int k_dst)
-{
-    if (k_dst == (DCONST(AC_nz_max)-1))
-        return 0;
-
-    return 1;
-}
-
-////////////////////////////////////
-// Filtering out of bounds values //
-////////////////////////////////////
-
-// If within the start-end range (this allows threadblock dims that are not
-// divisible by end - start)
-__device__ int
-filter_outbound(const int3 end, const int i_dst, const int j_dst, const int k_dst)
-{
-    if (i_dst >= end.x || j_dst >= end.y || k_dst >= end.z)
-        return 1;
-
-    return 0; 
-}
-
-// If destination index is inside the computational domain, return since
-// the boundary conditions are only applied to the ghost zones
-__device__ int
-filter_inbound(const int i_dst, const int j_dst, const int k_dst)
-{
-    if (i_dst >= DCONST(AC_nx_min) && i_dst < DCONST(AC_nx_max) &&
-        j_dst >= DCONST(AC_ny_min) && j_dst < DCONST(AC_ny_max) &&
-        k_dst >= DCONST(AC_nz_min) && k_dst < DCONST(AC_nz_max))
-        return 1;
-
-    return 0;
-}
-
-// If destination index is within x boundary, we do not need it
-__device__ int
-filter_xbound(const int i_dst, const int j_dst, const int k_dst)
-{
-    if (i_dst < DCONST(AC_nx_min) || i_dst >= DCONST(AC_nx_max))
-        return 1;
-
-    return 0;
-}
-
-
-// Discard negative x-boundary values.  
-__device__ int
-filter_negxbound(const int i_dst, const int j_dst, const int k_dst)
-{
-    if (i_dst < DCONST(AC_nx_min))
-        return 1;
-
-    return 0;
-}
-
-// Discard positive x-boundary values.  
-__device__ int
-filter_posxbound(const int i_dst, const int j_dst, const int k_dst)
-{
-    if (i_dst >= DCONST(AC_nx_max))
-        return 1;
-
-    return 0;
-}
-
-// Discard y-boundary values.  
-__device__ int
-filter_ybound(const int i_dst, const int j_dst, const int k_dst)
-{
-    if ((j_dst <  DCONST(AC_ny_min) || j_dst >= DCONST(AC_ny_max)))
-        return 1;
-
-    return 0;
-}
-
-// Discard negative y-boundary values.  
-__device__ int
-filter_negybound(const int i_dst, const int j_dst, const int k_dst)
-{
-    if (j_dst < DCONST(AC_ny_min))
-        return 1;
-
-    return 0;
-}
-
-// Discard positive y-boundary values.  
-__device__ int
-filter_posybound(const int i_dst, const int j_dst, const int k_dst)
-{
-    if (j_dst >= DCONST(AC_ny_max))
-        return 1;
-
-    return 0;
-}
-
-// If destination index is within z boundary, we do not need it
-__device__ int
-filter_zbound(const int i_dst, const int j_dst, const int k_dst)
-{
-    if (k_dst < DCONST(AC_nz_min) || k_dst >= DCONST(AC_nz_max))
-        return 1;
-
-    return 0;
-}
-
-
-// Discard negative z-boundary values.  
-__device__ int
-filter_negzbound(const int i_dst, const int j_dst, const int k_dst)
-{
-    if (k_dst < DCONST(AC_nz_min))
-        return 1;
-
-    return 0;
-}
-
-// Discard positive x-boundary values.  
-__device__ int
-filter_poszbound(const int i_dst, const int j_dst, const int k_dst)
-{
-    if (k_dst >= DCONST(AC_nz_max))
-        return 1;
-
-    return 0;
-}
-
-// Discard all exept negative x
-__device__ int
-filter_allbut_negxbound(const int i_dst, const int j_dst, const int k_dst)
-{
-    if (i_dst >= DCONST(AC_nx_min))
-        return 1;
-
-    return 0;
-}
-
-// Discard all exept positive x
-__device__ int
-filter_allbut_posxbound(const int i_dst, const int j_dst, const int k_dst)
-{
-    if (i_dst < DCONST(AC_nx_max))
-        return 1;
-
-    return 0;
-}
-
-
-
-
-
-
-/////////////////////////////////////
-// Constant values at the boundary //
-/////////////////////////////////////
-
-
-__global__ void
-_set_density_xzin(const int3 start, const int3 end, AcReal* density_buffer)    
-{
-    int i_dst, j_dst, k_dst, dst_idx;
-    get_dst_index(start, &i_dst, &j_dst, &k_dst, &dst_idx);
-
-    //Skip threads which are not at valid boundaries.
-    if (filter_outbound(end, i_dst, j_dst, k_dst)) return;
-    if ( (choose_posxbound_point(i_dst, j_dst, k_dst)) &&
-         (choose_negzbound_point(i_dst, j_dst, k_dst)) && 
-         (choose_poszbound_point(i_dst, j_dst, k_dst)) ) return;
-
-    density_buffer[dst_idx] =  DCONST_REAL(AC_lnrho_edge);
-
-}
-
-__global__ void
-_set_density_xin(const int3 start, const int3 end, AcReal* density_buffer)    
-{
-    int i_dst, j_dst, k_dst, dst_idx;
-    get_dst_index(start, &i_dst, &j_dst, &k_dst, &dst_idx);
-    //const int i_dst = start.x + threadIdx.x + blockIdx.x * blockDim.x;
-    //const int j_dst = start.y + threadIdx.y + blockIdx.y * blockDim.y;
-    //const int k_dst = start.z + threadIdx.z + blockIdx.z * blockDim.z;
-    //int dst_idx      = DEVICE_VTXBUF_IDX(i_dst, j_dst, k_dst);
-
-    //Skip threads which are not at valid boundaries.
-    if (filter_outbound(end, i_dst, j_dst, k_dst)) return;
-    //if ( choose_posxbound_point(i_dst, j_dst, k_dst) ) return;
-    if ( filter_allbut_posxbound(i_dst, j_dst, k_dst) ) return;
-
-    //if (dst_idx >= DCONST(AC_mx)*DCONST(AC_my)*DCONST(AC_mz)) 
-    //printf(" %i %i %i %i XX %i %i  \n", i_dst, j_dst, k_dst, dst_idx,  
-    //                                    i_dst + j_dst*DCONST(AC_mx) + k_dst*DCONST(AC_mx)*DCONST(AC_my), 
-    //                                    DCONST(AC_mx)*DCONST(AC_my)*DCONST(AC_mz) );
-
-    //TODO: add a powerlaw gradient. 
-    density_buffer[dst_idx] = DCONST_REAL(AC_lnrho_edge);
-
-    //printf(" %i %i %i %i %e %i %i  \n", i_dst, j_dst, k_dst, dst_idx, density_buffer[dst_idx], 
-    //                                    i_dst + j_dst*DCONST(AC_mx) + k_dst*DCONST(AC_mx)*DCONST(AC_my), 
-    //                                    DCONST(AC_mx)*DCONST(AC_my)*DCONST(AC_mz) );
-
-}
-
-
-__global__ void
-_set_density_xout(const int3 start, const int3 end, AcReal* density_buffer)    
-{
-    int i_dst, j_dst, k_dst, dst_idx;
-    get_dst_index(start, &i_dst, &j_dst, &k_dst, &dst_idx);
-
-    if (filter_outbound(end, i_dst, j_dst, k_dst)) return;
-    if ( choose_negxbound_point(i_dst, j_dst, k_dst) ) return;
-
-    //printf(" PIIP ");
-    density_buffer[dst_idx] = DCONST_REAL(AC_lnrho_out); 
-    //printf(" %i %i %i %i %e %i %i  \n", i_dst, j_dst, k_dst, dst_idx, density_buffer[dst_idx], 
-    //                                    i_dst + j_dst*DCONST(AC_mx) + k_dst*DCONST(AC_mx)*DCONST(AC_my), 
-    //                                    DCONST(AC_mx)*DCONST(AC_my)*DCONST(AC_mz) );
-
-
-}
-
-__device__ void 
-_calc_csconst(const int dst_idx, AcReal* entropy_buffer, AcReal* density_buffer)
-{
-    //DUMMY!!! TODO: Test isothermal first. 
-
-    //AcReal TT_bound = 0.0; //AcReal(2.0) * DCONST_REAL(AC_cv_sound) * log(cs2bound/ DCONST_REAL(AC_cp_sound));
-
-    //entropy_buffer[dst_idx] = 0.0; //AcReal(0.5)*TT_bound
-                              //- (DCONST_REAL(AC_cp_sound) - DCONST_REAL(AC_cv_sound))
-                              //*(density_buffer[dst_idx] - DCONST_REAL(AC_lnrho0));
-}
-
-
-// This boundary condion sets the edge point in the system to a specific value.
-// In this case, the sound speed. At the outflow boundary
-// This way a constant value at the boundary is defined. The ghost zones are
-// instead for the purpose of defining the behaviour of the derivatives. 
-__global__ void
-_set_csconst_xbot(const int3 start, const int3 end, AcReal* entropy_buffer, AcReal* density_buffer) 
-{
-    //const int i_dst = start.x + threadIdx.x + blockIdx.x * blockDim.x;
-    //const int j_dst = start.y + threadIdx.y + blockIdx.y * blockDim.y;
-    //const int k_dst = start.z + threadIdx.z + blockIdx.z * blockDim.z;
-    //const int dst_idx      = DEVICE_VTXBUF_IDX(i_dst, j_dst, k_dst);
-    int i_dst, j_dst, k_dst, dst_idx;
-    get_dst_index(start, &i_dst, &j_dst, &k_dst, &dst_idx);
-
-    //Skip threads which are not at valid boundaries.
-    if (filter_outbound(end, i_dst, j_dst, k_dst)) return;
-    if (choose_negxbound_point(i_dst, j_dst, k_dst)) return;
-
-    _calc_csconst(dst_idx, entropy_buffer, density_buffer);
-
-}
-
-// This boundary condion sets the edge point in the system to a specific value.
-// In this case, the sound speed. At the inflow boundaries. 
-__global__ void
-_set_csconst_xzin(const int3 start, const int3 end, AcReal* entropy_buffer, AcReal* density_buffer) 
-{
-    
-    int i_dst, j_dst, k_dst, dst_idx;
-    get_dst_index(start, &i_dst, &j_dst, &k_dst, &dst_idx);
-
-    //Skip threads which are not at valid boundaries.
-    if (filter_outbound(end, i_dst, j_dst, k_dst)) return;
-    if ( (choose_posxbound_point(i_dst, j_dst, k_dst)) &&
-         (choose_negzbound_point(i_dst, j_dst, k_dst)) && 
-         (choose_poszbound_point(i_dst, j_dst, k_dst)) ) return;
-
-    _calc_csconst(dst_idx, entropy_buffer, density_buffer);
-
-}
-
-// This boundary condion sets the edge point in the system to a specific value.
-// In this case, the sound speed. At the inflow boundaries. 
-__global__ void
-_set_csconst_xin(const int3 start, const int3 end, AcReal* entropy_buffer, AcReal* density_buffer) 
-{
-    
-    int i_dst, j_dst, k_dst, dst_idx;
-    get_dst_index(start, &i_dst, &j_dst, &k_dst, &dst_idx);
-
-    //Skip threads which are not at valid boundaries.
-    if (filter_outbound(end, i_dst, j_dst, k_dst)) return;
-    if ( (choose_posxbound_point(i_dst, j_dst, k_dst)) ) return;
-
-    _calc_csconst(dst_idx, entropy_buffer, density_buffer);
-
-}
-
-
-// Calculate inflow velocity at a coordinate point
-__device__ void
-_calc_inflow_velocity(const int dst_idx, const AcReal xx, const AcReal zz, AcReal* uux_buffer, AcReal* uuy_buffer, AcReal* uuz_buffer)
-{
-
-    const AcReal delx = xx - DCONST_REAL(AC_star_pos_x);
-    const AcReal delz = zz - DCONST_REAL(AC_star_pos_z);
-
-    //TODO: Figure out isthis needed. Now a placeholder.
-    //tanhz = fabs(tanh(zz/DCONST_REAL(AC_trans)));
-    const AcReal tanhz = 1.0;
-                    
-    const AcReal RR     = sqrt(delx*delx + delz*delz);
-    const AcReal veltot = DCONST_REAL(AC_sq2GM_star)/sqrt(RR); //Free fall velocity
-    
-    //Normal velocity components
-    const AcReal uu_x = - veltot*(delx/RR);  
-    const AcReal uu_z = - veltot*(delz/RR);
-
-    //Take into account either the top or bottom direction if the inflow angle is transformed in some way.  
-    if (delz >= 0.0) {
-        uux_buffer[dst_idx] = ( uu_x*cos(DCONST_REAL(AC_angl_uu)) - uu_z*sin(DCONST_REAL(AC_angl_uu)) )*tanhz;
-        uuz_buffer[dst_idx] = ( uu_x*sin(DCONST_REAL(AC_angl_uu)) + uu_z*cos(DCONST_REAL(AC_angl_uu)) )*tanhz;
-    } else {        
-        uux_buffer[dst_idx] = ( uu_x*cos(DCONST_REAL(AC_angl_uu)) + uu_z*sin(DCONST_REAL(AC_angl_uu)) )*tanhz;
-        uuz_buffer[dst_idx] = (-uu_x*sin(DCONST_REAL(AC_angl_uu)) + uu_z*cos(DCONST_REAL(AC_angl_uu)) )*tanhz;
-    }
-    uuy_buffer[dst_idx] = AcReal(0.0) ;
-}
-
-
-//TODO: Make inflow xtrapolation based on the free fall profile 
-__device__ void
-_extrapolate_inflow_velocity_xonly(const int dst_idx, const int edge_idx, const AcReal xx, 
-                                   const AcReal xx_edge, const AcReal zz, AcReal* uux_buffer, 
-                                   AcReal* uuy_buffer, AcReal* uuz_buffer)
-{
-
-    const AcReal delx      = xx - DCONST_REAL(AC_star_pos_x);
-    const AcReal delx_edge = xx_edge - DCONST_REAL(AC_star_pos_x);
-                    
-
-    const AcReal RR              = sqrt(delx*delx);
-    const AcReal RR_edge         = sqrt(delx_edge*delx_edge);
-    const AcReal RR_rel          = sqrt(RR_edge/RR); // 1/sqrt(R) scaling
-
-    const AcReal veltot          = uux_buffer[edge_idx]*RR_rel;
-    const AcReal veltot_freefall = -DCONST_REAL(AC_sq2GM_star)/sqrt(RR); //Free fall velocity
-    
-    //Normal velocity components
-    //Set the roof to free fall velocity
-    AcReal uu_x;
-    if (veltot >= veltot_freefall) {
-        uu_x = veltot; 
-        //if (uux_buffer[edge_idx] < 0.0) printf("uux_buffer[%i] %e  veltot %e veltot_freefall %e RR_rel %e \n", edge_idx, uux_buffer[edge_idx], veltot, veltot_freefall, RR_rel);
-        //if (veltot < uux_buffer[edge_idx]) printf("uux_buffer[%i] %e  veltot %e veltot_freefall %e RR_rel %e \n", edge_idx, uux_buffer[edge_idx], veltot, veltot_freefall, RR_rel);
-        //if (veltot > uux_buffer[edge_idx]) printf("uux_buffer[%i] RR %e RR_edge %e RR_rel %e \n", edge_idx, RR, RR_edge, RR_rel);
-    } else { 
-        uu_x = veltot_freefall; 
-        //printf("%i %e %e \n", dst_idx, veltot, veltot_freefall);
-    }
-
-    //Take into account either the top or bottom direction if the inflow angle is transformed in some way.  
-    uux_buffer[dst_idx]    = uu_x;
-    uuz_buffer[dst_idx]    = AcReal(0.0);
-    uuy_buffer[dst_idx]    = AcReal(0.0);
-
-}
-
-
-__device__ void
-_calc_inflow_velocity_xonly(const int dst_idx, const AcReal xx, const AcReal zz, AcReal* uux_buffer, AcReal* uuy_buffer, AcReal* uuz_buffer)
-{
-
-    const AcReal delx = xx - DCONST_REAL(AC_star_pos_x);
-                    
-    const AcReal RR     = sqrt(delx*delx);
-    const AcReal veltot = DCONST_REAL(AC_sq2GM_star)/sqrt(RR); //Free fall velocity
-    
-    //Normal velocity components
-    const AcReal uu_x = - veltot;  
-
-    //Take into account either the top or bottom direction if the inflow angle is transformed in some way.  
-    uux_buffer[dst_idx] = uu_x;
-    uuz_buffer[dst_idx] = AcReal(0.0);
-    uuy_buffer[dst_idx] = AcReal(0.0) ;
-}
-
-
-// Set inflow velocity based on free-fall at both vertical and horizontal boundaries. 
-__global__ void
-_set_uinflow_xzin(const int3 start, const int3 end, AcReal* uux_buffer, AcReal* uuy_buffer, AcReal* uuz_buffer)
-{
-    
-    int i_dst, j_dst, k_dst, dst_idx;
-    get_dst_index(start, &i_dst, &j_dst, &k_dst, &dst_idx);
-
-    const AcReal xx = DCONST_REAL(AC_dsx) * AcReal(i_dst) - DCONST_REAL(AC_xorig);
-    const AcReal zz = DCONST_REAL(AC_dsz) * AcReal(k_dst) - DCONST_REAL(AC_zorig);
-
-    //Skip threads which are not at valid boundaries.
-    if (filter_outbound(end, i_dst, j_dst, k_dst)) return;
-    if ( (choose_posxbound_point(i_dst, j_dst, k_dst)) &&
-         (choose_negzbound_point(i_dst, j_dst, k_dst)) && 
-         (choose_poszbound_point(i_dst, j_dst, k_dst)) ) return;
-
-    _calc_inflow_velocity(dst_idx, xx, zz, uux_buffer, uuy_buffer, uuz_buffer);
-
-} 
-
-// Set inflow velocity based on free-fall only at horizontal boundary. 
-__global__ void
-_set_uinflow_xin(const int3 start, const int3 end, AcReal* uux_buffer, AcReal* uuy_buffer, AcReal* uuz_buffer, AcReal* lnrho_buffer)
-{
-    
-    int i_dst, j_dst, k_dst, dst_idx;
-    get_dst_index(start, &i_dst, &j_dst, &k_dst, &dst_idx);
-
-    //Skip threads which are not at valid boundaries.
-    if (filter_outbound(end, i_dst, j_dst, k_dst)) return;
-    //Fix the whole boundary
-    //if ( (choose_posxbound_point(i_dst, j_dst, k_dst)) ) return;
-    
-    if (filter_allbut_posxbound(i_dst, j_dst, k_dst)) return;
-
-    int i_edge, edge_idx, src_idx;
-    _sym_indexing_xz(&edge_idx, &src_idx, i_dst, j_dst, k_dst);
-    i_edge = DCONST(AC_nx_max)-1;
-
-    const AcReal xx      = DCONST_REAL(AC_dsx) * AcReal(i_dst)  - DCONST_REAL(AC_xorig);
-    const AcReal zz      = DCONST_REAL(AC_dsz) * AcReal(k_dst)  - DCONST_REAL(AC_zorig);
-    const AcReal xx_edge = DCONST_REAL(AC_dsx) * AcReal(i_edge) - DCONST_REAL(AC_xorig);
-
-    //Free fall
-    ////_calc_inflow_velocity_xonly(dst_idx, xx, zz, uux_buffer, uuy_buffer, uuz_buffer);
-
-    //Basic momentum from boundcond
-    const AcReal delx = xx - DCONST_REAL(AC_star_pos_x);
-    const AcReal RR     = sqrt(delx*delx);
-    const AcReal vel_freefall = -DCONST_REAL(AC_sq2GM_star)/sqrt(RR); //Free fall velocity
-
-    //Extrapolation scheme but only if inflow
-    if (uux_buffer[edge_idx] <= AcReal(0.0)) { 
-        _extrapolate_inflow_velocity_xonly(dst_idx, edge_idx, xx, xx_edge, zz, uux_buffer, uuy_buffer, uuz_buffer);
-
-        //uux_buffer[dst_idx] = AcReal(-0.001);
-        //uuz_buffer[dst_idx] = AcReal(0.0);
-        //uuy_buffer[dst_idx] = AcReal(0.0);
-    } else {
-        uux_buffer[dst_idx] = AcReal(0.0);
-        uuz_buffer[dst_idx] = AcReal(0.0);
-        uuy_buffer[dst_idx] = AcReal(0.0);
-
-        uux_buffer[edge_idx] = AcReal(0.0);
-    }
-          
-    //Simple linear interpolation for density scaling.
-    lnrho_buffer[dst_idx] = (DCONST_REAL(AC_lnrho_edge) - DCONST_REAL(AC_ampl_lnrho))*(uux_buffer[dst_idx]/vel_freefall) + DCONST_REAL(AC_ampl_lnrho);    
-   
-}
-
-
-///////////////////////////////////////
-// Antisymmetric boundary conditions //
-///////////////////////////////////////
-
-
-
-//Set constant pressure at the inflow boundary it manage the "ramming effect" causing wiggles.
-// WARNING: NOT TESTED
-__global__ void
-_inflow_set_der_xtop(const int3 start, const int3 end, const AcReal der_value, AcReal* vertex_buffer)
-{
-    int i_dst, j_dst, k_dst, dst_idx;
-    get_dst_index(start, &i_dst, &j_dst, &k_dst, &dst_idx);
-
-    //Skip threads which are not at valid boundaries.
-    if (filter_outbound(end, i_dst, j_dst, k_dst)) return;
-    if (filter_inbound(i_dst, j_dst, k_dst)) return;
-    if (filter_ybound(i_dst, j_dst, k_dst)) return;
-    if (filter_zbound(i_dst, j_dst, k_dst)) return;
-    if (filter_negxbound(i_dst, j_dst, k_dst)) return; 
-
-    //Get the correct, symmetric indices for the boundary compurtation
-    int edge_idx, src_idx;
-    _sym_indexing_xz(&edge_idx, &src_idx, i_dst, j_dst, k_dst);
-
-    int i_edge = DCONST(AC_nx_max)-1;
-    int i_diff = i_edge - i_dst;
-
-    const AcReal xx = DCONST_REAL(AC_dsx) * AcReal(i_edge + i_diff) - DCONST_REAL(AC_xorig);
-    const AcReal xx_edge = DCONST_REAL(AC_dsx) * AcReal(i_edge) - DCONST_REAL(AC_xorig);
-
-    vertex_buffer[dst_idx] =  vertex_buffer[src_idx] + (AcReal(2.0)*(xx_edge-xx))*der_value;
-
-}
-
-
-__device__ void
-_rel_antisym_general(int i_dst, int j_dst, int k_dst, int dst_idx, AcReal* vertex_buffer)
-{
-    //Get the correct, symmetric indices for the boundary compurtation
-    int edge_idx, src_idx;
-    _sym_indexing_xz(&edge_idx, &src_idx, i_dst, j_dst, k_dst);
-
-    AcReal sgn = -1.0; //Relative anti-symmetric
-    vertex_buffer[dst_idx] = sgn*vertex_buffer[src_idx] + AcReal(2.0)*vertex_buffer[edge_idx];       
-}
-
-__device__ void
-_sym_antisym_general(int i_dst, int j_dst, int k_dst, int dst_idx, AcReal* vertex_buffer, AcReal sgn)
-{
-    //Get the correct, symmetric indices for the boundary compurtation
-    int edge_idx, src_idx;
-    _sym_indexing_xz(&edge_idx, &src_idx, i_dst, j_dst, k_dst);
-
-    vertex_buffer[dst_idx] = sgn*vertex_buffer[src_idx];       
-}
-
-__global__ void
-_rel_antisym_xbot(const int3 start, const int3 end, AcReal* vertex_buffer)
-{
-    int i_dst, j_dst, k_dst, dst_idx;
-    get_dst_index(start, &i_dst, &j_dst, &k_dst, &dst_idx);
-
-    //Skip threads which are not at valid boundaries.
-    if (filter_outbound(end, i_dst, j_dst, k_dst)) return;
-    if (filter_inbound(i_dst, j_dst, k_dst)) return;
-    if (filter_ybound(i_dst, j_dst, k_dst)) return;
-    if (filter_zbound(i_dst, j_dst, k_dst)) return;
-    if (filter_posxbound(i_dst, j_dst, k_dst)) return; 
-
-    _rel_antisym_general(i_dst, j_dst, k_dst, dst_idx, vertex_buffer);
-
-    //printf(" %i %i %i %i %e %i %i  \n", i_dst, j_dst, k_dst, dst_idx, vertex_buffer[dst_idx], 
-    //                                    i_dst + j_dst*DCONST(AC_mx) + k_dst*DCONST(AC_mx)*DCONST(AC_my), 
-    //                                    DCONST(AC_mx)*DCONST(AC_my)*DCONST(AC_mz) );
-
-}
-
-__global__ void
-_rel_antisym_xtop(const int3 start, const int3 end, AcReal* vertex_buffer)
-{
-    int i_dst, j_dst, k_dst, dst_idx;
-    get_dst_index(start, &i_dst, &j_dst, &k_dst, &dst_idx);
-
-    //Skip threads which are not at valid boundaries.
-    if (filter_outbound(end, i_dst, j_dst, k_dst)) return;
-    if (filter_inbound(i_dst, j_dst, k_dst)) return;
-    if (filter_ybound(i_dst, j_dst, k_dst)) return;
-    if (filter_zbound(i_dst, j_dst, k_dst)) return;
-    if (filter_negxbound(i_dst, j_dst, k_dst)) return; 
-
-    _rel_antisym_general(i_dst, j_dst, k_dst, dst_idx, vertex_buffer);
-
-}
-
-__global__ void
-_rel_antisym_zbot(const int3 start, const int3 end, AcReal* vertex_buffer)
-{
-    int i_dst, j_dst, k_dst, dst_idx;
-    get_dst_index(start, &i_dst, &j_dst, &k_dst, &dst_idx);
-
-    //Skip threads which are not at valid boundaries.
-    if (filter_outbound(end, i_dst, j_dst, k_dst)) return;
-    if (filter_inbound(i_dst, j_dst, k_dst)) return;
-    if (filter_ybound(i_dst, j_dst, k_dst)) return;
-    if (filter_xbound(i_dst, j_dst, k_dst)) return;
-    if (filter_poszbound(i_dst, j_dst, k_dst)) return; 
-    
-    _rel_antisym_general(i_dst, j_dst, k_dst, dst_idx, vertex_buffer);
-
-}
-
-__global__ void
-_rel_antisym_ztop(const int3 start, const int3 end, AcReal* vertex_buffer)
-{
-    int i_dst, j_dst, k_dst, dst_idx;
-    get_dst_index(start, &i_dst, &j_dst, &k_dst, &dst_idx);
-
-    //Skip threads which are not at valid boundaries.
-    if (filter_outbound(end, i_dst, j_dst, k_dst)) return;
-    if (filter_inbound(i_dst, j_dst, k_dst)) return;
-    if (filter_ybound(i_dst, j_dst, k_dst)) return;
-    if (filter_xbound(i_dst, j_dst, k_dst)) return;
-    if (filter_negzbound(i_dst, j_dst, k_dst)) return; 
-    
-    _rel_antisym_general(i_dst, j_dst, k_dst, dst_idx, vertex_buffer);
-
-}  
-
-__global__ void
-_antisym_xbot(const int3 start, const int3 end, AcReal* vertex_buffer)
-{
-    int i_dst, j_dst, k_dst, dst_idx;
-    get_dst_index(start, &i_dst, &j_dst, &k_dst, &dst_idx);
-
-    //Skip threads which are not at valid boundaries.
-    if (filter_outbound(end, i_dst, j_dst, k_dst)) return;
-    if (filter_inbound(i_dst, j_dst, k_dst)) return;
-    if (filter_ybound(i_dst, j_dst, k_dst)) return;
-    if (filter_zbound(i_dst, j_dst, k_dst)) return;
-    if (filter_posxbound(i_dst, j_dst, k_dst)) return; 
-
-    _sym_antisym_general(i_dst, j_dst, k_dst, dst_idx, vertex_buffer, AcReal(-1.0));
-
-}
-
-__global__ void
-_antisym_xtop(const int3 start, const int3 end, AcReal* vertex_buffer)
-{
-    int i_dst, j_dst, k_dst, dst_idx;
-    get_dst_index(start, &i_dst, &j_dst, &k_dst, &dst_idx);
-
-    //Skip threads which are not at valid boundaries.
-    if (filter_outbound(end, i_dst, j_dst, k_dst)) return;
-    if (filter_inbound(i_dst, j_dst, k_dst)) return;
-    if (filter_ybound(i_dst, j_dst, k_dst)) return;
-    if (filter_zbound(i_dst, j_dst, k_dst)) return;
-    if (filter_negxbound(i_dst, j_dst, k_dst)) return; 
-
-    _sym_antisym_general(i_dst, j_dst, k_dst, dst_idx, vertex_buffer, AcReal(-1.0));
-
-}
-
-__global__ void
-_antisym_zbot(const int3 start, const int3 end, AcReal* vertex_buffer)
-{
-    int i_dst, j_dst, k_dst, dst_idx;
-    get_dst_index(start, &i_dst, &j_dst, &k_dst, &dst_idx);
-
-    //Skip threads which are not at valid boundaries.
-    if (filter_outbound(end, i_dst, j_dst, k_dst)) return;
-    if (filter_inbound(i_dst, j_dst, k_dst)) return;
-    if (filter_ybound(i_dst, j_dst, k_dst)) return;
-    if (filter_xbound(i_dst, j_dst, k_dst)) return;
-    if (filter_poszbound(i_dst, j_dst, k_dst)) return; 
-    
-    _sym_antisym_general(i_dst, j_dst, k_dst, dst_idx, vertex_buffer, AcReal(-1.0));
-
-}
-
-__global__ void
-_antisym_ztop(const int3 start, const int3 end, AcReal* vertex_buffer)
-{
-    int i_dst, j_dst, k_dst, dst_idx;
-    get_dst_index(start, &i_dst, &j_dst, &k_dst, &dst_idx);
-
-    //Skip threads which are not at valid boundaries.
-    if (filter_outbound(end, i_dst, j_dst, k_dst)) return;
-    if (filter_inbound(i_dst, j_dst, k_dst)) return;
-    if (filter_ybound(i_dst, j_dst, k_dst)) return;
-    if (filter_xbound(i_dst, j_dst, k_dst)) return;
-    if (filter_negzbound(i_dst, j_dst, k_dst)) return; 
-    
-    _sym_antisym_general(i_dst, j_dst, k_dst, dst_idx, vertex_buffer, AcReal(-1.0));
-
-}  
-
- 
-__global__ void
-_sym_xbot(const int3 start, const int3 end, AcReal* vertex_buffer)
-{
-    int i_dst, j_dst, k_dst, dst_idx;
-    get_dst_index(start, &i_dst, &j_dst, &k_dst, &dst_idx);
-
-    //Skip threads which are not at valid boundaries.
-    if (filter_outbound(end, i_dst, j_dst, k_dst)) return;
-    if (filter_inbound(i_dst, j_dst, k_dst)) return;
-    if (filter_ybound(i_dst, j_dst, k_dst)) return;
-    if (filter_zbound(i_dst, j_dst, k_dst)) return;
-    if (filter_posxbound(i_dst, j_dst, k_dst)) return; 
-
-    _sym_antisym_general(i_dst, j_dst, k_dst, dst_idx, vertex_buffer, AcReal(1.0));
-
-}
-
-__global__ void
-_sym_xtop(const int3 start, const int3 end, AcReal* vertex_buffer)
-{
-    int i_dst, j_dst, k_dst, dst_idx;
-    get_dst_index(start, &i_dst, &j_dst, &k_dst, &dst_idx);
-
-    //Skip threads which are not at valid boundaries.
-    if (filter_outbound(end, i_dst, j_dst, k_dst)) return;
-    if (filter_inbound(i_dst, j_dst, k_dst)) return;
-    if (filter_ybound(i_dst, j_dst, k_dst)) return;
-    if (filter_zbound(i_dst, j_dst, k_dst)) return;
-    if (filter_negxbound(i_dst, j_dst, k_dst)) return; 
-
-    _sym_antisym_general(i_dst, j_dst, k_dst, dst_idx, vertex_buffer, AcReal(1.0));
-
-}
-
-__global__ void
-_sym_zbot(const int3 start, const int3 end, AcReal* vertex_buffer)
-{
-    int i_dst, j_dst, k_dst, dst_idx;
-    get_dst_index(start, &i_dst, &j_dst, &k_dst, &dst_idx);
-
-    //Skip threads which are not at valid boundaries.
-    if (filter_outbound(end, i_dst, j_dst, k_dst)) return;
-    if (filter_inbound(i_dst, j_dst, k_dst)) return;
-    if (filter_ybound(i_dst, j_dst, k_dst)) return;
-    if (filter_xbound(i_dst, j_dst, k_dst)) return;
-    if (filter_poszbound(i_dst, j_dst, k_dst)) return; 
-    
-    _sym_antisym_general(i_dst, j_dst, k_dst, dst_idx, vertex_buffer, AcReal(1.0));
-
-}
-
-__global__ void
-_sym_ztop(const int3 start, const int3 end, AcReal* vertex_buffer)
-{
-    int i_dst, j_dst, k_dst, dst_idx;
-    get_dst_index(start, &i_dst, &j_dst, &k_dst, &dst_idx);
-
-    //Skip threads which are not at valid boundaries.
-    if (filter_outbound(end, i_dst, j_dst, k_dst)) return;
-    if (filter_inbound(i_dst, j_dst, k_dst)) return;
-    if (filter_ybound(i_dst, j_dst, k_dst)) return;
-    if (filter_xbound(i_dst, j_dst, k_dst)) return;
-    if (filter_negzbound(i_dst, j_dst, k_dst)) return; 
-    
-    _sym_antisym_general(i_dst, j_dst, k_dst, dst_idx, vertex_buffer, AcReal(1.0));
-
-}  
-
-//Outflow boundary conditions: 
-//Symmetric conditon when velocity vector points outside the box.
-//Antisymmetric condition when velocity points inside.
-__global__ void
-_uu_outflow_xbot(const int3 start, const int3 end, AcReal* uux_buffer, AcReal* uuy_buffer, AcReal* uuz_buffer)     
-{
-    int i_dst, j_dst, k_dst, dst_idx;
-    get_dst_index(start, &i_dst, &j_dst, &k_dst, &dst_idx);
-
-    //Skip threads which are not at valid boundaries.
-    if (filter_outbound(end, i_dst, j_dst, k_dst)) return;
-    if (filter_allbut_negxbound(i_dst, j_dst, k_dst)) return;
-    
-    //Get the correct, symmetric indices for the boundary compurtation
-    int edge_idx, src_idx;
-    _sym_indexing_xz(&edge_idx, &src_idx, i_dst, j_dst, k_dst);
-
-    AcReal sgnx;
-    if (uux_buffer[edge_idx] <= 0.0) {  
-        sgnx = 1.0; //Symmetric
-    } else {
-        sgnx = -1.0; //Antisymmetric
-        //TODO: This might have synchronization problem.
-        uux_buffer[edge_idx] = 0.0;  //Antisymmetric condition requires value to vanish at the boundary. TODO: This might  
-    }
-    AcReal sgny = 1.0, sgnz = 1.0; //Symmetric
-
-    uux_buffer[dst_idx] = sgnx*uux_buffer[src_idx];       
-    uuy_buffer[dst_idx] = sgny*uuy_buffer[src_idx];       
-    uuz_buffer[dst_idx] = sgnz*uuz_buffer[src_idx];       
-
-
-} 
-
-//Inflow boundary conditions Pencil Code style: 
-//Symmetric conditon when velocity vector points inside the box.
-//Antisymmetric condition when velocity points outside.
-__global__ void
-_uu_inflow_simple_xbot(const int3 start, const int3 end, AcReal* uux_buffer, AcReal* uuy_buffer, AcReal* uuz_buffer)     
-{
-    int i_dst, j_dst, k_dst, dst_idx;
-    get_dst_index(start, &i_dst, &j_dst, &k_dst, &dst_idx);
-
-    //Skip threads which are not at valid boundaries.
-    if (filter_outbound(end, i_dst, j_dst, k_dst)) return;
-    if (filter_allbut_posxbound(i_dst, j_dst, k_dst)) return;
-
-    //Get the correct, symmetric indices for the boundary compurtation
-    int edge_idx, src_idx;
-    _sym_indexing_xz(&edge_idx, &src_idx, i_dst, j_dst, k_dst);
-
-    AcReal sgnx;
-    if (uux_buffer[edge_idx] <= 0.0) {  
-        sgnx = 1.0; //Symmetric
-    } else {
-        sgnx = -1.0; //Antisymmetric
-        //TODO: This might have synchronization problem.
-        uux_buffer[edge_idx] = 0.0;  //Antisymmetric condition requires value to vanish at the boundary. 
-    }
-    AcReal sgny = 1.0, sgnz = 1.0; //Symmetric
-
-    uux_buffer[dst_idx] = sgnx*uux_buffer[src_idx];       
-    uuy_buffer[dst_idx] = sgny*uuy_buffer[src_idx];       
-    uuz_buffer[dst_idx] = sgnz*uuz_buffer[src_idx];       
-
-
-} 
-
- 
-
-//////////////////////////////////
-// Periodic boundary conditions //
-//////////////////////////////////
-
-// This boundary condition is only peridic in y-direction and will require
-// other boundaries to determine x and z direction. Build for the vedge model
-// in mind. Use with discretion. 
-__global__ void
-_y_periodic_boundconds(const int3 start, const int3 end, AcReal* vertex_buffer)
-{
-    const int i_dst = start.x + threadIdx.x + blockIdx.x * blockDim.x;
-    const int j_dst = start.y + threadIdx.y + blockIdx.y * blockDim.y;
-    const int k_dst = start.z + threadIdx.z + blockIdx.z * blockDim.z;
-
-    //Skip threads which are not at valid boundaries.
-    if (filter_outbound(end, i_dst, j_dst, k_dst)) return;
-    if (filter_inbound(i_dst, j_dst, k_dst)) return;
-    if (filter_xbound(i_dst, j_dst, k_dst)) return;
-    if (filter_zbound(i_dst, j_dst, k_dst)) return;
-
-    // Find the source index
-    // Map to nx, ny, nz coordinates  
-    int i_src = i_dst - DCONST(AC_nx_min);
-    int j_src = j_dst - DCONST(AC_ny_min);
-    int k_src = k_dst - DCONST(AC_nz_min);
-
-    // Translate (s.t. the index is always positive)
-    i_src += DCONST(AC_nx);
-    j_src += DCONST(AC_ny);
-    k_src += DCONST(AC_nz);
-
-    // Wrap
-    i_src %= DCONST(AC_nx);
-    j_src %= DCONST(AC_ny);
-    k_src %= DCONST(AC_nz);
-
-    // Map to mx, my, mz coordinates
-    i_src += DCONST(AC_nx_min);
-    j_src += DCONST(AC_ny_min);
-    k_src += DCONST(AC_nz_min);
-
-    const int src_idx      = DEVICE_VTXBUF_IDX(i_src, j_src, k_src);
-    const int dst_idx      = DEVICE_VTXBUF_IDX(i_dst, j_dst, k_dst);
-    vertex_buffer[dst_idx] = vertex_buffer[src_idx];     
-}
-
-
-
-
-// This boundary condition is only peridic in y-direction and will require
-// other boundaries to determine x direction. Build for the vedge model
-// teasting in mind. Use with discretion. 
-__global__ void
-_yz_periodic_boundconds(const int3 start, const int3 end, AcReal* vertex_buffer)
-{
-    const int i_dst = start.x + threadIdx.x + blockIdx.x * blockDim.x;
-    const int j_dst = start.y + threadIdx.y + blockIdx.y * blockDim.y;
-    const int k_dst = start.z + threadIdx.z + blockIdx.z * blockDim.z;
-
-    //Skip threads which are not at valid boundaries.
-    if (filter_outbound(end, i_dst, j_dst, k_dst)) return;
-    if (filter_inbound(i_dst, j_dst, k_dst)) return;
-    if (filter_xbound(i_dst, j_dst, k_dst)) return;
-
-    // Find the source index
-    // Map to nx, ny, nz coordinates  
-    int i_src = i_dst - DCONST(AC_nx_min);
-    int j_src = j_dst - DCONST(AC_ny_min);
-    int k_src = k_dst - DCONST(AC_nz_min);
-
-    // Translate (s.t. the index is always positive)
-    i_src += DCONST(AC_nx);
-    j_src += DCONST(AC_ny);
-    k_src += DCONST(AC_nz);
-
-    // Wrap
-    i_src %= DCONST(AC_nx);
-    j_src %= DCONST(AC_ny);
-    k_src %= DCONST(AC_nz);
-
-    // Map to mx, my, mz coordinates
-    i_src += DCONST(AC_nx_min);
-    j_src += DCONST(AC_ny_min);
-    k_src += DCONST(AC_nz_min);
-
-    const int src_idx      = DEVICE_VTXBUF_IDX(i_src, j_src, k_src);
-    const int dst_idx      = DEVICE_VTXBUF_IDX(i_dst, j_dst, k_dst);
-    vertex_buffer[dst_idx] = vertex_buffer[src_idx];     
-}
-
-//Bundle of boundary conditions for xy inflow
-void
-xz_inflow_boundconds(const cudaStream_t stream, const dim3& tpb, const dim3 bpg,
-                     const int3& start, const int3& end, VertexBufferArray d_buffer)
-{
-    _set_density_xzin<<<bpg, tpb, 0, stream>>>(start, end, d_buffer.in[VTXBUF_LNRHO]);                                               ERRCHK_CUDA_KERNEL();    
-    //_set_csconst_xzin<<<bpg, tpb, 0, stream>>>(start, end, d_buffer.in[VTXBUF_ENTROPY], d_buffer.in[VTXBUF_LNRHO]);                  ERRCHK_CUDA_KERNEL();    
-    _set_uinflow_xzin<<<bpg, tpb, 0, stream>>>(start, end, d_buffer.in[VTXBUF_UUX], d_buffer.in[VTXBUF_UUY], d_buffer.in[VTXBUF_UUZ]); ERRCHK_CUDA_KERNEL();    
-
-    _rel_antisym_xtop<<<bpg, tpb, 0, stream>>>(start, end, d_buffer.in[VTXBUF_LNRHO]); ERRCHK_CUDA_KERNEL();   
-    _rel_antisym_xtop<<<bpg, tpb, 0, stream>>>(start, end, d_buffer.in[VTXBUF_UUX]);     ERRCHK_CUDA_KERNEL();    
-    _rel_antisym_xtop<<<bpg, tpb, 0, stream>>>(start, end, d_buffer.in[VTXBUF_UUY]);     ERRCHK_CUDA_KERNEL();    
-    _rel_antisym_xtop<<<bpg, tpb, 0, stream>>>(start, end, d_buffer.in[VTXBUF_UUZ]);     ERRCHK_CUDA_KERNEL();    
-
-    _rel_antisym_zbot<<<bpg, tpb, 0, stream>>>(start, end, d_buffer.in[VTXBUF_LNRHO]); ERRCHK_CUDA_KERNEL();   
-    _rel_antisym_zbot<<<bpg, tpb, 0, stream>>>(start, end, d_buffer.in[VTXBUF_UUX]);     ERRCHK_CUDA_KERNEL();    
-    _rel_antisym_zbot<<<bpg, tpb, 0, stream>>>(start, end, d_buffer.in[VTXBUF_UUY]);     ERRCHK_CUDA_KERNEL();    
-    _rel_antisym_zbot<<<bpg, tpb, 0, stream>>>(start, end, d_buffer.in[VTXBUF_UUZ]);     ERRCHK_CUDA_KERNEL();    
-
-    _rel_antisym_ztop<<<bpg, tpb, 0, stream>>>(start, end, d_buffer.in[VTXBUF_LNRHO]); ERRCHK_CUDA_KERNEL();   
-    _rel_antisym_ztop<<<bpg, tpb, 0, stream>>>(start, end, d_buffer.in[VTXBUF_UUX]);     ERRCHK_CUDA_KERNEL();    
-    _rel_antisym_ztop<<<bpg, tpb, 0, stream>>>(start, end, d_buffer.in[VTXBUF_UUY]);     ERRCHK_CUDA_KERNEL();    
-    _rel_antisym_ztop<<<bpg, tpb, 0, stream>>>(start, end, d_buffer.in[VTXBUF_UUZ]);     ERRCHK_CUDA_KERNEL();    
-}
-
-//Bundle of boundary conditions for x inflow
-void
-x_inflow_boundconds(const cudaStream_t stream, const dim3& tpb, const dim3 bpg, 
-                    const int3& start, const int3& end, VertexBufferArray d_buffer)
-{
-    //_set_density_xin<<<bpg, tpb, 0, stream>>>(start, end, d_buffer.in[VTXBUF_LNRHO]);                                               ERRCHK_CUDA_KERNEL(); //OK!!!   
-    _set_uinflow_xin<<<bpg, tpb, 0, stream>>>(start, end, d_buffer.in[VTXBUF_UUX], d_buffer.in[VTXBUF_UUY], d_buffer.in[VTXBUF_UUZ], d_buffer.in[VTXBUF_LNRHO]); ERRCHK_CUDA_KERNEL();    
-    //_uu_inflow_simple_xbot<<<bpg, tpb, 0, stream>>>(start, end, d_buffer.in[VTXBUF_UUX], d_buffer.in[VTXBUF_UUY], d_buffer.in[VTXBUF_UUZ]); ERRCHK_CUDA_KERNEL();
-
-    //acSynchronize();
-
-    //_rel_antisym_xtop<<<bpg, tpb, 0, stream>>>(start, end, d_buffer.in[VTXBUF_LNRHO]); ERRCHK_CUDA_KERNEL(); //OK!!! 
-    ////TODO: Corners have to be determined by some condintion. Otherwise weird values appear. 
-    //_rel_antisym_xtop<<<bpg, tpb, 0, stream>>>(start, end, d_buffer.in[VTXBUF_UUX]); ERRCHK_CUDA_KERNEL(); //OK!!! 
-    //_rel_antisym_xtop<<<bpg, tpb, 0, stream>>>(start, end, d_buffer.in[VTXBUF_UUY]); ERRCHK_CUDA_KERNEL(); //OK!!! 
-    //_rel_antisym_xtop<<<bpg, tpb, 0, stream>>>(start, end, d_buffer.in[VTXBUF_UUZ]); ERRCHK_CUDA_KERNEL(); //OK!!! 
-
-    //_rel_antisym_xtop<<<bpg, tpb, 0, stream>>>(start, end, d_buffer.in[VTXBUF_LNRHO]); ERRCHK_CUDA_KERNEL(); //OK!!! 
-    ////TODO: Corners have to be determined by some condintion. Otherwise weird values appear. 
-    //_sym_xtop<<<bpg, tpb, 0, stream>>>(start, end, d_buffer.in[VTXBUF_UUX]); ERRCHK_CUDA_KERNEL(); //OK!!! 
-    //_sym_xtop<<<bpg, tpb, 0, stream>>>(start, end, d_buffer.in[VTXBUF_UUY]); ERRCHK_CUDA_KERNEL(); //OK!!! 
-    //_sym_xtop<<<bpg, tpb, 0, stream>>>(start, end, d_buffer.in[VTXBUF_UUZ]); ERRCHK_CUDA_KERNEL(); //OK!!! 
-    //_rel_antisym_xtop<<<bpg, tpb, 0, stream>>>(start, end, d_buffer.in[VTXBUF_UUX]); ERRCHK_CUDA_KERNEL(); //OK!!! 
-    //_rel_antisym_xtop<<<bpg, tpb, 0, stream>>>(start, end, d_buffer.in[VTXBUF_UUY]); ERRCHK_CUDA_KERNEL(); //OK!!! 
-    //_rel_antisym_xtop<<<bpg, tpb, 0, stream>>>(start, end, d_buffer.in[VTXBUF_UUZ]); ERRCHK_CUDA_KERNEL(); //OK!!! 
-
-
-
-}
-
-//Bundle of boundary conditions for outflow
-void
-x_outflow_boundconds(const cudaStream_t stream, const dim3& tpb, const dim3 bpg,
-                    const int3& start, const int3& end, VertexBufferArray d_buffer)
-{
-   // _set_density_xout<<<bpg, tpb, 0, stream>>>(start,  end, d_buffer.in[VTXBUF_LNRHO]); ERRCHK_CUDA_KERNEL(); //OK!!! 
-   //acSynchronize();
-
-   //DO we need to take the gravity effects explicitly in account here? 
-
-    //_rel_antisym_xbot<<<bpg, tpb, 0, stream>>>(start, end, d_buffer.in[VTXBUF_LNRHO]); ERRCHK_CUDA_KERNEL();  //OK!!!   
-    _rel_antisym_xbot<<<bpg, tpb, 0, stream>>>(start, end, d_buffer.in[VTXBUF_LNRHO]); ERRCHK_CUDA_KERNEL();  //OK!!!   
-    //TODO: Chenk that these actually work.  
-    _uu_outflow_xbot<<< bpg, tpb, 0, stream>>>(start, end, d_buffer.in[VTXBUF_UUX], d_buffer.in[VTXBUF_UUY], d_buffer.in[VTXBUF_UUZ]); ERRCHK_CUDA_KERNEL();     
-    
-}
-
-
-
-
-/*
-
-IMPORTANT NOTE! 
-
-The above boundary conditions have ben written for the pseudodisk model by
-mvaisala. At the moment therefore there two alternative approaches for the
-antisymmetric boundary condition. These need to be adapted in a smart way.
-
-Fortunately the kernels as they are do not interfere each other due to
-differing naming conventions.
-
-jpekkila's antisymmetric kernel
-
-_antisymmetric_boundconds(...)
-
-mvaisala's antisymmetric kernels: 
-
-_rel_antisym_general(...)
-_rel_antisym_xtop(...);    
-_rel_antisym_zbot(...);    
-_rel_antisym_ztop(...);    
-
-*/
-
-
-
-
-
-__global__ void
-_periodic_boundconds(const int3 start, const int3 end, AcReal* vertex_buffer)
-{
-    const int i_dst = start.x + threadIdx.x + blockIdx.x * blockDim.x;
-    const int j_dst = start.y + threadIdx.y + blockIdx.y * blockDim.y;
-    const int k_dst = start.z + threadIdx.z + blockIdx.z * blockDim.z;
-
-    // If within the start-end range (this allows threadblock dims that are not
-    // divisible by end - start)
-    if (i_dst >= end.x || j_dst >= end.y || k_dst >= end.z)
-        return;
-
-    //if (i_dst >= DCONST(AC_mx) || j_dst >= DCONST(AC_my) || k_dst >= DCONST(AC_mz))
-    //    return;
-
-    // If destination index is inside the computational domain, return since
-    // the boundary conditions are only applied to the ghost zones
-    if (i_dst >= DCONST(AC_nx_min) && i_dst < DCONST(AC_nx_max) &&
-        j_dst >= DCONST(AC_ny_min) && j_dst < DCONST(AC_ny_max) &&
-        k_dst >= DCONST(AC_nz_min) && k_dst < DCONST(AC_nz_max))
-        return;
-
-    // Find the source index
-    // Map to nx, ny, nz coordinates
-    int i_src = i_dst - DCONST(AC_nx_min);
-    int j_src = j_dst - DCONST(AC_ny_min);
-    int k_src = k_dst - DCONST(AC_nz_min);
-
-    // Translate (s.t. the index is always positive)
-    i_src += DCONST(AC_nx);
-    j_src += DCONST(AC_ny);
-    k_src += DCONST(AC_nz);
-
-    // Wrap
-    i_src %= DCONST(AC_nx);
-    j_src %= DCONST(AC_ny);
-    k_src %= DCONST(AC_nz);
-
-    // Map to mx, my, mz coordinates
-    i_src += DCONST(AC_nx_min);
-    j_src += DCONST(AC_ny_min);
-    k_src += DCONST(AC_nz_min);
-
-    const int src_idx      = DEVICE_VTXBUF_IDX(i_src, j_src, k_src);
-    const int dst_idx      = DEVICE_VTXBUF_IDX(i_dst, j_dst, k_dst);
-    vertex_buffer[dst_idx] = vertex_buffer[src_idx];    
-}
-
-void
-periodic_boundconds(const cudaStream_t stream, const dim3& tpb, 
-                    const int3& start, const int3& end, AcReal* vertex_buffer)
-{
-    const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x),
-                   (unsigned int)ceil((end.y - start.y) / (float)tpb.y),
-                   (unsigned int)ceil((end.z - start.z) / (float)tpb.z));
-
-    _periodic_boundconds<<<bpg, tpb, 0, stream>>>(start, end, vertex_buffer);
-    ERRCHK_CUDA_KERNEL();
-}
-
-
-typedef enum {
-    X_AXIS,
-    Y_AXIS,
-    Z_AXIS,
-    NUM_AXES
-} Axis;
-
-__global__ void
-_antisymmetric_boundconds(const int3 start, const int3 end, const Axis symmetry_axis, const AcReal base_value, AcReal* vertex_buffer)
-{
-    const int i_dst = start.x + threadIdx.x + blockIdx.x * blockDim.x;
-    const int j_dst = start.y + threadIdx.y + blockIdx.y * blockDim.y;
-    const int k_dst = start.z + threadIdx.z + blockIdx.z * blockDim.z;
-
-    // If within the start-end range (this allows threadblock dims that are not
-    // divisible by end - start)
-    if (i_dst >= end.x || j_dst >= end.y || k_dst >= end.z)
-        return;
-
-    //if (i_dst >= DCONST(AC_mx) || j_dst >= DCONST(AC_my) || k_dst >= DCONST(AC_mz))
-    //    return;
-
-    // If destination index is inside the computational domain, return since
-    // the boundary conditions are only applied to the ghost zones
-    if (i_dst >= DCONST(AC_nx_min) && i_dst < DCONST(AC_nx_max) &&
-        j_dst >= DCONST(AC_ny_min) && j_dst < DCONST(AC_ny_max) &&
-        k_dst >= DCONST(AC_nz_min) && k_dst < DCONST(AC_nz_max))
-        return;
-
-    // Find the source index
-    // Map to nx, ny, nz coordinates
-    int i_src = i_dst - DCONST(AC_nx_min);
-    int j_src = j_dst - DCONST(AC_ny_min);
-    int k_src = k_dst - DCONST(AC_nz_min);
-
-    // Translate (s.t. the index is always positive)
-    i_src += DCONST(AC_nx);
-    j_src += DCONST(AC_ny);
-    k_src += DCONST(AC_nz);
-
-    // Wrap
-    i_src %= DCONST(AC_nx);
-    j_src %= DCONST(AC_ny);
-    k_src %= DCONST(AC_nz);
-
-    // Map to mx, my, mz coordinates
-    i_src += DCONST(AC_nx_min);
-    j_src += DCONST(AC_ny_min);
-    k_src += DCONST(AC_nz_min);
-
-    const int src_idx      = DEVICE_VTXBUF_IDX(i_src, j_src, k_src);
-    const int dst_idx      = DEVICE_VTXBUF_IDX(i_dst, j_dst, k_dst);
-
-    if (base_value >= 0) {
-        vertex_buffer[dst_idx] = -vertex_buffer[src_idx];
-    } else {
-        int bound_idx = -1;
-        if (symmetry_axis == X_AXIS) {
-            int boundary_idx = 0;
-            if (i_dst < STENCIL_ORDER/2)
-                boundary_idx = STENCIL_ORDER/2;
-            else
-                boundary_idx = DCONST(AC_nx_max) - 1;
-
-            bound_idx = DEVICE_VTXBUF_IDX(boundary_idx, j_src, k_src);
-        } else if (symmetry_axis == Y_AXIS) {
-            int boundary_idx = 0;
-            if (j_dst < STENCIL_ORDER/2)
-                boundary_idx = STENCIL_ORDER/2;
-            else
-                boundary_idx = DCONST(AC_ny_max) - 1;
-
-            bound_idx = DEVICE_VTXBUF_IDX(i_src, boundary_idx, k_src);
-        } else { // symmetry_axis == Z_AXIS
-            int boundary_idx = 0;
-            if (k_dst < STENCIL_ORDER/2)
-                boundary_idx = STENCIL_ORDER/2;
-            else
-                boundary_idx = DCONST(AC_nz_max) - 1;
-
-            bound_idx = DEVICE_VTXBUF_IDX(i_src, j_src, boundary_idx);
-        }
-        vertex_buffer[dst_idx] = -(vertex_buffer[src_idx] - vertex_buffer[bound_idx]) + vertex_buffer[bound_idx];
-    }
-}
-
-void
-antisymmetric_boundconds(const cudaStream_t stream, const dim3& tpb, 
-                    const int3& start, const int3& end, const Axis symmetry_axis, const AcReal base_value, AcReal* vertex_buffer)
-{
-    const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x),
-                   (unsigned int)ceil((end.y - start.y) / (float)tpb.y),
-                   (unsigned int)ceil((end.z - start.z) / (float)tpb.z));
-
-    _antisymmetric_boundconds<<<bpg, tpb, 0, stream>>>(start, end, symmetry_axis, base_value, vertex_buffer);
-    ERRCHK_CUDA_KERNEL();
-}
-
-
-//Here we attempt to construct the boundary condition for the vedge setup.
-void 
-wedge_boundconds(const cudaStream_t stream, const dim3& tpb,
-                 const int3& start, const int3& end, VertexBufferArray d_buffer)
-{
-    const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x),
-                   (unsigned int)ceil((end.y - start.y) / (float)tpb.y),
-                   (unsigned int)ceil((end.z - start.z) / (float)tpb.z));
-
-    //Y direction is always periodic
-    // Repeat fo all buffers
-    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
-#if B_VELTYPE == 1
-        _y_periodic_boundconds<<<bpg, tpb, 0, stream>>>(start, end, d_buffer.in[i]); 
-#else
-        _yz_periodic_boundconds<<<bpg, tpb, 0, stream>>>(start, end, d_buffer.in[i]);
-#endif
-        ERRCHK_CUDA_KERNEL();
-    }
-
-
-    //NOTE: Testing first now with fixed value antisymmetric boundary conditions. 
-    //Conditions for inflow boundary 
-#if B_VELTYPE == 1
-    xz_inflow_boundconds(stream, tpb, bpg, start, end, d_buffer);
-#else
-    x_inflow_boundconds(stream, tpb, bpg, start, end, d_buffer);
-#endif
-
-    //Conditions for outflow boundary
-    x_outflow_boundconds(stream, tpb, bpg, start, end, d_buffer);
-    
-    //Conditions for magnetic field 
-    //USING PERIODIC WHILE DOING BASIC HYDRO TESTING.
-    _periodic_boundconds<<<bpg, tpb, 0, stream>>>(start, end, d_buffer.in[VTXBUF_AX]); ERRCHK_CUDA_KERNEL();
-    _periodic_boundconds<<<bpg, tpb, 0, stream>>>(start, end, d_buffer.in[VTXBUF_AY]); ERRCHK_CUDA_KERNEL();
-    _periodic_boundconds<<<bpg, tpb, 0, stream>>>(start, end, d_buffer.in[VTXBUF_AZ]); ERRCHK_CUDA_KERNEL();
-
-
-}
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/src/core/kernels/deprecated/reduce_PLACEHOLDER.cuh b/src/core/kernels/deprecated/reduce_PLACEHOLDER.cuh
deleted file mode 100644
index a7d40e2..0000000
--- a/src/core/kernels/deprecated/reduce_PLACEHOLDER.cuh
+++ /dev/null
@@ -1,338 +0,0 @@
-/*
-    Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.
-
-    This file is part of Astaroth.
-
-    Astaroth is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    Astaroth is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
-*/
-
-/**
- * @file
- * \brief Brief info.
- *
- * Detailed info.
- *
- */
-#pragma once
-#include "device_globals.cuh"
-
-#include "src/core/errchk.h"
-#include "src/core/math_utils.h"
-
-// Function pointer definitions
-typedef AcReal (*ReduceFunc)(const AcReal&, const AcReal&);
-typedef AcReal (*ReduceInitialScalFunc)(const AcReal&);
-typedef AcReal (*ReduceInitialVecFunc)(const AcReal&, const AcReal&,
-                                       const AcReal&);
-
-// clang-format off
-/* Comparison funcs */
-__device__ inline AcReal
-_device_max(const AcReal& a, const AcReal& b) { return a > b ? a : b; }
-
-__device__ inline AcReal
-_device_min(const AcReal& a, const AcReal& b) { return a < b ? a : b; }
-
-__device__ inline AcReal
-_device_sum(const AcReal& a, const AcReal& b) { return a + b; }
-
-/* Function used to determine the values used during reduction */
-__device__ inline AcReal
-_device_length_scal(const AcReal& a) { return AcReal(a); }
-
-__device__ inline AcReal
-_device_squared_scal(const AcReal& a) { return (AcReal)(a*a); }
-
-__device__ inline AcReal
-_device_exp_squared_scal(const AcReal& a) { return exp(a)*exp(a); }
-
-__device__ inline AcReal
-_device_length_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return sqrt(a*a + b*b + c*c); }
-
-__device__ inline AcReal
-_device_squared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return _device_squared_scal(a) + _device_squared_scal(b) + _device_squared_scal(c); }
-
-__device__ inline AcReal
-_device_exp_squared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return _device_exp_squared_scal(a) + _device_exp_squared_scal(b) + _device_exp_squared_scal(c); }
-// clang-format on
-
-__device__ inline bool
-oob(const int& i, const int& j, const int& k)
-{
-    if (i >= d_mesh_info.int_params[AC_nx] ||
-        j >= d_mesh_info.int_params[AC_ny] ||
-        k >= d_mesh_info.int_params[AC_nz])
-        return true;
-    else
-        return false;
-}
-
-template <ReduceInitialScalFunc reduce_initial>
-__global__ void
-_kernel_reduce_scal(const __restrict__ AcReal* src, AcReal* dst)
-{
-    const int i = threadIdx.x + blockIdx.x * blockDim.x;
-    const int j = threadIdx.y + blockIdx.y * blockDim.y;
-    const int k = threadIdx.z + blockIdx.z * blockDim.z;
-
-    if (oob(i, j, k))
-        return;
-
-    const int src_idx = DEVICE_VTXBUF_IDX(
-        i + d_mesh_info.int_params[AC_nx_min],
-        j + d_mesh_info.int_params[AC_ny_min],
-        k + d_mesh_info.int_params[AC_nz_min]);
-    const int dst_idx = DEVICE_1D_COMPDOMAIN_IDX(i, j, k);
-
-    dst[dst_idx] = reduce_initial(src[src_idx]);
-}
-
-template <ReduceInitialVecFunc reduce_initial>
-__global__ void
-_kernel_reduce_vec(const __restrict__ AcReal* src_a,
-                   const __restrict__ AcReal* src_b,
-                   const __restrict__ AcReal* src_c, AcReal* dst)
-{
-    const int i = threadIdx.x + blockIdx.x * blockDim.x;
-    const int j = threadIdx.y + blockIdx.y * blockDim.y;
-    const int k = threadIdx.z + blockIdx.z * blockDim.z;
-
-    if (oob(i, j, k))
-        return;
-
-    const int src_idx = DEVICE_VTXBUF_IDX(
-        i + d_mesh_info.int_params[AC_nx_min],
-        j + d_mesh_info.int_params[AC_ny_min],
-        k + d_mesh_info.int_params[AC_nz_min]);
-    const int dst_idx = DEVICE_1D_COMPDOMAIN_IDX(i, j, k);
-
-    dst[dst_idx] = reduce_initial(src_a[src_idx], src_b[src_idx],
-                                  src_c[src_idx]);
-}
-
-///////////////////////////////////////////////////////////////////////////////
-#define BLOCK_SIZE (1024)
-#define ELEMS_PER_THREAD (32)
-
-template <ReduceFunc reduce>
-__global__ void
-_kernel_reduce(AcReal* src, AcReal* result)
-{
-    const int idx = threadIdx.x + blockIdx.x * BLOCK_SIZE * ELEMS_PER_THREAD;
-    const int scratchpad_size = DCONST(AC_nxyz);
-
-    if (idx >= scratchpad_size)
-        return;
-
-    __shared__ AcReal smem[BLOCK_SIZE];
-
-    AcReal tmp = src[idx];
-
-    for (int i = 1; i < ELEMS_PER_THREAD; ++i) {
-        const int src_idx = idx + i * BLOCK_SIZE;
-        if (src_idx >= scratchpad_size) {
-            // This check is for safety: if accessing uninitialized values
-            // beyond the mesh boundaries, we will immediately start seeing NANs
-            if (threadIdx.x < BLOCK_SIZE)
-                smem[threadIdx.x] = NAN;
-            else
-                break;
-        }
-        tmp = reduce(tmp, src[src_idx]);
-    }
-
-    smem[threadIdx.x] = tmp;
-    __syncthreads();
-
-    int offset = BLOCK_SIZE / 2;
-    while (offset > 0) {
-
-        if (threadIdx.x < offset) {
-            tmp               = reduce(tmp, smem[threadIdx.x + offset]);
-            smem[threadIdx.x] = tmp;
-        }
-        offset /= 2;
-        __syncthreads();
-    }
-    if (threadIdx.x == 0)
-        src[idx] = tmp;
-}
-
-template <ReduceFunc reduce>
-__global__ void
-_kernel_reduce_block(const __restrict__ AcReal* src, AcReal* result)
-{
-    const int scratchpad_size = DCONST(AC_nxyz);
-    const int idx = threadIdx.x + blockIdx.x * BLOCK_SIZE * ELEMS_PER_THREAD;
-    AcReal tmp    = src[idx];
-    const int block_offset = BLOCK_SIZE * ELEMS_PER_THREAD;
-    for (int i = 1; idx + i * block_offset < scratchpad_size; ++i)
-        tmp = reduce(tmp, src[idx + i * block_offset]);
-
-    *result = tmp;
-}
-//////////////////////////////////////////////////////////////////////////////
-
-AcReal
-_reduce_scal(const cudaStream_t stream,
-             const ReductionType& rtype, const int& nx, const int& ny,
-             const int& nz, const AcReal* vertex_buffer,
-             AcReal* reduce_scratchpad, AcReal* reduce_result)
-{
-    bool solve_mean = false;
-
-    const dim3 tpb(32, 4, 1);
-    const dim3 bpg(int(ceil(AcReal(nx) / tpb.x)), int(ceil(AcReal(ny) / tpb.y)),
-                   int(ceil(AcReal(nz) / tpb.z)));
-
-    const int scratchpad_size = nx * ny * nz;
-    const int bpg2            = (unsigned int)ceil(AcReal(scratchpad_size) /
-                                        AcReal(ELEMS_PER_THREAD * BLOCK_SIZE));
-
-    switch (rtype) {
-    case RTYPE_MAX:
-        _kernel_reduce_scal<_device_length_scal>
-            <<<bpg, tpb, 0, stream>>>(vertex_buffer, reduce_scratchpad);
-        _kernel_reduce<_device_max>
-            <<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
-        _kernel_reduce_block<_device_max>
-            <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
-        break;
-    case RTYPE_MIN:
-        _kernel_reduce_scal<_device_length_scal>
-            <<<bpg, tpb, 0, stream>>>(vertex_buffer, reduce_scratchpad);
-        _kernel_reduce<_device_min>
-            <<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
-        _kernel_reduce_block<_device_min>
-            <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
-        break;
-    case RTYPE_RMS:
-        _kernel_reduce_scal<_device_squared_scal>
-            <<<bpg, tpb, 0, stream>>>(vertex_buffer, reduce_scratchpad);
-        _kernel_reduce<_device_sum>
-            <<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
-        _kernel_reduce_block<_device_sum>
-            <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
-        solve_mean = true;
-        break;
-    case RTYPE_RMS_EXP:
-        _kernel_reduce_scal<_device_exp_squared_scal>
-            <<<bpg, tpb, 0, stream>>>(vertex_buffer, reduce_scratchpad);
-        _kernel_reduce<_device_sum>
-            <<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
-        _kernel_reduce_block<_device_sum>
-            <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
-        solve_mean = true;
-        break;
-    default:
-        ERROR("Unrecognized RTYPE");
-    }
-
-    AcReal result;
-    cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
-    if (solve_mean) {
-        const AcReal inv_n = AcReal(1.0) / (nx * ny * nz);
-        return inv_n * result;
-    }
-    else {
-        return result;
-    }
-}
-
-AcReal
-_reduce_vec(const cudaStream_t stream,
-            const ReductionType& rtype, const int& nx, const int& ny,
-            const int& nz, const AcReal* vertex_buffer_a,
-            const AcReal* vertex_buffer_b, const AcReal* vertex_buffer_c,
-            AcReal* reduce_scratchpad, AcReal* reduce_result)
-{
-    bool solve_mean = false;
-
-    const dim3 tpb(32, 4, 1);
-    const dim3 bpg(int(ceil(float(nx) / tpb.x)),
-                   int(ceil(float(ny) / tpb.y)),
-                   int(ceil(float(nz) / tpb.z)));
-
-    const int scratchpad_size = nx * ny * nz;
-    const int bpg2            = (unsigned int)ceil(float(scratchpad_size) /
-                                        float(ELEMS_PER_THREAD * BLOCK_SIZE));
-
-    // "Features" of this quick & efficient reduction:
-    // Block size must be smaller than the computational domain size
-    // (otherwise we would have do some additional bounds checking in the
-    // second half of _kernel_reduce, which gets quite confusing)
-    // Also the BLOCK_SIZE must be a multiple of two s.t. we can easily split
-    // the work without worrying too much about the array bounds.
-    ERRCHK(BLOCK_SIZE <= scratchpad_size);
-    ERRCHK(!(BLOCK_SIZE % 2));
-    // NOTE! Also does not work properly with non-power of two mesh dimension
-    // Issue is with "smem[BLOCK_SIZE];". If you init smem to NANs, you can
-    // see that uninitialized smem values are used in the comparison
-    ERRCHK(is_power_of_two(nx));
-    ERRCHK(is_power_of_two(ny));
-    ERRCHK(is_power_of_two(nz));
-
-    switch (rtype) {
-    case RTYPE_MAX:
-        _kernel_reduce_vec<_device_length_vec>
-            <<<bpg, tpb, 0, stream>>>(vertex_buffer_a, vertex_buffer_b, vertex_buffer_c,
-                           reduce_scratchpad);
-        _kernel_reduce<_device_max>
-            <<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
-        _kernel_reduce_block<_device_max>
-            <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
-        break;
-    case RTYPE_MIN:
-        _kernel_reduce_vec<_device_length_vec>
-            <<<bpg, tpb, 0, stream>>>(vertex_buffer_a, vertex_buffer_b, vertex_buffer_c,
-                           reduce_scratchpad);
-        _kernel_reduce<_device_min>
-            <<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
-        _kernel_reduce_block<_device_min>
-            <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
-        break;
-    case RTYPE_RMS:
-        _kernel_reduce_vec<_device_squared_vec>
-            <<<bpg, tpb, 0, stream>>>(vertex_buffer_a, vertex_buffer_b, vertex_buffer_c,
-                           reduce_scratchpad);
-        _kernel_reduce<_device_sum>
-            <<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
-        _kernel_reduce_block<_device_sum>
-            <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
-        solve_mean = true;
-        break;
-    case RTYPE_RMS_EXP:
-        _kernel_reduce_vec<_device_exp_squared_vec>
-            <<<bpg, tpb, 0, stream>>>(vertex_buffer_a, vertex_buffer_b, vertex_buffer_c,
-                           reduce_scratchpad);
-        _kernel_reduce<_device_sum>
-            <<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
-        _kernel_reduce_block<_device_sum>
-            <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
-        solve_mean = true;
-        break;
-    default:
-        ERROR("Unrecognized RTYPE");
-    }
-
-    AcReal result;
-    cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
-    if (solve_mean) {
-        const AcReal inv_n = AcReal(1.0) / (nx * ny * nz);
-        return inv_n * result;
-    }
-    else {
-        return result;
-    }
-}
diff --git a/src/core/kernels/deprecated/rk3_PLACEHOLDER.cuh b/src/core/kernels/deprecated/rk3_PLACEHOLDER.cuh
deleted file mode 100644
index 94524be..0000000
--- a/src/core/kernels/deprecated/rk3_PLACEHOLDER.cuh
+++ /dev/null
@@ -1,742 +0,0 @@
-/*
-    Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.
-
-    This file is part of Astaroth.
-
-    Astaroth is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    Astaroth is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
-*/
-
-/**
- * @file
- * \brief Implementation of the integration pipeline
- *
- *
- *
- */
-#pragma once
-#include "device_globals.cuh"
-
-#include <assert.h>
-
-/*
-#define RK_THREADS_X (32)
-#define RK_THREADS_Y (1)
-#define RK_THREADS_Z (4)
-#define RK_LAUNCH_BOUND_MIN_BLOCKS (4)
-#define RK_THREADBLOCK_SIZE (RK_THREADS_X * RK_THREADS_Y * RK_THREADS_Z)
-*/
-
-static __device__ __forceinline__ int
-IDX(const int i)
-{
-    return i;
-}
-
-static __device__ __forceinline__ int
-IDX(const int i, const int j, const int k)
-{
-    return DEVICE_VTXBUF_IDX(i, j, k);
-}
-
-static __device__ __forceinline__ int
-IDX(const int3 idx)
-{
-    return DEVICE_VTXBUF_IDX(idx.x, idx.y, idx.z);
-}
-
-static __forceinline__ AcMatrix
-create_rotz(const AcReal radians)
-{
-    AcMatrix mat;
-
-    mat.row[0] = (AcReal3){cos(radians), -sin(radians), 0};
-    mat.row[1] = (AcReal3){sin(radians), cos(radians), 0};
-    mat.row[2] = (AcReal3){0, 0, 0};
-
-    return mat;
-}
-
-
-#if AC_DOUBLE_PRECISION == 0
-#define sin __sinf
-#define cos __cosf
-#define exp __expf
-#define rsqrt rsqrtf // hardware reciprocal sqrt
-#endif // AC_DOUBLE_PRECISION == 0
-
-
-/*
-typedef struct {
-    int i, j, k;
-} int3;*/
-
-/*
- * =============================================================================
- * Level 0 (Input Assembly Stage)
- * =============================================================================
- */
-
-/*
- * =============================================================================
- * Level 0.1 (Read stencil elements and solve derivatives)
- * =============================================================================
- */
-static __device__ __forceinline__ AcReal
-first_derivative(const AcReal* __restrict__ pencil, const AcReal inv_ds)
-{
-#if STENCIL_ORDER == 2
-    const AcReal coefficients[] = {0, 1.0 / 2.0};
-#elif STENCIL_ORDER == 4
-    const AcReal coefficients[] = {0, 2.0 / 3.0, -1.0 / 12.0};
-#elif STENCIL_ORDER == 6
-    const AcReal coefficients[] = {0, 3.0 / 4.0, -3.0 / 20.0, 1.0 / 60.0};
-#elif STENCIL_ORDER == 8
-    const AcReal coefficients[] = {0, 4.0 / 5.0, -1.0 / 5.0, 4.0 / 105.0,
-                                   -1.0 / 280.0};
-#endif
-
-    #define MID (STENCIL_ORDER / 2)
-    AcReal res    = 0;
-
-#pragma unroll
-    for (int i = 1; i <= MID; ++i)
-        res += coefficients[i] * (pencil[MID + i] - pencil[MID - i]);
-
-    return res * inv_ds;
-}
-
-static __device__ __forceinline__ AcReal
-second_derivative(const AcReal* __restrict__ pencil, const AcReal inv_ds)
-{
-#if STENCIL_ORDER == 2
-    const AcReal coefficients[] = {-2., 1.};
-#elif STENCIL_ORDER == 4
-    const AcReal coefficients[] = {-5.0/2.0, 4.0/3.0, -1.0/12.0};
-#elif STENCIL_ORDER == 6
-    const AcReal coefficients[] = {-49.0 / 18.0, 3.0 / 2.0, -3.0 / 20.0,
-                                   1.0 / 90.0};
-#elif STENCIL_ORDER == 8
-    const AcReal coefficients[] = {-205.0 / 72.0, 8.0 / 5.0, -1.0 / 5.0,
-                                   8.0 / 315.0, -1.0 / 560.0};
-#endif
-
-    #define MID (STENCIL_ORDER / 2)
-    AcReal res    = coefficients[0] * pencil[MID];
-
-#pragma unroll
-    for (int i = 1; i <= MID; ++i)
-        res += coefficients[i] * (pencil[MID + i] + pencil[MID - i]);
-
-    return res * inv_ds * inv_ds;
-}
-
-/** inv_ds: inverted mesh spacing f.ex. 1. / mesh.int_params[AC_dsx] */
-static __device__ __forceinline__ AcReal
-cross_derivative(const AcReal* __restrict__ pencil_a,
-                 const AcReal* __restrict__ pencil_b, const AcReal inv_ds_a,
-                 const AcReal inv_ds_b)
-{
-#if STENCIL_ORDER == 2
-    const AcReal coefficients[] = {0, 1.0 / 4.0};
-#elif STENCIL_ORDER == 4
-    const AcReal coefficients[] = {0, 1.0 / 32.0, 1.0 / 64.0}; // TODO correct coefficients, these are just placeholders
-#elif STENCIL_ORDER == 6
-    const AcReal fac            = (1. / 720.);
-    const AcReal coefficients[] = {0.0 * fac, 270.0 * fac, -27.0 * fac,
-                                   2.0 * fac};
-#elif STENCIL_ORDER == 8
-    const AcReal fac            = (1. / 20160.);
-    const AcReal coefficients[] = {0.0 * fac, 8064. * fac, -1008. * fac,
-                                   128. * fac, -9. * fac};
-#endif
-
-    #define MID (STENCIL_ORDER / 2)
-    AcReal res    = AcReal(0.);
-
-    #pragma unroll
-    for (int i = 1; i <= MID; ++i) {
-        res += coefficients[i] * (pencil_a[MID + i] + pencil_a[MID - i] -
-                                  pencil_b[MID + i] - pencil_b[MID - i]);
-    }
-    return res * inv_ds_a * inv_ds_b;
-}
-
-static __device__ __forceinline__ AcReal
-derx(const int3 vertexIdx, const AcReal* __restrict__ arr)
-{
-    AcReal pencil[STENCIL_ORDER + 1];
-#pragma unroll
-    for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
-        pencil[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y, vertexIdx.z)];
-
-    return first_derivative(pencil, DCONST_REAL(AC_inv_dsx));
-}
-
-static __device__ __forceinline__ AcReal
-derxx(const int3 vertexIdx, const AcReal* __restrict__ arr)
-{
-    AcReal pencil[STENCIL_ORDER + 1];
-#pragma unroll
-    for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
-        pencil[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y, vertexIdx.z)];
-
-    return second_derivative(pencil, DCONST_REAL(AC_inv_dsx));
-}
-
-static __device__ __forceinline__ AcReal
-derxy(const int3 vertexIdx, const AcReal* __restrict__ arr)
-{
-    AcReal pencil_a[STENCIL_ORDER + 1];
-#pragma unroll
-    for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
-        pencil_a[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2,
-                                   vertexIdx.y + offset - STENCIL_ORDER / 2, vertexIdx.z)];
-
-    AcReal pencil_b[STENCIL_ORDER + 1];
-#pragma unroll
-    for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
-        pencil_b[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2,
-                                   vertexIdx.y + STENCIL_ORDER / 2 - offset, vertexIdx.z)];
-
-    return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsx),
-                            DCONST_REAL(AC_inv_dsy));
-}
-
-static __device__ __forceinline__ AcReal
-derxz(const int3 vertexIdx, const AcReal* __restrict__ arr)
-{
-    AcReal pencil_a[STENCIL_ORDER + 1];
-#pragma unroll
-    for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
-        pencil_a[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y,
-                                   vertexIdx.z + offset - STENCIL_ORDER / 2)];
-
-    AcReal pencil_b[STENCIL_ORDER + 1];
-#pragma unroll
-    for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
-        pencil_b[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y,
-                                   vertexIdx.z + STENCIL_ORDER / 2 - offset)];
-
-    return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsx),
-                            DCONST_REAL(AC_inv_dsz));
-}
-
-static __device__ __forceinline__ AcReal
-dery(const int3 vertexIdx, const AcReal* __restrict__ arr)
-{
-    AcReal pencil[STENCIL_ORDER + 1];
-#pragma unroll
-    for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
-        pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2, vertexIdx.z)];
-
-    return first_derivative(pencil, DCONST_REAL(AC_inv_dsy));
-}
-
-static __device__ __forceinline__ AcReal
-deryy(const int3 vertexIdx, const AcReal* __restrict__ arr)
-{
-    AcReal pencil[STENCIL_ORDER + 1];
-#pragma unroll
-    for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
-        pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2, vertexIdx.z)];
-
-    return second_derivative(pencil, DCONST_REAL(AC_inv_dsy));
-}
-
-static __device__ __forceinline__ AcReal
-deryz(const int3 vertexIdx, const AcReal* __restrict__ arr)
-{
-    AcReal pencil_a[STENCIL_ORDER + 1];
-#pragma unroll
-    for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
-        pencil_a[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2,
-                                   vertexIdx.z + offset - STENCIL_ORDER / 2)];
-
-    AcReal pencil_b[STENCIL_ORDER + 1];
-#pragma unroll
-    for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
-        pencil_b[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2,
-                                   vertexIdx.z + STENCIL_ORDER / 2 - offset)];
-
-    return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsy),
-                            DCONST_REAL(AC_inv_dsz));
-}
-
-static __device__ __forceinline__ AcReal
-derz(const int3 vertexIdx, const AcReal* __restrict__ arr)
-{
-    AcReal pencil[STENCIL_ORDER + 1];
-#pragma unroll
-    for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
-        pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z + offset - STENCIL_ORDER / 2)];
-
-    return first_derivative(pencil, DCONST_REAL(AC_inv_dsz));
-}
-
-static __device__ __forceinline__ AcReal
-derzz(const int3 vertexIdx, const AcReal* __restrict__ arr)
-{
-    AcReal pencil[STENCIL_ORDER + 1];
-#pragma unroll
-    for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
-        pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z + offset - STENCIL_ORDER / 2)];
-
-    return second_derivative(pencil, DCONST_REAL(AC_inv_dsz));
-}
-
-/*
- * =============================================================================
- * Level 0.2 (Caching functions)
- * =============================================================================
- */
-
-#include "stencil_assembly.cuh"
-
-/*
-typedef struct {
-    AcRealData x;
-    AcRealData y;
-    AcRealData z;
-} AcReal3Data;
-
-static __device__ __forceinline__ AcReal3Data
-read_data(const int i, const int j, const int k,
-          AcReal* __restrict__ buf[], const int3& handle)
-{
-    AcReal3Data data;
-
-    data.x = read_data(i, j, k, buf, handle.x);
-    data.y = read_data(i, j, k, buf, handle.y);
-    data.z = read_data(i, j, k, buf, handle.z);
-
-    return data;
-}
-*/
-
-/*
- * =============================================================================
- * Level 0.3 (Built-in functions available during the Stencil Processing Stage)
- * =============================================================================
- */
-
-static __host__ __device__ __forceinline__ AcReal3
-operator-(const AcReal3& a, const AcReal3& b)
-{
-    return (AcReal3){a.x - b.x, a.y - b.y, a.z - b.z};
-}
-
-static __host__ __device__ __forceinline__ AcReal3
-operator+(const AcReal3& a, const AcReal3& b)
-{
-    return (AcReal3){a.x + b.x, a.y + b.y, a.z + b.z};
-}
-
-static __host__ __device__ __forceinline__ AcReal3
-operator-(const AcReal3& a)
-{
-    return (AcReal3){-a.x, -a.y, -a.z};
-}
-
-static __host__  __device__ __forceinline__ AcReal3
-operator*(const AcReal a, const AcReal3& b)
-{
-    return (AcReal3){a * b.x, a * b.y, a * b.z};
-}
-
-static __host__ __device__ __forceinline__ AcReal
-dot(const AcReal3& a, const AcReal3& b)
-{
-    return a.x * b.x + a.y * b.y + a.z * b.z;
-}
-
-static __host__ __device__ __forceinline__ AcReal3
-mul(const AcMatrix& aa, const AcReal3& x)
-{
-    return (AcReal3){dot(aa.row[0], x), dot(aa.row[1], x), dot(aa.row[2], x)};
-}
-
-static __host__ __device__ __forceinline__ AcReal3
-cross(const AcReal3& a, const AcReal3& b)
-{
-    AcReal3 c;
-
-    c.x = a.y * b.z - a.z * b.y;
-    c.y = a.z * b.x - a.x * b.z;
-    c.z = a.x * b.y - a.y * b.x;
-
-    return c;
-}
-
-static __host__ __device__ __forceinline__ bool
-is_valid(const AcReal a)
-{
-    return !isnan(a) && !isinf(a);
-}
-
-static __host__ __device__ __forceinline__ bool
-is_valid(const AcReal3& a)
-{
-    return is_valid(a.x) && is_valid(a.y) && is_valid(a.z);
-}
-
-
-/*
- * =============================================================================
- * Level 1 (Stencil Processing Stage)
- * =============================================================================
- */
-
-/*
- * =============================================================================
- * Level 1.1 (Terms)
- * =============================================================================
- */
-static __device__ __forceinline__ AcReal
-laplace(const AcRealData& data)
-{
-    return hessian(data).row[0].x + hessian(data).row[1].y + hessian(data).row[2].z;
-}
-
-static __device__ __forceinline__ AcReal
-divergence(const AcReal3Data& vec)
-{
-    return gradient(vec.x).x + gradient(vec.y).y + gradient(vec.z).z;
-}
-
-static __device__ __forceinline__ AcReal3
-laplace_vec(const AcReal3Data& vec)
-{
-    return (AcReal3){laplace(vec.x), laplace(vec.y), laplace(vec.z)};
-}
-
-static __device__ __forceinline__ AcReal3
-curl(const AcReal3Data& vec)
-{
-    return (AcReal3){gradient(vec.z).y - gradient(vec.y).z,
-                     gradient(vec.x).z - gradient(vec.z).x,
-                     gradient(vec.y).x - gradient(vec.x).y};
-}
-
-static __device__ __forceinline__ AcReal3
-gradient_of_divergence(const AcReal3Data& vec)
-{
-    return (AcReal3){hessian(vec.x).row[0].x + hessian(vec.y).row[0].y + hessian(vec.z).row[0].z,
-                     hessian(vec.x).row[1].x + hessian(vec.y).row[1].y + hessian(vec.z).row[1].z,
-                     hessian(vec.x).row[2].x + hessian(vec.y).row[2].y + hessian(vec.z).row[2].z};
-}
-
-// Takes uu gradients and returns S
-static __device__ __forceinline__ AcMatrix
-stress_tensor(const AcReal3Data& vec)
-{
-    AcMatrix S;
-
-    S.row[0].x = AcReal(2. / 3.) * gradient(vec.x).x -
-                 AcReal(1. / 3.) * (gradient(vec.y).y + gradient(vec.z).z);
-    S.row[0].y = AcReal(1. / 2.) * (gradient(vec.x).y + gradient(vec.y).x);
-    S.row[0].z = AcReal(1. / 2.) * (gradient(vec.x).z + gradient(vec.z).x);
-
-    S.row[1].y = AcReal(2. / 3.) * gradient(vec.y).y -
-                 AcReal(1. / 3.) * (gradient(vec.x).x + gradient(vec.z).z);
-
-    S.row[1].z = AcReal(1. / 2.) * (gradient(vec.y).z + gradient(vec.z).y);
-
-    S.row[2].z = AcReal(2. / 3.) * gradient(vec.z).z -
-                 AcReal(1. / 3.) * (gradient(vec.x).x + gradient(vec.y).y);
-
-    S.row[1].x = S.row[0].y;
-    S.row[2].x = S.row[0].z;
-    S.row[2].y = S.row[1].z;
-
-    return S;
-}
-
-static __device__ __forceinline__ AcReal
-contract(const AcMatrix& mat)
-{
-    AcReal res = 0;
-
-    #pragma unroll
-    for (int i = 0; i < 3; ++i)
-        res += dot(mat.row[i], mat.row[i]);
-
-    return res;
-}
-
-/*
- * =============================================================================
- * Level 1.2 (Equations)
- * =============================================================================
- */
-static __device__ __forceinline__ AcReal
-length(const AcReal3& vec)
-{
-    return sqrt(vec.x * vec.x + vec.y * vec.y + vec.z * vec.z);
-}
-
-static __device__ __forceinline__ AcReal
-reciprocal_len(const AcReal3& vec)
-{
-    return rsqrt(vec.x * vec.x + vec.y * vec.y + vec.z * vec.z);
-}
-
-static __device__ __forceinline__ AcReal3
-normalized(const AcReal3& vec)
-{
-    const AcReal inv_len = reciprocal_len(vec);
-    return inv_len * vec;
-}
-
-// Sinusoidal forcing
-// https://arxiv.org/pdf/1704.04676.pdf
-__constant__ AcReal3 forcing_vec;
-__constant__ AcReal forcing_phi;
-static __device__ __forceinline__ AcReal3
-forcing(const int i, const int j, const int k)
-{
-    #define DOMAIN_SIZE_X (DCONST(AC_nx) * DCONST_REAL(AC_dsx))
-    #define DOMAIN_SIZE_Y (DCONST(AC_ny) * DCONST_REAL(AC_dsy))
-    #define DOMAIN_SIZE_Z (DCONST(AC_nz) * DCONST_REAL(AC_dsz))
-    const AcReal3 k_vec = (AcReal3){(i - DCONST(AC_nx_min)) * DCONST_REAL(AC_dsx) - AcReal(.5) * DOMAIN_SIZE_X,
-                                    (j - DCONST(AC_ny_min)) * DCONST_REAL(AC_dsy) - AcReal(.5) * DOMAIN_SIZE_Y,
-                                    (k - DCONST(AC_nz_min)) * DCONST_REAL(AC_dsz) - AcReal(.5) * DOMAIN_SIZE_Z};
-    AcReal inv_len = reciprocal_len(k_vec);
-    if (isnan(inv_len) || isinf(inv_len))
-        inv_len = 0;
-    if (inv_len > 2) // hack to make it cool
-        inv_len = 2;
-    const AcReal k_dot_x = dot(k_vec, forcing_vec);
-
-    const AcReal waves = cos(k_dot_x)*cos(forcing_phi) - sin(k_dot_x) * sin(forcing_phi);
-
-    return inv_len * inv_len * waves * forcing_vec;
-}
-
-
-// Note: LNT0 and LNRHO0 must be set very carefully: if the magnitude is different that other values in the mesh, then we will inherently lose precision
-#define LNT0 (AcReal(0.0))
-#define LNRHO0 (AcReal(0.0))
-
-#define H_CONST (AcReal(0.0))
-#define C_CONST (AcReal(0.0))
-
-
-
-template <int step_number>
-static __device__ __forceinline__ AcReal
-rk3_integrate(const AcReal state_previous, const AcReal state_current,
-              const AcReal rate_of_change, const AcReal dt)
-{
-    // Williamson (1980)
-    const AcReal alpha[] = {0, AcReal(.0), AcReal(-5. / 9.), AcReal(-153. / 128.)};
-    const AcReal beta[]  = {0, AcReal(1. / 3.), AcReal(15. / 16.),
-                           AcReal(8. / 15.)};
-
-
-    // Note the indexing: +1 to avoid an unnecessary warning about "out-of-bounds"
-    // access (when accessing beta[step_number-1] even when step_number >= 1)
-    switch (step_number) {
-        case 0:
-            return state_current + beta[step_number + 1] * rate_of_change * dt;
-        case 1: // Fallthrough
-        case 2:
-            return state_current +
-               beta[step_number + 1] *
-                   (alpha[step_number + 1] * (AcReal(1.) / beta[step_number]) *
-                        (state_current - state_previous) +
-                    rate_of_change * dt);
-        default:
-            return NAN;
-    }
-}
-/*
-template <int step_number>
-static __device__ __forceinline__ AcReal
-rk3_integrate_scal(const AcReal state_previous, const AcReal state_current,
-              const AcReal rate_of_change, const AcReal dt)
-{
-    // Williamson (1980)
-    const AcReal alpha[] = {AcReal(.0), AcReal(-5. / 9.), AcReal(-153. / 128.)};
-    const AcReal beta[]  = {AcReal(1. / 3.), AcReal(15. / 16.),
-                           AcReal(8. / 15.)};
-
-
-    switch (step_number) {
-        case 0:
-            return state_current + beta[step_number] * rate_of_change * dt;
-        case 1: // Fallthrough
-        case 2:
-            return state_current +
-               beta[step_number] *
-                   (alpha[step_number] * (AcReal(1.) / beta[step_number - 1]) *
-                        (state_current - state_previous) +
-                    rate_of_change * dt);
-        default:
-            return NAN;
-    }
-}
-*/
-
-template <int step_number>
-static __device__ __forceinline__ AcReal3
-rk3_integrate(const AcReal3 state_previous, const AcReal3 state_current,
-              const AcReal3 rate_of_change, const AcReal dt)
-{
-    return (AcReal3) { rk3_integrate<step_number>(state_previous.x, state_current.x, rate_of_change.x, dt),
-                                       rk3_integrate<step_number>(state_previous.y, state_current.y, rate_of_change.y, dt),
-                                       rk3_integrate<step_number>(state_previous.z, state_current.z, rate_of_change.z, dt)};
-}
-
-#define rk3(state_previous, state_current, rate_of_change, dt)\
-rk3_integrate<step_number>(state_previous, value(state_current), rate_of_change, dt)
-
-/*
-template <int step_number>
-static __device__ __forceinline__ AcReal
-rk3_integrate(const int idx, const AcReal out, const int handle,
-              const AcRealData& in_cached, const AcReal rate_of_change, const AcReal dt)
-{
-    return rk3_integrate_scal<step_number>(out, value(in_cached), rate_of_change, dt);
-}
-
-template <int step_number>
-static __device__ __forceinline__ AcReal3
-rk3_integrate(const int idx, const AcReal3 out, const int3& handle,
-                  const AcReal3Data& in_cached, const AcReal3& rate_of_change, const AcReal dt)
-{
-    return (AcReal3) {
-        rk3_integrate<step_number>(idx, out, handle.x, in_cached.x, rate_of_change.x, dt),
-        rk3_integrate<step_number>(idx, out, handle.y, in_cached.y, rate_of_change.y, dt),
-        rk3_integrate<step_number>(idx, out, handle.z, in_cached.z, rate_of_change.z, dt)
-    };
-}
-
-#define RK3(handle, in_cached, rate_of_change, dt) \
-rk3_integrate<step_number>(idx, buffer.out, handle, in_cached, rate_of_change, dt)
-*/
-
-/*
- * =============================================================================
- * Level 1.3 (Kernels)
- * =============================================================================
- */
-
-static __device__ void
-write(AcReal* __restrict__ out[], const int handle, const int idx, const AcReal value)
-{
-    out[handle][idx] = value;
-}
-
-static __device__ void
-write(AcReal* __restrict__ out[], const int3 vec, const int idx, const AcReal3 value)
-{
-    write(out, vec.x, idx, value.x);
-    write(out, vec.y, idx, value.y);
-    write(out, vec.z, idx, value.z);
-}
-
-static __device__ AcReal
-read_out(const int idx, AcReal* __restrict__ field[], const int handle)
-{
-    return field[handle][idx];
-}
-
-static __device__ AcReal3
-read_out(const int idx, AcReal* __restrict__ field[], const int3 handle)
-{
-    return (AcReal3) { read_out(idx, field, handle.x),
-                                       read_out(idx, field, handle.y),
-                                       read_out(idx, field, handle.z) };
-}
-
-#define WRITE_OUT(handle, value) (write(buffer.out, handle, idx, value))
-#define READ(handle) (read_data(vertexIdx, buffer.in, handle))
-#define READ_OUT(handle) (read_out(idx, buffer.out, handle))
-
-// also write for clarity here also, not for the DSL
-//#define WRITE(HANDLE) (write(idx, ...)) s.t. we don't have to insert boilerplat in the mid of the function
-
-#define GEN_KERNEL_PARAM_BOILERPLATE \
-        const int3 start, const int3 end, VertexBufferArray buffer
-
-#define GEN_KERNEL_BUILTIN_VARIABLES_BOILERPLATE() \
-        const int3 vertexIdx = (int3){threadIdx.x + blockIdx.x * blockDim.x + start.x,\
-                                                            threadIdx.y + blockIdx.y * blockDim.y + start.y,\
-                                                            threadIdx.z + blockIdx.z * blockDim.z + start.z};\
-        if (vertexIdx.x >= end.x || vertexIdx.y >= end.y || vertexIdx.z >= end.z)\
-            return;\
-\
-\
-        assert(vertexIdx.x < DCONST(AC_nx_max) && vertexIdx.y < DCONST(AC_ny_max) &&\
-               vertexIdx.z < DCONST(AC_nz_max));\
-\
-        assert(vertexIdx.x >= DCONST(AC_nx_min) && vertexIdx.y >= DCONST(AC_ny_min) &&\
-               vertexIdx.z >= DCONST(AC_nz_min));\
-\
-        const int idx          = IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z);
-
-#include "stencil_process.cuh"
-
-/*
- * =============================================================================
- * Level 2 (Host calls)
- * =============================================================================
- */
-
-static AcReal
-randf(void)
-{
-    return AcReal(rand()) / AcReal(RAND_MAX);
-}
-
-AcResult
-rk3_step_async(const cudaStream_t stream, const dim3& tpb,
-               const int3& start, const int3& end, const int& step_number,
-               const AcReal dt, const AcMeshInfo& /*mesh_info*/,
-               VertexBufferArray* buffer)
-{
-    /////////////////// Forcing
-    #if LFORCING
-    const AcReal ff_scale = AcReal(.2);
-    static AcReal3 ff = ff_scale * (AcReal3){1, 0, 0};
-    const AcReal radians = randf() * AcReal(2*M_PI) / 360 / 8;
-    const AcMatrix rotz = create_rotz(radians);
-    ff = mul(rotz, ff);
-    cudaMemcpyToSymbolAsync(forcing_vec, &ff, sizeof(ff), 0, cudaMemcpyHostToDevice, stream);
-
-    const AcReal ff_phi = AcReal(M_PI);//AcReal(2 * M_PI) * randf();
-    cudaMemcpyToSymbolAsync(forcing_phi, &ff_phi, sizeof(ff_phi), 0, cudaMemcpyHostToDevice, stream);
-    #endif // LFORCING
-    //////////////////////////
-
-    const int nx = end.x - start.x;
-    const int ny = end.y - start.y;
-    const int nz = end.z - start.z;
-
-    const dim3 bpg(
-        (unsigned int)ceil(nx / AcReal(tpb.x)),
-        (unsigned int)ceil(ny / AcReal(tpb.y)),
-        (unsigned int)ceil(nz / AcReal(tpb.z)));
-
-
-    if (step_number == 0)
-        solve<0><<<bpg, tpb, 0, stream>>>(start, end, *buffer, dt);
-    else if (step_number == 1)
-        solve<1><<<bpg, tpb, 0, stream>>>(start, end, *buffer, dt);
-    else
-        solve<2><<<bpg, tpb, 0, stream>>>(start, end, *buffer, dt);
-
-    ERRCHK_CUDA_KERNEL();
-    return AC_SUCCESS;
-}
diff --git a/src/core/kernels/integration.cu b/src/core/kernels/integration.cu
deleted file mode 100644
index a2377da..0000000
--- a/src/core/kernels/integration.cu
+++ /dev/null
@@ -1,293 +0,0 @@
-/*
-    Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.
-
-    This file is part of Astaroth.
-
-    Astaroth is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    Astaroth is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
-*/
-
-/**
- * @file
- * \brief Brief info.
- *
- * Detailed info.
- *
- */
-#include "common.cuh"
-#include "integration.cuh"
-
-#include "src/core/errchk.h"
-#include "src/core/math_utils.h"
-
-#include <assert.h>
-
-__constant__ AcMeshInfo d_mesh_info; // Extern in kernels/common.cuh.
-
-static_assert(NUM_VTXBUF_HANDLES > 0, "ERROR: At least one uniform ScalarField must be declared.");
-
-// Device info
-#define REGISTERS_PER_THREAD (255)
-#define MAX_REGISTERS_PER_BLOCK (65536)
-#define MAX_THREADS_PER_BLOCK (1024)
-#define WARP_SIZE (32)
-
-#define make_int3(a, b, c)                                                                         \
-    (int3) { (int)a, (int)b, (int)c }
-
-template <int step_number>
-static __device__ __forceinline__ AcReal
-rk3_integrate(const AcReal state_previous, const AcReal state_current, const AcReal rate_of_change,
-              const AcReal dt)
-{
-    // Williamson (1980)
-    const AcReal alpha[] = {0, AcReal(.0), AcReal(-5. / 9.), AcReal(-153. / 128.)};
-    const AcReal beta[]  = {0, AcReal(1. / 3.), AcReal(15. / 16.), AcReal(8. / 15.)};
-
-    // Note the indexing: +1 to avoid an unnecessary warning about "out-of-bounds"
-    // access (when accessing beta[step_number-1] even when step_number >= 1)
-    switch (step_number) {
-    case 0:
-        return state_current + beta[step_number + 1] * rate_of_change * dt;
-    case 1: // Fallthrough
-    case 2:
-        return state_current +
-               beta[step_number + 1] * (alpha[step_number + 1] * (AcReal(1.) / beta[step_number]) *
-                                            (state_current - state_previous) +
-                                        rate_of_change * dt);
-    default:
-        return NAN;
-    }
-}
-
-template <int step_number>
-static __device__ __forceinline__ AcReal3
-rk3_integrate(const AcReal3 state_previous, const AcReal3 state_current,
-              const AcReal3 rate_of_change, const AcReal dt)
-{
-    return (AcReal3){
-        rk3_integrate<step_number>(state_previous.x, state_current.x, rate_of_change.x, dt),
-        rk3_integrate<step_number>(state_previous.y, state_current.y, rate_of_change.y, dt),
-        rk3_integrate<step_number>(state_previous.z, state_current.z, rate_of_change.z, dt)};
-}
-
-#define rk3(state_previous, state_current, rate_of_change, dt)                                     \
-    rk3_integrate<step_number>(state_previous, value(state_current), rate_of_change, dt)
-
-static __device__ void
-write(AcReal* __restrict__ out[], const int handle, const int idx, const AcReal value)
-{
-    out[handle][idx] = value;
-}
-
-static __device__ __forceinline__ void
-write(AcReal* __restrict__ out[], const int3 vec, const int idx, const AcReal3 value)
-{
-    write(out, vec.x, idx, value.x);
-    write(out, vec.y, idx, value.y);
-    write(out, vec.z, idx, value.z);
-}
-
-static __device__ __forceinline__ AcReal
-read_out(const int idx, AcReal* __restrict__ field[], const int handle)
-{
-    return field[handle][idx];
-}
-
-static __device__ __forceinline__ AcReal3
-read_out(const int idx, AcReal* __restrict__ field[], const int3 handle)
-{
-    return (AcReal3){read_out(idx, field, handle.x), read_out(idx, field, handle.y),
-                     read_out(idx, field, handle.z)};
-}
-
-#define WRITE_OUT(handle, value) (write(buffer.out, handle, idx, value))
-#define READ(handle) (read_data(vertexIdx, globalVertexIdx, buffer.in, handle))
-#define READ_OUT(handle) (read_out(idx, buffer.out, handle))
-
-#define GEN_PREPROCESSED_PARAM_BOILERPLATE const int3 &vertexIdx, const int3 &globalVertexIdx
-#define GEN_KERNEL_PARAM_BOILERPLATE const int3 start, const int3 end, VertexBufferArray buffer
-
-#define GEN_KERNEL_BUILTIN_VARIABLES_BOILERPLATE()                                                 \
-    const int3 vertexIdx       = (int3){threadIdx.x + blockIdx.x * blockDim.x + start.x,           \
-                                  threadIdx.y + blockIdx.y * blockDim.y + start.y,           \
-                                  threadIdx.z + blockIdx.z * blockDim.z + start.z};          \
-    const int3 globalVertexIdx = (int3){d_multigpu_offset.x + vertexIdx.x,                         \
-                                        d_multigpu_offset.y + vertexIdx.y,                         \
-                                        d_multigpu_offset.z + vertexIdx.z};                        \
-    (void)globalVertexIdx;                                                                         \
-    if (vertexIdx.x >= end.x || vertexIdx.y >= end.y || vertexIdx.z >= end.z)                      \
-        return;                                                                                    \
-                                                                                                   \
-    assert(vertexIdx.x < DCONST(AC_nx_max) && vertexIdx.y < DCONST(AC_ny_max) &&                   \
-           vertexIdx.z < DCONST(AC_nz_max));                                                       \
-                                                                                                   \
-    assert(vertexIdx.x >= DCONST(AC_nx_min) && vertexIdx.y >= DCONST(AC_ny_min) &&                 \
-           vertexIdx.z >= DCONST(AC_nz_min));                                                      \
-                                                                                                   \
-    const int idx = IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z);
-
-#define GEN_DEVICE_FUNC_HOOK(identifier)                                                           \
-    template <int step_number>                                                                     \
-    AcResult acDeviceKernel_##identifier(const cudaStream_t stream, const int3 start,              \
-                                         const int3 end, VertexBufferArray vba)                    \
-    {                                                                                              \
-                                                                                                   \
-        const dim3 tpb(32, 1, 4);                                                                  \
-                                                                                                   \
-        const int3 n = end - start;                                                                \
-        const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)),                                    \
-                       (unsigned int)ceil(n.y / AcReal(tpb.y)),                                    \
-                       (unsigned int)ceil(n.z / AcReal(tpb.z)));                                   \
-                                                                                                   \
-        identifier<step_number><<<bpg, tpb, 0, stream>>>(start, end, vba);                         \
-        ERRCHK_CUDA_KERNEL();                                                                      \
-                                                                                                   \
-        return AC_SUCCESS;                                                                         \
-    }
-
-#include "user_kernels.h"
-
-static dim3 rk3_tpb(32, 1, 4);
-
-AcResult
-acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferArray vba)
-{
-    // RK3
-    dim3 best_dims(0, 0, 0);
-    float best_time          = INFINITY;
-    const int num_iterations = 10;
-
-    for (int z = 1; z <= MAX_THREADS_PER_BLOCK; ++z) {
-        for (int y = 1; y <= MAX_THREADS_PER_BLOCK; ++y) {
-            for (int x = WARP_SIZE; x <= MAX_THREADS_PER_BLOCK; x += WARP_SIZE) {
-
-                if (x > end.x - start.x || y > end.y - start.y || z > end.z - start.z)
-                    break;
-                if (x * y * z > MAX_THREADS_PER_BLOCK)
-                    break;
-
-                if (x * y * z * REGISTERS_PER_THREAD > MAX_REGISTERS_PER_BLOCK)
-                    break;
-
-                if (((x * y * z) % WARP_SIZE) != 0)
-                    continue;
-
-                const dim3 tpb(x, y, z);
-                const int3 n = end - start;
-                const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), //
-                               (unsigned int)ceil(n.y / AcReal(tpb.y)), //
-                               (unsigned int)ceil(n.z / AcReal(tpb.z)));
-
-                cudaDeviceSynchronize();
-                if (cudaGetLastError() != cudaSuccess) // resets the error if any
-                    continue;
-
-                // printf("(%d, %d, %d)\n", x, y, z);
-
-                cudaEvent_t tstart, tstop;
-                cudaEventCreate(&tstart);
-                cudaEventCreate(&tstop);
-
-                // #ifdef AC_dt
-                // acDeviceLoadScalarUniform(device, STREAM_DEFAULT, AC_dt, FLT_EPSILON); // TODO
-                // note, temporarily disabled
-                /*#else
-                                ERROR("FATAL ERROR: acDeviceAutoOptimize() or
-                acDeviceIntegrateSubstep() was " "called, but AC_dt was not defined. Either define
-                it or call the generated " "device function acDeviceKernel_<kernel name> which does
-                not require the " "timestep to be defined.\n"); #endif*/
-
-                cudaEventRecord(tstart); // ---------------------------------------- Timing start
-                for (int i = 0; i < num_iterations; ++i)
-                    solve<2><<<bpg, tpb>>>(start, end, vba);
-
-                cudaEventRecord(tstop); // ----------------------------------------- Timing end
-                cudaEventSynchronize(tstop);
-                float milliseconds = 0;
-                cudaEventElapsedTime(&milliseconds, tstart, tstop);
-
-                ERRCHK_CUDA_KERNEL_ALWAYS();
-                if (milliseconds < best_time) {
-                    best_time = milliseconds;
-                    best_dims = tpb;
-                }
-            }
-        }
-    }
-#if VERBOSE_PRINTING
-    printf(
-        "Auto-optimization done. The best threadblock dimensions for rkStep: (%d, %d, %d) %f ms\n",
-        best_dims.x, best_dims.y, best_dims.z, double(best_time) / num_iterations);
-#endif
-    /*
-    FILE* fp = fopen("../config/rk3_tbdims.cuh", "w");
-    ERRCHK(fp);
-    fprintf(fp, "%d, %d, %d\n", best_dims.x, best_dims.y, best_dims.z);
-    fclose(fp);
-    */
-
-    rk3_tpb = best_dims;
-    return AC_SUCCESS;
-}
-
-AcResult
-acKernelIntegrateSubstep(const cudaStream_t stream, const int step_number, const int3 start,
-                         const int3 end, VertexBufferArray vba)
-{
-    const dim3 tpb = rk3_tpb;
-
-    const int3 n = end - start;
-    const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), //
-                   (unsigned int)ceil(n.y / AcReal(tpb.y)), //
-                   (unsigned int)ceil(n.z / AcReal(tpb.z)));
-
-    //#ifdef AC_dt
-    // acDeviceLoadScalarUniform(device, stream, AC_dt, dt);
-    /*#else
-        (void)dt;
-        ERROR("FATAL ERROR: acDeviceAutoOptimize() or acDeviceIntegrateSubstep() was "
-              "called, but AC_dt was not defined. Either define it or call the generated "
-              "device function acDeviceKernel_<kernel name> which does not require the "
-              "timestep to be defined.\n");
-    #endif*/
-    if (step_number == 0)
-        solve<0><<<bpg, tpb, 0, stream>>>(start, end, vba);
-    else if (step_number == 1)
-        solve<1><<<bpg, tpb, 0, stream>>>(start, end, vba);
-    else
-        solve<2><<<bpg, tpb, 0, stream>>>(start, end, vba);
-
-    ERRCHK_CUDA_KERNEL();
-
-    return AC_SUCCESS;
-}
-
-static __global__ void
-dummy_kernel(void)
-{
-    DCONST((AcIntParam)0);
-    DCONST((AcInt3Param)0);
-    DCONST((AcRealParam)0);
-    DCONST((AcReal3Param)0);
-    acComplex a = exp(AcReal(1) * acComplex(1, 1) * AcReal(1));
-    a* a;
-}
-
-AcResult
-acKernelDummy(void)
-{
-    dummy_kernel<<<1, 1>>>();
-    ERRCHK_CUDA_KERNEL_ALWAYS();
-    return AC_SUCCESS;
-}
diff --git a/src/core/kernels/integration.cuh b/src/core/kernels/integration.cuh
index 9fb5095..8d66fd2 100644
--- a/src/core/kernels/integration.cuh
+++ b/src/core/kernels/integration.cuh
@@ -1,42 +1,259 @@
-/*
-    Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.
-
-    This file is part of Astaroth.
-
-    Astaroth is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    Astaroth is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
-*/
-
-/**
- * @file
- * \brief Brief info.
- *
- * Detailed info.
- *
- */
 #pragma once
 
-#ifdef __cplusplus
-extern "C" {
+static_assert(NUM_VTXBUF_HANDLES > 0, "ERROR: At least one uniform ScalarField must be declared.");
+
+// Device info
+#define REGISTERS_PER_THREAD (255)
+#define MAX_REGISTERS_PER_BLOCK (65536)
+#define MAX_THREADS_PER_BLOCK (1024)
+#define WARP_SIZE (32)
+
+#define make_int3(a, b, c)                                                                         \
+    (int3) { (int)a, (int)b, (int)c }
+
+template <int step_number>
+static __device__ __forceinline__ AcReal
+rk3_integrate(const AcReal state_previous, const AcReal state_current, const AcReal rate_of_change,
+              const AcReal dt)
+{
+    // Williamson (1980)
+    const AcReal alpha[] = {0, AcReal(.0), AcReal(-5. / 9.), AcReal(-153. / 128.)};
+    const AcReal beta[]  = {0, AcReal(1. / 3.), AcReal(15. / 16.), AcReal(8. / 15.)};
+
+    // Note the indexing: +1 to avoid an unnecessary warning about "out-of-bounds"
+    // access (when accessing beta[step_number-1] even when step_number >= 1)
+    switch (step_number) {
+    case 0:
+        return state_current + beta[step_number + 1] * rate_of_change * dt;
+    case 1: // Fallthrough
+    case 2:
+        return state_current +
+               beta[step_number + 1] * (alpha[step_number + 1] * (AcReal(1.) / beta[step_number]) *
+                                            (state_current - state_previous) +
+                                        rate_of_change * dt);
+    default:
+        return NAN;
+    }
+}
+
+template <int step_number>
+static __device__ __forceinline__ AcReal3
+rk3_integrate(const AcReal3 state_previous, const AcReal3 state_current,
+              const AcReal3 rate_of_change, const AcReal dt)
+{
+    return (AcReal3){
+        rk3_integrate<step_number>(state_previous.x, state_current.x, rate_of_change.x, dt),
+        rk3_integrate<step_number>(state_previous.y, state_current.y, rate_of_change.y, dt),
+        rk3_integrate<step_number>(state_previous.z, state_current.z, rate_of_change.z, dt)};
+}
+
+#define rk3(state_previous, state_current, rate_of_change, dt)                                     \
+    rk3_integrate<step_number>(state_previous, value(state_current), rate_of_change, dt)
+
+static __device__ void
+write(AcReal* __restrict__ out[], const int handle, const int idx, const AcReal value)
+{
+    out[handle][idx] = value;
+}
+
+static __device__ __forceinline__ void
+write(AcReal* __restrict__ out[], const int3 vec, const int idx, const AcReal3 value)
+{
+    write(out, vec.x, idx, value.x);
+    write(out, vec.y, idx, value.y);
+    write(out, vec.z, idx, value.z);
+}
+
+static __device__ __forceinline__ AcReal
+read_out(const int idx, AcReal* __restrict__ field[], const int handle)
+{
+    return field[handle][idx];
+}
+
+static __device__ __forceinline__ AcReal3
+read_out(const int idx, AcReal* __restrict__ field[], const int3 handle)
+{
+    return (AcReal3){read_out(idx, field, handle.x), read_out(idx, field, handle.y),
+                     read_out(idx, field, handle.z)};
+}
+
+#define WRITE_OUT(handle, value) (write(buffer.out, handle, idx, value))
+#define READ(handle) (read_data(vertexIdx, globalVertexIdx, buffer.in, handle))
+#define READ_OUT(handle) (read_out(idx, buffer.out, handle))
+
+#define GEN_PREPROCESSED_PARAM_BOILERPLATE const int3 &vertexIdx, const int3 &globalVertexIdx
+#define GEN_KERNEL_PARAM_BOILERPLATE const int3 start, const int3 end, VertexBufferArray buffer
+
+#define GEN_KERNEL_BUILTIN_VARIABLES_BOILERPLATE()                                                 \
+    const int3 vertexIdx       = (int3){threadIdx.x + blockIdx.x * blockDim.x + start.x,           \
+                                  threadIdx.y + blockIdx.y * blockDim.y + start.y,           \
+                                  threadIdx.z + blockIdx.z * blockDim.z + start.z};          \
+    const int3 globalVertexIdx = (int3){d_multigpu_offset.x + vertexIdx.x,                         \
+                                        d_multigpu_offset.y + vertexIdx.y,                         \
+                                        d_multigpu_offset.z + vertexIdx.z};                        \
+    (void)globalVertexIdx;                                                                         \
+    if (vertexIdx.x >= end.x || vertexIdx.y >= end.y || vertexIdx.z >= end.z)                      \
+        return;                                                                                    \
+                                                                                                   \
+    assert(vertexIdx.x < DCONST(AC_nx_max) && vertexIdx.y < DCONST(AC_ny_max) &&                   \
+           vertexIdx.z < DCONST(AC_nz_max));                                                       \
+                                                                                                   \
+    assert(vertexIdx.x >= DCONST(AC_nx_min) && vertexIdx.y >= DCONST(AC_ny_min) &&                 \
+           vertexIdx.z >= DCONST(AC_nz_min));                                                      \
+                                                                                                   \
+    const int idx = IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z);
+
+#define GEN_DEVICE_FUNC_HOOK(identifier)                                                           \
+    template <int step_number>                                                                     \
+    AcResult acDeviceKernel_##identifier(const cudaStream_t stream, const int3 start,              \
+                                         const int3 end, VertexBufferArray vba)                    \
+    {                                                                                              \
+                                                                                                   \
+        const dim3 tpb(32, 1, 4);                                                                  \
+                                                                                                   \
+        const int3 n = end - start;                                                                \
+        const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)),                                    \
+                       (unsigned int)ceil(n.y / AcReal(tpb.y)),                                    \
+                       (unsigned int)ceil(n.z / AcReal(tpb.z)));                                   \
+                                                                                                   \
+        identifier<step_number><<<bpg, tpb, 0, stream>>>(start, end, vba);                         \
+        ERRCHK_CUDA_KERNEL();                                                                      \
+                                                                                                   \
+        return AC_SUCCESS;                                                                         \
+    }
+
+#include "user_kernels.h"
+
+static dim3 rk3_tpb(32, 1, 4);
+
+AcResult
+acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferArray vba)
+{
+    // RK3
+    dim3 best_dims(0, 0, 0);
+    float best_time          = INFINITY;
+    const int num_iterations = 10;
+
+    for (int z = 1; z <= MAX_THREADS_PER_BLOCK; ++z) {
+        for (int y = 1; y <= MAX_THREADS_PER_BLOCK; ++y) {
+            for (int x = WARP_SIZE; x <= MAX_THREADS_PER_BLOCK; x += WARP_SIZE) {
+
+                if (x > end.x - start.x || y > end.y - start.y || z > end.z - start.z)
+                    break;
+                if (x * y * z > MAX_THREADS_PER_BLOCK)
+                    break;
+
+                if (x * y * z * REGISTERS_PER_THREAD > MAX_REGISTERS_PER_BLOCK)
+                    break;
+
+                if (((x * y * z) % WARP_SIZE) != 0)
+                    continue;
+
+                const dim3 tpb(x, y, z);
+                const int3 n = end - start;
+                const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), //
+                               (unsigned int)ceil(n.y / AcReal(tpb.y)), //
+                               (unsigned int)ceil(n.z / AcReal(tpb.z)));
+
+                cudaDeviceSynchronize();
+                if (cudaGetLastError() != cudaSuccess) // resets the error if any
+                    continue;
+
+                // printf("(%d, %d, %d)\n", x, y, z);
+
+                cudaEvent_t tstart, tstop;
+                cudaEventCreate(&tstart);
+                cudaEventCreate(&tstop);
+
+                // #ifdef AC_dt
+                // acDeviceLoadScalarUniform(device, STREAM_DEFAULT, AC_dt, FLT_EPSILON); // TODO
+                // note, temporarily disabled
+                /*#else
+                                ERROR("FATAL ERROR: acDeviceAutoOptimize() or
+                acDeviceIntegrateSubstep() was " "called, but AC_dt was not defined. Either define
+                it or call the generated " "device function acDeviceKernel_<kernel name> which does
+                not require the " "timestep to be defined.\n"); #endif*/
+
+                cudaEventRecord(tstart); // ---------------------------------------- Timing start
+                for (int i = 0; i < num_iterations; ++i)
+                    solve<2><<<bpg, tpb>>>(start, end, vba);
+
+                cudaEventRecord(tstop); // ----------------------------------------- Timing end
+                cudaEventSynchronize(tstop);
+                float milliseconds = 0;
+                cudaEventElapsedTime(&milliseconds, tstart, tstop);
+
+                ERRCHK_CUDA_KERNEL_ALWAYS();
+                if (milliseconds < best_time) {
+                    best_time = milliseconds;
+                    best_dims = tpb;
+                }
+            }
+        }
+    }
+#if VERBOSE_PRINTING
+    printf(
+        "Auto-optimization done. The best threadblock dimensions for rkStep: (%d, %d, %d) %f ms\n",
+        best_dims.x, best_dims.y, best_dims.z, double(best_time) / num_iterations);
 #endif
+    /*
+    FILE* fp = fopen("../config/rk3_tbdims.cuh", "w");
+    ERRCHK(fp);
+    fprintf(fp, "%d, %d, %d\n", best_dims.x, best_dims.y, best_dims.z);
+    fclose(fp);
+    */
 
-AcResult acKernelDummy(void);
+    rk3_tpb = best_dims;
+    return AC_SUCCESS;
+}
 
-AcResult acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferArray vba);
+AcResult
+acKernelIntegrateSubstep(const cudaStream_t stream, const int step_number, const int3 start,
+                         const int3 end, VertexBufferArray vba)
+{
+    const dim3 tpb = rk3_tpb;
 
-AcResult acKernelIntegrateSubstep(const cudaStream_t stream, const int step_number,
-                                  const int3 start, const int3 end, VertexBufferArray vba);
+    const int3 n = end - start;
+    const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), //
+                   (unsigned int)ceil(n.y / AcReal(tpb.y)), //
+                   (unsigned int)ceil(n.z / AcReal(tpb.z)));
 
-#ifdef __cplusplus
-} // extern "C"
-#endif
+    //#ifdef AC_dt
+    // acDeviceLoadScalarUniform(device, stream, AC_dt, dt);
+    /*#else
+        (void)dt;
+        ERROR("FATAL ERROR: acDeviceAutoOptimize() or acDeviceIntegrateSubstep() was "
+              "called, but AC_dt was not defined. Either define it or call the generated "
+              "device function acDeviceKernel_<kernel name> which does not require the "
+              "timestep to be defined.\n");
+    #endif*/
+    if (step_number == 0)
+        solve<0><<<bpg, tpb, 0, stream>>>(start, end, vba);
+    else if (step_number == 1)
+        solve<1><<<bpg, tpb, 0, stream>>>(start, end, vba);
+    else
+        solve<2><<<bpg, tpb, 0, stream>>>(start, end, vba);
+
+    ERRCHK_CUDA_KERNEL();
+
+    return AC_SUCCESS;
+}
+
+static __global__ void
+dummy_kernel(void)
+{
+    DCONST((AcIntParam)0);
+    DCONST((AcInt3Param)0);
+    DCONST((AcRealParam)0);
+    DCONST((AcReal3Param)0);
+    acComplex a = exp(AcReal(1) * acComplex(1, 1) * AcReal(1));
+    a* a;
+}
+
+AcResult
+acKernelDummy(void)
+{
+    dummy_kernel<<<1, 1>>>();
+    ERRCHK_CUDA_KERNEL_ALWAYS();
+    return AC_SUCCESS;
+}
diff --git a/src/core/kernels/kernels.cu b/src/core/kernels/kernels.cu
new file mode 100644
index 0000000..fbcdd17
--- /dev/null
+++ b/src/core/kernels/kernels.cu
@@ -0,0 +1,177 @@
+#include "kernels.h"
+
+#include <assert.h>
+#include <cuComplex.h>
+
+#include "errchk.h"
+#include "math_utils.h"
+
+__device__ __constant__ AcMeshInfo d_mesh_info;
+
+static int __device__ __forceinline__
+DCONST(const AcIntParam param)
+{
+    return d_mesh_info.int_params[param];
+}
+static int3 __device__ __forceinline__
+DCONST(const AcInt3Param param)
+{
+    return d_mesh_info.int3_params[param];
+}
+static AcReal __device__ __forceinline__
+DCONST(const AcRealParam param)
+{
+    return d_mesh_info.real_params[param];
+}
+static AcReal3 __device__ __forceinline__
+DCONST(const AcReal3Param param)
+{
+    return d_mesh_info.real3_params[param];
+}
+static __device__ constexpr VertexBufferHandle
+DCONST(const VertexBufferHandle handle)
+{
+    return handle;
+}
+#define DEVICE_VTXBUF_IDX(i, j, k) ((i) + (j)*DCONST(AC_mx) + (k)*DCONST(AC_mxy))
+#define DEVICE_1D_COMPDOMAIN_IDX(i, j, k) ((i) + (j)*DCONST(AC_nx) + (k)*DCONST(AC_nxy))
+#define globalGridN (d_mesh_info.int3_params[AC_global_grid_n])
+//#define globalMeshM // Placeholder
+//#define localMeshN // Placeholder
+//#define localMeshM // Placeholder
+//#define localMeshN_min // Placeholder
+//#define globalMeshN_min // Placeholder
+#define d_multigpu_offset (d_mesh_info.int3_params[AC_multigpu_offset])
+//#define d_multinode_offset (d_mesh_info.int3_params[AC_multinode_offset]) // Placeholder
+
+static __device__ constexpr int
+IDX(const int i)
+{
+    return i;
+}
+
+static __device__ __forceinline__ int
+IDX(const int i, const int j, const int k)
+{
+    return DEVICE_VTXBUF_IDX(i, j, k);
+}
+
+static __device__ __forceinline__ int
+IDX(const int3 idx)
+{
+    return DEVICE_VTXBUF_IDX(idx.x, idx.y, idx.z);
+}
+
+#if AC_DOUBLE_PRECISION == 1
+typedef cuDoubleComplex acComplex;
+#define acComplex(x, y) make_cuDoubleComplex(x, y)
+#else
+typedef cuFloatComplex acComplex;
+#define acComplex(x, y) make_cuFloatComplex(x, y)
+#define exp(x) expf(x)
+#define sin(x) sinf(x)
+#define cos(x) cosf(x)
+#define sqrt(x) sqrtf(x)
+#endif
+
+static __device__ inline acComplex
+exp(const acComplex& val)
+{
+    return acComplex(exp(val.x) * cos(val.y), exp(val.x) * sin(val.y));
+}
+static __device__ inline acComplex operator*(const AcReal& a, const acComplex& b)
+{
+    return (acComplex){a * b.x, a * b.y};
+}
+
+static __device__ inline acComplex operator*(const acComplex& b, const AcReal& a)
+{
+    return (acComplex){a * b.x, a * b.y};
+}
+
+static __device__ inline acComplex operator*(const acComplex& a, const acComplex& b)
+{
+    return (acComplex){a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x};
+}
+
+// Kernels /////////////////////////////////////////////////////////////////////
+#include "boundconds.cuh"
+#include "integration.cuh"
+#include "packing.cuh"
+#include "reductions.cuh"
+
+AcResult
+acDeviceLoadMeshInfo(const Device device, const Stream stream, const AcMeshInfo device_config)
+{
+    cudaSetDevice(device->id);
+
+    ERRCHK_ALWAYS(device_config.int_params[AC_nx] == device->local_config.int_params[AC_nx]);
+    ERRCHK_ALWAYS(device_config.int_params[AC_ny] == device->local_config.int_params[AC_ny]);
+    ERRCHK_ALWAYS(device_config.int_params[AC_nz] == device->local_config.int_params[AC_nz]);
+    ERRCHK_ALWAYS(device_config.int_params[AC_multigpu_offset] ==
+                  device->local_config.int_params[AC_multigpu_offset]);
+
+    ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbolAsync(d_mesh_info, &device_config, sizeof(device_config),
+                                               0, cudaMemcpyHostToDevice, device->streams[stream]));
+    return AC_SUCCESS;
+}
+
+AcResult
+acDeviceLoadScalarUniform(const Device device, const Stream stream, const AcRealParam param,
+                          const AcReal value)
+{
+    cudaSetDevice(device->id);
+
+    if (param >= NUM_REAL_PARAMS)
+        return AC_FAILURE;
+
+    const size_t offset = (size_t)&d_mesh_info.real_params[param] - (size_t)&d_mesh_info;
+    ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
+                                        cudaMemcpyHostToDevice, device->streams[stream]));
+    return AC_SUCCESS;
+}
+
+AcResult
+acDeviceLoadVectorUniform(const Device device, const Stream stream, const AcReal3Param param,
+                          const AcReal3 value)
+{
+    cudaSetDevice(device->id);
+
+    if (param >= NUM_REAL3_PARAMS || !NUM_REAL3_PARAMS)
+        return AC_FAILURE;
+
+    const size_t offset = (size_t)&d_mesh_info.real3_params[param] - (size_t)&d_mesh_info;
+    ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
+                                        cudaMemcpyHostToDevice, device->streams[stream]));
+    return AC_SUCCESS;
+}
+
+AcResult
+acDeviceLoadIntUniform(const Device device, const Stream stream, const AcIntParam param,
+                       const int value)
+{
+    cudaSetDevice(device->id);
+
+    if (param >= NUM_INT_PARAMS)
+        return AC_FAILURE;
+
+    const size_t offset = (size_t)&d_mesh_info.int_params[param] - (size_t)&d_mesh_info;
+    ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
+                                        cudaMemcpyHostToDevice, device->streams[stream]));
+    return AC_SUCCESS;
+}
+
+AcResult
+acDeviceLoadInt3Uniform(const Device device, const Stream stream, const AcInt3Param param,
+                        const int3 value)
+{
+    cudaSetDevice(device->id);
+
+    if (param >= NUM_INT3_PARAMS)
+        return AC_FAILURE;
+
+    const size_t offset = (size_t)&d_mesh_info.int3_params[param] - (size_t)&d_mesh_info;
+    ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
+                                        cudaMemcpyHostToDevice, device->streams[stream]));
+    return AC_SUCCESS;
+}
diff --git a/src/core/kernels/kernels.h b/src/core/kernels/kernels.h
new file mode 100644
index 0000000..a0ad336
--- /dev/null
+++ b/src/core/kernels/kernels.h
@@ -0,0 +1,82 @@
+#pragma once
+#include "astaroth.h"
+
+typedef struct {
+    int3 dims;
+    AcReal* data;
+} PackedData;
+
+typedef struct {
+    AcReal* in[NUM_VTXBUF_HANDLES];
+    AcReal* out[NUM_VTXBUF_HANDLES];
+
+    AcReal* profiles[NUM_SCALARARRAY_HANDLES];
+} VertexBufferArray;
+
+struct device_s {
+    int id;
+    AcMeshInfo local_config;
+
+    // Concurrency
+    cudaStream_t streams[NUM_STREAMS];
+
+    // Memory
+    VertexBufferArray vba;
+    AcReal* reduce_scratchpad;
+    AcReal* reduce_result;
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** */
+AcResult acKernelPeriodicBoundconds(const cudaStream_t stream, const int3 start, const int3 end,
+                                    AcReal* vtxbuf);
+
+/** */
+AcResult acKernelDummy(void);
+
+/** */
+AcResult acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferArray vba);
+
+/** */
+AcResult acKernelIntegrateSubstep(const cudaStream_t stream, const int step_number,
+                                  const int3 start, const int3 end, VertexBufferArray vba);
+
+/** */
+AcResult acKernelPackData(const cudaStream_t stream, const VertexBufferArray vba,
+                          const int3 vba_start, PackedData packed);
+
+/** */
+AcResult acKernelUnpackData(const cudaStream_t stream, const PackedData packed,
+                            const int3 vba_start, VertexBufferArray vba);
+
+/** */
+AcReal acKernelReduceScal(const cudaStream_t stream, const ReductionType rtype, const int3 start,
+                          const int3 end, const AcReal* vtxbuf, AcReal* scratchpad,
+                          AcReal* reduce_result);
+
+/** */
+AcReal acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, const int3 start,
+                         const int3 end, const AcReal* vtxbuf0, const AcReal* vtxbuf1,
+                         const AcReal* vtxbuf2, AcReal* scratchpad, AcReal* reduce_result);
+
+AcResult acDeviceLoadMeshInfo(const Device device, const Stream stream,
+                              const AcMeshInfo device_config);
+
+AcResult acDeviceLoadScalarUniform(const Device device, const Stream stream,
+                                   const AcRealParam param, const AcReal value);
+
+AcResult acDeviceLoadVectorUniform(const Device device, const Stream stream,
+                                   const AcReal3Param param, const AcReal3 value);
+
+AcResult acDeviceLoadIntUniform(const Device device, const Stream stream, const AcIntParam param,
+                                const int value);
+
+AcResult acDeviceLoadInt3Uniform(const Device device, const Stream stream, const AcInt3Param param,
+                                 const int3 value);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
diff --git a/src/core/kernels/packing.cu b/src/core/kernels/packing.cu
deleted file mode 100644
index d5e70a3..0000000
--- a/src/core/kernels/packing.cu
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
-    Copyright (C) 2014-2020, Johannes Pekkilae, Miikka Vaeisalae.
-
-    This file is part of Astaroth.
-
-    Astaroth is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    Astaroth is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
-*/
-
-/**
- * @file
- * \brief Brief info.
- *
- * Detailed info.
- *
- */
-#include "packing.cuh"
-
-#include "src/core/errchk.h"
-
-__global__ void
-kernel_pack_data(const VertexBufferArray vba, const int3 vba_start, PackedData packed)
-{
-    const int i_packed = threadIdx.x + blockIdx.x * blockDim.x;
-    const int j_packed = threadIdx.y + blockIdx.y * blockDim.y;
-    const int k_packed = threadIdx.z + blockIdx.z * blockDim.z;
-
-    // If within the start-end range (this allows threadblock dims that are not
-    // divisible by end - start)
-    if (i_packed >= packed.dims.x || //
-        j_packed >= packed.dims.y || //
-        k_packed >= packed.dims.z) {
-        return;
-    }
-
-    const int i_unpacked = i_packed + vba_start.x;
-    const int j_unpacked = j_packed + vba_start.y;
-    const int k_unpacked = k_packed + vba_start.z;
-
-    const int unpacked_idx = DEVICE_VTXBUF_IDX(i_unpacked, j_unpacked, k_unpacked);
-    const int packed_idx   = i_packed +               //
-                           j_packed * packed.dims.x + //
-                           k_packed * packed.dims.x * packed.dims.y;
-
-    const size_t vtxbuf_offset = packed.dims.x * packed.dims.y * packed.dims.z;
-
-    //#pragma unroll
-    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i)
-        packed.data[packed_idx + i * vtxbuf_offset] = vba.in[i][unpacked_idx];
-}
-
-__global__ void
-kernel_unpack_data(const PackedData packed, const int3 vba_start, VertexBufferArray vba)
-{
-    const int i_packed = threadIdx.x + blockIdx.x * blockDim.x;
-    const int j_packed = threadIdx.y + blockIdx.y * blockDim.y;
-    const int k_packed = threadIdx.z + blockIdx.z * blockDim.z;
-
-    // If within the start-end range (this allows threadblock dims that are not
-    // divisible by end - start)
-    if (i_packed >= packed.dims.x || //
-        j_packed >= packed.dims.y || //
-        k_packed >= packed.dims.z) {
-        return;
-    }
-
-    const int i_unpacked = i_packed + vba_start.x;
-    const int j_unpacked = j_packed + vba_start.y;
-    const int k_unpacked = k_packed + vba_start.z;
-
-    const int unpacked_idx = DEVICE_VTXBUF_IDX(i_unpacked, j_unpacked, k_unpacked);
-    const int packed_idx   = i_packed +               //
-                           j_packed * packed.dims.x + //
-                           k_packed * packed.dims.x * packed.dims.y;
-
-    const size_t vtxbuf_offset = packed.dims.x * packed.dims.y * packed.dims.z;
-
-    //#pragma unroll
-    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i)
-        vba.in[i][unpacked_idx] = packed.data[packed_idx + i * vtxbuf_offset];
-}
-
-AcResult
-acKernelPackData(const cudaStream_t stream, const VertexBufferArray vba, const int3 vba_start,
-                 PackedData packed)
-{
-    const dim3 tpb(32, 8, 1);
-    const dim3 bpg((unsigned int)ceil(packed.dims.x / (float)tpb.x),
-                   (unsigned int)ceil(packed.dims.y / (float)tpb.y),
-                   (unsigned int)ceil(packed.dims.z / (float)tpb.z));
-
-    kernel_pack_data<<<bpg, tpb, 0, stream>>>(vba, vba_start, packed);
-    ERRCHK_CUDA_KERNEL();
-
-    return AC_SUCCESS;
-}
-
-AcResult
-acKernelUnpackData(const cudaStream_t stream, const PackedData packed, const int3 vba_start,
-                   VertexBufferArray vba)
-{
-    const dim3 tpb(32, 8, 1);
-    const dim3 bpg((unsigned int)ceil(packed.dims.x / (float)tpb.x),
-                   (unsigned int)ceil(packed.dims.y / (float)tpb.y),
-                   (unsigned int)ceil(packed.dims.z / (float)tpb.z));
-
-    kernel_unpack_data<<<bpg, tpb, 0, stream>>>(packed, vba_start, vba);
-    ERRCHK_CUDA_KERNEL();
-    return AC_SUCCESS;
-}
diff --git a/src/core/kernels/packing.cuh b/src/core/kernels/packing.cuh
index a11b2c4..7d39a1a 100644
--- a/src/core/kernels/packing.cuh
+++ b/src/core/kernels/packing.cuh
@@ -1,40 +1,92 @@
-/*
-    Copyright (C) 2014-2020, Johannes Pekkilae, Miikka Vaeisalae.
-
-    This file is part of Astaroth.
-
-    Astaroth is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    Astaroth is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
-*/
-
-/**
- * @file
- * \brief Brief info.
- *
- * Detailed info.
- *
- */
 #pragma once
-#include "astaroth.h"
-#include "common.cuh"
 
-typedef struct {
-    int3 dims;
-    AcReal* data;
-} PackedData;
+static __global__ void
+kernel_pack_data(const VertexBufferArray vba, const int3 vba_start, PackedData packed)
+{
+    const int i_packed = threadIdx.x + blockIdx.x * blockDim.x;
+    const int j_packed = threadIdx.y + blockIdx.y * blockDim.y;
+    const int k_packed = threadIdx.z + blockIdx.z * blockDim.z;
 
-AcResult acKernelPackData(const cudaStream_t stream, const VertexBufferArray vba,
-                          const int3 vba_start, PackedData packed);
+    // If within the start-end range (this allows threadblock dims that are not
+    // divisible by end - start)
+    if (i_packed >= packed.dims.x || //
+        j_packed >= packed.dims.y || //
+        k_packed >= packed.dims.z) {
+        return;
+    }
 
-AcResult acKernelUnpackData(const cudaStream_t stream, const PackedData packed,
-                            const int3 vba_start, VertexBufferArray vba);
+    const int i_unpacked = i_packed + vba_start.x;
+    const int j_unpacked = j_packed + vba_start.y;
+    const int k_unpacked = k_packed + vba_start.z;
+
+    const int unpacked_idx = DEVICE_VTXBUF_IDX(i_unpacked, j_unpacked, k_unpacked);
+    const int packed_idx   = i_packed +               //
+                           j_packed * packed.dims.x + //
+                           k_packed * packed.dims.x * packed.dims.y;
+
+    const size_t vtxbuf_offset = packed.dims.x * packed.dims.y * packed.dims.z;
+
+    //#pragma unroll
+    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i)
+        packed.data[packed_idx + i * vtxbuf_offset] = vba.in[i][unpacked_idx];
+}
+
+static __global__ void
+kernel_unpack_data(const PackedData packed, const int3 vba_start, VertexBufferArray vba)
+{
+    const int i_packed = threadIdx.x + blockIdx.x * blockDim.x;
+    const int j_packed = threadIdx.y + blockIdx.y * blockDim.y;
+    const int k_packed = threadIdx.z + blockIdx.z * blockDim.z;
+
+    // If within the start-end range (this allows threadblock dims that are not
+    // divisible by end - start)
+    if (i_packed >= packed.dims.x || //
+        j_packed >= packed.dims.y || //
+        k_packed >= packed.dims.z) {
+        return;
+    }
+
+    const int i_unpacked = i_packed + vba_start.x;
+    const int j_unpacked = j_packed + vba_start.y;
+    const int k_unpacked = k_packed + vba_start.z;
+
+    const int unpacked_idx = DEVICE_VTXBUF_IDX(i_unpacked, j_unpacked, k_unpacked);
+    const int packed_idx   = i_packed +               //
+                           j_packed * packed.dims.x + //
+                           k_packed * packed.dims.x * packed.dims.y;
+
+    const size_t vtxbuf_offset = packed.dims.x * packed.dims.y * packed.dims.z;
+
+    //#pragma unroll
+    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i)
+        vba.in[i][unpacked_idx] = packed.data[packed_idx + i * vtxbuf_offset];
+}
+
+AcResult
+acKernelPackData(const cudaStream_t stream, const VertexBufferArray vba, const int3 vba_start,
+                 PackedData packed)
+{
+    const dim3 tpb(32, 8, 1);
+    const dim3 bpg((unsigned int)ceil(packed.dims.x / (float)tpb.x),
+                   (unsigned int)ceil(packed.dims.y / (float)tpb.y),
+                   (unsigned int)ceil(packed.dims.z / (float)tpb.z));
+
+    kernel_pack_data<<<bpg, tpb, 0, stream>>>(vba, vba_start, packed);
+    ERRCHK_CUDA_KERNEL();
+
+    return AC_SUCCESS;
+}
+
+AcResult
+acKernelUnpackData(const cudaStream_t stream, const PackedData packed, const int3 vba_start,
+                   VertexBufferArray vba)
+{
+    const dim3 tpb(32, 8, 1);
+    const dim3 bpg((unsigned int)ceil(packed.dims.x / (float)tpb.x),
+                   (unsigned int)ceil(packed.dims.y / (float)tpb.y),
+                   (unsigned int)ceil(packed.dims.z / (float)tpb.z));
+
+    kernel_unpack_data<<<bpg, tpb, 0, stream>>>(packed, vba_start, vba);
+    ERRCHK_CUDA_KERNEL();
+    return AC_SUCCESS;
+}
diff --git a/src/core/kernels/reductions.cu b/src/core/kernels/reductions.cu
deleted file mode 100644
index 3723425..0000000
--- a/src/core/kernels/reductions.cu
+++ /dev/null
@@ -1,284 +0,0 @@
-/*
-    Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.
-
-    This file is part of Astaroth.
-
-    Astaroth is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    Astaroth is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
-*/
-
-/**
- * @file
- * \brief Brief info.
- *
- * Detailed info.
- *
- */
-#include "common.cuh"
-#include "reductions.cuh"
-
-#include "src/core/errchk.h"
-#include "src/core/math_utils.h" // is_power_of_two
-
-/*
-Reduction steps:
- 1 of 3: Compute the initial value (a, a*a or exp(a)*exp(a)) and put the result in scratchpad
- 2 of 3: Compute most of the reductions into a single block of data
- 3 of 3: After all results have been stored into the final block, reduce the data in the final block
-*/
-
-// Function pointer definitions
-typedef AcReal (*FilterFunc)(const AcReal&);
-typedef AcReal (*FilterFuncVec)(const AcReal&, const AcReal&, const AcReal&);
-typedef AcReal (*ReduceFunc)(const AcReal&, const AcReal&);
-
-// clang-format off
-/* Comparison funcs */
-static __device__ inline AcReal
-dmax(const AcReal& a, const AcReal& b) { return a > b ? a : b; }
-
-static __device__ inline AcReal
-dmin(const AcReal& a, const AcReal& b) { return a < b ? a : b; }
-
-static __device__ inline AcReal
-dsum(const AcReal& a, const AcReal& b) { return a + b; }
-
-/* Function used to determine the values used during reduction */
-static __device__ inline AcReal
-dvalue(const AcReal& a) { return AcReal(a); }
-
-static __device__ inline AcReal
-dsquared(const AcReal& a) { return (AcReal)(a*a); }
-
-static __device__ inline AcReal
-dexp_squared(const AcReal& a) { return exp(a)*exp(a); }
-
-static __device__ inline AcReal
-dlength_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return sqrt(a*a + b*b + c*c); }
-
-static __device__ inline AcReal
-dsquared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return dsquared(a) + dsquared(b) + dsquared(c); }
-
-static __device__ inline AcReal
-dexp_squared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return dexp_squared(a) + dexp_squared(b) + dexp_squared(c); }
-// clang-format on
-
-#include <assert.h>
-template <FilterFunc filter>
-__global__ void
-kernel_filter(const __restrict__ AcReal* src, const int3 start, const int3 end, AcReal* dst)
-{
-    const int3 src_idx = (int3){start.x + threadIdx.x + blockIdx.x * blockDim.x,
-                                start.y + threadIdx.y + blockIdx.y * blockDim.y,
-                                start.z + threadIdx.z + blockIdx.z * blockDim.z};
-
-    const int nx = end.x - start.x;
-    const int ny = end.y - start.y;
-    const int nz = end.z - start.z;
-    (void)nz; // Suppressed unused variable warning when not compiling with debug flags
-
-    const int3 dst_idx = (int3){threadIdx.x + blockIdx.x * blockDim.x,
-                                threadIdx.y + blockIdx.y * blockDim.y,
-                                threadIdx.z + blockIdx.z * blockDim.z};
-
-    assert(src_idx.x < DCONST(AC_nx_max) && src_idx.y < DCONST(AC_ny_max) &&
-           src_idx.z < DCONST(AC_nz_max));
-    assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
-    assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);
-
-    dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(src[IDX(src_idx)]);
-}
-
-template <FilterFuncVec filter>
-__global__ void
-kernel_filter_vec(const __restrict__ AcReal* src0, const __restrict__ AcReal* src1,
-                  const __restrict__ AcReal* src2, const int3 start, const int3 end, AcReal* dst)
-{
-    const int3 src_idx = (int3){start.x + threadIdx.x + blockIdx.x * blockDim.x,
-                                start.y + threadIdx.y + blockIdx.y * blockDim.y,
-                                start.z + threadIdx.z + blockIdx.z * blockDim.z};
-
-    const int nx = end.x - start.x;
-    const int ny = end.y - start.y;
-    const int nz = end.z - start.z;
-    (void)nz; // Suppressed unused variable warning when not compiling with debug flags
-
-    const int3 dst_idx = (int3){threadIdx.x + blockIdx.x * blockDim.x,
-                                threadIdx.y + blockIdx.y * blockDim.y,
-                                threadIdx.z + blockIdx.z * blockDim.z};
-
-    assert(src_idx.x < DCONST(AC_nx_max) && src_idx.y < DCONST(AC_ny_max) &&
-           src_idx.z < DCONST(AC_nz_max));
-    assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
-    assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);
-
-    dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(
-        src0[IDX(src_idx)], src1[IDX(src_idx)], src2[IDX(src_idx)]);
-}
-
-template <ReduceFunc reduce>
-__global__ void
-kernel_reduce(AcReal* scratchpad, const int num_elems)
-{
-    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
-
-    extern __shared__ AcReal smem[];
-    if (idx < num_elems) {
-        smem[threadIdx.x] = scratchpad[idx];
-    }
-    else {
-        smem[threadIdx.x] = NAN;
-    }
-    __syncthreads();
-
-    int offset = blockDim.x / 2;
-    assert(offset % 2 == 0);
-    while (offset > 0) {
-        if (threadIdx.x < offset) {
-            smem[threadIdx.x] = reduce(smem[threadIdx.x], smem[threadIdx.x + offset]);
-        }
-        offset /= 2;
-        __syncthreads();
-    }
-
-    if (threadIdx.x == 0) {
-        scratchpad[idx] = smem[threadIdx.x];
-    }
-}
-
-template <ReduceFunc reduce>
-__global__ void
-kernel_reduce_block(const __restrict__ AcReal* scratchpad, const int num_blocks,
-                    const int block_size, AcReal* result)
-{
-    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
-    if (idx != 0) {
-        return;
-    }
-
-    AcReal res = scratchpad[0];
-    for (int i = 1; i < num_blocks; ++i) {
-        res = reduce(res, scratchpad[i * block_size]);
-    }
-    *result = res;
-}
-
-AcReal
-acKernelReduceScal(const cudaStream_t stream, const ReductionType rtype, const int3 start,
-                   const int3 end, const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result)
-{
-    const unsigned nx        = end.x - start.x;
-    const unsigned ny        = end.y - start.y;
-    const unsigned nz        = end.z - start.z;
-    const unsigned num_elems = nx * ny * nz;
-
-    const dim3 tpb_filter(32, 4, 1);
-    const dim3 bpg_filter((unsigned int)ceil(nx / AcReal(tpb_filter.x)),
-                          (unsigned int)ceil(ny / AcReal(tpb_filter.y)),
-                          (unsigned int)ceil(nz / AcReal(tpb_filter.z)));
-
-    const int tpb_reduce = 128;
-    const int bpg_reduce = num_elems / tpb_reduce;
-
-    ERRCHK(nx >= tpb_filter.x);
-    ERRCHK(ny >= tpb_filter.y);
-    ERRCHK(nz >= tpb_filter.z);
-    ERRCHK(tpb_reduce <= num_elems);
-    ERRCHK(nx * ny * nz % 2 == 0);
-
-    // clang-format off
-    if (rtype == RTYPE_MAX) {
-        kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
-        kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dmax><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else if (rtype == RTYPE_MIN) {
-        kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
-        kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dmin><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else if (rtype == RTYPE_RMS) {
-        kernel_filter<dsquared><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
-        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else if (rtype == RTYPE_RMS_EXP) {
-        kernel_filter<dexp_squared><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
-        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else if (rtype == RTYPE_SUM) {
-        kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
-        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else {
-        ERROR("Unrecognized rtype");
-    }
-    // clang-format on
-    cudaStreamSynchronize(stream);
-    AcReal result;
-    cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
-    return result;
-}
-
-AcReal
-acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, const int3 start,
-                  const int3 end, const AcReal* vtxbuf0, const AcReal* vtxbuf1,
-                  const AcReal* vtxbuf2, AcReal* scratchpad, AcReal* reduce_result)
-{
-    const unsigned nx        = end.x - start.x;
-    const unsigned ny        = end.y - start.y;
-    const unsigned nz        = end.z - start.z;
-    const unsigned num_elems = nx * ny * nz;
-
-    const dim3 tpb_filter(32, 4, 1);
-    const dim3 bpg_filter((unsigned int)ceil(nx / AcReal(tpb_filter.x)),
-                          (unsigned int)ceil(ny / AcReal(tpb_filter.y)),
-                          (unsigned int)ceil(nz / AcReal(tpb_filter.z)));
-
-    const int tpb_reduce = 128;
-    const int bpg_reduce = num_elems / tpb_reduce;
-
-    ERRCHK(nx >= tpb_filter.x);
-    ERRCHK(ny >= tpb_filter.y);
-    ERRCHK(nz >= tpb_filter.z);
-    ERRCHK(tpb_reduce <= num_elems);
-    ERRCHK(nx * ny * nz % 2 == 0);
-
-    // clang-format off
-    if (rtype == RTYPE_MAX) {
-        kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
-        kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dmax><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else if (rtype == RTYPE_MIN) {
-        kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
-        kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dmin><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else if (rtype == RTYPE_RMS) {
-        kernel_filter_vec<dsquared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
-        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else if (rtype == RTYPE_RMS_EXP) {
-        kernel_filter_vec<dexp_squared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
-        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else if (rtype == RTYPE_SUM) {
-        kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
-        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else {
-        ERROR("Unrecognized rtype");
-    }
-    // clang-format on
-
-    cudaStreamSynchronize(stream);
-    AcReal result;
-    cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
-    return result;
-}
diff --git a/src/core/kernels/reductions.cuh b/src/core/kernels/reductions.cuh
index 7f5cf4f..8877e7e 100644
--- a/src/core/kernels/reductions.cuh
+++ b/src/core/kernels/reductions.cuh
@@ -1,44 +1,254 @@
+#pragma once
+
 /*
-    Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.
-
-    This file is part of Astaroth.
-
-    Astaroth is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    Astaroth is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+Reduction steps:
+ 1 of 3: Compute the initial value (a, a*a or exp(a)*exp(a)) and put the result in scratchpad
+ 2 of 3: Compute most of the reductions into a single block of data
+ 3 of 3: After all results have been stored into the final block, reduce the data in the final block
 */
 
-/**
- * @file
- * \brief Brief info.
- *
- * Detailed info.
- *
- */
-#pragma once
-#include <astaroth.h>
+// Function pointer definitions
+typedef AcReal (*FilterFunc)(const AcReal&);
+typedef AcReal (*FilterFuncVec)(const AcReal&, const AcReal&, const AcReal&);
+typedef AcReal (*ReduceFunc)(const AcReal&, const AcReal&);
 
-#ifdef __cplusplus
-extern "C" {
-#endif
+// clang-format off
+/* Comparison funcs */
+static __device__ inline AcReal
+dmax(const AcReal& a, const AcReal& b) { return a > b ? a : b; }
 
-AcReal acKernelReduceScal(const cudaStream_t stream, const ReductionType rtype, const int3 start,
-                          const int3 end, const AcReal* vtxbuf, AcReal* scratchpad,
-                          AcReal* reduce_result);
+static __device__ inline AcReal
+dmin(const AcReal& a, const AcReal& b) { return a < b ? a : b; }
 
-AcReal acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, const int3 start,
-                         const int3 end, const AcReal* vtxbuf0, const AcReal* vtxbuf1,
-                         const AcReal* vtxbuf2, AcReal* scratchpad, AcReal* reduce_result);
+static __device__ inline AcReal
+dsum(const AcReal& a, const AcReal& b) { return a + b; }
 
-#ifdef __cplusplus
-} // extern "C"
-#endif
+/* Function used to determine the values used during reduction */
+static __device__ inline AcReal
+dvalue(const AcReal& a) { return AcReal(a); }
+
+static __device__ inline AcReal
+dsquared(const AcReal& a) { return (AcReal)(a*a); }
+
+static __device__ inline AcReal
+dexp_squared(const AcReal& a) { return exp(a)*exp(a); }
+
+static __device__ inline AcReal
+dlength_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return sqrt(a*a + b*b + c*c); }
+
+static __device__ inline AcReal
+dsquared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return dsquared(a) + dsquared(b) + dsquared(c); }
+
+static __device__ inline AcReal
+dexp_squared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return dexp_squared(a) + dexp_squared(b) + dexp_squared(c); }
+// clang-format on
+
+#include <assert.h>
+template <FilterFunc filter>
+static __global__ void
+kernel_filter(const __restrict__ AcReal* src, const int3 start, const int3 end, AcReal* dst)
+{
+    const int3 src_idx = (int3){start.x + threadIdx.x + blockIdx.x * blockDim.x,
+                                start.y + threadIdx.y + blockIdx.y * blockDim.y,
+                                start.z + threadIdx.z + blockIdx.z * blockDim.z};
+
+    const int nx = end.x - start.x;
+    const int ny = end.y - start.y;
+    const int nz = end.z - start.z;
+    (void)nz; // Suppressed unused variable warning when not compiling with debug flags
+
+    const int3 dst_idx = (int3){threadIdx.x + blockIdx.x * blockDim.x,
+                                threadIdx.y + blockIdx.y * blockDim.y,
+                                threadIdx.z + blockIdx.z * blockDim.z};
+
+    assert(src_idx.x < DCONST(AC_nx_max) && src_idx.y < DCONST(AC_ny_max) &&
+           src_idx.z < DCONST(AC_nz_max));
+    assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
+    assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);
+
+    dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(src[IDX(src_idx)]);
+}
+
+template <FilterFuncVec filter>
+static __global__ void
+kernel_filter_vec(const __restrict__ AcReal* src0, const __restrict__ AcReal* src1,
+                  const __restrict__ AcReal* src2, const int3 start, const int3 end, AcReal* dst)
+{
+    const int3 src_idx = (int3){start.x + threadIdx.x + blockIdx.x * blockDim.x,
+                                start.y + threadIdx.y + blockIdx.y * blockDim.y,
+                                start.z + threadIdx.z + blockIdx.z * blockDim.z};
+
+    const int nx = end.x - start.x;
+    const int ny = end.y - start.y;
+    const int nz = end.z - start.z;
+    (void)nz; // Suppressed unused variable warning when not compiling with debug flags
+
+    const int3 dst_idx = (int3){threadIdx.x + blockIdx.x * blockDim.x,
+                                threadIdx.y + blockIdx.y * blockDim.y,
+                                threadIdx.z + blockIdx.z * blockDim.z};
+
+    assert(src_idx.x < DCONST(AC_nx_max) && src_idx.y < DCONST(AC_ny_max) &&
+           src_idx.z < DCONST(AC_nz_max));
+    assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
+    assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);
+
+    dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(
+        src0[IDX(src_idx)], src1[IDX(src_idx)], src2[IDX(src_idx)]);
+}
+
+template <ReduceFunc reduce>
+static __global__ void
+kernel_reduce(AcReal* scratchpad, const int num_elems)
+{
+    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+
+    extern __shared__ AcReal smem[];
+    if (idx < num_elems) {
+        smem[threadIdx.x] = scratchpad[idx];
+    }
+    else {
+        smem[threadIdx.x] = NAN;
+    }
+    __syncthreads();
+
+    int offset = blockDim.x / 2;
+    assert(offset % 2 == 0);
+    while (offset > 0) {
+        if (threadIdx.x < offset) {
+            smem[threadIdx.x] = reduce(smem[threadIdx.x], smem[threadIdx.x + offset]);
+        }
+        offset /= 2;
+        __syncthreads();
+    }
+
+    if (threadIdx.x == 0) {
+        scratchpad[idx] = smem[threadIdx.x];
+    }
+}
+
+template <ReduceFunc reduce>
+static __global__ void
+kernel_reduce_block(const __restrict__ AcReal* scratchpad, const int num_blocks,
+                    const int block_size, AcReal* result)
+{
+    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (idx != 0) {
+        return;
+    }
+
+    AcReal res = scratchpad[0];
+    for (int i = 1; i < num_blocks; ++i) {
+        res = reduce(res, scratchpad[i * block_size]);
+    }
+    *result = res;
+}
+
+AcReal
+acKernelReduceScal(const cudaStream_t stream, const ReductionType rtype, const int3 start,
+                   const int3 end, const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result)
+{
+    const unsigned nx        = end.x - start.x;
+    const unsigned ny        = end.y - start.y;
+    const unsigned nz        = end.z - start.z;
+    const unsigned num_elems = nx * ny * nz;
+
+    const dim3 tpb_filter(32, 4, 1);
+    const dim3 bpg_filter((unsigned int)ceil(nx / AcReal(tpb_filter.x)),
+                          (unsigned int)ceil(ny / AcReal(tpb_filter.y)),
+                          (unsigned int)ceil(nz / AcReal(tpb_filter.z)));
+
+    const int tpb_reduce = 128;
+    const int bpg_reduce = num_elems / tpb_reduce;
+
+    ERRCHK(nx >= tpb_filter.x);
+    ERRCHK(ny >= tpb_filter.y);
+    ERRCHK(nz >= tpb_filter.z);
+    ERRCHK(tpb_reduce <= num_elems);
+    ERRCHK(nx * ny * nz % 2 == 0);
+
+    // clang-format off
+    if (rtype == RTYPE_MAX) {
+        kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
+        kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
+        kernel_reduce_block<dmax><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    } else if (rtype == RTYPE_MIN) {
+        kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
+        kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
+        kernel_reduce_block<dmin><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    } else if (rtype == RTYPE_RMS) {
+        kernel_filter<dsquared><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
+        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
+        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    } else if (rtype == RTYPE_RMS_EXP) {
+        kernel_filter<dexp_squared><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
+        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
+        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    } else if (rtype == RTYPE_SUM) {
+        kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
+        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
+        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    } else {
+        ERROR("Unrecognized rtype");
+    }
+    // clang-format on
+    cudaStreamSynchronize(stream);
+    AcReal result;
+    cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
+    return result;
+}
+
+AcReal
+acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, const int3 start,
+                  const int3 end, const AcReal* vtxbuf0, const AcReal* vtxbuf1,
+                  const AcReal* vtxbuf2, AcReal* scratchpad, AcReal* reduce_result)
+{
+    const unsigned nx        = end.x - start.x;
+    const unsigned ny        = end.y - start.y;
+    const unsigned nz        = end.z - start.z;
+    const unsigned num_elems = nx * ny * nz;
+
+    const dim3 tpb_filter(32, 4, 1);
+    const dim3 bpg_filter((unsigned int)ceil(nx / AcReal(tpb_filter.x)),
+                          (unsigned int)ceil(ny / AcReal(tpb_filter.y)),
+                          (unsigned int)ceil(nz / AcReal(tpb_filter.z)));
+
+    const int tpb_reduce = 128;
+    const int bpg_reduce = num_elems / tpb_reduce;
+
+    ERRCHK(nx >= tpb_filter.x);
+    ERRCHK(ny >= tpb_filter.y);
+    ERRCHK(nz >= tpb_filter.z);
+    ERRCHK(tpb_reduce <= num_elems);
+    ERRCHK(nx * ny * nz % 2 == 0);
+
+    // clang-format off
+    if (rtype == RTYPE_MAX) {
+        kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
+        kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
+        kernel_reduce_block<dmax><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    } else if (rtype == RTYPE_MIN) {
+        kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
+        kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
+        kernel_reduce_block<dmin><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    } else if (rtype == RTYPE_RMS) {
+        kernel_filter_vec<dsquared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
+        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
+        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    } else if (rtype == RTYPE_RMS_EXP) {
+        kernel_filter_vec<dexp_squared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
+        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
+        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    } else if (rtype == RTYPE_SUM) {
+        kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
+        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
+        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    } else {
+        ERROR("Unrecognized rtype");
+    }
+    // clang-format on
+
+    cudaStreamSynchronize(stream);
+    AcReal result;
+    cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
+    return result;
+}
diff --git a/src/core/math_utils.h b/src/core/math_utils.h
deleted file mode 100644
index 472b55b..0000000
--- a/src/core/math_utils.h
+++ /dev/null
@@ -1,187 +0,0 @@
-/*
-    Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.
-
-    This file is part of Astaroth.
-
-    Astaroth is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    Astaroth is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
-*/
-
-/**
- * @file
- * \brief Brief info.
- *
- * Detailed info.
- *
- */
-#pragma once
-//#include <cmath>
-// using namespace std; // Potentially bad practice to declare namespace std here
-#include <math.h> // isnan, isinf // Overloads incorrect/bugged with GCC <= 6.0
-//#include <tgmath.h> // Even this does not work
-#include <stdlib.h> // rand
-
-template <class T>
-static inline const T
-max(const T& a, const T& b)
-{
-    return a > b ? a : b;
-}
-
-template <class T>
-static inline const T
-min(const T& a, const T& b)
-{
-    return a < b ? a : b;
-}
-
-static inline const int3
-max(const int3& a, const int3& b)
-{
-    return (int3){max(a.x, b.x), max(a.y, b.y), max(a.z, b.z)};
-}
-
-static inline const int3
-min(const int3& a, const int3& b)
-{
-    return (int3){min(a.x, b.x), min(a.y, b.y), min(a.z, b.z)};
-}
-
-template <class T>
-static inline const T
-sum(const T& a, const T& b)
-{
-    return a + b;
-}
-
-template <class T>
-static inline const T
-clamp(const T& val, const T& min, const T& max)
-{
-    return val < min ? min : val > max ? max : val;
-}
-
-static inline AcReal
-randr()
-{
-    return AcReal(rand()) / AcReal(RAND_MAX);
-}
-
-static inline bool
-is_power_of_two(const unsigned val)
-{
-    return val && !(val & (val - 1));
-}
-
-#ifdef __CUDACC__
-#define HOST_DEVICE_INLINE __host__ __device__ __forceinline__
-#else
-#define HOST_DEVICE_INLINE inline
-#endif // __CUDACC__
-
-static HOST_DEVICE_INLINE AcReal3
-operator+(const AcReal3& a, const AcReal3& b)
-{
-    return (AcReal3){a.x + b.x, a.y + b.y, a.z + b.z};
-}
-
-static HOST_DEVICE_INLINE int3
-operator+(const int3& a, const int3& b)
-{
-    return (int3){a.x + b.x, a.y + b.y, a.z + b.z};
-}
-
-static HOST_DEVICE_INLINE int3 operator*(const int3& a, const int3& b)
-{
-    return (int3){a.x * b.x, a.y * b.y, a.z * b.z};
-}
-
-static HOST_DEVICE_INLINE void
-operator+=(AcReal3& lhs, const AcReal3& rhs)
-{
-    lhs.x += rhs.x;
-    lhs.y += rhs.y;
-    lhs.z += rhs.z;
-}
-
-static HOST_DEVICE_INLINE AcReal3
-operator-(const AcReal3& a, const AcReal3& b)
-{
-    return (AcReal3){a.x - b.x, a.y - b.y, a.z - b.z};
-}
-
-static HOST_DEVICE_INLINE int3
-operator-(const int3& a, const int3& b)
-{
-    return (int3){a.x - b.x, a.y - b.y, a.z - b.z};
-}
-
-static HOST_DEVICE_INLINE AcReal3
-operator-(const AcReal3& a)
-{
-    return (AcReal3){-a.x, -a.y, -a.z};
-}
-
-static HOST_DEVICE_INLINE void
-operator-=(AcReal3& lhs, const AcReal3& rhs)
-{
-    lhs.x -= rhs.x;
-    lhs.y -= rhs.y;
-    lhs.z -= rhs.z;
-}
-
-static HOST_DEVICE_INLINE AcReal3 operator*(const AcReal& a, const AcReal3& b)
-{
-    return (AcReal3){a * b.x, a * b.y, a * b.z};
-}
-
-static HOST_DEVICE_INLINE AcReal3 operator*(const AcReal3& b, const AcReal& a)
-{
-    return (AcReal3){a * b.x, a * b.y, a * b.z};
-}
-
-static HOST_DEVICE_INLINE AcReal
-dot(const AcReal3& a, const AcReal3& b)
-{
-    return a.x * b.x + a.y * b.y + a.z * b.z;
-}
-
-static HOST_DEVICE_INLINE AcReal3
-mul(const AcMatrix& aa, const AcReal3& x)
-{
-    return (AcReal3){dot(aa.row[0], x), dot(aa.row[1], x), dot(aa.row[2], x)};
-}
-
-static HOST_DEVICE_INLINE AcReal3
-cross(const AcReal3& a, const AcReal3& b)
-{
-    AcReal3 c;
-
-    c.x = a.y * b.z - a.z * b.y;
-    c.y = a.z * b.x - a.x * b.z;
-    c.z = a.x * b.y - a.y * b.x;
-
-    return c;
-}
-
-static HOST_DEVICE_INLINE bool
-is_valid(const AcReal& a)
-{
-    return !isnan(a) && !isinf(a);
-}
-
-static HOST_DEVICE_INLINE bool
-is_valid(const AcReal3& a)
-{
-    return is_valid(a.x) && is_valid(a.y) && is_valid(a.z);
-}
diff --git a/src/core/node.cc b/src/core/node.cc
index d703afe..33c9270 100644
--- a/src/core/node.cc
+++ b/src/core/node.cc
@@ -123,12 +123,10 @@
                                  db  da
  *
  */
-#include "astaroth_node.h"
+#include "astaroth.h"
 
-#include "astaroth_device.h"
 #include "errchk.h"
-#include "kernels/common.cuh"
-#include "math_utils.h" // sum for reductions
+#include "math_utils.h"
 
 static const int MAX_NUM_DEVICES = 32;