Reordered src/core to have better division to host and device code (this is more likely to work when compiling with mpicxx). Disabled separate compilation of CUDA kernels as this complicates compilation and is a source of many cmake/cuda bugs. As a downside, GPU code takes longer to compile.
This commit is contained in:
@@ -1,38 +1,4 @@
|
||||
########################################
|
||||
## CMakeLists.txt for Astaroth Core ##
|
||||
########################################
|
||||
|
||||
# Require C++11
|
||||
set(CMAKE_CXX_STANDARD 11)
|
||||
set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
||||
set(CMAKE_CXX_FLAGS "-Wall -Wextra -Werror -Wdouble-promotion -Wfloat-conversion -Wshadow")
|
||||
set(CMAKE_CXX_FLAGS_DEBUG "-g")
|
||||
set(CMAKE_CXX_FLAGS_RELEASE "-O3")
|
||||
|
||||
# Require CUDA
|
||||
find_package(CUDA REQUIRED)
|
||||
set(CMAKE_CUDA_FLAGS "-gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70")
|
||||
|
||||
# Compile kernels
|
||||
add_library(astaroth_kernels STATIC kernels/boundconds.cu kernels/integration.cu kernels/reductions.cu kernels/packing.cu)
|
||||
target_include_directories(astaroth_kernels PRIVATE .)
|
||||
target_compile_features(astaroth_kernels PRIVATE cxx_std_11)
|
||||
set_target_properties(astaroth_kernels PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
|
||||
add_dependencies(astaroth_kernels dsl_headers)
|
||||
|
||||
# Compile core
|
||||
add_library(astaroth_core STATIC astaroth.cc node.cc device.cc)
|
||||
target_include_directories(astaroth_core PRIVATE . ${CUDA_INCLUDE_DIRS})
|
||||
target_link_libraries(astaroth_core m astaroth_kernels ${CUDA_LIBRARIES})
|
||||
target_compile_definitions(astaroth_core PRIVATE AC_USE_CUDA_RUNTIME_API)
|
||||
set_target_properties(astaroth_core PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
add_dependencies(astaroth_kernels dsl_headers)
|
||||
|
||||
if (MULTIGPU_ENABLED)
|
||||
target_compile_definitions(astaroth_core PRIVATE AC_MULTIGPU_ENABLED=1)
|
||||
endif ()
|
||||
|
||||
if (MPI_ENABLED)
|
||||
target_link_libraries(astaroth_core astaroth_utils)
|
||||
target_compile_definitions(astaroth_core PRIVATE AC_MPI_ENABLED=1)
|
||||
endif ()
|
||||
## Astaroth Core
|
||||
add_library(astaroth_core STATIC device.cc node.cc astaroth.cc)
|
||||
target_include_directories(astaroth_core PRIVATE ${CUDA_INCLUDE_DIRS})
|
||||
target_link_libraries(astaroth_core astaroth_utils astaroth_kernels cudart)
|
||||
|
@@ -19,41 +19,12 @@
|
||||
#include "astaroth.h"
|
||||
|
||||
#include "errchk.h"
|
||||
#include "math_utils.h" // int3 + int3
|
||||
|
||||
#define AC_GEN_STR(X) #X
|
||||
const char* intparam_names[] = {AC_FOR_BUILTIN_INT_PARAM_TYPES(AC_GEN_STR) //
|
||||
AC_FOR_USER_INT_PARAM_TYPES(AC_GEN_STR)};
|
||||
const char* int3param_names[] = {AC_FOR_BUILTIN_INT3_PARAM_TYPES(AC_GEN_STR) //
|
||||
AC_FOR_USER_INT3_PARAM_TYPES(AC_GEN_STR)};
|
||||
const char* realparam_names[] = {AC_FOR_BUILTIN_REAL_PARAM_TYPES(AC_GEN_STR) //
|
||||
AC_FOR_USER_REAL_PARAM_TYPES(AC_GEN_STR)};
|
||||
const char* real3param_names[] = {AC_FOR_BUILTIN_REAL3_PARAM_TYPES(AC_GEN_STR) //
|
||||
AC_FOR_USER_REAL3_PARAM_TYPES(AC_GEN_STR)};
|
||||
const char* scalararray_names[] = {AC_FOR_SCALARARRAY_HANDLES(AC_GEN_STR)};
|
||||
const char* vtxbuf_names[] = {AC_FOR_VTXBUF_HANDLES(AC_GEN_STR)};
|
||||
#undef AC_GEN_STR
|
||||
#include "math_utils.h"
|
||||
|
||||
static const int max_num_nodes = 1;
|
||||
static Node nodes[max_num_nodes] = {0};
|
||||
|
||||
static int num_nodes = 0;
|
||||
|
||||
void
|
||||
acPrintMeshInfo(const AcMeshInfo config)
|
||||
{
|
||||
for (int i = 0; i < NUM_INT_PARAMS; ++i)
|
||||
printf("[%s]: %d\n", intparam_names[i], config.int_params[i]);
|
||||
for (int i = 0; i < NUM_INT3_PARAMS; ++i)
|
||||
printf("[%s]: (%d, %d, %d)\n", int3param_names[i], config.int3_params[i].x,
|
||||
config.int3_params[i].y, config.int3_params[i].z);
|
||||
for (int i = 0; i < NUM_REAL_PARAMS; ++i)
|
||||
printf("[%s]: %g\n", realparam_names[i], double(config.real_params[i]));
|
||||
for (int i = 0; i < NUM_REAL3_PARAMS; ++i)
|
||||
printf("[%s]: (%g, %g, %g)\n", real3param_names[i], double(config.real3_params[i].x),
|
||||
double(config.real3_params[i].y), double(config.real3_params[i].z));
|
||||
}
|
||||
|
||||
AcResult
|
||||
acInit(const AcMeshInfo mesh_info)
|
||||
{
|
||||
|
@@ -1,60 +1,106 @@
|
||||
/*
|
||||
Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.
|
||||
#include "astaroth.h"
|
||||
|
||||
This file is part of Astaroth.
|
||||
|
||||
Astaroth is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
Astaroth is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file
|
||||
* \brief Brief info.
|
||||
*
|
||||
* Detailed info.
|
||||
*
|
||||
*/
|
||||
#include "astaroth_device.h"
|
||||
|
||||
#include <string.h> // memcpy
|
||||
#include <mpi.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "astaroth_utils.h"
|
||||
#include "errchk.h"
|
||||
#include "math_utils.h"
|
||||
#include "timer_hires.h"
|
||||
|
||||
#include "kernels/common.cuh"
|
||||
#include "kernels/kernels.h"
|
||||
|
||||
#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0]))
|
||||
|
||||
struct device_s {
|
||||
int id;
|
||||
AcMeshInfo local_config;
|
||||
AcResult
|
||||
acDevicePrintInfo(const Device device)
|
||||
{
|
||||
const int device_id = device->id;
|
||||
|
||||
// Concurrency
|
||||
cudaStream_t streams[NUM_STREAMS];
|
||||
cudaDeviceProp props;
|
||||
cudaGetDeviceProperties(&props, device_id);
|
||||
printf("--------------------------------------------------\n");
|
||||
printf("Device Number: %d\n", device_id);
|
||||
const size_t bus_id_max_len = 128;
|
||||
char bus_id[bus_id_max_len];
|
||||
cudaDeviceGetPCIBusId(bus_id, bus_id_max_len, device_id);
|
||||
printf(" PCI bus ID: %s\n", bus_id);
|
||||
printf(" Device name: %s\n", props.name);
|
||||
printf(" Compute capability: %d.%d\n", props.major, props.minor);
|
||||
|
||||
// Compute
|
||||
printf(" Compute\n");
|
||||
printf(" Clock rate (GHz): %g\n", props.clockRate / 1e6); // KHz -> GHz
|
||||
printf(" Stream processors: %d\n", props.multiProcessorCount);
|
||||
printf(" SP to DP flops performance ratio: %d:1\n", props.singleToDoublePrecisionPerfRatio);
|
||||
printf(
|
||||
" Compute mode: %d\n",
|
||||
(int)props
|
||||
.computeMode); // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g7eb25f5413a962faad0956d92bae10d0
|
||||
// Memory
|
||||
VertexBufferArray vba;
|
||||
AcReal* reduce_scratchpad;
|
||||
AcReal* reduce_result;
|
||||
printf(" Global memory\n");
|
||||
printf(" Memory Clock Rate (MHz): %d\n", props.memoryClockRate / (1000));
|
||||
printf(" Memory Bus Width (bits): %d\n", props.memoryBusWidth);
|
||||
printf(" Peak Memory Bandwidth (GiB/s): %f\n",
|
||||
2 * (props.memoryClockRate * 1e3) * props.memoryBusWidth / (8. * 1024. * 1024. * 1024.));
|
||||
printf(" ECC enabled: %d\n", props.ECCEnabled);
|
||||
|
||||
// Memory usage
|
||||
size_t free_bytes, total_bytes;
|
||||
cudaMemGetInfo(&free_bytes, &total_bytes);
|
||||
const size_t used_bytes = total_bytes - free_bytes;
|
||||
printf(" Total global mem: %.2f GiB\n", props.totalGlobalMem / (1024.0 * 1024 * 1024));
|
||||
printf(" Gmem used (GiB): %.2f\n", used_bytes / (1024.0 * 1024 * 1024));
|
||||
printf(" Gmem memory free (GiB): %.2f\n", free_bytes / (1024.0 * 1024 * 1024));
|
||||
printf(" Gmem memory total (GiB): %.2f\n", total_bytes / (1024.0 * 1024 * 1024));
|
||||
printf(" Caches\n");
|
||||
printf(" Local L1 cache supported: %d\n", props.localL1CacheSupported);
|
||||
printf(" Global L1 cache supported: %d\n", props.globalL1CacheSupported);
|
||||
printf(" L2 size: %d KiB\n", props.l2CacheSize / (1024));
|
||||
// MV: props.totalConstMem and props.sharedMemPerBlock cause assembler error
|
||||
// MV: while compiling in TIARA gp cluster. Therefore commeted out.
|
||||
//!! printf(" Total const mem: %ld KiB\n", props.totalConstMem / (1024));
|
||||
//!! printf(" Shared mem per block: %ld KiB\n", props.sharedMemPerBlock / (1024));
|
||||
printf(" Other\n");
|
||||
printf(" Warp size: %d\n", props.warpSize);
|
||||
// printf(" Single to double perf. ratio: %dx\n",
|
||||
// props.singleToDoublePrecisionPerfRatio); //Not supported with older CUDA
|
||||
// versions
|
||||
printf(" Stream priorities supported: %d\n", props.streamPrioritiesSupported);
|
||||
printf("--------------------------------------------------\n");
|
||||
|
||||
return AC_SUCCESS;
|
||||
}
|
||||
|
||||
AcResult
|
||||
acDeviceAutoOptimize(const Device device)
|
||||
{
|
||||
cudaSetDevice(device->id);
|
||||
const int3 start = (int3){
|
||||
device->local_config.int_params[AC_nx_min],
|
||||
device->local_config.int_params[AC_ny_min],
|
||||
device->local_config.int_params[AC_nz_min],
|
||||
};
|
||||
const int3 end = (int3){
|
||||
device->local_config.int_params[AC_nx_max],
|
||||
device->local_config.int_params[AC_ny_max],
|
||||
device->local_config.int_params[AC_nz_max],
|
||||
};
|
||||
return acKernelAutoOptimizeIntegration(start, end, device->vba);
|
||||
}
|
||||
|
||||
#include "kernels/boundconds.cuh"
|
||||
#include "kernels/integration.cuh"
|
||||
#include "kernels/reductions.cuh"
|
||||
|
||||
#if PACKED_DATA_TRANSFERS // TODO DEPRECATED, see AC_MPI_ENABLED instead
|
||||
// #include "kernels/pack_unpack.cuh"
|
||||
#endif
|
||||
AcResult
|
||||
acDeviceSynchronizeStream(const Device device, const Stream stream)
|
||||
{
|
||||
cudaSetDevice(device->id);
|
||||
if (stream == STREAM_ALL) {
|
||||
cudaDeviceSynchronize();
|
||||
}
|
||||
else {
|
||||
cudaStreamSynchronize(device->streams[stream]);
|
||||
}
|
||||
return AC_SUCCESS;
|
||||
}
|
||||
|
||||
AcResult
|
||||
acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device_handle)
|
||||
@@ -147,96 +193,6 @@ acDeviceDestroy(Device device)
|
||||
return AC_SUCCESS;
|
||||
}
|
||||
|
||||
AcResult
|
||||
acDevicePrintInfo(const Device device)
|
||||
{
|
||||
const int device_id = device->id;
|
||||
|
||||
cudaDeviceProp props;
|
||||
cudaGetDeviceProperties(&props, device_id);
|
||||
printf("--------------------------------------------------\n");
|
||||
printf("Device Number: %d\n", device_id);
|
||||
const size_t bus_id_max_len = 128;
|
||||
char bus_id[bus_id_max_len];
|
||||
cudaDeviceGetPCIBusId(bus_id, bus_id_max_len, device_id);
|
||||
printf(" PCI bus ID: %s\n", bus_id);
|
||||
printf(" Device name: %s\n", props.name);
|
||||
printf(" Compute capability: %d.%d\n", props.major, props.minor);
|
||||
|
||||
// Compute
|
||||
printf(" Compute\n");
|
||||
printf(" Clock rate (GHz): %g\n", props.clockRate / 1e6); // KHz -> GHz
|
||||
printf(" Stream processors: %d\n", props.multiProcessorCount);
|
||||
printf(" SP to DP flops performance ratio: %d:1\n", props.singleToDoublePrecisionPerfRatio);
|
||||
printf(
|
||||
" Compute mode: %d\n",
|
||||
(int)props
|
||||
.computeMode); // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g7eb25f5413a962faad0956d92bae10d0
|
||||
// Memory
|
||||
printf(" Global memory\n");
|
||||
printf(" Memory Clock Rate (MHz): %d\n", props.memoryClockRate / (1000));
|
||||
printf(" Memory Bus Width (bits): %d\n", props.memoryBusWidth);
|
||||
printf(" Peak Memory Bandwidth (GiB/s): %f\n",
|
||||
2 * (props.memoryClockRate * 1e3) * props.memoryBusWidth / (8. * 1024. * 1024. * 1024.));
|
||||
printf(" ECC enabled: %d\n", props.ECCEnabled);
|
||||
|
||||
// Memory usage
|
||||
size_t free_bytes, total_bytes;
|
||||
cudaMemGetInfo(&free_bytes, &total_bytes);
|
||||
const size_t used_bytes = total_bytes - free_bytes;
|
||||
printf(" Total global mem: %.2f GiB\n", props.totalGlobalMem / (1024.0 * 1024 * 1024));
|
||||
printf(" Gmem used (GiB): %.2f\n", used_bytes / (1024.0 * 1024 * 1024));
|
||||
printf(" Gmem memory free (GiB): %.2f\n", free_bytes / (1024.0 * 1024 * 1024));
|
||||
printf(" Gmem memory total (GiB): %.2f\n", total_bytes / (1024.0 * 1024 * 1024));
|
||||
printf(" Caches\n");
|
||||
printf(" Local L1 cache supported: %d\n", props.localL1CacheSupported);
|
||||
printf(" Global L1 cache supported: %d\n", props.globalL1CacheSupported);
|
||||
printf(" L2 size: %d KiB\n", props.l2CacheSize / (1024));
|
||||
// MV: props.totalConstMem and props.sharedMemPerBlock cause assembler error
|
||||
// MV: while compiling in TIARA gp cluster. Therefore commeted out.
|
||||
//!! printf(" Total const mem: %ld KiB\n", props.totalConstMem / (1024));
|
||||
//!! printf(" Shared mem per block: %ld KiB\n", props.sharedMemPerBlock / (1024));
|
||||
printf(" Other\n");
|
||||
printf(" Warp size: %d\n", props.warpSize);
|
||||
// printf(" Single to double perf. ratio: %dx\n",
|
||||
// props.singleToDoublePrecisionPerfRatio); //Not supported with older CUDA
|
||||
// versions
|
||||
printf(" Stream priorities supported: %d\n", props.streamPrioritiesSupported);
|
||||
printf("--------------------------------------------------\n");
|
||||
|
||||
return AC_SUCCESS;
|
||||
}
|
||||
|
||||
AcResult
|
||||
acDeviceAutoOptimize(const Device device)
|
||||
{
|
||||
cudaSetDevice(device->id);
|
||||
const int3 start = (int3){
|
||||
device->local_config.int_params[AC_nx_min],
|
||||
device->local_config.int_params[AC_ny_min],
|
||||
device->local_config.int_params[AC_nz_min],
|
||||
};
|
||||
const int3 end = (int3){
|
||||
device->local_config.int_params[AC_nx_max],
|
||||
device->local_config.int_params[AC_ny_max],
|
||||
device->local_config.int_params[AC_nz_max],
|
||||
};
|
||||
return acKernelAutoOptimizeIntegration(start, end, device->vba);
|
||||
}
|
||||
|
||||
AcResult
|
||||
acDeviceSynchronizeStream(const Device device, const Stream stream)
|
||||
{
|
||||
cudaSetDevice(device->id);
|
||||
if (stream == STREAM_ALL) {
|
||||
cudaDeviceSynchronize();
|
||||
}
|
||||
else {
|
||||
cudaStreamSynchronize(device->streams[stream]);
|
||||
}
|
||||
return AC_SUCCESS;
|
||||
}
|
||||
|
||||
AcResult
|
||||
acDeviceSwapBuffers(const Device device)
|
||||
{
|
||||
@@ -249,66 +205,6 @@ acDeviceSwapBuffers(const Device device)
|
||||
return AC_SUCCESS;
|
||||
}
|
||||
|
||||
AcResult
|
||||
acDeviceLoadScalarUniform(const Device device, const Stream stream, const AcRealParam param,
|
||||
const AcReal value)
|
||||
{
|
||||
cudaSetDevice(device->id);
|
||||
|
||||
if (param >= NUM_REAL_PARAMS)
|
||||
return AC_FAILURE;
|
||||
|
||||
const size_t offset = (size_t)&d_mesh_info.real_params[param] - (size_t)&d_mesh_info;
|
||||
ERRCHK_CUDA(cudaMemcpyToSymbolAsync(&d_mesh_info, &value, sizeof(value), offset,
|
||||
cudaMemcpyHostToDevice, device->streams[stream]));
|
||||
return AC_SUCCESS;
|
||||
}
|
||||
|
||||
AcResult
|
||||
acDeviceLoadVectorUniform(const Device device, const Stream stream, const AcReal3Param param,
|
||||
const AcReal3 value)
|
||||
{
|
||||
cudaSetDevice(device->id);
|
||||
|
||||
if (param >= NUM_REAL3_PARAMS || !NUM_REAL3_PARAMS)
|
||||
return AC_FAILURE;
|
||||
|
||||
const size_t offset = (size_t)&d_mesh_info.real3_params[param] - (size_t)&d_mesh_info;
|
||||
ERRCHK_CUDA(cudaMemcpyToSymbolAsync(&d_mesh_info, &value, sizeof(value), offset,
|
||||
cudaMemcpyHostToDevice, device->streams[stream]));
|
||||
return AC_SUCCESS;
|
||||
}
|
||||
|
||||
AcResult
|
||||
acDeviceLoadIntUniform(const Device device, const Stream stream, const AcIntParam param,
|
||||
const int value)
|
||||
{
|
||||
cudaSetDevice(device->id);
|
||||
|
||||
if (param >= NUM_INT_PARAMS)
|
||||
return AC_FAILURE;
|
||||
|
||||
const size_t offset = (size_t)&d_mesh_info.int_params[param] - (size_t)&d_mesh_info;
|
||||
ERRCHK_CUDA(cudaMemcpyToSymbolAsync(&d_mesh_info, &value, sizeof(value), offset,
|
||||
cudaMemcpyHostToDevice, device->streams[stream]));
|
||||
return AC_SUCCESS;
|
||||
}
|
||||
|
||||
AcResult
|
||||
acDeviceLoadInt3Uniform(const Device device, const Stream stream, const AcInt3Param param,
|
||||
const int3 value)
|
||||
{
|
||||
cudaSetDevice(device->id);
|
||||
|
||||
if (param >= NUM_INT3_PARAMS)
|
||||
return AC_FAILURE;
|
||||
|
||||
const size_t offset = (size_t)&d_mesh_info.int3_params[param] - (size_t)&d_mesh_info;
|
||||
ERRCHK_CUDA(cudaMemcpyToSymbolAsync(&d_mesh_info, &value, sizeof(value), offset,
|
||||
cudaMemcpyHostToDevice, device->streams[stream]));
|
||||
return AC_SUCCESS;
|
||||
}
|
||||
|
||||
AcResult
|
||||
acDeviceLoadScalarArray(const Device device, const Stream stream, const ScalarArrayHandle handle,
|
||||
const size_t start, const AcReal* data, const size_t num)
|
||||
@@ -327,22 +223,6 @@ acDeviceLoadScalarArray(const Device device, const Stream stream, const ScalarAr
|
||||
return AC_SUCCESS;
|
||||
}
|
||||
|
||||
AcResult
|
||||
acDeviceLoadMeshInfo(const Device device, const Stream stream, const AcMeshInfo device_config)
|
||||
{
|
||||
cudaSetDevice(device->id);
|
||||
|
||||
ERRCHK_ALWAYS(device_config.int_params[AC_nx] == device->local_config.int_params[AC_nx]);
|
||||
ERRCHK_ALWAYS(device_config.int_params[AC_ny] == device->local_config.int_params[AC_ny]);
|
||||
ERRCHK_ALWAYS(device_config.int_params[AC_nz] == device->local_config.int_params[AC_nz]);
|
||||
ERRCHK_ALWAYS(device_config.int_params[AC_multigpu_offset] ==
|
||||
device->local_config.int_params[AC_multigpu_offset]);
|
||||
|
||||
ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbolAsync(&d_mesh_info, &device_config, sizeof(device_config),
|
||||
0, cudaMemcpyHostToDevice, device->streams[stream]));
|
||||
return AC_SUCCESS;
|
||||
}
|
||||
|
||||
AcResult
|
||||
acDeviceLoadVertexBufferWithOffset(const Device device, const Stream stream, const AcMesh host_mesh,
|
||||
const VertexBufferHandle vtxbuf_handle, const int3 src,
|
||||
@@ -578,31 +458,6 @@ acDeviceReduceVec(const Device device, const Stream stream, const ReductionType
|
||||
return AC_SUCCESS;
|
||||
}
|
||||
|
||||
#if PACKED_DATA_TRANSFERS // TODO DEPRECATED, see AC_MPI_ENABLED instead
|
||||
// Functions for calling packed data transfers
|
||||
#endif
|
||||
|
||||
#if AC_MPI_ENABLED == 0
|
||||
AcResult
|
||||
acDeviceRunMPITest(void)
|
||||
{
|
||||
WARNING("MPI was not enabled but acDeviceRunMPITest() was called");
|
||||
return AC_FAILURE;
|
||||
}
|
||||
#else // MPI_ENABLED ///////////////////////////////////////////////////////////////////////////////
|
||||
#include <mpi.h>
|
||||
|
||||
// Kernels
|
||||
#include "kernels/packing.cuh"
|
||||
|
||||
// From Astaroth Utils
|
||||
#include "src/utils/config_loader.h"
|
||||
#include "src/utils/memory.h"
|
||||
#include "src/utils/timer_hires.h"
|
||||
#include "src/utils/verification.h"
|
||||
|
||||
#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0]))
|
||||
|
||||
static int
|
||||
mod(const int a, const int b)
|
||||
{
|
||||
@@ -881,6 +736,8 @@ acDeviceGatherMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* dst)
|
||||
return AC_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
// Deprecated
|
||||
static AcResult
|
||||
acDeviceCommunicateBlocksMPI(const Device device, //
|
||||
const int3* a0s, // Src idx inside comp. domain
|
||||
@@ -977,6 +834,7 @@ acDeviceCommunicateBlocksMPI(const Device device, //
|
||||
|
||||
return AC_SUCCESS;
|
||||
}
|
||||
*/
|
||||
|
||||
typedef struct {
|
||||
PackedData* srcs;
|
||||
@@ -1558,7 +1416,7 @@ acDeviceRunMPITest(void)
|
||||
};
|
||||
submesh_info.int3_params[AC_multigpu_offset] = (int3){-1, -1, -1}; // TODO
|
||||
WARNING("AC_multigpu_offset not yet implemented");
|
||||
acUpdateConfig(&submesh_info);
|
||||
acUpdateBuiltinParams(&submesh_info);
|
||||
ERRCHK_ALWAYS(is_valid(submesh_info.real_params[AC_inv_dsx]));
|
||||
ERRCHK_ALWAYS(is_valid(submesh_info.real_params[AC_cs2_sound]));
|
||||
|
||||
@@ -1630,4 +1488,3 @@ acDeviceRunMPITest(void)
|
||||
//////////////////////////////////////////////////////////////
|
||||
return AC_SUCCESS;
|
||||
}
|
||||
#endif // MPI_ENABLED //////////////////////////////////////////////////////////////////////////////
|
||||
|
@@ -1,113 +0,0 @@
|
||||
/*
|
||||
Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.
|
||||
|
||||
This file is part of Astaroth.
|
||||
|
||||
Astaroth is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
Astaroth is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file
|
||||
* \brief Brief info.
|
||||
*
|
||||
* Detailed info.
|
||||
*
|
||||
*/
|
||||
#pragma once
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <time.h>
|
||||
|
||||
// clang-format off
|
||||
/*
|
||||
* =============================================================================
|
||||
* General error checking
|
||||
* =============================================================================
|
||||
*/
|
||||
#define ERROR(str) \
|
||||
{ \
|
||||
time_t t; time(&t); \
|
||||
fprintf(stderr, "%s", ctime(&t)); \
|
||||
fprintf(stderr, "\tError in file %s line %d: %s\n", \
|
||||
__FILE__, __LINE__, str); \
|
||||
fflush(stderr); \
|
||||
exit(EXIT_FAILURE); \
|
||||
abort(); \
|
||||
}
|
||||
|
||||
#define WARNING(str) \
|
||||
{ \
|
||||
time_t t; time(&t); \
|
||||
fprintf(stderr, "%s", ctime(&t)); \
|
||||
fprintf(stderr, "\tWarning in file %s line %d: %s\n", \
|
||||
__FILE__, __LINE__, str); \
|
||||
fflush(stderr); \
|
||||
}
|
||||
|
||||
// DO NOT REMOVE BRACKETS AROUND RETVAL. F.ex. if (!a < b) vs if (!(a < b)).
|
||||
#define ERRCHK(retval) { if (!(retval)) ERROR(#retval " was false"); }
|
||||
#define WARNCHK(retval) { if (!(retval)) WARNING(#retval " was false"); }
|
||||
#define ERRCHK_ALWAYS(retval) { if (!(retval)) ERROR(#retval " was false"); }
|
||||
|
||||
/*
|
||||
* =============================================================================
|
||||
* CUDA-specific error checking
|
||||
* =============================================================================
|
||||
*/
|
||||
#if defined(AC_USE_CUDA_RUNTIME_API) || defined(__CUDACC__)
|
||||
static inline void
|
||||
cuda_assert(cudaError_t code, const char* file, int line, bool abort = true)
|
||||
{
|
||||
if (code != cudaSuccess) {
|
||||
time_t t; time(&t); \
|
||||
fprintf(stderr, "%s", ctime(&t)); \
|
||||
fprintf(stderr, "\tCUDA error in file %s line %d: %s\n", \
|
||||
file, line, cudaGetErrorString(code)); \
|
||||
fflush(stderr); \
|
||||
|
||||
if (abort)
|
||||
exit(code);
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef NDEBUG
|
||||
#undef ERRCHK
|
||||
#undef WARNCHK
|
||||
#define ERRCHK(params)
|
||||
#define WARNCHK(params)
|
||||
#define ERRCHK_CUDA(params) params
|
||||
#define WARNCHK_CUDA(params) params
|
||||
#define ERRCHK_CUDA_KERNEL() {}
|
||||
#else
|
||||
#define ERRCHK_CUDA(params) { cuda_assert((params), __FILE__, __LINE__); }
|
||||
#define WARNCHK_CUDA(params) { cuda_assert((params), __FILE__, __LINE__, false); }
|
||||
|
||||
#define ERRCHK_CUDA_KERNEL() \
|
||||
{ \
|
||||
ERRCHK_CUDA(cudaPeekAtLastError()); \
|
||||
ERRCHK_CUDA(cudaDeviceSynchronize()); \
|
||||
}
|
||||
#endif
|
||||
|
||||
#define ERRCHK_CUDA_ALWAYS(params) { cuda_assert((params), __FILE__, __LINE__); }
|
||||
|
||||
#define ERRCHK_CUDA_KERNEL_ALWAYS() \
|
||||
{ \
|
||||
ERRCHK_CUDA_ALWAYS(cudaPeekAtLastError()); \
|
||||
ERRCHK_CUDA_ALWAYS(cudaDeviceSynchronize()); \
|
||||
}
|
||||
|
||||
#define WARNCHK_CUDA_ALWAYS(params) { cuda_assert((params), __FILE__, __LINE__, false); }
|
||||
// clang-format on
|
||||
#endif // AC_USE_CUDA_RUNTIME_API
|
194
src/core/grid.cc
194
src/core/grid.cc
@@ -1,194 +0,0 @@
|
||||
/*
|
||||
Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.
|
||||
|
||||
This file is part of Astaroth.
|
||||
|
||||
Astaroth is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
Astaroth is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
#include "astaroth_grid.h"
|
||||
|
||||
#include "astaroth_device.h"
|
||||
#include "errchk.h"
|
||||
|
||||
/** */
|
||||
AcResult
|
||||
acGridInit(const AcMeshInfo info)
|
||||
{
|
||||
int num_processes, pid;
|
||||
MPI_Init(NULL, NULL);
|
||||
MPI_Comm_size(MPI_COMM_WORLD, &num_processes);
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
|
||||
|
||||
char processor_name[MPI_MAX_PROCESSOR_NAME];
|
||||
int name_len;
|
||||
MPI_Get_processor_name(processor_name, &name_len);
|
||||
printf("Processor %s. Process %d of %d.\n", processor_name, pid, num_processes);
|
||||
|
||||
AcMesh submesh;
|
||||
return AC_FAILURE;
|
||||
}
|
||||
|
||||
/** */
|
||||
AcResult
|
||||
acGridQuit(void)
|
||||
{
|
||||
WARNING("Proper multinode not yet implemented");
|
||||
return AC_FAILURE;
|
||||
}
|
||||
|
||||
#if 0
|
||||
/** */
|
||||
AcResult
|
||||
acGridSynchronizeStream(const Stream stream)
|
||||
{
|
||||
WARNING("Proper multinode not yet implemented");
|
||||
return AC_FAILURE;
|
||||
}
|
||||
|
||||
/** */
|
||||
AcResult
|
||||
acGridSwapBuffers(void)
|
||||
{
|
||||
WARNING("Proper multinode not yet implemented");
|
||||
return AC_FAILURE;
|
||||
}
|
||||
|
||||
/** */
|
||||
AcResult
|
||||
acGridLoadConstant(const Stream stream, const AcRealParam param, const AcReal value)
|
||||
{
|
||||
WARNING("Proper multinode not yet implemented");
|
||||
return AC_FAILURE;
|
||||
}
|
||||
|
||||
/** */
|
||||
AcResult
|
||||
acGridLoadVertexBufferWithOffset(const Stream stream, const AcMesh host_mesh,
|
||||
const VertexBufferHandle vtxbuf_handle, const int3 src,
|
||||
const int3 dst, const int num_vertices)
|
||||
{
|
||||
WARNING("Proper multinode not yet implemented");
|
||||
return AC_FAILURE;
|
||||
}
|
||||
|
||||
/** */
|
||||
AcResult
|
||||
acGridLoadMeshWithOffset(const Stream stream, const AcMesh host_mesh, const int3 src,
|
||||
const int3 dst, const int num_vertices)
|
||||
{
|
||||
WARNING("Proper multinode not yet implemented");
|
||||
return AC_FAILURE;
|
||||
}
|
||||
|
||||
/** */
|
||||
AcResult
|
||||
acGridLoadVertexBuffer(const Stream stream, const AcMesh host_mesh,
|
||||
const VertexBufferHandle vtxbuf_handle)
|
||||
{
|
||||
WARNING("Proper multinode not yet implemented");
|
||||
return AC_FAILURE;
|
||||
}
|
||||
|
||||
#endif
|
||||
/** */
|
||||
AcResult
|
||||
acGridLoadMesh(const Stream stream, const AcMesh host_mesh)
|
||||
{
|
||||
(void)stream;
|
||||
(void)host_mesh;
|
||||
WARNING("Not implemented");
|
||||
return AC_FAILURE;
|
||||
}
|
||||
#if 0
|
||||
|
||||
/** */
|
||||
AcResult
|
||||
acGridStoreVertexBufferWithOffset(const Stream stream, const VertexBufferHandle vtxbuf_handle,
|
||||
const int3 src, const int3 dst, const int num_vertices,
|
||||
AcMesh* host_mesh)
|
||||
{
|
||||
WARNING("Not implemented");
|
||||
return AC_FAILURE;
|
||||
}
|
||||
|
||||
/** */
|
||||
AcResult
|
||||
acGridStoreMeshWithOffset(const Stream stream, const int3 src, const int3 dst,
|
||||
const int num_vertices, AcMesh* host_mesh)
|
||||
{
|
||||
WARNING("Not implemented");
|
||||
return AC_FAILURE;
|
||||
}
|
||||
|
||||
/** */
|
||||
AcResult
|
||||
acGridStoreVertexBuffer(const Stream stream, const VertexBufferHandle vtxbuf_handle,
|
||||
AcMesh* host_mesh)
|
||||
{
|
||||
WARNING("Not implemented");
|
||||
return AC_FAILURE;
|
||||
}
|
||||
|
||||
#endif
|
||||
/** */
|
||||
AcResult
|
||||
acGridStoreMesh(const Stream stream, AcMesh* host_mesh)
|
||||
{
|
||||
(void)stream;
|
||||
(void)host_mesh;
|
||||
WARNING("Not implemented");
|
||||
return AC_FAILURE;
|
||||
}
|
||||
#if 0
|
||||
|
||||
/** */
|
||||
AcResult
|
||||
acGridIntegrateSubstep(const Stream stream, const int step_number, const int3 start, const int3 end,
|
||||
const AcReal dt)
|
||||
{
|
||||
WARNING("Not implemented");
|
||||
return AC_FAILURE;
|
||||
}
|
||||
|
||||
/** */
|
||||
AcResult
|
||||
acGridIntegrateStep(const Stream stream, const AcReal dt)
|
||||
{
|
||||
WARNING("Not implemented");
|
||||
return AC_FAILURE;
|
||||
}
|
||||
|
||||
/** */
|
||||
AcResult
|
||||
acGridPeriodicBoundcondStep(const Stream stream)
|
||||
{
|
||||
WARNING("Not implemented");
|
||||
return AC_FAILURE;
|
||||
}
|
||||
|
||||
/** */
|
||||
AcResult
|
||||
acGridReduceScal(const Stream stream, const ReductionType rtype,
|
||||
const VertexBufferHandle vtxbuf_handle, AcReal* result)
|
||||
{
|
||||
return AC_FAILURE;
|
||||
}
|
||||
/** */
|
||||
AcResult
|
||||
acGridReduceVec(const Stream stream, const ReductionType rtype, const VertexBufferHandle vec0,
|
||||
const VertexBufferHandle vec1, const VertexBufferHandle vec2, AcReal* result)
|
||||
{
|
||||
return AC_FAILURE;
|
||||
}
|
||||
#endif
|
3
src/core/kernels/CMakeLists.txt
Normal file
3
src/core/kernels/CMakeLists.txt
Normal file
@@ -0,0 +1,3 @@
|
||||
## Astaroth Kernels
|
||||
add_library(astaroth_kernels STATIC kernels.cu)
|
||||
add_dependencies(astaroth_kernels dsl_headers)
|
@@ -1,91 +0,0 @@
|
||||
/*
|
||||
Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.
|
||||
|
||||
This file is part of Astaroth.
|
||||
|
||||
Astaroth is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
Astaroth is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file
|
||||
* \brief Brief info.
|
||||
*
|
||||
* Detailed info.
|
||||
*
|
||||
*/
|
||||
#include "boundconds.cuh"
|
||||
#include "common.cuh"
|
||||
|
||||
#include "src/core/errchk.h"
|
||||
|
||||
__global__ void
|
||||
kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vtxbuf)
|
||||
{
|
||||
const int i_dst = start.x + threadIdx.x + blockIdx.x * blockDim.x;
|
||||
const int j_dst = start.y + threadIdx.y + blockIdx.y * blockDim.y;
|
||||
const int k_dst = start.z + threadIdx.z + blockIdx.z * blockDim.z;
|
||||
|
||||
// If within the start-end range (this allows threadblock dims that are not
|
||||
// divisible by end - start)
|
||||
if (i_dst >= end.x || j_dst >= end.y || k_dst >= end.z)
|
||||
return;
|
||||
|
||||
// if (i_dst >= DCONST(AC_mx) || j_dst >= DCONST(AC_my) || k_dst >= DCONST(AC_mz))
|
||||
// return;
|
||||
|
||||
// If destination index is inside the computational domain, return since
|
||||
// the boundary conditions are only applied to the ghost zones
|
||||
if (i_dst >= DCONST(AC_nx_min) && i_dst < DCONST(AC_nx_max) && j_dst >= DCONST(AC_ny_min) &&
|
||||
j_dst < DCONST(AC_ny_max) && k_dst >= DCONST(AC_nz_min) && k_dst < DCONST(AC_nz_max))
|
||||
return;
|
||||
|
||||
// Find the source index
|
||||
// Map to nx, ny, nz coordinates
|
||||
int i_src = i_dst - DCONST(AC_nx_min);
|
||||
int j_src = j_dst - DCONST(AC_ny_min);
|
||||
int k_src = k_dst - DCONST(AC_nz_min);
|
||||
|
||||
// Translate (s.t. the index is always positive)
|
||||
i_src += DCONST(AC_nx);
|
||||
j_src += DCONST(AC_ny);
|
||||
k_src += DCONST(AC_nz);
|
||||
|
||||
// Wrap
|
||||
i_src %= DCONST(AC_nx);
|
||||
j_src %= DCONST(AC_ny);
|
||||
k_src %= DCONST(AC_nz);
|
||||
|
||||
// Map to mx, my, mz coordinates
|
||||
i_src += DCONST(AC_nx_min);
|
||||
j_src += DCONST(AC_ny_min);
|
||||
k_src += DCONST(AC_nz_min);
|
||||
|
||||
const int src_idx = DEVICE_VTXBUF_IDX(i_src, j_src, k_src);
|
||||
const int dst_idx = DEVICE_VTXBUF_IDX(i_dst, j_dst, k_dst);
|
||||
vtxbuf[dst_idx] = vtxbuf[src_idx];
|
||||
}
|
||||
|
||||
AcResult
|
||||
acKernelPeriodicBoundconds(const cudaStream_t stream, const int3 start, const int3 end,
|
||||
AcReal* vtxbuf)
|
||||
{
|
||||
const dim3 tpb(8, 2, 8);
|
||||
const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x),
|
||||
(unsigned int)ceil((end.y - start.y) / (float)tpb.y),
|
||||
(unsigned int)ceil((end.z - start.z) / (float)tpb.z));
|
||||
|
||||
kernel_periodic_boundconds<<<bpg, tpb, 0, stream>>>(start, end, vtxbuf);
|
||||
ERRCHK_CUDA_KERNEL();
|
||||
return AC_SUCCESS;
|
||||
}
|
@@ -1,40 +1,62 @@
|
||||
/*
|
||||
Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.
|
||||
|
||||
This file is part of Astaroth.
|
||||
|
||||
Astaroth is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
Astaroth is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file
|
||||
* \brief Brief info.
|
||||
*
|
||||
* Detailed info.
|
||||
*
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#include "astaroth.h"
|
||||
static __global__ void
|
||||
kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vtxbuf)
|
||||
{
|
||||
const int i_dst = start.x + threadIdx.x + blockIdx.x * blockDim.x;
|
||||
const int j_dst = start.y + threadIdx.y + blockIdx.y * blockDim.y;
|
||||
const int k_dst = start.z + threadIdx.z + blockIdx.z * blockDim.z;
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
// If within the start-end range (this allows threadblock dims that are not
|
||||
// divisible by end - start)
|
||||
if (i_dst >= end.x || j_dst >= end.y || k_dst >= end.z)
|
||||
return;
|
||||
|
||||
AcResult acKernelPeriodicBoundconds(const cudaStream_t stream, const int3 start, const int3 end,
|
||||
AcReal* vtxbuf);
|
||||
// if (i_dst >= DCONST(AC_mx) || j_dst >= DCONST(AC_my) || k_dst >= DCONST(AC_mz))
|
||||
// return;
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
#endif
|
||||
// If destination index is inside the computational domain, return since
|
||||
// the boundary conditions are only applied to the ghost zones
|
||||
if (i_dst >= DCONST(AC_nx_min) && i_dst < DCONST(AC_nx_max) && j_dst >= DCONST(AC_ny_min) &&
|
||||
j_dst < DCONST(AC_ny_max) && k_dst >= DCONST(AC_nz_min) && k_dst < DCONST(AC_nz_max))
|
||||
return;
|
||||
|
||||
// Find the source index
|
||||
// Map to nx, ny, nz coordinates
|
||||
int i_src = i_dst - DCONST(AC_nx_min);
|
||||
int j_src = j_dst - DCONST(AC_ny_min);
|
||||
int k_src = k_dst - DCONST(AC_nz_min);
|
||||
|
||||
// Translate (s.t. the index is always positive)
|
||||
i_src += DCONST(AC_nx);
|
||||
j_src += DCONST(AC_ny);
|
||||
k_src += DCONST(AC_nz);
|
||||
|
||||
// Wrap
|
||||
i_src %= DCONST(AC_nx);
|
||||
j_src %= DCONST(AC_ny);
|
||||
k_src %= DCONST(AC_nz);
|
||||
|
||||
// Map to mx, my, mz coordinates
|
||||
i_src += DCONST(AC_nx_min);
|
||||
j_src += DCONST(AC_ny_min);
|
||||
k_src += DCONST(AC_nz_min);
|
||||
|
||||
const int src_idx = DEVICE_VTXBUF_IDX(i_src, j_src, k_src);
|
||||
const int dst_idx = DEVICE_VTXBUF_IDX(i_dst, j_dst, k_dst);
|
||||
vtxbuf[dst_idx] = vtxbuf[src_idx];
|
||||
}
|
||||
|
||||
AcResult
|
||||
acKernelPeriodicBoundconds(const cudaStream_t stream, const int3 start, const int3 end,
|
||||
AcReal* vtxbuf)
|
||||
{
|
||||
const dim3 tpb(8, 2, 8);
|
||||
const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x),
|
||||
(unsigned int)ceil((end.y - start.y) / (float)tpb.y),
|
||||
(unsigned int)ceil((end.z - start.z) / (float)tpb.z));
|
||||
|
||||
kernel_periodic_boundconds<<<bpg, tpb, 0, stream>>>(start, end, vtxbuf);
|
||||
ERRCHK_CUDA_KERNEL();
|
||||
return AC_SUCCESS;
|
||||
}
|
||||
|
@@ -1,127 +0,0 @@
|
||||
/*
|
||||
Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.
|
||||
|
||||
This file is part of Astaroth.
|
||||
|
||||
Astaroth is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
Astaroth is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file
|
||||
* \brief Brief info.
|
||||
*
|
||||
* Detailed info.
|
||||
*
|
||||
*/
|
||||
#pragma once
|
||||
#include "astaroth.h"
|
||||
|
||||
extern __constant__ AcMeshInfo d_mesh_info;
|
||||
|
||||
typedef struct {
|
||||
AcReal* in[NUM_VTXBUF_HANDLES];
|
||||
AcReal* out[NUM_VTXBUF_HANDLES];
|
||||
|
||||
AcReal* profiles[NUM_SCALARARRAY_HANDLES];
|
||||
} VertexBufferArray;
|
||||
|
||||
static int __device__ __forceinline__
|
||||
DCONST(const AcIntParam param)
|
||||
{
|
||||
return d_mesh_info.int_params[param];
|
||||
}
|
||||
static int3 __device__ __forceinline__
|
||||
DCONST(const AcInt3Param param)
|
||||
{
|
||||
return d_mesh_info.int3_params[param];
|
||||
}
|
||||
static AcReal __device__ __forceinline__
|
||||
DCONST(const AcRealParam param)
|
||||
{
|
||||
return d_mesh_info.real_params[param];
|
||||
}
|
||||
static AcReal3 __device__ __forceinline__
|
||||
DCONST(const AcReal3Param param)
|
||||
{
|
||||
return d_mesh_info.real3_params[param];
|
||||
}
|
||||
static __device__ constexpr VertexBufferHandle
|
||||
DCONST(const VertexBufferHandle handle)
|
||||
{
|
||||
return handle;
|
||||
}
|
||||
#define DEVICE_VTXBUF_IDX(i, j, k) ((i) + (j)*DCONST(AC_mx) + (k)*DCONST(AC_mxy))
|
||||
#define DEVICE_1D_COMPDOMAIN_IDX(i, j, k) ((i) + (j)*DCONST(AC_nx) + (k)*DCONST(AC_nxy))
|
||||
#define globalGridN (d_mesh_info.int3_params[AC_global_grid_n])
|
||||
//#define globalMeshM // Placeholder
|
||||
//#define localMeshN // Placeholder
|
||||
//#define localMeshM // Placeholder
|
||||
//#define localMeshN_min // Placeholder
|
||||
//#define globalMeshN_min // Placeholder
|
||||
#define d_multigpu_offset (d_mesh_info.int3_params[AC_multigpu_offset])
|
||||
//#define d_multinode_offset (d_mesh_info.int3_params[AC_multinode_offset]) // Placeholder
|
||||
|
||||
static __device__ constexpr int
|
||||
IDX(const int i)
|
||||
{
|
||||
return i;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ int
|
||||
IDX(const int i, const int j, const int k)
|
||||
{
|
||||
return DEVICE_VTXBUF_IDX(i, j, k);
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ int
|
||||
IDX(const int3 idx)
|
||||
{
|
||||
return DEVICE_VTXBUF_IDX(idx.x, idx.y, idx.z);
|
||||
}
|
||||
|
||||
//#include <thrust/complex.h>
|
||||
// using namespace thrust;
|
||||
|
||||
#include <cuComplex.h>
|
||||
#if AC_DOUBLE_PRECISION == 1
|
||||
typedef cuDoubleComplex acComplex;
|
||||
#define acComplex(x, y) make_cuDoubleComplex(x, y)
|
||||
#else
|
||||
typedef cuFloatComplex acComplex;
|
||||
#define acComplex(x, y) make_cuFloatComplex(x, y)
|
||||
#define exp(x) expf(x)
|
||||
#define sin(x) sinf(x)
|
||||
#define cos(x) cosf(x)
|
||||
#define sqrt(x) sqrtf(x)
|
||||
#endif
|
||||
static __device__ inline acComplex
|
||||
exp(const acComplex& val)
|
||||
{
|
||||
return acComplex(exp(val.x) * cos(val.y), exp(val.x) * sin(val.y));
|
||||
}
|
||||
static __device__ inline acComplex operator*(const AcReal& a, const acComplex& b)
|
||||
{
|
||||
return (acComplex){a * b.x, a * b.y};
|
||||
}
|
||||
|
||||
static __device__ inline acComplex operator*(const acComplex& b, const AcReal& a)
|
||||
{
|
||||
return (acComplex){a * b.x, a * b.y};
|
||||
}
|
||||
|
||||
static __device__ inline acComplex operator*(const acComplex& a, const acComplex& b)
|
||||
{
|
||||
return (acComplex){a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x};
|
||||
}
|
||||
//#include <complex>
|
File diff suppressed because it is too large
Load Diff
@@ -1,338 +0,0 @@
|
||||
/*
|
||||
Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.
|
||||
|
||||
This file is part of Astaroth.
|
||||
|
||||
Astaroth is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
Astaroth is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file
|
||||
* \brief Brief info.
|
||||
*
|
||||
* Detailed info.
|
||||
*
|
||||
*/
|
||||
#pragma once
|
||||
#include "device_globals.cuh"
|
||||
|
||||
#include "src/core/errchk.h"
|
||||
#include "src/core/math_utils.h"
|
||||
|
||||
// Function pointer definitions
|
||||
typedef AcReal (*ReduceFunc)(const AcReal&, const AcReal&);
|
||||
typedef AcReal (*ReduceInitialScalFunc)(const AcReal&);
|
||||
typedef AcReal (*ReduceInitialVecFunc)(const AcReal&, const AcReal&,
|
||||
const AcReal&);
|
||||
|
||||
// clang-format off
|
||||
/* Comparison funcs */
|
||||
__device__ inline AcReal
|
||||
_device_max(const AcReal& a, const AcReal& b) { return a > b ? a : b; }
|
||||
|
||||
__device__ inline AcReal
|
||||
_device_min(const AcReal& a, const AcReal& b) { return a < b ? a : b; }
|
||||
|
||||
__device__ inline AcReal
|
||||
_device_sum(const AcReal& a, const AcReal& b) { return a + b; }
|
||||
|
||||
/* Function used to determine the values used during reduction */
|
||||
__device__ inline AcReal
|
||||
_device_length_scal(const AcReal& a) { return AcReal(a); }
|
||||
|
||||
__device__ inline AcReal
|
||||
_device_squared_scal(const AcReal& a) { return (AcReal)(a*a); }
|
||||
|
||||
__device__ inline AcReal
|
||||
_device_exp_squared_scal(const AcReal& a) { return exp(a)*exp(a); }
|
||||
|
||||
__device__ inline AcReal
|
||||
_device_length_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return sqrt(a*a + b*b + c*c); }
|
||||
|
||||
__device__ inline AcReal
|
||||
_device_squared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return _device_squared_scal(a) + _device_squared_scal(b) + _device_squared_scal(c); }
|
||||
|
||||
__device__ inline AcReal
|
||||
_device_exp_squared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return _device_exp_squared_scal(a) + _device_exp_squared_scal(b) + _device_exp_squared_scal(c); }
|
||||
// clang-format on
|
||||
|
||||
__device__ inline bool
|
||||
oob(const int& i, const int& j, const int& k)
|
||||
{
|
||||
if (i >= d_mesh_info.int_params[AC_nx] ||
|
||||
j >= d_mesh_info.int_params[AC_ny] ||
|
||||
k >= d_mesh_info.int_params[AC_nz])
|
||||
return true;
|
||||
else
|
||||
return false;
|
||||
}
|
||||
|
||||
template <ReduceInitialScalFunc reduce_initial>
|
||||
__global__ void
|
||||
_kernel_reduce_scal(const __restrict__ AcReal* src, AcReal* dst)
|
||||
{
|
||||
const int i = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
const int j = threadIdx.y + blockIdx.y * blockDim.y;
|
||||
const int k = threadIdx.z + blockIdx.z * blockDim.z;
|
||||
|
||||
if (oob(i, j, k))
|
||||
return;
|
||||
|
||||
const int src_idx = DEVICE_VTXBUF_IDX(
|
||||
i + d_mesh_info.int_params[AC_nx_min],
|
||||
j + d_mesh_info.int_params[AC_ny_min],
|
||||
k + d_mesh_info.int_params[AC_nz_min]);
|
||||
const int dst_idx = DEVICE_1D_COMPDOMAIN_IDX(i, j, k);
|
||||
|
||||
dst[dst_idx] = reduce_initial(src[src_idx]);
|
||||
}
|
||||
|
||||
template <ReduceInitialVecFunc reduce_initial>
|
||||
__global__ void
|
||||
_kernel_reduce_vec(const __restrict__ AcReal* src_a,
|
||||
const __restrict__ AcReal* src_b,
|
||||
const __restrict__ AcReal* src_c, AcReal* dst)
|
||||
{
|
||||
const int i = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
const int j = threadIdx.y + blockIdx.y * blockDim.y;
|
||||
const int k = threadIdx.z + blockIdx.z * blockDim.z;
|
||||
|
||||
if (oob(i, j, k))
|
||||
return;
|
||||
|
||||
const int src_idx = DEVICE_VTXBUF_IDX(
|
||||
i + d_mesh_info.int_params[AC_nx_min],
|
||||
j + d_mesh_info.int_params[AC_ny_min],
|
||||
k + d_mesh_info.int_params[AC_nz_min]);
|
||||
const int dst_idx = DEVICE_1D_COMPDOMAIN_IDX(i, j, k);
|
||||
|
||||
dst[dst_idx] = reduce_initial(src_a[src_idx], src_b[src_idx],
|
||||
src_c[src_idx]);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
#define BLOCK_SIZE (1024)
|
||||
#define ELEMS_PER_THREAD (32)
|
||||
|
||||
template <ReduceFunc reduce>
|
||||
__global__ void
|
||||
_kernel_reduce(AcReal* src, AcReal* result)
|
||||
{
|
||||
const int idx = threadIdx.x + blockIdx.x * BLOCK_SIZE * ELEMS_PER_THREAD;
|
||||
const int scratchpad_size = DCONST(AC_nxyz);
|
||||
|
||||
if (idx >= scratchpad_size)
|
||||
return;
|
||||
|
||||
__shared__ AcReal smem[BLOCK_SIZE];
|
||||
|
||||
AcReal tmp = src[idx];
|
||||
|
||||
for (int i = 1; i < ELEMS_PER_THREAD; ++i) {
|
||||
const int src_idx = idx + i * BLOCK_SIZE;
|
||||
if (src_idx >= scratchpad_size) {
|
||||
// This check is for safety: if accessing uninitialized values
|
||||
// beyond the mesh boundaries, we will immediately start seeing NANs
|
||||
if (threadIdx.x < BLOCK_SIZE)
|
||||
smem[threadIdx.x] = NAN;
|
||||
else
|
||||
break;
|
||||
}
|
||||
tmp = reduce(tmp, src[src_idx]);
|
||||
}
|
||||
|
||||
smem[threadIdx.x] = tmp;
|
||||
__syncthreads();
|
||||
|
||||
int offset = BLOCK_SIZE / 2;
|
||||
while (offset > 0) {
|
||||
|
||||
if (threadIdx.x < offset) {
|
||||
tmp = reduce(tmp, smem[threadIdx.x + offset]);
|
||||
smem[threadIdx.x] = tmp;
|
||||
}
|
||||
offset /= 2;
|
||||
__syncthreads();
|
||||
}
|
||||
if (threadIdx.x == 0)
|
||||
src[idx] = tmp;
|
||||
}
|
||||
|
||||
template <ReduceFunc reduce>
|
||||
__global__ void
|
||||
_kernel_reduce_block(const __restrict__ AcReal* src, AcReal* result)
|
||||
{
|
||||
const int scratchpad_size = DCONST(AC_nxyz);
|
||||
const int idx = threadIdx.x + blockIdx.x * BLOCK_SIZE * ELEMS_PER_THREAD;
|
||||
AcReal tmp = src[idx];
|
||||
const int block_offset = BLOCK_SIZE * ELEMS_PER_THREAD;
|
||||
for (int i = 1; idx + i * block_offset < scratchpad_size; ++i)
|
||||
tmp = reduce(tmp, src[idx + i * block_offset]);
|
||||
|
||||
*result = tmp;
|
||||
}
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
AcReal
|
||||
_reduce_scal(const cudaStream_t stream,
|
||||
const ReductionType& rtype, const int& nx, const int& ny,
|
||||
const int& nz, const AcReal* vertex_buffer,
|
||||
AcReal* reduce_scratchpad, AcReal* reduce_result)
|
||||
{
|
||||
bool solve_mean = false;
|
||||
|
||||
const dim3 tpb(32, 4, 1);
|
||||
const dim3 bpg(int(ceil(AcReal(nx) / tpb.x)), int(ceil(AcReal(ny) / tpb.y)),
|
||||
int(ceil(AcReal(nz) / tpb.z)));
|
||||
|
||||
const int scratchpad_size = nx * ny * nz;
|
||||
const int bpg2 = (unsigned int)ceil(AcReal(scratchpad_size) /
|
||||
AcReal(ELEMS_PER_THREAD * BLOCK_SIZE));
|
||||
|
||||
switch (rtype) {
|
||||
case RTYPE_MAX:
|
||||
_kernel_reduce_scal<_device_length_scal>
|
||||
<<<bpg, tpb, 0, stream>>>(vertex_buffer, reduce_scratchpad);
|
||||
_kernel_reduce<_device_max>
|
||||
<<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
|
||||
_kernel_reduce_block<_device_max>
|
||||
<<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
|
||||
break;
|
||||
case RTYPE_MIN:
|
||||
_kernel_reduce_scal<_device_length_scal>
|
||||
<<<bpg, tpb, 0, stream>>>(vertex_buffer, reduce_scratchpad);
|
||||
_kernel_reduce<_device_min>
|
||||
<<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
|
||||
_kernel_reduce_block<_device_min>
|
||||
<<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
|
||||
break;
|
||||
case RTYPE_RMS:
|
||||
_kernel_reduce_scal<_device_squared_scal>
|
||||
<<<bpg, tpb, 0, stream>>>(vertex_buffer, reduce_scratchpad);
|
||||
_kernel_reduce<_device_sum>
|
||||
<<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
|
||||
_kernel_reduce_block<_device_sum>
|
||||
<<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
|
||||
solve_mean = true;
|
||||
break;
|
||||
case RTYPE_RMS_EXP:
|
||||
_kernel_reduce_scal<_device_exp_squared_scal>
|
||||
<<<bpg, tpb, 0, stream>>>(vertex_buffer, reduce_scratchpad);
|
||||
_kernel_reduce<_device_sum>
|
||||
<<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
|
||||
_kernel_reduce_block<_device_sum>
|
||||
<<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
|
||||
solve_mean = true;
|
||||
break;
|
||||
default:
|
||||
ERROR("Unrecognized RTYPE");
|
||||
}
|
||||
|
||||
AcReal result;
|
||||
cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
|
||||
if (solve_mean) {
|
||||
const AcReal inv_n = AcReal(1.0) / (nx * ny * nz);
|
||||
return inv_n * result;
|
||||
}
|
||||
else {
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
AcReal
|
||||
_reduce_vec(const cudaStream_t stream,
|
||||
const ReductionType& rtype, const int& nx, const int& ny,
|
||||
const int& nz, const AcReal* vertex_buffer_a,
|
||||
const AcReal* vertex_buffer_b, const AcReal* vertex_buffer_c,
|
||||
AcReal* reduce_scratchpad, AcReal* reduce_result)
|
||||
{
|
||||
bool solve_mean = false;
|
||||
|
||||
const dim3 tpb(32, 4, 1);
|
||||
const dim3 bpg(int(ceil(float(nx) / tpb.x)),
|
||||
int(ceil(float(ny) / tpb.y)),
|
||||
int(ceil(float(nz) / tpb.z)));
|
||||
|
||||
const int scratchpad_size = nx * ny * nz;
|
||||
const int bpg2 = (unsigned int)ceil(float(scratchpad_size) /
|
||||
float(ELEMS_PER_THREAD * BLOCK_SIZE));
|
||||
|
||||
// "Features" of this quick & efficient reduction:
|
||||
// Block size must be smaller than the computational domain size
|
||||
// (otherwise we would have do some additional bounds checking in the
|
||||
// second half of _kernel_reduce, which gets quite confusing)
|
||||
// Also the BLOCK_SIZE must be a multiple of two s.t. we can easily split
|
||||
// the work without worrying too much about the array bounds.
|
||||
ERRCHK(BLOCK_SIZE <= scratchpad_size);
|
||||
ERRCHK(!(BLOCK_SIZE % 2));
|
||||
// NOTE! Also does not work properly with non-power of two mesh dimension
|
||||
// Issue is with "smem[BLOCK_SIZE];". If you init smem to NANs, you can
|
||||
// see that uninitialized smem values are used in the comparison
|
||||
ERRCHK(is_power_of_two(nx));
|
||||
ERRCHK(is_power_of_two(ny));
|
||||
ERRCHK(is_power_of_two(nz));
|
||||
|
||||
switch (rtype) {
|
||||
case RTYPE_MAX:
|
||||
_kernel_reduce_vec<_device_length_vec>
|
||||
<<<bpg, tpb, 0, stream>>>(vertex_buffer_a, vertex_buffer_b, vertex_buffer_c,
|
||||
reduce_scratchpad);
|
||||
_kernel_reduce<_device_max>
|
||||
<<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
|
||||
_kernel_reduce_block<_device_max>
|
||||
<<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
|
||||
break;
|
||||
case RTYPE_MIN:
|
||||
_kernel_reduce_vec<_device_length_vec>
|
||||
<<<bpg, tpb, 0, stream>>>(vertex_buffer_a, vertex_buffer_b, vertex_buffer_c,
|
||||
reduce_scratchpad);
|
||||
_kernel_reduce<_device_min>
|
||||
<<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
|
||||
_kernel_reduce_block<_device_min>
|
||||
<<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
|
||||
break;
|
||||
case RTYPE_RMS:
|
||||
_kernel_reduce_vec<_device_squared_vec>
|
||||
<<<bpg, tpb, 0, stream>>>(vertex_buffer_a, vertex_buffer_b, vertex_buffer_c,
|
||||
reduce_scratchpad);
|
||||
_kernel_reduce<_device_sum>
|
||||
<<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
|
||||
_kernel_reduce_block<_device_sum>
|
||||
<<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
|
||||
solve_mean = true;
|
||||
break;
|
||||
case RTYPE_RMS_EXP:
|
||||
_kernel_reduce_vec<_device_exp_squared_vec>
|
||||
<<<bpg, tpb, 0, stream>>>(vertex_buffer_a, vertex_buffer_b, vertex_buffer_c,
|
||||
reduce_scratchpad);
|
||||
_kernel_reduce<_device_sum>
|
||||
<<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
|
||||
_kernel_reduce_block<_device_sum>
|
||||
<<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
|
||||
solve_mean = true;
|
||||
break;
|
||||
default:
|
||||
ERROR("Unrecognized RTYPE");
|
||||
}
|
||||
|
||||
AcReal result;
|
||||
cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
|
||||
if (solve_mean) {
|
||||
const AcReal inv_n = AcReal(1.0) / (nx * ny * nz);
|
||||
return inv_n * result;
|
||||
}
|
||||
else {
|
||||
return result;
|
||||
}
|
||||
}
|
@@ -1,742 +0,0 @@
|
||||
/*
|
||||
Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.
|
||||
|
||||
This file is part of Astaroth.
|
||||
|
||||
Astaroth is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
Astaroth is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file
|
||||
* \brief Implementation of the integration pipeline
|
||||
*
|
||||
*
|
||||
*
|
||||
*/
|
||||
#pragma once
|
||||
#include "device_globals.cuh"
|
||||
|
||||
#include <assert.h>
|
||||
|
||||
/*
|
||||
#define RK_THREADS_X (32)
|
||||
#define RK_THREADS_Y (1)
|
||||
#define RK_THREADS_Z (4)
|
||||
#define RK_LAUNCH_BOUND_MIN_BLOCKS (4)
|
||||
#define RK_THREADBLOCK_SIZE (RK_THREADS_X * RK_THREADS_Y * RK_THREADS_Z)
|
||||
*/
|
||||
|
||||
static __device__ __forceinline__ int
|
||||
IDX(const int i)
|
||||
{
|
||||
return i;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ int
|
||||
IDX(const int i, const int j, const int k)
|
||||
{
|
||||
return DEVICE_VTXBUF_IDX(i, j, k);
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ int
|
||||
IDX(const int3 idx)
|
||||
{
|
||||
return DEVICE_VTXBUF_IDX(idx.x, idx.y, idx.z);
|
||||
}
|
||||
|
||||
static __forceinline__ AcMatrix
|
||||
create_rotz(const AcReal radians)
|
||||
{
|
||||
AcMatrix mat;
|
||||
|
||||
mat.row[0] = (AcReal3){cos(radians), -sin(radians), 0};
|
||||
mat.row[1] = (AcReal3){sin(radians), cos(radians), 0};
|
||||
mat.row[2] = (AcReal3){0, 0, 0};
|
||||
|
||||
return mat;
|
||||
}
|
||||
|
||||
|
||||
#if AC_DOUBLE_PRECISION == 0
|
||||
#define sin __sinf
|
||||
#define cos __cosf
|
||||
#define exp __expf
|
||||
#define rsqrt rsqrtf // hardware reciprocal sqrt
|
||||
#endif // AC_DOUBLE_PRECISION == 0
|
||||
|
||||
|
||||
/*
|
||||
typedef struct {
|
||||
int i, j, k;
|
||||
} int3;*/
|
||||
|
||||
/*
|
||||
* =============================================================================
|
||||
* Level 0 (Input Assembly Stage)
|
||||
* =============================================================================
|
||||
*/
|
||||
|
||||
/*
|
||||
* =============================================================================
|
||||
* Level 0.1 (Read stencil elements and solve derivatives)
|
||||
* =============================================================================
|
||||
*/
|
||||
static __device__ __forceinline__ AcReal
|
||||
first_derivative(const AcReal* __restrict__ pencil, const AcReal inv_ds)
|
||||
{
|
||||
#if STENCIL_ORDER == 2
|
||||
const AcReal coefficients[] = {0, 1.0 / 2.0};
|
||||
#elif STENCIL_ORDER == 4
|
||||
const AcReal coefficients[] = {0, 2.0 / 3.0, -1.0 / 12.0};
|
||||
#elif STENCIL_ORDER == 6
|
||||
const AcReal coefficients[] = {0, 3.0 / 4.0, -3.0 / 20.0, 1.0 / 60.0};
|
||||
#elif STENCIL_ORDER == 8
|
||||
const AcReal coefficients[] = {0, 4.0 / 5.0, -1.0 / 5.0, 4.0 / 105.0,
|
||||
-1.0 / 280.0};
|
||||
#endif
|
||||
|
||||
#define MID (STENCIL_ORDER / 2)
|
||||
AcReal res = 0;
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 1; i <= MID; ++i)
|
||||
res += coefficients[i] * (pencil[MID + i] - pencil[MID - i]);
|
||||
|
||||
return res * inv_ds;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ AcReal
|
||||
second_derivative(const AcReal* __restrict__ pencil, const AcReal inv_ds)
|
||||
{
|
||||
#if STENCIL_ORDER == 2
|
||||
const AcReal coefficients[] = {-2., 1.};
|
||||
#elif STENCIL_ORDER == 4
|
||||
const AcReal coefficients[] = {-5.0/2.0, 4.0/3.0, -1.0/12.0};
|
||||
#elif STENCIL_ORDER == 6
|
||||
const AcReal coefficients[] = {-49.0 / 18.0, 3.0 / 2.0, -3.0 / 20.0,
|
||||
1.0 / 90.0};
|
||||
#elif STENCIL_ORDER == 8
|
||||
const AcReal coefficients[] = {-205.0 / 72.0, 8.0 / 5.0, -1.0 / 5.0,
|
||||
8.0 / 315.0, -1.0 / 560.0};
|
||||
#endif
|
||||
|
||||
#define MID (STENCIL_ORDER / 2)
|
||||
AcReal res = coefficients[0] * pencil[MID];
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 1; i <= MID; ++i)
|
||||
res += coefficients[i] * (pencil[MID + i] + pencil[MID - i]);
|
||||
|
||||
return res * inv_ds * inv_ds;
|
||||
}
|
||||
|
||||
/** inv_ds: inverted mesh spacing f.ex. 1. / mesh.int_params[AC_dsx] */
|
||||
static __device__ __forceinline__ AcReal
|
||||
cross_derivative(const AcReal* __restrict__ pencil_a,
|
||||
const AcReal* __restrict__ pencil_b, const AcReal inv_ds_a,
|
||||
const AcReal inv_ds_b)
|
||||
{
|
||||
#if STENCIL_ORDER == 2
|
||||
const AcReal coefficients[] = {0, 1.0 / 4.0};
|
||||
#elif STENCIL_ORDER == 4
|
||||
const AcReal coefficients[] = {0, 1.0 / 32.0, 1.0 / 64.0}; // TODO correct coefficients, these are just placeholders
|
||||
#elif STENCIL_ORDER == 6
|
||||
const AcReal fac = (1. / 720.);
|
||||
const AcReal coefficients[] = {0.0 * fac, 270.0 * fac, -27.0 * fac,
|
||||
2.0 * fac};
|
||||
#elif STENCIL_ORDER == 8
|
||||
const AcReal fac = (1. / 20160.);
|
||||
const AcReal coefficients[] = {0.0 * fac, 8064. * fac, -1008. * fac,
|
||||
128. * fac, -9. * fac};
|
||||
#endif
|
||||
|
||||
#define MID (STENCIL_ORDER / 2)
|
||||
AcReal res = AcReal(0.);
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 1; i <= MID; ++i) {
|
||||
res += coefficients[i] * (pencil_a[MID + i] + pencil_a[MID - i] -
|
||||
pencil_b[MID + i] - pencil_b[MID - i]);
|
||||
}
|
||||
return res * inv_ds_a * inv_ds_b;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ AcReal
|
||||
derx(const int3 vertexIdx, const AcReal* __restrict__ arr)
|
||||
{
|
||||
AcReal pencil[STENCIL_ORDER + 1];
|
||||
#pragma unroll
|
||||
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
|
||||
pencil[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y, vertexIdx.z)];
|
||||
|
||||
return first_derivative(pencil, DCONST_REAL(AC_inv_dsx));
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ AcReal
|
||||
derxx(const int3 vertexIdx, const AcReal* __restrict__ arr)
|
||||
{
|
||||
AcReal pencil[STENCIL_ORDER + 1];
|
||||
#pragma unroll
|
||||
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
|
||||
pencil[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y, vertexIdx.z)];
|
||||
|
||||
return second_derivative(pencil, DCONST_REAL(AC_inv_dsx));
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ AcReal
|
||||
derxy(const int3 vertexIdx, const AcReal* __restrict__ arr)
|
||||
{
|
||||
AcReal pencil_a[STENCIL_ORDER + 1];
|
||||
#pragma unroll
|
||||
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
|
||||
pencil_a[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2,
|
||||
vertexIdx.y + offset - STENCIL_ORDER / 2, vertexIdx.z)];
|
||||
|
||||
AcReal pencil_b[STENCIL_ORDER + 1];
|
||||
#pragma unroll
|
||||
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
|
||||
pencil_b[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2,
|
||||
vertexIdx.y + STENCIL_ORDER / 2 - offset, vertexIdx.z)];
|
||||
|
||||
return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsx),
|
||||
DCONST_REAL(AC_inv_dsy));
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ AcReal
|
||||
derxz(const int3 vertexIdx, const AcReal* __restrict__ arr)
|
||||
{
|
||||
AcReal pencil_a[STENCIL_ORDER + 1];
|
||||
#pragma unroll
|
||||
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
|
||||
pencil_a[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y,
|
||||
vertexIdx.z + offset - STENCIL_ORDER / 2)];
|
||||
|
||||
AcReal pencil_b[STENCIL_ORDER + 1];
|
||||
#pragma unroll
|
||||
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
|
||||
pencil_b[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y,
|
||||
vertexIdx.z + STENCIL_ORDER / 2 - offset)];
|
||||
|
||||
return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsx),
|
||||
DCONST_REAL(AC_inv_dsz));
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ AcReal
|
||||
dery(const int3 vertexIdx, const AcReal* __restrict__ arr)
|
||||
{
|
||||
AcReal pencil[STENCIL_ORDER + 1];
|
||||
#pragma unroll
|
||||
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
|
||||
pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2, vertexIdx.z)];
|
||||
|
||||
return first_derivative(pencil, DCONST_REAL(AC_inv_dsy));
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ AcReal
|
||||
deryy(const int3 vertexIdx, const AcReal* __restrict__ arr)
|
||||
{
|
||||
AcReal pencil[STENCIL_ORDER + 1];
|
||||
#pragma unroll
|
||||
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
|
||||
pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2, vertexIdx.z)];
|
||||
|
||||
return second_derivative(pencil, DCONST_REAL(AC_inv_dsy));
|
||||
}
|
||||
|
||||
// Mixed yz cross-derivative: gathers one pencil along the (+y,+z) diagonal
// and one along the (+y,-z) anti-diagonal, then combines them.
static __device__ __forceinline__ AcReal
deryz(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
    AcReal diag[STENCIL_ORDER + 1];
#pragma unroll
    for (int i = 0; i <= STENCIL_ORDER; ++i)
        diag[i] = arr[IDX(vertexIdx.x, vertexIdx.y + i - STENCIL_ORDER / 2,
                          vertexIdx.z + i - STENCIL_ORDER / 2)];

    AcReal antidiag[STENCIL_ORDER + 1];
#pragma unroll
    for (int i = 0; i <= STENCIL_ORDER; ++i)
        antidiag[i] = arr[IDX(vertexIdx.x, vertexIdx.y + i - STENCIL_ORDER / 2,
                              vertexIdx.z + STENCIL_ORDER / 2 - i)];

    return cross_derivative(diag, antidiag, DCONST_REAL(AC_inv_dsy), DCONST_REAL(AC_inv_dsz));
}
|
||||
|
||||
// First derivative along z.
static __device__ __forceinline__ AcReal
derz(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
    AcReal samples[STENCIL_ORDER + 1];
#pragma unroll
    for (int i = 0; i <= STENCIL_ORDER; ++i)
        samples[i] = arr[IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z + i - STENCIL_ORDER / 2)];

    return first_derivative(samples, DCONST_REAL(AC_inv_dsz));
}
|
||||
|
||||
// Second derivative along z.
static __device__ __forceinline__ AcReal
derzz(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
    AcReal samples[STENCIL_ORDER + 1];
#pragma unroll
    for (int i = 0; i <= STENCIL_ORDER; ++i)
        samples[i] = arr[IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z + i - STENCIL_ORDER / 2)];

    return second_derivative(samples, DCONST_REAL(AC_inv_dsz));
}
|
||||
|
||||
/*
|
||||
* =============================================================================
|
||||
* Level 0.2 (Caching functions)
|
||||
* =============================================================================
|
||||
*/
|
||||
|
||||
#include "stencil_assembly.cuh"
|
||||
|
||||
/*
|
||||
typedef struct {
|
||||
AcRealData x;
|
||||
AcRealData y;
|
||||
AcRealData z;
|
||||
} AcReal3Data;
|
||||
|
||||
static __device__ __forceinline__ AcReal3Data
|
||||
read_data(const int i, const int j, const int k,
|
||||
AcReal* __restrict__ buf[], const int3& handle)
|
||||
{
|
||||
AcReal3Data data;
|
||||
|
||||
data.x = read_data(i, j, k, buf, handle.x);
|
||||
data.y = read_data(i, j, k, buf, handle.y);
|
||||
data.z = read_data(i, j, k, buf, handle.z);
|
||||
|
||||
return data;
|
||||
}
|
||||
*/
|
||||
|
||||
/*
|
||||
* =============================================================================
|
||||
* Level 0.3 (Built-in functions available during the Stencil Processing Stage)
|
||||
* =============================================================================
|
||||
*/
|
||||
|
||||
// Component-wise subtraction of two 3-vectors.
static __host__ __device__ __forceinline__ AcReal3
operator-(const AcReal3& a, const AcReal3& b)
{
    AcReal3 result;
    result.x = a.x - b.x;
    result.y = a.y - b.y;
    result.z = a.z - b.z;
    return result;
}
|
||||
|
||||
// Component-wise addition of two 3-vectors.
static __host__ __device__ __forceinline__ AcReal3
operator+(const AcReal3& a, const AcReal3& b)
{
    AcReal3 result;
    result.x = a.x + b.x;
    result.y = a.y + b.y;
    result.z = a.z + b.z;
    return result;
}
|
||||
|
||||
// Component-wise negation.
static __host__ __device__ __forceinline__ AcReal3
operator-(const AcReal3& a)
{
    AcReal3 result;
    result.x = -a.x;
    result.y = -a.y;
    result.z = -a.z;
    return result;
}
|
||||
|
||||
// Scalar-vector product (scalar on the left).
static __host__ __device__ __forceinline__ AcReal3
operator*(const AcReal a, const AcReal3& b)
{
    AcReal3 scaled;
    scaled.x = a * b.x;
    scaled.y = a * b.y;
    scaled.z = a * b.z;
    return scaled;
}
|
||||
|
||||
// Euclidean inner product of two 3-vectors.
static __host__ __device__ __forceinline__ AcReal
dot(const AcReal3& a, const AcReal3& b)
{
    const AcReal xx = a.x * b.x;
    const AcReal yy = a.y * b.y;
    const AcReal zz = a.z * b.z;
    return xx + yy + zz;
}
|
||||
|
||||
// Matrix-vector product: one dot product per matrix row.
static __host__ __device__ __forceinline__ AcReal3
mul(const AcMatrix& aa, const AcReal3& x)
{
    AcReal3 result;
    result.x = dot(aa.row[0], x);
    result.y = dot(aa.row[1], x);
    result.z = dot(aa.row[2], x);
    return result;
}
|
||||
|
||||
// Cross product a x b (right-handed).
static __host__ __device__ __forceinline__ AcReal3
cross(const AcReal3& a, const AcReal3& b)
{
    return (AcReal3){a.y * b.z - a.z * b.y,  //
                     a.z * b.x - a.x * b.z,  //
                     a.x * b.y - a.y * b.x};
}
|
||||
|
||||
// A value is valid when it is a finite number (neither NaN nor infinity).
static __host__ __device__ __forceinline__ bool
is_valid(const AcReal a)
{
    const bool is_number = !isnan(a);
    const bool is_finite = !isinf(a);
    return is_number && is_finite;
}
|
||||
|
||||
// A vector is valid when all three components are valid.
static __host__ __device__ __forceinline__ bool
is_valid(const AcReal3& a)
{
    if (!is_valid(a.x))
        return false;
    if (!is_valid(a.y))
        return false;
    return is_valid(a.z);
}
|
||||
|
||||
|
||||
/*
|
||||
* =============================================================================
|
||||
* Level 1 (Stencil Processing Stage)
|
||||
* =============================================================================
|
||||
*/
|
||||
|
||||
/*
|
||||
* =============================================================================
|
||||
* Level 1.1 (Terms)
|
||||
* =============================================================================
|
||||
*/
|
||||
// Laplacian = trace of the Hessian.
static __device__ __forceinline__ AcReal
laplace(const AcRealData& data)
{
    // Evaluate hessian() once instead of three times; relying on the
    // compiler to CSE repeated calls to an inlined stencil is not guaranteed.
    const AcMatrix hess = hessian(data);
    return hess.row[0].x + hess.row[1].y + hess.row[2].z;
}
|
||||
|
||||
// Divergence of a vector field: d(vx)/dx + d(vy)/dy + d(vz)/dz.
static __device__ __forceinline__ AcReal
divergence(const AcReal3Data& vec)
{
    const AcReal ddx = gradient(vec.x).x;
    const AcReal ddy = gradient(vec.y).y;
    const AcReal ddz = gradient(vec.z).z;
    return ddx + ddy + ddz;
}
|
||||
|
||||
// Component-wise Laplacian of a vector field.
static __device__ __forceinline__ AcReal3
laplace_vec(const AcReal3Data& vec)
{
    AcReal3 result;
    result.x = laplace(vec.x);
    result.y = laplace(vec.y);
    result.z = laplace(vec.z);
    return result;
}
|
||||
|
||||
// Curl of a vector field.
static __device__ __forceinline__ AcReal3
curl(const AcReal3Data& vec)
{
    // Hoist the gradients: each component's gradient was evaluated twice in
    // the original expression; compute each exactly once.
    const AcReal3 gx = gradient(vec.x);
    const AcReal3 gy = gradient(vec.y);
    const AcReal3 gz = gradient(vec.z);
    return (AcReal3){gz.y - gy.z,  //
                     gx.z - gz.x,  //
                     gy.x - gx.y};
}
|
||||
|
||||
// grad(div(vec)): each component i is sum_j d2(vec_j)/(dx_i dx_j).
static __device__ __forceinline__ AcReal3
gradient_of_divergence(const AcReal3Data& vec)
{
    // Hoist the Hessians: the original issued nine hessian() calls for
    // three distinct values.
    const AcMatrix hx = hessian(vec.x);
    const AcMatrix hy = hessian(vec.y);
    const AcMatrix hz = hessian(vec.z);
    return (AcReal3){hx.row[0].x + hy.row[0].y + hz.row[0].z,
                     hx.row[1].x + hy.row[1].y + hz.row[1].z,
                     hx.row[2].x + hy.row[2].y + hz.row[2].z};
}
|
||||
|
||||
// Takes uu gradients and returns the traceless rate-of-strain tensor S.
// Only the upper triangle (plus diagonal) is computed; the lower triangle is
// mirrored since S is symmetric.
static __device__ __forceinline__ AcMatrix
stress_tensor(const AcReal3Data& vec)
{
    // Evaluate each component gradient once; the original recomputed them
    // repeatedly inside the component expressions.
    const AcReal3 gx = gradient(vec.x);
    const AcReal3 gy = gradient(vec.y);
    const AcReal3 gz = gradient(vec.z);

    AcMatrix S;

    S.row[0].x = AcReal(2. / 3.) * gx.x - AcReal(1. / 3.) * (gy.y + gz.z);
    S.row[0].y = AcReal(1. / 2.) * (gx.y + gy.x);
    S.row[0].z = AcReal(1. / 2.) * (gx.z + gz.x);

    S.row[1].y = AcReal(2. / 3.) * gy.y - AcReal(1. / 3.) * (gx.x + gz.z);
    S.row[1].z = AcReal(1. / 2.) * (gy.z + gz.y);

    S.row[2].z = AcReal(2. / 3.) * gz.z - AcReal(1. / 3.) * (gx.x + gy.y);

    // Mirror the symmetric lower triangle.
    S.row[1].x = S.row[0].y;
    S.row[2].x = S.row[0].z;
    S.row[2].y = S.row[1].z;

    return S;
}
|
||||
|
||||
// Double contraction of a matrix with itself: sum_i dot(row_i, row_i).
static __device__ __forceinline__ AcReal
contract(const AcMatrix& mat)
{
    AcReal sum = 0;

#pragma unroll
    for (int row = 0; row < 3; ++row)
        sum += dot(mat.row[row], mat.row[row]);

    return sum;
}
|
||||
|
||||
/*
|
||||
* =============================================================================
|
||||
* Level 1.2 (Equations)
|
||||
* =============================================================================
|
||||
*/
|
||||
// Euclidean norm of a 3-vector.
static __device__ __forceinline__ AcReal
length(const AcReal3& vec)
{
    return sqrt(dot(vec, vec));
}
|
||||
|
||||
// 1 / |vec| via the hardware reciprocal square root.
static __device__ __forceinline__ AcReal
reciprocal_len(const AcReal3& vec)
{
    return rsqrt(dot(vec, vec));
}
|
||||
|
||||
// Unit vector in the direction of vec (one rsqrt instead of sqrt + divide).
static __device__ __forceinline__ AcReal3
normalized(const AcReal3& vec)
{
    return reciprocal_len(vec) * vec;
}
|
||||
|
||||
// Sinusoidal forcing
// https://arxiv.org/pdf/1704.04676.pdf
// Device constants written from the host each substep (see rk3_step_async):
// forcing_vec is the forcing direction/amplitude, forcing_phi its phase.
__constant__ AcReal3 forcing_vec;
__constant__ AcReal forcing_phi;
// Returns the forcing contribution at vertex (i, j, k).
// NOTE(review): these macros stay defined for the rest of the translation
// unit (no #undef) — possibly relied on by the DSL includes; confirm before
// scoping them.
static __device__ __forceinline__ AcReal3
forcing(const int i, const int j, const int k)
{
#define DOMAIN_SIZE_X (DCONST(AC_nx) * DCONST_REAL(AC_dsx))
#define DOMAIN_SIZE_Y (DCONST(AC_ny) * DCONST_REAL(AC_dsy))
#define DOMAIN_SIZE_Z (DCONST(AC_nz) * DCONST_REAL(AC_dsz))
    // Position of this vertex relative to the domain center (despite the
    // name, k_vec is a position, not a wavenumber).
    const AcReal3 k_vec = (AcReal3){(i - DCONST(AC_nx_min)) * DCONST_REAL(AC_dsx) - AcReal(.5) * DOMAIN_SIZE_X,
                                    (j - DCONST(AC_ny_min)) * DCONST_REAL(AC_dsy) - AcReal(.5) * DOMAIN_SIZE_Y,
                                    (k - DCONST(AC_nz_min)) * DCONST_REAL(AC_dsz) - AcReal(.5) * DOMAIN_SIZE_Z};
    AcReal inv_len = reciprocal_len(k_vec);
    // At the exact domain center k_vec ~ 0 and rsqrt diverges; suppress the
    // forcing there instead of propagating NaN/inf.
    if (isnan(inv_len) || isinf(inv_len))
        inv_len = 0;
    if (inv_len > 2) // hack to make it cool
        inv_len = 2;
    const AcReal k_dot_x = dot(k_vec, forcing_vec);

    // cos(k.x + phi) expanded via the angle-addition identity.
    const AcReal waves = cos(k_dot_x)*cos(forcing_phi) - sin(k_dot_x) * sin(forcing_phi);

    // Amplitude falls off with distance squared from the center.
    return inv_len * inv_len * waves * forcing_vec;
}
|
||||
|
||||
|
||||
// Note: LNT0 and LNRHO0 must be set very carefully: if the magnitude is different than other values in the mesh, then we will inherently lose precision
#define LNT0 (AcReal(0.0))
#define LNRHO0 (AcReal(0.0))

// NOTE(review): both constants are zero here; their intended physical
// meaning is not evident from this file — confirm before changing.
#define H_CONST (AcReal(0.0))
#define C_CONST (AcReal(0.0))
|
||||
|
||||
|
||||
|
||||
// One substep of Williamson's (1980) low-storage third-order Runge-Kutta
// scheme. step_number selects the substep (0, 1 or 2 — any other value
// yields NAN). state_previous carries the 2N-storage history from the
// previous substep; rate_of_change is d(state)/dt at the current substep.
template <int step_number>
static __device__ __forceinline__ AcReal
rk3_integrate(const AcReal state_previous, const AcReal state_current,
              const AcReal rate_of_change, const AcReal dt)
{
    // Williamson (1980) low-storage coefficients.
    const AcReal alpha[] = {0, AcReal(.0), AcReal(-5. / 9.), AcReal(-153. / 128.)};
    const AcReal beta[] = {0, AcReal(1. / 3.), AcReal(15. / 16.),
                           AcReal(8. / 15.)};

    // Note the indexing: +1 to avoid an unnecessary warning about "out-of-bounds"
    // access (when accessing beta[step_number-1] even when step_number >= 1)
    switch (step_number) {
    case 0:
        // First substep: no history term yet.
        return state_current + beta[step_number + 1] * rate_of_change * dt;
    case 1: // Fallthrough
    case 2:
        // Later substeps recover the stored increment from
        // (state_current - state_previous) scaled by 1/beta of the previous
        // substep, then blend it in with alpha.
        return state_current +
               beta[step_number + 1] *
                   (alpha[step_number + 1] * (AcReal(1.) / beta[step_number]) *
                        (state_current - state_previous) +
                    rate_of_change * dt);
    default:
        return NAN;
    }
}
|
||||
/*
|
||||
template <int step_number>
|
||||
static __device__ __forceinline__ AcReal
|
||||
rk3_integrate_scal(const AcReal state_previous, const AcReal state_current,
|
||||
const AcReal rate_of_change, const AcReal dt)
|
||||
{
|
||||
// Williamson (1980)
|
||||
const AcReal alpha[] = {AcReal(.0), AcReal(-5. / 9.), AcReal(-153. / 128.)};
|
||||
const AcReal beta[] = {AcReal(1. / 3.), AcReal(15. / 16.),
|
||||
AcReal(8. / 15.)};
|
||||
|
||||
|
||||
switch (step_number) {
|
||||
case 0:
|
||||
return state_current + beta[step_number] * rate_of_change * dt;
|
||||
case 1: // Fallthrough
|
||||
case 2:
|
||||
return state_current +
|
||||
beta[step_number] *
|
||||
(alpha[step_number] * (AcReal(1.) / beta[step_number - 1]) *
|
||||
(state_current - state_previous) +
|
||||
rate_of_change * dt);
|
||||
default:
|
||||
return NAN;
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
// Component-wise application of the scalar RK3 substep to a 3-vector.
template <int step_number>
static __device__ __forceinline__ AcReal3
rk3_integrate(const AcReal3 state_previous, const AcReal3 state_current,
              const AcReal3 rate_of_change, const AcReal dt)
{
    AcReal3 result;
    result.x = rk3_integrate<step_number>(state_previous.x, state_current.x, rate_of_change.x, dt);
    result.y = rk3_integrate<step_number>(state_previous.y, state_current.y, rate_of_change.y, dt);
    result.z = rk3_integrate<step_number>(state_previous.z, state_current.z, rate_of_change.z, dt);
    return result;
}
|
||||
|
||||
// DSL-facing wrapper: forwards to rk3_integrate with the enclosing kernel's
// step_number template argument and reads the current state via value().
#define rk3(state_previous, state_current, rate_of_change, dt)\
    rk3_integrate<step_number>(state_previous, value(state_current), rate_of_change, dt)
|
||||
|
||||
/*
|
||||
template <int step_number>
|
||||
static __device__ __forceinline__ AcReal
|
||||
rk3_integrate(const int idx, const AcReal out, const int handle,
|
||||
const AcRealData& in_cached, const AcReal rate_of_change, const AcReal dt)
|
||||
{
|
||||
return rk3_integrate_scal<step_number>(out, value(in_cached), rate_of_change, dt);
|
||||
}
|
||||
|
||||
template <int step_number>
|
||||
static __device__ __forceinline__ AcReal3
|
||||
rk3_integrate(const int idx, const AcReal3 out, const int3& handle,
|
||||
const AcReal3Data& in_cached, const AcReal3& rate_of_change, const AcReal dt)
|
||||
{
|
||||
return (AcReal3) {
|
||||
rk3_integrate<step_number>(idx, out, handle.x, in_cached.x, rate_of_change.x, dt),
|
||||
rk3_integrate<step_number>(idx, out, handle.y, in_cached.y, rate_of_change.y, dt),
|
||||
rk3_integrate<step_number>(idx, out, handle.z, in_cached.z, rate_of_change.z, dt)
|
||||
};
|
||||
}
|
||||
|
||||
#define RK3(handle, in_cached, rate_of_change, dt) \
|
||||
rk3_integrate<step_number>(idx, buffer.out, handle, in_cached, rate_of_change, dt)
|
||||
*/
|
||||
|
||||
/*
|
||||
* =============================================================================
|
||||
* Level 1.3 (Kernels)
|
||||
* =============================================================================
|
||||
*/
|
||||
|
||||
// Write a scalar value to vertex buffer `handle` at flat index `idx`.
static __device__ void
write(AcReal* __restrict__ out[], const int handle, const int idx, const AcReal value)
{
    out[handle][idx] = value;
}
|
||||
|
||||
// Write a 3-vector value: one component per vertex buffer named in `vec`.
static __device__ void
write(AcReal* __restrict__ out[], const int3 vec, const int idx, const AcReal3 value)
{
    write(out, vec.x, idx, value.x);
    write(out, vec.y, idx, value.y);
    write(out, vec.z, idx, value.z);
}
|
||||
|
||||
// Read a scalar from output vertex buffer `handle` at flat index `idx`.
static __device__ AcReal
read_out(const int idx, AcReal* __restrict__ field[], const int handle)
{
    return field[handle][idx];
}
|
||||
|
||||
// Read a 3-vector: one component per vertex buffer named in `handle`.
static __device__ AcReal3
read_out(const int idx, AcReal* __restrict__ field[], const int3 handle)
{
    return (AcReal3) { read_out(idx, field, handle.x),
                       read_out(idx, field, handle.y),
                       read_out(idx, field, handle.z) };
}
|
||||
|
||||
// DSL-facing accessors. `buffer`, `idx` and `vertexIdx` are introduced by
// the GEN_KERNEL_* boilerplate macros below, so these may only be used
// inside generated kernel bodies.
#define WRITE_OUT(handle, value) (write(buffer.out, handle, idx, value))
#define READ(handle) (read_data(vertexIdx, buffer.in, handle))
#define READ_OUT(handle) (read_out(idx, buffer.out, handle))
|
||||
|
||||
// also write for clarity here also, not for the DSL
|
||||
//#define WRITE(HANDLE) (write(idx, ...)) s.t. we don't have to insert boilerplat in the mid of the function
|
||||
|
||||
// Parameter list shared by all generated kernels: inclusive start index,
// exclusive end index, and the vertex buffers to operate on.
#define GEN_KERNEL_PARAM_BOILERPLATE \
    const int3 start, const int3 end, VertexBufferArray buffer

// Common kernel prologue: derives the per-thread vertex index from the
// launch configuration, returns early for threads past the end of the
// subdomain, asserts the index is inside the computational domain, and
// computes the flat array index `idx` used by READ/WRITE_OUT.
#define GEN_KERNEL_BUILTIN_VARIABLES_BOILERPLATE() \
    const int3 vertexIdx = (int3){threadIdx.x + blockIdx.x * blockDim.x + start.x,\
                                  threadIdx.y + blockIdx.y * blockDim.y + start.y,\
                                  threadIdx.z + blockIdx.z * blockDim.z + start.z};\
    if (vertexIdx.x >= end.x || vertexIdx.y >= end.y || vertexIdx.z >= end.z)\
        return;\
    \
    \
    assert(vertexIdx.x < DCONST(AC_nx_max) && vertexIdx.y < DCONST(AC_ny_max) &&\
           vertexIdx.z < DCONST(AC_nz_max));\
    \
    assert(vertexIdx.x >= DCONST(AC_nx_min) && vertexIdx.y >= DCONST(AC_ny_min) &&\
           vertexIdx.z >= DCONST(AC_nz_min));\
    \
    const int idx = IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z);
|
||||
|
||||
#include "stencil_process.cuh"
|
||||
|
||||
/*
|
||||
* =============================================================================
|
||||
* Level 2 (Host calls)
|
||||
* =============================================================================
|
||||
*/
|
||||
|
||||
// Uniform pseudo-random AcReal in [0, 1]. Uses rand(): not seeded here and
// not thread-safe — acceptable for the best-effort forcing perturbation in
// rk3_step_async.
static AcReal
randf(void)
{
    return AcReal(rand()) / AcReal(RAND_MAX);
}
|
||||
|
||||
// Launch one RK3 substep asynchronously on `stream`, covering vertices in
// [start, end) with threadblock dimensions `tpb`. step_number (0..2) selects
// the compile-time solve<> instantiation; dt is the timestep.
// When LFORCING is set, also uploads new forcing parameters to the device
// constants forcing_vec / forcing_phi before the launch.
AcResult
rk3_step_async(const cudaStream_t stream, const dim3& tpb,
               const int3& start, const int3& end, const int& step_number,
               const AcReal dt, const AcMeshInfo& /*mesh_info*/,
               VertexBufferArray* buffer)
{
    /////////////////// Forcing
#if LFORCING
    const AcReal ff_scale = AcReal(.2);
    // `ff` is static: the small random rotation accumulates across calls,
    // so the forcing direction drifts slowly over time.
    static AcReal3 ff = ff_scale * (AcReal3){1, 0, 0};
    const AcReal radians = randf() * AcReal(2*M_PI) / 360 / 8;
    const AcMatrix rotz = create_rotz(radians);
    ff = mul(rotz, ff);
    // NOTE(review): the return values of these async copies are not checked.
    cudaMemcpyToSymbolAsync(forcing_vec, &ff, sizeof(ff), 0, cudaMemcpyHostToDevice, stream);

    const AcReal ff_phi = AcReal(M_PI);//AcReal(2 * M_PI) * randf();
    cudaMemcpyToSymbolAsync(forcing_phi, &ff_phi, sizeof(ff_phi), 0, cudaMemcpyHostToDevice, stream);
#endif // LFORCING
    //////////////////////////

    // Subdomain extents; blocks-per-grid rounds up so partial blocks at the
    // domain edge are still launched (kernels bounds-check internally).
    const int nx = end.x - start.x;
    const int ny = end.y - start.y;
    const int nz = end.z - start.z;

    const dim3 bpg(
        (unsigned int)ceil(nx / AcReal(tpb.x)),
        (unsigned int)ceil(ny / AcReal(tpb.y)),
        (unsigned int)ceil(nz / AcReal(tpb.z)));

    // step_number is a template parameter of solve<>, so dispatch at runtime
    // to the matching instantiation.
    if (step_number == 0)
        solve<0><<<bpg, tpb, 0, stream>>>(start, end, *buffer, dt);
    else if (step_number == 1)
        solve<1><<<bpg, tpb, 0, stream>>>(start, end, *buffer, dt);
    else
        solve<2><<<bpg, tpb, 0, stream>>>(start, end, *buffer, dt);

    ERRCHK_CUDA_KERNEL();
    return AC_SUCCESS;
}
|
@@ -1,293 +0,0 @@
|
||||
/*
|
||||
Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.
|
||||
|
||||
This file is part of Astaroth.
|
||||
|
||||
Astaroth is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
Astaroth is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file
|
||||
* \brief Brief info.
|
||||
*
|
||||
* Detailed info.
|
||||
*
|
||||
*/
|
||||
#include "common.cuh"
|
||||
#include "integration.cuh"
|
||||
|
||||
#include "src/core/errchk.h"
|
||||
#include "src/core/math_utils.h"
|
||||
|
||||
#include <assert.h>
|
||||
|
||||
__constant__ AcMeshInfo d_mesh_info; // Extern in kernels/common.cuh.
|
||||
|
||||
static_assert(NUM_VTXBUF_HANDLES > 0, "ERROR: At least one uniform ScalarField must be declared.");
|
||||
|
||||
// Device info (conservative hardware limits used by the auto-optimizer).
#define REGISTERS_PER_THREAD (255)
#define MAX_REGISTERS_PER_BLOCK (65536)
#define MAX_THREADS_PER_BLOCK (1024)
#define WARP_SIZE (32)

// Parenthesize the macro arguments so expressions such as
// make_int3(a + b, c, d) cast the whole expression, not just its first
// operand.
#define make_int3(a, b, c) \
    (int3) { (int)(a), (int)(b), (int)(c) }
|
||||
|
||||
template <int step_number>
|
||||
static __device__ __forceinline__ AcReal
|
||||
rk3_integrate(const AcReal state_previous, const AcReal state_current, const AcReal rate_of_change,
|
||||
const AcReal dt)
|
||||
{
|
||||
// Williamson (1980)
|
||||
const AcReal alpha[] = {0, AcReal(.0), AcReal(-5. / 9.), AcReal(-153. / 128.)};
|
||||
const AcReal beta[] = {0, AcReal(1. / 3.), AcReal(15. / 16.), AcReal(8. / 15.)};
|
||||
|
||||
// Note the indexing: +1 to avoid an unnecessary warning about "out-of-bounds"
|
||||
// access (when accessing beta[step_number-1] even when step_number >= 1)
|
||||
switch (step_number) {
|
||||
case 0:
|
||||
return state_current + beta[step_number + 1] * rate_of_change * dt;
|
||||
case 1: // Fallthrough
|
||||
case 2:
|
||||
return state_current +
|
||||
beta[step_number + 1] * (alpha[step_number + 1] * (AcReal(1.) / beta[step_number]) *
|
||||
(state_current - state_previous) +
|
||||
rate_of_change * dt);
|
||||
default:
|
||||
return NAN;
|
||||
}
|
||||
}
|
||||
|
||||
template <int step_number>
|
||||
static __device__ __forceinline__ AcReal3
|
||||
rk3_integrate(const AcReal3 state_previous, const AcReal3 state_current,
|
||||
const AcReal3 rate_of_change, const AcReal dt)
|
||||
{
|
||||
return (AcReal3){
|
||||
rk3_integrate<step_number>(state_previous.x, state_current.x, rate_of_change.x, dt),
|
||||
rk3_integrate<step_number>(state_previous.y, state_current.y, rate_of_change.y, dt),
|
||||
rk3_integrate<step_number>(state_previous.z, state_current.z, rate_of_change.z, dt)};
|
||||
}
|
||||
|
||||
#define rk3(state_previous, state_current, rate_of_change, dt) \
|
||||
rk3_integrate<step_number>(state_previous, value(state_current), rate_of_change, dt)
|
||||
|
||||
static __device__ void
|
||||
write(AcReal* __restrict__ out[], const int handle, const int idx, const AcReal value)
|
||||
{
|
||||
out[handle][idx] = value;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ void
|
||||
write(AcReal* __restrict__ out[], const int3 vec, const int idx, const AcReal3 value)
|
||||
{
|
||||
write(out, vec.x, idx, value.x);
|
||||
write(out, vec.y, idx, value.y);
|
||||
write(out, vec.z, idx, value.z);
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ AcReal
|
||||
read_out(const int idx, AcReal* __restrict__ field[], const int handle)
|
||||
{
|
||||
return field[handle][idx];
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ AcReal3
|
||||
read_out(const int idx, AcReal* __restrict__ field[], const int3 handle)
|
||||
{
|
||||
return (AcReal3){read_out(idx, field, handle.x), read_out(idx, field, handle.y),
|
||||
read_out(idx, field, handle.z)};
|
||||
}
|
||||
|
||||
#define WRITE_OUT(handle, value) (write(buffer.out, handle, idx, value))
|
||||
#define READ(handle) (read_data(vertexIdx, globalVertexIdx, buffer.in, handle))
|
||||
#define READ_OUT(handle) (read_out(idx, buffer.out, handle))
|
||||
|
||||
#define GEN_PREPROCESSED_PARAM_BOILERPLATE const int3 &vertexIdx, const int3 &globalVertexIdx
|
||||
#define GEN_KERNEL_PARAM_BOILERPLATE const int3 start, const int3 end, VertexBufferArray buffer
|
||||
|
||||
#define GEN_KERNEL_BUILTIN_VARIABLES_BOILERPLATE() \
|
||||
const int3 vertexIdx = (int3){threadIdx.x + blockIdx.x * blockDim.x + start.x, \
|
||||
threadIdx.y + blockIdx.y * blockDim.y + start.y, \
|
||||
threadIdx.z + blockIdx.z * blockDim.z + start.z}; \
|
||||
const int3 globalVertexIdx = (int3){d_multigpu_offset.x + vertexIdx.x, \
|
||||
d_multigpu_offset.y + vertexIdx.y, \
|
||||
d_multigpu_offset.z + vertexIdx.z}; \
|
||||
(void)globalVertexIdx; \
|
||||
if (vertexIdx.x >= end.x || vertexIdx.y >= end.y || vertexIdx.z >= end.z) \
|
||||
return; \
|
||||
\
|
||||
assert(vertexIdx.x < DCONST(AC_nx_max) && vertexIdx.y < DCONST(AC_ny_max) && \
|
||||
vertexIdx.z < DCONST(AC_nz_max)); \
|
||||
\
|
||||
assert(vertexIdx.x >= DCONST(AC_nx_min) && vertexIdx.y >= DCONST(AC_ny_min) && \
|
||||
vertexIdx.z >= DCONST(AC_nz_min)); \
|
||||
\
|
||||
const int idx = IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z);
|
||||
|
||||
#define GEN_DEVICE_FUNC_HOOK(identifier) \
|
||||
template <int step_number> \
|
||||
AcResult acDeviceKernel_##identifier(const cudaStream_t stream, const int3 start, \
|
||||
const int3 end, VertexBufferArray vba) \
|
||||
{ \
|
||||
\
|
||||
const dim3 tpb(32, 1, 4); \
|
||||
\
|
||||
const int3 n = end - start; \
|
||||
const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), \
|
||||
(unsigned int)ceil(n.y / AcReal(tpb.y)), \
|
||||
(unsigned int)ceil(n.z / AcReal(tpb.z))); \
|
||||
\
|
||||
identifier<step_number><<<bpg, tpb, 0, stream>>>(start, end, vba); \
|
||||
ERRCHK_CUDA_KERNEL(); \
|
||||
\
|
||||
return AC_SUCCESS; \
|
||||
}
|
||||
|
||||
#include "user_kernels.h"
|
||||
|
||||
static dim3 rk3_tpb(32, 1, 4);
|
||||
|
||||
// Exhaustively time the RK3 solve kernel over candidate threadblock shapes
// and store the fastest in the module-level rk3_tpb (used by
// acKernelIntegrateSubstep). Synchronous; intended to run once at startup.
AcResult
acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferArray vba)
{
    // RK3
    dim3 best_dims(0, 0, 0);
    float best_time     = INFINITY;
    const int num_iterations = 10;

    for (int z = 1; z <= MAX_THREADS_PER_BLOCK; ++z) {
        for (int y = 1; y <= MAX_THREADS_PER_BLOCK; ++y) {
            for (int x = WARP_SIZE; x <= MAX_THREADS_PER_BLOCK; x += WARP_SIZE) {

                // Candidate must fit the subdomain and the hardware limits;
                // growing x can only make it worse, so break out of the x loop.
                if (x > end.x - start.x || y > end.y - start.y || z > end.z - start.z)
                    break;
                if (x * y * z > MAX_THREADS_PER_BLOCK)
                    break;

                if (x * y * z * REGISTERS_PER_THREAD > MAX_REGISTERS_PER_BLOCK)
                    break;

                // Only consider whole-warp block sizes.
                if (((x * y * z) % WARP_SIZE) != 0)
                    continue;

                const dim3 tpb(x, y, z);
                const int3 n = end - start;
                const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), //
                               (unsigned int)ceil(n.y / AcReal(tpb.y)), //
                               (unsigned int)ceil(n.z / AcReal(tpb.z)));

                cudaDeviceSynchronize();
                if (cudaGetLastError() != cudaSuccess) // resets the error if any
                    continue;

                // printf("(%d, %d, %d)\n", x, y, z);

                cudaEvent_t tstart, tstop;
                cudaEventCreate(&tstart);
                cudaEventCreate(&tstop);

                // #ifdef AC_dt
                // acDeviceLoadScalarUniform(device, STREAM_DEFAULT, AC_dt, FLT_EPSILON); // TODO
                // note, temporarily disabled
                /*#else
                    ERROR("FATAL ERROR: acDeviceAutoOptimize() or
                acDeviceIntegrateSubstep() was " "called, but AC_dt was not defined. Either define
                it or call the generated " "device function acDeviceKernel_<kernel name> which does
                not require the " "timestep to be defined.\n"); #endif*/

                cudaEventRecord(tstart); // ---------------------------------------- Timing start
                for (int i = 0; i < num_iterations; ++i)
                    solve<2><<<bpg, tpb>>>(start, end, vba);

                cudaEventRecord(tstop); // ----------------------------------------- Timing end
                cudaEventSynchronize(tstop);
                float milliseconds = 0;
                cudaEventElapsedTime(&milliseconds, tstart, tstop);

                // Fix: the events were previously leaked — one pair per
                // candidate configuration, never destroyed.
                cudaEventDestroy(tstart);
                cudaEventDestroy(tstop);

                ERRCHK_CUDA_KERNEL_ALWAYS();
                if (milliseconds < best_time) {
                    best_time = milliseconds;
                    best_dims = tpb;
                }
            }
        }
    }
#if VERBOSE_PRINTING
    printf(
        "Auto-optimization done. The best threadblock dimensions for rkStep: (%d, %d, %d) %f ms\n",
        best_dims.x, best_dims.y, best_dims.z, double(best_time) / num_iterations);
#endif
    /*
    FILE* fp = fopen("../config/rk3_tbdims.cuh", "w");
    ERRCHK(fp);
    fprintf(fp, "%d, %d, %d\n", best_dims.x, best_dims.y, best_dims.z);
    fclose(fp);
    */

    rk3_tpb = best_dims;
    return AC_SUCCESS;
}
|
||||
|
||||
// Launch one RK3 substep on `stream` for vertices in [start, end), using the
// threadblock dimensions chosen by acKernelAutoOptimizeIntegration (or the
// default if it was never run). step_number (0..2) picks the compile-time
// solve<> instantiation.
AcResult
acKernelIntegrateSubstep(const cudaStream_t stream, const int step_number, const int3 start,
                         const int3 end, VertexBufferArray vba)
{
    const dim3 tpb = rk3_tpb;

    // Round blocks-per-grid up so edge vertices are covered; the kernel
    // prologue bounds-checks each thread.
    const int3 n = end - start;
    const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), //
                   (unsigned int)ceil(n.y / AcReal(tpb.y)), //
                   (unsigned int)ceil(n.z / AcReal(tpb.z)));

    //#ifdef AC_dt
    // acDeviceLoadScalarUniform(device, stream, AC_dt, dt);
    /*#else
        (void)dt;
        ERROR("FATAL ERROR: acDeviceAutoOptimize() or acDeviceIntegrateSubstep() was "
              "called, but AC_dt was not defined. Either define it or call the generated "
              "device function acDeviceKernel_<kernel name> which does not require the "
              "timestep to be defined.\n");
    #endif*/
    // step_number is a template parameter of solve<>, so dispatch at runtime
    // to the matching instantiation.
    if (step_number == 0)
        solve<0><<<bpg, tpb, 0, stream>>>(start, end, vba);
    else if (step_number == 1)
        solve<1><<<bpg, tpb, 0, stream>>>(start, end, vba);
    else
        solve<2><<<bpg, tpb, 0, stream>>>(start, end, vba);

    ERRCHK_CUDA_KERNEL();

    return AC_SUCCESS;
}
|
||||
|
||||
// Minimal kernel that touches one device constant of each parameter type and
// exercises the acComplex arithmetic overloads.
// NOTE(review): appears to exist only to force these symbols/overloads to be
// referenced and to surface device-configuration errors via acKernelDummy();
// confirm before removing the no-op expressions.
static __global__ void
dummy_kernel(void)
{
    DCONST((AcIntParam)0);
    DCONST((AcInt3Param)0);
    DCONST((AcRealParam)0);
    DCONST((AcReal3Param)0);
    acComplex a = exp(AcReal(1) * acComplex(1, 1) * AcReal(1));
    a* a;
}
|
||||
|
||||
// Launch the 1x1 dummy kernel and check for errors unconditionally; serves
// as a cheap sanity check that the device is usable and the constant-memory
// machinery compiles/links.
AcResult
acKernelDummy(void)
{
    dummy_kernel<<<1, 1>>>();
    ERRCHK_CUDA_KERNEL_ALWAYS();
    return AC_SUCCESS;
}
|
@@ -1,42 +1,259 @@
|
||||
/*
|
||||
Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.
|
||||
|
||||
This file is part of Astaroth.
|
||||
|
||||
Astaroth is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
Astaroth is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file
|
||||
* \brief Brief info.
|
||||
*
|
||||
* Detailed info.
|
||||
*
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
static_assert(NUM_VTXBUF_HANDLES > 0, "ERROR: At least one uniform ScalarField must be declared.");
|
||||
|
||||
// Device info (conservative hardware limits used by the auto-optimizer).
#define REGISTERS_PER_THREAD (255)
#define MAX_REGISTERS_PER_BLOCK (65536)
#define MAX_THREADS_PER_BLOCK (1024)
#define WARP_SIZE (32)

// Parenthesize the macro arguments so expressions such as
// make_int3(a + b, c, d) cast the whole expression, not just its first
// operand.
#define make_int3(a, b, c) \
    (int3) { (int)(a), (int)(b), (int)(c) }
|
||||
|
||||
template <int step_number>
|
||||
static __device__ __forceinline__ AcReal
|
||||
rk3_integrate(const AcReal state_previous, const AcReal state_current, const AcReal rate_of_change,
|
||||
const AcReal dt)
|
||||
{
|
||||
// Williamson (1980)
|
||||
const AcReal alpha[] = {0, AcReal(.0), AcReal(-5. / 9.), AcReal(-153. / 128.)};
|
||||
const AcReal beta[] = {0, AcReal(1. / 3.), AcReal(15. / 16.), AcReal(8. / 15.)};
|
||||
|
||||
// Note the indexing: +1 to avoid an unnecessary warning about "out-of-bounds"
|
||||
// access (when accessing beta[step_number-1] even when step_number >= 1)
|
||||
switch (step_number) {
|
||||
case 0:
|
||||
return state_current + beta[step_number + 1] * rate_of_change * dt;
|
||||
case 1: // Fallthrough
|
||||
case 2:
|
||||
return state_current +
|
||||
beta[step_number + 1] * (alpha[step_number + 1] * (AcReal(1.) / beta[step_number]) *
|
||||
(state_current - state_previous) +
|
||||
rate_of_change * dt);
|
||||
default:
|
||||
return NAN;
|
||||
}
|
||||
}
|
||||
|
||||
template <int step_number>
|
||||
static __device__ __forceinline__ AcReal3
|
||||
rk3_integrate(const AcReal3 state_previous, const AcReal3 state_current,
|
||||
const AcReal3 rate_of_change, const AcReal dt)
|
||||
{
|
||||
return (AcReal3){
|
||||
rk3_integrate<step_number>(state_previous.x, state_current.x, rate_of_change.x, dt),
|
||||
rk3_integrate<step_number>(state_previous.y, state_current.y, rate_of_change.y, dt),
|
||||
rk3_integrate<step_number>(state_previous.z, state_current.z, rate_of_change.z, dt)};
|
||||
}
|
||||
|
||||
#define rk3(state_previous, state_current, rate_of_change, dt) \
|
||||
rk3_integrate<step_number>(state_previous, value(state_current), rate_of_change, dt)
|
||||
|
||||
static __device__ void
|
||||
write(AcReal* __restrict__ out[], const int handle, const int idx, const AcReal value)
|
||||
{
|
||||
out[handle][idx] = value;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ void
|
||||
write(AcReal* __restrict__ out[], const int3 vec, const int idx, const AcReal3 value)
|
||||
{
|
||||
write(out, vec.x, idx, value.x);
|
||||
write(out, vec.y, idx, value.y);
|
||||
write(out, vec.z, idx, value.z);
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ AcReal
|
||||
read_out(const int idx, AcReal* __restrict__ field[], const int handle)
|
||||
{
|
||||
return field[handle][idx];
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ AcReal3
|
||||
read_out(const int idx, AcReal* __restrict__ field[], const int3 handle)
|
||||
{
|
||||
return (AcReal3){read_out(idx, field, handle.x), read_out(idx, field, handle.y),
|
||||
read_out(idx, field, handle.z)};
|
||||
}
|
||||
|
||||
#define WRITE_OUT(handle, value) (write(buffer.out, handle, idx, value))
|
||||
#define READ(handle) (read_data(vertexIdx, globalVertexIdx, buffer.in, handle))
|
||||
#define READ_OUT(handle) (read_out(idx, buffer.out, handle))
|
||||
|
||||
#define GEN_PREPROCESSED_PARAM_BOILERPLATE const int3 &vertexIdx, const int3 &globalVertexIdx
|
||||
#define GEN_KERNEL_PARAM_BOILERPLATE const int3 start, const int3 end, VertexBufferArray buffer
|
||||
|
||||
#define GEN_KERNEL_BUILTIN_VARIABLES_BOILERPLATE() \
|
||||
const int3 vertexIdx = (int3){threadIdx.x + blockIdx.x * blockDim.x + start.x, \
|
||||
threadIdx.y + blockIdx.y * blockDim.y + start.y, \
|
||||
threadIdx.z + blockIdx.z * blockDim.z + start.z}; \
|
||||
const int3 globalVertexIdx = (int3){d_multigpu_offset.x + vertexIdx.x, \
|
||||
d_multigpu_offset.y + vertexIdx.y, \
|
||||
d_multigpu_offset.z + vertexIdx.z}; \
|
||||
(void)globalVertexIdx; \
|
||||
if (vertexIdx.x >= end.x || vertexIdx.y >= end.y || vertexIdx.z >= end.z) \
|
||||
return; \
|
||||
\
|
||||
assert(vertexIdx.x < DCONST(AC_nx_max) && vertexIdx.y < DCONST(AC_ny_max) && \
|
||||
vertexIdx.z < DCONST(AC_nz_max)); \
|
||||
\
|
||||
assert(vertexIdx.x >= DCONST(AC_nx_min) && vertexIdx.y >= DCONST(AC_ny_min) && \
|
||||
vertexIdx.z >= DCONST(AC_nz_min)); \
|
||||
\
|
||||
const int idx = IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z);
|
||||
|
||||
#define GEN_DEVICE_FUNC_HOOK(identifier) \
|
||||
template <int step_number> \
|
||||
AcResult acDeviceKernel_##identifier(const cudaStream_t stream, const int3 start, \
|
||||
const int3 end, VertexBufferArray vba) \
|
||||
{ \
|
||||
\
|
||||
const dim3 tpb(32, 1, 4); \
|
||||
\
|
||||
const int3 n = end - start; \
|
||||
const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), \
|
||||
(unsigned int)ceil(n.y / AcReal(tpb.y)), \
|
||||
(unsigned int)ceil(n.z / AcReal(tpb.z))); \
|
||||
\
|
||||
identifier<step_number><<<bpg, tpb, 0, stream>>>(start, end, vba); \
|
||||
ERRCHK_CUDA_KERNEL(); \
|
||||
\
|
||||
return AC_SUCCESS; \
|
||||
}
|
||||
|
||||
#include "user_kernels.h"
|
||||
|
||||
static dim3 rk3_tpb(32, 1, 4);
|
||||
|
||||
AcResult
|
||||
acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferArray vba)
|
||||
{
|
||||
// RK3
|
||||
dim3 best_dims(0, 0, 0);
|
||||
float best_time = INFINITY;
|
||||
const int num_iterations = 10;
|
||||
|
||||
for (int z = 1; z <= MAX_THREADS_PER_BLOCK; ++z) {
|
||||
for (int y = 1; y <= MAX_THREADS_PER_BLOCK; ++y) {
|
||||
for (int x = WARP_SIZE; x <= MAX_THREADS_PER_BLOCK; x += WARP_SIZE) {
|
||||
|
||||
if (x > end.x - start.x || y > end.y - start.y || z > end.z - start.z)
|
||||
break;
|
||||
if (x * y * z > MAX_THREADS_PER_BLOCK)
|
||||
break;
|
||||
|
||||
if (x * y * z * REGISTERS_PER_THREAD > MAX_REGISTERS_PER_BLOCK)
|
||||
break;
|
||||
|
||||
if (((x * y * z) % WARP_SIZE) != 0)
|
||||
continue;
|
||||
|
||||
const dim3 tpb(x, y, z);
|
||||
const int3 n = end - start;
|
||||
const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), //
|
||||
(unsigned int)ceil(n.y / AcReal(tpb.y)), //
|
||||
(unsigned int)ceil(n.z / AcReal(tpb.z)));
|
||||
|
||||
cudaDeviceSynchronize();
|
||||
if (cudaGetLastError() != cudaSuccess) // resets the error if any
|
||||
continue;
|
||||
|
||||
// printf("(%d, %d, %d)\n", x, y, z);
|
||||
|
||||
cudaEvent_t tstart, tstop;
|
||||
cudaEventCreate(&tstart);
|
||||
cudaEventCreate(&tstop);
|
||||
|
||||
// #ifdef AC_dt
|
||||
// acDeviceLoadScalarUniform(device, STREAM_DEFAULT, AC_dt, FLT_EPSILON); // TODO
|
||||
// note, temporarily disabled
|
||||
/*#else
|
||||
ERROR("FATAL ERROR: acDeviceAutoOptimize() or
|
||||
acDeviceIntegrateSubstep() was " "called, but AC_dt was not defined. Either define
|
||||
it or call the generated " "device function acDeviceKernel_<kernel name> which does
|
||||
not require the " "timestep to be defined.\n"); #endif*/
|
||||
|
||||
cudaEventRecord(tstart); // ---------------------------------------- Timing start
|
||||
for (int i = 0; i < num_iterations; ++i)
|
||||
solve<2><<<bpg, tpb>>>(start, end, vba);
|
||||
|
||||
cudaEventRecord(tstop); // ----------------------------------------- Timing end
|
||||
cudaEventSynchronize(tstop);
|
||||
float milliseconds = 0;
|
||||
cudaEventElapsedTime(&milliseconds, tstart, tstop);
|
||||
|
||||
ERRCHK_CUDA_KERNEL_ALWAYS();
|
||||
if (milliseconds < best_time) {
|
||||
best_time = milliseconds;
|
||||
best_dims = tpb;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#if VERBOSE_PRINTING
|
||||
printf(
|
||||
"Auto-optimization done. The best threadblock dimensions for rkStep: (%d, %d, %d) %f ms\n",
|
||||
best_dims.x, best_dims.y, best_dims.z, double(best_time) / num_iterations);
|
||||
#endif
|
||||
/*
|
||||
FILE* fp = fopen("../config/rk3_tbdims.cuh", "w");
|
||||
ERRCHK(fp);
|
||||
fprintf(fp, "%d, %d, %d\n", best_dims.x, best_dims.y, best_dims.z);
|
||||
fclose(fp);
|
||||
*/
|
||||
|
||||
AcResult acKernelDummy(void);
|
||||
rk3_tpb = best_dims;
|
||||
return AC_SUCCESS;
|
||||
}
|
||||
|
||||
AcResult acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferArray vba);
|
||||
AcResult
|
||||
acKernelIntegrateSubstep(const cudaStream_t stream, const int step_number, const int3 start,
|
||||
const int3 end, VertexBufferArray vba)
|
||||
{
|
||||
const dim3 tpb = rk3_tpb;
|
||||
|
||||
AcResult acKernelIntegrateSubstep(const cudaStream_t stream, const int step_number,
|
||||
const int3 start, const int3 end, VertexBufferArray vba);
|
||||
const int3 n = end - start;
|
||||
const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), //
|
||||
(unsigned int)ceil(n.y / AcReal(tpb.y)), //
|
||||
(unsigned int)ceil(n.z / AcReal(tpb.z)));
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
#endif
|
||||
//#ifdef AC_dt
|
||||
// acDeviceLoadScalarUniform(device, stream, AC_dt, dt);
|
||||
/*#else
|
||||
(void)dt;
|
||||
ERROR("FATAL ERROR: acDeviceAutoOptimize() or acDeviceIntegrateSubstep() was "
|
||||
"called, but AC_dt was not defined. Either define it or call the generated "
|
||||
"device function acDeviceKernel_<kernel name> which does not require the "
|
||||
"timestep to be defined.\n");
|
||||
#endif*/
|
||||
if (step_number == 0)
|
||||
solve<0><<<bpg, tpb, 0, stream>>>(start, end, vba);
|
||||
else if (step_number == 1)
|
||||
solve<1><<<bpg, tpb, 0, stream>>>(start, end, vba);
|
||||
else
|
||||
solve<2><<<bpg, tpb, 0, stream>>>(start, end, vba);
|
||||
|
||||
ERRCHK_CUDA_KERNEL();
|
||||
|
||||
return AC_SUCCESS;
|
||||
}
|
||||
|
||||
static __global__ void
|
||||
dummy_kernel(void)
|
||||
{
|
||||
DCONST((AcIntParam)0);
|
||||
DCONST((AcInt3Param)0);
|
||||
DCONST((AcRealParam)0);
|
||||
DCONST((AcReal3Param)0);
|
||||
acComplex a = exp(AcReal(1) * acComplex(1, 1) * AcReal(1));
|
||||
a* a;
|
||||
}
|
||||
|
||||
AcResult
|
||||
acKernelDummy(void)
|
||||
{
|
||||
dummy_kernel<<<1, 1>>>();
|
||||
ERRCHK_CUDA_KERNEL_ALWAYS();
|
||||
return AC_SUCCESS;
|
||||
}
|
||||
|
177
src/core/kernels/kernels.cu
Normal file
177
src/core/kernels/kernels.cu
Normal file
@@ -0,0 +1,177 @@
|
||||
#include "kernels.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <cuComplex.h>
|
||||
|
||||
#include "errchk.h"
|
||||
#include "math_utils.h"
|
||||
|
||||
__device__ __constant__ AcMeshInfo d_mesh_info;
|
||||
|
||||
static int __device__ __forceinline__
|
||||
DCONST(const AcIntParam param)
|
||||
{
|
||||
return d_mesh_info.int_params[param];
|
||||
}
|
||||
static int3 __device__ __forceinline__
|
||||
DCONST(const AcInt3Param param)
|
||||
{
|
||||
return d_mesh_info.int3_params[param];
|
||||
}
|
||||
static AcReal __device__ __forceinline__
|
||||
DCONST(const AcRealParam param)
|
||||
{
|
||||
return d_mesh_info.real_params[param];
|
||||
}
|
||||
static AcReal3 __device__ __forceinline__
|
||||
DCONST(const AcReal3Param param)
|
||||
{
|
||||
return d_mesh_info.real3_params[param];
|
||||
}
|
||||
static __device__ constexpr VertexBufferHandle
|
||||
DCONST(const VertexBufferHandle handle)
|
||||
{
|
||||
return handle;
|
||||
}
|
||||
#define DEVICE_VTXBUF_IDX(i, j, k) ((i) + (j)*DCONST(AC_mx) + (k)*DCONST(AC_mxy))
|
||||
#define DEVICE_1D_COMPDOMAIN_IDX(i, j, k) ((i) + (j)*DCONST(AC_nx) + (k)*DCONST(AC_nxy))
|
||||
#define globalGridN (d_mesh_info.int3_params[AC_global_grid_n])
|
||||
//#define globalMeshM // Placeholder
|
||||
//#define localMeshN // Placeholder
|
||||
//#define localMeshM // Placeholder
|
||||
//#define localMeshN_min // Placeholder
|
||||
//#define globalMeshN_min // Placeholder
|
||||
#define d_multigpu_offset (d_mesh_info.int3_params[AC_multigpu_offset])
|
||||
//#define d_multinode_offset (d_mesh_info.int3_params[AC_multinode_offset]) // Placeholder
|
||||
|
||||
static __device__ constexpr int
|
||||
IDX(const int i)
|
||||
{
|
||||
return i;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ int
|
||||
IDX(const int i, const int j, const int k)
|
||||
{
|
||||
return DEVICE_VTXBUF_IDX(i, j, k);
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ int
|
||||
IDX(const int3 idx)
|
||||
{
|
||||
return DEVICE_VTXBUF_IDX(idx.x, idx.y, idx.z);
|
||||
}
|
||||
|
||||
#if AC_DOUBLE_PRECISION == 1
|
||||
typedef cuDoubleComplex acComplex;
|
||||
#define acComplex(x, y) make_cuDoubleComplex(x, y)
|
||||
#else
|
||||
typedef cuFloatComplex acComplex;
|
||||
#define acComplex(x, y) make_cuFloatComplex(x, y)
|
||||
#define exp(x) expf(x)
|
||||
#define sin(x) sinf(x)
|
||||
#define cos(x) cosf(x)
|
||||
#define sqrt(x) sqrtf(x)
|
||||
#endif
|
||||
|
||||
static __device__ inline acComplex
|
||||
exp(const acComplex& val)
|
||||
{
|
||||
return acComplex(exp(val.x) * cos(val.y), exp(val.x) * sin(val.y));
|
||||
}
|
||||
static __device__ inline acComplex operator*(const AcReal& a, const acComplex& b)
|
||||
{
|
||||
return (acComplex){a * b.x, a * b.y};
|
||||
}
|
||||
|
||||
static __device__ inline acComplex operator*(const acComplex& b, const AcReal& a)
|
||||
{
|
||||
return (acComplex){a * b.x, a * b.y};
|
||||
}
|
||||
|
||||
static __device__ inline acComplex operator*(const acComplex& a, const acComplex& b)
|
||||
{
|
||||
return (acComplex){a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x};
|
||||
}
|
||||
|
||||
// Kernels /////////////////////////////////////////////////////////////////////
|
||||
#include "boundconds.cuh"
|
||||
#include "integration.cuh"
|
||||
#include "packing.cuh"
|
||||
#include "reductions.cuh"
|
||||
|
||||
AcResult
|
||||
acDeviceLoadMeshInfo(const Device device, const Stream stream, const AcMeshInfo device_config)
|
||||
{
|
||||
cudaSetDevice(device->id);
|
||||
|
||||
ERRCHK_ALWAYS(device_config.int_params[AC_nx] == device->local_config.int_params[AC_nx]);
|
||||
ERRCHK_ALWAYS(device_config.int_params[AC_ny] == device->local_config.int_params[AC_ny]);
|
||||
ERRCHK_ALWAYS(device_config.int_params[AC_nz] == device->local_config.int_params[AC_nz]);
|
||||
ERRCHK_ALWAYS(device_config.int_params[AC_multigpu_offset] ==
|
||||
device->local_config.int_params[AC_multigpu_offset]);
|
||||
|
||||
ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbolAsync(d_mesh_info, &device_config, sizeof(device_config),
|
||||
0, cudaMemcpyHostToDevice, device->streams[stream]));
|
||||
return AC_SUCCESS;
|
||||
}
|
||||
|
||||
AcResult
|
||||
acDeviceLoadScalarUniform(const Device device, const Stream stream, const AcRealParam param,
|
||||
const AcReal value)
|
||||
{
|
||||
cudaSetDevice(device->id);
|
||||
|
||||
if (param >= NUM_REAL_PARAMS)
|
||||
return AC_FAILURE;
|
||||
|
||||
const size_t offset = (size_t)&d_mesh_info.real_params[param] - (size_t)&d_mesh_info;
|
||||
ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
|
||||
cudaMemcpyHostToDevice, device->streams[stream]));
|
||||
return AC_SUCCESS;
|
||||
}
|
||||
|
||||
AcResult
|
||||
acDeviceLoadVectorUniform(const Device device, const Stream stream, const AcReal3Param param,
|
||||
const AcReal3 value)
|
||||
{
|
||||
cudaSetDevice(device->id);
|
||||
|
||||
if (param >= NUM_REAL3_PARAMS || !NUM_REAL3_PARAMS)
|
||||
return AC_FAILURE;
|
||||
|
||||
const size_t offset = (size_t)&d_mesh_info.real3_params[param] - (size_t)&d_mesh_info;
|
||||
ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
|
||||
cudaMemcpyHostToDevice, device->streams[stream]));
|
||||
return AC_SUCCESS;
|
||||
}
|
||||
|
||||
AcResult
|
||||
acDeviceLoadIntUniform(const Device device, const Stream stream, const AcIntParam param,
|
||||
const int value)
|
||||
{
|
||||
cudaSetDevice(device->id);
|
||||
|
||||
if (param >= NUM_INT_PARAMS)
|
||||
return AC_FAILURE;
|
||||
|
||||
const size_t offset = (size_t)&d_mesh_info.int_params[param] - (size_t)&d_mesh_info;
|
||||
ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
|
||||
cudaMemcpyHostToDevice, device->streams[stream]));
|
||||
return AC_SUCCESS;
|
||||
}
|
||||
|
||||
AcResult
|
||||
acDeviceLoadInt3Uniform(const Device device, const Stream stream, const AcInt3Param param,
|
||||
const int3 value)
|
||||
{
|
||||
cudaSetDevice(device->id);
|
||||
|
||||
if (param >= NUM_INT3_PARAMS)
|
||||
return AC_FAILURE;
|
||||
|
||||
const size_t offset = (size_t)&d_mesh_info.int3_params[param] - (size_t)&d_mesh_info;
|
||||
ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
|
||||
cudaMemcpyHostToDevice, device->streams[stream]));
|
||||
return AC_SUCCESS;
|
||||
}
|
82
src/core/kernels/kernels.h
Normal file
82
src/core/kernels/kernels.h
Normal file
@@ -0,0 +1,82 @@
|
||||
#pragma once
|
||||
#include "astaroth.h"
|
||||
|
||||
typedef struct {
|
||||
int3 dims;
|
||||
AcReal* data;
|
||||
} PackedData;
|
||||
|
||||
typedef struct {
|
||||
AcReal* in[NUM_VTXBUF_HANDLES];
|
||||
AcReal* out[NUM_VTXBUF_HANDLES];
|
||||
|
||||
AcReal* profiles[NUM_SCALARARRAY_HANDLES];
|
||||
} VertexBufferArray;
|
||||
|
||||
struct device_s {
|
||||
int id;
|
||||
AcMeshInfo local_config;
|
||||
|
||||
// Concurrency
|
||||
cudaStream_t streams[NUM_STREAMS];
|
||||
|
||||
// Memory
|
||||
VertexBufferArray vba;
|
||||
AcReal* reduce_scratchpad;
|
||||
AcReal* reduce_result;
|
||||
};
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/** */
|
||||
AcResult acKernelPeriodicBoundconds(const cudaStream_t stream, const int3 start, const int3 end,
|
||||
AcReal* vtxbuf);
|
||||
|
||||
/** */
|
||||
AcResult acKernelDummy(void);
|
||||
|
||||
/** */
|
||||
AcResult acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferArray vba);
|
||||
|
||||
/** */
|
||||
AcResult acKernelIntegrateSubstep(const cudaStream_t stream, const int step_number,
|
||||
const int3 start, const int3 end, VertexBufferArray vba);
|
||||
|
||||
/** */
|
||||
AcResult acKernelPackData(const cudaStream_t stream, const VertexBufferArray vba,
|
||||
const int3 vba_start, PackedData packed);
|
||||
|
||||
/** */
|
||||
AcResult acKernelUnpackData(const cudaStream_t stream, const PackedData packed,
|
||||
const int3 vba_start, VertexBufferArray vba);
|
||||
|
||||
/** */
|
||||
AcReal acKernelReduceScal(const cudaStream_t stream, const ReductionType rtype, const int3 start,
|
||||
const int3 end, const AcReal* vtxbuf, AcReal* scratchpad,
|
||||
AcReal* reduce_result);
|
||||
|
||||
/** */
|
||||
AcReal acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, const int3 start,
|
||||
const int3 end, const AcReal* vtxbuf0, const AcReal* vtxbuf1,
|
||||
const AcReal* vtxbuf2, AcReal* scratchpad, AcReal* reduce_result);
|
||||
|
||||
AcResult acDeviceLoadMeshInfo(const Device device, const Stream stream,
|
||||
const AcMeshInfo device_config);
|
||||
|
||||
AcResult acDeviceLoadScalarUniform(const Device device, const Stream stream,
|
||||
const AcRealParam param, const AcReal value);
|
||||
|
||||
AcResult acDeviceLoadVectorUniform(const Device device, const Stream stream,
|
||||
const AcReal3Param param, const AcReal3 value);
|
||||
|
||||
AcResult acDeviceLoadIntUniform(const Device device, const Stream stream, const AcIntParam param,
|
||||
const int value);
|
||||
|
||||
AcResult acDeviceLoadInt3Uniform(const Device device, const Stream stream, const AcInt3Param param,
|
||||
const int3 value);
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
#endif
|
@@ -1,120 +0,0 @@
|
||||
/*
|
||||
Copyright (C) 2014-2020, Johannes Pekkilae, Miikka Vaeisalae.
|
||||
|
||||
This file is part of Astaroth.
|
||||
|
||||
Astaroth is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
Astaroth is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file
|
||||
* \brief Brief info.
|
||||
*
|
||||
* Detailed info.
|
||||
*
|
||||
*/
|
||||
#include "packing.cuh"
|
||||
|
||||
#include "src/core/errchk.h"
|
||||
|
||||
__global__ void
|
||||
kernel_pack_data(const VertexBufferArray vba, const int3 vba_start, PackedData packed)
|
||||
{
|
||||
const int i_packed = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
const int j_packed = threadIdx.y + blockIdx.y * blockDim.y;
|
||||
const int k_packed = threadIdx.z + blockIdx.z * blockDim.z;
|
||||
|
||||
// If within the start-end range (this allows threadblock dims that are not
|
||||
// divisible by end - start)
|
||||
if (i_packed >= packed.dims.x || //
|
||||
j_packed >= packed.dims.y || //
|
||||
k_packed >= packed.dims.z) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int i_unpacked = i_packed + vba_start.x;
|
||||
const int j_unpacked = j_packed + vba_start.y;
|
||||
const int k_unpacked = k_packed + vba_start.z;
|
||||
|
||||
const int unpacked_idx = DEVICE_VTXBUF_IDX(i_unpacked, j_unpacked, k_unpacked);
|
||||
const int packed_idx = i_packed + //
|
||||
j_packed * packed.dims.x + //
|
||||
k_packed * packed.dims.x * packed.dims.y;
|
||||
|
||||
const size_t vtxbuf_offset = packed.dims.x * packed.dims.y * packed.dims.z;
|
||||
|
||||
//#pragma unroll
|
||||
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i)
|
||||
packed.data[packed_idx + i * vtxbuf_offset] = vba.in[i][unpacked_idx];
|
||||
}
|
||||
|
||||
__global__ void
|
||||
kernel_unpack_data(const PackedData packed, const int3 vba_start, VertexBufferArray vba)
|
||||
{
|
||||
const int i_packed = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
const int j_packed = threadIdx.y + blockIdx.y * blockDim.y;
|
||||
const int k_packed = threadIdx.z + blockIdx.z * blockDim.z;
|
||||
|
||||
// If within the start-end range (this allows threadblock dims that are not
|
||||
// divisible by end - start)
|
||||
if (i_packed >= packed.dims.x || //
|
||||
j_packed >= packed.dims.y || //
|
||||
k_packed >= packed.dims.z) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int i_unpacked = i_packed + vba_start.x;
|
||||
const int j_unpacked = j_packed + vba_start.y;
|
||||
const int k_unpacked = k_packed + vba_start.z;
|
||||
|
||||
const int unpacked_idx = DEVICE_VTXBUF_IDX(i_unpacked, j_unpacked, k_unpacked);
|
||||
const int packed_idx = i_packed + //
|
||||
j_packed * packed.dims.x + //
|
||||
k_packed * packed.dims.x * packed.dims.y;
|
||||
|
||||
const size_t vtxbuf_offset = packed.dims.x * packed.dims.y * packed.dims.z;
|
||||
|
||||
//#pragma unroll
|
||||
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i)
|
||||
vba.in[i][unpacked_idx] = packed.data[packed_idx + i * vtxbuf_offset];
|
||||
}
|
||||
|
||||
AcResult
|
||||
acKernelPackData(const cudaStream_t stream, const VertexBufferArray vba, const int3 vba_start,
|
||||
PackedData packed)
|
||||
{
|
||||
const dim3 tpb(32, 8, 1);
|
||||
const dim3 bpg((unsigned int)ceil(packed.dims.x / (float)tpb.x),
|
||||
(unsigned int)ceil(packed.dims.y / (float)tpb.y),
|
||||
(unsigned int)ceil(packed.dims.z / (float)tpb.z));
|
||||
|
||||
kernel_pack_data<<<bpg, tpb, 0, stream>>>(vba, vba_start, packed);
|
||||
ERRCHK_CUDA_KERNEL();
|
||||
|
||||
return AC_SUCCESS;
|
||||
}
|
||||
|
||||
AcResult
|
||||
acKernelUnpackData(const cudaStream_t stream, const PackedData packed, const int3 vba_start,
|
||||
VertexBufferArray vba)
|
||||
{
|
||||
const dim3 tpb(32, 8, 1);
|
||||
const dim3 bpg((unsigned int)ceil(packed.dims.x / (float)tpb.x),
|
||||
(unsigned int)ceil(packed.dims.y / (float)tpb.y),
|
||||
(unsigned int)ceil(packed.dims.z / (float)tpb.z));
|
||||
|
||||
kernel_unpack_data<<<bpg, tpb, 0, stream>>>(packed, vba_start, vba);
|
||||
ERRCHK_CUDA_KERNEL();
|
||||
return AC_SUCCESS;
|
||||
}
|
@@ -1,40 +1,92 @@
|
||||
/*
|
||||
Copyright (C) 2014-2020, Johannes Pekkilae, Miikka Vaeisalae.
|
||||
|
||||
This file is part of Astaroth.
|
||||
|
||||
Astaroth is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
Astaroth is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file
|
||||
* \brief Brief info.
|
||||
*
|
||||
* Detailed info.
|
||||
*
|
||||
*/
|
||||
#pragma once
|
||||
#include "astaroth.h"
|
||||
#include "common.cuh"
|
||||
|
||||
typedef struct {
|
||||
int3 dims;
|
||||
AcReal* data;
|
||||
} PackedData;
|
||||
static __global__ void
|
||||
kernel_pack_data(const VertexBufferArray vba, const int3 vba_start, PackedData packed)
|
||||
{
|
||||
const int i_packed = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
const int j_packed = threadIdx.y + blockIdx.y * blockDim.y;
|
||||
const int k_packed = threadIdx.z + blockIdx.z * blockDim.z;
|
||||
|
||||
AcResult acKernelPackData(const cudaStream_t stream, const VertexBufferArray vba,
|
||||
const int3 vba_start, PackedData packed);
|
||||
// If within the start-end range (this allows threadblock dims that are not
|
||||
// divisible by end - start)
|
||||
if (i_packed >= packed.dims.x || //
|
||||
j_packed >= packed.dims.y || //
|
||||
k_packed >= packed.dims.z) {
|
||||
return;
|
||||
}
|
||||
|
||||
AcResult acKernelUnpackData(const cudaStream_t stream, const PackedData packed,
|
||||
const int3 vba_start, VertexBufferArray vba);
|
||||
const int i_unpacked = i_packed + vba_start.x;
|
||||
const int j_unpacked = j_packed + vba_start.y;
|
||||
const int k_unpacked = k_packed + vba_start.z;
|
||||
|
||||
const int unpacked_idx = DEVICE_VTXBUF_IDX(i_unpacked, j_unpacked, k_unpacked);
|
||||
const int packed_idx = i_packed + //
|
||||
j_packed * packed.dims.x + //
|
||||
k_packed * packed.dims.x * packed.dims.y;
|
||||
|
||||
const size_t vtxbuf_offset = packed.dims.x * packed.dims.y * packed.dims.z;
|
||||
|
||||
//#pragma unroll
|
||||
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i)
|
||||
packed.data[packed_idx + i * vtxbuf_offset] = vba.in[i][unpacked_idx];
|
||||
}
|
||||
|
||||
static __global__ void
|
||||
kernel_unpack_data(const PackedData packed, const int3 vba_start, VertexBufferArray vba)
|
||||
{
|
||||
const int i_packed = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
const int j_packed = threadIdx.y + blockIdx.y * blockDim.y;
|
||||
const int k_packed = threadIdx.z + blockIdx.z * blockDim.z;
|
||||
|
||||
// If within the start-end range (this allows threadblock dims that are not
|
||||
// divisible by end - start)
|
||||
if (i_packed >= packed.dims.x || //
|
||||
j_packed >= packed.dims.y || //
|
||||
k_packed >= packed.dims.z) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int i_unpacked = i_packed + vba_start.x;
|
||||
const int j_unpacked = j_packed + vba_start.y;
|
||||
const int k_unpacked = k_packed + vba_start.z;
|
||||
|
||||
const int unpacked_idx = DEVICE_VTXBUF_IDX(i_unpacked, j_unpacked, k_unpacked);
|
||||
const int packed_idx = i_packed + //
|
||||
j_packed * packed.dims.x + //
|
||||
k_packed * packed.dims.x * packed.dims.y;
|
||||
|
||||
const size_t vtxbuf_offset = packed.dims.x * packed.dims.y * packed.dims.z;
|
||||
|
||||
//#pragma unroll
|
||||
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i)
|
||||
vba.in[i][unpacked_idx] = packed.data[packed_idx + i * vtxbuf_offset];
|
||||
}
|
||||
|
||||
AcResult
|
||||
acKernelPackData(const cudaStream_t stream, const VertexBufferArray vba, const int3 vba_start,
|
||||
PackedData packed)
|
||||
{
|
||||
const dim3 tpb(32, 8, 1);
|
||||
const dim3 bpg((unsigned int)ceil(packed.dims.x / (float)tpb.x),
|
||||
(unsigned int)ceil(packed.dims.y / (float)tpb.y),
|
||||
(unsigned int)ceil(packed.dims.z / (float)tpb.z));
|
||||
|
||||
kernel_pack_data<<<bpg, tpb, 0, stream>>>(vba, vba_start, packed);
|
||||
ERRCHK_CUDA_KERNEL();
|
||||
|
||||
return AC_SUCCESS;
|
||||
}
|
||||
|
||||
AcResult
|
||||
acKernelUnpackData(const cudaStream_t stream, const PackedData packed, const int3 vba_start,
|
||||
VertexBufferArray vba)
|
||||
{
|
||||
const dim3 tpb(32, 8, 1);
|
||||
const dim3 bpg((unsigned int)ceil(packed.dims.x / (float)tpb.x),
|
||||
(unsigned int)ceil(packed.dims.y / (float)tpb.y),
|
||||
(unsigned int)ceil(packed.dims.z / (float)tpb.z));
|
||||
|
||||
kernel_unpack_data<<<bpg, tpb, 0, stream>>>(packed, vba_start, vba);
|
||||
ERRCHK_CUDA_KERNEL();
|
||||
return AC_SUCCESS;
|
||||
}
|
||||
|
@@ -1,284 +0,0 @@
|
||||
/*
|
||||
Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.
|
||||
|
||||
This file is part of Astaroth.
|
||||
|
||||
Astaroth is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
Astaroth is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file
|
||||
* \brief Brief info.
|
||||
*
|
||||
* Detailed info.
|
||||
*
|
||||
*/
|
||||
#include "common.cuh"
|
||||
#include "reductions.cuh"
|
||||
|
||||
#include "src/core/errchk.h"
|
||||
#include "src/core/math_utils.h" // is_power_of_two
|
||||
|
||||
/*
|
||||
Reduction steps:
|
||||
1 of 3: Compute the initial value (a, a*a or exp(a)*exp(a)) and put the result in scratchpad
|
||||
2 of 3: Compute most of the reductions into a single block of data
|
||||
3 of 3: After all results have been stored into the final block, reduce the data in the final block
|
||||
*/
|
||||
|
||||
// Function pointer definitions
|
||||
typedef AcReal (*FilterFunc)(const AcReal&);
|
||||
typedef AcReal (*FilterFuncVec)(const AcReal&, const AcReal&, const AcReal&);
|
||||
typedef AcReal (*ReduceFunc)(const AcReal&, const AcReal&);
|
||||
|
||||
// clang-format off
|
||||
/* Comparison funcs */
|
||||
static __device__ inline AcReal
|
||||
dmax(const AcReal& a, const AcReal& b) { return a > b ? a : b; }
|
||||
|
||||
static __device__ inline AcReal
|
||||
dmin(const AcReal& a, const AcReal& b) { return a < b ? a : b; }
|
||||
|
||||
static __device__ inline AcReal
|
||||
dsum(const AcReal& a, const AcReal& b) { return a + b; }
|
||||
|
||||
/* Function used to determine the values used during reduction */
|
||||
static __device__ inline AcReal
|
||||
dvalue(const AcReal& a) { return AcReal(a); }
|
||||
|
||||
static __device__ inline AcReal
|
||||
dsquared(const AcReal& a) { return (AcReal)(a*a); }
|
||||
|
||||
static __device__ inline AcReal
|
||||
dexp_squared(const AcReal& a) { return exp(a)*exp(a); }
|
||||
|
||||
static __device__ inline AcReal
|
||||
dlength_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return sqrt(a*a + b*b + c*c); }
|
||||
|
||||
static __device__ inline AcReal
|
||||
dsquared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return dsquared(a) + dsquared(b) + dsquared(c); }
|
||||
|
||||
static __device__ inline AcReal
|
||||
dexp_squared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return dexp_squared(a) + dexp_squared(b) + dexp_squared(c); }
|
||||
// clang-format on
|
||||
|
||||
#include <assert.h>
|
||||
template <FilterFunc filter>
|
||||
__global__ void
|
||||
kernel_filter(const __restrict__ AcReal* src, const int3 start, const int3 end, AcReal* dst)
|
||||
{
|
||||
const int3 src_idx = (int3){start.x + threadIdx.x + blockIdx.x * blockDim.x,
|
||||
start.y + threadIdx.y + blockIdx.y * blockDim.y,
|
||||
start.z + threadIdx.z + blockIdx.z * blockDim.z};
|
||||
|
||||
const int nx = end.x - start.x;
|
||||
const int ny = end.y - start.y;
|
||||
const int nz = end.z - start.z;
|
||||
(void)nz; // Suppressed unused variable warning when not compiling with debug flags
|
||||
|
||||
const int3 dst_idx = (int3){threadIdx.x + blockIdx.x * blockDim.x,
|
||||
threadIdx.y + blockIdx.y * blockDim.y,
|
||||
threadIdx.z + blockIdx.z * blockDim.z};
|
||||
|
||||
assert(src_idx.x < DCONST(AC_nx_max) && src_idx.y < DCONST(AC_ny_max) &&
|
||||
src_idx.z < DCONST(AC_nz_max));
|
||||
assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
|
||||
assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);
|
||||
|
||||
dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(src[IDX(src_idx)]);
|
||||
}
|
||||
|
||||
template <FilterFuncVec filter>
|
||||
__global__ void
|
||||
kernel_filter_vec(const __restrict__ AcReal* src0, const __restrict__ AcReal* src1,
|
||||
const __restrict__ AcReal* src2, const int3 start, const int3 end, AcReal* dst)
|
||||
{
|
||||
const int3 src_idx = (int3){start.x + threadIdx.x + blockIdx.x * blockDim.x,
|
||||
start.y + threadIdx.y + blockIdx.y * blockDim.y,
|
||||
start.z + threadIdx.z + blockIdx.z * blockDim.z};
|
||||
|
||||
const int nx = end.x - start.x;
|
||||
const int ny = end.y - start.y;
|
||||
const int nz = end.z - start.z;
|
||||
(void)nz; // Suppressed unused variable warning when not compiling with debug flags
|
||||
|
||||
const int3 dst_idx = (int3){threadIdx.x + blockIdx.x * blockDim.x,
|
||||
threadIdx.y + blockIdx.y * blockDim.y,
|
||||
threadIdx.z + blockIdx.z * blockDim.z};
|
||||
|
||||
assert(src_idx.x < DCONST(AC_nx_max) && src_idx.y < DCONST(AC_ny_max) &&
|
||||
src_idx.z < DCONST(AC_nz_max));
|
||||
assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
|
||||
assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);
|
||||
|
||||
dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(
|
||||
src0[IDX(src_idx)], src1[IDX(src_idx)], src2[IDX(src_idx)]);
|
||||
}
|
||||
|
||||
template <ReduceFunc reduce>
|
||||
__global__ void
|
||||
kernel_reduce(AcReal* scratchpad, const int num_elems)
|
||||
{
|
||||
const int idx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
|
||||
extern __shared__ AcReal smem[];
|
||||
if (idx < num_elems) {
|
||||
smem[threadIdx.x] = scratchpad[idx];
|
||||
}
|
||||
else {
|
||||
smem[threadIdx.x] = NAN;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
int offset = blockDim.x / 2;
|
||||
assert(offset % 2 == 0);
|
||||
while (offset > 0) {
|
||||
if (threadIdx.x < offset) {
|
||||
smem[threadIdx.x] = reduce(smem[threadIdx.x], smem[threadIdx.x + offset]);
|
||||
}
|
||||
offset /= 2;
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
scratchpad[idx] = smem[threadIdx.x];
|
||||
}
|
||||
}
|
||||
|
||||
template <ReduceFunc reduce>
|
||||
__global__ void
|
||||
kernel_reduce_block(const __restrict__ AcReal* scratchpad, const int num_blocks,
|
||||
const int block_size, AcReal* result)
|
||||
{
|
||||
const int idx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if (idx != 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
AcReal res = scratchpad[0];
|
||||
for (int i = 1; i < num_blocks; ++i) {
|
||||
res = reduce(res, scratchpad[i * block_size]);
|
||||
}
|
||||
*result = res;
|
||||
}
|
||||
|
||||
AcReal
|
||||
acKernelReduceScal(const cudaStream_t stream, const ReductionType rtype, const int3 start,
|
||||
const int3 end, const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result)
|
||||
{
|
||||
const unsigned nx = end.x - start.x;
|
||||
const unsigned ny = end.y - start.y;
|
||||
const unsigned nz = end.z - start.z;
|
||||
const unsigned num_elems = nx * ny * nz;
|
||||
|
||||
const dim3 tpb_filter(32, 4, 1);
|
||||
const dim3 bpg_filter((unsigned int)ceil(nx / AcReal(tpb_filter.x)),
|
||||
(unsigned int)ceil(ny / AcReal(tpb_filter.y)),
|
||||
(unsigned int)ceil(nz / AcReal(tpb_filter.z)));
|
||||
|
||||
const int tpb_reduce = 128;
|
||||
const int bpg_reduce = num_elems / tpb_reduce;
|
||||
|
||||
ERRCHK(nx >= tpb_filter.x);
|
||||
ERRCHK(ny >= tpb_filter.y);
|
||||
ERRCHK(nz >= tpb_filter.z);
|
||||
ERRCHK(tpb_reduce <= num_elems);
|
||||
ERRCHK(nx * ny * nz % 2 == 0);
|
||||
|
||||
// clang-format off
|
||||
if (rtype == RTYPE_MAX) {
|
||||
kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
|
||||
kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||
kernel_reduce_block<dmax><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||
} else if (rtype == RTYPE_MIN) {
|
||||
kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
|
||||
kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||
kernel_reduce_block<dmin><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||
} else if (rtype == RTYPE_RMS) {
|
||||
kernel_filter<dsquared><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
|
||||
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||
} else if (rtype == RTYPE_RMS_EXP) {
|
||||
kernel_filter<dexp_squared><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
|
||||
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||
} else if (rtype == RTYPE_SUM) {
|
||||
kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
|
||||
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||
} else {
|
||||
ERROR("Unrecognized rtype");
|
||||
}
|
||||
// clang-format on
|
||||
cudaStreamSynchronize(stream);
|
||||
AcReal result;
|
||||
cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
|
||||
return result;
|
||||
}
|
||||
|
||||
AcReal
|
||||
acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, const int3 start,
|
||||
const int3 end, const AcReal* vtxbuf0, const AcReal* vtxbuf1,
|
||||
const AcReal* vtxbuf2, AcReal* scratchpad, AcReal* reduce_result)
|
||||
{
|
||||
const unsigned nx = end.x - start.x;
|
||||
const unsigned ny = end.y - start.y;
|
||||
const unsigned nz = end.z - start.z;
|
||||
const unsigned num_elems = nx * ny * nz;
|
||||
|
||||
const dim3 tpb_filter(32, 4, 1);
|
||||
const dim3 bpg_filter((unsigned int)ceil(nx / AcReal(tpb_filter.x)),
|
||||
(unsigned int)ceil(ny / AcReal(tpb_filter.y)),
|
||||
(unsigned int)ceil(nz / AcReal(tpb_filter.z)));
|
||||
|
||||
const int tpb_reduce = 128;
|
||||
const int bpg_reduce = num_elems / tpb_reduce;
|
||||
|
||||
ERRCHK(nx >= tpb_filter.x);
|
||||
ERRCHK(ny >= tpb_filter.y);
|
||||
ERRCHK(nz >= tpb_filter.z);
|
||||
ERRCHK(tpb_reduce <= num_elems);
|
||||
ERRCHK(nx * ny * nz % 2 == 0);
|
||||
|
||||
// clang-format off
|
||||
if (rtype == RTYPE_MAX) {
|
||||
kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
|
||||
kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||
kernel_reduce_block<dmax><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||
} else if (rtype == RTYPE_MIN) {
|
||||
kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
|
||||
kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||
kernel_reduce_block<dmin><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||
} else if (rtype == RTYPE_RMS) {
|
||||
kernel_filter_vec<dsquared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
|
||||
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||
} else if (rtype == RTYPE_RMS_EXP) {
|
||||
kernel_filter_vec<dexp_squared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
|
||||
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||
} else if (rtype == RTYPE_SUM) {
|
||||
kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
|
||||
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||
} else {
|
||||
ERROR("Unrecognized rtype");
|
||||
}
|
||||
// clang-format on
|
||||
|
||||
cudaStreamSynchronize(stream);
|
||||
AcReal result;
|
||||
cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
|
||||
return result;
|
||||
}
|
@@ -1,44 +1,254 @@
|
||||
/*
|
||||
Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.
|
||||
|
||||
This file is part of Astaroth.
|
||||
|
||||
Astaroth is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
Astaroth is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file
|
||||
* \brief Brief info.
|
||||
*
|
||||
* Detailed info.
|
||||
*
|
||||
*/
|
||||
#pragma once
|
||||
#include <astaroth.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
/*
|
||||
Reduction steps:
|
||||
1 of 3: Compute the initial value (a, a*a or exp(a)*exp(a)) and put the result in scratchpad
|
||||
2 of 3: Compute most of the reductions into a single block of data
|
||||
3 of 3: After all results have been stored into the final block, reduce the data in the final block
|
||||
*/
|
||||
|
||||
AcReal acKernelReduceScal(const cudaStream_t stream, const ReductionType rtype, const int3 start,
|
||||
const int3 end, const AcReal* vtxbuf, AcReal* scratchpad,
|
||||
AcReal* reduce_result);
|
||||
// Function pointer definitions
|
||||
typedef AcReal (*FilterFunc)(const AcReal&);
|
||||
typedef AcReal (*FilterFuncVec)(const AcReal&, const AcReal&, const AcReal&);
|
||||
typedef AcReal (*ReduceFunc)(const AcReal&, const AcReal&);
|
||||
|
||||
AcReal acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, const int3 start,
|
||||
// clang-format off
|
||||
/* Comparison funcs */
|
||||
static __device__ inline AcReal
|
||||
dmax(const AcReal& a, const AcReal& b) { return a > b ? a : b; }
|
||||
|
||||
static __device__ inline AcReal
|
||||
dmin(const AcReal& a, const AcReal& b) { return a < b ? a : b; }
|
||||
|
||||
static __device__ inline AcReal
|
||||
dsum(const AcReal& a, const AcReal& b) { return a + b; }
|
||||
|
||||
/* Function used to determine the values used during reduction */
|
||||
static __device__ inline AcReal
|
||||
dvalue(const AcReal& a) { return AcReal(a); }
|
||||
|
||||
static __device__ inline AcReal
|
||||
dsquared(const AcReal& a) { return (AcReal)(a*a); }
|
||||
|
||||
static __device__ inline AcReal
|
||||
dexp_squared(const AcReal& a) { return exp(a)*exp(a); }
|
||||
|
||||
static __device__ inline AcReal
|
||||
dlength_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return sqrt(a*a + b*b + c*c); }
|
||||
|
||||
static __device__ inline AcReal
|
||||
dsquared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return dsquared(a) + dsquared(b) + dsquared(c); }
|
||||
|
||||
static __device__ inline AcReal
|
||||
dexp_squared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return dexp_squared(a) + dexp_squared(b) + dexp_squared(c); }
|
||||
// clang-format on
|
||||
|
||||
#include <assert.h>
|
||||
template <FilterFunc filter>
|
||||
static __global__ void
|
||||
kernel_filter(const __restrict__ AcReal* src, const int3 start, const int3 end, AcReal* dst)
|
||||
{
|
||||
const int3 src_idx = (int3){start.x + threadIdx.x + blockIdx.x * blockDim.x,
|
||||
start.y + threadIdx.y + blockIdx.y * blockDim.y,
|
||||
start.z + threadIdx.z + blockIdx.z * blockDim.z};
|
||||
|
||||
const int nx = end.x - start.x;
|
||||
const int ny = end.y - start.y;
|
||||
const int nz = end.z - start.z;
|
||||
(void)nz; // Suppressed unused variable warning when not compiling with debug flags
|
||||
|
||||
const int3 dst_idx = (int3){threadIdx.x + blockIdx.x * blockDim.x,
|
||||
threadIdx.y + blockIdx.y * blockDim.y,
|
||||
threadIdx.z + blockIdx.z * blockDim.z};
|
||||
|
||||
assert(src_idx.x < DCONST(AC_nx_max) && src_idx.y < DCONST(AC_ny_max) &&
|
||||
src_idx.z < DCONST(AC_nz_max));
|
||||
assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
|
||||
assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);
|
||||
|
||||
dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(src[IDX(src_idx)]);
|
||||
}
|
||||
|
||||
template <FilterFuncVec filter>
|
||||
static __global__ void
|
||||
kernel_filter_vec(const __restrict__ AcReal* src0, const __restrict__ AcReal* src1,
|
||||
const __restrict__ AcReal* src2, const int3 start, const int3 end, AcReal* dst)
|
||||
{
|
||||
const int3 src_idx = (int3){start.x + threadIdx.x + blockIdx.x * blockDim.x,
|
||||
start.y + threadIdx.y + blockIdx.y * blockDim.y,
|
||||
start.z + threadIdx.z + blockIdx.z * blockDim.z};
|
||||
|
||||
const int nx = end.x - start.x;
|
||||
const int ny = end.y - start.y;
|
||||
const int nz = end.z - start.z;
|
||||
(void)nz; // Suppressed unused variable warning when not compiling with debug flags
|
||||
|
||||
const int3 dst_idx = (int3){threadIdx.x + blockIdx.x * blockDim.x,
|
||||
threadIdx.y + blockIdx.y * blockDim.y,
|
||||
threadIdx.z + blockIdx.z * blockDim.z};
|
||||
|
||||
assert(src_idx.x < DCONST(AC_nx_max) && src_idx.y < DCONST(AC_ny_max) &&
|
||||
src_idx.z < DCONST(AC_nz_max));
|
||||
assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
|
||||
assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);
|
||||
|
||||
dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(
|
||||
src0[IDX(src_idx)], src1[IDX(src_idx)], src2[IDX(src_idx)]);
|
||||
}
|
||||
|
||||
template <ReduceFunc reduce>
|
||||
static __global__ void
|
||||
kernel_reduce(AcReal* scratchpad, const int num_elems)
|
||||
{
|
||||
const int idx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
|
||||
extern __shared__ AcReal smem[];
|
||||
if (idx < num_elems) {
|
||||
smem[threadIdx.x] = scratchpad[idx];
|
||||
}
|
||||
else {
|
||||
smem[threadIdx.x] = NAN;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
int offset = blockDim.x / 2;
|
||||
assert(offset % 2 == 0);
|
||||
while (offset > 0) {
|
||||
if (threadIdx.x < offset) {
|
||||
smem[threadIdx.x] = reduce(smem[threadIdx.x], smem[threadIdx.x + offset]);
|
||||
}
|
||||
offset /= 2;
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
scratchpad[idx] = smem[threadIdx.x];
|
||||
}
|
||||
}
|
||||
|
||||
template <ReduceFunc reduce>
|
||||
static __global__ void
|
||||
kernel_reduce_block(const __restrict__ AcReal* scratchpad, const int num_blocks,
|
||||
const int block_size, AcReal* result)
|
||||
{
|
||||
const int idx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if (idx != 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
AcReal res = scratchpad[0];
|
||||
for (int i = 1; i < num_blocks; ++i) {
|
||||
res = reduce(res, scratchpad[i * block_size]);
|
||||
}
|
||||
*result = res;
|
||||
}
|
||||
|
||||
AcReal
|
||||
acKernelReduceScal(const cudaStream_t stream, const ReductionType rtype, const int3 start,
|
||||
const int3 end, const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result)
|
||||
{
|
||||
const unsigned nx = end.x - start.x;
|
||||
const unsigned ny = end.y - start.y;
|
||||
const unsigned nz = end.z - start.z;
|
||||
const unsigned num_elems = nx * ny * nz;
|
||||
|
||||
const dim3 tpb_filter(32, 4, 1);
|
||||
const dim3 bpg_filter((unsigned int)ceil(nx / AcReal(tpb_filter.x)),
|
||||
(unsigned int)ceil(ny / AcReal(tpb_filter.y)),
|
||||
(unsigned int)ceil(nz / AcReal(tpb_filter.z)));
|
||||
|
||||
const int tpb_reduce = 128;
|
||||
const int bpg_reduce = num_elems / tpb_reduce;
|
||||
|
||||
ERRCHK(nx >= tpb_filter.x);
|
||||
ERRCHK(ny >= tpb_filter.y);
|
||||
ERRCHK(nz >= tpb_filter.z);
|
||||
ERRCHK(tpb_reduce <= num_elems);
|
||||
ERRCHK(nx * ny * nz % 2 == 0);
|
||||
|
||||
// clang-format off
|
||||
if (rtype == RTYPE_MAX) {
|
||||
kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
|
||||
kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||
kernel_reduce_block<dmax><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||
} else if (rtype == RTYPE_MIN) {
|
||||
kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
|
||||
kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||
kernel_reduce_block<dmin><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||
} else if (rtype == RTYPE_RMS) {
|
||||
kernel_filter<dsquared><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
|
||||
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||
} else if (rtype == RTYPE_RMS_EXP) {
|
||||
kernel_filter<dexp_squared><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
|
||||
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||
} else if (rtype == RTYPE_SUM) {
|
||||
kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
|
||||
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||
} else {
|
||||
ERROR("Unrecognized rtype");
|
||||
}
|
||||
// clang-format on
|
||||
cudaStreamSynchronize(stream);
|
||||
AcReal result;
|
||||
cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
|
||||
return result;
|
||||
}
|
||||
|
||||
AcReal
|
||||
acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, const int3 start,
|
||||
const int3 end, const AcReal* vtxbuf0, const AcReal* vtxbuf1,
|
||||
const AcReal* vtxbuf2, AcReal* scratchpad, AcReal* reduce_result);
|
||||
const AcReal* vtxbuf2, AcReal* scratchpad, AcReal* reduce_result)
|
||||
{
|
||||
const unsigned nx = end.x - start.x;
|
||||
const unsigned ny = end.y - start.y;
|
||||
const unsigned nz = end.z - start.z;
|
||||
const unsigned num_elems = nx * ny * nz;
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
#endif
|
||||
const dim3 tpb_filter(32, 4, 1);
|
||||
const dim3 bpg_filter((unsigned int)ceil(nx / AcReal(tpb_filter.x)),
|
||||
(unsigned int)ceil(ny / AcReal(tpb_filter.y)),
|
||||
(unsigned int)ceil(nz / AcReal(tpb_filter.z)));
|
||||
|
||||
const int tpb_reduce = 128;
|
||||
const int bpg_reduce = num_elems / tpb_reduce;
|
||||
|
||||
ERRCHK(nx >= tpb_filter.x);
|
||||
ERRCHK(ny >= tpb_filter.y);
|
||||
ERRCHK(nz >= tpb_filter.z);
|
||||
ERRCHK(tpb_reduce <= num_elems);
|
||||
ERRCHK(nx * ny * nz % 2 == 0);
|
||||
|
||||
// clang-format off
|
||||
if (rtype == RTYPE_MAX) {
|
||||
kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
|
||||
kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||
kernel_reduce_block<dmax><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||
} else if (rtype == RTYPE_MIN) {
|
||||
kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
|
||||
kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||
kernel_reduce_block<dmin><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||
} else if (rtype == RTYPE_RMS) {
|
||||
kernel_filter_vec<dsquared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
|
||||
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||
} else if (rtype == RTYPE_RMS_EXP) {
|
||||
kernel_filter_vec<dexp_squared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
|
||||
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||
} else if (rtype == RTYPE_SUM) {
|
||||
kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
|
||||
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||
} else {
|
||||
ERROR("Unrecognized rtype");
|
||||
}
|
||||
// clang-format on
|
||||
|
||||
cudaStreamSynchronize(stream);
|
||||
AcReal result;
|
||||
cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
|
||||
return result;
|
||||
}
|
||||
|
@@ -1,187 +0,0 @@
|
||||
/*
|
||||
Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.
|
||||
|
||||
This file is part of Astaroth.
|
||||
|
||||
Astaroth is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
Astaroth is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file
|
||||
* \brief Brief info.
|
||||
*
|
||||
* Detailed info.
|
||||
*
|
||||
*/
|
||||
#pragma once
|
||||
//#include <cmath>
|
||||
// using namespace std; // Potentially bad practice to declare namespace std here
|
||||
#include <math.h> // isnan, isinf // Overloads incorrect/bugged with GCC <= 6.0
|
||||
//#include <tgmath.h> // Even this does not work
|
||||
#include <stdlib.h> // rand
|
||||
|
||||
template <class T>
|
||||
static inline const T
|
||||
max(const T& a, const T& b)
|
||||
{
|
||||
return a > b ? a : b;
|
||||
}
|
||||
|
||||
template <class T>
|
||||
static inline const T
|
||||
min(const T& a, const T& b)
|
||||
{
|
||||
return a < b ? a : b;
|
||||
}
|
||||
|
||||
static inline const int3
|
||||
max(const int3& a, const int3& b)
|
||||
{
|
||||
return (int3){max(a.x, b.x), max(a.y, b.y), max(a.z, b.z)};
|
||||
}
|
||||
|
||||
static inline const int3
|
||||
min(const int3& a, const int3& b)
|
||||
{
|
||||
return (int3){min(a.x, b.x), min(a.y, b.y), min(a.z, b.z)};
|
||||
}
|
||||
|
||||
template <class T>
|
||||
static inline const T
|
||||
sum(const T& a, const T& b)
|
||||
{
|
||||
return a + b;
|
||||
}
|
||||
|
||||
template <class T>
|
||||
static inline const T
|
||||
clamp(const T& val, const T& min, const T& max)
|
||||
{
|
||||
return val < min ? min : val > max ? max : val;
|
||||
}
|
||||
|
||||
static inline AcReal
|
||||
randr()
|
||||
{
|
||||
return AcReal(rand()) / AcReal(RAND_MAX);
|
||||
}
|
||||
|
||||
static inline bool
|
||||
is_power_of_two(const unsigned val)
|
||||
{
|
||||
return val && !(val & (val - 1));
|
||||
}
|
||||
|
||||
#ifdef __CUDACC__
|
||||
#define HOST_DEVICE_INLINE __host__ __device__ __forceinline__
|
||||
#else
|
||||
#define HOST_DEVICE_INLINE inline
|
||||
#endif // __CUDACC__
|
||||
|
||||
static HOST_DEVICE_INLINE AcReal3
|
||||
operator+(const AcReal3& a, const AcReal3& b)
|
||||
{
|
||||
return (AcReal3){a.x + b.x, a.y + b.y, a.z + b.z};
|
||||
}
|
||||
|
||||
static HOST_DEVICE_INLINE int3
|
||||
operator+(const int3& a, const int3& b)
|
||||
{
|
||||
return (int3){a.x + b.x, a.y + b.y, a.z + b.z};
|
||||
}
|
||||
|
||||
static HOST_DEVICE_INLINE int3 operator*(const int3& a, const int3& b)
|
||||
{
|
||||
return (int3){a.x * b.x, a.y * b.y, a.z * b.z};
|
||||
}
|
||||
|
||||
static HOST_DEVICE_INLINE void
|
||||
operator+=(AcReal3& lhs, const AcReal3& rhs)
|
||||
{
|
||||
lhs.x += rhs.x;
|
||||
lhs.y += rhs.y;
|
||||
lhs.z += rhs.z;
|
||||
}
|
||||
|
||||
static HOST_DEVICE_INLINE AcReal3
|
||||
operator-(const AcReal3& a, const AcReal3& b)
|
||||
{
|
||||
return (AcReal3){a.x - b.x, a.y - b.y, a.z - b.z};
|
||||
}
|
||||
|
||||
static HOST_DEVICE_INLINE int3
|
||||
operator-(const int3& a, const int3& b)
|
||||
{
|
||||
return (int3){a.x - b.x, a.y - b.y, a.z - b.z};
|
||||
}
|
||||
|
||||
static HOST_DEVICE_INLINE AcReal3
|
||||
operator-(const AcReal3& a)
|
||||
{
|
||||
return (AcReal3){-a.x, -a.y, -a.z};
|
||||
}
|
||||
|
||||
static HOST_DEVICE_INLINE void
|
||||
operator-=(AcReal3& lhs, const AcReal3& rhs)
|
||||
{
|
||||
lhs.x -= rhs.x;
|
||||
lhs.y -= rhs.y;
|
||||
lhs.z -= rhs.z;
|
||||
}
|
||||
|
||||
static HOST_DEVICE_INLINE AcReal3 operator*(const AcReal& a, const AcReal3& b)
|
||||
{
|
||||
return (AcReal3){a * b.x, a * b.y, a * b.z};
|
||||
}
|
||||
|
||||
static HOST_DEVICE_INLINE AcReal3 operator*(const AcReal3& b, const AcReal& a)
|
||||
{
|
||||
return (AcReal3){a * b.x, a * b.y, a * b.z};
|
||||
}
|
||||
|
||||
static HOST_DEVICE_INLINE AcReal
|
||||
dot(const AcReal3& a, const AcReal3& b)
|
||||
{
|
||||
return a.x * b.x + a.y * b.y + a.z * b.z;
|
||||
}
|
||||
|
||||
static HOST_DEVICE_INLINE AcReal3
|
||||
mul(const AcMatrix& aa, const AcReal3& x)
|
||||
{
|
||||
return (AcReal3){dot(aa.row[0], x), dot(aa.row[1], x), dot(aa.row[2], x)};
|
||||
}
|
||||
|
||||
static HOST_DEVICE_INLINE AcReal3
|
||||
cross(const AcReal3& a, const AcReal3& b)
|
||||
{
|
||||
AcReal3 c;
|
||||
|
||||
c.x = a.y * b.z - a.z * b.y;
|
||||
c.y = a.z * b.x - a.x * b.z;
|
||||
c.z = a.x * b.y - a.y * b.x;
|
||||
|
||||
return c;
|
||||
}
|
||||
|
||||
static HOST_DEVICE_INLINE bool
|
||||
is_valid(const AcReal& a)
|
||||
{
|
||||
return !isnan(a) && !isinf(a);
|
||||
}
|
||||
|
||||
static HOST_DEVICE_INLINE bool
|
||||
is_valid(const AcReal3& a)
|
||||
{
|
||||
return is_valid(a.x) && is_valid(a.y) && is_valid(a.z);
|
||||
}
|
@@ -123,12 +123,10 @@
|
||||
db da
|
||||
*
|
||||
*/
|
||||
#include "astaroth_node.h"
|
||||
#include "astaroth.h"
|
||||
|
||||
#include "astaroth_device.h"
|
||||
#include "errchk.h"
|
||||
#include "kernels/common.cuh"
|
||||
#include "math_utils.h" // sum for reductions
|
||||
#include "math_utils.h"
|
||||
|
||||
static const int MAX_NUM_DEVICES = 32;
|
||||
|
||||
|
Reference in New Issue
Block a user