Reordered src/core to get a cleaner division between host and device code (this is more likely to work when compiling with mpicxx). Disabled separate compilation of CUDA kernels, as it complicates compilation and is a source of many CMake/CUDA bugs. As a downside, GPU code takes longer to compile.

jpekkila
2020-01-23 20:06:20 +02:00
parent 96389e9da6
commit 78fbcc090d
22 changed files with 1008 additions and 4305 deletions
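
The split works because the reorganized device.cc includes only a plain host-side header, kernels/kernels.h, instead of pulling .cuh kernel sources into the host translation unit, so mpicxx never has to parse device code. A minimal sketch of what such a boundary header could look like (the two launcher signatures appear in the diffs below; everything else here is an assumption, not the file's actual contents):

// kernels/kernels.h -- hypothetical sketch of the host-callable boundary.
// Only launcher declarations cross this line; the __global__ kernels behind
// them are compiled by nvcc alone inside kernels.cu.
#pragma once
#include <cuda_runtime_api.h> // cudaStream_t and int3 for the host-side API
#include "astaroth.h"

AcResult acKernelPeriodicBoundconds(const cudaStream_t stream, const int3 start, const int3 end,
                                    AcReal* vtxbuf);
AcResult acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferArray vba);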

View File

@@ -1,38 +1,4 @@
-########################################
-## CMakeLists.txt for Astaroth Core   ##
-########################################
-
-# Require C++11
-set(CMAKE_CXX_STANDARD 11)
-set(CMAKE_CXX_STANDARD_REQUIRED ON)
-set(CMAKE_CXX_FLAGS "-Wall -Wextra -Werror -Wdouble-promotion -Wfloat-conversion -Wshadow")
-set(CMAKE_CXX_FLAGS_DEBUG "-g")
-set(CMAKE_CXX_FLAGS_RELEASE "-O3")
-
-# Require CUDA
-find_package(CUDA REQUIRED)
-set(CMAKE_CUDA_FLAGS "-gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70")
-
-# Compile kernels
-add_library(astaroth_kernels STATIC kernels/boundconds.cu kernels/integration.cu kernels/reductions.cu kernels/packing.cu)
-target_include_directories(astaroth_kernels PRIVATE .)
-target_compile_features(astaroth_kernels PRIVATE cxx_std_11)
-set_target_properties(astaroth_kernels PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
-add_dependencies(astaroth_kernels dsl_headers)
-
-# Compile core
-add_library(astaroth_core STATIC astaroth.cc node.cc device.cc)
-target_include_directories(astaroth_core PRIVATE . ${CUDA_INCLUDE_DIRS})
-target_link_libraries(astaroth_core m astaroth_kernels ${CUDA_LIBRARIES})
-target_compile_definitions(astaroth_core PRIVATE AC_USE_CUDA_RUNTIME_API)
-set_target_properties(astaroth_core PROPERTIES POSITION_INDEPENDENT_CODE ON)
-add_dependencies(astaroth_kernels dsl_headers)
-
-if (MULTIGPU_ENABLED)
-    target_compile_definitions(astaroth_core PRIVATE AC_MULTIGPU_ENABLED=1)
-endif ()
-
-if (MPI_ENABLED)
-    target_link_libraries(astaroth_core astaroth_utils)
-    target_compile_definitions(astaroth_core PRIVATE AC_MPI_ENABLED=1)
-endif ()
+## Astaroth Core
+add_library(astaroth_core STATIC device.cc node.cc astaroth.cc)
+target_include_directories(astaroth_core PRIVATE ${CUDA_INCLUDE_DIRS})
+target_link_libraries(astaroth_core astaroth_utils astaroth_kernels cudart)

View File

@@ -19,40 +19,11 @@
 #include "astaroth.h"
 #include "errchk.h"
-#include "math_utils.h" // int3 + int3
-
-#define AC_GEN_STR(X) #X
-const char* intparam_names[] = {AC_FOR_BUILTIN_INT_PARAM_TYPES(AC_GEN_STR) //
-                                AC_FOR_USER_INT_PARAM_TYPES(AC_GEN_STR)};
-const char* int3param_names[] = {AC_FOR_BUILTIN_INT3_PARAM_TYPES(AC_GEN_STR) //
-                                 AC_FOR_USER_INT3_PARAM_TYPES(AC_GEN_STR)};
-const char* realparam_names[] = {AC_FOR_BUILTIN_REAL_PARAM_TYPES(AC_GEN_STR) //
-                                 AC_FOR_USER_REAL_PARAM_TYPES(AC_GEN_STR)};
-const char* real3param_names[] = {AC_FOR_BUILTIN_REAL3_PARAM_TYPES(AC_GEN_STR) //
-                                  AC_FOR_USER_REAL3_PARAM_TYPES(AC_GEN_STR)};
-const char* scalararray_names[] = {AC_FOR_SCALARARRAY_HANDLES(AC_GEN_STR)};
-const char* vtxbuf_names[] = {AC_FOR_VTXBUF_HANDLES(AC_GEN_STR)};
-#undef AC_GEN_STR
+#include "math_utils.h"
 
 static const int max_num_nodes = 1;
 static Node nodes[max_num_nodes] = {0};
 static int num_nodes = 0;
-
-void
-acPrintMeshInfo(const AcMeshInfo config)
-{
-    for (int i = 0; i < NUM_INT_PARAMS; ++i)
-        printf("[%s]: %d\n", intparam_names[i], config.int_params[i]);
-
-    for (int i = 0; i < NUM_INT3_PARAMS; ++i)
-        printf("[%s]: (%d, %d, %d)\n", int3param_names[i], config.int3_params[i].x,
-               config.int3_params[i].y, config.int3_params[i].z);
-
-    for (int i = 0; i < NUM_REAL_PARAMS; ++i)
-        printf("[%s]: %g\n", realparam_names[i], double(config.real_params[i]));
-
-    for (int i = 0; i < NUM_REAL3_PARAMS; ++i)
-        printf("[%s]: (%g, %g, %g)\n", real3param_names[i], double(config.real3_params[i].x),
-               double(config.real3_params[i].y), double(config.real3_params[i].z));
-}
 
 AcResult
 acInit(const AcMeshInfo mesh_info)
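
The AC_GEN_STR table removed above is an instance of the X-macro pattern: a list macro applies a caller-supplied macro to every handle, and #X stringifies each one. A self-contained sketch of the pattern (hypothetical parameter names, not Astaroth's real lists):

#include <cstdio>

// The list macro enumerates the handles; F is applied to each of them.
#define FOR_DEMO_PARAMS(F) F(AC_nx) F(AC_ny) F(AC_nz)

// Generator macros: one builds the enum, the other the matching name table.
#define GEN_ID(X) X,
#define GEN_STR(X) #X,

enum DemoParam { FOR_DEMO_PARAMS(GEN_ID) NUM_DEMO_PARAMS };
static const char* demo_param_names[] = {FOR_DEMO_PARAMS(GEN_STR)};

int main(void)
{
    for (int i = 0; i < NUM_DEMO_PARAMS; ++i)
        printf("%s = %d\n", demo_param_names[i], i); // prints AC_nx = 0, ...
    return 0;
}

Generating the enum and the string table from the same list is what keeps intparam_names[param] in sync with the parameter handle values.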

View File

@@ -1,60 +1,106 @@
-/*
-    Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.
-
-    This file is part of Astaroth.
-
-    Astaroth is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    Astaroth is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
-*/
-/**
- * @file
- * \brief Brief info.
- *
- * Detailed info.
- *
- */
-#include "astaroth_device.h"
-
-#include <string.h> // memcpy
-
-#include "astaroth_utils.h"
+#include "astaroth.h"
+
+#include <mpi.h>
+#include <string.h>
+
 #include "errchk.h"
 #include "math_utils.h"
-#include "timer_hires.h"
-
-#include "kernels/common.cuh"
+
+#include "kernels/kernels.h"
 
 #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0]))
 
-struct device_s {
-    int id;
-    AcMeshInfo local_config;
-
-    // Concurrency
-    cudaStream_t streams[NUM_STREAMS];
-
-    // Memory
-    VertexBufferArray vba;
-    AcReal* reduce_scratchpad;
-    AcReal* reduce_result;
-};
-
-#include "kernels/boundconds.cuh"
-#include "kernels/integration.cuh"
-#include "kernels/reductions.cuh"
-
-#if PACKED_DATA_TRANSFERS // TODO DEPRECATED, see AC_MPI_ENABLED instead
-// #include "kernels/pack_unpack.cuh"
-#endif
+AcResult
+acDevicePrintInfo(const Device device)
+{
+    const int device_id = device->id;
+
+    cudaDeviceProp props;
+    cudaGetDeviceProperties(&props, device_id);
+    printf("--------------------------------------------------\n");
+    printf("Device Number: %d\n", device_id);
+    const size_t bus_id_max_len = 128;
+    char bus_id[bus_id_max_len];
+    cudaDeviceGetPCIBusId(bus_id, bus_id_max_len, device_id);
+    printf(" PCI bus ID: %s\n", bus_id);
+    printf(" Device name: %s\n", props.name);
+    printf(" Compute capability: %d.%d\n", props.major, props.minor);
+
+    // Compute
+    printf(" Compute\n");
+    printf(" Clock rate (GHz): %g\n", props.clockRate / 1e6); // KHz -> GHz
+    printf(" Stream processors: %d\n", props.multiProcessorCount);
+    printf(" SP to DP flops performance ratio: %d:1\n", props.singleToDoublePrecisionPerfRatio);
+    printf(" Compute mode: %d\n",
+           (int)props.computeMode); // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g7eb25f5413a962faad0956d92bae10d0
+
+    // Memory
+    printf(" Global memory\n");
+    printf(" Memory Clock Rate (MHz): %d\n", props.memoryClockRate / (1000));
+    printf(" Memory Bus Width (bits): %d\n", props.memoryBusWidth);
+    printf(" Peak Memory Bandwidth (GiB/s): %f\n",
+           2 * (props.memoryClockRate * 1e3) * props.memoryBusWidth / (8. * 1024. * 1024. * 1024.));
+    printf(" ECC enabled: %d\n", props.ECCEnabled);
+
+    // Memory usage
+    size_t free_bytes, total_bytes;
+    cudaMemGetInfo(&free_bytes, &total_bytes);
+    const size_t used_bytes = total_bytes - free_bytes;
+    printf(" Total global mem: %.2f GiB\n", props.totalGlobalMem / (1024.0 * 1024 * 1024));
+    printf(" Gmem used (GiB): %.2f\n", used_bytes / (1024.0 * 1024 * 1024));
+    printf(" Gmem memory free (GiB): %.2f\n", free_bytes / (1024.0 * 1024 * 1024));
+    printf(" Gmem memory total (GiB): %.2f\n", total_bytes / (1024.0 * 1024 * 1024));
+    printf(" Caches\n");
+    printf(" Local L1 cache supported: %d\n", props.localL1CacheSupported);
+    printf(" Global L1 cache supported: %d\n", props.globalL1CacheSupported);
+    printf(" L2 size: %d KiB\n", props.l2CacheSize / (1024));
+    // MV: props.totalConstMem and props.sharedMemPerBlock cause assembler error
+    // MV: while compiling in TIARA gp cluster. Therefore commented out.
+    //!! printf(" Total const mem: %ld KiB\n", props.totalConstMem / (1024));
+    //!! printf(" Shared mem per block: %ld KiB\n", props.sharedMemPerBlock / (1024));
+    printf(" Other\n");
+    printf(" Warp size: %d\n", props.warpSize);
+    // printf(" Single to double perf. ratio: %dx\n",
+    // props.singleToDoublePrecisionPerfRatio); //Not supported with older CUDA
+    // versions
+    printf(" Stream priorities supported: %d\n", props.streamPrioritiesSupported);
+    printf("--------------------------------------------------\n");
+
+    return AC_SUCCESS;
+}
+
+AcResult
+acDeviceAutoOptimize(const Device device)
+{
+    cudaSetDevice(device->id);
+
+    const int3 start = (int3){
+        device->local_config.int_params[AC_nx_min],
+        device->local_config.int_params[AC_ny_min],
+        device->local_config.int_params[AC_nz_min],
+    };
+    const int3 end = (int3){
+        device->local_config.int_params[AC_nx_max],
+        device->local_config.int_params[AC_ny_max],
+        device->local_config.int_params[AC_nz_max],
+    };
+    return acKernelAutoOptimizeIntegration(start, end, device->vba);
+}
+
+AcResult
+acDeviceSynchronizeStream(const Device device, const Stream stream)
+{
+    cudaSetDevice(device->id);
+    if (stream == STREAM_ALL) {
+        cudaDeviceSynchronize();
+    }
+    else {
+        cudaStreamSynchronize(device->streams[stream]);
+    }
+    return AC_SUCCESS;
+}
 
 AcResult
 acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device_handle)
@@ -147,96 +193,6 @@ acDeviceDestroy(Device device)
     return AC_SUCCESS;
 }
 
-AcResult
-acDevicePrintInfo(const Device device)
-{
-    const int device_id = device->id;
-
-    cudaDeviceProp props;
-    cudaGetDeviceProperties(&props, device_id);
-    printf("--------------------------------------------------\n");
-    printf("Device Number: %d\n", device_id);
-    const size_t bus_id_max_len = 128;
-    char bus_id[bus_id_max_len];
-    cudaDeviceGetPCIBusId(bus_id, bus_id_max_len, device_id);
-    printf(" PCI bus ID: %s\n", bus_id);
-    printf(" Device name: %s\n", props.name);
-    printf(" Compute capability: %d.%d\n", props.major, props.minor);
-
-    // Compute
-    printf(" Compute\n");
-    printf(" Clock rate (GHz): %g\n", props.clockRate / 1e6); // KHz -> GHz
-    printf(" Stream processors: %d\n", props.multiProcessorCount);
-    printf(" SP to DP flops performance ratio: %d:1\n", props.singleToDoublePrecisionPerfRatio);
-    printf(" Compute mode: %d\n",
-           (int)props.computeMode); // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g7eb25f5413a962faad0956d92bae10d0
-
-    // Memory
-    printf(" Global memory\n");
-    printf(" Memory Clock Rate (MHz): %d\n", props.memoryClockRate / (1000));
-    printf(" Memory Bus Width (bits): %d\n", props.memoryBusWidth);
-    printf(" Peak Memory Bandwidth (GiB/s): %f\n",
-           2 * (props.memoryClockRate * 1e3) * props.memoryBusWidth / (8. * 1024. * 1024. * 1024.));
-    printf(" ECC enabled: %d\n", props.ECCEnabled);
-
-    // Memory usage
-    size_t free_bytes, total_bytes;
-    cudaMemGetInfo(&free_bytes, &total_bytes);
-    const size_t used_bytes = total_bytes - free_bytes;
-    printf(" Total global mem: %.2f GiB\n", props.totalGlobalMem / (1024.0 * 1024 * 1024));
-    printf(" Gmem used (GiB): %.2f\n", used_bytes / (1024.0 * 1024 * 1024));
-    printf(" Gmem memory free (GiB): %.2f\n", free_bytes / (1024.0 * 1024 * 1024));
-    printf(" Gmem memory total (GiB): %.2f\n", total_bytes / (1024.0 * 1024 * 1024));
-    printf(" Caches\n");
-    printf(" Local L1 cache supported: %d\n", props.localL1CacheSupported);
-    printf(" Global L1 cache supported: %d\n", props.globalL1CacheSupported);
-    printf(" L2 size: %d KiB\n", props.l2CacheSize / (1024));
-    // MV: props.totalConstMem and props.sharedMemPerBlock cause assembler error
-    // MV: while compiling in TIARA gp cluster. Therefore commented out.
-    //!! printf(" Total const mem: %ld KiB\n", props.totalConstMem / (1024));
-    //!! printf(" Shared mem per block: %ld KiB\n", props.sharedMemPerBlock / (1024));
-    printf(" Other\n");
-    printf(" Warp size: %d\n", props.warpSize);
-    // printf(" Single to double perf. ratio: %dx\n",
-    // props.singleToDoublePrecisionPerfRatio); //Not supported with older CUDA
-    // versions
-    printf(" Stream priorities supported: %d\n", props.streamPrioritiesSupported);
-    printf("--------------------------------------------------\n");
-
-    return AC_SUCCESS;
-}
-
-AcResult
-acDeviceAutoOptimize(const Device device)
-{
-    cudaSetDevice(device->id);
-
-    const int3 start = (int3){
-        device->local_config.int_params[AC_nx_min],
-        device->local_config.int_params[AC_ny_min],
-        device->local_config.int_params[AC_nz_min],
-    };
-    const int3 end = (int3){
-        device->local_config.int_params[AC_nx_max],
-        device->local_config.int_params[AC_ny_max],
-        device->local_config.int_params[AC_nz_max],
-    };
-    return acKernelAutoOptimizeIntegration(start, end, device->vba);
-}
-
-AcResult
-acDeviceSynchronizeStream(const Device device, const Stream stream)
-{
-    cudaSetDevice(device->id);
-    if (stream == STREAM_ALL) {
-        cudaDeviceSynchronize();
-    }
-    else {
-        cudaStreamSynchronize(device->streams[stream]);
-    }
-    return AC_SUCCESS;
-}
-
 AcResult
 acDeviceSwapBuffers(const Device device)
 {
@@ -249,66 +205,6 @@ acDeviceSwapBuffers(const Device device)
     return AC_SUCCESS;
 }
 
-AcResult
-acDeviceLoadScalarUniform(const Device device, const Stream stream, const AcRealParam param,
-                          const AcReal value)
-{
-    cudaSetDevice(device->id);
-
-    if (param >= NUM_REAL_PARAMS)
-        return AC_FAILURE;
-
-    const size_t offset = (size_t)&d_mesh_info.real_params[param] - (size_t)&d_mesh_info;
-    ERRCHK_CUDA(cudaMemcpyToSymbolAsync(&d_mesh_info, &value, sizeof(value), offset,
-                                        cudaMemcpyHostToDevice, device->streams[stream]));
-    return AC_SUCCESS;
-}
-
-AcResult
-acDeviceLoadVectorUniform(const Device device, const Stream stream, const AcReal3Param param,
-                          const AcReal3 value)
-{
-    cudaSetDevice(device->id);
-
-    if (param >= NUM_REAL3_PARAMS || !NUM_REAL3_PARAMS)
-        return AC_FAILURE;
-
-    const size_t offset = (size_t)&d_mesh_info.real3_params[param] - (size_t)&d_mesh_info;
-    ERRCHK_CUDA(cudaMemcpyToSymbolAsync(&d_mesh_info, &value, sizeof(value), offset,
-                                        cudaMemcpyHostToDevice, device->streams[stream]));
-    return AC_SUCCESS;
-}
-
-AcResult
-acDeviceLoadIntUniform(const Device device, const Stream stream, const AcIntParam param,
-                       const int value)
-{
-    cudaSetDevice(device->id);
-
-    if (param >= NUM_INT_PARAMS)
-        return AC_FAILURE;
-
-    const size_t offset = (size_t)&d_mesh_info.int_params[param] - (size_t)&d_mesh_info;
-    ERRCHK_CUDA(cudaMemcpyToSymbolAsync(&d_mesh_info, &value, sizeof(value), offset,
-                                        cudaMemcpyHostToDevice, device->streams[stream]));
-    return AC_SUCCESS;
-}
-
-AcResult
-acDeviceLoadInt3Uniform(const Device device, const Stream stream, const AcInt3Param param,
-                        const int3 value)
-{
-    cudaSetDevice(device->id);
-
-    if (param >= NUM_INT3_PARAMS)
-        return AC_FAILURE;
-
-    const size_t offset = (size_t)&d_mesh_info.int3_params[param] - (size_t)&d_mesh_info;
-    ERRCHK_CUDA(cudaMemcpyToSymbolAsync(&d_mesh_info, &value, sizeof(value), offset,
-                                        cudaMemcpyHostToDevice, device->streams[stream]));
-    return AC_SUCCESS;
-}
-
 AcResult
 acDeviceLoadScalarArray(const Device device, const Stream stream, const ScalarArrayHandle handle,
                         const size_t start, const AcReal* data, const size_t num)
@@ -327,22 +223,6 @@ acDeviceLoadScalarArray(const Device device, const Stream stream, const ScalarAr
     return AC_SUCCESS;
 }
 
-AcResult
-acDeviceLoadMeshInfo(const Device device, const Stream stream, const AcMeshInfo device_config)
-{
-    cudaSetDevice(device->id);
-
-    ERRCHK_ALWAYS(device_config.int_params[AC_nx] == device->local_config.int_params[AC_nx]);
-    ERRCHK_ALWAYS(device_config.int_params[AC_ny] == device->local_config.int_params[AC_ny]);
-    ERRCHK_ALWAYS(device_config.int_params[AC_nz] == device->local_config.int_params[AC_nz]);
-    ERRCHK_ALWAYS(device_config.int_params[AC_multigpu_offset] ==
-                  device->local_config.int_params[AC_multigpu_offset]);
-
-    ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbolAsync(&d_mesh_info, &device_config, sizeof(device_config),
-                                               0, cudaMemcpyHostToDevice, device->streams[stream]));
-    return AC_SUCCESS;
-}
-
 AcResult
 acDeviceLoadVertexBufferWithOffset(const Device device, const Stream stream, const AcMesh host_mesh,
                                    const VertexBufferHandle vtxbuf_handle, const int3 src,
@@ -578,31 +458,6 @@ acDeviceReduceVec(const Device device, const Stream stream, const ReductionType
     return AC_SUCCESS;
 }
 
-#if PACKED_DATA_TRANSFERS // TODO DEPRECATED, see AC_MPI_ENABLED instead
-// Functions for calling packed data transfers
-#endif
-
-#if AC_MPI_ENABLED == 0
-AcResult
-acDeviceRunMPITest(void)
-{
-    WARNING("MPI was not enabled but acDeviceRunMPITest() was called");
-    return AC_FAILURE;
-}
-#else // MPI_ENABLED ///////////////////////////////////////////////////////////////////////////////
-
-#include <mpi.h>
-
-// Kernels
-#include "kernels/packing.cuh"
-
-// From Astaroth Utils
-#include "src/utils/config_loader.h"
-#include "src/utils/memory.h"
-#include "src/utils/timer_hires.h"
-#include "src/utils/verification.h"
-
-#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0]))
-
 static int
 mod(const int a, const int b)
 {
@@ -881,6 +736,8 @@ acDeviceGatherMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* dst)
     return AC_SUCCESS;
 }
 
+/*
+// Deprecated
 static AcResult
 acDeviceCommunicateBlocksMPI(const Device device, //
                              const int3* a0s,     // Src idx inside comp. domain
@@ -977,6 +834,7 @@ acDeviceCommunicateBlocksMPI(const Device device, //
     return AC_SUCCESS;
 }
+*/
 
 typedef struct {
     PackedData* srcs;
@@ -1558,7 +1416,7 @@ acDeviceRunMPITest(void)
     };
     submesh_info.int3_params[AC_multigpu_offset] = (int3){-1, -1, -1}; // TODO
     WARNING("AC_multigpu_offset not yet implemented");
-    acUpdateConfig(&submesh_info);
+    acUpdateBuiltinParams(&submesh_info);
     ERRCHK_ALWAYS(is_valid(submesh_info.real_params[AC_inv_dsx]));
     ERRCHK_ALWAYS(is_valid(submesh_info.real_params[AC_cs2_sound]));
@@ -1630,4 +1488,3 @@ acDeviceRunMPITest(void)
     //////////////////////////////////////////////////////////////
     return AC_SUCCESS;
 }
-#endif // MPI_ENABLED //////////////////////////////////////////////////////////////////////////////
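
The acDeviceLoad*Uniform functions removed above (presumably reappearing behind the kernels interface, whose large diff is suppressed further below) all rely on one trick: compute a member's byte offset within AcMeshInfo on the host and patch only that member of the device-side __constant__ copy. A self-contained sketch of the technique with a simplified struct (the demo names are illustrative; unlike the code above, it passes the symbol itself to cudaMemcpyToSymbolAsync, which is the form the templated runtime API documents):

#include <cstddef> // offsetof
#include <cuda_runtime.h>

typedef struct {
    int   int_params[8];
    float real_params[8];
} DemoInfo;

__constant__ DemoInfo d_demo_info; // device-resident mirror of the host config

// Update a single member of the __constant__ struct, leaving the rest intact.
static cudaError_t
demoLoadRealUniform(const int param, const float value, const cudaStream_t stream)
{
    // Byte offset of real_params[param] within the struct
    const size_t offset = offsetof(DemoInfo, real_params) + param * sizeof(float);
    return cudaMemcpyToSymbolAsync(d_demo_info, &value, sizeof(value), offset,
                                   cudaMemcpyHostToDevice, stream);
}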

View File

@@ -1,113 +0,0 @@
/*
Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.
This file is part of Astaroth.
Astaroth is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Astaroth is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* @file
* \brief Brief info.
*
* Detailed info.
*
*/
#pragma once
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
// clang-format off
/*
* =============================================================================
* General error checking
* =============================================================================
*/
#define ERROR(str) \
{ \
time_t t; time(&t); \
fprintf(stderr, "%s", ctime(&t)); \
fprintf(stderr, "\tError in file %s line %d: %s\n", \
__FILE__, __LINE__, str); \
fflush(stderr); \
exit(EXIT_FAILURE); \
abort(); \
}
#define WARNING(str) \
{ \
time_t t; time(&t); \
fprintf(stderr, "%s", ctime(&t)); \
fprintf(stderr, "\tWarning in file %s line %d: %s\n", \
__FILE__, __LINE__, str); \
fflush(stderr); \
}
// DO NOT REMOVE BRACKETS AROUND RETVAL. F.ex. if (!a < b) vs if (!(a < b)).
#define ERRCHK(retval) { if (!(retval)) ERROR(#retval " was false"); }
#define WARNCHK(retval) { if (!(retval)) WARNING(#retval " was false"); }
#define ERRCHK_ALWAYS(retval) { if (!(retval)) ERROR(#retval " was false"); }
/*
* =============================================================================
* CUDA-specific error checking
* =============================================================================
*/
#if defined(AC_USE_CUDA_RUNTIME_API) || defined(__CUDACC__)
static inline void
cuda_assert(cudaError_t code, const char* file, int line, bool abort = true)
{
if (code != cudaSuccess) {
time_t t; time(&t); \
fprintf(stderr, "%s", ctime(&t)); \
fprintf(stderr, "\tCUDA error in file %s line %d: %s\n", \
file, line, cudaGetErrorString(code)); \
fflush(stderr); \
if (abort)
exit(code);
}
}
#ifdef NDEBUG
#undef ERRCHK
#undef WARNCHK
#define ERRCHK(params)
#define WARNCHK(params)
#define ERRCHK_CUDA(params) params
#define WARNCHK_CUDA(params) params
#define ERRCHK_CUDA_KERNEL() {}
#else
#define ERRCHK_CUDA(params) { cuda_assert((params), __FILE__, __LINE__); }
#define WARNCHK_CUDA(params) { cuda_assert((params), __FILE__, __LINE__, false); }
#define ERRCHK_CUDA_KERNEL() \
{ \
ERRCHK_CUDA(cudaPeekAtLastError()); \
ERRCHK_CUDA(cudaDeviceSynchronize()); \
}
#endif
#define ERRCHK_CUDA_ALWAYS(params) { cuda_assert((params), __FILE__, __LINE__); }
#define ERRCHK_CUDA_KERNEL_ALWAYS() \
{ \
ERRCHK_CUDA_ALWAYS(cudaPeekAtLastError()); \
ERRCHK_CUDA_ALWAYS(cudaDeviceSynchronize()); \
}
#define WARNCHK_CUDA_ALWAYS(params) { cuda_assert((params), __FILE__, __LINE__, false); }
// clang-format on
#endif // AC_USE_CUDA_RUNTIME_API
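
For reference, a short usage sketch of the macros this (now removed) header defined; the kernel and buffer size are hypothetical:

#include <cuda_runtime.h>
#include "errchk.h" // the header shown above, wherever it now lives

__global__ void demo_kernel(float* out) { out[threadIdx.x] = 1.0f; }

static void
demo(void)
{
    float* buf = NULL;
    ERRCHK_CUDA(cudaMalloc(&buf, 32 * sizeof(float))); // wraps the runtime API return code
    ERRCHK(buf != NULL);                               // generic check; compiled out under NDEBUG
    demo_kernel<<<1, 32>>>(buf);
    // Launches fail asynchronously: peek at the launch error, then synchronize
    // so runtime faults inside the kernel surface here rather than later.
    ERRCHK_CUDA_KERNEL();
    ERRCHK_CUDA(cudaFree(buf));
}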

View File

@@ -1,194 +0,0 @@
/*
Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.
This file is part of Astaroth.
Astaroth is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Astaroth is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
#include "astaroth_grid.h"
#include "astaroth_device.h"
#include "errchk.h"
/** */
AcResult
acGridInit(const AcMeshInfo info)
{
int num_processes, pid;
MPI_Init(NULL, NULL);
MPI_Comm_size(MPI_COMM_WORLD, &num_processes);
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
char processor_name[MPI_MAX_PROCESSOR_NAME];
int name_len;
MPI_Get_processor_name(processor_name, &name_len);
printf("Processor %s. Process %d of %d.\n", processor_name, pid, num_processes);
AcMesh submesh;
return AC_FAILURE;
}
/** */
AcResult
acGridQuit(void)
{
WARNING("Proper multinode not yet implemented");
return AC_FAILURE;
}
#if 0
/** */
AcResult
acGridSynchronizeStream(const Stream stream)
{
WARNING("Proper multinode not yet implemented");
return AC_FAILURE;
}
/** */
AcResult
acGridSwapBuffers(void)
{
WARNING("Proper multinode not yet implemented");
return AC_FAILURE;
}
/** */
AcResult
acGridLoadConstant(const Stream stream, const AcRealParam param, const AcReal value)
{
WARNING("Proper multinode not yet implemented");
return AC_FAILURE;
}
/** */
AcResult
acGridLoadVertexBufferWithOffset(const Stream stream, const AcMesh host_mesh,
const VertexBufferHandle vtxbuf_handle, const int3 src,
const int3 dst, const int num_vertices)
{
WARNING("Proper multinode not yet implemented");
return AC_FAILURE;
}
/** */
AcResult
acGridLoadMeshWithOffset(const Stream stream, const AcMesh host_mesh, const int3 src,
const int3 dst, const int num_vertices)
{
WARNING("Proper multinode not yet implemented");
return AC_FAILURE;
}
/** */
AcResult
acGridLoadVertexBuffer(const Stream stream, const AcMesh host_mesh,
const VertexBufferHandle vtxbuf_handle)
{
WARNING("Proper multinode not yet implemented");
return AC_FAILURE;
}
#endif
/** */
AcResult
acGridLoadMesh(const Stream stream, const AcMesh host_mesh)
{
(void)stream;
(void)host_mesh;
WARNING("Not implemented");
return AC_FAILURE;
}
#if 0
/** */
AcResult
acGridStoreVertexBufferWithOffset(const Stream stream, const VertexBufferHandle vtxbuf_handle,
const int3 src, const int3 dst, const int num_vertices,
AcMesh* host_mesh)
{
WARNING("Not implemented");
return AC_FAILURE;
}
/** */
AcResult
acGridStoreMeshWithOffset(const Stream stream, const int3 src, const int3 dst,
const int num_vertices, AcMesh* host_mesh)
{
WARNING("Not implemented");
return AC_FAILURE;
}
/** */
AcResult
acGridStoreVertexBuffer(const Stream stream, const VertexBufferHandle vtxbuf_handle,
AcMesh* host_mesh)
{
WARNING("Not implemented");
return AC_FAILURE;
}
#endif
/** */
AcResult
acGridStoreMesh(const Stream stream, AcMesh* host_mesh)
{
(void)stream;
(void)host_mesh;
WARNING("Not implemented");
return AC_FAILURE;
}
#if 0
/** */
AcResult
acGridIntegrateSubstep(const Stream stream, const int step_number, const int3 start, const int3 end,
const AcReal dt)
{
WARNING("Not implemented");
return AC_FAILURE;
}
/** */
AcResult
acGridIntegrateStep(const Stream stream, const AcReal dt)
{
WARNING("Not implemented");
return AC_FAILURE;
}
/** */
AcResult
acGridPeriodicBoundcondStep(const Stream stream)
{
WARNING("Not implemented");
return AC_FAILURE;
}
/** */
AcResult
acGridReduceScal(const Stream stream, const ReductionType rtype,
const VertexBufferHandle vtxbuf_handle, AcReal* result)
{
return AC_FAILURE;
}
/** */
AcResult
acGridReduceVec(const Stream stream, const ReductionType rtype, const VertexBufferHandle vec0,
const VertexBufferHandle vec1, const VertexBufferHandle vec2, AcReal* result)
{
return AC_FAILURE;
}
#endif

View File

@@ -0,0 +1,3 @@
+## Astaroth Kernels
+add_library(astaroth_kernels STATIC kernels.cu)
+add_dependencies(astaroth_kernels dsl_headers)
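
The whole kernel library is now this single translation unit. The kernels.cu diff itself is suppressed further below, so its exact contents are not visible here; a plausible sketch of the aggregation pattern such a file would use, given that boundconds.cuh now carries its kernel inline (see its diff below) and the other kernel .cu files were deleted:

// kernels.cu -- hypothetical sketch of the single translation unit.
// nvcc sees all device code at once (whole-program compilation), so no
// separate device-link step is needed; the price is a longer compile.
#include "astaroth.h"

#include "boundconds.cuh" // periodic boundconds kernel + launcher (shown below)
// ... the integration, reduction, and packing kernels either #included the
// same way or defined directly in this file (assumption; diff suppressed) ...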

View File

@@ -1,91 +0,0 @@
/*
Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.
This file is part of Astaroth.
Astaroth is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Astaroth is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* @file
* \brief Brief info.
*
* Detailed info.
*
*/
#include "boundconds.cuh"
#include "common.cuh"
#include "src/core/errchk.h"
__global__ void
kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vtxbuf)
{
const int i_dst = start.x + threadIdx.x + blockIdx.x * blockDim.x;
const int j_dst = start.y + threadIdx.y + blockIdx.y * blockDim.y;
const int k_dst = start.z + threadIdx.z + blockIdx.z * blockDim.z;
// If within the start-end range (this allows threadblock dims that are not
// divisible by end - start)
if (i_dst >= end.x || j_dst >= end.y || k_dst >= end.z)
return;
// if (i_dst >= DCONST(AC_mx) || j_dst >= DCONST(AC_my) || k_dst >= DCONST(AC_mz))
// return;
// If destination index is inside the computational domain, return since
// the boundary conditions are only applied to the ghost zones
if (i_dst >= DCONST(AC_nx_min) && i_dst < DCONST(AC_nx_max) && j_dst >= DCONST(AC_ny_min) &&
j_dst < DCONST(AC_ny_max) && k_dst >= DCONST(AC_nz_min) && k_dst < DCONST(AC_nz_max))
return;
// Find the source index
// Map to nx, ny, nz coordinates
int i_src = i_dst - DCONST(AC_nx_min);
int j_src = j_dst - DCONST(AC_ny_min);
int k_src = k_dst - DCONST(AC_nz_min);
// Translate (s.t. the index is always positive)
i_src += DCONST(AC_nx);
j_src += DCONST(AC_ny);
k_src += DCONST(AC_nz);
// Wrap
i_src %= DCONST(AC_nx);
j_src %= DCONST(AC_ny);
k_src %= DCONST(AC_nz);
// Map to mx, my, mz coordinates
i_src += DCONST(AC_nx_min);
j_src += DCONST(AC_ny_min);
k_src += DCONST(AC_nz_min);
const int src_idx = DEVICE_VTXBUF_IDX(i_src, j_src, k_src);
const int dst_idx = DEVICE_VTXBUF_IDX(i_dst, j_dst, k_dst);
vtxbuf[dst_idx] = vtxbuf[src_idx];
}
AcResult
acKernelPeriodicBoundconds(const cudaStream_t stream, const int3 start, const int3 end,
AcReal* vtxbuf)
{
const dim3 tpb(8, 2, 8);
const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x),
(unsigned int)ceil((end.y - start.y) / (float)tpb.y),
(unsigned int)ceil((end.z - start.z) / (float)tpb.z));
kernel_periodic_boundconds<<<bpg, tpb, 0, stream>>>(start, end, vtxbuf);
ERRCHK_CUDA_KERNEL();
return AC_SUCCESS;
}

View File

@@ -1,40 +1,62 @@
-/*
-    Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.
-
-    This file is part of Astaroth.
-
-    Astaroth is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    Astaroth is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
-*/
-/**
- * @file
- * \brief Brief info.
- *
- * Detailed info.
- *
- */
 #pragma once
-#include "astaroth.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-AcResult acKernelPeriodicBoundconds(const cudaStream_t stream, const int3 start, const int3 end,
-                                    AcReal* vtxbuf);
-
-#ifdef __cplusplus
-} // extern "C"
-#endif
+
+static __global__ void
+kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vtxbuf)
+{
+    const int i_dst = start.x + threadIdx.x + blockIdx.x * blockDim.x;
+    const int j_dst = start.y + threadIdx.y + blockIdx.y * blockDim.y;
+    const int k_dst = start.z + threadIdx.z + blockIdx.z * blockDim.z;
+
+    // If within the start-end range (this allows threadblock dims that are not
+    // divisible by end - start)
+    if (i_dst >= end.x || j_dst >= end.y || k_dst >= end.z)
+        return;
+
+    // if (i_dst >= DCONST(AC_mx) || j_dst >= DCONST(AC_my) || k_dst >= DCONST(AC_mz))
+    //     return;
+
+    // If destination index is inside the computational domain, return since
+    // the boundary conditions are only applied to the ghost zones
+    if (i_dst >= DCONST(AC_nx_min) && i_dst < DCONST(AC_nx_max) && j_dst >= DCONST(AC_ny_min) &&
+        j_dst < DCONST(AC_ny_max) && k_dst >= DCONST(AC_nz_min) && k_dst < DCONST(AC_nz_max))
+        return;
+
+    // Find the source index
+    // Map to nx, ny, nz coordinates
+    int i_src = i_dst - DCONST(AC_nx_min);
+    int j_src = j_dst - DCONST(AC_ny_min);
+    int k_src = k_dst - DCONST(AC_nz_min);
+
+    // Translate (s.t. the index is always positive)
+    i_src += DCONST(AC_nx);
+    j_src += DCONST(AC_ny);
+    k_src += DCONST(AC_nz);
+
+    // Wrap
+    i_src %= DCONST(AC_nx);
+    j_src %= DCONST(AC_ny);
+    k_src %= DCONST(AC_nz);
+
+    // Map to mx, my, mz coordinates
+    i_src += DCONST(AC_nx_min);
+    j_src += DCONST(AC_ny_min);
+    k_src += DCONST(AC_nz_min);
+
+    const int src_idx = DEVICE_VTXBUF_IDX(i_src, j_src, k_src);
+    const int dst_idx = DEVICE_VTXBUF_IDX(i_dst, j_dst, k_dst);
+    vtxbuf[dst_idx] = vtxbuf[src_idx];
+}
+
+AcResult
+acKernelPeriodicBoundconds(const cudaStream_t stream, const int3 start, const int3 end,
+                           AcReal* vtxbuf)
+{
+    const dim3 tpb(8, 2, 8);
+    const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x),
+                   (unsigned int)ceil((end.y - start.y) / (float)tpb.y),
+                   (unsigned int)ceil((end.z - start.z) / (float)tpb.z));
+
+    kernel_periodic_boundconds<<<bpg, tpb, 0, stream>>>(start, end, vtxbuf);
+    ERRCHK_CUDA_KERNEL();
+    return AC_SUCCESS;
+}
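
The translate-then-wrap arithmetic in the kernel is easiest to check on one axis in isolation. A host-side sketch of the same mapping (names are illustrative):

#include <cstdio>

// Map a ghost-zone index dst in padded [0, m) coordinates to its periodic
// source, where the computational domain occupies [n_min, n_min + n).
static int
periodic_src(const int dst, const int n_min, const int n)
{
    int src = dst - n_min; // to 0-based computational coordinates
    src += n;              // translate so the value is non-negative before %
    src %= n;              // wrap into [0, n)
    return src + n_min;    // back to padded coordinates
}

int main(void)
{
    // n = 128 interior cells with a 3-wide ghost zone (n_min = 3): the
    // leftmost ghost cell (padded index 0, computational index -3) wraps to
    // computational index 125, i.e. padded index 128.
    printf("%d\n", periodic_src(0, 3, 128)); // prints 128
    return 0;
}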

View File

@@ -1,127 +0,0 @@
/*
Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.
This file is part of Astaroth.
Astaroth is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Astaroth is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* @file
* \brief Brief info.
*
* Detailed info.
*
*/
#pragma once
#include "astaroth.h"
extern __constant__ AcMeshInfo d_mesh_info;
typedef struct {
AcReal* in[NUM_VTXBUF_HANDLES];
AcReal* out[NUM_VTXBUF_HANDLES];
AcReal* profiles[NUM_SCALARARRAY_HANDLES];
} VertexBufferArray;
static int __device__ __forceinline__
DCONST(const AcIntParam param)
{
return d_mesh_info.int_params[param];
}
static int3 __device__ __forceinline__
DCONST(const AcInt3Param param)
{
return d_mesh_info.int3_params[param];
}
static AcReal __device__ __forceinline__
DCONST(const AcRealParam param)
{
return d_mesh_info.real_params[param];
}
static AcReal3 __device__ __forceinline__
DCONST(const AcReal3Param param)
{
return d_mesh_info.real3_params[param];
}
static __device__ constexpr VertexBufferHandle
DCONST(const VertexBufferHandle handle)
{
return handle;
}
#define DEVICE_VTXBUF_IDX(i, j, k) ((i) + (j)*DCONST(AC_mx) + (k)*DCONST(AC_mxy))
#define DEVICE_1D_COMPDOMAIN_IDX(i, j, k) ((i) + (j)*DCONST(AC_nx) + (k)*DCONST(AC_nxy))
#define globalGridN (d_mesh_info.int3_params[AC_global_grid_n])
//#define globalMeshM // Placeholder
//#define localMeshN // Placeholder
//#define localMeshM // Placeholder
//#define localMeshN_min // Placeholder
//#define globalMeshN_min // Placeholder
#define d_multigpu_offset (d_mesh_info.int3_params[AC_multigpu_offset])
//#define d_multinode_offset (d_mesh_info.int3_params[AC_multinode_offset]) // Placeholder
static __device__ constexpr int
IDX(const int i)
{
return i;
}
static __device__ __forceinline__ int
IDX(const int i, const int j, const int k)
{
return DEVICE_VTXBUF_IDX(i, j, k);
}
static __device__ __forceinline__ int
IDX(const int3 idx)
{
return DEVICE_VTXBUF_IDX(idx.x, idx.y, idx.z);
}
//#include <thrust/complex.h>
// using namespace thrust;
#include <cuComplex.h>
#if AC_DOUBLE_PRECISION == 1
typedef cuDoubleComplex acComplex;
#define acComplex(x, y) make_cuDoubleComplex(x, y)
#else
typedef cuFloatComplex acComplex;
#define acComplex(x, y) make_cuFloatComplex(x, y)
#define exp(x) expf(x)
#define sin(x) sinf(x)
#define cos(x) cosf(x)
#define sqrt(x) sqrtf(x)
#endif
static __device__ inline acComplex
exp(const acComplex& val)
{
return acComplex(exp(val.x) * cos(val.y), exp(val.x) * sin(val.y));
}
static __device__ inline acComplex operator*(const AcReal& a, const acComplex& b)
{
return (acComplex){a * b.x, a * b.y};
}
static __device__ inline acComplex operator*(const acComplex& b, const AcReal& a)
{
return (acComplex){a * b.x, a * b.y};
}
static __device__ inline acComplex operator*(const acComplex& a, const acComplex& b)
{
return (acComplex){a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x};
}
//#include <complex>
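
The overloads above give acComplex full complex arithmetic; in particular the exp() overload is Euler's formula, exp(x + iy) = e^x (cos y + i sin y). A small device-side sketch using them (hypothetical helper, assuming the single-precision path where AC_DOUBLE_PRECISION == 0):

static __device__ acComplex
demo_rotate(const acComplex z, const AcReal theta)
{
    // exp(acComplex(0, theta)) = (cos theta, sin theta) by Euler's formula,
    // so this multiplication rotates z by theta in the complex plane.
    return exp(acComplex(0, theta)) * z;
}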

File diff suppressed because it is too large

View File

@@ -1,338 +0,0 @@
/*
Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.
This file is part of Astaroth.
Astaroth is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Astaroth is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* @file
* \brief Brief info.
*
* Detailed info.
*
*/
#pragma once
#include "device_globals.cuh"
#include "src/core/errchk.h"
#include "src/core/math_utils.h"
// Function pointer definitions
typedef AcReal (*ReduceFunc)(const AcReal&, const AcReal&);
typedef AcReal (*ReduceInitialScalFunc)(const AcReal&);
typedef AcReal (*ReduceInitialVecFunc)(const AcReal&, const AcReal&,
const AcReal&);
// clang-format off
/* Comparison funcs */
__device__ inline AcReal
_device_max(const AcReal& a, const AcReal& b) { return a > b ? a : b; }
__device__ inline AcReal
_device_min(const AcReal& a, const AcReal& b) { return a < b ? a : b; }
__device__ inline AcReal
_device_sum(const AcReal& a, const AcReal& b) { return a + b; }
/* Function used to determine the values used during reduction */
__device__ inline AcReal
_device_length_scal(const AcReal& a) { return AcReal(a); }
__device__ inline AcReal
_device_squared_scal(const AcReal& a) { return (AcReal)(a*a); }
__device__ inline AcReal
_device_exp_squared_scal(const AcReal& a) { return exp(a)*exp(a); }
__device__ inline AcReal
_device_length_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return sqrt(a*a + b*b + c*c); }
__device__ inline AcReal
_device_squared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return _device_squared_scal(a) + _device_squared_scal(b) + _device_squared_scal(c); }
__device__ inline AcReal
_device_exp_squared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return _device_exp_squared_scal(a) + _device_exp_squared_scal(b) + _device_exp_squared_scal(c); }
// clang-format on
__device__ inline bool
oob(const int& i, const int& j, const int& k)
{
if (i >= d_mesh_info.int_params[AC_nx] ||
j >= d_mesh_info.int_params[AC_ny] ||
k >= d_mesh_info.int_params[AC_nz])
return true;
else
return false;
}
template <ReduceInitialScalFunc reduce_initial>
__global__ void
_kernel_reduce_scal(const __restrict__ AcReal* src, AcReal* dst)
{
const int i = threadIdx.x + blockIdx.x * blockDim.x;
const int j = threadIdx.y + blockIdx.y * blockDim.y;
const int k = threadIdx.z + blockIdx.z * blockDim.z;
if (oob(i, j, k))
return;
const int src_idx = DEVICE_VTXBUF_IDX(
i + d_mesh_info.int_params[AC_nx_min],
j + d_mesh_info.int_params[AC_ny_min],
k + d_mesh_info.int_params[AC_nz_min]);
const int dst_idx = DEVICE_1D_COMPDOMAIN_IDX(i, j, k);
dst[dst_idx] = reduce_initial(src[src_idx]);
}
template <ReduceInitialVecFunc reduce_initial>
__global__ void
_kernel_reduce_vec(const __restrict__ AcReal* src_a,
const __restrict__ AcReal* src_b,
const __restrict__ AcReal* src_c, AcReal* dst)
{
const int i = threadIdx.x + blockIdx.x * blockDim.x;
const int j = threadIdx.y + blockIdx.y * blockDim.y;
const int k = threadIdx.z + blockIdx.z * blockDim.z;
if (oob(i, j, k))
return;
const int src_idx = DEVICE_VTXBUF_IDX(
i + d_mesh_info.int_params[AC_nx_min],
j + d_mesh_info.int_params[AC_ny_min],
k + d_mesh_info.int_params[AC_nz_min]);
const int dst_idx = DEVICE_1D_COMPDOMAIN_IDX(i, j, k);
dst[dst_idx] = reduce_initial(src_a[src_idx], src_b[src_idx],
src_c[src_idx]);
}
///////////////////////////////////////////////////////////////////////////////
#define BLOCK_SIZE (1024)
#define ELEMS_PER_THREAD (32)
template <ReduceFunc reduce>
__global__ void
_kernel_reduce(AcReal* src, AcReal* result)
{
const int idx = threadIdx.x + blockIdx.x * BLOCK_SIZE * ELEMS_PER_THREAD;
const int scratchpad_size = DCONST(AC_nxyz);
if (idx >= scratchpad_size)
return;
__shared__ AcReal smem[BLOCK_SIZE];
AcReal tmp = src[idx];
for (int i = 1; i < ELEMS_PER_THREAD; ++i) {
const int src_idx = idx + i * BLOCK_SIZE;
if (src_idx >= scratchpad_size) {
// This check is for safety: if accessing uninitialized values
// beyond the mesh boundaries, we will immediately start seeing NANs
if (threadIdx.x < BLOCK_SIZE)
smem[threadIdx.x] = NAN;
else
break;
}
tmp = reduce(tmp, src[src_idx]);
}
smem[threadIdx.x] = tmp;
__syncthreads();
int offset = BLOCK_SIZE / 2;
while (offset > 0) {
if (threadIdx.x < offset) {
tmp = reduce(tmp, smem[threadIdx.x + offset]);
smem[threadIdx.x] = tmp;
}
offset /= 2;
__syncthreads();
}
if (threadIdx.x == 0)
src[idx] = tmp;
}
template <ReduceFunc reduce>
__global__ void
_kernel_reduce_block(const __restrict__ AcReal* src, AcReal* result)
{
const int scratchpad_size = DCONST(AC_nxyz);
const int idx = threadIdx.x + blockIdx.x * BLOCK_SIZE * ELEMS_PER_THREAD;
AcReal tmp = src[idx];
const int block_offset = BLOCK_SIZE * ELEMS_PER_THREAD;
for (int i = 1; idx + i * block_offset < scratchpad_size; ++i)
tmp = reduce(tmp, src[idx + i * block_offset]);
*result = tmp;
}
//////////////////////////////////////////////////////////////////////////////
AcReal
_reduce_scal(const cudaStream_t stream,
const ReductionType& rtype, const int& nx, const int& ny,
const int& nz, const AcReal* vertex_buffer,
AcReal* reduce_scratchpad, AcReal* reduce_result)
{
bool solve_mean = false;
const dim3 tpb(32, 4, 1);
const dim3 bpg(int(ceil(AcReal(nx) / tpb.x)), int(ceil(AcReal(ny) / tpb.y)),
int(ceil(AcReal(nz) / tpb.z)));
const int scratchpad_size = nx * ny * nz;
const int bpg2 = (unsigned int)ceil(AcReal(scratchpad_size) /
AcReal(ELEMS_PER_THREAD * BLOCK_SIZE));
switch (rtype) {
case RTYPE_MAX:
_kernel_reduce_scal<_device_length_scal>
<<<bpg, tpb, 0, stream>>>(vertex_buffer, reduce_scratchpad);
_kernel_reduce<_device_max>
<<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
_kernel_reduce_block<_device_max>
<<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
break;
case RTYPE_MIN:
_kernel_reduce_scal<_device_length_scal>
<<<bpg, tpb, 0, stream>>>(vertex_buffer, reduce_scratchpad);
_kernel_reduce<_device_min>
<<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
_kernel_reduce_block<_device_min>
<<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
break;
case RTYPE_RMS:
_kernel_reduce_scal<_device_squared_scal>
<<<bpg, tpb, 0, stream>>>(vertex_buffer, reduce_scratchpad);
_kernel_reduce<_device_sum>
<<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
_kernel_reduce_block<_device_sum>
<<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
solve_mean = true;
break;
case RTYPE_RMS_EXP:
_kernel_reduce_scal<_device_exp_squared_scal>
<<<bpg, tpb, 0, stream>>>(vertex_buffer, reduce_scratchpad);
_kernel_reduce<_device_sum>
<<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
_kernel_reduce_block<_device_sum>
<<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
solve_mean = true;
break;
default:
ERROR("Unrecognized RTYPE");
}
AcReal result;
cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
if (solve_mean) {
const AcReal inv_n = AcReal(1.0) / (nx * ny * nz);
return inv_n * result;
}
else {
return result;
}
}
AcReal
_reduce_vec(const cudaStream_t stream,
const ReductionType& rtype, const int& nx, const int& ny,
const int& nz, const AcReal* vertex_buffer_a,
const AcReal* vertex_buffer_b, const AcReal* vertex_buffer_c,
AcReal* reduce_scratchpad, AcReal* reduce_result)
{
bool solve_mean = false;
const dim3 tpb(32, 4, 1);
const dim3 bpg(int(ceil(float(nx) / tpb.x)),
int(ceil(float(ny) / tpb.y)),
int(ceil(float(nz) / tpb.z)));
const int scratchpad_size = nx * ny * nz;
const int bpg2 = (unsigned int)ceil(float(scratchpad_size) /
float(ELEMS_PER_THREAD * BLOCK_SIZE));
// "Features" of this quick & efficient reduction:
// Block size must be smaller than the computational domain size
// (otherwise we would have do some additional bounds checking in the
// second half of _kernel_reduce, which gets quite confusing)
// Also the BLOCK_SIZE must be a multiple of two s.t. we can easily split
// the work without worrying too much about the array bounds.
ERRCHK(BLOCK_SIZE <= scratchpad_size);
ERRCHK(!(BLOCK_SIZE % 2));
// NOTE! Also does not work properly with non-power of two mesh dimension
// Issue is with "smem[BLOCK_SIZE];". If you init smem to NANs, you can
// see that uninitialized smem values are used in the comparison
ERRCHK(is_power_of_two(nx));
ERRCHK(is_power_of_two(ny));
ERRCHK(is_power_of_two(nz));
switch (rtype) {
case RTYPE_MAX:
_kernel_reduce_vec<_device_length_vec>
<<<bpg, tpb, 0, stream>>>(vertex_buffer_a, vertex_buffer_b, vertex_buffer_c,
reduce_scratchpad);
_kernel_reduce<_device_max>
<<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
_kernel_reduce_block<_device_max>
<<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
break;
case RTYPE_MIN:
_kernel_reduce_vec<_device_length_vec>
<<<bpg, tpb, 0, stream>>>(vertex_buffer_a, vertex_buffer_b, vertex_buffer_c,
reduce_scratchpad);
_kernel_reduce<_device_min>
<<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
_kernel_reduce_block<_device_min>
<<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
break;
case RTYPE_RMS:
_kernel_reduce_vec<_device_squared_vec>
<<<bpg, tpb, 0, stream>>>(vertex_buffer_a, vertex_buffer_b, vertex_buffer_c,
reduce_scratchpad);
_kernel_reduce<_device_sum>
<<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
_kernel_reduce_block<_device_sum>
<<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
solve_mean = true;
break;
case RTYPE_RMS_EXP:
_kernel_reduce_vec<_device_exp_squared_vec>
<<<bpg, tpb, 0, stream>>>(vertex_buffer_a, vertex_buffer_b, vertex_buffer_c,
reduce_scratchpad);
_kernel_reduce<_device_sum>
<<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
_kernel_reduce_block<_device_sum>
<<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
solve_mean = true;
break;
default:
ERROR("Unrecognized RTYPE");
}
AcReal result;
cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
if (solve_mean) {
const AcReal inv_n = AcReal(1.0) / (nx * ny * nz);
return inv_n * result;
}
else {
return result;
}
}
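
The three-kernel chain above (an elementwise map, a block-strided tree reduction in shared memory, then a final serial pass over the per-block results) is driven from the host. A hedged sketch of a caller, following the scratchpad convention above (the driver function itself is hypothetical; ERRCHK_CUDA is from errchk.h):

// Hypothetical driver: reduce one vertex buffer to its maximum value.
AcReal
demo_reduce_max(const cudaStream_t stream, const int nx, const int ny, const int nz,
                const AcReal* vtxbuf)
{
    AcReal *scratchpad, *result;
    // One scratchpad slot per computational-domain cell, plus one result slot
    ERRCHK_CUDA(cudaMalloc(&scratchpad, nx * ny * nz * sizeof(AcReal)));
    ERRCHK_CUDA(cudaMalloc(&result, sizeof(AcReal)));
    const AcReal max = _reduce_scal(stream, RTYPE_MAX, nx, ny, nz, vtxbuf, scratchpad, result);
    ERRCHK_CUDA(cudaFree(scratchpad));
    ERRCHK_CUDA(cudaFree(result));
    return max;
}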

View File

@@ -1,742 +0,0 @@
/*
Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.
This file is part of Astaroth.
Astaroth is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Astaroth is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* @file
* \brief Implementation of the integration pipeline
*
*
*
*/
#pragma once
#include "device_globals.cuh"
#include <assert.h>
/*
#define RK_THREADS_X (32)
#define RK_THREADS_Y (1)
#define RK_THREADS_Z (4)
#define RK_LAUNCH_BOUND_MIN_BLOCKS (4)
#define RK_THREADBLOCK_SIZE (RK_THREADS_X * RK_THREADS_Y * RK_THREADS_Z)
*/
static __device__ __forceinline__ int
IDX(const int i)
{
return i;
}
static __device__ __forceinline__ int
IDX(const int i, const int j, const int k)
{
return DEVICE_VTXBUF_IDX(i, j, k);
}
static __device__ __forceinline__ int
IDX(const int3 idx)
{
return DEVICE_VTXBUF_IDX(idx.x, idx.y, idx.z);
}
static __forceinline__ AcMatrix
create_rotz(const AcReal radians)
{
AcMatrix mat;
mat.row[0] = (AcReal3){cos(radians), -sin(radians), 0};
mat.row[1] = (AcReal3){sin(radians), cos(radians), 0};
mat.row[2] = (AcReal3){0, 0, 0};
return mat;
}
#if AC_DOUBLE_PRECISION == 0
#define sin __sinf
#define cos __cosf
#define exp __expf
#define rsqrt rsqrtf // hardware reciprocal sqrt
#endif // AC_DOUBLE_PRECISION == 0
/*
typedef struct {
int i, j, k;
} int3;*/
/*
* =============================================================================
* Level 0 (Input Assembly Stage)
* =============================================================================
*/
/*
* =============================================================================
* Level 0.1 (Read stencil elements and solve derivatives)
* =============================================================================
*/
static __device__ __forceinline__ AcReal
first_derivative(const AcReal* __restrict__ pencil, const AcReal inv_ds)
{
#if STENCIL_ORDER == 2
const AcReal coefficients[] = {0, 1.0 / 2.0};
#elif STENCIL_ORDER == 4
const AcReal coefficients[] = {0, 2.0 / 3.0, -1.0 / 12.0};
#elif STENCIL_ORDER == 6
const AcReal coefficients[] = {0, 3.0 / 4.0, -3.0 / 20.0, 1.0 / 60.0};
#elif STENCIL_ORDER == 8
const AcReal coefficients[] = {0, 4.0 / 5.0, -1.0 / 5.0, 4.0 / 105.0,
-1.0 / 280.0};
#endif
#define MID (STENCIL_ORDER / 2)
AcReal res = 0;
#pragma unroll
for (int i = 1; i <= MID; ++i)
res += coefficients[i] * (pencil[MID + i] - pencil[MID - i]);
return res * inv_ds;
}
static __device__ __forceinline__ AcReal
second_derivative(const AcReal* __restrict__ pencil, const AcReal inv_ds)
{
#if STENCIL_ORDER == 2
const AcReal coefficients[] = {-2., 1.};
#elif STENCIL_ORDER == 4
const AcReal coefficients[] = {-5.0/2.0, 4.0/3.0, -1.0/12.0};
#elif STENCIL_ORDER == 6
const AcReal coefficients[] = {-49.0 / 18.0, 3.0 / 2.0, -3.0 / 20.0,
1.0 / 90.0};
#elif STENCIL_ORDER == 8
const AcReal coefficients[] = {-205.0 / 72.0, 8.0 / 5.0, -1.0 / 5.0,
8.0 / 315.0, -1.0 / 560.0};
#endif
#define MID (STENCIL_ORDER / 2)
AcReal res = coefficients[0] * pencil[MID];
#pragma unroll
for (int i = 1; i <= MID; ++i)
res += coefficients[i] * (pencil[MID + i] + pencil[MID - i]);
return res * inv_ds * inv_ds;
}
/** inv_ds: inverted mesh spacing f.ex. 1. / mesh.int_params[AC_dsx] */
static __device__ __forceinline__ AcReal
cross_derivative(const AcReal* __restrict__ pencil_a,
const AcReal* __restrict__ pencil_b, const AcReal inv_ds_a,
const AcReal inv_ds_b)
{
#if STENCIL_ORDER == 2
const AcReal coefficients[] = {0, 1.0 / 4.0};
#elif STENCIL_ORDER == 4
const AcReal coefficients[] = {0, 1.0 / 32.0, 1.0 / 64.0}; // TODO correct coefficients, these are just placeholders
#elif STENCIL_ORDER == 6
const AcReal fac = (1. / 720.);
const AcReal coefficients[] = {0.0 * fac, 270.0 * fac, -27.0 * fac,
2.0 * fac};
#elif STENCIL_ORDER == 8
const AcReal fac = (1. / 20160.);
const AcReal coefficients[] = {0.0 * fac, 8064. * fac, -1008. * fac,
128. * fac, -9. * fac};
#endif
#define MID (STENCIL_ORDER / 2)
AcReal res = AcReal(0.);
#pragma unroll
for (int i = 1; i <= MID; ++i) {
res += coefficients[i] * (pencil_a[MID + i] + pencil_a[MID - i] -
pencil_b[MID + i] - pencil_b[MID - i]);
}
return res * inv_ds_a * inv_ds_b;
}
static __device__ __forceinline__ AcReal
derx(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
AcReal pencil[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y, vertexIdx.z)];
return first_derivative(pencil, DCONST_REAL(AC_inv_dsx));
}
static __device__ __forceinline__ AcReal
derxx(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
AcReal pencil[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y, vertexIdx.z)];
return second_derivative(pencil, DCONST_REAL(AC_inv_dsx));
}
static __device__ __forceinline__ AcReal
derxy(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
AcReal pencil_a[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil_a[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2,
vertexIdx.y + offset - STENCIL_ORDER / 2, vertexIdx.z)];
AcReal pencil_b[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil_b[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2,
vertexIdx.y + STENCIL_ORDER / 2 - offset, vertexIdx.z)];
return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsx),
DCONST_REAL(AC_inv_dsy));
}
static __device__ __forceinline__ AcReal
derxz(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
AcReal pencil_a[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil_a[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y,
vertexIdx.z + offset - STENCIL_ORDER / 2)];
AcReal pencil_b[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil_b[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y,
vertexIdx.z + STENCIL_ORDER / 2 - offset)];
return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsx),
DCONST_REAL(AC_inv_dsz));
}
static __device__ __forceinline__ AcReal
dery(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
AcReal pencil[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2, vertexIdx.z)];
return first_derivative(pencil, DCONST_REAL(AC_inv_dsy));
}
static __device__ __forceinline__ AcReal
deryy(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
AcReal pencil[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2, vertexIdx.z)];
return second_derivative(pencil, DCONST_REAL(AC_inv_dsy));
}
static __device__ __forceinline__ AcReal
deryz(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
AcReal pencil_a[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil_a[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2,
vertexIdx.z + offset - STENCIL_ORDER / 2)];
AcReal pencil_b[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil_b[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2,
vertexIdx.z + STENCIL_ORDER / 2 - offset)];
return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsy),
DCONST_REAL(AC_inv_dsz));
}
static __device__ __forceinline__ AcReal
derz(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
AcReal pencil[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z + offset - STENCIL_ORDER / 2)];
return first_derivative(pencil, DCONST_REAL(AC_inv_dsz));
}
static __device__ __forceinline__ AcReal
derzz(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
AcReal pencil[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z + offset - STENCIL_ORDER / 2)];
return second_derivative(pencil, DCONST_REAL(AC_inv_dsz));
}
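// Worked check of the stencils above (illustrative; h denotes the grid
// spacing, inv_ds = 1/h). With STENCIL_ORDER == 6, MID == 3 and
//   first_derivative ~ (3/4*(p[4]-p[2]) - 3/20*(p[5]-p[1]) + 1/60*(p[6]-p[0])) * inv_ds.
// For f(x) = x^2 sampled at x = 0 the pencil is {9, 4, 1, 0, 1, 4, 9} * h^2:
// every paired difference cancels, so first_derivative returns 0 = f'(0),
// while second_derivative gives (3/2*2 - 3/20*8 + 1/90*18) * h^2 * inv_ds^2
// = 2 = f''(0), as expected.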
/*
* =============================================================================
* Level 0.2 (Caching functions)
* =============================================================================
*/
#include "stencil_assembly.cuh"
/*
typedef struct {
AcRealData x;
AcRealData y;
AcRealData z;
} AcReal3Data;
static __device__ __forceinline__ AcReal3Data
read_data(const int i, const int j, const int k,
AcReal* __restrict__ buf[], const int3& handle)
{
AcReal3Data data;
data.x = read_data(i, j, k, buf, handle.x);
data.y = read_data(i, j, k, buf, handle.y);
data.z = read_data(i, j, k, buf, handle.z);
return data;
}
*/
/*
* =============================================================================
* Level 0.3 (Built-in functions available during the Stencil Processing Stage)
* =============================================================================
*/
static __host__ __device__ __forceinline__ AcReal3
operator-(const AcReal3& a, const AcReal3& b)
{
return (AcReal3){a.x - b.x, a.y - b.y, a.z - b.z};
}
static __host__ __device__ __forceinline__ AcReal3
operator+(const AcReal3& a, const AcReal3& b)
{
return (AcReal3){a.x + b.x, a.y + b.y, a.z + b.z};
}
static __host__ __device__ __forceinline__ AcReal3
operator-(const AcReal3& a)
{
return (AcReal3){-a.x, -a.y, -a.z};
}
static __host__ __device__ __forceinline__ AcReal3
operator*(const AcReal a, const AcReal3& b)
{
return (AcReal3){a * b.x, a * b.y, a * b.z};
}
static __host__ __device__ __forceinline__ AcReal
dot(const AcReal3& a, const AcReal3& b)
{
return a.x * b.x + a.y * b.y + a.z * b.z;
}
static __host__ __device__ __forceinline__ AcReal3
mul(const AcMatrix& aa, const AcReal3& x)
{
return (AcReal3){dot(aa.row[0], x), dot(aa.row[1], x), dot(aa.row[2], x)};
}
static __host__ __device__ __forceinline__ AcReal3
cross(const AcReal3& a, const AcReal3& b)
{
AcReal3 c;
c.x = a.y * b.z - a.z * b.y;
c.y = a.z * b.x - a.x * b.z;
c.z = a.x * b.y - a.y * b.x;
return c;
}
static __host__ __device__ __forceinline__ bool
is_valid(const AcReal a)
{
return !isnan(a) && !isinf(a);
}
static __host__ __device__ __forceinline__ bool
is_valid(const AcReal3& a)
{
return is_valid(a.x) && is_valid(a.y) && is_valid(a.z);
}
/*
* =============================================================================
* Level 1 (Stencil Processing Stage)
* =============================================================================
*/
/*
* =============================================================================
* Level 1.1 (Terms)
* =============================================================================
*/
static __device__ __forceinline__ AcReal
laplace(const AcRealData& data)
{
return hessian(data).row[0].x + hessian(data).row[1].y + hessian(data).row[2].z;
}
static __device__ __forceinline__ AcReal
divergence(const AcReal3Data& vec)
{
return gradient(vec.x).x + gradient(vec.y).y + gradient(vec.z).z;
}
static __device__ __forceinline__ AcReal3
laplace_vec(const AcReal3Data& vec)
{
return (AcReal3){laplace(vec.x), laplace(vec.y), laplace(vec.z)};
}
static __device__ __forceinline__ AcReal3
curl(const AcReal3Data& vec)
{
return (AcReal3){gradient(vec.z).y - gradient(vec.y).z,
gradient(vec.x).z - gradient(vec.z).x,
gradient(vec.y).x - gradient(vec.x).y};
}
static __device__ __forceinline__ AcReal3
gradient_of_divergence(const AcReal3Data& vec)
{
return (AcReal3){hessian(vec.x).row[0].x + hessian(vec.y).row[0].y + hessian(vec.z).row[0].z,
hessian(vec.x).row[1].x + hessian(vec.y).row[1].y + hessian(vec.z).row[1].z,
hessian(vec.x).row[2].x + hessian(vec.y).row[2].y + hessian(vec.z).row[2].z};
}
// Takes uu gradients and returns S
static __device__ __forceinline__ AcMatrix
stress_tensor(const AcReal3Data& vec)
{
AcMatrix S;
S.row[0].x = AcReal(2. / 3.) * gradient(vec.x).x -
AcReal(1. / 3.) * (gradient(vec.y).y + gradient(vec.z).z);
S.row[0].y = AcReal(1. / 2.) * (gradient(vec.x).y + gradient(vec.y).x);
S.row[0].z = AcReal(1. / 2.) * (gradient(vec.x).z + gradient(vec.z).x);
S.row[1].y = AcReal(2. / 3.) * gradient(vec.y).y -
AcReal(1. / 3.) * (gradient(vec.x).x + gradient(vec.z).z);
S.row[1].z = AcReal(1. / 2.) * (gradient(vec.y).z + gradient(vec.z).y);
S.row[2].z = AcReal(2. / 3.) * gradient(vec.z).z -
AcReal(1. / 3.) * (gradient(vec.x).x + gradient(vec.y).y);
S.row[1].x = S.row[0].y;
S.row[2].x = S.row[0].z;
S.row[2].y = S.row[1].z;
return S;
}
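// Sketch (not part of the original source): S above is the traceless
// rate-of-strain tensor, so its trace should vanish identically. A hypothetical
// debug helper for checking that property:
static __device__ __forceinline__ AcReal
trace(const AcMatrix& mat)
{
return mat.row[0].x + mat.row[1].y + mat.row[2].z;
}
// e.g. assert(fabs(trace(stress_tensor(uu))) < AcReal(1e-6)) in a debug build.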
static __device__ __forceinline__ AcReal
contract(const AcMatrix& mat)
{
AcReal res = 0;
#pragma unroll
for (int i = 0; i < 3; ++i)
res += dot(mat.row[i], mat.row[i]);
return res;
}
/*
* =============================================================================
* Level 1.2 (Equations)
* =============================================================================
*/
static __device__ __forceinline__ AcReal
length(const AcReal3& vec)
{
return sqrt(vec.x * vec.x + vec.y * vec.y + vec.z * vec.z);
}
static __device__ __forceinline__ AcReal
reciprocal_len(const AcReal3& vec)
{
return rsqrt(vec.x * vec.x + vec.y * vec.y + vec.z * vec.z);
}
static __device__ __forceinline__ AcReal3
normalized(const AcReal3& vec)
{
const AcReal inv_len = reciprocal_len(vec);
return inv_len * vec;
}
// Sinusoidal forcing
// https://arxiv.org/pdf/1704.04676.pdf
__constant__ AcReal3 forcing_vec;
__constant__ AcReal forcing_phi;
static __device__ __forceinline__ AcReal3
forcing(const int i, const int j, const int k)
{
#define DOMAIN_SIZE_X (DCONST(AC_nx) * DCONST_REAL(AC_dsx))
#define DOMAIN_SIZE_Y (DCONST(AC_ny) * DCONST_REAL(AC_dsy))
#define DOMAIN_SIZE_Z (DCONST(AC_nz) * DCONST_REAL(AC_dsz))
const AcReal3 k_vec = (AcReal3){(i - DCONST(AC_nx_min)) * DCONST_REAL(AC_dsx) - AcReal(.5) * DOMAIN_SIZE_X,
(j - DCONST(AC_ny_min)) * DCONST_REAL(AC_dsy) - AcReal(.5) * DOMAIN_SIZE_Y,
(k - DCONST(AC_nz_min)) * DCONST_REAL(AC_dsz) - AcReal(.5) * DOMAIN_SIZE_Z};
AcReal inv_len = reciprocal_len(k_vec);
if (isnan(inv_len) || isinf(inv_len))
inv_len = 0;
if (inv_len > 2) // hack to make it cool
inv_len = 2;
const AcReal k_dot_x = dot(k_vec, forcing_vec);
const AcReal waves = cos(k_dot_x)*cos(forcing_phi) - sin(k_dot_x) * sin(forcing_phi);
return inv_len * inv_len * waves * forcing_vec;
}
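// Note (added for clarity): the 'waves' expression above is the angle-sum
// identity cos(k_dot_x) * cos(phi) - sin(k_dot_x) * sin(phi) == cos(k_dot_x + phi),
// i.e. a plane wave phase-shifted by forcing_phi.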
// Note: LNT0 and LNRHO0 must be set very carefully: if their magnitude differs from the other values in the mesh, we inherently lose precision
#define LNT0 (AcReal(0.0))
#define LNRHO0 (AcReal(0.0))
#define H_CONST (AcReal(0.0))
#define C_CONST (AcReal(0.0))
template <int step_number>
static __device__ __forceinline__ AcReal
rk3_integrate(const AcReal state_previous, const AcReal state_current,
const AcReal rate_of_change, const AcReal dt)
{
// Williamson (1980)
const AcReal alpha[] = {0, AcReal(.0), AcReal(-5. / 9.), AcReal(-153. / 128.)};
const AcReal beta[] = {0, AcReal(1. / 3.), AcReal(15. / 16.),
AcReal(8. / 15.)};
// Note the indexing: the arrays are padded with a leading entry so that no
// beta[step_number - 1] access appears; the unpadded form triggers a spurious
// "out-of-bounds" warning for step_number == 0 even though that branch is never
// taken
switch (step_number) {
case 0:
return state_current + beta[step_number + 1] * rate_of_change * dt;
case 1: // Fallthrough
case 2:
return state_current +
beta[step_number + 1] *
(alpha[step_number + 1] * (AcReal(1.) / beta[step_number]) *
(state_current - state_previous) +
rate_of_change * dt);
default:
return NAN;
}
}
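// For reference, the 2N-storage Williamson (1980) recurrence implemented above:
//   w_i = alpha_i * w_{i-1} + dt * f(u_{i-1})
//   u_i = u_{i-1} + beta_i * w_i
// The previous stage vector is recovered as w_{i-1} = (u_{i-1} - u_{i-2}) / beta_{i-1},
// which is the (state_current - state_previous) * (1 / beta[step_number]) factor
// in the fallthrough cases.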
/*
template <int step_number>
static __device__ __forceinline__ AcReal
rk3_integrate_scal(const AcReal state_previous, const AcReal state_current,
const AcReal rate_of_change, const AcReal dt)
{
// Williamson (1980)
const AcReal alpha[] = {AcReal(.0), AcReal(-5. / 9.), AcReal(-153. / 128.)};
const AcReal beta[] = {AcReal(1. / 3.), AcReal(15. / 16.),
AcReal(8. / 15.)};
switch (step_number) {
case 0:
return state_current + beta[step_number] * rate_of_change * dt;
case 1: // Fallthrough
case 2:
return state_current +
beta[step_number] *
(alpha[step_number] * (AcReal(1.) / beta[step_number - 1]) *
(state_current - state_previous) +
rate_of_change * dt);
default:
return NAN;
}
}
*/
template <int step_number>
static __device__ __forceinline__ AcReal3
rk3_integrate(const AcReal3 state_previous, const AcReal3 state_current,
const AcReal3 rate_of_change, const AcReal dt)
{
return (AcReal3) { rk3_integrate<step_number>(state_previous.x, state_current.x, rate_of_change.x, dt),
rk3_integrate<step_number>(state_previous.y, state_current.y, rate_of_change.y, dt),
rk3_integrate<step_number>(state_previous.z, state_current.z, rate_of_change.z, dt)};
}
#define rk3(state_previous, state_current, rate_of_change, dt)\
rk3_integrate<step_number>(state_previous, value(state_current), rate_of_change, dt)
/*
template <int step_number>
static __device__ __forceinline__ AcReal
rk3_integrate(const int idx, const AcReal out, const int handle,
const AcRealData& in_cached, const AcReal rate_of_change, const AcReal dt)
{
return rk3_integrate_scal<step_number>(out, value(in_cached), rate_of_change, dt);
}
template <int step_number>
static __device__ __forceinline__ AcReal3
rk3_integrate(const int idx, const AcReal3 out, const int3& handle,
const AcReal3Data& in_cached, const AcReal3& rate_of_change, const AcReal dt)
{
return (AcReal3) {
rk3_integrate<step_number>(idx, out, handle.x, in_cached.x, rate_of_change.x, dt),
rk3_integrate<step_number>(idx, out, handle.y, in_cached.y, rate_of_change.y, dt),
rk3_integrate<step_number>(idx, out, handle.z, in_cached.z, rate_of_change.z, dt)
};
}
#define RK3(handle, in_cached, rate_of_change, dt) \
rk3_integrate<step_number>(idx, buffer.out, handle, in_cached, rate_of_change, dt)
*/
/*
* =============================================================================
* Level 1.3 (Kernels)
* =============================================================================
*/
static __device__ void
write(AcReal* __restrict__ out[], const int handle, const int idx, const AcReal value)
{
out[handle][idx] = value;
}
static __device__ void
write(AcReal* __restrict__ out[], const int3 vec, const int idx, const AcReal3 value)
{
write(out, vec.x, idx, value.x);
write(out, vec.y, idx, value.y);
write(out, vec.z, idx, value.z);
}
static __device__ AcReal
read_out(const int idx, AcReal* __restrict__ field[], const int handle)
{
return field[handle][idx];
}
static __device__ AcReal3
read_out(const int idx, AcReal* __restrict__ field[], const int3 handle)
{
return (AcReal3) { read_out(idx, field, handle.x),
read_out(idx, field, handle.y),
read_out(idx, field, handle.z) };
}
#define WRITE_OUT(handle, value) (write(buffer.out, handle, idx, value))
#define READ(handle) (read_data(vertexIdx, buffer.in, handle))
#define READ_OUT(handle) (read_out(idx, buffer.out, handle))
// write() is also spelled out here for clarity; it is not part of the DSL
//#define WRITE(HANDLE) (write(idx, ...)) s.t. we don't have to insert boilerplate in the middle of the function
#define GEN_KERNEL_PARAM_BOILERPLATE \
const int3 start, const int3 end, VertexBufferArray buffer
#define GEN_KERNEL_BUILTIN_VARIABLES_BOILERPLATE() \
const int3 vertexIdx = (int3){threadIdx.x + blockIdx.x * blockDim.x + start.x,\
threadIdx.y + blockIdx.y * blockDim.y + start.y,\
threadIdx.z + blockIdx.z * blockDim.z + start.z};\
if (vertexIdx.x >= end.x || vertexIdx.y >= end.y || vertexIdx.z >= end.z)\
return;\
\
\
assert(vertexIdx.x < DCONST(AC_nx_max) && vertexIdx.y < DCONST(AC_ny_max) &&\
vertexIdx.z < DCONST(AC_nz_max));\
\
assert(vertexIdx.x >= DCONST(AC_nx_min) && vertexIdx.y >= DCONST(AC_ny_min) &&\
vertexIdx.z >= DCONST(AC_nz_min));\
\
const int idx = IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z);
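// A minimal sketch (assumed, not from the generated stencil_process.cuh below)
// of how the boilerplate macros are meant to be used; VTXBUF_LNRHO and the zero
// right-hand side are placeholders for illustration:
//
// template <int step_number>
// static __global__ void
// solve(GEN_KERNEL_PARAM_BOILERPLATE, const AcReal dt)
// {
//     GEN_KERNEL_BUILTIN_VARIABLES_BOILERPLATE();
//     const AcRealData lnrho = READ(VTXBUF_LNRHO);
//     WRITE_OUT(VTXBUF_LNRHO, rk3(READ_OUT(VTXBUF_LNRHO), lnrho, AcReal(0), dt));
// }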
#include "stencil_process.cuh"
/*
* =============================================================================
* Level 2 (Host calls)
* =============================================================================
*/
static AcReal
randf(void)
{
return AcReal(rand()) / AcReal(RAND_MAX);
}
AcResult
rk3_step_async(const cudaStream_t stream, const dim3& tpb,
const int3& start, const int3& end, const int& step_number,
const AcReal dt, const AcMeshInfo& /*mesh_info*/,
VertexBufferArray* buffer)
{
/////////////////// Forcing
#if LFORCING
const AcReal ff_scale = AcReal(.2);
static AcReal3 ff = ff_scale * (AcReal3){1, 0, 0};
const AcReal radians = randf() * AcReal(2*M_PI) / 360 / 8;
const AcMatrix rotz = create_rotz(radians);
ff = mul(rotz, ff);
cudaMemcpyToSymbolAsync(forcing_vec, &ff, sizeof(ff), 0, cudaMemcpyHostToDevice, stream);
const AcReal ff_phi = AcReal(M_PI);//AcReal(2 * M_PI) * randf();
cudaMemcpyToSymbolAsync(forcing_phi, &ff_phi, sizeof(ff_phi), 0, cudaMemcpyHostToDevice, stream);
#endif // LFORCING
//////////////////////////
const int nx = end.x - start.x;
const int ny = end.y - start.y;
const int nz = end.z - start.z;
const dim3 bpg(
(unsigned int)ceil(nx / AcReal(tpb.x)),
(unsigned int)ceil(ny / AcReal(tpb.y)),
(unsigned int)ceil(nz / AcReal(tpb.z)));
if (step_number == 0)
solve<0><<<bpg, tpb, 0, stream>>>(start, end, *buffer, dt);
else if (step_number == 1)
solve<1><<<bpg, tpb, 0, stream>>>(start, end, *buffer, dt);
else
solve<2><<<bpg, tpb, 0, stream>>>(start, end, *buffer, dt);
ERRCHK_CUDA_KERNEL();
return AC_SUCCESS;
}
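// Host-side usage sketch (assumed; stream, dt and vba come from the caller, and
// nghost is a placeholder for the halo width):
//   for (int substep = 0; substep < 3; ++substep)
//       rk3_step_async(stream, dim3(32, 1, 4),
//                      (int3){nghost, nghost, nghost},
//                      (int3){nghost + nx, nghost + ny, nghost + nz},
//                      substep, dt, info, &vba);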


@@ -1,293 +0,0 @@
/*
Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.
This file is part of Astaroth.
Astaroth is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Astaroth is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* @file
* \brief Brief info.
*
* Detailed info.
*
*/
#include "common.cuh"
#include "integration.cuh"
#include "src/core/errchk.h"
#include "src/core/math_utils.h"
#include <assert.h>
__constant__ AcMeshInfo d_mesh_info; // Extern in kernels/common.cuh.
static_assert(NUM_VTXBUF_HANDLES > 0, "ERROR: At least one uniform ScalarField must be declared.");
// Device info
#define REGISTERS_PER_THREAD (255)
#define MAX_REGISTERS_PER_BLOCK (65536)
#define MAX_THREADS_PER_BLOCK (1024)
#define WARP_SIZE (32)
#define make_int3(a, b, c) \
(int3) { (int)a, (int)b, (int)c }
template <int step_number>
static __device__ __forceinline__ AcReal
rk3_integrate(const AcReal state_previous, const AcReal state_current, const AcReal rate_of_change,
const AcReal dt)
{
// Williamson (1980)
const AcReal alpha[] = {0, AcReal(.0), AcReal(-5. / 9.), AcReal(-153. / 128.)};
const AcReal beta[] = {0, AcReal(1. / 3.), AcReal(15. / 16.), AcReal(8. / 15.)};
// Note the indexing: +1 to avoid an unnecessary warning about "out-of-bounds"
// access (when accessing beta[step_number-1] even when step_number >= 1)
switch (step_number) {
case 0:
return state_current + beta[step_number + 1] * rate_of_change * dt;
case 1: // Fallthrough
case 2:
return state_current +
beta[step_number + 1] * (alpha[step_number + 1] * (AcReal(1.) / beta[step_number]) *
(state_current - state_previous) +
rate_of_change * dt);
default:
return NAN;
}
}
template <int step_number>
static __device__ __forceinline__ AcReal3
rk3_integrate(const AcReal3 state_previous, const AcReal3 state_current,
const AcReal3 rate_of_change, const AcReal dt)
{
return (AcReal3){
rk3_integrate<step_number>(state_previous.x, state_current.x, rate_of_change.x, dt),
rk3_integrate<step_number>(state_previous.y, state_current.y, rate_of_change.y, dt),
rk3_integrate<step_number>(state_previous.z, state_current.z, rate_of_change.z, dt)};
}
#define rk3(state_previous, state_current, rate_of_change, dt) \
rk3_integrate<step_number>(state_previous, value(state_current), rate_of_change, dt)
static __device__ void
write(AcReal* __restrict__ out[], const int handle, const int idx, const AcReal value)
{
out[handle][idx] = value;
}
static __device__ __forceinline__ void
write(AcReal* __restrict__ out[], const int3 vec, const int idx, const AcReal3 value)
{
write(out, vec.x, idx, value.x);
write(out, vec.y, idx, value.y);
write(out, vec.z, idx, value.z);
}
static __device__ __forceinline__ AcReal
read_out(const int idx, AcReal* __restrict__ field[], const int handle)
{
return field[handle][idx];
}
static __device__ __forceinline__ AcReal3
read_out(const int idx, AcReal* __restrict__ field[], const int3 handle)
{
return (AcReal3){read_out(idx, field, handle.x), read_out(idx, field, handle.y),
read_out(idx, field, handle.z)};
}
#define WRITE_OUT(handle, value) (write(buffer.out, handle, idx, value))
#define READ(handle) (read_data(vertexIdx, globalVertexIdx, buffer.in, handle))
#define READ_OUT(handle) (read_out(idx, buffer.out, handle))
#define GEN_PREPROCESSED_PARAM_BOILERPLATE const int3 &vertexIdx, const int3 &globalVertexIdx
#define GEN_KERNEL_PARAM_BOILERPLATE const int3 start, const int3 end, VertexBufferArray buffer
#define GEN_KERNEL_BUILTIN_VARIABLES_BOILERPLATE() \
const int3 vertexIdx = (int3){threadIdx.x + blockIdx.x * blockDim.x + start.x, \
threadIdx.y + blockIdx.y * blockDim.y + start.y, \
threadIdx.z + blockIdx.z * blockDim.z + start.z}; \
const int3 globalVertexIdx = (int3){d_multigpu_offset.x + vertexIdx.x, \
d_multigpu_offset.y + vertexIdx.y, \
d_multigpu_offset.z + vertexIdx.z}; \
(void)globalVertexIdx; \
if (vertexIdx.x >= end.x || vertexIdx.y >= end.y || vertexIdx.z >= end.z) \
return; \
\
assert(vertexIdx.x < DCONST(AC_nx_max) && vertexIdx.y < DCONST(AC_ny_max) && \
vertexIdx.z < DCONST(AC_nz_max)); \
\
assert(vertexIdx.x >= DCONST(AC_nx_min) && vertexIdx.y >= DCONST(AC_ny_min) && \
vertexIdx.z >= DCONST(AC_nz_min)); \
\
const int idx = IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z);
#define GEN_DEVICE_FUNC_HOOK(identifier) \
template <int step_number> \
AcResult acDeviceKernel_##identifier(const cudaStream_t stream, const int3 start, \
const int3 end, VertexBufferArray vba) \
{ \
\
const dim3 tpb(32, 1, 4); \
\
const int3 n = end - start; \
const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), \
(unsigned int)ceil(n.y / AcReal(tpb.y)), \
(unsigned int)ceil(n.z / AcReal(tpb.z))); \
\
identifier<step_number><<<bpg, tpb, 0, stream>>>(start, end, vba); \
ERRCHK_CUDA_KERNEL(); \
\
return AC_SUCCESS; \
}
#include "user_kernels.h"
static dim3 rk3_tpb(32, 1, 4);
AcResult
acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferArray vba)
{
// RK3
dim3 best_dims(0, 0, 0);
float best_time = INFINITY;
const int num_iterations = 10;
for (int z = 1; z <= MAX_THREADS_PER_BLOCK; ++z) {
for (int y = 1; y <= MAX_THREADS_PER_BLOCK; ++y) {
for (int x = WARP_SIZE; x <= MAX_THREADS_PER_BLOCK; x += WARP_SIZE) {
if (x > end.x - start.x || y > end.y - start.y || z > end.z - start.z)
break;
if (x * y * z > MAX_THREADS_PER_BLOCK)
break;
if (x * y * z * REGISTERS_PER_THREAD > MAX_REGISTERS_PER_BLOCK)
break;
if (((x * y * z) % WARP_SIZE) != 0)
continue;
const dim3 tpb(x, y, z);
const int3 n = end - start;
const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), //
(unsigned int)ceil(n.y / AcReal(tpb.y)), //
(unsigned int)ceil(n.z / AcReal(tpb.z)));
cudaDeviceSynchronize();
if (cudaGetLastError() != cudaSuccess) // resets the error if any
continue;
// printf("(%d, %d, %d)\n", x, y, z);
cudaEvent_t tstart, tstop;
cudaEventCreate(&tstart);
cudaEventCreate(&tstop);
// #ifdef AC_dt
// acDeviceLoadScalarUniform(device, STREAM_DEFAULT, AC_dt, FLT_EPSILON); // TODO
// note, temporarily disabled
/*#else
ERROR("FATAL ERROR: acDeviceAutoOptimize() or
acDeviceIntegrateSubstep() was " "called, but AC_dt was not defined. Either define
it or call the generated " "device function acDeviceKernel_<kernel name> which does
not require the " "timestep to be defined.\n"); #endif*/
cudaEventRecord(tstart); // ---------------------------------------- Timing start
for (int i = 0; i < num_iterations; ++i)
solve<2><<<bpg, tpb>>>(start, end, vba);
cudaEventRecord(tstop); // ----------------------------------------- Timing end
cudaEventSynchronize(tstop);
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, tstart, tstop);
ERRCHK_CUDA_KERNEL_ALWAYS();
if (milliseconds < best_time) {
best_time = milliseconds;
best_dims = tpb;
}
}
}
}
#if VERBOSE_PRINTING
printf(
"Auto-optimization done. The best threadblock dimensions for rkStep: (%d, %d, %d) %f ms\n",
best_dims.x, best_dims.y, best_dims.z, double(best_time) / num_iterations);
#endif
/*
FILE* fp = fopen("../config/rk3_tbdims.cuh", "w");
ERRCHK(fp);
fprintf(fp, "%d, %d, %d\n", best_dims.x, best_dims.y, best_dims.z);
fclose(fp);
*/
rk3_tpb = best_dims;
return AC_SUCCESS;
}
AcResult
acKernelIntegrateSubstep(const cudaStream_t stream, const int step_number, const int3 start,
const int3 end, VertexBufferArray vba)
{
const dim3 tpb = rk3_tpb;
const int3 n = end - start;
const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), //
(unsigned int)ceil(n.y / AcReal(tpb.y)), //
(unsigned int)ceil(n.z / AcReal(tpb.z)));
//#ifdef AC_dt
// acDeviceLoadScalarUniform(device, stream, AC_dt, dt);
/*#else
(void)dt;
ERROR("FATAL ERROR: acDeviceAutoOptimize() or acDeviceIntegrateSubstep() was "
"called, but AC_dt was not defined. Either define it or call the generated "
"device function acDeviceKernel_<kernel name> which does not require the "
"timestep to be defined.\n");
#endif*/
if (step_number == 0)
solve<0><<<bpg, tpb, 0, stream>>>(start, end, vba);
else if (step_number == 1)
solve<1><<<bpg, tpb, 0, stream>>>(start, end, vba);
else
solve<2><<<bpg, tpb, 0, stream>>>(start, end, vba);
ERRCHK_CUDA_KERNEL();
return AC_SUCCESS;
}
static __global__ void
dummy_kernel(void)
{
DCONST((AcIntParam)0);
DCONST((AcInt3Param)0);
DCONST((AcRealParam)0);
DCONST((AcReal3Param)0);
acComplex a = exp(AcReal(1) * acComplex(1, 1) * AcReal(1));
a* a;
}
AcResult
acKernelDummy(void)
{
dummy_kernel<<<1, 1>>>();
ERRCHK_CUDA_KERNEL_ALWAYS();
return AC_SUCCESS;
}


@@ -1,42 +1,259 @@
/*
Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.
This file is part of Astaroth.
Astaroth is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Astaroth is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* @file
* \brief Brief info.
*
* Detailed info.
*
*/
#pragma once
static_assert(NUM_VTXBUF_HANDLES > 0, "ERROR: At least one uniform ScalarField must be declared.");
// Device info
#define REGISTERS_PER_THREAD (255)
#define MAX_REGISTERS_PER_BLOCK (65536)
#define MAX_THREADS_PER_BLOCK (1024)
#define WARP_SIZE (32)
#define make_int3(a, b, c) \
(int3) { (int)a, (int)b, (int)c }
template <int step_number>
static __device__ __forceinline__ AcReal
rk3_integrate(const AcReal state_previous, const AcReal state_current, const AcReal rate_of_change,
const AcReal dt)
{
// Williamson (1980)
const AcReal alpha[] = {0, AcReal(.0), AcReal(-5. / 9.), AcReal(-153. / 128.)};
const AcReal beta[] = {0, AcReal(1. / 3.), AcReal(15. / 16.), AcReal(8. / 15.)};
// Note the indexing: the arrays are padded with a leading entry so that no
// beta[step_number - 1] access appears; the unpadded form triggers a spurious
// "out-of-bounds" warning for step_number == 0 even though that branch is never
// taken
switch (step_number) {
case 0:
return state_current + beta[step_number + 1] * rate_of_change * dt;
case 1: // Fallthrough
case 2:
return state_current +
beta[step_number + 1] * (alpha[step_number + 1] * (AcReal(1.) / beta[step_number]) *
(state_current - state_previous) +
rate_of_change * dt);
default:
return NAN;
}
}
template <int step_number>
static __device__ __forceinline__ AcReal3
rk3_integrate(const AcReal3 state_previous, const AcReal3 state_current,
const AcReal3 rate_of_change, const AcReal dt)
{
return (AcReal3){
rk3_integrate<step_number>(state_previous.x, state_current.x, rate_of_change.x, dt),
rk3_integrate<step_number>(state_previous.y, state_current.y, rate_of_change.y, dt),
rk3_integrate<step_number>(state_previous.z, state_current.z, rate_of_change.z, dt)};
}
#define rk3(state_previous, state_current, rate_of_change, dt) \
rk3_integrate<step_number>(state_previous, value(state_current), rate_of_change, dt)
static __device__ void
write(AcReal* __restrict__ out[], const int handle, const int idx, const AcReal value)
{
out[handle][idx] = value;
}
static __device__ __forceinline__ void
write(AcReal* __restrict__ out[], const int3 vec, const int idx, const AcReal3 value)
{
write(out, vec.x, idx, value.x);
write(out, vec.y, idx, value.y);
write(out, vec.z, idx, value.z);
}
static __device__ __forceinline__ AcReal
read_out(const int idx, AcReal* __restrict__ field[], const int handle)
{
return field[handle][idx];
}
static __device__ __forceinline__ AcReal3
read_out(const int idx, AcReal* __restrict__ field[], const int3 handle)
{
return (AcReal3){read_out(idx, field, handle.x), read_out(idx, field, handle.y),
read_out(idx, field, handle.z)};
}
#define WRITE_OUT(handle, value) (write(buffer.out, handle, idx, value))
#define READ(handle) (read_data(vertexIdx, globalVertexIdx, buffer.in, handle))
#define READ_OUT(handle) (read_out(idx, buffer.out, handle))
#define GEN_PREPROCESSED_PARAM_BOILERPLATE const int3 &vertexIdx, const int3 &globalVertexIdx
#define GEN_KERNEL_PARAM_BOILERPLATE const int3 start, const int3 end, VertexBufferArray buffer
#define GEN_KERNEL_BUILTIN_VARIABLES_BOILERPLATE() \
const int3 vertexIdx = (int3){threadIdx.x + blockIdx.x * blockDim.x + start.x, \
threadIdx.y + blockIdx.y * blockDim.y + start.y, \
threadIdx.z + blockIdx.z * blockDim.z + start.z}; \
const int3 globalVertexIdx = (int3){d_multigpu_offset.x + vertexIdx.x, \
d_multigpu_offset.y + vertexIdx.y, \
d_multigpu_offset.z + vertexIdx.z}; \
(void)globalVertexIdx; \
if (vertexIdx.x >= end.x || vertexIdx.y >= end.y || vertexIdx.z >= end.z) \
return; \
\
assert(vertexIdx.x < DCONST(AC_nx_max) && vertexIdx.y < DCONST(AC_ny_max) && \
vertexIdx.z < DCONST(AC_nz_max)); \
\
assert(vertexIdx.x >= DCONST(AC_nx_min) && vertexIdx.y >= DCONST(AC_ny_min) && \
vertexIdx.z >= DCONST(AC_nz_min)); \
\
const int idx = IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z);
#define GEN_DEVICE_FUNC_HOOK(identifier) \
template <int step_number> \
AcResult acDeviceKernel_##identifier(const cudaStream_t stream, const int3 start, \
const int3 end, VertexBufferArray vba) \
{ \
\
const dim3 tpb(32, 1, 4); \
\
const int3 n = end - start; \
const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), \
(unsigned int)ceil(n.y / AcReal(tpb.y)), \
(unsigned int)ceil(n.z / AcReal(tpb.z))); \
\
identifier<step_number><<<bpg, tpb, 0, stream>>>(start, end, vba); \
ERRCHK_CUDA_KERNEL(); \
\
return AC_SUCCESS; \
}
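// E.g. a DSL kernel 'solve' declared in user_kernels.h is expected to get a
// host-callable wrapper acDeviceKernel_solve<step_number>(stream, start, end, vba)
// through this hook (the kernel name is illustrative).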
#include "user_kernels.h"
static dim3 rk3_tpb(32, 1, 4);
AcResult
acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferArray vba)
{
// RK3
dim3 best_dims(0, 0, 0);
float best_time = INFINITY;
const int num_iterations = 10;
for (int z = 1; z <= MAX_THREADS_PER_BLOCK; ++z) {
for (int y = 1; y <= MAX_THREADS_PER_BLOCK; ++y) {
for (int x = WARP_SIZE; x <= MAX_THREADS_PER_BLOCK; x += WARP_SIZE) {
if (x > end.x - start.x || y > end.y - start.y || z > end.z - start.z)
break;
if (x * y * z > MAX_THREADS_PER_BLOCK)
break;
if (x * y * z * REGISTERS_PER_THREAD > MAX_REGISTERS_PER_BLOCK)
break;
if (((x * y * z) % WARP_SIZE) != 0)
continue;
const dim3 tpb(x, y, z);
const int3 n = end - start;
const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), //
(unsigned int)ceil(n.y / AcReal(tpb.y)), //
(unsigned int)ceil(n.z / AcReal(tpb.z)));
cudaDeviceSynchronize();
if (cudaGetLastError() != cudaSuccess) // resets the error if any
continue;
// printf("(%d, %d, %d)\n", x, y, z);
cudaEvent_t tstart, tstop;
cudaEventCreate(&tstart);
cudaEventCreate(&tstop);
// #ifdef AC_dt
// acDeviceLoadScalarUniform(device, STREAM_DEFAULT, AC_dt, FLT_EPSILON); // TODO
// note, temporarily disabled
/*#else
ERROR("FATAL ERROR: acDeviceAutoOptimize() or
acDeviceIntegrateSubstep() was " "called, but AC_dt was not defined. Either define
it or call the generated " "device function acDeviceKernel_<kernel name> which does
not require the " "timestep to be defined.\n"); #endif*/
cudaEventRecord(tstart); // ---------------------------------------- Timing start
for (int i = 0; i < num_iterations; ++i)
solve<2><<<bpg, tpb>>>(start, end, vba);
cudaEventRecord(tstop); // ----------------------------------------- Timing end
cudaEventSynchronize(tstop);
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, tstart, tstop);
ERRCHK_CUDA_KERNEL_ALWAYS();
if (milliseconds < best_time) {
best_time = milliseconds;
best_dims = tpb;
}
}
}
}
#if VERBOSE_PRINTING
printf(
"Auto-optimization done. The best threadblock dimensions for rkStep: (%d, %d, %d) %f ms\n",
best_dims.x, best_dims.y, best_dims.z, double(best_time) / num_iterations);
#endif
/*
FILE* fp = fopen("../config/rk3_tbdims.cuh", "w");
ERRCHK(fp);
fprintf(fp, "%d, %d, %d\n", best_dims.x, best_dims.y, best_dims.z);
fclose(fp);
*/
rk3_tpb = best_dims;
return AC_SUCCESS;
}
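// Note on the search space above: a candidate must satisfy x * y * z <= 1024
// (MAX_THREADS_PER_BLOCK) and x * y * z * 255 <= 65536 (the register budget),
// so at most 256 threads per block are ever timed, in warp-sized multiples.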
AcResult
acKernelIntegrateSubstep(const cudaStream_t stream, const int step_number, const int3 start,
const int3 end, VertexBufferArray vba)
{
const dim3 tpb = rk3_tpb;
const int3 n = end - start;
const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), //
(unsigned int)ceil(n.y / AcReal(tpb.y)), //
(unsigned int)ceil(n.z / AcReal(tpb.z)));
//#ifdef AC_dt
// acDeviceLoadScalarUniform(device, stream, AC_dt, dt);
/*#else
(void)dt;
ERROR("FATAL ERROR: acDeviceAutoOptimize() or acDeviceIntegrateSubstep() was "
"called, but AC_dt was not defined. Either define it or call the generated "
"device function acDeviceKernel_<kernel name> which does not require the "
"timestep to be defined.\n");
#endif*/
if (step_number == 0)
solve<0><<<bpg, tpb, 0, stream>>>(start, end, vba);
else if (step_number == 1)
solve<1><<<bpg, tpb, 0, stream>>>(start, end, vba);
else
solve<2><<<bpg, tpb, 0, stream>>>(start, end, vba);
ERRCHK_CUDA_KERNEL();
return AC_SUCCESS;
}
static __global__ void
dummy_kernel(void)
{
DCONST((AcIntParam)0);
DCONST((AcInt3Param)0);
DCONST((AcRealParam)0);
DCONST((AcReal3Param)0);
acComplex a = exp(AcReal(1) * acComplex(1, 1) * AcReal(1));
a* a;
}
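// (Presumably the body above exists only to force instantiation of each DCONST
// overload and the acComplex operators; 'a* a' is a deliberate, otherwise
// meaningless use of operator*.)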
AcResult
acKernelDummy(void)
{
dummy_kernel<<<1, 1>>>();
ERRCHK_CUDA_KERNEL_ALWAYS();
return AC_SUCCESS;
}

src/core/kernels/kernels.cu (new file)

@@ -0,0 +1,177 @@
#include "kernels.h"
#include <assert.h>
#include <cuComplex.h>
#include "errchk.h"
#include "math_utils.h"
__device__ __constant__ AcMeshInfo d_mesh_info;
static int __device__ __forceinline__
DCONST(const AcIntParam param)
{
return d_mesh_info.int_params[param];
}
static int3 __device__ __forceinline__
DCONST(const AcInt3Param param)
{
return d_mesh_info.int3_params[param];
}
static AcReal __device__ __forceinline__
DCONST(const AcRealParam param)
{
return d_mesh_info.real_params[param];
}
static AcReal3 __device__ __forceinline__
DCONST(const AcReal3Param param)
{
return d_mesh_info.real3_params[param];
}
static __device__ constexpr VertexBufferHandle
DCONST(const VertexBufferHandle handle)
{
return handle;
}
#define DEVICE_VTXBUF_IDX(i, j, k) ((i) + (j)*DCONST(AC_mx) + (k)*DCONST(AC_mxy))
#define DEVICE_1D_COMPDOMAIN_IDX(i, j, k) ((i) + (j)*DCONST(AC_nx) + (k)*DCONST(AC_nxy))
#define globalGridN (d_mesh_info.int3_params[AC_global_grid_n])
//#define globalMeshM // Placeholder
//#define localMeshN // Placeholder
//#define localMeshM // Placeholder
//#define localMeshN_min // Placeholder
//#define globalMeshN_min // Placeholder
#define d_multigpu_offset (d_mesh_info.int3_params[AC_multigpu_offset])
//#define d_multinode_offset (d_mesh_info.int3_params[AC_multinode_offset]) // Placeholder
static __device__ constexpr int
IDX(const int i)
{
return i;
}
static __device__ __forceinline__ int
IDX(const int i, const int j, const int k)
{
return DEVICE_VTXBUF_IDX(i, j, k);
}
static __device__ __forceinline__ int
IDX(const int3 idx)
{
return DEVICE_VTXBUF_IDX(idx.x, idx.y, idx.z);
}
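// Index layout example: DEVICE_VTXBUF_IDX walks x fastest, then y, then z.
// With hypothetical pad sizes AC_mx = 8 and AC_mxy = 64,
// IDX(1, 2, 3) = 1 + 2 * 8 + 3 * 64 = 209.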
#if AC_DOUBLE_PRECISION == 1
typedef cuDoubleComplex acComplex;
#define acComplex(x, y) make_cuDoubleComplex(x, y)
#else
typedef cuFloatComplex acComplex;
#define acComplex(x, y) make_cuFloatComplex(x, y)
#define exp(x) expf(x)
#define sin(x) sinf(x)
#define cos(x) cosf(x)
#define sqrt(x) sqrtf(x)
#endif
static __device__ inline acComplex
exp(const acComplex& val)
{
return acComplex(exp(val.x) * cos(val.y), exp(val.x) * sin(val.y));
}
static __device__ inline acComplex operator*(const AcReal& a, const acComplex& b)
{
return (acComplex){a * b.x, a * b.y};
}
static __device__ inline acComplex operator*(const acComplex& b, const AcReal& a)
{
return (acComplex){a * b.x, a * b.y};
}
static __device__ inline acComplex operator*(const acComplex& a, const acComplex& b)
{
return (acComplex){a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x};
}
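// Note: exp(z) above is Euler's formula, exp(x + iy) = exp(x) * (cos(y) + i sin(y)),
// and the operator* overloads implement complex scaling and the standard complex
// product (a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x) for both precisions.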
// Kernels /////////////////////////////////////////////////////////////////////
#include "boundconds.cuh"
#include "integration.cuh"
#include "packing.cuh"
#include "reductions.cuh"
AcResult
acDeviceLoadMeshInfo(const Device device, const Stream stream, const AcMeshInfo device_config)
{
cudaSetDevice(device->id);
ERRCHK_ALWAYS(device_config.int_params[AC_nx] == device->local_config.int_params[AC_nx]);
ERRCHK_ALWAYS(device_config.int_params[AC_ny] == device->local_config.int_params[AC_ny]);
ERRCHK_ALWAYS(device_config.int_params[AC_nz] == device->local_config.int_params[AC_nz]);
ERRCHK_ALWAYS(device_config.int_params[AC_multigpu_offset] ==
device->local_config.int_params[AC_multigpu_offset]);
ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbolAsync(d_mesh_info, &device_config, sizeof(device_config),
0, cudaMemcpyHostToDevice, device->streams[stream]));
return AC_SUCCESS;
}
AcResult
acDeviceLoadScalarUniform(const Device device, const Stream stream, const AcRealParam param,
const AcReal value)
{
cudaSetDevice(device->id);
if (param >= NUM_REAL_PARAMS)
return AC_FAILURE;
const size_t offset = (size_t)&d_mesh_info.real_params[param] - (size_t)&d_mesh_info;
ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
cudaMemcpyHostToDevice, device->streams[stream]));
return AC_SUCCESS;
}
AcResult
acDeviceLoadVectorUniform(const Device device, const Stream stream, const AcReal3Param param,
const AcReal3 value)
{
cudaSetDevice(device->id);
if (param >= NUM_REAL3_PARAMS || !NUM_REAL3_PARAMS)
return AC_FAILURE;
const size_t offset = (size_t)&d_mesh_info.real3_params[param] - (size_t)&d_mesh_info;
ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
cudaMemcpyHostToDevice, device->streams[stream]));
return AC_SUCCESS;
}
AcResult
acDeviceLoadIntUniform(const Device device, const Stream stream, const AcIntParam param,
const int value)
{
cudaSetDevice(device->id);
if (param >= NUM_INT_PARAMS)
return AC_FAILURE;
const size_t offset = (size_t)&d_mesh_info.int_params[param] - (size_t)&d_mesh_info;
ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
cudaMemcpyHostToDevice, device->streams[stream]));
return AC_SUCCESS;
}
AcResult
acDeviceLoadInt3Uniform(const Device device, const Stream stream, const AcInt3Param param,
const int3 value)
{
cudaSetDevice(device->id);
if (param >= NUM_INT3_PARAMS)
return AC_FAILURE;
const size_t offset = (size_t)&d_mesh_info.int3_params[param] - (size_t)&d_mesh_info;
ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
cudaMemcpyHostToDevice, device->streams[stream]));
return AC_SUCCESS;
}
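// Usage sketch (parameter values are illustrative; Device and Stream come from
// the core API):
//   acDeviceLoadScalarUniform(device, STREAM_DEFAULT, AC_dt, AcReal(1e-3));
//   acDeviceLoadInt3Uniform(device, STREAM_DEFAULT, AC_multigpu_offset, (int3){0, 0, 0});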


@@ -0,0 +1,82 @@
#pragma once
#include "astaroth.h"
typedef struct {
int3 dims;
AcReal* data;
} PackedData;
typedef struct {
AcReal* in[NUM_VTXBUF_HANDLES];
AcReal* out[NUM_VTXBUF_HANDLES];
AcReal* profiles[NUM_SCALARARRAY_HANDLES];
} VertexBufferArray;
struct device_s {
int id;
AcMeshInfo local_config;
// Concurrency
cudaStream_t streams[NUM_STREAMS];
// Memory
VertexBufferArray vba;
AcReal* reduce_scratchpad;
AcReal* reduce_result;
};
#ifdef __cplusplus
extern "C" {
#endif
/** */
AcResult acKernelPeriodicBoundconds(const cudaStream_t stream, const int3 start, const int3 end,
AcReal* vtxbuf);
/** */
AcResult acKernelDummy(void);
/** */
AcResult acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferArray vba);
/** */
AcResult acKernelIntegrateSubstep(const cudaStream_t stream, const int step_number,
const int3 start, const int3 end, VertexBufferArray vba);
/** */
AcResult acKernelPackData(const cudaStream_t stream, const VertexBufferArray vba,
const int3 vba_start, PackedData packed);
/** */
AcResult acKernelUnpackData(const cudaStream_t stream, const PackedData packed,
const int3 vba_start, VertexBufferArray vba);
/** */
AcReal acKernelReduceScal(const cudaStream_t stream, const ReductionType rtype, const int3 start,
const int3 end, const AcReal* vtxbuf, AcReal* scratchpad,
AcReal* reduce_result);
/** */
AcReal acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, const int3 start,
const int3 end, const AcReal* vtxbuf0, const AcReal* vtxbuf1,
const AcReal* vtxbuf2, AcReal* scratchpad, AcReal* reduce_result);
AcResult acDeviceLoadMeshInfo(const Device device, const Stream stream,
const AcMeshInfo device_config);
AcResult acDeviceLoadScalarUniform(const Device device, const Stream stream,
const AcRealParam param, const AcReal value);
AcResult acDeviceLoadVectorUniform(const Device device, const Stream stream,
const AcReal3Param param, const AcReal3 value);
AcResult acDeviceLoadIntUniform(const Device device, const Stream stream, const AcIntParam param,
const int value);
AcResult acDeviceLoadInt3Uniform(const Device device, const Stream stream, const AcInt3Param param,
const int3 value);
#ifdef __cplusplus
} // extern "C"
#endif
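/* Note (added; not in the original header): PackedData.data is assumed to hold
dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES AcReal elements, matching the
vtxbuf_offset stride the packing kernels use to store one contiguous block per
vertex buffer. */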


@@ -1,120 +0,0 @@
/*
Copyright (C) 2014-2020, Johannes Pekkilae, Miikka Vaeisalae.
This file is part of Astaroth.
Astaroth is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Astaroth is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* @file
* \brief Brief info.
*
* Detailed info.
*
*/
#include "packing.cuh"
#include "src/core/errchk.h"
__global__ void
kernel_pack_data(const VertexBufferArray vba, const int3 vba_start, PackedData packed)
{
const int i_packed = threadIdx.x + blockIdx.x * blockDim.x;
const int j_packed = threadIdx.y + blockIdx.y * blockDim.y;
const int k_packed = threadIdx.z + blockIdx.z * blockDim.z;
// If within the start-end range (this allows threadblock dims that are not
// divisible by end - start)
if (i_packed >= packed.dims.x || //
j_packed >= packed.dims.y || //
k_packed >= packed.dims.z) {
return;
}
const int i_unpacked = i_packed + vba_start.x;
const int j_unpacked = j_packed + vba_start.y;
const int k_unpacked = k_packed + vba_start.z;
const int unpacked_idx = DEVICE_VTXBUF_IDX(i_unpacked, j_unpacked, k_unpacked);
const int packed_idx = i_packed + //
j_packed * packed.dims.x + //
k_packed * packed.dims.x * packed.dims.y;
const size_t vtxbuf_offset = packed.dims.x * packed.dims.y * packed.dims.z;
//#pragma unroll
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i)
packed.data[packed_idx + i * vtxbuf_offset] = vba.in[i][unpacked_idx];
}
__global__ void
kernel_unpack_data(const PackedData packed, const int3 vba_start, VertexBufferArray vba)
{
const int i_packed = threadIdx.x + blockIdx.x * blockDim.x;
const int j_packed = threadIdx.y + blockIdx.y * blockDim.y;
const int k_packed = threadIdx.z + blockIdx.z * blockDim.z;
// If within the start-end range (this allows threadblock dims that are not
// divisible by end - start)
if (i_packed >= packed.dims.x || //
j_packed >= packed.dims.y || //
k_packed >= packed.dims.z) {
return;
}
const int i_unpacked = i_packed + vba_start.x;
const int j_unpacked = j_packed + vba_start.y;
const int k_unpacked = k_packed + vba_start.z;
const int unpacked_idx = DEVICE_VTXBUF_IDX(i_unpacked, j_unpacked, k_unpacked);
const int packed_idx = i_packed + //
j_packed * packed.dims.x + //
k_packed * packed.dims.x * packed.dims.y;
const size_t vtxbuf_offset = packed.dims.x * packed.dims.y * packed.dims.z;
//#pragma unroll
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i)
vba.in[i][unpacked_idx] = packed.data[packed_idx + i * vtxbuf_offset];
}
AcResult
acKernelPackData(const cudaStream_t stream, const VertexBufferArray vba, const int3 vba_start,
PackedData packed)
{
const dim3 tpb(32, 8, 1);
const dim3 bpg((unsigned int)ceil(packed.dims.x / (float)tpb.x),
(unsigned int)ceil(packed.dims.y / (float)tpb.y),
(unsigned int)ceil(packed.dims.z / (float)tpb.z));
kernel_pack_data<<<bpg, tpb, 0, stream>>>(vba, vba_start, packed);
ERRCHK_CUDA_KERNEL();
return AC_SUCCESS;
}
AcResult
acKernelUnpackData(const cudaStream_t stream, const PackedData packed, const int3 vba_start,
VertexBufferArray vba)
{
const dim3 tpb(32, 8, 1);
const dim3 bpg((unsigned int)ceil(packed.dims.x / (float)tpb.x),
(unsigned int)ceil(packed.dims.y / (float)tpb.y),
(unsigned int)ceil(packed.dims.z / (float)tpb.z));
kernel_unpack_data<<<bpg, tpb, 0, stream>>>(packed, vba_start, vba);
ERRCHK_CUDA_KERNEL();
return AC_SUCCESS;
}


@@ -1,40 +1,92 @@
/*
Copyright (C) 2014-2020, Johannes Pekkilae, Miikka Vaeisalae.
This file is part of Astaroth.
Astaroth is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Astaroth is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* @file
* \brief Brief info.
*
* Detailed info.
*
*/
#pragma once
#include "astaroth.h"
#include "common.cuh"
static __global__ void
kernel_pack_data(const VertexBufferArray vba, const int3 vba_start, PackedData packed)
{
const int i_packed = threadIdx.x + blockIdx.x * blockDim.x;
const int j_packed = threadIdx.y + blockIdx.y * blockDim.y;
const int k_packed = threadIdx.z + blockIdx.z * blockDim.z;
// If within the start-end range (this allows threadblock dims that are not
// divisible by end - start)
if (i_packed >= packed.dims.x || //
j_packed >= packed.dims.y || //
k_packed >= packed.dims.z) {
return;
}
const int i_unpacked = i_packed + vba_start.x;
const int j_unpacked = j_packed + vba_start.y;
const int k_unpacked = k_packed + vba_start.z;
const int unpacked_idx = DEVICE_VTXBUF_IDX(i_unpacked, j_unpacked, k_unpacked);
const int packed_idx = i_packed + //
j_packed * packed.dims.x + //
k_packed * packed.dims.x * packed.dims.y;
const size_t vtxbuf_offset = packed.dims.x * packed.dims.y * packed.dims.z;
//#pragma unroll
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i)
packed.data[packed_idx + i * vtxbuf_offset] = vba.in[i][unpacked_idx];
}
static __global__ void
kernel_unpack_data(const PackedData packed, const int3 vba_start, VertexBufferArray vba)
{
const int i_packed = threadIdx.x + blockIdx.x * blockDim.x;
const int j_packed = threadIdx.y + blockIdx.y * blockDim.y;
const int k_packed = threadIdx.z + blockIdx.z * blockDim.z;
// If within the start-end range (this allows threadblock dims that are not
// divisible by end - start)
if (i_packed >= packed.dims.x || //
j_packed >= packed.dims.y || //
k_packed >= packed.dims.z) {
return;
}
const int i_unpacked = i_packed + vba_start.x;
const int j_unpacked = j_packed + vba_start.y;
const int k_unpacked = k_packed + vba_start.z;
const int unpacked_idx = DEVICE_VTXBUF_IDX(i_unpacked, j_unpacked, k_unpacked);
const int packed_idx = i_packed + //
j_packed * packed.dims.x + //
k_packed * packed.dims.x * packed.dims.y;
const size_t vtxbuf_offset = packed.dims.x * packed.dims.y * packed.dims.z;
//#pragma unroll
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i)
vba.in[i][unpacked_idx] = packed.data[packed_idx + i * vtxbuf_offset];
}
AcResult
acKernelPackData(const cudaStream_t stream, const VertexBufferArray vba, const int3 vba_start,
PackedData packed)
{
const dim3 tpb(32, 8, 1);
const dim3 bpg((unsigned int)ceil(packed.dims.x / (float)tpb.x),
(unsigned int)ceil(packed.dims.y / (float)tpb.y),
(unsigned int)ceil(packed.dims.z / (float)tpb.z));
kernel_pack_data<<<bpg, tpb, 0, stream>>>(vba, vba_start, packed);
ERRCHK_CUDA_KERNEL();
return AC_SUCCESS;
}
AcResult
acKernelUnpackData(const cudaStream_t stream, const PackedData packed, const int3 vba_start,
VertexBufferArray vba)
{
const dim3 tpb(32, 8, 1);
const dim3 bpg((unsigned int)ceil(packed.dims.x / (float)tpb.x),
(unsigned int)ceil(packed.dims.y / (float)tpb.y),
(unsigned int)ceil(packed.dims.z / (float)tpb.z));
kernel_unpack_data<<<bpg, tpb, 0, stream>>>(packed, vba_start, vba);
ERRCHK_CUDA_KERNEL();
return AC_SUCCESS;
}
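// Round-trip sketch (hypothetical halo block; d_buf must hold
// mx * my * nghost * NUM_VTXBUF_HANDLES reals):
//   PackedData halo = {(int3){mx, my, nghost}, d_buf};
//   acKernelPackData(stream, vba, (int3){0, 0, 0}, halo);   // mesh -> buffer
//   /* ...exchange halo.data, e.g. over MPI... */
//   acKernelUnpackData(stream, halo, (int3){0, 0, 0}, vba); // buffer -> mesh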


@@ -1,284 +0,0 @@
/*
Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.
This file is part of Astaroth.
Astaroth is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Astaroth is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* @file
* \brief Brief info.
*
* Detailed info.
*
*/
#include "common.cuh"
#include "reductions.cuh"
#include "src/core/errchk.h"
#include "src/core/math_utils.h" // is_power_of_two
/*
Reduction steps:
1 of 3: Compute the initial value (a, a*a or exp(a)*exp(a)) and put the result in scratchpad
2 of 3: Compute most of the reductions into a single block of data
3 of 3: After all results have been stored into the final block, reduce the data in the final block
*/
// Function pointer definitions
typedef AcReal (*FilterFunc)(const AcReal&);
typedef AcReal (*FilterFuncVec)(const AcReal&, const AcReal&, const AcReal&);
typedef AcReal (*ReduceFunc)(const AcReal&, const AcReal&);
// clang-format off
/* Comparison funcs */
static __device__ inline AcReal
dmax(const AcReal& a, const AcReal& b) { return a > b ? a : b; }
static __device__ inline AcReal
dmin(const AcReal& a, const AcReal& b) { return a < b ? a : b; }
static __device__ inline AcReal
dsum(const AcReal& a, const AcReal& b) { return a + b; }
/* Function used to determine the values used during reduction */
static __device__ inline AcReal
dvalue(const AcReal& a) { return AcReal(a); }
static __device__ inline AcReal
dsquared(const AcReal& a) { return (AcReal)(a*a); }
static __device__ inline AcReal
dexp_squared(const AcReal& a) { return exp(a)*exp(a); }
static __device__ inline AcReal
dlength_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return sqrt(a*a + b*b + c*c); }
static __device__ inline AcReal
dsquared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return dsquared(a) + dsquared(b) + dsquared(c); }
static __device__ inline AcReal
dexp_squared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return dexp_squared(a) + dexp_squared(b) + dexp_squared(c); }
// clang-format on
#include <assert.h>
template <FilterFunc filter>
__global__ void
kernel_filter(const __restrict__ AcReal* src, const int3 start, const int3 end, AcReal* dst)
{
const int3 src_idx = (int3){start.x + threadIdx.x + blockIdx.x * blockDim.x,
start.y + threadIdx.y + blockIdx.y * blockDim.y,
start.z + threadIdx.z + blockIdx.z * blockDim.z};
const int nx = end.x - start.x;
const int ny = end.y - start.y;
const int nz = end.z - start.z;
(void)nz; // Suppressed unused variable warning when not compiling with debug flags
const int3 dst_idx = (int3){threadIdx.x + blockIdx.x * blockDim.x,
threadIdx.y + blockIdx.y * blockDim.y,
threadIdx.z + blockIdx.z * blockDim.z};
assert(src_idx.x < DCONST(AC_nx_max) && src_idx.y < DCONST(AC_ny_max) &&
src_idx.z < DCONST(AC_nz_max));
assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);
dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(src[IDX(src_idx)]);
}
template <FilterFuncVec filter>
__global__ void
kernel_filter_vec(const __restrict__ AcReal* src0, const __restrict__ AcReal* src1,
const __restrict__ AcReal* src2, const int3 start, const int3 end, AcReal* dst)
{
const int3 src_idx = (int3){start.x + threadIdx.x + blockIdx.x * blockDim.x,
start.y + threadIdx.y + blockIdx.y * blockDim.y,
start.z + threadIdx.z + blockIdx.z * blockDim.z};
const int nx = end.x - start.x;
const int ny = end.y - start.y;
const int nz = end.z - start.z;
(void)nz; // Suppressed unused variable warning when not compiling with debug flags
const int3 dst_idx = (int3){threadIdx.x + blockIdx.x * blockDim.x,
threadIdx.y + blockIdx.y * blockDim.y,
threadIdx.z + blockIdx.z * blockDim.z};
assert(src_idx.x < DCONST(AC_nx_max) && src_idx.y < DCONST(AC_ny_max) &&
src_idx.z < DCONST(AC_nz_max));
assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);
dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(
src0[IDX(src_idx)], src1[IDX(src_idx)], src2[IDX(src_idx)]);
}
template <ReduceFunc reduce>
__global__ void
kernel_reduce(AcReal* scratchpad, const int num_elems)
{
const int idx = threadIdx.x + blockIdx.x * blockDim.x;
extern __shared__ AcReal smem[];
if (idx < num_elems) {
smem[threadIdx.x] = scratchpad[idx];
}
else {
smem[threadIdx.x] = NAN;
}
__syncthreads();
int offset = blockDim.x / 2;
assert(offset % 2 == 0);
while (offset > 0) {
if (threadIdx.x < offset) {
smem[threadIdx.x] = reduce(smem[threadIdx.x], smem[threadIdx.x + offset]);
}
offset /= 2;
__syncthreads();
}
if (threadIdx.x == 0) {
scratchpad[idx] = smem[threadIdx.x];
}
}
template <ReduceFunc reduce>
__global__ void
kernel_reduce_block(const __restrict__ AcReal* scratchpad, const int num_blocks,
const int block_size, AcReal* result)
{
const int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx != 0) {
return;
}
AcReal res = scratchpad[0];
for (int i = 1; i < num_blocks; ++i) {
res = reduce(res, scratchpad[i * block_size]);
}
*result = res;
}
AcReal
acKernelReduceScal(const cudaStream_t stream, const ReductionType rtype, const int3 start,
const int3 end, const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result)
{
const unsigned nx = end.x - start.x;
const unsigned ny = end.y - start.y;
const unsigned nz = end.z - start.z;
const unsigned num_elems = nx * ny * nz;
const dim3 tpb_filter(32, 4, 1);
const dim3 bpg_filter((unsigned int)ceil(nx / AcReal(tpb_filter.x)),
(unsigned int)ceil(ny / AcReal(tpb_filter.y)),
(unsigned int)ceil(nz / AcReal(tpb_filter.z)));
const int tpb_reduce = 128;
const int bpg_reduce = num_elems / tpb_reduce;
ERRCHK(nx >= tpb_filter.x);
ERRCHK(ny >= tpb_filter.y);
ERRCHK(nz >= tpb_filter.z);
ERRCHK(tpb_reduce <= num_elems);
ERRCHK(nx * ny * nz % 2 == 0);
// clang-format off
if (rtype == RTYPE_MAX) {
kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
kernel_reduce_block<dmax><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
} else if (rtype == RTYPE_MIN) {
kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
kernel_reduce_block<dmin><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
} else if (rtype == RTYPE_RMS) {
kernel_filter<dsquared><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
} else if (rtype == RTYPE_RMS_EXP) {
kernel_filter<dexp_squared><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
} else if (rtype == RTYPE_SUM) {
kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
} else {
ERROR("Unrecognized rtype");
}
// clang-format on
cudaStreamSynchronize(stream);
AcReal result;
cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
return result;
}
AcReal
acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, const int3 start,
const int3 end, const AcReal* vtxbuf0, const AcReal* vtxbuf1,
const AcReal* vtxbuf2, AcReal* scratchpad, AcReal* reduce_result)
{
const unsigned nx = end.x - start.x;
const unsigned ny = end.y - start.y;
const unsigned nz = end.z - start.z;
const unsigned num_elems = nx * ny * nz;
const dim3 tpb_filter(32, 4, 1);
const dim3 bpg_filter((unsigned int)ceil(nx / AcReal(tpb_filter.x)),
(unsigned int)ceil(ny / AcReal(tpb_filter.y)),
(unsigned int)ceil(nz / AcReal(tpb_filter.z)));
const int tpb_reduce = 128;
const int bpg_reduce = num_elems / tpb_reduce;
ERRCHK(nx >= tpb_filter.x);
ERRCHK(ny >= tpb_filter.y);
ERRCHK(nz >= tpb_filter.z);
ERRCHK(tpb_reduce <= num_elems);
ERRCHK(nx * ny * nz % 2 == 0);
// clang-format off
if (rtype == RTYPE_MAX) {
kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
kernel_reduce_block<dmax><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
} else if (rtype == RTYPE_MIN) {
kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
kernel_reduce_block<dmin><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
} else if (rtype == RTYPE_RMS) {
kernel_filter_vec<dsquared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
} else if (rtype == RTYPE_RMS_EXP) {
kernel_filter_vec<dexp_squared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
} else if (rtype == RTYPE_SUM) {
kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
} else {
ERROR("Unrecognized rtype");
}
// clang-format on
cudaStreamSynchronize(stream);
AcReal result;
cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
return result;
}


@@ -1,44 +1,254 @@
#pragma once
/*
Reduction steps:
 1 of 3: Compute the initial value (a, a*a or exp(a)*exp(a)) and put the result in scratchpad
 2 of 3: Compute most of the reductions into a single block of data
 3 of 3: After all results have been stored into the final block, reduce the data in the final block
*/
// Function pointer definitions
typedef AcReal (*FilterFunc)(const AcReal&);
typedef AcReal (*FilterFuncVec)(const AcReal&, const AcReal&, const AcReal&);
typedef AcReal (*ReduceFunc)(const AcReal&, const AcReal&);
// clang-format off
/* Comparison funcs */
static __device__ inline AcReal
dmax(const AcReal& a, const AcReal& b) { return a > b ? a : b; }
static __device__ inline AcReal
dmin(const AcReal& a, const AcReal& b) { return a < b ? a : b; }
static __device__ inline AcReal
dsum(const AcReal& a, const AcReal& b) { return a + b; }
/* Function used to determine the values used during reduction */
static __device__ inline AcReal
dvalue(const AcReal& a) { return AcReal(a); }
static __device__ inline AcReal
dsquared(const AcReal& a) { return (AcReal)(a*a); }
static __device__ inline AcReal
dexp_squared(const AcReal& a) { return exp(a)*exp(a); }
static __device__ inline AcReal
dlength_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return sqrt(a*a + b*b + c*c); }
static __device__ inline AcReal
dsquared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return dsquared(a) + dsquared(b) + dsquared(c); }
static __device__ inline AcReal
dexp_squared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return dexp_squared(a) + dexp_squared(b) + dexp_squared(c); }
// clang-format on
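The kernels below take these device functions as non-type template parameters, so each instantiation (kernel_reduce<dmax>, kernel_reduce<dsum>, ...) bakes the operation in at compile time instead of dispatching through a runtime function pointer, which device code cannot do cheaply. A minimal host-side sketch of the same pattern (illustrative names, not part of the commit):

#include <cstdio>

typedef double (*ReduceOp)(const double&, const double&);

static inline double op_max(const double& a, const double& b) { return a > b ? a : b; }
static inline double op_sum(const double& a, const double& b) { return a + b; }

// 'op' is a compile-time constant, so each instantiation is a separate,
// fully inlined reduction, analogous to kernel_reduce<dmax> vs kernel_reduce<dsum>.
template <ReduceOp op>
static double
reduce_array(const double* data, int n)
{
    double acc = data[0];
    for (int i = 1; i < n; ++i)
        acc = op(acc, data[i]);
    return acc;
}

int main()
{
    const double xs[] = {3, 1, 4, 1, 5};
    printf("max: %g, sum: %g\n", reduce_array<op_max>(xs, 5), reduce_array<op_sum>(xs, 5));
    return 0;
}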
#include <assert.h>
template <FilterFunc filter>
static __global__ void
kernel_filter(const __restrict__ AcReal* src, const int3 start, const int3 end, AcReal* dst)
{
const int3 src_idx = (int3){start.x + threadIdx.x + blockIdx.x * blockDim.x,
start.y + threadIdx.y + blockIdx.y * blockDim.y,
start.z + threadIdx.z + blockIdx.z * blockDim.z};
const int nx = end.x - start.x;
const int ny = end.y - start.y;
const int nz = end.z - start.z;
(void)nz; // Suppresses the unused-variable warning when asserts are compiled out (NDEBUG)
const int3 dst_idx = (int3){threadIdx.x + blockIdx.x * blockDim.x,
threadIdx.y + blockIdx.y * blockDim.y,
threadIdx.z + blockIdx.z * blockDim.z};
assert(src_idx.x < DCONST(AC_nx_max) && src_idx.y < DCONST(AC_ny_max) &&
src_idx.z < DCONST(AC_nz_max));
assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);
dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(src[IDX(src_idx)]);
}
template <FilterFuncVec filter>
static __global__ void
kernel_filter_vec(const __restrict__ AcReal* src0, const __restrict__ AcReal* src1,
const __restrict__ AcReal* src2, const int3 start, const int3 end, AcReal* dst)
{
const int3 src_idx = (int3){start.x + threadIdx.x + blockIdx.x * blockDim.x,
start.y + threadIdx.y + blockIdx.y * blockDim.y,
start.z + threadIdx.z + blockIdx.z * blockDim.z};
const int nx = end.x - start.x;
const int ny = end.y - start.y;
const int nz = end.z - start.z;
(void)nz; // Suppresses the unused-variable warning when asserts are compiled out (NDEBUG)
const int3 dst_idx = (int3){threadIdx.x + blockIdx.x * blockDim.x,
threadIdx.y + blockIdx.y * blockDim.y,
threadIdx.z + blockIdx.z * blockDim.z};
assert(src_idx.x < DCONST(AC_nx_max) && src_idx.y < DCONST(AC_ny_max) &&
src_idx.z < DCONST(AC_nz_max));
assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);
dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(
src0[IDX(src_idx)], src1[IDX(src_idx)], src2[IDX(src_idx)]);
}
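Both filter kernels read from the halo-padded mesh (via IDX) and write a dense nx-by-ny-by-nz block at the front of scratchpad, applying the transform on the fly. A CPU reference of that index mapping, under the assumption that IDX is a plain row-major index over the padded dimensions mx, my (an assumption for illustration, not taken from this commit):

// CPU reference of the filter/compaction stage. Assumes the padded mesh is
// indexed row-major as i + j*mx + k*mx*my, mirroring what IDX() is expected
// to compute; 'f' stands in for the FilterFunc template parameter.
static void
filter_reference(const double* src, int mx, int my,
                 int sx, int sy, int sz, // start (inclusive)
                 int ex, int ey, int ez, // end (exclusive)
                 double (*f)(double), double* dst)
{
    const int nx = ex - sx, ny = ey - sy;
    for (int k = sz; k < ez; ++k)
        for (int j = sy; j < ey; ++j)
            for (int i = sx; i < ex; ++i) {
                const int src_idx = i + j * mx + k * mx * my;                      // padded mesh
                const int dst_idx = (i - sx) + (j - sy) * nx + (k - sz) * nx * ny; // dense block
                dst[dst_idx] = f(src[src_idx]);
            }
}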
template <ReduceFunc reduce>
static __global__ void
kernel_reduce(AcReal* scratchpad, const int num_elems)
{
const int idx = threadIdx.x + blockIdx.x * blockDim.x;
extern __shared__ AcReal smem[];
if (idx < num_elems) {
smem[threadIdx.x] = scratchpad[idx];
}
else {
smem[threadIdx.x] = NAN;
}
__syncthreads();
int offset = blockDim.x / 2;
assert(offset % 2 == 0);
while (offset > 0) {
if (threadIdx.x < offset) {
smem[threadIdx.x] = reduce(smem[threadIdx.x], smem[threadIdx.x + offset]);
}
offset /= 2;
__syncthreads();
}
if (threadIdx.x == 0) {
scratchpad[idx] = smem[threadIdx.x];
}
}
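kernel_reduce is a standard shared-memory tree reduction: each step folds the upper half of smem onto the lower half, so a 128-thread block finishes in log2(128) = 7 steps; the loop assumes blockDim.x is a power of two. Thread 0 then writes the block's partial result to scratchpad[blockIdx.x * blockDim.x], i.e. the partials end up strided by the block size. A host-side simulation of one 8-element block under dmax:

#include <cstdio>

int main()
{
    // Stands in for one block's shared memory after the load phase.
    double smem[8] = {5, 2, 7, 1, 3, 8, 6, 4};
    for (int offset = 8 / 2; offset > 0; offset /= 2) // offsets 4, 2, 1
        for (int tid = 0; tid < offset; ++tid)        // the "threads", in lock-step
            smem[tid] = smem[tid] > smem[tid + offset] ? smem[tid] : smem[tid + offset];
    printf("block result: %g\n", smem[0]);            // prints 8
    return 0;
}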
template <ReduceFunc reduce>
static __global__ void
kernel_reduce_block(const __restrict__ AcReal* scratchpad, const int num_blocks,
const int block_size, AcReal* result)
{
const int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx != 0) {
return;
}
AcReal res = scratchpad[0];
for (int i = 1; i < num_blocks; ++i) {
res = reduce(res, scratchpad[i * block_size]);
}
*result = res;
}
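Putting the three stages together, scratchpad is reused throughout: stage 1 fills indices 0..num_elems-1 with transformed values, stage 2 leaves one partial per block at index blockIdx.x * blockDim.x (the slots in between are stale), and stage 3 walks those strided slots serially. A worked layout with tpb_reduce = 4 and num_elems = 12 (so three blocks):

scratchpad after stage 2:
  index:  0  1  2  3 |  4  5  6  7 |  8  9 10 11
  value: P0  .  .  . | P1  .  .  . | P2  .  .  .   (Pi = partial of block i, '.' = stale)
stage 3 reads indices 0, 4, 8 (i * block_size) and folds P0, P1, P2 into *result.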
AcReal
acKernelReduceScal(const cudaStream_t stream, const ReductionType rtype, const int3 start,
const int3 end, const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result)
{
const unsigned nx = end.x - start.x;
const unsigned ny = end.y - start.y;
const unsigned nz = end.z - start.z;
const unsigned num_elems = nx * ny * nz;
const dim3 tpb_filter(32, 4, 1);
const dim3 bpg_filter((unsigned int)ceil(nx / AcReal(tpb_filter.x)),
(unsigned int)ceil(ny / AcReal(tpb_filter.y)),
(unsigned int)ceil(nz / AcReal(tpb_filter.z)));
const int tpb_reduce = 128;
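    // NOTE: integer division; stages 2-3 only cover bpg_reduce * tpb_reduce elements,
    // so num_elems is effectively assumed to be a multiple of tpb_reduce here.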
const int bpg_reduce = num_elems / tpb_reduce;
ERRCHK(nx >= tpb_filter.x);
ERRCHK(ny >= tpb_filter.y);
ERRCHK(nz >= tpb_filter.z);
ERRCHK(tpb_reduce <= num_elems);
ERRCHK(nx * ny * nz % 2 == 0);
// clang-format off
if (rtype == RTYPE_MAX) {
kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
kernel_reduce_block<dmax><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
} else if (rtype == RTYPE_MIN) {
kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
kernel_reduce_block<dmin><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
} else if (rtype == RTYPE_RMS) {
kernel_filter<dsquared><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
} else if (rtype == RTYPE_RMS_EXP) {
kernel_filter<dexp_squared><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
} else if (rtype == RTYPE_SUM) {
kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
} else {
ERROR("Unrecognized rtype");
}
// clang-format on
cudaStreamSynchronize(stream);
AcReal result;
cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
return result;
}
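A hypothetical host-side usage sketch. The buffer management and the NGHOST halo offset below are assumptions for illustration; in the library these buffers are presumably owned by the Device. Note that for RTYPE_RMS the kernel chain returns a plain sum of squares, so the caller would normalize afterwards:

// Hypothetical usage of acKernelReduceScal; names below are assumptions.
const int3 start     = (int3){NGHOST, NGHOST, NGHOST}; // assumed halo offset
const int3 end       = (int3){NGHOST + nx, NGHOST + ny, NGHOST + nz};
const size_t num_elems = (size_t)nx * ny * nz;

AcReal *scratchpad, *reduce_result;
cudaMalloc(&scratchpad, num_elems * sizeof(AcReal));   // one slot per interior cell
cudaMalloc(&reduce_result, sizeof(AcReal));

const AcReal ss  = acKernelReduceScal(0, RTYPE_RMS, start, end, vtxbuf,
                                      scratchpad, reduce_result);
const AcReal rms = sqrt(ss / (AcReal)num_elems);       // caller-side normalization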
AcReal
acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, const int3 start,
const int3 end, const AcReal* vtxbuf0, const AcReal* vtxbuf1,
const AcReal* vtxbuf2, AcReal* scratchpad, AcReal* reduce_result)
{
const unsigned nx = end.x - start.x;
const unsigned ny = end.y - start.y;
const unsigned nz = end.z - start.z;
const unsigned num_elems = nx * ny * nz;
const dim3 tpb_filter(32, 4, 1);
const dim3 bpg_filter((unsigned int)ceil(nx / AcReal(tpb_filter.x)),
(unsigned int)ceil(ny / AcReal(tpb_filter.y)),
(unsigned int)ceil(nz / AcReal(tpb_filter.z)));
const int tpb_reduce = 128;
const int bpg_reduce = num_elems / tpb_reduce;
ERRCHK(nx >= tpb_filter.x);
ERRCHK(ny >= tpb_filter.y);
ERRCHK(nz >= tpb_filter.z);
ERRCHK(tpb_reduce <= num_elems);
ERRCHK(nx * ny * nz % 2 == 0);
// clang-format off
if (rtype == RTYPE_MAX) {
kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
kernel_reduce_block<dmax><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
} else if (rtype == RTYPE_MIN) {
kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
kernel_reduce_block<dmin><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
} else if (rtype == RTYPE_RMS) {
kernel_filter_vec<dsquared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
} else if (rtype == RTYPE_RMS_EXP) {
kernel_filter_vec<dexp_squared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
} else if (rtype == RTYPE_SUM) {
kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
} else {
ERROR("Unrecognized rtype");
}
// clang-format on
cudaStreamSynchronize(stream);
AcReal result;
cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
return result;
}
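The vector variant differs only in the filter stage: RTYPE_MAX/MIN/SUM reduce over the pointwise magnitude sqrt(a*a + b*b + c*c), and RTYPE_RMS over the squared norm, so the caller-side normalization mirrors the scalar case (again an assumption about the calling convention, with hypothetical field names):

// Hypothetical vector RMS for a velocity field (uu_x, uu_y, uu_z):
const AcReal ss  = acKernelReduceVec(0, RTYPE_RMS, start, end,
                                     uu_x, uu_y, uu_z, scratchpad, reduce_result);
const AcReal rms = sqrt(ss / (AcReal)num_elems); // ss = sum of |uu|^2 over the domain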

View File

@@ -1,187 +0,0 @@
/*
Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.
This file is part of Astaroth.
Astaroth is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Astaroth is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* @file
* \brief Brief info.
*
* Detailed info.
*
*/
#pragma once
//#include <cmath>
// using namespace std; // Potentially bad practice to declare namespace std here
#include <math.h> // isnan, isinf // Overloads incorrect/bugged with GCC <= 6.0
//#include <tgmath.h> // Even this does not work
#include <stdlib.h> // rand
template <class T>
static inline const T
max(const T& a, const T& b)
{
return a > b ? a : b;
}
template <class T>
static inline const T
min(const T& a, const T& b)
{
return a < b ? a : b;
}
static inline const int3
max(const int3& a, const int3& b)
{
return (int3){max(a.x, b.x), max(a.y, b.y), max(a.z, b.z)};
}
static inline const int3
min(const int3& a, const int3& b)
{
return (int3){min(a.x, b.x), min(a.y, b.y), min(a.z, b.z)};
}
template <class T>
static inline const T
sum(const T& a, const T& b)
{
return a + b;
}
template <class T>
static inline const T
clamp(const T& val, const T& min, const T& max)
{
return val < min ? min : val > max ? max : val;
}
static inline AcReal
randr()
{
return AcReal(rand()) / AcReal(RAND_MAX);
}
static inline bool
is_power_of_two(const unsigned val)
{
return val && !(val & (val - 1));
}
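The bit trick: val & (val - 1) clears the lowest set bit, so the result is zero exactly when val has a single bit set; the leading 'val &&' rejects zero. Worked examples:

is_power_of_two(8): 8 = 1000b, 7 = 0111b, 1000b & 0111b = 0     -> true
is_power_of_two(6): 6 = 0110b, 5 = 0101b, 0110b & 0101b = 0100b -> false
is_power_of_two(0): short-circuits on 'val &&'                  -> false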
#ifdef __CUDACC__
#define HOST_DEVICE_INLINE __host__ __device__ __forceinline__
#else
#define HOST_DEVICE_INLINE inline
#endif // __CUDACC__
static HOST_DEVICE_INLINE AcReal3
operator+(const AcReal3& a, const AcReal3& b)
{
return (AcReal3){a.x + b.x, a.y + b.y, a.z + b.z};
}
static HOST_DEVICE_INLINE int3
operator+(const int3& a, const int3& b)
{
return (int3){a.x + b.x, a.y + b.y, a.z + b.z};
}
static HOST_DEVICE_INLINE int3 operator*(const int3& a, const int3& b)
{
return (int3){a.x * b.x, a.y * b.y, a.z * b.z};
}
static HOST_DEVICE_INLINE void
operator+=(AcReal3& lhs, const AcReal3& rhs)
{
lhs.x += rhs.x;
lhs.y += rhs.y;
lhs.z += rhs.z;
}
static HOST_DEVICE_INLINE AcReal3
operator-(const AcReal3& a, const AcReal3& b)
{
return (AcReal3){a.x - b.x, a.y - b.y, a.z - b.z};
}
static HOST_DEVICE_INLINE int3
operator-(const int3& a, const int3& b)
{
return (int3){a.x - b.x, a.y - b.y, a.z - b.z};
}
static HOST_DEVICE_INLINE AcReal3
operator-(const AcReal3& a)
{
return (AcReal3){-a.x, -a.y, -a.z};
}
static HOST_DEVICE_INLINE void
operator-=(AcReal3& lhs, const AcReal3& rhs)
{
lhs.x -= rhs.x;
lhs.y -= rhs.y;
lhs.z -= rhs.z;
}
static HOST_DEVICE_INLINE AcReal3 operator*(const AcReal& a, const AcReal3& b)
{
return (AcReal3){a * b.x, a * b.y, a * b.z};
}
static HOST_DEVICE_INLINE AcReal3 operator*(const AcReal3& b, const AcReal& a)
{
return (AcReal3){a * b.x, a * b.y, a * b.z};
}
static HOST_DEVICE_INLINE AcReal
dot(const AcReal3& a, const AcReal3& b)
{
return a.x * b.x + a.y * b.y + a.z * b.z;
}
static HOST_DEVICE_INLINE AcReal3
mul(const AcMatrix& aa, const AcReal3& x)
{
return (AcReal3){dot(aa.row[0], x), dot(aa.row[1], x), dot(aa.row[2], x)};
}
static HOST_DEVICE_INLINE AcReal3
cross(const AcReal3& a, const AcReal3& b)
{
AcReal3 c;
c.x = a.y * b.z - a.z * b.y;
c.y = a.z * b.x - a.x * b.z;
c.z = a.x * b.y - a.y * b.x;
return c;
}
static HOST_DEVICE_INLINE bool
is_valid(const AcReal& a)
{
return !isnan(a) && !isinf(a);
}
static HOST_DEVICE_INLINE bool
is_valid(const AcReal3& a)
{
return is_valid(a.x) && is_valid(a.y) && is_valid(a.z);
}

View File

@@ -123,12 +123,10 @@
db da
 *
 */
#include "astaroth_node.h"
#include "astaroth_device.h"
#include "errchk.h"
#include "kernels/common.cuh"
#include "math_utils.h" // sum for reductions

static const int MAX_NUM_DEVICES = 32;
db da
 *
 */
#include "astaroth.h"
#include "errchk.h"
#include "math_utils.h"

static const int MAX_NUM_DEVICES = 32;