Now nvcc is used to compile kernels only. All host code, incl. device.cc, MPI communication and others are now compiled with the host C++ compiler. This should work around an nvcc/MPI bug on Puhti.

2019-12-03 15:12:17 +02:00
parent 8bffb2a1d0
commit f14e35620c
14 changed files with 869 additions and 686 deletions
--- a/include/astaroth_defines.h
+++ b/include/astaroth_defines.h
@@ -26,7 +26,10 @@ extern "C" {
 #include <stdlib.h> // size_t
 //#include <vector_types.h> // CUDA vector types (float4, etc)

-#ifndef __CUDACC__
+//#ifndef __CUDACC__
+#if defined(AC_USE_CUDA_RUNTIME_API) || defined(__CUDACC__)
+#include <cuda_runtime_api.h>
+#else
 typedef struct {
    int x, y, z;
 } int3;
@@ -42,7 +45,8 @@ typedef struct {
 typedef struct {
    double x, y, z;
 } double3;
-#endif // __CUDACC__
+#endif
+//#endif // __CUDACC__

 // Library flags
 #define STENCIL_ORDER (6)
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -2,8 +2,13 @@
 ##  CMakeLists.txt for Astaroth Core  ##
 ########################################

+# Require C++11
+set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
 ## Find packages
 find_package(CUDA REQUIRED)
+set(CUDA_SEPARABLE_COMPILATION ON)
 # find_package(OpenMP REQUIRED)

 ## Architecture and optimization flags
@@ -23,8 +28,13 @@ set(CUDA_ARCH_FLAGS -gencode arch=compute_37,code=sm_37
 set(CUDA_WARNING_FLAGS --compiler-options -Wall,-Wextra,-Werror,-Wdouble-promotion,-Wfloat-conversion) # -Wshadow

 set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${CUDA_ARCH_FLAGS} ${CUDA_WARNING_FLAGS})
-set(CUDA_NVCC_FLAGS_RELEASE)
-set(CUDA_NVCC_FLAGS_DEBUG --device-debug --generate-line-info --ptxas-options=-v)
+
+# Workaround for CMake bug https://gitlab.kitware.com/cmake/cmake/issues/16411
+if (CMAKE_BUILD_TYPE_UPPER STREQUAL "RELEASE")
+    set(CUDA_NVCC_FLAGS_RELEASE -O3)
+elseif (CMAKE_BUILD_TYPE_UPPER STREQUAL "DEBUG")
+    set(CUDA_NVCC_FLAGS_DEBUG --device-debug --generate-line-info --ptxas-options=-v)
+endif()

 if (MPI_ENABLED)
    find_package(MPI REQUIRED)
@@ -36,10 +46,11 @@ if (MPI_ENABLED)
 endif ()

 ## Create and link the library
-set(CMAKE_POSITION_INDEPENDENT_CODE ON) # fpic for shared libraries
-cuda_add_library(astaroth_core STATIC astaroth.cu device.cu node.cu)
-target_include_directories(astaroth_core PRIVATE .)
-target_link_libraries(astaroth_core m)
+#set(CMAKE_POSITION_INDEPENDENT_CODE ON) # fpic for shared libraries
+cuda_add_library(astaroth_core STATIC astaroth.cc device.cc node.cc kernels/boundconds.cu kernels/integration.cu kernels/reductions.cu)
+target_include_directories(astaroth_core PRIVATE . ${CUDA_INCLUDE_DIRS})
+target_link_libraries(astaroth_core ${CUDA_LIBRARIES} m)
+target_compile_definitions(astaroth_core PRIVATE AC_USE_CUDA_RUNTIME_API)
 add_dependencies(astaroth_core dsl_headers)

 ## Definitions
--- a/src/core/astaroth.cc
+++ b/src/core/astaroth.cc
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -26,20 +26,10 @@
 */
 #include "astaroth_device.h"

+#include "math_utils.h"
 #include "errchk.h"

-// Device info
-#define REGISTERS_PER_THREAD (255)
-#define MAX_REGISTERS_PER_BLOCK (65536)
-#define MAX_THREADS_PER_BLOCK (1024)
-#define WARP_SIZE (32)
-
-typedef struct {
-    AcReal* in[NUM_VTXBUF_HANDLES];
-    AcReal* out[NUM_VTXBUF_HANDLES];
-
-    AcReal* profiles[NUM_SCALARARRAY_HANDLES];
-} VertexBufferArray;
+#include "kernels/common.cuh"

 struct device_s {
    int id;
@@ -63,94 +53,14 @@ struct device_s {
 #endif
 };

-__constant__ AcMeshInfo d_mesh_info;
-static int __device__ __forceinline__
-DCONST(const AcIntParam param)
-{
-    return d_mesh_info.int_params[param];
-}
-static int3 __device__ __forceinline__
-DCONST(const AcInt3Param param)
-{
-    return d_mesh_info.int3_params[param];
-}
-static AcReal __device__ __forceinline__
-DCONST(const AcRealParam param)
-{
-    return d_mesh_info.real_params[param];
-}
-static AcReal3 __device__ __forceinline__
-DCONST(const AcReal3Param param)
-{
-    return d_mesh_info.real3_params[param];
-}
-static __device__ constexpr VertexBufferHandle
-DCONST(const VertexBufferHandle handle)
-{
-    return handle;
-}
-#define DEVICE_VTXBUF_IDX(i, j, k) ((i) + (j)*DCONST(AC_mx) + (k)*DCONST(AC_mxy))
-#define DEVICE_1D_COMPDOMAIN_IDX(i, j, k) ((i) + (j)*DCONST(AC_nx) + (k)*DCONST(AC_nxy))
-#define globalGridN (d_mesh_info.int3_params[AC_global_grid_n])
-//#define globalMeshM // Placeholder
-//#define localMeshN // Placeholder
-//#define localMeshM // Placeholder
-//#define localMeshN_min // Placeholder
-//#define globalMeshN_min // Placeholder
-#define d_multigpu_offset (d_mesh_info.int3_params[AC_multigpu_offset])
-//#define d_multinode_offset (d_mesh_info.int3_params[AC_multinode_offset]) // Placeholder
-//#include <thrust/complex.h>
-// using namespace thrust;
-#include <cuComplex.h>
-#if AC_DOUBLE_PRECISION == 1
-typedef cuDoubleComplex acComplex;
-#define acComplex(x, y) make_cuDoubleComplex(x, y)
-#else
-typedef cuFloatComplex acComplex;
-#define acComplex(x, y) make_cuFloatComplex(x, y)
-#endif
-static __device__ inline acComplex
-exp(const acComplex& val)
-{
-    return acComplex(exp(val.x) * cos(val.y), exp(val.x) * sin(val.y));
-}
-static __device__ inline acComplex operator*(const AcReal& a, const acComplex& b)
-{
-    return (acComplex){a * b.x, a * b.y};
-}
-
-static __device__ inline acComplex operator*(const acComplex& b, const AcReal& a)
-{
-    return (acComplex){a * b.x, a * b.y};
-}
-
-static __device__ inline acComplex operator*(const acComplex& a, const acComplex& b)
-{
-    return (acComplex){a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x};
-}
-//#include <complex>
-
 #include "kernels/boundconds.cuh"
 #include "kernels/integration.cuh"
 #include "kernels/reductions.cuh"

-static dim3 rk3_tpb(32, 1, 4);
-
 #if PACKED_DATA_TRANSFERS // Defined in device.cuh
 // #include "kernels/pack_unpack.cuh"
 #endif

-static __global__ void
-dummy_kernel(void)
-{
-    DCONST((AcIntParam)0);
-    DCONST((AcInt3Param)0);
-    DCONST((AcRealParam)0);
-    DCONST((AcReal3Param)0);
-    acComplex a = exp(AcReal(1) * acComplex(1, 1) * AcReal(1));
-    a* a;
-}
-
 AcResult
 acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device_handle)
 {
@@ -171,8 +81,7 @@ acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device_hand
           "device supports the CUDA architecture you are compiling for.\n"
           "Running dummy kernel... ");
    fflush(stdout);
-    dummy_kernel<<<1, 1>>>();
-    ERRCHK_CUDA_KERNEL_ALWAYS();
+    acKernelDummy();
    printf("Success!\n");

    // Concurrency
@@ -184,21 +93,21 @@ acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device_hand
    // VBA in/out
    const size_t vba_size_bytes = acVertexBufferSizeBytes(device_config);
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
-        ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.in[i], vba_size_bytes));
-        ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.out[i], vba_size_bytes));
+        ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->vba.in[i], vba_size_bytes));
+        ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->vba.out[i], vba_size_bytes));
    }
    // VBA Profiles
    const size_t profile_size_bytes = sizeof(AcReal) * max(device_config.int_params[AC_mx],
                                                           max(device_config.int_params[AC_my],
                                                               device_config.int_params[AC_mz]));
    for (int i = 0; i < NUM_SCALARARRAY_HANDLES; ++i) {
-        ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.profiles[i], profile_size_bytes));
+        ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->vba.profiles[i], profile_size_bytes));
    }

    // Reductions
    ERRCHK_CUDA_ALWAYS(
-        cudaMalloc(&device->reduce_scratchpad, acVertexBufferCompdomainSizeBytes(device_config)));
-    ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->reduce_result, sizeof(AcReal)));
+        cudaMalloc((void**)&device->reduce_scratchpad, acVertexBufferCompdomainSizeBytes(device_config)));
+    ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->reduce_result, sizeof(AcReal)));

 #if AC_MPI_ENABLED
    // Allocate data required for packed transfers here (cudaMalloc)
@@ -206,8 +115,8 @@ acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device_hand
                                    device_config.int_params[AC_my] * NGHOST * NUM_VTXBUF_HANDLES *
                                    sizeof(AcReal);
    for (int i = 0; i < 2; ++i) {
-        ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->inner[i], block_size_bytes));
-        ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->outer[i], block_size_bytes));
+        ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->inner[i], block_size_bytes));
+        ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->outer[i], block_size_bytes));

        ERRCHK_CUDA_ALWAYS(cudaMallocHost(&device->inner_host[i], block_size_bytes));
        ERRCHK_CUDA_ALWAYS(cudaMallocHost(&device->outer_host[i], block_size_bytes));
@@ -332,87 +241,11 @@ AcResult
 acDeviceAutoOptimize(const Device device)
 {
    cudaSetDevice(device->id);
-
-    // RK3
    const int3 start = (int3){NGHOST, NGHOST, NGHOST};
-    const int3 end   = start + (int3){device->local_config.int_params[AC_nx], //
-                                    device->local_config.int_params[AC_ny], //
-                                    device->local_config.int_params[AC_nz]};
-
-    dim3 best_dims(0, 0, 0);
-    float best_time          = INFINITY;
-    const int num_iterations = 10;
-
-    for (int z = 1; z <= MAX_THREADS_PER_BLOCK; ++z) {
-        for (int y = 1; y <= MAX_THREADS_PER_BLOCK; ++y) {
-            for (int x = WARP_SIZE; x <= MAX_THREADS_PER_BLOCK; x += WARP_SIZE) {
-
-                if (x > end.x - start.x || y > end.y - start.y || z > end.z - start.z)
-                    break;
-                if (x * y * z > MAX_THREADS_PER_BLOCK)
-                    break;
-
-                if (x * y * z * REGISTERS_PER_THREAD > MAX_REGISTERS_PER_BLOCK)
-                    break;
-
-                if (((x * y * z) % WARP_SIZE) != 0)
-                    continue;
-
-                const dim3 tpb(x, y, z);
-                const int3 n = end - start;
-                const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), //
-                               (unsigned int)ceil(n.y / AcReal(tpb.y)), //
-                               (unsigned int)ceil(n.z / AcReal(tpb.z)));
-
-                cudaDeviceSynchronize();
-                if (cudaGetLastError() != cudaSuccess) // resets the error if any
-                    continue;
-
-                // printf("(%d, %d, %d)\n", x, y, z);
-
-                cudaEvent_t tstart, tstop;
-                cudaEventCreate(&tstart);
-                cudaEventCreate(&tstop);
-
-                // #ifdef AC_dt
-                acDeviceLoadScalarUniform(device, STREAM_DEFAULT, AC_dt, FLT_EPSILON);
-                /*#else
-                                ERROR("FATAL ERROR: acDeviceAutoOptimize() or
-                acDeviceIntegrateSubstep() was " "called, but AC_dt was not defined. Either define
-                it or call the generated " "device function acDeviceKernel_<kernel name> which does
-                not require the " "timestep to be defined.\n"); #endif*/
-
-                cudaEventRecord(tstart); // ---------------------------------------- Timing start
-                for (int i = 0; i < num_iterations; ++i)
-                    solve<2><<<bpg, tpb>>>(start, end, device->vba);
-
-                cudaEventRecord(tstop); // ----------------------------------------- Timing end
-                cudaEventSynchronize(tstop);
-                float milliseconds = 0;
-                cudaEventElapsedTime(&milliseconds, tstart, tstop);
-
-                ERRCHK_CUDA_KERNEL_ALWAYS();
-                if (milliseconds < best_time) {
-                    best_time = milliseconds;
-                    best_dims = tpb;
-                }
-            }
-        }
-    }
-#if VERBOSE_PRINTING
-    printf(
-        "Auto-optimization done. The best threadblock dimensions for rkStep: (%d, %d, %d) %f ms\n",
-        best_dims.x, best_dims.y, best_dims.z, double(best_time) / num_iterations);
-#endif
-    /*
-    FILE* fp = fopen("../config/rk3_tbdims.cuh", "w");
-    ERRCHK(fp);
-    fprintf(fp, "%d, %d, %d\n", best_dims.x, best_dims.y, best_dims.z);
-    fclose(fp);
-    */
-
-    rk3_tpb = best_dims;
-    return AC_SUCCESS;
+    const int3 end = (int3){device->local_config.int_params[AC_mx], //
+                            device->local_config.int_params[AC_my], //
+                            device->local_config.int_params[AC_mz]};
+    return acKernelAutoOptimizeIntegration(start, end, device->vba);
 }

 AcResult
@@ -446,7 +279,7 @@ acDeviceLoadScalarUniform(const Device device, const Stream stream, const AcReal
 {
    cudaSetDevice(device->id);
    const size_t offset = (size_t)&d_mesh_info.real_params[param] - (size_t)&d_mesh_info;
-    ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
+    ERRCHK_CUDA(cudaMemcpyToSymbolAsync(&d_mesh_info, &value, sizeof(value), offset,
                                        cudaMemcpyHostToDevice, device->streams[stream]));
    return AC_SUCCESS;
 }
@@ -457,7 +290,7 @@ acDeviceLoadVectorUniform(const Device device, const Stream stream, const AcReal
 {
    cudaSetDevice(device->id);
    const size_t offset = (size_t)&d_mesh_info.real3_params[param] - (size_t)&d_mesh_info;
-    ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
+    ERRCHK_CUDA(cudaMemcpyToSymbolAsync(&d_mesh_info, &value, sizeof(value), offset,
                                        cudaMemcpyHostToDevice, device->streams[stream]));
    return AC_SUCCESS;
 }
@@ -468,7 +301,7 @@ acDeviceLoadIntUniform(const Device device, const Stream stream, const AcIntPara
 {
    cudaSetDevice(device->id);
    const size_t offset = (size_t)&d_mesh_info.int_params[param] - (size_t)&d_mesh_info;
-    ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
+    ERRCHK_CUDA(cudaMemcpyToSymbolAsync(&d_mesh_info, &value, sizeof(value), offset,
                                        cudaMemcpyHostToDevice, device->streams[stream]));
    return AC_SUCCESS;
 }
@@ -479,7 +312,7 @@ acDeviceLoadInt3Uniform(const Device device, const Stream stream, const AcInt3Pa
 {
    cudaSetDevice(device->id);
    const size_t offset = (size_t)&d_mesh_info.int3_params[param] - (size_t)&d_mesh_info;
-    ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
+    ERRCHK_CUDA(cudaMemcpyToSymbolAsync(&d_mesh_info, &value, sizeof(value), offset,
                                        cudaMemcpyHostToDevice, device->streams[stream]));
    return AC_SUCCESS;
 }
@@ -510,7 +343,7 @@ acDeviceLoadMeshInfo(const Device device, const Stream stream, const AcMeshInfo
    ERRCHK_ALWAYS(device_config.int_params[AC_multigpu_offset] ==
                  device->local_config.int_params[AC_multigpu_offset]);

-    ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbolAsync(d_mesh_info, &device_config, sizeof(device_config),
+    ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbolAsync(&d_mesh_info, &device_config, sizeof(device_config),
                                               0, cudaMemcpyHostToDevice, device->streams[stream]));
    return AC_SUCCESS;
 }
@@ -685,52 +518,17 @@ acDeviceIntegrateSubstep(const Device device, const Stream stream, const int ste
                         const int3 start, const int3 end, const AcReal dt)
 {
    cudaSetDevice(device->id);
-
-    const dim3 tpb = rk3_tpb;
-
-    const int3 n = end - start;
-    const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), //
-                   (unsigned int)ceil(n.y / AcReal(tpb.y)), //
-                   (unsigned int)ceil(n.z / AcReal(tpb.z)));
-
-    //#ifdef AC_dt
    acDeviceLoadScalarUniform(device, stream, AC_dt, dt);
-    /*#else
-        (void)dt;
-        ERROR("FATAL ERROR: acDeviceAutoOptimize() or acDeviceIntegrateSubstep() was "
-              "called, but AC_dt was not defined. Either define it or call the generated "
-              "device function acDeviceKernel_<kernel name> which does not require the "
-              "timestep to be defined.\n");
-    #endif*/
-    if (step_number == 0)
-        solve<0><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba);
-    else if (step_number == 1)
-        solve<1><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba);
-    else
-        solve<2><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba);
-
-    ERRCHK_CUDA_KERNEL();
-
-    return AC_SUCCESS;
+    return acKernelIntegrateSubstep(device->streams[stream], step_number, start, end, device->vba);
 }

 AcResult
-acDevicePeriodicBoundcondStep(const Device device, const Stream stream_type,
+acDevicePeriodicBoundcondStep(const Device device, const Stream stream,
                              const VertexBufferHandle vtxbuf_handle, const int3 start,
                              const int3 end)
 {
    cudaSetDevice(device->id);
-    const cudaStream_t stream = device->streams[stream_type];
-
-    const dim3 tpb(8, 2, 8);
-    const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x),
-                   (unsigned int)ceil((end.y - start.y) / (float)tpb.y),
-                   (unsigned int)ceil((end.z - start.z) / (float)tpb.z));
-
-    kernel_periodic_boundconds<<<bpg, tpb, 0, stream>>>(start, end, device->vba.in[vtxbuf_handle]);
-    ERRCHK_CUDA_KERNEL();
-
-    return AC_SUCCESS;
+    return acKernelPeriodicBoundconds(device->streams[stream], start, end, device->vba.in[vtxbuf_handle]);
 }

 AcResult
@@ -757,7 +555,7 @@ acDeviceReduceScal(const Device device, const Stream stream, const ReductionType
                            device->local_config.int_params[AC_ny_max],
                            device->local_config.int_params[AC_nz_max]};

-    *result = reduce_scal(device->streams[stream], rtype, start, end, device->vba.in[vtxbuf_handle],
+    *result = acKernelReduceScal(device->streams[stream], rtype, start, end, device->vba.in[vtxbuf_handle],
                          device->reduce_scratchpad, device->reduce_result);
    return AC_SUCCESS;
 }
@@ -777,7 +575,7 @@ acDeviceReduceVec(const Device device, const Stream stream, const ReductionType
                            device->local_config.int_params[AC_ny_max],
                            device->local_config.int_params[AC_nz_max]};

-    *result = reduce_vec(device->streams[stream], rtype, start, end, device->vba.in[vtxbuf0],
+    *result = acKernelReduceVec(device->streams[stream], rtype, start, end, device->vba.in[vtxbuf0],
                         device->vba.in[vtxbuf1], device->vba.in[vtxbuf2],
                         device->reduce_scratchpad, device->reduce_result);
    return AC_SUCCESS;
--- a/src/core/errchk.h
+++ b/src/core/errchk.h
@@ -65,7 +65,7 @@
 * CUDA-specific error checking
 * =============================================================================
 */
-#ifdef __CUDACC__
+#if defined(AC_USE_CUDA_RUNTIME_API) || defined(__CUDACC__)
 static inline void
 cuda_assert(cudaError_t code, const char* file, int line, bool abort = true)
 {
@@ -98,8 +98,6 @@ cuda_assert(cudaError_t code, const char* file, int line, bool abort = true)
        ERRCHK_CUDA(cudaPeekAtLastError());                                    \
        ERRCHK_CUDA(cudaDeviceSynchronize());                                  \
    }
-    #endif
-
 #endif

 #define ERRCHK_CUDA_ALWAYS(params) { cuda_assert((params), __FILE__, __LINE__); }
@@ -112,3 +110,4 @@ cuda_assert(cudaError_t code, const char* file, int line, bool abort = true)

 #define WARNCHK_CUDA_ALWAYS(params) { cuda_assert((params), __FILE__, __LINE__, false); }
 // clang-format on
+#endif // AC_USE_CUDA_RUNTIME_API
--- a/src/core/grid.cc
+++ b/src/core/grid.cc
--- a/src/core/kernels/boundconds.cu
+++ b/src/core/kernels/boundconds.cu
@@ -0,0 +1,89 @@
+/*
+    Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
+ * \brief Brief info.
+ *
+ * Detailed info.
+ *
+ */
+#include "common.cuh"
+
+#include "src/core/errchk.h"
+
+__global__ void
+kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vtxbuf)
+{
+    const int i_dst = start.x + threadIdx.x + blockIdx.x * blockDim.x;
+    const int j_dst = start.y + threadIdx.y + blockIdx.y * blockDim.y;
+    const int k_dst = start.z + threadIdx.z + blockIdx.z * blockDim.z;
+
+    // If within the start-end range (this allows threadblock dims that are not
+    // divisible by end - start)
+    if (i_dst >= end.x || j_dst >= end.y || k_dst >= end.z)
+        return;
+
+    // if (i_dst >= DCONST(AC_mx) || j_dst >= DCONST(AC_my) || k_dst >= DCONST(AC_mz))
+    //    return;
+
+    // If destination index is inside the computational domain, return since
+    // the boundary conditions are only applied to the ghost zones
+    if (i_dst >= DCONST(AC_nx_min) && i_dst < DCONST(AC_nx_max) && j_dst >= DCONST(AC_ny_min) &&
+        j_dst < DCONST(AC_ny_max) && k_dst >= DCONST(AC_nz_min) && k_dst < DCONST(AC_nz_max))
+        return;
+
+    // Find the source index
+    // Map to nx, ny, nz coordinates
+    int i_src = i_dst - DCONST(AC_nx_min);
+    int j_src = j_dst - DCONST(AC_ny_min);
+    int k_src = k_dst - DCONST(AC_nz_min);
+
+    // Translate (s.t. the index is always positive)
+    i_src += DCONST(AC_nx);
+    j_src += DCONST(AC_ny);
+    k_src += DCONST(AC_nz);
+
+    // Wrap
+    i_src %= DCONST(AC_nx);
+    j_src %= DCONST(AC_ny);
+    k_src %= DCONST(AC_nz);
+
+    // Map to mx, my, mz coordinates
+    i_src += DCONST(AC_nx_min);
+    j_src += DCONST(AC_ny_min);
+    k_src += DCONST(AC_nz_min);
+
+    const int src_idx = DEVICE_VTXBUF_IDX(i_src, j_src, k_src);
+    const int dst_idx = DEVICE_VTXBUF_IDX(i_dst, j_dst, k_dst);
+    vtxbuf[dst_idx]   = vtxbuf[src_idx];
+}
+
+AcResult
+acKernelPeriodicBoundconds(const cudaStream_t stream, const int3& start, const int3& end, AcReal* vtxbuf)
+{
+    const dim3 tpb(8, 2, 8);
+    const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x),
+                   (unsigned int)ceil((end.y - start.y) / (float)tpb.y),
+                   (unsigned int)ceil((end.z - start.z) / (float)tpb.z));
+
+    kernel_periodic_boundconds<<<bpg, tpb, 0, stream>>>(start, end, vtxbuf);
+    ERRCHK_CUDA_KERNEL();
+    return AC_SUCCESS;
+}
--- a/src/core/kernels/boundconds.cuh
+++ b/src/core/kernels/boundconds.cuh
@@ -26,61 +26,7 @@
 */
 #pragma once

-__global__ void
-kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vtxbuf)
-{
-    const int i_dst = start.x + threadIdx.x + blockIdx.x * blockDim.x;
-    const int j_dst = start.y + threadIdx.y + blockIdx.y * blockDim.y;
-    const int k_dst = start.z + threadIdx.z + blockIdx.z * blockDim.z;
+#include "astaroth.h"

-    // If within the start-end range (this allows threadblock dims that are not
-    // divisible by end - start)
-    if (i_dst >= end.x || j_dst >= end.y || k_dst >= end.z)
-        return;
-
-    // if (i_dst >= DCONST(AC_mx) || j_dst >= DCONST(AC_my) || k_dst >= DCONST(AC_mz))
-    //    return;
-
-    // If destination index is inside the computational domain, return since
-    // the boundary conditions are only applied to the ghost zones
-    if (i_dst >= DCONST(AC_nx_min) && i_dst < DCONST(AC_nx_max) && j_dst >= DCONST(AC_ny_min) &&
-        j_dst < DCONST(AC_ny_max) && k_dst >= DCONST(AC_nz_min) && k_dst < DCONST(AC_nz_max))
-        return;
-
-    // Find the source index
-    // Map to nx, ny, nz coordinates
-    int i_src = i_dst - DCONST(AC_nx_min);
-    int j_src = j_dst - DCONST(AC_ny_min);
-    int k_src = k_dst - DCONST(AC_nz_min);
-
-    // Translate (s.t. the index is always positive)
-    i_src += DCONST(AC_nx);
-    j_src += DCONST(AC_ny);
-    k_src += DCONST(AC_nz);
-
-    // Wrap
-    i_src %= DCONST(AC_nx);
-    j_src %= DCONST(AC_ny);
-    k_src %= DCONST(AC_nz);
-
-    // Map to mx, my, mz coordinates
-    i_src += DCONST(AC_nx_min);
-    j_src += DCONST(AC_ny_min);
-    k_src += DCONST(AC_nz_min);
-
-    const int src_idx = DEVICE_VTXBUF_IDX(i_src, j_src, k_src);
-    const int dst_idx = DEVICE_VTXBUF_IDX(i_dst, j_dst, k_dst);
-    vtxbuf[dst_idx]   = vtxbuf[src_idx];
-}
-
-static void
-periodic_boundconds(const cudaStream_t stream, const int3& start, const int3& end, AcReal* vtxbuf)
-{
-    const dim3 tpb(8, 2, 8);
-    const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x),
-                   (unsigned int)ceil((end.y - start.y) / (float)tpb.y),
-                   (unsigned int)ceil((end.z - start.z) / (float)tpb.z));
-
-    kernel_periodic_boundconds<<<bpg, tpb, 0, stream>>>(start, end, vtxbuf);
-    ERRCHK_CUDA_KERNEL();
-}
+AcResult
+acKernelPeriodicBoundconds(const cudaStream_t stream, const int3& start, const int3& end, AcReal* vtxbuf);
--- a/src/core/kernels/common.cuh
+++ b/src/core/kernels/common.cuh
@@ -0,0 +1,136 @@
+/*
+    Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
+ * \brief Brief info.
+ *
+ * Detailed info.
+ *
+ */
+#pragma once
+#include "astaroth.h"
+
+extern __constant__ AcMeshInfo d_mesh_info;
+
+typedef struct {
+    AcReal* in[NUM_VTXBUF_HANDLES];
+    AcReal* out[NUM_VTXBUF_HANDLES];
+
+    AcReal* profiles[NUM_SCALARARRAY_HANDLES];
+} VertexBufferArray;
+
+static int __device__ __forceinline__
+DCONST(const AcIntParam param)
+{
+    return d_mesh_info.int_params[param];
+}
+static int3 __device__ __forceinline__
+DCONST(const AcInt3Param param)
+{
+    return d_mesh_info.int3_params[param];
+}
+static AcReal __device__ __forceinline__
+DCONST(const AcRealParam param)
+{
+    return d_mesh_info.real_params[param];
+}
+static AcReal3 __device__ __forceinline__
+DCONST(const AcReal3Param param)
+{
+    return d_mesh_info.real3_params[param];
+}
+static __device__ constexpr VertexBufferHandle
+DCONST(const VertexBufferHandle handle)
+{
+    return handle;
+}
+#define DEVICE_VTXBUF_IDX(i, j, k) ((i) + (j)*DCONST(AC_mx) + (k)*DCONST(AC_mxy))
+#define DEVICE_1D_COMPDOMAIN_IDX(i, j, k) ((i) + (j)*DCONST(AC_nx) + (k)*DCONST(AC_nxy))
+#define globalGridN (d_mesh_info.int3_params[AC_global_grid_n])
+//#define globalMeshM // Placeholder
+//#define localMeshN // Placeholder
+//#define localMeshM // Placeholder
+//#define localMeshN_min // Placeholder
+//#define globalMeshN_min // Placeholder
+#define d_multigpu_offset (d_mesh_info.int3_params[AC_multigpu_offset])
+//#define d_multinode_offset (d_mesh_info.int3_params[AC_multinode_offset]) // Placeholder
+
+
+
+
+static __device__ constexpr int
+IDX(const int i)
+{
+    return i;
+}
+
+static __device__ __forceinline__ int
+IDX(const int i, const int j, const int k)
+{
+    return DEVICE_VTXBUF_IDX(i, j, k);
+}
+
+static __device__ __forceinline__ int
+IDX(const int3 idx)
+{
+    return DEVICE_VTXBUF_IDX(idx.x, idx.y, idx.z);
+}
+
+
+
+
+
+
+
+
+//#include <thrust/complex.h>
+// using namespace thrust;
+
+
+
+
+#include <cuComplex.h>
+#if AC_DOUBLE_PRECISION == 1
+typedef cuDoubleComplex acComplex;
+#define acComplex(x, y) make_cuDoubleComplex(x, y)
+#else
+typedef cuFloatComplex acComplex;
+#define acComplex(x, y) make_cuFloatComplex(x, y)
+#endif
+static __device__ inline acComplex
+exp(const acComplex& val)
+{
+    return acComplex(exp(val.x) * cos(val.y), exp(val.x) * sin(val.y));
+}
+static __device__ inline acComplex operator*(const AcReal& a, const acComplex& b)
+{
+    return (acComplex){a * b.x, a * b.y};
+}
+
+static __device__ inline acComplex operator*(const acComplex& b, const AcReal& a)
+{
+    return (acComplex){a * b.x, a * b.y};
+}
+
+static __device__ inline acComplex operator*(const acComplex& a, const acComplex& b)
+{
+    return (acComplex){a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x};
+}
+//#include <complex>
--- a/src/core/kernels/integration.cu
+++ b/src/core/kernels/integration.cu
@@ -0,0 +1,296 @@
+/*
+    Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
+ * \brief Brief info.
+ *
+ * Detailed info.
+ *
+ */
+#include "common.cuh"
+
+#include "src/core/errchk.h"
+#include "src/core/math_utils.h"
+
+#include <assert.h>
+
+__constant__ AcMeshInfo d_mesh_info; // Extern in kernels/common.cuh.
+
+static_assert(NUM_VTXBUF_HANDLES > 0, "ERROR: At least one uniform ScalarField must be declared.");
+
+// Device info
+#define REGISTERS_PER_THREAD (255)
+#define MAX_REGISTERS_PER_BLOCK (65536)
+#define MAX_THREADS_PER_BLOCK (1024)
+#define WARP_SIZE (32)
+
+#define make_int3(a, b, c)                                                                         \
+    (int3) { (int)a, (int)b, (int)c }
+
+template <int step_number>
+static __device__ __forceinline__ AcReal
+rk3_integrate(const AcReal state_previous, const AcReal state_current, const AcReal rate_of_change,
+              const AcReal dt)
+{
+    // Williamson (1980)
+    const AcReal alpha[] = {0, AcReal(.0), AcReal(-5. / 9.), AcReal(-153. / 128.)};
+    const AcReal beta[]  = {0, AcReal(1. / 3.), AcReal(15. / 16.), AcReal(8. / 15.)};
+
+    // Note the indexing: +1 to avoid an unnecessary warning about "out-of-bounds"
+    // access (when accessing beta[step_number-1] even when step_number >= 1)
+    switch (step_number) {
+    case 0:
+        return state_current + beta[step_number + 1] * rate_of_change * dt;
+    case 1: // Fallthrough
+    case 2:
+        return state_current +
+               beta[step_number + 1] * (alpha[step_number + 1] * (AcReal(1.) / beta[step_number]) *
+                                            (state_current - state_previous) +
+                                        rate_of_change * dt);
+    default:
+        return NAN;
+    }
+}
+
+template <int step_number>
+static __device__ __forceinline__ AcReal3
+rk3_integrate(const AcReal3 state_previous, const AcReal3 state_current,
+              const AcReal3 rate_of_change, const AcReal dt)
+{
+    return (AcReal3){
+        rk3_integrate<step_number>(state_previous.x, state_current.x, rate_of_change.x, dt),
+        rk3_integrate<step_number>(state_previous.y, state_current.y, rate_of_change.y, dt),
+        rk3_integrate<step_number>(state_previous.z, state_current.z, rate_of_change.z, dt)};
+}
+
+#define rk3(state_previous, state_current, rate_of_change, dt)                                     \
+    rk3_integrate<step_number>(state_previous, value(state_current), rate_of_change, dt)
+
+static __device__ void
+write(AcReal* __restrict__ out[], const int handle, const int idx, const AcReal value)
+{
+    out[handle][idx] = value;
+}
+
+static __device__ __forceinline__ void
+write(AcReal* __restrict__ out[], const int3 vec, const int idx, const AcReal3 value)
+{
+    write(out, vec.x, idx, value.x);
+    write(out, vec.y, idx, value.y);
+    write(out, vec.z, idx, value.z);
+}
+
+static __device__ __forceinline__ AcReal
+read_out(const int idx, AcReal* __restrict__ field[], const int handle)
+{
+    return field[handle][idx];
+}
+
+static __device__ __forceinline__ AcReal3
+read_out(const int idx, AcReal* __restrict__ field[], const int3 handle)
+{
+    return (AcReal3){read_out(idx, field, handle.x), read_out(idx, field, handle.y),
+                     read_out(idx, field, handle.z)};
+}
+
+#define WRITE_OUT(handle, value) (write(buffer.out, handle, idx, value))
+#define READ(handle) (read_data(vertexIdx, globalVertexIdx, buffer.in, handle))
+#define READ_OUT(handle) (read_out(idx, buffer.out, handle))
+
+#define GEN_PREPROCESSED_PARAM_BOILERPLATE const int3 &vertexIdx, const int3 &globalVertexIdx
+#define GEN_KERNEL_PARAM_BOILERPLATE const int3 start, const int3 end, VertexBufferArray buffer
+
+#define GEN_KERNEL_BUILTIN_VARIABLES_BOILERPLATE()                                                 \
+    const int3 vertexIdx       = (int3){threadIdx.x + blockIdx.x * blockDim.x + start.x,           \
+                                  threadIdx.y + blockIdx.y * blockDim.y + start.y,           \
+                                  threadIdx.z + blockIdx.z * blockDim.z + start.z};          \
+    const int3 globalVertexIdx = (int3){d_multigpu_offset.x + vertexIdx.x,                         \
+                                        d_multigpu_offset.y + vertexIdx.y,                         \
+                                        d_multigpu_offset.z + vertexIdx.z};                        \
+    (void)globalVertexIdx;                                                                         \
+    if (vertexIdx.x >= end.x || vertexIdx.y >= end.y || vertexIdx.z >= end.z)                      \
+        return;                                                                                    \
+                                                                                                   \
+    assert(vertexIdx.x < DCONST(AC_nx_max) && vertexIdx.y < DCONST(AC_ny_max) &&           \
+           vertexIdx.z < DCONST(AC_nz_max));                                                   \
+                                                                                                   \
+    assert(vertexIdx.x >= DCONST(AC_nx_min) && vertexIdx.y >= DCONST(AC_ny_min) &&         \
+           vertexIdx.z >= DCONST(AC_nz_min));                                                  \
+                                                                                                   \
+    const int idx = IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z);
+
+// clang-format off
+/*
+#define GEN_DEVICE_FUNC_HOOK(identifier)                                                           \
+    template <int step_number>                                                                     \
+    AcResult acDeviceKernel_##identifier(const cudaStream_t stream,                 \
+                                         const int3 start, const int3 end, VertexBufferArray vba)                         \
+    {                                                                                              \
+                                                                                                   \
+        const dim3 tpb(32, 1, 4);                                                                  \
+                                                                                                   \
+        const int3 n = end - start;                                                                \
+        const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)),                                    \
+                       (unsigned int)ceil(n.y / AcReal(tpb.y)),                                    \
+                       (unsigned int)ceil(n.z / AcReal(tpb.z)));                                   \
+                                                                                                   \
+        identifier<step_number>                                                                    \
+            <<<bpg, tpb, 0, stream>>>(start, end, vba);                   \
+        ERRCHK_CUDA_KERNEL();                                                                      \
+                                                                                                   \
+        return AC_SUCCESS;                                                                         \
+    }
+*/
+#define GEN_DEVICE_FUNC_HOOK(identifier)
+
+#include "user_kernels.h"
+
+static dim3 rk3_tpb(32, 1, 4);
+
+AcResult
+acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferArray vba)
+{
+    // RK3
+    dim3 best_dims(0, 0, 0);
+    float best_time          = INFINITY;
+    const int num_iterations = 10;
+
+    for (int z = 1; z <= MAX_THREADS_PER_BLOCK; ++z) {
+        for (int y = 1; y <= MAX_THREADS_PER_BLOCK; ++y) {
+            for (int x = WARP_SIZE; x <= MAX_THREADS_PER_BLOCK; x += WARP_SIZE) {
+
+                if (x > end.x - start.x || y > end.y - start.y || z > end.z - start.z)
+                    break;
+                if (x * y * z > MAX_THREADS_PER_BLOCK)
+                    break;
+
+                if (x * y * z * REGISTERS_PER_THREAD > MAX_REGISTERS_PER_BLOCK)
+                    break;
+
+                if (((x * y * z) % WARP_SIZE) != 0)
+                    continue;
+
+                const dim3 tpb(x, y, z);
+                const int3 n = end - start;
+                const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), //
+                               (unsigned int)ceil(n.y / AcReal(tpb.y)), //
+                               (unsigned int)ceil(n.z / AcReal(tpb.z)));
+
+                cudaDeviceSynchronize();
+                if (cudaGetLastError() != cudaSuccess) // resets the error if any
+                    continue;
+
+                // printf("(%d, %d, %d)\n", x, y, z);
+
+                cudaEvent_t tstart, tstop;
+                cudaEventCreate(&tstart);
+                cudaEventCreate(&tstop);
+
+                // #ifdef AC_dt
+                //acDeviceLoadScalarUniform(device, STREAM_DEFAULT, AC_dt, FLT_EPSILON); // TODO note, temporarily disabled
+                /*#else
+                                ERROR("FATAL ERROR: acDeviceAutoOptimize() or
+                acDeviceIntegrateSubstep() was " "called, but AC_dt was not defined. Either define
+                it or call the generated " "device function acDeviceKernel_<kernel name> which does
+                not require the " "timestep to be defined.\n"); #endif*/
+
+                cudaEventRecord(tstart); // ---------------------------------------- Timing start
+                for (int i = 0; i < num_iterations; ++i)
+                    solve<2><<<bpg, tpb>>>(start, end, vba);
+
+                cudaEventRecord(tstop); // ----------------------------------------- Timing end
+                cudaEventSynchronize(tstop);
+                float milliseconds = 0;
+                cudaEventElapsedTime(&milliseconds, tstart, tstop);
+
+                ERRCHK_CUDA_KERNEL_ALWAYS();
+                if (milliseconds < best_time) {
+                    best_time = milliseconds;
+                    best_dims = tpb;
+                }
+            }
+        }
+    }
+#if VERBOSE_PRINTING
+    printf(
+        "Auto-optimization done. The best threadblock dimensions for rkStep: (%d, %d, %d) %f ms\n",
+        best_dims.x, best_dims.y, best_dims.z, double(best_time) / num_iterations);
+#endif
+    /*
+    FILE* fp = fopen("../config/rk3_tbdims.cuh", "w");
+    ERRCHK(fp);
+    fprintf(fp, "%d, %d, %d\n", best_dims.x, best_dims.y, best_dims.z);
+    fclose(fp);
+    */
+
+    rk3_tpb = best_dims;
+    return AC_SUCCESS;
+}
+
+AcResult
+acKernelIntegrateSubstep(const cudaStream_t stream, const int step_number, const int3 start, const int3 end, VertexBufferArray vba)
+{
+    const dim3 tpb = rk3_tpb;
+
+    const int3 n = end - start;
+    const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), //
+                   (unsigned int)ceil(n.y / AcReal(tpb.y)), //
+                   (unsigned int)ceil(n.z / AcReal(tpb.z)));
+
+    //#ifdef AC_dt
+    //acDeviceLoadScalarUniform(device, stream, AC_dt, dt);
+    /*#else
+        (void)dt;
+        ERROR("FATAL ERROR: acDeviceAutoOptimize() or acDeviceIntegrateSubstep() was "
+              "called, but AC_dt was not defined. Either define it or call the generated "
+              "device function acDeviceKernel_<kernel name> which does not require the "
+              "timestep to be defined.\n");
+    #endif*/
+    if (step_number == 0)
+        solve<0><<<bpg, tpb, 0, stream>>>(start, end, vba);
+    else if (step_number == 1)
+        solve<1><<<bpg, tpb, 0, stream>>>(start, end, vba);
+    else
+        solve<2><<<bpg, tpb, 0, stream>>>(start, end, vba);
+
+    ERRCHK_CUDA_KERNEL();
+
+    return AC_SUCCESS;
+}
+
+static __global__ void
+dummy_kernel(void)
+{
+    DCONST((AcIntParam)0);
+    DCONST((AcInt3Param)0);
+    DCONST((AcRealParam)0);
+    DCONST((AcReal3Param)0);
+    acComplex a = exp(AcReal(1) * acComplex(1, 1) * AcReal(1));
+    a* a;
+}
+
+AcReal
+acKernelDummy(void)
+{
+    dummy_kernel<<<1, 1>>>();
+    ERRCHK_CUDA_KERNEL_ALWAYS();
+    return AC_SUCCESS;
+}
+
--- a/src/core/kernels/integration.cuh
+++ b/src/core/kernels/integration.cuh
@@ -25,146 +25,12 @@
 *
 */
 #pragma once
-#include "src/core/math_utils.h"

-#include <assert.h>
+AcResult
+acKernelDummy(void);

-static_assert(NUM_VTXBUF_HANDLES > 0, "ERROR: At least one uniform ScalarField must be declared.");
+AcResult
+acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferArray vba);

-static __device__ constexpr int
-IDX(const int i)
-{
-    return i;
-}
-
-static __device__ __forceinline__ int
-IDX(const int i, const int j, const int k)
-{
-    return DEVICE_VTXBUF_IDX(i, j, k);
-}
-
-static __device__ __forceinline__ int
-IDX(const int3 idx)
-{
-    return DEVICE_VTXBUF_IDX(idx.x, idx.y, idx.z);
-}
-
-#define make_int3(a, b, c)                                                                         \
-    (int3) { (int)a, (int)b, (int)c }
-
-template <int step_number>
-static __device__ __forceinline__ AcReal
-rk3_integrate(const AcReal state_previous, const AcReal state_current, const AcReal rate_of_change,
-              const AcReal dt)
-{
-    // Williamson (1980)
-    const AcReal alpha[] = {0, AcReal(.0), AcReal(-5. / 9.), AcReal(-153. / 128.)};
-    const AcReal beta[]  = {0, AcReal(1. / 3.), AcReal(15. / 16.), AcReal(8. / 15.)};
-
-    // Note the indexing: +1 to avoid an unnecessary warning about "out-of-bounds"
-    // access (when accessing beta[step_number-1] even when step_number >= 1)
-    switch (step_number) {
-    case 0:
-        return state_current + beta[step_number + 1] * rate_of_change * dt;
-    case 1: // Fallthrough
-    case 2:
-        return state_current +
-               beta[step_number + 1] * (alpha[step_number + 1] * (AcReal(1.) / beta[step_number]) *
-                                            (state_current - state_previous) +
-                                        rate_of_change * dt);
-    default:
-        return NAN;
-    }
-}
-
-template <int step_number>
-static __device__ __forceinline__ AcReal3
-rk3_integrate(const AcReal3 state_previous, const AcReal3 state_current,
-              const AcReal3 rate_of_change, const AcReal dt)
-{
-    return (AcReal3){
-        rk3_integrate<step_number>(state_previous.x, state_current.x, rate_of_change.x, dt),
-        rk3_integrate<step_number>(state_previous.y, state_current.y, rate_of_change.y, dt),
-        rk3_integrate<step_number>(state_previous.z, state_current.z, rate_of_change.z, dt)};
-}
-
-#define rk3(state_previous, state_current, rate_of_change, dt)                                     \
-    rk3_integrate<step_number>(state_previous, value(state_current), rate_of_change, dt)
-
-static __device__ void
-write(AcReal* __restrict__ out[], const int handle, const int idx, const AcReal value)
-{
-    out[handle][idx] = value;
-}
-
-static __device__ __forceinline__ void
-write(AcReal* __restrict__ out[], const int3 vec, const int idx, const AcReal3 value)
-{
-    write(out, vec.x, idx, value.x);
-    write(out, vec.y, idx, value.y);
-    write(out, vec.z, idx, value.z);
-}
-
-static __device__ __forceinline__ AcReal
-read_out(const int idx, AcReal* __restrict__ field[], const int handle)
-{
-    return field[handle][idx];
-}
-
-static __device__ __forceinline__ AcReal3
-read_out(const int idx, AcReal* __restrict__ field[], const int3 handle)
-{
-    return (AcReal3){read_out(idx, field, handle.x), read_out(idx, field, handle.y),
-                     read_out(idx, field, handle.z)};
-}
-
-#define WRITE_OUT(handle, value) (write(buffer.out, handle, idx, value))
-#define READ(handle) (read_data(vertexIdx, globalVertexIdx, buffer.in, handle))
-#define READ_OUT(handle) (read_out(idx, buffer.out, handle))
-
-#define GEN_PREPROCESSED_PARAM_BOILERPLATE const int3 &vertexIdx, const int3 &globalVertexIdx
-#define GEN_KERNEL_PARAM_BOILERPLATE const int3 start, const int3 end, VertexBufferArray buffer
-
-#define GEN_KERNEL_BUILTIN_VARIABLES_BOILERPLATE()                                                 \
-    const int3 vertexIdx       = (int3){threadIdx.x + blockIdx.x * blockDim.x + start.x,           \
-                                  threadIdx.y + blockIdx.y * blockDim.y + start.y,           \
-                                  threadIdx.z + blockIdx.z * blockDim.z + start.z};          \
-    const int3 globalVertexIdx = (int3){d_multigpu_offset.x + vertexIdx.x,                         \
-                                        d_multigpu_offset.y + vertexIdx.y,                         \
-                                        d_multigpu_offset.z + vertexIdx.z};                        \
-    (void)globalVertexIdx;                                                                         \
-    if (vertexIdx.x >= end.x || vertexIdx.y >= end.y || vertexIdx.z >= end.z)                      \
-        return;                                                                                    \
-                                                                                                   \
-    assert(vertexIdx.x < DCONST(AC_nx_max) && vertexIdx.y < DCONST(AC_ny_max) &&           \
-           vertexIdx.z < DCONST(AC_nz_max));                                                   \
-                                                                                                   \
-    assert(vertexIdx.x >= DCONST(AC_nx_min) && vertexIdx.y >= DCONST(AC_ny_min) &&         \
-           vertexIdx.z >= DCONST(AC_nz_min));                                                  \
-                                                                                                   \
-    const int idx = IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z);
-
-// clang-format off
-#define GEN_DEVICE_FUNC_HOOK(identifier)                                                           \
-    template <int step_number>                                                                     \
-    AcResult acDeviceKernel_##identifier(const Device device, const Stream stream,                 \
-                                         const int3 start, const int3 end)                         \
-    {                                                                                              \
-        cudaSetDevice(device->id);                                                                 \
-                                                                                                   \
-        const dim3 tpb(32, 1, 4);                                                                  \
-                                                                                                   \
-        const int3 n = end - start;                                                                \
-        const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)),                                    \
-                       (unsigned int)ceil(n.y / AcReal(tpb.y)),                                    \
-                       (unsigned int)ceil(n.z / AcReal(tpb.z)));                                   \
-                                                                                                   \
-        identifier<step_number>                                                                    \
-            <<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba);                   \
-        ERRCHK_CUDA_KERNEL();                                                                      \
-                                                                                                   \
-        return AC_SUCCESS;                                                                         \
-    }
-
-
-#include "user_kernels.h"
+AcResult
+acKernelIntegrateSubstep(const cudaStream_t stream, const int step_number, const int3 start, const int3 end, VertexBufferArray vba);
--- a/src/core/kernels/reductions.cu
+++ b/src/core/kernels/reductions.cu
@@ -0,0 +1,283 @@
+/*
+    Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
+ * \brief Brief info.
+ *
+ * Detailed info.
+ *
+ */
+#include "common.cuh"
+
+#include "src/core/errchk.h"
+#include "src/core/math_utils.h" // is_power_of_two
+
+/*
+Reduction steps:
+ 1 of 3: Compute the initial value (a, a*a or exp(a)*exp(a)) and put the result in scratchpad
+ 2 of 3: Compute most of the reductions into a single block of data
+ 3 of 3: After all results have been stored into the final block, reduce the data in the final block
+*/
+
+// Function pointer definitions
+typedef AcReal (*FilterFunc)(const AcReal&);
+typedef AcReal (*FilterFuncVec)(const AcReal&, const AcReal&, const AcReal&);
+typedef AcReal (*ReduceFunc)(const AcReal&, const AcReal&);
+
+// clang-format off
+/* Comparison funcs */
+static __device__ inline AcReal
+dmax(const AcReal& a, const AcReal& b) { return a > b ? a : b; }
+
+static __device__ inline AcReal
+dmin(const AcReal& a, const AcReal& b) { return a < b ? a : b; }
+
+static __device__ inline AcReal
+dsum(const AcReal& a, const AcReal& b) { return a + b; }
+
+/* Function used to determine the values used during reduction */
+static __device__ inline AcReal
+dvalue(const AcReal& a) { return AcReal(a); }
+
+static __device__ inline AcReal
+dsquared(const AcReal& a) { return (AcReal)(a*a); }
+
+static __device__ inline AcReal
+dexp_squared(const AcReal& a) { return exp(a)*exp(a); }
+
+static __device__ inline AcReal
+dlength_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return sqrt(a*a + b*b + c*c); }
+
+static __device__ inline AcReal
+dsquared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return dsquared(a) + dsquared(b) + dsquared(c); }
+
+static __device__ inline AcReal
+dexp_squared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return dexp_squared(a) + dexp_squared(b) + dexp_squared(c); }
+// clang-format on
+
+#include <assert.h>
+template <FilterFunc filter>
+__global__ void
+kernel_filter(const __restrict__ AcReal* src, const int3 start, const int3 end, AcReal* dst)
+{
+    const int3 src_idx = (int3){start.x + threadIdx.x + blockIdx.x * blockDim.x,
+                                start.y + threadIdx.y + blockIdx.y * blockDim.y,
+                                start.z + threadIdx.z + blockIdx.z * blockDim.z};
+
+    const int nx = end.x - start.x;
+    const int ny = end.y - start.y;
+    const int nz = end.z - start.z;
+    (void)nz; // Suppressed unused variable warning when not compiling with debug flags
+
+    const int3 dst_idx = (int3){threadIdx.x + blockIdx.x * blockDim.x,
+                                threadIdx.y + blockIdx.y * blockDim.y,
+                                threadIdx.z + blockIdx.z * blockDim.z};
+
+    assert(src_idx.x < DCONST(AC_nx_max) && src_idx.y < DCONST(AC_ny_max) &&
+           src_idx.z < DCONST(AC_nz_max));
+    assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
+    assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);
+
+    dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(src[IDX(src_idx)]);
+}
+
+template <FilterFuncVec filter>
+__global__ void
+kernel_filter_vec(const __restrict__ AcReal* src0, const __restrict__ AcReal* src1,
+                  const __restrict__ AcReal* src2, const int3 start, const int3 end, AcReal* dst)
+{
+    const int3 src_idx = (int3){start.x + threadIdx.x + blockIdx.x * blockDim.x,
+                                start.y + threadIdx.y + blockIdx.y * blockDim.y,
+                                start.z + threadIdx.z + blockIdx.z * blockDim.z};
+
+    const int nx = end.x - start.x;
+    const int ny = end.y - start.y;
+    const int nz = end.z - start.z;
+    (void)nz; // Suppressed unused variable warning when not compiling with debug flags
+
+    const int3 dst_idx = (int3){threadIdx.x + blockIdx.x * blockDim.x,
+                                threadIdx.y + blockIdx.y * blockDim.y,
+                                threadIdx.z + blockIdx.z * blockDim.z};
+
+    assert(src_idx.x < DCONST(AC_nx_max) && src_idx.y < DCONST(AC_ny_max) &&
+           src_idx.z < DCONST(AC_nz_max));
+    assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
+    assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);
+
+    dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(
+        src0[IDX(src_idx)], src1[IDX(src_idx)], src2[IDX(src_idx)]);
+}
+
+template <ReduceFunc reduce>
+__global__ void
+kernel_reduce(AcReal* scratchpad, const int num_elems)
+{
+    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+
+    extern __shared__ AcReal smem[];
+    if (idx < num_elems) {
+        smem[threadIdx.x] = scratchpad[idx];
+    }
+    else {
+        smem[threadIdx.x] = NAN;
+    }
+    __syncthreads();
+
+    int offset = blockDim.x / 2;
+    assert(offset % 2 == 0);
+    while (offset > 0) {
+        if (threadIdx.x < offset) {
+            smem[threadIdx.x] = reduce(smem[threadIdx.x], smem[threadIdx.x + offset]);
+        }
+        offset /= 2;
+        __syncthreads();
+    }
+
+    if (threadIdx.x == 0) {
+        scratchpad[idx] = smem[threadIdx.x];
+    }
+}
+
+template <ReduceFunc reduce>
+__global__ void
+kernel_reduce_block(const __restrict__ AcReal* scratchpad, const int num_blocks,
+                    const int block_size, AcReal* result)
+{
+    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (idx != 0) {
+        return;
+    }
+
+    AcReal res = scratchpad[0];
+    for (int i = 1; i < num_blocks; ++i) {
+        res = reduce(res, scratchpad[i * block_size]);
+    }
+    *result = res;
+}
+
+AcReal
+acKernelReduceScal(const cudaStream_t stream, const ReductionType rtype, const int3& start,
+            const int3& end, const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result)
+{
+    const unsigned nx        = end.x - start.x;
+    const unsigned ny        = end.y - start.y;
+    const unsigned nz        = end.z - start.z;
+    const unsigned num_elems = nx * ny * nz;
+
+    const dim3 tpb_filter(32, 4, 1);
+    const dim3 bpg_filter((unsigned int)ceil(nx / AcReal(tpb_filter.x)),
+                          (unsigned int)ceil(ny / AcReal(tpb_filter.y)),
+                          (unsigned int)ceil(nz / AcReal(tpb_filter.z)));
+
+    const int tpb_reduce = 128;
+    const int bpg_reduce = num_elems / tpb_reduce;
+
+    ERRCHK(nx >= tpb_filter.x);
+    ERRCHK(ny >= tpb_filter.y);
+    ERRCHK(nz >= tpb_filter.z);
+    ERRCHK(tpb_reduce <= num_elems);
+    ERRCHK(nx * ny * nz % 2 == 0);
+
+    // clang-format off
+    if (rtype == RTYPE_MAX) {
+        kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
+        kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
+        kernel_reduce_block<dmax><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    } else if (rtype == RTYPE_MIN) {
+        kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
+        kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
+        kernel_reduce_block<dmin><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    } else if (rtype == RTYPE_RMS) {
+        kernel_filter<dsquared><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
+        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
+        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    } else if (rtype == RTYPE_RMS_EXP) {
+        kernel_filter<dexp_squared><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
+        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
+        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    } else if (rtype == RTYPE_SUM) {
+        kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
+        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
+        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    } else {
+        ERROR("Unrecognized rtype");
+    }
+    // clang-format on
+    cudaStreamSynchronize(stream);
+    AcReal result;
+    cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
+    return result;
+}
+
+AcReal
+acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, const int3& start, const int3& end,
+           const AcReal* vtxbuf0, const AcReal* vtxbuf1, const AcReal* vtxbuf2, AcReal* scratchpad,
+           AcReal* reduce_result)
+{
+    const unsigned nx        = end.x - start.x;
+    const unsigned ny        = end.y - start.y;
+    const unsigned nz        = end.z - start.z;
+    const unsigned num_elems = nx * ny * nz;
+
+    const dim3 tpb_filter(32, 4, 1);
+    const dim3 bpg_filter((unsigned int)ceil(nx / AcReal(tpb_filter.x)),
+                          (unsigned int)ceil(ny / AcReal(tpb_filter.y)),
+                          (unsigned int)ceil(nz / AcReal(tpb_filter.z)));
+
+    const int tpb_reduce = 128;
+    const int bpg_reduce = num_elems / tpb_reduce;
+
+    ERRCHK(nx >= tpb_filter.x);
+    ERRCHK(ny >= tpb_filter.y);
+    ERRCHK(nz >= tpb_filter.z);
+    ERRCHK(tpb_reduce <= num_elems);
+    ERRCHK(nx * ny * nz % 2 == 0);
+
+    // clang-format off
+    if (rtype == RTYPE_MAX) {
+        kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
+        kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
+        kernel_reduce_block<dmax><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    } else if (rtype == RTYPE_MIN) {
+        kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
+        kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
+        kernel_reduce_block<dmin><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    } else if (rtype == RTYPE_RMS) {
+        kernel_filter_vec<dsquared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
+        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
+        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    } else if (rtype == RTYPE_RMS_EXP) {
+        kernel_filter_vec<dexp_squared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
+        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
+        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    } else if (rtype == RTYPE_SUM) {
+        kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
+        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
+        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    } else {
+        ERROR("Unrecognized rtype");
+    }
+    // clang-format on
+
+    cudaStreamSynchronize(stream);
+    AcReal result;
+    cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
+    return result;
+}
--- a/src/core/kernels/reductions.cuh
+++ b/src/core/kernels/reductions.cuh
@@ -25,258 +25,13 @@
 *
 */
 #pragma once
+#include <astaroth.h>

-#include "src/core/math_utils.h" // is_power_of_two
+AcReal
+acKernelReduceScal(const cudaStream_t stream, const ReductionType rtype, const int3& start,
+            const int3& end, const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result);

-/*
-Reduction steps:
- 1 of 3: Compute the initial value (a, a*a or exp(a)*exp(a)) and put the result in scratchpad
- 2 of 3: Compute most of the reductions into a single block of data
- 3 of 3: After all results have been stored into the final block, reduce the data in the final block
-*/
-
-// Function pointer definitions
-typedef AcReal (*FilterFunc)(const AcReal&);
-typedef AcReal (*FilterFuncVec)(const AcReal&, const AcReal&, const AcReal&);
-typedef AcReal (*ReduceFunc)(const AcReal&, const AcReal&);
-
-// clang-format off
-/* Comparison funcs */
-static __device__ inline AcReal
-dmax(const AcReal& a, const AcReal& b) { return a > b ? a : b; }
-
-static __device__ inline AcReal
-dmin(const AcReal& a, const AcReal& b) { return a < b ? a : b; }
-
-static __device__ inline AcReal
-dsum(const AcReal& a, const AcReal& b) { return a + b; }
-
-/* Function used to determine the values used during reduction */
-static __device__ inline AcReal
-dvalue(const AcReal& a) { return AcReal(a); }
-
-static __device__ inline AcReal
-dsquared(const AcReal& a) { return (AcReal)(a*a); }
-
-static __device__ inline AcReal
-dexp_squared(const AcReal& a) { return exp(a)*exp(a); }
-
-static __device__ inline AcReal
-dlength_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return sqrt(a*a + b*b + c*c); }
-
-static __device__ inline AcReal
-dsquared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return dsquared(a) + dsquared(b) + dsquared(c); }
-
-static __device__ inline AcReal
-dexp_squared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return dexp_squared(a) + dexp_squared(b) + dexp_squared(c); }
-// clang-format on
-
-#include <assert.h>
-template <FilterFunc filter>
-__global__ void
-kernel_filter(const __restrict__ AcReal* src, const int3 start, const int3 end, AcReal* dst)
-{
-    const int3 src_idx = (int3){start.x + threadIdx.x + blockIdx.x * blockDim.x,
-                                start.y + threadIdx.y + blockIdx.y * blockDim.y,
-                                start.z + threadIdx.z + blockIdx.z * blockDim.z};
-
-    const int nx = end.x - start.x;
-    const int ny = end.y - start.y;
-    const int nz = end.z - start.z;
-    (void)nz; // Suppressed unused variable warning when not compiling with debug flags
-
-    const int3 dst_idx = (int3){threadIdx.x + blockIdx.x * blockDim.x,
-                                threadIdx.y + blockIdx.y * blockDim.y,
-                                threadIdx.z + blockIdx.z * blockDim.z};
-
-    assert(src_idx.x < DCONST(AC_nx_max) && src_idx.y < DCONST(AC_ny_max) &&
-           src_idx.z < DCONST(AC_nz_max));
-    assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
-    assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);
-
-    dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(src[IDX(src_idx)]);
-}
-
-template <FilterFuncVec filter>
-__global__ void
-kernel_filter_vec(const __restrict__ AcReal* src0, const __restrict__ AcReal* src1,
-                  const __restrict__ AcReal* src2, const int3 start, const int3 end, AcReal* dst)
-{
-    const int3 src_idx = (int3){start.x + threadIdx.x + blockIdx.x * blockDim.x,
-                                start.y + threadIdx.y + blockIdx.y * blockDim.y,
-                                start.z + threadIdx.z + blockIdx.z * blockDim.z};
-
-    const int nx = end.x - start.x;
-    const int ny = end.y - start.y;
-    const int nz = end.z - start.z;
-    (void)nz; // Suppressed unused variable warning when not compiling with debug flags
-
-    const int3 dst_idx = (int3){threadIdx.x + blockIdx.x * blockDim.x,
-                                threadIdx.y + blockIdx.y * blockDim.y,
-                                threadIdx.z + blockIdx.z * blockDim.z};
-
-    assert(src_idx.x < DCONST(AC_nx_max) && src_idx.y < DCONST(AC_ny_max) &&
-           src_idx.z < DCONST(AC_nz_max));
-    assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
-    assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);
-
-    dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(
-        src0[IDX(src_idx)], src1[IDX(src_idx)], src2[IDX(src_idx)]);
-}
-
-template <ReduceFunc reduce>
-__global__ void
-kernel_reduce(AcReal* scratchpad, const int num_elems)
-{
-    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
-
-    extern __shared__ AcReal smem[];
-    if (idx < num_elems) {
-        smem[threadIdx.x] = scratchpad[idx];
-    }
-    else {
-        smem[threadIdx.x] = NAN;
-    }
-    __syncthreads();
-
-    int offset = blockDim.x / 2;
-    assert(offset % 2 == 0);
-    while (offset > 0) {
-        if (threadIdx.x < offset) {
-            smem[threadIdx.x] = reduce(smem[threadIdx.x], smem[threadIdx.x + offset]);
-        }
-        offset /= 2;
-        __syncthreads();
-    }
-
-    if (threadIdx.x == 0) {
-        scratchpad[idx] = smem[threadIdx.x];
-    }
-}
-
-template <ReduceFunc reduce>
-__global__ void
-kernel_reduce_block(const __restrict__ AcReal* scratchpad, const int num_blocks,
-                    const int block_size, AcReal* result)
-{
-    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
-    if (idx != 0) {
-        return;
-    }
-
-    AcReal res = scratchpad[0];
-    for (int i = 1; i < num_blocks; ++i) {
-        res = reduce(res, scratchpad[i * block_size]);
-    }
-    *result = res;
-}
-
-static AcReal
-reduce_scal(const cudaStream_t stream, const ReductionType rtype, const int3& start,
-            const int3& end, const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result)
-{
-    const unsigned nx        = end.x - start.x;
-    const unsigned ny        = end.y - start.y;
-    const unsigned nz        = end.z - start.z;
-    const unsigned num_elems = nx * ny * nz;
-
-    const dim3 tpb_filter(32, 4, 1);
-    const dim3 bpg_filter((unsigned int)ceil(nx / AcReal(tpb_filter.x)),
-                          (unsigned int)ceil(ny / AcReal(tpb_filter.y)),
-                          (unsigned int)ceil(nz / AcReal(tpb_filter.z)));
-
-    const int tpb_reduce = 128;
-    const int bpg_reduce = num_elems / tpb_reduce;
-
-    ERRCHK(nx >= tpb_filter.x);
-    ERRCHK(ny >= tpb_filter.y);
-    ERRCHK(nz >= tpb_filter.z);
-    ERRCHK(tpb_reduce <= num_elems);
-    ERRCHK(nx * ny * nz % 2 == 0);
-
-    // clang-format off
-    if (rtype == RTYPE_MAX) {
-        kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
-        kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dmax><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else if (rtype == RTYPE_MIN) {
-        kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
-        kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dmin><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else if (rtype == RTYPE_RMS) {
-        kernel_filter<dsquared><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
-        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else if (rtype == RTYPE_RMS_EXP) {
-        kernel_filter<dexp_squared><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
-        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else if (rtype == RTYPE_SUM) {
-        kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
-        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else {
-        ERROR("Unrecognized rtype");
-    }
-    // clang-format on
-    cudaStreamSynchronize(stream);
-    AcReal result;
-    cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
-    return result;
-}
-
-static AcReal
-reduce_vec(const cudaStream_t stream, const ReductionType rtype, const int3& start, const int3& end,
+AcReal
+acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, const int3& start, const int3& end,
           const AcReal* vtxbuf0, const AcReal* vtxbuf1, const AcReal* vtxbuf2, AcReal* scratchpad,
-           AcReal* reduce_result)
-{
-    const unsigned nx        = end.x - start.x;
-    const unsigned ny        = end.y - start.y;
-    const unsigned nz        = end.z - start.z;
-    const unsigned num_elems = nx * ny * nz;
-
-    const dim3 tpb_filter(32, 4, 1);
-    const dim3 bpg_filter((unsigned int)ceil(nx / AcReal(tpb_filter.x)),
-                          (unsigned int)ceil(ny / AcReal(tpb_filter.y)),
-                          (unsigned int)ceil(nz / AcReal(tpb_filter.z)));
-
-    const int tpb_reduce = 128;
-    const int bpg_reduce = num_elems / tpb_reduce;
-
-    ERRCHK(nx >= tpb_filter.x);
-    ERRCHK(ny >= tpb_filter.y);
-    ERRCHK(nz >= tpb_filter.z);
-    ERRCHK(tpb_reduce <= num_elems);
-    ERRCHK(nx * ny * nz % 2 == 0);
-
-    // clang-format off
-    if (rtype == RTYPE_MAX) {
-        kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
-        kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dmax><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else if (rtype == RTYPE_MIN) {
-        kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
-        kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dmin><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else if (rtype == RTYPE_RMS) {
-        kernel_filter_vec<dsquared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
-        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else if (rtype == RTYPE_RMS_EXP) {
-        kernel_filter_vec<dexp_squared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
-        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else if (rtype == RTYPE_SUM) {
-        kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
-        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else {
-        ERROR("Unrecognized rtype");
-    }
-    // clang-format on
-
-    cudaStreamSynchronize(stream);
-    AcReal result;
-    cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
-    return result;
-}
+           AcReal* reduce_result);
--- a/src/core/node.cc
+++ b/src/core/node.cc