Merge branch 'master' into sink_20190723

2019-08-07 10:46:23 +08:00
parent aebf4f11ae a6fca069a7
commit e199ca196d
24 changed files with 849 additions and 437 deletions
--- a/src/core/device.cu
+++ b/src/core/device.cu
@@ -48,7 +48,9 @@ __constant__ Grid globalGrid;
 #define DCONST_REAL3(X) (d_mesh_info.real3_params[X])
 #define DEVICE_VTXBUF_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_mx) + (k)*DCONST_INT(AC_mxy))
 #define DEVICE_1D_COMPDOMAIN_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_nx) + (k)*DCONST_INT(AC_nxy))
-#include "kernels/kernels.cuh"
+#include "kernels/boundconds.cuh"
+#include "kernels/integration.cuh"
+#include "kernels/reductions.cuh"

 static dim3 rk3_tpb = (dim3){32, 1, 4};

--- a/src/core/kernels/boundconds.cuh
+++ b/src/core/kernels/boundconds.cuh
@@ -0,0 +1,87 @@
+/*
+    Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
+ * \brief Brief info.
+ *
+ * Detailed info.
+ *
+ */
+#pragma once
+
+__global__ void
+kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vtxbuf)
+{
+    const int i_dst = start.x + threadIdx.x + blockIdx.x * blockDim.x;
+    const int j_dst = start.y + threadIdx.y + blockIdx.y * blockDim.y;
+    const int k_dst = start.z + threadIdx.z + blockIdx.z * blockDim.z;
+
+    // If within the start-end range (this allows threadblock dims that are not
+    // divisible by end - start)
+    if (i_dst >= end.x || j_dst >= end.y || k_dst >= end.z)
+        return;
+
+    // if (i_dst >= DCONST_INT(AC_mx) || j_dst >= DCONST_INT(AC_my) || k_dst >= DCONST_INT(AC_mz))
+    //    return;
+
+    // If destination index is inside the computational domain, return since
+    // the boundary conditions are only applied to the ghost zones
+    if (i_dst >= DCONST_INT(AC_nx_min) && i_dst < DCONST_INT(AC_nx_max) &&
+        j_dst >= DCONST_INT(AC_ny_min) && j_dst < DCONST_INT(AC_ny_max) &&
+        k_dst >= DCONST_INT(AC_nz_min) && k_dst < DCONST_INT(AC_nz_max))
+        return;
+
+    // Find the source index
+    // Map to nx, ny, nz coordinates
+    int i_src = i_dst - DCONST_INT(AC_nx_min);
+    int j_src = j_dst - DCONST_INT(AC_ny_min);
+    int k_src = k_dst - DCONST_INT(AC_nz_min);
+
+    // Translate (s.t. the index is always positive)
+    i_src += DCONST_INT(AC_nx);
+    j_src += DCONST_INT(AC_ny);
+    k_src += DCONST_INT(AC_nz);
+
+    // Wrap
+    i_src %= DCONST_INT(AC_nx);
+    j_src %= DCONST_INT(AC_ny);
+    k_src %= DCONST_INT(AC_nz);
+
+    // Map to mx, my, mz coordinates
+    i_src += DCONST_INT(AC_nx_min);
+    j_src += DCONST_INT(AC_ny_min);
+    k_src += DCONST_INT(AC_nz_min);
+
+    const int src_idx = DEVICE_VTXBUF_IDX(i_src, j_src, k_src);
+    const int dst_idx = DEVICE_VTXBUF_IDX(i_dst, j_dst, k_dst);
+    vtxbuf[dst_idx]   = vtxbuf[src_idx];
+}
+
+void
+periodic_boundconds(const cudaStream_t stream, const int3& start, const int3& end, AcReal* vtxbuf)
+{
+    const dim3 tpb(8, 2, 8);
+    const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x),
+                   (unsigned int)ceil((end.y - start.y) / (float)tpb.y),
+                   (unsigned int)ceil((end.z - start.z) / (float)tpb.z));
+
+    kernel_periodic_boundconds<<<bpg, tpb, 0, stream>>>(start, end, vtxbuf);
+    ERRCHK_CUDA_KERNEL();
+}
--- a/src/core/kernels/deprecated/boundconds_PLACEHOLDER.cuh
+++ b/src/core/kernels/deprecated/boundconds_PLACEHOLDER.cuh
--- a/src/core/kernels/deprecated/reduce_PLACEHOLDER.cuh
+++ b/src/core/kernels/deprecated/reduce_PLACEHOLDER.cuh
--- a/src/core/kernels/deprecated/rk3_PLACEHOLDER.cuh
+++ b/src/core/kernels/deprecated/rk3_PLACEHOLDER.cuh
--- a/src/core/kernels/integration.cuh
+++ b/src/core/kernels/integration.cuh
@@ -26,67 +26,6 @@
 */
 #pragma once

-__global__ void
-kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vtxbuf)
-{
-    const int i_dst = start.x + threadIdx.x + blockIdx.x * blockDim.x;
-    const int j_dst = start.y + threadIdx.y + blockIdx.y * blockDim.y;
-    const int k_dst = start.z + threadIdx.z + blockIdx.z * blockDim.z;
-
-    // If within the start-end range (this allows threadblock dims that are not
-    // divisible by end - start)
-    if (i_dst >= end.x || j_dst >= end.y || k_dst >= end.z)
-        return;
-
-    // if (i_dst >= DCONST_INT(AC_mx) || j_dst >= DCONST_INT(AC_my) || k_dst >= DCONST_INT(AC_mz))
-    //    return;
-
-    // If destination index is inside the computational domain, return since
-    // the boundary conditions are only applied to the ghost zones
-    if (i_dst >= DCONST_INT(AC_nx_min) && i_dst < DCONST_INT(AC_nx_max) &&
-        j_dst >= DCONST_INT(AC_ny_min) && j_dst < DCONST_INT(AC_ny_max) &&
-        k_dst >= DCONST_INT(AC_nz_min) && k_dst < DCONST_INT(AC_nz_max))
-        return;
-
-    // Find the source index
-    // Map to nx, ny, nz coordinates
-    int i_src = i_dst - DCONST_INT(AC_nx_min);
-    int j_src = j_dst - DCONST_INT(AC_ny_min);
-    int k_src = k_dst - DCONST_INT(AC_nz_min);
-
-    // Translate (s.t. the index is always positive)
-    i_src += DCONST_INT(AC_nx);
-    j_src += DCONST_INT(AC_ny);
-    k_src += DCONST_INT(AC_nz);
-
-    // Wrap
-    i_src %= DCONST_INT(AC_nx);
-    j_src %= DCONST_INT(AC_ny);
-    k_src %= DCONST_INT(AC_nz);
-
-    // Map to mx, my, mz coordinates
-    i_src += DCONST_INT(AC_nx_min);
-    j_src += DCONST_INT(AC_ny_min);
-    k_src += DCONST_INT(AC_nz_min);
-
-    const int src_idx = DEVICE_VTXBUF_IDX(i_src, j_src, k_src);
-    const int dst_idx = DEVICE_VTXBUF_IDX(i_dst, j_dst, k_dst);
-    vtxbuf[dst_idx]   = vtxbuf[src_idx];
-}
-
-void
-periodic_boundconds(const cudaStream_t stream, const int3& start, const int3& end, AcReal* vtxbuf)
-{
-    const dim3 tpb(8, 2, 8);
-    const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x),
-                   (unsigned int)ceil((end.y - start.y) / (float)tpb.y),
-                   (unsigned int)ceil((end.z - start.z) / (float)tpb.z));
-
-    kernel_periodic_boundconds<<<bpg, tpb, 0, stream>>>(start, end, vtxbuf);
-    ERRCHK_CUDA_KERNEL();
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
 #include <assert.h>

 static __device__ __forceinline__ int
@@ -120,17 +59,18 @@ create_rotz(const AcReal radians)
 }

 #if AC_DOUBLE_PRECISION == 0
+/*
+// Fast but inaccurate
 #define sin __sinf
 #define cos __cosf
 #define exp __expf
+*/
+#define sin sinf
+#define cos cosf
+#define exp expf
 #define rsqrt rsqrtf // hardware reciprocal sqrt
 #endif               // AC_DOUBLE_PRECISION == 0

-/*
-typedef struct {
-    int i, j, k;
-} int3;*/
-
 /*
 * =============================================================================
 * Level 0 (Input Assembly Stage)
@@ -700,265 +640,3 @@ read_out(const int idx, AcReal* __restrict__ field[], const int3 handle)
    const int idx = IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z);

 #include "stencil_process.cuh"
-
-/*
- * =============================================================================
- * Level 2 (Host calls)
- * =============================================================================
- */
-
-////////////////REDUCE///////////////////////////
-#include "src/core/math_utils.h" // is_power_of_two
-
-/*
-Reduction steps:
- 1 of 3: Compute the initial value (a, a*a or exp(a)*exp(a)) and put the result in scratchpad
- 2 of 3: Compute most of the reductions into a single block of data
- 3 of 3: After all results have been stored into the final block, reduce the data in the final block
-*/
-
-// Function pointer definitions
-typedef AcReal (*FilterFunc)(const AcReal&);
-typedef AcReal (*FilterFuncVec)(const AcReal&, const AcReal&, const AcReal&);
-typedef AcReal (*ReduceFunc)(const AcReal&, const AcReal&);
-
-// clang-format off
-/* Comparison funcs */
-static __device__ inline AcReal
-dmax(const AcReal& a, const AcReal& b) { return a > b ? a : b; }
-
-static __device__ inline AcReal
-dmin(const AcReal& a, const AcReal& b) { return a < b ? a : b; }
-
-static __device__ inline AcReal
-dsum(const AcReal& a, const AcReal& b) { return a + b; }
-
-/* Function used to determine the values used during reduction */
-static __device__ inline AcReal
-dvalue(const AcReal& a) { return AcReal(a); }
-
-static __device__ inline AcReal
-dsquared(const AcReal& a) { return (AcReal)(a*a); }
-
-static __device__ inline AcReal
-dexp_squared(const AcReal& a) { return exp(a)*exp(a); }
-
-static __device__ inline AcReal
-dlength_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return sqrt(a*a + b*b + c*c); }
-
-static __device__ inline AcReal
-dsquared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return dsquared(a) + dsquared(b) + dsquared(c); }
-
-static __device__ inline AcReal
-dexp_squared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return dexp_squared(a) + dexp_squared(b) + dexp_squared(c); }
-// clang-format on
-
-#include <assert.h>
-template <FilterFunc filter>
-__global__ void
-kernel_filter(const __restrict__ AcReal* src, const int3 start, const int3 end, AcReal* dst)
-{
-    const int3 src_idx = (int3){start.x + threadIdx.x + blockIdx.x * blockDim.x,
-                                start.y + threadIdx.y + blockIdx.y * blockDim.y,
-                                start.z + threadIdx.z + blockIdx.z * blockDim.z};
-
-    const int nx = end.x - start.x;
-    const int ny = end.y - start.y;
-    const int nz = end.z - start.z;
-    (void)nz; // Suppressed unused variable warning when not compiling with debug flags
-
-    const int3 dst_idx = (int3){threadIdx.x + blockIdx.x * blockDim.x,
-                                threadIdx.y + blockIdx.y * blockDim.y,
-                                threadIdx.z + blockIdx.z * blockDim.z};
-
-    assert(src_idx.x < DCONST_INT(AC_nx_max) && src_idx.y < DCONST_INT(AC_ny_max) &&
-           src_idx.z < DCONST_INT(AC_nz_max));
-    assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
-    assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);
-
-    dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(src[IDX(src_idx)]);
-}
-
-template <FilterFuncVec filter>
-__global__ void
-kernel_filter_vec(const __restrict__ AcReal* src0, const __restrict__ AcReal* src1,
-                  const __restrict__ AcReal* src2, const int3 start, const int3 end, AcReal* dst)
-{
-    const int3 src_idx = (int3){start.x + threadIdx.x + blockIdx.x * blockDim.x,
-                                start.y + threadIdx.y + blockIdx.y * blockDim.y,
-                                start.z + threadIdx.z + blockIdx.z * blockDim.z};
-
-    const int nx = end.x - start.x;
-    const int ny = end.y - start.y;
-    const int nz = end.z - start.z;
-    (void)nz; // Suppressed unused variable warning when not compiling with debug flags
-
-    const int3 dst_idx = (int3){threadIdx.x + blockIdx.x * blockDim.x,
-                                threadIdx.y + blockIdx.y * blockDim.y,
-                                threadIdx.z + blockIdx.z * blockDim.z};
-
-    assert(src_idx.x < DCONST_INT(AC_nx_max) && src_idx.y < DCONST_INT(AC_ny_max) &&
-           src_idx.z < DCONST_INT(AC_nz_max));
-    assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
-    assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);
-
-    dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(
-        src0[IDX(src_idx)], src1[IDX(src_idx)], src2[IDX(src_idx)]);
-}
-
-template <ReduceFunc reduce>
-__global__ void
-kernel_reduce(AcReal* scratchpad, const int num_elems)
-{
-    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
-
-    extern __shared__ AcReal smem[];
-    if (idx < num_elems) {
-        smem[threadIdx.x] = scratchpad[idx];
-    }
-    else {
-        smem[threadIdx.x] = NAN;
-    }
-    __syncthreads();
-
-    int offset = blockDim.x / 2;
-    assert(offset % 2 == 0);
-    while (offset > 0) {
-        if (threadIdx.x < offset) {
-            smem[threadIdx.x] = reduce(smem[threadIdx.x], smem[threadIdx.x + offset]);
-        }
-        offset /= 2;
-        __syncthreads();
-    }
-
-    if (threadIdx.x == 0) {
-        scratchpad[idx] = smem[threadIdx.x];
-    }
-}
-
-template <ReduceFunc reduce>
-__global__ void
-kernel_reduce_block(const __restrict__ AcReal* scratchpad, const int num_blocks,
-                    const int block_size, AcReal* result)
-{
-    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
-    if (idx != 0) {
-        return;
-    }
-
-    AcReal res = scratchpad[0];
-    for (int i = 1; i < num_blocks; ++i) {
-        res = reduce(res, scratchpad[i * block_size]);
-    }
-    *result = res;
-}
-
-AcReal
-reduce_scal(const cudaStream_t stream, const ReductionType rtype, const int3& start,
-            const int3& end, const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result)
-{
-    const unsigned nx        = end.x - start.x;
-    const unsigned ny        = end.y - start.y;
-    const unsigned nz        = end.z - start.z;
-    const unsigned num_elems = nx * ny * nz;
-
-    const dim3 tpb_filter(32, 4, 1);
-    const dim3 bpg_filter((unsigned int)ceil(nx / AcReal(tpb_filter.x)),
-                          (unsigned int)ceil(ny / AcReal(tpb_filter.y)),
-                          (unsigned int)ceil(nz / AcReal(tpb_filter.z)));
-
-    const int tpb_reduce = 128;
-    const int bpg_reduce = num_elems / tpb_reduce;
-
-    ERRCHK(nx >= tpb_filter.x);
-    ERRCHK(ny >= tpb_filter.y);
-    ERRCHK(nz >= tpb_filter.z);
-    ERRCHK(tpb_reduce <= num_elems);
-    ERRCHK(nx * ny * nz % 2 == 0);
-
-    // clang-format off
-    if (rtype == RTYPE_MAX) {
-        kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
-        kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dmax><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else if (rtype == RTYPE_MIN) {
-        kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
-        kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dmin><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else if (rtype == RTYPE_RMS) {
-        kernel_filter<dsquared><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
-        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else if (rtype == RTYPE_RMS_EXP) {
-        kernel_filter<dexp_squared><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
-        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else if (rtype == RTYPE_SUM) {
-        kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
-        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else {
-        ERROR("Unrecognized rtype");
-    }
-    // clang-format on
-    cudaStreamSynchronize(stream);
-    AcReal result;
-    cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
-    return result;
-}
-
-AcReal
-reduce_vec(const cudaStream_t stream, const ReductionType rtype, const int3& start, const int3& end,
-           const AcReal* vtxbuf0, const AcReal* vtxbuf1, const AcReal* vtxbuf2, AcReal* scratchpad,
-           AcReal* reduce_result)
-{
-    const unsigned nx        = end.x - start.x;
-    const unsigned ny        = end.y - start.y;
-    const unsigned nz        = end.z - start.z;
-    const unsigned num_elems = nx * ny * nz;
-
-    const dim3 tpb_filter(32, 4, 1);
-    const dim3 bpg_filter((unsigned int)ceil(nx / AcReal(tpb_filter.x)),
-                          (unsigned int)ceil(ny / AcReal(tpb_filter.y)),
-                          (unsigned int)ceil(nz / AcReal(tpb_filter.z)));
-
-    const int tpb_reduce = 128;
-    const int bpg_reduce = num_elems / tpb_reduce;
-
-    ERRCHK(nx >= tpb_filter.x);
-    ERRCHK(ny >= tpb_filter.y);
-    ERRCHK(nz >= tpb_filter.z);
-    ERRCHK(tpb_reduce <= num_elems);
-    ERRCHK(nx * ny * nz % 2 == 0);
-
-    // clang-format off
-    if (rtype == RTYPE_MAX) {
-        kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
-        kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dmax><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else if (rtype == RTYPE_MIN) {
-        kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
-        kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dmin><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else if (rtype == RTYPE_RMS) {
-        kernel_filter_vec<dsquared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
-        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else if (rtype == RTYPE_RMS_EXP) {
-        kernel_filter_vec<dexp_squared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
-        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else if (rtype == RTYPE_SUM) {
-        kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
-        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else {
-        ERROR("Unrecognized rtype");
-    }
-    // clang-format on
-
-    cudaStreamSynchronize(stream);
-    AcReal result;
-    cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
-    return result;
-}
--- a/src/core/kernels/reductions.cuh
+++ b/src/core/kernels/reductions.cuh
@@ -0,0 +1,282 @@
+/*
+    Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
+ * \brief Brief info.
+ *
+ * Detailed info.
+ *
+ */
+#pragma once
+
+#include "src/core/math_utils.h" // is_power_of_two
+
+/*
+Reduction steps:
+ 1 of 3: Compute the initial value (a, a*a or exp(a)*exp(a)) and put the result in scratchpad
+ 2 of 3: Compute most of the reductions into a single block of data
+ 3 of 3: After all results have been stored into the final block, reduce the data in the final block
+*/
+
+// Function pointer definitions
+typedef AcReal (*FilterFunc)(const AcReal&);
+typedef AcReal (*FilterFuncVec)(const AcReal&, const AcReal&, const AcReal&);
+typedef AcReal (*ReduceFunc)(const AcReal&, const AcReal&);
+
+// clang-format off
+/* Comparison funcs */
+static __device__ inline AcReal
+dmax(const AcReal& a, const AcReal& b) { return a > b ? a : b; }
+
+static __device__ inline AcReal
+dmin(const AcReal& a, const AcReal& b) { return a < b ? a : b; }
+
+static __device__ inline AcReal
+dsum(const AcReal& a, const AcReal& b) { return a + b; }
+
+/* Function used to determine the values used during reduction */
+static __device__ inline AcReal
+dvalue(const AcReal& a) { return AcReal(a); }
+
+static __device__ inline AcReal
+dsquared(const AcReal& a) { return (AcReal)(a*a); }
+
+static __device__ inline AcReal
+dexp_squared(const AcReal& a) { return exp(a)*exp(a); }
+
+static __device__ inline AcReal
+dlength_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return sqrt(a*a + b*b + c*c); }
+
+static __device__ inline AcReal
+dsquared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return dsquared(a) + dsquared(b) + dsquared(c); }
+
+static __device__ inline AcReal
+dexp_squared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return dexp_squared(a) + dexp_squared(b) + dexp_squared(c); }
+// clang-format on
+
+#include <assert.h>
+template <FilterFunc filter>
+__global__ void
+kernel_filter(const __restrict__ AcReal* src, const int3 start, const int3 end, AcReal* dst)
+{
+    const int3 src_idx = (int3){start.x + threadIdx.x + blockIdx.x * blockDim.x,
+                                start.y + threadIdx.y + blockIdx.y * blockDim.y,
+                                start.z + threadIdx.z + blockIdx.z * blockDim.z};
+
+    const int nx = end.x - start.x;
+    const int ny = end.y - start.y;
+    const int nz = end.z - start.z;
+    (void)nz; // Suppressed unused variable warning when not compiling with debug flags
+
+    const int3 dst_idx = (int3){threadIdx.x + blockIdx.x * blockDim.x,
+                                threadIdx.y + blockIdx.y * blockDim.y,
+                                threadIdx.z + blockIdx.z * blockDim.z};
+
+    assert(src_idx.x < DCONST_INT(AC_nx_max) && src_idx.y < DCONST_INT(AC_ny_max) &&
+           src_idx.z < DCONST_INT(AC_nz_max));
+    assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
+    assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);
+
+    dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(src[IDX(src_idx)]);
+}
+
+template <FilterFuncVec filter>
+__global__ void
+kernel_filter_vec(const __restrict__ AcReal* src0, const __restrict__ AcReal* src1,
+                  const __restrict__ AcReal* src2, const int3 start, const int3 end, AcReal* dst)
+{
+    const int3 src_idx = (int3){start.x + threadIdx.x + blockIdx.x * blockDim.x,
+                                start.y + threadIdx.y + blockIdx.y * blockDim.y,
+                                start.z + threadIdx.z + blockIdx.z * blockDim.z};
+
+    const int nx = end.x - start.x;
+    const int ny = end.y - start.y;
+    const int nz = end.z - start.z;
+    (void)nz; // Suppressed unused variable warning when not compiling with debug flags
+
+    const int3 dst_idx = (int3){threadIdx.x + blockIdx.x * blockDim.x,
+                                threadIdx.y + blockIdx.y * blockDim.y,
+                                threadIdx.z + blockIdx.z * blockDim.z};
+
+    assert(src_idx.x < DCONST_INT(AC_nx_max) && src_idx.y < DCONST_INT(AC_ny_max) &&
+           src_idx.z < DCONST_INT(AC_nz_max));
+    assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
+    assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);
+
+    dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(
+        src0[IDX(src_idx)], src1[IDX(src_idx)], src2[IDX(src_idx)]);
+}
+
+template <ReduceFunc reduce>
+__global__ void
+kernel_reduce(AcReal* scratchpad, const int num_elems)
+{
+    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+
+    extern __shared__ AcReal smem[];
+    if (idx < num_elems) {
+        smem[threadIdx.x] = scratchpad[idx];
+    }
+    else {
+        smem[threadIdx.x] = NAN;
+    }
+    __syncthreads();
+
+    int offset = blockDim.x / 2;
+    assert(offset % 2 == 0);
+    while (offset > 0) {
+        if (threadIdx.x < offset) {
+            smem[threadIdx.x] = reduce(smem[threadIdx.x], smem[threadIdx.x + offset]);
+        }
+        offset /= 2;
+        __syncthreads();
+    }
+
+    if (threadIdx.x == 0) {
+        scratchpad[idx] = smem[threadIdx.x];
+    }
+}
+
+template <ReduceFunc reduce>
+__global__ void
+kernel_reduce_block(const __restrict__ AcReal* scratchpad, const int num_blocks,
+                    const int block_size, AcReal* result)
+{
+    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (idx != 0) {
+        return;
+    }
+
+    AcReal res = scratchpad[0];
+    for (int i = 1; i < num_blocks; ++i) {
+        res = reduce(res, scratchpad[i * block_size]);
+    }
+    *result = res;
+}
+
+AcReal
+reduce_scal(const cudaStream_t stream, const ReductionType rtype, const int3& start,
+            const int3& end, const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result)
+{
+    const unsigned nx        = end.x - start.x;
+    const unsigned ny        = end.y - start.y;
+    const unsigned nz        = end.z - start.z;
+    const unsigned num_elems = nx * ny * nz;
+
+    const dim3 tpb_filter(32, 4, 1);
+    const dim3 bpg_filter((unsigned int)ceil(nx / AcReal(tpb_filter.x)),
+                          (unsigned int)ceil(ny / AcReal(tpb_filter.y)),
+                          (unsigned int)ceil(nz / AcReal(tpb_filter.z)));
+
+    const int tpb_reduce = 128;
+    const int bpg_reduce = num_elems / tpb_reduce;
+
+    ERRCHK(nx >= tpb_filter.x);
+    ERRCHK(ny >= tpb_filter.y);
+    ERRCHK(nz >= tpb_filter.z);
+    ERRCHK(tpb_reduce <= num_elems);
+    ERRCHK(nx * ny * nz % 2 == 0);
+
+    // clang-format off
+    if (rtype == RTYPE_MAX) {
+        kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
+        kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
+        kernel_reduce_block<dmax><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    } else if (rtype == RTYPE_MIN) {
+        kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
+        kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
+        kernel_reduce_block<dmin><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    } else if (rtype == RTYPE_RMS) {
+        kernel_filter<dsquared><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
+        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
+        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    } else if (rtype == RTYPE_RMS_EXP) {
+        kernel_filter<dexp_squared><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
+        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
+        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    } else if (rtype == RTYPE_SUM) {
+        kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
+        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
+        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    } else {
+        ERROR("Unrecognized rtype");
+    }
+    // clang-format on
+    cudaStreamSynchronize(stream);
+    AcReal result;
+    cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
+    return result;
+}
+
+AcReal
+reduce_vec(const cudaStream_t stream, const ReductionType rtype, const int3& start, const int3& end,
+           const AcReal* vtxbuf0, const AcReal* vtxbuf1, const AcReal* vtxbuf2, AcReal* scratchpad,
+           AcReal* reduce_result)
+{
+    const unsigned nx        = end.x - start.x;
+    const unsigned ny        = end.y - start.y;
+    const unsigned nz        = end.z - start.z;
+    const unsigned num_elems = nx * ny * nz;
+
+    const dim3 tpb_filter(32, 4, 1);
+    const dim3 bpg_filter((unsigned int)ceil(nx / AcReal(tpb_filter.x)),
+                          (unsigned int)ceil(ny / AcReal(tpb_filter.y)),
+                          (unsigned int)ceil(nz / AcReal(tpb_filter.z)));
+
+    const int tpb_reduce = 128;
+    const int bpg_reduce = num_elems / tpb_reduce;
+
+    ERRCHK(nx >= tpb_filter.x);
+    ERRCHK(ny >= tpb_filter.y);
+    ERRCHK(nz >= tpb_filter.z);
+    ERRCHK(tpb_reduce <= num_elems);
+    ERRCHK(nx * ny * nz % 2 == 0);
+
+    // clang-format off
+    if (rtype == RTYPE_MAX) {
+        kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
+        kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
+        kernel_reduce_block<dmax><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    } else if (rtype == RTYPE_MIN) {
+        kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
+        kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
+        kernel_reduce_block<dmin><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    } else if (rtype == RTYPE_RMS) {
+        kernel_filter_vec<dsquared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
+        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
+        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    } else if (rtype == RTYPE_RMS_EXP) {
+        kernel_filter_vec<dexp_squared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
+        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
+        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    } else if (rtype == RTYPE_SUM) {
+        kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
+        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
+        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    } else {
+        ERROR("Unrecognized rtype");
+    }
+    // clang-format on
+
+    cudaStreamSynchronize(stream);
+    AcReal result;
+    cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
+    return result;
+}
--- a/src/standalone/autotest.cc
+++ b/src/standalone/autotest.cc
@@ -236,6 +236,14 @@ check_reductions(const AcMeshInfo& config)
        acLoad(*mesh);

        for (int rtype = 0; rtype < NUM_REDUCTION_TYPES; ++rtype) {
+
+            if (rtype == RTYPE_SUM) {
+                // Skip SUM test for now. The failure is either caused by floating-point
+                // cancellation or an actual issue
+                WARNING("Skipping RTYPE_SUM test\n");
+                continue;
+            }
+
            const VertexBufferHandle ftype = VTXBUF_UUX;

            // Scal
@@ -337,6 +345,8 @@ verify_meshes(const ModelMesh& model, const AcMesh& candidate)
                printf("Index (%d, %d, %d)\n", i0, j0, k0);
                print_debug_info(model_val, cand_val, range);
                retval = false;
+                printf("Breaking\n");
+                break;
            }

            const ModelScalar abs_error = get_absolute_error(model_val, cand_val);
@@ -412,6 +422,12 @@ check_rk3(const AcMeshInfo& mesh_info)
            // const AcReal dt   = host_timestep(umax, mesh_info);
            const AcReal dt = AcReal(1e-2); // Use a small constant timestep to avoid instabilities

+#if LFORCING
+            const ForcingParams forcing_params = generateForcingParams(model_mesh->info);
+            loadForcingParamsToHost(forcing_params, model_mesh);
+            loadForcingParamsToDevice(forcing_params);
+#endif
+
            acIntegrate(dt);

            model_rk3(dt, model_mesh);
--- a/src/standalone/model/host_forcing.cc
+++ b/src/standalone/model/host_forcing.cc
@@ -255,6 +255,30 @@ loadForcingParamsToHost(const ForcingParams& forcing_params, AcMesh* mesh)
    mesh->info.real_params[AC_kaver] = forcing_params.kaver;
 }

+void
+loadForcingParamsToHost(const ForcingParams& forcing_params, ModelMesh* mesh)
+{
+    // %JP: Left some regex magic here in case we need to modify the ForcingParams struct
+    // acLoadDeviceConstant\(([A-Za-z_]*), ([a-z_.]*)\);
+    // mesh->info.real_params[$1] = $2;
+    mesh->info.real_params[AC_forcing_magnitude] = forcing_params.magnitude;
+    mesh->info.real_params[AC_forcing_phase]     = forcing_params.phase;
+
+    mesh->info.real_params[AC_k_forcex] = forcing_params.k_force.x;
+    mesh->info.real_params[AC_k_forcey] = forcing_params.k_force.y;
+    mesh->info.real_params[AC_k_forcez] = forcing_params.k_force.z;
+
+    mesh->info.real_params[AC_ff_hel_rex] = forcing_params.ff_hel_re.x;
+    mesh->info.real_params[AC_ff_hel_rey] = forcing_params.ff_hel_re.y;
+    mesh->info.real_params[AC_ff_hel_rez] = forcing_params.ff_hel_re.z;
+
+    mesh->info.real_params[AC_ff_hel_imx] = forcing_params.ff_hel_im.x;
+    mesh->info.real_params[AC_ff_hel_imy] = forcing_params.ff_hel_im.y;
+    mesh->info.real_params[AC_ff_hel_imz] = forcing_params.ff_hel_im.z;
+
+    mesh->info.real_params[AC_kaver] = forcing_params.kaver;
+}
+
 ForcingParams
 generateForcingParams(const AcMeshInfo& mesh_info)
 {
--- a/src/standalone/model/host_forcing.h
+++ b/src/standalone/model/host_forcing.h
@@ -28,6 +28,8 @@
 #pragma once
 #include "astaroth.h"

+#include "modelmesh.h"
+
 AcReal get_random_number_01();

 AcReal3 cross(const AcReal3& a, const AcReal3& b);
@@ -64,5 +66,6 @@ typedef struct {
 void loadForcingParamsToDevice(const ForcingParams& forcing_params);

 void loadForcingParamsToHost(const ForcingParams& forcing_params, AcMesh* mesh);
+void loadForcingParamsToHost(const ForcingParams& forcing_params, ModelMesh* mesh);

 ForcingParams generateForcingParams(const AcMeshInfo& mesh_info);
--- a/src/standalone/model/model_rk3.cc
+++ b/src/standalone/model/model_rk3.cc
@@ -560,11 +560,9 @@ momentum(const ModelVectorData& uu, const ModelScalarData& lnrho
 #else
    // !!!!!!!!!!!!!!!!%JP: NOTE TODO IMPORTANT!!!!!!!!!!!!!!!!!!!!!!!!
    // NOT CHECKED FOR CORRECTNESS: USE AT YOUR OWN RISK
-    const ModelMatrix S   = stress_tensor(uu);
-    const ModelScalar cs2 = get(AC_cs2_sound) *
-                            expl((get(AC_gamma) - 1) * (value(lnrho) - get(AC_lnrho0)));
+    const ModelMatrix S = stress_tensor(uu);

-    const ModelVector mom = -mul(gradients(uu), value(uu)) - cs2 * gradient(lnrho) +
+    const ModelVector mom = -mul(gradients(uu), value(uu)) - get(AC_cs2_sound) * gradient(lnrho) +
                            get(AC_nu_visc) * (laplace_vec(uu) +
                                               ModelScalar(1. / 3.) * gradient_of_divergence(uu) +
                                               ModelScalar(2.) * mul(S, gradient(lnrho))) +
@@ -662,15 +660,13 @@ is_valid(const ModelVector& a)
 }

 #if LFORCING
-// FORCING NOT SUPPORTED FOR AUTOTEST
-
-static inline ModelVector
+ModelVector
 simple_vortex_forcing(ModelVector a, ModelVector b, ModelScalar magnitude)
 {
    return magnitude * cross(normalized(b - a), (ModelVector){0, 0, 1}); // Vortex
 }

-static inline ModelVector
+ModelVector
 simple_outward_flow_forcing(ModelVector a, ModelVector b, ModelScalar magnitude)
 {
    return magnitude * (1 / length(b - a)) * normalized(b - a); // Outward flow
@@ -678,22 +674,24 @@ simple_outward_flow_forcing(ModelVector a, ModelVector b, ModelScalar magnitude)

 // The Pencil Code forcing_hel_noshear(), manual Eq. 222, inspired forcing function with adjustable
 // helicity
-static inline ModelVector
-helical_forcing(ModelScalar /* magnitude */, ModelVector k_force, ModelVector xx, ModelVector ff_re,
+ModelVector
+helical_forcing(ModelScalar magnitude, ModelVector k_force, ModelVector xx, ModelVector ff_re,
                ModelVector ff_im, ModelScalar phi)
 {
+    (void)magnitude; // WARNING: unused
+    xx.x = xx.x * (2.0l * M_PI / (get(AC_dsx) * (get(AC_ny_max) - get(AC_ny_min))));
+    xx.y = xx.y * (2.0l * M_PI / (get(AC_dsy) * (get(AC_ny_max) - get(AC_ny_min))));
+    xx.z = xx.z * (2.0l * M_PI / (get(AC_dsz) * (get(AC_ny_max) - get(AC_ny_min))));

-    xx.x = xx.x * (2.0 * M_PI / (get(AC_dsx) * (get(AC_ny_max) - get(AC_ny_min))));
-    xx.y = xx.y * (2.0 * M_PI / (get(AC_dsy) * (get(AC_ny_max) - get(AC_ny_min))));
-    xx.z = xx.z * (2.0 * M_PI / (get(AC_dsz) * (get(AC_ny_max) - get(AC_ny_min))));
-
-    ModelScalar cos_phi     = cosl(phi);
-    ModelScalar sin_phi     = sinl(phi);
-    ModelScalar cos_k_dox_x = cosl(dot(k_force, xx));
-    ModelScalar sin_k_dox_x = sinl(dot(k_force, xx));
+    ModelScalar cosl_phi     = cosl(phi);
+    ModelScalar sinl_phi     = sinl(phi);
+    ModelScalar cosl_k_dox_x = cosl(dot(k_force, xx));
+    ModelScalar sinl_k_dox_x = sinl(dot(k_force, xx));
    // Phase affect only the x-component
-    ModelScalar real_comp_phase = cos_k_dox_x * cos_phi - sin_k_dox_x * sin_phi;
-    ModelScalar imag_comp_phase = cos_k_dox_x * sin_phi + sin_k_dox_x * cos_phi;
+    // ModelScalar real_comp       = cosl_k_dox_x;
+    // ModelScalar imag_comp       = sinl_k_dox_x;
+    ModelScalar real_comp_phase = cosl_k_dox_x * cosl_phi - sinl_k_dox_x * sinl_phi;
+    ModelScalar imag_comp_phase = cosl_k_dox_x * sinl_phi + sinl_k_dox_x * cosl_phi;

    ModelVector force = (ModelVector){ff_re.x * real_comp_phase - ff_im.x * imag_comp_phase,
                                      ff_re.y * real_comp_phase - ff_im.y * imag_comp_phase,
@@ -702,18 +700,17 @@ helical_forcing(ModelScalar /* magnitude */, ModelVector k_force, ModelVector xx
    return force;
 }

-static inline ModelVector
+ModelVector
 forcing(int3 globalVertexIdx, ModelScalar dt)
 {
-    /*
-    ModelVector a  = ModelScalar(.5) * (ModelVector){get(AC_nx) * get(AC_dsx),
+    ModelVector a = ModelScalar(.5) * (ModelVector){get(AC_nx) * get(AC_dsx),
                                                    get(AC_ny) * get(AC_dsy),
                                                    get(AC_nz) * get(AC_dsz)}; // source (origin)
-                                                    */
-    ModelVector xx = (ModelVector){
-        (globalVertexIdx.x - get(AC_nx_min)) * get(AC_dsx),
-        (globalVertexIdx.y - get(AC_ny_min) * get(AC_dsy)),
-        (globalVertexIdx.z - get(AC_nz_min) * get(AC_dsz))}; // sink (current index)
+    (void)a;                                                                   // WARNING: not used
+    ModelVector xx        = (ModelVector){(globalVertexIdx.x - get(AC_nx_min)) * get(AC_dsx),
+                                   (globalVertexIdx.y - get(AC_ny_min)) * get(AC_dsy),
+                                   (globalVertexIdx.z - get(AC_nz_min)) *
+                                       get(AC_dsz)}; // sink (current index)
    const ModelScalar cs2 = get(AC_cs2_sound);
    const ModelScalar cs  = sqrtl(cs2);

@@ -724,6 +721,11 @@ forcing(int3 globalVertexIdx, ModelScalar dt)
    ModelVector ff_re = (ModelVector){get(AC_ff_hel_rex), get(AC_ff_hel_rey), get(AC_ff_hel_rez)};
    ModelVector ff_im = (ModelVector){get(AC_ff_hel_imx), get(AC_ff_hel_imy), get(AC_ff_hel_imz)};

+    (void)phase;   // WARNING: unused with simple forcing. Should be defined in helical_forcing
+    (void)k_force; // WARNING: unused with simple forcing. Should be defined in helical_forcing
+    (void)ff_re;   // WARNING: unused with simple forcing. Should be defined in helical_forcing
+    (void)ff_im;   // WARNING: unused with simple forcing. Should be defined in helical_forcing
+
    // Determine that forcing funtion type at this point.
    // ModelVector force = simple_vortex_forcing(a, xx, magnitude);
    // ModelVector force = simple_outward_flow_forcing(a, xx, magnitude);
--- a/src/standalone/renderer.cc
+++ b/src/standalone/renderer.cc
@@ -34,6 +34,7 @@
 #include "config_loader.h"
 #include "core/errchk.h"
 #include "core/math_utils.h"
+#include "model/host_forcing.h"
 #include "model/host_memory.h"
 #include "model/host_timestep.h"
 #include "model/model_reduce.h"
@@ -383,6 +384,12 @@ run_renderer(void)
 #if 1
        const AcReal umax = acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
        const AcReal dt   = host_timestep(umax, mesh_info);
+
+#if LFORCING
+        const ForcingParams forcing_params = generateForcingParams(mesh->info);
+        loadForcingParamsToDevice(forcing_params);
+#endif
+
        acIntegrate(dt);
 #else
        ModelMesh* model_mesh = modelmesh_create(mesh->info);