diff --git a/src/core/device.cu b/src/core/device.cu
index 005d02f..90180f4 100644
--- a/src/core/device.cu
+++ b/src/core/device.cu
@@ -48,7 +48,9 @@ __constant__ Grid globalGrid;
 #define DCONST_REAL3(X) (d_mesh_info.real3_params[X])
 #define DEVICE_VTXBUF_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_mx) + (k)*DCONST_INT(AC_mxy))
 #define DEVICE_1D_COMPDOMAIN_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_nx) + (k)*DCONST_INT(AC_nxy))
-#include "kernels/kernels.cuh"
+#include "kernels/boundconds.cuh"
+#include "kernels/integration.cuh"
+#include "kernels/reductions.cuh"
 
 static dim3 rk3_tpb = (dim3){32, 1, 4};
 
diff --git a/src/core/kernels/boundconds.cuh b/src/core/kernels/boundconds.cuh
new file mode 100644
index 0000000..5c70114
--- /dev/null
+++ b/src/core/kernels/boundconds.cuh
@@ -0,0 +1,87 @@
+/*
+    Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
+ * \brief Device-side boundary condition kernels.
+ *
+ * Currently implements periodic boundary conditions: each ghost-zone cell is
+ * filled from its periodic image inside the computational domain.
+ */
+#pragma once
+
+__global__ void
+kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vtxbuf)
+{
+    const int i_dst = start.x + threadIdx.x + blockIdx.x * blockDim.x;
+    const int j_dst = start.y + threadIdx.y + blockIdx.y * blockDim.y;
+    const int k_dst = start.z + threadIdx.z + blockIdx.z * blockDim.z;
+
+    // Return if outside the start-end range (this allows thread block dims
+    // that do not evenly divide end - start)
+    if (i_dst >= end.x || j_dst >= end.y || k_dst >= end.z)
+        return;
+
+    // if (i_dst >= DCONST_INT(AC_mx) || j_dst >= DCONST_INT(AC_my) || k_dst >= DCONST_INT(AC_mz))
+    //    return;
+
+    // If the destination index is inside the computational domain, return, since
+    // the boundary conditions are only applied to the ghost zones
+    if (i_dst >= DCONST_INT(AC_nx_min) && i_dst < DCONST_INT(AC_nx_max) &&
+        j_dst >= DCONST_INT(AC_ny_min) && j_dst < DCONST_INT(AC_ny_max) &&
+        k_dst >= DCONST_INT(AC_nz_min) && k_dst < DCONST_INT(AC_nz_max))
+        return;
+
+    // Find the source index
+    // Map to nx, ny, nz coordinates
+    int i_src = i_dst - DCONST_INT(AC_nx_min);
+    int j_src = j_dst - DCONST_INT(AC_ny_min);
+    int k_src = k_dst - DCONST_INT(AC_nz_min);
+
+    // Translate (s.t. the index is always positive)
+    i_src += DCONST_INT(AC_nx);
+    j_src += DCONST_INT(AC_ny);
+    k_src += DCONST_INT(AC_nz);
+
+    // Wrap
+    i_src %= DCONST_INT(AC_nx);
+    j_src %= DCONST_INT(AC_ny);
+    k_src %= DCONST_INT(AC_nz);
+
+    // Map back to mx, my, mz coordinates
+    i_src += DCONST_INT(AC_nx_min);
+    j_src += DCONST_INT(AC_ny_min);
+    k_src += DCONST_INT(AC_nz_min);
+
+    const int src_idx = DEVICE_VTXBUF_IDX(i_src, j_src, k_src);
+    const int dst_idx = DEVICE_VTXBUF_IDX(i_dst, j_dst, k_dst);
+    vtxbuf[dst_idx] = vtxbuf[src_idx];
+}
+
+void
+periodic_boundconds(const cudaStream_t stream, const int3& start, const int3& end, AcReal* vtxbuf)
+{
+    const dim3 tpb(8, 2, 8);
+    const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x),
+                   (unsigned int)ceil((end.y - start.y) / (float)tpb.y),
+                   (unsigned int)ceil((end.z - start.z) / (float)tpb.z));
+
+    kernel_periodic_boundconds<<<bpg, tpb, 0, stream>>>(start, end, vtxbuf);
+    ERRCHK_CUDA_KERNEL();
+}
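For orientation, the new host wrapper is typically driven once per vertex buffer over the full mesh, including ghost zones. A minimal sketch, assuming the caller holds a vtxbufs[] array of device pointers and the mesh dimensions (mx, my, mz); the function and parameter names here are illustrative, not part of this diff:

    // Hypothetical helper: apply periodic boundconds to every vertex buffer.
    static void
    apply_periodic_boundconds_all(const cudaStream_t stream, const int num_vtxbufs,
                                  AcReal* vtxbufs[], const int3 mesh_dims)
    {
        // Cover the whole mesh [0, m*) so every ghost-zone cell is written;
        // threads landing on interior cells return early inside the kernel.
        const int3 start = (int3){0, 0, 0};
        const int3 end   = mesh_dims;
        for (int w = 0; w < num_vtxbufs; ++w)
            periodic_boundconds(stream, start, end, vtxbufs[w]);
    }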
diff --git a/src/core/kernels/kernels.cuh b/src/core/kernels/integration.cuh
similarity index 61%
rename from src/core/kernels/kernels.cuh
rename to src/core/kernels/integration.cuh
index c604205..3503372 100644
--- a/src/core/kernels/kernels.cuh
+++ b/src/core/kernels/integration.cuh
@@ -26,67 +26,6 @@
  */
 #pragma once
 
-__global__ void
-kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vtxbuf)
-{
-    const int i_dst = start.x + threadIdx.x + blockIdx.x * blockDim.x;
-    const int j_dst = start.y + threadIdx.y + blockIdx.y * blockDim.y;
-    const int k_dst = start.z + threadIdx.z + blockIdx.z * blockDim.z;
-
-    // If within the start-end range (this allows threadblock dims that are not
-    // divisible by end - start)
-    if (i_dst >= end.x || j_dst >= end.y || k_dst >= end.z)
-        return;
-
-    // if (i_dst >= DCONST_INT(AC_mx) || j_dst >= DCONST_INT(AC_my) || k_dst >= DCONST_INT(AC_mz))
-    //    return;
-
-    // If destination index is inside the computational domain, return since
-    // the boundary conditions are only applied to the ghost zones
-    if (i_dst >= DCONST_INT(AC_nx_min) && i_dst < DCONST_INT(AC_nx_max) &&
-        j_dst >= DCONST_INT(AC_ny_min) && j_dst < DCONST_INT(AC_ny_max) &&
-        k_dst >= DCONST_INT(AC_nz_min) && k_dst < DCONST_INT(AC_nz_max))
-        return;
-
-    // Find the source index
-    // Map to nx, ny, nz coordinates
-    int i_src = i_dst - DCONST_INT(AC_nx_min);
-    int j_src = j_dst - DCONST_INT(AC_ny_min);
-    int k_src = k_dst - DCONST_INT(AC_nz_min);
-
-    // Translate (s.t. the index is always positive)
-    i_src += DCONST_INT(AC_nx);
-    j_src += DCONST_INT(AC_ny);
-    k_src += DCONST_INT(AC_nz);
-
-    // Wrap
-    i_src %= DCONST_INT(AC_nx);
-    j_src %= DCONST_INT(AC_ny);
-    k_src %= DCONST_INT(AC_nz);
-
-    // Map to mx, my, mz coordinates
-    i_src += DCONST_INT(AC_nx_min);
-    j_src += DCONST_INT(AC_ny_min);
-    k_src += DCONST_INT(AC_nz_min);
-
-    const int src_idx = DEVICE_VTXBUF_IDX(i_src, j_src, k_src);
-    const int dst_idx = DEVICE_VTXBUF_IDX(i_dst, j_dst, k_dst);
-    vtxbuf[dst_idx] = vtxbuf[src_idx];
-}
-
-void
-periodic_boundconds(const cudaStream_t stream, const int3& start, const int3& end, AcReal* vtxbuf)
-{
-    const dim3 tpb(8, 2, 8);
-    const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x),
-                   (unsigned int)ceil((end.y - start.y) / (float)tpb.y),
-                   (unsigned int)ceil((end.z - start.z) / (float)tpb.z));
-
-    kernel_periodic_boundconds<<<bpg, tpb, 0, stream>>>(start, end, vtxbuf);
-    ERRCHK_CUDA_KERNEL();
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
 #include <assert.h>
 
 static __device__ __forceinline__ int
@@ -700,265 +639,3 @@ read_out(const int idx, AcReal* __restrict__ field[], const int3 handle)
     const int idx = IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z);
 
 #include "stencil_process.cuh"
-
-/*
- * =============================================================================
- * Level 2 (Host calls)
- * =============================================================================
- */
-
-////////////////REDUCE///////////////////////////
-#include "src/core/math_utils.h" // is_power_of_two
-
-/*
-Reduction steps:
- 1 of 3: Compute the initial value (a, a*a or exp(a)*exp(a)) and put the result in scratchpad
- 2 of 3: Compute most of the reductions into a single block of data
- 3 of 3: After all results have been stored into the final block, reduce the data in the final block
-*/
-
-// Function pointer definitions
-typedef AcReal (*FilterFunc)(const AcReal&);
-typedef AcReal (*FilterFuncVec)(const AcReal&, const AcReal&, const AcReal&);
-typedef AcReal (*ReduceFunc)(const AcReal&, const AcReal&);
-
-// clang-format off
-/* Comparison funcs */
-static __device__ inline AcReal
-dmax(const AcReal& a, const AcReal& b) { return a > b ? a : b; }
-
-static __device__ inline AcReal
-dmin(const AcReal& a, const AcReal& b) { return a < b ? a : b; }
-
-static __device__ inline AcReal
-dsum(const AcReal& a, const AcReal& b) { return a + b; }
-
-/* Function used to determine the values used during reduction */
-static __device__ inline AcReal
-dvalue(const AcReal& a) { return AcReal(a); }
-
-static __device__ inline AcReal
-dsquared(const AcReal& a) { return (AcReal)(a*a); }
-
-static __device__ inline AcReal
-dexp_squared(const AcReal& a) { return exp(a)*exp(a); }
-
-static __device__ inline AcReal
-dlength_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return sqrt(a*a + b*b + c*c); }
-
-static __device__ inline AcReal
-dsquared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return dsquared(a) + dsquared(b) + dsquared(c); }
-
-static __device__ inline AcReal
-dexp_squared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return dexp_squared(a) + dexp_squared(b) + dexp_squared(c); }
-// clang-format on
-
-#include <assert.h>
-template <FilterFunc filter>
-__global__ void
-kernel_filter(const __restrict__ AcReal* src, const int3 start, const int3 end, AcReal* dst)
-{
-    const int3 src_idx = (int3){start.x + threadIdx.x + blockIdx.x * blockDim.x,
-                                start.y + threadIdx.y + blockIdx.y * blockDim.y,
-                                start.z + threadIdx.z + blockIdx.z * blockDim.z};
-
-    const int nx = end.x - start.x;
-    const int ny = end.y - start.y;
-    const int nz = end.z - start.z;
-    (void)nz; // Suppressed unused variable warning when not compiling with debug flags
-
-    const int3 dst_idx = (int3){threadIdx.x + blockIdx.x * blockDim.x,
-                                threadIdx.y + blockIdx.y * blockDim.y,
-                                threadIdx.z + blockIdx.z * blockDim.z};
-
-    assert(src_idx.x < DCONST_INT(AC_nx_max) && src_idx.y < DCONST_INT(AC_ny_max) &&
-           src_idx.z < DCONST_INT(AC_nz_max));
-    assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
-    assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);
-
-    dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(src[IDX(src_idx)]);
-}
-
-template <FilterFuncVec filter>
-__global__ void
-kernel_filter_vec(const __restrict__ AcReal* src0, const __restrict__ AcReal* src1,
-                  const __restrict__ AcReal* src2, const int3 start, const int3 end, AcReal* dst)
-{
-    const int3 src_idx = (int3){start.x + threadIdx.x + blockIdx.x * blockDim.x,
-                                start.y + threadIdx.y + blockIdx.y * blockDim.y,
-                                start.z + threadIdx.z + blockIdx.z * blockDim.z};
-
-    const int nx = end.x - start.x;
-    const int ny = end.y - start.y;
-    const int nz = end.z - start.z;
-    (void)nz; // Suppressed unused variable warning when not compiling with debug flags
-
-    const int3 dst_idx = (int3){threadIdx.x + blockIdx.x * blockDim.x,
-                                threadIdx.y + blockIdx.y * blockDim.y,
-                                threadIdx.z + blockIdx.z * blockDim.z};
-
-    assert(src_idx.x < DCONST_INT(AC_nx_max) && src_idx.y < DCONST_INT(AC_ny_max) &&
-           src_idx.z < DCONST_INT(AC_nz_max));
-    assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
-    assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);
-
-    dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(
-        src0[IDX(src_idx)], src1[IDX(src_idx)], src2[IDX(src_idx)]);
-}
-
-template <ReduceFunc reduce>
-__global__ void
-kernel_reduce(AcReal* scratchpad, const int num_elems)
-{
-    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
-
-    extern __shared__ AcReal smem[];
-    if (idx < num_elems) {
-        smem[threadIdx.x] = scratchpad[idx];
-    }
-    else {
-        smem[threadIdx.x] = NAN;
-    }
-    __syncthreads();
-
-    int offset = blockDim.x / 2;
-    assert(offset % 2 == 0);
-    while (offset > 0) {
-        if (threadIdx.x < offset) {
-            smem[threadIdx.x] = reduce(smem[threadIdx.x], smem[threadIdx.x + offset]);
-        }
-        offset /= 2;
-        __syncthreads();
-    }
-
-    if (threadIdx.x == 0) {
-        scratchpad[idx] = smem[threadIdx.x];
-    }
-}
-
-template <ReduceFunc reduce>
-__global__ void
-kernel_reduce_block(const __restrict__ AcReal* scratchpad, const int num_blocks,
-                    const int block_size, AcReal* result)
-{
-    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
-    if (idx != 0) {
-        return;
-    }
-
-    AcReal res = scratchpad[0];
-    for (int i = 1; i < num_blocks; ++i) {
-        res = reduce(res, scratchpad[i * block_size]);
-    }
-    *result = res;
-}
-
-AcReal
-reduce_scal(const cudaStream_t stream, const ReductionType rtype, const int3& start,
-            const int3& end, const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result)
-{
-    const unsigned nx = end.x - start.x;
-    const unsigned ny = end.y - start.y;
-    const unsigned nz = end.z - start.z;
-    const unsigned num_elems = nx * ny * nz;
-
-    const dim3 tpb_filter(32, 4, 1);
-    const dim3 bpg_filter((unsigned int)ceil(nx / AcReal(tpb_filter.x)),
-                          (unsigned int)ceil(ny / AcReal(tpb_filter.y)),
-                          (unsigned int)ceil(nz / AcReal(tpb_filter.z)));
-
-    const int tpb_reduce = 128;
-    const int bpg_reduce = num_elems / tpb_reduce;
-
-    ERRCHK(nx >= tpb_filter.x);
-    ERRCHK(ny >= tpb_filter.y);
-    ERRCHK(nz >= tpb_filter.z);
-    ERRCHK(tpb_reduce <= num_elems);
-    ERRCHK(nx * ny * nz % 2 == 0);
-
-    // clang-format off
-    if (rtype == RTYPE_MAX) {
-        kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
-        kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dmax><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else if (rtype == RTYPE_MIN) {
-        kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
-        kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dmin><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else if (rtype == RTYPE_RMS) {
-        kernel_filter<dsquared><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
-        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else if (rtype == RTYPE_RMS_EXP) {
-        kernel_filter<dexp_squared><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
-        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else if (rtype == RTYPE_SUM) {
-        kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
-        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else {
-        ERROR("Unrecognized rtype");
-    }
-    // clang-format on
-    cudaStreamSynchronize(stream);
-    AcReal result;
-    cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
-    return result;
-}
-
-AcReal
-reduce_vec(const cudaStream_t stream, const ReductionType rtype, const int3& start, const int3& end,
-           const AcReal* vtxbuf0, const AcReal* vtxbuf1, const AcReal* vtxbuf2, AcReal* scratchpad,
-           AcReal* reduce_result)
-{
-    const unsigned nx = end.x - start.x;
-    const unsigned ny = end.y - start.y;
-    const unsigned nz = end.z - start.z;
-    const unsigned num_elems = nx * ny * nz;
-
-    const dim3 tpb_filter(32, 4, 1);
-    const dim3 bpg_filter((unsigned int)ceil(nx / AcReal(tpb_filter.x)),
-                          (unsigned int)ceil(ny / AcReal(tpb_filter.y)),
-                          (unsigned int)ceil(nz / AcReal(tpb_filter.z)));
-
-    const int tpb_reduce = 128;
-    const int bpg_reduce = num_elems / tpb_reduce;
-
-    ERRCHK(nx >= tpb_filter.x);
-    ERRCHK(ny >= tpb_filter.y);
-    ERRCHK(nz >= tpb_filter.z);
-    ERRCHK(tpb_reduce <= num_elems);
-    ERRCHK(nx * ny * nz % 2 == 0);
-
-    // clang-format off
-    if (rtype == RTYPE_MAX) {
-        kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
-        kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dmax><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else if (rtype == RTYPE_MIN) {
-        kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
-        kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dmin><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else if (rtype == RTYPE_RMS) {
-        kernel_filter_vec<dsquared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
-        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else if (rtype == RTYPE_RMS_EXP) {
-        kernel_filter_vec<dexp_squared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
-        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else if (rtype == RTYPE_SUM) {
-        kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
-        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else {
-        ERROR("Unrecognized rtype");
-    }
-    // clang-format on
-
-    cudaStreamSynchronize(stream);
-    AcReal result;
-    cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
-    return result;
-}
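A concrete instance of the wrap arithmetic that just moved out of this file (and lives on in boundconds.cuh): with nx = 128 and a three-cell ghost zone (AC_nx_min = 3, AC_nx_max = 131), the ghost cell i_dst = 1 maps to i_src = ((1 - 3) + 128) % 128 + 3 = 129, i.e. the cell two steps in from the right edge of the computational domain. Adding nx once before the modulo keeps the operand non-negative, which works because the ghost-zone width is always smaller than nx. A host-side reference of the same mapping, e.g. for unit tests; the name is illustrative:

    // Mirror of the device-side wrap: map a ghost-zone index i_dst to the
    // index of its periodic image inside [nx_min, nx_min + nx).
    static int
    periodic_src_index(const int i_dst, const int nx, const int nx_min)
    {
        return ((i_dst - nx_min + nx) % nx) + nx_min;
    }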
diff --git a/src/core/kernels/reductions.cuh b/src/core/kernels/reductions.cuh
new file mode 100644
index 0000000..26697e0
--- /dev/null
+++ b/src/core/kernels/reductions.cuh
@@ -0,0 +1,282 @@
+/*
+    Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
+ * \brief Scalar and vector reductions (max, min, rms, rms_exp, sum).
+ *
+ * A filter kernel first fills a scratchpad with per-element values; a
+ * blockwise tree reduction then collapses it to one value per block.
+ */
+#pragma once
+
+#include "src/core/math_utils.h" // is_power_of_two
+
+/*
+Reduction steps:
+ 1 of 3: Compute the initial value (a, a*a or exp(a)*exp(a)) and put the result in scratchpad
+ 2 of 3: Compute most of the reductions into a single block of data
+ 3 of 3: After all results have been stored into the final block, reduce the data in the final block
+*/
+
+// Function pointer definitions
+typedef AcReal (*FilterFunc)(const AcReal&);
+typedef AcReal (*FilterFuncVec)(const AcReal&, const AcReal&, const AcReal&);
+typedef AcReal (*ReduceFunc)(const AcReal&, const AcReal&);
+
+// clang-format off
+/* Comparison funcs */
+static __device__ inline AcReal
+dmax(const AcReal& a, const AcReal& b) { return a > b ? a : b; }
+
+static __device__ inline AcReal
+dmin(const AcReal& a, const AcReal& b) { return a < b ? a : b; }
+
+static __device__ inline AcReal
+dsum(const AcReal& a, const AcReal& b) { return a + b; }
+
+/* Functions used to determine the values used during reduction */
+static __device__ inline AcReal
+dvalue(const AcReal& a) { return AcReal(a); }
+
+static __device__ inline AcReal
+dsquared(const AcReal& a) { return (AcReal)(a*a); }
+
+static __device__ inline AcReal
+dexp_squared(const AcReal& a) { return exp(a)*exp(a); }
+
+static __device__ inline AcReal
+dlength_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return sqrt(a*a + b*b + c*c); }
+
+static __device__ inline AcReal
+dsquared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return dsquared(a) + dsquared(b) + dsquared(c); }
+
+static __device__ inline AcReal
+dexp_squared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return dexp_squared(a) + dexp_squared(b) + dexp_squared(c); }
+// clang-format on
+
+#include <assert.h>
+template <FilterFunc filter>
+__global__ void
+kernel_filter(const __restrict__ AcReal* src, const int3 start, const int3 end, AcReal* dst)
+{
+    const int3 src_idx = (int3){start.x + threadIdx.x + blockIdx.x * blockDim.x,
+                                start.y + threadIdx.y + blockIdx.y * blockDim.y,
+                                start.z + threadIdx.z + blockIdx.z * blockDim.z};
+
+    const int nx = end.x - start.x;
+    const int ny = end.y - start.y;
+    const int nz = end.z - start.z;
+    (void)nz; // Suppress unused variable warning when not compiling with debug flags
+
+    const int3 dst_idx = (int3){threadIdx.x + blockIdx.x * blockDim.x,
+                                threadIdx.y + blockIdx.y * blockDim.y,
+                                threadIdx.z + blockIdx.z * blockDim.z};
+
+    assert(src_idx.x < DCONST_INT(AC_nx_max) && src_idx.y < DCONST_INT(AC_ny_max) &&
+           src_idx.z < DCONST_INT(AC_nz_max));
+    assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
+    assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);
+
+    dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(src[IDX(src_idx)]);
+}
+
+template <FilterFuncVec filter>
+__global__ void
+kernel_filter_vec(const __restrict__ AcReal* src0, const __restrict__ AcReal* src1,
+                  const __restrict__ AcReal* src2, const int3 start, const int3 end, AcReal* dst)
+{
+    const int3 src_idx = (int3){start.x + threadIdx.x + blockIdx.x * blockDim.x,
+                                start.y + threadIdx.y + blockIdx.y * blockDim.y,
+                                start.z + threadIdx.z + blockIdx.z * blockDim.z};
+
+    const int nx = end.x - start.x;
+    const int ny = end.y - start.y;
+    const int nz = end.z - start.z;
+    (void)nz; // Suppress unused variable warning when not compiling with debug flags
+
+    const int3 dst_idx = (int3){threadIdx.x + blockIdx.x * blockDim.x,
+                                threadIdx.y + blockIdx.y * blockDim.y,
+                                threadIdx.z + blockIdx.z * blockDim.z};
+
+    assert(src_idx.x < DCONST_INT(AC_nx_max) && src_idx.y < DCONST_INT(AC_ny_max) &&
+           src_idx.z < DCONST_INT(AC_nz_max));
+    assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
+    assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);
+
+    dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(
+        src0[IDX(src_idx)], src1[IDX(src_idx)], src2[IDX(src_idx)]);
+}
+
+template <ReduceFunc reduce>
+__global__ void
+kernel_reduce(AcReal* scratchpad, const int num_elems)
+{
+    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+
+    extern __shared__ AcReal smem[];
+    if (idx < num_elems) {
+        smem[threadIdx.x] = scratchpad[idx];
+    }
+    else {
+        smem[threadIdx.x] = NAN;
+    }
+    __syncthreads();
+
+    int offset = blockDim.x / 2;
+    assert(offset % 2 == 0);
+    while (offset > 0) {
+        if (threadIdx.x < offset) {
+            smem[threadIdx.x] = reduce(smem[threadIdx.x], smem[threadIdx.x + offset]);
+        }
+        offset /= 2;
+        __syncthreads();
+    }
+
+    if (threadIdx.x == 0) {
+        scratchpad[idx] = smem[threadIdx.x];
+    }
+}
+
+template <ReduceFunc reduce>
+__global__ void
+kernel_reduce_block(const __restrict__ AcReal* scratchpad, const int num_blocks,
+                    const int block_size, AcReal* result)
+{
+    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (idx != 0) {
+        return;
+    }
+
+    AcReal res = scratchpad[0];
+    for (int i = 1; i < num_blocks; ++i) {
+        res = reduce(res, scratchpad[i * block_size]);
+    }
+    *result = res;
+}
+
+AcReal
+reduce_scal(const cudaStream_t stream, const ReductionType rtype, const int3& start,
+            const int3& end, const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result)
+{
+    const unsigned nx = end.x - start.x;
+    const unsigned ny = end.y - start.y;
+    const unsigned nz = end.z - start.z;
+    const unsigned num_elems = nx * ny * nz;
+
+    const dim3 tpb_filter(32, 4, 1);
+    const dim3 bpg_filter((unsigned int)ceil(nx / AcReal(tpb_filter.x)),
+                          (unsigned int)ceil(ny / AcReal(tpb_filter.y)),
+                          (unsigned int)ceil(nz / AcReal(tpb_filter.z)));
+
+    const int tpb_reduce = 128;
+    const int bpg_reduce = num_elems / tpb_reduce;
+
+    ERRCHK(nx >= tpb_filter.x);
+    ERRCHK(ny >= tpb_filter.y);
+    ERRCHK(nz >= tpb_filter.z);
+    ERRCHK(tpb_reduce <= num_elems);
+    ERRCHK(nx * ny * nz % 2 == 0);
+
+    // clang-format off
+    if (rtype == RTYPE_MAX) {
+        kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
+        kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
+        kernel_reduce_block<dmax><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    } else if (rtype == RTYPE_MIN) {
+        kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
+        kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
+        kernel_reduce_block<dmin><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    } else if (rtype == RTYPE_RMS) {
+        kernel_filter<dsquared><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
+        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
+        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    } else if (rtype == RTYPE_RMS_EXP) {
+        kernel_filter<dexp_squared><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
+        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
+        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    } else if (rtype == RTYPE_SUM) {
+        kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
+        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
+        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    } else {
+        ERROR("Unrecognized rtype");
+    }
+    // clang-format on
+    cudaStreamSynchronize(stream);
+    AcReal result;
+    cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
+    return result;
+}
+
+AcReal
+reduce_vec(const cudaStream_t stream, const ReductionType rtype, const int3& start, const int3& end,
+           const AcReal* vtxbuf0, const AcReal* vtxbuf1, const AcReal* vtxbuf2, AcReal* scratchpad,
+           AcReal* reduce_result)
+{
+    const unsigned nx = end.x - start.x;
+    const unsigned ny = end.y - start.y;
+    const unsigned nz = end.z - start.z;
+    const unsigned num_elems = nx * ny * nz;
+
+    const dim3 tpb_filter(32, 4, 1);
+    const dim3 bpg_filter((unsigned int)ceil(nx / AcReal(tpb_filter.x)),
+                          (unsigned int)ceil(ny / AcReal(tpb_filter.y)),
+                          (unsigned int)ceil(nz / AcReal(tpb_filter.z)));
+
+    const int tpb_reduce = 128;
+    const int bpg_reduce = num_elems / tpb_reduce;
+
+    ERRCHK(nx >= tpb_filter.x);
+    ERRCHK(ny >= tpb_filter.y);
+    ERRCHK(nz >= tpb_filter.z);
+    ERRCHK(tpb_reduce <= num_elems);
+    ERRCHK(nx * ny * nz % 2 == 0);
+
+    // clang-format off
+    if (rtype == RTYPE_MAX) {
+        kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
+        kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
+        kernel_reduce_block<dmax><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    } else if (rtype == RTYPE_MIN) {
+        kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
+        kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
+        kernel_reduce_block<dmin><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    } else if (rtype == RTYPE_RMS) {
+        kernel_filter_vec<dsquared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
+        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
+        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    } else if (rtype == RTYPE_RMS_EXP) {
+        kernel_filter_vec<dexp_squared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
+        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
+        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    } else if (rtype == RTYPE_SUM) {
+        kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
+        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
+        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    } else {
+        ERROR("Unrecognized rtype");
+    }
+    // clang-format on
+
+    cudaStreamSynchronize(stream);
+    AcReal result;
+    cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
+    return result;
+}
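A minimal sketch of how a caller might drive the new reduction entry points; the helper name and buffer management here are illustrative, not part of this diff. Note that reduce_scal expects the caller to provide a device scratchpad with one slot per reduced element plus a one-element result buffer, and that bpg_reduce is computed with integer division, so num_elems is effectively assumed to be a multiple of tpb_reduce (128):

    // Hypothetical usage: maximum of one vertex buffer over start..end.
    AcReal
    example_max_of_vtxbuf(const cudaStream_t stream, const AcReal* vtxbuf,
                          const int3 start, const int3 end)
    {
        const size_t num_elems = (size_t)(end.x - start.x) * (end.y - start.y) *
                                 (end.z - start.z);

        AcReal* scratchpad;    // One slot per reduced element
        AcReal* reduce_result; // The final scalar lands here
        cudaMalloc((void**)&scratchpad, num_elems * sizeof(AcReal));
        cudaMalloc((void**)&reduce_result, sizeof(AcReal));

        // reduce_scal synchronizes the stream and copies the result back.
        const AcReal result = reduce_scal(stream, RTYPE_MAX, start, end, vtxbuf,
                                          scratchpad, reduce_result);

        cudaFree(scratchpad);
        cudaFree(reduce_result);
        return result;
    }

In-tree callers would normally reuse persistent buffers owned by the Device instance rather than allocating per call as above.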