diff --git a/src/core/kernels/kernels.h b/src/core/kernels/kernels.h
index 577567b..fc9c745 100644
--- a/src/core/kernels/kernels.h
+++ b/src/core/kernels/kernels.h
@@ -72,6 +72,12 @@ AcReal acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, c
                          const int3 end, const AcReal* vtxbuf0, const AcReal* vtxbuf1,
                          const AcReal* vtxbuf2, AcReal* scratchpad, AcReal* reduce_result);
 
+/** Reduces a vector field combined with a scalar field (see the RTYPE_ALFVEN_* reduction types) */
+AcReal acKernelReduceVecScal(const cudaStream_t stream, const ReductionType rtype, const int3 start,
+                             const int3 end, const AcReal* vtxbuf0, const AcReal* vtxbuf1,
+                             const AcReal* vtxbuf2, const AcReal* vtxbuf3, AcReal* scratchpad,
+                             AcReal* reduce_result);
+
 #ifdef __cplusplus
 } // extern "C"
 #endif
diff --git a/src/core/kernels/reductions.cuh b/src/core/kernels/reductions.cuh
index 1f40df3..ba9e823 100644
--- a/src/core/kernels/reductions.cuh
+++ b/src/core/kernels/reductions.cuh
@@ -10,6 +10,7 @@ Reduction steps:
 // Function pointer definitions
 typedef AcReal (*FilterFunc)(const AcReal&);
 typedef AcReal (*FilterFuncVec)(const AcReal&, const AcReal&, const AcReal&);
+typedef AcReal (*FilterFuncVecScal)(const AcReal&, const AcReal&, const AcReal&, const AcReal&);
 typedef AcReal (*ReduceFunc)(const AcReal&, const AcReal&);
 
 // clang-format off
@@ -41,6 +42,15 @@ dsquared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return dsquare
 
 static __device__ inline AcReal
 dexp_squared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return dexp_squared(a) + dexp_squared(b) + dexp_squared(c); }
+
+// Alfven speed |B| / sqrt(4 pi rho) from the magnetic field (a, b, c) and
+// the logarithmic density d = ln(rho)
+static __device__ inline AcReal
+dlength_alf(const AcReal& a, const AcReal& b, const AcReal& c, const AcReal& d) { return sqrt(a*a + b*b + c*c) / sqrt(AcReal(4.0)*M_PI*exp(d)); }
+
+// Squared Alfven speed, used for the RMS reduction
+static __device__ inline AcReal
+dsquared_alf(const AcReal& a, const AcReal& b, const AcReal& c, const AcReal& d) { return (dsquared(a) + dsquared(b) + dsquared(c)) / (AcReal(4.0)*M_PI*exp(d)); }
 // clang-format on
 
 #include <assert.h>
@@ -97,6 +107,34 @@ kernel_filter_vec(const __restrict__ AcReal* src0, const __restrict__ AcReal* sr
                       src2[IDX(src_idx)]);
 }
 
+template <FilterFuncVecScal filter>
+static __global__ void
+kernel_filter_vec_scal(const __restrict__ AcReal* src0, const __restrict__ AcReal* src1,
+                       const __restrict__ AcReal* src2, const __restrict__ AcReal* src3,
+                       const int3 start, const int3 end, AcReal* dst)
+{
+    const int3 src_idx = (int3){start.x + threadIdx.x + blockIdx.x * blockDim.x,
+                                start.y + threadIdx.y + blockIdx.y * blockDim.y,
+                                start.z + threadIdx.z + blockIdx.z * blockDim.z};
+
+    const int nx = end.x - start.x;
+    const int ny = end.y - start.y;
+    const int nz = end.z - start.z;
+    (void)nz; // Suppress the unused-variable warning when asserts are compiled out
+
+    const int3 dst_idx = (int3){threadIdx.x + blockIdx.x * blockDim.x,
+                                threadIdx.y + blockIdx.y * blockDim.y,
+                                threadIdx.z + blockIdx.z * blockDim.z};
+
+    assert(src_idx.x < DCONST(AC_nx_max) && src_idx.y < DCONST(AC_ny_max) &&
+           src_idx.z < DCONST(AC_nz_max));
+    assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
+    assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);
+
+    dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(
+        src0[IDX(src_idx)], src1[IDX(src_idx)], src2[IDX(src_idx)], src3[IDX(src_idx)]);
+}
+
 template <ReduceFunc reduce>
 static __global__ void
 kernel_reduce(AcReal* scratchpad, const int num_elems)
@@ -253,3 +291,54 @@ acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, const in
     cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
     return result;
 }
+
+AcReal
+acKernelReduceVecScal(const cudaStream_t stream, const ReductionType rtype, const int3 start,
+                      const int3 end, const AcReal* vtxbuf0, const AcReal* vtxbuf1,
+                      const AcReal* vtxbuf2, const AcReal* vtxbuf3, AcReal* scratchpad,
+                      AcReal* reduce_result)
+{
+    const unsigned nx        = end.x - start.x;
+    const unsigned ny        = end.y - start.y;
+    const unsigned nz        = end.z - start.z;
+    const unsigned num_elems = nx * ny * nz;
+
+    const dim3 tpb_filter(32, 4, 1);
+    const dim3 bpg_filter((unsigned int)ceil(nx / AcReal(tpb_filter.x)),
+                          (unsigned int)ceil(ny / AcReal(tpb_filter.y)),
+                          (unsigned int)ceil(nz / AcReal(tpb_filter.z)));
+
+    const int tpb_reduce = 128;
+    const int bpg_reduce = num_elems / tpb_reduce;
+
+    ERRCHK(nx >= tpb_filter.x);
+    ERRCHK(ny >= tpb_filter.y);
+    ERRCHK(nz >= tpb_filter.z);
+    ERRCHK(tpb_reduce <= num_elems);
+    ERRCHK(nx * ny * nz % 2 == 0);
+
+    // NOTE: Currently used only to calculate Alfven speeds for the diagnostics.
+
+    // clang-format off
+    if (rtype == RTYPE_ALFVEN_MAX) {
+        kernel_filter_vec_scal<dlength_alf><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, vtxbuf3, start, end, scratchpad);
+        kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
+        kernel_reduce_block<dmax><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    } else if (rtype == RTYPE_ALFVEN_MIN) {
+        kernel_filter_vec_scal<dlength_alf><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, vtxbuf3, start, end, scratchpad);
+        kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
+        kernel_reduce_block<dmin><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    } else if (rtype == RTYPE_ALFVEN_RMS) {
+        kernel_filter_vec_scal<dsquared_alf><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, vtxbuf3, start, end, scratchpad);
+        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
+        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    } else {
+        ERROR("Unrecognized rtype");
+    }
+    // clang-format on
+
+    cudaStreamSynchronize(stream);
+    AcReal result;
+    cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
+    return result;
+}
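
Note on the new device filters: assuming the fourth argument d holds the
logarithmic density (so that exp(d) = rho) and (a, b, c) are the magnetic
field components in Gaussian units, dlength_alf and dsquared_alf evaluate
the Alfvén speed and its square:

    v_A = \frac{\sqrt{a^2 + b^2 + c^2}}{\sqrt{4 \pi e^{d}}},
    \qquad
    v_A^2 = \frac{a^2 + b^2 + c^2}{4 \pi e^{d}}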
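
A minimal host-side sketch of the new entry point, reducing the maximum
Alfvén speed over a subdomain. All buffer and extent names below (bbx, bby,
bbz, lnrho, nx, ny, nz, NGHOST) are illustrative placeholders, not part of
this patch; the pointers must reference device memory, with scratchpad sized
for at least (end - start) elements and reduce_result holding a single
AcReal:

    // Reduce over the computational subdomain, excluding the ghost zones.
    const int3 start = (int3){NGHOST, NGHOST, NGHOST};
    const int3 end   = (int3){NGHOST + nx, NGHOST + ny, NGHOST + nz};

    // vtxbuf0..2 = magnetic field components, vtxbuf3 = log density.
    const AcReal vA_max = acKernelReduceVecScal(stream, RTYPE_ALFVEN_MAX,
                                                start, end, bbx, bby, bbz,
                                                lnrho, scratchpad,
                                                reduce_result);

As in acKernelReduceVec, bpg_reduce = num_elems / tpb_reduce uses integer
division, so the subdomain size is effectively assumed to be a multiple of
tpb_reduce (128). For RTYPE_ALFVEN_RMS the returned value is the sum of
v_A^2 over the subdomain; the final divide-by-N and square root are
presumably applied by the caller, as with the existing RTYPE_RMS reduction.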