From 18d6ad4f6127028f3a3207bf4c6e415a3d59b675 Mon Sep 17 00:00:00 2001 From: jpekkila Date: Mon, 17 Jun 2019 18:05:36 +0300 Subject: [PATCH] Simplified the logic used for calculating reductions --- src/core/kernels/kernels.cuh | 123 +++++++++++++---------------------- 1 file changed, 45 insertions(+), 78 deletions(-) diff --git a/src/core/kernels/kernels.cuh b/src/core/kernels/kernels.cuh index 7931b1c..337bfa0 100644 --- a/src/core/kernels/kernels.cuh +++ b/src/core/kernels/kernels.cuh @@ -847,7 +847,7 @@ oob(const int& i, const int& j, const int& k) template __global__ void -_kernel_reduce_scal(const __restrict__ AcReal* src, AcReal* dst) +_kernel_reduce_initial_scal(const __restrict__ AcReal* src, AcReal* dst) { const int i = threadIdx.x + blockIdx.x * blockDim.x; const int j = threadIdx.y + blockIdx.y * blockDim.y; @@ -867,7 +867,7 @@ _kernel_reduce_scal(const __restrict__ AcReal* src, AcReal* dst) template __global__ void -_kernel_reduce_vec(const __restrict__ AcReal* src_a, +_kernel_reduce_initial_vec(const __restrict__ AcReal* src_a, const __restrict__ AcReal* src_b, const __restrict__ AcReal* src_c, AcReal* dst) { @@ -964,40 +964,26 @@ reduce_scal(const cudaStream_t stream, const int bpg2 = (unsigned int)ceil(AcReal(scratchpad_size) / AcReal(ELEMS_PER_THREAD * BLOCK_SIZE)); - switch (rtype) { - case RTYPE_MAX: - _kernel_reduce_scal - <<>>(vertex_buffer, reduce_scratchpad); - _kernel_reduce - <<>>(reduce_scratchpad, reduce_result); - _kernel_reduce_block - <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result); - break; - case RTYPE_MIN: - _kernel_reduce_scal - <<>>(vertex_buffer, reduce_scratchpad); - _kernel_reduce - <<>>(reduce_scratchpad, reduce_result); - _kernel_reduce_block - <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result); - break; - case RTYPE_RMS: - _kernel_reduce_scal - <<>>(vertex_buffer, reduce_scratchpad); - _kernel_reduce - <<>>(reduce_scratchpad, reduce_result); - _kernel_reduce_block - <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result); - break; - case RTYPE_RMS_EXP: - _kernel_reduce_scal - <<>>(vertex_buffer, reduce_scratchpad); - _kernel_reduce - <<>>(reduce_scratchpad, reduce_result); - _kernel_reduce_block - <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result); - break; - default: + if (rtype == RTYPE_MAX || rtype == RTYPE_MIN) { + _kernel_reduce_initial_scal<<>>(vertex_buffer, reduce_scratchpad); + } else if (rtype == RTYPE_RMS) { + _kernel_reduce_initial_scal<<>>(vertex_buffer, reduce_scratchpad); + } else if (rtype == RTYPE_RMS_EXP) { + _kernel_reduce_initial_scal<<>>(vertex_buffer, reduce_scratchpad); + } else { + ERROR("Unrecognized RTYPE"); + } + + if (rtype == RTYPE_MAX) { + _kernel_reduce<<>>(reduce_scratchpad, reduce_result); + _kernel_reduce_block<<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result); + } else if (rtype == RTYPE_MIN) { + _kernel_reduce<<>>(reduce_scratchpad, reduce_result); + _kernel_reduce_block<<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result); + } else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) { + _kernel_reduce<<>>(reduce_scratchpad, reduce_result); + _kernel_reduce_block<<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result); + } else { ERROR("Unrecognized RTYPE"); } @@ -1008,10 +994,9 @@ reduce_scal(const cudaStream_t stream, AcReal reduce_vec(const cudaStream_t stream, - const ReductionType& rtype, const int& nx, const int& ny, - const int& nz, const AcReal* vertex_buffer_a, - const AcReal* vertex_buffer_b, const AcReal* vertex_buffer_c, - AcReal* reduce_scratchpad, AcReal* reduce_result) + const ReductionType& rtype, const int& nx, const int& ny, const int& nz, + const AcReal* vec0, const AcReal* vec1, const AcReal* vec2, + AcReal* reduce_scratchpad, AcReal* reduce_result) { const dim3 tpb(32, 4, 1); const dim3 bpg(int(ceil(float(nx) / tpb.x)), @@ -1037,44 +1022,26 @@ reduce_vec(const cudaStream_t stream, ERRCHK_ALWAYS(is_power_of_two(ny)); ERRCHK_ALWAYS(is_power_of_two(nz)); - switch (rtype) { - case RTYPE_MAX: - _kernel_reduce_vec - <<>>(vertex_buffer_a, vertex_buffer_b, vertex_buffer_c, - reduce_scratchpad); - _kernel_reduce - <<>>(reduce_scratchpad, reduce_result); - _kernel_reduce_block - <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result); - break; - case RTYPE_MIN: - _kernel_reduce_vec - <<>>(vertex_buffer_a, vertex_buffer_b, vertex_buffer_c, - reduce_scratchpad); - _kernel_reduce - <<>>(reduce_scratchpad, reduce_result); - _kernel_reduce_block - <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result); - break; - case RTYPE_RMS: - _kernel_reduce_vec - <<>>(vertex_buffer_a, vertex_buffer_b, vertex_buffer_c, - reduce_scratchpad); - _kernel_reduce - <<>>(reduce_scratchpad, reduce_result); - _kernel_reduce_block - <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result); - break; - case RTYPE_RMS_EXP: - _kernel_reduce_vec - <<>>(vertex_buffer_a, vertex_buffer_b, vertex_buffer_c, - reduce_scratchpad); - _kernel_reduce - <<>>(reduce_scratchpad, reduce_result); - _kernel_reduce_block - <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result); - break; - default: + if (rtype == RTYPE_MAX || rtype == RTYPE_MIN) { + _kernel_reduce_initial_vec<<>>(vec0, vec1, vec2, reduce_scratchpad); + } else if (rtype == RTYPE_RMS) { + _kernel_reduce_initial_vec<<>>(vec0, vec1, vec2, reduce_scratchpad); + } else if (rtype == RTYPE_RMS_EXP) { + _kernel_reduce_initial_vec<<>>(vec0, vec1, vec2, reduce_scratchpad); + } else { + ERROR("Unrecognized RTYPE"); + } + + if (rtype == RTYPE_MAX) { + _kernel_reduce<<>>(reduce_scratchpad, reduce_result); + _kernel_reduce_block<<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result); + } else if (rtype == RTYPE_MIN) { + _kernel_reduce<<>>(reduce_scratchpad, reduce_result); + _kernel_reduce_block<<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result); + } else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) { + _kernel_reduce<<>>(reduce_scratchpad, reduce_result); + _kernel_reduce_block<<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result); + } else { ERROR("Unrecognized RTYPE"); }