Split kernels.cuh into bounconds.cuh, integration.cuh and reductions.cuh
This commit is contained in:
@@ -48,7 +48,9 @@ __constant__ Grid globalGrid;
|
||||
#define DCONST_REAL3(X) (d_mesh_info.real3_params[X])
|
||||
#define DEVICE_VTXBUF_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_mx) + (k)*DCONST_INT(AC_mxy))
|
||||
#define DEVICE_1D_COMPDOMAIN_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_nx) + (k)*DCONST_INT(AC_nxy))
|
||||
#include "kernels/kernels.cuh"
|
||||
#include "kernels/boundconds.cuh"
|
||||
#include "kernels/integration.cuh"
|
||||
#include "kernels/reductions.cuh"
|
||||
|
||||
static dim3 rk3_tpb = (dim3){32, 1, 4};
|
||||
|
||||
|
87
src/core/kernels/boundconds.cuh
Normal file
87
src/core/kernels/boundconds.cuh
Normal file
@@ -0,0 +1,87 @@
|
||||
/*
|
||||
Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae.
|
||||
|
||||
This file is part of Astaroth.
|
||||
|
||||
Astaroth is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
Astaroth is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file
|
||||
* \brief Brief info.
|
||||
*
|
||||
* Detailed info.
|
||||
*
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
__global__ void
|
||||
kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vtxbuf)
|
||||
{
|
||||
const int i_dst = start.x + threadIdx.x + blockIdx.x * blockDim.x;
|
||||
const int j_dst = start.y + threadIdx.y + blockIdx.y * blockDim.y;
|
||||
const int k_dst = start.z + threadIdx.z + blockIdx.z * blockDim.z;
|
||||
|
||||
// If within the start-end range (this allows threadblock dims that are not
|
||||
// divisible by end - start)
|
||||
if (i_dst >= end.x || j_dst >= end.y || k_dst >= end.z)
|
||||
return;
|
||||
|
||||
// if (i_dst >= DCONST_INT(AC_mx) || j_dst >= DCONST_INT(AC_my) || k_dst >= DCONST_INT(AC_mz))
|
||||
// return;
|
||||
|
||||
// If destination index is inside the computational domain, return since
|
||||
// the boundary conditions are only applied to the ghost zones
|
||||
if (i_dst >= DCONST_INT(AC_nx_min) && i_dst < DCONST_INT(AC_nx_max) &&
|
||||
j_dst >= DCONST_INT(AC_ny_min) && j_dst < DCONST_INT(AC_ny_max) &&
|
||||
k_dst >= DCONST_INT(AC_nz_min) && k_dst < DCONST_INT(AC_nz_max))
|
||||
return;
|
||||
|
||||
// Find the source index
|
||||
// Map to nx, ny, nz coordinates
|
||||
int i_src = i_dst - DCONST_INT(AC_nx_min);
|
||||
int j_src = j_dst - DCONST_INT(AC_ny_min);
|
||||
int k_src = k_dst - DCONST_INT(AC_nz_min);
|
||||
|
||||
// Translate (s.t. the index is always positive)
|
||||
i_src += DCONST_INT(AC_nx);
|
||||
j_src += DCONST_INT(AC_ny);
|
||||
k_src += DCONST_INT(AC_nz);
|
||||
|
||||
// Wrap
|
||||
i_src %= DCONST_INT(AC_nx);
|
||||
j_src %= DCONST_INT(AC_ny);
|
||||
k_src %= DCONST_INT(AC_nz);
|
||||
|
||||
// Map to mx, my, mz coordinates
|
||||
i_src += DCONST_INT(AC_nx_min);
|
||||
j_src += DCONST_INT(AC_ny_min);
|
||||
k_src += DCONST_INT(AC_nz_min);
|
||||
|
||||
const int src_idx = DEVICE_VTXBUF_IDX(i_src, j_src, k_src);
|
||||
const int dst_idx = DEVICE_VTXBUF_IDX(i_dst, j_dst, k_dst);
|
||||
vtxbuf[dst_idx] = vtxbuf[src_idx];
|
||||
}
|
||||
|
||||
void
|
||||
periodic_boundconds(const cudaStream_t stream, const int3& start, const int3& end, AcReal* vtxbuf)
|
||||
{
|
||||
const dim3 tpb(8, 2, 8);
|
||||
const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x),
|
||||
(unsigned int)ceil((end.y - start.y) / (float)tpb.y),
|
||||
(unsigned int)ceil((end.z - start.z) / (float)tpb.z));
|
||||
|
||||
kernel_periodic_boundconds<<<bpg, tpb, 0, stream>>>(start, end, vtxbuf);
|
||||
ERRCHK_CUDA_KERNEL();
|
||||
}
|
@@ -26,67 +26,6 @@
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
__global__ void
|
||||
kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vtxbuf)
|
||||
{
|
||||
const int i_dst = start.x + threadIdx.x + blockIdx.x * blockDim.x;
|
||||
const int j_dst = start.y + threadIdx.y + blockIdx.y * blockDim.y;
|
||||
const int k_dst = start.z + threadIdx.z + blockIdx.z * blockDim.z;
|
||||
|
||||
// If within the start-end range (this allows threadblock dims that are not
|
||||
// divisible by end - start)
|
||||
if (i_dst >= end.x || j_dst >= end.y || k_dst >= end.z)
|
||||
return;
|
||||
|
||||
// if (i_dst >= DCONST_INT(AC_mx) || j_dst >= DCONST_INT(AC_my) || k_dst >= DCONST_INT(AC_mz))
|
||||
// return;
|
||||
|
||||
// If destination index is inside the computational domain, return since
|
||||
// the boundary conditions are only applied to the ghost zones
|
||||
if (i_dst >= DCONST_INT(AC_nx_min) && i_dst < DCONST_INT(AC_nx_max) &&
|
||||
j_dst >= DCONST_INT(AC_ny_min) && j_dst < DCONST_INT(AC_ny_max) &&
|
||||
k_dst >= DCONST_INT(AC_nz_min) && k_dst < DCONST_INT(AC_nz_max))
|
||||
return;
|
||||
|
||||
// Find the source index
|
||||
// Map to nx, ny, nz coordinates
|
||||
int i_src = i_dst - DCONST_INT(AC_nx_min);
|
||||
int j_src = j_dst - DCONST_INT(AC_ny_min);
|
||||
int k_src = k_dst - DCONST_INT(AC_nz_min);
|
||||
|
||||
// Translate (s.t. the index is always positive)
|
||||
i_src += DCONST_INT(AC_nx);
|
||||
j_src += DCONST_INT(AC_ny);
|
||||
k_src += DCONST_INT(AC_nz);
|
||||
|
||||
// Wrap
|
||||
i_src %= DCONST_INT(AC_nx);
|
||||
j_src %= DCONST_INT(AC_ny);
|
||||
k_src %= DCONST_INT(AC_nz);
|
||||
|
||||
// Map to mx, my, mz coordinates
|
||||
i_src += DCONST_INT(AC_nx_min);
|
||||
j_src += DCONST_INT(AC_ny_min);
|
||||
k_src += DCONST_INT(AC_nz_min);
|
||||
|
||||
const int src_idx = DEVICE_VTXBUF_IDX(i_src, j_src, k_src);
|
||||
const int dst_idx = DEVICE_VTXBUF_IDX(i_dst, j_dst, k_dst);
|
||||
vtxbuf[dst_idx] = vtxbuf[src_idx];
|
||||
}
|
||||
|
||||
void
|
||||
periodic_boundconds(const cudaStream_t stream, const int3& start, const int3& end, AcReal* vtxbuf)
|
||||
{
|
||||
const dim3 tpb(8, 2, 8);
|
||||
const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x),
|
||||
(unsigned int)ceil((end.y - start.y) / (float)tpb.y),
|
||||
(unsigned int)ceil((end.z - start.z) / (float)tpb.z));
|
||||
|
||||
kernel_periodic_boundconds<<<bpg, tpb, 0, stream>>>(start, end, vtxbuf);
|
||||
ERRCHK_CUDA_KERNEL();
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
#include <assert.h>
|
||||
|
||||
static __device__ __forceinline__ int
|
||||
@@ -700,265 +639,3 @@ read_out(const int idx, AcReal* __restrict__ field[], const int3 handle)
|
||||
const int idx = IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z);
|
||||
|
||||
#include "stencil_process.cuh"
|
||||
|
||||
/*
|
||||
* =============================================================================
|
||||
* Level 2 (Host calls)
|
||||
* =============================================================================
|
||||
*/
|
||||
|
||||
////////////////REDUCE///////////////////////////
|
||||
#include "src/core/math_utils.h" // is_power_of_two
|
||||
|
||||
/*
|
||||
Reduction steps:
|
||||
1 of 3: Compute the initial value (a, a*a or exp(a)*exp(a)) and put the result in scratchpad
|
||||
2 of 3: Compute most of the reductions into a single block of data
|
||||
3 of 3: After all results have been stored into the final block, reduce the data in the final block
|
||||
*/
|
||||
|
||||
// Function pointer definitions
|
||||
typedef AcReal (*FilterFunc)(const AcReal&);
|
||||
typedef AcReal (*FilterFuncVec)(const AcReal&, const AcReal&, const AcReal&);
|
||||
typedef AcReal (*ReduceFunc)(const AcReal&, const AcReal&);
|
||||
|
||||
// clang-format off
|
||||
/* Comparison funcs */
|
||||
static __device__ inline AcReal
|
||||
dmax(const AcReal& a, const AcReal& b) { return a > b ? a : b; }
|
||||
|
||||
static __device__ inline AcReal
|
||||
dmin(const AcReal& a, const AcReal& b) { return a < b ? a : b; }
|
||||
|
||||
static __device__ inline AcReal
|
||||
dsum(const AcReal& a, const AcReal& b) { return a + b; }
|
||||
|
||||
/* Function used to determine the values used during reduction */
|
||||
static __device__ inline AcReal
|
||||
dvalue(const AcReal& a) { return AcReal(a); }
|
||||
|
||||
static __device__ inline AcReal
|
||||
dsquared(const AcReal& a) { return (AcReal)(a*a); }
|
||||
|
||||
static __device__ inline AcReal
|
||||
dexp_squared(const AcReal& a) { return exp(a)*exp(a); }
|
||||
|
||||
static __device__ inline AcReal
|
||||
dlength_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return sqrt(a*a + b*b + c*c); }
|
||||
|
||||
static __device__ inline AcReal
|
||||
dsquared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return dsquared(a) + dsquared(b) + dsquared(c); }
|
||||
|
||||
static __device__ inline AcReal
|
||||
dexp_squared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return dexp_squared(a) + dexp_squared(b) + dexp_squared(c); }
|
||||
// clang-format on
|
||||
|
||||
#include <assert.h>
|
||||
template <FilterFunc filter>
|
||||
__global__ void
|
||||
kernel_filter(const __restrict__ AcReal* src, const int3 start, const int3 end, AcReal* dst)
|
||||
{
|
||||
const int3 src_idx = (int3){start.x + threadIdx.x + blockIdx.x * blockDim.x,
|
||||
start.y + threadIdx.y + blockIdx.y * blockDim.y,
|
||||
start.z + threadIdx.z + blockIdx.z * blockDim.z};
|
||||
|
||||
const int nx = end.x - start.x;
|
||||
const int ny = end.y - start.y;
|
||||
const int nz = end.z - start.z;
|
||||
(void)nz; // Suppressed unused variable warning when not compiling with debug flags
|
||||
|
||||
const int3 dst_idx = (int3){threadIdx.x + blockIdx.x * blockDim.x,
|
||||
threadIdx.y + blockIdx.y * blockDim.y,
|
||||
threadIdx.z + blockIdx.z * blockDim.z};
|
||||
|
||||
assert(src_idx.x < DCONST_INT(AC_nx_max) && src_idx.y < DCONST_INT(AC_ny_max) &&
|
||||
src_idx.z < DCONST_INT(AC_nz_max));
|
||||
assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
|
||||
assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);
|
||||
|
||||
dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(src[IDX(src_idx)]);
|
||||
}
|
||||
|
||||
template <FilterFuncVec filter>
|
||||
__global__ void
|
||||
kernel_filter_vec(const __restrict__ AcReal* src0, const __restrict__ AcReal* src1,
|
||||
const __restrict__ AcReal* src2, const int3 start, const int3 end, AcReal* dst)
|
||||
{
|
||||
const int3 src_idx = (int3){start.x + threadIdx.x + blockIdx.x * blockDim.x,
|
||||
start.y + threadIdx.y + blockIdx.y * blockDim.y,
|
||||
start.z + threadIdx.z + blockIdx.z * blockDim.z};
|
||||
|
||||
const int nx = end.x - start.x;
|
||||
const int ny = end.y - start.y;
|
||||
const int nz = end.z - start.z;
|
||||
(void)nz; // Suppressed unused variable warning when not compiling with debug flags
|
||||
|
||||
const int3 dst_idx = (int3){threadIdx.x + blockIdx.x * blockDim.x,
|
||||
threadIdx.y + blockIdx.y * blockDim.y,
|
||||
threadIdx.z + blockIdx.z * blockDim.z};
|
||||
|
||||
assert(src_idx.x < DCONST_INT(AC_nx_max) && src_idx.y < DCONST_INT(AC_ny_max) &&
|
||||
src_idx.z < DCONST_INT(AC_nz_max));
|
||||
assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
|
||||
assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);
|
||||
|
||||
dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(
|
||||
src0[IDX(src_idx)], src1[IDX(src_idx)], src2[IDX(src_idx)]);
|
||||
}
|
||||
|
||||
template <ReduceFunc reduce>
|
||||
__global__ void
|
||||
kernel_reduce(AcReal* scratchpad, const int num_elems)
|
||||
{
|
||||
const int idx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
|
||||
extern __shared__ AcReal smem[];
|
||||
if (idx < num_elems) {
|
||||
smem[threadIdx.x] = scratchpad[idx];
|
||||
}
|
||||
else {
|
||||
smem[threadIdx.x] = NAN;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
int offset = blockDim.x / 2;
|
||||
assert(offset % 2 == 0);
|
||||
while (offset > 0) {
|
||||
if (threadIdx.x < offset) {
|
||||
smem[threadIdx.x] = reduce(smem[threadIdx.x], smem[threadIdx.x + offset]);
|
||||
}
|
||||
offset /= 2;
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
scratchpad[idx] = smem[threadIdx.x];
|
||||
}
|
||||
}
|
||||
|
||||
template <ReduceFunc reduce>
|
||||
__global__ void
|
||||
kernel_reduce_block(const __restrict__ AcReal* scratchpad, const int num_blocks,
|
||||
const int block_size, AcReal* result)
|
||||
{
|
||||
const int idx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if (idx != 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
AcReal res = scratchpad[0];
|
||||
for (int i = 1; i < num_blocks; ++i) {
|
||||
res = reduce(res, scratchpad[i * block_size]);
|
||||
}
|
||||
*result = res;
|
||||
}
|
||||
|
||||
AcReal
|
||||
reduce_scal(const cudaStream_t stream, const ReductionType rtype, const int3& start,
|
||||
const int3& end, const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result)
|
||||
{
|
||||
const unsigned nx = end.x - start.x;
|
||||
const unsigned ny = end.y - start.y;
|
||||
const unsigned nz = end.z - start.z;
|
||||
const unsigned num_elems = nx * ny * nz;
|
||||
|
||||
const dim3 tpb_filter(32, 4, 1);
|
||||
const dim3 bpg_filter((unsigned int)ceil(nx / AcReal(tpb_filter.x)),
|
||||
(unsigned int)ceil(ny / AcReal(tpb_filter.y)),
|
||||
(unsigned int)ceil(nz / AcReal(tpb_filter.z)));
|
||||
|
||||
const int tpb_reduce = 128;
|
||||
const int bpg_reduce = num_elems / tpb_reduce;
|
||||
|
||||
ERRCHK(nx >= tpb_filter.x);
|
||||
ERRCHK(ny >= tpb_filter.y);
|
||||
ERRCHK(nz >= tpb_filter.z);
|
||||
ERRCHK(tpb_reduce <= num_elems);
|
||||
ERRCHK(nx * ny * nz % 2 == 0);
|
||||
|
||||
// clang-format off
|
||||
if (rtype == RTYPE_MAX) {
|
||||
kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
|
||||
kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||
kernel_reduce_block<dmax><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||
} else if (rtype == RTYPE_MIN) {
|
||||
kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
|
||||
kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||
kernel_reduce_block<dmin><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||
} else if (rtype == RTYPE_RMS) {
|
||||
kernel_filter<dsquared><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
|
||||
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||
} else if (rtype == RTYPE_RMS_EXP) {
|
||||
kernel_filter<dexp_squared><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
|
||||
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||
} else if (rtype == RTYPE_SUM) {
|
||||
kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
|
||||
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||
} else {
|
||||
ERROR("Unrecognized rtype");
|
||||
}
|
||||
// clang-format on
|
||||
cudaStreamSynchronize(stream);
|
||||
AcReal result;
|
||||
cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
|
||||
return result;
|
||||
}
|
||||
|
||||
AcReal
|
||||
reduce_vec(const cudaStream_t stream, const ReductionType rtype, const int3& start, const int3& end,
|
||||
const AcReal* vtxbuf0, const AcReal* vtxbuf1, const AcReal* vtxbuf2, AcReal* scratchpad,
|
||||
AcReal* reduce_result)
|
||||
{
|
||||
const unsigned nx = end.x - start.x;
|
||||
const unsigned ny = end.y - start.y;
|
||||
const unsigned nz = end.z - start.z;
|
||||
const unsigned num_elems = nx * ny * nz;
|
||||
|
||||
const dim3 tpb_filter(32, 4, 1);
|
||||
const dim3 bpg_filter((unsigned int)ceil(nx / AcReal(tpb_filter.x)),
|
||||
(unsigned int)ceil(ny / AcReal(tpb_filter.y)),
|
||||
(unsigned int)ceil(nz / AcReal(tpb_filter.z)));
|
||||
|
||||
const int tpb_reduce = 128;
|
||||
const int bpg_reduce = num_elems / tpb_reduce;
|
||||
|
||||
ERRCHK(nx >= tpb_filter.x);
|
||||
ERRCHK(ny >= tpb_filter.y);
|
||||
ERRCHK(nz >= tpb_filter.z);
|
||||
ERRCHK(tpb_reduce <= num_elems);
|
||||
ERRCHK(nx * ny * nz % 2 == 0);
|
||||
|
||||
// clang-format off
|
||||
if (rtype == RTYPE_MAX) {
|
||||
kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
|
||||
kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||
kernel_reduce_block<dmax><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||
} else if (rtype == RTYPE_MIN) {
|
||||
kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
|
||||
kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||
kernel_reduce_block<dmin><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||
} else if (rtype == RTYPE_RMS) {
|
||||
kernel_filter_vec<dsquared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
|
||||
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||
} else if (rtype == RTYPE_RMS_EXP) {
|
||||
kernel_filter_vec<dexp_squared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
|
||||
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||
} else if (rtype == RTYPE_SUM) {
|
||||
kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
|
||||
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||
} else {
|
||||
ERROR("Unrecognized rtype");
|
||||
}
|
||||
// clang-format on
|
||||
|
||||
cudaStreamSynchronize(stream);
|
||||
AcReal result;
|
||||
cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
|
||||
return result;
|
||||
}
|
282
src/core/kernels/reductions.cuh
Normal file
282
src/core/kernels/reductions.cuh
Normal file
@@ -0,0 +1,282 @@
|
||||
/*
|
||||
Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae.
|
||||
|
||||
This file is part of Astaroth.
|
||||
|
||||
Astaroth is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
Astaroth is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file
|
||||
* \brief Brief info.
|
||||
*
|
||||
* Detailed info.
|
||||
*
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#include "src/core/math_utils.h" // is_power_of_two
|
||||
|
||||
/*
|
||||
Reduction steps:
|
||||
1 of 3: Compute the initial value (a, a*a or exp(a)*exp(a)) and put the result in scratchpad
|
||||
2 of 3: Compute most of the reductions into a single block of data
|
||||
3 of 3: After all results have been stored into the final block, reduce the data in the final block
|
||||
*/
|
||||
|
||||
// Function pointer definitions
|
||||
typedef AcReal (*FilterFunc)(const AcReal&);
|
||||
typedef AcReal (*FilterFuncVec)(const AcReal&, const AcReal&, const AcReal&);
|
||||
typedef AcReal (*ReduceFunc)(const AcReal&, const AcReal&);
|
||||
|
||||
// clang-format off
|
||||
/* Comparison funcs */
|
||||
static __device__ inline AcReal
|
||||
dmax(const AcReal& a, const AcReal& b) { return a > b ? a : b; }
|
||||
|
||||
static __device__ inline AcReal
|
||||
dmin(const AcReal& a, const AcReal& b) { return a < b ? a : b; }
|
||||
|
||||
static __device__ inline AcReal
|
||||
dsum(const AcReal& a, const AcReal& b) { return a + b; }
|
||||
|
||||
/* Function used to determine the values used during reduction */
|
||||
static __device__ inline AcReal
|
||||
dvalue(const AcReal& a) { return AcReal(a); }
|
||||
|
||||
static __device__ inline AcReal
|
||||
dsquared(const AcReal& a) { return (AcReal)(a*a); }
|
||||
|
||||
static __device__ inline AcReal
|
||||
dexp_squared(const AcReal& a) { return exp(a)*exp(a); }
|
||||
|
||||
static __device__ inline AcReal
|
||||
dlength_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return sqrt(a*a + b*b + c*c); }
|
||||
|
||||
static __device__ inline AcReal
|
||||
dsquared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return dsquared(a) + dsquared(b) + dsquared(c); }
|
||||
|
||||
static __device__ inline AcReal
|
||||
dexp_squared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return dexp_squared(a) + dexp_squared(b) + dexp_squared(c); }
|
||||
// clang-format on
|
||||
|
||||
#include <assert.h>
|
||||
template <FilterFunc filter>
|
||||
__global__ void
|
||||
kernel_filter(const __restrict__ AcReal* src, const int3 start, const int3 end, AcReal* dst)
|
||||
{
|
||||
const int3 src_idx = (int3){start.x + threadIdx.x + blockIdx.x * blockDim.x,
|
||||
start.y + threadIdx.y + blockIdx.y * blockDim.y,
|
||||
start.z + threadIdx.z + blockIdx.z * blockDim.z};
|
||||
|
||||
const int nx = end.x - start.x;
|
||||
const int ny = end.y - start.y;
|
||||
const int nz = end.z - start.z;
|
||||
(void)nz; // Suppressed unused variable warning when not compiling with debug flags
|
||||
|
||||
const int3 dst_idx = (int3){threadIdx.x + blockIdx.x * blockDim.x,
|
||||
threadIdx.y + blockIdx.y * blockDim.y,
|
||||
threadIdx.z + blockIdx.z * blockDim.z};
|
||||
|
||||
assert(src_idx.x < DCONST_INT(AC_nx_max) && src_idx.y < DCONST_INT(AC_ny_max) &&
|
||||
src_idx.z < DCONST_INT(AC_nz_max));
|
||||
assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
|
||||
assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);
|
||||
|
||||
dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(src[IDX(src_idx)]);
|
||||
}
|
||||
|
||||
template <FilterFuncVec filter>
|
||||
__global__ void
|
||||
kernel_filter_vec(const __restrict__ AcReal* src0, const __restrict__ AcReal* src1,
|
||||
const __restrict__ AcReal* src2, const int3 start, const int3 end, AcReal* dst)
|
||||
{
|
||||
const int3 src_idx = (int3){start.x + threadIdx.x + blockIdx.x * blockDim.x,
|
||||
start.y + threadIdx.y + blockIdx.y * blockDim.y,
|
||||
start.z + threadIdx.z + blockIdx.z * blockDim.z};
|
||||
|
||||
const int nx = end.x - start.x;
|
||||
const int ny = end.y - start.y;
|
||||
const int nz = end.z - start.z;
|
||||
(void)nz; // Suppressed unused variable warning when not compiling with debug flags
|
||||
|
||||
const int3 dst_idx = (int3){threadIdx.x + blockIdx.x * blockDim.x,
|
||||
threadIdx.y + blockIdx.y * blockDim.y,
|
||||
threadIdx.z + blockIdx.z * blockDim.z};
|
||||
|
||||
assert(src_idx.x < DCONST_INT(AC_nx_max) && src_idx.y < DCONST_INT(AC_ny_max) &&
|
||||
src_idx.z < DCONST_INT(AC_nz_max));
|
||||
assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
|
||||
assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);
|
||||
|
||||
dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(
|
||||
src0[IDX(src_idx)], src1[IDX(src_idx)], src2[IDX(src_idx)]);
|
||||
}
|
||||
|
||||
template <ReduceFunc reduce>
|
||||
__global__ void
|
||||
kernel_reduce(AcReal* scratchpad, const int num_elems)
|
||||
{
|
||||
const int idx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
|
||||
extern __shared__ AcReal smem[];
|
||||
if (idx < num_elems) {
|
||||
smem[threadIdx.x] = scratchpad[idx];
|
||||
}
|
||||
else {
|
||||
smem[threadIdx.x] = NAN;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
int offset = blockDim.x / 2;
|
||||
assert(offset % 2 == 0);
|
||||
while (offset > 0) {
|
||||
if (threadIdx.x < offset) {
|
||||
smem[threadIdx.x] = reduce(smem[threadIdx.x], smem[threadIdx.x + offset]);
|
||||
}
|
||||
offset /= 2;
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
scratchpad[idx] = smem[threadIdx.x];
|
||||
}
|
||||
}
|
||||
|
||||
template <ReduceFunc reduce>
|
||||
__global__ void
|
||||
kernel_reduce_block(const __restrict__ AcReal* scratchpad, const int num_blocks,
|
||||
const int block_size, AcReal* result)
|
||||
{
|
||||
const int idx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if (idx != 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
AcReal res = scratchpad[0];
|
||||
for (int i = 1; i < num_blocks; ++i) {
|
||||
res = reduce(res, scratchpad[i * block_size]);
|
||||
}
|
||||
*result = res;
|
||||
}
|
||||
|
||||
AcReal
|
||||
reduce_scal(const cudaStream_t stream, const ReductionType rtype, const int3& start,
|
||||
const int3& end, const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result)
|
||||
{
|
||||
const unsigned nx = end.x - start.x;
|
||||
const unsigned ny = end.y - start.y;
|
||||
const unsigned nz = end.z - start.z;
|
||||
const unsigned num_elems = nx * ny * nz;
|
||||
|
||||
const dim3 tpb_filter(32, 4, 1);
|
||||
const dim3 bpg_filter((unsigned int)ceil(nx / AcReal(tpb_filter.x)),
|
||||
(unsigned int)ceil(ny / AcReal(tpb_filter.y)),
|
||||
(unsigned int)ceil(nz / AcReal(tpb_filter.z)));
|
||||
|
||||
const int tpb_reduce = 128;
|
||||
const int bpg_reduce = num_elems / tpb_reduce;
|
||||
|
||||
ERRCHK(nx >= tpb_filter.x);
|
||||
ERRCHK(ny >= tpb_filter.y);
|
||||
ERRCHK(nz >= tpb_filter.z);
|
||||
ERRCHK(tpb_reduce <= num_elems);
|
||||
ERRCHK(nx * ny * nz % 2 == 0);
|
||||
|
||||
// clang-format off
|
||||
if (rtype == RTYPE_MAX) {
|
||||
kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
|
||||
kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||
kernel_reduce_block<dmax><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||
} else if (rtype == RTYPE_MIN) {
|
||||
kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
|
||||
kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||
kernel_reduce_block<dmin><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||
} else if (rtype == RTYPE_RMS) {
|
||||
kernel_filter<dsquared><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
|
||||
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||
} else if (rtype == RTYPE_RMS_EXP) {
|
||||
kernel_filter<dexp_squared><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
|
||||
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||
} else if (rtype == RTYPE_SUM) {
|
||||
kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
|
||||
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||
} else {
|
||||
ERROR("Unrecognized rtype");
|
||||
}
|
||||
// clang-format on
|
||||
cudaStreamSynchronize(stream);
|
||||
AcReal result;
|
||||
cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
|
||||
return result;
|
||||
}
|
||||
|
||||
AcReal
|
||||
reduce_vec(const cudaStream_t stream, const ReductionType rtype, const int3& start, const int3& end,
|
||||
const AcReal* vtxbuf0, const AcReal* vtxbuf1, const AcReal* vtxbuf2, AcReal* scratchpad,
|
||||
AcReal* reduce_result)
|
||||
{
|
||||
const unsigned nx = end.x - start.x;
|
||||
const unsigned ny = end.y - start.y;
|
||||
const unsigned nz = end.z - start.z;
|
||||
const unsigned num_elems = nx * ny * nz;
|
||||
|
||||
const dim3 tpb_filter(32, 4, 1);
|
||||
const dim3 bpg_filter((unsigned int)ceil(nx / AcReal(tpb_filter.x)),
|
||||
(unsigned int)ceil(ny / AcReal(tpb_filter.y)),
|
||||
(unsigned int)ceil(nz / AcReal(tpb_filter.z)));
|
||||
|
||||
const int tpb_reduce = 128;
|
||||
const int bpg_reduce = num_elems / tpb_reduce;
|
||||
|
||||
ERRCHK(nx >= tpb_filter.x);
|
||||
ERRCHK(ny >= tpb_filter.y);
|
||||
ERRCHK(nz >= tpb_filter.z);
|
||||
ERRCHK(tpb_reduce <= num_elems);
|
||||
ERRCHK(nx * ny * nz % 2 == 0);
|
||||
|
||||
// clang-format off
|
||||
if (rtype == RTYPE_MAX) {
|
||||
kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
|
||||
kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||
kernel_reduce_block<dmax><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||
} else if (rtype == RTYPE_MIN) {
|
||||
kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
|
||||
kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||
kernel_reduce_block<dmin><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||
} else if (rtype == RTYPE_RMS) {
|
||||
kernel_filter_vec<dsquared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
|
||||
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||
} else if (rtype == RTYPE_RMS_EXP) {
|
||||
kernel_filter_vec<dexp_squared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
|
||||
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||
} else if (rtype == RTYPE_SUM) {
|
||||
kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
|
||||
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||
} else {
|
||||
ERROR("Unrecognized rtype");
|
||||
}
|
||||
// clang-format on
|
||||
|
||||
cudaStreamSynchronize(stream);
|
||||
AcReal result;
|
||||
cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
|
||||
return result;
|
||||
}
|
Reference in New Issue
Block a user