1060 lines
36 KiB
Plaintext
1060 lines
36 KiB
Plaintext
/*
|
|
Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
|
|
|
|
This file is part of Astaroth.
|
|
|
|
Astaroth is free software: you can redistribute it and/or modify
|
|
it under the terms of the GNU General Public License as published by
|
|
the Free Software Foundation, either version 3 of the License, or
|
|
(at your option) any later version.
|
|
|
|
Astaroth is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU General Public License for more details.
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
|
|
*/
|
|
|
|
/**
|
|
* @file
|
|
* \brief Brief info.
|
|
*
|
|
* Detailed info.
|
|
*
|
|
*/
|
|
#pragma once
|
|
|
|
__global__ void
kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vtxbuf)
{
    // Periodic boundary kernel: one thread per destination vertex in
    // [start, end). Ghost-zone cells copy their value from the periodically
    // wrapped source cell inside the computational domain.
    const int3 dst = (int3){start.x + (int)(threadIdx.x + blockIdx.x * blockDim.x),
                            start.y + (int)(threadIdx.y + blockIdx.y * blockDim.y),
                            start.z + (int)(threadIdx.z + blockIdx.z * blockDim.z)};

    // Threadblock dims need not divide (end - start) evenly: drop stragglers.
    if (dst.x >= end.x || dst.y >= end.y || dst.z >= end.z)
        return;

    // Boundary conditions apply only to ghost zones; interior cells keep
    // their values.
    const bool in_comp_domain = dst.x >= DCONST_INT(AC_nx_min) && dst.x < DCONST_INT(AC_nx_max) &&
                                dst.y >= DCONST_INT(AC_ny_min) && dst.y < DCONST_INT(AC_ny_max) &&
                                dst.z >= DCONST_INT(AC_nz_min) && dst.z < DCONST_INT(AC_nz_max);
    if (in_comp_domain)
        return;

    // Source index per axis: shift into domain coordinates, add one period so
    // the operand of % is non-negative, wrap, then shift back to full-mesh
    // (mx/my/mz) coordinates.
    const int i_src = (dst.x - DCONST_INT(AC_nx_min) + DCONST_INT(AC_nx)) % DCONST_INT(AC_nx) +
                      DCONST_INT(AC_nx_min);
    const int j_src = (dst.y - DCONST_INT(AC_ny_min) + DCONST_INT(AC_ny)) % DCONST_INT(AC_ny) +
                      DCONST_INT(AC_ny_min);
    const int k_src = (dst.z - DCONST_INT(AC_nz_min) + DCONST_INT(AC_nz)) % DCONST_INT(AC_nz) +
                      DCONST_INT(AC_nz_min);

    vtxbuf[DEVICE_VTXBUF_IDX(dst.x, dst.y, dst.z)] = vtxbuf[DEVICE_VTXBUF_IDX(i_src, j_src, k_src)];
}
|
|
|
|
void
periodic_boundconds(const cudaStream_t stream, const int3& start, const int3& end, AcReal* vtxbuf)
{
    // Host wrapper: launch the periodic-boundary kernel over [start, end)
    // on the given stream. One thread per vertex.
    const dim3 tpb(8, 2, 8);

    // Integer ceil-division replaces the original float ceil(): identical for
    // typical sizes but immune to float rounding for very large extents.
    const unsigned int nx = (unsigned int)(end.x - start.x);
    const unsigned int ny = (unsigned int)(end.y - start.y);
    const unsigned int nz = (unsigned int)(end.z - start.z);
    const dim3 bpg((nx + tpb.x - 1) / tpb.x,
                   (ny + tpb.y - 1) / tpb.y,
                   (nz + tpb.z - 1) / tpb.z);

    kernel_periodic_boundconds<<<bpg, tpb, 0, stream>>>(start, end, vtxbuf);
    ERRCHK_CUDA_KERNEL(); // catches launch-configuration errors
}
|
|
|
|
///////////////////////////////////////////////////////////////////////////////////////////////////
|
|
#include <assert.h>
|
|
|
|
|
|
static __device__ __forceinline__ int
IDX(const int i)
{
    // 1D overload: a linear index maps to itself.
    return i;
}
|
|
|
|
static __device__ __forceinline__ int
IDX(const int i, const int j, const int k)
{
    // 3D overload: flatten (i, j, k) via the mesh-wide indexing macro.
    const int linear = DEVICE_VTXBUF_IDX(i, j, k);
    return linear;
}
|
|
|
|
static __device__ __forceinline__ int
IDX(const int3 idx)
{
    // int3 overload: delegate to the mesh-wide indexing macro componentwise.
    return DEVICE_VTXBUF_IDX(idx.x, idx.y, idx.z);
}
|
|
|
|
static __forceinline__ AcMatrix
create_rotz(const AcReal radians)
{
    // Build the rotation matrix about the z axis:
    //   | cos -sin 0 |
    //   | sin  cos 0 |
    //   |  0    0  1 |
    // BUGFIX: the bottom row was (0, 0, 0), which silently zeroed the z
    // component of every rotated vector; a rotation must preserve it.
    // (Current callers rotate vectors with z == 0, so results are unchanged
    // for them, but the matrix is now a true rotation.)
    AcMatrix mat;

    mat.row[0] = (AcReal3){cos(radians), -sin(radians), 0};
    mat.row[1] = (AcReal3){sin(radians), cos(radians), 0};
    mat.row[2] = (AcReal3){0, 0, 1};

    return mat;
}
|
|
|
|
|
|
#if AC_DOUBLE_PRECISION == 0
|
|
#define sin __sinf
|
|
#define cos __cosf
|
|
#define exp __expf
|
|
#define rsqrt rsqrtf // hardware reciprocal sqrt
|
|
#endif // AC_DOUBLE_PRECISION == 0
|
|
|
|
|
|
/*
|
|
typedef struct {
|
|
int i, j, k;
|
|
} int3;*/
|
|
|
|
/*
|
|
* =============================================================================
|
|
* Level 0 (Input Assembly Stage)
|
|
* =============================================================================
|
|
*/
|
|
|
|
/*
|
|
* =============================================================================
|
|
* Level 0.1 (Read stencil elements and solve derivatives)
|
|
* =============================================================================
|
|
*/
|
|
static __device__ __forceinline__ AcReal
first_derivative(const AcReal* __restrict__ pencil, const AcReal inv_ds)
{
// Central-difference weights for the first derivative. coefficients[i] is the
// weight of the antisymmetric pair (pencil[mid+i] - pencil[mid-i]); the center
// weight is zero.
#if STENCIL_ORDER == 2
    const AcReal coefficients[] = {0, 1.0 / 2.0};
#elif STENCIL_ORDER == 4
    const AcReal coefficients[] = {0, 2.0 / 3.0, -1.0 / 12.0};
#elif STENCIL_ORDER == 6
    const AcReal coefficients[] = {0, 3.0 / 4.0, -3.0 / 20.0, 1.0 / 60.0};
#elif STENCIL_ORDER == 8
    const AcReal coefficients[] = {0, 4.0 / 5.0, -1.0 / 5.0, 4.0 / 105.0, -1.0 / 280.0};
#endif

    const int mid = STENCIL_ORDER / 2; // center of the pencil
    AcReal res = 0;

#pragma unroll
    for (int i = 1; i <= mid; ++i)
        res += coefficients[i] * (pencil[mid + i] - pencil[mid - i]);

    // inv_ds: inverse mesh spacing along the pencil direction.
    return res * inv_ds;
}
|
|
|
|
static __device__ __forceinline__ AcReal
second_derivative(const AcReal* __restrict__ pencil, const AcReal inv_ds)
{
// Central-difference weights for the second derivative. coefficients[0] is the
// center weight; coefficients[i] weights the symmetric pair
// (pencil[mid+i] + pencil[mid-i]).
#if STENCIL_ORDER == 2
    const AcReal coefficients[] = {-2., 1.};
#elif STENCIL_ORDER == 4
    const AcReal coefficients[] = {-5.0 / 2.0, 4.0 / 3.0, -1.0 / 12.0};
#elif STENCIL_ORDER == 6
    const AcReal coefficients[] = {-49.0 / 18.0, 3.0 / 2.0, -3.0 / 20.0, 1.0 / 90.0};
#elif STENCIL_ORDER == 8
    const AcReal coefficients[] = {-205.0 / 72.0, 8.0 / 5.0, -1.0 / 5.0, 8.0 / 315.0, -1.0 / 560.0};
#endif

    const int mid = STENCIL_ORDER / 2; // center of the pencil
    AcReal res = coefficients[0] * pencil[mid];

#pragma unroll
    for (int i = 1; i <= mid; ++i)
        res += coefficients[i] * (pencil[mid + i] + pencil[mid - i]);

    // Second derivative scales with the square of the inverse spacing.
    return res * inv_ds * inv_ds;
}
|
|
|
|
/** inv_ds: inverted mesh spacing f.ex. 1. / mesh.int_params[AC_dsx] */
|
|
/** Mixed second derivative from two diagonal pencils.
 *  pencil_a runs along the (+,+) diagonal, pencil_b along the (+,-) diagonal.
 *  inv_ds_a / inv_ds_b: inverted mesh spacings, e.g. 1. / mesh.int_params[AC_dsx]. */
static __device__ __forceinline__ AcReal
cross_derivative(const AcReal* __restrict__ pencil_a,
                 const AcReal* __restrict__ pencil_b, const AcReal inv_ds_a,
                 const AcReal inv_ds_b)
{
#if STENCIL_ORDER == 2
    const AcReal coefficients[] = {0, 1.0 / 4.0};
#elif STENCIL_ORDER == 4
    const AcReal coefficients[] = {0, 1.0 / 32.0, 1.0 / 64.0}; // TODO correct coefficients, these are just placeholders
#elif STENCIL_ORDER == 6
    const AcReal fac = (1. / 720.);
    const AcReal coefficients[] = {0.0 * fac, 270.0 * fac, -27.0 * fac, 2.0 * fac};
#elif STENCIL_ORDER == 8
    const AcReal fac = (1. / 20160.);
    const AcReal coefficients[] = {0.0 * fac, 8064. * fac, -1008. * fac, 128. * fac, -9. * fac};
#endif

    const int mid = STENCIL_ORDER / 2; // center of both pencils
    AcReal res = AcReal(0.);

#pragma unroll
    for (int i = 1; i <= mid; ++i) {
        // Difference of the two diagonal symmetric sums.
        const AcReal diag_sum = pencil_a[mid + i] + pencil_a[mid - i];
        const AcReal anti_sum = pencil_b[mid + i] + pencil_b[mid - i];
        res += coefficients[i] * (diag_sum - anti_sum);
    }
    return res * inv_ds_a * inv_ds_b;
}
|
|
|
|
static __device__ __forceinline__ AcReal
derx(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
    // d/dx: gather an x-aligned pencil of STENCIL_ORDER + 1 values centered
    // on vertexIdx, then apply the first-derivative stencil.
    const int radius = STENCIL_ORDER / 2;
    AcReal pencil[STENCIL_ORDER + 1];
#pragma unroll
    for (int offset = 0; offset <= STENCIL_ORDER; ++offset)
        pencil[offset] = arr[IDX(vertexIdx.x - radius + offset, vertexIdx.y, vertexIdx.z)];

    return first_derivative(pencil, DCONST_REAL(AC_inv_dsx));
}
|
|
|
|
static __device__ __forceinline__ AcReal
derxx(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
    // d2/dx2: x-aligned pencil centered on vertexIdx, second-derivative stencil.
    const int radius = STENCIL_ORDER / 2;
    AcReal pencil[STENCIL_ORDER + 1];
#pragma unroll
    for (int offset = 0; offset <= STENCIL_ORDER; ++offset)
        pencil[offset] = arr[IDX(vertexIdx.x - radius + offset, vertexIdx.y, vertexIdx.z)];

    return second_derivative(pencil, DCONST_REAL(AC_inv_dsx));
}
|
|
|
|
static __device__ __forceinline__ AcReal
derxy(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
    // d2/dxdy: two diagonal pencils in the xy plane — pencil_a along (+x,+y),
    // pencil_b along (+x,-y) — combined by the cross-derivative stencil.
    const int radius = STENCIL_ORDER / 2;

    AcReal pencil_a[STENCIL_ORDER + 1];
#pragma unroll
    for (int offset = 0; offset <= STENCIL_ORDER; ++offset)
        pencil_a[offset] = arr[IDX(vertexIdx.x - radius + offset,
                                   vertexIdx.y - radius + offset, vertexIdx.z)];

    AcReal pencil_b[STENCIL_ORDER + 1];
#pragma unroll
    for (int offset = 0; offset <= STENCIL_ORDER; ++offset)
        pencil_b[offset] = arr[IDX(vertexIdx.x - radius + offset,
                                   vertexIdx.y + radius - offset, vertexIdx.z)];

    return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsx),
                            DCONST_REAL(AC_inv_dsy));
}
|
|
|
|
static __device__ __forceinline__ AcReal
derxz(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
    // d2/dxdz: diagonal pencils in the xz plane — pencil_a along (+x,+z),
    // pencil_b along (+x,-z).
    const int radius = STENCIL_ORDER / 2;

    AcReal pencil_a[STENCIL_ORDER + 1];
#pragma unroll
    for (int offset = 0; offset <= STENCIL_ORDER; ++offset)
        pencil_a[offset] = arr[IDX(vertexIdx.x - radius + offset, vertexIdx.y,
                                   vertexIdx.z - radius + offset)];

    AcReal pencil_b[STENCIL_ORDER + 1];
#pragma unroll
    for (int offset = 0; offset <= STENCIL_ORDER; ++offset)
        pencil_b[offset] = arr[IDX(vertexIdx.x - radius + offset, vertexIdx.y,
                                   vertexIdx.z + radius - offset)];

    return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsx),
                            DCONST_REAL(AC_inv_dsz));
}
|
|
|
|
static __device__ __forceinline__ AcReal
dery(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
    // d/dy: y-aligned pencil centered on vertexIdx, first-derivative stencil.
    const int radius = STENCIL_ORDER / 2;
    AcReal pencil[STENCIL_ORDER + 1];
#pragma unroll
    for (int offset = 0; offset <= STENCIL_ORDER; ++offset)
        pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y - radius + offset, vertexIdx.z)];

    return first_derivative(pencil, DCONST_REAL(AC_inv_dsy));
}
|
|
|
|
static __device__ __forceinline__ AcReal
deryy(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
    // d2/dy2: y-aligned pencil centered on vertexIdx, second-derivative stencil.
    const int radius = STENCIL_ORDER / 2;
    AcReal pencil[STENCIL_ORDER + 1];
#pragma unroll
    for (int offset = 0; offset <= STENCIL_ORDER; ++offset)
        pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y - radius + offset, vertexIdx.z)];

    return second_derivative(pencil, DCONST_REAL(AC_inv_dsy));
}
|
|
|
|
static __device__ __forceinline__ AcReal
deryz(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
    // d2/dydz: diagonal pencils in the yz plane — pencil_a along (+y,+z),
    // pencil_b along (+y,-z).
    const int radius = STENCIL_ORDER / 2;

    AcReal pencil_a[STENCIL_ORDER + 1];
#pragma unroll
    for (int offset = 0; offset <= STENCIL_ORDER; ++offset)
        pencil_a[offset] = arr[IDX(vertexIdx.x, vertexIdx.y - radius + offset,
                                   vertexIdx.z - radius + offset)];

    AcReal pencil_b[STENCIL_ORDER + 1];
#pragma unroll
    for (int offset = 0; offset <= STENCIL_ORDER; ++offset)
        pencil_b[offset] = arr[IDX(vertexIdx.x, vertexIdx.y - radius + offset,
                                   vertexIdx.z + radius - offset)];

    return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsy),
                            DCONST_REAL(AC_inv_dsz));
}
|
|
|
|
static __device__ __forceinline__ AcReal
derz(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
    // d/dz: z-aligned pencil centered on vertexIdx, first-derivative stencil.
    const int radius = STENCIL_ORDER / 2;
    AcReal pencil[STENCIL_ORDER + 1];
#pragma unroll
    for (int offset = 0; offset <= STENCIL_ORDER; ++offset)
        pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z - radius + offset)];

    return first_derivative(pencil, DCONST_REAL(AC_inv_dsz));
}
|
|
|
|
static __device__ __forceinline__ AcReal
derzz(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
    // d2/dz2: z-aligned pencil centered on vertexIdx, second-derivative stencil.
    const int radius = STENCIL_ORDER / 2;
    AcReal pencil[STENCIL_ORDER + 1];
#pragma unroll
    for (int offset = 0; offset <= STENCIL_ORDER; ++offset)
        pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z - radius + offset)];

    return second_derivative(pencil, DCONST_REAL(AC_inv_dsz));
}
|
|
|
|
/*
|
|
* =============================================================================
|
|
* Level 0.2 (Caching functions)
|
|
* =============================================================================
|
|
*/
|
|
|
|
#include "stencil_assembly.cuh"
|
|
|
|
/*
|
|
typedef struct {
|
|
AcRealData x;
|
|
AcRealData y;
|
|
AcRealData z;
|
|
} AcReal3Data;
|
|
|
|
static __device__ __forceinline__ AcReal3Data
|
|
read_data(const int i, const int j, const int k,
|
|
AcReal* __restrict__ buf[], const int3& handle)
|
|
{
|
|
AcReal3Data data;
|
|
|
|
data.x = read_data(i, j, k, buf, handle.x);
|
|
data.y = read_data(i, j, k, buf, handle.y);
|
|
data.z = read_data(i, j, k, buf, handle.z);
|
|
|
|
return data;
|
|
}
|
|
*/
|
|
|
|
/*
|
|
* =============================================================================
|
|
* Level 0.3 (Built-in functions available during the Stencil Processing Stage)
|
|
* =============================================================================
|
|
*/
|
|
|
|
static __host__ __device__ __forceinline__ AcReal3
operator-(const AcReal3& a, const AcReal3& b)
{
    // Componentwise vector difference.
    AcReal3 res;
    res.x = a.x - b.x;
    res.y = a.y - b.y;
    res.z = a.z - b.z;
    return res;
}
|
|
|
|
static __host__ __device__ __forceinline__ AcReal3
operator+(const AcReal3& a, const AcReal3& b)
{
    // Componentwise vector sum.
    AcReal3 res;
    res.x = a.x + b.x;
    res.y = a.y + b.y;
    res.z = a.z + b.z;
    return res;
}
|
|
|
|
static __host__ __device__ __forceinline__ AcReal3
operator-(const AcReal3& a)
{
    // Componentwise negation.
    AcReal3 res;
    res.x = -a.x;
    res.y = -a.y;
    res.z = -a.z;
    return res;
}
|
|
|
|
static __host__ __device__ __forceinline__ AcReal3
operator*(const AcReal a, const AcReal3& b)
{
    // Scalar times vector, componentwise.
    AcReal3 res;
    res.x = a * b.x;
    res.y = a * b.y;
    res.z = a * b.z;
    return res;
}
|
|
|
|
static __host__ __device__ __forceinline__ AcReal
dot(const AcReal3& a, const AcReal3& b)
{
    // Euclidean inner product; summed left to right as x, y, z.
    AcReal sum = a.x * b.x;
    sum += a.y * b.y;
    sum += a.z * b.z;
    return sum;
}
|
|
|
|
static __host__ __device__ __forceinline__ AcReal3
mul(const AcMatrix& aa, const AcReal3& x)
{
    // Matrix-vector product: one dot product per matrix row.
    AcReal3 res;
    res.x = dot(aa.row[0], x);
    res.y = dot(aa.row[1], x);
    res.z = dot(aa.row[2], x);
    return res;
}
|
|
|
|
static __host__ __device__ __forceinline__ AcReal3
cross(const AcReal3& a, const AcReal3& b)
{
    // Right-handed cross product a x b.
    return (AcReal3){a.y * b.z - a.z * b.y,
                     a.z * b.x - a.x * b.z,
                     a.x * b.y - a.y * b.x};
}
|
|
|
|
static __host__ __device__ __forceinline__ bool
is_valid(const AcReal a)
{
    // Finiteness check: rejects NaN and +/- infinity.
    return !(isnan(a) || isinf(a));
}
|
|
|
|
static __host__ __device__ __forceinline__ bool
is_valid(const AcReal3& a)
{
    // A vector is valid only if all three components are finite.
    if (!is_valid(a.x))
        return false;
    if (!is_valid(a.y))
        return false;
    return is_valid(a.z);
}
|
|
|
|
|
|
/*
|
|
* =============================================================================
|
|
* Level 1 (Stencil Processing Stage)
|
|
* =============================================================================
|
|
*/
|
|
|
|
/*
|
|
* =============================================================================
|
|
* Level 1.1 (Terms)
|
|
* =============================================================================
|
|
*/
|
|
static __device__ __forceinline__ AcReal
laplace(const AcRealData& data)
{
    // Laplacian as the trace of the Hessian. Hoist the hessian() call so it
    // is evaluated once instead of three times (the original recomputed it
    // per diagonal term).
    const AcMatrix hess = hessian(data);
    return hess.row[0].x + hess.row[1].y + hess.row[2].z;
}
|
|
|
|
static __device__ __forceinline__ AcReal
divergence(const AcReal3Data& vec)
{
    // div u = dUx/dx + dUy/dy + dUz/dz.
    const AcReal ddx = gradient(vec.x).x;
    const AcReal ddy = gradient(vec.y).y;
    const AcReal ddz = gradient(vec.z).z;
    return ddx + ddy + ddz;
}
|
|
|
|
static __device__ __forceinline__ AcReal3
laplace_vec(const AcReal3Data& vec)
{
    // Vector Laplacian: apply the scalar Laplacian to each component.
    AcReal3 res;
    res.x = laplace(vec.x);
    res.y = laplace(vec.y);
    res.z = laplace(vec.z);
    return res;
}
|
|
|
|
static __device__ __forceinline__ AcReal3
curl(const AcReal3Data& vec)
{
    // curl u = (dUz/dy - dUy/dz, dUx/dz - dUz/dx, dUy/dx - dUx/dy).
    AcReal3 res;
    res.x = gradient(vec.z).y - gradient(vec.y).z;
    res.y = gradient(vec.x).z - gradient(vec.z).x;
    res.z = gradient(vec.y).x - gradient(vec.x).y;
    return res;
}
|
|
|
|
static __device__ __forceinline__ AcReal3
gradient_of_divergence(const AcReal3Data& vec)
{
    // grad(div u): component i sums the i-th column entries of the component
    // Hessians. Hoist the hessian() calls so each is evaluated once instead
    // of three times (the original recomputed every hessian per output row).
    const AcMatrix hx = hessian(vec.x);
    const AcMatrix hy = hessian(vec.y);
    const AcMatrix hz = hessian(vec.z);
    return (AcReal3){hx.row[0].x + hy.row[0].y + hz.row[0].z,
                     hx.row[1].x + hy.row[1].y + hz.row[1].z,
                     hx.row[2].x + hy.row[2].y + hz.row[2].z};
}
|
|
|
|
// Takes uu gradients and returns S
|
|
static __device__ __forceinline__ AcMatrix
stress_tensor(const AcReal3Data& vec)
{
    // Traceless rate-of-strain tensor S built from the velocity gradients.
    // Hoist the three gradient() calls so each is computed once (the original
    // re-evaluated gradient() for every tensor element).
    const AcReal3 gx = gradient(vec.x);
    const AcReal3 gy = gradient(vec.y);
    const AcReal3 gz = gradient(vec.z);

    AcMatrix S;

    // Diagonal: 2/3 of the own derivative minus 1/3 of the other two,
    // which makes the tensor traceless.
    S.row[0].x = AcReal(2. / 3.) * gx.x - AcReal(1. / 3.) * (gy.y + gz.z);
    S.row[1].y = AcReal(2. / 3.) * gy.y - AcReal(1. / 3.) * (gx.x + gz.z);
    S.row[2].z = AcReal(2. / 3.) * gz.z - AcReal(1. / 3.) * (gx.x + gy.y);

    // Upper triangle: symmetrized cross derivatives.
    S.row[0].y = AcReal(1. / 2.) * (gx.y + gy.x);
    S.row[0].z = AcReal(1. / 2.) * (gx.z + gz.x);
    S.row[1].z = AcReal(1. / 2.) * (gy.z + gz.y);

    // S is symmetric: mirror the upper triangle into the lower.
    S.row[1].x = S.row[0].y;
    S.row[2].x = S.row[0].z;
    S.row[2].y = S.row[1].z;

    return S;
}
|
|
|
|
static __device__ __forceinline__ AcReal
contract(const AcMatrix& mat)
{
    // Sum of squares of all matrix elements (row-by-row self dot products).
    AcReal total = 0;

#pragma unroll
    for (int row = 0; row < 3; ++row)
        total += dot(mat.row[row], mat.row[row]);

    return total;
}
|
|
|
|
/*
|
|
* =============================================================================
|
|
* Level 1.2 (Equations)
|
|
* =============================================================================
|
|
*/
|
|
static __device__ __forceinline__ AcReal
length(const AcReal3& vec)
{
    // Euclidean norm |vec|.
    const AcReal sq = vec.x * vec.x + vec.y * vec.y + vec.z * vec.z;
    return sqrt(sq);
}
|
|
|
|
static __device__ __forceinline__ AcReal
reciprocal_len(const AcReal3& vec)
{
    // 1 / |vec| via reciprocal square root (rsqrt maps to rsqrtf in single
    // precision, see the #define block above).
    const AcReal sq = vec.x * vec.x + vec.y * vec.y + vec.z * vec.z;
    return rsqrt(sq);
}
|
|
|
|
static __device__ __forceinline__ AcReal3
normalized(const AcReal3& vec)
{
    // Unit vector in the direction of vec. No guard for the zero vector:
    // reciprocal_len then yields inf and the result is non-finite.
    return reciprocal_len(vec) * vec;
}
|
|
|
|
// Sinusoidal forcing
|
|
// https://arxiv.org/pdf/1704.04676.pdf
|
|
__constant__ AcReal3 forcing_vec;
|
|
__constant__ AcReal forcing_phi;
|
|
// Sinusoidal forcing term evaluated at mesh index (i, j, k); direction and
// phase come from the __constant__ symbols forcing_vec / forcing_phi set by
// the host (see rk3_step_async). Reference: https://arxiv.org/pdf/1704.04676.pdf
static __device__ __forceinline__ AcReal3
forcing(const int i, const int j, const int k)
{
// Physical domain extents (cells times spacing).
#define DOMAIN_SIZE_X (DCONST_INT(AC_nx) * DCONST_REAL(AC_dsx))
#define DOMAIN_SIZE_Y (DCONST_INT(AC_ny) * DCONST_REAL(AC_dsy))
#define DOMAIN_SIZE_Z (DCONST_INT(AC_nz) * DCONST_REAL(AC_dsz))
    // Physical position of the vertex, measured from the domain center.
    const AcReal3 k_vec = (AcReal3){(i - DCONST_INT(AC_nx_min)) * DCONST_REAL(AC_dsx) - AcReal(.5) * DOMAIN_SIZE_X,
                                    (j - DCONST_INT(AC_ny_min)) * DCONST_REAL(AC_dsy) - AcReal(.5) * DOMAIN_SIZE_Y,
                                    (k - DCONST_INT(AC_nz_min)) * DCONST_REAL(AC_dsz) - AcReal(.5) * DOMAIN_SIZE_Z};
    AcReal inv_len = reciprocal_len(k_vec);
    // At the exact domain center |k_vec| == 0 and rsqrt returns inf: zero it out.
    if (isnan(inv_len) || isinf(inv_len))
        inv_len = 0;
    // Deliberate clamp (author's words below), not a physical constant.
    if (inv_len > 2) // hack to make it cool
        inv_len = 2;
    const AcReal k_dot_x = dot(k_vec, forcing_vec);

    // cos(a)cos(b) - sin(a)sin(b) == cos(k_dot_x + forcing_phi).
    const AcReal waves = cos(k_dot_x)*cos(forcing_phi) - sin(k_dot_x) * sin(forcing_phi);

    // Amplitude decays with the square of the distance from the center.
    return inv_len * inv_len * waves * forcing_vec;
}
|
|
|
|
|
|
// Note: LNT0 and LNRHO0 must be set very carefully: if the magnitude is different that other values in the mesh, then we will inherently lose precision
|
|
#define LNT0 (AcReal(0.0))
|
|
#define LNRHO0 (AcReal(0.0))
|
|
|
|
#define H_CONST (AcReal(0.0))
|
|
#define C_CONST (AcReal(0.0))
|
|
|
|
|
|
|
|
// Low-storage third-order Runge-Kutta substep (Williamson 1980) for a scalar
// field. step_number selects the substep (0..2) at compile time;
// state_previous is the previous substep's state, state_current the current
// one, rate_of_change the evaluated RHS, dt the full timestep.
// Returns NAN for any step_number outside 0..2.
template <int step_number>
static __device__ __forceinline__ AcReal
rk3_integrate(const AcReal state_previous, const AcReal state_current,
              const AcReal rate_of_change, const AcReal dt)
{
    // Williamson (1980) low-storage coefficients, padded with a leading 0.
    const AcReal alpha[] = {0, AcReal(.0), AcReal(-5. / 9.), AcReal(-153. / 128.)};
    const AcReal beta[] = {0, AcReal(1. / 3.), AcReal(15. / 16.),
                           AcReal(8. / 15.)};


    // Note the indexing: +1 to avoid an unnecessary warning about "out-of-bounds"
    // access (when accessing beta[step_number-1] even when step_number >= 1)
    switch (step_number) {
    case 0:
        // First substep: no history term, plain forward update.
        return state_current + beta[step_number + 1] * rate_of_change * dt;
    case 1: // Fallthrough
    case 2:
        // Later substeps: blend in the (state_current - state_previous)
        // history scaled by alpha / previous beta.
        return state_current +
               beta[step_number + 1] *
                   (alpha[step_number + 1] * (AcReal(1.) / beta[step_number]) *
                        (state_current - state_previous) +
                    rate_of_change * dt);
    default:
        return NAN;
    }
}
|
|
/*
|
|
template <int step_number>
|
|
static __device__ __forceinline__ AcReal
|
|
rk3_integrate_scal(const AcReal state_previous, const AcReal state_current,
|
|
const AcReal rate_of_change, const AcReal dt)
|
|
{
|
|
// Williamson (1980)
|
|
const AcReal alpha[] = {AcReal(.0), AcReal(-5. / 9.), AcReal(-153. / 128.)};
|
|
const AcReal beta[] = {AcReal(1. / 3.), AcReal(15. / 16.),
|
|
AcReal(8. / 15.)};
|
|
|
|
|
|
switch (step_number) {
|
|
case 0:
|
|
return state_current + beta[step_number] * rate_of_change * dt;
|
|
case 1: // Fallthrough
|
|
case 2:
|
|
return state_current +
|
|
beta[step_number] *
|
|
(alpha[step_number] * (AcReal(1.) / beta[step_number - 1]) *
|
|
(state_current - state_previous) +
|
|
rate_of_change * dt);
|
|
default:
|
|
return NAN;
|
|
}
|
|
}
|
|
*/
|
|
|
|
template <int step_number>
static __device__ __forceinline__ AcReal3
rk3_integrate(const AcReal3 state_previous, const AcReal3 state_current,
              const AcReal3 rate_of_change, const AcReal dt)
{
    // Vector overload: apply the scalar RK3 substep to each component.
    AcReal3 res;
    res.x = rk3_integrate<step_number>(state_previous.x, state_current.x, rate_of_change.x, dt);
    res.y = rk3_integrate<step_number>(state_previous.y, state_current.y, rate_of_change.y, dt);
    res.z = rk3_integrate<step_number>(state_previous.z, state_current.z, rate_of_change.z, dt);
    return res;
}
|
|
|
|
#define rk3(state_previous, state_current, rate_of_change, dt)\
|
|
rk3_integrate<step_number>(state_previous, value(state_current), rate_of_change, dt)
|
|
|
|
/*
|
|
template <int step_number>
|
|
static __device__ __forceinline__ AcReal
|
|
rk3_integrate(const int idx, const AcReal out, const int handle,
|
|
const AcRealData& in_cached, const AcReal rate_of_change, const AcReal dt)
|
|
{
|
|
return rk3_integrate_scal<step_number>(out, value(in_cached), rate_of_change, dt);
|
|
}
|
|
|
|
template <int step_number>
|
|
static __device__ __forceinline__ AcReal3
|
|
rk3_integrate(const int idx, const AcReal3 out, const int3& handle,
|
|
const AcReal3Data& in_cached, const AcReal3& rate_of_change, const AcReal dt)
|
|
{
|
|
return (AcReal3) {
|
|
rk3_integrate<step_number>(idx, out, handle.x, in_cached.x, rate_of_change.x, dt),
|
|
rk3_integrate<step_number>(idx, out, handle.y, in_cached.y, rate_of_change.y, dt),
|
|
rk3_integrate<step_number>(idx, out, handle.z, in_cached.z, rate_of_change.z, dt)
|
|
};
|
|
}
|
|
|
|
#define RK3(handle, in_cached, rate_of_change, dt) \
|
|
rk3_integrate<step_number>(idx, buffer.out, handle, in_cached, rate_of_change, dt)
|
|
*/
|
|
|
|
/*
|
|
* =============================================================================
|
|
* Level 1.3 (Kernels)
|
|
* =============================================================================
|
|
*/
|
|
|
|
static __device__ void
write(AcReal* __restrict__ out[], const int handle, const int idx, const AcReal value)
{
    // Store one value into the output field selected by handle.
    AcReal* field = out[handle];
    field[idx] = value;
}
|
|
|
|
static __device__ void
write(AcReal* __restrict__ out[], const int3 vec, const int idx, const AcReal3 value)
{
    // Vector overload: scatter the three components into the three fields
    // named by the handle triple.
    write(out, vec.x, idx, value.x);
    write(out, vec.y, idx, value.y);
    write(out, vec.z, idx, value.z);
}
|
|
|
|
static __device__ AcReal
read_out(const int idx, AcReal* __restrict__ field[], const int handle)
{
    // Fetch one value from the output field selected by handle.
    const AcReal* buf = field[handle];
    return buf[idx];
}
|
|
|
|
static __device__ AcReal3
read_out(const int idx, AcReal* __restrict__ field[], const int3 handle)
{
    // Vector overload: gather the three components from the three fields
    // named by the handle triple.
    AcReal3 res;
    res.x = read_out(idx, field, handle.x);
    res.y = read_out(idx, field, handle.y);
    res.z = read_out(idx, field, handle.z);
    return res;
}
|
|
|
|
#define WRITE_OUT(handle, value) (write(buffer.out, handle, idx, value))
|
|
#define READ(handle) (read_data(vertexIdx, buffer.in, handle))
|
|
#define READ_OUT(handle) (read_out(idx, buffer.out, handle))
|
|
|
|
// also write for clarity here also, not for the DSL
|
|
//#define WRITE(HANDLE) (write(idx, ...)) s.t. we don't have to insert boilerplat in the mid of the function
|
|
|
|
#define GEN_KERNEL_PARAM_BOILERPLATE \
|
|
const int3 start, const int3 end, VertexBufferArray buffer
|
|
|
|
#define GEN_KERNEL_BUILTIN_VARIABLES_BOILERPLATE() \
|
|
const int3 vertexIdx = (int3){threadIdx.x + blockIdx.x * blockDim.x + start.x,\
|
|
threadIdx.y + blockIdx.y * blockDim.y + start.y,\
|
|
threadIdx.z + blockIdx.z * blockDim.z + start.z};\
|
|
if (vertexIdx.x >= end.x || vertexIdx.y >= end.y || vertexIdx.z >= end.z)\
|
|
return;\
|
|
\
|
|
\
|
|
assert(vertexIdx.x < DCONST_INT(AC_nx_max) && vertexIdx.y < DCONST_INT(AC_ny_max) &&\
|
|
vertexIdx.z < DCONST_INT(AC_nz_max));\
|
|
\
|
|
assert(vertexIdx.x >= DCONST_INT(AC_nx_min) && vertexIdx.y >= DCONST_INT(AC_ny_min) &&\
|
|
vertexIdx.z >= DCONST_INT(AC_nz_min));\
|
|
\
|
|
const int idx = IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z);
|
|
|
|
#include "stencil_process.cuh"
|
|
|
|
/*
|
|
* =============================================================================
|
|
* Level 2 (Host calls)
|
|
* =============================================================================
|
|
*/
|
|
|
|
static AcReal
randf(void)
{
    // Host-side uniform pseudo-random value in [0, 1], from the C library
    // rand(). Not seeded here; determinism follows the process-wide srand state.
    const AcReal r = AcReal(rand());
    return r / AcReal(RAND_MAX);
}
|
|
|
|
/** Launch one RK3 substep over [start, end) asynchronously on `stream`.
 *  step_number selects the substep (0..2, values > 2 dispatch as 2); dt is the
 *  full timestep. Returns AC_SUCCESS after enqueueing (does not synchronize). */
AcResult
rk3_step_async(const cudaStream_t stream, const int& step_number, const int3& start, const int3& end,
               const AcReal dt, VertexBufferArray* buffer)
{
    const dim3 tpb(32, 1, 4);
    /////////////////// Forcing
#if LFORCING
    // NOTE(review): `ff` is static mutable host state rotated a tiny random
    // angle on every call — not reentrant/thread-safe; confirm the host calls
    // this from a single thread.
    const AcReal ff_scale = AcReal(.2);
    static AcReal3 ff = ff_scale * (AcReal3){1, 0, 0};
    const AcReal radians = randf() * AcReal(2*M_PI) / 360 / 8;
    const AcMatrix rotz = create_rotz(radians);
    ff = mul(rotz, ff);
    // Upload the forcing direction and phase to __constant__ memory on the
    // same stream so they are visible before the solve kernels run.
    cudaMemcpyToSymbolAsync(forcing_vec, &ff, sizeof(ff), 0, cudaMemcpyHostToDevice, stream);

    const AcReal ff_phi = AcReal(M_PI);//AcReal(2 * M_PI) * randf();
    cudaMemcpyToSymbolAsync(forcing_phi, &ff_phi, sizeof(ff_phi), 0, cudaMemcpyHostToDevice, stream);
#endif // LFORCING
    //////////////////////////

    const int nx = end.x - start.x;
    const int ny = end.y - start.y;
    const int nz = end.z - start.z;

    // Integer ceil-division replaces the original float ceil(): identical for
    // typical sizes but immune to float rounding for very large extents.
    const dim3 bpg((unsigned int)((nx + (int)tpb.x - 1) / (int)tpb.x),
                   (unsigned int)((ny + (int)tpb.y - 1) / (int)tpb.y),
                   (unsigned int)((nz + (int)tpb.z - 1) / (int)tpb.z));

    // solve is templated on the substep so the RK3 coefficients resolve at
    // compile time; dispatch the runtime step_number to the right instance.
    if (step_number == 0)
        solve<0><<<bpg, tpb, 0, stream>>>(start, end, *buffer, dt);
    else if (step_number == 1)
        solve<1><<<bpg, tpb, 0, stream>>>(start, end, *buffer, dt);
    else
        solve<2><<<bpg, tpb, 0, stream>>>(start, end, *buffer, dt);

    ERRCHK_CUDA_KERNEL(); // catches launch-configuration errors
    return AC_SUCCESS;
}
|
|
|
|
|
|
////////////////REDUCE///////////////////////////
|
|
#include "src/core/math_utils.h" // is_power_of_two
|
|
|
|
/*
|
|
Reduction steps:
|
|
1 of 3: Compute the initial value (a, a*a or exp(a)*exp(a)) and put the result in scratchpad
|
|
2 of 3: Compute most of the reductions into a single block of data
|
|
3 of 3: After all results have been stored into the final block, reduce the data in the final block
|
|
*/
|
|
|
|
// Function pointer definitions
|
|
typedef AcReal (*ReduceFunc)(const AcReal&, const AcReal&);
|
|
typedef AcReal (*ReduceInitialScalFunc)(const AcReal&);
|
|
typedef AcReal (*ReduceInitialVecFunc)(const AcReal&, const AcReal&,
|
|
const AcReal&);
|
|
|
|
// clang-format off
|
|
/* Comparison funcs */
|
|
static __device__ inline AcReal
|
|
dmax(const AcReal& a, const AcReal& b) { return a > b ? a : b; }
|
|
|
|
static __device__ inline AcReal
|
|
dmin(const AcReal& a, const AcReal& b) { return a < b ? a : b; }
|
|
|
|
static __device__ inline AcReal
|
|
dsum(const AcReal& a, const AcReal& b) { return a + b; }
|
|
|
|
/* Function used to determine the values used during reduction */
|
|
static __device__ inline AcReal
|
|
dvalue(const AcReal& a) { return AcReal(a); }
|
|
|
|
static __device__ inline AcReal
|
|
dsquared(const AcReal& a) { return (AcReal)(a*a); }
|
|
|
|
static __device__ inline AcReal
|
|
dexp_squared(const AcReal& a) { return exp(a)*exp(a); }
|
|
|
|
static __device__ inline AcReal
|
|
dlength_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return sqrt(a*a + b*b + c*c); }
|
|
|
|
static __device__ inline AcReal
|
|
dsquared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return dsquared(a) + dsquared(b) + dsquared(c); }
|
|
|
|
static __device__ inline AcReal
|
|
dexp_squared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return dexp_squared(a) + dexp_squared(b) + dexp_squared(c); }
|
|
// clang-format on
|
|
|
|
static __device__ inline bool
oob(const int& i, const int& j, const int& k)
{
    // True when (i, j, k) lies outside the computational-domain extents.
    // Only upper bounds are checked; callers derive non-negative indices
    // from thread/block IDs.
    return i >= d_mesh_info.int_params[AC_nx] ||
           j >= d_mesh_info.int_params[AC_ny] ||
           k >= d_mesh_info.int_params[AC_nz];
}
|
|
|
|
template <ReduceInitialScalFunc reduce_initial>
__global__ void
kernel_reduce_1of3(const __restrict__ AcReal* src, AcReal* dst)
{
    // Reduction step 1 of 3: map each computational-domain vertex of src
    // through reduce_initial (value, square, exp-square) and write it into
    // the compact scratchpad dst.
    const int i = threadIdx.x + blockIdx.x * blockDim.x;
    const int j = threadIdx.y + blockIdx.y * blockDim.y;
    const int k = threadIdx.z + blockIdx.z * blockDim.z;

    if (oob(i, j, k))
        return;

    // src is indexed over the full mesh (ghost zones included); dst over the
    // computational domain only.
    const int src_idx = DEVICE_VTXBUF_IDX(i + d_mesh_info.int_params[AC_nx_min],
                                          j + d_mesh_info.int_params[AC_ny_min],
                                          k + d_mesh_info.int_params[AC_nz_min]);
    dst[DEVICE_1D_COMPDOMAIN_IDX(i, j, k)] = reduce_initial(src[src_idx]);
}
|
|
|
|
template <ReduceInitialVecFunc reduce_initial>
__global__ void
kernel_reduce_1of3_vec(const __restrict__ AcReal* src_a,
                       const __restrict__ AcReal* src_b,
                       const __restrict__ AcReal* src_c, AcReal* dst)
{
    // Reduction step 1 of 3 (vector variant): combine the three component
    // fields at each computational-domain vertex with reduce_initial and
    // write the scalar result into the compact scratchpad dst.
    const int i = threadIdx.x + blockIdx.x * blockDim.x;
    const int j = threadIdx.y + blockIdx.y * blockDim.y;
    const int k = threadIdx.z + blockIdx.z * blockDim.z;

    if (oob(i, j, k))
        return;

    // Full-mesh source index vs. compact computational-domain destination.
    const int src_idx = DEVICE_VTXBUF_IDX(i + d_mesh_info.int_params[AC_nx_min],
                                          j + d_mesh_info.int_params[AC_ny_min],
                                          k + d_mesh_info.int_params[AC_nz_min]);
    dst[DEVICE_1D_COMPDOMAIN_IDX(i, j, k)] = reduce_initial(src_a[src_idx], src_b[src_idx],
                                                            src_c[src_idx]);
}
|
|
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
#define BLOCK_SIZE (1024)
|
|
#define ELEMS_PER_THREAD (32)
|
|
|
|
template <ReduceFunc reduce>
__global__ void
kernel_reduce_2of3(AcReal* src, AcReal* result)
{
    // Reduction step 2 of 3: each thread folds up to ELEMS_PER_THREAD
    // strided elements into a register, then the block reduces them in
    // shared memory. The block's partial result is written back into src at
    // the block's base index; `result` is unused here and filled by
    // kernel_reduce_3of3.
    const int idx = threadIdx.x + blockIdx.x * BLOCK_SIZE * ELEMS_PER_THREAD;
    const int scratchpad_size = DCONST_INT(AC_nxyz);

    // NOTE(review): threads past the end return before __syncthreads(); this
    // is only safe when scratchpad_size is a multiple of BLOCK_SIZE per
    // block — confirm against the launch in reduce_scal.
    if (idx >= scratchpad_size)
        return;

    __shared__ AcReal smem[BLOCK_SIZE];

    AcReal tmp = src[idx];

    for (int i = 1; i < ELEMS_PER_THREAD; ++i) {
        const int src_idx = idx + i * BLOCK_SIZE;
        // BUGFIX: the original wrote NAN into smem here (a dead store — smem
        // is unconditionally overwritten below) and then still executed
        // src[src_idx], reading out of bounds. Stop folding at the data end.
        if (src_idx >= scratchpad_size)
            break;
        tmp = reduce(tmp, src[src_idx]);
    }

    smem[threadIdx.x] = tmp;
    __syncthreads();

    // Shared-memory tree reduction. Every surviving thread runs the same
    // number of iterations, so __syncthreads() is reached uniformly.
    for (int offset = BLOCK_SIZE / 2; offset > 0; offset /= 2) {
        if (threadIdx.x < offset) {
            smem[threadIdx.x] = reduce(smem[threadIdx.x], smem[threadIdx.x + offset]);
        }
        __syncthreads();
    }

    if (threadIdx.x == 0)
        src[idx] = smem[0];
}
|
|
|
|
// Reduction step 3 of 3: serially fold the per-block partial results (left at
// each block's base index by kernel_reduce_2of3) into a single value.
// NOTE(review): correct only when launched with a single thread (<<<1, 1>>>),
// as reduce_scal does — with more threads, every thread would write *result
// and threads with idx >= scratchpad_size would read out of bounds.
template <ReduceFunc reduce>
__global__ void
kernel_reduce_3of3(const __restrict__ AcReal* src, AcReal* result)
{
    const int scratchpad_size = DCONST_INT(AC_nxyz);
    const int idx = threadIdx.x + blockIdx.x * BLOCK_SIZE * ELEMS_PER_THREAD;
    AcReal tmp = src[idx];
    // Partial results sit one block-span apart; fold them all into tmp.
    const int block_offset = BLOCK_SIZE * ELEMS_PER_THREAD;
    for (int i = 1; idx + i * block_offset < scratchpad_size; ++i)
        tmp = reduce(tmp, src[idx + i * block_offset]);

    *result = tmp;
}
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
|
|
AcReal
reduce_scal(const cudaStream_t stream,
            const ReductionType& rtype, const int& nx, const int& ny,
            const int& nz, const AcReal* vtxbuf,
            AcReal* scratchpad, AcReal* reduce_result)
{
    // Reduces a single vertex buffer over the nx*ny*nz computational domain
    // to a scalar, using a three-stage pipeline on `stream`:
    //   1of3: map each point into scratchpad (identity/square/exp-square)
    //   2of3: per-block tree reduction; partials stored back into scratchpad
    //   3of3: single-thread fold of the partials into *reduce_result
    // NOTE(review): the same preconditions checked in reduce_vec
    // (BLOCK_SIZE <= nx*ny*nz, power-of-two dimensions) presumably apply
    // here as well, since the same 2of3/3of3 kernels are used — confirm.
    const dim3 tpb(32, 4, 1);
    const dim3 bpg(int(ceil(AcReal(nx) / tpb.x)), int(ceil(AcReal(ny) / tpb.y)),
                   int(ceil(AcReal(nz) / tpb.z)));

    const int scratchpad_size = nx * ny * nz;
    // Stage-2 grid: one block per BLOCK_SIZE * ELEMS_PER_THREAD elements.
    const int bpg2 = (unsigned int)ceil(AcReal(scratchpad_size) /
                                        AcReal(ELEMS_PER_THREAD * BLOCK_SIZE));

    // Stage 1: select the per-point map according to the reduction type.
    if (rtype == RTYPE_MAX || rtype == RTYPE_MIN) {
        kernel_reduce_1of3<dvalue><<<bpg, tpb, 0, stream>>>(vtxbuf, scratchpad);
    } else if (rtype == RTYPE_RMS) {
        kernel_reduce_1of3<dsquared><<<bpg, tpb, 0, stream>>>(vtxbuf, scratchpad);
    } else if (rtype == RTYPE_RMS_EXP) {
        kernel_reduce_1of3<dexp_squared><<<bpg, tpb, 0, stream>>>(vtxbuf, scratchpad);
    } else {
        ERROR("Unrecognized RTYPE");
    }

    // Stages 2 and 3: fold with the reduction operator matching the type.
    if (rtype == RTYPE_MAX) {
        kernel_reduce_2of3<dmax><<<bpg2, BLOCK_SIZE, 0, stream>>>(scratchpad, reduce_result);
        kernel_reduce_3of3<dmax><<<1, 1, 0, stream>>>(scratchpad, reduce_result);
    } else if (rtype == RTYPE_MIN) {
        kernel_reduce_2of3<dmin><<<bpg2, BLOCK_SIZE, 0, stream>>>(scratchpad, reduce_result);
        kernel_reduce_3of3<dmin><<<1, 1, 0, stream>>>(scratchpad, reduce_result);
    } else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
        kernel_reduce_2of3<dsum><<<bpg2, BLOCK_SIZE, 0, stream>>>(scratchpad, reduce_result);
        kernel_reduce_3of3<dsum><<<1, 1, 0, stream>>>(scratchpad, reduce_result);
    } else {
        ERROR("Unrecognized RTYPE");
    }

    // Consistency fix (matches reduce_vec): copy on the same stream the
    // kernels were launched on and synchronize that stream explicitly,
    // instead of a plain cudaMemcpy that relies on legacy default-stream
    // device-wide synchronization for correct ordering.
    AcReal result;
    cudaMemcpyAsync(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost, stream);
    cudaStreamSynchronize(stream);
    return result;
}
|
|
|
|
AcReal
reduce_vec(const cudaStream_t stream,
           const ReductionType& rtype, const int& nx, const int& ny, const int& nz,
           const AcReal* vtxbuf0, const AcReal* vtxbuf1, const AcReal* vtxbuf2,
           AcReal* scratchpad, AcReal* reduce_result)
{
    // Reduces a three-component vector field over the nx*ny*nz domain to a
    // scalar on `stream`: stage 1 maps each point (length/square/exp-square
    // of the vector) into scratchpad, stage 2 produces per-block partials
    // in-place, stage 3 folds them into *reduce_result.
    const dim3 tpb(32, 4, 1);
    const dim3 bpg(int(ceil(float(nx) / tpb.x)),
                   int(ceil(float(ny) / tpb.y)),
                   int(ceil(float(nz) / tpb.z)));

    const int scratchpad_size = nx * ny * nz;
    // Stage-2 grid: one block per BLOCK_SIZE * ELEMS_PER_THREAD elements.
    const int bpg2 = (unsigned int)ceil(float(scratchpad_size) /
                                        float(ELEMS_PER_THREAD * BLOCK_SIZE));

    // Constraints of this quick & efficient reduction:
    // - BLOCK_SIZE must not exceed the computational domain size, otherwise
    //   the second half of kernel_reduce_2of3 would need extra bounds checks.
    // - BLOCK_SIZE must be even so the tree reduction splits cleanly.
    // - Mesh dimensions must be powers of two: with other sizes,
    //   uninitialized values of the shared scratch array in
    //   kernel_reduce_2of3 leak into the comparisons (visible as NANs if
    //   smem is NAN-initialized).
    ERRCHK_ALWAYS(BLOCK_SIZE <= scratchpad_size);
    ERRCHK_ALWAYS(!(BLOCK_SIZE % 2));
    ERRCHK_ALWAYS(is_power_of_two(nx));
    ERRCHK_ALWAYS(is_power_of_two(ny));
    ERRCHK_ALWAYS(is_power_of_two(nz));

    // Stage 1: choose the per-point map for the reduction type.
    switch (rtype) {
    case RTYPE_MAX:
    case RTYPE_MIN:
        kernel_reduce_1of3_vec<dlength_vec><<<bpg, tpb, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, scratchpad);
        break;
    case RTYPE_RMS:
        kernel_reduce_1of3_vec<dsquared_vec><<<bpg, tpb, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, scratchpad);
        break;
    case RTYPE_RMS_EXP:
        kernel_reduce_1of3_vec<dexp_squared_vec><<<bpg, tpb, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, scratchpad);
        break;
    default:
        ERROR("Unrecognized RTYPE");
    }

    // Stages 2 and 3: fold with the matching reduction operator.
    switch (rtype) {
    case RTYPE_MAX:
        kernel_reduce_2of3<dmax><<<bpg2, BLOCK_SIZE, 0, stream>>>(scratchpad, reduce_result);
        kernel_reduce_3of3<dmax><<<1, 1, 0, stream>>>(scratchpad, reduce_result);
        break;
    case RTYPE_MIN:
        kernel_reduce_2of3<dmin><<<bpg2, BLOCK_SIZE, 0, stream>>>(scratchpad, reduce_result);
        kernel_reduce_3of3<dmin><<<1, 1, 0, stream>>>(scratchpad, reduce_result);
        break;
    case RTYPE_RMS:
    case RTYPE_RMS_EXP:
        kernel_reduce_2of3<dsum><<<bpg2, BLOCK_SIZE, 0, stream>>>(scratchpad, reduce_result);
        kernel_reduce_3of3<dsum><<<1, 1, 0, stream>>>(scratchpad, reduce_result);
        break;
    default:
        ERROR("Unrecognized RTYPE");
    }

    // Blocking readback of the final scalar: copy on the launch stream and
    // wait for it to drain before returning.
    AcReal result;
    cudaMemcpyAsync(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost, stream);
    cudaStreamSynchronize(stream);
    return result;
}
|