Rewrote reductions, now much simpler than before
This commit is contained in:
@@ -200,13 +200,27 @@ reduceScal(const Device device, const StreamType stream_type, const ReductionTyp
|
|||||||
{
|
{
|
||||||
cudaSetDevice(device->id);
|
cudaSetDevice(device->id);
|
||||||
|
|
||||||
|
const int3 start = (int3) {device->local_config.int_params[AC_nx_min],
|
||||||
|
device->local_config.int_params[AC_ny_min],
|
||||||
|
device->local_config.int_params[AC_nz_min]
|
||||||
|
};
|
||||||
|
|
||||||
|
const int3 end = (int3) {device->local_config.int_params[AC_nx_max],
|
||||||
|
device->local_config.int_params[AC_ny_max],
|
||||||
|
device->local_config.int_params[AC_nz_max]
|
||||||
|
};
|
||||||
|
|
||||||
|
*result = reduce_scal(device->streams[stream_type], rtype,
|
||||||
|
start, end, device->vba.in[vtxbuf_handle],
|
||||||
|
device->reduce_scratchpad, device->reduce_result);
|
||||||
|
/*
|
||||||
*result = reduce_scal(device->streams[stream_type], rtype,
|
*result = reduce_scal(device->streams[stream_type], rtype,
|
||||||
device->local_config.int_params[AC_nx],
|
device->local_config.int_params[AC_nx],
|
||||||
device->local_config.int_params[AC_ny],
|
device->local_config.int_params[AC_ny],
|
||||||
device->local_config.int_params[AC_nz],
|
device->local_config.int_params[AC_nz],
|
||||||
device->vba.in[vtxbuf_handle],
|
device->vba.in[vtxbuf_handle],
|
||||||
device->reduce_scratchpad, device->reduce_result);
|
device->reduce_scratchpad, device->reduce_result);
|
||||||
|
*/
|
||||||
return AC_SUCCESS;
|
return AC_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -220,6 +234,22 @@ reduceVec(const Device device, const StreamType stream_type,
|
|||||||
{
|
{
|
||||||
cudaSetDevice(device->id);
|
cudaSetDevice(device->id);
|
||||||
|
|
||||||
|
const int3 start = (int3) {device->local_config.int_params[AC_nx_min],
|
||||||
|
device->local_config.int_params[AC_ny_min],
|
||||||
|
device->local_config.int_params[AC_nz_min]
|
||||||
|
};
|
||||||
|
|
||||||
|
const int3 end = (int3) {device->local_config.int_params[AC_nx_max],
|
||||||
|
device->local_config.int_params[AC_ny_max],
|
||||||
|
device->local_config.int_params[AC_nz_max]
|
||||||
|
};
|
||||||
|
|
||||||
|
*result = reduce_vec(device->streams[stream_type], rtype, start, end,
|
||||||
|
device->vba.in[vtxbuf0],
|
||||||
|
device->vba.in[vtxbuf1],
|
||||||
|
device->vba.in[vtxbuf2],
|
||||||
|
device->reduce_scratchpad, device->reduce_result);
|
||||||
|
/*
|
||||||
*result = reduce_vec(device->streams[stream_type], rtype,
|
*result = reduce_vec(device->streams[stream_type], rtype,
|
||||||
device->local_config.int_params[AC_nx],
|
device->local_config.int_params[AC_nx],
|
||||||
device->local_config.int_params[AC_ny],
|
device->local_config.int_params[AC_ny],
|
||||||
@@ -228,7 +258,7 @@ reduceVec(const Device device, const StreamType stream_type,
|
|||||||
device->vba.in[vtxbuf1],
|
device->vba.in[vtxbuf1],
|
||||||
device->vba.in[vtxbuf2],
|
device->vba.in[vtxbuf2],
|
||||||
device->reduce_scratchpad, device->reduce_result);
|
device->reduce_scratchpad, device->reduce_result);
|
||||||
|
*/
|
||||||
return AC_SUCCESS;
|
return AC_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -811,6 +811,7 @@ typedef AcReal (*ReduceInitialVecFunc)(const AcReal&, const AcReal&,
|
|||||||
const AcReal&);
|
const AcReal&);
|
||||||
|
|
||||||
typedef AcReal (*FilterFunc)(const AcReal&);
|
typedef AcReal (*FilterFunc)(const AcReal&);
|
||||||
|
typedef AcReal (*FilterFuncVec)(const AcReal&, const AcReal&, const AcReal&);
|
||||||
|
|
||||||
// clang-format off
|
// clang-format off
|
||||||
/* Comparison funcs */
|
/* Comparison funcs */
|
||||||
@@ -870,6 +871,35 @@ kernel_filter(const __restrict__ AcReal* src, const int3 start, const int3 end,
|
|||||||
dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(src[IDX(src_idx)]);
|
dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(src[IDX(src_idx)]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <FilterFuncVec filter>
|
||||||
|
__global__ void
|
||||||
|
kernel_filter_vec(const __restrict__ AcReal* src0,
|
||||||
|
const __restrict__ AcReal* src1,
|
||||||
|
const __restrict__ AcReal* src2,
|
||||||
|
const int3 start, const int3 end, AcReal* dst)
|
||||||
|
{
|
||||||
|
const int3 src_idx = (int3) {
|
||||||
|
start.x + threadIdx.x + blockIdx.x * blockDim.x,
|
||||||
|
start.y + threadIdx.y + blockIdx.y * blockDim.y,
|
||||||
|
start.z + threadIdx.z + blockIdx.z * blockDim.z
|
||||||
|
};
|
||||||
|
|
||||||
|
const int nx = end.x - start.x;
|
||||||
|
const int ny = end.y - start.y;
|
||||||
|
const int nz = end.z - start.z;
|
||||||
|
const int3 dst_idx = (int3) {
|
||||||
|
threadIdx.x + blockIdx.x * blockDim.x,
|
||||||
|
threadIdx.y + blockIdx.y * blockDim.y,
|
||||||
|
threadIdx.z + blockIdx.z * blockDim.z
|
||||||
|
};
|
||||||
|
|
||||||
|
assert(src_idx.x < DCONST_INT(AC_nx_max) && src_idx.y < DCONST_INT(AC_ny_max) && src_idx.z < DCONST_INT(AC_nz_max));
|
||||||
|
assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
|
||||||
|
assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);
|
||||||
|
|
||||||
|
dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(src0[IDX(src_idx)], src1[IDX(src_idx)], src2[IDX(src_idx)]);
|
||||||
|
}
|
||||||
|
|
||||||
template <ReduceFunc reduce>
|
template <ReduceFunc reduce>
|
||||||
__global__ void
|
__global__ void
|
||||||
kernel_reduce(AcReal* scratchpad, const int num_elems)
|
kernel_reduce(AcReal* scratchpad, const int num_elems)
|
||||||
@@ -920,6 +950,107 @@ kernel_reduce_block(const __restrict__ AcReal* scratchpad,
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
AcReal
|
||||||
|
reduce_scal(const cudaStream_t stream, const ReductionType rtype,
|
||||||
|
const int3& start, const int3& end,
|
||||||
|
const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result)
|
||||||
|
{
|
||||||
|
const unsigned nx = end.x - start.x;
|
||||||
|
const unsigned ny = end.y - start.y;
|
||||||
|
const unsigned nz = end.z - start.z;
|
||||||
|
const unsigned num_elems = nx * ny * nz;
|
||||||
|
|
||||||
|
const dim3 tpb_filter(32, 4, 1);
|
||||||
|
const dim3 bpg_filter(
|
||||||
|
(unsigned int)ceil(nx / AcReal(tpb_filter.x)),
|
||||||
|
(unsigned int)ceil(ny / AcReal(tpb_filter.y)),
|
||||||
|
(unsigned int)ceil(nz / AcReal(tpb_filter.z))
|
||||||
|
);
|
||||||
|
|
||||||
|
const int tpb_reduce = 128;
|
||||||
|
const int bpg_reduce = num_elems / tpb_reduce;
|
||||||
|
|
||||||
|
ERRCHK(nx >= tpb_filter.x);
|
||||||
|
ERRCHK(ny >= tpb_filter.y);
|
||||||
|
ERRCHK(nz >= tpb_filter.z);
|
||||||
|
ERRCHK(tpb_reduce <= num_elems);
|
||||||
|
ERRCHK(nx * ny * nz % 2 == 0);
|
||||||
|
|
||||||
|
if (rtype == RTYPE_MAX) {
|
||||||
|
kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
|
||||||
|
kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||||
|
kernel_reduce_block<dmax><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||||
|
} else if (rtype == RTYPE_MIN) {
|
||||||
|
kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
|
||||||
|
kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||||
|
kernel_reduce_block<dmin><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||||
|
} else if (rtype == RTYPE_RMS) {
|
||||||
|
kernel_filter<dsquared><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
|
||||||
|
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||||
|
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||||
|
} else if (rtype == RTYPE_RMS_EXP) {
|
||||||
|
kernel_filter<dexp_squared><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
|
||||||
|
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||||
|
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||||
|
} else {
|
||||||
|
ERROR("Unrecognized rtype");
|
||||||
|
}
|
||||||
|
AcReal result;
|
||||||
|
cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
AcReal
|
||||||
|
reduce_vec(const cudaStream_t stream, const ReductionType rtype,
|
||||||
|
const int3& start, const int3& end,
|
||||||
|
const AcReal* vtxbuf0, const AcReal* vtxbuf1, const AcReal* vtxbuf2,
|
||||||
|
AcReal* scratchpad, AcReal* reduce_result)
|
||||||
|
{
|
||||||
|
const unsigned nx = end.x - start.x;
|
||||||
|
const unsigned ny = end.y - start.y;
|
||||||
|
const unsigned nz = end.z - start.z;
|
||||||
|
const unsigned num_elems = nx * ny * nz;
|
||||||
|
|
||||||
|
const dim3 tpb_filter(32, 4, 1);
|
||||||
|
const dim3 bpg_filter(
|
||||||
|
(unsigned int)ceil(nx / AcReal(tpb_filter.x)),
|
||||||
|
(unsigned int)ceil(ny / AcReal(tpb_filter.y)),
|
||||||
|
(unsigned int)ceil(nz / AcReal(tpb_filter.z))
|
||||||
|
);
|
||||||
|
|
||||||
|
const int tpb_reduce = 128;
|
||||||
|
const int bpg_reduce = num_elems / tpb_reduce;
|
||||||
|
|
||||||
|
ERRCHK(nx >= tpb_filter.x);
|
||||||
|
ERRCHK(ny >= tpb_filter.y);
|
||||||
|
ERRCHK(nz >= tpb_filter.z);
|
||||||
|
ERRCHK(tpb_reduce <= num_elems);
|
||||||
|
ERRCHK(nx * ny * nz % 2 == 0);
|
||||||
|
|
||||||
|
if (rtype == RTYPE_MAX) {
|
||||||
|
kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
|
||||||
|
kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||||
|
kernel_reduce_block<dmax><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||||
|
} else if (rtype == RTYPE_MIN) {
|
||||||
|
kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
|
||||||
|
kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||||
|
kernel_reduce_block<dmin><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||||
|
} else if (rtype == RTYPE_RMS) {
|
||||||
|
kernel_filter_vec<dsquared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
|
||||||
|
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||||
|
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||||
|
} else if (rtype == RTYPE_RMS_EXP) {
|
||||||
|
kernel_filter_vec<dexp_squared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
|
||||||
|
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||||
|
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||||
|
} else {
|
||||||
|
ERROR("Unrecognized rtype");
|
||||||
|
}
|
||||||
|
AcReal result;
|
||||||
|
cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
static __device__ inline bool
|
static __device__ inline bool
|
||||||
oob(const int& i, const int& j, const int& k)
|
oob(const int& i, const int& j, const int& k)
|
||||||
{
|
{
|
||||||
@@ -1037,7 +1168,7 @@ kernel_reduce_3of3(const __restrict__ AcReal* src, AcReal* result)
|
|||||||
//////////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
AcReal
|
AcReal
|
||||||
reduce_scal(const cudaStream_t stream,
|
reduce_scal2(const cudaStream_t stream,
|
||||||
const ReductionType& rtype, const int& nx, const int& ny,
|
const ReductionType& rtype, const int& nx, const int& ny,
|
||||||
const int& nz, const AcReal* vtxbuf,
|
const int& nz, const AcReal* vtxbuf,
|
||||||
AcReal* scratchpad, AcReal* reduce_result)
|
AcReal* scratchpad, AcReal* reduce_result)
|
||||||
@@ -1094,7 +1225,7 @@ reduce_scal(const cudaStream_t stream,
|
|||||||
}
|
}
|
||||||
|
|
||||||
AcReal
|
AcReal
|
||||||
reduce_vec(const cudaStream_t stream,
|
reduce_vec2(const cudaStream_t stream,
|
||||||
const ReductionType& rtype, const int& nx, const int& ny, const int& nz,
|
const ReductionType& rtype, const int& nx, const int& ny, const int& nz,
|
||||||
const AcReal* vtxbuf0, const AcReal* vtxbuf1, const AcReal* vtxbuf2,
|
const AcReal* vtxbuf0, const AcReal* vtxbuf1, const AcReal* vtxbuf2,
|
||||||
AcReal* scratchpad, AcReal* reduce_result)
|
AcReal* scratchpad, AcReal* reduce_result)
|
||||||
|
@@ -457,25 +457,16 @@ run_autotest(void)
|
|||||||
|
|
||||||
if (STENCIL_ORDER > 6)
|
if (STENCIL_ORDER > 6)
|
||||||
printf("WARNING!!! If the stencil order is larger than the computational domain some vertices may be done twice (f.ex. doing inner and outer domains separately and some of the front/back/left/right/etc slabs collide). The mesh must be large enough s.t. this doesn't happen.");
|
printf("WARNING!!! If the stencil order is larger than the computational domain some vertices may be done twice (f.ex. doing inner and outer domains separately and some of the front/back/left/right/etc slabs collide). The mesh must be large enough s.t. this doesn't happen.");
|
||||||
/*
|
|
||||||
const vec3i test_dims[] = { //
|
|
||||||
{15, 11, 13}, //
|
|
||||||
{17, 61, 127}, //
|
|
||||||
{511, 17, 16}, //
|
|
||||||
{64, 64, 8}, //
|
|
||||||
{32, 32, 64}, //
|
|
||||||
{64, 32, 32}, //
|
|
||||||
{128, 64, 32}};
|
|
||||||
*/
|
|
||||||
const vec3i test_dims[] = {{512, 16, 32}, //
|
|
||||||
{64, 64, 32}, //
|
|
||||||
{32, 32, 64}, //
|
|
||||||
{64, 32, 32}, //
|
|
||||||
{128, 64, 32}};
|
|
||||||
|
|
||||||
//const vec3i test_dims[] = {{256,256,256}};
|
const vec3i test_dims[] = {
|
||||||
//const vec3i test_dims[] = {{256,256,256}};
|
{32, 32, 32},
|
||||||
//const vec3i test_dims[] = {{32, 32, 32}};
|
{64, 32, 32},
|
||||||
|
{32, 64, 32},
|
||||||
|
{32, 32, 64},
|
||||||
|
{64, 64, 32},
|
||||||
|
{64, 32, 64},
|
||||||
|
{32, 64, 64}
|
||||||
|
};
|
||||||
|
|
||||||
int num_failures = 0;
|
int num_failures = 0;
|
||||||
for (size_t i = 0; i < ARRAY_SIZE(test_dims); ++i) {
|
for (size_t i = 0; i < ARRAY_SIZE(test_dims); ++i) {
|
||||||
|
Reference in New Issue
Block a user