diff --git a/src/core/device.cu b/src/core/device.cu index e7eee90..9415a07 100644 --- a/src/core/device.cu +++ b/src/core/device.cu @@ -200,13 +200,27 @@ reduceScal(const Device device, const StreamType stream_type, const ReductionTyp { cudaSetDevice(device->id); + const int3 start = (int3) {device->local_config.int_params[AC_nx_min], + device->local_config.int_params[AC_ny_min], + device->local_config.int_params[AC_nz_min] + }; + + const int3 end = (int3) {device->local_config.int_params[AC_nx_max], + device->local_config.int_params[AC_ny_max], + device->local_config.int_params[AC_nz_max] + }; + + *result = reduce_scal(device->streams[stream_type], rtype, + start, end, device->vba.in[vtxbuf_handle], + device->reduce_scratchpad, device->reduce_result); + /* *result = reduce_scal(device->streams[stream_type], rtype, device->local_config.int_params[AC_nx], device->local_config.int_params[AC_ny], device->local_config.int_params[AC_nz], device->vba.in[vtxbuf_handle], device->reduce_scratchpad, device->reduce_result); - + */ return AC_SUCCESS; } @@ -220,6 +234,22 @@ reduceVec(const Device device, const StreamType stream_type, { cudaSetDevice(device->id); + const int3 start = (int3) {device->local_config.int_params[AC_nx_min], + device->local_config.int_params[AC_ny_min], + device->local_config.int_params[AC_nz_min] + }; + + const int3 end = (int3) {device->local_config.int_params[AC_nx_max], + device->local_config.int_params[AC_ny_max], + device->local_config.int_params[AC_nz_max] + }; + + *result = reduce_vec(device->streams[stream_type], rtype, start, end, + device->vba.in[vtxbuf0], + device->vba.in[vtxbuf1], + device->vba.in[vtxbuf2], + device->reduce_scratchpad, device->reduce_result); + /* *result = reduce_vec(device->streams[stream_type], rtype, device->local_config.int_params[AC_nx], device->local_config.int_params[AC_ny], @@ -228,7 +258,7 @@ reduceVec(const Device device, const StreamType stream_type, device->vba.in[vtxbuf1], device->vba.in[vtxbuf2], device->reduce_scratchpad, device->reduce_result); - + */ return AC_SUCCESS; } diff --git a/src/core/kernels/kernels.cuh b/src/core/kernels/kernels.cuh index d7985c6..98dbd99 100644 --- a/src/core/kernels/kernels.cuh +++ b/src/core/kernels/kernels.cuh @@ -811,6 +811,7 @@ typedef AcReal (*ReduceInitialVecFunc)(const AcReal&, const AcReal&, const AcReal&); typedef AcReal (*FilterFunc)(const AcReal&); +typedef AcReal (*FilterFuncVec)(const AcReal&, const AcReal&, const AcReal&); // clang-format off /* Comparison funcs */ @@ -870,6 +871,35 @@ kernel_filter(const __restrict__ AcReal* src, const int3 start, const int3 end, dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(src[IDX(src_idx)]); } +template +__global__ void +kernel_filter_vec(const __restrict__ AcReal* src0, + const __restrict__ AcReal* src1, + const __restrict__ AcReal* src2, + const int3 start, const int3 end, AcReal* dst) +{ + const int3 src_idx = (int3) { + start.x + threadIdx.x + blockIdx.x * blockDim.x, + start.y + threadIdx.y + blockIdx.y * blockDim.y, + start.z + threadIdx.z + blockIdx.z * blockDim.z + }; + + const int nx = end.x - start.x; + const int ny = end.y - start.y; + const int nz = end.z - start.z; + const int3 dst_idx = (int3) { + threadIdx.x + blockIdx.x * blockDim.x, + threadIdx.y + blockIdx.y * blockDim.y, + threadIdx.z + blockIdx.z * blockDim.z + }; + + assert(src_idx.x < DCONST_INT(AC_nx_max) && src_idx.y < DCONST_INT(AC_ny_max) && src_idx.z < DCONST_INT(AC_nz_max)); + assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz); + assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz); + + dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(src0[IDX(src_idx)], src1[IDX(src_idx)], src2[IDX(src_idx)]); +} + template __global__ void kernel_reduce(AcReal* scratchpad, const int num_elems) @@ -920,6 +950,107 @@ kernel_reduce_block(const __restrict__ AcReal* scratchpad, } +AcReal +reduce_scal(const cudaStream_t stream, const ReductionType rtype, + const int3& start, const int3& end, + const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result) +{ + const unsigned nx = end.x - start.x; + const unsigned ny = end.y - start.y; + const unsigned nz = end.z - start.z; + const unsigned num_elems = nx * ny * nz; + + const dim3 tpb_filter(32, 4, 1); + const dim3 bpg_filter( + (unsigned int)ceil(nx / AcReal(tpb_filter.x)), + (unsigned int)ceil(ny / AcReal(tpb_filter.y)), + (unsigned int)ceil(nz / AcReal(tpb_filter.z)) + ); + + const int tpb_reduce = 128; + const int bpg_reduce = num_elems / tpb_reduce; + + ERRCHK(nx >= tpb_filter.x); + ERRCHK(ny >= tpb_filter.y); + ERRCHK(nz >= tpb_filter.z); + ERRCHK(tpb_reduce <= num_elems); + ERRCHK(nx * ny * nz % 2 == 0); + + if (rtype == RTYPE_MAX) { + kernel_filter<<>>(vtxbuf, start, end, scratchpad); + kernel_reduce<<>>(scratchpad, num_elems); + kernel_reduce_block<<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result); + } else if (rtype == RTYPE_MIN) { + kernel_filter<<>>(vtxbuf, start, end, scratchpad); + kernel_reduce<<>>(scratchpad, num_elems); + kernel_reduce_block<<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result); + } else if (rtype == RTYPE_RMS) { + kernel_filter<<>>(vtxbuf, start, end, scratchpad); + kernel_reduce<<>>(scratchpad, num_elems); + kernel_reduce_block<<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result); + } else if (rtype == RTYPE_RMS_EXP) { + kernel_filter<<>>(vtxbuf, start, end, scratchpad); + kernel_reduce<<>>(scratchpad, num_elems); + kernel_reduce_block<<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result); + } else { + ERROR("Unrecognized rtype"); + } + AcReal result; + cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost); + return result; +} + +AcReal +reduce_vec(const cudaStream_t stream, const ReductionType rtype, + const int3& start, const int3& end, + const AcReal* vtxbuf0, const AcReal* vtxbuf1, const AcReal* vtxbuf2, + AcReal* scratchpad, AcReal* reduce_result) +{ + const unsigned nx = end.x - start.x; + const unsigned ny = end.y - start.y; + const unsigned nz = end.z - start.z; + const unsigned num_elems = nx * ny * nz; + + const dim3 tpb_filter(32, 4, 1); + const dim3 bpg_filter( + (unsigned int)ceil(nx / AcReal(tpb_filter.x)), + (unsigned int)ceil(ny / AcReal(tpb_filter.y)), + (unsigned int)ceil(nz / AcReal(tpb_filter.z)) + ); + + const int tpb_reduce = 128; + const int bpg_reduce = num_elems / tpb_reduce; + + ERRCHK(nx >= tpb_filter.x); + ERRCHK(ny >= tpb_filter.y); + ERRCHK(nz >= tpb_filter.z); + ERRCHK(tpb_reduce <= num_elems); + ERRCHK(nx * ny * nz % 2 == 0); + + if (rtype == RTYPE_MAX) { + kernel_filter_vec<<>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad); + kernel_reduce<<>>(scratchpad, num_elems); + kernel_reduce_block<<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result); + } else if (rtype == RTYPE_MIN) { + kernel_filter_vec<<>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad); + kernel_reduce<<>>(scratchpad, num_elems); + kernel_reduce_block<<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result); + } else if (rtype == RTYPE_RMS) { + kernel_filter_vec<<>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad); + kernel_reduce<<>>(scratchpad, num_elems); + kernel_reduce_block<<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result); + } else if (rtype == RTYPE_RMS_EXP) { + kernel_filter_vec<<>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad); + kernel_reduce<<>>(scratchpad, num_elems); + kernel_reduce_block<<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result); + } else { + ERROR("Unrecognized rtype"); + } + AcReal result; + cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost); + return result; +} + static __device__ inline bool oob(const int& i, const int& j, const int& k) { @@ -1037,7 +1168,7 @@ kernel_reduce_3of3(const __restrict__ AcReal* src, AcReal* result) ////////////////////////////////////////////////////////////////////////////// AcReal -reduce_scal(const cudaStream_t stream, +reduce_scal2(const cudaStream_t stream, const ReductionType& rtype, const int& nx, const int& ny, const int& nz, const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result) @@ -1094,7 +1225,7 @@ reduce_scal(const cudaStream_t stream, } AcReal -reduce_vec(const cudaStream_t stream, +reduce_vec2(const cudaStream_t stream, const ReductionType& rtype, const int& nx, const int& ny, const int& nz, const AcReal* vtxbuf0, const AcReal* vtxbuf1, const AcReal* vtxbuf2, AcReal* scratchpad, AcReal* reduce_result) diff --git a/src/standalone/autotest.cc b/src/standalone/autotest.cc index bdbd9e3..7c44be3 100644 --- a/src/standalone/autotest.cc +++ b/src/standalone/autotest.cc @@ -457,25 +457,16 @@ run_autotest(void) if (STENCIL_ORDER > 6) printf("WARNING!!! If the stencil order is larger than the computational domain some vertices may be done twice (f.ex. doing inner and outer domains separately and some of the front/back/left/right/etc slabs collide). The mesh must be large enough s.t. this doesn't happen."); - /* - const vec3i test_dims[] = { // - {15, 11, 13}, // - {17, 61, 127}, // - {511, 17, 16}, // - {64, 64, 8}, // - {32, 32, 64}, // - {64, 32, 32}, // - {128, 64, 32}}; - */ - const vec3i test_dims[] = {{512, 16, 32}, // - {64, 64, 32}, // - {32, 32, 64}, // - {64, 32, 32}, // - {128, 64, 32}}; - //const vec3i test_dims[] = {{256,256,256}}; - //const vec3i test_dims[] = {{256,256,256}}; - //const vec3i test_dims[] = {{32, 32, 32}}; + const vec3i test_dims[] = { + {32, 32, 32}, + {64, 32, 32}, + {32, 64, 32}, + {32, 32, 64}, + {64, 64, 32}, + {64, 32, 64}, + {32, 64, 64} + }; int num_failures = 0; for (size_t i = 0; i < ARRAY_SIZE(test_dims); ++i) {