From c9f26d6e5833c792652b1ca9849faaae7ab57f50 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@aalto.fi>
Date: Mon, 17 Jun 2019 20:44:37 +0300
Subject: [PATCH] Cleanup

---
 src/core/device.cu           |  18 ---
 src/core/kernels/kernels.cuh | 244 +----------------------------------
 2 files changed, 2 insertions(+), 260 deletions(-)

diff --git a/src/core/device.cu b/src/core/device.cu
index 9415a07..3096405 100644
--- a/src/core/device.cu
+++ b/src/core/device.cu
@@ -213,14 +213,6 @@ reduceScal(const Device device, const StreamType stream_type, const ReductionTyp
     *result = reduce_scal(device->streams[stream_type], rtype,
                           start, end, device->vba.in[vtxbuf_handle],
                           device->reduce_scratchpad, device->reduce_result);
-    /*
-    *result = reduce_scal(device->streams[stream_type], rtype,
-                          device->local_config.int_params[AC_nx],
-                          device->local_config.int_params[AC_ny],
-                          device->local_config.int_params[AC_nz],
-                          device->vba.in[vtxbuf_handle],
-                          device->reduce_scratchpad, device->reduce_result);
-                          */
     return AC_SUCCESS;
 }
 
@@ -249,16 +241,6 @@ reduceVec(const Device device, const StreamType stream_type,
                           device->vba.in[vtxbuf1],
                           device->vba.in[vtxbuf2],
                           device->reduce_scratchpad, device->reduce_result);
-    /*
-    *result = reduce_vec(device->streams[stream_type], rtype,
-                         device->local_config.int_params[AC_nx],
-                         device->local_config.int_params[AC_ny],
-                         device->local_config.int_params[AC_nz],
-                         device->vba.in[vtxbuf0],
-                         device->vba.in[vtxbuf1],
-                         device->vba.in[vtxbuf2],
-                         device->reduce_scratchpad, device->reduce_result);
-    */
     return AC_SUCCESS;
 }
 
diff --git a/src/core/kernels/kernels.cuh b/src/core/kernels/kernels.cuh
index 98dbd99..278f2de 100644
--- a/src/core/kernels/kernels.cuh
+++ b/src/core/kernels/kernels.cuh
@@ -805,13 +805,9 @@ Reduction steps:
 */
 
 // Function pointer definitions
-typedef AcReal (*ReduceFunc)(const AcReal&, const AcReal&);
-typedef AcReal (*ReduceInitialScalFunc)(const AcReal&);
-typedef AcReal (*ReduceInitialVecFunc)(const AcReal&, const AcReal&,
-                                       const AcReal&);
-
 typedef AcReal (*FilterFunc)(const AcReal&);
 typedef AcReal (*FilterFuncVec)(const AcReal&, const AcReal&, const AcReal&);
+typedef AcReal (*ReduceFunc)(const AcReal&, const AcReal&);
 
 // clang-format off
 /* Comparison funcs */
@@ -857,7 +853,6 @@ kernel_filter(const __restrict__ AcReal* src, const int3 start, const int3 end,
 
     const int nx = end.x - start.x;
     const int ny = end.y - start.y;
-    const int nz = end.z - start.z;
     const int3 dst_idx = (int3) {
         threadIdx.x + blockIdx.x * blockDim.x,
         threadIdx.y + blockIdx.y * blockDim.y,
@@ -886,7 +881,6 @@ kernel_filter_vec(const __restrict__ AcReal* src0,
 
     const int nx = end.x - start.x;
     const int ny = end.y - start.y;
-    const int nz = end.z - start.z;
     const int3 dst_idx = (int3) {
         threadIdx.x + blockIdx.x * blockDim.x,
         threadIdx.y + blockIdx.y * blockDim.y,
@@ -939,11 +933,9 @@ kernel_reduce_block(const __restrict__ AcReal* scratchpad,
     if (idx != 0) {
         return;
     }
-    const int scratchpad_size = DCONST_INT(AC_nxyz);
+
     AcReal res = scratchpad[0];
     for (int i = 1; i < num_blocks; ++i) {
-        assert(i * block_size < num_blocks * block_size);
-        assert(i * block_size < scratchpad_size);
         res = reduce(res, scratchpad[i * block_size]);
     }
     *result = res;
@@ -1050,235 +1042,3 @@ reduce_vec(const cudaStream_t stream, const ReductionType rtype,
     cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
     return result;
 }
-
-static __device__ inline bool
-oob(const int& i, const int& j, const int& k)
-{
-    if (i >= d_mesh_info.int_params[AC_nx] ||
-        j >= d_mesh_info.int_params[AC_ny] ||
-        k >= d_mesh_info.int_params[AC_nz])
-        return true;
-    else
-        return false;
-}
-
-template <ReduceInitialScalFunc reduce_initial>
-__global__ void
-kernel_reduce_1of3(const __restrict__ AcReal* src, AcReal* dst)
-{
-    const int i = threadIdx.x + blockIdx.x * blockDim.x;
-    const int j = threadIdx.y + blockIdx.y * blockDim.y;
-    const int k = threadIdx.z + blockIdx.z * blockDim.z;
-
-    if (oob(i, j, k))
-        return;
-
-    const int src_idx = DEVICE_VTXBUF_IDX(
-        i + d_mesh_info.int_params[AC_nx_min],
-        j + d_mesh_info.int_params[AC_ny_min],
-        k + d_mesh_info.int_params[AC_nz_min]);
-    const int dst_idx = DEVICE_1D_COMPDOMAIN_IDX(i, j, k);
-
-    dst[dst_idx] = reduce_initial(src[src_idx]);
-}
-
-template <ReduceInitialVecFunc reduce_initial>
-__global__ void
-kernel_reduce_1of3_vec(const __restrict__ AcReal* src_a,
-                   const __restrict__ AcReal* src_b,
-                   const __restrict__ AcReal* src_c, AcReal* dst)
-{
-    const int i = threadIdx.x + blockIdx.x * blockDim.x;
-    const int j = threadIdx.y + blockIdx.y * blockDim.y;
-    const int k = threadIdx.z + blockIdx.z * blockDim.z;
-
-    if (oob(i, j, k))
-        return;
-
-    const int src_idx = DEVICE_VTXBUF_IDX(
-        i + d_mesh_info.int_params[AC_nx_min],
-        j + d_mesh_info.int_params[AC_ny_min],
-        k + d_mesh_info.int_params[AC_nz_min]);
-    const int dst_idx = DEVICE_1D_COMPDOMAIN_IDX(i, j, k);
-
-    dst[dst_idx] = reduce_initial(src_a[src_idx], src_b[src_idx],
-                                  src_c[src_idx]);
-}
-
-///////////////////////////////////////////////////////////////////////////////
-#define BLOCK_SIZE (1024)
-#define ELEMS_PER_THREAD (32)
-
-template <ReduceFunc reduce>
-__global__ void
-kernel_reduce_2of3(AcReal* src, AcReal* result)
-{
-    const int idx = threadIdx.x + blockIdx.x * BLOCK_SIZE * ELEMS_PER_THREAD;
-    const int scratchpad_size = DCONST_INT(AC_nxyz);
-
-    if (idx >= scratchpad_size)
-        return;
-
-    __shared__ AcReal smem[BLOCK_SIZE];
-
-    AcReal tmp = src[idx];
-
-    for (int i = 1; i < ELEMS_PER_THREAD; ++i) {
-        const int src_idx = idx + i * BLOCK_SIZE;
-        if (src_idx >= scratchpad_size) {
-            // This check is for safety: if accessing uninitialized values
-            // beyond the mesh boundaries, we will immediately start seeing NANs
-            if (threadIdx.x < BLOCK_SIZE)
-                smem[threadIdx.x] = NAN;
-            else
-                break;
-        }
-        tmp = reduce(tmp, src[src_idx]);
-    }
-
-    smem[threadIdx.x] = tmp;
-    __syncthreads();
-
-    int offset = BLOCK_SIZE / 2;
-    while (offset > 0) {
-
-        if (threadIdx.x < offset) {
-            smem[threadIdx.x] = reduce(smem[threadIdx.x], smem[threadIdx.x + offset]);
-        }
-        offset /= 2;
-        __syncthreads();
-    }
-    if (threadIdx.x == 0)
-        src[idx] = smem[0];
-}
-
-template <ReduceFunc reduce>
-__global__ void
-kernel_reduce_3of3(const __restrict__ AcReal* src, AcReal* result)
-{
-    const int scratchpad_size = DCONST_INT(AC_nxyz);
-    const int idx = threadIdx.x + blockIdx.x * BLOCK_SIZE * ELEMS_PER_THREAD;
-    AcReal tmp    = src[idx];
-    const int block_offset = BLOCK_SIZE * ELEMS_PER_THREAD;
-    for (int i = 1; idx + i * block_offset < scratchpad_size; ++i)
-        tmp = reduce(tmp, src[idx + i * block_offset]);
-
-    *result = tmp;
-}
-//////////////////////////////////////////////////////////////////////////////
-
-AcReal
-reduce_scal2(const cudaStream_t stream,
-             const ReductionType& rtype, const int& nx, const int& ny,
-             const int& nz, const AcReal* vtxbuf,
-             AcReal* scratchpad, AcReal* reduce_result)
-{
-    const dim3 tpb(32, 4, 1);
-    const dim3 bpg(int(ceil(AcReal(nx) / tpb.x)), int(ceil(AcReal(ny) / tpb.y)),
-                   int(ceil(AcReal(nz) / tpb.z)));
-
-    const int scratchpad_size = nx * ny * nz;
-    const int bpg2            = (unsigned int)ceil(AcReal(scratchpad_size) /
-                                        AcReal(ELEMS_PER_THREAD * BLOCK_SIZE));
-
-    if (rtype == RTYPE_MAX || rtype == RTYPE_MIN) {
-        const int3 start = (int3){STENCIL_ORDER/2, STENCIL_ORDER/2, STENCIL_ORDER/2};
-        const int3 end = (int3){start.x + nx, start.y + ny, start.z + nz};
-        kernel_filter<dvalue><<<bpg, tpb, 0, stream>>>(vtxbuf, start, end, scratchpad);
-        ERRCHK_CUDA_KERNEL_ALWAYS();
-        //kernel_reduce_1of3<dvalue><<<bpg, tpb, 0, stream>>>(vtxbuf, scratchpad);
-    } else if (rtype == RTYPE_RMS) {
-        kernel_reduce_1of3<dsquared><<<bpg, tpb, 0, stream>>>(vtxbuf, scratchpad);
-    } else if (rtype == RTYPE_RMS_EXP) {
-        kernel_reduce_1of3<dexp_squared><<<bpg, tpb, 0, stream>>>(vtxbuf, scratchpad);
-    } else {
-        ERROR("Unrecognized RTYPE");
-    }
-
-    if (rtype == RTYPE_MAX) {
-        const int3 start = (int3){STENCIL_ORDER/2, STENCIL_ORDER/2, STENCIL_ORDER/2};
-        const int3 end = (int3){start.x + nx, start.y + ny, start.z + nz};
-        const int num_elems = (end.x - start.x) * (end.y - start.y) * (end.z - start.z);
-        const int tpb = 128;
-        const int bpg = num_elems / tpb;
-        assert(num_elems % tpb == 0);
-        assert(tpb * bpg <= num_elems);
-        kernel_reduce<dmax><<<bpg, tpb, sizeof(AcReal) * tpb, stream>>>(scratchpad, num_elems);
-        ERRCHK_CUDA_KERNEL_ALWAYS();
-        kernel_reduce_block<dmax><<<1, 1, 0, stream>>>(scratchpad, bpg, tpb, reduce_result);
-        ERRCHK_CUDA_KERNEL_ALWAYS();
-        // kernel_reduce_2of3<dmax><<<bpg2, BLOCK_SIZE, 0, stream>>>(scratchpad, reduce_result);
-        // kernel_reduce_3of3<dmax><<<1, 1, 0, stream>>>(scratchpad, reduce_result);
-    } else if (rtype == RTYPE_MIN) {
-        kernel_reduce_2of3<dmin><<<bpg2, BLOCK_SIZE, 0, stream>>>(scratchpad, reduce_result);
-        kernel_reduce_3of3<dmin><<<1, 1, 0, stream>>>(scratchpad, reduce_result);
-    } else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
-        kernel_reduce_2of3<dsum><<<bpg2, BLOCK_SIZE, 0, stream>>>(scratchpad, reduce_result);
-        kernel_reduce_3of3<dsum><<<1, 1, 0, stream>>>(scratchpad, reduce_result);
-    } else {
-        ERROR("Unrecognized RTYPE");
-    }
-
-    AcReal result;
-    cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
-    return result;
-}
-
-AcReal
-reduce_vec2(const cudaStream_t stream,
-           const ReductionType& rtype, const int& nx, const int& ny, const int& nz,
-           const AcReal* vtxbuf0, const AcReal* vtxbuf1, const AcReal* vtxbuf2,
-           AcReal* scratchpad, AcReal* reduce_result)
-{
-    const dim3 tpb(32, 4, 1);
-    const dim3 bpg(int(ceil(float(nx) / tpb.x)),
-                   int(ceil(float(ny) / tpb.y)),
-                   int(ceil(float(nz) / tpb.z)));
-
-    const int scratchpad_size = nx * ny * nz;
-    const int bpg2            = (unsigned int)ceil(float(scratchpad_size) /
-                                        float(ELEMS_PER_THREAD * BLOCK_SIZE));
-
-    // "Features" of this quick & efficient reduction:
-    // Block size must be smaller than the computational domain size
-    // (otherwise we would have do some additional bounds checking in the
-    // second half of kernel_reduce_2of3, which gets quite confusing)
-    // Also the BLOCK_SIZE must be a multiple of two s.t. we can easily split
-    // the work without worrying too much about the array bounds.
-    ERRCHK_ALWAYS(BLOCK_SIZE <= scratchpad_size);
-    ERRCHK_ALWAYS(!(BLOCK_SIZE % 2));
-    // NOTE! Also does not work properly with non-power of two mesh dimension
-    // Issue is with "smem[BLOCK_SIZE];". If you init smem to NANs, you can
-    // see that uninitialized smem values are used in the comparison
-    //ERRCHK_ALWAYS(is_power_of_two(nx));
-    //ERRCHK_ALWAYS(is_power_of_two(ny));
-    //ERRCHK_ALWAYS(is_power_of_two(nz));
-
-    if (rtype == RTYPE_MAX || rtype == RTYPE_MIN) {
-        kernel_reduce_1of3_vec<dlength_vec><<<bpg, tpb, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, scratchpad);
-    } else if (rtype == RTYPE_RMS) {
-        kernel_reduce_1of3_vec<dsquared_vec><<<bpg, tpb, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, scratchpad);
-    } else if (rtype == RTYPE_RMS_EXP) {
-        kernel_reduce_1of3_vec<dexp_squared_vec><<<bpg, tpb, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, scratchpad);
-    } else {
-        ERROR("Unrecognized RTYPE");
-    }
-
-    if (rtype == RTYPE_MAX) {
-        kernel_reduce_2of3<dmax><<<bpg2, BLOCK_SIZE, 0, stream>>>(scratchpad, reduce_result);
-        kernel_reduce_3of3<dmax><<<1, 1, 0, stream>>>(scratchpad, reduce_result);
-    } else if (rtype == RTYPE_MIN) {
-        kernel_reduce_2of3<dmin><<<bpg2, BLOCK_SIZE, 0, stream>>>(scratchpad, reduce_result);
-        kernel_reduce_3of3<dmin><<<1, 1, 0, stream>>>(scratchpad, reduce_result);
-    } else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
-        kernel_reduce_2of3<dsum><<<bpg2, BLOCK_SIZE, 0, stream>>>(scratchpad, reduce_result);
-        kernel_reduce_3of3<dsum><<<1, 1, 0, stream>>>(scratchpad, reduce_result);
-    } else {
-        ERROR("Unrecognized RTYPE");
-    }
-
-    AcReal result;
-    cudaMemcpyAsync(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost, stream);
-    cudaStreamSynchronize(stream);
-    return result;
-}