Added multi-GPU reductions. Tested and working with 1-2 GPUs and power-of-two grid dimensions. Requires more testing in special cases (exotic grid dimensions and larger numbers of GPUs).

jpekkila
2019-06-17 14:45:41 +03:00
parent 0ce689dbe4
commit 59086b3e79
5 changed files with 385 additions and 31 deletions
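
A minimal host-side usage sketch of the new entry points (the VTXBUF_* handle
names and the diagnostics function are assumptions for illustration, not part
of this commit):

    #include <stdio.h>
    #include "astaroth.h" // assumed public header declaring acReduceScal/acReduceVec

    // After acIntegrate() has advanced the solution, query reductions that
    // now span all devices in the grid.
    void print_diagnostics(void)
    {
        const AcReal rho_max = acReduceScal(RTYPE_MAX, VTXBUF_LNRHO);
        const AcReal uu_rms  = acReduceVec(RTYPE_RMS, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
        printf("max(lnrho) = %g, rms(uu) = %g\n", (double)rho_max, (double)uu_rms);
    }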

View File

@@ -424,20 +424,52 @@ acIntegrate(const AcReal& dt)
     return AC_SUCCESS;
 }

+static AcReal
+simple_final_reduce_scal(const ReductionType& rtype, const AcReal* results, const int& n)
+{
+    AcReal res = results[0];
+    for (int i = 1; i < n; ++i) {
+        if (rtype == RTYPE_MAX) {
+            res = max(res, results[i]);
+        } else if (rtype == RTYPE_MIN) {
+            res = min(res, results[i]);
+        } else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
+            res = sum(res, results[i]);
+        } else {
+            ERROR("Invalid rtype");
+        }
+    }
+
+    if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
+        const AcReal inv_n = AcReal(1.) / (grid.n.x * grid.n.y * grid.n.z);
+        res = sqrt(inv_n * res);
+    }
+
+    return res;
+}
+
 AcReal
 acReduceScal(const ReductionType& rtype,
              const VertexBufferHandle& vtxbuffer_handle)
 {
-    // TODO
-    return 0;
+    AcReal results[num_devices];
+    for (int i = 0; i < num_devices; ++i) {
+        reduceScal(devices[i], STREAM_PRIMARY, rtype, vtxbuffer_handle, &results[i]);
+    }
+
+    return simple_final_reduce_scal(rtype, results, num_devices);
 }

 AcReal
 acReduceVec(const ReductionType& rtype, const VertexBufferHandle& a,
             const VertexBufferHandle& b, const VertexBufferHandle& c)
 {
-    // TODO
-    return 0;
+    AcReal results[num_devices];
+    for (int i = 0; i < num_devices; ++i) {
+        reduceVec(devices[i], STREAM_PRIMARY, rtype, a, b, c, &results[i]);
+    }
+
+    return simple_final_reduce_scal(rtype, results, num_devices);
 }

 AcResult
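
Note the asymmetry simple_final_reduce_scal has to handle: for RTYPE_MAX and
RTYPE_MIN every device returns a valid candidate result, but for the RMS
variants each device returns only the sum of squares over its own subdomain,
so the normalization and square root are applied once, after the per-device
partial sums are combined. A worked example of that arithmetic with two
assumed devices sharing an 8-point grid:

    // Hypothetical partial sums of squares: device 0 -> 1.0, device 1 -> 3.0.
    // Combine:   res = 1.0 + 3.0 = 4.0
    // Normalize: rms = sqrt(res / 8) = sqrt(0.5) ~= 0.7071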

View File

@@ -37,6 +37,7 @@ __constant__ AcMeshInfo d_mesh_info;
 #define DCONST_INT(X) (d_mesh_info.int_params[X])
 #define DCONST_REAL(X) (d_mesh_info.real_params[X])
 #define DEVICE_VTXBUF_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_mx) + (k)*DCONST_INT(AC_mxy))
+#define DEVICE_1D_COMPDOMAIN_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_nx) + (k)*DCONST_INT(AC_nxy))

 #include "kernels/kernels.cuh"

 struct device_s {
@@ -194,16 +195,40 @@ boundcondStep(const Device device, const StreamType stream_type, const int3& sta
 }

 AcResult
-reduceScal(const Device device)
+reduceScal(const Device device, const StreamType stream_type, const ReductionType rtype,
+           const VertexBufferHandle vtxbuf_handle, AcReal* result)
 {
     cudaSetDevice(device->id);
+    *result = reduce_scal(device->streams[stream_type], rtype,
+                          device->local_config.int_params[AC_nx],
+                          device->local_config.int_params[AC_ny],
+                          device->local_config.int_params[AC_nz],
+                          device->vba.in[vtxbuf_handle],
+                          device->reduce_scratchpad, device->reduce_result);
     return AC_SUCCESS;
 }

 AcResult
-reduceVec(const Device device)
+reduceVec(const Device device, const StreamType stream_type, const ReductionType rtype,
+          const VertexBufferHandle vec0, const VertexBufferHandle vec1,
+          const VertexBufferHandle vec2, AcReal* result)
 {
     cudaSetDevice(device->id);
+    *result = reduce_vec(device->streams[stream_type], rtype,
+                         device->local_config.int_params[AC_nx],
+                         device->local_config.int_params[AC_ny],
+                         device->local_config.int_params[AC_nz],
+                         device->vba.in[vec0],
+                         device->vba.in[vec1],
+                         device->vba.in[vec2],
+                         device->reduce_scratchpad, device->reduce_result);
     return AC_SUCCESS;
 }

View File

@@ -51,10 +51,16 @@ AcResult boundcondStep(const Device device, const StreamType stream_type,
                        const int3& start, const int3& end);
 /** */
-AcResult reduceScal(const Device device);
+AcResult reduceScal(const Device device, const StreamType stream_type, const ReductionType rtype,
+                    const VertexBufferHandle vtxbuf_handle, AcReal* result);
 /** */
-AcResult reduceVec(const Device device);
+AcResult reduceVec(const Device device, const StreamType stream_type,
+                   const ReductionType rtype,
+                   const VertexBufferHandle vec0,
+                   const VertexBufferHandle vec1,
+                   const VertexBufferHandle vec2,
+                   AcReal* result);
 /** */
 AcResult rkStep(const Device device, const StreamType stream_type, const int step_number,

View File

@@ -792,3 +792,294 @@ rk3_step_async(const cudaStream_t stream, const int& step_number, const int3& st
     ERRCHK_CUDA_KERNEL();

     return AC_SUCCESS;
 }
+
+////////////////REDUCE///////////////////////////
+#include "src/core/math_utils.h" // is_power_of_two
+
+// Function pointer definitions
+typedef AcReal (*ReduceFunc)(const AcReal&, const AcReal&);
+typedef AcReal (*ReduceInitialScalFunc)(const AcReal&);
+typedef AcReal (*ReduceInitialVecFunc)(const AcReal&, const AcReal&, const AcReal&);
+
+// clang-format off
+/* Comparison funcs */
+__device__ inline AcReal
+_device_max(const AcReal& a, const AcReal& b) { return a > b ? a : b; }
+
+__device__ inline AcReal
+_device_min(const AcReal& a, const AcReal& b) { return a < b ? a : b; }
+
+__device__ inline AcReal
+_device_sum(const AcReal& a, const AcReal& b) { return a + b; }
+
+/* Functions used to determine the values used during reduction */
+__device__ inline AcReal
+_device_length_scal(const AcReal& a) { return AcReal(a); }
+
+__device__ inline AcReal
+_device_squared_scal(const AcReal& a) { return (AcReal)(a*a); }
+
+__device__ inline AcReal
+_device_exp_squared_scal(const AcReal& a) { return exp(a)*exp(a); }
+
+__device__ inline AcReal
+_device_length_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return sqrt(a*a + b*b + c*c); }
+
+__device__ inline AcReal
+_device_squared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return _device_squared_scal(a) + _device_squared_scal(b) + _device_squared_scal(c); }
+
+__device__ inline AcReal
+_device_exp_squared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return _device_exp_squared_scal(a) + _device_exp_squared_scal(b) + _device_exp_squared_scal(c); }
+// clang-format on
+
+__device__ inline bool
+oob(const int& i, const int& j, const int& k)
+{
+    if (i >= d_mesh_info.int_params[AC_nx] ||
+        j >= d_mesh_info.int_params[AC_ny] ||
+        k >= d_mesh_info.int_params[AC_nz])
+        return true;
+    else
+        return false;
+}
+
+template <ReduceInitialScalFunc reduce_initial>
+__global__ void
+_kernel_reduce_scal(const __restrict__ AcReal* src, AcReal* dst)
+{
+    const int i = threadIdx.x + blockIdx.x * blockDim.x;
+    const int j = threadIdx.y + blockIdx.y * blockDim.y;
+    const int k = threadIdx.z + blockIdx.z * blockDim.z;
+
+    if (oob(i, j, k))
+        return;
+
+    const int src_idx = DEVICE_VTXBUF_IDX(i + d_mesh_info.int_params[AC_nx_min],
+                                          j + d_mesh_info.int_params[AC_ny_min],
+                                          k + d_mesh_info.int_params[AC_nz_min]);
+    const int dst_idx = DEVICE_1D_COMPDOMAIN_IDX(i, j, k);
+
+    dst[dst_idx] = reduce_initial(src[src_idx]);
+}
+
+template <ReduceInitialVecFunc reduce_initial>
+__global__ void
+_kernel_reduce_vec(const __restrict__ AcReal* src_a, const __restrict__ AcReal* src_b,
+                   const __restrict__ AcReal* src_c, AcReal* dst)
+{
+    const int i = threadIdx.x + blockIdx.x * blockDim.x;
+    const int j = threadIdx.y + blockIdx.y * blockDim.y;
+    const int k = threadIdx.z + blockIdx.z * blockDim.z;
+
+    if (oob(i, j, k))
+        return;
+
+    const int src_idx = DEVICE_VTXBUF_IDX(i + d_mesh_info.int_params[AC_nx_min],
+                                          j + d_mesh_info.int_params[AC_ny_min],
+                                          k + d_mesh_info.int_params[AC_nz_min]);
+    const int dst_idx = DEVICE_1D_COMPDOMAIN_IDX(i, j, k);
+
+    dst[dst_idx] = reduce_initial(src_a[src_idx], src_b[src_idx], src_c[src_idx]);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+#define BLOCK_SIZE (1024)
+#define ELEMS_PER_THREAD (32)
+
+template <ReduceFunc reduce>
+__global__ void
+_kernel_reduce(AcReal* src, AcReal* result)
+{
+    const int idx = threadIdx.x + blockIdx.x * BLOCK_SIZE * ELEMS_PER_THREAD;
+    const int scratchpad_size = DCONST_INT(AC_nxyz);
+
+    if (idx >= scratchpad_size)
+        return;
+
+    __shared__ AcReal smem[BLOCK_SIZE];
+
+    AcReal tmp = src[idx];
+    for (int i = 1; i < ELEMS_PER_THREAD; ++i) {
+        const int src_idx = idx + i * BLOCK_SIZE;
+        if (src_idx >= scratchpad_size) {
+            // This check is for safety: if accessing uninitialized values
+            // beyond the mesh boundaries, we will immediately start seeing NANs
+            if (threadIdx.x < BLOCK_SIZE)
+                smem[threadIdx.x] = NAN;
+            else
+                break;
+        }
+        tmp = reduce(tmp, src[src_idx]);
+    }
+
+    smem[threadIdx.x] = tmp;
+    __syncthreads();
+
+    int offset = BLOCK_SIZE / 2;
+    while (offset > 0) {
+        if (threadIdx.x < offset) {
+            tmp = reduce(tmp, smem[threadIdx.x + offset]);
+            smem[threadIdx.x] = tmp;
+        }
+        offset /= 2;
+        __syncthreads();
+    }
+
+    if (threadIdx.x == 0)
+        src[idx] = tmp;
+}
+
+template <ReduceFunc reduce>
+__global__ void
+_kernel_reduce_block(const __restrict__ AcReal* src, AcReal* result)
+{
+    const int scratchpad_size = DCONST_INT(AC_nxyz);
+    const int idx = threadIdx.x + blockIdx.x * BLOCK_SIZE * ELEMS_PER_THREAD;
+
+    AcReal tmp = src[idx];
+    const int block_offset = BLOCK_SIZE * ELEMS_PER_THREAD;
+    for (int i = 1; idx + i * block_offset < scratchpad_size; ++i)
+        tmp = reduce(tmp, src[idx + i * block_offset]);
+
+    *result = tmp;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+AcReal
+reduce_scal(const cudaStream_t stream, const ReductionType& rtype, const int& nx,
+            const int& ny, const int& nz, const AcReal* vertex_buffer,
+            AcReal* reduce_scratchpad, AcReal* reduce_result)
+{
+    const dim3 tpb(32, 4, 1);
+    const dim3 bpg(int(ceil(AcReal(nx) / tpb.x)), int(ceil(AcReal(ny) / tpb.y)),
+                   int(ceil(AcReal(nz) / tpb.z)));
+
+    const int scratchpad_size = nx * ny * nz;
+    const int bpg2 = (unsigned int)ceil(AcReal(scratchpad_size) /
+                                        AcReal(ELEMS_PER_THREAD * BLOCK_SIZE));
+
+    switch (rtype) {
+    case RTYPE_MAX:
+        _kernel_reduce_scal<_device_length_scal>
+            <<<bpg, tpb, 0, stream>>>(vertex_buffer, reduce_scratchpad);
+        _kernel_reduce<_device_max>
+            <<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
+        _kernel_reduce_block<_device_max>
+            <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
+        break;
+    case RTYPE_MIN:
+        _kernel_reduce_scal<_device_length_scal>
+            <<<bpg, tpb, 0, stream>>>(vertex_buffer, reduce_scratchpad);
+        _kernel_reduce<_device_min>
+            <<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
+        _kernel_reduce_block<_device_min>
+            <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
+        break;
+    case RTYPE_RMS:
+        _kernel_reduce_scal<_device_squared_scal>
+            <<<bpg, tpb, 0, stream>>>(vertex_buffer, reduce_scratchpad);
+        _kernel_reduce<_device_sum>
+            <<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
+        _kernel_reduce_block<_device_sum>
+            <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
+        break;
+    case RTYPE_RMS_EXP:
+        _kernel_reduce_scal<_device_exp_squared_scal>
+            <<<bpg, tpb, 0, stream>>>(vertex_buffer, reduce_scratchpad);
+        _kernel_reduce<_device_sum>
+            <<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
+        _kernel_reduce_block<_device_sum>
+            <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
+        break;
+    default:
+        ERROR("Unrecognized RTYPE");
+    }
+
+    AcReal result;
+    cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
+    return result;
+}
+
+AcReal
+reduce_vec(const cudaStream_t stream, const ReductionType& rtype, const int& nx,
+           const int& ny, const int& nz, const AcReal* vertex_buffer_a,
+           const AcReal* vertex_buffer_b, const AcReal* vertex_buffer_c,
+           AcReal* reduce_scratchpad, AcReal* reduce_result)
+{
+    const dim3 tpb(32, 4, 1);
+    const dim3 bpg(int(ceil(float(nx) / tpb.x)),
+                   int(ceil(float(ny) / tpb.y)),
+                   int(ceil(float(nz) / tpb.z)));
+
+    const int scratchpad_size = nx * ny * nz;
+    const int bpg2 = (unsigned int)ceil(float(scratchpad_size) /
+                                        float(ELEMS_PER_THREAD * BLOCK_SIZE));
+
+    // "Features" of this quick & efficient reduction:
+    // Block size must be smaller than the computational domain size
+    // (otherwise we would have to do some additional bounds checking in the
+    // second half of _kernel_reduce, which gets quite confusing)
+    // Also the BLOCK_SIZE must be a multiple of two s.t. we can easily split
+    // the work without worrying too much about the array bounds.
+    ERRCHK_ALWAYS(BLOCK_SIZE <= scratchpad_size);
+    ERRCHK_ALWAYS(!(BLOCK_SIZE % 2));
+
+    // NOTE! Also does not work properly with non-power-of-two mesh dimensions.
+    // Issue is with "smem[BLOCK_SIZE];". If you init smem to NANs, you can
+    // see that uninitialized smem values are used in the comparison
+    ERRCHK_ALWAYS(is_power_of_two(nx));
+    ERRCHK_ALWAYS(is_power_of_two(ny));
+    ERRCHK_ALWAYS(is_power_of_two(nz));
+
+    switch (rtype) {
+    case RTYPE_MAX:
+        _kernel_reduce_vec<_device_length_vec>
+            <<<bpg, tpb, 0, stream>>>(vertex_buffer_a, vertex_buffer_b, vertex_buffer_c,
+                                      reduce_scratchpad);
+        _kernel_reduce<_device_max>
+            <<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
+        _kernel_reduce_block<_device_max>
+            <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
+        break;
+    case RTYPE_MIN:
+        _kernel_reduce_vec<_device_length_vec>
+            <<<bpg, tpb, 0, stream>>>(vertex_buffer_a, vertex_buffer_b, vertex_buffer_c,
+                                      reduce_scratchpad);
+        _kernel_reduce<_device_min>
+            <<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
+        _kernel_reduce_block<_device_min>
+            <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
+        break;
+    case RTYPE_RMS:
+        _kernel_reduce_vec<_device_squared_vec>
+            <<<bpg, tpb, 0, stream>>>(vertex_buffer_a, vertex_buffer_b, vertex_buffer_c,
+                                      reduce_scratchpad);
+        _kernel_reduce<_device_sum>
+            <<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
+        _kernel_reduce_block<_device_sum>
+            <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
+        break;
+    case RTYPE_RMS_EXP:
+        _kernel_reduce_vec<_device_exp_squared_vec>
+            <<<bpg, tpb, 0, stream>>>(vertex_buffer_a, vertex_buffer_b, vertex_buffer_c,
+                                      reduce_scratchpad);
+        _kernel_reduce<_device_sum>
+            <<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
+        _kernel_reduce_block<_device_sum>
+            <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
+        break;
+    default:
+        ERROR("Unrecognized RTYPE");
+    }
+
+    AcReal result;
+    cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
+    return result;
+}
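
The pipeline above is: a map kernel (_kernel_reduce_scal/_kernel_reduce_vec)
writes per-point values into the scratchpad, _kernel_reduce collapses each
chunk of BLOCK_SIZE * ELEMS_PER_THREAD elements with a shared-memory halving
tree, and _kernel_reduce_block folds the per-chunk results into one value. A
self-contained sketch of just the halving-tree stage, under the same
power-of-two assumption (a minimal illustration, not the library's kernel):

    #include <cstdio>

    #define N 1024 // one block; power-of-two thread count

    __global__ void tree_reduce_sum(const float* src, float* result)
    {
        __shared__ float smem[N];
        smem[threadIdx.x] = src[threadIdx.x];
        __syncthreads();

        // Halve the active range each step; each active thread folds in the
        // element one offset away. Power-of-two N keeps the split exact.
        for (int offset = N / 2; offset > 0; offset /= 2) {
            if (threadIdx.x < offset)
                smem[threadIdx.x] += smem[threadIdx.x + offset];
            __syncthreads();
        }

        if (threadIdx.x == 0)
            *result = smem[0];
    }

    int main(void)
    {
        float host[N];
        for (int i = 0; i < N; ++i)
            host[i] = 1.f; // sum should equal N

        float *src, *result;
        cudaMalloc(&src, N * sizeof(float));
        cudaMalloc(&result, sizeof(float));
        cudaMemcpy(src, host, N * sizeof(float), cudaMemcpyHostToDevice);

        tree_reduce_sum<<<1, N>>>(src, result);

        float out;
        cudaMemcpy(&out, result, sizeof(float), cudaMemcpyDeviceToHost);
        printf("sum = %g (expected %d)\n", (double)out, N);

        cudaFree(src);
        cudaFree(result);
        return 0;
    }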

View File

@@ -478,7 +478,7 @@ run_autotest(void)
 //const vec3i test_dims[] = {{32, 32, 32}};

 int num_failures = 0;
-/*for (size_t i = 0; i < ARRAY_SIZE(test_dims); ++i) {
+for (size_t i = 0; i < ARRAY_SIZE(test_dims); ++i) {
     config.int_params[AC_nx] = test_dims[i].x;
     config.int_params[AC_ny] = test_dims[i].y;
     config.int_params[AC_nz] = test_dims[i].z;
@@ -489,7 +489,7 @@ run_autotest(void)
     num_failures += check_reductions(config);
     fflush(stdout);
-}*/ // TODO uncomment
+}

 for (size_t i = 0; i < ARRAY_SIZE(test_dims); ++i) {
     config.int_params[AC_nx] = test_dims[i].x;
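
Re-enabling this loop puts check_reductions back into the autotest for every
test dimension. Its implementation is not part of this diff, but the idea is
to compare the GPU results against a simple CPU reference; a hypothetical
model function for the scalar RMS case (an illustration only, not the actual
test code):

    #include <math.h>

    static AcReal
    model_reduce_rms(const AcReal* field, const size_t n)
    {
        long double sum = 0; // accumulate in extended precision
        for (size_t i = 0; i < n; ++i)
            sum += (long double)field[i] * (long double)field[i];
        return (AcReal)sqrtl(sum / n);
    }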