Added multi-GPU reductions. Tested and working with 1-2 GPUs and power-of-two grid dimensions. Requires more testing in special cases (exotic grid dimensions and larger numbers of GPUs).

jpekkila
2019-06-17 14:45:41 +03:00
parent 0ce689dbe4
commit 59086b3e79
5 changed files with 385 additions and 31 deletions
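
A minimal host-side usage sketch of the new entry points (the VTXBUF_* handle
names and the diagnostics function are assumptions for illustration, not part
of this commit):

    #include <stdio.h>
    #include "astaroth.h" // assumed public header declaring acReduceScal/acReduceVec

    // After acIntegrate() has advanced the solution, query reductions that
    // now span all devices in the grid.
    void print_diagnostics(void)
    {
        const AcReal rho_max = acReduceScal(RTYPE_MAX, VTXBUF_LNRHO);
        const AcReal uu_rms  = acReduceVec(RTYPE_RMS, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
        printf("max(lnrho) = %g, rms(uu) = %g\n", (double)rho_max, (double)uu_rms);
    }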

View File

@@ -424,20 +424,52 @@ acIntegrate(const AcReal& dt)
     return AC_SUCCESS;
 }

+static AcReal
+simple_final_reduce_scal(const ReductionType& rtype, const AcReal* results, const int& n)
+{
+    AcReal res = results[0];
+    for (int i = 1; i < n; ++i) {
+        if (rtype == RTYPE_MAX) {
+            res = max(res, results[i]);
+        } else if (rtype == RTYPE_MIN) {
+            res = min(res, results[i]);
+        } else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
+            res = sum(res, results[i]);
+        } else {
+            ERROR("Invalid rtype");
+        }
+    }
+
+    if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
+        const AcReal inv_n = AcReal(1.) / (grid.n.x * grid.n.y * grid.n.z);
+        res = sqrt(inv_n * res);
+    }
+
+    return res;
+}
+
 AcReal
 acReduceScal(const ReductionType& rtype,
              const VertexBufferHandle& vtxbuffer_handle)
 {
-    // TODO
-    return 0;
+    AcReal results[num_devices];
+    for (int i = 0; i < num_devices; ++i) {
+        reduceScal(devices[i], STREAM_PRIMARY, rtype, vtxbuffer_handle, &results[i]);
+    }
+
+    return simple_final_reduce_scal(rtype, results, num_devices);
 }

 AcReal
 acReduceVec(const ReductionType& rtype, const VertexBufferHandle& a,
             const VertexBufferHandle& b, const VertexBufferHandle& c)
 {
-    // TODO
-    return 0;
+    AcReal results[num_devices];
+    for (int i = 0; i < num_devices; ++i) {
+        reduceVec(devices[i], STREAM_PRIMARY, rtype, a, b, c, &results[i]);
+    }
+
+    return simple_final_reduce_scal(rtype, results, num_devices);
 }

 AcResult
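
Note the asymmetry simple_final_reduce_scal has to handle: for RTYPE_MAX and
RTYPE_MIN every device returns a valid candidate result, but for the RMS
variants each device returns only the sum of squares over its own subdomain,
so the normalization and square root are applied once, after the per-device
partial sums are combined. A worked example of that arithmetic with two
assumed devices sharing an 8-point grid:

    // Hypothetical partial sums of squares: device 0 -> 1.0, device 1 -> 3.0.
    // Combine:   res = 1.0 + 3.0 = 4.0
    // Normalize: rms = sqrt(res / 8) = sqrt(0.5) ~= 0.7071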

View File

@@ -37,6 +37,7 @@ __constant__ AcMeshInfo d_mesh_info;
 #define DCONST_INT(X) (d_mesh_info.int_params[X])
 #define DCONST_REAL(X) (d_mesh_info.real_params[X])
 #define DEVICE_VTXBUF_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_mx) + (k)*DCONST_INT(AC_mxy))
+#define DEVICE_1D_COMPDOMAIN_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_nx) + (k)*DCONST_INT(AC_nxy))

 #include "kernels/kernels.cuh"

 struct device_s {
@@ -194,16 +195,40 @@ boundcondStep(const Device device, const StreamType stream_type, const int3& sta
 }

 AcResult
-reduceScal(const Device device)
+reduceScal(const Device device, const StreamType stream_type, const ReductionType rtype,
+           const VertexBufferHandle vtxbuf_handle, AcReal* result)
 {
     cudaSetDevice(device->id);
+    *result = reduce_scal(device->streams[stream_type], rtype,
+                          device->local_config.int_params[AC_nx],
+                          device->local_config.int_params[AC_ny],
+                          device->local_config.int_params[AC_nz],
+                          device->vba.in[vtxbuf_handle],
+                          device->reduce_scratchpad, device->reduce_result);
     return AC_SUCCESS;
 }

 AcResult
-reduceVec(const Device device)
+reduceVec(const Device device, const StreamType stream_type, const ReductionType rtype,
+          const VertexBufferHandle vec0, const VertexBufferHandle vec1,
+          const VertexBufferHandle vec2, AcReal* result)
 {
     cudaSetDevice(device->id);
+    *result = reduce_vec(device->streams[stream_type], rtype,
+                         device->local_config.int_params[AC_nx],
+                         device->local_config.int_params[AC_ny],
+                         device->local_config.int_params[AC_nz],
+                         device->vba.in[vec0],
+                         device->vba.in[vec1],
+                         device->vba.in[vec2],
+                         device->reduce_scratchpad, device->reduce_result);
     return AC_SUCCESS;
 }

View File

@@ -51,10 +51,16 @@ AcResult boundcondStep(const Device device, const StreamType stream_type,
                        const int3& start, const int3& end);
 /** */
-AcResult reduceScal(const Device device);
+AcResult reduceScal(const Device device, const StreamType stream_type, const ReductionType rtype,
+                    const VertexBufferHandle vtxbuf_handle, AcReal* result);
 /** */
-AcResult reduceVec(const Device device);
+AcResult reduceVec(const Device device, const StreamType stream_type,
+                   const ReductionType rtype,
+                   const VertexBufferHandle vec0,
+                   const VertexBufferHandle vec1,
+                   const VertexBufferHandle vec2,
+                   AcReal* result);
 /** */
 AcResult rkStep(const Device device, const StreamType stream_type, const int step_number,

View File

@@ -792,3 +792,294 @@ rk3_step_async(const cudaStream_t stream, const int& step_number, const int3& st
     ERRCHK_CUDA_KERNEL();

     return AC_SUCCESS;
 }
+
+////////////////REDUCE///////////////////////////
+#include "src/core/math_utils.h" // is_power_of_two
+
+// Function pointer definitions
+typedef AcReal (*ReduceFunc)(const AcReal&, const AcReal&);
+typedef AcReal (*ReduceInitialScalFunc)(const AcReal&);
+typedef AcReal (*ReduceInitialVecFunc)(const AcReal&, const AcReal&, const AcReal&);
+
+// clang-format off
+/* Comparison funcs */
+__device__ inline AcReal
+_device_max(const AcReal& a, const AcReal& b) { return a > b ? a : b; }
+
+__device__ inline AcReal
+_device_min(const AcReal& a, const AcReal& b) { return a < b ? a : b; }
+
+__device__ inline AcReal
+_device_sum(const AcReal& a, const AcReal& b) { return a + b; }
+
+/* Functions used to determine the values used during reduction */
+__device__ inline AcReal
+_device_length_scal(const AcReal& a) { return AcReal(a); }
+
+__device__ inline AcReal
+_device_squared_scal(const AcReal& a) { return (AcReal)(a*a); }
+
+__device__ inline AcReal
+_device_exp_squared_scal(const AcReal& a) { return exp(a)*exp(a); }
+
+__device__ inline AcReal
+_device_length_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return sqrt(a*a + b*b + c*c); }
+
+__device__ inline AcReal
+_device_squared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return _device_squared_scal(a) + _device_squared_scal(b) + _device_squared_scal(c); }
+
+__device__ inline AcReal
+_device_exp_squared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return _device_exp_squared_scal(a) + _device_exp_squared_scal(b) + _device_exp_squared_scal(c); }
+// clang-format on
+
+__device__ inline bool
+oob(const int& i, const int& j, const int& k)
+{
+    if (i >= d_mesh_info.int_params[AC_nx] ||
+        j >= d_mesh_info.int_params[AC_ny] ||
+        k >= d_mesh_info.int_params[AC_nz])
+        return true;
+    else
+        return false;
+}
+
+template <ReduceInitialScalFunc reduce_initial>
+__global__ void
+_kernel_reduce_scal(const __restrict__ AcReal* src, AcReal* dst)
+{
+    const int i = threadIdx.x + blockIdx.x * blockDim.x;
+    const int j = threadIdx.y + blockIdx.y * blockDim.y;
+    const int k = threadIdx.z + blockIdx.z * blockDim.z;
+
+    if (oob(i, j, k))
+        return;
+
+    const int src_idx = DEVICE_VTXBUF_IDX(i + d_mesh_info.int_params[AC_nx_min],
+                                          j + d_mesh_info.int_params[AC_ny_min],
+                                          k + d_mesh_info.int_params[AC_nz_min]);
+    const int dst_idx = DEVICE_1D_COMPDOMAIN_IDX(i, j, k);
+
+    dst[dst_idx] = reduce_initial(src[src_idx]);
+}
+
+template <ReduceInitialVecFunc reduce_initial>
+__global__ void
+_kernel_reduce_vec(const __restrict__ AcReal* src_a, const __restrict__ AcReal* src_b,
+                   const __restrict__ AcReal* src_c, AcReal* dst)
+{
+    const int i = threadIdx.x + blockIdx.x * blockDim.x;
+    const int j = threadIdx.y + blockIdx.y * blockDim.y;
+    const int k = threadIdx.z + blockIdx.z * blockDim.z;
+
+    if (oob(i, j, k))
+        return;
+
+    const int src_idx = DEVICE_VTXBUF_IDX(i + d_mesh_info.int_params[AC_nx_min],
+                                          j + d_mesh_info.int_params[AC_ny_min],
+                                          k + d_mesh_info.int_params[AC_nz_min]);
+    const int dst_idx = DEVICE_1D_COMPDOMAIN_IDX(i, j, k);
+
+    dst[dst_idx] = reduce_initial(src_a[src_idx], src_b[src_idx], src_c[src_idx]);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+#define BLOCK_SIZE (1024)
+#define ELEMS_PER_THREAD (32)
+
+template <ReduceFunc reduce>
+__global__ void
+_kernel_reduce(AcReal* src, AcReal* result)
+{
+    const int idx = threadIdx.x + blockIdx.x * BLOCK_SIZE * ELEMS_PER_THREAD;
+    const int scratchpad_size = DCONST_INT(AC_nxyz);
+
+    if (idx >= scratchpad_size)
+        return;
+
+    __shared__ AcReal smem[BLOCK_SIZE];
+
+    AcReal tmp = src[idx];
+    for (int i = 1; i < ELEMS_PER_THREAD; ++i) {
+        const int src_idx = idx + i * BLOCK_SIZE;
+        if (src_idx >= scratchpad_size) {
+            // This check is for safety: if accessing uninitialized values
+            // beyond the mesh boundaries, we will immediately start seeing NANs
+            if (threadIdx.x < BLOCK_SIZE)
+                smem[threadIdx.x] = NAN;
+            else
+                break;
+        }
+        tmp = reduce(tmp, src[src_idx]);
+    }
+
+    smem[threadIdx.x] = tmp;
+    __syncthreads();
+
+    int offset = BLOCK_SIZE / 2;
+    while (offset > 0) {
+        if (threadIdx.x < offset) {
+            tmp = reduce(tmp, smem[threadIdx.x + offset]);
+            smem[threadIdx.x] = tmp;
+        }
+        offset /= 2;
+        __syncthreads();
+    }
+
+    if (threadIdx.x == 0)
+        src[idx] = tmp;
+}
+
+template <ReduceFunc reduce>
+__global__ void
+_kernel_reduce_block(const __restrict__ AcReal* src, AcReal* result)
+{
+    const int scratchpad_size = DCONST_INT(AC_nxyz);
+    const int idx = threadIdx.x + blockIdx.x * BLOCK_SIZE * ELEMS_PER_THREAD;
+
+    AcReal tmp = src[idx];
+    const int block_offset = BLOCK_SIZE * ELEMS_PER_THREAD;
+    for (int i = 1; idx + i * block_offset < scratchpad_size; ++i)
+        tmp = reduce(tmp, src[idx + i * block_offset]);
+
+    *result = tmp;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+AcReal
+reduce_scal(const cudaStream_t stream, const ReductionType& rtype, const int& nx,
+            const int& ny, const int& nz, const AcReal* vertex_buffer,
+            AcReal* reduce_scratchpad, AcReal* reduce_result)
+{
+    const dim3 tpb(32, 4, 1);
+    const dim3 bpg(int(ceil(AcReal(nx) / tpb.x)), int(ceil(AcReal(ny) / tpb.y)),
+                   int(ceil(AcReal(nz) / tpb.z)));
+
+    const int scratchpad_size = nx * ny * nz;
+    const int bpg2 = (unsigned int)ceil(AcReal(scratchpad_size) /
+                                        AcReal(ELEMS_PER_THREAD * BLOCK_SIZE));
+
+    switch (rtype) {
+    case RTYPE_MAX:
+        _kernel_reduce_scal<_device_length_scal>
+            <<<bpg, tpb, 0, stream>>>(vertex_buffer, reduce_scratchpad);
+        _kernel_reduce<_device_max>
+            <<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
+        _kernel_reduce_block<_device_max>
+            <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
+        break;
+    case RTYPE_MIN:
+        _kernel_reduce_scal<_device_length_scal>
+            <<<bpg, tpb, 0, stream>>>(vertex_buffer, reduce_scratchpad);
+        _kernel_reduce<_device_min>
+            <<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
+        _kernel_reduce_block<_device_min>
+            <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
+        break;
+    case RTYPE_RMS:
+        _kernel_reduce_scal<_device_squared_scal>
+            <<<bpg, tpb, 0, stream>>>(vertex_buffer, reduce_scratchpad);
+        _kernel_reduce<_device_sum>
+            <<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
+        _kernel_reduce_block<_device_sum>
+            <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
+        break;
+    case RTYPE_RMS_EXP:
+        _kernel_reduce_scal<_device_exp_squared_scal>
+            <<<bpg, tpb, 0, stream>>>(vertex_buffer, reduce_scratchpad);
+        _kernel_reduce<_device_sum>
+            <<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
+        _kernel_reduce_block<_device_sum>
+            <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
+        break;
+    default:
+        ERROR("Unrecognized RTYPE");
+    }
+
+    AcReal result;
+    cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
+    return result;
+}
+
+AcReal
+reduce_vec(const cudaStream_t stream, const ReductionType& rtype, const int& nx,
+           const int& ny, const int& nz, const AcReal* vertex_buffer_a,
+           const AcReal* vertex_buffer_b, const AcReal* vertex_buffer_c,
+           AcReal* reduce_scratchpad, AcReal* reduce_result)
+{
+    const dim3 tpb(32, 4, 1);
+    const dim3 bpg(int(ceil(float(nx) / tpb.x)),
+                   int(ceil(float(ny) / tpb.y)),
+                   int(ceil(float(nz) / tpb.z)));
+
+    const int scratchpad_size = nx * ny * nz;
+    const int bpg2 = (unsigned int)ceil(float(scratchpad_size) /
+                                        float(ELEMS_PER_THREAD * BLOCK_SIZE));
+
+    // "Features" of this quick & efficient reduction:
+    // Block size must be smaller than the computational domain size
+    // (otherwise we would have to do some additional bounds checking in the
+    // second half of _kernel_reduce, which gets quite confusing)
+    // Also the BLOCK_SIZE must be a multiple of two s.t. we can easily split
+    // the work without worrying too much about the array bounds.
+    ERRCHK_ALWAYS(BLOCK_SIZE <= scratchpad_size);
+    ERRCHK_ALWAYS(!(BLOCK_SIZE % 2));
+
+    // NOTE! Also does not work properly with non-power-of-two mesh dimensions.
+    // Issue is with "smem[BLOCK_SIZE];". If you init smem to NANs, you can
+    // see that uninitialized smem values are used in the comparison
+    ERRCHK_ALWAYS(is_power_of_two(nx));
+    ERRCHK_ALWAYS(is_power_of_two(ny));
+    ERRCHK_ALWAYS(is_power_of_two(nz));
+
+    switch (rtype) {
+    case RTYPE_MAX:
+        _kernel_reduce_vec<_device_length_vec>
+            <<<bpg, tpb, 0, stream>>>(vertex_buffer_a, vertex_buffer_b, vertex_buffer_c,
+                                      reduce_scratchpad);
+        _kernel_reduce<_device_max>
+            <<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
+        _kernel_reduce_block<_device_max>
+            <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
+        break;
+    case RTYPE_MIN:
+        _kernel_reduce_vec<_device_length_vec>
+            <<<bpg, tpb, 0, stream>>>(vertex_buffer_a, vertex_buffer_b, vertex_buffer_c,
+                                      reduce_scratchpad);
+        _kernel_reduce<_device_min>
+            <<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
+        _kernel_reduce_block<_device_min>
+            <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
+        break;
+    case RTYPE_RMS:
+        _kernel_reduce_vec<_device_squared_vec>
+            <<<bpg, tpb, 0, stream>>>(vertex_buffer_a, vertex_buffer_b, vertex_buffer_c,
+                                      reduce_scratchpad);
+        _kernel_reduce<_device_sum>
+            <<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
+        _kernel_reduce_block<_device_sum>
+            <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
+        break;
+    case RTYPE_RMS_EXP:
+        _kernel_reduce_vec<_device_exp_squared_vec>
+            <<<bpg, tpb, 0, stream>>>(vertex_buffer_a, vertex_buffer_b, vertex_buffer_c,
+                                      reduce_scratchpad);
+        _kernel_reduce<_device_sum>
+            <<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
+        _kernel_reduce_block<_device_sum>
+            <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
+        break;
+    default:
+        ERROR("Unrecognized RTYPE");
+    }
+
+    AcReal result;
+    cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
+    return result;
+}
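
The pipeline above is: a map kernel (_kernel_reduce_scal/_kernel_reduce_vec)
writes per-point values into the scratchpad, _kernel_reduce collapses each
chunk of BLOCK_SIZE * ELEMS_PER_THREAD elements with a shared-memory halving
tree, and _kernel_reduce_block folds the per-chunk results into one value. A
self-contained sketch of just the halving-tree stage, under the same
power-of-two assumption (a minimal illustration, not the library's kernel):

    #include <cstdio>

    #define N 1024 // one block; power-of-two thread count

    __global__ void tree_reduce_sum(const float* src, float* result)
    {
        __shared__ float smem[N];
        smem[threadIdx.x] = src[threadIdx.x];
        __syncthreads();

        // Halve the active range each step; each active thread folds in the
        // element one offset away. Power-of-two N keeps the split exact.
        for (int offset = N / 2; offset > 0; offset /= 2) {
            if (threadIdx.x < offset)
                smem[threadIdx.x] += smem[threadIdx.x + offset];
            __syncthreads();
        }

        if (threadIdx.x == 0)
            *result = smem[0];
    }

    int main(void)
    {
        float host[N];
        for (int i = 0; i < N; ++i)
            host[i] = 1.f; // sum should equal N

        float *src, *result;
        cudaMalloc(&src, N * sizeof(float));
        cudaMalloc(&result, sizeof(float));
        cudaMemcpy(src, host, N * sizeof(float), cudaMemcpyHostToDevice);

        tree_reduce_sum<<<1, N>>>(src, result);

        float out;
        cudaMemcpy(&out, result, sizeof(float), cudaMemcpyDeviceToHost);
        printf("sum = %g (expected %d)\n", (double)out, N);

        cudaFree(src);
        cudaFree(result);
        return 0;
    }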

View File

@@ -478,7 +478,7 @@ run_autotest(void)
 //const vec3i test_dims[] = {{32, 32, 32}};

 int num_failures = 0;
-/*for (size_t i = 0; i < ARRAY_SIZE(test_dims); ++i) {
+for (size_t i = 0; i < ARRAY_SIZE(test_dims); ++i) {
     config.int_params[AC_nx] = test_dims[i].x;
     config.int_params[AC_ny] = test_dims[i].y;
     config.int_params[AC_nz] = test_dims[i].z;
@@ -489,7 +489,7 @@ run_autotest(void)
     num_failures += check_reductions(config);
     fflush(stdout);
-}*/ // TODO uncomment
+}

 for (size_t i = 0; i < ARRAY_SIZE(test_dims); ++i) {
     config.int_params[AC_nx] = test_dims[i].x;
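
Re-enabling this loop puts check_reductions back into the autotest for every
test dimension. Its implementation is not part of this diff, but the idea is
to compare the GPU results against a simple CPU reference; a hypothetical
model function for the scalar RMS case (an illustration only, not the actual
test code):

    #include <math.h>

    static AcReal
    model_reduce_rms(const AcReal* field, const size_t n)
    {
        long double sum = 0; // accumulate in extended precision
        for (size_t i = 0; i < n; ++i)
            sum += (long double)field[i] * (long double)field[i];
        return (AcReal)sqrtl(sum / n);
    }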