Added multi-GPU reductions. Tested to work with 1-2 GPUs and power-of-two grid dimensions. Requires more testing in special cases (exotic grid dimensions and larger numbers of GPUs).
@@ -158,16 +158,16 @@ acLoadWithOffset(const AcMesh& host_mesh, const int3& src, const int num_vertice
    /*
    Here we decompose the host mesh and distribute it among the GPUs in
    the node.

    The host mesh is a huge contiguous block of data. Its dimensions are given by
    the global variable named "grid". A "grid" is decomposed into "subgrids",
    one for each GPU. Here we check which parts of the range s0...s1 map
    to the memory space stored by some GPU, ranging d0...d1, and transfer
    the data if needed.

    The index mapping is inherently quite involved, but here's a picture which
    hopefully helps make sense of all this.

                 Grid
        |----num_vertices---|
@@ -181,8 +181,8 @@ acLoadWithOffset(const AcMesh& host_mesh, const int3& src, const int num_vertice
        ^                   ^
        d0                  d1
            ^           ^
            db          da
    */
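    /* Illustrative sketch only, not code from this commit: the part of the host range
       [s0, s1) that falls inside one device's range [d0, d1) could be picked out roughly as

           const int copy_start = max(s0, d0);
           const int copy_end   = min(s1, d1);
           if (copy_end > copy_start) {
               const int count = copy_end - copy_start; // vertices handled by this GPU
               // copy `count` vertices from host offset copy_start
               // to device offset (copy_start - d0)
           }

       where s0, s1, d0 and d1 are linear vertex offsets into the host mesh;
       copy_start, copy_end and count are placeholder names. */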
for (int i = 0; i < num_devices; ++i) {

@@ -294,7 +294,7 @@ acBoundcondStep(void)

/*
// ===MIIKKANOTE START==========================================
%JP: The old way for computing boundary conditions conflicts with the
way we have to do things with multiple GPUs.

The older approach relied on unified memory, which represented the whole
@@ -303,7 +303,7 @@ in its current state is more meant for quick prototyping when performance is not
Getting the CUDA driver to migrate data intelligently across GPUs is much more difficult than
when managing the memory explicitly.

In this new approach, I have simplified the multi- and single-GPU layers significantly.
Quick rundown:
New struct: Grid. There are two global variables, "grid" and "subgrid", which
contain the extents of the whole simulation domain and the decomposed grids, respectively.
@@ -314,7 +314,7 @@ Quick rundown:
The whole simulation domain is decomposed with respect to the z dimension.
For example, if the grid contains (nx, ny, nz) vertices, then the subgrids
contain (nx, ny, nz / num_devices) vertices.

A local index (i, j, k) in some subgrid can be mapped to the global grid with

    global idx = (i, j, k + device_id * subgrid.n.z)
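For illustration (assumed helper, not part of this commit), the same mapping in code:

    int3 local_to_global(const int3 local, const int device_id)
    {
        return (int3){local.x, local.y, local.z + device_id * subgrid.n.z};
    }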

@@ -328,35 +328,35 @@ Changes required to this commented code block:
  laid out in device memory
- The unified memory buffer no longer exists (d_buffer). Instead, we have an opaque handle
  of type "Device" which should be passed to single-GPU functions. In this file, all devices
  are stored in a global array "devices[num_devices]".
- Every single-GPU function is executed asynchronously by default such that we
  can optimize Astaroth by executing memory transactions concurrently with computation.
  Therefore a StreamType should be passed as a parameter to single-GPU functions.
  Refresher: CUDA function calls are non-blocking when a stream is explicitly passed
  as a parameter and commands executing in different streams can be processed
  in parallel/concurrently.
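  A minimal generic CUDA sketch of the idea (placeholder names, not code from this commit):

      cudaStream_t s0, s1;
      cudaStreamCreate(&s0);
      cudaStreamCreate(&s1);
      kernel_a<<<bpg, tpb, 0, s0>>>(args);                          // computation in stream s0
      cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDeviceToHost, s1); // transfer in stream s1, may overlap
      cudaStreamSynchronize(s0);
      cudaStreamSynchronize(s1);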

Note on periodic boundaries (might be helpful when implementing other boundary conditions):

With multiple GPUs, periodic boundary conditions applied on indices ranging from

    (0, 0, STENCIL_ORDER/2) to (subgrid.m.x, subgrid.m.y, subgrid.m.z - STENCIL_ORDER/2)

on a single device are "local", in the sense that they can be computed without having
to exchange data with neighboring GPUs. Special care is needed only for transferring
the data to the front and back plates outside this range. In the solution we use here,
we solve the local boundaries first, and then just exchange the front and back plates
in a "ring", like so

    device_id
    (n) <-> 0 <-> 1 <-> ... <-> n <-> (0)
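For illustration (assumed, not code from this commit), the ring neighbors follow from
plain modular arithmetic:

    const int front_neighbor = (device_id + 1) % num_devices;
    const int back_neighbor  = (device_id + num_devices - 1) % num_devices;

so device 0 and device num_devices - 1 wrap around to each other.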

// ======MIIKKANOTE END==========================================

<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MIIKKANOTE: This code block was essentially
                                              moved into device.cu, function boundCondStep()
                                              In astaroth.cu, we use acBoundcondStep()
                                              just to distribute the work and manage
                                              communication between GPUs.

@@ -364,8 +364,8 @@ Note on periodic boundaries (might be helpful when implementing other boundary c

    exit(0);
#else

    const int depth = (int)ceil(mesh_info.int_params[AC_mz]/(float)num_devices);

    const int3 start = (int3){0, 0, device_id * depth};
@@ -378,8 +378,8 @@ Note on periodic boundaries (might be helpful when implementing other boundary c
    // TODO uses the default stream currently
    if (mesh_info.int_params[AC_bc_type] == 666) { // TODO MAKE A BETTER SWITCH
        wedge_boundconds(0, tpb, start, end, d_buffer);
    } else {
        for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i)
            periodic_boundconds(0, tpb, start, end, d_buffer.in[i]);
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
*/

@@ -424,20 +424,52 @@ acIntegrate(const AcReal& dt)
    return AC_SUCCESS;
}

static AcReal
simple_final_reduce_scal(const ReductionType& rtype, const AcReal* results, const int& n)
{
    AcReal res = results[0];
    for (int i = 1; i < n; ++i) {
        if (rtype == RTYPE_MAX) {
            res = max(res, results[i]);
        } else if (rtype == RTYPE_MIN) {
            res = min(res, results[i]);
        } else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
            res = sum(res, results[i]);
        } else {
            ERROR("Invalid rtype");
        }
    }

    if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
        const AcReal inv_n = AcReal(1.) / (grid.n.x * grid.n.y * grid.n.z);
        res = sqrt(inv_n * res);
    }

    return res;
}
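/* Worked example (illustrative numbers, not from this commit): for RTYPE_RMS each entry of
   results[] holds the sum of squares over one device's subdomain. With two devices returning
   10.0 and 14.0 on a 2 x 2 x 2 grid, the final value is sqrt((10.0 + 14.0) / 8) = sqrt(3.0). */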

AcReal
acReduceScal(const ReductionType& rtype,
             const VertexBufferHandle& vtxbuffer_handle)
{
    // TODO
    return 0;
    AcReal results[num_devices];
    for (int i = 0; i < num_devices; ++i) {
        reduceScal(devices[i], STREAM_PRIMARY, rtype, vtxbuffer_handle, &results[i]);
    }

    return simple_final_reduce_scal(rtype, results, num_devices);
}

AcReal
acReduceVec(const ReductionType& rtype, const VertexBufferHandle& a,
            const VertexBufferHandle& b, const VertexBufferHandle& c)
{
    // TODO
    return 0;
    AcReal results[num_devices];
    for (int i = 0; i < num_devices; ++i) {
        reduceVec(devices[i], STREAM_PRIMARY, rtype, a, b, c, &results[i]);
    }

    return simple_final_reduce_scal(rtype, results, num_devices);
}

AcResult

@@ -37,6 +37,7 @@ __constant__ AcMeshInfo d_mesh_info;
#define DCONST_INT(X) (d_mesh_info.int_params[X])
#define DCONST_REAL(X) (d_mesh_info.real_params[X])
#define DEVICE_VTXBUF_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_mx) + (k)*DCONST_INT(AC_mxy))
#define DEVICE_1D_COMPDOMAIN_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_nx) + (k)*DCONST_INT(AC_nxy))
#include "kernels/kernels.cuh"

struct device_s {
@@ -118,7 +119,7 @@ AcResult
createDevice(const int id, const AcMeshInfo device_config, Device* device_handle)
{
    cudaSetDevice(id);
    cudaDeviceReset();

    // Create Device
    struct device_s* device = (struct device_s*) malloc(sizeof(*device));
@@ -194,16 +195,40 @@ boundcondStep(const Device device, const StreamType stream_type, const int3& sta
}

AcResult
reduceScal(const Device device)
reduceScal(const Device device, const StreamType stream_type, const ReductionType rtype,
           const VertexBufferHandle vtxbuf_handle, AcReal* result)
{
    cudaSetDevice(device->id);

    *result = reduce_scal(device->streams[stream_type], rtype,
                          device->local_config.int_params[AC_nx],
                          device->local_config.int_params[AC_ny],
                          device->local_config.int_params[AC_nz],
                          device->vba.in[vtxbuf_handle],
                          device->reduce_scratchpad, device->reduce_result);

    return AC_SUCCESS;
}

AcResult
reduceVec(const Device device)
reduceVec(const Device device, const StreamType stream_type,
          const ReductionType rtype,
          const VertexBufferHandle vec0,
          const VertexBufferHandle vec1,
          const VertexBufferHandle vec2,
          AcReal* result)
{
    cudaSetDevice(device->id);

    *result = reduce_vec(device->streams[stream_type], rtype,
                         device->local_config.int_params[AC_nx],
                         device->local_config.int_params[AC_ny],
                         device->local_config.int_params[AC_nz],
                         device->vba.in[vec0],
                         device->vba.in[vec1],
                         device->vba.in[vec2],
                         device->reduce_scratchpad, device->reduce_result);

    return AC_SUCCESS;
}

@@ -51,10 +51,16 @@ AcResult boundcondStep(const Device device, const StreamType stream_type,
                       const int3& start, const int3& end);

/** */
AcResult reduceScal(const Device device);
AcResult reduceScal(const Device device, const StreamType stream_type, const ReductionType rtype,
                    const VertexBufferHandle vtxbuf_handle, AcReal* result);

/** */
AcResult reduceVec(const Device device);
AcResult reduceVec(const Device device, const StreamType stream_type,
                   const ReductionType rtype,
                   const VertexBufferHandle vec0,
                   const VertexBufferHandle vec1,
                   const VertexBufferHandle vec2,
                   AcReal* result);

/** */
AcResult rkStep(const Device device, const StreamType stream_type, const int step_number,

@@ -754,7 +754,7 @@ randf(void)
}

AcResult
rk3_step_async(const cudaStream_t stream, const int& step_number, const int3& start, const int3& end,
               const AcReal dt, VertexBufferArray* buffer)
{
    const dim3 tpb(32, 1, 4);

@@ -792,3 +792,294 @@ rk3_step_async(const cudaStream_t stream, const int& step_number, const int3& st
    ERRCHK_CUDA_KERNEL();
    return AC_SUCCESS;
}


////////////////REDUCE///////////////////////////
#include "src/core/math_utils.h" // is_power_of_two

// Function pointer definitions
typedef AcReal (*ReduceFunc)(const AcReal&, const AcReal&);
typedef AcReal (*ReduceInitialScalFunc)(const AcReal&);
typedef AcReal (*ReduceInitialVecFunc)(const AcReal&, const AcReal&,
                                       const AcReal&);

// clang-format off
/* Comparison funcs */
__device__ inline AcReal
_device_max(const AcReal& a, const AcReal& b) { return a > b ? a : b; }

__device__ inline AcReal
_device_min(const AcReal& a, const AcReal& b) { return a < b ? a : b; }

__device__ inline AcReal
_device_sum(const AcReal& a, const AcReal& b) { return a + b; }

/* Function used to determine the values used during reduction */
__device__ inline AcReal
_device_length_scal(const AcReal& a) { return AcReal(a); }

__device__ inline AcReal
_device_squared_scal(const AcReal& a) { return (AcReal)(a*a); }

__device__ inline AcReal
_device_exp_squared_scal(const AcReal& a) { return exp(a)*exp(a); }

__device__ inline AcReal
_device_length_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return sqrt(a*a + b*b + c*c); }

__device__ inline AcReal
_device_squared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return _device_squared_scal(a) + _device_squared_scal(b) + _device_squared_scal(c); }

__device__ inline AcReal
_device_exp_squared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return _device_exp_squared_scal(a) + _device_exp_squared_scal(b) + _device_exp_squared_scal(c); }
// clang-format on

__device__ inline bool
oob(const int& i, const int& j, const int& k)
{
    if (i >= d_mesh_info.int_params[AC_nx] ||
        j >= d_mesh_info.int_params[AC_ny] ||
        k >= d_mesh_info.int_params[AC_nz])
        return true;
    else
        return false;
}

template <ReduceInitialScalFunc reduce_initial>
__global__ void
_kernel_reduce_scal(const __restrict__ AcReal* src, AcReal* dst)
{
    const int i = threadIdx.x + blockIdx.x * blockDim.x;
    const int j = threadIdx.y + blockIdx.y * blockDim.y;
    const int k = threadIdx.z + blockIdx.z * blockDim.z;

    if (oob(i, j, k))
        return;

    const int src_idx = DEVICE_VTXBUF_IDX(
        i + d_mesh_info.int_params[AC_nx_min],
        j + d_mesh_info.int_params[AC_ny_min],
        k + d_mesh_info.int_params[AC_nz_min]);
    const int dst_idx = DEVICE_1D_COMPDOMAIN_IDX(i, j, k);

    dst[dst_idx] = reduce_initial(src[src_idx]);
}

template <ReduceInitialVecFunc reduce_initial>
__global__ void
_kernel_reduce_vec(const __restrict__ AcReal* src_a,
                   const __restrict__ AcReal* src_b,
                   const __restrict__ AcReal* src_c, AcReal* dst)
{
    const int i = threadIdx.x + blockIdx.x * blockDim.x;
    const int j = threadIdx.y + blockIdx.y * blockDim.y;
    const int k = threadIdx.z + blockIdx.z * blockDim.z;

    if (oob(i, j, k))
        return;

    const int src_idx = DEVICE_VTXBUF_IDX(
        i + d_mesh_info.int_params[AC_nx_min],
        j + d_mesh_info.int_params[AC_ny_min],
        k + d_mesh_info.int_params[AC_nz_min]);
    const int dst_idx = DEVICE_1D_COMPDOMAIN_IDX(i, j, k);

    dst[dst_idx] = reduce_initial(src_a[src_idx], src_b[src_idx],
                                  src_c[src_idx]);
}

///////////////////////////////////////////////////////////////////////////////
#define BLOCK_SIZE (1024)
#define ELEMS_PER_THREAD (32)
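// Each _kernel_reduce thread first accumulates ELEMS_PER_THREAD scratchpad values
// serially (strided BLOCK_SIZE apart within its block's span of
// BLOCK_SIZE * ELEMS_PER_THREAD elements); the BLOCK_SIZE partial results are then
// combined with a shared-memory tree reduction.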

template <ReduceFunc reduce>
__global__ void
_kernel_reduce(AcReal* src, AcReal* result)
{
    const int idx = threadIdx.x + blockIdx.x * BLOCK_SIZE * ELEMS_PER_THREAD;
    const int scratchpad_size = DCONST_INT(AC_nxyz);

    if (idx >= scratchpad_size)
        return;

    __shared__ AcReal smem[BLOCK_SIZE];

    AcReal tmp = src[idx];

    for (int i = 1; i < ELEMS_PER_THREAD; ++i) {
        const int src_idx = idx + i * BLOCK_SIZE;
        if (src_idx >= scratchpad_size) {
            // This check is for safety: if accessing uninitialized values
            // beyond the mesh boundaries, we will immediately start seeing NANs
            if (threadIdx.x < BLOCK_SIZE)
                smem[threadIdx.x] = NAN;
            else
                break;
        }
        tmp = reduce(tmp, src[src_idx]);
    }

    smem[threadIdx.x] = tmp;
    __syncthreads();

    int offset = BLOCK_SIZE / 2;
    while (offset > 0) {

        if (threadIdx.x < offset) {
            tmp = reduce(tmp, smem[threadIdx.x + offset]);
            smem[threadIdx.x] = tmp;
        }
        offset /= 2;
        __syncthreads();
    }
    if (threadIdx.x == 0)
        src[idx] = tmp;
}

template <ReduceFunc reduce>
__global__ void
_kernel_reduce_block(const __restrict__ AcReal* src, AcReal* result)
{
    const int scratchpad_size = DCONST_INT(AC_nxyz);
    const int idx = threadIdx.x + blockIdx.x * BLOCK_SIZE * ELEMS_PER_THREAD;
    AcReal tmp = src[idx];
    const int block_offset = BLOCK_SIZE * ELEMS_PER_THREAD;
    for (int i = 1; idx + i * block_offset < scratchpad_size; ++i)
        tmp = reduce(tmp, src[idx + i * block_offset]);

    *result = tmp;
}
//////////////////////////////////////////////////////////////////////////////

AcReal
reduce_scal(const cudaStream_t stream,
            const ReductionType& rtype, const int& nx, const int& ny,
            const int& nz, const AcReal* vertex_buffer,
            AcReal* reduce_scratchpad, AcReal* reduce_result)
{
    const dim3 tpb(32, 4, 1);
    const dim3 bpg(int(ceil(AcReal(nx) / tpb.x)), int(ceil(AcReal(ny) / tpb.y)),
                   int(ceil(AcReal(nz) / tpb.z)));

    const int scratchpad_size = nx * ny * nz;
    const int bpg2 = (unsigned int)ceil(AcReal(scratchpad_size) /
                                        AcReal(ELEMS_PER_THREAD * BLOCK_SIZE));

    switch (rtype) {
    case RTYPE_MAX:
        _kernel_reduce_scal<_device_length_scal>
            <<<bpg, tpb, 0, stream>>>(vertex_buffer, reduce_scratchpad);
        _kernel_reduce<_device_max>
            <<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
        _kernel_reduce_block<_device_max>
            <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
        break;
    case RTYPE_MIN:
        _kernel_reduce_scal<_device_length_scal>
            <<<bpg, tpb, 0, stream>>>(vertex_buffer, reduce_scratchpad);
        _kernel_reduce<_device_min>
            <<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
        _kernel_reduce_block<_device_min>
            <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
        break;
    case RTYPE_RMS:
        _kernel_reduce_scal<_device_squared_scal>
            <<<bpg, tpb, 0, stream>>>(vertex_buffer, reduce_scratchpad);
        _kernel_reduce<_device_sum>
            <<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
        _kernel_reduce_block<_device_sum>
            <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
        break;
    case RTYPE_RMS_EXP:
        _kernel_reduce_scal<_device_exp_squared_scal>
            <<<bpg, tpb, 0, stream>>>(vertex_buffer, reduce_scratchpad);
        _kernel_reduce<_device_sum>
            <<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
        _kernel_reduce_block<_device_sum>
            <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
        break;
    default:
        ERROR("Unrecognized RTYPE");
    }

    AcReal result;
    cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
    return result;
}

AcReal
reduce_vec(const cudaStream_t stream,
           const ReductionType& rtype, const int& nx, const int& ny,
           const int& nz, const AcReal* vertex_buffer_a,
           const AcReal* vertex_buffer_b, const AcReal* vertex_buffer_c,
           AcReal* reduce_scratchpad, AcReal* reduce_result)
{
    const dim3 tpb(32, 4, 1);
    const dim3 bpg(int(ceil(float(nx) / tpb.x)),
                   int(ceil(float(ny) / tpb.y)),
                   int(ceil(float(nz) / tpb.z)));

    const int scratchpad_size = nx * ny * nz;
    const int bpg2 = (unsigned int)ceil(float(scratchpad_size) /
                                        float(ELEMS_PER_THREAD * BLOCK_SIZE));

    // "Features" of this quick & efficient reduction:
    // Block size must be smaller than the computational domain size
    // (otherwise we would have to do some additional bounds checking in the
    // second half of _kernel_reduce, which gets quite confusing).
    // Also the BLOCK_SIZE must be a multiple of two s.t. we can easily split
    // the work without worrying too much about the array bounds.
    ERRCHK_ALWAYS(BLOCK_SIZE <= scratchpad_size);
    ERRCHK_ALWAYS(!(BLOCK_SIZE % 2));
    // NOTE! Also does not work properly with non-power-of-two mesh dimensions.
    // Issue is with "smem[BLOCK_SIZE];". If you init smem to NANs, you can
    // see that uninitialized smem values are used in the comparison.
    ERRCHK_ALWAYS(is_power_of_two(nx));
    ERRCHK_ALWAYS(is_power_of_two(ny));
    ERRCHK_ALWAYS(is_power_of_two(nz));
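    // Worked example (illustrative dimensions, assumed for this note): with
    // nx = ny = nz = 128 the scratchpad holds 128^3 = 2,097,152 values; each
    // _kernel_reduce block covers BLOCK_SIZE * ELEMS_PER_THREAD = 32,768 of them,
    // so bpg2 = 64, and _kernel_reduce_block finally combines those 64 per-block
    // partial results into *reduce_result.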

    switch (rtype) {
    case RTYPE_MAX:
        _kernel_reduce_vec<_device_length_vec>
            <<<bpg, tpb, 0, stream>>>(vertex_buffer_a, vertex_buffer_b, vertex_buffer_c,
                                      reduce_scratchpad);
        _kernel_reduce<_device_max>
            <<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
        _kernel_reduce_block<_device_max>
            <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
        break;
    case RTYPE_MIN:
        _kernel_reduce_vec<_device_length_vec>
            <<<bpg, tpb, 0, stream>>>(vertex_buffer_a, vertex_buffer_b, vertex_buffer_c,
                                      reduce_scratchpad);
        _kernel_reduce<_device_min>
            <<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
        _kernel_reduce_block<_device_min>
            <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
        break;
    case RTYPE_RMS:
        _kernel_reduce_vec<_device_squared_vec>
            <<<bpg, tpb, 0, stream>>>(vertex_buffer_a, vertex_buffer_b, vertex_buffer_c,
                                      reduce_scratchpad);
        _kernel_reduce<_device_sum>
            <<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
        _kernel_reduce_block<_device_sum>
            <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
        break;
    case RTYPE_RMS_EXP:
        _kernel_reduce_vec<_device_exp_squared_vec>
            <<<bpg, tpb, 0, stream>>>(vertex_buffer_a, vertex_buffer_b, vertex_buffer_c,
                                      reduce_scratchpad);
        _kernel_reduce<_device_sum>
            <<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
        _kernel_reduce_block<_device_sum>
            <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
        break;
    default:
        ERROR("Unrecognized RTYPE");
    }

    AcReal result;
    cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
    return result;
}

@@ -478,7 +478,7 @@ run_autotest(void)
    //const vec3i test_dims[] = {{32, 32, 32}};

    int num_failures = 0;
    /*for (size_t i = 0; i < ARRAY_SIZE(test_dims); ++i) {
    for (size_t i = 0; i < ARRAY_SIZE(test_dims); ++i) {
        config.int_params[AC_nx] = test_dims[i].x;
        config.int_params[AC_ny] = test_dims[i].y;
        config.int_params[AC_nz] = test_dims[i].z;
@@ -489,7 +489,7 @@ run_autotest(void)

        num_failures += check_reductions(config);
        fflush(stdout);
    }*/ // TODO uncomment
    }

    for (size_t i = 0; i < ARRAY_SIZE(test_dims); ++i) {
        config.int_params[AC_nx] = test_dims[i].x;