Added multi-GPU reductions. Tested to work with 1-2 GPUs and power-of-two grid dimensions. Requires more testing in special cases (exotic grid dimensions and larger numbers of GPUs).
@@ -158,16 +158,16 @@ acLoadWithOffset(const AcMesh& host_mesh, const int3& src, const int num_vertice
     /*
     Here we decompose the host mesh and distribute it among the GPUs in
     the node.

     The host mesh is a huge contiguous block of data. Its dimensions are given by
     the global variable named "grid". A "grid" is decomposed into "subgrids",
     one for each GPU. Here we check which parts of the range s0...s1 map
     to the memory space stored by some GPU, ranging d0...d1, and transfer
     the data if needed.

     The index mapping is inherently quite involved, but here's a picture which
     hopefully helps make sense out of all this.


     Grid
     |----num_vertices---|
@@ -181,8 +181,8 @@ acLoadWithOffset(const AcMesh& host_mesh, const int3& src, const int num_vertice
     ^        ^
     d0       d1

     ^        ^
     db       da

     */
    for (int i = 0; i < num_devices; ++i) {
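A minimal sketch of the interval test the comment above describes (the names s0, s1, d0 and d1 follow the picture; src_offset, subgrid_volume and copyToDevice are hypothetical stand-ins, not part of this commit):

    // For each device, intersect the host range [s0, s1) with the device's
    // slice [d0, d1); a non-empty intersection is the part to transfer.
    const int s0 = src_offset;              // start of the host range
    const int s1 = s0 + num_vertices;       // end of the host range (exclusive)
    for (int i = 0; i < num_devices; ++i) {
        const int d0 = i * subgrid_volume;  // start of device i's slice
        const int d1 = d0 + subgrid_volume; // end of device i's slice
        const int lo = s0 > d0 ? s0 : d0;   // intersection start
        const int hi = s1 < d1 ? s1 : d1;   // intersection end
        if (lo < hi)                        // overlap is non-empty: transfer it
            copyToDevice(i, /*host offset*/ lo, /*device offset*/ lo - d0, hi - lo);
    }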
@@ -294,7 +294,7 @@ acBoundcondStep(void)
 
    /*
    // ===MIIKKANOTE START==========================================
    %JP: The old way for computing boundary conditions conflicts with the
    way we have to do things with multiple GPUs.

    The older approach relied on unified memory, which represented the whole
@@ -303,7 +303,7 @@ in its current state is more meant for quick prototyping when performance is not
    Getting the CUDA driver to migrate data intelligently across GPUs is much more difficult than
    when managing the memory explicitly.

    In this new approach, I have simplified the multi- and single-GPU layers significantly.
    Quick rundown:
    New struct: Grid. There are two global variables, "grid" and "subgrid", which
    contain the extents of the whole simulation domain and the decomposed grids, respectively.
@@ -314,7 +314,7 @@ Quick rundown:
    The whole simulation domain is decomposed with respect to the z dimension.
    For example, if the grid contains (nx, ny, nz) vertices, then the subgrids
    contain (nx, ny, nz / num_devices) vertices.

    A local index (i, j, k) in some subgrid can be mapped to the global grid with
    global idx = (i, j, k + device_id * subgrid.n.z)

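As a concrete helper, the mapping above is a one-liner (illustrative only; subgrid and device_id as defined in this note):

    // Map a local subgrid index (i, j, k) on a given device to the global
    // grid. Only the z dimension is decomposed across devices.
    static int3
    local_to_global(const int3& local, const int device_id)
    {
        return (int3){local.x, local.y, local.z + device_id * subgrid.n.z};
    }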
@@ -328,35 +328,35 @@ Changes required to this commented code block:
    laid out in device memory
    - The unified memory buffer no longer exists (d_buffer). Instead, we have an opaque handle
    of type "Device" which should be passed to single-GPU functions. In this file, all devices
    are stored in a global array "devices[num_devices]".
    - Every single-GPU function is executed asynchronously by default such that we
    can optimize Astaroth by executing memory transactions concurrently with computation.
    Therefore a StreamType should be passed as a parameter to single-GPU functions.
    Refresher: CUDA function calls are non-blocking when a stream is explicitly passed
    as a parameter and commands executing in different streams can be processed
    in parallel/concurrently.


    Note on periodic boundaries (might be helpful when implementing other boundary conditions):

    With multiple GPUs, periodic boundary conditions applied on indices ranging from

    (0, 0, STENCIL_ORDER/2) to (subgrid.m.x, subgrid.m.y, subgrid.m.z - STENCIL_ORDER/2)

    on a single device are "local", in the sense that they can be computed without having
    to exchange data with neighboring GPUs. Special care is needed only for transferring
    the data to the front and back plates outside this range. In the solution we use here,
    we solve the local boundaries first, and then just exchange the front and back plates
    in a "ring", like so
        device_id
    (n) <-> 0 <-> 1 <-> ... <-> n <-> (0)


    // ======MIIKKANOTE END==========================================

    <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MIIKKANOTE: This code block was essentially
                                                  moved into device.cu, function boundCondStep()
                                                  In astaroth.cu, we use acBoundcondStep()
                                                  just to distribute the work and manage
                                                  communication between GPUs.

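A sketch of the ring exchange described in the note (the plate-transfer helpers are hypothetical; the real work is done by the multi-GPU layer in astaroth.cu):

    // Each device first applies boundary conditions to its local range, then
    // swaps its outermost front/back plates with its neighbors. The modulo
    // wraps the ends around, which makes the global z boundary periodic.
    for (int i = 0; i < num_devices; ++i) {
        const int next = (i + 1) % num_devices;
        const int prev = (i + num_devices - 1) % num_devices;
        sendFrontPlateTo(i, next); // becomes the back halo of `next`
        sendBackPlateTo(i, prev);  // becomes the front halo of `prev`
    }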
@@ -364,8 +364,8 @@ Note on periodic boundaries (might be helpful when implementing other boundary c
 
    exit(0);
    #else


    const int depth = (int)ceil(mesh_info.int_params[AC_mz]/(float)num_devices);

    const int3 start = (int3){0, 0, device_id * depth};
@@ -378,8 +378,8 @@ Note on periodic boundaries (might be helpful when implementing other boundary c
    // TODO uses the default stream currently
    if (mesh_info.int_params[AC_bc_type] == 666) { // TODO MAKE A BETTER SWITCH
        wedge_boundconds(0, tpb, start, end, d_buffer);
    } else {
        for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i)
            periodic_boundconds(0, tpb, start, end, d_buffer.in[i]);
    <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
    */
@@ -424,20 +424,52 @@ acIntegrate(const AcReal& dt)
     return AC_SUCCESS;
 }

+static AcReal
+simple_final_reduce_scal(const ReductionType& rtype, const AcReal* results, const int& n)
+{
+    AcReal res = results[0];
+    for (int i = 1; i < n; ++i) {
+        if (rtype == RTYPE_MAX) {
+            res = max(res, results[i]);
+        } else if (rtype == RTYPE_MIN) {
+            res = min(res, results[i]);
+        } else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
+            res = sum(res, results[i]);
+        } else {
+            ERROR("Invalid rtype");
+        }
+    }
+
+    if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
+        const AcReal inv_n = AcReal(1.) / (grid.n.x * grid.n.y * grid.n.z);
+        res = sqrt(inv_n * res);
+    }
+
+    return res;
+}
+
 AcReal
 acReduceScal(const ReductionType& rtype,
              const VertexBufferHandle& vtxbuffer_handle)
 {
-    // TODO
-    return 0;
+    AcReal results[num_devices];
+    for (int i = 0; i < num_devices; ++i) {
+        reduceScal(devices[i], STREAM_PRIMARY, rtype, vtxbuffer_handle, &results[i]);
+    }
+
+    return simple_final_reduce_scal(rtype, results, num_devices);
 }

 AcReal
 acReduceVec(const ReductionType& rtype, const VertexBufferHandle& a,
             const VertexBufferHandle& b, const VertexBufferHandle& c)
 {
-    // TODO
-    return 0;
+    AcReal results[num_devices];
+    for (int i = 0; i < num_devices; ++i) {
+        reduceVec(devices[i], STREAM_PRIMARY, rtype, a, b, c, &results[i]);
+    }
+
+    return simple_final_reduce_scal(rtype, results, num_devices);
 }

 AcResult
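The reductions above are two-stage: each device reduces its own subgrid (via the per-device reduceScal/reduceVec below), and simple_final_reduce_scal combines the per-device partial results on the host. For RTYPE_RMS each device returns a sum of squares, so the host only sums the partials and applies sqrt(sum / n) once, over the global vertex count. Caller-side usage might look like this (the vertex buffer handles here are examples):

    // Maximum of a scalar field over all GPUs.
    const AcReal max_val = acReduceScal(RTYPE_MAX, VTXBUF_LNRHO);
    // RMS of a three-component vector field over all GPUs.
    const AcReal uu_rms = acReduceVec(RTYPE_RMS, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);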
@@ -37,6 +37,7 @@ __constant__ AcMeshInfo d_mesh_info;
 #define DCONST_INT(X) (d_mesh_info.int_params[X])
 #define DCONST_REAL(X) (d_mesh_info.real_params[X])
 #define DEVICE_VTXBUF_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_mx) + (k)*DCONST_INT(AC_mxy))
+#define DEVICE_1D_COMPDOMAIN_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_nx) + (k)*DCONST_INT(AC_nxy))
 #include "kernels/kernels.cuh"

 struct device_s {
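The new macro indexes the nx*ny*nz computational domain (no ghost zones), whereas DEVICE_VTXBUF_IDX indexes the full mx*my*mz vertex buffer (ghost zones included). The reduction kernels added below use the pair exactly like this: read from the padded buffer at a ghost-zone offset, write densely into the scratchpad (f is an arbitrary per-element transform, illustrative only):

    const int src = DEVICE_VTXBUF_IDX(i + DCONST_INT(AC_nx_min),
                                      j + DCONST_INT(AC_ny_min),
                                      k + DCONST_INT(AC_nz_min));
    const int dst = DEVICE_1D_COMPDOMAIN_IDX(i, j, k);
    scratchpad[dst] = f(vtxbuf[src]);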
@@ -118,7 +119,7 @@ AcResult
 createDevice(const int id, const AcMeshInfo device_config, Device* device_handle)
 {
     cudaSetDevice(id);
     cudaDeviceReset();

     // Create Device
     struct device_s* device = (struct device_s*) malloc(sizeof(*device));
@@ -194,16 +195,40 @@ boundcondStep(const Device device, const StreamType stream_type, const int3& sta
 }

 AcResult
-reduceScal(const Device device)
+reduceScal(const Device device, const StreamType stream_type, const ReductionType rtype,
+           const VertexBufferHandle vtxbuf_handle, AcReal* result)
 {
     cudaSetDevice(device->id);

+    *result = reduce_scal(device->streams[stream_type], rtype,
+                          device->local_config.int_params[AC_nx],
+                          device->local_config.int_params[AC_ny],
+                          device->local_config.int_params[AC_nz],
+                          device->vba.in[vtxbuf_handle],
+                          device->reduce_scratchpad, device->reduce_result);
+
     return AC_SUCCESS;
 }

 AcResult
-reduceVec(const Device device)
+reduceVec(const Device device, const StreamType stream_type,
+          const ReductionType rtype,
+          const VertexBufferHandle vec0,
+          const VertexBufferHandle vec1,
+          const VertexBufferHandle vec2,
+          AcReal* result)
 {
     cudaSetDevice(device->id);

+    *result = reduce_vec(device->streams[stream_type], rtype,
+                         device->local_config.int_params[AC_nx],
+                         device->local_config.int_params[AC_ny],
+                         device->local_config.int_params[AC_nz],
+                         device->vba.in[vec0],
+                         device->vba.in[vec1],
+                         device->vba.in[vec2],
+                         device->reduce_scratchpad, device->reduce_result);
+
     return AC_SUCCESS;
 }

@@ -51,10 +51,16 @@ AcResult boundcondStep(const Device device, const StreamType stream_type,
                         const int3& start, const int3& end);

 /** */
-AcResult reduceScal(const Device device);
+AcResult reduceScal(const Device device, const StreamType stream_type, const ReductionType rtype,
+                    const VertexBufferHandle vtxbuf_handle, AcReal* result);

 /** */
-AcResult reduceVec(const Device device);
+AcResult reduceVec(const Device device, const StreamType stream_type,
+                   const ReductionType rtype,
+                   const VertexBufferHandle vec0,
+                   const VertexBufferHandle vec1,
+                   const VertexBufferHandle vec2,
+                   AcReal* result);

 /** */
 AcResult rkStep(const Device device, const StreamType stream_type, const int step_number,
@@ -754,7 +754,7 @@ randf(void)
 }

 AcResult
 rk3_step_async(const cudaStream_t stream, const int& step_number, const int3& start, const int3& end,
                const AcReal dt, VertexBufferArray* buffer)
 {
     const dim3 tpb(32, 1, 4);
@@ -792,3 +792,294 @@ rk3_step_async(const cudaStream_t stream, const int& step_number, const int3& st
     ERRCHK_CUDA_KERNEL();
     return AC_SUCCESS;
 }
+
+
+////////////////REDUCE///////////////////////////
+#include "src/core/math_utils.h" // is_power_of_two
+
+// Function pointer definitions
+typedef AcReal (*ReduceFunc)(const AcReal&, const AcReal&);
+typedef AcReal (*ReduceInitialScalFunc)(const AcReal&);
+typedef AcReal (*ReduceInitialVecFunc)(const AcReal&, const AcReal&,
+                                       const AcReal&);
+
+// clang-format off
+/* Comparison funcs */
+__device__ inline AcReal
+_device_max(const AcReal& a, const AcReal& b) { return a > b ? a : b; }
+
+__device__ inline AcReal
+_device_min(const AcReal& a, const AcReal& b) { return a < b ? a : b; }
+
+__device__ inline AcReal
+_device_sum(const AcReal& a, const AcReal& b) { return a + b; }
+
+/* Functions used to determine the values used during reduction */
+__device__ inline AcReal
+_device_length_scal(const AcReal& a) { return AcReal(a); }
+
+__device__ inline AcReal
+_device_squared_scal(const AcReal& a) { return (AcReal)(a*a); }
+
+__device__ inline AcReal
+_device_exp_squared_scal(const AcReal& a) { return exp(a)*exp(a); }
+
+__device__ inline AcReal
+_device_length_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return sqrt(a*a + b*b + c*c); }
+
+__device__ inline AcReal
+_device_squared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return _device_squared_scal(a) + _device_squared_scal(b) + _device_squared_scal(c); }
+
+__device__ inline AcReal
+_device_exp_squared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return _device_exp_squared_scal(a) + _device_exp_squared_scal(b) + _device_exp_squared_scal(c); }
+// clang-format on
+
+__device__ inline bool
+oob(const int& i, const int& j, const int& k)
+{
+    if (i >= d_mesh_info.int_params[AC_nx] ||
+        j >= d_mesh_info.int_params[AC_ny] ||
+        k >= d_mesh_info.int_params[AC_nz])
+        return true;
+    else
+        return false;
+}
+
+template <ReduceInitialScalFunc reduce_initial>
+__global__ void
+_kernel_reduce_scal(const __restrict__ AcReal* src, AcReal* dst)
+{
+    const int i = threadIdx.x + blockIdx.x * blockDim.x;
+    const int j = threadIdx.y + blockIdx.y * blockDim.y;
+    const int k = threadIdx.z + blockIdx.z * blockDim.z;
+
+    if (oob(i, j, k))
+        return;
+
+    const int src_idx = DEVICE_VTXBUF_IDX(
+        i + d_mesh_info.int_params[AC_nx_min],
+        j + d_mesh_info.int_params[AC_ny_min],
+        k + d_mesh_info.int_params[AC_nz_min]);
+    const int dst_idx = DEVICE_1D_COMPDOMAIN_IDX(i, j, k);
+
+    dst[dst_idx] = reduce_initial(src[src_idx]);
+}
+
+template <ReduceInitialVecFunc reduce_initial>
+__global__ void
+_kernel_reduce_vec(const __restrict__ AcReal* src_a,
+                   const __restrict__ AcReal* src_b,
+                   const __restrict__ AcReal* src_c, AcReal* dst)
+{
+    const int i = threadIdx.x + blockIdx.x * blockDim.x;
+    const int j = threadIdx.y + blockIdx.y * blockDim.y;
+    const int k = threadIdx.z + blockIdx.z * blockDim.z;
+
+    if (oob(i, j, k))
+        return;
+
+    const int src_idx = DEVICE_VTXBUF_IDX(
+        i + d_mesh_info.int_params[AC_nx_min],
+        j + d_mesh_info.int_params[AC_ny_min],
+        k + d_mesh_info.int_params[AC_nz_min]);
+    const int dst_idx = DEVICE_1D_COMPDOMAIN_IDX(i, j, k);
+
+    dst[dst_idx] = reduce_initial(src_a[src_idx], src_b[src_idx],
+                                  src_c[src_idx]);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+#define BLOCK_SIZE (1024)
+#define ELEMS_PER_THREAD (32)
+
+template <ReduceFunc reduce>
+__global__ void
+_kernel_reduce(AcReal* src, AcReal* result)
+{
+    const int idx = threadIdx.x + blockIdx.x * BLOCK_SIZE * ELEMS_PER_THREAD;
+    const int scratchpad_size = DCONST_INT(AC_nxyz);
+
+    if (idx >= scratchpad_size)
+        return;
+
+    __shared__ AcReal smem[BLOCK_SIZE];
+
+    AcReal tmp = src[idx];
+
+    for (int i = 1; i < ELEMS_PER_THREAD; ++i) {
+        const int src_idx = idx + i * BLOCK_SIZE;
+        if (src_idx >= scratchpad_size) {
+            // This check is for safety: if accessing uninitialized values
+            // beyond the mesh boundaries, we will immediately start seeing NANs
+            if (threadIdx.x < BLOCK_SIZE)
+                smem[threadIdx.x] = NAN;
+            else
+                break;
+        }
+        tmp = reduce(tmp, src[src_idx]);
+    }
+
+    smem[threadIdx.x] = tmp;
+    __syncthreads();
+
+    int offset = BLOCK_SIZE / 2;
+    while (offset > 0) {
+
+        if (threadIdx.x < offset) {
+            tmp = reduce(tmp, smem[threadIdx.x + offset]);
+            smem[threadIdx.x] = tmp;
+        }
+        offset /= 2;
+        __syncthreads();
+    }
+    if (threadIdx.x == 0)
+        src[idx] = tmp;
+}
+
+template <ReduceFunc reduce>
+__global__ void
+_kernel_reduce_block(const __restrict__ AcReal* src, AcReal* result)
+{
+    const int scratchpad_size = DCONST_INT(AC_nxyz);
+    const int idx = threadIdx.x + blockIdx.x * BLOCK_SIZE * ELEMS_PER_THREAD;
+    AcReal tmp = src[idx];
+    const int block_offset = BLOCK_SIZE * ELEMS_PER_THREAD;
+    for (int i = 1; idx + i * block_offset < scratchpad_size; ++i)
+        tmp = reduce(tmp, src[idx + i * block_offset]);
+
+    *result = tmp;
+}
+//////////////////////////////////////////////////////////////////////////////
+
+AcReal
+reduce_scal(const cudaStream_t stream,
+            const ReductionType& rtype, const int& nx, const int& ny,
+            const int& nz, const AcReal* vertex_buffer,
+            AcReal* reduce_scratchpad, AcReal* reduce_result)
+{
+    const dim3 tpb(32, 4, 1);
+    const dim3 bpg(int(ceil(AcReal(nx) / tpb.x)), int(ceil(AcReal(ny) / tpb.y)),
+                   int(ceil(AcReal(nz) / tpb.z)));
+
+    const int scratchpad_size = nx * ny * nz;
+    const int bpg2 = (unsigned int)ceil(AcReal(scratchpad_size) /
+                                        AcReal(ELEMS_PER_THREAD * BLOCK_SIZE));
+
+    switch (rtype) {
+    case RTYPE_MAX:
+        _kernel_reduce_scal<_device_length_scal>
+            <<<bpg, tpb, 0, stream>>>(vertex_buffer, reduce_scratchpad);
+        _kernel_reduce<_device_max>
+            <<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
+        _kernel_reduce_block<_device_max>
+            <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
+        break;
+    case RTYPE_MIN:
+        _kernel_reduce_scal<_device_length_scal>
+            <<<bpg, tpb, 0, stream>>>(vertex_buffer, reduce_scratchpad);
+        _kernel_reduce<_device_min>
+            <<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
+        _kernel_reduce_block<_device_min>
+            <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
+        break;
+    case RTYPE_RMS:
+        _kernel_reduce_scal<_device_squared_scal>
+            <<<bpg, tpb, 0, stream>>>(vertex_buffer, reduce_scratchpad);
+        _kernel_reduce<_device_sum>
+            <<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
+        _kernel_reduce_block<_device_sum>
+            <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
+        break;
+    case RTYPE_RMS_EXP:
+        _kernel_reduce_scal<_device_exp_squared_scal>
+            <<<bpg, tpb, 0, stream>>>(vertex_buffer, reduce_scratchpad);
+        _kernel_reduce<_device_sum>
+            <<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
+        _kernel_reduce_block<_device_sum>
+            <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
+        break;
+    default:
+        ERROR("Unrecognized RTYPE");
+    }
+
+    AcReal result;
+    cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
+    return result;
+}
+
+AcReal
+reduce_vec(const cudaStream_t stream,
+           const ReductionType& rtype, const int& nx, const int& ny,
+           const int& nz, const AcReal* vertex_buffer_a,
+           const AcReal* vertex_buffer_b, const AcReal* vertex_buffer_c,
+           AcReal* reduce_scratchpad, AcReal* reduce_result)
+{
+    const dim3 tpb(32, 4, 1);
+    const dim3 bpg(int(ceil(float(nx) / tpb.x)),
+                   int(ceil(float(ny) / tpb.y)),
+                   int(ceil(float(nz) / tpb.z)));
+
+    const int scratchpad_size = nx * ny * nz;
+    const int bpg2 = (unsigned int)ceil(float(scratchpad_size) /
+                                        float(ELEMS_PER_THREAD * BLOCK_SIZE));
+
+    // "Features" of this quick & efficient reduction:
+    // Block size must be smaller than the computational domain size
+    // (otherwise we would have to do some additional bounds checking in the
+    // second half of _kernel_reduce, which gets quite confusing)
+    // Also the BLOCK_SIZE must be a multiple of two s.t. we can easily split
+    // the work without worrying too much about the array bounds.
+    ERRCHK_ALWAYS(BLOCK_SIZE <= scratchpad_size);
+    ERRCHK_ALWAYS(!(BLOCK_SIZE % 2));
+    // NOTE! Also does not work properly with non-power-of-two mesh dimensions.
+    // Issue is with "smem[BLOCK_SIZE];". If you init smem to NANs, you can
+    // see that uninitialized smem values are used in the comparison
+    ERRCHK_ALWAYS(is_power_of_two(nx));
+    ERRCHK_ALWAYS(is_power_of_two(ny));
+    ERRCHK_ALWAYS(is_power_of_two(nz));
+
+    switch (rtype) {
+    case RTYPE_MAX:
+        _kernel_reduce_vec<_device_length_vec>
+            <<<bpg, tpb, 0, stream>>>(vertex_buffer_a, vertex_buffer_b, vertex_buffer_c,
+                                      reduce_scratchpad);
+        _kernel_reduce<_device_max>
+            <<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
+        _kernel_reduce_block<_device_max>
+            <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
+        break;
+    case RTYPE_MIN:
+        _kernel_reduce_vec<_device_length_vec>
+            <<<bpg, tpb, 0, stream>>>(vertex_buffer_a, vertex_buffer_b, vertex_buffer_c,
+                                      reduce_scratchpad);
+        _kernel_reduce<_device_min>
+            <<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
+        _kernel_reduce_block<_device_min>
+            <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
+        break;
+    case RTYPE_RMS:
+        _kernel_reduce_vec<_device_squared_vec>
+            <<<bpg, tpb, 0, stream>>>(vertex_buffer_a, vertex_buffer_b, vertex_buffer_c,
+                                      reduce_scratchpad);
+        _kernel_reduce<_device_sum>
+            <<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
+        _kernel_reduce_block<_device_sum>
+            <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
+        break;
+    case RTYPE_RMS_EXP:
+        _kernel_reduce_vec<_device_exp_squared_vec>
+            <<<bpg, tpb, 0, stream>>>(vertex_buffer_a, vertex_buffer_b, vertex_buffer_c,
+                                      reduce_scratchpad);
+        _kernel_reduce<_device_sum>
+            <<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
+        _kernel_reduce_block<_device_sum>
+            <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
+        break;
+    default:
+        ERROR("Unrecognized RTYPE");
+    }
+
+    AcReal result;
+    cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
+    return result;
+}
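For checking these kernels against a host reference (as run_autotest below does via check_reductions), a straightforward model implementation suffices; a sketch, assuming a host-side copy of the field (model_rms is illustrative, not part of this commit):

    // Host reference for reduce_scal(RTYPE_RMS, ...): RMS over n vertices.
    static AcReal
    model_rms(const AcReal* field, const size_t n)
    {
        long double sum = 0.0L; // wide accumulator for numerical headroom
        for (size_t i = 0; i < n; ++i)
            sum += (long double)field[i] * (long double)field[i];
        return (AcReal)sqrtl(sum / (long double)n);
    }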
@@ -478,7 +478,7 @@ run_autotest(void)
     //const vec3i test_dims[] = {{32, 32, 32}};

     int num_failures = 0;
-    /*for (size_t i = 0; i < ARRAY_SIZE(test_dims); ++i) {
+    for (size_t i = 0; i < ARRAY_SIZE(test_dims); ++i) {
         config.int_params[AC_nx] = test_dims[i].x;
         config.int_params[AC_ny] = test_dims[i].y;
         config.int_params[AC_nz] = test_dims[i].z;
@@ -489,7 +489,7 @@ run_autotest(void)
 
         num_failures += check_reductions(config);
         fflush(stdout);
-    }*/ // TODO uncomment
+    }

     for (size_t i = 0; i < ARRAY_SIZE(test_dims); ++i) {
         config.int_params[AC_nx] = test_dims[i].x;