Autoformatted all CUDA/C/C++ code

jpekkila
2019-06-18 16:42:56 +03:00
parent 6fdc4cddb2
commit 8864266042
12 changed files with 1053 additions and 1111 deletions


@@ -28,33 +28,24 @@
#include "errchk.h"
#include "device.cuh"
#include "math_utils.h" // sum for reductions
#include "math_utils.h" // sum for reductions
#include "standalone/config_loader.h" // update_config
-const char* intparam_names[] = {AC_FOR_INT_PARAM_TYPES(AC_GEN_STR)};
-const char* realparam_names[] = {AC_FOR_REAL_PARAM_TYPES(AC_GEN_STR)};
-const char* vtxbuf_names[] = {AC_FOR_VTXBUF_HANDLES(AC_GEN_STR)};
+const char* intparam_names[] = {AC_FOR_INT_PARAM_TYPES(AC_GEN_STR)};
+const char* realparam_names[] = {AC_FOR_REAL_PARAM_TYPES(AC_GEN_STR)};
+const char* vtxbuf_names[] = {AC_FOR_VTXBUF_HANDLES(AC_GEN_STR)};
-static const int MAX_NUM_DEVICES = 32;
-static int num_devices = 1;
+static const int MAX_NUM_DEVICES = 32;
+static int num_devices = 1;
static Device devices[MAX_NUM_DEVICES] = {};
static Grid
createGrid(const AcMeshInfo& config)
{
Grid grid;
-grid.m = (int3) {
-config.int_params[AC_mx],
-config.int_params[AC_my],
-config.int_params[AC_mz]
-};
-grid.n = (int3) {
-config.int_params[AC_nx],
-config.int_params[AC_ny],
-config.int_params[AC_nz]
-};
+grid.m = (int3){config.int_params[AC_mx], config.int_params[AC_my], config.int_params[AC_mz]};
+grid.n = (int3){config.int_params[AC_nx], config.int_params[AC_ny], config.int_params[AC_nz]};
return grid;
}
@@ -71,8 +62,7 @@ gridIdx(const Grid& grid, const int i, const int j, const int k)
static int3
gridIdx3d(const Grid& grid, const int idx)
{
-return (int3){idx % grid.m.x,
-(idx % (grid.m.x * grid.m.y)) / grid.m.x,
+return (int3){idx % grid.m.x, (idx % (grid.m.x * grid.m.y)) / grid.m.x,
idx / (grid.m.x * grid.m.y)};
}
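A minimal standalone sketch (not part of the commit) of the index round trip that gridIdx3d above inverts, assuming the row-major, x-fastest layout implied by its modulo arithmetic; Int3, GridS and the *S helper names are hypothetical stand-ins for this file's int3, Grid, gridIdx and gridIdx3d:

    #include <cassert>

    struct Int3 { int x, y, z; };      // stand-in for CUDA's int3
    struct GridS { Int3 m, n; };       // stand-in for the Grid struct above

    // Conventional flat index for (i, j, k), as implied by gridIdx3d's inverse arithmetic.
    static int gridIdxS(const GridS& g, const Int3& v)
    {
        return v.x + v.y * g.m.x + v.z * g.m.x * g.m.y;
    }

    // Same math as gridIdx3d in the diff above.
    static Int3 gridIdx3dS(const GridS& g, const int idx)
    {
        return {idx % g.m.x, (idx % (g.m.x * g.m.y)) / g.m.x, idx / (g.m.x * g.m.y)};
    }

    int main()
    {
        const GridS g{{136, 136, 136}, {128, 128, 128}}; // illustrative padded/interior extents
        const Int3 v{5, 17, 80};
        const Int3 w = gridIdx3dS(g, gridIdxS(g, v));
        assert(w.x == v.x && w.y == v.y && w.z == v.z);  // round trip is the identity
    }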
@@ -119,10 +109,12 @@ acInit(const AcMeshInfo& config)
ERRCHK_ALWAYS(subgrid.n.y >= STENCIL_ORDER);
ERRCHK_ALWAYS(subgrid.n.z >= STENCIL_ORDER);
printf("Grid m "); printInt3(grid.m); printf("\n");
printf("Grid n "); printInt3(grid.n); printf("\n");
// clang-format off
printf("Grid m "); printInt3(grid.m); printf("\n");
printf("Grid n "); printInt3(grid.n); printf("\n");
printf("Subrid m "); printInt3(subgrid.m); printf("\n");
printf("Subrid n "); printInt3(subgrid.n); printf("\n");
// clang-format on
// Initialize the devices
for (int i = 0; i < num_devices; ++i) {
@@ -202,8 +194,10 @@ acLoadWithOffset(const AcMesh& host_mesh, const int3& src, const int num_vertice
*/
if (db.z >= da.z) {
const int copy_cells = gridIdxx(subgrid, db) - gridIdxx(subgrid, da);
-const int3 da_local = (int3){da.x, da.y, da.z - i * grid.n.z / num_devices}; // DECOMPOSITION OFFSET HERE
-// printf("\t\tcopy %d cells to local index ", copy_cells); printInt3(da_local); printf("\n");
+const int3 da_local = (int3){
+da.x, da.y, da.z - i * grid.n.z / num_devices}; // DECOMPOSITION OFFSET HERE
+// printf("\t\tcopy %d cells to local index ", copy_cells); printInt3(da_local);
+// printf("\n");
copyMeshToDevice(devices[i], STREAM_PRIMARY, host_mesh, da, da_local, copy_cells);
}
printf("\n");
@@ -236,8 +230,10 @@ acStoreWithOffset(const int3& src, const int num_vertices, AcMesh* host_mesh)
*/
if (db.z >= da.z) {
const int copy_cells = gridIdxx(subgrid, db) - gridIdxx(subgrid, da);
-const int3 da_local = (int3){da.x, da.y, da.z - i * grid.n.z / num_devices}; // DECOMPOSITION OFFSET HERE
-// printf("\t\tcopy %d cells from local index ", copy_cells); printInt3(da_local); printf("\n");
+const int3 da_local = (int3){
+da.x, da.y, da.z - i * grid.n.z / num_devices}; // DECOMPOSITION OFFSET HERE
+// printf("\t\tcopy %d cells from local index ", copy_cells); printInt3(da_local);
+// printf("\n");
copyMeshToHost(devices[i], STREAM_PRIMARY, da_local, da, copy_cells, host_mesh);
}
printf("\n");
@@ -262,10 +258,9 @@ acStore(AcMesh* host_mesh)
AcResult
acIntegrateStep(const int& isubstep, const AcReal& dt)
{
-const int3 start = (int3){STENCIL_ORDER/2, STENCIL_ORDER/2, STENCIL_ORDER/2};
-const int3 end = (int3){STENCIL_ORDER/2 + subgrid.n.x,
-STENCIL_ORDER/2 + subgrid.n.y,
-STENCIL_ORDER/2 + subgrid.n.z};
+const int3 start = (int3){STENCIL_ORDER / 2, STENCIL_ORDER / 2, STENCIL_ORDER / 2};
+const int3 end = (int3){STENCIL_ORDER / 2 + subgrid.n.x, STENCIL_ORDER / 2 + subgrid.n.y,
+STENCIL_ORDER / 2 + subgrid.n.z};
for (int i = 0; i < num_devices; ++i) {
rkStep(devices[i], STREAM_PRIMARY, isubstep, start, end, dt);
}
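A minimal sketch (not part of the commit) of the halo arithmetic behind the start/end bounds in acIntegrateStep above, assuming a 6th-order stencil and the padded extent m = n + STENCIL_ORDER; the names and sizes here are illustrative only:

    #include <cassert>

    int main()
    {
        const int STENCIL_ORDER = 6;                 // assumption: 6th-order stencil
        const int NGHOST        = STENCIL_ORDER / 2; // ghost zones on each face
        const int nx = 128;                          // interior extent (illustrative)
        const int mx = nx + STENCIL_ORDER;           // assumed padded extent

        const int start = NGHOST;                    // first interior index, as in acIntegrateStep
        const int end   = NGHOST + nx;               // one past the last interior index

        assert(start == 3 && end + NGHOST == mx);    // interior plus both halos fills the padded axis
    }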
@@ -278,121 +273,125 @@ acBoundcondStep(void)
{
acSynchronize();
if (num_devices == 1) {
-boundcondStep(devices[0], STREAM_PRIMARY,
-(int3){0, 0, 0}, (int3){subgrid.m.x, subgrid.m.y, subgrid.m.z});
-} else {
+boundcondStep(devices[0], STREAM_PRIMARY, (int3){0, 0, 0},
+(int3){subgrid.m.x, subgrid.m.y, subgrid.m.z});
+}
+else {
// Local boundary conditions
for (int i = 0; i < num_devices; ++i) {
-const int3 d0 = (int3){0, 0, STENCIL_ORDER/2}; // DECOMPOSITION OFFSET HERE
+const int3 d0 = (int3){0, 0, STENCIL_ORDER / 2}; // DECOMPOSITION OFFSET HERE
const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.n.z};
boundcondStep(devices[i], STREAM_PRIMARY, d0, d1);
}
-/*
-// ===MIIKKANOTE START==========================================
-%JP: The old way for computing boundary conditions conflicts with the
-way we have to do things with multiple GPUs.
+/*
+// ===MIIKKANOTE START==========================================
+%JP: The old way for computing boundary conditions conflicts with the
+way we have to do things with multiple GPUs.
-The older approach relied on unified memory, which represented the whole
-memory area as one huge mesh instead of several smaller ones. However, unified memory
-in its current state is more meant for quick prototyping when performance is not an issue.
-Getting the CUDA driver to migrate data intelligently across GPUs is much more difficult than
-when managing the memory explicitly.
+The older approach relied on unified memory, which represented the whole
+memory area as one huge mesh instead of several smaller ones. However, unified memory
+in its current state is more meant for quick prototyping when performance is not an issue.
+Getting the CUDA driver to migrate data intelligently across GPUs is much more difficult
+than when managing the memory explicitly.
-In this new approach, I have simplified the multi- and single-GPU layers significantly.
-Quick rundown:
-New struct: Grid. There are two global variables, "grid" and "subgrid", which
-contain the extents of the whole simulation domain and the decomposed grids, respectively.
-To simplify thing, we require that each GPU is assigned the same amount of work,
-therefore each GPU in the node is assigned and "subgrid.m" -sized block of data
-to work with.
+In this new approach, I have simplified the multi- and single-GPU layers significantly.
+Quick rundown:
+New struct: Grid. There are two global variables, "grid" and "subgrid", which
+contain the extents of the whole simulation domain and the decomposed grids,
+respectively. To simplify thing, we require that each GPU is assigned the same amount of
+work, therefore each GPU in the node is assigned and "subgrid.m" -sized block of data to
+work with.
-The whole simulation domain is decomposed with respect to the z dimension.
-For example, if the grid contains (nx, ny, nz) vertices, then the subgrids
-contain (nx, ny, nz / num_devices) vertices.
+The whole simulation domain is decomposed with respect to the z dimension.
+For example, if the grid contains (nx, ny, nz) vertices, then the subgrids
+contain (nx, ny, nz / num_devices) vertices.
-An local index (i, j, k) in some subgrid can be mapped to the global grid with
-global idx = (i, j, k + device_id * subgrid.n.z)
+An local index (i, j, k) in some subgrid can be mapped to the global grid with
+global idx = (i, j, k + device_id * subgrid.n.z)
-Terminology:
-- Single-GPU function: a function defined on the single-GPU layer (device.cu)
+Terminology:
+- Single-GPU function: a function defined on the single-GPU layer (device.cu)
-Changes required to this commented code block:
-- The thread block dimensions (tpb) are no longer passed to the kernel here but in device.cu
-instead. Same holds for any complex index calculations. Instead, the local coordinates
-should be passed as an int3 type without having to consider how the data is actually
-laid out in device memory
-- The unified memory buffer no longer exists (d_buffer). Instead, we have an opaque handle
-of type "Device" which should be passed to single-GPU functions. In this file, all devices
-are stored in a global array "devices[num_devices]".
-- Every single-GPU function is executed asynchronously by default such that we
-can optimize Astaroth by executing memory transactions concurrently with computation.
-Therefore a StreamType should be passed as a parameter to single-GPU functions.
-Refresher: CUDA function calls are non-blocking when a stream is explicitly passed
-as a parameter and commands executing in different streams can be processed
-in parallel/concurrently.
+Changes required to this commented code block:
+- The thread block dimensions (tpb) are no longer passed to the kernel here but in
+device.cu instead. Same holds for any complex index calculations. Instead, the local
+coordinates should be passed as an int3 type without having to consider how the data is
+actually laid out in device memory
+- The unified memory buffer no longer exists (d_buffer). Instead, we have an opaque
+handle of type "Device" which should be passed to single-GPU functions. In this file, all
+devices are stored in a global array "devices[num_devices]".
+- Every single-GPU function is executed asynchronously by default such that we
+can optimize Astaroth by executing memory transactions concurrently with
+computation. Therefore a StreamType should be passed as a parameter to single-GPU functions.
+Refresher: CUDA function calls are non-blocking when a stream is explicitly passed
+as a parameter and commands executing in different streams can be processed
+in parallel/concurrently.
-Note on periodic boundaries (might be helpful when implementing other boundary conditions):
+Note on periodic boundaries (might be helpful when implementing other boundary conditions):
-With multiple GPUs, periodic boundary conditions applied on indices ranging from
+With multiple GPUs, periodic boundary conditions applied on indices ranging from
-(0, 0, STENCIL_ORDER/2) to (subgrid.m.x, subgrid.m.y, subgrid.m.z - STENCIL_ORDER/2)
+(0, 0, STENCIL_ORDER/2) to (subgrid.m.x, subgrid.m.y, subgrid.m.z -
+STENCIL_ORDER/2)
on a single device are "local", in the sense that they can be computed without having
to exchange data with neighboring GPUs. Special care is needed only for transferring
the data to the fron and back plates outside this range. In the solution we use here,
we solve the local boundaries first, and then just exchange the front and back plates
in a "ring", like so
device_id
(n) <-> 0 <-> 1 <-> ... <-> n <-> (0)
on a single device are "local", in the sense that they can be computed without
having to exchange data with neighboring GPUs. Special care is needed only for transferring
the data to the fron and back plates outside this range. In the solution we use
here, we solve the local boundaries first, and then just exchange the front and back plates
in a "ring", like so
device_id
(n) <-> 0 <-> 1 <-> ... <-> n <-> (0)
-// ======MIIKKANOTE END==========================================
+// ======MIIKKANOTE END==========================================
-<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MIIKKANOTE: This code block was essentially
-moved into device.cu, function boundCondStep()
-In astaroth.cu, we use acBoundcondStep()
-just to distribute the work and manage
-communication between GPUs.
+<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MIIKKANOTE: This code block was essentially
+moved into device.cu, function
+boundCondStep() In astaroth.cu, we use acBoundcondStep() just to distribute the work and
+manage communication between GPUs.
-printf("Boundconds best dims (%d, %d, %d) %f ms\n", best_dims.x, best_dims.y, best_dims.z, double(best_time) / NUM_ITERATIONS);
+printf("Boundconds best dims (%d, %d, %d) %f ms\n", best_dims.x, best_dims.y,
+best_dims.z, double(best_time) / NUM_ITERATIONS);
-exit(0);
-#else
+exit(0);
+#else
-const int depth = (int)ceil(mesh_info.int_params[AC_mz]/(float)num_devices);
+const int depth = (int)ceil(mesh_info.int_params[AC_mz]/(float)num_devices);
-const int3 start = (int3){0, 0, device_id * depth};
-const int3 end = (int3){mesh_info.int_params[AC_mx],
-mesh_info.int_params[AC_my],
-min((device_id+1) * depth, mesh_info.int_params[AC_mz])};
+const int3 start = (int3){0, 0, device_id * depth};
+const int3 end = (int3){mesh_info.int_params[AC_mx],
+mesh_info.int_params[AC_my],
+min((device_id+1) * depth, mesh_info.int_params[AC_mz])};
-const dim3 tpb(8,2,8);
+const dim3 tpb(8,2,8);
-// TODO uses the default stream currently
-if (mesh_info.int_params[AC_bc_type] == 666) { // TODO MAKE A BETTER SWITCH
-wedge_boundconds(0, tpb, start, end, d_buffer);
-} else {
-for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i)
-periodic_boundconds(0, tpb, start, end, d_buffer.in[i]);
-<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-*/
+// TODO uses the default stream currently
+if (mesh_info.int_params[AC_bc_type] == 666) { // TODO MAKE A BETTER SWITCH
+wedge_boundconds(0, tpb, start, end, d_buffer);
+} else {
+for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i)
+periodic_boundconds(0, tpb, start, end, d_buffer.in[i]);
+<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+*/
// Exchange halos
for (int i = 0; i < num_devices; ++i) {
-const int num_vertices = subgrid.m.x * subgrid.m.y * STENCIL_ORDER/2;
+const int num_vertices = subgrid.m.x * subgrid.m.y * STENCIL_ORDER / 2;
// ...|ooooxxx|... -> xxx|ooooooo|...
{
-const int3 src = (int3) {0, 0, subgrid.n.z};
-const int3 dst = (int3) {0, 0, 0};
-copyMeshDeviceToDevice(devices[i], STREAM_PRIMARY, src, devices[(i+1) % num_devices], dst, num_vertices);
+const int3 src = (int3){0, 0, subgrid.n.z};
+const int3 dst = (int3){0, 0, 0};
+copyMeshDeviceToDevice(devices[i], STREAM_PRIMARY, src,
+devices[(i + 1) % num_devices], dst, num_vertices);
}
// ...|ooooooo|xxx <- ...|xxxoooo|...
{
-const int3 src = (int3) {0, 0, STENCIL_ORDER/2};
-const int3 dst = (int3) {0, 0, STENCIL_ORDER/2 + subgrid.n.z};
-copyMeshDeviceToDevice(devices[(i+1) % num_devices], STREAM_PRIMARY, src, devices[i], dst, num_vertices);
+const int3 src = (int3){0, 0, STENCIL_ORDER / 2};
+const int3 dst = (int3){0, 0, STENCIL_ORDER / 2 + subgrid.n.z};
+copyMeshDeviceToDevice(devices[(i + 1) % num_devices], STREAM_PRIMARY, src,
+devices[i], dst, num_vertices);
}
}
}
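A minimal standalone sketch (not part of the commit) of the z-decomposition described in the MIIKKANOTE block above: the local-to-global mapping "global idx = (i, j, k + device_id * subgrid.n.z)" and the ring used for the halo exchange. num_devices, nz and localToGlobal are illustrative stand-ins:

    #include <cassert>

    struct Int3 { int x, y, z; };

    // Local (i, j, k) on device `device_id` -> global index, per the note above.
    static Int3 localToGlobal(const Int3& local, const int device_id, const int subgrid_nz)
    {
        return {local.x, local.y, local.z + device_id * subgrid_nz};
    }

    int main()
    {
        const int num_devices = 4;                // illustrative device count
        const int nz          = 128;              // illustrative global interior depth
        const int subgrid_nz  = nz / num_devices; // each device owns an equal z-slab

        // Every global z index is owned by exactly one device, and mapping its local
        // coordinate back to the global grid recovers the original index.
        for (int gz = 0; gz < nz; ++gz) {
            const int owner = gz / subgrid_nz;
            const Int3 g = localToGlobal({0, 0, gz - owner * subgrid_nz}, owner, subgrid_nz);
            assert(g.z == gz);
        }

        // The exchange pairs each device with (i + 1) % num_devices, so following the
        // "next" links visits every device once and closes into a ring:
        // (n) <-> 0 <-> 1 <-> ... <-> n <-> (0)
        int d = 0;
        for (int i = 0; i < num_devices; ++i)
            d = (d + 1) % num_devices;
        assert(d == 0);
    }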
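Likewise, a minimal CUDA sketch (not from this repository) of the point made in the note on asynchronous execution: work issued on explicit streams does not block the host and may overlap across streams, which is what passing a StreamType to the single-GPU functions enables. Buffer names and sizes are illustrative:

    #include <cuda_runtime.h>
    #include <cstdio>

    int main()
    {
        const size_t bytes = 1 << 20;
        float *h_a, *h_b, *d_a, *d_b;
        cudaMallocHost((void**)&h_a, bytes); // pinned host memory so the copies can be truly async
        cudaMallocHost((void**)&h_b, bytes);
        cudaMalloc((void**)&d_a, bytes);
        cudaMalloc((void**)&d_b, bytes);

        cudaStream_t s0, s1;
        cudaStreamCreate(&s0);               // analogous role to the STREAM_PRIMARY handle above
        cudaStreamCreate(&s1);

        // Both copies are enqueued without blocking the host and may run concurrently.
        cudaMemcpyAsync(d_a, h_a, bytes, cudaMemcpyHostToDevice, s0);
        cudaMemcpyAsync(d_b, h_b, bytes, cudaMemcpyHostToDevice, s1);

        // Synchronize only when the results are actually needed.
        cudaStreamSynchronize(s0);
        cudaStreamSynchronize(s1);

        cudaStreamDestroy(s0);
        cudaStreamDestroy(s1);
        cudaFreeHost(h_a); cudaFreeHost(h_b);
        cudaFree(d_a); cudaFree(d_b);
        printf("done\n");
        return 0;
    }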
@@ -427,26 +426,28 @@ simple_final_reduce_scal(const ReductionType& rtype, const AcReal* results, cons
for (int i = 1; i < n; ++i) {
if (rtype == RTYPE_MAX) {
res = max(res, results[i]);
-} else if (rtype == RTYPE_MIN) {
+}
+else if (rtype == RTYPE_MIN) {
res = min(res, results[i]);
-} else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
+}
+else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
res = sum(res, results[i]);
-} else {
+}
+else {
ERROR("Invalid rtype");
}
}
if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
const AcReal inv_n = AcReal(1.) / (grid.n.x * grid.n.y * grid.n.z);
-res = sqrt(inv_n * res);
+res = sqrt(inv_n * res);
}
return res;
}
AcReal
-acReduceScal(const ReductionType& rtype,
-const VertexBufferHandle& vtxbuffer_handle)
+acReduceScal(const ReductionType& rtype, const VertexBufferHandle& vtxbuffer_handle)
{
AcReal results[num_devices];
for (int i = 0; i < num_devices; ++i) {
@@ -457,8 +458,8 @@ acReduceScal(const ReductionType& rtype,
}
AcReal
-acReduceVec(const ReductionType& rtype, const VertexBufferHandle& a,
-const VertexBufferHandle& b, const VertexBufferHandle& c)
+acReduceVec(const ReductionType& rtype, const VertexBufferHandle& a, const VertexBufferHandle& b,
+const VertexBufferHandle& c)
{
AcReal results[num_devices];
for (int i = 0; i < num_devices; ++i) {
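A minimal sketch (not part of the commit) of the host-side final reduction pattern in simple_final_reduce_scal and acReduceScal/acReduceVec above: each device contributes one partial result, and RMS-type reductions are combined with a sum and normalized at the end. The device count, partial values and mesh size below are illustrative only:

    #include <cassert>
    #include <cmath>

    int main()
    {
        constexpr int num_devices = 4;
        // Assume each device has already reduced its own subgrid to one partial value;
        // for an RMS-type reduction these partials are sums of squares.
        const double partial[num_devices] = {1.0, 2.0, 3.0, 4.0};

        double res = partial[0];
        for (int i = 1; i < num_devices; ++i)
            res = res + partial[i];                             // RTYPE_RMS / RTYPE_RMS_EXP combine with sum()

        const double inv_n = 1.0 / (128.0 * 128.0 * 128.0);     // 1 / (grid.n.x * grid.n.y * grid.n.z), example size
        res = std::sqrt(inv_n * res);                           // same normalization as simple_final_reduce_scal

        assert(res > 0.0 && res < 1.0);
    }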