816 lines
30 KiB
Plaintext
816 lines
30 KiB
Plaintext
/*
|
|
Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae.
|
|
|
|
This file is part of Astaroth.
|
|
|
|
Astaroth is free software: you can redistribute it and/or modify
|
|
it under the terms of the GNU General Public License as published by
|
|
the Free Software Foundation, either version 3 of the License, or
|
|
(at your option) any later version.
|
|
|
|
Astaroth is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU General Public License for more details.
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
|
|
*/
|
|
|
|
/**
|
|
* @file
|
|
* \brief Multi-GPU implementation.
|
|
*
|
|
%JP: The old way for computing boundary conditions conflicts with the
|
|
way we have to do things with multiple GPUs.
|
|
|
|
The older approach relied on unified memory, which represented the whole
|
|
memory area as one huge mesh instead of several smaller ones. However, unified memory
|
|
in its current state is more meant for quick prototyping when performance is not an issue.
|
|
Getting the CUDA driver to migrate data intelligently across GPUs is much more difficult
|
|
than when managing the memory explicitly.
|
|
|
|
In this new approach, I have simplified the multi- and single-GPU layers significantly.
|
|
Quick rundown:
|
|
New struct: Grid. There are two global variables, "grid" and "subgrid", which
|
|
contain the extents of the whole simulation domain and the decomposed grids,
|
|
respectively. To simplify thing, we require that each GPU is assigned the same amount of
|
|
work, therefore each GPU in the node is assigned and "subgrid.m" -sized block of data to
|
|
work with.
|
|
|
|
The whole simulation domain is decomposed with respect to the z dimension.
|
|
For example, if the grid contains (nx, ny, nz) vertices, then the subgrids
|
|
contain (nx, ny, nz / num_devices) vertices.
|
|
|
|
An local index (i, j, k) in some subgrid can be mapped to the global grid with
|
|
global idx = (i, j, k + device_id * subgrid.n.z)
|
|
|
|
Terminology:
|
|
- Single-GPU function: a function defined on the single-GPU layer (device.cu)
|
|
|
|
Changes required to this commented code block:
|
|
- The thread block dimensions (tpb) are no longer passed to the kernel here but in
|
|
device.cu instead. Same holds for any complex index calculations. Instead, the local
|
|
coordinates should be passed as an int3 type without having to consider how the data is
|
|
actually laid out in device memory
|
|
- The unified memory buffer no longer exists (d_buffer). Instead, we have an opaque
|
|
handle of type "Device" which should be passed to single-GPU functions. In this file, all
|
|
devices are stored in a global array "devices[num_devices]".
|
|
- Every single-GPU function is executed asynchronously by default such that we
|
|
can optimize Astaroth by executing memory transactions concurrently with
|
|
computation. Therefore a StreamType should be passed as a parameter to single-GPU functions.
|
|
Refresher: CUDA function calls are non-blocking when a stream is explicitly passed
|
|
as a parameter and commands executing in different streams can be processed
|
|
in parallel/concurrently.
|
|
|
|
|
|
Note on periodic boundaries (might be helpful when implementing other boundary conditions):
|
|
|
|
With multiple GPUs, periodic boundary conditions applied on indices ranging from
|
|
|
|
(0, 0, STENCIL_ORDER/2) to (subgrid.m.x, subgrid.m.y, subgrid.m.z -
|
|
STENCIL_ORDER/2)
|
|
|
|
on a single device are "local", in the sense that they can be computed without
|
|
having to exchange data with neighboring GPUs. Special care is needed only for transferring
|
|
the data to the fron and back plates outside this range. In the solution we use
|
|
here, we solve the local boundaries first, and then just exchange the front and back plates
|
|
in a "ring", like so
|
|
device_id
|
|
(n) <-> 0 <-> 1 <-> ... <-> n <-> (0)
|
|
|
|
### Throughout this file we use the following notation and names for various index offsets
|
|
|
|
Global coordinates: coordinates with respect to the global grid (static Grid grid)
|
|
Local coordinates: coordinates with respect to the local subgrid (static Subgrid subgrid)
|
|
|
|
s0, s1: source indices in global coordinates
|
|
d0, d1: destination indices in global coordinates
|
|
da = max(s0, d0);
|
|
db = min(s1, d1);
|
|
|
|
These are used in at least
|
|
acLoad()
|
|
acStore()
|
|
acSynchronizeHalos()
|
|
|
|
Here we decompose the host mesh and distribute it among the GPUs in
|
|
the node.
|
|
|
|
The host mesh is a huge contiguous block of data. Its dimensions are given by
|
|
the global variable named "grid". A "grid" is decomposed into "subgrids",
|
|
one for each GPU. Here we check which parts of the range s0...s1 maps
|
|
to the memory space stored by some GPU, ranging d0...d1, and transfer
|
|
the data if needed.
|
|
|
|
The index mapping is inherently quite involved, but here's a picture which
|
|
hopefully helps make sense out of all this.
|
|
|
|
|
|
Grid
|
|
|----num_vertices---|
|
|
xxx|....................................................|xxx
|
|
^ ^ ^ ^
|
|
d0 d1 s0 (src) s1
|
|
|
|
Subgrid
|
|
|
|
xxx|.............|xxx
|
|
^ ^
|
|
d0 d1
|
|
|
|
^ ^
|
|
db da
|
|
*
|
|
*/
|
|
#include "astaroth_node.h"
|
|
|
|
#include "astaroth_device.h"
|
|
#include "errchk.h"
|
|
#include "math_utils.h" // sum for reductions
|
|
|
|
static const int MAX_NUM_DEVICES = 32;
|
|
|
|
struct node_s {
|
|
int id;
|
|
|
|
int num_devices;
|
|
Device devices[MAX_NUM_DEVICES];
|
|
|
|
Grid grid;
|
|
Grid subgrid;
|
|
|
|
AcMeshInfo config;
|
|
};
|
|
|
|
static int
|
|
gridIdx(const Grid grid, const int3 idx)
|
|
{
|
|
return idx.x + idx.y * grid.m.x + idx.z * grid.m.x * grid.m.y;
|
|
}
|
|
|
|
static int3
|
|
gridIdx3d(const Grid grid, const int idx)
|
|
{
|
|
return (int3){idx % grid.m.x, (idx % (grid.m.x * grid.m.y)) / grid.m.x,
|
|
idx / (grid.m.x * grid.m.y)};
|
|
}
|
|
|
|
static void
|
|
printInt3(const int3 vec)
|
|
{
|
|
printf("(%d, %d, %d)", vec.x, vec.y, vec.z);
|
|
}
|
|
|
|
static inline void
|
|
print(const AcMeshInfo config)
|
|
{
|
|
for (int i = 0; i < NUM_INT_PARAMS; ++i)
|
|
printf("[%s]: %d\n", intparam_names[i], config.int_params[i]);
|
|
for (int i = 0; i < NUM_REAL_PARAMS; ++i)
|
|
printf("[%s]: %g\n", realparam_names[i], double(config.real_params[i]));
|
|
}
|
|
|
|
static void
|
|
update_builtin_params(AcMeshInfo* config)
|
|
{
|
|
config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER;
|
|
///////////// PAD TEST
|
|
// config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER + PAD_SIZE;
|
|
///////////// PAD TEST
|
|
config->int_params[AC_my] = config->int_params[AC_ny] + STENCIL_ORDER;
|
|
config->int_params[AC_mz] = config->int_params[AC_nz] + STENCIL_ORDER;
|
|
|
|
// Bounds for the computational domain, i.e. nx_min <= i < nx_max
|
|
config->int_params[AC_nx_min] = NGHOST;
|
|
config->int_params[AC_nx_max] = config->int_params[AC_nx_min] + config->int_params[AC_nx];
|
|
config->int_params[AC_ny_min] = NGHOST;
|
|
config->int_params[AC_ny_max] = config->int_params[AC_ny] + NGHOST;
|
|
config->int_params[AC_nz_min] = NGHOST;
|
|
config->int_params[AC_nz_max] = config->int_params[AC_nz] + NGHOST;
|
|
|
|
/* Additional helper params */
|
|
// Int helpers
|
|
config->int_params[AC_mxy] = config->int_params[AC_mx] * config->int_params[AC_my];
|
|
config->int_params[AC_nxy] = config->int_params[AC_nx] * config->int_params[AC_ny];
|
|
config->int_params[AC_nxyz] = config->int_params[AC_nxy] * config->int_params[AC_nz];
|
|
}
|
|
|
|
static Grid
|
|
createGrid(const AcMeshInfo config)
|
|
{
|
|
Grid grid;
|
|
|
|
grid.m = (int3){config.int_params[AC_mx], config.int_params[AC_my], config.int_params[AC_mz]};
|
|
grid.n = (int3){config.int_params[AC_nx], config.int_params[AC_ny], config.int_params[AC_nz]};
|
|
|
|
return grid;
|
|
}
|
|
|
|
AcResult
|
|
acNodeCreate(const int id, const AcMeshInfo node_config, Node* node_handle)
|
|
{
|
|
struct node_s* node = (struct node_s*)malloc(sizeof(*node));
|
|
node->id = id;
|
|
node->config = node_config;
|
|
|
|
// Get node->num_devices
|
|
ERRCHK_CUDA_ALWAYS(cudaGetDeviceCount(&node->num_devices));
|
|
if (node->num_devices < 1) {
|
|
ERROR("No CUDA devices found!");
|
|
return AC_FAILURE;
|
|
}
|
|
if (node->num_devices > MAX_NUM_DEVICES) {
|
|
WARNING("More devices found than MAX_NUM_DEVICES. Using only MAX_NUM_DEVICES");
|
|
node->num_devices = MAX_NUM_DEVICES;
|
|
}
|
|
if (!AC_MULTIGPU_ENABLED) {
|
|
WARNING("MULTIGPU_ENABLED was false. Using only one device");
|
|
node->num_devices = 1; // Use only one device if multi-GPU is not enabled
|
|
}
|
|
// Check that node->num_devices is divisible with AC_nz. This makes decomposing the
|
|
// problem domain to multiple GPUs much easier since we do not have to worry
|
|
// about remainders
|
|
ERRCHK_ALWAYS(node->config.int_params[AC_nz] % node->num_devices == 0);
|
|
|
|
// Decompose the problem domain
|
|
// The main grid
|
|
node->grid = createGrid(node->config);
|
|
|
|
// Subgrids
|
|
AcMeshInfo subgrid_config = node->config;
|
|
subgrid_config.int_params[AC_nz] /= node->num_devices;
|
|
update_builtin_params(&subgrid_config);
|
|
#if VERBOSE_PRINTING // Defined in astaroth.h
|
|
printf("###############################################################\n");
|
|
printf("Config dimensions recalculated:\n");
|
|
print(subgrid_config);
|
|
printf("###############################################################\n");
|
|
#endif
|
|
node->subgrid = createGrid(subgrid_config);
|
|
|
|
// Periodic boundary conditions become weird if the system can "fold unto itself".
|
|
ERRCHK_ALWAYS(node->subgrid.n.x >= STENCIL_ORDER);
|
|
ERRCHK_ALWAYS(node->subgrid.n.y >= STENCIL_ORDER);
|
|
ERRCHK_ALWAYS(node->subgrid.n.z >= STENCIL_ORDER);
|
|
|
|
#if VERBOSE_PRINTING
|
|
// clang-format off
|
|
printf("Grid m "); printInt3(node->grid.m); printf("\n");
|
|
printf("Grid n "); printInt3(node->grid.n); printf("\n");
|
|
printf("Subrid m "); printInt3(node->subgrid.m); printf("\n");
|
|
printf("Subrid n "); printInt3(node->subgrid.n); printf("\n");
|
|
// clang-format on
|
|
#endif
|
|
|
|
// Initialize the devices
|
|
// #pragma omp parallel for
|
|
for (int i = 0; i < node->num_devices; ++i) {
|
|
const int3 multinode_offset = (int3){0, 0, 0}; // Placeholder
|
|
const int3 multigpu_offset = (int3){0, 0, i * node->subgrid.n.z};
|
|
subgrid_config.int3_params[AC_global_grid_n] = node->grid.n;
|
|
subgrid_config.int3_params[AC_multigpu_offset] = multinode_offset + multigpu_offset;
|
|
|
|
acDeviceCreate(i, subgrid_config, &node->devices[i]);
|
|
acDevicePrintInfo(node->devices[i]);
|
|
}
|
|
|
|
// Enable peer access
|
|
// #pragma omp parallel for
|
|
for (int i = 0; i < node->num_devices; ++i) {
|
|
const int front = (i + 1) % node->num_devices;
|
|
const int back = (i - 1 + node->num_devices) % node->num_devices;
|
|
|
|
int can_access_front, can_access_back;
|
|
cudaDeviceCanAccessPeer(&can_access_front, i, front);
|
|
cudaDeviceCanAccessPeer(&can_access_back, i, back);
|
|
#if VERBOSE_PRINTING
|
|
printf(
|
|
"Trying to enable peer access from %d to %d (can access: %d) and %d (can access: %d)\n",
|
|
i, front, can_access_front, back, can_access_back);
|
|
#endif
|
|
|
|
cudaSetDevice(i);
|
|
if (can_access_front) {
|
|
ERRCHK_CUDA_ALWAYS(cudaDeviceEnablePeerAccess(front, 0));
|
|
}
|
|
if (can_access_back) {
|
|
ERRCHK_CUDA_ALWAYS(cudaDeviceEnablePeerAccess(back, 0));
|
|
}
|
|
}
|
|
acNodeSynchronizeStream(node, STREAM_ALL);
|
|
|
|
*node_handle = node;
|
|
return AC_SUCCESS;
|
|
}
|
|
|
|
AcResult
|
|
acNodeDestroy(Node node)
|
|
{
|
|
acNodeSynchronizeStream(node, STREAM_ALL);
|
|
|
|
// #pragma omp parallel for
|
|
for (int i = 0; i < node->num_devices; ++i) {
|
|
acDeviceDestroy(node->devices[i]);
|
|
}
|
|
free(node);
|
|
|
|
return AC_SUCCESS;
|
|
}
|
|
|
|
AcResult
|
|
acNodePrintInfo(const Node node)
|
|
{
|
|
(void)node;
|
|
WARNING("Not implemented");
|
|
return AC_FAILURE;
|
|
}
|
|
|
|
AcResult
|
|
acNodeQueryDeviceConfiguration(const Node node, DeviceConfiguration* config)
|
|
{
|
|
config->num_devices = node->num_devices;
|
|
config->devices = node->devices;
|
|
config->grid = node->grid;
|
|
config->subgrid = node->subgrid;
|
|
|
|
return AC_SUCCESS;
|
|
}
|
|
|
|
AcResult
|
|
acNodeAutoOptimize(const Node node)
|
|
{
|
|
(void)node;
|
|
WARNING("Not implemented");
|
|
return AC_FAILURE;
|
|
}
|
|
|
|
AcResult
|
|
acNodeSynchronizeStream(const Node node, const Stream stream)
|
|
{
|
|
// #pragma omp parallel for
|
|
for (int i = 0; i < node->num_devices; ++i) {
|
|
acDeviceSynchronizeStream(node->devices[i], stream);
|
|
}
|
|
|
|
return AC_SUCCESS;
|
|
}
|
|
|
|
AcResult
|
|
acNodeSynchronizeVertexBuffer(const Node node, const Stream stream,
|
|
const VertexBufferHandle vtxbuf_handle)
|
|
{
|
|
acNodeSynchronizeStream(node, stream);
|
|
// Exchanges the halos of subgrids
|
|
// After this step, the data within the main grid ranging from
|
|
// (0, 0, NGHOST) -> grid.m.x, grid.m.y, NGHOST + grid.n.z
|
|
// has been synchronized and transferred to appropriate subgrids
|
|
|
|
// We loop only to node->num_devices - 1 since the front and back plate of the grid is not
|
|
// transferred because their contents depend on the boundary conditions.
|
|
|
|
// IMPORTANT NOTE: the boundary conditions must be applied before
|
|
// callingacNodeSynchronizeStream(node, this function! I.e. the halos of subgrids must contain
|
|
// up-to-date data!
|
|
|
|
const size_t num_vertices = node->subgrid.m.x * node->subgrid.m.y * NGHOST;
|
|
|
|
// #pragma omp parallel for
|
|
for (int i = 0; i < node->num_devices - 1; ++i) {
|
|
// ...|ooooxxx|... -> xxx|ooooooo|...
|
|
const int3 src = (int3){0, 0, node->subgrid.n.z};
|
|
const int3 dst = (int3){0, 0, 0};
|
|
|
|
const Device src_device = node->devices[i];
|
|
Device dst_device = node->devices[i + 1];
|
|
|
|
acDeviceTransferVertexBufferWithOffset(src_device, stream, vtxbuf_handle, src, dst,
|
|
num_vertices, dst_device);
|
|
}
|
|
// #pragma omp parallel for
|
|
for (int i = 1; i < node->num_devices; ++i) {
|
|
// ...|ooooooo|xxx <- ...|xxxoooo|...
|
|
const int3 src = (int3){0, 0, NGHOST};
|
|
const int3 dst = (int3){0, 0, NGHOST + node->subgrid.n.z};
|
|
|
|
const Device src_device = node->devices[i];
|
|
Device dst_device = node->devices[i - 1];
|
|
|
|
acDeviceTransferVertexBufferWithOffset(src_device, stream, vtxbuf_handle, src, dst,
|
|
num_vertices, dst_device);
|
|
}
|
|
return AC_SUCCESS;
|
|
}
|
|
|
|
AcResult
|
|
acNodeSynchronizeMesh(const Node node, const Stream stream)
|
|
{
|
|
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
|
|
acNodeSynchronizeVertexBuffer(node, stream, (VertexBufferHandle)i);
|
|
}
|
|
|
|
return AC_SUCCESS;
|
|
}
|
|
|
|
AcResult
|
|
acNodeSwapBuffers(const Node node)
|
|
{
|
|
// #pragma omp parallel for
|
|
for (int i = 0; i < node->num_devices; ++i) {
|
|
acDeviceSwapBuffers(node->devices[i]);
|
|
}
|
|
return AC_SUCCESS;
|
|
}
|
|
|
|
AcResult
|
|
acNodeLoadConstant(const Node node, const Stream stream, const AcRealParam param,
|
|
const AcReal value)
|
|
{
|
|
acNodeSynchronizeStream(node, stream);
|
|
// #pragma omp parallel for
|
|
for (int i = 0; i < node->num_devices; ++i) {
|
|
acDeviceLoadScalarConstant(node->devices[i], stream, param, value);
|
|
}
|
|
return AC_SUCCESS;
|
|
}
|
|
|
|
AcResult
|
|
acNodeLoadVertexBufferWithOffset(const Node node, const Stream stream, const AcMesh host_mesh,
|
|
const VertexBufferHandle vtxbuf_handle, const int3 src,
|
|
const int3 dst, const int num_vertices)
|
|
{
|
|
acNodeSynchronizeStream(node, stream);
|
|
// See the beginning of the file for an explanation of the index mapping
|
|
// // #pragma omp parallel for
|
|
// #pragma omp parallel for
|
|
for (int i = 0; i < node->num_devices; ++i) {
|
|
const int3 d0 = (int3){0, 0, i * node->subgrid.n.z}; // DECOMPOSITION OFFSET HERE
|
|
const int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y, d0.z + node->subgrid.m.z};
|
|
|
|
const int3 s0 = src; // dst; // TODO fix
|
|
(void)dst; // TODO fix
|
|
const int3 s1 = gridIdx3d(node->grid, gridIdx(node->grid, s0) + num_vertices);
|
|
|
|
const int3 da = max(s0, d0);
|
|
const int3 db = min(s1, d1);
|
|
/*
|
|
printf("Device %d\n", i);
|
|
printf("\ts0: "); printInt3(s0); printf("\n");
|
|
printf("\td0: "); printInt3(d0); printf("\n");
|
|
printf("\tda: "); printInt3(da); printf("\n");
|
|
printf("\tdb: "); printInt3(db); printf("\n");
|
|
printf("\td1: "); printInt3(d1); printf("\n");
|
|
printf("\ts1: "); printInt3(s1); printf("\n");
|
|
printf("\t-> %s to device %d\n", db.z >= da.z ? "Copy" : "Do not copy", i);
|
|
*/
|
|
if (db.z >= da.z) {
|
|
const int copy_cells = gridIdx(node->subgrid, db) - gridIdx(node->subgrid, da);
|
|
// DECOMPOSITION OFFSET HERE
|
|
const int3 da_global = da; // src + da - dst; // TODO fix
|
|
const int3 da_local = (int3){da.x, da.y, da.z - i * node->grid.n.z / node->num_devices};
|
|
// printf("\t\tcopy %d cells to local index ", copy_cells); printInt3(da_local);
|
|
// printf("\n");
|
|
acDeviceLoadVertexBufferWithOffset(node->devices[i], stream, host_mesh, vtxbuf_handle,
|
|
da_global, da_local, copy_cells);
|
|
}
|
|
// printf("\n");
|
|
}
|
|
return AC_SUCCESS;
|
|
}
|
|
|
|
AcResult
|
|
acNodeLoadMeshWithOffset(const Node node, const Stream stream, const AcMesh host_mesh,
|
|
const int3 src, const int3 dst, const int num_vertices)
|
|
{
|
|
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
|
|
acNodeLoadVertexBufferWithOffset(node, stream, host_mesh, (VertexBufferHandle)i, src, dst,
|
|
num_vertices);
|
|
}
|
|
return AC_SUCCESS;
|
|
}
|
|
|
|
AcResult
|
|
acNodeLoadVertexBuffer(const Node node, const Stream stream, const AcMesh host_mesh,
|
|
const VertexBufferHandle vtxbuf_handle)
|
|
{
|
|
const int3 src = (int3){0, 0, 0};
|
|
const int3 dst = src;
|
|
const size_t num_vertices = acVertexBufferSize(host_mesh.info);
|
|
|
|
acNodeLoadVertexBufferWithOffset(node, stream, host_mesh, vtxbuf_handle, src, dst,
|
|
num_vertices);
|
|
return AC_SUCCESS;
|
|
}
|
|
|
|
AcResult
|
|
acNodeLoadMesh(const Node node, const Stream stream, const AcMesh host_mesh)
|
|
{
|
|
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
|
|
acNodeLoadVertexBuffer(node, stream, host_mesh, (VertexBufferHandle)i);
|
|
}
|
|
return AC_SUCCESS;
|
|
}
|
|
|
|
AcResult
|
|
acNodeStoreVertexBufferWithOffset(const Node node, const Stream stream,
|
|
const VertexBufferHandle vtxbuf_handle, const int3 src,
|
|
const int3 dst, const int num_vertices, AcMesh* host_mesh)
|
|
{
|
|
acNodeSynchronizeStream(node, stream);
|
|
// #pragma omp parallel for
|
|
for (int i = 0; i < node->num_devices; ++i) {
|
|
const int3 d0 = (int3){0, 0, i * node->subgrid.n.z}; // DECOMPOSITION OFFSET HERE
|
|
const int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y, d0.z + node->subgrid.m.z};
|
|
|
|
const int3 s0 = src; // TODO fix
|
|
(void)dst; // TODO fix
|
|
const int3 s1 = gridIdx3d(node->grid, gridIdx(node->grid, s0) + num_vertices);
|
|
|
|
const int3 da = max(s0, d0);
|
|
const int3 db = min(s1, d1);
|
|
if (db.z >= da.z) {
|
|
const int copy_cells = gridIdx(node->subgrid, db) - gridIdx(node->subgrid, da);
|
|
// DECOMPOSITION OFFSET HERE
|
|
const int3 da_local = (int3){da.x, da.y, da.z - i * node->grid.n.z / node->num_devices};
|
|
const int3 da_global = da; // dst + da - src; // TODO fix
|
|
acDeviceStoreVertexBufferWithOffset(node->devices[i], stream, vtxbuf_handle, da_local,
|
|
da_global, copy_cells, host_mesh);
|
|
}
|
|
}
|
|
return AC_SUCCESS;
|
|
}
|
|
|
|
AcResult
|
|
acNodeStoreMeshWithOffset(const Node node, const Stream stream, const int3 src, const int3 dst,
|
|
const int num_vertices, AcMesh* host_mesh)
|
|
{
|
|
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
|
|
acNodeStoreVertexBufferWithOffset(node, stream, (VertexBufferHandle)i, src, dst,
|
|
num_vertices, host_mesh);
|
|
}
|
|
return AC_SUCCESS;
|
|
}
|
|
|
|
AcResult
|
|
acNodeStoreVertexBuffer(const Node node, const Stream stream,
|
|
const VertexBufferHandle vtxbuf_handle, AcMesh* host_mesh)
|
|
{
|
|
const int3 src = (int3){0, 0, 0};
|
|
const int3 dst = src;
|
|
const size_t num_vertices = acVertexBufferSize(host_mesh->info);
|
|
|
|
acNodeStoreVertexBufferWithOffset(node, stream, vtxbuf_handle, src, dst, num_vertices,
|
|
host_mesh);
|
|
|
|
return AC_SUCCESS;
|
|
}
|
|
|
|
AcResult
|
|
acNodeStoreMesh(const Node node, const Stream stream, AcMesh* host_mesh)
|
|
{
|
|
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
|
|
acNodeStoreVertexBuffer(node, stream, (VertexBufferHandle)i, host_mesh);
|
|
}
|
|
return AC_SUCCESS;
|
|
}
|
|
|
|
AcResult
|
|
acNodeIntegrateSubstep(const Node node, const Stream stream, const int isubstep, const int3 start,
|
|
const int3 end, const AcReal dt)
|
|
{
|
|
acNodeSynchronizeStream(node, stream);
|
|
|
|
// #pragma omp parallel for
|
|
for (int i = 0; i < node->num_devices; ++i) {
|
|
// DECOMPOSITION OFFSET HERE
|
|
const int3 d0 = (int3){NGHOST, NGHOST, NGHOST + i * node->subgrid.n.z};
|
|
const int3 d1 = d0 + (int3){node->subgrid.n.x, node->subgrid.n.y, node->subgrid.n.z};
|
|
|
|
const int3 da = max(start, d0);
|
|
const int3 db = min(end, d1);
|
|
|
|
if (db.z >= da.z) {
|
|
const int3 da_local = da - (int3){0, 0, i * node->subgrid.n.z};
|
|
const int3 db_local = db - (int3){0, 0, i * node->subgrid.n.z};
|
|
acDeviceIntegrateSubstep(node->devices[i], stream, isubstep, da_local, db_local, dt);
|
|
}
|
|
}
|
|
return AC_SUCCESS;
|
|
}
|
|
|
|
static AcResult
|
|
local_boundcondstep(const Node node, const Stream stream, const VertexBufferHandle vtxbuf)
|
|
{
|
|
acNodeSynchronizeStream(node, stream);
|
|
|
|
if (node->num_devices > 1) {
|
|
// Local boundary conditions
|
|
// #pragma omp parallel for
|
|
for (int i = 0; i < node->num_devices; ++i) {
|
|
const int3 d0 = (int3){0, 0, NGHOST}; // DECOMPOSITION OFFSET HERE
|
|
const int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y, d0.z + node->subgrid.n.z};
|
|
acDevicePeriodicBoundcondStep(node->devices[i], stream, vtxbuf, d0, d1);
|
|
}
|
|
}
|
|
else {
|
|
acDevicePeriodicBoundcondStep(node->devices[0], stream, vtxbuf, (int3){0, 0, 0},
|
|
node->subgrid.m);
|
|
}
|
|
return AC_SUCCESS;
|
|
}
|
|
|
|
static AcResult
|
|
global_boundcondstep(const Node node, const Stream stream, const VertexBufferHandle vtxbuf_handle)
|
|
{
|
|
acNodeSynchronizeStream(node, stream);
|
|
|
|
if (node->num_devices > 1) {
|
|
const size_t num_vertices = node->subgrid.m.x * node->subgrid.m.y * NGHOST;
|
|
{
|
|
// ...|ooooxxx|... -> xxx|ooooooo|...
|
|
const int3 src = (int3){0, 0, node->subgrid.n.z};
|
|
const int3 dst = (int3){0, 0, 0};
|
|
|
|
const Device src_device = node->devices[node->num_devices - 1];
|
|
Device dst_device = node->devices[0];
|
|
|
|
acDeviceTransferVertexBufferWithOffset(src_device, stream, vtxbuf_handle, src, dst,
|
|
num_vertices, dst_device);
|
|
}
|
|
{
|
|
// ...|ooooooo|xxx <- ...|xxxoooo|...
|
|
const int3 src = (int3){0, 0, NGHOST};
|
|
const int3 dst = (int3){0, 0, NGHOST + node->subgrid.n.z};
|
|
|
|
const Device src_device = node->devices[0];
|
|
Device dst_device = node->devices[node->num_devices - 1];
|
|
|
|
acDeviceTransferVertexBufferWithOffset(src_device, stream, vtxbuf_handle, src, dst,
|
|
num_vertices, dst_device);
|
|
}
|
|
}
|
|
return AC_SUCCESS;
|
|
}
|
|
|
|
AcResult
|
|
acNodeIntegrate(const Node node, const AcReal dt)
|
|
{
|
|
acNodeSynchronizeStream(node, STREAM_ALL);
|
|
// xxx|OOO OOOOOOOOO OOO|xxx
|
|
// ^ ^ ^ ^
|
|
// n0 n1 n2 n3
|
|
// const int3 n0 = (int3){NGHOST, NGHOST, NGHOST};
|
|
// const int3 n1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
|
|
// const int3 n2 = node->grid.n;
|
|
// const int3 n3 = n0 + node->grid.n;
|
|
|
|
for (int isubstep = 0; isubstep < 3; ++isubstep) {
|
|
acNodeSynchronizeStream(node, STREAM_ALL);
|
|
for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {
|
|
local_boundcondstep(node, (Stream)vtxbuf, (VertexBufferHandle)vtxbuf);
|
|
}
|
|
acNodeSynchronizeStream(node, STREAM_ALL);
|
|
|
|
// Inner inner
|
|
// #pragma omp parallel for
|
|
for (int i = 0; i < node->num_devices; ++i) {
|
|
const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
|
|
const int3 m2 = node->subgrid.n;
|
|
acDeviceIntegrateSubstep(node->devices[i], STREAM_16, isubstep, m1, m2, dt);
|
|
}
|
|
|
|
for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {
|
|
acNodeSynchronizeVertexBuffer(node, (Stream)vtxbuf, (VertexBufferHandle)vtxbuf);
|
|
global_boundcondstep(node, (Stream)vtxbuf, (VertexBufferHandle)vtxbuf);
|
|
}
|
|
for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {
|
|
acNodeSynchronizeStream(node, (Stream)vtxbuf);
|
|
}
|
|
|
|
// #pragma omp parallel for
|
|
for (int i = 0; i < node->num_devices; ++i) { // Front
|
|
const int3 m1 = (int3){NGHOST, NGHOST, NGHOST};
|
|
const int3 m2 = m1 + (int3){node->subgrid.n.x, node->subgrid.n.y, NGHOST};
|
|
acDeviceIntegrateSubstep(node->devices[i], STREAM_0, isubstep, m1, m2, dt);
|
|
}
|
|
// #pragma omp parallel for
|
|
for (int i = 0; i < node->num_devices; ++i) { // Back
|
|
const int3 m1 = (int3){NGHOST, NGHOST, node->subgrid.n.z};
|
|
const int3 m2 = m1 + (int3){node->subgrid.n.x, node->subgrid.n.y, NGHOST};
|
|
acDeviceIntegrateSubstep(node->devices[i], STREAM_1, isubstep, m1, m2, dt);
|
|
}
|
|
// #pragma omp parallel for
|
|
for (int i = 0; i < node->num_devices; ++i) { // Bottom
|
|
const int3 m1 = (int3){NGHOST, NGHOST, 2 * NGHOST};
|
|
const int3 m2 = m1 + (int3){node->subgrid.n.x, NGHOST, node->subgrid.n.z - 2 * NGHOST};
|
|
acDeviceIntegrateSubstep(node->devices[i], STREAM_2, isubstep, m1, m2, dt);
|
|
}
|
|
// #pragma omp parallel for
|
|
for (int i = 0; i < node->num_devices; ++i) { // Top
|
|
const int3 m1 = (int3){NGHOST, node->subgrid.n.y, 2 * NGHOST};
|
|
const int3 m2 = m1 + (int3){node->subgrid.n.x, NGHOST, node->subgrid.n.z - 2 * NGHOST};
|
|
acDeviceIntegrateSubstep(node->devices[i], STREAM_3, isubstep, m1, m2, dt);
|
|
}
|
|
// #pragma omp parallel for
|
|
for (int i = 0; i < node->num_devices; ++i) { // Left
|
|
const int3 m1 = (int3){NGHOST, 2 * NGHOST, 2 * NGHOST};
|
|
const int3 m2 = m1 + (int3){NGHOST, node->subgrid.n.y - 2 * NGHOST,
|
|
node->subgrid.n.z - 2 * NGHOST};
|
|
acDeviceIntegrateSubstep(node->devices[i], STREAM_4, isubstep, m1, m2, dt);
|
|
}
|
|
// #pragma omp parallel for
|
|
for (int i = 0; i < node->num_devices; ++i) { // Right
|
|
const int3 m1 = (int3){node->subgrid.n.x, 2 * NGHOST, 2 * NGHOST};
|
|
const int3 m2 = m1 + (int3){NGHOST, node->subgrid.n.y - 2 * NGHOST,
|
|
node->subgrid.n.z - 2 * NGHOST};
|
|
acDeviceIntegrateSubstep(node->devices[i], STREAM_5, isubstep, m1, m2, dt);
|
|
}
|
|
acNodeSwapBuffers(node);
|
|
}
|
|
acNodeSynchronizeStream(node, STREAM_ALL);
|
|
return AC_SUCCESS;
|
|
}
|
|
|
|
AcResult
|
|
acNodePeriodicBoundcondStep(const Node node, const Stream stream,
|
|
const VertexBufferHandle vtxbuf_handle)
|
|
{
|
|
local_boundcondstep(node, stream, vtxbuf_handle);
|
|
acNodeSynchronizeVertexBuffer(node, stream, vtxbuf_handle);
|
|
|
|
// TODO NOTE GLOBAL BOUNDCONDS NOT DONE HERE IF MORE THAN 1 NODE
|
|
global_boundcondstep(node, stream, vtxbuf_handle);
|
|
// WARNING("Global boundconds should not be done here with multinode");
|
|
|
|
return AC_SUCCESS;
|
|
}
|
|
|
|
AcResult
|
|
acNodePeriodicBoundconds(const Node node, const Stream stream)
|
|
{
|
|
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
|
|
acNodePeriodicBoundcondStep(node, stream, (VertexBufferHandle)i);
|
|
}
|
|
return AC_SUCCESS;
|
|
}
|
|
|
|
static AcReal
|
|
simple_final_reduce_scal(const Node node, const ReductionType& rtype, const AcReal* results,
|
|
const int& n)
|
|
{
|
|
AcReal res = results[0];
|
|
for (int i = 1; i < n; ++i) {
|
|
if (rtype == RTYPE_MAX) {
|
|
res = max(res, results[i]);
|
|
}
|
|
else if (rtype == RTYPE_MIN) {
|
|
res = min(res, results[i]);
|
|
}
|
|
else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP || rtype == RTYPE_SUM) {
|
|
res = sum(res, results[i]);
|
|
}
|
|
else {
|
|
ERROR("Invalid rtype");
|
|
}
|
|
}
|
|
|
|
if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
|
|
const AcReal inv_n = AcReal(1.) / (node->grid.n.x * node->grid.n.y * node->grid.n.z);
|
|
res = sqrt(inv_n * res);
|
|
}
|
|
return res;
|
|
}
|
|
|
|
AcResult
|
|
acNodeReduceScal(const Node node, const Stream stream, const ReductionType rtype,
|
|
const VertexBufferHandle vtxbuf_handle, AcReal* result)
|
|
{
|
|
acNodeSynchronizeStream(node, STREAM_ALL);
|
|
|
|
AcReal results[node->num_devices];
|
|
// #pragma omp parallel for
|
|
for (int i = 0; i < node->num_devices; ++i) {
|
|
acDeviceReduceScal(node->devices[i], stream, rtype, vtxbuf_handle, &results[i]);
|
|
}
|
|
|
|
*result = simple_final_reduce_scal(node, rtype, results, node->num_devices);
|
|
return AC_SUCCESS;
|
|
}
|
|
|
|
AcResult
|
|
acNodeReduceVec(const Node node, const Stream stream, const ReductionType rtype,
|
|
const VertexBufferHandle a, const VertexBufferHandle b, const VertexBufferHandle c,
|
|
AcReal* result)
|
|
{
|
|
acNodeSynchronizeStream(node, STREAM_ALL);
|
|
|
|
AcReal results[node->num_devices];
|
|
// #pragma omp parallel for
|
|
for (int i = 0; i < node->num_devices; ++i) {
|
|
acDeviceReduceVec(node->devices[i], stream, rtype, a, b, c, &results[i]);
|
|
}
|
|
|
|
*result = simple_final_reduce_scal(node, rtype, results, node->num_devices);
|
|
return AC_SUCCESS;
|
|
}
|