From 322cdce52ceb474541bc90e0c74dc144e72dfed9 Mon Sep 17 00:00:00 2001 From: jpekkila Date: Wed, 7 Aug 2019 20:05:54 +0300 Subject: [PATCH] Added some new comments + some helpful old comments from a time before the interface revision --- include/astaroth.h | 17 +++++++ src/core/node.cu | 107 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 124 insertions(+) diff --git a/include/astaroth.h b/include/astaroth.h index 94d7e17..43438f7 100644 --- a/include/astaroth.h +++ b/include/astaroth.h @@ -41,27 +41,44 @@ extern "C" { #define acLoadDeviceConstant(x, y) acGridLoadConstant(STREAM_DEFAULT, x, y) */ +/** Allocates all memory and initializes the devices visible to the caller. Should be + * called before any other function in this interface. */ AcResult acInit(const AcMeshInfo mesh_info); +/** Frees all GPU allocations and resets all devices in the node. Should be + * called at exit. */ AcResult acQuit(void); +/** Synchronizes a specific stream. All streams are synchronized if STREAM_ALL is passed as a + * parameter*/ AcResult acSynchronizeStream(const Stream stream); +/** Loads a constant to the memories of the devices visible to the caller */ AcResult acLoadDeviceConstant(const AcRealParam param, const AcReal value); +/** Loads an AcMesh to the devices visible to the caller */ AcResult acLoad(const AcMesh host_mesh); +/** Stores the AcMesh distributed among the devices visible to the caller back to the host*/ AcResult acStore(AcMesh* host_mesh); +/** Performs Runge-Kutta 3 integration. Note: Boundary conditions are not applied after the final + * substep and the user is responsible for calling acBoundcondStep before reading the data. 
*/ AcResult acIntegrate(const AcReal dt); +/** Applies periodic boundary conditions for the Mesh distributed among the devices visible to the + * caller*/ AcResult acBoundcondStep(void); +/** Does a scalar reduction with the data stored in some vertex buffer */ AcReal acReduceScal(const ReductionType rtype, const VertexBufferHandle vtxbuf_handle); +/** Does a vector reduction with vertex buffers where the vector components are (a, b, c) */ AcReal acReduceVec(const ReductionType rtype, const VertexBufferHandle a, const VertexBufferHandle b, const VertexBufferHandle c); +/** Stores a subset of the mesh stored across the devices visible to the caller back to host memory. + */ AcResult acStoreWithOffset(const int3 dst, const size_t num_vertices, AcMesh* host_mesh); #ifdef __cplusplus diff --git a/src/core/node.cu b/src/core/node.cu index fa8a35d..f9a4aa9 100644 --- a/src/core/node.cu +++ b/src/core/node.cu @@ -16,6 +16,113 @@ You should have received a copy of the GNU General Public License along with Astaroth. If not, see . */ + +/** + * @file + * \brief Multi-GPU implementation. + * + %JP: The old way for computing boundary conditions conflicts with the + way we have to do things with multiple GPUs. + + The older approach relied on unified memory, which represented the whole + memory area as one huge mesh instead of several smaller ones. However, unified memory + in its current state is more meant for quick prototyping when performance is not an issue. + Getting the CUDA driver to migrate data intelligently across GPUs is much more difficult + than when managing the memory explicitly. + + In this new approach, I have simplified the multi- and single-GPU layers significantly. + Quick rundown: + New struct: Grid. There are two global variables, "grid" and "subgrid", which + contain the extents of the whole simulation domain and the decomposed grids, + respectively. 
To simplify things, we require that each GPU is assigned the same amount of + work, therefore each GPU in the node is assigned a "subgrid.m"-sized block of data to + work with. + + The whole simulation domain is decomposed with respect to the z dimension. + For example, if the grid contains (nx, ny, nz) vertices, then the subgrids + contain (nx, ny, nz / num_devices) vertices. + + A local index (i, j, k) in some subgrid can be mapped to the global grid with + global idx = (i, j, k + device_id * subgrid.n.z) + + Terminology: + - Single-GPU function: a function defined on the single-GPU layer (device.cu) + + Changes required to this commented code block: + - The thread block dimensions (tpb) are no longer passed to the kernel here but in + device.cu instead. Same holds for any complex index calculations. Instead, the local + coordinates should be passed as an int3 type without having to consider how the data is + actually laid out in device memory + - The unified memory buffer no longer exists (d_buffer). Instead, we have an opaque + handle of type "Device" which should be passed to single-GPU functions. In this file, all + devices are stored in a global array "devices[num_devices]". + - Every single-GPU function is executed asynchronously by default such that we + can optimize Astaroth by executing memory transactions concurrently with + computation. Therefore a StreamType should be passed as a parameter to single-GPU functions. + Refresher: CUDA function calls are non-blocking when a stream is explicitly passed + as a parameter and commands executing in different streams can be processed + in parallel/concurrently.
+ + + Note on periodic boundaries (might be helpful when implementing other boundary conditions): + + With multiple GPUs, periodic boundary conditions applied on indices ranging from + + (0, 0, STENCIL_ORDER/2) to (subgrid.m.x, subgrid.m.y, subgrid.m.z - + STENCIL_ORDER/2) + + on a single device are "local", in the sense that they can be computed without + having to exchange data with neighboring GPUs. Special care is needed only for transferring + the data to the front and back plates outside this range. In the solution we use + here, we solve the local boundaries first, and then just exchange the front and back plates + in a "ring", like so + device_id + (n) <-> 0 <-> 1 <-> ... <-> n <-> (0) + +### Throughout this file we use the following notation and names for various index offsets + + Global coordinates: coordinates with respect to the global grid (static Grid grid) + Local coordinates: coordinates with respect to the local subgrid (static Subgrid subgrid) + + s0, s1: source indices in global coordinates + d0, d1: destination indices in global coordinates + da = max(s0, d0); + db = min(s1, d1); + + These are used in at least + acLoad() + acStore() + acSynchronizeHalos() + + Here we decompose the host mesh and distribute it among the GPUs in + the node. + + The host mesh is a huge contiguous block of data. Its dimensions are given by + the global variable named "grid". A "grid" is decomposed into "subgrids", + one for each GPU. Here we check which parts of the range s0...s1 map + to the memory space stored by some GPU, ranging d0...d1, and transfer + the data if needed. + + The index mapping is inherently quite involved, but here's a picture which + hopefully helps make sense out of all this. + + + Grid + |----num_vertices---| + xxx|....................................................|xxx + ^ ^ ^ ^ + d0 d1 s0 (src) s1 + + Subgrid + + xxx|.............|xxx + ^ ^ + d0 d1 + + ^ ^ + db da + * + */ #include "astaroth_node.h" #include "astaroth_device.h"