From 322cdce52ceb474541bc90e0c74dc144e72dfed9 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@aalto.fi>
Date: Wed, 7 Aug 2019 20:05:54 +0300
Subject: [PATCH] Added some new comments + some helpful old comments from a
 time before the interface revision

---
 include/astaroth.h |  17 +++++++
 src/core/node.cu   | 107 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 124 insertions(+)

diff --git a/include/astaroth.h b/include/astaroth.h
index 94d7e17..43438f7 100644
--- a/include/astaroth.h
+++ b/include/astaroth.h
@@ -41,27 +41,44 @@ extern "C" {
 #define acLoadDeviceConstant(x, y) acGridLoadConstant(STREAM_DEFAULT, x, y)
 */
 
+/** Allocates all memory and initializes the devices visible to the caller. Should be
+ * called before any other function in this interface. */
 AcResult acInit(const AcMeshInfo mesh_info);
 
+/** Frees all GPU allocations and resets all devices in the node. Should be
+ * called at exit. */
 AcResult acQuit(void);
 
+/** Synchronizes a specific stream. All streams are synchronized if STREAM_ALL is passed as a
+ * parameter*/
 AcResult acSynchronizeStream(const Stream stream);
 
+/** Loads a constant to the memories of the devices visible to the caller */
 AcResult acLoadDeviceConstant(const AcRealParam param, const AcReal value);
 
+/** Loads an AcMesh to the devices visible to the caller */
 AcResult acLoad(const AcMesh host_mesh);
 
+/** Stores the AcMesh distributed among the devices visible to the caller back to the host*/
 AcResult acStore(AcMesh* host_mesh);
 
+/** Performs Runge-Kutta 3 integration. Note: Boundary conditions are not applied after the final
+ * substep and the user is responsible for calling acBoundcondStep before reading the data. */
 AcResult acIntegrate(const AcReal dt);
 
+/** Applies periodic boundary conditions for the Mesh distributed among the devices visible to the
+ * caller*/
 AcResult acBoundcondStep(void);
 
+/** Does a scalar reduction with the data stored in some vertex buffer */
 AcReal acReduceScal(const ReductionType rtype, const VertexBufferHandle vtxbuf_handle);
 
+/** Does a vector reduction with vertex buffers where the vector components are (a, b, c) */
 AcReal acReduceVec(const ReductionType rtype, const VertexBufferHandle a,
                    const VertexBufferHandle b, const VertexBufferHandle c);
 
+/** Stores a subset of the mesh stored across the devices visible to the caller back to host memory.
+ */
 AcResult acStoreWithOffset(const int3 dst, const size_t num_vertices, AcMesh* host_mesh);
 
 #ifdef __cplusplus
diff --git a/src/core/node.cu b/src/core/node.cu
index fa8a35d..f9a4aa9 100644
--- a/src/core/node.cu
+++ b/src/core/node.cu
@@ -16,6 +16,113 @@
     You should have received a copy of the GNU General Public License
     along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
 */
+
+/**
+ * @file
+ * \brief Multi-GPU implementation.
+ *
+ %JP: The old way for computing boundary conditions conflicts with the
+ way we have to do things with multiple GPUs.
+
+ The older approach relied on unified memory, which represented the whole
+ memory area as one huge mesh instead of several smaller ones. However, unified memory
+ in its current state is more meant for quick prototyping when performance is not an issue.
+ Getting the CUDA driver to migrate data intelligently across GPUs is much more difficult
+ than when managing the memory explicitly.
+
+ In this new approach, I have simplified the multi- and single-GPU layers significantly.
+ Quick rundown:
+         New struct: Grid. There are two global variables, "grid" and "subgrid", which
+         contain the extents of the whole simulation domain and the decomposed grids,
+ respectively. To simplify thing, we require that each GPU is assigned the same amount of
+ work, therefore each GPU in the node is assigned and "subgrid.m" -sized block of data to
+ work with.
+
+         The whole simulation domain is decomposed with respect to the z dimension.
+         For example, if the grid contains (nx, ny, nz) vertices, then the subgrids
+         contain (nx, ny, nz / num_devices) vertices.
+
+         An local index (i, j, k) in some subgrid can be mapped to the global grid with
+                 global idx = (i, j, k + device_id * subgrid.n.z)
+
+ Terminology:
+         - Single-GPU function: a function defined on the single-GPU layer (device.cu)
+
+ Changes required to this commented code block:
+         - The thread block dimensions (tpb) are no longer passed to the kernel here but in
+ device.cu instead. Same holds for any complex index calculations. Instead, the local
+ coordinates should be passed as an int3 type without having to consider how the data is
+ actually laid out in device memory
+         - The unified memory buffer no longer exists (d_buffer). Instead, we have an opaque
+ handle of type "Device" which should be passed to single-GPU functions. In this file, all
+ devices are stored in a global array "devices[num_devices]".
+         - Every single-GPU function is executed asynchronously by default such that we
+           can optimize Astaroth by executing memory transactions concurrently with
+ computation. Therefore a StreamType should be passed as a parameter to single-GPU functions.
+           Refresher: CUDA function calls are non-blocking when a stream is explicitly passed
+           as a parameter and commands executing in different streams can be processed
+           in parallel/concurrently.
+
+
+ Note on periodic boundaries (might be helpful when implementing other boundary conditions):
+
+         With multiple GPUs, periodic boundary conditions applied on indices ranging from
+
+                 (0, 0, STENCIL_ORDER/2) to (subgrid.m.x, subgrid.m.y, subgrid.m.z -
+ STENCIL_ORDER/2)
+
+         on a single device are "local", in the sense that they can be computed without
+ having to exchange data with neighboring GPUs. Special care is needed only for transferring
+         the data to the fron and back plates outside this range. In the solution we use
+ here, we solve the local boundaries first, and then just exchange the front and back plates
+         in a "ring", like so
+                                 device_id
+                     (n) <-> 0 <-> 1 <-> ... <-> n <-> (0)
+
+### Throughout this file we use the following notation and names for various index offsets
+
+    Global coordinates: coordinates with respect to the global grid (static Grid grid)
+    Local coordinates: coordinates with respect to the local subgrid (static Subgrid subgrid)
+
+    s0, s1: source indices in global coordinates
+    d0, d1: destination indices in global coordinates
+    da = max(s0, d0);
+    db = min(s1, d1);
+
+    These are used in at least
+    acLoad()
+    acStore()
+    acSynchronizeHalos()
+
+     Here we decompose the host mesh and distribute it among the GPUs in
+     the node.
+
+     The host mesh is a huge contiguous block of data. Its dimensions are given by
+     the global variable named "grid". A "grid" is decomposed into "subgrids",
+     one for each GPU. Here we check which parts of the range s0...s1 maps
+     to the memory space stored by some GPU, ranging d0...d1, and transfer
+     the data if needed.
+
+     The index mapping is inherently quite involved, but here's a picture which
+     hopefully helps make sense out of all this.
+
+
+     Grid
+                                      |----num_vertices---|
+     xxx|....................................................|xxx
+              ^                   ^   ^                   ^
+             d0                  d1  s0 (src)            s1
+
+     Subgrid
+
+              xxx|.............|xxx
+              ^                   ^
+             d0                  d1
+
+                                  ^   ^
+                                 db  da
+ *
+ */
 #include "astaroth_node.h"
 
 #include "astaroth_device.h"