Added some new comments + some helpful old comments from a time before the interface revision
This commit is contained in:
@@ -41,27 +41,44 @@ extern "C" {
|
|||||||
#define acLoadDeviceConstant(x, y) acGridLoadConstant(STREAM_DEFAULT, x, y)
|
#define acLoadDeviceConstant(x, y) acGridLoadConstant(STREAM_DEFAULT, x, y)
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
/** Allocates all memory and initializes the devices visible to the caller. Should be
|
||||||
|
* called before any other function in this interface. */
|
||||||
AcResult acInit(const AcMeshInfo mesh_info);
|
AcResult acInit(const AcMeshInfo mesh_info);
|
||||||
|
|
||||||
|
/** Frees all GPU allocations and resets all devices in the node. Should be
|
||||||
|
* called at exit. */
|
||||||
AcResult acQuit(void);
|
AcResult acQuit(void);
|
||||||
|
|
||||||
|
/** Synchronizes a specific stream. All streams are synchronized if STREAM_ALL is passed as a
|
||||||
|
* parameter*/
|
||||||
AcResult acSynchronizeStream(const Stream stream);
|
AcResult acSynchronizeStream(const Stream stream);
|
||||||
|
|
||||||
|
/** Loads a constant to the memories of the devices visible to the caller */
|
||||||
AcResult acLoadDeviceConstant(const AcRealParam param, const AcReal value);
|
AcResult acLoadDeviceConstant(const AcRealParam param, const AcReal value);
|
||||||
|
|
||||||
|
/** Loads an AcMesh to the devices visible to the caller */
|
||||||
AcResult acLoad(const AcMesh host_mesh);
|
AcResult acLoad(const AcMesh host_mesh);
|
||||||
|
|
||||||
|
/** Stores the AcMesh distributed among the devices visible to the caller back to the host*/
|
||||||
AcResult acStore(AcMesh* host_mesh);
|
AcResult acStore(AcMesh* host_mesh);
|
||||||
|
|
||||||
|
/** Performs Runge-Kutta 3 integration. Note: Boundary conditions are not applied after the final
|
||||||
|
* substep and the user is responsible for calling acBoundcondStep before reading the data. */
|
||||||
AcResult acIntegrate(const AcReal dt);
|
AcResult acIntegrate(const AcReal dt);
|
||||||
|
|
||||||
|
/** Applies periodic boundary conditions for the Mesh distributed among the devices visible to the
|
||||||
|
* caller*/
|
||||||
AcResult acBoundcondStep(void);
|
AcResult acBoundcondStep(void);
|
||||||
|
|
||||||
|
/** Does a scalar reduction with the data stored in some vertex buffer */
|
||||||
AcReal acReduceScal(const ReductionType rtype, const VertexBufferHandle vtxbuf_handle);
|
AcReal acReduceScal(const ReductionType rtype, const VertexBufferHandle vtxbuf_handle);
|
||||||
|
|
||||||
|
/** Does a vector reduction with vertex buffers where the vector components are (a, b, c) */
|
||||||
AcReal acReduceVec(const ReductionType rtype, const VertexBufferHandle a,
|
AcReal acReduceVec(const ReductionType rtype, const VertexBufferHandle a,
|
||||||
const VertexBufferHandle b, const VertexBufferHandle c);
|
const VertexBufferHandle b, const VertexBufferHandle c);
|
||||||
|
|
||||||
|
/** Stores a subset of the mesh stored across the devices visible to the caller back to host memory.
|
||||||
|
*/
|
||||||
AcResult acStoreWithOffset(const int3 dst, const size_t num_vertices, AcMesh* host_mesh);
|
AcResult acStoreWithOffset(const int3 dst, const size_t num_vertices, AcMesh* host_mesh);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
|
107
src/core/node.cu
107
src/core/node.cu
@@ -16,6 +16,113 @@
|
|||||||
You should have received a copy of the GNU General Public License
|
You should have received a copy of the GNU General Public License
|
||||||
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
|
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @file
|
||||||
|
* \brief Multi-GPU implementation.
|
||||||
|
*
|
||||||
|
%JP: The old way for computing boundary conditions conflicts with the
|
||||||
|
way we have to do things with multiple GPUs.
|
||||||
|
|
||||||
|
The older approach relied on unified memory, which represented the whole
|
||||||
|
memory area as one huge mesh instead of several smaller ones. However, unified memory
|
||||||
|
in its current state is meant more for quick prototyping, when performance is not an issue.
|
||||||
|
Getting the CUDA driver to migrate data intelligently across GPUs is much more difficult
|
||||||
|
than when managing the memory explicitly.
|
||||||
|
|
||||||
|
In this new approach, I have simplified the multi- and single-GPU layers significantly.
|
||||||
|
Quick rundown:
|
||||||
|
New struct: Grid. There are two global variables, "grid" and "subgrid", which
|
||||||
|
contain the extents of the whole simulation domain and the decomposed grids,
|
||||||
|
respectively. To simplify things, we require that each GPU is assigned the same amount of
|
||||||
|
work, therefore each GPU in the node is assigned a "subgrid.m"-sized block of data to
|
||||||
|
work with.
|
||||||
|
|
||||||
|
The whole simulation domain is decomposed with respect to the z dimension.
|
||||||
|
For example, if the grid contains (nx, ny, nz) vertices, then the subgrids
|
||||||
|
contain (nx, ny, nz / num_devices) vertices.
|
||||||
|
|
||||||
|
A local index (i, j, k) in some subgrid can be mapped to the global grid with
|
||||||
|
global idx = (i, j, k + device_id * subgrid.n.z)
|
||||||
|
|
||||||
|
Terminology:
|
||||||
|
- Single-GPU function: a function defined on the single-GPU layer (device.cu)
|
||||||
|
|
||||||
|
Changes required to this commented code block:
|
||||||
|
- The thread block dimensions (tpb) are no longer passed to the kernel here but in
|
||||||
|
device.cu instead. Same holds for any complex index calculations. Instead, the local
|
||||||
|
coordinates should be passed as an int3 type without having to consider how the data is
|
||||||
|
actually laid out in device memory
|
||||||
|
- The unified memory buffer no longer exists (d_buffer). Instead, we have an opaque
|
||||||
|
handle of type "Device" which should be passed to single-GPU functions. In this file, all
|
||||||
|
devices are stored in a global array "devices[num_devices]".
|
||||||
|
- Every single-GPU function is executed asynchronously by default such that we
|
||||||
|
can optimize Astaroth by executing memory transactions concurrently with
|
||||||
|
computation. Therefore a StreamType should be passed as a parameter to single-GPU functions.
|
||||||
|
Refresher: CUDA function calls are non-blocking when a stream is explicitly passed
|
||||||
|
as a parameter and commands executing in different streams can be processed
|
||||||
|
in parallel/concurrently.
|
||||||
|
|
||||||
|
|
||||||
|
Note on periodic boundaries (might be helpful when implementing other boundary conditions):
|
||||||
|
|
||||||
|
With multiple GPUs, periodic boundary conditions applied on indices ranging from
|
||||||
|
|
||||||
|
(0, 0, STENCIL_ORDER/2) to (subgrid.m.x, subgrid.m.y, subgrid.m.z -
|
||||||
|
STENCIL_ORDER/2)
|
||||||
|
|
||||||
|
on a single device are "local", in the sense that they can be computed without
|
||||||
|
having to exchange data with neighboring GPUs. Special care is needed only for transferring
|
||||||
|
the data to the front and back plates outside this range. In the solution we use
|
||||||
|
here, we solve the local boundaries first, and then just exchange the front and back plates
|
||||||
|
in a "ring", like so
|
||||||
|
device_id
|
||||||
|
(n) <-> 0 <-> 1 <-> ... <-> n <-> (0)
|
||||||
|
|
||||||
|
### Throughout this file we use the following notation and names for various index offsets
|
||||||
|
|
||||||
|
Global coordinates: coordinates with respect to the global grid (static Grid grid)
|
||||||
|
Local coordinates: coordinates with respect to the local subgrid (static Subgrid subgrid)
|
||||||
|
|
||||||
|
s0, s1: source indices in global coordinates
|
||||||
|
d0, d1: destination indices in global coordinates
|
||||||
|
da = max(s0, d0);
|
||||||
|
db = min(s1, d1);
|
||||||
|
|
||||||
|
These are used in at least
|
||||||
|
acLoad()
|
||||||
|
acStore()
|
||||||
|
acSynchronizeHalos()
|
||||||
|
|
||||||
|
Here we decompose the host mesh and distribute it among the GPUs in
|
||||||
|
the node.
|
||||||
|
|
||||||
|
The host mesh is a huge contiguous block of data. Its dimensions are given by
|
||||||
|
the global variable named "grid". A "grid" is decomposed into "subgrids",
|
||||||
|
one for each GPU. Here we check which parts of the range s0...s1 map
|
||||||
|
to the memory space stored by some GPU, ranging d0...d1, and transfer
|
||||||
|
the data if needed.
|
||||||
|
|
||||||
|
The index mapping is inherently quite involved, but here's a picture which
|
||||||
|
hopefully helps make sense out of all this.
|
||||||
|
|
||||||
|
|
||||||
|
Grid
|
||||||
|
|----num_vertices---|
|
||||||
|
xxx|....................................................|xxx
|
||||||
|
^ ^ ^ ^
|
||||||
|
d0 d1 s0 (src) s1
|
||||||
|
|
||||||
|
Subgrid
|
||||||
|
|
||||||
|
xxx|.............|xxx
|
||||||
|
^ ^
|
||||||
|
d0 d1
|
||||||
|
|
||||||
|
^ ^
|
||||||
|
db da
|
||||||
|
*
|
||||||
|
*/
|
||||||
#include "astaroth_node.h"
|
#include "astaroth_node.h"
|
||||||
|
|
||||||
#include "astaroth_device.h"
|
#include "astaroth_device.h"
|
||||||
|
Reference in New Issue
Block a user