diff --git a/acc/mhd_solver/stencil_process.sps b/acc/mhd_solver/stencil_process.sps
index cc692fe..813f51a 100644
--- a/acc/mhd_solver/stencil_process.sps
+++ b/acc/mhd_solver/stencil_process.sps
@@ -229,9 +229,9 @@ helical_forcing(Scalar magnitude, Vector k_force, Vector xx, Vector ff_re, Vecto
     // MV: Good idea. Not an immediate priority.
     // Fun related article:
     // https://randomascii.wordpress.com/2014/10/09/intel-underestimates-error-bounds-by-1-3-quintillion/
-    xx.x = xx.x*(2.0*M_PI/(dsx*globalGrid.n.x));
-    xx.y = xx.y*(2.0*M_PI/(dsy*globalGrid.n.y));
-    xx.z = xx.z*(2.0*M_PI/(dsz*globalGrid.n.z));
+    xx.x = xx.x*(2.0*M_PI/(dsx*globalGridN.x));
+    xx.y = xx.y*(2.0*M_PI/(dsy*globalGridN.y));
+    xx.z = xx.z*(2.0*M_PI/(dsz*globalGridN.z));

     Scalar cos_phi = cos(phi);
     Scalar sin_phi = sin(phi);
@@ -254,9 +254,9 @@ helical_forcing(Scalar magnitude, Vector k_force, Vector xx, Vector ff_re, Vecto
 Vector
 forcing(int3 globalVertexIdx, Scalar dt)
 {
-    Vector a = Scalar(.5) * (Vector){globalGrid.n.x * dsx,
-                                     globalGrid.n.y * dsy,
-                                     globalGrid.n.z * dsz}; // source (origin)
+    Vector a = Scalar(.5) * (Vector){globalGridN.x * dsx,
+                                     globalGridN.y * dsy,
+                                     globalGridN.z * dsz}; // source (origin)
     Vector xx = (Vector){(globalVertexIdx.x - nx_min) * dsx,
                          (globalVertexIdx.y - ny_min) * dsy,
                          (globalVertexIdx.z - nz_min) * dsz}; // sink (current index)
diff --git a/include/astaroth.h b/include/astaroth.h
index 52ba3d0..43438f7 100644
--- a/include/astaroth.h
+++ b/include/astaroth.h
@@ -24,95 +24,62 @@ extern "C" {

 #include "astaroth_defines.h"

-/** Checks whether there are any CUDA devices available. Returns AC_SUCCESS if there is 1 or more,
- * AC_FAILURE otherwise. */
-AcResult acCheckDeviceAvailability(void);
+#include "astaroth_device.h"
+#include "astaroth_node.h"

-/** Synchronizes the stream shared by all GPUs in the node. Synchronizes all streams if STREAM_ALL
- * passed as a parameter */
-AcResult acSynchronizeStream(const StreamType stream);
+/*
+#include "astaroth_grid.h"
+#define acInit(x) acGridInit(x)
+#define acQuit() acGridQuit()
+#define acLoad(x) acGridLoadMesh(STREAM_DEFAULT, x)
+#define acReduceScal(x, y) acGridReduceScal(STREAM_DEFAULT, x, y)
+#define acReduceVec(x, y, z, w) acGridReduceVec(STREAM_DEFAULT, x, y, z, w)
+#define acBoundcondStep() acGridPeriodicBoundcondStep(STREAM_DEFAULT)
+#define acIntegrate(x) acGridIntegrateStep(STREAM_DEFAULT, x)
+#define acStore(x) acGridStoreMesh(STREAM_DEFAULT, x)
+#define acSynchronizeStream(x) acGridSynchronizeStream(x)
+#define acLoadDeviceConstant(x, y) acGridLoadConstant(STREAM_DEFAULT, x, y)
+*/

-/** Synchronizes the mesh distributed across multiple GPUs. Must be called if the data in the halos
- * of neighboring GPUs has been modified by an asynchronous function, f.ex. acBoundcondStep() */
-AcResult acSynchronizeMesh(void);
-
-/** Starting point of all GPU computation. Handles the allocation and
-initialization of *all memory needed on all GPUs in the node*. In other words,
-sets up everything GPU-side so that calling any other GPU interface function
-afterwards does not result in illegal memory accesses. */
+/** Allocates all memory and initializes the devices visible to the caller. Should be
+ * called before any other function in this interface. */
 AcResult acInit(const AcMeshInfo mesh_info);

 /** Frees all GPU allocations and resets all devices in the node. Should be
  * called at exit. */
 AcResult acQuit(void);

-/** Does all three substeps of the RK3 integration and computes the boundary
-conditions when necessary. The result is synchronized and the boundary conditions are applied
-after the final substep, after which the result can be fetched to CPU memory with acStore. */
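/* A minimal usage sketch of the simplified interface declared in this header (an
 * illustration for review, not part of the patch; the dt value, step count, and use of
 * vertex buffer 0 are placeholder assumptions):
 */
#include "astaroth.h"

static void
example_run(AcMesh* mesh) // host mesh, allocated and initialized by the caller
{
    acInit(mesh->info);            // allocates and initializes all visible devices
    acLoad(*mesh);                 // distributes the host mesh to the devices
    for (int step = 0; step < 10; ++step) {
        acIntegrate((AcReal)1e-3); // three RK3 substeps
        acBoundcondStep();         // boundconds are NOT applied after the final substep
    }
    const AcReal vtxbuf0_max = acReduceScal(RTYPE_MAX, (VertexBufferHandle)0);
    (void)vtxbuf0_max;
    acStore(mesh);                 // gathers the result back to host memory
    acQuit();
}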
+/** Synchronizes a specific stream. All streams are synchronized if STREAM_ALL is passed as a
+ * parameter. */
+AcResult acSynchronizeStream(const Stream stream);
+
+/** Loads a constant to the memories of the devices visible to the caller */
+AcResult acLoadDeviceConstant(const AcRealParam param, const AcReal value);
+
+/** Loads an AcMesh to the devices visible to the caller */
+AcResult acLoad(const AcMesh host_mesh);
+
+/** Stores the AcMesh distributed among the devices visible to the caller back to the host. */
+AcResult acStore(AcMesh* host_mesh);
+
+/** Performs Runge-Kutta 3 integration. Note: Boundary conditions are not applied after the final
+ * substep and the user is responsible for calling acBoundcondStep before reading the data. */
 AcResult acIntegrate(const AcReal dt);

-/** Performs a scalar reduction on all GPUs in the node and returns the result. Operates on the
- * whole computational domain, which must be up to date and synchronized before calling
- * acReduceScal.
- */
-AcReal acReduceScal(const ReductionType rtype, const VertexBufferHandle a);
+/** Applies periodic boundary conditions for the Mesh distributed among the devices visible to the
+ * caller. */
+AcResult acBoundcondStep(void);

-/** Performs a vector reduction on all GPUs in the node and returns the result. Operates on the
- * whole computational domain, which must be up to date and synchronized before calling
- * acReduceVec.
- */
+/** Does a scalar reduction with the data stored in the specified vertex buffer */
+AcReal acReduceScal(const ReductionType rtype, const VertexBufferHandle vtxbuf_handle);
+
+/** Does a vector reduction with vertex buffers where the vector components are (a, b, c) */
 AcReal acReduceVec(const ReductionType rtype, const VertexBufferHandle a,
                    const VertexBufferHandle b, const VertexBufferHandle c);

-/** Distributes the host mesh among the GPUs in the node. Synchronous. */
-AcResult acLoad(const AcMesh host_mesh);
-
-/** Gathers the mesh stored across GPUs in the node and stores it back to host memory. Synchronous.
- */
-AcResult acStore(AcMesh* host_mesh);
-
-/*
- * =============================================================================
- * Astaroth interface: Advanced functions. Asynchronous.
- * =============================================================================
- */
-/** Loads a parameter to the constant memory of all GPUs in the node. Asynchronous. */
-AcResult acLoadDeviceConstant(const AcRealParam param, const AcReal value);
-AcResult acLoadDeviceConstantAsync(const AcRealParam param, const AcReal value,
-                                   const StreamType stream);
-
-/** Splits a subset of the host_mesh and distributes it among the GPUs in the node. Asynchronous. */
-AcResult acLoadWithOffset(const AcMesh host_mesh, const int3 start, const int num_vertices);
-AcResult acLoadWithOffsetAsync(const AcMesh host_mesh, const int3 start, const int num_vertices,
-                               const StreamType stream);
-
-/** Gathers a subset of the data distributed among the GPUs in the node and stores the mesh back to
- * CPU memory. Asynchronous.
- */
-AcResult acStoreWithOffset(const int3 start, const int num_vertices, AcMesh* host_mesh);
-AcResult acStoreWithOffsetAsync(const int3 start, const int num_vertices, AcMesh* host_mesh,
-                                const StreamType stream);
-
-/** Performs a single RK3 step without computing boundary conditions. Asynchronous.*/
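/* A sketch of the intended semantics of the new Stream type introduced below in
 * astaroth_defines.h (illustrative, not part of the patch): work issued on different
 * streams may run concurrently, and STREAM_ALL is defined outside the enum so it can act
 * as a "synchronize everything" sentinel. Vertex buffer handles 0 and 1 are placeholders;
 * the acDevice* functions are declared in the new astaroth_device.h further down.
 */
static void
stream_example(const Device device, const AcMesh host_mesh)
{
    // Two host-to-device uploads issued on separate streams; they may overlap.
    acDeviceLoadVertexBuffer(device, STREAM_1, host_mesh, (VertexBufferHandle)0);
    acDeviceLoadVertexBuffer(device, STREAM_2, host_mesh, (VertexBufferHandle)1);

    acDeviceSynchronizeStream(device, STREAM_1);   // waits for the first upload only
    acDeviceSynchronizeStream(device, STREAM_ALL); // waits for every stream on the device
}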
-AcResult acIntegrateStep(const int isubstep, const AcReal dt);
-AcResult acIntegrateStepAsync(const int isubstep, const AcReal dt, const StreamType stream);
-
-/** Performs a single RK3 step on a subset of the mesh without computing the boundary conditions.
- * Asynchronous.*/
-AcResult acIntegrateStepWithOffset(const int isubstep, const AcReal dt, const int3 start,
-                                   const int3 end);
-AcResult acIntegrateStepWithOffsetAsync(const int isubstep, const AcReal dt, const int3 start,
-                                        const int3 end, const StreamType stream);
-
-/** Performs the boundary condition step on the GPUs in the node. Asynchronous. */
-AcResult acBoundcondStep(void);
-AcResult acBoundcondStepAsync(const StreamType stream);
-
-/*
- * =============================================================================
- * Revised interface
- * =============================================================================
+/** Stores a subset of the mesh stored across the devices visible to the caller back to host memory. */
+AcResult acStoreWithOffset(const int3 dst, const size_t num_vertices, AcMesh* host_mesh);

 #ifdef __cplusplus
 } // extern "C"
diff --git a/include/astaroth_defines.h b/include/astaroth_defines.h
index 73a25d9..345dd5d 100644
--- a/include/astaroth_defines.h
+++ b/include/astaroth_defines.h
@@ -86,7 +86,9 @@ typedef struct {
     FUNC(AC_nxy),\
     FUNC(AC_nxyz),\

-#define AC_FOR_BUILTIN_INT3_PARAM_TYPES(FUNC)
+#define AC_FOR_BUILTIN_INT3_PARAM_TYPES(FUNC)\
+    FUNC(AC_global_grid_n),\
+    FUNC(AC_multigpu_offset),

 #define AC_FOR_BUILTIN_REAL_PARAM_TYPES(FUNC)

@@ -106,9 +108,26 @@ typedef enum {

 typedef enum {
     STREAM_DEFAULT,
-    NUM_STREAM_TYPES, //
-    STREAM_ALL
-} StreamType;
+    STREAM_0,
+    STREAM_1,
+    STREAM_2,
+    STREAM_3,
+    STREAM_4,
+    STREAM_5,
+    STREAM_6,
+    STREAM_7,
+    STREAM_8,
+    STREAM_9,
+    STREAM_10,
+    STREAM_11,
+    STREAM_12,
+    STREAM_13,
+    STREAM_14,
+    STREAM_15,
+    STREAM_16,
+    NUM_STREAM_TYPES
+} Stream;
+#define STREAM_ALL (NUM_STREAM_TYPES)

 #define AC_GEN_ID(X) X
 typedef enum {
diff --git a/include/astaroth_device.h b/include/astaroth_device.h
new file mode 100644
index 0000000..442b1b1
--- /dev/null
+++ b/include/astaroth_device.h
@@ -0,0 +1,127 @@
+/*
+    Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
+*/
+#pragma once
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "astaroth_defines.h"
+
+typedef struct device_s* Device; // Opaque pointer to device_s. Analogous to dispatchable handles
+                                 // in Vulkan, f.ex.
VkDevice + +/** */ +AcResult acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device); + +/** */ +AcResult acDeviceDestroy(Device device); + +/** */ +AcResult acDevicePrintInfo(const Device device); + +/** */ +AcResult acDeviceAutoOptimize(const Device device); + +/** */ +AcResult acDeviceSynchronizeStream(const Device device, const Stream stream); + +/** */ +AcResult acDeviceSwapBuffers(const Device device); + +/** */ +AcResult acDeviceLoadConstant(const Device device, const Stream stream, const AcRealParam param, + const AcReal value); + +/** */ +AcResult acDeviceLoadVertexBufferWithOffset(const Device device, const Stream stream, + const AcMesh host_mesh, + const VertexBufferHandle vtxbuf_handle, const int3 src, + const int3 dst, const int num_vertices); + +/** Deprecated */ +AcResult acDeviceLoadMeshWithOffset(const Device device, const Stream stream, + const AcMesh host_mesh, const int3 src, const int3 dst, + const int num_vertices); + +/** */ +AcResult acDeviceLoadVertexBuffer(const Device device, const Stream stream, const AcMesh host_mesh, + const VertexBufferHandle vtxbuf_handle); + +/** Deprecated */ +AcResult acDeviceLoadMesh(const Device device, const Stream stream, const AcMesh host_mesh); + +/** */ +AcResult acDeviceStoreVertexBufferWithOffset(const Device device, const Stream stream, + const VertexBufferHandle vtxbuf_handle, const int3 src, + const int3 dst, const int num_vertices, + AcMesh* host_mesh); + +/** Deprecated */ +AcResult acDeviceStoreMeshWithOffset(const Device device, const Stream stream, const int3 src, + const int3 dst, const int num_vertices, AcMesh* host_mesh); + +/** */ +AcResult acDeviceStoreVertexBuffer(const Device device, const Stream stream, + const VertexBufferHandle vtxbuf_handle, AcMesh* host_mesh); + +/** Deprecated */ +AcResult acDeviceStoreMesh(const Device device, const Stream stream, AcMesh* host_mesh); + +/** */ +AcResult acDeviceTransferVertexBufferWithOffset(const Device src_device, const Stream stream, + const VertexBufferHandle vtxbuf_handle, + const int3 src, const int3 dst, + const int num_vertices, Device dst_device); + +/** Deprecated */ +AcResult acDeviceTransferMeshWithOffset(const Device src_device, const Stream stream, + const int3 src, const int3 dst, const int num_vertices, + Device* dst_device); + +/** */ +AcResult acDeviceTransferVertexBuffer(const Device src_device, const Stream stream, + const VertexBufferHandle vtxbuf_handle, Device dst_device); + +/** Deprecated */ +AcResult acDeviceTransferMesh(const Device src_device, const Stream stream, Device dst_device); + +/** */ +AcResult acDeviceIntegrateSubstep(const Device device, const Stream stream, const int step_number, + const int3 start, const int3 end, const AcReal dt); +/** */ +AcResult acDevicePeriodicBoundcondStep(const Device device, const Stream stream, + const VertexBufferHandle vtxbuf_handle, const int3 start, + const int3 end); + +/** */ +AcResult acDevicePeriodicBoundconds(const Device device, const Stream stream, const int3 start, + const int3 end); + +/** */ +AcResult acDeviceReduceScal(const Device device, const Stream stream, const ReductionType rtype, + const VertexBufferHandle vtxbuf_handle, AcReal* result); +/** */ +AcResult acDeviceReduceVec(const Device device, const Stream stream_type, const ReductionType rtype, + const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1, + const VertexBufferHandle vtxbuf2, AcReal* result); + +#ifdef __cplusplus +} // extern "C" +#endif diff --git a/include/astaroth_grid.h 
b/include/astaroth_grid.h new file mode 100644 index 0000000..3fb5f8f --- /dev/null +++ b/include/astaroth_grid.h @@ -0,0 +1,99 @@ +/* + Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae. + + This file is part of Astaroth. + + Astaroth is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Astaroth is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Astaroth. If not, see . +*/ +#pragma once + +#ifdef __cplusplus +extern "C" { +#endif + +#include "astaroth_defines.h" + +/** */ +AcResult acGridInit(const AcMeshInfo node_config); + +/** */ +AcResult acGridQuit(void); + +/** */ +AcResult acGridSynchronizeStream(const Stream stream); + +/** */ +AcResult acGridSwapBuffers(void); + +/** */ +AcResult acGridLoadConstant(const Stream stream, const AcRealParam param, const AcReal value); + +/** */ +AcResult acGridLoadVertexBufferWithOffset(const Stream stream, const AcMesh host_mesh, + const VertexBufferHandle vtxbuf_handle, const int3 src, + const int3 dst, const int num_vertices); + +/** */ +AcResult acGridLoadMeshWithOffset(const Stream stream, const AcMesh host_mesh, const int3 src, + const int3 dst, const int num_vertices); + +/** */ +AcResult acGridLoadVertexBuffer(const Stream stream, const AcMesh host_mesh, + const VertexBufferHandle vtxbuf_handle); + +/** */ +AcResult acGridLoadMesh(const Stream stream, const AcMesh host_mesh); + +/** */ +AcResult acGridStoreVertexBufferWithOffset(const Stream stream, + const VertexBufferHandle vtxbuf_handle, const int3 src, + const int3 dst, const int num_vertices, + AcMesh* host_mesh); + +/** */ +AcResult acGridStoreMeshWithOffset(const Stream stream, const int3 src, const int3 dst, + const int num_vertices, AcMesh* host_mesh); + +/** */ +AcResult acGridStoreVertexBuffer(const Stream stream, const VertexBufferHandle vtxbuf_handle, + AcMesh* host_mesh); + +/** */ +AcResult acGridStoreMesh(const Stream stream, AcMesh* host_mesh); + +/** */ +AcResult acGridIntegrateSubstep(const Stream stream, const int step_number, const int3 start, + const int3 end, const AcReal dt); + +/** */ +AcResult acGridIntegrateStep(const Stream stream, const AcReal dt); + +/** */ +AcResult acGridPeriodicBoundcondStep(const Stream stream); +/** */ +/* acGridReduceScal(const Stream stream, const ReductionType rtype, + const VertexBufferHandle vtxbuf_handle, AcReal* result); */ +AcReal acGridReduceScal(const Stream stream, const ReductionType rtype, + const VertexBufferHandle vtxbuf_handle); +/** */ +/*AcResult acGridReduceVec(const Stream stream, const ReductionType rtype, + const VertexBufferHandle vec0, const VertexBufferHandle vec1, + const VertexBufferHandle vec2, AcReal* result);*/ +AcReal acGridReduceVec(const Stream stream, const ReductionType rtype, + const VertexBufferHandle vec0, const VertexBufferHandle vec1, + const VertexBufferHandle vec2); + +#ifdef __cplusplus +} // extern "C" +#endif diff --git a/include/astaroth_node.h b/include/astaroth_node.h new file mode 100644 index 0000000..6b2d626 --- /dev/null +++ b/include/astaroth_node.h @@ -0,0 +1,124 @@ +/* + Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae. 
+ + This file is part of Astaroth. + + Astaroth is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Astaroth is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Astaroth. If not, see . +*/ +#pragma once + +#ifdef __cplusplus +extern "C" { +#endif + +#include "astaroth_defines.h" + +typedef struct node_s* Node; // Opaque pointer to node_s. + +typedef struct { + +} DeviceConfiguration; + +/** */ +AcResult acNodeCreate(const int id, const AcMeshInfo node_config, Node* node); + +/** */ +AcResult acNodeDestroy(Node node); + +/** */ +AcResult acNodePrintInfo(const Node node); + +/** */ +AcResult acNodeQueryDeviceConfiguration(const Node node, DeviceConfiguration* config); + +/** */ +AcResult acNodeAutoOptimize(const Node node); + +/** */ +AcResult acNodeSynchronizeStream(const Node node, const Stream stream); + +/** Deprecated ? */ +AcResult acNodeSynchronizeVertexBuffer(const Node node, const Stream stream, + const VertexBufferHandle vtxbuf_handle); // Not in Device + +/** */ +AcResult acNodeSynchronizeMesh(const Node node, const Stream stream); // Not in Device + +/** */ +AcResult acNodeSwapBuffers(const Node node); + +/** */ +AcResult acNodeLoadConstant(const Node node, const Stream stream, const AcRealParam param, + const AcReal value); + +/** Deprecated ? Might be useful though if the user wants to load only one vtxbuf. But in this case + * the user should supply a AcReal* instead of vtxbuf_handle */ +AcResult acNodeLoadVertexBufferWithOffset(const Node node, const Stream stream, + const AcMesh host_mesh, + const VertexBufferHandle vtxbuf_handle, const int3 src, + const int3 dst, const int num_vertices); + +/** */ +AcResult acNodeLoadMeshWithOffset(const Node node, const Stream stream, const AcMesh host_mesh, + const int3 src, const int3 dst, const int num_vertices); + +/** Deprecated ? */ +AcResult acNodeLoadVertexBuffer(const Node node, const Stream stream, const AcMesh host_mesh, + const VertexBufferHandle vtxbuf_handle); + +/** */ +AcResult acNodeLoadMesh(const Node node, const Stream stream, const AcMesh host_mesh); + +/** Deprecated ? */ +AcResult acNodeStoreVertexBufferWithOffset(const Node node, const Stream stream, + const VertexBufferHandle vtxbuf_handle, const int3 src, + const int3 dst, const int num_vertices, + AcMesh* host_mesh); + +/** */ +AcResult acNodeStoreMeshWithOffset(const Node node, const Stream stream, const int3 src, + const int3 dst, const int num_vertices, AcMesh* host_mesh); + +/** Deprecated ? 
*/ +AcResult acNodeStoreVertexBuffer(const Node node, const Stream stream, + const VertexBufferHandle vtxbuf_handle, AcMesh* host_mesh); + +/** */ +AcResult acNodeStoreMesh(const Node node, const Stream stream, AcMesh* host_mesh); + +/** */ +AcResult acNodeIntegrateSubstep(const Node node, const Stream stream, const int step_number, + const int3 start, const int3 end, const AcReal dt); + +/** */ +AcResult acNodeIntegrate(const Node node, const AcReal dt); + +/** */ +AcResult acNodePeriodicBoundcondStep(const Node node, const Stream stream, + const VertexBufferHandle vtxbuf_handle); + +/** */ +AcResult acNodePeriodicBoundconds(const Node node, const Stream stream); + +/** */ +AcResult acNodeReduceScal(const Node node, const Stream stream, const ReductionType rtype, + const VertexBufferHandle vtxbuf_handle, AcReal* result); +/** */ +AcResult acNodeReduceVec(const Node node, const Stream stream_type, const ReductionType rtype, + const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1, + const VertexBufferHandle vtxbuf2, AcReal* result); + +#ifdef __cplusplus +} // extern "C" +#endif diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index b56c770..642c7f1 100644 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -3,7 +3,8 @@ ######################################## ## Find packages -find_package(CUDA 9 REQUIRED) +find_package(CUDA REQUIRED) +# find_package(OpenMP REQUIRED) ## Architecture and optimization flags set(CUDA_ARCH_FLAGS -gencode arch=compute_37,code=sm_37 @@ -12,7 +13,8 @@ set(CUDA_ARCH_FLAGS -gencode arch=compute_37,code=sm_37 -gencode arch=compute_61,code=sm_61 -lineinfo -ftz=true # Flush denormalized floats to zero - -std=c++11) + -std=c++11 + )# --compiler-options ${OpenMP_CXX_FLAGS}) #--maxrregcount=255 # -Xptxas -dlcm=ca opt-in to cache all global loads to L1/texture cache # =cg to opt out @@ -32,6 +34,6 @@ else () endif () ## Create and link the library -include_directories(.) -cuda_add_library(astaroth_core STATIC astaroth.cu device.cu) +cuda_add_library(astaroth_core STATIC astaroth.cu device.cu node.cu) +target_include_directories(astaroth_core PRIVATE .) target_link_libraries(astaroth_core m) diff --git a/src/core/astaroth.cu b/src/core/astaroth.cu index 736956b..7393fea 100644 --- a/src/core/astaroth.cu +++ b/src/core/astaroth.cu @@ -16,119 +16,8 @@ You should have received a copy of the GNU General Public License along with Astaroth. If not, see . */ - -/** - * @file - * \brief Multi-GPU implementation. - * - %JP: The old way for computing boundary conditions conflicts with the - way we have to do things with multiple GPUs. - - The older approach relied on unified memory, which represented the whole - memory area as one huge mesh instead of several smaller ones. However, unified memory - in its current state is more meant for quick prototyping when performance is not an issue. - Getting the CUDA driver to migrate data intelligently across GPUs is much more difficult - than when managing the memory explicitly. - - In this new approach, I have simplified the multi- and single-GPU layers significantly. - Quick rundown: - New struct: Grid. There are two global variables, "grid" and "subgrid", which - contain the extents of the whole simulation domain and the decomposed grids, - respectively. To simplify thing, we require that each GPU is assigned the same amount of - work, therefore each GPU in the node is assigned and "subgrid.m" -sized block of data to - work with. 
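/* A small sketch (not part of the patch) of the local-to-global index mapping that the
 * next paragraphs of this comment describe; Grid and its int3 member n are the structs
 * used elsewhere in this file:
 */
static int3
subgrid_to_grid(const int3 local, const int device_id, const Grid subgrid)
{
    // Subgrids are stacked along z: device i owns the global z-slab starting at
    // i * subgrid.n.z, so only the z component needs an offset.
    return (int3){local.x, local.y, local.z + device_id * subgrid.n.z};
}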
- - The whole simulation domain is decomposed with respect to the z dimension. - For example, if the grid contains (nx, ny, nz) vertices, then the subgrids - contain (nx, ny, nz / num_devices) vertices. - - An local index (i, j, k) in some subgrid can be mapped to the global grid with - global idx = (i, j, k + device_id * subgrid.n.z) - - Terminology: - - Single-GPU function: a function defined on the single-GPU layer (device.cu) - - Changes required to this commented code block: - - The thread block dimensions (tpb) are no longer passed to the kernel here but in - device.cu instead. Same holds for any complex index calculations. Instead, the local - coordinates should be passed as an int3 type without having to consider how the data is - actually laid out in device memory - - The unified memory buffer no longer exists (d_buffer). Instead, we have an opaque - handle of type "Device" which should be passed to single-GPU functions. In this file, all - devices are stored in a global array "devices[num_devices]". - - Every single-GPU function is executed asynchronously by default such that we - can optimize Astaroth by executing memory transactions concurrently with - computation. Therefore a StreamType should be passed as a parameter to single-GPU functions. - Refresher: CUDA function calls are non-blocking when a stream is explicitly passed - as a parameter and commands executing in different streams can be processed - in parallel/concurrently. - - - Note on periodic boundaries (might be helpful when implementing other boundary conditions): - - With multiple GPUs, periodic boundary conditions applied on indices ranging from - - (0, 0, STENCIL_ORDER/2) to (subgrid.m.x, subgrid.m.y, subgrid.m.z - - STENCIL_ORDER/2) - - on a single device are "local", in the sense that they can be computed without - having to exchange data with neighboring GPUs. Special care is needed only for transferring - the data to the fron and back plates outside this range. In the solution we use - here, we solve the local boundaries first, and then just exchange the front and back plates - in a "ring", like so - device_id - (n) <-> 0 <-> 1 <-> ... <-> n <-> (0) - -### Throughout this file we use the following notation and names for various index offsets - - Global coordinates: coordinates with respect to the global grid (static Grid grid) - Local coordinates: coordinates with respect to the local subgrid (static Subgrid subgrid) - - s0, s1: source indices in global coordinates - d0, d1: destination indices in global coordinates - da = max(s0, d0); - db = min(s1, d1); - - These are used in at least - acLoad() - acStore() - acSynchronizeHalos() - - Here we decompose the host mesh and distribute it among the GPUs in - the node. - - The host mesh is a huge contiguous block of data. Its dimensions are given by - the global variable named "grid". A "grid" is decomposed into "subgrids", - one for each GPU. Here we check which parts of the range s0...s1 maps - to the memory space stored by some GPU, ranging d0...d1, and transfer - the data if needed. - - The index mapping is inherently quite involved, but here's a picture which - hopefully helps make sense out of all this. 
- - - Grid - |----num_vertices---| - xxx|....................................................|xxx - ^ ^ ^ ^ - d0 d1 s0 (src) s1 - - Subgrid - - xxx|.............|xxx - ^ ^ - d0 d1 - - ^ ^ - db da - * - */ +// #include "astaroth_defines.h" #include "astaroth.h" -#include "errchk.h" - -#include "device.cuh" -#include "math_utils.h" // sum for reductions -// #include "standalone/config_loader.h" // update_config #define AC_GEN_STR(X) #X const char* intparam_names[] = {AC_FOR_BUILTIN_INT_PARAM_TYPES(AC_GEN_STR) // @@ -142,532 +31,80 @@ const char* real3param_names[] = {AC_FOR_BUILTIN_REAL3_PARAM_TYPES(AC_GEN_STR) / const char* vtxbuf_names[] = {AC_FOR_VTXBUF_HANDLES(AC_GEN_STR)}; #undef AC_GEN_STR -static const int MAX_NUM_DEVICES = 32; -static int num_devices = 0; -static Device devices[MAX_NUM_DEVICES] = {}; - -static Grid grid; // A grid consists of num_devices subgrids -static Grid subgrid; - -static int -gridIdx(const Grid grid, const int3 idx) -{ - return idx.x + idx.y * grid.m.x + idx.z * grid.m.x * grid.m.y; -} - -static int3 -gridIdx3d(const Grid grid, const int idx) -{ - return (int3){idx % grid.m.x, (idx % (grid.m.x * grid.m.y)) / grid.m.x, - idx / (grid.m.x * grid.m.y)}; -} - -static void -printInt3(const int3 vec) -{ - printf("(%d, %d, %d)", vec.x, vec.y, vec.z); -} - -static inline void -print(const AcMeshInfo config) -{ - for (int i = 0; i < NUM_INT_PARAMS; ++i) - printf("[%s]: %d\n", intparam_names[i], config.int_params[i]); - for (int i = 0; i < NUM_REAL_PARAMS; ++i) - printf("[%s]: %g\n", realparam_names[i], double(config.real_params[i])); -} - -static void -update_builtin_params(AcMeshInfo* config) -{ - config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER; - ///////////// PAD TEST - // config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER + PAD_SIZE; - ///////////// PAD TEST - config->int_params[AC_my] = config->int_params[AC_ny] + STENCIL_ORDER; - config->int_params[AC_mz] = config->int_params[AC_nz] + STENCIL_ORDER; - - // Bounds for the computational domain, i.e. 
nx_min <= i < nx_max - config->int_params[AC_nx_min] = NGHOST; - config->int_params[AC_nx_max] = config->int_params[AC_nx_min] + config->int_params[AC_nx]; - config->int_params[AC_ny_min] = NGHOST; - config->int_params[AC_ny_max] = config->int_params[AC_ny] + NGHOST; - config->int_params[AC_nz_min] = NGHOST; - config->int_params[AC_nz_max] = config->int_params[AC_nz] + NGHOST; - - /* Additional helper params */ - // Int helpers - config->int_params[AC_mxy] = config->int_params[AC_mx] * config->int_params[AC_my]; - config->int_params[AC_nxy] = config->int_params[AC_nx] * config->int_params[AC_ny]; - config->int_params[AC_nxyz] = config->int_params[AC_nxy] * config->int_params[AC_nz]; - -#if VERBOSE_PRINTING // Defined in astaroth.h - printf("###############################################################\n"); - printf("Config dimensions recalculated:\n"); - print(*config); - printf("###############################################################\n"); -#endif -} - -static Grid -createGrid(const AcMeshInfo config) -{ - Grid grid; - - grid.m = (int3){config.int_params[AC_mx], config.int_params[AC_my], config.int_params[AC_mz]}; - grid.n = (int3){config.int_params[AC_nx], config.int_params[AC_ny], config.int_params[AC_nz]}; - - return grid; -} +static const int num_nodes = 1; +static Node nodes[num_nodes]; AcResult -acCheckDeviceAvailability(void) +acInit(const AcMeshInfo mesh_info) { - int device_count; // Separate from num_devices to avoid side effects - ERRCHK_CUDA_ALWAYS(cudaGetDeviceCount(&device_count)); - if (device_count > 0) - return AC_SUCCESS; - else - return AC_FAILURE; -} - -AcResult -acSynchronizeStream(const StreamType stream) -{ - // #pragma omp parallel for - for (int i = 0; i < num_devices; ++i) { - synchronize(devices[i], stream); - } - - return AC_SUCCESS; -} - -static AcResult -synchronize_halos(const StreamType stream) -{ - // Exchanges the halos of subgrids - // After this step, the data within the main grid ranging from - // (0, 0, NGHOST) -> grid.m.x, grid.m.y, NGHOST + grid.n.z - // has been synchronized and transferred to appropriate subgrids - - // We loop only to num_devices - 1 since the front and back plate of the grid is not - // transferred because their contents depend on the boundary conditions. - - // IMPORTANT NOTE: the boundary conditions must be applied before calling this function! - // I.e. the halos of subgrids must contain up-to-date data! - // #pragma omp parallel for - for (int i = 0; i < num_devices - 1; ++i) { - const int num_vertices = subgrid.m.x * subgrid.m.y * NGHOST; - // ...|ooooxxx|... -> xxx|ooooooo|... - { - const int3 src = (int3){0, 0, subgrid.n.z}; - const int3 dst = (int3){0, 0, 0}; - copyMeshDeviceToDevice(devices[i], stream, src, devices[(i + 1) % num_devices], dst, - num_vertices); - } - // ...|ooooooo|xxx <- ...|xxxoooo|... - { - const int3 src = (int3){0, 0, NGHOST}; - const int3 dst = (int3){0, 0, NGHOST + subgrid.n.z}; - copyMeshDeviceToDevice(devices[(i + 1) % num_devices], stream, src, devices[i], dst, - num_vertices); - } - } - return AC_SUCCESS; -} - -AcResult -acSynchronizeMesh(void) -{ - acSynchronizeStream(STREAM_ALL); - synchronize_halos(STREAM_DEFAULT); - acSynchronizeStream(STREAM_ALL); - - return AC_SUCCESS; -} - -AcResult -acInit(const AcMeshInfo config) -{ - // Get num_devices - ERRCHK_CUDA_ALWAYS(cudaGetDeviceCount(&num_devices)); - if (num_devices < 1) { - ERROR("No CUDA devices found!"); - return AC_FAILURE; - } - if (num_devices > MAX_NUM_DEVICES) { - WARNING("More devices found than MAX_NUM_DEVICES. 
Using only MAX_NUM_DEVICES"); - num_devices = MAX_NUM_DEVICES; - } - if (!AC_MULTIGPU_ENABLED) { - WARNING("MULTIGPU_ENABLED was false. Using only one device"); - num_devices = 1; // Use only one device if multi-GPU is not enabled - } - // Check that num_devices is divisible with AC_nz. This makes decomposing the - // problem domain to multiple GPUs much easier since we do not have to worry - // about remainders - ERRCHK_ALWAYS(config.int_params[AC_nz] % num_devices == 0); - - // Decompose the problem domain - // The main grid - grid = createGrid(config); - - // Subgrids - AcMeshInfo subgrid_config = config; - subgrid_config.int_params[AC_nz] /= num_devices; - update_builtin_params(&subgrid_config); - subgrid = createGrid(subgrid_config); - - // Periodic boundary conditions become weird if the system can "fold unto itself". - ERRCHK_ALWAYS(subgrid.n.x >= STENCIL_ORDER); - ERRCHK_ALWAYS(subgrid.n.y >= STENCIL_ORDER); - ERRCHK_ALWAYS(subgrid.n.z >= STENCIL_ORDER); - -#if VERBOSE_PRINTING - // clang-format off - printf("Grid m "); printInt3(grid.m); printf("\n"); - printf("Grid n "); printInt3(grid.n); printf("\n"); - printf("Subrid m "); printInt3(subgrid.m); printf("\n"); - printf("Subrid n "); printInt3(subgrid.n); printf("\n"); - // clang-format on -#endif - - // Initialize the devices - for (int i = 0; i < num_devices; ++i) { - createDevice(i, subgrid_config, &devices[i]); - loadGlobalGrid(devices[i], grid); - printDeviceInfo(devices[i]); - } - - // Enable peer access - for (int i = 0; i < num_devices; ++i) { - const int front = (i + 1) % num_devices; - const int back = (i - 1 + num_devices) % num_devices; - - int can_access_front, can_access_back; - cudaDeviceCanAccessPeer(&can_access_front, i, front); - cudaDeviceCanAccessPeer(&can_access_back, i, back); -#if VERBOSE_PRINTING - printf( - "Trying to enable peer access from %d to %d (can access: %d) and %d (can access: %d)\n", - i, front, can_access_front, back, can_access_back); -#endif - - cudaSetDevice(i); - if (can_access_front) { - ERRCHK_CUDA_ALWAYS(cudaDeviceEnablePeerAccess(front, 0)); - } - if (can_access_back) { - ERRCHK_CUDA_ALWAYS(cudaDeviceEnablePeerAccess(back, 0)); - } - } - - acSynchronizeStream(STREAM_ALL); - return AC_SUCCESS; + return acNodeCreate(0, mesh_info, &nodes[0]); } AcResult acQuit(void) { - acSynchronizeStream(STREAM_ALL); - - for (int i = 0; i < num_devices; ++i) { - destroyDevice(devices[i]); - } - return AC_SUCCESS; + return acNodeDestroy(nodes[0]); } AcResult -acIntegrateStepWithOffsetAsync(const int isubstep, const AcReal dt, const int3 start, - const int3 end, const StreamType stream) +acSynchronizeStream(const Stream stream) { - // See the beginning of the file for an explanation of the index mapping - // #pragma omp parallel for - for (int i = 0; i < num_devices; ++i) { - // DECOMPOSITION OFFSET HERE - const int3 d0 = (int3){NGHOST, NGHOST, NGHOST + i * subgrid.n.z}; - const int3 d1 = d0 + (int3){subgrid.n.x, subgrid.n.y, subgrid.n.z}; - - const int3 da = max(start, d0); - const int3 db = min(end, d1); - - if (db.z >= da.z) { - const int3 da_local = da - (int3){0, 0, i * subgrid.n.z}; - const int3 db_local = db - (int3){0, 0, i * subgrid.n.z}; - rkStep(devices[i], stream, isubstep, da_local, db_local, dt); - } - } - return AC_SUCCESS; + return acNodeSynchronizeStream(nodes[0], stream); } AcResult -acIntegrateStepWithOffset(const int isubstep, const AcReal dt, const int3 start, const int3 end) +acLoadDeviceConstant(const AcRealParam param, const AcReal value) { - return 
acIntegrateStepWithOffsetAsync(isubstep, dt, start, end, STREAM_DEFAULT); + return acNodeLoadConstant(nodes[0], STREAM_DEFAULT, param, value); } AcResult -acIntegrateStepAsync(const int isubstep, const AcReal dt, const StreamType stream) +acLoad(const AcMesh host_mesh) { - const int3 start = (int3){NGHOST, NGHOST, NGHOST}; - const int3 end = start + grid.n; - return acIntegrateStepWithOffsetAsync(isubstep, dt, start, end, stream); + return acNodeLoadMesh(nodes[0], STREAM_DEFAULT, host_mesh); } AcResult -acIntegrateStep(const int isubstep, const AcReal dt) +acStore(AcMesh* host_mesh) { - return acIntegrateStepAsync(isubstep, dt, STREAM_DEFAULT); -} - -static AcResult -local_boundcondstep(const StreamType stream) -{ - if (num_devices == 1) { - boundcondStep(devices[0], stream, (int3){0, 0, 0}, subgrid.m); - } - else { - // Local boundary conditions - // #pragma omp parallel for - for (int i = 0; i < num_devices; ++i) { - const int3 d0 = (int3){0, 0, NGHOST}; // DECOMPOSITION OFFSET HERE - const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.n.z}; - boundcondStep(devices[i], stream, d0, d1); - } - } - return AC_SUCCESS; -} - -static AcResult -global_boundcondstep(const StreamType stream) -{ - if (num_devices > 1) { - // With periodic boundary conditions we exchange the front and back plates of the - // grid. The exchange is done between the first and last device (0 and num_devices - 1). - const int num_vertices = subgrid.m.x * subgrid.m.y * NGHOST; - // ...|ooooxxx|... -> xxx|ooooooo|... - { - const int3 src = (int3){0, 0, subgrid.n.z}; - const int3 dst = (int3){0, 0, 0}; - copyMeshDeviceToDevice(devices[num_devices - 1], stream, src, devices[0], dst, - num_vertices); - } - // ...|ooooooo|xxx <- ...|xxxoooo|... - { - const int3 src = (int3){0, 0, NGHOST}; - const int3 dst = (int3){0, 0, NGHOST + subgrid.n.z}; - copyMeshDeviceToDevice(devices[0], stream, src, devices[num_devices - 1], dst, - num_vertices); - } - } - return AC_SUCCESS; -} - -AcResult -acBoundcondStepAsync(const StreamType stream) -{ - ERRCHK_ALWAYS(stream < NUM_STREAM_TYPES); - - local_boundcondstep(stream); - acSynchronizeStream(stream); - global_boundcondstep(stream); - synchronize_halos(stream); - acSynchronizeStream(stream); - return AC_SUCCESS; -} - -AcResult -acBoundcondStep(void) -{ - return acBoundcondStepAsync(STREAM_DEFAULT); -} - -static AcResult -swap_buffers(void) -{ - // #pragma omp parallel for - for (int i = 0; i < num_devices; ++i) { - swapBuffers(devices[i]); - } - return AC_SUCCESS; + return acNodeStoreMesh(nodes[0], STREAM_DEFAULT, host_mesh); } AcResult acIntegrate(const AcReal dt) { - acSynchronizeStream(STREAM_ALL); - for (int isubstep = 0; isubstep < 3; ++isubstep) { - acIntegrateStep(isubstep, dt); // Note: boundaries must be initialized. - swap_buffers(); - acBoundcondStep(); - } - return AC_SUCCESS; + /* + acNodeIntegrate(nodes[0], dt); + return acBoundcondStep(); + */ + return acNodeIntegrate(nodes[0], dt); } -static AcReal -simple_final_reduce_scal(const ReductionType rtype, const AcReal* results, const int n) +AcResult +acBoundcondStep(void) { - AcReal res = results[0]; - for (int i = 1; i < n; ++i) { - if (rtype == RTYPE_MAX) { - res = max(res, results[i]); - } - else if (rtype == RTYPE_MIN) { - res = min(res, results[i]); - } - else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP || rtype == RTYPE_SUM) { - res = sum(res, results[i]); - } - else { - ERROR("Invalid rtype"); - } - } - - if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) { - const AcReal inv_n = AcReal(1.) 
/ (grid.n.x * grid.n.y * grid.n.z); - res = sqrt(inv_n * res); - } - return res; + return acNodePeriodicBoundconds(nodes[0], STREAM_DEFAULT); } AcReal -acReduceScal(const ReductionType rtype, const VertexBufferHandle vtxbuffer_handle) +acReduceScal(const ReductionType rtype, const VertexBufferHandle vtxbuf_handle) { - acSynchronizeStream(STREAM_ALL); - - AcReal results[num_devices]; - // #pragma omp parallel for - for (int i = 0; i < num_devices; ++i) { - reduceScal(devices[i], STREAM_DEFAULT, rtype, vtxbuffer_handle, &results[i]); - } - - return simple_final_reduce_scal(rtype, results, num_devices); + AcReal result; + acNodeReduceScal(nodes[0], STREAM_DEFAULT, rtype, vtxbuf_handle, &result); + return result; } AcReal acReduceVec(const ReductionType rtype, const VertexBufferHandle a, const VertexBufferHandle b, const VertexBufferHandle c) { - acSynchronizeStream(STREAM_ALL); - - AcReal results[num_devices]; - // #pragma omp parallel for - for (int i = 0; i < num_devices; ++i) { - reduceVec(devices[i], STREAM_DEFAULT, rtype, a, b, c, &results[i]); - } - - return simple_final_reduce_scal(rtype, results, num_devices); + AcReal result; + acNodeReduceVec(nodes[0], STREAM_DEFAULT, rtype, a, b, c, &result); + return result; } AcResult -acLoadWithOffsetAsync(const AcMesh host_mesh, const int3 src, const int num_vertices, - const StreamType stream) +acStoreWithOffset(const int3 dst, const size_t num_vertices, AcMesh* host_mesh) { - // See the beginning of the file for an explanation of the index mapping - // #pragma omp parallel for - for (int i = 0; i < num_devices; ++i) { - const int3 d0 = (int3){0, 0, i * subgrid.n.z}; // DECOMPOSITION OFFSET HERE - const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.m.z}; - - const int3 s0 = src; - const int3 s1 = gridIdx3d(grid, gridIdx(grid, s0) + num_vertices); - - const int3 da = max(s0, d0); - const int3 db = min(s1, d1); - /* - printf("Device %d\n", i); - printf("\ts0: "); printInt3(s0); printf("\n"); - printf("\td0: "); printInt3(d0); printf("\n"); - printf("\tda: "); printInt3(da); printf("\n"); - printf("\tdb: "); printInt3(db); printf("\n"); - printf("\td1: "); printInt3(d1); printf("\n"); - printf("\ts1: "); printInt3(s1); printf("\n"); - printf("\t-> %s to device %d\n", db.z >= da.z ? 
"Copy" : "Do not copy", i); - */ - if (db.z >= da.z) { - const int copy_cells = gridIdx(subgrid, db) - gridIdx(subgrid, da); - // DECOMPOSITION OFFSET HERE - const int3 da_local = (int3){da.x, da.y, da.z - i * grid.n.z / num_devices}; - // printf("\t\tcopy %d cells to local index ", copy_cells); printInt3(da_local); - // printf("\n"); - copyMeshToDevice(devices[i], stream, host_mesh, da, da_local, copy_cells); - } - // printf("\n"); - } - return AC_SUCCESS; + return acNodeStoreMeshWithOffset(nodes[0], STREAM_DEFAULT, dst, dst, num_vertices, host_mesh); } - -AcResult -acLoadWithOffset(const AcMesh host_mesh, const int3 src, const int num_vertices) -{ - return acLoadWithOffsetAsync(host_mesh, src, num_vertices, STREAM_DEFAULT); -} - -AcResult -acLoad(const AcMesh host_mesh) -{ - acLoadWithOffset(host_mesh, (int3){0, 0, 0}, acVertexBufferSize(host_mesh.info)); - acSynchronizeStream(STREAM_ALL); - return AC_SUCCESS; -} - -AcResult -acStoreWithOffsetAsync(const int3 src, const int num_vertices, AcMesh* host_mesh, - const StreamType stream) -{ - // See the beginning of the file for an explanation of the index mapping - // #pragma omp parallel for - for (int i = 0; i < num_devices; ++i) { - const int3 d0 = (int3){0, 0, i * subgrid.n.z}; // DECOMPOSITION OFFSET HERE - const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.m.z}; - - const int3 s0 = src; - const int3 s1 = gridIdx3d(grid, gridIdx(grid, s0) + num_vertices); - - const int3 da = max(s0, d0); - const int3 db = min(s1, d1); - if (db.z >= da.z) { - const int copy_cells = gridIdx(subgrid, db) - gridIdx(subgrid, da); - // DECOMPOSITION OFFSET HERE - const int3 da_local = (int3){da.x, da.y, da.z - i * grid.n.z / num_devices}; - copyMeshToHost(devices[i], stream, da_local, da, copy_cells, host_mesh); - } - } - return AC_SUCCESS; -} - -AcResult -acStoreWithOffset(const int3 src, const int num_vertices, AcMesh* host_mesh) -{ - return acStoreWithOffsetAsync(src, num_vertices, host_mesh, STREAM_DEFAULT); -} - -AcResult -acStore(AcMesh* host_mesh) -{ - acStoreWithOffset((int3){0, 0, 0}, acVertexBufferSize(host_mesh->info), host_mesh); - acSynchronizeStream(STREAM_ALL); - return AC_SUCCESS; -} - -AcResult -acLoadDeviceConstantAsync(const AcRealParam param, const AcReal value, const StreamType stream) -{ - // #pragma omp parallel for - for (int i = 0; i < num_devices; ++i) { - loadDeviceConstant(devices[i], stream, param, value); - } - return AC_SUCCESS; -} - -AcResult -acLoadDeviceConstant(const AcRealParam param, const AcReal value) -{ - return acLoadDeviceConstantAsync(param, value, STREAM_DEFAULT); -} - -/* - * ============================================================================= - * Revised interface - * ============================================================================= - */ diff --git a/src/core/device.cu b/src/core/device.cu index fdd9d74..c81d468 100644 --- a/src/core/device.cu +++ b/src/core/device.cu @@ -24,7 +24,7 @@ * Detailed info. 
* */ -#include "device.cuh" +#include "astaroth_device.h" #include "errchk.h" @@ -40,19 +40,19 @@ typedef struct { } VertexBufferArray; __constant__ AcMeshInfo d_mesh_info; -__constant__ int3 d_multigpu_offset; -__constant__ Grid globalGrid; #define DCONST_INT(X) (d_mesh_info.int_params[X]) #define DCONST_INT3(X) (d_mesh_info.int3_params[X]) #define DCONST_REAL(X) (d_mesh_info.real_params[X]) #define DCONST_REAL3(X) (d_mesh_info.real3_params[X]) #define DEVICE_VTXBUF_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_mx) + (k)*DCONST_INT(AC_mxy)) #define DEVICE_1D_COMPDOMAIN_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_nx) + (k)*DCONST_INT(AC_nxy)) +#define globalGridN (d_mesh_info.int3_params[AC_global_grid_n]) +#define d_multigpu_offset (d_mesh_info.int3_params[AC_multigpu_offset]) #include "kernels/boundconds.cuh" #include "kernels/integration.cuh" #include "kernels/reductions.cuh" -static dim3 rk3_tpb = (dim3){32, 1, 4}; +static dim3 rk3_tpb(32, 1, 4); #if PACKED_DATA_TRANSFERS // Defined in device.cuh // #include "kernels/pack_unpack.cuh" @@ -76,8 +76,96 @@ struct device_s { #endif }; +// clang-format off +static __global__ void dummy_kernel(void) {} +// clang-format on + AcResult -printDeviceInfo(const Device device) +acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device_handle) +{ + cudaSetDevice(id); + cudaDeviceReset(); + + // Create Device + struct device_s* device = (struct device_s*)malloc(sizeof(*device)); + ERRCHK_ALWAYS(device); + + device->id = id; + device->local_config = device_config; + + // Check that the code was compiled for the proper GPU architecture + printf("Trying to run a dummy kernel. If this fails, make sure that your\n" + "device supports the CUDA architecture you are compiling for.\n" + "Running dummy kernel... "); + fflush(stdout); + dummy_kernel<<<1, 1>>>(); + ERRCHK_CUDA_KERNEL_ALWAYS(); + printf("Success!\n"); + + // Concurrency + for (int i = 0; i < NUM_STREAM_TYPES; ++i) { + cudaStreamCreateWithPriority(&device->streams[i], cudaStreamNonBlocking, 0); + } + + // Memory + const size_t vba_size_bytes = acVertexBufferSizeBytes(device_config); + for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { + ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.in[i], vba_size_bytes)); + ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.out[i], vba_size_bytes)); + } + ERRCHK_CUDA_ALWAYS( + cudaMalloc(&device->reduce_scratchpad, acVertexBufferCompdomainSizeBytes(device_config))); + ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->reduce_result, sizeof(AcReal))); + +#if PACKED_DATA_TRANSFERS +// Allocate data required for packed transfers here (cudaMalloc) +#endif + + // Device constants + ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_mesh_info, &device_config, sizeof(device_config), 0, + cudaMemcpyHostToDevice)); + + printf("Created device %d (%p)\n", device->id, device); + *device_handle = device; + + // Autoptimize + if (id == 0) { + acDeviceAutoOptimize(device); + } + + return AC_SUCCESS; +} + +AcResult +acDeviceDestroy(Device device) +{ + cudaSetDevice(device->id); + printf("Destroying device %d (%p)\n", device->id, device); + + // Memory + for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { + cudaFree(device->vba.in[i]); + cudaFree(device->vba.out[i]); + } + cudaFree(device->reduce_scratchpad); + cudaFree(device->reduce_result); + +#if PACKED_DATA_TRANSFERS +// Free data required for packed tranfers here (cudaFree) +#endif + + // Concurrency + for (int i = 0; i < NUM_STREAM_TYPES; ++i) { + cudaStreamDestroy(device->streams[i]); + } + + // Destroy Device + free(device); + return AC_SUCCESS; +} + +AcResult 
+acDevicePrintInfo(const Device device) { const int device_id = device->id; @@ -136,305 +224,8 @@ printDeviceInfo(const Device device) return AC_SUCCESS; } -static __global__ void -dummy_kernel(void) -{ -} - AcResult -createDevice(const int id, const AcMeshInfo device_config, Device* device_handle) -{ - cudaSetDevice(id); - cudaDeviceReset(); - - // Create Device - struct device_s* device = (struct device_s*)malloc(sizeof(*device)); - ERRCHK_ALWAYS(device); - - device->id = id; - device->local_config = device_config; - - // Check that the code was compiled for the proper GPU architecture - printf("Trying to run a dummy kernel. If this fails, make sure that your\n" - "device supports the CUDA architecture you are compiling for.\n" - "Running dummy kernel... "); - fflush(stdout); - dummy_kernel<<<1, 1>>>(); - ERRCHK_CUDA_KERNEL_ALWAYS(); - printf("Success!\n"); - - // Concurrency - for (int i = 0; i < NUM_STREAM_TYPES; ++i) { - cudaStreamCreateWithPriority(&device->streams[i], cudaStreamNonBlocking, 0); - } - - // Memory - const size_t vba_size_bytes = acVertexBufferSizeBytes(device_config); - for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { - ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.in[i], vba_size_bytes)); - ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.out[i], vba_size_bytes)); - } - ERRCHK_CUDA_ALWAYS( - cudaMalloc(&device->reduce_scratchpad, acVertexBufferCompdomainSizeBytes(device_config))); - ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->reduce_result, sizeof(AcReal))); - -#if PACKED_DATA_TRANSFERS -// Allocate data required for packed transfers here (cudaMalloc) -#endif - - // Device constants - ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_mesh_info, &device_config, sizeof(device_config), 0, - cudaMemcpyHostToDevice)); - - // Multi-GPU offset. This is used to compute globalVertexIdx. - // Might be better to calculate this in astaroth.cu instead of here, s.t. 
- // everything related to the decomposition is limited to the multi-GPU layer - const int3 multigpu_offset = (int3){0, 0, device->id * device->local_config.int_params[AC_nz]}; - ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_multigpu_offset, &multigpu_offset, - sizeof(multigpu_offset), 0, cudaMemcpyHostToDevice)); - - printf("Created device %d (%p)\n", device->id, device); - *device_handle = device; - - // Autoptimize - if (id == 0) - autoOptimize(device); - - return AC_SUCCESS; -} - -AcResult -destroyDevice(Device device) -{ - cudaSetDevice(device->id); - printf("Destroying device %d (%p)\n", device->id, device); - - // Memory - for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { - cudaFree(device->vba.in[i]); - cudaFree(device->vba.out[i]); - } - cudaFree(device->reduce_scratchpad); - cudaFree(device->reduce_result); - -#if PACKED_DATA_TRANSFERS -// Free data required for packed tranfers here (cudaFree) -#endif - - // Concurrency - for (int i = 0; i < NUM_STREAM_TYPES; ++i) { - cudaStreamDestroy(device->streams[i]); - } - - // Destroy Device - free(device); - return AC_SUCCESS; -} - -AcResult -boundcondStep(const Device device, const StreamType stream_type, const int3& start, const int3& end) -{ - cudaSetDevice(device->id); - for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { - periodic_boundconds(device->streams[stream_type], start, end, device->vba.in[i]); - } - return AC_SUCCESS; -} - -AcResult -reduceScal(const Device device, const StreamType stream_type, const ReductionType rtype, - const VertexBufferHandle vtxbuf_handle, AcReal* result) -{ - cudaSetDevice(device->id); - - const int3 start = (int3){device->local_config.int_params[AC_nx_min], - device->local_config.int_params[AC_ny_min], - device->local_config.int_params[AC_nz_min]}; - - const int3 end = (int3){device->local_config.int_params[AC_nx_max], - device->local_config.int_params[AC_ny_max], - device->local_config.int_params[AC_nz_max]}; - - *result = reduce_scal(device->streams[stream_type], rtype, start, end, - device->vba.in[vtxbuf_handle], device->reduce_scratchpad, - device->reduce_result); - return AC_SUCCESS; -} - -AcResult -reduceVec(const Device device, const StreamType stream_type, const ReductionType rtype, - const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1, - const VertexBufferHandle vtxbuf2, AcReal* result) -{ - cudaSetDevice(device->id); - - const int3 start = (int3){device->local_config.int_params[AC_nx_min], - device->local_config.int_params[AC_ny_min], - device->local_config.int_params[AC_nz_min]}; - - const int3 end = (int3){device->local_config.int_params[AC_nx_max], - device->local_config.int_params[AC_ny_max], - device->local_config.int_params[AC_nz_max]}; - - *result = reduce_vec(device->streams[stream_type], rtype, start, end, device->vba.in[vtxbuf0], - device->vba.in[vtxbuf1], device->vba.in[vtxbuf2], - device->reduce_scratchpad, device->reduce_result); - return AC_SUCCESS; -} - -AcResult -rkStep(const Device device, const StreamType stream_type, const int step_number, const int3& start, - const int3& end, const AcReal dt) -{ - cudaSetDevice(device->id); - - // const dim3 tpb(32, 1, 4); - const dim3 tpb = rk3_tpb; - - const int3 n = end - start; - const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), // - (unsigned int)ceil(n.y / AcReal(tpb.y)), // - (unsigned int)ceil(n.z / AcReal(tpb.z))); - - if (step_number == 0) - solve<0><<streams[stream_type]>>>(start, end, device->vba, dt); - else if (step_number == 1) - solve<1><<streams[stream_type]>>>(start, end, device->vba, dt); - else - 
solve<2><<streams[stream_type]>>>(start, end, device->vba, dt); - - ERRCHK_CUDA_KERNEL(); - - return AC_SUCCESS; -} - -AcResult -synchronize(const Device device, const StreamType stream_type) -{ - cudaSetDevice(device->id); - if (stream_type == STREAM_ALL) { - cudaDeviceSynchronize(); - } - else { - cudaStreamSynchronize(device->streams[stream_type]); - } - return AC_SUCCESS; -} - -static AcResult -loadWithOffset(const Device device, const StreamType stream_type, const AcReal* src, - const size_t bytes, AcReal* dst) -{ - cudaSetDevice(device->id); - ERRCHK_CUDA( - cudaMemcpyAsync(dst, src, bytes, cudaMemcpyHostToDevice, device->streams[stream_type])); - return AC_SUCCESS; -} - -static AcResult -storeWithOffset(const Device device, const StreamType stream_type, const AcReal* src, - const size_t bytes, AcReal* dst) -{ - cudaSetDevice(device->id); - ERRCHK_CUDA( - cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDeviceToHost, device->streams[stream_type])); - return AC_SUCCESS; -} - -AcResult -copyMeshToDevice(const Device device, const StreamType stream_type, const AcMesh& host_mesh, - const int3& src, const int3& dst, const int num_vertices) -{ - const size_t src_idx = acVertexBufferIdx(src.x, src.y, src.z, host_mesh.info); - const size_t dst_idx = acVertexBufferIdx(dst.x, dst.y, dst.z, device->local_config); - for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { - loadWithOffset(device, stream_type, &host_mesh.vertex_buffer[i][src_idx], - num_vertices * sizeof(AcReal), &device->vba.in[i][dst_idx]); - } - return AC_SUCCESS; -} - -AcResult -copyMeshToHost(const Device device, const StreamType stream_type, const int3& src, const int3& dst, - const int num_vertices, AcMesh* host_mesh) -{ - const size_t src_idx = acVertexBufferIdx(src.x, src.y, src.z, device->local_config); - const size_t dst_idx = acVertexBufferIdx(dst.x, dst.y, dst.z, host_mesh->info); - for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { - storeWithOffset(device, stream_type, &device->vba.in[i][src_idx], - num_vertices * sizeof(AcReal), &host_mesh->vertex_buffer[i][dst_idx]); - } - return AC_SUCCESS; -} - -AcResult -copyMeshDeviceToDevice(const Device src_device, const StreamType stream_type, const int3& src, - Device dst_device, const int3& dst, const int num_vertices) -{ - cudaSetDevice(src_device->id); - const size_t src_idx = acVertexBufferIdx(src.x, src.y, src.z, src_device->local_config); - const size_t dst_idx = acVertexBufferIdx(dst.x, dst.y, dst.z, dst_device->local_config); - - for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { - ERRCHK_CUDA(cudaMemcpyPeerAsync(&dst_device->vba.in[i][dst_idx], dst_device->id, - &src_device->vba.in[i][src_idx], src_device->id, - sizeof(src_device->vba.in[i][0]) * num_vertices, - src_device->streams[stream_type])); - } - return AC_SUCCESS; -} - -AcResult -swapBuffers(const Device device) -{ - cudaSetDevice(device->id); - for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { - AcReal* tmp = device->vba.in[i]; - device->vba.in[i] = device->vba.out[i]; - device->vba.out[i] = tmp; - } - return AC_SUCCESS; -} - -AcResult -loadDeviceConstant(const Device device, const StreamType stream_type, const AcIntParam param, - const int value) -{ - cudaSetDevice(device->id); - // CUDA 10 apparently creates only a single name for a device constant (d_mesh_info here) - // and something like d_mesh_info.real_params[] cannot be directly accessed. - // Therefore we have to obfuscate the code a bit and compute the offset address before - // invoking cudaMemcpyToSymbol. 
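/* The offset technique described above also carries over to the new int3 device constants
 * (AC_global_grid_n and AC_multigpu_offset, which back the globalGridN and
 * d_multigpu_offset macros in the new device.cu). A hypothetical loader sketched for
 * review, assuming a generated AcInt3Param enum (not part of the patch):
 */
static AcResult
acDeviceLoadInt3Constant(const Device device, const Stream stream, const AcInt3Param param,
                         const int3 value)
{
    cudaSetDevice(device->id);
    // d_mesh_info.int3_params[param] cannot be named as a __constant__ symbol directly,
    // so compute its byte offset within d_mesh_info and copy to that offset.
    const size_t offset = (size_t)&d_mesh_info.int3_params[param] - (size_t)&d_mesh_info;
    ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
                                        cudaMemcpyHostToDevice, device->streams[stream]));
    return AC_SUCCESS;
}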
- const size_t offset = (size_t)&d_mesh_info.int_params[param] - (size_t)&d_mesh_info; - ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset, - cudaMemcpyHostToDevice, - device->streams[stream_type])); - return AC_SUCCESS; -} - -AcResult -loadDeviceConstant(const Device device, const StreamType stream_type, const AcRealParam param, - const AcReal value) -{ - cudaSetDevice(device->id); - const size_t offset = (size_t)&d_mesh_info.real_params[param] - (size_t)&d_mesh_info; - ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset, - cudaMemcpyHostToDevice, - device->streams[stream_type])); - return AC_SUCCESS; -} - -AcResult -loadGlobalGrid(const Device device, const Grid grid) -{ - cudaSetDevice(device->id); - ERRCHK_CUDA_ALWAYS( - cudaMemcpyToSymbol(globalGrid, &grid, sizeof(grid), 0, cudaMemcpyHostToDevice)); - return AC_SUCCESS; -} - -AcResult -autoOptimize(const Device device) +acDeviceAutoOptimize(const Device device) { cudaSetDevice(device->id); @@ -513,12 +304,303 @@ autoOptimize(const Device device) return AC_SUCCESS; } +AcResult +acDeviceSynchronizeStream(const Device device, const Stream stream) +{ + cudaSetDevice(device->id); + if (stream == STREAM_ALL) { + cudaDeviceSynchronize(); + } + else { + cudaStreamSynchronize(device->streams[stream]); + } + return AC_SUCCESS; +} + +AcResult +acDeviceSwapBuffers(const Device device) +{ + cudaSetDevice(device->id); + for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { + AcReal* tmp = device->vba.in[i]; + device->vba.in[i] = device->vba.out[i]; + device->vba.out[i] = tmp; + } + return AC_SUCCESS; +} + +AcResult +acDeviceLoadConstant(const Device device, const Stream stream, const AcRealParam param, + const AcReal value) +{ + cudaSetDevice(device->id); + const size_t offset = (size_t)&d_mesh_info.real_params[param] - (size_t)&d_mesh_info; + ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset, + cudaMemcpyHostToDevice, device->streams[stream])); + return AC_SUCCESS; +} + +AcResult +acDeviceLoadVertexBufferWithOffset(const Device device, const Stream stream, const AcMesh host_mesh, + const VertexBufferHandle vtxbuf_handle, const int3 src, + const int3 dst, const int num_vertices) +{ + cudaSetDevice(device->id); + const size_t src_idx = acVertexBufferIdx(src.x, src.y, src.z, host_mesh.info); + const size_t dst_idx = acVertexBufferIdx(dst.x, dst.y, dst.z, device->local_config); + + const AcReal* src_ptr = &host_mesh.vertex_buffer[vtxbuf_handle][src_idx]; + AcReal* dst_ptr = &device->vba.in[vtxbuf_handle][dst_idx]; + const size_t bytes = num_vertices * sizeof(src_ptr[0]); + + ERRCHK_CUDA( // + cudaMemcpyAsync(dst_ptr, src_ptr, bytes, cudaMemcpyHostToDevice, device->streams[stream]) // + ); + + return AC_SUCCESS; +} + +AcResult +acDeviceLoadMeshWithOffset(const Device device, const Stream stream, const AcMesh host_mesh, + const int3 src, const int3 dst, const int num_vertices) +{ + WARNING("This function is deprecated"); + for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { + acDeviceLoadVertexBufferWithOffset(device, stream, host_mesh, (VertexBufferHandle)i, src, + dst, num_vertices); + } + return AC_SUCCESS; +} + +AcResult +acDeviceLoadVertexBuffer(const Device device, const Stream stream, const AcMesh host_mesh, + const VertexBufferHandle vtxbuf_handle) +{ + const int3 src = (int3){0, 0, 0}; + const int3 dst = src; + const size_t num_vertices = acVertexBufferSize(device->local_config); + acDeviceLoadVertexBufferWithOffset(device, stream, host_mesh, vtxbuf_handle, 
src, dst, + num_vertices); + + return AC_SUCCESS; +} + +AcResult +acDeviceLoadMesh(const Device device, const Stream stream, const AcMesh host_mesh) +{ + WARNING("This function is deprecated"); + for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { + acDeviceLoadVertexBuffer(device, stream, host_mesh, (VertexBufferHandle)i); + } + + return AC_SUCCESS; +} + +AcResult +acDeviceStoreVertexBufferWithOffset(const Device device, const Stream stream, + const VertexBufferHandle vtxbuf_handle, const int3 src, + const int3 dst, const int num_vertices, AcMesh* host_mesh) +{ + cudaSetDevice(device->id); + const size_t src_idx = acVertexBufferIdx(src.x, src.y, src.z, device->local_config); + const size_t dst_idx = acVertexBufferIdx(dst.x, dst.y, dst.z, host_mesh->info); + + const AcReal* src_ptr = &device->vba.in[vtxbuf_handle][src_idx]; + AcReal* dst_ptr = &host_mesh->vertex_buffer[vtxbuf_handle][dst_idx]; + const size_t bytes = num_vertices * sizeof(src_ptr[0]); + + ERRCHK_CUDA( // + cudaMemcpyAsync(dst_ptr, src_ptr, bytes, cudaMemcpyDeviceToHost, device->streams[stream]) // + ); + + return AC_SUCCESS; +} + +AcResult +acDeviceStoreMeshWithOffset(const Device device, const Stream stream, const int3 src, + const int3 dst, const int num_vertices, AcMesh* host_mesh) +{ + WARNING("This function is deprecated"); + for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { + acDeviceStoreVertexBufferWithOffset(device, stream, (VertexBufferHandle)i, src, dst, + num_vertices, host_mesh); + } + + return AC_SUCCESS; +} + +AcResult +acDeviceStoreVertexBuffer(const Device device, const Stream stream, + const VertexBufferHandle vtxbuf_handle, AcMesh* host_mesh) +{ + int3 src = (int3){0, 0, 0}; + int3 dst = src; + const size_t num_vertices = acVertexBufferSize(device->local_config); + + acDeviceStoreVertexBufferWithOffset(device, stream, vtxbuf_handle, src, dst, num_vertices, + host_mesh); + + return AC_SUCCESS; +} + +AcResult +acDeviceStoreMesh(const Device device, const Stream stream, AcMesh* host_mesh) +{ + WARNING("This function is deprecated"); + for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { + acDeviceStoreVertexBuffer(device, stream, (VertexBufferHandle)i, host_mesh); + } + + return AC_SUCCESS; +} + +AcResult +acDeviceTransferVertexBufferWithOffset(const Device src_device, const Stream stream, + const VertexBufferHandle vtxbuf_handle, const int3 src, + const int3 dst, const int num_vertices, Device dst_device) +{ + cudaSetDevice(src_device->id); + const size_t src_idx = acVertexBufferIdx(src.x, src.y, src.z, src_device->local_config); + const size_t dst_idx = acVertexBufferIdx(dst.x, dst.y, dst.z, dst_device->local_config); + + const AcReal* src_ptr = &src_device->vba.in[vtxbuf_handle][src_idx]; + AcReal* dst_ptr = &dst_device->vba.in[vtxbuf_handle][dst_idx]; + const size_t bytes = num_vertices * sizeof(src_ptr[0]); + + ERRCHK_CUDA(cudaMemcpyPeerAsync(dst_ptr, dst_device->id, src_ptr, src_device->id, bytes, + src_device->streams[stream])); + return AC_SUCCESS; +} + +AcResult +acDeviceTransferMeshWithOffset(const Device src_device, const Stream stream, const int3 src, + const int3 dst, const int num_vertices, Device dst_device) +{ + WARNING("This function is deprecated"); + for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { + acDeviceTransferVertexBufferWithOffset(src_device, stream, (VertexBufferHandle)i, src, dst, + num_vertices, dst_device); + } + return AC_SUCCESS; +} + +AcResult +acDeviceTransferVertexBuffer(const Device src_device, const Stream stream, + const VertexBufferHandle vtxbuf_handle, Device dst_device) +{ + int3 
src = (int3){0, 0, 0};
+    int3 dst = src;
+    const size_t num_vertices = acVertexBufferSize(src_device->local_config);
+
+    acDeviceTransferVertexBufferWithOffset(src_device, stream, vtxbuf_handle, src, dst,
+                                           num_vertices, dst_device);
+    return AC_SUCCESS;
+}
+
+AcResult
+acDeviceTransferMesh(const Device src_device, const Stream stream, Device dst_device)
+{
+    WARNING("This function is deprecated");
+    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
+        acDeviceTransferVertexBuffer(src_device, stream, (VertexBufferHandle)i, dst_device);
+    }
+    return AC_SUCCESS;
+}
+
+AcResult
+acDeviceIntegrateSubstep(const Device device, const Stream stream, const int step_number,
+                         const int3 start, const int3 end, const AcReal dt)
+{
+    cudaSetDevice(device->id);
+
+    const dim3 tpb = rk3_tpb;
+
+    const int3 n = end - start;
+    const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), //
+                   (unsigned int)ceil(n.y / AcReal(tpb.y)), //
+                   (unsigned int)ceil(n.z / AcReal(tpb.z)));
+
+    if (step_number == 0)
+        solve<0><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba, dt);
+    else if (step_number == 1)
+        solve<1><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba, dt);
+    else
+        solve<2><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba, dt);
+
+    ERRCHK_CUDA_KERNEL();
+
+    return AC_SUCCESS;
+}
+
+AcResult
+acDevicePeriodicBoundcondStep(const Device device, const Stream stream_type,
+                              const VertexBufferHandle vtxbuf_handle, const int3 start,
+                              const int3 end)
+{
+    cudaSetDevice(device->id);
+    const cudaStream_t stream = device->streams[stream_type];
+
+    const dim3 tpb(8, 2, 8);
+    const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x),
+                   (unsigned int)ceil((end.y - start.y) / (float)tpb.y),
+                   (unsigned int)ceil((end.z - start.z) / (float)tpb.z));
+
+    kernel_periodic_boundconds<<<bpg, tpb, 0, stream>>>(start, end, device->vba.in[vtxbuf_handle]);
+    ERRCHK_CUDA_KERNEL();
+
+    return AC_SUCCESS;
+}
+
+AcResult
+acDevicePeriodicBoundconds(const Device device, const Stream stream, const int3 start,
+                           const int3 end)
+{
+    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
+        acDevicePeriodicBoundcondStep(device, stream, (VertexBufferHandle)i, start, end);
+    }
+    return AC_SUCCESS;
+}
+
+AcResult
+acDeviceReduceScal(const Device device, const Stream stream, const ReductionType rtype,
+                   const VertexBufferHandle vtxbuf_handle, AcReal* result)
+{
+    cudaSetDevice(device->id);
+
+    const int3 start = (int3){device->local_config.int_params[AC_nx_min],
+                              device->local_config.int_params[AC_ny_min],
+                              device->local_config.int_params[AC_nz_min]};
+
+    const int3 end = (int3){device->local_config.int_params[AC_nx_max],
+                            device->local_config.int_params[AC_ny_max],
+                            device->local_config.int_params[AC_nz_max]};
+
+    *result = reduce_scal(device->streams[stream], rtype, start, end, device->vba.in[vtxbuf_handle],
+                          device->reduce_scratchpad, device->reduce_result);
+    return AC_SUCCESS;
+}
+
+AcResult
+acDeviceReduceVec(const Device device, const Stream stream, const ReductionType rtype,
+                  const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1,
+                  const VertexBufferHandle vtxbuf2, AcReal* result)
+{
+    cudaSetDevice(device->id);
+
+    const int3 start = (int3){device->local_config.int_params[AC_nx_min],
+                              device->local_config.int_params[AC_ny_min],
+                              device->local_config.int_params[AC_nz_min]};
+
+    const int3 end = (int3){device->local_config.int_params[AC_nx_max],
+                            device->local_config.int_params[AC_ny_max],
+                            device->local_config.int_params[AC_nz_max]};
+
+    *result = reduce_vec(device->streams[stream], rtype, start, end, device->vba.in[vtxbuf0],
device->vba.in[vtxbuf1], device->vba.in[vtxbuf2], + device->reduce_scratchpad, device->reduce_result); + return AC_SUCCESS; +} + #if PACKED_DATA_TRANSFERS // Functions for calling packed data transfers #endif - -/* - * ============================================================================= - * Revised interface - * ============================================================================= - */ diff --git a/src/core/device.cuh b/src/core/device.cuh deleted file mode 100644 index 7f1fad4..0000000 --- a/src/core/device.cuh +++ /dev/null @@ -1,107 +0,0 @@ -/* - Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae. - - This file is part of Astaroth. - - Astaroth is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - Astaroth is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with Astaroth. If not, see . -*/ - -/** - * @file - * \brief Brief info. - * - * Detailed info. - * - */ -#pragma once -#include "astaroth.h" - -typedef struct { - int3 m; - int3 n; -} Grid; - -typedef struct device_s* Device; // Opaque pointer to device_s. Analogous to dispatchable handles - // in Vulkan, f.ex. VkDevice - -/** */ -AcResult printDeviceInfo(const Device device); - -/** */ -AcResult createDevice(const int id, const AcMeshInfo device_config, Device* device); - -/** */ -AcResult destroyDevice(Device device); - -/** */ -AcResult boundcondStep(const Device device, const StreamType stream_type, const int3& start, - const int3& end); - -/** */ -AcResult reduceScal(const Device device, const StreamType stream_type, const ReductionType rtype, - const VertexBufferHandle vtxbuf_handle, AcReal* result); - -/** */ -AcResult reduceVec(const Device device, const StreamType stream_type, const ReductionType rtype, - const VertexBufferHandle vec0, const VertexBufferHandle vec1, - const VertexBufferHandle vec2, AcReal* result); - -/** */ -AcResult rkStep(const Device device, const StreamType stream_type, const int step_number, - const int3& start, const int3& end, const AcReal dt); - -/** Sychronizes the device with respect to stream_type. If STREAM_ALL is given as - a StreamType, the function synchronizes all streams on the device. 
*/ -AcResult synchronize(const Device device, const StreamType stream_type); - -/** */ -AcResult copyMeshToDevice(const Device device, const StreamType stream_type, - const AcMesh& host_mesh, const int3& src, const int3& dst, - const int num_vertices); - -/** */ -AcResult copyMeshToHost(const Device device, const StreamType stream_type, const int3& src, - const int3& dst, const int num_vertices, AcMesh* host_mesh); - -/** */ -AcResult copyMeshDeviceToDevice(const Device src, const StreamType stream_type, const int3& src_idx, - Device dst, const int3& dst_idx, const int num_vertices); - -/** Swaps the input/output buffers used in computations */ -AcResult swapBuffers(const Device device); - -/** */ -AcResult loadDeviceConstant(const Device device, const StreamType stream_type, - const AcIntParam param, const int value); - -/** */ -AcResult loadDeviceConstant(const Device device, const StreamType stream_type, - const AcRealParam param, const AcReal value); - -/** */ -AcResult loadGlobalGrid(const Device device, const Grid grid); - -/** */ -AcResult autoOptimize(const Device device); - -// #define PACKED_DATA_TRANSFERS (1) %JP: placeholder for optimized ghost zone packing and transfers -#if PACKED_DATA_TRANSFERS -// Declarations used for packed data transfers -#endif - -/* - * ============================================================================= - * Revised interface - * ============================================================================= - */ diff --git a/src/core/grid.cu b/src/core/grid.cu new file mode 100644 index 0000000..359fe00 --- /dev/null +++ b/src/core/grid.cu @@ -0,0 +1,208 @@ +/* + Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae. + + This file is part of Astaroth. + + Astaroth is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Astaroth is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Astaroth. If not, see . 
+*/
+#include "astaroth_grid.h"
+
+#include "astaroth_node.h"
+
+const size_t MAX_NUM_NODES = 32;
+size_t num_nodes = 0;
+static Node nodes[MAX_NUM_NODES];
+
+/** */
+AcResult
+acGridInit(const AcMeshInfo node_config)
+{
+    acNodeCreate(0, node_config, &nodes[0]);
+    ++num_nodes;
+    WARNING("Proper multinode not yet implemented");
+    return AC_FAILURE;
+}
+
+/** */
+AcResult
+acGridQuit(void)
+{
+    acNodeDestroy(nodes[0]);
+    --num_nodes;
+    WARNING("Proper multinode not yet implemented");
+    return AC_FAILURE;
+}
+
+/** */
+AcResult
+acGridSynchronizeStream(const Stream stream)
+{
+    for (size_t i = 0; i < num_nodes; ++i) {
+        acNodeSynchronizeStream(nodes[i], stream);
+    }
+    WARNING("Proper multinode not yet implemented");
+    return AC_FAILURE;
+}
+
+/** */
+AcResult
+acGridSwapBuffers(void)
+{
+    for (size_t i = 0; i < num_nodes; ++i) {
+        acNodeSwapBuffers(nodes[i]);
+    }
+    WARNING("Proper multinode not yet implemented");
+    return AC_FAILURE;
+}
+
+/** */
+AcResult
+acGridLoadConstant(const Stream stream, const AcRealParam param, const AcReal value)
+{
+    for (size_t i = 0; i < num_nodes; ++i) {
+        acNodeLoadConstant(nodes[i], stream, param, value);
+    }
+    WARNING("Proper multinode not yet implemented");
+    return AC_FAILURE;
+}
+
+/** */
+AcResult
+acGridLoadVertexBufferWithOffset(const Stream stream, const AcMesh host_mesh,
+                                 const VertexBufferHandle vtxbuf_handle, const int3 src,
+                                 const int3 dst, const int num_vertices)
+{
+    for (size_t i = 0; i < num_nodes; ++i) {
+        acNodeLoadVertexBufferWithOffset(nodes[i], stream, host_mesh, vtxbuf_handle, src, dst,
+                                         num_vertices);
+    }
+    WARNING("Proper multinode not yet implemented");
+    return AC_FAILURE;
+}
+
+/** */
+AcResult
+acGridLoadMeshWithOffset(const Stream stream, const AcMesh host_mesh, const int3 src,
+                         const int3 dst, const int num_vertices)
+{
+    for (size_t i = 0; i < num_nodes; ++i) {
+        acNodeLoadMeshWithOffset(nodes[i], stream, host_mesh, src, dst, num_vertices);
+    }
+    WARNING("Proper multinode not yet implemented");
+    return AC_FAILURE;
+}
+
+/** */
+AcResult
+acGridLoadVertexBuffer(const Stream stream, const AcMesh host_mesh,
+                       const VertexBufferHandle vtxbuf_handle)
+{
+    for (size_t i = 0; i < num_nodes; ++i) {
+        acNodeLoadVertexBuffer(nodes[i], stream, host_mesh, vtxbuf_handle);
+    }
+    WARNING("Proper multinode not yet implemented");
+    return AC_FAILURE;
+}
+
+/** */
+AcResult
+acGridLoadMesh(const Stream stream, const AcMesh host_mesh)
+{
+    WARNING("Not implemented");
+    return AC_FAILURE;
+}
+
+/** */
+AcResult
+acGridStoreVertexBufferWithOffset(const Stream stream, const VertexBufferHandle vtxbuf_handle,
+                                  const int3 src, const int3 dst, const int num_vertices,
+                                  AcMesh* host_mesh)
+{
+    WARNING("Not implemented");
+    return AC_FAILURE;
+}
+
+/** */
+AcResult
+acGridStoreMeshWithOffset(const Stream stream, const int3 src, const int3 dst,
+                          const int num_vertices, AcMesh* host_mesh)
+{
+    WARNING("Not implemented");
+    return AC_FAILURE;
+}
+
+/** */
+AcResult
+acGridStoreVertexBuffer(const Stream stream, const VertexBufferHandle vtxbuf_handle,
+                        AcMesh* host_mesh)
+{
+    WARNING("Not implemented");
+    return AC_FAILURE;
+}
+
+/** */
+AcResult
+acGridStoreMesh(const Stream stream, AcMesh* host_mesh)
+{
+    WARNING("Not implemented");
+    return AC_FAILURE;
+}
+
+/** */
+AcResult
+acGridIntegrateSubstep(const Stream stream, const int step_number, const int3 start, const int3 end,
+                       const AcReal dt)
+{
+    WARNING("Not implemented");
+    return AC_FAILURE;
+}
+
+/** */
+AcResult
+acGridIntegrateStep(const Stream stream, const AcReal dt)
+{
+    WARNING("Not implemented");
+    return AC_FAILURE;
+}
+
+/** */
+AcResult
+acGridPeriodicBoundcondStep(const Stream stream)
+{
+    WARNING("Not implemented");
+    return AC_FAILURE;
+}
+
+#if 0
+/** */
+AcResult acGridReduceScal(const Stream stream, const ReductionType rtype,
+                          const VertexBufferHandle vtxbuf_handle, AcReal* result);
+/** */
+AcResult acGridReduceVec(const Stream stream, const ReductionType rtype,
+                         const VertexBufferHandle vec0, const VertexBufferHandle vec1,
+                         const VertexBufferHandle vec2, AcReal* result);
+#endif
+/** */
+AcResult
+acGridReduceScal(const Stream stream, const ReductionType rtype,
+                 const VertexBufferHandle vtxbuf_handle, AcReal* result)
+{
+    WARNING("Not implemented");
+    return AC_FAILURE;
+}
+/** */
+AcResult
+acGridReduceVec(const Stream stream, const ReductionType rtype, const VertexBufferHandle vec0,
+                const VertexBufferHandle vec1, const VertexBufferHandle vec2, AcReal* result)
+{
+    WARNING("Not implemented");
+    return AC_FAILURE;
+}
diff --git a/src/core/node.cu b/src/core/node.cu
new file mode 100644
index 0000000..f9a4aa9
--- /dev/null
+++ b/src/core/node.cu
@@ -0,0 +1,818 @@
+/*
+    Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
+ * \brief Multi-GPU implementation.
+ *
+ %JP: The old way for computing boundary conditions conflicts with the
+ way we have to do things with multiple GPUs.
+
+ The older approach relied on unified memory, which represented the whole
+ memory area as one huge mesh instead of several smaller ones. However, unified memory
+ in its current state is more meant for quick prototyping when performance is not an issue.
+ Getting the CUDA driver to migrate data intelligently across GPUs is much more difficult
+ than when managing the memory explicitly.
+
+ In this new approach, I have simplified the multi- and single-GPU layers significantly.
+ Quick rundown:
+        New struct: Grid. There are two global variables, "grid" and "subgrid", which
+        contain the extents of the whole simulation domain and the decomposed grids,
+        respectively. To simplify things, we require that each GPU is assigned the same amount of
+        work, therefore each GPU in the node is assigned a "subgrid.m"-sized block of data to
+        work with.
+
+        The whole simulation domain is decomposed with respect to the z dimension.
+        For example, if the grid contains (nx, ny, nz) vertices, then the subgrids
+        contain (nx, ny, nz / num_devices) vertices.
+
+        A local index (i, j, k) in some subgrid can be mapped to the global grid with
+                global idx = (i, j, k + device_id * subgrid.n.z)
+
+ Terminology:
+        - Single-GPU function: a function defined on the single-GPU layer (device.cu)
+
+ Changes required to this commented code block:
+        - The thread block dimensions (tpb) are no longer passed to the kernel here but in
+        device.cu instead. Same holds for any complex index calculations.
+        Instead, the local coordinates should be passed as an int3 type without having to
+        consider how the data is actually laid out in device memory
+        - The unified memory buffer no longer exists (d_buffer). Instead, we have an opaque
+        handle of type "Device" which should be passed to single-GPU functions. In this file, all
+        devices are stored in a global array "devices[num_devices]".
+        - Every single-GPU function is executed asynchronously by default such that we
+        can optimize Astaroth by executing memory transactions concurrently with
+        computation. Therefore a StreamType should be passed as a parameter to single-GPU
+        functions.
+        Refresher: CUDA function calls are non-blocking when a stream is explicitly passed
+        as a parameter and commands executing in different streams can be processed
+        in parallel/concurrently.
+
+
+ Note on periodic boundaries (might be helpful when implementing other boundary conditions):
+
+ With multiple GPUs, periodic boundary conditions applied on indices ranging from
+
+        (0, 0, STENCIL_ORDER/2) to (subgrid.m.x, subgrid.m.y, subgrid.m.z - STENCIL_ORDER/2)
+
+ on a single device are "local", in the sense that they can be computed without
+ having to exchange data with neighboring GPUs. Special care is needed only for transferring
+ the data to the front and back plates outside this range. In the solution we use
+ here, we solve the local boundaries first, and then just exchange the front and back plates
+ in a "ring", like so
+                        device_id
+                (n) <-> 0 <-> 1 <-> ... <-> n <-> (0)
+
+### Throughout this file we use the following notation and names for various index offsets
+
+ Global coordinates: coordinates with respect to the global grid (static Grid grid)
+ Local coordinates: coordinates with respect to the local subgrid (static Subgrid subgrid)
+
+ s0, s1: source indices in global coordinates
+ d0, d1: destination indices in global coordinates
+ da = max(s0, d0);
+ db = min(s1, d1);
+
+ These are used in at least
+        acLoad()
+        acStore()
+        acSynchronizeHalos()
+
+ Here we decompose the host mesh and distribute it among the GPUs in
+ the node.
+
+ The host mesh is a huge contiguous block of data. Its dimensions are given by
+ the global variable named "grid". A "grid" is decomposed into "subgrids",
+ one for each GPU. Here we check which parts of the range s0...s1 map
+ to the memory space stored by some GPU, ranging d0...d1, and transfer
+ the data if needed.
+
+ The index mapping is inherently quite involved, but here's a picture which
+ hopefully helps make sense out of all this.
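[Editor's note] A compact sketch of the clamping described above, as it is later used by acNodeLoadVertexBufferWithOffset(): the global source range [s0, s1) is intersected with the block owned by device i, and the start of the overlap is converted to device-local coordinates. It assumes the Grid struct, gridIdx()/gridIdx3d() and the componentwise max/min from math_utils.h that appear further down in this file; not part of the patch.

    // Editor's sketch, not part of the patch.
    static int
    copy_range_for_device(const Grid grid, const Grid subgrid, const int i, const int3 s0,
                          const int num_vertices, int3* da_global, int3* da_local)
    {
        // Block of the global grid owned by device i (z-decomposition)
        const int3 d0 = (int3){0, 0, i * subgrid.n.z};
        const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.m.z};

        // End of the source range, and the overlap [da, db)
        const int3 s1 = gridIdx3d(grid, gridIdx(grid, s0) + num_vertices);
        const int3 da = max(s0, d0);
        const int3 db = min(s1, d1);
        if (db.z < da.z)
            return 0; // no overlap: nothing is copied to this device

        *da_global = da;                              // where the copy starts in the host mesh
        *da_local  = (int3){da.x, da.y, da.z - d0.z}; // the same vertex in subgrid coordinates
        return gridIdx(subgrid, db) - gridIdx(subgrid, da); // number of vertices to copy
    }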
+ + + Grid + |----num_vertices---| + xxx|....................................................|xxx + ^ ^ ^ ^ + d0 d1 s0 (src) s1 + + Subgrid + + xxx|.............|xxx + ^ ^ + d0 d1 + + ^ ^ + db da + * + */ +#include "astaroth_node.h" + +#include "astaroth_device.h" +#include "errchk.h" +#include "math_utils.h" // sum for reductions + +static const int MAX_NUM_DEVICES = 32; + +typedef struct { + int3 m; + int3 n; +} Grid; + +struct node_s { + int id; + + int num_devices; + Device devices[MAX_NUM_DEVICES]; + + Grid grid; + Grid subgrid; + + AcMeshInfo config; +}; + +static int +gridIdx(const Grid grid, const int3 idx) +{ + return idx.x + idx.y * grid.m.x + idx.z * grid.m.x * grid.m.y; +} + +static int3 +gridIdx3d(const Grid grid, const int idx) +{ + return (int3){idx % grid.m.x, (idx % (grid.m.x * grid.m.y)) / grid.m.x, + idx / (grid.m.x * grid.m.y)}; +} + +static void +printInt3(const int3 vec) +{ + printf("(%d, %d, %d)", vec.x, vec.y, vec.z); +} + +static inline void +print(const AcMeshInfo config) +{ + for (int i = 0; i < NUM_INT_PARAMS; ++i) + printf("[%s]: %d\n", intparam_names[i], config.int_params[i]); + for (int i = 0; i < NUM_REAL_PARAMS; ++i) + printf("[%s]: %g\n", realparam_names[i], double(config.real_params[i])); +} + +static void +update_builtin_params(AcMeshInfo* config) +{ + config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER; + ///////////// PAD TEST + // config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER + PAD_SIZE; + ///////////// PAD TEST + config->int_params[AC_my] = config->int_params[AC_ny] + STENCIL_ORDER; + config->int_params[AC_mz] = config->int_params[AC_nz] + STENCIL_ORDER; + + // Bounds for the computational domain, i.e. nx_min <= i < nx_max + config->int_params[AC_nx_min] = NGHOST; + config->int_params[AC_nx_max] = config->int_params[AC_nx_min] + config->int_params[AC_nx]; + config->int_params[AC_ny_min] = NGHOST; + config->int_params[AC_ny_max] = config->int_params[AC_ny] + NGHOST; + config->int_params[AC_nz_min] = NGHOST; + config->int_params[AC_nz_max] = config->int_params[AC_nz] + NGHOST; + + /* Additional helper params */ + // Int helpers + config->int_params[AC_mxy] = config->int_params[AC_mx] * config->int_params[AC_my]; + config->int_params[AC_nxy] = config->int_params[AC_nx] * config->int_params[AC_ny]; + config->int_params[AC_nxyz] = config->int_params[AC_nxy] * config->int_params[AC_nz]; +} + +static Grid +createGrid(const AcMeshInfo config) +{ + Grid grid; + + grid.m = (int3){config.int_params[AC_mx], config.int_params[AC_my], config.int_params[AC_mz]}; + grid.n = (int3){config.int_params[AC_nx], config.int_params[AC_ny], config.int_params[AC_nz]}; + + return grid; +} + +AcResult +acNodeCreate(const int id, const AcMeshInfo node_config, Node* node_handle) +{ + struct node_s* node = (struct node_s*)malloc(sizeof(*node)); + node->id = id; + node->config = node_config; + + // Get node->num_devices + ERRCHK_CUDA_ALWAYS(cudaGetDeviceCount(&node->num_devices)); + if (node->num_devices < 1) { + ERROR("No CUDA devices found!"); + return AC_FAILURE; + } + if (node->num_devices > MAX_NUM_DEVICES) { + WARNING("More devices found than MAX_NUM_DEVICES. Using only MAX_NUM_DEVICES"); + node->num_devices = MAX_NUM_DEVICES; + } + if (!AC_MULTIGPU_ENABLED) { + WARNING("MULTIGPU_ENABLED was false. Using only one device"); + node->num_devices = 1; // Use only one device if multi-GPU is not enabled + } + // Check that node->num_devices is divisible with AC_nz. 
This makes decomposing the + // problem domain to multiple GPUs much easier since we do not have to worry + // about remainders + ERRCHK_ALWAYS(node->config.int_params[AC_nz] % node->num_devices == 0); + + // Decompose the problem domain + // The main grid + node->grid = createGrid(node->config); + + // Subgrids + AcMeshInfo subgrid_config = node->config; + subgrid_config.int_params[AC_nz] /= node->num_devices; + update_builtin_params(&subgrid_config); +#if VERBOSE_PRINTING // Defined in astaroth.h + printf("###############################################################\n"); + printf("Config dimensions recalculated:\n"); + print(subgrid_config); + printf("###############################################################\n"); +#endif + node->subgrid = createGrid(subgrid_config); + + // Periodic boundary conditions become weird if the system can "fold unto itself". + ERRCHK_ALWAYS(node->subgrid.n.x >= STENCIL_ORDER); + ERRCHK_ALWAYS(node->subgrid.n.y >= STENCIL_ORDER); + ERRCHK_ALWAYS(node->subgrid.n.z >= STENCIL_ORDER); + +#if VERBOSE_PRINTING + // clang-format off + printf("Grid m "); printInt3(node->grid.m); printf("\n"); + printf("Grid n "); printInt3(node->grid.n); printf("\n"); + printf("Subrid m "); printInt3(node->subgrid.m); printf("\n"); + printf("Subrid n "); printInt3(node->subgrid.n); printf("\n"); + // clang-format on +#endif + + // Initialize the devices + // #pragma omp parallel for + for (int i = 0; i < node->num_devices; ++i) { + const int3 multinode_offset = (int3){0, 0, 0}; // Placeholder + const int3 multigpu_offset = (int3){0, 0, i * node->subgrid.n.z}; + subgrid_config.int3_params[AC_global_grid_n] = node->grid.n; + subgrid_config.int3_params[AC_multigpu_offset] = multinode_offset + multigpu_offset; + + acDeviceCreate(i, subgrid_config, &node->devices[i]); + acDevicePrintInfo(node->devices[i]); + } + + // Enable peer access + // #pragma omp parallel for + for (int i = 0; i < node->num_devices; ++i) { + const int front = (i + 1) % node->num_devices; + const int back = (i - 1 + node->num_devices) % node->num_devices; + + int can_access_front, can_access_back; + cudaDeviceCanAccessPeer(&can_access_front, i, front); + cudaDeviceCanAccessPeer(&can_access_back, i, back); +#if VERBOSE_PRINTING + printf( + "Trying to enable peer access from %d to %d (can access: %d) and %d (can access: %d)\n", + i, front, can_access_front, back, can_access_back); +#endif + + cudaSetDevice(i); + if (can_access_front) { + ERRCHK_CUDA_ALWAYS(cudaDeviceEnablePeerAccess(front, 0)); + } + if (can_access_back) { + ERRCHK_CUDA_ALWAYS(cudaDeviceEnablePeerAccess(back, 0)); + } + } + acNodeSynchronizeStream(node, STREAM_ALL); + + *node_handle = node; + return AC_SUCCESS; +} + +AcResult +acNodeDestroy(Node node) +{ + acNodeSynchronizeStream(node, STREAM_ALL); + + // #pragma omp parallel for + for (int i = 0; i < node->num_devices; ++i) { + acDeviceDestroy(node->devices[i]); + } + free(node); + + return AC_SUCCESS; +} + +AcResult +acNodePrintInfo(const Node node) +{ + (void)node; + WARNING("Not implemented"); + return AC_FAILURE; +} + +AcResult +acNodeQueryDeviceConfiguration(const Node node, DeviceConfiguration* config) +{ + (void)node; + (void)config; + WARNING("Not implemented"); + return AC_FAILURE; +} + +AcResult +acNodeAutoOptimize(const Node node) +{ + (void)node; + WARNING("Not implemented"); + return AC_FAILURE; +} + +AcResult +acNodeSynchronizeStream(const Node node, const Stream stream) +{ + // #pragma omp parallel for + for (int i = 0; i < node->num_devices; ++i) { + 
acDeviceSynchronizeStream(node->devices[i], stream);
+    }
+
+    return AC_SUCCESS;
+}
+
+AcResult
+acNodeSynchronizeVertexBuffer(const Node node, const Stream stream,
+                              const VertexBufferHandle vtxbuf_handle)
+{
+    acNodeSynchronizeStream(node, stream);
+    // Exchanges the halos of subgrids
+    // After this step, the data within the main grid ranging from
+    //     (0, 0, NGHOST) -> grid.m.x, grid.m.y, NGHOST + grid.n.z
+    // has been synchronized and transferred to the appropriate subgrids
+
+    // We loop only to node->num_devices - 1 since the front and back plates of the grid are not
+    // transferred because their contents depend on the boundary conditions.
+
+    // IMPORTANT NOTE: the boundary conditions must be applied before calling this function!
+    // I.e. the halos of subgrids must contain up-to-date data!
+
+    const size_t num_vertices = node->subgrid.m.x * node->subgrid.m.y * NGHOST;
+
+    // #pragma omp parallel for
+    for (int i = 0; i < node->num_devices - 1; ++i) {
+        // ...|ooooxxx|... -> xxx|ooooooo|...
+        const int3 src = (int3){0, 0, node->subgrid.n.z};
+        const int3 dst = (int3){0, 0, 0};
+
+        const Device src_device = node->devices[i];
+        Device dst_device = node->devices[i + 1];
+
+        acDeviceTransferVertexBufferWithOffset(src_device, stream, vtxbuf_handle, src, dst,
+                                               num_vertices, dst_device);
+    }
+    // #pragma omp parallel for
+    for (int i = 1; i < node->num_devices; ++i) {
+        // ...|ooooooo|xxx <- ...|xxxoooo|...
+        const int3 src = (int3){0, 0, NGHOST};
+        const int3 dst = (int3){0, 0, NGHOST + node->subgrid.n.z};
+
+        const Device src_device = node->devices[i];
+        Device dst_device = node->devices[i - 1];
+
+        acDeviceTransferVertexBufferWithOffset(src_device, stream, vtxbuf_handle, src, dst,
+                                               num_vertices, dst_device);
+    }
+    return AC_SUCCESS;
+}
+
+AcResult
+acNodeSynchronizeMesh(const Node node, const Stream stream)
+{
+    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
+        acNodeSynchronizeVertexBuffer(node, stream, (VertexBufferHandle)i);
+    }
+
+    return AC_SUCCESS;
+}
+
+AcResult
+acNodeSwapBuffers(const Node node)
+{
+    // #pragma omp parallel for
+    for (int i = 0; i < node->num_devices; ++i) {
+        acDeviceSwapBuffers(node->devices[i]);
+    }
+    return AC_SUCCESS;
+}
+
+AcResult
+acNodeLoadConstant(const Node node, const Stream stream, const AcRealParam param,
+                   const AcReal value)
+{
+    acNodeSynchronizeStream(node, stream);
+    // #pragma omp parallel for
+    for (int i = 0; i < node->num_devices; ++i) {
+        acDeviceLoadConstant(node->devices[i], stream, param, value);
+    }
+    return AC_SUCCESS;
+}
+
+AcResult
+acNodeLoadVertexBufferWithOffset(const Node node, const Stream stream, const AcMesh host_mesh,
+                                 const VertexBufferHandle vtxbuf_handle, const int3 src,
+                                 const int3 dst, const int num_vertices)
+{
+    acNodeSynchronizeStream(node, stream);
+    // See the beginning of the file for an explanation of the index mapping
+    // #pragma omp parallel for
+    for (int i = 0; i < node->num_devices; ++i) {
+        const int3 d0 = (int3){0, 0, i * node->subgrid.n.z}; // DECOMPOSITION OFFSET HERE
+        const int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y, d0.z + node->subgrid.m.z};
+
+        const int3 s0 = src; // dst; // TODO fix
+        (void)dst;           // TODO fix
+        const int3 s1 = gridIdx3d(node->grid, gridIdx(node->grid, s0) + num_vertices);
+
+        const int3 da = max(s0, d0);
+        const int3 db = min(s1, d1);
+        /*
+        printf("Device %d\n", i);
+        printf("\ts0: "); printInt3(s0); printf("\n");
+        printf("\td0: "); printInt3(d0); printf("\n");
+        printf("\tda: "); printInt3(da);
printf("\n"); + printf("\tdb: "); printInt3(db); printf("\n"); + printf("\td1: "); printInt3(d1); printf("\n"); + printf("\ts1: "); printInt3(s1); printf("\n"); + printf("\t-> %s to device %d\n", db.z >= da.z ? "Copy" : "Do not copy", i); + */ + if (db.z >= da.z) { + const int copy_cells = gridIdx(node->subgrid, db) - gridIdx(node->subgrid, da); + // DECOMPOSITION OFFSET HERE + const int3 da_global = da; // src + da - dst; // TODO fix + const int3 da_local = (int3){da.x, da.y, da.z - i * node->grid.n.z / node->num_devices}; + // printf("\t\tcopy %d cells to local index ", copy_cells); printInt3(da_local); + // printf("\n"); + acDeviceLoadVertexBufferWithOffset(node->devices[i], stream, host_mesh, vtxbuf_handle, + da_global, da_local, copy_cells); + } + // printf("\n"); + } + return AC_SUCCESS; +} + +AcResult +acNodeLoadMeshWithOffset(const Node node, const Stream stream, const AcMesh host_mesh, + const int3 src, const int3 dst, const int num_vertices) +{ + for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { + acNodeLoadVertexBufferWithOffset(node, stream, host_mesh, (VertexBufferHandle)i, src, dst, + num_vertices); + } + return AC_SUCCESS; +} + +AcResult +acNodeLoadVertexBuffer(const Node node, const Stream stream, const AcMesh host_mesh, + const VertexBufferHandle vtxbuf_handle) +{ + const int3 src = (int3){0, 0, 0}; + const int3 dst = src; + const size_t num_vertices = acVertexBufferSize(host_mesh.info); + + acNodeLoadVertexBufferWithOffset(node, stream, host_mesh, vtxbuf_handle, src, dst, + num_vertices); + return AC_SUCCESS; +} + +AcResult +acNodeLoadMesh(const Node node, const Stream stream, const AcMesh host_mesh) +{ + for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { + acNodeLoadVertexBuffer(node, stream, host_mesh, (VertexBufferHandle)i); + } + return AC_SUCCESS; +} + +AcResult +acNodeStoreVertexBufferWithOffset(const Node node, const Stream stream, + const VertexBufferHandle vtxbuf_handle, const int3 src, + const int3 dst, const int num_vertices, AcMesh* host_mesh) +{ + acNodeSynchronizeStream(node, stream); + // #pragma omp parallel for + for (int i = 0; i < node->num_devices; ++i) { + const int3 d0 = (int3){0, 0, i * node->subgrid.n.z}; // DECOMPOSITION OFFSET HERE + const int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y, d0.z + node->subgrid.m.z}; + + const int3 s0 = src; // TODO fix + (void)dst; // TODO fix + const int3 s1 = gridIdx3d(node->grid, gridIdx(node->grid, s0) + num_vertices); + + const int3 da = max(s0, d0); + const int3 db = min(s1, d1); + if (db.z >= da.z) { + const int copy_cells = gridIdx(node->subgrid, db) - gridIdx(node->subgrid, da); + // DECOMPOSITION OFFSET HERE + const int3 da_local = (int3){da.x, da.y, da.z - i * node->grid.n.z / node->num_devices}; + const int3 da_global = da; // dst + da - src; // TODO fix + acDeviceStoreVertexBufferWithOffset(node->devices[i], stream, vtxbuf_handle, da_local, + da_global, copy_cells, host_mesh); + } + } + return AC_SUCCESS; +} + +AcResult +acNodeStoreMeshWithOffset(const Node node, const Stream stream, const int3 src, const int3 dst, + const int num_vertices, AcMesh* host_mesh) +{ + for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { + acNodeStoreVertexBufferWithOffset(node, stream, (VertexBufferHandle)i, src, dst, + num_vertices, host_mesh); + } + return AC_SUCCESS; +} + +AcResult +acNodeStoreVertexBuffer(const Node node, const Stream stream, + const VertexBufferHandle vtxbuf_handle, AcMesh* host_mesh) +{ + const int3 src = (int3){0, 0, 0}; + const int3 dst = src; + const size_t num_vertices = 
acVertexBufferSize(host_mesh->info); + + acNodeStoreVertexBufferWithOffset(node, stream, vtxbuf_handle, src, dst, num_vertices, + host_mesh); + + return AC_SUCCESS; +} + +AcResult +acNodeStoreMesh(const Node node, const Stream stream, AcMesh* host_mesh) +{ + for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { + acNodeStoreVertexBuffer(node, stream, (VertexBufferHandle)i, host_mesh); + } + return AC_SUCCESS; +} + +AcResult +acNodeIntegrateSubstep(const Node node, const Stream stream, const int isubstep, const int3 start, + const int3 end, const AcReal dt) +{ + acNodeSynchronizeStream(node, stream); + + // #pragma omp parallel for + for (int i = 0; i < node->num_devices; ++i) { + // DECOMPOSITION OFFSET HERE + const int3 d0 = (int3){NGHOST, NGHOST, NGHOST + i * node->subgrid.n.z}; + const int3 d1 = d0 + (int3){node->subgrid.n.x, node->subgrid.n.y, node->subgrid.n.z}; + + const int3 da = max(start, d0); + const int3 db = min(end, d1); + + if (db.z >= da.z) { + const int3 da_local = da - (int3){0, 0, i * node->subgrid.n.z}; + const int3 db_local = db - (int3){0, 0, i * node->subgrid.n.z}; + acDeviceIntegrateSubstep(node->devices[i], stream, isubstep, da_local, db_local, dt); + } + } + return AC_SUCCESS; +} + +static AcResult +local_boundcondstep(const Node node, const Stream stream, const VertexBufferHandle vtxbuf) +{ + acNodeSynchronizeStream(node, stream); + + if (node->num_devices > 1) { + // Local boundary conditions + // #pragma omp parallel for + for (int i = 0; i < node->num_devices; ++i) { + const int3 d0 = (int3){0, 0, NGHOST}; // DECOMPOSITION OFFSET HERE + const int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y, d0.z + node->subgrid.n.z}; + acDevicePeriodicBoundcondStep(node->devices[i], stream, vtxbuf, d0, d1); + } + } + else { + acDevicePeriodicBoundcondStep(node->devices[0], stream, vtxbuf, (int3){0, 0, 0}, + node->subgrid.m); + } + return AC_SUCCESS; +} + +static AcResult +global_boundcondstep(const Node node, const Stream stream, const VertexBufferHandle vtxbuf_handle) +{ + acNodeSynchronizeStream(node, stream); + + if (node->num_devices > 1) { + const size_t num_vertices = node->subgrid.m.x * node->subgrid.m.y * NGHOST; + { + // ...|ooooxxx|... -> xxx|ooooooo|... + const int3 src = (int3){0, 0, node->subgrid.n.z}; + const int3 dst = (int3){0, 0, 0}; + + const Device src_device = node->devices[node->num_devices - 1]; + Device dst_device = node->devices[0]; + + acDeviceTransferVertexBufferWithOffset(src_device, stream, vtxbuf_handle, src, dst, + num_vertices, dst_device); + } + { + // ...|ooooooo|xxx <- ...|xxxoooo|... 
+ const int3 src = (int3){0, 0, NGHOST}; + const int3 dst = (int3){0, 0, NGHOST + node->subgrid.n.z}; + + const Device src_device = node->devices[0]; + Device dst_device = node->devices[node->num_devices - 1]; + + acDeviceTransferVertexBufferWithOffset(src_device, stream, vtxbuf_handle, src, dst, + num_vertices, dst_device); + } + } + return AC_SUCCESS; +} + +AcResult +acNodeIntegrate(const Node node, const AcReal dt) +{ + acNodeSynchronizeStream(node, STREAM_ALL); + // xxx|OOO OOOOOOOOO OOO|xxx + // ^ ^ ^ ^ + // n0 n1 n2 n3 + // const int3 n0 = (int3){NGHOST, NGHOST, NGHOST}; + // const int3 n1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST}; + // const int3 n2 = node->grid.n; + // const int3 n3 = n0 + node->grid.n; + + for (int isubstep = 0; isubstep < 3; ++isubstep) { + acNodeSynchronizeStream(node, STREAM_ALL); + for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) { + local_boundcondstep(node, (Stream)vtxbuf, (VertexBufferHandle)vtxbuf); + } + acNodeSynchronizeStream(node, STREAM_ALL); + + // Inner inner + // #pragma omp parallel for + for (int i = 0; i < node->num_devices; ++i) { + const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST}; + const int3 m2 = node->subgrid.n; + acDeviceIntegrateSubstep(node->devices[i], STREAM_16, isubstep, m1, m2, dt); + } + + for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) { + acNodeSynchronizeVertexBuffer(node, (Stream)vtxbuf, (VertexBufferHandle)vtxbuf); + global_boundcondstep(node, (Stream)vtxbuf, (VertexBufferHandle)vtxbuf); + } + for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) { + acNodeSynchronizeStream(node, (Stream)vtxbuf); + } + + // #pragma omp parallel for + for (int i = 0; i < node->num_devices; ++i) { // Front + const int3 m1 = (int3){NGHOST, NGHOST, NGHOST}; + const int3 m2 = m1 + (int3){node->subgrid.n.x, node->subgrid.n.y, NGHOST}; + acDeviceIntegrateSubstep(node->devices[i], STREAM_0, isubstep, m1, m2, dt); + } + // #pragma omp parallel for + for (int i = 0; i < node->num_devices; ++i) { // Back + const int3 m1 = (int3){NGHOST, NGHOST, node->subgrid.n.z}; + const int3 m2 = m1 + (int3){node->subgrid.n.x, node->subgrid.n.y, NGHOST}; + acDeviceIntegrateSubstep(node->devices[i], STREAM_1, isubstep, m1, m2, dt); + } + // #pragma omp parallel for + for (int i = 0; i < node->num_devices; ++i) { // Bottom + const int3 m1 = (int3){NGHOST, NGHOST, 2 * NGHOST}; + const int3 m2 = m1 + (int3){node->subgrid.n.x, NGHOST, node->subgrid.n.z - 2 * NGHOST}; + acDeviceIntegrateSubstep(node->devices[i], STREAM_2, isubstep, m1, m2, dt); + } + // #pragma omp parallel for + for (int i = 0; i < node->num_devices; ++i) { // Top + const int3 m1 = (int3){NGHOST, node->subgrid.n.y, 2 * NGHOST}; + const int3 m2 = m1 + (int3){node->subgrid.n.x, NGHOST, node->subgrid.n.z - 2 * NGHOST}; + acDeviceIntegrateSubstep(node->devices[i], STREAM_3, isubstep, m1, m2, dt); + } + // #pragma omp parallel for + for (int i = 0; i < node->num_devices; ++i) { // Left + const int3 m1 = (int3){NGHOST, 2 * NGHOST, 2 * NGHOST}; + const int3 m2 = m1 + (int3){NGHOST, node->subgrid.n.y - 2 * NGHOST, + node->subgrid.n.z - 2 * NGHOST}; + acDeviceIntegrateSubstep(node->devices[i], STREAM_4, isubstep, m1, m2, dt); + } + // #pragma omp parallel for + for (int i = 0; i < node->num_devices; ++i) { // Right + const int3 m1 = (int3){node->subgrid.n.x, 2 * NGHOST, 2 * NGHOST}; + const int3 m2 = m1 + (int3){NGHOST, node->subgrid.n.y - 2 * NGHOST, + node->subgrid.n.z - 2 * NGHOST}; + acDeviceIntegrateSubstep(node->devices[i], STREAM_5, isubstep, m1, m2, dt); + } + 
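[Editor's note] The six loops above tile the outer shell of each subgrid, i.e. the vertices whose stencils reach into the freshly exchanged halos, while STREAM_16 integrates the inner core concurrently. For reference, the same six regions collected into one table; a sketch under the patch's conventions with n = subgrid.n, not part of the patch.

    // Editor's sketch, not part of the patch.
    typedef struct {
        int3 m1, m2; // integrate vertices in [m1, m2)
    } Region;

    // The regions above, in order: front, back, bottom, top, left, right.
    static void
    outer_shell_regions(const int3 n, Region r[6])
    {
        r[0] = (Region){(int3){NGHOST, NGHOST, NGHOST},
                        (int3){NGHOST + n.x, NGHOST + n.y, 2 * NGHOST}};
        r[1] = (Region){(int3){NGHOST, NGHOST, n.z},
                        (int3){NGHOST + n.x, NGHOST + n.y, NGHOST + n.z}};
        r[2] = (Region){(int3){NGHOST, NGHOST, 2 * NGHOST},
                        (int3){NGHOST + n.x, 2 * NGHOST, n.z}};
        r[3] = (Region){(int3){NGHOST, n.y, 2 * NGHOST},
                        (int3){NGHOST + n.x, NGHOST + n.y, n.z}};
        r[4] = (Region){(int3){NGHOST, 2 * NGHOST, 2 * NGHOST},
                        (int3){2 * NGHOST, n.y, n.z}};
        r[5] = (Region){(int3){n.x, 2 * NGHOST, 2 * NGHOST},
                        (int3){NGHOST + n.x, n.y, n.z}};
    }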
acNodeSwapBuffers(node); + } + acNodeSynchronizeStream(node, STREAM_ALL); + return AC_SUCCESS; +} + +AcResult +acNodePeriodicBoundcondStep(const Node node, const Stream stream, + const VertexBufferHandle vtxbuf_handle) +{ + local_boundcondstep(node, stream, vtxbuf_handle); + acNodeSynchronizeVertexBuffer(node, stream, vtxbuf_handle); + + // TODO NOTE GLOBAL BOUNDCONDS NOT DONE HERE IF MORE THAN 1 NODE + global_boundcondstep(node, stream, vtxbuf_handle); + // WARNING("Global boundconds should not be done here with multinode"); + + return AC_SUCCESS; +} + +AcResult +acNodePeriodicBoundconds(const Node node, const Stream stream) +{ + for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { + acNodePeriodicBoundcondStep(node, stream, (VertexBufferHandle)i); + } + return AC_SUCCESS; +} + +static AcReal +simple_final_reduce_scal(const Node node, const ReductionType& rtype, const AcReal* results, + const int& n) +{ + AcReal res = results[0]; + for (int i = 1; i < n; ++i) { + if (rtype == RTYPE_MAX) { + res = max(res, results[i]); + } + else if (rtype == RTYPE_MIN) { + res = min(res, results[i]); + } + else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP || rtype == RTYPE_SUM) { + res = sum(res, results[i]); + } + else { + ERROR("Invalid rtype"); + } + } + + if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) { + const AcReal inv_n = AcReal(1.) / (node->grid.n.x * node->grid.n.y * node->grid.n.z); + res = sqrt(inv_n * res); + } + return res; +} + +AcResult +acNodeReduceScal(const Node node, const Stream stream, const ReductionType rtype, + const VertexBufferHandle vtxbuf_handle, AcReal* result) +{ + acNodeSynchronizeStream(node, STREAM_ALL); + + AcReal results[node->num_devices]; + // #pragma omp parallel for + for (int i = 0; i < node->num_devices; ++i) { + acDeviceReduceScal(node->devices[i], stream, rtype, vtxbuf_handle, &results[i]); + } + + *result = simple_final_reduce_scal(node, rtype, results, node->num_devices); + return AC_SUCCESS; +} + +AcResult +acNodeReduceVec(const Node node, const Stream stream, const ReductionType rtype, + const VertexBufferHandle a, const VertexBufferHandle b, const VertexBufferHandle c, + AcReal* result) +{ + acNodeSynchronizeStream(node, STREAM_ALL); + + AcReal results[node->num_devices]; + // #pragma omp parallel for + for (int i = 0; i < node->num_devices; ++i) { + acDeviceReduceVec(node->devices[i], stream, rtype, a, b, c, &results[i]); + } + + *result = simple_final_reduce_scal(node, rtype, results, node->num_devices); + return AC_SUCCESS; +} diff --git a/src/standalone/autotest.cc b/src/standalone/autotest.cc index e65a783..f8a5312 100644 --- a/src/standalone/autotest.cc +++ b/src/standalone/autotest.cc @@ -29,13 +29,13 @@ #include #include "config_loader.h" -#include "src/core/math_utils.h" #include "model/host_forcing.h" #include "model/host_memory.h" #include "model/host_timestep.h" #include "model/model_boundconds.h" #include "model/model_reduce.h" #include "model/model_rk3.h" +#include "src/core/math_utils.h" #include "src/core/errchk.h" @@ -431,8 +431,9 @@ check_rk3(const AcMeshInfo& mesh_info) acIntegrate(dt); model_rk3(dt, model_mesh); - boundconds(model_mesh->info, model_mesh); } + boundconds(model_mesh->info, model_mesh); + acBoundcondStep(); acStore(gpu_mesh); bool is_acceptable = verify_meshes(*model_mesh, *gpu_mesh); @@ -726,6 +727,7 @@ run_autotest(void) // Device integration step acIntegrate(dt); + acBoundcondStep(); acStore(candidate_mesh); // Check fields diff --git a/src/standalone/benchmark.cc b/src/standalone/benchmark.cc index 24f7f15..1a6a919 
100644 --- a/src/standalone/benchmark.cc +++ b/src/standalone/benchmark.cc @@ -197,7 +197,6 @@ run_benchmark(void) double(results[int(NTH_PERCENTILE * NUM_ITERS)]), int(NTH_PERCENTILE * 100), mesh_info.int_params[AC_nx]); - acStore(mesh); acQuit(); acmesh_destroy(mesh); } @@ -269,7 +268,6 @@ run_benchmark(void) write_result_to_file(wallclock * 1e3f / steps); #endif - acStore(mesh); acQuit(); acmesh_destroy(mesh); diff --git a/src/standalone/renderer.cc b/src/standalone/renderer.cc index 41f32b7..2e8945f 100644 --- a/src/standalone/renderer.cc +++ b/src/standalone/renderer.cc @@ -32,13 +32,13 @@ #include // memcpy #include "config_loader.h" -#include "src/core/errchk.h" -#include "src/core/math_utils.h" #include "model/host_forcing.h" #include "model/host_memory.h" #include "model/host_timestep.h" #include "model/model_reduce.h" #include "model/model_rk3.h" +#include "src/core/errchk.h" +#include "src/core/math_utils.h" #include "timer_hires.h" // Window @@ -106,6 +106,7 @@ renderer_init(const int& mx, const int& my) // Setup window window = SDL_CreateWindow("Astaroth", SDL_WINDOWPOS_UNDEFINED, SDL_WINDOWPOS_UNDEFINED, window_width, window_height, SDL_WINDOW_SHOWN); + ERRCHK_ALWAYS(window); // Setup SDL renderer renderer = SDL_CreateRenderer(window, -1, SDL_RENDERER_ACCELERATED); @@ -409,9 +410,10 @@ run_renderer(void) /* Render */ const float timer_diff_sec = timer_diff_nsec(frame_timer) / 1e9f; if (timer_diff_sec >= desired_frame_time) { - // acStore(mesh); const int num_vertices = mesh->info.int_params[AC_mxy]; const int3 dst = (int3){0, 0, k_slice}; + acBoundcondStep(); + // acStore(mesh); acStoreWithOffset(dst, num_vertices, mesh); acSynchronizeStream(STREAM_ALL); renderer_draw(*mesh); // Bottleneck is here @@ -421,7 +423,6 @@ run_renderer(void) } printf("Wallclock time %f s\n", double(timer_diff_nsec(wallclock) / 1e9f)); - acStore(mesh); acQuit(); acmesh_destroy(mesh); diff --git a/src/standalone/simulation.cc b/src/standalone/simulation.cc index ebd527e..51a9a76 100644 --- a/src/standalone/simulation.cc +++ b/src/standalone/simulation.cc @@ -27,13 +27,13 @@ #include "run.h" #include "config_loader.h" -#include "src/core/errchk.h" -#include "src/core/math_utils.h" #include "model/host_forcing.h" #include "model/host_memory.h" #include "model/host_timestep.h" #include "model/model_reduce.h" #include "model/model_rk3.h" +#include "src/core/errchk.h" +#include "src/core/math_utils.h" #include "timer_hires.h" #include @@ -205,6 +205,7 @@ run_simulation(void) write_mesh_info(&mesh_info); print_diagnostics(0, AcReal(.0), t_step, diag_file); + acBoundcondStep(); acStore(mesh); save_mesh(*mesh, 0, t_step); @@ -218,13 +219,13 @@ run_simulation(void) /* initialize random seed: */ srand(312256655); - //TODO_SINK. init_sink_particle() + // TODO_SINK. init_sink_particle() // Initialize the basic variables of the sink particle to a suitable initial value. // 1. Location of the particle // 2. Mass of the particle // (3. Velocity of the particle) - // This at the level of Host in this case. - // acUpdate_sink_particle() will do the similar trick to the device. + // This at the level of Host in this case. + // acUpdate_sink_particle() will do the similar trick to the device. /* Step the simulation */ for (int i = 1; i < max_steps; ++i) { @@ -236,26 +237,26 @@ run_simulation(void) loadForcingParamsToDevice(forcing_params); #endif - //TODO_SINK acUpdate_sink_particle() - // Update properties of the sing particle for acIntegrate(). Essentially: - // 1. Location of the particle - // 2. 
Mass of the particle
-    // (3. Velocity of the particle)
-    // These can be used for calculating he gravitational field.
+        // TODO_SINK acUpdate_sink_particle()
+        // Update properties of the sink particle for acIntegrate(). Essentially:
+        // 1. Location of the particle
+        // 2. Mass of the particle
+        // (3. Velocity of the particle)
+        // These can be used for calculating the gravitational field.
 
         acIntegrate(dt);
 
-        //TODO_SINK acAdvect_sink_particle()
-        // THIS IS OPTIONAL. We will start from unmoving particle.
-        // 1. Calculate the equation of motion for the sink particle.
-        // NOTE: Might require embedding with acIntegrate(dt).
+        // TODO_SINK acAdvect_sink_particle()
+        // THIS IS OPTIONAL. We will start from an unmoving particle.
+        // 1. Calculate the equation of motion for the sink particle.
+        // NOTE: Might require embedding with acIntegrate(dt).
 
-        //TODO_SINK acAccrete_sink_particle()
-        // Calculate accretion of the sink particle from the surrounding medium
-        // 1. Transfer density into sink particle mass
-        // 2. Transfer momentum into sink particle
-        // (OPTIONAL: Affection the motion of the particle)
-        // NOTE: Might require embedding with acIntegrate(dt).
-        // This is the hardest part. Please see Lee et al. ApJ 783 (2014) for reference.
+        // TODO_SINK acAccrete_sink_particle()
+        // Calculate accretion of the sink particle from the surrounding medium
+        // 1. Transfer density into sink particle mass
+        // 2. Transfer momentum into sink particle
+        // (OPTIONAL: Affecting the motion of the particle)
+        // NOTE: Might require embedding with acIntegrate(dt).
+        // This is the hardest part. Please see Lee et al. ApJ 783 (2014) for reference.
 
         t_step += dt;
@@ -298,9 +299,10 @@ run_simulation(void)
            assumed to be asynchronous, so the meshes must be also synchronized before
            transferring the data to the CPU. Like so:
-               acSynchronize();
+               acBoundcondStep();
                acStore(mesh);
         */
+        acBoundcondStep();
         acStore(mesh);
         save_mesh(*mesh, i, t_step);
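[Editor's note] The recurring change across these standalone programs is the new calling convention: acIntegrate() no longer applies boundary conditions after the final substep, so acBoundcondStep() must run before the mesh is stored, verified, or rendered. A minimal sketch of the resulting pattern, not part of the patch:

    // Editor's sketch, not part of the patch.
    #include "astaroth.h"

    static void
    integrate_and_store(AcMesh* mesh, const AcReal dt, const int steps)
    {
        for (int i = 0; i < steps; ++i)
            acIntegrate(dt); // halos are stale after the final substep
        acBoundcondStep();   // bring the boundary zones up to date
        acStore(mesh);       // now safe to read the mesh back on the host
    }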