From 5be775dbff99e516c17efdb78a3f8b423f97d039 Mon Sep 17 00:00:00 2001 From: jpekkila Date: Wed, 31 Jul 2019 17:48:48 +0300 Subject: [PATCH] Various intermediate changes --- include/astaroth_device.h | 21 +- include/astaroth_grid.h | 11 +- include/astaroth_node.h | 13 +- src/core/CMakeLists.txt | 2 +- src/core/astaroth.cu | 658 +------------------------------------- src/core/device.cu | 502 +---------------------------- src/core/grid.cu | 19 ++ src/core/node.cu | 2 +- 8 files changed, 43 insertions(+), 1185 deletions(-) diff --git a/include/astaroth_device.h b/include/astaroth_device.h index d98ffcb..48c8db2 100644 --- a/include/astaroth_device.h +++ b/include/astaroth_device.h @@ -103,21 +103,18 @@ AcResult acDeviceTransferVertexBuffer(const Device src_device, const Stream stre AcResult acDeviceTransferMesh(const Device src_device, const Stream stream, Device* dst_device); /** */ -AcResult acDeviceIntegrateSubstep(const Device device, const StreamType stream_type, - const int step_number, const int3 start, const int3 end, - const AcReal dt); +AcResult acDeviceIntegrateSubstep(const Device device, const Stream stream, const int step_number, + const int3 start, const int3 end, const AcReal dt); /** */ -AcResult acDevicePeriodicBoundcondStep(const Device device, const StreamType stream_type, - const int3 start, const int3 end); +AcResult acDevicePeriodicBoundcondStep(const Device device, const Stream stream, const int3 start, + const int3 end); /** */ -AcResult acDeviceReduceScal(const Device device, const StreamType stream_type, - const ReductionType rtype, const VertexBufferHandle vtxbuf_handle, - AcReal* result); +AcResult acDeviceReduceScal(const Device device, const Stream stream, const ReductionType rtype, + const VertexBufferHandle vtxbuf_handle, AcReal* result); /** */ -AcResult acDeviceReduceVec(const Device device, const StreamType stream_type, - const ReductionType rtype, const VertexBufferHandle vec0, - const VertexBufferHandle vec1, const VertexBufferHandle vec2, - AcReal* result); +AcResult acDeviceReduceVec(const Device device, const Stream stream, const ReductionType rtype, + const VertexBufferHandle vec0, const VertexBufferHandle vec1, + const VertexBufferHandle vec2, AcReal* result); #ifdef __cplusplus } // extern "C" diff --git a/include/astaroth_grid.h b/include/astaroth_grid.h index 4200343..7dfe323 100644 --- a/include/astaroth_grid.h +++ b/include/astaroth_grid.h @@ -89,16 +89,15 @@ AcResult acGridTransferVertexBuffer(const Stream stream, const VertexBufferHandl AcResult acGridTransferMesh(const Stream stream); /** */ -AcResult acGridIntegrateSubstep(const StreamType stream_type, const int step_number, - const int3 start, const int3 end, const AcReal dt); +AcResult acGridIntegrateSubstep(const Stream stream, const int step_number, const int3 start, + const int3 end, const AcReal dt); /** */ -AcResult acGridPeriodicBoundcondStep(const StreamType stream_type, const int3 start, - const int3 end); +AcResult acGridPeriodicBoundcondStep(const Stream stream, const int3 start, const int3 end); /** */ -AcResult acGridReduceScal(const StreamType stream_type, const ReductionType rtype, +AcResult acGridReduceScal(const Stream stream, const ReductionType rtype, const VertexBufferHandle vtxbuf_handle, AcReal* result); /** */ -AcResult acGridReduceVec(const StreamType stream_type, const ReductionType rtype, +AcResult acGridReduceVec(const Stream stream, const ReductionType rtype, const VertexBufferHandle vec0, const VertexBufferHandle vec1, const VertexBufferHandle vec2, AcReal* 
result); diff --git a/include/astaroth_node.h b/include/astaroth_node.h index 83e0a45..216db25 100644 --- a/include/astaroth_node.h +++ b/include/astaroth_node.h @@ -104,17 +104,16 @@ AcResult acNodeTransferVertexBuffer(const Node src_node, const Stream stream, AcResult acNodeTransferMesh(const Node src_node, const Stream stream, Node* dst_node); /** */ -AcResult acNodeIntegrateSubstep(const Node node, const StreamType stream_type, - const int step_number, const int3 start, const int3 end, - const AcReal dt); +AcResult acNodeIntegrateSubstep(const Node node, const Stream stream, const int step_number, + const int3 start, const int3 end, const AcReal dt); /** */ -AcResult acNodePeriodicBoundcondStep(const Node node, const StreamType stream_type, - const int3 start, const int3 end); +AcResult acNodePeriodicBoundcondStep(const Node node, const Stream stream, const int3 start, + const int3 end); /** */ -AcResult acNodeReduceScal(const Node node, const StreamType stream_type, const ReductionType rtype, +AcResult acNodeReduceScal(const Node node, const Stream stream, const ReductionType rtype, const VertexBufferHandle vtxbuf_handle, AcReal* result); /** */ -AcResult acNodeReduceVec(const Node node, const StreamType stream_type, const ReductionType rtype, +AcResult acNodeReduceVec(const Node node, const Stream stream, const ReductionType rtype, const VertexBufferHandle vec0, const VertexBufferHandle vec1, const VertexBufferHandle vec2, AcReal* result); diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index 5cbc271..6054444 100644 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -3,7 +3,7 @@ ######################################## ## Find packages -find_package(CUDA 9 REQUIRED) +find_package(CUDA REQUIRED) ## Architecture and optimization flags set(CUDA_ARCH_FLAGS -gencode arch=compute_37,code=sm_37 diff --git a/src/core/astaroth.cu b/src/core/astaroth.cu index 1cb9f1e..1ef56d4 100644 --- a/src/core/astaroth.cu +++ b/src/core/astaroth.cu @@ -16,660 +16,4 @@ You should have received a copy of the GNU General Public License along with Astaroth. If not, see <http://www.gnu.org/licenses/>. */ - -/** - * @file - * \brief Multi-GPU implementation. - * - %JP: The old way for computing boundary conditions conflicts with the - way we have to do things with multiple GPUs. - - The older approach relied on unified memory, which represented the whole - memory area as one huge mesh instead of several smaller ones. However, unified memory - in its current state is meant more for quick prototyping when performance is not an issue. - Getting the CUDA driver to migrate data intelligently across GPUs is much more difficult - than when managing the memory explicitly. - - In this new approach, I have simplified the multi- and single-GPU layers significantly. - Quick rundown: - New struct: Grid. There are two global variables, "grid" and "subgrid", which - contain the extents of the whole simulation domain and the decomposed grids, - respectively. To simplify things, we require that each GPU is assigned the same amount of - work; therefore each GPU in the node is assigned a "subgrid.m"-sized block of data to - work with. - - The whole simulation domain is decomposed with respect to the z dimension. - For example, if the grid contains (nx, ny, nz) vertices, then the subgrids - contain (nx, ny, nz / num_devices) vertices. - - A local index (i, j, k) in some subgrid can be mapped to the global grid with - global idx = (i, j, k + device_id * subgrid.n.z)
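For illustration, a minimal sketch of this mapping in C (the helper name localToGlobal is hypothetical and not part of the patch; Grid and int3 are the types used in this file):

    // Map a local subgrid index on device `device_id` to the global grid.
    // Assumes AC_nz % num_devices == 0, which acInit() enforces below.
    static int3
    localToGlobal(const int3 local, const int device_id, const Grid subgrid)
    {
        return (int3){local.x, local.y, local.z + device_id * subgrid.n.z};
    }

The inverse mapping simply subtracts device_id * subgrid.n.z from the z component. The overlap test used by acLoadWithOffsetAsync() and acStoreWithOffsetAsync() below follows the same arithmetic: a global range [s0, s1) is clamped against the block [d0, d1) owned by one device via da = max(s0, d0), db = min(s1, d1), and data is copied only when db.z >= da.z.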
- - Terminology: - - Single-GPU function: a function defined on the single-GPU layer (device.cu) - - Changes required to this commented code block: - - The thread block dimensions (tpb) are no longer passed to the kernel here but in - device.cu instead. Same holds for any complex index calculations. Instead, the local - coordinates should be passed as an int3 type without having to consider how the data is - actually laid out in device memory - - The unified memory buffer no longer exists (d_buffer). Instead, we have an opaque - handle of type "Device" which should be passed to single-GPU functions. In this file, all - devices are stored in a global array "devices[num_devices]". - - Every single-GPU function is executed asynchronously by default such that we - can optimize Astaroth by executing memory transactions concurrently with - computation. Therefore a StreamType should be passed as a parameter to single-GPU functions. - Refresher: CUDA function calls are non-blocking when a stream is explicitly passed - as a parameter and commands executing in different streams can be processed - in parallel/concurrently. - - - Note on periodic boundaries (might be helpful when implementing other boundary conditions): - - With multiple GPUs, periodic boundary conditions applied on indices ranging from - - (0, 0, STENCIL_ORDER/2) to (subgrid.m.x, subgrid.m.y, subgrid.m.z - - STENCIL_ORDER/2) - - on a single device are "local", in the sense that they can be computed without - having to exchange data with neighboring GPUs. Special care is needed only for transferring - the data to the front and back plates outside this range. In the solution we use - here, we solve the local boundaries first, and then just exchange the front and back plates - in a "ring", like so - device_id - (n) <-> 0 <-> 1 <-> ... <-> n <-> (0) - -### Throughout this file we use the following notation and names for various index offsets - - Global coordinates: coordinates with respect to the global grid (static Grid grid) - Local coordinates: coordinates with respect to the local subgrid (static Subgrid subgrid) - - s0, s1: source indices in global coordinates - d0, d1: destination indices in global coordinates - da = max(s0, d0); - db = min(s1, d1); - - These are used in at least - acLoad() - acStore() - acSynchronizeHalos() - - Here we decompose the host mesh and distribute it among the GPUs in - the node. - - The host mesh is a huge contiguous block of data. Its dimensions are given by - the global variable named "grid". A "grid" is decomposed into "subgrids", - one for each GPU. Here we check which parts of the range s0...s1 map - to the memory space stored by some GPU, ranging d0...d1, and transfer - the data if needed. - - The index mapping is inherently quite involved, but here's a picture which - hopefully helps make sense out of all this.
- - - Grid - |----num_vertices---| - xxx|....................................................|xxx - ^ ^ ^ ^ - d0 d1 s0 (src) s1 - - Subgrid - - xxx|.............|xxx - ^ ^ - d0 d1 - - ^ ^ - db da - * - */ -#include "astaroth.h" -#include "errchk.h" - -#include "device.cuh" -#include "math_utils.h" // sum for reductions -// #include "standalone/config_loader.h" // update_config - -#define AC_GEN_STR(X) #X -const char* intparam_names[] = {AC_FOR_BUILTIN_INT_PARAM_TYPES(AC_GEN_STR) // - AC_FOR_USER_INT_PARAM_TYPES(AC_GEN_STR)}; -const char* int3param_names[] = {AC_FOR_BUILTIN_INT3_PARAM_TYPES(AC_GEN_STR) // - AC_FOR_USER_INT3_PARAM_TYPES(AC_GEN_STR)}; -const char* realparam_names[] = {AC_FOR_BUILTIN_REAL_PARAM_TYPES(AC_GEN_STR) // - AC_FOR_USER_REAL_PARAM_TYPES(AC_GEN_STR)}; -const char* real3param_names[] = {AC_FOR_BUILTIN_REAL3_PARAM_TYPES(AC_GEN_STR) // - AC_FOR_USER_REAL3_PARAM_TYPES(AC_GEN_STR)}; -const char* vtxbuf_names[] = {AC_FOR_VTXBUF_HANDLES(AC_GEN_STR)}; -#undef AC_GEN_STR - -static const int MAX_NUM_DEVICES = 32; -static int num_devices = 0; -static Device devices[MAX_NUM_DEVICES] = {}; - -static Grid grid; // A grid consists of num_devices subgrids -static Grid subgrid; - -static int -gridIdx(const Grid grid, const int3 idx) -{ - return idx.x + idx.y * grid.m.x + idx.z * grid.m.x * grid.m.y; -} - -static int3 -gridIdx3d(const Grid grid, const int idx) -{ - return (int3){idx % grid.m.x, (idx % (grid.m.x * grid.m.y)) / grid.m.x, - idx / (grid.m.x * grid.m.y)}; -} - -static void -printInt3(const int3 vec) -{ - printf("(%d, %d, %d)", vec.x, vec.y, vec.z); -} - -static inline void -print(const AcMeshInfo config) -{ - for (int i = 0; i < NUM_INT_PARAMS; ++i) - printf("[%s]: %d\n", intparam_names[i], config.int_params[i]); - for (int i = 0; i < NUM_REAL_PARAMS; ++i) - printf("[%s]: %g\n", realparam_names[i], double(config.real_params[i])); -} - -static void -update_builtin_params(AcMeshInfo* config) -{ - config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER; - ///////////// PAD TEST - // config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER + PAD_SIZE; - ///////////// PAD TEST - config->int_params[AC_my] = config->int_params[AC_ny] + STENCIL_ORDER; - config->int_params[AC_mz] = config->int_params[AC_nz] + STENCIL_ORDER; - - // Bounds for the computational domain, i.e. 
nx_min <= i < nx_max - config->int_params[AC_nx_min] = NGHOST; - config->int_params[AC_nx_max] = config->int_params[AC_nx_min] + config->int_params[AC_nx]; - config->int_params[AC_ny_min] = NGHOST; - config->int_params[AC_ny_max] = config->int_params[AC_ny] + NGHOST; - config->int_params[AC_nz_min] = NGHOST; - config->int_params[AC_nz_max] = config->int_params[AC_nz] + NGHOST; - - /* Additional helper params */ - // Int helpers - config->int_params[AC_mxy] = config->int_params[AC_mx] * config->int_params[AC_my]; - config->int_params[AC_nxy] = config->int_params[AC_nx] * config->int_params[AC_ny]; - config->int_params[AC_nxyz] = config->int_params[AC_nxy] * config->int_params[AC_nz]; - -#if VERBOSE_PRINTING // Defined in astaroth.h - printf("###############################################################\n"); - printf("Config dimensions recalculated:\n"); - print(*config); - printf("###############################################################\n"); -#endif -} - -static Grid -createGrid(const AcMeshInfo config) -{ - Grid grid; - - grid.m = (int3){config.int_params[AC_mx], config.int_params[AC_my], config.int_params[AC_mz]}; - grid.n = (int3){config.int_params[AC_nx], config.int_params[AC_ny], config.int_params[AC_nz]}; - - return grid; - } - -AcResult -acCheckDeviceAvailability(void) -{ - int device_count; // Separate from num_devices to avoid side effects - ERRCHK_CUDA_ALWAYS(cudaGetDeviceCount(&device_count)); - if (device_count > 0) - return AC_SUCCESS; - else - return AC_FAILURE; -} - -AcResult -acSynchronizeStream(const StreamType stream) -{ - // #pragma omp parallel for - for (int i = 0; i < num_devices; ++i) { - synchronize(devices[i], stream); - } - - return AC_SUCCESS; -} - -static AcResult -synchronize_halos(const StreamType stream) -{ - // Exchanges the halos of subgrids - // After this step, the data within the main grid ranging from - // (0, 0, NGHOST) -> grid.m.x, grid.m.y, NGHOST + grid.n.z - // has been synchronized and transferred to appropriate subgrids - - // We loop only to num_devices - 1 since the front and back plates of the grid are not - // transferred because their contents depend on the boundary conditions. - - // IMPORTANT NOTE: the boundary conditions must be applied before calling this function! - // I.e. the halos of subgrids must contain up-to-date data! - // #pragma omp parallel for - for (int i = 0; i < num_devices - 1; ++i) { - const int num_vertices = subgrid.m.x * subgrid.m.y * NGHOST; - // ...|ooooxxx|... -> xxx|ooooooo|... - { - const int3 src = (int3){0, 0, subgrid.n.z}; - const int3 dst = (int3){0, 0, 0}; - copyMeshDeviceToDevice(devices[i], stream, src, devices[(i + 1) % num_devices], dst, - num_vertices); - } - // ...|ooooooo|xxx <- ...|xxxoooo|... - { - const int3 src = (int3){0, 0, NGHOST}; - const int3 dst = (int3){0, 0, NGHOST + subgrid.n.z}; - copyMeshDeviceToDevice(devices[(i + 1) % num_devices], stream, src, devices[i], dst, - num_vertices); - } - } - return AC_SUCCESS; -} - -AcResult -acSynchronizeMesh(void) -{ - acSynchronizeStream(STREAM_ALL); - synchronize_halos(STREAM_DEFAULT); - acSynchronizeStream(STREAM_ALL); - - return AC_SUCCESS; -}
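To make the plate exchange above concrete, a worked example with hypothetical dimensions (not taken from the patch): nx = ny = nz = 128, num_devices = 4, STENCIL_ORDER = 6 and hence NGHOST = 3:

    // grid.n    = (128, 128, 128)   grid.m    = (134, 134, 134)
    // subgrid.n = (128, 128,  32)   subgrid.m = (134, 134,  38)
    // Device i computes the global z-planes [3 + 32*i, 3 + 32*(i + 1)).
    // Each plate moved by synchronize_halos() holds
    //   subgrid.m.x * subgrid.m.y * NGHOST = 134 * 134 * 3 = 53868 vertices
    // per vertex buffer.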
Using only MAX_NUM_DEVICES"); - num_devices = MAX_NUM_DEVICES; - } - if (!AC_MULTIGPU_ENABLED) { - WARNING("MULTIGPU_ENABLED was false. Using only one device"); - num_devices = 1; // Use only one device if multi-GPU is not enabled - } - // Check that num_devices is divisible with AC_nz. This makes decomposing the - // problem domain to multiple GPUs much easier since we do not have to worry - // about remainders - ERRCHK_ALWAYS(config.int_params[AC_nz] % num_devices == 0); - - // Decompose the problem domain - // The main grid - grid = createGrid(config); - - // Subgrids - AcMeshInfo subgrid_config = config; - subgrid_config.int_params[AC_nz] /= num_devices; - update_builtin_params(&subgrid_config); - subgrid = createGrid(subgrid_config); - - // Periodic boundary conditions become weird if the system can "fold unto itself". - ERRCHK_ALWAYS(subgrid.n.x >= STENCIL_ORDER); - ERRCHK_ALWAYS(subgrid.n.y >= STENCIL_ORDER); - ERRCHK_ALWAYS(subgrid.n.z >= STENCIL_ORDER); - -#if VERBOSE_PRINTING - // clang-format off - printf("Grid m "); printInt3(grid.m); printf("\n"); - printf("Grid n "); printInt3(grid.n); printf("\n"); - printf("Subrid m "); printInt3(subgrid.m); printf("\n"); - printf("Subrid n "); printInt3(subgrid.n); printf("\n"); - // clang-format on -#endif - - // Initialize the devices - for (int i = 0; i < num_devices; ++i) { - createDevice(i, subgrid_config, &devices[i]); - loadGlobalGrid(devices[i], grid); - printDeviceInfo(devices[i]); - } - - // Enable peer access - for (int i = 0; i < num_devices; ++i) { - const int front = (i + 1) % num_devices; - const int back = (i - 1 + num_devices) % num_devices; - - int can_access_front, can_access_back; - cudaDeviceCanAccessPeer(&can_access_front, i, front); - cudaDeviceCanAccessPeer(&can_access_back, i, back); -#if VERBOSE_PRINTING - printf( - "Trying to enable peer access from %d to %d (can access: %d) and %d (can access: %d)\n", - i, front, can_access_front, back, can_access_back); -#endif - - cudaSetDevice(i); - if (can_access_front) { - ERRCHK_CUDA_ALWAYS(cudaDeviceEnablePeerAccess(front, 0)); - } - if (can_access_back) { - ERRCHK_CUDA_ALWAYS(cudaDeviceEnablePeerAccess(back, 0)); - } - } - - acSynchronizeStream(STREAM_ALL); - return AC_SUCCESS; -} - -AcResult -acQuit(void) -{ - acSynchronizeStream(STREAM_ALL); - - for (int i = 0; i < num_devices; ++i) { - destroyDevice(devices[i]); - } - return AC_SUCCESS; -} - -AcResult -acIntegrateStepWithOffsetAsync(const int isubstep, const AcReal dt, const int3 start, - const int3 end, const StreamType stream) -{ - // See the beginning of the file for an explanation of the index mapping - // #pragma omp parallel for - for (int i = 0; i < num_devices; ++i) { - // DECOMPOSITION OFFSET HERE - const int3 d0 = (int3){NGHOST, NGHOST, NGHOST + i * subgrid.n.z}; - const int3 d1 = d0 + (int3){subgrid.n.x, subgrid.n.y, subgrid.n.z}; - - const int3 da = max(start, d0); - const int3 db = min(end, d1); - - if (db.z >= da.z) { - const int3 da_local = da - (int3){0, 0, i * subgrid.n.z}; - const int3 db_local = db - (int3){0, 0, i * subgrid.n.z}; - rkStep(devices[i], stream, isubstep, da_local, db_local, dt); - } - } - return AC_SUCCESS; -} - -AcResult -acIntegrateStepWithOffset(const int isubstep, const AcReal dt, const int3 start, const int3 end) -{ - return acIntegrateStepWithOffsetAsync(isubstep, dt, start, end, STREAM_DEFAULT); -} - -AcResult -acIntegrateStepAsync(const int isubstep, const AcReal dt, const StreamType stream) -{ - const int3 start = (int3){NGHOST, NGHOST, NGHOST}; - const int3 end = start + 
grid.n; - return acIntegrateStepWithOffsetAsync(isubstep, dt, start, end, stream); -} - -AcResult -acIntegrateStep(const int isubstep, const AcReal dt) -{ - return acIntegrateStepAsync(isubstep, dt, STREAM_DEFAULT); -} - -static AcResult -local_boundcondstep(const StreamType stream) -{ - if (num_devices == 1) { - boundcondStep(devices[0], stream, (int3){0, 0, 0}, subgrid.m); - } - else { - // Local boundary conditions - // #pragma omp parallel for - for (int i = 0; i < num_devices; ++i) { - const int3 d0 = (int3){0, 0, NGHOST}; // DECOMPOSITION OFFSET HERE - const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.n.z}; - boundcondStep(devices[i], stream, d0, d1); - } - } - return AC_SUCCESS; -} - -static AcResult -global_boundcondstep(const StreamType stream) -{ - if (num_devices > 1) { - // With periodic boundary conditions we exchange the front and back plates of the - // grid. The exchange is done between the first and last device (0 and num_devices - 1). - const int num_vertices = subgrid.m.x * subgrid.m.y * NGHOST; - // ...|ooooxxx|... -> xxx|ooooooo|... - { - const int3 src = (int3){0, 0, subgrid.n.z}; - const int3 dst = (int3){0, 0, 0}; - copyMeshDeviceToDevice(devices[num_devices - 1], stream, src, devices[0], dst, - num_vertices); - } - // ...|ooooooo|xxx <- ...|xxxoooo|... - { - const int3 src = (int3){0, 0, NGHOST}; - const int3 dst = (int3){0, 0, NGHOST + subgrid.n.z}; - copyMeshDeviceToDevice(devices[0], stream, src, devices[num_devices - 1], dst, - num_vertices); - } - } - return AC_SUCCESS; -} - -AcResult -acBoundcondStepAsync(const StreamType stream) -{ - ERRCHK_ALWAYS(stream < NUM_STREAM_TYPES); - - local_boundcondstep(stream); - acSynchronizeStream(stream); - global_boundcondstep(stream); - synchronize_halos(stream); - acSynchronizeStream(stream); - return AC_SUCCESS; -} - -AcResult -acBoundcondStep(void) -{ - return acBoundcondStepAsync(STREAM_DEFAULT); -} - -static AcResult -swap_buffers(void) -{ - // #pragma omp parallel for - for (int i = 0; i < num_devices; ++i) { - swapBuffers(devices[i]); - } - return AC_SUCCESS; -} - -AcResult -acIntegrate(const AcReal dt) -{ - acSynchronizeStream(STREAM_ALL); - for (int isubstep = 0; isubstep < 3; ++isubstep) { - acIntegrateStep(isubstep, dt); // Note: boundaries must be initialized. - swap_buffers(); - acBoundcondStep(); - } - return AC_SUCCESS; -} - -static AcReal -simple_final_reduce_scal(const ReductionType rtype, const AcReal* results, const int n) -{ - AcReal res = results[0]; - for (int i = 1; i < n; ++i) { - if (rtype == RTYPE_MAX) { - res = max(res, results[i]); - } - else if (rtype == RTYPE_MIN) { - res = min(res, results[i]); - } - else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP || rtype == RTYPE_SUM) { - res = sum(res, results[i]); - } - else { - ERROR("Invalid rtype"); - } - } - - if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) { - const AcReal inv_n = AcReal(1.) 
/ (grid.n.x * grid.n.y * grid.n.z); - res = sqrt(inv_n * res); - } - return res; -} - -AcReal -acReduceScal(const ReductionType rtype, const VertexBufferHandle vtxbuffer_handle) -{ - acSynchronizeStream(STREAM_ALL); - - AcReal results[num_devices]; - // #pragma omp parallel for - for (int i = 0; i < num_devices; ++i) { - reduceScal(devices[i], STREAM_DEFAULT, rtype, vtxbuffer_handle, &results[i]); - } - - return simple_final_reduce_scal(rtype, results, num_devices); -} - -AcReal -acReduceVec(const ReductionType rtype, const VertexBufferHandle a, const VertexBufferHandle b, - const VertexBufferHandle c) -{ - acSynchronizeStream(STREAM_ALL); - - AcReal results[num_devices]; - // #pragma omp parallel for - for (int i = 0; i < num_devices; ++i) { - reduceVec(devices[i], STREAM_DEFAULT, rtype, a, b, c, &results[i]); - } - - return simple_final_reduce_scal(rtype, results, num_devices); -} - -AcResult -acLoadWithOffsetAsync(const AcMesh host_mesh, const int3 src, const int num_vertices, - const StreamType stream) -{ - // See the beginning of the file for an explanation of the index mapping - // #pragma omp parallel for - for (int i = 0; i < num_devices; ++i) { - const int3 d0 = (int3){0, 0, i * subgrid.n.z}; // DECOMPOSITION OFFSET HERE - const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.m.z}; - - const int3 s0 = src; - const int3 s1 = gridIdx3d(grid, gridIdx(grid, s0) + num_vertices); - - const int3 da = max(s0, d0); - const int3 db = min(s1, d1); - /* - printf("Device %d\n", i); - printf("\ts0: "); printInt3(s0); printf("\n"); - printf("\td0: "); printInt3(d0); printf("\n"); - printf("\tda: "); printInt3(da); printf("\n"); - printf("\tdb: "); printInt3(db); printf("\n"); - printf("\td1: "); printInt3(d1); printf("\n"); - printf("\ts1: "); printInt3(s1); printf("\n"); - printf("\t-> %s to device %d\n", db.z >= da.z ? 
"Copy" : "Do not copy", i); - */ - if (db.z >= da.z) { - const int copy_cells = gridIdx(subgrid, db) - gridIdx(subgrid, da); - // DECOMPOSITION OFFSET HERE - const int3 da_local = (int3){da.x, da.y, da.z - i * grid.n.z / num_devices}; - // printf("\t\tcopy %d cells to local index ", copy_cells); printInt3(da_local); - // printf("\n"); - copyMeshToDevice(devices[i], stream, host_mesh, da, da_local, copy_cells); - } - // printf("\n"); - } - return AC_SUCCESS; -} - -AcResult -acLoadWithOffset(const AcMesh host_mesh, const int3 src, const int num_vertices) -{ - return acLoadWithOffsetAsync(host_mesh, src, num_vertices, STREAM_DEFAULT); -} - -AcResult -acLoad(const AcMesh host_mesh) -{ - acLoadWithOffset(host_mesh, (int3){0, 0, 0}, acVertexBufferSize(host_mesh.info)); - acSynchronizeStream(STREAM_ALL); - return AC_SUCCESS; -} - -AcResult -acStoreWithOffsetAsync(const int3 src, const int num_vertices, AcMesh* host_mesh, - const StreamType stream) -{ - // See the beginning of the file for an explanation of the index mapping - // #pragma omp parallel for - for (int i = 0; i < num_devices; ++i) { - const int3 d0 = (int3){0, 0, i * subgrid.n.z}; // DECOMPOSITION OFFSET HERE - const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.m.z}; - - const int3 s0 = src; - const int3 s1 = gridIdx3d(grid, gridIdx(grid, s0) + num_vertices); - - const int3 da = max(s0, d0); - const int3 db = min(s1, d1); - if (db.z >= da.z) { - const int copy_cells = gridIdx(subgrid, db) - gridIdx(subgrid, da); - // DECOMPOSITION OFFSET HERE - const int3 da_local = (int3){da.x, da.y, da.z - i * grid.n.z / num_devices}; - copyMeshToHost(devices[i], stream, da_local, da, copy_cells, host_mesh); - } - } - return AC_SUCCESS; -} - -AcResult -acStoreWithOffset(const int3 src, const int num_vertices, AcMesh* host_mesh) -{ - return acStoreWithOffsetAsync(src, num_vertices, host_mesh, STREAM_DEFAULT); -} - -AcResult -acStore(AcMesh* host_mesh) -{ - acStoreWithOffset((int3){0, 0, 0}, acVertexBufferSize(host_mesh->info), host_mesh); - acSynchronizeStream(STREAM_ALL); - return AC_SUCCESS; -} - -AcResult -acLoadDeviceConstantAsync(const AcRealParam param, const AcReal value, const StreamType stream) -{ - // #pragma omp parallel for - for (int i = 0; i < num_devices; ++i) { - loadDeviceConstant(devices[i], stream, param, value); - } - return AC_SUCCESS; -} - -AcResult -acLoadDeviceConstant(const AcRealParam param, const AcReal value) -{ - return acLoadDeviceConstantAsync(param, value, STREAM_DEFAULT); -} - -/* - * ============================================================================= - * Revised interface - * ============================================================================= - */ -======= ->>>>>>> Stashed changes +#include "astaroth_defines.h" diff --git a/src/core/device.cu b/src/core/device.cu index c046d18..5d7dd69 100644 --- a/src/core/device.cu +++ b/src/core/device.cu @@ -16,507 +16,7 @@ You should have received a copy of the GNU General Public License along with Astaroth. If not, see . */ - -/** - * @file - * \brief Brief info. - * - * Detailed info. 
- * - */ -#include "device.cuh" - -#include "errchk.h" - -// Device info -#define REGISTERS_PER_THREAD (255) -#define MAX_REGISTERS_PER_BLOCK (65536) -#define MAX_THREADS_PER_BLOCK (1024) -#define WARP_SIZE (32) - -typedef struct { - AcReal* in[NUM_VTXBUF_HANDLES]; - AcReal* out[NUM_VTXBUF_HANDLES]; -} VertexBufferArray; - -__constant__ AcMeshInfo d_mesh_info; -__constant__ int3 d_multigpu_offset; -__constant__ Grid globalGrid; -#define DCONST_INT(X) (d_mesh_info.int_params[X]) -#define DCONST_INT3(X) (d_mesh_info.int3_params[X]) -#define DCONST_REAL(X) (d_mesh_info.real_params[X]) -#define DCONST_REAL3(X) (d_mesh_info.real3_params[X]) -#define DEVICE_VTXBUF_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_mx) + (k)*DCONST_INT(AC_mxy)) -#define DEVICE_1D_COMPDOMAIN_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_nx) + (k)*DCONST_INT(AC_nxy)) -#include "kernels/kernels.cuh" - -static dim3 rk3_tpb = (dim3){32, 1, 4}; - -#if PACKED_DATA_TRANSFERS // Defined in device.cuh -// #include "kernels/pack_unpack.cuh" -#endif +#include "astaroth_device.h" struct device_s { - int id; - AcMeshInfo local_config; - - // Concurrency - cudaStream_t streams[NUM_STREAM_TYPES]; - - // Memory - VertexBufferArray vba; - AcReal* reduce_scratchpad; - AcReal* reduce_result; - -#if PACKED_DATA_TRANSFERS -// Declare memory for buffers needed for packed data transfers here -// AcReal* data_packing_buffer; -#endif }; - -AcResult -printDeviceInfo(const Device device) -{ - const int device_id = device->id; - - cudaDeviceProp props; - cudaGetDeviceProperties(&props, device_id); - printf("--------------------------------------------------\n"); - printf("Device Number: %d\n", device_id); - const size_t bus_id_max_len = 128; - char bus_id[bus_id_max_len]; - cudaDeviceGetPCIBusId(bus_id, bus_id_max_len, device_id); - printf(" PCI bus ID: %s\n", bus_id); - printf(" Device name: %s\n", props.name); - printf(" Compute capability: %d.%d\n", props.major, props.minor); - - // Compute - printf(" Compute\n"); - printf(" Clock rate (GHz): %g\n", props.clockRate / 1e6); // KHz -> GHz - printf(" Stream processors: %d\n", props.multiProcessorCount); - printf(" SP to DP flops performance ratio: %d:1\n", props.singleToDoublePrecisionPerfRatio); - printf( - " Compute mode: %d\n", - (int)props - .computeMode); // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g7eb25f5413a962faad0956d92bae10d0 - // Memory - printf(" Global memory\n"); - printf(" Memory Clock Rate (MHz): %d\n", props.memoryClockRate / (1000)); - printf(" Memory Bus Width (bits): %d\n", props.memoryBusWidth); - printf(" Peak Memory Bandwidth (GiB/s): %f\n", - 2 * (props.memoryClockRate * 1e3) * props.memoryBusWidth / (8. * 1024. * 1024. 
* 1024.)); - printf(" ECC enabled: %d\n", props.ECCEnabled); - - // Memory usage - size_t free_bytes, total_bytes; - cudaMemGetInfo(&free_bytes, &total_bytes); - const size_t used_bytes = total_bytes - free_bytes; - printf(" Total global mem: %.2f GiB\n", props.totalGlobalMem / (1024.0 * 1024 * 1024)); - printf(" Gmem used (GiB): %.2f\n", used_bytes / (1024.0 * 1024 * 1024)); - printf(" Gmem memory free (GiB): %.2f\n", free_bytes / (1024.0 * 1024 * 1024)); - printf(" Gmem memory total (GiB): %.2f\n", total_bytes / (1024.0 * 1024 * 1024)); - printf(" Caches\n"); - printf(" Local L1 cache supported: %d\n", props.localL1CacheSupported); - printf(" Global L1 cache supported: %d\n", props.globalL1CacheSupported); - printf(" L2 size: %d KiB\n", props.l2CacheSize / (1024)); - // MV: props.totalConstMem and props.sharedMemPerBlock cause assembler error - // MV: while compiling in TIARA gp cluster. Therefore commented out. - //!! printf(" Total const mem: %ld KiB\n", props.totalConstMem / (1024)); - //!! printf(" Shared mem per block: %ld KiB\n", props.sharedMemPerBlock / (1024)); - printf(" Other\n"); - printf(" Warp size: %d\n", props.warpSize); - // printf(" Single to double perf. ratio: %dx\n", - // props.singleToDoublePrecisionPerfRatio); //Not supported with older CUDA - // versions - printf(" Stream priorities supported: %d\n", props.streamPrioritiesSupported); - printf("--------------------------------------------------\n"); - - return AC_SUCCESS; -} - -static __global__ void -dummy_kernel(void) -{ -} - -AcResult -createDevice(const int id, const AcMeshInfo device_config, Device* device_handle) -{ - cudaSetDevice(id); - cudaDeviceReset(); - - // Create Device - struct device_s* device = (struct device_s*)malloc(sizeof(*device)); - ERRCHK_ALWAYS(device); - - device->id = id; - device->local_config = device_config; - - // Check that the code was compiled for the proper GPU architecture - printf("Trying to run a dummy kernel. If this fails, make sure that your\n" - "device supports the CUDA architecture you are compiling for.\n" - "Running dummy kernel... "); - fflush(stdout); - dummy_kernel<<<1, 1>>>(); - ERRCHK_CUDA_KERNEL_ALWAYS(); - printf("Success!\n"); - - // Concurrency - for (int i = 0; i < NUM_STREAM_TYPES; ++i) { - cudaStreamCreateWithPriority(&device->streams[i], cudaStreamNonBlocking, 0); - } - - // Memory - const size_t vba_size_bytes = acVertexBufferSizeBytes(device_config); - for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { - ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.in[i], vba_size_bytes)); - ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.out[i], vba_size_bytes)); - } - ERRCHK_CUDA_ALWAYS( - cudaMalloc(&device->reduce_scratchpad, acVertexBufferCompdomainSizeBytes(device_config))); - ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->reduce_result, sizeof(AcReal))); - -#if PACKED_DATA_TRANSFERS -// Allocate data required for packed transfers here (cudaMalloc) -#endif - - // Device constants - ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_mesh_info, &device_config, sizeof(device_config), 0, - cudaMemcpyHostToDevice)); - - // Multi-GPU offset. This is used to compute globalVertexIdx. - // Might be better to calculate this in astaroth.cu instead of here, s.t.
- everything related to the decomposition is limited to the multi-GPU layer - const int3 multigpu_offset = (int3){0, 0, device->id * device->local_config.int_params[AC_nz]}; - ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_multigpu_offset, &multigpu_offset, - sizeof(multigpu_offset), 0, cudaMemcpyHostToDevice)); - - printf("Created device %d (%p)\n", device->id, device); - *device_handle = device; - - // Auto-optimize - if (id == 0) - autoOptimize(device); - - return AC_SUCCESS; -} - -AcResult -destroyDevice(Device device) -{ - cudaSetDevice(device->id); - printf("Destroying device %d (%p)\n", device->id, device); - - // Memory - for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { - cudaFree(device->vba.in[i]); - cudaFree(device->vba.out[i]); - } - cudaFree(device->reduce_scratchpad); - cudaFree(device->reduce_result); - -#if PACKED_DATA_TRANSFERS -// Free data required for packed transfers here (cudaFree) -#endif - - // Concurrency - for (int i = 0; i < NUM_STREAM_TYPES; ++i) { - cudaStreamDestroy(device->streams[i]); - } - - // Destroy Device - free(device); - return AC_SUCCESS; -} - -AcResult -boundcondStep(const Device device, const StreamType stream_type, const int3& start, const int3& end) -{ - cudaSetDevice(device->id); - for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { - periodic_boundconds(device->streams[stream_type], start, end, device->vba.in[i]); - } - return AC_SUCCESS; -} - -AcResult -reduceScal(const Device device, const StreamType stream_type, const ReductionType rtype, - const VertexBufferHandle vtxbuf_handle, AcReal* result) -{ - cudaSetDevice(device->id); - - const int3 start = (int3){device->local_config.int_params[AC_nx_min], - device->local_config.int_params[AC_ny_min], - device->local_config.int_params[AC_nz_min]}; - - const int3 end = (int3){device->local_config.int_params[AC_nx_max], - device->local_config.int_params[AC_ny_max], - device->local_config.int_params[AC_nz_max]}; - - *result = reduce_scal(device->streams[stream_type], rtype, start, end, - device->vba.in[vtxbuf_handle], device->reduce_scratchpad, - device->reduce_result); - return AC_SUCCESS; -} - -AcResult -reduceVec(const Device device, const StreamType stream_type, const ReductionType rtype, - const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1, - const VertexBufferHandle vtxbuf2, AcReal* result) -{ - cudaSetDevice(device->id); - - const int3 start = (int3){device->local_config.int_params[AC_nx_min], - device->local_config.int_params[AC_ny_min], - device->local_config.int_params[AC_nz_min]}; - - const int3 end = (int3){device->local_config.int_params[AC_nx_max], - device->local_config.int_params[AC_ny_max], - device->local_config.int_params[AC_nz_max]}; - - *result = reduce_vec(device->streams[stream_type], rtype, start, end, device->vba.in[vtxbuf0], - device->vba.in[vtxbuf1], device->vba.in[vtxbuf2], - device->reduce_scratchpad, device->reduce_result); - return AC_SUCCESS; -} - -AcResult -rkStep(const Device device, const StreamType stream_type, const int step_number, const int3& start, - const int3& end, const AcReal dt) -{ - cudaSetDevice(device->id); - - // const dim3 tpb(32, 1, 4); - const dim3 tpb = rk3_tpb; - - const int3 n = end - start; - const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), // - (unsigned int)ceil(n.y / AcReal(tpb.y)), // - (unsigned int)ceil(n.z / AcReal(tpb.z))); - - if (step_number == 0) - solve<0><<<bpg, tpb, 0, device->streams[stream_type]>>>(start, end, device->vba, dt); - else if (step_number == 1) - solve<1><<<bpg, tpb, 0, device->streams[stream_type]>>>(start, end, device->vba, dt); - else -
solve<2><<<bpg, tpb, 0, device->streams[stream_type]>>>(start, end, device->vba, dt); - - ERRCHK_CUDA_KERNEL(); - - return AC_SUCCESS; -} - -AcResult -synchronize(const Device device, const StreamType stream_type) -{ - cudaSetDevice(device->id); - if (stream_type == STREAM_ALL) { - cudaDeviceSynchronize(); - } - else { - cudaStreamSynchronize(device->streams[stream_type]); - } - return AC_SUCCESS; -} - -static AcResult -loadWithOffset(const Device device, const StreamType stream_type, const AcReal* src, - const size_t bytes, AcReal* dst) -{ - cudaSetDevice(device->id); - ERRCHK_CUDA( - cudaMemcpyAsync(dst, src, bytes, cudaMemcpyHostToDevice, device->streams[stream_type])); - return AC_SUCCESS; -} - -static AcResult -storeWithOffset(const Device device, const StreamType stream_type, const AcReal* src, - const size_t bytes, AcReal* dst) -{ - cudaSetDevice(device->id); - ERRCHK_CUDA( - cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDeviceToHost, device->streams[stream_type])); - return AC_SUCCESS; -} - -AcResult -copyMeshToDevice(const Device device, const StreamType stream_type, const AcMesh& host_mesh, - const int3& src, const int3& dst, const int num_vertices) -{ - const size_t src_idx = acVertexBufferIdx(src.x, src.y, src.z, host_mesh.info); - const size_t dst_idx = acVertexBufferIdx(dst.x, dst.y, dst.z, device->local_config); - for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { - loadWithOffset(device, stream_type, &host_mesh.vertex_buffer[i][src_idx], - num_vertices * sizeof(AcReal), &device->vba.in[i][dst_idx]); - } - return AC_SUCCESS; -} - -AcResult -copyMeshToHost(const Device device, const StreamType stream_type, const int3& src, const int3& dst, - const int num_vertices, AcMesh* host_mesh) -{ - const size_t src_idx = acVertexBufferIdx(src.x, src.y, src.z, device->local_config); - const size_t dst_idx = acVertexBufferIdx(dst.x, dst.y, dst.z, host_mesh->info); - for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { - storeWithOffset(device, stream_type, &device->vba.in[i][src_idx], - num_vertices * sizeof(AcReal), &host_mesh->vertex_buffer[i][dst_idx]); - } - return AC_SUCCESS; -} - -AcResult -copyMeshDeviceToDevice(const Device src_device, const StreamType stream_type, const int3& src, - Device dst_device, const int3& dst, const int num_vertices) -{ - cudaSetDevice(src_device->id); - const size_t src_idx = acVertexBufferIdx(src.x, src.y, src.z, src_device->local_config); - const size_t dst_idx = acVertexBufferIdx(dst.x, dst.y, dst.z, dst_device->local_config); - - for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { - ERRCHK_CUDA(cudaMemcpyPeerAsync(&dst_device->vba.in[i][dst_idx], dst_device->id, - &src_device->vba.in[i][src_idx], src_device->id, - sizeof(src_device->vba.in[i][0]) * num_vertices, - src_device->streams[stream_type])); - } - return AC_SUCCESS; -} - -AcResult -swapBuffers(const Device device) -{ - cudaSetDevice(device->id); - for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { - AcReal* tmp = device->vba.in[i]; - device->vba.in[i] = device->vba.out[i]; - device->vba.out[i] = tmp; - } - return AC_SUCCESS; -} - -AcResult -loadDeviceConstant(const Device device, const StreamType stream_type, const AcIntParam param, - const int value) -{ - cudaSetDevice(device->id); - // CUDA 10 apparently creates only a single name for a device constant (d_mesh_info here) - // and something like d_mesh_info.real_params[] cannot be directly accessed. - // Therefore we have to obfuscate the code a bit and compute the offset address before - // invoking cudaMemcpyToSymbol.
- const size_t offset = (size_t)&d_mesh_info.int_params[param] - (size_t)&d_mesh_info; - ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset, - cudaMemcpyHostToDevice, - device->streams[stream_type])); - return AC_SUCCESS; -} - -AcResult -loadDeviceConstant(const Device device, const StreamType stream_type, const AcRealParam param, - const AcReal value) -{ - cudaSetDevice(device->id); - const size_t offset = (size_t)&d_mesh_info.real_params[param] - (size_t)&d_mesh_info; - ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset, - cudaMemcpyHostToDevice, - device->streams[stream_type])); - return AC_SUCCESS; -} - -AcResult -loadGlobalGrid(const Device device, const Grid grid) -{ - cudaSetDevice(device->id); - ERRCHK_CUDA_ALWAYS( - cudaMemcpyToSymbol(globalGrid, &grid, sizeof(grid), 0, cudaMemcpyHostToDevice)); - return AC_SUCCESS; -} - -AcResult -autoOptimize(const Device device) -{ - cudaSetDevice(device->id); - - // RK3 - const int3 start = (int3){NGHOST, NGHOST, NGHOST}; - const int3 end = start + (int3){device->local_config.int_params[AC_nx], // - device->local_config.int_params[AC_ny], // - device->local_config.int_params[AC_nz]}; - - dim3 best_dims(0, 0, 0); - float best_time = INFINITY; - const int num_iterations = 10; - - for (int z = 1; z <= MAX_THREADS_PER_BLOCK; ++z) { - for (int y = 1; y <= MAX_THREADS_PER_BLOCK; ++y) { - for (int x = WARP_SIZE; x <= MAX_THREADS_PER_BLOCK; x += WARP_SIZE) { - - if (x > end.x - start.x || y > end.y - start.y || z > end.z - start.z) - break; - if (x * y * z > MAX_THREADS_PER_BLOCK) - break; - - if (x * y * z * REGISTERS_PER_THREAD > MAX_REGISTERS_PER_BLOCK) - break; - - if (((x * y * z) % WARP_SIZE) != 0) - continue; - - const dim3 tpb(x, y, z); - const int3 n = end - start; - const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), // - (unsigned int)ceil(n.y / AcReal(tpb.y)), // - (unsigned int)ceil(n.z / AcReal(tpb.z))); - - cudaDeviceSynchronize(); - if (cudaGetLastError() != cudaSuccess) // resets the error if any - continue; - - // printf("(%d, %d, %d)\n", x, y, z); - - cudaEvent_t tstart, tstop; - cudaEventCreate(&tstart); - cudaEventCreate(&tstop); - - cudaEventRecord(tstart); // ---------------------------------------- Timing start - - for (int i = 0; i < num_iterations; ++i) - solve<2><<<bpg, tpb>>>(start, end, device->vba, FLT_EPSILON); - - cudaEventRecord(tstop); // ----------------------------------------- Timing end - cudaEventSynchronize(tstop); - float milliseconds = 0; - cudaEventElapsedTime(&milliseconds, tstart, tstop); - - ERRCHK_CUDA_KERNEL_ALWAYS(); - if (milliseconds < best_time) { - best_time = milliseconds; - best_dims = tpb; - } - } - } - } -#if VERBOSE_PRINTING - printf( - "Auto-optimization done.
The best threadblock dimensions for rkStep: (%d, %d, %d) %f ms\n", - best_dims.x, best_dims.y, best_dims.z, double(best_time) / num_iterations); -#endif - /* - FILE* fp = fopen("../config/rk3_tbdims.cuh", "w"); - ERRCHK(fp); - fprintf(fp, "%d, %d, %d\n", best_dims.x, best_dims.y, best_dims.z); - fclose(fp); - */ - - rk3_tpb = best_dims; - return AC_SUCCESS; -} - -#if PACKED_DATA_TRANSFERS -// Functions for calling packed data transfers -#endif - -/* - * ============================================================================= - * Revised interface - * ============================================================================= - */ diff --git a/src/core/grid.cu b/src/core/grid.cu index e69de29..83c7453 100644 --- a/src/core/grid.cu +++ b/src/core/grid.cu @@ -0,0 +1,19 @@ +/* + Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae. + + This file is part of Astaroth. + + Astaroth is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Astaroth is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Astaroth. If not, see <http://www.gnu.org/licenses/>. +*/ +#include "astaroth_grid.h" diff --git a/src/core/node.cu b/src/core/node.cu index c3bfc05..1c3d5cf 100644 --- a/src/core/node.cu +++ b/src/core/node.cu @@ -16,7 +16,7 @@ You should have received a copy of the GNU General Public License along with Astaroth. If not, see <http://www.gnu.org/licenses/>. */ -// #include "astaroth_node.h" +#include "astaroth_node.h" struct node_s { };
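A usage sketch of the revised Stream-based interface declared in astaroth_device.h above (device setup is elided; STREAM_DEFAULT, the computational-domain extents nn/mm and the vertex buffer handle are assumed here and are not shown by this patch):

    // Apply periodic boundary conditions over the full local mesh, advance one
    // RK3 substep over the computational domain and reduce the maximum of one
    // vertex buffer, passing the same Stream to keep the calls ordered.
    const int3 start = (int3){NGHOST, NGHOST, NGHOST};
    const int3 end   = (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST + nn.z};
    acDevicePeriodicBoundcondStep(device, STREAM_DEFAULT, (int3){0, 0, 0}, mm);
    acDeviceIntegrateSubstep(device, STREAM_DEFAULT, 0, start, end, dt);
    AcReal result;
    acDeviceReduceScal(device, STREAM_DEFAULT, RTYPE_MAX, vtxbuf_handle, &result);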