diff --git a/CMakeLists.txt b/CMakeLists.txt index 9b47e13..a3da8a2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,7 +12,7 @@ #-------------------General---------------------------------------------------# -project(ASTAROTH_2.0 CXX) +project(ASTAROTH_2.0 C CXX) set (CMAKE_CXX_STANDARD 11) cmake_minimum_required (VERSION 3.5.1) # Need >= 3.8 for first-class CUDA support cmake_policy (SET CMP0023 NEW) @@ -181,3 +181,5 @@ if (BUILD_STANDALONE) cuda_add_executable(ac_run src/standalone/main.cc) target_link_libraries(ac_run astaroth_standalone astaroth_core ${SDL2_LIBRARY}) endif() + +add_subdirectory(ctest) diff --git a/ctest/CMakeLists.txt b/ctest/CMakeLists.txt new file mode 100644 index 0000000..b2c84bf --- /dev/null +++ b/ctest/CMakeLists.txt @@ -0,0 +1,3 @@ +include_directories(${CMAKE_SOURCE_DIR}/include) +add_executable(ctest main.c) +target_link_libraries(ctest astaroth_core) diff --git a/ctest/main.c b/ctest/main.c new file mode 100644 index 0000000..5ed5ffc --- /dev/null +++ b/ctest/main.c @@ -0,0 +1,17 @@ +#include +#include + +#include "astaroth.h" + +int +main(void) +{ + AcMeshInfo info = { + .int_params[AC_mx] = 128, + .int_params[AC_my] = 64, + .int_params[AC_mz] = 32, + }; + acInit(info); + acQuit(); + return EXIT_SUCCESS; +} diff --git a/include/astaroth.h b/include/astaroth.h index be0081e..52ba3d0 100644 --- a/include/astaroth.h +++ b/include/astaroth.h @@ -40,7 +40,7 @@ AcResult acSynchronizeMesh(void); initialization of *all memory needed on all GPUs in the node*. In other words, setups everything GPU-side so that calling any other GPU interface function afterwards does not result in illegal memory accesses. */ -AcResult acInit(const AcMeshInfo& mesh_info); +AcResult acInit(const AcMeshInfo mesh_info); /** Frees all GPU allocations and resets all devices in the node. Should be * called at exit. */ @@ -49,23 +49,23 @@ AcResult acQuit(void); /** Does all three substeps of the RK3 integration and computes the boundary conditions when necessary. The result is synchronized and the boundary conditions are applied after the final substep, after which the result can be fetched to CPU memory with acStore. */ -AcResult acIntegrate(const AcReal& dt); +AcResult acIntegrate(const AcReal dt); /** Performs a scalar reduction on all GPUs in the node and returns the result. Operates on the * whole computational domain, which must be up to date and synchronized before calling * acReduceScal. */ -AcReal acReduceScal(const ReductionType& rtype, const VertexBufferHandle& a); +AcReal acReduceScal(const ReductionType rtype, const VertexBufferHandle a); /** Performs a vector reduction on all GPUs in the node and returns the result. Operates on the * whole computational domain, which must be up to date and synchronized before calling * acReduceVec. */ -AcReal acReduceVec(const ReductionType& rtype, const VertexBufferHandle& a, - const VertexBufferHandle& b, const VertexBufferHandle& c); +AcReal acReduceVec(const ReductionType rtype, const VertexBufferHandle a, + const VertexBufferHandle b, const VertexBufferHandle c); /** Distributes the host mesh among the GPUs in the node. Synchronous. */ -AcResult acLoad(const AcMesh& host_mesh); +AcResult acLoad(const AcMesh host_mesh); /** Gathers the mesh stored across GPUs in the node and stores it back to host memory. Synchronous. */ @@ -82,32 +82,38 @@ AcResult acLoadDeviceConstantAsync(const AcRealParam param, const AcReal value, const StreamType stream); /** Splits a subset of the host_mesh and distributes it among the GPUs in the node. Asynchronous. */ -AcResult acLoadWithOffset(const AcMesh& host_mesh, const int3& start, const int num_vertices); -AcResult acLoadWithOffsetAsync(const AcMesh& host_mesh, const int3& start, const int num_vertices, +AcResult acLoadWithOffset(const AcMesh host_mesh, const int3 start, const int num_vertices); +AcResult acLoadWithOffsetAsync(const AcMesh host_mesh, const int3 start, const int num_vertices, const StreamType stream); /** Gathers a subset of the data distributed among the GPUs in the node and stores the mesh back to * CPU memory. Asynchronous. */ -AcResult acStoreWithOffset(const int3& start, const int num_vertices, AcMesh* host_mesh); -AcResult acStoreWithOffsetAsync(const int3& start, const int num_vertices, AcMesh* host_mesh, +AcResult acStoreWithOffset(const int3 start, const int num_vertices, AcMesh* host_mesh); +AcResult acStoreWithOffsetAsync(const int3 start, const int num_vertices, AcMesh* host_mesh, const StreamType stream); /** Performs a single RK3 step without computing boundary conditions. Asynchronous.*/ -AcResult acIntegrateStep(const int& isubstep, const AcReal& dt); -AcResult acIntegrateStepAsync(const int& isubstep, const AcReal& dt, const StreamType stream); +AcResult acIntegrateStep(const int isubstep, const AcReal dt); +AcResult acIntegrateStepAsync(const int isubstep, const AcReal dt, const StreamType stream); /** Performs a single RK3 step on a subset of the mesh without computing the boundary conditions. * Asynchronous.*/ -AcResult acIntegrateStepWithOffset(const int& isubstep, const AcReal& dt, const int3& start, - const int3& end); -AcResult acIntegrateStepWithOffsetAsync(const int& isubstep, const AcReal& dt, const int3& start, - const int3& end, const StreamType stream); +AcResult acIntegrateStepWithOffset(const int isubstep, const AcReal dt, const int3 start, + const int3 end); +AcResult acIntegrateStepWithOffsetAsync(const int isubstep, const AcReal dt, const int3 start, + const int3 end, const StreamType stream); /** Performs the boundary condition step on the GPUs in the node. Asynchronous. */ AcResult acBoundcondStep(void); AcResult acBoundcondStepAsync(const StreamType stream); +/* + * ============================================================================= + * Revised interface + * ============================================================================= + */ + #ifdef __cplusplus } // extern "C" #endif diff --git a/include/astaroth_defines.h b/include/astaroth_defines.h index ac9804b..0915d79 100644 --- a/include/astaroth_defines.h +++ b/include/astaroth_defines.h @@ -140,31 +140,31 @@ typedef struct { * ============================================================================= */ static inline size_t -acVertexBufferSize(const AcMeshInfo& info) +acVertexBufferSize(const AcMeshInfo info) { return info.int_params[AC_mx] * info.int_params[AC_my] * info.int_params[AC_mz]; } static inline size_t -acVertexBufferSizeBytes(const AcMeshInfo& info) +acVertexBufferSizeBytes(const AcMeshInfo info) { return sizeof(AcReal) * acVertexBufferSize(info); } static inline size_t -acVertexBufferCompdomainSize(const AcMeshInfo& info) +acVertexBufferCompdomainSize(const AcMeshInfo info) { return info.int_params[AC_nx] * info.int_params[AC_ny] * info.int_params[AC_nz]; } static inline size_t -acVertexBufferCompdomainSizeBytes(const AcMeshInfo& info) +acVertexBufferCompdomainSizeBytes(const AcMeshInfo info) { return sizeof(AcReal) * acVertexBufferCompdomainSize(info); } static inline size_t -acVertexBufferIdx(const int i, const int j, const int k, const AcMeshInfo& info) +acVertexBufferIdx(const int i, const int j, const int k, const AcMeshInfo info) { return i + // j * info.int_params[AC_mx] + // @@ -173,25 +173,25 @@ acVertexBufferIdx(const int i, const int j, const int k, const AcMeshInfo& info) /* static inline int -acGetParam(const AcMeshInfo& info, const AcIntParam param) +acGetParam(const AcMeshInfo info, const AcIntParam param) { return info.int_params[param]; } static inline int3 -acGetParam(const AcMeshInfo& info, const AcInt3Param param) +acGetParam(const AcMeshInfo info, const AcInt3Param param) { return info.int3_params[param]; } static inline AcReal -acGetParam(const AcMeshInfo& info, const AcRealParam param) +acGetParam(const AcMeshInfo info, const AcRealParam param) { return info.real_params[param]; } static inline AcReal3 -acGetParam(const AcMeshInfo& info, const AcReal3Param param) +acGetParam(const AcMeshInfo info, const AcReal3Param param) { return info.real3_params[param]; } diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index 68cdb25..79560c9 100644 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -57,3 +57,4 @@ file(GLOB CUDA_SOURCES "*.cu" "kernels/*.cu") # With fpic: 4.96 user, 4.02 system, 0:09.90 elapsed # With fPIC: 4.94 user, 4.05 system, 0:10.23 elapsed CUDA_ADD_LIBRARY(astaroth_core STATIC ${CUDA_SOURCES} OPTIONS --compiler-options "-fpic") +target_link_libraries(astaroth_core m) diff --git a/src/core/astaroth.cu b/src/core/astaroth.cu index fadf4c0..05af3d3 100644 --- a/src/core/astaroth.cu +++ b/src/core/astaroth.cu @@ -127,8 +127,8 @@ #include "errchk.h" #include "device.cuh" -#include "math_utils.h" // sum for reductions -#include "standalone/config_loader.h" // update_config +#include "math_utils.h" // sum for reductions +// #include "standalone/config_loader.h" // update_config #define AC_GEN_STR(X) #X const char* intparam_names[] = {AC_FOR_BUILTIN_INT_PARAM_TYPES(AC_GEN_STR) // @@ -156,7 +156,7 @@ gridIdx(const Grid grid, const int3 idx) } static int3 -gridIdx3d(const Grid& grid, const int idx) +gridIdx3d(const Grid grid, const int idx) { return (int3){idx % grid.m.x, (idx % (grid.m.x * grid.m.y)) / grid.m.x, idx / (grid.m.x * grid.m.y)}; @@ -168,8 +168,49 @@ printInt3(const int3 vec) printf("(%d, %d, %d)", vec.x, vec.y, vec.z); } +static inline void +print(const AcMeshInfo config) +{ + for (int i = 0; i < NUM_INT_PARAMS; ++i) + printf("[%s]: %d\n", intparam_names[i], config.int_params[i]); + for (int i = 0; i < NUM_REAL_PARAMS; ++i) + printf("[%s]: %g\n", realparam_names[i], double(config.real_params[i])); +} + +static void +update_builtin_params(AcMeshInfo* config) +{ + config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER; + ///////////// PAD TEST + // config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER + PAD_SIZE; + ///////////// PAD TEST + config->int_params[AC_my] = config->int_params[AC_ny] + STENCIL_ORDER; + config->int_params[AC_mz] = config->int_params[AC_nz] + STENCIL_ORDER; + + // Bounds for the computational domain, i.e. nx_min <= i < nx_max + config->int_params[AC_nx_min] = NGHOST; + config->int_params[AC_nx_max] = config->int_params[AC_nx_min] + config->int_params[AC_nx]; + config->int_params[AC_ny_min] = NGHOST; + config->int_params[AC_ny_max] = config->int_params[AC_ny] + NGHOST; + config->int_params[AC_nz_min] = NGHOST; + config->int_params[AC_nz_max] = config->int_params[AC_nz] + NGHOST; + + /* Additional helper params */ + // Int helpers + config->int_params[AC_mxy] = config->int_params[AC_mx] * config->int_params[AC_my]; + config->int_params[AC_nxy] = config->int_params[AC_nx] * config->int_params[AC_ny]; + config->int_params[AC_nxyz] = config->int_params[AC_nxy] * config->int_params[AC_nz]; + +#if VERBOSE_PRINTING // Defined in astaroth.h + printf("###############################################################\n"); + printf("Config dimensions recalculated:\n"); + print(*config); + printf("###############################################################\n"); +#endif +} + static Grid -createGrid(const AcMeshInfo& config) +createGrid(const AcMeshInfo config) { Grid grid; @@ -246,7 +287,7 @@ acSynchronizeMesh(void) } AcResult -acInit(const AcMeshInfo& config) +acInit(const AcMeshInfo config) { // Get num_devices ERRCHK_CUDA_ALWAYS(cudaGetDeviceCount(&num_devices)); @@ -274,7 +315,7 @@ acInit(const AcMeshInfo& config) // Subgrids AcMeshInfo subgrid_config = config; subgrid_config.int_params[AC_nz] /= num_devices; - update_config(&subgrid_config); + update_builtin_params(&subgrid_config); subgrid = createGrid(subgrid_config); // Periodic boundary conditions become weird if the system can "fold unto itself". @@ -337,8 +378,8 @@ acQuit(void) } AcResult -acIntegrateStepWithOffsetAsync(const int& isubstep, const AcReal& dt, const int3& start, - const int3& end, const StreamType stream) +acIntegrateStepWithOffsetAsync(const int isubstep, const AcReal dt, const int3 start, + const int3 end, const StreamType stream) { // See the beginning of the file for an explanation of the index mapping // #pragma omp parallel for @@ -360,13 +401,13 @@ acIntegrateStepWithOffsetAsync(const int& isubstep, const AcReal& dt, const int3 } AcResult -acIntegrateStepWithOffset(const int& isubstep, const AcReal& dt, const int3& start, const int3& end) +acIntegrateStepWithOffset(const int isubstep, const AcReal dt, const int3 start, const int3 end) { return acIntegrateStepWithOffsetAsync(isubstep, dt, start, end, STREAM_DEFAULT); } AcResult -acIntegrateStepAsync(const int& isubstep, const AcReal& dt, const StreamType stream) +acIntegrateStepAsync(const int isubstep, const AcReal dt, const StreamType stream) { const int3 start = (int3){NGHOST, NGHOST, NGHOST}; const int3 end = start + grid.n; @@ -374,7 +415,7 @@ acIntegrateStepAsync(const int& isubstep, const AcReal& dt, const StreamType str } AcResult -acIntegrateStep(const int& isubstep, const AcReal& dt) +acIntegrateStep(const int isubstep, const AcReal dt) { return acIntegrateStepAsync(isubstep, dt, STREAM_DEFAULT); } @@ -452,7 +493,7 @@ swap_buffers(void) } AcResult -acIntegrate(const AcReal& dt) +acIntegrate(const AcReal dt) { acSynchronizeStream(STREAM_ALL); for (int isubstep = 0; isubstep < 3; ++isubstep) { @@ -464,7 +505,7 @@ acIntegrate(const AcReal& dt) } static AcReal -simple_final_reduce_scal(const ReductionType& rtype, const AcReal* results, const int& n) +simple_final_reduce_scal(const ReductionType rtype, const AcReal* results, const int n) { AcReal res = results[0]; for (int i = 1; i < n; ++i) { @@ -491,7 +532,7 @@ simple_final_reduce_scal(const ReductionType& rtype, const AcReal* results, cons } AcReal -acReduceScal(const ReductionType& rtype, const VertexBufferHandle& vtxbuffer_handle) +acReduceScal(const ReductionType rtype, const VertexBufferHandle vtxbuffer_handle) { acSynchronizeStream(STREAM_ALL); @@ -505,8 +546,8 @@ acReduceScal(const ReductionType& rtype, const VertexBufferHandle& vtxbuffer_han } AcReal -acReduceVec(const ReductionType& rtype, const VertexBufferHandle& a, const VertexBufferHandle& b, - const VertexBufferHandle& c) +acReduceVec(const ReductionType rtype, const VertexBufferHandle a, const VertexBufferHandle b, + const VertexBufferHandle c) { acSynchronizeStream(STREAM_ALL); @@ -520,7 +561,7 @@ acReduceVec(const ReductionType& rtype, const VertexBufferHandle& a, const Verte } AcResult -acLoadWithOffsetAsync(const AcMesh& host_mesh, const int3& src, const int num_vertices, +acLoadWithOffsetAsync(const AcMesh host_mesh, const int3 src, const int num_vertices, const StreamType stream) { // See the beginning of the file for an explanation of the index mapping @@ -558,13 +599,13 @@ acLoadWithOffsetAsync(const AcMesh& host_mesh, const int3& src, const int num_ve } AcResult -acLoadWithOffset(const AcMesh& host_mesh, const int3& src, const int num_vertices) +acLoadWithOffset(const AcMesh host_mesh, const int3 src, const int num_vertices) { return acLoadWithOffsetAsync(host_mesh, src, num_vertices, STREAM_DEFAULT); } AcResult -acLoad(const AcMesh& host_mesh) +acLoad(const AcMesh host_mesh) { acLoadWithOffset(host_mesh, (int3){0, 0, 0}, acVertexBufferSize(host_mesh.info)); acSynchronizeStream(STREAM_ALL); @@ -572,7 +613,7 @@ acLoad(const AcMesh& host_mesh) } AcResult -acStoreWithOffsetAsync(const int3& src, const int num_vertices, AcMesh* host_mesh, +acStoreWithOffsetAsync(const int3 src, const int num_vertices, AcMesh* host_mesh, const StreamType stream) { // See the beginning of the file for an explanation of the index mapping @@ -597,7 +638,7 @@ acStoreWithOffsetAsync(const int3& src, const int num_vertices, AcMesh* host_mes } AcResult -acStoreWithOffset(const int3& src, const int num_vertices, AcMesh* host_mesh) +acStoreWithOffset(const int3 src, const int num_vertices, AcMesh* host_mesh) { return acStoreWithOffsetAsync(src, num_vertices, host_mesh, STREAM_DEFAULT); } @@ -625,3 +666,9 @@ acLoadDeviceConstant(const AcRealParam param, const AcReal value) { return acLoadDeviceConstantAsync(param, value, STREAM_DEFAULT); } + +/* + * ============================================================================= + * Revised interface + * ============================================================================= + */ diff --git a/src/core/device.cu b/src/core/device.cu index 7b624e3..f4bef17 100644 --- a/src/core/device.cu +++ b/src/core/device.cu @@ -512,3 +512,9 @@ autoOptimize(const Device device) #if PACKED_DATA_TRANSFERS // Functions for calling packed data transfers #endif + +/* + * ============================================================================= + * Revised interface + * ============================================================================= + */ diff --git a/src/core/device.cuh b/src/core/device.cuh index 9d20620..7f1fad4 100644 --- a/src/core/device.cuh +++ b/src/core/device.cuh @@ -99,3 +99,9 @@ AcResult autoOptimize(const Device device); #if PACKED_DATA_TRANSFERS // Declarations used for packed data transfers #endif + +/* + * ============================================================================= + * Revised interface + * ============================================================================= + */