Merged in cmakelist_rewrite_and_C_API_conformity_07-26 (pull request #1)

2019-08-07 06:53:17 +03:00
parent a6fca069a7 abf4815174
commit cf6b75f82a
27 changed files with 415 additions and 334 deletions
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -2,58 +2,36 @@
 ##  CMakeLists.txt for Astaroth Core  ##
 ########################################

-#----------------------Find CUDA-----------------------------------------------#
-
+## Find packages
 find_package(CUDA 9 REQUIRED)

-#----------------------CUDA settings-------------------------------------------#
-
-set(CUDA_SEPARABLE_COMPILATION OFF)
-set(CUDA_PROPAGATE_HOST_FLAGS ON)
-
-#----------------------Setup CUDA compilation flags----------------------------#
-
-# Generate code for the default architecture (Pascal)
+## Architecture and optimization flags
 set(CUDA_ARCH_FLAGS -gencode arch=compute_37,code=sm_37
                    -gencode arch=compute_50,code=sm_50
                    -gencode arch=compute_60,code=sm_60
                    -gencode arch=compute_61,code=sm_61
                    -lineinfo
-                    -ftz=true
-                    -std=c++11) #--maxrregcount=255 -ftz=true #ftz = flush denormalized floats to zero
-# -Xptxas -dlcm=ca opt-in to cache all global loads to L1/texture cache
-# =cg to opt out
-
-# Additional CUDA optimization flags
-if (CMAKE_BUILD_TYPE MATCHES RELEASE)
-    # Doesn't set any additional flags, see CUDA_NVCC_FLAGS_DEBUG below on how
-    # to add more
-    set(CUDA_NVCC_FLAGS_RELEASE ${CUDA_NVCC_FLAGS_RELEASE})
-endif()
-
-# Additional CUDA debug flags
-if (CMAKE_BUILD_TYPE MATCHES DEBUG)
-    # The debug flags must be set inside this if clause, since either CMake 3.5
-    # or nvcc 7.5 is bugged:
-    # CMake converts these into empty strings when doing RELEASE build, but nvcc
-    # 7.5 fails to parse empty flags.
-    set(CUDA_NVCC_FLAGS_DEBUG ${CUDA_NVCC_FLAGS_DEBUG};
-                               --device-debug;
-                               --generate-line-info;
-                               --ptxas-options=-v)
-endif()
-
-set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};${CUDA_ARCH_FLAGS}")
+                    -ftz=true # Flush denormalized floats to zero
+                    -std=c++11)
+                    # Unused: --maxrregcount=255
+                    # Unused: -Xptxas -dlcm=ca opts in to caching all global loads in L1/texture cache;
+                    #         use -Xptxas -dlcm=cg to opt out


-message("CUDA_NVCC_FLAGS: " ${CUDA_NVCC_FLAGS})
+set(CUDA_WARNING_FLAGS --compiler-options -Wall,-Wextra,-Werror,-Wdouble-promotion,-Wfloat-conversion) # -Wshadow

+set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${CUDA_ARCH_FLAGS} ${CUDA_WARNING_FLAGS})
+set(CUDA_NVCC_FLAGS_RELEASE)
+set(CUDA_NVCC_FLAGS_DEBUG --device-debug --generate-line-info --ptxas-options=-v)

-#------------------Compile and create a static library-------------------------#
-file(GLOB CUDA_SOURCES "*.cu" "kernels/*.cu")
+## Definitions
+if (MULTIGPU_ENABLED)
+    add_definitions(-DAC_MULTIGPU_ENABLED=1)
+else ()
+    add_definitions(-DAC_MULTIGPU_ENABLED=0)
+endif ()

-# Use -fPIC if -fpic not supported. Some quick non-scientific tests:
-# Without fpic: 4.94 user, 4.04 system, 0:09.88 elapsed
-# With fpic: 4.96 user, 4.02 system, 0:09.90 elapsed
-# With fPIC: 4.94 user, 4.05 system, 0:10.23 elapsed
-CUDA_ADD_LIBRARY(astaroth_core STATIC ${CUDA_SOURCES} OPTIONS --compiler-options "-fpic")
+## Create and link the library
+include_directories(.)
+cuda_add_library(astaroth_core STATIC astaroth.cu device.cu)
+target_link_libraries(astaroth_core m)
--- a/src/core/astaroth.cu
+++ b/src/core/astaroth.cu
@@ -127,8 +127,8 @@
 #include "errchk.h"

 #include "device.cuh"
-#include "math_utils.h"               // sum for reductions
-#include "standalone/config_loader.h" // update_config
+#include "math_utils.h" // sum for reductions
+// #include "standalone/config_loader.h" // update_config

 #define AC_GEN_STR(X) #X
 const char* intparam_names[]   = {AC_FOR_BUILTIN_INT_PARAM_TYPES(AC_GEN_STR) //
@@ -156,7 +156,7 @@ gridIdx(const Grid grid, const int3 idx)
 }

 static int3
-gridIdx3d(const Grid& grid, const int idx)
+gridIdx3d(const Grid grid, const int idx)
 {
    return (int3){idx % grid.m.x, (idx % (grid.m.x * grid.m.y)) / grid.m.x,
                  idx / (grid.m.x * grid.m.y)};
@@ -168,8 +168,49 @@ printInt3(const int3 vec)
    printf("(%d, %d, %d)", vec.x, vec.y, vec.z);
 }

+static inline void
+print(const AcMeshInfo config) // Writes every int and real parameter of config to stdout
+{
+    for (int i = 0; i < NUM_INT_PARAMS; ++i) // one "[name]: value" line per integer parameter
+        printf("[%s]: %d\n", intparam_names[i], config.int_params[i]);
+    for (int i = 0; i < NUM_REAL_PARAMS; ++i) // one line per real parameter, cast to double for %g
+        printf("[%s]: %g\n", realparam_names[i], double(config.real_params[i]));
+}
+
+static void
+update_builtin_params(AcMeshInfo* config) // Derives the m*, n*_min/max, mxy, nxy, nxyz entries from AC_nx/ny/nz
+{
+    config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER;
+    // PAD TEST (disabled): alternative AC_mx that adds PAD_SIZE on top of STENCIL_ORDER
+    // config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER + PAD_SIZE;
+    // End of PAD TEST
+    config->int_params[AC_my] = config->int_params[AC_ny] + STENCIL_ORDER;
+    config->int_params[AC_mz] = config->int_params[AC_nz] + STENCIL_ORDER;
+
+    // Bounds for the computational domain, i.e. nx_min <= i < nx_max (likewise for y and z)
+    config->int_params[AC_nx_min] = NGHOST;
+    config->int_params[AC_nx_max] = config->int_params[AC_nx_min] + config->int_params[AC_nx]; // == AC_nx + NGHOST
+    config->int_params[AC_ny_min] = NGHOST;
+    config->int_params[AC_ny_max] = config->int_params[AC_ny] + NGHOST;
+    config->int_params[AC_nz_min] = NGHOST;
+    config->int_params[AC_nz_max] = config->int_params[AC_nz] + NGHOST;
+
+    /* Additional helper params */
+    // Int helpers: products of the dimensions computed above
+    config->int_params[AC_mxy]  = config->int_params[AC_mx] * config->int_params[AC_my];
+    config->int_params[AC_nxy]  = config->int_params[AC_nx] * config->int_params[AC_ny];
+    config->int_params[AC_nxyz] = config->int_params[AC_nxy] * config->int_params[AC_nz];
+
+#if VERBOSE_PRINTING // Defined in astaroth.h
+    printf("###############################################################\n");
+    printf("Config dimensions recalculated:\n");
+    print(*config); // dump the full parameter set after the update
+    printf("###############################################################\n");
+#endif
+}
+
 static Grid
-createGrid(const AcMeshInfo& config)
+createGrid(const AcMeshInfo config)
 {
    Grid grid;

@@ -246,7 +287,7 @@ acSynchronizeMesh(void)
 }

 AcResult
-acInit(const AcMeshInfo& config)
+acInit(const AcMeshInfo config)
 {
    // Get num_devices
    ERRCHK_CUDA_ALWAYS(cudaGetDeviceCount(&num_devices));
@@ -274,7 +315,7 @@ acInit(const AcMeshInfo& config)
    // Subgrids
    AcMeshInfo subgrid_config = config;
    subgrid_config.int_params[AC_nz] /= num_devices;
-    update_config(&subgrid_config);
+    update_builtin_params(&subgrid_config);
    subgrid = createGrid(subgrid_config);

    // Periodic boundary conditions become weird if the system can "fold unto itself".
@@ -337,8 +378,8 @@ acQuit(void)
 }

 AcResult
-acIntegrateStepWithOffsetAsync(const int& isubstep, const AcReal& dt, const int3& start,
-                               const int3& end, const StreamType stream)
+acIntegrateStepWithOffsetAsync(const int isubstep, const AcReal dt, const int3 start,
+                               const int3 end, const StreamType stream)
 {
    // See the beginning of the file for an explanation of the index mapping
    // #pragma omp parallel for
@@ -360,13 +401,13 @@ acIntegrateStepWithOffsetAsync(const int& isubstep, const AcReal& dt, const int3
 }

 AcResult
-acIntegrateStepWithOffset(const int& isubstep, const AcReal& dt, const int3& start, const int3& end)
+acIntegrateStepWithOffset(const int isubstep, const AcReal dt, const int3 start, const int3 end)
 {
    return acIntegrateStepWithOffsetAsync(isubstep, dt, start, end, STREAM_DEFAULT);
 }

 AcResult
-acIntegrateStepAsync(const int& isubstep, const AcReal& dt, const StreamType stream)
+acIntegrateStepAsync(const int isubstep, const AcReal dt, const StreamType stream)
 {
    const int3 start = (int3){NGHOST, NGHOST, NGHOST};
    const int3 end   = start + grid.n;
@@ -374,7 +415,7 @@ acIntegrateStepAsync(const int& isubstep, const AcReal& dt, const StreamType str
 }

 AcResult
-acIntegrateStep(const int& isubstep, const AcReal& dt)
+acIntegrateStep(const int isubstep, const AcReal dt)
 {
    return acIntegrateStepAsync(isubstep, dt, STREAM_DEFAULT);
 }
@@ -452,7 +493,7 @@ swap_buffers(void)
 }

 AcResult
-acIntegrate(const AcReal& dt)
+acIntegrate(const AcReal dt)
 {
    acSynchronizeStream(STREAM_ALL);
    for (int isubstep = 0; isubstep < 3; ++isubstep) {
@@ -464,7 +505,7 @@ acIntegrate(const AcReal& dt)
 }

 static AcReal
-simple_final_reduce_scal(const ReductionType& rtype, const AcReal* results, const int& n)
+simple_final_reduce_scal(const ReductionType rtype, const AcReal* results, const int n)
 {
    AcReal res = results[0];
    for (int i = 1; i < n; ++i) {
@@ -490,7 +531,7 @@ simple_final_reduce_scal(const ReductionType& rtype, const AcReal* results, cons
 }

 AcReal
-acReduceScal(const ReductionType& rtype, const VertexBufferHandle& vtxbuffer_handle)
+acReduceScal(const ReductionType rtype, const VertexBufferHandle vtxbuffer_handle)
 {
    acSynchronizeStream(STREAM_ALL);

@@ -504,8 +545,8 @@ acReduceScal(const ReductionType& rtype, const VertexBufferHandle& vtxbuffer_han
 }

 AcReal
-acReduceVec(const ReductionType& rtype, const VertexBufferHandle& a, const VertexBufferHandle& b,
-            const VertexBufferHandle& c)
+acReduceVec(const ReductionType rtype, const VertexBufferHandle a, const VertexBufferHandle b,
+            const VertexBufferHandle c)
 {
    acSynchronizeStream(STREAM_ALL);

@@ -519,7 +560,7 @@ acReduceVec(const ReductionType& rtype, const VertexBufferHandle& a, const Verte
 }

 AcResult
-acLoadWithOffsetAsync(const AcMesh& host_mesh, const int3& src, const int num_vertices,
+acLoadWithOffsetAsync(const AcMesh host_mesh, const int3 src, const int num_vertices,
                      const StreamType stream)
 {
    // See the beginning of the file for an explanation of the index mapping
@@ -557,13 +598,13 @@ acLoadWithOffsetAsync(const AcMesh& host_mesh, const int3& src, const int num_ve
 }

 AcResult
-acLoadWithOffset(const AcMesh& host_mesh, const int3& src, const int num_vertices)
+acLoadWithOffset(const AcMesh host_mesh, const int3 src, const int num_vertices)
 {
    return acLoadWithOffsetAsync(host_mesh, src, num_vertices, STREAM_DEFAULT);
 }

 AcResult
-acLoad(const AcMesh& host_mesh)
+acLoad(const AcMesh host_mesh)
 {
    acLoadWithOffset(host_mesh, (int3){0, 0, 0}, acVertexBufferSize(host_mesh.info));
    acSynchronizeStream(STREAM_ALL);
@@ -571,7 +612,7 @@ acLoad(const AcMesh& host_mesh)
 }

 AcResult
-acStoreWithOffsetAsync(const int3& src, const int num_vertices, AcMesh* host_mesh,
+acStoreWithOffsetAsync(const int3 src, const int num_vertices, AcMesh* host_mesh,
                       const StreamType stream)
 {
    // See the beginning of the file for an explanation of the index mapping
@@ -596,7 +637,7 @@ acStoreWithOffsetAsync(const int3& src, const int num_vertices, AcMesh* host_mes
 }

 AcResult
-acStoreWithOffset(const int3& src, const int num_vertices, AcMesh* host_mesh)
+acStoreWithOffset(const int3 src, const int num_vertices, AcMesh* host_mesh)
 {
    return acStoreWithOffsetAsync(src, num_vertices, host_mesh, STREAM_DEFAULT);
 }
@@ -624,3 +665,9 @@ acLoadDeviceConstant(const AcRealParam param, const AcReal value)
 {
    return acLoadDeviceConstantAsync(param, value, STREAM_DEFAULT);
 }
+
+/*
+ * =============================================================================
+ * Revised interface
+ * =============================================================================
+ */
--- a/src/core/device.cu
+++ b/src/core/device.cu
@@ -516,3 +516,9 @@ autoOptimize(const Device device)
 #if PACKED_DATA_TRANSFERS
 // Functions for calling packed data transfers
 #endif
+
+/*
+ * =============================================================================
+ * Revised interface
+ * =============================================================================
+ */
--- a/src/core/device.cuh
+++ b/src/core/device.cuh
@@ -99,3 +99,9 @@ AcResult autoOptimize(const Device device);
 #if PACKED_DATA_TRANSFERS
 // Declarations used for packed data transfers
 #endif
+
+/*
+ * =============================================================================
+ * Revised interface
+ * =============================================================================
+ */