Merge branch 'master' into sink_20190723

2019-09-16 10:57:15 +08:00
parent 88a8198810 55e4357d77
commit 42f92c7d49
16 changed files with 1100 additions and 40 deletions
--- a/src/core/astaroth.cu
+++ b/src/core/astaroth.cu
@@ -22,15 +22,16 @@
 #include "math_utils.h" // int3 + int3

 #define AC_GEN_STR(X) #X
-const char* intparam_names[]   = {AC_FOR_BUILTIN_INT_PARAM_TYPES(AC_GEN_STR) //
+const char* intparam_names[]    = {AC_FOR_BUILTIN_INT_PARAM_TYPES(AC_GEN_STR) //
                                AC_FOR_USER_INT_PARAM_TYPES(AC_GEN_STR)};
-const char* int3param_names[]  = {AC_FOR_BUILTIN_INT3_PARAM_TYPES(AC_GEN_STR) //
+const char* int3param_names[]   = {AC_FOR_BUILTIN_INT3_PARAM_TYPES(AC_GEN_STR) //
                                 AC_FOR_USER_INT3_PARAM_TYPES(AC_GEN_STR)};
-const char* realparam_names[]  = {AC_FOR_BUILTIN_REAL_PARAM_TYPES(AC_GEN_STR) //
+const char* realparam_names[]   = {AC_FOR_BUILTIN_REAL_PARAM_TYPES(AC_GEN_STR) //
                                 AC_FOR_USER_REAL_PARAM_TYPES(AC_GEN_STR)};
-const char* real3param_names[] = {AC_FOR_BUILTIN_REAL3_PARAM_TYPES(AC_GEN_STR) //
+const char* real3param_names[]  = {AC_FOR_BUILTIN_REAL3_PARAM_TYPES(AC_GEN_STR) //
                                  AC_FOR_USER_REAL3_PARAM_TYPES(AC_GEN_STR)};
-const char* vtxbuf_names[]     = {AC_FOR_VTXBUF_HANDLES(AC_GEN_STR)};
+const char* scalararray_names[] = {AC_FOR_SCALARARRAY_HANDLES(AC_GEN_STR)}; // names as strings
+const char* vtxbuf_names[]      = {AC_FOR_VTXBUF_HANDLES(AC_GEN_STR)};
 #undef AC_GEN_STR

 static const int num_nodes = 1;
--- a/src/core/device.cu
+++ b/src/core/device.cu
@@ -37,6 +37,8 @@
 typedef struct {
    AcReal* in[NUM_VTXBUF_HANDLES];
    AcReal* out[NUM_VTXBUF_HANDLES];
+    // One device array per scalar-array handle; acDeviceCreate cudaMallocs each of them
+    AcReal* profiles[NUM_SCALARARRAY_HANDLES];
 } VertexBufferArray;

 struct device_s {
@@ -44,7 +46,7 @@ struct device_s {
    AcMeshInfo local_config;

    // Concurrency
-    cudaStream_t streams[NUM_STREAM_TYPES];
+    cudaStream_t streams[NUM_STREAMS];

    // Memory
    VertexBufferArray vba;
@@ -97,6 +99,48 @@ DCONST(const VertexBufferHandle handle)
 //#define globalMeshN_min // Placeholder
 #define d_multigpu_offset (d_mesh_info.int3_params[AC_multigpu_offset])
 //#define d_multinode_offset (d_mesh_info.int3_params[AC_multinode_offset]) // Placeholder
+//#include <thrust/complex.h>
+// using namespace thrust;
+#include <cuComplex.h>
+// acComplex names both the complex type and, through the function-like macro of the same
+// name, its two-argument constructor; the macro only expands when followed by '('.
+#if AC_DOUBLE_PRECISION == 1
+typedef cuDoubleComplex acComplex;
+#define acComplex(x, y) make_cuDoubleComplex(x, y)
+#else
+typedef cuFloatComplex acComplex;
+#define acComplex(x, y) make_cuFloatComplex(x, y)
+#endif
+
+// Complex exponential: exp(x + iy) = exp(x) * (cos(y) + i sin(y)).
+static __device__ inline acComplex
+exp(const acComplex& val)
+{
+    acComplex result;
+    result.x = exp(val.x) * cos(val.y);
+    result.y = exp(val.x) * sin(val.y);
+    return result;
+}
+
+// Scales both components of a complex number by a real factor.
+static __device__ inline acComplex operator*(const AcReal& a, const acComplex& b)
+{
+    acComplex scaled;
+    scaled.x = a * b.x;
+    scaled.y = a * b.y;
+    return scaled;
+}
+
+// Complex product: (a.x + i a.y) * (b.x + i b.y).
+static __device__ inline acComplex operator*(const acComplex& a, const acComplex& b)
+{
+    acComplex product;
+    product.x = a.x * b.x - a.y * b.y;
+    product.y = a.x * b.y + a.y * b.x;
+    return product;
+}
+//#include <complex>
+
 #include "kernels/boundconds.cuh"
 #include "kernels/integration.cuh"
 #include "kernels/reductions.cuh"
@@ -135,16 +163,26 @@ acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device_hand
    printf("Success!\n");

    // Concurrency
-    for (int i = 0; i < NUM_STREAM_TYPES; ++i) {
+    for (int i = 0; i < NUM_STREAMS; ++i) {
        cudaStreamCreateWithPriority(&device->streams[i], cudaStreamNonBlocking, 0);
    }

    // Memory
+    // VBA in/out
    const size_t vba_size_bytes = acVertexBufferSizeBytes(device_config);
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
        ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.in[i], vba_size_bytes));
        ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.out[i], vba_size_bytes));
    }
+    // VBA Profiles
+    const size_t profile_size_bytes = sizeof(AcReal) * max(device_config.int_params[AC_mx],
+                                                           max(device_config.int_params[AC_my],
+                                                               device_config.int_params[AC_mz]));
+    for (int i = 0; i < NUM_SCALARARRAY_HANDLES; ++i) {
+        ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.profiles[i], profile_size_bytes));
+    }
+
+    // Reductions
    ERRCHK_CUDA_ALWAYS(
        cudaMalloc(&device->reduce_scratchpad, acVertexBufferCompdomainSizeBytes(device_config)));
    ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->reduce_result, sizeof(AcReal)));
@@ -178,6 +216,10 @@ acDeviceDestroy(Device device)
        cudaFree(device->vba.in[i]);
        cudaFree(device->vba.out[i]);
    }
+    for (int i = 0; i < NUM_SCALARARRAY_HANDLES; ++i) {
+        cudaFree(device->vba.profiles[i]);
+    }
+
    cudaFree(device->reduce_scratchpad);
    cudaFree(device->reduce_result);

@@ -186,7 +228,7 @@ acDeviceDestroy(Device device)
 #endif

    // Concurrency
-    for (int i = 0; i < NUM_STREAM_TYPES; ++i) {
+    for (int i = 0; i < NUM_STREAMS; ++i) {
        cudaStreamDestroy(device->streams[i]);
    }

@@ -405,6 +447,42 @@ acDeviceLoadInt3Constant(const Device device, const Stream stream, const AcInt3P
    return AC_SUCCESS;
 }

+// Enqueues an asynchronous host-to-device copy of `num` values from `data` into the
+// scalar array `handle`, starting at element index `start`.
+//
+// The copy is issued with cudaMemcpyAsync on device->streams[stream]; the stream is not
+// synchronized here.
+// NOTE(review): presumably `data` must stay valid until that stream has been synchronized.
+//
+// ERRCHK asserts that `handle` is below NUM_SCALARARRAY_HANDLES and that the range
+// [start, start + num) fits in max(AC_mx, AC_my, AC_mz) elements of the local
+// configuration, the same formula acDeviceCreate uses to size the scalar arrays.
+//
+// Returns AC_SUCCESS.
+AcResult
+acDeviceLoadScalarArray(const Device device, const Stream stream, const ScalarArrayHandle handle,
+                        const size_t start, const AcReal* data, const size_t num)
+{
+    cudaSetDevice(device->id);
+
+    // An out-of-range handle would index past the end of vba.profiles
+    ERRCHK((int)handle >= 0 && (int)handle < NUM_SCALARARRAY_HANDLES);
+
+    const int capacity = max(device->local_config.int_params[AC_mx],
+                             max(device->local_config.int_params[AC_my],
+                                 device->local_config.int_params[AC_mz]));
+    ERRCHK(capacity >= 0);
+
+    // Checked as two comparisons so that a huge `start` or `num` cannot wrap `start + num`
+    // around and slip past the bound
+    ERRCHK(start <= (size_t)capacity);
+    ERRCHK(num <= (size_t)capacity - start);
+
+    ERRCHK_CUDA(cudaMemcpyAsync(&device->vba.profiles[handle][start], data, sizeof(data[0]) * num,
+                                cudaMemcpyHostToDevice, device->streams[stream]));
+    return AC_SUCCESS;
+}
+
 AcResult
 acDeviceLoadMeshInfo(const Device device, const Stream stream, const AcMeshInfo device_config)
 {
--- a/src/core/kernels/integration.cuh
+++ b/src/core/kernels/integration.cuh
@@ -70,11 +70,11 @@ create_rotz(const AcReal radians)
 #define cos __cosf
 #define exp __expf
 */
-#define sin sinf
-#define cos cosf
-#define exp expf
-#define rsqrt rsqrtf // hardware reciprocal sqrt
-#endif               // AC_DOUBLE_PRECISION == 0
+//#define sin sinf
+//#define cos cosf
+//#define exp expf
+//#define rsqrt rsqrtf // hardware reciprocal sqrt
+#endif // AC_DOUBLE_PRECISION == 0

 /*
 * =============================================================================
--- a/src/core/math_utils.h
+++ b/src/core/math_utils.h
@@ -124,6 +124,12 @@ static HOST_DEVICE_INLINE AcReal3 operator*(const AcReal& a, const AcReal3& b)
    return (AcReal3){a * b.x, a * b.y, a * b.z};
 }

+// Vector-times-scalar: forwards to the scalar-times-vector overload defined just above.
+static HOST_DEVICE_INLINE AcReal3 operator*(const AcReal3& b, const AcReal& a)
+{
+    return a * b;
+}
+
 static HOST_DEVICE_INLINE AcReal
 dot(const AcReal3& a, const AcReal3& b)
 {