Merge branch 'master' into sink_20190723

This commit is contained in:
Miikka Vaisala
2019-09-16 10:57:15 +08:00
16 changed files with 1100 additions and 40 deletions

View File

@@ -22,15 +22,16 @@
#include "math_utils.h" // int3 + int3
#define AC_GEN_STR(X) #X
const char* intparam_names[] = {AC_FOR_BUILTIN_INT_PARAM_TYPES(AC_GEN_STR) //
const char* intparam_names[] = {AC_FOR_BUILTIN_INT_PARAM_TYPES(AC_GEN_STR) //
AC_FOR_USER_INT_PARAM_TYPES(AC_GEN_STR)};
const char* int3param_names[] = {AC_FOR_BUILTIN_INT3_PARAM_TYPES(AC_GEN_STR) //
const char* int3param_names[] = {AC_FOR_BUILTIN_INT3_PARAM_TYPES(AC_GEN_STR) //
AC_FOR_USER_INT3_PARAM_TYPES(AC_GEN_STR)};
const char* realparam_names[] = {AC_FOR_BUILTIN_REAL_PARAM_TYPES(AC_GEN_STR) //
const char* realparam_names[] = {AC_FOR_BUILTIN_REAL_PARAM_TYPES(AC_GEN_STR) //
AC_FOR_USER_REAL_PARAM_TYPES(AC_GEN_STR)};
const char* real3param_names[] = {AC_FOR_BUILTIN_REAL3_PARAM_TYPES(AC_GEN_STR) //
const char* real3param_names[] = {AC_FOR_BUILTIN_REAL3_PARAM_TYPES(AC_GEN_STR) //
AC_FOR_USER_REAL3_PARAM_TYPES(AC_GEN_STR)};
const char* vtxbuf_names[] = {AC_FOR_VTXBUF_HANDLES(AC_GEN_STR)};
const char* scalararray_names[] = {AC_FOR_SCALARARRAY_HANDLES(AC_GEN_STR)};
const char* vtxbuf_names[] = {AC_FOR_VTXBUF_HANDLES(AC_GEN_STR)};
#undef AC_GEN_STR
static const int num_nodes = 1;

View File

@@ -37,6 +37,8 @@
typedef struct {
AcReal* in[NUM_VTXBUF_HANDLES];
AcReal* out[NUM_VTXBUF_HANDLES];
AcReal* profiles[NUM_SCALARARRAY_HANDLES];
} VertexBufferArray;
struct device_s {
@@ -44,7 +46,7 @@ struct device_s {
AcMeshInfo local_config;
// Concurrency
cudaStream_t streams[NUM_STREAM_TYPES];
cudaStream_t streams[NUM_STREAMS];
// Memory
VertexBufferArray vba;
@@ -97,6 +99,32 @@ DCONST(const VertexBufferHandle handle)
//#define globalMeshN_min // Placeholder
#define d_multigpu_offset (d_mesh_info.int3_params[AC_multigpu_offset])
//#define d_multinode_offset (d_mesh_info.int3_params[AC_multinode_offset]) // Placeholder
//#include <thrust/complex.h>
// using namespace thrust;
#include <cuComplex.h>
#if AC_DOUBLE_PRECISION == 1
typedef cuDoubleComplex acComplex;
#define acComplex(x, y) make_cuDoubleComplex(x, y)
#else
typedef cuFloatComplex acComplex;
#define acComplex(x, y) make_cuFloatComplex(x, y)
#endif
static __device__ inline acComplex
exp(const acComplex& val)
{
return acComplex(exp(val.x) * cos(val.y), exp(val.x) * sin(val.y));
}
static __device__ inline acComplex operator*(const AcReal& a, const acComplex& b)
{
return (acComplex){a * b.x, a * b.y};
}
static __device__ inline acComplex operator*(const acComplex& a, const acComplex& b)
{
return (acComplex){a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x};
}
//#include <complex>
#include "kernels/boundconds.cuh"
#include "kernels/integration.cuh"
#include "kernels/reductions.cuh"
@@ -135,16 +163,26 @@ acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device_hand
printf("Success!\n");
// Concurrency
for (int i = 0; i < NUM_STREAM_TYPES; ++i) {
for (int i = 0; i < NUM_STREAMS; ++i) {
cudaStreamCreateWithPriority(&device->streams[i], cudaStreamNonBlocking, 0);
}
// Memory
// VBA in/out
const size_t vba_size_bytes = acVertexBufferSizeBytes(device_config);
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.in[i], vba_size_bytes));
ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.out[i], vba_size_bytes));
}
// VBA Profiles
const size_t profile_size_bytes = sizeof(AcReal) * max(device_config.int_params[AC_mx],
max(device_config.int_params[AC_my],
device_config.int_params[AC_mz]));
for (int i = 0; i < NUM_SCALARARRAY_HANDLES; ++i) {
ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.profiles[i], profile_size_bytes));
}
// Reductions
ERRCHK_CUDA_ALWAYS(
cudaMalloc(&device->reduce_scratchpad, acVertexBufferCompdomainSizeBytes(device_config)));
ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->reduce_result, sizeof(AcReal)));
@@ -178,6 +216,10 @@ acDeviceDestroy(Device device)
cudaFree(device->vba.in[i]);
cudaFree(device->vba.out[i]);
}
for (int i = 0; i < NUM_SCALARARRAY_HANDLES; ++i) {
cudaFree(device->vba.profiles[i]);
}
cudaFree(device->reduce_scratchpad);
cudaFree(device->reduce_result);
@@ -186,7 +228,7 @@ acDeviceDestroy(Device device)
#endif
// Concurrency
for (int i = 0; i < NUM_STREAM_TYPES; ++i) {
for (int i = 0; i < NUM_STREAMS; ++i) {
cudaStreamDestroy(device->streams[i]);
}
@@ -405,6 +447,21 @@ acDeviceLoadInt3Constant(const Device device, const Stream stream, const AcInt3P
return AC_SUCCESS;
}
AcResult
acDeviceLoadScalarArray(const Device device, const Stream stream, const ScalarArrayHandle handle,
const size_t start, const AcReal* data, const size_t num)
{
cudaSetDevice(device->id);
ERRCHK(start + num <= max(device->local_config.int_params[AC_mx],
max(device->local_config.int_params[AC_my],
device->local_config.int_params[AC_mz])));
ERRCHK_CUDA(cudaMemcpyAsync(&device->vba.profiles[handle][start], data, sizeof(data[0]) * num,
cudaMemcpyHostToDevice, device->streams[stream]));
return AC_SUCCESS;
}
AcResult
acDeviceLoadMeshInfo(const Device device, const Stream stream, const AcMeshInfo device_config)
{

View File

@@ -70,11 +70,11 @@ create_rotz(const AcReal radians)
#define cos __cosf
#define exp __expf
*/
#define sin sinf
#define cos cosf
#define exp expf
#define rsqrt rsqrtf // hardware reciprocal sqrt
#endif // AC_DOUBLE_PRECISION == 0
//#define sin sinf
//#define cos cosf
//#define exp expf
//#define rsqrt rsqrtf // hardware reciprocal sqrt
#endif // AC_DOUBLE_PRECISION == 0
/*
* =============================================================================

View File

@@ -124,6 +124,11 @@ static HOST_DEVICE_INLINE AcReal3 operator*(const AcReal& a, const AcReal3& b)
return (AcReal3){a * b.x, a * b.y, a * b.z};
}
static HOST_DEVICE_INLINE AcReal3 operator*(const AcReal3& b, const AcReal& a)
{
return (AcReal3){a * b.x, a * b.y, a * b.z};
}
static HOST_DEVICE_INLINE AcReal
dot(const AcReal3& a, const AcReal3& b)
{