nvcc is now used to compile the kernels only. All host code, including device.cc and the MPI communication code, is now compiled with the host C++ compiler. This should work around an nvcc/MPI bug on Puhti.
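For reference, these are the host-callable entry points of the new kernel modules, with signatures collected from the kernel sources and headers introduced below in this commit:

    AcResult acKernelDummy(void);
    AcResult acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferArray vba);
    AcResult acKernelIntegrateSubstep(const cudaStream_t stream, const int step_number,
                                      const int3 start, const int3 end, VertexBufferArray vba);
    AcResult acKernelPeriodicBoundconds(const cudaStream_t stream, const int3& start,
                                        const int3& end, AcReal* vtxbuf);
    AcReal   acKernelReduceScal(const cudaStream_t stream, const ReductionType rtype,
                                const int3& start, const int3& end, const AcReal* vtxbuf,
                                AcReal* scratchpad, AcReal* reduce_result);
    AcReal   acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype,
                               const int3& start, const int3& end, const AcReal* vtxbuf0,
                               const AcReal* vtxbuf1, const AcReal* vtxbuf2,
                               AcReal* scratchpad, AcReal* reduce_result);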
@@ -26,7 +26,10 @@ extern "C" {
#include <stdlib.h> // size_t
//#include <vector_types.h> // CUDA vector types (float4, etc)

#ifndef __CUDACC__
//#ifndef __CUDACC__
#if defined(AC_USE_CUDA_RUNTIME_API) || defined(__CUDACC__)
#include <cuda_runtime_api.h>
#else
typedef struct {
    int x, y, z;
} int3;
@@ -42,7 +45,8 @@ typedef struct {
typedef struct {
    double x, y, z;
} double3;
#endif // __CUDACC__
#endif
//#endif // __CUDACC__

// Library flags
#define STENCIL_ORDER (6)
@@ -2,8 +2,13 @@
## CMakeLists.txt for Astaroth Core ##
########################################

# Require C++11
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

## Find packages
find_package(CUDA REQUIRED)
set(CUDA_SEPARABLE_COMPILATION ON)
# find_package(OpenMP REQUIRED)

## Architecture and optimization flags
@@ -23,8 +28,13 @@ set(CUDA_ARCH_FLAGS -gencode arch=compute_37,code=sm_37
set(CUDA_WARNING_FLAGS --compiler-options -Wall,-Wextra,-Werror,-Wdouble-promotion,-Wfloat-conversion) # -Wshadow

set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${CUDA_ARCH_FLAGS} ${CUDA_WARNING_FLAGS})
set(CUDA_NVCC_FLAGS_RELEASE)
set(CUDA_NVCC_FLAGS_DEBUG --device-debug --generate-line-info --ptxas-options=-v)

# Workaround for CMake bug https://gitlab.kitware.com/cmake/cmake/issues/16411
if (CMAKE_BUILD_TYPE_UPPER STREQUAL "RELEASE")
    set(CUDA_NVCC_FLAGS_RELEASE -O3)
elseif (CMAKE_BUILD_TYPE_UPPER STREQUAL "DEBUG")
    set(CUDA_NVCC_FLAGS_DEBUG --device-debug --generate-line-info --ptxas-options=-v)
endif()

if (MPI_ENABLED)
    find_package(MPI REQUIRED)
@@ -36,10 +46,11 @@ if (MPI_ENABLED)
endif ()

## Create and link the library
set(CMAKE_POSITION_INDEPENDENT_CODE ON) # fpic for shared libraries
cuda_add_library(astaroth_core STATIC astaroth.cu device.cu node.cu)
target_include_directories(astaroth_core PRIVATE .)
target_link_libraries(astaroth_core m)
#set(CMAKE_POSITION_INDEPENDENT_CODE ON) # fpic for shared libraries
cuda_add_library(astaroth_core STATIC astaroth.cc device.cc node.cc kernels/boundconds.cu kernels/integration.cu kernels/reductions.cu)
target_include_directories(astaroth_core PRIVATE . ${CUDA_INCLUDE_DIRS})
target_link_libraries(astaroth_core ${CUDA_LIBRARIES} m)
target_compile_definitions(astaroth_core PRIVATE AC_USE_CUDA_RUNTIME_API)
add_dependencies(astaroth_core dsl_headers)

## Definitions
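As a sketch only (not part of this commit, and assuming CMake >= 3.8 with native CUDA language support), the same host/device split could be expressed without the deprecated FindCUDA module:

    # Hypothetical alternative, shown for context; the commit keeps FindCUDA/cuda_add_library.
    enable_language(CUDA)
    add_library(astaroth_core STATIC
        astaroth.cc device.cc node.cc                                        # host C++ compiler
        kernels/boundconds.cu kernels/integration.cu kernels/reductions.cu)  # nvcc
    target_compile_definitions(astaroth_core PRIVATE AC_USE_CUDA_RUNTIME_API)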
@@ -26,20 +26,10 @@
*/
#include "astaroth_device.h"

#include "math_utils.h"
#include "errchk.h"

// Device info
#define REGISTERS_PER_THREAD (255)
#define MAX_REGISTERS_PER_BLOCK (65536)
#define MAX_THREADS_PER_BLOCK (1024)
#define WARP_SIZE (32)

typedef struct {
    AcReal* in[NUM_VTXBUF_HANDLES];
    AcReal* out[NUM_VTXBUF_HANDLES];

    AcReal* profiles[NUM_SCALARARRAY_HANDLES];
} VertexBufferArray;
#include "kernels/common.cuh"

struct device_s {
    int id;
@@ -63,94 +53,14 @@ struct device_s {
#endif
};

__constant__ AcMeshInfo d_mesh_info;
static int __device__ __forceinline__
DCONST(const AcIntParam param)
{
    return d_mesh_info.int_params[param];
}
static int3 __device__ __forceinline__
DCONST(const AcInt3Param param)
{
    return d_mesh_info.int3_params[param];
}
static AcReal __device__ __forceinline__
DCONST(const AcRealParam param)
{
    return d_mesh_info.real_params[param];
}
static AcReal3 __device__ __forceinline__
DCONST(const AcReal3Param param)
{
    return d_mesh_info.real3_params[param];
}
static __device__ constexpr VertexBufferHandle
DCONST(const VertexBufferHandle handle)
{
    return handle;
}
#define DEVICE_VTXBUF_IDX(i, j, k) ((i) + (j)*DCONST(AC_mx) + (k)*DCONST(AC_mxy))
#define DEVICE_1D_COMPDOMAIN_IDX(i, j, k) ((i) + (j)*DCONST(AC_nx) + (k)*DCONST(AC_nxy))
#define globalGridN (d_mesh_info.int3_params[AC_global_grid_n])
//#define globalMeshM // Placeholder
//#define localMeshN // Placeholder
//#define localMeshM // Placeholder
//#define localMeshN_min // Placeholder
//#define globalMeshN_min // Placeholder
#define d_multigpu_offset (d_mesh_info.int3_params[AC_multigpu_offset])
//#define d_multinode_offset (d_mesh_info.int3_params[AC_multinode_offset]) // Placeholder
//#include <thrust/complex.h>
// using namespace thrust;
#include <cuComplex.h>
#if AC_DOUBLE_PRECISION == 1
typedef cuDoubleComplex acComplex;
#define acComplex(x, y) make_cuDoubleComplex(x, y)
#else
typedef cuFloatComplex acComplex;
#define acComplex(x, y) make_cuFloatComplex(x, y)
#endif
static __device__ inline acComplex
exp(const acComplex& val)
{
    return acComplex(exp(val.x) * cos(val.y), exp(val.x) * sin(val.y));
}
static __device__ inline acComplex operator*(const AcReal& a, const acComplex& b)
{
    return (acComplex){a * b.x, a * b.y};
}

static __device__ inline acComplex operator*(const acComplex& b, const AcReal& a)
{
    return (acComplex){a * b.x, a * b.y};
}

static __device__ inline acComplex operator*(const acComplex& a, const acComplex& b)
{
    return (acComplex){a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x};
}
//#include <complex>

#include "kernels/boundconds.cuh"
#include "kernels/integration.cuh"
#include "kernels/reductions.cuh"

static dim3 rk3_tpb(32, 1, 4);

#if PACKED_DATA_TRANSFERS // Defined in device.cuh
// #include "kernels/pack_unpack.cuh"
#endif

static __global__ void
dummy_kernel(void)
{
    DCONST((AcIntParam)0);
    DCONST((AcInt3Param)0);
    DCONST((AcRealParam)0);
    DCONST((AcReal3Param)0);
    acComplex a = exp(AcReal(1) * acComplex(1, 1) * AcReal(1));
    a * a;
}

AcResult
acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device_handle)
{
@@ -171,8 +81,7 @@ acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device_hand
           "device supports the CUDA architecture you are compiling for.\n"
           "Running dummy kernel... ");
    fflush(stdout);
    dummy_kernel<<<1, 1>>>();
    ERRCHK_CUDA_KERNEL_ALWAYS();
    acKernelDummy();
    printf("Success!\n");

    // Concurrency
@@ -184,21 +93,21 @@ acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device_hand
    // VBA in/out
    const size_t vba_size_bytes = acVertexBufferSizeBytes(device_config);
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
        ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.in[i], vba_size_bytes));
        ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.out[i], vba_size_bytes));
        ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->vba.in[i], vba_size_bytes));
        ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->vba.out[i], vba_size_bytes));
    }
    // VBA Profiles
    const size_t profile_size_bytes = sizeof(AcReal) * max(device_config.int_params[AC_mx],
                                                           max(device_config.int_params[AC_my],
                                                               device_config.int_params[AC_mz]));
    for (int i = 0; i < NUM_SCALARARRAY_HANDLES; ++i) {
        ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.profiles[i], profile_size_bytes));
        ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->vba.profiles[i], profile_size_bytes));
    }

    // Reductions
    ERRCHK_CUDA_ALWAYS(
        cudaMalloc(&device->reduce_scratchpad, acVertexBufferCompdomainSizeBytes(device_config)));
    ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->reduce_result, sizeof(AcReal)));
        cudaMalloc((void**)&device->reduce_scratchpad, acVertexBufferCompdomainSizeBytes(device_config)));
    ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->reduce_result, sizeof(AcReal)));

#if AC_MPI_ENABLED
    // Allocate data required for packed transfers here (cudaMalloc)
@@ -206,8 +115,8 @@ acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device_hand
                                   device_config.int_params[AC_my] * NGHOST * NUM_VTXBUF_HANDLES *
                                   sizeof(AcReal);
    for (int i = 0; i < 2; ++i) {
        ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->inner[i], block_size_bytes));
        ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->outer[i], block_size_bytes));
        ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->inner[i], block_size_bytes));
        ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->outer[i], block_size_bytes));

        ERRCHK_CUDA_ALWAYS(cudaMallocHost(&device->inner_host[i], block_size_bytes));
        ERRCHK_CUDA_ALWAYS(cudaMallocHost(&device->outer_host[i], block_size_bytes));
@@ -332,87 +241,11 @@ AcResult
acDeviceAutoOptimize(const Device device)
{
    cudaSetDevice(device->id);

    // RK3
    const int3 start = (int3){NGHOST, NGHOST, NGHOST};
    const int3 end = start + (int3){device->local_config.int_params[AC_nx], //
                                    device->local_config.int_params[AC_ny], //
                                    device->local_config.int_params[AC_nz]};

    dim3 best_dims(0, 0, 0);
    float best_time = INFINITY;
    const int num_iterations = 10;

    for (int z = 1; z <= MAX_THREADS_PER_BLOCK; ++z) {
        for (int y = 1; y <= MAX_THREADS_PER_BLOCK; ++y) {
            for (int x = WARP_SIZE; x <= MAX_THREADS_PER_BLOCK; x += WARP_SIZE) {

                if (x > end.x - start.x || y > end.y - start.y || z > end.z - start.z)
                    break;
                if (x * y * z > MAX_THREADS_PER_BLOCK)
                    break;

                if (x * y * z * REGISTERS_PER_THREAD > MAX_REGISTERS_PER_BLOCK)
                    break;

                if (((x * y * z) % WARP_SIZE) != 0)
                    continue;

                const dim3 tpb(x, y, z);
                const int3 n = end - start;
                const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), //
                               (unsigned int)ceil(n.y / AcReal(tpb.y)), //
                               (unsigned int)ceil(n.z / AcReal(tpb.z)));

                cudaDeviceSynchronize();
                if (cudaGetLastError() != cudaSuccess) // resets the error if any
                    continue;

                // printf("(%d, %d, %d)\n", x, y, z);

                cudaEvent_t tstart, tstop;
                cudaEventCreate(&tstart);
                cudaEventCreate(&tstop);

                // #ifdef AC_dt
                acDeviceLoadScalarUniform(device, STREAM_DEFAULT, AC_dt, FLT_EPSILON);
                /*#else
                    ERROR("FATAL ERROR: acDeviceAutoOptimize() or acDeviceIntegrateSubstep() was "
                          "called, but AC_dt was not defined. Either define it or call the generated "
                          "device function acDeviceKernel_<kernel name> which does not require the "
                          "timestep to be defined.\n");
                #endif*/

                cudaEventRecord(tstart); // ---------------------------------------- Timing start
                for (int i = 0; i < num_iterations; ++i)
                    solve<2><<<bpg, tpb>>>(start, end, device->vba);

                cudaEventRecord(tstop); // ----------------------------------------- Timing end
                cudaEventSynchronize(tstop);
                float milliseconds = 0;
                cudaEventElapsedTime(&milliseconds, tstart, tstop);

                ERRCHK_CUDA_KERNEL_ALWAYS();
                if (milliseconds < best_time) {
                    best_time = milliseconds;
                    best_dims = tpb;
                }
            }
        }
    }
#if VERBOSE_PRINTING
    printf(
        "Auto-optimization done. The best threadblock dimensions for rkStep: (%d, %d, %d) %f ms\n",
        best_dims.x, best_dims.y, best_dims.z, double(best_time) / num_iterations);
#endif
    /*
    FILE* fp = fopen("../config/rk3_tbdims.cuh", "w");
    ERRCHK(fp);
    fprintf(fp, "%d, %d, %d\n", best_dims.x, best_dims.y, best_dims.z);
    fclose(fp);
    */

    rk3_tpb = best_dims;
    return AC_SUCCESS;
    const int3 end = (int3){device->local_config.int_params[AC_mx], //
                            device->local_config.int_params[AC_my], //
                            device->local_config.int_params[AC_mz]};
    return acKernelAutoOptimizeIntegration(start, end, device->vba);
}

AcResult
@@ -446,7 +279,7 @@ acDeviceLoadScalarUniform(const Device device, const Stream stream, const AcReal
{
    cudaSetDevice(device->id);
    const size_t offset = (size_t)&d_mesh_info.real_params[param] - (size_t)&d_mesh_info;
    ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
    ERRCHK_CUDA(cudaMemcpyToSymbolAsync(&d_mesh_info, &value, sizeof(value), offset,
                                        cudaMemcpyHostToDevice, device->streams[stream]));
    return AC_SUCCESS;
}
@@ -457,7 +290,7 @@ acDeviceLoadVectorUniform(const Device device, const Stream stream, const AcReal
{
    cudaSetDevice(device->id);
    const size_t offset = (size_t)&d_mesh_info.real3_params[param] - (size_t)&d_mesh_info;
    ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
    ERRCHK_CUDA(cudaMemcpyToSymbolAsync(&d_mesh_info, &value, sizeof(value), offset,
                                        cudaMemcpyHostToDevice, device->streams[stream]));
    return AC_SUCCESS;
}
@@ -468,7 +301,7 @@ acDeviceLoadIntUniform(const Device device, const Stream stream, const AcIntPara
{
    cudaSetDevice(device->id);
    const size_t offset = (size_t)&d_mesh_info.int_params[param] - (size_t)&d_mesh_info;
    ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
    ERRCHK_CUDA(cudaMemcpyToSymbolAsync(&d_mesh_info, &value, sizeof(value), offset,
                                        cudaMemcpyHostToDevice, device->streams[stream]));
    return AC_SUCCESS;
}
@@ -479,7 +312,7 @@ acDeviceLoadInt3Uniform(const Device device, const Stream stream, const AcInt3Pa
{
    cudaSetDevice(device->id);
    const size_t offset = (size_t)&d_mesh_info.int3_params[param] - (size_t)&d_mesh_info;
    ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
    ERRCHK_CUDA(cudaMemcpyToSymbolAsync(&d_mesh_info, &value, sizeof(value), offset,
                                        cudaMemcpyHostToDevice, device->streams[stream]));
    return AC_SUCCESS;
}
@@ -510,7 +343,7 @@ acDeviceLoadMeshInfo(const Device device, const Stream stream, const AcMeshInfo
    ERRCHK_ALWAYS(device_config.int_params[AC_multigpu_offset] ==
                  device->local_config.int_params[AC_multigpu_offset]);

    ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbolAsync(d_mesh_info, &device_config, sizeof(device_config),
    ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbolAsync(&d_mesh_info, &device_config, sizeof(device_config),
                                               0, cudaMemcpyHostToDevice, device->streams[stream]));
    return AC_SUCCESS;
}
@@ -685,52 +518,17 @@ acDeviceIntegrateSubstep(const Device device, const Stream stream, const int ste
                         const int3 start, const int3 end, const AcReal dt)
{
    cudaSetDevice(device->id);

    const dim3 tpb = rk3_tpb;

    const int3 n = end - start;
    const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), //
                   (unsigned int)ceil(n.y / AcReal(tpb.y)), //
                   (unsigned int)ceil(n.z / AcReal(tpb.z)));

    //#ifdef AC_dt
    acDeviceLoadScalarUniform(device, stream, AC_dt, dt);
    /*#else
        (void)dt;
        ERROR("FATAL ERROR: acDeviceAutoOptimize() or acDeviceIntegrateSubstep() was "
              "called, but AC_dt was not defined. Either define it or call the generated "
              "device function acDeviceKernel_<kernel name> which does not require the "
              "timestep to be defined.\n");
    #endif*/
    if (step_number == 0)
        solve<0><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba);
    else if (step_number == 1)
        solve<1><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba);
    else
        solve<2><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba);

    ERRCHK_CUDA_KERNEL();

    return AC_SUCCESS;
    return acKernelIntegrateSubstep(device->streams[stream], step_number, start, end, device->vba);
}

AcResult
acDevicePeriodicBoundcondStep(const Device device, const Stream stream_type,
acDevicePeriodicBoundcondStep(const Device device, const Stream stream,
                              const VertexBufferHandle vtxbuf_handle, const int3 start,
                              const int3 end)
{
    cudaSetDevice(device->id);
    const cudaStream_t stream = device->streams[stream_type];

    const dim3 tpb(8, 2, 8);
    const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x),
                   (unsigned int)ceil((end.y - start.y) / (float)tpb.y),
                   (unsigned int)ceil((end.z - start.z) / (float)tpb.z));

    kernel_periodic_boundconds<<<bpg, tpb, 0, stream>>>(start, end, device->vba.in[vtxbuf_handle]);
    ERRCHK_CUDA_KERNEL();

    return AC_SUCCESS;
    return acKernelPeriodicBoundconds(device->streams[stream], start, end, device->vba.in[vtxbuf_handle]);
}

AcResult
@@ -757,7 +555,7 @@ acDeviceReduceScal(const Device device, const Stream stream, const ReductionType
                                 device->local_config.int_params[AC_ny_max],
                                 device->local_config.int_params[AC_nz_max]};

    *result = reduce_scal(device->streams[stream], rtype, start, end, device->vba.in[vtxbuf_handle],
    *result = acKernelReduceScal(device->streams[stream], rtype, start, end, device->vba.in[vtxbuf_handle],
                          device->reduce_scratchpad, device->reduce_result);
    return AC_SUCCESS;
}
@@ -777,7 +575,7 @@ acDeviceReduceVec(const Device device, const Stream stream, const ReductionType
                                 device->local_config.int_params[AC_ny_max],
                                 device->local_config.int_params[AC_nz_max]};

    *result = reduce_vec(device->streams[stream], rtype, start, end, device->vba.in[vtxbuf0],
    *result = acKernelReduceVec(device->streams[stream], rtype, start, end, device->vba.in[vtxbuf0],
                         device->vba.in[vtxbuf1], device->vba.in[vtxbuf2],
                         device->reduce_scratchpad, device->reduce_result);
    return AC_SUCCESS;

@@ -65,7 +65,7 @@
 * CUDA-specific error checking
 * =============================================================================
 */
#ifdef __CUDACC__
#if defined(AC_USE_CUDA_RUNTIME_API) || defined(__CUDACC__)
static inline void
cuda_assert(cudaError_t code, const char* file, int line, bool abort = true)
{
@@ -98,8 +98,6 @@ cuda_assert(cudaError_t code, const char* file, int line, bool abort = true)
        ERRCHK_CUDA(cudaPeekAtLastError()); \
        ERRCHK_CUDA(cudaDeviceSynchronize()); \
    }
#endif

#endif

#define ERRCHK_CUDA_ALWAYS(params) { cuda_assert((params), __FILE__, __LINE__); }
@@ -112,3 +110,4 @@ cuda_assert(cudaError_t code, const char* file, int line, bool abort = true)

#define WARNCHK_CUDA_ALWAYS(params) { cuda_assert((params), __FILE__, __LINE__, false); }
// clang-format on
#endif // AC_USE_CUDA_RUNTIME_API

src/core/kernels/boundconds.cu (new file, 89 lines)
@@ -0,0 +1,89 @@
/*
    Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae.

    This file is part of Astaroth.

    Astaroth is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    Astaroth is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/

/**
 * @file
 * \brief Brief info.
 *
 * Detailed info.
 *
 */
#include "common.cuh"

#include "src/core/errchk.h"

__global__ void
kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vtxbuf)
{
    const int i_dst = start.x + threadIdx.x + blockIdx.x * blockDim.x;
    const int j_dst = start.y + threadIdx.y + blockIdx.y * blockDim.y;
    const int k_dst = start.z + threadIdx.z + blockIdx.z * blockDim.z;

    // If within the start-end range (this allows threadblock dims that are not
    // divisible by end - start)
    if (i_dst >= end.x || j_dst >= end.y || k_dst >= end.z)
        return;

    // if (i_dst >= DCONST(AC_mx) || j_dst >= DCONST(AC_my) || k_dst >= DCONST(AC_mz))
    //    return;

    // If destination index is inside the computational domain, return since
    // the boundary conditions are only applied to the ghost zones
    if (i_dst >= DCONST(AC_nx_min) && i_dst < DCONST(AC_nx_max) && j_dst >= DCONST(AC_ny_min) &&
        j_dst < DCONST(AC_ny_max) && k_dst >= DCONST(AC_nz_min) && k_dst < DCONST(AC_nz_max))
        return;

    // Find the source index
    // Map to nx, ny, nz coordinates
    int i_src = i_dst - DCONST(AC_nx_min);
    int j_src = j_dst - DCONST(AC_ny_min);
    int k_src = k_dst - DCONST(AC_nz_min);

    // Translate (s.t. the index is always positive)
    i_src += DCONST(AC_nx);
    j_src += DCONST(AC_ny);
    k_src += DCONST(AC_nz);

    // Wrap
    i_src %= DCONST(AC_nx);
    j_src %= DCONST(AC_ny);
    k_src %= DCONST(AC_nz);

    // Map to mx, my, mz coordinates
    i_src += DCONST(AC_nx_min);
    j_src += DCONST(AC_ny_min);
    k_src += DCONST(AC_nz_min);
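    // Worked example (illustrative values, not from the source): with nx = 128
    // and NGHOST = 3 we have nx_min = 3 and nx_max = 131. A left-side ghost cell
    // i_dst = 1 maps to i_src = ((1 - 3) + 128) % 128 + 3 = 129, i.e. the
    // second-to-last interior cell, while a right-side ghost cell i_dst = 132
    // maps to i_src = ((132 - 3) + 128) % 128 + 3 = 4, wrapping periodically.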

    const int src_idx = DEVICE_VTXBUF_IDX(i_src, j_src, k_src);
    const int dst_idx = DEVICE_VTXBUF_IDX(i_dst, j_dst, k_dst);
    vtxbuf[dst_idx] = vtxbuf[src_idx];
}

AcResult
acKernelPeriodicBoundconds(const cudaStream_t stream, const int3& start, const int3& end, AcReal* vtxbuf)
{
    const dim3 tpb(8, 2, 8);
    const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x),
                   (unsigned int)ceil((end.y - start.y) / (float)tpb.y),
                   (unsigned int)ceil((end.z - start.z) / (float)tpb.z));

    kernel_periodic_boundconds<<<bpg, tpb, 0, stream>>>(start, end, vtxbuf);
    ERRCHK_CUDA_KERNEL();
    return AC_SUCCESS;
}

@@ -26,61 +26,7 @@
*/
#pragma once

__global__ void
kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vtxbuf)
{
    const int i_dst = start.x + threadIdx.x + blockIdx.x * blockDim.x;
    const int j_dst = start.y + threadIdx.y + blockIdx.y * blockDim.y;
    const int k_dst = start.z + threadIdx.z + blockIdx.z * blockDim.z;
#include "astaroth.h"

    // If within the start-end range (this allows threadblock dims that are not
    // divisible by end - start)
    if (i_dst >= end.x || j_dst >= end.y || k_dst >= end.z)
        return;

    // if (i_dst >= DCONST(AC_mx) || j_dst >= DCONST(AC_my) || k_dst >= DCONST(AC_mz))
    //    return;

    // If destination index is inside the computational domain, return since
    // the boundary conditions are only applied to the ghost zones
    if (i_dst >= DCONST(AC_nx_min) && i_dst < DCONST(AC_nx_max) && j_dst >= DCONST(AC_ny_min) &&
        j_dst < DCONST(AC_ny_max) && k_dst >= DCONST(AC_nz_min) && k_dst < DCONST(AC_nz_max))
        return;

    // Find the source index
    // Map to nx, ny, nz coordinates
    int i_src = i_dst - DCONST(AC_nx_min);
    int j_src = j_dst - DCONST(AC_ny_min);
    int k_src = k_dst - DCONST(AC_nz_min);

    // Translate (s.t. the index is always positive)
    i_src += DCONST(AC_nx);
    j_src += DCONST(AC_ny);
    k_src += DCONST(AC_nz);

    // Wrap
    i_src %= DCONST(AC_nx);
    j_src %= DCONST(AC_ny);
    k_src %= DCONST(AC_nz);

    // Map to mx, my, mz coordinates
    i_src += DCONST(AC_nx_min);
    j_src += DCONST(AC_ny_min);
    k_src += DCONST(AC_nz_min);

    const int src_idx = DEVICE_VTXBUF_IDX(i_src, j_src, k_src);
    const int dst_idx = DEVICE_VTXBUF_IDX(i_dst, j_dst, k_dst);
    vtxbuf[dst_idx] = vtxbuf[src_idx];
}

static void
periodic_boundconds(const cudaStream_t stream, const int3& start, const int3& end, AcReal* vtxbuf)
{
    const dim3 tpb(8, 2, 8);
    const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x),
                   (unsigned int)ceil((end.y - start.y) / (float)tpb.y),
                   (unsigned int)ceil((end.z - start.z) / (float)tpb.z));

    kernel_periodic_boundconds<<<bpg, tpb, 0, stream>>>(start, end, vtxbuf);
    ERRCHK_CUDA_KERNEL();
}
AcResult
acKernelPeriodicBoundconds(const cudaStream_t stream, const int3& start, const int3& end, AcReal* vtxbuf);

src/core/kernels/common.cuh (new file, 136 lines)
@@ -0,0 +1,136 @@
/*
    Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae.

    This file is part of Astaroth.

    Astaroth is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    Astaroth is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/

/**
 * @file
 * \brief Brief info.
 *
 * Detailed info.
 *
 */
#pragma once
#include "astaroth.h"

extern __constant__ AcMeshInfo d_mesh_info;

typedef struct {
    AcReal* in[NUM_VTXBUF_HANDLES];
    AcReal* out[NUM_VTXBUF_HANDLES];

    AcReal* profiles[NUM_SCALARARRAY_HANDLES];
} VertexBufferArray;

static int __device__ __forceinline__
DCONST(const AcIntParam param)
{
    return d_mesh_info.int_params[param];
}
static int3 __device__ __forceinline__
DCONST(const AcInt3Param param)
{
    return d_mesh_info.int3_params[param];
}
static AcReal __device__ __forceinline__
DCONST(const AcRealParam param)
{
    return d_mesh_info.real_params[param];
}
static AcReal3 __device__ __forceinline__
DCONST(const AcReal3Param param)
{
    return d_mesh_info.real3_params[param];
}
static __device__ constexpr VertexBufferHandle
DCONST(const VertexBufferHandle handle)
{
    return handle;
}
#define DEVICE_VTXBUF_IDX(i, j, k) ((i) + (j)*DCONST(AC_mx) + (k)*DCONST(AC_mxy))
#define DEVICE_1D_COMPDOMAIN_IDX(i, j, k) ((i) + (j)*DCONST(AC_nx) + (k)*DCONST(AC_nxy))
#define globalGridN (d_mesh_info.int3_params[AC_global_grid_n])
//#define globalMeshM // Placeholder
//#define localMeshN // Placeholder
//#define localMeshM // Placeholder
//#define localMeshN_min // Placeholder
//#define globalMeshN_min // Placeholder
#define d_multigpu_offset (d_mesh_info.int3_params[AC_multigpu_offset])
//#define d_multinode_offset (d_mesh_info.int3_params[AC_multinode_offset]) // Placeholder

static __device__ constexpr int
IDX(const int i)
{
    return i;
}

static __device__ __forceinline__ int
IDX(const int i, const int j, const int k)
{
    return DEVICE_VTXBUF_IDX(i, j, k);
}

static __device__ __forceinline__ int
IDX(const int3 idx)
{
    return DEVICE_VTXBUF_IDX(idx.x, idx.y, idx.z);
}

//#include <thrust/complex.h>
// using namespace thrust;

#include <cuComplex.h>
#if AC_DOUBLE_PRECISION == 1
typedef cuDoubleComplex acComplex;
#define acComplex(x, y) make_cuDoubleComplex(x, y)
#else
typedef cuFloatComplex acComplex;
#define acComplex(x, y) make_cuFloatComplex(x, y)
#endif
static __device__ inline acComplex
exp(const acComplex& val)
{
    return acComplex(exp(val.x) * cos(val.y), exp(val.x) * sin(val.y));
}
static __device__ inline acComplex operator*(const AcReal& a, const acComplex& b)
{
    return (acComplex){a * b.x, a * b.y};
}

static __device__ inline acComplex operator*(const acComplex& b, const AcReal& a)
{
    return (acComplex){a * b.x, a * b.y};
}

static __device__ inline acComplex operator*(const acComplex& a, const acComplex& b)
{
    return (acComplex){a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x};
}
//#include <complex>
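A minimal usage sketch for the helpers above (VTXBUF_LNRHO is an assumed, user-declared vertex buffer handle, not defined in this commit):

    // Inside a kernel: read and write vertex (i, j, k) of one field. IDX()
    // expands to DEVICE_VTXBUF_IDX(), i.e. row-major indexing with the AC_mx
    // and AC_mxy strides fetched from the d_mesh_info constant via DCONST().
    __global__ void example_kernel(VertexBufferArray vba)
    {
        const int idx = IDX(threadIdx.x, threadIdx.y, threadIdx.z);
        const AcReal lnrho = vba.in[VTXBUF_LNRHO][idx]; // hypothetical handle
        vba.out[VTXBUF_LNRHO][idx] = lnrho;
    }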

src/core/kernels/integration.cu (new file, 296 lines)
@@ -0,0 +1,296 @@
/*
    Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae.

    This file is part of Astaroth.

    Astaroth is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    Astaroth is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/

/**
 * @file
 * \brief Brief info.
 *
 * Detailed info.
 *
 */
#include "common.cuh"

#include "src/core/errchk.h"
#include "src/core/math_utils.h"

#include <assert.h>

__constant__ AcMeshInfo d_mesh_info; // Extern in kernels/common.cuh.

static_assert(NUM_VTXBUF_HANDLES > 0, "ERROR: At least one uniform ScalarField must be declared.");

// Device info
#define REGISTERS_PER_THREAD (255)
#define MAX_REGISTERS_PER_BLOCK (65536)
#define MAX_THREADS_PER_BLOCK (1024)
#define WARP_SIZE (32)

#define make_int3(a, b, c) \
    (int3) { (int)a, (int)b, (int)c }

template <int step_number>
static __device__ __forceinline__ AcReal
rk3_integrate(const AcReal state_previous, const AcReal state_current, const AcReal rate_of_change,
              const AcReal dt)
{
    // Williamson (1980)
    const AcReal alpha[] = {0, AcReal(.0), AcReal(-5. / 9.), AcReal(-153. / 128.)};
    const AcReal beta[]  = {0, AcReal(1. / 3.), AcReal(15. / 16.), AcReal(8. / 15.)};

    // Note the indexing: +1 to avoid an unnecessary warning about "out-of-bounds"
    // access (when accessing beta[step_number-1] even when step_number >= 1)
    switch (step_number) {
    case 0:
        return state_current + beta[step_number + 1] * rate_of_change * dt;
    case 1: // Fallthrough
    case 2:
        return state_current +
               beta[step_number + 1] * (alpha[step_number + 1] * (AcReal(1.) / beta[step_number]) *
                                            (state_current - state_previous) +
                                        rate_of_change * dt);
    default:
        return NAN;
    }
}
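// The switch above is Williamson's low-storage RK3 in disguise: in the usual
// two-register form w_i = alpha_i * w_{i-1} + dt * f(u_{i-1}) followed by
// u_i = u_{i-1} + beta_i * w_i, the previous register w_{i-1} is not stored
// explicitly but recovered from the two field buffers as
// (state_current - state_previous) / beta[step_number], which is why
// beta[step_number] appears in the denominator for steps 1 and 2.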

template <int step_number>
static __device__ __forceinline__ AcReal3
rk3_integrate(const AcReal3 state_previous, const AcReal3 state_current,
              const AcReal3 rate_of_change, const AcReal dt)
{
    return (AcReal3){
        rk3_integrate<step_number>(state_previous.x, state_current.x, rate_of_change.x, dt),
        rk3_integrate<step_number>(state_previous.y, state_current.y, rate_of_change.y, dt),
        rk3_integrate<step_number>(state_previous.z, state_current.z, rate_of_change.z, dt)};
}

#define rk3(state_previous, state_current, rate_of_change, dt) \
    rk3_integrate<step_number>(state_previous, value(state_current), rate_of_change, dt)

static __device__ void
write(AcReal* __restrict__ out[], const int handle, const int idx, const AcReal value)
{
    out[handle][idx] = value;
}

static __device__ __forceinline__ void
write(AcReal* __restrict__ out[], const int3 vec, const int idx, const AcReal3 value)
{
    write(out, vec.x, idx, value.x);
    write(out, vec.y, idx, value.y);
    write(out, vec.z, idx, value.z);
}

static __device__ __forceinline__ AcReal
read_out(const int idx, AcReal* __restrict__ field[], const int handle)
{
    return field[handle][idx];
}

static __device__ __forceinline__ AcReal3
read_out(const int idx, AcReal* __restrict__ field[], const int3 handle)
{
    return (AcReal3){read_out(idx, field, handle.x), read_out(idx, field, handle.y),
                     read_out(idx, field, handle.z)};
}

#define WRITE_OUT(handle, value) (write(buffer.out, handle, idx, value))
#define READ(handle) (read_data(vertexIdx, globalVertexIdx, buffer.in, handle))
#define READ_OUT(handle) (read_out(idx, buffer.out, handle))

#define GEN_PREPROCESSED_PARAM_BOILERPLATE const int3 &vertexIdx, const int3 &globalVertexIdx
#define GEN_KERNEL_PARAM_BOILERPLATE const int3 start, const int3 end, VertexBufferArray buffer

#define GEN_KERNEL_BUILTIN_VARIABLES_BOILERPLATE() \
    const int3 vertexIdx = (int3){threadIdx.x + blockIdx.x * blockDim.x + start.x, \
                                  threadIdx.y + blockIdx.y * blockDim.y + start.y, \
                                  threadIdx.z + blockIdx.z * blockDim.z + start.z}; \
    const int3 globalVertexIdx = (int3){d_multigpu_offset.x + vertexIdx.x, \
                                        d_multigpu_offset.y + vertexIdx.y, \
                                        d_multigpu_offset.z + vertexIdx.z}; \
    (void)globalVertexIdx; \
    if (vertexIdx.x >= end.x || vertexIdx.y >= end.y || vertexIdx.z >= end.z) \
        return; \
    \
    assert(vertexIdx.x < DCONST(AC_nx_max) && vertexIdx.y < DCONST(AC_ny_max) && \
           vertexIdx.z < DCONST(AC_nz_max)); \
    \
    assert(vertexIdx.x >= DCONST(AC_nx_min) && vertexIdx.y >= DCONST(AC_ny_min) && \
           vertexIdx.z >= DCONST(AC_nz_min)); \
    \
    const int idx = IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z);

// clang-format off
/*
#define GEN_DEVICE_FUNC_HOOK(identifier) \
    template <int step_number> \
    AcResult acDeviceKernel_##identifier(const cudaStream_t stream, \
                                         const int3 start, const int3 end, VertexBufferArray vba) \
    { \
        \
        const dim3 tpb(32, 1, 4); \
        \
        const int3 n = end - start; \
        const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), \
                       (unsigned int)ceil(n.y / AcReal(tpb.y)), \
                       (unsigned int)ceil(n.z / AcReal(tpb.z))); \
        \
        identifier<step_number> \
            <<<bpg, tpb, 0, stream>>>(start, end, vba); \
        ERRCHK_CUDA_KERNEL(); \
        \
        return AC_SUCCESS; \
    }
*/
#define GEN_DEVICE_FUNC_HOOK(identifier)

#include "user_kernels.h"

static dim3 rk3_tpb(32, 1, 4);

AcResult
acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferArray vba)
{
    // RK3
    dim3 best_dims(0, 0, 0);
    float best_time = INFINITY;
    const int num_iterations = 10;

    for (int z = 1; z <= MAX_THREADS_PER_BLOCK; ++z) {
        for (int y = 1; y <= MAX_THREADS_PER_BLOCK; ++y) {
            for (int x = WARP_SIZE; x <= MAX_THREADS_PER_BLOCK; x += WARP_SIZE) {

                if (x > end.x - start.x || y > end.y - start.y || z > end.z - start.z)
                    break;
                if (x * y * z > MAX_THREADS_PER_BLOCK)
                    break;

                if (x * y * z * REGISTERS_PER_THREAD > MAX_REGISTERS_PER_BLOCK)
                    break;

                if (((x * y * z) % WARP_SIZE) != 0)
                    continue;

                const dim3 tpb(x, y, z);
                const int3 n = end - start;
                const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), //
                               (unsigned int)ceil(n.y / AcReal(tpb.y)), //
                               (unsigned int)ceil(n.z / AcReal(tpb.z)));

                cudaDeviceSynchronize();
                if (cudaGetLastError() != cudaSuccess) // resets the error if any
                    continue;

                // printf("(%d, %d, %d)\n", x, y, z);

                cudaEvent_t tstart, tstop;
                cudaEventCreate(&tstart);
                cudaEventCreate(&tstop);

                // #ifdef AC_dt
                // acDeviceLoadScalarUniform(device, STREAM_DEFAULT, AC_dt, FLT_EPSILON); // TODO note, temporarily disabled
                /*#else
                    ERROR("FATAL ERROR: acDeviceAutoOptimize() or acDeviceIntegrateSubstep() was "
                          "called, but AC_dt was not defined. Either define it or call the generated "
                          "device function acDeviceKernel_<kernel name> which does not require the "
                          "timestep to be defined.\n");
                #endif*/

                cudaEventRecord(tstart); // ---------------------------------------- Timing start
                for (int i = 0; i < num_iterations; ++i)
                    solve<2><<<bpg, tpb>>>(start, end, vba);

                cudaEventRecord(tstop); // ----------------------------------------- Timing end
                cudaEventSynchronize(tstop);
                float milliseconds = 0;
                cudaEventElapsedTime(&milliseconds, tstart, tstop);

                ERRCHK_CUDA_KERNEL_ALWAYS();
                if (milliseconds < best_time) {
                    best_time = milliseconds;
                    best_dims = tpb;
                }
            }
        }
    }
#if VERBOSE_PRINTING
    printf(
        "Auto-optimization done. The best threadblock dimensions for rkStep: (%d, %d, %d) %f ms\n",
        best_dims.x, best_dims.y, best_dims.z, double(best_time) / num_iterations);
#endif
    /*
    FILE* fp = fopen("../config/rk3_tbdims.cuh", "w");
    ERRCHK(fp);
    fprintf(fp, "%d, %d, %d\n", best_dims.x, best_dims.y, best_dims.z);
    fclose(fp);
    */

    rk3_tpb = best_dims;
    return AC_SUCCESS;
}

AcResult
acKernelIntegrateSubstep(const cudaStream_t stream, const int step_number, const int3 start, const int3 end, VertexBufferArray vba)
{
    const dim3 tpb = rk3_tpb;

    const int3 n = end - start;
    const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), //
                   (unsigned int)ceil(n.y / AcReal(tpb.y)), //
                   (unsigned int)ceil(n.z / AcReal(tpb.z)));

    //#ifdef AC_dt
    // acDeviceLoadScalarUniform(device, stream, AC_dt, dt);
    /*#else
        (void)dt;
        ERROR("FATAL ERROR: acDeviceAutoOptimize() or acDeviceIntegrateSubstep() was "
              "called, but AC_dt was not defined. Either define it or call the generated "
              "device function acDeviceKernel_<kernel name> which does not require the "
              "timestep to be defined.\n");
    #endif*/
    if (step_number == 0)
        solve<0><<<bpg, tpb, 0, stream>>>(start, end, vba);
    else if (step_number == 1)
        solve<1><<<bpg, tpb, 0, stream>>>(start, end, vba);
    else
        solve<2><<<bpg, tpb, 0, stream>>>(start, end, vba);

    ERRCHK_CUDA_KERNEL();

    return AC_SUCCESS;
}

static __global__ void
dummy_kernel(void)
{
    DCONST((AcIntParam)0);
    DCONST((AcInt3Param)0);
    DCONST((AcRealParam)0);
    DCONST((AcReal3Param)0);
    acComplex a = exp(AcReal(1) * acComplex(1, 1) * AcReal(1));
    a * a;
}

AcResult
acKernelDummy(void)
{
    dummy_kernel<<<1, 1>>>();
    ERRCHK_CUDA_KERNEL_ALWAYS();
    return AC_SUCCESS;
}

@@ -25,146 +25,12 @@
 *
 */
#pragma once
#include "src/core/math_utils.h"

#include <assert.h>
AcResult
acKernelDummy(void);

static_assert(NUM_VTXBUF_HANDLES > 0, "ERROR: At least one uniform ScalarField must be declared.");
AcResult
acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferArray vba);

static __device__ constexpr int
IDX(const int i)
{
    return i;
}

static __device__ __forceinline__ int
IDX(const int i, const int j, const int k)
{
    return DEVICE_VTXBUF_IDX(i, j, k);
}

static __device__ __forceinline__ int
IDX(const int3 idx)
{
    return DEVICE_VTXBUF_IDX(idx.x, idx.y, idx.z);
}

#define make_int3(a, b, c) \
    (int3) { (int)a, (int)b, (int)c }

template <int step_number>
static __device__ __forceinline__ AcReal
rk3_integrate(const AcReal state_previous, const AcReal state_current, const AcReal rate_of_change,
              const AcReal dt)
{
    // Williamson (1980)
    const AcReal alpha[] = {0, AcReal(.0), AcReal(-5. / 9.), AcReal(-153. / 128.)};
    const AcReal beta[]  = {0, AcReal(1. / 3.), AcReal(15. / 16.), AcReal(8. / 15.)};

    // Note the indexing: +1 to avoid an unnecessary warning about "out-of-bounds"
    // access (when accessing beta[step_number-1] even when step_number >= 1)
    switch (step_number) {
    case 0:
        return state_current + beta[step_number + 1] * rate_of_change * dt;
    case 1: // Fallthrough
    case 2:
        return state_current +
               beta[step_number + 1] * (alpha[step_number + 1] * (AcReal(1.) / beta[step_number]) *
                                            (state_current - state_previous) +
                                        rate_of_change * dt);
    default:
        return NAN;
    }
}

template <int step_number>
static __device__ __forceinline__ AcReal3
rk3_integrate(const AcReal3 state_previous, const AcReal3 state_current,
              const AcReal3 rate_of_change, const AcReal dt)
{
    return (AcReal3){
        rk3_integrate<step_number>(state_previous.x, state_current.x, rate_of_change.x, dt),
        rk3_integrate<step_number>(state_previous.y, state_current.y, rate_of_change.y, dt),
        rk3_integrate<step_number>(state_previous.z, state_current.z, rate_of_change.z, dt)};
}

#define rk3(state_previous, state_current, rate_of_change, dt) \
    rk3_integrate<step_number>(state_previous, value(state_current), rate_of_change, dt)

static __device__ void
write(AcReal* __restrict__ out[], const int handle, const int idx, const AcReal value)
{
    out[handle][idx] = value;
}

static __device__ __forceinline__ void
write(AcReal* __restrict__ out[], const int3 vec, const int idx, const AcReal3 value)
{
    write(out, vec.x, idx, value.x);
    write(out, vec.y, idx, value.y);
    write(out, vec.z, idx, value.z);
}

static __device__ __forceinline__ AcReal
read_out(const int idx, AcReal* __restrict__ field[], const int handle)
{
    return field[handle][idx];
}

static __device__ __forceinline__ AcReal3
read_out(const int idx, AcReal* __restrict__ field[], const int3 handle)
{
    return (AcReal3){read_out(idx, field, handle.x), read_out(idx, field, handle.y),
                     read_out(idx, field, handle.z)};
}

#define WRITE_OUT(handle, value) (write(buffer.out, handle, idx, value))
#define READ(handle) (read_data(vertexIdx, globalVertexIdx, buffer.in, handle))
#define READ_OUT(handle) (read_out(idx, buffer.out, handle))

#define GEN_PREPROCESSED_PARAM_BOILERPLATE const int3 &vertexIdx, const int3 &globalVertexIdx
#define GEN_KERNEL_PARAM_BOILERPLATE const int3 start, const int3 end, VertexBufferArray buffer

#define GEN_KERNEL_BUILTIN_VARIABLES_BOILERPLATE() \
    const int3 vertexIdx = (int3){threadIdx.x + blockIdx.x * blockDim.x + start.x, \
                                  threadIdx.y + blockIdx.y * blockDim.y + start.y, \
                                  threadIdx.z + blockIdx.z * blockDim.z + start.z}; \
    const int3 globalVertexIdx = (int3){d_multigpu_offset.x + vertexIdx.x, \
                                        d_multigpu_offset.y + vertexIdx.y, \
                                        d_multigpu_offset.z + vertexIdx.z}; \
    (void)globalVertexIdx; \
    if (vertexIdx.x >= end.x || vertexIdx.y >= end.y || vertexIdx.z >= end.z) \
        return; \
    \
    assert(vertexIdx.x < DCONST(AC_nx_max) && vertexIdx.y < DCONST(AC_ny_max) && \
           vertexIdx.z < DCONST(AC_nz_max)); \
    \
    assert(vertexIdx.x >= DCONST(AC_nx_min) && vertexIdx.y >= DCONST(AC_ny_min) && \
           vertexIdx.z >= DCONST(AC_nz_min)); \
    \
    const int idx = IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z);

// clang-format off
#define GEN_DEVICE_FUNC_HOOK(identifier) \
    template <int step_number> \
    AcResult acDeviceKernel_##identifier(const Device device, const Stream stream, \
                                         const int3 start, const int3 end) \
    { \
        cudaSetDevice(device->id); \
        \
        const dim3 tpb(32, 1, 4); \
        \
        const int3 n = end - start; \
        const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), \
                       (unsigned int)ceil(n.y / AcReal(tpb.y)), \
                       (unsigned int)ceil(n.z / AcReal(tpb.z))); \
        \
        identifier<step_number> \
            <<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba); \
        ERRCHK_CUDA_KERNEL(); \
        \
        return AC_SUCCESS; \
    }

#include "user_kernels.h"
AcResult
acKernelIntegrateSubstep(const cudaStream_t stream, const int step_number, const int3 start, const int3 end, VertexBufferArray vba);

src/core/kernels/reductions.cu (new file, 283 lines)
@@ -0,0 +1,283 @@
/*
    Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae.

    This file is part of Astaroth.

    Astaroth is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    Astaroth is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/

/**
 * @file
 * \brief Brief info.
 *
 * Detailed info.
 *
 */
#include "common.cuh"

#include "src/core/errchk.h"
#include "src/core/math_utils.h" // is_power_of_two

/*
Reduction steps:
    1 of 3: Compute the initial value (a, a*a or exp(a)*exp(a)) and put the result in scratchpad
    2 of 3: Compute most of the reductions into a single block of data
    3 of 3: After all results have been stored into the final block, reduce the data in the final block
*/
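/*
Illustrative walk-through with num_elems = 1024 and tpb_reduce = 128 (values
chosen for this example only):
    1 of 3: kernel_filter writes 1024 filtered values into scratchpad.
    2 of 3: kernel_reduce launches 1024 / 128 = 8 blocks; each block reduces its
            128 values in shared memory and leaves one partial result at
            scratchpad[blockIdx.x * 128].
    3 of 3: kernel_reduce_block folds the 8 partial results into *reduce_result.
*/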
|
||||
|
||||
// Function pointer definitions
|
||||
typedef AcReal (*FilterFunc)(const AcReal&);
|
||||
typedef AcReal (*FilterFuncVec)(const AcReal&, const AcReal&, const AcReal&);
|
||||
typedef AcReal (*ReduceFunc)(const AcReal&, const AcReal&);
|
||||
|
||||
// clang-format off
|
||||
/* Comparison funcs */
|
||||
static __device__ inline AcReal
|
||||
dmax(const AcReal& a, const AcReal& b) { return a > b ? a : b; }
|
||||
|
||||
static __device__ inline AcReal
|
||||
dmin(const AcReal& a, const AcReal& b) { return a < b ? a : b; }
|
||||
|
||||
static __device__ inline AcReal
|
||||
dsum(const AcReal& a, const AcReal& b) { return a + b; }
|
||||
|
||||
/* Function used to determine the values used during reduction */
|
||||
static __device__ inline AcReal
|
||||
dvalue(const AcReal& a) { return AcReal(a); }
|
||||
|
||||
static __device__ inline AcReal
|
||||
dsquared(const AcReal& a) { return (AcReal)(a*a); }
|
||||
|
||||
static __device__ inline AcReal
|
||||
dexp_squared(const AcReal& a) { return exp(a)*exp(a); }
|
||||
|
||||
static __device__ inline AcReal
|
||||
dlength_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return sqrt(a*a + b*b + c*c); }
|
||||
|
||||
static __device__ inline AcReal
|
||||
dsquared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return dsquared(a) + dsquared(b) + dsquared(c); }
|
||||
|
||||
static __device__ inline AcReal
|
||||
dexp_squared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return dexp_squared(a) + dexp_squared(b) + dexp_squared(c); }
|
||||
// clang-format on
|
||||
|
||||
#include <assert.h>
|
||||
template <FilterFunc filter>
|
||||
__global__ void
|
||||
kernel_filter(const __restrict__ AcReal* src, const int3 start, const int3 end, AcReal* dst)
|
||||
{
|
||||
const int3 src_idx = (int3){start.x + threadIdx.x + blockIdx.x * blockDim.x,
|
||||
start.y + threadIdx.y + blockIdx.y * blockDim.y,
|
||||
start.z + threadIdx.z + blockIdx.z * blockDim.z};
|
||||
|
||||
const int nx = end.x - start.x;
|
||||
const int ny = end.y - start.y;
|
||||
const int nz = end.z - start.z;
|
||||
(void)nz; // Suppressed unused variable warning when not compiling with debug flags
|
||||
|
||||
const int3 dst_idx = (int3){threadIdx.x + blockIdx.x * blockDim.x,
|
||||
threadIdx.y + blockIdx.y * blockDim.y,
|
||||
threadIdx.z + blockIdx.z * blockDim.z};
|
||||
|
||||
assert(src_idx.x < DCONST(AC_nx_max) && src_idx.y < DCONST(AC_ny_max) &&
|
||||
src_idx.z < DCONST(AC_nz_max));
|
||||
assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
|
||||
assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);
|
||||
|
||||
dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(src[IDX(src_idx)]);
|
||||
}
|
||||
|
||||
template <FilterFuncVec filter>
|
||||
__global__ void
|
||||
kernel_filter_vec(const __restrict__ AcReal* src0, const __restrict__ AcReal* src1,
|
||||
const __restrict__ AcReal* src2, const int3 start, const int3 end, AcReal* dst)
|
||||
{
|
||||
const int3 src_idx = (int3){start.x + threadIdx.x + blockIdx.x * blockDim.x,
|
||||
start.y + threadIdx.y + blockIdx.y * blockDim.y,
|
||||
start.z + threadIdx.z + blockIdx.z * blockDim.z};
|
||||
|
||||
const int nx = end.x - start.x;
|
||||
const int ny = end.y - start.y;
|
||||
const int nz = end.z - start.z;
|
||||
(void)nz; // Suppressed unused variable warning when not compiling with debug flags
|
||||
|
||||
const int3 dst_idx = (int3){threadIdx.x + blockIdx.x * blockDim.x,
|
||||
threadIdx.y + blockIdx.y * blockDim.y,
|
||||
threadIdx.z + blockIdx.z * blockDim.z};
|
||||
|
||||
assert(src_idx.x < DCONST(AC_nx_max) && src_idx.y < DCONST(AC_ny_max) &&
|
||||
src_idx.z < DCONST(AC_nz_max));
|
||||
assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
|
||||
assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);
|
||||
|
||||
dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(
|
||||
src0[IDX(src_idx)], src1[IDX(src_idx)], src2[IDX(src_idx)]);
|
||||
}
|
||||
|
||||
template <ReduceFunc reduce>
|
||||
__global__ void
|
||||
kernel_reduce(AcReal* scratchpad, const int num_elems)
|
||||
{
|
||||
const int idx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
|
||||
extern __shared__ AcReal smem[];
|
||||
if (idx < num_elems) {
|
||||
smem[threadIdx.x] = scratchpad[idx];
|
||||
}
|
||||
else {
|
||||
smem[threadIdx.x] = NAN;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
int offset = blockDim.x / 2;
|
||||
assert(offset % 2 == 0);
|
||||
while (offset > 0) {
|
||||
if (threadIdx.x < offset) {
|
||||
smem[threadIdx.x] = reduce(smem[threadIdx.x], smem[threadIdx.x + offset]);
|
||||
}
|
||||
offset /= 2;
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
scratchpad[idx] = smem[threadIdx.x];
|
||||
}
|
||||
}
|
||||
|
||||
template <ReduceFunc reduce>
|
||||
__global__ void
|
||||
kernel_reduce_block(const __restrict__ AcReal* scratchpad, const int num_blocks,
|
||||
const int block_size, AcReal* result)
|
||||
{
|
||||
const int idx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if (idx != 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
AcReal res = scratchpad[0];
|
||||
for (int i = 1; i < num_blocks; ++i) {
|
||||
res = reduce(res, scratchpad[i * block_size]);
|
||||
}
|
||||
*result = res;
|
||||
}
|
||||
|
||||
AcReal
acKernelReduceScal(const cudaStream_t stream, const ReductionType rtype, const int3& start,
                   const int3& end, const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result)
{
    const unsigned nx        = end.x - start.x;
    const unsigned ny        = end.y - start.y;
    const unsigned nz        = end.z - start.z;
    const unsigned num_elems = nx * ny * nz;

    const dim3 tpb_filter(32, 4, 1);
    const dim3 bpg_filter((unsigned int)ceil(nx / AcReal(tpb_filter.x)),
                          (unsigned int)ceil(ny / AcReal(tpb_filter.y)),
                          (unsigned int)ceil(nz / AcReal(tpb_filter.z)));

    const int tpb_reduce = 128;
    const int bpg_reduce = num_elems / tpb_reduce; // Truncating division: elements beyond
                                                   // bpg_reduce * tpb_reduce are not reduced

    ERRCHK(nx >= tpb_filter.x);
    ERRCHK(ny >= tpb_filter.y);
    ERRCHK(nz >= tpb_filter.z);
    ERRCHK(tpb_reduce <= num_elems);
    ERRCHK(nx * ny * nz % 2 == 0);

    // clang-format off
    if (rtype == RTYPE_MAX) {
        kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
        kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
        kernel_reduce_block<dmax><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
    } else if (rtype == RTYPE_MIN) {
        kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
        kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
        kernel_reduce_block<dmin><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
    } else if (rtype == RTYPE_RMS) {
        kernel_filter<dsquared><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
    } else if (rtype == RTYPE_RMS_EXP) {
        kernel_filter<dexp_squared><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
    } else if (rtype == RTYPE_SUM) {
        kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
    } else {
        ERROR("Unrecognized rtype");
    }
    // clang-format on
    cudaStreamSynchronize(stream);
    AcReal result;
    cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
    return result;
}

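For orientation, here is a minimal host-side sketch of driving this entry point on the default stream. The buffer names, the nx/ny/nz extents, and the NGHOST ghost-zone constant are assumptions for the example, and error checking is omitted:

// Sketch under assumed names: d_vtxbuf is a device vertex buffer of a mesh whose
// computational domain is nx * ny * nz points surrounded by NGHOST ghost cells per side.
AcReal* d_scratchpad; // holds the filtered values, then the per-block partial results
AcReal* d_result;     // single-element buffer for the final reduction result
cudaMalloc(&d_scratchpad, sizeof(AcReal) * nx * ny * nz);
cudaMalloc(&d_result, sizeof(AcReal));

const int3 start = (int3){NGHOST, NGHOST, NGHOST};
const int3 end   = (int3){NGHOST + nx, NGHOST + ny, NGHOST + nz};
const AcReal max_val = acKernelReduceScal(0, RTYPE_MAX, start, end,
                                          d_vtxbuf, d_scratchpad, d_result);

cudaFree(d_scratchpad);
cudaFree(d_result);
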
AcReal
acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, const int3& start, const int3& end,
                  const AcReal* vtxbuf0, const AcReal* vtxbuf1, const AcReal* vtxbuf2, AcReal* scratchpad,
                  AcReal* reduce_result)
{
    const unsigned nx        = end.x - start.x;
    const unsigned ny        = end.y - start.y;
    const unsigned nz        = end.z - start.z;
    const unsigned num_elems = nx * ny * nz;

    const dim3 tpb_filter(32, 4, 1);
    const dim3 bpg_filter((unsigned int)ceil(nx / AcReal(tpb_filter.x)),
                          (unsigned int)ceil(ny / AcReal(tpb_filter.y)),
                          (unsigned int)ceil(nz / AcReal(tpb_filter.z)));

    const int tpb_reduce = 128;
    const int bpg_reduce = num_elems / tpb_reduce;

    ERRCHK(nx >= tpb_filter.x);
    ERRCHK(ny >= tpb_filter.y);
    ERRCHK(nz >= tpb_filter.z);
    ERRCHK(tpb_reduce <= num_elems);
    ERRCHK(nx * ny * nz % 2 == 0);

    // clang-format off
    if (rtype == RTYPE_MAX) {
        kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
        kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
        kernel_reduce_block<dmax><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
    } else if (rtype == RTYPE_MIN) {
        kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
        kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
        kernel_reduce_block<dmin><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
    } else if (rtype == RTYPE_RMS) {
        kernel_filter_vec<dsquared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
    } else if (rtype == RTYPE_RMS_EXP) {
        kernel_filter_vec<dexp_squared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
    } else if (rtype == RTYPE_SUM) {
        kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
    } else {
        ERROR("Unrecognized rtype");
    }
    // clang-format on

    cudaStreamSynchronize(stream);
    AcReal result;
    cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
    return result;
}

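One caveat worth spelling out: for RTYPE_RMS and RTYPE_RMS_EXP these entry points return the reduced sum of squares (dsquared/dexp_squared followed by dsum), not the root mean square itself, so a caller presumably finishes the computation on the host. A sketch, with d_uux/d_uuy/d_uuz, start/end and num_elems assumed from the caller's context:

// Assumed host-side follow-up: turn the returned sum of squares into an RMS value.
const AcReal sum_sq = acKernelReduceVec(0, RTYPE_RMS, start, end,
                                        d_uux, d_uuy, d_uuz, d_scratchpad, d_result);
const AcReal rms    = AcReal(sqrt(sum_sq / AcReal(num_elems)));
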
@@ -25,258 +25,13 @@
 *
 */
#pragma once
#include <astaroth.h>

#include "src/core/math_utils.h" // is_power_of_two

AcReal
acKernelReduceScal(const cudaStream_t stream, const ReductionType rtype, const int3& start,
                   const int3& end, const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result);

/*
Reduction steps:
 1 of 3: Compute the initial value (a, a*a or exp(a)*exp(a)) for each point and store it in scratchpad
 2 of 3: Reduce each thread block's values down to one partial result per block
 3 of 3: After all partial results have been stored, reduce them into the final value
*/

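/*
Worked example with hypothetical sizes: for num_elems = 256 and a reduction block size of 128,
step 1 of 3 fills scratchpad[0..255] with the filtered values, step 2 of 3 launches two
128-thread blocks whose tree reductions leave one partial result each at scratchpad[0] and
scratchpad[128], and step 3 of 3 runs a single thread that combines those two partials into
the final result.
*/
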
// Function pointer definitions
typedef AcReal (*FilterFunc)(const AcReal&);
typedef AcReal (*FilterFuncVec)(const AcReal&, const AcReal&, const AcReal&);
typedef AcReal (*ReduceFunc)(const AcReal&, const AcReal&);

// clang-format off
/* Comparison funcs */
static __device__ inline AcReal
dmax(const AcReal& a, const AcReal& b) { return a > b ? a : b; }

static __device__ inline AcReal
dmin(const AcReal& a, const AcReal& b) { return a < b ? a : b; }

static __device__ inline AcReal
dsum(const AcReal& a, const AcReal& b) { return a + b; }

/* Functions used to determine the values used during reduction */
static __device__ inline AcReal
dvalue(const AcReal& a) { return AcReal(a); }

static __device__ inline AcReal
dsquared(const AcReal& a) { return (AcReal)(a*a); }

static __device__ inline AcReal
dexp_squared(const AcReal& a) { return exp(a)*exp(a); }

static __device__ inline AcReal
dlength_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return sqrt(a*a + b*b + c*c); }

static __device__ inline AcReal
dsquared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return dsquared(a) + dsquared(b) + dsquared(c); }

static __device__ inline AcReal
dexp_squared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return dexp_squared(a) + dexp_squared(b) + dexp_squared(c); }
// clang-format on

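Because the kernels below are templated over these function pointers, adding a new reduction only requires another __device__ function with a matching signature. For instance, a hypothetical absolute-value filter (not part of this commit) would slot in as follows:

// Hypothetical FilterFunc, usable as e.g. kernel_filter<dabs><<<bpg, tpb, 0, stream>>>(...):
static __device__ inline AcReal
dabs(const AcReal& a) { return a < AcReal(0.0) ? -a : a; }
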
#include <assert.h>
template <FilterFunc filter>
__global__ void
kernel_filter(const __restrict__ AcReal* src, const int3 start, const int3 end, AcReal* dst)
{
    const int3 src_idx = (int3){start.x + threadIdx.x + blockIdx.x * blockDim.x,
                                start.y + threadIdx.y + blockIdx.y * blockDim.y,
                                start.z + threadIdx.z + blockIdx.z * blockDim.z};

    const int nx = end.x - start.x;
    const int ny = end.y - start.y;
    const int nz = end.z - start.z;
    (void)nz; // Suppresses an unused-variable warning when asserts are compiled out

    const int3 dst_idx = (int3){threadIdx.x + blockIdx.x * blockDim.x,
                                threadIdx.y + blockIdx.y * blockDim.y,
                                threadIdx.z + blockIdx.z * blockDim.z};

    assert(src_idx.x < DCONST(AC_nx_max) && src_idx.y < DCONST(AC_ny_max) &&
           src_idx.z < DCONST(AC_nz_max));
    assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
    assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);

    dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(src[IDX(src_idx)]);
}

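// In effect, kernel_filter gathers from the ghost-zoned mesh (IDX presumably maps a 3D point
// to its offset in the padded buffer, hence the AC_n*_max bounds asserts) into a densely
// packed nx * ny * nz array that the 1D reduction kernels can treat as a flat range.
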
template <FilterFuncVec filter>
__global__ void
kernel_filter_vec(const __restrict__ AcReal* src0, const __restrict__ AcReal* src1,
                  const __restrict__ AcReal* src2, const int3 start, const int3 end, AcReal* dst)
{
    const int3 src_idx = (int3){start.x + threadIdx.x + blockIdx.x * blockDim.x,
                                start.y + threadIdx.y + blockIdx.y * blockDim.y,
                                start.z + threadIdx.z + blockIdx.z * blockDim.z};

    const int nx = end.x - start.x;
    const int ny = end.y - start.y;
    const int nz = end.z - start.z;
    (void)nz; // Suppresses an unused-variable warning when asserts are compiled out

    const int3 dst_idx = (int3){threadIdx.x + blockIdx.x * blockDim.x,
                                threadIdx.y + blockIdx.y * blockDim.y,
                                threadIdx.z + blockIdx.z * blockDim.z};

    assert(src_idx.x < DCONST(AC_nx_max) && src_idx.y < DCONST(AC_ny_max) &&
           src_idx.z < DCONST(AC_nz_max));
    assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
    assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);

    dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(
        src0[IDX(src_idx)], src1[IDX(src_idx)], src2[IDX(src_idx)]);
}

template <ReduceFunc reduce>
__global__ void
kernel_reduce(AcReal* scratchpad, const int num_elems)
{
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;

    extern __shared__ AcReal smem[];
    if (idx < num_elems) {
        smem[threadIdx.x] = scratchpad[idx];
    }
    else {
        smem[threadIdx.x] = NAN;
    }
    __syncthreads();

    int offset = blockDim.x / 2;
    assert(offset % 2 == 0);
    while (offset > 0) {
        if (threadIdx.x < offset) {
            smem[threadIdx.x] = reduce(smem[threadIdx.x], smem[threadIdx.x + offset]);
        }
        offset /= 2;
        __syncthreads();
    }

    if (threadIdx.x == 0) {
        scratchpad[idx] = smem[threadIdx.x];
    }
}

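// Example trace with a hypothetical blockDim.x = 8: offset takes the values 4, 2, 1, and after
// three passes smem[0] holds the reduction over smem[0..7]. Thread 0 then writes it back to
// scratchpad[blockIdx.x * blockDim.x], which is exactly the i * block_size stride that
// kernel_reduce_block reads below.
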
template <ReduceFunc reduce>
__global__ void
kernel_reduce_block(const __restrict__ AcReal* scratchpad, const int num_blocks,
                    const int block_size, AcReal* result)
{
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx != 0) {
        return;
    }

    AcReal res = scratchpad[0];
    for (int i = 1; i < num_blocks; ++i) {
        res = reduce(res, scratchpad[i * block_size]);
    }
    *result = res;
}

static AcReal
reduce_scal(const cudaStream_t stream, const ReductionType rtype, const int3& start,
            const int3& end, const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result)
{
    const unsigned nx        = end.x - start.x;
    const unsigned ny        = end.y - start.y;
    const unsigned nz        = end.z - start.z;
    const unsigned num_elems = nx * ny * nz;

    const dim3 tpb_filter(32, 4, 1);
    const dim3 bpg_filter((unsigned int)ceil(nx / AcReal(tpb_filter.x)),
                          (unsigned int)ceil(ny / AcReal(tpb_filter.y)),
                          (unsigned int)ceil(nz / AcReal(tpb_filter.z)));

    const int tpb_reduce = 128;
    const int bpg_reduce = num_elems / tpb_reduce;

    ERRCHK(nx >= tpb_filter.x);
    ERRCHK(ny >= tpb_filter.y);
    ERRCHK(nz >= tpb_filter.z);
    ERRCHK(tpb_reduce <= num_elems);
    ERRCHK(nx * ny * nz % 2 == 0);

    // clang-format off
    if (rtype == RTYPE_MAX) {
        kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
        kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
        kernel_reduce_block<dmax><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
    } else if (rtype == RTYPE_MIN) {
        kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
        kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
        kernel_reduce_block<dmin><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
    } else if (rtype == RTYPE_RMS) {
        kernel_filter<dsquared><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
    } else if (rtype == RTYPE_RMS_EXP) {
        kernel_filter<dexp_squared><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
    } else if (rtype == RTYPE_SUM) {
        kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
    } else {
        ERROR("Unrecognized rtype");
    }
    // clang-format on
    cudaStreamSynchronize(stream);
    AcReal result;
    cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
    return result;
}

static AcReal
reduce_vec(const cudaStream_t stream, const ReductionType rtype, const int3& start, const int3& end,
AcReal
acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, const int3& start, const int3& end,
                  const AcReal* vtxbuf0, const AcReal* vtxbuf1, const AcReal* vtxbuf2, AcReal* scratchpad,
                  AcReal* reduce_result)
{
    const unsigned nx        = end.x - start.x;
    const unsigned ny        = end.y - start.y;
    const unsigned nz        = end.z - start.z;
    const unsigned num_elems = nx * ny * nz;

    const dim3 tpb_filter(32, 4, 1);
    const dim3 bpg_filter((unsigned int)ceil(nx / AcReal(tpb_filter.x)),
                          (unsigned int)ceil(ny / AcReal(tpb_filter.y)),
                          (unsigned int)ceil(nz / AcReal(tpb_filter.z)));

    const int tpb_reduce = 128;
    const int bpg_reduce = num_elems / tpb_reduce;

    ERRCHK(nx >= tpb_filter.x);
    ERRCHK(ny >= tpb_filter.y);
    ERRCHK(nz >= tpb_filter.z);
    ERRCHK(tpb_reduce <= num_elems);
    ERRCHK(nx * ny * nz % 2 == 0);

    // clang-format off
    if (rtype == RTYPE_MAX) {
        kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
        kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
        kernel_reduce_block<dmax><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
    } else if (rtype == RTYPE_MIN) {
        kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
        kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
        kernel_reduce_block<dmin><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
    } else if (rtype == RTYPE_RMS) {
        kernel_filter_vec<dsquared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
    } else if (rtype == RTYPE_RMS_EXP) {
        kernel_filter_vec<dexp_squared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
    } else if (rtype == RTYPE_SUM) {
        kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
    } else {
        ERROR("Unrecognized rtype");
    }
    // clang-format on
    cudaStreamSynchronize(stream);
    AcReal result;
    cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
    return result;
}

AcReal
acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, const int3& start, const int3& end,
                  const AcReal* vtxbuf0, const AcReal* vtxbuf1, const AcReal* vtxbuf2, AcReal* scratchpad,
                  AcReal* reduce_result);