Added Astaroth 2.0

This commit is contained in:
jpekkila
2019-06-14 14:18:35 +03:00
parent 4e4f84c8ff
commit 0e48766a68
87 changed files with 18058 additions and 1 deletion

70
src/core/CMakeLists.txt Normal file

@@ -0,0 +1,70 @@
########################################
## CMakeLists.txt for Astaroth Core ##
########################################
#----------------------Find CUDA-----------------------------------------------#
find_package(CUDA)
if (NOT CUDA_FOUND)
# find_package(CUDA REQUIRED) gives a confusing error message if it fails,
# therefore we print the reason here explicitly
message(FATAL_ERROR "CUDA not found")
endif()
#----------------------CUDA settings-------------------------------------------#
set(CUDA_SEPARABLE_COMPILATION ON)
set(CUDA_PROPAGATE_HOST_FLAGS ON)
# CUDA_BUILD_CUBIN requires that we're compiling for only one architecture
# set(CUDA_BUILD_CUBIN ON)
#----------------------Setup CUDA compilation flags----------------------------#
# Generate code for the supported GPU architectures (Kepler K80, Maxwell and Pascal)
set(CUDA_ARCH_FLAGS -gencode arch=compute_37,code=sm_37
-gencode arch=compute_50,code=sm_50
-gencode arch=compute_60,code=sm_60
-gencode arch=compute_61,code=sm_61
-lineinfo
--maxrregcount=255
-ftz=true
-std=c++11) # ftz=true: flush denormalized floats to zero
# -Xptxas -dlcm=ca opt-in to cache all global loads to L1/texture cache
# =cg to opt out
# Additional CUDA optimization flags
if (CMAKE_BUILD_TYPE MATCHES RELEASE)
# Doesn't set any additional flags, see CUDA_NVCC_FLAGS_DEBUG below on how
# to add more
set(CUDA_NVCC_FLAGS_RELEASE ${CUDA_NVCC_FLAGS_RELEASE})
endif()
# Additional CUDA debug flags
if (CMAKE_BUILD_TYPE MATCHES DEBUG)
# The debug flags must be set inside this if clause, since either CMake 3.5
# or nvcc 7.5 is buggy:
# CMake converts these into empty strings when doing a RELEASE build, but nvcc
# 7.5 fails to parse empty flags.
set(CUDA_NVCC_FLAGS_DEBUG ${CUDA_NVCC_FLAGS_DEBUG};
--device-debug;
--generate-line-info;
--ptxas-options=-v)
endif()
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};${CUDA_ARCH_FLAGS}")
message("CUDA_NVCC_FLAGS: " ${CUDA_NVCC_FLAGS})
#------------------Compile and create a static library-------------------------#
file(GLOB CUDA_SOURCES "*.cu" "kernels/*.cu")
# Use -fPIC if -fpic not supported. Some quick non-scientific tests:
# Without fpic: 4.94 user, 4.04 system, 0:09.88 elapsed
# With fpic: 4.96 user, 4.02 system, 0:09.90 elapsed
# With fPIC: 4.94 user, 4.05 system, 0:10.23 elapsed
CUDA_ADD_LIBRARY(astaroth_core STATIC ${CUDA_SOURCES} OPTIONS --compiler-options "-fpic")
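# A minimal consumption sketch (not part of this file; "ac_run" and main.cu are
# hypothetical): a parent CMakeLists.txt could link against the library with e.g.
#   cuda_add_executable(ac_run main.cu)
#   target_link_libraries(ac_run astaroth_core)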

451
src/core/astaroth.cu Normal file

@@ -0,0 +1,451 @@
/*
Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
This file is part of Astaroth.
Astaroth is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Astaroth is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* @file
* \brief Multi-GPU implementation.
*
* Detailed info.
*
*/
#include "astaroth.h"
#include "errchk.h"
#include "device.cuh"
#include "math_utils.h" // sum for reductions
#include "standalone/config_loader.h" // update_config
const char* intparam_names[] = {AC_FOR_INT_PARAM_TYPES(AC_GEN_STR)};
const char* realparam_names[] = {AC_FOR_REAL_PARAM_TYPES(AC_GEN_STR)};
const char* vtxbuf_names[] = {AC_FOR_VTXBUF_HANDLES(AC_GEN_STR)};
static const int MAX_NUM_DEVICES = 32;
static int num_devices = 1;
static Device devices[MAX_NUM_DEVICES] = {};
typedef struct {
int3 m;
int3 n;
} Grid;
static Grid
createGrid(const AcMeshInfo& config)
{
Grid grid;
grid.m = (int3) {
config.int_params[AC_mx],
config.int_params[AC_my],
config.int_params[AC_mz]
};
grid.n = (int3) {
config.int_params[AC_nx],
config.int_params[AC_ny],
config.int_params[AC_nz]
};
return grid;
}
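// Note: n spans the computational domain and m the full extent including ghost
// zones; presumably m = n + STENCIL_ORDER (a ghost zone of STENCIL_ORDER/2 cells
// on each side), as set up by update_config() in the standalone config loader.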
static Grid grid; // A grid consists of num_devices subgrids
static Grid subgrid;
static int
gridIdx(const Grid& grid, const int i, const int j, const int k)
{
return i + j * grid.m.x + k * grid.m.x * grid.m.y;
}
static int3
gridIdx3d(const Grid& grid, const int idx)
{
return (int3){idx % grid.m.x,
(idx % (grid.m.x * grid.m.y)) / grid.m.x,
idx / (grid.m.x * grid.m.y)};
}
void
printInt3(const int3 vec)
{
printf("(%d, %d, %d)", vec.x, vec.y, vec.z);
}
AcResult
acInit(const AcMeshInfo& config)
{
// Check devices
cudaGetDeviceCount(&num_devices);
if (num_devices < 1) {
ERROR("No CUDA devices found!");
return AC_FAILURE;
}
if (num_devices > MAX_NUM_DEVICES) {
WARNING("More devices found than MAX_NUM_DEVICES. Using only MAX_NUM_DEVICES");
num_devices = MAX_NUM_DEVICES;
}
if (!AC_MULTIGPU_ENABLED) {
WARNING("MULTIGPU_ENABLED was false. Using only one device");
num_devices = 1; // Use only one device if multi-GPU is not enabled
}
// Check that AC_nz is divisible by num_devices. This makes decomposing the
// problem domain across multiple GPUs much easier since we do not have to worry
// about remainders
ERRCHK_ALWAYS(config.int_params[AC_nz] % num_devices == 0);
// Decompose the problem domain
// The main grid
grid = createGrid(config);
// Subgrids
AcMeshInfo subgrid_config = config;
subgrid_config.int_params[AC_nz] /= num_devices;
update_config(&subgrid_config);
subgrid = createGrid(subgrid_config);
// Periodic boundary conditions become weird if the system can "fold onto itself".
ERRCHK_ALWAYS(subgrid.n.x >= STENCIL_ORDER);
ERRCHK_ALWAYS(subgrid.n.y >= STENCIL_ORDER);
ERRCHK_ALWAYS(subgrid.n.z >= STENCIL_ORDER);
printf("Grid m "); printInt3(grid.m); printf("\n");
printf("Grid n "); printInt3(grid.n); printf("\n");
printf("Subrid m "); printInt3(subgrid.m); printf("\n");
printf("Subrid n "); printInt3(subgrid.n); printf("\n");
// Initialize the devices
for (int i = 0; i < num_devices; ++i) {
createDevice(i, subgrid_config, &devices[i]);
printDeviceInfo(devices[i]);
}
return AC_SUCCESS;
}
AcResult
acQuit(void)
{
for (int i = 0; i < num_devices; ++i) {
destroyDevice(devices[i]);
}
return AC_SUCCESS;
}
int
gridIdxx(const Grid grid, const int3 idx)
{
return gridIdx(grid, idx.x, idx.y, idx.z);
}
AcResult
acLoadWithOffset(const AcMesh& host_mesh, const int3& src, const int num_vertices)
{
/*
Here we decompose the host mesh and distribute it among the GPUs in
the node.
The host mesh is a huge contiguous block of data. Its dimensions are given by
the global variable named "grid". A "grid" is decomposed into "subgrids",
one for each GPU. Here we check which parts of the range s0...s1 map
to the memory space stored by some GPU, spanning d0...d1, and transfer
the data if needed.
The index mapping is inherently quite involved, but here's a picture which
hopefully helps make sense out of all this.
Grid
|----num_vertices---|
xxx|....................................................|xxx
^ ^ ^ ^
d0 d1 s0 (src) s1
Subgrid
xxx|.............|xxx
^ ^
d0 d1
^ ^
db da
*/
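/*
A worked example with hypothetical numbers (assuming m = n + STENCIL_ORDER, see
the note at createGrid()): with grid.n = (128, 128, 128), STENCIL_ORDER = 6 and
num_devices = 2, we get grid.m = (134, 134, 134) and subgrid.m = (134, 134, 70).
For device 1, d0 = (0, 0, 64) and d1 = (134, 134, 134). Loading the full mesh
(src = (0, 0, 0), num_vertices = 134^3) gives s1 = (0, 0, 134), hence
da = (0, 0, 64), db = (0, 0, 134), copy_cells = 70 * 134 * 134 and
da_local = (0, 0, 64 - 1 * 128 / 2) = (0, 0, 0).
*/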
for (int i = 0; i < num_devices; ++i) {
const int3 d0 = (int3){0, 0, i * subgrid.n.z}; // DECOMPOSITION OFFSET HERE
const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.m.z};
const int3 s0 = src;
const int3 s1 = gridIdx3d(grid, gridIdx(grid, s0.x, s0.y, s0.z) + num_vertices);
const int3 da = (int3){max(s0.x, d0.x), max(s0.y, d0.y), max(s0.z, d0.z)};
const int3 db = (int3){min(s1.x, d1.x), min(s1.y, d1.y), min(s1.z, d1.z)};
/*
printf("Device %d\n", i);
printf("\ts0: "); printInt3(s0); printf("\n");
printf("\td0: "); printInt3(d0); printf("\n");
printf("\tda: "); printInt3(da); printf("\n");
printf("\tdb: "); printInt3(db); printf("\n");
printf("\td1: "); printInt3(d1); printf("\n");
printf("\ts1: "); printInt3(s1); printf("\n");
printf("\t-> %s to device %d\n", db.z >= da.z ? "Copy" : "Do not copy", i);
*/
if (db.z >= da.z) {
const int copy_cells = gridIdxx(subgrid, db) - gridIdxx(subgrid, da);
const int3 da_local = (int3){da.x, da.y, da.z - i * grid.n.z / num_devices}; // DECOMPOSITION OFFSET HERE
// printf("\t\tcopy %d cells to local index ", copy_cells); printInt3(da_local); printf("\n");
copyMeshToDevice(devices[i], STREAM_PRIMARY, host_mesh, da, da_local, copy_cells);
}
printf("\n");
}
return AC_SUCCESS;
}
AcResult
acStoreWithOffset(const int3& src, const int num_vertices, AcMesh* host_mesh)
{
// See acLoadWithOffset() for an explanation of the index mapping
for (int i = 0; i < num_devices; ++i) {
const int3 d0 = (int3){0, 0, i * subgrid.n.z}; // DECOMPOSITION OFFSET HERE
const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.m.z};
const int3 s0 = src;
const int3 s1 = gridIdx3d(grid, gridIdx(grid, s0.x, s0.y, s0.z) + num_vertices);
const int3 da = (int3){max(s0.x, d0.x), max(s0.y, d0.y), max(s0.z, d0.z)};
const int3 db = (int3){min(s1.x, d1.x), min(s1.y, d1.y), min(s1.z, d1.z)};
/*
printf("Device %d\n", i);
printf("\ts0: "); printInt3(s0); printf("\n");
printf("\td0: "); printInt3(d0); printf("\n");
printf("\tda: "); printInt3(da); printf("\n");
printf("\tdb: "); printInt3(db); printf("\n");
printf("\td1: "); printInt3(d1); printf("\n");
printf("\ts1: "); printInt3(s1); printf("\n");
printf("\t-> %s to device %d\n", db.z >= da.z ? "Copy" : "Do not copy", i);
*/
if (db.z >= da.z) {
const int copy_cells = gridIdxx(subgrid, db) - gridIdxx(subgrid, da);
const int3 da_local = (int3){da.x, da.y, da.z - i * grid.n.z / num_devices}; // DECOMPOSITION OFFSET HERE
// printf("\t\tcopy %d cells from local index ", copy_cells); printInt3(da_local); printf("\n");
copyMeshToHost(devices[i], STREAM_PRIMARY, da_local, da, copy_cells, host_mesh);
}
printf("\n");
}
return AC_SUCCESS;
}
// acCopyMeshToDevice
AcResult
acLoad(const AcMesh& host_mesh)
{
return acLoadWithOffset(host_mesh, (int3){0, 0, 0}, AC_VTXBUF_SIZE(host_mesh.info));
}
// acCopyMeshToHost
AcResult
acStore(AcMesh* host_mesh)
{
return acStoreWithOffset((int3){0, 0, 0}, AC_VTXBUF_SIZE(host_mesh->info), host_mesh);
}
AcResult
acIntegrateStep(const int& isubstep, const AcReal& dt)
{
const int3 start = (int3){STENCIL_ORDER/2, STENCIL_ORDER/2, STENCIL_ORDER/2};
const int3 end = (int3){STENCIL_ORDER/2 + subgrid.n.x,
STENCIL_ORDER/2 + subgrid.n.y,
STENCIL_ORDER/2 + subgrid.n.z};
for (int i = 0; i < num_devices; ++i) {
rkStep(devices[i], STREAM_PRIMARY, isubstep, start, end, dt);
}
return AC_SUCCESS;
}
AcResult
acBoundcondStep(void)
{
acSynchronize();
if (num_devices == 1) {
boundcondStep(devices[0], STREAM_PRIMARY,
(int3){0, 0, 0}, (int3){subgrid.m.x, subgrid.m.y, subgrid.m.z});
} else {
// Local boundary conditions
for (int i = 0; i < num_devices; ++i) {
const int3 d0 = (int3){0, 0, STENCIL_ORDER/2}; // DECOMPOSITION OFFSET HERE
const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.n.z};
boundcondStep(devices[i], STREAM_PRIMARY, d0, d1);
}
/*
// ===MIIKKANOTE START==========================================
%JP: The old way for computing boundary conditions conflicts with the
way we have to do things with multiple GPUs.
The older approach relied on unified memory, which represented the whole
memory area as one huge mesh instead of several smaller ones. However, unified memory
in its current state is meant more for quick prototyping when performance is not an issue.
Getting the CUDA driver to migrate data intelligently across GPUs is much harder than
managing the memory explicitly.
In this new approach, I have simplified the multi- and single-GPU layers significantly.
Quick rundown:
New struct: Grid. There are two global variables, "grid" and "subgrid", which
contain the extents of the whole simulation domain and the decomposed grids, respectively.
To simplify things, we require that each GPU is assigned the same amount of work;
therefore each GPU in the node is assigned a "subgrid.m"-sized block of data
to work with.
The whole simulation domain is decomposed with respect to the z dimension.
For example, if the grid contains (nx, ny, nz) vertices, then the subgrids
contain (nx, ny, nz / num_devices) vertices.
A local index (i, j, k) in some subgrid can be mapped to the global grid with
global idx = (i, j, k + device_id * subgrid.n.z)
Terminology:
- Single-GPU function: a function defined on the single-GPU layer (device.cu)
Changes required to this commented code block:
- The thread block dimensions (tpb) are no longer passed to the kernel here; they are set
in device.cu instead. The same holds for any complex index calculations: the local
coordinates should be passed as an int3 type without having to consider how the data is
actually laid out in device memory
- The unified memory buffer no longer exists (d_buffer). Instead, we have an opaque handle
of type "Device" which should be passed to single-GPU functions. In this file, all devices
are stored in a global array "devices[num_devices]".
- Every single-GPU function is executed asynchronously by default such that we
can optimize Astaroth by executing memory transactions concurrently with computation.
Therefore a StreamType should be passed as a parameter to single-GPU functions.
Refresher: CUDA function calls are non-blocking when a stream is explicitly passed
as a parameter and commands executing in different streams can be processed
in parallel/concurrently.
Note on periodic boundaries (might be helpful when implementing other boundary conditions):
With multiple GPUs, periodic boundary conditions applied on indices ranging from
(0, 0, STENCIL_ORDER/2) to (subgrid.m.x, subgrid.m.y, subgrid.m.z - STENCIL_ORDER/2)
on a single device are "local", in the sense that they can be computed without having
to exchange data with neighboring GPUs. Special care is needed only for transferring
the data to the front and back plates outside this range. In the solution we use here,
we solve the local boundaries first, and then just exchange the front and back plates
in a "ring", like so
device_id
(n) <-> 0 <-> 1 <-> ... <-> n <-> (0)
// ======MIIKKANOTE END==========================================
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MIIKKANOTE: This code block was essentially
moved into device.cu, function boundCondStep()
In astaroth.cu, we use acBoundcondStep()
just to distribute the work and manage
communication between GPUs.
printf("Boundconds best dims (%d, %d, %d) %f ms\n", best_dims.x, best_dims.y, best_dims.z, double(best_time) / NUM_ITERATIONS);
exit(0);
#else
const int depth = (int)ceil(mesh_info.int_params[AC_mz]/(float)num_devices);
const int3 start = (int3){0, 0, device_id * depth};
const int3 end = (int3){mesh_info.int_params[AC_mx],
mesh_info.int_params[AC_my],
min((device_id+1) * depth, mesh_info.int_params[AC_mz])};
const dim3 tpb(8,2,8);
// TODO uses the default stream currently
if (mesh_info.int_params[AC_bc_type] == 666) { // TODO MAKE A BETTER SWITCH
wedge_boundconds(0, tpb, start, end, d_buffer);
} else {
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i)
periodic_boundconds(0, tpb, start, end, d_buffer.in[i]);
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
*/
// Exchange halos
for (int i = 0; i < num_devices; ++i) {
const int num_vertices = subgrid.m.x * subgrid.m.y * STENCIL_ORDER/2;
// ...|ooooxxx|... -> xxx|ooooooo|...
{
const int3 src = (int3) {0, 0, subgrid.n.z};
const int3 dst = (int3) {0, 0, 0};
copyMeshDeviceToDevice(devices[i], STREAM_PRIMARY, src, devices[(i+1) % num_devices], dst, num_vertices);
}
// ...|ooooooo|xxx <- ...|xxxoooo|...
{
const int3 src = (int3) {0, 0, STENCIL_ORDER/2};
const int3 dst = (int3) {0, 0, STENCIL_ORDER/2 + subgrid.n.z};
copyMeshDeviceToDevice(devices[(i+1) % num_devices], STREAM_PRIMARY, src, devices[i], dst, num_vertices);
}
}
}
acSynchronize();
return AC_SUCCESS;
}
static AcResult
acSwapBuffers(void)
{
for (int i = 0; i < num_devices; ++i) {
swapBuffers(devices[i]);
}
return AC_SUCCESS;
}
AcResult
acIntegrate(const AcReal& dt)
{
for (int isubstep = 0; isubstep < 3; ++isubstep) {
acBoundcondStep();
acIntegrateStep(isubstep, dt);
acSwapBuffers();
}
return AC_SUCCESS;
}
AcReal
acReduceScal(const ReductionType& rtype,
const VertexBufferHandle& vtxbuffer_handle)
{
// TODO
return 0;
}
AcReal
acReduceVec(const ReductionType& rtype, const VertexBufferHandle& a,
const VertexBufferHandle& b, const VertexBufferHandle& c)
{
// TODO
return 0;
}
AcResult
acSynchronize(void)
{
for (int i = 0; i < num_devices; ++i) {
synchronize(devices[i], STREAM_ALL);
}
return AC_SUCCESS;
}

309
src/core/device.cu Normal file

@@ -0,0 +1,309 @@
/*
Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
This file is part of Astaroth.
Astaroth is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Astaroth is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* @file
* \brief Brief info.
*
* Detailed info.
*
*/
#include "device.cuh"
#include "errchk.h"
typedef struct {
AcReal* in[NUM_VTXBUF_HANDLES];
AcReal* out[NUM_VTXBUF_HANDLES];
} VertexBufferArray;
__constant__ AcMeshInfo d_mesh_info;
#define DCONST_INT(X) (d_mesh_info.int_params[X])
#define DCONST_REAL(X) (d_mesh_info.real_params[X])
#define DEVICE_VTXBUF_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_mx) + (k)*DCONST_INT(AC_mxy))
#include "kernels/kernels.cuh"
struct device_s {
int id;
AcMeshInfo local_config;
// Concurrency
cudaStream_t streams[NUM_STREAM_TYPES];
// Memory
VertexBufferArray vba;
AcReal* reduce_scratchpad;
AcReal* reduce_result;
};
AcResult
printDeviceInfo(const Device device)
{
const int device_id = device->id;
cudaDeviceProp props;
cudaGetDeviceProperties(&props, device_id);
printf("--------------------------------------------------\n");
printf("Device Number: %d\n", device_id);
const size_t bus_id_max_len = 128;
char bus_id[bus_id_max_len];
cudaDeviceGetPCIBusId(bus_id, bus_id_max_len, device_id);
printf(" PCI bus ID: %s\n", bus_id);
printf(" Device name: %s\n", props.name);
printf(" Compute capability: %d.%d\n", props.major, props.minor);
// Compute
printf(" Compute\n");
printf(" Clock rate (GHz): %g\n", props.clockRate / 1e6); // KHz -> GHz
printf(" Stream processors: %d\n", props.multiProcessorCount);
printf(" SP to DP flops performance ratio: %d:1\n", props.singleToDoublePrecisionPerfRatio);
printf(" Compute mode: %d\n", (int)props.computeMode); // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g7eb25f5413a962faad0956d92bae10d0
// Memory
printf(" Global memory\n");
printf(" Memory Clock Rate (MHz): %d\n", props.memoryClockRate / (1000));
printf(" Memory Bus Width (bits): %d\n", props.memoryBusWidth);
printf(" Peak Memory Bandwidth (GiB/s): %f\n",
2 * (props.memoryClockRate * 1e3) * props.memoryBusWidth /
(8. * 1024. * 1024. * 1024.));
printf(" ECC enabled: %d\n", props.ECCEnabled);
// Memory usage
size_t free_bytes, total_bytes;
cudaMemGetInfo(&free_bytes, &total_bytes);
const size_t used_bytes = total_bytes - free_bytes;
printf(" Total global mem: %.2f GiB\n",
props.totalGlobalMem / (1024.0 * 1024 * 1024));
printf(" Gmem used (GiB): %.2f\n", used_bytes / (1024.0 * 1024 * 1024));
printf(" Gmem memory free (GiB): %.2f\n",
free_bytes / (1024.0 * 1024 * 1024));
printf(" Gmem memory total (GiB): %.2f\n",
total_bytes / (1024.0 * 1024 * 1024));
printf(" Caches\n");
printf(" Local L1 cache supported: %d\n", props.localL1CacheSupported);
printf(" Global L1 cache supported: %d\n", props.globalL1CacheSupported);
printf(" L2 size: %d KiB\n", props.l2CacheSize / (1024));
printf(" Total const mem: %ld KiB\n", props.totalConstMem / (1024));
printf(" Shared mem per block: %ld KiB\n",
props.sharedMemPerBlock / (1024));
printf(" Other\n");
printf(" Warp size: %d\n", props.warpSize);
// printf(" Single to double perf. ratio: %dx\n",
// props.singleToDoublePrecisionPerfRatio); //Not supported with older CUDA
// versions
printf(" Stream priorities supported: %d\n",
props.streamPrioritiesSupported);
printf("--------------------------------------------------\n");
return AC_SUCCESS;
}
static __global__ void dummy_kernel(void) {}
AcResult
createDevice(const int id, const AcMeshInfo device_config, Device* device_handle)
{
cudaSetDevice(id);
cudaDeviceReset();
// Create Device
struct device_s* device = (struct device_s*) malloc(sizeof(*device));
ERRCHK_ALWAYS(device);
device->id = id;
device->local_config = device_config;
// Check that the code was compiled for the proper GPU architecture
printf("Trying to run a dummy kernel. If this fails, make sure that your\n"
"device supports the CUDA architecture you are compiling for.\n"
"Running dummy kernel... ");
fflush(stdout);
dummy_kernel<<<1, 1>>>();
ERRCHK_CUDA_KERNEL_ALWAYS();
printf("Success!\n");
// Concurrency
for (int i = 0; i < NUM_STREAM_TYPES; ++i) {
cudaStreamCreate(&device->streams[i]);
}
// Memory
const size_t vba_size_bytes = AC_VTXBUF_SIZE_BYTES(device_config);
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.in[i], vba_size_bytes));
ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.out[i], vba_size_bytes));
}
ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->reduce_scratchpad,
AC_VTXBUF_COMPDOMAIN_SIZE_BYTES(device_config)));
ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->reduce_result, sizeof(AcReal)));
// Device constants
ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_mesh_info, &device_config, sizeof(device_config), 0,
cudaMemcpyHostToDevice));
printf("Created device %d (%p)\n", device->id, device);
*device_handle = device;
return AC_SUCCESS;
}
AcResult
destroyDevice(Device device)
{
cudaSetDevice(device->id);
printf("Destroying device %d (%p)\n", device->id, device);
// Memory
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
cudaFree(device->vba.in[i]);
cudaFree(device->vba.out[i]);
}
cudaFree(device->reduce_scratchpad);
cudaFree(device->reduce_result);
// Concurrency
for (int i = 0; i < NUM_STREAM_TYPES; ++i)
cudaStreamDestroy(device->streams[i]);
// Destroy Device
free(device);
return AC_SUCCESS;
}
AcResult
boundcondStep(const Device device, const StreamType stream_type, const int3& start, const int3& end)
{
cudaSetDevice(device->id);
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
periodic_boundconds(device->streams[stream_type], start, end, device->vba.in[i]);
}
return AC_SUCCESS;
}
AcResult
reduceScal(const Device device)
{
cudaSetDevice(device->id);
return AC_SUCCESS;
}
AcResult
reduceVec(const Device device)
{
cudaSetDevice(device->id);
return AC_SUCCESS;
}
AcResult
rkStep(const Device device, const StreamType stream_type, const int step_number,
const int3& start, const int3& end, const AcReal dt)
{
cudaSetDevice(device->id);
rk3_step_async(device->streams[stream_type], step_number, start, end, dt, &device->vba);
return AC_SUCCESS;
}
AcResult
synchronize(const Device device, const StreamType stream_type)
{
cudaSetDevice(device->id);
if (stream_type == STREAM_ALL) {
cudaDeviceSynchronize();
} else {
cudaStreamSynchronize(device->streams[stream_type]);
}
return AC_SUCCESS;
}
static AcResult
loadWithOffset(const Device device, const StreamType stream_type,
const AcReal* src, const size_t bytes, AcReal* dst)
{
cudaSetDevice(device->id);
ERRCHK_CUDA(cudaMemcpyAsync(dst, src, bytes, cudaMemcpyHostToDevice,
device->streams[stream_type]));
return AC_SUCCESS;
}
static AcResult
storeWithOffset(const Device device, const StreamType stream_type,
const AcReal* src, const size_t bytes, AcReal* dst)
{
cudaSetDevice(device->id);
ERRCHK_CUDA(cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDeviceToHost,
device->streams[stream_type]));
return AC_SUCCESS;
}
AcResult
copyMeshToDevice(const Device device, const StreamType stream_type,
const AcMesh& host_mesh, const int3& src, const int3& dst,
const int num_vertices)
{
const size_t src_idx = AC_VTXBUF_IDX(src.x, src.y, src.z, host_mesh.info);
const size_t dst_idx = AC_VTXBUF_IDX(dst.x, dst.y, dst.z, device->local_config);
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
loadWithOffset(device, stream_type, &host_mesh.vertex_buffer[i][src_idx], num_vertices * sizeof(AcReal),
&device->vba.in[i][dst_idx]);
}
return AC_SUCCESS;
}
AcResult
copyMeshToHost(const Device device, const StreamType stream_type,
const int3& src, const int3& dst, const int num_vertices,
AcMesh* host_mesh)
{
const size_t src_idx = AC_VTXBUF_IDX(src.x, src.y, src.z, device->local_config);
const size_t dst_idx = AC_VTXBUF_IDX(dst.x, dst.y, dst.z, host_mesh->info);
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
storeWithOffset(device, stream_type, &device->vba.in[i][src_idx],
num_vertices * sizeof(AcReal),
&host_mesh->vertex_buffer[i][dst_idx]);
}
return AC_SUCCESS;
}
AcResult
copyMeshDeviceToDevice(const Device src_device, const StreamType stream_type,
const int3& src, Device dst_device, const int3& dst,
const int num_vertices)
{
cudaSetDevice(src_device->id);
const size_t src_idx = AC_VTXBUF_IDX(src.x, src.y, src.z, src_device->local_config);
const size_t dst_idx = AC_VTXBUF_IDX(dst.x, dst.y, dst.z, dst_device->local_config);
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
ERRCHK_CUDA(cudaMemcpyPeerAsync(&dst_device->vba.in[i][dst_idx], dst_device->id,
&src_device->vba.in[i][src_idx], src_device->id,
sizeof(src_device->vba.in[i][0]) * num_vertices,
src_device->streams[stream_type]));
}
return AC_SUCCESS;
}
AcResult
swapBuffers(const Device device)
{
cudaSetDevice(device->id);
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
AcReal* tmp = device->vba.in[i];
device->vba.in[i] = device->vba.out[i];
device->vba.out[i] = tmp;
}
return AC_SUCCESS;
}

82
src/core/device.cuh Normal file

@@ -0,0 +1,82 @@
/*
Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
This file is part of Astaroth.
Astaroth is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Astaroth is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* @file
* \brief Brief info.
*
* Detailed info.
*
*/
#pragma once
#include "astaroth.h"
typedef enum {
STREAM_PRIMARY,
STREAM_SECONDARY,
NUM_STREAM_TYPES,
STREAM_ALL
} StreamType;
typedef struct device_s* Device; // Opaque pointer to device_s. Analogous to dispatchable handles
// in Vulkan, f.ex. VkDevice
/** */
AcResult printDeviceInfo(const Device device);
/** */
AcResult createDevice(const int id, const AcMeshInfo device_config, Device* device);
/** */
AcResult destroyDevice(Device device);
/** */
AcResult boundcondStep(const Device device, const StreamType stream_type,
const int3& start, const int3& end);
/** */
AcResult reduceScal(const Device device);
/** */
AcResult reduceVec(const Device device);
/** */
AcResult rkStep(const Device device, const StreamType stream_type, const int step_number,
const int3& start, const int3& end, const AcReal dt);
/** Synchronizes the device with respect to stream_type. If STREAM_ALL is given as
a StreamType, the function synchronizes all streams on the device. */
AcResult synchronize(const Device device, const StreamType stream_type);
/** */
AcResult copyMeshToDevice(const Device device, const StreamType stream_type,
const AcMesh& host_mesh, const int3& src, const int3& dst,
const int num_vertices);
/** */
AcResult copyMeshToHost(const Device device, const StreamType stream_type,
const int3& src, const int3& dst, const int num_vertices,
AcMesh* host_mesh);
/** */
AcResult copyMeshDeviceToDevice(const Device src, const StreamType stream_type, const int3& src_idx,
Device dst, const int3& dst_idx, const int num_vertices);
/** Swaps the input/output buffers used in computations */
AcResult swapBuffers(const Device device);
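/*
A minimal usage sketch (illustrative only; the actual orchestration lives in
astaroth.cu). Here "cfg", "start", "end" and "dt" are hypothetical placeholders:

    Device device;
    createDevice(0, cfg, &device);
    boundcondStep(device, STREAM_PRIMARY, (int3){0, 0, 0},
                  (int3){cfg.int_params[AC_mx], cfg.int_params[AC_my], cfg.int_params[AC_mz]});
    rkStep(device, STREAM_PRIMARY, 0, start, end, dt);
    synchronize(device, STREAM_ALL);
    swapBuffers(device);
    destroyDevice(device);
*/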

112
src/core/errchk.h Normal file

@@ -0,0 +1,112 @@
/*
Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
This file is part of Astaroth.
Astaroth is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Astaroth is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* @file
* \brief Brief info.
*
* Detailed info.
*
*/
#pragma once
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
// clang-format off
/*
* =============================================================================
* General error checking
* =============================================================================
*/
#define ERROR(str) \
{ \
time_t t; time(&t); \
fprintf(stderr, "%s", ctime(&t)); \
fprintf(stderr, "\tError in file %s line %d: %s\n", \
__FILE__, __LINE__, str); \
fflush(stderr); \
exit(EXIT_FAILURE); \
abort(); \
}
#define WARNING(str) \
{ \
time_t t; time(&t); \
fprintf(stderr, "%s", ctime(&t)); \
fprintf(stderr, "\tWarning in file %s line %d: %s\n", \
__FILE__, __LINE__, str); \
fflush(stderr); \
}
// DO NOT REMOVE BRACKETS AROUND RETVAL. F.ex. if (!a < b) vs if (!(a < b)).
#define ERRCHK(retval) { if (!(retval)) ERROR(#retval " was false"); }
#define WARNCHK(retval) { if (!(retval)) WARNING(#retval " was false"); }
#define ERRCHK_ALWAYS(retval) { if (!(retval)) ERROR(#retval " was false"); }
/*
* =============================================================================
* CUDA-specific error checking
* =============================================================================
*/
#ifdef __CUDACC__
static inline void
cuda_assert(cudaError_t code, const char* file, int line, bool abort = true)
{
if (code != cudaSuccess) {
time_t t; time(&t);
fprintf(stderr, "%s", ctime(&t));
fprintf(stderr, "\tCUDA error in file %s line %d: %s\n",
file, line, cudaGetErrorString(code));
fflush(stderr);
if (abort)
exit(code);
}
}
#ifdef NDEBUG
#undef ERRCHK
#undef WARNCHK
#define ERRCHK(params)
#define WARNCHK(params)
#define ERRCHK_CUDA(params) params;
#define WARNCHK_CUDA(params) params;
#define ERRCHK_CUDA_KERNEL() {}
#else
#define ERRCHK_CUDA(params) { cuda_assert((params), __FILE__, __LINE__); }
#define WARNCHK_CUDA(params) { cuda_assert((params), __FILE__, __LINE__, false); }
#define ERRCHK_CUDA_KERNEL() \
{ \
ERRCHK_CUDA(cudaPeekAtLastError()); \
ERRCHK_CUDA(cudaDeviceSynchronize()); \
}
#endif
#endif
#define ERRCHK_CUDA_ALWAYS(params) { cuda_assert((params), __FILE__, __LINE__); }
#define ERRCHK_CUDA_KERNEL_ALWAYS() \
{ \
ERRCHK_CUDA_ALWAYS(cudaPeekAtLastError()); \
ERRCHK_CUDA_ALWAYS(cudaDeviceSynchronize()); \
}
// clang-format on
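// A brief usage sketch (illustrative; "ptr" and "bytes" are placeholders):
//   ERRCHK_ALWAYS(config.int_params[AC_nz] % num_devices == 0);
//   ERRCHK_CUDA_ALWAYS(cudaMalloc(&ptr, bytes));
//   my_kernel<<<bpg, tpb>>>(...); ERRCHK_CUDA_KERNEL();
// The *_ALWAYS variants stay active even when NDEBUG is defined, whereas the plain
// ERRCHK/ERRCHK_CUDA variants are compiled out in NDEBUG builds (see above).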

2
src/core/kernels/.gitignore vendored Normal file

@@ -0,0 +1,2 @@
# Ignore the generated headers
stencil_process.cuh stencil_assembly.cuh

File diff suppressed because it is too large


@@ -0,0 +1,794 @@
/*
Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
This file is part of Astaroth.
Astaroth is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Astaroth is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* @file
* \brief Brief info.
*
* Detailed info.
*
*/
#pragma once
__global__ void
kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vertex_buffer)
{
const int i_dst = start.x + threadIdx.x + blockIdx.x * blockDim.x;
const int j_dst = start.y + threadIdx.y + blockIdx.y * blockDim.y;
const int k_dst = start.z + threadIdx.z + blockIdx.z * blockDim.z;
// If within the start-end range (this allows threadblock dims that are not
// divisible by end - start)
if (i_dst >= end.x || j_dst >= end.y || k_dst >= end.z)
return;
//if (i_dst >= DCONST_INT(AC_mx) || j_dst >= DCONST_INT(AC_my) || k_dst >= DCONST_INT(AC_mz))
// return;
// If destination index is inside the computational domain, return since
// the boundary conditions are only applied to the ghost zones
if (i_dst >= DCONST_INT(AC_nx_min) && i_dst < DCONST_INT(AC_nx_max) &&
j_dst >= DCONST_INT(AC_ny_min) && j_dst < DCONST_INT(AC_ny_max) &&
k_dst >= DCONST_INT(AC_nz_min) && k_dst < DCONST_INT(AC_nz_max))
return;
// Find the source index
// Map to nx, ny, nz coordinates
int i_src = i_dst - DCONST_INT(AC_nx_min);
int j_src = j_dst - DCONST_INT(AC_ny_min);
int k_src = k_dst - DCONST_INT(AC_nz_min);
// Translate (s.t. the index is always positive)
i_src += DCONST_INT(AC_nx);
j_src += DCONST_INT(AC_ny);
k_src += DCONST_INT(AC_nz);
// Wrap
i_src %= DCONST_INT(AC_nx);
j_src %= DCONST_INT(AC_ny);
k_src %= DCONST_INT(AC_nz);
// Map to mx, my, mz coordinates
i_src += DCONST_INT(AC_nx_min);
j_src += DCONST_INT(AC_ny_min);
k_src += DCONST_INT(AC_nz_min);
const int src_idx = DEVICE_VTXBUF_IDX(i_src, j_src, k_src);
const int dst_idx = DEVICE_VTXBUF_IDX(i_dst, j_dst, k_dst);
vertex_buffer[dst_idx] = vertex_buffer[src_idx];
}
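/*
A worked example with hypothetical numbers: with AC_nx = 128 and STENCIL_ORDER = 6
(so AC_nx_min = 3, AC_nx_max = 131, AC_mx = 134), a ghost cell at i_dst = 1 maps to
i_src = ((1 - 3) + 128) % 128 + 3 = 129, i.e. the periodic image near the opposite
edge of the computational domain.
*/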
void
periodic_boundconds(const cudaStream_t stream, const int3& start, const int3& end, AcReal* vertex_buffer)
{
const dim3 tpb(8,2,8);
const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x),
(unsigned int)ceil((end.y - start.y) / (float)tpb.y),
(unsigned int)ceil((end.z - start.z) / (float)tpb.z));
kernel_periodic_boundconds<<<bpg, tpb, 0, stream>>>(start, end, vertex_buffer);
ERRCHK_CUDA_KERNEL();
}
///////////////////////////////////////////////////////////////////////////////////////////////////
#include <assert.h>
static __device__ __forceinline__ int
IDX(const int i)
{
return i;
}
static __device__ __forceinline__ int
IDX(const int i, const int j, const int k)
{
return DEVICE_VTXBUF_IDX(i, j, k);
}
static __device__ __forceinline__ int
IDX(const int3 idx)
{
return DEVICE_VTXBUF_IDX(idx.x, idx.y, idx.z);
}
static __forceinline__ AcMatrix
create_rotz(const AcReal radians)
{
AcMatrix mat;
mat.row[0] = (AcReal3){cos(radians), -sin(radians), 0};
mat.row[1] = (AcReal3){sin(radians), cos(radians), 0};
mat.row[2] = (AcReal3){0, 0, 0};
return mat;
}
#if AC_DOUBLE_PRECISION == 0
#define sin __sinf
#define cos __cosf
#define exp __expf
#define rsqrt rsqrtf // hardware reciprocal sqrt
#endif // AC_DOUBLE_PRECISION == 0
/*
typedef struct {
int i, j, k;
} int3;*/
/*
* =============================================================================
* Level 0 (Input Assembly Stage)
* =============================================================================
*/
/*
* =============================================================================
* Level 0.1 (Read stencil elements and solve derivatives)
* =============================================================================
*/
static __device__ __forceinline__ AcReal
first_derivative(const AcReal* __restrict__ pencil, const AcReal inv_ds)
{
#if STENCIL_ORDER == 2
const AcReal coefficients[] = {0, 1.0 / 2.0};
#elif STENCIL_ORDER == 4
const AcReal coefficients[] = {0, 2.0 / 3.0, -1.0 / 12.0};
#elif STENCIL_ORDER == 6
const AcReal coefficients[] = {0, 3.0 / 4.0, -3.0 / 20.0, 1.0 / 60.0};
#elif STENCIL_ORDER == 8
const AcReal coefficients[] = {0, 4.0 / 5.0, -1.0 / 5.0, 4.0 / 105.0,
-1.0 / 280.0};
#endif
#define MID (STENCIL_ORDER / 2)
AcReal res = 0;
#pragma unroll
for (int i = 1; i <= MID; ++i)
res += coefficients[i] * (pencil[MID + i] - pencil[MID - i]);
return res * inv_ds;
}
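/*
For reference, with STENCIL_ORDER == 6 the loop above evaluates the standard
6th-order central difference
    f' ~= (3/4 * (f[+1] - f[-1]) - 3/20 * (f[+2] - f[-2]) + 1/60 * (f[+3] - f[-3])) * inv_ds,
where f[i] denotes pencil[MID + i].
*/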
static __device__ __forceinline__ AcReal
second_derivative(const AcReal* __restrict__ pencil, const AcReal inv_ds)
{
#if STENCIL_ORDER == 2
const AcReal coefficients[] = {-2., 1.};
#elif STENCIL_ORDER == 4
const AcReal coefficients[] = {-5.0/2.0, 4.0/3.0, -1.0/12.0};
#elif STENCIL_ORDER == 6
const AcReal coefficients[] = {-49.0 / 18.0, 3.0 / 2.0, -3.0 / 20.0,
1.0 / 90.0};
#elif STENCIL_ORDER == 8
const AcReal coefficients[] = {-205.0 / 72.0, 8.0 / 5.0, -1.0 / 5.0,
8.0 / 315.0, -1.0 / 560.0};
#endif
#define MID (STENCIL_ORDER / 2)
AcReal res = coefficients[0] * pencil[MID];
#pragma unroll
for (int i = 1; i <= MID; ++i)
res += coefficients[i] * (pencil[MID + i] + pencil[MID - i]);
return res * inv_ds * inv_ds;
}
/** inv_ds: inverted mesh spacing f.ex. 1. / mesh.int_params[AC_dsx] */
static __device__ __forceinline__ AcReal
cross_derivative(const AcReal* __restrict__ pencil_a,
const AcReal* __restrict__ pencil_b, const AcReal inv_ds_a,
const AcReal inv_ds_b)
{
#if STENCIL_ORDER == 2
const AcReal coefficients[] = {0, 1.0 / 4.0};
#elif STENCIL_ORDER == 4
const AcReal coefficients[] = {0, 1.0 / 32.0, 1.0 / 64.0}; // TODO correct coefficients, these are just placeholders
#elif STENCIL_ORDER == 6
const AcReal fac = (1. / 720.);
const AcReal coefficients[] = {0.0 * fac, 270.0 * fac, -27.0 * fac,
2.0 * fac};
#elif STENCIL_ORDER == 8
const AcReal fac = (1. / 20160.);
const AcReal coefficients[] = {0.0 * fac, 8064. * fac, -1008. * fac,
128. * fac, -9. * fac};
#endif
#define MID (STENCIL_ORDER / 2)
AcReal res = AcReal(0.);
#pragma unroll
for (int i = 1; i <= MID; ++i) {
res += coefficients[i] * (pencil_a[MID + i] + pencil_a[MID - i] -
pencil_b[MID + i] - pencil_b[MID - i]);
}
return res * inv_ds_a * inv_ds_b;
}
static __device__ __forceinline__ AcReal
derx(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
AcReal pencil[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y, vertexIdx.z)];
return first_derivative(pencil, DCONST_REAL(AC_inv_dsx));
}
static __device__ __forceinline__ AcReal
derxx(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
AcReal pencil[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y, vertexIdx.z)];
return second_derivative(pencil, DCONST_REAL(AC_inv_dsx));
}
static __device__ __forceinline__ AcReal
derxy(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
AcReal pencil_a[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil_a[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2,
vertexIdx.y + offset - STENCIL_ORDER / 2, vertexIdx.z)];
AcReal pencil_b[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil_b[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2,
vertexIdx.y + STENCIL_ORDER / 2 - offset, vertexIdx.z)];
return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsx),
DCONST_REAL(AC_inv_dsy));
}
static __device__ __forceinline__ AcReal
derxz(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
AcReal pencil_a[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil_a[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y,
vertexIdx.z + offset - STENCIL_ORDER / 2)];
AcReal pencil_b[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil_b[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y,
vertexIdx.z + STENCIL_ORDER / 2 - offset)];
return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsx),
DCONST_REAL(AC_inv_dsz));
}
static __device__ __forceinline__ AcReal
dery(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
AcReal pencil[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2, vertexIdx.z)];
return first_derivative(pencil, DCONST_REAL(AC_inv_dsy));
}
static __device__ __forceinline__ AcReal
deryy(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
AcReal pencil[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2, vertexIdx.z)];
return second_derivative(pencil, DCONST_REAL(AC_inv_dsy));
}
static __device__ __forceinline__ AcReal
deryz(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
AcReal pencil_a[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil_a[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2,
vertexIdx.z + offset - STENCIL_ORDER / 2)];
AcReal pencil_b[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil_b[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2,
vertexIdx.z + STENCIL_ORDER / 2 - offset)];
return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsy),
DCONST_REAL(AC_inv_dsz));
}
static __device__ __forceinline__ AcReal
derz(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
AcReal pencil[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z + offset - STENCIL_ORDER / 2)];
return first_derivative(pencil, DCONST_REAL(AC_inv_dsz));
}
static __device__ __forceinline__ AcReal
derzz(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
AcReal pencil[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z + offset - STENCIL_ORDER / 2)];
return second_derivative(pencil, DCONST_REAL(AC_inv_dsz));
}
/*
* =============================================================================
* Level 0.2 (Caching functions)
* =============================================================================
*/
#include "stencil_assembly.cuh"
/*
typedef struct {
AcRealData x;
AcRealData y;
AcRealData z;
} AcReal3Data;
static __device__ __forceinline__ AcReal3Data
read_data(const int i, const int j, const int k,
AcReal* __restrict__ buf[], const int3& handle)
{
AcReal3Data data;
data.x = read_data(i, j, k, buf, handle.x);
data.y = read_data(i, j, k, buf, handle.y);
data.z = read_data(i, j, k, buf, handle.z);
return data;
}
*/
/*
* =============================================================================
* Level 0.3 (Built-in functions available during the Stencil Processing Stage)
* =============================================================================
*/
static __host__ __device__ __forceinline__ AcReal3
operator-(const AcReal3& a, const AcReal3& b)
{
return (AcReal3){a.x - b.x, a.y - b.y, a.z - b.z};
}
static __host__ __device__ __forceinline__ AcReal3
operator+(const AcReal3& a, const AcReal3& b)
{
return (AcReal3){a.x + b.x, a.y + b.y, a.z + b.z};
}
static __host__ __device__ __forceinline__ AcReal3
operator-(const AcReal3& a)
{
return (AcReal3){-a.x, -a.y, -a.z};
}
static __host__ __device__ __forceinline__ AcReal3
operator*(const AcReal a, const AcReal3& b)
{
return (AcReal3){a * b.x, a * b.y, a * b.z};
}
static __host__ __device__ __forceinline__ AcReal
dot(const AcReal3& a, const AcReal3& b)
{
return a.x * b.x + a.y * b.y + a.z * b.z;
}
static __host__ __device__ __forceinline__ AcReal3
mul(const AcMatrix& aa, const AcReal3& x)
{
return (AcReal3){dot(aa.row[0], x), dot(aa.row[1], x), dot(aa.row[2], x)};
}
static __host__ __device__ __forceinline__ AcReal3
cross(const AcReal3& a, const AcReal3& b)
{
AcReal3 c;
c.x = a.y * b.z - a.z * b.y;
c.y = a.z * b.x - a.x * b.z;
c.z = a.x * b.y - a.y * b.x;
return c;
}
static __host__ __device__ __forceinline__ bool
is_valid(const AcReal a)
{
return !isnan(a) && !isinf(a);
}
static __host__ __device__ __forceinline__ bool
is_valid(const AcReal3& a)
{
return is_valid(a.x) && is_valid(a.y) && is_valid(a.z);
}
/*
* =============================================================================
* Level 1 (Stencil Processing Stage)
* =============================================================================
*/
/*
* =============================================================================
* Level 1.1 (Terms)
* =============================================================================
*/
static __device__ __forceinline__ AcReal
laplace(const AcRealData& data)
{
return hessian(data).row[0].x + hessian(data).row[1].y + hessian(data).row[2].z;
}
static __device__ __forceinline__ AcReal
divergence(const AcReal3Data& vec)
{
return gradient(vec.x).x + gradient(vec.y).y + gradient(vec.z).z;
}
static __device__ __forceinline__ AcReal3
laplace_vec(const AcReal3Data& vec)
{
return (AcReal3){laplace(vec.x), laplace(vec.y), laplace(vec.z)};
}
static __device__ __forceinline__ AcReal3
curl(const AcReal3Data& vec)
{
return (AcReal3){gradient(vec.z).y - gradient(vec.y).z,
gradient(vec.x).z - gradient(vec.z).x,
gradient(vec.y).x - gradient(vec.x).y};
}
static __device__ __forceinline__ AcReal3
gradient_of_divergence(const AcReal3Data& vec)
{
return (AcReal3){hessian(vec.x).row[0].x + hessian(vec.y).row[0].y + hessian(vec.z).row[0].z,
hessian(vec.x).row[1].x + hessian(vec.y).row[1].y + hessian(vec.z).row[1].z,
hessian(vec.x).row[2].x + hessian(vec.y).row[2].y + hessian(vec.z).row[2].z};
}
// Takes uu gradients and returns S
static __device__ __forceinline__ AcMatrix
stress_tensor(const AcReal3Data& vec)
{
AcMatrix S;
S.row[0].x = AcReal(2. / 3.) * gradient(vec.x).x -
AcReal(1. / 3.) * (gradient(vec.y).y + gradient(vec.z).z);
S.row[0].y = AcReal(1. / 2.) * (gradient(vec.x).y + gradient(vec.y).x);
S.row[0].z = AcReal(1. / 2.) * (gradient(vec.x).z + gradient(vec.z).x);
S.row[1].y = AcReal(2. / 3.) * gradient(vec.y).y -
AcReal(1. / 3.) * (gradient(vec.x).x + gradient(vec.z).z);
S.row[1].z = AcReal(1. / 2.) * (gradient(vec.y).z + gradient(vec.z).y);
S.row[2].z = AcReal(2. / 3.) * gradient(vec.z).z -
AcReal(1. / 3.) * (gradient(vec.x).x + gradient(vec.y).y);
S.row[1].x = S.row[0].y;
S.row[2].x = S.row[0].z;
S.row[2].y = S.row[1].z;
return S;
}
static __device__ __forceinline__ AcReal
contract(const AcMatrix& mat)
{
AcReal res = 0;
#pragma unroll
for (int i = 0; i < 3; ++i)
res += dot(mat.row[i], mat.row[i]);
return res;
}
/*
* =============================================================================
* Level 1.2 (Equations)
* =============================================================================
*/
static __device__ __forceinline__ AcReal
length(const AcReal3& vec)
{
return sqrt(vec.x * vec.x + vec.y * vec.y + vec.z * vec.z);
}
static __device__ __forceinline__ AcReal
reciprocal_len(const AcReal3& vec)
{
return rsqrt(vec.x * vec.x + vec.y * vec.y + vec.z * vec.z);
}
static __device__ __forceinline__ AcReal3
normalized(const AcReal3& vec)
{
const AcReal inv_len = reciprocal_len(vec);
return inv_len * vec;
}
// Sinusoidal forcing
// https://arxiv.org/pdf/1704.04676.pdf
__constant__ AcReal3 forcing_vec;
__constant__ AcReal forcing_phi;
static __device__ __forceinline__ AcReal3
forcing(const int i, const int j, const int k)
{
#define DOMAIN_SIZE_X (DCONST_INT(AC_nx) * DCONST_REAL(AC_dsx))
#define DOMAIN_SIZE_Y (DCONST_INT(AC_ny) * DCONST_REAL(AC_dsy))
#define DOMAIN_SIZE_Z (DCONST_INT(AC_nz) * DCONST_REAL(AC_dsz))
const AcReal3 k_vec = (AcReal3){(i - DCONST_INT(AC_nx_min)) * DCONST_REAL(AC_dsx) - AcReal(.5) * DOMAIN_SIZE_X,
(j - DCONST_INT(AC_ny_min)) * DCONST_REAL(AC_dsy) - AcReal(.5) * DOMAIN_SIZE_Y,
(k - DCONST_INT(AC_nz_min)) * DCONST_REAL(AC_dsz) - AcReal(.5) * DOMAIN_SIZE_Z};
AcReal inv_len = reciprocal_len(k_vec);
if (isnan(inv_len) || isinf(inv_len))
inv_len = 0;
if (inv_len > 2) // hack to make it cool
inv_len = 2;
const AcReal k_dot_x = dot(k_vec, forcing_vec);
const AcReal waves = cos(k_dot_x)*cos(forcing_phi) - sin(k_dot_x) * sin(forcing_phi);
return inv_len * inv_len * waves * forcing_vec;
}
// Note: LNT0 and LNRHO0 must be set very carefully: if their magnitude differs from other values in the mesh, then we will inherently lose precision
#define LNT0 (AcReal(0.0))
#define LNRHO0 (AcReal(0.0))
#define H_CONST (AcReal(0.0))
#define C_CONST (AcReal(0.0))
template <int step_number>
static __device__ __forceinline__ AcReal
rk3_integrate(const AcReal state_previous, const AcReal state_current,
const AcReal rate_of_change, const AcReal dt)
{
// Williamson (1980)
const AcReal alpha[] = {0, AcReal(.0), AcReal(-5. / 9.), AcReal(-153. / 128.)};
const AcReal beta[] = {0, AcReal(1. / 3.), AcReal(15. / 16.),
AcReal(8. / 15.)};
// Note the indexing: +1 to avoid an unnecessary "out-of-bounds" compiler warning
// (the step_number == 0 instantiation would otherwise reference beta[-1] in its
// unreachable branches)
switch (step_number) {
case 0:
return state_current + beta[step_number + 1] * rate_of_change * dt;
case 1: // Fallthrough
case 2:
return state_current +
beta[step_number + 1] *
(alpha[step_number + 1] * (AcReal(1.) / beta[step_number]) *
(state_current - state_previous) +
rate_of_change * dt);
default:
return NAN;
}
}
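/*
Sketch of the reasoning (Williamson's 2N-storage scheme): the textbook form is
    w_i = alpha_i * w_{i-1} + dt * f(u_{i-1}),    u_i = u_{i-1} + beta_i * w_i.
Since u_{i-1} - u_{i-2} = beta_{i-1} * w_{i-1}, the stored increment can be
recovered as w_{i-1} = (state_current - state_previous) / beta[step_number],
which is exactly the (1 / beta[step_number]) factor above; no separate w buffer
is needed.
*/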
/*
template <int step_number>
static __device__ __forceinline__ AcReal
rk3_integrate_scal(const AcReal state_previous, const AcReal state_current,
const AcReal rate_of_change, const AcReal dt)
{
// Williamson (1980)
const AcReal alpha[] = {AcReal(.0), AcReal(-5. / 9.), AcReal(-153. / 128.)};
const AcReal beta[] = {AcReal(1. / 3.), AcReal(15. / 16.),
AcReal(8. / 15.)};
switch (step_number) {
case 0:
return state_current + beta[step_number] * rate_of_change * dt;
case 1: // Fallthrough
case 2:
return state_current +
beta[step_number] *
(alpha[step_number] * (AcReal(1.) / beta[step_number - 1]) *
(state_current - state_previous) +
rate_of_change * dt);
default:
return NAN;
}
}
*/
template <int step_number>
static __device__ __forceinline__ AcReal3
rk3_integrate(const AcReal3 state_previous, const AcReal3 state_current,
const AcReal3 rate_of_change, const AcReal dt)
{
return (AcReal3) { rk3_integrate<step_number>(state_previous.x, state_current.x, rate_of_change.x, dt),
rk3_integrate<step_number>(state_previous.y, state_current.y, rate_of_change.y, dt),
rk3_integrate<step_number>(state_previous.z, state_current.z, rate_of_change.z, dt)};
}
#define rk3(state_previous, state_current, rate_of_change, dt)\
rk3_integrate<step_number>(state_previous, value(state_current), rate_of_change, dt)
/*
template <int step_number>
static __device__ __forceinline__ AcReal
rk3_integrate(const int idx, const AcReal out, const int handle,
const AcRealData& in_cached, const AcReal rate_of_change, const AcReal dt)
{
return rk3_integrate_scal<step_number>(out, value(in_cached), rate_of_change, dt);
}
template <int step_number>
static __device__ __forceinline__ AcReal3
rk3_integrate(const int idx, const AcReal3 out, const int3& handle,
const AcReal3Data& in_cached, const AcReal3& rate_of_change, const AcReal dt)
{
return (AcReal3) {
rk3_integrate<step_number>(idx, out, handle.x, in_cached.x, rate_of_change.x, dt),
rk3_integrate<step_number>(idx, out, handle.y, in_cached.y, rate_of_change.y, dt),
rk3_integrate<step_number>(idx, out, handle.z, in_cached.z, rate_of_change.z, dt)
};
}
#define RK3(handle, in_cached, rate_of_change, dt) \
rk3_integrate<step_number>(idx, buffer.out, handle, in_cached, rate_of_change, dt)
*/
/*
* =============================================================================
* Level 1.3 (Kernels)
* =============================================================================
*/
static __device__ void
write(AcReal* __restrict__ out[], const int handle, const int idx, const AcReal value)
{
out[handle][idx] = value;
}
static __device__ void
write(AcReal* __restrict__ out[], const int3 vec, const int idx, const AcReal3 value)
{
write(out, vec.x, idx, value.x);
write(out, vec.y, idx, value.y);
write(out, vec.z, idx, value.z);
}
static __device__ AcReal
read_out(const int idx, AcReal* __restrict__ field[], const int handle)
{
return field[handle][idx];
}
static __device__ AcReal3
read_out(const int idx, AcReal* __restrict__ field[], const int3 handle)
{
return (AcReal3) { read_out(idx, field, handle.x),
read_out(idx, field, handle.y),
read_out(idx, field, handle.z) };
}
#define WRITE_OUT(handle, value) (write(buffer.out, handle, idx, value))
#define READ(handle) (read_data(vertexIdx, buffer.in, handle))
#define READ_OUT(handle) (read_out(idx, buffer.out, handle))
// also write for clarity here, not for the DSL
//#define WRITE(HANDLE) (write(idx, ...)) s.t. we don't have to insert boilerplate in the middle of the function
#define GEN_KERNEL_PARAM_BOILERPLATE \
const int3 start, const int3 end, VertexBufferArray buffer
#define GEN_KERNEL_BUILTIN_VARIABLES_BOILERPLATE() \
const int3 vertexIdx = (int3){threadIdx.x + blockIdx.x * blockDim.x + start.x,\
threadIdx.y + blockIdx.y * blockDim.y + start.y,\
threadIdx.z + blockIdx.z * blockDim.z + start.z};\
if (vertexIdx.x >= end.x || vertexIdx.y >= end.y || vertexIdx.z >= end.z)\
return;\
\
\
assert(vertexIdx.x < DCONST_INT(AC_nx_max) && vertexIdx.y < DCONST_INT(AC_ny_max) &&\
vertexIdx.z < DCONST_INT(AC_nz_max));\
\
assert(vertexIdx.x >= DCONST_INT(AC_nx_min) && vertexIdx.y >= DCONST_INT(AC_ny_min) &&\
vertexIdx.z >= DCONST_INT(AC_nz_min));\
\
const int idx = IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z);
#include "stencil_process.cuh"
/*
* =============================================================================
* Level 2 (Host calls)
* =============================================================================
*/
static AcReal
randf(void)
{
return AcReal(rand()) / AcReal(RAND_MAX);
}
AcResult
rk3_step_async(const cudaStream_t stream, const int& step_number, const int3& start, const int3& end,
const AcReal dt, VertexBufferArray* buffer)
{
const dim3 tpb(32, 1, 4);
/////////////////// Forcing
#if LFORCING
const AcReal ff_scale = AcReal(.2);
static AcReal3 ff = ff_scale * (AcReal3){1, 0, 0};
const AcReal radians = randf() * AcReal(2*M_PI) / 360 / 8;
const AcMatrix rotz = create_rotz(radians);
ff = mul(rotz, ff);
cudaMemcpyToSymbolAsync(forcing_vec, &ff, sizeof(ff), 0, cudaMemcpyHostToDevice, stream);
const AcReal ff_phi = AcReal(M_PI);//AcReal(2 * M_PI) * randf();
cudaMemcpyToSymbolAsync(forcing_phi, &ff_phi, sizeof(ff_phi), 0, cudaMemcpyHostToDevice, stream);
#endif // LFORCING
//////////////////////////
const int nx = end.x - start.x;
const int ny = end.y - start.y;
const int nz = end.z - start.z;
const dim3 bpg(
(unsigned int)ceil(nx / AcReal(tpb.x)),
(unsigned int)ceil(ny / AcReal(tpb.y)),
(unsigned int)ceil(nz / AcReal(tpb.z)));
if (step_number == 0)
solve<0><<<bpg, tpb, 0, stream>>>(start, end, *buffer, dt);
else if (step_number == 1)
solve<1><<<bpg, tpb, 0, stream>>>(start, end, *buffer, dt);
else
solve<2><<<bpg, tpb, 0, stream>>>(start, end, *buffer, dt);
ERRCHK_CUDA_KERNEL();
return AC_SUCCESS;
}

338
src/core/kernels/reduce.cuh Normal file

@@ -0,0 +1,338 @@
/*
Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
This file is part of Astaroth.
Astaroth is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Astaroth is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* @file
* \brief Brief info.
*
* Detailed info.
*
*/
#pragma once
#include "device_globals.cuh"
#include "src/core/errchk.h"
#include "src/core/math_utils.h"
// Function pointer definitions
typedef AcReal (*ReduceFunc)(const AcReal&, const AcReal&);
typedef AcReal (*ReduceInitialScalFunc)(const AcReal&);
typedef AcReal (*ReduceInitialVecFunc)(const AcReal&, const AcReal&,
const AcReal&);
// clang-format off
/* Comparison funcs */
__device__ inline AcReal
_device_max(const AcReal& a, const AcReal& b) { return a > b ? a : b; }
__device__ inline AcReal
_device_min(const AcReal& a, const AcReal& b) { return a < b ? a : b; }
__device__ inline AcReal
_device_sum(const AcReal& a, const AcReal& b) { return a + b; }
/* Function used to determine the values used during reduction */
__device__ inline AcReal
_device_length_scal(const AcReal& a) { return AcReal(a); }
__device__ inline AcReal
_device_squared_scal(const AcReal& a) { return (AcReal)(a*a); }
__device__ inline AcReal
_device_exp_squared_scal(const AcReal& a) { return exp(a)*exp(a); }
__device__ inline AcReal
_device_length_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return sqrt(a*a + b*b + c*c); }
__device__ inline AcReal
_device_squared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return _device_squared_scal(a) + _device_squared_scal(b) + _device_squared_scal(c); }
__device__ inline AcReal
_device_exp_squared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return _device_exp_squared_scal(a) + _device_exp_squared_scal(b) + _device_exp_squared_scal(c); }
// clang-format on
__device__ inline bool
oob(const int& i, const int& j, const int& k)
{
if (i >= d_mesh_info.int_params[AC_nx] ||
j >= d_mesh_info.int_params[AC_ny] ||
k >= d_mesh_info.int_params[AC_nz])
return true;
else
return false;
}
template <ReduceInitialScalFunc reduce_initial>
__global__ void
_kernel_reduce_scal(const __restrict__ AcReal* src, AcReal* dst)
{
const int i = threadIdx.x + blockIdx.x * blockDim.x;
const int j = threadIdx.y + blockIdx.y * blockDim.y;
const int k = threadIdx.z + blockIdx.z * blockDim.z;
if (oob(i, j, k))
return;
const int src_idx = DEVICE_VTXBUF_IDX(
i + d_mesh_info.int_params[AC_nx_min],
j + d_mesh_info.int_params[AC_ny_min],
k + d_mesh_info.int_params[AC_nz_min]);
const int dst_idx = DEVICE_1D_COMPDOMAIN_IDX(i, j, k);
dst[dst_idx] = reduce_initial(src[src_idx]);
}
template <ReduceInitialVecFunc reduce_initial>
__global__ void
_kernel_reduce_vec(const __restrict__ AcReal* src_a,
const __restrict__ AcReal* src_b,
const __restrict__ AcReal* src_c, AcReal* dst)
{
const int i = threadIdx.x + blockIdx.x * blockDim.x;
const int j = threadIdx.y + blockIdx.y * blockDim.y;
const int k = threadIdx.z + blockIdx.z * blockDim.z;
if (oob(i, j, k))
return;
const int src_idx = DEVICE_VTXBUF_IDX(
i + d_mesh_info.int_params[AC_nx_min],
j + d_mesh_info.int_params[AC_ny_min],
k + d_mesh_info.int_params[AC_nz_min]);
const int dst_idx = DEVICE_1D_COMPDOMAIN_IDX(i, j, k);
dst[dst_idx] = reduce_initial(src_a[src_idx], src_b[src_idx],
src_c[src_idx]);
}
///////////////////////////////////////////////////////////////////////////////
#define BLOCK_SIZE (1024)
#define ELEMS_PER_THREAD (32)
template <ReduceFunc reduce>
__global__ void
_kernel_reduce(AcReal* src, AcReal* result)
{
const int idx = threadIdx.x + blockIdx.x * BLOCK_SIZE * ELEMS_PER_THREAD;
const int scratchpad_size = DCONST_INT(AC_nxyz);
if (idx >= scratchpad_size)
return;
__shared__ AcReal smem[BLOCK_SIZE];
AcReal tmp = src[idx];
for (int i = 1; i < ELEMS_PER_THREAD; ++i) {
const int src_idx = idx + i * BLOCK_SIZE;
if (src_idx >= scratchpad_size) {
// This check is for safety: stop before reading uninitialized values
// beyond the mesh boundaries, which would immediately propagate NANs
// into the result
break;
}
tmp = reduce(tmp, src[src_idx]);
}
smem[threadIdx.x] = tmp;
__syncthreads();
int offset = BLOCK_SIZE / 2;
while (offset > 0) {
if (threadIdx.x < offset) {
tmp = reduce(tmp, smem[threadIdx.x + offset]);
smem[threadIdx.x] = tmp;
}
offset /= 2;
__syncthreads();
}
if (threadIdx.x == 0)
src[idx] = tmp;
}
template <ReduceFunc reduce>
__global__ void
_kernel_reduce_block(const __restrict__ AcReal* src, AcReal* result)
{
const int scratchpad_size = DCONST_INT(AC_nxyz);
const int idx = threadIdx.x + blockIdx.x * BLOCK_SIZE * ELEMS_PER_THREAD;
AcReal tmp = src[idx];
const int block_offset = BLOCK_SIZE * ELEMS_PER_THREAD;
for (int i = 1; idx + i * block_offset < scratchpad_size; ++i)
tmp = reduce(tmp, src[idx + i * block_offset]);
*result = tmp;
}
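/*
 * Minimal illustrative sketch of a host-side reference reduction that mirrors
 * the two-stage device pipeline above; it can be used to validate
 * _kernel_reduce / _kernel_reduce_block output against a copied-back
 * scratchpad. The function name is an assumption; max, min or sum from
 * src/core/math_utils.h can be passed as the reduce operator.
 */
template <class ReduceOp>
static inline AcReal
host_reference_reduce(const AcReal* data, const int count, ReduceOp reduce)
{
    // Sequential fold over the same values the kernels reduce in parallel
    AcReal result = data[0];
    for (int i = 1; i < count; ++i)
        result = reduce(result, data[i]);
    return result;
}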
//////////////////////////////////////////////////////////////////////////////
AcReal
_reduce_scal(const cudaStream_t stream,
const ReductionType& rtype, const int& nx, const int& ny,
const int& nz, const AcReal* vertex_buffer,
AcReal* reduce_scratchpad, AcReal* reduce_result)
{
bool solve_mean = false;
const dim3 tpb(32, 4, 1);
const dim3 bpg(int(ceil(AcReal(nx) / tpb.x)), int(ceil(AcReal(ny) / tpb.y)),
int(ceil(AcReal(nz) / tpb.z)));
const int scratchpad_size = nx * ny * nz;
const int bpg2 = (unsigned int)ceil(AcReal(scratchpad_size) /
AcReal(ELEMS_PER_THREAD * BLOCK_SIZE));
switch (rtype) {
case RTYPE_MAX:
_kernel_reduce_scal<_device_length_scal>
<<<bpg, tpb, 0, stream>>>(vertex_buffer, reduce_scratchpad);
_kernel_reduce<_device_max>
<<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
_kernel_reduce_block<_device_max>
<<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
break;
case RTYPE_MIN:
_kernel_reduce_scal<_device_length_scal>
<<<bpg, tpb, 0, stream>>>(vertex_buffer, reduce_scratchpad);
_kernel_reduce<_device_min>
<<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
_kernel_reduce_block<_device_min>
<<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
break;
case RTYPE_RMS:
_kernel_reduce_scal<_device_squared_scal>
<<<bpg, tpb, 0, stream>>>(vertex_buffer, reduce_scratchpad);
_kernel_reduce<_device_sum>
<<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
_kernel_reduce_block<_device_sum>
<<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
solve_mean = true;
break;
case RTYPE_RMS_EXP:
_kernel_reduce_scal<_device_exp_squared_scal>
<<<bpg, tpb, 0, stream>>>(vertex_buffer, reduce_scratchpad);
_kernel_reduce<_device_sum>
<<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
_kernel_reduce_block<_device_sum>
<<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
solve_mean = true;
break;
default:
ERROR("Unrecognized RTYPE");
}
AcReal result;
cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
if (solve_mean) {
const AcReal inv_n = AcReal(1.0) / (nx * ny * nz);
return inv_n * result;
}
else {
return result;
}
}
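/*
 * Minimal illustrative example of a call site for _reduce_scal. The buffer
 * names, mesh dimensions and the chosen reduction type are assumptions; in
 * practice they come from the device and mesh configuration.
 */
static AcReal
example_scal_reduction(const cudaStream_t stream, const AcReal* d_vertex_buffer,
                       AcReal* d_scratchpad, AcReal* d_result)
{
    const int nx = 128, ny = 128, nz = 128; // assumed computational-domain size
    // Maximum over the computational domain of the given vertex buffer
    return _reduce_scal(stream, RTYPE_MAX, nx, ny, nz, d_vertex_buffer,
                        d_scratchpad, d_result);
}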
AcReal
_reduce_vec(const cudaStream_t stream,
const ReductionType& rtype, const int& nx, const int& ny,
const int& nz, const AcReal* vertex_buffer_a,
const AcReal* vertex_buffer_b, const AcReal* vertex_buffer_c,
AcReal* reduce_scratchpad, AcReal* reduce_result)
{
bool solve_mean = false;
const dim3 tpb(32, 4, 1);
const dim3 bpg(int(ceil(float(nx) / tpb.x)),
int(ceil(float(ny) / tpb.y)),
int(ceil(float(nz) / tpb.z)));
const int scratchpad_size = nx * ny * nz;
const int bpg2 = (unsigned int)ceil(float(scratchpad_size) /
float(ELEMS_PER_THREAD * BLOCK_SIZE));
// "Features" of this quick & efficient reduction:
// Block size must be smaller than the computational domain size
// (otherwise we would have do some additional bounds checking in the
// second half of _kernel_reduce, which gets quite confusing)
// Also the BLOCK_SIZE must be a multiple of two s.t. we can easily split
// the work without worrying too much about the array bounds.
ERRCHK(BLOCK_SIZE <= scratchpad_size);
ERRCHK(!(BLOCK_SIZE % 2));
// NOTE! Also does not work properly with non-power of two mesh dimension
// Issue is with "smem[BLOCK_SIZE];". If you init smem to NANs, you can
// see that uninitialized smem values are used in the comparison
ERRCHK(is_power_of_two(nx));
ERRCHK(is_power_of_two(ny));
ERRCHK(is_power_of_two(nz));
switch (rtype) {
case RTYPE_MAX:
_kernel_reduce_vec<_device_length_vec>
<<<bpg, tpb, 0, stream>>>(vertex_buffer_a, vertex_buffer_b, vertex_buffer_c,
reduce_scratchpad);
_kernel_reduce<_device_max>
<<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
_kernel_reduce_block<_device_max>
<<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
break;
case RTYPE_MIN:
_kernel_reduce_vec<_device_length_vec>
<<<bpg, tpb, 0, stream>>>(vertex_buffer_a, vertex_buffer_b, vertex_buffer_c,
reduce_scratchpad);
_kernel_reduce<_device_min>
<<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
_kernel_reduce_block<_device_min>
<<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
break;
case RTYPE_RMS:
_kernel_reduce_vec<_device_squared_vec>
<<<bpg, tpb, 0, stream>>>(vertex_buffer_a, vertex_buffer_b, vertex_buffer_c,
reduce_scratchpad);
_kernel_reduce<_device_sum>
<<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
_kernel_reduce_block<_device_sum>
<<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
solve_mean = true;
break;
case RTYPE_RMS_EXP:
_kernel_reduce_vec<_device_exp_squared_vec>
<<<bpg, tpb, 0, stream>>>(vertex_buffer_a, vertex_buffer_b, vertex_buffer_c,
reduce_scratchpad);
_kernel_reduce<_device_sum>
<<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
_kernel_reduce_block<_device_sum>
<<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
solve_mean = true;
break;
default:
ERROR("Unrecognized RTYPE");
}
AcReal result;
cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
if (solve_mean) {
const AcReal inv_n = AcReal(1.0) / (nx * ny * nz);
return inv_n * result;
}
else {
return result;
}
}

742
src/core/kernels/rk3.cuh Normal file
View File

@@ -0,0 +1,742 @@
/*
Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
This file is part of Astaroth.
Astaroth is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Astaroth is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* @file
* \brief Implementation of the integration pipeline
*
*
*
*/
#pragma once
#include "device_globals.cuh"
#include <assert.h>
/*
#define RK_THREADS_X (32)
#define RK_THREADS_Y (1)
#define RK_THREADS_Z (4)
#define RK_LAUNCH_BOUND_MIN_BLOCKS (4)
#define RK_THREADBLOCK_SIZE (RK_THREADS_X * RK_THREADS_Y * RK_THREADS_Z)
*/
static __device__ __forceinline__ int
IDX(const int i)
{
return i;
}
static __device__ __forceinline__ int
IDX(const int i, const int j, const int k)
{
return DEVICE_VTXBUF_IDX(i, j, k);
}
static __device__ __forceinline__ int
IDX(const int3 idx)
{
return DEVICE_VTXBUF_IDX(idx.x, idx.y, idx.z);
}
static __forceinline__ AcMatrix
create_rotz(const AcReal radians)
{
AcMatrix mat;
mat.row[0] = (AcReal3){cos(radians), -sin(radians), 0};
mat.row[1] = (AcReal3){sin(radians), cos(radians), 0};
mat.row[2] = (AcReal3){0, 0, 1}; // rotation about the z axis leaves the z component unchanged
return mat;
}
#if AC_DOUBLE_PRECISION == 0
#define sin __sinf
#define cos __cosf
#define exp __expf
#define rsqrt rsqrtf // hardware reciprocal sqrt
#endif // AC_DOUBLE_PRECISION == 0
/*
typedef struct {
int i, j, k;
} int3;*/
/*
* =============================================================================
* Level 0 (Input Assembly Stage)
* =============================================================================
*/
/*
* =============================================================================
* Level 0.1 (Read stencil elements and solve derivatives)
* =============================================================================
*/
static __device__ __forceinline__ AcReal
first_derivative(const AcReal* __restrict__ pencil, const AcReal inv_ds)
{
#if STENCIL_ORDER == 2
const AcReal coefficients[] = {0, 1.0 / 2.0};
#elif STENCIL_ORDER == 4
const AcReal coefficients[] = {0, 2.0 / 3.0, -1.0 / 12.0};
#elif STENCIL_ORDER == 6
const AcReal coefficients[] = {0, 3.0 / 4.0, -3.0 / 20.0, 1.0 / 60.0};
#elif STENCIL_ORDER == 8
const AcReal coefficients[] = {0, 4.0 / 5.0, -1.0 / 5.0, 4.0 / 105.0,
-1.0 / 280.0};
#endif
#define MID (STENCIL_ORDER / 2)
AcReal res = 0;
#pragma unroll
for (int i = 1; i <= MID; ++i)
res += coefficients[i] * (pencil[MID + i] - pencil[MID - i]);
return res * inv_ds;
}
static __device__ __forceinline__ AcReal
second_derivative(const AcReal* __restrict__ pencil, const AcReal inv_ds)
{
#if STENCIL_ORDER == 2
const AcReal coefficients[] = {-2., 1.};
#elif STENCIL_ORDER == 4
const AcReal coefficients[] = {-5.0/2.0, 4.0/3.0, -1.0/12.0};
#elif STENCIL_ORDER == 6
const AcReal coefficients[] = {-49.0 / 18.0, 3.0 / 2.0, -3.0 / 20.0,
1.0 / 90.0};
#elif STENCIL_ORDER == 8
const AcReal coefficients[] = {-205.0 / 72.0, 8.0 / 5.0, -1.0 / 5.0,
8.0 / 315.0, -1.0 / 560.0};
#endif
#define MID (STENCIL_ORDER / 2)
AcReal res = coefficients[0] * pencil[MID];
#pragma unroll
for (int i = 1; i <= MID; ++i)
res += coefficients[i] * (pencil[MID + i] + pencil[MID - i]);
return res * inv_ds * inv_ds;
}
/** inv_ds: inverted mesh spacing f.ex. 1. / mesh.int_params[AC_dsx] */
static __device__ __forceinline__ AcReal
cross_derivative(const AcReal* __restrict__ pencil_a,
const AcReal* __restrict__ pencil_b, const AcReal inv_ds_a,
const AcReal inv_ds_b)
{
#if STENCIL_ORDER == 2
const AcReal coefficients[] = {0, 1.0 / 4.0};
#elif STENCIL_ORDER == 4
const AcReal coefficients[] = {0, 1.0 / 32.0, 1.0 / 64.0}; // TODO correct coefficients, these are just placeholders
#elif STENCIL_ORDER == 6
const AcReal fac = (1. / 720.);
const AcReal coefficients[] = {0.0 * fac, 270.0 * fac, -27.0 * fac,
2.0 * fac};
#elif STENCIL_ORDER == 8
const AcReal fac = (1. / 20160.);
const AcReal coefficients[] = {0.0 * fac, 8064. * fac, -1008. * fac,
128. * fac, -9. * fac};
#endif
#define MID (STENCIL_ORDER / 2)
AcReal res = AcReal(0.);
#pragma unroll
for (int i = 1; i <= MID; ++i) {
res += coefficients[i] * (pencil_a[MID + i] + pencil_a[MID - i] -
pencil_b[MID + i] - pencil_b[MID - i]);
}
return res * inv_ds_a * inv_ds_b;
}
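/*
 * Worked example of the weights above: for STENCIL_ORDER == 6, first_derivative()
 * evaluates the standard 6th-order central difference
 *
 *   f'(x) ~ ( 3/4*(f[+1] - f[-1]) - 3/20*(f[+2] - f[-2]) + 1/60*(f[+3] - f[-3]) ) * inv_ds
 *
 * with pencil[MID +- i] supplying the f[+-i] samples.
 */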
static __device__ __forceinline__ AcReal
derx(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
AcReal pencil[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y, vertexIdx.z)];
return first_derivative(pencil, DCONST_REAL(AC_inv_dsx));
}
static __device__ __forceinline__ AcReal
derxx(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
AcReal pencil[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y, vertexIdx.z)];
return second_derivative(pencil, DCONST_REAL(AC_inv_dsx));
}
static __device__ __forceinline__ AcReal
derxy(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
AcReal pencil_a[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil_a[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2,
vertexIdx.y + offset - STENCIL_ORDER / 2, vertexIdx.z)];
AcReal pencil_b[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil_b[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2,
vertexIdx.y + STENCIL_ORDER / 2 - offset, vertexIdx.z)];
return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsx),
DCONST_REAL(AC_inv_dsy));
}
static __device__ __forceinline__ AcReal
derxz(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
AcReal pencil_a[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil_a[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y,
vertexIdx.z + offset - STENCIL_ORDER / 2)];
AcReal pencil_b[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil_b[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y,
vertexIdx.z + STENCIL_ORDER / 2 - offset)];
return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsx),
DCONST_REAL(AC_inv_dsz));
}
static __device__ __forceinline__ AcReal
dery(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
AcReal pencil[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2, vertexIdx.z)];
return first_derivative(pencil, DCONST_REAL(AC_inv_dsy));
}
static __device__ __forceinline__ AcReal
deryy(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
AcReal pencil[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2, vertexIdx.z)];
return second_derivative(pencil, DCONST_REAL(AC_inv_dsy));
}
static __device__ __forceinline__ AcReal
deryz(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
AcReal pencil_a[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil_a[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2,
vertexIdx.z + offset - STENCIL_ORDER / 2)];
AcReal pencil_b[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil_b[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2,
vertexIdx.z + STENCIL_ORDER / 2 - offset)];
return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsy),
DCONST_REAL(AC_inv_dsz));
}
static __device__ __forceinline__ AcReal
derz(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
AcReal pencil[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z + offset - STENCIL_ORDER / 2)];
return first_derivative(pencil, DCONST_REAL(AC_inv_dsz));
}
static __device__ __forceinline__ AcReal
derzz(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
AcReal pencil[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z + offset - STENCIL_ORDER / 2)];
return second_derivative(pencil, DCONST_REAL(AC_inv_dsz));
}
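/*
 * Note: the helpers above compose directly; e.g. the Laplacian of a raw vertex
 * buffer at vertexIdx is derxx(vertexIdx, arr) + deryy(vertexIdx, arr) +
 * derzz(vertexIdx, arr). The cached path included below is intended to compute
 * the same quantities from pre-gathered stencil data.
 */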
/*
* =============================================================================
* Level 0.2 (Caching functions)
* =============================================================================
*/
#include "stencil_assembly.cuh"
/*
typedef struct {
AcRealData x;
AcRealData y;
AcRealData z;
} AcReal3Data;
static __device__ __forceinline__ AcReal3Data
read_data(const int i, const int j, const int k,
AcReal* __restrict__ buf[], const int3& handle)
{
AcReal3Data data;
data.x = read_data(i, j, k, buf, handle.x);
data.y = read_data(i, j, k, buf, handle.y);
data.z = read_data(i, j, k, buf, handle.z);
return data;
}
*/
/*
* =============================================================================
* Level 0.3 (Built-in functions available during the Stencil Processing Stage)
* =============================================================================
*/
static __host__ __device__ __forceinline__ AcReal3
operator-(const AcReal3& a, const AcReal3& b)
{
return (AcReal3){a.x - b.x, a.y - b.y, a.z - b.z};
}
static __host__ __device__ __forceinline__ AcReal3
operator+(const AcReal3& a, const AcReal3& b)
{
return (AcReal3){a.x + b.x, a.y + b.y, a.z + b.z};
}
static __host__ __device__ __forceinline__ AcReal3
operator-(const AcReal3& a)
{
return (AcReal3){-a.x, -a.y, -a.z};
}
static __host__ __device__ __forceinline__ AcReal3
operator*(const AcReal a, const AcReal3& b)
{
return (AcReal3){a * b.x, a * b.y, a * b.z};
}
static __host__ __device__ __forceinline__ AcReal
dot(const AcReal3& a, const AcReal3& b)
{
return a.x * b.x + a.y * b.y + a.z * b.z;
}
static __host__ __device__ __forceinline__ AcReal3
mul(const AcMatrix& aa, const AcReal3& x)
{
return (AcReal3){dot(aa.row[0], x), dot(aa.row[1], x), dot(aa.row[2], x)};
}
static __host__ __device__ __forceinline__ AcReal3
cross(const AcReal3& a, const AcReal3& b)
{
AcReal3 c;
c.x = a.y * b.z - a.z * b.y;
c.y = a.z * b.x - a.x * b.z;
c.z = a.x * b.y - a.y * b.x;
return c;
}
static __host__ __device__ __forceinline__ bool
is_valid(const AcReal a)
{
return !isnan(a) && !isinf(a);
}
static __host__ __device__ __forceinline__ bool
is_valid(const AcReal3& a)
{
return is_valid(a.x) && is_valid(a.y) && is_valid(a.z);
}
/*
* =============================================================================
* Level 1 (Stencil Processing Stage)
* =============================================================================
*/
/*
* =============================================================================
* Level 1.1 (Terms)
* =============================================================================
*/
static __device__ __forceinline__ AcReal
laplace(const AcRealData& data)
{
return hessian(data).row[0].x + hessian(data).row[1].y + hessian(data).row[2].z;
}
static __device__ __forceinline__ AcReal
divergence(const AcReal3Data& vec)
{
return gradient(vec.x).x + gradient(vec.y).y + gradient(vec.z).z;
}
static __device__ __forceinline__ AcReal3
laplace_vec(const AcReal3Data& vec)
{
return (AcReal3){laplace(vec.x), laplace(vec.y), laplace(vec.z)};
}
static __device__ __forceinline__ AcReal3
curl(const AcReal3Data& vec)
{
return (AcReal3){gradient(vec.z).y - gradient(vec.y).z,
gradient(vec.x).z - gradient(vec.z).x,
gradient(vec.y).x - gradient(vec.x).y};
}
static __device__ __forceinline__ AcReal3
gradient_of_divergence(const AcReal3Data& vec)
{
return (AcReal3){hessian(vec.x).row[0].x + hessian(vec.y).row[0].y + hessian(vec.z).row[0].z,
hessian(vec.x).row[1].x + hessian(vec.y).row[1].y + hessian(vec.z).row[1].z,
hessian(vec.x).row[2].x + hessian(vec.y).row[2].y + hessian(vec.z).row[2].z};
}
// Takes uu gradients and returns S
static __device__ __forceinline__ AcMatrix
stress_tensor(const AcReal3Data& vec)
{
AcMatrix S;
S.row[0].x = AcReal(2. / 3.) * gradient(vec.x).x -
AcReal(1. / 3.) * (gradient(vec.y).y + gradient(vec.z).z);
S.row[0].y = AcReal(1. / 2.) * (gradient(vec.x).y + gradient(vec.y).x);
S.row[0].z = AcReal(1. / 2.) * (gradient(vec.x).z + gradient(vec.z).x);
S.row[1].y = AcReal(2. / 3.) * gradient(vec.y).y -
AcReal(1. / 3.) * (gradient(vec.x).x + gradient(vec.z).z);
S.row[1].z = AcReal(1. / 2.) * (gradient(vec.y).z + gradient(vec.z).y);
S.row[2].z = AcReal(2. / 3.) * gradient(vec.z).z -
AcReal(1. / 3.) * (gradient(vec.x).x + gradient(vec.y).y);
S.row[1].x = S.row[0].y;
S.row[2].x = S.row[0].z;
S.row[2].y = S.row[1].z;
return S;
}
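/*
 * For reference, the matrix assembled above is the traceless rate-of-strain
 * tensor
 *
 *   S_ij = (1/2) * (d_i u_j + d_j u_i) - (1/3) * delta_ij * (div u),
 *
 * which is symmetric, so only the upper triangle is computed and mirrored.
 */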
static __device__ __forceinline__ AcReal
contract(const AcMatrix& mat)
{
AcReal res = 0;
#pragma unroll
for (int i = 0; i < 3; ++i)
res += dot(mat.row[i], mat.row[i]);
return res;
}
/*
* =============================================================================
* Level 1.2 (Equations)
* =============================================================================
*/
static __device__ __forceinline__ AcReal
length(const AcReal3& vec)
{
return sqrt(vec.x * vec.x + vec.y * vec.y + vec.z * vec.z);
}
static __device__ __forceinline__ AcReal
reciprocal_len(const AcReal3& vec)
{
return rsqrt(vec.x * vec.x + vec.y * vec.y + vec.z * vec.z);
}
static __device__ __forceinline__ AcReal3
normalized(const AcReal3& vec)
{
const AcReal inv_len = reciprocal_len(vec);
return inv_len * vec;
}
// Sinusoidal forcing
// https://arxiv.org/pdf/1704.04676.pdf
__constant__ AcReal3 forcing_vec;
__constant__ AcReal forcing_phi;
static __device__ __forceinline__ AcReal3
forcing(const int i, const int j, const int k)
{
#define DOMAIN_SIZE_X (DCONST_INT(AC_nx) * DCONST_REAL(AC_dsx))
#define DOMAIN_SIZE_Y (DCONST_INT(AC_ny) * DCONST_REAL(AC_dsy))
#define DOMAIN_SIZE_Z (DCONST_INT(AC_nz) * DCONST_REAL(AC_dsz))
const AcReal3 k_vec = (AcReal3){(i - DCONST_INT(AC_nx_min)) * DCONST_REAL(AC_dsx) - AcReal(.5) * DOMAIN_SIZE_X,
(j - DCONST_INT(AC_ny_min)) * DCONST_REAL(AC_dsy) - AcReal(.5) * DOMAIN_SIZE_Y,
(k - DCONST_INT(AC_nz_min)) * DCONST_REAL(AC_dsz) - AcReal(.5) * DOMAIN_SIZE_Z};
AcReal inv_len = reciprocal_len(k_vec);
if (isnan(inv_len) || isinf(inv_len))
inv_len = 0;
if (inv_len > 2) // hack to make it cool
inv_len = 2;
const AcReal k_dot_x = dot(k_vec, forcing_vec);
const AcReal waves = cos(k_dot_x)*cos(forcing_phi) - sin(k_dot_x) * sin(forcing_phi);
return inv_len * inv_len * waves * forcing_vec;
}
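/*
 * For reference: with f = forcing_vec, phi = forcing_phi and x_c the domain
 * centre, the expression above evaluates to
 *
 *   F(x) = f * cos(f . (x - x_c) + phi) / |x - x_c|^2,
 *
 * where 1/|x - x_c| is set to 0 at the centre and capped at 2 ("hack to make it cool").
 */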
// Note: LNT0 and LNRHO0 must be set very carefully: if their magnitude differs from the other values in the mesh, then we will inherently lose precision
#define LNT0 (AcReal(0.0))
#define LNRHO0 (AcReal(0.0))
#define H_CONST (AcReal(0.0))
#define C_CONST (AcReal(0.0))
template <int step_number>
static __device__ __forceinline__ AcReal
rk3_integrate(const AcReal state_previous, const AcReal state_current,
const AcReal rate_of_change, const AcReal dt)
{
// Williamson (1980)
const AcReal alpha[] = {0, AcReal(.0), AcReal(-5. / 9.), AcReal(-153. / 128.)};
const AcReal beta[] = {0, AcReal(1. / 3.), AcReal(15. / 16.),
AcReal(8. / 15.)};
// Note the indexing: the arrays are offset by +1 to avoid an unnecessary "out-of-bounds"
// compiler warning about beta[step_number - 1], which is only accessed when step_number >= 1
switch (step_number) {
case 0:
return state_current + beta[step_number + 1] * rate_of_change * dt;
case 1: // Fallthrough
case 2:
return state_current +
beta[step_number + 1] *
(alpha[step_number + 1] * (AcReal(1.) / beta[step_number]) *
(state_current - state_previous) +
rate_of_change * dt);
default:
return NAN;
}
}
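/*
 * Minimal illustrative sketch of how the three substeps chain together for a
 * scalar ODE dy/dt = -y; the function name is an assumption. In the real
 * pipeline the substeps are applied per vertex by the generated solve<>
 * kernels, with the previous state kept in the output buffers.
 */
static __device__ __forceinline__ AcReal
example_rk3_full_step(const AcReal y0, const AcReal dt)
{
    // Substep 0: state_previous is not used by the step-0 branch
    const AcReal y1 = rk3_integrate<0>(y0, y0, -y0, dt);
    // Substeps 1 and 2 reconstruct the Williamson register from (current - previous)
    const AcReal y2 = rk3_integrate<1>(y0, y1, -y1, dt);
    return rk3_integrate<2>(y1, y2, -y2, dt);
}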
/*
template <int step_number>
static __device__ __forceinline__ AcReal
rk3_integrate_scal(const AcReal state_previous, const AcReal state_current,
const AcReal rate_of_change, const AcReal dt)
{
// Williamson (1980)
const AcReal alpha[] = {AcReal(.0), AcReal(-5. / 9.), AcReal(-153. / 128.)};
const AcReal beta[] = {AcReal(1. / 3.), AcReal(15. / 16.),
AcReal(8. / 15.)};
switch (step_number) {
case 0:
return state_current + beta[step_number] * rate_of_change * dt;
case 1: // Fallthrough
case 2:
return state_current +
beta[step_number] *
(alpha[step_number] * (AcReal(1.) / beta[step_number - 1]) *
(state_current - state_previous) +
rate_of_change * dt);
default:
return NAN;
}
}
*/
template <int step_number>
static __device__ __forceinline__ AcReal3
rk3_integrate(const AcReal3 state_previous, const AcReal3 state_current,
const AcReal3 rate_of_change, const AcReal dt)
{
return (AcReal3) { rk3_integrate<step_number>(state_previous.x, state_current.x, rate_of_change.x, dt),
rk3_integrate<step_number>(state_previous.y, state_current.y, rate_of_change.y, dt),
rk3_integrate<step_number>(state_previous.z, state_current.z, rate_of_change.z, dt)};
}
#define rk3(state_previous, state_current, rate_of_change, dt)\
rk3_integrate<step_number>(state_previous, value(state_current), rate_of_change, dt)
/*
template <int step_number>
static __device__ __forceinline__ AcReal
rk3_integrate(const int idx, const AcReal out, const int handle,
const AcRealData& in_cached, const AcReal rate_of_change, const AcReal dt)
{
return rk3_integrate_scal<step_number>(out, value(in_cached), rate_of_change, dt);
}
template <int step_number>
static __device__ __forceinline__ AcReal3
rk3_integrate(const int idx, const AcReal3 out, const int3& handle,
const AcReal3Data& in_cached, const AcReal3& rate_of_change, const AcReal dt)
{
return (AcReal3) {
rk3_integrate<step_number>(idx, out, handle.x, in_cached.x, rate_of_change.x, dt),
rk3_integrate<step_number>(idx, out, handle.y, in_cached.y, rate_of_change.y, dt),
rk3_integrate<step_number>(idx, out, handle.z, in_cached.z, rate_of_change.z, dt)
};
}
#define RK3(handle, in_cached, rate_of_change, dt) \
rk3_integrate<step_number>(idx, buffer.out, handle, in_cached, rate_of_change, dt)
*/
/*
* =============================================================================
* Level 1.3 (Kernels)
* =============================================================================
*/
static __device__ void
write(AcReal* __restrict__ out[], const int handle, const int idx, const AcReal value)
{
out[handle][idx] = value;
}
static __device__ void
write(AcReal* __restrict__ out[], const int3 vec, const int idx, const AcReal3 value)
{
write(out, vec.x, idx, value.x);
write(out, vec.y, idx, value.y);
write(out, vec.z, idx, value.z);
}
static __device__ AcReal
read_out(const int idx, AcReal* __restrict__ field[], const int handle)
{
return field[handle][idx];
}
static __device__ AcReal3
read_out(const int idx, AcReal* __restrict__ field[], const int3 handle)
{
return (AcReal3) { read_out(idx, field, handle.x),
read_out(idx, field, handle.y),
read_out(idx, field, handle.z) };
}
#define WRITE_OUT(handle, value) (write(buffer.out, handle, idx, value))
#define READ(handle) (read_data(vertexIdx, buffer.in, handle))
#define READ_OUT(handle) (read_out(idx, buffer.out, handle))
// WRITE is also defined here for clarity; it is not part of the DSL
//#define WRITE(HANDLE) (write(idx, ...)) s.t. we don't have to insert boilerplate in the middle of the function
#define GEN_KERNEL_PARAM_BOILERPLATE \
const int3 start, const int3 end, VertexBufferArray buffer
#define GEN_KERNEL_BUILTIN_VARIABLES_BOILERPLATE() \
const int3 vertexIdx = (int3){threadIdx.x + blockIdx.x * blockDim.x + start.x,\
threadIdx.y + blockIdx.y * blockDim.y + start.y,\
threadIdx.z + blockIdx.z * blockDim.z + start.z};\
if (vertexIdx.x >= end.x || vertexIdx.y >= end.y || vertexIdx.z >= end.z)\
return;\
\
\
assert(vertexIdx.x < DCONST_INT(AC_nx_max) && vertexIdx.y < DCONST_INT(AC_ny_max) &&\
vertexIdx.z < DCONST_INT(AC_nz_max));\
\
assert(vertexIdx.x >= DCONST_INT(AC_nx_min) && vertexIdx.y >= DCONST_INT(AC_ny_min) &&\
vertexIdx.z >= DCONST_INT(AC_nz_min));\
\
const int idx = IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z);
#include "stencil_process.cuh"
/*
* =============================================================================
* Level 2 (Host calls)
* =============================================================================
*/
static AcReal
randf(void)
{
return AcReal(rand()) / AcReal(RAND_MAX);
}
AcResult
rk3_step_async(const cudaStream_t stream, const dim3& tpb,
const int3& start, const int3& end, const int& step_number,
const AcReal dt, const AcMeshInfo& /*mesh_info*/,
VertexBufferArray* buffer)
{
/////////////////// Forcing
#if LFORCING
const AcReal ff_scale = AcReal(.2);
static AcReal3 ff = ff_scale * (AcReal3){1, 0, 0};
const AcReal radians = randf() * AcReal(2*M_PI) / 360 / 8;
const AcMatrix rotz = create_rotz(radians);
ff = mul(rotz, ff);
cudaMemcpyToSymbolAsync(forcing_vec, &ff, sizeof(ff), 0, cudaMemcpyHostToDevice, stream);
const AcReal ff_phi = AcReal(M_PI);//AcReal(2 * M_PI) * randf();
cudaMemcpyToSymbolAsync(forcing_phi, &ff_phi, sizeof(ff_phi), 0, cudaMemcpyHostToDevice, stream);
#endif // LFORCING
//////////////////////////
const int nx = end.x - start.x;
const int ny = end.y - start.y;
const int nz = end.z - start.z;
const dim3 bpg(
(unsigned int)ceil(nx / AcReal(tpb.x)),
(unsigned int)ceil(ny / AcReal(tpb.y)),
(unsigned int)ceil(nz / AcReal(tpb.z)));
if (step_number == 0)
solve<0><<<bpg, tpb, 0, stream>>>(start, end, *buffer, dt);
else if (step_number == 1)
solve<1><<<bpg, tpb, 0, stream>>>(start, end, *buffer, dt);
else
solve<2><<<bpg, tpb, 0, stream>>>(start, end, *buffer, dt);
ERRCHK_CUDA_KERNEL();
return AC_SUCCESS;
}

91
src/core/math_utils.h Normal file
View File

@@ -0,0 +1,91 @@
/*
Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
This file is part of Astaroth.
Astaroth is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Astaroth is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* @file
* \brief Small host-side math utilities.
*
* Provides min/max/sum/clamp, NAN/INF validity checks, int3 operators and a power-of-two test.
*
*/
#pragma once
#include <math.h> // isnan, isinf
#include <stdlib.h> // rand
template <class T>
static inline const T
max(const T& a, const T& b)
{
return a > b ? a : b;
}
template <class T>
static inline const T
min(const T& a, const T& b)
{
return a < b ? a : b;
}
template <class T>
static inline const T
sum(const T& a, const T& b)
{
return a + b;
}
template <class T>
static inline bool
is_valid(const T& val)
{
if (isnan(val) || isinf(val))
return false;
else
return true;
}
template <class T>
static inline const T
clamp(const T& val, const T& min, const T& max)
{
return val < min ? min : val > max ? max : val;
}
static inline AcReal
randr()
{
return AcReal(rand()) / AcReal(RAND_MAX);
}
static inline int3
operator+(const int3& a, const int3& b)
{
return (int3){a.x + b.x, a.y + b.y, a.z + b.z};
}
static inline int3
operator-(const int3& a, const int3& b)
{
return (int3){a.x - b.x, a.y - b.y, a.z - b.z};
}
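// A nonzero value is a power of two iff it has exactly one bit set:
// val & (val - 1) clears the lowest set bit, so the AND is zero only in that case.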
static inline bool
is_power_of_two(const unsigned val)
{
return val && !(val & (val - 1));
}