Added Astaroth 2.0
This commit is contained in:
70
src/core/CMakeLists.txt
Normal file
70
src/core/CMakeLists.txt
Normal file
@@ -0,0 +1,70 @@
|
||||
########################################
|
||||
## CMakeLists.txt for Astaroth Core ##
|
||||
########################################
|
||||
|
||||
#----------------------Find CUDA-----------------------------------------------#

# find_package(CUDA REQUIRED) gives a confusing error message if it fails,
# therefore the result is checked here and the reason is printed explicitly.
find_package(CUDA)
if (NOT CUDA_FOUND)
    message(FATAL_ERROR "CUDA not found")
endif()


#----------------------CUDA settings-------------------------------------------#

set(CUDA_SEPARABLE_COMPILATION ON)
set(CUDA_PROPAGATE_HOST_FLAGS ON)

# CUDA_BUILD_CUBIN requires that we're compiling for only one architecture
# set(CUDA_BUILD_CUBIN ON)


#----------------------Setup CUDA compilation flags----------------------------#

# Generate device code for every supported architecture:
# sm_37 (Kepler), sm_50 (Maxwell), sm_60 and sm_61 (Pascal).
# ftz = flush denormalized floats to zero.
set(CUDA_ARCH_FLAGS -gencode arch=compute_37,code=sm_37
                    -gencode arch=compute_50,code=sm_50
                    -gencode arch=compute_60,code=sm_60
                    -gencode arch=compute_61,code=sm_61
                    -lineinfo
                    --maxrregcount=255
                    -ftz=true
                    -std=c++11)
# -Xptxas -dlcm=ca opt-in to cache all global loads to L1/texture cache
#                =cg to opt out

# CMAKE_BUILD_TYPE is compared case-insensitively: CMake itself accepts
# "Debug", "debug" and "DEBUG" alike, but MATCHES is case-sensitive, so without
# the conversion only the all-uppercase spelling would enable the flags below.
string(TOUPPER "${CMAKE_BUILD_TYPE}" astaroth_build_type_upper)

# Additional CUDA optimization flags
if (astaroth_build_type_upper MATCHES "RELEASE")
    # Doesn't set any additional flags, see CUDA_NVCC_FLAGS_DEBUG below on how
    # to add more
    set(CUDA_NVCC_FLAGS_RELEASE ${CUDA_NVCC_FLAGS_RELEASE})
endif()

# Additional CUDA debug flags
if (astaroth_build_type_upper MATCHES "DEBUG")
    # The debug flags must be set inside this if clause, since either CMake 3.5
    # or nvcc 7.5 is bugged:
    # CMake converts these into empty strings when doing RELEASE build, but nvcc
    # 7.5 fails to parse empty flags.
    list(APPEND CUDA_NVCC_FLAGS_DEBUG
         --device-debug
         --generate-line-info
         --ptxas-options=-v)
endif()

# list(APPEND) does not create an empty leading element when CUDA_NVCC_FLAGS
# is initially empty (string concatenation with ';' does).
list(APPEND CUDA_NVCC_FLAGS ${CUDA_ARCH_FLAGS})

# Quoted, so that the list is printed with its separators instead of being
# concatenated into one unreadable word.
message(STATUS "CUDA_NVCC_FLAGS: ${CUDA_NVCC_FLAGS}")


#------------------Compile and create a static library-------------------------#
file(GLOB CUDA_SOURCES "*.cu" "kernels/*.cu")

# Use -fPIC if -fpic not supported. Some quick non-scientific tests:
# Without fpic: 4.94 user, 4.04 system, 0:09.88 elapsed
# With fpic:    4.96 user, 4.02 system, 0:09.90 elapsed
# With fPIC:    4.94 user, 4.05 system, 0:10.23 elapsed
cuda_add_library(astaroth_core STATIC ${CUDA_SOURCES} OPTIONS --compiler-options "-fpic")
|
||||
451
src/core/astaroth.cu
Normal file
451
src/core/astaroth.cu
Normal file
@@ -0,0 +1,451 @@
|
||||
/*
|
||||
Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
|
||||
|
||||
This file is part of Astaroth.
|
||||
|
||||
Astaroth is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
Astaroth is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file
|
||||
* \brief Multi-GPU implementation.
|
||||
*
|
||||
* Detailed info.
|
||||
*
|
||||
*/
|
||||
#include "astaroth.h"
|
||||
#include "errchk.h"
|
||||
|
||||
#include "device.cuh"
|
||||
#include "math_utils.h" // sum for reductions
|
||||
#include "standalone/config_loader.h" // update_config
|
||||
|
||||
const char* intparam_names[] = {AC_FOR_INT_PARAM_TYPES(AC_GEN_STR)};
|
||||
const char* realparam_names[] = {AC_FOR_REAL_PARAM_TYPES(AC_GEN_STR)};
|
||||
const char* vtxbuf_names[] = {AC_FOR_VTXBUF_HANDLES(AC_GEN_STR)};
|
||||
|
||||
|
||||
static const int MAX_NUM_DEVICES = 32;
|
||||
static int num_devices = 1;
|
||||
static Device devices[MAX_NUM_DEVICES] = {};
|
||||
|
||||
typedef struct {
|
||||
int3 m;
|
||||
int3 n;
|
||||
} Grid;
|
||||
|
||||
/**
 * Builds a Grid from the integer parameters of a mesh configuration.
 *
 * @param config Configuration whose AC_m{x,y,z} and AC_n{x,y,z} entries are read.
 * @return Grid with m = (AC_mx, AC_my, AC_mz) and n = (AC_nx, AC_ny, AC_nz).
 */
static Grid
createGrid(const AcMeshInfo& config)
{
    Grid result;

    result.m.x = config.int_params[AC_mx];
    result.m.y = config.int_params[AC_my];
    result.m.z = config.int_params[AC_mz];

    result.n.x = config.int_params[AC_nx];
    result.n.y = config.int_params[AC_ny];
    result.n.z = config.int_params[AC_nz];

    return result;
}
|
||||
|
||||
// Extents of the whole simulation domain. A grid consists of num_devices subgrids
static Grid grid;
// Extents of the part of the domain assigned to a single device. Set by acInit()
static Grid subgrid;
|
||||
|
||||
static int
|
||||
gridIdx(const Grid& grid, const int i, const int j, const int k)
|
||||
{
|
||||
return i + j * grid.m.x + k * grid.m.x * grid.m.y;
|
||||
}
|
||||
|
||||
/**
 * Inverse of gridIdx(): splits a linear index into its (x, y, z) components.
 *
 * @param grid Grid whose m.x and m.y give the row and plane extents.
 * @param idx  Linear index.
 * @return The 3D index corresponding to idx.
 */
static int3
gridIdx3d(const Grid& grid, const int idx)
{
    const int row_len   = grid.m.x;
    const int plane_len = grid.m.x * grid.m.y;

    int3 pos;
    pos.x = idx % row_len;
    pos.y = (idx % plane_len) / row_len;
    pos.z = idx / plane_len;
    return pos;
}
|
||||
|
||||
void
|
||||
printInt3(const int3 vec)
|
||||
{
|
||||
printf("(%d, %d, %d)", vec.x, vec.y, vec.z);
|
||||
}
|
||||
|
||||
/**
 * Initializes the multi-GPU layer: queries the available devices, decomposes
 * the problem domain along the z axis and creates one Device per subgrid.
 *
 * @param config Configuration of the whole (undecomposed) mesh.
 * @return AC_SUCCESS on success, AC_FAILURE if no CUDA device could be found.
 */
AcResult
acInit(const AcMeshInfo& config)
{
    // Check devices. A failed query is handled like "no devices" instead of
    // continuing with whatever num_devices happened to contain.
    if (cudaGetDeviceCount(&num_devices) != cudaSuccess) {
        num_devices = 0;
    }
    if (num_devices < 1) {
        ERROR("No CUDA devices found!");
        return AC_FAILURE;
    }
    if (num_devices > MAX_NUM_DEVICES) {
        WARNING("More devices found than MAX_NUM_DEVICES. Using only MAX_NUM_DEVICES");
        num_devices = MAX_NUM_DEVICES;
    }
    if (!AC_MULTIGPU_ENABLED) {
        WARNING("MULTIGPU_ENABLED was false. Using only one device");
        num_devices = 1; // Use only one device if multi-GPU is not enabled
    }
    // Check that AC_nz is divisible by num_devices. This makes decomposing the
    // problem domain to multiple GPUs much easier since we do not have to worry
    // about remainders
    ERRCHK_ALWAYS(config.int_params[AC_nz] % num_devices == 0);

    // Decompose the problem domain
    // The main grid
    grid = createGrid(config);

    // Subgrids: each device gets an equally sized slab along z.
    // NOTE(review): update_config() presumably recomputes the parameters derived
    // from AC_nz (e.g. AC_mz); confirm in standalone/config_loader.h.
    AcMeshInfo subgrid_config = config;
    subgrid_config.int_params[AC_nz] /= num_devices;
    update_config(&subgrid_config);
    subgrid = createGrid(subgrid_config);

    // Periodic boundary conditions become weird if the system can "fold onto itself".
    ERRCHK_ALWAYS(subgrid.n.x >= STENCIL_ORDER);
    ERRCHK_ALWAYS(subgrid.n.y >= STENCIL_ORDER);
    ERRCHK_ALWAYS(subgrid.n.z >= STENCIL_ORDER);

    printf("Grid m ");    printInt3(grid.m);    printf("\n");
    printf("Grid n ");    printInt3(grid.n);    printf("\n");
    printf("Subgrid m "); printInt3(subgrid.m); printf("\n");
    printf("Subgrid n "); printInt3(subgrid.n); printf("\n");

    // Initialize the devices
    for (int i = 0; i < num_devices; ++i) {
        createDevice(i, subgrid_config, &devices[i]);
        printDeviceInfo(devices[i]);
    }
    return AC_SUCCESS;
}
|
||||
|
||||
/**
 * Destroys every device created by acInit().
 *
 * Handles are cleared after destruction and empty handles are skipped, so the
 * function is safe to call before acInit() (num_devices starts at 1 while the
 * handle array is zero-initialized) and safe to call more than once.
 *
 * @return AC_SUCCESS
 */
AcResult
acQuit(void)
{
    for (int i = 0; i < num_devices; ++i) {
        if (devices[i] == NULL)
            continue; // Never created or already destroyed

        destroyDevice(devices[i]);
        devices[i] = NULL; // Do not leave a dangling handle behind
    }
    return AC_SUCCESS;
}
|
||||
|
||||
int
|
||||
gridIdxx(const Grid grid, const int3 idx)
|
||||
{
|
||||
return gridIdx(grid, idx.x, idx.y, idx.z);
|
||||
}
|
||||
|
||||
/**
 * Copies num_vertices vertices of every vertex buffer from the host mesh to
 * the devices, starting from the global index src.
 *
 * The host mesh is one contiguous block of data whose dimensions are given by
 * the global variable "grid". The grid is decomposed into "subgrids", one for
 * each GPU. For each device i:
 *   - s0...s1 is the requested range in the global grid (s0 = src, s1 is the
 *     3D index reached num_vertices elements after s0),
 *   - d0...d1 is the range stored by device i in global coordinates,
 *   - da...db is their overlap (component-wise max of the starts and min of
 *     the ends).
 * Data is transferred to device i only if the overlap is non-empty along z.
 *
 * @param host_mesh    Source mesh on the host.
 * @param src          Global 3D index of the first vertex to copy.
 * @param num_vertices Number of vertices to copy.
 * @return AC_SUCCESS
 */
AcResult
acLoadWithOffset(const AcMesh& host_mesh, const int3& src, const int num_vertices)
{
    for (int i = 0; i < num_devices; ++i) {
        const int3 d0 = (int3){0, 0, i * subgrid.n.z}; // DECOMPOSITION OFFSET HERE
        const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.m.z};

        const int3 s0 = src;
        const int3 s1 = gridIdx3d(grid, gridIdx(grid, s0.x, s0.y, s0.z) + num_vertices);

        const int3 da = (int3){max(s0.x, d0.x), max(s0.y, d0.y), max(s0.z, d0.z)};
        const int3 db = (int3){min(s1.x, d1.x), min(s1.y, d1.y), min(s1.z, d1.z)};

        if (db.z >= da.z) {
            const int copy_cells = gridIdxx(subgrid, db) - gridIdxx(subgrid, da);
            // Translate the start of the overlap to the local index space of device i
            const int3 da_local = (int3){da.x, da.y, da.z - i * grid.n.z / num_devices}; // DECOMPOSITION OFFSET HERE
            copyMeshToDevice(devices[i], STREAM_PRIMARY, host_mesh, da, da_local, copy_cells);
        }
        // The leftover debug printf("\n") was removed here: it printed one blank
        // line per device on every load.
    }
    return AC_SUCCESS;
}
|
||||
|
||||
/**
 * Copies num_vertices vertices of every vertex buffer from the devices to the
 * host mesh, starting from the global index src.
 *
 * For each device i, the requested range s0...s1 is intersected with the range
 * d0...d1 stored by the device (both in global coordinates). The overlap
 * da...db is copied to the host only if it is non-empty along z.
 *
 * @param src          Global 3D index of the first vertex to copy.
 * @param num_vertices Number of vertices to copy.
 * @param host_mesh    Destination mesh on the host.
 * @return AC_SUCCESS
 */
AcResult
acStoreWithOffset(const int3& src, const int num_vertices, AcMesh* host_mesh)
{
    for (int i = 0; i < num_devices; ++i) {
        const int3 d0 = (int3){0, 0, i * subgrid.n.z}; // DECOMPOSITION OFFSET HERE
        const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.m.z};

        const int3 s0 = src;
        const int3 s1 = gridIdx3d(grid, gridIdx(grid, s0.x, s0.y, s0.z) + num_vertices);

        const int3 da = (int3){max(s0.x, d0.x), max(s0.y, d0.y), max(s0.z, d0.z)};
        const int3 db = (int3){min(s1.x, d1.x), min(s1.y, d1.y), min(s1.z, d1.z)};

        if (db.z >= da.z) {
            const int copy_cells = gridIdxx(subgrid, db) - gridIdxx(subgrid, da);
            // Translate the start of the overlap to the local index space of device i
            const int3 da_local = (int3){da.x, da.y, da.z - i * grid.n.z / num_devices}; // DECOMPOSITION OFFSET HERE
            copyMeshToHost(devices[i], STREAM_PRIMARY, da_local, da, copy_cells, host_mesh);
        }
        // The leftover debug printf("\n") was removed here: it printed one blank
        // line per device on every store.
    }
    return AC_SUCCESS;
}
|
||||
|
||||
// acCopyMeshToDevice
|
||||
AcResult
|
||||
acLoad(const AcMesh& host_mesh)
|
||||
{
|
||||
return acLoadWithOffset(host_mesh, (int3){0, 0, 0}, AC_VTXBUF_SIZE(host_mesh.info));
|
||||
}
|
||||
|
||||
// acCopyMeshToHost
|
||||
AcResult
|
||||
acStore(AcMesh* host_mesh)
|
||||
{
|
||||
return acStoreWithOffset((int3){0, 0, 0}, AC_VTXBUF_SIZE(host_mesh->info), host_mesh);
|
||||
}
|
||||
|
||||
/**
 * Launches one integration substep on every device.
 *
 * The integrated range is the same on each device: it starts STENCIL_ORDER/2
 * vertices from the origin of the subgrid and spans subgrid.n vertices.
 *
 * @param isubstep Index of the substep passed on to rkStep().
 * @param dt       Time step passed on to rkStep().
 * @return AC_SUCCESS
 */
AcResult
acIntegrateStep(const int& isubstep, const AcReal& dt)
{
    const int offset = STENCIL_ORDER/2;

    const int3 start = (int3){offset, offset, offset};
    const int3 end   = (int3){offset + subgrid.n.x,
                              offset + subgrid.n.y,
                              offset + subgrid.n.z};

    for (int device_idx = 0; device_idx < num_devices; ++device_idx)
        rkStep(devices[device_idx], STREAM_PRIMARY, isubstep, start, end, dt);

    return AC_SUCCESS;
}
|
||||
|
||||
/**
 * Applies the boundary conditions on all devices.
 *
 * With a single device the boundary conditions are applied on the whole
 * subgrid. With several devices they are first applied locally on each device
 * and the front and back plates are then exchanged between neighbours.
 *
 * Design notes (condensed from the original author's notes):
 *  - The older approach relied on unified memory, which represented the whole
 *    memory area as one huge mesh. It was replaced with explicit memory
 *    management, since getting the CUDA driver to migrate data intelligently
 *    across GPUs is much more difficult.
 *  - The globals "grid" and "subgrid" contain the extents of the whole
 *    simulation domain and of the decomposed grids, respectively. Every GPU is
 *    assigned the same amount of work, i.e. a "subgrid.m"-sized block of data.
 *  - The domain is decomposed with respect to the z dimension: if the grid
 *    contains (nx, ny, nz) vertices, the subgrids contain
 *    (nx, ny, nz / num_devices) vertices. A local index (i, j, k) of some
 *    subgrid maps to the global index (i, j, k + device_id * subgrid.n.z).
 *  - Thread block dimensions and index calculations are handled in device.cu.
 *    This layer passes local coordinates as int3, a Device handle and a
 *    StreamType to the single-GPU functions (functions defined in device.cu),
 *    which are intended to execute asynchronously.
 *  - Periodic boundary conditions applied on indices ranging from
 *    (0, 0, STENCIL_ORDER/2) to (subgrid.m.x, subgrid.m.y, subgrid.m.z - STENCIL_ORDER/2)
 *    on a single device are "local": they can be computed without exchanging
 *    data with neighbouring GPUs. Only the front and back plates outside this
 *    range need data from the neighbours. They are exchanged in a ring:
 *        (n) <-> 0 <-> 1 <-> ... <-> n <-> (0)
 *  - The old kernel-launching code of this function was moved into
 *    boundcondStep() in device.cu; acBoundcondStep() only distributes the work
 *    and manages the communication between GPUs.
 *
 * @return AC_SUCCESS
 */
AcResult
acBoundcondStep(void)
{
    acSynchronize();

    if (num_devices != 1) {
        // Local boundary conditions
        const int3 local_start = (int3){0, 0, STENCIL_ORDER/2}; // DECOMPOSITION OFFSET HERE
        const int3 local_end   = (int3){subgrid.m.x, subgrid.m.y, local_start.z + subgrid.n.z};
        for (int i = 0; i < num_devices; ++i)
            boundcondStep(devices[i], STREAM_PRIMARY, local_start, local_end);

        // Exchange halos
        const int num_vertices = subgrid.m.x * subgrid.m.y * STENCIL_ORDER/2;

        const int3 back_src  = (int3) {0, 0, subgrid.n.z};
        const int3 front_dst = (int3) {0, 0, 0};
        const int3 front_src = (int3) {0, 0, STENCIL_ORDER/2};
        const int3 back_dst  = (int3) {0, 0, STENCIL_ORDER/2 + subgrid.n.z};

        for (int i = 0; i < num_devices; ++i) {
            const int next = (i+1) % num_devices;

            // ...|ooooxxx|... -> xxx|ooooooo|...
            copyMeshDeviceToDevice(devices[i], STREAM_PRIMARY, back_src,
                                   devices[next], front_dst, num_vertices);

            // ...|ooooooo|xxx <- ...|xxxoooo|...
            copyMeshDeviceToDevice(devices[next], STREAM_PRIMARY, front_src,
                                   devices[i], back_dst, num_vertices);
        }
    } else {
        // A single device holds the whole domain
        boundcondStep(devices[0], STREAM_PRIMARY,
                      (int3){0, 0, 0}, (int3){subgrid.m.x, subgrid.m.y, subgrid.m.z});
    }

    acSynchronize();
    return AC_SUCCESS;
}
|
||||
|
||||
/**
 * Swaps the input and output vertex buffers of every device.
 *
 * @return AC_SUCCESS
 */
static AcResult
acSwapBuffers(void)
{
    int device_idx = 0;
    while (device_idx < num_devices) {
        swapBuffers(devices[device_idx]);
        ++device_idx;
    }
    return AC_SUCCESS;
}
|
||||
|
||||
/**
 * Advances the simulation by one full time step consisting of three substeps.
 * Each substep applies the boundary conditions, integrates and finally swaps
 * the input and output buffers.
 *
 * @param dt Time step passed on to acIntegrateStep().
 * @return AC_SUCCESS
 */
AcResult
acIntegrate(const AcReal& dt)
{
    const int num_substeps = 3;

    int isubstep = 0;
    while (isubstep < num_substeps) {
        acBoundcondStep();
        acIntegrateStep(isubstep, dt);
        acSwapBuffers();
        ++isubstep;
    }
    return AC_SUCCESS;
}
|
||||
|
||||
/**
 * Scalar reduction. Not implemented yet (TODO): the arguments are ignored and
 * the function always returns 0.
 */
AcReal
acReduceScal(const ReductionType& rtype,
             const VertexBufferHandle& vtxbuffer_handle)
{
    // TODO
    const AcReal not_implemented = 0;
    return not_implemented;
}
|
||||
|
||||
/**
 * Vector reduction. Not implemented yet (TODO): the arguments are ignored and
 * the function always returns 0.
 */
AcReal
acReduceVec(const ReductionType& rtype, const VertexBufferHandle& a,
            const VertexBufferHandle& b, const VertexBufferHandle& c)
{
    // TODO
    const AcReal not_implemented = 0;
    return not_implemented;
}
|
||||
|
||||
/**
 * Synchronizes every device in use by calling synchronize() with STREAM_ALL.
 *
 * @return AC_SUCCESS
 */
AcResult
acSynchronize(void)
{
    int device_idx = 0;
    while (device_idx < num_devices) {
        synchronize(devices[device_idx], STREAM_ALL);
        ++device_idx;
    }

    return AC_SUCCESS;
}
|
||||
309
src/core/device.cu
Normal file
309
src/core/device.cu
Normal file
@@ -0,0 +1,309 @@
|
||||
/*
|
||||
Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
|
||||
|
||||
This file is part of Astaroth.
|
||||
|
||||
Astaroth is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
Astaroth is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file
|
||||
* \brief Single-GPU layer: creation, destruction and per-device operations of a Device.
|
||||
*
|
||||
* Detailed info.
|
||||
*
|
||||
*/
|
||||
#include "device.cuh"
|
||||
|
||||
#include "errchk.h"
|
||||
|
||||
// Device pointers to the vertex buffers, one "in" and one "out" buffer per
// vertex buffer handle. swapBuffers() exchanges the two sets.
typedef struct {
    AcReal* in[NUM_VTXBUF_HANDLES];
    AcReal* out[NUM_VTXBUF_HANDLES];
} VertexBufferArray;

// Mesh configuration in constant memory. Uploaded by createDevice()
__constant__ AcMeshInfo d_mesh_info;

// Accessors for the integer and real parameters of d_mesh_info
#define DCONST_INT(X) (d_mesh_info.int_params[X])
#define DCONST_REAL(X) (d_mesh_info.real_params[X])
// Linear index of the vertex (i, j, k) in a vertex buffer
#define DEVICE_VTXBUF_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_mx) + (k)*DCONST_INT(AC_mxy))

// NOTE(review): included after the definitions above, presumably because the
// kernels refer to them; keep this order.
#include "kernels/kernels.cuh"
|
||||
|
||||
struct device_s {
|
||||
int id;
|
||||
AcMeshInfo local_config;
|
||||
|
||||
// Concurrency
|
||||
cudaStream_t streams[NUM_STREAM_TYPES];
|
||||
|
||||
// Memory
|
||||
VertexBufferArray vba;
|
||||
AcReal* reduce_scratchpad;
|
||||
AcReal* reduce_result;
|
||||
};
|
||||
|
||||
/**
 * Prints the properties and the current memory usage of a device to stdout.
 *
 * The device is made current first, since cudaMemGetInfo() reports the memory
 * of the current device and not of the device given as a parameter.
 *
 * @param device Handle of the device to describe.
 * @return AC_SUCCESS
 */
AcResult
printDeviceInfo(const Device device)
{
    const int device_id = device->id;
    cudaSetDevice(device_id);

    cudaDeviceProp props;
    cudaGetDeviceProperties(&props, device_id);
    printf("--------------------------------------------------\n");
    printf("Device Number: %d\n", device_id);
    const size_t bus_id_max_len = 128;
    char bus_id[bus_id_max_len];
    cudaDeviceGetPCIBusId(bus_id, bus_id_max_len, device_id);
    printf(" PCI bus ID: %s\n", bus_id);
    printf(" Device name: %s\n", props.name);
    printf(" Compute capability: %d.%d\n", props.major, props.minor);

    // Compute
    printf(" Compute\n");
    printf(" Clock rate (GHz): %g\n", props.clockRate / 1e6); // KHz -> GHz
    printf(" Stream processors: %d\n", props.multiProcessorCount);
    printf(" SP to DP flops performance ratio: %d:1\n", props.singleToDoublePrecisionPerfRatio);
    printf(" Compute mode: %d\n", (int)props.computeMode); // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g7eb25f5413a962faad0956d92bae10d0
    // Memory
    printf(" Global memory\n");
    printf(" Memory Clock Rate (MHz): %d\n", props.memoryClockRate / (1000));
    printf(" Memory Bus Width (bits): %d\n", props.memoryBusWidth);
    // The factor 2 is part of the original formula; memoryClockRate is converted
    // from KHz to Hz and the bus width from bits to bytes
    printf(" Peak Memory Bandwidth (GiB/s): %f\n",
           2 * (props.memoryClockRate * 1e3) * props.memoryBusWidth /
               (8. * 1024. * 1024. * 1024.));
    printf(" ECC enabled: %d\n", props.ECCEnabled);
    // Memory usage
    size_t free_bytes, total_bytes;
    cudaMemGetInfo(&free_bytes, &total_bytes);
    const size_t used_bytes = total_bytes - free_bytes;
    printf(" Total global mem: %.2f GiB\n",
           props.totalGlobalMem / (1024.0 * 1024 * 1024));
    printf(" Gmem used (GiB): %.2f\n", used_bytes / (1024.0 * 1024 * 1024));
    printf(" Gmem memory free (GiB): %.2f\n",
           free_bytes / (1024.0 * 1024 * 1024));
    printf(" Gmem memory total (GiB): %.2f\n",
           total_bytes / (1024.0 * 1024 * 1024));
    printf(" Caches\n");
    printf(" Local L1 cache supported: %d\n", props.localL1CacheSupported);
    printf(" Global L1 cache supported: %d\n", props.globalL1CacheSupported);
    printf(" L2 size: %d KiB\n", props.l2CacheSize / (1024));
    // totalConstMem and sharedMemPerBlock are of type size_t: %zu, not %ld
    printf(" Total const mem: %zu KiB\n", props.totalConstMem / (1024));
    printf(" Shared mem per block: %zu KiB\n",
           props.sharedMemPerBlock / (1024));
    printf(" Other\n");
    printf(" Warp size: %d\n", props.warpSize);
    // printf(" Single to double perf. ratio: %dx\n",
    // props.singleToDoublePrecisionPerfRatio); //Not supported with older CUDA
    // versions
    printf(" Stream priorities supported: %d\n",
           props.streamPrioritiesSupported);
    printf("--------------------------------------------------\n");

    return AC_SUCCESS;
}
|
||||
|
||||
static __global__ void dummy_kernel(void) {}
|
||||
|
||||
AcResult
|
||||
createDevice(const int id, const AcMeshInfo device_config, Device* device_handle)
|
||||
{
|
||||
cudaSetDevice(id);
|
||||
cudaDeviceReset();
|
||||
|
||||
// Create Device
|
||||
struct device_s* device = (struct device_s*) malloc(sizeof(*device));
|
||||
ERRCHK_ALWAYS(device);
|
||||
|
||||
device->id = id;
|
||||
device->local_config = device_config;
|
||||
|
||||
// Check that the code was compiled for the proper GPU architecture
|
||||
printf("Trying to run a dummy kernel. If this fails, make sure that your\n"
|
||||
"device supports the CUDA architecture you are compiling for.\n"
|
||||
"Running dummy kernel... ");
|
||||
fflush(stdout);
|
||||
dummy_kernel<<<1, 1>>>();
|
||||
ERRCHK_CUDA_KERNEL_ALWAYS();
|
||||
printf("Success!\n");
|
||||
|
||||
// Concurrency
|
||||
for (int i = 0; i < NUM_STREAM_TYPES; ++i) {
|
||||
cudaStreamCreate(&device->streams[i]);
|
||||
}
|
||||
|
||||
// Memory
|
||||
const size_t vba_size_bytes = AC_VTXBUF_SIZE_BYTES(device_config);
|
||||
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
|
||||
ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.in[i], vba_size_bytes));
|
||||
ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.out[i], vba_size_bytes));
|
||||
}
|
||||
ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->reduce_scratchpad,
|
||||
AC_VTXBUF_COMPDOMAIN_SIZE_BYTES(device_config)));
|
||||
ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->reduce_result, sizeof(AcReal)));
|
||||
|
||||
// Device constants
|
||||
ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_mesh_info, &device_config, sizeof(device_config), 0,
|
||||
cudaMemcpyHostToDevice));
|
||||
|
||||
printf("Created device %d (%p)\n", device->id, device);
|
||||
*device_handle = device;
|
||||
return AC_SUCCESS;
|
||||
}
|
||||
|
||||
/**
 * Releases everything owned by a device: the vertex and reduction buffers in
 * device memory, the streams and finally the handle itself.
 *
 * @param device Handle created with createDevice(). Invalid after the call.
 * @return AC_SUCCESS
 */
AcResult
destroyDevice(Device device)
{
    cudaSetDevice(device->id);
    printf("Destroying device %d (%p)\n", device->id, device);

    // Memory
    int buffer_idx = 0;
    while (buffer_idx < NUM_VTXBUF_HANDLES) {
        cudaFree(device->vba.in[buffer_idx]);
        cudaFree(device->vba.out[buffer_idx]);
        ++buffer_idx;
    }
    cudaFree(device->reduce_scratchpad);
    cudaFree(device->reduce_result);

    // Concurrency
    int stream_idx = 0;
    while (stream_idx < NUM_STREAM_TYPES) {
        cudaStreamDestroy(device->streams[stream_idx]);
        ++stream_idx;
    }

    // Destroy Device
    free(device);
    return AC_SUCCESS;
}
|
||||
|
||||
/**
 * Applies periodic boundary conditions to the input buffer of every vertex
 * buffer handle of a device.
 *
 * @param device      Device to operate on.
 * @param stream_type Stream passed on to periodic_boundconds().
 * @param start, end  Index range passed on to periodic_boundconds().
 * @return AC_SUCCESS
 */
AcResult
boundcondStep(const Device device, const StreamType stream_type, const int3& start, const int3& end)
{
    cudaSetDevice(device->id);

    int buffer_idx = 0;
    while (buffer_idx < NUM_VTXBUF_HANDLES) {
        periodic_boundconds(device->streams[stream_type], start, end, device->vba.in[buffer_idx]);
        ++buffer_idx;
    }
    return AC_SUCCESS;
}
|
||||
|
||||
/**
 * Placeholder for the scalar reduction: currently only makes the device
 * current.
 *
 * @return AC_SUCCESS
 */
AcResult
reduceScal(const Device device)
{
    const int device_id = device->id;
    cudaSetDevice(device_id);
    return AC_SUCCESS;
}
|
||||
|
||||
/**
 * Placeholder for the vector reduction: currently only makes the device
 * current.
 *
 * @return AC_SUCCESS
 */
AcResult
reduceVec(const Device device)
{
    const int device_id = device->id;
    cudaSetDevice(device_id);
    return AC_SUCCESS;
}
|
||||
|
||||
/**
 * Launches one integration substep on a device by calling rk3_step_async().
 *
 * @param device      Device to operate on.
 * @param stream_type Stream the substep is launched in.
 * @param step_number Index of the substep.
 * @param start, end  Index range passed on to rk3_step_async().
 * @param dt          Time step.
 * @return AC_SUCCESS
 */
AcResult
rkStep(const Device device, const StreamType stream_type, const int step_number,
       const int3& start, const int3& end, const AcReal dt)
{
    cudaSetDevice(device->id);

    const cudaStream_t stream = device->streams[stream_type];
    rk3_step_async(stream, step_number, start, end, dt, &device->vba);

    return AC_SUCCESS;
}
|
||||
|
||||
/**
 * Blocks until the work submitted to a device has finished.
 *
 * @param device      Device to synchronize.
 * @param stream_type STREAM_ALL synchronizes the whole device, any other value
 *                    only the corresponding stream.
 * @return AC_SUCCESS
 */
AcResult
synchronize(const Device device, const StreamType stream_type)
{
    cudaSetDevice(device->id);

    if (stream_type != STREAM_ALL) {
        cudaStreamSynchronize(device->streams[stream_type]);
        return AC_SUCCESS;
    }

    cudaDeviceSynchronize();
    return AC_SUCCESS;
}
|
||||
|
||||
/**
 * Starts an asynchronous host-to-device copy.
 *
 * @param device      Device owning dst.
 * @param stream_type Stream the copy is submitted to.
 * @param src         Source address on the host.
 * @param bytes       Number of bytes to copy.
 * @param dst         Destination address on the device.
 * @return AC_SUCCESS
 */
static AcResult
loadWithOffset(const Device device, const StreamType stream_type,
               const AcReal* src, const size_t bytes, AcReal* dst)
{
    cudaSetDevice(device->id);

    const cudaStream_t stream = device->streams[stream_type];
    ERRCHK_CUDA(cudaMemcpyAsync(dst, src, bytes, cudaMemcpyHostToDevice, stream));

    return AC_SUCCESS;
}
|
||||
|
||||
/**
 * Starts an asynchronous device-to-host copy.
 *
 * @param device      Device owning src.
 * @param stream_type Stream the copy is submitted to.
 * @param src         Source address on the device.
 * @param bytes       Number of bytes to copy.
 * @param dst         Destination address on the host.
 * @return AC_SUCCESS
 */
static AcResult
storeWithOffset(const Device device, const StreamType stream_type,
                const AcReal* src, const size_t bytes, AcReal* dst)
{
    cudaSetDevice(device->id);

    const cudaStream_t stream = device->streams[stream_type];
    ERRCHK_CUDA(cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDeviceToHost, stream));

    return AC_SUCCESS;
}
|
||||
|
||||
/**
 * Copies num_vertices vertices of every vertex buffer from the host mesh to
 * the input buffers of a device.
 *
 * @param device       Destination device.
 * @param stream_type  Stream the copies are submitted to.
 * @param host_mesh    Source mesh on the host.
 * @param src          3D index of the first vertex in the host mesh.
 * @param dst          3D index of the first vertex in the device buffers.
 * @param num_vertices Number of vertices to copy per vertex buffer.
 * @return AC_SUCCESS
 */
AcResult
copyMeshToDevice(const Device device, const StreamType stream_type,
                 const AcMesh& host_mesh, const int3& src, const int3& dst,
                 const int num_vertices)
{
    const size_t host_offset   = AC_VTXBUF_IDX(src.x, src.y, src.z, host_mesh.info);
    const size_t device_offset = AC_VTXBUF_IDX(dst.x, dst.y, dst.z, device->local_config);
    const size_t bytes         = num_vertices * sizeof(AcReal);

    for (int buffer_idx = 0; buffer_idx < NUM_VTXBUF_HANDLES; ++buffer_idx) {
        const AcReal* from = &host_mesh.vertex_buffer[buffer_idx][host_offset];
        AcReal* to         = &device->vba.in[buffer_idx][device_offset];
        loadWithOffset(device, stream_type, from, bytes, to);
    }
    return AC_SUCCESS;
}
|
||||
|
||||
/**
 * Copies num_vertices vertices of every vertex buffer from the input buffers
 * of a device to the host mesh.
 *
 * @param device       Source device.
 * @param stream_type  Stream the copies are submitted to.
 * @param src          3D index of the first vertex in the device buffers.
 * @param dst          3D index of the first vertex in the host mesh.
 * @param num_vertices Number of vertices to copy per vertex buffer.
 * @param host_mesh    Destination mesh on the host.
 * @return AC_SUCCESS
 */
AcResult
copyMeshToHost(const Device device, const StreamType stream_type,
               const int3& src, const int3& dst, const int num_vertices,
               AcMesh* host_mesh)
{
    const size_t device_offset = AC_VTXBUF_IDX(src.x, src.y, src.z, device->local_config);
    const size_t host_offset   = AC_VTXBUF_IDX(dst.x, dst.y, dst.z, host_mesh->info);
    const size_t bytes         = num_vertices * sizeof(AcReal);

    for (int buffer_idx = 0; buffer_idx < NUM_VTXBUF_HANDLES; ++buffer_idx) {
        const AcReal* from = &device->vba.in[buffer_idx][device_offset];
        AcReal* to         = &host_mesh->vertex_buffer[buffer_idx][host_offset];
        storeWithOffset(device, stream_type, from, bytes, to);
    }
    return AC_SUCCESS;
}
|
||||
|
||||
/**
 * Copies num_vertices vertices of every vertex buffer from the input buffers
 * of one device to the input buffers of another device with
 * cudaMemcpyPeerAsync(). The copies are submitted to a stream of the source
 * device.
 *
 * @param src_device   Source device.
 * @param stream_type  Stream of the source device used for the copies.
 * @param src          3D index of the first vertex in the source buffers.
 * @param dst_device   Destination device.
 * @param dst          3D index of the first vertex in the destination buffers.
 * @param num_vertices Number of vertices to copy per vertex buffer.
 * @return AC_SUCCESS
 */
AcResult
copyMeshDeviceToDevice(const Device src_device, const StreamType stream_type,
                       const int3& src, Device dst_device, const int3& dst,
                       const int num_vertices)
{
    cudaSetDevice(src_device->id);

    const size_t from_offset = AC_VTXBUF_IDX(src.x, src.y, src.z, src_device->local_config);
    const size_t to_offset   = AC_VTXBUF_IDX(dst.x, dst.y, dst.z, dst_device->local_config);
    const size_t bytes       = sizeof(AcReal) * num_vertices;

    for (int buffer_idx = 0; buffer_idx < NUM_VTXBUF_HANDLES; ++buffer_idx) {
        AcReal* to         = &dst_device->vba.in[buffer_idx][to_offset];
        const AcReal* from = &src_device->vba.in[buffer_idx][from_offset];
        ERRCHK_CUDA(cudaMemcpyPeerAsync(to, dst_device->id, from, src_device->id, bytes,
                                        src_device->streams[stream_type]));
    }
    return AC_SUCCESS;
}
|
||||
|
||||
|
||||
/**
 * Exchanges the input and output buffer pointers of every vertex buffer of a
 * device. Only the pointers are swapped, no data is copied.
 *
 * @param device Device whose buffers are swapped.
 * @return AC_SUCCESS
 */
AcResult
swapBuffers(const Device device)
{
    cudaSetDevice(device->id);

    VertexBufferArray* const buffers = &device->vba;
    for (int buffer_idx = 0; buffer_idx < NUM_VTXBUF_HANDLES; ++buffer_idx) {
        AcReal* const previous_in = buffers->in[buffer_idx];
        buffers->in[buffer_idx]   = buffers->out[buffer_idx];
        buffers->out[buffer_idx]  = previous_in;
    }
    return AC_SUCCESS;
}
|
||||
82
src/core/device.cuh
Normal file
82
src/core/device.cuh
Normal file
@@ -0,0 +1,82 @@
|
||||
/*
|
||||
Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
|
||||
|
||||
This file is part of Astaroth.
|
||||
|
||||
Astaroth is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
Astaroth is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file
|
||||
* \brief Brief info.
|
||||
*
|
||||
* Detailed info.
|
||||
*
|
||||
*/
|
||||
#pragma once
|
||||
#include "astaroth.h"
|
||||
|
||||
/** Selects which stream of a device an operation refers to. */
typedef enum {
    STREAM_PRIMARY,   // First stream type (value 0)
    STREAM_SECONDARY, // Second stream type (value 1)
    NUM_STREAM_TYPES, // Number of stream types listed above
    STREAM_ALL        // Selector for every stream, see synchronize()
} StreamType;
|
||||
|
||||
// Opaque pointer to device_s. Analogous to dispatchable handles in Vulkan,
// f.ex. VkDevice
typedef struct device_s* Device;
|
||||
|
||||
/** Prints information about the device. NOTE(review): the output format is
    defined by the implementation, which is not visible from this header. */
AcResult printDeviceInfo(const Device device);

/** Creates a device handle for the device `id` using device_config and stores
    it in *device. NOTE(review): inferred from the signature, confirm against
    the implementation. */
AcResult createDevice(const int id, const AcMeshInfo device_config, Device* device);

/** Destroys a device handle. NOTE(review): presumably one obtained from
    createDevice, confirm against the implementation. */
AcResult destroyDevice(Device device);

/** Runs a boundary condition step on stream_type for the index range given by
    start and end. NOTE(review): whether `end` is inclusive is decided by the
    implementation. */
AcResult boundcondStep(const Device device, const StreamType stream_type,
                       const int3& start, const int3& end);

/** Scalar reduction. NOTE(review): the reduced quantity and where the result
    is stored are not visible from this declaration. */
AcResult reduceScal(const Device device);

/** Vector reduction. NOTE(review): the reduced quantity and where the result
    is stored are not visible from this declaration. */
AcResult reduceVec(const Device device);

/** Runs integration substep step_number with timestep dt on stream_type for
    the index range given by start and end. NOTE(review): inferred from the
    signature, confirm against the implementation. */
AcResult rkStep(const Device device, const StreamType stream_type, const int step_number,
                const int3& start, const int3& end, const AcReal dt);

/** Synchronizes the device with respect to stream_type. If STREAM_ALL is given as
    a StreamType, the function synchronizes all streams on the device. */
AcResult synchronize(const Device device, const StreamType stream_type);

/** Copies num_vertices vertices of every vertex buffer from host_mesh,
    starting at vertex src, to the device input buffers, starting at vertex dst. */
AcResult copyMeshToDevice(const Device device, const StreamType stream_type,
                          const AcMesh& host_mesh, const int3& src, const int3& dst,
                          const int num_vertices);

/** Copies num_vertices vertices of every vertex buffer from the device input
    buffers, starting at vertex src, to host_mesh, starting at vertex dst. */
AcResult copyMeshToHost(const Device device, const StreamType stream_type,
                        const int3& src, const int3& dst, const int num_vertices,
                        AcMesh* host_mesh);

/** Copies num_vertices vertices of every vertex buffer from the input buffers
    of device src, starting at vertex src_idx, to the input buffers of device
    dst, starting at vertex dst_idx. */
AcResult copyMeshDeviceToDevice(const Device src, const StreamType stream_type, const int3& src_idx,
                                Device dst, const int3& dst_idx, const int num_vertices);

/** Swaps the input/output buffers used in computations */
AcResult swapBuffers(const Device device);
|
||||
112
src/core/errchk.h
Normal file
112
src/core/errchk.h
Normal file
@@ -0,0 +1,112 @@
|
||||
/*
|
||||
Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
|
||||
|
||||
This file is part of Astaroth.
|
||||
|
||||
Astaroth is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
Astaroth is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file
|
||||
* \brief Brief info.
|
||||
*
|
||||
* Detailed info.
|
||||
*
|
||||
*/
|
||||
#pragma once
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <time.h>
|
||||
|
||||
// clang-format off
|
||||
/*
|
||||
* =============================================================================
|
||||
* General error checking
|
||||
* =============================================================================
|
||||
*/
|
||||
/*
 * Prints the current time and "Error in file <file> line <line>: <str>" to
 * stderr, flushes stderr and terminates the program with EXIT_FAILURE.
 */
#define ERROR(str)                                                             \
    {                                                                          \
        time_t t;                                                              \
        time(&t);                                                              \
        fprintf(stderr, "%s", ctime(&t));                                      \
        fprintf(stderr, "\tError in file %s line %d: %s\n",                    \
                __FILE__, __LINE__, str);                                      \
        fflush(stderr);                                                        \
        exit(EXIT_FAILURE);                                                    \
        abort();                                                               \
    }
|
||||
|
||||
/*
 * Prints the current time and "Warning in file <file> line <line>: <str>" to
 * stderr and flushes stderr. Execution continues afterwards.
 */
#define WARNING(str)                                                           \
    {                                                                          \
        time_t t;                                                              \
        time(&t);                                                              \
        fprintf(stderr, "%s", ctime(&t));                                      \
        fprintf(stderr, "\tWarning in file %s line %d: %s\n",                  \
                __FILE__, __LINE__, str);                                      \
        fflush(stderr);                                                        \
    }
|
||||
|
||||
/*
 * Condition checks. ERRCHK and ERRCHK_ALWAYS call ERROR, and WARNCHK calls
 * WARNING, when retval evaluates to false; the message is the stringified
 * expression followed by " was false".
 *
 * DO NOT REMOVE BRACKETS AROUND RETVAL. F.ex. if (!a < b) vs if (!(a < b)).
 */
#define ERRCHK(retval)                                                         \
    {                                                                          \
        if (!(retval))                                                         \
            ERROR(#retval " was false");                                       \
    }
#define WARNCHK(retval)                                                        \
    {                                                                          \
        if (!(retval))                                                         \
            WARNING(#retval " was false");                                     \
    }
#define ERRCHK_ALWAYS(retval)                                                  \
    {                                                                          \
        if (!(retval))                                                         \
            ERROR(#retval " was false");                                       \
    }
|
||||
|
||||
/*
|
||||
* =============================================================================
|
||||
* CUDA-specific error checking
|
||||
* =============================================================================
|
||||
*/
|
||||
#ifdef __CUDACC__
|
||||
static inline void
|
||||
cuda_assert(cudaError_t code, const char* file, int line, bool abort = true)
|
||||
{
|
||||
if (code != cudaSuccess) {
|
||||
time_t t; time(&t); \
|
||||
fprintf(stderr, "%s", ctime(&t)); \
|
||||
fprintf(stderr, "\tCUDA error in file %s line %d: %s\n", \
|
||||
file, line, cudaGetErrorString(code)); \
|
||||
fflush(stderr); \
|
||||
|
||||
if (abort)
|
||||
exit(code);
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef NDEBUG
/*
 * NDEBUG builds: ERRCHK and WARNCHK expand to nothing, the CUDA variants
 * evaluate their argument without checking the result, and the kernel check
 * is an empty block.
 * NOTE(review): this section sits inside #ifdef __CUDACC__, so ERRCHK and
 * WARNCHK appear to be disabled only when compiling CUDA sources - confirm
 * that this is intended.
 */
#undef ERRCHK
#undef WARNCHK
#define ERRCHK(params)
#define WARNCHK(params)
#define ERRCHK_CUDA(params) params;
#define WARNCHK_CUDA(params) params;
#define ERRCHK_CUDA_KERNEL() {}
#else
/*
 * Checked builds: the result of the CUDA call is passed to cuda_assert.
 * ERRCHK_CUDA exits on failure, WARNCHK_CUDA only reports it.
 */
#define ERRCHK_CUDA(params) { cuda_assert((params), __FILE__, __LINE__); }
#define WARNCHK_CUDA(params) { cuda_assert((params), __FILE__, __LINE__, false); }

/* Checks the last CUDA error and then the result of cudaDeviceSynchronize(). */
#define ERRCHK_CUDA_KERNEL()                                                   \
    {                                                                          \
        ERRCHK_CUDA(cudaPeekAtLastError());                                    \
        ERRCHK_CUDA(cudaDeviceSynchronize());                                  \
    }
#endif
|
||||
|
||||
#endif
|
||||
|
||||
/*
 * Variants of the CUDA checks that are defined the same way regardless of
 * NDEBUG: the result of the call is always passed to cuda_assert.
 */
#define ERRCHK_CUDA_ALWAYS(params) { cuda_assert((params), __FILE__, __LINE__); }

/* Checks the last CUDA error and then the result of cudaDeviceSynchronize(). */
#define ERRCHK_CUDA_KERNEL_ALWAYS()                                            \
    {                                                                          \
        ERRCHK_CUDA_ALWAYS(cudaPeekAtLastError());                             \
        ERRCHK_CUDA_ALWAYS(cudaDeviceSynchronize());                           \
    }
|
||||
// clang-format on
|
||||
2
src/core/kernels/.gitignore
vendored
Normal file
2
src/core/kernels/.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
# Ignore the generated headers
|
||||
# One pattern per line: gitignore does not split a line on spaces
stencil_process.cuh
stencil_assembly.cuh
|
||||
1363
src/core/kernels/boundconds.cuh
Normal file
1363
src/core/kernels/boundconds.cuh
Normal file
File diff suppressed because it is too large
Load Diff
794
src/core/kernels/kernels.cuh
Normal file
794
src/core/kernels/kernels.cuh
Normal file
@@ -0,0 +1,794 @@
|
||||
/*
|
||||
Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
|
||||
|
||||
This file is part of Astaroth.
|
||||
|
||||
Astaroth is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
Astaroth is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file
|
||||
* \brief Brief info.
|
||||
*
|
||||
* Detailed info.
|
||||
*
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
/**
 * Applies periodic boundary conditions to one vertex buffer.
 *
 * Each thread handles the vertex at start + (global thread index). Threads
 * whose vertex is at or beyond `end` in any dimension, or whose vertex lies
 * inside [AC_n*_min, AC_n*_max) in all three dimensions, do nothing. Every
 * other thread overwrites its vertex with the value of the vertex obtained by
 * wrapping its coordinates modulo (AC_nx, AC_ny, AC_nz) into the range that
 * starts at (AC_nx_min, AC_ny_min, AC_nz_min).
 */
__global__ void
kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vertex_buffer)
{
    const int dst_x = start.x + threadIdx.x + blockIdx.x * blockDim.x;
    const int dst_y = start.y + threadIdx.y + blockIdx.y * blockDim.y;
    const int dst_z = start.z + threadIdx.z + blockIdx.z * blockDim.z;

    // Outside the requested range (this allows threadblock dims that do not
    // divide end - start evenly)
    if (dst_x >= end.x || dst_y >= end.y || dst_z >= end.z)
        return;

    // Boundary conditions are only applied to the ghost zones, i.e. to
    // vertices that fall outside the computational domain in some dimension
    const bool in_ghost_zone = dst_x < DCONST_INT(AC_nx_min) || dst_x >= DCONST_INT(AC_nx_max) ||
                               dst_y < DCONST_INT(AC_ny_min) || dst_y >= DCONST_INT(AC_ny_max) ||
                               dst_z < DCONST_INT(AC_nz_min) || dst_z >= DCONST_INT(AC_nz_max);
    if (!in_ghost_zone)
        return;

    const int x_min = DCONST_INT(AC_nx_min);
    const int y_min = DCONST_INT(AC_ny_min);
    const int z_min = DCONST_INT(AC_nz_min);
    const int nx    = DCONST_INT(AC_nx);
    const int ny    = DCONST_INT(AC_ny);
    const int nz    = DCONST_INT(AC_nz);

    // Map to nx, ny, nz coordinates, translate s.t. the value is always
    // positive, wrap, and map back to mx, my, mz coordinates
    const int src_x = (dst_x - x_min + nx) % nx + x_min;
    const int src_y = (dst_y - y_min + ny) % ny + y_min;
    const int src_z = (dst_z - z_min + nz) % nz + z_min;

    const int read_idx  = DEVICE_VTXBUF_IDX(src_x, src_y, src_z);
    const int write_idx = DEVICE_VTXBUF_IDX(dst_x, dst_y, dst_z);
    vertex_buffer[write_idx] = vertex_buffer[read_idx];
}
|
||||
|
||||
/**
 * Launches kernel_periodic_boundconds on `stream` for the index range
 * [start, end) of vertex_buffer, using 8x2x8 thread blocks and enough blocks
 * per dimension to cover end - start. The launch is followed by
 * ERRCHK_CUDA_KERNEL().
 */
void
periodic_boundconds(const cudaStream_t stream, const int3& start, const int3& end, AcReal* vertex_buffer)
{
    const dim3 threads_per_block(8, 2, 8);

    // Round up so that ranges which are not multiples of the block size are covered
    const unsigned int blocks_x = (unsigned int)ceil((end.x - start.x) / (float)threads_per_block.x);
    const unsigned int blocks_y = (unsigned int)ceil((end.y - start.y) / (float)threads_per_block.y);
    const unsigned int blocks_z = (unsigned int)ceil((end.z - start.z) / (float)threads_per_block.z);
    const dim3 blocks_per_grid(blocks_x, blocks_y, blocks_z);

    kernel_periodic_boundconds<<<blocks_per_grid, threads_per_block, 0, stream>>>(start, end,
                                                                                  vertex_buffer);
    ERRCHK_CUDA_KERNEL();
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
#include <assert.h>
|
||||
|
||||
|
||||
/** Identity overload: the argument already is a linear index. */
static __device__ __forceinline__ int
IDX(const int i)
{
    const int linear = i;
    return linear;
}

/** Linear vertex buffer index of the vertex (i, j, k), see DEVICE_VTXBUF_IDX. */
static __device__ __forceinline__ int
IDX(const int i, const int j, const int k)
{
    const int linear = DEVICE_VTXBUF_IDX(i, j, k);
    return linear;
}

/** Linear vertex buffer index of the vertex (idx.x, idx.y, idx.z). */
static __device__ __forceinline__ int
IDX(const int3 idx)
{
    const int linear = DEVICE_VTXBUF_IDX(idx.x, idx.y, idx.z);
    return linear;
}
|
||||
|
||||
/**
 * Creates the matrix of a counterclockwise rotation by `radians` about the
 * z axis:
 *
 *     | cos -sin  0 |
 *     | sin  cos  0 |
 *     |  0    0   1 |
 *
 * The last row is (0, 0, 1) so that the z component of a rotated vector is
 * preserved. An all-zero last row would make the matrix singular and project
 * every vector onto the xy plane instead of rotating it.
 */
static __forceinline__ AcMatrix
create_rotz(const AcReal radians)
{
    const AcReal c = cos(radians);
    const AcReal s = sin(radians);

    AcMatrix mat;

    mat.row[0] = (AcReal3){c, -s, 0};
    mat.row[1] = (AcReal3){s, c, 0};
    mat.row[2] = (AcReal3){0, 0, 1};

    return mat;
}
|
||||
|
||||
|
||||
// When AC_DOUBLE_PRECISION is 0, every use of sin, cos, exp and rsqrt below
// this point is redirected to the functions named on the right-hand side.
#if AC_DOUBLE_PRECISION == 0
#define sin   __sinf
#define cos   __cosf
#define exp   __expf
#define rsqrt rsqrtf // hardware reciprocal sqrt
#endif // AC_DOUBLE_PRECISION == 0
|
||||
|
||||
|
||||
/*
|
||||
typedef struct {
|
||||
int i, j, k;
|
||||
} int3;*/
|
||||
|
||||
/*
|
||||
* =============================================================================
|
||||
* Level 0 (Input Assembly Stage)
|
||||
* =============================================================================
|
||||
*/
|
||||
|
||||
/*
|
||||
* =============================================================================
|
||||
* Level 0.1 (Read stencil elements and solve derivatives)
|
||||
* =============================================================================
|
||||
*/
|
||||
/**
 * Central finite-difference approximation of the first derivative.
 *
 * pencil holds STENCIL_ORDER + 1 samples with the evaluation point in the
 * middle (index STENCIL_ORDER / 2). inv_ds is the factor the weighted sum is
 * multiplied with (the callers in this file pass AC_inv_ds*).
 */
static __device__ __forceinline__ AcReal
first_derivative(const AcReal* __restrict__ pencil, const AcReal inv_ds)
{
#if STENCIL_ORDER == 2
    const AcReal weights[] = {0, 1.0 / 2.0};
#elif STENCIL_ORDER == 4
    const AcReal weights[] = {0, 2.0 / 3.0, -1.0 / 12.0};
#elif STENCIL_ORDER == 6
    const AcReal weights[] = {0, 3.0 / 4.0, -3.0 / 20.0, 1.0 / 60.0};
#elif STENCIL_ORDER == 8
    const AcReal weights[] = {0, 4.0 / 5.0, -1.0 / 5.0, 4.0 / 105.0,
                              -1.0 / 280.0};
#endif

#define MID (STENCIL_ORDER / 2)
    const AcReal* center = pencil + MID;
    AcReal sum           = 0;

    // Antisymmetric stencil: each weight multiplies the difference of the two
    // samples at the same distance from the center
#pragma unroll
    for (int dist = 1; dist <= MID; ++dist)
        sum += weights[dist] * (center[dist] - center[-dist]);

    return sum * inv_ds;
}
|
||||
|
||||
/**
 * Central finite-difference approximation of the second derivative.
 *
 * pencil holds STENCIL_ORDER + 1 samples with the evaluation point in the
 * middle (index STENCIL_ORDER / 2). The weighted sum is multiplied with
 * inv_ds twice.
 */
static __device__ __forceinline__ AcReal
second_derivative(const AcReal* __restrict__ pencil, const AcReal inv_ds)
{
#if STENCIL_ORDER == 2
    const AcReal weights[] = {-2., 1.};
#elif STENCIL_ORDER == 4
    const AcReal weights[] = {-5.0/2.0, 4.0/3.0, -1.0/12.0};
#elif STENCIL_ORDER == 6
    const AcReal weights[] = {-49.0 / 18.0, 3.0 / 2.0, -3.0 / 20.0,
                              1.0 / 90.0};
#elif STENCIL_ORDER == 8
    const AcReal weights[] = {-205.0 / 72.0, 8.0 / 5.0, -1.0 / 5.0,
                              8.0 / 315.0, -1.0 / 560.0};
#endif

#define MID (STENCIL_ORDER / 2)
    const AcReal* center = pencil + MID;
    AcReal sum           = weights[0] * center[0];

    // Symmetric stencil: each weight multiplies the sum of the two samples at
    // the same distance from the center
#pragma unroll
    for (int dist = 1; dist <= MID; ++dist)
        sum += weights[dist] * (center[dist] + center[-dist]);

    return sum * inv_ds * inv_ds;
}
|
||||
|
||||
/**
 * Finite-difference approximation of a mixed second derivative from two
 * diagonal pencils.
 *
 * pencil_a and pencil_b each hold STENCIL_ORDER + 1 samples with the
 * evaluation point in the middle. In the callers in this file pencil_a runs
 * along the diagonal on which both coordinates increase together and
 * pencil_b along the diagonal on which the second coordinate decreases.
 *
 * inv_ds_a, inv_ds_b: inverted mesh spacings of the two directions, f.ex.
 * 1. / mesh.int_params[AC_dsx]
 *
 * The weight of the sample pair at distance i is one quarter of the
 * corresponding second_derivative weight: the second differences along the
 * two diagonals differ by four times the mixed derivative (for equal
 * spacings), and the center terms cancel. This gives 1/4 for order 2,
 * (1/3, -1/48) for order 4, and the 1/720 and 1/20160 tables for orders 6
 * and 8.
 */
static __device__ __forceinline__ AcReal
cross_derivative(const AcReal* __restrict__ pencil_a,
                 const AcReal* __restrict__ pencil_b, const AcReal inv_ds_a,
                 const AcReal inv_ds_b)
{
#if STENCIL_ORDER == 2
    const AcReal coefficients[] = {0, 1.0 / 4.0};
#elif STENCIL_ORDER == 4
    // (4/3) / 4 and (-1/12) / 4, see the derivation above
    const AcReal coefficients[] = {0, 1.0 / 3.0, -1.0 / 48.0};
#elif STENCIL_ORDER == 6
    const AcReal fac            = (1. / 720.);
    const AcReal coefficients[] = {0.0 * fac, 270.0 * fac, -27.0 * fac,
                                   2.0 * fac};
#elif STENCIL_ORDER == 8
    const AcReal fac            = (1. / 20160.);
    const AcReal coefficients[] = {0.0 * fac, 8064. * fac, -1008. * fac,
                                   128. * fac, -9. * fac};
#endif

#define MID (STENCIL_ORDER / 2)
    AcReal res = AcReal(0.);

#pragma unroll
    for (int i = 1; i <= MID; ++i) {
        res += coefficients[i] * (pencil_a[MID + i] + pencil_a[MID - i] -
                                  pencil_b[MID + i] - pencil_b[MID - i]);
    }
    return res * inv_ds_a * inv_ds_b;
}
|
||||
|
||||
/** First derivative of arr in the x direction at vertexIdx, scaled with AC_inv_dsx. */
static __device__ __forceinline__ AcReal
derx(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
    const int half = STENCIL_ORDER / 2;

    // Gather the STENCIL_ORDER + 1 samples centered at vertexIdx along x
    AcReal samples[STENCIL_ORDER + 1];
#pragma unroll
    for (int n = 0; n <= STENCIL_ORDER; ++n)
        samples[n] = arr[IDX(vertexIdx.x + n - half, vertexIdx.y, vertexIdx.z)];

    return first_derivative(samples, DCONST_REAL(AC_inv_dsx));
}
|
||||
|
||||
/** Second derivative of arr in the x direction at vertexIdx, scaled with AC_inv_dsx. */
static __device__ __forceinline__ AcReal
derxx(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
    const int half = STENCIL_ORDER / 2;

    // Gather the STENCIL_ORDER + 1 samples centered at vertexIdx along x
    AcReal samples[STENCIL_ORDER + 1];
#pragma unroll
    for (int n = 0; n <= STENCIL_ORDER; ++n)
        samples[n] = arr[IDX(vertexIdx.x + n - half, vertexIdx.y, vertexIdx.z)];

    return second_derivative(samples, DCONST_REAL(AC_inv_dsx));
}
|
||||
|
||||
/**
 * Mixed xy derivative of arr at vertexIdx, computed with cross_derivative
 * from the two diagonals of the xy plane through vertexIdx.
 */
static __device__ __forceinline__ AcReal
derxy(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
    const int half = STENCIL_ORDER / 2;

    AcReal diagonal[STENCIL_ORDER + 1];      // x and y increase together
    AcReal anti_diagonal[STENCIL_ORDER + 1]; // x increases, y decreases
#pragma unroll
    for (int n = 0; n <= STENCIL_ORDER; ++n) {
        diagonal[n] = arr[IDX(vertexIdx.x + n - half,
                              vertexIdx.y + n - half, vertexIdx.z)];
        anti_diagonal[n] = arr[IDX(vertexIdx.x + n - half,
                                   vertexIdx.y + half - n, vertexIdx.z)];
    }

    return cross_derivative(diagonal, anti_diagonal, DCONST_REAL(AC_inv_dsx),
                            DCONST_REAL(AC_inv_dsy));
}
|
||||
|
||||
/**
 * Mixed xz derivative of arr at vertexIdx, computed with cross_derivative
 * from the two diagonals of the xz plane through vertexIdx.
 */
static __device__ __forceinline__ AcReal
derxz(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
    const int half = STENCIL_ORDER / 2;

    AcReal diagonal[STENCIL_ORDER + 1];      // x and z increase together
    AcReal anti_diagonal[STENCIL_ORDER + 1]; // x increases, z decreases
#pragma unroll
    for (int n = 0; n <= STENCIL_ORDER; ++n) {
        diagonal[n] = arr[IDX(vertexIdx.x + n - half, vertexIdx.y,
                              vertexIdx.z + n - half)];
        anti_diagonal[n] = arr[IDX(vertexIdx.x + n - half, vertexIdx.y,
                                   vertexIdx.z + half - n)];
    }

    return cross_derivative(diagonal, anti_diagonal, DCONST_REAL(AC_inv_dsx),
                            DCONST_REAL(AC_inv_dsz));
}
|
||||
|
||||
/** First derivative of arr in the y direction at vertexIdx, scaled with AC_inv_dsy. */
static __device__ __forceinline__ AcReal
dery(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
    const int half = STENCIL_ORDER / 2;

    // Gather the STENCIL_ORDER + 1 samples centered at vertexIdx along y
    AcReal samples[STENCIL_ORDER + 1];
#pragma unroll
    for (int n = 0; n <= STENCIL_ORDER; ++n)
        samples[n] = arr[IDX(vertexIdx.x, vertexIdx.y + n - half, vertexIdx.z)];

    return first_derivative(samples, DCONST_REAL(AC_inv_dsy));
}
|
||||
|
||||
/** Second derivative of arr in the y direction at vertexIdx, scaled with AC_inv_dsy. */
static __device__ __forceinline__ AcReal
deryy(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
    const int half = STENCIL_ORDER / 2;

    // Gather the STENCIL_ORDER + 1 samples centered at vertexIdx along y
    AcReal samples[STENCIL_ORDER + 1];
#pragma unroll
    for (int n = 0; n <= STENCIL_ORDER; ++n)
        samples[n] = arr[IDX(vertexIdx.x, vertexIdx.y + n - half, vertexIdx.z)];

    return second_derivative(samples, DCONST_REAL(AC_inv_dsy));
}
|
||||
|
||||
/**
 * Mixed yz derivative of arr at vertexIdx, computed with cross_derivative
 * from the two diagonals of the yz plane through vertexIdx.
 */
static __device__ __forceinline__ AcReal
deryz(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
    const int half = STENCIL_ORDER / 2;

    AcReal diagonal[STENCIL_ORDER + 1];      // y and z increase together
    AcReal anti_diagonal[STENCIL_ORDER + 1]; // y increases, z decreases
#pragma unroll
    for (int n = 0; n <= STENCIL_ORDER; ++n) {
        diagonal[n] = arr[IDX(vertexIdx.x, vertexIdx.y + n - half,
                              vertexIdx.z + n - half)];
        anti_diagonal[n] = arr[IDX(vertexIdx.x, vertexIdx.y + n - half,
                                   vertexIdx.z + half - n)];
    }

    return cross_derivative(diagonal, anti_diagonal, DCONST_REAL(AC_inv_dsy),
                            DCONST_REAL(AC_inv_dsz));
}
|
||||
|
||||
/** First derivative of arr in the z direction at vertexIdx, scaled with AC_inv_dsz. */
static __device__ __forceinline__ AcReal
derz(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
    const int half = STENCIL_ORDER / 2;

    // Gather the STENCIL_ORDER + 1 samples centered at vertexIdx along z
    AcReal samples[STENCIL_ORDER + 1];
#pragma unroll
    for (int n = 0; n <= STENCIL_ORDER; ++n)
        samples[n] = arr[IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z + n - half)];

    return first_derivative(samples, DCONST_REAL(AC_inv_dsz));
}
|
||||
|
||||
/** Second derivative of arr in the z direction at vertexIdx, scaled with AC_inv_dsz. */
static __device__ __forceinline__ AcReal
derzz(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
    const int half = STENCIL_ORDER / 2;

    // Gather the STENCIL_ORDER + 1 samples centered at vertexIdx along z
    AcReal samples[STENCIL_ORDER + 1];
#pragma unroll
    for (int n = 0; n <= STENCIL_ORDER; ++n)
        samples[n] = arr[IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z + n - half)];

    return second_derivative(samples, DCONST_REAL(AC_inv_dsz));
}
|
||||
|
||||
/*
|
||||
* =============================================================================
|
||||
* Level 0.2 (Caching functions)
|
||||
* =============================================================================
|
||||
*/
|
||||
|
||||
#include "stencil_assembly.cuh"
|
||||
|
||||
/*
|
||||
typedef struct {
|
||||
AcRealData x;
|
||||
AcRealData y;
|
||||
AcRealData z;
|
||||
} AcReal3Data;
|
||||
|
||||
static __device__ __forceinline__ AcReal3Data
|
||||
read_data(const int i, const int j, const int k,
|
||||
AcReal* __restrict__ buf[], const int3& handle)
|
||||
{
|
||||
AcReal3Data data;
|
||||
|
||||
data.x = read_data(i, j, k, buf, handle.x);
|
||||
data.y = read_data(i, j, k, buf, handle.y);
|
||||
data.z = read_data(i, j, k, buf, handle.z);
|
||||
|
||||
return data;
|
||||
}
|
||||
*/
|
||||
|
||||
/*
|
||||
* =============================================================================
|
||||
* Level 0.3 (Built-in functions available during the Stencil Processing Stage)
|
||||
* =============================================================================
|
||||
*/
|
||||
|
||||
/** Componentwise difference a - b. */
static __host__ __device__ __forceinline__ AcReal3
operator-(const AcReal3& a, const AcReal3& b)
{
    AcReal3 diff;

    diff.x = a.x - b.x;
    diff.y = a.y - b.y;
    diff.z = a.z - b.z;

    return diff;
}
|
||||
|
||||
/** Componentwise sum a + b. */
static __host__ __device__ __forceinline__ AcReal3
operator+(const AcReal3& a, const AcReal3& b)
{
    AcReal3 sum;

    sum.x = a.x + b.x;
    sum.y = a.y + b.y;
    sum.z = a.z + b.z;

    return sum;
}
|
||||
|
||||
/** Componentwise negation of a. */
static __host__ __device__ __forceinline__ AcReal3
operator-(const AcReal3& a)
{
    AcReal3 negated;

    negated.x = -a.x;
    negated.y = -a.y;
    negated.z = -a.z;

    return negated;
}
|
||||
|
||||
/** Scales every component of b with the scalar a. */
static __host__ __device__ __forceinline__ AcReal3
operator*(const AcReal a, const AcReal3& b)
{
    AcReal3 scaled;

    scaled.x = a * b.x;
    scaled.y = a * b.y;
    scaled.z = a * b.z;

    return scaled;
}
|
||||
|
||||
/** Dot product of a and b. */
static __host__ __device__ __forceinline__ AcReal
dot(const AcReal3& a, const AcReal3& b)
{
    const AcReal product = a.x * b.x + a.y * b.y + a.z * b.z;
    return product;
}
|
||||
|
||||
/** Matrix-vector product aa * x: component i is the dot product of row i of aa with x. */
static __host__ __device__ __forceinline__ AcReal3
mul(const AcMatrix& aa, const AcReal3& x)
{
    AcReal3 product;

    product.x = dot(aa.row[0], x);
    product.y = dot(aa.row[1], x);
    product.z = dot(aa.row[2], x);

    return product;
}
|
||||
|
||||
/** Cross product a x b. */
static __host__ __device__ __forceinline__ AcReal3
cross(const AcReal3& a, const AcReal3& b)
{
    return (AcReal3){a.y * b.z - a.z * b.y,
                     a.z * b.x - a.x * b.z,
                     a.x * b.y - a.y * b.x};
}
|
||||
|
||||
/** Returns true if a is neither NaN nor infinite. */
static __host__ __device__ __forceinline__ bool
is_valid(const AcReal a)
{
    const bool invalid = isnan(a) || isinf(a);
    return !invalid;
}
|
||||
|
||||
/** Returns true if every component of a is neither NaN nor infinite. */
static __host__ __device__ __forceinline__ bool
is_valid(const AcReal3& a)
{
    if (!is_valid(a.x))
        return false;
    if (!is_valid(a.y))
        return false;
    return is_valid(a.z);
}
|
||||
|
||||
|
||||
/*
|
||||
* =============================================================================
|
||||
* Level 1 (Stencil Processing Stage)
|
||||
* =============================================================================
|
||||
*/
|
||||
|
||||
/*
|
||||
* =============================================================================
|
||||
* Level 1.1 (Terms)
|
||||
* =============================================================================
|
||||
*/
|
||||
/** Laplacian of data: the sum of the diagonal entries of hessian(data). */
static __device__ __forceinline__ AcReal
laplace(const AcRealData& data)
{
    const AcReal dxx = hessian(data).row[0].x;
    const AcReal dyy = hessian(data).row[1].y;
    const AcReal dzz = hessian(data).row[2].z;

    return dxx + dyy + dzz;
}
|
||||
|
||||
/** Divergence of vec: d(vec.x)/dx + d(vec.y)/dy + d(vec.z)/dz, taken from gradient(). */
static __device__ __forceinline__ AcReal
divergence(const AcReal3Data& vec)
{
    const AcReal dvx_dx = gradient(vec.x).x;
    const AcReal dvy_dy = gradient(vec.y).y;
    const AcReal dvz_dz = gradient(vec.z).z;

    return dvx_dx + dvy_dy + dvz_dz;
}
|
||||
|
||||
/** Componentwise Laplacian of vec. */
static __device__ __forceinline__ AcReal3
laplace_vec(const AcReal3Data& vec)
{
    AcReal3 result;

    result.x = laplace(vec.x);
    result.y = laplace(vec.y);
    result.z = laplace(vec.z);

    return result;
}
|
||||
|
||||
/** Curl of vec, assembled from the gradients of its components. */
static __device__ __forceinline__ AcReal3
curl(const AcReal3Data& vec)
{
    AcReal3 result;

    result.x = gradient(vec.z).y - gradient(vec.y).z;
    result.y = gradient(vec.x).z - gradient(vec.z).x;
    result.z = gradient(vec.y).x - gradient(vec.x).y;

    return result;
}
|
||||
|
||||
/**
 * Gradient of the divergence of vec. Component i is the sum of entry (i, j)
 * of the hessian of vec's j-th component over j = x, y, z.
 */
static __device__ __forceinline__ AcReal3
gradient_of_divergence(const AcReal3Data& vec)
{
    AcReal3 result;

    result.x = hessian(vec.x).row[0].x + hessian(vec.y).row[0].y + hessian(vec.z).row[0].z;
    result.y = hessian(vec.x).row[1].x + hessian(vec.y).row[1].y + hessian(vec.z).row[1].z;
    result.z = hessian(vec.x).row[2].x + hessian(vec.y).row[2].y + hessian(vec.z).row[2].z;

    return result;
}
|
||||
|
||||
// Takes uu gradients and returns S
/**
 * Builds the symmetric matrix S from the gradients of vec's components:
 * diagonal entry i is 2/3 of the i-th diagonal gradient minus 1/3 of the sum
 * of the other two, and off-diagonal entry (i, j) is the mean of the two
 * corresponding cross gradients.
 */
static __device__ __forceinline__ AcMatrix
stress_tensor(const AcReal3Data& vec)
{
    // Derivative of each component of vec in each direction
    const AcReal dx_x = gradient(vec.x).x;
    const AcReal dx_y = gradient(vec.x).y;
    const AcReal dx_z = gradient(vec.x).z;
    const AcReal dy_x = gradient(vec.y).x;
    const AcReal dy_y = gradient(vec.y).y;
    const AcReal dy_z = gradient(vec.y).z;
    const AcReal dz_x = gradient(vec.z).x;
    const AcReal dz_y = gradient(vec.z).y;
    const AcReal dz_z = gradient(vec.z).z;

    AcMatrix S;

    // Diagonal
    S.row[0].x = AcReal(2. / 3.) * dx_x - AcReal(1. / 3.) * (dy_y + dz_z);
    S.row[1].y = AcReal(2. / 3.) * dy_y - AcReal(1. / 3.) * (dx_x + dz_z);
    S.row[2].z = AcReal(2. / 3.) * dz_z - AcReal(1. / 3.) * (dx_x + dy_y);

    // Upper triangle
    S.row[0].y = AcReal(1. / 2.) * (dx_y + dy_x);
    S.row[0].z = AcReal(1. / 2.) * (dx_z + dz_x);
    S.row[1].z = AcReal(1. / 2.) * (dy_z + dz_y);

    // Lower triangle mirrors the upper one
    S.row[1].x = S.row[0].y;
    S.row[2].x = S.row[0].z;
    S.row[2].y = S.row[1].z;

    return S;
}
|
||||
|
||||
/** Sum of the squares of all entries of mat (each row dotted with itself). */
static __device__ __forceinline__ AcReal
contract(const AcMatrix& mat)
{
    AcReal sum = 0;

    sum += dot(mat.row[0], mat.row[0]);
    sum += dot(mat.row[1], mat.row[1]);
    sum += dot(mat.row[2], mat.row[2]);

    return sum;
}
|
||||
|
||||
/*
|
||||
* =============================================================================
|
||||
* Level 1.2 (Equations)
|
||||
* =============================================================================
|
||||
*/
|
||||
/** Euclidean length of vec. */
static __device__ __forceinline__ AcReal
length(const AcReal3& vec)
{
    const AcReal squared = vec.x * vec.x + vec.y * vec.y + vec.z * vec.z;
    return sqrt(squared);
}
|
||||
|
||||
/** Reciprocal of the Euclidean length of vec, computed with rsqrt. */
static __device__ __forceinline__ AcReal
reciprocal_len(const AcReal3& vec)
{
    const AcReal squared = vec.x * vec.x + vec.y * vec.y + vec.z * vec.z;
    return rsqrt(squared);
}
|
||||
|
||||
/** Returns vec scaled with the reciprocal of its length. */
static __device__ __forceinline__ AcReal3
normalized(const AcReal3& vec)
{
    return reciprocal_len(vec) * vec;
}
|
||||
|
||||
// Sinusoidal forcing
|
||||
// https://arxiv.org/pdf/1704.04676.pdf
|
||||
__constant__ AcReal3 forcing_vec;
|
||||
__constant__ AcReal forcing_phi;
|
||||
/**
 * Forcing term at the vertex (i, j, k).
 *
 * k_vec is the position of the vertex relative to the center of the domain
 * (the offset from AC_n*_min times the mesh spacing, minus half the domain
 * size). The result is forcing_vec scaled with
 * inv_len^2 * (cos(k_vec . forcing_vec) * cos(forcing_phi) -
 *              sin(k_vec . forcing_vec) * sin(forcing_phi)),
 * where inv_len is the reciprocal length of k_vec, replaced with 0 if it is
 * NaN or infinite and limited to at most 2.
 */
static __device__ __forceinline__ AcReal3
forcing(const int i, const int j, const int k)
{
#define DOMAIN_SIZE_X (DCONST_INT(AC_nx) * DCONST_REAL(AC_dsx))
#define DOMAIN_SIZE_Y (DCONST_INT(AC_ny) * DCONST_REAL(AC_dsy))
#define DOMAIN_SIZE_Z (DCONST_INT(AC_nz) * DCONST_REAL(AC_dsz))
    AcReal3 k_vec;
    k_vec.x = (i - DCONST_INT(AC_nx_min)) * DCONST_REAL(AC_dsx) - AcReal(.5) * DOMAIN_SIZE_X;
    k_vec.y = (j - DCONST_INT(AC_ny_min)) * DCONST_REAL(AC_dsy) - AcReal(.5) * DOMAIN_SIZE_Y;
    k_vec.z = (k - DCONST_INT(AC_nz_min)) * DCONST_REAL(AC_dsz) - AcReal(.5) * DOMAIN_SIZE_Z;

    AcReal inv_len = reciprocal_len(k_vec);
    if (isnan(inv_len) || isinf(inv_len))
        inv_len = 0;
    else if (inv_len > 2) // hack to make it cool
        inv_len = 2;

    const AcReal phase = dot(k_vec, forcing_vec);
    const AcReal waves = cos(phase)*cos(forcing_phi) - sin(phase) * sin(forcing_phi);

    const AcReal amplitude = inv_len * inv_len * waves;
    return amplitude * forcing_vec;
}
|
||||
|
||||
|
||||
// Note: LNT0 and LNRHO0 must be set very carefully: if their magnitude differs from that of the other values in the mesh, we will inherently lose precision
|
||||
#define LNT0 (AcReal(0.0))
|
||||
#define LNRHO0 (AcReal(0.0))
|
||||
|
||||
#define H_CONST (AcReal(0.0))
|
||||
#define C_CONST (AcReal(0.0))
|
||||
|
||||
|
||||
|
||||
/**
 * One substep of the three-stage Runge-Kutta scheme with the coefficients of
 * Williamson (1980).
 *
 * step_number selects the substep (0, 1 or 2). Substep 0 returns
 * state_current + beta * rate_of_change * dt. Substeps 1 and 2 additionally
 * include the term alpha / beta_previous * (state_current - state_previous).
 * Any other step_number yields NAN.
 */
template <int step_number>
static __device__ __forceinline__ AcReal
rk3_integrate(const AcReal state_previous, const AcReal state_current,
              const AcReal rate_of_change, const AcReal dt)
{
    // Williamson (1980)
    const AcReal alpha[] = {0, AcReal(.0), AcReal(-5. / 9.), AcReal(-153. / 128.)};
    const AcReal beta[]  = {0, AcReal(1. / 3.), AcReal(15. / 16.),
                            AcReal(8. / 15.)};

    // Note the indexing: +1 to avoid an unnecessary warning about "out-of-bounds"
    // access (when accessing beta[step_number-1] even when step_number >= 1)
    if (step_number == 0)
        return state_current + beta[step_number + 1] * rate_of_change * dt;

    if (step_number == 1 || step_number == 2) {
        return state_current +
               beta[step_number + 1] *
                   (alpha[step_number + 1] * (AcReal(1.) / beta[step_number]) *
                        (state_current - state_previous) +
                    rate_of_change * dt);
    }

    return NAN;
}
|
||||
/*
|
||||
template <int step_number>
|
||||
static __device__ __forceinline__ AcReal
|
||||
rk3_integrate_scal(const AcReal state_previous, const AcReal state_current,
|
||||
const AcReal rate_of_change, const AcReal dt)
|
||||
{
|
||||
// Williamson (1980)
|
||||
const AcReal alpha[] = {AcReal(.0), AcReal(-5. / 9.), AcReal(-153. / 128.)};
|
||||
const AcReal beta[] = {AcReal(1. / 3.), AcReal(15. / 16.),
|
||||
AcReal(8. / 15.)};
|
||||
|
||||
|
||||
switch (step_number) {
|
||||
case 0:
|
||||
return state_current + beta[step_number] * rate_of_change * dt;
|
||||
case 1: // Fallthrough
|
||||
case 2:
|
||||
return state_current +
|
||||
beta[step_number] *
|
||||
(alpha[step_number] * (AcReal(1.) / beta[step_number - 1]) *
|
||||
(state_current - state_previous) +
|
||||
rate_of_change * dt);
|
||||
default:
|
||||
return NAN;
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
/** Applies the scalar rk3_integrate<step_number> to each component. */
template <int step_number>
static __device__ __forceinline__ AcReal3
rk3_integrate(const AcReal3 state_previous, const AcReal3 state_current,
              const AcReal3 rate_of_change, const AcReal dt)
{
    AcReal3 next;

    next.x = rk3_integrate<step_number>(state_previous.x, state_current.x, rate_of_change.x, dt);
    next.y = rk3_integrate<step_number>(state_previous.y, state_current.y, rate_of_change.y, dt);
    next.z = rk3_integrate<step_number>(state_previous.z, state_current.z, rate_of_change.z, dt);

    return next;
}
|
||||
|
||||
// Shorthand for rk3_integrate that wraps state_current in value() and takes
// step_number from the scope in which the macro is used
#define rk3(state_previous, state_current, rate_of_change, dt)                 \
    rk3_integrate<step_number>(state_previous, value(state_current), rate_of_change, dt)
|
||||
|
||||
/*
|
||||
template <int step_number>
|
||||
static __device__ __forceinline__ AcReal
|
||||
rk3_integrate(const int idx, const AcReal out, const int handle,
|
||||
const AcRealData& in_cached, const AcReal rate_of_change, const AcReal dt)
|
||||
{
|
||||
return rk3_integrate_scal<step_number>(out, value(in_cached), rate_of_change, dt);
|
||||
}
|
||||
|
||||
template <int step_number>
|
||||
static __device__ __forceinline__ AcReal3
|
||||
rk3_integrate(const int idx, const AcReal3 out, const int3& handle,
|
||||
const AcReal3Data& in_cached, const AcReal3& rate_of_change, const AcReal dt)
|
||||
{
|
||||
return (AcReal3) {
|
||||
rk3_integrate<step_number>(idx, out, handle.x, in_cached.x, rate_of_change.x, dt),
|
||||
rk3_integrate<step_number>(idx, out, handle.y, in_cached.y, rate_of_change.y, dt),
|
||||
rk3_integrate<step_number>(idx, out, handle.z, in_cached.z, rate_of_change.z, dt)
|
||||
};
|
||||
}
|
||||
|
||||
#define RK3(handle, in_cached, rate_of_change, dt) \
|
||||
rk3_integrate<step_number>(idx, buffer.out, handle, in_cached, rate_of_change, dt)
|
||||
*/
|
||||
|
||||
/*
|
||||
* =============================================================================
|
||||
* Level 1.3 (Kernels)
|
||||
* =============================================================================
|
||||
*/
|
||||
|
||||
/**
 * Stores one scalar into an output buffer.
 *
 * out:    array of output buffers, indexed by handle
 * handle: index of the buffer to write into
 * idx:    element index inside that buffer
 * value:  value to store
 */
static __device__ void
write(AcReal* __restrict__ out[], const int handle, const int idx, const AcReal value)
{
    AcReal* const target = out[handle];
    target[idx]          = value;
}
|
||||
|
||||
/**
 * Stores a three-component value into three output buffers.
 *
 * out:   array of output buffers, indexed by handle
 * vec:   handles of the buffers receiving the x, y and z components
 * idx:   element index, the same in all three buffers
 * value: components to store
 */
static __device__ void
write(AcReal* __restrict__ out[], const int3 vec, const int idx, const AcReal3 value)
{
    out[vec.x][idx] = value.x;
    out[vec.y][idx] = value.y;
    out[vec.z][idx] = value.z;
}
|
||||
|
||||
/**
 * Loads one scalar from a buffer array.
 *
 * idx:    element index inside the selected buffer
 * field:  array of buffers, indexed by handle
 * handle: index of the buffer to read from
 */
static __device__ AcReal
read_out(const int idx, AcReal* __restrict__ field[], const int handle)
{
    const AcReal* const source = field[handle];
    return source[idx];
}
|
||||
|
||||
/**
 * Loads a three-component value from three buffers.
 *
 * idx:    element index, the same in all three buffers
 * field:  array of buffers, indexed by handle
 * handle: handles of the buffers holding the x, y and z components
 */
static __device__ AcReal3
read_out(const int idx, AcReal* __restrict__ field[], const int3 handle)
{
    AcReal3 loaded;

    loaded.x = field[handle.x][idx];
    loaded.y = field[handle.y][idx];
    loaded.z = field[handle.z][idx];

    return loaded;
}
|
||||
|
||||
// Accessors for use inside kernel bodies. They expect `buffer`, `idx` and
// `vertexIdx` to be in scope at the point of expansion.
#define WRITE_OUT(field_handle, new_value) (write(buffer.out, field_handle, idx, new_value))
#define READ(field_handle) (read_data(vertexIdx, buffer.in, field_handle))
#define READ_OUT(field_handle) (read_out(idx, buffer.out, field_handle))
|
||||
|
||||
// TODO: also define a WRITE macro here for clarity (for this file, not for the DSL),
// e.g. #define WRITE(HANDLE) (write(idx, ...)), so that we don't have to insert
// boilerplate in the middle of the function
|
||||
|
||||
// Leading parameters of a kernel signature: the index range [start, end)
// handled by the launch and the vertex buffers it operates on.
#define GEN_KERNEL_PARAM_BOILERPLATE const int3 start, const int3 end, VertexBufferArray buffer
|
||||
|
||||
// Kernel prologue. Defines `vertexIdx` (this thread's position, offset by
// `start`) and `idx` (the linear index from IDX). Threads whose position is
// not strictly below `end` in every dimension return immediately. The asserts
// check the position against the AC_n*_min / AC_n*_max device constants.
#define GEN_KERNEL_BUILTIN_VARIABLES_BOILERPLATE()                                             \
    const int3 vertexIdx = (int3){threadIdx.x + blockIdx.x * blockDim.x + start.x,             \
                                  threadIdx.y + blockIdx.y * blockDim.y + start.y,             \
                                  threadIdx.z + blockIdx.z * blockDim.z + start.z};            \
    if (!(vertexIdx.x < end.x && vertexIdx.y < end.y && vertexIdx.z < end.z))                  \
        return;                                                                                \
                                                                                               \
    assert(vertexIdx.x < DCONST_INT(AC_nx_max) && vertexIdx.y < DCONST_INT(AC_ny_max) &&       \
           vertexIdx.z < DCONST_INT(AC_nz_max));                                               \
                                                                                               \
    assert(vertexIdx.x >= DCONST_INT(AC_nx_min) && vertexIdx.y >= DCONST_INT(AC_ny_min) &&     \
           vertexIdx.z >= DCONST_INT(AC_nz_min));                                              \
                                                                                               \
    const int idx = IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z);
|
||||
|
||||
#include "stencil_process.cuh"
|
||||
|
||||
/*
|
||||
* =============================================================================
|
||||
* Level 2 (Host calls)
|
||||
* =============================================================================
|
||||
*/
|
||||
|
||||
/**
 * Returns rand() divided by RAND_MAX, both converted to AcReal first,
 * i.e. a pseudo-random value nominally in the range [0, 1].
 */
static AcReal
randf(void)
{
    const AcReal sample = AcReal(rand());
    const AcReal scale  = AcReal(RAND_MAX);

    return sample / scale;
}
|
||||
|
||||
/**
 * Enqueues one RK3 substep on `stream`.
 *
 * stream:      CUDA stream that receives the kernel launch (and, with
 *              LFORCING, the forcing-parameter uploads)
 * step_number: substep selector; 0 and 1 launch solve<0> and solve<1>,
 *              every other value launches solve<2>
 * start, end:  index range [start, end) covered by the launch
 * dt:          timestep forwarded to the kernel
 * buffer:      vertex buffers, passed to the kernel by value
 *
 * Always returns AC_SUCCESS; launch errors are handled by ERRCHK_CUDA_KERNEL().
 */
AcResult
rk3_step_async(const cudaStream_t stream, const int& step_number, const int3& start, const int3& end,
               const AcReal dt, VertexBufferArray* buffer)
{
    const dim3 tpb(32, 1, 4);

#if LFORCING
    // The forcing vector is kept between calls (static) and rotated about z
    // by a small random angle on every call before being uploaded.
    const AcReal amplitude           = AcReal(.2);
    static AcReal3 forcing_direction = amplitude * (AcReal3){1, 0, 0};
    const AcReal radians             = randf() * AcReal(2*M_PI) / 360 / 8;
    forcing_direction                = mul(create_rotz(radians), forcing_direction);
    cudaMemcpyToSymbolAsync(forcing_vec, &forcing_direction, sizeof(forcing_direction), 0,
                            cudaMemcpyHostToDevice, stream);

    // Fixed phase; a random phase AcReal(2 * M_PI) * randf() is currently disabled
    const AcReal phase = AcReal(M_PI);
    cudaMemcpyToSymbolAsync(forcing_phi, &phase, sizeof(phase), 0, cudaMemcpyHostToDevice, stream);
#endif // LFORCING

    // Enough blocks to cover the requested range in every dimension
    const dim3 bpg((unsigned int)ceil((end.x - start.x) / AcReal(tpb.x)),
                   (unsigned int)ceil((end.y - start.y) / AcReal(tpb.y)),
                   (unsigned int)ceil((end.z - start.z) / AcReal(tpb.z)));

    switch (step_number) {
    case 0:
        solve<0><<<bpg, tpb, 0, stream>>>(start, end, *buffer, dt);
        break;
    case 1:
        solve<1><<<bpg, tpb, 0, stream>>>(start, end, *buffer, dt);
        break;
    default:
        solve<2><<<bpg, tpb, 0, stream>>>(start, end, *buffer, dt);
        break;
    }

    ERRCHK_CUDA_KERNEL();
    return AC_SUCCESS;
}
|
||||
338
src/core/kernels/reduce.cuh
Normal file
338
src/core/kernels/reduce.cuh
Normal file
@@ -0,0 +1,338 @@
|
||||
/*
|
||||
Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
|
||||
|
||||
This file is part of Astaroth.
|
||||
|
||||
Astaroth is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
Astaroth is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file
|
||||
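 * \brief Reduction kernels and their host-side drivers.
 *
 * Provides max, min and mean-of-squares reductions over scalar and
 * three-component vertex buffers.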
|
||||
*
|
||||
*/
|
||||
#pragma once
|
||||
#include "device_globals.cuh"
|
||||
|
||||
#include "src/core/errchk.h"
|
||||
#include "src/core/math_utils.h"
|
||||
|
||||
// Function pointer definitions
|
||||
// Combines two partial results into one
using ReduceFunc = AcReal (*)(const AcReal&, const AcReal&);
// Maps one scalar field value to the value that enters the reduction
using ReduceInitialScalFunc = AcReal (*)(const AcReal&);
// Maps the three components of a vector field to the value that enters the reduction
using ReduceInitialVecFunc = AcReal (*)(const AcReal&, const AcReal&, const AcReal&);
|
||||
|
||||
// clang-format off
|
||||
/* Comparison funcs */
|
||||
// Returns a if a > b, otherwise b
__device__ inline AcReal
_device_max(const AcReal& a, const AcReal& b)
{
    if (a > b)
        return a;
    return b;
}
|
||||
|
||||
// Returns a if a < b, otherwise b
__device__ inline AcReal
_device_min(const AcReal& a, const AcReal& b)
{
    if (a < b)
        return a;
    return b;
}
|
||||
|
||||
// Returns a + b
__device__ inline AcReal
_device_sum(const AcReal& a, const AcReal& b)
{
    const AcReal total = a + b;
    return total;
}
|
||||
|
||||
/* Function used to determine the values used during reduction */
|
||||
// Initial value for scalar max/min reductions: the field value itself
// (returned unchanged, no absolute value is taken)
__device__ inline AcReal
_device_length_scal(const AcReal& a)
{
    const AcReal unchanged = AcReal(a);
    return unchanged;
}
|
||||
|
||||
// Returns a squared
__device__ inline AcReal
_device_squared_scal(const AcReal& a)
{
    const AcReal squared = (AcReal)(a * a);
    return squared;
}
|
||||
|
||||
// Returns exp(a) squared
__device__ inline AcReal
_device_exp_squared_scal(const AcReal& a)
{
    const AcReal exp_a = exp(a);
    return exp_a * exp_a;
}
|
||||
|
||||
// Returns the Euclidean length of the vector (a, b, c)
__device__ inline AcReal
_device_length_vec(const AcReal& a, const AcReal& b, const AcReal& c)
{
    const AcReal squared_length = a*a + b*b + c*c;
    return sqrt(squared_length);
}
|
||||
|
||||
// Returns the sum of the squares of a, b and c
__device__ inline AcReal
_device_squared_vec(const AcReal& a, const AcReal& b, const AcReal& c)
{
    const AcReal aa = _device_squared_scal(a);
    const AcReal bb = _device_squared_scal(b);
    const AcReal cc = _device_squared_scal(c);

    return aa + bb + cc;
}
|
||||
|
||||
// Returns exp(a)^2 + exp(b)^2 + exp(c)^2
__device__ inline AcReal
_device_exp_squared_vec(const AcReal& a, const AcReal& b, const AcReal& c)
{
    const AcReal ea = _device_exp_squared_scal(a);
    const AcReal eb = _device_exp_squared_scal(b);
    const AcReal ec = _device_exp_squared_scal(c);

    return ea + eb + ec;
}
|
||||
// clang-format on
|
||||
|
||||
/**
 * Returns true when (i, j, k) lies at or beyond the AC_nx / AC_ny / AC_nz
 * extents stored in d_mesh_info, i.e. outside the index range the reduction
 * kernels work on. Only the upper bounds are checked.
 */
__device__ inline bool
oob(const int& i, const int& j, const int& k)
{
    const bool x_outside = i >= d_mesh_info.int_params[AC_nx];
    const bool y_outside = j >= d_mesh_info.int_params[AC_ny];
    const bool z_outside = k >= d_mesh_info.int_params[AC_nz];

    return x_outside || y_outside || z_outside;
}
|
||||
|
||||
/**
 * First reduction stage for a scalar field: applies reduce_initial to every
 * in-range element of `src` and stores the result in `dst`.
 *
 * The source index is offset by AC_n{x,y,z}_min and linearised with
 * DEVICE_VTXBUF_IDX; the destination index comes from
 * DEVICE_1D_COMPDOMAIN_IDX. Threads for which oob() is true do nothing.
 */
template <ReduceInitialScalFunc reduce_initial>
__global__ void
_kernel_reduce_scal(const __restrict__ AcReal* src, AcReal* dst)
{
    const int ix = threadIdx.x + blockIdx.x * blockDim.x;
    const int iy = threadIdx.y + blockIdx.y * blockDim.y;
    const int iz = threadIdx.z + blockIdx.z * blockDim.z;

    if (!oob(ix, iy, iz)) {
        const int read_pos  = DEVICE_VTXBUF_IDX(ix + d_mesh_info.int_params[AC_nx_min],
                                                iy + d_mesh_info.int_params[AC_ny_min],
                                                iz + d_mesh_info.int_params[AC_nz_min]);
        const int write_pos = DEVICE_1D_COMPDOMAIN_IDX(ix, iy, iz);

        dst[write_pos] = reduce_initial(src[read_pos]);
    }
}
|
||||
|
||||
/**
 * First reduction stage for a vector field: applies reduce_initial to the
 * three components read at the same position from `src_a`, `src_b` and
 * `src_c`, and stores the result in `dst`.
 *
 * Indexing is the same as in the scalar variant: the source index is offset
 * by AC_n{x,y,z}_min and linearised with DEVICE_VTXBUF_IDX, the destination
 * index comes from DEVICE_1D_COMPDOMAIN_IDX. Threads for which oob() is true
 * do nothing.
 */
template <ReduceInitialVecFunc reduce_initial>
__global__ void
_kernel_reduce_vec(const __restrict__ AcReal* src_a,
                   const __restrict__ AcReal* src_b,
                   const __restrict__ AcReal* src_c, AcReal* dst)
{
    const int ix = threadIdx.x + blockIdx.x * blockDim.x;
    const int iy = threadIdx.y + blockIdx.y * blockDim.y;
    const int iz = threadIdx.z + blockIdx.z * blockDim.z;

    if (!oob(ix, iy, iz)) {
        const int read_pos  = DEVICE_VTXBUF_IDX(ix + d_mesh_info.int_params[AC_nx_min],
                                                iy + d_mesh_info.int_params[AC_ny_min],
                                                iz + d_mesh_info.int_params[AC_nz_min]);
        const int write_pos = DEVICE_1D_COMPDOMAIN_IDX(ix, iy, iz);

        dst[write_pos] = reduce_initial(src_a[read_pos], src_b[read_pos], src_c[read_pos]);
    }
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Threads per block of _kernel_reduce; also the size of its shared-memory array
#define BLOCK_SIZE (1024)
// Number of scratchpad elements each thread of _kernel_reduce folds together
// before the in-block tree reduction
#define ELEMS_PER_THREAD (32)
|
||||
|
||||
/**
 * Second reduction stage: every block folds a chunk of
 * BLOCK_SIZE * ELEMS_PER_THREAD scratchpad elements into a single value and
 * writes it to the first element of that chunk.
 *
 * src:    scratchpad holding AC_nxyz elements; read and partially overwritten
 * result: unused here, kept so all reduction kernels share one signature
 *
 * Must be launched with BLOCK_SIZE threads per block.
 *
 * Every read of `src` and of the shared array is bounds-checked against the
 * scratchpad size, and all threads of a block reach every __syncthreads().
 */
template <ReduceFunc reduce>
__global__ void
_kernel_reduce(AcReal* src, AcReal* result)
{
    const int tid             = threadIdx.x;
    const int idx             = tid + blockIdx.x * BLOCK_SIZE * ELEMS_PER_THREAD;
    const int scratchpad_size = DCONST_INT(AC_nxyz);

    __shared__ AcReal smem[BLOCK_SIZE];

    // Threads past the end of the scratchpad hold no data. They stay alive
    // so that the barriers below are reached by the whole block, but their
    // `tmp` and smem slot are never read.
    const bool in_range = idx < scratchpad_size;
    AcReal tmp          = AcReal(0);

    if (in_range) {
        tmp = src[idx];

        // Fold this thread's strided elements, stopping at the end of the
        // scratchpad
        for (int i = 1; i < ELEMS_PER_THREAD; ++i) {
            const int src_idx = idx + i * BLOCK_SIZE;
            if (src_idx >= scratchpad_size)
                break;

            tmp = reduce(tmp, src[src_idx]);
        }
        smem[tid] = tmp;
    }
    __syncthreads();

    for (int offset = BLOCK_SIZE / 2; offset > 0; offset /= 2) {
        // The partner thread (tid + offset) has global index idx + offset.
        // Combine only if the partner held data; this also implies that the
        // current thread is in range.
        if (tid < offset && idx + offset < scratchpad_size) {
            tmp       = reduce(tmp, smem[tid + offset]);
            smem[tid] = tmp;
        }
        __syncthreads();
    }

    if (tid == 0 && in_range)
        src[idx] = tmp;
}
|
||||
|
||||
/**
 * Final reduction stage: folds the per-block partial results, located
 * BLOCK_SIZE * ELEMS_PER_THREAD elements apart in `src`, into one value and
 * stores it in `*result`.
 *
 * Every thread walks the whole scratchpad serially and writes `*result`.
 */
template <ReduceFunc reduce>
__global__ void
_kernel_reduce_block(const __restrict__ AcReal* src, AcReal* result)
{
    const int scratchpad_size = DCONST_INT(AC_nxyz);
    const int stride          = BLOCK_SIZE * ELEMS_PER_THREAD;
    const int first           = threadIdx.x + blockIdx.x * stride;

    AcReal acc = src[first];

    int pos = first + stride;
    while (pos < scratchpad_size) {
        acc = reduce(acc, src[pos]);
        pos += stride;
    }

    *result = acc;
}
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/**
 * Reduces a scalar vertex buffer to a single value.
 *
 * stream:            stream on which the three reduction kernels are launched
 * rtype:             RTYPE_MAX, RTYPE_MIN, RTYPE_RMS or RTYPE_RMS_EXP; any
 *                    other value is reported through ERROR()
 * nx, ny, nz:        extents of the reduced region
 * vertex_buffer:     device buffer to reduce
 * reduce_scratchpad: device scratch space, indexed up to nx * ny * nz
 * reduce_result:     device location receiving the final value
 *
 * Returns the max or min for RTYPE_MAX / RTYPE_MIN. For the RMS types the
 * return value is the sum of squares (of the values, or of their
 * exponentials) multiplied by 1 / (nx * ny * nz); no square root is taken
 * here.
 *
 * The preconditions are the same as in _reduce_vec, since both functions
 * use the same second- and third-stage kernels.
 */
AcReal
_reduce_scal(const cudaStream_t stream,
             const ReductionType& rtype, const int& nx, const int& ny,
             const int& nz, const AcReal* vertex_buffer,
             AcReal* reduce_scratchpad, AcReal* reduce_result)
{
    bool solve_mean = false;

    const dim3 tpb(32, 4, 1);
    const dim3 bpg(int(ceil(AcReal(nx) / tpb.x)), int(ceil(AcReal(ny) / tpb.y)),
                   int(ceil(AcReal(nz) / tpb.z)));

    const int scratchpad_size = nx * ny * nz;
    const int bpg2            = (unsigned int)ceil(AcReal(scratchpad_size) /
                                        AcReal(ELEMS_PER_THREAD * BLOCK_SIZE));

    // Requirements of _kernel_reduce: a full block must fit into the
    // scratchpad, BLOCK_SIZE must be even so the work can be halved
    // repeatedly, and the mesh dimensions must be powers of two.
    ERRCHK(BLOCK_SIZE <= scratchpad_size);
    ERRCHK(!(BLOCK_SIZE % 2));
    ERRCHK(is_power_of_two(nx));
    ERRCHK(is_power_of_two(ny));
    ERRCHK(is_power_of_two(nz));

    // Stage 1 maps every element to its initial value, stage 2 reduces
    // within blocks, stage 3 combines the per-block results.
    switch (rtype) {
    case RTYPE_MAX:
        _kernel_reduce_scal<_device_length_scal>
            <<<bpg, tpb, 0, stream>>>(vertex_buffer, reduce_scratchpad);
        _kernel_reduce<_device_max>
            <<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
        _kernel_reduce_block<_device_max>
            <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
        break;
    case RTYPE_MIN:
        _kernel_reduce_scal<_device_length_scal>
            <<<bpg, tpb, 0, stream>>>(vertex_buffer, reduce_scratchpad);
        _kernel_reduce<_device_min>
            <<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
        _kernel_reduce_block<_device_min>
            <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
        break;
    case RTYPE_RMS:
        _kernel_reduce_scal<_device_squared_scal>
            <<<bpg, tpb, 0, stream>>>(vertex_buffer, reduce_scratchpad);
        _kernel_reduce<_device_sum>
            <<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
        _kernel_reduce_block<_device_sum>
            <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
        solve_mean = true;
        break;
    case RTYPE_RMS_EXP:
        _kernel_reduce_scal<_device_exp_squared_scal>
            <<<bpg, tpb, 0, stream>>>(vertex_buffer, reduce_scratchpad);
        _kernel_reduce<_device_sum>
            <<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
        _kernel_reduce_block<_device_sum>
            <<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
        solve_mean = true;
        break;
    default:
        ERROR("Unrecognized RTYPE");
    }

    AcReal result;
    cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
    if (solve_mean) {
        const AcReal inv_n = AcReal(1.0) / (nx * ny * nz);
        return inv_n * result;
    }
    else {
        return result;
    }
}
|
||||
|
||||
/**
 * Reduces a three-component vertex buffer to a single value.
 *
 * stream:            stream on which the three reduction kernels are launched
 * rtype:             RTYPE_MAX, RTYPE_MIN, RTYPE_RMS or RTYPE_RMS_EXP; any
 *                    other value is reported through ERROR()
 * nx, ny, nz:        extents of the reduced region, each a power of two
 * vertex_buffer_a/b/c: device buffers holding the three components
 * reduce_scratchpad: device scratch space, indexed up to nx * ny * nz
 * reduce_result:     device location receiving the final value
 *
 * Returns the max or min vector length for RTYPE_MAX / RTYPE_MIN. For the
 * RMS types the return value is the accumulated sum multiplied by
 * 1 / (nx * ny * nz); no square root is taken here.
 */
AcReal
_reduce_vec(const cudaStream_t stream,
            const ReductionType& rtype, const int& nx, const int& ny,
            const int& nz, const AcReal* vertex_buffer_a,
            const AcReal* vertex_buffer_b, const AcReal* vertex_buffer_c,
            AcReal* reduce_scratchpad, AcReal* reduce_result)
{
    const int num_cells = nx * ny * nz;

    const dim3 threads(32, 4, 1);
    const dim3 blocks(int(ceil(float(nx) / threads.x)),
                      int(ceil(float(ny) / threads.y)),
                      int(ceil(float(nz) / threads.z)));
    const int chunks = (unsigned int)ceil(float(num_cells) /
                                          float(ELEMS_PER_THREAD * BLOCK_SIZE));

    // Limitations of this quick reduction:
    //  - a full block must fit into the computational domain, otherwise the
    //    second half of _kernel_reduce would need extra bounds checking
    //  - BLOCK_SIZE must be even so the work can be split in halves without
    //    worrying about array bounds
    ERRCHK(BLOCK_SIZE <= num_cells);
    ERRCHK(!(BLOCK_SIZE % 2));
    //  - mesh dimensions must be powers of two; otherwise uninitialized
    //    entries of the shared array in _kernel_reduce take part in the
    //    comparison
    ERRCHK(is_power_of_two(nx));
    ERRCHK(is_power_of_two(ny));
    ERRCHK(is_power_of_two(nz));

    // Set for the reductions whose sum is divided by the number of cells
    bool divide_by_count = false;

    switch (rtype) {
    case RTYPE_MAX: {
        _kernel_reduce_vec<_device_length_vec><<<blocks, threads, 0, stream>>>(
            vertex_buffer_a, vertex_buffer_b, vertex_buffer_c, reduce_scratchpad);
        _kernel_reduce<_device_max><<<chunks, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad,
                                                                       reduce_result);
        _kernel_reduce_block<_device_max><<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
        break;
    }
    case RTYPE_MIN: {
        _kernel_reduce_vec<_device_length_vec><<<blocks, threads, 0, stream>>>(
            vertex_buffer_a, vertex_buffer_b, vertex_buffer_c, reduce_scratchpad);
        _kernel_reduce<_device_min><<<chunks, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad,
                                                                       reduce_result);
        _kernel_reduce_block<_device_min><<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
        break;
    }
    case RTYPE_RMS: {
        _kernel_reduce_vec<_device_squared_vec><<<blocks, threads, 0, stream>>>(
            vertex_buffer_a, vertex_buffer_b, vertex_buffer_c, reduce_scratchpad);
        _kernel_reduce<_device_sum><<<chunks, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad,
                                                                       reduce_result);
        _kernel_reduce_block<_device_sum><<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
        divide_by_count = true;
        break;
    }
    case RTYPE_RMS_EXP: {
        _kernel_reduce_vec<_device_exp_squared_vec><<<blocks, threads, 0, stream>>>(
            vertex_buffer_a, vertex_buffer_b, vertex_buffer_c, reduce_scratchpad);
        _kernel_reduce<_device_sum><<<chunks, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad,
                                                                       reduce_result);
        _kernel_reduce_block<_device_sum><<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
        divide_by_count = true;
        break;
    }
    default:
        ERROR("Unrecognized RTYPE");
    }

    AcReal host_result;
    cudaMemcpy(&host_result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);

    if (!divide_by_count)
        return host_result;

    const AcReal inv_n = AcReal(1.0) / (nx * ny * nz);
    return inv_n * host_result;
}
|
||||
742
src/core/kernels/rk3.cuh
Normal file
742
src/core/kernels/rk3.cuh
Normal file
@@ -0,0 +1,742 @@
|
||||
/*
|
||||
Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
|
||||
|
||||
This file is part of Astaroth.
|
||||
|
||||
Astaroth is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
Astaroth is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file
|
||||
* \brief Implementation of the integration pipeline
|
||||
*
|
||||
*
|
||||
*
|
||||
*/
|
||||
#pragma once
|
||||
#include "device_globals.cuh"
|
||||
|
||||
#include <assert.h>
|
||||
|
||||
/*
|
||||
#define RK_THREADS_X (32)
|
||||
#define RK_THREADS_Y (1)
|
||||
#define RK_THREADS_Z (4)
|
||||
#define RK_LAUNCH_BOUND_MIN_BLOCKS (4)
|
||||
#define RK_THREADBLOCK_SIZE (RK_THREADS_X * RK_THREADS_Y * RK_THREADS_Z)
|
||||
*/
|
||||
|
||||
// One-argument overload: the index is already linear and is returned unchanged
static __device__ __forceinline__ int
IDX(const int i)
{
    const int linear = i;
    return linear;
}
|
||||
|
||||
// Three-argument overload: linearises (i, j, k) with DEVICE_VTXBUF_IDX
static __device__ __forceinline__ int
IDX(const int i, const int j, const int k)
{
    const int linear = DEVICE_VTXBUF_IDX(i, j, k);
    return linear;
}
|
||||
|
||||
// int3 overload: linearises (idx.x, idx.y, idx.z) via the three-argument IDX
static __device__ __forceinline__ int
IDX(const int3 idx)
{
    return IDX(idx.x, idx.y, idx.z);
}
|
||||
|
||||
/**
 * Builds the matrix of a rotation by `radians` about the z axis.
 *
 * Multiplying a vector by the result (see mul) rotates its x and y
 * components and leaves its z component unchanged.
 *
 * radians: rotation angle in radians
 */
static __forceinline__ AcMatrix
create_rotz(const AcReal radians)
{
    const AcReal c = cos(radians);
    const AcReal s = sin(radians);

    AcMatrix mat;

    mat.row[0] = (AcReal3){c, -s, 0};
    mat.row[1] = (AcReal3){s, c, 0};
    // Third row is the z unit vector; a zero row would discard the z component
    mat.row[2] = (AcReal3){0, 0, 1};

    return mat;
}
|
||||
|
||||
|
||||
// In single precision, redirect the math functions to the corresponding
// single-precision CUDA functions. These macros affect all code that follows.
#if !AC_DOUBLE_PRECISION
#define sin   __sinf
#define cos   __cosf
#define exp   __expf
#define rsqrt rsqrtf // hardware reciprocal sqrt
#endif               // !AC_DOUBLE_PRECISION
|
||||
|
||||
|
||||
/*
|
||||
typedef struct {
|
||||
int i, j, k;
|
||||
} int3;*/
|
||||
|
||||
/*
|
||||
* =============================================================================
|
||||
* Level 0 (Input Assembly Stage)
|
||||
* =============================================================================
|
||||
*/
|
||||
|
||||
/*
|
||||
* =============================================================================
|
||||
* Level 0.1 (Read stencil elements and solve derivatives)
|
||||
* =============================================================================
|
||||
*/
|
||||
/**
 * Central-difference first derivative at the midpoint of a pencil.
 *
 * pencil: STENCIL_ORDER + 1 consecutive samples; the point of interest is
 *         pencil[STENCIL_ORDER / 2]
 * inv_ds: factor the weighted sum is multiplied by (callers pass the
 *         reciprocal grid spacing along the pencil)
 *
 * Weights are provided for STENCIL_ORDER 2, 4, 6 and 8.
 */
static __device__ __forceinline__ AcReal
first_derivative(const AcReal* __restrict__ pencil, const AcReal inv_ds)
{
    // weights[d] multiplies the antisymmetric pair at distance d from the midpoint
#if STENCIL_ORDER == 2
    const AcReal weights[] = {0, 1.0 / 2.0};
#elif STENCIL_ORDER == 4
    const AcReal weights[] = {0, 2.0 / 3.0, -1.0 / 12.0};
#elif STENCIL_ORDER == 6
    const AcReal weights[] = {0, 3.0 / 4.0, -3.0 / 20.0, 1.0 / 60.0};
#elif STENCIL_ORDER == 8
    const AcReal weights[] = {0, 4.0 / 5.0, -1.0 / 5.0, 4.0 / 105.0,
                              -1.0 / 280.0};
#endif

#define MID (STENCIL_ORDER / 2)
    AcReal acc = 0;

#pragma unroll
    for (int d = 1; d <= MID; ++d) {
        acc += weights[d] * (pencil[MID + d] - pencil[MID - d]);
    }

    return acc * inv_ds;
}
|
||||
|
||||
/**
 * Central-difference second derivative at the midpoint of a pencil.
 *
 * pencil: STENCIL_ORDER + 1 consecutive samples; the point of interest is
 *         pencil[STENCIL_ORDER / 2]
 * inv_ds: factor the weighted sum is multiplied by twice (callers pass the
 *         reciprocal grid spacing along the pencil)
 *
 * Weights are provided for STENCIL_ORDER 2, 4, 6 and 8.
 */
static __device__ __forceinline__ AcReal
second_derivative(const AcReal* __restrict__ pencil, const AcReal inv_ds)
{
    // weights[0] multiplies the midpoint, weights[d] the symmetric pair at distance d
#if STENCIL_ORDER == 2
    const AcReal weights[] = {-2., 1.};
#elif STENCIL_ORDER == 4
    const AcReal weights[] = {-5.0/2.0, 4.0/3.0, -1.0/12.0};
#elif STENCIL_ORDER == 6
    const AcReal weights[] = {-49.0 / 18.0, 3.0 / 2.0, -3.0 / 20.0,
                              1.0 / 90.0};
#elif STENCIL_ORDER == 8
    const AcReal weights[] = {-205.0 / 72.0, 8.0 / 5.0, -1.0 / 5.0,
                              8.0 / 315.0, -1.0 / 560.0};
#endif

#define MID (STENCIL_ORDER / 2)
    AcReal acc = weights[0] * pencil[MID];

#pragma unroll
    for (int d = 1; d <= MID; ++d) {
        acc += weights[d] * (pencil[MID + d] + pencil[MID - d]);
    }

    return acc * inv_ds * inv_ds;
}
|
||||
|
||||
/**
 * Mixed second derivative from samples taken along the two diagonals of the
 * plane spanned by the two differentiated axes.
 *
 * pencil_a: STENCIL_ORDER + 1 samples along the diagonal on which both
 *           coordinates increase together
 * pencil_b: STENCIL_ORDER + 1 samples along the diagonal on which one
 *           coordinate increases while the other decreases
 * inv_ds_a, inv_ds_b: reciprocal mesh spacings along the two axes
 *
 * For a pair of samples at distance i from the midpoint, the combination
 * a[+i] + a[-i] - b[+i] - b[-i] equals 4 i^2 ds_a ds_b times the mixed
 * derivative to leading order. The weights w_i therefore satisfy
 * sum(4 i^2 w_i) = 1, and the higher-order error terms cancel. For every
 * order they are the second-derivative weights divided by 4.
 */
static __device__ __forceinline__ AcReal
cross_derivative(const AcReal* __restrict__ pencil_a,
                 const AcReal* __restrict__ pencil_b, const AcReal inv_ds_a,
                 const AcReal inv_ds_b)
{
#if STENCIL_ORDER == 2
    const AcReal coefficients[] = {0, 1.0 / 4.0};
#elif STENCIL_ORDER == 4
    // Solves w1 + 4 w2 = 1/4 (consistency) and w1 + 16 w2 = 0 (cancellation
    // of the next error term)
    const AcReal coefficients[] = {0, 1.0 / 3.0, -1.0 / 48.0};
#elif STENCIL_ORDER == 6
    const AcReal fac            = (1. / 720.);
    const AcReal coefficients[] = {0.0 * fac, 270.0 * fac, -27.0 * fac,
                                   2.0 * fac};
#elif STENCIL_ORDER == 8
    const AcReal fac            = (1. / 20160.);
    const AcReal coefficients[] = {0.0 * fac, 8064. * fac, -1008. * fac,
                                   128. * fac, -9. * fac};
#endif

#define MID (STENCIL_ORDER / 2)
    AcReal res = AcReal(0.);

#pragma unroll
    for (int i = 1; i <= MID; ++i) {
        res += coefficients[i] * (pencil_a[MID + i] + pencil_a[MID - i] -
                                  pencil_b[MID + i] - pencil_b[MID - i]);
    }
    return res * inv_ds_a * inv_ds_b;
}
|
||||
|
||||
/**
 * First derivative of `arr` with respect to x at vertexIdx.
 * Gathers STENCIL_ORDER + 1 samples along x, centred on vertexIdx, and
 * passes them to first_derivative with AC_inv_dsx.
 */
static __device__ __forceinline__ AcReal
derx(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
    AcReal pencil[STENCIL_ORDER + 1];

#pragma unroll
    for (int n = 0; n <= STENCIL_ORDER; ++n) {
        const int shift = n - STENCIL_ORDER / 2;
        pencil[n]       = arr[IDX(vertexIdx.x + shift, vertexIdx.y, vertexIdx.z)];
    }

    return first_derivative(pencil, DCONST_REAL(AC_inv_dsx));
}
|
||||
|
||||
/**
 * Second derivative of `arr` with respect to x at vertexIdx.
 * Gathers STENCIL_ORDER + 1 samples along x, centred on vertexIdx, and
 * passes them to second_derivative with AC_inv_dsx.
 */
static __device__ __forceinline__ AcReal
derxx(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
    AcReal pencil[STENCIL_ORDER + 1];

#pragma unroll
    for (int n = 0; n <= STENCIL_ORDER; ++n) {
        const int shift = n - STENCIL_ORDER / 2;
        pencil[n]       = arr[IDX(vertexIdx.x + shift, vertexIdx.y, vertexIdx.z)];
    }

    return second_derivative(pencil, DCONST_REAL(AC_inv_dsx));
}
|
||||
|
||||
/**
 * Mixed derivative of `arr` with respect to x and y at vertexIdx.
 * Samples the two diagonals of the xy-plane through vertexIdx and passes
 * them to cross_derivative with AC_inv_dsx and AC_inv_dsy.
 */
static __device__ __forceinline__ AcReal
derxy(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
    AcReal diagonal[STENCIL_ORDER + 1];      // x and y increase together
    AcReal anti_diagonal[STENCIL_ORDER + 1]; // x increases while y decreases

#pragma unroll
    for (int n = 0; n <= STENCIL_ORDER; ++n) {
        const int shift  = n - STENCIL_ORDER / 2;
        diagonal[n]      = arr[IDX(vertexIdx.x + shift, vertexIdx.y + shift, vertexIdx.z)];
        anti_diagonal[n] = arr[IDX(vertexIdx.x + shift, vertexIdx.y - shift, vertexIdx.z)];
    }

    return cross_derivative(diagonal, anti_diagonal, DCONST_REAL(AC_inv_dsx),
                            DCONST_REAL(AC_inv_dsy));
}
|
||||
|
||||
/**
 * Mixed derivative of `arr` with respect to x and z at vertexIdx.
 * Samples the two diagonals of the xz-plane through vertexIdx and passes
 * them to cross_derivative with AC_inv_dsx and AC_inv_dsz.
 */
static __device__ __forceinline__ AcReal
derxz(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
    AcReal diagonal[STENCIL_ORDER + 1];      // x and z increase together
    AcReal anti_diagonal[STENCIL_ORDER + 1]; // x increases while z decreases

#pragma unroll
    for (int n = 0; n <= STENCIL_ORDER; ++n) {
        const int shift  = n - STENCIL_ORDER / 2;
        diagonal[n]      = arr[IDX(vertexIdx.x + shift, vertexIdx.y, vertexIdx.z + shift)];
        anti_diagonal[n] = arr[IDX(vertexIdx.x + shift, vertexIdx.y, vertexIdx.z - shift)];
    }

    return cross_derivative(diagonal, anti_diagonal, DCONST_REAL(AC_inv_dsx),
                            DCONST_REAL(AC_inv_dsz));
}
|
||||
|
||||
/**
 * First derivative of `arr` with respect to y at vertexIdx.
 * Gathers STENCIL_ORDER + 1 samples along y, centred on vertexIdx, and
 * passes them to first_derivative with AC_inv_dsy.
 */
static __device__ __forceinline__ AcReal
dery(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
    AcReal pencil[STENCIL_ORDER + 1];

#pragma unroll
    for (int n = 0; n <= STENCIL_ORDER; ++n) {
        const int shift = n - STENCIL_ORDER / 2;
        pencil[n]       = arr[IDX(vertexIdx.x, vertexIdx.y + shift, vertexIdx.z)];
    }

    return first_derivative(pencil, DCONST_REAL(AC_inv_dsy));
}
|
||||
|
||||
/**
 * Second derivative of `arr` with respect to y at vertexIdx.
 * Gathers STENCIL_ORDER + 1 samples along y, centred on vertexIdx, and
 * passes them to second_derivative with AC_inv_dsy.
 */
static __device__ __forceinline__ AcReal
deryy(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
    AcReal pencil[STENCIL_ORDER + 1];

#pragma unroll
    for (int n = 0; n <= STENCIL_ORDER; ++n) {
        const int shift = n - STENCIL_ORDER / 2;
        pencil[n]       = arr[IDX(vertexIdx.x, vertexIdx.y + shift, vertexIdx.z)];
    }

    return second_derivative(pencil, DCONST_REAL(AC_inv_dsy));
}
|
||||
|
||||
/**
 * Mixed derivative of `arr` with respect to y and z at vertexIdx.
 * Samples the two diagonals of the yz-plane through vertexIdx and passes
 * them to cross_derivative with AC_inv_dsy and AC_inv_dsz.
 */
static __device__ __forceinline__ AcReal
deryz(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
    AcReal diagonal[STENCIL_ORDER + 1];      // y and z increase together
    AcReal anti_diagonal[STENCIL_ORDER + 1]; // y increases while z decreases

#pragma unroll
    for (int n = 0; n <= STENCIL_ORDER; ++n) {
        const int shift  = n - STENCIL_ORDER / 2;
        diagonal[n]      = arr[IDX(vertexIdx.x, vertexIdx.y + shift, vertexIdx.z + shift)];
        anti_diagonal[n] = arr[IDX(vertexIdx.x, vertexIdx.y + shift, vertexIdx.z - shift)];
    }

    return cross_derivative(diagonal, anti_diagonal, DCONST_REAL(AC_inv_dsy),
                            DCONST_REAL(AC_inv_dsz));
}
|
||||
|
||||
/**
 * First derivative of `arr` with respect to z at vertexIdx.
 * Gathers STENCIL_ORDER + 1 samples along z, centred on vertexIdx, and
 * passes them to first_derivative with AC_inv_dsz.
 */
static __device__ __forceinline__ AcReal
derz(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
    AcReal pencil[STENCIL_ORDER + 1];

#pragma unroll
    for (int n = 0; n <= STENCIL_ORDER; ++n) {
        const int shift = n - STENCIL_ORDER / 2;
        pencil[n]       = arr[IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z + shift)];
    }

    return first_derivative(pencil, DCONST_REAL(AC_inv_dsz));
}
|
||||
|
||||
/**
 * Second derivative of `arr` with respect to z at vertexIdx.
 * Gathers STENCIL_ORDER + 1 samples along z, centred on vertexIdx, and
 * passes them to second_derivative with AC_inv_dsz.
 */
static __device__ __forceinline__ AcReal
derzz(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
    AcReal pencil[STENCIL_ORDER + 1];

#pragma unroll
    for (int n = 0; n <= STENCIL_ORDER; ++n) {
        const int shift = n - STENCIL_ORDER / 2;
        pencil[n]       = arr[IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z + shift)];
    }

    return second_derivative(pencil, DCONST_REAL(AC_inv_dsz));
}
|
||||
|
||||
/*
|
||||
* =============================================================================
|
||||
* Level 0.2 (Caching functions)
|
||||
* =============================================================================
|
||||
*/
|
||||
|
||||
#include "stencil_assembly.cuh"
|
||||
|
||||
/*
|
||||
typedef struct {
|
||||
AcRealData x;
|
||||
AcRealData y;
|
||||
AcRealData z;
|
||||
} AcReal3Data;
|
||||
|
||||
static __device__ __forceinline__ AcReal3Data
|
||||
read_data(const int i, const int j, const int k,
|
||||
AcReal* __restrict__ buf[], const int3& handle)
|
||||
{
|
||||
AcReal3Data data;
|
||||
|
||||
data.x = read_data(i, j, k, buf, handle.x);
|
||||
data.y = read_data(i, j, k, buf, handle.y);
|
||||
data.z = read_data(i, j, k, buf, handle.z);
|
||||
|
||||
return data;
|
||||
}
|
||||
*/
|
||||
|
||||
/*
|
||||
* =============================================================================
|
||||
* Level 0.3 (Built-in functions available during the Stencil Processing Stage)
|
||||
* =============================================================================
|
||||
*/
|
||||
|
||||
// Component-wise difference a - b
static __host__ __device__ __forceinline__ AcReal3
operator-(const AcReal3& a, const AcReal3& b)
{
    AcReal3 diff;

    diff.x = a.x - b.x;
    diff.y = a.y - b.y;
    diff.z = a.z - b.z;

    return diff;
}
|
||||
|
||||
// Component-wise sum a + b
static __host__ __device__ __forceinline__ AcReal3
operator+(const AcReal3& a, const AcReal3& b)
{
    AcReal3 sum;

    sum.x = a.x + b.x;
    sum.y = a.y + b.y;
    sum.z = a.z + b.z;

    return sum;
}
|
||||
|
||||
// Component-wise negation
static __host__ __device__ __forceinline__ AcReal3
operator-(const AcReal3& a)
{
    AcReal3 negated;

    negated.x = -a.x;
    negated.y = -a.y;
    negated.z = -a.z;

    return negated;
}
|
||||
|
||||
// Scales every component of b by the scalar a
static __host__ __device__ __forceinline__ AcReal3
operator*(const AcReal a, const AcReal3& b)
{
    AcReal3 scaled;

    scaled.x = a * b.x;
    scaled.y = a * b.y;
    scaled.z = a * b.z;

    return scaled;
}
|
||||
|
||||
// Dot product of a and b
static __host__ __device__ __forceinline__ AcReal
dot(const AcReal3& a, const AcReal3& b)
{
    const AcReal product = a.x * b.x + a.y * b.y + a.z * b.z;
    return product;
}
|
||||
|
||||
// Matrix-vector product: component i of the result is dot(aa.row[i], x)
static __host__ __device__ __forceinline__ AcReal3
mul(const AcMatrix& aa, const AcReal3& x)
{
    AcReal3 product;

    product.x = dot(aa.row[0], x);
    product.y = dot(aa.row[1], x);
    product.z = dot(aa.row[2], x);

    return product;
}
|
||||
|
||||
// Cross product a x b
static __host__ __device__ __forceinline__ AcReal3
cross(const AcReal3& a, const AcReal3& b)
{
    return (AcReal3){a.y * b.z - a.z * b.y,
                     a.z * b.x - a.x * b.z,
                     a.x * b.y - a.y * b.x};
}
|
||||
|
||||
// True when a is neither NaN nor infinite
static __host__ __device__ __forceinline__ bool
is_valid(const AcReal a)
{
    if (isnan(a))
        return false;

    return !isinf(a);
}
|
||||
|
||||
// True when all three components of a are valid (neither NaN nor infinite)
static __host__ __device__ __forceinline__ bool
is_valid(const AcReal3& a)
{
    if (!is_valid(a.x))
        return false;
    if (!is_valid(a.y))
        return false;

    return is_valid(a.z);
}
|
||||
|
||||
|
||||
/*
|
||||
* =============================================================================
|
||||
* Level 1 (Stencil Processing Stage)
|
||||
* =============================================================================
|
||||
*/
|
||||
|
||||
/*
|
||||
* =============================================================================
|
||||
* Level 1.1 (Terms)
|
||||
* =============================================================================
|
||||
*/
|
||||
/**
 * Laplacian of a scalar field: the sum of the diagonal entries of
 * hessian(data).
 *
 * The Hessian is evaluated once and reused instead of being requested
 * separately for each of the three diagonal entries.
 */
static __device__ __forceinline__ AcReal
laplace(const AcRealData& data)
{
    const AcMatrix hess = hessian(data);
    return hess.row[0].x + hess.row[1].y + hess.row[2].z;
}
|
||||
|
||||
/**
 * Divergence of a vector field: x-component of gradient(vec.x) plus
 * y-component of gradient(vec.y) plus z-component of gradient(vec.z).
 */
static __device__ __forceinline__ AcReal
divergence(const AcReal3Data& vec)
{
    const AcReal ddx_x = gradient(vec.x).x;
    const AcReal ddy_y = gradient(vec.y).y;
    const AcReal ddz_z = gradient(vec.z).z;
    return ddx_x + ddy_y + ddz_z;
}
|
||||
|
||||
/** Applies laplace() to each component of `vec` independently. */
static __device__ __forceinline__ AcReal3
laplace_vec(const AcReal3Data& vec)
{
    AcReal3 result;
    result.x = laplace(vec.x);
    result.y = laplace(vec.y);
    result.z = laplace(vec.z);
    return result;
}
|
||||
|
||||
/**
 * Curl of a vector field, assembled from the component gradients.
 *
 * Each component gradient is evaluated once and reused; the original
 * expression requested every gradient twice.
 */
static __device__ __forceinline__ AcReal3
curl(const AcReal3Data& vec)
{
    const AcReal3 grad_x = gradient(vec.x);
    const AcReal3 grad_y = gradient(vec.y);
    const AcReal3 grad_z = gradient(vec.z);

    return (AcReal3){grad_z.y - grad_y.z,
                     grad_x.z - grad_z.x,
                     grad_y.x - grad_x.y};
}
|
||||
|
||||
/**
 * Gradient of the divergence of a vector field.
 *
 * Component i of the result sums entry (i, j) of the Hessian of the j-th
 * field component over j = x, y, z. Each Hessian is evaluated once and
 * reused; the original expression requested every Hessian three times.
 */
static __device__ __forceinline__ AcReal3
gradient_of_divergence(const AcReal3Data& vec)
{
    const AcMatrix hess_x = hessian(vec.x);
    const AcMatrix hess_y = hessian(vec.y);
    const AcMatrix hess_z = hessian(vec.z);

    return (AcReal3){hess_x.row[0].x + hess_y.row[0].y + hess_z.row[0].z,
                     hess_x.row[1].x + hess_y.row[1].y + hess_z.row[1].z,
                     hess_x.row[2].x + hess_y.row[2].y + hess_z.row[2].z};
}
|
||||
|
||||
// Takes uu gradients and returns S
|
||||
static __device__ __forceinline__ AcMatrix
|
||||
stress_tensor(const AcReal3Data& vec)
|
||||
{
|
||||
AcMatrix S;
|
||||
|
||||
S.row[0].x = AcReal(2. / 3.) * gradient(vec.x).x -
|
||||
AcReal(1. / 3.) * (gradient(vec.y).y + gradient(vec.z).z);
|
||||
S.row[0].y = AcReal(1. / 2.) * (gradient(vec.x).y + gradient(vec.y).x);
|
||||
S.row[0].z = AcReal(1. / 2.) * (gradient(vec.x).z + gradient(vec.z).x);
|
||||
|
||||
S.row[1].y = AcReal(2. / 3.) * gradient(vec.y).y -
|
||||
AcReal(1. / 3.) * (gradient(vec.x).x + gradient(vec.z).z);
|
||||
|
||||
S.row[1].z = AcReal(1. / 2.) * (gradient(vec.y).z + gradient(vec.z).y);
|
||||
|
||||
S.row[2].z = AcReal(2. / 3.) * gradient(vec.z).z -
|
||||
AcReal(1. / 3.) * (gradient(vec.x).x + gradient(vec.y).y);
|
||||
|
||||
S.row[1].x = S.row[0].y;
|
||||
S.row[2].x = S.row[0].z;
|
||||
S.row[2].y = S.row[1].z;
|
||||
|
||||
return S;
|
||||
}
|
||||
|
||||
/**
 * Sum of the squares of all entries of `mat`, accumulated row by row
 * (dot product of each row with itself).
 */
static __device__ __forceinline__ AcReal
contract(const AcMatrix& mat)
{
    AcReal total = 0;
    total += dot(mat.row[0], mat.row[0]);
    total += dot(mat.row[1], mat.row[1]);
    total += dot(mat.row[2], mat.row[2]);
    return total;
}
|
||||
|
||||
/*
|
||||
* =============================================================================
|
||||
* Level 1.2 (Equations)
|
||||
* =============================================================================
|
||||
*/
|
||||
/** Euclidean length of `vec`. */
static __device__ __forceinline__ AcReal
length(const AcReal3& vec)
{
    const AcReal squared_len = vec.x * vec.x + vec.y * vec.y + vec.z * vec.z;
    return sqrt(squared_len);
}
|
||||
|
||||
/**
 * Reciprocal of the Euclidean length of `vec`, computed with rsqrt on the
 * squared length. No special handling of a zero-length input is done here.
 */
static __device__ __forceinline__ AcReal
reciprocal_len(const AcReal3& vec)
{
    const AcReal squared_len = vec.x * vec.x + vec.y * vec.y + vec.z * vec.z;
    return rsqrt(squared_len);
}
|
||||
|
||||
/** Returns `vec` scaled by the reciprocal of its length. */
static __device__ __forceinline__ AcReal3
normalized(const AcReal3& vec)
{
    return reciprocal_len(vec) * vec;
}
|
||||
|
||||
// Sinusoidal forcing
|
||||
// https://arxiv.org/pdf/1704.04676.pdf
|
||||
__constant__ AcReal3 forcing_vec;
|
||||
__constant__ AcReal forcing_phi;
|
||||
static __device__ __forceinline__ AcReal3
|
||||
forcing(const int i, const int j, const int k)
|
||||
{
|
||||
#define DOMAIN_SIZE_X (DCONST_INT(AC_nx) * DCONST_REAL(AC_dsx))
|
||||
#define DOMAIN_SIZE_Y (DCONST_INT(AC_ny) * DCONST_REAL(AC_dsy))
|
||||
#define DOMAIN_SIZE_Z (DCONST_INT(AC_nz) * DCONST_REAL(AC_dsz))
|
||||
const AcReal3 k_vec = (AcReal3){(i - DCONST_INT(AC_nx_min)) * DCONST_REAL(AC_dsx) - AcReal(.5) * DOMAIN_SIZE_X,
|
||||
(j - DCONST_INT(AC_ny_min)) * DCONST_REAL(AC_dsy) - AcReal(.5) * DOMAIN_SIZE_Y,
|
||||
(k - DCONST_INT(AC_nz_min)) * DCONST_REAL(AC_dsz) - AcReal(.5) * DOMAIN_SIZE_Z};
|
||||
AcReal inv_len = reciprocal_len(k_vec);
|
||||
if (isnan(inv_len) || isinf(inv_len))
|
||||
inv_len = 0;
|
||||
if (inv_len > 2) // hack to make it cool
|
||||
inv_len = 2;
|
||||
const AcReal k_dot_x = dot(k_vec, forcing_vec);
|
||||
|
||||
const AcReal waves = cos(k_dot_x)*cos(forcing_phi) - sin(k_dot_x) * sin(forcing_phi);
|
||||
|
||||
return inv_len * inv_len * waves * forcing_vec;
|
||||
}
|
||||
|
||||
|
||||
// Note: LNT0 and LNRHO0 must be set very carefully: if their magnitude differs from that of the other values in the mesh, we will inherently lose precision
|
||||
// Reference offsets LNT0 / LNRHO0; both are currently zero.
#define LNT0   (AcReal(0.0))
#define LNRHO0 (AcReal(0.0))

// Additional constants; both are currently zero.
#define H_CONST (AcReal(0.0))
#define C_CONST (AcReal(0.0))
|
||||
|
||||
|
||||
|
||||
template <int step_number>
|
||||
static __device__ __forceinline__ AcReal
|
||||
rk3_integrate(const AcReal state_previous, const AcReal state_current,
|
||||
const AcReal rate_of_change, const AcReal dt)
|
||||
{
|
||||
// Williamson (1980)
|
||||
const AcReal alpha[] = {0, AcReal(.0), AcReal(-5. / 9.), AcReal(-153. / 128.)};
|
||||
const AcReal beta[] = {0, AcReal(1. / 3.), AcReal(15. / 16.),
|
||||
AcReal(8. / 15.)};
|
||||
|
||||
|
||||
// Note the indexing: +1 to avoid an unnecessary warning about "out-of-bounds"
|
||||
// access (when accessing beta[step_number-1] even when step_number >= 1)
|
||||
switch (step_number) {
|
||||
case 0:
|
||||
return state_current + beta[step_number + 1] * rate_of_change * dt;
|
||||
case 1: // Fallthrough
|
||||
case 2:
|
||||
return state_current +
|
||||
beta[step_number + 1] *
|
||||
(alpha[step_number + 1] * (AcReal(1.) / beta[step_number]) *
|
||||
(state_current - state_previous) +
|
||||
rate_of_change * dt);
|
||||
default:
|
||||
return NAN;
|
||||
}
|
||||
}
|
||||
/*
|
||||
template <int step_number>
|
||||
static __device__ __forceinline__ AcReal
|
||||
rk3_integrate_scal(const AcReal state_previous, const AcReal state_current,
|
||||
const AcReal rate_of_change, const AcReal dt)
|
||||
{
|
||||
// Williamson (1980)
|
||||
const AcReal alpha[] = {AcReal(.0), AcReal(-5. / 9.), AcReal(-153. / 128.)};
|
||||
const AcReal beta[] = {AcReal(1. / 3.), AcReal(15. / 16.),
|
||||
AcReal(8. / 15.)};
|
||||
|
||||
|
||||
switch (step_number) {
|
||||
case 0:
|
||||
return state_current + beta[step_number] * rate_of_change * dt;
|
||||
case 1: // Fallthrough
|
||||
case 2:
|
||||
return state_current +
|
||||
beta[step_number] *
|
||||
(alpha[step_number] * (AcReal(1.) / beta[step_number - 1]) *
|
||||
(state_current - state_previous) +
|
||||
rate_of_change * dt);
|
||||
default:
|
||||
return NAN;
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
template <int step_number>
|
||||
static __device__ __forceinline__ AcReal3
|
||||
rk3_integrate(const AcReal3 state_previous, const AcReal3 state_current,
|
||||
const AcReal3 rate_of_change, const AcReal dt)
|
||||
{
|
||||
return (AcReal3) { rk3_integrate<step_number>(state_previous.x, state_current.x, rate_of_change.x, dt),
|
||||
rk3_integrate<step_number>(state_previous.y, state_current.y, rate_of_change.y, dt),
|
||||
rk3_integrate<step_number>(state_previous.z, state_current.z, rate_of_change.z, dt)};
|
||||
}
|
||||
|
||||
// Shorthand for rk3_integrate<step_number>(); expects `step_number` to be in
// scope at the expansion site and passes value(curr) as the current state.
#define rk3(prev, curr, roc, dt) rk3_integrate<step_number>(prev, value(curr), roc, dt)
|
||||
|
||||
/*
|
||||
template <int step_number>
|
||||
static __device__ __forceinline__ AcReal
|
||||
rk3_integrate(const int idx, const AcReal out, const int handle,
|
||||
const AcRealData& in_cached, const AcReal rate_of_change, const AcReal dt)
|
||||
{
|
||||
return rk3_integrate_scal<step_number>(out, value(in_cached), rate_of_change, dt);
|
||||
}
|
||||
|
||||
template <int step_number>
|
||||
static __device__ __forceinline__ AcReal3
|
||||
rk3_integrate(const int idx, const AcReal3 out, const int3& handle,
|
||||
const AcReal3Data& in_cached, const AcReal3& rate_of_change, const AcReal dt)
|
||||
{
|
||||
return (AcReal3) {
|
||||
rk3_integrate<step_number>(idx, out, handle.x, in_cached.x, rate_of_change.x, dt),
|
||||
rk3_integrate<step_number>(idx, out, handle.y, in_cached.y, rate_of_change.y, dt),
|
||||
rk3_integrate<step_number>(idx, out, handle.z, in_cached.z, rate_of_change.z, dt)
|
||||
};
|
||||
}
|
||||
|
||||
#define RK3(handle, in_cached, rate_of_change, dt) \
|
||||
rk3_integrate<step_number>(idx, buffer.out, handle, in_cached, rate_of_change, dt)
|
||||
*/
|
||||
|
||||
/*
|
||||
* =============================================================================
|
||||
* Level 1.3 (Kernels)
|
||||
* =============================================================================
|
||||
*/
|
||||
|
||||
/** Stores `value` at element `idx` of the buffer out[handle]. */
static __device__ void
write(AcReal* __restrict__ out[], const int handle, const int idx, const AcReal value)
{
    AcReal* const target = out[handle];
    target[idx]          = value;
}
|
||||
|
||||
/**
 * Vector overload: stores value.x, value.y and value.z at element `idx` of
 * the buffers selected by vec.x, vec.y and vec.z, in that order.
 */
static __device__ void
write(AcReal* __restrict__ out[], const int3 vec, const int idx, const AcReal3 value)
{
    const int handles[3]       = {vec.x, vec.y, vec.z};
    const AcReal components[3] = {value.x, value.y, value.z};

#pragma unroll
    for (int c = 0; c < 3; ++c)
        write(out, handles[c], idx, components[c]);
}
|
||||
|
||||
/** Loads element `idx` of the buffer field[handle]. */
static __device__ AcReal
read_out(const int idx, AcReal* __restrict__ field[], const int handle)
{
    const AcReal* const source = field[handle];
    return source[idx];
}
|
||||
|
||||
/**
 * Vector overload: loads element `idx` from the three buffers selected by
 * handle.x, handle.y and handle.z.
 */
static __device__ AcReal3
read_out(const int idx, AcReal* __restrict__ field[], const int3 handle)
{
    AcReal3 result;
    result.x = read_out(idx, field, handle.x);
    result.y = read_out(idx, field, handle.y);
    result.z = read_out(idx, field, handle.z);
    return result;
}
|
||||
|
||||
// Kernel-side accessors. They expect `buffer`, `idx` and (for READ)
// `vertexIdx` to be in scope at the expansion site.
#define WRITE_OUT(dst_handle, new_value) (write(buffer.out, dst_handle, idx, new_value))
#define READ(src_handle) (read_data(vertexIdx, buffer.in, src_handle))
#define READ_OUT(src_handle) (read_out(idx, buffer.out, src_handle))
|
||||
|
||||
// Also write for clarity here, not for the DSL
|
||||
//#define WRITE(HANDLE) (write(idx, ...)) s.t. we don't have to insert boilerplate in the middle of the function
|
||||
|
||||
// Parameter list expanded into kernel signatures: the index range
// [start, end) to process and the vertex buffers to operate on.
#define GEN_KERNEL_PARAM_BOILERPLATE const int3 start, const int3 end, VertexBufferArray buffer
|
||||
|
||||
// Kernel prologue. Defines `vertexIdx` (the thread's 3D vertex index, offset
// by `start`) and `idx` (its linear index via IDX). Threads at or beyond
// `end` in any dimension return immediately; the asserts check that the
// vertex lies within [AC_n*_min, AC_n*_max).
#define GEN_KERNEL_BUILTIN_VARIABLES_BOILERPLATE()                                          \
    const int3 vertexIdx = (int3){threadIdx.x + blockIdx.x * blockDim.x + start.x,          \
                                  threadIdx.y + blockIdx.y * blockDim.y + start.y,          \
                                  threadIdx.z + blockIdx.z * blockDim.z + start.z};         \
    if (vertexIdx.x >= end.x || vertexIdx.y >= end.y || vertexIdx.z >= end.z)               \
        return;                                                                             \
                                                                                            \
    assert(vertexIdx.x < DCONST_INT(AC_nx_max) && vertexIdx.y < DCONST_INT(AC_ny_max) &&    \
           vertexIdx.z < DCONST_INT(AC_nz_max));                                            \
                                                                                            \
    assert(vertexIdx.x >= DCONST_INT(AC_nx_min) && vertexIdx.y >= DCONST_INT(AC_ny_min) &&  \
           vertexIdx.z >= DCONST_INT(AC_nz_min));                                           \
                                                                                            \
    const int idx = IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z);
|
||||
|
||||
#include "stencil_process.cuh"
|
||||
|
||||
/*
|
||||
* =============================================================================
|
||||
* Level 2 (Host calls)
|
||||
* =============================================================================
|
||||
*/
|
||||
|
||||
/** Pseudo-random real in [0, 1], drawn from rand(). */
static AcReal
randf(void)
{
    const AcReal sample  = AcReal(rand());
    const AcReal largest = AcReal(RAND_MAX);
    return sample / largest;
}
|
||||
|
||||
/**
 * Enqueues one RK3 substep on `stream`.
 *
 * When LFORCING is enabled, the forcing direction and phase are first
 * uploaded to the device constants forcing_vec / forcing_phi. The solve
 * kernel for the requested substep is then launched over the index range
 * [start, end).
 *
 * @param stream       CUDA stream the copies and the kernel are enqueued on.
 * @param tpb          Threads per block.
 * @param start        First vertex index (inclusive) in each dimension.
 * @param end          Last vertex index (exclusive) in each dimension.
 * @param step_number  Substep: 0 or 1 select solve<0> / solve<1>; any other
 *                     value launches solve<2>.
 * @param dt           Time step passed to the kernel.
 * @param buffer       Vertex buffers, passed to the kernel by value.
 * @return AC_SUCCESS.
 */
AcResult
rk3_step_async(const cudaStream_t stream, const dim3& tpb,
               const int3& start, const int3& end, const int& step_number,
               const AcReal dt, const AcMeshInfo& /*mesh_info*/,
               VertexBufferArray* buffer)
{
    /////////////////// Forcing
#if LFORCING
    // The forcing direction persists between calls (static) and is rotated
    // about z by a random angle of at most 1/8 of a degree on every call.
    const AcReal ff_scale = AcReal(.2);
    static AcReal3 ff = ff_scale * (AcReal3){1, 0, 0};
    const AcReal radians = randf() * AcReal(2*M_PI) / 360 / 8;
    const AcMatrix rotz = create_rotz(radians);
    ff = mul(rotz, ff);
    // NOTE(review): the return codes of the symbol copies are not checked.
    cudaMemcpyToSymbolAsync(forcing_vec, &ff, sizeof(ff), 0, cudaMemcpyHostToDevice, stream);

    const AcReal ff_phi = AcReal(M_PI);//AcReal(2 * M_PI) * randf();
    cudaMemcpyToSymbolAsync(forcing_phi, &ff_phi, sizeof(ff_phi), 0, cudaMemcpyHostToDevice, stream);
#endif // LFORCING
    //////////////////////////

    const int nx = end.x - start.x;
    const int ny = end.y - start.y;
    const int nz = end.z - start.z;

    // Blocks per grid: integer ceiling division, so the result does not
    // depend on the precision of AcReal.
    const dim3 bpg((unsigned int)((nx + tpb.x - 1) / tpb.x),
                   (unsigned int)((ny + tpb.y - 1) / tpb.y),
                   (unsigned int)((nz + tpb.z - 1) / tpb.z));

    // step_number is a runtime value, the kernel needs it at compile time
    if (step_number == 0)
        solve<0><<<bpg, tpb, 0, stream>>>(start, end, *buffer, dt);
    else if (step_number == 1)
        solve<1><<<bpg, tpb, 0, stream>>>(start, end, *buffer, dt);
    else
        solve<2><<<bpg, tpb, 0, stream>>>(start, end, *buffer, dt);

    ERRCHK_CUDA_KERNEL();
    return AC_SUCCESS;
}
|
||||
91
src/core/math_utils.h
Normal file
91
src/core/math_utils.h
Normal file
@@ -0,0 +1,91 @@
|
||||
/*
|
||||
Copyright (C) 2014-2018, Johannes Pekkilae, Miikka Vaeisalae.
|
||||
|
||||
This file is part of Astaroth.
|
||||
|
||||
Astaroth is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
Astaroth is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
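/**
 * @file
 * \brief Small host-side math helpers.
 *
 * Generic min/max/sum/clamp templates, a NaN/Inf validity check, a
 * pseudo-random real in [0, 1], component-wise int3 addition and
 * subtraction, and a power-of-two test.
 *
 */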
|
||||
#pragma once
|
||||
#include <math.h> // isnan, isinf
|
||||
#include <stdlib.h> // rand
|
||||
|
||||
/** Returns the larger of `a` and `b`; returns `b` when they compare equal. */
template <class T>
static inline const T
max(const T& a, const T& b)
{
    if (a > b)
        return a;
    return b;
}
|
||||
|
||||
/** Returns the smaller of `a` and `b`; returns `b` when they compare equal. */
template <class T>
static inline const T
min(const T& a, const T& b)
{
    if (a < b)
        return a;
    return b;
}
|
||||
|
||||
/** Returns `a + b`. */
template <class T>
static inline const T
sum(const T& a, const T& b)
{
    const T total = a + b;
    return total;
}
|
||||
|
||||
/**
 * Returns true when `val` is neither NaN nor infinite.
 *
 * The result type is bool. The previous declaration returned `const T`,
 * i.e. the truth value converted to the argument's own type.
 */
template <class T>
static inline bool
is_valid(const T& val)
{
    return !(isnan(val) || isinf(val));
}
|
||||
|
||||
/**
 * Limits `val` to the range [min, max]: returns `min` when val < min,
 * otherwise `max` when val > max, otherwise `val`.
 */
template <class T>
static inline const T
clamp(const T& val, const T& min, const T& max)
{
    if (val < min)
        return min;
    if (val > max)
        return max;
    return val;
}
|
||||
|
||||
/** Pseudo-random real in [0, 1], drawn from rand(). */
static inline AcReal
randr()
{
    const AcReal sample  = AcReal(rand());
    const AcReal largest = AcReal(RAND_MAX);
    return sample / largest;
}
|
||||
|
||||
/** Component-wise sum of two int3 values. */
static inline int3
operator+(const int3& a, const int3& b)
{
    int3 result;
    result.x = a.x + b.x;
    result.y = a.y + b.y;
    result.z = a.z + b.z;
    return result;
}
|
||||
|
||||
/** Component-wise difference a - b of two int3 values. */
static inline int3
operator-(const int3& a, const int3& b)
{
    int3 result;
    result.x = a.x - b.x;
    result.y = a.y - b.y;
    result.z = a.z - b.z;
    return result;
}
|
||||
|
||||
/** Returns true when `val` is a power of two; zero is not one. */
static inline bool
is_power_of_two(const unsigned val)
{
    if (val == 0)
        return false;
    // A power of two has exactly one bit set, so clearing the lowest set bit
    // leaves zero.
    return (val & (val - 1)) == 0;
}
|
||||
Reference in New Issue
Block a user