Reordered src/core to get a cleaner division between host and device code (this is more likely to work when compiling with mpicxx). Disabled separate compilation of CUDA kernels, as it complicates compilation and is a source of many CMake/CUDA bugs. As a downside, GPU code takes longer to compile.

jpekkila
2020-01-23 20:06:20 +02:00
parent 96389e9da6
commit 78fbcc090d
22 changed files with 1008 additions and 4305 deletions
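
The split works because the reorganized device.cc includes only a plain host-side header, kernels/kernels.h, instead of pulling .cuh kernel sources into the host translation unit, so mpicxx never has to parse device code. A minimal sketch of what such a boundary header could look like (the two launcher signatures appear in the diffs below; everything else here is an assumption, not the file's actual contents):

// kernels/kernels.h -- hypothetical sketch of the host-callable boundary.
// Only launcher declarations cross this line; the __global__ kernels behind
// them are compiled by nvcc alone inside kernels.cu.
#pragma once
#include <cuda_runtime_api.h> // cudaStream_t and int3 for the host-side API
#include "astaroth.h"

AcResult acKernelPeriodicBoundconds(const cudaStream_t stream, const int3 start, const int3 end,
                                    AcReal* vtxbuf);
AcResult acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferArray vba);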

View File

@@ -1,38 +1,4 @@
-########################################
-## CMakeLists.txt for Astaroth Core   ##
-########################################
-
-# Require C++11
-set(CMAKE_CXX_STANDARD 11)
-set(CMAKE_CXX_STANDARD_REQUIRED ON)
-set(CMAKE_CXX_FLAGS "-Wall -Wextra -Werror -Wdouble-promotion -Wfloat-conversion -Wshadow")
-set(CMAKE_CXX_FLAGS_DEBUG "-g")
-set(CMAKE_CXX_FLAGS_RELEASE "-O3")
-
-# Require CUDA
-find_package(CUDA REQUIRED)
-set(CMAKE_CUDA_FLAGS "-gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70")
-
-# Compile kernels
-add_library(astaroth_kernels STATIC kernels/boundconds.cu kernels/integration.cu kernels/reductions.cu kernels/packing.cu)
-target_include_directories(astaroth_kernels PRIVATE .)
-target_compile_features(astaroth_kernels PRIVATE cxx_std_11)
-set_target_properties(astaroth_kernels PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
-add_dependencies(astaroth_kernels dsl_headers)
-
-# Compile core
-add_library(astaroth_core STATIC astaroth.cc node.cc device.cc)
-target_include_directories(astaroth_core PRIVATE . ${CUDA_INCLUDE_DIRS})
-target_link_libraries(astaroth_core m astaroth_kernels ${CUDA_LIBRARIES})
-target_compile_definitions(astaroth_core PRIVATE AC_USE_CUDA_RUNTIME_API)
-set_target_properties(astaroth_core PROPERTIES POSITION_INDEPENDENT_CODE ON)
-add_dependencies(astaroth_kernels dsl_headers)
-
-if (MULTIGPU_ENABLED)
-    target_compile_definitions(astaroth_core PRIVATE AC_MULTIGPU_ENABLED=1)
-endif ()
-
-if (MPI_ENABLED)
-    target_link_libraries(astaroth_core astaroth_utils)
-    target_compile_definitions(astaroth_core PRIVATE AC_MPI_ENABLED=1)
-endif ()
+## Astaroth Core
+add_library(astaroth_core STATIC device.cc node.cc astaroth.cc)
+target_include_directories(astaroth_core PRIVATE ${CUDA_INCLUDE_DIRS})
+target_link_libraries(astaroth_core astaroth_utils astaroth_kernels cudart)

View File

@@ -19,40 +19,11 @@
 #include "astaroth.h"
 #include "errchk.h"
-#include "math_utils.h" // int3 + int3
-
-#define AC_GEN_STR(X) #X
-const char* intparam_names[] = {AC_FOR_BUILTIN_INT_PARAM_TYPES(AC_GEN_STR) //
-                                AC_FOR_USER_INT_PARAM_TYPES(AC_GEN_STR)};
-const char* int3param_names[] = {AC_FOR_BUILTIN_INT3_PARAM_TYPES(AC_GEN_STR) //
-                                 AC_FOR_USER_INT3_PARAM_TYPES(AC_GEN_STR)};
-const char* realparam_names[] = {AC_FOR_BUILTIN_REAL_PARAM_TYPES(AC_GEN_STR) //
-                                 AC_FOR_USER_REAL_PARAM_TYPES(AC_GEN_STR)};
-const char* real3param_names[] = {AC_FOR_BUILTIN_REAL3_PARAM_TYPES(AC_GEN_STR) //
-                                  AC_FOR_USER_REAL3_PARAM_TYPES(AC_GEN_STR)};
-const char* scalararray_names[] = {AC_FOR_SCALARARRAY_HANDLES(AC_GEN_STR)};
-const char* vtxbuf_names[] = {AC_FOR_VTXBUF_HANDLES(AC_GEN_STR)};
-#undef AC_GEN_STR
+#include "math_utils.h"
 
 static const int max_num_nodes = 1;
 static Node nodes[max_num_nodes] = {0};
 static int num_nodes = 0;
-
-void
-acPrintMeshInfo(const AcMeshInfo config)
-{
-    for (int i = 0; i < NUM_INT_PARAMS; ++i)
-        printf("[%s]: %d\n", intparam_names[i], config.int_params[i]);
-
-    for (int i = 0; i < NUM_INT3_PARAMS; ++i)
-        printf("[%s]: (%d, %d, %d)\n", int3param_names[i], config.int3_params[i].x,
-               config.int3_params[i].y, config.int3_params[i].z);
-
-    for (int i = 0; i < NUM_REAL_PARAMS; ++i)
-        printf("[%s]: %g\n", realparam_names[i], double(config.real_params[i]));
-
-    for (int i = 0; i < NUM_REAL3_PARAMS; ++i)
-        printf("[%s]: (%g, %g, %g)\n", real3param_names[i], double(config.real3_params[i].x),
-               double(config.real3_params[i].y), double(config.real3_params[i].z));
-}
 
 AcResult
 acInit(const AcMeshInfo mesh_info)
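
The AC_GEN_STR table removed above is an instance of the X-macro pattern: a list macro applies a caller-supplied macro to every handle, and #X stringifies each one. A self-contained sketch of the pattern (hypothetical parameter names, not Astaroth's real lists):

#include <cstdio>

// The list macro enumerates the handles; F is applied to each of them.
#define FOR_DEMO_PARAMS(F) F(AC_nx) F(AC_ny) F(AC_nz)

// Generator macros: one builds the enum, the other the matching name table.
#define GEN_ID(X) X,
#define GEN_STR(X) #X,

enum DemoParam { FOR_DEMO_PARAMS(GEN_ID) NUM_DEMO_PARAMS };
static const char* demo_param_names[] = {FOR_DEMO_PARAMS(GEN_STR)};

int main(void)
{
    for (int i = 0; i < NUM_DEMO_PARAMS; ++i)
        printf("%s = %d\n", demo_param_names[i], i); // prints AC_nx = 0, ...
    return 0;
}

Generating the enum and the string table from the same list is what keeps intparam_names[param] in sync with the parameter handle values.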

View File

@@ -1,60 +1,106 @@
-/*
-    Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.
-
-    This file is part of Astaroth.
-
-    Astaroth is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    Astaroth is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
-*/
-/**
- * @file
- * \brief Brief info.
- *
- * Detailed info.
- *
- */
-#include "astaroth_device.h"
-
-#include <string.h> // memcpy
-
-#include "astaroth_utils.h"
+#include "astaroth.h"
+
+#include <mpi.h>
+#include <string.h>
+
 #include "errchk.h"
 #include "math_utils.h"
-#include "timer_hires.h"
-
-#include "kernels/common.cuh"
+
+#include "kernels/kernels.h"
 
 #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0]))
 
-struct device_s {
-    int id;
-    AcMeshInfo local_config;
-
-    // Concurrency
-    cudaStream_t streams[NUM_STREAMS];
-
-    // Memory
-    VertexBufferArray vba;
-    AcReal* reduce_scratchpad;
-    AcReal* reduce_result;
-};
-
-#include "kernels/boundconds.cuh"
-#include "kernels/integration.cuh"
-#include "kernels/reductions.cuh"
-
-#if PACKED_DATA_TRANSFERS // TODO DEPRECATED, see AC_MPI_ENABLED instead
-// #include "kernels/pack_unpack.cuh"
-#endif
+AcResult
+acDevicePrintInfo(const Device device)
+{
+    const int device_id = device->id;
+
+    cudaDeviceProp props;
+    cudaGetDeviceProperties(&props, device_id);
+    printf("--------------------------------------------------\n");
+    printf("Device Number: %d\n", device_id);
+    const size_t bus_id_max_len = 128;
+    char bus_id[bus_id_max_len];
+    cudaDeviceGetPCIBusId(bus_id, bus_id_max_len, device_id);
+    printf(" PCI bus ID: %s\n", bus_id);
+    printf(" Device name: %s\n", props.name);
+    printf(" Compute capability: %d.%d\n", props.major, props.minor);
+
+    // Compute
+    printf(" Compute\n");
+    printf(" Clock rate (GHz): %g\n", props.clockRate / 1e6); // KHz -> GHz
+    printf(" Stream processors: %d\n", props.multiProcessorCount);
+    printf(" SP to DP flops performance ratio: %d:1\n", props.singleToDoublePrecisionPerfRatio);
+    printf(" Compute mode: %d\n",
+           (int)props.computeMode); // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g7eb25f5413a962faad0956d92bae10d0
+
+    // Memory
+    printf(" Global memory\n");
+    printf(" Memory Clock Rate (MHz): %d\n", props.memoryClockRate / (1000));
+    printf(" Memory Bus Width (bits): %d\n", props.memoryBusWidth);
+    printf(" Peak Memory Bandwidth (GiB/s): %f\n",
+           2 * (props.memoryClockRate * 1e3) * props.memoryBusWidth / (8. * 1024. * 1024. * 1024.));
+    printf(" ECC enabled: %d\n", props.ECCEnabled);
+
+    // Memory usage
+    size_t free_bytes, total_bytes;
+    cudaMemGetInfo(&free_bytes, &total_bytes);
+    const size_t used_bytes = total_bytes - free_bytes;
+    printf(" Total global mem: %.2f GiB\n", props.totalGlobalMem / (1024.0 * 1024 * 1024));
+    printf(" Gmem used (GiB): %.2f\n", used_bytes / (1024.0 * 1024 * 1024));
+    printf(" Gmem memory free (GiB): %.2f\n", free_bytes / (1024.0 * 1024 * 1024));
+    printf(" Gmem memory total (GiB): %.2f\n", total_bytes / (1024.0 * 1024 * 1024));
+    printf(" Caches\n");
+    printf(" Local L1 cache supported: %d\n", props.localL1CacheSupported);
+    printf(" Global L1 cache supported: %d\n", props.globalL1CacheSupported);
+    printf(" L2 size: %d KiB\n", props.l2CacheSize / (1024));
+    // MV: props.totalConstMem and props.sharedMemPerBlock cause assembler error
+    // MV: while compiling in TIARA gp cluster. Therefore commented out.
+    //!! printf(" Total const mem: %ld KiB\n", props.totalConstMem / (1024));
+    //!! printf(" Shared mem per block: %ld KiB\n", props.sharedMemPerBlock / (1024));
+    printf(" Other\n");
+    printf(" Warp size: %d\n", props.warpSize);
+    // printf(" Single to double perf. ratio: %dx\n",
+    // props.singleToDoublePrecisionPerfRatio); //Not supported with older CUDA
+    // versions
+    printf(" Stream priorities supported: %d\n", props.streamPrioritiesSupported);
+    printf("--------------------------------------------------\n");
+
+    return AC_SUCCESS;
+}
+
+AcResult
+acDeviceAutoOptimize(const Device device)
+{
+    cudaSetDevice(device->id);
+
+    const int3 start = (int3){
+        device->local_config.int_params[AC_nx_min],
+        device->local_config.int_params[AC_ny_min],
+        device->local_config.int_params[AC_nz_min],
+    };
+    const int3 end = (int3){
+        device->local_config.int_params[AC_nx_max],
+        device->local_config.int_params[AC_ny_max],
+        device->local_config.int_params[AC_nz_max],
+    };
+    return acKernelAutoOptimizeIntegration(start, end, device->vba);
+}
+
+AcResult
+acDeviceSynchronizeStream(const Device device, const Stream stream)
+{
+    cudaSetDevice(device->id);
+    if (stream == STREAM_ALL) {
+        cudaDeviceSynchronize();
+    }
+    else {
+        cudaStreamSynchronize(device->streams[stream]);
+    }
+    return AC_SUCCESS;
+}
 
 AcResult
 acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device_handle)
@@ -147,96 +193,6 @@ acDeviceDestroy(Device device)
     return AC_SUCCESS;
 }
 
-AcResult
-acDevicePrintInfo(const Device device)
-{
-    const int device_id = device->id;
-
-    cudaDeviceProp props;
-    cudaGetDeviceProperties(&props, device_id);
-    printf("--------------------------------------------------\n");
-    printf("Device Number: %d\n", device_id);
-    const size_t bus_id_max_len = 128;
-    char bus_id[bus_id_max_len];
-    cudaDeviceGetPCIBusId(bus_id, bus_id_max_len, device_id);
-    printf(" PCI bus ID: %s\n", bus_id);
-    printf(" Device name: %s\n", props.name);
-    printf(" Compute capability: %d.%d\n", props.major, props.minor);
-
-    // Compute
-    printf(" Compute\n");
-    printf(" Clock rate (GHz): %g\n", props.clockRate / 1e6); // KHz -> GHz
-    printf(" Stream processors: %d\n", props.multiProcessorCount);
-    printf(" SP to DP flops performance ratio: %d:1\n", props.singleToDoublePrecisionPerfRatio);
-    printf(" Compute mode: %d\n",
-           (int)props.computeMode); // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g7eb25f5413a962faad0956d92bae10d0
-
-    // Memory
-    printf(" Global memory\n");
-    printf(" Memory Clock Rate (MHz): %d\n", props.memoryClockRate / (1000));
-    printf(" Memory Bus Width (bits): %d\n", props.memoryBusWidth);
-    printf(" Peak Memory Bandwidth (GiB/s): %f\n",
-           2 * (props.memoryClockRate * 1e3) * props.memoryBusWidth / (8. * 1024. * 1024. * 1024.));
-    printf(" ECC enabled: %d\n", props.ECCEnabled);
-
-    // Memory usage
-    size_t free_bytes, total_bytes;
-    cudaMemGetInfo(&free_bytes, &total_bytes);
-    const size_t used_bytes = total_bytes - free_bytes;
-    printf(" Total global mem: %.2f GiB\n", props.totalGlobalMem / (1024.0 * 1024 * 1024));
-    printf(" Gmem used (GiB): %.2f\n", used_bytes / (1024.0 * 1024 * 1024));
-    printf(" Gmem memory free (GiB): %.2f\n", free_bytes / (1024.0 * 1024 * 1024));
-    printf(" Gmem memory total (GiB): %.2f\n", total_bytes / (1024.0 * 1024 * 1024));
-    printf(" Caches\n");
-    printf(" Local L1 cache supported: %d\n", props.localL1CacheSupported);
-    printf(" Global L1 cache supported: %d\n", props.globalL1CacheSupported);
-    printf(" L2 size: %d KiB\n", props.l2CacheSize / (1024));
-    // MV: props.totalConstMem and props.sharedMemPerBlock cause assembler error
-    // MV: while compiling in TIARA gp cluster. Therefore commented out.
-    //!! printf(" Total const mem: %ld KiB\n", props.totalConstMem / (1024));
-    //!! printf(" Shared mem per block: %ld KiB\n", props.sharedMemPerBlock / (1024));
-    printf(" Other\n");
-    printf(" Warp size: %d\n", props.warpSize);
-    // printf(" Single to double perf. ratio: %dx\n",
-    // props.singleToDoublePrecisionPerfRatio); //Not supported with older CUDA
-    // versions
-    printf(" Stream priorities supported: %d\n", props.streamPrioritiesSupported);
-    printf("--------------------------------------------------\n");
-
-    return AC_SUCCESS;
-}
-
-AcResult
-acDeviceAutoOptimize(const Device device)
-{
-    cudaSetDevice(device->id);
-
-    const int3 start = (int3){
-        device->local_config.int_params[AC_nx_min],
-        device->local_config.int_params[AC_ny_min],
-        device->local_config.int_params[AC_nz_min],
-    };
-    const int3 end = (int3){
-        device->local_config.int_params[AC_nx_max],
-        device->local_config.int_params[AC_ny_max],
-        device->local_config.int_params[AC_nz_max],
-    };
-    return acKernelAutoOptimizeIntegration(start, end, device->vba);
-}
-
-AcResult
-acDeviceSynchronizeStream(const Device device, const Stream stream)
-{
-    cudaSetDevice(device->id);
-    if (stream == STREAM_ALL) {
-        cudaDeviceSynchronize();
-    }
-    else {
-        cudaStreamSynchronize(device->streams[stream]);
-    }
-    return AC_SUCCESS;
-}
-
 AcResult
 acDeviceSwapBuffers(const Device device)
 {
@@ -249,66 +205,6 @@ acDeviceSwapBuffers(const Device device)
     return AC_SUCCESS;
 }
 
-AcResult
-acDeviceLoadScalarUniform(const Device device, const Stream stream, const AcRealParam param,
-                          const AcReal value)
-{
-    cudaSetDevice(device->id);
-
-    if (param >= NUM_REAL_PARAMS)
-        return AC_FAILURE;
-
-    const size_t offset = (size_t)&d_mesh_info.real_params[param] - (size_t)&d_mesh_info;
-    ERRCHK_CUDA(cudaMemcpyToSymbolAsync(&d_mesh_info, &value, sizeof(value), offset,
-                                        cudaMemcpyHostToDevice, device->streams[stream]));
-    return AC_SUCCESS;
-}
-
-AcResult
-acDeviceLoadVectorUniform(const Device device, const Stream stream, const AcReal3Param param,
-                          const AcReal3 value)
-{
-    cudaSetDevice(device->id);
-
-    if (param >= NUM_REAL3_PARAMS || !NUM_REAL3_PARAMS)
-        return AC_FAILURE;
-
-    const size_t offset = (size_t)&d_mesh_info.real3_params[param] - (size_t)&d_mesh_info;
-    ERRCHK_CUDA(cudaMemcpyToSymbolAsync(&d_mesh_info, &value, sizeof(value), offset,
-                                        cudaMemcpyHostToDevice, device->streams[stream]));
-    return AC_SUCCESS;
-}
-
-AcResult
-acDeviceLoadIntUniform(const Device device, const Stream stream, const AcIntParam param,
-                       const int value)
-{
-    cudaSetDevice(device->id);
-
-    if (param >= NUM_INT_PARAMS)
-        return AC_FAILURE;
-
-    const size_t offset = (size_t)&d_mesh_info.int_params[param] - (size_t)&d_mesh_info;
-    ERRCHK_CUDA(cudaMemcpyToSymbolAsync(&d_mesh_info, &value, sizeof(value), offset,
-                                        cudaMemcpyHostToDevice, device->streams[stream]));
-    return AC_SUCCESS;
-}
-
-AcResult
-acDeviceLoadInt3Uniform(const Device device, const Stream stream, const AcInt3Param param,
-                        const int3 value)
-{
-    cudaSetDevice(device->id);
-
-    if (param >= NUM_INT3_PARAMS)
-        return AC_FAILURE;
-
-    const size_t offset = (size_t)&d_mesh_info.int3_params[param] - (size_t)&d_mesh_info;
-    ERRCHK_CUDA(cudaMemcpyToSymbolAsync(&d_mesh_info, &value, sizeof(value), offset,
-                                        cudaMemcpyHostToDevice, device->streams[stream]));
-    return AC_SUCCESS;
-}
-
 AcResult
 acDeviceLoadScalarArray(const Device device, const Stream stream, const ScalarArrayHandle handle,
                         const size_t start, const AcReal* data, const size_t num)
@@ -327,22 +223,6 @@ acDeviceLoadScalarArray(const Device device, const Stream stream, const ScalarAr
     return AC_SUCCESS;
 }
 
-AcResult
-acDeviceLoadMeshInfo(const Device device, const Stream stream, const AcMeshInfo device_config)
-{
-    cudaSetDevice(device->id);
-
-    ERRCHK_ALWAYS(device_config.int_params[AC_nx] == device->local_config.int_params[AC_nx]);
-    ERRCHK_ALWAYS(device_config.int_params[AC_ny] == device->local_config.int_params[AC_ny]);
-    ERRCHK_ALWAYS(device_config.int_params[AC_nz] == device->local_config.int_params[AC_nz]);
-    ERRCHK_ALWAYS(device_config.int_params[AC_multigpu_offset] ==
-                  device->local_config.int_params[AC_multigpu_offset]);
-
-    ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbolAsync(&d_mesh_info, &device_config, sizeof(device_config),
-                                               0, cudaMemcpyHostToDevice, device->streams[stream]));
-    return AC_SUCCESS;
-}
-
 AcResult
 acDeviceLoadVertexBufferWithOffset(const Device device, const Stream stream, const AcMesh host_mesh,
                                    const VertexBufferHandle vtxbuf_handle, const int3 src,
@@ -578,31 +458,6 @@ acDeviceReduceVec(const Device device, const Stream stream, const ReductionType
     return AC_SUCCESS;
 }
 
-#if PACKED_DATA_TRANSFERS // TODO DEPRECATED, see AC_MPI_ENABLED instead
-// Functions for calling packed data transfers
-#endif
-
-#if AC_MPI_ENABLED == 0
-AcResult
-acDeviceRunMPITest(void)
-{
-    WARNING("MPI was not enabled but acDeviceRunMPITest() was called");
-    return AC_FAILURE;
-}
-#else // MPI_ENABLED ///////////////////////////////////////////////////////////////////////////////
-
-#include <mpi.h>
-
-// Kernels
-#include "kernels/packing.cuh"
-
-// From Astaroth Utils
-#include "src/utils/config_loader.h"
-#include "src/utils/memory.h"
-#include "src/utils/timer_hires.h"
-#include "src/utils/verification.h"
-
-#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0]))
-
 static int
 mod(const int a, const int b)
 {
@@ -881,6 +736,8 @@ acDeviceGatherMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* dst)
     return AC_SUCCESS;
 }
 
+/*
+// Deprecated
 static AcResult
 acDeviceCommunicateBlocksMPI(const Device device, //
                              const int3* a0s,     // Src idx inside comp. domain
@@ -977,6 +834,7 @@ acDeviceCommunicateBlocksMPI(const Device device, //
     return AC_SUCCESS;
 }
+*/
 
 typedef struct {
     PackedData* srcs;
@@ -1558,7 +1416,7 @@ acDeviceRunMPITest(void)
     };
     submesh_info.int3_params[AC_multigpu_offset] = (int3){-1, -1, -1}; // TODO
     WARNING("AC_multigpu_offset not yet implemented");
-    acUpdateConfig(&submesh_info);
+    acUpdateBuiltinParams(&submesh_info);
     ERRCHK_ALWAYS(is_valid(submesh_info.real_params[AC_inv_dsx]));
     ERRCHK_ALWAYS(is_valid(submesh_info.real_params[AC_cs2_sound]));
@@ -1630,4 +1488,3 @@ acDeviceRunMPITest(void)
     //////////////////////////////////////////////////////////////
     return AC_SUCCESS;
 }
-#endif // MPI_ENABLED //////////////////////////////////////////////////////////////////////////////
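
The acDeviceLoad*Uniform functions removed above (presumably reappearing behind the kernels interface, whose large diff is suppressed further below) all rely on one trick: compute a member's byte offset within AcMeshInfo on the host and patch only that member of the device-side __constant__ copy. A self-contained sketch of the technique with a simplified struct (the demo names are illustrative; unlike the code above, it passes the symbol itself to cudaMemcpyToSymbolAsync, which is the form the templated runtime API documents):

#include <cstddef> // offsetof
#include <cuda_runtime.h>

typedef struct {
    int   int_params[8];
    float real_params[8];
} DemoInfo;

__constant__ DemoInfo d_demo_info; // device-resident mirror of the host config

// Update a single member of the __constant__ struct, leaving the rest intact.
static cudaError_t
demoLoadRealUniform(const int param, const float value, const cudaStream_t stream)
{
    // Byte offset of real_params[param] within the struct
    const size_t offset = offsetof(DemoInfo, real_params) + param * sizeof(float);
    return cudaMemcpyToSymbolAsync(d_demo_info, &value, sizeof(value), offset,
                                   cudaMemcpyHostToDevice, stream);
}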

View File

@@ -1,113 +0,0 @@
/*
Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.
This file is part of Astaroth.
Astaroth is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Astaroth is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* @file
* \brief Brief info.
*
* Detailed info.
*
*/
#pragma once
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
// clang-format off
/*
* =============================================================================
* General error checking
* =============================================================================
*/
#define ERROR(str) \
{ \
time_t t; time(&t); \
fprintf(stderr, "%s", ctime(&t)); \
fprintf(stderr, "\tError in file %s line %d: %s\n", \
__FILE__, __LINE__, str); \
fflush(stderr); \
exit(EXIT_FAILURE); \
abort(); \
}
#define WARNING(str) \
{ \
time_t t; time(&t); \
fprintf(stderr, "%s", ctime(&t)); \
fprintf(stderr, "\tWarning in file %s line %d: %s\n", \
__FILE__, __LINE__, str); \
fflush(stderr); \
}
// DO NOT REMOVE BRACKETS AROUND RETVAL. F.ex. if (!a < b) vs if (!(a < b)).
#define ERRCHK(retval) { if (!(retval)) ERROR(#retval " was false"); }
#define WARNCHK(retval) { if (!(retval)) WARNING(#retval " was false"); }
#define ERRCHK_ALWAYS(retval) { if (!(retval)) ERROR(#retval " was false"); }
/*
* =============================================================================
* CUDA-specific error checking
* =============================================================================
*/
#if defined(AC_USE_CUDA_RUNTIME_API) || defined(__CUDACC__)
static inline void
cuda_assert(cudaError_t code, const char* file, int line, bool abort = true)
{
if (code != cudaSuccess) {
time_t t; time(&t); \
fprintf(stderr, "%s", ctime(&t)); \
fprintf(stderr, "\tCUDA error in file %s line %d: %s\n", \
file, line, cudaGetErrorString(code)); \
fflush(stderr); \
if (abort)
exit(code);
}
}
#ifdef NDEBUG
#undef ERRCHK
#undef WARNCHK
#define ERRCHK(params)
#define WARNCHK(params)
#define ERRCHK_CUDA(params) params
#define WARNCHK_CUDA(params) params
#define ERRCHK_CUDA_KERNEL() {}
#else
#define ERRCHK_CUDA(params) { cuda_assert((params), __FILE__, __LINE__); }
#define WARNCHK_CUDA(params) { cuda_assert((params), __FILE__, __LINE__, false); }
#define ERRCHK_CUDA_KERNEL() \
{ \
ERRCHK_CUDA(cudaPeekAtLastError()); \
ERRCHK_CUDA(cudaDeviceSynchronize()); \
}
#endif
#define ERRCHK_CUDA_ALWAYS(params) { cuda_assert((params), __FILE__, __LINE__); }
#define ERRCHK_CUDA_KERNEL_ALWAYS() \
{ \
ERRCHK_CUDA_ALWAYS(cudaPeekAtLastError()); \
ERRCHK_CUDA_ALWAYS(cudaDeviceSynchronize()); \
}
#define WARNCHK_CUDA_ALWAYS(params) { cuda_assert((params), __FILE__, __LINE__, false); }
// clang-format on
#endif // AC_USE_CUDA_RUNTIME_API
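
For reference, a short usage sketch of the macros this (now removed) header defined; the kernel and buffer size are hypothetical:

#include <cuda_runtime.h>
#include "errchk.h" // the header shown above, wherever it now lives

__global__ void demo_kernel(float* out) { out[threadIdx.x] = 1.0f; }

static void
demo(void)
{
    float* buf = NULL;
    ERRCHK_CUDA(cudaMalloc(&buf, 32 * sizeof(float))); // wraps the runtime API return code
    ERRCHK(buf != NULL);                               // generic check; compiled out under NDEBUG
    demo_kernel<<<1, 32>>>(buf);
    // Launches fail asynchronously: peek at the launch error, then synchronize
    // so runtime faults inside the kernel surface here rather than later.
    ERRCHK_CUDA_KERNEL();
    ERRCHK_CUDA(cudaFree(buf));
}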

View File

@@ -1,194 +0,0 @@
/*
Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.
This file is part of Astaroth.
Astaroth is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Astaroth is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
#include "astaroth_grid.h"
#include "astaroth_device.h"
#include "errchk.h"
/** */
AcResult
acGridInit(const AcMeshInfo info)
{
int num_processes, pid;
MPI_Init(NULL, NULL);
MPI_Comm_size(MPI_COMM_WORLD, &num_processes);
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
char processor_name[MPI_MAX_PROCESSOR_NAME];
int name_len;
MPI_Get_processor_name(processor_name, &name_len);
printf("Processor %s. Process %d of %d.\n", processor_name, pid, num_processes);
AcMesh submesh;
return AC_FAILURE;
}
/** */
AcResult
acGridQuit(void)
{
WARNING("Proper multinode not yet implemented");
return AC_FAILURE;
}
#if 0
/** */
AcResult
acGridSynchronizeStream(const Stream stream)
{
WARNING("Proper multinode not yet implemented");
return AC_FAILURE;
}
/** */
AcResult
acGridSwapBuffers(void)
{
WARNING("Proper multinode not yet implemented");
return AC_FAILURE;
}
/** */
AcResult
acGridLoadConstant(const Stream stream, const AcRealParam param, const AcReal value)
{
WARNING("Proper multinode not yet implemented");
return AC_FAILURE;
}
/** */
AcResult
acGridLoadVertexBufferWithOffset(const Stream stream, const AcMesh host_mesh,
const VertexBufferHandle vtxbuf_handle, const int3 src,
const int3 dst, const int num_vertices)
{
WARNING("Proper multinode not yet implemented");
return AC_FAILURE;
}
/** */
AcResult
acGridLoadMeshWithOffset(const Stream stream, const AcMesh host_mesh, const int3 src,
const int3 dst, const int num_vertices)
{
WARNING("Proper multinode not yet implemented");
return AC_FAILURE;
}
/** */
AcResult
acGridLoadVertexBuffer(const Stream stream, const AcMesh host_mesh,
const VertexBufferHandle vtxbuf_handle)
{
WARNING("Proper multinode not yet implemented");
return AC_FAILURE;
}
#endif
/** */
AcResult
acGridLoadMesh(const Stream stream, const AcMesh host_mesh)
{
(void)stream;
(void)host_mesh;
WARNING("Not implemented");
return AC_FAILURE;
}
#if 0
/** */
AcResult
acGridStoreVertexBufferWithOffset(const Stream stream, const VertexBufferHandle vtxbuf_handle,
const int3 src, const int3 dst, const int num_vertices,
AcMesh* host_mesh)
{
WARNING("Not implemented");
return AC_FAILURE;
}
/** */
AcResult
acGridStoreMeshWithOffset(const Stream stream, const int3 src, const int3 dst,
const int num_vertices, AcMesh* host_mesh)
{
WARNING("Not implemented");
return AC_FAILURE;
}
/** */
AcResult
acGridStoreVertexBuffer(const Stream stream, const VertexBufferHandle vtxbuf_handle,
AcMesh* host_mesh)
{
WARNING("Not implemented");
return AC_FAILURE;
}
#endif
/** */
AcResult
acGridStoreMesh(const Stream stream, AcMesh* host_mesh)
{
(void)stream;
(void)host_mesh;
WARNING("Not implemented");
return AC_FAILURE;
}
#if 0
/** */
AcResult
acGridIntegrateSubstep(const Stream stream, const int step_number, const int3 start, const int3 end,
const AcReal dt)
{
WARNING("Not implemented");
return AC_FAILURE;
}
/** */
AcResult
acGridIntegrateStep(const Stream stream, const AcReal dt)
{
WARNING("Not implemented");
return AC_FAILURE;
}
/** */
AcResult
acGridPeriodicBoundcondStep(const Stream stream)
{
WARNING("Not implemented");
return AC_FAILURE;
}
/** */
AcResult
acGridReduceScal(const Stream stream, const ReductionType rtype,
const VertexBufferHandle vtxbuf_handle, AcReal* result)
{
return AC_FAILURE;
}
/** */
AcResult
acGridReduceVec(const Stream stream, const ReductionType rtype, const VertexBufferHandle vec0,
const VertexBufferHandle vec1, const VertexBufferHandle vec2, AcReal* result)
{
return AC_FAILURE;
}
#endif

View File

@@ -0,0 +1,3 @@
+## Astaroth Kernels
+add_library(astaroth_kernels STATIC kernels.cu)
+add_dependencies(astaroth_kernels dsl_headers)
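
The whole kernel library is now this single translation unit. The kernels.cu diff itself is suppressed further below, so its exact contents are not visible here; a plausible sketch of the aggregation pattern such a file would use, given that boundconds.cuh now carries its kernel inline (see its diff below) and the other kernel .cu files were deleted:

// kernels.cu -- hypothetical sketch of the single translation unit.
// nvcc sees all device code at once (whole-program compilation), so no
// separate device-link step is needed; the price is a longer compile.
#include "astaroth.h"

#include "boundconds.cuh" // periodic boundconds kernel + launcher (shown below)
// ... the integration, reduction, and packing kernels either #included the
// same way or defined directly in this file (assumption; diff suppressed) ...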

View File

@@ -1,91 +0,0 @@
/*
Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.
This file is part of Astaroth.
Astaroth is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Astaroth is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* @file
* \brief Brief info.
*
* Detailed info.
*
*/
#include "boundconds.cuh"
#include "common.cuh"
#include "src/core/errchk.h"
__global__ void
kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vtxbuf)
{
const int i_dst = start.x + threadIdx.x + blockIdx.x * blockDim.x;
const int j_dst = start.y + threadIdx.y + blockIdx.y * blockDim.y;
const int k_dst = start.z + threadIdx.z + blockIdx.z * blockDim.z;
// If within the start-end range (this allows threadblock dims that are not
// divisible by end - start)
if (i_dst >= end.x || j_dst >= end.y || k_dst >= end.z)
return;
// if (i_dst >= DCONST(AC_mx) || j_dst >= DCONST(AC_my) || k_dst >= DCONST(AC_mz))
// return;
// If destination index is inside the computational domain, return since
// the boundary conditions are only applied to the ghost zones
if (i_dst >= DCONST(AC_nx_min) && i_dst < DCONST(AC_nx_max) && j_dst >= DCONST(AC_ny_min) &&
j_dst < DCONST(AC_ny_max) && k_dst >= DCONST(AC_nz_min) && k_dst < DCONST(AC_nz_max))
return;
// Find the source index
// Map to nx, ny, nz coordinates
int i_src = i_dst - DCONST(AC_nx_min);
int j_src = j_dst - DCONST(AC_ny_min);
int k_src = k_dst - DCONST(AC_nz_min);
// Translate (s.t. the index is always positive)
i_src += DCONST(AC_nx);
j_src += DCONST(AC_ny);
k_src += DCONST(AC_nz);
// Wrap
i_src %= DCONST(AC_nx);
j_src %= DCONST(AC_ny);
k_src %= DCONST(AC_nz);
// Map to mx, my, mz coordinates
i_src += DCONST(AC_nx_min);
j_src += DCONST(AC_ny_min);
k_src += DCONST(AC_nz_min);
const int src_idx = DEVICE_VTXBUF_IDX(i_src, j_src, k_src);
const int dst_idx = DEVICE_VTXBUF_IDX(i_dst, j_dst, k_dst);
vtxbuf[dst_idx] = vtxbuf[src_idx];
}
AcResult
acKernelPeriodicBoundconds(const cudaStream_t stream, const int3 start, const int3 end,
AcReal* vtxbuf)
{
const dim3 tpb(8, 2, 8);
const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x),
(unsigned int)ceil((end.y - start.y) / (float)tpb.y),
(unsigned int)ceil((end.z - start.z) / (float)tpb.z));
kernel_periodic_boundconds<<<bpg, tpb, 0, stream>>>(start, end, vtxbuf);
ERRCHK_CUDA_KERNEL();
return AC_SUCCESS;
}

View File

@@ -1,40 +1,62 @@
-/*
-    Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.
-
-    This file is part of Astaroth.
-
-    Astaroth is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    Astaroth is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
-*/
-/**
- * @file
- * \brief Brief info.
- *
- * Detailed info.
- *
- */
 #pragma once
-#include "astaroth.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-AcResult acKernelPeriodicBoundconds(const cudaStream_t stream, const int3 start, const int3 end,
-                                    AcReal* vtxbuf);
-
-#ifdef __cplusplus
-} // extern "C"
-#endif
+
+static __global__ void
+kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vtxbuf)
+{
+    const int i_dst = start.x + threadIdx.x + blockIdx.x * blockDim.x;
+    const int j_dst = start.y + threadIdx.y + blockIdx.y * blockDim.y;
+    const int k_dst = start.z + threadIdx.z + blockIdx.z * blockDim.z;
+
+    // If within the start-end range (this allows threadblock dims that are not
+    // divisible by end - start)
+    if (i_dst >= end.x || j_dst >= end.y || k_dst >= end.z)
+        return;
+
+    // if (i_dst >= DCONST(AC_mx) || j_dst >= DCONST(AC_my) || k_dst >= DCONST(AC_mz))
+    //     return;
+
+    // If destination index is inside the computational domain, return since
+    // the boundary conditions are only applied to the ghost zones
+    if (i_dst >= DCONST(AC_nx_min) && i_dst < DCONST(AC_nx_max) && j_dst >= DCONST(AC_ny_min) &&
+        j_dst < DCONST(AC_ny_max) && k_dst >= DCONST(AC_nz_min) && k_dst < DCONST(AC_nz_max))
+        return;
+
+    // Find the source index
+    // Map to nx, ny, nz coordinates
+    int i_src = i_dst - DCONST(AC_nx_min);
+    int j_src = j_dst - DCONST(AC_ny_min);
+    int k_src = k_dst - DCONST(AC_nz_min);
+
+    // Translate (s.t. the index is always positive)
+    i_src += DCONST(AC_nx);
+    j_src += DCONST(AC_ny);
+    k_src += DCONST(AC_nz);
+
+    // Wrap
+    i_src %= DCONST(AC_nx);
+    j_src %= DCONST(AC_ny);
+    k_src %= DCONST(AC_nz);
+
+    // Map to mx, my, mz coordinates
+    i_src += DCONST(AC_nx_min);
+    j_src += DCONST(AC_ny_min);
+    k_src += DCONST(AC_nz_min);
+
+    const int src_idx = DEVICE_VTXBUF_IDX(i_src, j_src, k_src);
+    const int dst_idx = DEVICE_VTXBUF_IDX(i_dst, j_dst, k_dst);
+    vtxbuf[dst_idx] = vtxbuf[src_idx];
+}
+
+AcResult
+acKernelPeriodicBoundconds(const cudaStream_t stream, const int3 start, const int3 end,
+                           AcReal* vtxbuf)
+{
+    const dim3 tpb(8, 2, 8);
+    const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x),
+                   (unsigned int)ceil((end.y - start.y) / (float)tpb.y),
+                   (unsigned int)ceil((end.z - start.z) / (float)tpb.z));
+
+    kernel_periodic_boundconds<<<bpg, tpb, 0, stream>>>(start, end, vtxbuf);
+    ERRCHK_CUDA_KERNEL();
+    return AC_SUCCESS;
+}
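
The translate-then-wrap arithmetic in the kernel is easiest to check on one axis in isolation. A host-side sketch of the same mapping (names are illustrative):

#include <cstdio>

// Map a ghost-zone index dst in padded [0, m) coordinates to its periodic
// source, where the computational domain occupies [n_min, n_min + n).
static int
periodic_src(const int dst, const int n_min, const int n)
{
    int src = dst - n_min; // to 0-based computational coordinates
    src += n;              // translate so the value is non-negative before %
    src %= n;              // wrap into [0, n)
    return src + n_min;    // back to padded coordinates
}

int main(void)
{
    // n = 128 interior cells with a 3-wide ghost zone (n_min = 3): the
    // leftmost ghost cell (padded index 0, computational index -3) wraps to
    // computational index 125, i.e. padded index 128.
    printf("%d\n", periodic_src(0, 3, 128)); // prints 128
    return 0;
}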

View File

@@ -1,127 +0,0 @@
/*
Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.
This file is part of Astaroth.
Astaroth is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Astaroth is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* @file
* \brief Brief info.
*
* Detailed info.
*
*/
#pragma once
#include "astaroth.h"
extern __constant__ AcMeshInfo d_mesh_info;
typedef struct {
AcReal* in[NUM_VTXBUF_HANDLES];
AcReal* out[NUM_VTXBUF_HANDLES];
AcReal* profiles[NUM_SCALARARRAY_HANDLES];
} VertexBufferArray;
static int __device__ __forceinline__
DCONST(const AcIntParam param)
{
return d_mesh_info.int_params[param];
}
static int3 __device__ __forceinline__
DCONST(const AcInt3Param param)
{
return d_mesh_info.int3_params[param];
}
static AcReal __device__ __forceinline__
DCONST(const AcRealParam param)
{
return d_mesh_info.real_params[param];
}
static AcReal3 __device__ __forceinline__
DCONST(const AcReal3Param param)
{
return d_mesh_info.real3_params[param];
}
static __device__ constexpr VertexBufferHandle
DCONST(const VertexBufferHandle handle)
{
return handle;
}
#define DEVICE_VTXBUF_IDX(i, j, k) ((i) + (j)*DCONST(AC_mx) + (k)*DCONST(AC_mxy))
#define DEVICE_1D_COMPDOMAIN_IDX(i, j, k) ((i) + (j)*DCONST(AC_nx) + (k)*DCONST(AC_nxy))
#define globalGridN (d_mesh_info.int3_params[AC_global_grid_n])
//#define globalMeshM // Placeholder
//#define localMeshN // Placeholder
//#define localMeshM // Placeholder
//#define localMeshN_min // Placeholder
//#define globalMeshN_min // Placeholder
#define d_multigpu_offset (d_mesh_info.int3_params[AC_multigpu_offset])
//#define d_multinode_offset (d_mesh_info.int3_params[AC_multinode_offset]) // Placeholder
static __device__ constexpr int
IDX(const int i)
{
return i;
}
static __device__ __forceinline__ int
IDX(const int i, const int j, const int k)
{
return DEVICE_VTXBUF_IDX(i, j, k);
}
static __device__ __forceinline__ int
IDX(const int3 idx)
{
return DEVICE_VTXBUF_IDX(idx.x, idx.y, idx.z);
}
//#include <thrust/complex.h>
// using namespace thrust;
#include <cuComplex.h>
#if AC_DOUBLE_PRECISION == 1
typedef cuDoubleComplex acComplex;
#define acComplex(x, y) make_cuDoubleComplex(x, y)
#else
typedef cuFloatComplex acComplex;
#define acComplex(x, y) make_cuFloatComplex(x, y)
#define exp(x) expf(x)
#define sin(x) sinf(x)
#define cos(x) cosf(x)
#define sqrt(x) sqrtf(x)
#endif
static __device__ inline acComplex
exp(const acComplex& val)
{
return acComplex(exp(val.x) * cos(val.y), exp(val.x) * sin(val.y));
}
static __device__ inline acComplex operator*(const AcReal& a, const acComplex& b)
{
return (acComplex){a * b.x, a * b.y};
}
static __device__ inline acComplex operator*(const acComplex& b, const AcReal& a)
{
return (acComplex){a * b.x, a * b.y};
}
static __device__ inline acComplex operator*(const acComplex& a, const acComplex& b)
{
return (acComplex){a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x};
}
//#include <complex>
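
The overloads above give acComplex full complex arithmetic; in particular the exp() overload is Euler's formula, exp(x + iy) = e^x (cos y + i sin y). A small device-side sketch using them (hypothetical helper, assuming the single-precision path where AC_DOUBLE_PRECISION == 0):

static __device__ acComplex
demo_rotate(const acComplex z, const AcReal theta)
{
    // exp(acComplex(0, theta)) = (cos theta, sin theta) by Euler's formula,
    // so this multiplication rotates z by theta in the complex plane.
    return exp(acComplex(0, theta)) * z;
}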

File diff suppressed because it is too large

View File

@@ -1,338 +0,0 @@
/*
Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.
This file is part of Astaroth.
Astaroth is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Astaroth is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* @file
* \brief Brief info.
*
* Detailed info.
*
*/
#pragma once
#include "device_globals.cuh"
#include "src/core/errchk.h"
#include "src/core/math_utils.h"
// Function pointer definitions
typedef AcReal (*ReduceFunc)(const AcReal&, const AcReal&);
typedef AcReal (*ReduceInitialScalFunc)(const AcReal&);
typedef AcReal (*ReduceInitialVecFunc)(const AcReal&, const AcReal&,
const AcReal&);
// clang-format off
/* Comparison funcs */
__device__ inline AcReal
_device_max(const AcReal& a, const AcReal& b) { return a > b ? a : b; }
__device__ inline AcReal
_device_min(const AcReal& a, const AcReal& b) { return a < b ? a : b; }
__device__ inline AcReal
_device_sum(const AcReal& a, const AcReal& b) { return a + b; }
/* Function used to determine the values used during reduction */
__device__ inline AcReal
_device_length_scal(const AcReal& a) { return AcReal(a); }
__device__ inline AcReal
_device_squared_scal(const AcReal& a) { return (AcReal)(a*a); }
__device__ inline AcReal
_device_exp_squared_scal(const AcReal& a) { return exp(a)*exp(a); }
__device__ inline AcReal
_device_length_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return sqrt(a*a + b*b + c*c); }
__device__ inline AcReal
_device_squared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return _device_squared_scal(a) + _device_squared_scal(b) + _device_squared_scal(c); }
__device__ inline AcReal
_device_exp_squared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return _device_exp_squared_scal(a) + _device_exp_squared_scal(b) + _device_exp_squared_scal(c); }
// clang-format on
__device__ inline bool
oob(const int& i, const int& j, const int& k)
{
if (i >= d_mesh_info.int_params[AC_nx] ||
j >= d_mesh_info.int_params[AC_ny] ||
k >= d_mesh_info.int_params[AC_nz])
return true;
else
return false;
}
template <ReduceInitialScalFunc reduce_initial>
__global__ void
_kernel_reduce_scal(const __restrict__ AcReal* src, AcReal* dst)
{
const int i = threadIdx.x + blockIdx.x * blockDim.x;
const int j = threadIdx.y + blockIdx.y * blockDim.y;
const int k = threadIdx.z + blockIdx.z * blockDim.z;
if (oob(i, j, k))
return;
const int src_idx = DEVICE_VTXBUF_IDX(
i + d_mesh_info.int_params[AC_nx_min],
j + d_mesh_info.int_params[AC_ny_min],
k + d_mesh_info.int_params[AC_nz_min]);
const int dst_idx = DEVICE_1D_COMPDOMAIN_IDX(i, j, k);
dst[dst_idx] = reduce_initial(src[src_idx]);
}
template <ReduceInitialVecFunc reduce_initial>
__global__ void
_kernel_reduce_vec(const __restrict__ AcReal* src_a,
const __restrict__ AcReal* src_b,
const __restrict__ AcReal* src_c, AcReal* dst)
{
const int i = threadIdx.x + blockIdx.x * blockDim.x;
const int j = threadIdx.y + blockIdx.y * blockDim.y;
const int k = threadIdx.z + blockIdx.z * blockDim.z;
if (oob(i, j, k))
return;
const int src_idx = DEVICE_VTXBUF_IDX(
i + d_mesh_info.int_params[AC_nx_min],
j + d_mesh_info.int_params[AC_ny_min],
k + d_mesh_info.int_params[AC_nz_min]);
const int dst_idx = DEVICE_1D_COMPDOMAIN_IDX(i, j, k);
dst[dst_idx] = reduce_initial(src_a[src_idx], src_b[src_idx],
src_c[src_idx]);
}
///////////////////////////////////////////////////////////////////////////////
#define BLOCK_SIZE (1024)
#define ELEMS_PER_THREAD (32)
template <ReduceFunc reduce>
__global__ void
_kernel_reduce(AcReal* src, AcReal* result)
{
const int idx = threadIdx.x + blockIdx.x * BLOCK_SIZE * ELEMS_PER_THREAD;
const int scratchpad_size = DCONST(AC_nxyz);
if (idx >= scratchpad_size)
return;
__shared__ AcReal smem[BLOCK_SIZE];
AcReal tmp = src[idx];
for (int i = 1; i < ELEMS_PER_THREAD; ++i) {
const int src_idx = idx + i * BLOCK_SIZE;
if (src_idx >= scratchpad_size) {
// This check is for safety: if accessing uninitialized values
// beyond the mesh boundaries, we will immediately start seeing NANs
if (threadIdx.x < BLOCK_SIZE)
smem[threadIdx.x] = NAN;
else
break;
}
tmp = reduce(tmp, src[src_idx]);
}
smem[threadIdx.x] = tmp;
__syncthreads();
int offset = BLOCK_SIZE / 2;
while (offset > 0) {
if (threadIdx.x < offset) {
tmp = reduce(tmp, smem[threadIdx.x + offset]);
smem[threadIdx.x] = tmp;
}
offset /= 2;
__syncthreads();
}
if (threadIdx.x == 0)
src[idx] = tmp;
}
template <ReduceFunc reduce>
__global__ void
_kernel_reduce_block(const __restrict__ AcReal* src, AcReal* result)
{
const int scratchpad_size = DCONST(AC_nxyz);
const int idx = threadIdx.x + blockIdx.x * BLOCK_SIZE * ELEMS_PER_THREAD;
AcReal tmp = src[idx];
const int block_offset = BLOCK_SIZE * ELEMS_PER_THREAD;
for (int i = 1; idx + i * block_offset < scratchpad_size; ++i)
tmp = reduce(tmp, src[idx + i * block_offset]);
*result = tmp;
}
//////////////////////////////////////////////////////////////////////////////
AcReal
_reduce_scal(const cudaStream_t stream,
const ReductionType& rtype, const int& nx, const int& ny,
const int& nz, const AcReal* vertex_buffer,
AcReal* reduce_scratchpad, AcReal* reduce_result)
{
bool solve_mean = false;
const dim3 tpb(32, 4, 1);
const dim3 bpg(int(ceil(AcReal(nx) / tpb.x)), int(ceil(AcReal(ny) / tpb.y)),
int(ceil(AcReal(nz) / tpb.z)));
const int scratchpad_size = nx * ny * nz;
const int bpg2 = (unsigned int)ceil(AcReal(scratchpad_size) /
AcReal(ELEMS_PER_THREAD * BLOCK_SIZE));
switch (rtype) {
case RTYPE_MAX:
_kernel_reduce_scal<_device_length_scal>
<<<bpg, tpb, 0, stream>>>(vertex_buffer, reduce_scratchpad);
_kernel_reduce<_device_max>
<<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
_kernel_reduce_block<_device_max>
<<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
break;
case RTYPE_MIN:
_kernel_reduce_scal<_device_length_scal>
<<<bpg, tpb, 0, stream>>>(vertex_buffer, reduce_scratchpad);
_kernel_reduce<_device_min>
<<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
_kernel_reduce_block<_device_min>
<<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
break;
case RTYPE_RMS:
_kernel_reduce_scal<_device_squared_scal>
<<<bpg, tpb, 0, stream>>>(vertex_buffer, reduce_scratchpad);
_kernel_reduce<_device_sum>
<<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
_kernel_reduce_block<_device_sum>
<<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
solve_mean = true;
break;
case RTYPE_RMS_EXP:
_kernel_reduce_scal<_device_exp_squared_scal>
<<<bpg, tpb, 0, stream>>>(vertex_buffer, reduce_scratchpad);
_kernel_reduce<_device_sum>
<<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
_kernel_reduce_block<_device_sum>
<<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
solve_mean = true;
break;
default:
ERROR("Unrecognized RTYPE");
}
AcReal result;
cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
if (solve_mean) {
const AcReal inv_n = AcReal(1.0) / (nx * ny * nz);
return inv_n * result;
}
else {
return result;
}
}
AcReal
_reduce_vec(const cudaStream_t stream,
const ReductionType& rtype, const int& nx, const int& ny,
const int& nz, const AcReal* vertex_buffer_a,
const AcReal* vertex_buffer_b, const AcReal* vertex_buffer_c,
AcReal* reduce_scratchpad, AcReal* reduce_result)
{
bool solve_mean = false;
const dim3 tpb(32, 4, 1);
const dim3 bpg(int(ceil(float(nx) / tpb.x)),
int(ceil(float(ny) / tpb.y)),
int(ceil(float(nz) / tpb.z)));
const int scratchpad_size = nx * ny * nz;
const int bpg2 = (unsigned int)ceil(float(scratchpad_size) /
float(ELEMS_PER_THREAD * BLOCK_SIZE));
// "Features" of this quick & efficient reduction:
// Block size must be smaller than the computational domain size
// (otherwise we would have do some additional bounds checking in the
// second half of _kernel_reduce, which gets quite confusing)
// Also the BLOCK_SIZE must be a multiple of two s.t. we can easily split
// the work without worrying too much about the array bounds.
ERRCHK(BLOCK_SIZE <= scratchpad_size);
ERRCHK(!(BLOCK_SIZE % 2));
// NOTE! Also does not work properly with non-power of two mesh dimension
// Issue is with "smem[BLOCK_SIZE];". If you init smem to NANs, you can
// see that uninitialized smem values are used in the comparison
ERRCHK(is_power_of_two(nx));
ERRCHK(is_power_of_two(ny));
ERRCHK(is_power_of_two(nz));
switch (rtype) {
case RTYPE_MAX:
_kernel_reduce_vec<_device_length_vec>
<<<bpg, tpb, 0, stream>>>(vertex_buffer_a, vertex_buffer_b, vertex_buffer_c,
reduce_scratchpad);
_kernel_reduce<_device_max>
<<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
_kernel_reduce_block<_device_max>
<<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
break;
case RTYPE_MIN:
_kernel_reduce_vec<_device_length_vec>
<<<bpg, tpb, 0, stream>>>(vertex_buffer_a, vertex_buffer_b, vertex_buffer_c,
reduce_scratchpad);
_kernel_reduce<_device_min>
<<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
_kernel_reduce_block<_device_min>
<<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
break;
case RTYPE_RMS:
_kernel_reduce_vec<_device_squared_vec>
<<<bpg, tpb, 0, stream>>>(vertex_buffer_a, vertex_buffer_b, vertex_buffer_c,
reduce_scratchpad);
_kernel_reduce<_device_sum>
<<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
_kernel_reduce_block<_device_sum>
<<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
solve_mean = true;
break;
case RTYPE_RMS_EXP:
_kernel_reduce_vec<_device_exp_squared_vec>
<<<bpg, tpb, 0, stream>>>(vertex_buffer_a, vertex_buffer_b, vertex_buffer_c,
reduce_scratchpad);
_kernel_reduce<_device_sum>
<<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
_kernel_reduce_block<_device_sum>
<<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
solve_mean = true;
break;
default:
ERROR("Unrecognized RTYPE");
}
AcReal result;
cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
if (solve_mean) {
const AcReal inv_n = AcReal(1.0) / (nx * ny * nz);
return inv_n * result;
}
else {
return result;
}
}
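
The three-kernel chain above (an elementwise map, a block-strided tree reduction in shared memory, then a final serial pass over the per-block results) is driven from the host. A hedged sketch of a caller, following the scratchpad convention above (the driver function itself is hypothetical; ERRCHK_CUDA is from errchk.h):

// Hypothetical driver: reduce one vertex buffer to its maximum value.
AcReal
demo_reduce_max(const cudaStream_t stream, const int nx, const int ny, const int nz,
                const AcReal* vtxbuf)
{
    AcReal *scratchpad, *result;
    // One scratchpad slot per computational-domain cell, plus one result slot
    ERRCHK_CUDA(cudaMalloc(&scratchpad, nx * ny * nz * sizeof(AcReal)));
    ERRCHK_CUDA(cudaMalloc(&result, sizeof(AcReal)));
    const AcReal max = _reduce_scal(stream, RTYPE_MAX, nx, ny, nz, vtxbuf, scratchpad, result);
    ERRCHK_CUDA(cudaFree(scratchpad));
    ERRCHK_CUDA(cudaFree(result));
    return max;
}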

View File

@@ -1,742 +0,0 @@
/*
Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.
This file is part of Astaroth.
Astaroth is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Astaroth is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* @file
* \brief Implementation of the integration pipeline
*
*
*
*/
#pragma once
#include "device_globals.cuh"
#include <assert.h>
/*
#define RK_THREADS_X (32)
#define RK_THREADS_Y (1)
#define RK_THREADS_Z (4)
#define RK_LAUNCH_BOUND_MIN_BLOCKS (4)
#define RK_THREADBLOCK_SIZE (RK_THREADS_X * RK_THREADS_Y * RK_THREADS_Z)
*/
static __device__ __forceinline__ int
IDX(const int i)
{
return i;
}
static __device__ __forceinline__ int
IDX(const int i, const int j, const int k)
{
return DEVICE_VTXBUF_IDX(i, j, k);
}
static __device__ __forceinline__ int
IDX(const int3 idx)
{
return DEVICE_VTXBUF_IDX(idx.x, idx.y, idx.z);
}
static __forceinline__ AcMatrix
create_rotz(const AcReal radians)
{
AcMatrix mat;
mat.row[0] = (AcReal3){cos(radians), -sin(radians), 0};
mat.row[1] = (AcReal3){sin(radians), cos(radians), 0};
mat.row[2] = (AcReal3){0, 0, 0};
return mat;
}
#if AC_DOUBLE_PRECISION == 0
#define sin __sinf
#define cos __cosf
#define exp __expf
#define rsqrt rsqrtf // hardware reciprocal sqrt
#endif // AC_DOUBLE_PRECISION == 0
/*
typedef struct {
int i, j, k;
} int3;*/
/*
* =============================================================================
* Level 0 (Input Assembly Stage)
* =============================================================================
*/
/*
* =============================================================================
* Level 0.1 (Read stencil elements and solve derivatives)
* =============================================================================
*/
static __device__ __forceinline__ AcReal
first_derivative(const AcReal* __restrict__ pencil, const AcReal inv_ds)
{
#if STENCIL_ORDER == 2
const AcReal coefficients[] = {0, 1.0 / 2.0};
#elif STENCIL_ORDER == 4
const AcReal coefficients[] = {0, 2.0 / 3.0, -1.0 / 12.0};
#elif STENCIL_ORDER == 6
const AcReal coefficients[] = {0, 3.0 / 4.0, -3.0 / 20.0, 1.0 / 60.0};
#elif STENCIL_ORDER == 8
const AcReal coefficients[] = {0, 4.0 / 5.0, -1.0 / 5.0, 4.0 / 105.0,
-1.0 / 280.0};
#endif
#define MID (STENCIL_ORDER / 2)
AcReal res = 0;
#pragma unroll
for (int i = 1; i <= MID; ++i)
res += coefficients[i] * (pencil[MID + i] - pencil[MID - i]);
return res * inv_ds;
}
static __device__ __forceinline__ AcReal
second_derivative(const AcReal* __restrict__ pencil, const AcReal inv_ds)
{
#if STENCIL_ORDER == 2
const AcReal coefficients[] = {-2., 1.};
#elif STENCIL_ORDER == 4
const AcReal coefficients[] = {-5.0/2.0, 4.0/3.0, -1.0/12.0};
#elif STENCIL_ORDER == 6
const AcReal coefficients[] = {-49.0 / 18.0, 3.0 / 2.0, -3.0 / 20.0,
1.0 / 90.0};
#elif STENCIL_ORDER == 8
const AcReal coefficients[] = {-205.0 / 72.0, 8.0 / 5.0, -1.0 / 5.0,
8.0 / 315.0, -1.0 / 560.0};
#endif
#define MID (STENCIL_ORDER / 2)
AcReal res = coefficients[0] * pencil[MID];
#pragma unroll
for (int i = 1; i <= MID; ++i)
res += coefficients[i] * (pencil[MID + i] + pencil[MID - i]);
return res * inv_ds * inv_ds;
}
/** inv_ds: inverted mesh spacing f.ex. 1. / mesh.int_params[AC_dsx] */
static __device__ __forceinline__ AcReal
cross_derivative(const AcReal* __restrict__ pencil_a,
const AcReal* __restrict__ pencil_b, const AcReal inv_ds_a,
const AcReal inv_ds_b)
{
#if STENCIL_ORDER == 2
const AcReal coefficients[] = {0, 1.0 / 4.0};
#elif STENCIL_ORDER == 4
const AcReal coefficients[] = {0, 1.0 / 32.0, 1.0 / 64.0}; // TODO correct coefficients, these are just placeholders
#elif STENCIL_ORDER == 6
const AcReal fac = (1. / 720.);
const AcReal coefficients[] = {0.0 * fac, 270.0 * fac, -27.0 * fac,
2.0 * fac};
#elif STENCIL_ORDER == 8
const AcReal fac = (1. / 20160.);
const AcReal coefficients[] = {0.0 * fac, 8064. * fac, -1008. * fac,
128. * fac, -9. * fac};
#endif
#define MID (STENCIL_ORDER / 2)
AcReal res = AcReal(0.);
#pragma unroll
for (int i = 1; i <= MID; ++i) {
res += coefficients[i] * (pencil_a[MID + i] + pencil_a[MID - i] -
pencil_b[MID + i] - pencil_b[MID - i]);
}
return res * inv_ds_a * inv_ds_b;
}
static __device__ __forceinline__ AcReal
derx(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
AcReal pencil[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y, vertexIdx.z)];
return first_derivative(pencil, DCONST_REAL(AC_inv_dsx));
}
static __device__ __forceinline__ AcReal
derxx(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
AcReal pencil[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y, vertexIdx.z)];
return second_derivative(pencil, DCONST_REAL(AC_inv_dsx));
}
static __device__ __forceinline__ AcReal
derxy(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
AcReal pencil_a[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil_a[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2,
vertexIdx.y + offset - STENCIL_ORDER / 2, vertexIdx.z)];
AcReal pencil_b[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil_b[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2,
vertexIdx.y + STENCIL_ORDER / 2 - offset, vertexIdx.z)];
return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsx),
DCONST_REAL(AC_inv_dsy));
}
static __device__ __forceinline__ AcReal
derxz(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
AcReal pencil_a[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil_a[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y,
vertexIdx.z + offset - STENCIL_ORDER / 2)];
AcReal pencil_b[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil_b[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y,
vertexIdx.z + STENCIL_ORDER / 2 - offset)];
return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsx),
DCONST_REAL(AC_inv_dsz));
}
static __device__ __forceinline__ AcReal
dery(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
AcReal pencil[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2, vertexIdx.z)];
return first_derivative(pencil, DCONST_REAL(AC_inv_dsy));
}
static __device__ __forceinline__ AcReal
deryy(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
AcReal pencil[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2, vertexIdx.z)];
return second_derivative(pencil, DCONST_REAL(AC_inv_dsy));
}
static __device__ __forceinline__ AcReal
deryz(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
AcReal pencil_a[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil_a[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2,
vertexIdx.z + offset - STENCIL_ORDER / 2)];
AcReal pencil_b[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil_b[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2,
vertexIdx.z + STENCIL_ORDER / 2 - offset)];
return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsy),
DCONST_REAL(AC_inv_dsz));
}
static __device__ __forceinline__ AcReal
derz(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
AcReal pencil[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z + offset - STENCIL_ORDER / 2)];
return first_derivative(pencil, DCONST_REAL(AC_inv_dsz));
}
static __device__ __forceinline__ AcReal
derzz(const int3 vertexIdx, const AcReal* __restrict__ arr)
{
AcReal pencil[STENCIL_ORDER + 1];
#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z + offset - STENCIL_ORDER / 2)];
return second_derivative(pencil, DCONST_REAL(AC_inv_dsz));
}
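// Worked check of the stencils above (illustrative; h denotes the grid
// spacing, inv_ds = 1/h). With STENCIL_ORDER == 6, MID == 3 and
//   first_derivative ~ (3/4*(p[4]-p[2]) - 3/20*(p[5]-p[1]) + 1/60*(p[6]-p[0])) * inv_ds.
// For f(x) = x^2 sampled at x = 0 the pencil is {9, 4, 1, 0, 1, 4, 9} * h^2:
// every paired difference cancels, so first_derivative returns 0 = f'(0),
// while second_derivative gives (3/2*2 - 3/20*8 + 1/90*18) * h^2 * inv_ds^2
// = 2 = f''(0), as expected.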
/*
* =============================================================================
* Level 0.2 (Caching functions)
* =============================================================================
*/
#include "stencil_assembly.cuh"
/*
typedef struct {
AcRealData x;
AcRealData y;
AcRealData z;
} AcReal3Data;
static __device__ __forceinline__ AcReal3Data
read_data(const int i, const int j, const int k,
AcReal* __restrict__ buf[], const int3& handle)
{
AcReal3Data data;
data.x = read_data(i, j, k, buf, handle.x);
data.y = read_data(i, j, k, buf, handle.y);
data.z = read_data(i, j, k, buf, handle.z);
return data;
}
*/
/*
* =============================================================================
* Level 0.3 (Built-in functions available during the Stencil Processing Stage)
* =============================================================================
*/
static __host__ __device__ __forceinline__ AcReal3
operator-(const AcReal3& a, const AcReal3& b)
{
return (AcReal3){a.x - b.x, a.y - b.y, a.z - b.z};
}
static __host__ __device__ __forceinline__ AcReal3
operator+(const AcReal3& a, const AcReal3& b)
{
return (AcReal3){a.x + b.x, a.y + b.y, a.z + b.z};
}
static __host__ __device__ __forceinline__ AcReal3
operator-(const AcReal3& a)
{
return (AcReal3){-a.x, -a.y, -a.z};
}
static __host__ __device__ __forceinline__ AcReal3
operator*(const AcReal a, const AcReal3& b)
{
return (AcReal3){a * b.x, a * b.y, a * b.z};
}
static __host__ __device__ __forceinline__ AcReal
dot(const AcReal3& a, const AcReal3& b)
{
return a.x * b.x + a.y * b.y + a.z * b.z;
}
static __host__ __device__ __forceinline__ AcReal3
mul(const AcMatrix& aa, const AcReal3& x)
{
return (AcReal3){dot(aa.row[0], x), dot(aa.row[1], x), dot(aa.row[2], x)};
}
static __host__ __device__ __forceinline__ AcReal3
cross(const AcReal3& a, const AcReal3& b)
{
AcReal3 c;
c.x = a.y * b.z - a.z * b.y;
c.y = a.z * b.x - a.x * b.z;
c.z = a.x * b.y - a.y * b.x;
return c;
}
static __host__ __device__ __forceinline__ bool
is_valid(const AcReal a)
{
return !isnan(a) && !isinf(a);
}
static __host__ __device__ __forceinline__ bool
is_valid(const AcReal3& a)
{
return is_valid(a.x) && is_valid(a.y) && is_valid(a.z);
}
/*
* =============================================================================
* Level 1 (Stencil Processing Stage)
* =============================================================================
*/
/*
* =============================================================================
* Level 1.1 (Terms)
* =============================================================================
*/
static __device__ __forceinline__ AcReal
laplace(const AcRealData& data)
{
return hessian(data).row[0].x + hessian(data).row[1].y + hessian(data).row[2].z;
}
static __device__ __forceinline__ AcReal
divergence(const AcReal3Data& vec)
{
return gradient(vec.x).x + gradient(vec.y).y + gradient(vec.z).z;
}
static __device__ __forceinline__ AcReal3
laplace_vec(const AcReal3Data& vec)
{
return (AcReal3){laplace(vec.x), laplace(vec.y), laplace(vec.z)};
}
static __device__ __forceinline__ AcReal3
curl(const AcReal3Data& vec)
{
return (AcReal3){gradient(vec.z).y - gradient(vec.y).z,
gradient(vec.x).z - gradient(vec.z).x,
gradient(vec.y).x - gradient(vec.x).y};
}
static __device__ __forceinline__ AcReal3
gradient_of_divergence(const AcReal3Data& vec)
{
return (AcReal3){hessian(vec.x).row[0].x + hessian(vec.y).row[0].y + hessian(vec.z).row[0].z,
hessian(vec.x).row[1].x + hessian(vec.y).row[1].y + hessian(vec.z).row[1].z,
hessian(vec.x).row[2].x + hessian(vec.y).row[2].y + hessian(vec.z).row[2].z};
}
// Takes uu gradients and returns S
static __device__ __forceinline__ AcMatrix
stress_tensor(const AcReal3Data& vec)
{
AcMatrix S;
S.row[0].x = AcReal(2. / 3.) * gradient(vec.x).x -
AcReal(1. / 3.) * (gradient(vec.y).y + gradient(vec.z).z);
S.row[0].y = AcReal(1. / 2.) * (gradient(vec.x).y + gradient(vec.y).x);
S.row[0].z = AcReal(1. / 2.) * (gradient(vec.x).z + gradient(vec.z).x);
S.row[1].y = AcReal(2. / 3.) * gradient(vec.y).y -
AcReal(1. / 3.) * (gradient(vec.x).x + gradient(vec.z).z);
S.row[1].z = AcReal(1. / 2.) * (gradient(vec.y).z + gradient(vec.z).y);
S.row[2].z = AcReal(2. / 3.) * gradient(vec.z).z -
AcReal(1. / 3.) * (gradient(vec.x).x + gradient(vec.y).y);
S.row[1].x = S.row[0].y;
S.row[2].x = S.row[0].z;
S.row[2].y = S.row[1].z;
return S;
}
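// Sketch (not part of the original source): S above is the traceless
// rate-of-strain tensor, so its trace should vanish identically. A hypothetical
// debug helper for checking that property:
static __device__ __forceinline__ AcReal
trace(const AcMatrix& mat)
{
return mat.row[0].x + mat.row[1].y + mat.row[2].z;
}
// e.g. assert(fabs(trace(stress_tensor(uu))) < AcReal(1e-6)) in a debug build.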
static __device__ __forceinline__ AcReal
contract(const AcMatrix& mat)
{
AcReal res = 0;
#pragma unroll
for (int i = 0; i < 3; ++i)
res += dot(mat.row[i], mat.row[i]);
return res;
}
/*
* =============================================================================
* Level 1.2 (Equations)
* =============================================================================
*/
static __device__ __forceinline__ AcReal
length(const AcReal3& vec)
{
return sqrt(vec.x * vec.x + vec.y * vec.y + vec.z * vec.z);
}
static __device__ __forceinline__ AcReal
reciprocal_len(const AcReal3& vec)
{
return rsqrt(vec.x * vec.x + vec.y * vec.y + vec.z * vec.z);
}
static __device__ __forceinline__ AcReal3
normalized(const AcReal3& vec)
{
const AcReal inv_len = reciprocal_len(vec);
return inv_len * vec;
}
// Sinusoidal forcing
// https://arxiv.org/pdf/1704.04676.pdf
__constant__ AcReal3 forcing_vec;
__constant__ AcReal forcing_phi;
static __device__ __forceinline__ AcReal3
forcing(const int i, const int j, const int k)
{
#define DOMAIN_SIZE_X (DCONST(AC_nx) * DCONST_REAL(AC_dsx))
#define DOMAIN_SIZE_Y (DCONST(AC_ny) * DCONST_REAL(AC_dsy))
#define DOMAIN_SIZE_Z (DCONST(AC_nz) * DCONST_REAL(AC_dsz))
const AcReal3 k_vec = (AcReal3){(i - DCONST(AC_nx_min)) * DCONST_REAL(AC_dsx) - AcReal(.5) * DOMAIN_SIZE_X,
(j - DCONST(AC_ny_min)) * DCONST_REAL(AC_dsy) - AcReal(.5) * DOMAIN_SIZE_Y,
(k - DCONST(AC_nz_min)) * DCONST_REAL(AC_dsz) - AcReal(.5) * DOMAIN_SIZE_Z};
AcReal inv_len = reciprocal_len(k_vec);
if (isnan(inv_len) || isinf(inv_len))
inv_len = 0;
if (inv_len > 2) // hack to make it cool
inv_len = 2;
const AcReal k_dot_x = dot(k_vec, forcing_vec);
const AcReal waves = cos(k_dot_x)*cos(forcing_phi) - sin(k_dot_x) * sin(forcing_phi);
return inv_len * inv_len * waves * forcing_vec;
}
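// Note (added for clarity): the 'waves' expression above is the angle-sum
// identity cos(k_dot_x) * cos(phi) - sin(k_dot_x) * sin(phi) == cos(k_dot_x + phi),
// i.e. a plane wave phase-shifted by forcing_phi.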
// Note: LNT0 and LNRHO0 must be set very carefully: if their magnitude differs from the other values in the mesh, we inherently lose precision
#define LNT0 (AcReal(0.0))
#define LNRHO0 (AcReal(0.0))
#define H_CONST (AcReal(0.0))
#define C_CONST (AcReal(0.0))
template <int step_number>
static __device__ __forceinline__ AcReal
rk3_integrate(const AcReal state_previous, const AcReal state_current,
const AcReal rate_of_change, const AcReal dt)
{
// Williamson (1980)
const AcReal alpha[] = {0, AcReal(.0), AcReal(-5. / 9.), AcReal(-153. / 128.)};
const AcReal beta[] = {0, AcReal(1. / 3.), AcReal(15. / 16.),
AcReal(8. / 15.)};
// Note the indexing: the arrays are padded with a leading entry so that no
// beta[step_number - 1] access appears; the unpadded form triggers a spurious
// "out-of-bounds" warning for step_number == 0 even though that branch is never
// taken
switch (step_number) {
case 0:
return state_current + beta[step_number + 1] * rate_of_change * dt;
case 1: // Fallthrough
case 2:
return state_current +
beta[step_number + 1] *
(alpha[step_number + 1] * (AcReal(1.) / beta[step_number]) *
(state_current - state_previous) +
rate_of_change * dt);
default:
return NAN;
}
}
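// For reference, the 2N-storage Williamson (1980) recurrence implemented above:
//   w_i = alpha_i * w_{i-1} + dt * f(u_{i-1})
//   u_i = u_{i-1} + beta_i * w_i
// The previous stage vector is recovered as w_{i-1} = (u_{i-1} - u_{i-2}) / beta_{i-1},
// which is the (state_current - state_previous) * (1 / beta[step_number]) factor
// in the fallthrough cases.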
/*
template <int step_number>
static __device__ __forceinline__ AcReal
rk3_integrate_scal(const AcReal state_previous, const AcReal state_current,
const AcReal rate_of_change, const AcReal dt)
{
// Williamson (1980)
const AcReal alpha[] = {AcReal(.0), AcReal(-5. / 9.), AcReal(-153. / 128.)};
const AcReal beta[] = {AcReal(1. / 3.), AcReal(15. / 16.),
AcReal(8. / 15.)};
switch (step_number) {
case 0:
return state_current + beta[step_number] * rate_of_change * dt;
case 1: // Fallthrough
case 2:
return state_current +
beta[step_number] *
(alpha[step_number] * (AcReal(1.) / beta[step_number - 1]) *
(state_current - state_previous) +
rate_of_change * dt);
default:
return NAN;
}
}
*/
template <int step_number>
static __device__ __forceinline__ AcReal3
rk3_integrate(const AcReal3 state_previous, const AcReal3 state_current,
const AcReal3 rate_of_change, const AcReal dt)
{
return (AcReal3) { rk3_integrate<step_number>(state_previous.x, state_current.x, rate_of_change.x, dt),
rk3_integrate<step_number>(state_previous.y, state_current.y, rate_of_change.y, dt),
rk3_integrate<step_number>(state_previous.z, state_current.z, rate_of_change.z, dt)};
}
#define rk3(state_previous, state_current, rate_of_change, dt)\
rk3_integrate<step_number>(state_previous, value(state_current), rate_of_change, dt)
/*
template <int step_number>
static __device__ __forceinline__ AcReal
rk3_integrate(const int idx, const AcReal out, const int handle,
const AcRealData& in_cached, const AcReal rate_of_change, const AcReal dt)
{
return rk3_integrate_scal<step_number>(out, value(in_cached), rate_of_change, dt);
}
template <int step_number>
static __device__ __forceinline__ AcReal3
rk3_integrate(const int idx, const AcReal3 out, const int3& handle,
const AcReal3Data& in_cached, const AcReal3& rate_of_change, const AcReal dt)
{
return (AcReal3) {
rk3_integrate<step_number>(idx, out, handle.x, in_cached.x, rate_of_change.x, dt),
rk3_integrate<step_number>(idx, out, handle.y, in_cached.y, rate_of_change.y, dt),
rk3_integrate<step_number>(idx, out, handle.z, in_cached.z, rate_of_change.z, dt)
};
}
#define RK3(handle, in_cached, rate_of_change, dt) \
rk3_integrate<step_number>(idx, buffer.out, handle, in_cached, rate_of_change, dt)
*/
/*
* =============================================================================
* Level 1.3 (Kernels)
* =============================================================================
*/
static __device__ void
write(AcReal* __restrict__ out[], const int handle, const int idx, const AcReal value)
{
out[handle][idx] = value;
}
static __device__ void
write(AcReal* __restrict__ out[], const int3 vec, const int idx, const AcReal3 value)
{
write(out, vec.x, idx, value.x);
write(out, vec.y, idx, value.y);
write(out, vec.z, idx, value.z);
}
static __device__ AcReal
read_out(const int idx, AcReal* __restrict__ field[], const int handle)
{
return field[handle][idx];
}
static __device__ AcReal3
read_out(const int idx, AcReal* __restrict__ field[], const int3 handle)
{
return (AcReal3) { read_out(idx, field, handle.x),
read_out(idx, field, handle.y),
read_out(idx, field, handle.z) };
}
#define WRITE_OUT(handle, value) (write(buffer.out, handle, idx, value))
#define READ(handle) (read_data(vertexIdx, buffer.in, handle))
#define READ_OUT(handle) (read_out(idx, buffer.out, handle))
// write() is also spelled out here for clarity; it is not part of the DSL
//#define WRITE(HANDLE) (write(idx, ...)) s.t. we don't have to insert boilerplate in the middle of the function
#define GEN_KERNEL_PARAM_BOILERPLATE \
const int3 start, const int3 end, VertexBufferArray buffer
#define GEN_KERNEL_BUILTIN_VARIABLES_BOILERPLATE() \
const int3 vertexIdx = (int3){threadIdx.x + blockIdx.x * blockDim.x + start.x,\
threadIdx.y + blockIdx.y * blockDim.y + start.y,\
threadIdx.z + blockIdx.z * blockDim.z + start.z};\
if (vertexIdx.x >= end.x || vertexIdx.y >= end.y || vertexIdx.z >= end.z)\
return;\
\
\
assert(vertexIdx.x < DCONST(AC_nx_max) && vertexIdx.y < DCONST(AC_ny_max) &&\
vertexIdx.z < DCONST(AC_nz_max));\
\
assert(vertexIdx.x >= DCONST(AC_nx_min) && vertexIdx.y >= DCONST(AC_ny_min) &&\
vertexIdx.z >= DCONST(AC_nz_min));\
\
const int idx = IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z);
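// A minimal sketch (assumed, not from the generated stencil_process.cuh below)
// of how the boilerplate macros are meant to be used; VTXBUF_LNRHO and the zero
// right-hand side are placeholders for illustration:
//
// template <int step_number>
// static __global__ void
// solve(GEN_KERNEL_PARAM_BOILERPLATE, const AcReal dt)
// {
//     GEN_KERNEL_BUILTIN_VARIABLES_BOILERPLATE();
//     const AcRealData lnrho = READ(VTXBUF_LNRHO);
//     WRITE_OUT(VTXBUF_LNRHO, rk3(READ_OUT(VTXBUF_LNRHO), lnrho, AcReal(0), dt));
// }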
#include "stencil_process.cuh"
/*
* =============================================================================
* Level 2 (Host calls)
* =============================================================================
*/
static AcReal
randf(void)
{
return AcReal(rand()) / AcReal(RAND_MAX);
}
AcResult
rk3_step_async(const cudaStream_t stream, const dim3& tpb,
const int3& start, const int3& end, const int& step_number,
const AcReal dt, const AcMeshInfo& /*mesh_info*/,
VertexBufferArray* buffer)
{
/////////////////// Forcing
#if LFORCING
const AcReal ff_scale = AcReal(.2);
static AcReal3 ff = ff_scale * (AcReal3){1, 0, 0};
const AcReal radians = randf() * AcReal(2*M_PI) / 360 / 8;
const AcMatrix rotz = create_rotz(radians);
ff = mul(rotz, ff);
cudaMemcpyToSymbolAsync(forcing_vec, &ff, sizeof(ff), 0, cudaMemcpyHostToDevice, stream);
const AcReal ff_phi = AcReal(M_PI);//AcReal(2 * M_PI) * randf();
cudaMemcpyToSymbolAsync(forcing_phi, &ff_phi, sizeof(ff_phi), 0, cudaMemcpyHostToDevice, stream);
#endif // LFORCING
//////////////////////////
const int nx = end.x - start.x;
const int ny = end.y - start.y;
const int nz = end.z - start.z;
const dim3 bpg(
(unsigned int)ceil(nx / AcReal(tpb.x)),
(unsigned int)ceil(ny / AcReal(tpb.y)),
(unsigned int)ceil(nz / AcReal(tpb.z)));
if (step_number == 0)
solve<0><<<bpg, tpb, 0, stream>>>(start, end, *buffer, dt);
else if (step_number == 1)
solve<1><<<bpg, tpb, 0, stream>>>(start, end, *buffer, dt);
else
solve<2><<<bpg, tpb, 0, stream>>>(start, end, *buffer, dt);
ERRCHK_CUDA_KERNEL();
return AC_SUCCESS;
}
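// Host-side usage sketch (assumed; stream, dt and vba come from the caller, and
// nghost is a placeholder for the halo width):
//   for (int substep = 0; substep < 3; ++substep)
//       rk3_step_async(stream, dim3(32, 1, 4),
//                      (int3){nghost, nghost, nghost},
//                      (int3){nghost + nx, nghost + ny, nghost + nz},
//                      substep, dt, info, &vba);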


@@ -1,293 +0,0 @@
/*
Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.
This file is part of Astaroth.
Astaroth is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Astaroth is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* @file
* \brief Brief info.
*
* Detailed info.
*
*/
#include "common.cuh"
#include "integration.cuh"
#include "src/core/errchk.h"
#include "src/core/math_utils.h"
#include <assert.h>
__constant__ AcMeshInfo d_mesh_info; // Extern in kernels/common.cuh.
static_assert(NUM_VTXBUF_HANDLES > 0, "ERROR: At least one uniform ScalarField must be declared.");
// Device info
#define REGISTERS_PER_THREAD (255)
#define MAX_REGISTERS_PER_BLOCK (65536)
#define MAX_THREADS_PER_BLOCK (1024)
#define WARP_SIZE (32)
#define make_int3(a, b, c) \
(int3) { (int)a, (int)b, (int)c }
template <int step_number>
static __device__ __forceinline__ AcReal
rk3_integrate(const AcReal state_previous, const AcReal state_current, const AcReal rate_of_change,
const AcReal dt)
{
// Williamson (1980)
const AcReal alpha[] = {0, AcReal(.0), AcReal(-5. / 9.), AcReal(-153. / 128.)};
const AcReal beta[] = {0, AcReal(1. / 3.), AcReal(15. / 16.), AcReal(8. / 15.)};
// Note the indexing: +1 to avoid an unnecessary warning about "out-of-bounds"
// access (when accessing beta[step_number-1] even when step_number >= 1)
switch (step_number) {
case 0:
return state_current + beta[step_number + 1] * rate_of_change * dt;
case 1: // Fallthrough
case 2:
return state_current +
beta[step_number + 1] * (alpha[step_number + 1] * (AcReal(1.) / beta[step_number]) *
(state_current - state_previous) +
rate_of_change * dt);
default:
return NAN;
}
}
template <int step_number>
static __device__ __forceinline__ AcReal3
rk3_integrate(const AcReal3 state_previous, const AcReal3 state_current,
const AcReal3 rate_of_change, const AcReal dt)
{
return (AcReal3){
rk3_integrate<step_number>(state_previous.x, state_current.x, rate_of_change.x, dt),
rk3_integrate<step_number>(state_previous.y, state_current.y, rate_of_change.y, dt),
rk3_integrate<step_number>(state_previous.z, state_current.z, rate_of_change.z, dt)};
}
#define rk3(state_previous, state_current, rate_of_change, dt) \
rk3_integrate<step_number>(state_previous, value(state_current), rate_of_change, dt)
static __device__ void
write(AcReal* __restrict__ out[], const int handle, const int idx, const AcReal value)
{
out[handle][idx] = value;
}
static __device__ __forceinline__ void
write(AcReal* __restrict__ out[], const int3 vec, const int idx, const AcReal3 value)
{
write(out, vec.x, idx, value.x);
write(out, vec.y, idx, value.y);
write(out, vec.z, idx, value.z);
}
static __device__ __forceinline__ AcReal
read_out(const int idx, AcReal* __restrict__ field[], const int handle)
{
return field[handle][idx];
}
static __device__ __forceinline__ AcReal3
read_out(const int idx, AcReal* __restrict__ field[], const int3 handle)
{
return (AcReal3){read_out(idx, field, handle.x), read_out(idx, field, handle.y),
read_out(idx, field, handle.z)};
}
#define WRITE_OUT(handle, value) (write(buffer.out, handle, idx, value))
#define READ(handle) (read_data(vertexIdx, globalVertexIdx, buffer.in, handle))
#define READ_OUT(handle) (read_out(idx, buffer.out, handle))
#define GEN_PREPROCESSED_PARAM_BOILERPLATE const int3 &vertexIdx, const int3 &globalVertexIdx
#define GEN_KERNEL_PARAM_BOILERPLATE const int3 start, const int3 end, VertexBufferArray buffer
#define GEN_KERNEL_BUILTIN_VARIABLES_BOILERPLATE() \
const int3 vertexIdx = (int3){threadIdx.x + blockIdx.x * blockDim.x + start.x, \
threadIdx.y + blockIdx.y * blockDim.y + start.y, \
threadIdx.z + blockIdx.z * blockDim.z + start.z}; \
const int3 globalVertexIdx = (int3){d_multigpu_offset.x + vertexIdx.x, \
d_multigpu_offset.y + vertexIdx.y, \
d_multigpu_offset.z + vertexIdx.z}; \
(void)globalVertexIdx; \
if (vertexIdx.x >= end.x || vertexIdx.y >= end.y || vertexIdx.z >= end.z) \
return; \
\
assert(vertexIdx.x < DCONST(AC_nx_max) && vertexIdx.y < DCONST(AC_ny_max) && \
vertexIdx.z < DCONST(AC_nz_max)); \
\
assert(vertexIdx.x >= DCONST(AC_nx_min) && vertexIdx.y >= DCONST(AC_ny_min) && \
vertexIdx.z >= DCONST(AC_nz_min)); \
\
const int idx = IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z);
#define GEN_DEVICE_FUNC_HOOK(identifier) \
template <int step_number> \
AcResult acDeviceKernel_##identifier(const cudaStream_t stream, const int3 start, \
const int3 end, VertexBufferArray vba) \
{ \
\
const dim3 tpb(32, 1, 4); \
\
const int3 n = end - start; \
const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), \
(unsigned int)ceil(n.y / AcReal(tpb.y)), \
(unsigned int)ceil(n.z / AcReal(tpb.z))); \
\
identifier<step_number><<<bpg, tpb, 0, stream>>>(start, end, vba); \
ERRCHK_CUDA_KERNEL(); \
\
return AC_SUCCESS; \
}
#include "user_kernels.h"
static dim3 rk3_tpb(32, 1, 4);
AcResult
acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferArray vba)
{
// RK3
dim3 best_dims(0, 0, 0);
float best_time = INFINITY;
const int num_iterations = 10;
for (int z = 1; z <= MAX_THREADS_PER_BLOCK; ++z) {
for (int y = 1; y <= MAX_THREADS_PER_BLOCK; ++y) {
for (int x = WARP_SIZE; x <= MAX_THREADS_PER_BLOCK; x += WARP_SIZE) {
if (x > end.x - start.x || y > end.y - start.y || z > end.z - start.z)
break;
if (x * y * z > MAX_THREADS_PER_BLOCK)
break;
if (x * y * z * REGISTERS_PER_THREAD > MAX_REGISTERS_PER_BLOCK)
break;
if (((x * y * z) % WARP_SIZE) != 0)
continue;
const dim3 tpb(x, y, z);
const int3 n = end - start;
const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), //
(unsigned int)ceil(n.y / AcReal(tpb.y)), //
(unsigned int)ceil(n.z / AcReal(tpb.z)));
cudaDeviceSynchronize();
if (cudaGetLastError() != cudaSuccess) // resets the error if any
continue;
// printf("(%d, %d, %d)\n", x, y, z);
cudaEvent_t tstart, tstop;
cudaEventCreate(&tstart);
cudaEventCreate(&tstop);
// #ifdef AC_dt
// acDeviceLoadScalarUniform(device, STREAM_DEFAULT, AC_dt, FLT_EPSILON); // TODO
// note, temporarily disabled
/*#else
ERROR("FATAL ERROR: acDeviceAutoOptimize() or
acDeviceIntegrateSubstep() was " "called, but AC_dt was not defined. Either define
it or call the generated " "device function acDeviceKernel_<kernel name> which does
not require the " "timestep to be defined.\n"); #endif*/
cudaEventRecord(tstart); // ---------------------------------------- Timing start
for (int i = 0; i < num_iterations; ++i)
solve<2><<<bpg, tpb>>>(start, end, vba);
cudaEventRecord(tstop); // ----------------------------------------- Timing end
cudaEventSynchronize(tstop);
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, tstart, tstop);
ERRCHK_CUDA_KERNEL_ALWAYS();
if (milliseconds < best_time) {
best_time = milliseconds;
best_dims = tpb;
}
}
}
}
#if VERBOSE_PRINTING
printf(
"Auto-optimization done. The best threadblock dimensions for rkStep: (%d, %d, %d) %f ms\n",
best_dims.x, best_dims.y, best_dims.z, double(best_time) / num_iterations);
#endif
/*
FILE* fp = fopen("../config/rk3_tbdims.cuh", "w");
ERRCHK(fp);
fprintf(fp, "%d, %d, %d\n", best_dims.x, best_dims.y, best_dims.z);
fclose(fp);
*/
rk3_tpb = best_dims;
return AC_SUCCESS;
}
AcResult
acKernelIntegrateSubstep(const cudaStream_t stream, const int step_number, const int3 start,
const int3 end, VertexBufferArray vba)
{
const dim3 tpb = rk3_tpb;
const int3 n = end - start;
const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), //
(unsigned int)ceil(n.y / AcReal(tpb.y)), //
(unsigned int)ceil(n.z / AcReal(tpb.z)));
//#ifdef AC_dt
// acDeviceLoadScalarUniform(device, stream, AC_dt, dt);
/*#else
(void)dt;
ERROR("FATAL ERROR: acDeviceAutoOptimize() or acDeviceIntegrateSubstep() was "
"called, but AC_dt was not defined. Either define it or call the generated "
"device function acDeviceKernel_<kernel name> which does not require the "
"timestep to be defined.\n");
#endif*/
if (step_number == 0)
solve<0><<<bpg, tpb, 0, stream>>>(start, end, vba);
else if (step_number == 1)
solve<1><<<bpg, tpb, 0, stream>>>(start, end, vba);
else
solve<2><<<bpg, tpb, 0, stream>>>(start, end, vba);
ERRCHK_CUDA_KERNEL();
return AC_SUCCESS;
}
static __global__ void
dummy_kernel(void)
{
DCONST((AcIntParam)0);
DCONST((AcInt3Param)0);
DCONST((AcRealParam)0);
DCONST((AcReal3Param)0);
acComplex a = exp(AcReal(1) * acComplex(1, 1) * AcReal(1));
a* a;
}
AcResult
acKernelDummy(void)
{
dummy_kernel<<<1, 1>>>();
ERRCHK_CUDA_KERNEL_ALWAYS();
return AC_SUCCESS;
}


@@ -1,42 +1,259 @@
/*
Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.
This file is part of Astaroth.
Astaroth is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Astaroth is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* @file
* \brief Brief info.
*
* Detailed info.
*
*/
#pragma once
static_assert(NUM_VTXBUF_HANDLES > 0, "ERROR: At least one uniform ScalarField must be declared.");
// Device info
#define REGISTERS_PER_THREAD (255)
#define MAX_REGISTERS_PER_BLOCK (65536)
#define MAX_THREADS_PER_BLOCK (1024)
#define WARP_SIZE (32)
#define make_int3(a, b, c) \
(int3) { (int)a, (int)b, (int)c }
template <int step_number>
static __device__ __forceinline__ AcReal
rk3_integrate(const AcReal state_previous, const AcReal state_current, const AcReal rate_of_change,
const AcReal dt)
{
// Williamson (1980)
const AcReal alpha[] = {0, AcReal(.0), AcReal(-5. / 9.), AcReal(-153. / 128.)};
const AcReal beta[] = {0, AcReal(1. / 3.), AcReal(15. / 16.), AcReal(8. / 15.)};
// Note the indexing: the arrays are padded with a leading entry so that no
// beta[step_number - 1] access appears; the unpadded form triggers a spurious
// "out-of-bounds" warning for step_number == 0 even though that branch is never
// taken
switch (step_number) {
case 0:
return state_current + beta[step_number + 1] * rate_of_change * dt;
case 1: // Fallthrough
case 2:
return state_current +
beta[step_number + 1] * (alpha[step_number + 1] * (AcReal(1.) / beta[step_number]) *
(state_current - state_previous) +
rate_of_change * dt);
default:
return NAN;
}
}
template <int step_number>
static __device__ __forceinline__ AcReal3
rk3_integrate(const AcReal3 state_previous, const AcReal3 state_current,
const AcReal3 rate_of_change, const AcReal dt)
{
return (AcReal3){
rk3_integrate<step_number>(state_previous.x, state_current.x, rate_of_change.x, dt),
rk3_integrate<step_number>(state_previous.y, state_current.y, rate_of_change.y, dt),
rk3_integrate<step_number>(state_previous.z, state_current.z, rate_of_change.z, dt)};
}
#define rk3(state_previous, state_current, rate_of_change, dt) \
rk3_integrate<step_number>(state_previous, value(state_current), rate_of_change, dt)
static __device__ void
write(AcReal* __restrict__ out[], const int handle, const int idx, const AcReal value)
{
out[handle][idx] = value;
}
static __device__ __forceinline__ void
write(AcReal* __restrict__ out[], const int3 vec, const int idx, const AcReal3 value)
{
write(out, vec.x, idx, value.x);
write(out, vec.y, idx, value.y);
write(out, vec.z, idx, value.z);
}
static __device__ __forceinline__ AcReal
read_out(const int idx, AcReal* __restrict__ field[], const int handle)
{
return field[handle][idx];
}
static __device__ __forceinline__ AcReal3
read_out(const int idx, AcReal* __restrict__ field[], const int3 handle)
{
return (AcReal3){read_out(idx, field, handle.x), read_out(idx, field, handle.y),
read_out(idx, field, handle.z)};
}
#define WRITE_OUT(handle, value) (write(buffer.out, handle, idx, value))
#define READ(handle) (read_data(vertexIdx, globalVertexIdx, buffer.in, handle))
#define READ_OUT(handle) (read_out(idx, buffer.out, handle))
#define GEN_PREPROCESSED_PARAM_BOILERPLATE const int3 &vertexIdx, const int3 &globalVertexIdx
#define GEN_KERNEL_PARAM_BOILERPLATE const int3 start, const int3 end, VertexBufferArray buffer
#define GEN_KERNEL_BUILTIN_VARIABLES_BOILERPLATE() \
const int3 vertexIdx = (int3){threadIdx.x + blockIdx.x * blockDim.x + start.x, \
threadIdx.y + blockIdx.y * blockDim.y + start.y, \
threadIdx.z + blockIdx.z * blockDim.z + start.z}; \
const int3 globalVertexIdx = (int3){d_multigpu_offset.x + vertexIdx.x, \
d_multigpu_offset.y + vertexIdx.y, \
d_multigpu_offset.z + vertexIdx.z}; \
(void)globalVertexIdx; \
if (vertexIdx.x >= end.x || vertexIdx.y >= end.y || vertexIdx.z >= end.z) \
return; \
\
assert(vertexIdx.x < DCONST(AC_nx_max) && vertexIdx.y < DCONST(AC_ny_max) && \
vertexIdx.z < DCONST(AC_nz_max)); \
\
assert(vertexIdx.x >= DCONST(AC_nx_min) && vertexIdx.y >= DCONST(AC_ny_min) && \
vertexIdx.z >= DCONST(AC_nz_min)); \
\
const int idx = IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z);
#define GEN_DEVICE_FUNC_HOOK(identifier) \
template <int step_number> \
AcResult acDeviceKernel_##identifier(const cudaStream_t stream, const int3 start, \
const int3 end, VertexBufferArray vba) \
{ \
\
const dim3 tpb(32, 1, 4); \
\
const int3 n = end - start; \
const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), \
(unsigned int)ceil(n.y / AcReal(tpb.y)), \
(unsigned int)ceil(n.z / AcReal(tpb.z))); \
\
identifier<step_number><<<bpg, tpb, 0, stream>>>(start, end, vba); \
ERRCHK_CUDA_KERNEL(); \
\
return AC_SUCCESS; \
}
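// E.g. a DSL kernel 'solve' declared in user_kernels.h is expected to get a
// host-callable wrapper acDeviceKernel_solve<step_number>(stream, start, end, vba)
// through this hook (the kernel name is illustrative).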
#include "user_kernels.h"
static dim3 rk3_tpb(32, 1, 4);
AcResult
acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferArray vba)
{
// RK3
dim3 best_dims(0, 0, 0);
float best_time = INFINITY;
const int num_iterations = 10;
for (int z = 1; z <= MAX_THREADS_PER_BLOCK; ++z) {
for (int y = 1; y <= MAX_THREADS_PER_BLOCK; ++y) {
for (int x = WARP_SIZE; x <= MAX_THREADS_PER_BLOCK; x += WARP_SIZE) {
if (x > end.x - start.x || y > end.y - start.y || z > end.z - start.z)
break;
if (x * y * z > MAX_THREADS_PER_BLOCK)
break;
if (x * y * z * REGISTERS_PER_THREAD > MAX_REGISTERS_PER_BLOCK)
break;
if (((x * y * z) % WARP_SIZE) != 0)
continue;
const dim3 tpb(x, y, z);
const int3 n = end - start;
const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), //
(unsigned int)ceil(n.y / AcReal(tpb.y)), //
(unsigned int)ceil(n.z / AcReal(tpb.z)));
cudaDeviceSynchronize();
if (cudaGetLastError() != cudaSuccess) // resets the error if any
continue;
// printf("(%d, %d, %d)\n", x, y, z);
cudaEvent_t tstart, tstop;
cudaEventCreate(&tstart);
cudaEventCreate(&tstop);
// #ifdef AC_dt
// acDeviceLoadScalarUniform(device, STREAM_DEFAULT, AC_dt, FLT_EPSILON); // TODO
// note, temporarily disabled
/*#else
ERROR("FATAL ERROR: acDeviceAutoOptimize() or
acDeviceIntegrateSubstep() was " "called, but AC_dt was not defined. Either define
it or call the generated " "device function acDeviceKernel_<kernel name> which does
not require the " "timestep to be defined.\n"); #endif*/
cudaEventRecord(tstart); // ---------------------------------------- Timing start
for (int i = 0; i < num_iterations; ++i)
solve<2><<<bpg, tpb>>>(start, end, vba);
cudaEventRecord(tstop); // ----------------------------------------- Timing end
cudaEventSynchronize(tstop);
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, tstart, tstop);
ERRCHK_CUDA_KERNEL_ALWAYS();
if (milliseconds < best_time) {
best_time = milliseconds;
best_dims = tpb;
}
}
}
}
#if VERBOSE_PRINTING
printf(
"Auto-optimization done. The best threadblock dimensions for rkStep: (%d, %d, %d) %f ms\n",
best_dims.x, best_dims.y, best_dims.z, double(best_time) / num_iterations);
#endif
/*
FILE* fp = fopen("../config/rk3_tbdims.cuh", "w");
ERRCHK(fp);
fprintf(fp, "%d, %d, %d\n", best_dims.x, best_dims.y, best_dims.z);
fclose(fp);
*/
rk3_tpb = best_dims;
return AC_SUCCESS;
}
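// Note on the search space above: a candidate must satisfy x * y * z <= 1024
// (MAX_THREADS_PER_BLOCK) and x * y * z * 255 <= 65536 (the register budget),
// so at most 256 threads per block are ever timed, in warp-sized multiples.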
AcResult
acKernelIntegrateSubstep(const cudaStream_t stream, const int step_number, const int3 start,
const int3 end, VertexBufferArray vba)
{
const dim3 tpb = rk3_tpb;
const int3 n = end - start;
const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), //
(unsigned int)ceil(n.y / AcReal(tpb.y)), //
(unsigned int)ceil(n.z / AcReal(tpb.z)));
//#ifdef AC_dt
// acDeviceLoadScalarUniform(device, stream, AC_dt, dt);
/*#else
(void)dt;
ERROR("FATAL ERROR: acDeviceAutoOptimize() or acDeviceIntegrateSubstep() was "
"called, but AC_dt was not defined. Either define it or call the generated "
"device function acDeviceKernel_<kernel name> which does not require the "
"timestep to be defined.\n");
#endif*/
if (step_number == 0)
solve<0><<<bpg, tpb, 0, stream>>>(start, end, vba);
else if (step_number == 1)
solve<1><<<bpg, tpb, 0, stream>>>(start, end, vba);
else
solve<2><<<bpg, tpb, 0, stream>>>(start, end, vba);
ERRCHK_CUDA_KERNEL();
return AC_SUCCESS;
}
static __global__ void
dummy_kernel(void)
{
DCONST((AcIntParam)0);
DCONST((AcInt3Param)0);
DCONST((AcRealParam)0);
DCONST((AcReal3Param)0);
acComplex a = exp(AcReal(1) * acComplex(1, 1) * AcReal(1));
a* a;
}
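// (Presumably the body above exists only to force instantiation of each DCONST
// overload and the acComplex operators; 'a* a' is a deliberate, otherwise
// meaningless use of operator*.)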
AcResult
acKernelDummy(void)
{
dummy_kernel<<<1, 1>>>();
ERRCHK_CUDA_KERNEL_ALWAYS();
return AC_SUCCESS;
}

src/core/kernels/kernels.cu (new file)

@@ -0,0 +1,177 @@
#include "kernels.h"
#include <assert.h>
#include <cuComplex.h>
#include "errchk.h"
#include "math_utils.h"
__device__ __constant__ AcMeshInfo d_mesh_info;
static int __device__ __forceinline__
DCONST(const AcIntParam param)
{
return d_mesh_info.int_params[param];
}
static int3 __device__ __forceinline__
DCONST(const AcInt3Param param)
{
return d_mesh_info.int3_params[param];
}
static AcReal __device__ __forceinline__
DCONST(const AcRealParam param)
{
return d_mesh_info.real_params[param];
}
static AcReal3 __device__ __forceinline__
DCONST(const AcReal3Param param)
{
return d_mesh_info.real3_params[param];
}
static __device__ constexpr VertexBufferHandle
DCONST(const VertexBufferHandle handle)
{
return handle;
}
#define DEVICE_VTXBUF_IDX(i, j, k) ((i) + (j)*DCONST(AC_mx) + (k)*DCONST(AC_mxy))
#define DEVICE_1D_COMPDOMAIN_IDX(i, j, k) ((i) + (j)*DCONST(AC_nx) + (k)*DCONST(AC_nxy))
#define globalGridN (d_mesh_info.int3_params[AC_global_grid_n])
//#define globalMeshM // Placeholder
//#define localMeshN // Placeholder
//#define localMeshM // Placeholder
//#define localMeshN_min // Placeholder
//#define globalMeshN_min // Placeholder
#define d_multigpu_offset (d_mesh_info.int3_params[AC_multigpu_offset])
//#define d_multinode_offset (d_mesh_info.int3_params[AC_multinode_offset]) // Placeholder
static __device__ constexpr int
IDX(const int i)
{
return i;
}
static __device__ __forceinline__ int
IDX(const int i, const int j, const int k)
{
return DEVICE_VTXBUF_IDX(i, j, k);
}
static __device__ __forceinline__ int
IDX(const int3 idx)
{
return DEVICE_VTXBUF_IDX(idx.x, idx.y, idx.z);
}
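// Index layout example: DEVICE_VTXBUF_IDX walks x fastest, then y, then z.
// With hypothetical pad sizes AC_mx = 8 and AC_mxy = 64,
// IDX(1, 2, 3) = 1 + 2 * 8 + 3 * 64 = 209.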
#if AC_DOUBLE_PRECISION == 1
typedef cuDoubleComplex acComplex;
#define acComplex(x, y) make_cuDoubleComplex(x, y)
#else
typedef cuFloatComplex acComplex;
#define acComplex(x, y) make_cuFloatComplex(x, y)
#define exp(x) expf(x)
#define sin(x) sinf(x)
#define cos(x) cosf(x)
#define sqrt(x) sqrtf(x)
#endif
static __device__ inline acComplex
exp(const acComplex& val)
{
return acComplex(exp(val.x) * cos(val.y), exp(val.x) * sin(val.y));
}
static __device__ inline acComplex operator*(const AcReal& a, const acComplex& b)
{
return (acComplex){a * b.x, a * b.y};
}
static __device__ inline acComplex operator*(const acComplex& b, const AcReal& a)
{
return (acComplex){a * b.x, a * b.y};
}
static __device__ inline acComplex operator*(const acComplex& a, const acComplex& b)
{
return (acComplex){a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x};
}
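// Note: exp(z) above is Euler's formula, exp(x + iy) = exp(x) * (cos(y) + i sin(y)),
// and the operator* overloads implement complex scaling and the standard complex
// product (a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x) for both precisions.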
// Kernels /////////////////////////////////////////////////////////////////////
#include "boundconds.cuh"
#include "integration.cuh"
#include "packing.cuh"
#include "reductions.cuh"
AcResult
acDeviceLoadMeshInfo(const Device device, const Stream stream, const AcMeshInfo device_config)
{
cudaSetDevice(device->id);
ERRCHK_ALWAYS(device_config.int_params[AC_nx] == device->local_config.int_params[AC_nx]);
ERRCHK_ALWAYS(device_config.int_params[AC_ny] == device->local_config.int_params[AC_ny]);
ERRCHK_ALWAYS(device_config.int_params[AC_nz] == device->local_config.int_params[AC_nz]);
ERRCHK_ALWAYS(device_config.int_params[AC_multigpu_offset] ==
device->local_config.int_params[AC_multigpu_offset]);
ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbolAsync(d_mesh_info, &device_config, sizeof(device_config),
0, cudaMemcpyHostToDevice, device->streams[stream]));
return AC_SUCCESS;
}
AcResult
acDeviceLoadScalarUniform(const Device device, const Stream stream, const AcRealParam param,
const AcReal value)
{
cudaSetDevice(device->id);
if (param >= NUM_REAL_PARAMS)
return AC_FAILURE;
const size_t offset = (size_t)&d_mesh_info.real_params[param] - (size_t)&d_mesh_info;
ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
cudaMemcpyHostToDevice, device->streams[stream]));
return AC_SUCCESS;
}
AcResult
acDeviceLoadVectorUniform(const Device device, const Stream stream, const AcReal3Param param,
const AcReal3 value)
{
cudaSetDevice(device->id);
if (param >= NUM_REAL3_PARAMS || !NUM_REAL3_PARAMS)
return AC_FAILURE;
const size_t offset = (size_t)&d_mesh_info.real3_params[param] - (size_t)&d_mesh_info;
ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
cudaMemcpyHostToDevice, device->streams[stream]));
return AC_SUCCESS;
}
AcResult
acDeviceLoadIntUniform(const Device device, const Stream stream, const AcIntParam param,
const int value)
{
cudaSetDevice(device->id);
if (param >= NUM_INT_PARAMS)
return AC_FAILURE;
const size_t offset = (size_t)&d_mesh_info.int_params[param] - (size_t)&d_mesh_info;
ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
cudaMemcpyHostToDevice, device->streams[stream]));
return AC_SUCCESS;
}
AcResult
acDeviceLoadInt3Uniform(const Device device, const Stream stream, const AcInt3Param param,
const int3 value)
{
cudaSetDevice(device->id);
if (param >= NUM_INT3_PARAMS)
return AC_FAILURE;
const size_t offset = (size_t)&d_mesh_info.int3_params[param] - (size_t)&d_mesh_info;
ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
cudaMemcpyHostToDevice, device->streams[stream]));
return AC_SUCCESS;
}
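// Usage sketch (parameter values are illustrative; Device and Stream come from
// the core API):
//   acDeviceLoadScalarUniform(device, STREAM_DEFAULT, AC_dt, AcReal(1e-3));
//   acDeviceLoadInt3Uniform(device, STREAM_DEFAULT, AC_multigpu_offset, (int3){0, 0, 0});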


@@ -0,0 +1,82 @@
#pragma once
#include "astaroth.h"
typedef struct {
int3 dims;
AcReal* data;
} PackedData;
typedef struct {
AcReal* in[NUM_VTXBUF_HANDLES];
AcReal* out[NUM_VTXBUF_HANDLES];
AcReal* profiles[NUM_SCALARARRAY_HANDLES];
} VertexBufferArray;
struct device_s {
int id;
AcMeshInfo local_config;
// Concurrency
cudaStream_t streams[NUM_STREAMS];
// Memory
VertexBufferArray vba;
AcReal* reduce_scratchpad;
AcReal* reduce_result;
};
#ifdef __cplusplus
extern "C" {
#endif
/** */
AcResult acKernelPeriodicBoundconds(const cudaStream_t stream, const int3 start, const int3 end,
AcReal* vtxbuf);
/** */
AcResult acKernelDummy(void);
/** */
AcResult acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferArray vba);
/** */
AcResult acKernelIntegrateSubstep(const cudaStream_t stream, const int step_number,
const int3 start, const int3 end, VertexBufferArray vba);
/** */
AcResult acKernelPackData(const cudaStream_t stream, const VertexBufferArray vba,
const int3 vba_start, PackedData packed);
/** */
AcResult acKernelUnpackData(const cudaStream_t stream, const PackedData packed,
const int3 vba_start, VertexBufferArray vba);
/** */
AcReal acKernelReduceScal(const cudaStream_t stream, const ReductionType rtype, const int3 start,
const int3 end, const AcReal* vtxbuf, AcReal* scratchpad,
AcReal* reduce_result);
/** */
AcReal acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, const int3 start,
const int3 end, const AcReal* vtxbuf0, const AcReal* vtxbuf1,
const AcReal* vtxbuf2, AcReal* scratchpad, AcReal* reduce_result);
AcResult acDeviceLoadMeshInfo(const Device device, const Stream stream,
const AcMeshInfo device_config);
AcResult acDeviceLoadScalarUniform(const Device device, const Stream stream,
const AcRealParam param, const AcReal value);
AcResult acDeviceLoadVectorUniform(const Device device, const Stream stream,
const AcReal3Param param, const AcReal3 value);
AcResult acDeviceLoadIntUniform(const Device device, const Stream stream, const AcIntParam param,
const int value);
AcResult acDeviceLoadInt3Uniform(const Device device, const Stream stream, const AcInt3Param param,
const int3 value);
#ifdef __cplusplus
} // extern "C"
#endif
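/* Note (added; not in the original header): PackedData.data is assumed to hold
dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES AcReal elements, matching the
vtxbuf_offset stride the packing kernels use to store one contiguous block per
vertex buffer. */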


@@ -1,120 +0,0 @@
/*
Copyright (C) 2014-2020, Johannes Pekkilae, Miikka Vaeisalae.
This file is part of Astaroth.
Astaroth is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Astaroth is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* @file
* \brief Brief info.
*
* Detailed info.
*
*/
#include "packing.cuh"
#include "src/core/errchk.h"
__global__ void
kernel_pack_data(const VertexBufferArray vba, const int3 vba_start, PackedData packed)
{
const int i_packed = threadIdx.x + blockIdx.x * blockDim.x;
const int j_packed = threadIdx.y + blockIdx.y * blockDim.y;
const int k_packed = threadIdx.z + blockIdx.z * blockDim.z;
// If within the start-end range (this allows threadblock dims that are not
// divisible by end - start)
if (i_packed >= packed.dims.x || //
j_packed >= packed.dims.y || //
k_packed >= packed.dims.z) {
return;
}
const int i_unpacked = i_packed + vba_start.x;
const int j_unpacked = j_packed + vba_start.y;
const int k_unpacked = k_packed + vba_start.z;
const int unpacked_idx = DEVICE_VTXBUF_IDX(i_unpacked, j_unpacked, k_unpacked);
const int packed_idx = i_packed + //
j_packed * packed.dims.x + //
k_packed * packed.dims.x * packed.dims.y;
const size_t vtxbuf_offset = packed.dims.x * packed.dims.y * packed.dims.z;
//#pragma unroll
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i)
packed.data[packed_idx + i * vtxbuf_offset] = vba.in[i][unpacked_idx];
}
__global__ void
kernel_unpack_data(const PackedData packed, const int3 vba_start, VertexBufferArray vba)
{
const int i_packed = threadIdx.x + blockIdx.x * blockDim.x;
const int j_packed = threadIdx.y + blockIdx.y * blockDim.y;
const int k_packed = threadIdx.z + blockIdx.z * blockDim.z;
// If within the start-end range (this allows threadblock dims that are not
// divisible by end - start)
if (i_packed >= packed.dims.x || //
j_packed >= packed.dims.y || //
k_packed >= packed.dims.z) {
return;
}
const int i_unpacked = i_packed + vba_start.x;
const int j_unpacked = j_packed + vba_start.y;
const int k_unpacked = k_packed + vba_start.z;
const int unpacked_idx = DEVICE_VTXBUF_IDX(i_unpacked, j_unpacked, k_unpacked);
const int packed_idx = i_packed + //
j_packed * packed.dims.x + //
k_packed * packed.dims.x * packed.dims.y;
const size_t vtxbuf_offset = packed.dims.x * packed.dims.y * packed.dims.z;
//#pragma unroll
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i)
vba.in[i][unpacked_idx] = packed.data[packed_idx + i * vtxbuf_offset];
}
AcResult
acKernelPackData(const cudaStream_t stream, const VertexBufferArray vba, const int3 vba_start,
PackedData packed)
{
const dim3 tpb(32, 8, 1);
const dim3 bpg((unsigned int)ceil(packed.dims.x / (float)tpb.x),
(unsigned int)ceil(packed.dims.y / (float)tpb.y),
(unsigned int)ceil(packed.dims.z / (float)tpb.z));
kernel_pack_data<<<bpg, tpb, 0, stream>>>(vba, vba_start, packed);
ERRCHK_CUDA_KERNEL();
return AC_SUCCESS;
}
AcResult
acKernelUnpackData(const cudaStream_t stream, const PackedData packed, const int3 vba_start,
VertexBufferArray vba)
{
const dim3 tpb(32, 8, 1);
const dim3 bpg((unsigned int)ceil(packed.dims.x / (float)tpb.x),
(unsigned int)ceil(packed.dims.y / (float)tpb.y),
(unsigned int)ceil(packed.dims.z / (float)tpb.z));
kernel_unpack_data<<<bpg, tpb, 0, stream>>>(packed, vba_start, vba);
ERRCHK_CUDA_KERNEL();
return AC_SUCCESS;
}


@@ -1,40 +1,92 @@
/*
Copyright (C) 2014-2020, Johannes Pekkilae, Miikka Vaeisalae.
This file is part of Astaroth.
Astaroth is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Astaroth is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* @file
* \brief Brief info.
*
* Detailed info.
*
*/
#pragma once
#include "astaroth.h"
#include "common.cuh"
static __global__ void
kernel_pack_data(const VertexBufferArray vba, const int3 vba_start, PackedData packed)
{
const int i_packed = threadIdx.x + blockIdx.x * blockDim.x;
const int j_packed = threadIdx.y + blockIdx.y * blockDim.y;
const int k_packed = threadIdx.z + blockIdx.z * blockDim.z;
// If within the start-end range (this allows threadblock dims that are not
// divisible by end - start)
if (i_packed >= packed.dims.x || //
j_packed >= packed.dims.y || //
k_packed >= packed.dims.z) {
return;
}
const int i_unpacked = i_packed + vba_start.x;
const int j_unpacked = j_packed + vba_start.y;
const int k_unpacked = k_packed + vba_start.z;
const int unpacked_idx = DEVICE_VTXBUF_IDX(i_unpacked, j_unpacked, k_unpacked);
const int packed_idx = i_packed + //
j_packed * packed.dims.x + //
k_packed * packed.dims.x * packed.dims.y;
const size_t vtxbuf_offset = packed.dims.x * packed.dims.y * packed.dims.z;
//#pragma unroll
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i)
packed.data[packed_idx + i * vtxbuf_offset] = vba.in[i][unpacked_idx];
}
static __global__ void
kernel_unpack_data(const PackedData packed, const int3 vba_start, VertexBufferArray vba)
{
const int i_packed = threadIdx.x + blockIdx.x * blockDim.x;
const int j_packed = threadIdx.y + blockIdx.y * blockDim.y;
const int k_packed = threadIdx.z + blockIdx.z * blockDim.z;
// If within the start-end range (this allows threadblock dims that are not
// divisible by end - start)
if (i_packed >= packed.dims.x || //
j_packed >= packed.dims.y || //
k_packed >= packed.dims.z) {
return;
}
const int i_unpacked = i_packed + vba_start.x;
const int j_unpacked = j_packed + vba_start.y;
const int k_unpacked = k_packed + vba_start.z;
const int unpacked_idx = DEVICE_VTXBUF_IDX(i_unpacked, j_unpacked, k_unpacked);
const int packed_idx = i_packed + //
j_packed * packed.dims.x + //
k_packed * packed.dims.x * packed.dims.y;
const size_t vtxbuf_offset = packed.dims.x * packed.dims.y * packed.dims.z;
//#pragma unroll
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i)
vba.in[i][unpacked_idx] = packed.data[packed_idx + i * vtxbuf_offset];
}
AcResult
acKernelPackData(const cudaStream_t stream, const VertexBufferArray vba, const int3 vba_start,
PackedData packed)
{
const dim3 tpb(32, 8, 1);
const dim3 bpg((unsigned int)ceil(packed.dims.x / (float)tpb.x),
(unsigned int)ceil(packed.dims.y / (float)tpb.y),
(unsigned int)ceil(packed.dims.z / (float)tpb.z));
kernel_pack_data<<<bpg, tpb, 0, stream>>>(vba, vba_start, packed);
ERRCHK_CUDA_KERNEL();
return AC_SUCCESS;
}
AcResult
acKernelUnpackData(const cudaStream_t stream, const PackedData packed, const int3 vba_start,
VertexBufferArray vba)
{
const dim3 tpb(32, 8, 1);
const dim3 bpg((unsigned int)ceil(packed.dims.x / (float)tpb.x),
(unsigned int)ceil(packed.dims.y / (float)tpb.y),
(unsigned int)ceil(packed.dims.z / (float)tpb.z));
kernel_unpack_data<<<bpg, tpb, 0, stream>>>(packed, vba_start, vba);
ERRCHK_CUDA_KERNEL();
return AC_SUCCESS;
}
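// Round-trip sketch (hypothetical halo block; d_buf must hold
// mx * my * nghost * NUM_VTXBUF_HANDLES reals):
//   PackedData halo = {(int3){mx, my, nghost}, d_buf};
//   acKernelPackData(stream, vba, (int3){0, 0, 0}, halo);   // mesh -> buffer
//   /* ...exchange halo.data, e.g. over MPI... */
//   acKernelUnpackData(stream, halo, (int3){0, 0, 0}, vba); // buffer -> mesh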


@@ -1,284 +0,0 @@
/*
Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.
This file is part of Astaroth.
Astaroth is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Astaroth is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* @file
* \brief Brief info.
*
* Detailed info.
*
*/
#include "common.cuh"
#include "reductions.cuh"
#include "src/core/errchk.h"
#include "src/core/math_utils.h" // is_power_of_two
/*
Reduction steps:
1 of 3: Compute the initial value (a, a*a or exp(a)*exp(a)) and put the result in scratchpad
2 of 3: Compute most of the reductions into a single block of data
3 of 3: After all results have been stored into the final block, reduce the data in the final block
*/
// Function pointer definitions
typedef AcReal (*FilterFunc)(const AcReal&);
typedef AcReal (*FilterFuncVec)(const AcReal&, const AcReal&, const AcReal&);
typedef AcReal (*ReduceFunc)(const AcReal&, const AcReal&);
// clang-format off
/* Comparison funcs */
static __device__ inline AcReal
dmax(const AcReal& a, const AcReal& b) { return a > b ? a : b; }
static __device__ inline AcReal
dmin(const AcReal& a, const AcReal& b) { return a < b ? a : b; }
static __device__ inline AcReal
dsum(const AcReal& a, const AcReal& b) { return a + b; }
/* Function used to determine the values used during reduction */
static __device__ inline AcReal
dvalue(const AcReal& a) { return AcReal(a); }
static __device__ inline AcReal
dsquared(const AcReal& a) { return (AcReal)(a*a); }
static __device__ inline AcReal
dexp_squared(const AcReal& a) { return exp(a)*exp(a); }
static __device__ inline AcReal
dlength_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return sqrt(a*a + b*b + c*c); }
static __device__ inline AcReal
dsquared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return dsquared(a) + dsquared(b) + dsquared(c); }
static __device__ inline AcReal
dexp_squared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return dexp_squared(a) + dexp_squared(b) + dexp_squared(c); }
// clang-format on
#include <assert.h>
template <FilterFunc filter>
__global__ void
kernel_filter(const __restrict__ AcReal* src, const int3 start, const int3 end, AcReal* dst)
{
const int3 src_idx = (int3){start.x + threadIdx.x + blockIdx.x * blockDim.x,
start.y + threadIdx.y + blockIdx.y * blockDim.y,
start.z + threadIdx.z + blockIdx.z * blockDim.z};
const int nx = end.x - start.x;
const int ny = end.y - start.y;
const int nz = end.z - start.z;
(void)nz; // Suppressed unused variable warning when not compiling with debug flags
const int3 dst_idx = (int3){threadIdx.x + blockIdx.x * blockDim.x,
threadIdx.y + blockIdx.y * blockDim.y,
threadIdx.z + blockIdx.z * blockDim.z};
assert(src_idx.x < DCONST(AC_nx_max) && src_idx.y < DCONST(AC_ny_max) &&
src_idx.z < DCONST(AC_nz_max));
assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);
dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(src[IDX(src_idx)]);
}
template <FilterFuncVec filter>
__global__ void
kernel_filter_vec(const __restrict__ AcReal* src0, const __restrict__ AcReal* src1,
const __restrict__ AcReal* src2, const int3 start, const int3 end, AcReal* dst)
{
const int3 src_idx = (int3){start.x + threadIdx.x + blockIdx.x * blockDim.x,
start.y + threadIdx.y + blockIdx.y * blockDim.y,
start.z + threadIdx.z + blockIdx.z * blockDim.z};
const int nx = end.x - start.x;
const int ny = end.y - start.y;
const int nz = end.z - start.z;
(void)nz; // Suppressed unused variable warning when not compiling with debug flags
const int3 dst_idx = (int3){threadIdx.x + blockIdx.x * blockDim.x,
threadIdx.y + blockIdx.y * blockDim.y,
threadIdx.z + blockIdx.z * blockDim.z};
assert(src_idx.x < DCONST(AC_nx_max) && src_idx.y < DCONST(AC_ny_max) &&
src_idx.z < DCONST(AC_nz_max));
assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);
dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(
src0[IDX(src_idx)], src1[IDX(src_idx)], src2[IDX(src_idx)]);
}
template <ReduceFunc reduce>
__global__ void
kernel_reduce(AcReal* scratchpad, const int num_elems)
{
const int idx = threadIdx.x + blockIdx.x * blockDim.x;
extern __shared__ AcReal smem[];
if (idx < num_elems) {
smem[threadIdx.x] = scratchpad[idx];
}
else {
smem[threadIdx.x] = NAN;
}
__syncthreads();
int offset = blockDim.x / 2;
assert(offset % 2 == 0);
while (offset > 0) {
if (threadIdx.x < offset) {
smem[threadIdx.x] = reduce(smem[threadIdx.x], smem[threadIdx.x + offset]);
}
offset /= 2;
__syncthreads();
}
if (threadIdx.x == 0) {
scratchpad[idx] = smem[threadIdx.x];
}
}
template <ReduceFunc reduce>
__global__ void
kernel_reduce_block(const __restrict__ AcReal* scratchpad, const int num_blocks,
const int block_size, AcReal* result)
{
const int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx != 0) {
return;
}
AcReal res = scratchpad[0];
for (int i = 1; i < num_blocks; ++i) {
res = reduce(res, scratchpad[i * block_size]);
}
*result = res;
}
AcReal
acKernelReduceScal(const cudaStream_t stream, const ReductionType rtype, const int3 start,
const int3 end, const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result)
{
const unsigned nx = end.x - start.x;
const unsigned ny = end.y - start.y;
const unsigned nz = end.z - start.z;
const unsigned num_elems = nx * ny * nz;
const dim3 tpb_filter(32, 4, 1);
const dim3 bpg_filter((unsigned int)ceil(nx / AcReal(tpb_filter.x)),
(unsigned int)ceil(ny / AcReal(tpb_filter.y)),
(unsigned int)ceil(nz / AcReal(tpb_filter.z)));
const int tpb_reduce = 128;
const int bpg_reduce = num_elems / tpb_reduce;
ERRCHK(nx >= tpb_filter.x);
ERRCHK(ny >= tpb_filter.y);
ERRCHK(nz >= tpb_filter.z);
ERRCHK(tpb_reduce <= num_elems);
ERRCHK(nx * ny * nz % 2 == 0);
// clang-format off
if (rtype == RTYPE_MAX) {
kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
kernel_reduce_block<dmax><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
} else if (rtype == RTYPE_MIN) {
kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
kernel_reduce_block<dmin><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
} else if (rtype == RTYPE_RMS) {
kernel_filter<dsquared><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
} else if (rtype == RTYPE_RMS_EXP) {
kernel_filter<dexp_squared><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
} else if (rtype == RTYPE_SUM) {
kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
} else {
ERROR("Unrecognized rtype");
}
// clang-format on
cudaStreamSynchronize(stream);
AcReal result;
cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
return result;
}
AcReal
acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, const int3 start,
const int3 end, const AcReal* vtxbuf0, const AcReal* vtxbuf1,
const AcReal* vtxbuf2, AcReal* scratchpad, AcReal* reduce_result)
{
const unsigned nx = end.x - start.x;
const unsigned ny = end.y - start.y;
const unsigned nz = end.z - start.z;
const unsigned num_elems = nx * ny * nz;
const dim3 tpb_filter(32, 4, 1);
const dim3 bpg_filter((unsigned int)ceil(nx / AcReal(tpb_filter.x)),
(unsigned int)ceil(ny / AcReal(tpb_filter.y)),
(unsigned int)ceil(nz / AcReal(tpb_filter.z)));
const int tpb_reduce = 128;
const int bpg_reduce = num_elems / tpb_reduce;
ERRCHK(nx >= tpb_filter.x);
ERRCHK(ny >= tpb_filter.y);
ERRCHK(nz >= tpb_filter.z);
ERRCHK(tpb_reduce <= num_elems);
ERRCHK(nx * ny * nz % 2 == 0);
// clang-format off
if (rtype == RTYPE_MAX) {
kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
kernel_reduce_block<dmax><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
} else if (rtype == RTYPE_MIN) {
kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
kernel_reduce_block<dmin><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
} else if (rtype == RTYPE_RMS) {
kernel_filter_vec<dsquared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
} else if (rtype == RTYPE_RMS_EXP) {
kernel_filter_vec<dexp_squared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
} else if (rtype == RTYPE_SUM) {
kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
} else {
ERROR("Unrecognized rtype");
}
// clang-format on
cudaStreamSynchronize(stream);
AcReal result;
cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
return result;
}


@@ -1,44 +1,254 @@
#pragma once
/*
Reduction steps:
 1 of 3: Compute the initial value (a, a*a or exp(a)*exp(a)) and put the result in scratchpad
 2 of 3: Compute most of the reductions into a single block of data
 3 of 3: After all results have been stored into the final block, reduce the data in the final block
*/
// Function pointer definitions
typedef AcReal (*FilterFunc)(const AcReal&);
typedef AcReal (*FilterFuncVec)(const AcReal&, const AcReal&, const AcReal&);
typedef AcReal (*ReduceFunc)(const AcReal&, const AcReal&);
// clang-format off
/* Comparison funcs */
static __device__ inline AcReal
dmax(const AcReal& a, const AcReal& b) { return a > b ? a : b; }
static __device__ inline AcReal
dmin(const AcReal& a, const AcReal& b) { return a < b ? a : b; }
static __device__ inline AcReal
dsum(const AcReal& a, const AcReal& b) { return a + b; }
/* Function used to determine the values used during reduction */
static __device__ inline AcReal
dvalue(const AcReal& a) { return AcReal(a); }
static __device__ inline AcReal
dsquared(const AcReal& a) { return (AcReal)(a*a); }
static __device__ inline AcReal
dexp_squared(const AcReal& a) { return exp(a)*exp(a); }
static __device__ inline AcReal
dlength_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return sqrt(a*a + b*b + c*c); }
static __device__ inline AcReal
dsquared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return dsquared(a) + dsquared(b) + dsquared(c); }
static __device__ inline AcReal
dexp_squared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return dexp_squared(a) + dexp_squared(b) + dexp_squared(c); }
// clang-format on
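The kernels below take these device functions as non-type template parameters, so each instantiation (kernel_reduce<dmax>, kernel_reduce<dsum>, ...) bakes the operation in at compile time instead of dispatching through a runtime function pointer, which device code cannot do cheaply. A minimal host-side sketch of the same pattern (illustrative names, not part of the commit):

#include <cstdio>

typedef double (*ReduceOp)(const double&, const double&);

static inline double op_max(const double& a, const double& b) { return a > b ? a : b; }
static inline double op_sum(const double& a, const double& b) { return a + b; }

// 'op' is a compile-time constant, so each instantiation is a separate,
// fully inlined reduction, analogous to kernel_reduce<dmax> vs kernel_reduce<dsum>.
template <ReduceOp op>
static double
reduce_array(const double* data, int n)
{
    double acc = data[0];
    for (int i = 1; i < n; ++i)
        acc = op(acc, data[i]);
    return acc;
}

int main()
{
    const double xs[] = {3, 1, 4, 1, 5};
    printf("max: %g, sum: %g\n", reduce_array<op_max>(xs, 5), reduce_array<op_sum>(xs, 5));
    return 0;
}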
#include <assert.h>
template <FilterFunc filter>
static __global__ void
kernel_filter(const __restrict__ AcReal* src, const int3 start, const int3 end, AcReal* dst)
{
const int3 src_idx = (int3){start.x + threadIdx.x + blockIdx.x * blockDim.x,
start.y + threadIdx.y + blockIdx.y * blockDim.y,
start.z + threadIdx.z + blockIdx.z * blockDim.z};
const int nx = end.x - start.x;
const int ny = end.y - start.y;
const int nz = end.z - start.z;
(void)nz; // Suppresses the unused-variable warning when asserts are compiled out (NDEBUG)
const int3 dst_idx = (int3){threadIdx.x + blockIdx.x * blockDim.x,
threadIdx.y + blockIdx.y * blockDim.y,
threadIdx.z + blockIdx.z * blockDim.z};
assert(src_idx.x < DCONST(AC_nx_max) && src_idx.y < DCONST(AC_ny_max) &&
src_idx.z < DCONST(AC_nz_max));
assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);
dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(src[IDX(src_idx)]);
}
template <FilterFuncVec filter>
static __global__ void
kernel_filter_vec(const __restrict__ AcReal* src0, const __restrict__ AcReal* src1,
const __restrict__ AcReal* src2, const int3 start, const int3 end, AcReal* dst)
{
const int3 src_idx = (int3){start.x + threadIdx.x + blockIdx.x * blockDim.x,
start.y + threadIdx.y + blockIdx.y * blockDim.y,
start.z + threadIdx.z + blockIdx.z * blockDim.z};
const int nx = end.x - start.x;
const int ny = end.y - start.y;
const int nz = end.z - start.z;
(void)nz; // Suppresses the unused-variable warning when asserts are compiled out (NDEBUG)
const int3 dst_idx = (int3){threadIdx.x + blockIdx.x * blockDim.x,
threadIdx.y + blockIdx.y * blockDim.y,
threadIdx.z + blockIdx.z * blockDim.z};
assert(src_idx.x < DCONST(AC_nx_max) && src_idx.y < DCONST(AC_ny_max) &&
src_idx.z < DCONST(AC_nz_max));
assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);
dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(
src0[IDX(src_idx)], src1[IDX(src_idx)], src2[IDX(src_idx)]);
}
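Both filter kernels read from the halo-padded mesh (via IDX) and write a dense nx-by-ny-by-nz block at the front of scratchpad, applying the transform on the fly. A CPU reference of that index mapping, under the assumption that IDX is a plain row-major index over the padded dimensions mx, my (an assumption for illustration, not taken from this commit):

// CPU reference of the filter/compaction stage. Assumes the padded mesh is
// indexed row-major as i + j*mx + k*mx*my, mirroring what IDX() is expected
// to compute; 'f' stands in for the FilterFunc template parameter.
static void
filter_reference(const double* src, int mx, int my,
                 int sx, int sy, int sz, // start (inclusive)
                 int ex, int ey, int ez, // end (exclusive)
                 double (*f)(double), double* dst)
{
    const int nx = ex - sx, ny = ey - sy;
    for (int k = sz; k < ez; ++k)
        for (int j = sy; j < ey; ++j)
            for (int i = sx; i < ex; ++i) {
                const int src_idx = i + j * mx + k * mx * my;                      // padded mesh
                const int dst_idx = (i - sx) + (j - sy) * nx + (k - sz) * nx * ny; // dense block
                dst[dst_idx] = f(src[src_idx]);
            }
}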
template <ReduceFunc reduce>
static __global__ void
kernel_reduce(AcReal* scratchpad, const int num_elems)
{
const int idx = threadIdx.x + blockIdx.x * blockDim.x;
extern __shared__ AcReal smem[];
if (idx < num_elems) {
smem[threadIdx.x] = scratchpad[idx];
}
else {
smem[threadIdx.x] = NAN;
}
__syncthreads();
int offset = blockDim.x / 2;
assert(offset % 2 == 0);
while (offset > 0) {
if (threadIdx.x < offset) {
smem[threadIdx.x] = reduce(smem[threadIdx.x], smem[threadIdx.x + offset]);
}
offset /= 2;
__syncthreads();
}
if (threadIdx.x == 0) {
scratchpad[idx] = smem[threadIdx.x];
}
}
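kernel_reduce is a standard shared-memory tree reduction: each step folds the upper half of smem onto the lower half, so a 128-thread block finishes in log2(128) = 7 steps; the loop assumes blockDim.x is a power of two. Thread 0 then writes the block's partial result to scratchpad[blockIdx.x * blockDim.x], i.e. the partials end up strided by the block size. A host-side simulation of one 8-element block under dmax:

#include <cstdio>

int main()
{
    // Stands in for one block's shared memory after the load phase.
    double smem[8] = {5, 2, 7, 1, 3, 8, 6, 4};
    for (int offset = 8 / 2; offset > 0; offset /= 2) // offsets 4, 2, 1
        for (int tid = 0; tid < offset; ++tid)        // the "threads", in lock-step
            smem[tid] = smem[tid] > smem[tid + offset] ? smem[tid] : smem[tid + offset];
    printf("block result: %g\n", smem[0]);            // prints 8
    return 0;
}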
template <ReduceFunc reduce>
static __global__ void
kernel_reduce_block(const __restrict__ AcReal* scratchpad, const int num_blocks,
const int block_size, AcReal* result)
{
const int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx != 0) {
return;
}
AcReal res = scratchpad[0];
for (int i = 1; i < num_blocks; ++i) {
res = reduce(res, scratchpad[i * block_size]);
}
*result = res;
}
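Putting the three stages together, scratchpad is reused throughout: stage 1 fills indices 0..num_elems-1 with transformed values, stage 2 leaves one partial per block at index blockIdx.x * blockDim.x (the slots in between are stale), and stage 3 walks those strided slots serially. A worked layout with tpb_reduce = 4 and num_elems = 12 (so three blocks):

scratchpad after stage 2:
  index:  0  1  2  3 |  4  5  6  7 |  8  9 10 11
  value: P0  .  .  . | P1  .  .  . | P2  .  .  .   (Pi = partial of block i, '.' = stale)
stage 3 reads indices 0, 4, 8 (i * block_size) and folds P0, P1, P2 into *result.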
AcReal
acKernelReduceScal(const cudaStream_t stream, const ReductionType rtype, const int3 start,
const int3 end, const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result)
{
const unsigned nx = end.x - start.x;
const unsigned ny = end.y - start.y;
const unsigned nz = end.z - start.z;
const unsigned num_elems = nx * ny * nz;
const dim3 tpb_filter(32, 4, 1);
const dim3 bpg_filter((unsigned int)ceil(nx / AcReal(tpb_filter.x)),
(unsigned int)ceil(ny / AcReal(tpb_filter.y)),
(unsigned int)ceil(nz / AcReal(tpb_filter.z)));
const int tpb_reduce = 128;
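    // NOTE: integer division; stages 2-3 only cover bpg_reduce * tpb_reduce elements,
    // so num_elems is effectively assumed to be a multiple of tpb_reduce here.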
const int bpg_reduce = num_elems / tpb_reduce;
ERRCHK(nx >= tpb_filter.x);
ERRCHK(ny >= tpb_filter.y);
ERRCHK(nz >= tpb_filter.z);
ERRCHK(tpb_reduce <= num_elems);
ERRCHK(nx * ny * nz % 2 == 0);
// clang-format off
if (rtype == RTYPE_MAX) {
kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
kernel_reduce_block<dmax><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
} else if (rtype == RTYPE_MIN) {
kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
kernel_reduce_block<dmin><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
} else if (rtype == RTYPE_RMS) {
kernel_filter<dsquared><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
} else if (rtype == RTYPE_RMS_EXP) {
kernel_filter<dexp_squared><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
} else if (rtype == RTYPE_SUM) {
kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
} else {
ERROR("Unrecognized rtype");
}
// clang-format on
cudaStreamSynchronize(stream);
AcReal result;
cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
return result;
}
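A hypothetical host-side usage sketch. The buffer management and the NGHOST halo offset below are assumptions for illustration; in the library these buffers are presumably owned by the Device. Note that for RTYPE_RMS the kernel chain returns a plain sum of squares, so the caller would normalize afterwards:

// Hypothetical usage of acKernelReduceScal; names below are assumptions.
const int3 start     = (int3){NGHOST, NGHOST, NGHOST}; // assumed halo offset
const int3 end       = (int3){NGHOST + nx, NGHOST + ny, NGHOST + nz};
const size_t num_elems = (size_t)nx * ny * nz;

AcReal *scratchpad, *reduce_result;
cudaMalloc(&scratchpad, num_elems * sizeof(AcReal));   // one slot per interior cell
cudaMalloc(&reduce_result, sizeof(AcReal));

const AcReal ss  = acKernelReduceScal(0, RTYPE_RMS, start, end, vtxbuf,
                                      scratchpad, reduce_result);
const AcReal rms = sqrt(ss / (AcReal)num_elems);       // caller-side normalization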
AcReal
acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, const int3 start,
const int3 end, const AcReal* vtxbuf0, const AcReal* vtxbuf1,
const AcReal* vtxbuf2, AcReal* scratchpad, AcReal* reduce_result)
{
const unsigned nx = end.x - start.x;
const unsigned ny = end.y - start.y;
const unsigned nz = end.z - start.z;
const unsigned num_elems = nx * ny * nz;
const dim3 tpb_filter(32, 4, 1);
const dim3 bpg_filter((unsigned int)ceil(nx / AcReal(tpb_filter.x)),
(unsigned int)ceil(ny / AcReal(tpb_filter.y)),
(unsigned int)ceil(nz / AcReal(tpb_filter.z)));
const int tpb_reduce = 128;
const int bpg_reduce = num_elems / tpb_reduce;
ERRCHK(nx >= tpb_filter.x);
ERRCHK(ny >= tpb_filter.y);
ERRCHK(nz >= tpb_filter.z);
ERRCHK(tpb_reduce <= num_elems);
ERRCHK(nx * ny * nz % 2 == 0);
// clang-format off
if (rtype == RTYPE_MAX) {
kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
kernel_reduce_block<dmax><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
} else if (rtype == RTYPE_MIN) {
kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
kernel_reduce_block<dmin><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
} else if (rtype == RTYPE_RMS) {
kernel_filter_vec<dsquared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
} else if (rtype == RTYPE_RMS_EXP) {
kernel_filter_vec<dexp_squared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
} else if (rtype == RTYPE_SUM) {
kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
} else {
ERROR("Unrecognized rtype");
}
// clang-format on
cudaStreamSynchronize(stream);
AcReal result;
cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
return result;
}
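The vector variant differs only in the filter stage: RTYPE_MAX/MIN/SUM reduce over the pointwise magnitude sqrt(a*a + b*b + c*c), and RTYPE_RMS over the squared norm, so the caller-side normalization mirrors the scalar case (again an assumption about the calling convention, with hypothetical field names):

// Hypothetical vector RMS for a velocity field (uu_x, uu_y, uu_z):
const AcReal ss  = acKernelReduceVec(0, RTYPE_RMS, start, end,
                                     uu_x, uu_y, uu_z, scratchpad, reduce_result);
const AcReal rms = sqrt(ss / (AcReal)num_elems); // ss = sum of |uu|^2 over the domain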

View File

@@ -1,187 +0,0 @@
/*
Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.
This file is part of Astaroth.
Astaroth is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Astaroth is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* @file
* \brief Brief info.
*
* Detailed info.
*
*/
#pragma once
//#include <cmath>
// using namespace std; // Potentially bad practice to declare namespace std here
#include <math.h> // isnan, isinf // Overloads incorrect/bugged with GCC <= 6.0
//#include <tgmath.h> // Even this does not work
#include <stdlib.h> // rand
template <class T>
static inline const T
max(const T& a, const T& b)
{
return a > b ? a : b;
}
template <class T>
static inline const T
min(const T& a, const T& b)
{
return a < b ? a : b;
}
static inline const int3
max(const int3& a, const int3& b)
{
return (int3){max(a.x, b.x), max(a.y, b.y), max(a.z, b.z)};
}
static inline const int3
min(const int3& a, const int3& b)
{
return (int3){min(a.x, b.x), min(a.y, b.y), min(a.z, b.z)};
}
template <class T>
static inline const T
sum(const T& a, const T& b)
{
return a + b;
}
template <class T>
static inline const T
clamp(const T& val, const T& min, const T& max)
{
return val < min ? min : val > max ? max : val;
}
static inline AcReal
randr()
{
return AcReal(rand()) / AcReal(RAND_MAX);
}
static inline bool
is_power_of_two(const unsigned val)
{
return val && !(val & (val - 1));
}
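The bit trick: val & (val - 1) clears the lowest set bit, so the result is zero exactly when val has a single bit set; the leading 'val &&' rejects zero. Worked examples:

is_power_of_two(8): 8 = 1000b, 7 = 0111b, 1000b & 0111b = 0     -> true
is_power_of_two(6): 6 = 0110b, 5 = 0101b, 0110b & 0101b = 0100b -> false
is_power_of_two(0): short-circuits on 'val &&'                  -> false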
#ifdef __CUDACC__
#define HOST_DEVICE_INLINE __host__ __device__ __forceinline__
#else
#define HOST_DEVICE_INLINE inline
#endif // __CUDACC__
static HOST_DEVICE_INLINE AcReal3
operator+(const AcReal3& a, const AcReal3& b)
{
return (AcReal3){a.x + b.x, a.y + b.y, a.z + b.z};
}
static HOST_DEVICE_INLINE int3
operator+(const int3& a, const int3& b)
{
return (int3){a.x + b.x, a.y + b.y, a.z + b.z};
}
static HOST_DEVICE_INLINE int3 operator*(const int3& a, const int3& b)
{
return (int3){a.x * b.x, a.y * b.y, a.z * b.z};
}
static HOST_DEVICE_INLINE void
operator+=(AcReal3& lhs, const AcReal3& rhs)
{
lhs.x += rhs.x;
lhs.y += rhs.y;
lhs.z += rhs.z;
}
static HOST_DEVICE_INLINE AcReal3
operator-(const AcReal3& a, const AcReal3& b)
{
return (AcReal3){a.x - b.x, a.y - b.y, a.z - b.z};
}
static HOST_DEVICE_INLINE int3
operator-(const int3& a, const int3& b)
{
return (int3){a.x - b.x, a.y - b.y, a.z - b.z};
}
static HOST_DEVICE_INLINE AcReal3
operator-(const AcReal3& a)
{
return (AcReal3){-a.x, -a.y, -a.z};
}
static HOST_DEVICE_INLINE void
operator-=(AcReal3& lhs, const AcReal3& rhs)
{
lhs.x -= rhs.x;
lhs.y -= rhs.y;
lhs.z -= rhs.z;
}
static HOST_DEVICE_INLINE AcReal3 operator*(const AcReal& a, const AcReal3& b)
{
return (AcReal3){a * b.x, a * b.y, a * b.z};
}
static HOST_DEVICE_INLINE AcReal3 operator*(const AcReal3& b, const AcReal& a)
{
return (AcReal3){a * b.x, a * b.y, a * b.z};
}
static HOST_DEVICE_INLINE AcReal
dot(const AcReal3& a, const AcReal3& b)
{
return a.x * b.x + a.y * b.y + a.z * b.z;
}
static HOST_DEVICE_INLINE AcReal3
mul(const AcMatrix& aa, const AcReal3& x)
{
return (AcReal3){dot(aa.row[0], x), dot(aa.row[1], x), dot(aa.row[2], x)};
}
static HOST_DEVICE_INLINE AcReal3
cross(const AcReal3& a, const AcReal3& b)
{
AcReal3 c;
c.x = a.y * b.z - a.z * b.y;
c.y = a.z * b.x - a.x * b.z;
c.z = a.x * b.y - a.y * b.x;
return c;
}
static HOST_DEVICE_INLINE bool
is_valid(const AcReal& a)
{
return !isnan(a) && !isinf(a);
}
static HOST_DEVICE_INLINE bool
is_valid(const AcReal3& a)
{
return is_valid(a.x) && is_valid(a.y) && is_valid(a.z);
}

View File

@@ -123,12 +123,10 @@
db da
 *
 */
#include "astaroth_node.h"
#include "astaroth_device.h"
#include "errchk.h"
#include "kernels/common.cuh"
#include "math_utils.h" // sum for reductions

static const int MAX_NUM_DEVICES = 32;
db da
 *
 */
#include "astaroth.h"
#include "errchk.h"
#include "math_utils.h"

static const int MAX_NUM_DEVICES = 32;