diff --git a/include/astaroth_defines.h b/include/astaroth_defines.h
index d8a4027..0771d07 100644
--- a/include/astaroth_defines.h
+++ b/include/astaroth_defines.h
@@ -26,7 +26,10 @@ extern "C" {
 
 #include <stddef.h> // size_t
 //#include <vector_types.h> // CUDA vector types (float4, etc)
-#ifndef __CUDACC__
+//#ifndef __CUDACC__
+#if defined(AC_USE_CUDA_RUNTIME_API) || defined(__CUDACC__)
+#include <cuda_runtime_api.h>
+#else
 typedef struct {
     int x, y, z;
 } int3;
@@ -42,7 +45,8 @@ typedef struct {
 typedef struct {
     double x, y, z;
 } double3;
-#endif // __CUDACC__
+#endif
+//#endif // __CUDACC__
 
 // Library flags
 #define STENCIL_ORDER (6)
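The widened guard keeps the public header usable in three configurations: under nvcc, in host C++ that opts into the CUDA runtime API, and in plain host code that gets the struct fallbacks. A hypothetical host-only consumer, as a sketch (not part of the patch):

/* Compiled without nvcc and without AC_USE_CUDA_RUNTIME_API, this
   translation unit sees the plain-struct definitions of int3/float3/double3
   from astaroth_defines.h instead of the CUDA vector types. */
#include "astaroth_defines.h"

int main(void)
{
    int3 dims = {32, 32, 32}; /* the struct fallback; no CUDA headers needed */
    return dims.x * dims.y * dims.z > 0 ? 0 : 1;
}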
diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index 5f55b02..f716fa4 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -2,8 +2,13 @@
 ## CMakeLists.txt for Astaroth Core ##
 #######################################
 
+# Require C++11
+set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
 ## Find packages
 find_package(CUDA REQUIRED)
+set(CUDA_SEPARABLE_COMPILATION ON)
 # find_package(OpenMP REQUIRED)
 
 ## Architecture and optimization flags
@@ -23,8 +28,13 @@ set(CUDA_ARCH_FLAGS -gencode arch=compute_37,code=sm_37
 set(CUDA_WARNING_FLAGS --compiler-options -Wall,-Wextra,-Werror,-Wdouble-promotion,-Wfloat-conversion) # -Wshadow
 
 set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${CUDA_ARCH_FLAGS} ${CUDA_WARNING_FLAGS})
-set(CUDA_NVCC_FLAGS_RELEASE)
-set(CUDA_NVCC_FLAGS_DEBUG --device-debug --generate-line-info --ptxas-options=-v)
+
+# Workaround for CMake bug https://gitlab.kitware.com/cmake/cmake/issues/16411
+if (CMAKE_BUILD_TYPE_UPPER STREQUAL "RELEASE")
+    set(CUDA_NVCC_FLAGS_RELEASE -O3)
+elseif (CMAKE_BUILD_TYPE_UPPER STREQUAL "DEBUG")
+    set(CUDA_NVCC_FLAGS_DEBUG --device-debug --generate-line-info --ptxas-options=-v)
+endif()
 
 if (MPI_ENABLED)
     find_package(MPI REQUIRED)
@@ -36,10 +46,11 @@ if (MPI_ENABLED)
 endif ()
 
 ## Create and link the library
-set(CMAKE_POSITION_INDEPENDENT_CODE ON) # fpic for shared libraries
-cuda_add_library(astaroth_core STATIC astaroth.cu device.cu node.cu)
-target_include_directories(astaroth_core PRIVATE .)
-target_link_libraries(astaroth_core m)
+#set(CMAKE_POSITION_INDEPENDENT_CODE ON) # fpic for shared libraries
+cuda_add_library(astaroth_core STATIC astaroth.cc device.cc node.cc kernels/boundconds.cu kernels/integration.cu kernels/reductions.cu)
+target_include_directories(astaroth_core PRIVATE . ${CUDA_INCLUDE_DIRS})
+target_link_libraries(astaroth_core ${CUDA_LIBRARIES} m)
+target_compile_definitions(astaroth_core PRIVATE AC_USE_CUDA_RUNTIME_API)
 add_dependencies(astaroth_core dsl_headers)
 
 ## Definitions
diff --git a/src/core/astaroth.cu b/src/core/astaroth.cc
similarity index 100%
rename from src/core/astaroth.cu
rename to src/core/astaroth.cc
diff --git a/src/core/device.cu b/src/core/device.cc
similarity index 82%
rename from src/core/device.cu
rename to src/core/device.cc
index 4c64098..c33bde2 100644
--- a/src/core/device.cu
+++ b/src/core/device.cc
@@ -26,20 +26,10 @@
  */
 #include "astaroth_device.h"
+#include "math_utils.h"
 #include "errchk.h"
 
-// Device info
-#define REGISTERS_PER_THREAD (255)
-#define MAX_REGISTERS_PER_BLOCK (65536)
-#define MAX_THREADS_PER_BLOCK (1024)
-#define WARP_SIZE (32)
-
-typedef struct {
-    AcReal* in[NUM_VTXBUF_HANDLES];
-    AcReal* out[NUM_VTXBUF_HANDLES];
-
-    AcReal* profiles[NUM_SCALARARRAY_HANDLES];
-} VertexBufferArray;
+#include "kernels/common.cuh"
 
 struct device_s {
     int id;
@@ -63,94 +53,14 @@ struct device_s {
 #endif
 };
 
-__constant__ AcMeshInfo d_mesh_info;
-static int __device__ __forceinline__
-DCONST(const AcIntParam param)
-{
-    return d_mesh_info.int_params[param];
-}
-static int3 __device__ __forceinline__
-DCONST(const AcInt3Param param)
-{
-    return d_mesh_info.int3_params[param];
-}
-static AcReal __device__ __forceinline__
-DCONST(const AcRealParam param)
-{
-    return d_mesh_info.real_params[param];
-}
-static AcReal3 __device__ __forceinline__
-DCONST(const AcReal3Param param)
-{
-    return d_mesh_info.real3_params[param];
-}
-static __device__ constexpr VertexBufferHandle
-DCONST(const VertexBufferHandle handle)
-{
-    return handle;
-}
-#define DEVICE_VTXBUF_IDX(i, j, k) ((i) + (j)*DCONST(AC_mx) + (k)*DCONST(AC_mxy))
-#define DEVICE_1D_COMPDOMAIN_IDX(i, j, k) ((i) + (j)*DCONST(AC_nx) + (k)*DCONST(AC_nxy))
-#define globalGridN (d_mesh_info.int3_params[AC_global_grid_n])
-//#define globalMeshM // Placeholder
-//#define localMeshN // Placeholder
-//#define localMeshM // Placeholder
-//#define localMeshN_min // Placeholder
-//#define globalMeshN_min // Placeholder
-#define d_multigpu_offset (d_mesh_info.int3_params[AC_multigpu_offset])
-//#define d_multinode_offset (d_mesh_info.int3_params[AC_multinode_offset]) // Placeholder
-//#include <thrust/complex.h>
-// using namespace thrust;
-#include <cuComplex.h>
-#if AC_DOUBLE_PRECISION == 1
-typedef cuDoubleComplex acComplex;
-#define acComplex(x, y) make_cuDoubleComplex(x, y)
-#else
-typedef cuFloatComplex acComplex;
-#define acComplex(x, y) make_cuFloatComplex(x, y)
-#endif
-static __device__ inline acComplex
-exp(const acComplex& val)
-{
-    return acComplex(exp(val.x) * cos(val.y), exp(val.x) * sin(val.y));
-}
-static __device__ inline acComplex operator*(const AcReal& a, const acComplex& b)
-{
-    return (acComplex){a * b.x, a * b.y};
-}
-
-static __device__ inline acComplex operator*(const acComplex& b, const AcReal& a)
-{
-    return (acComplex){a * b.x, a * b.y};
-}
-
-static __device__ inline acComplex operator*(const acComplex& a, const acComplex& b)
-{
-    return (acComplex){a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x};
-}
-//#include
-
 #include "kernels/boundconds.cuh"
 #include "kernels/integration.cuh"
 #include "kernels/reductions.cuh"
 
-static dim3 rk3_tpb(32, 1, 4);
-
 #if PACKED_DATA_TRANSFERS // Defined in device.cuh
 // #include "kernels/pack_unpack.cuh"
 #endif
 
-static __global__ void
-dummy_kernel(void)
-{
-    DCONST((AcIntParam)0);
-    DCONST((AcInt3Param)0);
-    DCONST((AcRealParam)0);
-    DCONST((AcReal3Param)0);
-    acComplex a = exp(AcReal(1) * acComplex(1, 1) * AcReal(1));
-    a* a;
-}
-
 AcResult
 acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device_handle)
 {
@@ -171,8 +81,7 @@ acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device_hand
            "device supports the CUDA architecture you are compiling for.\n"
            "Running dummy kernel... ");
     fflush(stdout);
-    dummy_kernel<<<1, 1>>>();
-    ERRCHK_CUDA_KERNEL_ALWAYS();
+    acKernelDummy();
     printf("Success!\n");
 
     // Concurrency
@@ -184,21 +93,21 @@
     // VBA in/out
     const size_t vba_size_bytes = acVertexBufferSizeBytes(device_config);
     for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
-        ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.in[i], vba_size_bytes));
-        ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.out[i], vba_size_bytes));
+        ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->vba.in[i], vba_size_bytes));
+        ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->vba.out[i], vba_size_bytes));
     }
 
     // VBA Profiles
     const size_t profile_size_bytes = sizeof(AcReal) * max(device_config.int_params[AC_mx],
                                                            max(device_config.int_params[AC_my],
                                                                device_config.int_params[AC_mz]));
     for (int i = 0; i < NUM_SCALARARRAY_HANDLES; ++i) {
-        ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.profiles[i], profile_size_bytes));
+        ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->vba.profiles[i], profile_size_bytes));
    }
 
     // Reductions
     ERRCHK_CUDA_ALWAYS(
-        cudaMalloc(&device->reduce_scratchpad, acVertexBufferCompdomainSizeBytes(device_config)));
-    ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->reduce_result, sizeof(AcReal)));
+        cudaMalloc((void**)&device->reduce_scratchpad, acVertexBufferCompdomainSizeBytes(device_config)));
+    ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->reduce_result, sizeof(AcReal)));
 
 #if AC_MPI_ENABLED
     // Allocate data required for packed transfers here (cudaMalloc)
@@ -206,8 +115,8 @@
                                    device_config.int_params[AC_my] * NGHOST * NUM_VTXBUF_HANDLES *
                                    sizeof(AcReal);
     for (int i = 0; i < 2; ++i) {
-        ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->inner[i], block_size_bytes));
-        ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->outer[i], block_size_bytes));
+        ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->inner[i], block_size_bytes));
+        ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->outer[i], block_size_bytes));
 
         ERRCHK_CUDA_ALWAYS(cudaMallocHost(&device->inner_host[i], block_size_bytes));
         ERRCHK_CUDA_ALWAYS(cudaMallocHost(&device->outer_host[i], block_size_bytes));
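A note on the (void**) casts introduced above: device.cc is no longer compiled by nvcc, which implicitly includes cuda_runtime.h and its templated cudaMalloc overload accepting any T**. A plain C++ translation unit built against cuda_runtime_api.h only sees the C entry point cudaMalloc(void**, size_t), so the casts become mandatory. A minimal sketch of the pattern (illustrative, not part of the patch):

#include <cuda_runtime_api.h>

// Host-only allocation helper: the explicit (void**) cast is required
// because only the C-style cudaMalloc signature is visible here.
static AcReal* alloc_device_buffer(const size_t count)
{
    AcReal* buf = NULL;
    ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&buf, count * sizeof(AcReal)));
    return buf;
}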
@@ -332,87 +241,11 @@ AcResult
 acDeviceAutoOptimize(const Device device)
 {
     cudaSetDevice(device->id);
-
-    // RK3
     const int3 start = (int3){NGHOST, NGHOST, NGHOST};
-    const int3 end   = start + (int3){device->local_config.int_params[AC_nx], //
-                                      device->local_config.int_params[AC_ny], //
-                                      device->local_config.int_params[AC_nz]};
-
-    dim3 best_dims(0, 0, 0);
-    float best_time          = INFINITY;
-    const int num_iterations = 10;
-
-    for (int z = 1; z <= MAX_THREADS_PER_BLOCK; ++z) {
-        for (int y = 1; y <= MAX_THREADS_PER_BLOCK; ++y) {
-            for (int x = WARP_SIZE; x <= MAX_THREADS_PER_BLOCK; x += WARP_SIZE) {
-
-                if (x > end.x - start.x || y > end.y - start.y || z > end.z - start.z)
-                    break;
-                if (x * y * z > MAX_THREADS_PER_BLOCK)
-                    break;
-
-                if (x * y * z * REGISTERS_PER_THREAD > MAX_REGISTERS_PER_BLOCK)
-                    break;
-
-                if (((x * y * z) % WARP_SIZE) != 0)
-                    continue;
-
-                const dim3 tpb(x, y, z);
-                const int3 n = end - start;
-                const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), //
-                               (unsigned int)ceil(n.y / AcReal(tpb.y)), //
-                               (unsigned int)ceil(n.z / AcReal(tpb.z)));
-
-                cudaDeviceSynchronize();
-                if (cudaGetLastError() != cudaSuccess) // resets the error if any
-                    continue;
-
-                // printf("(%d, %d, %d)\n", x, y, z);
-
-                cudaEvent_t tstart, tstop;
-                cudaEventCreate(&tstart);
-                cudaEventCreate(&tstop);
-
-                // #ifdef AC_dt
-                acDeviceLoadScalarUniform(device, STREAM_DEFAULT, AC_dt, FLT_EPSILON);
-                /*#else
-                        ERROR("FATAL ERROR: acDeviceAutoOptimize() or
-                   acDeviceIntegrateSubstep() was " "called, but AC_dt was not defined. Either define
-                   it or call the generated " "device function acDeviceKernel_<kernel_name> which does
-                   not require the " "timestep to be defined.\n"); #endif*/
-
-                cudaEventRecord(tstart); // ---------------------------------------- Timing start
-                for (int i = 0; i < num_iterations; ++i)
-                    solve<2><<<bpg, tpb>>>(start, end, device->vba);
-
-                cudaEventRecord(tstop); // ----------------------------------------- Timing end
-                cudaEventSynchronize(tstop);
-                float milliseconds = 0;
-                cudaEventElapsedTime(&milliseconds, tstart, tstop);
-
-                ERRCHK_CUDA_KERNEL_ALWAYS();
-                if (milliseconds < best_time) {
-                    best_time = milliseconds;
-                    best_dims = tpb;
-                }
-            }
-        }
-    }
-#if VERBOSE_PRINTING
-    printf(
-        "Auto-optimization done. The best threadblock dimensions for rkStep: (%d, %d, %d) %f ms\n",
-        best_dims.x, best_dims.y, best_dims.z, double(best_time) / num_iterations);
-#endif
-    /*
-    FILE* fp = fopen("../config/rk3_tbdims.cuh", "w");
-    ERRCHK(fp);
-    fprintf(fp, "%d, %d, %d\n", best_dims.x, best_dims.y, best_dims.z);
-    fclose(fp);
-    */
-
-    rk3_tpb = best_dims;
-    return AC_SUCCESS;
+    const int3 end   = (int3){device->local_config.int_params[AC_nx_max], //
+                              device->local_config.int_params[AC_ny_max], //
+                              device->local_config.int_params[AC_nz_max]};
+    return acKernelAutoOptimizeIntegration(start, end, device->vba);
 }
 
 AcResult
@@ -446,7 +279,7 @@ acDeviceLoadScalarUniform(const Device device, const Stream stream, const AcReal
 {
     cudaSetDevice(device->id);
     const size_t offset = (size_t)&d_mesh_info.real_params[param] - (size_t)&d_mesh_info;
-    ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
+    ERRCHK_CUDA(cudaMemcpyToSymbolAsync(&d_mesh_info, &value, sizeof(value), offset,
                                         cudaMemcpyHostToDevice, device->streams[stream]));
     return AC_SUCCESS;
 }
 
 AcResult
@@ -457,7 +290,7 @@ acDeviceLoadVectorUniform(const Device device, const Stream stream, const AcReal
 {
     cudaSetDevice(device->id);
     const size_t offset = (size_t)&d_mesh_info.real3_params[param] - (size_t)&d_mesh_info;
-    ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
+    ERRCHK_CUDA(cudaMemcpyToSymbolAsync(&d_mesh_info, &value, sizeof(value), offset,
                                         cudaMemcpyHostToDevice, device->streams[stream]));
     return AC_SUCCESS;
 }
 
 AcResult
@@ -468,7 +301,7 @@ acDeviceLoadIntUniform(const Device device, const Stream stream, const AcIntPara
 {
     cudaSetDevice(device->id);
     const size_t offset = (size_t)&d_mesh_info.int_params[param] - (size_t)&d_mesh_info;
-    ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
+    ERRCHK_CUDA(cudaMemcpyToSymbolAsync(&d_mesh_info, &value, sizeof(value), offset,
                                         cudaMemcpyHostToDevice, device->streams[stream]));
     return AC_SUCCESS;
 }
 
 AcResult
@@ -479,7 +312,7 @@ acDeviceLoadInt3Uniform(const Device device, const Stream stream, const AcInt3Pa
 {
     cudaSetDevice(device->id);
     const size_t offset = (size_t)&d_mesh_info.int3_params[param] - (size_t)&d_mesh_info;
-    ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
+    ERRCHK_CUDA(cudaMemcpyToSymbolAsync(&d_mesh_info, &value, sizeof(value), offset,
                                         cudaMemcpyHostToDevice, device->streams[stream]));
     return AC_SUCCESS;
 }
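The offset used by these loaders is the byte distance of one member from the start of the __constant__ struct. An equivalent formulation with offsetof from <stddef.h>, shown as a sketch (not part of the patch):

#include <stddef.h> // offsetof

// Equivalent to the pointer arithmetic above: the byte offset of
// real_params[param] within AcMeshInfo, passed as the symbol offset so
// that only one member of d_mesh_info is transferred.
const size_t offset = offsetof(AcMeshInfo, real_params) + param * sizeof(AcReal);
ERRCHK_CUDA(cudaMemcpyToSymbolAsync(&d_mesh_info, &value, sizeof(value), offset,
                                    cudaMemcpyHostToDevice, device->streams[stream]));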
 
 AcResult
@@ -510,7 +343,7 @@ acDeviceLoadMeshInfo(const Device device, const Stream stream, const AcMeshInfo
     ERRCHK_ALWAYS(device_config.int_params[AC_multigpu_offset] ==
                   device->local_config.int_params[AC_multigpu_offset]);
 
-    ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbolAsync(d_mesh_info, &device_config, sizeof(device_config),
+    ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbolAsync(&d_mesh_info, &device_config, sizeof(device_config),
                                                0, cudaMemcpyHostToDevice, device->streams[stream]));
     return AC_SUCCESS;
 }
 
 AcResult
@@ -685,52 +518,17 @@ acDeviceIntegrateSubstep(const Device device, const Stream stream, const int ste
                          const int3 start, const int3 end, const AcReal dt)
 {
     cudaSetDevice(device->id);
-
-    const dim3 tpb = rk3_tpb;
-
-    const int3 n = end - start;
-    const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), //
-                   (unsigned int)ceil(n.y / AcReal(tpb.y)), //
-                   (unsigned int)ceil(n.z / AcReal(tpb.z)));
-
-    //#ifdef AC_dt
     acDeviceLoadScalarUniform(device, stream, AC_dt, dt);
-    /*#else
-        (void)dt;
-        ERROR("FATAL ERROR: acDeviceAutoOptimize() or acDeviceIntegrateSubstep() was "
-              "called, but AC_dt was not defined. Either define it or call the generated "
-              "device function acDeviceKernel_<kernel_name> which does not require the "
-              "timestep to be defined.\n");
-    #endif*/
-    if (step_number == 0)
-        solve<0><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba);
-    else if (step_number == 1)
-        solve<1><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba);
-    else
-        solve<2><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba);
-
-    ERRCHK_CUDA_KERNEL();
-
-    return AC_SUCCESS;
+    return acKernelIntegrateSubstep(device->streams[stream], step_number, start, end, device->vba);
 }
 
 AcResult
-acDevicePeriodicBoundcondStep(const Device device, const Stream stream_type,
+acDevicePeriodicBoundcondStep(const Device device, const Stream stream,
                               const VertexBufferHandle vtxbuf_handle, const int3 start,
                               const int3 end)
 {
     cudaSetDevice(device->id);
-    const cudaStream_t stream = device->streams[stream_type];
-
-    const dim3 tpb(8, 2, 8);
-    const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x),
-                   (unsigned int)ceil((end.y - start.y) / (float)tpb.y),
-                   (unsigned int)ceil((end.z - start.z) / (float)tpb.z));
-
-    kernel_periodic_boundconds<<<bpg, tpb, 0, stream>>>(start, end, device->vba.in[vtxbuf_handle]);
-    ERRCHK_CUDA_KERNEL();
-
-    return AC_SUCCESS;
+    return acKernelPeriodicBoundconds(device->streams[stream], start, end, device->vba.in[vtxbuf_handle]);
 }
 
 AcResult
@@ -757,7 +555,7 @@ acDeviceReduceScal(const Device device, const Stream stream, const ReductionType
                           device->local_config.int_params[AC_ny_max],
                           device->local_config.int_params[AC_nz_max]};
 
-    *result = reduce_scal(device->streams[stream], rtype, start, end, device->vba.in[vtxbuf_handle],
+    *result = acKernelReduceScal(device->streams[stream], rtype, start, end, device->vba.in[vtxbuf_handle],
                           device->reduce_scratchpad, device->reduce_result);
     return AC_SUCCESS;
 }
@@ -777,7 +575,7 @@ acDeviceReduceVec(const Device device, const Stream stream, const ReductionType
                           device->local_config.int_params[AC_ny_max],
                           device->local_config.int_params[AC_nz_max]};
 
-    *result = reduce_vec(device->streams[stream], rtype, start, end, device->vba.in[vtxbuf0],
+    *result = acKernelReduceVec(device->streams[stream], rtype, start, end, device->vba.in[vtxbuf0],
                          device->vba.in[vtxbuf1], device->vba.in[vtxbuf2],
                          device->reduce_scratchpad, device->reduce_result);
     return AC_SUCCESS;
diff --git a/src/core/errchk.h b/src/core/errchk.h
index 72ac7f9..9646a83 100644
--- a/src/core/errchk.h
+++ b/src/core/errchk.h
@@ -65,7 +65,7 @@
  * CUDA-specific error checking
  * =============================================================================
  */
-#ifdef __CUDACC__
+#if defined(AC_USE_CUDA_RUNTIME_API) || defined(__CUDACC__)
 static inline void
 cuda_assert(cudaError_t code, const char* file, int line, bool abort = true)
 {
@@ -98,8 +98,6 @@ cuda_assert(cudaError_t code, const char* file, int line, bool abort = true)
         ERRCHK_CUDA(cudaPeekAtLastError());                                                        \
         ERRCHK_CUDA(cudaDeviceSynchronize());                                                      \
     }
-    #endif
-
-    #endif
 
 #define ERRCHK_CUDA_ALWAYS(params) { cuda_assert((params), __FILE__, __LINE__); }
@@ -112,3 +110,4 @@ cuda_assert(cudaError_t code, const char* file, int line, bool abort = true)
 #define WARNCHK_CUDA_ALWAYS(params) { cuda_assert((params), __FILE__, __LINE__, false); }
 
 // clang-format on
+#endif // AC_USE_CUDA_RUNTIME_API
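With the guard widened from __CUDACC__ alone, the error-checking macros become available to host-only translation units such as the renamed device.cc. A sketch of the intended use (AC_USE_CUDA_RUNTIME_API is supplied by the target_compile_definitions call above; illustrative only):

#include <cuda_runtime_api.h>
#include "errchk.h"

// A plain C++ helper, compiled without nvcc, that still gets checked
// CUDA calls through the same macros.
static void zero_device_buffer(AcReal* buf, const size_t bytes)
{
    ERRCHK_CUDA_ALWAYS(cudaMemset(buf, 0, bytes));
}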
diff --git a/src/core/grid.cu b/src/core/grid.cc
similarity index 100%
rename from src/core/grid.cu
rename to src/core/grid.cc
diff --git a/src/core/kernels/boundconds.cu b/src/core/kernels/boundconds.cu
new file mode 100644
index 0000000..a001c6d
--- /dev/null
+++ b/src/core/kernels/boundconds.cu
@@ -0,0 +1,89 @@
+/*
+    Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
+ * \brief Brief info.
+ *
+ * Detailed info.
+ *
+ */
+#include "common.cuh"
+
+#include "src/core/errchk.h"
+
+__global__ void
+kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vtxbuf)
+{
+    const int i_dst = start.x + threadIdx.x + blockIdx.x * blockDim.x;
+    const int j_dst = start.y + threadIdx.y + blockIdx.y * blockDim.y;
+    const int k_dst = start.z + threadIdx.z + blockIdx.z * blockDim.z;
+
+    // If within the start-end range (this allows threadblock dims that are not
+    // divisible by end - start)
+    if (i_dst >= end.x || j_dst >= end.y || k_dst >= end.z)
+        return;
+
+    // if (i_dst >= DCONST(AC_mx) || j_dst >= DCONST(AC_my) || k_dst >= DCONST(AC_mz))
+    //    return;
+
+    // If destination index is inside the computational domain, return since
+    // the boundary conditions are only applied to the ghost zones
+    if (i_dst >= DCONST(AC_nx_min) && i_dst < DCONST(AC_nx_max) && j_dst >= DCONST(AC_ny_min) &&
+        j_dst < DCONST(AC_ny_max) && k_dst >= DCONST(AC_nz_min) && k_dst < DCONST(AC_nz_max))
+        return;
+
+    // Find the source index
+    // Map to nx, ny, nz coordinates
+    int i_src = i_dst - DCONST(AC_nx_min);
+    int j_src = j_dst - DCONST(AC_ny_min);
+    int k_src = k_dst - DCONST(AC_nz_min);
+
+    // Translate (s.t. the index is always positive)
+    i_src += DCONST(AC_nx);
+    j_src += DCONST(AC_ny);
+    k_src += DCONST(AC_nz);
+
+    // Wrap
+    i_src %= DCONST(AC_nx);
+    j_src %= DCONST(AC_ny);
+    k_src %= DCONST(AC_nz);
+
+    // Map to mx, my, mz coordinates
+    i_src += DCONST(AC_nx_min);
+    j_src += DCONST(AC_ny_min);
+    k_src += DCONST(AC_nz_min);
+
+    const int src_idx = DEVICE_VTXBUF_IDX(i_src, j_src, k_src);
+    const int dst_idx = DEVICE_VTXBUF_IDX(i_dst, j_dst, k_dst);
+    vtxbuf[dst_idx] = vtxbuf[src_idx];
+}
+
+AcResult
+acKernelPeriodicBoundconds(const cudaStream_t stream, const int3& start, const int3& end, AcReal* vtxbuf)
+{
+    const dim3 tpb(8, 2, 8);
+    const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x),
+                   (unsigned int)ceil((end.y - start.y) / (float)tpb.y),
+                   (unsigned int)ceil((end.z - start.z) / (float)tpb.z));
+
+    kernel_periodic_boundconds<<<bpg, tpb, 0, stream>>>(start, end, vtxbuf);
+    ERRCHK_CUDA_KERNEL();
+    return AC_SUCCESS;
+}
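To make the wrap arithmetic concrete, here is the same one-axis mapping as a host-side helper (an illustrative sketch; nx and nghost stand for DCONST(AC_nx) and NGHOST; e.g. with nx = 128, nghost = 3, ghost index 132 maps to source index 4):

static int periodic_src_index(const int i_dst, const int nx, const int nghost)
{
    int i = i_dst - nghost; // map to computational (nx) coordinates
    i += nx;                // translate so the index stays non-negative
    i %= nx;                // wrap periodically
    return i + nghost;      // map back to padded (mx) coordinates
}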
diff --git a/src/core/kernels/boundconds.cuh b/src/core/kernels/boundconds.cuh
index 1120489..2516000 100644
--- a/src/core/kernels/boundconds.cuh
+++ b/src/core/kernels/boundconds.cuh
@@ -26,61 +26,7 @@
  */
 #pragma once
-__global__ void
-kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vtxbuf)
-{
-    const int i_dst = start.x + threadIdx.x + blockIdx.x * blockDim.x;
-    const int j_dst = start.y + threadIdx.y + blockIdx.y * blockDim.y;
-    const int k_dst = start.z + threadIdx.z + blockIdx.z * blockDim.z;
+#include "astaroth.h"
 
-    // If within the start-end range (this allows threadblock dims that are not
-    // divisible by end - start)
-    if (i_dst >= end.x || j_dst >= end.y || k_dst >= end.z)
-        return;
-
-    // if (i_dst >= DCONST(AC_mx) || j_dst >= DCONST(AC_my) || k_dst >= DCONST(AC_mz))
-    //    return;
-
-    // If destination index is inside the computational domain, return since
-    // the boundary conditions are only applied to the ghost zones
-    if (i_dst >= DCONST(AC_nx_min) && i_dst < DCONST(AC_nx_max) && j_dst >= DCONST(AC_ny_min) &&
-        j_dst < DCONST(AC_ny_max) && k_dst >= DCONST(AC_nz_min) && k_dst < DCONST(AC_nz_max))
-        return;
-
-    // Find the source index
-    // Map to nx, ny, nz coordinates
-    int i_src = i_dst - DCONST(AC_nx_min);
-    int j_src = j_dst - DCONST(AC_ny_min);
-    int k_src = k_dst - DCONST(AC_nz_min);
-
-    // Translate (s.t. the index is always positive)
-    i_src += DCONST(AC_nx);
-    j_src += DCONST(AC_ny);
-    k_src += DCONST(AC_nz);
-
-    // Wrap
-    i_src %= DCONST(AC_nx);
-    j_src %= DCONST(AC_ny);
-    k_src %= DCONST(AC_nz);
-
-    // Map to mx, my, mz coordinates
-    i_src += DCONST(AC_nx_min);
-    j_src += DCONST(AC_ny_min);
-    k_src += DCONST(AC_nz_min);
-
-    const int src_idx = DEVICE_VTXBUF_IDX(i_src, j_src, k_src);
-    const int dst_idx = DEVICE_VTXBUF_IDX(i_dst, j_dst, k_dst);
-    vtxbuf[dst_idx] = vtxbuf[src_idx];
-}
-
-static void
-periodic_boundconds(const cudaStream_t stream, const int3& start, const int3& end, AcReal* vtxbuf)
-{
-    const dim3 tpb(8, 2, 8);
-    const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x),
-                   (unsigned int)ceil((end.y - start.y) / (float)tpb.y),
-                   (unsigned int)ceil((end.z - start.z) / (float)tpb.z));
-
-    kernel_periodic_boundconds<<<bpg, tpb, 0, stream>>>(start, end, vtxbuf);
-    ERRCHK_CUDA_KERNEL();
-}
+AcResult
+acKernelPeriodicBoundconds(const cudaStream_t stream, const int3& start, const int3& end, AcReal* vtxbuf);
diff --git a/src/core/kernels/common.cuh b/src/core/kernels/common.cuh
new file mode 100644
index 0000000..382e9b2
--- /dev/null
+++ b/src/core/kernels/common.cuh
@@ -0,0 +1,136 @@
+/*
+    Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
+ * \brief Brief info.
+ *
+ * Detailed info.
+ *
+ */
+#pragma once
+#include "astaroth.h"
+
+extern __constant__ AcMeshInfo d_mesh_info;
+
+typedef struct {
+    AcReal* in[NUM_VTXBUF_HANDLES];
+    AcReal* out[NUM_VTXBUF_HANDLES];
+
+    AcReal* profiles[NUM_SCALARARRAY_HANDLES];
+} VertexBufferArray;
+
+static int __device__ __forceinline__
+DCONST(const AcIntParam param)
+{
+    return d_mesh_info.int_params[param];
+}
+static int3 __device__ __forceinline__
+DCONST(const AcInt3Param param)
+{
+    return d_mesh_info.int3_params[param];
+}
+static AcReal __device__ __forceinline__
+DCONST(const AcRealParam param)
+{
+    return d_mesh_info.real_params[param];
+}
+static AcReal3 __device__ __forceinline__
+DCONST(const AcReal3Param param)
+{
+    return d_mesh_info.real3_params[param];
+}
+static __device__ constexpr VertexBufferHandle
+DCONST(const VertexBufferHandle handle)
+{
+    return handle;
+}
+#define DEVICE_VTXBUF_IDX(i, j, k) ((i) + (j)*DCONST(AC_mx) + (k)*DCONST(AC_mxy))
+#define DEVICE_1D_COMPDOMAIN_IDX(i, j, k) ((i) + (j)*DCONST(AC_nx) + (k)*DCONST(AC_nxy))
+#define globalGridN (d_mesh_info.int3_params[AC_global_grid_n])
+//#define globalMeshM // Placeholder
+//#define localMeshN // Placeholder
+//#define localMeshM // Placeholder
+//#define localMeshN_min // Placeholder
+//#define globalMeshN_min // Placeholder
+#define d_multigpu_offset (d_mesh_info.int3_params[AC_multigpu_offset])
+//#define d_multinode_offset (d_mesh_info.int3_params[AC_multinode_offset]) // Placeholder
+
+
+
+
+static __device__ constexpr int
+IDX(const int i)
+{
+    return i;
+}
+
+static __device__ __forceinline__ int
+IDX(const int i, const int j, const int k)
+{
+    return DEVICE_VTXBUF_IDX(i, j, k);
+}
+
+static __device__ __forceinline__ int
+IDX(const int3 idx)
+{
+    return DEVICE_VTXBUF_IDX(idx.x, idx.y, idx.z);
+}
+
+
+
+
+
+
+
+
+//#include <thrust/complex.h>
+// using namespace thrust;
+
+
+
+
+#include <cuComplex.h>
+#if AC_DOUBLE_PRECISION == 1
+typedef cuDoubleComplex acComplex;
+#define acComplex(x, y) make_cuDoubleComplex(x, y)
+#else
+typedef cuFloatComplex acComplex;
+#define acComplex(x, y) make_cuFloatComplex(x, y)
+#endif
+static __device__ inline acComplex
+exp(const acComplex& val)
+{
+    return acComplex(exp(val.x) * cos(val.y), exp(val.x) * sin(val.y));
+}
+static __device__ inline acComplex operator*(const AcReal& a, const acComplex& b)
+{
+    return (acComplex){a * b.x, a * b.y};
+}
+
+static __device__ inline acComplex operator*(const acComplex& b, const AcReal& a)
+{
+    return (acComplex){a * b.x, a * b.y};
+}
+
+static __device__ inline acComplex operator*(const acComplex& a, const acComplex& b)
+{
+    return (acComplex){a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x};
+}
+//#include
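For orientation, a sketch of how a translation unit under kernels/ is expected to use these shared helpers (hypothetical kernel, not part of the patch; the AC_m* handles are the padded mesh dimensions from AcMeshInfo):

#include "common.cuh"

// Hypothetical example kernel: scales one vertex buffer in place using
// the DCONST/IDX helpers that common.cuh shares between the .cu files.
__global__ void
scale_field(AcReal* field, const AcReal factor)
{
    const int i = threadIdx.x + blockIdx.x * blockDim.x;
    const int j = threadIdx.y + blockIdx.y * blockDim.y;
    const int k = threadIdx.z + blockIdx.z * blockDim.z;
    if (i >= DCONST(AC_mx) || j >= DCONST(AC_my) || k >= DCONST(AC_mz))
        return;
    field[IDX(i, j, k)] *= factor;
}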
diff --git a/src/core/kernels/integration.cu b/src/core/kernels/integration.cu
new file mode 100644
index 0000000..2f029b5
--- /dev/null
+++ b/src/core/kernels/integration.cu
@@ -0,0 +1,296 @@
+/*
+    Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
+ * \brief Brief info.
+ *
+ * Detailed info.
+ *
+ */
+#include "common.cuh"
+
+#include "src/core/errchk.h"
+#include "src/core/math_utils.h"
+
+#include <assert.h>
+
+__constant__ AcMeshInfo d_mesh_info; // Extern in kernels/common.cuh.
+
+static_assert(NUM_VTXBUF_HANDLES > 0, "ERROR: At least one uniform ScalarField must be declared.");
+
+// Device info
+#define REGISTERS_PER_THREAD (255)
+#define MAX_REGISTERS_PER_BLOCK (65536)
+#define MAX_THREADS_PER_BLOCK (1024)
+#define WARP_SIZE (32)
+
+#define make_int3(a, b, c)                                                                         \
+    (int3) { (int)a, (int)b, (int)c }
+
+template <int step_number>
+static __device__ __forceinline__ AcReal
+rk3_integrate(const AcReal state_previous, const AcReal state_current, const AcReal rate_of_change,
+              const AcReal dt)
+{
+    // Williamson (1980)
+    const AcReal alpha[] = {0, AcReal(.0), AcReal(-5. / 9.), AcReal(-153. / 128.)};
+    const AcReal beta[]  = {0, AcReal(1. / 3.), AcReal(15. / 16.), AcReal(8. / 15.)};
+
+    // Note the indexing: +1 to avoid an unnecessary warning about "out-of-bounds"
+    // access (when accessing beta[step_number-1] even when step_number >= 1)
+    switch (step_number) {
+    case 0:
+        return state_current + beta[step_number + 1] * rate_of_change * dt;
+    case 1: // Fallthrough
+    case 2:
+        return state_current +
+               beta[step_number + 1] * (alpha[step_number + 1] * (AcReal(1.) / beta[step_number]) *
+                                            (state_current - state_previous) +
+                                        rate_of_change * dt);
+    default:
+        return NAN;
+    }
+}
+
+template <int step_number>
+static __device__ __forceinline__ AcReal3
+rk3_integrate(const AcReal3 state_previous, const AcReal3 state_current,
+              const AcReal3 rate_of_change, const AcReal dt)
+{
+    return (AcReal3){
+        rk3_integrate<step_number>(state_previous.x, state_current.x, rate_of_change.x, dt),
+        rk3_integrate<step_number>(state_previous.y, state_current.y, rate_of_change.y, dt),
+        rk3_integrate<step_number>(state_previous.z, state_current.z, rate_of_change.z, dt)};
+}
+
+#define rk3(state_previous, state_current, rate_of_change, dt)                                     \
+    rk3_integrate<step_number>(state_previous, value(state_current), rate_of_change, dt)
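For reference, the update above is Williamson's (1980) low-storage RK3. A host-side, double-precision transliteration that can serve as a unit-test oracle for rk3_integrate (a sketch under the same alpha/beta tables; not part of the patch):

/* Reference (host) version of the same Williamson RK3 update.
   Step 0 reduces to a plain forward step because alpha[1] = 0. */
static double rk3_reference(const double state_previous, const double state_current,
                            const double rate_of_change, const double dt, const int step)
{
    const double alpha[] = {0, 0.0, -5.0 / 9.0, -153.0 / 128.0};
    const double beta[]  = {0, 1.0 / 3.0, 15.0 / 16.0, 8.0 / 15.0};
    if (step == 0)
        return state_current + beta[1] * rate_of_change * dt;
    return state_current +
           beta[step + 1] * (alpha[step + 1] / beta[step] * (state_current - state_previous) +
                             rate_of_change * dt);
}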
+static __device__ void
+write(AcReal* __restrict__ out[], const int handle, const int idx, const AcReal value)
+{
+    out[handle][idx] = value;
+}
+
+static __device__ __forceinline__ void
+write(AcReal* __restrict__ out[], const int3 vec, const int idx, const AcReal3 value)
+{
+    write(out, vec.x, idx, value.x);
+    write(out, vec.y, idx, value.y);
+    write(out, vec.z, idx, value.z);
+}
+
+static __device__ __forceinline__ AcReal
+read_out(const int idx, AcReal* __restrict__ field[], const int handle)
+{
+    return field[handle][idx];
+}
+
+static __device__ __forceinline__ AcReal3
+read_out(const int idx, AcReal* __restrict__ field[], const int3 handle)
+{
+    return (AcReal3){read_out(idx, field, handle.x), read_out(idx, field, handle.y),
+                     read_out(idx, field, handle.z)};
+}
+
+#define WRITE_OUT(handle, value) (write(buffer.out, handle, idx, value))
+#define READ(handle) (read_data(vertexIdx, globalVertexIdx, buffer.in, handle))
+#define READ_OUT(handle) (read_out(idx, buffer.out, handle))
+
+#define GEN_PREPROCESSED_PARAM_BOILERPLATE const int3 &vertexIdx, const int3 &globalVertexIdx
+#define GEN_KERNEL_PARAM_BOILERPLATE const int3 start, const int3 end, VertexBufferArray buffer
+
+#define GEN_KERNEL_BUILTIN_VARIABLES_BOILERPLATE()                                                 \
+    const int3 vertexIdx       = (int3){threadIdx.x + blockIdx.x * blockDim.x + start.x,           \
+                                        threadIdx.y + blockIdx.y * blockDim.y + start.y,           \
+                                        threadIdx.z + blockIdx.z * blockDim.z + start.z};          \
+    const int3 globalVertexIdx = (int3){d_multigpu_offset.x + vertexIdx.x,                         \
+                                        d_multigpu_offset.y + vertexIdx.y,                         \
+                                        d_multigpu_offset.z + vertexIdx.z};                        \
+    (void)globalVertexIdx;                                                                         \
+    if (vertexIdx.x >= end.x || vertexIdx.y >= end.y || vertexIdx.z >= end.z)                      \
+        return;                                                                                    \
+                                                                                                   \
+    assert(vertexIdx.x < DCONST(AC_nx_max) && vertexIdx.y < DCONST(AC_ny_max) &&                   \
+           vertexIdx.z < DCONST(AC_nz_max));                                                       \
+                                                                                                   \
+    assert(vertexIdx.x >= DCONST(AC_nx_min) && vertexIdx.y >= DCONST(AC_ny_min) &&                 \
+           vertexIdx.z >= DCONST(AC_nz_min));                                                      \
+                                                                                                   \
+    const int idx = IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z);
+
+// clang-format off
+/*
+#define GEN_DEVICE_FUNC_HOOK(identifier)                                                           \
+    template <int step_number>                                                                     \
+    AcResult acDeviceKernel_##identifier(const cudaStream_t stream,                                \
+                                         const int3 start, const int3 end, VertexBufferArray vba)  \
+    {                                                                                              \
+                                                                                                   \
+        const dim3 tpb(32, 1, 4);                                                                  \
+                                                                                                   \
+        const int3 n = end - start;                                                                \
+        const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)),                                    \
+                       (unsigned int)ceil(n.y / AcReal(tpb.y)),                                    \
+                       (unsigned int)ceil(n.z / AcReal(tpb.z)));                                   \
+                                                                                                   \
+        identifier<step_number>                                                                    \
+            <<<bpg, tpb, 0, stream>>>(start, end, vba);                                            \
+        ERRCHK_CUDA_KERNEL();                                                                      \
+                                                                                                   \
+        return AC_SUCCESS;                                                                         \
+    }
+*/
+#define GEN_DEVICE_FUNC_HOOK(identifier)
+
+#include "user_kernels.h"
+
+static dim3 rk3_tpb(32, 1, 4);
+
+AcResult
+acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferArray vba)
+{
+    // RK3
+    dim3 best_dims(0, 0, 0);
+    float best_time          = INFINITY;
+    const int num_iterations = 10;
+
+    for (int z = 1; z <= MAX_THREADS_PER_BLOCK; ++z) {
+        for (int y = 1; y <= MAX_THREADS_PER_BLOCK; ++y) {
+            for (int x = WARP_SIZE; x <= MAX_THREADS_PER_BLOCK; x += WARP_SIZE) {
+
+                if (x > end.x - start.x || y > end.y - start.y || z > end.z - start.z)
+                    break;
+                if (x * y * z > MAX_THREADS_PER_BLOCK)
+                    break;
+
+                if (x * y * z * REGISTERS_PER_THREAD > MAX_REGISTERS_PER_BLOCK)
+                    break;
+
+                if (((x * y * z) % WARP_SIZE) != 0)
+                    continue;
+
+                const dim3 tpb(x, y, z);
+                const int3 n = end - start;
+                const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), //
+                               (unsigned int)ceil(n.y / AcReal(tpb.y)), //
+                               (unsigned int)ceil(n.z / AcReal(tpb.z)));
+
+                cudaDeviceSynchronize();
+                if (cudaGetLastError() != cudaSuccess) // resets the error if any
+                    continue;
+
+                // printf("(%d, %d, %d)\n", x, y, z);
+
+                cudaEvent_t tstart, tstop;
+                cudaEventCreate(&tstart);
+                cudaEventCreate(&tstop);
+
+                // #ifdef AC_dt
+                //acDeviceLoadScalarUniform(device, STREAM_DEFAULT, AC_dt, FLT_EPSILON); // TODO note, temporarily disabled
+                /*#else
+                        ERROR("FATAL ERROR: acDeviceAutoOptimize() or
+                   acDeviceIntegrateSubstep() was " "called, but AC_dt was not defined. Either define
+                   it or call the generated " "device function acDeviceKernel_<kernel_name> which does
+                   not require the " "timestep to be defined.\n"); #endif*/
+
+                cudaEventRecord(tstart); // ---------------------------------------- Timing start
+                for (int i = 0; i < num_iterations; ++i)
+                    solve<2><<<bpg, tpb>>>(start, end, vba);
+
+                cudaEventRecord(tstop); // ----------------------------------------- Timing end
+                cudaEventSynchronize(tstop);
+                float milliseconds = 0;
+                cudaEventElapsedTime(&milliseconds, tstart, tstop);
+
+                ERRCHK_CUDA_KERNEL_ALWAYS();
+                if (milliseconds < best_time) {
+                    best_time = milliseconds;
+                    best_dims = tpb;
+                }
+            }
+        }
+    }
+#if VERBOSE_PRINTING
+    printf(
+        "Auto-optimization done. The best threadblock dimensions for rkStep: (%d, %d, %d) %f ms\n",
+        best_dims.x, best_dims.y, best_dims.z, double(best_time) / num_iterations);
+#endif
+    /*
+    FILE* fp = fopen("../config/rk3_tbdims.cuh", "w");
+    ERRCHK(fp);
+    fprintf(fp, "%d, %d, %d\n", best_dims.x, best_dims.y, best_dims.z);
+    fclose(fp);
+    */
+
+    rk3_tpb = best_dims;
+    return AC_SUCCESS;
+}
+
+AcResult
+acKernelIntegrateSubstep(const cudaStream_t stream, const int step_number, const int3 start, const int3 end, VertexBufferArray vba)
+{
+    const dim3 tpb = rk3_tpb;
+
+    const int3 n = end - start;
+    const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), //
+                   (unsigned int)ceil(n.y / AcReal(tpb.y)), //
+                   (unsigned int)ceil(n.z / AcReal(tpb.z)));
+
+    //#ifdef AC_dt
+    //acDeviceLoadScalarUniform(device, stream, AC_dt, dt);
+    /*#else
+        (void)dt;
+        ERROR("FATAL ERROR: acDeviceAutoOptimize() or acDeviceIntegrateSubstep() was "
+              "called, but AC_dt was not defined. Either define it or call the generated "
+              "device function acDeviceKernel_<kernel_name> which does not require the "
+              "timestep to be defined.\n");
+    #endif*/
+    if (step_number == 0)
+        solve<0><<<bpg, tpb, 0, stream>>>(start, end, vba);
+    else if (step_number == 1)
+        solve<1><<<bpg, tpb, 0, stream>>>(start, end, vba);
+    else
+        solve<2><<<bpg, tpb, 0, stream>>>(start, end, vba);
+
+    ERRCHK_CUDA_KERNEL();
+
+    return AC_SUCCESS;
+}
+
+static __global__ void
+dummy_kernel(void)
+{
+    DCONST((AcIntParam)0);
+    DCONST((AcInt3Param)0);
+    DCONST((AcRealParam)0);
+    DCONST((AcReal3Param)0);
+    acComplex a = exp(AcReal(1) * acComplex(1, 1) * AcReal(1));
+    a* a;
+}
+
+AcResult
+acKernelDummy(void)
+{
+    dummy_kernel<<<1, 1>>>();
+    ERRCHK_CUDA_KERNEL_ALWAYS();
+    return AC_SUCCESS;
+}
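Taken together, the new integration entry points are driven roughly like this (a sketch; stream, start, end and vba stand for the Device-owned objects seen in device.cc above):

// Tune the file-static rk3_tpb once, then reuse it for every substep.
acKernelAutoOptimizeIntegration(start, end, vba);
for (int step = 0; step < 3; ++step) {
    // In the full pipeline, boundary conditions are applied between substeps.
    acKernelIntegrateSubstep(stream, step, start, end, vba);
}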
diff --git a/src/core/kernels/integration.cuh b/src/core/kernels/integration.cuh
index f27e5f9..b103b46 100644
--- a/src/core/kernels/integration.cuh
+++ b/src/core/kernels/integration.cuh
@@ -25,146 +25,12 @@
  *
  */
 #pragma once
-#include "src/core/math_utils.h"
-#include <assert.h>
+AcResult
+acKernelDummy(void);
 
-static_assert(NUM_VTXBUF_HANDLES > 0, "ERROR: At least one uniform ScalarField must be declared.");
+AcResult
+acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferArray vba);
 
-static __device__ constexpr int
-IDX(const int i)
-{
-    return i;
-}
-
-static __device__ __forceinline__ int
-IDX(const int i, const int j, const int k)
-{
-    return DEVICE_VTXBUF_IDX(i, j, k);
-}
-
-static __device__ __forceinline__ int
-IDX(const int3 idx)
-{
-    return DEVICE_VTXBUF_IDX(idx.x, idx.y, idx.z);
-}
-
-#define make_int3(a, b, c)                                                                         \
-    (int3) { (int)a, (int)b, (int)c }
-
-template <int step_number>
-static __device__ __forceinline__ AcReal
-rk3_integrate(const AcReal state_previous, const AcReal state_current, const AcReal rate_of_change,
-              const AcReal dt)
-{
-    // Williamson (1980)
-    const AcReal alpha[] = {0, AcReal(.0), AcReal(-5. / 9.), AcReal(-153. / 128.)};
-    const AcReal beta[]  = {0, AcReal(1. / 3.), AcReal(15. / 16.), AcReal(8. / 15.)};
-
-    // Note the indexing: +1 to avoid an unnecessary warning about "out-of-bounds"
-    // access (when accessing beta[step_number-1] even when step_number >= 1)
-    switch (step_number) {
-    case 0:
-        return state_current + beta[step_number + 1] * rate_of_change * dt;
-    case 1: // Fallthrough
-    case 2:
-        return state_current +
-               beta[step_number + 1] * (alpha[step_number + 1] * (AcReal(1.) / beta[step_number]) *
-                                            (state_current - state_previous) +
-                                        rate_of_change * dt);
-    default:
-        return NAN;
-    }
-}
-
-template <int step_number>
-static __device__ __forceinline__ AcReal3
-rk3_integrate(const AcReal3 state_previous, const AcReal3 state_current,
-              const AcReal3 rate_of_change, const AcReal dt)
-{
-    return (AcReal3){
-        rk3_integrate<step_number>(state_previous.x, state_current.x, rate_of_change.x, dt),
-        rk3_integrate<step_number>(state_previous.y, state_current.y, rate_of_change.y, dt),
-        rk3_integrate<step_number>(state_previous.z, state_current.z, rate_of_change.z, dt)};
-}
-
-#define rk3(state_previous, state_current, rate_of_change, dt)                                     \
-    rk3_integrate<step_number>(state_previous, value(state_current), rate_of_change, dt)
-
-static __device__ void
-write(AcReal* __restrict__ out[], const int handle, const int idx, const AcReal value)
-{
-    out[handle][idx] = value;
-}
-
-static __device__ __forceinline__ void
-write(AcReal* __restrict__ out[], const int3 vec, const int idx, const AcReal3 value)
-{
-    write(out, vec.x, idx, value.x);
-    write(out, vec.y, idx, value.y);
-    write(out, vec.z, idx, value.z);
-}
-
-static __device__ __forceinline__ AcReal
-read_out(const int idx, AcReal* __restrict__ field[], const int handle)
-{
-    return field[handle][idx];
-}
-
-static __device__ __forceinline__ AcReal3
-read_out(const int idx, AcReal* __restrict__ field[], const int3 handle)
-{
-    return (AcReal3){read_out(idx, field, handle.x), read_out(idx, field, handle.y),
-                     read_out(idx, field, handle.z)};
-}
-
-#define WRITE_OUT(handle, value) (write(buffer.out, handle, idx, value))
-#define READ(handle) (read_data(vertexIdx, globalVertexIdx, buffer.in, handle))
-#define READ_OUT(handle) (read_out(idx, buffer.out, handle))
-
-#define GEN_PREPROCESSED_PARAM_BOILERPLATE const int3 &vertexIdx, const int3 &globalVertexIdx
-#define GEN_KERNEL_PARAM_BOILERPLATE const int3 start, const int3 end, VertexBufferArray buffer
-
-#define GEN_KERNEL_BUILTIN_VARIABLES_BOILERPLATE()                                                 \
-    const int3 vertexIdx       = (int3){threadIdx.x + blockIdx.x * blockDim.x + start.x,           \
-                                        threadIdx.y + blockIdx.y * blockDim.y + start.y,           \
-                                        threadIdx.z + blockIdx.z * blockDim.z + start.z};          \
-    const int3 globalVertexIdx = (int3){d_multigpu_offset.x + vertexIdx.x,                         \
-                                        d_multigpu_offset.y + vertexIdx.y,                         \
-                                        d_multigpu_offset.z + vertexIdx.z};                        \
-    (void)globalVertexIdx;                                                                         \
-    if (vertexIdx.x >= end.x || vertexIdx.y >= end.y || vertexIdx.z >= end.z)                      \
-        return;                                                                                    \
-                                                                                                   \
-    assert(vertexIdx.x < DCONST(AC_nx_max) && vertexIdx.y < DCONST(AC_ny_max) &&                   \
-           vertexIdx.z < DCONST(AC_nz_max));                                                       \
-                                                                                                   \
-    assert(vertexIdx.x >= DCONST(AC_nx_min) && vertexIdx.y >= DCONST(AC_ny_min) &&                 \
-           vertexIdx.z >= DCONST(AC_nz_min));                                                      \
-                                                                                                   \
-    const int idx = IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z);
-
-// clang-format off
-#define GEN_DEVICE_FUNC_HOOK(identifier)                                                           \
-    template <int step_number>                                                                     \
-    AcResult acDeviceKernel_##identifier(const Device device, const Stream stream,                 \
-                                         const int3 start, const int3 end)                         \
-    {                                                                                              \
-        cudaSetDevice(device->id);                                                                 \
-                                                                                                   \
-        const dim3 tpb(32, 1, 4);                                                                  \
-                                                                                                   \
-        const int3 n = end - start;                                                                \
-        const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)),                                    \
-                       (unsigned int)ceil(n.y / AcReal(tpb.y)),                                    \
-                       (unsigned int)ceil(n.z / AcReal(tpb.z)));                                   \
-                                                                                                   \
-        identifier<step_number>                                                                    \
-            <<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba);                   \
-        ERRCHK_CUDA_KERNEL();                                                                      \
-                                                                                                   \
-        return AC_SUCCESS;                                                                         \
-    }
-
-
-#include "user_kernels.h"
+AcResult
+acKernelIntegrateSubstep(const cudaStream_t stream, const int step_number, const int3 start, const int3 end, VertexBufferArray vba);
diff --git a/src/core/kernels/reductions.cu b/src/core/kernels/reductions.cu
new file mode 100644
index 0000000..6d497f5
--- /dev/null
+++ b/src/core/kernels/reductions.cu
@@ -0,0 +1,283 @@
+/*
+    Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae.
+
+    This file is part of Astaroth.
+
+    Astaroth is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Astaroth is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @file
+ * \brief Brief info.
+ *
+ * Detailed info.
+ *
+ */
+#include "common.cuh"
+
+#include "src/core/errchk.h"
+#include "src/core/math_utils.h" // is_power_of_two
+
+/*
+Reduction steps:
+    1 of 3: Compute the initial value (a, a*a or exp(a)*exp(a)) and put the result in scratchpad
+    2 of 3: Compute most of the reductions into a single block of data
+    3 of 3: After all results have been stored into the final block, reduce the data in the final block
+*/
+
+// Function pointer definitions
+typedef AcReal (*FilterFunc)(const AcReal&);
+typedef AcReal (*FilterFuncVec)(const AcReal&, const AcReal&, const AcReal&);
+typedef AcReal (*ReduceFunc)(const AcReal&, const AcReal&);
+
+// clang-format off
+/* Comparison funcs */
+static __device__ inline AcReal
+dmax(const AcReal& a, const AcReal& b) { return a > b ? a : b; }
+
+static __device__ inline AcReal
+dmin(const AcReal& a, const AcReal& b) { return a < b ? a : b; }
+
+static __device__ inline AcReal
+dsum(const AcReal& a, const AcReal& b) { return a + b; }
+
+/* Function used to determine the values used during reduction */
+static __device__ inline AcReal
+dvalue(const AcReal& a) { return AcReal(a); }
+
+static __device__ inline AcReal
+dsquared(const AcReal& a) { return (AcReal)(a*a); }
+
+static __device__ inline AcReal
+dexp_squared(const AcReal& a) { return exp(a)*exp(a); }
+
+static __device__ inline AcReal
+dlength_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return sqrt(a*a + b*b + c*c); }
+
+static __device__ inline AcReal
+dsquared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return dsquared(a) + dsquared(b) + dsquared(c); }
+
+static __device__ inline AcReal
+dexp_squared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return dexp_squared(a) + dexp_squared(b) + dexp_squared(c); }
+// clang-format on
+
+#include <assert.h>
+template <FilterFunc filter>
+__global__ void
+kernel_filter(const __restrict__ AcReal* src, const int3 start, const int3 end, AcReal* dst)
+{
+    const int3 src_idx = (int3){start.x + threadIdx.x + blockIdx.x * blockDim.x,
+                                start.y + threadIdx.y + blockIdx.y * blockDim.y,
+                                start.z + threadIdx.z + blockIdx.z * blockDim.z};
+
+    const int nx = end.x - start.x;
+    const int ny = end.y - start.y;
+    const int nz = end.z - start.z;
+    (void)nz; // Suppressed unused variable warning when not compiling with debug flags
+
+    const int3 dst_idx = (int3){threadIdx.x + blockIdx.x * blockDim.x,
+                                threadIdx.y + blockIdx.y * blockDim.y,
+                                threadIdx.z + blockIdx.z * blockDim.z};
+
+    assert(src_idx.x < DCONST(AC_nx_max) && src_idx.y < DCONST(AC_ny_max) &&
+           src_idx.z < DCONST(AC_nz_max));
+    assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
+    assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);
+
+    dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(src[IDX(src_idx)]);
+}
+
+template <FilterFuncVec filter>
+__global__ void
+kernel_filter_vec(const __restrict__ AcReal* src0, const __restrict__ AcReal* src1,
+                  const __restrict__ AcReal* src2, const int3 start, const int3 end, AcReal* dst)
+{
+    const int3 src_idx = (int3){start.x + threadIdx.x + blockIdx.x * blockDim.x,
+                                start.y + threadIdx.y + blockIdx.y * blockDim.y,
+                                start.z + threadIdx.z + blockIdx.z * blockDim.z};
+
+    const int nx = end.x - start.x;
+    const int ny = end.y - start.y;
+    const int nz = end.z - start.z;
+    (void)nz; // Suppressed unused variable warning when not compiling with debug flags
+
+    const int3 dst_idx = (int3){threadIdx.x + blockIdx.x * blockDim.x,
+                                threadIdx.y + blockIdx.y * blockDim.y,
+                                threadIdx.z + blockIdx.z * blockDim.z};
+
+    assert(src_idx.x < DCONST(AC_nx_max) && src_idx.y < DCONST(AC_ny_max) &&
+           src_idx.z < DCONST(AC_nz_max));
+    assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
+    assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);
+
+    dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(
+        src0[IDX(src_idx)], src1[IDX(src_idx)], src2[IDX(src_idx)]);
+}
+
+template <ReduceFunc reduce>
+__global__ void
+kernel_reduce(AcReal* scratchpad, const int num_elems)
+{
+    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+
+    extern __shared__ AcReal smem[];
+    if (idx < num_elems) {
+        smem[threadIdx.x] = scratchpad[idx];
+    }
+    else {
+        smem[threadIdx.x] = NAN;
+    }
+    __syncthreads();
+
+    int offset = blockDim.x / 2;
+    assert(offset % 2 == 0);
+    while (offset > 0) {
+        if (threadIdx.x < offset) {
+            smem[threadIdx.x] = reduce(smem[threadIdx.x], smem[threadIdx.x + offset]);
+        }
+        offset /= 2;
+        __syncthreads();
+    }
+
+    if (threadIdx.x == 0) {
+        scratchpad[idx] = smem[threadIdx.x];
+    }
+}
+
+template <ReduceFunc reduce>
+__global__ void
+kernel_reduce_block(const __restrict__ AcReal* scratchpad, const int num_blocks,
+                    const int block_size, AcReal* result)
+{
+    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (idx != 0) {
+        return;
+    }
+
+    AcReal res = scratchpad[0];
+    for (int i = 1; i < num_blocks; ++i) {
+        res = reduce(res, scratchpad[i * block_size]);
+    }
+    *result = res;
+}
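As a reading aid for kernel_reduce above, an illustrative trace of the shared-memory tree for blockDim.x = 8 and reduce = dsum, with inputs a0..a7 in one block:

// offset 4: smem = {a0+a4, a1+a5, a2+a6, a3+a7, ...}
// offset 2: smem = {a0+a4+a2+a6, a1+a5+a3+a7, ...}
// offset 1: smem[0] = a0 + a1 + ... + a7
// Thread 0 then writes smem[0] back to scratchpad[blockIdx.x * blockDim.x];
// those per-block slots are what kernel_reduce_block folds into *result.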
+
+AcReal
+acKernelReduceScal(const cudaStream_t stream, const ReductionType rtype, const int3& start,
+                   const int3& end, const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result)
+{
+    const unsigned nx        = end.x - start.x;
+    const unsigned ny        = end.y - start.y;
+    const unsigned nz        = end.z - start.z;
+    const unsigned num_elems = nx * ny * nz;
+
+    const dim3 tpb_filter(32, 4, 1);
+    const dim3 bpg_filter((unsigned int)ceil(nx / AcReal(tpb_filter.x)),
+                          (unsigned int)ceil(ny / AcReal(tpb_filter.y)),
+                          (unsigned int)ceil(nz / AcReal(tpb_filter.z)));
+
+    const int tpb_reduce = 128;
+    const int bpg_reduce = num_elems / tpb_reduce;
+
+    ERRCHK(nx >= tpb_filter.x);
+    ERRCHK(ny >= tpb_filter.y);
+    ERRCHK(nz >= tpb_filter.z);
+    ERRCHK(tpb_reduce <= num_elems);
+    ERRCHK(nx * ny * nz % 2 == 0);
+
+    // clang-format off
+    if (rtype == RTYPE_MAX) {
+        kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
+        kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
+        kernel_reduce_block<dmax><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    } else if (rtype == RTYPE_MIN) {
+        kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
+        kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
+        kernel_reduce_block<dmin><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    } else if (rtype == RTYPE_RMS) {
+        kernel_filter<dsquared><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
+        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
+        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    } else if (rtype == RTYPE_RMS_EXP) {
+        kernel_filter<dexp_squared><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
+        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
+        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    } else if (rtype == RTYPE_SUM) {
+        kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
+        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
+        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    } else {
+        ERROR("Unrecognized rtype");
+    }
+    // clang-format on
+    cudaStreamSynchronize(stream);
+    AcReal result;
+    cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
+    return result;
+}
+
+AcReal
+acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, const int3& start, const int3& end,
+                  const AcReal* vtxbuf0, const AcReal* vtxbuf1, const AcReal* vtxbuf2, AcReal* scratchpad,
+                  AcReal* reduce_result)
+{
+    const unsigned nx        = end.x - start.x;
+    const unsigned ny        = end.y - start.y;
+    const unsigned nz        = end.z - start.z;
+    const unsigned num_elems = nx * ny * nz;
+
+    const dim3 tpb_filter(32, 4, 1);
+    const dim3 bpg_filter((unsigned int)ceil(nx / AcReal(tpb_filter.x)),
+                          (unsigned int)ceil(ny / AcReal(tpb_filter.y)),
+                          (unsigned int)ceil(nz / AcReal(tpb_filter.z)));
+
+    const int tpb_reduce = 128;
+    const int bpg_reduce = num_elems / tpb_reduce;
+
+    ERRCHK(nx >= tpb_filter.x);
+    ERRCHK(ny >= tpb_filter.y);
+    ERRCHK(nz >= tpb_filter.z);
+    ERRCHK(tpb_reduce <= num_elems);
+    ERRCHK(nx * ny * nz % 2 == 0);
+
+    // clang-format off
+    if (rtype == RTYPE_MAX) {
+        kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
+        kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
+        kernel_reduce_block<dmax><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    } else if (rtype == RTYPE_MIN) {
+        kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
+        kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
+        kernel_reduce_block<dmin><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    } else if (rtype == RTYPE_RMS) {
+        kernel_filter_vec<dsquared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
+        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
+        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    } else if (rtype == RTYPE_RMS_EXP) {
+        kernel_filter_vec<dexp_squared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
+        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
+        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    } else if (rtype == RTYPE_SUM) {
+        kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
+        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
+        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    } else {
+        ERROR("Unrecognized rtype");
+    }
+    // clang-format on
+
+    cudaStreamSynchronize(stream);
+    AcReal result;
+    cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
+    return result;
+}
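Finally, a sketch of calling the new reduction entry point directly (hypothetical handle name VTXBUF_LNRHO; stream, vba, scratchpad and d_result stand in for the Device-owned resources that acDeviceReduceScal wires up in device.cc):

// Illustrative standalone use: RMS of one scalar field over the
// computational domain [start, end).
const AcReal rms = acKernelReduceScal(stream, RTYPE_RMS, start, end,
                                      vba.in[VTXBUF_LNRHO], scratchpad, d_result);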
diff --git a/src/core/kernels/reductions.cuh b/src/core/kernels/reductions.cuh
index 4e7537c..410fcdf 100644
--- a/src/core/kernels/reductions.cuh
+++ b/src/core/kernels/reductions.cuh
@@ -25,258 +25,13 @@
  *
  */
 #pragma once
+#include <cuda_runtime_api.h>
+
-#include "src/core/math_utils.h" // is_power_of_two
+AcReal
+acKernelReduceScal(const cudaStream_t stream, const ReductionType rtype, const int3& start,
+                   const int3& end, const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result);
 
-/*
-Reduction steps:
-    1 of 3: Compute the initial value (a, a*a or exp(a)*exp(a)) and put the result in scratchpad
-    2 of 3: Compute most of the reductions into a single block of data
-    3 of 3: After all results have been stored into the final block, reduce the data in the final block
-*/
-
-// Function pointer definitions
-typedef AcReal (*FilterFunc)(const AcReal&);
-typedef AcReal (*FilterFuncVec)(const AcReal&, const AcReal&, const AcReal&);
-typedef AcReal (*ReduceFunc)(const AcReal&, const AcReal&);
-
-// clang-format off
-/* Comparison funcs */
-static __device__ inline AcReal
-dmax(const AcReal& a, const AcReal& b) { return a > b ? a : b; }
-
-static __device__ inline AcReal
-dmin(const AcReal& a, const AcReal& b) { return a < b ? a : b; }
-
-static __device__ inline AcReal
-dsum(const AcReal& a, const AcReal& b) { return a + b; }
-
-/* Function used to determine the values used during reduction */
-static __device__ inline AcReal
-dvalue(const AcReal& a) { return AcReal(a); }
-
-static __device__ inline AcReal
-dsquared(const AcReal& a) { return (AcReal)(a*a); }
-
-static __device__ inline AcReal
-dexp_squared(const AcReal& a) { return exp(a)*exp(a); }
-
-static __device__ inline AcReal
-dlength_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return sqrt(a*a + b*b + c*c); }
-
-static __device__ inline AcReal
-dsquared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return dsquared(a) + dsquared(b) + dsquared(c); }
-
-static __device__ inline AcReal
-dexp_squared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return dexp_squared(a) + dexp_squared(b) + dexp_squared(c); }
-// clang-format on
-
-#include <assert.h>
-template <FilterFunc filter>
-__global__ void
-kernel_filter(const __restrict__ AcReal* src, const int3 start, const int3 end, AcReal* dst)
-{
-    const int3 src_idx = (int3){start.x + threadIdx.x + blockIdx.x * blockDim.x,
-                                start.y + threadIdx.y + blockIdx.y * blockDim.y,
-                                start.z + threadIdx.z + blockIdx.z * blockDim.z};
-
-    const int nx = end.x - start.x;
-    const int ny = end.y - start.y;
-    const int nz = end.z - start.z;
-    (void)nz; // Suppressed unused variable warning when not compiling with debug flags
-
-    const int3 dst_idx = (int3){threadIdx.x + blockIdx.x * blockDim.x,
-                                threadIdx.y + blockIdx.y * blockDim.y,
-                                threadIdx.z + blockIdx.z * blockDim.z};
-
-    assert(src_idx.x < DCONST(AC_nx_max) && src_idx.y < DCONST(AC_ny_max) &&
-           src_idx.z < DCONST(AC_nz_max));
-    assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
-    assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);
-
-    dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(src[IDX(src_idx)]);
-}
-
-template <FilterFuncVec filter>
-__global__ void
-kernel_filter_vec(const __restrict__ AcReal* src0, const __restrict__ AcReal* src1,
-                  const __restrict__ AcReal* src2, const int3 start, const int3 end, AcReal* dst)
-{
-    const int3 src_idx = (int3){start.x + threadIdx.x + blockIdx.x * blockDim.x,
-                                start.y + threadIdx.y + blockIdx.y * blockDim.y,
-                                start.z + threadIdx.z + blockIdx.z * blockDim.z};
-
-    const int nx = end.x - start.x;
-    const int ny = end.y - start.y;
-    const int nz = end.z - start.z;
-    (void)nz; // Suppressed unused variable warning when not compiling with debug flags
-
-    const int3 dst_idx = (int3){threadIdx.x + blockIdx.x * blockDim.x,
-                                threadIdx.y + blockIdx.y * blockDim.y,
-                                threadIdx.z + blockIdx.z * blockDim.z};
-
-    assert(src_idx.x < DCONST(AC_nx_max) && src_idx.y < DCONST(AC_ny_max) &&
-           src_idx.z < DCONST(AC_nz_max));
-    assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
-    assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);
-
-    dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(
-        src0[IDX(src_idx)], src1[IDX(src_idx)], src2[IDX(src_idx)]);
-}
-
-template <ReduceFunc reduce>
-__global__ void
-kernel_reduce(AcReal* scratchpad, const int num_elems)
-{
-    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
-
-    extern __shared__ AcReal smem[];
-    if (idx < num_elems) {
-        smem[threadIdx.x] = scratchpad[idx];
-    }
-    else {
-        smem[threadIdx.x] = NAN;
-    }
-    __syncthreads();
-
-    int offset = blockDim.x / 2;
-    assert(offset % 2 == 0);
-    while (offset > 0) {
-        if (threadIdx.x < offset) {
-            smem[threadIdx.x] = reduce(smem[threadIdx.x], smem[threadIdx.x + offset]);
-        }
-        offset /= 2;
-        __syncthreads();
-    }
-
-    if (threadIdx.x == 0) {
-        scratchpad[idx] = smem[threadIdx.x];
-    }
-}
-
-template <ReduceFunc reduce>
-__global__ void
-kernel_reduce_block(const __restrict__ AcReal* scratchpad, const int num_blocks,
-                    const int block_size, AcReal* result)
-{
-    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
-    if (idx != 0) {
-        return;
-    }
-
-    AcReal res = scratchpad[0];
-    for (int i = 1; i < num_blocks; ++i) {
-        res = reduce(res, scratchpad[i * block_size]);
-    }
-    *result = res;
-}
-
-static AcReal
-reduce_scal(const cudaStream_t stream, const ReductionType rtype, const int3& start,
-            const int3& end, const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result)
-{
-    const unsigned nx        = end.x - start.x;
-    const unsigned ny        = end.y - start.y;
-    const unsigned nz        = end.z - start.z;
-    const unsigned num_elems = nx * ny * nz;
-
-    const dim3 tpb_filter(32, 4, 1);
-    const dim3 bpg_filter((unsigned int)ceil(nx / AcReal(tpb_filter.x)),
-                          (unsigned int)ceil(ny / AcReal(tpb_filter.y)),
-                          (unsigned int)ceil(nz / AcReal(tpb_filter.z)));
-
-    const int tpb_reduce = 128;
-    const int bpg_reduce = num_elems / tpb_reduce;
-
-    ERRCHK(nx >= tpb_filter.x);
-    ERRCHK(ny >= tpb_filter.y);
-    ERRCHK(nz >= tpb_filter.z);
-    ERRCHK(tpb_reduce <= num_elems);
-    ERRCHK(nx * ny * nz % 2 == 0);
-
-    // clang-format off
-    if (rtype == RTYPE_MAX) {
-        kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
-        kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dmax><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else if (rtype == RTYPE_MIN) {
-        kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
-        kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dmin><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else if (rtype == RTYPE_RMS) {
-        kernel_filter<dsquared><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
-        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else if (rtype == RTYPE_RMS_EXP) {
-        kernel_filter<dexp_squared><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
-        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else if (rtype == RTYPE_SUM) {
-        kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
-        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else {
-        ERROR("Unrecognized rtype");
-    }
-    // clang-format on
-    cudaStreamSynchronize(stream);
-    AcReal result;
-    cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
-    return result;
-}
-
-static AcReal
-reduce_vec(const cudaStream_t stream, const ReductionType rtype, const int3& start, const int3& end,
+AcReal
+acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, const int3& start, const int3& end,
                  const AcReal* vtxbuf0, const AcReal* vtxbuf1, const AcReal* vtxbuf2, AcReal* scratchpad,
-           AcReal* reduce_result)
-{
-    const unsigned nx        = end.x - start.x;
-    const unsigned ny        = end.y - start.y;
-    const unsigned nz        = end.z - start.z;
-    const unsigned num_elems = nx * ny * nz;
-
-    const dim3 tpb_filter(32, 4, 1);
-    const dim3 bpg_filter((unsigned int)ceil(nx / AcReal(tpb_filter.x)),
-                          (unsigned int)ceil(ny / AcReal(tpb_filter.y)),
-                          (unsigned int)ceil(nz / AcReal(tpb_filter.z)));
-
-    const int tpb_reduce = 128;
-    const int bpg_reduce = num_elems / tpb_reduce;
-
-    ERRCHK(nx >= tpb_filter.x);
-    ERRCHK(ny >= tpb_filter.y);
-    ERRCHK(nz >= tpb_filter.z);
-    ERRCHK(tpb_reduce <= num_elems);
-    ERRCHK(nx * ny * nz % 2 == 0);
-
-    // clang-format off
-    if (rtype == RTYPE_MAX) {
-        kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
-        kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dmax><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else if (rtype == RTYPE_MIN) {
-        kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
-        kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dmin><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else if (rtype == RTYPE_RMS) {
-        kernel_filter_vec<dsquared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
-        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else if (rtype == RTYPE_RMS_EXP) {
-        kernel_filter_vec<dexp_squared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
-        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else if (rtype == RTYPE_SUM) {
-        kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
-        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else {
-        ERROR("Unrecognized rtype");
-    }
-    // clang-format on
-
-    cudaStreamSynchronize(stream);
-    AcReal result;
-    cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
-    return result;
-}
+                  AcReal* reduce_result);
diff --git a/src/core/node.cu b/src/core/node.cc
similarity index 100%
rename from src/core/node.cu
rename to src/core/node.cc