From 316d44b8436e7f4a964cbbabbb9dc2d023ab0406 Mon Sep 17 00:00:00 2001 From: jpekkila Date: Tue, 3 Dec 2019 16:04:44 +0200 Subject: [PATCH] Fixed an out-of-bounds error with auto-optimization (introduced in the last few commits) --- src/core/CMakeLists.txt | 2 +- src/core/device.cc | 77 +++++++++++++++++++------------- src/core/kernels/boundconds.cu | 3 +- src/core/kernels/boundconds.cuh | 10 ++++- src/core/kernels/integration.cu | 3 +- src/core/kernels/integration.cuh | 8 ++++ src/core/kernels/reductions.cu | 9 ++-- src/core/kernels/reductions.cuh | 16 +++++-- 8 files changed, 84 insertions(+), 44 deletions(-) diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index f716fa4..33a386f 100644 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -47,7 +47,7 @@ endif () ## Create and link the library #set(CMAKE_POSITION_INDEPENDENT_CODE ON) # fpic for shared libraries -cuda_add_library(astaroth_core STATIC astaroth.cc device.cc node.cc kernels/boundconds.cu kernels/integration.cu kernels/reductions.cu) +cuda_add_library(astaroth_core STATIC astaroth.cc node.cc device.cc kernels/boundconds.cu kernels/integration.cu kernels/reductions.cu) target_include_directories(astaroth_core PRIVATE . ${CUDA_INCLUDE_DIRS}) target_link_libraries(astaroth_core ${CUDA_LIBRARIES} m) target_compile_definitions(astaroth_core PRIVATE AC_USE_CUDA_RUNTIME_API) diff --git a/src/core/device.cc b/src/core/device.cc index beb1687..3d3077e 100644 --- a/src/core/device.cc +++ b/src/core/device.cc @@ -43,14 +43,16 @@ struct device_s { AcReal* reduce_scratchpad; AcReal* reduce_result; -#if AC_MPI_ENABLED - // Declare memory for buffers needed for packed data transfers here - AcReal* inner[2]; - AcReal* outer[2]; + /* + #if AC_MPI_ENABLED + // Declare memory for buffers needed for packed data transfers here + AcReal* inner[2]; + AcReal* outer[2]; - AcReal* inner_host[2]; - AcReal* outer_host[2]; -#endif + AcReal* inner_host[2]; + AcReal* outer_host[2]; + #endif + */ }; #include "kernels/boundconds.cuh" @@ -109,19 +111,20 @@ acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device_hand acVertexBufferCompdomainSizeBytes(device_config))); ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->reduce_result, sizeof(AcReal))); -#if AC_MPI_ENABLED - // Allocate data required for packed transfers here (cudaMalloc) - const size_t block_size_bytes = device_config.int_params[AC_mx] * - device_config.int_params[AC_my] * NGHOST * NUM_VTXBUF_HANDLES * - sizeof(AcReal); - for (int i = 0; i < 2; ++i) { - ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->inner[i], block_size_bytes)); - ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->outer[i], block_size_bytes)); + /* + #if AC_MPI_ENABLED + // Allocate data required for packed transfers here (cudaMalloc) + const size_t block_size_bytes = device_config.int_params[AC_mx] * + device_config.int_params[AC_my] * NGHOST * + NUM_VTXBUF_HANDLES * sizeof(AcReal); for (int i = 0; i < 2; ++i) { + ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->inner[i], block_size_bytes)); + ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->outer[i], block_size_bytes)); - ERRCHK_CUDA_ALWAYS(cudaMallocHost(&device->inner_host[i], block_size_bytes)); - ERRCHK_CUDA_ALWAYS(cudaMallocHost(&device->outer_host[i], block_size_bytes)); - } -#endif + ERRCHK_CUDA_ALWAYS(cudaMallocHost(&device->inner_host[i], block_size_bytes)); + ERRCHK_CUDA_ALWAYS(cudaMallocHost(&device->outer_host[i], block_size_bytes)); + } + #endif + */ // Device constants acDeviceLoadMeshInfo(device, STREAM_DEFAULT, device_config); @@ -156,16 +159,18 @@ acDeviceDestroy(Device device) cudaFree(device->reduce_scratchpad); cudaFree(device->reduce_result); -#if AC_MPI_ENABLED - // Free data required for packed tranfers here (cudaFree) - for (int i = 0; i < 2; ++i) { - cudaFree(device->inner[i]); - cudaFree(device->outer[i]); + /* + #if AC_MPI_ENABLED + // Free data required for packed tranfers here (cudaFree) + for (int i = 0; i < 2; ++i) { + cudaFree(device->inner[i]); + cudaFree(device->outer[i]); - cudaFreeHost(device->inner_host[i]); - cudaFreeHost(device->outer_host[i]); - } -#endif + cudaFreeHost(device->inner_host[i]); + cudaFreeHost(device->outer_host[i]); + } + #endif + */ // Concurrency for (int i = 0; i < NUM_STREAMS; ++i) { @@ -241,10 +246,16 @@ AcResult acDeviceAutoOptimize(const Device device) { cudaSetDevice(device->id); - const int3 start = (int3){NGHOST, NGHOST, NGHOST}; - const int3 end = (int3){device->local_config.int_params[AC_mx], // - device->local_config.int_params[AC_my], // - device->local_config.int_params[AC_mz]}; + const int3 start = (int3){ + device->local_config.int_params[AC_nx_min], + device->local_config.int_params[AC_ny_min], + device->local_config.int_params[AC_nz_min], + }; + const int3 end = (int3){ + device->local_config.int_params[AC_nx_max], + device->local_config.int_params[AC_ny_max], + device->local_config.int_params[AC_nz_max], + }; return acKernelAutoOptimizeIntegration(start, end, device->vba); } @@ -592,6 +603,8 @@ acDeviceReduceVec(const Device device, const Stream stream, const ReductionType */ #include +#include + static int mod(const int a, const int b) { diff --git a/src/core/kernels/boundconds.cu b/src/core/kernels/boundconds.cu index 23c24a2..d82b38d 100644 --- a/src/core/kernels/boundconds.cu +++ b/src/core/kernels/boundconds.cu @@ -24,6 +24,7 @@ * Detailed info. * */ +#include "boundconds.cuh" #include "common.cuh" #include "src/core/errchk.h" @@ -76,7 +77,7 @@ kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vtxbuf) } AcResult -acKernelPeriodicBoundconds(const cudaStream_t stream, const int3& start, const int3& end, +acKernelPeriodicBoundconds(const cudaStream_t stream, const int3 start, const int3 end, AcReal* vtxbuf) { const dim3 tpb(8, 2, 8); diff --git a/src/core/kernels/boundconds.cuh b/src/core/kernels/boundconds.cuh index 6a43b2c..737c582 100644 --- a/src/core/kernels/boundconds.cuh +++ b/src/core/kernels/boundconds.cuh @@ -28,5 +28,13 @@ #include "astaroth.h" -AcResult acKernelPeriodicBoundconds(const cudaStream_t stream, const int3& start, const int3& end, +#ifdef __cplusplus +extern "C" { +#endif + +AcResult acKernelPeriodicBoundconds(const cudaStream_t stream, const int3 start, const int3 end, AcReal* vtxbuf); + +#ifdef __cplusplus +} // extern "C" +#endif diff --git a/src/core/kernels/integration.cu b/src/core/kernels/integration.cu index b6bd134..e9afb58 100644 --- a/src/core/kernels/integration.cu +++ b/src/core/kernels/integration.cu @@ -25,6 +25,7 @@ * */ #include "common.cuh" +#include "integration.cuh" #include "src/core/errchk.h" #include "src/core/math_utils.h" @@ -283,7 +284,7 @@ dummy_kernel(void) a* a; } -AcReal +AcResult acKernelDummy(void) { dummy_kernel<<<1, 1>>>(); diff --git a/src/core/kernels/integration.cuh b/src/core/kernels/integration.cuh index 1889145..8f86db7 100644 --- a/src/core/kernels/integration.cuh +++ b/src/core/kernels/integration.cuh @@ -26,9 +26,17 @@ */ #pragma once +#ifdef __cplusplus +extern "C" { +#endif + AcResult acKernelDummy(void); AcResult acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferArray vba); AcResult acKernelIntegrateSubstep(const cudaStream_t stream, const int step_number, const int3 start, const int3 end, VertexBufferArray vba); + +#ifdef __cplusplus +} // extern "C" +#endif diff --git a/src/core/kernels/reductions.cu b/src/core/kernels/reductions.cu index 71c2fac..b06dbf4 100644 --- a/src/core/kernels/reductions.cu +++ b/src/core/kernels/reductions.cu @@ -25,6 +25,7 @@ * */ #include "common.cuh" +#include "reductions.cuh" #include "src/core/errchk.h" #include "src/core/math_utils.h" // is_power_of_two @@ -173,8 +174,8 @@ kernel_reduce_block(const __restrict__ AcReal* scratchpad, const int num_blocks, } AcReal -acKernelReduceScal(const cudaStream_t stream, const ReductionType rtype, const int3& start, - const int3& end, const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result) +acKernelReduceScal(const cudaStream_t stream, const ReductionType rtype, const int3 start, + const int3 end, const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result) { const unsigned nx = end.x - start.x; const unsigned ny = end.y - start.y; @@ -227,8 +228,8 @@ acKernelReduceScal(const cudaStream_t stream, const ReductionType rtype, const i } AcReal -acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, const int3& start, - const int3& end, const AcReal* vtxbuf0, const AcReal* vtxbuf1, +acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, const int3 start, + const int3 end, const AcReal* vtxbuf0, const AcReal* vtxbuf1, const AcReal* vtxbuf2, AcReal* scratchpad, AcReal* reduce_result) { const unsigned nx = end.x - start.x; diff --git a/src/core/kernels/reductions.cuh b/src/core/kernels/reductions.cuh index e095dcf..191a1b6 100644 --- a/src/core/kernels/reductions.cuh +++ b/src/core/kernels/reductions.cuh @@ -27,10 +27,18 @@ #pragma once #include -AcReal acKernelReduceScal(const cudaStream_t stream, const ReductionType rtype, const int3& start, - const int3& end, const AcReal* vtxbuf, AcReal* scratchpad, +#ifdef __cplusplus +extern "C" { +#endif + +AcReal acKernelReduceScal(const cudaStream_t stream, const ReductionType rtype, const int3 start, + const int3 end, const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result); -AcReal acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, const int3& start, - const int3& end, const AcReal* vtxbuf0, const AcReal* vtxbuf1, +AcReal acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, const int3 start, + const int3 end, const AcReal* vtxbuf0, const AcReal* vtxbuf1, const AcReal* vtxbuf2, AcReal* scratchpad, AcReal* reduce_result); + +#ifdef __cplusplus +} // extern "C" +#endif