Fixed an out-of-bounds error with auto-optimization (introduced in the last few commits)

This commit is contained in:
jpekkila
2019-12-03 16:04:44 +02:00
parent 7e4212ddd9
commit 316d44b843
8 changed files with 84 additions and 44 deletions

View File

@@ -47,7 +47,7 @@ endif ()
## Create and link the library ## Create and link the library
#set(CMAKE_POSITION_INDEPENDENT_CODE ON) # fpic for shared libraries #set(CMAKE_POSITION_INDEPENDENT_CODE ON) # fpic for shared libraries
cuda_add_library(astaroth_core STATIC astaroth.cc device.cc node.cc kernels/boundconds.cu kernels/integration.cu kernels/reductions.cu) cuda_add_library(astaroth_core STATIC astaroth.cc node.cc device.cc kernels/boundconds.cu kernels/integration.cu kernels/reductions.cu)
target_include_directories(astaroth_core PRIVATE . ${CUDA_INCLUDE_DIRS}) target_include_directories(astaroth_core PRIVATE . ${CUDA_INCLUDE_DIRS})
target_link_libraries(astaroth_core ${CUDA_LIBRARIES} m) target_link_libraries(astaroth_core ${CUDA_LIBRARIES} m)
target_compile_definitions(astaroth_core PRIVATE AC_USE_CUDA_RUNTIME_API) target_compile_definitions(astaroth_core PRIVATE AC_USE_CUDA_RUNTIME_API)

View File

@@ -43,14 +43,16 @@ struct device_s {
AcReal* reduce_scratchpad; AcReal* reduce_scratchpad;
AcReal* reduce_result; AcReal* reduce_result;
#if AC_MPI_ENABLED /*
// Declare memory for buffers needed for packed data transfers here #if AC_MPI_ENABLED
AcReal* inner[2]; // Declare memory for buffers needed for packed data transfers here
AcReal* outer[2]; AcReal* inner[2];
AcReal* outer[2];
AcReal* inner_host[2]; AcReal* inner_host[2];
AcReal* outer_host[2]; AcReal* outer_host[2];
#endif #endif
*/
}; };
#include "kernels/boundconds.cuh" #include "kernels/boundconds.cuh"
@@ -109,19 +111,20 @@ acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device_hand
acVertexBufferCompdomainSizeBytes(device_config))); acVertexBufferCompdomainSizeBytes(device_config)));
ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->reduce_result, sizeof(AcReal))); ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->reduce_result, sizeof(AcReal)));
#if AC_MPI_ENABLED /*
// Allocate data required for packed transfers here (cudaMalloc) #if AC_MPI_ENABLED
const size_t block_size_bytes = device_config.int_params[AC_mx] * // Allocate data required for packed transfers here (cudaMalloc)
device_config.int_params[AC_my] * NGHOST * NUM_VTXBUF_HANDLES * const size_t block_size_bytes = device_config.int_params[AC_mx] *
sizeof(AcReal); device_config.int_params[AC_my] * NGHOST *
for (int i = 0; i < 2; ++i) { NUM_VTXBUF_HANDLES * sizeof(AcReal); for (int i = 0; i < 2; ++i) {
ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->inner[i], block_size_bytes)); ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->inner[i], block_size_bytes));
ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->outer[i], block_size_bytes)); ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->outer[i], block_size_bytes));
ERRCHK_CUDA_ALWAYS(cudaMallocHost(&device->inner_host[i], block_size_bytes)); ERRCHK_CUDA_ALWAYS(cudaMallocHost(&device->inner_host[i], block_size_bytes));
ERRCHK_CUDA_ALWAYS(cudaMallocHost(&device->outer_host[i], block_size_bytes)); ERRCHK_CUDA_ALWAYS(cudaMallocHost(&device->outer_host[i], block_size_bytes));
} }
#endif #endif
*/
// Device constants // Device constants
acDeviceLoadMeshInfo(device, STREAM_DEFAULT, device_config); acDeviceLoadMeshInfo(device, STREAM_DEFAULT, device_config);
@@ -156,16 +159,18 @@ acDeviceDestroy(Device device)
cudaFree(device->reduce_scratchpad); cudaFree(device->reduce_scratchpad);
cudaFree(device->reduce_result); cudaFree(device->reduce_result);
#if AC_MPI_ENABLED /*
// Free data required for packed tranfers here (cudaFree) #if AC_MPI_ENABLED
for (int i = 0; i < 2; ++i) { // Free data required for packed tranfers here (cudaFree)
cudaFree(device->inner[i]); for (int i = 0; i < 2; ++i) {
cudaFree(device->outer[i]); cudaFree(device->inner[i]);
cudaFree(device->outer[i]);
cudaFreeHost(device->inner_host[i]); cudaFreeHost(device->inner_host[i]);
cudaFreeHost(device->outer_host[i]); cudaFreeHost(device->outer_host[i]);
} }
#endif #endif
*/
// Concurrency // Concurrency
for (int i = 0; i < NUM_STREAMS; ++i) { for (int i = 0; i < NUM_STREAMS; ++i) {
@@ -241,10 +246,16 @@ AcResult
acDeviceAutoOptimize(const Device device) acDeviceAutoOptimize(const Device device)
{ {
cudaSetDevice(device->id); cudaSetDevice(device->id);
const int3 start = (int3){NGHOST, NGHOST, NGHOST}; const int3 start = (int3){
const int3 end = (int3){device->local_config.int_params[AC_mx], // device->local_config.int_params[AC_nx_min],
device->local_config.int_params[AC_my], // device->local_config.int_params[AC_ny_min],
device->local_config.int_params[AC_mz]}; device->local_config.int_params[AC_nz_min],
};
const int3 end = (int3){
device->local_config.int_params[AC_nx_max],
device->local_config.int_params[AC_ny_max],
device->local_config.int_params[AC_nz_max],
};
return acKernelAutoOptimizeIntegration(start, end, device->vba); return acKernelAutoOptimizeIntegration(start, end, device->vba);
} }
@@ -592,6 +603,8 @@ acDeviceReduceVec(const Device device, const Stream stream, const ReductionType
*/ */
#include <mpi.h> #include <mpi.h>
#include <assert.h>
static int static int
mod(const int a, const int b) mod(const int a, const int b)
{ {

View File

@@ -24,6 +24,7 @@
* Detailed info. * Detailed info.
* *
*/ */
#include "boundconds.cuh"
#include "common.cuh" #include "common.cuh"
#include "src/core/errchk.h" #include "src/core/errchk.h"
@@ -76,7 +77,7 @@ kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vtxbuf)
} }
AcResult AcResult
acKernelPeriodicBoundconds(const cudaStream_t stream, const int3& start, const int3& end, acKernelPeriodicBoundconds(const cudaStream_t stream, const int3 start, const int3 end,
AcReal* vtxbuf) AcReal* vtxbuf)
{ {
const dim3 tpb(8, 2, 8); const dim3 tpb(8, 2, 8);

View File

@@ -28,5 +28,13 @@
#include "astaroth.h" #include "astaroth.h"
AcResult acKernelPeriodicBoundconds(const cudaStream_t stream, const int3& start, const int3& end, #ifdef __cplusplus
extern "C" {
#endif
AcResult acKernelPeriodicBoundconds(const cudaStream_t stream, const int3 start, const int3 end,
AcReal* vtxbuf); AcReal* vtxbuf);
#ifdef __cplusplus
} // extern "C"
#endif

View File

@@ -25,6 +25,7 @@
* *
*/ */
#include "common.cuh" #include "common.cuh"
#include "integration.cuh"
#include "src/core/errchk.h" #include "src/core/errchk.h"
#include "src/core/math_utils.h" #include "src/core/math_utils.h"
@@ -283,7 +284,7 @@ dummy_kernel(void)
a* a; a* a;
} }
AcReal AcResult
acKernelDummy(void) acKernelDummy(void)
{ {
dummy_kernel<<<1, 1>>>(); dummy_kernel<<<1, 1>>>();

View File

@@ -26,9 +26,17 @@
*/ */
#pragma once #pragma once
#ifdef __cplusplus
extern "C" {
#endif
AcResult acKernelDummy(void); AcResult acKernelDummy(void);
AcResult acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferArray vba); AcResult acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferArray vba);
AcResult acKernelIntegrateSubstep(const cudaStream_t stream, const int step_number, AcResult acKernelIntegrateSubstep(const cudaStream_t stream, const int step_number,
const int3 start, const int3 end, VertexBufferArray vba); const int3 start, const int3 end, VertexBufferArray vba);
#ifdef __cplusplus
} // extern "C"
#endif

View File

@@ -25,6 +25,7 @@
* *
*/ */
#include "common.cuh" #include "common.cuh"
#include "reductions.cuh"
#include "src/core/errchk.h" #include "src/core/errchk.h"
#include "src/core/math_utils.h" // is_power_of_two #include "src/core/math_utils.h" // is_power_of_two
@@ -173,8 +174,8 @@ kernel_reduce_block(const __restrict__ AcReal* scratchpad, const int num_blocks,
} }
AcReal AcReal
acKernelReduceScal(const cudaStream_t stream, const ReductionType rtype, const int3& start, acKernelReduceScal(const cudaStream_t stream, const ReductionType rtype, const int3 start,
const int3& end, const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result) const int3 end, const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result)
{ {
const unsigned nx = end.x - start.x; const unsigned nx = end.x - start.x;
const unsigned ny = end.y - start.y; const unsigned ny = end.y - start.y;
@@ -227,8 +228,8 @@ acKernelReduceScal(const cudaStream_t stream, const ReductionType rtype, const i
} }
AcReal AcReal
acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, const int3& start, acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, const int3 start,
const int3& end, const AcReal* vtxbuf0, const AcReal* vtxbuf1, const int3 end, const AcReal* vtxbuf0, const AcReal* vtxbuf1,
const AcReal* vtxbuf2, AcReal* scratchpad, AcReal* reduce_result) const AcReal* vtxbuf2, AcReal* scratchpad, AcReal* reduce_result)
{ {
const unsigned nx = end.x - start.x; const unsigned nx = end.x - start.x;

View File

@@ -27,10 +27,18 @@
#pragma once #pragma once
#include <astaroth.h> #include <astaroth.h>
AcReal acKernelReduceScal(const cudaStream_t stream, const ReductionType rtype, const int3& start, #ifdef __cplusplus
const int3& end, const AcReal* vtxbuf, AcReal* scratchpad, extern "C" {
#endif
AcReal acKernelReduceScal(const cudaStream_t stream, const ReductionType rtype, const int3 start,
const int3 end, const AcReal* vtxbuf, AcReal* scratchpad,
AcReal* reduce_result); AcReal* reduce_result);
AcReal acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, const int3& start, AcReal acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, const int3 start,
const int3& end, const AcReal* vtxbuf0, const AcReal* vtxbuf1, const int3 end, const AcReal* vtxbuf0, const AcReal* vtxbuf1,
const AcReal* vtxbuf2, AcReal* scratchpad, AcReal* reduce_result); const AcReal* vtxbuf2, AcReal* scratchpad, AcReal* reduce_result);
#ifdef __cplusplus
} // extern "C"
#endif