Fixed an out-of-bounds error with auto-optimization (introduced in the last few commits)
This commit is contained in:
@@ -47,7 +47,7 @@ endif ()
|
|||||||
|
|
||||||
## Create and link the library
|
## Create and link the library
|
||||||
#set(CMAKE_POSITION_INDEPENDENT_CODE ON) # fpic for shared libraries
|
#set(CMAKE_POSITION_INDEPENDENT_CODE ON) # fpic for shared libraries
|
||||||
cuda_add_library(astaroth_core STATIC astaroth.cc device.cc node.cc kernels/boundconds.cu kernels/integration.cu kernels/reductions.cu)
|
cuda_add_library(astaroth_core STATIC astaroth.cc node.cc device.cc kernels/boundconds.cu kernels/integration.cu kernels/reductions.cu)
|
||||||
target_include_directories(astaroth_core PRIVATE . ${CUDA_INCLUDE_DIRS})
|
target_include_directories(astaroth_core PRIVATE . ${CUDA_INCLUDE_DIRS})
|
||||||
target_link_libraries(astaroth_core ${CUDA_LIBRARIES} m)
|
target_link_libraries(astaroth_core ${CUDA_LIBRARIES} m)
|
||||||
target_compile_definitions(astaroth_core PRIVATE AC_USE_CUDA_RUNTIME_API)
|
target_compile_definitions(astaroth_core PRIVATE AC_USE_CUDA_RUNTIME_API)
|
||||||
|
@@ -43,14 +43,16 @@ struct device_s {
|
|||||||
AcReal* reduce_scratchpad;
|
AcReal* reduce_scratchpad;
|
||||||
AcReal* reduce_result;
|
AcReal* reduce_result;
|
||||||
|
|
||||||
#if AC_MPI_ENABLED
|
/*
|
||||||
// Declare memory for buffers needed for packed data transfers here
|
#if AC_MPI_ENABLED
|
||||||
AcReal* inner[2];
|
// Declare memory for buffers needed for packed data transfers here
|
||||||
AcReal* outer[2];
|
AcReal* inner[2];
|
||||||
|
AcReal* outer[2];
|
||||||
|
|
||||||
AcReal* inner_host[2];
|
AcReal* inner_host[2];
|
||||||
AcReal* outer_host[2];
|
AcReal* outer_host[2];
|
||||||
#endif
|
#endif
|
||||||
|
*/
|
||||||
};
|
};
|
||||||
|
|
||||||
#include "kernels/boundconds.cuh"
|
#include "kernels/boundconds.cuh"
|
||||||
@@ -109,19 +111,20 @@ acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device_hand
|
|||||||
acVertexBufferCompdomainSizeBytes(device_config)));
|
acVertexBufferCompdomainSizeBytes(device_config)));
|
||||||
ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->reduce_result, sizeof(AcReal)));
|
ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->reduce_result, sizeof(AcReal)));
|
||||||
|
|
||||||
#if AC_MPI_ENABLED
|
/*
|
||||||
// Allocate data required for packed transfers here (cudaMalloc)
|
#if AC_MPI_ENABLED
|
||||||
const size_t block_size_bytes = device_config.int_params[AC_mx] *
|
// Allocate data required for packed transfers here (cudaMalloc)
|
||||||
device_config.int_params[AC_my] * NGHOST * NUM_VTXBUF_HANDLES *
|
const size_t block_size_bytes = device_config.int_params[AC_mx] *
|
||||||
sizeof(AcReal);
|
device_config.int_params[AC_my] * NGHOST *
|
||||||
for (int i = 0; i < 2; ++i) {
|
NUM_VTXBUF_HANDLES * sizeof(AcReal); for (int i = 0; i < 2; ++i) {
|
||||||
ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->inner[i], block_size_bytes));
|
ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->inner[i], block_size_bytes));
|
||||||
ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->outer[i], block_size_bytes));
|
ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->outer[i], block_size_bytes));
|
||||||
|
|
||||||
ERRCHK_CUDA_ALWAYS(cudaMallocHost(&device->inner_host[i], block_size_bytes));
|
ERRCHK_CUDA_ALWAYS(cudaMallocHost(&device->inner_host[i], block_size_bytes));
|
||||||
ERRCHK_CUDA_ALWAYS(cudaMallocHost(&device->outer_host[i], block_size_bytes));
|
ERRCHK_CUDA_ALWAYS(cudaMallocHost(&device->outer_host[i], block_size_bytes));
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
*/
|
||||||
|
|
||||||
// Device constants
|
// Device constants
|
||||||
acDeviceLoadMeshInfo(device, STREAM_DEFAULT, device_config);
|
acDeviceLoadMeshInfo(device, STREAM_DEFAULT, device_config);
|
||||||
@@ -156,16 +159,18 @@ acDeviceDestroy(Device device)
|
|||||||
cudaFree(device->reduce_scratchpad);
|
cudaFree(device->reduce_scratchpad);
|
||||||
cudaFree(device->reduce_result);
|
cudaFree(device->reduce_result);
|
||||||
|
|
||||||
#if AC_MPI_ENABLED
|
/*
|
||||||
// Free data required for packed tranfers here (cudaFree)
|
#if AC_MPI_ENABLED
|
||||||
for (int i = 0; i < 2; ++i) {
|
// Free data required for packed tranfers here (cudaFree)
|
||||||
cudaFree(device->inner[i]);
|
for (int i = 0; i < 2; ++i) {
|
||||||
cudaFree(device->outer[i]);
|
cudaFree(device->inner[i]);
|
||||||
|
cudaFree(device->outer[i]);
|
||||||
|
|
||||||
cudaFreeHost(device->inner_host[i]);
|
cudaFreeHost(device->inner_host[i]);
|
||||||
cudaFreeHost(device->outer_host[i]);
|
cudaFreeHost(device->outer_host[i]);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
*/
|
||||||
|
|
||||||
// Concurrency
|
// Concurrency
|
||||||
for (int i = 0; i < NUM_STREAMS; ++i) {
|
for (int i = 0; i < NUM_STREAMS; ++i) {
|
||||||
@@ -241,10 +246,16 @@ AcResult
|
|||||||
acDeviceAutoOptimize(const Device device)
|
acDeviceAutoOptimize(const Device device)
|
||||||
{
|
{
|
||||||
cudaSetDevice(device->id);
|
cudaSetDevice(device->id);
|
||||||
const int3 start = (int3){NGHOST, NGHOST, NGHOST};
|
const int3 start = (int3){
|
||||||
const int3 end = (int3){device->local_config.int_params[AC_mx], //
|
device->local_config.int_params[AC_nx_min],
|
||||||
device->local_config.int_params[AC_my], //
|
device->local_config.int_params[AC_ny_min],
|
||||||
device->local_config.int_params[AC_mz]};
|
device->local_config.int_params[AC_nz_min],
|
||||||
|
};
|
||||||
|
const int3 end = (int3){
|
||||||
|
device->local_config.int_params[AC_nx_max],
|
||||||
|
device->local_config.int_params[AC_ny_max],
|
||||||
|
device->local_config.int_params[AC_nz_max],
|
||||||
|
};
|
||||||
return acKernelAutoOptimizeIntegration(start, end, device->vba);
|
return acKernelAutoOptimizeIntegration(start, end, device->vba);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -592,6 +603,8 @@ acDeviceReduceVec(const Device device, const Stream stream, const ReductionType
|
|||||||
*/
|
*/
|
||||||
#include <mpi.h>
|
#include <mpi.h>
|
||||||
|
|
||||||
|
#include <assert.h>
|
||||||
|
|
||||||
static int
|
static int
|
||||||
mod(const int a, const int b)
|
mod(const int a, const int b)
|
||||||
{
|
{
|
||||||
|
@@ -24,6 +24,7 @@
|
|||||||
* Detailed info.
|
* Detailed info.
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
|
#include "boundconds.cuh"
|
||||||
#include "common.cuh"
|
#include "common.cuh"
|
||||||
|
|
||||||
#include "src/core/errchk.h"
|
#include "src/core/errchk.h"
|
||||||
@@ -76,7 +77,7 @@ kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vtxbuf)
|
|||||||
}
|
}
|
||||||
|
|
||||||
AcResult
|
AcResult
|
||||||
acKernelPeriodicBoundconds(const cudaStream_t stream, const int3& start, const int3& end,
|
acKernelPeriodicBoundconds(const cudaStream_t stream, const int3 start, const int3 end,
|
||||||
AcReal* vtxbuf)
|
AcReal* vtxbuf)
|
||||||
{
|
{
|
||||||
const dim3 tpb(8, 2, 8);
|
const dim3 tpb(8, 2, 8);
|
||||||
|
@@ -28,5 +28,13 @@
|
|||||||
|
|
||||||
#include "astaroth.h"
|
#include "astaroth.h"
|
||||||
|
|
||||||
AcResult acKernelPeriodicBoundconds(const cudaStream_t stream, const int3& start, const int3& end,
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
AcResult acKernelPeriodicBoundconds(const cudaStream_t stream, const int3 start, const int3 end,
|
||||||
AcReal* vtxbuf);
|
AcReal* vtxbuf);
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
} // extern "C"
|
||||||
|
#endif
|
||||||
|
@@ -25,6 +25,7 @@
|
|||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
#include "common.cuh"
|
#include "common.cuh"
|
||||||
|
#include "integration.cuh"
|
||||||
|
|
||||||
#include "src/core/errchk.h"
|
#include "src/core/errchk.h"
|
||||||
#include "src/core/math_utils.h"
|
#include "src/core/math_utils.h"
|
||||||
@@ -283,7 +284,7 @@ dummy_kernel(void)
|
|||||||
a* a;
|
a* a;
|
||||||
}
|
}
|
||||||
|
|
||||||
AcReal
|
AcResult
|
||||||
acKernelDummy(void)
|
acKernelDummy(void)
|
||||||
{
|
{
|
||||||
dummy_kernel<<<1, 1>>>();
|
dummy_kernel<<<1, 1>>>();
|
||||||
|
@@ -26,9 +26,17 @@
|
|||||||
*/
|
*/
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
AcResult acKernelDummy(void);
|
AcResult acKernelDummy(void);
|
||||||
|
|
||||||
AcResult acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferArray vba);
|
AcResult acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferArray vba);
|
||||||
|
|
||||||
AcResult acKernelIntegrateSubstep(const cudaStream_t stream, const int step_number,
|
AcResult acKernelIntegrateSubstep(const cudaStream_t stream, const int step_number,
|
||||||
const int3 start, const int3 end, VertexBufferArray vba);
|
const int3 start, const int3 end, VertexBufferArray vba);
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
} // extern "C"
|
||||||
|
#endif
|
||||||
|
@@ -25,6 +25,7 @@
|
|||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
#include "common.cuh"
|
#include "common.cuh"
|
||||||
|
#include "reductions.cuh"
|
||||||
|
|
||||||
#include "src/core/errchk.h"
|
#include "src/core/errchk.h"
|
||||||
#include "src/core/math_utils.h" // is_power_of_two
|
#include "src/core/math_utils.h" // is_power_of_two
|
||||||
@@ -173,8 +174,8 @@ kernel_reduce_block(const __restrict__ AcReal* scratchpad, const int num_blocks,
|
|||||||
}
|
}
|
||||||
|
|
||||||
AcReal
|
AcReal
|
||||||
acKernelReduceScal(const cudaStream_t stream, const ReductionType rtype, const int3& start,
|
acKernelReduceScal(const cudaStream_t stream, const ReductionType rtype, const int3 start,
|
||||||
const int3& end, const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result)
|
const int3 end, const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result)
|
||||||
{
|
{
|
||||||
const unsigned nx = end.x - start.x;
|
const unsigned nx = end.x - start.x;
|
||||||
const unsigned ny = end.y - start.y;
|
const unsigned ny = end.y - start.y;
|
||||||
@@ -227,8 +228,8 @@ acKernelReduceScal(const cudaStream_t stream, const ReductionType rtype, const i
|
|||||||
}
|
}
|
||||||
|
|
||||||
AcReal
|
AcReal
|
||||||
acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, const int3& start,
|
acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, const int3 start,
|
||||||
const int3& end, const AcReal* vtxbuf0, const AcReal* vtxbuf1,
|
const int3 end, const AcReal* vtxbuf0, const AcReal* vtxbuf1,
|
||||||
const AcReal* vtxbuf2, AcReal* scratchpad, AcReal* reduce_result)
|
const AcReal* vtxbuf2, AcReal* scratchpad, AcReal* reduce_result)
|
||||||
{
|
{
|
||||||
const unsigned nx = end.x - start.x;
|
const unsigned nx = end.x - start.x;
|
||||||
|
@@ -27,10 +27,18 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
#include <astaroth.h>
|
#include <astaroth.h>
|
||||||
|
|
||||||
AcReal acKernelReduceScal(const cudaStream_t stream, const ReductionType rtype, const int3& start,
|
#ifdef __cplusplus
|
||||||
const int3& end, const AcReal* vtxbuf, AcReal* scratchpad,
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
AcReal acKernelReduceScal(const cudaStream_t stream, const ReductionType rtype, const int3 start,
|
||||||
|
const int3 end, const AcReal* vtxbuf, AcReal* scratchpad,
|
||||||
AcReal* reduce_result);
|
AcReal* reduce_result);
|
||||||
|
|
||||||
AcReal acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, const int3& start,
|
AcReal acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, const int3 start,
|
||||||
const int3& end, const AcReal* vtxbuf0, const AcReal* vtxbuf1,
|
const int3 end, const AcReal* vtxbuf0, const AcReal* vtxbuf1,
|
||||||
const AcReal* vtxbuf2, AcReal* scratchpad, AcReal* reduce_result);
|
const AcReal* vtxbuf2, AcReal* scratchpad, AcReal* reduce_result);
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
} // extern "C"
|
||||||
|
#endif
|
||||||
|
Reference in New Issue
Block a user