diff --git a/src/core/astaroth.cu b/src/core/astaroth.cu index 326419e..a082a20 100644 --- a/src/core/astaroth.cu +++ b/src/core/astaroth.cu @@ -610,3 +610,9 @@ acLoadDeviceConstant(const AcRealParam param, const AcReal value) } return AC_SUCCESS; } + +AcResult +acAutoOptimize(void) +{ + return autoOptimize(devices[0]); +} diff --git a/src/core/device.cu b/src/core/device.cu index 11b4298..cb2c732 100644 --- a/src/core/device.cu +++ b/src/core/device.cu @@ -109,10 +109,10 @@ printDeviceInfo(const Device device) printf(" Local L1 cache supported: %d\n", props.localL1CacheSupported); printf(" Global L1 cache supported: %d\n", props.globalL1CacheSupported); printf(" L2 size: %d KiB\n", props.l2CacheSize / (1024)); -//MV: props.totalConstMem and props.sharedMemPerBlock cause assembler error -//MV: while compiling in TIARA gp cluster. Therefore commeted out. -//!! printf(" Total const mem: %ld KiB\n", props.totalConstMem / (1024)); -//!! printf(" Shared mem per block: %ld KiB\n", props.sharedMemPerBlock / (1024)); + // MV: props.totalConstMem and props.sharedMemPerBlock cause assembler error + // MV: while compiling in TIARA gp cluster. Therefore commented out. + //!! printf(" Total const mem: %ld KiB\n", props.totalConstMem / (1024)); + //!! printf(" Shared mem per block: %ld KiB\n", props.sharedMemPerBlock / (1024)); printf(" Other\n"); printf(" Warp size: %d\n", props.warpSize); // printf(" Single to double perf. 
ratio: %dx\n", @@ -270,7 +270,23 @@ rkStep(const Device device, const StreamType stream_type, const int step_number, const int3& end, const AcReal dt) { cudaSetDevice(device->id); - rk3_step_async(device->streams[stream_type], step_number, start, end, dt, &device->vba); + + const dim3 tpb(32, 1, 4); + + const int3 n = end - start; + const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), // + (unsigned int)ceil(n.y / AcReal(tpb.y)), // + (unsigned int)ceil(n.z / AcReal(tpb.z))); + + if (step_number == 0) + solve<0><<<bpg, tpb, 0, device->streams[stream_type]>>>(start, end, device->vba, dt); + else if (step_number == 1) + solve<1><<<bpg, tpb, 0, device->streams[stream_type]>>>(start, end, device->vba, dt); + else + solve<2><<<bpg, tpb, 0, device->streams[stream_type]>>>(start, end, device->vba, dt); + + ERRCHK_CUDA_KERNEL(); + return AC_SUCCESS; } @@ -395,6 +411,14 @@ loadGlobalGrid(const Device device, const Grid grid) return AC_SUCCESS; } +AcResult +autoOptimize(const Device device) +{ + cudaSetDevice(device->id); + + return AC_SUCCESS; +} + #if PACKED_DATA_TRANSFERS // Functions for calling packed data transfers #endif diff --git a/src/core/device.cuh b/src/core/device.cuh index 1e9becc..228f4e9 100644 --- a/src/core/device.cuh +++ b/src/core/device.cuh @@ -99,6 +99,9 @@ AcResult loadDeviceConstant(const Device device, const AcRealParam param, const /** */ AcResult loadGlobalGrid(const Device device, const Grid grid); +/** */ +AcResult autoOptimize(const Device device); + // #define PACKED_DATA_TRANSFERS (1) %JP: placeholder for optimized ghost zone packing and transfers #if PACKED_DATA_TRANSFERS // Declarations used for packed data transfers diff --git a/src/core/kernels/kernels.cuh b/src/core/kernels/kernels.cuh index 19f4e47..f63d198 100644 --- a/src/core/kernels/kernels.cuh +++ b/src/core/kernels/kernels.cuh @@ -707,30 +707,6 @@ read_out(const int idx, AcReal* __restrict__ field[], const int3 handle) * ============================================================================= */ -AcResult -rk3_step_async(const 
cudaStream_t stream, const int& step_number, const int3& start, - const int3& end, const AcReal dt, VertexBufferArray* buffer) -{ - const dim3 tpb(32, 1, 4); - - const int nx = end.x - start.x; - const int ny = end.y - start.y; - const int nz = end.z - start.z; - - const dim3 bpg((unsigned int)ceil(nx / AcReal(tpb.x)), (unsigned int)ceil(ny / AcReal(tpb.y)), - (unsigned int)ceil(nz / AcReal(tpb.z))); - - if (step_number == 0) - solve<0><<<bpg, tpb, 0, stream>>>(start, end, *buffer, dt); - else if (step_number == 1) - solve<1><<<bpg, tpb, 0, stream>>>(start, end, *buffer, dt); - else - solve<2><<<bpg, tpb, 0, stream>>>(start, end, *buffer, dt); - - ERRCHK_CUDA_KERNEL(); - return AC_SUCCESS; -} - ////////////////REDUCE/////////////////////////// #include "src/core/math_utils.h" // is_power_of_two