Added a function acAutoOptimize to the interface and removed rk3_step_async in kernels.cuh (moved into rkStep)
@@ -610,3 +610,9 @@ acLoadDeviceConstant(const AcRealParam param, const AcReal value)
     }
     return AC_SUCCESS;
 }
+
+AcResult
+acAutoOptimize(void)
+{
+    return autoOptimize(devices[0]);
+}
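Note on the new entry point: acAutoOptimize simply forwards to the device-layer autoOptimize on the first device. A minimal host-side usage sketch follows; the acInit/acQuit calls and the mesh-info setup are assumptions for illustration, not part of this commit.

    // Hypothetical call sequence (sketch): acInit/acQuit and the AcMeshInfo setup
    // are assumed to exist elsewhere in the library, not added by this commit.
    AcMeshInfo info;
    // ... fill in mesh dimensions ...
    acInit(info);     // assumed initialization entry point
    acAutoOptimize(); // new interface function introduced here
    // ... integrate ...
    acQuit();         // assumed teardown entry point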
@@ -109,10 +109,10 @@ printDeviceInfo(const Device device)
     printf(" Local L1 cache supported: %d\n", props.localL1CacheSupported);
     printf(" Global L1 cache supported: %d\n", props.globalL1CacheSupported);
     printf(" L2 size: %d KiB\n", props.l2CacheSize / (1024));
-    //MV: props.totalConstMem and props.sharedMemPerBlock cause assembler error
-    //MV: while compiling in TIARA gp cluster. Therefore commeted out.
+    // MV: props.totalConstMem and props.sharedMemPerBlock cause assembler error
+    // MV: while compiling in TIARA gp cluster. Therefore commeted out.
     //!! printf(" Total const mem: %ld KiB\n", props.totalConstMem / (1024));
     //!! printf(" Shared mem per block: %ld KiB\n", props.sharedMemPerBlock / (1024));
     printf(" Other\n");
     printf(" Warp size: %d\n", props.warpSize);
     // printf(" Single to double perf. ratio: %dx\n",
@@ -270,7 +270,23 @@ rkStep(const Device device, const StreamType stream_type, const int step_number,
        const int3& end, const AcReal dt)
 {
     cudaSetDevice(device->id);
-    rk3_step_async(device->streams[stream_type], step_number, start, end, dt, &device->vba);
+
+    const dim3 tpb(32, 1, 4);
+
+    const int3 n = end - start;
+    const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), //
+                   (unsigned int)ceil(n.y / AcReal(tpb.y)), //
+                   (unsigned int)ceil(n.z / AcReal(tpb.z)));
+
+    if (step_number == 0)
+        solve<0><<<bpg, tpb, 0, device->streams[stream_type]>>>(start, end, device->vba, dt);
+    else if (step_number == 1)
+        solve<1><<<bpg, tpb, 0, device->streams[stream_type]>>>(start, end, device->vba, dt);
+    else
+        solve<2><<<bpg, tpb, 0, device->streams[stream_type]>>>(start, end, device->vba, dt);
+
+    ERRCHK_CUDA_KERNEL();
+
     return AC_SUCCESS;
 }
 
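The launch grid above is computed with a floating-point ceil over the subdomain extent n = end - start. An equivalent integer-only formulation (a sketch, not part of the diff) avoids the round trip through AcReal:

    // Integer ceiling division; equivalent to ceil(n.x / AcReal(tpb.x)) for n > 0.
    const dim3 bpg((unsigned int)((n.x + tpb.x - 1) / tpb.x),
                   (unsigned int)((n.y + tpb.y - 1) / tpb.y),
                   (unsigned int)((n.z + tpb.z - 1) / tpb.z));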
@@ -395,6 +411,14 @@ loadGlobalGrid(const Device device, const Grid grid)
     return AC_SUCCESS;
 }
 
+AcResult
+autoOptimize(const Device device)
+{
+    cudaSetDevice(device->id);
+
+    return AC_SUCCESS;
+}
+
 #if PACKED_DATA_TRANSFERS
 // Functions for calling packed data transfers
 #endif
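autoOptimize is currently a stub that only selects the device and returns. One possible direction, sketched here purely as an assumption about future work rather than anything in this commit, would be to time the solve kernel over a few candidate thread-block shapes with CUDA events and keep the fastest:

    // Sketch only: the candidate list, the benchmark domain (start/end/dt), and
    // where the winning tpb would be stored are assumptions, not part of this commit.
    static AcResult
    autoOptimizeSketch(const Device device, const int3 start, const int3 end, const AcReal dt)
    {
        cudaSetDevice(device->id);

        const dim3 candidates[] = {dim3(32, 1, 4), dim3(32, 4, 1), dim3(64, 2, 2)};
        dim3 best_tpb = candidates[0];
        float best_ms = 1e30f;

        cudaEvent_t tstart, tstop;
        cudaEventCreate(&tstart);
        cudaEventCreate(&tstop);

        const int3 n = end - start;
        for (size_t i = 0; i < sizeof(candidates) / sizeof(candidates[0]); ++i) {
            const dim3 tpb = candidates[i];
            const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)),
                           (unsigned int)ceil(n.y / AcReal(tpb.y)),
                           (unsigned int)ceil(n.z / AcReal(tpb.z)));

            cudaEventRecord(tstart);
            solve<0><<<bpg, tpb>>>(start, end, device->vba, dt); // timed launch
            cudaEventRecord(tstop);
            cudaEventSynchronize(tstop);

            float ms = 0.0f;
            cudaEventElapsedTime(&ms, tstart, tstop);
            if (ms < best_ms) {
                best_ms  = ms;
                best_tpb = tpb;
            }
        }

        cudaEventDestroy(tstart);
        cudaEventDestroy(tstop);
        // best_tpb would need to be stored where rkStep can read it (assumption).
        (void)best_tpb;
        return AC_SUCCESS;
    }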
@@ -99,6 +99,9 @@ AcResult loadDeviceConstant(const Device device, const AcRealParam param, const
 /** */
 AcResult loadGlobalGrid(const Device device, const Grid grid);
 
+/** */
+AcResult autoOptimize(const Device device);
+
 // #define PACKED_DATA_TRANSFERS (1) %JP: placeholder for optimized ghost zone packing and transfers
 #if PACKED_DATA_TRANSFERS
 // Declarations used for packed data transfers
@@ -707,30 +707,6 @@ read_out(const int idx, AcReal* __restrict__ field[], const int3 handle)
  * =============================================================================
  */
 
-AcResult
-rk3_step_async(const cudaStream_t stream, const int& step_number, const int3& start,
-               const int3& end, const AcReal dt, VertexBufferArray* buffer)
-{
-    const dim3 tpb(32, 1, 4);
-
-    const int nx = end.x - start.x;
-    const int ny = end.y - start.y;
-    const int nz = end.z - start.z;
-
-    const dim3 bpg((unsigned int)ceil(nx / AcReal(tpb.x)), (unsigned int)ceil(ny / AcReal(tpb.y)),
-                   (unsigned int)ceil(nz / AcReal(tpb.z)));
-
-    if (step_number == 0)
-        solve<0><<<bpg, tpb, 0, stream>>>(start, end, *buffer, dt);
-    else if (step_number == 1)
-        solve<1><<<bpg, tpb, 0, stream>>>(start, end, *buffer, dt);
-    else
-        solve<2><<<bpg, tpb, 0, stream>>>(start, end, *buffer, dt);
-
-    ERRCHK_CUDA_KERNEL();
-    return AC_SUCCESS;
-}
-
 ////////////////REDUCE///////////////////////////
 #include "src/core/math_utils.h" // is_power_of_two
 