The previous attempt (dsl_feature_completeness_2019-08-23) to enable arbitrary kernel functions was a failure: we see a significant performance loss (25-100%) if step_number is not passed as a template parameter to the integration kernel. Apparently the CUDA compiler cannot perform certain optimizations when a performance-critical code path contains an if/else construct that cannot be evaluated at compile time. This branch keeps step_number as a template parameter but passes the rest of the user parameters as uniforms: dt is no longer passed as a function parameter, but as a uniform declared in the DSL.
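For illustration, a minimal sketch of the effect described above; the kernel and names here are hypothetical, not the actual solve kernel. With a template parameter, each instantiation contains only one code path, whereas a runtime step_number keeps the branch inside the hot path:

    // Hypothetical sketch: compile-time branching via a template parameter.
    // nvcc resolves "step_number == 0" per instantiation and eliminates the
    // dead path entirely.
    template <int step_number>
    static __global__ void
    integrate_sketch(float* out, const float* in, const float dt, const int n)
    {
        const int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i >= n)
            return;

        if (step_number == 0)
            out[i] = in[i] * dt;  // first substep: overwrite
        else
            out[i] += in[i] * dt; // later substeps: accumulate
    }

    // Launching instantiates one specialized kernel per substep:
    //     integrate_sketch<0><<<bpg, tpb>>>(out, in, dt, n);
    // Passing step_number as a regular int argument instead would leave the
    // if/else in the generated code for every thread.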
@@ -15,6 +15,7 @@ uniform int AC_bin_steps;
 uniform int AC_bc_type;

 // Real params
+uniform Scalar AC_dt;
 // Spacing
 uniform Scalar AC_dsx;
 uniform Scalar AC_dsy;
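The new AC_dt uniform supplies the timestep without a kernel parameter. The generated code is not shown in this commit, but a uniform presumably behaves like a device-side constant that the host updates before launch; a minimal sketch of that general mechanism, with a hypothetical symbol name:

    // Sketch only: hypothetical generated symbol, assuming AcReal == float.
    typedef float AcReal;
    __constant__ AcReal AC_dt_symbol;

    // Roughly what a host-side "load scalar constant" amounts to:
    // an asynchronous copy into constant memory on the given stream.
    static void
    load_dt_sketch(const cudaStream_t stream, const AcReal value)
    {
        cudaMemcpyToSymbolAsync(AC_dt_symbol, &value, sizeof(value), 0,
                                cudaMemcpyHostToDevice, stream);
    }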
@@ -294,8 +294,9 @@ out ScalarField out_tt(VTXBUF_TEMPERATURE);
 #endif

 Kernel void
-solve(Scalar dt)
+solve()
 {
+    Scalar dt = AC_dt;
     out_lnrho = rk3(out_lnrho, lnrho, continuity(uu, lnrho), dt);

 #if LMAGNETIC
@@ -62,9 +62,9 @@ static const char* translation_table[TRANSLATION_TABLE_SIZE] = {
     [MATRIX] = "AcMatrix",
     [SCALARFIELD] = "AcReal",
     // Type qualifiers
-    [KERNEL] = "template <int step_number> static "
-               "__global__", //__launch_bounds__(RK_THREADBLOCK_SIZE,
+    [KERNEL] = "template <int step_number> static __global__",
+               //__launch_bounds__(RK_THREADBLOCK_SIZE,
                // RK_LAUNCH_BOUND_MIN_BLOCKS),
     [PREPROCESSED] = "static __device__ "
                      "__forceinline__",
     [CONSTANT] = "const",
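Putting this table entry together with the parameter boilerplate from the next hunk, the generator presumably emits something like the following for a DSL Kernel function. The expansion of GEN_KERNEL_PARAM_BOILERPLATE is an assumption inferred from the launch sites further below, which pass (start, end, device->vba); the type names int3 and VertexBufferArray are likewise assumed:

    // Illustrative sketch of the generated signature, not actual output.
    #define GEN_KERNEL_PARAM_BOILERPLATE \
        const int3 start, const int3 end, VertexBufferArray vba

    template <int step_number>
    static __global__ void
    solve(GEN_KERNEL_PARAM_BOILERPLATE)
    {
        // generated index setup and the translated DSL body go here
    }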
@@ -318,9 +318,15 @@ traverse(const ASTNode* node)
         inside_kernel = true;

     // Kernel parameter boilerplate
-    const char* kernel_parameter_boilerplate = "GEN_KERNEL_PARAM_BOILERPLATE, ";
-    if (inside_kernel && node->type == NODE_FUNCTION_PARAMETER_DECLARATION)
-        printf("%s ", kernel_parameter_boilerplate);
+    const char* kernel_parameter_boilerplate = "GEN_KERNEL_PARAM_BOILERPLATE";
+    if (inside_kernel && node->type == NODE_FUNCTION_PARAMETER_DECLARATION) {
+        printf("%s", kernel_parameter_boilerplate);
+
+        if (node->lhs != NULL) {
+            printf("Compilation error: function parameters for Kernel functions not allowed!\n");
+            exit(EXIT_FAILURE);
+        }
+    }

     // Kernel builtin variables boilerplate (read input/output arrays and setup
     // indices)
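With this check in place, declaring a parameter on a Kernel function is rejected when the DSL is compiled. For example, the old signature from the second hunk would now fail:

    Kernel void
    solve(Scalar dt) // rejected: "Compilation error: function parameters
    {                //            for Kernel functions not allowed!"
        out_lnrho = rk3(out_lnrho, lnrho, continuity(uu, lnrho), dt);
    }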
@@ -303,8 +303,9 @@ acDeviceAutoOptimize(const Device device)

     cudaEventRecord(tstart); // ---------------------------------------- Timing start

+    acDeviceLoadScalarConstant(device, STREAM_DEFAULT, AC_dt, FLT_EPSILON);
     for (int i = 0; i < num_iterations; ++i)
-        solve<2><<<bpg, tpb>>>(start, end, device->vba, FLT_EPSILON);
+        solve<2><<<bpg, tpb>>>(start, end, device->vba);

     cudaEventRecord(tstop); // ----------------------------------------- Timing end
     cudaEventSynchronize(tstop);
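Note that the autotuner must now set AC_dt explicitly before benchmarking, since the kernel no longer receives dt as an argument. FLT_EPSILON, which was previously passed directly to the launch, presumably serves as a tiny dummy timestep that keeps the benchmark integration numerically harmless.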
@@ -600,12 +601,13 @@ acDeviceIntegrateSubstep(const Device device, const Stream stream, const int ste
                        (unsigned int)ceil(n.y / AcReal(tpb.y)), //
                        (unsigned int)ceil(n.z / AcReal(tpb.z)));

+    acDeviceLoadScalarConstant(device, stream, AC_dt, dt);
     if (step_number == 0)
-        solve<0><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba, dt);
+        solve<0><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba);
     else if (step_number == 1)
-        solve<1><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba, dt);
+        solve<1><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba);
     else
-        solve<2><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba, dt);
+        solve<2><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba);

     ERRCHK_CUDA_KERNEL();
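For library users the call-site contract is unchanged: dt still goes to acDeviceIntegrateSubstep, which now forwards it through the AC_dt uniform rather than a kernel argument. A usage sketch, assuming the truncated parameter list in the hunk header above continues with (step_number, start, end, dt):

    // Hypothetical caller: one full RK3 step as three substeps with the
    // same dt. Each call loads AC_dt on the stream before launching, so
    // ordering is preserved without extra synchronization.
    for (int step = 0; step < 3; ++step)
        acDeviceIntegrateSubstep(device, STREAM_DEFAULT, step, start, end, dt);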