The previous attempt (dsl_feature_completeness_2019-08-23) to enable arbitrary kernel functions was a failure: we get significant performance loss (25-100%) if step_number is not passed as a template parameter to the integration kernel. Apparently the CUDA compiler cannot perform some optimizations if there is an if/else construct in a performance-critical part which cannot be evaluated at compile time. This branch keeps step_number as a template parameter but takes the rest of the user parameters as uniforms (dt is no longer passed as a function parameter but as a uniform with the DSL instead).

This commit is contained in:
jpekkila
2019-08-27 17:36:33 +03:00
parent 9fa39b66fc
commit 20138263f4
4 changed files with 21 additions and 11 deletions

View File

@@ -15,6 +15,7 @@ uniform int AC_bin_steps;
uniform int AC_bc_type; uniform int AC_bc_type;
// Real params // Real params
uniform Scalar AC_dt;
// Spacing // Spacing
uniform Scalar AC_dsx; uniform Scalar AC_dsx;
uniform Scalar AC_dsy; uniform Scalar AC_dsy;

View File

@@ -294,8 +294,9 @@ out ScalarField out_tt(VTXBUF_TEMPERATURE);
#endif #endif
Kernel void Kernel void
solve(Scalar dt) solve()
{ {
Scalar dt = AC_dt;
out_lnrho = rk3(out_lnrho, lnrho, continuity(uu, lnrho), dt); out_lnrho = rk3(out_lnrho, lnrho, continuity(uu, lnrho), dt);
#if LMAGNETIC #if LMAGNETIC

View File

@@ -62,9 +62,9 @@ static const char* translation_table[TRANSLATION_TABLE_SIZE] = {
[MATRIX] = "AcMatrix", [MATRIX] = "AcMatrix",
[SCALARFIELD] = "AcReal", [SCALARFIELD] = "AcReal",
// Type qualifiers // Type qualifiers
[KERNEL] = "template <int step_number> static " [KERNEL] = "template <int step_number> static __global__",
"__global__", //__launch_bounds__(RK_THREADBLOCK_SIZE, //__launch_bounds__(RK_THREADBLOCK_SIZE,
// RK_LAUNCH_BOUND_MIN_BLOCKS), // RK_LAUNCH_BOUND_MIN_BLOCKS),
[PREPROCESSED] = "static __device__ " [PREPROCESSED] = "static __device__ "
"__forceinline__", "__forceinline__",
[CONSTANT] = "const", [CONSTANT] = "const",
@@ -318,9 +318,15 @@ traverse(const ASTNode* node)
inside_kernel = true; inside_kernel = true;
// Kernel parameter boilerplate // Kernel parameter boilerplate
const char* kernel_parameter_boilerplate = "GEN_KERNEL_PARAM_BOILERPLATE, "; const char* kernel_parameter_boilerplate = "GEN_KERNEL_PARAM_BOILERPLATE";
if (inside_kernel && node->type == NODE_FUNCTION_PARAMETER_DECLARATION) if (inside_kernel && node->type == NODE_FUNCTION_PARAMETER_DECLARATION) {
printf("%s ", kernel_parameter_boilerplate); printf("%s", kernel_parameter_boilerplate);
if (node->lhs != NULL) {
printf("Compilation error: function parameters for Kernel functions not allowed!\n");
exit(EXIT_FAILURE);
}
}
// Kernel builtin variables boilerplate (read input/output arrays and setup // Kernel builtin variables boilerplate (read input/output arrays and setup
// indices) // indices)

View File

@@ -303,8 +303,9 @@ acDeviceAutoOptimize(const Device device)
cudaEventRecord(tstart); // ---------------------------------------- Timing start cudaEventRecord(tstart); // ---------------------------------------- Timing start
acDeviceLoadScalarConstant(device, STREAM_DEFAULT, AC_dt, FLT_EPSILON);
for (int i = 0; i < num_iterations; ++i) for (int i = 0; i < num_iterations; ++i)
solve<2><<<bpg, tpb>>>(start, end, device->vba, FLT_EPSILON); solve<2><<<bpg, tpb>>>(start, end, device->vba);
cudaEventRecord(tstop); // ----------------------------------------- Timing end cudaEventRecord(tstop); // ----------------------------------------- Timing end
cudaEventSynchronize(tstop); cudaEventSynchronize(tstop);
@@ -600,12 +601,13 @@ acDeviceIntegrateSubstep(const Device device, const Stream stream, const int ste
(unsigned int)ceil(n.y / AcReal(tpb.y)), // (unsigned int)ceil(n.y / AcReal(tpb.y)), //
(unsigned int)ceil(n.z / AcReal(tpb.z))); (unsigned int)ceil(n.z / AcReal(tpb.z)));
acDeviceLoadScalarConstant(device, stream, AC_dt, dt);
if (step_number == 0) if (step_number == 0)
solve<0><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba, dt); solve<0><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba);
else if (step_number == 1) else if (step_number == 1)
solve<1><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba, dt); solve<1><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba);
else else
solve<2><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba, dt); solve<2><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba);
ERRCHK_CUDA_KERNEL(); ERRCHK_CUDA_KERNEL();