From 20138263f41bbd1a5ce377cb91a8f7bd061d0f22 Mon Sep 17 00:00:00 2001 From: jpekkila Date: Tue, 27 Aug 2019 17:36:33 +0300 Subject: [PATCH] The previous attempt (dsl_feature_completeness_2019-08-23) to enable arbitrary kernel functions was a failure: we get significant performance loss (25-100%) if step_number is not passed as a template parameter to the integration kernel. Apparently the CUDA compiler cannot perform some optimizations if there is an if/else construct in a performance-critical part which cannot be evaluated at compile time. This branch keeps step_number as a template parameter but takes the rest of the user parameters as uniforms (dt is no longer passed as a function parameter but as a uniform with the DSL instead). --- acc/mhd_solver/stencil_definition.sdh | 1 + acc/mhd_solver/stencil_process.sps | 3 ++- acc/src/code_generator.c | 18 ++++++++++++------ src/core/device.cu | 10 ++++++---- 4 files changed, 21 insertions(+), 11 deletions(-) diff --git a/acc/mhd_solver/stencil_definition.sdh b/acc/mhd_solver/stencil_definition.sdh index 31667fc..c4324e5 100644 --- a/acc/mhd_solver/stencil_definition.sdh +++ b/acc/mhd_solver/stencil_definition.sdh @@ -15,6 +15,7 @@ uniform int AC_bin_steps; uniform int AC_bc_type; // Real params +uniform Scalar AC_dt; // Spacing uniform Scalar AC_dsx; uniform Scalar AC_dsy; diff --git a/acc/mhd_solver/stencil_process.sps b/acc/mhd_solver/stencil_process.sps index e605b9f..0db62e0 100644 --- a/acc/mhd_solver/stencil_process.sps +++ b/acc/mhd_solver/stencil_process.sps @@ -294,8 +294,9 @@ out ScalarField out_tt(VTXBUF_TEMPERATURE); #endif Kernel void -solve(Scalar dt) +solve() { + Scalar dt = AC_dt; out_lnrho = rk3(out_lnrho, lnrho, continuity(uu, lnrho), dt); #if LMAGNETIC diff --git a/acc/src/code_generator.c b/acc/src/code_generator.c index e560a70..b486a6d 100644 --- a/acc/src/code_generator.c +++ b/acc/src/code_generator.c @@ -62,9 +62,9 @@ static const char* translation_table[TRANSLATION_TABLE_SIZE] = { [MATRIX] = 
"AcMatrix", [SCALARFIELD] = "AcReal", // Type qualifiers - [KERNEL] = "template static " - "__global__", //__launch_bounds__(RK_THREADBLOCK_SIZE, - // RK_LAUNCH_BOUND_MIN_BLOCKS), + [KERNEL] = "template static __global__", + //__launch_bounds__(RK_THREADBLOCK_SIZE, + // RK_LAUNCH_BOUND_MIN_BLOCKS), [PREPROCESSED] = "static __device__ " "__forceinline__", [CONSTANT] = "const", @@ -318,9 +318,15 @@ traverse(const ASTNode* node) inside_kernel = true; // Kernel parameter boilerplate - const char* kernel_parameter_boilerplate = "GEN_KERNEL_PARAM_BOILERPLATE, "; - if (inside_kernel && node->type == NODE_FUNCTION_PARAMETER_DECLARATION) - printf("%s ", kernel_parameter_boilerplate); + const char* kernel_parameter_boilerplate = "GEN_KERNEL_PARAM_BOILERPLATE"; + if (inside_kernel && node->type == NODE_FUNCTION_PARAMETER_DECLARATION) { + printf("%s", kernel_parameter_boilerplate); + + if (node->lhs != NULL) { + printf("Compilation error: function parameters for Kernel functions not allowed!\n"); + exit(EXIT_FAILURE); + } + } // Kernel builtin variables boilerplate (read input/output arrays and setup // indices) diff --git a/src/core/device.cu b/src/core/device.cu index 15f0c1e..12d4087 100644 --- a/src/core/device.cu +++ b/src/core/device.cu @@ -303,8 +303,9 @@ acDeviceAutoOptimize(const Device device) cudaEventRecord(tstart); // ---------------------------------------- Timing start + acDeviceLoadScalarConstant(device, STREAM_DEFAULT, AC_dt, FLT_EPSILON); for (int i = 0; i < num_iterations; ++i) - solve<2><<>>(start, end, device->vba, FLT_EPSILON); + solve<2><<>>(start, end, device->vba); cudaEventRecord(tstop); // ----------------------------------------- Timing end cudaEventSynchronize(tstop); @@ -600,12 +601,13 @@ acDeviceIntegrateSubstep(const Device device, const Stream stream, const int ste (unsigned int)ceil(n.y / AcReal(tpb.y)), // (unsigned int)ceil(n.z / AcReal(tpb.z))); + acDeviceLoadScalarConstant(device, stream, AC_dt, dt); if (step_number == 0) - 
solve<0><<streams[stream]>>>(start, end, device->vba, dt); + solve<0><<streams[stream]>>>(start, end, device->vba); else if (step_number == 1) - solve<1><<streams[stream]>>>(start, end, device->vba, dt); + solve<1><<streams[stream]>>>(start, end, device->vba); else - solve<2><<streams[stream]>>>(start, end, device->vba, dt); + solve<2><<streams[stream]>>>(start, end, device->vba); ERRCHK_CUDA_KERNEL();