From 20138263f41bbd1a5ce377cb91a8f7bd061d0f22 Mon Sep 17 00:00:00 2001 From: jpekkila Date: Tue, 27 Aug 2019 17:36:33 +0300 Subject: [PATCH] The previous attempt (dsl_feature_completeness_2019-08-23) to enable arbitrary kernel functions was a failure: we get significant performance loss (25-100%) if step_number is not passed as a template parameter to the integration kernel. Apparently the CUDA compiler cannot perform some optimizations if there is an if/else construct in a performance-critical part which cannot be evaluated at compile time. This branch keeps step_number as a template parameter but takes the rest of the user parameters as uniforms (dt is no longer passed as a function parameter but as a uniform with the DSL instead). --- acc/mhd_solver/stencil_definition.sdh | 1 + acc/mhd_solver/stencil_process.sps | 3 ++- acc/src/code_generator.c | 18 ++++++++++++------ src/core/device.cu | 10 ++++++---- 4 files changed, 21 insertions(+), 11 deletions(-) diff --git a/acc/mhd_solver/stencil_definition.sdh b/acc/mhd_solver/stencil_definition.sdh index 31667fc..c4324e5 100644 --- a/acc/mhd_solver/stencil_definition.sdh +++ b/acc/mhd_solver/stencil_definition.sdh @@ -15,6 +15,7 @@ uniform int AC_bin_steps; uniform int AC_bc_type; // Real params +uniform Scalar AC_dt; // Spacing uniform Scalar AC_dsx; uniform Scalar AC_dsy; diff --git a/acc/mhd_solver/stencil_process.sps b/acc/mhd_solver/stencil_process.sps index e605b9f..0db62e0 100644 --- a/acc/mhd_solver/stencil_process.sps +++ b/acc/mhd_solver/stencil_process.sps @@ -294,8 +294,9 @@ out ScalarField out_tt(VTXBUF_TEMPERATURE); #endif Kernel void -solve(Scalar dt) +solve() { + Scalar dt = AC_dt; out_lnrho = rk3(out_lnrho, lnrho, continuity(uu, lnrho), dt); #if LMAGNETIC diff --git a/acc/src/code_generator.c b/acc/src/code_generator.c index e560a70..b486a6d 100644 --- a/acc/src/code_generator.c +++ b/acc/src/code_generator.c @@ -62,9 +62,9 @@ static const char* translation_table[TRANSLATION_TABLE_SIZE] = { [MATRIX] = 
"AcMatrix", [SCALARFIELD] = "AcReal", // Type qualifiers - [KERNEL] = "template static " - "__global__", //__launch_bounds__(RK_THREADBLOCK_SIZE, - // RK_LAUNCH_BOUND_MIN_BLOCKS), + [KERNEL] = "template static __global__", + //__launch_bounds__(RK_THREADBLOCK_SIZE, + // RK_LAUNCH_BOUND_MIN_BLOCKS), [PREPROCESSED] = "static __device__ " "__forceinline__", [CONSTANT] = "const", @@ -318,9 +318,15 @@ traverse(const ASTNode* node) inside_kernel = true; // Kernel parameter boilerplate - const char* kernel_parameter_boilerplate = "GEN_KERNEL_PARAM_BOILERPLATE, "; - if (inside_kernel && node->type == NODE_FUNCTION_PARAMETER_DECLARATION) - printf("%s ", kernel_parameter_boilerplate); + const char* kernel_parameter_boilerplate = "GEN_KERNEL_PARAM_BOILERPLATE"; + if (inside_kernel && node->type == NODE_FUNCTION_PARAMETER_DECLARATION) { + printf("%s", kernel_parameter_boilerplate); + + if (node->lhs != NULL) { + printf("Compilation error: function parameters for Kernel functions not allowed!\n"); + exit(EXIT_FAILURE); + } + } // Kernel builtin variables boilerplate (read input/output arrays and setup // indices) diff --git a/src/core/device.cu b/src/core/device.cu index 15f0c1e..12d4087 100644 --- a/src/core/device.cu +++ b/src/core/device.cu @@ -303,8 +303,9 @@ acDeviceAutoOptimize(const Device device) cudaEventRecord(tstart); // ---------------------------------------- Timing start + acDeviceLoadScalarConstant(device, STREAM_DEFAULT, AC_dt, FLT_EPSILON); for (int i = 0; i < num_iterations; ++i) - solve<2><<>>(start, end, device->vba, FLT_EPSILON); + solve<2><<>>(start, end, device->vba); cudaEventRecord(tstop); // ----------------------------------------- Timing end cudaEventSynchronize(tstop); @@ -600,12 +601,13 @@ acDeviceIntegrateSubstep(const Device device, const Stream stream, const int ste (unsigned int)ceil(n.y / AcReal(tpb.y)), // (unsigned int)ceil(n.z / AcReal(tpb.z))); + acDeviceLoadScalarConstant(device, stream, AC_dt, dt); if (step_number == 0) - 
solve<0><<streams[stream]>>>(start, end, device->vba, dt); + solve<0><<streams[stream]>>>(start, end, device->vba); else if (step_number == 1) - solve<1><<streams[stream]>>>(start, end, device->vba, dt); + solve<1><<streams[stream]>>>(start, end, device->vba); else - solve<2><<streams[stream]>>>(start, end, device->vba, dt); + solve<2><<streams[stream]>>>(start, end, device->vba); ERRCHK_CUDA_KERNEL();