Merged in dsl_feature_completeness_2019-08-27_V2 (pull request #7)

Dsl feature completeness 2019 08 27 V2 Approved-by: Miikka Väisälä <mvaisala@asiaa.sinica.edu.tw>
2019-09-02 02:41:13 +00:00
parent be9ec6293b 6ea02fa28e
commit c0ba08133d
6 changed files with 103 additions and 29 deletions
--- a/acc/mhd_solver/stencil_definition.sdh
+++ b/acc/mhd_solver/stencil_definition.sdh
@@ -15,6 +15,7 @@ uniform int AC_bin_steps;
 uniform int AC_bc_type;
 // Real params
 uniform Scalar AC_dt;
 // Spacing
 uniform Scalar AC_dsx;
 uniform Scalar AC_dsy;
--- a/acc/mhd_solver/stencil_process.sps
+++ b/acc/mhd_solver/stencil_process.sps
@@ -294,8 +294,9 @@ out ScalarField out_tt(VTXBUF_TEMPERATURE);
 #endif
 Kernel void
-solve(Scalar dt)
+solve()
 {
    Scalar dt = AC_dt;
    out_lnrho = rk3(out_lnrho, lnrho, continuity(uu, lnrho), dt);
 #if LMAGNETIC
--- a/acc/src/code_generator.c
+++ b/acc/src/code_generator.c
@@ -62,8 +62,8 @@ static const char* translation_table[TRANSLATION_TABLE_SIZE] = {
    [MATRIX]      = "AcMatrix",
    [SCALARFIELD] = "AcReal",
    // Type qualifiers
-    [KERNEL] = "template <int step_number>  static "
+    [KERNEL] = "template <int step_number>  static __global__",
-               "__global__", //__launch_bounds__(RK_THREADBLOCK_SIZE,
+    //__launch_bounds__(RK_THREADBLOCK_SIZE,
    // RK_LAUNCH_BOUND_MIN_BLOCKS),
    [PREPROCESSED] = "static __device__ "
                     "__forceinline__",
@@ -318,10 +318,16 @@ traverse(const ASTNode* node)
        inside_kernel = true;
    // Kernel parameter boilerplate
-    const char* kernel_parameter_boilerplate = "GEN_KERNEL_PARAM_BOILERPLATE, ";
+    const char* kernel_parameter_boilerplate = "GEN_KERNEL_PARAM_BOILERPLATE";
-    if (inside_kernel && node->type == NODE_FUNCTION_PARAMETER_DECLARATION)
+    if (inside_kernel && node->type == NODE_FUNCTION_PARAMETER_DECLARATION) {
        printf("%s", kernel_parameter_boilerplate);
        if (node->lhs != NULL) {
            printf("Compilation error: function parameters for Kernel functions not allowed!\n");
            exit(EXIT_FAILURE);
        }
    }
    // Kernel builtin variables boilerplate (read input/output arrays and setup
    // indices)
    const char* kernel_builtin_variables_boilerplate = "GEN_KERNEL_BUILTIN_VARIABLES_"
@@ -619,6 +625,17 @@ generate_header(void)
    */
 }
 static void
 generate_library_hooks(void)
 {
    for (int i = 0; i < num_symbols; ++i) {
        if (symbol_table[i].type_qualifier == KERNEL) {
            printf("GEN_DEVICE_FUNC_HOOK(%s)\n", symbol_table[i].identifier);
            // printf("GEN_NODE_FUNC_HOOK(%s)\n", symbol_table[i].identifier);
        }
    }
 }
 int
 main(int argc, char** argv)
 {
@@ -656,6 +673,8 @@ main(int argc, char** argv)
        generate_preprocessed_structures();
    else if (compilation_type == STENCIL_HEADER)
        generate_header();
    else if (compilation_type == STENCIL_PROCESS)
        generate_library_hooks();
    // print_symbol_table();
--- a/scripts/preprocess_device_files.sh
+++ b/scripts/preprocess_device_files.sh
@@ -0,0 +1 @@
 nvcc -E ../src/core/device.cu -I ../include -I ../ > preprocessed_device_files.pp
--- a/src/core/device.cu
+++ b/src/core/device.cu
@@ -39,6 +39,24 @@ typedef struct {
    AcReal* out[NUM_VTXBUF_HANDLES];
 } VertexBufferArray;
 struct device_s {
    int id;
    AcMeshInfo local_config;
    // Concurrency
    cudaStream_t streams[NUM_STREAM_TYPES];
    // Memory
    VertexBufferArray vba;
    AcReal* reduce_scratchpad;
    AcReal* reduce_result;
 #if PACKED_DATA_TRANSFERS
 // Declare memory for buffers needed for packed data transfers here
 // AcReal* data_packing_buffer;
 #endif
 };
 __constant__ AcMeshInfo d_mesh_info;
 static int __device__ __forceinline__
 DCONST(const AcIntParam param)
@@ -89,24 +107,6 @@ static dim3 rk3_tpb(32, 1, 4);
 // #include "kernels/pack_unpack.cuh"
 #endif
 struct device_s {
    int id;
    AcMeshInfo local_config;
    // Concurrency
    cudaStream_t streams[NUM_STREAM_TYPES];
    // Memory
    VertexBufferArray vba;
    AcReal* reduce_scratchpad;
    AcReal* reduce_result;
 #if PACKED_DATA_TRANSFERS
 // Declare memory for buffers needed for packed data transfers here
 // AcReal* data_packing_buffer;
 #endif
 };
 // clang-format off
 static __global__ void dummy_kernel(void) { DCONST((AcIntParam)0); DCONST((AcInt3Param)0); DCONST((AcRealParam)0); DCONST((AcReal3Param)0); }
 // clang-format on
@@ -303,8 +303,9 @@ acDeviceAutoOptimize(const Device device)
                cudaEventRecord(tstart); // ---------------------------------------- Timing start
                acDeviceLoadScalarConstant(device, STREAM_DEFAULT, AC_dt, FLT_EPSILON);
                for (int i = 0; i < num_iterations; ++i)
-                    solve<2><<<bpg, tpb>>>(start, end, device->vba, FLT_EPSILON);
+                    solve<2><<<bpg, tpb>>>(start, end, device->vba);
                cudaEventRecord(tstop); // ----------------------------------------- Timing end
                cudaEventSynchronize(tstop);
@@ -600,12 +601,13 @@ acDeviceIntegrateSubstep(const Device device, const Stream stream, const int ste
                   (unsigned int)ceil(n.y / AcReal(tpb.y)), //
                   (unsigned int)ceil(n.z / AcReal(tpb.z)));
    acDeviceLoadScalarConstant(device, stream, AC_dt, dt);
    if (step_number == 0)
-        solve<0><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba, dt);
+        solve<0><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba);
    else if (step_number == 1)
-        solve<1><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba, dt);
+        solve<1><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba);
    else
-        solve<2><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba, dt);
+        solve<2><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba);
    ERRCHK_CUDA_KERNEL();
--- a/src/core/kernels/integration.cuh
+++ b/src/core/kernels/integration.cuh
@@ -585,4 +585,54 @@ read_out(const int idx, AcReal* __restrict__ field[], const int3 handle)
                                                                                                   \
    const int idx = IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z);
 // clang-format off
 #define GEN_DEVICE_FUNC_HOOK(identifier)                                                           \
    template <int step_number>                                                                     \
    AcResult acDeviceKernel_##identifier(const Device device, const Stream stream,                 \
                                         const int3 start, const int3 end)                         \
    {                                                                                              \
        cudaSetDevice(device->id);                                                                 \
                                                                                                   \
        const dim3 tpb(32, 1, 4);                                                                  \
                                                                                                   \
        const int3 n = end - start;                                                                \
        const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)),                                    \
                       (unsigned int)ceil(n.y / AcReal(tpb.y)),                                    \
                       (unsigned int)ceil(n.z / AcReal(tpb.z)));                                   \
                                                                                                   \
        identifier<step_number>                                                                    \
            <<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba);                   \
        ERRCHK_CUDA_KERNEL();                                                                      \
                                                                                                   \
        return AC_SUCCESS;                                                                         \
    }
 /*
 #define GEN_NODE_FUNC_HOOK(identifier)                                                             \
    template <int step_number>                                                                     \
    AcResult acNodeKernel_##identifier(const Node node, const Stream stream, const int3 start,     \
                                       const int3 end)                                             \
    {                                                                                              \
        acNodeSynchronizeStream(node, stream);                                                     \
                                                                                                   \
        for (int i = 0; i < node->num_devices; ++i) {                                              \
                                                                                                   \
            const int3 d0 = (int3){NGHOST, NGHOST, NGHOST + i * node->subgrid.n.z};                \
            const int3 d1 = d0 + (int3){node->subgrid.n.x, node->subgrid.n.y, node->subgrid.n.z};  \
                                                                                                   \
            const int3 da = max(start, d0);                                                        \
            const int3 db = min(end, d1);                                                          \
                                                                                                   \
            if (db.z >= da.z) {                                                                    \
                const int3 da_local = da - (int3){0, 0, i * node->subgrid.n.z};                    \
                const int3 db_local = db - (int3){0, 0, i * node->subgrid.n.z};                    \
                acDeviceKernel_ #identifier(node->devices[i], stream, isubstep, da_local,          \
                                            db_local, dt);                                         \
            }                                                                                      \
        }                                                                                          \
        return AC_SUCCESS;                                                                         \
    }
    */
 // clang-format on
 #include "stencil_process.cuh"
		`@@ -0,0 +1 @@`
							`nvcc -E ../src/core/device.cu -I ../include -I ../ > preprocessed_device_files.pp`