DSL now 'feature complete' with respect to what I had in mind before the summer. Users can now create multiple kernels and the library functions are generated automatically for them. The generated library functions are of the form acDeviceKernel_<name> and acNodeKernel_<name>. More features are needed though. The next features to be added at some point are 1D and 2D device constant arrays in order to support profiles for f.ex. forcing.

2019-08-27 18:19:20 +03:00
parent 230230ead9
commit 6ea02fa28e
3 changed files with 81 additions and 18 deletions
--- a/acc/src/code_generator.c
+++ b/acc/src/code_generator.c
@@ -625,6 +625,17 @@ generate_header(void)
    */
 }

+static void
+generate_library_hooks(void)
+{
+    for (int i = 0; i < num_symbols; ++i) {
+        if (symbol_table[i].type_qualifier == KERNEL) {
+            printf("GEN_DEVICE_FUNC_HOOK(%s)\n", symbol_table[i].identifier);
+            // printf("GEN_NODE_FUNC_HOOK(%s)\n", symbol_table[i].identifier);
+        }
+    }
+}
+
 int
 main(int argc, char** argv)
 {
@@ -662,6 +673,8 @@ main(int argc, char** argv)
        generate_preprocessed_structures();
    else if (compilation_type == STENCIL_HEADER)
        generate_header();
+    else if (compilation_type == STENCIL_PROCESS)
+        generate_library_hooks();

    // print_symbol_table();

--- a/src/core/device.cu
+++ b/src/core/device.cu
@@ -39,6 +39,24 @@ typedef struct {
    AcReal* out[NUM_VTXBUF_HANDLES];
 } VertexBufferArray;

+struct device_s {
+    int id;
+    AcMeshInfo local_config;
+
+    // Concurrency
+    cudaStream_t streams[NUM_STREAM_TYPES];
+
+    // Memory
+    VertexBufferArray vba;
+    AcReal* reduce_scratchpad;
+    AcReal* reduce_result;
+
+#if PACKED_DATA_TRANSFERS
+// Declare memory for buffers needed for packed data transfers here
+// AcReal* data_packing_buffer;
+#endif
+};
+
 __constant__ AcMeshInfo d_mesh_info;
 static int __device__ __forceinline__
 DCONST(const AcIntParam param)
@@ -89,24 +107,6 @@ static dim3 rk3_tpb(32, 1, 4);
 // #include "kernels/pack_unpack.cuh"
 #endif

-struct device_s {
-    int id;
-    AcMeshInfo local_config;
-
-    // Concurrency
-    cudaStream_t streams[NUM_STREAM_TYPES];
-
-    // Memory
-    VertexBufferArray vba;
-    AcReal* reduce_scratchpad;
-    AcReal* reduce_result;
-
-#if PACKED_DATA_TRANSFERS
-// Declare memory for buffers needed for packed data transfers here
-// AcReal* data_packing_buffer;
-#endif
-};
-
 // clang-format off
 static __global__ void dummy_kernel(void) { DCONST((AcIntParam)0); DCONST((AcInt3Param)0); DCONST((AcRealParam)0); DCONST((AcReal3Param)0); }
 // clang-format on
--- a/src/core/kernels/integration.cuh
+++ b/src/core/kernels/integration.cuh
@@ -585,4 +585,54 @@ read_out(const int idx, AcReal* __restrict__ field[], const int3 handle)
                                                                                                   \
    const int idx = IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z);

+// clang-format off
+#define GEN_DEVICE_FUNC_HOOK(identifier)                                                           \
+    template <int step_number>                                                                     \
+    AcResult acDeviceKernel_##identifier(const Device device, const Stream stream,                 \
+                                         const int3 start, const int3 end)                         \
+    {                                                                                              \
+        cudaSetDevice(device->id);                                                                 \
+                                                                                                   \
+        const dim3 tpb(32, 1, 4);                                                                  \
+                                                                                                   \
+        const int3 n = end - start;                                                                \
+        const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)),                                    \
+                       (unsigned int)ceil(n.y / AcReal(tpb.y)),                                    \
+                       (unsigned int)ceil(n.z / AcReal(tpb.z)));                                   \
+                                                                                                   \
+        identifier<step_number>                                                                    \
+            <<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba);                   \
+        ERRCHK_CUDA_KERNEL();                                                                      \
+                                                                                                   \
+        return AC_SUCCESS;                                                                         \
+    }
+
+/*
+#define GEN_NODE_FUNC_HOOK(identifier)                                                             \
+    template <int step_number>                                                                     \
+    AcResult acNodeKernel_##identifier(const Node node, const Stream stream, const int3 start,     \
+                                       const int3 end)                                             \
+    {                                                                                              \
+        acNodeSynchronizeStream(node, stream);                                                     \
+                                                                                                   \
+        for (int i = 0; i < node->num_devices; ++i) {                                              \
+                                                                                                   \
+            const int3 d0 = (int3){NGHOST, NGHOST, NGHOST + i * node->subgrid.n.z};                \
+            const int3 d1 = d0 + (int3){node->subgrid.n.x, node->subgrid.n.y, node->subgrid.n.z};  \
+                                                                                                   \
+            const int3 da = max(start, d0);                                                        \
+            const int3 db = min(end, d1);                                                          \
+                                                                                                   \
+            if (db.z >= da.z) {                                                                    \
+                const int3 da_local = da - (int3){0, 0, i * node->subgrid.n.z};                    \
+                const int3 db_local = db - (int3){0, 0, i * node->subgrid.n.z};                    \
+                acDeviceKernel_ #identifier(node->devices[i], stream, isubstep, da_local,          \
+                                            db_local, dt);                                         \
+            }                                                                                      \
+        }                                                                                          \
+        return AC_SUCCESS;                                                                         \
+    }
+    */
+// clang-format on
+
 #include "stencil_process.cuh"