DSL now 'feature complete' with respect to what I had in mind before the summer. Users can now create multiple kernels and the library functions are generated automatically for them. The generated library functions are of the form acDeviceKernel_<name> and acNodeKernel_<name>. More features are needed though. The next features to be added at some point are 1D and 2D device constant arrays in order to support profiles for f.ex. forcing.

This commit is contained in:
jpekkila
2019-08-27 18:19:20 +03:00
parent 230230ead9
commit 6ea02fa28e
3 changed files with 81 additions and 18 deletions

View File

@@ -625,6 +625,17 @@ generate_header(void)
*/
}
static void
generate_library_hooks(void)
{
for (int i = 0; i < num_symbols; ++i) {
if (symbol_table[i].type_qualifier == KERNEL) {
printf("GEN_DEVICE_FUNC_HOOK(%s)\n", symbol_table[i].identifier);
// printf("GEN_NODE_FUNC_HOOK(%s)\n", symbol_table[i].identifier);
}
}
}
int
main(int argc, char** argv)
{
@@ -662,6 +673,8 @@ main(int argc, char** argv)
generate_preprocessed_structures();
else if (compilation_type == STENCIL_HEADER)
generate_header();
else if (compilation_type == STENCIL_PROCESS)
generate_library_hooks();
// print_symbol_table();

View File

@@ -39,6 +39,24 @@ typedef struct {
AcReal* out[NUM_VTXBUF_HANDLES];
} VertexBufferArray;
struct device_s {
int id;
AcMeshInfo local_config;
// Concurrency
cudaStream_t streams[NUM_STREAM_TYPES];
// Memory
VertexBufferArray vba;
AcReal* reduce_scratchpad;
AcReal* reduce_result;
#if PACKED_DATA_TRANSFERS
// Declare memory for buffers needed for packed data transfers here
// AcReal* data_packing_buffer;
#endif
};
__constant__ AcMeshInfo d_mesh_info;
static int __device__ __forceinline__
DCONST(const AcIntParam param)
@@ -89,24 +107,6 @@ static dim3 rk3_tpb(32, 1, 4);
// #include "kernels/pack_unpack.cuh"
#endif
struct device_s {
int id;
AcMeshInfo local_config;
// Concurrency
cudaStream_t streams[NUM_STREAM_TYPES];
// Memory
VertexBufferArray vba;
AcReal* reduce_scratchpad;
AcReal* reduce_result;
#if PACKED_DATA_TRANSFERS
// Declare memory for buffers needed for packed data transfers here
// AcReal* data_packing_buffer;
#endif
};
// clang-format off
static __global__ void dummy_kernel(void) { DCONST((AcIntParam)0); DCONST((AcInt3Param)0); DCONST((AcRealParam)0); DCONST((AcReal3Param)0); }
// clang-format on

View File

@@ -585,4 +585,54 @@ read_out(const int idx, AcReal* __restrict__ field[], const int3 handle)
\
const int idx = IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z);
// clang-format off
#define GEN_DEVICE_FUNC_HOOK(identifier) \
template <int step_number> \
AcResult acDeviceKernel_##identifier(const Device device, const Stream stream, \
const int3 start, const int3 end) \
{ \
cudaSetDevice(device->id); \
\
const dim3 tpb(32, 1, 4); \
\
const int3 n = end - start; \
const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), \
(unsigned int)ceil(n.y / AcReal(tpb.y)), \
(unsigned int)ceil(n.z / AcReal(tpb.z))); \
\
identifier<step_number> \
<<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba); \
ERRCHK_CUDA_KERNEL(); \
\
return AC_SUCCESS; \
}
/*
#define GEN_NODE_FUNC_HOOK(identifier) \
template <int step_number> \
AcResult acNodeKernel_##identifier(const Node node, const Stream stream, const int3 start, \
const int3 end) \
{ \
acNodeSynchronizeStream(node, stream); \
\
for (int i = 0; i < node->num_devices; ++i) { \
\
const int3 d0 = (int3){NGHOST, NGHOST, NGHOST + i * node->subgrid.n.z}; \
const int3 d1 = d0 + (int3){node->subgrid.n.x, node->subgrid.n.y, node->subgrid.n.z}; \
\
const int3 da = max(start, d0); \
const int3 db = min(end, d1); \
\
if (db.z >= da.z) { \
const int3 da_local = da - (int3){0, 0, i * node->subgrid.n.z}; \
const int3 db_local = db - (int3){0, 0, i * node->subgrid.n.z}; \
acDeviceKernel_ #identifier(node->devices[i], stream, isubstep, da_local, \
db_local, dt); \
} \
} \
return AC_SUCCESS; \
}
*/
// clang-format on
#include "stencil_process.cuh"