DSL now 'feature complete' with respect to what I had in mind before the summer. Users can now create multiple kernels and the library functions are generated automatically for them. The generated library functions are of the form acDeviceKernel_<name> and acNodeKernel_<name>. More features are needed though. The next features to be added at some point are 1D and 2D device constant arrays in order to support profiles for f.ex. forcing.
This commit is contained in:
@@ -625,6 +625,17 @@ generate_header(void)
|
||||
*/
|
||||
}
|
||||
|
||||
static void
|
||||
generate_library_hooks(void)
|
||||
{
|
||||
for (int i = 0; i < num_symbols; ++i) {
|
||||
if (symbol_table[i].type_qualifier == KERNEL) {
|
||||
printf("GEN_DEVICE_FUNC_HOOK(%s)\n", symbol_table[i].identifier);
|
||||
// printf("GEN_NODE_FUNC_HOOK(%s)\n", symbol_table[i].identifier);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int
|
||||
main(int argc, char** argv)
|
||||
{
|
||||
@@ -662,6 +673,8 @@ main(int argc, char** argv)
|
||||
generate_preprocessed_structures();
|
||||
else if (compilation_type == STENCIL_HEADER)
|
||||
generate_header();
|
||||
else if (compilation_type == STENCIL_PROCESS)
|
||||
generate_library_hooks();
|
||||
|
||||
// print_symbol_table();
|
||||
|
||||
|
@@ -39,6 +39,24 @@ typedef struct {
|
||||
AcReal* out[NUM_VTXBUF_HANDLES];
|
||||
} VertexBufferArray;
|
||||
|
||||
struct device_s {
|
||||
int id;
|
||||
AcMeshInfo local_config;
|
||||
|
||||
// Concurrency
|
||||
cudaStream_t streams[NUM_STREAM_TYPES];
|
||||
|
||||
// Memory
|
||||
VertexBufferArray vba;
|
||||
AcReal* reduce_scratchpad;
|
||||
AcReal* reduce_result;
|
||||
|
||||
#if PACKED_DATA_TRANSFERS
|
||||
// Declare memory for buffers needed for packed data transfers here
|
||||
// AcReal* data_packing_buffer;
|
||||
#endif
|
||||
};
|
||||
|
||||
__constant__ AcMeshInfo d_mesh_info;
|
||||
static int __device__ __forceinline__
|
||||
DCONST(const AcIntParam param)
|
||||
@@ -89,24 +107,6 @@ static dim3 rk3_tpb(32, 1, 4);
|
||||
// #include "kernels/pack_unpack.cuh"
|
||||
#endif
|
||||
|
||||
struct device_s {
|
||||
int id;
|
||||
AcMeshInfo local_config;
|
||||
|
||||
// Concurrency
|
||||
cudaStream_t streams[NUM_STREAM_TYPES];
|
||||
|
||||
// Memory
|
||||
VertexBufferArray vba;
|
||||
AcReal* reduce_scratchpad;
|
||||
AcReal* reduce_result;
|
||||
|
||||
#if PACKED_DATA_TRANSFERS
|
||||
// Declare memory for buffers needed for packed data transfers here
|
||||
// AcReal* data_packing_buffer;
|
||||
#endif
|
||||
};
|
||||
|
||||
// clang-format off
|
||||
static __global__ void dummy_kernel(void) { DCONST((AcIntParam)0); DCONST((AcInt3Param)0); DCONST((AcRealParam)0); DCONST((AcReal3Param)0); }
|
||||
// clang-format on
|
||||
|
@@ -585,4 +585,54 @@ read_out(const int idx, AcReal* __restrict__ field[], const int3 handle)
|
||||
\
|
||||
const int idx = IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z);
|
||||
|
||||
// clang-format off
|
||||
#define GEN_DEVICE_FUNC_HOOK(identifier) \
|
||||
template <int step_number> \
|
||||
AcResult acDeviceKernel_##identifier(const Device device, const Stream stream, \
|
||||
const int3 start, const int3 end) \
|
||||
{ \
|
||||
cudaSetDevice(device->id); \
|
||||
\
|
||||
const dim3 tpb(32, 1, 4); \
|
||||
\
|
||||
const int3 n = end - start; \
|
||||
const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), \
|
||||
(unsigned int)ceil(n.y / AcReal(tpb.y)), \
|
||||
(unsigned int)ceil(n.z / AcReal(tpb.z))); \
|
||||
\
|
||||
identifier<step_number> \
|
||||
<<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba); \
|
||||
ERRCHK_CUDA_KERNEL(); \
|
||||
\
|
||||
return AC_SUCCESS; \
|
||||
}
|
||||
|
||||
/*
|
||||
#define GEN_NODE_FUNC_HOOK(identifier) \
|
||||
template <int step_number> \
|
||||
AcResult acNodeKernel_##identifier(const Node node, const Stream stream, const int3 start, \
|
||||
const int3 end) \
|
||||
{ \
|
||||
acNodeSynchronizeStream(node, stream); \
|
||||
\
|
||||
for (int i = 0; i < node->num_devices; ++i) { \
|
||||
\
|
||||
const int3 d0 = (int3){NGHOST, NGHOST, NGHOST + i * node->subgrid.n.z}; \
|
||||
const int3 d1 = d0 + (int3){node->subgrid.n.x, node->subgrid.n.y, node->subgrid.n.z}; \
|
||||
\
|
||||
const int3 da = max(start, d0); \
|
||||
const int3 db = min(end, d1); \
|
||||
\
|
||||
if (db.z >= da.z) { \
|
||||
const int3 da_local = da - (int3){0, 0, i * node->subgrid.n.z}; \
|
||||
const int3 db_local = db - (int3){0, 0, i * node->subgrid.n.z}; \
|
||||
acDeviceKernel_ #identifier(node->devices[i], stream, isubstep, da_local, \
|
||||
db_local, dt); \
|
||||
} \
|
||||
} \
|
||||
return AC_SUCCESS; \
|
||||
}
|
||||
*/
|
||||
// clang-format on
|
||||
|
||||
#include "stencil_process.cuh"
|
||||
|
Reference in New Issue
Block a user