Merged in dsl_feature_completeness_2019-08-27_V2 (pull request #7)
Dsl feature completeness 2019 08 27 V2 Approved-by: Miikka Väisälä <mvaisala@asiaa.sinica.edu.tw>
This commit is contained in:
@@ -15,6 +15,7 @@ uniform int AC_bin_steps;
|
|||||||
uniform int AC_bc_type;
|
uniform int AC_bc_type;
|
||||||
|
|
||||||
// Real params
|
// Real params
|
||||||
|
uniform Scalar AC_dt;
|
||||||
// Spacing
|
// Spacing
|
||||||
uniform Scalar AC_dsx;
|
uniform Scalar AC_dsx;
|
||||||
uniform Scalar AC_dsy;
|
uniform Scalar AC_dsy;
|
||||||
|
@@ -294,8 +294,9 @@ out ScalarField out_tt(VTXBUF_TEMPERATURE);
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
Kernel void
|
Kernel void
|
||||||
solve(Scalar dt)
|
solve()
|
||||||
{
|
{
|
||||||
|
Scalar dt = AC_dt;
|
||||||
out_lnrho = rk3(out_lnrho, lnrho, continuity(uu, lnrho), dt);
|
out_lnrho = rk3(out_lnrho, lnrho, continuity(uu, lnrho), dt);
|
||||||
|
|
||||||
#if LMAGNETIC
|
#if LMAGNETIC
|
||||||
|
@@ -62,8 +62,8 @@ static const char* translation_table[TRANSLATION_TABLE_SIZE] = {
|
|||||||
[MATRIX] = "AcMatrix",
|
[MATRIX] = "AcMatrix",
|
||||||
[SCALARFIELD] = "AcReal",
|
[SCALARFIELD] = "AcReal",
|
||||||
// Type qualifiers
|
// Type qualifiers
|
||||||
[KERNEL] = "template <int step_number> static "
|
[KERNEL] = "template <int step_number> static __global__",
|
||||||
"__global__", //__launch_bounds__(RK_THREADBLOCK_SIZE,
|
//__launch_bounds__(RK_THREADBLOCK_SIZE,
|
||||||
// RK_LAUNCH_BOUND_MIN_BLOCKS),
|
// RK_LAUNCH_BOUND_MIN_BLOCKS),
|
||||||
[PREPROCESSED] = "static __device__ "
|
[PREPROCESSED] = "static __device__ "
|
||||||
"__forceinline__",
|
"__forceinline__",
|
||||||
@@ -318,10 +318,16 @@ traverse(const ASTNode* node)
|
|||||||
inside_kernel = true;
|
inside_kernel = true;
|
||||||
|
|
||||||
// Kernel parameter boilerplate
|
// Kernel parameter boilerplate
|
||||||
const char* kernel_parameter_boilerplate = "GEN_KERNEL_PARAM_BOILERPLATE, ";
|
const char* kernel_parameter_boilerplate = "GEN_KERNEL_PARAM_BOILERPLATE";
|
||||||
if (inside_kernel && node->type == NODE_FUNCTION_PARAMETER_DECLARATION)
|
if (inside_kernel && node->type == NODE_FUNCTION_PARAMETER_DECLARATION) {
|
||||||
printf("%s", kernel_parameter_boilerplate);
|
printf("%s", kernel_parameter_boilerplate);
|
||||||
|
|
||||||
|
if (node->lhs != NULL) {
|
||||||
|
printf("Compilation error: function parameters for Kernel functions not allowed!\n");
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Kernel builtin variables boilerplate (read input/output arrays and setup
|
// Kernel builtin variables boilerplate (read input/output arrays and setup
|
||||||
// indices)
|
// indices)
|
||||||
const char* kernel_builtin_variables_boilerplate = "GEN_KERNEL_BUILTIN_VARIABLES_"
|
const char* kernel_builtin_variables_boilerplate = "GEN_KERNEL_BUILTIN_VARIABLES_"
|
||||||
@@ -619,6 +625,17 @@ generate_header(void)
|
|||||||
*/
|
*/
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
generate_library_hooks(void)
|
||||||
|
{
|
||||||
|
for (int i = 0; i < num_symbols; ++i) {
|
||||||
|
if (symbol_table[i].type_qualifier == KERNEL) {
|
||||||
|
printf("GEN_DEVICE_FUNC_HOOK(%s)\n", symbol_table[i].identifier);
|
||||||
|
// printf("GEN_NODE_FUNC_HOOK(%s)\n", symbol_table[i].identifier);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
int
|
int
|
||||||
main(int argc, char** argv)
|
main(int argc, char** argv)
|
||||||
{
|
{
|
||||||
@@ -656,6 +673,8 @@ main(int argc, char** argv)
|
|||||||
generate_preprocessed_structures();
|
generate_preprocessed_structures();
|
||||||
else if (compilation_type == STENCIL_HEADER)
|
else if (compilation_type == STENCIL_HEADER)
|
||||||
generate_header();
|
generate_header();
|
||||||
|
else if (compilation_type == STENCIL_PROCESS)
|
||||||
|
generate_library_hooks();
|
||||||
|
|
||||||
// print_symbol_table();
|
// print_symbol_table();
|
||||||
|
|
||||||
|
1
scripts/preprocess_device_files.sh
Executable file
1
scripts/preprocess_device_files.sh
Executable file
@@ -0,0 +1 @@
|
|||||||
|
nvcc -E ../src/core/device.cu -I ../include -I ../ > preprocessed_device_files.pp
|
@@ -39,6 +39,24 @@ typedef struct {
|
|||||||
AcReal* out[NUM_VTXBUF_HANDLES];
|
AcReal* out[NUM_VTXBUF_HANDLES];
|
||||||
} VertexBufferArray;
|
} VertexBufferArray;
|
||||||
|
|
||||||
|
struct device_s {
|
||||||
|
int id;
|
||||||
|
AcMeshInfo local_config;
|
||||||
|
|
||||||
|
// Concurrency
|
||||||
|
cudaStream_t streams[NUM_STREAM_TYPES];
|
||||||
|
|
||||||
|
// Memory
|
||||||
|
VertexBufferArray vba;
|
||||||
|
AcReal* reduce_scratchpad;
|
||||||
|
AcReal* reduce_result;
|
||||||
|
|
||||||
|
#if PACKED_DATA_TRANSFERS
|
||||||
|
// Declare memory for buffers needed for packed data transfers here
|
||||||
|
// AcReal* data_packing_buffer;
|
||||||
|
#endif
|
||||||
|
};
|
||||||
|
|
||||||
__constant__ AcMeshInfo d_mesh_info;
|
__constant__ AcMeshInfo d_mesh_info;
|
||||||
static int __device__ __forceinline__
|
static int __device__ __forceinline__
|
||||||
DCONST(const AcIntParam param)
|
DCONST(const AcIntParam param)
|
||||||
@@ -89,24 +107,6 @@ static dim3 rk3_tpb(32, 1, 4);
|
|||||||
// #include "kernels/pack_unpack.cuh"
|
// #include "kernels/pack_unpack.cuh"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
struct device_s {
|
|
||||||
int id;
|
|
||||||
AcMeshInfo local_config;
|
|
||||||
|
|
||||||
// Concurrency
|
|
||||||
cudaStream_t streams[NUM_STREAM_TYPES];
|
|
||||||
|
|
||||||
// Memory
|
|
||||||
VertexBufferArray vba;
|
|
||||||
AcReal* reduce_scratchpad;
|
|
||||||
AcReal* reduce_result;
|
|
||||||
|
|
||||||
#if PACKED_DATA_TRANSFERS
|
|
||||||
// Declare memory for buffers needed for packed data transfers here
|
|
||||||
// AcReal* data_packing_buffer;
|
|
||||||
#endif
|
|
||||||
};
|
|
||||||
|
|
||||||
// clang-format off
|
// clang-format off
|
||||||
static __global__ void dummy_kernel(void) { DCONST((AcIntParam)0); DCONST((AcInt3Param)0); DCONST((AcRealParam)0); DCONST((AcReal3Param)0); }
|
static __global__ void dummy_kernel(void) { DCONST((AcIntParam)0); DCONST((AcInt3Param)0); DCONST((AcRealParam)0); DCONST((AcReal3Param)0); }
|
||||||
// clang-format on
|
// clang-format on
|
||||||
@@ -303,8 +303,9 @@ acDeviceAutoOptimize(const Device device)
|
|||||||
|
|
||||||
cudaEventRecord(tstart); // ---------------------------------------- Timing start
|
cudaEventRecord(tstart); // ---------------------------------------- Timing start
|
||||||
|
|
||||||
|
acDeviceLoadScalarConstant(device, STREAM_DEFAULT, AC_dt, FLT_EPSILON);
|
||||||
for (int i = 0; i < num_iterations; ++i)
|
for (int i = 0; i < num_iterations; ++i)
|
||||||
solve<2><<<bpg, tpb>>>(start, end, device->vba, FLT_EPSILON);
|
solve<2><<<bpg, tpb>>>(start, end, device->vba);
|
||||||
|
|
||||||
cudaEventRecord(tstop); // ----------------------------------------- Timing end
|
cudaEventRecord(tstop); // ----------------------------------------- Timing end
|
||||||
cudaEventSynchronize(tstop);
|
cudaEventSynchronize(tstop);
|
||||||
@@ -600,12 +601,13 @@ acDeviceIntegrateSubstep(const Device device, const Stream stream, const int ste
|
|||||||
(unsigned int)ceil(n.y / AcReal(tpb.y)), //
|
(unsigned int)ceil(n.y / AcReal(tpb.y)), //
|
||||||
(unsigned int)ceil(n.z / AcReal(tpb.z)));
|
(unsigned int)ceil(n.z / AcReal(tpb.z)));
|
||||||
|
|
||||||
|
acDeviceLoadScalarConstant(device, stream, AC_dt, dt);
|
||||||
if (step_number == 0)
|
if (step_number == 0)
|
||||||
solve<0><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba, dt);
|
solve<0><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba);
|
||||||
else if (step_number == 1)
|
else if (step_number == 1)
|
||||||
solve<1><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba, dt);
|
solve<1><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba);
|
||||||
else
|
else
|
||||||
solve<2><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba, dt);
|
solve<2><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba);
|
||||||
|
|
||||||
ERRCHK_CUDA_KERNEL();
|
ERRCHK_CUDA_KERNEL();
|
||||||
|
|
||||||
|
@@ -585,4 +585,54 @@ read_out(const int idx, AcReal* __restrict__ field[], const int3 handle)
|
|||||||
\
|
\
|
||||||
const int idx = IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z);
|
const int idx = IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z);
|
||||||
|
|
||||||
|
// clang-format off
|
||||||
|
#define GEN_DEVICE_FUNC_HOOK(identifier) \
|
||||||
|
template <int step_number> \
|
||||||
|
AcResult acDeviceKernel_##identifier(const Device device, const Stream stream, \
|
||||||
|
const int3 start, const int3 end) \
|
||||||
|
{ \
|
||||||
|
cudaSetDevice(device->id); \
|
||||||
|
\
|
||||||
|
const dim3 tpb(32, 1, 4); \
|
||||||
|
\
|
||||||
|
const int3 n = end - start; \
|
||||||
|
const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), \
|
||||||
|
(unsigned int)ceil(n.y / AcReal(tpb.y)), \
|
||||||
|
(unsigned int)ceil(n.z / AcReal(tpb.z))); \
|
||||||
|
\
|
||||||
|
identifier<step_number> \
|
||||||
|
<<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba); \
|
||||||
|
ERRCHK_CUDA_KERNEL(); \
|
||||||
|
\
|
||||||
|
return AC_SUCCESS; \
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
#define GEN_NODE_FUNC_HOOK(identifier) \
|
||||||
|
template <int step_number> \
|
||||||
|
AcResult acNodeKernel_##identifier(const Node node, const Stream stream, const int3 start, \
|
||||||
|
const int3 end) \
|
||||||
|
{ \
|
||||||
|
acNodeSynchronizeStream(node, stream); \
|
||||||
|
\
|
||||||
|
for (int i = 0; i < node->num_devices; ++i) { \
|
||||||
|
\
|
||||||
|
const int3 d0 = (int3){NGHOST, NGHOST, NGHOST + i * node->subgrid.n.z}; \
|
||||||
|
const int3 d1 = d0 + (int3){node->subgrid.n.x, node->subgrid.n.y, node->subgrid.n.z}; \
|
||||||
|
\
|
||||||
|
const int3 da = max(start, d0); \
|
||||||
|
const int3 db = min(end, d1); \
|
||||||
|
\
|
||||||
|
if (db.z >= da.z) { \
|
||||||
|
const int3 da_local = da - (int3){0, 0, i * node->subgrid.n.z}; \
|
||||||
|
const int3 db_local = db - (int3){0, 0, i * node->subgrid.n.z}; \
|
||||||
|
acDeviceKernel_ #identifier(node->devices[i], stream, isubstep, da_local, \
|
||||||
|
db_local, dt); \
|
||||||
|
} \
|
||||||
|
} \
|
||||||
|
return AC_SUCCESS; \
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
// clang-format on
|
||||||
|
|
||||||
#include "stencil_process.cuh"
|
#include "stencil_process.cuh"
|
||||||
|
Reference in New Issue
Block a user