diff --git a/acc/mhd_solver/stencil_definition.sdh b/acc/mhd_solver/stencil_definition.sdh index 31667fc..c4324e5 100644 --- a/acc/mhd_solver/stencil_definition.sdh +++ b/acc/mhd_solver/stencil_definition.sdh @@ -15,6 +15,7 @@ uniform int AC_bin_steps; uniform int AC_bc_type; // Real params +uniform Scalar AC_dt; // Spacing uniform Scalar AC_dsx; uniform Scalar AC_dsy; diff --git a/acc/mhd_solver/stencil_process.sps b/acc/mhd_solver/stencil_process.sps index e605b9f..0db62e0 100644 --- a/acc/mhd_solver/stencil_process.sps +++ b/acc/mhd_solver/stencil_process.sps @@ -294,8 +294,9 @@ out ScalarField out_tt(VTXBUF_TEMPERATURE); #endif Kernel void -solve(Scalar dt) +solve() { + Scalar dt = AC_dt; out_lnrho = rk3(out_lnrho, lnrho, continuity(uu, lnrho), dt); #if LMAGNETIC diff --git a/acc/src/code_generator.c b/acc/src/code_generator.c index e560a70..864d2ec 100644 --- a/acc/src/code_generator.c +++ b/acc/src/code_generator.c @@ -62,9 +62,9 @@ static const char* translation_table[TRANSLATION_TABLE_SIZE] = { [MATRIX] = "AcMatrix", [SCALARFIELD] = "AcReal", // Type qualifiers - [KERNEL] = "template static " - "__global__", //__launch_bounds__(RK_THREADBLOCK_SIZE, - // RK_LAUNCH_BOUND_MIN_BLOCKS), + [KERNEL] = "template static __global__", + //__launch_bounds__(RK_THREADBLOCK_SIZE, + // RK_LAUNCH_BOUND_MIN_BLOCKS), [PREPROCESSED] = "static __device__ " "__forceinline__", [CONSTANT] = "const", @@ -318,9 +318,15 @@ traverse(const ASTNode* node) inside_kernel = true; // Kernel parameter boilerplate - const char* kernel_parameter_boilerplate = "GEN_KERNEL_PARAM_BOILERPLATE, "; - if (inside_kernel && node->type == NODE_FUNCTION_PARAMETER_DECLARATION) - printf("%s ", kernel_parameter_boilerplate); + const char* kernel_parameter_boilerplate = "GEN_KERNEL_PARAM_BOILERPLATE"; + if (inside_kernel && node->type == NODE_FUNCTION_PARAMETER_DECLARATION) { + printf("%s", kernel_parameter_boilerplate); + + if (node->lhs != NULL) { + printf("Compilation error: function parameters for Kernel functions not allowed!\n"); + exit(EXIT_FAILURE); + } + } // Kernel builtin variables boilerplate (read input/output arrays and setup // indices) @@ -619,6 +625,17 @@ generate_header(void) */ } +static void +generate_library_hooks(void) +{ + for (int i = 0; i < num_symbols; ++i) { + if (symbol_table[i].type_qualifier == KERNEL) { + printf("GEN_DEVICE_FUNC_HOOK(%s)\n", symbol_table[i].identifier); + // printf("GEN_NODE_FUNC_HOOK(%s)\n", symbol_table[i].identifier); + } + } +} + int main(int argc, char** argv) { @@ -656,6 +673,8 @@ main(int argc, char** argv) generate_preprocessed_structures(); else if (compilation_type == STENCIL_HEADER) generate_header(); + else if (compilation_type == STENCIL_PROCESS) + generate_library_hooks(); // print_symbol_table(); diff --git a/scripts/preprocess_device_files.sh b/scripts/preprocess_device_files.sh new file mode 100755 index 0000000..0ef61f9 --- /dev/null +++ b/scripts/preprocess_device_files.sh @@ -0,0 +1 @@ +nvcc -E ../src/core/device.cu -I ../include -I ../ > preprocessed_device_files.pp diff --git a/src/core/device.cu b/src/core/device.cu index 15f0c1e..78e82d7 100644 --- a/src/core/device.cu +++ b/src/core/device.cu @@ -39,6 +39,24 @@ typedef struct { AcReal* out[NUM_VTXBUF_HANDLES]; } VertexBufferArray; +struct device_s { + int id; + AcMeshInfo local_config; + + // Concurrency + cudaStream_t streams[NUM_STREAM_TYPES]; + + // Memory + VertexBufferArray vba; + AcReal* reduce_scratchpad; + AcReal* reduce_result; + +#if PACKED_DATA_TRANSFERS +// Declare memory for buffers needed for packed data transfers here +// AcReal* data_packing_buffer; +#endif +}; + __constant__ AcMeshInfo d_mesh_info; static int __device__ __forceinline__ DCONST(const AcIntParam param) @@ -89,24 +107,6 @@ static dim3 rk3_tpb(32, 1, 4); // #include "kernels/pack_unpack.cuh" #endif -struct device_s { - int id; - AcMeshInfo local_config; - - // Concurrency - cudaStream_t streams[NUM_STREAM_TYPES]; - - // Memory - VertexBufferArray vba; - AcReal* reduce_scratchpad; - AcReal* reduce_result; - -#if PACKED_DATA_TRANSFERS -// Declare memory for buffers needed for packed data transfers here -// AcReal* data_packing_buffer; -#endif -}; - // clang-format off static __global__ void dummy_kernel(void) { DCONST((AcIntParam)0); DCONST((AcInt3Param)0); DCONST((AcRealParam)0); DCONST((AcReal3Param)0); } // clang-format on @@ -303,8 +303,9 @@ acDeviceAutoOptimize(const Device device) cudaEventRecord(tstart); // ---------------------------------------- Timing start + acDeviceLoadScalarConstant(device, STREAM_DEFAULT, AC_dt, FLT_EPSILON); for (int i = 0; i < num_iterations; ++i) - solve<2><<>>(start, end, device->vba, FLT_EPSILON); + solve<2><<>>(start, end, device->vba); cudaEventRecord(tstop); // ----------------------------------------- Timing end cudaEventSynchronize(tstop); @@ -600,12 +601,13 @@ acDeviceIntegrateSubstep(const Device device, const Stream stream, const int ste (unsigned int)ceil(n.y / AcReal(tpb.y)), // (unsigned int)ceil(n.z / AcReal(tpb.z))); + acDeviceLoadScalarConstant(device, stream, AC_dt, dt); if (step_number == 0) - solve<0><<streams[stream]>>>(start, end, device->vba, dt); + solve<0><<streams[stream]>>>(start, end, device->vba); else if (step_number == 1) - solve<1><<streams[stream]>>>(start, end, device->vba, dt); + solve<1><<streams[stream]>>>(start, end, device->vba); else - solve<2><<streams[stream]>>>(start, end, device->vba, dt); + solve<2><<streams[stream]>>>(start, end, device->vba); ERRCHK_CUDA_KERNEL(); diff --git a/src/core/kernels/integration.cuh b/src/core/kernels/integration.cuh index 1b62d1e..0a06c2f 100644 --- a/src/core/kernels/integration.cuh +++ b/src/core/kernels/integration.cuh @@ -585,4 +585,54 @@ read_out(const int idx, AcReal* __restrict__ field[], const int3 handle) \ const int idx = IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z); +// clang-format off +#define GEN_DEVICE_FUNC_HOOK(identifier) \ + template \ + AcResult acDeviceKernel_##identifier(const Device device, const Stream stream, \ + const int3 start, const int3 end) \ + { \ + cudaSetDevice(device->id); \ + \ + const dim3 tpb(32, 1, 4); \ + \ + const int3 n = end - start; \ + const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), \ + (unsigned int)ceil(n.y / AcReal(tpb.y)), \ + (unsigned int)ceil(n.z / AcReal(tpb.z))); \ + \ + identifier \ + <<streams[stream]>>>(start, end, device->vba); \ + ERRCHK_CUDA_KERNEL(); \ + \ + return AC_SUCCESS; \ + } + +/* +#define GEN_NODE_FUNC_HOOK(identifier) \ + template \ + AcResult acNodeKernel_##identifier(const Node node, const Stream stream, const int3 start, \ + const int3 end) \ + { \ + acNodeSynchronizeStream(node, stream); \ + \ + for (int i = 0; i < node->num_devices; ++i) { \ + \ + const int3 d0 = (int3){NGHOST, NGHOST, NGHOST + i * node->subgrid.n.z}; \ + const int3 d1 = d0 + (int3){node->subgrid.n.x, node->subgrid.n.y, node->subgrid.n.z}; \ + \ + const int3 da = max(start, d0); \ + const int3 db = min(end, d1); \ + \ + if (db.z >= da.z) { \ + const int3 da_local = da - (int3){0, 0, i * node->subgrid.n.z}; \ + const int3 db_local = db - (int3){0, 0, i * node->subgrid.n.z}; \ + acDeviceKernel_ #identifier(node->devices[i], stream, isubstep, da_local, \ + db_local, dt); \ + } \ + } \ + return AC_SUCCESS; \ + } + */ +// clang-format on + #include "stencil_process.cuh"