diff --git a/acc/src/code_generator.c b/acc/src/code_generator.c index 66d4c9d..fc8eb5d 100644 --- a/acc/src/code_generator.c +++ b/acc/src/code_generator.c @@ -316,9 +316,13 @@ traverse(const ASTNode* node) if (symbol_table[i].type_qualifier == IN) { printf("const %sData %s = READ(%s%s);\n", translate(symbol_table[i].type_specifier), symbol_table[i].identifier, inout_name_prefix, symbol_table[i].identifier); - } else if (symbol_table[i].type_qualifier == OUT) { - printf("%s %s = READ_OUT(%s%s);", translate(symbol_table[i].type_specifier), symbol_table[i].identifier, inout_name_prefix, symbol_table[i].identifier); - //printf("%s %s = buffer.out[%s%s][IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z)];\n", translate(symbol_table[i].type_specifier), symbol_table[i].identifier, inout_name_prefix, symbol_table[i].identifier); + } + else if (symbol_table[i].type_qualifier == OUT) { + printf("%s %s = READ_OUT(%s%s);", translate(symbol_table[i].type_specifier), + symbol_table[i].identifier, inout_name_prefix, symbol_table[i].identifier); + // printf("%s %s = buffer.out[%s%s][IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z)];\n", + // translate(symbol_table[i].type_specifier), symbol_table[i].identifier, + // inout_name_prefix, symbol_table[i].identifier); } } } @@ -326,8 +330,7 @@ traverse(const ASTNode* node) // Preprocessed parameter boilerplate if (node->type == NODE_TYPE_QUALIFIER && node->token == PREPROCESSED) inside_preprocessed = true; - static const char - preprocessed_parameter_boilerplate[] = "const int3 vertexIdx, "; + static const char preprocessed_parameter_boilerplate[] = "const int3 vertexIdx, "; if (inside_preprocessed && node->type == NODE_FUNCTION_PARAMETER_DECLARATION) printf("%s ", preprocessed_parameter_boilerplate); // BOILERPLATE END//////////////////////////////////////////////////////// @@ -343,7 +346,6 @@ traverse(const ASTNode* node) if (node->type == NODE_FUNCTION_DECLARATION) inside_function_declaration = false; - // If the node is a subscript expression and the expression list inside it is not empty if (node->type == NODE_MULTIDIM_SUBSCRIPT_EXPRESSION && node->rhs) printf("IDX("); @@ -354,7 +356,7 @@ traverse(const ASTNode* node) if (handle >= 0) { // The variable exists in the symbol table const Symbol* symbol = &symbol_table[handle]; - //if (symbol->type_qualifier == OUT) { + // if (symbol->type_qualifier == OUT) { // printf("%s%s", inout_name_prefix, symbol->identifier); //} if (symbol->type_qualifier == UNIFORM) { @@ -394,14 +396,16 @@ traverse(const ASTNode* node) // Postfix logic %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% // If the node is a subscript expression and the expression list inside it is not empty if (node->type == NODE_MULTIDIM_SUBSCRIPT_EXPRESSION && node->rhs) - printf(")"); // Closing bracket of IDX() + printf(")"); // Closing bracket of IDX() // Generate writeback boilerplate for OUT fields if (inside_kernel && node->type == NODE_COMPOUND_STATEMENT) { for (int i = 0; i < num_symbols; ++i) { if (symbol_table[i].type_qualifier == OUT) { - printf("WRITE_OUT(%s%s, %s);\n", inout_name_prefix, symbol_table[i].identifier, symbol_table[i].identifier); - //printf("buffer.out[%s%s][IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z)] = %s;\n", inout_name_prefix, symbol_table[i].identifier, symbol_table[i].identifier); + printf("WRITE_OUT(%s%s, %s);\n", inout_name_prefix, symbol_table[i].identifier, + symbol_table[i].identifier); + // printf("buffer.out[%s%s][IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z)] = %s;\n", + // inout_name_prefix, symbol_table[i].identifier, symbol_table[i].identifier); } } } @@ -486,8 +490,8 @@ generate_preprocessed_structures(void) for (int i = 0; i < num_symbols; ++i) { if (symbol_table[i].type_qualifier == PREPROCESSED) - printf("data.%s = preprocessed_%s(vertexIdx, buf[handle]);\n", symbol_table[i].identifier, - symbol_table[i].identifier); + printf("data.%s = preprocessed_%s(vertexIdx, buf[handle]);\n", + symbol_table[i].identifier, symbol_table[i].identifier); } printf("return data;\n"); printf("}\n"); diff --git a/include/astaroth.h b/include/astaroth.h index 14eed0c..10b883c 100644 --- a/include/astaroth.h +++ b/include/astaroth.h @@ -41,7 +41,6 @@ extern "C" { #include // size_t #include // CUDA vector types (float4, etc) - /* * ============================================================================= * Flags for auto-optimization @@ -59,7 +58,6 @@ extern "C" { #define NUM_ITERATIONS (10) #define WARP_SIZE (32) - /* * ============================================================================= * Compile-time constants used during simulation (user definable) @@ -75,7 +73,8 @@ extern "C" { // L-prefix inherited from the old Astaroth, no idea what it means // MV: L means a Logical switch variale, something having true of false value. -#define LFORCING (0) // Note: forcing is disabled currently in the files generated by acc (compiler of our DSL) +// Note: forcing is disabled currently in the files generated by acc (compiler of our DSL) +#define LFORCING (0) #define LINDUCTION (1) #define LENTROPY (1) #define LTEMPERATURE (0) @@ -258,28 +257,16 @@ typedef enum { AC_SUCCESS = 0, AC_FAILURE = 1 } AcResult; * Reduction types * ============================================================================= */ -typedef enum { - RTYPE_MAX, - RTYPE_MIN, - RTYPE_RMS, - RTYPE_RMS_EXP, - NUM_REDUCTION_TYPES -} ReductionType; +typedef enum { RTYPE_MAX, RTYPE_MIN, RTYPE_RMS, RTYPE_RMS_EXP, NUM_REDUCTION_TYPES } ReductionType; /* * ============================================================================= * Definitions for the enums and structs for AcMeshInfo (DO NOT TOUCH) * ============================================================================= */ -typedef enum { - AC_FOR_INT_PARAM_TYPES(AC_GEN_ID), - NUM_INT_PARAM_TYPES -} AcIntParam; +typedef enum { AC_FOR_INT_PARAM_TYPES(AC_GEN_ID), NUM_INT_PARAM_TYPES } AcIntParam; -typedef enum { - AC_FOR_REAL_PARAM_TYPES(AC_GEN_ID), - NUM_REAL_PARAM_TYPES -} AcRealParam; +typedef enum { AC_FOR_REAL_PARAM_TYPES(AC_GEN_ID), NUM_REAL_PARAM_TYPES } AcRealParam; extern const char* intparam_names[]; // Defined in astaroth.cu extern const char* realparam_names[]; // Defined in astaroth.cu @@ -294,9 +281,7 @@ typedef struct { * Definitions for the enums and structs for AcMesh (DO NOT TOUCH) * ============================================================================= */ -typedef enum { - AC_FOR_VTXBUF_HANDLES(AC_GEN_ID) NUM_VTXBUF_HANDLES -} VertexBufferHandle; +typedef enum { AC_FOR_VTXBUF_HANDLES(AC_GEN_ID) NUM_VTXBUF_HANDLES } VertexBufferHandle; extern const char* vtxbuf_names[]; // Defined in astaroth.cu @@ -316,22 +301,20 @@ typedef struct { AcMeshInfo info; } AcMesh; -#define AC_VTXBUF_SIZE(mesh_info) \ - ((size_t)(mesh_info.int_params[AC_mx] * mesh_info.int_params[AC_my] * \ +#define AC_VTXBUF_SIZE(mesh_info) \ + ((size_t)(mesh_info.int_params[AC_mx] * mesh_info.int_params[AC_my] * \ mesh_info.int_params[AC_mz])) -#define AC_VTXBUF_SIZE_BYTES(mesh_info) \ - (sizeof(AcReal) * AC_VTXBUF_SIZE(mesh_info)) +#define AC_VTXBUF_SIZE_BYTES(mesh_info) (sizeof(AcReal) * AC_VTXBUF_SIZE(mesh_info)) -#define AC_VTXBUF_COMPDOMAIN_SIZE(mesh_info) \ - (mesh_info.int_params[AC_nx] * mesh_info.int_params[AC_ny] * \ - mesh_info.int_params[AC_nz]) +#define AC_VTXBUF_COMPDOMAIN_SIZE(mesh_info) \ + (mesh_info.int_params[AC_nx] * mesh_info.int_params[AC_ny] * mesh_info.int_params[AC_nz]) -#define AC_VTXBUF_COMPDOMAIN_SIZE_BYTES(mesh_info) \ +#define AC_VTXBUF_COMPDOMAIN_SIZE_BYTES(mesh_info) \ (sizeof(AcReal) * AC_VTXBUF_COMPDOMAIN_SIZE(mesh_info)) -#define AC_VTXBUF_IDX(i, j, k, mesh_info) \ - ((i) + (j)*mesh_info.int_params[AC_mx] + \ +#define AC_VTXBUF_IDX(i, j, k, mesh_info) \ + ((i) + (j)*mesh_info.int_params[AC_mx] + \ (k)*mesh_info.int_params[AC_mx] * mesh_info.int_params[AC_my]) /* diff --git a/src/core/astaroth.cu b/src/core/astaroth.cu index 58b551b..4692df3 100644 --- a/src/core/astaroth.cu +++ b/src/core/astaroth.cu @@ -28,33 +28,24 @@ #include "errchk.h" #include "device.cuh" -#include "math_utils.h" // sum for reductions +#include "math_utils.h" // sum for reductions #include "standalone/config_loader.h" // update_config -const char* intparam_names[] = {AC_FOR_INT_PARAM_TYPES(AC_GEN_STR)}; -const char* realparam_names[] = {AC_FOR_REAL_PARAM_TYPES(AC_GEN_STR)}; -const char* vtxbuf_names[] = {AC_FOR_VTXBUF_HANDLES(AC_GEN_STR)}; +const char* intparam_names[] = {AC_FOR_INT_PARAM_TYPES(AC_GEN_STR)}; +const char* realparam_names[] = {AC_FOR_REAL_PARAM_TYPES(AC_GEN_STR)}; +const char* vtxbuf_names[] = {AC_FOR_VTXBUF_HANDLES(AC_GEN_STR)}; - -static const int MAX_NUM_DEVICES = 32; -static int num_devices = 1; +static const int MAX_NUM_DEVICES = 32; +static int num_devices = 1; static Device devices[MAX_NUM_DEVICES] = {}; static Grid createGrid(const AcMeshInfo& config) { Grid grid; - grid.m = (int3) { - config.int_params[AC_mx], - config.int_params[AC_my], - config.int_params[AC_mz] - }; - grid.n = (int3) { - config.int_params[AC_nx], - config.int_params[AC_ny], - config.int_params[AC_nz] - }; + grid.m = (int3){config.int_params[AC_mx], config.int_params[AC_my], config.int_params[AC_mz]}; + grid.n = (int3){config.int_params[AC_nx], config.int_params[AC_ny], config.int_params[AC_nz]}; return grid; } @@ -71,8 +62,7 @@ gridIdx(const Grid& grid, const int i, const int j, const int k) static int3 gridIdx3d(const Grid& grid, const int idx) { - return (int3){idx % grid.m.x, - (idx % (grid.m.x * grid.m.y)) / grid.m.x, + return (int3){idx % grid.m.x, (idx % (grid.m.x * grid.m.y)) / grid.m.x, idx / (grid.m.x * grid.m.y)}; } @@ -119,10 +109,12 @@ acInit(const AcMeshInfo& config) ERRCHK_ALWAYS(subgrid.n.y >= STENCIL_ORDER); ERRCHK_ALWAYS(subgrid.n.z >= STENCIL_ORDER); - printf("Grid m "); printInt3(grid.m); printf("\n"); - printf("Grid n "); printInt3(grid.n); printf("\n"); + // clang-format off + printf("Grid m "); printInt3(grid.m); printf("\n"); + printf("Grid n "); printInt3(grid.n); printf("\n"); printf("Subrid m "); printInt3(subgrid.m); printf("\n"); printf("Subrid n "); printInt3(subgrid.n); printf("\n"); + // clang-format on // Initialize the devices for (int i = 0; i < num_devices; ++i) { @@ -202,8 +194,10 @@ acLoadWithOffset(const AcMesh& host_mesh, const int3& src, const int num_vertice */ if (db.z >= da.z) { const int copy_cells = gridIdxx(subgrid, db) - gridIdxx(subgrid, da); - const int3 da_local = (int3){da.x, da.y, da.z - i * grid.n.z / num_devices}; // DECOMPOSITION OFFSET HERE - // printf("\t\tcopy %d cells to local index ", copy_cells); printInt3(da_local); printf("\n"); + const int3 da_local = (int3){ + da.x, da.y, da.z - i * grid.n.z / num_devices}; // DECOMPOSITION OFFSET HERE + // printf("\t\tcopy %d cells to local index ", copy_cells); printInt3(da_local); + // printf("\n"); copyMeshToDevice(devices[i], STREAM_PRIMARY, host_mesh, da, da_local, copy_cells); } printf("\n"); @@ -236,8 +230,10 @@ acStoreWithOffset(const int3& src, const int num_vertices, AcMesh* host_mesh) */ if (db.z >= da.z) { const int copy_cells = gridIdxx(subgrid, db) - gridIdxx(subgrid, da); - const int3 da_local = (int3){da.x, da.y, da.z - i * grid.n.z / num_devices}; // DECOMPOSITION OFFSET HERE - // printf("\t\tcopy %d cells from local index ", copy_cells); printInt3(da_local); printf("\n"); + const int3 da_local = (int3){ + da.x, da.y, da.z - i * grid.n.z / num_devices}; // DECOMPOSITION OFFSET HERE + // printf("\t\tcopy %d cells from local index ", copy_cells); printInt3(da_local); + // printf("\n"); copyMeshToHost(devices[i], STREAM_PRIMARY, da_local, da, copy_cells, host_mesh); } printf("\n"); @@ -262,10 +258,9 @@ acStore(AcMesh* host_mesh) AcResult acIntegrateStep(const int& isubstep, const AcReal& dt) { - const int3 start = (int3){STENCIL_ORDER/2, STENCIL_ORDER/2, STENCIL_ORDER/2}; - const int3 end = (int3){STENCIL_ORDER/2 + subgrid.n.x, - STENCIL_ORDER/2 + subgrid.n.y, - STENCIL_ORDER/2 + subgrid.n.z}; + const int3 start = (int3){STENCIL_ORDER / 2, STENCIL_ORDER / 2, STENCIL_ORDER / 2}; + const int3 end = (int3){STENCIL_ORDER / 2 + subgrid.n.x, STENCIL_ORDER / 2 + subgrid.n.y, + STENCIL_ORDER / 2 + subgrid.n.z}; for (int i = 0; i < num_devices; ++i) { rkStep(devices[i], STREAM_PRIMARY, isubstep, start, end, dt); } @@ -278,121 +273,125 @@ acBoundcondStep(void) { acSynchronize(); if (num_devices == 1) { - boundcondStep(devices[0], STREAM_PRIMARY, - (int3){0, 0, 0}, (int3){subgrid.m.x, subgrid.m.y, subgrid.m.z}); - } else { + boundcondStep(devices[0], STREAM_PRIMARY, (int3){0, 0, 0}, + (int3){subgrid.m.x, subgrid.m.y, subgrid.m.z}); + } + else { // Local boundary conditions for (int i = 0; i < num_devices; ++i) { - const int3 d0 = (int3){0, 0, STENCIL_ORDER/2}; // DECOMPOSITION OFFSET HERE + const int3 d0 = (int3){0, 0, STENCIL_ORDER / 2}; // DECOMPOSITION OFFSET HERE const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.n.z}; boundcondStep(devices[i], STREAM_PRIMARY, d0, d1); } -/* -// ===MIIKKANOTE START========================================== -%JP: The old way for computing boundary conditions conflicts with the -way we have to do things with multiple GPUs. + /* + // ===MIIKKANOTE START========================================== + %JP: The old way for computing boundary conditions conflicts with the + way we have to do things with multiple GPUs. -The older approach relied on unified memory, which represented the whole -memory area as one huge mesh instead of several smaller ones. However, unified memory -in its current state is more meant for quick prototyping when performance is not an issue. -Getting the CUDA driver to migrate data intelligently across GPUs is much more difficult than -when managing the memory explicitly. + The older approach relied on unified memory, which represented the whole + memory area as one huge mesh instead of several smaller ones. However, unified memory + in its current state is more meant for quick prototyping when performance is not an issue. + Getting the CUDA driver to migrate data intelligently across GPUs is much more difficult + than when managing the memory explicitly. -In this new approach, I have simplified the multi- and single-GPU layers significantly. -Quick rundown: - New struct: Grid. There are two global variables, "grid" and "subgrid", which - contain the extents of the whole simulation domain and the decomposed grids, respectively. - To simplify thing, we require that each GPU is assigned the same amount of work, - therefore each GPU in the node is assigned and "subgrid.m" -sized block of data - to work with. + In this new approach, I have simplified the multi- and single-GPU layers significantly. + Quick rundown: + New struct: Grid. There are two global variables, "grid" and "subgrid", which + contain the extents of the whole simulation domain and the decomposed grids, + respectively. To simplify thing, we require that each GPU is assigned the same amount of + work, therefore each GPU in the node is assigned and "subgrid.m" -sized block of data to + work with. - The whole simulation domain is decomposed with respect to the z dimension. - For example, if the grid contains (nx, ny, nz) vertices, then the subgrids - contain (nx, ny, nz / num_devices) vertices. + The whole simulation domain is decomposed with respect to the z dimension. + For example, if the grid contains (nx, ny, nz) vertices, then the subgrids + contain (nx, ny, nz / num_devices) vertices. - An local index (i, j, k) in some subgrid can be mapped to the global grid with - global idx = (i, j, k + device_id * subgrid.n.z) + An local index (i, j, k) in some subgrid can be mapped to the global grid with + global idx = (i, j, k + device_id * subgrid.n.z) -Terminology: - - Single-GPU function: a function defined on the single-GPU layer (device.cu) + Terminology: + - Single-GPU function: a function defined on the single-GPU layer (device.cu) -Changes required to this commented code block: - - The thread block dimensions (tpb) are no longer passed to the kernel here but in device.cu - instead. Same holds for any complex index calculations. Instead, the local coordinates - should be passed as an int3 type without having to consider how the data is actually - laid out in device memory - - The unified memory buffer no longer exists (d_buffer). Instead, we have an opaque handle - of type "Device" which should be passed to single-GPU functions. In this file, all devices - are stored in a global array "devices[num_devices]". - - Every single-GPU function is executed asynchronously by default such that we - can optimize Astaroth by executing memory transactions concurrently with computation. - Therefore a StreamType should be passed as a parameter to single-GPU functions. - Refresher: CUDA function calls are non-blocking when a stream is explicitly passed - as a parameter and commands executing in different streams can be processed - in parallel/concurrently. + Changes required to this commented code block: + - The thread block dimensions (tpb) are no longer passed to the kernel here but in + device.cu instead. Same holds for any complex index calculations. Instead, the local + coordinates should be passed as an int3 type without having to consider how the data is + actually laid out in device memory + - The unified memory buffer no longer exists (d_buffer). Instead, we have an opaque + handle of type "Device" which should be passed to single-GPU functions. In this file, all + devices are stored in a global array "devices[num_devices]". + - Every single-GPU function is executed asynchronously by default such that we + can optimize Astaroth by executing memory transactions concurrently with + computation. Therefore a StreamType should be passed as a parameter to single-GPU functions. + Refresher: CUDA function calls are non-blocking when a stream is explicitly passed + as a parameter and commands executing in different streams can be processed + in parallel/concurrently. -Note on periodic boundaries (might be helpful when implementing other boundary conditions): + Note on periodic boundaries (might be helpful when implementing other boundary conditions): - With multiple GPUs, periodic boundary conditions applied on indices ranging from + With multiple GPUs, periodic boundary conditions applied on indices ranging from - (0, 0, STENCIL_ORDER/2) to (subgrid.m.x, subgrid.m.y, subgrid.m.z - STENCIL_ORDER/2) + (0, 0, STENCIL_ORDER/2) to (subgrid.m.x, subgrid.m.y, subgrid.m.z - + STENCIL_ORDER/2) - on a single device are "local", in the sense that they can be computed without having - to exchange data with neighboring GPUs. Special care is needed only for transferring - the data to the fron and back plates outside this range. In the solution we use here, - we solve the local boundaries first, and then just exchange the front and back plates - in a "ring", like so - device_id - (n) <-> 0 <-> 1 <-> ... <-> n <-> (0) + on a single device are "local", in the sense that they can be computed without + having to exchange data with neighboring GPUs. Special care is needed only for transferring + the data to the fron and back plates outside this range. In the solution we use + here, we solve the local boundaries first, and then just exchange the front and back plates + in a "ring", like so + device_id + (n) <-> 0 <-> 1 <-> ... <-> n <-> (0) -// ======MIIKKANOTE END========================================== + // ======MIIKKANOTE END========================================== -<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MIIKKANOTE: This code block was essentially - moved into device.cu, function boundCondStep() - In astaroth.cu, we use acBoundcondStep() - just to distribute the work and manage - communication between GPUs. + <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MIIKKANOTE: This code block was essentially + moved into device.cu, function + boundCondStep() In astaroth.cu, we use acBoundcondStep() just to distribute the work and + manage communication between GPUs. - printf("Boundconds best dims (%d, %d, %d) %f ms\n", best_dims.x, best_dims.y, best_dims.z, double(best_time) / NUM_ITERATIONS); + printf("Boundconds best dims (%d, %d, %d) %f ms\n", best_dims.x, best_dims.y, + best_dims.z, double(best_time) / NUM_ITERATIONS); - exit(0); - #else + exit(0); + #else - const int depth = (int)ceil(mesh_info.int_params[AC_mz]/(float)num_devices); + const int depth = (int)ceil(mesh_info.int_params[AC_mz]/(float)num_devices); - const int3 start = (int3){0, 0, device_id * depth}; - const int3 end = (int3){mesh_info.int_params[AC_mx], - mesh_info.int_params[AC_my], - min((device_id+1) * depth, mesh_info.int_params[AC_mz])}; + const int3 start = (int3){0, 0, device_id * depth}; + const int3 end = (int3){mesh_info.int_params[AC_mx], + mesh_info.int_params[AC_my], + min((device_id+1) * depth, mesh_info.int_params[AC_mz])}; - const dim3 tpb(8,2,8); + const dim3 tpb(8,2,8); - // TODO uses the default stream currently - if (mesh_info.int_params[AC_bc_type] == 666) { // TODO MAKE A BETTER SWITCH - wedge_boundconds(0, tpb, start, end, d_buffer); - } else { - for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) - periodic_boundconds(0, tpb, start, end, d_buffer.in[i]); -<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< -*/ + // TODO uses the default stream currently + if (mesh_info.int_params[AC_bc_type] == 666) { // TODO MAKE A BETTER SWITCH + wedge_boundconds(0, tpb, start, end, d_buffer); + } else { + for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) + periodic_boundconds(0, tpb, start, end, d_buffer.in[i]); + <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + */ // Exchange halos for (int i = 0; i < num_devices; ++i) { - const int num_vertices = subgrid.m.x * subgrid.m.y * STENCIL_ORDER/2; + const int num_vertices = subgrid.m.x * subgrid.m.y * STENCIL_ORDER / 2; // ...|ooooxxx|... -> xxx|ooooooo|... { - const int3 src = (int3) {0, 0, subgrid.n.z}; - const int3 dst = (int3) {0, 0, 0}; - copyMeshDeviceToDevice(devices[i], STREAM_PRIMARY, src, devices[(i+1) % num_devices], dst, num_vertices); + const int3 src = (int3){0, 0, subgrid.n.z}; + const int3 dst = (int3){0, 0, 0}; + copyMeshDeviceToDevice(devices[i], STREAM_PRIMARY, src, + devices[(i + 1) % num_devices], dst, num_vertices); } // ...|ooooooo|xxx <- ...|xxxoooo|... { - const int3 src = (int3) {0, 0, STENCIL_ORDER/2}; - const int3 dst = (int3) {0, 0, STENCIL_ORDER/2 + subgrid.n.z}; - copyMeshDeviceToDevice(devices[(i+1) % num_devices], STREAM_PRIMARY, src, devices[i], dst, num_vertices); + const int3 src = (int3){0, 0, STENCIL_ORDER / 2}; + const int3 dst = (int3){0, 0, STENCIL_ORDER / 2 + subgrid.n.z}; + copyMeshDeviceToDevice(devices[(i + 1) % num_devices], STREAM_PRIMARY, src, + devices[i], dst, num_vertices); } } } @@ -427,26 +426,28 @@ simple_final_reduce_scal(const ReductionType& rtype, const AcReal* results, cons for (int i = 1; i < n; ++i) { if (rtype == RTYPE_MAX) { res = max(res, results[i]); - } else if (rtype == RTYPE_MIN) { + } + else if (rtype == RTYPE_MIN) { res = min(res, results[i]); - } else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) { + } + else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) { res = sum(res, results[i]); - } else { + } + else { ERROR("Invalid rtype"); } } if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) { const AcReal inv_n = AcReal(1.) / (grid.n.x * grid.n.y * grid.n.z); - res = sqrt(inv_n * res); + res = sqrt(inv_n * res); } return res; } AcReal -acReduceScal(const ReductionType& rtype, - const VertexBufferHandle& vtxbuffer_handle) +acReduceScal(const ReductionType& rtype, const VertexBufferHandle& vtxbuffer_handle) { AcReal results[num_devices]; for (int i = 0; i < num_devices; ++i) { @@ -457,8 +458,8 @@ acReduceScal(const ReductionType& rtype, } AcReal -acReduceVec(const ReductionType& rtype, const VertexBufferHandle& a, - const VertexBufferHandle& b, const VertexBufferHandle& c) +acReduceVec(const ReductionType& rtype, const VertexBufferHandle& a, const VertexBufferHandle& b, + const VertexBufferHandle& c) { AcReal results[num_devices]; for (int i = 0; i < num_devices; ++i) { diff --git a/src/core/device.cu b/src/core/device.cu index 1ef2648..6df10f7 100644 --- a/src/core/device.cu +++ b/src/core/device.cu @@ -36,7 +36,7 @@ typedef struct { __constant__ AcMeshInfo d_mesh_info; __constant__ int3 d_multigpu_offset; __constant__ Grid globalGrid; -#define DCONST_INT(X) (d_mesh_info.int_params[X]) +#define DCONST_INT(X) (d_mesh_info.int_params[X]) #define DCONST_REAL(X) (d_mesh_info.real_params[X]) #define DEVICE_VTXBUF_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_mx) + (k)*DCONST_INT(AC_mxy)) #define DEVICE_1D_COMPDOMAIN_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_nx) + (k)*DCONST_INT(AC_nxy)) @@ -76,46 +76,46 @@ printDeviceInfo(const Device device) printf(" Clock rate (GHz): %g\n", props.clockRate / 1e6); // KHz -> GHz printf(" Stream processors: %d\n", props.multiProcessorCount); printf(" SP to DP flops performance ratio: %d:1\n", props.singleToDoublePrecisionPerfRatio); - printf(" Compute mode: %d\n", (int)props.computeMode); // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g7eb25f5413a962faad0956d92bae10d0 + printf( + " Compute mode: %d\n", + (int)props + .computeMode); // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g7eb25f5413a962faad0956d92bae10d0 // Memory printf(" Global memory\n"); printf(" Memory Clock Rate (MHz): %d\n", props.memoryClockRate / (1000)); printf(" Memory Bus Width (bits): %d\n", props.memoryBusWidth); printf(" Peak Memory Bandwidth (GiB/s): %f\n", - 2 * (props.memoryClockRate * 1e3) * props.memoryBusWidth / - (8. * 1024. * 1024. * 1024.)); + 2 * (props.memoryClockRate * 1e3) * props.memoryBusWidth / (8. * 1024. * 1024. * 1024.)); printf(" ECC enabled: %d\n", props.ECCEnabled); // Memory usage size_t free_bytes, total_bytes; cudaMemGetInfo(&free_bytes, &total_bytes); const size_t used_bytes = total_bytes - free_bytes; - printf(" Total global mem: %.2f GiB\n", - props.totalGlobalMem / (1024.0 * 1024 * 1024)); + printf(" Total global mem: %.2f GiB\n", props.totalGlobalMem / (1024.0 * 1024 * 1024)); printf(" Gmem used (GiB): %.2f\n", used_bytes / (1024.0 * 1024 * 1024)); - printf(" Gmem memory free (GiB): %.2f\n", - free_bytes / (1024.0 * 1024 * 1024)); - printf(" Gmem memory total (GiB): %.2f\n", - total_bytes / (1024.0 * 1024 * 1024)); + printf(" Gmem memory free (GiB): %.2f\n", free_bytes / (1024.0 * 1024 * 1024)); + printf(" Gmem memory total (GiB): %.2f\n", total_bytes / (1024.0 * 1024 * 1024)); printf(" Caches\n"); printf(" Local L1 cache supported: %d\n", props.localL1CacheSupported); printf(" Global L1 cache supported: %d\n", props.globalL1CacheSupported); printf(" L2 size: %d KiB\n", props.l2CacheSize / (1024)); printf(" Total const mem: %ld KiB\n", props.totalConstMem / (1024)); - printf(" Shared mem per block: %ld KiB\n", - props.sharedMemPerBlock / (1024)); + printf(" Shared mem per block: %ld KiB\n", props.sharedMemPerBlock / (1024)); printf(" Other\n"); printf(" Warp size: %d\n", props.warpSize); // printf(" Single to double perf. ratio: %dx\n", // props.singleToDoublePrecisionPerfRatio); //Not supported with older CUDA // versions - printf(" Stream priorities supported: %d\n", - props.streamPrioritiesSupported); + printf(" Stream priorities supported: %d\n", props.streamPrioritiesSupported); printf("--------------------------------------------------\n"); return AC_SUCCESS; } -static __global__ void dummy_kernel(void) {} +static __global__ void +dummy_kernel(void) +{ +} AcResult createDevice(const int id, const AcMeshInfo device_config, Device* device_handle) @@ -124,10 +124,10 @@ createDevice(const int id, const AcMeshInfo device_config, Device* device_handle cudaDeviceReset(); // Create Device - struct device_s* device = (struct device_s*) malloc(sizeof(*device)); + struct device_s* device = (struct device_s*)malloc(sizeof(*device)); ERRCHK_ALWAYS(device); - device->id = id; + device->id = id; device->local_config = device_config; // Check that the code was compiled for the proper GPU architecture @@ -150,15 +150,14 @@ createDevice(const int id, const AcMeshInfo device_config, Device* device_handle ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.in[i], vba_size_bytes)); ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.out[i], vba_size_bytes)); } - ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->reduce_scratchpad, - AC_VTXBUF_COMPDOMAIN_SIZE_BYTES(device_config))); + ERRCHK_CUDA_ALWAYS( + cudaMalloc(&device->reduce_scratchpad, AC_VTXBUF_COMPDOMAIN_SIZE_BYTES(device_config))); ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->reduce_result, sizeof(AcReal))); // Device constants ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_mesh_info, &device_config, sizeof(device_config), 0, cudaMemcpyHostToDevice)); - // Multi-GPU offset. This is used to compute globalVertexIdx. // Might be better to calculate this in astaroth.cu instead of here, s.t. // everything related to the decomposition is limited to the multi-GPU layer @@ -166,7 +165,6 @@ createDevice(const int id, const AcMeshInfo device_config, Device* device_handle ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_multigpu_offset, &multigpu_offset, sizeof(multigpu_offset), 0, cudaMemcpyHostToDevice)); - printf("Created device %d (%p)\n", device->id, device); *device_handle = device; return AC_SUCCESS; @@ -211,53 +209,44 @@ reduceScal(const Device device, const StreamType stream_type, const ReductionTyp { cudaSetDevice(device->id); - const int3 start = (int3) {device->local_config.int_params[AC_nx_min], - device->local_config.int_params[AC_ny_min], - device->local_config.int_params[AC_nz_min] - }; + const int3 start = (int3){device->local_config.int_params[AC_nx_min], + device->local_config.int_params[AC_ny_min], + device->local_config.int_params[AC_nz_min]}; - const int3 end = (int3) {device->local_config.int_params[AC_nx_max], - device->local_config.int_params[AC_ny_max], - device->local_config.int_params[AC_nz_max] - }; + const int3 end = (int3){device->local_config.int_params[AC_nx_max], + device->local_config.int_params[AC_ny_max], + device->local_config.int_params[AC_nz_max]}; - *result = reduce_scal(device->streams[stream_type], rtype, - start, end, device->vba.in[vtxbuf_handle], - device->reduce_scratchpad, device->reduce_result); + *result = reduce_scal(device->streams[stream_type], rtype, start, end, + device->vba.in[vtxbuf_handle], device->reduce_scratchpad, + device->reduce_result); return AC_SUCCESS; } AcResult -reduceVec(const Device device, const StreamType stream_type, - const ReductionType rtype, - const VertexBufferHandle vtxbuf0, - const VertexBufferHandle vtxbuf1, - const VertexBufferHandle vtxbuf2, - AcReal* result) +reduceVec(const Device device, const StreamType stream_type, const ReductionType rtype, + const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1, + const VertexBufferHandle vtxbuf2, AcReal* result) { cudaSetDevice(device->id); - const int3 start = (int3) {device->local_config.int_params[AC_nx_min], - device->local_config.int_params[AC_ny_min], - device->local_config.int_params[AC_nz_min] - }; + const int3 start = (int3){device->local_config.int_params[AC_nx_min], + device->local_config.int_params[AC_ny_min], + device->local_config.int_params[AC_nz_min]}; - const int3 end = (int3) {device->local_config.int_params[AC_nx_max], - device->local_config.int_params[AC_ny_max], - device->local_config.int_params[AC_nz_max] - }; + const int3 end = (int3){device->local_config.int_params[AC_nx_max], + device->local_config.int_params[AC_ny_max], + device->local_config.int_params[AC_nz_max]}; - *result = reduce_vec(device->streams[stream_type], rtype, start, end, - device->vba.in[vtxbuf0], - device->vba.in[vtxbuf1], - device->vba.in[vtxbuf2], - device->reduce_scratchpad, device->reduce_result); + *result = reduce_vec(device->streams[stream_type], rtype, start, end, device->vba.in[vtxbuf0], + device->vba.in[vtxbuf1], device->vba.in[vtxbuf2], + device->reduce_scratchpad, device->reduce_result); return AC_SUCCESS; } AcResult -rkStep(const Device device, const StreamType stream_type, const int step_number, - const int3& start, const int3& end, const AcReal dt) +rkStep(const Device device, const StreamType stream_type, const int step_number, const int3& start, + const int3& end, const AcReal dt) { cudaSetDevice(device->id); rk3_step_async(device->streams[stream_type], step_number, start, end, dt, &device->vba); @@ -270,65 +259,62 @@ synchronize(const Device device, const StreamType stream_type) cudaSetDevice(device->id); if (stream_type == STREAM_ALL) { cudaDeviceSynchronize(); - } else { + } + else { cudaStreamSynchronize(device->streams[stream_type]); } return AC_SUCCESS; } static AcResult -loadWithOffset(const Device device, const StreamType stream_type, - const AcReal* src, const size_t bytes, AcReal* dst) +loadWithOffset(const Device device, const StreamType stream_type, const AcReal* src, + const size_t bytes, AcReal* dst) { cudaSetDevice(device->id); - ERRCHK_CUDA(cudaMemcpyAsync(dst, src, bytes, cudaMemcpyHostToDevice, - device->streams[stream_type])); + ERRCHK_CUDA( + cudaMemcpyAsync(dst, src, bytes, cudaMemcpyHostToDevice, device->streams[stream_type])); return AC_SUCCESS; } static AcResult -storeWithOffset(const Device device, const StreamType stream_type, - const AcReal* src, const size_t bytes, AcReal* dst) +storeWithOffset(const Device device, const StreamType stream_type, const AcReal* src, + const size_t bytes, AcReal* dst) { cudaSetDevice(device->id); - ERRCHK_CUDA(cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDeviceToHost, - device->streams[stream_type])); + ERRCHK_CUDA( + cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDeviceToHost, device->streams[stream_type])); return AC_SUCCESS; } AcResult -copyMeshToDevice(const Device device, const StreamType stream_type, - const AcMesh& host_mesh, const int3& src, const int3& dst, - const int num_vertices) +copyMeshToDevice(const Device device, const StreamType stream_type, const AcMesh& host_mesh, + const int3& src, const int3& dst, const int num_vertices) { const size_t src_idx = AC_VTXBUF_IDX(src.x, src.y, src.z, host_mesh.info); const size_t dst_idx = AC_VTXBUF_IDX(dst.x, dst.y, dst.z, device->local_config); for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { - loadWithOffset(device, stream_type, &host_mesh.vertex_buffer[i][src_idx], num_vertices * sizeof(AcReal), - &device->vba.in[i][dst_idx]); + loadWithOffset(device, stream_type, &host_mesh.vertex_buffer[i][src_idx], + num_vertices * sizeof(AcReal), &device->vba.in[i][dst_idx]); } return AC_SUCCESS; } AcResult -copyMeshToHost(const Device device, const StreamType stream_type, - const int3& src, const int3& dst, const int num_vertices, - AcMesh* host_mesh) +copyMeshToHost(const Device device, const StreamType stream_type, const int3& src, const int3& dst, + const int num_vertices, AcMesh* host_mesh) { const size_t src_idx = AC_VTXBUF_IDX(src.x, src.y, src.z, device->local_config); const size_t dst_idx = AC_VTXBUF_IDX(dst.x, dst.y, dst.z, host_mesh->info); for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { storeWithOffset(device, stream_type, &device->vba.in[i][src_idx], - num_vertices * sizeof(AcReal), - &host_mesh->vertex_buffer[i][dst_idx]); + num_vertices * sizeof(AcReal), &host_mesh->vertex_buffer[i][dst_idx]); } return AC_SUCCESS; } AcResult -copyMeshDeviceToDevice(const Device src_device, const StreamType stream_type, - const int3& src, Device dst_device, const int3& dst, - const int num_vertices) +copyMeshDeviceToDevice(const Device src_device, const StreamType stream_type, const int3& src, + Device dst_device, const int3& dst, const int num_vertices) { cudaSetDevice(src_device->id); const size_t src_idx = AC_VTXBUF_IDX(src.x, src.y, src.z, src_device->local_config); @@ -348,7 +334,7 @@ swapBuffers(const Device device) { cudaSetDevice(device->id); for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { - AcReal* tmp = device->vba.in[i]; + AcReal* tmp = device->vba.in[i]; device->vba.in[i] = device->vba.out[i]; device->vba.out[i] = tmp; } @@ -364,8 +350,8 @@ loadDeviceConstant(const Device device, const AcIntParam param, const int value) // Therefore we have to obfuscate the code a bit and compute the offset address before // invoking cudaMemcpyToSymbol. const size_t offset = (size_t)&d_mesh_info.int_params[param] - (size_t)&d_mesh_info; - ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_mesh_info, &value, sizeof(value), - offset, cudaMemcpyHostToDevice)); + ERRCHK_CUDA_ALWAYS( + cudaMemcpyToSymbol(d_mesh_info, &value, sizeof(value), offset, cudaMemcpyHostToDevice)); return AC_SUCCESS; } @@ -374,8 +360,8 @@ loadDeviceConstant(const Device device, const AcRealParam param, const AcReal va { cudaSetDevice(device->id); const size_t offset = (size_t)&d_mesh_info.real_params[param] - (size_t)&d_mesh_info; - ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_mesh_info, &value, sizeof(value), - offset, cudaMemcpyHostToDevice)); + ERRCHK_CUDA_ALWAYS( + cudaMemcpyToSymbol(d_mesh_info, &value, sizeof(value), offset, cudaMemcpyHostToDevice)); return AC_SUCCESS; } @@ -383,7 +369,7 @@ AcResult loadGlobalGrid(const Device device, const Grid grid) { cudaSetDevice(device->id); - ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(globalGrid, &grid, sizeof(grid), - 0, cudaMemcpyHostToDevice)); + ERRCHK_CUDA_ALWAYS( + cudaMemcpyToSymbol(globalGrid, &grid, sizeof(grid), 0, cudaMemcpyHostToDevice)); return AC_SUCCESS; } diff --git a/src/core/device.cuh b/src/core/device.cuh index 35d5209..28dbd50 100644 --- a/src/core/device.cuh +++ b/src/core/device.cuh @@ -27,12 +27,14 @@ #pragma once #include "astaroth.h" +// clang-format off typedef enum { - STREAM_PRIMARY, - STREAM_SECONDARY, - NUM_STREAM_TYPES, - STREAM_ALL + STREAM_PRIMARY, + STREAM_SECONDARY, + NUM_STREAM_TYPES, + STREAM_ALL } StreamType; +// clang-format on typedef struct { int3 m; @@ -52,20 +54,17 @@ AcResult createDevice(const int id, const AcMeshInfo device_config, Device* devi AcResult destroyDevice(Device device); /** */ -AcResult boundcondStep(const Device device, const StreamType stream_type, - const int3& start, const int3& end); +AcResult boundcondStep(const Device device, const StreamType stream_type, const int3& start, + const int3& end); /** */ AcResult reduceScal(const Device device, const StreamType stream_type, const ReductionType rtype, const VertexBufferHandle vtxbuf_handle, AcReal* result); /** */ -AcResult reduceVec(const Device device, const StreamType stream_type, - const ReductionType rtype, - const VertexBufferHandle vec0, - const VertexBufferHandle vec1, - const VertexBufferHandle vec2, - AcReal* result); +AcResult reduceVec(const Device device, const StreamType stream_type, const ReductionType rtype, + const VertexBufferHandle vec0, const VertexBufferHandle vec1, + const VertexBufferHandle vec2, AcReal* result); /** */ AcResult rkStep(const Device device, const StreamType stream_type, const int step_number, @@ -81,9 +80,8 @@ AcResult copyMeshToDevice(const Device device, const StreamType stream_type, const int num_vertices); /** */ -AcResult copyMeshToHost(const Device device, const StreamType stream_type, - const int3& src, const int3& dst, const int num_vertices, - AcMesh* host_mesh); +AcResult copyMeshToHost(const Device device, const StreamType stream_type, const int3& src, + const int3& dst, const int num_vertices, AcMesh* host_mesh); /** */ AcResult copyMeshDeviceToDevice(const Device src, const StreamType stream_type, const int3& src_idx, diff --git a/src/core/kernels/kernels.cuh b/src/core/kernels/kernels.cuh index 4028d8f..84c5fda 100644 --- a/src/core/kernels/kernels.cuh +++ b/src/core/kernels/kernels.cuh @@ -24,7 +24,7 @@ * Detailed info. * */ - #pragma once +#pragma once __global__ void kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vtxbuf) @@ -38,7 +38,7 @@ kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vtxbuf) if (i_dst >= end.x || j_dst >= end.y || k_dst >= end.z) return; - //if (i_dst >= DCONST_INT(AC_mx) || j_dst >= DCONST_INT(AC_my) || k_dst >= DCONST_INT(AC_mz)) + // if (i_dst >= DCONST_INT(AC_mx) || j_dst >= DCONST_INT(AC_my) || k_dst >= DCONST_INT(AC_mz)) // return; // If destination index is inside the computational domain, return since @@ -69,15 +69,15 @@ kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vtxbuf) j_src += DCONST_INT(AC_ny_min); k_src += DCONST_INT(AC_nz_min); - const int src_idx = DEVICE_VTXBUF_IDX(i_src, j_src, k_src); - const int dst_idx = DEVICE_VTXBUF_IDX(i_dst, j_dst, k_dst); - vtxbuf[dst_idx] = vtxbuf[src_idx]; + const int src_idx = DEVICE_VTXBUF_IDX(i_src, j_src, k_src); + const int dst_idx = DEVICE_VTXBUF_IDX(i_dst, j_dst, k_dst); + vtxbuf[dst_idx] = vtxbuf[src_idx]; } void periodic_boundconds(const cudaStream_t stream, const int3& start, const int3& end, AcReal* vtxbuf) { - const dim3 tpb(8,2,8); + const dim3 tpb(8, 2, 8); const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x), (unsigned int)ceil((end.y - start.y) / (float)tpb.y), (unsigned int)ceil((end.z - start.z) / (float)tpb.z)); @@ -89,7 +89,6 @@ periodic_boundconds(const cudaStream_t stream, const int3& start, const int3& en /////////////////////////////////////////////////////////////////////////////////////////////////// #include - static __device__ __forceinline__ int IDX(const int i) { @@ -120,14 +119,12 @@ create_rotz(const AcReal radians) return mat; } - #if AC_DOUBLE_PRECISION == 0 #define sin __sinf #define cos __cosf #define exp __expf #define rsqrt rsqrtf // hardware reciprocal sqrt -#endif // AC_DOUBLE_PRECISION == 0 - +#endif // AC_DOUBLE_PRECISION == 0 /* typedef struct { @@ -155,12 +152,11 @@ first_derivative(const AcReal* __restrict__ pencil, const AcReal inv_ds) #elif STENCIL_ORDER == 6 const AcReal coefficients[] = {0, 3.0 / 4.0, -3.0 / 20.0, 1.0 / 60.0}; #elif STENCIL_ORDER == 8 - const AcReal coefficients[] = {0, 4.0 / 5.0, -1.0 / 5.0, 4.0 / 105.0, - -1.0 / 280.0}; + const AcReal coefficients[] = {0, 4.0 / 5.0, -1.0 / 5.0, 4.0 / 105.0, -1.0 / 280.0}; #endif - #define MID (STENCIL_ORDER / 2) - AcReal res = 0; +#define MID (STENCIL_ORDER / 2) + AcReal res = 0; #pragma unroll for (int i = 1; i <= MID; ++i) @@ -175,17 +171,15 @@ second_derivative(const AcReal* __restrict__ pencil, const AcReal inv_ds) #if STENCIL_ORDER == 2 const AcReal coefficients[] = {-2., 1.}; #elif STENCIL_ORDER == 4 - const AcReal coefficients[] = {-5.0/2.0, 4.0/3.0, -1.0/12.0}; + const AcReal coefficients[] = {-5.0 / 2.0, 4.0 / 3.0, -1.0 / 12.0}; #elif STENCIL_ORDER == 6 - const AcReal coefficients[] = {-49.0 / 18.0, 3.0 / 2.0, -3.0 / 20.0, - 1.0 / 90.0}; + const AcReal coefficients[] = {-49.0 / 18.0, 3.0 / 2.0, -3.0 / 20.0, 1.0 / 90.0}; #elif STENCIL_ORDER == 8 - const AcReal coefficients[] = {-205.0 / 72.0, 8.0 / 5.0, -1.0 / 5.0, - 8.0 / 315.0, -1.0 / 560.0}; + const AcReal coefficients[] = {-205.0 / 72.0, 8.0 / 5.0, -1.0 / 5.0, 8.0 / 315.0, -1.0 / 560.0}; #endif - #define MID (STENCIL_ORDER / 2) - AcReal res = coefficients[0] * pencil[MID]; +#define MID (STENCIL_ORDER / 2) + AcReal res = coefficients[0] * pencil[MID]; #pragma unroll for (int i = 1; i <= MID; ++i) @@ -196,31 +190,29 @@ second_derivative(const AcReal* __restrict__ pencil, const AcReal inv_ds) /** inv_ds: inverted mesh spacing f.ex. 1. / mesh.int_params[AC_dsx] */ static __device__ __forceinline__ AcReal -cross_derivative(const AcReal* __restrict__ pencil_a, - const AcReal* __restrict__ pencil_b, const AcReal inv_ds_a, - const AcReal inv_ds_b) +cross_derivative(const AcReal* __restrict__ pencil_a, const AcReal* __restrict__ pencil_b, + const AcReal inv_ds_a, const AcReal inv_ds_b) { #if STENCIL_ORDER == 2 const AcReal coefficients[] = {0, 1.0 / 4.0}; #elif STENCIL_ORDER == 4 - const AcReal coefficients[] = {0, 1.0 / 32.0, 1.0 / 64.0}; // TODO correct coefficients, these are just placeholders + const AcReal coefficients[] = { + 0, 1.0 / 32.0, 1.0 / 64.0}; // TODO correct coefficients, these are just placeholders #elif STENCIL_ORDER == 6 const AcReal fac = (1. / 720.); - const AcReal coefficients[] = {0.0 * fac, 270.0 * fac, -27.0 * fac, - 2.0 * fac}; + const AcReal coefficients[] = {0.0 * fac, 270.0 * fac, -27.0 * fac, 2.0 * fac}; #elif STENCIL_ORDER == 8 const AcReal fac = (1. / 20160.); - const AcReal coefficients[] = {0.0 * fac, 8064. * fac, -1008. * fac, - 128. * fac, -9. * fac}; + const AcReal coefficients[] = {0.0 * fac, 8064. * fac, -1008. * fac, 128. * fac, -9. * fac}; #endif - #define MID (STENCIL_ORDER / 2) - AcReal res = AcReal(0.); +#define MID (STENCIL_ORDER / 2) + AcReal res = AcReal(0.); - #pragma unroll +#pragma unroll for (int i = 1; i <= MID; ++i) { - res += coefficients[i] * (pencil_a[MID + i] + pencil_a[MID - i] - - pencil_b[MID + i] - pencil_b[MID - i]); + res += coefficients[i] * + (pencil_a[MID + i] + pencil_a[MID - i] - pencil_b[MID + i] - pencil_b[MID - i]); } return res * inv_ds_a * inv_ds_b; } @@ -231,7 +223,8 @@ derx(const int3 vertexIdx, const AcReal* __restrict__ arr) AcReal pencil[STENCIL_ORDER + 1]; #pragma unroll for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset) - pencil[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y, vertexIdx.z)]; + pencil[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y, + vertexIdx.z)]; return first_derivative(pencil, DCONST_REAL(AC_inv_dsx)); } @@ -242,7 +235,8 @@ derxx(const int3 vertexIdx, const AcReal* __restrict__ arr) AcReal pencil[STENCIL_ORDER + 1]; #pragma unroll for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset) - pencil[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y, vertexIdx.z)]; + pencil[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y, + vertexIdx.z)]; return second_derivative(pencil, DCONST_REAL(AC_inv_dsx)); } @@ -262,8 +256,7 @@ derxy(const int3 vertexIdx, const AcReal* __restrict__ arr) pencil_b[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y + STENCIL_ORDER / 2 - offset, vertexIdx.z)]; - return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsx), - DCONST_REAL(AC_inv_dsy)); + return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsx), DCONST_REAL(AC_inv_dsy)); } static __device__ __forceinline__ AcReal @@ -281,8 +274,7 @@ derxz(const int3 vertexIdx, const AcReal* __restrict__ arr) pencil_b[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y, vertexIdx.z + STENCIL_ORDER / 2 - offset)]; - return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsx), - DCONST_REAL(AC_inv_dsz)); + return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsx), DCONST_REAL(AC_inv_dsz)); } static __device__ __forceinline__ AcReal @@ -291,7 +283,8 @@ dery(const int3 vertexIdx, const AcReal* __restrict__ arr) AcReal pencil[STENCIL_ORDER + 1]; #pragma unroll for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset) - pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2, vertexIdx.z)]; + pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2, + vertexIdx.z)]; return first_derivative(pencil, DCONST_REAL(AC_inv_dsy)); } @@ -302,7 +295,8 @@ deryy(const int3 vertexIdx, const AcReal* __restrict__ arr) AcReal pencil[STENCIL_ORDER + 1]; #pragma unroll for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset) - pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2, vertexIdx.z)]; + pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2, + vertexIdx.z)]; return second_derivative(pencil, DCONST_REAL(AC_inv_dsy)); } @@ -322,8 +316,7 @@ deryz(const int3 vertexIdx, const AcReal* __restrict__ arr) pencil_b[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2, vertexIdx.z + STENCIL_ORDER / 2 - offset)]; - return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsy), - DCONST_REAL(AC_inv_dsz)); + return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsy), DCONST_REAL(AC_inv_dsz)); } static __device__ __forceinline__ AcReal @@ -332,7 +325,8 @@ derz(const int3 vertexIdx, const AcReal* __restrict__ arr) AcReal pencil[STENCIL_ORDER + 1]; #pragma unroll for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset) - pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z + offset - STENCIL_ORDER / 2)]; + pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y, + vertexIdx.z + offset - STENCIL_ORDER / 2)]; return first_derivative(pencil, DCONST_REAL(AC_inv_dsz)); } @@ -343,7 +337,8 @@ derzz(const int3 vertexIdx, const AcReal* __restrict__ arr) AcReal pencil[STENCIL_ORDER + 1]; #pragma unroll for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset) - pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z + offset - STENCIL_ORDER / 2)]; + pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y, + vertexIdx.z + offset - STENCIL_ORDER / 2)]; return second_derivative(pencil, DCONST_REAL(AC_inv_dsz)); } @@ -401,8 +396,7 @@ operator-(const AcReal3& a) return (AcReal3){-a.x, -a.y, -a.z}; } -static __host__ __device__ __forceinline__ AcReal3 -operator*(const AcReal a, const AcReal3& b) +static __host__ __device__ __forceinline__ AcReal3 operator*(const AcReal a, const AcReal3& b) { return (AcReal3){a * b.x, a * b.y, a * b.z}; } @@ -443,7 +437,6 @@ is_valid(const AcReal3& a) return is_valid(a.x) && is_valid(a.y) && is_valid(a.z); } - /* * ============================================================================= * Level 1 (Stencil Processing Stage) @@ -476,8 +469,7 @@ laplace_vec(const AcReal3Data& vec) static __device__ __forceinline__ AcReal3 curl(const AcReal3Data& vec) { - return (AcReal3){gradient(vec.z).y - gradient(vec.y).z, - gradient(vec.x).z - gradient(vec.z).x, + return (AcReal3){gradient(vec.z).y - gradient(vec.y).z, gradient(vec.x).z - gradient(vec.z).x, gradient(vec.y).x - gradient(vec.x).y}; } @@ -520,7 +512,7 @@ contract(const AcMatrix& mat) { AcReal res = 0; - #pragma unroll +#pragma unroll for (int i = 0; i < 3; ++i) res += dot(mat.row[i], mat.row[i]); @@ -558,12 +550,13 @@ __constant__ AcReal forcing_phi; static __device__ __forceinline__ AcReal3 forcing(const int i, const int j, const int k) { - #define DOMAIN_SIZE_X (DCONST_INT(AC_nx) * DCONST_REAL(AC_dsx)) - #define DOMAIN_SIZE_Y (DCONST_INT(AC_ny) * DCONST_REAL(AC_dsy)) - #define DOMAIN_SIZE_Z (DCONST_INT(AC_nz) * DCONST_REAL(AC_dsz)) - const AcReal3 k_vec = (AcReal3){(i - DCONST_INT(AC_nx_min)) * DCONST_REAL(AC_dsx) - AcReal(.5) * DOMAIN_SIZE_X, - (j - DCONST_INT(AC_ny_min)) * DCONST_REAL(AC_dsy) - AcReal(.5) * DOMAIN_SIZE_Y, - (k - DCONST_INT(AC_nz_min)) * DCONST_REAL(AC_dsz) - AcReal(.5) * DOMAIN_SIZE_Z}; +#define DOMAIN_SIZE_X (DCONST_INT(AC_nx) * DCONST_REAL(AC_dsx)) +#define DOMAIN_SIZE_Y (DCONST_INT(AC_ny) * DCONST_REAL(AC_dsy)) +#define DOMAIN_SIZE_Z (DCONST_INT(AC_nz) * DCONST_REAL(AC_dsz)) + const AcReal3 k_vec = (AcReal3){ + (i - DCONST_INT(AC_nx_min)) * DCONST_REAL(AC_dsx) - AcReal(.5) * DOMAIN_SIZE_X, + (j - DCONST_INT(AC_ny_min)) * DCONST_REAL(AC_dsy) - AcReal(.5) * DOMAIN_SIZE_Y, + (k - DCONST_INT(AC_nz_min)) * DCONST_REAL(AC_dsz) - AcReal(.5) * DOMAIN_SIZE_Z}; AcReal inv_len = reciprocal_len(k_vec); if (isnan(inv_len) || isinf(inv_len)) inv_len = 0; @@ -571,46 +564,41 @@ forcing(const int i, const int j, const int k) inv_len = 2; const AcReal k_dot_x = dot(k_vec, forcing_vec); - const AcReal waves = cos(k_dot_x)*cos(forcing_phi) - sin(k_dot_x) * sin(forcing_phi); + const AcReal waves = cos(k_dot_x) * cos(forcing_phi) - sin(k_dot_x) * sin(forcing_phi); return inv_len * inv_len * waves * forcing_vec; } - -// Note: LNT0 and LNRHO0 must be set very carefully: if the magnitude is different that other values in the mesh, then we will inherently lose precision +// Note: LNT0 and LNRHO0 must be set very carefully: if the magnitude is different that other values +// in the mesh, then we will inherently lose precision #define LNT0 (AcReal(0.0)) #define LNRHO0 (AcReal(0.0)) #define H_CONST (AcReal(0.0)) #define C_CONST (AcReal(0.0)) - - template static __device__ __forceinline__ AcReal -rk3_integrate(const AcReal state_previous, const AcReal state_current, - const AcReal rate_of_change, const AcReal dt) +rk3_integrate(const AcReal state_previous, const AcReal state_current, const AcReal rate_of_change, + const AcReal dt) { // Williamson (1980) const AcReal alpha[] = {0, AcReal(.0), AcReal(-5. / 9.), AcReal(-153. / 128.)}; - const AcReal beta[] = {0, AcReal(1. / 3.), AcReal(15. / 16.), - AcReal(8. / 15.)}; - + const AcReal beta[] = {0, AcReal(1. / 3.), AcReal(15. / 16.), AcReal(8. / 15.)}; // Note the indexing: +1 to avoid an unnecessary warning about "out-of-bounds" // access (when accessing beta[step_number-1] even when step_number >= 1) switch (step_number) { - case 0: - return state_current + beta[step_number + 1] * rate_of_change * dt; - case 1: // Fallthrough - case 2: - return state_current + - beta[step_number + 1] * - (alpha[step_number + 1] * (AcReal(1.) / beta[step_number]) * - (state_current - state_previous) + - rate_of_change * dt); - default: - return NAN; + case 0: + return state_current + beta[step_number + 1] * rate_of_change * dt; + case 1: // Fallthrough + case 2: + return state_current + + beta[step_number + 1] * (alpha[step_number + 1] * (AcReal(1.) / beta[step_number]) * + (state_current - state_previous) + + rate_of_change * dt); + default: + return NAN; } } /* @@ -646,13 +634,14 @@ static __device__ __forceinline__ AcReal3 rk3_integrate(const AcReal3 state_previous, const AcReal3 state_current, const AcReal3 rate_of_change, const AcReal dt) { - return (AcReal3) { rk3_integrate(state_previous.x, state_current.x, rate_of_change.x, dt), - rk3_integrate(state_previous.y, state_current.y, rate_of_change.y, dt), - rk3_integrate(state_previous.z, state_current.z, rate_of_change.z, dt)}; + return (AcReal3){ + rk3_integrate(state_previous.x, state_current.x, rate_of_change.x, dt), + rk3_integrate(state_previous.y, state_current.y, rate_of_change.y, dt), + rk3_integrate(state_previous.z, state_current.z, rate_of_change.z, dt)}; } -#define rk3(state_previous, state_current, rate_of_change, dt)\ -rk3_integrate(state_previous, value(state_current), rate_of_change, dt) +#define rk3(state_previous, state_current, rate_of_change, dt) \ + rk3_integrate(state_previous, value(state_current), rate_of_change, dt) /* template @@ -708,9 +697,8 @@ read_out(const int idx, AcReal* __restrict__ field[], const int handle) static __device__ AcReal3 read_out(const int idx, AcReal* __restrict__ field[], const int3 handle) { - return (AcReal3) { read_out(idx, field, handle.x), - read_out(idx, field, handle.y), - read_out(idx, field, handle.z) }; + return (AcReal3){read_out(idx, field, handle.x), read_out(idx, field, handle.y), + read_out(idx, field, handle.z)}; } #define WRITE_OUT(handle, value) (write(buffer.out, handle, idx, value)) @@ -718,29 +706,28 @@ read_out(const int idx, AcReal* __restrict__ field[], const int3 handle) #define READ_OUT(handle) (read_out(idx, buffer.out, handle)) // also write for clarity here also, not for the DSL -//#define WRITE(HANDLE) (write(idx, ...)) s.t. we don't have to insert boilerplat in the mid of the function +//#define WRITE(HANDLE) (write(idx, ...)) s.t. we don't have to insert boilerplat in the mid of the +// function -#define GEN_KERNEL_PARAM_BOILERPLATE \ - const int3 start, const int3 end, VertexBufferArray buffer +#define GEN_KERNEL_PARAM_BOILERPLATE const int3 start, const int3 end, VertexBufferArray buffer -#define GEN_KERNEL_BUILTIN_VARIABLES_BOILERPLATE() \ - const int3 vertexIdx = (int3){threadIdx.x + blockIdx.x * blockDim.x + start.x,\ - threadIdx.y + blockIdx.y * blockDim.y + start.y,\ - threadIdx.z + blockIdx.z * blockDim.z + start.z};\ - const int3 globalVertexIdx = (int3){d_multigpu_offset.x + vertexIdx.x, \ - d_multigpu_offset.y + vertexIdx.y, \ - d_multigpu_offset.z + vertexIdx.z}; \ - if (vertexIdx.x >= end.x || vertexIdx.y >= end.y || vertexIdx.z >= end.z)\ - return;\ -\ -\ - assert(vertexIdx.x < DCONST_INT(AC_nx_max) && vertexIdx.y < DCONST_INT(AC_ny_max) &&\ - vertexIdx.z < DCONST_INT(AC_nz_max));\ -\ - assert(vertexIdx.x >= DCONST_INT(AC_nx_min) && vertexIdx.y >= DCONST_INT(AC_ny_min) &&\ - vertexIdx.z >= DCONST_INT(AC_nz_min));\ -\ - const int idx = IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z); +#define GEN_KERNEL_BUILTIN_VARIABLES_BOILERPLATE() \ + const int3 vertexIdx = (int3){threadIdx.x + blockIdx.x * blockDim.x + start.x, \ + threadIdx.y + blockIdx.y * blockDim.y + start.y, \ + threadIdx.z + blockIdx.z * blockDim.z + start.z}; \ + const int3 globalVertexIdx = (int3){d_multigpu_offset.x + vertexIdx.x, \ + d_multigpu_offset.y + vertexIdx.y, \ + d_multigpu_offset.z + vertexIdx.z}; \ + if (vertexIdx.x >= end.x || vertexIdx.y >= end.y || vertexIdx.z >= end.z) \ + return; \ + \ + assert(vertexIdx.x < DCONST_INT(AC_nx_max) && vertexIdx.y < DCONST_INT(AC_ny_max) && \ + vertexIdx.z < DCONST_INT(AC_nz_max)); \ + \ + assert(vertexIdx.x >= DCONST_INT(AC_nx_min) && vertexIdx.y >= DCONST_INT(AC_ny_min) && \ + vertexIdx.z >= DCONST_INT(AC_nz_min)); \ + \ + const int idx = IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z); #include "stencil_process.cuh" @@ -757,33 +744,31 @@ randf(void) } AcResult -rk3_step_async(const cudaStream_t stream, const int& step_number, const int3& start, const int3& end, - const AcReal dt, VertexBufferArray* buffer) +rk3_step_async(const cudaStream_t stream, const int& step_number, const int3& start, + const int3& end, const AcReal dt, VertexBufferArray* buffer) { const dim3 tpb(32, 1, 4); - /////////////////// Forcing - #if LFORCING +/////////////////// Forcing +#if LFORCING const AcReal ff_scale = AcReal(.2); - static AcReal3 ff = ff_scale * (AcReal3){1, 0, 0}; - const AcReal radians = randf() * AcReal(2*M_PI) / 360 / 8; - const AcMatrix rotz = create_rotz(radians); - ff = mul(rotz, ff); + static AcReal3 ff = ff_scale * (AcReal3){1, 0, 0}; + const AcReal radians = randf() * AcReal(2 * M_PI) / 360 / 8; + const AcMatrix rotz = create_rotz(radians); + ff = mul(rotz, ff); cudaMemcpyToSymbolAsync(forcing_vec, &ff, sizeof(ff), 0, cudaMemcpyHostToDevice, stream); - const AcReal ff_phi = AcReal(M_PI);//AcReal(2 * M_PI) * randf(); - cudaMemcpyToSymbolAsync(forcing_phi, &ff_phi, sizeof(ff_phi), 0, cudaMemcpyHostToDevice, stream); - #endif // LFORCING + const AcReal ff_phi = AcReal(M_PI); // AcReal(2 * M_PI) * randf(); + cudaMemcpyToSymbolAsync(forcing_phi, &ff_phi, sizeof(ff_phi), 0, cudaMemcpyHostToDevice, + stream); +#endif // LFORCING ////////////////////////// const int nx = end.x - start.x; const int ny = end.y - start.y; const int nz = end.z - start.z; - const dim3 bpg( - (unsigned int)ceil(nx / AcReal(tpb.x)), - (unsigned int)ceil(ny / AcReal(tpb.y)), - (unsigned int)ceil(nz / AcReal(tpb.z))); - + const dim3 bpg((unsigned int)ceil(nx / AcReal(tpb.x)), (unsigned int)ceil(ny / AcReal(tpb.y)), + (unsigned int)ceil(nz / AcReal(tpb.z))); if (step_number == 0) solve<0><<>>(start, end, *buffer, dt); @@ -796,7 +781,6 @@ rk3_step_async(const cudaStream_t stream, const int& step_number, const int3& st return AC_SUCCESS; } - ////////////////REDUCE/////////////////////////// #include "src/core/math_utils.h" // is_power_of_two @@ -848,22 +832,19 @@ template __global__ void kernel_filter(const __restrict__ AcReal* src, const int3 start, const int3 end, AcReal* dst) { - const int3 src_idx = (int3) { - start.x + threadIdx.x + blockIdx.x * blockDim.x, - start.y + threadIdx.y + blockIdx.y * blockDim.y, - start.z + threadIdx.z + blockIdx.z * blockDim.z - }; + const int3 src_idx = (int3){start.x + threadIdx.x + blockIdx.x * blockDim.x, + start.y + threadIdx.y + blockIdx.y * blockDim.y, + start.z + threadIdx.z + blockIdx.z * blockDim.z}; - const int nx = end.x - start.x; - const int ny = end.y - start.y; - const int nz = end.z - start.z; //MV: Added this because it was undefined - const int3 dst_idx = (int3) { - threadIdx.x + blockIdx.x * blockDim.x, - threadIdx.y + blockIdx.y * blockDim.y, - threadIdx.z + blockIdx.z * blockDim.z - }; + const int nx = end.x - start.x; + const int ny = end.y - start.y; + const int nz = end.z - start.z; // MV: Added this because it was undefined + const int3 dst_idx = (int3){threadIdx.x + blockIdx.x * blockDim.x, + threadIdx.y + blockIdx.y * blockDim.y, + threadIdx.z + blockIdx.z * blockDim.z}; - assert(src_idx.x < DCONST_INT(AC_nx_max) && src_idx.y < DCONST_INT(AC_ny_max) && src_idx.z < DCONST_INT(AC_nz_max)); + assert(src_idx.x < DCONST_INT(AC_nx_max) && src_idx.y < DCONST_INT(AC_ny_max) && + src_idx.z < DCONST_INT(AC_nz_max)); assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz); assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz); @@ -872,31 +853,27 @@ kernel_filter(const __restrict__ AcReal* src, const int3 start, const int3 end, template __global__ void -kernel_filter_vec(const __restrict__ AcReal* src0, - const __restrict__ AcReal* src1, - const __restrict__ AcReal* src2, - const int3 start, const int3 end, AcReal* dst) +kernel_filter_vec(const __restrict__ AcReal* src0, const __restrict__ AcReal* src1, + const __restrict__ AcReal* src2, const int3 start, const int3 end, AcReal* dst) { - const int3 src_idx = (int3) { - start.x + threadIdx.x + blockIdx.x * blockDim.x, - start.y + threadIdx.y + blockIdx.y * blockDim.y, - start.z + threadIdx.z + blockIdx.z * blockDim.z - }; + const int3 src_idx = (int3){start.x + threadIdx.x + blockIdx.x * blockDim.x, + start.y + threadIdx.y + blockIdx.y * blockDim.y, + start.z + threadIdx.z + blockIdx.z * blockDim.z}; - const int nx = end.x - start.x; - const int ny = end.y - start.y; - const int nz = end.z - start.z; //MV: Added this because it was undefined - const int3 dst_idx = (int3) { - threadIdx.x + blockIdx.x * blockDim.x, - threadIdx.y + blockIdx.y * blockDim.y, - threadIdx.z + blockIdx.z * blockDim.z - }; + const int nx = end.x - start.x; + const int ny = end.y - start.y; + const int nz = end.z - start.z; // MV: Added this because it was undefined + const int3 dst_idx = (int3){threadIdx.x + blockIdx.x * blockDim.x, + threadIdx.y + blockIdx.y * blockDim.y, + threadIdx.z + blockIdx.z * blockDim.z}; - assert(src_idx.x < DCONST_INT(AC_nx_max) && src_idx.y < DCONST_INT(AC_ny_max) && src_idx.z < DCONST_INT(AC_nz_max)); + assert(src_idx.x < DCONST_INT(AC_nx_max) && src_idx.y < DCONST_INT(AC_ny_max) && + src_idx.z < DCONST_INT(AC_nz_max)); assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz); assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz); - dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(src0[IDX(src_idx)], src1[IDX(src_idx)], src2[IDX(src_idx)]); + dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter( + src0[IDX(src_idx)], src1[IDX(src_idx)], src2[IDX(src_idx)]); } template @@ -908,7 +885,8 @@ kernel_reduce(AcReal* scratchpad, const int num_elems) extern __shared__ AcReal smem[]; if (idx < num_elems) { smem[threadIdx.x] = scratchpad[idx]; - } else { + } + else { smem[threadIdx.x] = NAN; } __syncthreads(); @@ -930,9 +908,8 @@ kernel_reduce(AcReal* scratchpad, const int num_elems) template __global__ void -kernel_reduce_block(const __restrict__ AcReal* scratchpad, - const int num_blocks, const int block_size, - AcReal* result) +kernel_reduce_block(const __restrict__ AcReal* scratchpad, const int num_blocks, + const int block_size, AcReal* result) { const int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx != 0) { @@ -946,23 +923,19 @@ kernel_reduce_block(const __restrict__ AcReal* scratchpad, *result = res; } - AcReal -reduce_scal(const cudaStream_t stream, const ReductionType rtype, - const int3& start, const int3& end, - const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result) +reduce_scal(const cudaStream_t stream, const ReductionType rtype, const int3& start, + const int3& end, const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result) { - const unsigned nx = end.x - start.x; - const unsigned ny = end.y - start.y; - const unsigned nz = end.z - start.z; + const unsigned nx = end.x - start.x; + const unsigned ny = end.y - start.y; + const unsigned nz = end.z - start.z; const unsigned num_elems = nx * ny * nz; const dim3 tpb_filter(32, 4, 1); - const dim3 bpg_filter( - (unsigned int)ceil(nx / AcReal(tpb_filter.x)), - (unsigned int)ceil(ny / AcReal(tpb_filter.y)), - (unsigned int)ceil(nz / AcReal(tpb_filter.z)) - ); + const dim3 bpg_filter((unsigned int)ceil(nx / AcReal(tpb_filter.x)), + (unsigned int)ceil(ny / AcReal(tpb_filter.y)), + (unsigned int)ceil(nz / AcReal(tpb_filter.z))); const int tpb_reduce = 128; const int bpg_reduce = num_elems / tpb_reduce; @@ -974,22 +947,38 @@ reduce_scal(const cudaStream_t stream, const ReductionType rtype, ERRCHK(nx * ny * nz % 2 == 0); if (rtype == RTYPE_MAX) { - kernel_filter<<>>(vtxbuf, start, end, scratchpad); - kernel_reduce<<>>(scratchpad, num_elems); - kernel_reduce_block<<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result); - } else if (rtype == RTYPE_MIN) { - kernel_filter<<>>(vtxbuf, start, end, scratchpad); - kernel_reduce<<>>(scratchpad, num_elems); - kernel_reduce_block<<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result); - } else if (rtype == RTYPE_RMS) { - kernel_filter<<>>(vtxbuf, start, end, scratchpad); - kernel_reduce<<>>(scratchpad, num_elems); - kernel_reduce_block<<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result); - } else if (rtype == RTYPE_RMS_EXP) { - kernel_filter<<>>(vtxbuf, start, end, scratchpad); - kernel_reduce<<>>(scratchpad, num_elems); - kernel_reduce_block<<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result); - } else { + kernel_filter + <<>>(vtxbuf, start, end, scratchpad); + kernel_reduce<<>>( + scratchpad, num_elems); + kernel_reduce_block + <<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result); + } + else if (rtype == RTYPE_MIN) { + kernel_filter + <<>>(vtxbuf, start, end, scratchpad); + kernel_reduce<<>>( + scratchpad, num_elems); + kernel_reduce_block + <<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result); + } + else if (rtype == RTYPE_RMS) { + kernel_filter + <<>>(vtxbuf, start, end, scratchpad); + kernel_reduce<<>>( + scratchpad, num_elems); + kernel_reduce_block + <<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result); + } + else if (rtype == RTYPE_RMS_EXP) { + kernel_filter + <<>>(vtxbuf, start, end, scratchpad); + kernel_reduce<<>>( + scratchpad, num_elems); + kernel_reduce_block + <<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result); + } + else { ERROR("Unrecognized rtype"); } AcReal result; @@ -998,22 +987,19 @@ reduce_scal(const cudaStream_t stream, const ReductionType rtype, } AcReal -reduce_vec(const cudaStream_t stream, const ReductionType rtype, - const int3& start, const int3& end, - const AcReal* vtxbuf0, const AcReal* vtxbuf1, const AcReal* vtxbuf2, - AcReal* scratchpad, AcReal* reduce_result) +reduce_vec(const cudaStream_t stream, const ReductionType rtype, const int3& start, const int3& end, + const AcReal* vtxbuf0, const AcReal* vtxbuf1, const AcReal* vtxbuf2, AcReal* scratchpad, + AcReal* reduce_result) { - const unsigned nx = end.x - start.x; - const unsigned ny = end.y - start.y; - const unsigned nz = end.z - start.z; + const unsigned nx = end.x - start.x; + const unsigned ny = end.y - start.y; + const unsigned nz = end.z - start.z; const unsigned num_elems = nx * ny * nz; const dim3 tpb_filter(32, 4, 1); - const dim3 bpg_filter( - (unsigned int)ceil(nx / AcReal(tpb_filter.x)), - (unsigned int)ceil(ny / AcReal(tpb_filter.y)), - (unsigned int)ceil(nz / AcReal(tpb_filter.z)) - ); + const dim3 bpg_filter((unsigned int)ceil(nx / AcReal(tpb_filter.x)), + (unsigned int)ceil(ny / AcReal(tpb_filter.y)), + (unsigned int)ceil(nz / AcReal(tpb_filter.z))); const int tpb_reduce = 128; const int bpg_reduce = num_elems / tpb_reduce; @@ -1025,22 +1011,38 @@ reduce_vec(const cudaStream_t stream, const ReductionType rtype, ERRCHK(nx * ny * nz % 2 == 0); if (rtype == RTYPE_MAX) { - kernel_filter_vec<<>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad); - kernel_reduce<<>>(scratchpad, num_elems); - kernel_reduce_block<<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result); - } else if (rtype == RTYPE_MIN) { - kernel_filter_vec<<>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad); - kernel_reduce<<>>(scratchpad, num_elems); - kernel_reduce_block<<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result); - } else if (rtype == RTYPE_RMS) { - kernel_filter_vec<<>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad); - kernel_reduce<<>>(scratchpad, num_elems); - kernel_reduce_block<<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result); - } else if (rtype == RTYPE_RMS_EXP) { - kernel_filter_vec<<>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad); - kernel_reduce<<>>(scratchpad, num_elems); - kernel_reduce_block<<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result); - } else { + kernel_filter_vec<<>>( + vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad); + kernel_reduce<<>>( + scratchpad, num_elems); + kernel_reduce_block + <<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result); + } + else if (rtype == RTYPE_MIN) { + kernel_filter_vec<<>>( + vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad); + kernel_reduce<<>>( + scratchpad, num_elems); + kernel_reduce_block + <<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result); + } + else if (rtype == RTYPE_RMS) { + kernel_filter_vec<<>>( + vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad); + kernel_reduce<<>>( + scratchpad, num_elems); + kernel_reduce_block + <<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result); + } + else if (rtype == RTYPE_RMS_EXP) { + kernel_filter_vec<<>>( + vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad); + kernel_reduce<<>>( + scratchpad, num_elems); + kernel_reduce_block + <<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result); + } + else { ERROR("Unrecognized rtype"); } AcReal result; diff --git a/src/standalone/autotest.cc b/src/standalone/autotest.cc index 7c44be3..107b950 100644 --- a/src/standalone/autotest.cc +++ b/src/standalone/autotest.cc @@ -53,56 +53,58 @@ #define GEN_TEST_RESULT (1) // Generate a test file always during testing typedef struct { - int x, y, z; + int x, y, z; } vec3i; typedef struct { - AcReal x, y, z; + AcReal x, y, z; } vec3r; - typedef struct { - ModelScalar model; - AcReal candidate; - ModelScalar error; + ModelScalar model; + AcReal candidate; + ModelScalar error; } ErrorInfo; #define QUICK_TEST (0) #define THOROUGH_TEST (1) #define TEST_TYPE QUICK_TEST -static const InitType test_cases[] = {INIT_TYPE_RANDOM, INIT_TYPE_XWAVE, INIT_TYPE_GAUSSIAN_RADIAL_EXPL, INIT_TYPE_ABC_FLOW}; +static const InitType test_cases[] = {INIT_TYPE_RANDOM, INIT_TYPE_XWAVE, + INIT_TYPE_GAUSSIAN_RADIAL_EXPL, INIT_TYPE_ABC_FLOW}; // #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0])) -#if TEST_TYPE == QUICK_TEST // REGULAR TEST START HERE -------------------------------------------------------------------------------------------------------------- - static inline ModelScalar +#if TEST_TYPE == \ + QUICK_TEST // REGULAR TEST START HERE + // -------------------------------------------------------------------------------------------------------------- +static inline ModelScalar get_absolute_error(const ModelScalar& model, const AcReal& candidate) { - return fabsl(candidate - model); + return fabsl(candidate - model); } - static inline ModelScalar +static inline ModelScalar get_acceptable_absolute_error(const ModelScalar& range) { - // This is the upper limit, which assumes that both the min and max values - // are used in a calculation (which inherently leads to cancellation). - // - // AFAIK if this breaks, there is definitely something wrong with the code. - // Otherwise the error is so small it's indistiguishable from inherent - // inaccuracies in floating-point arithmetic. - return range * AC_REAL_EPSILON; + // This is the upper limit, which assumes that both the min and max values + // are used in a calculation (which inherently leads to cancellation). + // + // AFAIK if this breaks, there is definitely something wrong with the code. + // Otherwise the error is so small it's indistiguishable from inherent + // inaccuracies in floating-point arithmetic. + return range * AC_REAL_EPSILON; } - static inline ModelScalar +static inline ModelScalar get_acceptable_relative_error(void) { - return 30; // machine epsilons + return 30; // machine epsilons } - static inline ModelScalar +static inline ModelScalar get_relative_error(const ModelScalar& model, const AcReal& candidate) { - ModelScalar error = NAN; + ModelScalar error = NAN; #if 0 const ModelScalar abs_epsilon = get_acceptable_absolute_error(range); @@ -121,190 +123,182 @@ get_relative_error(const ModelScalar& model, const AcReal& candidate) error = fabsl(1.0l - candidate / model); } #endif - error = fabsl(1.0l - candidate / model); + error = fabsl(1.0l - candidate / model); - // Return the relative error as multiples of the machine epsilon - // See Sect. Relative Error and Ulps in - // What Every Computer Scientist Should Know About Floating-Point Arithmetic - // By David Goldberg (1991) - return error / AC_REAL_EPSILON; + // Return the relative error as multiples of the machine epsilon + // See Sect. Relative Error and Ulps in + // What Every Computer Scientist Should Know About Floating-Point Arithmetic + // By David Goldberg (1991) + return error / AC_REAL_EPSILON; } - static bool +static bool verify(const ModelScalar& model, const AcReal& cand, const ModelScalar& range) { - if (!is_valid(model) || !is_valid(cand)) - return false; + if (!is_valid(model) || !is_valid(cand)) + return false; - const ModelScalar relative_error = get_relative_error(model, cand); - if (relative_error < get_acceptable_relative_error()) - return true; + const ModelScalar relative_error = get_relative_error(model, cand); + if (relative_error < get_acceptable_relative_error()) + return true; - const ModelScalar absolute_error = get_absolute_error(model, cand); - if (absolute_error < get_acceptable_absolute_error(range)) - return true; + const ModelScalar absolute_error = get_absolute_error(model, cand); + if (absolute_error < get_acceptable_absolute_error(range)) + return true; - return false; + return false; } - static ModelScalar +static ModelScalar get_reduction_range(const ModelMesh& mesh) { - ERRCHK(NUM_VTXBUF_HANDLES >= 3); + ERRCHK(NUM_VTXBUF_HANDLES >= 3); - const ModelScalar max0 = model_reduce_scal(mesh, RTYPE_MAX, - VertexBufferHandle(0)); - const ModelScalar max1 = model_reduce_scal(mesh, RTYPE_MAX, - VertexBufferHandle(1)); - const ModelScalar max2 = model_reduce_scal(mesh, RTYPE_MAX, - VertexBufferHandle(2)); - const ModelScalar max_scal = max(max0, max(max1, max2)); + const ModelScalar max0 = model_reduce_scal(mesh, RTYPE_MAX, VertexBufferHandle(0)); + const ModelScalar max1 = model_reduce_scal(mesh, RTYPE_MAX, VertexBufferHandle(1)); + const ModelScalar max2 = model_reduce_scal(mesh, RTYPE_MAX, VertexBufferHandle(2)); + const ModelScalar max_scal = max(max0, max(max1, max2)); - const ModelScalar min0 = model_reduce_scal(mesh, RTYPE_MIN, - VertexBufferHandle(0)); - const ModelScalar min1 = model_reduce_scal(mesh, RTYPE_MIN, - VertexBufferHandle(1)); - const ModelScalar min2 = model_reduce_scal(mesh, RTYPE_MIN, - VertexBufferHandle(2)); - const ModelScalar min_scal = min(min0, min(min1, min2)); + const ModelScalar min0 = model_reduce_scal(mesh, RTYPE_MIN, VertexBufferHandle(0)); + const ModelScalar min1 = model_reduce_scal(mesh, RTYPE_MIN, VertexBufferHandle(1)); + const ModelScalar min2 = model_reduce_scal(mesh, RTYPE_MIN, VertexBufferHandle(2)); + const ModelScalar min_scal = min(min0, min(min1, min2)); - return max_scal - min_scal; -} - - static void -print_debug_info(const ModelScalar& model, const AcReal& candidate, - const ModelScalar& range) -{ - printf("MeshPointInfo\n"); - printf("\tModel: %e\n", double(model)); - printf("\tCandidate: %e\n", double(candidate)); - printf("\tRange: %e\n", double(range)); - - printf("\tAbsolute error: %Le (max acceptable: %Le)\n", - get_absolute_error(model, candidate), - get_acceptable_absolute_error(range)); - printf("\tRelative error: %Le (max acceptable: %Le)\n", - get_relative_error(model, candidate), - get_acceptable_relative_error()); - printf("\tIs acceptable: %d\n", verify(model, candidate, range)); + return max_scal - min_scal; } static void -print_result(const ModelScalar& model, const AcReal& candidate, - const ModelScalar& range, const char* name = "???") +print_debug_info(const ModelScalar& model, const AcReal& candidate, const ModelScalar& range) { - const ModelScalar rel_err = get_relative_error(model, candidate); - const ModelScalar abs_err = get_absolute_error(model, candidate); - if (!verify(model, candidate, range)) { - printf("\t%-12s... ", name); - printf(RED "FAIL! " RESET); - } - else { - printf("\t%-12s... ", name); - printf(GRN "OK! " RESET); - } + printf("MeshPointInfo\n"); + printf("\tModel: %e\n", double(model)); + printf("\tCandidate: %e\n", double(candidate)); + printf("\tRange: %e\n", double(range)); - printf("(relative error: %.3Lg \u03B5, absolute error: %Lg)\n", rel_err, abs_err); - /* - // DEPRECATED: TODO remove - if (rel_err < get_acceptable_relative_error()) - printf("(relative error: %Lg \u03B5, max accepted %Lg)\n", rel_err, - get_acceptable_relative_error()); - else - printf("(absolute error: %Lg, max accepted %Lg)\n", abs_err, - get_acceptable_absolute_error(range)); - */ + printf("\tAbsolute error: %Le (max acceptable: %Le)\n", get_absolute_error(model, candidate), + get_acceptable_absolute_error(range)); + printf("\tRelative error: %Le (max acceptable: %Le)\n", get_relative_error(model, candidate), + get_acceptable_relative_error()); + printf("\tIs acceptable: %d\n", verify(model, candidate, range)); } - static int +static void +print_result(const ModelScalar& model, const AcReal& candidate, const ModelScalar& range, + const char* name = "???") +{ + const ModelScalar rel_err = get_relative_error(model, candidate); + const ModelScalar abs_err = get_absolute_error(model, candidate); + if (!verify(model, candidate, range)) { + printf("\t%-12s... ", name); + printf(RED "FAIL! " RESET); + } + else { + printf("\t%-12s... ", name); + printf(GRN "OK! " RESET); + } + + printf("(relative error: %.3Lg \u03B5, absolute error: %Lg)\n", rel_err, abs_err); + /* + // DEPRECATED: TODO remove + if (rel_err < get_acceptable_relative_error()) + printf("(relative error: %Lg \u03B5, max accepted %Lg)\n", rel_err, + get_acceptable_relative_error()); + else + printf("(absolute error: %Lg, max accepted %Lg)\n", abs_err, + get_acceptable_absolute_error(range)); + */ +} + +static int check_reductions(const AcMeshInfo& config) { - printf("Testing reductions\n"); - int num_failures = 0; + printf("Testing reductions\n"); + int num_failures = 0; - // Init CPU meshes - AcMesh* mesh = acmesh_create(config); - ModelMesh* modelmesh = modelmesh_create(config); + // Init CPU meshes + AcMesh* mesh = acmesh_create(config); + ModelMesh* modelmesh = modelmesh_create(config); - // Init GPU meshes - acInit(config); + // Init GPU meshes + acInit(config); - for (unsigned int i = 0; i < ARRAY_SIZE(test_cases); ++i) { + for (unsigned int i = 0; i < ARRAY_SIZE(test_cases); ++i) { const InitType itype = test_cases[i]; printf("Checking %s...\n", init_type_names[InitType(itype)]); - // Init the mesh and figure out the acceptable range for error - acmesh_init_to(InitType(itype), mesh); + // Init the mesh and figure out the acceptable range for error + acmesh_init_to(InitType(itype), mesh); - acmesh_to_modelmesh(*mesh, modelmesh); - const ModelScalar range = get_reduction_range(*modelmesh); + acmesh_to_modelmesh(*mesh, modelmesh); + const ModelScalar range = get_reduction_range(*modelmesh); - acLoad(*mesh); + acLoad(*mesh); - for (int rtype = 0; rtype < NUM_REDUCTION_TYPES; ++rtype) { - const VertexBufferHandle ftype = VTXBUF_UUX; + for (int rtype = 0; rtype < NUM_REDUCTION_TYPES; ++rtype) { + const VertexBufferHandle ftype = VTXBUF_UUX; - // Scal - ModelScalar model = model_reduce_scal(*modelmesh, ReductionType(rtype), - VertexBufferHandle(ftype)); - AcReal candidate = acReduceScal(ReductionType(rtype), - VertexBufferHandle(ftype)); - print_result(model, candidate, range, "UUX scal"); + // Scal + ModelScalar model = model_reduce_scal(*modelmesh, ReductionType(rtype), + VertexBufferHandle(ftype)); + AcReal candidate = acReduceScal(ReductionType(rtype), VertexBufferHandle(ftype)); + print_result(model, candidate, range, "UUX scal"); - bool is_acceptable = verify(model, candidate, range); - if (!is_acceptable) { - ++num_failures; + bool is_acceptable = verify(model, candidate, range); + if (!is_acceptable) { + ++num_failures; - // Print debug info - printf("Scalar reduction type %d FAIL\n", rtype); - print_debug_info(model, candidate, range); - } + // Print debug info + printf("Scalar reduction type %d FAIL\n", rtype); + print_debug_info(model, candidate, range); + } - // Vec - model = model_reduce_vec(*modelmesh, ReductionType(rtype), VTXBUF_UUX, - VTXBUF_UUY, VTXBUF_UUZ); - candidate = acReduceVec(ReductionType(rtype), VTXBUF_UUX, - VTXBUF_UUY, VTXBUF_UUZ); - print_result(model, candidate, range, "UUXYZ vec"); + // Vec + model = model_reduce_vec(*modelmesh, ReductionType(rtype), VTXBUF_UUX, VTXBUF_UUY, + VTXBUF_UUZ); + candidate = acReduceVec(ReductionType(rtype), VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ); + print_result(model, candidate, range, "UUXYZ vec"); - is_acceptable = verify(model, candidate, range); - if (!is_acceptable) { - ++num_failures; + is_acceptable = verify(model, candidate, range); + if (!is_acceptable) { + ++num_failures; - // Print debug info - printf("Vector reduction type %d FAIL\n", rtype); - print_debug_info(model, candidate, range); - } - } + // Print debug info + printf("Vector reduction type %d FAIL\n", rtype); + print_debug_info(model, candidate, range); + } + } - printf("Acceptable relative error: < %Lg \u03B5, absolute error < %Lg\n", get_acceptable_relative_error(), get_acceptable_absolute_error(range)); - } - acQuit(); - modelmesh_destroy(modelmesh); - acmesh_destroy(mesh); + printf("Acceptable relative error: < %Lg \u03B5, absolute error < %Lg\n", + get_acceptable_relative_error(), get_acceptable_absolute_error(range)); + } + acQuit(); + modelmesh_destroy(modelmesh); + acmesh_destroy(mesh); - return num_failures; + return num_failures; } /** Finds the maximum and minimum in all meshes and computes the range. * Note! Potentially dangerous if all meshes do not interact with each other. * Otherwise the range may be too high. */ - static ModelScalar +static ModelScalar get_data_range(const ModelMesh& model) { - ModelScalar vertex_buffer_max_all = -INFINITY; - ModelScalar vertex_buffer_min_all = INFINITY; - for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w) { - const ModelScalar vertex_buffer_max = model_reduce_scal(model, RTYPE_MAX, VertexBufferHandle(w)); - const ModelScalar vertex_buffer_min = model_reduce_scal(model, RTYPE_MIN, VertexBufferHandle(w)); + ModelScalar vertex_buffer_max_all = -INFINITY; + ModelScalar vertex_buffer_min_all = INFINITY; + for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w) { + const ModelScalar vertex_buffer_max = model_reduce_scal(model, RTYPE_MAX, + VertexBufferHandle(w)); + const ModelScalar vertex_buffer_min = model_reduce_scal(model, RTYPE_MIN, + VertexBufferHandle(w)); - if (vertex_buffer_max > vertex_buffer_max_all) - vertex_buffer_max_all = vertex_buffer_max; - if (vertex_buffer_min < vertex_buffer_min_all) - vertex_buffer_min_all = vertex_buffer_min; - } - return fabsl(vertex_buffer_max_all - vertex_buffer_min_all); + if (vertex_buffer_max > vertex_buffer_max_all) + vertex_buffer_max_all = vertex_buffer_max; + if (vertex_buffer_min < vertex_buffer_min_all) + vertex_buffer_min_all = vertex_buffer_min; + } + return fabsl(vertex_buffer_max_all - vertex_buffer_min_all); } // #define GEN_TEST_RESULT @@ -312,380 +306,398 @@ get_data_range(const ModelMesh& model) static FILE* test_result = NULL; #endif - static bool +static bool verify_meshes(const ModelMesh& model, const AcMesh& candidate) { - bool retval = true; + bool retval = true; #if GEN_TEST_RESULT == 1 - ErrorInfo err = ErrorInfo(); + ErrorInfo err = ErrorInfo(); #endif - const ModelScalar range = get_data_range(model); - for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w) { - const size_t n = AC_VTXBUF_SIZE(model.info); + const ModelScalar range = get_data_range(model); + for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w) { + const size_t n = AC_VTXBUF_SIZE(model.info); - // Maximum errors - ErrorInfo max_abs_error = ErrorInfo(); - ErrorInfo max_rel_error = ErrorInfo(); + // Maximum errors + ErrorInfo max_abs_error = ErrorInfo(); + ErrorInfo max_rel_error = ErrorInfo(); - for (size_t i = 0; i < n; ++i) { - const ModelScalar model_val = model.vertex_buffer[VertexBufferHandle(w)][i]; - const AcReal cand_val = candidate.vertex_buffer[VertexBufferHandle(w)][i]; + for (size_t i = 0; i < n; ++i) { + const ModelScalar model_val = model.vertex_buffer[VertexBufferHandle(w)][i]; + const AcReal cand_val = candidate.vertex_buffer[VertexBufferHandle(w)][i]; - if (!verify(model_val, cand_val, range)) { - const int i0 = i % model.info.int_params[AC_mx]; - const int j0 = ((i % (model.info.int_params[AC_mx] * - model.info.int_params[AC_my])) / - model.info.int_params[AC_mx]); - const int k0 = i / (model.info.int_params[AC_mx] * - model.info.int_params[AC_my]); - printf("Index (%d, %d, %d)\n", i0, j0, k0); - print_debug_info(model_val, cand_val, range); - retval = false; - } + if (!verify(model_val, cand_val, range)) { + const int i0 = i % model.info.int_params[AC_mx]; + const int j0 = ((i % + (model.info.int_params[AC_mx] * model.info.int_params[AC_my])) / + model.info.int_params[AC_mx]); + const int k0 = i / (model.info.int_params[AC_mx] * model.info.int_params[AC_my]); + printf("Index (%d, %d, %d)\n", i0, j0, k0); + print_debug_info(model_val, cand_val, range); + retval = false; + } - const ModelScalar abs_error = get_absolute_error(model_val, - cand_val); - if (abs_error > max_abs_error.error) { - max_abs_error.error = abs_error; - max_abs_error.model = model_val; - max_abs_error.candidate = cand_val; - } + const ModelScalar abs_error = get_absolute_error(model_val, cand_val); + if (abs_error > max_abs_error.error) { + max_abs_error.error = abs_error; + max_abs_error.model = model_val; + max_abs_error.candidate = cand_val; + } - const ModelScalar rel_error = get_relative_error(model_val, cand_val); - if (rel_error > max_rel_error.error) { - max_rel_error.error = rel_error; - max_rel_error.model = model_val; - max_rel_error.candidate = cand_val; - } + const ModelScalar rel_error = get_relative_error(model_val, cand_val); + if (rel_error > max_rel_error.error) { + max_rel_error.error = rel_error; + max_rel_error.model = model_val; + max_rel_error.candidate = cand_val; + } #if GEN_TEST_RESULT == 1 - if (abs_error > err.error) { - err.error = abs_error; - err.model = model_val; - err.candidate = cand_val; - } + if (abs_error > err.error) { + err.error = abs_error; + err.model = model_val; + err.candidate = cand_val; + } #endif - } - //print_result(max_rel_error.model, max_rel_error.candidate, range, vtxbuf_names[VertexBufferHandle(w)]); - print_result(max_abs_error.model, max_abs_error.candidate, range, vtxbuf_names[VertexBufferHandle(w)]); - } + } + // print_result(max_rel_error.model, max_rel_error.candidate, range, + // vtxbuf_names[VertexBufferHandle(w)]); + print_result(max_abs_error.model, max_abs_error.candidate, range, + vtxbuf_names[VertexBufferHandle(w)]); + } #if GEN_TEST_RESULT == 1 - const ModelScalar rel_err = get_relative_error(err.model, err.candidate); - const ModelScalar abs_err = get_absolute_error(err.model, err.candidate); - fprintf(test_result, "%.3Lg & %.3Lg\n", abs_err, rel_err); + const ModelScalar rel_err = get_relative_error(err.model, err.candidate); + const ModelScalar abs_err = get_absolute_error(err.model, err.candidate); + fprintf(test_result, "%.3Lg & %.3Lg\n", abs_err, rel_err); #endif - printf("Acceptable relative error: < %Lg \u03B5, absolute error < %Lg\n", get_acceptable_relative_error(), get_acceptable_absolute_error(range)); + printf("Acceptable relative error: < %Lg \u03B5, absolute error < %Lg\n", + get_acceptable_relative_error(), get_acceptable_absolute_error(range)); - return retval; + return retval; } - int +int check_rk3(const AcMeshInfo& mesh_info) { - const int num_iterations = 1; // Note: should work up to at least 15 steps - printf("Testing RK3 (running %d steps before checking the result)\n", - num_iterations); - int num_failures = 0; + const int num_iterations = 1; // Note: should work up to at least 15 steps + printf("Testing RK3 (running %d steps before checking the result)\n", num_iterations); + int num_failures = 0; - // Init CPU meshes - AcMesh* gpu_mesh = acmesh_create(mesh_info); - ModelMesh* model_mesh = modelmesh_create(mesh_info); + // Init CPU meshes + AcMesh* gpu_mesh = acmesh_create(mesh_info); + ModelMesh* model_mesh = modelmesh_create(mesh_info); - // Init GPU meshes - acInit(mesh_info); + // Init GPU meshes + acInit(mesh_info); - for (unsigned int i = 0; i < ARRAY_SIZE(test_cases); ++i) { + for (unsigned int i = 0; i < ARRAY_SIZE(test_cases); ++i) { const InitType itype = test_cases[i]; - printf("Checking %s...\n", init_type_names[InitType(itype)]); + printf("Checking %s...\n", init_type_names[InitType(itype)]); - // Init the mesh and figure out the acceptable range for error - acmesh_init_to(InitType(itype), gpu_mesh); + // Init the mesh and figure out the acceptable range for error + acmesh_init_to(InitType(itype), gpu_mesh); - acLoad(*gpu_mesh); - acmesh_to_modelmesh(*gpu_mesh, model_mesh); + acLoad(*gpu_mesh); + acmesh_to_modelmesh(*gpu_mesh, model_mesh); - acBoundcondStep(); - boundconds(model_mesh->info, model_mesh); + acBoundcondStep(); + boundconds(model_mesh->info, model_mesh); - for (int i = 0; i < num_iterations; ++i) { - //const AcReal umax = AcReal(acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ)); - //const AcReal dt = host_timestep(umax, mesh_info); - const AcReal dt = AcReal(1e-2); // Use a small constant timestep to avoid instabilities + for (int i = 0; i < num_iterations; ++i) { + // const AcReal umax = AcReal(acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, + // VTXBUF_UUZ)); + // const AcReal dt = host_timestep(umax, mesh_info); + const AcReal dt = AcReal(1e-2); // Use a small constant timestep to avoid instabilities - acIntegrate(dt); - acBoundcondStep(); - acSynchronize(); + acIntegrate(dt); + acBoundcondStep(); + acSynchronize(); - model_rk3(dt, model_mesh); - boundconds(model_mesh->info, model_mesh); - } - acStore(gpu_mesh); + model_rk3(dt, model_mesh); + boundconds(model_mesh->info, model_mesh); + } + acStore(gpu_mesh); - bool is_acceptable = verify_meshes(*model_mesh, *gpu_mesh); - if (!is_acceptable) { - ++num_failures; - } - } + bool is_acceptable = verify_meshes(*model_mesh, *gpu_mesh); + if (!is_acceptable) { + ++num_failures; + } + } - acQuit(); - acmesh_destroy(gpu_mesh); - modelmesh_destroy(model_mesh); + acQuit(); + acmesh_destroy(gpu_mesh); + modelmesh_destroy(model_mesh); - return num_failures; + return num_failures; } - int +int run_autotest(void) { #if GEN_TEST_RESULT == 1 - char testresult_path[256]; - sprintf(testresult_path, "%s_fullstep_testresult.out", AC_DOUBLE_PRECISION ? "double" : "float"); + char testresult_path[256]; + sprintf(testresult_path, "%s_fullstep_testresult.out", + AC_DOUBLE_PRECISION ? "double" : "float"); - test_result = fopen(testresult_path, "w"); - ERRCHK(test_result); + test_result = fopen(testresult_path, "w"); + ERRCHK(test_result); - fprintf(test_result, "n, max abs error, corresponding rel error\n"); + fprintf(test_result, "n, max abs error, corresponding rel error\n"); #endif - /* Parse configs */ - AcMeshInfo config; - load_config(&config); + /* Parse configs */ + AcMeshInfo config; + load_config(&config); - if (STENCIL_ORDER > 6) - printf("WARNING!!! If the stencil order is larger than the computational domain some vertices may be done twice (f.ex. doing inner and outer domains separately and some of the front/back/left/right/etc slabs collide). The mesh must be large enough s.t. this doesn't happen."); + if (STENCIL_ORDER > 6) + printf("WARNING!!! If the stencil order is larger than the computational domain some " + "vertices may be done twice (f.ex. doing inner and outer domains separately and " + "some of the front/back/left/right/etc slabs collide). The mesh must be large " + "enough s.t. this doesn't happen."); - const vec3i test_dims[] = { - {32, 32, 32}, - {64, 32, 32}, - {32, 64, 32}, - {32, 32, 64}, - {64, 64, 32}, - {64, 32, 64}, - {32, 64, 64} - }; + const vec3i test_dims[] = {{32, 32, 32}, {64, 32, 32}, {32, 64, 32}, {32, 32, 64}, + {64, 64, 32}, {64, 32, 64}, {32, 64, 64}}; - int num_failures = 0; - for (size_t i = 0; i < ARRAY_SIZE(test_dims); ++i) { - config.int_params[AC_nx] = test_dims[i].x; - config.int_params[AC_ny] = test_dims[i].y; - config.int_params[AC_nz] = test_dims[i].z; - update_config(&config); + int num_failures = 0; + for (size_t i = 0; i < ARRAY_SIZE(test_dims); ++i) { + config.int_params[AC_nx] = test_dims[i].x; + config.int_params[AC_ny] = test_dims[i].y; + config.int_params[AC_nz] = test_dims[i].z; + update_config(&config); - printf("Testing mesh (%d, %d, %d):\n", // - test_dims[i].x, test_dims[i].y, test_dims[i].z); + printf("Testing mesh (%d, %d, %d):\n", // + test_dims[i].x, test_dims[i].y, test_dims[i].z); - num_failures += check_reductions(config); - fflush(stdout); - } + num_failures += check_reductions(config); + fflush(stdout); + } - for (size_t i = 0; i < ARRAY_SIZE(test_dims); ++i) { - config.int_params[AC_nx] = test_dims[i].x; - config.int_params[AC_ny] = test_dims[i].y; - config.int_params[AC_nz] = test_dims[i].z; - update_config(&config); + for (size_t i = 0; i < ARRAY_SIZE(test_dims); ++i) { + config.int_params[AC_nx] = test_dims[i].x; + config.int_params[AC_ny] = test_dims[i].y; + config.int_params[AC_nz] = test_dims[i].z; + update_config(&config); - printf("Testing mesh (%d, %d, %d):\n", // - test_dims[i].x, test_dims[i].y, test_dims[i].z); + printf("Testing mesh (%d, %d, %d):\n", // + test_dims[i].x, test_dims[i].y, test_dims[i].z); - num_failures += check_rk3(config); - fflush(stdout); - } + num_failures += check_rk3(config); + fflush(stdout); + } - printf("\n--------Testing done---------\n"); - printf("Failures found: %d\n", num_failures); + printf("\n--------Testing done---------\n"); + printf("Failures found: %d\n", num_failures); #if GEN_TEST_RESULT == 1 - fflush(test_result); - fclose(test_result); + fflush(test_result); + fclose(test_result); #endif - if (num_failures > 0) - return EXIT_FAILURE; - else - return EXIT_SUCCESS; + if (num_failures > 0) + return EXIT_FAILURE; + else + return EXIT_SUCCESS; } -#elif TEST_TYPE == THOROUGH_TEST // GEN TEST FILE START HERE -------------------------------------------------------------------------------------------------------------- +#elif TEST_TYPE == \ + THOROUGH_TEST // GEN TEST FILE START HERE + // -------------------------------------------------------------------------------------------------------------- typedef struct { - ModelScalar model; - AcReal candidate; - ModelScalar abs_error; - ModelScalar ulp_error; - ModelScalar rel_error; - ModelScalar maximum_magnitude; - ModelScalar minimum_magnitude; + ModelScalar model; + AcReal candidate; + ModelScalar abs_error; + ModelScalar ulp_error; + ModelScalar rel_error; + ModelScalar maximum_magnitude; + ModelScalar minimum_magnitude; } Error; -Error get_error(ModelScalar model, AcReal candidate) +Error +get_error(ModelScalar model, AcReal candidate) { - Error error; + Error error; + error.abs_error = 0; + + error.model = model; + error.candidate = candidate; + + if (error.model == error.candidate || fabsl(model - candidate) == 0) { // If exact error.abs_error = 0; + error.rel_error = 0; + error.ulp_error = 0; + } + else if (!is_valid(error.model) || !is_valid(error.candidate)) { + error.abs_error = INFINITY; + error.rel_error = INFINITY; + error.ulp_error = INFINITY; + } + else { + const int base = 2; + const int p = sizeof(AcReal) == 4 ? 24 : 53; // Bits in the significant - error.model = model; - error.candidate = candidate; + const ModelScalar e = floorl(logl(fabsl(error.model)) / logl(2)); - if (error.model == error.candidate || fabsl(model - candidate) == 0) { // If exact - error.abs_error = 0; - error.rel_error = 0; - error.ulp_error = 0; - } else if (!is_valid(error.model) || !is_valid(error.candidate)) { - error.abs_error = INFINITY; - error.rel_error = INFINITY; - error.ulp_error = INFINITY; - } else { - const int base = 2; - const int p = sizeof(AcReal) == 4 ? 24 : 53; // Bits in the significant + const ModelScalar ulp = powl(base, e - (p - 1)); + const ModelScalar machine_epsilon = 0.5 * powl(base, -(p - 1)); + error.abs_error = fabsl(model - candidate); + error.ulp_error = error.abs_error / ulp; + error.rel_error = fabsl(1.0l - candidate / model) / machine_epsilon; + } - const ModelScalar e = floorl(logl(fabsl(error.model)) / logl(2)); - - const ModelScalar ulp = powl(base, e - (p-1)); - const ModelScalar machine_epsilon = 0.5 * powl(base, -(p-1)); - error.abs_error = fabsl(model - candidate); - error.ulp_error = error.abs_error / ulp; - error.rel_error = fabsl(1.0l - candidate / model) / machine_epsilon; - } - - return error; + return error; } -Error get_max_abs_error_mesh(const ModelMesh& model_mesh, const AcMesh& candidate_mesh) +Error +get_max_abs_error_mesh(const ModelMesh& model_mesh, const AcMesh& candidate_mesh) { - Error error; - error.abs_error = -1; + Error error; + error.abs_error = -1; - for (size_t j = 0; j < NUM_VTXBUF_HANDLES; ++j) { - for (size_t i = 0; i < AC_VTXBUF_SIZE(model_mesh.info); ++i) { - Error curr_error = get_error(model_mesh.vertex_buffer[j][i], candidate_mesh.vertex_buffer[j][i]); - if (curr_error.abs_error > error.abs_error) - error = curr_error; - } - } + for (size_t j = 0; j < NUM_VTXBUF_HANDLES; ++j) { + for (size_t i = 0; i < AC_VTXBUF_SIZE(model_mesh.info); ++i) { + Error curr_error = get_error(model_mesh.vertex_buffer[j][i], + candidate_mesh.vertex_buffer[j][i]); + if (curr_error.abs_error > error.abs_error) + error = curr_error; + } + } - error.maximum_magnitude = -1; // Not calculated. - error.minimum_magnitude = -1; // Not calculated. + error.maximum_magnitude = -1; // Not calculated. + error.minimum_magnitude = -1; // Not calculated. - return error; + return error; } static ModelScalar get_maximum_magnitude(const ModelScalar* field, const AcMeshInfo info) { - ModelScalar maximum = -INFINITY; + ModelScalar maximum = -INFINITY; - for (size_t i = 0; i < AC_VTXBUF_SIZE(info); ++i) - maximum = max(maximum, fabsl(field[i])); + for (size_t i = 0; i < AC_VTXBUF_SIZE(info); ++i) + maximum = max(maximum, fabsl(field[i])); - return maximum; + return maximum; } - static ModelScalar get_minimum_magnitude(const ModelScalar* field, const AcMeshInfo info) { - ModelScalar minimum = INFINITY; + ModelScalar minimum = INFINITY; - for (size_t i = 0; i < AC_VTXBUF_SIZE(info); ++i) - minimum = min(minimum, fabsl(field[i])); + for (size_t i = 0; i < AC_VTXBUF_SIZE(info); ++i) + minimum = min(minimum, fabsl(field[i])); - return minimum; + return minimum; } -Error get_max_abs_error_vtxbuf(const VertexBufferHandle vtxbuf_handle, const ModelMesh& model_mesh, const AcMesh& candidate_mesh) +Error +get_max_abs_error_vtxbuf(const VertexBufferHandle vtxbuf_handle, const ModelMesh& model_mesh, + const AcMesh& candidate_mesh) { - ModelScalar* model_vtxbuf = model_mesh.vertex_buffer[vtxbuf_handle]; - AcReal* candidate_vtxbuf = candidate_mesh.vertex_buffer[vtxbuf_handle]; + ModelScalar* model_vtxbuf = model_mesh.vertex_buffer[vtxbuf_handle]; + AcReal* candidate_vtxbuf = candidate_mesh.vertex_buffer[vtxbuf_handle]; - Error error; - error.abs_error = -1; + Error error; + error.abs_error = -1; - for (size_t i = 0; i < AC_VTXBUF_SIZE(model_mesh.info); ++i) { + for (size_t i = 0; i < AC_VTXBUF_SIZE(model_mesh.info); ++i) { - Error curr_error = get_error(model_vtxbuf[i], candidate_vtxbuf[i]); + Error curr_error = get_error(model_vtxbuf[i], candidate_vtxbuf[i]); - if (curr_error.abs_error > error.abs_error) - error = curr_error; - } + if (curr_error.abs_error > error.abs_error) + error = curr_error; + } + error.maximum_magnitude = get_maximum_magnitude(model_vtxbuf, model_mesh.info); + error.minimum_magnitude = get_minimum_magnitude(model_vtxbuf, model_mesh.info); - error.maximum_magnitude = get_maximum_magnitude(model_vtxbuf, model_mesh.info); - error.minimum_magnitude = get_minimum_magnitude(model_vtxbuf, model_mesh.info); - - return error; + return error; } void print_error_to_file(const char* path, const int n, const Error error) { FILE* file = fopen(path, "a"); - fprintf(file, "%d, %Lg, %Lg, %Lg, %Lg, %Lg\n", n, error.ulp_error, error.abs_error, error.rel_error, error.maximum_magnitude, error.minimum_magnitude); - //fprintf(file, "%d, %Lg, %Lg, %Lg, %Lg, %Lg\n", n, error.maximum_magnitude, error.minimum_magnitude, error.abs_error, error.ulp_error, error.rel_error); + fprintf(file, "%d, %Lg, %Lg, %Lg, %Lg, %Lg\n", n, error.ulp_error, error.abs_error, + error.rel_error, error.maximum_magnitude, error.minimum_magnitude); + // fprintf(file, "%d, %Lg, %Lg, %Lg, %Lg, %Lg\n", n, error.maximum_magnitude, + // error.minimum_magnitude, error.abs_error, error.ulp_error, error.rel_error); fclose(file); } #define MAX_PATH_LEN (256) -int run_autotest(void) +int +run_autotest(void) { #define N_MIN (32) #define N_MAX (512) - for (int n = N_MIN; n <= N_MAX; n += N_MIN) { - AcMeshInfo config; - load_config(&config); - config.int_params[AC_nx] = config.int_params[AC_ny] = config.int_params[AC_nz] = n; - update_config(&config); + for (int n = N_MIN; n <= N_MAX; n += N_MIN) { + AcMeshInfo config; + load_config(&config); + config.int_params[AC_nx] = config.int_params[AC_ny] = config.int_params[AC_nz] = n; + update_config(&config); - // Init host - AcMesh* candidate_mesh = acmesh_create(config); - ModelMesh* model_mesh = modelmesh_create(config); + // Init host + AcMesh* candidate_mesh = acmesh_create(config); + ModelMesh* model_mesh = modelmesh_create(config); - // Init device - acInit(config); + // Init device + acInit(config); - // Check all initial conditions + // Check all initial conditions for (int i = 0; i < ARRAY_SIZE(test_cases); ++i) { const InitType init_type = test_cases[i]; - acmesh_init_to((InitType)init_type, candidate_mesh); - acmesh_to_modelmesh(*candidate_mesh, model_mesh); // Load to Host - acLoad(*candidate_mesh); // Load to Device + acmesh_init_to((InitType)init_type, candidate_mesh); + acmesh_to_modelmesh(*candidate_mesh, model_mesh); // Load to Host + acLoad(*candidate_mesh); // Load to Device - boundconds(model_mesh->info, model_mesh); - acBoundcondStep(); + boundconds(model_mesh->info, model_mesh); + acBoundcondStep(); { // Check boundconds acStore(candidate_mesh); Error boundcond_error = get_max_abs_error_mesh(*model_mesh, *candidate_mesh); char boundcond_path[MAX_PATH_LEN]; - sprintf(boundcond_path, "%s_boundcond_%s.testresult", AC_DOUBLE_PRECISION ? "double" : "float", init_type_names[(InitType)init_type]); + sprintf(boundcond_path, "%s_boundcond_%s.testresult", + AC_DOUBLE_PRECISION ? "double" : "float", + init_type_names[(InitType)init_type]); print_error_to_file(boundcond_path, n, boundcond_error); } { // Check scalar max reduction - ModelScalar model = model_reduce_scal(*model_mesh, (ReductionType)RTYPE_MAX, VTXBUF_UUX); - AcReal candidate = acReduceScal((ReductionType)RTYPE_MAX, VTXBUF_UUX); + ModelScalar model = model_reduce_scal(*model_mesh, (ReductionType)RTYPE_MAX, + VTXBUF_UUX); + AcReal candidate = acReduceScal((ReductionType)RTYPE_MAX, VTXBUF_UUX); Error scalar_reduce_error = get_error(model, candidate); char scalar_reduce_path[MAX_PATH_LEN]; - sprintf(scalar_reduce_path, "%s_scalar_reduce_%s.testresult", AC_DOUBLE_PRECISION ? "double" : "float", init_type_names[(InitType)init_type]); + sprintf(scalar_reduce_path, "%s_scalar_reduce_%s.testresult", + AC_DOUBLE_PRECISION ? "double" : "float", + init_type_names[(InitType)init_type]); print_error_to_file(scalar_reduce_path, n, scalar_reduce_error); } { // Check vector max reduction - ModelScalar model = model_reduce_vec(*model_mesh, (ReductionType)RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ); - AcReal candidate = acReduceVec((ReductionType)RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ); + ModelScalar model = model_reduce_vec(*model_mesh, (ReductionType)RTYPE_MAX, + VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ); + AcReal candidate = acReduceVec((ReductionType)RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, + VTXBUF_UUZ); Error vector_reduce_error = get_error(model, candidate); char vector_reduce_path[MAX_PATH_LEN]; - sprintf(vector_reduce_path, "%s_vector_reduce_%s.testresult", AC_DOUBLE_PRECISION ? "double" : "float", init_type_names[(InitType)init_type]); + sprintf(vector_reduce_path, "%s_vector_reduce_%s.testresult", + AC_DOUBLE_PRECISION ? "double" : "float", + init_type_names[(InitType)init_type]); print_error_to_file(vector_reduce_path, n, vector_reduce_error); } // Time advance { - const AcReal umax = (AcReal)model_reduce_vec(*model_mesh, (ReductionType)RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ); - const AcReal dt = host_timestep(umax, config); + const AcReal umax = (AcReal)model_reduce_vec(*model_mesh, (ReductionType)RTYPE_MAX, + VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ); + const AcReal dt = host_timestep(umax, config); // Host integration step model_rk3(dt, model_mesh); @@ -699,25 +711,31 @@ int run_autotest(void) // Check fields for (int vtxbuf_handle = 0; vtxbuf_handle < NUM_VTXBUF_HANDLES; ++vtxbuf_handle) { - Error field_error = get_max_abs_error_vtxbuf((VertexBufferHandle)vtxbuf_handle, *model_mesh, *candidate_mesh); + Error field_error = get_max_abs_error_vtxbuf((VertexBufferHandle)vtxbuf_handle, + *model_mesh, *candidate_mesh); - printf("model %Lg, cand %Lg, abs %Lg, rel %Lg\n", (ModelScalar)field_error.model, (ModelScalar)field_error.candidate, (ModelScalar)field_error.abs_error, (ModelScalar)field_error.rel_error); + printf("model %Lg, cand %Lg, abs %Lg, rel %Lg\n", + (ModelScalar)field_error.model, (ModelScalar)field_error.candidate, + (ModelScalar)field_error.abs_error, (ModelScalar)field_error.rel_error); char field_path[MAX_PATH_LEN]; - sprintf(field_path, "%s_integrationstep_%s_%s.testresult", AC_DOUBLE_PRECISION ? "double" : "float", init_type_names[(InitType)init_type], vtxbuf_names[(VertexBufferHandle)vtxbuf_handle]); + sprintf(field_path, "%s_integrationstep_%s_%s.testresult", + AC_DOUBLE_PRECISION ? "double" : "float", + init_type_names[(InitType)init_type], + vtxbuf_names[(VertexBufferHandle)vtxbuf_handle]); print_error_to_file(field_path, n, field_error); } } - } + } - // Deallocate host - acmesh_destroy(candidate_mesh); - modelmesh_destroy(model_mesh); + // Deallocate host + acmesh_destroy(candidate_mesh); + modelmesh_destroy(model_mesh); - // Deallocate device - acQuit(); - } + // Deallocate device + acQuit(); + } - return 0; + return 0; } #endif diff --git a/src/standalone/benchmark.cc b/src/standalone/benchmark.cc index a5f163a..05f4b32 100644 --- a/src/standalone/benchmark.cc +++ b/src/standalone/benchmark.cc @@ -35,10 +35,10 @@ #include "model/model_rk3.h" #include "timer_hires.h" -#include +#include "src/core/errchk.h" #include #include -#include "src/core/errchk.h" +#include static bool smaller_than(const double& a, const double& b) @@ -47,7 +47,8 @@ smaller_than(const double& a, const double& b) } static int -write_runningtimes(const char* path, const int n, const double min, const double max, const double median, const double perc) +write_runningtimes(const char* path, const int n, const double min, const double max, + const double median, const double perc) { FILE* fp; fp = fopen(path, "a"); @@ -80,7 +81,8 @@ int run_benchmark(void) { char runningtime_path[256]; - sprintf(runningtime_path, "%s_%s_runningtimes.out", AC_DOUBLE_PRECISION ? "double" : "float", GEN_BENCHMARK_RK3 ? "rk3substep" : "fullstep"); + sprintf(runningtime_path, "%s_%s_runningtimes.out", AC_DOUBLE_PRECISION ? "double" : "float", + GEN_BENCHMARK_RK3 ? "rk3substep" : "fullstep"); FILE* fp; fp = fopen(runningtime_path, "w"); @@ -88,13 +90,14 @@ run_benchmark(void) if (fp != NULL) { fprintf(fp, "n, min, max, median, perc\n"); fclose(fp); - } else { + } + else { return EXIT_FAILURE; } - #define N_STEP_SIZE (128) - #define MAX_MESH_DIM (128) - #define NUM_ITERS (100) +#define N_STEP_SIZE (128) +#define MAX_MESH_DIM (128) +#define NUM_ITERS (100) for (int n = N_STEP_SIZE; n <= MAX_MESH_DIM; n += N_STEP_SIZE) { /* Parse configs */ AcMeshInfo mesh_info; @@ -113,7 +116,6 @@ run_benchmark(void) std::vector results; results.reserve(NUM_ITERS); - // Warmup for (int i = 0; i < 10; ++i) { acIntegrate(0); @@ -124,28 +126,35 @@ run_benchmark(void) for (int i = 0; i < NUM_ITERS; ++i) { timer_reset(&t); - #if GEN_BENCHMARK_RK3 == 1 +#if GEN_BENCHMARK_RK3 == 1 acIntegrateStep(2, FLT_EPSILON); - #else // GEN_BENCHMARK_FULL - //const AcReal umax = acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ); - const AcReal dt = AcReal(1e-2); // TODO adaptive timestep //host_timestep(umax, mesh_info); +#else // GEN_BENCHMARK_FULL + // const AcReal umax = acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ); + const AcReal dt = AcReal( + 1e-2); // TODO adaptive timestep //host_timestep(umax, mesh_info); acIntegrate(dt); - #endif +#endif acSynchronize(); const double ms_elapsed = timer_diff_nsec(t) / 1e6; results.push_back(ms_elapsed); } - #define NTH_PERCENTILE (0.95) +#define NTH_PERCENTILE (0.95) std::sort(results.begin(), results.end(), smaller_than); - write_runningtimes(runningtime_path, n, results[0], results[results.size()-1], results[int(0.5 * NUM_ITERS)], results[int(NTH_PERCENTILE * NUM_ITERS)]); + write_runningtimes(runningtime_path, n, results[0], results[results.size() - 1], + results[int(0.5 * NUM_ITERS)], results[int(NTH_PERCENTILE * NUM_ITERS)]); char percentile_path[256]; - sprintf(percentile_path, "%d_%s_%s_percentiles.out", n, AC_DOUBLE_PRECISION ? "double" : "float", GEN_BENCHMARK_RK3 ? "rk3substep" : "fullstep"); + sprintf(percentile_path, "%d_%s_%s_percentiles.out", n, + AC_DOUBLE_PRECISION ? "double" : "float", + GEN_BENCHMARK_RK3 ? "rk3substep" : "fullstep"); write_percentiles(percentile_path, NUM_ITERS, results); - printf("%s running time %g ms, (%dth percentile, nx = %d) \n", GEN_BENCHMARK_RK3 ? "RK3 step" : "Fullstep", double(results[int(NTH_PERCENTILE * NUM_ITERS)]), int(NTH_PERCENTILE * 100), mesh_info.int_params[AC_nx]); + printf("%s running time %g ms, (%dth percentile, nx = %d) \n", + GEN_BENCHMARK_RK3 ? "RK3 step" : "Fullstep", + double(results[int(NTH_PERCENTILE * NUM_ITERS)]), int(NTH_PERCENTILE * 100), + mesh_info.int_params[AC_nx]); acStore(mesh); acQuit(); @@ -225,7 +234,8 @@ run_benchmark(void) return 0; } -#else //////////////////////////////////////////////////////////////////////////GENERATE_BENCHMARK_DATA +#else +//////////////////////////////////////////////////////////////////////////GENERATE_BENCHMARK_DATA @@ -290,8 +300,8 @@ run_benchmark(void) #define NTH_PERCENTILE (0.95) std::sort(results.begin(), results.end(), smaller_than); - write_result(n, results[0], results[results.size()-1], results[int(0.5 * NUM_ITERS)], results[int(NTH_PERCENTILE * NUM_ITERS)]); - write_percentiles(n, NUM_ITERS, results); + write_result(n, results[0], results[results.size()-1], results[int(0.5 * NUM_ITERS)], +results[int(NTH_PERCENTILE * NUM_ITERS)]); write_percentiles(n, NUM_ITERS, results); } return 0; diff --git a/src/standalone/config_loader.cc b/src/standalone/config_loader.cc index 6054802..637bcd4 100644 --- a/src/standalone/config_loader.cc +++ b/src/standalone/config_loader.cc @@ -63,7 +63,7 @@ parse_config(const char* path, AcMeshInfo* config) { FILE* fp; fp = fopen(path, "r"); - // For knowing which .conf file will be used + // For knowing which .conf file will be used printf("Config file path: \n %s \n ", path); ERRCHK(fp != NULL); @@ -79,8 +79,7 @@ parse_config(const char* path, AcMeshInfo* config) int idx = -1; if ((idx = find_str(keyword, intparam_names, NUM_INT_PARAM_TYPES)) >= 0) config->int_params[idx] = atoi(value); - else if ((idx = find_str(keyword, realparam_names, - NUM_REAL_PARAM_TYPES)) >= 0) + else if ((idx = find_str(keyword, realparam_names, NUM_REAL_PARAM_TYPES)) >= 0) config->real_params[idx] = AcReal(atof(value)); } @@ -92,68 +91,66 @@ update_config(AcMeshInfo* config) { config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER; ///////////// PAD TEST - //config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER + PAD_SIZE; + // config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER + PAD_SIZE; ///////////// PAD TEST config->int_params[AC_my] = config->int_params[AC_ny] + STENCIL_ORDER; config->int_params[AC_mz] = config->int_params[AC_nz] + STENCIL_ORDER; // Bounds for the computational domain, i.e. nx_min <= i < nx_max config->int_params[AC_nx_min] = STENCIL_ORDER / 2; - config->int_params[AC_nx_max] = config->int_params[AC_nx_min] + - config->int_params[AC_nx]; + config->int_params[AC_nx_max] = config->int_params[AC_nx_min] + config->int_params[AC_nx]; config->int_params[AC_ny_min] = STENCIL_ORDER / 2; - config->int_params[AC_ny_max] = config->int_params[AC_ny] + - STENCIL_ORDER / 2; + config->int_params[AC_ny_max] = config->int_params[AC_ny] + STENCIL_ORDER / 2; config->int_params[AC_nz_min] = STENCIL_ORDER / 2; - config->int_params[AC_nz_max] = config->int_params[AC_nz] + - STENCIL_ORDER / 2; + config->int_params[AC_nz_max] = config->int_params[AC_nz] + STENCIL_ORDER / 2; // Spacing config->real_params[AC_inv_dsx] = AcReal(1.) / config->real_params[AC_dsx]; config->real_params[AC_inv_dsy] = AcReal(1.) / config->real_params[AC_dsy]; config->real_params[AC_inv_dsz] = AcReal(1.) / config->real_params[AC_dsz]; - config->real_params[AC_dsmin] = min(config->real_params[AC_dsx], min(config->real_params[AC_dsy], config->real_params[AC_dsz])); + config->real_params[AC_dsmin] = min( + config->real_params[AC_dsx], min(config->real_params[AC_dsy], config->real_params[AC_dsz])); // Real grid coordanates (DEFINE FOR GRID WITH THE GHOST ZONES) - config->real_params[AC_xlen] = config->real_params[AC_dsx]*config->int_params[AC_mx]; - config->real_params[AC_ylen] = config->real_params[AC_dsy]*config->int_params[AC_my]; - config->real_params[AC_zlen] = config->real_params[AC_dsz]*config->int_params[AC_mz]; + config->real_params[AC_xlen] = config->real_params[AC_dsx] * config->int_params[AC_mx]; + config->real_params[AC_ylen] = config->real_params[AC_dsy] * config->int_params[AC_my]; + config->real_params[AC_zlen] = config->real_params[AC_dsz] * config->int_params[AC_mz]; - config->real_params[AC_xorig] = AcReal(.5) * config->real_params[AC_xlen]; - config->real_params[AC_yorig] = AcReal(.5) * config->real_params[AC_ylen]; - config->real_params[AC_zorig] = AcReal(.5) * config->real_params[AC_zlen]; + config->real_params[AC_xorig] = AcReal(.5) * config->real_params[AC_xlen]; + config->real_params[AC_yorig] = AcReal(.5) * config->real_params[AC_ylen]; + config->real_params[AC_zorig] = AcReal(.5) * config->real_params[AC_zlen]; /* Additional helper params */ // Int helpers - config->int_params[AC_mxy] = config->int_params[AC_mx] * - config->int_params[AC_my]; - config->int_params[AC_nxy] = config->int_params[AC_nx] * - config->int_params[AC_ny]; - config->int_params[AC_nxyz] = config->int_params[AC_nxy] * - config->int_params[AC_nz]; + config->int_params[AC_mxy] = config->int_params[AC_mx] * config->int_params[AC_my]; + config->int_params[AC_nxy] = config->int_params[AC_nx] * config->int_params[AC_ny]; + config->int_params[AC_nxyz] = config->int_params[AC_nxy] * config->int_params[AC_nz]; // Real helpers config->real_params[AC_cs2_sound] = config->real_params[AC_cs_sound] * config->real_params[AC_cs_sound]; - config->real_params[AC_cv_sound] = config->real_params[AC_cp_sound] / config->real_params[AC_gamma]; + config->real_params[AC_cv_sound] = config->real_params[AC_cp_sound] / + config->real_params[AC_gamma]; - AcReal G_CONST_CGS = AcReal(6.674e-8); // g/cm3/s GGS definition //TODO define in a separate module - AcReal M_sun = AcReal(1.989e33); // g solar mass + AcReal G_CONST_CGS = AcReal( + 6.674e-8); // g/cm3/s GGS definition //TODO define in a separate module + AcReal M_sun = AcReal(1.989e33); // g solar mass - config->real_params[AC_M_star] = config->real_params[AC_M_star]*M_sun / - ( (config->real_params[AC_unit_length]* - config->real_params[AC_unit_length]* - config->real_params[AC_unit_length]) * - config->real_params[AC_unit_density] ) ; + config->real_params[AC_M_star] = config->real_params[AC_M_star] * M_sun / + ((config->real_params[AC_unit_length] * + config->real_params[AC_unit_length] * + config->real_params[AC_unit_length]) * + config->real_params[AC_unit_density]); - config->real_params[AC_G_CONST] = G_CONST_CGS / - ( (config->real_params[AC_unit_velocity]*config->real_params[AC_unit_velocity]) / - (config->real_params[AC_unit_density] *config->real_params[AC_unit_length]) ) ; - - config->real_params[AC_GM_star] = config->real_params[AC_M_star]*config->real_params[AC_G_CONST]; - config->real_params[AC_sq2GM_star] = AcReal(sqrt(AcReal(2)*config->real_params[AC_GM_star])); + config->real_params[AC_G_CONST] = G_CONST_CGS / ((config->real_params[AC_unit_velocity] * + config->real_params[AC_unit_velocity]) / + (config->real_params[AC_unit_density] * + config->real_params[AC_unit_length])); + config->real_params[AC_GM_star] = config->real_params[AC_M_star] * + config->real_params[AC_G_CONST]; + config->real_params[AC_sq2GM_star] = AcReal(sqrt(AcReal(2) * config->real_params[AC_GM_star])); const bool print_config = true; if (print_config) { diff --git a/src/standalone/renderer.cc b/src/standalone/renderer.cc index 7f5f3e7..9024627 100644 --- a/src/standalone/renderer.cc +++ b/src/standalone/renderer.cc @@ -50,12 +50,12 @@ static const int window_bpp = 32; // Bits per pixel SDL_Surface* surfaces[NUM_VTXBUF_HANDLES]; static int datasurface_width = -1; static int datasurface_height = -1; -static int k_slice = 0; -static int k_slice_max = 0; +static int k_slice = 0; +static int k_slice_max = 0; // Colors -static SDL_Color color_bg = (SDL_Color){30, 30, 35, 255}; -static const int num_tiles = NUM_VTXBUF_HANDLES + 1; +static SDL_Color color_bg = (SDL_Color){30, 30, 35, 255}; +static const int num_tiles = NUM_VTXBUF_HANDLES + 1; static const int tiles_per_row = 3; /* @@ -82,10 +82,9 @@ static Camera camera = (Camera){(float2){.0f, .0f}, 1.f}; static inline vec4 project_ortho(const float2& pos, const float2& bbox, const float2& wdims) { - const vec4 rect = (vec4){ - camera.scale * (pos.x - camera.pos.x) + 0.5f * wdims.x, - camera.scale * (pos.y - camera.pos.y) + 0.5f * wdims.y, - camera.scale * bbox.x, camera.scale * bbox.y}; + const vec4 rect = (vec4){camera.scale * (pos.x - camera.pos.x) + 0.5f * wdims.x, + camera.scale * (pos.y - camera.pos.y) + 0.5f * wdims.y, + camera.scale * bbox.x, camera.scale * bbox.y}; return rect; } @@ -103,13 +102,12 @@ renderer_init(const int& mx, const int& my) SDL_InitSubSystem(SDL_INIT_VIDEO | SDL_INIT_EVENTS); // Setup window - window = SDL_CreateWindow("Astaroth", SDL_WINDOWPOS_UNDEFINED, - SDL_WINDOWPOS_UNDEFINED, window_width, - window_height, SDL_WINDOW_SHOWN); + window = SDL_CreateWindow("Astaroth", SDL_WINDOWPOS_UNDEFINED, SDL_WINDOWPOS_UNDEFINED, + window_width, window_height, SDL_WINDOW_SHOWN); // Setup SDL renderer renderer = SDL_CreateRenderer(window, -1, SDL_RENDERER_ACCELERATED); - //SDL_SetWindowFullscreen(window, SDL_WINDOW_FULLSCREEN_DESKTOP); + // SDL_SetWindowFullscreen(window, SDL_WINDOW_FULLSCREEN_DESKTOP); SDL_GetWindowSize(window, &window_width, &window_height); SDL_SetHint(SDL_HINT_RENDER_SCALE_QUALITY, "1"); // Linear filtering @@ -118,24 +116,24 @@ renderer_init(const int& mx, const int& my) datasurface_height = my; // vec drawing uses the surface of the first component, no memory issues here for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) - surfaces[i] = SDL_CreateRGBSurfaceWithFormat( - 0, datasurface_width, datasurface_height, window_bpp, - SDL_PIXELFORMAT_RGBA8888); + surfaces[i] = SDL_CreateRGBSurfaceWithFormat(0, datasurface_width, datasurface_height, + window_bpp, SDL_PIXELFORMAT_RGBA8888); - camera.pos = (float2){.5f * tiles_per_row * datasurface_width - .5f * datasurface_width, - -.5f * (num_tiles / tiles_per_row) * datasurface_height + .5f * datasurface_height}; + camera.pos = (float2){.5f * tiles_per_row * datasurface_width - .5f * datasurface_width, + -.5f * (num_tiles / tiles_per_row) * datasurface_height + + .5f * datasurface_height}; camera.scale = min(window_width / float(datasurface_width * tiles_per_row), - window_height / float(datasurface_height * (num_tiles/tiles_per_row))); + window_height / float(datasurface_height * (num_tiles / tiles_per_row))); SDL_RendererInfo renderer_info; SDL_GetRendererInfo(renderer, &renderer_info); - printf("SDL renderer max texture dims: (%d, %d)\n", renderer_info.max_texture_width, renderer_info.max_texture_height); + printf("SDL renderer max texture dims: (%d, %d)\n", renderer_info.max_texture_width, + renderer_info.max_texture_height); return 0; } static int -set_pixel(const int& i, const int& j, const uint32_t& color, - SDL_Surface* surface) +set_pixel(const int& i, const int& j, const uint32_t& color, SDL_Surface* surface) { uint32_t* pixels = (uint32_t*)surface->pixels; pixels[i + j * surface->w] = color; @@ -143,22 +141,21 @@ set_pixel(const int& i, const int& j, const uint32_t& color, } static int -draw_vertex_buffer(const AcMesh& mesh, const VertexBufferHandle& vertex_buffer, - const int& tile) +draw_vertex_buffer(const AcMesh& mesh, const VertexBufferHandle& vertex_buffer, const int& tile) { const float xoffset = (tile % tiles_per_row) * datasurface_width; - const float yoffset = - (tile / tiles_per_row) * datasurface_height; + const float yoffset = -(tile / tiles_per_row) * datasurface_height; /* const float max = float(model_reduce_scal(mesh, RTYPE_MAX, vertex_buffer)); const float min = float(model_reduce_scal(mesh, RTYPE_MIN, vertex_buffer)); */ - const float max = float(acReduceScal(RTYPE_MAX, vertex_buffer)); - const float min = float(acReduceScal(RTYPE_MIN, vertex_buffer)); + const float max = float(acReduceScal(RTYPE_MAX, vertex_buffer)); + const float min = float(acReduceScal(RTYPE_MIN, vertex_buffer)); const float range = fabsf(max - min); const float mid = max - .5f * range; - const int k = k_slice; //mesh.info.int_params[AC_mz] / 2; + const int k = k_slice; // mesh.info.int_params[AC_mz] / 2; for (int j = 0; j < mesh.info.int_params[AC_my]; ++j) { for (int i = 0; i < mesh.info.int_params[AC_mx]; ++i) { @@ -166,29 +163,23 @@ draw_vertex_buffer(const AcMesh& mesh, const VertexBufferHandle& vertex_buffer, const int idx = AC_VTXBUF_IDX(i, j, k, mesh.info); const uint8_t shade = (uint8_t)( - 255.f * - (fabsf(float(mesh.vertex_buffer[vertex_buffer][idx]) - mid)) / - range); + 255.f * (fabsf(float(mesh.vertex_buffer[vertex_buffer][idx]) - mid)) / range); uint8_t color[4] = {0, 0, 0, 255}; color[tile % 3] = shade; - const uint32_t mapped_color = SDL_MapRGBA( - surfaces[vertex_buffer]->format, color[0], color[1], color[2], - color[3]); + const uint32_t mapped_color = SDL_MapRGBA(surfaces[vertex_buffer]->format, color[0], + color[1], color[2], color[3]); set_pixel(i, j, mapped_color, surfaces[vertex_buffer]); } } const float2 pos = (float2){xoffset, yoffset}; - const float2 bbox = (float2){.5f * datasurface_width, - .5f * datasurface_height}; + const float2 bbox = (float2){.5f * datasurface_width, .5f * datasurface_height}; const float2 wsize = (float2){float(window_width), float(window_height)}; const vec4 rectf = project_ortho(pos, bbox, wsize); - SDL_Rect rect = (SDL_Rect){ - int(rectf.x - rectf.w), int(wsize.y - rectf.y - rectf.h), - int(ceil(2.f * rectf.w)), int(ceil(2.f * rectf.h))}; + SDL_Rect rect = (SDL_Rect){int(rectf.x - rectf.w), int(wsize.y - rectf.y - rectf.h), + int(ceil(2.f * rectf.w)), int(ceil(2.f * rectf.h))}; - SDL_Texture* tex = SDL_CreateTextureFromSurface(renderer, - surfaces[vertex_buffer]); + SDL_Texture* tex = SDL_CreateTextureFromSurface(renderer, surfaces[vertex_buffer]); SDL_RenderCopy(renderer, tex, NULL, &rect); SDL_DestroyTexture(tex); @@ -196,14 +187,12 @@ draw_vertex_buffer(const AcMesh& mesh, const VertexBufferHandle& vertex_buffer, } static int -draw_vertex_buffer_vec(const AcMesh& mesh, - const VertexBufferHandle& vertex_buffer_a, +draw_vertex_buffer_vec(const AcMesh& mesh, const VertexBufferHandle& vertex_buffer_a, const VertexBufferHandle& vertex_buffer_b, - const VertexBufferHandle& vertex_buffer_c, - const int& tile) + const VertexBufferHandle& vertex_buffer_c, const int& tile) { const float xoffset = (tile % tiles_per_row) * datasurface_width; - const float yoffset = - (tile / tiles_per_row) * datasurface_height; + const float yoffset = -(tile / tiles_per_row) * datasurface_height; /* const float maxx = float( @@ -215,52 +204,41 @@ draw_vertex_buffer_vec(const AcMesh& mesh, min(model_reduce_scal(mesh, RTYPE_MIN, vertex_buffer_b), model_reduce_scal(mesh, RTYPE_MIN, vertex_buffer_c)))); */ - const float maxx = float( - max(acReduceScal(RTYPE_MAX, vertex_buffer_a), - max(acReduceScal(RTYPE_MAX, vertex_buffer_b), - acReduceScal(RTYPE_MAX, vertex_buffer_c)))); - const float minn = float( - min(acReduceScal(RTYPE_MIN, vertex_buffer_a), - min(acReduceScal(RTYPE_MIN, vertex_buffer_b), - acReduceScal(RTYPE_MIN, vertex_buffer_c)))); + const float maxx = float(max( + acReduceScal(RTYPE_MAX, vertex_buffer_a), + max(acReduceScal(RTYPE_MAX, vertex_buffer_b), acReduceScal(RTYPE_MAX, vertex_buffer_c)))); + const float minn = float(min( + acReduceScal(RTYPE_MIN, vertex_buffer_a), + min(acReduceScal(RTYPE_MIN, vertex_buffer_b), acReduceScal(RTYPE_MIN, vertex_buffer_c)))); const float range = fabsf(maxx - minn); const float mid = maxx - .5f * range; - const int k = k_slice; //mesh.info.int_params[AC_mz] / 2; + const int k = k_slice; // mesh.info.int_params[AC_mz] / 2; for (int j = 0; j < mesh.info.int_params[AC_my]; ++j) { for (int i = 0; i < mesh.info.int_params[AC_mx]; ++i) { ERRCHK(i < datasurface_width && j < datasurface_height); const int idx = AC_VTXBUF_IDX(i, j, k, mesh.info); const uint8_t r = (uint8_t)( - 255.f * - (fabsf(float(mesh.vertex_buffer[vertex_buffer_a][idx]) - mid)) / - range); + 255.f * (fabsf(float(mesh.vertex_buffer[vertex_buffer_a][idx]) - mid)) / range); const uint8_t g = (uint8_t)( - 255.f * - (fabsf(float(mesh.vertex_buffer[vertex_buffer_b][idx]) - mid)) / - range); + 255.f * (fabsf(float(mesh.vertex_buffer[vertex_buffer_b][idx]) - mid)) / range); const uint8_t b = (uint8_t)( - 255.f * - (fabsf(float(mesh.vertex_buffer[vertex_buffer_c][idx]) - mid)) / - range); - const uint32_t mapped_color = SDL_MapRGBA( - surfaces[vertex_buffer_a]->format, r, g, b, 255); + 255.f * (fabsf(float(mesh.vertex_buffer[vertex_buffer_c][idx]) - mid)) / range); + const uint32_t mapped_color = SDL_MapRGBA(surfaces[vertex_buffer_a]->format, r, g, b, + 255); set_pixel(i, j, mapped_color, surfaces[vertex_buffer_a]); } } const float2 pos = (float2){xoffset, yoffset}; - const float2 bbox = (float2){.5f * datasurface_width, - .5f * datasurface_height}; + const float2 bbox = (float2){.5f * datasurface_width, .5f * datasurface_height}; const float2 wsize = (float2){float(window_width), float(window_height)}; const vec4 rectf = project_ortho(pos, bbox, wsize); - SDL_Rect rect = (SDL_Rect){ - int(rectf.x - rectf.w), int(wsize.y - rectf.y - rectf.h), - int(ceil(2.f * rectf.w)), int(ceil(2.f * rectf.h))}; + SDL_Rect rect = (SDL_Rect){int(rectf.x - rectf.w), int(wsize.y - rectf.y - rectf.h), + int(ceil(2.f * rectf.w)), int(ceil(2.f * rectf.h))}; - SDL_Texture* tex = SDL_CreateTextureFromSurface(renderer, - surfaces[vertex_buffer_a]); + SDL_Texture* tex = SDL_CreateTextureFromSurface(renderer, surfaces[vertex_buffer_a]); SDL_RenderCopy(renderer, tex, NULL, &rect); SDL_DestroyTexture(tex); @@ -272,13 +250,11 @@ renderer_draw(const AcMesh& mesh) { for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) draw_vertex_buffer(mesh, VertexBufferHandle(i), i); - draw_vertex_buffer_vec(mesh, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, - NUM_VTXBUF_HANDLES); + draw_vertex_buffer_vec(mesh, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, NUM_VTXBUF_HANDLES); // Drawing done, present SDL_RenderPresent(renderer); - SDL_SetRenderDrawColor(renderer, color_bg.r, color_bg.g, color_bg.b, - color_bg.a); + SDL_SetRenderDrawColor(renderer, color_bg.r, color_bg.g, color_bg.b, color_bg.a); SDL_RenderClear(renderer); for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { @@ -404,14 +380,14 @@ run_renderer(void) /* Step the simulation */ #if 1 - const AcReal umax = acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, - VTXBUF_UUZ); + const AcReal umax = acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ); const AcReal dt = host_timestep(umax, mesh_info); acIntegrate(dt); #else ModelMesh* model_mesh = modelmesh_create(mesh->info); - const AcReal umax = AcReal(model_reduce_vec(*model_mesh, RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ)); - const AcReal dt = host_timestep(umax, mesh_info); + const AcReal umax = AcReal( + model_reduce_vec(*model_mesh, RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ)); + const AcReal dt = host_timestep(umax, mesh_info); acmesh_to_modelmesh(*mesh, model_mesh); model_rk3(dt, model_mesh); modelmesh_to_acmesh(*model_mesh, mesh); @@ -425,7 +401,7 @@ run_renderer(void) /* Render */ const float timer_diff_sec = timer_diff_nsec(frame_timer) / 1e9f; if (timer_diff_sec >= desired_frame_time) { - //acStore(mesh); + // acStore(mesh); const int num_vertices = mesh->info.int_params[AC_mxy]; const int3 dst = (int3){0, 0, k_slice}; acStoreWithOffset(dst, num_vertices, mesh); diff --git a/src/standalone/simulation.cc b/src/standalone/simulation.cc index efa198a..4a4b6a4 100644 --- a/src/standalone/simulation.cc +++ b/src/standalone/simulation.cc @@ -60,23 +60,23 @@ print_diagnostics(const AcMesh& mesh, const int& step, const AcReal& dt) } */ -//Write all setting info into a separate ascii file. This is done to guarantee -//that we have the data specifi information in the thing, even though in -//principle these things are in the astaroth.conf. -static inline -void write_mesh_info(const AcMeshInfo* config) +// Write all setting info into a separate ascii file. This is done to guarantee +// that we have the data specifi information in the thing, even though in +// principle these things are in the astaroth.conf. +static inline void +write_mesh_info(const AcMeshInfo* config) { - + FILE* infotxt; - infotxt = fopen("purge.sh","w"); + infotxt = fopen("purge.sh", "w"); fprintf(infotxt, "#!/bin/bash\n"); fprintf(infotxt, "rm *.list *.mesh *.ts purge.sh\n"); - fclose(infotxt); + fclose(infotxt); - infotxt = fopen("mesh_info.list","w"); + infotxt = fopen("mesh_info.list", "w"); - //Total grid dimensions + // Total grid dimensions fprintf(infotxt, "int AC_mx %i \n", config->int_params[AC_mx]); fprintf(infotxt, "int AC_my %i \n", config->int_params[AC_my]); fprintf(infotxt, "int AC_mz %i \n", config->int_params[AC_mz]); @@ -96,30 +96,28 @@ void write_mesh_info(const AcMeshInfo* config) fprintf(infotxt, "real AC_inv_dsx %e \n", (double)config->real_params[AC_inv_dsx]); fprintf(infotxt, "real AC_inv_dsy %e \n", (double)config->real_params[AC_inv_dsy]); fprintf(infotxt, "real AC_inv_dsz %e \n", (double)config->real_params[AC_inv_dsz]); - fprintf(infotxt, "real AC_dsmin %e \n", (double)config->real_params[AC_dsmin ]); + fprintf(infotxt, "real AC_dsmin %e \n", (double)config->real_params[AC_dsmin]); /* Additional helper params */ // Int helpers - fprintf(infotxt, "int AC_mxy %i \n", config->int_params[AC_mxy ]); - fprintf(infotxt, "int AC_nxy %i \n", config->int_params[AC_nxy ]); + fprintf(infotxt, "int AC_mxy %i \n", config->int_params[AC_mxy]); + fprintf(infotxt, "int AC_nxy %i \n", config->int_params[AC_nxy]); fprintf(infotxt, "int AC_nxyz %i \n", config->int_params[AC_nxyz]); // Real helpers fprintf(infotxt, "real AC_cs2_sound %e \n", (double)config->real_params[AC_cs2_sound]); - fprintf(infotxt, "real AC_cv_sound %e \n", (double)config->real_params[AC_cv_sound ]); + fprintf(infotxt, "real AC_cv_sound %e \n", (double)config->real_params[AC_cv_sound]); fclose(infotxt); } - -//This funtion writes a run state into a set of C binaries. For the sake of -//accuracy, all floating point numbers are to be saved in long double precision -//regardless of the choise of accuracy during runtime. +// This funtion writes a run state into a set of C binaries. For the sake of +// accuracy, all floating point numbers are to be saved in long double precision +// regardless of the choise of accuracy during runtime. static inline void -save_mesh(const AcMesh &save_mesh, const int step, - const AcReal t_step) +save_mesh(const AcMesh& save_mesh, const int step, const AcReal t_step) { - FILE* save_ptr; + FILE* save_ptr; for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w) { const size_t n = AC_VTXBUF_SIZE(save_mesh.info); @@ -128,7 +126,7 @@ save_mesh(const AcMesh &save_mesh, const int step, char cstep[10]; char bin_filename[80] = "\0"; - //sprintf(bin_filename, ""); + // sprintf(bin_filename, ""); sprintf(cstep, "%d", step); @@ -139,30 +137,27 @@ save_mesh(const AcMesh &save_mesh, const int step, printf("Savefile %s \n", bin_filename); - save_ptr = fopen(bin_filename,"wb"); + save_ptr = fopen(bin_filename, "wb"); - //Start file with time stamp - long double write_long_buf = (long double) t_step; + // Start file with time stamp + long double write_long_buf = (long double)t_step; fwrite(&write_long_buf, sizeof(long double), 1, save_ptr); - //Grid data + // Grid data for (size_t i = 0; i < n; ++i) { - const AcReal point_val = save_mesh.vertex_buffer[VertexBufferHandle(w)][i]; - long double write_long_buf = (long double) point_val; + const AcReal point_val = save_mesh.vertex_buffer[VertexBufferHandle(w)][i]; + long double write_long_buf = (long double)point_val; fwrite(&write_long_buf, sizeof(long double), 1, save_ptr); } fclose(save_ptr); } - } - - // This function prints out the diagnostic values to std.out and also saves and -// appends an ascii file to contain all the result. +// appends an ascii file to contain all the result. static inline void -print_diagnostics(const int step, const AcReal dt, const AcReal t_step, FILE *diag_file) +print_diagnostics(const int step, const AcReal dt, const AcReal t_step, FILE* diag_file) { - + AcReal buf_rms, buf_max, buf_min; const int max_name_width = 16; @@ -172,20 +167,19 @@ print_diagnostics(const int step, const AcReal dt, const AcReal t_step, FILE *di buf_rms = acReduceVec(RTYPE_RMS, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ); // MV: The ordering in the earlier version was wrong in terms of variable - // MV: name and its diagnostics. + // MV: name and its diagnostics. printf("Step %d, t_step %.3e, dt %e s\n", step, double(t_step), double(dt)); - printf(" %*s: min %.3e,\trms %.3e,\tmax %.3e\n", max_name_width, "uu total", - double(buf_min), double(buf_rms), double(buf_max)); - fprintf(diag_file, "%d %e %e %e %e %e ", step, double(t_step), double(dt), - double(buf_min), double(buf_rms), double(buf_max)); - + printf(" %*s: min %.3e,\trms %.3e,\tmax %.3e\n", max_name_width, "uu total", double(buf_min), + double(buf_rms), double(buf_max)); + fprintf(diag_file, "%d %e %e %e %e %e ", step, double(t_step), double(dt), double(buf_min), + double(buf_rms), double(buf_max)); // Calculate rms, min and max from the variables as scalars for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { buf_max = acReduceScal(RTYPE_MAX, VertexBufferHandle(i)); buf_min = acReduceScal(RTYPE_MIN, VertexBufferHandle(i)); buf_rms = acReduceScal(RTYPE_RMS, VertexBufferHandle(i)); - + printf(" %*s: min %.3e,\trms %.3e,\tmax %.3e\n", max_name_width, vtxbuf_names[i], double(buf_min), double(buf_rms), double(buf_max)); fprintf(diag_file, "%e %e %e ", double(buf_min), double(buf_rms), double(buf_max)); @@ -194,11 +188,11 @@ print_diagnostics(const int step, const AcReal dt, const AcReal t_step, FILE *di fprintf(diag_file, "\n"); } - /* - MV NOTE: At the moment I have no clear idea how to calculate magnetic - diagnostic variables from grid. Vector potential measures have a limited - value. TODO: Smart way to get brms, bmin and bmax. - */ +/* + MV NOTE: At the moment I have no clear idea how to calculate magnetic + diagnostic variables from grid. Vector potential measures have a limited + value. TODO: Smart way to get brms, bmin and bmax. +*/ int run_simulation(void) @@ -213,16 +207,16 @@ run_simulation(void) acInit(mesh_info); acLoad(*mesh); - - FILE *diag_file; + FILE* diag_file; diag_file = fopen("timeseries.ts", "a"); - // TODO Get time from earlier state. + // TODO Get time from earlier state. AcReal t_step = 0.0; // Generate the title row. fprintf(diag_file, "step t_step dt uu_total_min uu_total_rms uu_total_max "); for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { - fprintf(diag_file, "%s_min %s_rms %s_max ", vtxbuf_names[i], vtxbuf_names[i], vtxbuf_names[i]); + fprintf(diag_file, "%s_min %s_rms %s_max ", vtxbuf_names[i], vtxbuf_names[i], + vtxbuf_names[i]); } fprintf(diag_file, "\n"); @@ -234,56 +228,54 @@ run_simulation(void) acStore(mesh); save_mesh(*mesh, 0, t_step); - const int max_steps = mesh_info.int_params[AC_max_steps]; + const int max_steps = mesh_info.int_params[AC_max_steps]; const int save_steps = mesh_info.int_params[AC_save_steps]; - const int bin_save_steps = mesh_info.int_params[AC_bin_steps]; //TODO Get from mesh_info + const int bin_save_steps = mesh_info.int_params[AC_bin_steps]; // TODO Get from mesh_info AcReal bin_save_t = mesh_info.real_params[AC_bin_save_t]; AcReal bin_crit_t = bin_save_t; /* Step the simulation */ for (int i = 1; i < max_steps; ++i) { - const AcReal umax = acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, - VTXBUF_UUZ); + const AcReal umax = acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ); const AcReal dt = host_timestep(umax, mesh_info); acIntegrate(dt); - t_step += dt; + t_step += dt; /* Save the simulation state and print diagnostics */ if ((i % save_steps) == 0) { /* - print_diagnostics() writes out both std.out printout from the - results and saves the diagnostics into a table for ascii file + print_diagnostics() writes out both std.out printout from the + results and saves the diagnostics into a table for ascii file timeseries.ts. */ print_diagnostics(i, dt, t_step, diag_file); /* - We would also might want an XY-average calculating funtion, - which can be very useful when observing behaviour of turbulent + We would also might want an XY-average calculating funtion, + which can be very useful when observing behaviour of turbulent simulations. (TODO) */ - } /* Save the simulation state and print diagnostics */ if ((i % bin_save_steps) == 0 || t_step >= bin_crit_t) { /* - This loop saves the data into simple C binaries which can be + This loop saves the data into simple C binaries which can be used for analysing the data snapshots closely. - - Saving simulation state should happen in a separate stage. We do - not want to save it as often as diagnostics. The file format - should IDEALLY be HDF5 which has become a well supported, portable and - reliable data format when it comes to HPC applications. - However, implementing it will have to for more simpler approach + + Saving simulation state should happen in a separate stage. We do + not want to save it as often as diagnostics. The file format + should IDEALLY be HDF5 which has become a well supported, portable and + reliable data format when it comes to HPC applications. + However, implementing it will have to for more simpler approach to function. (TODO?) */ - + /* The updated mesh will be located on the GPU. Also all calls to the astaroth interface (functions beginning with ac*) are @@ -299,10 +291,8 @@ run_simulation(void) save_mesh(*mesh, i, t_step); - bin_crit_t += bin_save_t; - + bin_crit_t += bin_save_t; } - } //////Save the final snapshot @@ -318,25 +308,3 @@ run_simulation(void) return 0; } - - - - - - - - - - - - - - - - - - - - - - diff --git a/src/standalone/timer_hires.h b/src/standalone/timer_hires.h index 6b9723b..3a79419 100644 --- a/src/standalone/timer_hires.h +++ b/src/standalone/timer_hires.h @@ -52,8 +52,7 @@ timer_diff_nsec(const Timer start) { Timer end; timer_reset(&end); - const long diff = (end.tv_sec - start.tv_sec) * 1000000000l + - (end.tv_nsec - start.tv_nsec); + const long diff = (end.tv_sec - start.tv_sec) * 1000000000l + (end.tv_nsec - start.tv_nsec); return diff; }