Autoformatted all CUDA/C/C++ code

This commit is contained in:
jpekkila
2019-06-18 16:42:56 +03:00
parent 6fdc4cddb2
commit 8864266042
12 changed files with 1053 additions and 1111 deletions

View File

@@ -316,9 +316,13 @@ traverse(const ASTNode* node)
if (symbol_table[i].type_qualifier == IN) { if (symbol_table[i].type_qualifier == IN) {
printf("const %sData %s = READ(%s%s);\n", translate(symbol_table[i].type_specifier), printf("const %sData %s = READ(%s%s);\n", translate(symbol_table[i].type_specifier),
symbol_table[i].identifier, inout_name_prefix, symbol_table[i].identifier); symbol_table[i].identifier, inout_name_prefix, symbol_table[i].identifier);
} else if (symbol_table[i].type_qualifier == OUT) { }
printf("%s %s = READ_OUT(%s%s);", translate(symbol_table[i].type_specifier), symbol_table[i].identifier, inout_name_prefix, symbol_table[i].identifier); else if (symbol_table[i].type_qualifier == OUT) {
//printf("%s %s = buffer.out[%s%s][IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z)];\n", translate(symbol_table[i].type_specifier), symbol_table[i].identifier, inout_name_prefix, symbol_table[i].identifier); printf("%s %s = READ_OUT(%s%s);", translate(symbol_table[i].type_specifier),
symbol_table[i].identifier, inout_name_prefix, symbol_table[i].identifier);
// printf("%s %s = buffer.out[%s%s][IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z)];\n",
// translate(symbol_table[i].type_specifier), symbol_table[i].identifier,
// inout_name_prefix, symbol_table[i].identifier);
} }
} }
} }
@@ -326,8 +330,7 @@ traverse(const ASTNode* node)
// Preprocessed parameter boilerplate // Preprocessed parameter boilerplate
if (node->type == NODE_TYPE_QUALIFIER && node->token == PREPROCESSED) if (node->type == NODE_TYPE_QUALIFIER && node->token == PREPROCESSED)
inside_preprocessed = true; inside_preprocessed = true;
static const char static const char preprocessed_parameter_boilerplate[] = "const int3 vertexIdx, ";
preprocessed_parameter_boilerplate[] = "const int3 vertexIdx, ";
if (inside_preprocessed && node->type == NODE_FUNCTION_PARAMETER_DECLARATION) if (inside_preprocessed && node->type == NODE_FUNCTION_PARAMETER_DECLARATION)
printf("%s ", preprocessed_parameter_boilerplate); printf("%s ", preprocessed_parameter_boilerplate);
// BOILERPLATE END//////////////////////////////////////////////////////// // BOILERPLATE END////////////////////////////////////////////////////////
@@ -343,7 +346,6 @@ traverse(const ASTNode* node)
if (node->type == NODE_FUNCTION_DECLARATION) if (node->type == NODE_FUNCTION_DECLARATION)
inside_function_declaration = false; inside_function_declaration = false;
// If the node is a subscript expression and the expression list inside it is not empty // If the node is a subscript expression and the expression list inside it is not empty
if (node->type == NODE_MULTIDIM_SUBSCRIPT_EXPRESSION && node->rhs) if (node->type == NODE_MULTIDIM_SUBSCRIPT_EXPRESSION && node->rhs)
printf("IDX("); printf("IDX(");
@@ -354,7 +356,7 @@ traverse(const ASTNode* node)
if (handle >= 0) { // The variable exists in the symbol table if (handle >= 0) { // The variable exists in the symbol table
const Symbol* symbol = &symbol_table[handle]; const Symbol* symbol = &symbol_table[handle];
//if (symbol->type_qualifier == OUT) { // if (symbol->type_qualifier == OUT) {
// printf("%s%s", inout_name_prefix, symbol->identifier); // printf("%s%s", inout_name_prefix, symbol->identifier);
//} //}
if (symbol->type_qualifier == UNIFORM) { if (symbol->type_qualifier == UNIFORM) {
@@ -394,14 +396,16 @@ traverse(const ASTNode* node)
// Postfix logic %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% // Postfix logic %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
// If the node is a subscript expression and the expression list inside it is not empty // If the node is a subscript expression and the expression list inside it is not empty
if (node->type == NODE_MULTIDIM_SUBSCRIPT_EXPRESSION && node->rhs) if (node->type == NODE_MULTIDIM_SUBSCRIPT_EXPRESSION && node->rhs)
printf(")"); // Closing bracket of IDX() printf(")"); // Closing bracket of IDX()
// Generate writeback boilerplate for OUT fields // Generate writeback boilerplate for OUT fields
if (inside_kernel && node->type == NODE_COMPOUND_STATEMENT) { if (inside_kernel && node->type == NODE_COMPOUND_STATEMENT) {
for (int i = 0; i < num_symbols; ++i) { for (int i = 0; i < num_symbols; ++i) {
if (symbol_table[i].type_qualifier == OUT) { if (symbol_table[i].type_qualifier == OUT) {
printf("WRITE_OUT(%s%s, %s);\n", inout_name_prefix, symbol_table[i].identifier, symbol_table[i].identifier); printf("WRITE_OUT(%s%s, %s);\n", inout_name_prefix, symbol_table[i].identifier,
//printf("buffer.out[%s%s][IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z)] = %s;\n", inout_name_prefix, symbol_table[i].identifier, symbol_table[i].identifier); symbol_table[i].identifier);
// printf("buffer.out[%s%s][IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z)] = %s;\n",
// inout_name_prefix, symbol_table[i].identifier, symbol_table[i].identifier);
} }
} }
} }
@@ -486,8 +490,8 @@ generate_preprocessed_structures(void)
for (int i = 0; i < num_symbols; ++i) { for (int i = 0; i < num_symbols; ++i) {
if (symbol_table[i].type_qualifier == PREPROCESSED) if (symbol_table[i].type_qualifier == PREPROCESSED)
printf("data.%s = preprocessed_%s(vertexIdx, buf[handle]);\n", symbol_table[i].identifier, printf("data.%s = preprocessed_%s(vertexIdx, buf[handle]);\n",
symbol_table[i].identifier); symbol_table[i].identifier, symbol_table[i].identifier);
} }
printf("return data;\n"); printf("return data;\n");
printf("}\n"); printf("}\n");

View File

@@ -41,7 +41,6 @@ extern "C" {
#include <stdlib.h> // size_t #include <stdlib.h> // size_t
#include <vector_types.h> // CUDA vector types (float4, etc) #include <vector_types.h> // CUDA vector types (float4, etc)
/* /*
* ============================================================================= * =============================================================================
* Flags for auto-optimization * Flags for auto-optimization
@@ -59,7 +58,6 @@ extern "C" {
#define NUM_ITERATIONS (10) #define NUM_ITERATIONS (10)
#define WARP_SIZE (32) #define WARP_SIZE (32)
/* /*
* ============================================================================= * =============================================================================
* Compile-time constants used during simulation (user definable) * Compile-time constants used during simulation (user definable)
@@ -75,7 +73,8 @@ extern "C" {
// L-prefix inherited from the old Astaroth, no idea what it means // L-prefix inherited from the old Astaroth, no idea what it means
// MV: L means a Logical switch variable, something having true or false value. // MV: L means a Logical switch variable, something having true or false value.
#define LFORCING (0) // Note: forcing is disabled currently in the files generated by acc (compiler of our DSL) // Note: forcing is disabled currently in the files generated by acc (compiler of our DSL)
#define LFORCING (0)
#define LINDUCTION (1) #define LINDUCTION (1)
#define LENTROPY (1) #define LENTROPY (1)
#define LTEMPERATURE (0) #define LTEMPERATURE (0)
@@ -258,28 +257,16 @@ typedef enum { AC_SUCCESS = 0, AC_FAILURE = 1 } AcResult;
* Reduction types * Reduction types
* ============================================================================= * =============================================================================
*/ */
typedef enum { typedef enum { RTYPE_MAX, RTYPE_MIN, RTYPE_RMS, RTYPE_RMS_EXP, NUM_REDUCTION_TYPES } ReductionType;
RTYPE_MAX,
RTYPE_MIN,
RTYPE_RMS,
RTYPE_RMS_EXP,
NUM_REDUCTION_TYPES
} ReductionType;
/* /*
* ============================================================================= * =============================================================================
* Definitions for the enums and structs for AcMeshInfo (DO NOT TOUCH) * Definitions for the enums and structs for AcMeshInfo (DO NOT TOUCH)
* ============================================================================= * =============================================================================
*/ */
typedef enum { typedef enum { AC_FOR_INT_PARAM_TYPES(AC_GEN_ID), NUM_INT_PARAM_TYPES } AcIntParam;
AC_FOR_INT_PARAM_TYPES(AC_GEN_ID),
NUM_INT_PARAM_TYPES
} AcIntParam;
typedef enum { typedef enum { AC_FOR_REAL_PARAM_TYPES(AC_GEN_ID), NUM_REAL_PARAM_TYPES } AcRealParam;
AC_FOR_REAL_PARAM_TYPES(AC_GEN_ID),
NUM_REAL_PARAM_TYPES
} AcRealParam;
extern const char* intparam_names[]; // Defined in astaroth.cu extern const char* intparam_names[]; // Defined in astaroth.cu
extern const char* realparam_names[]; // Defined in astaroth.cu extern const char* realparam_names[]; // Defined in astaroth.cu
@@ -294,9 +281,7 @@ typedef struct {
* Definitions for the enums and structs for AcMesh (DO NOT TOUCH) * Definitions for the enums and structs for AcMesh (DO NOT TOUCH)
* ============================================================================= * =============================================================================
*/ */
typedef enum { typedef enum { AC_FOR_VTXBUF_HANDLES(AC_GEN_ID) NUM_VTXBUF_HANDLES } VertexBufferHandle;
AC_FOR_VTXBUF_HANDLES(AC_GEN_ID) NUM_VTXBUF_HANDLES
} VertexBufferHandle;
extern const char* vtxbuf_names[]; // Defined in astaroth.cu extern const char* vtxbuf_names[]; // Defined in astaroth.cu
@@ -316,22 +301,20 @@ typedef struct {
AcMeshInfo info; AcMeshInfo info;
} AcMesh; } AcMesh;
#define AC_VTXBUF_SIZE(mesh_info) \ #define AC_VTXBUF_SIZE(mesh_info) \
((size_t)(mesh_info.int_params[AC_mx] * mesh_info.int_params[AC_my] * \ ((size_t)(mesh_info.int_params[AC_mx] * mesh_info.int_params[AC_my] * \
mesh_info.int_params[AC_mz])) mesh_info.int_params[AC_mz]))
#define AC_VTXBUF_SIZE_BYTES(mesh_info) \ #define AC_VTXBUF_SIZE_BYTES(mesh_info) (sizeof(AcReal) * AC_VTXBUF_SIZE(mesh_info))
(sizeof(AcReal) * AC_VTXBUF_SIZE(mesh_info))
#define AC_VTXBUF_COMPDOMAIN_SIZE(mesh_info) \ #define AC_VTXBUF_COMPDOMAIN_SIZE(mesh_info) \
(mesh_info.int_params[AC_nx] * mesh_info.int_params[AC_ny] * \ (mesh_info.int_params[AC_nx] * mesh_info.int_params[AC_ny] * mesh_info.int_params[AC_nz])
mesh_info.int_params[AC_nz])
#define AC_VTXBUF_COMPDOMAIN_SIZE_BYTES(mesh_info) \ #define AC_VTXBUF_COMPDOMAIN_SIZE_BYTES(mesh_info) \
(sizeof(AcReal) * AC_VTXBUF_COMPDOMAIN_SIZE(mesh_info)) (sizeof(AcReal) * AC_VTXBUF_COMPDOMAIN_SIZE(mesh_info))
#define AC_VTXBUF_IDX(i, j, k, mesh_info) \ #define AC_VTXBUF_IDX(i, j, k, mesh_info) \
((i) + (j)*mesh_info.int_params[AC_mx] + \ ((i) + (j)*mesh_info.int_params[AC_mx] + \
(k)*mesh_info.int_params[AC_mx] * mesh_info.int_params[AC_my]) (k)*mesh_info.int_params[AC_mx] * mesh_info.int_params[AC_my])
/* /*

View File

@@ -28,33 +28,24 @@
#include "errchk.h" #include "errchk.h"
#include "device.cuh" #include "device.cuh"
#include "math_utils.h" // sum for reductions #include "math_utils.h" // sum for reductions
#include "standalone/config_loader.h" // update_config #include "standalone/config_loader.h" // update_config
const char* intparam_names[] = {AC_FOR_INT_PARAM_TYPES(AC_GEN_STR)}; const char* intparam_names[] = {AC_FOR_INT_PARAM_TYPES(AC_GEN_STR)};
const char* realparam_names[] = {AC_FOR_REAL_PARAM_TYPES(AC_GEN_STR)}; const char* realparam_names[] = {AC_FOR_REAL_PARAM_TYPES(AC_GEN_STR)};
const char* vtxbuf_names[] = {AC_FOR_VTXBUF_HANDLES(AC_GEN_STR)}; const char* vtxbuf_names[] = {AC_FOR_VTXBUF_HANDLES(AC_GEN_STR)};
static const int MAX_NUM_DEVICES = 32;
static const int MAX_NUM_DEVICES = 32; static int num_devices = 1;
static int num_devices = 1;
static Device devices[MAX_NUM_DEVICES] = {}; static Device devices[MAX_NUM_DEVICES] = {};
static Grid static Grid
createGrid(const AcMeshInfo& config) createGrid(const AcMeshInfo& config)
{ {
Grid grid; Grid grid;
grid.m = (int3) {
config.int_params[AC_mx],
config.int_params[AC_my],
config.int_params[AC_mz]
};
grid.n = (int3) { grid.m = (int3){config.int_params[AC_mx], config.int_params[AC_my], config.int_params[AC_mz]};
config.int_params[AC_nx], grid.n = (int3){config.int_params[AC_nx], config.int_params[AC_ny], config.int_params[AC_nz]};
config.int_params[AC_ny],
config.int_params[AC_nz]
};
return grid; return grid;
} }
@@ -71,8 +62,7 @@ gridIdx(const Grid& grid, const int i, const int j, const int k)
static int3 static int3
gridIdx3d(const Grid& grid, const int idx) gridIdx3d(const Grid& grid, const int idx)
{ {
return (int3){idx % grid.m.x, return (int3){idx % grid.m.x, (idx % (grid.m.x * grid.m.y)) / grid.m.x,
(idx % (grid.m.x * grid.m.y)) / grid.m.x,
idx / (grid.m.x * grid.m.y)}; idx / (grid.m.x * grid.m.y)};
} }
@@ -119,10 +109,12 @@ acInit(const AcMeshInfo& config)
ERRCHK_ALWAYS(subgrid.n.y >= STENCIL_ORDER); ERRCHK_ALWAYS(subgrid.n.y >= STENCIL_ORDER);
ERRCHK_ALWAYS(subgrid.n.z >= STENCIL_ORDER); ERRCHK_ALWAYS(subgrid.n.z >= STENCIL_ORDER);
printf("Grid m "); printInt3(grid.m); printf("\n"); // clang-format off
printf("Grid n "); printInt3(grid.n); printf("\n"); printf("Grid m "); printInt3(grid.m); printf("\n");
printf("Grid n "); printInt3(grid.n); printf("\n");
printf("Subrid m "); printInt3(subgrid.m); printf("\n"); printf("Subrid m "); printInt3(subgrid.m); printf("\n");
printf("Subrid n "); printInt3(subgrid.n); printf("\n"); printf("Subrid n "); printInt3(subgrid.n); printf("\n");
// clang-format on
// Initialize the devices // Initialize the devices
for (int i = 0; i < num_devices; ++i) { for (int i = 0; i < num_devices; ++i) {
@@ -202,8 +194,10 @@ acLoadWithOffset(const AcMesh& host_mesh, const int3& src, const int num_vertice
*/ */
if (db.z >= da.z) { if (db.z >= da.z) {
const int copy_cells = gridIdxx(subgrid, db) - gridIdxx(subgrid, da); const int copy_cells = gridIdxx(subgrid, db) - gridIdxx(subgrid, da);
const int3 da_local = (int3){da.x, da.y, da.z - i * grid.n.z / num_devices}; // DECOMPOSITION OFFSET HERE const int3 da_local = (int3){
// printf("\t\tcopy %d cells to local index ", copy_cells); printInt3(da_local); printf("\n"); da.x, da.y, da.z - i * grid.n.z / num_devices}; // DECOMPOSITION OFFSET HERE
// printf("\t\tcopy %d cells to local index ", copy_cells); printInt3(da_local);
// printf("\n");
copyMeshToDevice(devices[i], STREAM_PRIMARY, host_mesh, da, da_local, copy_cells); copyMeshToDevice(devices[i], STREAM_PRIMARY, host_mesh, da, da_local, copy_cells);
} }
printf("\n"); printf("\n");
@@ -236,8 +230,10 @@ acStoreWithOffset(const int3& src, const int num_vertices, AcMesh* host_mesh)
*/ */
if (db.z >= da.z) { if (db.z >= da.z) {
const int copy_cells = gridIdxx(subgrid, db) - gridIdxx(subgrid, da); const int copy_cells = gridIdxx(subgrid, db) - gridIdxx(subgrid, da);
const int3 da_local = (int3){da.x, da.y, da.z - i * grid.n.z / num_devices}; // DECOMPOSITION OFFSET HERE const int3 da_local = (int3){
// printf("\t\tcopy %d cells from local index ", copy_cells); printInt3(da_local); printf("\n"); da.x, da.y, da.z - i * grid.n.z / num_devices}; // DECOMPOSITION OFFSET HERE
// printf("\t\tcopy %d cells from local index ", copy_cells); printInt3(da_local);
// printf("\n");
copyMeshToHost(devices[i], STREAM_PRIMARY, da_local, da, copy_cells, host_mesh); copyMeshToHost(devices[i], STREAM_PRIMARY, da_local, da, copy_cells, host_mesh);
} }
printf("\n"); printf("\n");
@@ -262,10 +258,9 @@ acStore(AcMesh* host_mesh)
AcResult AcResult
acIntegrateStep(const int& isubstep, const AcReal& dt) acIntegrateStep(const int& isubstep, const AcReal& dt)
{ {
const int3 start = (int3){STENCIL_ORDER/2, STENCIL_ORDER/2, STENCIL_ORDER/2}; const int3 start = (int3){STENCIL_ORDER / 2, STENCIL_ORDER / 2, STENCIL_ORDER / 2};
const int3 end = (int3){STENCIL_ORDER/2 + subgrid.n.x, const int3 end = (int3){STENCIL_ORDER / 2 + subgrid.n.x, STENCIL_ORDER / 2 + subgrid.n.y,
STENCIL_ORDER/2 + subgrid.n.y, STENCIL_ORDER / 2 + subgrid.n.z};
STENCIL_ORDER/2 + subgrid.n.z};
for (int i = 0; i < num_devices; ++i) { for (int i = 0; i < num_devices; ++i) {
rkStep(devices[i], STREAM_PRIMARY, isubstep, start, end, dt); rkStep(devices[i], STREAM_PRIMARY, isubstep, start, end, dt);
} }
@@ -278,121 +273,125 @@ acBoundcondStep(void)
{ {
acSynchronize(); acSynchronize();
if (num_devices == 1) { if (num_devices == 1) {
boundcondStep(devices[0], STREAM_PRIMARY, boundcondStep(devices[0], STREAM_PRIMARY, (int3){0, 0, 0},
(int3){0, 0, 0}, (int3){subgrid.m.x, subgrid.m.y, subgrid.m.z}); (int3){subgrid.m.x, subgrid.m.y, subgrid.m.z});
} else { }
else {
// Local boundary conditions // Local boundary conditions
for (int i = 0; i < num_devices; ++i) { for (int i = 0; i < num_devices; ++i) {
const int3 d0 = (int3){0, 0, STENCIL_ORDER/2}; // DECOMPOSITION OFFSET HERE const int3 d0 = (int3){0, 0, STENCIL_ORDER / 2}; // DECOMPOSITION OFFSET HERE
const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.n.z}; const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.n.z};
boundcondStep(devices[i], STREAM_PRIMARY, d0, d1); boundcondStep(devices[i], STREAM_PRIMARY, d0, d1);
} }
/* /*
// ===MIIKKANOTE START========================================== // ===MIIKKANOTE START==========================================
%JP: The old way for computing boundary conditions conflicts with the %JP: The old way for computing boundary conditions conflicts with the
way we have to do things with multiple GPUs. way we have to do things with multiple GPUs.
The older approach relied on unified memory, which represented the whole The older approach relied on unified memory, which represented the whole
memory area as one huge mesh instead of several smaller ones. However, unified memory memory area as one huge mesh instead of several smaller ones. However, unified memory
in its current state is more meant for quick prototyping when performance is not an issue. in its current state is more meant for quick prototyping when performance is not an issue.
Getting the CUDA driver to migrate data intelligently across GPUs is much more difficult than Getting the CUDA driver to migrate data intelligently across GPUs is much more difficult
when managing the memory explicitly. than when managing the memory explicitly.
In this new approach, I have simplified the multi- and single-GPU layers significantly. In this new approach, I have simplified the multi- and single-GPU layers significantly.
Quick rundown: Quick rundown:
New struct: Grid. There are two global variables, "grid" and "subgrid", which New struct: Grid. There are two global variables, "grid" and "subgrid", which
contain the extents of the whole simulation domain and the decomposed grids, respectively. contain the extents of the whole simulation domain and the decomposed grids,
To simplify things, we require that each GPU is assigned the same amount of respectively. To simplify things, we require that each GPU is assigned the same amount of
therefore each GPU in the node is assigned a "subgrid.m" -sized block of data work, therefore each GPU in the node is assigned a "subgrid.m" -sized block of data to
to work with. work with.
The whole simulation domain is decomposed with respect to the z dimension. The whole simulation domain is decomposed with respect to the z dimension.
For example, if the grid contains (nx, ny, nz) vertices, then the subgrids For example, if the grid contains (nx, ny, nz) vertices, then the subgrids
contain (nx, ny, nz / num_devices) vertices. contain (nx, ny, nz / num_devices) vertices.
A local index (i, j, k) in some subgrid can be mapped to the global grid with A local index (i, j, k) in some subgrid can be mapped to the global grid with
global idx = (i, j, k + device_id * subgrid.n.z) global idx = (i, j, k + device_id * subgrid.n.z)
Terminology: Terminology:
- Single-GPU function: a function defined on the single-GPU layer (device.cu) - Single-GPU function: a function defined on the single-GPU layer (device.cu)
Changes required to this commented code block: Changes required to this commented code block:
- The thread block dimensions (tpb) are no longer passed to the kernel here but in device.cu - The thread block dimensions (tpb) are no longer passed to the kernel here but in
instead. Same holds for any complex index calculations. Instead, the local coordinates device.cu instead. Same holds for any complex index calculations. Instead, the local
should be passed as an int3 type without having to consider how the data is actually coordinates should be passed as an int3 type without having to consider how the data is
laid out in device memory actually laid out in device memory
- The unified memory buffer no longer exists (d_buffer). Instead, we have an opaque handle - The unified memory buffer no longer exists (d_buffer). Instead, we have an opaque
of type "Device" which should be passed to single-GPU functions. In this file, all devices handle of type "Device" which should be passed to single-GPU functions. In this file, all
are stored in a global array "devices[num_devices]". devices are stored in a global array "devices[num_devices]".
- Every single-GPU function is executed asynchronously by default such that we - Every single-GPU function is executed asynchronously by default such that we
can optimize Astaroth by executing memory transactions concurrently with computation. can optimize Astaroth by executing memory transactions concurrently with
Therefore a StreamType should be passed as a parameter to single-GPU functions. computation. Therefore a StreamType should be passed as a parameter to single-GPU functions.
Refresher: CUDA function calls are non-blocking when a stream is explicitly passed Refresher: CUDA function calls are non-blocking when a stream is explicitly passed
as a parameter and commands executing in different streams can be processed as a parameter and commands executing in different streams can be processed
in parallel/concurrently. in parallel/concurrently.
Note on periodic boundaries (might be helpful when implementing other boundary conditions): Note on periodic boundaries (might be helpful when implementing other boundary conditions):
With multiple GPUs, periodic boundary conditions applied on indices ranging from With multiple GPUs, periodic boundary conditions applied on indices ranging from
(0, 0, STENCIL_ORDER/2) to (subgrid.m.x, subgrid.m.y, subgrid.m.z - STENCIL_ORDER/2) (0, 0, STENCIL_ORDER/2) to (subgrid.m.x, subgrid.m.y, subgrid.m.z -
STENCIL_ORDER/2)
on a single device are "local", in the sense that they can be computed without having on a single device are "local", in the sense that they can be computed without
to exchange data with neighboring GPUs. Special care is needed only for transferring having to exchange data with neighboring GPUs. Special care is needed only for transferring
the data to the front and back plates outside this range. In the solution we use the data to the front and back
we solve the local boundaries first, and then just exchange the front and back plates here, we solve the local boundaries first, and then just exchange the front and back plates
in a "ring", like so in a "ring", like so
device_id device_id
(n) <-> 0 <-> 1 <-> ... <-> n <-> (0) (n) <-> 0 <-> 1 <-> ... <-> n <-> (0)
// ======MIIKKANOTE END========================================== // ======MIIKKANOTE END==========================================
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MIIKKANOTE: This code block was essentially <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MIIKKANOTE: This code block was essentially
moved into device.cu, function boundCondStep() moved into device.cu, function
In astaroth.cu, we use acBoundcondStep() boundCondStep() In astaroth.cu, we use acBoundcondStep() just to distribute the work and
just to distribute the work and manage manage communication between GPUs.
communication between GPUs.
printf("Boundconds best dims (%d, %d, %d) %f ms\n", best_dims.x, best_dims.y, best_dims.z, double(best_time) / NUM_ITERATIONS); printf("Boundconds best dims (%d, %d, %d) %f ms\n", best_dims.x, best_dims.y,
best_dims.z, double(best_time) / NUM_ITERATIONS);
exit(0); exit(0);
#else #else
const int depth = (int)ceil(mesh_info.int_params[AC_mz]/(float)num_devices); const int depth = (int)ceil(mesh_info.int_params[AC_mz]/(float)num_devices);
const int3 start = (int3){0, 0, device_id * depth}; const int3 start = (int3){0, 0, device_id * depth};
const int3 end = (int3){mesh_info.int_params[AC_mx], const int3 end = (int3){mesh_info.int_params[AC_mx],
mesh_info.int_params[AC_my], mesh_info.int_params[AC_my],
min((device_id+1) * depth, mesh_info.int_params[AC_mz])}; min((device_id+1) * depth, mesh_info.int_params[AC_mz])};
const dim3 tpb(8,2,8); const dim3 tpb(8,2,8);
// TODO uses the default stream currently // TODO uses the default stream currently
if (mesh_info.int_params[AC_bc_type] == 666) { // TODO MAKE A BETTER SWITCH if (mesh_info.int_params[AC_bc_type] == 666) { // TODO MAKE A BETTER SWITCH
wedge_boundconds(0, tpb, start, end, d_buffer); wedge_boundconds(0, tpb, start, end, d_buffer);
} else { } else {
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i)
periodic_boundconds(0, tpb, start, end, d_buffer.in[i]); periodic_boundconds(0, tpb, start, end, d_buffer.in[i]);
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
*/ */
// Exchange halos // Exchange halos
for (int i = 0; i < num_devices; ++i) { for (int i = 0; i < num_devices; ++i) {
const int num_vertices = subgrid.m.x * subgrid.m.y * STENCIL_ORDER/2; const int num_vertices = subgrid.m.x * subgrid.m.y * STENCIL_ORDER / 2;
// ...|ooooxxx|... -> xxx|ooooooo|... // ...|ooooxxx|... -> xxx|ooooooo|...
{ {
const int3 src = (int3) {0, 0, subgrid.n.z}; const int3 src = (int3){0, 0, subgrid.n.z};
const int3 dst = (int3) {0, 0, 0}; const int3 dst = (int3){0, 0, 0};
copyMeshDeviceToDevice(devices[i], STREAM_PRIMARY, src, devices[(i+1) % num_devices], dst, num_vertices); copyMeshDeviceToDevice(devices[i], STREAM_PRIMARY, src,
devices[(i + 1) % num_devices], dst, num_vertices);
} }
// ...|ooooooo|xxx <- ...|xxxoooo|... // ...|ooooooo|xxx <- ...|xxxoooo|...
{ {
const int3 src = (int3) {0, 0, STENCIL_ORDER/2}; const int3 src = (int3){0, 0, STENCIL_ORDER / 2};
const int3 dst = (int3) {0, 0, STENCIL_ORDER/2 + subgrid.n.z}; const int3 dst = (int3){0, 0, STENCIL_ORDER / 2 + subgrid.n.z};
copyMeshDeviceToDevice(devices[(i+1) % num_devices], STREAM_PRIMARY, src, devices[i], dst, num_vertices); copyMeshDeviceToDevice(devices[(i + 1) % num_devices], STREAM_PRIMARY, src,
devices[i], dst, num_vertices);
} }
} }
} }
@@ -427,26 +426,28 @@ simple_final_reduce_scal(const ReductionType& rtype, const AcReal* results, cons
for (int i = 1; i < n; ++i) { for (int i = 1; i < n; ++i) {
if (rtype == RTYPE_MAX) { if (rtype == RTYPE_MAX) {
res = max(res, results[i]); res = max(res, results[i]);
} else if (rtype == RTYPE_MIN) { }
else if (rtype == RTYPE_MIN) {
res = min(res, results[i]); res = min(res, results[i]);
} else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) { }
else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
res = sum(res, results[i]); res = sum(res, results[i]);
} else { }
else {
ERROR("Invalid rtype"); ERROR("Invalid rtype");
} }
} }
if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) { if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
const AcReal inv_n = AcReal(1.) / (grid.n.x * grid.n.y * grid.n.z); const AcReal inv_n = AcReal(1.) / (grid.n.x * grid.n.y * grid.n.z);
res = sqrt(inv_n * res); res = sqrt(inv_n * res);
} }
return res; return res;
} }
AcReal AcReal
acReduceScal(const ReductionType& rtype, acReduceScal(const ReductionType& rtype, const VertexBufferHandle& vtxbuffer_handle)
const VertexBufferHandle& vtxbuffer_handle)
{ {
AcReal results[num_devices]; AcReal results[num_devices];
for (int i = 0; i < num_devices; ++i) { for (int i = 0; i < num_devices; ++i) {
@@ -457,8 +458,8 @@ acReduceScal(const ReductionType& rtype,
} }
AcReal AcReal
acReduceVec(const ReductionType& rtype, const VertexBufferHandle& a, acReduceVec(const ReductionType& rtype, const VertexBufferHandle& a, const VertexBufferHandle& b,
const VertexBufferHandle& b, const VertexBufferHandle& c) const VertexBufferHandle& c)
{ {
AcReal results[num_devices]; AcReal results[num_devices];
for (int i = 0; i < num_devices; ++i) { for (int i = 0; i < num_devices; ++i) {

View File

@@ -36,7 +36,7 @@ typedef struct {
__constant__ AcMeshInfo d_mesh_info; __constant__ AcMeshInfo d_mesh_info;
__constant__ int3 d_multigpu_offset; __constant__ int3 d_multigpu_offset;
__constant__ Grid globalGrid; __constant__ Grid globalGrid;
#define DCONST_INT(X) (d_mesh_info.int_params[X]) #define DCONST_INT(X) (d_mesh_info.int_params[X])
#define DCONST_REAL(X) (d_mesh_info.real_params[X]) #define DCONST_REAL(X) (d_mesh_info.real_params[X])
#define DEVICE_VTXBUF_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_mx) + (k)*DCONST_INT(AC_mxy)) #define DEVICE_VTXBUF_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_mx) + (k)*DCONST_INT(AC_mxy))
#define DEVICE_1D_COMPDOMAIN_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_nx) + (k)*DCONST_INT(AC_nxy)) #define DEVICE_1D_COMPDOMAIN_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_nx) + (k)*DCONST_INT(AC_nxy))
@@ -76,46 +76,46 @@ printDeviceInfo(const Device device)
printf(" Clock rate (GHz): %g\n", props.clockRate / 1e6); // KHz -> GHz printf(" Clock rate (GHz): %g\n", props.clockRate / 1e6); // KHz -> GHz
printf(" Stream processors: %d\n", props.multiProcessorCount); printf(" Stream processors: %d\n", props.multiProcessorCount);
printf(" SP to DP flops performance ratio: %d:1\n", props.singleToDoublePrecisionPerfRatio); printf(" SP to DP flops performance ratio: %d:1\n", props.singleToDoublePrecisionPerfRatio);
printf(" Compute mode: %d\n", (int)props.computeMode); // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g7eb25f5413a962faad0956d92bae10d0 printf(
" Compute mode: %d\n",
(int)props
.computeMode); // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g7eb25f5413a962faad0956d92bae10d0
// Memory // Memory
printf(" Global memory\n"); printf(" Global memory\n");
printf(" Memory Clock Rate (MHz): %d\n", props.memoryClockRate / (1000)); printf(" Memory Clock Rate (MHz): %d\n", props.memoryClockRate / (1000));
printf(" Memory Bus Width (bits): %d\n", props.memoryBusWidth); printf(" Memory Bus Width (bits): %d\n", props.memoryBusWidth);
printf(" Peak Memory Bandwidth (GiB/s): %f\n", printf(" Peak Memory Bandwidth (GiB/s): %f\n",
2 * (props.memoryClockRate * 1e3) * props.memoryBusWidth / 2 * (props.memoryClockRate * 1e3) * props.memoryBusWidth / (8. * 1024. * 1024. * 1024.));
(8. * 1024. * 1024. * 1024.));
printf(" ECC enabled: %d\n", props.ECCEnabled); printf(" ECC enabled: %d\n", props.ECCEnabled);
// Memory usage // Memory usage
size_t free_bytes, total_bytes; size_t free_bytes, total_bytes;
cudaMemGetInfo(&free_bytes, &total_bytes); cudaMemGetInfo(&free_bytes, &total_bytes);
const size_t used_bytes = total_bytes - free_bytes; const size_t used_bytes = total_bytes - free_bytes;
printf(" Total global mem: %.2f GiB\n", printf(" Total global mem: %.2f GiB\n", props.totalGlobalMem / (1024.0 * 1024 * 1024));
props.totalGlobalMem / (1024.0 * 1024 * 1024));
printf(" Gmem used (GiB): %.2f\n", used_bytes / (1024.0 * 1024 * 1024)); printf(" Gmem used (GiB): %.2f\n", used_bytes / (1024.0 * 1024 * 1024));
printf(" Gmem memory free (GiB): %.2f\n", printf(" Gmem memory free (GiB): %.2f\n", free_bytes / (1024.0 * 1024 * 1024));
free_bytes / (1024.0 * 1024 * 1024)); printf(" Gmem memory total (GiB): %.2f\n", total_bytes / (1024.0 * 1024 * 1024));
printf(" Gmem memory total (GiB): %.2f\n",
total_bytes / (1024.0 * 1024 * 1024));
printf(" Caches\n"); printf(" Caches\n");
printf(" Local L1 cache supported: %d\n", props.localL1CacheSupported); printf(" Local L1 cache supported: %d\n", props.localL1CacheSupported);
printf(" Global L1 cache supported: %d\n", props.globalL1CacheSupported); printf(" Global L1 cache supported: %d\n", props.globalL1CacheSupported);
printf(" L2 size: %d KiB\n", props.l2CacheSize / (1024)); printf(" L2 size: %d KiB\n", props.l2CacheSize / (1024));
printf(" Total const mem: %ld KiB\n", props.totalConstMem / (1024)); printf(" Total const mem: %ld KiB\n", props.totalConstMem / (1024));
printf(" Shared mem per block: %ld KiB\n", printf(" Shared mem per block: %ld KiB\n", props.sharedMemPerBlock / (1024));
props.sharedMemPerBlock / (1024));
printf(" Other\n"); printf(" Other\n");
printf(" Warp size: %d\n", props.warpSize); printf(" Warp size: %d\n", props.warpSize);
// printf(" Single to double perf. ratio: %dx\n", // printf(" Single to double perf. ratio: %dx\n",
// props.singleToDoublePrecisionPerfRatio); //Not supported with older CUDA // props.singleToDoublePrecisionPerfRatio); //Not supported with older CUDA
// versions // versions
printf(" Stream priorities supported: %d\n", printf(" Stream priorities supported: %d\n", props.streamPrioritiesSupported);
props.streamPrioritiesSupported);
printf("--------------------------------------------------\n"); printf("--------------------------------------------------\n");
return AC_SUCCESS; return AC_SUCCESS;
} }
static __global__ void dummy_kernel(void) {} static __global__ void
dummy_kernel(void)
{
}
AcResult AcResult
createDevice(const int id, const AcMeshInfo device_config, Device* device_handle) createDevice(const int id, const AcMeshInfo device_config, Device* device_handle)
@@ -124,10 +124,10 @@ createDevice(const int id, const AcMeshInfo device_config, Device* device_handle
cudaDeviceReset(); cudaDeviceReset();
// Create Device // Create Device
struct device_s* device = (struct device_s*) malloc(sizeof(*device)); struct device_s* device = (struct device_s*)malloc(sizeof(*device));
ERRCHK_ALWAYS(device); ERRCHK_ALWAYS(device);
device->id = id; device->id = id;
device->local_config = device_config; device->local_config = device_config;
// Check that the code was compiled for the proper GPU architecture // Check that the code was compiled for the proper GPU architecture
@@ -150,15 +150,14 @@ createDevice(const int id, const AcMeshInfo device_config, Device* device_handle
ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.in[i], vba_size_bytes)); ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.in[i], vba_size_bytes));
ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.out[i], vba_size_bytes)); ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.out[i], vba_size_bytes));
} }
ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->reduce_scratchpad, ERRCHK_CUDA_ALWAYS(
AC_VTXBUF_COMPDOMAIN_SIZE_BYTES(device_config))); cudaMalloc(&device->reduce_scratchpad, AC_VTXBUF_COMPDOMAIN_SIZE_BYTES(device_config)));
ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->reduce_result, sizeof(AcReal))); ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->reduce_result, sizeof(AcReal)));
// Device constants // Device constants
ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_mesh_info, &device_config, sizeof(device_config), 0, ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_mesh_info, &device_config, sizeof(device_config), 0,
cudaMemcpyHostToDevice)); cudaMemcpyHostToDevice));
// Multi-GPU offset. This is used to compute globalVertexIdx. // Multi-GPU offset. This is used to compute globalVertexIdx.
// Might be better to calculate this in astaroth.cu instead of here, s.t. // Might be better to calculate this in astaroth.cu instead of here, s.t.
// everything related to the decomposition is limited to the multi-GPU layer // everything related to the decomposition is limited to the multi-GPU layer
@@ -166,7 +165,6 @@ createDevice(const int id, const AcMeshInfo device_config, Device* device_handle
ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_multigpu_offset, &multigpu_offset, ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_multigpu_offset, &multigpu_offset,
sizeof(multigpu_offset), 0, cudaMemcpyHostToDevice)); sizeof(multigpu_offset), 0, cudaMemcpyHostToDevice));
printf("Created device %d (%p)\n", device->id, device); printf("Created device %d (%p)\n", device->id, device);
*device_handle = device; *device_handle = device;
return AC_SUCCESS; return AC_SUCCESS;
@@ -211,53 +209,44 @@ reduceScal(const Device device, const StreamType stream_type, const ReductionTyp
{ {
cudaSetDevice(device->id); cudaSetDevice(device->id);
const int3 start = (int3) {device->local_config.int_params[AC_nx_min], const int3 start = (int3){device->local_config.int_params[AC_nx_min],
device->local_config.int_params[AC_ny_min], device->local_config.int_params[AC_ny_min],
device->local_config.int_params[AC_nz_min] device->local_config.int_params[AC_nz_min]};
};
const int3 end = (int3) {device->local_config.int_params[AC_nx_max], const int3 end = (int3){device->local_config.int_params[AC_nx_max],
device->local_config.int_params[AC_ny_max], device->local_config.int_params[AC_ny_max],
device->local_config.int_params[AC_nz_max] device->local_config.int_params[AC_nz_max]};
};
*result = reduce_scal(device->streams[stream_type], rtype, *result = reduce_scal(device->streams[stream_type], rtype, start, end,
start, end, device->vba.in[vtxbuf_handle], device->vba.in[vtxbuf_handle], device->reduce_scratchpad,
device->reduce_scratchpad, device->reduce_result); device->reduce_result);
return AC_SUCCESS; return AC_SUCCESS;
} }
AcResult AcResult
reduceVec(const Device device, const StreamType stream_type, reduceVec(const Device device, const StreamType stream_type, const ReductionType rtype,
const ReductionType rtype, const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1,
const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf2, AcReal* result)
const VertexBufferHandle vtxbuf1,
const VertexBufferHandle vtxbuf2,
AcReal* result)
{ {
cudaSetDevice(device->id); cudaSetDevice(device->id);
const int3 start = (int3) {device->local_config.int_params[AC_nx_min], const int3 start = (int3){device->local_config.int_params[AC_nx_min],
device->local_config.int_params[AC_ny_min], device->local_config.int_params[AC_ny_min],
device->local_config.int_params[AC_nz_min] device->local_config.int_params[AC_nz_min]};
};
const int3 end = (int3) {device->local_config.int_params[AC_nx_max], const int3 end = (int3){device->local_config.int_params[AC_nx_max],
device->local_config.int_params[AC_ny_max], device->local_config.int_params[AC_ny_max],
device->local_config.int_params[AC_nz_max] device->local_config.int_params[AC_nz_max]};
};
*result = reduce_vec(device->streams[stream_type], rtype, start, end, *result = reduce_vec(device->streams[stream_type], rtype, start, end, device->vba.in[vtxbuf0],
device->vba.in[vtxbuf0], device->vba.in[vtxbuf1], device->vba.in[vtxbuf2],
device->vba.in[vtxbuf1], device->reduce_scratchpad, device->reduce_result);
device->vba.in[vtxbuf2],
device->reduce_scratchpad, device->reduce_result);
return AC_SUCCESS; return AC_SUCCESS;
} }
AcResult AcResult
rkStep(const Device device, const StreamType stream_type, const int step_number, rkStep(const Device device, const StreamType stream_type, const int step_number, const int3& start,
const int3& start, const int3& end, const AcReal dt) const int3& end, const AcReal dt)
{ {
cudaSetDevice(device->id); cudaSetDevice(device->id);
rk3_step_async(device->streams[stream_type], step_number, start, end, dt, &device->vba); rk3_step_async(device->streams[stream_type], step_number, start, end, dt, &device->vba);
@@ -270,65 +259,62 @@ synchronize(const Device device, const StreamType stream_type)
cudaSetDevice(device->id); cudaSetDevice(device->id);
if (stream_type == STREAM_ALL) { if (stream_type == STREAM_ALL) {
cudaDeviceSynchronize(); cudaDeviceSynchronize();
} else { }
else {
cudaStreamSynchronize(device->streams[stream_type]); cudaStreamSynchronize(device->streams[stream_type]);
} }
return AC_SUCCESS; return AC_SUCCESS;
} }
static AcResult static AcResult
loadWithOffset(const Device device, const StreamType stream_type, loadWithOffset(const Device device, const StreamType stream_type, const AcReal* src,
const AcReal* src, const size_t bytes, AcReal* dst) const size_t bytes, AcReal* dst)
{ {
cudaSetDevice(device->id); cudaSetDevice(device->id);
ERRCHK_CUDA(cudaMemcpyAsync(dst, src, bytes, cudaMemcpyHostToDevice, ERRCHK_CUDA(
device->streams[stream_type])); cudaMemcpyAsync(dst, src, bytes, cudaMemcpyHostToDevice, device->streams[stream_type]));
return AC_SUCCESS; return AC_SUCCESS;
} }
static AcResult static AcResult
storeWithOffset(const Device device, const StreamType stream_type, storeWithOffset(const Device device, const StreamType stream_type, const AcReal* src,
const AcReal* src, const size_t bytes, AcReal* dst) const size_t bytes, AcReal* dst)
{ {
cudaSetDevice(device->id); cudaSetDevice(device->id);
ERRCHK_CUDA(cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDeviceToHost, ERRCHK_CUDA(
device->streams[stream_type])); cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDeviceToHost, device->streams[stream_type]));
return AC_SUCCESS; return AC_SUCCESS;
} }
AcResult AcResult
copyMeshToDevice(const Device device, const StreamType stream_type, copyMeshToDevice(const Device device, const StreamType stream_type, const AcMesh& host_mesh,
const AcMesh& host_mesh, const int3& src, const int3& dst, const int3& src, const int3& dst, const int num_vertices)
const int num_vertices)
{ {
const size_t src_idx = AC_VTXBUF_IDX(src.x, src.y, src.z, host_mesh.info); const size_t src_idx = AC_VTXBUF_IDX(src.x, src.y, src.z, host_mesh.info);
const size_t dst_idx = AC_VTXBUF_IDX(dst.x, dst.y, dst.z, device->local_config); const size_t dst_idx = AC_VTXBUF_IDX(dst.x, dst.y, dst.z, device->local_config);
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
loadWithOffset(device, stream_type, &host_mesh.vertex_buffer[i][src_idx], num_vertices * sizeof(AcReal), loadWithOffset(device, stream_type, &host_mesh.vertex_buffer[i][src_idx],
&device->vba.in[i][dst_idx]); num_vertices * sizeof(AcReal), &device->vba.in[i][dst_idx]);
} }
return AC_SUCCESS; return AC_SUCCESS;
} }
AcResult AcResult
copyMeshToHost(const Device device, const StreamType stream_type, copyMeshToHost(const Device device, const StreamType stream_type, const int3& src, const int3& dst,
const int3& src, const int3& dst, const int num_vertices, const int num_vertices, AcMesh* host_mesh)
AcMesh* host_mesh)
{ {
const size_t src_idx = AC_VTXBUF_IDX(src.x, src.y, src.z, device->local_config); const size_t src_idx = AC_VTXBUF_IDX(src.x, src.y, src.z, device->local_config);
const size_t dst_idx = AC_VTXBUF_IDX(dst.x, dst.y, dst.z, host_mesh->info); const size_t dst_idx = AC_VTXBUF_IDX(dst.x, dst.y, dst.z, host_mesh->info);
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
storeWithOffset(device, stream_type, &device->vba.in[i][src_idx], storeWithOffset(device, stream_type, &device->vba.in[i][src_idx],
num_vertices * sizeof(AcReal), num_vertices * sizeof(AcReal), &host_mesh->vertex_buffer[i][dst_idx]);
&host_mesh->vertex_buffer[i][dst_idx]);
} }
return AC_SUCCESS; return AC_SUCCESS;
} }
AcResult AcResult
copyMeshDeviceToDevice(const Device src_device, const StreamType stream_type, copyMeshDeviceToDevice(const Device src_device, const StreamType stream_type, const int3& src,
const int3& src, Device dst_device, const int3& dst, Device dst_device, const int3& dst, const int num_vertices)
const int num_vertices)
{ {
cudaSetDevice(src_device->id); cudaSetDevice(src_device->id);
const size_t src_idx = AC_VTXBUF_IDX(src.x, src.y, src.z, src_device->local_config); const size_t src_idx = AC_VTXBUF_IDX(src.x, src.y, src.z, src_device->local_config);
@@ -348,7 +334,7 @@ swapBuffers(const Device device)
{ {
cudaSetDevice(device->id); cudaSetDevice(device->id);
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
AcReal* tmp = device->vba.in[i]; AcReal* tmp = device->vba.in[i];
device->vba.in[i] = device->vba.out[i]; device->vba.in[i] = device->vba.out[i];
device->vba.out[i] = tmp; device->vba.out[i] = tmp;
} }
@@ -364,8 +350,8 @@ loadDeviceConstant(const Device device, const AcIntParam param, const int value)
// Therefore we have to obfuscate the code a bit and compute the offset address before // Therefore we have to obfuscate the code a bit and compute the offset address before
// invoking cudaMemcpyToSymbol. // invoking cudaMemcpyToSymbol.
const size_t offset = (size_t)&d_mesh_info.int_params[param] - (size_t)&d_mesh_info; const size_t offset = (size_t)&d_mesh_info.int_params[param] - (size_t)&d_mesh_info;
ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_mesh_info, &value, sizeof(value), ERRCHK_CUDA_ALWAYS(
offset, cudaMemcpyHostToDevice)); cudaMemcpyToSymbol(d_mesh_info, &value, sizeof(value), offset, cudaMemcpyHostToDevice));
return AC_SUCCESS; return AC_SUCCESS;
} }
@@ -374,8 +360,8 @@ loadDeviceConstant(const Device device, const AcRealParam param, const AcReal va
{ {
cudaSetDevice(device->id); cudaSetDevice(device->id);
const size_t offset = (size_t)&d_mesh_info.real_params[param] - (size_t)&d_mesh_info; const size_t offset = (size_t)&d_mesh_info.real_params[param] - (size_t)&d_mesh_info;
ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_mesh_info, &value, sizeof(value), ERRCHK_CUDA_ALWAYS(
offset, cudaMemcpyHostToDevice)); cudaMemcpyToSymbol(d_mesh_info, &value, sizeof(value), offset, cudaMemcpyHostToDevice));
return AC_SUCCESS; return AC_SUCCESS;
} }
@@ -383,7 +369,7 @@ AcResult
loadGlobalGrid(const Device device, const Grid grid) loadGlobalGrid(const Device device, const Grid grid)
{ {
cudaSetDevice(device->id); cudaSetDevice(device->id);
ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(globalGrid, &grid, sizeof(grid), ERRCHK_CUDA_ALWAYS(
0, cudaMemcpyHostToDevice)); cudaMemcpyToSymbol(globalGrid, &grid, sizeof(grid), 0, cudaMemcpyHostToDevice));
return AC_SUCCESS; return AC_SUCCESS;
} }

View File

@@ -27,12 +27,14 @@
#pragma once #pragma once
#include "astaroth.h" #include "astaroth.h"
// clang-format off
typedef enum { typedef enum {
STREAM_PRIMARY, STREAM_PRIMARY,
STREAM_SECONDARY, STREAM_SECONDARY,
NUM_STREAM_TYPES, NUM_STREAM_TYPES,
STREAM_ALL STREAM_ALL
} StreamType; } StreamType;
// clang-format on
typedef struct { typedef struct {
int3 m; int3 m;
@@ -52,20 +54,17 @@ AcResult createDevice(const int id, const AcMeshInfo device_config, Device* devi
AcResult destroyDevice(Device device); AcResult destroyDevice(Device device);
/** */ /** */
AcResult boundcondStep(const Device device, const StreamType stream_type, AcResult boundcondStep(const Device device, const StreamType stream_type, const int3& start,
const int3& start, const int3& end); const int3& end);
/** */ /** */
AcResult reduceScal(const Device device, const StreamType stream_type, const ReductionType rtype, AcResult reduceScal(const Device device, const StreamType stream_type, const ReductionType rtype,
const VertexBufferHandle vtxbuf_handle, AcReal* result); const VertexBufferHandle vtxbuf_handle, AcReal* result);
/** */ /** */
AcResult reduceVec(const Device device, const StreamType stream_type, AcResult reduceVec(const Device device, const StreamType stream_type, const ReductionType rtype,
const ReductionType rtype, const VertexBufferHandle vec0, const VertexBufferHandle vec1,
const VertexBufferHandle vec0, const VertexBufferHandle vec2, AcReal* result);
const VertexBufferHandle vec1,
const VertexBufferHandle vec2,
AcReal* result);
/** */ /** */
AcResult rkStep(const Device device, const StreamType stream_type, const int step_number, AcResult rkStep(const Device device, const StreamType stream_type, const int step_number,
@@ -81,9 +80,8 @@ AcResult copyMeshToDevice(const Device device, const StreamType stream_type,
const int num_vertices); const int num_vertices);
/** */ /** */
AcResult copyMeshToHost(const Device device, const StreamType stream_type, AcResult copyMeshToHost(const Device device, const StreamType stream_type, const int3& src,
const int3& src, const int3& dst, const int num_vertices, const int3& dst, const int num_vertices, AcMesh* host_mesh);
AcMesh* host_mesh);
/** */ /** */
AcResult copyMeshDeviceToDevice(const Device src, const StreamType stream_type, const int3& src_idx, AcResult copyMeshDeviceToDevice(const Device src, const StreamType stream_type, const int3& src_idx,

View File

@@ -24,7 +24,7 @@
* Detailed info. * Detailed info.
* *
*/ */
#pragma once #pragma once
__global__ void __global__ void
kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vtxbuf) kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vtxbuf)
@@ -38,7 +38,7 @@ kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vtxbuf)
if (i_dst >= end.x || j_dst >= end.y || k_dst >= end.z) if (i_dst >= end.x || j_dst >= end.y || k_dst >= end.z)
return; return;
//if (i_dst >= DCONST_INT(AC_mx) || j_dst >= DCONST_INT(AC_my) || k_dst >= DCONST_INT(AC_mz)) // if (i_dst >= DCONST_INT(AC_mx) || j_dst >= DCONST_INT(AC_my) || k_dst >= DCONST_INT(AC_mz))
// return; // return;
// If destination index is inside the computational domain, return since // If destination index is inside the computational domain, return since
@@ -69,15 +69,15 @@ kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vtxbuf)
j_src += DCONST_INT(AC_ny_min); j_src += DCONST_INT(AC_ny_min);
k_src += DCONST_INT(AC_nz_min); k_src += DCONST_INT(AC_nz_min);
const int src_idx = DEVICE_VTXBUF_IDX(i_src, j_src, k_src); const int src_idx = DEVICE_VTXBUF_IDX(i_src, j_src, k_src);
const int dst_idx = DEVICE_VTXBUF_IDX(i_dst, j_dst, k_dst); const int dst_idx = DEVICE_VTXBUF_IDX(i_dst, j_dst, k_dst);
vtxbuf[dst_idx] = vtxbuf[src_idx]; vtxbuf[dst_idx] = vtxbuf[src_idx];
} }
void void
periodic_boundconds(const cudaStream_t stream, const int3& start, const int3& end, AcReal* vtxbuf) periodic_boundconds(const cudaStream_t stream, const int3& start, const int3& end, AcReal* vtxbuf)
{ {
const dim3 tpb(8,2,8); const dim3 tpb(8, 2, 8);
const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x), const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x),
(unsigned int)ceil((end.y - start.y) / (float)tpb.y), (unsigned int)ceil((end.y - start.y) / (float)tpb.y),
(unsigned int)ceil((end.z - start.z) / (float)tpb.z)); (unsigned int)ceil((end.z - start.z) / (float)tpb.z));
@@ -89,7 +89,6 @@ periodic_boundconds(const cudaStream_t stream, const int3& start, const int3& en
/////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////
#include <assert.h> #include <assert.h>
static __device__ __forceinline__ int static __device__ __forceinline__ int
IDX(const int i) IDX(const int i)
{ {
@@ -120,14 +119,12 @@ create_rotz(const AcReal radians)
return mat; return mat;
} }
#if AC_DOUBLE_PRECISION == 0 #if AC_DOUBLE_PRECISION == 0
#define sin __sinf #define sin __sinf
#define cos __cosf #define cos __cosf
#define exp __expf #define exp __expf
#define rsqrt rsqrtf // hardware reciprocal sqrt #define rsqrt rsqrtf // hardware reciprocal sqrt
#endif // AC_DOUBLE_PRECISION == 0 #endif // AC_DOUBLE_PRECISION == 0
/* /*
typedef struct { typedef struct {
@@ -155,12 +152,11 @@ first_derivative(const AcReal* __restrict__ pencil, const AcReal inv_ds)
#elif STENCIL_ORDER == 6 #elif STENCIL_ORDER == 6
const AcReal coefficients[] = {0, 3.0 / 4.0, -3.0 / 20.0, 1.0 / 60.0}; const AcReal coefficients[] = {0, 3.0 / 4.0, -3.0 / 20.0, 1.0 / 60.0};
#elif STENCIL_ORDER == 8 #elif STENCIL_ORDER == 8
const AcReal coefficients[] = {0, 4.0 / 5.0, -1.0 / 5.0, 4.0 / 105.0, const AcReal coefficients[] = {0, 4.0 / 5.0, -1.0 / 5.0, 4.0 / 105.0, -1.0 / 280.0};
-1.0 / 280.0};
#endif #endif
#define MID (STENCIL_ORDER / 2) #define MID (STENCIL_ORDER / 2)
AcReal res = 0; AcReal res = 0;
#pragma unroll #pragma unroll
for (int i = 1; i <= MID; ++i) for (int i = 1; i <= MID; ++i)
@@ -175,17 +171,15 @@ second_derivative(const AcReal* __restrict__ pencil, const AcReal inv_ds)
#if STENCIL_ORDER == 2 #if STENCIL_ORDER == 2
const AcReal coefficients[] = {-2., 1.}; const AcReal coefficients[] = {-2., 1.};
#elif STENCIL_ORDER == 4 #elif STENCIL_ORDER == 4
const AcReal coefficients[] = {-5.0/2.0, 4.0/3.0, -1.0/12.0}; const AcReal coefficients[] = {-5.0 / 2.0, 4.0 / 3.0, -1.0 / 12.0};
#elif STENCIL_ORDER == 6 #elif STENCIL_ORDER == 6
const AcReal coefficients[] = {-49.0 / 18.0, 3.0 / 2.0, -3.0 / 20.0, const AcReal coefficients[] = {-49.0 / 18.0, 3.0 / 2.0, -3.0 / 20.0, 1.0 / 90.0};
1.0 / 90.0};
#elif STENCIL_ORDER == 8 #elif STENCIL_ORDER == 8
const AcReal coefficients[] = {-205.0 / 72.0, 8.0 / 5.0, -1.0 / 5.0, const AcReal coefficients[] = {-205.0 / 72.0, 8.0 / 5.0, -1.0 / 5.0, 8.0 / 315.0, -1.0 / 560.0};
8.0 / 315.0, -1.0 / 560.0};
#endif #endif
#define MID (STENCIL_ORDER / 2) #define MID (STENCIL_ORDER / 2)
AcReal res = coefficients[0] * pencil[MID]; AcReal res = coefficients[0] * pencil[MID];
#pragma unroll #pragma unroll
for (int i = 1; i <= MID; ++i) for (int i = 1; i <= MID; ++i)
@@ -196,31 +190,29 @@ second_derivative(const AcReal* __restrict__ pencil, const AcReal inv_ds)
/** inv_ds: inverted mesh spacing f.ex. 1. / mesh.int_params[AC_dsx] */ /** inv_ds: inverted mesh spacing f.ex. 1. / mesh.int_params[AC_dsx] */
static __device__ __forceinline__ AcReal static __device__ __forceinline__ AcReal
cross_derivative(const AcReal* __restrict__ pencil_a, cross_derivative(const AcReal* __restrict__ pencil_a, const AcReal* __restrict__ pencil_b,
const AcReal* __restrict__ pencil_b, const AcReal inv_ds_a, const AcReal inv_ds_a, const AcReal inv_ds_b)
const AcReal inv_ds_b)
{ {
#if STENCIL_ORDER == 2 #if STENCIL_ORDER == 2
const AcReal coefficients[] = {0, 1.0 / 4.0}; const AcReal coefficients[] = {0, 1.0 / 4.0};
#elif STENCIL_ORDER == 4 #elif STENCIL_ORDER == 4
const AcReal coefficients[] = {0, 1.0 / 32.0, 1.0 / 64.0}; // TODO correct coefficients, these are just placeholders const AcReal coefficients[] = {
0, 1.0 / 32.0, 1.0 / 64.0}; // TODO correct coefficients, these are just placeholders
#elif STENCIL_ORDER == 6 #elif STENCIL_ORDER == 6
const AcReal fac = (1. / 720.); const AcReal fac = (1. / 720.);
const AcReal coefficients[] = {0.0 * fac, 270.0 * fac, -27.0 * fac, const AcReal coefficients[] = {0.0 * fac, 270.0 * fac, -27.0 * fac, 2.0 * fac};
2.0 * fac};
#elif STENCIL_ORDER == 8 #elif STENCIL_ORDER == 8
const AcReal fac = (1. / 20160.); const AcReal fac = (1. / 20160.);
const AcReal coefficients[] = {0.0 * fac, 8064. * fac, -1008. * fac, const AcReal coefficients[] = {0.0 * fac, 8064. * fac, -1008. * fac, 128. * fac, -9. * fac};
128. * fac, -9. * fac};
#endif #endif
#define MID (STENCIL_ORDER / 2) #define MID (STENCIL_ORDER / 2)
AcReal res = AcReal(0.); AcReal res = AcReal(0.);
#pragma unroll #pragma unroll
for (int i = 1; i <= MID; ++i) { for (int i = 1; i <= MID; ++i) {
res += coefficients[i] * (pencil_a[MID + i] + pencil_a[MID - i] - res += coefficients[i] *
pencil_b[MID + i] - pencil_b[MID - i]); (pencil_a[MID + i] + pencil_a[MID - i] - pencil_b[MID + i] - pencil_b[MID - i]);
} }
return res * inv_ds_a * inv_ds_b; return res * inv_ds_a * inv_ds_b;
} }
@@ -231,7 +223,8 @@ derx(const int3 vertexIdx, const AcReal* __restrict__ arr)
AcReal pencil[STENCIL_ORDER + 1]; AcReal pencil[STENCIL_ORDER + 1];
#pragma unroll #pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset) for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y, vertexIdx.z)]; pencil[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y,
vertexIdx.z)];
return first_derivative(pencil, DCONST_REAL(AC_inv_dsx)); return first_derivative(pencil, DCONST_REAL(AC_inv_dsx));
} }
@@ -242,7 +235,8 @@ derxx(const int3 vertexIdx, const AcReal* __restrict__ arr)
AcReal pencil[STENCIL_ORDER + 1]; AcReal pencil[STENCIL_ORDER + 1];
#pragma unroll #pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset) for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y, vertexIdx.z)]; pencil[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y,
vertexIdx.z)];
return second_derivative(pencil, DCONST_REAL(AC_inv_dsx)); return second_derivative(pencil, DCONST_REAL(AC_inv_dsx));
} }
@@ -262,8 +256,7 @@ derxy(const int3 vertexIdx, const AcReal* __restrict__ arr)
pencil_b[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, pencil_b[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2,
vertexIdx.y + STENCIL_ORDER / 2 - offset, vertexIdx.z)]; vertexIdx.y + STENCIL_ORDER / 2 - offset, vertexIdx.z)];
return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsx), return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsx), DCONST_REAL(AC_inv_dsy));
DCONST_REAL(AC_inv_dsy));
} }
static __device__ __forceinline__ AcReal static __device__ __forceinline__ AcReal
@@ -281,8 +274,7 @@ derxz(const int3 vertexIdx, const AcReal* __restrict__ arr)
pencil_b[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y, pencil_b[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y,
vertexIdx.z + STENCIL_ORDER / 2 - offset)]; vertexIdx.z + STENCIL_ORDER / 2 - offset)];
return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsx), return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsx), DCONST_REAL(AC_inv_dsz));
DCONST_REAL(AC_inv_dsz));
} }
static __device__ __forceinline__ AcReal static __device__ __forceinline__ AcReal
@@ -291,7 +283,8 @@ dery(const int3 vertexIdx, const AcReal* __restrict__ arr)
AcReal pencil[STENCIL_ORDER + 1]; AcReal pencil[STENCIL_ORDER + 1];
#pragma unroll #pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset) for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2, vertexIdx.z)]; pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2,
vertexIdx.z)];
return first_derivative(pencil, DCONST_REAL(AC_inv_dsy)); return first_derivative(pencil, DCONST_REAL(AC_inv_dsy));
} }
@@ -302,7 +295,8 @@ deryy(const int3 vertexIdx, const AcReal* __restrict__ arr)
AcReal pencil[STENCIL_ORDER + 1]; AcReal pencil[STENCIL_ORDER + 1];
#pragma unroll #pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset) for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2, vertexIdx.z)]; pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2,
vertexIdx.z)];
return second_derivative(pencil, DCONST_REAL(AC_inv_dsy)); return second_derivative(pencil, DCONST_REAL(AC_inv_dsy));
} }
@@ -322,8 +316,7 @@ deryz(const int3 vertexIdx, const AcReal* __restrict__ arr)
pencil_b[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2, pencil_b[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2,
vertexIdx.z + STENCIL_ORDER / 2 - offset)]; vertexIdx.z + STENCIL_ORDER / 2 - offset)];
return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsy), return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsy), DCONST_REAL(AC_inv_dsz));
DCONST_REAL(AC_inv_dsz));
} }
static __device__ __forceinline__ AcReal static __device__ __forceinline__ AcReal
@@ -332,7 +325,8 @@ derz(const int3 vertexIdx, const AcReal* __restrict__ arr)
AcReal pencil[STENCIL_ORDER + 1]; AcReal pencil[STENCIL_ORDER + 1];
#pragma unroll #pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset) for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z + offset - STENCIL_ORDER / 2)]; pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y,
vertexIdx.z + offset - STENCIL_ORDER / 2)];
return first_derivative(pencil, DCONST_REAL(AC_inv_dsz)); return first_derivative(pencil, DCONST_REAL(AC_inv_dsz));
} }
@@ -343,7 +337,8 @@ derzz(const int3 vertexIdx, const AcReal* __restrict__ arr)
AcReal pencil[STENCIL_ORDER + 1]; AcReal pencil[STENCIL_ORDER + 1];
#pragma unroll #pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset) for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z + offset - STENCIL_ORDER / 2)]; pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y,
vertexIdx.z + offset - STENCIL_ORDER / 2)];
return second_derivative(pencil, DCONST_REAL(AC_inv_dsz)); return second_derivative(pencil, DCONST_REAL(AC_inv_dsz));
} }
@@ -401,8 +396,7 @@ operator-(const AcReal3& a)
return (AcReal3){-a.x, -a.y, -a.z}; return (AcReal3){-a.x, -a.y, -a.z};
} }
static __host__ __device__ __forceinline__ AcReal3 static __host__ __device__ __forceinline__ AcReal3 operator*(const AcReal a, const AcReal3& b)
operator*(const AcReal a, const AcReal3& b)
{ {
return (AcReal3){a * b.x, a * b.y, a * b.z}; return (AcReal3){a * b.x, a * b.y, a * b.z};
} }
@@ -443,7 +437,6 @@ is_valid(const AcReal3& a)
return is_valid(a.x) && is_valid(a.y) && is_valid(a.z); return is_valid(a.x) && is_valid(a.y) && is_valid(a.z);
} }
/* /*
* ============================================================================= * =============================================================================
* Level 1 (Stencil Processing Stage) * Level 1 (Stencil Processing Stage)
@@ -476,8 +469,7 @@ laplace_vec(const AcReal3Data& vec)
static __device__ __forceinline__ AcReal3 static __device__ __forceinline__ AcReal3
curl(const AcReal3Data& vec) curl(const AcReal3Data& vec)
{ {
return (AcReal3){gradient(vec.z).y - gradient(vec.y).z, return (AcReal3){gradient(vec.z).y - gradient(vec.y).z, gradient(vec.x).z - gradient(vec.z).x,
gradient(vec.x).z - gradient(vec.z).x,
gradient(vec.y).x - gradient(vec.x).y}; gradient(vec.y).x - gradient(vec.x).y};
} }
@@ -520,7 +512,7 @@ contract(const AcMatrix& mat)
{ {
AcReal res = 0; AcReal res = 0;
#pragma unroll #pragma unroll
for (int i = 0; i < 3; ++i) for (int i = 0; i < 3; ++i)
res += dot(mat.row[i], mat.row[i]); res += dot(mat.row[i], mat.row[i]);
@@ -558,12 +550,13 @@ __constant__ AcReal forcing_phi;
static __device__ __forceinline__ AcReal3 static __device__ __forceinline__ AcReal3
forcing(const int i, const int j, const int k) forcing(const int i, const int j, const int k)
{ {
#define DOMAIN_SIZE_X (DCONST_INT(AC_nx) * DCONST_REAL(AC_dsx)) #define DOMAIN_SIZE_X (DCONST_INT(AC_nx) * DCONST_REAL(AC_dsx))
#define DOMAIN_SIZE_Y (DCONST_INT(AC_ny) * DCONST_REAL(AC_dsy)) #define DOMAIN_SIZE_Y (DCONST_INT(AC_ny) * DCONST_REAL(AC_dsy))
#define DOMAIN_SIZE_Z (DCONST_INT(AC_nz) * DCONST_REAL(AC_dsz)) #define DOMAIN_SIZE_Z (DCONST_INT(AC_nz) * DCONST_REAL(AC_dsz))
const AcReal3 k_vec = (AcReal3){(i - DCONST_INT(AC_nx_min)) * DCONST_REAL(AC_dsx) - AcReal(.5) * DOMAIN_SIZE_X, const AcReal3 k_vec = (AcReal3){
(j - DCONST_INT(AC_ny_min)) * DCONST_REAL(AC_dsy) - AcReal(.5) * DOMAIN_SIZE_Y, (i - DCONST_INT(AC_nx_min)) * DCONST_REAL(AC_dsx) - AcReal(.5) * DOMAIN_SIZE_X,
(k - DCONST_INT(AC_nz_min)) * DCONST_REAL(AC_dsz) - AcReal(.5) * DOMAIN_SIZE_Z}; (j - DCONST_INT(AC_ny_min)) * DCONST_REAL(AC_dsy) - AcReal(.5) * DOMAIN_SIZE_Y,
(k - DCONST_INT(AC_nz_min)) * DCONST_REAL(AC_dsz) - AcReal(.5) * DOMAIN_SIZE_Z};
AcReal inv_len = reciprocal_len(k_vec); AcReal inv_len = reciprocal_len(k_vec);
if (isnan(inv_len) || isinf(inv_len)) if (isnan(inv_len) || isinf(inv_len))
inv_len = 0; inv_len = 0;
@@ -571,46 +564,41 @@ forcing(const int i, const int j, const int k)
inv_len = 2; inv_len = 2;
const AcReal k_dot_x = dot(k_vec, forcing_vec); const AcReal k_dot_x = dot(k_vec, forcing_vec);
const AcReal waves = cos(k_dot_x)*cos(forcing_phi) - sin(k_dot_x) * sin(forcing_phi); const AcReal waves = cos(k_dot_x) * cos(forcing_phi) - sin(k_dot_x) * sin(forcing_phi);
return inv_len * inv_len * waves * forcing_vec; return inv_len * inv_len * waves * forcing_vec;
} }
// Note: LNT0 and LNRHO0 must be set very carefully: if the magnitude is different that other values
// Note: LNT0 and LNRHO0 must be set very carefully: if the magnitude is different that other values in the mesh, then we will inherently lose precision // in the mesh, then we will inherently lose precision
#define LNT0 (AcReal(0.0)) #define LNT0 (AcReal(0.0))
#define LNRHO0 (AcReal(0.0)) #define LNRHO0 (AcReal(0.0))
#define H_CONST (AcReal(0.0)) #define H_CONST (AcReal(0.0))
#define C_CONST (AcReal(0.0)) #define C_CONST (AcReal(0.0))
template <int step_number> template <int step_number>
static __device__ __forceinline__ AcReal static __device__ __forceinline__ AcReal
rk3_integrate(const AcReal state_previous, const AcReal state_current, rk3_integrate(const AcReal state_previous, const AcReal state_current, const AcReal rate_of_change,
const AcReal rate_of_change, const AcReal dt) const AcReal dt)
{ {
// Williamson (1980) // Williamson (1980)
const AcReal alpha[] = {0, AcReal(.0), AcReal(-5. / 9.), AcReal(-153. / 128.)}; const AcReal alpha[] = {0, AcReal(.0), AcReal(-5. / 9.), AcReal(-153. / 128.)};
const AcReal beta[] = {0, AcReal(1. / 3.), AcReal(15. / 16.), const AcReal beta[] = {0, AcReal(1. / 3.), AcReal(15. / 16.), AcReal(8. / 15.)};
AcReal(8. / 15.)};
// Note the indexing: +1 to avoid an unnecessary warning about "out-of-bounds" // Note the indexing: +1 to avoid an unnecessary warning about "out-of-bounds"
// access (when accessing beta[step_number-1] even when step_number >= 1) // access (when accessing beta[step_number-1] even when step_number >= 1)
switch (step_number) { switch (step_number) {
case 0: case 0:
return state_current + beta[step_number + 1] * rate_of_change * dt; return state_current + beta[step_number + 1] * rate_of_change * dt;
case 1: // Fallthrough case 1: // Fallthrough
case 2: case 2:
return state_current + return state_current +
beta[step_number + 1] * beta[step_number + 1] * (alpha[step_number + 1] * (AcReal(1.) / beta[step_number]) *
(alpha[step_number + 1] * (AcReal(1.) / beta[step_number]) * (state_current - state_previous) +
(state_current - state_previous) + rate_of_change * dt);
rate_of_change * dt); default:
default: return NAN;
return NAN;
} }
} }
/* /*
@@ -646,13 +634,14 @@ static __device__ __forceinline__ AcReal3
rk3_integrate(const AcReal3 state_previous, const AcReal3 state_current, rk3_integrate(const AcReal3 state_previous, const AcReal3 state_current,
const AcReal3 rate_of_change, const AcReal dt) const AcReal3 rate_of_change, const AcReal dt)
{ {
return (AcReal3) { rk3_integrate<step_number>(state_previous.x, state_current.x, rate_of_change.x, dt), return (AcReal3){
rk3_integrate<step_number>(state_previous.y, state_current.y, rate_of_change.y, dt), rk3_integrate<step_number>(state_previous.x, state_current.x, rate_of_change.x, dt),
rk3_integrate<step_number>(state_previous.z, state_current.z, rate_of_change.z, dt)}; rk3_integrate<step_number>(state_previous.y, state_current.y, rate_of_change.y, dt),
rk3_integrate<step_number>(state_previous.z, state_current.z, rate_of_change.z, dt)};
} }
#define rk3(state_previous, state_current, rate_of_change, dt)\ #define rk3(state_previous, state_current, rate_of_change, dt) \
rk3_integrate<step_number>(state_previous, value(state_current), rate_of_change, dt) rk3_integrate<step_number>(state_previous, value(state_current), rate_of_change, dt)
/* /*
template <int step_number> template <int step_number>
@@ -708,9 +697,8 @@ read_out(const int idx, AcReal* __restrict__ field[], const int handle)
static __device__ AcReal3 static __device__ AcReal3
read_out(const int idx, AcReal* __restrict__ field[], const int3 handle) read_out(const int idx, AcReal* __restrict__ field[], const int3 handle)
{ {
return (AcReal3) { read_out(idx, field, handle.x), return (AcReal3){read_out(idx, field, handle.x), read_out(idx, field, handle.y),
read_out(idx, field, handle.y), read_out(idx, field, handle.z)};
read_out(idx, field, handle.z) };
} }
#define WRITE_OUT(handle, value) (write(buffer.out, handle, idx, value)) #define WRITE_OUT(handle, value) (write(buffer.out, handle, idx, value))
@@ -718,29 +706,28 @@ read_out(const int idx, AcReal* __restrict__ field[], const int3 handle)
#define READ_OUT(handle) (read_out(idx, buffer.out, handle)) #define READ_OUT(handle) (read_out(idx, buffer.out, handle))
// also write for clarity here also, not for the DSL // also write for clarity here also, not for the DSL
//#define WRITE(HANDLE) (write(idx, ...)) s.t. we don't have to insert boilerplat in the mid of the function //#define WRITE(HANDLE) (write(idx, ...)) s.t. we don't have to insert boilerplat in the mid of the
// function
#define GEN_KERNEL_PARAM_BOILERPLATE \ #define GEN_KERNEL_PARAM_BOILERPLATE const int3 start, const int3 end, VertexBufferArray buffer
const int3 start, const int3 end, VertexBufferArray buffer
#define GEN_KERNEL_BUILTIN_VARIABLES_BOILERPLATE() \ #define GEN_KERNEL_BUILTIN_VARIABLES_BOILERPLATE() \
const int3 vertexIdx = (int3){threadIdx.x + blockIdx.x * blockDim.x + start.x,\ const int3 vertexIdx = (int3){threadIdx.x + blockIdx.x * blockDim.x + start.x, \
threadIdx.y + blockIdx.y * blockDim.y + start.y,\ threadIdx.y + blockIdx.y * blockDim.y + start.y, \
threadIdx.z + blockIdx.z * blockDim.z + start.z};\ threadIdx.z + blockIdx.z * blockDim.z + start.z}; \
const int3 globalVertexIdx = (int3){d_multigpu_offset.x + vertexIdx.x, \ const int3 globalVertexIdx = (int3){d_multigpu_offset.x + vertexIdx.x, \
d_multigpu_offset.y + vertexIdx.y, \ d_multigpu_offset.y + vertexIdx.y, \
d_multigpu_offset.z + vertexIdx.z}; \ d_multigpu_offset.z + vertexIdx.z}; \
if (vertexIdx.x >= end.x || vertexIdx.y >= end.y || vertexIdx.z >= end.z)\ if (vertexIdx.x >= end.x || vertexIdx.y >= end.y || vertexIdx.z >= end.z) \
return;\ return; \
\ \
\ assert(vertexIdx.x < DCONST_INT(AC_nx_max) && vertexIdx.y < DCONST_INT(AC_ny_max) && \
assert(vertexIdx.x < DCONST_INT(AC_nx_max) && vertexIdx.y < DCONST_INT(AC_ny_max) &&\ vertexIdx.z < DCONST_INT(AC_nz_max)); \
vertexIdx.z < DCONST_INT(AC_nz_max));\ \
\ assert(vertexIdx.x >= DCONST_INT(AC_nx_min) && vertexIdx.y >= DCONST_INT(AC_ny_min) && \
assert(vertexIdx.x >= DCONST_INT(AC_nx_min) && vertexIdx.y >= DCONST_INT(AC_ny_min) &&\ vertexIdx.z >= DCONST_INT(AC_nz_min)); \
vertexIdx.z >= DCONST_INT(AC_nz_min));\ \
\ const int idx = IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z);
const int idx = IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z);
#include "stencil_process.cuh" #include "stencil_process.cuh"
@@ -757,33 +744,31 @@ randf(void)
} }
AcResult AcResult
rk3_step_async(const cudaStream_t stream, const int& step_number, const int3& start, const int3& end, rk3_step_async(const cudaStream_t stream, const int& step_number, const int3& start,
const AcReal dt, VertexBufferArray* buffer) const int3& end, const AcReal dt, VertexBufferArray* buffer)
{ {
const dim3 tpb(32, 1, 4); const dim3 tpb(32, 1, 4);
/////////////////// Forcing /////////////////// Forcing
#if LFORCING #if LFORCING
const AcReal ff_scale = AcReal(.2); const AcReal ff_scale = AcReal(.2);
static AcReal3 ff = ff_scale * (AcReal3){1, 0, 0}; static AcReal3 ff = ff_scale * (AcReal3){1, 0, 0};
const AcReal radians = randf() * AcReal(2*M_PI) / 360 / 8; const AcReal radians = randf() * AcReal(2 * M_PI) / 360 / 8;
const AcMatrix rotz = create_rotz(radians); const AcMatrix rotz = create_rotz(radians);
ff = mul(rotz, ff); ff = mul(rotz, ff);
cudaMemcpyToSymbolAsync(forcing_vec, &ff, sizeof(ff), 0, cudaMemcpyHostToDevice, stream); cudaMemcpyToSymbolAsync(forcing_vec, &ff, sizeof(ff), 0, cudaMemcpyHostToDevice, stream);
const AcReal ff_phi = AcReal(M_PI);//AcReal(2 * M_PI) * randf(); const AcReal ff_phi = AcReal(M_PI); // AcReal(2 * M_PI) * randf();
cudaMemcpyToSymbolAsync(forcing_phi, &ff_phi, sizeof(ff_phi), 0, cudaMemcpyHostToDevice, stream); cudaMemcpyToSymbolAsync(forcing_phi, &ff_phi, sizeof(ff_phi), 0, cudaMemcpyHostToDevice,
#endif // LFORCING stream);
#endif // LFORCING
////////////////////////// //////////////////////////
const int nx = end.x - start.x; const int nx = end.x - start.x;
const int ny = end.y - start.y; const int ny = end.y - start.y;
const int nz = end.z - start.z; const int nz = end.z - start.z;
const dim3 bpg( const dim3 bpg((unsigned int)ceil(nx / AcReal(tpb.x)), (unsigned int)ceil(ny / AcReal(tpb.y)),
(unsigned int)ceil(nx / AcReal(tpb.x)), (unsigned int)ceil(nz / AcReal(tpb.z)));
(unsigned int)ceil(ny / AcReal(tpb.y)),
(unsigned int)ceil(nz / AcReal(tpb.z)));
if (step_number == 0) if (step_number == 0)
solve<0><<<bpg, tpb, 0, stream>>>(start, end, *buffer, dt); solve<0><<<bpg, tpb, 0, stream>>>(start, end, *buffer, dt);
@@ -796,7 +781,6 @@ rk3_step_async(const cudaStream_t stream, const int& step_number, const int3& st
return AC_SUCCESS; return AC_SUCCESS;
} }
////////////////REDUCE/////////////////////////// ////////////////REDUCE///////////////////////////
#include "src/core/math_utils.h" // is_power_of_two #include "src/core/math_utils.h" // is_power_of_two
@@ -848,22 +832,19 @@ template <FilterFunc filter>
__global__ void __global__ void
kernel_filter(const __restrict__ AcReal* src, const int3 start, const int3 end, AcReal* dst) kernel_filter(const __restrict__ AcReal* src, const int3 start, const int3 end, AcReal* dst)
{ {
const int3 src_idx = (int3) { const int3 src_idx = (int3){start.x + threadIdx.x + blockIdx.x * blockDim.x,
start.x + threadIdx.x + blockIdx.x * blockDim.x, start.y + threadIdx.y + blockIdx.y * blockDim.y,
start.y + threadIdx.y + blockIdx.y * blockDim.y, start.z + threadIdx.z + blockIdx.z * blockDim.z};
start.z + threadIdx.z + blockIdx.z * blockDim.z
};
const int nx = end.x - start.x; const int nx = end.x - start.x;
const int ny = end.y - start.y; const int ny = end.y - start.y;
const int nz = end.z - start.z; //MV: Added this because it was undefined const int nz = end.z - start.z; // MV: Added this because it was undefined
const int3 dst_idx = (int3) { const int3 dst_idx = (int3){threadIdx.x + blockIdx.x * blockDim.x,
threadIdx.x + blockIdx.x * blockDim.x, threadIdx.y + blockIdx.y * blockDim.y,
threadIdx.y + blockIdx.y * blockDim.y, threadIdx.z + blockIdx.z * blockDim.z};
threadIdx.z + blockIdx.z * blockDim.z
};
assert(src_idx.x < DCONST_INT(AC_nx_max) && src_idx.y < DCONST_INT(AC_ny_max) && src_idx.z < DCONST_INT(AC_nz_max)); assert(src_idx.x < DCONST_INT(AC_nx_max) && src_idx.y < DCONST_INT(AC_ny_max) &&
src_idx.z < DCONST_INT(AC_nz_max));
assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz); assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz); assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);
@@ -872,31 +853,27 @@ kernel_filter(const __restrict__ AcReal* src, const int3 start, const int3 end,
template <FilterFuncVec filter> template <FilterFuncVec filter>
__global__ void __global__ void
kernel_filter_vec(const __restrict__ AcReal* src0, kernel_filter_vec(const __restrict__ AcReal* src0, const __restrict__ AcReal* src1,
const __restrict__ AcReal* src1, const __restrict__ AcReal* src2, const int3 start, const int3 end, AcReal* dst)
const __restrict__ AcReal* src2,
const int3 start, const int3 end, AcReal* dst)
{ {
const int3 src_idx = (int3) { const int3 src_idx = (int3){start.x + threadIdx.x + blockIdx.x * blockDim.x,
start.x + threadIdx.x + blockIdx.x * blockDim.x, start.y + threadIdx.y + blockIdx.y * blockDim.y,
start.y + threadIdx.y + blockIdx.y * blockDim.y, start.z + threadIdx.z + blockIdx.z * blockDim.z};
start.z + threadIdx.z + blockIdx.z * blockDim.z
};
const int nx = end.x - start.x; const int nx = end.x - start.x;
const int ny = end.y - start.y; const int ny = end.y - start.y;
const int nz = end.z - start.z; //MV: Added this because it was undefined const int nz = end.z - start.z; // MV: Added this because it was undefined
const int3 dst_idx = (int3) { const int3 dst_idx = (int3){threadIdx.x + blockIdx.x * blockDim.x,
threadIdx.x + blockIdx.x * blockDim.x, threadIdx.y + blockIdx.y * blockDim.y,
threadIdx.y + blockIdx.y * blockDim.y, threadIdx.z + blockIdx.z * blockDim.z};
threadIdx.z + blockIdx.z * blockDim.z
};
assert(src_idx.x < DCONST_INT(AC_nx_max) && src_idx.y < DCONST_INT(AC_ny_max) && src_idx.z < DCONST_INT(AC_nz_max)); assert(src_idx.x < DCONST_INT(AC_nx_max) && src_idx.y < DCONST_INT(AC_ny_max) &&
src_idx.z < DCONST_INT(AC_nz_max));
assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz); assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz); assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);
dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(src0[IDX(src_idx)], src1[IDX(src_idx)], src2[IDX(src_idx)]); dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(
src0[IDX(src_idx)], src1[IDX(src_idx)], src2[IDX(src_idx)]);
} }
template <ReduceFunc reduce> template <ReduceFunc reduce>
@@ -908,7 +885,8 @@ kernel_reduce(AcReal* scratchpad, const int num_elems)
extern __shared__ AcReal smem[]; extern __shared__ AcReal smem[];
if (idx < num_elems) { if (idx < num_elems) {
smem[threadIdx.x] = scratchpad[idx]; smem[threadIdx.x] = scratchpad[idx];
} else { }
else {
smem[threadIdx.x] = NAN; smem[threadIdx.x] = NAN;
} }
__syncthreads(); __syncthreads();
@@ -930,9 +908,8 @@ kernel_reduce(AcReal* scratchpad, const int num_elems)
template <ReduceFunc reduce> template <ReduceFunc reduce>
__global__ void __global__ void
kernel_reduce_block(const __restrict__ AcReal* scratchpad, kernel_reduce_block(const __restrict__ AcReal* scratchpad, const int num_blocks,
const int num_blocks, const int block_size, const int block_size, AcReal* result)
AcReal* result)
{ {
const int idx = threadIdx.x + blockIdx.x * blockDim.x; const int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx != 0) { if (idx != 0) {
@@ -946,23 +923,19 @@ kernel_reduce_block(const __restrict__ AcReal* scratchpad,
*result = res; *result = res;
} }
AcReal AcReal
reduce_scal(const cudaStream_t stream, const ReductionType rtype, reduce_scal(const cudaStream_t stream, const ReductionType rtype, const int3& start,
const int3& start, const int3& end, const int3& end, const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result)
const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result)
{ {
const unsigned nx = end.x - start.x; const unsigned nx = end.x - start.x;
const unsigned ny = end.y - start.y; const unsigned ny = end.y - start.y;
const unsigned nz = end.z - start.z; const unsigned nz = end.z - start.z;
const unsigned num_elems = nx * ny * nz; const unsigned num_elems = nx * ny * nz;
const dim3 tpb_filter(32, 4, 1); const dim3 tpb_filter(32, 4, 1);
const dim3 bpg_filter( const dim3 bpg_filter((unsigned int)ceil(nx / AcReal(tpb_filter.x)),
(unsigned int)ceil(nx / AcReal(tpb_filter.x)), (unsigned int)ceil(ny / AcReal(tpb_filter.y)),
(unsigned int)ceil(ny / AcReal(tpb_filter.y)), (unsigned int)ceil(nz / AcReal(tpb_filter.z)));
(unsigned int)ceil(nz / AcReal(tpb_filter.z))
);
const int tpb_reduce = 128; const int tpb_reduce = 128;
const int bpg_reduce = num_elems / tpb_reduce; const int bpg_reduce = num_elems / tpb_reduce;
@@ -974,22 +947,38 @@ reduce_scal(const cudaStream_t stream, const ReductionType rtype,
ERRCHK(nx * ny * nz % 2 == 0); ERRCHK(nx * ny * nz % 2 == 0);
if (rtype == RTYPE_MAX) { if (rtype == RTYPE_MAX) {
kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad); kernel_filter<dvalue>
kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems); <<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
kernel_reduce_block<dmax><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result); kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(
} else if (rtype == RTYPE_MIN) { scratchpad, num_elems);
kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad); kernel_reduce_block<dmax>
kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems); <<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
kernel_reduce_block<dmin><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result); }
} else if (rtype == RTYPE_RMS) { else if (rtype == RTYPE_MIN) {
kernel_filter<dsquared><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad); kernel_filter<dvalue>
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems); <<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result); kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(
} else if (rtype == RTYPE_RMS_EXP) { scratchpad, num_elems);
kernel_filter<dexp_squared><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad); kernel_reduce_block<dmin>
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems); <<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result); }
} else { else if (rtype == RTYPE_RMS) {
kernel_filter<dsquared>
<<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(
scratchpad, num_elems);
kernel_reduce_block<dsum>
<<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
}
else if (rtype == RTYPE_RMS_EXP) {
kernel_filter<dexp_squared>
<<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(
scratchpad, num_elems);
kernel_reduce_block<dsum>
<<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
}
else {
ERROR("Unrecognized rtype"); ERROR("Unrecognized rtype");
} }
AcReal result; AcReal result;
@@ -998,22 +987,19 @@ reduce_scal(const cudaStream_t stream, const ReductionType rtype,
} }
AcReal AcReal
reduce_vec(const cudaStream_t stream, const ReductionType rtype, reduce_vec(const cudaStream_t stream, const ReductionType rtype, const int3& start, const int3& end,
const int3& start, const int3& end, const AcReal* vtxbuf0, const AcReal* vtxbuf1, const AcReal* vtxbuf2, AcReal* scratchpad,
const AcReal* vtxbuf0, const AcReal* vtxbuf1, const AcReal* vtxbuf2, AcReal* reduce_result)
AcReal* scratchpad, AcReal* reduce_result)
{ {
const unsigned nx = end.x - start.x; const unsigned nx = end.x - start.x;
const unsigned ny = end.y - start.y; const unsigned ny = end.y - start.y;
const unsigned nz = end.z - start.z; const unsigned nz = end.z - start.z;
const unsigned num_elems = nx * ny * nz; const unsigned num_elems = nx * ny * nz;
const dim3 tpb_filter(32, 4, 1); const dim3 tpb_filter(32, 4, 1);
const dim3 bpg_filter( const dim3 bpg_filter((unsigned int)ceil(nx / AcReal(tpb_filter.x)),
(unsigned int)ceil(nx / AcReal(tpb_filter.x)), (unsigned int)ceil(ny / AcReal(tpb_filter.y)),
(unsigned int)ceil(ny / AcReal(tpb_filter.y)), (unsigned int)ceil(nz / AcReal(tpb_filter.z)));
(unsigned int)ceil(nz / AcReal(tpb_filter.z))
);
const int tpb_reduce = 128; const int tpb_reduce = 128;
const int bpg_reduce = num_elems / tpb_reduce; const int bpg_reduce = num_elems / tpb_reduce;
@@ -1025,22 +1011,38 @@ reduce_vec(const cudaStream_t stream, const ReductionType rtype,
ERRCHK(nx * ny * nz % 2 == 0); ERRCHK(nx * ny * nz % 2 == 0);
if (rtype == RTYPE_MAX) { if (rtype == RTYPE_MAX) {
kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad); kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(
kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems); vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
kernel_reduce_block<dmax><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result); kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(
} else if (rtype == RTYPE_MIN) { scratchpad, num_elems);
kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad); kernel_reduce_block<dmax>
kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems); <<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
kernel_reduce_block<dmin><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result); }
} else if (rtype == RTYPE_RMS) { else if (rtype == RTYPE_MIN) {
kernel_filter_vec<dsquared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad); kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems); vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result); kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(
} else if (rtype == RTYPE_RMS_EXP) { scratchpad, num_elems);
kernel_filter_vec<dexp_squared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad); kernel_reduce_block<dmin>
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems); <<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result); }
} else { else if (rtype == RTYPE_RMS) {
kernel_filter_vec<dsquared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(
vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(
scratchpad, num_elems);
kernel_reduce_block<dsum>
<<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
}
else if (rtype == RTYPE_RMS_EXP) {
kernel_filter_vec<dexp_squared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(
vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(
scratchpad, num_elems);
kernel_reduce_block<dsum>
<<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
}
else {
ERROR("Unrecognized rtype"); ERROR("Unrecognized rtype");
} }
AcReal result; AcReal result;

File diff suppressed because it is too large Load Diff

View File

@@ -35,10 +35,10 @@
#include "model/model_rk3.h" #include "model/model_rk3.h"
#include "timer_hires.h" #include "timer_hires.h"
#include <vector> #include "src/core/errchk.h"
#include <algorithm> #include <algorithm>
#include <math.h> #include <math.h>
#include "src/core/errchk.h" #include <vector>
static bool static bool
smaller_than(const double& a, const double& b) smaller_than(const double& a, const double& b)
@@ -47,7 +47,8 @@ smaller_than(const double& a, const double& b)
} }
static int static int
write_runningtimes(const char* path, const int n, const double min, const double max, const double median, const double perc) write_runningtimes(const char* path, const int n, const double min, const double max,
const double median, const double perc)
{ {
FILE* fp; FILE* fp;
fp = fopen(path, "a"); fp = fopen(path, "a");
@@ -80,7 +81,8 @@ int
run_benchmark(void) run_benchmark(void)
{ {
char runningtime_path[256]; char runningtime_path[256];
sprintf(runningtime_path, "%s_%s_runningtimes.out", AC_DOUBLE_PRECISION ? "double" : "float", GEN_BENCHMARK_RK3 ? "rk3substep" : "fullstep"); sprintf(runningtime_path, "%s_%s_runningtimes.out", AC_DOUBLE_PRECISION ? "double" : "float",
GEN_BENCHMARK_RK3 ? "rk3substep" : "fullstep");
FILE* fp; FILE* fp;
fp = fopen(runningtime_path, "w"); fp = fopen(runningtime_path, "w");
@@ -88,13 +90,14 @@ run_benchmark(void)
if (fp != NULL) { if (fp != NULL) {
fprintf(fp, "n, min, max, median, perc\n"); fprintf(fp, "n, min, max, median, perc\n");
fclose(fp); fclose(fp);
} else { }
else {
return EXIT_FAILURE; return EXIT_FAILURE;
} }
#define N_STEP_SIZE (128) #define N_STEP_SIZE (128)
#define MAX_MESH_DIM (128) #define MAX_MESH_DIM (128)
#define NUM_ITERS (100) #define NUM_ITERS (100)
for (int n = N_STEP_SIZE; n <= MAX_MESH_DIM; n += N_STEP_SIZE) { for (int n = N_STEP_SIZE; n <= MAX_MESH_DIM; n += N_STEP_SIZE) {
/* Parse configs */ /* Parse configs */
AcMeshInfo mesh_info; AcMeshInfo mesh_info;
@@ -113,7 +116,6 @@ run_benchmark(void)
std::vector<double> results; std::vector<double> results;
results.reserve(NUM_ITERS); results.reserve(NUM_ITERS);
// Warmup // Warmup
for (int i = 0; i < 10; ++i) { for (int i = 0; i < 10; ++i) {
acIntegrate(0); acIntegrate(0);
@@ -124,28 +126,35 @@ run_benchmark(void)
for (int i = 0; i < NUM_ITERS; ++i) { for (int i = 0; i < NUM_ITERS; ++i) {
timer_reset(&t); timer_reset(&t);
#if GEN_BENCHMARK_RK3 == 1 #if GEN_BENCHMARK_RK3 == 1
acIntegrateStep(2, FLT_EPSILON); acIntegrateStep(2, FLT_EPSILON);
#else // GEN_BENCHMARK_FULL #else // GEN_BENCHMARK_FULL
//const AcReal umax = acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ); // const AcReal umax = acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
const AcReal dt = AcReal(1e-2); // TODO adaptive timestep //host_timestep(umax, mesh_info); const AcReal dt = AcReal(
1e-2); // TODO adaptive timestep //host_timestep(umax, mesh_info);
acIntegrate(dt); acIntegrate(dt);
#endif #endif
acSynchronize(); acSynchronize();
const double ms_elapsed = timer_diff_nsec(t) / 1e6; const double ms_elapsed = timer_diff_nsec(t) / 1e6;
results.push_back(ms_elapsed); results.push_back(ms_elapsed);
} }
#define NTH_PERCENTILE (0.95) #define NTH_PERCENTILE (0.95)
std::sort(results.begin(), results.end(), smaller_than); std::sort(results.begin(), results.end(), smaller_than);
write_runningtimes(runningtime_path, n, results[0], results[results.size()-1], results[int(0.5 * NUM_ITERS)], results[int(NTH_PERCENTILE * NUM_ITERS)]); write_runningtimes(runningtime_path, n, results[0], results[results.size() - 1],
results[int(0.5 * NUM_ITERS)], results[int(NTH_PERCENTILE * NUM_ITERS)]);
char percentile_path[256]; char percentile_path[256];
sprintf(percentile_path, "%d_%s_%s_percentiles.out", n, AC_DOUBLE_PRECISION ? "double" : "float", GEN_BENCHMARK_RK3 ? "rk3substep" : "fullstep"); sprintf(percentile_path, "%d_%s_%s_percentiles.out", n,
AC_DOUBLE_PRECISION ? "double" : "float",
GEN_BENCHMARK_RK3 ? "rk3substep" : "fullstep");
write_percentiles(percentile_path, NUM_ITERS, results); write_percentiles(percentile_path, NUM_ITERS, results);
printf("%s running time %g ms, (%dth percentile, nx = %d) \n", GEN_BENCHMARK_RK3 ? "RK3 step" : "Fullstep", double(results[int(NTH_PERCENTILE * NUM_ITERS)]), int(NTH_PERCENTILE * 100), mesh_info.int_params[AC_nx]); printf("%s running time %g ms, (%dth percentile, nx = %d) \n",
GEN_BENCHMARK_RK3 ? "RK3 step" : "Fullstep",
double(results[int(NTH_PERCENTILE * NUM_ITERS)]), int(NTH_PERCENTILE * 100),
mesh_info.int_params[AC_nx]);
acStore(mesh); acStore(mesh);
acQuit(); acQuit();
@@ -225,7 +234,8 @@ run_benchmark(void)
return 0; return 0;
} }
#else //////////////////////////////////////////////////////////////////////////GENERATE_BENCHMARK_DATA #else
//////////////////////////////////////////////////////////////////////////GENERATE_BENCHMARK_DATA
@@ -290,8 +300,8 @@ run_benchmark(void)
#define NTH_PERCENTILE (0.95) #define NTH_PERCENTILE (0.95)
std::sort(results.begin(), results.end(), smaller_than); std::sort(results.begin(), results.end(), smaller_than);
write_result(n, results[0], results[results.size()-1], results[int(0.5 * NUM_ITERS)], results[int(NTH_PERCENTILE * NUM_ITERS)]); write_result(n, results[0], results[results.size()-1], results[int(0.5 * NUM_ITERS)],
write_percentiles(n, NUM_ITERS, results); results[int(NTH_PERCENTILE * NUM_ITERS)]); write_percentiles(n, NUM_ITERS, results);
} }
return 0; return 0;

View File

@@ -79,8 +79,7 @@ parse_config(const char* path, AcMeshInfo* config)
int idx = -1; int idx = -1;
if ((idx = find_str(keyword, intparam_names, NUM_INT_PARAM_TYPES)) >= 0) if ((idx = find_str(keyword, intparam_names, NUM_INT_PARAM_TYPES)) >= 0)
config->int_params[idx] = atoi(value); config->int_params[idx] = atoi(value);
else if ((idx = find_str(keyword, realparam_names, else if ((idx = find_str(keyword, realparam_names, NUM_REAL_PARAM_TYPES)) >= 0)
NUM_REAL_PARAM_TYPES)) >= 0)
config->real_params[idx] = AcReal(atof(value)); config->real_params[idx] = AcReal(atof(value));
} }
@@ -92,32 +91,30 @@ update_config(AcMeshInfo* config)
{ {
config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER; config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER;
///////////// PAD TEST ///////////// PAD TEST
//config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER + PAD_SIZE; // config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER + PAD_SIZE;
///////////// PAD TEST ///////////// PAD TEST
config->int_params[AC_my] = config->int_params[AC_ny] + STENCIL_ORDER; config->int_params[AC_my] = config->int_params[AC_ny] + STENCIL_ORDER;
config->int_params[AC_mz] = config->int_params[AC_nz] + STENCIL_ORDER; config->int_params[AC_mz] = config->int_params[AC_nz] + STENCIL_ORDER;
// Bounds for the computational domain, i.e. nx_min <= i < nx_max // Bounds for the computational domain, i.e. nx_min <= i < nx_max
config->int_params[AC_nx_min] = STENCIL_ORDER / 2; config->int_params[AC_nx_min] = STENCIL_ORDER / 2;
config->int_params[AC_nx_max] = config->int_params[AC_nx_min] + config->int_params[AC_nx_max] = config->int_params[AC_nx_min] + config->int_params[AC_nx];
config->int_params[AC_nx];
config->int_params[AC_ny_min] = STENCIL_ORDER / 2; config->int_params[AC_ny_min] = STENCIL_ORDER / 2;
config->int_params[AC_ny_max] = config->int_params[AC_ny] + config->int_params[AC_ny_max] = config->int_params[AC_ny] + STENCIL_ORDER / 2;
STENCIL_ORDER / 2;
config->int_params[AC_nz_min] = STENCIL_ORDER / 2; config->int_params[AC_nz_min] = STENCIL_ORDER / 2;
config->int_params[AC_nz_max] = config->int_params[AC_nz] + config->int_params[AC_nz_max] = config->int_params[AC_nz] + STENCIL_ORDER / 2;
STENCIL_ORDER / 2;
// Spacing // Spacing
config->real_params[AC_inv_dsx] = AcReal(1.) / config->real_params[AC_dsx]; config->real_params[AC_inv_dsx] = AcReal(1.) / config->real_params[AC_dsx];
config->real_params[AC_inv_dsy] = AcReal(1.) / config->real_params[AC_dsy]; config->real_params[AC_inv_dsy] = AcReal(1.) / config->real_params[AC_dsy];
config->real_params[AC_inv_dsz] = AcReal(1.) / config->real_params[AC_dsz]; config->real_params[AC_inv_dsz] = AcReal(1.) / config->real_params[AC_dsz];
config->real_params[AC_dsmin] = min(config->real_params[AC_dsx], min(config->real_params[AC_dsy], config->real_params[AC_dsz])); config->real_params[AC_dsmin] = min(
config->real_params[AC_dsx], min(config->real_params[AC_dsy], config->real_params[AC_dsz]));
// Real grid coordanates (DEFINE FOR GRID WITH THE GHOST ZONES) // Real grid coordanates (DEFINE FOR GRID WITH THE GHOST ZONES)
config->real_params[AC_xlen] = config->real_params[AC_dsx]*config->int_params[AC_mx]; config->real_params[AC_xlen] = config->real_params[AC_dsx] * config->int_params[AC_mx];
config->real_params[AC_ylen] = config->real_params[AC_dsy]*config->int_params[AC_my]; config->real_params[AC_ylen] = config->real_params[AC_dsy] * config->int_params[AC_my];
config->real_params[AC_zlen] = config->real_params[AC_dsz]*config->int_params[AC_mz]; config->real_params[AC_zlen] = config->real_params[AC_dsz] * config->int_params[AC_mz];
config->real_params[AC_xorig] = AcReal(.5) * config->real_params[AC_xlen]; config->real_params[AC_xorig] = AcReal(.5) * config->real_params[AC_xlen];
config->real_params[AC_yorig] = AcReal(.5) * config->real_params[AC_ylen]; config->real_params[AC_yorig] = AcReal(.5) * config->real_params[AC_ylen];
@@ -125,35 +122,35 @@ update_config(AcMeshInfo* config)
/* Additional helper params */ /* Additional helper params */
// Int helpers // Int helpers
config->int_params[AC_mxy] = config->int_params[AC_mx] * config->int_params[AC_mxy] = config->int_params[AC_mx] * config->int_params[AC_my];
config->int_params[AC_my]; config->int_params[AC_nxy] = config->int_params[AC_nx] * config->int_params[AC_ny];
config->int_params[AC_nxy] = config->int_params[AC_nx] * config->int_params[AC_nxyz] = config->int_params[AC_nxy] * config->int_params[AC_nz];
config->int_params[AC_ny];
config->int_params[AC_nxyz] = config->int_params[AC_nxy] *
config->int_params[AC_nz];
// Real helpers // Real helpers
config->real_params[AC_cs2_sound] = config->real_params[AC_cs_sound] * config->real_params[AC_cs2_sound] = config->real_params[AC_cs_sound] *
config->real_params[AC_cs_sound]; config->real_params[AC_cs_sound];
config->real_params[AC_cv_sound] = config->real_params[AC_cp_sound] / config->real_params[AC_gamma]; config->real_params[AC_cv_sound] = config->real_params[AC_cp_sound] /
config->real_params[AC_gamma];
AcReal G_CONST_CGS = AcReal(6.674e-8); // g/cm3/s GGS definition //TODO define in a separate module AcReal G_CONST_CGS = AcReal(
AcReal M_sun = AcReal(1.989e33); // g solar mass 6.674e-8); // g/cm3/s GGS definition //TODO define in a separate module
AcReal M_sun = AcReal(1.989e33); // g solar mass
config->real_params[AC_M_star] = config->real_params[AC_M_star]*M_sun / config->real_params[AC_M_star] = config->real_params[AC_M_star] * M_sun /
( (config->real_params[AC_unit_length]* ((config->real_params[AC_unit_length] *
config->real_params[AC_unit_length]* config->real_params[AC_unit_length] *
config->real_params[AC_unit_length]) * config->real_params[AC_unit_length]) *
config->real_params[AC_unit_density] ) ; config->real_params[AC_unit_density]);
config->real_params[AC_G_CONST] = G_CONST_CGS / config->real_params[AC_G_CONST] = G_CONST_CGS / ((config->real_params[AC_unit_velocity] *
( (config->real_params[AC_unit_velocity]*config->real_params[AC_unit_velocity]) / config->real_params[AC_unit_velocity]) /
(config->real_params[AC_unit_density] *config->real_params[AC_unit_length]) ) ; (config->real_params[AC_unit_density] *
config->real_params[AC_unit_length]));
config->real_params[AC_GM_star] = config->real_params[AC_M_star]*config->real_params[AC_G_CONST];
config->real_params[AC_sq2GM_star] = AcReal(sqrt(AcReal(2)*config->real_params[AC_GM_star]));
config->real_params[AC_GM_star] = config->real_params[AC_M_star] *
config->real_params[AC_G_CONST];
config->real_params[AC_sq2GM_star] = AcReal(sqrt(AcReal(2) * config->real_params[AC_GM_star]));
const bool print_config = true; const bool print_config = true;
if (print_config) { if (print_config) {

View File

@@ -50,12 +50,12 @@ static const int window_bpp = 32; // Bits per pixel
SDL_Surface* surfaces[NUM_VTXBUF_HANDLES]; SDL_Surface* surfaces[NUM_VTXBUF_HANDLES];
static int datasurface_width = -1; static int datasurface_width = -1;
static int datasurface_height = -1; static int datasurface_height = -1;
static int k_slice = 0; static int k_slice = 0;
static int k_slice_max = 0; static int k_slice_max = 0;
// Colors // Colors
static SDL_Color color_bg = (SDL_Color){30, 30, 35, 255}; static SDL_Color color_bg = (SDL_Color){30, 30, 35, 255};
static const int num_tiles = NUM_VTXBUF_HANDLES + 1; static const int num_tiles = NUM_VTXBUF_HANDLES + 1;
static const int tiles_per_row = 3; static const int tiles_per_row = 3;
/* /*
@@ -82,10 +82,9 @@ static Camera camera = (Camera){(float2){.0f, .0f}, 1.f};
static inline vec4 static inline vec4
project_ortho(const float2& pos, const float2& bbox, const float2& wdims) project_ortho(const float2& pos, const float2& bbox, const float2& wdims)
{ {
const vec4 rect = (vec4){ const vec4 rect = (vec4){camera.scale * (pos.x - camera.pos.x) + 0.5f * wdims.x,
camera.scale * (pos.x - camera.pos.x) + 0.5f * wdims.x, camera.scale * (pos.y - camera.pos.y) + 0.5f * wdims.y,
camera.scale * (pos.y - camera.pos.y) + 0.5f * wdims.y, camera.scale * bbox.x, camera.scale * bbox.y};
camera.scale * bbox.x, camera.scale * bbox.y};
return rect; return rect;
} }
@@ -103,13 +102,12 @@ renderer_init(const int& mx, const int& my)
SDL_InitSubSystem(SDL_INIT_VIDEO | SDL_INIT_EVENTS); SDL_InitSubSystem(SDL_INIT_VIDEO | SDL_INIT_EVENTS);
// Setup window // Setup window
window = SDL_CreateWindow("Astaroth", SDL_WINDOWPOS_UNDEFINED, window = SDL_CreateWindow("Astaroth", SDL_WINDOWPOS_UNDEFINED, SDL_WINDOWPOS_UNDEFINED,
SDL_WINDOWPOS_UNDEFINED, window_width, window_width, window_height, SDL_WINDOW_SHOWN);
window_height, SDL_WINDOW_SHOWN);
// Setup SDL renderer // Setup SDL renderer
renderer = SDL_CreateRenderer(window, -1, SDL_RENDERER_ACCELERATED); renderer = SDL_CreateRenderer(window, -1, SDL_RENDERER_ACCELERATED);
//SDL_SetWindowFullscreen(window, SDL_WINDOW_FULLSCREEN_DESKTOP); // SDL_SetWindowFullscreen(window, SDL_WINDOW_FULLSCREEN_DESKTOP);
SDL_GetWindowSize(window, &window_width, &window_height); SDL_GetWindowSize(window, &window_width, &window_height);
SDL_SetHint(SDL_HINT_RENDER_SCALE_QUALITY, "1"); // Linear filtering SDL_SetHint(SDL_HINT_RENDER_SCALE_QUALITY, "1"); // Linear filtering
@@ -118,24 +116,24 @@ renderer_init(const int& mx, const int& my)
datasurface_height = my; datasurface_height = my;
// vec drawing uses the surface of the first component, no memory issues here // vec drawing uses the surface of the first component, no memory issues here
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i)
surfaces[i] = SDL_CreateRGBSurfaceWithFormat( surfaces[i] = SDL_CreateRGBSurfaceWithFormat(0, datasurface_width, datasurface_height,
0, datasurface_width, datasurface_height, window_bpp, window_bpp, SDL_PIXELFORMAT_RGBA8888);
SDL_PIXELFORMAT_RGBA8888);
camera.pos = (float2){.5f * tiles_per_row * datasurface_width - .5f * datasurface_width, camera.pos = (float2){.5f * tiles_per_row * datasurface_width - .5f * datasurface_width,
-.5f * (num_tiles / tiles_per_row) * datasurface_height + .5f * datasurface_height}; -.5f * (num_tiles / tiles_per_row) * datasurface_height +
.5f * datasurface_height};
camera.scale = min(window_width / float(datasurface_width * tiles_per_row), camera.scale = min(window_width / float(datasurface_width * tiles_per_row),
window_height / float(datasurface_height * (num_tiles/tiles_per_row))); window_height / float(datasurface_height * (num_tiles / tiles_per_row)));
SDL_RendererInfo renderer_info; SDL_RendererInfo renderer_info;
SDL_GetRendererInfo(renderer, &renderer_info); SDL_GetRendererInfo(renderer, &renderer_info);
printf("SDL renderer max texture dims: (%d, %d)\n", renderer_info.max_texture_width, renderer_info.max_texture_height); printf("SDL renderer max texture dims: (%d, %d)\n", renderer_info.max_texture_width,
renderer_info.max_texture_height);
return 0; return 0;
} }
static int static int
set_pixel(const int& i, const int& j, const uint32_t& color, set_pixel(const int& i, const int& j, const uint32_t& color, SDL_Surface* surface)
SDL_Surface* surface)
{ {
uint32_t* pixels = (uint32_t*)surface->pixels; uint32_t* pixels = (uint32_t*)surface->pixels;
pixels[i + j * surface->w] = color; pixels[i + j * surface->w] = color;
@@ -143,22 +141,21 @@ set_pixel(const int& i, const int& j, const uint32_t& color,
} }
static int static int
draw_vertex_buffer(const AcMesh& mesh, const VertexBufferHandle& vertex_buffer, draw_vertex_buffer(const AcMesh& mesh, const VertexBufferHandle& vertex_buffer, const int& tile)
const int& tile)
{ {
const float xoffset = (tile % tiles_per_row) * datasurface_width; const float xoffset = (tile % tiles_per_row) * datasurface_width;
const float yoffset = - (tile / tiles_per_row) * datasurface_height; const float yoffset = -(tile / tiles_per_row) * datasurface_height;
/* /*
const float max = float(model_reduce_scal(mesh, RTYPE_MAX, vertex_buffer)); const float max = float(model_reduce_scal(mesh, RTYPE_MAX, vertex_buffer));
const float min = float(model_reduce_scal(mesh, RTYPE_MIN, vertex_buffer)); const float min = float(model_reduce_scal(mesh, RTYPE_MIN, vertex_buffer));
*/ */
const float max = float(acReduceScal(RTYPE_MAX, vertex_buffer)); const float max = float(acReduceScal(RTYPE_MAX, vertex_buffer));
const float min = float(acReduceScal(RTYPE_MIN, vertex_buffer)); const float min = float(acReduceScal(RTYPE_MIN, vertex_buffer));
const float range = fabsf(max - min); const float range = fabsf(max - min);
const float mid = max - .5f * range; const float mid = max - .5f * range;
const int k = k_slice; //mesh.info.int_params[AC_mz] / 2; const int k = k_slice; // mesh.info.int_params[AC_mz] / 2;
for (int j = 0; j < mesh.info.int_params[AC_my]; ++j) { for (int j = 0; j < mesh.info.int_params[AC_my]; ++j) {
for (int i = 0; i < mesh.info.int_params[AC_mx]; ++i) { for (int i = 0; i < mesh.info.int_params[AC_mx]; ++i) {
@@ -166,29 +163,23 @@ draw_vertex_buffer(const AcMesh& mesh, const VertexBufferHandle& vertex_buffer,
const int idx = AC_VTXBUF_IDX(i, j, k, mesh.info); const int idx = AC_VTXBUF_IDX(i, j, k, mesh.info);
const uint8_t shade = (uint8_t)( const uint8_t shade = (uint8_t)(
255.f * 255.f * (fabsf(float(mesh.vertex_buffer[vertex_buffer][idx]) - mid)) / range);
(fabsf(float(mesh.vertex_buffer[vertex_buffer][idx]) - mid)) /
range);
uint8_t color[4] = {0, 0, 0, 255}; uint8_t color[4] = {0, 0, 0, 255};
color[tile % 3] = shade; color[tile % 3] = shade;
const uint32_t mapped_color = SDL_MapRGBA( const uint32_t mapped_color = SDL_MapRGBA(surfaces[vertex_buffer]->format, color[0],
surfaces[vertex_buffer]->format, color[0], color[1], color[2], color[1], color[2], color[3]);
color[3]);
set_pixel(i, j, mapped_color, surfaces[vertex_buffer]); set_pixel(i, j, mapped_color, surfaces[vertex_buffer]);
} }
} }
const float2 pos = (float2){xoffset, yoffset}; const float2 pos = (float2){xoffset, yoffset};
const float2 bbox = (float2){.5f * datasurface_width, const float2 bbox = (float2){.5f * datasurface_width, .5f * datasurface_height};
.5f * datasurface_height};
const float2 wsize = (float2){float(window_width), float(window_height)}; const float2 wsize = (float2){float(window_width), float(window_height)};
const vec4 rectf = project_ortho(pos, bbox, wsize); const vec4 rectf = project_ortho(pos, bbox, wsize);
SDL_Rect rect = (SDL_Rect){ SDL_Rect rect = (SDL_Rect){int(rectf.x - rectf.w), int(wsize.y - rectf.y - rectf.h),
int(rectf.x - rectf.w), int(wsize.y - rectf.y - rectf.h), int(ceil(2.f * rectf.w)), int(ceil(2.f * rectf.h))};
int(ceil(2.f * rectf.w)), int(ceil(2.f * rectf.h))};
SDL_Texture* tex = SDL_CreateTextureFromSurface(renderer, SDL_Texture* tex = SDL_CreateTextureFromSurface(renderer, surfaces[vertex_buffer]);
surfaces[vertex_buffer]);
SDL_RenderCopy(renderer, tex, NULL, &rect); SDL_RenderCopy(renderer, tex, NULL, &rect);
SDL_DestroyTexture(tex); SDL_DestroyTexture(tex);
@@ -196,14 +187,12 @@ draw_vertex_buffer(const AcMesh& mesh, const VertexBufferHandle& vertex_buffer,
} }
static int static int
draw_vertex_buffer_vec(const AcMesh& mesh, draw_vertex_buffer_vec(const AcMesh& mesh, const VertexBufferHandle& vertex_buffer_a,
const VertexBufferHandle& vertex_buffer_a,
const VertexBufferHandle& vertex_buffer_b, const VertexBufferHandle& vertex_buffer_b,
const VertexBufferHandle& vertex_buffer_c, const VertexBufferHandle& vertex_buffer_c, const int& tile)
const int& tile)
{ {
const float xoffset = (tile % tiles_per_row) * datasurface_width; const float xoffset = (tile % tiles_per_row) * datasurface_width;
const float yoffset = - (tile / tiles_per_row) * datasurface_height; const float yoffset = -(tile / tiles_per_row) * datasurface_height;
/* /*
const float maxx = float( const float maxx = float(
@@ -215,52 +204,41 @@ draw_vertex_buffer_vec(const AcMesh& mesh,
min(model_reduce_scal(mesh, RTYPE_MIN, vertex_buffer_b), min(model_reduce_scal(mesh, RTYPE_MIN, vertex_buffer_b),
model_reduce_scal(mesh, RTYPE_MIN, vertex_buffer_c)))); model_reduce_scal(mesh, RTYPE_MIN, vertex_buffer_c))));
*/ */
const float maxx = float( const float maxx = float(max(
max(acReduceScal(RTYPE_MAX, vertex_buffer_a), acReduceScal(RTYPE_MAX, vertex_buffer_a),
max(acReduceScal(RTYPE_MAX, vertex_buffer_b), max(acReduceScal(RTYPE_MAX, vertex_buffer_b), acReduceScal(RTYPE_MAX, vertex_buffer_c))));
acReduceScal(RTYPE_MAX, vertex_buffer_c)))); const float minn = float(min(
const float minn = float( acReduceScal(RTYPE_MIN, vertex_buffer_a),
min(acReduceScal(RTYPE_MIN, vertex_buffer_a), min(acReduceScal(RTYPE_MIN, vertex_buffer_b), acReduceScal(RTYPE_MIN, vertex_buffer_c))));
min(acReduceScal(RTYPE_MIN, vertex_buffer_b),
acReduceScal(RTYPE_MIN, vertex_buffer_c))));
const float range = fabsf(maxx - minn); const float range = fabsf(maxx - minn);
const float mid = maxx - .5f * range; const float mid = maxx - .5f * range;
const int k = k_slice; //mesh.info.int_params[AC_mz] / 2; const int k = k_slice; // mesh.info.int_params[AC_mz] / 2;
for (int j = 0; j < mesh.info.int_params[AC_my]; ++j) { for (int j = 0; j < mesh.info.int_params[AC_my]; ++j) {
for (int i = 0; i < mesh.info.int_params[AC_mx]; ++i) { for (int i = 0; i < mesh.info.int_params[AC_mx]; ++i) {
ERRCHK(i < datasurface_width && j < datasurface_height); ERRCHK(i < datasurface_width && j < datasurface_height);
const int idx = AC_VTXBUF_IDX(i, j, k, mesh.info); const int idx = AC_VTXBUF_IDX(i, j, k, mesh.info);
const uint8_t r = (uint8_t)( const uint8_t r = (uint8_t)(
255.f * 255.f * (fabsf(float(mesh.vertex_buffer[vertex_buffer_a][idx]) - mid)) / range);
(fabsf(float(mesh.vertex_buffer[vertex_buffer_a][idx]) - mid)) /
range);
const uint8_t g = (uint8_t)( const uint8_t g = (uint8_t)(
255.f * 255.f * (fabsf(float(mesh.vertex_buffer[vertex_buffer_b][idx]) - mid)) / range);
(fabsf(float(mesh.vertex_buffer[vertex_buffer_b][idx]) - mid)) /
range);
const uint8_t b = (uint8_t)( const uint8_t b = (uint8_t)(
255.f * 255.f * (fabsf(float(mesh.vertex_buffer[vertex_buffer_c][idx]) - mid)) / range);
(fabsf(float(mesh.vertex_buffer[vertex_buffer_c][idx]) - mid)) / const uint32_t mapped_color = SDL_MapRGBA(surfaces[vertex_buffer_a]->format, r, g, b,
range); 255);
const uint32_t mapped_color = SDL_MapRGBA(
surfaces[vertex_buffer_a]->format, r, g, b, 255);
set_pixel(i, j, mapped_color, surfaces[vertex_buffer_a]); set_pixel(i, j, mapped_color, surfaces[vertex_buffer_a]);
} }
} }
const float2 pos = (float2){xoffset, yoffset}; const float2 pos = (float2){xoffset, yoffset};
const float2 bbox = (float2){.5f * datasurface_width, const float2 bbox = (float2){.5f * datasurface_width, .5f * datasurface_height};
.5f * datasurface_height};
const float2 wsize = (float2){float(window_width), float(window_height)}; const float2 wsize = (float2){float(window_width), float(window_height)};
const vec4 rectf = project_ortho(pos, bbox, wsize); const vec4 rectf = project_ortho(pos, bbox, wsize);
SDL_Rect rect = (SDL_Rect){ SDL_Rect rect = (SDL_Rect){int(rectf.x - rectf.w), int(wsize.y - rectf.y - rectf.h),
int(rectf.x - rectf.w), int(wsize.y - rectf.y - rectf.h), int(ceil(2.f * rectf.w)), int(ceil(2.f * rectf.h))};
int(ceil(2.f * rectf.w)), int(ceil(2.f * rectf.h))};
SDL_Texture* tex = SDL_CreateTextureFromSurface(renderer, SDL_Texture* tex = SDL_CreateTextureFromSurface(renderer, surfaces[vertex_buffer_a]);
surfaces[vertex_buffer_a]);
SDL_RenderCopy(renderer, tex, NULL, &rect); SDL_RenderCopy(renderer, tex, NULL, &rect);
SDL_DestroyTexture(tex); SDL_DestroyTexture(tex);
@@ -272,13 +250,11 @@ renderer_draw(const AcMesh& mesh)
{ {
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i)
draw_vertex_buffer(mesh, VertexBufferHandle(i), i); draw_vertex_buffer(mesh, VertexBufferHandle(i), i);
draw_vertex_buffer_vec(mesh, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, draw_vertex_buffer_vec(mesh, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, NUM_VTXBUF_HANDLES);
NUM_VTXBUF_HANDLES);
// Drawing done, present // Drawing done, present
SDL_RenderPresent(renderer); SDL_RenderPresent(renderer);
SDL_SetRenderDrawColor(renderer, color_bg.r, color_bg.g, color_bg.b, SDL_SetRenderDrawColor(renderer, color_bg.r, color_bg.g, color_bg.b, color_bg.a);
color_bg.a);
SDL_RenderClear(renderer); SDL_RenderClear(renderer);
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
@@ -404,14 +380,14 @@ run_renderer(void)
/* Step the simulation */ /* Step the simulation */
#if 1 #if 1
const AcReal umax = acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, const AcReal umax = acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
VTXBUF_UUZ);
const AcReal dt = host_timestep(umax, mesh_info); const AcReal dt = host_timestep(umax, mesh_info);
acIntegrate(dt); acIntegrate(dt);
#else #else
ModelMesh* model_mesh = modelmesh_create(mesh->info); ModelMesh* model_mesh = modelmesh_create(mesh->info);
const AcReal umax = AcReal(model_reduce_vec(*model_mesh, RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ)); const AcReal umax = AcReal(
const AcReal dt = host_timestep(umax, mesh_info); model_reduce_vec(*model_mesh, RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ));
const AcReal dt = host_timestep(umax, mesh_info);
acmesh_to_modelmesh(*mesh, model_mesh); acmesh_to_modelmesh(*mesh, model_mesh);
model_rk3(dt, model_mesh); model_rk3(dt, model_mesh);
modelmesh_to_acmesh(*model_mesh, mesh); modelmesh_to_acmesh(*model_mesh, mesh);
@@ -425,7 +401,7 @@ run_renderer(void)
/* Render */ /* Render */
const float timer_diff_sec = timer_diff_nsec(frame_timer) / 1e9f; const float timer_diff_sec = timer_diff_nsec(frame_timer) / 1e9f;
if (timer_diff_sec >= desired_frame_time) { if (timer_diff_sec >= desired_frame_time) {
//acStore(mesh); // acStore(mesh);
const int num_vertices = mesh->info.int_params[AC_mxy]; const int num_vertices = mesh->info.int_params[AC_mxy];
const int3 dst = (int3){0, 0, k_slice}; const int3 dst = (int3){0, 0, k_slice};
acStoreWithOffset(dst, num_vertices, mesh); acStoreWithOffset(dst, num_vertices, mesh);

View File

@@ -60,23 +60,23 @@ print_diagnostics(const AcMesh& mesh, const int& step, const AcReal& dt)
} }
*/ */
//Write all setting info into a separate ascii file. This is done to guarantee // Write all setting info into a separate ascii file. This is done to guarantee
//that we have the data specifi information in the thing, even though in // that we have the data specifi information in the thing, even though in
//principle these things are in the astaroth.conf. // principle these things are in the astaroth.conf.
static inline static inline void
void write_mesh_info(const AcMeshInfo* config) write_mesh_info(const AcMeshInfo* config)
{ {
FILE* infotxt; FILE* infotxt;
infotxt = fopen("purge.sh","w"); infotxt = fopen("purge.sh", "w");
fprintf(infotxt, "#!/bin/bash\n"); fprintf(infotxt, "#!/bin/bash\n");
fprintf(infotxt, "rm *.list *.mesh *.ts purge.sh\n"); fprintf(infotxt, "rm *.list *.mesh *.ts purge.sh\n");
fclose(infotxt); fclose(infotxt);
infotxt = fopen("mesh_info.list","w"); infotxt = fopen("mesh_info.list", "w");
//Total grid dimensions // Total grid dimensions
fprintf(infotxt, "int AC_mx %i \n", config->int_params[AC_mx]); fprintf(infotxt, "int AC_mx %i \n", config->int_params[AC_mx]);
fprintf(infotxt, "int AC_my %i \n", config->int_params[AC_my]); fprintf(infotxt, "int AC_my %i \n", config->int_params[AC_my]);
fprintf(infotxt, "int AC_mz %i \n", config->int_params[AC_mz]); fprintf(infotxt, "int AC_mz %i \n", config->int_params[AC_mz]);
@@ -96,28 +96,26 @@ void write_mesh_info(const AcMeshInfo* config)
fprintf(infotxt, "real AC_inv_dsx %e \n", (double)config->real_params[AC_inv_dsx]); fprintf(infotxt, "real AC_inv_dsx %e \n", (double)config->real_params[AC_inv_dsx]);
fprintf(infotxt, "real AC_inv_dsy %e \n", (double)config->real_params[AC_inv_dsy]); fprintf(infotxt, "real AC_inv_dsy %e \n", (double)config->real_params[AC_inv_dsy]);
fprintf(infotxt, "real AC_inv_dsz %e \n", (double)config->real_params[AC_inv_dsz]); fprintf(infotxt, "real AC_inv_dsz %e \n", (double)config->real_params[AC_inv_dsz]);
fprintf(infotxt, "real AC_dsmin %e \n", (double)config->real_params[AC_dsmin ]); fprintf(infotxt, "real AC_dsmin %e \n", (double)config->real_params[AC_dsmin]);
/* Additional helper params */ /* Additional helper params */
// Int helpers // Int helpers
fprintf(infotxt, "int AC_mxy %i \n", config->int_params[AC_mxy ]); fprintf(infotxt, "int AC_mxy %i \n", config->int_params[AC_mxy]);
fprintf(infotxt, "int AC_nxy %i \n", config->int_params[AC_nxy ]); fprintf(infotxt, "int AC_nxy %i \n", config->int_params[AC_nxy]);
fprintf(infotxt, "int AC_nxyz %i \n", config->int_params[AC_nxyz]); fprintf(infotxt, "int AC_nxyz %i \n", config->int_params[AC_nxyz]);
// Real helpers // Real helpers
fprintf(infotxt, "real AC_cs2_sound %e \n", (double)config->real_params[AC_cs2_sound]); fprintf(infotxt, "real AC_cs2_sound %e \n", (double)config->real_params[AC_cs2_sound]);
fprintf(infotxt, "real AC_cv_sound %e \n", (double)config->real_params[AC_cv_sound ]); fprintf(infotxt, "real AC_cv_sound %e \n", (double)config->real_params[AC_cv_sound]);
fclose(infotxt); fclose(infotxt);
} }
// This funtion writes a run state into a set of C binaries. For the sake of
//This funtion writes a run state into a set of C binaries. For the sake of // accuracy, all floating point numbers are to be saved in long double precision
//accuracy, all floating point numbers are to be saved in long double precision // regardless of the choise of accuracy during runtime.
//regardless of the choise of accuracy during runtime.
static inline void static inline void
save_mesh(const AcMesh &save_mesh, const int step, save_mesh(const AcMesh& save_mesh, const int step, const AcReal t_step)
const AcReal t_step)
{ {
FILE* save_ptr; FILE* save_ptr;
@@ -128,7 +126,7 @@ save_mesh(const AcMesh &save_mesh, const int step,
char cstep[10]; char cstep[10];
char bin_filename[80] = "\0"; char bin_filename[80] = "\0";
//sprintf(bin_filename, ""); // sprintf(bin_filename, "");
sprintf(cstep, "%d", step); sprintf(cstep, "%d", step);
@@ -139,28 +137,25 @@ save_mesh(const AcMesh &save_mesh, const int step,
printf("Savefile %s \n", bin_filename); printf("Savefile %s \n", bin_filename);
save_ptr = fopen(bin_filename,"wb"); save_ptr = fopen(bin_filename, "wb");
//Start file with time stamp // Start file with time stamp
long double write_long_buf = (long double) t_step; long double write_long_buf = (long double)t_step;
fwrite(&write_long_buf, sizeof(long double), 1, save_ptr); fwrite(&write_long_buf, sizeof(long double), 1, save_ptr);
//Grid data // Grid data
for (size_t i = 0; i < n; ++i) { for (size_t i = 0; i < n; ++i) {
const AcReal point_val = save_mesh.vertex_buffer[VertexBufferHandle(w)][i]; const AcReal point_val = save_mesh.vertex_buffer[VertexBufferHandle(w)][i];
long double write_long_buf = (long double) point_val; long double write_long_buf = (long double)point_val;
fwrite(&write_long_buf, sizeof(long double), 1, save_ptr); fwrite(&write_long_buf, sizeof(long double), 1, save_ptr);
} }
fclose(save_ptr); fclose(save_ptr);
} }
} }
// This function prints out the diagnostic values to std.out and also saves and // This function prints out the diagnostic values to std.out and also saves and
// appends an ascii file to contain all the result. // appends an ascii file to contain all the result.
static inline void static inline void
print_diagnostics(const int step, const AcReal dt, const AcReal t_step, FILE *diag_file) print_diagnostics(const int step, const AcReal dt, const AcReal t_step, FILE* diag_file)
{ {
AcReal buf_rms, buf_max, buf_min; AcReal buf_rms, buf_max, buf_min;
@@ -174,11 +169,10 @@ print_diagnostics(const int step, const AcReal dt, const AcReal t_step, FILE *di
// MV: The ordering in the earlier version was wrong in terms of variable // MV: The ordering in the earlier version was wrong in terms of variable
// MV: name and its diagnostics. // MV: name and its diagnostics.
printf("Step %d, t_step %.3e, dt %e s\n", step, double(t_step), double(dt)); printf("Step %d, t_step %.3e, dt %e s\n", step, double(t_step), double(dt));
printf(" %*s: min %.3e,\trms %.3e,\tmax %.3e\n", max_name_width, "uu total", printf(" %*s: min %.3e,\trms %.3e,\tmax %.3e\n", max_name_width, "uu total", double(buf_min),
double(buf_min), double(buf_rms), double(buf_max)); double(buf_rms), double(buf_max));
fprintf(diag_file, "%d %e %e %e %e %e ", step, double(t_step), double(dt), fprintf(diag_file, "%d %e %e %e %e %e ", step, double(t_step), double(dt), double(buf_min),
double(buf_min), double(buf_rms), double(buf_max)); double(buf_rms), double(buf_max));
// Calculate rms, min and max from the variables as scalars // Calculate rms, min and max from the variables as scalars
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
@@ -194,11 +188,11 @@ print_diagnostics(const int step, const AcReal dt, const AcReal t_step, FILE *di
fprintf(diag_file, "\n"); fprintf(diag_file, "\n");
} }
/* /*
MV NOTE: At the moment I have no clear idea how to calculate magnetic MV NOTE: At the moment I have no clear idea how to calculate magnetic
diagnostic variables from grid. Vector potential measures have a limited diagnostic variables from grid. Vector potential measures have a limited
value. TODO: Smart way to get brms, bmin and bmax. value. TODO: Smart way to get brms, bmin and bmax.
*/ */
int int
run_simulation(void) run_simulation(void)
@@ -213,8 +207,7 @@ run_simulation(void)
acInit(mesh_info); acInit(mesh_info);
acLoad(*mesh); acLoad(*mesh);
FILE* diag_file;
FILE *diag_file;
diag_file = fopen("timeseries.ts", "a"); diag_file = fopen("timeseries.ts", "a");
// TODO Get time from earlier state. // TODO Get time from earlier state.
AcReal t_step = 0.0; AcReal t_step = 0.0;
@@ -222,7 +215,8 @@ run_simulation(void)
// Generate the title row. // Generate the title row.
fprintf(diag_file, "step t_step dt uu_total_min uu_total_rms uu_total_max "); fprintf(diag_file, "step t_step dt uu_total_min uu_total_rms uu_total_max ");
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
fprintf(diag_file, "%s_min %s_rms %s_max ", vtxbuf_names[i], vtxbuf_names[i], vtxbuf_names[i]); fprintf(diag_file, "%s_min %s_rms %s_max ", vtxbuf_names[i], vtxbuf_names[i],
vtxbuf_names[i]);
} }
fprintf(diag_file, "\n"); fprintf(diag_file, "\n");
@@ -234,17 +228,16 @@ run_simulation(void)
acStore(mesh); acStore(mesh);
save_mesh(*mesh, 0, t_step); save_mesh(*mesh, 0, t_step);
const int max_steps = mesh_info.int_params[AC_max_steps]; const int max_steps = mesh_info.int_params[AC_max_steps];
const int save_steps = mesh_info.int_params[AC_save_steps]; const int save_steps = mesh_info.int_params[AC_save_steps];
const int bin_save_steps = mesh_info.int_params[AC_bin_steps]; //TODO Get from mesh_info const int bin_save_steps = mesh_info.int_params[AC_bin_steps]; // TODO Get from mesh_info
AcReal bin_save_t = mesh_info.real_params[AC_bin_save_t]; AcReal bin_save_t = mesh_info.real_params[AC_bin_save_t];
AcReal bin_crit_t = bin_save_t; AcReal bin_crit_t = bin_save_t;
/* Step the simulation */ /* Step the simulation */
for (int i = 1; i < max_steps; ++i) { for (int i = 1; i < max_steps; ++i) {
const AcReal umax = acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, const AcReal umax = acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
VTXBUF_UUZ);
const AcReal dt = host_timestep(umax, mesh_info); const AcReal dt = host_timestep(umax, mesh_info);
acIntegrate(dt); acIntegrate(dt);
@@ -254,33 +247,32 @@ run_simulation(void)
if ((i % save_steps) == 0) { if ((i % save_steps) == 0) {
/* /*
print_diagnostics() writes out both std.out printout from the print_diagnostics() writes out both std.out printout from the
results and saves the diagnostics into a table for ascii file results and saves the diagnostics into a table for ascii file
timeseries.ts. timeseries.ts.
*/ */
print_diagnostics(i, dt, t_step, diag_file); print_diagnostics(i, dt, t_step, diag_file);
/* /*
We would also might want an XY-average calculating funtion, We would also might want an XY-average calculating funtion,
which can be very useful when observing behaviour of turbulent which can be very useful when observing behaviour of turbulent
simulations. (TODO) simulations. (TODO)
*/ */
} }
/* Save the simulation state and print diagnostics */ /* Save the simulation state and print diagnostics */
if ((i % bin_save_steps) == 0 || t_step >= bin_crit_t) { if ((i % bin_save_steps) == 0 || t_step >= bin_crit_t) {
/* /*
This loop saves the data into simple C binaries which can be This loop saves the data into simple C binaries which can be
used for analysing the data snapshots closely. used for analysing the data snapshots closely.
Saving simulation state should happen in a separate stage. We do Saving simulation state should happen in a separate stage. We do
not want to save it as often as diagnostics. The file format not want to save it as often as diagnostics. The file format
should IDEALLY be HDF5 which has become a well supported, portable and should IDEALLY be HDF5 which has become a well supported, portable and
reliable data format when it comes to HPC applications. reliable data format when it comes to HPC applications.
However, implementing it will have to for more simpler approach However, implementing it will have to for more simpler approach
to function. (TODO?) to function. (TODO?)
*/ */
@@ -300,9 +292,7 @@ run_simulation(void)
save_mesh(*mesh, i, t_step); save_mesh(*mesh, i, t_step);
bin_crit_t += bin_save_t; bin_crit_t += bin_save_t;
} }
} }
//////Save the final snapshot //////Save the final snapshot
@@ -318,25 +308,3 @@ run_simulation(void)
return 0; return 0;
} }

View File

@@ -52,8 +52,7 @@ timer_diff_nsec(const Timer start)
{ {
Timer end; Timer end;
timer_reset(&end); timer_reset(&end);
const long diff = (end.tv_sec - start.tv_sec) * 1000000000l + const long diff = (end.tv_sec - start.tv_sec) * 1000000000l + (end.tv_nsec - start.tv_nsec);
(end.tv_nsec - start.tv_nsec);
return diff; return diff;
} }