Autoformatted all CUDA/C/C++ code
This commit is contained in:
@@ -316,9 +316,13 @@ traverse(const ASTNode* node)
|
|||||||
if (symbol_table[i].type_qualifier == IN) {
|
if (symbol_table[i].type_qualifier == IN) {
|
||||||
printf("const %sData %s = READ(%s%s);\n", translate(symbol_table[i].type_specifier),
|
printf("const %sData %s = READ(%s%s);\n", translate(symbol_table[i].type_specifier),
|
||||||
symbol_table[i].identifier, inout_name_prefix, symbol_table[i].identifier);
|
symbol_table[i].identifier, inout_name_prefix, symbol_table[i].identifier);
|
||||||
} else if (symbol_table[i].type_qualifier == OUT) {
|
}
|
||||||
printf("%s %s = READ_OUT(%s%s);", translate(symbol_table[i].type_specifier), symbol_table[i].identifier, inout_name_prefix, symbol_table[i].identifier);
|
else if (symbol_table[i].type_qualifier == OUT) {
|
||||||
//printf("%s %s = buffer.out[%s%s][IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z)];\n", translate(symbol_table[i].type_specifier), symbol_table[i].identifier, inout_name_prefix, symbol_table[i].identifier);
|
printf("%s %s = READ_OUT(%s%s);", translate(symbol_table[i].type_specifier),
|
||||||
|
symbol_table[i].identifier, inout_name_prefix, symbol_table[i].identifier);
|
||||||
|
// printf("%s %s = buffer.out[%s%s][IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z)];\n",
|
||||||
|
// translate(symbol_table[i].type_specifier), symbol_table[i].identifier,
|
||||||
|
// inout_name_prefix, symbol_table[i].identifier);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -326,8 +330,7 @@ traverse(const ASTNode* node)
|
|||||||
// Preprocessed parameter boilerplate
|
// Preprocessed parameter boilerplate
|
||||||
if (node->type == NODE_TYPE_QUALIFIER && node->token == PREPROCESSED)
|
if (node->type == NODE_TYPE_QUALIFIER && node->token == PREPROCESSED)
|
||||||
inside_preprocessed = true;
|
inside_preprocessed = true;
|
||||||
static const char
|
static const char preprocessed_parameter_boilerplate[] = "const int3 vertexIdx, ";
|
||||||
preprocessed_parameter_boilerplate[] = "const int3 vertexIdx, ";
|
|
||||||
if (inside_preprocessed && node->type == NODE_FUNCTION_PARAMETER_DECLARATION)
|
if (inside_preprocessed && node->type == NODE_FUNCTION_PARAMETER_DECLARATION)
|
||||||
printf("%s ", preprocessed_parameter_boilerplate);
|
printf("%s ", preprocessed_parameter_boilerplate);
|
||||||
// BOILERPLATE END////////////////////////////////////////////////////////
|
// BOILERPLATE END////////////////////////////////////////////////////////
|
||||||
@@ -343,7 +346,6 @@ traverse(const ASTNode* node)
|
|||||||
if (node->type == NODE_FUNCTION_DECLARATION)
|
if (node->type == NODE_FUNCTION_DECLARATION)
|
||||||
inside_function_declaration = false;
|
inside_function_declaration = false;
|
||||||
|
|
||||||
|
|
||||||
// If the node is a subscript expression and the expression list inside it is not empty
|
// If the node is a subscript expression and the expression list inside it is not empty
|
||||||
if (node->type == NODE_MULTIDIM_SUBSCRIPT_EXPRESSION && node->rhs)
|
if (node->type == NODE_MULTIDIM_SUBSCRIPT_EXPRESSION && node->rhs)
|
||||||
printf("IDX(");
|
printf("IDX(");
|
||||||
@@ -354,7 +356,7 @@ traverse(const ASTNode* node)
|
|||||||
if (handle >= 0) { // The variable exists in the symbol table
|
if (handle >= 0) { // The variable exists in the symbol table
|
||||||
const Symbol* symbol = &symbol_table[handle];
|
const Symbol* symbol = &symbol_table[handle];
|
||||||
|
|
||||||
//if (symbol->type_qualifier == OUT) {
|
// if (symbol->type_qualifier == OUT) {
|
||||||
// printf("%s%s", inout_name_prefix, symbol->identifier);
|
// printf("%s%s", inout_name_prefix, symbol->identifier);
|
||||||
//}
|
//}
|
||||||
if (symbol->type_qualifier == UNIFORM) {
|
if (symbol->type_qualifier == UNIFORM) {
|
||||||
@@ -400,8 +402,10 @@ traverse(const ASTNode* node)
|
|||||||
if (inside_kernel && node->type == NODE_COMPOUND_STATEMENT) {
|
if (inside_kernel && node->type == NODE_COMPOUND_STATEMENT) {
|
||||||
for (int i = 0; i < num_symbols; ++i) {
|
for (int i = 0; i < num_symbols; ++i) {
|
||||||
if (symbol_table[i].type_qualifier == OUT) {
|
if (symbol_table[i].type_qualifier == OUT) {
|
||||||
printf("WRITE_OUT(%s%s, %s);\n", inout_name_prefix, symbol_table[i].identifier, symbol_table[i].identifier);
|
printf("WRITE_OUT(%s%s, %s);\n", inout_name_prefix, symbol_table[i].identifier,
|
||||||
//printf("buffer.out[%s%s][IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z)] = %s;\n", inout_name_prefix, symbol_table[i].identifier, symbol_table[i].identifier);
|
symbol_table[i].identifier);
|
||||||
|
// printf("buffer.out[%s%s][IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z)] = %s;\n",
|
||||||
|
// inout_name_prefix, symbol_table[i].identifier, symbol_table[i].identifier);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -486,8 +490,8 @@ generate_preprocessed_structures(void)
|
|||||||
|
|
||||||
for (int i = 0; i < num_symbols; ++i) {
|
for (int i = 0; i < num_symbols; ++i) {
|
||||||
if (symbol_table[i].type_qualifier == PREPROCESSED)
|
if (symbol_table[i].type_qualifier == PREPROCESSED)
|
||||||
printf("data.%s = preprocessed_%s(vertexIdx, buf[handle]);\n", symbol_table[i].identifier,
|
printf("data.%s = preprocessed_%s(vertexIdx, buf[handle]);\n",
|
||||||
symbol_table[i].identifier);
|
symbol_table[i].identifier, symbol_table[i].identifier);
|
||||||
}
|
}
|
||||||
printf("return data;\n");
|
printf("return data;\n");
|
||||||
printf("}\n");
|
printf("}\n");
|
||||||
|
@@ -41,7 +41,6 @@ extern "C" {
|
|||||||
#include <stdlib.h> // size_t
|
#include <stdlib.h> // size_t
|
||||||
#include <vector_types.h> // CUDA vector types (float4, etc)
|
#include <vector_types.h> // CUDA vector types (float4, etc)
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* =============================================================================
|
* =============================================================================
|
||||||
* Flags for auto-optimization
|
* Flags for auto-optimization
|
||||||
@@ -59,7 +58,6 @@ extern "C" {
|
|||||||
#define NUM_ITERATIONS (10)
|
#define NUM_ITERATIONS (10)
|
||||||
#define WARP_SIZE (32)
|
#define WARP_SIZE (32)
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* =============================================================================
|
* =============================================================================
|
||||||
* Compile-time constants used during simulation (user definable)
|
* Compile-time constants used during simulation (user definable)
|
||||||
@@ -75,7 +73,8 @@ extern "C" {
|
|||||||
|
|
||||||
// L-prefix inherited from the old Astaroth, no idea what it means
|
// L-prefix inherited from the old Astaroth, no idea what it means
|
||||||
// MV: L means a Logical switch variable, something having a true or false value.
|
// MV: L means a Logical switch variable, something having a true or false value.
|
||||||
#define LFORCING (0) // Note: forcing is disabled currently in the files generated by acc (compiler of our DSL)
|
// Note: forcing is disabled currently in the files generated by acc (compiler of our DSL)
|
||||||
|
#define LFORCING (0)
|
||||||
#define LINDUCTION (1)
|
#define LINDUCTION (1)
|
||||||
#define LENTROPY (1)
|
#define LENTROPY (1)
|
||||||
#define LTEMPERATURE (0)
|
#define LTEMPERATURE (0)
|
||||||
@@ -258,28 +257,16 @@ typedef enum { AC_SUCCESS = 0, AC_FAILURE = 1 } AcResult;
|
|||||||
* Reduction types
|
* Reduction types
|
||||||
* =============================================================================
|
* =============================================================================
|
||||||
*/
|
*/
|
||||||
typedef enum {
|
typedef enum { RTYPE_MAX, RTYPE_MIN, RTYPE_RMS, RTYPE_RMS_EXP, NUM_REDUCTION_TYPES } ReductionType;
|
||||||
RTYPE_MAX,
|
|
||||||
RTYPE_MIN,
|
|
||||||
RTYPE_RMS,
|
|
||||||
RTYPE_RMS_EXP,
|
|
||||||
NUM_REDUCTION_TYPES
|
|
||||||
} ReductionType;
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* =============================================================================
|
* =============================================================================
|
||||||
* Definitions for the enums and structs for AcMeshInfo (DO NOT TOUCH)
|
* Definitions for the enums and structs for AcMeshInfo (DO NOT TOUCH)
|
||||||
* =============================================================================
|
* =============================================================================
|
||||||
*/
|
*/
|
||||||
typedef enum {
|
typedef enum { AC_FOR_INT_PARAM_TYPES(AC_GEN_ID), NUM_INT_PARAM_TYPES } AcIntParam;
|
||||||
AC_FOR_INT_PARAM_TYPES(AC_GEN_ID),
|
|
||||||
NUM_INT_PARAM_TYPES
|
|
||||||
} AcIntParam;
|
|
||||||
|
|
||||||
typedef enum {
|
typedef enum { AC_FOR_REAL_PARAM_TYPES(AC_GEN_ID), NUM_REAL_PARAM_TYPES } AcRealParam;
|
||||||
AC_FOR_REAL_PARAM_TYPES(AC_GEN_ID),
|
|
||||||
NUM_REAL_PARAM_TYPES
|
|
||||||
} AcRealParam;
|
|
||||||
|
|
||||||
extern const char* intparam_names[]; // Defined in astaroth.cu
|
extern const char* intparam_names[]; // Defined in astaroth.cu
|
||||||
extern const char* realparam_names[]; // Defined in astaroth.cu
|
extern const char* realparam_names[]; // Defined in astaroth.cu
|
||||||
@@ -294,9 +281,7 @@ typedef struct {
|
|||||||
* Definitions for the enums and structs for AcMesh (DO NOT TOUCH)
|
* Definitions for the enums and structs for AcMesh (DO NOT TOUCH)
|
||||||
* =============================================================================
|
* =============================================================================
|
||||||
*/
|
*/
|
||||||
typedef enum {
|
typedef enum { AC_FOR_VTXBUF_HANDLES(AC_GEN_ID) NUM_VTXBUF_HANDLES } VertexBufferHandle;
|
||||||
AC_FOR_VTXBUF_HANDLES(AC_GEN_ID) NUM_VTXBUF_HANDLES
|
|
||||||
} VertexBufferHandle;
|
|
||||||
|
|
||||||
extern const char* vtxbuf_names[]; // Defined in astaroth.cu
|
extern const char* vtxbuf_names[]; // Defined in astaroth.cu
|
||||||
|
|
||||||
@@ -320,12 +305,10 @@ typedef struct {
|
|||||||
((size_t)(mesh_info.int_params[AC_mx] * mesh_info.int_params[AC_my] * \
|
((size_t)(mesh_info.int_params[AC_mx] * mesh_info.int_params[AC_my] * \
|
||||||
mesh_info.int_params[AC_mz]))
|
mesh_info.int_params[AC_mz]))
|
||||||
|
|
||||||
#define AC_VTXBUF_SIZE_BYTES(mesh_info) \
|
#define AC_VTXBUF_SIZE_BYTES(mesh_info) (sizeof(AcReal) * AC_VTXBUF_SIZE(mesh_info))
|
||||||
(sizeof(AcReal) * AC_VTXBUF_SIZE(mesh_info))
|
|
||||||
|
|
||||||
#define AC_VTXBUF_COMPDOMAIN_SIZE(mesh_info) \
|
#define AC_VTXBUF_COMPDOMAIN_SIZE(mesh_info) \
|
||||||
(mesh_info.int_params[AC_nx] * mesh_info.int_params[AC_ny] * \
|
(mesh_info.int_params[AC_nx] * mesh_info.int_params[AC_ny] * mesh_info.int_params[AC_nz])
|
||||||
mesh_info.int_params[AC_nz])
|
|
||||||
|
|
||||||
#define AC_VTXBUF_COMPDOMAIN_SIZE_BYTES(mesh_info) \
|
#define AC_VTXBUF_COMPDOMAIN_SIZE_BYTES(mesh_info) \
|
||||||
(sizeof(AcReal) * AC_VTXBUF_COMPDOMAIN_SIZE(mesh_info))
|
(sizeof(AcReal) * AC_VTXBUF_COMPDOMAIN_SIZE(mesh_info))
|
||||||
|
@@ -35,7 +35,6 @@ const char* intparam_names[] = {AC_FOR_INT_PARAM_TYPES(AC_GEN_STR)};
|
|||||||
const char* realparam_names[] = {AC_FOR_REAL_PARAM_TYPES(AC_GEN_STR)};
|
const char* realparam_names[] = {AC_FOR_REAL_PARAM_TYPES(AC_GEN_STR)};
|
||||||
const char* vtxbuf_names[] = {AC_FOR_VTXBUF_HANDLES(AC_GEN_STR)};
|
const char* vtxbuf_names[] = {AC_FOR_VTXBUF_HANDLES(AC_GEN_STR)};
|
||||||
|
|
||||||
|
|
||||||
static const int MAX_NUM_DEVICES = 32;
|
static const int MAX_NUM_DEVICES = 32;
|
||||||
static int num_devices = 1;
|
static int num_devices = 1;
|
||||||
static Device devices[MAX_NUM_DEVICES] = {};
|
static Device devices[MAX_NUM_DEVICES] = {};
|
||||||
@@ -44,17 +43,9 @@ static Grid
|
|||||||
createGrid(const AcMeshInfo& config)
|
createGrid(const AcMeshInfo& config)
|
||||||
{
|
{
|
||||||
Grid grid;
|
Grid grid;
|
||||||
grid.m = (int3) {
|
|
||||||
config.int_params[AC_mx],
|
|
||||||
config.int_params[AC_my],
|
|
||||||
config.int_params[AC_mz]
|
|
||||||
};
|
|
||||||
|
|
||||||
grid.n = (int3) {
|
grid.m = (int3){config.int_params[AC_mx], config.int_params[AC_my], config.int_params[AC_mz]};
|
||||||
config.int_params[AC_nx],
|
grid.n = (int3){config.int_params[AC_nx], config.int_params[AC_ny], config.int_params[AC_nz]};
|
||||||
config.int_params[AC_ny],
|
|
||||||
config.int_params[AC_nz]
|
|
||||||
};
|
|
||||||
|
|
||||||
return grid;
|
return grid;
|
||||||
}
|
}
|
||||||
@@ -71,8 +62,7 @@ gridIdx(const Grid& grid, const int i, const int j, const int k)
|
|||||||
static int3
|
static int3
|
||||||
gridIdx3d(const Grid& grid, const int idx)
|
gridIdx3d(const Grid& grid, const int idx)
|
||||||
{
|
{
|
||||||
return (int3){idx % grid.m.x,
|
return (int3){idx % grid.m.x, (idx % (grid.m.x * grid.m.y)) / grid.m.x,
|
||||||
(idx % (grid.m.x * grid.m.y)) / grid.m.x,
|
|
||||||
idx / (grid.m.x * grid.m.y)};
|
idx / (grid.m.x * grid.m.y)};
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -119,10 +109,12 @@ acInit(const AcMeshInfo& config)
|
|||||||
ERRCHK_ALWAYS(subgrid.n.y >= STENCIL_ORDER);
|
ERRCHK_ALWAYS(subgrid.n.y >= STENCIL_ORDER);
|
||||||
ERRCHK_ALWAYS(subgrid.n.z >= STENCIL_ORDER);
|
ERRCHK_ALWAYS(subgrid.n.z >= STENCIL_ORDER);
|
||||||
|
|
||||||
|
// clang-format off
|
||||||
printf("Grid m "); printInt3(grid.m); printf("\n");
|
printf("Grid m "); printInt3(grid.m); printf("\n");
|
||||||
printf("Grid n "); printInt3(grid.n); printf("\n");
|
printf("Grid n "); printInt3(grid.n); printf("\n");
|
||||||
printf("Subrid m "); printInt3(subgrid.m); printf("\n");
|
printf("Subrid m "); printInt3(subgrid.m); printf("\n");
|
||||||
printf("Subrid n "); printInt3(subgrid.n); printf("\n");
|
printf("Subrid n "); printInt3(subgrid.n); printf("\n");
|
||||||
|
// clang-format on
|
||||||
|
|
||||||
// Initialize the devices
|
// Initialize the devices
|
||||||
for (int i = 0; i < num_devices; ++i) {
|
for (int i = 0; i < num_devices; ++i) {
|
||||||
@@ -202,8 +194,10 @@ acLoadWithOffset(const AcMesh& host_mesh, const int3& src, const int num_vertice
|
|||||||
*/
|
*/
|
||||||
if (db.z >= da.z) {
|
if (db.z >= da.z) {
|
||||||
const int copy_cells = gridIdxx(subgrid, db) - gridIdxx(subgrid, da);
|
const int copy_cells = gridIdxx(subgrid, db) - gridIdxx(subgrid, da);
|
||||||
const int3 da_local = (int3){da.x, da.y, da.z - i * grid.n.z / num_devices}; // DECOMPOSITION OFFSET HERE
|
const int3 da_local = (int3){
|
||||||
// printf("\t\tcopy %d cells to local index ", copy_cells); printInt3(da_local); printf("\n");
|
da.x, da.y, da.z - i * grid.n.z / num_devices}; // DECOMPOSITION OFFSET HERE
|
||||||
|
// printf("\t\tcopy %d cells to local index ", copy_cells); printInt3(da_local);
|
||||||
|
// printf("\n");
|
||||||
copyMeshToDevice(devices[i], STREAM_PRIMARY, host_mesh, da, da_local, copy_cells);
|
copyMeshToDevice(devices[i], STREAM_PRIMARY, host_mesh, da, da_local, copy_cells);
|
||||||
}
|
}
|
||||||
printf("\n");
|
printf("\n");
|
||||||
@@ -236,8 +230,10 @@ acStoreWithOffset(const int3& src, const int num_vertices, AcMesh* host_mesh)
|
|||||||
*/
|
*/
|
||||||
if (db.z >= da.z) {
|
if (db.z >= da.z) {
|
||||||
const int copy_cells = gridIdxx(subgrid, db) - gridIdxx(subgrid, da);
|
const int copy_cells = gridIdxx(subgrid, db) - gridIdxx(subgrid, da);
|
||||||
const int3 da_local = (int3){da.x, da.y, da.z - i * grid.n.z / num_devices}; // DECOMPOSITION OFFSET HERE
|
const int3 da_local = (int3){
|
||||||
// printf("\t\tcopy %d cells from local index ", copy_cells); printInt3(da_local); printf("\n");
|
da.x, da.y, da.z - i * grid.n.z / num_devices}; // DECOMPOSITION OFFSET HERE
|
||||||
|
// printf("\t\tcopy %d cells from local index ", copy_cells); printInt3(da_local);
|
||||||
|
// printf("\n");
|
||||||
copyMeshToHost(devices[i], STREAM_PRIMARY, da_local, da, copy_cells, host_mesh);
|
copyMeshToHost(devices[i], STREAM_PRIMARY, da_local, da, copy_cells, host_mesh);
|
||||||
}
|
}
|
||||||
printf("\n");
|
printf("\n");
|
||||||
@@ -262,10 +258,9 @@ acStore(AcMesh* host_mesh)
|
|||||||
AcResult
|
AcResult
|
||||||
acIntegrateStep(const int& isubstep, const AcReal& dt)
|
acIntegrateStep(const int& isubstep, const AcReal& dt)
|
||||||
{
|
{
|
||||||
const int3 start = (int3){STENCIL_ORDER/2, STENCIL_ORDER/2, STENCIL_ORDER/2};
|
const int3 start = (int3){STENCIL_ORDER / 2, STENCIL_ORDER / 2, STENCIL_ORDER / 2};
|
||||||
const int3 end = (int3){STENCIL_ORDER/2 + subgrid.n.x,
|
const int3 end = (int3){STENCIL_ORDER / 2 + subgrid.n.x, STENCIL_ORDER / 2 + subgrid.n.y,
|
||||||
STENCIL_ORDER/2 + subgrid.n.y,
|
STENCIL_ORDER / 2 + subgrid.n.z};
|
||||||
STENCIL_ORDER/2 + subgrid.n.z};
|
|
||||||
for (int i = 0; i < num_devices; ++i) {
|
for (int i = 0; i < num_devices; ++i) {
|
||||||
rkStep(devices[i], STREAM_PRIMARY, isubstep, start, end, dt);
|
rkStep(devices[i], STREAM_PRIMARY, isubstep, start, end, dt);
|
||||||
}
|
}
|
||||||
@@ -278,34 +273,35 @@ acBoundcondStep(void)
|
|||||||
{
|
{
|
||||||
acSynchronize();
|
acSynchronize();
|
||||||
if (num_devices == 1) {
|
if (num_devices == 1) {
|
||||||
boundcondStep(devices[0], STREAM_PRIMARY,
|
boundcondStep(devices[0], STREAM_PRIMARY, (int3){0, 0, 0},
|
||||||
(int3){0, 0, 0}, (int3){subgrid.m.x, subgrid.m.y, subgrid.m.z});
|
(int3){subgrid.m.x, subgrid.m.y, subgrid.m.z});
|
||||||
} else {
|
}
|
||||||
|
else {
|
||||||
// Local boundary conditions
|
// Local boundary conditions
|
||||||
for (int i = 0; i < num_devices; ++i) {
|
for (int i = 0; i < num_devices; ++i) {
|
||||||
const int3 d0 = (int3){0, 0, STENCIL_ORDER/2}; // DECOMPOSITION OFFSET HERE
|
const int3 d0 = (int3){0, 0, STENCIL_ORDER / 2}; // DECOMPOSITION OFFSET HERE
|
||||||
const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.n.z};
|
const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.n.z};
|
||||||
boundcondStep(devices[i], STREAM_PRIMARY, d0, d1);
|
boundcondStep(devices[i], STREAM_PRIMARY, d0, d1);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
// ===MIIKKANOTE START==========================================
|
// ===MIIKKANOTE START==========================================
|
||||||
%JP: The old way for computing boundary conditions conflicts with the
|
%JP: The old way for computing boundary conditions conflicts with the
|
||||||
way we have to do things with multiple GPUs.
|
way we have to do things with multiple GPUs.
|
||||||
|
|
||||||
The older approach relied on unified memory, which represented the whole
|
The older approach relied on unified memory, which represented the whole
|
||||||
memory area as one huge mesh instead of several smaller ones. However, unified memory
|
memory area as one huge mesh instead of several smaller ones. However, unified memory
|
||||||
in its current state is more meant for quick prototyping when performance is not an issue.
|
in its current state is more meant for quick prototyping when performance is not an issue.
|
||||||
Getting the CUDA driver to migrate data intelligently across GPUs is much more difficult than
|
Getting the CUDA driver to migrate data intelligently across GPUs is much more difficult
|
||||||
when managing the memory explicitly.
|
than when managing the memory explicitly.
|
||||||
|
|
||||||
In this new approach, I have simplified the multi- and single-GPU layers significantly.
|
In this new approach, I have simplified the multi- and single-GPU layers significantly.
|
||||||
Quick rundown:
|
Quick rundown:
|
||||||
New struct: Grid. There are two global variables, "grid" and "subgrid", which
|
New struct: Grid. There are two global variables, "grid" and "subgrid", which
|
||||||
contain the extents of the whole simulation domain and the decomposed grids, respectively.
|
contain the extents of the whole simulation domain and the decomposed grids,
|
||||||
To simplify things, we require that each GPU is assigned the same amount of work,
|
respectively. To simplify things, we require that each GPU is assigned the same amount of
|
||||||
therefore each GPU in the node is assigned a "subgrid.m" -sized block of data
|
work, therefore each GPU in the node is assigned a "subgrid.m" -sized block of data to
|
||||||
to work with.
|
work with.
|
||||||
|
|
||||||
The whole simulation domain is decomposed with respect to the z dimension.
|
The whole simulation domain is decomposed with respect to the z dimension.
|
||||||
For example, if the grid contains (nx, ny, nz) vertices, then the subgrids
|
For example, if the grid contains (nx, ny, nz) vertices, then the subgrids
|
||||||
@@ -314,49 +310,50 @@ Quick rundown:
|
|||||||
A local index (i, j, k) in some subgrid can be mapped to the global grid with
|
A local index (i, j, k) in some subgrid can be mapped to the global grid with
|
||||||
global idx = (i, j, k + device_id * subgrid.n.z)
|
global idx = (i, j, k + device_id * subgrid.n.z)
|
||||||
|
|
||||||
Terminology:
|
Terminology:
|
||||||
- Single-GPU function: a function defined on the single-GPU layer (device.cu)
|
- Single-GPU function: a function defined on the single-GPU layer (device.cu)
|
||||||
|
|
||||||
Changes required to this commented code block:
|
Changes required to this commented code block:
|
||||||
- The thread block dimensions (tpb) are no longer passed to the kernel here but in device.cu
|
- The thread block dimensions (tpb) are no longer passed to the kernel here but in
|
||||||
instead. Same holds for any complex index calculations. Instead, the local coordinates
|
device.cu instead. Same holds for any complex index calculations. Instead, the local
|
||||||
should be passed as an int3 type without having to consider how the data is actually
|
coordinates should be passed as an int3 type without having to consider how the data is
|
||||||
laid out in device memory
|
actually laid out in device memory
|
||||||
- The unified memory buffer no longer exists (d_buffer). Instead, we have an opaque handle
|
- The unified memory buffer no longer exists (d_buffer). Instead, we have an opaque
|
||||||
of type "Device" which should be passed to single-GPU functions. In this file, all devices
|
handle of type "Device" which should be passed to single-GPU functions. In this file, all
|
||||||
are stored in a global array "devices[num_devices]".
|
devices are stored in a global array "devices[num_devices]".
|
||||||
- Every single-GPU function is executed asynchronously by default such that we
|
- Every single-GPU function is executed asynchronously by default such that we
|
||||||
can optimize Astaroth by executing memory transactions concurrently with computation.
|
can optimize Astaroth by executing memory transactions concurrently with
|
||||||
Therefore a StreamType should be passed as a parameter to single-GPU functions.
|
computation. Therefore a StreamType should be passed as a parameter to single-GPU functions.
|
||||||
Refresher: CUDA function calls are non-blocking when a stream is explicitly passed
|
Refresher: CUDA function calls are non-blocking when a stream is explicitly passed
|
||||||
as a parameter and commands executing in different streams can be processed
|
as a parameter and commands executing in different streams can be processed
|
||||||
in parallel/concurrently.
|
in parallel/concurrently.
|
||||||
|
|
||||||
|
|
||||||
Note on periodic boundaries (might be helpful when implementing other boundary conditions):
|
Note on periodic boundaries (might be helpful when implementing other boundary conditions):
|
||||||
|
|
||||||
With multiple GPUs, periodic boundary conditions applied on indices ranging from
|
With multiple GPUs, periodic boundary conditions applied on indices ranging from
|
||||||
|
|
||||||
(0, 0, STENCIL_ORDER/2) to (subgrid.m.x, subgrid.m.y, subgrid.m.z - STENCIL_ORDER/2)
|
(0, 0, STENCIL_ORDER/2) to (subgrid.m.x, subgrid.m.y, subgrid.m.z -
|
||||||
|
STENCIL_ORDER/2)
|
||||||
|
|
||||||
on a single device are "local", in the sense that they can be computed without having
|
on a single device are "local", in the sense that they can be computed without
|
||||||
to exchange data with neighboring GPUs. Special care is needed only for transferring
|
having to exchange data with neighboring GPUs. Special care is needed only for transferring
|
||||||
the data to the front and back plates outside this range. In the solution we use here,
|
the data to the front and back plates outside this range. In the solution we use
|
||||||
we solve the local boundaries first, and then just exchange the front and back plates
|
here, we solve the local boundaries first, and then just exchange the front and back plates
|
||||||
in a "ring", like so
|
in a "ring", like so
|
||||||
device_id
|
device_id
|
||||||
(n) <-> 0 <-> 1 <-> ... <-> n <-> (0)
|
(n) <-> 0 <-> 1 <-> ... <-> n <-> (0)
|
||||||
|
|
||||||
|
|
||||||
// ======MIIKKANOTE END==========================================
|
// ======MIIKKANOTE END==========================================
|
||||||
|
|
||||||
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MIIKKANOTE: This code block was essentially
|
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MIIKKANOTE: This code block was essentially
|
||||||
moved into device.cu, function boundCondStep()
|
moved into device.cu, function
|
||||||
In astaroth.cu, we use acBoundcondStep()
|
boundCondStep() In astaroth.cu, we use acBoundcondStep() just to distribute the work and
|
||||||
just to distribute the work and manage
|
manage communication between GPUs.
|
||||||
communication between GPUs.
|
|
||||||
|
|
||||||
printf("Boundconds best dims (%d, %d, %d) %f ms\n", best_dims.x, best_dims.y, best_dims.z, double(best_time) / NUM_ITERATIONS);
|
printf("Boundconds best dims (%d, %d, %d) %f ms\n", best_dims.x, best_dims.y,
|
||||||
|
best_dims.z, double(best_time) / NUM_ITERATIONS);
|
||||||
|
|
||||||
exit(0);
|
exit(0);
|
||||||
#else
|
#else
|
||||||
@@ -377,22 +374,24 @@ Note on periodic boundaries (might be helpful when implementing other boundary c
|
|||||||
} else {
|
} else {
|
||||||
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i)
|
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i)
|
||||||
periodic_boundconds(0, tpb, start, end, d_buffer.in[i]);
|
periodic_boundconds(0, tpb, start, end, d_buffer.in[i]);
|
||||||
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
|
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||||
*/
|
*/
|
||||||
// Exchange halos
|
// Exchange halos
|
||||||
for (int i = 0; i < num_devices; ++i) {
|
for (int i = 0; i < num_devices; ++i) {
|
||||||
const int num_vertices = subgrid.m.x * subgrid.m.y * STENCIL_ORDER/2;
|
const int num_vertices = subgrid.m.x * subgrid.m.y * STENCIL_ORDER / 2;
|
||||||
// ...|ooooxxx|... -> xxx|ooooooo|...
|
// ...|ooooxxx|... -> xxx|ooooooo|...
|
||||||
{
|
{
|
||||||
const int3 src = (int3) {0, 0, subgrid.n.z};
|
const int3 src = (int3){0, 0, subgrid.n.z};
|
||||||
const int3 dst = (int3) {0, 0, 0};
|
const int3 dst = (int3){0, 0, 0};
|
||||||
copyMeshDeviceToDevice(devices[i], STREAM_PRIMARY, src, devices[(i+1) % num_devices], dst, num_vertices);
|
copyMeshDeviceToDevice(devices[i], STREAM_PRIMARY, src,
|
||||||
|
devices[(i + 1) % num_devices], dst, num_vertices);
|
||||||
}
|
}
|
||||||
// ...|ooooooo|xxx <- ...|xxxoooo|...
|
// ...|ooooooo|xxx <- ...|xxxoooo|...
|
||||||
{
|
{
|
||||||
const int3 src = (int3) {0, 0, STENCIL_ORDER/2};
|
const int3 src = (int3){0, 0, STENCIL_ORDER / 2};
|
||||||
const int3 dst = (int3) {0, 0, STENCIL_ORDER/2 + subgrid.n.z};
|
const int3 dst = (int3){0, 0, STENCIL_ORDER / 2 + subgrid.n.z};
|
||||||
copyMeshDeviceToDevice(devices[(i+1) % num_devices], STREAM_PRIMARY, src, devices[i], dst, num_vertices);
|
copyMeshDeviceToDevice(devices[(i + 1) % num_devices], STREAM_PRIMARY, src,
|
||||||
|
devices[i], dst, num_vertices);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -427,11 +426,14 @@ simple_final_reduce_scal(const ReductionType& rtype, const AcReal* results, cons
|
|||||||
for (int i = 1; i < n; ++i) {
|
for (int i = 1; i < n; ++i) {
|
||||||
if (rtype == RTYPE_MAX) {
|
if (rtype == RTYPE_MAX) {
|
||||||
res = max(res, results[i]);
|
res = max(res, results[i]);
|
||||||
} else if (rtype == RTYPE_MIN) {
|
}
|
||||||
|
else if (rtype == RTYPE_MIN) {
|
||||||
res = min(res, results[i]);
|
res = min(res, results[i]);
|
||||||
} else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
|
}
|
||||||
|
else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
|
||||||
res = sum(res, results[i]);
|
res = sum(res, results[i]);
|
||||||
} else {
|
}
|
||||||
|
else {
|
||||||
ERROR("Invalid rtype");
|
ERROR("Invalid rtype");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -445,8 +447,7 @@ simple_final_reduce_scal(const ReductionType& rtype, const AcReal* results, cons
|
|||||||
}
|
}
|
||||||
|
|
||||||
AcReal
|
AcReal
|
||||||
acReduceScal(const ReductionType& rtype,
|
acReduceScal(const ReductionType& rtype, const VertexBufferHandle& vtxbuffer_handle)
|
||||||
const VertexBufferHandle& vtxbuffer_handle)
|
|
||||||
{
|
{
|
||||||
AcReal results[num_devices];
|
AcReal results[num_devices];
|
||||||
for (int i = 0; i < num_devices; ++i) {
|
for (int i = 0; i < num_devices; ++i) {
|
||||||
@@ -457,8 +458,8 @@ acReduceScal(const ReductionType& rtype,
|
|||||||
}
|
}
|
||||||
|
|
||||||
AcReal
|
AcReal
|
||||||
acReduceVec(const ReductionType& rtype, const VertexBufferHandle& a,
|
acReduceVec(const ReductionType& rtype, const VertexBufferHandle& a, const VertexBufferHandle& b,
|
||||||
const VertexBufferHandle& b, const VertexBufferHandle& c)
|
const VertexBufferHandle& c)
|
||||||
{
|
{
|
||||||
AcReal results[num_devices];
|
AcReal results[num_devices];
|
||||||
for (int i = 0; i < num_devices; ++i) {
|
for (int i = 0; i < num_devices; ++i) {
|
||||||
|
@@ -76,46 +76,46 @@ printDeviceInfo(const Device device)
|
|||||||
printf(" Clock rate (GHz): %g\n", props.clockRate / 1e6); // KHz -> GHz
|
printf(" Clock rate (GHz): %g\n", props.clockRate / 1e6); // KHz -> GHz
|
||||||
printf(" Stream processors: %d\n", props.multiProcessorCount);
|
printf(" Stream processors: %d\n", props.multiProcessorCount);
|
||||||
printf(" SP to DP flops performance ratio: %d:1\n", props.singleToDoublePrecisionPerfRatio);
|
printf(" SP to DP flops performance ratio: %d:1\n", props.singleToDoublePrecisionPerfRatio);
|
||||||
printf(" Compute mode: %d\n", (int)props.computeMode); // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g7eb25f5413a962faad0956d92bae10d0
|
printf(
|
||||||
|
" Compute mode: %d\n",
|
||||||
|
(int)props
|
||||||
|
.computeMode); // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g7eb25f5413a962faad0956d92bae10d0
|
||||||
// Memory
|
// Memory
|
||||||
printf(" Global memory\n");
|
printf(" Global memory\n");
|
||||||
printf(" Memory Clock Rate (MHz): %d\n", props.memoryClockRate / (1000));
|
printf(" Memory Clock Rate (MHz): %d\n", props.memoryClockRate / (1000));
|
||||||
printf(" Memory Bus Width (bits): %d\n", props.memoryBusWidth);
|
printf(" Memory Bus Width (bits): %d\n", props.memoryBusWidth);
|
||||||
printf(" Peak Memory Bandwidth (GiB/s): %f\n",
|
printf(" Peak Memory Bandwidth (GiB/s): %f\n",
|
||||||
2 * (props.memoryClockRate * 1e3) * props.memoryBusWidth /
|
2 * (props.memoryClockRate * 1e3) * props.memoryBusWidth / (8. * 1024. * 1024. * 1024.));
|
||||||
(8. * 1024. * 1024. * 1024.));
|
|
||||||
printf(" ECC enabled: %d\n", props.ECCEnabled);
|
printf(" ECC enabled: %d\n", props.ECCEnabled);
|
||||||
// Memory usage
|
// Memory usage
|
||||||
size_t free_bytes, total_bytes;
|
size_t free_bytes, total_bytes;
|
||||||
cudaMemGetInfo(&free_bytes, &total_bytes);
|
cudaMemGetInfo(&free_bytes, &total_bytes);
|
||||||
const size_t used_bytes = total_bytes - free_bytes;
|
const size_t used_bytes = total_bytes - free_bytes;
|
||||||
printf(" Total global mem: %.2f GiB\n",
|
printf(" Total global mem: %.2f GiB\n", props.totalGlobalMem / (1024.0 * 1024 * 1024));
|
||||||
props.totalGlobalMem / (1024.0 * 1024 * 1024));
|
|
||||||
printf(" Gmem used (GiB): %.2f\n", used_bytes / (1024.0 * 1024 * 1024));
|
printf(" Gmem used (GiB): %.2f\n", used_bytes / (1024.0 * 1024 * 1024));
|
||||||
printf(" Gmem memory free (GiB): %.2f\n",
|
printf(" Gmem memory free (GiB): %.2f\n", free_bytes / (1024.0 * 1024 * 1024));
|
||||||
free_bytes / (1024.0 * 1024 * 1024));
|
printf(" Gmem memory total (GiB): %.2f\n", total_bytes / (1024.0 * 1024 * 1024));
|
||||||
printf(" Gmem memory total (GiB): %.2f\n",
|
|
||||||
total_bytes / (1024.0 * 1024 * 1024));
|
|
||||||
printf(" Caches\n");
|
printf(" Caches\n");
|
||||||
printf(" Local L1 cache supported: %d\n", props.localL1CacheSupported);
|
printf(" Local L1 cache supported: %d\n", props.localL1CacheSupported);
|
||||||
printf(" Global L1 cache supported: %d\n", props.globalL1CacheSupported);
|
printf(" Global L1 cache supported: %d\n", props.globalL1CacheSupported);
|
||||||
printf(" L2 size: %d KiB\n", props.l2CacheSize / (1024));
|
printf(" L2 size: %d KiB\n", props.l2CacheSize / (1024));
|
||||||
printf(" Total const mem: %ld KiB\n", props.totalConstMem / (1024));
|
printf(" Total const mem: %ld KiB\n", props.totalConstMem / (1024));
|
||||||
printf(" Shared mem per block: %ld KiB\n",
|
printf(" Shared mem per block: %ld KiB\n", props.sharedMemPerBlock / (1024));
|
||||||
props.sharedMemPerBlock / (1024));
|
|
||||||
printf(" Other\n");
|
printf(" Other\n");
|
||||||
printf(" Warp size: %d\n", props.warpSize);
|
printf(" Warp size: %d\n", props.warpSize);
|
||||||
// printf(" Single to double perf. ratio: %dx\n",
|
// printf(" Single to double perf. ratio: %dx\n",
|
||||||
// props.singleToDoublePrecisionPerfRatio); //Not supported with older CUDA
|
// props.singleToDoublePrecisionPerfRatio); //Not supported with older CUDA
|
||||||
// versions
|
// versions
|
||||||
printf(" Stream priorities supported: %d\n",
|
printf(" Stream priorities supported: %d\n", props.streamPrioritiesSupported);
|
||||||
props.streamPrioritiesSupported);
|
|
||||||
printf("--------------------------------------------------\n");
|
printf("--------------------------------------------------\n");
|
||||||
|
|
||||||
return AC_SUCCESS;
|
return AC_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
static __global__ void dummy_kernel(void) {}
|
static __global__ void
|
||||||
|
dummy_kernel(void)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
AcResult
|
AcResult
|
||||||
createDevice(const int id, const AcMeshInfo device_config, Device* device_handle)
|
createDevice(const int id, const AcMeshInfo device_config, Device* device_handle)
|
||||||
@@ -124,7 +124,7 @@ createDevice(const int id, const AcMeshInfo device_config, Device* device_handle
|
|||||||
cudaDeviceReset();
|
cudaDeviceReset();
|
||||||
|
|
||||||
// Create Device
|
// Create Device
|
||||||
struct device_s* device = (struct device_s*) malloc(sizeof(*device));
|
struct device_s* device = (struct device_s*)malloc(sizeof(*device));
|
||||||
ERRCHK_ALWAYS(device);
|
ERRCHK_ALWAYS(device);
|
||||||
|
|
||||||
device->id = id;
|
device->id = id;
|
||||||
@@ -150,15 +150,14 @@ createDevice(const int id, const AcMeshInfo device_config, Device* device_handle
|
|||||||
ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.in[i], vba_size_bytes));
|
ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.in[i], vba_size_bytes));
|
||||||
ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.out[i], vba_size_bytes));
|
ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.out[i], vba_size_bytes));
|
||||||
}
|
}
|
||||||
ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->reduce_scratchpad,
|
ERRCHK_CUDA_ALWAYS(
|
||||||
AC_VTXBUF_COMPDOMAIN_SIZE_BYTES(device_config)));
|
cudaMalloc(&device->reduce_scratchpad, AC_VTXBUF_COMPDOMAIN_SIZE_BYTES(device_config)));
|
||||||
ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->reduce_result, sizeof(AcReal)));
|
ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->reduce_result, sizeof(AcReal)));
|
||||||
|
|
||||||
// Device constants
|
// Device constants
|
||||||
ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_mesh_info, &device_config, sizeof(device_config), 0,
|
ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_mesh_info, &device_config, sizeof(device_config), 0,
|
||||||
cudaMemcpyHostToDevice));
|
cudaMemcpyHostToDevice));
|
||||||
|
|
||||||
|
|
||||||
// Multi-GPU offset. This is used to compute globalVertexIdx.
|
// Multi-GPU offset. This is used to compute globalVertexIdx.
|
||||||
// Might be better to calculate this in astaroth.cu instead of here, s.t.
|
// Might be better to calculate this in astaroth.cu instead of here, s.t.
|
||||||
// everything related to the decomposition is limited to the multi-GPU layer
|
// everything related to the decomposition is limited to the multi-GPU layer
|
||||||
@@ -166,7 +165,6 @@ createDevice(const int id, const AcMeshInfo device_config, Device* device_handle
|
|||||||
ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_multigpu_offset, &multigpu_offset,
|
ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_multigpu_offset, &multigpu_offset,
|
||||||
sizeof(multigpu_offset), 0, cudaMemcpyHostToDevice));
|
sizeof(multigpu_offset), 0, cudaMemcpyHostToDevice));
|
||||||
|
|
||||||
|
|
||||||
printf("Created device %d (%p)\n", device->id, device);
|
printf("Created device %d (%p)\n", device->id, device);
|
||||||
*device_handle = device;
|
*device_handle = device;
|
||||||
return AC_SUCCESS;
|
return AC_SUCCESS;
|
||||||
@@ -211,53 +209,44 @@ reduceScal(const Device device, const StreamType stream_type, const ReductionTyp
|
|||||||
{
|
{
|
||||||
cudaSetDevice(device->id);
|
cudaSetDevice(device->id);
|
||||||
|
|
||||||
const int3 start = (int3) {device->local_config.int_params[AC_nx_min],
|
const int3 start = (int3){device->local_config.int_params[AC_nx_min],
|
||||||
device->local_config.int_params[AC_ny_min],
|
device->local_config.int_params[AC_ny_min],
|
||||||
device->local_config.int_params[AC_nz_min]
|
device->local_config.int_params[AC_nz_min]};
|
||||||
};
|
|
||||||
|
|
||||||
const int3 end = (int3) {device->local_config.int_params[AC_nx_max],
|
const int3 end = (int3){device->local_config.int_params[AC_nx_max],
|
||||||
device->local_config.int_params[AC_ny_max],
|
device->local_config.int_params[AC_ny_max],
|
||||||
device->local_config.int_params[AC_nz_max]
|
device->local_config.int_params[AC_nz_max]};
|
||||||
};
|
|
||||||
|
|
||||||
*result = reduce_scal(device->streams[stream_type], rtype,
|
*result = reduce_scal(device->streams[stream_type], rtype, start, end,
|
||||||
start, end, device->vba.in[vtxbuf_handle],
|
device->vba.in[vtxbuf_handle], device->reduce_scratchpad,
|
||||||
device->reduce_scratchpad, device->reduce_result);
|
device->reduce_result);
|
||||||
return AC_SUCCESS;
|
return AC_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
AcResult
|
AcResult
|
||||||
reduceVec(const Device device, const StreamType stream_type,
|
reduceVec(const Device device, const StreamType stream_type, const ReductionType rtype,
|
||||||
const ReductionType rtype,
|
const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1,
|
||||||
const VertexBufferHandle vtxbuf0,
|
const VertexBufferHandle vtxbuf2, AcReal* result)
|
||||||
const VertexBufferHandle vtxbuf1,
|
|
||||||
const VertexBufferHandle vtxbuf2,
|
|
||||||
AcReal* result)
|
|
||||||
{
|
{
|
||||||
cudaSetDevice(device->id);
|
cudaSetDevice(device->id);
|
||||||
|
|
||||||
const int3 start = (int3) {device->local_config.int_params[AC_nx_min],
|
const int3 start = (int3){device->local_config.int_params[AC_nx_min],
|
||||||
device->local_config.int_params[AC_ny_min],
|
device->local_config.int_params[AC_ny_min],
|
||||||
device->local_config.int_params[AC_nz_min]
|
device->local_config.int_params[AC_nz_min]};
|
||||||
};
|
|
||||||
|
|
||||||
const int3 end = (int3) {device->local_config.int_params[AC_nx_max],
|
const int3 end = (int3){device->local_config.int_params[AC_nx_max],
|
||||||
device->local_config.int_params[AC_ny_max],
|
device->local_config.int_params[AC_ny_max],
|
||||||
device->local_config.int_params[AC_nz_max]
|
device->local_config.int_params[AC_nz_max]};
|
||||||
};
|
|
||||||
|
|
||||||
*result = reduce_vec(device->streams[stream_type], rtype, start, end,
|
*result = reduce_vec(device->streams[stream_type], rtype, start, end, device->vba.in[vtxbuf0],
|
||||||
device->vba.in[vtxbuf0],
|
device->vba.in[vtxbuf1], device->vba.in[vtxbuf2],
|
||||||
device->vba.in[vtxbuf1],
|
|
||||||
device->vba.in[vtxbuf2],
|
|
||||||
device->reduce_scratchpad, device->reduce_result);
|
device->reduce_scratchpad, device->reduce_result);
|
||||||
return AC_SUCCESS;
|
return AC_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
AcResult
|
AcResult
|
||||||
rkStep(const Device device, const StreamType stream_type, const int step_number,
|
rkStep(const Device device, const StreamType stream_type, const int step_number, const int3& start,
|
||||||
const int3& start, const int3& end, const AcReal dt)
|
const int3& end, const AcReal dt)
|
||||||
{
|
{
|
||||||
cudaSetDevice(device->id);
|
cudaSetDevice(device->id);
|
||||||
rk3_step_async(device->streams[stream_type], step_number, start, end, dt, &device->vba);
|
rk3_step_async(device->streams[stream_type], step_number, start, end, dt, &device->vba);
|
||||||
@@ -270,65 +259,62 @@ synchronize(const Device device, const StreamType stream_type)
|
|||||||
cudaSetDevice(device->id);
|
cudaSetDevice(device->id);
|
||||||
if (stream_type == STREAM_ALL) {
|
if (stream_type == STREAM_ALL) {
|
||||||
cudaDeviceSynchronize();
|
cudaDeviceSynchronize();
|
||||||
} else {
|
}
|
||||||
|
else {
|
||||||
cudaStreamSynchronize(device->streams[stream_type]);
|
cudaStreamSynchronize(device->streams[stream_type]);
|
||||||
}
|
}
|
||||||
return AC_SUCCESS;
|
return AC_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
static AcResult
|
static AcResult
|
||||||
loadWithOffset(const Device device, const StreamType stream_type,
|
loadWithOffset(const Device device, const StreamType stream_type, const AcReal* src,
|
||||||
const AcReal* src, const size_t bytes, AcReal* dst)
|
const size_t bytes, AcReal* dst)
|
||||||
{
|
{
|
||||||
cudaSetDevice(device->id);
|
cudaSetDevice(device->id);
|
||||||
ERRCHK_CUDA(cudaMemcpyAsync(dst, src, bytes, cudaMemcpyHostToDevice,
|
ERRCHK_CUDA(
|
||||||
device->streams[stream_type]));
|
cudaMemcpyAsync(dst, src, bytes, cudaMemcpyHostToDevice, device->streams[stream_type]));
|
||||||
return AC_SUCCESS;
|
return AC_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
static AcResult
|
static AcResult
|
||||||
storeWithOffset(const Device device, const StreamType stream_type,
|
storeWithOffset(const Device device, const StreamType stream_type, const AcReal* src,
|
||||||
const AcReal* src, const size_t bytes, AcReal* dst)
|
const size_t bytes, AcReal* dst)
|
||||||
{
|
{
|
||||||
cudaSetDevice(device->id);
|
cudaSetDevice(device->id);
|
||||||
ERRCHK_CUDA(cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDeviceToHost,
|
ERRCHK_CUDA(
|
||||||
device->streams[stream_type]));
|
cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDeviceToHost, device->streams[stream_type]));
|
||||||
return AC_SUCCESS;
|
return AC_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
AcResult
|
AcResult
|
||||||
copyMeshToDevice(const Device device, const StreamType stream_type,
|
copyMeshToDevice(const Device device, const StreamType stream_type, const AcMesh& host_mesh,
|
||||||
const AcMesh& host_mesh, const int3& src, const int3& dst,
|
const int3& src, const int3& dst, const int num_vertices)
|
||||||
const int num_vertices)
|
|
||||||
{
|
{
|
||||||
const size_t src_idx = AC_VTXBUF_IDX(src.x, src.y, src.z, host_mesh.info);
|
const size_t src_idx = AC_VTXBUF_IDX(src.x, src.y, src.z, host_mesh.info);
|
||||||
const size_t dst_idx = AC_VTXBUF_IDX(dst.x, dst.y, dst.z, device->local_config);
|
const size_t dst_idx = AC_VTXBUF_IDX(dst.x, dst.y, dst.z, device->local_config);
|
||||||
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
|
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
|
||||||
loadWithOffset(device, stream_type, &host_mesh.vertex_buffer[i][src_idx], num_vertices * sizeof(AcReal),
|
loadWithOffset(device, stream_type, &host_mesh.vertex_buffer[i][src_idx],
|
||||||
&device->vba.in[i][dst_idx]);
|
num_vertices * sizeof(AcReal), &device->vba.in[i][dst_idx]);
|
||||||
}
|
}
|
||||||
return AC_SUCCESS;
|
return AC_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
AcResult
|
AcResult
|
||||||
copyMeshToHost(const Device device, const StreamType stream_type,
|
copyMeshToHost(const Device device, const StreamType stream_type, const int3& src, const int3& dst,
|
||||||
const int3& src, const int3& dst, const int num_vertices,
|
const int num_vertices, AcMesh* host_mesh)
|
||||||
AcMesh* host_mesh)
|
|
||||||
{
|
{
|
||||||
const size_t src_idx = AC_VTXBUF_IDX(src.x, src.y, src.z, device->local_config);
|
const size_t src_idx = AC_VTXBUF_IDX(src.x, src.y, src.z, device->local_config);
|
||||||
const size_t dst_idx = AC_VTXBUF_IDX(dst.x, dst.y, dst.z, host_mesh->info);
|
const size_t dst_idx = AC_VTXBUF_IDX(dst.x, dst.y, dst.z, host_mesh->info);
|
||||||
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
|
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
|
||||||
storeWithOffset(device, stream_type, &device->vba.in[i][src_idx],
|
storeWithOffset(device, stream_type, &device->vba.in[i][src_idx],
|
||||||
num_vertices * sizeof(AcReal),
|
num_vertices * sizeof(AcReal), &host_mesh->vertex_buffer[i][dst_idx]);
|
||||||
&host_mesh->vertex_buffer[i][dst_idx]);
|
|
||||||
}
|
}
|
||||||
return AC_SUCCESS;
|
return AC_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
AcResult
|
AcResult
|
||||||
copyMeshDeviceToDevice(const Device src_device, const StreamType stream_type,
|
copyMeshDeviceToDevice(const Device src_device, const StreamType stream_type, const int3& src,
|
||||||
const int3& src, Device dst_device, const int3& dst,
|
Device dst_device, const int3& dst, const int num_vertices)
|
||||||
const int num_vertices)
|
|
||||||
{
|
{
|
||||||
cudaSetDevice(src_device->id);
|
cudaSetDevice(src_device->id);
|
||||||
const size_t src_idx = AC_VTXBUF_IDX(src.x, src.y, src.z, src_device->local_config);
|
const size_t src_idx = AC_VTXBUF_IDX(src.x, src.y, src.z, src_device->local_config);
|
||||||
@@ -364,8 +350,8 @@ loadDeviceConstant(const Device device, const AcIntParam param, const int value)
|
|||||||
// Therefore we have to obfuscate the code a bit and compute the offset address before
|
// Therefore we have to obfuscate the code a bit and compute the offset address before
|
||||||
// invoking cudaMemcpyToSymbol.
|
// invoking cudaMemcpyToSymbol.
|
||||||
const size_t offset = (size_t)&d_mesh_info.int_params[param] - (size_t)&d_mesh_info;
|
const size_t offset = (size_t)&d_mesh_info.int_params[param] - (size_t)&d_mesh_info;
|
||||||
ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_mesh_info, &value, sizeof(value),
|
ERRCHK_CUDA_ALWAYS(
|
||||||
offset, cudaMemcpyHostToDevice));
|
cudaMemcpyToSymbol(d_mesh_info, &value, sizeof(value), offset, cudaMemcpyHostToDevice));
|
||||||
return AC_SUCCESS;
|
return AC_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -374,8 +360,8 @@ loadDeviceConstant(const Device device, const AcRealParam param, const AcReal va
|
|||||||
{
|
{
|
||||||
cudaSetDevice(device->id);
|
cudaSetDevice(device->id);
|
||||||
const size_t offset = (size_t)&d_mesh_info.real_params[param] - (size_t)&d_mesh_info;
|
const size_t offset = (size_t)&d_mesh_info.real_params[param] - (size_t)&d_mesh_info;
|
||||||
ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_mesh_info, &value, sizeof(value),
|
ERRCHK_CUDA_ALWAYS(
|
||||||
offset, cudaMemcpyHostToDevice));
|
cudaMemcpyToSymbol(d_mesh_info, &value, sizeof(value), offset, cudaMemcpyHostToDevice));
|
||||||
return AC_SUCCESS;
|
return AC_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -383,7 +369,7 @@ AcResult
|
|||||||
loadGlobalGrid(const Device device, const Grid grid)
|
loadGlobalGrid(const Device device, const Grid grid)
|
||||||
{
|
{
|
||||||
cudaSetDevice(device->id);
|
cudaSetDevice(device->id);
|
||||||
ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(globalGrid, &grid, sizeof(grid),
|
ERRCHK_CUDA_ALWAYS(
|
||||||
0, cudaMemcpyHostToDevice));
|
cudaMemcpyToSymbol(globalGrid, &grid, sizeof(grid), 0, cudaMemcpyHostToDevice));
|
||||||
return AC_SUCCESS;
|
return AC_SUCCESS;
|
||||||
}
|
}
|
||||||
|
@@ -27,12 +27,14 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
#include "astaroth.h"
|
#include "astaroth.h"
|
||||||
|
|
||||||
|
// clang-format off
|
||||||
typedef enum {
|
typedef enum {
|
||||||
STREAM_PRIMARY,
|
STREAM_PRIMARY,
|
||||||
STREAM_SECONDARY,
|
STREAM_SECONDARY,
|
||||||
NUM_STREAM_TYPES,
|
NUM_STREAM_TYPES,
|
||||||
STREAM_ALL
|
STREAM_ALL
|
||||||
} StreamType;
|
} StreamType;
|
||||||
|
// clang-format on
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
int3 m;
|
int3 m;
|
||||||
@@ -52,20 +54,17 @@ AcResult createDevice(const int id, const AcMeshInfo device_config, Device* devi
|
|||||||
AcResult destroyDevice(Device device);
|
AcResult destroyDevice(Device device);
|
||||||
|
|
||||||
/** */
|
/** */
|
||||||
AcResult boundcondStep(const Device device, const StreamType stream_type,
|
AcResult boundcondStep(const Device device, const StreamType stream_type, const int3& start,
|
||||||
const int3& start, const int3& end);
|
const int3& end);
|
||||||
|
|
||||||
/** */
|
/** */
|
||||||
AcResult reduceScal(const Device device, const StreamType stream_type, const ReductionType rtype,
|
AcResult reduceScal(const Device device, const StreamType stream_type, const ReductionType rtype,
|
||||||
const VertexBufferHandle vtxbuf_handle, AcReal* result);
|
const VertexBufferHandle vtxbuf_handle, AcReal* result);
|
||||||
|
|
||||||
/** */
|
/** */
|
||||||
AcResult reduceVec(const Device device, const StreamType stream_type,
|
AcResult reduceVec(const Device device, const StreamType stream_type, const ReductionType rtype,
|
||||||
const ReductionType rtype,
|
const VertexBufferHandle vec0, const VertexBufferHandle vec1,
|
||||||
const VertexBufferHandle vec0,
|
const VertexBufferHandle vec2, AcReal* result);
|
||||||
const VertexBufferHandle vec1,
|
|
||||||
const VertexBufferHandle vec2,
|
|
||||||
AcReal* result);
|
|
||||||
|
|
||||||
/** */
|
/** */
|
||||||
AcResult rkStep(const Device device, const StreamType stream_type, const int step_number,
|
AcResult rkStep(const Device device, const StreamType stream_type, const int step_number,
|
||||||
@@ -81,9 +80,8 @@ AcResult copyMeshToDevice(const Device device, const StreamType stream_type,
|
|||||||
const int num_vertices);
|
const int num_vertices);
|
||||||
|
|
||||||
/** */
|
/** */
|
||||||
AcResult copyMeshToHost(const Device device, const StreamType stream_type,
|
AcResult copyMeshToHost(const Device device, const StreamType stream_type, const int3& src,
|
||||||
const int3& src, const int3& dst, const int num_vertices,
|
const int3& dst, const int num_vertices, AcMesh* host_mesh);
|
||||||
AcMesh* host_mesh);
|
|
||||||
|
|
||||||
/** */
|
/** */
|
||||||
AcResult copyMeshDeviceToDevice(const Device src, const StreamType stream_type, const int3& src_idx,
|
AcResult copyMeshDeviceToDevice(const Device src, const StreamType stream_type, const int3& src_idx,
|
||||||
|
@@ -24,7 +24,7 @@
|
|||||||
* Detailed info.
|
* Detailed info.
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
__global__ void
|
__global__ void
|
||||||
kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vtxbuf)
|
kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vtxbuf)
|
||||||
@@ -38,7 +38,7 @@ kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vtxbuf)
|
|||||||
if (i_dst >= end.x || j_dst >= end.y || k_dst >= end.z)
|
if (i_dst >= end.x || j_dst >= end.y || k_dst >= end.z)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
//if (i_dst >= DCONST_INT(AC_mx) || j_dst >= DCONST_INT(AC_my) || k_dst >= DCONST_INT(AC_mz))
|
// if (i_dst >= DCONST_INT(AC_mx) || j_dst >= DCONST_INT(AC_my) || k_dst >= DCONST_INT(AC_mz))
|
||||||
// return;
|
// return;
|
||||||
|
|
||||||
// If destination index is inside the computational domain, return since
|
// If destination index is inside the computational domain, return since
|
||||||
@@ -77,7 +77,7 @@ kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vtxbuf)
|
|||||||
void
|
void
|
||||||
periodic_boundconds(const cudaStream_t stream, const int3& start, const int3& end, AcReal* vtxbuf)
|
periodic_boundconds(const cudaStream_t stream, const int3& start, const int3& end, AcReal* vtxbuf)
|
||||||
{
|
{
|
||||||
const dim3 tpb(8,2,8);
|
const dim3 tpb(8, 2, 8);
|
||||||
const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x),
|
const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x),
|
||||||
(unsigned int)ceil((end.y - start.y) / (float)tpb.y),
|
(unsigned int)ceil((end.y - start.y) / (float)tpb.y),
|
||||||
(unsigned int)ceil((end.z - start.z) / (float)tpb.z));
|
(unsigned int)ceil((end.z - start.z) / (float)tpb.z));
|
||||||
@@ -89,7 +89,6 @@ periodic_boundconds(const cudaStream_t stream, const int3& start, const int3& en
|
|||||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
|
|
||||||
|
|
||||||
static __device__ __forceinline__ int
|
static __device__ __forceinline__ int
|
||||||
IDX(const int i)
|
IDX(const int i)
|
||||||
{
|
{
|
||||||
@@ -120,7 +119,6 @@ create_rotz(const AcReal radians)
|
|||||||
return mat;
|
return mat;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
#if AC_DOUBLE_PRECISION == 0
|
#if AC_DOUBLE_PRECISION == 0
|
||||||
#define sin __sinf
|
#define sin __sinf
|
||||||
#define cos __cosf
|
#define cos __cosf
|
||||||
@@ -128,7 +126,6 @@ create_rotz(const AcReal radians)
|
|||||||
#define rsqrt rsqrtf // hardware reciprocal sqrt
|
#define rsqrt rsqrtf // hardware reciprocal sqrt
|
||||||
#endif // AC_DOUBLE_PRECISION == 0
|
#endif // AC_DOUBLE_PRECISION == 0
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
typedef struct {
|
typedef struct {
|
||||||
int i, j, k;
|
int i, j, k;
|
||||||
@@ -155,11 +152,10 @@ first_derivative(const AcReal* __restrict__ pencil, const AcReal inv_ds)
|
|||||||
#elif STENCIL_ORDER == 6
|
#elif STENCIL_ORDER == 6
|
||||||
const AcReal coefficients[] = {0, 3.0 / 4.0, -3.0 / 20.0, 1.0 / 60.0};
|
const AcReal coefficients[] = {0, 3.0 / 4.0, -3.0 / 20.0, 1.0 / 60.0};
|
||||||
#elif STENCIL_ORDER == 8
|
#elif STENCIL_ORDER == 8
|
||||||
const AcReal coefficients[] = {0, 4.0 / 5.0, -1.0 / 5.0, 4.0 / 105.0,
|
const AcReal coefficients[] = {0, 4.0 / 5.0, -1.0 / 5.0, 4.0 / 105.0, -1.0 / 280.0};
|
||||||
-1.0 / 280.0};
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define MID (STENCIL_ORDER / 2)
|
#define MID (STENCIL_ORDER / 2)
|
||||||
AcReal res = 0;
|
AcReal res = 0;
|
||||||
|
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
@@ -175,16 +171,14 @@ second_derivative(const AcReal* __restrict__ pencil, const AcReal inv_ds)
|
|||||||
#if STENCIL_ORDER == 2
|
#if STENCIL_ORDER == 2
|
||||||
const AcReal coefficients[] = {-2., 1.};
|
const AcReal coefficients[] = {-2., 1.};
|
||||||
#elif STENCIL_ORDER == 4
|
#elif STENCIL_ORDER == 4
|
||||||
const AcReal coefficients[] = {-5.0/2.0, 4.0/3.0, -1.0/12.0};
|
const AcReal coefficients[] = {-5.0 / 2.0, 4.0 / 3.0, -1.0 / 12.0};
|
||||||
#elif STENCIL_ORDER == 6
|
#elif STENCIL_ORDER == 6
|
||||||
const AcReal coefficients[] = {-49.0 / 18.0, 3.0 / 2.0, -3.0 / 20.0,
|
const AcReal coefficients[] = {-49.0 / 18.0, 3.0 / 2.0, -3.0 / 20.0, 1.0 / 90.0};
|
||||||
1.0 / 90.0};
|
|
||||||
#elif STENCIL_ORDER == 8
|
#elif STENCIL_ORDER == 8
|
||||||
const AcReal coefficients[] = {-205.0 / 72.0, 8.0 / 5.0, -1.0 / 5.0,
|
const AcReal coefficients[] = {-205.0 / 72.0, 8.0 / 5.0, -1.0 / 5.0, 8.0 / 315.0, -1.0 / 560.0};
|
||||||
8.0 / 315.0, -1.0 / 560.0};
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define MID (STENCIL_ORDER / 2)
|
#define MID (STENCIL_ORDER / 2)
|
||||||
AcReal res = coefficients[0] * pencil[MID];
|
AcReal res = coefficients[0] * pencil[MID];
|
||||||
|
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
@@ -196,31 +190,29 @@ second_derivative(const AcReal* __restrict__ pencil, const AcReal inv_ds)
|
|||||||
|
|
||||||
/** inv_ds: inverted mesh spacing f.ex. 1. / mesh.int_params[AC_dsx] */
|
/** inv_ds: inverted mesh spacing f.ex. 1. / mesh.int_params[AC_dsx] */
|
||||||
static __device__ __forceinline__ AcReal
|
static __device__ __forceinline__ AcReal
|
||||||
cross_derivative(const AcReal* __restrict__ pencil_a,
|
cross_derivative(const AcReal* __restrict__ pencil_a, const AcReal* __restrict__ pencil_b,
|
||||||
const AcReal* __restrict__ pencil_b, const AcReal inv_ds_a,
|
const AcReal inv_ds_a, const AcReal inv_ds_b)
|
||||||
const AcReal inv_ds_b)
|
|
||||||
{
|
{
|
||||||
#if STENCIL_ORDER == 2
|
#if STENCIL_ORDER == 2
|
||||||
const AcReal coefficients[] = {0, 1.0 / 4.0};
|
const AcReal coefficients[] = {0, 1.0 / 4.0};
|
||||||
#elif STENCIL_ORDER == 4
|
#elif STENCIL_ORDER == 4
|
||||||
const AcReal coefficients[] = {0, 1.0 / 32.0, 1.0 / 64.0}; // TODO correct coefficients, these are just placeholders
|
const AcReal coefficients[] = {
|
||||||
|
0, 1.0 / 32.0, 1.0 / 64.0}; // TODO correct coefficients, these are just placeholders
|
||||||
#elif STENCIL_ORDER == 6
|
#elif STENCIL_ORDER == 6
|
||||||
const AcReal fac = (1. / 720.);
|
const AcReal fac = (1. / 720.);
|
||||||
const AcReal coefficients[] = {0.0 * fac, 270.0 * fac, -27.0 * fac,
|
const AcReal coefficients[] = {0.0 * fac, 270.0 * fac, -27.0 * fac, 2.0 * fac};
|
||||||
2.0 * fac};
|
|
||||||
#elif STENCIL_ORDER == 8
|
#elif STENCIL_ORDER == 8
|
||||||
const AcReal fac = (1. / 20160.);
|
const AcReal fac = (1. / 20160.);
|
||||||
const AcReal coefficients[] = {0.0 * fac, 8064. * fac, -1008. * fac,
|
const AcReal coefficients[] = {0.0 * fac, 8064. * fac, -1008. * fac, 128. * fac, -9. * fac};
|
||||||
128. * fac, -9. * fac};
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define MID (STENCIL_ORDER / 2)
|
#define MID (STENCIL_ORDER / 2)
|
||||||
AcReal res = AcReal(0.);
|
AcReal res = AcReal(0.);
|
||||||
|
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int i = 1; i <= MID; ++i) {
|
for (int i = 1; i <= MID; ++i) {
|
||||||
res += coefficients[i] * (pencil_a[MID + i] + pencil_a[MID - i] -
|
res += coefficients[i] *
|
||||||
pencil_b[MID + i] - pencil_b[MID - i]);
|
(pencil_a[MID + i] + pencil_a[MID - i] - pencil_b[MID + i] - pencil_b[MID - i]);
|
||||||
}
|
}
|
||||||
return res * inv_ds_a * inv_ds_b;
|
return res * inv_ds_a * inv_ds_b;
|
||||||
}
|
}
|
||||||
@@ -231,7 +223,8 @@ derx(const int3 vertexIdx, const AcReal* __restrict__ arr)
|
|||||||
AcReal pencil[STENCIL_ORDER + 1];
|
AcReal pencil[STENCIL_ORDER + 1];
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
|
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
|
||||||
pencil[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y, vertexIdx.z)];
|
pencil[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y,
|
||||||
|
vertexIdx.z)];
|
||||||
|
|
||||||
return first_derivative(pencil, DCONST_REAL(AC_inv_dsx));
|
return first_derivative(pencil, DCONST_REAL(AC_inv_dsx));
|
||||||
}
|
}
|
||||||
@@ -242,7 +235,8 @@ derxx(const int3 vertexIdx, const AcReal* __restrict__ arr)
|
|||||||
AcReal pencil[STENCIL_ORDER + 1];
|
AcReal pencil[STENCIL_ORDER + 1];
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
|
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
|
||||||
pencil[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y, vertexIdx.z)];
|
pencil[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y,
|
||||||
|
vertexIdx.z)];
|
||||||
|
|
||||||
return second_derivative(pencil, DCONST_REAL(AC_inv_dsx));
|
return second_derivative(pencil, DCONST_REAL(AC_inv_dsx));
|
||||||
}
|
}
|
||||||
@@ -262,8 +256,7 @@ derxy(const int3 vertexIdx, const AcReal* __restrict__ arr)
|
|||||||
pencil_b[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2,
|
pencil_b[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2,
|
||||||
vertexIdx.y + STENCIL_ORDER / 2 - offset, vertexIdx.z)];
|
vertexIdx.y + STENCIL_ORDER / 2 - offset, vertexIdx.z)];
|
||||||
|
|
||||||
return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsx),
|
return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsx), DCONST_REAL(AC_inv_dsy));
|
||||||
DCONST_REAL(AC_inv_dsy));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static __device__ __forceinline__ AcReal
|
static __device__ __forceinline__ AcReal
|
||||||
@@ -281,8 +274,7 @@ derxz(const int3 vertexIdx, const AcReal* __restrict__ arr)
|
|||||||
pencil_b[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y,
|
pencil_b[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y,
|
||||||
vertexIdx.z + STENCIL_ORDER / 2 - offset)];
|
vertexIdx.z + STENCIL_ORDER / 2 - offset)];
|
||||||
|
|
||||||
return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsx),
|
return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsx), DCONST_REAL(AC_inv_dsz));
|
||||||
DCONST_REAL(AC_inv_dsz));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static __device__ __forceinline__ AcReal
|
static __device__ __forceinline__ AcReal
|
||||||
@@ -291,7 +283,8 @@ dery(const int3 vertexIdx, const AcReal* __restrict__ arr)
|
|||||||
AcReal pencil[STENCIL_ORDER + 1];
|
AcReal pencil[STENCIL_ORDER + 1];
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
|
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
|
||||||
pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2, vertexIdx.z)];
|
pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2,
|
||||||
|
vertexIdx.z)];
|
||||||
|
|
||||||
return first_derivative(pencil, DCONST_REAL(AC_inv_dsy));
|
return first_derivative(pencil, DCONST_REAL(AC_inv_dsy));
|
||||||
}
|
}
|
||||||
@@ -302,7 +295,8 @@ deryy(const int3 vertexIdx, const AcReal* __restrict__ arr)
|
|||||||
AcReal pencil[STENCIL_ORDER + 1];
|
AcReal pencil[STENCIL_ORDER + 1];
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
|
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
|
||||||
pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2, vertexIdx.z)];
|
pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2,
|
||||||
|
vertexIdx.z)];
|
||||||
|
|
||||||
return second_derivative(pencil, DCONST_REAL(AC_inv_dsy));
|
return second_derivative(pencil, DCONST_REAL(AC_inv_dsy));
|
||||||
}
|
}
|
||||||
@@ -322,8 +316,7 @@ deryz(const int3 vertexIdx, const AcReal* __restrict__ arr)
|
|||||||
pencil_b[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2,
|
pencil_b[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2,
|
||||||
vertexIdx.z + STENCIL_ORDER / 2 - offset)];
|
vertexIdx.z + STENCIL_ORDER / 2 - offset)];
|
||||||
|
|
||||||
return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsy),
|
return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsy), DCONST_REAL(AC_inv_dsz));
|
||||||
DCONST_REAL(AC_inv_dsz));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static __device__ __forceinline__ AcReal
|
static __device__ __forceinline__ AcReal
|
||||||
@@ -332,7 +325,8 @@ derz(const int3 vertexIdx, const AcReal* __restrict__ arr)
|
|||||||
AcReal pencil[STENCIL_ORDER + 1];
|
AcReal pencil[STENCIL_ORDER + 1];
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
|
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
|
||||||
pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z + offset - STENCIL_ORDER / 2)];
|
pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y,
|
||||||
|
vertexIdx.z + offset - STENCIL_ORDER / 2)];
|
||||||
|
|
||||||
return first_derivative(pencil, DCONST_REAL(AC_inv_dsz));
|
return first_derivative(pencil, DCONST_REAL(AC_inv_dsz));
|
||||||
}
|
}
|
||||||
@@ -343,7 +337,8 @@ derzz(const int3 vertexIdx, const AcReal* __restrict__ arr)
|
|||||||
AcReal pencil[STENCIL_ORDER + 1];
|
AcReal pencil[STENCIL_ORDER + 1];
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
|
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
|
||||||
pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z + offset - STENCIL_ORDER / 2)];
|
pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y,
|
||||||
|
vertexIdx.z + offset - STENCIL_ORDER / 2)];
|
||||||
|
|
||||||
return second_derivative(pencil, DCONST_REAL(AC_inv_dsz));
|
return second_derivative(pencil, DCONST_REAL(AC_inv_dsz));
|
||||||
}
|
}
|
||||||
@@ -401,8 +396,7 @@ operator-(const AcReal3& a)
|
|||||||
return (AcReal3){-a.x, -a.y, -a.z};
|
return (AcReal3){-a.x, -a.y, -a.z};
|
||||||
}
|
}
|
||||||
|
|
||||||
static __host__ __device__ __forceinline__ AcReal3
|
static __host__ __device__ __forceinline__ AcReal3 operator*(const AcReal a, const AcReal3& b)
|
||||||
operator*(const AcReal a, const AcReal3& b)
|
|
||||||
{
|
{
|
||||||
return (AcReal3){a * b.x, a * b.y, a * b.z};
|
return (AcReal3){a * b.x, a * b.y, a * b.z};
|
||||||
}
|
}
|
||||||
@@ -443,7 +437,6 @@ is_valid(const AcReal3& a)
|
|||||||
return is_valid(a.x) && is_valid(a.y) && is_valid(a.z);
|
return is_valid(a.x) && is_valid(a.y) && is_valid(a.z);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* =============================================================================
|
* =============================================================================
|
||||||
* Level 1 (Stencil Processing Stage)
|
* Level 1 (Stencil Processing Stage)
|
||||||
@@ -476,8 +469,7 @@ laplace_vec(const AcReal3Data& vec)
|
|||||||
static __device__ __forceinline__ AcReal3
|
static __device__ __forceinline__ AcReal3
|
||||||
curl(const AcReal3Data& vec)
|
curl(const AcReal3Data& vec)
|
||||||
{
|
{
|
||||||
return (AcReal3){gradient(vec.z).y - gradient(vec.y).z,
|
return (AcReal3){gradient(vec.z).y - gradient(vec.y).z, gradient(vec.x).z - gradient(vec.z).x,
|
||||||
gradient(vec.x).z - gradient(vec.z).x,
|
|
||||||
gradient(vec.y).x - gradient(vec.x).y};
|
gradient(vec.y).x - gradient(vec.x).y};
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -520,7 +512,7 @@ contract(const AcMatrix& mat)
|
|||||||
{
|
{
|
||||||
AcReal res = 0;
|
AcReal res = 0;
|
||||||
|
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int i = 0; i < 3; ++i)
|
for (int i = 0; i < 3; ++i)
|
||||||
res += dot(mat.row[i], mat.row[i]);
|
res += dot(mat.row[i], mat.row[i]);
|
||||||
|
|
||||||
@@ -558,10 +550,11 @@ __constant__ AcReal forcing_phi;
|
|||||||
static __device__ __forceinline__ AcReal3
|
static __device__ __forceinline__ AcReal3
|
||||||
forcing(const int i, const int j, const int k)
|
forcing(const int i, const int j, const int k)
|
||||||
{
|
{
|
||||||
#define DOMAIN_SIZE_X (DCONST_INT(AC_nx) * DCONST_REAL(AC_dsx))
|
#define DOMAIN_SIZE_X (DCONST_INT(AC_nx) * DCONST_REAL(AC_dsx))
|
||||||
#define DOMAIN_SIZE_Y (DCONST_INT(AC_ny) * DCONST_REAL(AC_dsy))
|
#define DOMAIN_SIZE_Y (DCONST_INT(AC_ny) * DCONST_REAL(AC_dsy))
|
||||||
#define DOMAIN_SIZE_Z (DCONST_INT(AC_nz) * DCONST_REAL(AC_dsz))
|
#define DOMAIN_SIZE_Z (DCONST_INT(AC_nz) * DCONST_REAL(AC_dsz))
|
||||||
const AcReal3 k_vec = (AcReal3){(i - DCONST_INT(AC_nx_min)) * DCONST_REAL(AC_dsx) - AcReal(.5) * DOMAIN_SIZE_X,
|
const AcReal3 k_vec = (AcReal3){
|
||||||
|
(i - DCONST_INT(AC_nx_min)) * DCONST_REAL(AC_dsx) - AcReal(.5) * DOMAIN_SIZE_X,
|
||||||
(j - DCONST_INT(AC_ny_min)) * DCONST_REAL(AC_dsy) - AcReal(.5) * DOMAIN_SIZE_Y,
|
(j - DCONST_INT(AC_ny_min)) * DCONST_REAL(AC_dsy) - AcReal(.5) * DOMAIN_SIZE_Y,
|
||||||
(k - DCONST_INT(AC_nz_min)) * DCONST_REAL(AC_dsz) - AcReal(.5) * DOMAIN_SIZE_Z};
|
(k - DCONST_INT(AC_nz_min)) * DCONST_REAL(AC_dsz) - AcReal(.5) * DOMAIN_SIZE_Z};
|
||||||
AcReal inv_len = reciprocal_len(k_vec);
|
AcReal inv_len = reciprocal_len(k_vec);
|
||||||
@@ -571,31 +564,27 @@ forcing(const int i, const int j, const int k)
|
|||||||
inv_len = 2;
|
inv_len = 2;
|
||||||
const AcReal k_dot_x = dot(k_vec, forcing_vec);
|
const AcReal k_dot_x = dot(k_vec, forcing_vec);
|
||||||
|
|
||||||
const AcReal waves = cos(k_dot_x)*cos(forcing_phi) - sin(k_dot_x) * sin(forcing_phi);
|
const AcReal waves = cos(k_dot_x) * cos(forcing_phi) - sin(k_dot_x) * sin(forcing_phi);
|
||||||
|
|
||||||
return inv_len * inv_len * waves * forcing_vec;
|
return inv_len * inv_len * waves * forcing_vec;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Note: LNT0 and LNRHO0 must be set very carefully: if the magnitude is different that other values
|
||||||
// Note: LNT0 and LNRHO0 must be set very carefully: if the magnitude is different that other values in the mesh, then we will inherently lose precision
|
// in the mesh, then we will inherently lose precision
|
||||||
#define LNT0 (AcReal(0.0))
|
#define LNT0 (AcReal(0.0))
|
||||||
#define LNRHO0 (AcReal(0.0))
|
#define LNRHO0 (AcReal(0.0))
|
||||||
|
|
||||||
#define H_CONST (AcReal(0.0))
|
#define H_CONST (AcReal(0.0))
|
||||||
#define C_CONST (AcReal(0.0))
|
#define C_CONST (AcReal(0.0))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
template <int step_number>
|
template <int step_number>
|
||||||
static __device__ __forceinline__ AcReal
|
static __device__ __forceinline__ AcReal
|
||||||
rk3_integrate(const AcReal state_previous, const AcReal state_current,
|
rk3_integrate(const AcReal state_previous, const AcReal state_current, const AcReal rate_of_change,
|
||||||
const AcReal rate_of_change, const AcReal dt)
|
const AcReal dt)
|
||||||
{
|
{
|
||||||
// Williamson (1980)
|
// Williamson (1980)
|
||||||
const AcReal alpha[] = {0, AcReal(.0), AcReal(-5. / 9.), AcReal(-153. / 128.)};
|
const AcReal alpha[] = {0, AcReal(.0), AcReal(-5. / 9.), AcReal(-153. / 128.)};
|
||||||
const AcReal beta[] = {0, AcReal(1. / 3.), AcReal(15. / 16.),
|
const AcReal beta[] = {0, AcReal(1. / 3.), AcReal(15. / 16.), AcReal(8. / 15.)};
|
||||||
AcReal(8. / 15.)};
|
|
||||||
|
|
||||||
|
|
||||||
// Note the indexing: +1 to avoid an unnecessary warning about "out-of-bounds"
|
// Note the indexing: +1 to avoid an unnecessary warning about "out-of-bounds"
|
||||||
// access (when accessing beta[step_number-1] even when step_number >= 1)
|
// access (when accessing beta[step_number-1] even when step_number >= 1)
|
||||||
@@ -605,8 +594,7 @@ rk3_integrate(const AcReal state_previous, const AcReal state_current,
|
|||||||
case 1: // Fallthrough
|
case 1: // Fallthrough
|
||||||
case 2:
|
case 2:
|
||||||
return state_current +
|
return state_current +
|
||||||
beta[step_number + 1] *
|
beta[step_number + 1] * (alpha[step_number + 1] * (AcReal(1.) / beta[step_number]) *
|
||||||
(alpha[step_number + 1] * (AcReal(1.) / beta[step_number]) *
|
|
||||||
(state_current - state_previous) +
|
(state_current - state_previous) +
|
||||||
rate_of_change * dt);
|
rate_of_change * dt);
|
||||||
default:
|
default:
|
||||||
@@ -646,13 +634,14 @@ static __device__ __forceinline__ AcReal3
|
|||||||
rk3_integrate(const AcReal3 state_previous, const AcReal3 state_current,
|
rk3_integrate(const AcReal3 state_previous, const AcReal3 state_current,
|
||||||
const AcReal3 rate_of_change, const AcReal dt)
|
const AcReal3 rate_of_change, const AcReal dt)
|
||||||
{
|
{
|
||||||
return (AcReal3) { rk3_integrate<step_number>(state_previous.x, state_current.x, rate_of_change.x, dt),
|
return (AcReal3){
|
||||||
|
rk3_integrate<step_number>(state_previous.x, state_current.x, rate_of_change.x, dt),
|
||||||
rk3_integrate<step_number>(state_previous.y, state_current.y, rate_of_change.y, dt),
|
rk3_integrate<step_number>(state_previous.y, state_current.y, rate_of_change.y, dt),
|
||||||
rk3_integrate<step_number>(state_previous.z, state_current.z, rate_of_change.z, dt)};
|
rk3_integrate<step_number>(state_previous.z, state_current.z, rate_of_change.z, dt)};
|
||||||
}
|
}
|
||||||
|
|
||||||
#define rk3(state_previous, state_current, rate_of_change, dt)\
|
#define rk3(state_previous, state_current, rate_of_change, dt) \
|
||||||
rk3_integrate<step_number>(state_previous, value(state_current), rate_of_change, dt)
|
rk3_integrate<step_number>(state_previous, value(state_current), rate_of_change, dt)
|
||||||
|
|
||||||
/*
|
/*
|
||||||
template <int step_number>
|
template <int step_number>
|
||||||
@@ -708,9 +697,8 @@ read_out(const int idx, AcReal* __restrict__ field[], const int handle)
|
|||||||
static __device__ AcReal3
|
static __device__ AcReal3
|
||||||
read_out(const int idx, AcReal* __restrict__ field[], const int3 handle)
|
read_out(const int idx, AcReal* __restrict__ field[], const int3 handle)
|
||||||
{
|
{
|
||||||
return (AcReal3) { read_out(idx, field, handle.x),
|
return (AcReal3){read_out(idx, field, handle.x), read_out(idx, field, handle.y),
|
||||||
read_out(idx, field, handle.y),
|
read_out(idx, field, handle.z)};
|
||||||
read_out(idx, field, handle.z) };
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#define WRITE_OUT(handle, value) (write(buffer.out, handle, idx, value))
|
#define WRITE_OUT(handle, value) (write(buffer.out, handle, idx, value))
|
||||||
@@ -718,28 +706,27 @@ read_out(const int idx, AcReal* __restrict__ field[], const int3 handle)
|
|||||||
#define READ_OUT(handle) (read_out(idx, buffer.out, handle))
|
#define READ_OUT(handle) (read_out(idx, buffer.out, handle))
|
||||||
|
|
||||||
// also write for clarity here also, not for the DSL
|
// also write for clarity here also, not for the DSL
|
||||||
//#define WRITE(HANDLE) (write(idx, ...)) s.t. we don't have to insert boilerplat in the mid of the function
|
//#define WRITE(HANDLE) (write(idx, ...)) s.t. we don't have to insert boilerplat in the mid of the
|
||||||
|
// function
|
||||||
|
|
||||||
#define GEN_KERNEL_PARAM_BOILERPLATE \
|
#define GEN_KERNEL_PARAM_BOILERPLATE const int3 start, const int3 end, VertexBufferArray buffer
|
||||||
const int3 start, const int3 end, VertexBufferArray buffer
|
|
||||||
|
|
||||||
#define GEN_KERNEL_BUILTIN_VARIABLES_BOILERPLATE() \
|
#define GEN_KERNEL_BUILTIN_VARIABLES_BOILERPLATE() \
|
||||||
const int3 vertexIdx = (int3){threadIdx.x + blockIdx.x * blockDim.x + start.x,\
|
const int3 vertexIdx = (int3){threadIdx.x + blockIdx.x * blockDim.x + start.x, \
|
||||||
threadIdx.y + blockIdx.y * blockDim.y + start.y,\
|
threadIdx.y + blockIdx.y * blockDim.y + start.y, \
|
||||||
threadIdx.z + blockIdx.z * blockDim.z + start.z};\
|
threadIdx.z + blockIdx.z * blockDim.z + start.z}; \
|
||||||
const int3 globalVertexIdx = (int3){d_multigpu_offset.x + vertexIdx.x, \
|
const int3 globalVertexIdx = (int3){d_multigpu_offset.x + vertexIdx.x, \
|
||||||
d_multigpu_offset.y + vertexIdx.y, \
|
d_multigpu_offset.y + vertexIdx.y, \
|
||||||
d_multigpu_offset.z + vertexIdx.z}; \
|
d_multigpu_offset.z + vertexIdx.z}; \
|
||||||
if (vertexIdx.x >= end.x || vertexIdx.y >= end.y || vertexIdx.z >= end.z)\
|
if (vertexIdx.x >= end.x || vertexIdx.y >= end.y || vertexIdx.z >= end.z) \
|
||||||
return;\
|
return; \
|
||||||
\
|
\
|
||||||
\
|
assert(vertexIdx.x < DCONST_INT(AC_nx_max) && vertexIdx.y < DCONST_INT(AC_ny_max) && \
|
||||||
assert(vertexIdx.x < DCONST_INT(AC_nx_max) && vertexIdx.y < DCONST_INT(AC_ny_max) &&\
|
vertexIdx.z < DCONST_INT(AC_nz_max)); \
|
||||||
vertexIdx.z < DCONST_INT(AC_nz_max));\
|
\
|
||||||
\
|
assert(vertexIdx.x >= DCONST_INT(AC_nx_min) && vertexIdx.y >= DCONST_INT(AC_ny_min) && \
|
||||||
assert(vertexIdx.x >= DCONST_INT(AC_nx_min) && vertexIdx.y >= DCONST_INT(AC_ny_min) &&\
|
vertexIdx.z >= DCONST_INT(AC_nz_min)); \
|
||||||
vertexIdx.z >= DCONST_INT(AC_nz_min));\
|
\
|
||||||
\
|
|
||||||
const int idx = IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z);
|
const int idx = IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z);
|
||||||
|
|
||||||
#include "stencil_process.cuh"
|
#include "stencil_process.cuh"
|
||||||
@@ -757,34 +744,32 @@ randf(void)
|
|||||||
}
|
}
|
||||||
|
|
||||||
AcResult
|
AcResult
|
||||||
rk3_step_async(const cudaStream_t stream, const int& step_number, const int3& start, const int3& end,
|
rk3_step_async(const cudaStream_t stream, const int& step_number, const int3& start,
|
||||||
const AcReal dt, VertexBufferArray* buffer)
|
const int3& end, const AcReal dt, VertexBufferArray* buffer)
|
||||||
{
|
{
|
||||||
const dim3 tpb(32, 1, 4);
|
const dim3 tpb(32, 1, 4);
|
||||||
/////////////////// Forcing
|
/////////////////// Forcing
|
||||||
#if LFORCING
|
#if LFORCING
|
||||||
const AcReal ff_scale = AcReal(.2);
|
const AcReal ff_scale = AcReal(.2);
|
||||||
static AcReal3 ff = ff_scale * (AcReal3){1, 0, 0};
|
static AcReal3 ff = ff_scale * (AcReal3){1, 0, 0};
|
||||||
const AcReal radians = randf() * AcReal(2*M_PI) / 360 / 8;
|
const AcReal radians = randf() * AcReal(2 * M_PI) / 360 / 8;
|
||||||
const AcMatrix rotz = create_rotz(radians);
|
const AcMatrix rotz = create_rotz(radians);
|
||||||
ff = mul(rotz, ff);
|
ff = mul(rotz, ff);
|
||||||
cudaMemcpyToSymbolAsync(forcing_vec, &ff, sizeof(ff), 0, cudaMemcpyHostToDevice, stream);
|
cudaMemcpyToSymbolAsync(forcing_vec, &ff, sizeof(ff), 0, cudaMemcpyHostToDevice, stream);
|
||||||
|
|
||||||
const AcReal ff_phi = AcReal(M_PI);//AcReal(2 * M_PI) * randf();
|
const AcReal ff_phi = AcReal(M_PI); // AcReal(2 * M_PI) * randf();
|
||||||
cudaMemcpyToSymbolAsync(forcing_phi, &ff_phi, sizeof(ff_phi), 0, cudaMemcpyHostToDevice, stream);
|
cudaMemcpyToSymbolAsync(forcing_phi, &ff_phi, sizeof(ff_phi), 0, cudaMemcpyHostToDevice,
|
||||||
#endif // LFORCING
|
stream);
|
||||||
|
#endif // LFORCING
|
||||||
//////////////////////////
|
//////////////////////////
|
||||||
|
|
||||||
const int nx = end.x - start.x;
|
const int nx = end.x - start.x;
|
||||||
const int ny = end.y - start.y;
|
const int ny = end.y - start.y;
|
||||||
const int nz = end.z - start.z;
|
const int nz = end.z - start.z;
|
||||||
|
|
||||||
const dim3 bpg(
|
const dim3 bpg((unsigned int)ceil(nx / AcReal(tpb.x)), (unsigned int)ceil(ny / AcReal(tpb.y)),
|
||||||
(unsigned int)ceil(nx / AcReal(tpb.x)),
|
|
||||||
(unsigned int)ceil(ny / AcReal(tpb.y)),
|
|
||||||
(unsigned int)ceil(nz / AcReal(tpb.z)));
|
(unsigned int)ceil(nz / AcReal(tpb.z)));
|
||||||
|
|
||||||
|
|
||||||
if (step_number == 0)
|
if (step_number == 0)
|
||||||
solve<0><<<bpg, tpb, 0, stream>>>(start, end, *buffer, dt);
|
solve<0><<<bpg, tpb, 0, stream>>>(start, end, *buffer, dt);
|
||||||
else if (step_number == 1)
|
else if (step_number == 1)
|
||||||
@@ -796,7 +781,6 @@ rk3_step_async(const cudaStream_t stream, const int& step_number, const int3& st
|
|||||||
return AC_SUCCESS;
|
return AC_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
////////////////REDUCE///////////////////////////
|
////////////////REDUCE///////////////////////////
|
||||||
#include "src/core/math_utils.h" // is_power_of_two
|
#include "src/core/math_utils.h" // is_power_of_two
|
||||||
|
|
||||||
@@ -848,22 +832,19 @@ template <FilterFunc filter>
|
|||||||
__global__ void
|
__global__ void
|
||||||
kernel_filter(const __restrict__ AcReal* src, const int3 start, const int3 end, AcReal* dst)
|
kernel_filter(const __restrict__ AcReal* src, const int3 start, const int3 end, AcReal* dst)
|
||||||
{
|
{
|
||||||
const int3 src_idx = (int3) {
|
const int3 src_idx = (int3){start.x + threadIdx.x + blockIdx.x * blockDim.x,
|
||||||
start.x + threadIdx.x + blockIdx.x * blockDim.x,
|
|
||||||
start.y + threadIdx.y + blockIdx.y * blockDim.y,
|
start.y + threadIdx.y + blockIdx.y * blockDim.y,
|
||||||
start.z + threadIdx.z + blockIdx.z * blockDim.z
|
start.z + threadIdx.z + blockIdx.z * blockDim.z};
|
||||||
};
|
|
||||||
|
|
||||||
const int nx = end.x - start.x;
|
const int nx = end.x - start.x;
|
||||||
const int ny = end.y - start.y;
|
const int ny = end.y - start.y;
|
||||||
const int nz = end.z - start.z; //MV: Added this because it was undefined
|
const int nz = end.z - start.z; // MV: Added this because it was undefined
|
||||||
const int3 dst_idx = (int3) {
|
const int3 dst_idx = (int3){threadIdx.x + blockIdx.x * blockDim.x,
|
||||||
threadIdx.x + blockIdx.x * blockDim.x,
|
|
||||||
threadIdx.y + blockIdx.y * blockDim.y,
|
threadIdx.y + blockIdx.y * blockDim.y,
|
||||||
threadIdx.z + blockIdx.z * blockDim.z
|
threadIdx.z + blockIdx.z * blockDim.z};
|
||||||
};
|
|
||||||
|
|
||||||
assert(src_idx.x < DCONST_INT(AC_nx_max) && src_idx.y < DCONST_INT(AC_ny_max) && src_idx.z < DCONST_INT(AC_nz_max));
|
assert(src_idx.x < DCONST_INT(AC_nx_max) && src_idx.y < DCONST_INT(AC_ny_max) &&
|
||||||
|
src_idx.z < DCONST_INT(AC_nz_max));
|
||||||
assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
|
assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
|
||||||
assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);
|
assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);
|
||||||
|
|
||||||
@@ -872,31 +853,27 @@ kernel_filter(const __restrict__ AcReal* src, const int3 start, const int3 end,
|
|||||||
|
|
||||||
template <FilterFuncVec filter>
|
template <FilterFuncVec filter>
|
||||||
__global__ void
|
__global__ void
|
||||||
kernel_filter_vec(const __restrict__ AcReal* src0,
|
kernel_filter_vec(const __restrict__ AcReal* src0, const __restrict__ AcReal* src1,
|
||||||
const __restrict__ AcReal* src1,
|
const __restrict__ AcReal* src2, const int3 start, const int3 end, AcReal* dst)
|
||||||
const __restrict__ AcReal* src2,
|
|
||||||
const int3 start, const int3 end, AcReal* dst)
|
|
||||||
{
|
{
|
||||||
const int3 src_idx = (int3) {
|
const int3 src_idx = (int3){start.x + threadIdx.x + blockIdx.x * blockDim.x,
|
||||||
start.x + threadIdx.x + blockIdx.x * blockDim.x,
|
|
||||||
start.y + threadIdx.y + blockIdx.y * blockDim.y,
|
start.y + threadIdx.y + blockIdx.y * blockDim.y,
|
||||||
start.z + threadIdx.z + blockIdx.z * blockDim.z
|
start.z + threadIdx.z + blockIdx.z * blockDim.z};
|
||||||
};
|
|
||||||
|
|
||||||
const int nx = end.x - start.x;
|
const int nx = end.x - start.x;
|
||||||
const int ny = end.y - start.y;
|
const int ny = end.y - start.y;
|
||||||
const int nz = end.z - start.z; //MV: Added this because it was undefined
|
const int nz = end.z - start.z; // MV: Added this because it was undefined
|
||||||
const int3 dst_idx = (int3) {
|
const int3 dst_idx = (int3){threadIdx.x + blockIdx.x * blockDim.x,
|
||||||
threadIdx.x + blockIdx.x * blockDim.x,
|
|
||||||
threadIdx.y + blockIdx.y * blockDim.y,
|
threadIdx.y + blockIdx.y * blockDim.y,
|
||||||
threadIdx.z + blockIdx.z * blockDim.z
|
threadIdx.z + blockIdx.z * blockDim.z};
|
||||||
};
|
|
||||||
|
|
||||||
assert(src_idx.x < DCONST_INT(AC_nx_max) && src_idx.y < DCONST_INT(AC_ny_max) && src_idx.z < DCONST_INT(AC_nz_max));
|
assert(src_idx.x < DCONST_INT(AC_nx_max) && src_idx.y < DCONST_INT(AC_ny_max) &&
|
||||||
|
src_idx.z < DCONST_INT(AC_nz_max));
|
||||||
assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
|
assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
|
||||||
assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);
|
assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);
|
||||||
|
|
||||||
dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(src0[IDX(src_idx)], src1[IDX(src_idx)], src2[IDX(src_idx)]);
|
dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(
|
||||||
|
src0[IDX(src_idx)], src1[IDX(src_idx)], src2[IDX(src_idx)]);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <ReduceFunc reduce>
|
template <ReduceFunc reduce>
|
||||||
@@ -908,7 +885,8 @@ kernel_reduce(AcReal* scratchpad, const int num_elems)
|
|||||||
extern __shared__ AcReal smem[];
|
extern __shared__ AcReal smem[];
|
||||||
if (idx < num_elems) {
|
if (idx < num_elems) {
|
||||||
smem[threadIdx.x] = scratchpad[idx];
|
smem[threadIdx.x] = scratchpad[idx];
|
||||||
} else {
|
}
|
||||||
|
else {
|
||||||
smem[threadIdx.x] = NAN;
|
smem[threadIdx.x] = NAN;
|
||||||
}
|
}
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
@@ -930,9 +908,8 @@ kernel_reduce(AcReal* scratchpad, const int num_elems)
|
|||||||
|
|
||||||
template <ReduceFunc reduce>
|
template <ReduceFunc reduce>
|
||||||
__global__ void
|
__global__ void
|
||||||
kernel_reduce_block(const __restrict__ AcReal* scratchpad,
|
kernel_reduce_block(const __restrict__ AcReal* scratchpad, const int num_blocks,
|
||||||
const int num_blocks, const int block_size,
|
const int block_size, AcReal* result)
|
||||||
AcReal* result)
|
|
||||||
{
|
{
|
||||||
const int idx = threadIdx.x + blockIdx.x * blockDim.x;
|
const int idx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||||
if (idx != 0) {
|
if (idx != 0) {
|
||||||
@@ -946,11 +923,9 @@ kernel_reduce_block(const __restrict__ AcReal* scratchpad,
|
|||||||
*result = res;
|
*result = res;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
AcReal
|
AcReal
|
||||||
reduce_scal(const cudaStream_t stream, const ReductionType rtype,
|
reduce_scal(const cudaStream_t stream, const ReductionType rtype, const int3& start,
|
||||||
const int3& start, const int3& end,
|
const int3& end, const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result)
|
||||||
const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result)
|
|
||||||
{
|
{
|
||||||
const unsigned nx = end.x - start.x;
|
const unsigned nx = end.x - start.x;
|
||||||
const unsigned ny = end.y - start.y;
|
const unsigned ny = end.y - start.y;
|
||||||
@@ -958,11 +933,9 @@ reduce_scal(const cudaStream_t stream, const ReductionType rtype,
|
|||||||
const unsigned num_elems = nx * ny * nz;
|
const unsigned num_elems = nx * ny * nz;
|
||||||
|
|
||||||
const dim3 tpb_filter(32, 4, 1);
|
const dim3 tpb_filter(32, 4, 1);
|
||||||
const dim3 bpg_filter(
|
const dim3 bpg_filter((unsigned int)ceil(nx / AcReal(tpb_filter.x)),
|
||||||
(unsigned int)ceil(nx / AcReal(tpb_filter.x)),
|
|
||||||
(unsigned int)ceil(ny / AcReal(tpb_filter.y)),
|
(unsigned int)ceil(ny / AcReal(tpb_filter.y)),
|
||||||
(unsigned int)ceil(nz / AcReal(tpb_filter.z))
|
(unsigned int)ceil(nz / AcReal(tpb_filter.z)));
|
||||||
);
|
|
||||||
|
|
||||||
const int tpb_reduce = 128;
|
const int tpb_reduce = 128;
|
||||||
const int bpg_reduce = num_elems / tpb_reduce;
|
const int bpg_reduce = num_elems / tpb_reduce;
|
||||||
@@ -974,22 +947,38 @@ reduce_scal(const cudaStream_t stream, const ReductionType rtype,
|
|||||||
ERRCHK(nx * ny * nz % 2 == 0);
|
ERRCHK(nx * ny * nz % 2 == 0);
|
||||||
|
|
||||||
if (rtype == RTYPE_MAX) {
|
if (rtype == RTYPE_MAX) {
|
||||||
kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
|
kernel_filter<dvalue>
|
||||||
kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
<<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
|
||||||
kernel_reduce_block<dmax><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(
|
||||||
} else if (rtype == RTYPE_MIN) {
|
scratchpad, num_elems);
|
||||||
kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
|
kernel_reduce_block<dmax>
|
||||||
kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
<<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||||
kernel_reduce_block<dmin><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
}
|
||||||
} else if (rtype == RTYPE_RMS) {
|
else if (rtype == RTYPE_MIN) {
|
||||||
kernel_filter<dsquared><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
|
kernel_filter<dvalue>
|
||||||
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
<<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
|
||||||
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(
|
||||||
} else if (rtype == RTYPE_RMS_EXP) {
|
scratchpad, num_elems);
|
||||||
kernel_filter<dexp_squared><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
|
kernel_reduce_block<dmin>
|
||||||
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
<<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||||
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
}
|
||||||
} else {
|
else if (rtype == RTYPE_RMS) {
|
||||||
|
kernel_filter<dsquared>
|
||||||
|
<<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
|
||||||
|
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(
|
||||||
|
scratchpad, num_elems);
|
||||||
|
kernel_reduce_block<dsum>
|
||||||
|
<<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||||
|
}
|
||||||
|
else if (rtype == RTYPE_RMS_EXP) {
|
||||||
|
kernel_filter<dexp_squared>
|
||||||
|
<<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
|
||||||
|
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(
|
||||||
|
scratchpad, num_elems);
|
||||||
|
kernel_reduce_block<dsum>
|
||||||
|
<<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||||
|
}
|
||||||
|
else {
|
||||||
ERROR("Unrecognized rtype");
|
ERROR("Unrecognized rtype");
|
||||||
}
|
}
|
||||||
AcReal result;
|
AcReal result;
|
||||||
@@ -998,10 +987,9 @@ reduce_scal(const cudaStream_t stream, const ReductionType rtype,
|
|||||||
}
|
}
|
||||||
|
|
||||||
AcReal
|
AcReal
|
||||||
reduce_vec(const cudaStream_t stream, const ReductionType rtype,
|
reduce_vec(const cudaStream_t stream, const ReductionType rtype, const int3& start, const int3& end,
|
||||||
const int3& start, const int3& end,
|
const AcReal* vtxbuf0, const AcReal* vtxbuf1, const AcReal* vtxbuf2, AcReal* scratchpad,
|
||||||
const AcReal* vtxbuf0, const AcReal* vtxbuf1, const AcReal* vtxbuf2,
|
AcReal* reduce_result)
|
||||||
AcReal* scratchpad, AcReal* reduce_result)
|
|
||||||
{
|
{
|
||||||
const unsigned nx = end.x - start.x;
|
const unsigned nx = end.x - start.x;
|
||||||
const unsigned ny = end.y - start.y;
|
const unsigned ny = end.y - start.y;
|
||||||
@@ -1009,11 +997,9 @@ reduce_vec(const cudaStream_t stream, const ReductionType rtype,
|
|||||||
const unsigned num_elems = nx * ny * nz;
|
const unsigned num_elems = nx * ny * nz;
|
||||||
|
|
||||||
const dim3 tpb_filter(32, 4, 1);
|
const dim3 tpb_filter(32, 4, 1);
|
||||||
const dim3 bpg_filter(
|
const dim3 bpg_filter((unsigned int)ceil(nx / AcReal(tpb_filter.x)),
|
||||||
(unsigned int)ceil(nx / AcReal(tpb_filter.x)),
|
|
||||||
(unsigned int)ceil(ny / AcReal(tpb_filter.y)),
|
(unsigned int)ceil(ny / AcReal(tpb_filter.y)),
|
||||||
(unsigned int)ceil(nz / AcReal(tpb_filter.z))
|
(unsigned int)ceil(nz / AcReal(tpb_filter.z)));
|
||||||
);
|
|
||||||
|
|
||||||
const int tpb_reduce = 128;
|
const int tpb_reduce = 128;
|
||||||
const int bpg_reduce = num_elems / tpb_reduce;
|
const int bpg_reduce = num_elems / tpb_reduce;
|
||||||
@@ -1025,22 +1011,38 @@ reduce_vec(const cudaStream_t stream, const ReductionType rtype,
|
|||||||
ERRCHK(nx * ny * nz % 2 == 0);
|
ERRCHK(nx * ny * nz % 2 == 0);
|
||||||
|
|
||||||
if (rtype == RTYPE_MAX) {
|
if (rtype == RTYPE_MAX) {
|
||||||
kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
|
kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(
|
||||||
kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
|
||||||
kernel_reduce_block<dmax><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(
|
||||||
} else if (rtype == RTYPE_MIN) {
|
scratchpad, num_elems);
|
||||||
kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
|
kernel_reduce_block<dmax>
|
||||||
kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
<<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||||
kernel_reduce_block<dmin><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
}
|
||||||
} else if (rtype == RTYPE_RMS) {
|
else if (rtype == RTYPE_MIN) {
|
||||||
kernel_filter_vec<dsquared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
|
kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(
|
||||||
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
|
||||||
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(
|
||||||
} else if (rtype == RTYPE_RMS_EXP) {
|
scratchpad, num_elems);
|
||||||
kernel_filter_vec<dexp_squared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
|
kernel_reduce_block<dmin>
|
||||||
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
<<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||||
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
}
|
||||||
} else {
|
else if (rtype == RTYPE_RMS) {
|
||||||
|
kernel_filter_vec<dsquared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(
|
||||||
|
vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
|
||||||
|
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(
|
||||||
|
scratchpad, num_elems);
|
||||||
|
kernel_reduce_block<dsum>
|
||||||
|
<<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||||
|
}
|
||||||
|
else if (rtype == RTYPE_RMS_EXP) {
|
||||||
|
kernel_filter_vec<dexp_squared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(
|
||||||
|
vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
|
||||||
|
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(
|
||||||
|
scratchpad, num_elems);
|
||||||
|
kernel_reduce_block<dsum>
|
||||||
|
<<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||||
|
}
|
||||||
|
else {
|
||||||
ERROR("Unrecognized rtype");
|
ERROR("Unrecognized rtype");
|
||||||
}
|
}
|
||||||
AcReal result;
|
AcReal result;
|
||||||
|
@@ -60,7 +60,6 @@ typedef struct {
|
|||||||
AcReal x, y, z;
|
AcReal x, y, z;
|
||||||
} vec3r;
|
} vec3r;
|
||||||
|
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
ModelScalar model;
|
ModelScalar model;
|
||||||
AcReal candidate;
|
AcReal candidate;
|
||||||
@@ -71,17 +70,20 @@ typedef struct {
|
|||||||
#define THOROUGH_TEST (1)
|
#define THOROUGH_TEST (1)
|
||||||
#define TEST_TYPE QUICK_TEST
|
#define TEST_TYPE QUICK_TEST
|
||||||
|
|
||||||
static const InitType test_cases[] = {INIT_TYPE_RANDOM, INIT_TYPE_XWAVE, INIT_TYPE_GAUSSIAN_RADIAL_EXPL, INIT_TYPE_ABC_FLOW};
|
static const InitType test_cases[] = {INIT_TYPE_RANDOM, INIT_TYPE_XWAVE,
|
||||||
|
INIT_TYPE_GAUSSIAN_RADIAL_EXPL, INIT_TYPE_ABC_FLOW};
|
||||||
// #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0]))
|
// #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0]))
|
||||||
|
|
||||||
#if TEST_TYPE == QUICK_TEST // REGULAR TEST START HERE --------------------------------------------------------------------------------------------------------------
|
#if TEST_TYPE == \
|
||||||
static inline ModelScalar
|
QUICK_TEST // REGULAR TEST START HERE
|
||||||
|
// --------------------------------------------------------------------------------------------------------------
|
||||||
|
static inline ModelScalar
|
||||||
get_absolute_error(const ModelScalar& model, const AcReal& candidate)
|
get_absolute_error(const ModelScalar& model, const AcReal& candidate)
|
||||||
{
|
{
|
||||||
return fabsl(candidate - model);
|
return fabsl(candidate - model);
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline ModelScalar
|
static inline ModelScalar
|
||||||
get_acceptable_absolute_error(const ModelScalar& range)
|
get_acceptable_absolute_error(const ModelScalar& range)
|
||||||
{
|
{
|
||||||
// This is the upper limit, which assumes that both the min and max values
|
// This is the upper limit, which assumes that both the min and max values
|
||||||
@@ -93,13 +95,13 @@ get_acceptable_absolute_error(const ModelScalar& range)
|
|||||||
return range * AC_REAL_EPSILON;
|
return range * AC_REAL_EPSILON;
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline ModelScalar
|
static inline ModelScalar
|
||||||
get_acceptable_relative_error(void)
|
get_acceptable_relative_error(void)
|
||||||
{
|
{
|
||||||
return 30; // machine epsilons
|
return 30; // machine epsilons
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline ModelScalar
|
static inline ModelScalar
|
||||||
get_relative_error(const ModelScalar& model, const AcReal& candidate)
|
get_relative_error(const ModelScalar& model, const AcReal& candidate)
|
||||||
{
|
{
|
||||||
ModelScalar error = NAN;
|
ModelScalar error = NAN;
|
||||||
@@ -130,7 +132,7 @@ get_relative_error(const ModelScalar& model, const AcReal& candidate)
|
|||||||
return error / AC_REAL_EPSILON;
|
return error / AC_REAL_EPSILON;
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool
|
static bool
|
||||||
verify(const ModelScalar& model, const AcReal& cand, const ModelScalar& range)
|
verify(const ModelScalar& model, const AcReal& cand, const ModelScalar& range)
|
||||||
{
|
{
|
||||||
if (!is_valid(model) || !is_valid(cand))
|
if (!is_valid(model) || !is_valid(cand))
|
||||||
@@ -147,51 +149,42 @@ verify(const ModelScalar& model, const AcReal& cand, const ModelScalar& range)
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
static ModelScalar
|
static ModelScalar
|
||||||
get_reduction_range(const ModelMesh& mesh)
|
get_reduction_range(const ModelMesh& mesh)
|
||||||
{
|
{
|
||||||
ERRCHK(NUM_VTXBUF_HANDLES >= 3);
|
ERRCHK(NUM_VTXBUF_HANDLES >= 3);
|
||||||
|
|
||||||
const ModelScalar max0 = model_reduce_scal(mesh, RTYPE_MAX,
|
const ModelScalar max0 = model_reduce_scal(mesh, RTYPE_MAX, VertexBufferHandle(0));
|
||||||
VertexBufferHandle(0));
|
const ModelScalar max1 = model_reduce_scal(mesh, RTYPE_MAX, VertexBufferHandle(1));
|
||||||
const ModelScalar max1 = model_reduce_scal(mesh, RTYPE_MAX,
|
const ModelScalar max2 = model_reduce_scal(mesh, RTYPE_MAX, VertexBufferHandle(2));
|
||||||
VertexBufferHandle(1));
|
|
||||||
const ModelScalar max2 = model_reduce_scal(mesh, RTYPE_MAX,
|
|
||||||
VertexBufferHandle(2));
|
|
||||||
const ModelScalar max_scal = max(max0, max(max1, max2));
|
const ModelScalar max_scal = max(max0, max(max1, max2));
|
||||||
|
|
||||||
const ModelScalar min0 = model_reduce_scal(mesh, RTYPE_MIN,
|
const ModelScalar min0 = model_reduce_scal(mesh, RTYPE_MIN, VertexBufferHandle(0));
|
||||||
VertexBufferHandle(0));
|
const ModelScalar min1 = model_reduce_scal(mesh, RTYPE_MIN, VertexBufferHandle(1));
|
||||||
const ModelScalar min1 = model_reduce_scal(mesh, RTYPE_MIN,
|
const ModelScalar min2 = model_reduce_scal(mesh, RTYPE_MIN, VertexBufferHandle(2));
|
||||||
VertexBufferHandle(1));
|
|
||||||
const ModelScalar min2 = model_reduce_scal(mesh, RTYPE_MIN,
|
|
||||||
VertexBufferHandle(2));
|
|
||||||
const ModelScalar min_scal = min(min0, min(min1, min2));
|
const ModelScalar min_scal = min(min0, min(min1, min2));
|
||||||
|
|
||||||
return max_scal - min_scal;
|
return max_scal - min_scal;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
print_debug_info(const ModelScalar& model, const AcReal& candidate,
|
print_debug_info(const ModelScalar& model, const AcReal& candidate, const ModelScalar& range)
|
||||||
const ModelScalar& range)
|
|
||||||
{
|
{
|
||||||
printf("MeshPointInfo\n");
|
printf("MeshPointInfo\n");
|
||||||
printf("\tModel: %e\n", double(model));
|
printf("\tModel: %e\n", double(model));
|
||||||
printf("\tCandidate: %e\n", double(candidate));
|
printf("\tCandidate: %e\n", double(candidate));
|
||||||
printf("\tRange: %e\n", double(range));
|
printf("\tRange: %e\n", double(range));
|
||||||
|
|
||||||
printf("\tAbsolute error: %Le (max acceptable: %Le)\n",
|
printf("\tAbsolute error: %Le (max acceptable: %Le)\n", get_absolute_error(model, candidate),
|
||||||
get_absolute_error(model, candidate),
|
|
||||||
get_acceptable_absolute_error(range));
|
get_acceptable_absolute_error(range));
|
||||||
printf("\tRelative error: %Le (max acceptable: %Le)\n",
|
printf("\tRelative error: %Le (max acceptable: %Le)\n", get_relative_error(model, candidate),
|
||||||
get_relative_error(model, candidate),
|
|
||||||
get_acceptable_relative_error());
|
get_acceptable_relative_error());
|
||||||
printf("\tIs acceptable: %d\n", verify(model, candidate, range));
|
printf("\tIs acceptable: %d\n", verify(model, candidate, range));
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
print_result(const ModelScalar& model, const AcReal& candidate,
|
print_result(const ModelScalar& model, const AcReal& candidate, const ModelScalar& range,
|
||||||
const ModelScalar& range, const char* name = "???")
|
const char* name = "???")
|
||||||
{
|
{
|
||||||
const ModelScalar rel_err = get_relative_error(model, candidate);
|
const ModelScalar rel_err = get_relative_error(model, candidate);
|
||||||
const ModelScalar abs_err = get_absolute_error(model, candidate);
|
const ModelScalar abs_err = get_absolute_error(model, candidate);
|
||||||
@@ -216,7 +209,7 @@ print_result(const ModelScalar& model, const AcReal& candidate,
|
|||||||
*/
|
*/
|
||||||
}
|
}
|
||||||
|
|
||||||
static int
|
static int
|
||||||
check_reductions(const AcMeshInfo& config)
|
check_reductions(const AcMeshInfo& config)
|
||||||
{
|
{
|
||||||
printf("Testing reductions\n");
|
printf("Testing reductions\n");
|
||||||
@@ -247,8 +240,7 @@ check_reductions(const AcMeshInfo& config)
|
|||||||
// Scal
|
// Scal
|
||||||
ModelScalar model = model_reduce_scal(*modelmesh, ReductionType(rtype),
|
ModelScalar model = model_reduce_scal(*modelmesh, ReductionType(rtype),
|
||||||
VertexBufferHandle(ftype));
|
VertexBufferHandle(ftype));
|
||||||
AcReal candidate = acReduceScal(ReductionType(rtype),
|
AcReal candidate = acReduceScal(ReductionType(rtype), VertexBufferHandle(ftype));
|
||||||
VertexBufferHandle(ftype));
|
|
||||||
print_result(model, candidate, range, "UUX scal");
|
print_result(model, candidate, range, "UUX scal");
|
||||||
|
|
||||||
bool is_acceptable = verify(model, candidate, range);
|
bool is_acceptable = verify(model, candidate, range);
|
||||||
@@ -261,10 +253,9 @@ check_reductions(const AcMeshInfo& config)
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Vec
|
// Vec
|
||||||
model = model_reduce_vec(*modelmesh, ReductionType(rtype), VTXBUF_UUX,
|
model = model_reduce_vec(*modelmesh, ReductionType(rtype), VTXBUF_UUX, VTXBUF_UUY,
|
||||||
VTXBUF_UUY, VTXBUF_UUZ);
|
VTXBUF_UUZ);
|
||||||
candidate = acReduceVec(ReductionType(rtype), VTXBUF_UUX,
|
candidate = acReduceVec(ReductionType(rtype), VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
|
||||||
VTXBUF_UUY, VTXBUF_UUZ);
|
|
||||||
print_result(model, candidate, range, "UUXYZ vec");
|
print_result(model, candidate, range, "UUXYZ vec");
|
||||||
|
|
||||||
is_acceptable = verify(model, candidate, range);
|
is_acceptable = verify(model, candidate, range);
|
||||||
@@ -277,7 +268,8 @@ check_reductions(const AcMeshInfo& config)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
printf("Acceptable relative error: < %Lg \u03B5, absolute error < %Lg\n", get_acceptable_relative_error(), get_acceptable_absolute_error(range));
|
printf("Acceptable relative error: < %Lg \u03B5, absolute error < %Lg\n",
|
||||||
|
get_acceptable_relative_error(), get_acceptable_absolute_error(range));
|
||||||
}
|
}
|
||||||
acQuit();
|
acQuit();
|
||||||
modelmesh_destroy(modelmesh);
|
modelmesh_destroy(modelmesh);
|
||||||
@@ -290,14 +282,16 @@ check_reductions(const AcMeshInfo& config)
|
|||||||
* Note! Potentially dangerous if all meshes do not interact with each other.
|
* Note! Potentially dangerous if all meshes do not interact with each other.
|
||||||
* Otherwise the range may be too high.
|
* Otherwise the range may be too high.
|
||||||
*/
|
*/
|
||||||
static ModelScalar
|
static ModelScalar
|
||||||
get_data_range(const ModelMesh& model)
|
get_data_range(const ModelMesh& model)
|
||||||
{
|
{
|
||||||
ModelScalar vertex_buffer_max_all = -INFINITY;
|
ModelScalar vertex_buffer_max_all = -INFINITY;
|
||||||
ModelScalar vertex_buffer_min_all = INFINITY;
|
ModelScalar vertex_buffer_min_all = INFINITY;
|
||||||
for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w) {
|
for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w) {
|
||||||
const ModelScalar vertex_buffer_max = model_reduce_scal(model, RTYPE_MAX, VertexBufferHandle(w));
|
const ModelScalar vertex_buffer_max = model_reduce_scal(model, RTYPE_MAX,
|
||||||
const ModelScalar vertex_buffer_min = model_reduce_scal(model, RTYPE_MIN, VertexBufferHandle(w));
|
VertexBufferHandle(w));
|
||||||
|
const ModelScalar vertex_buffer_min = model_reduce_scal(model, RTYPE_MIN,
|
||||||
|
VertexBufferHandle(w));
|
||||||
|
|
||||||
if (vertex_buffer_max > vertex_buffer_max_all)
|
if (vertex_buffer_max > vertex_buffer_max_all)
|
||||||
vertex_buffer_max_all = vertex_buffer_max;
|
vertex_buffer_max_all = vertex_buffer_max;
|
||||||
@@ -312,7 +306,7 @@ get_data_range(const ModelMesh& model)
|
|||||||
static FILE* test_result = NULL;
|
static FILE* test_result = NULL;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
static bool
|
static bool
|
||||||
verify_meshes(const ModelMesh& model, const AcMesh& candidate)
|
verify_meshes(const ModelMesh& model, const AcMesh& candidate)
|
||||||
{
|
{
|
||||||
bool retval = true;
|
bool retval = true;
|
||||||
@@ -335,18 +329,16 @@ verify_meshes(const ModelMesh& model, const AcMesh& candidate)
|
|||||||
|
|
||||||
if (!verify(model_val, cand_val, range)) {
|
if (!verify(model_val, cand_val, range)) {
|
||||||
const int i0 = i % model.info.int_params[AC_mx];
|
const int i0 = i % model.info.int_params[AC_mx];
|
||||||
const int j0 = ((i % (model.info.int_params[AC_mx] *
|
const int j0 = ((i %
|
||||||
model.info.int_params[AC_my])) /
|
(model.info.int_params[AC_mx] * model.info.int_params[AC_my])) /
|
||||||
model.info.int_params[AC_mx]);
|
model.info.int_params[AC_mx]);
|
||||||
const int k0 = i / (model.info.int_params[AC_mx] *
|
const int k0 = i / (model.info.int_params[AC_mx] * model.info.int_params[AC_my]);
|
||||||
model.info.int_params[AC_my]);
|
|
||||||
printf("Index (%d, %d, %d)\n", i0, j0, k0);
|
printf("Index (%d, %d, %d)\n", i0, j0, k0);
|
||||||
print_debug_info(model_val, cand_val, range);
|
print_debug_info(model_val, cand_val, range);
|
||||||
retval = false;
|
retval = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
const ModelScalar abs_error = get_absolute_error(model_val,
|
const ModelScalar abs_error = get_absolute_error(model_val, cand_val);
|
||||||
cand_val);
|
|
||||||
if (abs_error > max_abs_error.error) {
|
if (abs_error > max_abs_error.error) {
|
||||||
max_abs_error.error = abs_error;
|
max_abs_error.error = abs_error;
|
||||||
max_abs_error.model = model_val;
|
max_abs_error.model = model_val;
|
||||||
@@ -368,8 +360,10 @@ verify_meshes(const ModelMesh& model, const AcMesh& candidate)
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
//print_result(max_rel_error.model, max_rel_error.candidate, range, vtxbuf_names[VertexBufferHandle(w)]);
|
// print_result(max_rel_error.model, max_rel_error.candidate, range,
|
||||||
print_result(max_abs_error.model, max_abs_error.candidate, range, vtxbuf_names[VertexBufferHandle(w)]);
|
// vtxbuf_names[VertexBufferHandle(w)]);
|
||||||
|
print_result(max_abs_error.model, max_abs_error.candidate, range,
|
||||||
|
vtxbuf_names[VertexBufferHandle(w)]);
|
||||||
}
|
}
|
||||||
|
|
||||||
#if GEN_TEST_RESULT == 1
|
#if GEN_TEST_RESULT == 1
|
||||||
@@ -378,17 +372,17 @@ verify_meshes(const ModelMesh& model, const AcMesh& candidate)
|
|||||||
fprintf(test_result, "%.3Lg & %.3Lg\n", abs_err, rel_err);
|
fprintf(test_result, "%.3Lg & %.3Lg\n", abs_err, rel_err);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
printf("Acceptable relative error: < %Lg \u03B5, absolute error < %Lg\n", get_acceptable_relative_error(), get_acceptable_absolute_error(range));
|
printf("Acceptable relative error: < %Lg \u03B5, absolute error < %Lg\n",
|
||||||
|
get_acceptable_relative_error(), get_acceptable_absolute_error(range));
|
||||||
|
|
||||||
return retval;
|
return retval;
|
||||||
}
|
}
|
||||||
|
|
||||||
int
|
int
|
||||||
check_rk3(const AcMeshInfo& mesh_info)
|
check_rk3(const AcMeshInfo& mesh_info)
|
||||||
{
|
{
|
||||||
const int num_iterations = 1; // Note: should work up to at least 15 steps
|
const int num_iterations = 1; // Note: should work up to at least 15 steps
|
||||||
printf("Testing RK3 (running %d steps before checking the result)\n",
|
printf("Testing RK3 (running %d steps before checking the result)\n", num_iterations);
|
||||||
num_iterations);
|
|
||||||
int num_failures = 0;
|
int num_failures = 0;
|
||||||
|
|
||||||
// Init CPU meshes
|
// Init CPU meshes
|
||||||
@@ -412,8 +406,9 @@ check_rk3(const AcMeshInfo& mesh_info)
|
|||||||
boundconds(model_mesh->info, model_mesh);
|
boundconds(model_mesh->info, model_mesh);
|
||||||
|
|
||||||
for (int i = 0; i < num_iterations; ++i) {
|
for (int i = 0; i < num_iterations; ++i) {
|
||||||
//const AcReal umax = AcReal(acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ));
|
// const AcReal umax = AcReal(acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY,
|
||||||
//const AcReal dt = host_timestep(umax, mesh_info);
|
// VTXBUF_UUZ));
|
||||||
|
// const AcReal dt = host_timestep(umax, mesh_info);
|
||||||
const AcReal dt = AcReal(1e-2); // Use a small constant timestep to avoid instabilities
|
const AcReal dt = AcReal(1e-2); // Use a small constant timestep to avoid instabilities
|
||||||
|
|
||||||
acIntegrate(dt);
|
acIntegrate(dt);
|
||||||
@@ -438,12 +433,13 @@ check_rk3(const AcMeshInfo& mesh_info)
|
|||||||
return num_failures;
|
return num_failures;
|
||||||
}
|
}
|
||||||
|
|
||||||
int
|
int
|
||||||
run_autotest(void)
|
run_autotest(void)
|
||||||
{
|
{
|
||||||
#if GEN_TEST_RESULT == 1
|
#if GEN_TEST_RESULT == 1
|
||||||
char testresult_path[256];
|
char testresult_path[256];
|
||||||
sprintf(testresult_path, "%s_fullstep_testresult.out", AC_DOUBLE_PRECISION ? "double" : "float");
|
sprintf(testresult_path, "%s_fullstep_testresult.out",
|
||||||
|
AC_DOUBLE_PRECISION ? "double" : "float");
|
||||||
|
|
||||||
test_result = fopen(testresult_path, "w");
|
test_result = fopen(testresult_path, "w");
|
||||||
ERRCHK(test_result);
|
ERRCHK(test_result);
|
||||||
@@ -456,17 +452,13 @@ run_autotest(void)
|
|||||||
load_config(&config);
|
load_config(&config);
|
||||||
|
|
||||||
if (STENCIL_ORDER > 6)
|
if (STENCIL_ORDER > 6)
|
||||||
printf("WARNING!!! If the stencil order is larger than the computational domain some vertices may be done twice (f.ex. doing inner and outer domains separately and some of the front/back/left/right/etc slabs collide). The mesh must be large enough s.t. this doesn't happen.");
|
printf("WARNING!!! If the stencil order is larger than the computational domain some "
|
||||||
|
"vertices may be done twice (f.ex. doing inner and outer domains separately and "
|
||||||
|
"some of the front/back/left/right/etc slabs collide). The mesh must be large "
|
||||||
|
"enough s.t. this doesn't happen.");
|
||||||
|
|
||||||
const vec3i test_dims[] = {
|
const vec3i test_dims[] = {{32, 32, 32}, {64, 32, 32}, {32, 64, 32}, {32, 32, 64},
|
||||||
{32, 32, 32},
|
{64, 64, 32}, {64, 32, 64}, {32, 64, 64}};
|
||||||
{64, 32, 32},
|
|
||||||
{32, 64, 32},
|
|
||||||
{32, 32, 64},
|
|
||||||
{64, 64, 32},
|
|
||||||
{64, 32, 64},
|
|
||||||
{32, 64, 64}
|
|
||||||
};
|
|
||||||
|
|
||||||
int num_failures = 0;
|
int num_failures = 0;
|
||||||
for (size_t i = 0; i < ARRAY_SIZE(test_dims); ++i) {
|
for (size_t i = 0; i < ARRAY_SIZE(test_dims); ++i) {
|
||||||
@@ -509,7 +501,9 @@ run_autotest(void)
|
|||||||
return EXIT_SUCCESS;
|
return EXIT_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
#elif TEST_TYPE == THOROUGH_TEST // GEN TEST FILE START HERE --------------------------------------------------------------------------------------------------------------
|
#elif TEST_TYPE == \
|
||||||
|
THOROUGH_TEST // GEN TEST FILE START HERE
|
||||||
|
// --------------------------------------------------------------------------------------------------------------
|
||||||
typedef struct {
|
typedef struct {
|
||||||
ModelScalar model;
|
ModelScalar model;
|
||||||
AcReal candidate;
|
AcReal candidate;
|
||||||
@@ -520,7 +514,8 @@ typedef struct {
|
|||||||
ModelScalar minimum_magnitude;
|
ModelScalar minimum_magnitude;
|
||||||
} Error;
|
} Error;
|
||||||
|
|
||||||
Error get_error(ModelScalar model, AcReal candidate)
|
Error
|
||||||
|
get_error(ModelScalar model, AcReal candidate)
|
||||||
{
|
{
|
||||||
Error error;
|
Error error;
|
||||||
error.abs_error = 0;
|
error.abs_error = 0;
|
||||||
@@ -532,18 +527,20 @@ Error get_error(ModelScalar model, AcReal candidate)
|
|||||||
error.abs_error = 0;
|
error.abs_error = 0;
|
||||||
error.rel_error = 0;
|
error.rel_error = 0;
|
||||||
error.ulp_error = 0;
|
error.ulp_error = 0;
|
||||||
} else if (!is_valid(error.model) || !is_valid(error.candidate)) {
|
}
|
||||||
|
else if (!is_valid(error.model) || !is_valid(error.candidate)) {
|
||||||
error.abs_error = INFINITY;
|
error.abs_error = INFINITY;
|
||||||
error.rel_error = INFINITY;
|
error.rel_error = INFINITY;
|
||||||
error.ulp_error = INFINITY;
|
error.ulp_error = INFINITY;
|
||||||
} else {
|
}
|
||||||
|
else {
|
||||||
const int base = 2;
|
const int base = 2;
|
||||||
const int p = sizeof(AcReal) == 4 ? 24 : 53; // Bits in the significant
|
const int p = sizeof(AcReal) == 4 ? 24 : 53; // Bits in the significant
|
||||||
|
|
||||||
const ModelScalar e = floorl(logl(fabsl(error.model)) / logl(2));
|
const ModelScalar e = floorl(logl(fabsl(error.model)) / logl(2));
|
||||||
|
|
||||||
const ModelScalar ulp = powl(base, e - (p-1));
|
const ModelScalar ulp = powl(base, e - (p - 1));
|
||||||
const ModelScalar machine_epsilon = 0.5 * powl(base, -(p-1));
|
const ModelScalar machine_epsilon = 0.5 * powl(base, -(p - 1));
|
||||||
error.abs_error = fabsl(model - candidate);
|
error.abs_error = fabsl(model - candidate);
|
||||||
error.ulp_error = error.abs_error / ulp;
|
error.ulp_error = error.abs_error / ulp;
|
||||||
error.rel_error = fabsl(1.0l - candidate / model) / machine_epsilon;
|
error.rel_error = fabsl(1.0l - candidate / model) / machine_epsilon;
|
||||||
@@ -552,14 +549,16 @@ Error get_error(ModelScalar model, AcReal candidate)
|
|||||||
return error;
|
return error;
|
||||||
}
|
}
|
||||||
|
|
||||||
Error get_max_abs_error_mesh(const ModelMesh& model_mesh, const AcMesh& candidate_mesh)
|
Error
|
||||||
|
get_max_abs_error_mesh(const ModelMesh& model_mesh, const AcMesh& candidate_mesh)
|
||||||
{
|
{
|
||||||
Error error;
|
Error error;
|
||||||
error.abs_error = -1;
|
error.abs_error = -1;
|
||||||
|
|
||||||
for (size_t j = 0; j < NUM_VTXBUF_HANDLES; ++j) {
|
for (size_t j = 0; j < NUM_VTXBUF_HANDLES; ++j) {
|
||||||
for (size_t i = 0; i < AC_VTXBUF_SIZE(model_mesh.info); ++i) {
|
for (size_t i = 0; i < AC_VTXBUF_SIZE(model_mesh.info); ++i) {
|
||||||
Error curr_error = get_error(model_mesh.vertex_buffer[j][i], candidate_mesh.vertex_buffer[j][i]);
|
Error curr_error = get_error(model_mesh.vertex_buffer[j][i],
|
||||||
|
candidate_mesh.vertex_buffer[j][i]);
|
||||||
if (curr_error.abs_error > error.abs_error)
|
if (curr_error.abs_error > error.abs_error)
|
||||||
error = curr_error;
|
error = curr_error;
|
||||||
}
|
}
|
||||||
@@ -582,7 +581,6 @@ get_maximum_magnitude(const ModelScalar* field, const AcMeshInfo info)
|
|||||||
return maximum;
|
return maximum;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static ModelScalar
|
static ModelScalar
|
||||||
get_minimum_magnitude(const ModelScalar* field, const AcMeshInfo info)
|
get_minimum_magnitude(const ModelScalar* field, const AcMeshInfo info)
|
||||||
{
|
{
|
||||||
@@ -594,7 +592,9 @@ get_minimum_magnitude(const ModelScalar* field, const AcMeshInfo info)
|
|||||||
return minimum;
|
return minimum;
|
||||||
}
|
}
|
||||||
|
|
||||||
Error get_max_abs_error_vtxbuf(const VertexBufferHandle vtxbuf_handle, const ModelMesh& model_mesh, const AcMesh& candidate_mesh)
|
Error
|
||||||
|
get_max_abs_error_vtxbuf(const VertexBufferHandle vtxbuf_handle, const ModelMesh& model_mesh,
|
||||||
|
const AcMesh& candidate_mesh)
|
||||||
{
|
{
|
||||||
ModelScalar* model_vtxbuf = model_mesh.vertex_buffer[vtxbuf_handle];
|
ModelScalar* model_vtxbuf = model_mesh.vertex_buffer[vtxbuf_handle];
|
||||||
AcReal* candidate_vtxbuf = candidate_mesh.vertex_buffer[vtxbuf_handle];
|
AcReal* candidate_vtxbuf = candidate_mesh.vertex_buffer[vtxbuf_handle];
|
||||||
@@ -610,7 +610,6 @@ Error get_max_abs_error_vtxbuf(const VertexBufferHandle vtxbuf_handle, const Mod
|
|||||||
error = curr_error;
|
error = curr_error;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
error.maximum_magnitude = get_maximum_magnitude(model_vtxbuf, model_mesh.info);
|
error.maximum_magnitude = get_maximum_magnitude(model_vtxbuf, model_mesh.info);
|
||||||
error.minimum_magnitude = get_minimum_magnitude(model_vtxbuf, model_mesh.info);
|
error.minimum_magnitude = get_minimum_magnitude(model_vtxbuf, model_mesh.info);
|
||||||
|
|
||||||
@@ -621,14 +620,17 @@ void
|
|||||||
print_error_to_file(const char* path, const int n, const Error error)
|
print_error_to_file(const char* path, const int n, const Error error)
|
||||||
{
|
{
|
||||||
FILE* file = fopen(path, "a");
|
FILE* file = fopen(path, "a");
|
||||||
fprintf(file, "%d, %Lg, %Lg, %Lg, %Lg, %Lg\n", n, error.ulp_error, error.abs_error, error.rel_error, error.maximum_magnitude, error.minimum_magnitude);
|
fprintf(file, "%d, %Lg, %Lg, %Lg, %Lg, %Lg\n", n, error.ulp_error, error.abs_error,
|
||||||
//fprintf(file, "%d, %Lg, %Lg, %Lg, %Lg, %Lg\n", n, error.maximum_magnitude, error.minimum_magnitude, error.abs_error, error.ulp_error, error.rel_error);
|
error.rel_error, error.maximum_magnitude, error.minimum_magnitude);
|
||||||
|
// fprintf(file, "%d, %Lg, %Lg, %Lg, %Lg, %Lg\n", n, error.maximum_magnitude,
|
||||||
|
// error.minimum_magnitude, error.abs_error, error.ulp_error, error.rel_error);
|
||||||
fclose(file);
|
fclose(file);
|
||||||
}
|
}
|
||||||
|
|
||||||
#define MAX_PATH_LEN (256)
|
#define MAX_PATH_LEN (256)
|
||||||
|
|
||||||
int run_autotest(void)
|
int
|
||||||
|
run_autotest(void)
|
||||||
{
|
{
|
||||||
|
|
||||||
#define N_MIN (32)
|
#define N_MIN (32)
|
||||||
@@ -660,31 +662,41 @@ int run_autotest(void)
|
|||||||
acStore(candidate_mesh);
|
acStore(candidate_mesh);
|
||||||
Error boundcond_error = get_max_abs_error_mesh(*model_mesh, *candidate_mesh);
|
Error boundcond_error = get_max_abs_error_mesh(*model_mesh, *candidate_mesh);
|
||||||
char boundcond_path[MAX_PATH_LEN];
|
char boundcond_path[MAX_PATH_LEN];
|
||||||
sprintf(boundcond_path, "%s_boundcond_%s.testresult", AC_DOUBLE_PRECISION ? "double" : "float", init_type_names[(InitType)init_type]);
|
sprintf(boundcond_path, "%s_boundcond_%s.testresult",
|
||||||
|
AC_DOUBLE_PRECISION ? "double" : "float",
|
||||||
|
init_type_names[(InitType)init_type]);
|
||||||
print_error_to_file(boundcond_path, n, boundcond_error);
|
print_error_to_file(boundcond_path, n, boundcond_error);
|
||||||
}
|
}
|
||||||
|
|
||||||
{ // Check scalar max reduction
|
{ // Check scalar max reduction
|
||||||
ModelScalar model = model_reduce_scal(*model_mesh, (ReductionType)RTYPE_MAX, VTXBUF_UUX);
|
ModelScalar model = model_reduce_scal(*model_mesh, (ReductionType)RTYPE_MAX,
|
||||||
|
VTXBUF_UUX);
|
||||||
AcReal candidate = acReduceScal((ReductionType)RTYPE_MAX, VTXBUF_UUX);
|
AcReal candidate = acReduceScal((ReductionType)RTYPE_MAX, VTXBUF_UUX);
|
||||||
Error scalar_reduce_error = get_error(model, candidate);
|
Error scalar_reduce_error = get_error(model, candidate);
|
||||||
char scalar_reduce_path[MAX_PATH_LEN];
|
char scalar_reduce_path[MAX_PATH_LEN];
|
||||||
sprintf(scalar_reduce_path, "%s_scalar_reduce_%s.testresult", AC_DOUBLE_PRECISION ? "double" : "float", init_type_names[(InitType)init_type]);
|
sprintf(scalar_reduce_path, "%s_scalar_reduce_%s.testresult",
|
||||||
|
AC_DOUBLE_PRECISION ? "double" : "float",
|
||||||
|
init_type_names[(InitType)init_type]);
|
||||||
print_error_to_file(scalar_reduce_path, n, scalar_reduce_error);
|
print_error_to_file(scalar_reduce_path, n, scalar_reduce_error);
|
||||||
}
|
}
|
||||||
|
|
||||||
{ // Check vector max reduction
|
{ // Check vector max reduction
|
||||||
ModelScalar model = model_reduce_vec(*model_mesh, (ReductionType)RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
|
ModelScalar model = model_reduce_vec(*model_mesh, (ReductionType)RTYPE_MAX,
|
||||||
AcReal candidate = acReduceVec((ReductionType)RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
|
VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
|
||||||
|
AcReal candidate = acReduceVec((ReductionType)RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY,
|
||||||
|
VTXBUF_UUZ);
|
||||||
Error vector_reduce_error = get_error(model, candidate);
|
Error vector_reduce_error = get_error(model, candidate);
|
||||||
char vector_reduce_path[MAX_PATH_LEN];
|
char vector_reduce_path[MAX_PATH_LEN];
|
||||||
sprintf(vector_reduce_path, "%s_vector_reduce_%s.testresult", AC_DOUBLE_PRECISION ? "double" : "float", init_type_names[(InitType)init_type]);
|
sprintf(vector_reduce_path, "%s_vector_reduce_%s.testresult",
|
||||||
|
AC_DOUBLE_PRECISION ? "double" : "float",
|
||||||
|
init_type_names[(InitType)init_type]);
|
||||||
print_error_to_file(vector_reduce_path, n, vector_reduce_error);
|
print_error_to_file(vector_reduce_path, n, vector_reduce_error);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Time advance
|
// Time advance
|
||||||
{
|
{
|
||||||
const AcReal umax = (AcReal)model_reduce_vec(*model_mesh, (ReductionType)RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
|
const AcReal umax = (AcReal)model_reduce_vec(*model_mesh, (ReductionType)RTYPE_MAX,
|
||||||
|
VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
|
||||||
const AcReal dt = host_timestep(umax, config);
|
const AcReal dt = host_timestep(umax, config);
|
||||||
|
|
||||||
// Host integration step
|
// Host integration step
|
||||||
@@ -699,12 +711,18 @@ int run_autotest(void)
|
|||||||
|
|
||||||
// Check fields
|
// Check fields
|
||||||
for (int vtxbuf_handle = 0; vtxbuf_handle < NUM_VTXBUF_HANDLES; ++vtxbuf_handle) {
|
for (int vtxbuf_handle = 0; vtxbuf_handle < NUM_VTXBUF_HANDLES; ++vtxbuf_handle) {
|
||||||
Error field_error = get_max_abs_error_vtxbuf((VertexBufferHandle)vtxbuf_handle, *model_mesh, *candidate_mesh);
|
Error field_error = get_max_abs_error_vtxbuf((VertexBufferHandle)vtxbuf_handle,
|
||||||
|
*model_mesh, *candidate_mesh);
|
||||||
|
|
||||||
printf("model %Lg, cand %Lg, abs %Lg, rel %Lg\n", (ModelScalar)field_error.model, (ModelScalar)field_error.candidate, (ModelScalar)field_error.abs_error, (ModelScalar)field_error.rel_error);
|
printf("model %Lg, cand %Lg, abs %Lg, rel %Lg\n",
|
||||||
|
(ModelScalar)field_error.model, (ModelScalar)field_error.candidate,
|
||||||
|
(ModelScalar)field_error.abs_error, (ModelScalar)field_error.rel_error);
|
||||||
|
|
||||||
char field_path[MAX_PATH_LEN];
|
char field_path[MAX_PATH_LEN];
|
||||||
sprintf(field_path, "%s_integrationstep_%s_%s.testresult", AC_DOUBLE_PRECISION ? "double" : "float", init_type_names[(InitType)init_type], vtxbuf_names[(VertexBufferHandle)vtxbuf_handle]);
|
sprintf(field_path, "%s_integrationstep_%s_%s.testresult",
|
||||||
|
AC_DOUBLE_PRECISION ? "double" : "float",
|
||||||
|
init_type_names[(InitType)init_type],
|
||||||
|
vtxbuf_names[(VertexBufferHandle)vtxbuf_handle]);
|
||||||
print_error_to_file(field_path, n, field_error);
|
print_error_to_file(field_path, n, field_error);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -35,10 +35,10 @@
|
|||||||
#include "model/model_rk3.h"
|
#include "model/model_rk3.h"
|
||||||
#include "timer_hires.h"
|
#include "timer_hires.h"
|
||||||
|
|
||||||
#include <vector>
|
#include "src/core/errchk.h"
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
#include "src/core/errchk.h"
|
#include <vector>
|
||||||
|
|
||||||
static bool
|
static bool
|
||||||
smaller_than(const double& a, const double& b)
|
smaller_than(const double& a, const double& b)
|
||||||
@@ -47,7 +47,8 @@ smaller_than(const double& a, const double& b)
|
|||||||
}
|
}
|
||||||
|
|
||||||
static int
|
static int
|
||||||
write_runningtimes(const char* path, const int n, const double min, const double max, const double median, const double perc)
|
write_runningtimes(const char* path, const int n, const double min, const double max,
|
||||||
|
const double median, const double perc)
|
||||||
{
|
{
|
||||||
FILE* fp;
|
FILE* fp;
|
||||||
fp = fopen(path, "a");
|
fp = fopen(path, "a");
|
||||||
@@ -80,7 +81,8 @@ int
|
|||||||
run_benchmark(void)
|
run_benchmark(void)
|
||||||
{
|
{
|
||||||
char runningtime_path[256];
|
char runningtime_path[256];
|
||||||
sprintf(runningtime_path, "%s_%s_runningtimes.out", AC_DOUBLE_PRECISION ? "double" : "float", GEN_BENCHMARK_RK3 ? "rk3substep" : "fullstep");
|
sprintf(runningtime_path, "%s_%s_runningtimes.out", AC_DOUBLE_PRECISION ? "double" : "float",
|
||||||
|
GEN_BENCHMARK_RK3 ? "rk3substep" : "fullstep");
|
||||||
|
|
||||||
FILE* fp;
|
FILE* fp;
|
||||||
fp = fopen(runningtime_path, "w");
|
fp = fopen(runningtime_path, "w");
|
||||||
@@ -88,13 +90,14 @@ run_benchmark(void)
|
|||||||
if (fp != NULL) {
|
if (fp != NULL) {
|
||||||
fprintf(fp, "n, min, max, median, perc\n");
|
fprintf(fp, "n, min, max, median, perc\n");
|
||||||
fclose(fp);
|
fclose(fp);
|
||||||
} else {
|
}
|
||||||
|
else {
|
||||||
return EXIT_FAILURE;
|
return EXIT_FAILURE;
|
||||||
}
|
}
|
||||||
|
|
||||||
#define N_STEP_SIZE (128)
|
#define N_STEP_SIZE (128)
|
||||||
#define MAX_MESH_DIM (128)
|
#define MAX_MESH_DIM (128)
|
||||||
#define NUM_ITERS (100)
|
#define NUM_ITERS (100)
|
||||||
for (int n = N_STEP_SIZE; n <= MAX_MESH_DIM; n += N_STEP_SIZE) {
|
for (int n = N_STEP_SIZE; n <= MAX_MESH_DIM; n += N_STEP_SIZE) {
|
||||||
/* Parse configs */
|
/* Parse configs */
|
||||||
AcMeshInfo mesh_info;
|
AcMeshInfo mesh_info;
|
||||||
@@ -113,7 +116,6 @@ run_benchmark(void)
|
|||||||
std::vector<double> results;
|
std::vector<double> results;
|
||||||
results.reserve(NUM_ITERS);
|
results.reserve(NUM_ITERS);
|
||||||
|
|
||||||
|
|
||||||
// Warmup
|
// Warmup
|
||||||
for (int i = 0; i < 10; ++i) {
|
for (int i = 0; i < 10; ++i) {
|
||||||
acIntegrate(0);
|
acIntegrate(0);
|
||||||
@@ -124,28 +126,35 @@ run_benchmark(void)
|
|||||||
for (int i = 0; i < NUM_ITERS; ++i) {
|
for (int i = 0; i < NUM_ITERS; ++i) {
|
||||||
|
|
||||||
timer_reset(&t);
|
timer_reset(&t);
|
||||||
#if GEN_BENCHMARK_RK3 == 1
|
#if GEN_BENCHMARK_RK3 == 1
|
||||||
acIntegrateStep(2, FLT_EPSILON);
|
acIntegrateStep(2, FLT_EPSILON);
|
||||||
#else // GEN_BENCHMARK_FULL
|
#else // GEN_BENCHMARK_FULL
|
||||||
//const AcReal umax = acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
|
// const AcReal umax = acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
|
||||||
const AcReal dt = AcReal(1e-2); // TODO adaptive timestep //host_timestep(umax, mesh_info);
|
const AcReal dt = AcReal(
|
||||||
|
1e-2); // TODO adaptive timestep //host_timestep(umax, mesh_info);
|
||||||
acIntegrate(dt);
|
acIntegrate(dt);
|
||||||
#endif
|
#endif
|
||||||
acSynchronize();
|
acSynchronize();
|
||||||
|
|
||||||
const double ms_elapsed = timer_diff_nsec(t) / 1e6;
|
const double ms_elapsed = timer_diff_nsec(t) / 1e6;
|
||||||
results.push_back(ms_elapsed);
|
results.push_back(ms_elapsed);
|
||||||
}
|
}
|
||||||
|
|
||||||
#define NTH_PERCENTILE (0.95)
|
#define NTH_PERCENTILE (0.95)
|
||||||
std::sort(results.begin(), results.end(), smaller_than);
|
std::sort(results.begin(), results.end(), smaller_than);
|
||||||
write_runningtimes(runningtime_path, n, results[0], results[results.size()-1], results[int(0.5 * NUM_ITERS)], results[int(NTH_PERCENTILE * NUM_ITERS)]);
|
write_runningtimes(runningtime_path, n, results[0], results[results.size() - 1],
|
||||||
|
results[int(0.5 * NUM_ITERS)], results[int(NTH_PERCENTILE * NUM_ITERS)]);
|
||||||
|
|
||||||
char percentile_path[256];
|
char percentile_path[256];
|
||||||
sprintf(percentile_path, "%d_%s_%s_percentiles.out", n, AC_DOUBLE_PRECISION ? "double" : "float", GEN_BENCHMARK_RK3 ? "rk3substep" : "fullstep");
|
sprintf(percentile_path, "%d_%s_%s_percentiles.out", n,
|
||||||
|
AC_DOUBLE_PRECISION ? "double" : "float",
|
||||||
|
GEN_BENCHMARK_RK3 ? "rk3substep" : "fullstep");
|
||||||
write_percentiles(percentile_path, NUM_ITERS, results);
|
write_percentiles(percentile_path, NUM_ITERS, results);
|
||||||
|
|
||||||
printf("%s running time %g ms, (%dth percentile, nx = %d) \n", GEN_BENCHMARK_RK3 ? "RK3 step" : "Fullstep", double(results[int(NTH_PERCENTILE * NUM_ITERS)]), int(NTH_PERCENTILE * 100), mesh_info.int_params[AC_nx]);
|
printf("%s running time %g ms, (%dth percentile, nx = %d) \n",
|
||||||
|
GEN_BENCHMARK_RK3 ? "RK3 step" : "Fullstep",
|
||||||
|
double(results[int(NTH_PERCENTILE * NUM_ITERS)]), int(NTH_PERCENTILE * 100),
|
||||||
|
mesh_info.int_params[AC_nx]);
|
||||||
|
|
||||||
acStore(mesh);
|
acStore(mesh);
|
||||||
acQuit();
|
acQuit();
|
||||||
@@ -225,7 +234,8 @@ run_benchmark(void)
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
#else //////////////////////////////////////////////////////////////////////////GENERATE_BENCHMARK_DATA
|
#else
|
||||||
|
//////////////////////////////////////////////////////////////////////////GENERATE_BENCHMARK_DATA
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@@ -290,8 +300,8 @@ run_benchmark(void)
|
|||||||
|
|
||||||
#define NTH_PERCENTILE (0.95)
|
#define NTH_PERCENTILE (0.95)
|
||||||
std::sort(results.begin(), results.end(), smaller_than);
|
std::sort(results.begin(), results.end(), smaller_than);
|
||||||
write_result(n, results[0], results[results.size()-1], results[int(0.5 * NUM_ITERS)], results[int(NTH_PERCENTILE * NUM_ITERS)]);
|
write_result(n, results[0], results[results.size()-1], results[int(0.5 * NUM_ITERS)],
|
||||||
write_percentiles(n, NUM_ITERS, results);
|
results[int(NTH_PERCENTILE * NUM_ITERS)]); write_percentiles(n, NUM_ITERS, results);
|
||||||
}
|
}
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
|
@@ -79,8 +79,7 @@ parse_config(const char* path, AcMeshInfo* config)
|
|||||||
int idx = -1;
|
int idx = -1;
|
||||||
if ((idx = find_str(keyword, intparam_names, NUM_INT_PARAM_TYPES)) >= 0)
|
if ((idx = find_str(keyword, intparam_names, NUM_INT_PARAM_TYPES)) >= 0)
|
||||||
config->int_params[idx] = atoi(value);
|
config->int_params[idx] = atoi(value);
|
||||||
else if ((idx = find_str(keyword, realparam_names,
|
else if ((idx = find_str(keyword, realparam_names, NUM_REAL_PARAM_TYPES)) >= 0)
|
||||||
NUM_REAL_PARAM_TYPES)) >= 0)
|
|
||||||
config->real_params[idx] = AcReal(atof(value));
|
config->real_params[idx] = AcReal(atof(value));
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -92,32 +91,30 @@ update_config(AcMeshInfo* config)
|
|||||||
{
|
{
|
||||||
config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER;
|
config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER;
|
||||||
///////////// PAD TEST
|
///////////// PAD TEST
|
||||||
//config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER + PAD_SIZE;
|
// config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER + PAD_SIZE;
|
||||||
///////////// PAD TEST
|
///////////// PAD TEST
|
||||||
config->int_params[AC_my] = config->int_params[AC_ny] + STENCIL_ORDER;
|
config->int_params[AC_my] = config->int_params[AC_ny] + STENCIL_ORDER;
|
||||||
config->int_params[AC_mz] = config->int_params[AC_nz] + STENCIL_ORDER;
|
config->int_params[AC_mz] = config->int_params[AC_nz] + STENCIL_ORDER;
|
||||||
|
|
||||||
// Bounds for the computational domain, i.e. nx_min <= i < nx_max
|
// Bounds for the computational domain, i.e. nx_min <= i < nx_max
|
||||||
config->int_params[AC_nx_min] = STENCIL_ORDER / 2;
|
config->int_params[AC_nx_min] = STENCIL_ORDER / 2;
|
||||||
config->int_params[AC_nx_max] = config->int_params[AC_nx_min] +
|
config->int_params[AC_nx_max] = config->int_params[AC_nx_min] + config->int_params[AC_nx];
|
||||||
config->int_params[AC_nx];
|
|
||||||
config->int_params[AC_ny_min] = STENCIL_ORDER / 2;
|
config->int_params[AC_ny_min] = STENCIL_ORDER / 2;
|
||||||
config->int_params[AC_ny_max] = config->int_params[AC_ny] +
|
config->int_params[AC_ny_max] = config->int_params[AC_ny] + STENCIL_ORDER / 2;
|
||||||
STENCIL_ORDER / 2;
|
|
||||||
config->int_params[AC_nz_min] = STENCIL_ORDER / 2;
|
config->int_params[AC_nz_min] = STENCIL_ORDER / 2;
|
||||||
config->int_params[AC_nz_max] = config->int_params[AC_nz] +
|
config->int_params[AC_nz_max] = config->int_params[AC_nz] + STENCIL_ORDER / 2;
|
||||||
STENCIL_ORDER / 2;
|
|
||||||
|
|
||||||
// Spacing
|
// Spacing
|
||||||
config->real_params[AC_inv_dsx] = AcReal(1.) / config->real_params[AC_dsx];
|
config->real_params[AC_inv_dsx] = AcReal(1.) / config->real_params[AC_dsx];
|
||||||
config->real_params[AC_inv_dsy] = AcReal(1.) / config->real_params[AC_dsy];
|
config->real_params[AC_inv_dsy] = AcReal(1.) / config->real_params[AC_dsy];
|
||||||
config->real_params[AC_inv_dsz] = AcReal(1.) / config->real_params[AC_dsz];
|
config->real_params[AC_inv_dsz] = AcReal(1.) / config->real_params[AC_dsz];
|
||||||
config->real_params[AC_dsmin] = min(config->real_params[AC_dsx], min(config->real_params[AC_dsy], config->real_params[AC_dsz]));
|
config->real_params[AC_dsmin] = min(
|
||||||
|
config->real_params[AC_dsx], min(config->real_params[AC_dsy], config->real_params[AC_dsz]));
|
||||||
|
|
||||||
// Real grid coordanates (DEFINE FOR GRID WITH THE GHOST ZONES)
|
// Real grid coordanates (DEFINE FOR GRID WITH THE GHOST ZONES)
|
||||||
config->real_params[AC_xlen] = config->real_params[AC_dsx]*config->int_params[AC_mx];
|
config->real_params[AC_xlen] = config->real_params[AC_dsx] * config->int_params[AC_mx];
|
||||||
config->real_params[AC_ylen] = config->real_params[AC_dsy]*config->int_params[AC_my];
|
config->real_params[AC_ylen] = config->real_params[AC_dsy] * config->int_params[AC_my];
|
||||||
config->real_params[AC_zlen] = config->real_params[AC_dsz]*config->int_params[AC_mz];
|
config->real_params[AC_zlen] = config->real_params[AC_dsz] * config->int_params[AC_mz];
|
||||||
|
|
||||||
config->real_params[AC_xorig] = AcReal(.5) * config->real_params[AC_xlen];
|
config->real_params[AC_xorig] = AcReal(.5) * config->real_params[AC_xlen];
|
||||||
config->real_params[AC_yorig] = AcReal(.5) * config->real_params[AC_ylen];
|
config->real_params[AC_yorig] = AcReal(.5) * config->real_params[AC_ylen];
|
||||||
@@ -125,35 +122,35 @@ update_config(AcMeshInfo* config)
|
|||||||
|
|
||||||
/* Additional helper params */
|
/* Additional helper params */
|
||||||
// Int helpers
|
// Int helpers
|
||||||
config->int_params[AC_mxy] = config->int_params[AC_mx] *
|
config->int_params[AC_mxy] = config->int_params[AC_mx] * config->int_params[AC_my];
|
||||||
config->int_params[AC_my];
|
config->int_params[AC_nxy] = config->int_params[AC_nx] * config->int_params[AC_ny];
|
||||||
config->int_params[AC_nxy] = config->int_params[AC_nx] *
|
config->int_params[AC_nxyz] = config->int_params[AC_nxy] * config->int_params[AC_nz];
|
||||||
config->int_params[AC_ny];
|
|
||||||
config->int_params[AC_nxyz] = config->int_params[AC_nxy] *
|
|
||||||
config->int_params[AC_nz];
|
|
||||||
|
|
||||||
// Real helpers
|
// Real helpers
|
||||||
config->real_params[AC_cs2_sound] = config->real_params[AC_cs_sound] *
|
config->real_params[AC_cs2_sound] = config->real_params[AC_cs_sound] *
|
||||||
config->real_params[AC_cs_sound];
|
config->real_params[AC_cs_sound];
|
||||||
|
|
||||||
config->real_params[AC_cv_sound] = config->real_params[AC_cp_sound] / config->real_params[AC_gamma];
|
config->real_params[AC_cv_sound] = config->real_params[AC_cp_sound] /
|
||||||
|
config->real_params[AC_gamma];
|
||||||
|
|
||||||
AcReal G_CONST_CGS = AcReal(6.674e-8); // g/cm3/s GGS definition //TODO define in a separate module
|
AcReal G_CONST_CGS = AcReal(
|
||||||
|
6.674e-8); // g/cm3/s GGS definition //TODO define in a separate module
|
||||||
AcReal M_sun = AcReal(1.989e33); // g solar mass
|
AcReal M_sun = AcReal(1.989e33); // g solar mass
|
||||||
|
|
||||||
config->real_params[AC_M_star] = config->real_params[AC_M_star]*M_sun /
|
config->real_params[AC_M_star] = config->real_params[AC_M_star] * M_sun /
|
||||||
( (config->real_params[AC_unit_length]*
|
((config->real_params[AC_unit_length] *
|
||||||
config->real_params[AC_unit_length]*
|
config->real_params[AC_unit_length] *
|
||||||
config->real_params[AC_unit_length]) *
|
config->real_params[AC_unit_length]) *
|
||||||
config->real_params[AC_unit_density] ) ;
|
config->real_params[AC_unit_density]);
|
||||||
|
|
||||||
config->real_params[AC_G_CONST] = G_CONST_CGS /
|
config->real_params[AC_G_CONST] = G_CONST_CGS / ((config->real_params[AC_unit_velocity] *
|
||||||
( (config->real_params[AC_unit_velocity]*config->real_params[AC_unit_velocity]) /
|
config->real_params[AC_unit_velocity]) /
|
||||||
(config->real_params[AC_unit_density] *config->real_params[AC_unit_length]) ) ;
|
(config->real_params[AC_unit_density] *
|
||||||
|
config->real_params[AC_unit_length]));
|
||||||
config->real_params[AC_GM_star] = config->real_params[AC_M_star]*config->real_params[AC_G_CONST];
|
|
||||||
config->real_params[AC_sq2GM_star] = AcReal(sqrt(AcReal(2)*config->real_params[AC_GM_star]));
|
|
||||||
|
|
||||||
|
config->real_params[AC_GM_star] = config->real_params[AC_M_star] *
|
||||||
|
config->real_params[AC_G_CONST];
|
||||||
|
config->real_params[AC_sq2GM_star] = AcReal(sqrt(AcReal(2) * config->real_params[AC_GM_star]));
|
||||||
|
|
||||||
const bool print_config = true;
|
const bool print_config = true;
|
||||||
if (print_config) {
|
if (print_config) {
|
||||||
|
@@ -82,8 +82,7 @@ static Camera camera = (Camera){(float2){.0f, .0f}, 1.f};
|
|||||||
static inline vec4
|
static inline vec4
|
||||||
project_ortho(const float2& pos, const float2& bbox, const float2& wdims)
|
project_ortho(const float2& pos, const float2& bbox, const float2& wdims)
|
||||||
{
|
{
|
||||||
const vec4 rect = (vec4){
|
const vec4 rect = (vec4){camera.scale * (pos.x - camera.pos.x) + 0.5f * wdims.x,
|
||||||
camera.scale * (pos.x - camera.pos.x) + 0.5f * wdims.x,
|
|
||||||
camera.scale * (pos.y - camera.pos.y) + 0.5f * wdims.y,
|
camera.scale * (pos.y - camera.pos.y) + 0.5f * wdims.y,
|
||||||
camera.scale * bbox.x, camera.scale * bbox.y};
|
camera.scale * bbox.x, camera.scale * bbox.y};
|
||||||
|
|
||||||
@@ -103,13 +102,12 @@ renderer_init(const int& mx, const int& my)
|
|||||||
SDL_InitSubSystem(SDL_INIT_VIDEO | SDL_INIT_EVENTS);
|
SDL_InitSubSystem(SDL_INIT_VIDEO | SDL_INIT_EVENTS);
|
||||||
|
|
||||||
// Setup window
|
// Setup window
|
||||||
window = SDL_CreateWindow("Astaroth", SDL_WINDOWPOS_UNDEFINED,
|
window = SDL_CreateWindow("Astaroth", SDL_WINDOWPOS_UNDEFINED, SDL_WINDOWPOS_UNDEFINED,
|
||||||
SDL_WINDOWPOS_UNDEFINED, window_width,
|
window_width, window_height, SDL_WINDOW_SHOWN);
|
||||||
window_height, SDL_WINDOW_SHOWN);
|
|
||||||
|
|
||||||
// Setup SDL renderer
|
// Setup SDL renderer
|
||||||
renderer = SDL_CreateRenderer(window, -1, SDL_RENDERER_ACCELERATED);
|
renderer = SDL_CreateRenderer(window, -1, SDL_RENDERER_ACCELERATED);
|
||||||
//SDL_SetWindowFullscreen(window, SDL_WINDOW_FULLSCREEN_DESKTOP);
|
// SDL_SetWindowFullscreen(window, SDL_WINDOW_FULLSCREEN_DESKTOP);
|
||||||
SDL_GetWindowSize(window, &window_width, &window_height);
|
SDL_GetWindowSize(window, &window_width, &window_height);
|
||||||
|
|
||||||
SDL_SetHint(SDL_HINT_RENDER_SCALE_QUALITY, "1"); // Linear filtering
|
SDL_SetHint(SDL_HINT_RENDER_SCALE_QUALITY, "1"); // Linear filtering
|
||||||
@@ -118,24 +116,24 @@ renderer_init(const int& mx, const int& my)
|
|||||||
datasurface_height = my;
|
datasurface_height = my;
|
||||||
// vec drawing uses the surface of the first component, no memory issues here
|
// vec drawing uses the surface of the first component, no memory issues here
|
||||||
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i)
|
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i)
|
||||||
surfaces[i] = SDL_CreateRGBSurfaceWithFormat(
|
surfaces[i] = SDL_CreateRGBSurfaceWithFormat(0, datasurface_width, datasurface_height,
|
||||||
0, datasurface_width, datasurface_height, window_bpp,
|
window_bpp, SDL_PIXELFORMAT_RGBA8888);
|
||||||
SDL_PIXELFORMAT_RGBA8888);
|
|
||||||
|
|
||||||
camera.pos = (float2){.5f * tiles_per_row * datasurface_width - .5f * datasurface_width,
|
camera.pos = (float2){.5f * tiles_per_row * datasurface_width - .5f * datasurface_width,
|
||||||
-.5f * (num_tiles / tiles_per_row) * datasurface_height + .5f * datasurface_height};
|
-.5f * (num_tiles / tiles_per_row) * datasurface_height +
|
||||||
|
.5f * datasurface_height};
|
||||||
camera.scale = min(window_width / float(datasurface_width * tiles_per_row),
|
camera.scale = min(window_width / float(datasurface_width * tiles_per_row),
|
||||||
window_height / float(datasurface_height * (num_tiles/tiles_per_row)));
|
window_height / float(datasurface_height * (num_tiles / tiles_per_row)));
|
||||||
|
|
||||||
SDL_RendererInfo renderer_info;
|
SDL_RendererInfo renderer_info;
|
||||||
SDL_GetRendererInfo(renderer, &renderer_info);
|
SDL_GetRendererInfo(renderer, &renderer_info);
|
||||||
printf("SDL renderer max texture dims: (%d, %d)\n", renderer_info.max_texture_width, renderer_info.max_texture_height);
|
printf("SDL renderer max texture dims: (%d, %d)\n", renderer_info.max_texture_width,
|
||||||
|
renderer_info.max_texture_height);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int
|
static int
|
||||||
set_pixel(const int& i, const int& j, const uint32_t& color,
|
set_pixel(const int& i, const int& j, const uint32_t& color, SDL_Surface* surface)
|
||||||
SDL_Surface* surface)
|
|
||||||
{
|
{
|
||||||
uint32_t* pixels = (uint32_t*)surface->pixels;
|
uint32_t* pixels = (uint32_t*)surface->pixels;
|
||||||
pixels[i + j * surface->w] = color;
|
pixels[i + j * surface->w] = color;
|
||||||
@@ -143,11 +141,10 @@ set_pixel(const int& i, const int& j, const uint32_t& color,
|
|||||||
}
|
}
|
||||||
|
|
||||||
static int
|
static int
|
||||||
draw_vertex_buffer(const AcMesh& mesh, const VertexBufferHandle& vertex_buffer,
|
draw_vertex_buffer(const AcMesh& mesh, const VertexBufferHandle& vertex_buffer, const int& tile)
|
||||||
const int& tile)
|
|
||||||
{
|
{
|
||||||
const float xoffset = (tile % tiles_per_row) * datasurface_width;
|
const float xoffset = (tile % tiles_per_row) * datasurface_width;
|
||||||
const float yoffset = - (tile / tiles_per_row) * datasurface_height;
|
const float yoffset = -(tile / tiles_per_row) * datasurface_height;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
const float max = float(model_reduce_scal(mesh, RTYPE_MAX, vertex_buffer));
|
const float max = float(model_reduce_scal(mesh, RTYPE_MAX, vertex_buffer));
|
||||||
@@ -158,7 +155,7 @@ draw_vertex_buffer(const AcMesh& mesh, const VertexBufferHandle& vertex_buffer,
|
|||||||
const float range = fabsf(max - min);
|
const float range = fabsf(max - min);
|
||||||
const float mid = max - .5f * range;
|
const float mid = max - .5f * range;
|
||||||
|
|
||||||
const int k = k_slice; //mesh.info.int_params[AC_mz] / 2;
|
const int k = k_slice; // mesh.info.int_params[AC_mz] / 2;
|
||||||
|
|
||||||
for (int j = 0; j < mesh.info.int_params[AC_my]; ++j) {
|
for (int j = 0; j < mesh.info.int_params[AC_my]; ++j) {
|
||||||
for (int i = 0; i < mesh.info.int_params[AC_mx]; ++i) {
|
for (int i = 0; i < mesh.info.int_params[AC_mx]; ++i) {
|
||||||
@@ -166,29 +163,23 @@ draw_vertex_buffer(const AcMesh& mesh, const VertexBufferHandle& vertex_buffer,
|
|||||||
|
|
||||||
const int idx = AC_VTXBUF_IDX(i, j, k, mesh.info);
|
const int idx = AC_VTXBUF_IDX(i, j, k, mesh.info);
|
||||||
const uint8_t shade = (uint8_t)(
|
const uint8_t shade = (uint8_t)(
|
||||||
255.f *
|
255.f * (fabsf(float(mesh.vertex_buffer[vertex_buffer][idx]) - mid)) / range);
|
||||||
(fabsf(float(mesh.vertex_buffer[vertex_buffer][idx]) - mid)) /
|
|
||||||
range);
|
|
||||||
uint8_t color[4] = {0, 0, 0, 255};
|
uint8_t color[4] = {0, 0, 0, 255};
|
||||||
color[tile % 3] = shade;
|
color[tile % 3] = shade;
|
||||||
const uint32_t mapped_color = SDL_MapRGBA(
|
const uint32_t mapped_color = SDL_MapRGBA(surfaces[vertex_buffer]->format, color[0],
|
||||||
surfaces[vertex_buffer]->format, color[0], color[1], color[2],
|
color[1], color[2], color[3]);
|
||||||
color[3]);
|
|
||||||
set_pixel(i, j, mapped_color, surfaces[vertex_buffer]);
|
set_pixel(i, j, mapped_color, surfaces[vertex_buffer]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const float2 pos = (float2){xoffset, yoffset};
|
const float2 pos = (float2){xoffset, yoffset};
|
||||||
const float2 bbox = (float2){.5f * datasurface_width,
|
const float2 bbox = (float2){.5f * datasurface_width, .5f * datasurface_height};
|
||||||
.5f * datasurface_height};
|
|
||||||
const float2 wsize = (float2){float(window_width), float(window_height)};
|
const float2 wsize = (float2){float(window_width), float(window_height)};
|
||||||
const vec4 rectf = project_ortho(pos, bbox, wsize);
|
const vec4 rectf = project_ortho(pos, bbox, wsize);
|
||||||
SDL_Rect rect = (SDL_Rect){
|
SDL_Rect rect = (SDL_Rect){int(rectf.x - rectf.w), int(wsize.y - rectf.y - rectf.h),
|
||||||
int(rectf.x - rectf.w), int(wsize.y - rectf.y - rectf.h),
|
|
||||||
int(ceil(2.f * rectf.w)), int(ceil(2.f * rectf.h))};
|
int(ceil(2.f * rectf.w)), int(ceil(2.f * rectf.h))};
|
||||||
|
|
||||||
SDL_Texture* tex = SDL_CreateTextureFromSurface(renderer,
|
SDL_Texture* tex = SDL_CreateTextureFromSurface(renderer, surfaces[vertex_buffer]);
|
||||||
surfaces[vertex_buffer]);
|
|
||||||
SDL_RenderCopy(renderer, tex, NULL, &rect);
|
SDL_RenderCopy(renderer, tex, NULL, &rect);
|
||||||
SDL_DestroyTexture(tex);
|
SDL_DestroyTexture(tex);
|
||||||
|
|
||||||
@@ -196,14 +187,12 @@ draw_vertex_buffer(const AcMesh& mesh, const VertexBufferHandle& vertex_buffer,
|
|||||||
}
|
}
|
||||||
|
|
||||||
static int
|
static int
|
||||||
draw_vertex_buffer_vec(const AcMesh& mesh,
|
draw_vertex_buffer_vec(const AcMesh& mesh, const VertexBufferHandle& vertex_buffer_a,
|
||||||
const VertexBufferHandle& vertex_buffer_a,
|
|
||||||
const VertexBufferHandle& vertex_buffer_b,
|
const VertexBufferHandle& vertex_buffer_b,
|
||||||
const VertexBufferHandle& vertex_buffer_c,
|
const VertexBufferHandle& vertex_buffer_c, const int& tile)
|
||||||
const int& tile)
|
|
||||||
{
|
{
|
||||||
const float xoffset = (tile % tiles_per_row) * datasurface_width;
|
const float xoffset = (tile % tiles_per_row) * datasurface_width;
|
||||||
const float yoffset = - (tile / tiles_per_row) * datasurface_height;
|
const float yoffset = -(tile / tiles_per_row) * datasurface_height;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
const float maxx = float(
|
const float maxx = float(
|
||||||
@@ -215,52 +204,41 @@ draw_vertex_buffer_vec(const AcMesh& mesh,
|
|||||||
min(model_reduce_scal(mesh, RTYPE_MIN, vertex_buffer_b),
|
min(model_reduce_scal(mesh, RTYPE_MIN, vertex_buffer_b),
|
||||||
model_reduce_scal(mesh, RTYPE_MIN, vertex_buffer_c))));
|
model_reduce_scal(mesh, RTYPE_MIN, vertex_buffer_c))));
|
||||||
*/
|
*/
|
||||||
const float maxx = float(
|
const float maxx = float(max(
|
||||||
max(acReduceScal(RTYPE_MAX, vertex_buffer_a),
|
acReduceScal(RTYPE_MAX, vertex_buffer_a),
|
||||||
max(acReduceScal(RTYPE_MAX, vertex_buffer_b),
|
max(acReduceScal(RTYPE_MAX, vertex_buffer_b), acReduceScal(RTYPE_MAX, vertex_buffer_c))));
|
||||||
acReduceScal(RTYPE_MAX, vertex_buffer_c))));
|
const float minn = float(min(
|
||||||
const float minn = float(
|
acReduceScal(RTYPE_MIN, vertex_buffer_a),
|
||||||
min(acReduceScal(RTYPE_MIN, vertex_buffer_a),
|
min(acReduceScal(RTYPE_MIN, vertex_buffer_b), acReduceScal(RTYPE_MIN, vertex_buffer_c))));
|
||||||
min(acReduceScal(RTYPE_MIN, vertex_buffer_b),
|
|
||||||
acReduceScal(RTYPE_MIN, vertex_buffer_c))));
|
|
||||||
const float range = fabsf(maxx - minn);
|
const float range = fabsf(maxx - minn);
|
||||||
const float mid = maxx - .5f * range;
|
const float mid = maxx - .5f * range;
|
||||||
|
|
||||||
const int k = k_slice; //mesh.info.int_params[AC_mz] / 2;
|
const int k = k_slice; // mesh.info.int_params[AC_mz] / 2;
|
||||||
for (int j = 0; j < mesh.info.int_params[AC_my]; ++j) {
|
for (int j = 0; j < mesh.info.int_params[AC_my]; ++j) {
|
||||||
for (int i = 0; i < mesh.info.int_params[AC_mx]; ++i) {
|
for (int i = 0; i < mesh.info.int_params[AC_mx]; ++i) {
|
||||||
ERRCHK(i < datasurface_width && j < datasurface_height);
|
ERRCHK(i < datasurface_width && j < datasurface_height);
|
||||||
|
|
||||||
const int idx = AC_VTXBUF_IDX(i, j, k, mesh.info);
|
const int idx = AC_VTXBUF_IDX(i, j, k, mesh.info);
|
||||||
const uint8_t r = (uint8_t)(
|
const uint8_t r = (uint8_t)(
|
||||||
255.f *
|
255.f * (fabsf(float(mesh.vertex_buffer[vertex_buffer_a][idx]) - mid)) / range);
|
||||||
(fabsf(float(mesh.vertex_buffer[vertex_buffer_a][idx]) - mid)) /
|
|
||||||
range);
|
|
||||||
const uint8_t g = (uint8_t)(
|
const uint8_t g = (uint8_t)(
|
||||||
255.f *
|
255.f * (fabsf(float(mesh.vertex_buffer[vertex_buffer_b][idx]) - mid)) / range);
|
||||||
(fabsf(float(mesh.vertex_buffer[vertex_buffer_b][idx]) - mid)) /
|
|
||||||
range);
|
|
||||||
const uint8_t b = (uint8_t)(
|
const uint8_t b = (uint8_t)(
|
||||||
255.f *
|
255.f * (fabsf(float(mesh.vertex_buffer[vertex_buffer_c][idx]) - mid)) / range);
|
||||||
(fabsf(float(mesh.vertex_buffer[vertex_buffer_c][idx]) - mid)) /
|
const uint32_t mapped_color = SDL_MapRGBA(surfaces[vertex_buffer_a]->format, r, g, b,
|
||||||
range);
|
255);
|
||||||
const uint32_t mapped_color = SDL_MapRGBA(
|
|
||||||
surfaces[vertex_buffer_a]->format, r, g, b, 255);
|
|
||||||
set_pixel(i, j, mapped_color, surfaces[vertex_buffer_a]);
|
set_pixel(i, j, mapped_color, surfaces[vertex_buffer_a]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const float2 pos = (float2){xoffset, yoffset};
|
const float2 pos = (float2){xoffset, yoffset};
|
||||||
const float2 bbox = (float2){.5f * datasurface_width,
|
const float2 bbox = (float2){.5f * datasurface_width, .5f * datasurface_height};
|
||||||
.5f * datasurface_height};
|
|
||||||
const float2 wsize = (float2){float(window_width), float(window_height)};
|
const float2 wsize = (float2){float(window_width), float(window_height)};
|
||||||
const vec4 rectf = project_ortho(pos, bbox, wsize);
|
const vec4 rectf = project_ortho(pos, bbox, wsize);
|
||||||
SDL_Rect rect = (SDL_Rect){
|
SDL_Rect rect = (SDL_Rect){int(rectf.x - rectf.w), int(wsize.y - rectf.y - rectf.h),
|
||||||
int(rectf.x - rectf.w), int(wsize.y - rectf.y - rectf.h),
|
|
||||||
int(ceil(2.f * rectf.w)), int(ceil(2.f * rectf.h))};
|
int(ceil(2.f * rectf.w)), int(ceil(2.f * rectf.h))};
|
||||||
|
|
||||||
SDL_Texture* tex = SDL_CreateTextureFromSurface(renderer,
|
SDL_Texture* tex = SDL_CreateTextureFromSurface(renderer, surfaces[vertex_buffer_a]);
|
||||||
surfaces[vertex_buffer_a]);
|
|
||||||
SDL_RenderCopy(renderer, tex, NULL, &rect);
|
SDL_RenderCopy(renderer, tex, NULL, &rect);
|
||||||
SDL_DestroyTexture(tex);
|
SDL_DestroyTexture(tex);
|
||||||
|
|
||||||
@@ -272,13 +250,11 @@ renderer_draw(const AcMesh& mesh)
|
|||||||
{
|
{
|
||||||
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i)
|
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i)
|
||||||
draw_vertex_buffer(mesh, VertexBufferHandle(i), i);
|
draw_vertex_buffer(mesh, VertexBufferHandle(i), i);
|
||||||
draw_vertex_buffer_vec(mesh, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ,
|
draw_vertex_buffer_vec(mesh, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, NUM_VTXBUF_HANDLES);
|
||||||
NUM_VTXBUF_HANDLES);
|
|
||||||
|
|
||||||
// Drawing done, present
|
// Drawing done, present
|
||||||
SDL_RenderPresent(renderer);
|
SDL_RenderPresent(renderer);
|
||||||
SDL_SetRenderDrawColor(renderer, color_bg.r, color_bg.g, color_bg.b,
|
SDL_SetRenderDrawColor(renderer, color_bg.r, color_bg.g, color_bg.b, color_bg.a);
|
||||||
color_bg.a);
|
|
||||||
SDL_RenderClear(renderer);
|
SDL_RenderClear(renderer);
|
||||||
|
|
||||||
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
|
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
|
||||||
@@ -404,13 +380,13 @@ run_renderer(void)
|
|||||||
|
|
||||||
/* Step the simulation */
|
/* Step the simulation */
|
||||||
#if 1
|
#if 1
|
||||||
const AcReal umax = acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY,
|
const AcReal umax = acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
|
||||||
VTXBUF_UUZ);
|
|
||||||
const AcReal dt = host_timestep(umax, mesh_info);
|
const AcReal dt = host_timestep(umax, mesh_info);
|
||||||
acIntegrate(dt);
|
acIntegrate(dt);
|
||||||
#else
|
#else
|
||||||
ModelMesh* model_mesh = modelmesh_create(mesh->info);
|
ModelMesh* model_mesh = modelmesh_create(mesh->info);
|
||||||
const AcReal umax = AcReal(model_reduce_vec(*model_mesh, RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ));
|
const AcReal umax = AcReal(
|
||||||
|
model_reduce_vec(*model_mesh, RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ));
|
||||||
const AcReal dt = host_timestep(umax, mesh_info);
|
const AcReal dt = host_timestep(umax, mesh_info);
|
||||||
acmesh_to_modelmesh(*mesh, model_mesh);
|
acmesh_to_modelmesh(*mesh, model_mesh);
|
||||||
model_rk3(dt, model_mesh);
|
model_rk3(dt, model_mesh);
|
||||||
@@ -425,7 +401,7 @@ run_renderer(void)
|
|||||||
/* Render */
|
/* Render */
|
||||||
const float timer_diff_sec = timer_diff_nsec(frame_timer) / 1e9f;
|
const float timer_diff_sec = timer_diff_nsec(frame_timer) / 1e9f;
|
||||||
if (timer_diff_sec >= desired_frame_time) {
|
if (timer_diff_sec >= desired_frame_time) {
|
||||||
//acStore(mesh);
|
// acStore(mesh);
|
||||||
const int num_vertices = mesh->info.int_params[AC_mxy];
|
const int num_vertices = mesh->info.int_params[AC_mxy];
|
||||||
const int3 dst = (int3){0, 0, k_slice};
|
const int3 dst = (int3){0, 0, k_slice};
|
||||||
acStoreWithOffset(dst, num_vertices, mesh);
|
acStoreWithOffset(dst, num_vertices, mesh);
|
||||||
|
@@ -60,23 +60,23 @@ print_diagnostics(const AcMesh& mesh, const int& step, const AcReal& dt)
|
|||||||
}
|
}
|
||||||
*/
|
*/
|
||||||
|
|
||||||
//Write all setting info into a separate ascii file. This is done to guarantee
|
// Write all setting info into a separate ascii file. This is done to guarantee
|
||||||
//that we have the data specifi information in the thing, even though in
|
// that we have the data specifi information in the thing, even though in
|
||||||
//principle these things are in the astaroth.conf.
|
// principle these things are in the astaroth.conf.
|
||||||
static inline
|
static inline void
|
||||||
void write_mesh_info(const AcMeshInfo* config)
|
write_mesh_info(const AcMeshInfo* config)
|
||||||
{
|
{
|
||||||
|
|
||||||
FILE* infotxt;
|
FILE* infotxt;
|
||||||
|
|
||||||
infotxt = fopen("purge.sh","w");
|
infotxt = fopen("purge.sh", "w");
|
||||||
fprintf(infotxt, "#!/bin/bash\n");
|
fprintf(infotxt, "#!/bin/bash\n");
|
||||||
fprintf(infotxt, "rm *.list *.mesh *.ts purge.sh\n");
|
fprintf(infotxt, "rm *.list *.mesh *.ts purge.sh\n");
|
||||||
fclose(infotxt);
|
fclose(infotxt);
|
||||||
|
|
||||||
infotxt = fopen("mesh_info.list","w");
|
infotxt = fopen("mesh_info.list", "w");
|
||||||
|
|
||||||
//Total grid dimensions
|
// Total grid dimensions
|
||||||
fprintf(infotxt, "int AC_mx %i \n", config->int_params[AC_mx]);
|
fprintf(infotxt, "int AC_mx %i \n", config->int_params[AC_mx]);
|
||||||
fprintf(infotxt, "int AC_my %i \n", config->int_params[AC_my]);
|
fprintf(infotxt, "int AC_my %i \n", config->int_params[AC_my]);
|
||||||
fprintf(infotxt, "int AC_mz %i \n", config->int_params[AC_mz]);
|
fprintf(infotxt, "int AC_mz %i \n", config->int_params[AC_mz]);
|
||||||
@@ -96,28 +96,26 @@ void write_mesh_info(const AcMeshInfo* config)
|
|||||||
fprintf(infotxt, "real AC_inv_dsx %e \n", (double)config->real_params[AC_inv_dsx]);
|
fprintf(infotxt, "real AC_inv_dsx %e \n", (double)config->real_params[AC_inv_dsx]);
|
||||||
fprintf(infotxt, "real AC_inv_dsy %e \n", (double)config->real_params[AC_inv_dsy]);
|
fprintf(infotxt, "real AC_inv_dsy %e \n", (double)config->real_params[AC_inv_dsy]);
|
||||||
fprintf(infotxt, "real AC_inv_dsz %e \n", (double)config->real_params[AC_inv_dsz]);
|
fprintf(infotxt, "real AC_inv_dsz %e \n", (double)config->real_params[AC_inv_dsz]);
|
||||||
fprintf(infotxt, "real AC_dsmin %e \n", (double)config->real_params[AC_dsmin ]);
|
fprintf(infotxt, "real AC_dsmin %e \n", (double)config->real_params[AC_dsmin]);
|
||||||
|
|
||||||
/* Additional helper params */
|
/* Additional helper params */
|
||||||
// Int helpers
|
// Int helpers
|
||||||
fprintf(infotxt, "int AC_mxy %i \n", config->int_params[AC_mxy ]);
|
fprintf(infotxt, "int AC_mxy %i \n", config->int_params[AC_mxy]);
|
||||||
fprintf(infotxt, "int AC_nxy %i \n", config->int_params[AC_nxy ]);
|
fprintf(infotxt, "int AC_nxy %i \n", config->int_params[AC_nxy]);
|
||||||
fprintf(infotxt, "int AC_nxyz %i \n", config->int_params[AC_nxyz]);
|
fprintf(infotxt, "int AC_nxyz %i \n", config->int_params[AC_nxyz]);
|
||||||
|
|
||||||
// Real helpers
|
// Real helpers
|
||||||
fprintf(infotxt, "real AC_cs2_sound %e \n", (double)config->real_params[AC_cs2_sound]);
|
fprintf(infotxt, "real AC_cs2_sound %e \n", (double)config->real_params[AC_cs2_sound]);
|
||||||
fprintf(infotxt, "real AC_cv_sound %e \n", (double)config->real_params[AC_cv_sound ]);
|
fprintf(infotxt, "real AC_cv_sound %e \n", (double)config->real_params[AC_cv_sound]);
|
||||||
|
|
||||||
fclose(infotxt);
|
fclose(infotxt);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// This funtion writes a run state into a set of C binaries. For the sake of
|
||||||
//This funtion writes a run state into a set of C binaries. For the sake of
|
// accuracy, all floating point numbers are to be saved in long double precision
|
||||||
//accuracy, all floating point numbers are to be saved in long double precision
|
// regardless of the choise of accuracy during runtime.
|
||||||
//regardless of the choise of accuracy during runtime.
|
|
||||||
static inline void
|
static inline void
|
||||||
save_mesh(const AcMesh &save_mesh, const int step,
|
save_mesh(const AcMesh& save_mesh, const int step, const AcReal t_step)
|
||||||
const AcReal t_step)
|
|
||||||
{
|
{
|
||||||
FILE* save_ptr;
|
FILE* save_ptr;
|
||||||
|
|
||||||
@@ -128,7 +126,7 @@ save_mesh(const AcMesh &save_mesh, const int step,
|
|||||||
char cstep[10];
|
char cstep[10];
|
||||||
char bin_filename[80] = "\0";
|
char bin_filename[80] = "\0";
|
||||||
|
|
||||||
//sprintf(bin_filename, "");
|
// sprintf(bin_filename, "");
|
||||||
|
|
||||||
sprintf(cstep, "%d", step);
|
sprintf(cstep, "%d", step);
|
||||||
|
|
||||||
@@ -139,28 +137,25 @@ save_mesh(const AcMesh &save_mesh, const int step,
|
|||||||
|
|
||||||
printf("Savefile %s \n", bin_filename);
|
printf("Savefile %s \n", bin_filename);
|
||||||
|
|
||||||
save_ptr = fopen(bin_filename,"wb");
|
save_ptr = fopen(bin_filename, "wb");
|
||||||
|
|
||||||
//Start file with time stamp
|
// Start file with time stamp
|
||||||
long double write_long_buf = (long double) t_step;
|
long double write_long_buf = (long double)t_step;
|
||||||
fwrite(&write_long_buf, sizeof(long double), 1, save_ptr);
|
fwrite(&write_long_buf, sizeof(long double), 1, save_ptr);
|
||||||
//Grid data
|
// Grid data
|
||||||
for (size_t i = 0; i < n; ++i) {
|
for (size_t i = 0; i < n; ++i) {
|
||||||
const AcReal point_val = save_mesh.vertex_buffer[VertexBufferHandle(w)][i];
|
const AcReal point_val = save_mesh.vertex_buffer[VertexBufferHandle(w)][i];
|
||||||
long double write_long_buf = (long double) point_val;
|
long double write_long_buf = (long double)point_val;
|
||||||
fwrite(&write_long_buf, sizeof(long double), 1, save_ptr);
|
fwrite(&write_long_buf, sizeof(long double), 1, save_ptr);
|
||||||
}
|
}
|
||||||
fclose(save_ptr);
|
fclose(save_ptr);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// This function prints out the diagnostic values to std.out and also saves and
|
// This function prints out the diagnostic values to std.out and also saves and
|
||||||
// appends an ascii file to contain all the result.
|
// appends an ascii file to contain all the result.
|
||||||
static inline void
|
static inline void
|
||||||
print_diagnostics(const int step, const AcReal dt, const AcReal t_step, FILE *diag_file)
|
print_diagnostics(const int step, const AcReal dt, const AcReal t_step, FILE* diag_file)
|
||||||
{
|
{
|
||||||
|
|
||||||
AcReal buf_rms, buf_max, buf_min;
|
AcReal buf_rms, buf_max, buf_min;
|
||||||
@@ -174,11 +169,10 @@ print_diagnostics(const int step, const AcReal dt, const AcReal t_step, FILE *di
|
|||||||
// MV: The ordering in the earlier version was wrong in terms of variable
|
// MV: The ordering in the earlier version was wrong in terms of variable
|
||||||
// MV: name and its diagnostics.
|
// MV: name and its diagnostics.
|
||||||
printf("Step %d, t_step %.3e, dt %e s\n", step, double(t_step), double(dt));
|
printf("Step %d, t_step %.3e, dt %e s\n", step, double(t_step), double(dt));
|
||||||
printf(" %*s: min %.3e,\trms %.3e,\tmax %.3e\n", max_name_width, "uu total",
|
printf(" %*s: min %.3e,\trms %.3e,\tmax %.3e\n", max_name_width, "uu total", double(buf_min),
|
||||||
double(buf_min), double(buf_rms), double(buf_max));
|
double(buf_rms), double(buf_max));
|
||||||
fprintf(diag_file, "%d %e %e %e %e %e ", step, double(t_step), double(dt),
|
fprintf(diag_file, "%d %e %e %e %e %e ", step, double(t_step), double(dt), double(buf_min),
|
||||||
double(buf_min), double(buf_rms), double(buf_max));
|
double(buf_rms), double(buf_max));
|
||||||
|
|
||||||
|
|
||||||
// Calculate rms, min and max from the variables as scalars
|
// Calculate rms, min and max from the variables as scalars
|
||||||
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
|
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
|
||||||
@@ -194,11 +188,11 @@ print_diagnostics(const int step, const AcReal dt, const AcReal t_step, FILE *di
|
|||||||
fprintf(diag_file, "\n");
|
fprintf(diag_file, "\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
MV NOTE: At the moment I have no clear idea how to calculate magnetic
|
MV NOTE: At the moment I have no clear idea how to calculate magnetic
|
||||||
diagnostic variables from grid. Vector potential measures have a limited
|
diagnostic variables from grid. Vector potential measures have a limited
|
||||||
value. TODO: Smart way to get brms, bmin and bmax.
|
value. TODO: Smart way to get brms, bmin and bmax.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
int
|
int
|
||||||
run_simulation(void)
|
run_simulation(void)
|
||||||
@@ -213,8 +207,7 @@ run_simulation(void)
|
|||||||
acInit(mesh_info);
|
acInit(mesh_info);
|
||||||
acLoad(*mesh);
|
acLoad(*mesh);
|
||||||
|
|
||||||
|
FILE* diag_file;
|
||||||
FILE *diag_file;
|
|
||||||
diag_file = fopen("timeseries.ts", "a");
|
diag_file = fopen("timeseries.ts", "a");
|
||||||
// TODO Get time from earlier state.
|
// TODO Get time from earlier state.
|
||||||
AcReal t_step = 0.0;
|
AcReal t_step = 0.0;
|
||||||
@@ -222,7 +215,8 @@ run_simulation(void)
|
|||||||
// Generate the title row.
|
// Generate the title row.
|
||||||
fprintf(diag_file, "step t_step dt uu_total_min uu_total_rms uu_total_max ");
|
fprintf(diag_file, "step t_step dt uu_total_min uu_total_rms uu_total_max ");
|
||||||
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
|
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
|
||||||
fprintf(diag_file, "%s_min %s_rms %s_max ", vtxbuf_names[i], vtxbuf_names[i], vtxbuf_names[i]);
|
fprintf(diag_file, "%s_min %s_rms %s_max ", vtxbuf_names[i], vtxbuf_names[i],
|
||||||
|
vtxbuf_names[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
fprintf(diag_file, "\n");
|
fprintf(diag_file, "\n");
|
||||||
@@ -236,15 +230,14 @@ run_simulation(void)
|
|||||||
|
|
||||||
const int max_steps = mesh_info.int_params[AC_max_steps];
|
const int max_steps = mesh_info.int_params[AC_max_steps];
|
||||||
const int save_steps = mesh_info.int_params[AC_save_steps];
|
const int save_steps = mesh_info.int_params[AC_save_steps];
|
||||||
const int bin_save_steps = mesh_info.int_params[AC_bin_steps]; //TODO Get from mesh_info
|
const int bin_save_steps = mesh_info.int_params[AC_bin_steps]; // TODO Get from mesh_info
|
||||||
|
|
||||||
AcReal bin_save_t = mesh_info.real_params[AC_bin_save_t];
|
AcReal bin_save_t = mesh_info.real_params[AC_bin_save_t];
|
||||||
AcReal bin_crit_t = bin_save_t;
|
AcReal bin_crit_t = bin_save_t;
|
||||||
|
|
||||||
/* Step the simulation */
|
/* Step the simulation */
|
||||||
for (int i = 1; i < max_steps; ++i) {
|
for (int i = 1; i < max_steps; ++i) {
|
||||||
const AcReal umax = acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY,
|
const AcReal umax = acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
|
||||||
VTXBUF_UUZ);
|
|
||||||
const AcReal dt = host_timestep(umax, mesh_info);
|
const AcReal dt = host_timestep(umax, mesh_info);
|
||||||
acIntegrate(dt);
|
acIntegrate(dt);
|
||||||
|
|
||||||
@@ -266,7 +259,6 @@ run_simulation(void)
|
|||||||
which can be very useful when observing behaviour of turbulent
|
which can be very useful when observing behaviour of turbulent
|
||||||
simulations. (TODO)
|
simulations. (TODO)
|
||||||
*/
|
*/
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Save the simulation state and print diagnostics */
|
/* Save the simulation state and print diagnostics */
|
||||||
@@ -300,9 +292,7 @@ run_simulation(void)
|
|||||||
save_mesh(*mesh, i, t_step);
|
save_mesh(*mesh, i, t_step);
|
||||||
|
|
||||||
bin_crit_t += bin_save_t;
|
bin_crit_t += bin_save_t;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
//////Save the final snapshot
|
//////Save the final snapshot
|
||||||
@@ -318,25 +308,3 @@ run_simulation(void)
|
|||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@@ -52,8 +52,7 @@ timer_diff_nsec(const Timer start)
|
|||||||
{
|
{
|
||||||
Timer end;
|
Timer end;
|
||||||
timer_reset(&end);
|
timer_reset(&end);
|
||||||
const long diff = (end.tv_sec - start.tv_sec) * 1000000000l +
|
const long diff = (end.tv_sec - start.tv_sec) * 1000000000l + (end.tv_nsec - start.tv_nsec);
|
||||||
(end.tv_nsec - start.tv_nsec);
|
|
||||||
return diff;
|
return diff;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user