Autoformatted all CUDA/C/C++ code
This commit is contained in:
@@ -28,33 +28,24 @@
|
||||
#include "errchk.h"
|
||||
|
||||
#include "device.cuh"
|
||||
#include "math_utils.h" // sum for reductions
|
||||
#include "math_utils.h" // sum for reductions
|
||||
#include "standalone/config_loader.h" // update_config
|
||||
|
||||
const char* intparam_names[] = {AC_FOR_INT_PARAM_TYPES(AC_GEN_STR)};
|
||||
const char* realparam_names[] = {AC_FOR_REAL_PARAM_TYPES(AC_GEN_STR)};
|
||||
const char* vtxbuf_names[] = {AC_FOR_VTXBUF_HANDLES(AC_GEN_STR)};
|
||||
const char* intparam_names[] = {AC_FOR_INT_PARAM_TYPES(AC_GEN_STR)};
|
||||
const char* realparam_names[] = {AC_FOR_REAL_PARAM_TYPES(AC_GEN_STR)};
|
||||
const char* vtxbuf_names[] = {AC_FOR_VTXBUF_HANDLES(AC_GEN_STR)};
|
||||
|
||||
|
||||
static const int MAX_NUM_DEVICES = 32;
|
||||
static int num_devices = 1;
|
||||
static const int MAX_NUM_DEVICES = 32;
|
||||
static int num_devices = 1;
|
||||
static Device devices[MAX_NUM_DEVICES] = {};
|
||||
|
||||
static Grid
|
||||
createGrid(const AcMeshInfo& config)
|
||||
{
|
||||
Grid grid;
|
||||
grid.m = (int3) {
|
||||
config.int_params[AC_mx],
|
||||
config.int_params[AC_my],
|
||||
config.int_params[AC_mz]
|
||||
};
|
||||
|
||||
grid.n = (int3) {
|
||||
config.int_params[AC_nx],
|
||||
config.int_params[AC_ny],
|
||||
config.int_params[AC_nz]
|
||||
};
|
||||
grid.m = (int3){config.int_params[AC_mx], config.int_params[AC_my], config.int_params[AC_mz]};
|
||||
grid.n = (int3){config.int_params[AC_nx], config.int_params[AC_ny], config.int_params[AC_nz]};
|
||||
|
||||
return grid;
|
||||
}
|
||||
@@ -71,8 +62,7 @@ gridIdx(const Grid& grid, const int i, const int j, const int k)
|
||||
static int3
|
||||
gridIdx3d(const Grid& grid, const int idx)
|
||||
{
|
||||
return (int3){idx % grid.m.x,
|
||||
(idx % (grid.m.x * grid.m.y)) / grid.m.x,
|
||||
return (int3){idx % grid.m.x, (idx % (grid.m.x * grid.m.y)) / grid.m.x,
|
||||
idx / (grid.m.x * grid.m.y)};
|
||||
}
|
||||
|
||||
@@ -119,10 +109,12 @@ acInit(const AcMeshInfo& config)
|
||||
ERRCHK_ALWAYS(subgrid.n.y >= STENCIL_ORDER);
|
||||
ERRCHK_ALWAYS(subgrid.n.z >= STENCIL_ORDER);
|
||||
|
||||
printf("Grid m "); printInt3(grid.m); printf("\n");
|
||||
printf("Grid n "); printInt3(grid.n); printf("\n");
|
||||
// clang-format off
|
||||
printf("Grid m "); printInt3(grid.m); printf("\n");
|
||||
printf("Grid n "); printInt3(grid.n); printf("\n");
|
||||
printf("Subrid m "); printInt3(subgrid.m); printf("\n");
|
||||
printf("Subrid n "); printInt3(subgrid.n); printf("\n");
|
||||
// clang-format on
|
||||
|
||||
// Initialize the devices
|
||||
for (int i = 0; i < num_devices; ++i) {
|
||||
@@ -202,8 +194,10 @@ acLoadWithOffset(const AcMesh& host_mesh, const int3& src, const int num_vertice
|
||||
*/
|
||||
if (db.z >= da.z) {
|
||||
const int copy_cells = gridIdxx(subgrid, db) - gridIdxx(subgrid, da);
|
||||
const int3 da_local = (int3){da.x, da.y, da.z - i * grid.n.z / num_devices}; // DECOMPOSITION OFFSET HERE
|
||||
// printf("\t\tcopy %d cells to local index ", copy_cells); printInt3(da_local); printf("\n");
|
||||
const int3 da_local = (int3){
|
||||
da.x, da.y, da.z - i * grid.n.z / num_devices}; // DECOMPOSITION OFFSET HERE
|
||||
// printf("\t\tcopy %d cells to local index ", copy_cells); printInt3(da_local);
|
||||
// printf("\n");
|
||||
copyMeshToDevice(devices[i], STREAM_PRIMARY, host_mesh, da, da_local, copy_cells);
|
||||
}
|
||||
printf("\n");
|
||||
@@ -236,8 +230,10 @@ acStoreWithOffset(const int3& src, const int num_vertices, AcMesh* host_mesh)
|
||||
*/
|
||||
if (db.z >= da.z) {
|
||||
const int copy_cells = gridIdxx(subgrid, db) - gridIdxx(subgrid, da);
|
||||
const int3 da_local = (int3){da.x, da.y, da.z - i * grid.n.z / num_devices}; // DECOMPOSITION OFFSET HERE
|
||||
// printf("\t\tcopy %d cells from local index ", copy_cells); printInt3(da_local); printf("\n");
|
||||
const int3 da_local = (int3){
|
||||
da.x, da.y, da.z - i * grid.n.z / num_devices}; // DECOMPOSITION OFFSET HERE
|
||||
// printf("\t\tcopy %d cells from local index ", copy_cells); printInt3(da_local);
|
||||
// printf("\n");
|
||||
copyMeshToHost(devices[i], STREAM_PRIMARY, da_local, da, copy_cells, host_mesh);
|
||||
}
|
||||
printf("\n");
|
||||
@@ -262,10 +258,9 @@ acStore(AcMesh* host_mesh)
|
||||
AcResult
|
||||
acIntegrateStep(const int& isubstep, const AcReal& dt)
|
||||
{
|
||||
const int3 start = (int3){STENCIL_ORDER/2, STENCIL_ORDER/2, STENCIL_ORDER/2};
|
||||
const int3 end = (int3){STENCIL_ORDER/2 + subgrid.n.x,
|
||||
STENCIL_ORDER/2 + subgrid.n.y,
|
||||
STENCIL_ORDER/2 + subgrid.n.z};
|
||||
const int3 start = (int3){STENCIL_ORDER / 2, STENCIL_ORDER / 2, STENCIL_ORDER / 2};
|
||||
const int3 end = (int3){STENCIL_ORDER / 2 + subgrid.n.x, STENCIL_ORDER / 2 + subgrid.n.y,
|
||||
STENCIL_ORDER / 2 + subgrid.n.z};
|
||||
for (int i = 0; i < num_devices; ++i) {
|
||||
rkStep(devices[i], STREAM_PRIMARY, isubstep, start, end, dt);
|
||||
}
|
||||
@@ -278,121 +273,125 @@ acBoundcondStep(void)
|
||||
{
|
||||
acSynchronize();
|
||||
if (num_devices == 1) {
|
||||
boundcondStep(devices[0], STREAM_PRIMARY,
|
||||
(int3){0, 0, 0}, (int3){subgrid.m.x, subgrid.m.y, subgrid.m.z});
|
||||
} else {
|
||||
boundcondStep(devices[0], STREAM_PRIMARY, (int3){0, 0, 0},
|
||||
(int3){subgrid.m.x, subgrid.m.y, subgrid.m.z});
|
||||
}
|
||||
else {
|
||||
// Local boundary conditions
|
||||
for (int i = 0; i < num_devices; ++i) {
|
||||
const int3 d0 = (int3){0, 0, STENCIL_ORDER/2}; // DECOMPOSITION OFFSET HERE
|
||||
const int3 d0 = (int3){0, 0, STENCIL_ORDER / 2}; // DECOMPOSITION OFFSET HERE
|
||||
const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.n.z};
|
||||
boundcondStep(devices[i], STREAM_PRIMARY, d0, d1);
|
||||
}
|
||||
|
||||
/*
|
||||
// ===MIIKKANOTE START==========================================
|
||||
%JP: The old way for computing boundary conditions conflicts with the
|
||||
way we have to do things with multiple GPUs.
|
||||
/*
|
||||
// ===MIIKKANOTE START==========================================
|
||||
%JP: The old way for computing boundary conditions conflicts with the
|
||||
way we have to do things with multiple GPUs.
|
||||
|
||||
The older approach relied on unified memory, which represented the whole
|
||||
memory area as one huge mesh instead of several smaller ones. However, unified memory
|
||||
in its current state is more meant for quick prototyping when performance is not an issue.
|
||||
Getting the CUDA driver to migrate data intelligently across GPUs is much more difficult than
|
||||
when managing the memory explicitly.
|
||||
The older approach relied on unified memory, which represented the whole
|
||||
memory area as one huge mesh instead of several smaller ones. However, unified memory
|
||||
in its current state is more meant for quick prototyping when performance is not an issue.
|
||||
Getting the CUDA driver to migrate data intelligently across GPUs is much more difficult
|
||||
than when managing the memory explicitly.
|
||||
|
||||
In this new approach, I have simplified the multi- and single-GPU layers significantly.
|
||||
Quick rundown:
|
||||
New struct: Grid. There are two global variables, "grid" and "subgrid", which
|
||||
contain the extents of the whole simulation domain and the decomposed grids, respectively.
|
||||
To simplify thing, we require that each GPU is assigned the same amount of work,
|
||||
therefore each GPU in the node is assigned and "subgrid.m" -sized block of data
|
||||
to work with.
|
||||
In this new approach, I have simplified the multi- and single-GPU layers significantly.
|
||||
Quick rundown:
|
||||
New struct: Grid. There are two global variables, "grid" and "subgrid", which
|
||||
contain the extents of the whole simulation domain and the decomposed grids,
|
||||
respectively. To simplify thing, we require that each GPU is assigned the same amount of
|
||||
work, therefore each GPU in the node is assigned and "subgrid.m" -sized block of data to
|
||||
work with.
|
||||
|
||||
The whole simulation domain is decomposed with respect to the z dimension.
|
||||
For example, if the grid contains (nx, ny, nz) vertices, then the subgrids
|
||||
contain (nx, ny, nz / num_devices) vertices.
|
||||
The whole simulation domain is decomposed with respect to the z dimension.
|
||||
For example, if the grid contains (nx, ny, nz) vertices, then the subgrids
|
||||
contain (nx, ny, nz / num_devices) vertices.
|
||||
|
||||
An local index (i, j, k) in some subgrid can be mapped to the global grid with
|
||||
global idx = (i, j, k + device_id * subgrid.n.z)
|
||||
An local index (i, j, k) in some subgrid can be mapped to the global grid with
|
||||
global idx = (i, j, k + device_id * subgrid.n.z)
|
||||
|
||||
Terminology:
|
||||
- Single-GPU function: a function defined on the single-GPU layer (device.cu)
|
||||
Terminology:
|
||||
- Single-GPU function: a function defined on the single-GPU layer (device.cu)
|
||||
|
||||
Changes required to this commented code block:
|
||||
- The thread block dimensions (tpb) are no longer passed to the kernel here but in device.cu
|
||||
instead. Same holds for any complex index calculations. Instead, the local coordinates
|
||||
should be passed as an int3 type without having to consider how the data is actually
|
||||
laid out in device memory
|
||||
- The unified memory buffer no longer exists (d_buffer). Instead, we have an opaque handle
|
||||
of type "Device" which should be passed to single-GPU functions. In this file, all devices
|
||||
are stored in a global array "devices[num_devices]".
|
||||
- Every single-GPU function is executed asynchronously by default such that we
|
||||
can optimize Astaroth by executing memory transactions concurrently with computation.
|
||||
Therefore a StreamType should be passed as a parameter to single-GPU functions.
|
||||
Refresher: CUDA function calls are non-blocking when a stream is explicitly passed
|
||||
as a parameter and commands executing in different streams can be processed
|
||||
in parallel/concurrently.
|
||||
Changes required to this commented code block:
|
||||
- The thread block dimensions (tpb) are no longer passed to the kernel here but in
|
||||
device.cu instead. Same holds for any complex index calculations. Instead, the local
|
||||
coordinates should be passed as an int3 type without having to consider how the data is
|
||||
actually laid out in device memory
|
||||
- The unified memory buffer no longer exists (d_buffer). Instead, we have an opaque
|
||||
handle of type "Device" which should be passed to single-GPU functions. In this file, all
|
||||
devices are stored in a global array "devices[num_devices]".
|
||||
- Every single-GPU function is executed asynchronously by default such that we
|
||||
can optimize Astaroth by executing memory transactions concurrently with
|
||||
computation. Therefore a StreamType should be passed as a parameter to single-GPU functions.
|
||||
Refresher: CUDA function calls are non-blocking when a stream is explicitly passed
|
||||
as a parameter and commands executing in different streams can be processed
|
||||
in parallel/concurrently.
|
||||
|
||||
|
||||
Note on periodic boundaries (might be helpful when implementing other boundary conditions):
|
||||
Note on periodic boundaries (might be helpful when implementing other boundary conditions):
|
||||
|
||||
With multiple GPUs, periodic boundary conditions applied on indices ranging from
|
||||
With multiple GPUs, periodic boundary conditions applied on indices ranging from
|
||||
|
||||
(0, 0, STENCIL_ORDER/2) to (subgrid.m.x, subgrid.m.y, subgrid.m.z - STENCIL_ORDER/2)
|
||||
(0, 0, STENCIL_ORDER/2) to (subgrid.m.x, subgrid.m.y, subgrid.m.z -
|
||||
STENCIL_ORDER/2)
|
||||
|
||||
on a single device are "local", in the sense that they can be computed without having
|
||||
to exchange data with neighboring GPUs. Special care is needed only for transferring
|
||||
the data to the fron and back plates outside this range. In the solution we use here,
|
||||
we solve the local boundaries first, and then just exchange the front and back plates
|
||||
in a "ring", like so
|
||||
device_id
|
||||
(n) <-> 0 <-> 1 <-> ... <-> n <-> (0)
|
||||
on a single device are "local", in the sense that they can be computed without
|
||||
having to exchange data with neighboring GPUs. Special care is needed only for transferring
|
||||
the data to the fron and back plates outside this range. In the solution we use
|
||||
here, we solve the local boundaries first, and then just exchange the front and back plates
|
||||
in a "ring", like so
|
||||
device_id
|
||||
(n) <-> 0 <-> 1 <-> ... <-> n <-> (0)
|
||||
|
||||
|
||||
// ======MIIKKANOTE END==========================================
|
||||
// ======MIIKKANOTE END==========================================
|
||||
|
||||
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MIIKKANOTE: This code block was essentially
|
||||
moved into device.cu, function boundCondStep()
|
||||
In astaroth.cu, we use acBoundcondStep()
|
||||
just to distribute the work and manage
|
||||
communication between GPUs.
|
||||
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MIIKKANOTE: This code block was essentially
|
||||
moved into device.cu, function
|
||||
boundCondStep() In astaroth.cu, we use acBoundcondStep() just to distribute the work and
|
||||
manage communication between GPUs.
|
||||
|
||||
printf("Boundconds best dims (%d, %d, %d) %f ms\n", best_dims.x, best_dims.y, best_dims.z, double(best_time) / NUM_ITERATIONS);
|
||||
printf("Boundconds best dims (%d, %d, %d) %f ms\n", best_dims.x, best_dims.y,
|
||||
best_dims.z, double(best_time) / NUM_ITERATIONS);
|
||||
|
||||
exit(0);
|
||||
#else
|
||||
exit(0);
|
||||
#else
|
||||
|
||||
|
||||
const int depth = (int)ceil(mesh_info.int_params[AC_mz]/(float)num_devices);
|
||||
const int depth = (int)ceil(mesh_info.int_params[AC_mz]/(float)num_devices);
|
||||
|
||||
const int3 start = (int3){0, 0, device_id * depth};
|
||||
const int3 end = (int3){mesh_info.int_params[AC_mx],
|
||||
mesh_info.int_params[AC_my],
|
||||
min((device_id+1) * depth, mesh_info.int_params[AC_mz])};
|
||||
const int3 start = (int3){0, 0, device_id * depth};
|
||||
const int3 end = (int3){mesh_info.int_params[AC_mx],
|
||||
mesh_info.int_params[AC_my],
|
||||
min((device_id+1) * depth, mesh_info.int_params[AC_mz])};
|
||||
|
||||
const dim3 tpb(8,2,8);
|
||||
const dim3 tpb(8,2,8);
|
||||
|
||||
// TODO uses the default stream currently
|
||||
if (mesh_info.int_params[AC_bc_type] == 666) { // TODO MAKE A BETTER SWITCH
|
||||
wedge_boundconds(0, tpb, start, end, d_buffer);
|
||||
} else {
|
||||
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i)
|
||||
periodic_boundconds(0, tpb, start, end, d_buffer.in[i]);
|
||||
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
*/
|
||||
// TODO uses the default stream currently
|
||||
if (mesh_info.int_params[AC_bc_type] == 666) { // TODO MAKE A BETTER SWITCH
|
||||
wedge_boundconds(0, tpb, start, end, d_buffer);
|
||||
} else {
|
||||
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i)
|
||||
periodic_boundconds(0, tpb, start, end, d_buffer.in[i]);
|
||||
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
*/
|
||||
// Exchange halos
|
||||
for (int i = 0; i < num_devices; ++i) {
|
||||
const int num_vertices = subgrid.m.x * subgrid.m.y * STENCIL_ORDER/2;
|
||||
const int num_vertices = subgrid.m.x * subgrid.m.y * STENCIL_ORDER / 2;
|
||||
// ...|ooooxxx|... -> xxx|ooooooo|...
|
||||
{
|
||||
const int3 src = (int3) {0, 0, subgrid.n.z};
|
||||
const int3 dst = (int3) {0, 0, 0};
|
||||
copyMeshDeviceToDevice(devices[i], STREAM_PRIMARY, src, devices[(i+1) % num_devices], dst, num_vertices);
|
||||
const int3 src = (int3){0, 0, subgrid.n.z};
|
||||
const int3 dst = (int3){0, 0, 0};
|
||||
copyMeshDeviceToDevice(devices[i], STREAM_PRIMARY, src,
|
||||
devices[(i + 1) % num_devices], dst, num_vertices);
|
||||
}
|
||||
// ...|ooooooo|xxx <- ...|xxxoooo|...
|
||||
{
|
||||
const int3 src = (int3) {0, 0, STENCIL_ORDER/2};
|
||||
const int3 dst = (int3) {0, 0, STENCIL_ORDER/2 + subgrid.n.z};
|
||||
copyMeshDeviceToDevice(devices[(i+1) % num_devices], STREAM_PRIMARY, src, devices[i], dst, num_vertices);
|
||||
const int3 src = (int3){0, 0, STENCIL_ORDER / 2};
|
||||
const int3 dst = (int3){0, 0, STENCIL_ORDER / 2 + subgrid.n.z};
|
||||
copyMeshDeviceToDevice(devices[(i + 1) % num_devices], STREAM_PRIMARY, src,
|
||||
devices[i], dst, num_vertices);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -427,26 +426,28 @@ simple_final_reduce_scal(const ReductionType& rtype, const AcReal* results, cons
|
||||
for (int i = 1; i < n; ++i) {
|
||||
if (rtype == RTYPE_MAX) {
|
||||
res = max(res, results[i]);
|
||||
} else if (rtype == RTYPE_MIN) {
|
||||
}
|
||||
else if (rtype == RTYPE_MIN) {
|
||||
res = min(res, results[i]);
|
||||
} else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
|
||||
}
|
||||
else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
|
||||
res = sum(res, results[i]);
|
||||
} else {
|
||||
}
|
||||
else {
|
||||
ERROR("Invalid rtype");
|
||||
}
|
||||
}
|
||||
|
||||
if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
|
||||
const AcReal inv_n = AcReal(1.) / (grid.n.x * grid.n.y * grid.n.z);
|
||||
res = sqrt(inv_n * res);
|
||||
res = sqrt(inv_n * res);
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
AcReal
|
||||
acReduceScal(const ReductionType& rtype,
|
||||
const VertexBufferHandle& vtxbuffer_handle)
|
||||
acReduceScal(const ReductionType& rtype, const VertexBufferHandle& vtxbuffer_handle)
|
||||
{
|
||||
AcReal results[num_devices];
|
||||
for (int i = 0; i < num_devices; ++i) {
|
||||
@@ -457,8 +458,8 @@ acReduceScal(const ReductionType& rtype,
|
||||
}
|
||||
|
||||
AcReal
|
||||
acReduceVec(const ReductionType& rtype, const VertexBufferHandle& a,
|
||||
const VertexBufferHandle& b, const VertexBufferHandle& c)
|
||||
acReduceVec(const ReductionType& rtype, const VertexBufferHandle& a, const VertexBufferHandle& b,
|
||||
const VertexBufferHandle& c)
|
||||
{
|
||||
AcReal results[num_devices];
|
||||
for (int i = 0; i < num_devices; ++i) {
|
||||
|
@@ -36,7 +36,7 @@ typedef struct {
|
||||
__constant__ AcMeshInfo d_mesh_info;
|
||||
__constant__ int3 d_multigpu_offset;
|
||||
__constant__ Grid globalGrid;
|
||||
#define DCONST_INT(X) (d_mesh_info.int_params[X])
|
||||
#define DCONST_INT(X) (d_mesh_info.int_params[X])
|
||||
#define DCONST_REAL(X) (d_mesh_info.real_params[X])
|
||||
#define DEVICE_VTXBUF_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_mx) + (k)*DCONST_INT(AC_mxy))
|
||||
#define DEVICE_1D_COMPDOMAIN_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_nx) + (k)*DCONST_INT(AC_nxy))
|
||||
@@ -76,46 +76,46 @@ printDeviceInfo(const Device device)
|
||||
printf(" Clock rate (GHz): %g\n", props.clockRate / 1e6); // KHz -> GHz
|
||||
printf(" Stream processors: %d\n", props.multiProcessorCount);
|
||||
printf(" SP to DP flops performance ratio: %d:1\n", props.singleToDoublePrecisionPerfRatio);
|
||||
printf(" Compute mode: %d\n", (int)props.computeMode); // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g7eb25f5413a962faad0956d92bae10d0
|
||||
printf(
|
||||
" Compute mode: %d\n",
|
||||
(int)props
|
||||
.computeMode); // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g7eb25f5413a962faad0956d92bae10d0
|
||||
// Memory
|
||||
printf(" Global memory\n");
|
||||
printf(" Memory Clock Rate (MHz): %d\n", props.memoryClockRate / (1000));
|
||||
printf(" Memory Bus Width (bits): %d\n", props.memoryBusWidth);
|
||||
printf(" Peak Memory Bandwidth (GiB/s): %f\n",
|
||||
2 * (props.memoryClockRate * 1e3) * props.memoryBusWidth /
|
||||
(8. * 1024. * 1024. * 1024.));
|
||||
2 * (props.memoryClockRate * 1e3) * props.memoryBusWidth / (8. * 1024. * 1024. * 1024.));
|
||||
printf(" ECC enabled: %d\n", props.ECCEnabled);
|
||||
// Memory usage
|
||||
size_t free_bytes, total_bytes;
|
||||
cudaMemGetInfo(&free_bytes, &total_bytes);
|
||||
const size_t used_bytes = total_bytes - free_bytes;
|
||||
printf(" Total global mem: %.2f GiB\n",
|
||||
props.totalGlobalMem / (1024.0 * 1024 * 1024));
|
||||
printf(" Total global mem: %.2f GiB\n", props.totalGlobalMem / (1024.0 * 1024 * 1024));
|
||||
printf(" Gmem used (GiB): %.2f\n", used_bytes / (1024.0 * 1024 * 1024));
|
||||
printf(" Gmem memory free (GiB): %.2f\n",
|
||||
free_bytes / (1024.0 * 1024 * 1024));
|
||||
printf(" Gmem memory total (GiB): %.2f\n",
|
||||
total_bytes / (1024.0 * 1024 * 1024));
|
||||
printf(" Gmem memory free (GiB): %.2f\n", free_bytes / (1024.0 * 1024 * 1024));
|
||||
printf(" Gmem memory total (GiB): %.2f\n", total_bytes / (1024.0 * 1024 * 1024));
|
||||
printf(" Caches\n");
|
||||
printf(" Local L1 cache supported: %d\n", props.localL1CacheSupported);
|
||||
printf(" Global L1 cache supported: %d\n", props.globalL1CacheSupported);
|
||||
printf(" L2 size: %d KiB\n", props.l2CacheSize / (1024));
|
||||
printf(" Total const mem: %ld KiB\n", props.totalConstMem / (1024));
|
||||
printf(" Shared mem per block: %ld KiB\n",
|
||||
props.sharedMemPerBlock / (1024));
|
||||
printf(" Shared mem per block: %ld KiB\n", props.sharedMemPerBlock / (1024));
|
||||
printf(" Other\n");
|
||||
printf(" Warp size: %d\n", props.warpSize);
|
||||
// printf(" Single to double perf. ratio: %dx\n",
|
||||
// props.singleToDoublePrecisionPerfRatio); //Not supported with older CUDA
|
||||
// versions
|
||||
printf(" Stream priorities supported: %d\n",
|
||||
props.streamPrioritiesSupported);
|
||||
printf(" Stream priorities supported: %d\n", props.streamPrioritiesSupported);
|
||||
printf("--------------------------------------------------\n");
|
||||
|
||||
return AC_SUCCESS;
|
||||
}
|
||||
|
||||
static __global__ void dummy_kernel(void) {}
|
||||
static __global__ void
|
||||
dummy_kernel(void)
|
||||
{
|
||||
}
|
||||
|
||||
AcResult
|
||||
createDevice(const int id, const AcMeshInfo device_config, Device* device_handle)
|
||||
@@ -124,10 +124,10 @@ createDevice(const int id, const AcMeshInfo device_config, Device* device_handle
|
||||
cudaDeviceReset();
|
||||
|
||||
// Create Device
|
||||
struct device_s* device = (struct device_s*) malloc(sizeof(*device));
|
||||
struct device_s* device = (struct device_s*)malloc(sizeof(*device));
|
||||
ERRCHK_ALWAYS(device);
|
||||
|
||||
device->id = id;
|
||||
device->id = id;
|
||||
device->local_config = device_config;
|
||||
|
||||
// Check that the code was compiled for the proper GPU architecture
|
||||
@@ -150,15 +150,14 @@ createDevice(const int id, const AcMeshInfo device_config, Device* device_handle
|
||||
ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.in[i], vba_size_bytes));
|
||||
ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.out[i], vba_size_bytes));
|
||||
}
|
||||
ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->reduce_scratchpad,
|
||||
AC_VTXBUF_COMPDOMAIN_SIZE_BYTES(device_config)));
|
||||
ERRCHK_CUDA_ALWAYS(
|
||||
cudaMalloc(&device->reduce_scratchpad, AC_VTXBUF_COMPDOMAIN_SIZE_BYTES(device_config)));
|
||||
ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->reduce_result, sizeof(AcReal)));
|
||||
|
||||
// Device constants
|
||||
ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_mesh_info, &device_config, sizeof(device_config), 0,
|
||||
cudaMemcpyHostToDevice));
|
||||
|
||||
|
||||
// Multi-GPU offset. This is used to compute globalVertexIdx.
|
||||
// Might be better to calculate this in astaroth.cu instead of here, s.t.
|
||||
// everything related to the decomposition is limited to the multi-GPU layer
|
||||
@@ -166,7 +165,6 @@ createDevice(const int id, const AcMeshInfo device_config, Device* device_handle
|
||||
ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_multigpu_offset, &multigpu_offset,
|
||||
sizeof(multigpu_offset), 0, cudaMemcpyHostToDevice));
|
||||
|
||||
|
||||
printf("Created device %d (%p)\n", device->id, device);
|
||||
*device_handle = device;
|
||||
return AC_SUCCESS;
|
||||
@@ -211,53 +209,44 @@ reduceScal(const Device device, const StreamType stream_type, const ReductionTyp
|
||||
{
|
||||
cudaSetDevice(device->id);
|
||||
|
||||
const int3 start = (int3) {device->local_config.int_params[AC_nx_min],
|
||||
device->local_config.int_params[AC_ny_min],
|
||||
device->local_config.int_params[AC_nz_min]
|
||||
};
|
||||
const int3 start = (int3){device->local_config.int_params[AC_nx_min],
|
||||
device->local_config.int_params[AC_ny_min],
|
||||
device->local_config.int_params[AC_nz_min]};
|
||||
|
||||
const int3 end = (int3) {device->local_config.int_params[AC_nx_max],
|
||||
device->local_config.int_params[AC_ny_max],
|
||||
device->local_config.int_params[AC_nz_max]
|
||||
};
|
||||
const int3 end = (int3){device->local_config.int_params[AC_nx_max],
|
||||
device->local_config.int_params[AC_ny_max],
|
||||
device->local_config.int_params[AC_nz_max]};
|
||||
|
||||
*result = reduce_scal(device->streams[stream_type], rtype,
|
||||
start, end, device->vba.in[vtxbuf_handle],
|
||||
device->reduce_scratchpad, device->reduce_result);
|
||||
*result = reduce_scal(device->streams[stream_type], rtype, start, end,
|
||||
device->vba.in[vtxbuf_handle], device->reduce_scratchpad,
|
||||
device->reduce_result);
|
||||
return AC_SUCCESS;
|
||||
}
|
||||
|
||||
AcResult
|
||||
reduceVec(const Device device, const StreamType stream_type,
|
||||
const ReductionType rtype,
|
||||
const VertexBufferHandle vtxbuf0,
|
||||
const VertexBufferHandle vtxbuf1,
|
||||
const VertexBufferHandle vtxbuf2,
|
||||
AcReal* result)
|
||||
reduceVec(const Device device, const StreamType stream_type, const ReductionType rtype,
|
||||
const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1,
|
||||
const VertexBufferHandle vtxbuf2, AcReal* result)
|
||||
{
|
||||
cudaSetDevice(device->id);
|
||||
|
||||
const int3 start = (int3) {device->local_config.int_params[AC_nx_min],
|
||||
device->local_config.int_params[AC_ny_min],
|
||||
device->local_config.int_params[AC_nz_min]
|
||||
};
|
||||
const int3 start = (int3){device->local_config.int_params[AC_nx_min],
|
||||
device->local_config.int_params[AC_ny_min],
|
||||
device->local_config.int_params[AC_nz_min]};
|
||||
|
||||
const int3 end = (int3) {device->local_config.int_params[AC_nx_max],
|
||||
device->local_config.int_params[AC_ny_max],
|
||||
device->local_config.int_params[AC_nz_max]
|
||||
};
|
||||
const int3 end = (int3){device->local_config.int_params[AC_nx_max],
|
||||
device->local_config.int_params[AC_ny_max],
|
||||
device->local_config.int_params[AC_nz_max]};
|
||||
|
||||
*result = reduce_vec(device->streams[stream_type], rtype, start, end,
|
||||
device->vba.in[vtxbuf0],
|
||||
device->vba.in[vtxbuf1],
|
||||
device->vba.in[vtxbuf2],
|
||||
device->reduce_scratchpad, device->reduce_result);
|
||||
*result = reduce_vec(device->streams[stream_type], rtype, start, end, device->vba.in[vtxbuf0],
|
||||
device->vba.in[vtxbuf1], device->vba.in[vtxbuf2],
|
||||
device->reduce_scratchpad, device->reduce_result);
|
||||
return AC_SUCCESS;
|
||||
}
|
||||
|
||||
AcResult
|
||||
rkStep(const Device device, const StreamType stream_type, const int step_number,
|
||||
const int3& start, const int3& end, const AcReal dt)
|
||||
rkStep(const Device device, const StreamType stream_type, const int step_number, const int3& start,
|
||||
const int3& end, const AcReal dt)
|
||||
{
|
||||
cudaSetDevice(device->id);
|
||||
rk3_step_async(device->streams[stream_type], step_number, start, end, dt, &device->vba);
|
||||
@@ -270,65 +259,62 @@ synchronize(const Device device, const StreamType stream_type)
|
||||
cudaSetDevice(device->id);
|
||||
if (stream_type == STREAM_ALL) {
|
||||
cudaDeviceSynchronize();
|
||||
} else {
|
||||
}
|
||||
else {
|
||||
cudaStreamSynchronize(device->streams[stream_type]);
|
||||
}
|
||||
return AC_SUCCESS;
|
||||
}
|
||||
|
||||
static AcResult
|
||||
loadWithOffset(const Device device, const StreamType stream_type,
|
||||
const AcReal* src, const size_t bytes, AcReal* dst)
|
||||
loadWithOffset(const Device device, const StreamType stream_type, const AcReal* src,
|
||||
const size_t bytes, AcReal* dst)
|
||||
{
|
||||
cudaSetDevice(device->id);
|
||||
ERRCHK_CUDA(cudaMemcpyAsync(dst, src, bytes, cudaMemcpyHostToDevice,
|
||||
device->streams[stream_type]));
|
||||
ERRCHK_CUDA(
|
||||
cudaMemcpyAsync(dst, src, bytes, cudaMemcpyHostToDevice, device->streams[stream_type]));
|
||||
return AC_SUCCESS;
|
||||
}
|
||||
|
||||
static AcResult
|
||||
storeWithOffset(const Device device, const StreamType stream_type,
|
||||
const AcReal* src, const size_t bytes, AcReal* dst)
|
||||
storeWithOffset(const Device device, const StreamType stream_type, const AcReal* src,
|
||||
const size_t bytes, AcReal* dst)
|
||||
{
|
||||
cudaSetDevice(device->id);
|
||||
ERRCHK_CUDA(cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDeviceToHost,
|
||||
device->streams[stream_type]));
|
||||
ERRCHK_CUDA(
|
||||
cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDeviceToHost, device->streams[stream_type]));
|
||||
return AC_SUCCESS;
|
||||
}
|
||||
|
||||
AcResult
|
||||
copyMeshToDevice(const Device device, const StreamType stream_type,
|
||||
const AcMesh& host_mesh, const int3& src, const int3& dst,
|
||||
const int num_vertices)
|
||||
copyMeshToDevice(const Device device, const StreamType stream_type, const AcMesh& host_mesh,
|
||||
const int3& src, const int3& dst, const int num_vertices)
|
||||
{
|
||||
const size_t src_idx = AC_VTXBUF_IDX(src.x, src.y, src.z, host_mesh.info);
|
||||
const size_t dst_idx = AC_VTXBUF_IDX(dst.x, dst.y, dst.z, device->local_config);
|
||||
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
|
||||
loadWithOffset(device, stream_type, &host_mesh.vertex_buffer[i][src_idx], num_vertices * sizeof(AcReal),
|
||||
&device->vba.in[i][dst_idx]);
|
||||
loadWithOffset(device, stream_type, &host_mesh.vertex_buffer[i][src_idx],
|
||||
num_vertices * sizeof(AcReal), &device->vba.in[i][dst_idx]);
|
||||
}
|
||||
return AC_SUCCESS;
|
||||
}
|
||||
|
||||
AcResult
|
||||
copyMeshToHost(const Device device, const StreamType stream_type,
|
||||
const int3& src, const int3& dst, const int num_vertices,
|
||||
AcMesh* host_mesh)
|
||||
copyMeshToHost(const Device device, const StreamType stream_type, const int3& src, const int3& dst,
|
||||
const int num_vertices, AcMesh* host_mesh)
|
||||
{
|
||||
const size_t src_idx = AC_VTXBUF_IDX(src.x, src.y, src.z, device->local_config);
|
||||
const size_t dst_idx = AC_VTXBUF_IDX(dst.x, dst.y, dst.z, host_mesh->info);
|
||||
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
|
||||
storeWithOffset(device, stream_type, &device->vba.in[i][src_idx],
|
||||
num_vertices * sizeof(AcReal),
|
||||
&host_mesh->vertex_buffer[i][dst_idx]);
|
||||
num_vertices * sizeof(AcReal), &host_mesh->vertex_buffer[i][dst_idx]);
|
||||
}
|
||||
return AC_SUCCESS;
|
||||
}
|
||||
|
||||
AcResult
|
||||
copyMeshDeviceToDevice(const Device src_device, const StreamType stream_type,
|
||||
const int3& src, Device dst_device, const int3& dst,
|
||||
const int num_vertices)
|
||||
copyMeshDeviceToDevice(const Device src_device, const StreamType stream_type, const int3& src,
|
||||
Device dst_device, const int3& dst, const int num_vertices)
|
||||
{
|
||||
cudaSetDevice(src_device->id);
|
||||
const size_t src_idx = AC_VTXBUF_IDX(src.x, src.y, src.z, src_device->local_config);
|
||||
@@ -348,7 +334,7 @@ swapBuffers(const Device device)
|
||||
{
|
||||
cudaSetDevice(device->id);
|
||||
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
|
||||
AcReal* tmp = device->vba.in[i];
|
||||
AcReal* tmp = device->vba.in[i];
|
||||
device->vba.in[i] = device->vba.out[i];
|
||||
device->vba.out[i] = tmp;
|
||||
}
|
||||
@@ -364,8 +350,8 @@ loadDeviceConstant(const Device device, const AcIntParam param, const int value)
|
||||
// Therefore we have to obfuscate the code a bit and compute the offset address before
|
||||
// invoking cudaMemcpyToSymbol.
|
||||
const size_t offset = (size_t)&d_mesh_info.int_params[param] - (size_t)&d_mesh_info;
|
||||
ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_mesh_info, &value, sizeof(value),
|
||||
offset, cudaMemcpyHostToDevice));
|
||||
ERRCHK_CUDA_ALWAYS(
|
||||
cudaMemcpyToSymbol(d_mesh_info, &value, sizeof(value), offset, cudaMemcpyHostToDevice));
|
||||
return AC_SUCCESS;
|
||||
}
|
||||
|
||||
@@ -374,8 +360,8 @@ loadDeviceConstant(const Device device, const AcRealParam param, const AcReal va
|
||||
{
|
||||
cudaSetDevice(device->id);
|
||||
const size_t offset = (size_t)&d_mesh_info.real_params[param] - (size_t)&d_mesh_info;
|
||||
ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_mesh_info, &value, sizeof(value),
|
||||
offset, cudaMemcpyHostToDevice));
|
||||
ERRCHK_CUDA_ALWAYS(
|
||||
cudaMemcpyToSymbol(d_mesh_info, &value, sizeof(value), offset, cudaMemcpyHostToDevice));
|
||||
return AC_SUCCESS;
|
||||
}
|
||||
|
||||
@@ -383,7 +369,7 @@ AcResult
|
||||
loadGlobalGrid(const Device device, const Grid grid)
|
||||
{
|
||||
cudaSetDevice(device->id);
|
||||
ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(globalGrid, &grid, sizeof(grid),
|
||||
0, cudaMemcpyHostToDevice));
|
||||
ERRCHK_CUDA_ALWAYS(
|
||||
cudaMemcpyToSymbol(globalGrid, &grid, sizeof(grid), 0, cudaMemcpyHostToDevice));
|
||||
return AC_SUCCESS;
|
||||
}
|
||||
|
@@ -27,12 +27,14 @@
|
||||
#pragma once
|
||||
#include "astaroth.h"
|
||||
|
||||
// clang-format off
|
||||
typedef enum {
|
||||
STREAM_PRIMARY,
|
||||
STREAM_SECONDARY,
|
||||
NUM_STREAM_TYPES,
|
||||
STREAM_ALL
|
||||
STREAM_PRIMARY,
|
||||
STREAM_SECONDARY,
|
||||
NUM_STREAM_TYPES,
|
||||
STREAM_ALL
|
||||
} StreamType;
|
||||
// clang-format on
|
||||
|
||||
typedef struct {
|
||||
int3 m;
|
||||
@@ -52,20 +54,17 @@ AcResult createDevice(const int id, const AcMeshInfo device_config, Device* devi
|
||||
AcResult destroyDevice(Device device);
|
||||
|
||||
/** */
|
||||
AcResult boundcondStep(const Device device, const StreamType stream_type,
|
||||
const int3& start, const int3& end);
|
||||
AcResult boundcondStep(const Device device, const StreamType stream_type, const int3& start,
|
||||
const int3& end);
|
||||
|
||||
/** */
|
||||
AcResult reduceScal(const Device device, const StreamType stream_type, const ReductionType rtype,
|
||||
const VertexBufferHandle vtxbuf_handle, AcReal* result);
|
||||
|
||||
/** */
|
||||
AcResult reduceVec(const Device device, const StreamType stream_type,
|
||||
const ReductionType rtype,
|
||||
const VertexBufferHandle vec0,
|
||||
const VertexBufferHandle vec1,
|
||||
const VertexBufferHandle vec2,
|
||||
AcReal* result);
|
||||
AcResult reduceVec(const Device device, const StreamType stream_type, const ReductionType rtype,
|
||||
const VertexBufferHandle vec0, const VertexBufferHandle vec1,
|
||||
const VertexBufferHandle vec2, AcReal* result);
|
||||
|
||||
/** */
|
||||
AcResult rkStep(const Device device, const StreamType stream_type, const int step_number,
|
||||
@@ -81,9 +80,8 @@ AcResult copyMeshToDevice(const Device device, const StreamType stream_type,
|
||||
const int num_vertices);
|
||||
|
||||
/** */
|
||||
AcResult copyMeshToHost(const Device device, const StreamType stream_type,
|
||||
const int3& src, const int3& dst, const int num_vertices,
|
||||
AcMesh* host_mesh);
|
||||
AcResult copyMeshToHost(const Device device, const StreamType stream_type, const int3& src,
|
||||
const int3& dst, const int num_vertices, AcMesh* host_mesh);
|
||||
|
||||
/** */
|
||||
AcResult copyMeshDeviceToDevice(const Device src, const StreamType stream_type, const int3& src_idx,
|
||||
|
@@ -24,7 +24,7 @@
|
||||
* Detailed info.
|
||||
*
|
||||
*/
|
||||
#pragma once
|
||||
#pragma once
|
||||
|
||||
__global__ void
|
||||
kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vtxbuf)
|
||||
@@ -38,7 +38,7 @@ kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vtxbuf)
|
||||
if (i_dst >= end.x || j_dst >= end.y || k_dst >= end.z)
|
||||
return;
|
||||
|
||||
//if (i_dst >= DCONST_INT(AC_mx) || j_dst >= DCONST_INT(AC_my) || k_dst >= DCONST_INT(AC_mz))
|
||||
// if (i_dst >= DCONST_INT(AC_mx) || j_dst >= DCONST_INT(AC_my) || k_dst >= DCONST_INT(AC_mz))
|
||||
// return;
|
||||
|
||||
// If destination index is inside the computational domain, return since
|
||||
@@ -69,15 +69,15 @@ kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vtxbuf)
|
||||
j_src += DCONST_INT(AC_ny_min);
|
||||
k_src += DCONST_INT(AC_nz_min);
|
||||
|
||||
const int src_idx = DEVICE_VTXBUF_IDX(i_src, j_src, k_src);
|
||||
const int dst_idx = DEVICE_VTXBUF_IDX(i_dst, j_dst, k_dst);
|
||||
vtxbuf[dst_idx] = vtxbuf[src_idx];
|
||||
const int src_idx = DEVICE_VTXBUF_IDX(i_src, j_src, k_src);
|
||||
const int dst_idx = DEVICE_VTXBUF_IDX(i_dst, j_dst, k_dst);
|
||||
vtxbuf[dst_idx] = vtxbuf[src_idx];
|
||||
}
|
||||
|
||||
void
|
||||
periodic_boundconds(const cudaStream_t stream, const int3& start, const int3& end, AcReal* vtxbuf)
|
||||
{
|
||||
const dim3 tpb(8,2,8);
|
||||
const dim3 tpb(8, 2, 8);
|
||||
const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x),
|
||||
(unsigned int)ceil((end.y - start.y) / (float)tpb.y),
|
||||
(unsigned int)ceil((end.z - start.z) / (float)tpb.z));
|
||||
@@ -89,7 +89,6 @@ periodic_boundconds(const cudaStream_t stream, const int3& start, const int3& en
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
#include <assert.h>
|
||||
|
||||
|
||||
static __device__ __forceinline__ int
|
||||
IDX(const int i)
|
||||
{
|
||||
@@ -120,14 +119,12 @@ create_rotz(const AcReal radians)
|
||||
return mat;
|
||||
}
|
||||
|
||||
|
||||
#if AC_DOUBLE_PRECISION == 0
|
||||
#define sin __sinf
|
||||
#define cos __cosf
|
||||
#define exp __expf
|
||||
#define rsqrt rsqrtf // hardware reciprocal sqrt
|
||||
#endif // AC_DOUBLE_PRECISION == 0
|
||||
|
||||
#endif // AC_DOUBLE_PRECISION == 0
|
||||
|
||||
/*
|
||||
typedef struct {
|
||||
@@ -155,12 +152,11 @@ first_derivative(const AcReal* __restrict__ pencil, const AcReal inv_ds)
|
||||
#elif STENCIL_ORDER == 6
|
||||
const AcReal coefficients[] = {0, 3.0 / 4.0, -3.0 / 20.0, 1.0 / 60.0};
|
||||
#elif STENCIL_ORDER == 8
|
||||
const AcReal coefficients[] = {0, 4.0 / 5.0, -1.0 / 5.0, 4.0 / 105.0,
|
||||
-1.0 / 280.0};
|
||||
const AcReal coefficients[] = {0, 4.0 / 5.0, -1.0 / 5.0, 4.0 / 105.0, -1.0 / 280.0};
|
||||
#endif
|
||||
|
||||
#define MID (STENCIL_ORDER / 2)
|
||||
AcReal res = 0;
|
||||
#define MID (STENCIL_ORDER / 2)
|
||||
AcReal res = 0;
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 1; i <= MID; ++i)
|
||||
@@ -175,17 +171,15 @@ second_derivative(const AcReal* __restrict__ pencil, const AcReal inv_ds)
|
||||
#if STENCIL_ORDER == 2
|
||||
const AcReal coefficients[] = {-2., 1.};
|
||||
#elif STENCIL_ORDER == 4
|
||||
const AcReal coefficients[] = {-5.0/2.0, 4.0/3.0, -1.0/12.0};
|
||||
const AcReal coefficients[] = {-5.0 / 2.0, 4.0 / 3.0, -1.0 / 12.0};
|
||||
#elif STENCIL_ORDER == 6
|
||||
const AcReal coefficients[] = {-49.0 / 18.0, 3.0 / 2.0, -3.0 / 20.0,
|
||||
1.0 / 90.0};
|
||||
const AcReal coefficients[] = {-49.0 / 18.0, 3.0 / 2.0, -3.0 / 20.0, 1.0 / 90.0};
|
||||
#elif STENCIL_ORDER == 8
|
||||
const AcReal coefficients[] = {-205.0 / 72.0, 8.0 / 5.0, -1.0 / 5.0,
|
||||
8.0 / 315.0, -1.0 / 560.0};
|
||||
const AcReal coefficients[] = {-205.0 / 72.0, 8.0 / 5.0, -1.0 / 5.0, 8.0 / 315.0, -1.0 / 560.0};
|
||||
#endif
|
||||
|
||||
#define MID (STENCIL_ORDER / 2)
|
||||
AcReal res = coefficients[0] * pencil[MID];
|
||||
#define MID (STENCIL_ORDER / 2)
|
||||
AcReal res = coefficients[0] * pencil[MID];
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 1; i <= MID; ++i)
|
||||
@@ -196,31 +190,29 @@ second_derivative(const AcReal* __restrict__ pencil, const AcReal inv_ds)
|
||||
|
||||
/** inv_ds: inverted mesh spacing f.ex. 1. / mesh.int_params[AC_dsx] */
|
||||
static __device__ __forceinline__ AcReal
|
||||
cross_derivative(const AcReal* __restrict__ pencil_a,
|
||||
const AcReal* __restrict__ pencil_b, const AcReal inv_ds_a,
|
||||
const AcReal inv_ds_b)
|
||||
cross_derivative(const AcReal* __restrict__ pencil_a, const AcReal* __restrict__ pencil_b,
|
||||
const AcReal inv_ds_a, const AcReal inv_ds_b)
|
||||
{
|
||||
#if STENCIL_ORDER == 2
|
||||
const AcReal coefficients[] = {0, 1.0 / 4.0};
|
||||
#elif STENCIL_ORDER == 4
|
||||
const AcReal coefficients[] = {0, 1.0 / 32.0, 1.0 / 64.0}; // TODO correct coefficients, these are just placeholders
|
||||
const AcReal coefficients[] = {
|
||||
0, 1.0 / 32.0, 1.0 / 64.0}; // TODO correct coefficients, these are just placeholders
|
||||
#elif STENCIL_ORDER == 6
|
||||
const AcReal fac = (1. / 720.);
|
||||
const AcReal coefficients[] = {0.0 * fac, 270.0 * fac, -27.0 * fac,
|
||||
2.0 * fac};
|
||||
const AcReal coefficients[] = {0.0 * fac, 270.0 * fac, -27.0 * fac, 2.0 * fac};
|
||||
#elif STENCIL_ORDER == 8
|
||||
const AcReal fac = (1. / 20160.);
|
||||
const AcReal coefficients[] = {0.0 * fac, 8064. * fac, -1008. * fac,
|
||||
128. * fac, -9. * fac};
|
||||
const AcReal coefficients[] = {0.0 * fac, 8064. * fac, -1008. * fac, 128. * fac, -9. * fac};
|
||||
#endif
|
||||
|
||||
#define MID (STENCIL_ORDER / 2)
|
||||
AcReal res = AcReal(0.);
|
||||
#define MID (STENCIL_ORDER / 2)
|
||||
AcReal res = AcReal(0.);
|
||||
|
||||
#pragma unroll
|
||||
#pragma unroll
|
||||
for (int i = 1; i <= MID; ++i) {
|
||||
res += coefficients[i] * (pencil_a[MID + i] + pencil_a[MID - i] -
|
||||
pencil_b[MID + i] - pencil_b[MID - i]);
|
||||
res += coefficients[i] *
|
||||
(pencil_a[MID + i] + pencil_a[MID - i] - pencil_b[MID + i] - pencil_b[MID - i]);
|
||||
}
|
||||
return res * inv_ds_a * inv_ds_b;
|
||||
}
|
||||
@@ -231,7 +223,8 @@ derx(const int3 vertexIdx, const AcReal* __restrict__ arr)
|
||||
AcReal pencil[STENCIL_ORDER + 1];
|
||||
#pragma unroll
|
||||
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
|
||||
pencil[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y, vertexIdx.z)];
|
||||
pencil[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y,
|
||||
vertexIdx.z)];
|
||||
|
||||
return first_derivative(pencil, DCONST_REAL(AC_inv_dsx));
|
||||
}
|
||||
@@ -242,7 +235,8 @@ derxx(const int3 vertexIdx, const AcReal* __restrict__ arr)
|
||||
AcReal pencil[STENCIL_ORDER + 1];
|
||||
#pragma unroll
|
||||
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
|
||||
pencil[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y, vertexIdx.z)];
|
||||
pencil[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y,
|
||||
vertexIdx.z)];
|
||||
|
||||
return second_derivative(pencil, DCONST_REAL(AC_inv_dsx));
|
||||
}
|
||||
@@ -262,8 +256,7 @@ derxy(const int3 vertexIdx, const AcReal* __restrict__ arr)
|
||||
pencil_b[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2,
|
||||
vertexIdx.y + STENCIL_ORDER / 2 - offset, vertexIdx.z)];
|
||||
|
||||
return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsx),
|
||||
DCONST_REAL(AC_inv_dsy));
|
||||
return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsx), DCONST_REAL(AC_inv_dsy));
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ AcReal
|
||||
@@ -281,8 +274,7 @@ derxz(const int3 vertexIdx, const AcReal* __restrict__ arr)
|
||||
pencil_b[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y,
|
||||
vertexIdx.z + STENCIL_ORDER / 2 - offset)];
|
||||
|
||||
return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsx),
|
||||
DCONST_REAL(AC_inv_dsz));
|
||||
return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsx), DCONST_REAL(AC_inv_dsz));
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ AcReal
|
||||
@@ -291,7 +283,8 @@ dery(const int3 vertexIdx, const AcReal* __restrict__ arr)
|
||||
AcReal pencil[STENCIL_ORDER + 1];
|
||||
#pragma unroll
|
||||
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
|
||||
pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2, vertexIdx.z)];
|
||||
pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2,
|
||||
vertexIdx.z)];
|
||||
|
||||
return first_derivative(pencil, DCONST_REAL(AC_inv_dsy));
|
||||
}
|
||||
@@ -302,7 +295,8 @@ deryy(const int3 vertexIdx, const AcReal* __restrict__ arr)
|
||||
AcReal pencil[STENCIL_ORDER + 1];
|
||||
#pragma unroll
|
||||
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
|
||||
pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2, vertexIdx.z)];
|
||||
pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2,
|
||||
vertexIdx.z)];
|
||||
|
||||
return second_derivative(pencil, DCONST_REAL(AC_inv_dsy));
|
||||
}
|
||||
@@ -322,8 +316,7 @@ deryz(const int3 vertexIdx, const AcReal* __restrict__ arr)
|
||||
pencil_b[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2,
|
||||
vertexIdx.z + STENCIL_ORDER / 2 - offset)];
|
||||
|
||||
return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsy),
|
||||
DCONST_REAL(AC_inv_dsz));
|
||||
return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsy), DCONST_REAL(AC_inv_dsz));
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ AcReal
|
||||
@@ -332,7 +325,8 @@ derz(const int3 vertexIdx, const AcReal* __restrict__ arr)
|
||||
AcReal pencil[STENCIL_ORDER + 1];
|
||||
#pragma unroll
|
||||
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
|
||||
pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z + offset - STENCIL_ORDER / 2)];
|
||||
pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y,
|
||||
vertexIdx.z + offset - STENCIL_ORDER / 2)];
|
||||
|
||||
return first_derivative(pencil, DCONST_REAL(AC_inv_dsz));
|
||||
}
|
||||
@@ -343,7 +337,8 @@ derzz(const int3 vertexIdx, const AcReal* __restrict__ arr)
|
||||
AcReal pencil[STENCIL_ORDER + 1];
|
||||
#pragma unroll
|
||||
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
|
||||
pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z + offset - STENCIL_ORDER / 2)];
|
||||
pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y,
|
||||
vertexIdx.z + offset - STENCIL_ORDER / 2)];
|
||||
|
||||
return second_derivative(pencil, DCONST_REAL(AC_inv_dsz));
|
||||
}
|
||||
@@ -401,8 +396,7 @@ operator-(const AcReal3& a)
|
||||
return (AcReal3){-a.x, -a.y, -a.z};
|
||||
}
|
||||
|
||||
static __host__ __device__ __forceinline__ AcReal3
|
||||
operator*(const AcReal a, const AcReal3& b)
|
||||
static __host__ __device__ __forceinline__ AcReal3 operator*(const AcReal a, const AcReal3& b)
|
||||
{
|
||||
return (AcReal3){a * b.x, a * b.y, a * b.z};
|
||||
}
|
||||
@@ -443,7 +437,6 @@ is_valid(const AcReal3& a)
|
||||
return is_valid(a.x) && is_valid(a.y) && is_valid(a.z);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* =============================================================================
|
||||
* Level 1 (Stencil Processing Stage)
|
||||
@@ -476,8 +469,7 @@ laplace_vec(const AcReal3Data& vec)
|
||||
static __device__ __forceinline__ AcReal3
|
||||
curl(const AcReal3Data& vec)
|
||||
{
|
||||
return (AcReal3){gradient(vec.z).y - gradient(vec.y).z,
|
||||
gradient(vec.x).z - gradient(vec.z).x,
|
||||
return (AcReal3){gradient(vec.z).y - gradient(vec.y).z, gradient(vec.x).z - gradient(vec.z).x,
|
||||
gradient(vec.y).x - gradient(vec.x).y};
|
||||
}
|
||||
|
||||
@@ -520,7 +512,7 @@ contract(const AcMatrix& mat)
|
||||
{
|
||||
AcReal res = 0;
|
||||
|
||||
#pragma unroll
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 3; ++i)
|
||||
res += dot(mat.row[i], mat.row[i]);
|
||||
|
||||
@@ -558,12 +550,13 @@ __constant__ AcReal forcing_phi;
|
||||
static __device__ __forceinline__ AcReal3
|
||||
forcing(const int i, const int j, const int k)
|
||||
{
|
||||
#define DOMAIN_SIZE_X (DCONST_INT(AC_nx) * DCONST_REAL(AC_dsx))
|
||||
#define DOMAIN_SIZE_Y (DCONST_INT(AC_ny) * DCONST_REAL(AC_dsy))
|
||||
#define DOMAIN_SIZE_Z (DCONST_INT(AC_nz) * DCONST_REAL(AC_dsz))
|
||||
const AcReal3 k_vec = (AcReal3){(i - DCONST_INT(AC_nx_min)) * DCONST_REAL(AC_dsx) - AcReal(.5) * DOMAIN_SIZE_X,
|
||||
(j - DCONST_INT(AC_ny_min)) * DCONST_REAL(AC_dsy) - AcReal(.5) * DOMAIN_SIZE_Y,
|
||||
(k - DCONST_INT(AC_nz_min)) * DCONST_REAL(AC_dsz) - AcReal(.5) * DOMAIN_SIZE_Z};
|
||||
#define DOMAIN_SIZE_X (DCONST_INT(AC_nx) * DCONST_REAL(AC_dsx))
|
||||
#define DOMAIN_SIZE_Y (DCONST_INT(AC_ny) * DCONST_REAL(AC_dsy))
|
||||
#define DOMAIN_SIZE_Z (DCONST_INT(AC_nz) * DCONST_REAL(AC_dsz))
|
||||
const AcReal3 k_vec = (AcReal3){
|
||||
(i - DCONST_INT(AC_nx_min)) * DCONST_REAL(AC_dsx) - AcReal(.5) * DOMAIN_SIZE_X,
|
||||
(j - DCONST_INT(AC_ny_min)) * DCONST_REAL(AC_dsy) - AcReal(.5) * DOMAIN_SIZE_Y,
|
||||
(k - DCONST_INT(AC_nz_min)) * DCONST_REAL(AC_dsz) - AcReal(.5) * DOMAIN_SIZE_Z};
|
||||
AcReal inv_len = reciprocal_len(k_vec);
|
||||
if (isnan(inv_len) || isinf(inv_len))
|
||||
inv_len = 0;
|
||||
@@ -571,46 +564,41 @@ forcing(const int i, const int j, const int k)
|
||||
inv_len = 2;
|
||||
const AcReal k_dot_x = dot(k_vec, forcing_vec);
|
||||
|
||||
const AcReal waves = cos(k_dot_x)*cos(forcing_phi) - sin(k_dot_x) * sin(forcing_phi);
|
||||
const AcReal waves = cos(k_dot_x) * cos(forcing_phi) - sin(k_dot_x) * sin(forcing_phi);
|
||||
|
||||
return inv_len * inv_len * waves * forcing_vec;
|
||||
}
|
||||
|
||||
|
||||
// Note: LNT0 and LNRHO0 must be set very carefully: if the magnitude is different that other values in the mesh, then we will inherently lose precision
|
||||
// Note: LNT0 and LNRHO0 must be set very carefully: if the magnitude is different that other values
|
||||
// in the mesh, then we will inherently lose precision
|
||||
#define LNT0 (AcReal(0.0))
|
||||
#define LNRHO0 (AcReal(0.0))
|
||||
|
||||
#define H_CONST (AcReal(0.0))
|
||||
#define C_CONST (AcReal(0.0))
|
||||
|
||||
|
||||
|
||||
template <int step_number>
|
||||
static __device__ __forceinline__ AcReal
|
||||
rk3_integrate(const AcReal state_previous, const AcReal state_current,
|
||||
const AcReal rate_of_change, const AcReal dt)
|
||||
rk3_integrate(const AcReal state_previous, const AcReal state_current, const AcReal rate_of_change,
|
||||
const AcReal dt)
|
||||
{
|
||||
// Williamson (1980)
|
||||
const AcReal alpha[] = {0, AcReal(.0), AcReal(-5. / 9.), AcReal(-153. / 128.)};
|
||||
const AcReal beta[] = {0, AcReal(1. / 3.), AcReal(15. / 16.),
|
||||
AcReal(8. / 15.)};
|
||||
|
||||
const AcReal beta[] = {0, AcReal(1. / 3.), AcReal(15. / 16.), AcReal(8. / 15.)};
|
||||
|
||||
// Note the indexing: +1 to avoid an unnecessary warning about "out-of-bounds"
|
||||
// access (when accessing beta[step_number-1] even when step_number >= 1)
|
||||
switch (step_number) {
|
||||
case 0:
|
||||
return state_current + beta[step_number + 1] * rate_of_change * dt;
|
||||
case 1: // Fallthrough
|
||||
case 2:
|
||||
return state_current +
|
||||
beta[step_number + 1] *
|
||||
(alpha[step_number + 1] * (AcReal(1.) / beta[step_number]) *
|
||||
(state_current - state_previous) +
|
||||
rate_of_change * dt);
|
||||
default:
|
||||
return NAN;
|
||||
case 0:
|
||||
return state_current + beta[step_number + 1] * rate_of_change * dt;
|
||||
case 1: // Fallthrough
|
||||
case 2:
|
||||
return state_current +
|
||||
beta[step_number + 1] * (alpha[step_number + 1] * (AcReal(1.) / beta[step_number]) *
|
||||
(state_current - state_previous) +
|
||||
rate_of_change * dt);
|
||||
default:
|
||||
return NAN;
|
||||
}
|
||||
}
|
||||
/*
|
||||
@@ -646,13 +634,14 @@ static __device__ __forceinline__ AcReal3
|
||||
rk3_integrate(const AcReal3 state_previous, const AcReal3 state_current,
|
||||
const AcReal3 rate_of_change, const AcReal dt)
|
||||
{
|
||||
return (AcReal3) { rk3_integrate<step_number>(state_previous.x, state_current.x, rate_of_change.x, dt),
|
||||
rk3_integrate<step_number>(state_previous.y, state_current.y, rate_of_change.y, dt),
|
||||
rk3_integrate<step_number>(state_previous.z, state_current.z, rate_of_change.z, dt)};
|
||||
return (AcReal3){
|
||||
rk3_integrate<step_number>(state_previous.x, state_current.x, rate_of_change.x, dt),
|
||||
rk3_integrate<step_number>(state_previous.y, state_current.y, rate_of_change.y, dt),
|
||||
rk3_integrate<step_number>(state_previous.z, state_current.z, rate_of_change.z, dt)};
|
||||
}
|
||||
|
||||
#define rk3(state_previous, state_current, rate_of_change, dt)\
|
||||
rk3_integrate<step_number>(state_previous, value(state_current), rate_of_change, dt)
|
||||
#define rk3(state_previous, state_current, rate_of_change, dt) \
|
||||
rk3_integrate<step_number>(state_previous, value(state_current), rate_of_change, dt)
|
||||
|
||||
/*
|
||||
template <int step_number>
|
||||
@@ -708,9 +697,8 @@ read_out(const int idx, AcReal* __restrict__ field[], const int handle)
|
||||
static __device__ AcReal3
|
||||
read_out(const int idx, AcReal* __restrict__ field[], const int3 handle)
|
||||
{
|
||||
return (AcReal3) { read_out(idx, field, handle.x),
|
||||
read_out(idx, field, handle.y),
|
||||
read_out(idx, field, handle.z) };
|
||||
return (AcReal3){read_out(idx, field, handle.x), read_out(idx, field, handle.y),
|
||||
read_out(idx, field, handle.z)};
|
||||
}
|
||||
|
||||
#define WRITE_OUT(handle, value) (write(buffer.out, handle, idx, value))
|
||||
@@ -718,29 +706,28 @@ read_out(const int idx, AcReal* __restrict__ field[], const int3 handle)
|
||||
#define READ_OUT(handle) (read_out(idx, buffer.out, handle))
|
||||
|
||||
// also write for clarity here also, not for the DSL
|
||||
//#define WRITE(HANDLE) (write(idx, ...)) s.t. we don't have to insert boilerplat in the mid of the function
|
||||
//#define WRITE(HANDLE) (write(idx, ...)) s.t. we don't have to insert boilerplat in the mid of the
|
||||
// function
|
||||
|
||||
#define GEN_KERNEL_PARAM_BOILERPLATE \
|
||||
const int3 start, const int3 end, VertexBufferArray buffer
|
||||
#define GEN_KERNEL_PARAM_BOILERPLATE const int3 start, const int3 end, VertexBufferArray buffer
|
||||
|
||||
#define GEN_KERNEL_BUILTIN_VARIABLES_BOILERPLATE() \
|
||||
const int3 vertexIdx = (int3){threadIdx.x + blockIdx.x * blockDim.x + start.x,\
|
||||
threadIdx.y + blockIdx.y * blockDim.y + start.y,\
|
||||
threadIdx.z + blockIdx.z * blockDim.z + start.z};\
|
||||
const int3 globalVertexIdx = (int3){d_multigpu_offset.x + vertexIdx.x, \
|
||||
d_multigpu_offset.y + vertexIdx.y, \
|
||||
d_multigpu_offset.z + vertexIdx.z}; \
|
||||
if (vertexIdx.x >= end.x || vertexIdx.y >= end.y || vertexIdx.z >= end.z)\
|
||||
return;\
|
||||
\
|
||||
\
|
||||
assert(vertexIdx.x < DCONST_INT(AC_nx_max) && vertexIdx.y < DCONST_INT(AC_ny_max) &&\
|
||||
vertexIdx.z < DCONST_INT(AC_nz_max));\
|
||||
\
|
||||
assert(vertexIdx.x >= DCONST_INT(AC_nx_min) && vertexIdx.y >= DCONST_INT(AC_ny_min) &&\
|
||||
vertexIdx.z >= DCONST_INT(AC_nz_min));\
|
||||
\
|
||||
const int idx = IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z);
|
||||
#define GEN_KERNEL_BUILTIN_VARIABLES_BOILERPLATE() \
|
||||
const int3 vertexIdx = (int3){threadIdx.x + blockIdx.x * blockDim.x + start.x, \
|
||||
threadIdx.y + blockIdx.y * blockDim.y + start.y, \
|
||||
threadIdx.z + blockIdx.z * blockDim.z + start.z}; \
|
||||
const int3 globalVertexIdx = (int3){d_multigpu_offset.x + vertexIdx.x, \
|
||||
d_multigpu_offset.y + vertexIdx.y, \
|
||||
d_multigpu_offset.z + vertexIdx.z}; \
|
||||
if (vertexIdx.x >= end.x || vertexIdx.y >= end.y || vertexIdx.z >= end.z) \
|
||||
return; \
|
||||
\
|
||||
assert(vertexIdx.x < DCONST_INT(AC_nx_max) && vertexIdx.y < DCONST_INT(AC_ny_max) && \
|
||||
vertexIdx.z < DCONST_INT(AC_nz_max)); \
|
||||
\
|
||||
assert(vertexIdx.x >= DCONST_INT(AC_nx_min) && vertexIdx.y >= DCONST_INT(AC_ny_min) && \
|
||||
vertexIdx.z >= DCONST_INT(AC_nz_min)); \
|
||||
\
|
||||
const int idx = IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z);
|
||||
|
||||
#include "stencil_process.cuh"
|
||||
|
||||
@@ -757,33 +744,31 @@ randf(void)
|
||||
}
|
||||
|
||||
AcResult
|
||||
rk3_step_async(const cudaStream_t stream, const int& step_number, const int3& start, const int3& end,
|
||||
const AcReal dt, VertexBufferArray* buffer)
|
||||
rk3_step_async(const cudaStream_t stream, const int& step_number, const int3& start,
|
||||
const int3& end, const AcReal dt, VertexBufferArray* buffer)
|
||||
{
|
||||
const dim3 tpb(32, 1, 4);
|
||||
/////////////////// Forcing
|
||||
#if LFORCING
|
||||
/////////////////// Forcing
|
||||
#if LFORCING
|
||||
const AcReal ff_scale = AcReal(.2);
|
||||
static AcReal3 ff = ff_scale * (AcReal3){1, 0, 0};
|
||||
const AcReal radians = randf() * AcReal(2*M_PI) / 360 / 8;
|
||||
const AcMatrix rotz = create_rotz(radians);
|
||||
ff = mul(rotz, ff);
|
||||
static AcReal3 ff = ff_scale * (AcReal3){1, 0, 0};
|
||||
const AcReal radians = randf() * AcReal(2 * M_PI) / 360 / 8;
|
||||
const AcMatrix rotz = create_rotz(radians);
|
||||
ff = mul(rotz, ff);
|
||||
cudaMemcpyToSymbolAsync(forcing_vec, &ff, sizeof(ff), 0, cudaMemcpyHostToDevice, stream);
|
||||
|
||||
const AcReal ff_phi = AcReal(M_PI);//AcReal(2 * M_PI) * randf();
|
||||
cudaMemcpyToSymbolAsync(forcing_phi, &ff_phi, sizeof(ff_phi), 0, cudaMemcpyHostToDevice, stream);
|
||||
#endif // LFORCING
|
||||
const AcReal ff_phi = AcReal(M_PI); // AcReal(2 * M_PI) * randf();
|
||||
cudaMemcpyToSymbolAsync(forcing_phi, &ff_phi, sizeof(ff_phi), 0, cudaMemcpyHostToDevice,
|
||||
stream);
|
||||
#endif // LFORCING
|
||||
//////////////////////////
|
||||
|
||||
const int nx = end.x - start.x;
|
||||
const int ny = end.y - start.y;
|
||||
const int nz = end.z - start.z;
|
||||
|
||||
const dim3 bpg(
|
||||
(unsigned int)ceil(nx / AcReal(tpb.x)),
|
||||
(unsigned int)ceil(ny / AcReal(tpb.y)),
|
||||
(unsigned int)ceil(nz / AcReal(tpb.z)));
|
||||
|
||||
const dim3 bpg((unsigned int)ceil(nx / AcReal(tpb.x)), (unsigned int)ceil(ny / AcReal(tpb.y)),
|
||||
(unsigned int)ceil(nz / AcReal(tpb.z)));
|
||||
|
||||
if (step_number == 0)
|
||||
solve<0><<<bpg, tpb, 0, stream>>>(start, end, *buffer, dt);
|
||||
@@ -796,7 +781,6 @@ rk3_step_async(const cudaStream_t stream, const int& step_number, const int3& st
|
||||
return AC_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
////////////////REDUCE///////////////////////////
|
||||
#include "src/core/math_utils.h" // is_power_of_two
|
||||
|
||||
@@ -848,22 +832,19 @@ template <FilterFunc filter>
|
||||
__global__ void
|
||||
kernel_filter(const __restrict__ AcReal* src, const int3 start, const int3 end, AcReal* dst)
|
||||
{
|
||||
const int3 src_idx = (int3) {
|
||||
start.x + threadIdx.x + blockIdx.x * blockDim.x,
|
||||
start.y + threadIdx.y + blockIdx.y * blockDim.y,
|
||||
start.z + threadIdx.z + blockIdx.z * blockDim.z
|
||||
};
|
||||
const int3 src_idx = (int3){start.x + threadIdx.x + blockIdx.x * blockDim.x,
|
||||
start.y + threadIdx.y + blockIdx.y * blockDim.y,
|
||||
start.z + threadIdx.z + blockIdx.z * blockDim.z};
|
||||
|
||||
const int nx = end.x - start.x;
|
||||
const int ny = end.y - start.y;
|
||||
const int nz = end.z - start.z; //MV: Added this because it was undefined
|
||||
const int3 dst_idx = (int3) {
|
||||
threadIdx.x + blockIdx.x * blockDim.x,
|
||||
threadIdx.y + blockIdx.y * blockDim.y,
|
||||
threadIdx.z + blockIdx.z * blockDim.z
|
||||
};
|
||||
const int nx = end.x - start.x;
|
||||
const int ny = end.y - start.y;
|
||||
const int nz = end.z - start.z; // MV: Added this because it was undefined
|
||||
const int3 dst_idx = (int3){threadIdx.x + blockIdx.x * blockDim.x,
|
||||
threadIdx.y + blockIdx.y * blockDim.y,
|
||||
threadIdx.z + blockIdx.z * blockDim.z};
|
||||
|
||||
assert(src_idx.x < DCONST_INT(AC_nx_max) && src_idx.y < DCONST_INT(AC_ny_max) && src_idx.z < DCONST_INT(AC_nz_max));
|
||||
assert(src_idx.x < DCONST_INT(AC_nx_max) && src_idx.y < DCONST_INT(AC_ny_max) &&
|
||||
src_idx.z < DCONST_INT(AC_nz_max));
|
||||
assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
|
||||
assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);
|
||||
|
||||
@@ -872,31 +853,27 @@ kernel_filter(const __restrict__ AcReal* src, const int3 start, const int3 end,
|
||||
|
||||
template <FilterFuncVec filter>
|
||||
__global__ void
|
||||
kernel_filter_vec(const __restrict__ AcReal* src0,
|
||||
const __restrict__ AcReal* src1,
|
||||
const __restrict__ AcReal* src2,
|
||||
const int3 start, const int3 end, AcReal* dst)
|
||||
kernel_filter_vec(const __restrict__ AcReal* src0, const __restrict__ AcReal* src1,
|
||||
const __restrict__ AcReal* src2, const int3 start, const int3 end, AcReal* dst)
|
||||
{
|
||||
const int3 src_idx = (int3) {
|
||||
start.x + threadIdx.x + blockIdx.x * blockDim.x,
|
||||
start.y + threadIdx.y + blockIdx.y * blockDim.y,
|
||||
start.z + threadIdx.z + blockIdx.z * blockDim.z
|
||||
};
|
||||
const int3 src_idx = (int3){start.x + threadIdx.x + blockIdx.x * blockDim.x,
|
||||
start.y + threadIdx.y + blockIdx.y * blockDim.y,
|
||||
start.z + threadIdx.z + blockIdx.z * blockDim.z};
|
||||
|
||||
const int nx = end.x - start.x;
|
||||
const int ny = end.y - start.y;
|
||||
const int nz = end.z - start.z; //MV: Added this because it was undefined
|
||||
const int3 dst_idx = (int3) {
|
||||
threadIdx.x + blockIdx.x * blockDim.x,
|
||||
threadIdx.y + blockIdx.y * blockDim.y,
|
||||
threadIdx.z + blockIdx.z * blockDim.z
|
||||
};
|
||||
const int nx = end.x - start.x;
|
||||
const int ny = end.y - start.y;
|
||||
const int nz = end.z - start.z; // MV: Added this because it was undefined
|
||||
const int3 dst_idx = (int3){threadIdx.x + blockIdx.x * blockDim.x,
|
||||
threadIdx.y + blockIdx.y * blockDim.y,
|
||||
threadIdx.z + blockIdx.z * blockDim.z};
|
||||
|
||||
assert(src_idx.x < DCONST_INT(AC_nx_max) && src_idx.y < DCONST_INT(AC_ny_max) && src_idx.z < DCONST_INT(AC_nz_max));
|
||||
assert(src_idx.x < DCONST_INT(AC_nx_max) && src_idx.y < DCONST_INT(AC_ny_max) &&
|
||||
src_idx.z < DCONST_INT(AC_nz_max));
|
||||
assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
|
||||
assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);
|
||||
|
||||
dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(src0[IDX(src_idx)], src1[IDX(src_idx)], src2[IDX(src_idx)]);
|
||||
dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(
|
||||
src0[IDX(src_idx)], src1[IDX(src_idx)], src2[IDX(src_idx)]);
|
||||
}
|
||||
|
||||
template <ReduceFunc reduce>
|
||||
@@ -908,7 +885,8 @@ kernel_reduce(AcReal* scratchpad, const int num_elems)
|
||||
extern __shared__ AcReal smem[];
|
||||
if (idx < num_elems) {
|
||||
smem[threadIdx.x] = scratchpad[idx];
|
||||
} else {
|
||||
}
|
||||
else {
|
||||
smem[threadIdx.x] = NAN;
|
||||
}
|
||||
__syncthreads();
|
||||
@@ -930,9 +908,8 @@ kernel_reduce(AcReal* scratchpad, const int num_elems)
|
||||
|
||||
template <ReduceFunc reduce>
|
||||
__global__ void
|
||||
kernel_reduce_block(const __restrict__ AcReal* scratchpad,
|
||||
const int num_blocks, const int block_size,
|
||||
AcReal* result)
|
||||
kernel_reduce_block(const __restrict__ AcReal* scratchpad, const int num_blocks,
|
||||
const int block_size, AcReal* result)
|
||||
{
|
||||
const int idx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if (idx != 0) {
|
||||
@@ -946,23 +923,19 @@ kernel_reduce_block(const __restrict__ AcReal* scratchpad,
|
||||
*result = res;
|
||||
}
|
||||
|
||||
|
||||
AcReal
|
||||
reduce_scal(const cudaStream_t stream, const ReductionType rtype,
|
||||
const int3& start, const int3& end,
|
||||
const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result)
|
||||
reduce_scal(const cudaStream_t stream, const ReductionType rtype, const int3& start,
|
||||
const int3& end, const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result)
|
||||
{
|
||||
const unsigned nx = end.x - start.x;
|
||||
const unsigned ny = end.y - start.y;
|
||||
const unsigned nz = end.z - start.z;
|
||||
const unsigned nx = end.x - start.x;
|
||||
const unsigned ny = end.y - start.y;
|
||||
const unsigned nz = end.z - start.z;
|
||||
const unsigned num_elems = nx * ny * nz;
|
||||
|
||||
const dim3 tpb_filter(32, 4, 1);
|
||||
const dim3 bpg_filter(
|
||||
(unsigned int)ceil(nx / AcReal(tpb_filter.x)),
|
||||
(unsigned int)ceil(ny / AcReal(tpb_filter.y)),
|
||||
(unsigned int)ceil(nz / AcReal(tpb_filter.z))
|
||||
);
|
||||
const dim3 bpg_filter((unsigned int)ceil(nx / AcReal(tpb_filter.x)),
|
||||
(unsigned int)ceil(ny / AcReal(tpb_filter.y)),
|
||||
(unsigned int)ceil(nz / AcReal(tpb_filter.z)));
|
||||
|
||||
const int tpb_reduce = 128;
|
||||
const int bpg_reduce = num_elems / tpb_reduce;
|
||||
@@ -974,22 +947,38 @@ reduce_scal(const cudaStream_t stream, const ReductionType rtype,
|
||||
ERRCHK(nx * ny * nz % 2 == 0);
|
||||
|
||||
if (rtype == RTYPE_MAX) {
|
||||
kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
|
||||
kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||
kernel_reduce_block<dmax><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||
} else if (rtype == RTYPE_MIN) {
|
||||
kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
|
||||
kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||
kernel_reduce_block<dmin><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||
} else if (rtype == RTYPE_RMS) {
|
||||
kernel_filter<dsquared><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
|
||||
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||
} else if (rtype == RTYPE_RMS_EXP) {
|
||||
kernel_filter<dexp_squared><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
|
||||
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||
} else {
|
||||
kernel_filter<dvalue>
|
||||
<<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
|
||||
kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(
|
||||
scratchpad, num_elems);
|
||||
kernel_reduce_block<dmax>
|
||||
<<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||
}
|
||||
else if (rtype == RTYPE_MIN) {
|
||||
kernel_filter<dvalue>
|
||||
<<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
|
||||
kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(
|
||||
scratchpad, num_elems);
|
||||
kernel_reduce_block<dmin>
|
||||
<<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||
}
|
||||
else if (rtype == RTYPE_RMS) {
|
||||
kernel_filter<dsquared>
|
||||
<<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
|
||||
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(
|
||||
scratchpad, num_elems);
|
||||
kernel_reduce_block<dsum>
|
||||
<<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||
}
|
||||
else if (rtype == RTYPE_RMS_EXP) {
|
||||
kernel_filter<dexp_squared>
|
||||
<<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
|
||||
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(
|
||||
scratchpad, num_elems);
|
||||
kernel_reduce_block<dsum>
|
||||
<<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||
}
|
||||
else {
|
||||
ERROR("Unrecognized rtype");
|
||||
}
|
||||
AcReal result;
|
||||
@@ -998,22 +987,19 @@ reduce_scal(const cudaStream_t stream, const ReductionType rtype,
|
||||
}
|
||||
|
||||
AcReal
|
||||
reduce_vec(const cudaStream_t stream, const ReductionType rtype,
|
||||
const int3& start, const int3& end,
|
||||
const AcReal* vtxbuf0, const AcReal* vtxbuf1, const AcReal* vtxbuf2,
|
||||
AcReal* scratchpad, AcReal* reduce_result)
|
||||
reduce_vec(const cudaStream_t stream, const ReductionType rtype, const int3& start, const int3& end,
|
||||
const AcReal* vtxbuf0, const AcReal* vtxbuf1, const AcReal* vtxbuf2, AcReal* scratchpad,
|
||||
AcReal* reduce_result)
|
||||
{
|
||||
const unsigned nx = end.x - start.x;
|
||||
const unsigned ny = end.y - start.y;
|
||||
const unsigned nz = end.z - start.z;
|
||||
const unsigned nx = end.x - start.x;
|
||||
const unsigned ny = end.y - start.y;
|
||||
const unsigned nz = end.z - start.z;
|
||||
const unsigned num_elems = nx * ny * nz;
|
||||
|
||||
const dim3 tpb_filter(32, 4, 1);
|
||||
const dim3 bpg_filter(
|
||||
(unsigned int)ceil(nx / AcReal(tpb_filter.x)),
|
||||
(unsigned int)ceil(ny / AcReal(tpb_filter.y)),
|
||||
(unsigned int)ceil(nz / AcReal(tpb_filter.z))
|
||||
);
|
||||
const dim3 bpg_filter((unsigned int)ceil(nx / AcReal(tpb_filter.x)),
|
||||
(unsigned int)ceil(ny / AcReal(tpb_filter.y)),
|
||||
(unsigned int)ceil(nz / AcReal(tpb_filter.z)));
|
||||
|
||||
const int tpb_reduce = 128;
|
||||
const int bpg_reduce = num_elems / tpb_reduce;
|
||||
@@ -1025,22 +1011,38 @@ reduce_vec(const cudaStream_t stream, const ReductionType rtype,
|
||||
ERRCHK(nx * ny * nz % 2 == 0);
|
||||
|
||||
if (rtype == RTYPE_MAX) {
|
||||
kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
|
||||
kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||
kernel_reduce_block<dmax><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||
} else if (rtype == RTYPE_MIN) {
|
||||
kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
|
||||
kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||
kernel_reduce_block<dmin><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||
} else if (rtype == RTYPE_RMS) {
|
||||
kernel_filter_vec<dsquared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
|
||||
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||
} else if (rtype == RTYPE_RMS_EXP) {
|
||||
kernel_filter_vec<dexp_squared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
|
||||
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||
} else {
|
||||
kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(
|
||||
vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
|
||||
kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(
|
||||
scratchpad, num_elems);
|
||||
kernel_reduce_block<dmax>
|
||||
<<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||
}
|
||||
else if (rtype == RTYPE_MIN) {
|
||||
kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(
|
||||
vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
|
||||
kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(
|
||||
scratchpad, num_elems);
|
||||
kernel_reduce_block<dmin>
|
||||
<<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||
}
|
||||
else if (rtype == RTYPE_RMS) {
|
||||
kernel_filter_vec<dsquared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(
|
||||
vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
|
||||
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(
|
||||
scratchpad, num_elems);
|
||||
kernel_reduce_block<dsum>
|
||||
<<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||
}
|
||||
else if (rtype == RTYPE_RMS_EXP) {
|
||||
kernel_filter_vec<dexp_squared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(
|
||||
vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
|
||||
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(
|
||||
scratchpad, num_elems);
|
||||
kernel_reduce_block<dsum>
|
||||
<<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||
}
|
||||
else {
|
||||
ERROR("Unrecognized rtype");
|
||||
}
|
||||
AcReal result;
|
||||
|
File diff suppressed because it is too large
Load Diff
@@ -35,10 +35,10 @@
|
||||
#include "model/model_rk3.h"
|
||||
#include "timer_hires.h"
|
||||
|
||||
#include <vector>
|
||||
#include "src/core/errchk.h"
|
||||
#include <algorithm>
|
||||
#include <math.h>
|
||||
#include "src/core/errchk.h"
|
||||
#include <vector>
|
||||
|
||||
static bool
|
||||
smaller_than(const double& a, const double& b)
|
||||
@@ -47,7 +47,8 @@ smaller_than(const double& a, const double& b)
|
||||
}
|
||||
|
||||
static int
|
||||
write_runningtimes(const char* path, const int n, const double min, const double max, const double median, const double perc)
|
||||
write_runningtimes(const char* path, const int n, const double min, const double max,
|
||||
const double median, const double perc)
|
||||
{
|
||||
FILE* fp;
|
||||
fp = fopen(path, "a");
|
||||
@@ -80,7 +81,8 @@ int
|
||||
run_benchmark(void)
|
||||
{
|
||||
char runningtime_path[256];
|
||||
sprintf(runningtime_path, "%s_%s_runningtimes.out", AC_DOUBLE_PRECISION ? "double" : "float", GEN_BENCHMARK_RK3 ? "rk3substep" : "fullstep");
|
||||
sprintf(runningtime_path, "%s_%s_runningtimes.out", AC_DOUBLE_PRECISION ? "double" : "float",
|
||||
GEN_BENCHMARK_RK3 ? "rk3substep" : "fullstep");
|
||||
|
||||
FILE* fp;
|
||||
fp = fopen(runningtime_path, "w");
|
||||
@@ -88,13 +90,14 @@ run_benchmark(void)
|
||||
if (fp != NULL) {
|
||||
fprintf(fp, "n, min, max, median, perc\n");
|
||||
fclose(fp);
|
||||
} else {
|
||||
}
|
||||
else {
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
|
||||
#define N_STEP_SIZE (128)
|
||||
#define MAX_MESH_DIM (128)
|
||||
#define NUM_ITERS (100)
|
||||
#define N_STEP_SIZE (128)
|
||||
#define MAX_MESH_DIM (128)
|
||||
#define NUM_ITERS (100)
|
||||
for (int n = N_STEP_SIZE; n <= MAX_MESH_DIM; n += N_STEP_SIZE) {
|
||||
/* Parse configs */
|
||||
AcMeshInfo mesh_info;
|
||||
@@ -113,7 +116,6 @@ run_benchmark(void)
|
||||
std::vector<double> results;
|
||||
results.reserve(NUM_ITERS);
|
||||
|
||||
|
||||
// Warmup
|
||||
for (int i = 0; i < 10; ++i) {
|
||||
acIntegrate(0);
|
||||
@@ -124,28 +126,35 @@ run_benchmark(void)
|
||||
for (int i = 0; i < NUM_ITERS; ++i) {
|
||||
|
||||
timer_reset(&t);
|
||||
#if GEN_BENCHMARK_RK3 == 1
|
||||
#if GEN_BENCHMARK_RK3 == 1
|
||||
acIntegrateStep(2, FLT_EPSILON);
|
||||
#else // GEN_BENCHMARK_FULL
|
||||
//const AcReal umax = acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
|
||||
const AcReal dt = AcReal(1e-2); // TODO adaptive timestep //host_timestep(umax, mesh_info);
|
||||
#else // GEN_BENCHMARK_FULL
|
||||
// const AcReal umax = acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
|
||||
const AcReal dt = AcReal(
|
||||
1e-2); // TODO adaptive timestep //host_timestep(umax, mesh_info);
|
||||
acIntegrate(dt);
|
||||
#endif
|
||||
#endif
|
||||
acSynchronize();
|
||||
|
||||
const double ms_elapsed = timer_diff_nsec(t) / 1e6;
|
||||
results.push_back(ms_elapsed);
|
||||
}
|
||||
|
||||
#define NTH_PERCENTILE (0.95)
|
||||
#define NTH_PERCENTILE (0.95)
|
||||
std::sort(results.begin(), results.end(), smaller_than);
|
||||
write_runningtimes(runningtime_path, n, results[0], results[results.size()-1], results[int(0.5 * NUM_ITERS)], results[int(NTH_PERCENTILE * NUM_ITERS)]);
|
||||
write_runningtimes(runningtime_path, n, results[0], results[results.size() - 1],
|
||||
results[int(0.5 * NUM_ITERS)], results[int(NTH_PERCENTILE * NUM_ITERS)]);
|
||||
|
||||
char percentile_path[256];
|
||||
sprintf(percentile_path, "%d_%s_%s_percentiles.out", n, AC_DOUBLE_PRECISION ? "double" : "float", GEN_BENCHMARK_RK3 ? "rk3substep" : "fullstep");
|
||||
sprintf(percentile_path, "%d_%s_%s_percentiles.out", n,
|
||||
AC_DOUBLE_PRECISION ? "double" : "float",
|
||||
GEN_BENCHMARK_RK3 ? "rk3substep" : "fullstep");
|
||||
write_percentiles(percentile_path, NUM_ITERS, results);
|
||||
|
||||
printf("%s running time %g ms, (%dth percentile, nx = %d) \n", GEN_BENCHMARK_RK3 ? "RK3 step" : "Fullstep", double(results[int(NTH_PERCENTILE * NUM_ITERS)]), int(NTH_PERCENTILE * 100), mesh_info.int_params[AC_nx]);
|
||||
printf("%s running time %g ms, (%dth percentile, nx = %d) \n",
|
||||
GEN_BENCHMARK_RK3 ? "RK3 step" : "Fullstep",
|
||||
double(results[int(NTH_PERCENTILE * NUM_ITERS)]), int(NTH_PERCENTILE * 100),
|
||||
mesh_info.int_params[AC_nx]);
|
||||
|
||||
acStore(mesh);
|
||||
acQuit();
|
||||
@@ -225,7 +234,8 @@ run_benchmark(void)
|
||||
return 0;
|
||||
}
|
||||
|
||||
#else //////////////////////////////////////////////////////////////////////////GENERATE_BENCHMARK_DATA
|
||||
#else
|
||||
//////////////////////////////////////////////////////////////////////////GENERATE_BENCHMARK_DATA
|
||||
|
||||
|
||||
|
||||
@@ -290,8 +300,8 @@ run_benchmark(void)
|
||||
|
||||
#define NTH_PERCENTILE (0.95)
|
||||
std::sort(results.begin(), results.end(), smaller_than);
|
||||
write_result(n, results[0], results[results.size()-1], results[int(0.5 * NUM_ITERS)], results[int(NTH_PERCENTILE * NUM_ITERS)]);
|
||||
write_percentiles(n, NUM_ITERS, results);
|
||||
write_result(n, results[0], results[results.size()-1], results[int(0.5 * NUM_ITERS)],
|
||||
results[int(NTH_PERCENTILE * NUM_ITERS)]); write_percentiles(n, NUM_ITERS, results);
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
@@ -63,7 +63,7 @@ parse_config(const char* path, AcMeshInfo* config)
|
||||
{
|
||||
FILE* fp;
|
||||
fp = fopen(path, "r");
|
||||
// For knowing which .conf file will be used
|
||||
// For knowing which .conf file will be used
|
||||
printf("Config file path: \n %s \n ", path);
|
||||
ERRCHK(fp != NULL);
|
||||
|
||||
@@ -79,8 +79,7 @@ parse_config(const char* path, AcMeshInfo* config)
|
||||
int idx = -1;
|
||||
if ((idx = find_str(keyword, intparam_names, NUM_INT_PARAM_TYPES)) >= 0)
|
||||
config->int_params[idx] = atoi(value);
|
||||
else if ((idx = find_str(keyword, realparam_names,
|
||||
NUM_REAL_PARAM_TYPES)) >= 0)
|
||||
else if ((idx = find_str(keyword, realparam_names, NUM_REAL_PARAM_TYPES)) >= 0)
|
||||
config->real_params[idx] = AcReal(atof(value));
|
||||
}
|
||||
|
||||
@@ -92,68 +91,66 @@ update_config(AcMeshInfo* config)
|
||||
{
|
||||
config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER;
|
||||
///////////// PAD TEST
|
||||
//config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER + PAD_SIZE;
|
||||
// config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER + PAD_SIZE;
|
||||
///////////// PAD TEST
|
||||
config->int_params[AC_my] = config->int_params[AC_ny] + STENCIL_ORDER;
|
||||
config->int_params[AC_mz] = config->int_params[AC_nz] + STENCIL_ORDER;
|
||||
|
||||
// Bounds for the computational domain, i.e. nx_min <= i < nx_max
|
||||
config->int_params[AC_nx_min] = STENCIL_ORDER / 2;
|
||||
config->int_params[AC_nx_max] = config->int_params[AC_nx_min] +
|
||||
config->int_params[AC_nx];
|
||||
config->int_params[AC_nx_max] = config->int_params[AC_nx_min] + config->int_params[AC_nx];
|
||||
config->int_params[AC_ny_min] = STENCIL_ORDER / 2;
|
||||
config->int_params[AC_ny_max] = config->int_params[AC_ny] +
|
||||
STENCIL_ORDER / 2;
|
||||
config->int_params[AC_ny_max] = config->int_params[AC_ny] + STENCIL_ORDER / 2;
|
||||
config->int_params[AC_nz_min] = STENCIL_ORDER / 2;
|
||||
config->int_params[AC_nz_max] = config->int_params[AC_nz] +
|
||||
STENCIL_ORDER / 2;
|
||||
config->int_params[AC_nz_max] = config->int_params[AC_nz] + STENCIL_ORDER / 2;
|
||||
|
||||
// Spacing
|
||||
config->real_params[AC_inv_dsx] = AcReal(1.) / config->real_params[AC_dsx];
|
||||
config->real_params[AC_inv_dsy] = AcReal(1.) / config->real_params[AC_dsy];
|
||||
config->real_params[AC_inv_dsz] = AcReal(1.) / config->real_params[AC_dsz];
|
||||
config->real_params[AC_dsmin] = min(config->real_params[AC_dsx], min(config->real_params[AC_dsy], config->real_params[AC_dsz]));
|
||||
config->real_params[AC_dsmin] = min(
|
||||
config->real_params[AC_dsx], min(config->real_params[AC_dsy], config->real_params[AC_dsz]));
|
||||
|
||||
// Real grid coordanates (DEFINE FOR GRID WITH THE GHOST ZONES)
|
||||
config->real_params[AC_xlen] = config->real_params[AC_dsx]*config->int_params[AC_mx];
|
||||
config->real_params[AC_ylen] = config->real_params[AC_dsy]*config->int_params[AC_my];
|
||||
config->real_params[AC_zlen] = config->real_params[AC_dsz]*config->int_params[AC_mz];
|
||||
config->real_params[AC_xlen] = config->real_params[AC_dsx] * config->int_params[AC_mx];
|
||||
config->real_params[AC_ylen] = config->real_params[AC_dsy] * config->int_params[AC_my];
|
||||
config->real_params[AC_zlen] = config->real_params[AC_dsz] * config->int_params[AC_mz];
|
||||
|
||||
config->real_params[AC_xorig] = AcReal(.5) * config->real_params[AC_xlen];
|
||||
config->real_params[AC_yorig] = AcReal(.5) * config->real_params[AC_ylen];
|
||||
config->real_params[AC_zorig] = AcReal(.5) * config->real_params[AC_zlen];
|
||||
config->real_params[AC_xorig] = AcReal(.5) * config->real_params[AC_xlen];
|
||||
config->real_params[AC_yorig] = AcReal(.5) * config->real_params[AC_ylen];
|
||||
config->real_params[AC_zorig] = AcReal(.5) * config->real_params[AC_zlen];
|
||||
|
||||
/* Additional helper params */
|
||||
// Int helpers
|
||||
config->int_params[AC_mxy] = config->int_params[AC_mx] *
|
||||
config->int_params[AC_my];
|
||||
config->int_params[AC_nxy] = config->int_params[AC_nx] *
|
||||
config->int_params[AC_ny];
|
||||
config->int_params[AC_nxyz] = config->int_params[AC_nxy] *
|
||||
config->int_params[AC_nz];
|
||||
config->int_params[AC_mxy] = config->int_params[AC_mx] * config->int_params[AC_my];
|
||||
config->int_params[AC_nxy] = config->int_params[AC_nx] * config->int_params[AC_ny];
|
||||
config->int_params[AC_nxyz] = config->int_params[AC_nxy] * config->int_params[AC_nz];
|
||||
|
||||
// Real helpers
|
||||
config->real_params[AC_cs2_sound] = config->real_params[AC_cs_sound] *
|
||||
config->real_params[AC_cs_sound];
|
||||
|
||||
config->real_params[AC_cv_sound] = config->real_params[AC_cp_sound] / config->real_params[AC_gamma];
|
||||
config->real_params[AC_cv_sound] = config->real_params[AC_cp_sound] /
|
||||
config->real_params[AC_gamma];
|
||||
|
||||
AcReal G_CONST_CGS = AcReal(6.674e-8); // g/cm3/s GGS definition //TODO define in a separate module
|
||||
AcReal M_sun = AcReal(1.989e33); // g solar mass
|
||||
AcReal G_CONST_CGS = AcReal(
|
||||
6.674e-8); // g/cm3/s GGS definition //TODO define in a separate module
|
||||
AcReal M_sun = AcReal(1.989e33); // g solar mass
|
||||
|
||||
config->real_params[AC_M_star] = config->real_params[AC_M_star]*M_sun /
|
||||
( (config->real_params[AC_unit_length]*
|
||||
config->real_params[AC_unit_length]*
|
||||
config->real_params[AC_unit_length]) *
|
||||
config->real_params[AC_unit_density] ) ;
|
||||
config->real_params[AC_M_star] = config->real_params[AC_M_star] * M_sun /
|
||||
((config->real_params[AC_unit_length] *
|
||||
config->real_params[AC_unit_length] *
|
||||
config->real_params[AC_unit_length]) *
|
||||
config->real_params[AC_unit_density]);
|
||||
|
||||
config->real_params[AC_G_CONST] = G_CONST_CGS /
|
||||
( (config->real_params[AC_unit_velocity]*config->real_params[AC_unit_velocity]) /
|
||||
(config->real_params[AC_unit_density] *config->real_params[AC_unit_length]) ) ;
|
||||
|
||||
config->real_params[AC_GM_star] = config->real_params[AC_M_star]*config->real_params[AC_G_CONST];
|
||||
config->real_params[AC_sq2GM_star] = AcReal(sqrt(AcReal(2)*config->real_params[AC_GM_star]));
|
||||
config->real_params[AC_G_CONST] = G_CONST_CGS / ((config->real_params[AC_unit_velocity] *
|
||||
config->real_params[AC_unit_velocity]) /
|
||||
(config->real_params[AC_unit_density] *
|
||||
config->real_params[AC_unit_length]));
|
||||
|
||||
config->real_params[AC_GM_star] = config->real_params[AC_M_star] *
|
||||
config->real_params[AC_G_CONST];
|
||||
config->real_params[AC_sq2GM_star] = AcReal(sqrt(AcReal(2) * config->real_params[AC_GM_star]));
|
||||
|
||||
const bool print_config = true;
|
||||
if (print_config) {
|
||||
|
@@ -50,12 +50,12 @@ static const int window_bpp = 32; // Bits per pixel
|
||||
SDL_Surface* surfaces[NUM_VTXBUF_HANDLES];
|
||||
static int datasurface_width = -1;
|
||||
static int datasurface_height = -1;
|
||||
static int k_slice = 0;
|
||||
static int k_slice_max = 0;
|
||||
static int k_slice = 0;
|
||||
static int k_slice_max = 0;
|
||||
|
||||
// Colors
|
||||
static SDL_Color color_bg = (SDL_Color){30, 30, 35, 255};
|
||||
static const int num_tiles = NUM_VTXBUF_HANDLES + 1;
|
||||
static SDL_Color color_bg = (SDL_Color){30, 30, 35, 255};
|
||||
static const int num_tiles = NUM_VTXBUF_HANDLES + 1;
|
||||
static const int tiles_per_row = 3;
|
||||
|
||||
/*
|
||||
@@ -82,10 +82,9 @@ static Camera camera = (Camera){(float2){.0f, .0f}, 1.f};
|
||||
static inline vec4
|
||||
project_ortho(const float2& pos, const float2& bbox, const float2& wdims)
|
||||
{
|
||||
const vec4 rect = (vec4){
|
||||
camera.scale * (pos.x - camera.pos.x) + 0.5f * wdims.x,
|
||||
camera.scale * (pos.y - camera.pos.y) + 0.5f * wdims.y,
|
||||
camera.scale * bbox.x, camera.scale * bbox.y};
|
||||
const vec4 rect = (vec4){camera.scale * (pos.x - camera.pos.x) + 0.5f * wdims.x,
|
||||
camera.scale * (pos.y - camera.pos.y) + 0.5f * wdims.y,
|
||||
camera.scale * bbox.x, camera.scale * bbox.y};
|
||||
|
||||
return rect;
|
||||
}
|
||||
@@ -103,13 +102,12 @@ renderer_init(const int& mx, const int& my)
|
||||
SDL_InitSubSystem(SDL_INIT_VIDEO | SDL_INIT_EVENTS);
|
||||
|
||||
// Setup window
|
||||
window = SDL_CreateWindow("Astaroth", SDL_WINDOWPOS_UNDEFINED,
|
||||
SDL_WINDOWPOS_UNDEFINED, window_width,
|
||||
window_height, SDL_WINDOW_SHOWN);
|
||||
window = SDL_CreateWindow("Astaroth", SDL_WINDOWPOS_UNDEFINED, SDL_WINDOWPOS_UNDEFINED,
|
||||
window_width, window_height, SDL_WINDOW_SHOWN);
|
||||
|
||||
// Setup SDL renderer
|
||||
renderer = SDL_CreateRenderer(window, -1, SDL_RENDERER_ACCELERATED);
|
||||
//SDL_SetWindowFullscreen(window, SDL_WINDOW_FULLSCREEN_DESKTOP);
|
||||
// SDL_SetWindowFullscreen(window, SDL_WINDOW_FULLSCREEN_DESKTOP);
|
||||
SDL_GetWindowSize(window, &window_width, &window_height);
|
||||
|
||||
SDL_SetHint(SDL_HINT_RENDER_SCALE_QUALITY, "1"); // Linear filtering
|
||||
@@ -118,24 +116,24 @@ renderer_init(const int& mx, const int& my)
|
||||
datasurface_height = my;
|
||||
// vec drawing uses the surface of the first component, no memory issues here
|
||||
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i)
|
||||
surfaces[i] = SDL_CreateRGBSurfaceWithFormat(
|
||||
0, datasurface_width, datasurface_height, window_bpp,
|
||||
SDL_PIXELFORMAT_RGBA8888);
|
||||
surfaces[i] = SDL_CreateRGBSurfaceWithFormat(0, datasurface_width, datasurface_height,
|
||||
window_bpp, SDL_PIXELFORMAT_RGBA8888);
|
||||
|
||||
camera.pos = (float2){.5f * tiles_per_row * datasurface_width - .5f * datasurface_width,
|
||||
-.5f * (num_tiles / tiles_per_row) * datasurface_height + .5f * datasurface_height};
|
||||
camera.pos = (float2){.5f * tiles_per_row * datasurface_width - .5f * datasurface_width,
|
||||
-.5f * (num_tiles / tiles_per_row) * datasurface_height +
|
||||
.5f * datasurface_height};
|
||||
camera.scale = min(window_width / float(datasurface_width * tiles_per_row),
|
||||
window_height / float(datasurface_height * (num_tiles/tiles_per_row)));
|
||||
window_height / float(datasurface_height * (num_tiles / tiles_per_row)));
|
||||
|
||||
SDL_RendererInfo renderer_info;
|
||||
SDL_GetRendererInfo(renderer, &renderer_info);
|
||||
printf("SDL renderer max texture dims: (%d, %d)\n", renderer_info.max_texture_width, renderer_info.max_texture_height);
|
||||
printf("SDL renderer max texture dims: (%d, %d)\n", renderer_info.max_texture_width,
|
||||
renderer_info.max_texture_height);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int
|
||||
set_pixel(const int& i, const int& j, const uint32_t& color,
|
||||
SDL_Surface* surface)
|
||||
set_pixel(const int& i, const int& j, const uint32_t& color, SDL_Surface* surface)
|
||||
{
|
||||
uint32_t* pixels = (uint32_t*)surface->pixels;
|
||||
pixels[i + j * surface->w] = color;
|
||||
@@ -143,22 +141,21 @@ set_pixel(const int& i, const int& j, const uint32_t& color,
|
||||
}
|
||||
|
||||
static int
|
||||
draw_vertex_buffer(const AcMesh& mesh, const VertexBufferHandle& vertex_buffer,
|
||||
const int& tile)
|
||||
draw_vertex_buffer(const AcMesh& mesh, const VertexBufferHandle& vertex_buffer, const int& tile)
|
||||
{
|
||||
const float xoffset = (tile % tiles_per_row) * datasurface_width;
|
||||
const float yoffset = - (tile / tiles_per_row) * datasurface_height;
|
||||
const float yoffset = -(tile / tiles_per_row) * datasurface_height;
|
||||
|
||||
/*
|
||||
const float max = float(model_reduce_scal(mesh, RTYPE_MAX, vertex_buffer));
|
||||
const float min = float(model_reduce_scal(mesh, RTYPE_MIN, vertex_buffer));
|
||||
*/
|
||||
const float max = float(acReduceScal(RTYPE_MAX, vertex_buffer));
|
||||
const float min = float(acReduceScal(RTYPE_MIN, vertex_buffer));
|
||||
const float max = float(acReduceScal(RTYPE_MAX, vertex_buffer));
|
||||
const float min = float(acReduceScal(RTYPE_MIN, vertex_buffer));
|
||||
const float range = fabsf(max - min);
|
||||
const float mid = max - .5f * range;
|
||||
|
||||
const int k = k_slice; //mesh.info.int_params[AC_mz] / 2;
|
||||
const int k = k_slice; // mesh.info.int_params[AC_mz] / 2;
|
||||
|
||||
for (int j = 0; j < mesh.info.int_params[AC_my]; ++j) {
|
||||
for (int i = 0; i < mesh.info.int_params[AC_mx]; ++i) {
|
||||
@@ -166,29 +163,23 @@ draw_vertex_buffer(const AcMesh& mesh, const VertexBufferHandle& vertex_buffer,
|
||||
|
||||
const int idx = AC_VTXBUF_IDX(i, j, k, mesh.info);
|
||||
const uint8_t shade = (uint8_t)(
|
||||
255.f *
|
||||
(fabsf(float(mesh.vertex_buffer[vertex_buffer][idx]) - mid)) /
|
||||
range);
|
||||
255.f * (fabsf(float(mesh.vertex_buffer[vertex_buffer][idx]) - mid)) / range);
|
||||
uint8_t color[4] = {0, 0, 0, 255};
|
||||
color[tile % 3] = shade;
|
||||
const uint32_t mapped_color = SDL_MapRGBA(
|
||||
surfaces[vertex_buffer]->format, color[0], color[1], color[2],
|
||||
color[3]);
|
||||
const uint32_t mapped_color = SDL_MapRGBA(surfaces[vertex_buffer]->format, color[0],
|
||||
color[1], color[2], color[3]);
|
||||
set_pixel(i, j, mapped_color, surfaces[vertex_buffer]);
|
||||
}
|
||||
}
|
||||
|
||||
const float2 pos = (float2){xoffset, yoffset};
|
||||
const float2 bbox = (float2){.5f * datasurface_width,
|
||||
.5f * datasurface_height};
|
||||
const float2 bbox = (float2){.5f * datasurface_width, .5f * datasurface_height};
|
||||
const float2 wsize = (float2){float(window_width), float(window_height)};
|
||||
const vec4 rectf = project_ortho(pos, bbox, wsize);
|
||||
SDL_Rect rect = (SDL_Rect){
|
||||
int(rectf.x - rectf.w), int(wsize.y - rectf.y - rectf.h),
|
||||
int(ceil(2.f * rectf.w)), int(ceil(2.f * rectf.h))};
|
||||
SDL_Rect rect = (SDL_Rect){int(rectf.x - rectf.w), int(wsize.y - rectf.y - rectf.h),
|
||||
int(ceil(2.f * rectf.w)), int(ceil(2.f * rectf.h))};
|
||||
|
||||
SDL_Texture* tex = SDL_CreateTextureFromSurface(renderer,
|
||||
surfaces[vertex_buffer]);
|
||||
SDL_Texture* tex = SDL_CreateTextureFromSurface(renderer, surfaces[vertex_buffer]);
|
||||
SDL_RenderCopy(renderer, tex, NULL, &rect);
|
||||
SDL_DestroyTexture(tex);
|
||||
|
||||
@@ -196,14 +187,12 @@ draw_vertex_buffer(const AcMesh& mesh, const VertexBufferHandle& vertex_buffer,
|
||||
}
|
||||
|
||||
static int
|
||||
draw_vertex_buffer_vec(const AcMesh& mesh,
|
||||
const VertexBufferHandle& vertex_buffer_a,
|
||||
draw_vertex_buffer_vec(const AcMesh& mesh, const VertexBufferHandle& vertex_buffer_a,
|
||||
const VertexBufferHandle& vertex_buffer_b,
|
||||
const VertexBufferHandle& vertex_buffer_c,
|
||||
const int& tile)
|
||||
const VertexBufferHandle& vertex_buffer_c, const int& tile)
|
||||
{
|
||||
const float xoffset = (tile % tiles_per_row) * datasurface_width;
|
||||
const float yoffset = - (tile / tiles_per_row) * datasurface_height;
|
||||
const float yoffset = -(tile / tiles_per_row) * datasurface_height;
|
||||
|
||||
/*
|
||||
const float maxx = float(
|
||||
@@ -215,52 +204,41 @@ draw_vertex_buffer_vec(const AcMesh& mesh,
|
||||
min(model_reduce_scal(mesh, RTYPE_MIN, vertex_buffer_b),
|
||||
model_reduce_scal(mesh, RTYPE_MIN, vertex_buffer_c))));
|
||||
*/
|
||||
const float maxx = float(
|
||||
max(acReduceScal(RTYPE_MAX, vertex_buffer_a),
|
||||
max(acReduceScal(RTYPE_MAX, vertex_buffer_b),
|
||||
acReduceScal(RTYPE_MAX, vertex_buffer_c))));
|
||||
const float minn = float(
|
||||
min(acReduceScal(RTYPE_MIN, vertex_buffer_a),
|
||||
min(acReduceScal(RTYPE_MIN, vertex_buffer_b),
|
||||
acReduceScal(RTYPE_MIN, vertex_buffer_c))));
|
||||
const float maxx = float(max(
|
||||
acReduceScal(RTYPE_MAX, vertex_buffer_a),
|
||||
max(acReduceScal(RTYPE_MAX, vertex_buffer_b), acReduceScal(RTYPE_MAX, vertex_buffer_c))));
|
||||
const float minn = float(min(
|
||||
acReduceScal(RTYPE_MIN, vertex_buffer_a),
|
||||
min(acReduceScal(RTYPE_MIN, vertex_buffer_b), acReduceScal(RTYPE_MIN, vertex_buffer_c))));
|
||||
const float range = fabsf(maxx - minn);
|
||||
const float mid = maxx - .5f * range;
|
||||
|
||||
const int k = k_slice; //mesh.info.int_params[AC_mz] / 2;
|
||||
const int k = k_slice; // mesh.info.int_params[AC_mz] / 2;
|
||||
for (int j = 0; j < mesh.info.int_params[AC_my]; ++j) {
|
||||
for (int i = 0; i < mesh.info.int_params[AC_mx]; ++i) {
|
||||
ERRCHK(i < datasurface_width && j < datasurface_height);
|
||||
|
||||
const int idx = AC_VTXBUF_IDX(i, j, k, mesh.info);
|
||||
const uint8_t r = (uint8_t)(
|
||||
255.f *
|
||||
(fabsf(float(mesh.vertex_buffer[vertex_buffer_a][idx]) - mid)) /
|
||||
range);
|
||||
255.f * (fabsf(float(mesh.vertex_buffer[vertex_buffer_a][idx]) - mid)) / range);
|
||||
const uint8_t g = (uint8_t)(
|
||||
255.f *
|
||||
(fabsf(float(mesh.vertex_buffer[vertex_buffer_b][idx]) - mid)) /
|
||||
range);
|
||||
255.f * (fabsf(float(mesh.vertex_buffer[vertex_buffer_b][idx]) - mid)) / range);
|
||||
const uint8_t b = (uint8_t)(
|
||||
255.f *
|
||||
(fabsf(float(mesh.vertex_buffer[vertex_buffer_c][idx]) - mid)) /
|
||||
range);
|
||||
const uint32_t mapped_color = SDL_MapRGBA(
|
||||
surfaces[vertex_buffer_a]->format, r, g, b, 255);
|
||||
255.f * (fabsf(float(mesh.vertex_buffer[vertex_buffer_c][idx]) - mid)) / range);
|
||||
const uint32_t mapped_color = SDL_MapRGBA(surfaces[vertex_buffer_a]->format, r, g, b,
|
||||
255);
|
||||
set_pixel(i, j, mapped_color, surfaces[vertex_buffer_a]);
|
||||
}
|
||||
}
|
||||
|
||||
const float2 pos = (float2){xoffset, yoffset};
|
||||
const float2 bbox = (float2){.5f * datasurface_width,
|
||||
.5f * datasurface_height};
|
||||
const float2 bbox = (float2){.5f * datasurface_width, .5f * datasurface_height};
|
||||
const float2 wsize = (float2){float(window_width), float(window_height)};
|
||||
const vec4 rectf = project_ortho(pos, bbox, wsize);
|
||||
SDL_Rect rect = (SDL_Rect){
|
||||
int(rectf.x - rectf.w), int(wsize.y - rectf.y - rectf.h),
|
||||
int(ceil(2.f * rectf.w)), int(ceil(2.f * rectf.h))};
|
||||
SDL_Rect rect = (SDL_Rect){int(rectf.x - rectf.w), int(wsize.y - rectf.y - rectf.h),
|
||||
int(ceil(2.f * rectf.w)), int(ceil(2.f * rectf.h))};
|
||||
|
||||
SDL_Texture* tex = SDL_CreateTextureFromSurface(renderer,
|
||||
surfaces[vertex_buffer_a]);
|
||||
SDL_Texture* tex = SDL_CreateTextureFromSurface(renderer, surfaces[vertex_buffer_a]);
|
||||
SDL_RenderCopy(renderer, tex, NULL, &rect);
|
||||
SDL_DestroyTexture(tex);
|
||||
|
||||
@@ -272,13 +250,11 @@ renderer_draw(const AcMesh& mesh)
|
||||
{
|
||||
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i)
|
||||
draw_vertex_buffer(mesh, VertexBufferHandle(i), i);
|
||||
draw_vertex_buffer_vec(mesh, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ,
|
||||
NUM_VTXBUF_HANDLES);
|
||||
draw_vertex_buffer_vec(mesh, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, NUM_VTXBUF_HANDLES);
|
||||
|
||||
// Drawing done, present
|
||||
SDL_RenderPresent(renderer);
|
||||
SDL_SetRenderDrawColor(renderer, color_bg.r, color_bg.g, color_bg.b,
|
||||
color_bg.a);
|
||||
SDL_SetRenderDrawColor(renderer, color_bg.r, color_bg.g, color_bg.b, color_bg.a);
|
||||
SDL_RenderClear(renderer);
|
||||
|
||||
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
|
||||
@@ -404,14 +380,14 @@ run_renderer(void)
|
||||
|
||||
/* Step the simulation */
|
||||
#if 1
|
||||
const AcReal umax = acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY,
|
||||
VTXBUF_UUZ);
|
||||
const AcReal umax = acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
|
||||
const AcReal dt = host_timestep(umax, mesh_info);
|
||||
acIntegrate(dt);
|
||||
#else
|
||||
ModelMesh* model_mesh = modelmesh_create(mesh->info);
|
||||
const AcReal umax = AcReal(model_reduce_vec(*model_mesh, RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ));
|
||||
const AcReal dt = host_timestep(umax, mesh_info);
|
||||
const AcReal umax = AcReal(
|
||||
model_reduce_vec(*model_mesh, RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ));
|
||||
const AcReal dt = host_timestep(umax, mesh_info);
|
||||
acmesh_to_modelmesh(*mesh, model_mesh);
|
||||
model_rk3(dt, model_mesh);
|
||||
modelmesh_to_acmesh(*model_mesh, mesh);
|
||||
@@ -425,7 +401,7 @@ run_renderer(void)
|
||||
/* Render */
|
||||
const float timer_diff_sec = timer_diff_nsec(frame_timer) / 1e9f;
|
||||
if (timer_diff_sec >= desired_frame_time) {
|
||||
//acStore(mesh);
|
||||
// acStore(mesh);
|
||||
const int num_vertices = mesh->info.int_params[AC_mxy];
|
||||
const int3 dst = (int3){0, 0, k_slice};
|
||||
acStoreWithOffset(dst, num_vertices, mesh);
|
||||
|
@@ -60,23 +60,23 @@ print_diagnostics(const AcMesh& mesh, const int& step, const AcReal& dt)
|
||||
}
|
||||
*/
|
||||
|
||||
//Write all setting info into a separate ascii file. This is done to guarantee
|
||||
//that we have the data specifi information in the thing, even though in
|
||||
//principle these things are in the astaroth.conf.
|
||||
static inline
|
||||
void write_mesh_info(const AcMeshInfo* config)
|
||||
// Write all setting info into a separate ascii file. This is done to guarantee
|
||||
// that we have the data specifi information in the thing, even though in
|
||||
// principle these things are in the astaroth.conf.
|
||||
static inline void
|
||||
write_mesh_info(const AcMeshInfo* config)
|
||||
{
|
||||
|
||||
|
||||
FILE* infotxt;
|
||||
|
||||
infotxt = fopen("purge.sh","w");
|
||||
infotxt = fopen("purge.sh", "w");
|
||||
fprintf(infotxt, "#!/bin/bash\n");
|
||||
fprintf(infotxt, "rm *.list *.mesh *.ts purge.sh\n");
|
||||
fclose(infotxt);
|
||||
fclose(infotxt);
|
||||
|
||||
infotxt = fopen("mesh_info.list","w");
|
||||
infotxt = fopen("mesh_info.list", "w");
|
||||
|
||||
//Total grid dimensions
|
||||
// Total grid dimensions
|
||||
fprintf(infotxt, "int AC_mx %i \n", config->int_params[AC_mx]);
|
||||
fprintf(infotxt, "int AC_my %i \n", config->int_params[AC_my]);
|
||||
fprintf(infotxt, "int AC_mz %i \n", config->int_params[AC_mz]);
|
||||
@@ -96,30 +96,28 @@ void write_mesh_info(const AcMeshInfo* config)
|
||||
fprintf(infotxt, "real AC_inv_dsx %e \n", (double)config->real_params[AC_inv_dsx]);
|
||||
fprintf(infotxt, "real AC_inv_dsy %e \n", (double)config->real_params[AC_inv_dsy]);
|
||||
fprintf(infotxt, "real AC_inv_dsz %e \n", (double)config->real_params[AC_inv_dsz]);
|
||||
fprintf(infotxt, "real AC_dsmin %e \n", (double)config->real_params[AC_dsmin ]);
|
||||
fprintf(infotxt, "real AC_dsmin %e \n", (double)config->real_params[AC_dsmin]);
|
||||
|
||||
/* Additional helper params */
|
||||
// Int helpers
|
||||
fprintf(infotxt, "int AC_mxy %i \n", config->int_params[AC_mxy ]);
|
||||
fprintf(infotxt, "int AC_nxy %i \n", config->int_params[AC_nxy ]);
|
||||
fprintf(infotxt, "int AC_mxy %i \n", config->int_params[AC_mxy]);
|
||||
fprintf(infotxt, "int AC_nxy %i \n", config->int_params[AC_nxy]);
|
||||
fprintf(infotxt, "int AC_nxyz %i \n", config->int_params[AC_nxyz]);
|
||||
|
||||
// Real helpers
|
||||
fprintf(infotxt, "real AC_cs2_sound %e \n", (double)config->real_params[AC_cs2_sound]);
|
||||
fprintf(infotxt, "real AC_cv_sound %e \n", (double)config->real_params[AC_cv_sound ]);
|
||||
fprintf(infotxt, "real AC_cv_sound %e \n", (double)config->real_params[AC_cv_sound]);
|
||||
|
||||
fclose(infotxt);
|
||||
}
|
||||
|
||||
|
||||
//This funtion writes a run state into a set of C binaries. For the sake of
|
||||
//accuracy, all floating point numbers are to be saved in long double precision
|
||||
//regardless of the choise of accuracy during runtime.
|
||||
// This funtion writes a run state into a set of C binaries. For the sake of
|
||||
// accuracy, all floating point numbers are to be saved in long double precision
|
||||
// regardless of the choise of accuracy during runtime.
|
||||
static inline void
|
||||
save_mesh(const AcMesh &save_mesh, const int step,
|
||||
const AcReal t_step)
|
||||
save_mesh(const AcMesh& save_mesh, const int step, const AcReal t_step)
|
||||
{
|
||||
FILE* save_ptr;
|
||||
FILE* save_ptr;
|
||||
|
||||
for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w) {
|
||||
const size_t n = AC_VTXBUF_SIZE(save_mesh.info);
|
||||
@@ -128,7 +126,7 @@ save_mesh(const AcMesh &save_mesh, const int step,
|
||||
char cstep[10];
|
||||
char bin_filename[80] = "\0";
|
||||
|
||||
//sprintf(bin_filename, "");
|
||||
// sprintf(bin_filename, "");
|
||||
|
||||
sprintf(cstep, "%d", step);
|
||||
|
||||
@@ -139,30 +137,27 @@ save_mesh(const AcMesh &save_mesh, const int step,
|
||||
|
||||
printf("Savefile %s \n", bin_filename);
|
||||
|
||||
save_ptr = fopen(bin_filename,"wb");
|
||||
save_ptr = fopen(bin_filename, "wb");
|
||||
|
||||
//Start file with time stamp
|
||||
long double write_long_buf = (long double) t_step;
|
||||
// Start file with time stamp
|
||||
long double write_long_buf = (long double)t_step;
|
||||
fwrite(&write_long_buf, sizeof(long double), 1, save_ptr);
|
||||
//Grid data
|
||||
// Grid data
|
||||
for (size_t i = 0; i < n; ++i) {
|
||||
const AcReal point_val = save_mesh.vertex_buffer[VertexBufferHandle(w)][i];
|
||||
long double write_long_buf = (long double) point_val;
|
||||
const AcReal point_val = save_mesh.vertex_buffer[VertexBufferHandle(w)][i];
|
||||
long double write_long_buf = (long double)point_val;
|
||||
fwrite(&write_long_buf, sizeof(long double), 1, save_ptr);
|
||||
}
|
||||
fclose(save_ptr);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
// This function prints out the diagnostic values to std.out and also saves and
|
||||
// appends an ascii file to contain all the result.
|
||||
// appends an ascii file to contain all the result.
|
||||
static inline void
|
||||
print_diagnostics(const int step, const AcReal dt, const AcReal t_step, FILE *diag_file)
|
||||
print_diagnostics(const int step, const AcReal dt, const AcReal t_step, FILE* diag_file)
|
||||
{
|
||||
|
||||
|
||||
AcReal buf_rms, buf_max, buf_min;
|
||||
const int max_name_width = 16;
|
||||
|
||||
@@ -172,20 +167,19 @@ print_diagnostics(const int step, const AcReal dt, const AcReal t_step, FILE *di
|
||||
buf_rms = acReduceVec(RTYPE_RMS, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
|
||||
|
||||
// MV: The ordering in the earlier version was wrong in terms of variable
|
||||
// MV: name and its diagnostics.
|
||||
// MV: name and its diagnostics.
|
||||
printf("Step %d, t_step %.3e, dt %e s\n", step, double(t_step), double(dt));
|
||||
printf(" %*s: min %.3e,\trms %.3e,\tmax %.3e\n", max_name_width, "uu total",
|
||||
double(buf_min), double(buf_rms), double(buf_max));
|
||||
fprintf(diag_file, "%d %e %e %e %e %e ", step, double(t_step), double(dt),
|
||||
double(buf_min), double(buf_rms), double(buf_max));
|
||||
|
||||
printf(" %*s: min %.3e,\trms %.3e,\tmax %.3e\n", max_name_width, "uu total", double(buf_min),
|
||||
double(buf_rms), double(buf_max));
|
||||
fprintf(diag_file, "%d %e %e %e %e %e ", step, double(t_step), double(dt), double(buf_min),
|
||||
double(buf_rms), double(buf_max));
|
||||
|
||||
// Calculate rms, min and max from the variables as scalars
|
||||
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
|
||||
buf_max = acReduceScal(RTYPE_MAX, VertexBufferHandle(i));
|
||||
buf_min = acReduceScal(RTYPE_MIN, VertexBufferHandle(i));
|
||||
buf_rms = acReduceScal(RTYPE_RMS, VertexBufferHandle(i));
|
||||
|
||||
|
||||
printf(" %*s: min %.3e,\trms %.3e,\tmax %.3e\n", max_name_width, vtxbuf_names[i],
|
||||
double(buf_min), double(buf_rms), double(buf_max));
|
||||
fprintf(diag_file, "%e %e %e ", double(buf_min), double(buf_rms), double(buf_max));
|
||||
@@ -194,11 +188,11 @@ print_diagnostics(const int step, const AcReal dt, const AcReal t_step, FILE *di
|
||||
fprintf(diag_file, "\n");
|
||||
}
|
||||
|
||||
/*
|
||||
MV NOTE: At the moment I have no clear idea how to calculate magnetic
|
||||
diagnostic variables from grid. Vector potential measures have a limited
|
||||
value. TODO: Smart way to get brms, bmin and bmax.
|
||||
*/
|
||||
/*
|
||||
MV NOTE: At the moment I have no clear idea how to calculate magnetic
|
||||
diagnostic variables from grid. Vector potential measures have a limited
|
||||
value. TODO: Smart way to get brms, bmin and bmax.
|
||||
*/
|
||||
|
||||
int
|
||||
run_simulation(void)
|
||||
@@ -213,16 +207,16 @@ run_simulation(void)
|
||||
acInit(mesh_info);
|
||||
acLoad(*mesh);
|
||||
|
||||
|
||||
FILE *diag_file;
|
||||
FILE* diag_file;
|
||||
diag_file = fopen("timeseries.ts", "a");
|
||||
// TODO Get time from earlier state.
|
||||
// TODO Get time from earlier state.
|
||||
AcReal t_step = 0.0;
|
||||
|
||||
// Generate the title row.
|
||||
fprintf(diag_file, "step t_step dt uu_total_min uu_total_rms uu_total_max ");
|
||||
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
|
||||
fprintf(diag_file, "%s_min %s_rms %s_max ", vtxbuf_names[i], vtxbuf_names[i], vtxbuf_names[i]);
|
||||
fprintf(diag_file, "%s_min %s_rms %s_max ", vtxbuf_names[i], vtxbuf_names[i],
|
||||
vtxbuf_names[i]);
|
||||
}
|
||||
|
||||
fprintf(diag_file, "\n");
|
||||
@@ -234,56 +228,54 @@ run_simulation(void)
|
||||
acStore(mesh);
|
||||
save_mesh(*mesh, 0, t_step);
|
||||
|
||||
const int max_steps = mesh_info.int_params[AC_max_steps];
|
||||
const int max_steps = mesh_info.int_params[AC_max_steps];
|
||||
const int save_steps = mesh_info.int_params[AC_save_steps];
|
||||
const int bin_save_steps = mesh_info.int_params[AC_bin_steps]; //TODO Get from mesh_info
|
||||
const int bin_save_steps = mesh_info.int_params[AC_bin_steps]; // TODO Get from mesh_info
|
||||
|
||||
AcReal bin_save_t = mesh_info.real_params[AC_bin_save_t];
|
||||
AcReal bin_crit_t = bin_save_t;
|
||||
|
||||
/* Step the simulation */
|
||||
for (int i = 1; i < max_steps; ++i) {
|
||||
const AcReal umax = acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY,
|
||||
VTXBUF_UUZ);
|
||||
const AcReal umax = acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
|
||||
const AcReal dt = host_timestep(umax, mesh_info);
|
||||
acIntegrate(dt);
|
||||
|
||||
t_step += dt;
|
||||
t_step += dt;
|
||||
|
||||
/* Save the simulation state and print diagnostics */
|
||||
if ((i % save_steps) == 0) {
|
||||
|
||||
/*
|
||||
print_diagnostics() writes out both std.out printout from the
|
||||
results and saves the diagnostics into a table for ascii file
|
||||
print_diagnostics() writes out both std.out printout from the
|
||||
results and saves the diagnostics into a table for ascii file
|
||||
timeseries.ts.
|
||||
*/
|
||||
|
||||
print_diagnostics(i, dt, t_step, diag_file);
|
||||
|
||||
/*
|
||||
We would also might want an XY-average calculating funtion,
|
||||
which can be very useful when observing behaviour of turbulent
|
||||
We would also might want an XY-average calculating funtion,
|
||||
which can be very useful when observing behaviour of turbulent
|
||||
simulations. (TODO)
|
||||
*/
|
||||
|
||||
}
|
||||
|
||||
/* Save the simulation state and print diagnostics */
|
||||
if ((i % bin_save_steps) == 0 || t_step >= bin_crit_t) {
|
||||
|
||||
/*
|
||||
This loop saves the data into simple C binaries which can be
|
||||
This loop saves the data into simple C binaries which can be
|
||||
used for analysing the data snapshots closely.
|
||||
|
||||
Saving simulation state should happen in a separate stage. We do
|
||||
not want to save it as often as diagnostics. The file format
|
||||
should IDEALLY be HDF5 which has become a well supported, portable and
|
||||
reliable data format when it comes to HPC applications.
|
||||
However, implementing it will have to for more simpler approach
|
||||
|
||||
Saving simulation state should happen in a separate stage. We do
|
||||
not want to save it as often as diagnostics. The file format
|
||||
should IDEALLY be HDF5 which has become a well supported, portable and
|
||||
reliable data format when it comes to HPC applications.
|
||||
However, implementing it will have to for more simpler approach
|
||||
to function. (TODO?)
|
||||
*/
|
||||
|
||||
|
||||
/*
|
||||
The updated mesh will be located on the GPU. Also all calls
|
||||
to the astaroth interface (functions beginning with ac*) are
|
||||
@@ -299,10 +291,8 @@ run_simulation(void)
|
||||
|
||||
save_mesh(*mesh, i, t_step);
|
||||
|
||||
bin_crit_t += bin_save_t;
|
||||
|
||||
bin_crit_t += bin_save_t;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
//////Save the final snapshot
|
||||
@@ -318,25 +308,3 @@ run_simulation(void)
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
@@ -52,8 +52,7 @@ timer_diff_nsec(const Timer start)
|
||||
{
|
||||
Timer end;
|
||||
timer_reset(&end);
|
||||
const long diff = (end.tv_sec - start.tv_sec) * 1000000000l +
|
||||
(end.tv_nsec - start.tv_nsec);
|
||||
const long diff = (end.tv_sec - start.tv_sec) * 1000000000l + (end.tv_nsec - start.tv_nsec);
|
||||
return diff;
|
||||
}
|
||||
|
||||
|
Reference in New Issue
Block a user