Reviewed the Astaroth interface. Now there's a clear distinction between synchronous and asynchronous functions. For basic usage, we provide a set of functions that are always safe to call (acIntegrate, acLoad, etc), but because of this, must be quite restricted in the sense that f.ex. the whole mesh must be loaded at once and computations cannot be executed concurrently on multiple GPUs. For more advanced users we provide asynchronous functions (such as acLoadWithOffset). Since we cannot know how the asynchronous functions are called (for example, when the integration step has been fully completed and the halos of neighboring subgrids can be safely communicated between GPUs), the responsibility of synchronization must be left to the user. In the existing implementations we currently use only the basic "safe" set of functions (except in renderer.cc), so the existing functionality has not been changed with these latests commits. Autotests also pass.

This commit is contained in:
jpekkila
2019-07-09 18:42:00 +03:00
parent 1251f61570
commit 0bda016e17
7 changed files with 298 additions and 300 deletions

View File

@@ -212,71 +212,89 @@ typedef struct {
/*
* =============================================================================
* Astaroth interface
* Astaroth interface: Basic functions. Synchronous.
* =============================================================================
*/
typedef enum {
STREAM_PRIMARY, //
STREAM_SECONDARY, //
NUM_STREAM_TYPES, //
STREAM_ALL
} StreamType;
/** Checks whether there are any CUDA devices available. Returns AC_SUCCESS if there is 1 or more,
AC_FAILURE otherwise. */
* AC_FAILURE otherwise. */
AcResult acCheckDeviceAvailability(void);
/** Synchronizes the stream shared by all GPUs in the node. Synchronizes all streams if STREAM_ALL
* passed as a parameter */
AcResult acSynchronizeStream(const StreamType stream);
/** Synchronizes the mesh distributed across multiple GPUs. Must be called if the data in the halos
* of neighboring GPUs has been modified by an asynchronous function, f.ex. acBoundcondStep() */
AcResult acSynchronizeMesh(void);
/** Starting point of all GPU computation. Handles the allocation and
initialization of *all memory needed on all GPUs in the node*. In other words,
setups everything GPU-side so that calling any other GPU interface function
afterwards does not result in illegal memory accesses. */
AcResult acInit(const AcMeshInfo& mesh_info);
/** Splits the host_mesh and distributes it among the GPUs in the node */
AcResult acLoad(const AcMesh& host_mesh);
AcResult acLoadWithOffset(const AcMesh& host_mesh, const int3& start, const int num_vertices);
/** Does all three steps of the RK3 integration and computes the boundary
conditions when necessary. Note that the boundary conditions are not applied
after the final integration step.
The result can be fetched to CPU memory with acStore(). */
AcResult acIntegrate(const AcReal& dt);
/** Performs a single RK3 step without computing boundary conditions. */
AcResult acIntegrateStep(const int& isubstep, const AcReal& dt);
/** Performs a single RK3 step without computing boundary conditions.
Operates on a three-dimensional cuboid, where start and end are the
opposite corners. */
AcResult acIntegrateStepWithOffset(const int& isubstep, const AcReal& dt, const int3& start,
const int3& end);
/** Applies boundary conditions on the GPU meshs and communicates the
ghost zones among GPUs if necessary */
AcResult acBoundcondStep(void);
/** Performs a scalar reduction on all GPUs in the node and returns the result.
*/
AcReal acReduceScal(const ReductionType& rtype, const VertexBufferHandle& a);
/** Performs a vector reduction on all GPUs in the node and returns the result.
*/
AcReal acReduceVec(const ReductionType& rtype, const VertexBufferHandle& a,
const VertexBufferHandle& b, const VertexBufferHandle& c);
/** Stores the mesh distributed among GPUs of the node back to a single host
* mesh */
AcResult acStore(AcMesh* host_mesh);
AcResult acStoreWithOffset(const int3& start, const int num_vertices, AcMesh* host_mesh);
/** Frees all GPU allocations and resets all devices in the node. Should be
* called at exit. */
AcResult acQuit(void);
/** Synchronizes all devices. All calls to Astaroth are asynchronous by default
unless otherwise stated. */
AcResult acSynchronize(void);
/** Does all three substeps of the RK3 integration and computes the boundary
conditions when necessary. The result is synchronized and the boundary conditions are applied
after the final substep, after which the result can be fetched to CPU memory with acStore. */
AcResult acIntegrate(const AcReal& dt);
/** Loads a parameter to the constant memory of all devices */
/** Performs a scalar reduction on all GPUs in the node and returns the result. Operates on the
* whole computational domain, which must be up to date and synchronized before calling
* acReduceScal.
*/
AcReal acReduceScal(const ReductionType& rtype, const VertexBufferHandle& a);
/** Performs a vector reduction on all GPUs in the node and returns the result. Operates on the
* whole computational domain, which must be up to date and synchronized before calling
* acReduceVec.
*/
AcReal acReduceVec(const ReductionType& rtype, const VertexBufferHandle& a,
const VertexBufferHandle& b, const VertexBufferHandle& c);
/** Distributes the host mesh among the GPUs in the node. Synchronous. */
AcResult acLoad(const AcMesh& host_mesh);
/** Gathers the mesh stored across GPUs in the node and stores it back to host memory. Synchronous.
*/
AcResult acStore(AcMesh* host_mesh);
/*
* =============================================================================
* Astaroth interface: Advanced functions. Asynchronous.
* =============================================================================
*/
/** Loads a parameter to the constant memory of all GPUs in the node. Asynchronous. */
AcResult acLoadDeviceConstant(const AcRealParam param, const AcReal value);
/** Auto-optimizes the library. This function is free from side-effects: the input vertex buffer is
* guaranteed not be modified.*/
AcResult acAutoOptimize(void);
/** Splits a subset of the host_mesh and distributes it among the GPUs in the node. Asynchronous. */
AcResult acLoadWithOffset(const AcMesh& host_mesh, const int3& start, const int num_vertices);
/** Gathers a subset of the data distributed among the GPUs in the node and stores the mesh back to
* CPU memory. Asynchronous.
*/
AcResult acStoreWithOffset(const int3& start, const int num_vertices, AcMesh* host_mesh);
/** Performs a single RK3 step without computing boundary conditions. Asynchronous.*/
AcResult acIntegrateStep(const int& isubstep, const AcReal& dt);
/** Performs a single RK3 step on a subset of the mesh without computing the boundary conditions.
* Asynchronous.*/
AcResult acIntegrateStepWithOffset(const int& isubstep, const AcReal& dt, const int3& start,
const int3& end);
/** Performs the boundary condition step on the GPUs in the node. Asynchronous. */
AcResult acBoundcondStep(void);
/* End extern "C" */
#ifdef __cplusplus

View File

@@ -183,6 +183,61 @@ acCheckDeviceAvailability(void)
return AC_FAILURE;
}
AcResult
acSynchronizeStream(const StreamType stream)
{
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
synchronize(devices[i], stream);
}
return AC_SUCCESS;
}
static AcResult
synchronize_halos(void)
{
// Exchanges the halos of subgrids
// After this step, the data within the main grid ranging from
// (0, 0, NGHOST) -> grid.m.x, grid.m.y, NGHOST + grid.n.z
// has been synchronized and transferred to appropriate subgrids
// We loop only to num_devices - 1 since the front and back plate of the grid is not
// transferred because their contents depend on the boundary conditions.
// IMPORTANT NOTE: the boundary conditions must be applied before calling this function!
// I.e. the halos of subgrids must contain up-to-date data!
// #pragma omp parallel for
for (int i = 0; i < num_devices - 1; ++i) {
const int num_vertices = subgrid.m.x * subgrid.m.y * NGHOST;
// ...|ooooxxx|... -> xxx|ooooooo|...
{
const int3 src = (int3){0, 0, subgrid.n.z};
const int3 dst = (int3){0, 0, 0};
copyMeshDeviceToDevice(devices[i], STREAM_PRIMARY, src, devices[(i + 1) % num_devices],
dst, num_vertices);
}
// ...|ooooooo|xxx <- ...|xxxoooo|...
{
const int3 src = (int3){0, 0, NGHOST};
const int3 dst = (int3){0, 0, NGHOST + subgrid.n.z};
copyMeshDeviceToDevice(devices[(i + 1) % num_devices], STREAM_PRIMARY, src, devices[i],
dst, num_vertices);
}
}
return AC_SUCCESS;
}
AcResult
acSynchronizeMesh(void)
{
acSynchronizeStream(STREAM_ALL);
synchronize_halos();
acSynchronizeStream(STREAM_ALL);
return AC_SUCCESS;
}
AcResult
acInit(const AcMeshInfo& config)
{
@@ -235,18 +290,187 @@ acInit(const AcMeshInfo& config)
loadGlobalGrid(devices[i], grid);
printDeviceInfo(devices[i]);
}
acSynchronizeStream(STREAM_ALL);
return AC_SUCCESS;
}
AcResult
acQuit(void)
{
acSynchronizeStream(STREAM_ALL);
for (int i = 0; i < num_devices; ++i) {
destroyDevice(devices[i]);
}
return AC_SUCCESS;
}
AcResult
acIntegrateStepWithOffset(const int& isubstep, const AcReal& dt, const int3& start, const int3& end)
{
// See the beginning of the file for an explanation of the index mapping
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
// DECOMPOSITION OFFSET HERE
const int3 d0 = (int3){NGHOST, NGHOST, NGHOST + i * subgrid.n.z};
const int3 d1 = d0 + (int3){subgrid.n.x, subgrid.n.y, subgrid.n.z};
const int3 da = max(start, d0);
const int3 db = min(end, d1);
if (db.z >= da.z) {
const int3 da_local = da - (int3){0, 0, i * subgrid.n.z};
const int3 db_local = db - (int3){0, 0, i * subgrid.n.z};
rkStep(devices[i], STREAM_PRIMARY, isubstep, da_local, db_local, dt);
}
}
return AC_SUCCESS;
}
AcResult
acIntegrateStep(const int& isubstep, const AcReal& dt)
{
const int3 start = (int3){NGHOST, NGHOST, NGHOST};
const int3 end = start + grid.n;
acIntegrateStepWithOffset(isubstep, dt, start, end);
return AC_SUCCESS;
}
static AcResult
local_boundcondstep(void)
{
if (num_devices == 1) {
boundcondStep(devices[0], STREAM_PRIMARY, (int3){0, 0, 0}, subgrid.m);
}
else {
// Local boundary conditions
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
const int3 d0 = (int3){0, 0, NGHOST}; // DECOMPOSITION OFFSET HERE
const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.n.z};
boundcondStep(devices[i], STREAM_PRIMARY, d0, d1);
}
}
return AC_SUCCESS;
}
static AcResult
global_boundcondstep(void)
{
if (num_devices > 1) {
// With periodic boundary conditions we exchange the front and back plates of the
// grid. The exchange is done between the first and last device (0 and num_devices - 1).
const int num_vertices = subgrid.m.x * subgrid.m.y * NGHOST;
// ...|ooooxxx|... -> xxx|ooooooo|...
{
const int3 src = (int3){0, 0, subgrid.n.z};
const int3 dst = (int3){0, 0, 0};
copyMeshDeviceToDevice(devices[num_devices - 1], STREAM_PRIMARY, src, devices[0], dst,
num_vertices);
}
// ...|ooooooo|xxx <- ...|xxxoooo|...
{
const int3 src = (int3){0, 0, NGHOST};
const int3 dst = (int3){0, 0, NGHOST + subgrid.n.z};
copyMeshDeviceToDevice(devices[0], STREAM_PRIMARY, src, devices[num_devices - 1], dst,
num_vertices);
}
}
return AC_SUCCESS;
}
AcResult
acBoundcondStep(void)
{
local_boundcondstep();
acSynchronizeStream(STREAM_ALL);
global_boundcondstep();
synchronize_halos();
acSynchronizeStream(STREAM_ALL);
return AC_SUCCESS;
}
static AcResult
swap_buffers(void)
{
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
swapBuffers(devices[i]);
}
return AC_SUCCESS;
}
AcResult
acIntegrate(const AcReal& dt)
{
acSynchronizeStream(STREAM_ALL);
for (int isubstep = 0; isubstep < 3; ++isubstep) {
acIntegrateStep(isubstep, dt); // Note: boundaries must be initialized.
swap_buffers();
acBoundcondStep();
}
return AC_SUCCESS;
}
static AcReal
simple_final_reduce_scal(const ReductionType& rtype, const AcReal* results, const int& n)
{
AcReal res = results[0];
for (int i = 1; i < n; ++i) {
if (rtype == RTYPE_MAX) {
res = max(res, results[i]);
}
else if (rtype == RTYPE_MIN) {
res = min(res, results[i]);
}
else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
res = sum(res, results[i]);
}
else {
ERROR("Invalid rtype");
}
}
if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
const AcReal inv_n = AcReal(1.) / (grid.n.x * grid.n.y * grid.n.z);
res = sqrt(inv_n * res);
}
return res;
}
AcReal
acReduceScal(const ReductionType& rtype, const VertexBufferHandle& vtxbuffer_handle)
{
acSynchronizeStream(STREAM_ALL);
AcReal results[num_devices];
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
reduceScal(devices[i], STREAM_PRIMARY, rtype, vtxbuffer_handle, &results[i]);
}
return simple_final_reduce_scal(rtype, results, num_devices);
}
AcReal
acReduceVec(const ReductionType& rtype, const VertexBufferHandle& a, const VertexBufferHandle& b,
const VertexBufferHandle& c)
{
acSynchronizeStream(STREAM_ALL);
AcReal results[num_devices];
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
reduceVec(devices[i], STREAM_PRIMARY, rtype, a, b, c, &results[i]);
}
return simple_final_reduce_scal(rtype, results, num_devices);
}
AcResult
acLoadWithOffset(const AcMesh& host_mesh, const int3& src, const int num_vertices)
{
@@ -284,6 +508,14 @@ acLoadWithOffset(const AcMesh& host_mesh, const int3& src, const int num_vertice
return AC_SUCCESS;
}
AcResult
acLoad(const AcMesh& host_mesh)
{
acLoadWithOffset(host_mesh, (int3){0, 0, 0}, AC_VTXBUF_SIZE(host_mesh.info));
acSynchronizeStream(STREAM_ALL);
return AC_SUCCESS;
}
AcResult
acStoreWithOffset(const int3& src, const int num_vertices, AcMesh* host_mesh)
{
@@ -308,241 +540,14 @@ acStoreWithOffset(const int3& src, const int num_vertices, AcMesh* host_mesh)
return AC_SUCCESS;
}
// acCopyMeshToDevice
AcResult
acLoad(const AcMesh& host_mesh)
{
return acLoadWithOffset(host_mesh, (int3){0, 0, 0}, AC_VTXBUF_SIZE(host_mesh.info));
}
// acCopyMeshToHost
AcResult
acStore(AcMesh* host_mesh)
{
return acStoreWithOffset((int3){0, 0, 0}, AC_VTXBUF_SIZE(host_mesh->info), host_mesh);
}
static AcResult
acSwapBuffers(void)
{
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
swapBuffers(devices[i]);
}
return AC_SUCCESS;
}
static AcResult
acSynchronizeHalos(void)
{
// Exchanges the halos of subgrids
// After this step, the data within the main grid ranging from
// (0, 0, NGHOST) -> grid.m.x, grid.m.y, NGHOST + grid.n.z
// has been synchronized and transferred to appropriate subgrids
// We loop only to num_devices - 1 since the front and back plate of the grid is not
// transferred because their contents depend on the boundary conditions.
// IMPORTANT NOTE: the boundary conditions must be applied before calling this function!
// I.e. the halos of subgrids must contain up-to-date data!
// #pragma omp parallel for
for (int i = 0; i < num_devices - 1; ++i) {
const int num_vertices = subgrid.m.x * subgrid.m.y * NGHOST;
// ...|ooooxxx|... -> xxx|ooooooo|...
{
const int3 src = (int3){0, 0, subgrid.n.z};
const int3 dst = (int3){0, 0, 0};
copyMeshDeviceToDevice(devices[i], STREAM_PRIMARY, src, devices[(i + 1) % num_devices],
dst, num_vertices);
}
// ...|ooooooo|xxx <- ...|xxxoooo|...
{
const int3 src = (int3){0, 0, NGHOST};
const int3 dst = (int3){0, 0, NGHOST + subgrid.n.z};
copyMeshDeviceToDevice(devices[(i + 1) % num_devices], STREAM_PRIMARY, src, devices[i],
dst, num_vertices);
}
}
return AC_SUCCESS;
}
static AcResult
acSynchronizeStream(const StreamType stream)
{
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
synchronize(devices[i], stream);
}
return AC_SUCCESS;
}
AcResult
acSynchronize(void)
{
acStoreWithOffset((int3){0, 0, 0}, AC_VTXBUF_SIZE(host_mesh->info), host_mesh);
acSynchronizeStream(STREAM_ALL);
acSynchronizeHalos();
acSynchronizeStream(STREAM_ALL);
return AC_SUCCESS;
}
static AcResult
acLocalBoundcondStep(void)
{
if (num_devices == 1) {
boundcondStep(devices[0], STREAM_PRIMARY, (int3){0, 0, 0}, subgrid.m);
}
else {
// Local boundary conditions
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
const int3 d0 = (int3){0, 0, NGHOST}; // DECOMPOSITION OFFSET HERE
const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.n.z};
boundcondStep(devices[i], STREAM_PRIMARY, d0, d1);
}
}
return AC_SUCCESS;
}
static AcResult
acGlobalBoundcondStep(void)
{
if (num_devices > 1) {
// With periodic boundary conditions we exchange the front and back plates of the
// grid. The exchange is done between the first and last device (0 and num_devices - 1).
const int num_vertices = subgrid.m.x * subgrid.m.y * NGHOST;
// ...|ooooxxx|... -> xxx|ooooooo|...
{
const int3 src = (int3){0, 0, subgrid.n.z};
const int3 dst = (int3){0, 0, 0};
copyMeshDeviceToDevice(devices[num_devices - 1], STREAM_PRIMARY, src, devices[0], dst,
num_vertices);
}
// ...|ooooooo|xxx <- ...|xxxoooo|...
{
const int3 src = (int3){0, 0, NGHOST};
const int3 dst = (int3){0, 0, NGHOST + subgrid.n.z};
copyMeshDeviceToDevice(devices[0], STREAM_PRIMARY, src, devices[num_devices - 1], dst,
num_vertices);
}
}
return AC_SUCCESS;
}
AcResult
acBoundcondStep(void)
{
acLocalBoundcondStep();
acSynchronizeStream(STREAM_ALL);
acGlobalBoundcondStep();
acSynchronizeHalos();
acSynchronizeStream(STREAM_ALL);
return AC_SUCCESS;
}
AcResult
acIntegrateStepWithOffset(const int& isubstep, const AcReal& dt, const int3& start, const int3& end)
{
// See the beginning of the file for an explanation of the index mapping
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
// DECOMPOSITION OFFSET HERE
const int3 d0 = (int3){NGHOST, NGHOST, NGHOST + i * subgrid.n.z};
const int3 d1 = d0 + (int3){subgrid.n.x, subgrid.n.y, subgrid.n.z};
const int3 da = max(start, d0);
const int3 db = min(end, d1);
if (db.z >= da.z) {
const int3 da_local = da - (int3){0, 0, i * subgrid.n.z};
const int3 db_local = db - (int3){0, 0, i * subgrid.n.z};
rkStep(devices[i], STREAM_PRIMARY, isubstep, da_local, db_local, dt);
}
}
return AC_SUCCESS;
}
AcResult
acIntegrateStep(const int& isubstep, const AcReal& dt)
{
const int3 start = (int3){NGHOST, NGHOST, NGHOST};
const int3 end = start + grid.n;
acIntegrateStepWithOffset(isubstep, dt, start, end);
return AC_SUCCESS;
}
AcResult
acIntegrate(const AcReal& dt)
{
for (int isubstep = 0; isubstep < 3; ++isubstep) {
acIntegrateStep(isubstep, dt); // Note: boundaries must be initialized.
acSwapBuffers();
acLocalBoundcondStep();
acSynchronizeStream(STREAM_ALL);
acGlobalBoundcondStep();
acSynchronizeHalos();
acSynchronizeStream(STREAM_ALL);
}
return AC_SUCCESS;
}
static AcReal
simple_final_reduce_scal(const ReductionType& rtype, const AcReal* results, const int& n)
{
AcReal res = results[0];
for (int i = 1; i < n; ++i) {
if (rtype == RTYPE_MAX) {
res = max(res, results[i]);
}
else if (rtype == RTYPE_MIN) {
res = min(res, results[i]);
}
else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
res = sum(res, results[i]);
}
else {
ERROR("Invalid rtype");
}
}
if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
const AcReal inv_n = AcReal(1.) / (grid.n.x * grid.n.y * grid.n.z);
res = sqrt(inv_n * res);
}
return res;
}
AcReal
acReduceScal(const ReductionType& rtype, const VertexBufferHandle& vtxbuffer_handle)
{
AcReal results[num_devices];
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
reduceScal(devices[i], STREAM_PRIMARY, rtype, vtxbuffer_handle, &results[i]);
}
return simple_final_reduce_scal(rtype, results, num_devices);
}
AcReal
acReduceVec(const ReductionType& rtype, const VertexBufferHandle& a, const VertexBufferHandle& b,
const VertexBufferHandle& c)
{
AcReal results[num_devices];
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
reduceVec(devices[i], STREAM_PRIMARY, rtype, a, b, c, &results[i]);
}
return simple_final_reduce_scal(rtype, results, num_devices);
}
AcResult
acLoadDeviceConstant(const AcRealParam param, const AcReal value)
{
@@ -552,9 +557,3 @@ acLoadDeviceConstant(const AcRealParam param, const AcReal value)
}
return AC_SUCCESS;
}
AcResult
acAutoOptimize(void)
{
return autoOptimize(devices[0]);
}

View File

@@ -27,15 +27,6 @@
#pragma once
#include "astaroth.h"
// clang-format off
typedef enum {
STREAM_PRIMARY,
STREAM_SECONDARY,
NUM_STREAM_TYPES,
STREAM_ALL
} StreamType;
// clang-format on
typedef struct {
int3 m;
int3 n;

View File

@@ -413,8 +413,6 @@ check_rk3(const AcMeshInfo& mesh_info)
const AcReal dt = AcReal(1e-2); // Use a small constant timestep to avoid instabilities
acIntegrate(dt);
acBoundcondStep();
acSynchronize();
model_rk3(dt, model_mesh);
boundconds(model_mesh->info, model_mesh);
@@ -712,8 +710,6 @@ run_autotest(void)
// Device integration step
acIntegrate(dt);
acBoundcondStep();
acSynchronize();
acStore(candidate_mesh);
// Check fields

View File

@@ -122,7 +122,6 @@ run_benchmark(void)
// Warmup
for (int i = 0; i < 10; ++i) {
acIntegrate(0);
acSynchronize();
}
Timer t;
@@ -135,8 +134,6 @@ run_benchmark(void)
#else // GEN_BENCHMARK_FULL
acIntegrate(dt);
#endif
acSynchronize();
const double ms_elapsed = timer_diff_nsec(t) / 1e6;
results.push_back(ms_elapsed);
}

View File

@@ -406,7 +406,7 @@ run_renderer(void)
const int num_vertices = mesh->info.int_params[AC_mxy];
const int3 dst = (int3){0, 0, k_slice};
acStoreWithOffset(dst, num_vertices, mesh);
acSynchronize();
acSynchronizeStream(STREAM_ALL);
renderer_draw(*mesh); // Bottleneck is here
printf("Step #%d, dt: %f\n", steps, double(dt));
timer_reset(&frame_timer);

View File

@@ -205,7 +205,6 @@ run_simulation(void)
write_mesh_info(&mesh_info);
print_diagnostics(0, AcReal(.0), t_step, diag_file);
acSynchronize();
acStore(mesh);
save_mesh(*mesh, 0, t_step);
@@ -275,8 +274,6 @@ run_simulation(void)
acSynchronize();
acStore(mesh);
*/
acSynchronize();
acStore(mesh);
save_mesh(*mesh, i, t_step);