diff --git a/include/astaroth.h b/include/astaroth.h index 9c91448..f3c0fb0 100644 --- a/include/astaroth.h +++ b/include/astaroth.h @@ -212,71 +212,89 @@ typedef struct { /* * ============================================================================= - * Astaroth interface + * Astaroth interface: Basic functions. Synchronous. * ============================================================================= */ +typedef enum { + STREAM_PRIMARY, // + STREAM_SECONDARY, // + NUM_STREAM_TYPES, // + STREAM_ALL +} StreamType; /** Checks whether there are any CUDA devices available. Returns AC_SUCCESS if there is 1 or more, - AC_FAILURE otherwise. */ + * AC_FAILURE otherwise. */ AcResult acCheckDeviceAvailability(void); +/** Synchronizes the stream shared by all GPUs in the node. Synchronizes all streams if STREAM_ALL + * passed as a parameter */ +AcResult acSynchronizeStream(const StreamType stream); + +/** Synchronizes the mesh distributed across multiple GPUs. Must be called if the data in the halos + * of neighboring GPUs has been modified by an asynchronous function, f.ex. acBoundcondStep() */ +AcResult acSynchronizeMesh(void); + /** Starting point of all GPU computation. Handles the allocation and initialization of *all memory needed on all GPUs in the node*. In other words, setups everything GPU-side so that calling any other GPU interface function afterwards does not result in illegal memory accesses. */ AcResult acInit(const AcMeshInfo& mesh_info); -/** Splits the host_mesh and distributes it among the GPUs in the node */ -AcResult acLoad(const AcMesh& host_mesh); -AcResult acLoadWithOffset(const AcMesh& host_mesh, const int3& start, const int num_vertices); - -/** Does all three steps of the RK3 integration and computes the boundary -conditions when necessary. Note that the boundary conditions are not applied -after the final integration step. -The result can be fetched to CPU memory with acStore(). */ -AcResult acIntegrate(const AcReal& dt); - -/** Performs a single RK3 step without computing boundary conditions. */ -AcResult acIntegrateStep(const int& isubstep, const AcReal& dt); - -/** Performs a single RK3 step without computing boundary conditions. - Operates on a three-dimensional cuboid, where start and end are the - opposite corners. */ -AcResult acIntegrateStepWithOffset(const int& isubstep, const AcReal& dt, const int3& start, - const int3& end); - -/** Applies boundary conditions on the GPU meshs and communicates the - ghost zones among GPUs if necessary */ -AcResult acBoundcondStep(void); - -/** Performs a scalar reduction on all GPUs in the node and returns the result. - */ -AcReal acReduceScal(const ReductionType& rtype, const VertexBufferHandle& a); - -/** Performs a vector reduction on all GPUs in the node and returns the result. - */ -AcReal acReduceVec(const ReductionType& rtype, const VertexBufferHandle& a, - const VertexBufferHandle& b, const VertexBufferHandle& c); - -/** Stores the mesh distributed among GPUs of the node back to a single host - * mesh */ -AcResult acStore(AcMesh* host_mesh); -AcResult acStoreWithOffset(const int3& start, const int num_vertices, AcMesh* host_mesh); - /** Frees all GPU allocations and resets all devices in the node. Should be * called at exit. */ AcResult acQuit(void); -/** Synchronizes all devices. All calls to Astaroth are asynchronous by default - unless otherwise stated. */ -AcResult acSynchronize(void); +/** Does all three substeps of the RK3 integration and computes the boundary +conditions when necessary. 
The result is synchronized and the boundary conditions are applied +after the final substep, after which the result can be fetched to CPU memory with acStore. */ +AcResult acIntegrate(const AcReal& dt); -/** Loads a parameter to the constant memory of all devices */ +/** Performs a scalar reduction on all GPUs in the node and returns the result. Operates on the + * whole computational domain, which must be up to date and synchronized before calling + * acReduceScal. + */ +AcReal acReduceScal(const ReductionType& rtype, const VertexBufferHandle& a); + +/** Performs a vector reduction on all GPUs in the node and returns the result. Operates on the + * whole computational domain, which must be up to date and synchronized before calling + * acReduceVec. + */ +AcReal acReduceVec(const ReductionType& rtype, const VertexBufferHandle& a, + const VertexBufferHandle& b, const VertexBufferHandle& c); + +/** Distributes the host mesh among the GPUs in the node. Synchronous. */ +AcResult acLoad(const AcMesh& host_mesh); + +/** Gathers the mesh stored across GPUs in the node and stores it back to host memory. Synchronous. + */ +AcResult acStore(AcMesh* host_mesh); + +/* + * ============================================================================= + * Astaroth interface: Advanced functions. Asynchronous. + * ============================================================================= + */ +/** Loads a parameter to the constant memory of all GPUs in the node. Asynchronous. */ AcResult acLoadDeviceConstant(const AcRealParam param, const AcReal value); -/** Auto-optimizes the library. This function is free from side-effects: the input vertex buffer is - * guaranteed not be modified.*/ -AcResult acAutoOptimize(void); +/** Splits a subset of the host_mesh and distributes it among the GPUs in the node. Asynchronous. */ +AcResult acLoadWithOffset(const AcMesh& host_mesh, const int3& start, const int num_vertices); + +/** Gathers a subset of the data distributed among the GPUs in the node and stores the mesh back to + * CPU memory. Asynchronous. + */ +AcResult acStoreWithOffset(const int3& start, const int num_vertices, AcMesh* host_mesh); + +/** Performs a single RK3 step without computing boundary conditions. Asynchronous.*/ +AcResult acIntegrateStep(const int& isubstep, const AcReal& dt); + +/** Performs a single RK3 step on a subset of the mesh without computing the boundary conditions. + * Asynchronous.*/ +AcResult acIntegrateStepWithOffset(const int& isubstep, const AcReal& dt, const int3& start, + const int3& end); + +/** Performs the boundary condition step on the GPUs in the node. Asynchronous. 
*/ +AcResult acBoundcondStep(void); /* End extern "C" */ #ifdef __cplusplus diff --git a/src/core/astaroth.cu b/src/core/astaroth.cu index 78f552c..3f22820 100644 --- a/src/core/astaroth.cu +++ b/src/core/astaroth.cu @@ -183,6 +183,61 @@ acCheckDeviceAvailability(void) return AC_FAILURE; } +AcResult +acSynchronizeStream(const StreamType stream) +{ + // #pragma omp parallel for + for (int i = 0; i < num_devices; ++i) { + synchronize(devices[i], stream); + } + + return AC_SUCCESS; +} + +static AcResult +synchronize_halos(void) +{ + // Exchanges the halos of subgrids + // After this step, the data within the main grid ranging from + // (0, 0, NGHOST) -> grid.m.x, grid.m.y, NGHOST + grid.n.z + // has been synchronized and transferred to appropriate subgrids + + // We loop only to num_devices - 1 since the front and back plate of the grid is not + // transferred because their contents depend on the boundary conditions. + + // IMPORTANT NOTE: the boundary conditions must be applied before calling this function! + // I.e. the halos of subgrids must contain up-to-date data! + // #pragma omp parallel for + for (int i = 0; i < num_devices - 1; ++i) { + const int num_vertices = subgrid.m.x * subgrid.m.y * NGHOST; + // ...|ooooxxx|... -> xxx|ooooooo|... + { + const int3 src = (int3){0, 0, subgrid.n.z}; + const int3 dst = (int3){0, 0, 0}; + copyMeshDeviceToDevice(devices[i], STREAM_PRIMARY, src, devices[(i + 1) % num_devices], + dst, num_vertices); + } + // ...|ooooooo|xxx <- ...|xxxoooo|... + { + const int3 src = (int3){0, 0, NGHOST}; + const int3 dst = (int3){0, 0, NGHOST + subgrid.n.z}; + copyMeshDeviceToDevice(devices[(i + 1) % num_devices], STREAM_PRIMARY, src, devices[i], + dst, num_vertices); + } + } + return AC_SUCCESS; +} + +AcResult +acSynchronizeMesh(void) +{ + acSynchronizeStream(STREAM_ALL); + synchronize_halos(); + acSynchronizeStream(STREAM_ALL); + + return AC_SUCCESS; +} + AcResult acInit(const AcMeshInfo& config) { @@ -235,18 +290,187 @@ acInit(const AcMeshInfo& config) loadGlobalGrid(devices[i], grid); printDeviceInfo(devices[i]); } + + acSynchronizeStream(STREAM_ALL); return AC_SUCCESS; } AcResult acQuit(void) { + acSynchronizeStream(STREAM_ALL); + for (int i = 0; i < num_devices; ++i) { destroyDevice(devices[i]); } return AC_SUCCESS; } +AcResult +acIntegrateStepWithOffset(const int& isubstep, const AcReal& dt, const int3& start, const int3& end) +{ + // See the beginning of the file for an explanation of the index mapping + // #pragma omp parallel for + for (int i = 0; i < num_devices; ++i) { + // DECOMPOSITION OFFSET HERE + const int3 d0 = (int3){NGHOST, NGHOST, NGHOST + i * subgrid.n.z}; + const int3 d1 = d0 + (int3){subgrid.n.x, subgrid.n.y, subgrid.n.z}; + + const int3 da = max(start, d0); + const int3 db = min(end, d1); + + if (db.z >= da.z) { + const int3 da_local = da - (int3){0, 0, i * subgrid.n.z}; + const int3 db_local = db - (int3){0, 0, i * subgrid.n.z}; + rkStep(devices[i], STREAM_PRIMARY, isubstep, da_local, db_local, dt); + } + } + return AC_SUCCESS; +} + +AcResult +acIntegrateStep(const int& isubstep, const AcReal& dt) +{ + const int3 start = (int3){NGHOST, NGHOST, NGHOST}; + const int3 end = start + grid.n; + acIntegrateStepWithOffset(isubstep, dt, start, end); + + return AC_SUCCESS; +} + +static AcResult +local_boundcondstep(void) +{ + if (num_devices == 1) { + boundcondStep(devices[0], STREAM_PRIMARY, (int3){0, 0, 0}, subgrid.m); + } + else { + // Local boundary conditions + // #pragma omp parallel for + for (int i = 0; i < num_devices; ++i) { + const int3 d0 = 
(int3){0, 0, NGHOST}; // DECOMPOSITION OFFSET HERE + const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.n.z}; + boundcondStep(devices[i], STREAM_PRIMARY, d0, d1); + } + } + return AC_SUCCESS; +} + +static AcResult +global_boundcondstep(void) +{ + if (num_devices > 1) { + // With periodic boundary conditions we exchange the front and back plates of the + // grid. The exchange is done between the first and last device (0 and num_devices - 1). + const int num_vertices = subgrid.m.x * subgrid.m.y * NGHOST; + // ...|ooooxxx|... -> xxx|ooooooo|... + { + const int3 src = (int3){0, 0, subgrid.n.z}; + const int3 dst = (int3){0, 0, 0}; + copyMeshDeviceToDevice(devices[num_devices - 1], STREAM_PRIMARY, src, devices[0], dst, + num_vertices); + } + // ...|ooooooo|xxx <- ...|xxxoooo|... + { + const int3 src = (int3){0, 0, NGHOST}; + const int3 dst = (int3){0, 0, NGHOST + subgrid.n.z}; + copyMeshDeviceToDevice(devices[0], STREAM_PRIMARY, src, devices[num_devices - 1], dst, + num_vertices); + } + } + return AC_SUCCESS; +} + +AcResult +acBoundcondStep(void) +{ + local_boundcondstep(); + acSynchronizeStream(STREAM_ALL); + global_boundcondstep(); + synchronize_halos(); + acSynchronizeStream(STREAM_ALL); + + return AC_SUCCESS; +} + +static AcResult +swap_buffers(void) +{ + // #pragma omp parallel for + for (int i = 0; i < num_devices; ++i) { + swapBuffers(devices[i]); + } + return AC_SUCCESS; +} + +AcResult +acIntegrate(const AcReal& dt) +{ + acSynchronizeStream(STREAM_ALL); + for (int isubstep = 0; isubstep < 3; ++isubstep) { + acIntegrateStep(isubstep, dt); // Note: boundaries must be initialized. + swap_buffers(); + acBoundcondStep(); + } + return AC_SUCCESS; +} + +static AcReal +simple_final_reduce_scal(const ReductionType& rtype, const AcReal* results, const int& n) +{ + AcReal res = results[0]; + for (int i = 1; i < n; ++i) { + if (rtype == RTYPE_MAX) { + res = max(res, results[i]); + } + else if (rtype == RTYPE_MIN) { + res = min(res, results[i]); + } + else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) { + res = sum(res, results[i]); + } + else { + ERROR("Invalid rtype"); + } + } + + if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) { + const AcReal inv_n = AcReal(1.) 
/ (grid.n.x * grid.n.y * grid.n.z); + res = sqrt(inv_n * res); + } + + return res; +} + +AcReal +acReduceScal(const ReductionType& rtype, const VertexBufferHandle& vtxbuffer_handle) +{ + acSynchronizeStream(STREAM_ALL); + + AcReal results[num_devices]; + // #pragma omp parallel for + for (int i = 0; i < num_devices; ++i) { + reduceScal(devices[i], STREAM_PRIMARY, rtype, vtxbuffer_handle, &results[i]); + } + + return simple_final_reduce_scal(rtype, results, num_devices); +} + +AcReal +acReduceVec(const ReductionType& rtype, const VertexBufferHandle& a, const VertexBufferHandle& b, + const VertexBufferHandle& c) +{ + acSynchronizeStream(STREAM_ALL); + + AcReal results[num_devices]; + // #pragma omp parallel for + for (int i = 0; i < num_devices; ++i) { + reduceVec(devices[i], STREAM_PRIMARY, rtype, a, b, c, &results[i]); + } + + return simple_final_reduce_scal(rtype, results, num_devices); +} + AcResult acLoadWithOffset(const AcMesh& host_mesh, const int3& src, const int num_vertices) { @@ -284,6 +508,14 @@ acLoadWithOffset(const AcMesh& host_mesh, const int3& src, const int num_vertice return AC_SUCCESS; } +AcResult +acLoad(const AcMesh& host_mesh) +{ + acLoadWithOffset(host_mesh, (int3){0, 0, 0}, AC_VTXBUF_SIZE(host_mesh.info)); + acSynchronizeStream(STREAM_ALL); + return AC_SUCCESS; +} + AcResult acStoreWithOffset(const int3& src, const int num_vertices, AcMesh* host_mesh) { @@ -308,241 +540,14 @@ acStoreWithOffset(const int3& src, const int num_vertices, AcMesh* host_mesh) return AC_SUCCESS; } -// acCopyMeshToDevice -AcResult -acLoad(const AcMesh& host_mesh) -{ - return acLoadWithOffset(host_mesh, (int3){0, 0, 0}, AC_VTXBUF_SIZE(host_mesh.info)); -} - -// acCopyMeshToHost AcResult acStore(AcMesh* host_mesh) { - return acStoreWithOffset((int3){0, 0, 0}, AC_VTXBUF_SIZE(host_mesh->info), host_mesh); -} - -static AcResult -acSwapBuffers(void) -{ - // #pragma omp parallel for - for (int i = 0; i < num_devices; ++i) { - swapBuffers(devices[i]); - } - return AC_SUCCESS; -} - -static AcResult -acSynchronizeHalos(void) -{ - // Exchanges the halos of subgrids - // After this step, the data within the main grid ranging from - // (0, 0, NGHOST) -> grid.m.x, grid.m.y, NGHOST + grid.n.z - // has been synchronized and transferred to appropriate subgrids - - // We loop only to num_devices - 1 since the front and back plate of the grid is not - // transferred because their contents depend on the boundary conditions. - - // IMPORTANT NOTE: the boundary conditions must be applied before calling this function! - // I.e. the halos of subgrids must contain up-to-date data! - // #pragma omp parallel for - for (int i = 0; i < num_devices - 1; ++i) { - const int num_vertices = subgrid.m.x * subgrid.m.y * NGHOST; - // ...|ooooxxx|... -> xxx|ooooooo|... - { - const int3 src = (int3){0, 0, subgrid.n.z}; - const int3 dst = (int3){0, 0, 0}; - copyMeshDeviceToDevice(devices[i], STREAM_PRIMARY, src, devices[(i + 1) % num_devices], - dst, num_vertices); - } - // ...|ooooooo|xxx <- ...|xxxoooo|... 
- { - const int3 src = (int3){0, 0, NGHOST}; - const int3 dst = (int3){0, 0, NGHOST + subgrid.n.z}; - copyMeshDeviceToDevice(devices[(i + 1) % num_devices], STREAM_PRIMARY, src, devices[i], - dst, num_vertices); - } - } - return AC_SUCCESS; -} - -static AcResult -acSynchronizeStream(const StreamType stream) -{ - // #pragma omp parallel for - for (int i = 0; i < num_devices; ++i) { - synchronize(devices[i], stream); - } - - return AC_SUCCESS; -} - -AcResult -acSynchronize(void) -{ + acStoreWithOffset((int3){0, 0, 0}, AC_VTXBUF_SIZE(host_mesh->info), host_mesh); acSynchronizeStream(STREAM_ALL); - acSynchronizeHalos(); - acSynchronizeStream(STREAM_ALL); - return AC_SUCCESS; } -static AcResult -acLocalBoundcondStep(void) -{ - if (num_devices == 1) { - boundcondStep(devices[0], STREAM_PRIMARY, (int3){0, 0, 0}, subgrid.m); - } - else { - // Local boundary conditions - // #pragma omp parallel for - for (int i = 0; i < num_devices; ++i) { - const int3 d0 = (int3){0, 0, NGHOST}; // DECOMPOSITION OFFSET HERE - const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.n.z}; - boundcondStep(devices[i], STREAM_PRIMARY, d0, d1); - } - } - return AC_SUCCESS; -} - -static AcResult -acGlobalBoundcondStep(void) -{ - if (num_devices > 1) { - // With periodic boundary conditions we exchange the front and back plates of the - // grid. The exchange is done between the first and last device (0 and num_devices - 1). - const int num_vertices = subgrid.m.x * subgrid.m.y * NGHOST; - // ...|ooooxxx|... -> xxx|ooooooo|... - { - const int3 src = (int3){0, 0, subgrid.n.z}; - const int3 dst = (int3){0, 0, 0}; - copyMeshDeviceToDevice(devices[num_devices - 1], STREAM_PRIMARY, src, devices[0], dst, - num_vertices); - } - // ...|ooooooo|xxx <- ...|xxxoooo|... - { - const int3 src = (int3){0, 0, NGHOST}; - const int3 dst = (int3){0, 0, NGHOST + subgrid.n.z}; - copyMeshDeviceToDevice(devices[0], STREAM_PRIMARY, src, devices[num_devices - 1], dst, - num_vertices); - } - } - return AC_SUCCESS; -} - -AcResult -acBoundcondStep(void) -{ - acLocalBoundcondStep(); - acSynchronizeStream(STREAM_ALL); - acGlobalBoundcondStep(); - acSynchronizeHalos(); - acSynchronizeStream(STREAM_ALL); - - return AC_SUCCESS; -} - -AcResult -acIntegrateStepWithOffset(const int& isubstep, const AcReal& dt, const int3& start, const int3& end) -{ - // See the beginning of the file for an explanation of the index mapping - // #pragma omp parallel for - for (int i = 0; i < num_devices; ++i) { - // DECOMPOSITION OFFSET HERE - const int3 d0 = (int3){NGHOST, NGHOST, NGHOST + i * subgrid.n.z}; - const int3 d1 = d0 + (int3){subgrid.n.x, subgrid.n.y, subgrid.n.z}; - - const int3 da = max(start, d0); - const int3 db = min(end, d1); - - if (db.z >= da.z) { - const int3 da_local = da - (int3){0, 0, i * subgrid.n.z}; - const int3 db_local = db - (int3){0, 0, i * subgrid.n.z}; - rkStep(devices[i], STREAM_PRIMARY, isubstep, da_local, db_local, dt); - } - } - return AC_SUCCESS; -} - -AcResult -acIntegrateStep(const int& isubstep, const AcReal& dt) -{ - const int3 start = (int3){NGHOST, NGHOST, NGHOST}; - const int3 end = start + grid.n; - acIntegrateStepWithOffset(isubstep, dt, start, end); - - return AC_SUCCESS; -} - -AcResult -acIntegrate(const AcReal& dt) -{ - for (int isubstep = 0; isubstep < 3; ++isubstep) { - acIntegrateStep(isubstep, dt); // Note: boundaries must be initialized. 
- acSwapBuffers(); - acLocalBoundcondStep(); - acSynchronizeStream(STREAM_ALL); - acGlobalBoundcondStep(); - acSynchronizeHalos(); - acSynchronizeStream(STREAM_ALL); - } - return AC_SUCCESS; -} - -static AcReal -simple_final_reduce_scal(const ReductionType& rtype, const AcReal* results, const int& n) -{ - AcReal res = results[0]; - for (int i = 1; i < n; ++i) { - if (rtype == RTYPE_MAX) { - res = max(res, results[i]); - } - else if (rtype == RTYPE_MIN) { - res = min(res, results[i]); - } - else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) { - res = sum(res, results[i]); - } - else { - ERROR("Invalid rtype"); - } - } - - if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) { - const AcReal inv_n = AcReal(1.) / (grid.n.x * grid.n.y * grid.n.z); - res = sqrt(inv_n * res); - } - - return res; -} - -AcReal -acReduceScal(const ReductionType& rtype, const VertexBufferHandle& vtxbuffer_handle) -{ - AcReal results[num_devices]; - - // #pragma omp parallel for - for (int i = 0; i < num_devices; ++i) { - reduceScal(devices[i], STREAM_PRIMARY, rtype, vtxbuffer_handle, &results[i]); - } - - return simple_final_reduce_scal(rtype, results, num_devices); -} - -AcReal -acReduceVec(const ReductionType& rtype, const VertexBufferHandle& a, const VertexBufferHandle& b, - const VertexBufferHandle& c) -{ - AcReal results[num_devices]; - - // #pragma omp parallel for - for (int i = 0; i < num_devices; ++i) { - reduceVec(devices[i], STREAM_PRIMARY, rtype, a, b, c, &results[i]); - } - - return simple_final_reduce_scal(rtype, results, num_devices); -} - AcResult acLoadDeviceConstant(const AcRealParam param, const AcReal value) { @@ -552,9 +557,3 @@ acLoadDeviceConstant(const AcRealParam param, const AcReal value) } return AC_SUCCESS; } - -AcResult -acAutoOptimize(void) -{ - return autoOptimize(devices[0]); -} diff --git a/src/core/device.cuh b/src/core/device.cuh index 228f4e9..bd2a308 100644 --- a/src/core/device.cuh +++ b/src/core/device.cuh @@ -27,15 +27,6 @@ #pragma once #include "astaroth.h" -// clang-format off -typedef enum { - STREAM_PRIMARY, - STREAM_SECONDARY, - NUM_STREAM_TYPES, - STREAM_ALL -} StreamType; -// clang-format on - typedef struct { int3 m; int3 n; diff --git a/src/standalone/autotest.cc b/src/standalone/autotest.cc index 6c1855e..bb8b55f 100644 --- a/src/standalone/autotest.cc +++ b/src/standalone/autotest.cc @@ -413,8 +413,6 @@ check_rk3(const AcMeshInfo& mesh_info) const AcReal dt = AcReal(1e-2); // Use a small constant timestep to avoid instabilities acIntegrate(dt); - acBoundcondStep(); - acSynchronize(); model_rk3(dt, model_mesh); boundconds(model_mesh->info, model_mesh); @@ -712,8 +710,6 @@ run_autotest(void) // Device integration step acIntegrate(dt); - acBoundcondStep(); - acSynchronize(); acStore(candidate_mesh); // Check fields diff --git a/src/standalone/benchmark.cc b/src/standalone/benchmark.cc index 03acd63..10ee0b4 100644 --- a/src/standalone/benchmark.cc +++ b/src/standalone/benchmark.cc @@ -122,7 +122,6 @@ run_benchmark(void) // Warmup for (int i = 0; i < 10; ++i) { acIntegrate(0); - acSynchronize(); } Timer t; @@ -135,8 +134,6 @@ run_benchmark(void) #else // GEN_BENCHMARK_FULL acIntegrate(dt); #endif - acSynchronize(); - const double ms_elapsed = timer_diff_nsec(t) / 1e6; results.push_back(ms_elapsed); } diff --git a/src/standalone/renderer.cc b/src/standalone/renderer.cc index 4497001..8587226 100644 --- a/src/standalone/renderer.cc +++ b/src/standalone/renderer.cc @@ -406,7 +406,7 @@ run_renderer(void) const int num_vertices = mesh->info.int_params[AC_mxy]; const int3 
dst = (int3){0, 0, k_slice}; acStoreWithOffset(dst, num_vertices, mesh); - acSynchronize(); + acSynchronizeStream(STREAM_ALL); renderer_draw(*mesh); // Bottleneck is here printf("Step #%d, dt: %f\n", steps, double(dt)); timer_reset(&frame_timer); diff --git a/src/standalone/simulation.cc b/src/standalone/simulation.cc index 69a74c9..133f691 100644 --- a/src/standalone/simulation.cc +++ b/src/standalone/simulation.cc @@ -205,7 +205,6 @@ run_simulation(void) write_mesh_info(&mesh_info); print_diagnostics(0, AcReal(.0), t_step, diag_file); - acSynchronize(); acStore(mesh); save_mesh(*mesh, 0, t_step); @@ -275,8 +274,6 @@ run_simulation(void) acSynchronize(); acStore(mesh); */ - - acSynchronize(); acStore(mesh); save_mesh(*mesh, i, t_step);
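Usage sketch of the reorganized synchronous interface: make_mesh_info(), mesh_create(), and mesh_destroy() below are hypothetical host-side helpers standing in for whatever the caller uses to configure and allocate an AcMesh, and error handling is omitted.

#include "astaroth.h"

// Hypothetical host-side helpers (not part of this interface): configure an
// AcMeshInfo and allocate/free the corresponding host-side AcMesh.
AcMeshInfo make_mesh_info(void);
AcMesh*    mesh_create(const AcMeshInfo& info);
void       mesh_destroy(AcMesh* mesh);

int
main(void)
{
    if (acCheckDeviceAvailability() != AC_SUCCESS)
        return 1; // no CUDA devices available on this node

    const AcMeshInfo info = make_mesh_info();
    AcMesh* mesh          = mesh_create(info);

    acInit(info);  // allocates and initializes everything GPU-side
    acLoad(*mesh); // distributes the host mesh among the GPUs; synchronous

    const AcReal dt = AcReal(1e-2);
    for (int step = 0; step < 100; ++step)
        acIntegrate(dt); // all three RK3 substeps; boundary conditions applied after the final one

    acStore(mesh); // gathers the distributed mesh back into host memory; synchronous
    acQuit();      // frees all GPU allocations and resets the devices

    mesh_destroy(mesh);
    return 0;
}

The basic functions above are synchronous, so this path needs no explicit acSynchronizeStream() or acSynchronizeMesh() calls.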
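With the advanced asynchronous functions, the caller owns the synchronization points. A sketch mirroring the renderer.cc hunk above, which fetches a single xy-plane of the distributed mesh back to the host before drawing it (renderer_draw is the standalone renderer's drawing routine, k_slice the z-index of the plane):

static void
fetch_and_draw_slice(AcMesh* mesh, const int k_slice)
{
    const int  num_vertices = mesh->info.int_params[AC_mxy]; // vertices in one xy-plane
    const int3 dst          = (int3){0, 0, k_slice};

    acStoreWithOffset(dst, num_vertices, mesh); // asynchronous: only enqueues the transfer
    acSynchronizeStream(STREAM_ALL);            // block until the plane has landed in host memory
    renderer_draw(*mesh);                       // now safe to read the fetched slice
}

Likewise, if an asynchronous call such as acBoundcondStep() has modified the halos of neighboring GPUs, acSynchronizeMesh() must be called before that data is used.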
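The node-level decomposition is one-dimensional along z: device i owns the global z-planes NGHOST + i*subgrid.n.z through NGHOST + (i+1)*subgrid.n.z - 1 of the computational domain, which is the range acIntegrateStepWithOffset clamps each device's work against. A sketch of the corresponding global-to-local index conversion, assuming that layout:

// Map a global z-index inside the computational domain to the device that owns
// the plane and to the same index in that device's local coordinates.
// Assumes the 1D z-decomposition used above: device i owns global z in
// [NGHOST + i*subgrid_nz, NGHOST + (i+1)*subgrid_nz).
static void
global_to_local_z(const int global_z, const int subgrid_nz, int* device_id, int* local_z)
{
    *device_id = (global_z - NGHOST) / subgrid_nz;   // which subgrid owns this plane
    *local_z   = global_z - *device_id * subgrid_nz; // same plane in local coordinates
}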