Reviewed the Astaroth interface. There is now a clear distinction between synchronous and asynchronous functions. For basic usage we provide a set of functions that are always safe to call (acIntegrate, acLoad, etc.), but which are therefore quite restricted: for example, the whole mesh must be loaded at once and computations cannot be executed concurrently on multiple GPUs. For more advanced users we provide asynchronous functions (such as acLoadWithOffset). Since we cannot know how the asynchronous functions are called (for example, when an integration step has fully completed and the halos of neighboring subgrids can safely be communicated between GPUs), the responsibility for synchronization is left to the user. The existing implementations currently use only the basic "safe" set of functions (except renderer.cc), so existing functionality is unchanged by these latest commits. Autotests also pass.

jpekkila
2019-07-09 18:42:00 +03:00
parent 1251f61570
commit 0bda016e17
7 changed files with 298 additions and 300 deletions
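The split is easiest to see in a short host-side sketch. Apart from the setup of info and mesh, only functions whose signatures appear in this diff are used; how the host mesh is allocated is not part of this commit and is assumed here. The two load calls are shown side by side purely to contrast the two paths, not as a sensible program.

// Sketch only: info is an AcMeshInfo and mesh a host-allocated AcMesh,
// both prepared by the application (allocation is not part of this commit).
acInit(info);

// Asynchronous path: the caller picks the region to transfer and decides
// when the transfer must have completed.
acLoadWithOffset(mesh, (int3){0, 0, 0}, AC_VTXBUF_SIZE(mesh.info));
acSynchronizeStream(STREAM_ALL); // explicit synchronization point

// Synchronous ("safe") path: acLoad and acIntegrate synchronize internally,
// so no stream handling is needed.
acLoad(mesh);
const AcReal dt = AcReal(1e-2);
acIntegrate(dt);
acStore(&mesh);

acQuit();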

View File

@@ -183,6 +183,61 @@ acCheckDeviceAvailability(void)
return AC_FAILURE;
}
AcResult
acSynchronizeStream(const StreamType stream)
{
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
synchronize(devices[i], stream);
}
return AC_SUCCESS;
}
static AcResult
synchronize_halos(void)
{
// Exchanges the halos of subgrids
// After this step, the data within the main grid ranging from
// (0, 0, NGHOST) -> (grid.m.x, grid.m.y, NGHOST + grid.n.z)
// has been synchronized and transferred to appropriate subgrids
// We loop only to num_devices - 1 since the front and back plates of the grid are not
// transferred; their contents depend on the boundary conditions.
// IMPORTANT NOTE: the boundary conditions must be applied before calling this function!
// I.e. the halos of subgrids must contain up-to-date data!
// #pragma omp parallel for
for (int i = 0; i < num_devices - 1; ++i) {
const int num_vertices = subgrid.m.x * subgrid.m.y * NGHOST;
// ...|ooooxxx|... -> xxx|ooooooo|...
{
const int3 src = (int3){0, 0, subgrid.n.z};
const int3 dst = (int3){0, 0, 0};
copyMeshDeviceToDevice(devices[i], STREAM_PRIMARY, src, devices[(i + 1) % num_devices],
dst, num_vertices);
}
// ...|ooooooo|xxx <- ...|xxxoooo|...
{
const int3 src = (int3){0, 0, NGHOST};
const int3 dst = (int3){0, 0, NGHOST + subgrid.n.z};
copyMeshDeviceToDevice(devices[(i + 1) % num_devices], STREAM_PRIMARY, src, devices[i],
dst, num_vertices);
}
}
return AC_SUCCESS;
}
AcResult
acSynchronizeMesh(void)
{
acSynchronizeStream(STREAM_ALL);
synchronize_halos();
acSynchronizeStream(STREAM_ALL);
return AC_SUCCESS;
}
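To make the index arithmetic above concrete, here is a hedged walk-through for a hypothetical two-device run with NGHOST = 3 and subgrid.n.z = 32 (so subgrid.m.z = 38); the numbers are illustrative and assume the plane-sized copies run over contiguous xy-plates with z slowest.
// Illustration only (assumed: 2 devices, NGHOST = 3, subgrid.n.z = 32):
// num_vertices = subgrid.m.x * subgrid.m.y * NGHOST  -> one xy-plate, 3 planes thick
//
// Forward copy, device i -> device i+1:
//   src = (0, 0, subgrid.n.z) = (0, 0, 32)  -> local z = 32..34, last computed planes of device i
//   dst = (0, 0, 0)                         -> local z =  0..2,  front halo of device i+1
//
// Backward copy, device i+1 -> device i:
//   src = (0, 0, NGHOST) = (0, 0, 3)                -> local z =  3..5,  first computed planes of device i+1
//   dst = (0, 0, NGHOST + subgrid.n.z) = (0, 0, 35) -> local z = 35..37, back halo of device i
//
// The outermost plates of the whole grid are not touched here; they depend on
// the boundary conditions and are handled by global_boundcondstep().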
AcResult
acInit(const AcMeshInfo& config)
{
@@ -235,18 +290,187 @@ acInit(const AcMeshInfo& config)
loadGlobalGrid(devices[i], grid);
printDeviceInfo(devices[i]);
}
acSynchronizeStream(STREAM_ALL);
return AC_SUCCESS;
}
AcResult
acQuit(void)
{
acSynchronizeStream(STREAM_ALL);
for (int i = 0; i < num_devices; ++i) {
destroyDevice(devices[i]);
}
return AC_SUCCESS;
}
AcResult
acIntegrateStepWithOffset(const int& isubstep, const AcReal& dt, const int3& start, const int3& end)
{
// See the beginning of the file for an explanation of the index mapping
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
// DECOMPOSITION OFFSET HERE
const int3 d0 = (int3){NGHOST, NGHOST, NGHOST + i * subgrid.n.z};
const int3 d1 = d0 + (int3){subgrid.n.x, subgrid.n.y, subgrid.n.z};
const int3 da = max(start, d0);
const int3 db = min(end, d1);
if (db.z >= da.z) {
const int3 da_local = da - (int3){0, 0, i * subgrid.n.z};
const int3 db_local = db - (int3){0, 0, i * subgrid.n.z};
rkStep(devices[i], STREAM_PRIMARY, isubstep, da_local, db_local, dt);
}
}
return AC_SUCCESS;
}
AcResult
acIntegrateStep(const int& isubstep, const AcReal& dt)
{
const int3 start = (int3){NGHOST, NGHOST, NGHOST};
const int3 end = start + grid.n;
acIntegrateStepWithOffset(isubstep, dt, start, end);
return AC_SUCCESS;
}
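As a hedged illustration of the clamping in acIntegrateStepWithOffset, assume 2 devices, NGHOST = 3 and subgrid.n = (128, 128, 32), so grid.n = (128, 128, 64); acIntegrateStep then requests start = (3, 3, 3) and end = (131, 131, 67).
// Illustration only (assumed: 2 devices, NGHOST = 3, subgrid.n = (128, 128, 32)):
// Device 0: d0 = (3, 3, 3),  d1 = (131, 131, 35)
//           da = max(start, d0) = (3, 3, 3),  db = min(end, d1) = (131, 131, 35)
//           local range after subtracting (0, 0, 0):  (3, 3, 3) .. (131, 131, 35)
// Device 1: d0 = (3, 3, 35), d1 = (131, 131, 67)
//           da = (3, 3, 35),                  db = (131, 131, 67)
//           local range after subtracting (0, 0, 32): (3, 3, 3) .. (131, 131, 35)
// Both devices end up integrating the same local index range, each over its own
// z-slab of the global grid. When the requested range does not overlap a
// device's slab, the db.z >= da.z check is meant to skip that device.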
static AcResult
local_boundcondstep(void)
{
if (num_devices == 1) {
boundcondStep(devices[0], STREAM_PRIMARY, (int3){0, 0, 0}, subgrid.m);
}
else {
// Local boundary conditions
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
const int3 d0 = (int3){0, 0, NGHOST}; // DECOMPOSITION OFFSET HERE
const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.n.z};
boundcondStep(devices[i], STREAM_PRIMARY, d0, d1);
}
}
return AC_SUCCESS;
}
static AcResult
global_boundcondstep(void)
{
if (num_devices > 1) {
// With periodic boundary conditions we exchange the front and back plates of the
// grid. The exchange is done between the first and last device (0 and num_devices - 1).
const int num_vertices = subgrid.m.x * subgrid.m.y * NGHOST;
// ...|ooooxxx|... -> xxx|ooooooo|...
{
const int3 src = (int3){0, 0, subgrid.n.z};
const int3 dst = (int3){0, 0, 0};
copyMeshDeviceToDevice(devices[num_devices - 1], STREAM_PRIMARY, src, devices[0], dst,
num_vertices);
}
// ...|ooooooo|xxx <- ...|xxxoooo|...
{
const int3 src = (int3){0, 0, NGHOST};
const int3 dst = (int3){0, 0, NGHOST + subgrid.n.z};
copyMeshDeviceToDevice(devices[0], STREAM_PRIMARY, src, devices[num_devices - 1], dst,
num_vertices);
}
}
return AC_SUCCESS;
}
AcResult
acBoundcondStep(void)
{
local_boundcondstep();
acSynchronizeStream(STREAM_ALL);
global_boundcondstep();
synchronize_halos();
acSynchronizeStream(STREAM_ALL);
return AC_SUCCESS;
}
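Continuing the same two-device illustration (NGHOST = 3, subgrid.n.z = 32), the ordering in acBoundcondStep can be read as follows; what boundcondStep does internally is not shown in this diff, so the first step is an assumption.
// Sketch of the ordering for the assumed 2-device case:
// 1. local_boundcondstep(): each device presumably fills its own ghost zones
//    for the z-planes it computes (local z = 3..34).
// 2. global_boundcondstep(): periodic wrap in z over the whole grid:
//    device 1, local z = 32..34 -> device 0, local z =  0..2  (front plate)
//    device 0, local z =  3..5  -> device 1, local z = 35..37 (back plate)
// 3. synchronize_halos(): interior halo exchange between neighbors, as in the
//    earlier sketch.
// The STREAM_ALL synchronization after step 1 ensures that steps 2 and 3 do not
// read planes that step 1 is still writing; the final synchronization ensures
// all copies have completed before acBoundcondStep returns.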
static AcResult
swap_buffers(void)
{
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
swapBuffers(devices[i]);
}
return AC_SUCCESS;
}
AcResult
acIntegrate(const AcReal& dt)
{
acSynchronizeStream(STREAM_ALL);
for (int isubstep = 0; isubstep < 3; ++isubstep) {
acIntegrateStep(isubstep, dt); // Note: boundaries must be initialized.
swap_buffers();
acBoundcondStep();
}
return AC_SUCCESS;
}
static AcReal
simple_final_reduce_scal(const ReductionType& rtype, const AcReal* results, const int& n)
{
AcReal res = results[0];
for (int i = 1; i < n; ++i) {
if (rtype == RTYPE_MAX) {
res = max(res, results[i]);
}
else if (rtype == RTYPE_MIN) {
res = min(res, results[i]);
}
else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
res = sum(res, results[i]);
}
else {
ERROR("Invalid rtype");
}
}
if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
const AcReal inv_n = AcReal(1.) / (grid.n.x * grid.n.y * grid.n.z);
res = sqrt(inv_n * res);
}
return res;
}
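A small numerical illustration of the combine step; the figures are made up, and the assumption that each device returns a partial sum of squares for RTYPE_RMS is inferred from the sqrt(inv_n * res) above rather than stated in this diff.
// Illustration only (assumed: 2 devices, grid.n = 128 x 128 x 64):
// RTYPE_MAX: results = { 1.7, 2.3 }      -> res = max(1.7, 2.3) = 2.3
// RTYPE_RMS: results = { 4.0e6, 6.0e6 }  // presumably per-device sums of squares
//            res = sqrt((4.0e6 + 6.0e6) / (128 * 128 * 64)) ≈ 3.09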
AcReal
acReduceScal(const ReductionType& rtype, const VertexBufferHandle& vtxbuffer_handle)
{
acSynchronizeStream(STREAM_ALL);
AcReal results[num_devices];
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
reduceScal(devices[i], STREAM_PRIMARY, rtype, vtxbuffer_handle, &results[i]);
}
return simple_final_reduce_scal(rtype, results, num_devices);
}
AcReal
acReduceVec(const ReductionType& rtype, const VertexBufferHandle& a, const VertexBufferHandle& b,
const VertexBufferHandle& c)
{
acSynchronizeStream(STREAM_ALL);
AcReal results[num_devices];
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
reduceVec(devices[i], STREAM_PRIMARY, rtype, a, b, c, &results[i]);
}
return simple_final_reduce_scal(rtype, results, num_devices);
}
AcResult
acLoadWithOffset(const AcMesh& host_mesh, const int3& src, const int num_vertices)
{
@@ -284,6 +508,14 @@ acLoadWithOffset(const AcMesh& host_mesh, const int3& src, const int num_vertice
return AC_SUCCESS;
}
AcResult
acLoad(const AcMesh& host_mesh)
{
acLoadWithOffset(host_mesh, (int3){0, 0, 0}, AC_VTXBUF_SIZE(host_mesh.info));
acSynchronizeStream(STREAM_ALL);
return AC_SUCCESS;
}
AcResult
acStoreWithOffset(const int3& src, const int num_vertices, AcMesh* host_mesh)
{
@@ -308,241 +540,14 @@ acStoreWithOffset(const int3& src, const int num_vertices, AcMesh* host_mesh)
return AC_SUCCESS;
}
// acCopyMeshToDevice
AcResult
acLoad(const AcMesh& host_mesh)
{
return acLoadWithOffset(host_mesh, (int3){0, 0, 0}, AC_VTXBUF_SIZE(host_mesh.info));
}
// acCopyMeshToHost
AcResult
acStore(AcMesh* host_mesh)
{
return acStoreWithOffset((int3){0, 0, 0}, AC_VTXBUF_SIZE(host_mesh->info), host_mesh);
}
static AcResult
acSwapBuffers(void)
{
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
swapBuffers(devices[i]);
}
return AC_SUCCESS;
}
static AcResult
acSynchronizeHalos(void)
{
// Exchanges the halos of subgrids
// After this step, the data within the main grid ranging from
// (0, 0, NGHOST) -> grid.m.x, grid.m.y, NGHOST + grid.n.z
// has been synchronized and transferred to appropriate subgrids
// We loop only to num_devices - 1 since the front and back plate of the grid is not
// transferred because their contents depend on the boundary conditions.
// IMPORTANT NOTE: the boundary conditions must be applied before calling this function!
// I.e. the halos of subgrids must contain up-to-date data!
// #pragma omp parallel for
for (int i = 0; i < num_devices - 1; ++i) {
const int num_vertices = subgrid.m.x * subgrid.m.y * NGHOST;
// ...|ooooxxx|... -> xxx|ooooooo|...
{
const int3 src = (int3){0, 0, subgrid.n.z};
const int3 dst = (int3){0, 0, 0};
copyMeshDeviceToDevice(devices[i], STREAM_PRIMARY, src, devices[(i + 1) % num_devices],
dst, num_vertices);
}
// ...|ooooooo|xxx <- ...|xxxoooo|...
{
const int3 src = (int3){0, 0, NGHOST};
const int3 dst = (int3){0, 0, NGHOST + subgrid.n.z};
copyMeshDeviceToDevice(devices[(i + 1) % num_devices], STREAM_PRIMARY, src, devices[i],
dst, num_vertices);
}
}
return AC_SUCCESS;
}
static AcResult
acSynchronizeStream(const StreamType stream)
{
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
synchronize(devices[i], stream);
}
return AC_SUCCESS;
}
AcResult
acSynchronize(void)
{
acSynchronizeStream(STREAM_ALL);
acSynchronizeHalos();
acSynchronizeStream(STREAM_ALL);
return AC_SUCCESS;
}
static AcResult
acLocalBoundcondStep(void)
{
if (num_devices == 1) {
boundcondStep(devices[0], STREAM_PRIMARY, (int3){0, 0, 0}, subgrid.m);
}
else {
// Local boundary conditions
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
const int3 d0 = (int3){0, 0, NGHOST}; // DECOMPOSITION OFFSET HERE
const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.n.z};
boundcondStep(devices[i], STREAM_PRIMARY, d0, d1);
}
}
return AC_SUCCESS;
}
static AcResult
acGlobalBoundcondStep(void)
{
if (num_devices > 1) {
// With periodic boundary conditions we exchange the front and back plates of the
// grid. The exchange is done between the first and last device (0 and num_devices - 1).
const int num_vertices = subgrid.m.x * subgrid.m.y * NGHOST;
// ...|ooooxxx|... -> xxx|ooooooo|...
{
const int3 src = (int3){0, 0, subgrid.n.z};
const int3 dst = (int3){0, 0, 0};
copyMeshDeviceToDevice(devices[num_devices - 1], STREAM_PRIMARY, src, devices[0], dst,
num_vertices);
}
// ...|ooooooo|xxx <- ...|xxxoooo|...
{
const int3 src = (int3){0, 0, NGHOST};
const int3 dst = (int3){0, 0, NGHOST + subgrid.n.z};
copyMeshDeviceToDevice(devices[0], STREAM_PRIMARY, src, devices[num_devices - 1], dst,
num_vertices);
}
}
return AC_SUCCESS;
}
AcResult
acBoundcondStep(void)
{
acLocalBoundcondStep();
acSynchronizeStream(STREAM_ALL);
acGlobalBoundcondStep();
acSynchronizeHalos();
acSynchronizeStream(STREAM_ALL);
return AC_SUCCESS;
}
AcResult
acIntegrateStepWithOffset(const int& isubstep, const AcReal& dt, const int3& start, const int3& end)
{
// See the beginning of the file for an explanation of the index mapping
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
// DECOMPOSITION OFFSET HERE
const int3 d0 = (int3){NGHOST, NGHOST, NGHOST + i * subgrid.n.z};
const int3 d1 = d0 + (int3){subgrid.n.x, subgrid.n.y, subgrid.n.z};
const int3 da = max(start, d0);
const int3 db = min(end, d1);
if (db.z >= da.z) {
const int3 da_local = da - (int3){0, 0, i * subgrid.n.z};
const int3 db_local = db - (int3){0, 0, i * subgrid.n.z};
rkStep(devices[i], STREAM_PRIMARY, isubstep, da_local, db_local, dt);
}
}
return AC_SUCCESS;
}
AcResult
acIntegrateStep(const int& isubstep, const AcReal& dt)
{
const int3 start = (int3){NGHOST, NGHOST, NGHOST};
const int3 end = start + grid.n;
acIntegrateStepWithOffset(isubstep, dt, start, end);
return AC_SUCCESS;
}
AcResult
acIntegrate(const AcReal& dt)
{
for (int isubstep = 0; isubstep < 3; ++isubstep) {
acIntegrateStep(isubstep, dt); // Note: boundaries must be initialized.
acSwapBuffers();
acLocalBoundcondStep();
acSynchronizeStream(STREAM_ALL);
acGlobalBoundcondStep();
acSynchronizeHalos();
acSynchronizeStream(STREAM_ALL);
}
return AC_SUCCESS;
}
static AcReal
simple_final_reduce_scal(const ReductionType& rtype, const AcReal* results, const int& n)
{
AcReal res = results[0];
for (int i = 1; i < n; ++i) {
if (rtype == RTYPE_MAX) {
res = max(res, results[i]);
}
else if (rtype == RTYPE_MIN) {
res = min(res, results[i]);
}
else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
res = sum(res, results[i]);
}
else {
ERROR("Invalid rtype");
}
}
if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
const AcReal inv_n = AcReal(1.) / (grid.n.x * grid.n.y * grid.n.z);
res = sqrt(inv_n * res);
}
return res;
}
AcReal
acReduceScal(const ReductionType& rtype, const VertexBufferHandle& vtxbuffer_handle)
{
AcReal results[num_devices];
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
reduceScal(devices[i], STREAM_PRIMARY, rtype, vtxbuffer_handle, &results[i]);
}
return simple_final_reduce_scal(rtype, results, num_devices);
}
AcReal
acReduceVec(const ReductionType& rtype, const VertexBufferHandle& a, const VertexBufferHandle& b,
const VertexBufferHandle& c)
{
AcReal results[num_devices];
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
reduceVec(devices[i], STREAM_PRIMARY, rtype, a, b, c, &results[i]);
}
return simple_final_reduce_scal(rtype, results, num_devices);
}
AcResult
acLoadDeviceConstant(const AcRealParam param, const AcReal value)
{
@@ -552,9 +557,3 @@ acLoadDeviceConstant(const AcRealParam param, const AcReal value)
}
return AC_SUCCESS;
}
AcResult
acAutoOptimize(void)
{
return autoOptimize(devices[0]);
}

View File

@@ -27,15 +27,6 @@
#pragma once
#include "astaroth.h"
// clang-format off
typedef enum {
STREAM_PRIMARY,
STREAM_SECONDARY,
NUM_STREAM_TYPES,
STREAM_ALL
} StreamType;
// clang-format on
typedef struct {
int3 m;
int3 n;

View File

@@ -413,8 +413,6 @@ check_rk3(const AcMeshInfo& mesh_info)
const AcReal dt = AcReal(1e-2); // Use a small constant timestep to avoid instabilities
acIntegrate(dt);
acBoundcondStep();
acSynchronize();
model_rk3(dt, model_mesh);
boundconds(model_mesh->info, model_mesh);
@@ -712,8 +710,6 @@ run_autotest(void)
// Device integration step
acIntegrate(dt);
acBoundcondStep();
acSynchronize();
acStore(candidate_mesh);
// Check fields

View File

@@ -122,7 +122,6 @@ run_benchmark(void)
// Warmup
for (int i = 0; i < 10; ++i) {
acIntegrate(0);
acSynchronize();
}
Timer t;
@@ -135,8 +134,6 @@ run_benchmark(void)
#else // GEN_BENCHMARK_FULL
acIntegrate(dt);
#endif
acSynchronize();
const double ms_elapsed = timer_diff_nsec(t) / 1e6;
results.push_back(ms_elapsed);
}

View File

@@ -406,7 +406,7 @@ run_renderer(void)
const int num_vertices = mesh->info.int_params[AC_mxy];
const int3 dst = (int3){0, 0, k_slice};
acStoreWithOffset(dst, num_vertices, mesh);
acSynchronize();
acSynchronizeStream(STREAM_ALL);
renderer_draw(*mesh); // Bottleneck is here
printf("Step #%d, dt: %f\n", steps, double(dt));
timer_reset(&frame_timer);

View File

@@ -205,7 +205,6 @@ run_simulation(void)
write_mesh_info(&mesh_info);
print_diagnostics(0, AcReal(.0), t_step, diag_file);
acSynchronize();
acStore(mesh);
save_mesh(*mesh, 0, t_step);
@@ -275,8 +274,6 @@ run_simulation(void)
acSynchronize();
acStore(mesh);
*/
acSynchronize();
acStore(mesh);
save_mesh(*mesh, i, t_step);