Introduced versions of the asynchronous functions which take a stream as a parameter

This commit is contained in:
jpekkila
2019-07-10 15:49:21 +03:00
parent bd98eaf9f7
commit 93fc121f5c
2 changed files with 80 additions and 34 deletions

View File

@@ -215,8 +215,7 @@ typedef struct {
* =============================================================================
*/
typedef enum {
STREAM_PRIMARY, //
STREAM_SECONDARY, //
STREAM_DEFAULT,
NUM_STREAM_TYPES, //
STREAM_ALL
} StreamType;
@@ -275,25 +274,35 @@ AcResult acStore(AcMesh* host_mesh);
*/
/** Loads a parameter to the constant memory of all GPUs in the node. Asynchronous. */
AcResult acLoadDeviceConstant(const AcRealParam param, const AcReal value);
AcResult acLoadDeviceConstantAsync(const AcRealParam param, const AcReal value,
const StreamType stream);
/** Splits a subset of the host_mesh and distributes it among the GPUs in the node. Asynchronous. */
AcResult acLoadWithOffset(const AcMesh& host_mesh, const int3& start, const int num_vertices);
AcResult acLoadWithOffsetAsync(const AcMesh& host_mesh, const int3& start, const int num_vertices,
const StreamType stream);
/** Gathers a subset of the data distributed among the GPUs in the node and stores the mesh back to
* CPU memory. Asynchronous.
*/
AcResult acStoreWithOffset(const int3& start, const int num_vertices, AcMesh* host_mesh);
AcResult acStoreWithOffsetAsync(const int3& start, const int num_vertices, AcMesh* host_mesh,
const StreamType stream);
/** Performs a single RK3 step without computing boundary conditions. Asynchronous.*/
AcResult acIntegrateStep(const int& isubstep, const AcReal& dt);
AcResult acIntegrateStepAsync(const int& isubstep, const AcReal& dt, const StreamType stream);
/** Performs a single RK3 step on a subset of the mesh without computing the boundary conditions.
* Asynchronous.*/
AcResult acIntegrateStepWithOffset(const int& isubstep, const AcReal& dt, const int3& start,
const int3& end);
AcResult acIntegrateStepWithOffsetAsync(const int& isubstep, const AcReal& dt, const int3& start,
const int3& end, const StreamType stream);
/** Performs the boundary condition step on the GPUs in the node. Asynchronous. */
AcResult acBoundcondStep(void);
AcResult acBoundcondStepAsync(const StreamType stream);
/* End extern "C" */
#ifdef __cplusplus

View File

@@ -195,7 +195,7 @@ acSynchronizeStream(const StreamType stream)
}
static AcResult
synchronize_halos(void)
synchronize_halos(const StreamType stream)
{
// Exchanges the halos of subgrids
// After this step, the data within the main grid ranging from
@@ -214,15 +214,15 @@ synchronize_halos(void)
{
const int3 src = (int3){0, 0, subgrid.n.z};
const int3 dst = (int3){0, 0, 0};
copyMeshDeviceToDevice(devices[i], STREAM_PRIMARY, src, devices[(i + 1) % num_devices],
dst, num_vertices);
copyMeshDeviceToDevice(devices[i], stream, src, devices[(i + 1) % num_devices], dst,
num_vertices);
}
// ...|ooooooo|xxx <- ...|xxxoooo|...
{
const int3 src = (int3){0, 0, NGHOST};
const int3 dst = (int3){0, 0, NGHOST + subgrid.n.z};
copyMeshDeviceToDevice(devices[(i + 1) % num_devices], STREAM_PRIMARY, src, devices[i],
dst, num_vertices);
copyMeshDeviceToDevice(devices[(i + 1) % num_devices], stream, src, devices[i], dst,
num_vertices);
}
}
return AC_SUCCESS;
@@ -232,7 +232,7 @@ AcResult
acSynchronizeMesh(void)
{
acSynchronizeStream(STREAM_ALL);
synchronize_halos();
synchronize_halos(STREAM_DEFAULT);
acSynchronizeStream(STREAM_ALL);
return AC_SUCCESS;
@@ -307,7 +307,8 @@ acQuit(void)
}
AcResult
acIntegrateStepWithOffset(const int& isubstep, const AcReal& dt, const int3& start, const int3& end)
acIntegrateStepWithOffsetAsync(const int& isubstep, const AcReal& dt, const int3& start,
const int3& end, const StreamType stream)
{
// See the beginning of the file for an explanation of the index mapping
// #pragma omp parallel for
@@ -322,27 +323,37 @@ acIntegrateStepWithOffset(const int& isubstep, const AcReal& dt, const int3& sta
if (db.z >= da.z) {
const int3 da_local = da - (int3){0, 0, i * subgrid.n.z};
const int3 db_local = db - (int3){0, 0, i * subgrid.n.z};
rkStep(devices[i], STREAM_PRIMARY, isubstep, da_local, db_local, dt);
rkStep(devices[i], stream, isubstep, da_local, db_local, dt);
}
}
return AC_SUCCESS;
}
AcResult
acIntegrateStep(const int& isubstep, const AcReal& dt)
acIntegrateStepWithOffset(const int& isubstep, const AcReal& dt, const int3& start, const int3& end)
{
return acIntegrateStepWithOffsetAsync(isubstep, dt, start, end, STREAM_DEFAULT);
}
AcResult
acIntegrateStepAsync(const int& isubstep, const AcReal& dt, const StreamType stream)
{
const int3 start = (int3){NGHOST, NGHOST, NGHOST};
const int3 end = start + grid.n;
acIntegrateStepWithOffset(isubstep, dt, start, end);
return acIntegrateStepWithOffsetAsync(isubstep, dt, start, end, stream);
}
return AC_SUCCESS;
AcResult
acIntegrateStep(const int& isubstep, const AcReal& dt)
{
return acIntegrateStepAsync(isubstep, dt, STREAM_DEFAULT);
}
static AcResult
local_boundcondstep(void)
local_boundcondstep(const StreamType stream)
{
if (num_devices == 1) {
boundcondStep(devices[0], STREAM_PRIMARY, (int3){0, 0, 0}, subgrid.m);
boundcondStep(devices[0], stream, (int3){0, 0, 0}, subgrid.m);
}
else {
// Local boundary conditions
@@ -350,14 +361,14 @@ local_boundcondstep(void)
for (int i = 0; i < num_devices; ++i) {
const int3 d0 = (int3){0, 0, NGHOST}; // DECOMPOSITION OFFSET HERE
const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.n.z};
boundcondStep(devices[i], STREAM_PRIMARY, d0, d1);
boundcondStep(devices[i], stream, d0, d1);
}
}
return AC_SUCCESS;
}
static AcResult
global_boundcondstep(void)
global_boundcondstep(const StreamType stream)
{
if (num_devices > 1) {
// With periodic boundary conditions we exchange the front and back plates of the
@@ -367,30 +378,37 @@ global_boundcondstep(void)
{
const int3 src = (int3){0, 0, subgrid.n.z};
const int3 dst = (int3){0, 0, 0};
copyMeshDeviceToDevice(devices[num_devices - 1], STREAM_PRIMARY, src, devices[0], dst,
copyMeshDeviceToDevice(devices[num_devices - 1], stream, src, devices[0], dst,
num_vertices);
}
// ...|ooooooo|xxx <- ...|xxxoooo|...
{
const int3 src = (int3){0, 0, NGHOST};
const int3 dst = (int3){0, 0, NGHOST + subgrid.n.z};
copyMeshDeviceToDevice(devices[0], STREAM_PRIMARY, src, devices[num_devices - 1], dst,
copyMeshDeviceToDevice(devices[0], stream, src, devices[num_devices - 1], dst,
num_vertices);
}
}
return AC_SUCCESS;
}
AcResult
acBoundcondStepAsync(const StreamType stream)
{
ERRCHK_ALWAYS(stream < NUM_STREAM_TYPES);
local_boundcondstep(stream);
acSynchronizeStream(stream);
global_boundcondstep(stream);
synchronize_halos(stream);
acSynchronizeStream(stream);
return AC_SUCCESS;
}
AcResult
acBoundcondStep(void)
{
local_boundcondstep();
acSynchronizeStream(STREAM_ALL);
global_boundcondstep();
synchronize_halos();
acSynchronizeStream(STREAM_ALL);
return AC_SUCCESS;
return acBoundcondStepAsync(STREAM_DEFAULT);
}
static AcResult
@@ -450,7 +468,7 @@ acReduceScal(const ReductionType& rtype, const VertexBufferHandle& vtxbuffer_han
AcReal results[num_devices];
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
reduceScal(devices[i], STREAM_PRIMARY, rtype, vtxbuffer_handle, &results[i]);
reduceScal(devices[i], STREAM_DEFAULT, rtype, vtxbuffer_handle, &results[i]);
}
return simple_final_reduce_scal(rtype, results, num_devices);
@@ -465,14 +483,15 @@ acReduceVec(const ReductionType& rtype, const VertexBufferHandle& a, const Verte
AcReal results[num_devices];
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
reduceVec(devices[i], STREAM_PRIMARY, rtype, a, b, c, &results[i]);
reduceVec(devices[i], STREAM_DEFAULT, rtype, a, b, c, &results[i]);
}
return simple_final_reduce_scal(rtype, results, num_devices);
}
AcResult
acLoadWithOffset(const AcMesh& host_mesh, const int3& src, const int num_vertices)
acLoadWithOffsetAsync(const AcMesh& host_mesh, const int3& src, const int num_vertices,
const StreamType stream)
{
// See the beginning of the file for an explanation of the index mapping
// #pragma omp parallel for
@@ -501,13 +520,19 @@ acLoadWithOffset(const AcMesh& host_mesh, const int3& src, const int num_vertice
const int3 da_local = (int3){da.x, da.y, da.z - i * grid.n.z / num_devices};
// printf("\t\tcopy %d cells to local index ", copy_cells); printInt3(da_local);
// printf("\n");
copyMeshToDevice(devices[i], STREAM_PRIMARY, host_mesh, da, da_local, copy_cells);
copyMeshToDevice(devices[i], stream, host_mesh, da, da_local, copy_cells);
}
// printf("\n");
}
return AC_SUCCESS;
}
AcResult
acLoadWithOffset(const AcMesh& host_mesh, const int3& src, const int num_vertices)
{
return acLoadWithOffsetAsync(host_mesh, src, num_vertices, STREAM_DEFAULT);
}
AcResult
acLoad(const AcMesh& host_mesh)
{
@@ -517,7 +542,8 @@ acLoad(const AcMesh& host_mesh)
}
AcResult
acStoreWithOffset(const int3& src, const int num_vertices, AcMesh* host_mesh)
acStoreWithOffsetAsync(const int3& src, const int num_vertices, AcMesh* host_mesh,
const StreamType stream)
{
// See the beginning of the file for an explanation of the index mapping
// #pragma omp parallel for
@@ -534,12 +560,18 @@ acStoreWithOffset(const int3& src, const int num_vertices, AcMesh* host_mesh)
const int copy_cells = gridIdx(subgrid, db) - gridIdx(subgrid, da);
// DECOMPOSITION OFFSET HERE
const int3 da_local = (int3){da.x, da.y, da.z - i * grid.n.z / num_devices};
copyMeshToHost(devices[i], STREAM_PRIMARY, da_local, da, copy_cells, host_mesh);
copyMeshToHost(devices[i], stream, da_local, da, copy_cells, host_mesh);
}
}
return AC_SUCCESS;
}
AcResult
acStoreWithOffset(const int3& src, const int num_vertices, AcMesh* host_mesh)
{
return acStoreWithOffsetAsync(src, num_vertices, host_mesh, STREAM_DEFAULT);
}
AcResult
acStore(AcMesh* host_mesh)
{
@@ -549,12 +581,17 @@ acStore(AcMesh* host_mesh)
}
AcResult
acLoadDeviceConstant(const AcRealParam param, const AcReal value)
acLoadDeviceConstantAsync(const AcRealParam param, const AcReal value, const StreamType stream)
{
const StreamType stream = STREAM_PRIMARY;
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
loadDeviceConstant(devices[i], stream, param, value);
}
return AC_SUCCESS;
}
AcResult
acLoadDeviceConstant(const AcRealParam param, const AcReal value)
{
return acLoadDeviceConstantAsync(param, value, STREAM_DEFAULT);
}