diff --git a/include/astaroth.h b/include/astaroth.h index 7e8a3a8..848740b 100644 --- a/include/astaroth.h +++ b/include/astaroth.h @@ -215,8 +215,7 @@ typedef struct { * ============================================================================= */ typedef enum { - STREAM_PRIMARY, // - STREAM_SECONDARY, // + STREAM_DEFAULT, NUM_STREAM_TYPES, // STREAM_ALL } StreamType; @@ -275,25 +274,35 @@ AcResult acStore(AcMesh* host_mesh); */ /** Loads a parameter to the constant memory of all GPUs in the node. Asynchronous. */ AcResult acLoadDeviceConstant(const AcRealParam param, const AcReal value); +AcResult acLoadDeviceConstantAsync(const AcRealParam param, const AcReal value, + const StreamType stream); /** Splits a subset of the host_mesh and distributes it among the GPUs in the node. Asynchronous. */ AcResult acLoadWithOffset(const AcMesh& host_mesh, const int3& start, const int num_vertices); +AcResult acLoadWithOffsetAsync(const AcMesh& host_mesh, const int3& start, const int num_vertices, + const StreamType stream); /** Gathers a subset of the data distributed among the GPUs in the node and stores the mesh back to * CPU memory. Asynchronous. */ AcResult acStoreWithOffset(const int3& start, const int num_vertices, AcMesh* host_mesh); +AcResult acStoreWithOffsetAsync(const int3& start, const int num_vertices, AcMesh* host_mesh, + const StreamType stream); /** Performs a single RK3 step without computing boundary conditions. Asynchronous.*/ AcResult acIntegrateStep(const int& isubstep, const AcReal& dt); +AcResult acIntegrateStepAsync(const int& isubstep, const AcReal& dt, const StreamType stream); /** Performs a single RK3 step on a subset of the mesh without computing the boundary conditions. * Asynchronous.*/ AcResult acIntegrateStepWithOffset(const int& isubstep, const AcReal& dt, const int3& start, const int3& end); +AcResult acIntegrateStepWithOffsetAsync(const int& isubstep, const AcReal& dt, const int3& start, + const int3& end, const StreamType stream); /** Performs the boundary condition step on the GPUs in the node. Asynchronous. */ AcResult acBoundcondStep(void); +AcResult acBoundcondStepAsync(const StreamType stream); /* End extern "C" */ #ifdef __cplusplus diff --git a/src/core/astaroth.cu b/src/core/astaroth.cu index 5decb78..8cfcd2c 100644 --- a/src/core/astaroth.cu +++ b/src/core/astaroth.cu @@ -195,7 +195,7 @@ acSynchronizeStream(const StreamType stream) } static AcResult -synchronize_halos(void) +synchronize_halos(const StreamType stream) { // Exchanges the halos of subgrids // After this step, the data within the main grid ranging from @@ -214,15 +214,15 @@ synchronize_halos(void) { const int3 src = (int3){0, 0, subgrid.n.z}; const int3 dst = (int3){0, 0, 0}; - copyMeshDeviceToDevice(devices[i], STREAM_PRIMARY, src, devices[(i + 1) % num_devices], - dst, num_vertices); + copyMeshDeviceToDevice(devices[i], stream, src, devices[(i + 1) % num_devices], dst, + num_vertices); } // ...|ooooooo|xxx <- ...|xxxoooo|... { const int3 src = (int3){0, 0, NGHOST}; const int3 dst = (int3){0, 0, NGHOST + subgrid.n.z}; - copyMeshDeviceToDevice(devices[(i + 1) % num_devices], STREAM_PRIMARY, src, devices[i], - dst, num_vertices); + copyMeshDeviceToDevice(devices[(i + 1) % num_devices], stream, src, devices[i], dst, + num_vertices); } } return AC_SUCCESS; @@ -232,7 +232,7 @@ AcResult acSynchronizeMesh(void) { acSynchronizeStream(STREAM_ALL); - synchronize_halos(); + synchronize_halos(STREAM_DEFAULT); acSynchronizeStream(STREAM_ALL); return AC_SUCCESS; @@ -307,7 +307,8 @@ acQuit(void) } AcResult -acIntegrateStepWithOffset(const int& isubstep, const AcReal& dt, const int3& start, const int3& end) +acIntegrateStepWithOffsetAsync(const int& isubstep, const AcReal& dt, const int3& start, + const int3& end, const StreamType stream) { // See the beginning of the file for an explanation of the index mapping // #pragma omp parallel for @@ -322,27 +323,37 @@ acIntegrateStepWithOffset(const int& isubstep, const AcReal& dt, const int3& sta if (db.z >= da.z) { const int3 da_local = da - (int3){0, 0, i * subgrid.n.z}; const int3 db_local = db - (int3){0, 0, i * subgrid.n.z}; - rkStep(devices[i], STREAM_PRIMARY, isubstep, da_local, db_local, dt); + rkStep(devices[i], stream, isubstep, da_local, db_local, dt); } } return AC_SUCCESS; } AcResult -acIntegrateStep(const int& isubstep, const AcReal& dt) +acIntegrateStepWithOffset(const int& isubstep, const AcReal& dt, const int3& start, const int3& end) +{ + return acIntegrateStepWithOffsetAsync(isubstep, dt, start, end, STREAM_DEFAULT); +} + +AcResult +acIntegrateStepAsync(const int& isubstep, const AcReal& dt, const StreamType stream) { const int3 start = (int3){NGHOST, NGHOST, NGHOST}; const int3 end = start + grid.n; - acIntegrateStepWithOffset(isubstep, dt, start, end); + return acIntegrateStepWithOffsetAsync(isubstep, dt, start, end, stream); +} - return AC_SUCCESS; +AcResult +acIntegrateStep(const int& isubstep, const AcReal& dt) +{ + return acIntegrateStepAsync(isubstep, dt, STREAM_DEFAULT); } static AcResult -local_boundcondstep(void) +local_boundcondstep(const StreamType stream) { if (num_devices == 1) { - boundcondStep(devices[0], STREAM_PRIMARY, (int3){0, 0, 0}, subgrid.m); + boundcondStep(devices[0], stream, (int3){0, 0, 0}, subgrid.m); } else { // Local boundary conditions @@ -350,14 +361,14 @@ local_boundcondstep(void) for (int i = 0; i < num_devices; ++i) { const int3 d0 = (int3){0, 0, NGHOST}; // DECOMPOSITION OFFSET HERE const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.n.z}; - boundcondStep(devices[i], STREAM_PRIMARY, d0, d1); + boundcondStep(devices[i], stream, d0, d1); } } return AC_SUCCESS; } static AcResult -global_boundcondstep(void) +global_boundcondstep(const StreamType stream) { if (num_devices > 1) { // With periodic boundary conditions we exchange the front and back plates of the @@ -367,30 +378,37 @@ global_boundcondstep(void) { const int3 src = (int3){0, 0, subgrid.n.z}; const int3 dst = (int3){0, 0, 0}; - copyMeshDeviceToDevice(devices[num_devices - 1], STREAM_PRIMARY, src, devices[0], dst, + copyMeshDeviceToDevice(devices[num_devices - 1], stream, src, devices[0], dst, num_vertices); } // ...|ooooooo|xxx <- ...|xxxoooo|... { const int3 src = (int3){0, 0, NGHOST}; const int3 dst = (int3){0, 0, NGHOST + subgrid.n.z}; - copyMeshDeviceToDevice(devices[0], STREAM_PRIMARY, src, devices[num_devices - 1], dst, + copyMeshDeviceToDevice(devices[0], stream, src, devices[num_devices - 1], dst, num_vertices); } } return AC_SUCCESS; } +AcResult +acBoundcondStepAsync(const StreamType stream) +{ + ERRCHK_ALWAYS(stream < NUM_STREAM_TYPES); + + local_boundcondstep(stream); + acSynchronizeStream(stream); + global_boundcondstep(stream); + synchronize_halos(stream); + acSynchronizeStream(stream); + return AC_SUCCESS; +} + AcResult acBoundcondStep(void) { - local_boundcondstep(); - acSynchronizeStream(STREAM_ALL); - global_boundcondstep(); - synchronize_halos(); - acSynchronizeStream(STREAM_ALL); - - return AC_SUCCESS; + return acBoundcondStepAsync(STREAM_DEFAULT); } static AcResult @@ -450,7 +468,7 @@ acReduceScal(const ReductionType& rtype, const VertexBufferHandle& vtxbuffer_han AcReal results[num_devices]; // #pragma omp parallel for for (int i = 0; i < num_devices; ++i) { - reduceScal(devices[i], STREAM_PRIMARY, rtype, vtxbuffer_handle, &results[i]); + reduceScal(devices[i], STREAM_DEFAULT, rtype, vtxbuffer_handle, &results[i]); } return simple_final_reduce_scal(rtype, results, num_devices); @@ -465,14 +483,15 @@ acReduceVec(const ReductionType& rtype, const VertexBufferHandle& a, const Verte AcReal results[num_devices]; // #pragma omp parallel for for (int i = 0; i < num_devices; ++i) { - reduceVec(devices[i], STREAM_PRIMARY, rtype, a, b, c, &results[i]); + reduceVec(devices[i], STREAM_DEFAULT, rtype, a, b, c, &results[i]); } return simple_final_reduce_scal(rtype, results, num_devices); } AcResult -acLoadWithOffset(const AcMesh& host_mesh, const int3& src, const int num_vertices) +acLoadWithOffsetAsync(const AcMesh& host_mesh, const int3& src, const int num_vertices, + const StreamType stream) { // See the beginning of the file for an explanation of the index mapping // #pragma omp parallel for @@ -501,13 +520,19 @@ acLoadWithOffset(const AcMesh& host_mesh, const int3& src, const int num_vertice const int3 da_local = (int3){da.x, da.y, da.z - i * grid.n.z / num_devices}; // printf("\t\tcopy %d cells to local index ", copy_cells); printInt3(da_local); // printf("\n"); - copyMeshToDevice(devices[i], STREAM_PRIMARY, host_mesh, da, da_local, copy_cells); + copyMeshToDevice(devices[i], stream, host_mesh, da, da_local, copy_cells); } // printf("\n"); } return AC_SUCCESS; } +AcResult +acLoadWithOffset(const AcMesh& host_mesh, const int3& src, const int num_vertices) +{ + return acLoadWithOffsetAsync(host_mesh, src, num_vertices, STREAM_DEFAULT); +} + AcResult acLoad(const AcMesh& host_mesh) { @@ -517,7 +542,8 @@ acLoad(const AcMesh& host_mesh) } AcResult -acStoreWithOffset(const int3& src, const int num_vertices, AcMesh* host_mesh) +acStoreWithOffsetAsync(const int3& src, const int num_vertices, AcMesh* host_mesh, + const StreamType stream) { // See the beginning of the file for an explanation of the index mapping // #pragma omp parallel for @@ -534,12 +560,18 @@ acStoreWithOffset(const int3& src, const int num_vertices, AcMesh* host_mesh) const int copy_cells = gridIdx(subgrid, db) - gridIdx(subgrid, da); // DECOMPOSITION OFFSET HERE const int3 da_local = (int3){da.x, da.y, da.z - i * grid.n.z / num_devices}; - copyMeshToHost(devices[i], STREAM_PRIMARY, da_local, da, copy_cells, host_mesh); + copyMeshToHost(devices[i], stream, da_local, da, copy_cells, host_mesh); } } return AC_SUCCESS; } +AcResult +acStoreWithOffset(const int3& src, const int num_vertices, AcMesh* host_mesh) +{ + return acStoreWithOffsetAsync(src, num_vertices, host_mesh, STREAM_DEFAULT); +} + AcResult acStore(AcMesh* host_mesh) { @@ -549,12 +581,17 @@ acStore(AcMesh* host_mesh) } AcResult -acLoadDeviceConstant(const AcRealParam param, const AcReal value) +acLoadDeviceConstantAsync(const AcRealParam param, const AcReal value, const StreamType stream) { - const StreamType stream = STREAM_PRIMARY; // #pragma omp parallel for for (int i = 0; i < num_devices; ++i) { loadDeviceConstant(devices[i], stream, param, value); } return AC_SUCCESS; } + +AcResult +acLoadDeviceConstant(const AcRealParam param, const AcReal value) +{ + return acLoadDeviceConstantAsync(param, value, STREAM_DEFAULT); +}