From fa422cf4575cb0a8eace8f0409693e881fc0f709 Mon Sep 17 00:00:00 2001
From: jpekkila
Date: Wed, 10 Jun 2020 02:16:23 +0300
Subject: [PATCH] Added a better-pipelined version of the acGridIntegrate and a
 switch for toggling the transfer of corners

---
 src/core/device.cc | 211 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 211 insertions(+)

diff --git a/src/core/device.cc b/src/core/device.cc
index 35af82d..481b465 100644
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -1373,9 +1373,220 @@ acGridStoreMesh(const Stream stream, AcMesh* host_mesh)
 
 #define MPI_COMPUTE_ENABLED (1)
 #define MPI_COMM_ENABLED (1)
+#define MPI_INCL_CORNERS (0)
 
 AcResult
 acGridIntegrate(const Stream stream, const AcReal dt)
+{
+    ERRCHK(grid.initialized);
+    acGridSynchronizeStream(stream);
+
+    const Device device = grid.device;
+    const int3 nn = grid.nn;
+#if MPI_INCL_CORNERS
+    CommData corner_data = grid.corner_data; // Do not rm: required for corners
+#endif // MPI_INCL_CORNERS
+    CommData edgex_data = grid.edgex_data;
+    CommData edgey_data = grid.edgey_data;
+    CommData edgez_data = grid.edgez_data;
+    CommData sidexy_data = grid.sidexy_data;
+    CommData sidexz_data = grid.sidexz_data;
+    CommData sideyz_data = grid.sideyz_data;
+
+// Corners
+#if MPI_INCL_CORNERS
+    // Do not rm: required for corners
+    const int3 corner_b0s[] = {
+        (int3){0, 0, 0},
+        (int3){NGHOST + nn.x, 0, 0},
+        (int3){0, NGHOST + nn.y, 0},
+        (int3){0, 0, NGHOST + nn.z},
+
+        (int3){NGHOST + nn.x, NGHOST + nn.y, 0},
+        (int3){NGHOST + nn.x, 0, NGHOST + nn.z},
+        (int3){0, NGHOST + nn.y, NGHOST + nn.z},
+        (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST + nn.z},
+    };
+#endif // MPI_INCL_CORNERS
+
+    // Edges X
+    const int3 edgex_b0s[] = {
+        (int3){NGHOST, 0, 0},
+        (int3){NGHOST, NGHOST + nn.y, 0},
+
+        (int3){NGHOST, 0, NGHOST + nn.z},
+        (int3){NGHOST, NGHOST + nn.y, NGHOST + nn.z},
+    };
+
+    // Edges Y
+    const int3 edgey_b0s[] = {
+        (int3){0, NGHOST, 0},
+        (int3){NGHOST + nn.x, NGHOST, 0},
+
+        (int3){0, NGHOST, NGHOST + nn.z},
+        (int3){NGHOST + nn.x, NGHOST, NGHOST + nn.z},
+    };
+
+    // Edges Z
+    const int3 edgez_b0s[] = {
+        (int3){0, 0, NGHOST},
+        (int3){NGHOST + nn.x, 0, NGHOST},
+
+        (int3){0, NGHOST + nn.y, NGHOST},
+        (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST},
+    };
+
+    // Sides XY
+    const int3 sidexy_b0s[] = {
+        (int3){NGHOST, NGHOST, 0},             //
+        (int3){NGHOST, NGHOST, NGHOST + nn.z}, //
+    };
+
+    // Sides XZ
+    const int3 sidexz_b0s[] = {
+        (int3){NGHOST, 0, NGHOST},             //
+        (int3){NGHOST, NGHOST + nn.y, NGHOST}, //
+    };
+
+    // Sides YZ
+    const int3 sideyz_b0s[] = {
+        (int3){0, NGHOST, NGHOST},             //
+        (int3){NGHOST + nn.x, NGHOST, NGHOST}, //
+    };
+
+    for (int isubstep = 0; isubstep < 3; ++isubstep) {
+        acDeviceSynchronizeStream(device, STREAM_ALL);
+        MPI_Barrier(MPI_COMM_WORLD);
+
+#if MPI_COMPUTE_ENABLED
+        acPackCommData(device, sidexy_b0s, &sidexy_data);
+        acPackCommData(device, sidexz_b0s, &sidexz_data);
+        acPackCommData(device, sideyz_b0s, &sideyz_data);
+#endif // MPI_COMPUTE_ENABLED
+
+#if MPI_COMM_ENABLED
+        acTransferCommData(device, sidexy_b0s, &sidexy_data);
+        acTransferCommData(device, sidexz_b0s, &sidexz_data);
+        acTransferCommData(device, sideyz_b0s, &sideyz_data);
+#endif // MPI_COMM_ENABLED
+
+#if MPI_COMPUTE_ENABLED
+        //////////// INNER INTEGRATION //////////////
+        {
+            const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
+            const int3 m2 = nn;
+            acDeviceIntegrateSubstep(device, STREAM_16, isubstep, m1, m2, dt);
+        }
+
+        acPackCommData(device, edgex_b0s, &edgex_data);
+        acPackCommData(device, edgey_b0s, &edgey_data);
+        acPackCommData(device, edgez_b0s, &edgez_data);
+#endif // MPI_COMPUTE_ENABLED
+
+#if MPI_COMM_ENABLED
+        acTransferCommDataWait(sidexy_data);
+        acUnpinCommData(device, &sidexy_data);
+        acTransferCommDataWait(sidexz_data);
+        acUnpinCommData(device, &sidexz_data);
+        acTransferCommDataWait(sideyz_data);
+        acUnpinCommData(device, &sideyz_data);
+
+        acTransferCommData(device, edgex_b0s, &edgex_data);
+        acTransferCommData(device, edgey_b0s, &edgey_data);
+        acTransferCommData(device, edgez_b0s, &edgez_data);
+#endif // MPI_COMM_ENABLED
+
+#if MPI_COMPUTE_ENABLED
+#if MPI_INCL_CORNERS
+        acPackCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners
+#endif // MPI_INCL_CORNERS
+        acUnpackCommData(device, sidexy_b0s, &sidexy_data);
+        acUnpackCommData(device, sidexz_b0s, &sidexz_data);
+        acUnpackCommData(device, sideyz_b0s, &sideyz_data);
+#endif // MPI_COMPUTE_ENABLED
+
+#if MPI_COMM_ENABLED
+        acTransferCommDataWait(edgex_data);
+        acUnpinCommData(device, &edgex_data);
+        acTransferCommDataWait(edgey_data);
+        acUnpinCommData(device, &edgey_data);
+        acTransferCommDataWait(edgez_data);
+        acUnpinCommData(device, &edgez_data);
+
+#if MPI_INCL_CORNERS
+        acTransferCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners
+#endif // MPI_INCL_CORNERS
+#endif // MPI_COMM_ENABLED
+
+#if MPI_COMPUTE_ENABLED
+        acUnpackCommData(device, edgex_b0s, &edgex_data);
+        acUnpackCommData(device, edgey_b0s, &edgey_data);
+        acUnpackCommData(device, edgez_b0s, &edgez_data);
+#endif // MPI_COMPUTE_ENABLED
+
+#if MPI_COMM_ENABLED
+#if MPI_INCL_CORNERS
+        acTransferCommDataWait(corner_data);   // Do not rm: required for corners
+        acUnpinCommData(device, &corner_data); // Do not rm: required for corners
+#endif // MPI_INCL_CORNERS
+#endif // MPI_COMM_ENABLED
+#if MPI_COMPUTE_ENABLED
+#if MPI_INCL_CORNERS
+        acUnpackCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners
+#endif // MPI_INCL_CORNERS
+#endif // MPI_COMPUTE_ENABLED
+
+        // Wait for unpacking
+        acSyncCommData(sidexy_data);
+        acSyncCommData(sidexz_data);
+        acSyncCommData(sideyz_data);
+        acSyncCommData(edgex_data);
+        acSyncCommData(edgey_data);
+        acSyncCommData(edgez_data);
+#if MPI_INCL_CORNERS
+        acSyncCommData(corner_data); // Do not rm: required for corners
+#endif // MPI_INCL_CORNERS
+
+#if MPI_COMPUTE_ENABLED
+        { // Front
+            const int3 m1 = (int3){NGHOST, NGHOST, NGHOST};
+            const int3 m2 = m1 + (int3){nn.x, nn.y, NGHOST};
+            acDeviceIntegrateSubstep(device, STREAM_0, isubstep, m1, m2, dt);
+        }
+        { // Back
+            const int3 m1 = (int3){NGHOST, NGHOST, nn.z};
+            const int3 m2 = m1 + (int3){nn.x, nn.y, NGHOST};
+            acDeviceIntegrateSubstep(device, STREAM_1, isubstep, m1, m2, dt);
+        }
+        { // Bottom
+            const int3 m1 = (int3){NGHOST, NGHOST, 2 * NGHOST};
+            const int3 m2 = m1 + (int3){nn.x, NGHOST, nn.z - 2 * NGHOST};
+            acDeviceIntegrateSubstep(device, STREAM_2, isubstep, m1, m2, dt);
+        }
+        { // Top
+            const int3 m1 = (int3){NGHOST, nn.y, 2 * NGHOST};
+            const int3 m2 = m1 + (int3){nn.x, NGHOST, nn.z - 2 * NGHOST};
+            acDeviceIntegrateSubstep(device, STREAM_3, isubstep, m1, m2, dt);
+        }
+        { // Left
+            const int3 m1 = (int3){NGHOST, 2 * NGHOST, 2 * NGHOST};
+            const int3 m2 = m1 + (int3){NGHOST, nn.y - 2 * NGHOST, nn.z - 2 * NGHOST};
+            acDeviceIntegrateSubstep(device, STREAM_4, isubstep, m1, m2, dt);
+        }
+        { // Right
+            const int3 m1 = (int3){nn.x, 2 * NGHOST, 2 * NGHOST};
+            const int3 m2 = m1 + (int3){NGHOST, nn.y - 2 * NGHOST, nn.z - 2 * NGHOST};
+            acDeviceIntegrateSubstep(device, STREAM_5, isubstep, m1, m2, dt);
+        }
+#endif // MPI_COMPUTE_ENABLED
+        acDeviceSwapBuffers(device);
+    }
+
+    return AC_SUCCESS;
+}
+
+AcResult
+acGridIntegrateORIGINAL(const Stream stream, const AcReal dt)
 {
     ERRCHK(grid.initialized);
     // acGridSynchronizeStream(stream);