Added a better-pipelined version of acGridIntegrate and a switch for toggling the transfer of corners
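
The new version overlaps halo exchange with computation: the inner domain, which needs no remote data, is integrated while the side, edge and (optionally) corner halo blocks are packed, transferred and unpacked; the outer shell is then integrated as six slabs on separate streams. Corner transfer is controlled by the new MPI_INCL_CORNERS switch (off by default). The previous implementation is kept as acGridIntegrateORIGINAL.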

jpekkila
2020-06-10 02:16:23 +03:00
parent 9840b817d0
commit fa422cf457


@@ -1373,9 +1373,220 @@ acGridStoreMesh(const Stream stream, AcMesh* host_mesh)
#define MPI_COMPUTE_ENABLED (1)
#define MPI_COMM_ENABLED (1)
#define MPI_INCL_CORNERS (0)
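// Corner transfer is off by default; set MPI_INCL_CORNERS to (1) if the
// integration stencil also requires the corner ghost zones.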
AcResult
acGridIntegrate(const Stream stream, const AcReal dt)
{
ERRCHK(grid.initialized);
acGridSynchronizeStream(stream);
const Device device = grid.device;
const int3 nn = grid.nn;
#if MPI_INCL_CORNERS
CommData corner_data = grid.corner_data; // Do not rm: required for corners
#endif // MPI_INCL_CORNERS
CommData edgex_data = grid.edgex_data;
CommData edgey_data = grid.edgey_data;
CommData edgez_data = grid.edgez_data;
CommData sidexy_data = grid.sidexy_data;
CommData sidexz_data = grid.sidexz_data;
CommData sideyz_data = grid.sideyz_data;
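// The halo is decomposed into corner, edge and side (face) blocks that are
// communicated independently so that transfers can overlap computation. Each
// *_b0s array below holds the base (lowest) index of the blocks to pack/unpack.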
// Corners
#if MPI_INCL_CORNERS
// Do not rm: required for corners
const int3 corner_b0s[] = {
(int3){0, 0, 0},
(int3){NGHOST + nn.x, 0, 0},
(int3){0, NGHOST + nn.y, 0},
(int3){0, 0, NGHOST + nn.z},
(int3){NGHOST + nn.x, NGHOST + nn.y, 0},
(int3){NGHOST + nn.x, 0, NGHOST + nn.z},
(int3){0, NGHOST + nn.y, NGHOST + nn.z},
(int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST + nn.z},
};
#endif // MPI_INCL_CORNERS
// Edges X
const int3 edgex_b0s[] = {
(int3){NGHOST, 0, 0},
(int3){NGHOST, NGHOST + nn.y, 0},
(int3){NGHOST, 0, NGHOST + nn.z},
(int3){NGHOST, NGHOST + nn.y, NGHOST + nn.z},
};
// Edges Y
const int3 edgey_b0s[] = {
(int3){0, NGHOST, 0},
(int3){NGHOST + nn.x, NGHOST, 0},
(int3){0, NGHOST, NGHOST + nn.z},
(int3){NGHOST + nn.x, NGHOST, NGHOST + nn.z},
};
// Edges Z
const int3 edgez_b0s[] = {
(int3){0, 0, NGHOST},
(int3){NGHOST + nn.x, 0, NGHOST},
(int3){0, NGHOST + nn.y, NGHOST},
(int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST},
};
// Sides XY
const int3 sidexy_b0s[] = {
(int3){NGHOST, NGHOST, 0}, //
(int3){NGHOST, NGHOST, NGHOST + nn.z}, //
};
// Sides XZ
const int3 sidexz_b0s[] = {
(int3){NGHOST, 0, NGHOST}, //
(int3){NGHOST, NGHOST + nn.y, NGHOST}, //
};
// Sides YZ
const int3 sideyz_b0s[] = {
(int3){0, NGHOST, NGHOST}, //
(int3){NGHOST + nn.x, NGHOST, NGHOST}, //
};
for (int isubstep = 0; isubstep < 3; ++isubstep) {
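// One communication/computation pass per substep. All streams are drained and
// ranks aligned before the communication buffers from the previous pass are reused.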
acDeviceSynchronizeStream(device, STREAM_ALL);
MPI_Barrier(MPI_COMM_WORLD);
#if MPI_COMPUTE_ENABLED
acPackCommData(device, sidexy_b0s, &sidexy_data);
acPackCommData(device, sidexz_b0s, &sidexz_data);
acPackCommData(device, sideyz_b0s, &sideyz_data);
#endif // MPI_COMPUTE_ENABLED
#if MPI_COMM_ENABLED
acTransferCommData(device, sidexy_b0s, &sidexy_data);
acTransferCommData(device, sidexz_b0s, &sidexz_data);
acTransferCommData(device, sideyz_b0s, &sideyz_data);
#endif // MPI_COMM_ENABLED
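// While the side transfers are in flight, integrate the inner domain: its
// stencils read only the local computational domain, no ghost zones.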
#if MPI_COMPUTE_ENABLED
//////////// INNER INTEGRATION //////////////
{
const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
const int3 m2 = nn;
acDeviceIntegrateSubstep(device, STREAM_16, isubstep, m1, m2, dt);
}
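// Pack the edge blocks while the inner-integration kernel runs on its own stream.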
acPackCommData(device, edgex_b0s, &edgex_data);
acPackCommData(device, edgey_b0s, &edgey_data);
acPackCommData(device, edgez_b0s, &edgez_data);
#endif // MPI_COMPUTE_ENABLED
#if MPI_COMM_ENABLED
acTransferCommDataWait(sidexy_data);
acUnpinCommData(device, &sidexy_data);
acTransferCommDataWait(sidexz_data);
acUnpinCommData(device, &sidexz_data);
acTransferCommDataWait(sideyz_data);
acUnpinCommData(device, &sideyz_data);
acTransferCommData(device, edgex_b0s, &edgex_data);
acTransferCommData(device, edgey_b0s, &edgey_data);
acTransferCommData(device, edgez_b0s, &edgez_data);
#endif // MPI_COMM_ENABLED
#if MPI_COMPUTE_ENABLED
#if MPI_INCL_CORNERS
acPackCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners
#endif // MPI_INCL_CORNERS
acUnpackCommData(device, sidexy_b0s, &sidexy_data);
acUnpackCommData(device, sidexz_b0s, &sidexz_data);
acUnpackCommData(device, sideyz_b0s, &sideyz_data);
#endif // MPI_COMPUTE_ENABLED
#if MPI_COMM_ENABLED
acTransferCommDataWait(edgex_data);
acUnpinCommData(device, &edgex_data);
acTransferCommDataWait(edgey_data);
acUnpinCommData(device, &edgey_data);
acTransferCommDataWait(edgez_data);
acUnpinCommData(device, &edgez_data);
#if MPI_INCL_CORNERS
acTransferCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners
#endif // MPI_INCL_CORNERS
#endif // MPI_COMM_ENABLED
#if MPI_COMPUTE_ENABLED
acUnpackCommData(device, edgex_b0s, &edgex_data);
acUnpackCommData(device, edgey_b0s, &edgey_data);
acUnpackCommData(device, edgez_b0s, &edgez_data);
#endif // MPI_COMPUTE_ENABLED
#if MPI_COMM_ENABLED
#if MPI_INCL_CORNERS
acTransferCommDataWait(corner_data); // Do not rm: required for corners
acUnpinCommData(device, &corner_data); // Do not rm: required for corners
#endif // MPI_INCL_CORNERS
#endif // MPI_COMM_ENABLED
#if MPI_COMPUTE_ENABLED
#if MPI_INCL_CORNERS
acUnpackCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners
#endif // MPI_INCL_CORNERS
#endif // MPI_COMPUTE_ENABLED
// Wait for unpacking
acSyncCommData(sidexy_data);
acSyncCommData(sidexz_data);
acSyncCommData(sideyz_data);
acSyncCommData(edgex_data);
acSyncCommData(edgey_data);
acSyncCommData(edgez_data);
#if MPI_INCL_CORNERS
acSyncCommData(corner_data); // Do not rm: required for corners
#endif // MPI_INCL_CORNERS
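// All halo blocks have arrived and been unpacked; integrate the outer shell
// as six non-overlapping slabs, each on its own stream.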
#if MPI_COMPUTE_ENABLED
{ // Front
const int3 m1 = (int3){NGHOST, NGHOST, NGHOST};
const int3 m2 = m1 + (int3){nn.x, nn.y, NGHOST};
acDeviceIntegrateSubstep(device, STREAM_0, isubstep, m1, m2, dt);
}
{ // Back
const int3 m1 = (int3){NGHOST, NGHOST, nn.z};
const int3 m2 = m1 + (int3){nn.x, nn.y, NGHOST};
acDeviceIntegrateSubstep(device, STREAM_1, isubstep, m1, m2, dt);
}
{ // Bottom
const int3 m1 = (int3){NGHOST, NGHOST, 2 * NGHOST};
const int3 m2 = m1 + (int3){nn.x, NGHOST, nn.z - 2 * NGHOST};
acDeviceIntegrateSubstep(device, STREAM_2, isubstep, m1, m2, dt);
}
{ // Top
const int3 m1 = (int3){NGHOST, nn.y, 2 * NGHOST};
const int3 m2 = m1 + (int3){nn.x, NGHOST, nn.z - 2 * NGHOST};
acDeviceIntegrateSubstep(device, STREAM_3, isubstep, m1, m2, dt);
}
{ // Left
const int3 m1 = (int3){NGHOST, 2 * NGHOST, 2 * NGHOST};
const int3 m2 = m1 + (int3){NGHOST, nn.y - 2 * NGHOST, nn.z - 2 * NGHOST};
acDeviceIntegrateSubstep(device, STREAM_4, isubstep, m1, m2, dt);
}
{ // Right
const int3 m1 = (int3){nn.x, 2 * NGHOST, 2 * NGHOST};
const int3 m2 = m1 + (int3){NGHOST, nn.y - 2 * NGHOST, nn.z - 2 * NGHOST};
acDeviceIntegrateSubstep(device, STREAM_5, isubstep, m1, m2, dt);
}
#endif // MPI_COMPUTE_ENABLED
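// Swap the double buffers so the result of this substep becomes the input of the next.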
acDeviceSwapBuffers(device);
}
return AC_SUCCESS;
}
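// Usage sketch (hypothetical driver loop; num_steps, host_mesh and the grid
// initialization are placeholders/assumptions outside this diff):
//
//   const AcReal dt = (AcReal)1e-3;              // example timestep
//   for (int step = 0; step < num_steps; ++step)
//       acGridIntegrate(STREAM_DEFAULT, dt);     // one full step (three substeps)
//   acGridSynchronizeStream(STREAM_ALL);
//   acGridStoreMesh(STREAM_DEFAULT, &host_mesh); // copy the result back to the host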
AcResult
acGridIntegrateORIGINAL(const Stream stream, const AcReal dt)
{
ERRCHK(grid.initialized);
// acGridSynchronizeStream(stream);