Added a better-pipelined version of acGridIntegrate and a switch for toggling the transfer of corners
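
The new version overlaps halo exchange with computation: the inner domain, which needs no remote data, is integrated while the side, edge and (optionally) corner halo blocks are packed, transferred and unpacked; the outer shell is then integrated as six slabs on separate streams. Corner transfer is controlled by the new MPI_INCL_CORNERS switch (off by default). The previous implementation is kept as acGridIntegrateORIGINAL.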

jpekkila
2020-06-10 02:16:23 +03:00
parent 9840b817d0
commit fa422cf457


@@ -1373,9 +1373,220 @@ acGridStoreMesh(const Stream stream, AcMesh* host_mesh)
#define MPI_COMPUTE_ENABLED (1)
#define MPI_COMM_ENABLED (1)
#define MPI_INCL_CORNERS (0)
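// Corner transfer is off by default; set MPI_INCL_CORNERS to (1) if the
// integration stencil also requires the corner ghost zones.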
AcResult
acGridIntegrate(const Stream stream, const AcReal dt)
{
ERRCHK(grid.initialized);
acGridSynchronizeStream(stream);
const Device device = grid.device;
const int3 nn = grid.nn;
#if MPI_INCL_CORNERS
CommData corner_data = grid.corner_data; // Do not rm: required for corners
#endif // MPI_INCL_CORNERS
CommData edgex_data = grid.edgex_data;
CommData edgey_data = grid.edgey_data;
CommData edgez_data = grid.edgez_data;
CommData sidexy_data = grid.sidexy_data;
CommData sidexz_data = grid.sidexz_data;
CommData sideyz_data = grid.sideyz_data;
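// The halo is decomposed into corner, edge and side (face) blocks that are
// communicated independently so that transfers can overlap computation. Each
// *_b0s array below holds the base (lowest) index of the blocks to pack/unpack.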
// Corners
#if MPI_INCL_CORNERS
// Do not rm: required for corners
const int3 corner_b0s[] = {
(int3){0, 0, 0},
(int3){NGHOST + nn.x, 0, 0},
(int3){0, NGHOST + nn.y, 0},
(int3){0, 0, NGHOST + nn.z},
(int3){NGHOST + nn.x, NGHOST + nn.y, 0},
(int3){NGHOST + nn.x, 0, NGHOST + nn.z},
(int3){0, NGHOST + nn.y, NGHOST + nn.z},
(int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST + nn.z},
};
#endif // MPI_INCL_CORNERS
// Edges X
const int3 edgex_b0s[] = {
(int3){NGHOST, 0, 0},
(int3){NGHOST, NGHOST + nn.y, 0},
(int3){NGHOST, 0, NGHOST + nn.z},
(int3){NGHOST, NGHOST + nn.y, NGHOST + nn.z},
};
// Edges Y
const int3 edgey_b0s[] = {
(int3){0, NGHOST, 0},
(int3){NGHOST + nn.x, NGHOST, 0},
(int3){0, NGHOST, NGHOST + nn.z},
(int3){NGHOST + nn.x, NGHOST, NGHOST + nn.z},
};
// Edges Z
const int3 edgez_b0s[] = {
(int3){0, 0, NGHOST},
(int3){NGHOST + nn.x, 0, NGHOST},
(int3){0, NGHOST + nn.y, NGHOST},
(int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST},
};
// Sides XY
const int3 sidexy_b0s[] = {
(int3){NGHOST, NGHOST, 0}, //
(int3){NGHOST, NGHOST, NGHOST + nn.z}, //
};
// Sides XZ
const int3 sidexz_b0s[] = {
(int3){NGHOST, 0, NGHOST}, //
(int3){NGHOST, NGHOST + nn.y, NGHOST}, //
};
// Sides YZ
const int3 sideyz_b0s[] = {
(int3){0, NGHOST, NGHOST}, //
(int3){NGHOST + nn.x, NGHOST, NGHOST}, //
};
for (int isubstep = 0; isubstep < 3; ++isubstep) {
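// One communication/computation pass per substep. All streams are drained and
// ranks aligned before the communication buffers from the previous pass are reused.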
acDeviceSynchronizeStream(device, STREAM_ALL);
MPI_Barrier(MPI_COMM_WORLD);
#if MPI_COMPUTE_ENABLED
acPackCommData(device, sidexy_b0s, &sidexy_data);
acPackCommData(device, sidexz_b0s, &sidexz_data);
acPackCommData(device, sideyz_b0s, &sideyz_data);
#endif // MPI_COMPUTE_ENABLED
#if MPI_COMM_ENABLED
acTransferCommData(device, sidexy_b0s, &sidexy_data);
acTransferCommData(device, sidexz_b0s, &sidexz_data);
acTransferCommData(device, sideyz_b0s, &sideyz_data);
#endif // MPI_COMM_ENABLED
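// While the side transfers are in flight, integrate the inner domain: its
// stencils read only the local computational domain, no ghost zones.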
#if MPI_COMPUTE_ENABLED
//////////// INNER INTEGRATION //////////////
{
const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
const int3 m2 = nn;
acDeviceIntegrateSubstep(device, STREAM_16, isubstep, m1, m2, dt);
}
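// Pack the edge blocks while the inner-integration kernel runs on its own stream.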
acPackCommData(device, edgex_b0s, &edgex_data);
acPackCommData(device, edgey_b0s, &edgey_data);
acPackCommData(device, edgez_b0s, &edgez_data);
#endif // MPI_COMPUTE_ENABLED
#if MPI_COMM_ENABLED
acTransferCommDataWait(sidexy_data);
acUnpinCommData(device, &sidexy_data);
acTransferCommDataWait(sidexz_data);
acUnpinCommData(device, &sidexz_data);
acTransferCommDataWait(sideyz_data);
acUnpinCommData(device, &sideyz_data);
acTransferCommData(device, edgex_b0s, &edgex_data);
acTransferCommData(device, edgey_b0s, &edgey_data);
acTransferCommData(device, edgez_b0s, &edgez_data);
#endif // MPI_COMM_ENABLED
#if MPI_COMPUTE_ENABLED
#if MPI_INCL_CORNERS
acPackCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners
#endif // MPI_INCL_CORNERS
acUnpackCommData(device, sidexy_b0s, &sidexy_data);
acUnpackCommData(device, sidexz_b0s, &sidexz_data);
acUnpackCommData(device, sideyz_b0s, &sideyz_data);
#endif // MPI_COMPUTE_ENABLED
#if MPI_COMM_ENABLED
acTransferCommDataWait(edgex_data);
acUnpinCommData(device, &edgex_data);
acTransferCommDataWait(edgey_data);
acUnpinCommData(device, &edgey_data);
acTransferCommDataWait(edgez_data);
acUnpinCommData(device, &edgez_data);
#if MPI_INCL_CORNERS
acTransferCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners
#endif // MPI_INCL_CORNERS
#endif // MPI_COMM_ENABLED
#if MPI_COMPUTE_ENABLED
acUnpackCommData(device, edgex_b0s, &edgex_data);
acUnpackCommData(device, edgey_b0s, &edgey_data);
acUnpackCommData(device, edgez_b0s, &edgez_data);
#endif // MPI_COMPUTE_ENABLED
#if MPI_COMM_ENABLED
#if MPI_INCL_CORNERS
acTransferCommDataWait(corner_data); // Do not rm: required for corners
acUnpinCommData(device, &corner_data); // Do not rm: required for corners
#endif // MPI_INCL_CORNERS
#endif // MPI_COMM_ENABLED
#if MPI_COMPUTE_ENABLED
#if MPI_INCL_CORNERS
acUnpackCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners
#endif // MPI_INCL_CORNERS
#endif // MPI_COMPUTE_ENABLED
// Wait for unpacking
acSyncCommData(sidexy_data);
acSyncCommData(sidexz_data);
acSyncCommData(sideyz_data);
acSyncCommData(edgex_data);
acSyncCommData(edgey_data);
acSyncCommData(edgez_data);
#if MPI_INCL_CORNERS
acSyncCommData(corner_data); // Do not rm: required for corners
#endif // MPI_INCL_CORNERS
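// All halo blocks have arrived and been unpacked; integrate the outer shell
// as six non-overlapping slabs, each on its own stream.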
#if MPI_COMPUTE_ENABLED
{ // Front
const int3 m1 = (int3){NGHOST, NGHOST, NGHOST};
const int3 m2 = m1 + (int3){nn.x, nn.y, NGHOST};
acDeviceIntegrateSubstep(device, STREAM_0, isubstep, m1, m2, dt);
}
{ // Back
const int3 m1 = (int3){NGHOST, NGHOST, nn.z};
const int3 m2 = m1 + (int3){nn.x, nn.y, NGHOST};
acDeviceIntegrateSubstep(device, STREAM_1, isubstep, m1, m2, dt);
}
{ // Bottom
const int3 m1 = (int3){NGHOST, NGHOST, 2 * NGHOST};
const int3 m2 = m1 + (int3){nn.x, NGHOST, nn.z - 2 * NGHOST};
acDeviceIntegrateSubstep(device, STREAM_2, isubstep, m1, m2, dt);
}
{ // Top
const int3 m1 = (int3){NGHOST, nn.y, 2 * NGHOST};
const int3 m2 = m1 + (int3){nn.x, NGHOST, nn.z - 2 * NGHOST};
acDeviceIntegrateSubstep(device, STREAM_3, isubstep, m1, m2, dt);
}
{ // Left
const int3 m1 = (int3){NGHOST, 2 * NGHOST, 2 * NGHOST};
const int3 m2 = m1 + (int3){NGHOST, nn.y - 2 * NGHOST, nn.z - 2 * NGHOST};
acDeviceIntegrateSubstep(device, STREAM_4, isubstep, m1, m2, dt);
}
{ // Right
const int3 m1 = (int3){nn.x, 2 * NGHOST, 2 * NGHOST};
const int3 m2 = m1 + (int3){NGHOST, nn.y - 2 * NGHOST, nn.z - 2 * NGHOST};
acDeviceIntegrateSubstep(device, STREAM_5, isubstep, m1, m2, dt);
}
#endif // MPI_COMPUTE_ENABLED
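// Swap the double buffers so the result of this substep becomes the input of the next.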
acDeviceSwapBuffers(device);
}
return AC_SUCCESS;
}
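// Usage sketch (hypothetical driver loop; num_steps, host_mesh and the grid
// initialization are placeholders/assumptions outside this diff):
//
//   const AcReal dt = (AcReal)1e-3;              // example timestep
//   for (int step = 0; step < num_steps; ++step)
//       acGridIntegrate(STREAM_DEFAULT, dt);     // one full step (three substeps)
//   acGridSynchronizeStream(STREAM_ALL);
//   acGridStoreMesh(STREAM_DEFAULT, &host_mesh); // copy the result back to the host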
AcResult
acGridIntegrateORIGINAL(const Stream stream, const AcReal dt)
{
ERRCHK(grid.initialized);
// acGridSynchronizeStream(stream);