WIP further MPI optimizations
@@ -1171,6 +1171,8 @@ acDeviceIntegrateMPI(const Device device, const AcReal dt)
    timer_reset(&ttot);
    MPI_Barrier(MPI_COMM_WORLD);

    const int num_iterations = 1;
    for (int i = 0; i < num_iterations; ++i) {
        for (int isubstep = 0; isubstep < 3; ++isubstep) {
            acPackCommData(device, corner_a0s, &corner_data);
            acPackCommData(device, edgex_a0s, &edgex_data);
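The implementation of acPackCommData is outside this diff. Conceptually, packing gathers a strided halo region into one contiguous buffer so it can be sent as a single MPI message. A minimal host-side sketch of that idea, with pack_block, mm, a0, and dims as illustrative names rather than the committed API:

// Illustrative only: gather the halo block of extent `dims` starting at
// offset `a0` from a field of extent `mm` into a contiguous buffer.
static void
pack_block(const AcReal* vtxbuf, const int3 mm, const int3 a0, const int3 dims,
           AcReal* packed)
{
    for (int k = 0; k < dims.z; ++k)
        for (int j = 0; j < dims.y; ++j)
            for (int i = 0; i < dims.x; ++i) {
                const size_t src = (size_t)(a0.x + i) +
                                   (size_t)(a0.y + j) * mm.x +
                                   (size_t)(a0.z + k) * mm.x * mm.y;
                const size_t dst = (size_t)i + (size_t)j * dims.x +
                                   (size_t)k * dims.x * dims.y;
                packed[dst] = vtxbuf[src];
            }
}

In the library itself this runs as a CUDA kernel per vertex buffer; the loop above only shows the indexing.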
@@ -1267,6 +1269,7 @@ acDeviceIntegrateMPI(const Device device, const AcReal dt)
            acDeviceSynchronizeStream(device, STREAM_ALL); // Wait until inner and outer done
            ////////////////////////////////////////////
        }
    }

    cudaDeviceSynchronize();
    MPI_Barrier(MPI_COMM_WORLD);
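The cudaDeviceSynchronize/MPI_Barrier pair first drains all local GPU streams and then aligns every rank, so the timed interval reflects the slowest rank rather than the local one. The actual timer read falls outside this hunk; its likely shape, assuming a timer_diff_nsec-style helper paired with timer_reset (an assumed name, not shown in the diff):

    // Assumed continuation: convert the high-resolution timer to milliseconds.
    const double msec = timer_diff_nsec(ttot) / 1e6;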
@@ -1277,7 +1280,8 @@ acDeviceIntegrateMPI(const Device device, const AcReal dt)
    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    if (!pid) {
        printf("--- Total communication time per step: %f ms\n", msec);
        printf("--- Total communication time per step w/ integration: %f ms\n",
               msec / num_iterations);

        // Write out to file
        FILE* fp = fopen("benchmark.result", "a+");
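The "a+" mode appends, so successive benchmark runs accumulate in benchmark.result instead of overwriting it. The write itself is truncated from the hunk; a plausible continuation (the exact fields written are an assumption):

        if (fp) {
            // Hypothetical record: rank count and per-step time.
            fprintf(fp, "%d, %f\n", nprocs, msec / num_iterations);
            fclose(fp);
        }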
@@ -1434,7 +1438,6 @@ acDeviceCommunicateHalosMPI(const Device device)
    timer_reset(&ttot);
    MPI_Barrier(MPI_COMM_WORLD);

    for (int isubstep = 0; isubstep < 3; ++isubstep) {
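        // Pack each halo region class (the corners, the x/y/z edge bundles,
        // and the xy/xz/yz side slabs) into its own contiguous buffer
        // before exchanging it.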
        acPackCommData(device, corner_a0s, &corner_data);
        acPackCommData(device, edgex_a0s, &edgex_data);
        acPackCommData(device, edgey_a0s, &edgey_data);
@@ -1486,7 +1489,6 @@ acDeviceCommunicateHalosMPI(const Device device)
        acUnpackCommData(device, sidexy_b0s, &sidexy_data);
        acUnpackCommData(device, sidexz_b0s, &sidexz_data);
        acUnpackCommData(device, sideyz_b0s, &sideyz_data);
    }

    cudaDeviceSynchronize();
    MPI_Barrier(MPI_COMM_WORLD);
@@ -1497,7 +1499,7 @@ acDeviceCommunicateHalosMPI(const Device device)
    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    if (!pid) {
        printf("--- Total communication time per step: %f ms\n", msec);
        printf("--- Total communication time per substep (comm): %f ms\n", msec);

        // Write out to file
        FILE* fp = fopen("benchmark.result", "a+");
@@ -1517,6 +1519,13 @@ acDeviceCommunicateHalosMPI(const Device device)
    return AC_SUCCESS;
}

/*
static int3
findOptimalDecomposition(const int3 nn)
{
    int3 decomposition = (int3){1, 1, 1};
}*/
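The commented-out stub above marks where an automatic domain decomposition would go. One standard approach, sketched here purely as an illustration (not the committed algorithm), is to enumerate the factorizations of the process count and keep the one that minimizes the per-rank halo surface:

static int3
findOptimalDecomposition(const int3 nn, const int nprocs)
{
    // Fallback: a 1D slab decomposition along z.
    int3 best      = (int3){1, 1, nprocs};
    long best_area = -1;

    // Enumerate px * py * pz == nprocs; prefer the factorization with the
    // smallest per-rank subdomain surface (least halo traffic).
    for (int px = 1; px <= nprocs; ++px) {
        if (nprocs % px)
            continue;
        for (int py = 1; py <= nprocs / px; ++py) {
            if ((nprocs / px) % py)
                continue;
            const int pz = nprocs / (px * py);
            if (nn.x % px || nn.y % py || nn.z % pz)
                continue;

            const long lx = nn.x / px, ly = nn.y / py, lz = nn.z / pz;
            const long area = lx * ly + lx * lz + ly * lz;
            if (best_area < 0 || area < best_area) {
                best_area = area;
                best      = (int3){px, py, pz};
            }
        }
    }
    return best;
}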
AcResult
acDeviceRunMPITest(void)
{
@@ -1632,7 +1641,6 @@ acDeviceRunMPITest(void)

    // VERIFY ////////////////////////////////////////////////////
    if (pid == 0) {
        // acMeshApplyPeriodicBounds(&model);
        acModelIntegrateStep(model, FLT_EPSILON);
        acMeshApplyPeriodicBounds(&model);
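The comparison against this CPU reference is outside the hunk. A minimal sketch of the idea, assuming the device result has been gathered into a mesh named candidate, that the fields follow the usual AcMesh layout, and that a loose FLT_EPSILON-based tolerance suffices (all assumptions):

        // Hypothetical element-wise verification against the CPU model result.
        int ok = 1;
        for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w) {
            for (size_t i = 0; i < acVertexBufferSize(model.info); ++i) {
                if (fabs((double)(model.vertex_buffer[w][i] -
                                  candidate.vertex_buffer[w][i])) > 10 * FLT_EPSILON)
                    ok = 0;
            }
        }
        printf("Verification %s\n", ok ? "OK" : "FAILED");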
@@ -1657,3 +1665,41 @@ acDeviceRunMPITest(void)
    return AC_FAILURE;
}
#endif // AC_MPI_ENABLED

/*
struct grid_s {
    Device device;
};

typedef struct grid_s* Grid;

AcResult
acGridInit(void)
{
    MPI_Init(NULL, NULL);

    int nprocs, pid;
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &pid);

    char processor_name[MPI_MAX_PROCESSOR_NAME];
    int name_len;
    MPI_Get_processor_name(processor_name, &name_len);
    printf("Processor %s. Process %d of %d.\n", processor_name, pid, nprocs);

    return AC_SUCCESS;
}
AcResult
acGridLoad(const AcMesh mesh, Grid* grid)
{
}

AcResult
acGridStore(const Grid grid, AcMesh* mesh)
{
}

AcResult
acGridQuit(Grid grid)
{
}
*/
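The commented block sketches a future rank-global Grid API on top of the per-device calls. A possible call sequence once these entry points are implemented (purely illustrative; none of them exist in this commit):

// Hypothetical usage of the proposed Grid API.
acGridInit();                 // wraps MPI_Init and per-rank device setup

Grid grid;
acGridLoad(mesh, &grid);      // distribute the host mesh across ranks
// ... run integration / halo exchange through the grid ...
acGridStore(grid, &mesh);     // gather the distributed result back
acGridQuit(grid);             // release resources (presumably MPI_Finalize)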