MPI distribute and gather were incorrect; fixed. Now tested to work with 1, 2, and 4 GPUs.
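The intended call pattern, as exercised in acDeviceRunMPITest() further down in this diff, is roughly the following (an editor's sketch, not new API; variable names follow the test code):

    acDeviceDistributeMeshMPI(model, decomposition, &submesh); // full mesh on rank 0 -> per-rank submesh
    // ... load the submesh to the device, integrate, exchange halos, store it back ...
    acDeviceGatherMeshMPI(submesh, decomposition, &candidate); // per-rank submeshes -> full mesh on rank 0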
@@ -1096,11 +1096,322 @@ getPid(const int3 pid, const int3 decomposition)
           mod(pid.z, decomposition.z) * decomposition.x * decomposition.y;
}
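// Editor's note (sketch, not part of this commit): the functions below rely on
// getPid3D(), which is not visible in this hunk. Assuming it is the inverse of
// getPid() above under the same row-major rank layout, it would look roughly like:
//
//     static int3
//     getPid3D(const int pid, const int3 decomposition)
//     {
//         return (int3){
//             pid % decomposition.x,
//             (pid / decomposition.x) % decomposition.y,
//             pid / (decomposition.x * decomposition.y),
//         };
//     }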

static AcResult
acDeviceDistributeMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* dst)
{
    MPI_Barrier(MPI_COMM_WORLD);
    printf("Distributing mesh...\n");
    fflush(stdout);
    assert(dst);

    MPI_Datatype datatype = MPI_FLOAT;
    if (sizeof(AcReal) == 8)
        datatype = MPI_DOUBLE;

    int pid, nprocs;
    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    // Submesh nn
    const int3 nn = (int3){
        dst->info.int_params[AC_nx],
        dst->info.int_params[AC_ny],
        dst->info.int_params[AC_nz],
    };

    // Submesh mm
    const int3 mm = (int3){
        dst->info.int_params[AC_mx],
        dst->info.int_params[AC_my],
        dst->info.int_params[AC_mz],
    };

    // Send to self
    if (pid == 0) {
        for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {
            // For pencils
            for (int k = NGHOST; k < NGHOST + nn.z; ++k) {
                for (int j = NGHOST; j < NGHOST + nn.y; ++j) {
                    const int i = NGHOST;
                    const int count = nn.x;
                    const int src_idx = acVertexBufferIdx(i, j, k, src.info);
                    const int dst_idx = acVertexBufferIdx(i, j, k, dst->info);
                    memcpy(&dst->vertex_buffer[vtxbuf][dst_idx], //
                           &src.vertex_buffer[vtxbuf][src_idx], //
                           count * sizeof(src.vertex_buffer[i][0]));
                }
            }
        }
    }

    for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {
        // For pencils
        for (int k = NGHOST; k < NGHOST + nn.z; ++k) {
            for (int j = NGHOST; j < NGHOST + nn.y; ++j) {
                const int i = NGHOST;
                const int count = nn.x;

                if (pid != 0) {
                    const int dst_idx = acVertexBufferIdx(i, j, k, dst->info);
                    // Recv
                    MPI_Status status;
                    MPI_Recv(&dst->vertex_buffer[vtxbuf][dst_idx], count, datatype, 0, 0,
                             MPI_COMM_WORLD, &status);
                }
                else {
                    for (int tgt_pid = 1; tgt_pid < nprocs; ++tgt_pid) {
                        const int3 tgt_pid3d = getPid3D(tgt_pid, decomposition);
                        const int src_idx = acVertexBufferIdx(i + tgt_pid3d.x * nn.x, //
                                                              j + tgt_pid3d.y * nn.y, //
                                                              k + tgt_pid3d.z * nn.z, //
                                                              src.info);

                        // Send
                        MPI_Send(&src.vertex_buffer[vtxbuf][src_idx], count, datatype, tgt_pid, 0,
                                 MPI_COMM_WORLD);
                    }
                }
            }
        }
    }
    return AC_SUCCESS;
}
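// Editor's note (not part of this commit): the index arithmetic above assumes a
// uniform decomposition, i.e. rank 0 holds the full mesh and rank tgt_pid's block
// of size nn starts at (tgt_pid3d.x * nn.x, tgt_pid3d.y * nn.y, tgt_pid3d.z * nn.z)
// within it. Each MPI message carries one contiguous x-pencil of nn.x reals at a
// fixed (j, k); ghost zones (mm vs. nn) are not transferred here.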

static AcResult
acDeviceGatherMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* dst)
{
    MPI_Barrier(MPI_COMM_WORLD);
    printf("Gathering mesh...\n");
    fflush(stdout);
    assert(dst);

    MPI_Datatype datatype = MPI_FLOAT;
    if (sizeof(AcReal) == 8)
        datatype = MPI_DOUBLE;

    int pid, nprocs;
    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    // Submesh nn
    const int3 nn = (int3){
        src.info.int_params[AC_nx],
        src.info.int_params[AC_ny],
        src.info.int_params[AC_nz],
    };

    // Submesh mm
    const int3 mm = (int3){
        src.info.int_params[AC_mx],
        src.info.int_params[AC_my],
        src.info.int_params[AC_mz],
    };

    // Send to self
    if (pid == 0) {
        for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {
            // For pencils
            for (int k = NGHOST; k < NGHOST + nn.z; ++k) {
                for (int j = NGHOST; j < NGHOST + nn.y; ++j) {
                    const int i = NGHOST;
                    const int count = nn.x;
                    const int src_idx = acVertexBufferIdx(i, j, k, src.info);
                    const int dst_idx = acVertexBufferIdx(i, j, k, dst->info);
                    memcpy(&dst->vertex_buffer[vtxbuf][dst_idx], //
                           &src.vertex_buffer[vtxbuf][src_idx], //
                           count * sizeof(src.vertex_buffer[i][0]));
                }
            }
        }
    }

    for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {
        // For pencils
        for (int k = NGHOST; k < NGHOST + nn.z; ++k) {
            for (int j = NGHOST; j < NGHOST + nn.y; ++j) {
                const int i = NGHOST;
                const int count = nn.x;

                if (pid != 0) {
                    // Send
                    const int src_idx = acVertexBufferIdx(i, j, k, src.info);
                    MPI_Send(&src.vertex_buffer[vtxbuf][src_idx], count, datatype, 0, 0,
                             MPI_COMM_WORLD);
                }
                else {
                    for (int tgt_pid = 1; tgt_pid < nprocs; ++tgt_pid) {
                        const int3 tgt_pid3d = getPid3D(tgt_pid, decomposition);
                        const int dst_idx = acVertexBufferIdx(i + tgt_pid3d.x * nn.x, //
                                                              j + tgt_pid3d.y * nn.y, //
                                                              k + tgt_pid3d.z * nn.z, //
                                                              dst->info);

                        // Recv
                        MPI_Status status;
                        MPI_Recv(&dst->vertex_buffer[vtxbuf][dst_idx], count, datatype, tgt_pid, 0,
                                 MPI_COMM_WORLD, &status);
                    }
                }
            }
        }
    }
    return AC_SUCCESS;
}
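// Editor's sketch (hypothetical, not part of this commit): a minimal round-trip
// check of the two routines above, in the spirit of the "tested with 1, 2, and 4
// GPUs" claim. It assumes `model` holds the full mesh on rank 0 and that `submesh`
// and `candidate` are allocated with the per-rank and full-mesh configurations,
// as in acDeviceRunMPITest() further below. The function name is illustrative.
static int
acDistributeGatherRoundTripOK(const AcMesh model, const int3 decomposition,
                              AcMesh* submesh, AcMesh* candidate)
{
    int pid;
    MPI_Comm_rank(MPI_COMM_WORLD, &pid);

    acDeviceDistributeMeshMPI(model, decomposition, submesh);
    acDeviceGatherMeshMPI(*submesh, decomposition, candidate);

    int ok = 1;
    if (pid == 0) {
        // Pure copy round trip: the gathered computational domain must match exactly.
        const int3 nn = (int3){
            model.info.int_params[AC_nx],
            model.info.int_params[AC_ny],
            model.info.int_params[AC_nz],
        };
        for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf)
            for (int k = NGHOST; k < NGHOST + nn.z; ++k)
                for (int j = NGHOST; j < NGHOST + nn.y; ++j)
                    for (int i = NGHOST; i < NGHOST + nn.x; ++i) {
                        const int idx = acVertexBufferIdx(i, j, k, model.info);
                        if (model.vertex_buffer[vtxbuf][idx] !=
                            candidate->vertex_buffer[vtxbuf][idx])
                            ok = 0;
                    }
    }
    return ok;
}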

/*
// Close to correct 3D distribute/gather but deadlocks
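// (Editor's note, hedged: the hang most likely comes from the loop structure.
//  Every non-root rank runs the MPI_Recv branch on every tgt_pid iteration, so it
//  posts roughly nprocs times as many receives as rank 0 ever sends to it and then
//  blocks waiting for messages that never arrive. The rewritten version above only
//  receives the pencils addressed to this rank.)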
static AcResult
acDeviceDistributeMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* dst)
{
    MPI_Barrier(MPI_COMM_WORLD);
    printf("Distributing mesh...\n");
    assert(dst);

    MPI_Datatype datatype = MPI_FLOAT;
    if (sizeof(AcReal) == 8)
        datatype = MPI_DOUBLE;

    int pid, nprocs;
    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    // Submesh nn
    const int3 nn = (int3){
        dst->info.int_params[AC_nx],
        dst->info.int_params[AC_ny],
        dst->info.int_params[AC_nz],
    };

    // Submesh mm
    const int3 mm = (int3){
        dst->info.int_params[AC_mx],
        dst->info.int_params[AC_my],
        dst->info.int_params[AC_mz],
    };

    for (int tgt_pid = 0; tgt_pid < nprocs; ++tgt_pid) {
        const int3 tgt_pid3d = getPid3D(tgt_pid, decomposition);
        for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {

            // For pencils
            for (int k = NGHOST; k < NGHOST + nn.z; ++k) {
                for (int j = NGHOST; j < NGHOST + nn.y; ++j) {
                    const int i = NGHOST;
                    const int count = nn.x;
                    const int src_idx = acVertexBufferIdx(i + tgt_pid3d.x * nn.x, //
                                                          j + tgt_pid3d.y * nn.y, //
                                                          k + tgt_pid3d.z * nn.z, //
                                                          src.info);
                    const int dst_idx = acVertexBufferIdx(i, j, k, dst->info);

                    if (pid == 0) {
                        // Send
                        if (pid == tgt_pid) {
                            // Send to self
                            memcpy(&dst->vertex_buffer[vtxbuf][dst_idx], //
                                   &src.vertex_buffer[vtxbuf][src_idx], //
                                   count * sizeof(src.vertex_buffer[i][0]));
                        }
                        else {
                            // Send to others
                            MPI_Send(&src.vertex_buffer[vtxbuf][src_idx], count, datatype, tgt_pid,
                                     vtxbuf + (j + k * (NGHOST + nn.y)) * NUM_VTXBUF_HANDLES,
                                     MPI_COMM_WORLD);
                        }
                    }
                    else {
                        // Recv
                        MPI_Status status;
                        MPI_Recv(&dst->vertex_buffer[vtxbuf][dst_idx], count, datatype, 0,
                                 vtxbuf + (j + k * (NGHOST + nn.y)) * NUM_VTXBUF_HANDLES,
                                 MPI_COMM_WORLD, &status);
                    }
                }
            }
        }
    }
    return AC_SUCCESS;
}

static AcResult
acDeviceGatherMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* dst)
{
    MPI_Barrier(MPI_COMM_WORLD);
    printf("Distributing mesh...\n");
    assert(dst);

    MPI_Datatype datatype = MPI_FLOAT;
    if (sizeof(AcReal) == 8)
        datatype = MPI_DOUBLE;

    int pid, nprocs;
    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    // Submesh nn
    const int3 nn = (int3){
        dst->info.int_params[AC_nx],
        dst->info.int_params[AC_ny],
        dst->info.int_params[AC_nz],
    };

    // Submesh mm
    const int3 mm = (int3){
        dst->info.int_params[AC_mx],
        dst->info.int_params[AC_my],
        dst->info.int_params[AC_mz],
    };

    for (int tgt_pid = 0; tgt_pid < nprocs; ++tgt_pid) {
        const int3 tgt_pid3d = getPid3D(tgt_pid, decomposition);
        for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {

            // For pencils
            for (int k = NGHOST; k < NGHOST + nn.z; ++k) {
                for (int j = NGHOST; j < NGHOST + nn.y; ++j) {
                    const int i = NGHOST;
                    const int count = nn.x;
                    const int src_idx = acVertexBufferIdx(i, j, k, src.info);
                    const int dst_idx = acVertexBufferIdx(i + tgt_pid3d.x * nn.x, //
                                                          j + tgt_pid3d.y * nn.y, //
                                                          k + tgt_pid3d.z * nn.z, //
                                                          dst->info);

                    if (pid == 0) {
                        // Send
                        if (pid == tgt_pid) {
                            // Send to self
                            memcpy(&dst->vertex_buffer[vtxbuf][dst_idx], //
                                   &src.vertex_buffer[vtxbuf][src_idx], //
                                   count * sizeof(src.vertex_buffer[i][0]));
                        }
                        else {
                            // Recv from others
                            MPI_Status status;
                            MPI_Recv(&dst->vertex_buffer[vtxbuf][dst_idx], count, datatype, tgt_pid,
                                     vtxbuf, MPI_COMM_WORLD, &status);
                        }
                    }
                    else {
                        // Send to 0
                        MPI_Send(&src.vertex_buffer[vtxbuf][src_idx], count, datatype, 0, vtxbuf,
                                 MPI_COMM_WORLD);
                    }
                }
            }
        }
    }
    return AC_SUCCESS;
}
*/

/*
static void
acDeviceDistributeMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* dst)
{
    MPI_Barrier(MPI_COMM_WORLD);
    printf("Distributing mesh...\n");
    assert(dst);

    MPI_Datatype datatype = MPI_FLOAT;
    if (sizeof(AcReal) == 8)
@@ -1110,33 +1421,31 @@ acDeviceDistributeMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* ds
    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
    MPI_Comm_size(MPI_COMM_WORLD, &num_processes);

    const size_t count = acVertexBufferSize(dst->info);
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {

        if (pid == 0) {
            // Communicate to self
            assert(dst);
            memcpy(&dst->vertex_buffer[i][0], //
                   &src.vertex_buffer[i][0], //
                   count * sizeof(src.vertex_buffer[i][0]));

            // Communicate to others
            for (int j = 1; j < num_processes; ++j) {
                const int3 target_pid = getPid3D(j, decomposition);
                // const size_t count = acVertexBufferSize(dst->info);
    const int3 offset = (int3){
        src.info.int_params[AC_nx] / decomposition.x,
        src.info.int_params[AC_ny] / decomposition.y,
        src.info.int_params[AC_nz] / decomposition.z,
    };
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {

        if (pid == 0) {
            // Communicate to self
            memcpy(&dst->vertex_buffer[i][0], //
                   &src.vertex_buffer[i][0], //
                   count * sizeof(src.vertex_buffer[i][0]));

            // Communicate to others
            for (int tgt = 1; tgt < num_processes; ++tgt) {
                const int3 target_pid = getPid3D(tgt, decomposition);
                const size_t src_idx = acVertexBufferIdx(target_pid.x * offset.x,
                                                         target_pid.y * offset.y,
                                                         target_pid.z * offset.z, src.info);

                MPI_Send(&src.vertex_buffer[i][src_idx], count, datatype, j, 0, MPI_COMM_WORLD);
                MPI_Send(&src.vertex_buffer[i][src_idx], count, datatype, tgt, 0, MPI_COMM_WORLD);
            }
        }
        else {
            assert(dst);

            // Recv
            const size_t dst_idx = 0;
@@ -1147,6 +1456,7 @@ acDeviceDistributeMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* ds
        }
    }


static void
acDeviceGatherMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* dst)
{
@@ -1199,6 +1509,7 @@ acDeviceGatherMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* dst)
        }
    }
}
*/

/*
static AcResult
@@ -2210,7 +2521,9 @@ acDeviceRunMPITest(void)
    acMeshRandomize(&submesh);

#if BENCH_STRONG_SCALING
    MPI_Barrier(MPI_COMM_WORLD);
    acDeviceDistributeMeshMPI(model, decomposition, &submesh);
    MPI_Barrier(MPI_COMM_WORLD);
#endif

    // Init the GPU
@@ -2254,7 +2567,7 @@ acDeviceRunMPITest(void)
    acDeviceCommunicateHalosMPI(device);
    acDeviceSynchronizeStream(device, STREAM_ALL);

    acDeviceStoreMesh(device, STREAM_DEFAULT, &submesh);
    // acDeviceStoreMesh(device, STREAM_DEFAULT, &submesh); // TODO re-enable
    acDeviceGatherMeshMPI(submesh, decomposition, &candidate);
    if (pid == 0) {
        // acModelIntegrateStep(model, FLT_EPSILON); // TODO