Added a simplified and cleaned-up 3D domain decomposition MPI implementation. Tested to work at least up to 2x2x2 nodes.

This commit is contained in:
jpekkila
2020-01-17 15:22:23 +02:00
parent 975a15f7f4
commit ff6a7155e5


@@ -581,3 +581,652 @@ acDeviceReduceVec(const Device device, const Stream stream, const ReductionType
#if PACKED_DATA_TRANSFERS // TODO DEPRECATED, see AC_MPI_ENABLED instead
// Functions for calling packed data transfers
#endif
#if AC_MPI_ENABLED == 0
AcResult
acDeviceRunMPITest(void)
{
WARNING("MPI was not enabled but acDeviceRunMPITest() was called");
return AC_FAILURE;
}
#else // MPI_ENABLED ///////////////////////////////////////////////////////////////////////////////
#include <mpi.h>
// Kernels
#include "kernels/packing.cuh"
// From Astaroth Utils
#include "src/utils/config_loader.h"
#include "src/utils/memory.h"
#include "src/utils/verification.h"
static int
mod(const int a, const int b)
{
const int r = a % b;
return r < 0 ? r + b : r;
}
static int
getPid(const int3 pid, const int3 decomposition)
{
return mod(pid.x, decomposition.x) + //
mod(pid.y, decomposition.y) * decomposition.x + //
mod(pid.z, decomposition.z) * decomposition.x * decomposition.y;
}
static int3
getPid3D(const int pid, const int3 decomposition)
{
const int3 pid3d = (int3){
mod(pid, decomposition.x),
mod(pid / decomposition.x, decomposition.y),
(pid / (decomposition.x * decomposition.y)),
};
return pid3d;
}
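// getPid() and getPid3D() are inverses of each other for any rank inside the
// decomposition. A minimal sanity-check sketch (illustrative only; the helper
// name and the 2x2x2 decomposition below are not part of the original commit):
static void
acCheckPidMappingExample(void)
{
    const int3 decomp = (int3){2, 2, 2};
    for (int pid = 0; pid < decomp.x * decomp.y * decomp.z; ++pid) {
        const int3 pid3d = getPid3D(pid, decomp);
        ERRCHK_ALWAYS(getPid(pid3d, decomp) == pid);
    }
}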
static int3
decompose(const int target)
{
int decomposition[] = {1, 1, 1};
int axis = 0;
while (decomposition[0] * decomposition[1] * decomposition[2] < target) {
++decomposition[axis];
axis = (axis + 1) % 3;
}
const int found = decomposition[0] * decomposition[1] * decomposition[2];
if (found != target) {
fprintf(stderr, "Invalid number of processes! Cannot decompose the problem domain!\n");
fprintf(stderr, "Target nprocs: %d. Next allowed: %d\n", target, found);
ERROR("Invalid nprocs");
return (int3){-1, -1, -1};
}
else {
return (int3){decomposition[0], decomposition[1], decomposition[2]};
}
}
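// Worked example of the round-robin growth above: starting from {1, 1, 1} the
// loop visits {2, 1, 1}, {2, 2, 1}, {2, 2, 2}, {3, 2, 2}, {3, 3, 2}, {3, 3, 3},
// {4, 3, 3}, ... so the accepted process counts are 1, 2, 4, 8, 12, 18, 27, 36, ...
// Any other nprocs (e.g. 6, even though 2 x 3 x 1 = 6) overshoots the target and
// falls through to the error branch.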
static PackedData
acCreatePackedData(const int3 dims)
{
PackedData data = {};
data.dims = dims;
const size_t bytes = dims.x * dims.y * dims.z * sizeof(data.data[0]) * NUM_VTXBUF_HANDLES;
ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&data.data, bytes));
return data;
}
static AcResult
acDestroyPackedData(PackedData* data)
{
data->dims = (int3){-1, -1, -1};
cudaFree(data->data);
data->data = NULL;
return AC_SUCCESS;
}
static PackedData
acCreatePackedDataHost(const int3 dims)
{
PackedData data = {};
data.dims = dims;
const size_t bytes = dims.x * dims.y * dims.z * sizeof(data.data[0]) * NUM_VTXBUF_HANDLES;
data.data = (AcReal*)malloc(bytes);
ERRCHK_ALWAYS(data.data);
return data;
}
static void
acTransferPackedDataToHost(const PackedData ddata, PackedData* hdata)
{
const size_t bytes = ddata.dims.x * ddata.dims.y * ddata.dims.z * sizeof(ddata.data[0]) *
NUM_VTXBUF_HANDLES;
ERRCHK_CUDA_ALWAYS(cudaMemcpy(hdata->data, ddata.data, bytes, cudaMemcpyDeviceToHost));
}
static void
acTransferPackedDataToDevice(const PackedData hdata, PackedData* ddata)
{
const size_t bytes = hdata.dims.x * hdata.dims.y * hdata.dims.z * sizeof(hdata.data[0]) *
NUM_VTXBUF_HANDLES;
ERRCHK_CUDA_ALWAYS(cudaMemcpy(ddata->data, hdata.data, bytes, cudaMemcpyHostToDevice));
}
static AcResult
acDestroyPackedDataHost(PackedData* data)
{
data->dims = (int3){-1, -1, -1};
free(data->data);
data->data = NULL;
return AC_SUCCESS;
}
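// The helpers above are used further below in the following pattern: pack a
// halo block on the device, stage it through a host buffer, hand it to MPI,
// and unpack on the receiving side. A sketch of the lifecycle (buffer names
// are illustrative; acKernelPackData/acKernelUnpackData come from
// kernels/packing.cuh):
//
//   PackedData d_send = acCreatePackedData(dims);      // device staging buffer
//   PackedData h_send = acCreatePackedDataHost(dims);  // host staging buffer
//   acKernelPackData(stream, device->vba, src_offset, d_send); // gather block on device
//   acTransferPackedDataToHost(d_send, &h_send);       // device -> host copy
//   // MPI_Isend(h_send.data, ...) / MPI_Irecv(h_recv.data, ...) / MPI_Wait(...)
//   acTransferPackedDataToDevice(h_recv, &d_recv);     // host -> device copy
//   acKernelUnpackData(stream, d_recv, dst_offset, device->vba); // scatter into halo
//   acDestroyPackedDataHost(&h_send);                  // plus the recv-side buffers
//   acDestroyPackedData(&d_send);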
// TODO: do with packed data
static AcResult
acDeviceDistributeMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* dst)
{
MPI_Barrier(MPI_COMM_WORLD);
printf("Distributing mesh...\n");
fflush(stdout);
MPI_Datatype datatype = MPI_FLOAT;
if (sizeof(AcReal) == 8)
datatype = MPI_DOUBLE;
int pid, nprocs;
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
ERRCHK_ALWAYS(dst);
// Submesh nn
const int3 nn = (int3){
dst->info.int_params[AC_nx],
dst->info.int_params[AC_ny],
dst->info.int_params[AC_nz],
};
// Submesh mm (currently unused in this function)
const int3 mm = (int3){
dst->info.int_params[AC_mx],
dst->info.int_params[AC_my],
dst->info.int_params[AC_mz],
};
// Send to self
if (pid == 0) {
for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {
// For pencils
for (int k = NGHOST; k < NGHOST + nn.z; ++k) {
for (int j = NGHOST; j < NGHOST + nn.y; ++j) {
const int i = NGHOST;
const int count = nn.x;
const int src_idx = acVertexBufferIdx(i, j, k, src.info);
const int dst_idx = acVertexBufferIdx(i, j, k, dst->info);
memcpy(&dst->vertex_buffer[vtxbuf][dst_idx], //
&src.vertex_buffer[vtxbuf][src_idx], //
count * sizeof(src.vertex_buffer[vtxbuf][0]));
}
}
}
}
for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {
// For pencils
for (int k = NGHOST; k < NGHOST + nn.z; ++k) {
for (int j = NGHOST; j < NGHOST + nn.y; ++j) {
const int i = NGHOST;
const int count = nn.x;
if (pid != 0) {
const int dst_idx = acVertexBufferIdx(i, j, k, dst->info);
// Recv
MPI_Status status;
MPI_Recv(&dst->vertex_buffer[vtxbuf][dst_idx], count, datatype, 0, 0,
MPI_COMM_WORLD, &status);
}
else {
for (int tgt_pid = 1; tgt_pid < nprocs; ++tgt_pid) {
const int3 tgt_pid3d = getPid3D(tgt_pid, decomposition);
const int src_idx = acVertexBufferIdx(i + tgt_pid3d.x * nn.x, //
j + tgt_pid3d.y * nn.y, //
k + tgt_pid3d.z * nn.z, //
src.info);
// Send
MPI_Send(&src.vertex_buffer[vtxbuf][src_idx], count, datatype, tgt_pid, 0,
MPI_COMM_WORLD);
}
}
}
}
}
return AC_SUCCESS;
}
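// Worked example of the index arithmetic above (hypothetical numbers, assuming
// NGHOST = 3 for illustration): with a 2x2x2 decomposition of a 128^3 global
// grid, each rank owns a 64^3 submesh. The rank at pid3d = (1, 0, 1) receives
// the x-pencil (j, k) of its submesh from the global row starting at
// (NGHOST + 1*64, j + 0*64, k + 1*64) = (67, j, k + 64) in the source mesh.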
// TODO: do with packed data
static AcResult
acDeviceGatherMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* dst)
{
MPI_Barrier(MPI_COMM_WORLD);
printf("Gathering mesh...\n");
fflush(stdout);
MPI_Datatype datatype = MPI_FLOAT;
if (sizeof(AcReal) == 8)
datatype = MPI_DOUBLE;
int pid, nprocs;
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
if (pid == 0)
ERRCHK_ALWAYS(dst);
// Submesh nn
const int3 nn = (int3){
src.info.int_params[AC_nx],
src.info.int_params[AC_ny],
src.info.int_params[AC_nz],
};
// Submesh mm
const int3 mm = (int3){
src.info.int_params[AC_mx],
src.info.int_params[AC_my],
src.info.int_params[AC_mz],
};
// Send to self
if (pid == 0) {
for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {
// For pencils
for (int k = 0; k < mm.z; ++k) {
for (int j = 0; j < mm.y; ++j) {
const int i = 0;
const int count = mm.x;
const int src_idx = acVertexBufferIdx(i, j, k, src.info);
const int dst_idx = acVertexBufferIdx(i, j, k, dst->info);
memcpy(&dst->vertex_buffer[vtxbuf][dst_idx], //
&src.vertex_buffer[vtxbuf][src_idx], //
count * sizeof(src.vertex_buffer[vtxbuf][0]));
}
}
}
}
for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {
// For pencils
for (int k = 0; k < mm.z; ++k) {
for (int j = 0; j < mm.y; ++j) {
const int i = 0;
const int count = mm.x;
if (pid != 0) {
// Send
const int src_idx = acVertexBufferIdx(i, j, k, src.info);
MPI_Send(&src.vertex_buffer[vtxbuf][src_idx], count, datatype, 0, 0,
MPI_COMM_WORLD);
}
else {
for (int tgt_pid = 1; tgt_pid < nprocs; ++tgt_pid) {
const int3 tgt_pid3d = getPid3D(tgt_pid, decomposition);
const int dst_idx = acVertexBufferIdx(i + tgt_pid3d.x * nn.x, //
j + tgt_pid3d.y * nn.y, //
k + tgt_pid3d.z * nn.z, //
dst->info);
// Recv
MPI_Status status;
MPI_Recv(&dst->vertex_buffer[vtxbuf][dst_idx], count, datatype, tgt_pid, 0,
MPI_COMM_WORLD, &status);
}
}
}
}
}
return AC_SUCCESS;
}
static AcResult
acDeviceCommunicateBlocksMPI(const Device device, //
const int3* a0s, // Src idx inside comp. domain
const int3* b0s, // Dst idx inside bound zone
const int mapping_count, // Num a0s and b0s
const int3 dims) // Block size
{
cudaSetDevice(device->id);
acDeviceSynchronizeStream(device, STREAM_ALL);
MPI_Barrier(MPI_COMM_WORLD);
MPI_Datatype datatype = MPI_FLOAT;
if (sizeof(AcReal) == 8)
datatype = MPI_DOUBLE;
int nprocs, pid;
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
const int3 decomp = decompose(nprocs);
const int3 nn = (int3){
device->local_config.int_params[AC_nx],
device->local_config.int_params[AC_ny],
device->local_config.int_params[AC_nz],
};
for (int k = -1; k <= 1; ++k) {
for (int j = -1; j <= 1; ++j) {
for (int i = -1; i <= 1; ++i) {
if (i == 0 && j == 0 && k == 0)
continue;
for (int a_idx = 0; a_idx < mapping_count; ++a_idx) {
for (int b_idx = 0; b_idx < mapping_count; ++b_idx) {
const int3 neighbor = (int3){i, j, k};
const int3 a0 = a0s[a_idx];
// const int3 a1 = a0 + dims;
const int3 b0 = a0 - neighbor * nn;
// const int3 b1 = a1 - neighbor * nn;
if (b0s[b_idx].x == b0.x && b0s[b_idx].y == b0.y && b0s[b_idx].z == b0.z) {
const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES;
PackedData src = acCreatePackedData(dims);
PackedData dst = acCreatePackedData(dims);
const cudaStream_t stream = device->streams[STREAM_DEFAULT];
acKernelPackData(stream, device->vba, a0, src);
acDeviceSynchronizeStream(device, STREAM_DEFAULT);
// Host ////////////////////////////////////////////////
PackedData src_host = acCreatePackedDataHost(dims);
PackedData dst_host = acCreatePackedDataHost(dims);
acTransferPackedDataToHost(src, &src_host);
acDeviceSynchronizeStream(device, STREAM_ALL);
MPI_Barrier(MPI_COMM_WORLD);
////////////////////////////////////////////////////////
const int3 pid3d = getPid3D(pid, decomp);
MPI_Request send_req, recv_req;
MPI_Isend(src_host.data, count, datatype,
getPid(pid3d + neighbor, decomp), b_idx, MPI_COMM_WORLD,
&send_req);
MPI_Irecv(dst_host.data, count, datatype,
getPid(pid3d - neighbor, decomp), b_idx, MPI_COMM_WORLD,
&recv_req);
MPI_Wait(&send_req, MPI_STATUS_IGNORE);
MPI_Wait(&recv_req, MPI_STATUS_IGNORE);
// Host ////////////////////////////////////////////////
acTransferPackedDataToDevice(dst_host, &dst);
acDeviceSynchronizeStream(device, STREAM_ALL);
acDestroyPackedDataHost(&src_host);
acDestroyPackedDataHost(&dst_host);
////////////////////////////////////////////////////////
acKernelUnpackData(stream, dst, b0, device->vba);
acDeviceSynchronizeStream(device, STREAM_DEFAULT);
acDestroyPackedData(&src);
acDestroyPackedData(&dst);
}
}
}
}
}
}
return AC_SUCCESS;
}
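// Worked example of the block-matching rule above: take the source corner
// a0 = (nn.x, nn.y, nn.z) and the neighbor direction (1, 1, 1). Then
// b0 = a0 - neighbor*nn = (0, 0, 0): the block at the high corner of the local
// computational domain is packed and sent to the rank at pid3d + (1, 1, 1),
// while the matching block received from pid3d - (1, 1, 1) is unpacked into
// the local low ghost corner at (0, 0, 0).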
static AcResult
acDeviceCommunicateHalosMPI(const Device device)
{
const int3 nn = (int3){
device->local_config.int_params[AC_nx],
device->local_config.int_params[AC_ny],
device->local_config.int_params[AC_nz],
};
{ // Corners
const int3 a0s[] = {
(int3){NGHOST, NGHOST, NGHOST}, //
(int3){nn.x, NGHOST, NGHOST}, //
(int3){NGHOST, nn.y, NGHOST}, //
(int3){nn.x, nn.y, NGHOST}, //
(int3){NGHOST, NGHOST, nn.z}, //
(int3){nn.x, NGHOST, nn.z}, //
(int3){NGHOST, nn.y, nn.z}, //
(int3){nn.x, nn.y, nn.z},
};
const int3 b0s[] = {
(int3){0, 0, 0},
(int3){NGHOST + nn.x, 0, 0},
(int3){0, NGHOST + nn.y, 0},
(int3){NGHOST + nn.x, NGHOST + nn.y, 0},
(int3){0, 0, NGHOST + nn.z},
(int3){NGHOST + nn.x, 0, NGHOST + nn.z},
(int3){0, NGHOST + nn.y, NGHOST + nn.z},
(int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST + nn.z},
};
const int3 dims = (int3){NGHOST, NGHOST, NGHOST};
const int mapping_count = sizeof(a0s) / sizeof(a0s[0]);
acDeviceCommunicateBlocksMPI(device, a0s, b0s, mapping_count, dims);
}
{ // Edges X
const int3 a0s[] = {
(int3){NGHOST, NGHOST, NGHOST}, //
(int3){NGHOST, nn.y, NGHOST}, //
(int3){NGHOST, NGHOST, nn.z}, //
(int3){NGHOST, nn.y, nn.z}, //
};
const int3 b0s[] = {
(int3){NGHOST, 0, 0},
(int3){NGHOST, NGHOST + nn.y, 0},
(int3){NGHOST, 0, NGHOST + nn.z},
(int3){NGHOST, NGHOST + nn.y, NGHOST + nn.z},
};
const int3 dims = (int3){nn.x, NGHOST, NGHOST};
const int mapping_count = sizeof(a0s) / sizeof(a0s[0]);
acDeviceCommunicateBlocksMPI(device, a0s, b0s, mapping_count, dims);
}
{ // Edges Y
const int3 a0s[] = {
(int3){NGHOST, NGHOST, NGHOST}, //
(int3){nn.x, NGHOST, NGHOST}, //
(int3){NGHOST, NGHOST, nn.z}, //
(int3){nn.x, NGHOST, nn.z}, //
};
const int3 b0s[] = {
(int3){0, NGHOST, 0},
(int3){NGHOST + nn.x, NGHOST, 0},
(int3){0, NGHOST, NGHOST + nn.z},
(int3){NGHOST + nn.x, NGHOST, NGHOST + nn.z},
};
const int3 dims = (int3){NGHOST, nn.y, NGHOST};
const int mapping_count = sizeof(a0s) / sizeof(a0s[0]);
acDeviceCommunicateBlocksMPI(device, a0s, b0s, mapping_count, dims);
}
{ // Edges Z
const int3 a0s[] = {
(int3){NGHOST, NGHOST, NGHOST}, //
(int3){nn.x, NGHOST, NGHOST}, //
(int3){NGHOST, nn.y, NGHOST}, //
(int3){nn.x, nn.y, NGHOST}, //
};
const int3 b0s[] = {
(int3){0, 0, NGHOST},
(int3){NGHOST + nn.x, 0, NGHOST},
(int3){0, NGHOST + nn.y, NGHOST},
(int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST},
};
const int3 dims = (int3){NGHOST, NGHOST, nn.z};
const int mapping_count = sizeof(a0s) / sizeof(a0s[0]);
acDeviceCommunicateBlocksMPI(device, a0s, b0s, mapping_count, dims);
}
{ // Sides XY
const int3 a0s[] = {
(int3){NGHOST, NGHOST, NGHOST}, //
(int3){NGHOST, NGHOST, nn.z}, //
};
const int3 b0s[] = {
(int3){NGHOST, NGHOST, 0}, //
(int3){NGHOST, NGHOST, NGHOST + nn.z}, //
};
const int3 dims = (int3){nn.x, nn.y, NGHOST};
const int mapping_count = sizeof(a0s) / sizeof(a0s[0]);
acDeviceCommunicateBlocksMPI(device, a0s, b0s, mapping_count, dims);
}
{ // Sides XZ
const int3 a0s[] = {
(int3){NGHOST, NGHOST, NGHOST}, //
(int3){NGHOST, nn.y, NGHOST}, //
};
const int3 b0s[] = {
(int3){NGHOST, 0, NGHOST}, //
(int3){NGHOST, NGHOST + nn.y, NGHOST}, //
};
const int3 dims = (int3){nn.x, NGHOST, nn.z};
const int mapping_count = sizeof(a0s) / sizeof(a0s[0]);
acDeviceCommunicateBlocksMPI(device, a0s, b0s, mapping_count, dims);
}
{ // Sides YZ
const int3 a0s[] = {
(int3){NGHOST, NGHOST, NGHOST}, //
(int3){nn.x, NGHOST, NGHOST}, //
};
const int3 b0s[] = {
(int3){0, NGHOST, NGHOST}, //
(int3){NGHOST + nn.x, NGHOST, NGHOST}, //
};
const int3 dims = (int3){NGHOST, nn.y, nn.z};
const int mapping_count = sizeof(a0s) / sizeof(a0s[0]);
acDeviceCommunicateBlocksMPI(device, a0s, b0s, mapping_count, dims);
}
return AC_SUCCESS;
}
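// Sanity check on the coverage above: 8 corner blocks + 12 edge blocks
// (4 each for the X, Y and Z edges) + 6 side blocks (2 each for XY, XZ and YZ)
// = 26 block types, one for every neighbor direction of the 3x3x3 stencil
// except (0, 0, 0), which is the local computational domain itself.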
AcResult
acDeviceRunMPITest(void)
{
MPI_Init(NULL, NULL);
int nprocs, pid;
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
char processor_name[MPI_MAX_PROCESSOR_NAME];
int name_len;
MPI_Get_processor_name(processor_name, &name_len);
printf("Processor %s. Process %d of %d.\n", processor_name, pid, nprocs);
// Create model and candidate meshes
AcMeshInfo info;
acLoadConfig(AC_DEFAULT_CONFIG, &info);
// Some real params must be calculated (for the MHD case) // TODO DANGEROUS
info.real_params[AC_inv_dsx] = AcReal(1.0) / info.real_params[AC_dsx];
info.real_params[AC_inv_dsy] = AcReal(1.0) / info.real_params[AC_dsy];
info.real_params[AC_inv_dsz] = AcReal(1.0) / info.real_params[AC_dsz];
info.real_params[AC_cs2_sound] = info.real_params[AC_cs_sound] * info.real_params[AC_cs_sound];
AcMesh model, candidate;
// Master CPU
if (pid == 0) {
acMeshCreate(info, &model);
acMeshCreate(info, &candidate);
acMeshRandomize(&model);
acMeshRandomize(&candidate);
}
/// DECOMPOSITION & SUBMESH ///////////////////////////////////
AcMeshInfo submesh_info = info;
const int3 decomposition = decompose(nprocs);
const int3 pid3d = getPid3D(pid, decomposition);
printf("Decomposition: %d, %d, %d\n", decomposition.x, decomposition.y, decomposition.z);
printf("Process %d: (%d, %d, %d)\n", pid, pid3d.x, pid3d.y, pid3d.z);
ERRCHK_ALWAYS(info.int_params[AC_nx] % decomposition.x == 0);
ERRCHK_ALWAYS(info.int_params[AC_ny] % decomposition.y == 0);
ERRCHK_ALWAYS(info.int_params[AC_nz] % decomposition.z == 0);
submesh_info.int_params[AC_nx] = info.int_params[AC_nx] / decomposition.x;
submesh_info.int_params[AC_ny] = info.int_params[AC_ny] / decomposition.y;
submesh_info.int_params[AC_nz] = info.int_params[AC_nz] / decomposition.z;
submesh_info.int3_params[AC_global_grid_n] = (int3){
info.int_params[AC_nx],
info.int_params[AC_ny],
info.int_params[AC_nz],
};
submesh_info.int3_params[AC_multigpu_offset] = (int3){-1, -1, -1}; // TODO
WARNING("AC_multigpu_offset not yet implemented");
acUpdateConfig(&submesh_info);
ERRCHK_ALWAYS(is_valid(submesh_info.real_params[AC_inv_dsx]));
ERRCHK_ALWAYS(is_valid(submesh_info.real_params[AC_cs2_sound]));
AcMesh submesh;
acMeshCreate(submesh_info, &submesh);
acMeshRandomize(&submesh);
////////////////////////////////////////////////////////////////
// GPU INIT ////////////////////////////////////////////////////
int devices_per_node = -1;
cudaGetDeviceCount(&devices_per_node);
Device device;
acDeviceCreate(pid % devices_per_node, submesh_info, &device);
// TODO enable peer access
////////////////////////////////////////////////////////////////
// DISTRIBUTE & LOAD //////////////////////////////////////////
acDeviceDistributeMeshMPI(model, decomposition, &submesh);
acDeviceLoadMesh(device, STREAM_DEFAULT, submesh);
///////////////////////////////////////////////////////////////
// SYNC //////////////////////////////////////////////////////
acDeviceSynchronizeStream(device, STREAM_ALL);
MPI_Barrier(MPI_COMM_WORLD);
//////////////////////////////////////////////////////////////
// INTEGRATION & BOUNDCONDS////////////////////////////////////
acDeviceCommunicateHalosMPI(device);
///////////////////////////////////////////////////////////////
// STORE & GATHER /////////////////////////////////////////////
MPI_Barrier(MPI_COMM_WORLD);
acDeviceStoreMesh(device, STREAM_DEFAULT, &submesh);
acDeviceSynchronizeStream(device, STREAM_DEFAULT);
acDeviceGatherMeshMPI(submesh, decomposition, &candidate);
//////////////////////////////////////////////////////////////
// VERIFY ////////////////////////////////////////////////////
if (pid == 0) {
acMeshApplyPeriodicBounds(&model);
const bool valid = acVerifyMesh(model, candidate);
acMeshDestroy(&model);
acMeshDestroy(&candidate);
}
//////////////////////////////////////////////////////////////
// DESTROY ///////////////////////////////////////////////////
acDeviceDestroy(device);
acMeshDestroy(&submesh);
MPI_Finalize();
//////////////////////////////////////////////////////////////
return AC_SUCCESS;
}
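// A sketch of how the test might be launched (the binary name is only an
// example and depends on the local build; mpirun flags depend on the MPI
// installation):
//
//   # 8 ranks -> decompose() picks a 2x2x2 decomposition
//   mpirun -np 8 ./mpitest
//
// Note that MPI_Init() and MPI_Finalize() are called inside
// acDeviceRunMPITest() itself, so the caller must not initialize MPI
// separately.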
#endif // MPI_ENABLED //////////////////////////////////////////////////////////////////////////////