diff --git a/src/core/device.cc b/src/core/device.cc
index 23ead6f..23f2ff2 100644
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -1096,11 +1096,322 @@ getPid(const int3 pid, const int3 decomposition)
            mod(pid.z, decomposition.z) * decomposition.x * decomposition.y;
 }
 
+static AcResult
+acDeviceDistributeMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* dst)
+{
+    MPI_Barrier(MPI_COMM_WORLD);
+    printf("Distributing mesh...\n");
+    fflush(stdout);
+    assert(dst);
+
+    MPI_Datatype datatype = MPI_FLOAT;
+    if (sizeof(AcReal) == 8)
+        datatype = MPI_DOUBLE;
+
+    int pid, nprocs;
+    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+
+    // Submesh nn
+    const int3 nn = (int3){
+        dst->info.int_params[AC_nx],
+        dst->info.int_params[AC_ny],
+        dst->info.int_params[AC_nz],
+    };
+
+    // Submesh mm (unused below; the pencil path needs only nn)
+    const int3 mm = (int3){
+        dst->info.int_params[AC_mx],
+        dst->info.int_params[AC_my],
+        dst->info.int_params[AC_mz],
+    };
+
+    // Send to self
+    if (pid == 0) {
+        for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {
+            // For pencils
+            for (int k = NGHOST; k < NGHOST + nn.z; ++k) {
+                for (int j = NGHOST; j < NGHOST + nn.y; ++j) {
+                    const int i       = NGHOST;
+                    const int count   = nn.x;
+                    const int src_idx = acVertexBufferIdx(i, j, k, src.info);
+                    const int dst_idx = acVertexBufferIdx(i, j, k, dst->info);
+                    memcpy(&dst->vertex_buffer[vtxbuf][dst_idx], //
+                           &src.vertex_buffer[vtxbuf][src_idx],  //
+                           count * sizeof(src.vertex_buffer[vtxbuf][0]));
+                }
+            }
+        }
+    }
+
+    for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {
+        // For pencils
+        for (int k = NGHOST; k < NGHOST + nn.z; ++k) {
+            for (int j = NGHOST; j < NGHOST + nn.y; ++j) {
+                const int i     = NGHOST;
+                const int count = nn.x;
+
+                if (pid != 0) {
+                    const int dst_idx = acVertexBufferIdx(i, j, k, dst->info);
+                    // Recv
+                    MPI_Status status;
+                    MPI_Recv(&dst->vertex_buffer[vtxbuf][dst_idx], count, datatype, 0, 0,
+                             MPI_COMM_WORLD, &status);
+                }
+                else {
+                    for (int tgt_pid = 1; tgt_pid < nprocs; ++tgt_pid) {
+                        const int3 tgt_pid3d = getPid3D(tgt_pid, decomposition);
+                        const int src_idx    = acVertexBufferIdx(i + tgt_pid3d.x * nn.x, //
+                                                                 j + tgt_pid3d.y * nn.y, //
+                                                                 k + tgt_pid3d.z * nn.z, //
+                                                                 src.info);
+
+                        // Send
+                        MPI_Send(&src.vertex_buffer[vtxbuf][src_idx], count, datatype, tgt_pid, 0,
+                                 MPI_COMM_WORLD);
+                    }
+                }
+            }
+        }
+    }
+    return AC_SUCCESS;
+}
+
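+// Index bookkeeping, by example: with decomposition (2, 2, 2) and the
+// x-fastest rank ordering used by getPid above, tgt_pid 3 maps to
+// tgt_pid3d = (1, 1, 0). Rank 3's pencils therefore start at
+// (NGHOST + nn.x, NGHOST + nn.y, NGHOST) in rank 0's full mesh and land at
+// (NGHOST, NGHOST, NGHOST) in its own submesh.
+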
+static AcResult
+acDeviceGatherMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* dst)
+{
+    MPI_Barrier(MPI_COMM_WORLD);
+    printf("Gathering mesh...\n");
+    fflush(stdout);
+    assert(dst);
+
+    MPI_Datatype datatype = MPI_FLOAT;
+    if (sizeof(AcReal) == 8)
+        datatype = MPI_DOUBLE;
+
+    int pid, nprocs;
+    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+
+    // Submesh nn
+    const int3 nn = (int3){
+        src.info.int_params[AC_nx],
+        src.info.int_params[AC_ny],
+        src.info.int_params[AC_nz],
+    };
+
+    // Submesh mm (unused below; the pencil path needs only nn)
+    const int3 mm = (int3){
+        src.info.int_params[AC_mx],
+        src.info.int_params[AC_my],
+        src.info.int_params[AC_mz],
+    };
+
+    // Send to self
+    if (pid == 0) {
+        for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {
+            // For pencils
+            for (int k = NGHOST; k < NGHOST + nn.z; ++k) {
+                for (int j = NGHOST; j < NGHOST + nn.y; ++j) {
+                    const int i       = NGHOST;
+                    const int count   = nn.x;
+                    const int src_idx = acVertexBufferIdx(i, j, k, src.info);
+                    const int dst_idx = acVertexBufferIdx(i, j, k, dst->info);
+                    memcpy(&dst->vertex_buffer[vtxbuf][dst_idx], //
+                           &src.vertex_buffer[vtxbuf][src_idx],  //
+                           count * sizeof(src.vertex_buffer[vtxbuf][0]));
+                }
+            }
+        }
+    }
+
+    for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {
+        // For pencils
+        for (int k = NGHOST; k < NGHOST + nn.z; ++k) {
+            for (int j = NGHOST; j < NGHOST + nn.y; ++j) {
+                const int i     = NGHOST;
+                const int count = nn.x;
+
+                if (pid != 0) {
+                    // Send
+                    const int src_idx = acVertexBufferIdx(i, j, k, src.info);
+                    MPI_Send(&src.vertex_buffer[vtxbuf][src_idx], count, datatype, 0, 0,
+                             MPI_COMM_WORLD);
+                }
+                else {
+                    for (int tgt_pid = 1; tgt_pid < nprocs; ++tgt_pid) {
+                        const int3 tgt_pid3d = getPid3D(tgt_pid, decomposition);
+                        const int dst_idx    = acVertexBufferIdx(i + tgt_pid3d.x * nn.x, //
+                                                                 j + tgt_pid3d.y * nn.y, //
+                                                                 k + tgt_pid3d.z * nn.z, //
+                                                                 dst->info);
+
+                        // Recv
+                        MPI_Status status;
+                        MPI_Recv(&dst->vertex_buffer[vtxbuf][dst_idx], count, datatype, tgt_pid, 0,
+                                 MPI_COMM_WORLD, &status);
+                    }
+                }
+            }
+        }
+    }
+    return AC_SUCCESS;
+}
+
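+/*
+// Untested sketch: the pencil loops above cost one message per (rank, field,
+// k, j). With an MPI subarray datatype the whole computational domain of a
+// vertex buffer travels in one message per (rank, field). Only the function
+// name below is new; everything else reuses this file's helpers. The strides
+// assume the i + j * mx + k * mx * my layout of acVertexBufferIdx, hence
+// MPI_ORDER_C with dimensions listed as {z, y, x}.
+static AcResult
+acDeviceDistributeMeshMPISubarray(const AcMesh src, const int3 decomposition, AcMesh* dst)
+{
+    assert(dst);
+
+    const MPI_Datatype datatype = sizeof(AcReal) == 8 ? MPI_DOUBLE : MPI_FLOAT;
+
+    int pid, nprocs;
+    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+
+    const int3 nn = (int3){dst->info.int_params[AC_nx], dst->info.int_params[AC_ny],
+                           dst->info.int_params[AC_nz]};
+    const int3 mm = (int3){dst->info.int_params[AC_mx], dst->info.int_params[AC_my],
+                           dst->info.int_params[AC_mz]};
+
+    // Recv type: the nn-sized interior of the mm-sized submesh.
+    const int rsizes[]    = {mm.z, mm.y, mm.x};
+    const int rsubsizes[] = {nn.z, nn.y, nn.x};
+    const int rstarts[]   = {NGHOST, NGHOST, NGHOST};
+    MPI_Datatype recv_block;
+    MPI_Type_create_subarray(3, rsizes, rsubsizes, rstarts, MPI_ORDER_C, datatype, &recv_block);
+    MPI_Type_commit(&recv_block);
+
+    if (pid == 0) {
+        // Send type: an nn-sized block strided by the full mesh. The offset is
+        // baked into the base pointer rather than the starts array so a single
+        // committed type serves every target rank.
+        const int3 mm_full    = (int3){src.info.int_params[AC_mx], src.info.int_params[AC_my],
+                                       src.info.int_params[AC_mz]};
+        const int ssizes[]    = {mm_full.z, mm_full.y, mm_full.x};
+        const int ssubsizes[] = {nn.z, nn.y, nn.x};
+        const int sstarts[]   = {0, 0, 0};
+        MPI_Datatype send_block;
+        MPI_Type_create_subarray(3, ssizes, ssubsizes, sstarts, MPI_ORDER_C, datatype,
+                                 &send_block);
+        MPI_Type_commit(&send_block);
+
+        for (int tgt_pid = 1; tgt_pid < nprocs; ++tgt_pid) {
+            const int3 p = getPid3D(tgt_pid, decomposition);
+            for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {
+                const int src_idx = acVertexBufferIdx(NGHOST + p.x * nn.x, //
+                                                      NGHOST + p.y * nn.y, //
+                                                      NGHOST + p.z * nn.z, src.info);
+                MPI_Send(&src.vertex_buffer[vtxbuf][src_idx], 1, send_block, tgt_pid, vtxbuf,
+                         MPI_COMM_WORLD);
+            }
+        }
+        MPI_Type_free(&send_block);
+        // Rank 0 would still copy its own block with the memcpy loop above.
+    }
+    else {
+        for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf)
+            MPI_Recv(&dst->vertex_buffer[vtxbuf][0], 1, recv_block, 0, vtxbuf, MPI_COMM_WORLD,
+                     MPI_STATUS_IGNORE);
+    }
+    MPI_Type_free(&recv_block);
+    return AC_SUCCESS;
+}
+*/
+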
+/*
+// Close to correct 3D distribute/gather but deadlocks
+// (likely because every rank != 0 runs the recv branch on every tgt_pid
+// iteration, so it posts nprocs - 1 times more receives than rank 0 sends)
+static AcResult
+acDeviceDistributeMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* dst)
+{
+    MPI_Barrier(MPI_COMM_WORLD);
+    printf("Distributing mesh...\n");
+    assert(dst);
+
+    MPI_Datatype datatype = MPI_FLOAT;
+    if (sizeof(AcReal) == 8)
+        datatype = MPI_DOUBLE;
+
+    int pid, nprocs;
+    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+
+    // Submesh nn
+    const int3 nn = (int3){
+        dst->info.int_params[AC_nx],
+        dst->info.int_params[AC_ny],
+        dst->info.int_params[AC_nz],
+    };
+
+    // Submesh mm
+    const int3 mm = (int3){
+        dst->info.int_params[AC_mx],
+        dst->info.int_params[AC_my],
+        dst->info.int_params[AC_mz],
+    };
+
+    for (int tgt_pid = 0; tgt_pid < nprocs; ++tgt_pid) {
+        const int3 tgt_pid3d = getPid3D(tgt_pid, decomposition);
+        for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {
+
+            // For pencils
+            for (int k = NGHOST; k < NGHOST + nn.z; ++k) {
+                for (int j = NGHOST; j < NGHOST + nn.y; ++j) {
+                    const int i       = NGHOST;
+                    const int count   = nn.x;
+                    const int src_idx = acVertexBufferIdx(i + tgt_pid3d.x * nn.x, //
+                                                          j + tgt_pid3d.y * nn.y, //
+                                                          k + tgt_pid3d.z * nn.z, //
+                                                          src.info);
+                    const int dst_idx = acVertexBufferIdx(i, j, k, dst->info);
+
+                    if (pid == 0) {
+                        // Send
+                        if (pid == tgt_pid) {
+                            // Copy to self
+                            memcpy(&dst->vertex_buffer[vtxbuf][dst_idx], //
+                                   &src.vertex_buffer[vtxbuf][src_idx],  //
+                                   count * sizeof(src.vertex_buffer[vtxbuf][0]));
+                        }
+                        else {
+                            // Send to others
+                            MPI_Send(&src.vertex_buffer[vtxbuf][src_idx], count, datatype, tgt_pid,
+                                     vtxbuf + (j + k * (NGHOST + nn.y)) * NUM_VTXBUF_HANDLES,
+                                     MPI_COMM_WORLD);
+                        }
+                    }
+                    else {
+                        // Recv
+                        MPI_Status status;
+                        MPI_Recv(&dst->vertex_buffer[vtxbuf][dst_idx], count, datatype, 0,
+                                 vtxbuf + (j + k * (NGHOST + nn.y)) * NUM_VTXBUF_HANDLES,
+                                 MPI_COMM_WORLD, &status);
+                    }
+                }
+            }
+        }
+    }
+    return AC_SUCCESS;
+}
+
+static AcResult
+acDeviceGatherMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* dst)
+{
+    MPI_Barrier(MPI_COMM_WORLD);
+    printf("Gathering mesh...\n");
+    assert(dst);
+
+    MPI_Datatype datatype = MPI_FLOAT;
+    if (sizeof(AcReal) == 8)
+        datatype = MPI_DOUBLE;
+
+    int pid, nprocs;
+    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+
+    // Submesh nn
+    const int3 nn = (int3){
+        dst->info.int_params[AC_nx],
+        dst->info.int_params[AC_ny],
+        dst->info.int_params[AC_nz],
+    };
+
+    // Submesh mm
+    const int3 mm = (int3){
+        dst->info.int_params[AC_mx],
+        dst->info.int_params[AC_my],
+        dst->info.int_params[AC_mz],
+    };
+
+    for (int tgt_pid = 0; tgt_pid < nprocs; ++tgt_pid) {
+        const int3 tgt_pid3d = getPid3D(tgt_pid, decomposition);
+        for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {
+
+            // For pencils
+            for (int k = NGHOST; k < NGHOST + nn.z; ++k) {
+                for (int j = NGHOST; j < NGHOST + nn.y; ++j) {
+                    const int i       = NGHOST;
+                    const int count   = nn.x;
+                    const int src_idx = acVertexBufferIdx(i, j, k, src.info);
+                    const int dst_idx = acVertexBufferIdx(i + tgt_pid3d.x * nn.x, //
+                                                          j + tgt_pid3d.y * nn.y, //
+                                                          k + tgt_pid3d.z * nn.z, //
+                                                          dst->info);
+
+                    if (pid == 0) {
+                        if (pid == tgt_pid) {
+                            // Copy to self
+                            memcpy(&dst->vertex_buffer[vtxbuf][dst_idx], //
+                                   &src.vertex_buffer[vtxbuf][src_idx],  //
+                                   count * sizeof(src.vertex_buffer[vtxbuf][0]));
+                        }
+                        else {
+                            // Recv from others
+                            MPI_Status status;
+                            MPI_Recv(&dst->vertex_buffer[vtxbuf][dst_idx], count, datatype,
+                                     tgt_pid, vtxbuf, MPI_COMM_WORLD, &status);
+                        }
+                    }
+                    else {
+                        // Send to 0
+                        MPI_Send(&src.vertex_buffer[vtxbuf][src_idx], count, datatype, 0, vtxbuf,
+                                 MPI_COMM_WORLD);
+                    }
+                }
+            }
+        }
+    }
+    return AC_SUCCESS;
+}
+*/
+
+/*
 static void
 acDeviceDistributeMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* dst)
 {
     MPI_Barrier(MPI_COMM_WORLD);
     printf("Distributing mesh...\n");
+    assert(dst);
 
     MPI_Datatype datatype = MPI_FLOAT;
     if (sizeof(AcReal) == 8)
@@ -1110,33 +1421,31 @@ acDeviceDistributeMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* ds
     MPI_Comm_rank(MPI_COMM_WORLD, &pid);
     MPI_Comm_size(MPI_COMM_WORLD, &num_processes);
 
-    const size_t count = acVertexBufferSize(dst->info);
+    // const size_t count = acVertexBufferSize(dst->info);
+    const int3 offset = (int3){
+        src.info.int_params[AC_nx] / decomposition.x,
+        src.info.int_params[AC_ny] / decomposition.y,
+        src.info.int_params[AC_nz] / decomposition.z,
+    };
 
     for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
         if (pid == 0) {
             // Communicate to self
-            assert(dst);
             memcpy(&dst->vertex_buffer[i][0], //
                    &src.vertex_buffer[i][0],  //
                    count * sizeof(src.vertex_buffer[i][0]));
 
             // Communicate to others
-            for (int j = 1; j < num_processes; ++j) {
-                const int3 target_pid = getPid3D(j, decomposition);
-                const int3 offset     = (int3){
-                    src.info.int_params[AC_nx] / decomposition.x,
-                    src.info.int_params[AC_ny] / decomposition.y,
-                    src.info.int_params[AC_nz] / decomposition.z,
-                };
-                const size_t src_idx = acVertexBufferIdx(target_pid.x * offset.x,
+            for (int tgt = 1; tgt < num_processes; ++tgt) {
+                const int3 target_pid = getPid3D(tgt, decomposition);
+                const size_t src_idx  = acVertexBufferIdx(target_pid.x * offset.x,
                                                           target_pid.y * offset.y,
                                                           target_pid.z * offset.z, src.info);
-                MPI_Send(&src.vertex_buffer[i][src_idx], count, datatype, j, 0, MPI_COMM_WORLD);
+                MPI_Send(&src.vertex_buffer[i][src_idx], count, datatype, tgt, 0, MPI_COMM_WORLD);
            }
         }
         else {
-            assert(dst);
             // Recv
             const size_t dst_idx = 0;
@@ -1147,6 +1456,7 @@ acDeviceDistributeMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* ds
     }
 }
 
+
 static void
 acDeviceGatherMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* dst)
 {
@@ -1199,6 +1509,7 @@ acDeviceGatherMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* dst)
         }
     }
 }
+*/
 
 /*
 static AcResult
@@ -2210,7 +2521,9 @@ acDeviceRunMPITest(void)
     acMeshRandomize(&submesh);
 
 #if BENCH_STRONG_SCALING
+    MPI_Barrier(MPI_COMM_WORLD);
     acDeviceDistributeMeshMPI(model, decomposition, &submesh);
+    MPI_Barrier(MPI_COMM_WORLD);
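+    // The barriers presumably keep the rank-0-serial distribution cost out of
+    // the timed region in strong-scaling runs. A minimal timing sketch under
+    // that assumption:
+    //   MPI_Barrier(MPI_COMM_WORLD);
+    //   const double start = MPI_Wtime();
+    //   // ... timed integration ...
+    //   MPI_Barrier(MPI_COMM_WORLD);
+    //   if (pid == 0)
+    //       printf("Elapsed: %g s\n", MPI_Wtime() - start);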
 #endif
 
     // Init the GPU
@@ -2254,7 +2567,7 @@ acDeviceRunMPITest(void)
     acDeviceCommunicateHalosMPI(device);
     acDeviceSynchronizeStream(device, STREAM_ALL);
 
-    acDeviceStoreMesh(device, STREAM_DEFAULT, &submesh);
+    // acDeviceStoreMesh(device, STREAM_DEFAULT, &submesh); // TODO re-enable
     acDeviceGatherMeshMPI(submesh, decomposition, &candidate);
 
     if (pid == 0) {
         // acModelIntegrateStep(model, FLT_EPSILON); // TODO
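+        // Intended flow per the TODOs above (sketch only; acVerifyMesh is an
+        // assumed comparison helper, not something defined in this diff):
+        //   acModelIntegrateStep(model, FLT_EPSILON);
+        //   acVerifyMesh(model, candidate);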