diff --git a/src/core/device.cc b/src/core/device.cc
index 1cdcc04..4641e77 100644
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -871,6 +871,7 @@ acDeviceIntegrateStepMPI(const Device device, const AcReal dt)
         }
 
         // MPI
+        MPI_Request send_requests[2 * NUM_VTXBUF_HANDLES];
         MPI_Request recv_requests[2 * NUM_VTXBUF_HANDLES];
         MPI_Datatype datatype = MPI_FLOAT;
         if (sizeof(AcReal) == 8)
@@ -909,18 +910,17 @@ acDeviceIntegrateStepMPI(const Device device, const AcReal dt)
                                                          device->local_config);
                 const int send_pid   = (pid + 1) % num_processes;
 
-                MPI_Request request;
                 MPI_Isend(&device->vba.in[i][src_idx], count, datatype, send_pid, i, MPI_COMM_WORLD,
-                          &request);
+                          &send_requests[i]);
             }
             { // Send back
                 // ...|ooooooo|xxx <- ...|xxxoooo|...
                 const size_t src_idx = acVertexBufferIdx(0, 0, NGHOST, device->local_config);
                 const int send_pid   = (pid + num_processes - 1) % num_processes;
 
-                MPI_Request request;
                 MPI_Isend(&device->vba.in[i][src_idx], count, datatype, send_pid,
-                          i + NUM_VTXBUF_HANDLES, MPI_COMM_WORLD, &request);
+                          i + NUM_VTXBUF_HANDLES, MPI_COMM_WORLD,
+                          &send_requests[i + NUM_VTXBUF_HANDLES]);
             }
         }
         // Inner integration
@@ -931,44 +931,35 @@ acDeviceIntegrateStepMPI(const Device device, const AcReal dt)
             acDeviceIntegrateSubstep(device, (Stream)(NUM_STREAMS - 2), isubstep, m1, m2, dt);
         }
 
-        for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
-            MPI_Status status;
-            MPI_Wait(&recv_requests[i], &status);
-            MPI_Wait(&recv_requests[i + NUM_VTXBUF_HANDLES], &status);
-        }
+        MPI_Waitall(2 * NUM_VTXBUF_HANDLES, recv_requests, MPI_STATUSES_IGNORE);
+        MPI_Waitall(2 * NUM_VTXBUF_HANDLES, send_requests, MPI_STATUSES_IGNORE);
 
         acDeviceSynchronizeStream(device, INNER_BOUNDCOND_STREAM);
-        // #pragma omp parallel for
         { // Front
             const int3 m1 = (int3){NGHOST, NGHOST, NGHOST};
             const int3 m2 = m1 + (int3){nx, ny, NGHOST};
             acDeviceIntegrateSubstep(device, STREAM_0, isubstep, m1, m2, dt);
         }
-        // #pragma omp parallel for
         { // Back
             const int3 m1 = (int3){NGHOST, NGHOST, nz};
             const int3 m2 = m1 + (int3){nx, ny, NGHOST};
             acDeviceIntegrateSubstep(device, STREAM_1, isubstep, m1, m2, dt);
         }
-        // #pragma omp parallel for
         { // Bottom
             const int3 m1 = (int3){NGHOST, NGHOST, 2 * NGHOST};
             const int3 m2 = m1 + (int3){nx, NGHOST, nz - 2 * NGHOST};
             acDeviceIntegrateSubstep(device, STREAM_2, isubstep, m1, m2, dt);
         }
-        // #pragma omp parallel for
         { // Top
             const int3 m1 = (int3){NGHOST, ny, 2 * NGHOST};
             const int3 m2 = m1 + (int3){nx, NGHOST, nz - 2 * NGHOST};
             acDeviceIntegrateSubstep(device, STREAM_3, isubstep, m1, m2, dt);
         }
-        // #pragma omp parallel for
         { // Left
             const int3 m1 = (int3){NGHOST, 2 * NGHOST, 2 * NGHOST};
             const int3 m2 = m1 + (int3){NGHOST, ny - 2 * NGHOST, nz - 2 * NGHOST};
             acDeviceIntegrateSubstep(device, STREAM_4, isubstep, m1, m2, dt);
         }
-        // #pragma omp parallel for
         { // Right
             const int3 m1 = (int3){nx, 2 * NGHOST, 2 * NGHOST};
             const int3 m2 = m1 + (int3){NGHOST, ny - 2 * NGHOST, nz - 2 * NGHOST};
@@ -1066,7 +1057,6 @@ acDeviceRunMPITest(void)
     acDeviceCreate(pid % devices_per_node, submesh_info, &device);
     acDeviceLoadMesh(device, STREAM_DEFAULT, submesh);
 
-    /*
     // Warmup
     for (int i = 0; i < 5; ++i) {
         acDeviceIntegrateStepMPI(device, FLT_EPSILON);
@@ -1098,7 +1088,6 @@ acDeviceRunMPITest(void)
         fclose(fp);
     }
     ////////////////////////////// Timer end
-    */
     acDeviceBoundStepMPI(device);
     acDeviceStoreMesh(device, STREAM_DEFAULT, &submesh);
     acDeviceDestroy(device);