Merged the newest MPI changes
@@ -89,6 +89,8 @@ if (BUILD_SAMPLES)
add_subdirectory(samples/cpptest)
add_subdirectory(samples/mpitest)
add_subdirectory(samples/benchmark)
add_subdirectory(samples/bwtest)
add_subdirectory(samples/genbenchmarkscripts)
endif()

if (BUILD_STANDALONE)
@@ -89,19 +89,45 @@ main(void)
}
}*/

/*
// Basic
const size_t num_iters = 100;

// Warmup
for (size_t i = 0; i < 10; ++i)
for (size_t i = 0; i < num_iters / 10; ++i)
acGridIntegrate(STREAM_DEFAULT, FLT_EPSILON);

// Benchmark
Timer t;
const AcReal dt = FLT_EPSILON;

acGridSynchronizeStream(STREAM_ALL);
timer_reset(&t);
acGridSynchronizeStream(STREAM_ALL);

for (size_t i = 0; i < num_iters; ++i)
acGridIntegrate(STREAM_DEFAULT, dt);

acGridSynchronizeStream(STREAM_ALL);
if (!pid)
timer_diff_print(t);
acGridSynchronizeStream(STREAM_ALL);
*/

// Percentiles
const size_t num_iters = 100;
const double nth_percentile = 0.90;

std::vector<double> results; // ms
results.reserve(num_iters);

// Warmup
for (size_t i = 0; i < num_iters / 10; ++i)
acGridIntegrate(STREAM_DEFAULT, FLT_EPSILON);

// Benchmark
Timer t;
const AcReal dt = FLT_EPSILON;

for (size_t i = 0; i < num_iters; ++i) {
acGridSynchronizeStream(STREAM_ALL);
timer_reset(&t);
@@ -109,9 +135,9 @@ main(void)
acGridIntegrate(STREAM_DEFAULT, dt);
acGridSynchronizeStream(STREAM_ALL);
results.push_back(timer_diff_nsec(t) / 1e6);
acGridSynchronizeStream(STREAM_ALL);
}

// Write benchmark to file
if (!pid) {
std::sort(results.begin(), results.end(),
[](const double& a, const double& b) { return a < b; });
@@ -137,6 +163,48 @@ main(void)
fclose(fp);
}
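For reference, the percentile reported here is read straight off the sorted samples: with num_iters = 100 and nth_percentile = 0.90 as above, results[nth_percentile * num_iters] selects results[90], the 91st-smallest of the 100 measured integration times, i.e. a 90th-percentile estimate without interpolation.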

/*
const size_t num_iters = 100;
const double nth_percentile = 0.90;

std::vector<double> results; // ms
results.reserve(num_iters);

for (size_t i = 0; i < num_iters; ++i) {
acGridSynchronizeStream(STREAM_ALL);
timer_reset(&t);
acGridSynchronizeStream(STREAM_ALL);
acGridIntegrate(STREAM_DEFAULT, dt);
acGridSynchronizeStream(STREAM_ALL);
results.push_back(timer_diff_nsec(t) / 1e6);
}

// Write benchmark to file
if (!pid) {
std::sort(results.begin(), results.end(),
[](const double& a, const double& b) { return a < b; });
fprintf(stdout,
"Integration step time %g ms (%gth "
"percentile)--------------------------------------\n",
results[nth_percentile * num_iters], 100 * nth_percentile);

char path[4096] = "";
if (test == TEST_STRONG_SCALING)
strncpy(path, "strong_scaling.csv", sizeof(path));
else if (test == TEST_WEAK_SCALING)
strncpy(path, "weak_scaling.csv", sizeof(path));
else
ERROR("Invalid test type");

FILE* fp = fopen(path, "a");
ERRCHK_ALWAYS(fp);
// Format
// nprocs, measured (ms)
fprintf(fp, "%d, %g\n", nprocs, results[nth_percentile * num_iters]);

fclose(fp);
}*/

acGridQuit();
MPI_Finalize();
return EXIT_SUCCESS;
samples/bwtest/CMakeLists.txt (new file, 9 lines)
@@ -0,0 +1,9 @@
cmake_minimum_required(VERSION 3.17) # Required for modern CUDA::cudart linking

find_package(MPI)
find_package(OpenMP)
find_package(CUDAToolkit)

add_executable(bwtest main.c)
target_link_libraries(bwtest MPI::MPI_C OpenMP::OpenMP_C CUDA::cudart_static)
target_compile_options(bwtest PRIVATE -O3)
samples/bwtest/main.c (new file, 486 lines)
@@ -0,0 +1,486 @@
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#include <mpi.h>

#include <cuda_runtime_api.h>

#include "timer_hires.h" // From src/common

//#define BLOCK_SIZE (100 * 1024 * 1024) // Bytes
#define BLOCK_SIZE (256 * 256 * 3 * 8 * 8)

/*
Findings:
- MUST ALWAYS SET DEVICE. Absolutely kills performance if device is not set explicitly
- Need to use cudaMalloc for intranode comm for P2P to trigger with MPI
- For internode one should use pinned memory (RDMA is staged through pinned, gives full
network speed iff pinned)
- Both the sending and receiving arrays must be pinned to see performance improvement
in internode comm
*/
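A minimal standalone sketch of what these findings imply for buffer setup (illustrative only; the helper name alloc_comm_buffer and the internode flag are not part of this commit): pick the device from the MPI rank before any transfer, use cudaMalloc for intranode peer-to-peer, and pinned cudaMallocHost memory for internode RDMA.

// Illustrative sketch of the findings above; not part of the committed file.
static uint8_t*
alloc_comm_buffer(const int pid, const size_t bytes, const bool internode)
{
    int devices_per_node = -1;
    cudaGetDeviceCount(&devices_per_node);
    cudaSetDevice(pid % devices_per_node); // Always set the device explicitly

    uint8_t* buf = NULL;
    // Intranode: plain device memory so MPI can use P2P.
    // Internode: pinned host memory so RDMA staging runs at full network speed.
    const cudaError_t retval = internode ? cudaMallocHost((void**)&buf, bytes)
                                         : cudaMalloc((void**)&buf, bytes);
    assert(retval == cudaSuccess);
    return buf;
}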

static uint8_t*
allocHost(const size_t bytes)
{
uint8_t* arr = malloc(bytes);
assert(arr);
return arr;
}

static void
freeHost(uint8_t* arr)
{
free(arr);
}

static uint8_t*
allocDevice(const size_t bytes)
{
uint8_t* arr;
// Standard (20 GiB/s internode, 85 GiB/s intranode)
const cudaError_t retval = cudaMalloc((void**)&arr, bytes);
// Unified mem (5 GiB/s internode, 6 GiB/s intranode)
// const cudaError_t retval = cudaMallocManaged((void**)&arr, bytes, cudaMemAttachGlobal);
// Pinned (40 GiB/s internode, 10 GiB/s intranode)
// const cudaError_t retval = cudaMallocHost((void**)&arr, bytes);
assert(retval == cudaSuccess);
return arr;
}

static uint8_t*
allocDevicePinned(const size_t bytes)
{
uint8_t* arr;
// Standard (20 GiB/s internode, 85 GiB/s intranode)
// const cudaError_t retval = cudaMalloc((void**)&arr, bytes);
// Unified mem (5 GiB/s internode, 6 GiB/s intranode)
// const cudaError_t retval = cudaMallocManaged((void**)&arr, bytes, cudaMemAttachGlobal);
// Pinned (40 GiB/s internode, 10 GiB/s intranode)
const cudaError_t retval = cudaMallocHost((void**)&arr, bytes);
assert(retval == cudaSuccess);
return arr;
}

static void
freeDevice(uint8_t* arr)
{
cudaFree(arr);
}

static void
sendrecv_blocking(uint8_t* src, uint8_t* dst)
{
int pid, nprocs;
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
int nfront = (pid + 1) % nprocs;
int nback = (((pid - 1) % nprocs) + nprocs) % nprocs;

if (!pid) {
MPI_Status status;
MPI_Send(src, BLOCK_SIZE, MPI_BYTE, nfront, pid, MPI_COMM_WORLD);
MPI_Recv(dst, BLOCK_SIZE, MPI_BYTE, nback, nback, MPI_COMM_WORLD, &status);
}
else {
MPI_Status status;
MPI_Recv(dst, BLOCK_SIZE, MPI_BYTE, nback, nback, MPI_COMM_WORLD, &status);
MPI_Send(src, BLOCK_SIZE, MPI_BYTE, nfront, pid, MPI_COMM_WORLD);
}
}

static void
sendrecv_nonblocking(uint8_t* src, uint8_t* dst)
{
int pid, nprocs;
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
int nfront = (pid + 1) % nprocs;
int nback = (((pid - 1) % nprocs) + nprocs) % nprocs;

MPI_Request recv_request, send_request;
MPI_Irecv(dst, BLOCK_SIZE, MPI_BYTE, nback, nback, MPI_COMM_WORLD, &recv_request);
MPI_Isend(src, BLOCK_SIZE, MPI_BYTE, nfront, pid, MPI_COMM_WORLD, &send_request);

MPI_Status status;
MPI_Wait(&recv_request, &status);
MPI_Wait(&send_request, &status);
}

static void
sendrecv_twoway(uint8_t* src, uint8_t* dst)
{
int pid, nprocs;
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
int nfront = (pid + 1) % nprocs;
int nback = (((pid - 1) % nprocs) + nprocs) % nprocs;

MPI_Status status;
MPI_Sendrecv(src, BLOCK_SIZE, MPI_BYTE, nfront, pid, dst, BLOCK_SIZE, MPI_BYTE, nback, nback,
MPI_COMM_WORLD, &status);
}

static void
sendrecv_nonblocking_multiple(uint8_t* src, uint8_t* dst)
{
int pid, nprocs;
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

MPI_Request recv_requests[nprocs], send_requests[nprocs];
for (int i = 1; i < nprocs; ++i) {
int nfront = (pid + i) % nprocs;
int nback = (((pid - i) % nprocs) + nprocs) % nprocs;

MPI_Irecv(dst, BLOCK_SIZE, MPI_BYTE, nback, pid, MPI_COMM_WORLD, &recv_requests[i]);
MPI_Isend(src, BLOCK_SIZE, MPI_BYTE, nfront, nfront, MPI_COMM_WORLD, &send_requests[i]);
}

for (int i = 1; i < nprocs; ++i) {
MPI_Status status;
MPI_Wait(&recv_requests[i], &status);
MPI_Wait(&send_requests[i], &status);
}
}

static void
sendrecv_nonblocking_multiple_parallel(uint8_t* src, uint8_t* dst)
{
int pid, nprocs;
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

MPI_Request recv_requests[nprocs], send_requests[nprocs];
for (int i = 1; i < nprocs; ++i) {
int nfront = (pid + i) % nprocs;
MPI_Isend(src, BLOCK_SIZE, MPI_BYTE, nfront, nfront, MPI_COMM_WORLD, &send_requests[i]);
}

static bool error_shown = false;
if (!pid && !error_shown) {
fprintf(stderr, "\tWARNING: make sure you init MPI with MPI_Init_thread for OpenMP support (not "
"supported on puhti atm "
"2020-04-05)\n");
error_shown = true;
}
#pragma omp parallel for
for (int i = 1; i < nprocs; ++i) {
int nback = (((pid - i) % nprocs) + nprocs) % nprocs;

MPI_Status status;
MPI_Recv(dst, BLOCK_SIZE, MPI_BYTE, nback, pid, MPI_COMM_WORLD, &status);
}

for (int i = 1; i < nprocs; ++i) {
MPI_Status status;
MPI_Wait(&send_requests[i], &status);
}
}

static void
sendrecv_nonblocking_multiple_rt_pinning(uint8_t* src, uint8_t* dst)
{
int pid, nprocs;
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

static uint8_t* src_pinned = NULL;
static uint8_t* dst_pinned = NULL;
if (!src_pinned)
src_pinned = allocDevicePinned(BLOCK_SIZE); // Note: Never freed
if (!dst_pinned)
dst_pinned = allocDevicePinned(BLOCK_SIZE); // Note: Never freed

int devices_per_node = -1;
cudaGetDeviceCount(&devices_per_node);

const int node_id = pid / devices_per_node;

MPI_Request recv_requests[nprocs], send_requests[nprocs];
for (int i = 1; i < nprocs; ++i) {
int nfront = (pid + i) % nprocs;
int nback = (((pid - i) % nprocs) + nprocs) % nprocs;

if (nback / devices_per_node != pid / devices_per_node) // Not on the same node
MPI_Irecv(dst_pinned, BLOCK_SIZE, MPI_BYTE, nback, pid, MPI_COMM_WORLD,
&recv_requests[i]);
else
MPI_Irecv(dst, BLOCK_SIZE, MPI_BYTE, nback, pid, MPI_COMM_WORLD, &recv_requests[i]);

if (nfront / devices_per_node != pid / devices_per_node) // Not on the same node
MPI_Isend(src_pinned, BLOCK_SIZE, MPI_BYTE, nfront, nfront, MPI_COMM_WORLD,
&send_requests[i]);
else
MPI_Isend(src, BLOCK_SIZE, MPI_BYTE, nfront, nfront, MPI_COMM_WORLD, &send_requests[i]);
}

for (int i = 1; i < nprocs; ++i) {
MPI_Status status;
MPI_Wait(&recv_requests[i], &status);
MPI_Wait(&send_requests[i], &status);
}
}

static void
send_d2h(const uint8_t* src, uint8_t* dst)
{
cudaMemcpy(dst, src, BLOCK_SIZE, cudaMemcpyDeviceToHost);
}

static void
send_h2d(const uint8_t* src, uint8_t* dst)
{
cudaMemcpy(dst, src, BLOCK_SIZE, cudaMemcpyHostToDevice);
}

static void
sendrecv_d2h2d(const uint8_t* dsrc, uint8_t* hdst, const uint8_t* hsrc, uint8_t* ddst)
{
cudaStream_t d2h, h2d;
cudaStreamCreate(&d2h);
cudaStreamCreate(&h2d);

cudaMemcpyAsync(hdst, dsrc, BLOCK_SIZE, cudaMemcpyDeviceToHost, d2h);
cudaMemcpyAsync(ddst, hsrc, BLOCK_SIZE, cudaMemcpyHostToDevice, h2d);

cudaStreamSynchronize(d2h);
cudaStreamSynchronize(h2d);

cudaStreamDestroy(d2h);
cudaStreamDestroy(h2d);
}

#define PRINT \
if (!pid) \
printf

static void
measurebw(const char* msg, const size_t bytes, void (*sendrecv)(uint8_t*, uint8_t*), uint8_t* src,
uint8_t* dst)
{
const size_t num_samples = 100;

int pid, nprocs;
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

PRINT("%s\n", msg);
MPI_Barrier(MPI_COMM_WORLD);

PRINT("\tWarming up... ");
for (size_t i = 0; i < num_samples / 10; ++i)
sendrecv(src, dst);

MPI_Barrier(MPI_COMM_WORLD);
PRINT("Done\n");

PRINT("\tBandwidth... ");
fflush(stdout);

Timer t;
MPI_Barrier(MPI_COMM_WORLD);
timer_reset(&t);
MPI_Barrier(MPI_COMM_WORLD);

for (size_t i = 0; i < num_samples; ++i)
sendrecv(src, dst);

MPI_Barrier(MPI_COMM_WORLD);
const long double time_elapsed = timer_diff_nsec(t) / 1e9l; // seconds
PRINT("%Lg GiB/s\n", num_samples * bytes / time_elapsed / (1024 * 1024 * 1024));
PRINT("\tTransfer time: %Lg ms\n", time_elapsed * 1000 / num_samples);
MPI_Barrier(MPI_COMM_WORLD);
}
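As a rough worked example of the bandwidth formula above (the timing is illustrative): BLOCK_SIZE is 256 * 256 * 3 * 8 * 8 bytes = 12 MiB, so a bidirectional test passes bytes = 2 * BLOCK_SIZE = 24 MiB; if the 100 timed iterations take 0.1 s in total, the printout is 100 * 24 MiB / 0.1 s / 1024^3 ≈ 23.4 GiB/s with a per-transfer time of 1 ms.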

static void
measurebw2(const char* msg, const size_t bytes, void (*sendrecv)(const uint8_t*, uint8_t*, const uint8_t*, uint8_t*), const uint8_t* dsrc, uint8_t* hdst,
const uint8_t* hsrc, uint8_t* ddst)
{
const size_t num_samples = 100;

int pid, nprocs;
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

PRINT("%s\n", msg);
MPI_Barrier(MPI_COMM_WORLD);

PRINT("\tWarming up... ");
for (size_t i = 0; i < num_samples / 10; ++i)
sendrecv(dsrc, hdst, hsrc, ddst);

MPI_Barrier(MPI_COMM_WORLD);
PRINT("Done\n");

PRINT("\tBandwidth... ");
fflush(stdout);

Timer t;
MPI_Barrier(MPI_COMM_WORLD);
timer_reset(&t);
MPI_Barrier(MPI_COMM_WORLD);

for (size_t i = 0; i < num_samples; ++i)
sendrecv(dsrc, hdst, hsrc, ddst);

MPI_Barrier(MPI_COMM_WORLD);
const long double time_elapsed = timer_diff_nsec(t) / 1e9l; // seconds
PRINT("%Lg GiB/s\n", num_samples * bytes / time_elapsed / (1024 * 1024 * 1024));
PRINT("\tTransfer time: %Lg ms\n", time_elapsed * 1000 / num_samples);
MPI_Barrier(MPI_COMM_WORLD);
}

int
main(void)
{
MPI_Init(NULL, NULL);
// int provided;
// MPI_Init_thread(NULL, NULL, MPI_THREAD_MULTIPLE, &provided);
// assert(provided >= MPI_THREAD_MULTIPLE);

// Disable stdout buffering
setbuf(stdout, NULL);

int pid, nprocs;
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
assert(nprocs >= 2); // Require at least one neighbor

MPI_Barrier(MPI_COMM_WORLD);
if (!pid) {
printf("Do we have threads? The following should not be ordered (unless very lucky)\n");
#pragma omp parallel for
for (int i = 0; i < 10; ++i)
printf("%d, ", i);
printf("\n");
}
MPI_Barrier(MPI_COMM_WORLD);

int devices_per_node = -1;
cudaGetDeviceCount(&devices_per_node);
const int device_id = pid % devices_per_node;
cudaSetDevice(device_id);

printf("Process %d of %d running.\n", pid, nprocs);
MPI_Barrier(MPI_COMM_WORLD);

PRINT("Block size: %u MiB\n", BLOCK_SIZE / (1024 * 1024));

#if 1
{
uint8_t* src = allocHost(BLOCK_SIZE);
uint8_t* dst = allocHost(BLOCK_SIZE);

measurebw("Unidirectional bandwidth, blocking (Host)", //
2 * BLOCK_SIZE, sendrecv_blocking, src, dst);
measurebw("Bidirectional bandwidth, async (Host)", //
2 * BLOCK_SIZE, sendrecv_nonblocking, src, dst);
measurebw("Bidirectional bandwidth, twoway (Host)", //
2 * BLOCK_SIZE, sendrecv_twoway, src, dst);
measurebw("Bidirectional bandwidth, async multiple (Host)", //
2 * (nprocs-1) * BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);
//measurebw("Bidirectional bandwidth, async multiple parallel (Host)", //
//          2 * (nprocs-1) * BLOCK_SIZE, sendrecv_nonblocking_multiple_parallel, src, dst);

freeHost(src);
freeHost(dst);
}
PRINT("\n------------------------\n");

{
uint8_t* src = allocDevice(BLOCK_SIZE);
uint8_t* dst = allocDevice(BLOCK_SIZE);

measurebw("Unidirectional bandwidth, blocking (Device)", //
2 * BLOCK_SIZE, sendrecv_blocking, src, dst);
measurebw("Bidirectional bandwidth, async (Device)", //
2 * BLOCK_SIZE, sendrecv_nonblocking, src, dst);
measurebw("Bidirectional bandwidth, twoway (Device)", //
2 * BLOCK_SIZE, sendrecv_twoway, src, dst);
measurebw("Bidirectional bandwidth, async multiple (Device)", //
2 * (nprocs-1) *BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);
//measurebw("Bidirectional bandwidth, async multiple parallel (Device)", //
//          2 * (nprocs-1) *BLOCK_SIZE, sendrecv_nonblocking_multiple_parallel, src, dst);
measurebw("Bidirectional bandwidth, async multiple (Device, rt pinning)", //
2 * (nprocs-1) *BLOCK_SIZE, sendrecv_nonblocking_multiple_rt_pinning, src, dst);

freeDevice(src);
freeDevice(dst);
}
PRINT("\n------------------------\n");

{
uint8_t* src = allocDevicePinned(BLOCK_SIZE);
uint8_t* dst = allocDevicePinned(BLOCK_SIZE);

measurebw("Unidirectional bandwidth, blocking (Device, pinned)", //
2 * BLOCK_SIZE, sendrecv_blocking, src, dst);
measurebw("Bidirectional bandwidth, async (Device, pinned)", //
2 * BLOCK_SIZE, sendrecv_nonblocking, src, dst);
measurebw("Bidirectional bandwidth, twoway (Device, pinned)", //
2 * BLOCK_SIZE, sendrecv_twoway, src, dst);
measurebw("Bidirectional bandwidth, async multiple (Device, pinned)", //
2 * (nprocs-1) *BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);

freeDevice(src);
freeDevice(dst);
}
PRINT("\n------------------------\n");

{
uint8_t* hsrc = allocHost(BLOCK_SIZE);
uint8_t* hdst = allocHost(BLOCK_SIZE);
uint8_t* dsrc = allocDevice(BLOCK_SIZE);
uint8_t* ddst = allocDevice(BLOCK_SIZE);

measurebw("Unidirectional D2H", BLOCK_SIZE, send_d2h, dsrc, hdst);
measurebw("Unidirectional H2D", BLOCK_SIZE, send_h2d, hsrc, ddst);

measurebw2("Bidirectional D2H & H2D", 2 * BLOCK_SIZE, sendrecv_d2h2d, dsrc, hdst, hsrc, ddst);

freeDevice(dsrc);
freeDevice(ddst);
freeHost(hsrc);
freeHost(hdst);
}
PRINT("\n------------------------\n");

{
uint8_t* hsrc = allocHost(BLOCK_SIZE);
uint8_t* hdst = allocHost(BLOCK_SIZE);
uint8_t* dsrc = allocDevicePinned(BLOCK_SIZE);
uint8_t* ddst = allocDevicePinned(BLOCK_SIZE);

measurebw("Unidirectional D2H (pinned)", BLOCK_SIZE, send_d2h, dsrc, hdst);
measurebw("Unidirectional H2D (pinned)", BLOCK_SIZE, send_h2d, hsrc, ddst);

measurebw2("Bidirectional D2H & H2D (pinned)", 2 * BLOCK_SIZE, sendrecv_d2h2d, dsrc, hdst, hsrc, ddst);

freeDevice(dsrc);
freeDevice(ddst);
freeHost(hsrc);
freeHost(hdst);
}
PRINT("\n------------------------\n");
#else
{ // Final run for easy identification with the profiler
uint8_t* src = allocDevice(BLOCK_SIZE);
uint8_t* dst = allocDevice(BLOCK_SIZE);

measurebw("Bidirectional bandwidth, async multiple (Device, rt pinning)", //
2 * BLOCK_SIZE, sendrecv_nonblocking_multiple_rt_pinning, src, dst);

freeDevice(src);
freeDevice(dst);
}
#endif

MPI_Finalize();
return EXIT_SUCCESS;
}
samples/genbenchmarkscripts/CMakeLists.txt (new file, 8 lines)
@@ -0,0 +1,8 @@
add_executable(genbenchmarkscripts main.c)

add_custom_command(
TARGET genbenchmarkscripts POST_BUILD
COMMAND genbenchmarkscripts
WORKING_DIRECTORY ${PROJECT_BINARY_DIR}
COMMENT "Generating benchmark scripts"
)
samples/genbenchmarkscripts/main.c (new file, 46 lines)
@@ -0,0 +1,46 @@
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
const int max_nprocs = 128;
for (int nprocs = 1; nprocs <= max_nprocs; nprocs *= 2) {
char filename[4096];
sprintf(filename, "benchmark_%d.sh", nprocs);

FILE* fp = fopen(filename, "w");
assert(fp);

// Boilerplate
fprintf(fp, "#!/bin/bash\n");
fprintf(fp, "#SBATCH --job-name=astaroth\n");
fprintf(fp, "#SBATCH --account=project_2000403\n");
fprintf(fp, "#SBATCH --time=00:14:59\n");
fprintf(fp, "#SBATCH --mem=32000\n");
fprintf(fp, "#SBATCH --partition=gpu\n");

// nprocs, nodes, gpus
const int max_gpus_per_node = 4;
const int gpus_per_node = nprocs < max_gpus_per_node ? nprocs : max_gpus_per_node;
const int nodes = (int)ceil((double)nprocs / max_gpus_per_node);
fprintf(fp, "#SBATCH --gres=gpu:v100:%d\n", gpus_per_node);
fprintf(fp, "#SBATCH -n %d\n", nprocs);
fprintf(fp, "#SBATCH -N %d\n", nodes);

// Modules
fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake hpcx-mpi/2.5.0-cuda nccl\n");
fprintf(fp, "export UCX_MEMTYPE_CACHE=n\n");

// Profile and run
fprintf(fp, "mkdir -p profile_%d\n", nprocs);
fprintf(fp, "srun nvprof --annotate-mpi openmpi -o profile_%d/%%p.nvprof ./benchmark\n",
nprocs);

fclose(fp);
}

return EXIT_SUCCESS;
}
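As a worked example of the request arithmetic above: for nprocs = 32 the generated benchmark_32.sh asks for gpus_per_node = min(32, 4) = 4 and nodes = ceil(32 / 4) = 8, i.e. #SBATCH --gres=gpu:v100:4, -n 32, -N 8; for nprocs = 2 it asks for a single node with two GPUs.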
@@ -74,7 +74,8 @@ main(void)
int
main(void)
{
printf("The library was built without MPI support, cannot run mpitest. Rebuild Astaroth with cmake -DMPI_ENABLED=ON .. to enable.\n");
printf("The library was built without MPI support, cannot run mpitest. Rebuild Astaroth with "
"cmake -DMPI_ENABLED=ON .. to enable.\n");
return EXIT_FAILURE;
}
#endif // AC_MPI_ENABLED
samples/mpitest/main_old.cc (new file, 80 lines)
@@ -0,0 +1,80 @@
/*
Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.

This file is part of Astaroth.

Astaroth is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

Astaroth is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
/**
Running: mpirun -np <num processes> <executable>
*/
#include "astaroth.h"
#include "astaroth_utils.h"

#include <mpi.h>

int
main(void)
{
MPI_Init(NULL, NULL);
int nprocs, pid;
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
MPI_Comm_rank(MPI_COMM_WORLD, &pid);

// CPU alloc
AcMeshInfo info;
acLoadConfig(AC_DEFAULT_CONFIG, &info);
info.real_params[AC_inv_dsx] = AcReal(1.0) / info.real_params[AC_dsx];
info.real_params[AC_inv_dsy] = AcReal(1.0) / info.real_params[AC_dsy];
info.real_params[AC_inv_dsz] = AcReal(1.0) / info.real_params[AC_dsz];
info.real_params[AC_cs2_sound] = info.real_params[AC_cs_sound] * info.real_params[AC_cs_sound];

AcMesh model, candidate;
if (pid == 0) {
acMeshCreate(info, &model);
acMeshCreate(info, &candidate);
acMeshRandomize(&model);
acMeshRandomize(&candidate);
}

// GPU alloc & compute
Grid grid;
acGridCreateMPI(info, &grid);

acGridLoadMeshMPI(grid, STREAM_DEFAULT, model);
acGridSynchronizeStreamMPI(grid, STREAM_ALL);

acGridIntegrateMPI(grid, FLT_EPSILON);
acGridSynchronizeStreamMPI(grid, STREAM_ALL);
acGridSynchronizeMeshMPI(grid, STREAM_DEFAULT);
acGridSynchronizeStreamMPI(grid, STREAM_ALL);

acGridStoreMeshMPI(grid, STREAM_DEFAULT, &candidate);
acGridSynchronizeStreamMPI(grid, STREAM_ALL);

acGridDestroyMPI(grid);

// Verify
if (pid == 0) {
acModelIntegrateStep(model, FLT_EPSILON);
acMeshApplyPeriodicBounds(&model);

acVerifyMesh(model, candidate);
acMeshDestroy(&model);
acMeshDestroy(&candidate);
}

MPI_Finalize();
return EXIT_SUCCESS;
}
@@ -105,7 +105,8 @@ operator+(const int3& a, const int3& b)
return (int3){a.x + b.x, a.y + b.y, a.z + b.z};
}

static HOST_DEVICE_INLINE int3 operator*(const int3& a, const int3& b)
static HOST_DEVICE_INLINE int3
operator*(const int3& a, const int3& b)
{
return (int3){a.x * b.x, a.y * b.y, a.z * b.z};
}
@@ -144,12 +145,20 @@ operator-=(AcReal3& lhs, const AcReal3& rhs)
lhs.z -= rhs.z;
}

static HOST_DEVICE_INLINE AcReal3 operator*(const AcReal& a, const AcReal3& b)
static HOST_DEVICE_INLINE int3
operator*(const int& a, const int3& b)
{
return (int3){a * b.x, a * b.y, a * b.z};
}

static HOST_DEVICE_INLINE AcReal3
operator*(const AcReal& a, const AcReal3& b)
{
return (AcReal3){a * b.x, a * b.y, a * b.z};
}

static HOST_DEVICE_INLINE AcReal3 operator*(const AcReal3& b, const AcReal& a)
static HOST_DEVICE_INLINE AcReal3
operator*(const AcReal3& b, const AcReal& a)
{
return (AcReal3){a * b.x, a * b.y, a * b.z};
}
@@ -1,11 +1,14 @@
find_package(CUDAToolkit)

## Astaroth Core
add_library(astaroth_core STATIC device.cc node.cc astaroth.cc)
target_link_libraries(astaroth_core astaroth_utils astaroth_kernels cudart)
target_link_libraries(astaroth_core astaroth_utils astaroth_kernels CUDA::cudart)

## Options
if (MPI_ENABLED)
find_package(MPI)
target_link_libraries(astaroth_core MPI::MPI_CXX)
find_package(OpenMP)
target_link_libraries(astaroth_core MPI::MPI_CXX OpenMP::OpenMP_CXX)
endif()

if (MULTIGPU_ENABLED)
@@ -460,64 +460,139 @@ acDeviceReduceVec(const Device device, const Stream stream, const ReductionType
}

#if AC_MPI_ENABLED
/**
Quick overview of the MPI implementation:

The halo is partitioned into segments. The first coordinate of a segment is b0.
The array containing multiple b0s is called... "b0s".

Each b0 maps to an index in the computational domain of some neighboring process a0.
We have a0 = mod(b0 - nghost, nn) + nghost.
Intuitively, we
1) Transform b0 into a coordinate system where (0, 0, 0) is the first index in
the comp domain.
2) Wrap the transformed b0 around nn (comp domain)
3) Transform b0 back to a coordinate system where (0, 0, 0) is the first index
in the ghost zone

struct PackedData is used for packing and unpacking. Holds the actual data in
the halo partition
struct CommData holds multiple PackedDatas for sending and receiving halo
partitions
struct Grid contains information about the local GPU device, decomposition, the
total mesh dimensions and CommDatas

Basic steps:
1) Distribute the mesh among ranks
2) Integrate & communicate
- start inner integration and at the same time, pack halo data and send it to neighbors
- once all halo data has been received, unpack and do outer integration
- sync and start again
3) Gather the mesh to rank 0 for postprocessing
*/
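A small self-contained sketch of the b0 -> a0 mapping described above (the helper name halo_to_domain is illustrative, not part of this commit); it reuses the non-negative mod() defined below in this file.

// Illustrative sketch of a0 = mod(b0 - nghost, nn) + nghost; not part of the commit.
static int3
halo_to_domain(const int3 b0, const int3 nn, const int nghost)
{
    return (int3){
        (int)mod(b0.x - nghost, nn.x) + nghost,
        (int)mod(b0.y - nghost, nn.y) + nghost,
        (int)mod(b0.z - nghost, nn.z) + nghost,
    };
}
// Example with NGHOST = 3 and nn = (32, 32, 32): the halo block starting at
// b0 = (0, 3, 35) maps to a0 = (32, 3, 3), i.e. the low-x ghost zone is filled
// from the last interior cells in x and the high-z ghost zone from the first
// interior cells in z.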
#include <mpi.h>

static int
#include <stdint.h>

typedef struct {
uint64_t x, y, z;
} uint3_64;

static uint3_64
operator+(const uint3_64& a, const uint3_64& b)
{
return (uint3_64){a.x + b.x, a.y + b.y, a.z + b.z};
}

static int3
make_int3(const uint3_64 a)
{
return (int3){(int)a.x, (int)a.y, (int)a.z};
}

static uint64_t
mod(const int a, const int b)
{
const int r = a % b;
return r < 0 ? r + b : r;
}

static int
getPid(const int3 pid, const int3 decomposition)
static uint3_64
morton3D(const uint64_t pid)
{
return mod(pid.x, decomposition.x) + //
mod(pid.y, decomposition.y) * decomposition.x + //
mod(pid.z, decomposition.z) * decomposition.x * decomposition.y;
uint64_t i, j, k;
i = j = k = 0;
for (int bit = 0; bit <= 21; ++bit) {
const uint64_t mask = 0x1l << 3 * bit;
i |= ((pid & (mask << 0)) >> 2 * bit) >> 0;
j |= ((pid & (mask << 1)) >> 2 * bit) >> 1;
k |= ((pid & (mask << 2)) >> 2 * bit) >> 2;
}

return (uint3_64){i, j, k};
}

static int3
getPid3D(const int pid, const int3 decomposition)
static uint64_t
morton1D(const uint3_64 pid)
{
const int3 pid3d = (int3){
mod(pid, decomposition.x),
mod(pid / decomposition.x, decomposition.y),
(pid / (decomposition.x * decomposition.y)),
uint64_t i = 0;
for (int bit = 0; bit <= 21; ++bit) {
const uint64_t mask = 0x1l << bit;
i |= ((pid.x & mask) << 0) << 2 * bit;
i |= ((pid.y & mask) << 1) << 2 * bit;
i |= ((pid.z & mask) << 2) << 2 * bit;
}
return i;
}

static uint3_64
decompose(const uint64_t target)
{
// This is just so beautifully elegant. Complex and efficient decomposition
// in just one line of code.
uint3_64 p = morton3D(target - 1) + (uint3_64){1, 1, 1};

ERRCHK_ALWAYS(p.x * p.y * p.z == target);
return p;
}
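As a quick check of the decomposition above (these values follow from morton3D and agree with the hard-coded table that this code replaces below): decompose(8) gives (2, 2, 2), decompose(16) gives (4, 2, 2), and decompose(32) gives (4, 4, 2); a process count that cannot be built this way, e.g. 24, fails the ERRCHK_ALWAYS check.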

static uint3_64
wrap(const int3 i, const uint3_64 n)
{
return (uint3_64){
mod(i.x, n.x),
mod(i.y, n.y),
mod(i.z, n.z),
};
return pid3d;
}

static int
getPid(const int3 pid_raw, const uint3_64 decomp)
{
const uint3_64 pid = wrap(pid_raw, decomp);
return (int)morton1D(pid);
}

static int3
decompose(const int target)
getPid3D(const uint64_t pid, const uint3_64 decomp)
{
if (target == 16)
return (int3){4, 2, 2};
if (target == 32)
return (int3){4, 4, 2};
if (target == 128)
return (int3){8, 4, 4};
if (target == 256)
return (int3){8, 8, 4};
const uint3_64 pid3D = morton3D(pid);
ERRCHK_ALWAYS(getPid(make_int3(pid3D), decomp) == (int)pid);
return (int3){(int)pid3D.x, (int)pid3D.y, (int)pid3D.z};
}

int decomposition[] = {1, 1, 1};
/** Assumes that contiguous pids are on the same node and there is one process per GPU. */
static inline bool
onTheSameNode(const uint64_t pid_a, const uint64_t pid_b)
{
int devices_per_node = -1;
cudaGetDeviceCount(&devices_per_node);

int axis = 0;
while (decomposition[0] * decomposition[1] * decomposition[2] < target) {
++decomposition[axis];
axis = (axis + 1) % 3;
}
const uint64_t node_a = pid_a / devices_per_node;
const uint64_t node_b = pid_b / devices_per_node;

const int found = decomposition[0] * decomposition[1] * decomposition[2];
if (found != target) {
fprintf(stderr, "Invalid number of processes! Cannot decompose the problem domain!\n");
fprintf(stderr, "Target nprocs: %d. Next allowed: %d\n", target, found);
ERROR("Invalid nprocs");
return (int3){-1, -1, -1};
}
else {
return (int3){decomposition[0], decomposition[1], decomposition[2]};
}
return node_a == node_b;
}
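For example, with four visible devices per node, ranks 0-3 land on node 0 and ranks 4-7 on node 1, so onTheSameNode(3, 4) is false while onTheSameNode(4, 7) is true; acTransferCommData below uses this to keep intranode halo exchange in device memory and to route internode exchange through the pinned staging buffers.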

static PackedData
@@ -530,12 +605,18 @@ acCreatePackedData(const int3 dims)
const size_t bytes = dims.x * dims.y * dims.z * sizeof(data.data[0]) * NUM_VTXBUF_HANDLES;
ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&data.data, bytes));

ERRCHK_CUDA_ALWAYS(cudaMallocHost((void**)&data.data_pinned, bytes));
// ERRCHK_CUDA_ALWAYS(cudaMallocManaged((void**)&data.data_pinned, bytes)); // Significantly
// slower than pinned (38 ms vs. 125 ms)

return data;
}

static AcResult
acDestroyPackedData(PackedData* data)
{
cudaFree(data->data_pinned);

data->dims = (int3){-1, -1, -1};
cudaFree(data->data);
data->data = NULL;
@@ -558,6 +639,16 @@ acCreatePackedDataHost(const int3 dims)
return data;
}

static AcResult
acDestroyPackedDataHost(PackedData* data)
{
data->dims = (int3){-1, -1, -1};
free(data->data);
data->data = NULL;

return AC_SUCCESS;
}

static void
acTransferPackedDataToHost(const Device device, const cudaStream_t stream, const PackedData ddata,
PackedData* hdata)
@@ -579,21 +670,38 @@ acTransferPackedDataToDevice(const Device device, const cudaStream_t stream, con
NUM_VTXBUF_HANDLES;
ERRCHK_CUDA(cudaMemcpyAsync(ddata->data, hdata.data, bytes, cudaMemcpyHostToDevice, stream));
}

static AcResult
acDestroyPackedDataHost(PackedData* data)
{
data->dims = (int3){-1, -1, -1};
free(data->data);
data->data = NULL;

return AC_SUCCESS;
}
#endif // MPI_GPUDIRECT_DISABLED

static void
acPinPackedData(const Device device, const cudaStream_t stream, PackedData* ddata)
{
cudaSetDevice(device->id);
// TODO sync stream
ddata->pinned = true;

const size_t bytes = ddata->dims.x * ddata->dims.y * ddata->dims.z * sizeof(ddata->data[0]) *
NUM_VTXBUF_HANDLES;
ERRCHK_CUDA(cudaMemcpyAsync(ddata->data_pinned, ddata->data, bytes, cudaMemcpyDefault, stream));
}

static void
acUnpinPackedData(const Device device, const cudaStream_t stream, PackedData* ddata)
{
if (!ddata->pinned) // Unpin iff the data was pinned previously
return;

cudaSetDevice(device->id);
// TODO sync stream
ddata->pinned = false;

const size_t bytes = ddata->dims.x * ddata->dims.y * ddata->dims.z * sizeof(ddata->data[0]) *
NUM_VTXBUF_HANDLES;
ERRCHK_CUDA(cudaMemcpyAsync(ddata->data, ddata->data_pinned, bytes, cudaMemcpyDefault, stream));
}

// TODO: do with packed data
static AcResult
acDeviceDistributeMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* dst)
acDeviceDistributeMeshMPI(const AcMesh src, const uint3_64 decomposition, AcMesh* dst)
{
MPI_Barrier(MPI_COMM_WORLD);
printf("Distributing mesh...\n");
@@ -669,7 +777,7 @@ acDeviceDistributeMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* ds

// TODO: do with packed data
static AcResult
acDeviceGatherMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* dst)
acDeviceGatherMeshMPI(const AcMesh src, const uint3_64 decomposition, AcMesh* dst)
{
MPI_Barrier(MPI_COMM_WORLD);
printf("Gathering mesh...\n");
@@ -803,7 +911,9 @@ acCreateCommData(const Device device, const int3 dims, const size_t count)
data.dsts_host[i] = acCreatePackedDataHost(dims);
#endif

cudaStreamCreate(&data.streams[i]);
int low_prio, high_prio;
cudaDeviceGetStreamPriorityRange(&low_prio, &high_prio);
cudaStreamCreateWithPriority(&data.streams[i], cudaStreamNonBlocking, high_prio);
}

return data;
@@ -849,12 +959,28 @@ acSyncCommData(const CommData data)
cudaStreamSynchronize(data.streams[i]);
}

static int3
mod(const int3 a, const int3 n)
{
return (int3){mod(a.x, n.x), mod(a.y, n.y), mod(a.z, n.z)};
}

static void
acPackCommData(const Device device, const int3* a0s, CommData* data)
acPackCommData(const Device device, const int3* b0s, CommData* data)
{
cudaSetDevice(device->id);
for (size_t i = 0; i < data->count; ++i)
acKernelPackData(data->streams[i], device->vba, a0s[i], data->srcs[i]);

const int3 nn = (int3){
device->local_config.int_params[AC_nx],
device->local_config.int_params[AC_ny],
device->local_config.int_params[AC_nz],
};
const int3 nghost = (int3){NGHOST, NGHOST, NGHOST};

for (size_t i = 0; i < data->count; ++i) {
const int3 a0 = mod(b0s[i] - nghost, nn) + nghost;
acKernelPackData(data->streams[i], device->vba, a0, data->srcs[i]);
}
}

static void
@@ -884,10 +1010,31 @@ acTransferCommDataToDevice(const Device device, CommData* data)
}
#endif

static inline void
acPinCommData(const Device device, CommData* data)
{
cudaSetDevice(device->id);
for (size_t i = 0; i < data->count; ++i)
acPinPackedData(device, data->streams[i], &data->srcs[i]);
}

static void
acUnpinCommData(const Device device, CommData* data)
{
cudaSetDevice(device->id);

// Clear pin flags from src
for (size_t i = 0; i < data->count; ++i)
data->srcs[i].pinned = false;

// Transfer from pinned to gmem
for (size_t i = 0; i < data->count; ++i)
acUnpinPackedData(device, data->streams[i], &data->dsts[i]);
}

static AcResult
acTransferCommData(const Device device, //
const int3* a0s, // Src idx inside comp. domain
const int3* b0s, // Dst idx inside bound zone
const int3* b0s, // Halo partition coordinates
CommData* data)
{
cudaSetDevice(device->id);
@@ -899,7 +1046,7 @@ acTransferCommData(const Device device, //
int nprocs, pid;
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
const int3 decomp = decompose(nprocs);
const uint3_64 decomp = decompose(nprocs);

const int3 nn = (int3){
device->local_config.int_params[AC_nx],
@@ -907,80 +1054,55 @@ acTransferCommData(const Device device, //
device->local_config.int_params[AC_nz],
};

const int3 pid3d = getPid3D(pid, decomp);
const int3 dims = data->dims;
const size_t blockcount = data->count;

for (int k = -1; k <= 1; ++k) {
for (int j = -1; j <= 1; ++j) {
for (int i = -1; i <= 1; ++i) {
if (i == 0 && j == 0 && k == 0)
continue;

for (size_t a_idx = 0; a_idx < blockcount; ++a_idx) {
for (size_t b_idx = 0; b_idx < blockcount; ++b_idx) {
const int3 neighbor = (int3){i, j, k};

const int3 a0 = a0s[a_idx];
// const int3 a1 = a0 + dims;

const int3 b0 = a0 - neighbor * nn;
// const int3 b1 = a1 - neighbor * nn;

if (b0s[b_idx].x == b0.x && b0s[b_idx].y == b0.y && b0s[b_idx].z == b0.z) {

const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES;
const int3 nghost = (int3){NGHOST, NGHOST, NGHOST};

#if MPI_GPUDIRECT_DISABLED
PackedData dst = data->dsts_host[b_idx];
#else
PackedData dst = data->dsts[b_idx];
#endif
for (size_t b0_idx = 0; b0_idx < blockcount; ++b0_idx) {

const int3 pid3d = getPid3D(pid, decomp);
MPI_Irecv(dst.data, count, datatype, getPid(pid3d - neighbor, decomp),
b_idx, MPI_COMM_WORLD, &data->recv_reqs[b_idx]);
}
}
}
const int3 b0 = b0s[b0_idx];
const int3 neighbor = (int3){
b0.x < NGHOST ? -1 : b0.x >= NGHOST + nn.x ? 1 : 0,
b0.y < NGHOST ? -1 : b0.y >= NGHOST + nn.y ? 1 : 0,
b0.z < NGHOST ? -1 : b0.z >= NGHOST + nn.z ? 1 : 0,
};
const int npid = getPid(pid3d + neighbor, decomp);

PackedData* dst = &data->dsts[b0_idx];
if (onTheSameNode(pid, npid)) {
MPI_Irecv(dst->data, count, datatype, npid, b0_idx, //
MPI_COMM_WORLD, &data->recv_reqs[b0_idx]);
dst->pinned = false;
}
else {
MPI_Irecv(dst->data_pinned, count, datatype, npid, b0_idx, //
MPI_COMM_WORLD, &data->recv_reqs[b0_idx]);
dst->pinned = true;
}
}

for (int k = -1; k <= 1; ++k) {
for (int j = -1; j <= 1; ++j) {
for (int i = -1; i <= 1; ++i) {
if (i == 0 && j == 0 && k == 0)
continue;
for (size_t b0_idx = 0; b0_idx < blockcount; ++b0_idx) {
const int3 b0 = b0s[b0_idx];
const int3 neighbor = (int3){
b0.x < NGHOST ? -1 : b0.x >= NGHOST + nn.x ? 1 : 0,
b0.y < NGHOST ? -1 : b0.y >= NGHOST + nn.y ? 1 : 0,
b0.z < NGHOST ? -1 : b0.z >= NGHOST + nn.z ? 1 : 0,
};
const int npid = getPid(pid3d - neighbor, decomp);

for (size_t a_idx = 0; a_idx < blockcount; ++a_idx) {
for (size_t b_idx = 0; b_idx < blockcount; ++b_idx) {
const int3 neighbor = (int3){i, j, k};

const int3 a0 = a0s[a_idx];
// const int3 a1 = a0 + dims;

const int3 b0 = a0 - neighbor * nn;
// const int3 b1 = a1 - neighbor * nn;

if (b0s[b_idx].x == b0.x && b0s[b_idx].y == b0.y && b0s[b_idx].z == b0.z) {

const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES;

#if MPI_GPUDIRECT_DISABLED
PackedData src = data->srcs_host[a_idx];
#else
PackedData src = data->srcs[a_idx];
#endif

const int3 pid3d = getPid3D(pid, decomp);

cudaStreamSynchronize(data->streams[a_idx]);
MPI_Isend(src.data, count, datatype, getPid(pid3d + neighbor, decomp),
b_idx, MPI_COMM_WORLD, &data->send_reqs[b_idx]);
}
}
}
PackedData* src = &data->srcs[b0_idx];
if (onTheSameNode(pid, npid)) {
cudaStreamSynchronize(data->streams[b0_idx]);
MPI_Isend(src->data, count, datatype, npid, b0_idx, //
MPI_COMM_WORLD, &data->send_reqs[b0_idx]);
}
else {
acPinPackedData(device, data->streams[b0_idx], src);
cudaStreamSynchronize(data->streams[b0_idx]);
MPI_Isend(src->data_pinned, count, datatype, npid, b0_idx, //
MPI_COMM_WORLD, &data->send_reqs[b0_idx]);
}
}
@@ -999,7 +1121,7 @@ acTransferCommDataWait(const CommData data)
typedef struct {
Device device;
AcMesh submesh;
int3 decomposition;
uint3_64 decomposition;
bool initialized;

int3 nn;
@@ -1041,10 +1163,10 @@ acGridInit(const AcMeshInfo info)

// Decompose
AcMeshInfo submesh_info = info;
const int3 decomposition = decompose(nprocs);
const uint3_64 decomposition = decompose(nprocs);
const int3 pid3d = getPid3D(pid, decomposition);

printf("Decomposition: %d, %d, %d\n", decomposition.x, decomposition.y, decomposition.z);
printf("Decomposition: %lu, %lu, %lu\n", decomposition.x, decomposition.y, decomposition.z);
printf("Process %d: (%d, %d, %d)\n", pid, pid3d.x, pid3d.y, pid3d.z);
ERRCHK_ALWAYS(info.int_params[AC_nx] % decomposition.x == 0);
ERRCHK_ALWAYS(info.int_params[AC_ny] % decomposition.y == 0);
@@ -1089,79 +1211,19 @@ acGridInit(const AcMeshInfo info)
device->local_config.int_params[AC_nz],
};

// Corners
const int3 corner_a0s[] = {
(int3){NGHOST, NGHOST, NGHOST}, //
(int3){nn.x, NGHOST, NGHOST}, //
(int3){NGHOST, nn.y, NGHOST}, //
(int3){nn.x, nn.y, NGHOST}, //

(int3){NGHOST, NGHOST, nn.z}, //
(int3){nn.x, NGHOST, nn.z}, //
(int3){NGHOST, nn.y, nn.z}, //
(int3){nn.x, nn.y, nn.z},
};

// Edges X
const int3 edgex_a0s[] = {
(int3){NGHOST, NGHOST, NGHOST}, //
(int3){NGHOST, nn.y, NGHOST}, //

(int3){NGHOST, NGHOST, nn.z}, //
(int3){NGHOST, nn.y, nn.z}, //
};

// Edges Y
const int3 edgey_a0s[] = {
(int3){NGHOST, NGHOST, NGHOST}, //
(int3){nn.x, NGHOST, NGHOST}, //

(int3){NGHOST, NGHOST, nn.z}, //
(int3){nn.x, NGHOST, nn.z}, //
};

// Edges Z
const int3 edgez_a0s[] = {
(int3){NGHOST, NGHOST, NGHOST}, //
(int3){nn.x, NGHOST, NGHOST}, //

(int3){NGHOST, nn.y, NGHOST}, //
(int3){nn.x, nn.y, NGHOST}, //
};

// Sides XY
const int3 sidexy_a0s[] = {
(int3){NGHOST, NGHOST, NGHOST}, //
(int3){NGHOST, NGHOST, nn.z}, //
};

// Sides XZ
const int3 sidexz_a0s[] = {
(int3){NGHOST, NGHOST, NGHOST}, //
(int3){NGHOST, nn.y, NGHOST}, //
};

// Sides YZ
const int3 sideyz_a0s[] = {
(int3){NGHOST, NGHOST, NGHOST}, //
(int3){nn.x, NGHOST, NGHOST}, //
};

const int3 corner_dims = (int3){NGHOST, NGHOST, NGHOST};
const int3 edgex_dims = (int3){nn.x, NGHOST, NGHOST};
const int3 edgey_dims = (int3){NGHOST, nn.y, NGHOST};
const int3 edgez_dims = (int3){NGHOST, NGHOST, nn.z};
const int3 sidexy_dims = (int3){nn.x, nn.y, NGHOST};
const int3 sidexz_dims = (int3){nn.x, NGHOST, nn.z};
const int3 sideyz_dims = (int3){NGHOST, nn.y, nn.z};
// Create CommData
// We have 8 corners, 12 edges, and 6 sides
//
// For simplicity's sake all data blocks inside a single CommData struct
// have the same dimensions.
grid.nn = nn;
grid.corner_data = acCreateCommData(device, corner_dims, ARRAY_SIZE(corner_a0s));
grid.edgex_data = acCreateCommData(device, edgex_dims, ARRAY_SIZE(edgex_a0s));
grid.edgey_data = acCreateCommData(device, edgey_dims, ARRAY_SIZE(edgey_a0s));
grid.edgez_data = acCreateCommData(device, edgez_dims, ARRAY_SIZE(edgez_a0s));
grid.sidexy_data = acCreateCommData(device, sidexy_dims, ARRAY_SIZE(sidexy_a0s));
grid.sidexz_data = acCreateCommData(device, sidexz_dims, ARRAY_SIZE(sidexz_a0s));
grid.sideyz_data = acCreateCommData(device, sideyz_dims, ARRAY_SIZE(sideyz_a0s));
grid.corner_data = acCreateCommData(device, (int3){NGHOST, NGHOST, NGHOST}, 8);
grid.edgex_data = acCreateCommData(device, (int3){nn.x, NGHOST, NGHOST}, 4);
grid.edgey_data = acCreateCommData(device, (int3){NGHOST, nn.y, NGHOST}, 4);
grid.edgez_data = acCreateCommData(device, (int3){NGHOST, NGHOST, nn.z}, 4);
grid.sidexy_data = acCreateCommData(device, (int3){nn.x, nn.y, NGHOST}, 2);
grid.sidexz_data = acCreateCommData(device, (int3){nn.x, NGHOST, nn.z}, 2);
grid.sideyz_data = acCreateCommData(device, (int3){NGHOST, nn.y, nn.z}, 2);

acGridSynchronizeStream(STREAM_ALL);
return AC_SUCCESS;
@@ -1182,7 +1244,7 @@ acGridQuit(void)
acDestroyCommData(grid.device, &grid.sideyz_data);

grid.initialized = false;
grid.decomposition = (int3){-1, -1, -1};
grid.decomposition = (uint3_64){0, 0, 0};
acMeshDestroy(&grid.submesh);
acDeviceDestroy(grid.device);

@@ -1220,7 +1282,7 @@ AcResult
acGridIntegrate(const Stream stream, const AcReal dt)
{
ERRCHK(grid.initialized);
//acGridSynchronizeStream(stream);
// acGridSynchronizeStream(stream);

const Device device = grid.device;
const int3 nn = grid.nn;
@@ -1235,37 +1297,19 @@ acGridIntegrate(const Stream stream, const AcReal dt)
acDeviceSynchronizeStream(device, stream);

// Corners
const int3 corner_a0s[] = {
(int3){NGHOST, NGHOST, NGHOST}, //
(int3){nn.x, NGHOST, NGHOST}, //
(int3){NGHOST, nn.y, NGHOST}, //
(int3){nn.x, nn.y, NGHOST}, //

(int3){NGHOST, NGHOST, nn.z}, //
(int3){nn.x, NGHOST, nn.z}, //
(int3){NGHOST, nn.y, nn.z}, //
(int3){nn.x, nn.y, nn.z},
};
const int3 corner_b0s[] = {
(int3){0, 0, 0},
(int3){NGHOST + nn.x, 0, 0},
(int3){0, NGHOST + nn.y, 0},
(int3){NGHOST + nn.x, NGHOST + nn.y, 0},

(int3){0, 0, NGHOST + nn.z},

(int3){NGHOST + nn.x, NGHOST + nn.y, 0},
(int3){NGHOST + nn.x, 0, NGHOST + nn.z},
(int3){0, NGHOST + nn.y, NGHOST + nn.z},
(int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST + nn.z},
};

// Edges X
const int3 edgex_a0s[] = {
(int3){NGHOST, NGHOST, NGHOST}, //
(int3){NGHOST, nn.y, NGHOST}, //

(int3){NGHOST, NGHOST, nn.z}, //
(int3){NGHOST, nn.y, nn.z}, //
};
const int3 edgex_b0s[] = {
(int3){NGHOST, 0, 0},
(int3){NGHOST, NGHOST + nn.y, 0},
@@ -1275,13 +1319,6 @@ acGridIntegrate(const Stream stream, const AcReal dt)
};

// Edges Y
const int3 edgey_a0s[] = {
(int3){NGHOST, NGHOST, NGHOST}, //
(int3){nn.x, NGHOST, NGHOST}, //

(int3){NGHOST, NGHOST, nn.z}, //
(int3){nn.x, NGHOST, nn.z}, //
};
const int3 edgey_b0s[] = {
(int3){0, NGHOST, 0},
(int3){NGHOST + nn.x, NGHOST, 0},
@@ -1291,13 +1328,6 @@ acGridIntegrate(const Stream stream, const AcReal dt)
};

// Edges Z
const int3 edgez_a0s[] = {
(int3){NGHOST, NGHOST, NGHOST}, //
(int3){nn.x, NGHOST, NGHOST}, //

(int3){NGHOST, nn.y, NGHOST}, //
(int3){nn.x, nn.y, NGHOST}, //
};
const int3 edgez_b0s[] = {
(int3){0, 0, NGHOST},
(int3){NGHOST + nn.x, 0, NGHOST},
@@ -1307,63 +1337,31 @@ acGridIntegrate(const Stream stream, const AcReal dt)
};

// Sides XY
const int3 sidexy_a0s[] = {
(int3){NGHOST, NGHOST, NGHOST}, //
(int3){NGHOST, NGHOST, nn.z}, //
};
const int3 sidexy_b0s[] = {
(int3){NGHOST, NGHOST, 0}, //
(int3){NGHOST, NGHOST, NGHOST + nn.z}, //
};

// Sides XZ
const int3 sidexz_a0s[] = {
(int3){NGHOST, NGHOST, NGHOST}, //
(int3){NGHOST, nn.y, NGHOST}, //
};
const int3 sidexz_b0s[] = {
(int3){NGHOST, 0, NGHOST}, //
(int3){NGHOST, NGHOST + nn.y, NGHOST}, //
};

// Sides YZ
const int3 sideyz_a0s[] = {
(int3){NGHOST, NGHOST, NGHOST}, //
(int3){nn.x, NGHOST, NGHOST}, //
};
const int3 sideyz_b0s[] = {
(int3){0, NGHOST, NGHOST}, //
(int3){NGHOST + nn.x, NGHOST, NGHOST}, //
};

for (int isubstep = 0; isubstep < 3; ++isubstep) {
acPackCommData(device, corner_a0s, &corner_data);
acPackCommData(device, edgex_a0s, &edgex_data);
acPackCommData(device, edgey_a0s, &edgey_data);
acPackCommData(device, edgez_a0s, &edgez_data);
acPackCommData(device, sidexy_a0s, &sidexy_data);
acPackCommData(device, sidexz_a0s, &sidexz_data);
acPackCommData(device, sideyz_a0s, &sideyz_data);

MPI_Barrier(MPI_COMM_WORLD);

#if MPI_GPUDIRECT_DISABLED
acTransferCommDataToHost(device, &corner_data);
acTransferCommDataToHost(device, &edgex_data);
acTransferCommDataToHost(device, &edgey_data);
acTransferCommDataToHost(device, &edgez_data);
acTransferCommDataToHost(device, &sidexy_data);
acTransferCommDataToHost(device, &sidexz_data);
acTransferCommDataToHost(device, &sideyz_data);
#endif

acTransferCommData(device, corner_a0s, corner_b0s, &corner_data);
acTransferCommData(device, edgex_a0s, edgex_b0s, &edgex_data);
acTransferCommData(device, edgey_a0s, edgey_b0s, &edgey_data);
acTransferCommData(device, edgez_a0s, edgez_b0s, &edgez_data);
acTransferCommData(device, sidexy_a0s, sidexy_b0s, &sidexy_data);
acTransferCommData(device, sidexz_a0s, sidexz_b0s, &sidexz_data);
acTransferCommData(device, sideyz_a0s, sideyz_b0s, &sideyz_data);
// acPackCommData(device, corner_b0s, &corner_data);
acPackCommData(device, edgex_b0s, &edgex_data);
acPackCommData(device, edgey_b0s, &edgey_data);
acPackCommData(device, edgez_b0s, &edgez_data);
acPackCommData(device, sidexy_b0s, &sidexy_data);
acPackCommData(device, sidexz_b0s, &sidexz_data);
acPackCommData(device, sideyz_b0s, &sideyz_data);

//////////// INNER INTEGRATION //////////////
{
@@ -1373,7 +1371,27 @@ acGridIntegrate(const Stream stream, const AcReal dt)
    }
    ////////////////////////////////////////////

    acTransferCommDataWait(corner_data);
    MPI_Barrier(MPI_COMM_WORLD);

#if MPI_GPUDIRECT_DISABLED
    // acTransferCommDataToHost(device, &corner_data);
    acTransferCommDataToHost(device, &edgex_data);
    acTransferCommDataToHost(device, &edgey_data);
    acTransferCommDataToHost(device, &edgez_data);
    acTransferCommDataToHost(device, &sidexy_data);
    acTransferCommDataToHost(device, &sidexz_data);
    acTransferCommDataToHost(device, &sideyz_data);
#endif

    // acTransferCommData(device, corner_b0s, &corner_data);
    acTransferCommData(device, edgex_b0s, &edgex_data);
    acTransferCommData(device, edgey_b0s, &edgey_data);
    acTransferCommData(device, edgez_b0s, &edgez_data);
    acTransferCommData(device, sidexy_b0s, &sidexy_data);
    acTransferCommData(device, sidexz_b0s, &sidexz_data);
    acTransferCommData(device, sideyz_b0s, &sideyz_data);

    // acTransferCommDataWait(corner_data);
    acTransferCommDataWait(edgex_data);
    acTransferCommDataWait(edgey_data);
    acTransferCommDataWait(edgez_data);
@@ -1382,7 +1400,7 @@ acGridIntegrate(const Stream stream, const AcReal dt)
    acTransferCommDataWait(sideyz_data);

#if MPI_GPUDIRECT_DISABLED
    acTransferCommDataToDevice(device, &corner_data);
    // acTransferCommDataToDevice(device, &corner_data);
    acTransferCommDataToDevice(device, &edgex_data);
    acTransferCommDataToDevice(device, &edgey_data);
    acTransferCommDataToDevice(device, &edgez_data);
@@ -1391,7 +1409,15 @@ acGridIntegrate(const Stream stream, const AcReal dt)
    acTransferCommDataToDevice(device, &sideyz_data);
#endif

    acUnpackCommData(device, corner_b0s, &corner_data);
    // acUnpinCommData(device, &corner_data);
    acUnpinCommData(device, &edgex_data);
    acUnpinCommData(device, &edgey_data);
    acUnpinCommData(device, &edgez_data);
    acUnpinCommData(device, &sidexy_data);
    acUnpinCommData(device, &sidexz_data);
    acUnpinCommData(device, &sideyz_data);

    // acUnpackCommData(device, corner_b0s, &corner_data);
    acUnpackCommData(device, edgex_b0s, &edgex_data);
    acUnpackCommData(device, edgey_b0s, &edgey_data);
    acUnpackCommData(device, edgez_b0s, &edgez_data);
@@ -1401,7 +1427,7 @@ acGridIntegrate(const Stream stream, const AcReal dt)
    //////////// OUTER INTEGRATION //////////////

    // Wait for unpacking
    acSyncCommData(corner_data);
    // acSyncCommData(corner_data);
    acSyncCommData(edgex_data);
    acSyncCommData(edgey_data);
    acSyncCommData(edgez_data);
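To make the ordering above easier to follow, here is a condensed sketch of the per-segment halo-exchange pipeline that the calls in acGridIntegrate pass through. It is illustrative only: exchange_segment is not a function in this commit, and the Device, CommData, and int3 types are assumed to come from Astaroth's internal headers; the call signatures simply mirror the ones shown above.

// Illustrative sketch (not part of the commit): the stages every CommData
// segment (corners, edges, sides) goes through in the code above, written
// out for a single segment instead of being interleaved across all of them.
static void
exchange_segment(const Device device, const int3* a0s, const int3* b0s, CommData* data)
{
    acPackCommData(device, a0s, data);          // gather halo cells into send buffers
#if MPI_GPUDIRECT_DISABLED
    acTransferCommDataToHost(device, data);     // stage through pinned host memory
#endif
    acTransferCommData(device, a0s, b0s, data); // post the MPI sends/receives
    acTransferCommDataWait(*data);              // wait for the MPI requests
#if MPI_GPUDIRECT_DISABLED
    acTransferCommDataToDevice(device, data);   // copy received halos back to the GPU
#endif
    acUnpinCommData(device, data);              // release the pinned staging buffers
    acUnpackCommData(device, b0s, data);        // scatter received halos into the ghost zone
    acSyncCommData(*data);                      // wait for the unpacking kernels
}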
@@ -1463,37 +1489,19 @@ acGridPeriodicBoundconds(const Stream stream)
    CommData sideyz_data = grid.sideyz_data;

    // Corners
    const int3 corner_a0s[] = {
        (int3){NGHOST, NGHOST, NGHOST}, //
        (int3){nn.x, NGHOST, NGHOST},   //
        (int3){NGHOST, nn.y, NGHOST},   //
        (int3){nn.x, nn.y, NGHOST},     //

        (int3){NGHOST, NGHOST, nn.z}, //
        (int3){nn.x, NGHOST, nn.z},   //
        (int3){NGHOST, nn.y, nn.z},   //
        (int3){nn.x, nn.y, nn.z},
    };
    const int3 corner_b0s[] = {
        (int3){0, 0, 0},
        (int3){NGHOST + nn.x, 0, 0},
        (int3){0, NGHOST + nn.y, 0},
        (int3){NGHOST + nn.x, NGHOST + nn.y, 0},

        (int3){0, 0, NGHOST + nn.z},
        (int3){NGHOST + nn.x, 0, NGHOST + nn.z},
        (int3){0, NGHOST + nn.y, NGHOST + nn.z},
        (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST + nn.z},
    };

    // Edges X
    const int3 edgex_a0s[] = {
        (int3){NGHOST, NGHOST, NGHOST}, //
        (int3){NGHOST, nn.y, NGHOST},   //

        (int3){NGHOST, NGHOST, nn.z}, //
        (int3){NGHOST, nn.y, nn.z},   //
    };
    const int3 edgex_b0s[] = {
        (int3){NGHOST, 0, 0},
        (int3){NGHOST, NGHOST + nn.y, 0},
@@ -1503,13 +1511,6 @@ acGridPeriodicBoundconds(const Stream stream)
    };

    // Edges Y
    const int3 edgey_a0s[] = {
        (int3){NGHOST, NGHOST, NGHOST}, //
        (int3){nn.x, NGHOST, NGHOST},   //

        (int3){NGHOST, NGHOST, nn.z}, //
        (int3){nn.x, NGHOST, nn.z},   //
    };
    const int3 edgey_b0s[] = {
        (int3){0, NGHOST, 0},
        (int3){NGHOST + nn.x, NGHOST, 0},
@@ -1519,13 +1520,6 @@ acGridPeriodicBoundconds(const Stream stream)
    };

    // Edges Z
    const int3 edgez_a0s[] = {
        (int3){NGHOST, NGHOST, NGHOST}, //
        (int3){nn.x, NGHOST, NGHOST},   //

        (int3){NGHOST, nn.y, NGHOST}, //
        (int3){nn.x, nn.y, NGHOST},   //
    };
    const int3 edgez_b0s[] = {
        (int3){0, 0, NGHOST},
        (int3){NGHOST + nn.x, 0, NGHOST},
@@ -1535,42 +1529,32 @@ acGridPeriodicBoundconds(const Stream stream)
    };

    // Sides XY
    const int3 sidexy_a0s[] = {
        (int3){NGHOST, NGHOST, NGHOST}, //
        (int3){NGHOST, NGHOST, nn.z},   //
    };
    const int3 sidexy_b0s[] = {
        (int3){NGHOST, NGHOST, 0},             //
        (int3){NGHOST, NGHOST, NGHOST + nn.z}, //
    };

    // Sides XZ
    const int3 sidexz_a0s[] = {
        (int3){NGHOST, NGHOST, NGHOST}, //
        (int3){NGHOST, nn.y, NGHOST},   //
    };
    const int3 sidexz_b0s[] = {
        (int3){NGHOST, 0, NGHOST},             //
        (int3){NGHOST, NGHOST + nn.y, NGHOST}, //
    };

    // Sides YZ
    const int3 sideyz_a0s[] = {
        (int3){NGHOST, NGHOST, NGHOST}, //
        (int3){nn.x, NGHOST, NGHOST},   //
    };
    const int3 sideyz_b0s[] = {
        (int3){0, NGHOST, NGHOST},             //
        (int3){NGHOST + nn.x, NGHOST, NGHOST}, //
    };
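    // Note (inferred from the index arrays above, not text from the commit):
    // each *_a0s[i] appears to be the origin of a halo segment inside the local
    // computational domain, while the matching *_b0s[i] is the origin of the
    // ghost-zone slot that segment is unpacked into on the receiving side. For
    // example, the corner packed from (NGHOST, NGHOST, NGHOST) lands in the
    // opposite ghost corner at (0, 0, 0), and the XY side packed from
    // (NGHOST, NGHOST, nn.z) lands at (NGHOST, NGHOST, NGHOST + nn.z).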
    acPackCommData(device, corner_a0s, &corner_data);
    acPackCommData(device, edgex_a0s, &edgex_data);
    acPackCommData(device, edgey_a0s, &edgey_data);
    acPackCommData(device, edgez_a0s, &edgez_data);
    acPackCommData(device, sidexy_a0s, &sidexy_data);
    acPackCommData(device, sidexz_a0s, &sidexz_data);
    acPackCommData(device, sideyz_a0s, &sideyz_data);
    acPackCommData(device, corner_b0s, &corner_data);
    acPackCommData(device, edgex_b0s, &edgex_data);
    acPackCommData(device, edgey_b0s, &edgey_data);
    acPackCommData(device, edgez_b0s, &edgez_data);
    acPackCommData(device, sidexy_b0s, &sidexy_data);
    acPackCommData(device, sidexz_b0s, &sidexz_data);
    acPackCommData(device, sideyz_b0s, &sideyz_data);

    MPI_Barrier(MPI_COMM_WORLD);

#if MPI_GPUDIRECT_DISABLED
    acTransferCommDataToHost(device, &corner_data);
@@ -1582,13 +1566,13 @@ acGridPeriodicBoundconds(const Stream stream)
    acTransferCommDataToHost(device, &sideyz_data);
#endif

    acTransferCommData(device, corner_a0s, corner_b0s, &corner_data);
    acTransferCommData(device, edgex_a0s, edgex_b0s, &edgex_data);
    acTransferCommData(device, edgey_a0s, edgey_b0s, &edgey_data);
    acTransferCommData(device, edgez_a0s, edgez_b0s, &edgez_data);
    acTransferCommData(device, sidexy_a0s, sidexy_b0s, &sidexy_data);
    acTransferCommData(device, sidexz_a0s, sidexz_b0s, &sidexz_data);
    acTransferCommData(device, sideyz_a0s, sideyz_b0s, &sideyz_data);
    acTransferCommData(device, corner_b0s, &corner_data);
    acTransferCommData(device, edgex_b0s, &edgex_data);
    acTransferCommData(device, edgey_b0s, &edgey_data);
    acTransferCommData(device, edgez_b0s, &edgez_data);
    acTransferCommData(device, sidexy_b0s, &sidexy_data);
    acTransferCommData(device, sidexz_b0s, &sidexz_data);
    acTransferCommData(device, sideyz_b0s, &sideyz_data);

    acTransferCommDataWait(corner_data);
    acTransferCommDataWait(edgex_data);
@@ -1608,6 +1592,14 @@ acGridPeriodicBoundconds(const Stream stream)
    acTransferCommDataToDevice(device, &sideyz_data);
#endif

    acUnpinCommData(device, &corner_data);
    acUnpinCommData(device, &edgex_data);
    acUnpinCommData(device, &edgey_data);
    acUnpinCommData(device, &edgez_data);
    acUnpinCommData(device, &sidexy_data);
    acUnpinCommData(device, &sidexz_data);
    acUnpinCommData(device, &sideyz_data);

    acUnpackCommData(device, corner_b0s, &corner_data);
    acUnpackCommData(device, edgex_b0s, &edgex_data);
    acUnpackCommData(device, edgey_b0s, &edgey_data);
@@ -1,9 +1,19 @@
#pragma once
#include "astaroth.h"

#if AC_MPI_ENABLED
#include <mpi.h>
#include <stdbool.h>

#define MPI_GPUDIRECT_DISABLED (0)
#endif // AC_MPI_ENABLED

typedef struct {
    int3 dims;
    AcReal* data;

    AcReal* data_pinned;
    bool pinned = false; // Set if the data was received into pinned memory
} PackedData;

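A minimal sketch of how a PackedData staging pair could be set up, assuming CUDA for the allocations. createPackedData and num_vtxbufs are illustrative names only, not part of this commit or of the Astaroth API, and error handling is omitted.

#include <cuda_runtime.h>

// Hypothetical helper, for illustration only: allocates the device-side packing
// buffer and the pinned host staging buffer that PackedData refers to.
static PackedData
createPackedData(const int3 dims, const size_t num_vtxbufs)
{
    PackedData packed = {};
    packed.dims = dims;

    const size_t bytes = (size_t)dims.x * dims.y * dims.z * num_vtxbufs * sizeof(AcReal);
    cudaMalloc((void**)&packed.data, bytes);            // packing buffer on the GPU
    cudaMallocHost((void**)&packed.data_pinned, bytes); // pinned host staging buffer
    packed.pinned = false; // set later if a message is received into the pinned buffer
    return packed;
}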
typedef struct {