#include "astaroth.h"

#include <string.h>

#include "astaroth_utils.h"
#include "errchk.h"
#include "math_utils.h"
#include "timer_hires.h"

#include "kernels/kernels.h"

#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0]))
#define MPI_GPUDIRECT_DISABLED (0)

AcResult
acDevicePrintInfo(const Device device)
{
    const int device_id = device->id;

    cudaDeviceProp props;
    cudaGetDeviceProperties(&props, device_id);
    printf("--------------------------------------------------\n");
    printf("Device Number: %d\n", device_id);
    const size_t bus_id_max_len = 128;
    char bus_id[bus_id_max_len];
    cudaDeviceGetPCIBusId(bus_id, bus_id_max_len, device_id);
    printf("  PCI bus ID: %s\n", bus_id);
    printf("    Device name: %s\n", props.name);
    printf("    Compute capability: %d.%d\n", props.major, props.minor);

    // Compute
    printf("  Compute\n");
    printf("    Clock rate (GHz): %g\n", props.clockRate / 1e6); // KHz -> GHz
    printf("    Stream processors: %d\n", props.multiProcessorCount);
    printf("    SP to DP flops performance ratio: %d:1\n", props.singleToDoublePrecisionPerfRatio);
    printf(
        "    Compute mode: %d\n",
        (int)props
            .computeMode); // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g7eb25f5413a962faad0956d92bae10d0
    // Memory
    printf("  Global memory\n");
    printf("    Memory Clock Rate (MHz): %d\n", props.memoryClockRate / (1000));
    printf("    Memory Bus Width (bits): %d\n", props.memoryBusWidth);
    printf("    Peak Memory Bandwidth (GiB/s): %f\n",
           2 * (props.memoryClockRate * 1e3) * props.memoryBusWidth / (8. * 1024. * 1024. * 1024.));
    printf("    ECC enabled: %d\n", props.ECCEnabled);

    // Memory usage
    size_t free_bytes, total_bytes;
    cudaMemGetInfo(&free_bytes, &total_bytes);
    const size_t used_bytes = total_bytes - free_bytes;
    printf("    Total global mem: %.2f GiB\n", props.totalGlobalMem / (1024.0 * 1024 * 1024));
    printf("    Gmem used (GiB): %.2f\n", used_bytes / (1024.0 * 1024 * 1024));
    printf("    Gmem memory free (GiB): %.2f\n", free_bytes / (1024.0 * 1024 * 1024));
    printf("    Gmem memory total (GiB): %.2f\n", total_bytes / (1024.0 * 1024 * 1024));
    printf("  Caches\n");
    printf("    Local L1 cache supported: %d\n", props.localL1CacheSupported);
    printf("    Global L1 cache supported: %d\n", props.globalL1CacheSupported);
    printf("    L2 size: %d KiB\n", props.l2CacheSize / (1024));
    // MV: props.totalConstMem and props.sharedMemPerBlock cause assembler error
    // MV: while compiling in TIARA gp cluster. Therefore commeted out.
    //!!    printf("    Total const mem: %ld KiB\n", props.totalConstMem / (1024));
    //!!    printf("    Shared mem per block: %ld KiB\n", props.sharedMemPerBlock / (1024));
    printf("  Other\n");
    printf("    Warp size: %d\n", props.warpSize);
    // printf("    Single to double perf. ratio: %dx\n",
    // props.singleToDoublePrecisionPerfRatio); //Not supported with older CUDA
    // versions
    printf("    Stream priorities supported: %d\n", props.streamPrioritiesSupported);
    printf("--------------------------------------------------\n");

    return AC_SUCCESS;
}

AcResult
acDeviceAutoOptimize(const Device device)
{
    cudaSetDevice(device->id);
    const int3 start = (int3){
        device->local_config.int_params[AC_nx_min],
        device->local_config.int_params[AC_ny_min],
        device->local_config.int_params[AC_nz_min],
    };
    const int3 end = (int3){
        device->local_config.int_params[AC_nx_max],
        device->local_config.int_params[AC_ny_max],
        device->local_config.int_params[AC_nz_max],
    };
    return acKernelAutoOptimizeIntegration(start, end, device->vba);
}

AcResult
acDeviceSynchronizeStream(const Device device, const Stream stream)
{
    cudaSetDevice(device->id);
    if (stream == STREAM_ALL) {
        cudaDeviceSynchronize();
    }
    else {
        cudaStreamSynchronize(device->streams[stream]);
    }
    return AC_SUCCESS;
}

AcResult
acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device_handle)
{
    cudaSetDevice(id);
    // cudaDeviceReset(); // Would be good for safety, but messes stuff up if we want to emulate
    // multiple devices with a single GPU

    // Create Device
    struct device_s* device = (struct device_s*)malloc(sizeof(*device));
    ERRCHK_ALWAYS(device);

    device->id           = id;
    device->local_config = device_config;
    acDevicePrintInfo(device);

    // Check that the code was compiled for the proper GPU architecture
    printf("Trying to run a dummy kernel. If this fails, make sure that your\n"
           "device supports the CUDA architecture you are compiling for.\n"
           "Running dummy kernel... ");
    fflush(stdout);
    acKernelDummy();
    printf("Success!\n");

    // Concurrency
    for (int i = 0; i < NUM_STREAMS; ++i) {
        cudaStreamCreateWithPriority(&device->streams[i], cudaStreamNonBlocking, i);
    }

    // Memory
    // VBA in/out
    const size_t vba_size_bytes = acVertexBufferSizeBytes(device_config);
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
        ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->vba.in[i], vba_size_bytes));
        ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->vba.out[i], vba_size_bytes));
    }
    // VBA Profiles
    const size_t profile_size_bytes = sizeof(AcReal) * max(device_config.int_params[AC_mx],
                                                           max(device_config.int_params[AC_my],
                                                               device_config.int_params[AC_mz]));
    for (int i = 0; i < NUM_SCALARARRAY_HANDLES; ++i) {
        ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->vba.profiles[i], profile_size_bytes));
    }

    // Reductions
    ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->reduce_scratchpad,
                                  acVertexBufferCompdomainSizeBytes(device_config)));
    ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->reduce_result, sizeof(AcReal)));

    // Device constants
    acDeviceLoadDefaultUniforms(device);
    acDeviceLoadMeshInfo(device, device_config);

    printf("Created device %d (%p)\n", device->id, device);
    *device_handle = device;

    // Autoptimize
    if (id == 0) {
        acDeviceAutoOptimize(device);
    }

    return AC_SUCCESS;
}

AcResult
acDeviceDestroy(Device device)
{
    cudaSetDevice(device->id);
    printf("Destroying device %d (%p)\n", device->id, device);
    acDeviceSynchronizeStream(device, STREAM_ALL);

    // Memory
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
        cudaFree(device->vba.in[i]);
        cudaFree(device->vba.out[i]);
    }
    for (int i = 0; i < NUM_SCALARARRAY_HANDLES; ++i) {
        cudaFree(device->vba.profiles[i]);
    }

    cudaFree(device->reduce_scratchpad);
    cudaFree(device->reduce_result);

    // Concurrency
    for (int i = 0; i < NUM_STREAMS; ++i) {
        cudaStreamDestroy(device->streams[i]);
    }

    // Destroy Device
    free(device);
    return AC_SUCCESS;
}

AcResult
acDeviceSwapBuffers(const Device device)
{
    cudaSetDevice(device->id);
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
        AcReal* tmp        = device->vba.in[i];
        device->vba.in[i]  = device->vba.out[i];
        device->vba.out[i] = tmp;
    }
    return AC_SUCCESS;
}

AcResult
acDeviceLoadScalarArray(const Device device, const Stream stream, const ScalarArrayHandle handle,
                        const size_t start, const AcReal* data, const size_t num)
{
    cudaSetDevice(device->id);

    if (handle >= NUM_SCALARARRAY_HANDLES || !NUM_SCALARARRAY_HANDLES)
        return AC_FAILURE;

    ERRCHK((int)(start + num) <= max(device->local_config.int_params[AC_mx],
                                     max(device->local_config.int_params[AC_my],
                                         device->local_config.int_params[AC_mz])));
    ERRCHK_ALWAYS(handle < NUM_SCALARARRAY_HANDLES);
    ERRCHK_CUDA(cudaMemcpyAsync(&device->vba.profiles[handle][start], data, sizeof(data[0]) * num,
                                cudaMemcpyHostToDevice, device->streams[stream]));
    return AC_SUCCESS;
}

AcResult
acDeviceLoadVertexBufferWithOffset(const Device device, const Stream stream, const AcMesh host_mesh,
                                   const VertexBufferHandle vtxbuf_handle, const int3 src,
                                   const int3 dst, const int num_vertices)
{
    cudaSetDevice(device->id);
    const size_t src_idx = acVertexBufferIdx(src.x, src.y, src.z, host_mesh.info);
    const size_t dst_idx = acVertexBufferIdx(dst.x, dst.y, dst.z, device->local_config);

    const AcReal* src_ptr = &host_mesh.vertex_buffer[vtxbuf_handle][src_idx];
    AcReal* dst_ptr       = &device->vba.in[vtxbuf_handle][dst_idx];
    const size_t bytes    = num_vertices * sizeof(src_ptr[0]);

    ERRCHK_CUDA(                                                                                  //
        cudaMemcpyAsync(dst_ptr, src_ptr, bytes, cudaMemcpyHostToDevice, device->streams[stream]) //
    );

    return AC_SUCCESS;
}

AcResult
acDeviceLoadMeshWithOffset(const Device device, const Stream stream, const AcMesh host_mesh,
                           const int3 src, const int3 dst, const int num_vertices)
{
    WARNING("This function is deprecated");
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
        acDeviceLoadVertexBufferWithOffset(device, stream, host_mesh, (VertexBufferHandle)i, src,
                                           dst, num_vertices);
    }
    return AC_SUCCESS;
}

AcResult
acDeviceLoadVertexBuffer(const Device device, const Stream stream, const AcMesh host_mesh,
                         const VertexBufferHandle vtxbuf_handle)
{
    const int3 src            = (int3){0, 0, 0};
    const int3 dst            = src;
    const size_t num_vertices = acVertexBufferSize(device->local_config);
    acDeviceLoadVertexBufferWithOffset(device, stream, host_mesh, vtxbuf_handle, src, dst,
                                       num_vertices);

    return AC_SUCCESS;
}

AcResult
acDeviceLoadMesh(const Device device, const Stream stream, const AcMesh host_mesh)
{
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
        acDeviceLoadVertexBuffer(device, stream, host_mesh, (VertexBufferHandle)i);
    }

    return AC_SUCCESS;
}

AcResult
acDeviceStoreVertexBufferWithOffset(const Device device, const Stream stream,
                                    const VertexBufferHandle vtxbuf_handle, const int3 src,
                                    const int3 dst, const int num_vertices, AcMesh* host_mesh)
{
    cudaSetDevice(device->id);
    const size_t src_idx = acVertexBufferIdx(src.x, src.y, src.z, device->local_config);
    const size_t dst_idx = acVertexBufferIdx(dst.x, dst.y, dst.z, host_mesh->info);

    const AcReal* src_ptr = &device->vba.in[vtxbuf_handle][src_idx];
    AcReal* dst_ptr       = &host_mesh->vertex_buffer[vtxbuf_handle][dst_idx];
    const size_t bytes    = num_vertices * sizeof(src_ptr[0]);

    ERRCHK_CUDA(                                                                                  //
        cudaMemcpyAsync(dst_ptr, src_ptr, bytes, cudaMemcpyDeviceToHost, device->streams[stream]) //
    );

    return AC_SUCCESS;
}

AcResult
acDeviceStoreMeshWithOffset(const Device device, const Stream stream, const int3 src,
                            const int3 dst, const int num_vertices, AcMesh* host_mesh)
{
    WARNING("This function is deprecated");
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
        acDeviceStoreVertexBufferWithOffset(device, stream, (VertexBufferHandle)i, src, dst,
                                            num_vertices, host_mesh);
    }

    return AC_SUCCESS;
}

AcResult
acDeviceStoreVertexBuffer(const Device device, const Stream stream,
                          const VertexBufferHandle vtxbuf_handle, AcMesh* host_mesh)
{
    int3 src                  = (int3){0, 0, 0};
    int3 dst                  = src;
    const size_t num_vertices = acVertexBufferSize(device->local_config);

    acDeviceStoreVertexBufferWithOffset(device, stream, vtxbuf_handle, src, dst, num_vertices,
                                        host_mesh);

    return AC_SUCCESS;
}

AcResult
acDeviceStoreMesh(const Device device, const Stream stream, AcMesh* host_mesh)
{
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
        acDeviceStoreVertexBuffer(device, stream, (VertexBufferHandle)i, host_mesh);
    }

    return AC_SUCCESS;
}

AcResult
acDeviceTransferVertexBufferWithOffset(const Device src_device, const Stream stream,
                                       const VertexBufferHandle vtxbuf_handle, const int3 src,
                                       const int3 dst, const int num_vertices, Device dst_device)
{
    cudaSetDevice(src_device->id);
    const size_t src_idx = acVertexBufferIdx(src.x, src.y, src.z, src_device->local_config);
    const size_t dst_idx = acVertexBufferIdx(dst.x, dst.y, dst.z, dst_device->local_config);

    const AcReal* src_ptr = &src_device->vba.in[vtxbuf_handle][src_idx];
    AcReal* dst_ptr       = &dst_device->vba.in[vtxbuf_handle][dst_idx];
    const size_t bytes    = num_vertices * sizeof(src_ptr[0]);

    ERRCHK_CUDA(cudaMemcpyPeerAsync(dst_ptr, dst_device->id, src_ptr, src_device->id, bytes,
                                    src_device->streams[stream]));
    return AC_SUCCESS;
}

AcResult
acDeviceTransferMeshWithOffset(const Device src_device, const Stream stream, const int3 src,
                               const int3 dst, const int num_vertices, Device dst_device)
{
    WARNING("This function is deprecated");
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
        acDeviceTransferVertexBufferWithOffset(src_device, stream, (VertexBufferHandle)i, src, dst,
                                               num_vertices, dst_device);
    }
    return AC_SUCCESS;
}

AcResult
acDeviceTransferVertexBuffer(const Device src_device, const Stream stream,
                             const VertexBufferHandle vtxbuf_handle, Device dst_device)
{
    int3 src                  = (int3){0, 0, 0};
    int3 dst                  = src;
    const size_t num_vertices = acVertexBufferSize(src_device->local_config);

    acDeviceTransferVertexBufferWithOffset(src_device, stream, vtxbuf_handle, src, dst,
                                           num_vertices, dst_device);
    return AC_SUCCESS;
}

AcResult
acDeviceTransferMesh(const Device src_device, const Stream stream, Device dst_device)
{
    WARNING("This function is deprecated");
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
        acDeviceTransferVertexBuffer(src_device, stream, (VertexBufferHandle)i, dst_device);
    }
    return AC_SUCCESS;
}

AcResult
acDeviceIntegrateSubstep(const Device device, const Stream stream, const int step_number,
                         const int3 start, const int3 end, const AcReal dt)
{
    cudaSetDevice(device->id);
    acDeviceLoadScalarUniform(device, stream, AC_dt, dt);
    return acKernelIntegrateSubstep(device->streams[stream], step_number, start, end, device->vba);
}

AcResult
acDevicePeriodicBoundcondStep(const Device device, const Stream stream,
                              const VertexBufferHandle vtxbuf_handle, const int3 start,
                              const int3 end)
{
    cudaSetDevice(device->id);
    return acKernelPeriodicBoundconds(device->streams[stream], start, end,
                                      device->vba.in[vtxbuf_handle]);
}

AcResult
acDevicePeriodicBoundconds(const Device device, const Stream stream, const int3 start,
                           const int3 end)
{
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
        acDevicePeriodicBoundcondStep(device, stream, (VertexBufferHandle)i, start, end);
    }
    return AC_SUCCESS;
}

AcResult
acDeviceReduceScal(const Device device, const Stream stream, const ReductionType rtype,
                   const VertexBufferHandle vtxbuf_handle, AcReal* result)
{
    cudaSetDevice(device->id);

    const int3 start = (int3){device->local_config.int_params[AC_nx_min],
                              device->local_config.int_params[AC_ny_min],
                              device->local_config.int_params[AC_nz_min]};

    const int3 end = (int3){device->local_config.int_params[AC_nx_max],
                            device->local_config.int_params[AC_ny_max],
                            device->local_config.int_params[AC_nz_max]};

    *result = acKernelReduceScal(device->streams[stream], rtype, start, end,
                                 device->vba.in[vtxbuf_handle], device->reduce_scratchpad,
                                 device->reduce_result);
    return AC_SUCCESS;
}

AcResult
acDeviceReduceVec(const Device device, const Stream stream, const ReductionType rtype,
                  const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1,
                  const VertexBufferHandle vtxbuf2, AcReal* result)
{
    cudaSetDevice(device->id);

    const int3 start = (int3){device->local_config.int_params[AC_nx_min],
                              device->local_config.int_params[AC_ny_min],
                              device->local_config.int_params[AC_nz_min]};

    const int3 end = (int3){device->local_config.int_params[AC_nx_max],
                            device->local_config.int_params[AC_ny_max],
                            device->local_config.int_params[AC_nz_max]};

    *result = acKernelReduceVec(device->streams[stream], rtype, start, end, device->vba.in[vtxbuf0],
                                device->vba.in[vtxbuf1], device->vba.in[vtxbuf2],
                                device->reduce_scratchpad, device->reduce_result);
    return AC_SUCCESS;
}

#if AC_MPI_ENABLED
#include <mpi.h>

static int
mod(const int a, const int b)
{
    const int r = a % b;
    return r < 0 ? r + b : r;
}

static int
getPid(const int3 pid, const int3 decomposition)
{
    return mod(pid.x, decomposition.x) +                   //
           mod(pid.y, decomposition.y) * decomposition.x + //
           mod(pid.z, decomposition.z) * decomposition.x * decomposition.y;
}

static int3
getPid3D(const int pid, const int3 decomposition)
{
    const int3 pid3d = (int3){
        mod(pid, decomposition.x),
        mod(pid / decomposition.x, decomposition.y),
        (pid / (decomposition.x * decomposition.y)),
    };
    return pid3d;
}

static int3
decompose(const int target)
{
    int decomposition[] = {1, 1, 1};

    int axis = 0;
    while (decomposition[0] * decomposition[1] * decomposition[2] < target) {
        ++decomposition[axis];
        axis = (axis + 1) % 3;
    }

    const int found = decomposition[0] * decomposition[1] * decomposition[2];
    if (found != target) {
        fprintf(stderr, "Invalid number of processes! Cannot decompose the problem domain!\n");
        fprintf(stderr, "Target nprocs: %d. Next allowed: %d\n", target, found);
        ERROR("Invalid nprocs");
        return (int3){-1, -1, -1};
    }
    else {
        return (int3){decomposition[0], decomposition[1], decomposition[2]};
    }
}

static PackedData
acCreatePackedData(const int3 dims)
{
    PackedData data = {};

    data.dims = dims;

    const size_t bytes = dims.x * dims.y * dims.z * sizeof(data.data[0]) * NUM_VTXBUF_HANDLES;
    ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&data.data, bytes));

    return data;
}

static AcResult
acDestroyPackedData(PackedData* data)
{
    data->dims = (int3){-1, -1, -1};
    cudaFree(data->data);
    data->data = NULL;

    return AC_SUCCESS;
}

#if MPI_GPUDIRECT_DISABLED
static PackedData
acCreatePackedDataHost(const int3 dims)
{
    PackedData data = {};

    data.dims = dims;

    const size_t bytes = dims.x * dims.y * dims.z * sizeof(data.data[0]) * NUM_VTXBUF_HANDLES;
    data.data          = (AcReal*)malloc(bytes);
    ERRCHK_ALWAYS(data.data);

    return data;
}

static void
acTransferPackedDataToHost(const Device device, const cudaStream_t stream, const PackedData ddata,
                           PackedData* hdata)
{
    cudaSetDevice(device->id);

    const size_t bytes = ddata.dims.x * ddata.dims.y * ddata.dims.z * sizeof(ddata.data[0]) *
                         NUM_VTXBUF_HANDLES;
    ERRCHK_CUDA(cudaMemcpyAsync(hdata->data, ddata.data, bytes, cudaMemcpyDeviceToHost, stream));
}

static void
acTransferPackedDataToDevice(const Device device, const cudaStream_t stream, const PackedData hdata,
                             PackedData* ddata)
{
    cudaSetDevice(device->id);

    const size_t bytes = hdata.dims.x * hdata.dims.y * hdata.dims.z * sizeof(hdata.data[0]) *
                         NUM_VTXBUF_HANDLES;
    ERRCHK_CUDA(cudaMemcpyAsync(ddata->data, hdata.data, bytes, cudaMemcpyHostToDevice, stream));
}

static AcResult
acDestroyPackedDataHost(PackedData* data)
{
    data->dims = (int3){-1, -1, -1};
    free(data->data);
    data->data = NULL;

    return AC_SUCCESS;
}
#endif // MPI_GPUDIRECT_DISABLED

// TODO: do with packed data
static AcResult
acDeviceDistributeMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* dst)
{
    MPI_Barrier(MPI_COMM_WORLD);
    printf("Distributing mesh...\n");
    fflush(stdout);

    MPI_Datatype datatype = MPI_FLOAT;
    if (sizeof(AcReal) == 8)
        datatype = MPI_DOUBLE;

    int pid, nprocs;
    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    ERRCHK_ALWAYS(dst);

    // Submesh nn
    const int3 nn = (int3){
        dst->info.int_params[AC_nx],
        dst->info.int_params[AC_ny],
        dst->info.int_params[AC_nz],
    };

    // Send to self
    if (pid == 0) {
        for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {
            // For pencils
            for (int k = NGHOST; k < NGHOST + nn.z; ++k) {
                for (int j = NGHOST; j < NGHOST + nn.y; ++j) {
                    const int i       = NGHOST;
                    const int count   = nn.x;
                    const int src_idx = acVertexBufferIdx(i, j, k, src.info);
                    const int dst_idx = acVertexBufferIdx(i, j, k, dst->info);
                    memcpy(&dst->vertex_buffer[vtxbuf][dst_idx], //
                           &src.vertex_buffer[vtxbuf][src_idx],  //
                           count * sizeof(src.vertex_buffer[i][0]));
                }
            }
        }
    }

    for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {
        // For pencils
        for (int k = NGHOST; k < NGHOST + nn.z; ++k) {
            for (int j = NGHOST; j < NGHOST + nn.y; ++j) {
                const int i     = NGHOST;
                const int count = nn.x;

                if (pid != 0) {
                    const int dst_idx = acVertexBufferIdx(i, j, k, dst->info);
                    // Recv
                    MPI_Status status;
                    MPI_Recv(&dst->vertex_buffer[vtxbuf][dst_idx], count, datatype, 0, 0,
                             MPI_COMM_WORLD, &status);
                }
                else {
                    for (int tgt_pid = 1; tgt_pid < nprocs; ++tgt_pid) {
                        const int3 tgt_pid3d = getPid3D(tgt_pid, decomposition);
                        const int src_idx    = acVertexBufferIdx(i + tgt_pid3d.x * nn.x, //
                                                              j + tgt_pid3d.y * nn.y, //
                                                              k + tgt_pid3d.z * nn.z, //
                                                              src.info);

                        // Send
                        MPI_Send(&src.vertex_buffer[vtxbuf][src_idx], count, datatype, tgt_pid, 0,
                                 MPI_COMM_WORLD);
                    }
                }
            }
        }
    }
    return AC_SUCCESS;
}

// TODO: do with packed data
static AcResult
acDeviceGatherMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* dst)
{
    MPI_Barrier(MPI_COMM_WORLD);
    printf("Gathering mesh...\n");
    fflush(stdout);

    MPI_Datatype datatype = MPI_FLOAT;
    if (sizeof(AcReal) == 8)
        datatype = MPI_DOUBLE;

    int pid, nprocs;
    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    if (pid == 0)
        ERRCHK_ALWAYS(dst);

    // Submesh nn
    const int3 nn = (int3){
        src.info.int_params[AC_nx],
        src.info.int_params[AC_ny],
        src.info.int_params[AC_nz],
    };

    // Submesh mm
    const int3 mm = (int3){
        src.info.int_params[AC_mx],
        src.info.int_params[AC_my],
        src.info.int_params[AC_mz],
    };

    // Send to self
    if (pid == 0) {
        for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {
            // For pencils
            for (int k = 0; k < mm.z; ++k) {
                for (int j = 0; j < mm.y; ++j) {
                    const int i       = 0;
                    const int count   = mm.x;
                    const int src_idx = acVertexBufferIdx(i, j, k, src.info);
                    const int dst_idx = acVertexBufferIdx(i, j, k, dst->info);
                    memcpy(&dst->vertex_buffer[vtxbuf][dst_idx], //
                           &src.vertex_buffer[vtxbuf][src_idx],  //
                           count * sizeof(src.vertex_buffer[i][0]));
                }
            }
        }
    }

    for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {
        // For pencils
        for (int k = 0; k < mm.z; ++k) {
            for (int j = 0; j < mm.y; ++j) {
                const int i     = 0;
                const int count = mm.x;

                if (pid != 0) {
                    // Send
                    const int src_idx = acVertexBufferIdx(i, j, k, src.info);
                    MPI_Send(&src.vertex_buffer[vtxbuf][src_idx], count, datatype, 0, 0,
                             MPI_COMM_WORLD);
                }
                else {
                    for (int tgt_pid = 1; tgt_pid < nprocs; ++tgt_pid) {
                        const int3 tgt_pid3d = getPid3D(tgt_pid, decomposition);
                        const int dst_idx    = acVertexBufferIdx(i + tgt_pid3d.x * nn.x, //
                                                              j + tgt_pid3d.y * nn.y, //
                                                              k + tgt_pid3d.z * nn.z, //
                                                              dst->info);

                        // Recv
                        MPI_Status status;
                        MPI_Recv(&dst->vertex_buffer[vtxbuf][dst_idx], count, datatype, tgt_pid, 0,
                                 MPI_COMM_WORLD, &status);
                    }
                }
            }
        }
    }
    return AC_SUCCESS;
}

typedef struct {
    PackedData* srcs;
    PackedData* dsts;
#if MPI_GPUDIRECT_DISABLED
    PackedData* srcs_host;
    PackedData* dsts_host;
#endif
    int3 dims;
    size_t count;

    cudaStream_t* streams;
    MPI_Request* send_reqs;
    MPI_Request* recv_reqs;
} CommData;

static CommData
acCreateCommData(const Device device, const int3 dims, const size_t count)
{
    cudaSetDevice(device->id);

    CommData data = {};

    data.srcs  = (PackedData*)malloc(count * sizeof(PackedData));
    data.dsts  = (PackedData*)malloc(count * sizeof(PackedData));
    data.dims  = dims;
    data.count = count;

    data.streams   = (cudaStream_t*)malloc(count * sizeof(cudaStream_t));
    data.send_reqs = (MPI_Request*)malloc(count * sizeof(MPI_Request));
    data.recv_reqs = (MPI_Request*)malloc(count * sizeof(MPI_Request));

    ERRCHK_ALWAYS(data.srcs);
    ERRCHK_ALWAYS(data.dsts);
    ERRCHK_ALWAYS(data.send_reqs);
    ERRCHK_ALWAYS(data.recv_reqs);

#if MPI_GPUDIRECT_DISABLED
    data.srcs_host = (PackedData*)malloc(count * sizeof(PackedData));
    data.dsts_host = (PackedData*)malloc(count * sizeof(PackedData));
    ERRCHK_ALWAYS(data.srcs_host);
    ERRCHK_ALWAYS(data.dsts_host);
#endif

    for (size_t i = 0; i < count; ++i) {
        data.srcs[i] = acCreatePackedData(dims);
        data.dsts[i] = acCreatePackedData(dims);

#if MPI_GPUDIRECT_DISABLED
        data.srcs_host[i] = acCreatePackedDataHost(dims);
        data.dsts_host[i] = acCreatePackedDataHost(dims);
#endif

        cudaStreamCreate(&data.streams[i]);
    }

    return data;
}

static void
acDestroyCommData(const Device device, CommData* data)
{
    cudaSetDevice(device->id);

    for (size_t i = 0; i < data->count; ++i) {
        acDestroyPackedData(&data->srcs[i]);
        acDestroyPackedData(&data->dsts[i]);

#if MPI_GPUDIRECT_DISABLED
        acDestroyPackedDataHost(&data->srcs_host[i]);
        acDestroyPackedDataHost(&data->dsts_host[i]);
#endif

        cudaStreamDestroy(data->streams[i]);
    }

    free(data->srcs);
    free(data->dsts);

#if MPI_GPUDIRECT_DISABLED
    free(data->srcs_host);
    free(data->dsts_host);
#endif

    free(data->streams);
    free(data->send_reqs);
    free(data->recv_reqs);

    data->count = -1;
    data->dims  = (int3){-1, -1, -1};
}

static void
acPackCommData(const Device device, const int3* a0s, CommData* data)
{
    cudaSetDevice(device->id);
    for (size_t i = 0; i < data->count; ++i)
        acKernelPackData(data->streams[i], device->vba, a0s[i], data->srcs[i]);
}

static void
acUnpackCommData(const Device device, const int3* b0s, CommData* data)
{
    cudaSetDevice(device->id);

    for (size_t i = 0; i < data->count; ++i)
        acKernelUnpackData(data->streams[i], data->dsts[i], b0s[i], device->vba);
}

#if MPI_GPUDIRECT_DISABLED
static void
acTransferCommDataToHost(const Device device, CommData* data)
{
    cudaSetDevice(device->id);
    for (size_t i = 0; i < data->count; ++i)
        acTransferPackedDataToHost(device, data->streams[i], data->srcs[i], &data->srcs_host[i]);
}

static void
acTransferCommDataToDevice(const Device device, CommData* data)
{
    cudaSetDevice(device->id);
    for (size_t i = 0; i < data->count; ++i)
        acTransferPackedDataToDevice(device, data->streams[i], data->dsts_host[i], &data->dsts[i]);
}
#endif

static AcResult
acTransferCommData(const Device device, //
                   const int3* a0s,     // Src idx inside comp. domain
                   const int3* b0s,     // Dst idx inside bound zone
                   CommData* data)
{
    cudaSetDevice(device->id);

    MPI_Datatype datatype = MPI_FLOAT;
    if (sizeof(AcReal) == 8)
        datatype = MPI_DOUBLE;

    int nprocs, pid;
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
    const int3 decomp = decompose(nprocs);

    const int3 nn = (int3){
        device->local_config.int_params[AC_nx],
        device->local_config.int_params[AC_ny],
        device->local_config.int_params[AC_nz],
    };

    const int3 dims         = data->dims;
    const size_t blockcount = data->count;

    for (int k = -1; k <= 1; ++k) {
        for (int j = -1; j <= 1; ++j) {
            for (int i = -1; i <= 1; ++i) {
                if (i == 0 && j == 0 && k == 0)
                    continue;

                for (size_t a_idx = 0; a_idx < blockcount; ++a_idx) {
                    for (size_t b_idx = 0; b_idx < blockcount; ++b_idx) {
                        const int3 neighbor = (int3){i, j, k};

                        const int3 a0 = a0s[a_idx];
                        // const int3 a1 = a0 + dims;

                        const int3 b0 = a0 - neighbor * nn;
                        // const int3 b1 = a1 - neighbor * nn;

                        if (b0s[b_idx].x == b0.x && b0s[b_idx].y == b0.y && b0s[b_idx].z == b0.z) {

                            const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES;

#if MPI_GPUDIRECT_DISABLED
                            PackedData dst = data->dsts_host[b_idx];
#else
                            PackedData dst = data->dsts[b_idx];
#endif

                            const int3 pid3d = getPid3D(pid, decomp);
                            MPI_Irecv(dst.data, count, datatype, getPid(pid3d - neighbor, decomp),
                                      b_idx, MPI_COMM_WORLD, &data->recv_reqs[b_idx]);
                        }
                    }
                }
            }
        }
    }

    for (int k = -1; k <= 1; ++k) {
        for (int j = -1; j <= 1; ++j) {
            for (int i = -1; i <= 1; ++i) {
                if (i == 0 && j == 0 && k == 0)
                    continue;

                for (size_t a_idx = 0; a_idx < blockcount; ++a_idx) {
                    for (size_t b_idx = 0; b_idx < blockcount; ++b_idx) {
                        const int3 neighbor = (int3){i, j, k};

                        const int3 a0 = a0s[a_idx];
                        // const int3 a1 = a0 + dims;

                        const int3 b0 = a0 - neighbor * nn;
                        // const int3 b1 = a1 - neighbor * nn;

                        if (b0s[b_idx].x == b0.x && b0s[b_idx].y == b0.y && b0s[b_idx].z == b0.z) {

                            const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES;

#if MPI_GPUDIRECT_DISABLED
                            PackedData src = data->srcs_host[a_idx];
#else
                            PackedData src = data->srcs[a_idx];
#endif

                            const int3 pid3d = getPid3D(pid, decomp);

                            cudaStreamSynchronize(data->streams[a_idx]);
                            MPI_Isend(src.data, count, datatype, getPid(pid3d + neighbor, decomp),
                                      b_idx, MPI_COMM_WORLD, &data->send_reqs[b_idx]);
                        }
                    }
                }
            }
        }
    }

    return AC_SUCCESS;
}

static void
acTransferCommDataWait(const CommData data)
{
    for (size_t i = 0; i < data.count; ++i) {
        MPI_Wait(&data.send_reqs[i], MPI_STATUS_IGNORE);
        MPI_Wait(&data.recv_reqs[i], MPI_STATUS_IGNORE);
    }
}

static AcResult
acDeviceIntegrateMPI(const Device device, const AcReal dt)
{
    // Configure
    const int3 nn = (int3){
        device->local_config.int_params[AC_nx],
        device->local_config.int_params[AC_ny],
        device->local_config.int_params[AC_nz],
    };

    // Corners
    const int3 corner_a0s[] = {
        (int3){NGHOST, NGHOST, NGHOST}, //
        (int3){nn.x, NGHOST, NGHOST},   //
        (int3){NGHOST, nn.y, NGHOST},   //
        (int3){nn.x, nn.y, NGHOST},     //

        (int3){NGHOST, NGHOST, nn.z}, //
        (int3){nn.x, NGHOST, nn.z},   //
        (int3){NGHOST, nn.y, nn.z},   //
        (int3){nn.x, nn.y, nn.z},
    };
    const int3 corner_b0s[] = {
        (int3){0, 0, 0},
        (int3){NGHOST + nn.x, 0, 0},
        (int3){0, NGHOST + nn.y, 0},
        (int3){NGHOST + nn.x, NGHOST + nn.y, 0},

        (int3){0, 0, NGHOST + nn.z},
        (int3){NGHOST + nn.x, 0, NGHOST + nn.z},
        (int3){0, NGHOST + nn.y, NGHOST + nn.z},
        (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST + nn.z},
    };
    const int3 corner_dims = (int3){NGHOST, NGHOST, NGHOST};

    // Edges X
    const int3 edgex_a0s[] = {
        (int3){NGHOST, NGHOST, NGHOST}, //
        (int3){NGHOST, nn.y, NGHOST},   //

        (int3){NGHOST, NGHOST, nn.z}, //
        (int3){NGHOST, nn.y, nn.z},   //
    };
    const int3 edgex_b0s[] = {
        (int3){NGHOST, 0, 0},
        (int3){NGHOST, NGHOST + nn.y, 0},

        (int3){NGHOST, 0, NGHOST + nn.z},
        (int3){NGHOST, NGHOST + nn.y, NGHOST + nn.z},
    };
    const int3 edgex_dims = (int3){nn.x, NGHOST, NGHOST};

    // Edges Y
    const int3 edgey_a0s[] = {
        (int3){NGHOST, NGHOST, NGHOST}, //
        (int3){nn.x, NGHOST, NGHOST},   //

        (int3){NGHOST, NGHOST, nn.z}, //
        (int3){nn.x, NGHOST, nn.z},   //
    };
    const int3 edgey_b0s[] = {
        (int3){0, NGHOST, 0},
        (int3){NGHOST + nn.x, NGHOST, 0},

        (int3){0, NGHOST, NGHOST + nn.z},
        (int3){NGHOST + nn.x, NGHOST, NGHOST + nn.z},
    };
    const int3 edgey_dims = (int3){NGHOST, nn.y, NGHOST};

    // Edges Z
    const int3 edgez_a0s[] = {
        (int3){NGHOST, NGHOST, NGHOST}, //
        (int3){nn.x, NGHOST, NGHOST},   //

        (int3){NGHOST, nn.y, NGHOST}, //
        (int3){nn.x, nn.y, NGHOST},   //
    };
    const int3 edgez_b0s[] = {
        (int3){0, 0, NGHOST},
        (int3){NGHOST + nn.x, 0, NGHOST},

        (int3){0, NGHOST + nn.y, NGHOST},
        (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST},
    };

    const int3 edgez_dims = (int3){NGHOST, NGHOST, nn.z};

    // Sides XY
    const int3 sidexy_a0s[] = {
        (int3){NGHOST, NGHOST, NGHOST}, //
        (int3){NGHOST, NGHOST, nn.z},   //
    };
    const int3 sidexy_b0s[] = {
        (int3){NGHOST, NGHOST, 0},             //
        (int3){NGHOST, NGHOST, NGHOST + nn.z}, //
    };
    const int3 sidexy_dims = (int3){nn.x, nn.y, NGHOST};

    // Sides XZ
    const int3 sidexz_a0s[] = {
        (int3){NGHOST, NGHOST, NGHOST}, //
        (int3){NGHOST, nn.y, NGHOST},   //
    };
    const int3 sidexz_b0s[] = {
        (int3){NGHOST, 0, NGHOST},             //
        (int3){NGHOST, NGHOST + nn.y, NGHOST}, //
    };
    const int3 sidexz_dims = (int3){nn.x, NGHOST, nn.z};

    // Sides YZ
    const int3 sideyz_a0s[] = {
        (int3){NGHOST, NGHOST, NGHOST}, //
        (int3){nn.x, NGHOST, NGHOST},   //
    };
    const int3 sideyz_b0s[] = {
        (int3){0, NGHOST, NGHOST},             //
        (int3){NGHOST + nn.x, NGHOST, NGHOST}, //
    };
    const int3 sideyz_dims = (int3){NGHOST, nn.y, nn.z};

    // Alloc
    CommData corner_data = acCreateCommData(device, corner_dims, ARRAY_SIZE(corner_a0s));
    CommData edgex_data  = acCreateCommData(device, edgex_dims, ARRAY_SIZE(edgex_a0s));
    CommData edgey_data  = acCreateCommData(device, edgey_dims, ARRAY_SIZE(edgey_a0s));
    CommData edgez_data  = acCreateCommData(device, edgez_dims, ARRAY_SIZE(edgez_a0s));
    CommData sidexy_data = acCreateCommData(device, sidexy_dims, ARRAY_SIZE(sidexy_a0s));
    CommData sidexz_data = acCreateCommData(device, sidexz_dims, ARRAY_SIZE(sidexz_a0s));
    CommData sideyz_data = acCreateCommData(device, sideyz_dims, ARRAY_SIZE(sideyz_a0s));

    // Warmup
    for (int i = 0; i < 10; ++i) {
        acPackCommData(device, corner_a0s, &corner_data);
        acPackCommData(device, edgex_a0s, &edgex_data);
        acPackCommData(device, edgey_a0s, &edgey_data);
        acPackCommData(device, edgez_a0s, &edgez_data);
        acPackCommData(device, sidexy_a0s, &sidexy_data);
        acPackCommData(device, sidexz_a0s, &sidexz_data);
        acPackCommData(device, sideyz_a0s, &sideyz_data);

#if MPI_GPUDIRECT_DISABLED
        acTransferCommDataToHost(device, &corner_data);
        acTransferCommDataToHost(device, &edgex_data);
        acTransferCommDataToHost(device, &edgey_data);
        acTransferCommDataToHost(device, &edgez_data);
        acTransferCommDataToHost(device, &sidexy_data);
        acTransferCommDataToHost(device, &sidexz_data);
        acTransferCommDataToHost(device, &sideyz_data);
#endif

        acTransferCommData(device, corner_a0s, corner_b0s, &corner_data);
        acTransferCommData(device, edgex_a0s, edgex_b0s, &edgex_data);
        acTransferCommData(device, edgey_a0s, edgey_b0s, &edgey_data);
        acTransferCommData(device, edgez_a0s, edgez_b0s, &edgez_data);
        acTransferCommData(device, sidexy_a0s, sidexy_b0s, &sidexy_data);
        acTransferCommData(device, sidexz_a0s, sidexz_b0s, &sidexz_data);
        acTransferCommData(device, sideyz_a0s, sideyz_b0s, &sideyz_data);

        acTransferCommDataWait(corner_data);
        acTransferCommDataWait(edgex_data);
        acTransferCommDataWait(edgey_data);
        acTransferCommDataWait(edgez_data);
        acTransferCommDataWait(sidexy_data);
        acTransferCommDataWait(sidexz_data);
        acTransferCommDataWait(sideyz_data);

#if MPI_GPUDIRECT_DISABLED
        acTransferCommDataToDevice(device, &corner_data);
        acTransferCommDataToDevice(device, &edgex_data);
        acTransferCommDataToDevice(device, &edgey_data);
        acTransferCommDataToDevice(device, &edgez_data);
        acTransferCommDataToDevice(device, &sidexy_data);
        acTransferCommDataToDevice(device, &sidexz_data);
        acTransferCommDataToDevice(device, &sideyz_data);
#endif

        acUnpackCommData(device, corner_b0s, &corner_data);
        acUnpackCommData(device, edgex_b0s, &edgex_data);
        acUnpackCommData(device, edgey_b0s, &edgey_data);
        acUnpackCommData(device, edgez_b0s, &edgez_data);
        acUnpackCommData(device, sidexy_b0s, &sidexy_data);
        acUnpackCommData(device, sidexz_b0s, &sidexz_data);
        acUnpackCommData(device, sideyz_b0s, &sideyz_data);
    }

    // Communicate
    Timer ttot;
    cudaDeviceSynchronize();
    MPI_Barrier(MPI_COMM_WORLD);
    timer_reset(&ttot);
    MPI_Barrier(MPI_COMM_WORLD);

    const int num_iterations = 1;
    for (int i = 0; i < num_iterations; ++i) {
        for (int isubstep = 0; isubstep < 3; ++isubstep) {
            acPackCommData(device, corner_a0s, &corner_data);
            acPackCommData(device, edgex_a0s, &edgex_data);
            acPackCommData(device, edgey_a0s, &edgey_data);
            acPackCommData(device, edgez_a0s, &edgez_data);
            acPackCommData(device, sidexy_a0s, &sidexy_data);
            acPackCommData(device, sidexz_a0s, &sidexz_data);
            acPackCommData(device, sideyz_a0s, &sideyz_data);

#if MPI_GPUDIRECT_DISABLED
            acTransferCommDataToHost(device, &corner_data);
            acTransferCommDataToHost(device, &edgex_data);
            acTransferCommDataToHost(device, &edgey_data);
            acTransferCommDataToHost(device, &edgez_data);
            acTransferCommDataToHost(device, &sidexy_data);
            acTransferCommDataToHost(device, &sidexz_data);
            acTransferCommDataToHost(device, &sideyz_data);
#endif

            //////////// INNER INTEGRATION //////////////
            {
                const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
                const int3 m2 = nn;
                acDeviceIntegrateSubstep(device, STREAM_16, isubstep, m1, m2, dt);
            }
            ////////////////////////////////////////////

            acTransferCommData(device, corner_a0s, corner_b0s, &corner_data);
            acTransferCommData(device, edgex_a0s, edgex_b0s, &edgex_data);
            acTransferCommData(device, edgey_a0s, edgey_b0s, &edgey_data);
            acTransferCommData(device, edgez_a0s, edgez_b0s, &edgez_data);
            acTransferCommData(device, sidexy_a0s, sidexy_b0s, &sidexy_data);
            acTransferCommData(device, sidexz_a0s, sidexz_b0s, &sidexz_data);
            acTransferCommData(device, sideyz_a0s, sideyz_b0s, &sideyz_data);

            acTransferCommDataWait(corner_data);
            acTransferCommDataWait(edgex_data);
            acTransferCommDataWait(edgey_data);
            acTransferCommDataWait(edgez_data);
            acTransferCommDataWait(sidexy_data);
            acTransferCommDataWait(sidexz_data);
            acTransferCommDataWait(sideyz_data);

#if MPI_GPUDIRECT_DISABLED
            acTransferCommDataToDevice(device, &corner_data);
            acTransferCommDataToDevice(device, &edgex_data);
            acTransferCommDataToDevice(device, &edgey_data);
            acTransferCommDataToDevice(device, &edgez_data);
            acTransferCommDataToDevice(device, &sidexy_data);
            acTransferCommDataToDevice(device, &sidexz_data);
            acTransferCommDataToDevice(device, &sideyz_data);
#endif

            acUnpackCommData(device, corner_b0s, &corner_data);
            acUnpackCommData(device, edgex_b0s, &edgex_data);
            acUnpackCommData(device, edgey_b0s, &edgey_data);
            acUnpackCommData(device, edgez_b0s, &edgez_data);
            acUnpackCommData(device, sidexy_b0s, &sidexy_data);
            acUnpackCommData(device, sidexz_b0s, &sidexz_data);
            acUnpackCommData(device, sideyz_b0s, &sideyz_data);
            //////////// OUTER INTEGRATION //////////////
            acDeviceSynchronizeStream(device, STREAM_ALL); // Wait for unpacking
            {                                              // Front
                const int3 m1 = (int3){NGHOST, NGHOST, NGHOST};
                const int3 m2 = m1 + (int3){nn.x, nn.y, NGHOST};
                acDeviceIntegrateSubstep(device, STREAM_0, isubstep, m1, m2, dt);
            }
            { // Back
                const int3 m1 = (int3){NGHOST, NGHOST, nn.z};
                const int3 m2 = m1 + (int3){nn.x, nn.y, NGHOST};
                acDeviceIntegrateSubstep(device, STREAM_1, isubstep, m1, m2, dt);
            }
            { // Bottom
                const int3 m1 = (int3){NGHOST, NGHOST, 2 * NGHOST};
                const int3 m2 = m1 + (int3){nn.x, NGHOST, nn.z - 2 * NGHOST};
                acDeviceIntegrateSubstep(device, STREAM_2, isubstep, m1, m2, dt);
            }
            { // Top
                const int3 m1 = (int3){NGHOST, nn.y, 2 * NGHOST};
                const int3 m2 = m1 + (int3){nn.x, NGHOST, nn.z - 2 * NGHOST};
                acDeviceIntegrateSubstep(device, STREAM_3, isubstep, m1, m2, dt);
            }
            { // Left
                const int3 m1 = (int3){NGHOST, 2 * NGHOST, 2 * NGHOST};
                const int3 m2 = m1 + (int3){NGHOST, nn.y - 2 * NGHOST, nn.z - 2 * NGHOST};
                acDeviceIntegrateSubstep(device, STREAM_4, isubstep, m1, m2, dt);
            }
            { // Right
                const int3 m1 = (int3){nn.x, 2 * NGHOST, 2 * NGHOST};
                const int3 m2 = m1 + (int3){NGHOST, nn.y - 2 * NGHOST, nn.z - 2 * NGHOST};
                acDeviceIntegrateSubstep(device, STREAM_5, isubstep, m1, m2, dt);
            }
            acDeviceSwapBuffers(device);
            acDeviceSynchronizeStream(device, STREAM_ALL); // Wait until inner and outer done
            ////////////////////////////////////////////
        }
    }

    cudaDeviceSynchronize();
    MPI_Barrier(MPI_COMM_WORLD);
    const double msec = timer_diff_nsec(ttot) / 1e6;
    MPI_Barrier(MPI_COMM_WORLD);

    int pid, nprocs;
    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    if (!pid) {
        printf("--- Total communication time per step w/ integration: %f ms\n",
               msec / num_iterations);

        // Write out to file
        FILE* fp = fopen("benchmark.result", "a+");
        fprintf(fp, "%d, %f\n", nprocs, msec);
        fclose(fp);
    }

    // Dealloc
    acDestroyCommData(device, &corner_data);
    acDestroyCommData(device, &edgex_data);
    acDestroyCommData(device, &edgey_data);
    acDestroyCommData(device, &edgez_data);
    acDestroyCommData(device, &sidexy_data);
    acDestroyCommData(device, &sidexz_data);
    acDestroyCommData(device, &sideyz_data);

    return AC_SUCCESS;
}

static AcResult
acDeviceCommunicateHalosMPI(const Device device)
{
    // Configure
    const int3 nn = (int3){
        device->local_config.int_params[AC_nx],
        device->local_config.int_params[AC_ny],
        device->local_config.int_params[AC_nz],
    };
    const AcReal dt = FLT_EPSILON; // TODO replace with the real one

    // Corners
    const int3 corner_a0s[] = {
        (int3){NGHOST, NGHOST, NGHOST}, //
        (int3){nn.x, NGHOST, NGHOST},   //
        (int3){NGHOST, nn.y, NGHOST},   //
        (int3){nn.x, nn.y, NGHOST},     //

        (int3){NGHOST, NGHOST, nn.z}, //
        (int3){nn.x, NGHOST, nn.z},   //
        (int3){NGHOST, nn.y, nn.z},   //
        (int3){nn.x, nn.y, nn.z},
    };
    const int3 corner_b0s[] = {
        (int3){0, 0, 0},
        (int3){NGHOST + nn.x, 0, 0},
        (int3){0, NGHOST + nn.y, 0},
        (int3){NGHOST + nn.x, NGHOST + nn.y, 0},

        (int3){0, 0, NGHOST + nn.z},
        (int3){NGHOST + nn.x, 0, NGHOST + nn.z},
        (int3){0, NGHOST + nn.y, NGHOST + nn.z},
        (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST + nn.z},
    };
    const int3 corner_dims = (int3){NGHOST, NGHOST, NGHOST};

    // Edges X
    const int3 edgex_a0s[] = {
        (int3){NGHOST, NGHOST, NGHOST}, //
        (int3){NGHOST, nn.y, NGHOST},   //

        (int3){NGHOST, NGHOST, nn.z}, //
        (int3){NGHOST, nn.y, nn.z},   //
    };
    const int3 edgex_b0s[] = {
        (int3){NGHOST, 0, 0},
        (int3){NGHOST, NGHOST + nn.y, 0},

        (int3){NGHOST, 0, NGHOST + nn.z},
        (int3){NGHOST, NGHOST + nn.y, NGHOST + nn.z},
    };
    const int3 edgex_dims = (int3){nn.x, NGHOST, NGHOST};

    // Edges Y
    const int3 edgey_a0s[] = {
        (int3){NGHOST, NGHOST, NGHOST}, //
        (int3){nn.x, NGHOST, NGHOST},   //

        (int3){NGHOST, NGHOST, nn.z}, //
        (int3){nn.x, NGHOST, nn.z},   //
    };
    const int3 edgey_b0s[] = {
        (int3){0, NGHOST, 0},
        (int3){NGHOST + nn.x, NGHOST, 0},

        (int3){0, NGHOST, NGHOST + nn.z},
        (int3){NGHOST + nn.x, NGHOST, NGHOST + nn.z},
    };
    const int3 edgey_dims = (int3){NGHOST, nn.y, NGHOST};

    // Edges Z
    const int3 edgez_a0s[] = {
        (int3){NGHOST, NGHOST, NGHOST}, //
        (int3){nn.x, NGHOST, NGHOST},   //

        (int3){NGHOST, nn.y, NGHOST}, //
        (int3){nn.x, nn.y, NGHOST},   //
    };
    const int3 edgez_b0s[] = {
        (int3){0, 0, NGHOST},
        (int3){NGHOST + nn.x, 0, NGHOST},

        (int3){0, NGHOST + nn.y, NGHOST},
        (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST},
    };

    const int3 edgez_dims = (int3){NGHOST, NGHOST, nn.z};

    // Sides XY
    const int3 sidexy_a0s[] = {
        (int3){NGHOST, NGHOST, NGHOST}, //
        (int3){NGHOST, NGHOST, nn.z},   //
    };
    const int3 sidexy_b0s[] = {
        (int3){NGHOST, NGHOST, 0},             //
        (int3){NGHOST, NGHOST, NGHOST + nn.z}, //
    };
    const int3 sidexy_dims = (int3){nn.x, nn.y, NGHOST};

    // Sides XZ
    const int3 sidexz_a0s[] = {
        (int3){NGHOST, NGHOST, NGHOST}, //
        (int3){NGHOST, nn.y, NGHOST},   //
    };
    const int3 sidexz_b0s[] = {
        (int3){NGHOST, 0, NGHOST},             //
        (int3){NGHOST, NGHOST + nn.y, NGHOST}, //
    };
    const int3 sidexz_dims = (int3){nn.x, NGHOST, nn.z};

    // Sides YZ
    const int3 sideyz_a0s[] = {
        (int3){NGHOST, NGHOST, NGHOST}, //
        (int3){nn.x, NGHOST, NGHOST},   //
    };
    const int3 sideyz_b0s[] = {
        (int3){0, NGHOST, NGHOST},             //
        (int3){NGHOST + nn.x, NGHOST, NGHOST}, //
    };
    const int3 sideyz_dims = (int3){NGHOST, nn.y, nn.z};

    // Alloc
    CommData corner_data = acCreateCommData(device, corner_dims, ARRAY_SIZE(corner_a0s));
    CommData edgex_data  = acCreateCommData(device, edgex_dims, ARRAY_SIZE(edgex_a0s));
    CommData edgey_data  = acCreateCommData(device, edgey_dims, ARRAY_SIZE(edgey_a0s));
    CommData edgez_data  = acCreateCommData(device, edgez_dims, ARRAY_SIZE(edgez_a0s));
    CommData sidexy_data = acCreateCommData(device, sidexy_dims, ARRAY_SIZE(sidexy_a0s));
    CommData sidexz_data = acCreateCommData(device, sidexz_dims, ARRAY_SIZE(sidexz_a0s));
    CommData sideyz_data = acCreateCommData(device, sideyz_dims, ARRAY_SIZE(sideyz_a0s));

    // Communicate
    Timer ttot;
    cudaDeviceSynchronize();
    MPI_Barrier(MPI_COMM_WORLD);
    timer_reset(&ttot);
    MPI_Barrier(MPI_COMM_WORLD);

    acPackCommData(device, corner_a0s, &corner_data);
    acPackCommData(device, edgex_a0s, &edgex_data);
    acPackCommData(device, edgey_a0s, &edgey_data);
    acPackCommData(device, edgez_a0s, &edgez_data);
    acPackCommData(device, sidexy_a0s, &sidexy_data);
    acPackCommData(device, sidexz_a0s, &sidexz_data);
    acPackCommData(device, sideyz_a0s, &sideyz_data);

#if MPI_GPUDIRECT_DISABLED
    acTransferCommDataToHost(device, &corner_data);
    acTransferCommDataToHost(device, &edgex_data);
    acTransferCommDataToHost(device, &edgey_data);
    acTransferCommDataToHost(device, &edgez_data);
    acTransferCommDataToHost(device, &sidexy_data);
    acTransferCommDataToHost(device, &sidexz_data);
    acTransferCommDataToHost(device, &sideyz_data);
#endif

    acTransferCommData(device, corner_a0s, corner_b0s, &corner_data);
    acTransferCommData(device, edgex_a0s, edgex_b0s, &edgex_data);
    acTransferCommData(device, edgey_a0s, edgey_b0s, &edgey_data);
    acTransferCommData(device, edgez_a0s, edgez_b0s, &edgez_data);
    acTransferCommData(device, sidexy_a0s, sidexy_b0s, &sidexy_data);
    acTransferCommData(device, sidexz_a0s, sidexz_b0s, &sidexz_data);
    acTransferCommData(device, sideyz_a0s, sideyz_b0s, &sideyz_data);

    acTransferCommDataWait(corner_data);
    acTransferCommDataWait(edgex_data);
    acTransferCommDataWait(edgey_data);
    acTransferCommDataWait(edgez_data);
    acTransferCommDataWait(sidexy_data);
    acTransferCommDataWait(sidexz_data);
    acTransferCommDataWait(sideyz_data);

#if MPI_GPUDIRECT_DISABLED
    acTransferCommDataToDevice(device, &corner_data);
    acTransferCommDataToDevice(device, &edgex_data);
    acTransferCommDataToDevice(device, &edgey_data);
    acTransferCommDataToDevice(device, &edgez_data);
    acTransferCommDataToDevice(device, &sidexy_data);
    acTransferCommDataToDevice(device, &sidexz_data);
    acTransferCommDataToDevice(device, &sideyz_data);
#endif

    acUnpackCommData(device, corner_b0s, &corner_data);
    acUnpackCommData(device, edgex_b0s, &edgex_data);
    acUnpackCommData(device, edgey_b0s, &edgey_data);
    acUnpackCommData(device, edgez_b0s, &edgez_data);
    acUnpackCommData(device, sidexy_b0s, &sidexy_data);
    acUnpackCommData(device, sidexz_b0s, &sidexz_data);
    acUnpackCommData(device, sideyz_b0s, &sideyz_data);

    cudaDeviceSynchronize();
    MPI_Barrier(MPI_COMM_WORLD);
    const double msec = timer_diff_nsec(ttot) / 1e6;
    MPI_Barrier(MPI_COMM_WORLD);

    int pid, nprocs;
    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    if (!pid) {
        printf("--- Total communication time per substep (comm): %f ms\n", msec);

        // Write out to file
        FILE* fp = fopen("benchmark.result", "a+");
        fprintf(fp, "%d, %f\n", nprocs, msec);
        fclose(fp);
    }

    // Dealloc
    acDestroyCommData(device, &corner_data);
    acDestroyCommData(device, &edgex_data);
    acDestroyCommData(device, &edgey_data);
    acDestroyCommData(device, &edgez_data);
    acDestroyCommData(device, &sidexy_data);
    acDestroyCommData(device, &sidexz_data);
    acDestroyCommData(device, &sideyz_data);

    return AC_SUCCESS;
}

/*
static int3
findOptimalDecomposition(const int3 nn)
{
  int3 decomposition = (int3){1, 1, 1};
}*/

AcResult
acDeviceRunMPITest(void)
{
    MPI_Init(NULL, NULL);

    int nprocs, pid;
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &pid);

    char processor_name[MPI_MAX_PROCESSOR_NAME];
    int name_len;
    MPI_Get_processor_name(processor_name, &name_len);
    printf("Processor %s. Process %d of %d.\n", processor_name, pid, nprocs);

    // Create model and candidate meshes
    AcMeshInfo info;
    acLoadConfig(AC_DEFAULT_CONFIG, &info);
    info.real_params[AC_inv_dsx]   = AcReal(1.0) / info.real_params[AC_dsx];
    info.real_params[AC_inv_dsy]   = AcReal(1.0) / info.real_params[AC_dsy];
    info.real_params[AC_inv_dsz]   = AcReal(1.0) / info.real_params[AC_dsz];
    info.real_params[AC_cs2_sound] = info.real_params[AC_cs_sound] * info.real_params[AC_cs_sound];

    AcMesh model, candidate;

    // Master CPU
    if (pid == 0) {
        acMeshCreate(info, &model);
        acMeshCreate(info, &candidate);

        acMeshRandomize(&model);
        acMeshRandomize(&candidate);
    }

    /// DECOMPOSITION & SUBMESH ///////////////////////////////////
    AcMeshInfo submesh_info  = info;
    const int3 decomposition = decompose(nprocs);
    const int3 pid3d         = getPid3D(pid, decomposition);

    printf("Decomposition: %d, %d, %d\n", decomposition.x, decomposition.y, decomposition.z);
    printf("Process %d: (%d, %d, %d)\n", pid, pid3d.x, pid3d.y, pid3d.z);
    ERRCHK_ALWAYS(info.int_params[AC_nx] % decomposition.x == 0);
    ERRCHK_ALWAYS(info.int_params[AC_ny] % decomposition.y == 0);
    ERRCHK_ALWAYS(info.int_params[AC_nz] % decomposition.z == 0);

    const int submesh_nx                       = info.int_params[AC_nx] / decomposition.x;
    const int submesh_ny                       = info.int_params[AC_ny] / decomposition.y;
    const int submesh_nz                       = info.int_params[AC_nz] / decomposition.z;
    submesh_info.int_params[AC_nx]             = submesh_nx;
    submesh_info.int_params[AC_ny]             = submesh_ny;
    submesh_info.int_params[AC_nz]             = submesh_nz;
    submesh_info.int3_params[AC_global_grid_n] = (int3){
        info.int_params[AC_nx],
        info.int_params[AC_ny],
        info.int_params[AC_nz],
    };
    submesh_info.int3_params[AC_multigpu_offset] = pid3d *
                                                   (int3){submesh_nx, submesh_ny, submesh_nz};
    WARNING("AC_multigpu_offset not yet tested");
    acUpdateBuiltinParams(&submesh_info);

    AcMesh submesh;
    acMeshCreate(submesh_info, &submesh);
    acMeshRandomize(&submesh);
    ////////////////////////////////////////////////////////////////

    // GPU INIT ////////////////////////////////////////////////////
    int devices_per_node = -1;
    cudaGetDeviceCount(&devices_per_node);

    Device device;
    acDeviceCreate(pid % devices_per_node, submesh_info, &device);
    // TODO enable peer access
    ////////////////////////////////////////////////////////////////

    // DISTRIBUTE & LOAD //////////////////////////////////////////
    acDeviceDistributeMeshMPI(model, decomposition, &submesh);
    acDeviceLoadMesh(device, STREAM_DEFAULT, submesh);
    ///////////////////////////////////////////////////////////////

    // SYNC //////////////////////////////////////////////////////
    acDeviceSynchronizeStream(device, STREAM_ALL);
    MPI_Barrier(MPI_COMM_WORLD);
    //////////////////////////////////////////////////////////////

    // TIMING START //////////////////////////////////////////////
    acDeviceSynchronizeStream(device, STREAM_ALL);
    MPI_Barrier(MPI_COMM_WORLD);
    Timer t;
    timer_reset(&t);
    //////////////////////////////////////////////////////////////

    // INTEGRATION & BOUNDCONDS////////////////////////////////////
    // acDeviceCommunicateHalosMPI(device);
    acDeviceIntegrateMPI(device, FLT_EPSILON);
    acDeviceCommunicateHalosMPI(device);
    ///////////////////////////////////////////////////////////////

    // TIMING END //////////////////////////////////////////////
    acDeviceSynchronizeStream(device, STREAM_ALL);
    MPI_Barrier(MPI_COMM_WORLD);
    if (!pid) {
        timer_diff_print(t);
    }
    MPI_Barrier(MPI_COMM_WORLD);
    //////////////////////////////////////////////////////////////

    // STORE & GATHER /////////////////////////////////////////////
    MPI_Barrier(MPI_COMM_WORLD);
    acDeviceStoreMesh(device, STREAM_DEFAULT, &submesh);
    acDeviceSynchronizeStream(device, STREAM_DEFAULT);
    acDeviceGatherMeshMPI(submesh, decomposition, &candidate);
    //////////////////////////////////////////////////////////////

    // VERIFY ////////////////////////////////////////////////////
    if (pid == 0) {
        acModelIntegrateStep(model, FLT_EPSILON);
        acMeshApplyPeriodicBounds(&model);

        acVerifyMesh(model, candidate);
        acMeshDestroy(&model);
        acMeshDestroy(&candidate);
    }
    //////////////////////////////////////////////////////////////

    // DESTROY ///////////////////////////////////////////////////
    acDeviceDestroy(device);
    acMeshDestroy(&submesh);
    MPI_Finalize();
    //////////////////////////////////////////////////////////////
    return AC_SUCCESS;
}
#else
AcResult
acDeviceRunMPITest(void)
{
    WARNING("MPI was not enabled but acDeviceRunMPITest() was called");
    return AC_FAILURE;
}
#endif // AC_MPI_ENABLED

/*
struct grid_s {
    Device device;
};

typedef grid_s* Grid;

AcResult
acGridInit(void)
{
    MPI_Init(NULL, NULL);

    int nprocs, pid;
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &pid);

    char processor_name[MPI_MAX_PROCESSOR_NAME];
    int name_len;
    MPI_Get_processor_name(processor_name, &name_len);
    printf("Processor %s. Process %d of %d.\n", processor_name, pid, nprocs);
}

AcResult
acGridLoad(const AcMesh mesh, Grid* grid)
{
}

AcResult
acGridStore(const Grid grid, AcMesh* mesh)
{
}

AcResult
acGridQuit(AcGrid& grid)
{
}
*/