/*
    Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae.

    This file is part of Astaroth.

    Astaroth is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    Astaroth is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
*/
#include "astaroth_node.h"

#include "astaroth_device.h"
#include "errchk.h"
#include "math_utils.h" // sum for reductions

static const int MAX_NUM_DEVICES = 32;

typedef struct {
    int3 m;
    int3 n;
} Grid;

struct node_s {
    int id;

    int num_devices;
    Device devices[MAX_NUM_DEVICES];

    Grid grid;
    Grid subgrid;

    AcMeshInfo config;
};

static int
gridIdx(const Grid grid, const int3 idx)
{
    return idx.x + idx.y * grid.m.x + idx.z * grid.m.x * grid.m.y;
}

static int3
gridIdx3d(const Grid grid, const int idx)
{
    return (int3){idx % grid.m.x, (idx % (grid.m.x * grid.m.y)) / grid.m.x,
                  idx / (grid.m.x * grid.m.y)};
}

static void
printInt3(const int3 vec)
{
    printf("(%d, %d, %d)", vec.x, vec.y, vec.z);
}

static inline void
print(const AcMeshInfo config)
{
    for (int i = 0; i < NUM_INT_PARAMS; ++i)
        printf("[%s]: %d\n", intparam_names[i], config.int_params[i]);
    for (int i = 0; i < NUM_REAL_PARAMS; ++i)
        printf("[%s]: %g\n", realparam_names[i], double(config.real_params[i]));
}

static void
update_builtin_params(AcMeshInfo* config)
{
    config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER;
    ///////////// PAD TEST
    // config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER + PAD_SIZE;
    ///////////// PAD TEST
    config->int_params[AC_my] = config->int_params[AC_ny] + STENCIL_ORDER;
    config->int_params[AC_mz] = config->int_params[AC_nz] + STENCIL_ORDER;

    // Bounds for the computational domain, i.e. nx_min <= i < nx_max
    config->int_params[AC_nx_min] = NGHOST;
    config->int_params[AC_nx_max] = config->int_params[AC_nx_min] + config->int_params[AC_nx];
    config->int_params[AC_ny_min] = NGHOST;
    config->int_params[AC_ny_max] = config->int_params[AC_ny] + NGHOST;
    config->int_params[AC_nz_min] = NGHOST;
    config->int_params[AC_nz_max] = config->int_params[AC_nz] + NGHOST;

    /* Additional helper params */
    // Int helpers
    config->int_params[AC_mxy]  = config->int_params[AC_mx] * config->int_params[AC_my];
    config->int_params[AC_nxy]  = config->int_params[AC_nx] * config->int_params[AC_ny];
    config->int_params[AC_nxyz] = config->int_params[AC_nxy] * config->int_params[AC_nz];
}

static Grid
createGrid(const AcMeshInfo config)
{
    Grid grid;

    grid.m = (int3){config.int_params[AC_mx], config.int_params[AC_my], config.int_params[AC_mz]};
    grid.n = (int3){config.int_params[AC_nx], config.int_params[AC_ny], config.int_params[AC_nz]};

    return grid;
}

AcResult
acNodeCreate(const int id, const AcMeshInfo node_config, Node* node_handle)
{
    struct node_s* node = (struct node_s*)malloc(sizeof(*node));
    node->id            = id;
    node->config        = node_config;

    // Get node->num_devices
    ERRCHK_CUDA_ALWAYS(cudaGetDeviceCount(&node->num_devices));
    if (node->num_devices < 1) {
        ERROR("No CUDA devices found!");
        return AC_FAILURE;
    }
    if (node->num_devices > MAX_NUM_DEVICES) {
        WARNING("More devices found than MAX_NUM_DEVICES. Using only MAX_NUM_DEVICES");
        node->num_devices = MAX_NUM_DEVICES;
    }
    if (!AC_MULTIGPU_ENABLED) {
        WARNING("MULTIGPU_ENABLED was false. Using only one device");
        node->num_devices = 1; // Use only one device if multi-GPU is not enabled
    }
    // Check that node->num_devices is divisible with AC_nz. This makes decomposing the
    // problem domain to multiple GPUs much easier since we do not have to worry
    // about remainders
    ERRCHK_ALWAYS(node->config.int_params[AC_nz] % node->num_devices == 0);

    // Decompose the problem domain
    // The main grid
    node->grid = createGrid(node->config);

    // Subgrids
    AcMeshInfo subgrid_config = node->config;
    subgrid_config.int_params[AC_nz] /= node->num_devices;
    update_builtin_params(&subgrid_config);
#if VERBOSE_PRINTING // Defined in astaroth.h
    printf("###############################################################\n");
    printf("Config dimensions recalculated:\n");
    print(subgrid_config);
    printf("###############################################################\n");
#endif
    node->subgrid = createGrid(subgrid_config);

    // Periodic boundary conditions become weird if the system can "fold unto itself".
    ERRCHK_ALWAYS(node->subgrid.n.x >= STENCIL_ORDER);
    ERRCHK_ALWAYS(node->subgrid.n.y >= STENCIL_ORDER);
    ERRCHK_ALWAYS(node->subgrid.n.z >= STENCIL_ORDER);

#if VERBOSE_PRINTING
    // clang-format off
    printf("Grid m ");   printInt3(node->grid.m);    printf("\n");
    printf("Grid n ");   printInt3(node->grid.n);    printf("\n");
    printf("Subrid m "); printInt3(node->subgrid.m); printf("\n");
    printf("Subrid n "); printInt3(node->subgrid.n); printf("\n");
    // clang-format on
#endif

    // Initialize the devices
    for (int i = 0; i < node->num_devices; ++i) {
        const int3 multinode_offset                    = (int3){0, 0, 0}; // Placeholder
        const int3 multigpu_offset                     = (int3){0, 0, i * node->subgrid.n.z};
        subgrid_config.int3_params[AC_global_grid_n]   = node->grid.n;
        subgrid_config.int3_params[AC_multigpu_offset] = multinode_offset + multigpu_offset;

        acDeviceCreate(i, subgrid_config, &node->devices[i]);
        acDevicePrintInfo(node->devices[i]);
    }

    // Enable peer access
    for (int i = 0; i < node->num_devices; ++i) {
        const int front = (i + 1) % node->num_devices;
        const int back  = (i - 1 + node->num_devices) % node->num_devices;

        int can_access_front, can_access_back;
        cudaDeviceCanAccessPeer(&can_access_front, i, front);
        cudaDeviceCanAccessPeer(&can_access_back, i, back);
#if VERBOSE_PRINTING
        printf(
            "Trying to enable peer access from %d to %d (can access: %d) and %d (can access: %d)\n",
            i, front, can_access_front, back, can_access_back);
#endif

        cudaSetDevice(i);
        if (can_access_front) {
            ERRCHK_CUDA_ALWAYS(cudaDeviceEnablePeerAccess(front, 0));
        }
        if (can_access_back) {
            ERRCHK_CUDA_ALWAYS(cudaDeviceEnablePeerAccess(back, 0));
        }
    }
    acNodeSynchronizeStream(node, STREAM_ALL);

    *node_handle = node;
    return AC_SUCCESS;
}

AcResult
acNodeDestroy(Node node)
{
    acNodeSynchronizeStream(node, STREAM_ALL);

    for (int i = 0; i < node->num_devices; ++i) {
        acDeviceDestroy(node->devices[i]);
    }
    free(node);

    return AC_SUCCESS;
}

AcResult
acNodePrintInfo(const Node node)
{
    (void)node;
    WARNING("Not implemented");
    return AC_FAILURE;
}

AcResult
acNodeQueryDeviceConfiguration(const Node node, DeviceConfiguration* config)
{
    (void)node;
    (void)config;
    WARNING("Not implemented");
    return AC_FAILURE;
}

AcResult
acNodeAutoOptimize(const Node node)
{
    (void)node;
    WARNING("Not implemented");
    return AC_FAILURE;
}

AcResult
acNodeSynchronizeStream(const Node node, const Stream stream)
{
    for (int i = 0; i < node->num_devices; ++i) {
        acDeviceSynchronizeStream(node->devices[i], stream);
    }

    return AC_SUCCESS;
}

AcResult
acNodeSynchronizeVertexBuffer(const Node node, const Stream stream,
                              const VertexBufferHandle vtxbuf_handle)
{
    acNodeSynchronizeStream(node, stream);
    // Exchanges the halos of subgrids
    // After this step, the data within the main grid ranging from
    // (0, 0, NGHOST) -> grid.m.x, grid.m.y, NGHOST + grid.n.z
    // has been synchronized and transferred to appropriate subgrids

    // We loop only to node->num_devices - 1 since the front and back plate of the grid is not
    // transferred because their contents depend on the boundary conditions.

    // IMPORTANT NOTE: the boundary conditions must be applied before
    // callingacNodeSynchronizeStream(node,  this function! I.e. the halos of subgrids must contain
    // up-to-date data!

    const size_t num_vertices = node->subgrid.m.x * node->subgrid.m.y * NGHOST;

    for (int i = 0; i < node->num_devices - 1; ++i) {
        // ...|ooooxxx|... -> xxx|ooooooo|...
        const int3 src = (int3){0, 0, node->subgrid.n.z};
        const int3 dst = (int3){0, 0, 0};

        const Device src_device = node->devices[i];
        Device dst_device       = node->devices[i + 1];

        acDeviceTransferVertexBufferWithOffset(src_device, stream, vtxbuf_handle, src, dst,
                                               num_vertices, dst_device);
    }
    for (int i = 1; i < node->num_devices; ++i) {
        // ...|ooooooo|xxx <- ...|xxxoooo|...
        const int3 src = (int3){0, 0, NGHOST};
        const int3 dst = (int3){0, 0, NGHOST + node->subgrid.n.z};

        const Device src_device = node->devices[i];
        Device dst_device       = node->devices[i - 1];

        acDeviceTransferVertexBufferWithOffset(src_device, stream, vtxbuf_handle, src, dst,
                                               num_vertices, dst_device);
    }
    return AC_SUCCESS;
}

AcResult
acNodeSynchronizeMesh(const Node node, const Stream stream)
{
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
        acNodeSynchronizeVertexBuffer(node, stream, (VertexBufferHandle)i);
    }

    return AC_SUCCESS;
}

AcResult
acNodeSwapBuffers(const Node node)
{
    for (int i = 0; i < node->num_devices; ++i) {
        acDeviceSwapBuffers(node->devices[i]);
    }
    return AC_SUCCESS;
}

AcResult
acNodeLoadConstant(const Node node, const Stream stream, const AcRealParam param,
                   const AcReal value)
{
    acNodeSynchronizeStream(node, stream);
    for (int i = 0; i < node->num_devices; ++i) {
        acDeviceLoadConstant(node->devices[i], stream, param, value);
    }
    return AC_SUCCESS;
}

AcResult
acNodeLoadVertexBufferWithOffset(const Node node, const Stream stream, const AcMesh host_mesh,
                                 const VertexBufferHandle vtxbuf_handle, const int3 src,
                                 const int3 dst, const int num_vertices)
{
    acNodeSynchronizeStream(node, stream);
    // See the beginning of the file for an explanation of the index mapping
    // #pragma omp parallel for
    for (int i = 0; i < node->num_devices; ++i) {
        const int3 d0 = (int3){0, 0, i * node->subgrid.n.z}; // DECOMPOSITION OFFSET HERE
        const int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y, d0.z + node->subgrid.m.z};

        const int3 s0 = src; // dst; // TODO fix
        (void)dst;           // TODO fix
        const int3 s1 = gridIdx3d(node->grid, gridIdx(node->grid, s0) + num_vertices);

        const int3 da = max(s0, d0);
        const int3 db = min(s1, d1);
        /*
        printf("Device %d\n", i);
        printf("\ts0: "); printInt3(s0); printf("\n");
        printf("\td0: "); printInt3(d0); printf("\n");
        printf("\tda: "); printInt3(da); printf("\n");
        printf("\tdb: "); printInt3(db); printf("\n");
        printf("\td1: "); printInt3(d1); printf("\n");
        printf("\ts1: "); printInt3(s1); printf("\n");
        printf("\t-> %s to device %d\n", db.z >= da.z ? "Copy" : "Do not copy", i);
        */
        if (db.z >= da.z) {
            const int copy_cells = gridIdx(node->subgrid, db) - gridIdx(node->subgrid, da);
            // DECOMPOSITION OFFSET HERE
            const int3 da_global = da; // src + da - dst; // TODO fix
            const int3 da_local = (int3){da.x, da.y, da.z - i * node->grid.n.z / node->num_devices};
            // printf("\t\tcopy %d cells to local index ", copy_cells); printInt3(da_local);
            // printf("\n");
            acDeviceLoadVertexBufferWithOffset(node->devices[i], stream, host_mesh, vtxbuf_handle,
                                               da_global, da_local, copy_cells);
        }
        // printf("\n");
    }
    return AC_SUCCESS;
}

AcResult
acNodeLoadMeshWithOffset(const Node node, const Stream stream, const AcMesh host_mesh,
                         const int3 src, const int3 dst, const int num_vertices)
{
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
        acNodeLoadVertexBufferWithOffset(node, stream, host_mesh, (VertexBufferHandle)i, src, dst,
                                         num_vertices);
    }
    return AC_SUCCESS;
}

AcResult
acNodeLoadVertexBuffer(const Node node, const Stream stream, const AcMesh host_mesh,
                       const VertexBufferHandle vtxbuf_handle)
{
    const int3 src            = (int3){0, 0, 0};
    const int3 dst            = src;
    const size_t num_vertices = acVertexBufferSize(host_mesh.info);

    acNodeLoadVertexBufferWithOffset(node, stream, host_mesh, vtxbuf_handle, src, dst,
                                     num_vertices);
    return AC_SUCCESS;
}

AcResult
acNodeLoadMesh(const Node node, const Stream stream, const AcMesh host_mesh)
{
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
        acNodeLoadVertexBuffer(node, stream, host_mesh, (VertexBufferHandle)i);
    }
    return AC_SUCCESS;
}

AcResult
acNodeStoreVertexBufferWithOffset(const Node node, const Stream stream,
                                  const VertexBufferHandle vtxbuf_handle, const int3 src,
                                  const int3 dst, const int num_vertices, AcMesh* host_mesh)
{
    acNodeSynchronizeStream(node, stream);
    for (int i = 0; i < node->num_devices; ++i) {
        const int3 d0 = (int3){0, 0, i * node->subgrid.n.z}; // DECOMPOSITION OFFSET HERE
        const int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y, d0.z + node->subgrid.m.z};

        const int3 s0 = src; // TODO fix
        (void)dst;           // TODO fix
        const int3 s1 = gridIdx3d(node->grid, gridIdx(node->grid, s0) + num_vertices);

        const int3 da = max(s0, d0);
        const int3 db = min(s1, d1);
        if (db.z >= da.z) {
            const int copy_cells = gridIdx(node->subgrid, db) - gridIdx(node->subgrid, da);
            // DECOMPOSITION OFFSET HERE
            const int3 da_local = (int3){da.x, da.y, da.z - i * node->grid.n.z / node->num_devices};
            const int3 da_global = da; // dst + da - src; // TODO fix
            acDeviceStoreVertexBufferWithOffset(node->devices[i], stream, vtxbuf_handle, da_local,
                                                da_global, copy_cells, host_mesh);
        }
    }
    return AC_SUCCESS;
}

AcResult
acNodeStoreMeshWithOffset(const Node node, const Stream stream, const int3 src, const int3 dst,
                          const int num_vertices, AcMesh* host_mesh)
{
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
        acNodeStoreVertexBufferWithOffset(node, stream, (VertexBufferHandle)i, src, dst,
                                          num_vertices, host_mesh);
    }
    return AC_SUCCESS;
}

AcResult
acNodeStoreVertexBuffer(const Node node, const Stream stream,
                        const VertexBufferHandle vtxbuf_handle, AcMesh* host_mesh)
{
    const int3 src            = (int3){0, 0, 0};
    const int3 dst            = src;
    const size_t num_vertices = acVertexBufferSize(host_mesh->info);

    acNodeStoreVertexBufferWithOffset(node, stream, vtxbuf_handle, src, dst, num_vertices,
                                      host_mesh);

    return AC_SUCCESS;
}

AcResult
acNodeStoreMesh(const Node node, const Stream stream, AcMesh* host_mesh)
{
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
        acNodeStoreVertexBuffer(node, stream, (VertexBufferHandle)i, host_mesh);
    }
    return AC_SUCCESS;
}

AcResult
acNodeIntegrateSubstep(const Node node, const Stream stream, const int isubstep, const int3 start,
                       const int3 end, const AcReal dt)
{
    acNodeSynchronizeStream(node, stream);

    for (int i = 0; i < node->num_devices; ++i) {
        // DECOMPOSITION OFFSET HERE
        const int3 d0 = (int3){NGHOST, NGHOST, NGHOST + i * node->subgrid.n.z};
        const int3 d1 = d0 + (int3){node->subgrid.n.x, node->subgrid.n.y, node->subgrid.n.z};

        const int3 da = max(start, d0);
        const int3 db = min(end, d1);

        if (db.z >= da.z) {
            const int3 da_local = da - (int3){0, 0, i * node->subgrid.n.z};
            const int3 db_local = db - (int3){0, 0, i * node->subgrid.n.z};
            acDeviceIntegrateSubstep(node->devices[i], stream, isubstep, da_local, db_local, dt);
        }
    }
    return AC_SUCCESS;
}

static AcResult
local_boundcondstep(const Node node, const Stream stream, const VertexBufferHandle vtxbuf)
{
    acNodeSynchronizeStream(node, stream);

    if (node->num_devices > 1) {
        // Local boundary conditions
        for (int i = 0; i < node->num_devices; ++i) {
            const int3 d0 = (int3){0, 0, NGHOST}; // DECOMPOSITION OFFSET HERE
            const int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y, d0.z + node->subgrid.n.z};
            acDevicePeriodicBoundcondStep(node->devices[i], stream, vtxbuf, d0, d1);
        }
    }
    else {
        acDevicePeriodicBoundcondStep(node->devices[0], stream, vtxbuf, (int3){0, 0, 0},
                                      node->subgrid.m);
    }
    return AC_SUCCESS;
}

static AcResult
global_boundcondstep(const Node node, const Stream stream, const VertexBufferHandle vtxbuf_handle)
{
    acNodeSynchronizeStream(node, stream);

    if (node->num_devices > 1) {
        const size_t num_vertices = node->subgrid.m.x * node->subgrid.m.y * NGHOST;
        {
            // ...|ooooxxx|... -> xxx|ooooooo|...
            const int3 src = (int3){0, 0, node->subgrid.n.z};
            const int3 dst = (int3){0, 0, 0};

            const Device src_device = node->devices[node->num_devices - 1];
            Device dst_device       = node->devices[0];

            acDeviceTransferVertexBufferWithOffset(src_device, stream, vtxbuf_handle, src, dst,
                                                   num_vertices, dst_device);
        }
        {
            // ...|ooooooo|xxx <- ...|xxxoooo|...
            const int3 src = (int3){0, 0, NGHOST};
            const int3 dst = (int3){0, 0, NGHOST + node->subgrid.n.z};

            const Device src_device = node->devices[0];
            Device dst_device       = node->devices[node->num_devices - 1];

            acDeviceTransferVertexBufferWithOffset(src_device, stream, vtxbuf_handle, src, dst,
                                                   num_vertices, dst_device);
        }
    }
    return AC_SUCCESS;
}

AcResult
acNodeIntegrate(const Node node, const AcReal dt)
{
    acNodeSynchronizeStream(node, STREAM_ALL);
    // xxx|OOO OOOOOOOOO OOO|xxx
    //    ^    ^         ^  ^
    //   n0   n1        n2  n3
    const int3 n0 = (int3){NGHOST, NGHOST, NGHOST};
    const int3 n1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
    const int3 n2 = node->grid.n;
    const int3 n3 = n0 + node->grid.n;

    for (int isubstep = 0; isubstep < 3; ++isubstep) {
        acNodeSynchronizeStream(node, STREAM_ALL);
        for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {
            local_boundcondstep(node, (Stream)vtxbuf, (VertexBufferHandle)vtxbuf);
        }
        acNodeSynchronizeStream(node, STREAM_ALL);
        // Inner inner
        for (int i = 0; i < node->num_devices; ++i) {
            const int3 m1 = n1 + (int3){0, 0, i * node->subgrid.n.z};
            const int3 m2 = m1 + node->subgrid.n - (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
            acNodeIntegrateSubstep(node, STREAM_16, isubstep, m1, m2, dt);
        }
        for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {
            const int num_vertices = node->subgrid.m.x * node->subgrid.m.y * NGHOST;
            for (int device_id = 0; device_id < node->num_devices; ++device_id) {
                // ...|ooooxxx|... -> xxx|ooooooo|...
                {
                    const int3 src = (int3){0, 0, node->subgrid.n.z};
                    const int3 dst = (int3){0, 0, 0};
                    acDeviceTransferVertexBufferWithOffset(
                        node->devices[device_id], (Stream)vtxbuf, (VertexBufferHandle)vtxbuf, src,
                        dst, num_vertices, node->devices[(device_id + 1) % node->num_devices]);
                }
                // ...|ooooooo|xxx <- ...|xxxoooo|...
                {
                    const int3 src = (int3){0, 0, NGHOST};
                    const int3 dst = (int3){0, 0, NGHOST + node->subgrid.n.z};
                    acDeviceTransferVertexBufferWithOffset(
                        node->devices[device_id], (Stream)vtxbuf, (VertexBufferHandle)vtxbuf, src,
                        dst, num_vertices,
                        node->devices[(device_id - 1 + node->num_devices) % node->num_devices]);
                }
            }
        }
        for (int vtxbuf = 0; vtxbuf < 2 * NUM_VTXBUF_HANDLES; ++vtxbuf) {
            acNodeSynchronizeStream(node, (Stream)vtxbuf);
        }
        // Inner outer
        for (int i = 0; i < node->num_devices - 1; ++i) {
            const int3 m1 = n1 + (int3){0, 0, (i + 1) * node->subgrid.n.z - 2 * NGHOST};
            const int3 m2 = m1 + (int3){node->subgrid.n.x - 2 * NGHOST,
                                        node->subgrid.n.y - 2 * NGHOST, 2 * NGHOST};
            acNodeIntegrateSubstep(node, STREAM_0, isubstep, m1, m2, dt);
        }
        // Outer
        // Front
        {
            const int3 m1 = (int3){n0.x, n0.y, n0.z};
            const int3 m2 = (int3){n3.x, n3.y, n1.z};
            acNodeIntegrateSubstep(node, STREAM_1, isubstep, m1, m2, dt);
        }

        // Back
        {
            const int3 m1 = (int3){n0.x, n0.y, n2.z};
            const int3 m2 = (int3){n3.x, n3.y, n3.z};
            acNodeIntegrateSubstep(node, STREAM_2, isubstep, m1, m2, dt);
        }

        // Top
        {
            const int3 m1 = (int3){n0.x, n0.y, n1.z};
            const int3 m2 = (int3){n3.x, n1.y, n2.z};
            acNodeIntegrateSubstep(node, STREAM_3, isubstep, m1, m2, dt);
        }

        // Bottom
        {
            const int3 m1 = (int3){n0.x, n2.y, n1.z};
            const int3 m2 = (int3){n3.x, n3.y, n2.z};
            acNodeIntegrateSubstep(node, STREAM_4, isubstep, m1, m2, dt);
        }

        // Left
        {
            const int3 m1 = (int3){n0.x, n1.y, n1.z};
            const int3 m2 = (int3){n1.x, n2.y, n2.z};
            acNodeIntegrateSubstep(node, STREAM_5, isubstep, m1, m2, dt);
        }

        // Right
        {
            const int3 m1 = (int3){n2.x, n1.y, n1.z};
            const int3 m2 = (int3){n3.x, n2.y, n2.z};
            acNodeIntegrateSubstep(node, STREAM_6, isubstep, m1, m2, dt);
        }

        acNodeSwapBuffers(node);
    }
    acNodeSynchronizeStream(node, STREAM_ALL);
    return AC_SUCCESS;
}

AcResult
acNodePeriodicBoundcondStep(const Node node, const Stream stream,
                            const VertexBufferHandle vtxbuf_handle)
{
    local_boundcondstep(node, stream, vtxbuf_handle);
    acNodeSynchronizeVertexBuffer(node, stream, vtxbuf_handle);

    // TODO NOTE GLOBAL BOUNDCONDS NOT DONE HERE IF MORE THAN 1 NODE
    global_boundcondstep(node, stream, vtxbuf_handle);
    // WARNING("Global boundconds should not be done here with multinode");

    return AC_SUCCESS;
}

AcResult
acNodePeriodicBoundconds(const Node node, const Stream stream)
{
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
        acNodePeriodicBoundcondStep(node, stream, (VertexBufferHandle)i);
    }
    return AC_SUCCESS;
}

static AcReal
simple_final_reduce_scal(const Node node, const ReductionType& rtype, const AcReal* results,
                         const int& n)
{
    AcReal res = results[0];
    for (int i = 1; i < n; ++i) {
        if (rtype == RTYPE_MAX) {
            res = max(res, results[i]);
        }
        else if (rtype == RTYPE_MIN) {
            res = min(res, results[i]);
        }
        else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP || rtype == RTYPE_SUM) {
            res = sum(res, results[i]);
        }
        else {
            ERROR("Invalid rtype");
        }
    }

    if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
        const AcReal inv_n = AcReal(1.) / (node->grid.n.x * node->grid.n.y * node->grid.n.z);
        res                = sqrt(inv_n * res);
    }
    return res;
}

AcResult
acNodeReduceScal(const Node node, const Stream stream, const ReductionType rtype,
                 const VertexBufferHandle vtxbuf_handle, AcReal* result)
{
    acNodeSynchronizeStream(node, STREAM_ALL);

    AcReal results[node->num_devices];
    for (int i = 0; i < node->num_devices; ++i) {
        acDeviceReduceScal(node->devices[i], stream, rtype, vtxbuf_handle, &results[i]);
    }

    *result = simple_final_reduce_scal(node, rtype, results, node->num_devices);
    return AC_SUCCESS;
}

AcResult
acNodeReduceVec(const Node node, const Stream stream, const ReductionType rtype,
                const VertexBufferHandle a, const VertexBufferHandle b, const VertexBufferHandle c,
                AcReal* result)
{
    acNodeSynchronizeStream(node, STREAM_ALL);

    AcReal results[node->num_devices];
    for (int i = 0; i < node->num_devices; ++i) {
        acDeviceReduceVec(node->devices[i], stream, rtype, a, b, c, &results[i]);
    }

    *result = simple_final_reduce_scal(node, rtype, results, node->num_devices);
    return AC_SUCCESS;
}