From 5f2378e91b5595c858dcbb9cea7e7df89aa5b8c6 Mon Sep 17 00:00:00 2001 From: jpekkila Date: Fri, 2 Aug 2019 15:15:18 +0300 Subject: [PATCH] Now compiles (does not work though) --- include/astaroth_device.h | 4 +- src/core/CMakeLists.txt | 2 +- src/core/device.cu | 4 +- src/core/node.cu | 273 +++++++++++++++++++++----------------- 4 files changed, 158 insertions(+), 125 deletions(-) diff --git a/include/astaroth_device.h b/include/astaroth_device.h index 75ec693..442b1b1 100644 --- a/include/astaroth_device.h +++ b/include/astaroth_device.h @@ -97,10 +97,10 @@ AcResult acDeviceTransferMeshWithOffset(const Device src_device, const Stream st /** */ AcResult acDeviceTransferVertexBuffer(const Device src_device, const Stream stream, - const VertexBufferHandle vtxbuf_handle, Device* dst_device); + const VertexBufferHandle vtxbuf_handle, Device dst_device); /** Deprecated */ -AcResult acDeviceTransferMesh(const Device src_device, const Stream stream, Device* dst_device); +AcResult acDeviceTransferMesh(const Device src_device, const Stream stream, Device dst_device); /** */ AcResult acDeviceIntegrateSubstep(const Device device, const Stream stream, const int step_number, diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index 6054444..0aad51d 100644 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -34,5 +34,5 @@ endif () ## Create and link the library include_directories(.) 
-cuda_add_library(astaroth_core STATIC astaroth.cu device.cu) +cuda_add_library(astaroth_core STATIC astaroth.cu device.cu node.cu) target_link_libraries(astaroth_core m) diff --git a/src/core/device.cu b/src/core/device.cu index d643ad1..f5bd295 100644 --- a/src/core/device.cu +++ b/src/core/device.cu @@ -229,7 +229,7 @@ acDevicePrintInfo(const Device device) } AcResult -autoOptimize(const Device device) +acDeviceAutoOptimize(const Device device) { cudaSetDevice(device->id); @@ -502,7 +502,7 @@ acDeviceTransferVertexBuffer(const Device src_device, const Stream stream, } AcResult -acDeviceTransferMesh(const Device src_device, const Stream stream, Device* dst_device) +acDeviceTransferMesh(const Device src_device, const Stream stream, Device dst_device) { WARNING("This function is deprecated"); for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { diff --git a/src/core/node.cu b/src/core/node.cu index 8d4b48a..23c79b3 100644 --- a/src/core/node.cu +++ b/src/core/node.cu @@ -23,7 +23,11 @@ #include "math_utils.h" // sum for reductions static const int MAX_NUM_DEVICES = 32; -static Node node = NULL; + +typedef struct { + int3 m; + int3 n; +} Grid; struct node_s { int id; @@ -33,6 +37,8 @@ struct node_s { Grid grid; Grid subgrid; + + AcMeshInfo config; }; static int @@ -55,12 +61,12 @@ printInt3(const int3 vec) } static inline void -print(const AcMeshInfo config) +print(const Node node) { for (int i = 0; i < NUM_INT_PARAMS; ++i) - printf("[%s]: %d\n", intparam_names[i], config.int_params[i]); + printf("[%s]: %d\n", intparam_names[i], node->config.int_params[i]); for (int i = 0; i < NUM_REAL_PARAMS; ++i) - printf("[%s]: %g\n", realparam_names[i], double(config.real_params[i])); + printf("[%s]: %g\n", realparam_names[i], double(node->config.real_params[i])); } static void @@ -86,13 +92,6 @@ update_builtin_params(AcMeshInfo* config) config->int_params[AC_mxy] = config->int_params[AC_mx] * config->int_params[AC_my]; config->int_params[AC_nxy] = config->int_params[AC_nx] * 
config->int_params[AC_ny]; config->int_params[AC_nxyz] = config->int_params[AC_nxy] * config->int_params[AC_nz]; - -#if VERBOSE_PRINTING // Defined in astaroth.h - printf("###############################################################\n"); - printf("Config dimensions recalculated:\n"); - print(*config); - printf("###############################################################\n"); -#endif } static Grid @@ -109,61 +108,69 @@ createGrid(const AcMeshInfo config) AcResult acNodeCreate(const int id, const AcMeshInfo node_config, Node* node_handle) { - struct node_s* node = (struct device_s*)malloc(sizeof(*node)); + struct node_s* node = (struct node_s*)malloc(sizeof(*node)); + node->id = id; + node->config = node_config; - // Get num_devices - ERRCHK_CUDA_ALWAYS(cudaGetDeviceCount(&num_devices)); - if (num_devices < 1) { + // Get node->num_devices + ERRCHK_CUDA_ALWAYS(cudaGetDeviceCount(&node->num_devices)); + if (node->num_devices < 1) { ERROR("No CUDA devices found!"); return AC_FAILURE; } - if (num_devices > MAX_NUM_DEVICES) { + if (node->num_devices > MAX_NUM_DEVICES) { WARNING("More devices found than MAX_NUM_DEVICES. Using only MAX_NUM_DEVICES"); - num_devices = MAX_NUM_DEVICES; + node->num_devices = MAX_NUM_DEVICES; } if (!AC_MULTIGPU_ENABLED) { WARNING("MULTIGPU_ENABLED was false. Using only one device"); - num_devices = 1; // Use only one device if multi-GPU is not enabled + node->num_devices = 1; // Use only one device if multi-GPU is not enabled } - // Check that num_devices is divisible with AC_nz. This makes decomposing the + // Check that node->num_devices is divisible with AC_nz. 
This makes decomposing the // problem domain to multiple GPUs much easier since we do not have to worry // about remainders - ERRCHK_ALWAYS(config.int_params[AC_nz] % num_devices == 0); + ERRCHK_ALWAYS(node->config.int_params[AC_nz] % node->num_devices == 0); // Decompose the problem domain // The main grid - grid = createGrid(config); + node->grid = createGrid(node->config); // Subgrids - AcMeshInfo subgrid_config = config; - subgrid_config.int_params[AC_nz] /= num_devices; + AcMeshInfo subgrid_config = node->config; + subgrid_config.int_params[AC_nz] /= node->num_devices; update_builtin_params(&subgrid_config); - subgrid = createGrid(subgrid_config); +#if VERBOSE_PRINTING // Defined in astaroth.h + printf("###############################################################\n"); + printf("Config dimensions recalculated:\n"); + print(node); + printf("###############################################################\n"); +#endif + node->subgrid = createGrid(subgrid_config); // Periodic boundary conditions become weird if the system can "fold unto itself". 
- ERRCHK_ALWAYS(subgrid.n.x >= STENCIL_ORDER); - ERRCHK_ALWAYS(subgrid.n.y >= STENCIL_ORDER); - ERRCHK_ALWAYS(subgrid.n.z >= STENCIL_ORDER); + ERRCHK_ALWAYS(node->subgrid.n.x >= STENCIL_ORDER); + ERRCHK_ALWAYS(node->subgrid.n.y >= STENCIL_ORDER); + ERRCHK_ALWAYS(node->subgrid.n.z >= STENCIL_ORDER); #if VERBOSE_PRINTING // clang-format off - printf("Grid m "); printInt3(grid.m); printf("\n"); - printf("Grid n "); printInt3(grid.n); printf("\n"); - printf("Subrid m "); printInt3(subgrid.m); printf("\n"); - printf("Subrid n "); printInt3(subgrid.n); printf("\n"); + printf("Grid m "); printInt3(node->grid.m); printf("\n"); + printf("Grid n "); printInt3(node->grid.n); printf("\n"); + printf("Subrid m "); printInt3(node->subgrid.m); printf("\n"); + printf("Subrid n "); printInt3(node->subgrid.n); printf("\n"); // clang-format on #endif // Initialize the devices - for (int i = 0; i < num_devices; ++i) { - createDevice(i, subgrid_config, &devices[i]); - printDeviceInfo(devices[i]); + for (int i = 0; i < node->num_devices; ++i) { + acDeviceCreate(i, subgrid_config, &node->devices[i]); + acDevicePrintInfo(node->devices[i]); } // Enable peer access - for (int i = 0; i < num_devices; ++i) { - const int front = (i + 1) % num_devices; - const int back = (i - 1 + num_devices) % num_devices; + for (int i = 0; i < node->num_devices; ++i) { + const int front = (i + 1) % node->num_devices; + const int back = (i - 1 + node->num_devices) % node->num_devices; int can_access_front, can_access_back; cudaDeviceCanAccessPeer(&can_access_front, i, front); @@ -182,7 +189,7 @@ acNodeCreate(const int id, const AcMeshInfo node_config, Node* node_handle) ERRCHK_CUDA_ALWAYS(cudaDeviceEnablePeerAccess(back, 0)); } } - acNodeSynchronizeStream(STREAM_ALL); + acNodeSynchronizeStream(node, STREAM_ALL); *node_handle = node; return AC_SUCCESS; @@ -191,10 +198,10 @@ acNodeCreate(const int id, const AcMeshInfo node_config, Node* node_handle) AcResult acNodeDestroy(Node node) { - 
acSynchronizeStream(STREAM_ALL); + acNodeSynchronizeStream(node, STREAM_ALL); - for (int i = 0; i < num_devices; ++i) { - destroyDevice(devices[i]); + for (int i = 0; i < node->num_devices; ++i) { + acDeviceDestroy(node->devices[i]); } free(node); @@ -204,6 +211,7 @@ acNodeDestroy(Node node) AcResult acNodePrintInfo(const Node node) { + (void)node; WARNING("Not implemented"); return AC_FAILURE; } @@ -211,6 +219,8 @@ acNodePrintInfo(const Node node) AcResult acNodeQueryDeviceConfiguration(const Node node, DeviceConfiguration* config) { + (void)node; + (void)config; WARNING("Not implemented"); return AC_FAILURE; } @@ -218,6 +228,7 @@ acNodeQueryDeviceConfiguration(const Node node, DeviceConfiguration* config) AcResult acNodeAutoOptimize(const Node node) { + (void)node; WARNING("Not implemented"); return AC_FAILURE; } @@ -225,8 +236,8 @@ acNodeAutoOptimize(const Node node) AcResult acNodeSynchronizeStream(const Node node, const Stream stream) { - for (int i = 0; i < num_devices; ++i) { - synchronize(devices[i], stream); + for (int i = 0; i < node->num_devices; ++i) { + acDeviceSynchronizeStream(node->devices[i], stream); } return AC_SUCCESS; @@ -236,37 +247,39 @@ AcResult acNodeSynchronizeVertexBuffer(const Node node, const Stream stream, const VertexBufferHandle vtxbuf_handle) { + acNodeSynchronizeStream(node, stream); // Exchanges the halos of subgrids // After this step, the data within the main grid ranging from // (0, 0, NGHOST) -> grid.m.x, grid.m.y, NGHOST + grid.n.z // has been synchronized and transferred to appropriate subgrids - // We loop only to num_devices - 1 since the front and back plate of the grid is not + // We loop only to node->num_devices - 1 since the front and back plate of the grid is not // transferred because their contents depend on the boundary conditions. - // IMPORTANT NOTE: the boundary conditions must be applied before calling this function! - // I.e. the halos of subgrids must contain up-to-date data! 
+    // IMPORTANT NOTE: the boundary conditions must be applied before
+    // calling this function! I.e. the halos of subgrids must contain
+    // up-to-date data!
 
-    const size_t num_vertices = subgrid.m.x * subgrid.m.y * NGHOST;
+    const size_t num_vertices = node->subgrid.m.x * node->subgrid.m.y * NGHOST;
 
-    for (int i = 0; i < num_devices - 1; ++i) {
+    for (int i = 0; i < node->num_devices - 1; ++i) {
         // ...|ooooxxx|... -> xxx|ooooooo|...
-        const int3 src = (int3){0, 0, subgrid.n.z};
+        const int3 src = (int3){0, 0, node->subgrid.n.z};
         const int3 dst = (int3){0, 0, 0};
 
-        const Device src_device = devices[i];
-        Device dst_device       = devices[i + 1];
+        const Device src_device = node->devices[i];
+        Device dst_device       = node->devices[i + 1];
 
         acDeviceTransferVertexBufferWithOffset(src_device, stream, vtxbuf_handle, src, dst,
                                                num_vertices, dst_device);
     }
 
-    for (int i = 1; i < num_devices; ++i) {
+    for (int i = 1; i < node->num_devices; ++i) {
         // ...|ooooooo|xxx <- ...|xxxoooo|...
const int3 src = (int3){0, 0, NGHOST}; - const int3 dst = (int3){0, 0, NGHOST + subgrid.n.z}; + const int3 dst = (int3){0, 0, NGHOST + node->subgrid.n.z}; - const Device src_device = devices[i]; - Device dst_device = devices[i - 1]; + const Device src_device = node->devices[i]; + Device dst_device = node->devices[i - 1]; acDeviceTransferVertexBufferWithOffset(src_device, stream, vtxbuf_handle, src, dst, num_vertices, dst_device); @@ -287,8 +300,8 @@ acNodeSynchronizeMesh(const Node node, const Stream stream) AcResult acNodeSwapBuffers(const Node node) { - for (int i = 0; i < num_devices; ++i) { - acDeviceSwapBuffers(devices[i]); + for (int i = 0; i < node->num_devices; ++i) { + acDeviceSwapBuffers(node->devices[i]); } return AC_SUCCESS; } @@ -297,8 +310,9 @@ AcResult acNodeLoadConstant(const Node node, const Stream stream, const AcRealParam param, const AcReal value) { - for (int i = 0; i < num_devices; ++i) { - acDeviceLoadConstant(devices[i], stream, param, value); + acNodeSynchronizeStream(node, stream); + for (int i = 0; i < node->num_devices; ++i) { + acDeviceLoadConstant(node->devices[i], stream, param, value); } return AC_SUCCESS; } @@ -308,14 +322,15 @@ acNodeLoadVertexBufferWithOffset(const Node node, const Stream stream, const AcM const VertexBufferHandle vtxbuf_handle, const int3 src, const int3 dst, const int num_vertices) { + acNodeSynchronizeStream(node, stream); // See the beginning of the file for an explanation of the index mapping // #pragma omp parallel for - for (int i = 0; i < num_devices; ++i) { - const int3 d0 = (int3){0, 0, i * subgrid.n.z}; // DECOMPOSITION OFFSET HERE - const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.m.z}; + for (int i = 0; i < node->num_devices; ++i) { + const int3 d0 = (int3){0, 0, i * node->subgrid.n.z}; // DECOMPOSITION OFFSET HERE + const int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y, d0.z + node->subgrid.m.z}; - const int3 s0 = src; - const int3 s1 = gridIdx3d(grid, gridIdx(grid, s0) + 
num_vertices); + const int3 s0 = dst; + const int3 s1 = gridIdx3d(node->grid, gridIdx(node->grid, s0) + num_vertices); const int3 da = max(s0, d0); const int3 db = min(s1, d1); @@ -330,13 +345,14 @@ acNodeLoadVertexBufferWithOffset(const Node node, const Stream stream, const AcM printf("\t-> %s to device %d\n", db.z >= da.z ? "Copy" : "Do not copy", i); */ if (db.z >= da.z) { - const int copy_cells = gridIdx(subgrid, db) - gridIdx(subgrid, da); + const int copy_cells = gridIdx(node->subgrid, db) - gridIdx(node->subgrid, da); // DECOMPOSITION OFFSET HERE - const int3 da_local = (int3){da.x, da.y, da.z - i * grid.n.z / num_devices}; + const int3 da_global = src + da - dst; + const int3 da_local = (int3){da.x, da.y, da.z - i * node->grid.n.z / node->num_devices}; // printf("\t\tcopy %d cells to local index ", copy_cells); printInt3(da_local); // printf("\n"); - acDeviceLoadVertexBufferWithOffset(devices[i], stream, host_mesh, vtxbuf_handle, da, - da_local, copy_cells); + acDeviceLoadVertexBufferWithOffset(node->devices[i], stream, host_mesh, vtxbuf_handle, + da_global, da_local, copy_cells); } // printf("\n"); } @@ -381,21 +397,23 @@ acNodeStoreVertexBufferWithOffset(const Node node, const Stream stream, const VertexBufferHandle vtxbuf_handle, const int3 src, const int3 dst, const int num_vertices, AcMesh* host_mesh) { - for (int i = 0; i < num_devices; ++i) { - const int3 d0 = (int3){0, 0, i * subgrid.n.z}; // DECOMPOSITION OFFSET HERE - const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.m.z}; + acNodeSynchronizeStream(node, stream); + for (int i = 0; i < node->num_devices; ++i) { + const int3 d0 = (int3){0, 0, i * node->subgrid.n.z}; // DECOMPOSITION OFFSET HERE + const int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y, d0.z + node->subgrid.m.z}; const int3 s0 = src; - const int3 s1 = gridIdx3d(grid, gridIdx(grid, s0) + num_vertices); + const int3 s1 = gridIdx3d(node->grid, gridIdx(node->grid, s0) + num_vertices); const int3 da = max(s0, d0); const 
int3 db = min(s1, d1); if (db.z >= da.z) { - const int copy_cells = gridIdx(subgrid, db) - gridIdx(subgrid, da); + const int copy_cells = gridIdx(node->subgrid, db) - gridIdx(node->subgrid, da); // DECOMPOSITION OFFSET HERE - const int3 da_local = (int3){da.x, da.y, da.z - i * grid.n.z / num_devices}; - acDeviceStoreVertexBufferWithOffset(devices[i], stream, vtxbuf_handle, da_local, da, - copy_cells, host_mesh); + const int3 da_local = (int3){da.x, da.y, da.z - i * node->grid.n.z / node->num_devices}; + const int3 da_global = dst + da - src; + acDeviceStoreVertexBufferWithOffset(node->devices[i], stream, vtxbuf_handle, da_local, + da_global, copy_cells, host_mesh); } } return AC_SUCCESS; @@ -418,7 +436,7 @@ acNodeStoreVertexBuffer(const Node node, const Stream stream, { const int3 src = (int3){0, 0, 0}; const int3 dst = src; - const size_t num_vertices = acVertexBufferSize(host_mesh.info); + const size_t num_vertices = acVertexBufferSize(host_mesh->info); acNodeStoreVertexBufferWithOffset(node, stream, vtxbuf_handle, src, dst, num_vertices, host_mesh); @@ -436,21 +454,23 @@ acNodeStoreMesh(const Node node, const Stream stream, AcMesh* host_mesh) } AcResult -acNodeIntegrateSubstep(const Node node, const Stream stream, const int step_number, - const int3 start, const int3 end, const AcReal dt) +acNodeIntegrateSubstep(const Node node, const Stream stream, const int isubstep, const int3 start, + const int3 end, const AcReal dt) { - for (int i = 0; i < num_devices; ++i) { + acNodeSynchronizeStream(node, stream); + + for (int i = 0; i < node->num_devices; ++i) { // DECOMPOSITION OFFSET HERE - const int3 d0 = (int3){NGHOST, NGHOST, NGHOST + i * subgrid.n.z}; - const int3 d1 = d0 + (int3){subgrid.n.x, subgrid.n.y, subgrid.n.z}; + const int3 d0 = (int3){NGHOST, NGHOST, NGHOST + i * node->subgrid.n.z}; + const int3 d1 = d0 + (int3){node->subgrid.n.x, node->subgrid.n.y, node->subgrid.n.z}; const int3 da = max(start, d0); const int3 db = min(end, d1); if (db.z >= da.z) { - 
const int3 da_local = da - (int3){0, 0, i * subgrid.n.z}; - const int3 db_local = db - (int3){0, 0, i * subgrid.n.z}; - acDeviceIntegrateSubstep(devices[i], stream, isubstep, da_local, db_local, dt); + const int3 da_local = da - (int3){0, 0, i * node->subgrid.n.z}; + const int3 db_local = db - (int3){0, 0, i * node->subgrid.n.z}; + acDeviceIntegrateSubstep(node->devices[i], stream, isubstep, da_local, db_local, dt); } } return AC_SUCCESS; @@ -459,43 +479,54 @@ acNodeIntegrateSubstep(const Node node, const Stream stream, const int step_numb AcResult acNodeIntegrate(const Node node, const AcReal dt) { - acNodeSynchronizeStream(STREAM_ALL); + acNodeSynchronizeStream(node, STREAM_ALL); - WARNING("Not implementad\n"); + for (int isubstep = 0; isubstep < 3; ++isubstep) { + acNodePeriodicBoundconds(node, STREAM_DEFAULT); + const int3 start = (int3){NGHOST, NGHOST, NGHOST}; + const int3 end = start + node->grid.n; + acNodeIntegrateSubstep(node, STREAM_DEFAULT, isubstep, start, end, dt); + acNodeSwapBuffers(node); + } - acNodeSynchronizeStream(STREAM_ALL); + acNodeSynchronizeStream(node, STREAM_ALL); return AC_SUCCESS; } static AcResult -local_boundcondstep(const Node node, const StreamType stream, const VertexBufferHandle vtxbuf) +local_boundcondstep(const Node node, const Stream stream, const VertexBufferHandle vtxbuf) { - if (num_devices > 1) { + acNodeSynchronizeStream(node, stream); + + if (node->num_devices > 1) { // Local boundary conditions - for (int i = 0; i < num_devices; ++i) { + for (int i = 0; i < node->num_devices; ++i) { const int3 d0 = (int3){0, 0, NGHOST}; // DECOMPOSITION OFFSET HERE - const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.n.z}; - acDeviceBoundcondStep(devices[i], stream, vtxbuf, d0, d1); + const int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y, d0.z + node->subgrid.n.z}; + acDevicePeriodicBoundcondStep(node->devices[i], stream, vtxbuf, d0, d1); } } else { - acDeviceBoundcondStep(devices[0], stream, vtxbuf, (int3){0, 0, 0}, 
subgrid.m); + acDevicePeriodicBoundcondStep(node->devices[0], stream, vtxbuf, (int3){0, 0, 0}, + node->subgrid.m); } return AC_SUCCESS; } static AcResult -global_boundcondstep(const Node node, const StreamType stream, const VertexBufferHandle vtxbuf) +global_boundcondstep(const Node node, const Stream stream, const VertexBufferHandle vtxbuf_handle) { - if (num_devices > 1) { - const size_t num_vertices = subgrid.m.x * subgrid.m.y * NGHOST; + acNodeSynchronizeStream(node, stream); + + if (node->num_devices > 1) { + const size_t num_vertices = node->subgrid.m.x * node->subgrid.m.y * NGHOST; { // ...|ooooxxx|... -> xxx|ooooooo|... - const int3 src = (int3){0, 0, subgrid.n.z}; + const int3 src = (int3){0, 0, node->subgrid.n.z}; const int3 dst = (int3){0, 0, 0}; - const Device src_device = devices[num_devices - 1]; - Device dst_device = devices[0]; + const Device src_device = node->devices[node->num_devices - 1]; + Device dst_device = node->devices[0]; acDeviceTransferVertexBufferWithOffset(src_device, stream, vtxbuf_handle, src, dst, num_vertices, dst_device); @@ -503,10 +534,10 @@ global_boundcondstep(const Node node, const StreamType stream, const VertexBuffe { // ...|ooooooo|xxx <- ...|xxxoooo|... 
const int3 src = (int3){0, 0, NGHOST}; - const int3 dst = (int3){0, 0, NGHOST + subgrid.n.z}; + const int3 dst = (int3){0, 0, NGHOST + node->subgrid.n.z}; - const Device src_device = devices[0]; - Device dst_device = devices[num_devices - 1]; + const Device src_device = node->devices[0]; + Device dst_device = node->devices[node->num_devices - 1]; acDeviceTransferVertexBufferWithOffset(src_device, stream, vtxbuf_handle, src, dst, num_vertices, dst_device); @@ -539,7 +570,8 @@ acNodePeriodicBoundconds(const Node node, const Stream stream) } static AcReal -simple_final_reduce_scal(const ReductionType& rtype, const AcReal* results, const int& n) +simple_final_reduce_scal(const Node node, const ReductionType& rtype, const AcReal* results, + const int& n) { AcReal res = results[0]; for (int i = 1; i < n; ++i) { @@ -549,7 +581,7 @@ simple_final_reduce_scal(const ReductionType& rtype, const AcReal* results, cons else if (rtype == RTYPE_MIN) { res = min(res, results[i]); } - else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) { + else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP || rtype == RTYPE_SUM) { res = sum(res, results[i]); } else { @@ -558,10 +590,9 @@ simple_final_reduce_scal(const ReductionType& rtype, const AcReal* results, cons } if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) { - const AcReal inv_n = AcReal(1.) / (grid.n.x * grid.n.y * grid.n.z); + const AcReal inv_n = AcReal(1.) 
/ (node->grid.n.x * node->grid.n.y * node->grid.n.z); res = sqrt(inv_n * res); } - return res; } @@ -569,27 +600,29 @@ AcResult acNodeReduceScal(const Node node, const Stream stream, const ReductionType rtype, const VertexBufferHandle vtxbuf_handle, AcReal* result) { - acSynchronizeStream(STREAM_ALL); + acNodeSynchronizeStream(node, STREAM_ALL); - AcReal results[num_devices]; - for (int i = 0; i < num_devices; ++i) { - acDeviceReduceScal(devices[i], STREAM_DEFAULT, rtype, vtxbuffer_handle, &results[i]); + AcReal results[node->num_devices]; + for (int i = 0; i < node->num_devices; ++i) { + acDeviceReduceScal(node->devices[i], stream, rtype, vtxbuf_handle, &results[i]); } - return simple_final_reduce_scal(rtype, results, num_devices); + *result = simple_final_reduce_scal(node, rtype, results, node->num_devices); + return AC_SUCCESS; } AcResult -acNodeReduceVec(const Node node, const Stream stream_type, const ReductionType rtype, - const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1, - const VertexBufferHandle vtxbuf2, AcReal* result) +acNodeReduceVec(const Node node, const Stream stream, const ReductionType rtype, + const VertexBufferHandle a, const VertexBufferHandle b, const VertexBufferHandle c, + AcReal* result) { - acSynchronizeStream(STREAM_ALL); + acNodeSynchronizeStream(node, STREAM_ALL); - AcReal results[num_devices]; - for (int i = 0; i < num_devices; ++i) { - acDeviceReduceScal(devices[i], STREAM_DEFAULT, rtype, a, b, c, &results[i]); + AcReal results[node->num_devices]; + for (int i = 0; i < node->num_devices; ++i) { + acDeviceReduceVec(node->devices[i], stream, rtype, a, b, c, &results[i]); } - return simple_final_reduce_scal(rtype, results, num_devices); + *result = simple_final_reduce_scal(node, rtype, results, node->num_devices); + return AC_SUCCESS; }