diff --git a/include/astaroth.h b/include/astaroth.h
index 76f5174..ac20d34 100644
--- a/include/astaroth.h
+++ b/include/astaroth.h
@@ -232,6 +232,11 @@ AcResult acStore(AcMesh* host_mesh);
  * substep and the user is responsible for calling acBoundcondStep before reading the data. */
 AcResult acIntegrate(const AcReal dt);
 
+/** Performs Runge-Kutta 3 integration using general (customizable) boundary conditions.
+ * Note: Boundary conditions are not applied after the final substep and the user is
+ * responsible for calling acBoundcondStep before reading the data. */
+AcResult acIntegrateGBC(const AcMeshInfo config, const AcReal dt);
+
 /** Applies periodic boundary conditions for the Mesh distributed among the devices visible to
  * the caller*/
 AcResult acBoundcondStep(void);
diff --git a/src/core/astaroth.cc b/src/core/astaroth.cc
index 1c8d0f7..c565699 100644
--- a/src/core/astaroth.cc
+++ b/src/core/astaroth.cc
@@ -86,6 +86,13 @@ acIntegrate(const AcReal dt)
     return acNodeIntegrate(nodes[0], dt);
 }
 
+AcResult
+acIntegrateGBC(const AcMeshInfo config, const AcReal dt)
+{
+    return acNodeIntegrateGBC(nodes[0], config, dt);
+}
+
+
 AcResult
 acIntegrateStep(const int isubstep, const AcReal dt)
 {
@@ -109,6 +116,12 @@ acBoundcondStep(void)
     return acNodePeriodicBoundconds(nodes[0], STREAM_DEFAULT);
 }
 
+AcResult
+acBoundcondStepGBC(void) //TODO ADAPT
+{
+    return acNodeGeneralBoundconds(nodes[0], STREAM_DEFAULT);
+}
+
 AcReal
 acReduceScal(const ReductionType rtype, const VertexBufferHandle vtxbuf_handle)
 {
diff --git a/src/core/node.cc b/src/core/node.cc
index 552cf3f..58acd25 100644
--- a/src/core/node.cc
+++ b/src/core/node.cc
@@ -656,6 +656,28 @@ local_boundcondstep(const Node node, const Stream stream, const VertexBufferHand
     return AC_SUCCESS;
 }
 
+static AcResult
+local_boundcondstep_GBC(const Node node, const Stream stream, const VertexBufferHandle vtxbuf) //TODO ADAPT
+{
+    acNodeSynchronizeStream(node, stream);
+
+    if (node->num_devices > 1) {
+        // Local boundary conditions
+        // #pragma omp parallel for
+        for (int i = 0; i < node->num_devices; ++i) {
+            const int3 d0 = (int3){0, 0, NGHOST}; // DECOMPOSITION OFFSET HERE
+            const int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y, d0.z + node->subgrid.n.z};
+            acDeviceGeneralBoundcondStep(node->devices[i], stream, vtxbuf, d0, d1);
+        }
+    }
+    else {
+        acDeviceGeneralBoundcondStep(node->devices[0], stream, vtxbuf, (int3){0, 0, 0},
+                                     node->subgrid.m);
+    }
+    return AC_SUCCESS;
+}
+
+
 static AcResult
 global_boundcondstep(const Node node, const Stream stream, const VertexBufferHandle vtxbuf_handle)
 {
@@ -768,6 +790,85 @@ acNodeIntegrate(const Node node, const AcReal dt)
     return AC_SUCCESS;
 }
 
+AcResult
+acNodeIntegrateGBC(const Node node, const AcMeshInfo config, const AcReal dt) //TODO ADAPT: config not used yet
+{
+    acNodeSynchronizeStream(node, STREAM_ALL);
+    // xxx|OOO OOOOOOOOO OOO|xxx
+    //     ^   ^         ^  ^
+    //     n0  n1        n2 n3
+    // const int3 n0 = (int3){NGHOST, NGHOST, NGHOST};
+    // const int3 n1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
+    // const int3 n2 = node->grid.n;
+    // const int3 n3 = n0 + node->grid.n;
+
+    for (int isubstep = 0; isubstep < 3; ++isubstep) {
+        acNodeSynchronizeStream(node, STREAM_ALL);
+        for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {
+            local_boundcondstep_GBC(node, (Stream)vtxbuf, (VertexBufferHandle)vtxbuf);
+        }
+        acNodeSynchronizeStream(node, STREAM_ALL);
+
+        // Inner inner
+        // #pragma omp parallel for
+        for (int i = 0; i < node->num_devices; ++i) {
+            const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
+            const int3 m2 = node->subgrid.n;
+            acDeviceIntegrateSubstep(node->devices[i], STREAM_16, isubstep, m1, m2, dt);
+        }
+
+        for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {
+            acNodeSynchronizeVertexBuffer(node, (Stream)vtxbuf, (VertexBufferHandle)vtxbuf);
+            global_boundcondstep(node, (Stream)vtxbuf, (VertexBufferHandle)vtxbuf);
+        }
+        for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {
+            acNodeSynchronizeStream(node, (Stream)vtxbuf);
+        }
+
+        // #pragma omp parallel for
+        for (int i = 0; i < node->num_devices; ++i) { // Front
+            const int3 m1 = (int3){NGHOST, NGHOST, NGHOST};
+            const int3 m2 = m1 + (int3){node->subgrid.n.x, node->subgrid.n.y, NGHOST};
+            acDeviceIntegrateSubstep(node->devices[i], STREAM_0, isubstep, m1, m2, dt);
+        }
+        // #pragma omp parallel for
+        for (int i = 0; i < node->num_devices; ++i) { // Back
+            const int3 m1 = (int3){NGHOST, NGHOST, node->subgrid.n.z};
+            const int3 m2 = m1 + (int3){node->subgrid.n.x, node->subgrid.n.y, NGHOST};
+            acDeviceIntegrateSubstep(node->devices[i], STREAM_1, isubstep, m1, m2, dt);
+        }
+        // #pragma omp parallel for
+        for (int i = 0; i < node->num_devices; ++i) { // Bottom
+            const int3 m1 = (int3){NGHOST, NGHOST, 2 * NGHOST};
+            const int3 m2 = m1 + (int3){node->subgrid.n.x, NGHOST, node->subgrid.n.z - 2 * NGHOST};
+            acDeviceIntegrateSubstep(node->devices[i], STREAM_2, isubstep, m1, m2, dt);
+        }
+        // #pragma omp parallel for
+        for (int i = 0; i < node->num_devices; ++i) { // Top
+            const int3 m1 = (int3){NGHOST, node->subgrid.n.y, 2 * NGHOST};
+            const int3 m2 = m1 + (int3){node->subgrid.n.x, NGHOST, node->subgrid.n.z - 2 * NGHOST};
+            acDeviceIntegrateSubstep(node->devices[i], STREAM_3, isubstep, m1, m2, dt);
+        }
+        // #pragma omp parallel for
+        for (int i = 0; i < node->num_devices; ++i) { // Left
+            const int3 m1 = (int3){NGHOST, 2 * NGHOST, 2 * NGHOST};
+            const int3 m2 = m1 + (int3){NGHOST, node->subgrid.n.y - 2 * NGHOST,
+                                        node->subgrid.n.z - 2 * NGHOST};
+            acDeviceIntegrateSubstep(node->devices[i], STREAM_4, isubstep, m1, m2, dt);
+        }
+        // #pragma omp parallel for
+        for (int i = 0; i < node->num_devices; ++i) { // Right
+            const int3 m1 = (int3){node->subgrid.n.x, 2 * NGHOST, 2 * NGHOST};
+            const int3 m2 = m1 + (int3){NGHOST, node->subgrid.n.y - 2 * NGHOST,
+                                        node->subgrid.n.z - 2 * NGHOST};
+            acDeviceIntegrateSubstep(node->devices[i], STREAM_5, isubstep, m1, m2, dt);
+        }
+        acNodeSwapBuffers(node);
+    }
+    acNodeSynchronizeStream(node, STREAM_ALL);
+    return AC_SUCCESS;
+}
+
 AcResult
 acNodePeriodicBoundcondStep(const Node node, const Stream stream,
                             const VertexBufferHandle vtxbuf_handle)
@@ -783,21 +884,20 @@ acNodePeriodicBoundcondStep(const Node node, const Stream stream,
 }
 
 AcResult
-acNodeGeneralBoundcondStep(const Node node, const Stream stream,
+acNodeGeneralBoundcondStep(const Node node, const Stream stream, //TODO ADAPT
                            const VertexBufferHandle vtxbuf_handle)
 {
-    local_boundcondstep(node, stream, vtxbuf_handle);
+    local_boundcondstep_GBC(node, stream, vtxbuf_handle);
 
     acNodeSynchronizeVertexBuffer(node, stream, vtxbuf_handle);
 
-    //MV: assume that global step communication is handles by acNodePeriodicBoundcondStep as above.
-    //global_boundcondstep(node, stream, vtxbuf_handle);
+    global_boundcondstep(node, stream, vtxbuf_handle);
 
     return AC_SUCCESS;
 }
 
 AcResult
-acNodePeriodicBoundconds(const Node node, const Stream stream)
+acNodePeriodicBoundconds(const Node node, const Stream stream) //TODO ADAPT
 {
     for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
         acNodePeriodicBoundcondStep(node, stream, (VertexBufferHandle)i);
@@ -806,7 +906,7 @@ acNodePeriodicBoundconds(const Node node, const Stream stream)
 }
 
 AcResult
-acNodeGeneralBoundconds(const Node node, const Stream stream)
+acNodeGeneralBoundconds(const Node node, const Stream stream) //TODO ADAPT
 {
     for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
         acNodeGeneralBoundcondStep(node, stream, (VertexBufferHandle)i);
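
Usage sketch (not part of the diff): a minimal host-side driver showing the intended call sequence for the new general-boundary-condition (GBC) entry points. Only acIntegrateGBC and acBoundcondStepGBC come from this change; acInit/acLoad/acStore/acQuit and the config/mesh setup are assumed from the existing Astaroth host API, the timestep and step count are placeholders, and the sketch further assumes acBoundcondStepGBC is declared in astaroth.h (the header hunk above only declares acIntegrateGBC).

    #include "astaroth.h"

    int main(void)
    {
        AcMeshInfo config;
        // ... fill in mesh dimensions and boundary-condition parameters ...

        // Device setup elided: acInit(config) and acLoad(host_mesh) from the
        // existing API are assumed to have been called at this point.

        const AcReal dt = (AcReal)1e-3; // placeholder timestep

        for (int step = 0; step < 100; ++step) {
            // RK3 substeps with general boundary conditions. As with acIntegrate(),
            // boundary conditions are not applied after the final substep...
            acIntegrateGBC(config, dt);
            // ...so apply them explicitly before reading the data back.
            acBoundcondStepGBC();
        }

        // acStore(&host_mesh) and acQuit() would follow here.
        return 0;
    }

acBoundcondStepGBC() routes through acNodeGeneralBoundconds(), i.e. the same local_boundcondstep_GBC + global_boundcondstep path that acNodeIntegrateGBC uses between substeps, so the state read back after the loop receives the same boundary treatment.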
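For reference, the region split inside acNodeIntegrateGBC mirrors acNodeIntegrate: the bulk of each device's subgrid is integrated on STREAM_16, and the six ghost-adjacent shells are issued on STREAM_0..STREAM_5 after the per-buffer synchronization and global_boundcondstep calls. With NGHOST = 3 and a hypothetical per-device subgrid of n = (128, 128, 32) (numbers chosen only for illustration), the substep launches cover:

    inner:  m1 = (  6,   6,  6), m2 = (128, 128,  32)       // STREAM_16
    front:  m1 = (  3,   3,  3), m2 = m1 + (128, 128,  3)   // STREAM_0
    back:   m1 = (  3,   3, 32), m2 = m1 + (128, 128,  3)   // STREAM_1
    bottom: m1 = (  3,   3,  6), m2 = m1 + (128,   3, 26)   // STREAM_2
    top:    m1 = (  3, 128,  6), m2 = m1 + (128,   3, 26)   // STREAM_3
    left:   m1 = (  3,   6,  6), m2 = m1 + (  3, 122, 26)   // STREAM_4
    right:  m1 = (128,   6,  6), m2 = m1 + (  3, 122, 26)   // STREAM_5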