From b81180ff19cf320ca06c68459da896e9dcd44305 Mon Sep 17 00:00:00 2001 From: Miikka Vaisala Date: Tue, 15 Sep 2020 19:08:32 +0800 Subject: [PATCH 01/21] Desperate but perhaps wrongminded attempts. Periodic boundary conditions are etched too deep inside... --- src/core/node.cc | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/src/core/node.cc b/src/core/node.cc index 70c3115..552cf3f 100644 --- a/src/core/node.cc +++ b/src/core/node.cc @@ -782,6 +782,20 @@ acNodePeriodicBoundcondStep(const Node node, const Stream stream, return AC_SUCCESS; } +AcResult +acNodeGeneralBoundcondStep(const Node node, const Stream stream, + const VertexBufferHandle vtxbuf_handle) +{ + local_boundcondstep(node, stream, vtxbuf_handle); + acNodeSynchronizeVertexBuffer(node, stream, vtxbuf_handle); + + //MV: assume that global step communication is handles by acNodePeriodicBoundcondStep as above. + //global_boundcondstep(node, stream, vtxbuf_handle); + + + return AC_SUCCESS; +} + AcResult acNodePeriodicBoundconds(const Node node, const Stream stream) { @@ -791,6 +805,15 @@ acNodePeriodicBoundconds(const Node node, const Stream stream) return AC_SUCCESS; } +AcResult +acNodeGeneralBoundconds(const Node node, const Stream stream) +{ + for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { + acNodeGeneralBoundcondStep(node, stream, (VertexBufferHandle)i); + } + return AC_SUCCESS; +} + static AcReal simple_final_reduce_scal(const Node node, const ReductionType& rtype, const AcReal* results, const int& n) From 67aa87731bd7bb12f59fd0aee437baf549b34e3d Mon Sep 17 00:00:00 2001 From: Miikka Vaisala Date: Fri, 18 Sep 2020 15:25:14 +0800 Subject: [PATCH 02/21] Crafting code. Attempting to figure out the MPI domain decomposition etc. 
--- src/core/device.cc | 47 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/src/core/device.cc b/src/core/device.cc index a6ec793..52f20ed 100644 --- a/src/core/device.cc +++ b/src/core/device.cc @@ -433,6 +433,27 @@ acDevicePeriodicBoundconds(const Device device, const Stream stream, const int3 return AC_SUCCESS; } +AcResult +acDeviceGeneralBoundcondStep(const Device device, const Stream stream, + const VertexBufferHandle vtxbuf_handle, const int3 start, + const int3 end) +{ + cudaSetDevice(device->id); + return acKernelGeneralBoundconds(device->streams[stream], start, end, + device->vba.in[vtxbuf_handle]); +} + +AcResult +acDeviceGeneralBoundconds(const Device device, const Stream stream, const int3 start, + const int3 end) +{ + for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { + acDeviceGeneralBoundcondStep(device, stream, (VertexBufferHandle)i, start, end); + } + return AC_SUCCESS; +} + + AcResult acDeviceReduceScal(const Device device, const Stream stream, const ReductionType rtype, const VertexBufferHandle vtxbuf_handle, AcReal* result) @@ -1652,6 +1673,13 @@ acGridIntegrate(const Stream stream, const AcReal dt) acGridLoadScalarUniform(stream, AC_dt, dt); acDeviceSynchronizeStream(device, stream); + // Check the position in MPI frame + int nprocs, pid; + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + MPI_Comm_rank(MPI_COMM_WORLD, &pid); + const uint3_64 decomposition = decompose(nprocs); + const int3 pid3d = getPid3D(pid, decomposition); + // Corners #if MPI_INCL_CORNERS // Do not rm: required for corners @@ -1805,6 +1833,21 @@ acGridIntegrate(const Stream stream, const AcReal dt) acSyncCommData(sidexz_data); acSyncCommData(sideyz_data); #endif // MPI_COMM_ENABLED + + + // Set outer boudaries after substep computation. 
+ const int3 m1 = (int3){0, 0, 0}; + const int3 m2 = nn; + const int3 pid3d = getPid3D(pid, decomposition); + // If we are are a boundary element + if ((pid3d.x == 0) || (pid3d.x == decomposition.x - 1) || + (pid3d.y == 0) || (pid3d.y == decomposition.y - 1) || + (pid3d.z == 0) || (pid3d.z == decomposition.z - 1) ||) + { + acDeviceGeneralBoundconds(device, stream, m1, m2); + } + acGridSynchronizeStream(stream); + #if MPI_COMPUTE_ENABLED { // Front const int3 m1 = (int3){NGHOST, NGHOST, NGHOST}; @@ -1840,8 +1883,12 @@ acGridIntegrate(const Stream stream, const AcReal dt) acDeviceSwapBuffers(device); acDeviceSynchronizeStream(device, STREAM_ALL); // Wait until inner and outer done //////////////////////////////////////////// + } + + + return AC_SUCCESS; } From f736aa1cd1655ad8adbea3aebed3a3eee84d2fa7 Mon Sep 17 00:00:00 2001 From: Miikka Vaisala Date: Fri, 18 Sep 2020 16:55:36 +0800 Subject: [PATCH 03/21] Attemptiong to make kernels to go where they should. --- src/core/device.cc | 11 ++++++----- src/core/kernels/boundconds.cuh | 17 +++++++++++++++++ src/core/kernels/kernels.h | 3 +++ 3 files changed, 26 insertions(+), 5 deletions(-) diff --git a/src/core/device.cc b/src/core/device.cc index 52f20ed..a68fbd9 100644 --- a/src/core/device.cc +++ b/src/core/device.cc @@ -436,19 +436,19 @@ acDevicePeriodicBoundconds(const Device device, const Stream stream, const int3 AcResult acDeviceGeneralBoundcondStep(const Device device, const Stream stream, const VertexBufferHandle vtxbuf_handle, const int3 start, - const int3 end) + const int3 end, const int bound_direction) { cudaSetDevice(device->id); return acKernelGeneralBoundconds(device->streams[stream], start, end, - device->vba.in[vtxbuf_handle]); + device->vba.in[vtxbuf_handle], bound_direction); } AcResult acDeviceGeneralBoundconds(const Device device, const Stream stream, const int3 start, - const int3 end) + const int3 end, const int bound_direction) { for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { - 
acDeviceGeneralBoundcondStep(device, stream, (VertexBufferHandle)i, start, end); + acDeviceGeneralBoundcondStep(device, stream, (VertexBufferHandle)i, start, end, bound_direction); } return AC_SUCCESS; } @@ -1844,7 +1844,8 @@ acGridIntegrate(const Stream stream, const AcReal dt) (pid3d.y == 0) || (pid3d.y == decomposition.y - 1) || (pid3d.z == 0) || (pid3d.z == decomposition.z - 1) ||) { - acDeviceGeneralBoundconds(device, stream, m1, m2); + //TODO get bound_direction + acDeviceGeneralBoundconds(device, stream, m1, m2, bound_direction); } acGridSynchronizeStream(stream); diff --git a/src/core/kernels/boundconds.cuh b/src/core/kernels/boundconds.cuh index 6a31d58..1fa2c4c 100644 --- a/src/core/kernels/boundconds.cuh +++ b/src/core/kernels/boundconds.cuh @@ -60,3 +60,20 @@ acKernelPeriodicBoundconds(const cudaStream_t stream, const int3 start, const in ERRCHK_CUDA_KERNEL(); return AC_SUCCESS; } + +AcResult acKernelGeneralBoundconds(const cudaStream_t stream, const int3 start, const int3 end, + AcReal* vtxbuf, const int bound_direction); +{ + const dim3 tpb(8, 2, 8); + const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x), + (unsigned int)ceil((end.y - start.y) / (float)tpb.y), + (unsigned int)ceil((end.z - start.z) / (float)tpb.z)); + + if (DCONST(AC_bype) == BOUNDCOND_SYM) + { + kernel_symmetric_boundconds<<>>(start, end, vtxbuf, bound_direction); + ERRCHK_CUDA_KERNEL(); + } + + return AC_SUCCESS; +} diff --git a/src/core/kernels/kernels.h b/src/core/kernels/kernels.h index fc9c745..980bafa 100644 --- a/src/core/kernels/kernels.h +++ b/src/core/kernels/kernels.h @@ -43,6 +43,9 @@ extern "C" { /** */ AcResult acKernelPeriodicBoundconds(const cudaStream_t stream, const int3 start, const int3 end, AcReal* vtxbuf); +/** */ +AcResult acKernelGeneralBoundconds(const cudaStream_t stream, const int3 start, const int3 end, + AcReal* vtxbuf, const int bound_direction); /** */ AcResult acKernelDummy(void); From 1868031f1e74c36f455e8b913726c8f61e995e36 Mon Sep 17 
00:00:00 2001 From: Miikka Vaisala Date: Fri, 18 Sep 2020 17:25:34 +0800 Subject: [PATCH 04/21] Working on marking the active edges. --- src/core/device.cc | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/src/core/device.cc b/src/core/device.cc index a68fbd9..d1640c9 100644 --- a/src/core/device.cc +++ b/src/core/device.cc @@ -1840,12 +1840,24 @@ acGridIntegrate(const Stream stream, const AcReal dt) const int3 m2 = nn; const int3 pid3d = getPid3D(pid, decomposition); // If we are are a boundary element - if ((pid3d.x == 0) || (pid3d.x == decomposition.x - 1) || - (pid3d.y == 0) || (pid3d.y == decomposition.y - 1) || - (pid3d.z == 0) || (pid3d.z == decomposition.z - 1) ||) + int3 bindex = (int3){0, 0, 0}; + + // 1 is top edge, 2 bottom edge, 3 both edges, 0 is no boundary + if (pid3d.x == 0) { bindex.x = 1; } + else if (pid3d.x == decomposition.x - 1) { bindex.x = 2; } + else if ((pid3d.x == 0) && (pid3d.x == decomposition.x - 1)) { bindex.x = 3; } + + if (pid3d.y == 0) { bindex.y = 1; } + else if (pid3d.y == decomposition.y - 1) { bindex.y = 2; } + else if ((pid3d.y == 0) && (pid3d.y == decomposition.y - 1)) { bindex.y = 3; } + + if (pid3d.z == 0) { bindex.z = 1; } + else if (pid3d.z == decomposition.z - 1) { bindex.z = 2; } + else if ((pid3d.z == 0) && (pid3d.z == decomposition.z - 1)) { bindex.z = 3; } + { - //TODO get bound_direction - acDeviceGeneralBoundconds(device, stream, m1, m2, bound_direction); + //TODO get bound_direction -> bindex + acDeviceGeneralBoundconds(device, stream, m1, m2, bindex); } acGridSynchronizeStream(stream); From 2f85cbba1a0929d6cd638b65202238746290ad06 Mon Sep 17 00:00:00 2001 From: Miikka Vaisala Date: Mon, 21 Sep 2020 15:54:33 +0800 Subject: [PATCH 05/21] Last mimnnut modification before the meeting. 
--- src/core/device.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/core/device.cc b/src/core/device.cc index d1640c9..29ef9c8 100644 --- a/src/core/device.cc +++ b/src/core/device.cc @@ -436,19 +436,19 @@ acDevicePeriodicBoundconds(const Device device, const Stream stream, const int3 AcResult acDeviceGeneralBoundcondStep(const Device device, const Stream stream, const VertexBufferHandle vtxbuf_handle, const int3 start, - const int3 end, const int bound_direction) + const int3 end, const int3 bindex) { cudaSetDevice(device->id); return acKernelGeneralBoundconds(device->streams[stream], start, end, - device->vba.in[vtxbuf_handle], bound_direction); + device->vba.in[vtxbuf_handle], bindex); } AcResult acDeviceGeneralBoundconds(const Device device, const Stream stream, const int3 start, - const int3 end, const int bound_direction) + const int3 end, const int3 bindex) { for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { - acDeviceGeneralBoundcondStep(device, stream, (VertexBufferHandle)i, start, end, bound_direction); + acDeviceGeneralBoundcondStep(device, stream, (VertexBufferHandle)i, start, end, bindex); } return AC_SUCCESS; } From 9386129f1b98e33a698b6214e5f46e94e0c061d9 Mon Sep 17 00:00:00 2001 From: Miikka Vaisala Date: Tue, 29 Sep 2020 16:31:16 +0800 Subject: [PATCH 06/21] Cleaning and improving the boundary condition draft. --- src/core/device.cc | 63 +++++++++++++++++++++++++++------------------- 1 file changed, 37 insertions(+), 26 deletions(-) diff --git a/src/core/device.cc b/src/core/device.cc index 29ef9c8..cf20e85 100644 --- a/src/core/device.cc +++ b/src/core/device.cc @@ -1435,6 +1435,41 @@ acGridStoreMesh(const Stream stream, AcMesh* host_mesh) return AC_SUCCESS; } +AcResult +acGridGeneralBoundconds(const Device device, const Stream stream) +{ + // Non-periodic Boundary conditions + + // Set outer boudaries after substep computation. 
+ const int3 m1 = (int3){0, 0, 0}; + const int3 m2 = grid.nn; + const int3 pid3d = getPid3D(pid, grid.decomposition); + // If we are are a boundary element + int3 bindex = (int3){0, 0, 0}; + + // Check if there are active boundary condition edges. + // 0 is no boundary, 1 both edges, 2 is top edge, 3 bottom edge + if ((pid3d.x == 0) && (pid3d.x == decomposition.x - 1)) { bindex.x = 1; } + else if (pid3d.x == 0) { bindex.x = 2; } + else if (pid3d.x == decomposition.x - 1) { bindex.x = 3; } + + if ((pid3d.y == 0) && (pid3d.y == decomposition.y - 1)) { bindex.y = 1; } + else if (pid3d.y == 0) { bindex.y = 2; } + else if (pid3d.y == decomposition.y - 1) { bindex.y = 3; } + + if ((pid3d.z == 0) && (pid3d.z == decomposition.z - 1)) { bindex.z = 1; } + else if (pid3d.z == 0) { bindex.z = 2; } + else if (pid3d.z == decomposition.z - 1) { bindex.z = 3; } + + + if (bindex.x != 1) && (bindex.y != 1) && (bindex.z != 1) { + acDeviceGeneralBoundconds(device, stream, m1, m2, bindex); + } + acGridSynchronizeStream(stream); + + return AC_SUCCESS; +} + /* // Unused AcResult @@ -1834,32 +1869,8 @@ acGridIntegrate(const Stream stream, const AcReal dt) acSyncCommData(sideyz_data); #endif // MPI_COMM_ENABLED - - // Set outer boudaries after substep computation. 
- const int3 m1 = (int3){0, 0, 0}; - const int3 m2 = nn; - const int3 pid3d = getPid3D(pid, decomposition); - // If we are are a boundary element - int3 bindex = (int3){0, 0, 0}; - - // 1 is top edge, 2 bottom edge, 3 both edges, 0 is no boundary - if (pid3d.x == 0) { bindex.x = 1; } - else if (pid3d.x == decomposition.x - 1) { bindex.x = 2; } - else if ((pid3d.x == 0) && (pid3d.x == decomposition.x - 1)) { bindex.x = 3; } - - if (pid3d.y == 0) { bindex.y = 1; } - else if (pid3d.y == decomposition.y - 1) { bindex.y = 2; } - else if ((pid3d.y == 0) && (pid3d.y == decomposition.y - 1)) { bindex.y = 3; } - - if (pid3d.z == 0) { bindex.z = 1; } - else if (pid3d.z == decomposition.z - 1) { bindex.z = 2; } - else if ((pid3d.z == 0) && (pid3d.z == decomposition.z - 1)) { bindex.z = 3; } - - { - //TODO get bound_direction -> bindex - acDeviceGeneralBoundconds(device, stream, m1, m2, bindex); - } - acGridSynchronizeStream(stream); + // Invoke outer edge boundary conditions. + acGridGeneralBoundconds(device, stream) #if MPI_COMPUTE_ENABLED { // Front From 711cc4d350b1ed8c19286269a187e9fb9ead4073 Mon Sep 17 00:00:00 2001 From: Miikka Vaisala Date: Tue, 29 Sep 2020 16:36:43 +0800 Subject: [PATCH 07/21] Moving code in wrong place. --- src/core/device.cc | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/core/device.cc b/src/core/device.cc index cf20e85..d931925 100644 --- a/src/core/device.cc +++ b/src/core/device.cc @@ -1439,11 +1439,17 @@ AcResult acGridGeneralBoundconds(const Device device, const Stream stream) { // Non-periodic Boundary conditions + // Check the position in MPI frame + int nprocs, pid; + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + MPI_Comm_rank(MPI_COMM_WORLD, &pid); + const uint3_64 decomposition = decompose(nprocs); + const int3 pid3d = getPid3D(pid, decomposition); // Set outer boudaries after substep computation. 
const int3 m1 = (int3){0, 0, 0}; const int3 m2 = grid.nn; - const int3 pid3d = getPid3D(pid, grid.decomposition); + const int3 pid3d = getPid3D(pid, decomposition); // If we are are a boundary element int3 bindex = (int3){0, 0, 0}; @@ -1708,12 +1714,6 @@ acGridIntegrate(const Stream stream, const AcReal dt) acGridLoadScalarUniform(stream, AC_dt, dt); acDeviceSynchronizeStream(device, stream); - // Check the position in MPI frame - int nprocs, pid; - MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - MPI_Comm_rank(MPI_COMM_WORLD, &pid); - const uint3_64 decomposition = decompose(nprocs); - const int3 pid3d = getPid3D(pid, decomposition); // Corners #if MPI_INCL_CORNERS From 8667e2b2ec2719c2423f9003eac6dfeac038598d Mon Sep 17 00:00:00 2001 From: Miikka Vaisala Date: Tue, 29 Sep 2020 18:46:41 +0800 Subject: [PATCH 08/21] Entering kernel level --- src/core/kernels/boundconds.cuh | 47 ++++++++++++++++++++++++++++++--- 1 file changed, 44 insertions(+), 3 deletions(-) diff --git a/src/core/kernels/boundconds.cuh b/src/core/kernels/boundconds.cuh index 1fa2c4c..22d353b 100644 --- a/src/core/kernels/boundconds.cuh +++ b/src/core/kernels/boundconds.cuh @@ -1,5 +1,45 @@ #pragma once +static __global__ void +kernel_symmetric_boundconds(const int3 start, const int3 end, AcReal* vtxbuf, const int3 bindex) +{ + const int i_dst = start.x + threadIdx.x + blockIdx.x * blockDim.x; + const int j_dst = start.y + threadIdx.y + blockIdx.y * blockDim.y; + const int k_dst = start.z + threadIdx.z + blockIdx.z * blockDim.z; + + // If within the start-end range (this allows threadblock dims that are not + // divisible by end - start) + if (i_dst >= end.x || j_dst >= end.y || k_dst >= end.z) + return; + + // If destination index is inside the computational domain, return since + // the boundary conditions are only applied to the ghost zones + if (i_dst >= DCONST(AC_nx_min) && i_dst < DCONST(AC_nx_max) && j_dst >= DCONST(AC_ny_min) && + j_dst < DCONST(AC_ny_max) && k_dst >= DCONST(AC_nz_min) && k_dst < 
DCONST(AC_nz_max)) + return; + + // Find the source index + // Map to nx, ny, nz coordinates + int i_src, j_src, k_src; + int bsize = STENCIL_ORDER/(int) 2; + + if (bindex.x == 1) + { + if (i_dst < bsize) + { + i_src = 2*bsize - i_dst + } else if (i_dst >= DCONST(AC_nx_min) - bsize) + { + i_src = i_dst - 2*bsize BRAIN NOT WORKING CONTINUE TOMORROW + } + } + + const int src_idx = DEVICE_VTXBUF_IDX(i_src, j_src, k_src); + const int dst_idx = DEVICE_VTXBUF_IDX(i_dst, j_dst, k_dst); + vtxbuf[dst_idx] = vtxbuf[src_idx]; +} + + static __global__ void kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vtxbuf) { @@ -61,8 +101,9 @@ acKernelPeriodicBoundconds(const cudaStream_t stream, const int3 start, const in return AC_SUCCESS; } -AcResult acKernelGeneralBoundconds(const cudaStream_t stream, const int3 start, const int3 end, - AcReal* vtxbuf, const int bound_direction); +AcResult +acKernelGeneralBoundconds(const cudaStream_t stream, const int3 start, const int3 end, + AcReal* vtxbuf, const int bindex); { const dim3 tpb(8, 2, 8); const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x), @@ -71,7 +112,7 @@ AcResult acKernelGeneralBoundconds(const cudaStream_t stream, const int3 start, if (DCONST(AC_bype) == BOUNDCOND_SYM) { - kernel_symmetric_boundconds<<>>(start, end, vtxbuf, bound_direction); + kernel_symmetric_boundconds<<>>(start, end, vtxbuf, bindex); ERRCHK_CUDA_KERNEL(); } From 681035275d70e1fd0dd1342afb6514b19322ecd0 Mon Sep 17 00:00:00 2001 From: Miikka Vaisala Date: Wed, 30 Sep 2020 14:38:27 +0800 Subject: [PATCH 09/21] Draft of symmetric/antisymmetric boundary condition. 
--- src/core/kernels/boundconds.cuh | 54 +++++++++++++++++++++++++++------ 1 file changed, 45 insertions(+), 9 deletions(-) diff --git a/src/core/kernels/boundconds.cuh b/src/core/kernels/boundconds.cuh index 22d353b..4b35065 100644 --- a/src/core/kernels/boundconds.cuh +++ b/src/core/kernels/boundconds.cuh @@ -1,7 +1,7 @@ #pragma once static __global__ void -kernel_symmetric_boundconds(const int3 start, const int3 end, AcReal* vtxbuf, const int3 bindex) +kernel_symmetric_boundconds(const int3 start, const int3 end, AcReal* vtxbuf, const int3 bindex, const int sign) { const int i_dst = start.x + threadIdx.x + blockIdx.x * blockDim.x; const int j_dst = start.y + threadIdx.y + blockIdx.y * blockDim.y; @@ -20,23 +20,54 @@ kernel_symmetric_boundconds(const int3 start, const int3 end, AcReal* vtxbuf, co // Find the source index // Map to nx, ny, nz coordinates - int i_src, j_src, k_src; + int i_src, j_src, k_src, boundloc; int bsize = STENCIL_ORDER/(int) 2; - if (bindex.x == 1) + if (bindex.x != 0) { - if (i_dst < bsize) + // Pick up the mirroring value. + if ((i_dst < bsize) && ((bindex.x == 3) || (bindex.x ==1))) { - i_src = 2*bsize - i_dst - } else if (i_dst >= DCONST(AC_nx_min) - bsize) + boundloc = bsize; //Location of central border point. + i_src = 2*boundloc - i_dst; + } else if ((i_dst >= DCONST(AC_nx_min) - bsize) && ((bindex.x == 2) || (bindex.x ==1))) { - i_src = i_dst - 2*bsize BRAIN NOT WORKING CONTINUE TOMORROW + boundloc = DCONST(AC_nx_min) - bsize - 1; //Location of central border point. + i_src = 2*boundloc - i_dst; + } + } + if (bindex.y != 0) + { + // Pick up the mirroring value. + if ((j_dst < bsize) && ((bindex.y == 3) || (bindex.y ==1))) + { + boundloc = bsize; //Location of central border point. + j_src = 2*boundloc - j_dst; + } else if ((j_dst >= DCONST(AC_nx_min) - bsize) && ((bindex.y == 2) || (bindex.y ==1))) + { + boundloc = DCONST(AC_ny_min) - bsize - 1; //Location of central border point. 
+ i_src = 2*boundloc - j_dst; + } + } + if (bindex.z != 0) + { + // Pick up the mirroring value. + if ((k_dst < bsize) && ((bindex.z == 3) || (bindex.z ==1))) + { + boundloc = bsize; //Location of central border point. + k_src = 2*boundloc - k_dst; + } else if ((i_dst >= DCONST(AC_nz_min) - bsize) && ((bindex.z == 2) || (bindex.z ==1))) + { + boundloc = DCONST(AC_nz_min) - bsize - 1; //Location of central border point. + k_src = 2*boundloc - k_dst; } } + + const int src_idx = DEVICE_VTXBUF_IDX(i_src, j_src, k_src); const int dst_idx = DEVICE_VTXBUF_IDX(i_dst, j_dst, k_dst); - vtxbuf[dst_idx] = vtxbuf[src_idx]; + vtxbuf[dst_idx] = sign*vtxbuf[src_idx]; // sign = 1 symmetric, sign = -1 antisymmetric } @@ -112,7 +143,12 @@ acKernelGeneralBoundconds(const cudaStream_t stream, const int3 start, const int if (DCONST(AC_bype) == BOUNDCOND_SYM) { - kernel_symmetric_boundconds<<>>(start, end, vtxbuf, bindex); + kernel_symmetric_boundconds<<>>(start, end, vtxbuf, bindex, 1); + ERRCHK_CUDA_KERNEL(); + } + else if (DCONST(AC_bype) == BOUNDCOND_ASYM) + { + kernel_symmetric_boundconds<<>>(start, end, vtxbuf, bindex, -1); ERRCHK_CUDA_KERNEL(); } From cf6b21e3ab5b4d3ee6cd4d511e9b4c47ed56f972 Mon Sep 17 00:00:00 2001 From: Miikka Vaisala Date: Tue, 6 Oct 2020 17:49:36 +0800 Subject: [PATCH 10/21] Created separate acGridIntegrateNonperiodic() from +acGridIntegrate() this is to avoid some potential issues with an upcoming merge by Oskar. 
--- src/core/device.cc | 215 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 215 insertions(+) diff --git a/src/core/device.cc b/src/core/device.cc index d931925..9f5a8e1 100644 --- a/src/core/device.cc +++ b/src/core/device.cc @@ -1714,6 +1714,220 @@ acGridIntegrate(const Stream stream, const AcReal dt) acGridLoadScalarUniform(stream, AC_dt, dt); acDeviceSynchronizeStream(device, stream); +// Corners +#if MPI_INCL_CORNERS + // Do not rm: required for corners + const int3 corner_b0s[] = { + (int3){0, 0, 0}, + (int3){NGHOST + nn.x, 0, 0}, + (int3){0, NGHOST + nn.y, 0}, + (int3){0, 0, NGHOST + nn.z}, + + (int3){NGHOST + nn.x, NGHOST + nn.y, 0}, + (int3){NGHOST + nn.x, 0, NGHOST + nn.z}, + (int3){0, NGHOST + nn.y, NGHOST + nn.z}, + (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST + nn.z}, + }; +#endif // MPI_INCL_CORNERS + + // Edges X + const int3 edgex_b0s[] = { + (int3){NGHOST, 0, 0}, + (int3){NGHOST, NGHOST + nn.y, 0}, + + (int3){NGHOST, 0, NGHOST + nn.z}, + (int3){NGHOST, NGHOST + nn.y, NGHOST + nn.z}, + }; + + // Edges Y + const int3 edgey_b0s[] = { + (int3){0, NGHOST, 0}, + (int3){NGHOST + nn.x, NGHOST, 0}, + + (int3){0, NGHOST, NGHOST + nn.z}, + (int3){NGHOST + nn.x, NGHOST, NGHOST + nn.z}, + }; + + // Edges Z + const int3 edgez_b0s[] = { + (int3){0, 0, NGHOST}, + (int3){NGHOST + nn.x, 0, NGHOST}, + + (int3){0, NGHOST + nn.y, NGHOST}, + (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST}, + }; + + // Sides XY + const int3 sidexy_b0s[] = { + (int3){NGHOST, NGHOST, 0}, // + (int3){NGHOST, NGHOST, NGHOST + nn.z}, // + }; + + // Sides XZ + const int3 sidexz_b0s[] = { + (int3){NGHOST, 0, NGHOST}, // + (int3){NGHOST, NGHOST + nn.y, NGHOST}, // + }; + + // Sides YZ + const int3 sideyz_b0s[] = { + (int3){0, NGHOST, NGHOST}, // + (int3){NGHOST + nn.x, NGHOST, NGHOST}, // + }; + + for (int isubstep = 0; isubstep < 3; ++isubstep) { + +#if MPI_COMM_ENABLED +#if MPI_INCL_CORNERS + acPackCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners 
+#endif // MPI_INCL_CORNERS + acPackCommData(device, edgex_b0s, &edgex_data); + acPackCommData(device, edgey_b0s, &edgey_data); + acPackCommData(device, edgez_b0s, &edgez_data); + acPackCommData(device, sidexy_b0s, &sidexy_data); + acPackCommData(device, sidexz_b0s, &sidexz_data); + acPackCommData(device, sideyz_b0s, &sideyz_data); +#endif + +#if MPI_COMM_ENABLED + MPI_Barrier(MPI_COMM_WORLD); + +#if MPI_GPUDIRECT_DISABLED +#if MPI_INCL_CORNERS + acTransferCommDataToHost(device, &corner_data); // Do not rm: required for corners +#endif // MPI_INCL_CORNERS + acTransferCommDataToHost(device, &edgex_data); + acTransferCommDataToHost(device, &edgey_data); + acTransferCommDataToHost(device, &edgez_data); + acTransferCommDataToHost(device, &sidexy_data); + acTransferCommDataToHost(device, &sidexz_data); + acTransferCommDataToHost(device, &sideyz_data); +#endif +#if MPI_INCL_CORNERS + acTransferCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners +#endif // MPI_INCL_CORNERS + acTransferCommData(device, edgex_b0s, &edgex_data); + acTransferCommData(device, edgey_b0s, &edgey_data); + acTransferCommData(device, edgez_b0s, &edgez_data); + acTransferCommData(device, sidexy_b0s, &sidexy_data); + acTransferCommData(device, sidexz_b0s, &sidexz_data); + acTransferCommData(device, sideyz_b0s, &sideyz_data); +#endif // MPI_COMM_ENABLED + +#if MPI_COMPUTE_ENABLED + //////////// INNER INTEGRATION ////////////// + { + const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST}; + const int3 m2 = nn; + acKernelIntegrateSubstep(device->streams[STREAM_16], isubstep, m1, m2, device->vba); + } +//////////////////////////////////////////// +#endif // MPI_COMPUTE_ENABLED + +#if MPI_COMM_ENABLED +#if MPI_INCL_CORNERS + acTransferCommDataWait(corner_data); // Do not rm: required for corners +#endif // MPI_INCL_CORNERS + acTransferCommDataWait(edgex_data); + acTransferCommDataWait(edgey_data); + acTransferCommDataWait(edgez_data); + acTransferCommDataWait(sidexy_data); + 
acTransferCommDataWait(sidexz_data); + acTransferCommDataWait(sideyz_data); + +#if MPI_INCL_CORNERS + acUnpinCommData(device, &corner_data); // Do not rm: required for corners +#endif // MPI_INCL_CORNERS + acUnpinCommData(device, &edgex_data); + acUnpinCommData(device, &edgey_data); + acUnpinCommData(device, &edgez_data); + acUnpinCommData(device, &sidexy_data); + acUnpinCommData(device, &sidexz_data); + acUnpinCommData(device, &sideyz_data); + +#if MPI_INCL_CORNERS + acUnpackCommData(device, corner_b0s, &corner_data); +#endif // MPI_INCL_CORNERS + acUnpackCommData(device, edgex_b0s, &edgex_data); + acUnpackCommData(device, edgey_b0s, &edgey_data); + acUnpackCommData(device, edgez_b0s, &edgez_data); + acUnpackCommData(device, sidexy_b0s, &sidexy_data); + acUnpackCommData(device, sidexz_b0s, &sidexz_data); + acUnpackCommData(device, sideyz_b0s, &sideyz_data); +//////////// OUTER INTEGRATION ////////////// + +// Wait for unpacking +#if MPI_INCL_CORNERS + acSyncCommData(corner_data); // Do not rm: required for corners +#endif // MPI_INCL_CORNERS + acSyncCommData(edgex_data); + acSyncCommData(edgey_data); + acSyncCommData(edgez_data); + acSyncCommData(sidexy_data); + acSyncCommData(sidexz_data); + acSyncCommData(sideyz_data); +#endif // MPI_COMM_ENABLED +#if MPI_COMPUTE_ENABLED + { // Front + const int3 m1 = (int3){NGHOST, NGHOST, NGHOST}; + const int3 m2 = m1 + (int3){nn.x, nn.y, NGHOST}; + acKernelIntegrateSubstep(device->streams[STREAM_0], isubstep, m1, m2, device->vba); + } + { // Back + const int3 m1 = (int3){NGHOST, NGHOST, nn.z}; + const int3 m2 = m1 + (int3){nn.x, nn.y, NGHOST}; + acKernelIntegrateSubstep(device->streams[STREAM_1], isubstep, m1, m2, device->vba); + } + { // Bottom + const int3 m1 = (int3){NGHOST, NGHOST, 2 * NGHOST}; + const int3 m2 = m1 + (int3){nn.x, NGHOST, nn.z - 2 * NGHOST}; + acKernelIntegrateSubstep(device->streams[STREAM_2], isubstep, m1, m2, device->vba); + } + { // Top + const int3 m1 = (int3){NGHOST, nn.y, 2 * NGHOST}; + const int3 
m2 = m1 + (int3){nn.x, NGHOST, nn.z - 2 * NGHOST}; + acKernelIntegrateSubstep(device->streams[STREAM_3], isubstep, m1, m2, device->vba); + } + { // Left + const int3 m1 = (int3){NGHOST, 2 * NGHOST, 2 * NGHOST}; + const int3 m2 = m1 + (int3){NGHOST, nn.y - 2 * NGHOST, nn.z - 2 * NGHOST}; + acKernelIntegrateSubstep(device->streams[STREAM_4], isubstep, m1, m2, device->vba); + } + { // Right + const int3 m1 = (int3){nn.x, 2 * NGHOST, 2 * NGHOST}; + const int3 m2 = m1 + (int3){NGHOST, nn.y - 2 * NGHOST, nn.z - 2 * NGHOST}; + acKernelIntegrateSubstep(device->streams[STREAM_5], isubstep, m1, m2, device->vba); + } +#endif // MPI_COMPUTE_ENABLED + acDeviceSwapBuffers(device); + acDeviceSynchronizeStream(device, STREAM_ALL); // Wait until inner and outer done + //////////////////////////////////////////// + } + + return AC_SUCCESS; +} + +AcResult +acGridIntegrateNonperiodic(const Stream stream, const AcReal dt) +{ + ERRCHK(grid.initialized); + acGridSynchronizeStream(stream); + + const Device device = grid.device; + const int3 nn = grid.nn; +#if MPI_INCL_CORNERS + CommData corner_data = grid.corner_data; // Do not rm: required for corners +#endif // MPI_INCL_CORNERS + CommData edgex_data = grid.edgex_data; + CommData edgey_data = grid.edgey_data; + CommData edgez_data = grid.edgez_data; + CommData sidexy_data = grid.sidexy_data; + CommData sidexz_data = grid.sidexz_data; + CommData sideyz_data = grid.sideyz_data; + + acGridLoadScalarUniform(stream, AC_dt, dt); + acDeviceSynchronizeStream(device, stream); + // Corners #if MPI_INCL_CORNERS @@ -1916,6 +2130,7 @@ acGridIntegrate(const Stream stream, const AcReal dt) return AC_SUCCESS; } + AcResult acGridPeriodicBoundconds(const Stream stream) { From d060e67bec5e3912240439349e16565e8c207677 Mon Sep 17 00:00:00 2001 From: Miikka Vaisala Date: Tue, 6 Oct 2020 18:16:24 +0800 Subject: [PATCH 11/21] Includes also --- include/astaroth.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/include/astaroth.h 
b/include/astaroth.h index d32b367..8db7608 100644 --- a/include/astaroth.h +++ b/include/astaroth.h @@ -303,9 +303,15 @@ AcResult acGridStoreMesh(const Stream stream, AcMesh* host_mesh); /** */ AcResult acGridIntegrate(const Stream stream, const AcReal dt); +/** */ +AcResult acGridIntegrateNonperiodic(const Stream stream, const AcReal dt); + /** */ AcResult acGridPeriodicBoundconds(const Stream stream); +/** */ +AcResult acGridGeneralBoundconds(const Device device, const Stream stream); + /** TODO */ AcResult acGridReduceScal(const Stream stream, const ReductionType rtype, const VertexBufferHandle vtxbuf_handle, AcReal* result); @@ -562,6 +568,16 @@ AcResult acDevicePeriodicBoundcondStep(const Device device, const Stream stream, AcResult acDevicePeriodicBoundconds(const Device device, const Stream stream, const int3 start, const int3 end); +/** */ +AcResult acDeviceGeneralBoundcondStep(const Device device, const Stream stream, + const VertexBufferHandle vtxbuf_handle, const int3 start, + const int3 end, const int3 bindex); + +/** */ +AcResult acDeviceGeneralBoundconds(const Device device, const Stream stream, const int3 start, + const int3 end, const int3 bindex); + + /** */ AcResult acDeviceReduceScal(const Device device, const Stream stream, const ReductionType rtype, const VertexBufferHandle vtxbuf_handle, AcReal* result); From cb15668f2d364be854a0d5bd8a769b82e429f482 Mon Sep 17 00:00:00 2001 From: Miikka Vaisala Date: Fri, 20 Nov 2020 11:58:15 +0800 Subject: [PATCH 12/21] Figuring out compilations. 
--- acc/mhd_solver/stencil_kernel.ac | 5 ++++- include/astaroth.h | 6 ++++++ src/core/device.cc | 4 ++++ src/core/kernels/boundconds.cuh | 10 +++++++--- 4 files changed, 21 insertions(+), 4 deletions(-) diff --git a/acc/mhd_solver/stencil_kernel.ac b/acc/mhd_solver/stencil_kernel.ac index e0efa94..57097f8 100644 --- a/acc/mhd_solver/stencil_kernel.ac +++ b/acc/mhd_solver/stencil_kernel.ac @@ -20,9 +20,12 @@ uniform int AC_max_steps; uniform int AC_save_steps; uniform int AC_bin_steps; -uniform int AC_bc_type; uniform int AC_start_step; +// In3 params +uniform int3 AC_bc_type_bot; +uniform int3 AC_bc_type_top; + // Real params uniform Scalar AC_dt; uniform Scalar AC_max_time; diff --git a/include/astaroth.h b/include/astaroth.h index 8db7608..ace32c0 100644 --- a/include/astaroth.h +++ b/include/astaroth.h @@ -304,7 +304,13 @@ AcResult acGridStoreMesh(const Stream stream, AcMesh* host_mesh); AcResult acGridIntegrate(const Stream stream, const AcReal dt); /** */ +/* MV: Commented out for a while, but save for the future when standalone_MPI + works with periodic boundary conditions. +AcResult +acGridIntegrateNonperiodic(const Stream stream, const AcReal dt) + AcResult acGridIntegrateNonperiodic(const Stream stream, const AcReal dt); +*/ /** */ AcResult acGridPeriodicBoundconds(const Stream stream); diff --git a/src/core/device.cc b/src/core/device.cc index 9f5a8e1..ad46148 100644 --- a/src/core/device.cc +++ b/src/core/device.cc @@ -1907,6 +1907,8 @@ acGridIntegrate(const Stream stream, const AcReal dt) return AC_SUCCESS; } +/* MV: Commented out for a while, but save for the future when standalone_MPI + works with periodic boundary conditions. 
AcResult acGridIntegrateNonperiodic(const Stream stream, const AcReal dt) { @@ -2130,6 +2132,8 @@ acGridIntegrateNonperiodic(const Stream stream, const AcReal dt) return AC_SUCCESS; } +*/ + AcResult acGridPeriodicBoundconds(const Stream stream) diff --git a/src/core/kernels/boundconds.cuh b/src/core/kernels/boundconds.cuh index 4b35065..01e89fa 100644 --- a/src/core/kernels/boundconds.cuh +++ b/src/core/kernels/boundconds.cuh @@ -1,5 +1,9 @@ #pragma once +// Temporary defines untils we can figure out smarter swithches. +#define BOUNDCOND_SYM 1 +#define BOUNDCOND_ASYM 2 + static __global__ void kernel_symmetric_boundconds(const int3 start, const int3 end, AcReal* vtxbuf, const int3 bindex, const int sign) { @@ -134,19 +138,19 @@ acKernelPeriodicBoundconds(const cudaStream_t stream, const int3 start, const in AcResult acKernelGeneralBoundconds(const cudaStream_t stream, const int3 start, const int3 end, - AcReal* vtxbuf, const int bindex); + AcReal* vtxbuf, const int3 bindex) { const dim3 tpb(8, 2, 8); const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x), (unsigned int)ceil((end.y - start.y) / (float)tpb.y), (unsigned int)ceil((end.z - start.z) / (float)tpb.z)); - if (DCONST(AC_bype) == BOUNDCOND_SYM) + if (DCONST(AC_bc_type) == BOUNDCOND_SYM) { kernel_symmetric_boundconds<<>>(start, end, vtxbuf, bindex, 1); ERRCHK_CUDA_KERNEL(); } - else if (DCONST(AC_bype) == BOUNDCOND_ASYM) + else if (DCONST(AC_bc_type) == BOUNDCOND_ASYM) { kernel_symmetric_boundconds<<>>(start, end, vtxbuf, bindex, -1); ERRCHK_CUDA_KERNEL(); From efd3cc40cd98d05da6244b6ba5150b7d7646a43f Mon Sep 17 00:00:00 2001 From: Miikka Vaisala Date: Fri, 20 Nov 2020 14:11:14 +0800 Subject: [PATCH 13/21] Compiles without the API function call. 
--- include/astaroth.h | 9 +++++++-- src/core/device.cc | 11 +++++++---- src/core/kernels/boundconds.cuh | 13 ++++++------- src/core/kernels/kernels.h | 3 ++- 4 files changed, 22 insertions(+), 14 deletions(-) diff --git a/include/astaroth.h b/include/astaroth.h index ace32c0..76f5174 100644 --- a/include/astaroth.h +++ b/include/astaroth.h @@ -49,6 +49,11 @@ typedef struct { typedef enum { AC_SUCCESS = 0, AC_FAILURE = 1 } AcResult; +// Neming the associated number of the boundary condition types +typedef enum { AC_BOUNDCOND_PERIODIC = 0, + AC_BOUNDCOND_SYMMETRIC = 1, + AC_BOUNDCOND_ANTISYMMETRIC = 2 } AcBoundcond; + #define AC_GEN_ID(X) X, typedef enum { AC_FOR_RTYPES(AC_GEN_ID) // @@ -577,11 +582,11 @@ AcResult acDevicePeriodicBoundconds(const Device device, const Stream stream, co /** */ AcResult acDeviceGeneralBoundcondStep(const Device device, const Stream stream, const VertexBufferHandle vtxbuf_handle, const int3 start, - const int3 end, const int3 bindex); + const int3 end, const AcMeshInfo config, const int3 bindex); /** */ AcResult acDeviceGeneralBoundconds(const Device device, const Stream stream, const int3 start, - const int3 end, const int3 bindex); + const int3 end, const AcMeshInfo config, const int3 bindex); /** */ diff --git a/src/core/device.cc b/src/core/device.cc index ad46148..9130f34 100644 --- a/src/core/device.cc +++ b/src/core/device.cc @@ -436,19 +436,19 @@ acDevicePeriodicBoundconds(const Device device, const Stream stream, const int3 AcResult acDeviceGeneralBoundcondStep(const Device device, const Stream stream, const VertexBufferHandle vtxbuf_handle, const int3 start, - const int3 end, const int3 bindex) + const int3 end, const AcMeshInfo config, const int3 bindex) { cudaSetDevice(device->id); return acKernelGeneralBoundconds(device->streams[stream], start, end, - device->vba.in[vtxbuf_handle], bindex); + device->vba.in[vtxbuf_handle], config, bindex); } AcResult acDeviceGeneralBoundconds(const Device device, const Stream stream, const 
int3 start, - const int3 end, const int3 bindex) + const int3 end, const AcMeshInfo config, const int3 bindex) { for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { - acDeviceGeneralBoundcondStep(device, stream, (VertexBufferHandle)i, start, end, bindex); + acDeviceGeneralBoundcondStep(device, stream, (VertexBufferHandle)i, start, end, config, bindex); } return AC_SUCCESS; } @@ -1435,6 +1435,8 @@ acGridStoreMesh(const Stream stream, AcMesh* host_mesh) return AC_SUCCESS; } +/* MV: Commented out for a while, but save for the future when standalone_MPI + works with periodic boundary conditions. AcResult acGridGeneralBoundconds(const Device device, const Stream stream) { @@ -1475,6 +1477,7 @@ acGridGeneralBoundconds(const Device device, const Stream stream) return AC_SUCCESS; } +*/ /* // Unused diff --git a/src/core/kernels/boundconds.cuh b/src/core/kernels/boundconds.cuh index 01e89fa..64b4a1e 100644 --- a/src/core/kernels/boundconds.cuh +++ b/src/core/kernels/boundconds.cuh @@ -1,9 +1,5 @@ #pragma once -// Temporary defines untils we can figure out smarter swithches. 
-#define BOUNDCOND_SYM 1 -#define BOUNDCOND_ASYM 2 - static __global__ void kernel_symmetric_boundconds(const int3 start, const int3 end, AcReal* vtxbuf, const int3 bindex, const int sign) { @@ -138,19 +134,22 @@ acKernelPeriodicBoundconds(const cudaStream_t stream, const int3 start, const in AcResult acKernelGeneralBoundconds(const cudaStream_t stream, const int3 start, const int3 end, - AcReal* vtxbuf, const int3 bindex) + AcReal* vtxbuf, const AcMeshInfo config, const int3 bindex) { const dim3 tpb(8, 2, 8); const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x), (unsigned int)ceil((end.y - start.y) / (float)tpb.y), (unsigned int)ceil((end.z - start.z) / (float)tpb.z)); - if (DCONST(AC_bc_type) == BOUNDCOND_SYM) + int3 bc_top = config.int3_params[AC_bc_type_top]; + int3 bc_bot = config.int3_params[AC_bc_type_bot]; + + if (bc_top.x == AC_BOUNDCOND_SYMMETRIC) { kernel_symmetric_boundconds<<>>(start, end, vtxbuf, bindex, 1); ERRCHK_CUDA_KERNEL(); } - else if (DCONST(AC_bc_type) == BOUNDCOND_ASYM) + else if (bc_bot.x == AC_BOUNDCOND_ANTISYMMETRIC) { kernel_symmetric_boundconds<<>>(start, end, vtxbuf, bindex, -1); ERRCHK_CUDA_KERNEL(); diff --git a/src/core/kernels/kernels.h b/src/core/kernels/kernels.h index 980bafa..bc94185 100644 --- a/src/core/kernels/kernels.h +++ b/src/core/kernels/kernels.h @@ -45,7 +45,8 @@ AcResult acKernelPeriodicBoundconds(const cudaStream_t stream, const int3 start, AcReal* vtxbuf); /** */ AcResult acKernelGeneralBoundconds(const cudaStream_t stream, const int3 start, const int3 end, - AcReal* vtxbuf, const int bound_direction); + AcReal* vtxbuf, const AcMeshInfo config, const int3 bindex); + /** */ AcResult acKernelDummy(void); From 8732480cd99fc6bc253fc3d06d40a91a839391e0 Mon Sep 17 00:00:00 2001 From: Miikka Vaisala Date: Fri, 20 Nov 2020 15:37:37 +0800 Subject: [PATCH 14/21] Added acIntegrateGBC() --- include/astaroth.h | 5 ++ src/core/astaroth.cc | 13 +++++ src/core/node.cc | 112 
++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 124 insertions(+), 6 deletions(-) diff --git a/include/astaroth.h b/include/astaroth.h index 76f5174..ac20d34 100644 --- a/include/astaroth.h +++ b/include/astaroth.h @@ -232,6 +232,11 @@ AcResult acStore(AcMesh* host_mesh); * substep and the user is responsible for calling acBoundcondStep before reading the data. */ AcResult acIntegrate(const AcReal dt); +/** Performs Runge-Kutta 3 integration. Note: Boundary conditions are not applied after the final + * substep and the user is responsible for calling acBoundcondStep before reading the data. + * Has customizable boundary conditions. */ +AcResult acIntegrateGBC(const AcMeshInfo config, const AcReal dt); + /** Applies periodic boundary conditions for the Mesh distributed among the devices visible to * the caller*/ AcResult acBoundcondStep(void); diff --git a/src/core/astaroth.cc b/src/core/astaroth.cc index 1c8d0f7..c565699 100644 --- a/src/core/astaroth.cc +++ b/src/core/astaroth.cc @@ -86,6 +86,13 @@ acIntegrate(const AcReal dt) return acNodeIntegrate(nodes[0], dt); } +AcResult +acIntegrateGBC(const AcMeshInfo config, const AcReal dt) +{ + return acNodeIntegrateGBC(nodes[0], config, dt); +} + + AcResult acIntegrateStep(const int isubstep, const AcReal dt) { @@ -109,6 +116,12 @@ acBoundcondStep(void) return acNodePeriodicBoundconds(nodes[0], STREAM_DEFAULT); } +AcResult +acBoundcondStepGBC(void) //TODO ADAPT +{ + return acNodeGeneralBoundconds(nodes[0], STREAM_DEFAULT); +} + AcReal acReduceScal(const ReductionType rtype, const VertexBufferHandle vtxbuf_handle) { diff --git a/src/core/node.cc b/src/core/node.cc index 552cf3f..58acd25 100644 --- a/src/core/node.cc +++ b/src/core/node.cc @@ -656,6 +656,28 @@ local_boundcondstep(const Node node, const Stream stream, const VertexBufferHand return AC_SUCCESS; } +static AcResult +local_boundcondstep_GBC(const Node node, const Stream stream, const VertexBufferHandle vtxbuf) //TODO ADAPT +{ + 
acNodeSynchronizeStream(node, stream); + + if (node->num_devices > 1) { + // Local boundary conditions + // #pragma omp parallel for + for (int i = 0; i < node->num_devices; ++i) { + const int3 d0 = (int3){0, 0, NGHOST}; // DECOMPOSITION OFFSET HERE + const int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y, d0.z + node->subgrid.n.z}; + acDeviceGeneralBoundcondStep(node->devices[i], stream, vtxbuf, d0, d1); + } + } + else { + acDeviceGeneralBoundcondStep(node->devices[0], stream, vtxbuf, (int3){0, 0, 0}, + node->subgrid.m); + } + return AC_SUCCESS; +} + + static AcResult global_boundcondstep(const Node node, const Stream stream, const VertexBufferHandle vtxbuf_handle) { @@ -768,6 +790,85 @@ acNodeIntegrate(const Node node, const AcReal dt) return AC_SUCCESS; } +AcResult +acNodeIntegrateGBG(const Node node, const AcReal dt) //TODO ADAPT +{ + acNodeSynchronizeStream(node, STREAM_ALL); + // xxx|OOO OOOOOOOOO OOO|xxx + // ^ ^ ^ ^ + // n0 n1 n2 n3 + // const int3 n0 = (int3){NGHOST, NGHOST, NGHOST}; + // const int3 n1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST}; + // const int3 n2 = node->grid.n; + // const int3 n3 = n0 + node->grid.n; + + for (int isubstep = 0; isubstep < 3; ++isubstep) { + acNodeSynchronizeStream(node, STREAM_ALL); + for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) { + local_boundcondstep_GBC(node, (Stream)vtxbuf, (VertexBufferHandle)vtxbuf); + } + acNodeSynchronizeStream(node, STREAM_ALL); + + // Inner inner + // #pragma omp parallel for + for (int i = 0; i < node->num_devices; ++i) { + const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST}; + const int3 m2 = node->subgrid.n; + acDeviceIntegrateSubstep(node->devices[i], STREAM_16, isubstep, m1, m2, dt); + } + + for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) { + acNodeSynchronizeVertexBuffer(node, (Stream)vtxbuf, (VertexBufferHandle)vtxbuf); + global_boundcondstep(node, (Stream)vtxbuf, (VertexBufferHandle)vtxbuf); + } + for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; 
++vtxbuf) { + acNodeSynchronizeStream(node, (Stream)vtxbuf); + } + + // #pragma omp parallel for + for (int i = 0; i < node->num_devices; ++i) { // Front + const int3 m1 = (int3){NGHOST, NGHOST, NGHOST}; + const int3 m2 = m1 + (int3){node->subgrid.n.x, node->subgrid.n.y, NGHOST}; + acDeviceIntegrateSubstep(node->devices[i], STREAM_0, isubstep, m1, m2, dt); + } + // #pragma omp parallel for + for (int i = 0; i < node->num_devices; ++i) { // Back + const int3 m1 = (int3){NGHOST, NGHOST, node->subgrid.n.z}; + const int3 m2 = m1 + (int3){node->subgrid.n.x, node->subgrid.n.y, NGHOST}; + acDeviceIntegrateSubstep(node->devices[i], STREAM_1, isubstep, m1, m2, dt); + } + // #pragma omp parallel for + for (int i = 0; i < node->num_devices; ++i) { // Bottom + const int3 m1 = (int3){NGHOST, NGHOST, 2 * NGHOST}; + const int3 m2 = m1 + (int3){node->subgrid.n.x, NGHOST, node->subgrid.n.z - 2 * NGHOST}; + acDeviceIntegrateSubstep(node->devices[i], STREAM_2, isubstep, m1, m2, dt); + } + // #pragma omp parallel for + for (int i = 0; i < node->num_devices; ++i) { // Top + const int3 m1 = (int3){NGHOST, node->subgrid.n.y, 2 * NGHOST}; + const int3 m2 = m1 + (int3){node->subgrid.n.x, NGHOST, node->subgrid.n.z - 2 * NGHOST}; + acDeviceIntegrateSubstep(node->devices[i], STREAM_3, isubstep, m1, m2, dt); + } + // #pragma omp parallel for + for (int i = 0; i < node->num_devices; ++i) { // Left + const int3 m1 = (int3){NGHOST, 2 * NGHOST, 2 * NGHOST}; + const int3 m2 = m1 + (int3){NGHOST, node->subgrid.n.y - 2 * NGHOST, + node->subgrid.n.z - 2 * NGHOST}; + acDeviceIntegrateSubstep(node->devices[i], STREAM_4, isubstep, m1, m2, dt); + } + // #pragma omp parallel for + for (int i = 0; i < node->num_devices; ++i) { // Right + const int3 m1 = (int3){node->subgrid.n.x, 2 * NGHOST, 2 * NGHOST}; + const int3 m2 = m1 + (int3){NGHOST, node->subgrid.n.y - 2 * NGHOST, + node->subgrid.n.z - 2 * NGHOST}; + acDeviceIntegrateSubstep(node->devices[i], STREAM_5, isubstep, m1, m2, dt); + } + 
acNodeSwapBuffers(node); + } + acNodeSynchronizeStream(node, STREAM_ALL); + return AC_SUCCESS; +} + AcResult acNodePeriodicBoundcondStep(const Node node, const Stream stream, const VertexBufferHandle vtxbuf_handle) @@ -783,21 +884,20 @@ acNodePeriodicBoundcondStep(const Node node, const Stream stream, } AcResult -acNodeGeneralBoundcondStep(const Node node, const Stream stream, +acNodeGeneralBoundcondStep(const Node node, const Stream stream, //TODO ADAPT const VertexBufferHandle vtxbuf_handle) { - local_boundcondstep(node, stream, vtxbuf_handle); + local_boundcondstep_GBC(node, stream, vtxbuf_handle); acNodeSynchronizeVertexBuffer(node, stream, vtxbuf_handle); - //MV: assume that global step communication is handles by acNodePeriodicBoundcondStep as above. - //global_boundcondstep(node, stream, vtxbuf_handle); + global_boundcondstep(node, stream, vtxbuf_handle); return AC_SUCCESS; } AcResult -acNodePeriodicBoundconds(const Node node, const Stream stream) +acNodePeriodicBoundconds(const Node node, const Stream stream) //TODO ADAPT { for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { acNodePeriodicBoundcondStep(node, stream, (VertexBufferHandle)i); @@ -806,7 +906,7 @@ acNodePeriodicBoundconds(const Node node, const Stream stream) } AcResult -acNodeGeneralBoundconds(const Node node, const Stream stream) +acNodeGeneralBoundconds(const Node node, const Stream stream) //TODO ADAPT { for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { acNodeGeneralBoundcondStep(node, stream, (VertexBufferHandle)i); From f87d65408cd91c69c9c49b538e8dd36c80a89580 Mon Sep 17 00:00:00 2001 From: Miikka Vaisala Date: Fri, 20 Nov 2020 15:42:54 +0800 Subject: [PATCH 15/21] Added acBoundcondStepGBC() for flexible boundary conditions. 
--- include/astaroth.h | 4 ++++ src/core/astaroth.cc | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/include/astaroth.h b/include/astaroth.h index ac20d34..2f66354 100644 --- a/include/astaroth.h +++ b/include/astaroth.h @@ -241,6 +241,10 @@ AcResult acIntegrateGBC(const AcMeshInfo config, const AcReal dt); * the caller*/ AcResult acBoundcondStep(void); +/** Applies general outer boundary conditions for the Mesh distributed among the devices visible to + * the caller*/ +AcResult acBoundcondStepGBC(const AcMeshInfo config); + /** Does a scalar reduction with the data stored in some vertex buffer */ AcReal acReduceScal(const ReductionType rtype, const VertexBufferHandle vtxbuf_handle); diff --git a/src/core/astaroth.cc b/src/core/astaroth.cc index c565699..086e512 100644 --- a/src/core/astaroth.cc +++ b/src/core/astaroth.cc @@ -117,9 +117,9 @@ acBoundcondStep(void) } AcResult -acBoundcondStepGBC(void) //TODO ADAPT +acBoundcondStepGBC(const AcMeshInfo config) { - return acNodeGeneralBoundconds(nodes[0], STREAM_DEFAULT); + return acNodeGeneralBoundconds(nodes[0], STREAM_DEFAULT, config); } AcReal From 06bacf298ae33dd3a0c034f85bc974a77f6783d8 Mon Sep 17 00:00:00 2001 From: Miikka Vaisala Date: Fri, 20 Nov 2020 16:07:33 +0800 Subject: [PATCH 16/21] Created local_boundcondstep_GBC() --- src/core/node.cc | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/core/node.cc b/src/core/node.cc index 58acd25..423bbd1 100644 --- a/src/core/node.cc +++ b/src/core/node.cc @@ -657,22 +657,24 @@ local_boundcondstep(const Node node, const Stream stream, const VertexBufferHand } static AcResult -local_boundcondstep_GBC(const Node node, const Stream stream, const VertexBufferHandle vtxbuf) //TODO ADAPT +local_boundcondstep_GBC(const Node node, const Stream stream, const VertexBufferHandle vtxbuf, const MeshInfo config) { acNodeSynchronizeStream(node, stream); + int3 bindex = {-1, -1, -1} //Dummy for node level. Relevant only for MPI. 
+ if (node->num_devices > 1) { // Local boundary conditions // #pragma omp parallel for for (int i = 0; i < node->num_devices; ++i) { const int3 d0 = (int3){0, 0, NGHOST}; // DECOMPOSITION OFFSET HERE const int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y, d0.z + node->subgrid.n.z}; - acDeviceGeneralBoundcondStep(node->devices[i], stream, vtxbuf, d0, d1); + acDeviceGeneralBoundcondStep(node->devices[i], stream, vtxbuf, d0, d1, config, bindex); } } else { acDeviceGeneralBoundcondStep(node->devices[0], stream, vtxbuf, (int3){0, 0, 0}, - node->subgrid.m); + node->subgrid.m, config, bindex); } return AC_SUCCESS; } From 4add619e2fcfaaf3e19798a24c4b9600f0e14ded Mon Sep 17 00:00:00 2001 From: Miikka Vaisala Date: Fri, 20 Nov 2020 16:22:47 +0800 Subject: [PATCH 17/21] Calling boundconds compiles again. --- include/astaroth.h | 10 ++++++++++ src/core/node.cc | 20 ++++++++++---------- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/include/astaroth.h b/include/astaroth.h index 2f66354..512ecd1 100644 --- a/include/astaroth.h +++ b/include/astaroth.h @@ -453,6 +453,9 @@ AcResult acNodeIntegrateSubstep(const Node node, const Stream stream, const int /** */ AcResult acNodeIntegrate(const Node node, const AcReal dt); +/** */ +AcResult acNodeIntegrateGBC(const Node node, const AcMeshInfo config, const AcReal dt); + /** */ AcResult acNodePeriodicBoundcondStep(const Node node, const Stream stream, const VertexBufferHandle vtxbuf_handle); @@ -460,6 +463,13 @@ AcResult acNodePeriodicBoundcondStep(const Node node, const Stream stream, /** */ AcResult acNodePeriodicBoundconds(const Node node, const Stream stream); +/** */ +AcResult acNodeGeneralBoundcondStep(const Node node, const Stream stream, + const VertexBufferHandle vtxbuf_handle, const AcMeshInfo config); + +/** */ +AcResult acNodeGeneralBoundconds(const Node node, const Stream stream, const AcMeshInfo config); + /** */ AcResult acNodeReduceScal(const Node node, const Stream stream, const ReductionType 
rtype, const VertexBufferHandle vtxbuf_handle, AcReal* result); diff --git a/src/core/node.cc b/src/core/node.cc index 423bbd1..fdc4f49 100644 --- a/src/core/node.cc +++ b/src/core/node.cc @@ -657,11 +657,11 @@ local_boundcondstep(const Node node, const Stream stream, const VertexBufferHand } static AcResult -local_boundcondstep_GBC(const Node node, const Stream stream, const VertexBufferHandle vtxbuf, const MeshInfo config) +local_boundcondstep_GBC(const Node node, const Stream stream, const VertexBufferHandle vtxbuf, const AcMeshInfo config) { acNodeSynchronizeStream(node, stream); - int3 bindex = {-1, -1, -1} //Dummy for node level. Relevant only for MPI. + int3 bindex = {-1, -1, -1}; //Dummy for node level. Relevant only for MPI. if (node->num_devices > 1) { // Local boundary conditions @@ -793,7 +793,7 @@ acNodeIntegrate(const Node node, const AcReal dt) } AcResult -acNodeIntegrateGBG(const Node node, const AcReal dt) //TODO ADAPT +acNodeIntegrateGBC(const Node node, const AcMeshInfo config, const AcReal dt) { acNodeSynchronizeStream(node, STREAM_ALL); // xxx|OOO OOOOOOOOO OOO|xxx @@ -807,7 +807,7 @@ acNodeIntegrateGBG(const Node node, const AcReal dt) //TODO ADAPT for (int isubstep = 0; isubstep < 3; ++isubstep) { acNodeSynchronizeStream(node, STREAM_ALL); for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) { - local_boundcondstep_GBC(node, (Stream)vtxbuf, (VertexBufferHandle)vtxbuf); + local_boundcondstep_GBC(node, (Stream)vtxbuf, (VertexBufferHandle)vtxbuf, config); } acNodeSynchronizeStream(node, STREAM_ALL); @@ -886,10 +886,10 @@ acNodePeriodicBoundcondStep(const Node node, const Stream stream, } AcResult -acNodeGeneralBoundcondStep(const Node node, const Stream stream, //TODO ADAPT - const VertexBufferHandle vtxbuf_handle) +acNodeGeneralBoundcondStep(const Node node, const Stream stream, + const VertexBufferHandle vtxbuf_handle, const AcMeshInfo config) { - local_boundcondstep_GBC(node, stream, vtxbuf_handle); + local_boundcondstep_GBC(node, 
stream, vtxbuf_handle, config); acNodeSynchronizeVertexBuffer(node, stream, vtxbuf_handle); global_boundcondstep(node, stream, vtxbuf_handle); @@ -899,7 +899,7 @@ acNodeGeneralBoundcondStep(const Node node, const Stream stream, //TODO ADAPT } AcResult -acNodePeriodicBoundconds(const Node node, const Stream stream) //TODO ADAPT +acNodePeriodicBoundconds(const Node node, const Stream stream) { for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { acNodePeriodicBoundcondStep(node, stream, (VertexBufferHandle)i); @@ -908,10 +908,10 @@ acNodePeriodicBoundconds(const Node node, const Stream stream) //TODO ADAPT } AcResult -acNodeGeneralBoundconds(const Node node, const Stream stream) //TODO ADAPT +acNodeGeneralBoundconds(const Node node, const Stream stream, const AcMeshInfo config) { for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { - acNodeGeneralBoundcondStep(node, stream, (VertexBufferHandle)i); + acNodeGeneralBoundcondStep(node, stream, (VertexBufferHandle)i, config); } return AC_SUCCESS; } From 288693fab59099e6d6539b52ec37e6331e1069d0 Mon Sep 17 00:00:00 2001 From: Miikka Vaisala Date: Fri, 20 Nov 2020 16:31:13 +0800 Subject: [PATCH 18/21] Flexible boundary conditions called from simulation.cc --- samples/standalone/simulation.cc | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/samples/standalone/simulation.cc b/samples/standalone/simulation.cc index 0bf193e..1aa5bf2 100644 --- a/samples/standalone/simulation.cc +++ b/samples/standalone/simulation.cc @@ -367,7 +367,8 @@ run_simulation(const char* config_path) #endif } - acBoundcondStep(); + //acBoundcondStep(); + acBoundcondStepGBC(mesh_info); acStore(mesh); if (start_step == 0) { save_mesh(*mesh, 0, t_step); @@ -426,7 +427,11 @@ run_simulation(const char* config_path) loadForcingParamsToDevice(forcing_params); #endif - acIntegrate(dt); + /* Uses now flexible bokundary conditions */ + //acIntegrate(dt); + acIntegrateGBC(mesh_info, dt); + + t_step += dt; @@ -472,7 +477,8 @@ 
run_simulation(const char* config_path) acBoundcondStep(); acStore(mesh); */ - acBoundcondStep(); + //acBoundcondStep(); + acBoundcondStepGBC(mesh_info); acStore(mesh); save_mesh(*mesh, i, t_step); @@ -493,7 +499,8 @@ run_simulation(const char* config_path) if (dt < dt_typical/AcReal(1e5)) { if (dtcounter > 10) { printf("dt = %e TOO LOW! Ending run at t = %#e \n", double(dt), double(t_step)); - acBoundcondStep(); + //acBoundcondStep(); + acBoundcondStepGBC(mesh_info); acStore(mesh); save_mesh(*mesh, i, t_step); break; @@ -507,7 +514,8 @@ run_simulation(const char* config_path) // End loop if nan is found if (found_nan > 0) { printf("Found nan at t = %e \n", double(t_step)); - acBoundcondStep(); + //acBoundcondStep(); + acBoundcondStepGBC(mesh_info); acStore(mesh); save_mesh(*mesh, i, t_step); break; @@ -522,7 +530,8 @@ run_simulation(const char* config_path) if (found_stop == 1) { printf("Found STOP file at t = %e \n", double(t_step)); - acBoundcondStep(); + //acBoundcondStep(); + acBoundcondStepGBC(mesh_info); acStore(mesh); save_mesh(*mesh, i, t_step); break; From d4ee066b3c403d4a45a69450e1b22a011a52ee55 Mon Sep 17 00:00:00 2001 From: Miikka Vaisala Date: Fri, 20 Nov 2020 17:00:48 +0800 Subject: [PATCH 19/21] Periodic boundary conditions work with switchable system. Still some issue with the custom alternatives. Need to look into kernels again. 
--- acc/mhd_solver/stencil_kernel.ac | 9 ++-- config/astaroth.conf | 9 ++++ src/core/kernels/boundconds.cuh | 87 +++++++++++++++++++++++++------- 3 files changed, 84 insertions(+), 21 deletions(-) diff --git a/acc/mhd_solver/stencil_kernel.ac b/acc/mhd_solver/stencil_kernel.ac index 57097f8..c26a5f7 100644 --- a/acc/mhd_solver/stencil_kernel.ac +++ b/acc/mhd_solver/stencil_kernel.ac @@ -22,9 +22,12 @@ uniform int AC_save_steps; uniform int AC_bin_steps; uniform int AC_start_step; -// In3 params -uniform int3 AC_bc_type_bot; -uniform int3 AC_bc_type_top; +uniform int AC_bc_type_top_x; +uniform int AC_bc_type_bot_x; +uniform int AC_bc_type_top_y; +uniform int AC_bc_type_bot_y; +uniform int AC_bc_type_top_z; +uniform int AC_bc_type_bot_z; // Real params uniform Scalar AC_dt; diff --git a/config/astaroth.conf b/config/astaroth.conf index 190948b..480bcdc 100644 --- a/config/astaroth.conf +++ b/config/astaroth.conf @@ -13,6 +13,15 @@ AC_dsx = 0.04908738521 AC_dsy = 0.04908738521 AC_dsz = 0.04908738521 +// 0 = periodic bc, 1 = symmetric bc, 2 = antisymmetric bc +AC_bc_type_top_x = 0 +AC_bc_type_top_y = 0 +AC_bc_type_top_z = 0 +AC_bc_type_bot_x = 0 +AC_bc_type_bot_y = 0 +AC_bc_type_bot_z = 0 + + /* * ============================================================================= * Run-time params diff --git a/src/core/kernels/boundconds.cuh b/src/core/kernels/boundconds.cuh index 64b4a1e..6a0c1ab 100644 --- a/src/core/kernels/boundconds.cuh +++ b/src/core/kernels/boundconds.cuh @@ -23,47 +23,83 @@ kernel_symmetric_boundconds(const int3 start, const int3 end, AcReal* vtxbuf, co int i_src, j_src, k_src, boundloc; int bsize = STENCIL_ORDER/(int) 2; - if (bindex.x != 0) - { + //if (bindex.x != 0) + //{ + // // Pick up the mirroring value. + // if ((i_dst < bsize) && ((bindex.x == 3) || (bindex.x ==1))) + // { + // boundloc = bsize; //Location of central border point. 
+ // i_src = 2*boundloc - i_dst; + // } else if ((i_dst >= DCONST(AC_nx_min) - bsize) && ((bindex.x == 2) || (bindex.x ==1))) + // { + // boundloc = DCONST(AC_nx_min) - bsize - 1; //Location of central border point. + // i_src = 2*boundloc - i_dst; + // } + //} + //if (bindex.y != 0) + //{ + // // Pick up the mirroring value. + // if ((j_dst < bsize) && ((bindex.y == 3) || (bindex.y ==1))) + // { + // boundloc = bsize; //Location of central border point. + // j_src = 2*boundloc - j_dst; + // } else if ((j_dst >= DCONST(AC_nx_min) - bsize) && ((bindex.y == 2) || (bindex.y ==1))) + // { + // boundloc = DCONST(AC_ny_min) - bsize - 1; //Location of central border point. + // i_src = 2*boundloc - j_dst; + // } + //} + //if (bindex.z != 0) + //{ + // // Pick up the mirroring value. + // if ((k_dst < bsize) && ((bindex.z == 3) || (bindex.z ==1))) + // { + // boundloc = bsize; //Location of central border point. + // k_src = 2*boundloc - k_dst; + // } else if ((i_dst >= DCONST(AC_nz_min) - bsize) && ((bindex.z == 2) || (bindex.z ==1))) + // { + // boundloc = DCONST(AC_nz_min) - bsize - 1; //Location of central border point. + // k_src = 2*boundloc - k_dst; + // } + //} + + if (bindex.x < 0) + { + // Pick up the mirroring value. - if ((i_dst < bsize) && ((bindex.x == 3) || (bindex.x ==1))) + if ((i_dst < bsize)) { boundloc = bsize; //Location of central border point. i_src = 2*boundloc - i_dst; - } else if ((i_dst >= DCONST(AC_nx_min) - bsize) && ((bindex.x == 2) || (bindex.x ==1))) + } else if ((i_dst >= DCONST(AC_nx_min) - bsize)) { boundloc = DCONST(AC_nx_min) - bsize - 1; //Location of central border point. i_src = 2*boundloc - i_dst; } - } - if (bindex.y != 0) - { + // Pick up the mirroring value. - if ((j_dst < bsize) && ((bindex.y == 3) || (bindex.y ==1))) + if ((j_dst < bsize)) { boundloc = bsize; //Location of central border point. 
j_src = 2*boundloc - j_dst; - } else if ((j_dst >= DCONST(AC_nx_min) - bsize) && ((bindex.y == 2) || (bindex.y ==1))) + } else if ((j_dst >= DCONST(AC_nx_min) - bsize)) { boundloc = DCONST(AC_ny_min) - bsize - 1; //Location of central border point. i_src = 2*boundloc - j_dst; } - } - if (bindex.z != 0) - { + // Pick up the mirroring value. - if ((k_dst < bsize) && ((bindex.z == 3) || (bindex.z ==1))) + if ((k_dst < bsize)) { boundloc = bsize; //Location of central border point. k_src = 2*boundloc - k_dst; - } else if ((i_dst >= DCONST(AC_nz_min) - bsize) && ((bindex.z == 2) || (bindex.z ==1))) + } else if ((i_dst >= DCONST(AC_nz_min) - bsize)) { boundloc = DCONST(AC_nz_min) - bsize - 1; //Location of central border point. k_src = 2*boundloc - k_dst; } - } - + } const int src_idx = DEVICE_VTXBUF_IDX(i_src, j_src, k_src); const int dst_idx = DEVICE_VTXBUF_IDX(i_dst, j_dst, k_dst); @@ -141,8 +177,10 @@ acKernelGeneralBoundconds(const cudaStream_t stream, const int3 start, const int (unsigned int)ceil((end.y - start.y) / (float)tpb.y), (unsigned int)ceil((end.z - start.z) / (float)tpb.z)); - int3 bc_top = config.int3_params[AC_bc_type_top]; - int3 bc_bot = config.int3_params[AC_bc_type_bot]; + int3 bc_top = {config.int_params[AC_bc_type_top_x], config.int_params[AC_bc_type_top_y], + config.int_params[AC_bc_type_top_z]}; + int3 bc_bot = {config.int_params[AC_bc_type_bot_x], config.int_params[AC_bc_type_bot_y], + config.int_params[AC_bc_type_bot_z]}; if (bc_top.x == AC_BOUNDCOND_SYMMETRIC) { @@ -153,6 +191,19 @@ acKernelGeneralBoundconds(const cudaStream_t stream, const int3 start, const int { kernel_symmetric_boundconds<<>>(start, end, vtxbuf, bindex, -1); ERRCHK_CUDA_KERNEL(); + } + else if (bc_bot.x == AC_BOUNDCOND_PERIODIC) + { + kernel_periodic_boundconds<<>>(start, end, vtxbuf); + ERRCHK_CUDA_KERNEL(); + } + else + { + printf("ERROR: Boundary condition not recognized!\n"); + printf("ERROR: bc_top = %i, %i, %i \n", bc_top.x, bc_top.y, bc_top.z); + printf("ERROR: 
bc_bot = %i, %i, %i \n", bc_bot.x, bc_bot.y, bc_bot.z); + + return AC_FAILURE; } return AC_SUCCESS; From 2f0f6ceac205789c57dda51e74925d7991f90b6a Mon Sep 17 00:00:00 2001 From: Miikka Vaisala Date: Mon, 23 Nov 2020 15:19:47 +0800 Subject: [PATCH 20/21] Functioning symmetric antisymmetric boundary condition. --- src/core/kernels/boundconds.cuh | 139 ++++++++++++++++++-------------- 1 file changed, 80 insertions(+), 59 deletions(-) diff --git a/src/core/kernels/boundconds.cuh b/src/core/kernels/boundconds.cuh index 6a0c1ab..541b22b 100644 --- a/src/core/kernels/boundconds.cuh +++ b/src/core/kernels/boundconds.cuh @@ -20,90 +20,111 @@ kernel_symmetric_boundconds(const int3 start, const int3 end, AcReal* vtxbuf, co // Find the source index // Map to nx, ny, nz coordinates - int i_src, j_src, k_src, boundloc; + int i_src, j_src, k_src, boundlocx0, boundlocx1, boundlocy0, boundlocy1, boundlocz0, boundlocz1; int bsize = STENCIL_ORDER/(int) 2; //if (bindex.x != 0) - //{ - // // Pick up the mirroring value. - // if ((i_dst < bsize) && ((bindex.x == 3) || (bindex.x ==1))) - // { - // boundloc = bsize; //Location of central border point. - // i_src = 2*boundloc - i_dst; - // } else if ((i_dst >= DCONST(AC_nx_min) - bsize) && ((bindex.x == 2) || (bindex.x ==1))) - // { - // boundloc = DCONST(AC_nx_min) - bsize - 1; //Location of central border point. - // i_src = 2*boundloc - i_dst; - // } - //} //if (bindex.y != 0) - //{ - // // Pick up the mirroring value. - // if ((j_dst < bsize) && ((bindex.y == 3) || (bindex.y ==1))) - // { - // boundloc = bsize; //Location of central border point. - // j_src = 2*boundloc - j_dst; - // } else if ((j_dst >= DCONST(AC_nx_min) - bsize) && ((bindex.y == 2) || (bindex.y ==1))) - // { - // boundloc = DCONST(AC_ny_min) - bsize - 1; //Location of central border point. - // i_src = 2*boundloc - j_dst; - // } - //} //if (bindex.z != 0) - //{ - // // Pick up the mirroring value. 
- // if ((k_dst < bsize) && ((bindex.z == 3) || (bindex.z ==1))) - // { - // boundloc = bsize; //Location of central border point. - // k_src = 2*boundloc - k_dst; - // } else if ((i_dst >= DCONST(AC_nz_min) - bsize) && ((bindex.z == 2) || (bindex.z ==1))) - // { - // boundloc = DCONST(AC_nz_min) - bsize - 1; //Location of central border point. - // k_src = 2*boundloc - k_dst; - // } - //} + + //Location of central border point. + boundlocx0 = bsize; + boundlocy0 = bsize; + boundlocz0 = bsize; + boundlocx1 = DCONST(AC_nx_max) - 1; + boundlocy1 = DCONST(AC_ny_max) - 1; + boundlocz1 = DCONST(AC_nz_max) - 1; + + //Defaults + i_src = i_dst; + j_src = j_dst; + k_src = k_dst; if (bindex.x < 0) - { + { // Pick up the mirroring value. - if ((i_dst < bsize)) + if ((i_dst < boundlocx0)) { - boundloc = bsize; //Location of central border point. - i_src = 2*boundloc - i_dst; - } else if ((i_dst >= DCONST(AC_nx_min) - bsize)) + i_src = 2.0f*boundlocx0 - i_dst; + + } else if ((i_dst > boundlocx1)) { - boundloc = DCONST(AC_nx_min) - bsize - 1; //Location of central border point. - i_src = 2*boundloc - i_dst; + i_src = 2.0f*boundlocx1 - i_dst; } // Pick up the mirroring value. - if ((j_dst < bsize)) + if ((j_dst < boundlocy0)) { - boundloc = bsize; //Location of central border point. - j_src = 2*boundloc - j_dst; - } else if ((j_dst >= DCONST(AC_nx_min) - bsize)) + j_src = 2.0f*boundlocy0 - j_dst; + } else if ((j_dst > boundlocx1)) { - boundloc = DCONST(AC_ny_min) - bsize - 1; //Location of central border point. - i_src = 2*boundloc - j_dst; + j_src = 2.0f*boundlocy1 - j_dst; } // Pick up the mirroring value. - if ((k_dst < bsize)) + if ((k_dst < boundlocz0)) { - boundloc = bsize; //Location of central border point. - k_src = 2*boundloc - k_dst; - } else if ((i_dst >= DCONST(AC_nz_min) - bsize)) + k_src = 2.0f*boundlocz0 - k_dst; + } else if ((k_dst > boundlocz1)) { - boundloc = DCONST(AC_nz_min) - bsize - 1; //Location of central border point. 
- k_src = 2*boundloc - k_dst; + k_src = 2.0f*boundlocz1 - k_dst; + } + + //Edges + if ( (i_dst < boundlocx0) && (j_dst < boundlocy0) ) + { + i_src = 2.0f*boundlocx0 - i_dst; + j_src = 2.0f*boundlocy0 - j_dst; + //if ((k_dst == 50)) printf("i_dst %i j_dst %i k_dst %i i_src %i j_src %i k_src %i bsize %i \n", i_dst, j_dst, k_dst, i_src, j_src, k_src, bsize); + } else if ((i_dst < boundlocx0) && (k_dst < boundlocz0) ) + { + i_src = 2.0f*boundlocx0 - i_dst; + k_src = 2.0f*boundlocz0 - k_dst; + } else if ( (j_dst < boundlocy0) && (k_dst < boundlocz0) ) + { + j_src = 2.0f*boundlocy0 - j_dst; + k_src = 2.0f*boundlocz0 - k_dst; + + } else if ((i_dst > boundlocx1) && (j_dst > boundlocx1) ) + { + i_src = 2.0f*boundlocx1 - i_dst; + j_src = 2.0f*boundlocy1 - j_dst; + } else if ( (i_dst > boundlocx1) && (k_dst > boundlocz1) ) + { + i_src = 2.0f*boundlocx1 - i_dst; + k_src = 2.0f*boundlocz1 - k_dst; + } else if ( (j_dst > boundlocy1) && (k_dst > boundlocz1) ) + { + j_src = 2.0f*boundlocy1 - j_dst; + k_src = 2.0f*boundlocz1 - k_dst; + } else if ( (i_dst > boundlocx1) && (k_dst < boundlocz0) ) + { + i_src = 2.0f*boundlocx1 - i_dst; + k_src = 2.0f*boundlocz0 - k_dst; + } else if ( (i_dst > boundlocx1) && (j_dst < bsize) ) + { + i_src = 2.0f*boundlocx1 - i_dst; + j_src = 2.0f*boundlocy0 - j_dst; + } else if ( (i_dst < boundlocx0) && (k_dst > boundlocz1) ) + { + i_src = 2.0f*boundlocx0 - i_dst; + k_src = 2.0f*boundlocz1 - k_dst; + } else if ( (i_dst < boundlocx0) && (j_dst > boundlocy1) ) + { + i_src = 2.0f*boundlocx0 - i_dst; + j_src = 2.0f*boundlocy1 - j_dst; + } else if ( (j_dst > boundlocy1) && (k_dst < boundlocz0) ) + { + j_src = 2.0f*boundlocy1 - j_dst; + k_src = 2.0f*boundlocz0 - k_dst; } } const int src_idx = DEVICE_VTXBUF_IDX(i_src, j_src, k_src); const int dst_idx = DEVICE_VTXBUF_IDX(i_dst, j_dst, k_dst); - vtxbuf[dst_idx] = sign*vtxbuf[src_idx]; // sign = 1 symmetric, sign = -1 antisymmetric + vtxbuf[dst_idx] = sign*vtxbuf[src_idx] *0.0 + 1.0; // sign = 1 symmetric, sign = 
-1 antisymmetric } @@ -187,12 +208,12 @@ acKernelGeneralBoundconds(const cudaStream_t stream, const int3 start, const int kernel_symmetric_boundconds<<>>(start, end, vtxbuf, bindex, 1); ERRCHK_CUDA_KERNEL(); } - else if (bc_bot.x == AC_BOUNDCOND_ANTISYMMETRIC) + else if (bc_top.x == AC_BOUNDCOND_ANTISYMMETRIC) { kernel_symmetric_boundconds<<>>(start, end, vtxbuf, bindex, -1); ERRCHK_CUDA_KERNEL(); } - else if (bc_bot.x == AC_BOUNDCOND_PERIODIC) + else if (bc_top.x == AC_BOUNDCOND_PERIODIC) { kernel_periodic_boundconds<<>>(start, end, vtxbuf); ERRCHK_CUDA_KERNEL(); From 543c565e5d676914ff93998d0dd878a75e6ff84e Mon Sep 17 00:00:00 2001 From: Miikka Vaisala Date: Mon, 23 Nov 2020 15:43:52 +0800 Subject: [PATCH 21/21] Dummy at the moment, but now the boundary condition kernel caller can see what vertex buffer name is in use. --- src/core/device.cc | 2 +- src/core/kernels/boundconds.cuh | 55 ++++++++++++++++++++------------- src/core/kernels/kernels.h | 3 +- 3 files changed, 36 insertions(+), 24 deletions(-) diff --git a/src/core/device.cc b/src/core/device.cc index 9130f34..0d2127e 100644 --- a/src/core/device.cc +++ b/src/core/device.cc @@ -440,7 +440,7 @@ acDeviceGeneralBoundcondStep(const Device device, const Stream stream, { cudaSetDevice(device->id); return acKernelGeneralBoundconds(device->streams[stream], start, end, - device->vba.in[vtxbuf_handle], config, bindex); + device->vba.in[vtxbuf_handle], vtxbuf_handle, config, bindex); } AcResult diff --git a/src/core/kernels/boundconds.cuh b/src/core/kernels/boundconds.cuh index 541b22b..f0518c0 100644 --- a/src/core/kernels/boundconds.cuh +++ b/src/core/kernels/boundconds.cuh @@ -191,7 +191,8 @@ acKernelPeriodicBoundconds(const cudaStream_t stream, const int3 start, const in AcResult acKernelGeneralBoundconds(const cudaStream_t stream, const int3 start, const int3 end, - AcReal* vtxbuf, const AcMeshInfo config, const int3 bindex) + AcReal* vtxbuf, const VertexBufferHandle vtxbuf_handle, + const AcMeshInfo config, 
const int3 bindex) { const dim3 tpb(8, 2, 8); const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x), @@ -203,28 +204,38 @@ acKernelGeneralBoundconds(const cudaStream_t stream, const int3 start, const int int3 bc_bot = {config.int_params[AC_bc_type_bot_x], config.int_params[AC_bc_type_bot_y], config.int_params[AC_bc_type_bot_z]}; - if (bc_top.x == AC_BOUNDCOND_SYMMETRIC) - { - kernel_symmetric_boundconds<<>>(start, end, vtxbuf, bindex, 1); - ERRCHK_CUDA_KERNEL(); - } - else if (bc_top.x == AC_BOUNDCOND_ANTISYMMETRIC) - { - kernel_symmetric_boundconds<<>>(start, end, vtxbuf, bindex, -1); - ERRCHK_CUDA_KERNEL(); - } - else if (bc_top.x == AC_BOUNDCOND_PERIODIC) - { - kernel_periodic_boundconds<<>>(start, end, vtxbuf); - ERRCHK_CUDA_KERNEL(); - } - else - { - printf("ERROR: Boundary condition not recognized!\n"); - printf("ERROR: bc_top = %i, %i, %i \n", bc_top.x, bc_top.y, bc_top.z); - printf("ERROR: bc_bot = %i, %i, %i \n", bc_bot.x, bc_bot.y, bc_bot.z); +//#if AC_MPI_ENABLED +// printf( "WARNING : NON-PERIODIC BOUNDARY CONDITIONS NOT SUPPORTER BY MPI! 
Only working at node level.\n"); +// return AC_FAILURE; +//#endif + + if ( vtxbuf_handle != -1) // This is a dummy to make swithing boundary condition with respect to more possible later + { + + if (bc_top.x == AC_BOUNDCOND_SYMMETRIC) + { + kernel_symmetric_boundconds<<>>(start, end, vtxbuf, bindex, 1); + ERRCHK_CUDA_KERNEL(); + } + else if (bc_top.x == AC_BOUNDCOND_ANTISYMMETRIC) + { + kernel_symmetric_boundconds<<>>(start, end, vtxbuf, bindex, -1); + ERRCHK_CUDA_KERNEL(); + } + else if (bc_top.x == AC_BOUNDCOND_PERIODIC) + { + kernel_periodic_boundconds<<>>(start, end, vtxbuf); + ERRCHK_CUDA_KERNEL(); + } + else + { + printf("ERROR: Boundary condition not recognized!\n"); + printf("ERROR: bc_top = %i, %i, %i \n", bc_top.x, bc_top.y, bc_top.z); + printf("ERROR: bc_bot = %i, %i, %i \n", bc_bot.x, bc_bot.y, bc_bot.z); + + return AC_FAILURE; + } - return AC_FAILURE; } return AC_SUCCESS; diff --git a/src/core/kernels/kernels.h b/src/core/kernels/kernels.h index bc94185..a9e05f9 100644 --- a/src/core/kernels/kernels.h +++ b/src/core/kernels/kernels.h @@ -45,7 +45,8 @@ AcResult acKernelPeriodicBoundconds(const cudaStream_t stream, const int3 start, AcReal* vtxbuf); /** */ AcResult acKernelGeneralBoundconds(const cudaStream_t stream, const int3 start, const int3 end, - AcReal* vtxbuf, const AcMeshInfo config, const int3 bindex); + AcReal* vtxbuf, const VertexBufferHandle vtxbuf_handle, + const AcMeshInfo config, const int3 bindex); /** */