Merged in alt_bcond_2020_09 (pull request #16)

Alt bcond 2020 09
Miikka Väisälä
2020-11-23 10:24:01 +00:00
committed by jpekkila
9 changed files with 694 additions and 8 deletions

View File

@@ -20,9 +20,15 @@
uniform int AC_max_steps;
uniform int AC_save_steps;
uniform int AC_bin_steps;
uniform int AC_bc_type;
uniform int AC_start_step;
uniform int AC_bc_type_top_x;
uniform int AC_bc_type_bot_x;
uniform int AC_bc_type_top_y;
uniform int AC_bc_type_bot_y;
uniform int AC_bc_type_top_z;
uniform int AC_bc_type_bot_z;
// Real params
uniform Scalar AC_dt;
uniform Scalar AC_max_time;

View File

@@ -13,6 +13,15 @@ AC_dsx = 0.04908738521
AC_dsy = 0.04908738521
AC_dsz = 0.04908738521
// 0 = periodic bc, 1 = symmetric bc, 2 = antisymmetric bc
AC_bc_type_top_x = 0
AC_bc_type_top_y = 0
AC_bc_type_top_z = 0
AC_bc_type_bot_x = 0
AC_bc_type_bot_y = 0
AC_bc_type_bot_z = 0
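// Illustrative only (not part of this run setup): the type can be chosen per face,
// e.g. a symmetric top and an antisymmetric bottom in the z direction would read
//   AC_bc_type_top_z = 1
//   AC_bc_type_bot_z = 2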
/*
* =============================================================================
* Run-time params

View File

@@ -49,6 +49,11 @@ typedef struct {
typedef enum { AC_SUCCESS = 0, AC_FAILURE = 1 } AcResult;
// Naming the numbers associated with the boundary condition types
typedef enum { AC_BOUNDCOND_PERIODIC = 0,
AC_BOUNDCOND_SYMMETRIC = 1,
AC_BOUNDCOND_ANTISYMMETRIC = 2 } AcBoundcond;
#define AC_GEN_ID(X) X,
typedef enum {
AC_FOR_RTYPES(AC_GEN_ID) //
@@ -227,10 +232,19 @@ AcResult acStore(AcMesh* host_mesh);
* substep and the user is responsible for calling acBoundcondStep before reading the data. */
AcResult acIntegrate(const AcReal dt);
/** Performs Runge-Kutta 3 integration. Note: Boundary conditions are not applied after the final
* substep and the user is responsible for calling acBoundcondStep before reading the data.
* Has customizable boundary conditions. */
AcResult acIntegrateGBC(const AcMeshInfo config, const AcReal dt);
/** Applies periodic boundary conditions for the Mesh distributed among the devices visible to
* the caller*/
AcResult acBoundcondStep(void);
/** Applies general outer boundary conditions for the Mesh distributed among the devices visible to
* the caller*/
AcResult acBoundcondStepGBC(const AcMeshInfo config);
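/* A minimal usage sketch (illustrative; `mesh_info`, `dt` and `mesh` are assumed to come
 * from the caller, as in the standalone simulation loop): the GBC variants take the mesh
 * configuration so the AC_bc_type_{top,bot}_{x,y,z} settings are honored instead of the
 * default periodic boundaries.
 *
 *     acIntegrateGBC(mesh_info, dt);   // RK3 substeps with configurable boundconds
 *     acBoundcondStepGBC(mesh_info);   // refresh ghost zones before reading the data
 *     acStore(mesh);                   // copy the mesh back to the host
 */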
/** Does a scalar reduction with the data stored in some vertex buffer */
AcReal acReduceScal(const ReductionType rtype, const VertexBufferHandle vtxbuf_handle);
@@ -306,9 +320,21 @@ AcResult acGridStoreMesh(const Stream stream, AcMesh* host_mesh);
/** */
AcResult acGridIntegrate(const Stream stream, const AcReal dt);
/** */
/* MV: Commented out for a while, but saved for the future when standalone_MPI
works with periodic boundary conditions.
AcResult
acGridIntegrateNonperiodic(const Stream stream, const AcReal dt)
AcResult acGridIntegrateNonperiodic(const Stream stream, const AcReal dt);
*/
/** */
AcResult acGridPeriodicBoundconds(const Stream stream);
/** */
AcResult acGridGeneralBoundconds(const Device device, const Stream stream);
/** TODO */
AcResult acGridReduceScal(const Stream stream, const ReductionType rtype,
const VertexBufferHandle vtxbuf_handle, AcReal* result);
@@ -430,6 +456,9 @@ AcResult acNodeIntegrateSubstep(const Node node, const Stream stream, const int
/** */
AcResult acNodeIntegrate(const Node node, const AcReal dt);
/** */
AcResult acNodeIntegrateGBC(const Node node, const AcMeshInfo config, const AcReal dt);
/** */
AcResult acNodePeriodicBoundcondStep(const Node node, const Stream stream,
const VertexBufferHandle vtxbuf_handle);
@@ -437,6 +466,13 @@ AcResult acNodePeriodicBoundcondStep(const Node node, const Stream stream,
/** */
AcResult acNodePeriodicBoundconds(const Node node, const Stream stream);
/** */
AcResult acNodeGeneralBoundcondStep(const Node node, const Stream stream,
const VertexBufferHandle vtxbuf_handle, const AcMeshInfo config);
/** */
AcResult acNodeGeneralBoundconds(const Node node, const Stream stream, const AcMeshInfo config);
/** */
AcResult acNodeReduceScal(const Node node, const Stream stream, const ReductionType rtype,
const VertexBufferHandle vtxbuf_handle, AcReal* result);
@@ -565,6 +601,16 @@ AcResult acDevicePeriodicBoundcondStep(const Device device, const Stream stream,
AcResult acDevicePeriodicBoundconds(const Device device, const Stream stream, const int3 start,
const int3 end);
/** */
AcResult acDeviceGeneralBoundcondStep(const Device device, const Stream stream,
const VertexBufferHandle vtxbuf_handle, const int3 start,
const int3 end, const AcMeshInfo config, const int3 bindex);
/** */
AcResult acDeviceGeneralBoundconds(const Device device, const Stream stream, const int3 start,
const int3 end, const AcMeshInfo config, const int3 bindex);
/** */
AcResult acDeviceReduceScal(const Device device, const Stream stream, const ReductionType rtype,
const VertexBufferHandle vtxbuf_handle, AcReal* result);

View File

@@ -374,7 +374,8 @@ run_simulation(const char* config_path)
#endif
}
acBoundcondStep();
//acBoundcondStep();
acBoundcondStepGBC(mesh_info);
acStore(mesh);
if (start_step == 0) {
save_mesh(*mesh, 0, t_step);
@@ -440,7 +441,11 @@ run_simulation(const char* config_path)
loadForcingParamsToDevice(forcing_params);
#endif
acIntegrate(dt);
/* Now uses flexible boundary conditions */
//acIntegrate(dt);
acIntegrateGBC(mesh_info, dt);
t_step += dt;
@@ -486,7 +491,8 @@ run_simulation(const char* config_path)
acBoundcondStep();
acStore(mesh);
*/
acBoundcondStep();
//acBoundcondStep();
acBoundcondStepGBC(mesh_info);
acStore(mesh);
save_mesh(*mesh, i, t_step);
@@ -507,7 +513,8 @@ run_simulation(const char* config_path)
if (dt < dt_typical/AcReal(1e5)) {
if (dtcounter > 10) {
printf("dt = %e TOO LOW! Ending run at t = %#e \n", double(dt), double(t_step));
acBoundcondStep();
//acBoundcondStep();
acBoundcondStepGBC(mesh_info);
acStore(mesh);
save_mesh(*mesh, i, t_step);
break;
@@ -521,7 +528,8 @@ run_simulation(const char* config_path)
// End loop if nan is found
if (found_nan > 0) {
printf("Found nan at t = %e \n", double(t_step));
acBoundcondStep();
//acBoundcondStep();
acBoundcondStepGBC(mesh_info);
acStore(mesh);
save_mesh(*mesh, i, t_step);
break;
@@ -536,7 +544,8 @@ run_simulation(const char* config_path)
if (found_stop == 1) {
printf("Found STOP file at t = %e \n", double(t_step));
acBoundcondStep();
//acBoundcondStep();
acBoundcondStepGBC(mesh_info);
acStore(mesh);
save_mesh(*mesh, i, t_step);
break;

View File

@@ -86,6 +86,13 @@ acIntegrate(const AcReal dt)
return acNodeIntegrate(nodes[0], dt);
}
AcResult
acIntegrateGBC(const AcMeshInfo config, const AcReal dt)
{
return acNodeIntegrateGBC(nodes[0], config, dt);
}
AcResult
acIntegrateStep(const int isubstep, const AcReal dt)
{
@@ -109,6 +116,12 @@ acBoundcondStep(void)
return acNodePeriodicBoundconds(nodes[0], STREAM_DEFAULT);
}
AcResult
acBoundcondStepGBC(const AcMeshInfo config)
{
return acNodeGeneralBoundconds(nodes[0], STREAM_DEFAULT, config);
}
AcReal
acReduceScal(const ReductionType rtype, const VertexBufferHandle vtxbuf_handle)
{

View File

@@ -433,6 +433,27 @@ acDevicePeriodicBoundconds(const Device device, const Stream stream, const int3
return AC_SUCCESS;
}
AcResult
acDeviceGeneralBoundcondStep(const Device device, const Stream stream,
const VertexBufferHandle vtxbuf_handle, const int3 start,
const int3 end, const AcMeshInfo config, const int3 bindex)
{
cudaSetDevice(device->id);
return acKernelGeneralBoundconds(device->streams[stream], start, end,
device->vba.in[vtxbuf_handle], vtxbuf_handle, config, bindex);
}
AcResult
acDeviceGeneralBoundconds(const Device device, const Stream stream, const int3 start,
const int3 end, const AcMeshInfo config, const int3 bindex)
{
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
acDeviceGeneralBoundcondStep(device, stream, (VertexBufferHandle)i, start, end, config, bindex);
}
return AC_SUCCESS;
}
AcResult
acDeviceReduceScal(const Device device, const Stream stream, const ReductionType rtype,
const VertexBufferHandle vtxbuf_handle, AcReal* result)
@@ -1433,6 +1454,50 @@ acGridStoreMesh(const Stream stream, AcMesh* host_mesh)
return AC_SUCCESS;
}
/* MV: Commented out for a while, but saved for the future when standalone_MPI
works with periodic boundary conditions.
AcResult
acGridGeneralBoundconds(const Device device, const Stream stream)
{
// Non-periodic Boundary conditions
// Check the position in MPI frame
int nprocs, pid;
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
const uint3_64 decomposition = decompose(nprocs);
const int3 pid3d = getPid3D(pid, decomposition);
// Set outer boundaries after substep computation.
const int3 m1 = (int3){0, 0, 0};
const int3 m2 = grid.nn;
// If we are a boundary element
int3 bindex = (int3){0, 0, 0};
// Check if there are active boundary condition edges.
// 0 is no boundary, 1 both edges, 2 is top edge, 3 bottom edge
if ((pid3d.x == 0) && (pid3d.x == decomposition.x - 1)) { bindex.x = 1; }
else if (pid3d.x == 0) { bindex.x = 2; }
else if (pid3d.x == decomposition.x - 1) { bindex.x = 3; }
if ((pid3d.y == 0) && (pid3d.y == decomposition.y - 1)) { bindex.y = 1; }
else if (pid3d.y == 0) { bindex.y = 2; }
else if (pid3d.y == decomposition.y - 1) { bindex.y = 3; }
if ((pid3d.z == 0) && (pid3d.z == decomposition.z - 1)) { bindex.z = 1; }
else if (pid3d.z == 0) { bindex.z = 2; }
else if (pid3d.z == decomposition.z - 1) { bindex.z = 3; }
if ((bindex.x != 1) && (bindex.y != 1) && (bindex.z != 1)) {
acDeviceGeneralBoundconds(device, stream, m1, m2, bindex);
}
acGridSynchronizeStream(stream);
return AC_SUCCESS;
}
*/
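/* Illustrative helper sketch (hypothetical, not part of the build): the boundary-flag
 * logic above factored out. bindex encodes, per axis, which outer boundaries this rank
 * owns: 0 = no boundary, 1 = both edges, 2 = top edge, 3 = bottom edge.
 *
 * static int3
 * boundary_index(const int3 pid3d, const uint3_64 decomposition)
 * {
 *     int3 bindex = (int3){0, 0, 0};
 *
 *     if (pid3d.x == 0 && pid3d.x == decomposition.x - 1) { bindex.x = 1; }
 *     else if (pid3d.x == 0)                              { bindex.x = 2; }
 *     else if (pid3d.x == decomposition.x - 1)            { bindex.x = 3; }
 *
 *     if (pid3d.y == 0 && pid3d.y == decomposition.y - 1) { bindex.y = 1; }
 *     else if (pid3d.y == 0)                              { bindex.y = 2; }
 *     else if (pid3d.y == decomposition.y - 1)            { bindex.y = 3; }
 *
 *     if (pid3d.z == 0 && pid3d.z == decomposition.z - 1) { bindex.z = 1; }
 *     else if (pid3d.z == 0)                              { bindex.z = 2; }
 *     else if (pid3d.z == decomposition.z - 1)            { bindex.z = 3; }
 *
 *     return bindex;
 * }
 */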
/*
// Unused
AcResult
@@ -1864,6 +1929,234 @@ acGridIntegrate(const Stream stream, const AcReal dt)
return AC_SUCCESS;
}
/* MV: Commented out for a while, but saved for the future when standalone_MPI
works with periodic boundary conditions.
AcResult
acGridIntegrateNonperiodic(const Stream stream, const AcReal dt)
{
ERRCHK(grid.initialized);
acGridSynchronizeStream(stream);
const Device device = grid.device;
const int3 nn = grid.nn;
#if MPI_INCL_CORNERS
CommData corner_data = grid.corner_data; // Do not rm: required for corners
#endif // MPI_INCL_CORNERS
CommData edgex_data = grid.edgex_data;
CommData edgey_data = grid.edgey_data;
CommData edgez_data = grid.edgez_data;
CommData sidexy_data = grid.sidexy_data;
CommData sidexz_data = grid.sidexz_data;
CommData sideyz_data = grid.sideyz_data;
acGridLoadScalarUniform(stream, AC_dt, dt);
acDeviceSynchronizeStream(device, stream);
// Corners
#if MPI_INCL_CORNERS
// Do not rm: required for corners
const int3 corner_b0s[] = {
(int3){0, 0, 0},
(int3){NGHOST + nn.x, 0, 0},
(int3){0, NGHOST + nn.y, 0},
(int3){0, 0, NGHOST + nn.z},
(int3){NGHOST + nn.x, NGHOST + nn.y, 0},
(int3){NGHOST + nn.x, 0, NGHOST + nn.z},
(int3){0, NGHOST + nn.y, NGHOST + nn.z},
(int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST + nn.z},
};
#endif // MPI_INCL_CORNERS
// Edges X
const int3 edgex_b0s[] = {
(int3){NGHOST, 0, 0},
(int3){NGHOST, NGHOST + nn.y, 0},
(int3){NGHOST, 0, NGHOST + nn.z},
(int3){NGHOST, NGHOST + nn.y, NGHOST + nn.z},
};
// Edges Y
const int3 edgey_b0s[] = {
(int3){0, NGHOST, 0},
(int3){NGHOST + nn.x, NGHOST, 0},
(int3){0, NGHOST, NGHOST + nn.z},
(int3){NGHOST + nn.x, NGHOST, NGHOST + nn.z},
};
// Edges Z
const int3 edgez_b0s[] = {
(int3){0, 0, NGHOST},
(int3){NGHOST + nn.x, 0, NGHOST},
(int3){0, NGHOST + nn.y, NGHOST},
(int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST},
};
// Sides XY
const int3 sidexy_b0s[] = {
(int3){NGHOST, NGHOST, 0}, //
(int3){NGHOST, NGHOST, NGHOST + nn.z}, //
};
// Sides XZ
const int3 sidexz_b0s[] = {
(int3){NGHOST, 0, NGHOST}, //
(int3){NGHOST, NGHOST + nn.y, NGHOST}, //
};
// Sides YZ
const int3 sideyz_b0s[] = {
(int3){0, NGHOST, NGHOST}, //
(int3){NGHOST + nn.x, NGHOST, NGHOST}, //
};
for (int isubstep = 0; isubstep < 3; ++isubstep) {
#if MPI_COMM_ENABLED
#if MPI_INCL_CORNERS
acPackCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners
#endif // MPI_INCL_CORNERS
acPackCommData(device, edgex_b0s, &edgex_data);
acPackCommData(device, edgey_b0s, &edgey_data);
acPackCommData(device, edgez_b0s, &edgez_data);
acPackCommData(device, sidexy_b0s, &sidexy_data);
acPackCommData(device, sidexz_b0s, &sidexz_data);
acPackCommData(device, sideyz_b0s, &sideyz_data);
#endif
#if MPI_COMM_ENABLED
MPI_Barrier(MPI_COMM_WORLD);
#if MPI_GPUDIRECT_DISABLED
#if MPI_INCL_CORNERS
acTransferCommDataToHost(device, &corner_data); // Do not rm: required for corners
#endif // MPI_INCL_CORNERS
acTransferCommDataToHost(device, &edgex_data);
acTransferCommDataToHost(device, &edgey_data);
acTransferCommDataToHost(device, &edgez_data);
acTransferCommDataToHost(device, &sidexy_data);
acTransferCommDataToHost(device, &sidexz_data);
acTransferCommDataToHost(device, &sideyz_data);
#endif
#if MPI_INCL_CORNERS
acTransferCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners
#endif // MPI_INCL_CORNERS
acTransferCommData(device, edgex_b0s, &edgex_data);
acTransferCommData(device, edgey_b0s, &edgey_data);
acTransferCommData(device, edgez_b0s, &edgez_data);
acTransferCommData(device, sidexy_b0s, &sidexy_data);
acTransferCommData(device, sidexz_b0s, &sidexz_data);
acTransferCommData(device, sideyz_b0s, &sideyz_data);
#endif // MPI_COMM_ENABLED
#if MPI_COMPUTE_ENABLED
//////////// INNER INTEGRATION //////////////
{
const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
const int3 m2 = nn;
acKernelIntegrateSubstep(device->streams[STREAM_16], isubstep, m1, m2, device->vba);
}
////////////////////////////////////////////
#endif // MPI_COMPUTE_ENABLED
#if MPI_COMM_ENABLED
#if MPI_INCL_CORNERS
acTransferCommDataWait(corner_data); // Do not rm: required for corners
#endif // MPI_INCL_CORNERS
acTransferCommDataWait(edgex_data);
acTransferCommDataWait(edgey_data);
acTransferCommDataWait(edgez_data);
acTransferCommDataWait(sidexy_data);
acTransferCommDataWait(sidexz_data);
acTransferCommDataWait(sideyz_data);
#if MPI_INCL_CORNERS
acUnpinCommData(device, &corner_data); // Do not rm: required for corners
#endif // MPI_INCL_CORNERS
acUnpinCommData(device, &edgex_data);
acUnpinCommData(device, &edgey_data);
acUnpinCommData(device, &edgez_data);
acUnpinCommData(device, &sidexy_data);
acUnpinCommData(device, &sidexz_data);
acUnpinCommData(device, &sideyz_data);
#if MPI_INCL_CORNERS
acUnpackCommData(device, corner_b0s, &corner_data);
#endif // MPI_INCL_CORNERS
acUnpackCommData(device, edgex_b0s, &edgex_data);
acUnpackCommData(device, edgey_b0s, &edgey_data);
acUnpackCommData(device, edgez_b0s, &edgez_data);
acUnpackCommData(device, sidexy_b0s, &sidexy_data);
acUnpackCommData(device, sidexz_b0s, &sidexz_data);
acUnpackCommData(device, sideyz_b0s, &sideyz_data);
//////////// OUTER INTEGRATION //////////////
// Wait for unpacking
#if MPI_INCL_CORNERS
acSyncCommData(corner_data); // Do not rm: required for corners
#endif // MPI_INCL_CORNERS
acSyncCommData(edgex_data);
acSyncCommData(edgey_data);
acSyncCommData(edgez_data);
acSyncCommData(sidexy_data);
acSyncCommData(sidexz_data);
acSyncCommData(sideyz_data);
#endif // MPI_COMM_ENABLED
// Invoke outer edge boundary conditions.
acGridGeneralBoundconds(device, stream);
#if MPI_COMPUTE_ENABLED
{ // Front
const int3 m1 = (int3){NGHOST, NGHOST, NGHOST};
const int3 m2 = m1 + (int3){nn.x, nn.y, NGHOST};
acKernelIntegrateSubstep(device->streams[STREAM_0], isubstep, m1, m2, device->vba);
}
{ // Back
const int3 m1 = (int3){NGHOST, NGHOST, nn.z};
const int3 m2 = m1 + (int3){nn.x, nn.y, NGHOST};
acKernelIntegrateSubstep(device->streams[STREAM_1], isubstep, m1, m2, device->vba);
}
{ // Bottom
const int3 m1 = (int3){NGHOST, NGHOST, 2 * NGHOST};
const int3 m2 = m1 + (int3){nn.x, NGHOST, nn.z - 2 * NGHOST};
acKernelIntegrateSubstep(device->streams[STREAM_2], isubstep, m1, m2, device->vba);
}
{ // Top
const int3 m1 = (int3){NGHOST, nn.y, 2 * NGHOST};
const int3 m2 = m1 + (int3){nn.x, NGHOST, nn.z - 2 * NGHOST};
acKernelIntegrateSubstep(device->streams[STREAM_3], isubstep, m1, m2, device->vba);
}
{ // Left
const int3 m1 = (int3){NGHOST, 2 * NGHOST, 2 * NGHOST};
const int3 m2 = m1 + (int3){NGHOST, nn.y - 2 * NGHOST, nn.z - 2 * NGHOST};
acKernelIntegrateSubstep(device->streams[STREAM_4], isubstep, m1, m2, device->vba);
}
{ // Right
const int3 m1 = (int3){nn.x, 2 * NGHOST, 2 * NGHOST};
const int3 m2 = m1 + (int3){NGHOST, nn.y - 2 * NGHOST, nn.z - 2 * NGHOST};
acKernelIntegrateSubstep(device->streams[STREAM_5], isubstep, m1, m2, device->vba);
}
#endif // MPI_COMPUTE_ENABLED
acDeviceSwapBuffers(device);
acDeviceSynchronizeStream(device, STREAM_ALL); // Wait until inner and outer done
////////////////////////////////////////////
}
return AC_SUCCESS;
}
*/
AcResult
acGridPeriodicBoundconds(const Stream stream)
{

View File

@@ -1,5 +1,133 @@
#pragma once
static __global__ void
kernel_symmetric_boundconds(const int3 start, const int3 end, AcReal* vtxbuf, const int3 bindex, const int sign)
{
const int i_dst = start.x + threadIdx.x + blockIdx.x * blockDim.x;
const int j_dst = start.y + threadIdx.y + blockIdx.y * blockDim.y;
const int k_dst = start.z + threadIdx.z + blockIdx.z * blockDim.z;
// If within the start-end range (this allows threadblock dims that are not
// divisible by end - start)
if (i_dst >= end.x || j_dst >= end.y || k_dst >= end.z)
return;
// If destination index is inside the computational domain, return since
// the boundary conditions are only applied to the ghost zones
if (i_dst >= DCONST(AC_nx_min) && i_dst < DCONST(AC_nx_max) && j_dst >= DCONST(AC_ny_min) &&
j_dst < DCONST(AC_ny_max) && k_dst >= DCONST(AC_nz_min) && k_dst < DCONST(AC_nz_max))
return;
// Find the source index
// Map to nx, ny, nz coordinates
int i_src, j_src, k_src, boundlocx0, boundlocx1, boundlocy0, boundlocy1, boundlocz0, boundlocz1;
int bsize = STENCIL_ORDER / 2;
//if (bindex.x != 0)
//if (bindex.y != 0)
//if (bindex.z != 0)
//Location of central border point.
boundlocx0 = bsize;
boundlocy0 = bsize;
boundlocz0 = bsize;
boundlocx1 = DCONST(AC_nx_max) - 1;
boundlocy1 = DCONST(AC_ny_max) - 1;
boundlocz1 = DCONST(AC_nz_max) - 1;
//Defaults
i_src = i_dst;
j_src = j_dst;
k_src = k_dst;
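// Mirror mapping: a ghost cell at i_dst reflects across the boundary plane at boundloc,
// i.e. i_src = 2*boundloc - i_dst. For example, if STENCIL_ORDER were 6, then
// bsize = boundlocx0 = 3 and a bottom ghost cell at i_dst = 1 would read from i_src = 5.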
if (bindex.x < 0)
{
// Pick up the mirroring value.
if ((i_dst < boundlocx0))
{
i_src = 2.0f*boundlocx0 - i_dst;
} else if ((i_dst > boundlocx1))
{
i_src = 2.0f*boundlocx1 - i_dst;
}
// Pick up the mirroring value.
if ((j_dst < boundlocy0))
{
j_src = 2.0f*boundlocy0 - j_dst;
} else if ((j_dst > boundlocy1))
{
j_src = 2.0f*boundlocy1 - j_dst;
}
// Pick up the mirroring value.
if ((k_dst < boundlocz0))
{
k_src = 2.0f*boundlocz0 - k_dst;
} else if ((k_dst > boundlocz1))
{
k_src = 2.0f*boundlocz1 - k_dst;
}
//Edges
if ( (i_dst < boundlocx0) && (j_dst < boundlocy0) )
{
i_src = 2.0f*boundlocx0 - i_dst;
j_src = 2.0f*boundlocy0 - j_dst;
//if ((k_dst == 50)) printf("i_dst %i j_dst %i k_dst %i i_src %i j_src %i k_src %i bsize %i \n", i_dst, j_dst, k_dst, i_src, j_src, k_src, bsize);
} else if ((i_dst < boundlocx0) && (k_dst < boundlocz0) )
{
i_src = 2.0f*boundlocx0 - i_dst;
k_src = 2.0f*boundlocz0 - k_dst;
} else if ( (j_dst < boundlocy0) && (k_dst < boundlocz0) )
{
j_src = 2.0f*boundlocy0 - j_dst;
k_src = 2.0f*boundlocz0 - k_dst;
} else if ((i_dst > boundlocx1) && (j_dst > boundlocy1) )
{
i_src = 2.0f*boundlocx1 - i_dst;
j_src = 2.0f*boundlocy1 - j_dst;
} else if ( (i_dst > boundlocx1) && (k_dst > boundlocz1) )
{
i_src = 2.0f*boundlocx1 - i_dst;
k_src = 2.0f*boundlocz1 - k_dst;
} else if ( (j_dst > boundlocy1) && (k_dst > boundlocz1) )
{
j_src = 2.0f*boundlocy1 - j_dst;
k_src = 2.0f*boundlocz1 - k_dst;
} else if ( (i_dst > boundlocx1) && (k_dst < boundlocz0) )
{
i_src = 2.0f*boundlocx1 - i_dst;
k_src = 2.0f*boundlocz0 - k_dst;
} else if ( (i_dst > boundlocx1) && (j_dst < boundlocy0) )
{
i_src = 2.0f*boundlocx1 - i_dst;
j_src = 2.0f*boundlocy0 - j_dst;
} else if ( (i_dst < boundlocx0) && (k_dst > boundlocz1) )
{
i_src = 2.0f*boundlocx0 - i_dst;
k_src = 2.0f*boundlocz1 - k_dst;
} else if ( (i_dst < boundlocx0) && (j_dst > boundlocy1) )
{
i_src = 2.0f*boundlocx0 - i_dst;
j_src = 2.0f*boundlocy1 - j_dst;
} else if ( (j_dst > boundlocy1) && (k_dst < boundlocz0) )
{
j_src = 2.0f*boundlocy1 - j_dst;
k_src = 2.0f*boundlocz0 - k_dst;
}
}
const int src_idx = DEVICE_VTXBUF_IDX(i_src, j_src, k_src);
const int dst_idx = DEVICE_VTXBUF_IDX(i_dst, j_dst, k_dst);
vtxbuf[dst_idx] = sign*vtxbuf[src_idx]; // sign = 1 symmetric, sign = -1 antisymmetric
}
static __global__ void
kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vtxbuf)
{
@@ -60,3 +188,55 @@ acKernelPeriodicBoundconds(const cudaStream_t stream, const int3 start, const in
ERRCHK_CUDA_KERNEL();
return AC_SUCCESS;
}
AcResult
acKernelGeneralBoundconds(const cudaStream_t stream, const int3 start, const int3 end,
AcReal* vtxbuf, const VertexBufferHandle vtxbuf_handle,
const AcMeshInfo config, const int3 bindex)
{
const dim3 tpb(8, 2, 8);
const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x),
(unsigned int)ceil((end.y - start.y) / (float)tpb.y),
(unsigned int)ceil((end.z - start.z) / (float)tpb.z));
int3 bc_top = {config.int_params[AC_bc_type_top_x], config.int_params[AC_bc_type_top_y],
config.int_params[AC_bc_type_top_z]};
int3 bc_bot = {config.int_params[AC_bc_type_bot_x], config.int_params[AC_bc_type_bot_y],
config.int_params[AC_bc_type_bot_z]};
//#if AC_MPI_ENABLED
// printf( "WARNING : NON-PERIODIC BOUNDARY CONDITIONS NOT SUPPORTER BY MPI! Only working at node level.\n");
// return AC_FAILURE;
//#endif
if (vtxbuf_handle != -1) // Dummy check for now; makes it possible later to switch the boundary condition per vertex buffer
{
if (bc_top.x == AC_BOUNDCOND_SYMMETRIC)
{
kernel_symmetric_boundconds<<<bpg, tpb, 0, stream>>>(start, end, vtxbuf, bindex, 1);
ERRCHK_CUDA_KERNEL();
}
else if (bc_top.x == AC_BOUNDCOND_ANTISYMMETRIC)
{
kernel_symmetric_boundconds<<<bpg, tpb, 0, stream>>>(start, end, vtxbuf, bindex, -1);
ERRCHK_CUDA_KERNEL();
}
else if (bc_top.x == AC_BOUNDCOND_PERIODIC)
{
kernel_periodic_boundconds<<<bpg, tpb, 0, stream>>>(start, end, vtxbuf);
ERRCHK_CUDA_KERNEL();
}
else
{
printf("ERROR: Boundary condition not recognized!\n");
printf("ERROR: bc_top = %i, %i, %i \n", bc_top.x, bc_top.y, bc_top.z);
printf("ERROR: bc_bot = %i, %i, %i \n", bc_bot.x, bc_bot.y, bc_bot.z);
return AC_FAILURE;
}
}
return AC_SUCCESS;
}

View File

@@ -45,6 +45,11 @@ extern "C" {
/** */
AcResult acKernelPeriodicBoundconds(const cudaStream_t stream, const int3 start, const int3 end,
AcReal* vtxbuf);
/** */
AcResult acKernelGeneralBoundconds(const cudaStream_t stream, const int3 start, const int3 end,
AcReal* vtxbuf, const VertexBufferHandle vtxbuf_handle,
const AcMeshInfo config, const int3 bindex);
/** */
AcResult acKernelDummy(void);

View File

@@ -656,6 +656,30 @@ local_boundcondstep(const Node node, const Stream stream, const VertexBufferHand
return AC_SUCCESS;
}
static AcResult
local_boundcondstep_GBC(const Node node, const Stream stream, const VertexBufferHandle vtxbuf, const AcMeshInfo config)
{
acNodeSynchronizeStream(node, stream);
int3 bindex = {-1, -1, -1}; //Dummy for node level. Relevant only for MPI.
if (node->num_devices > 1) {
// Local boundary conditions
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) {
const int3 d0 = (int3){0, 0, NGHOST}; // DECOMPOSITION OFFSET HERE
const int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y, d0.z + node->subgrid.n.z};
acDeviceGeneralBoundcondStep(node->devices[i], stream, vtxbuf, d0, d1, config, bindex);
}
}
else {
acDeviceGeneralBoundcondStep(node->devices[0], stream, vtxbuf, (int3){0, 0, 0},
node->subgrid.m, config, bindex);
}
return AC_SUCCESS;
}
static AcResult
global_boundcondstep(const Node node, const Stream stream, const VertexBufferHandle vtxbuf_handle)
{
@@ -768,6 +792,85 @@ acNodeIntegrate(const Node node, const AcReal dt)
return AC_SUCCESS;
}
AcResult
acNodeIntegrateGBC(const Node node, const AcMeshInfo config, const AcReal dt)
{
acNodeSynchronizeStream(node, STREAM_ALL);
// xxx|OOO OOOOOOOOO OOO|xxx
// ^ ^ ^ ^
// n0 n1 n2 n3
// const int3 n0 = (int3){NGHOST, NGHOST, NGHOST};
// const int3 n1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
// const int3 n2 = node->grid.n;
// const int3 n3 = n0 + node->grid.n;
for (int isubstep = 0; isubstep < 3; ++isubstep) {
acNodeSynchronizeStream(node, STREAM_ALL);
for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {
local_boundcondstep_GBC(node, (Stream)vtxbuf, (VertexBufferHandle)vtxbuf, config);
}
acNodeSynchronizeStream(node, STREAM_ALL);
// Inner inner
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) {
const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
const int3 m2 = node->subgrid.n;
acDeviceIntegrateSubstep(node->devices[i], STREAM_16, isubstep, m1, m2, dt);
}
for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {
acNodeSynchronizeVertexBuffer(node, (Stream)vtxbuf, (VertexBufferHandle)vtxbuf);
global_boundcondstep(node, (Stream)vtxbuf, (VertexBufferHandle)vtxbuf);
}
for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {
acNodeSynchronizeStream(node, (Stream)vtxbuf);
}
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) { // Front
const int3 m1 = (int3){NGHOST, NGHOST, NGHOST};
const int3 m2 = m1 + (int3){node->subgrid.n.x, node->subgrid.n.y, NGHOST};
acDeviceIntegrateSubstep(node->devices[i], STREAM_0, isubstep, m1, m2, dt);
}
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) { // Back
const int3 m1 = (int3){NGHOST, NGHOST, node->subgrid.n.z};
const int3 m2 = m1 + (int3){node->subgrid.n.x, node->subgrid.n.y, NGHOST};
acDeviceIntegrateSubstep(node->devices[i], STREAM_1, isubstep, m1, m2, dt);
}
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) { // Bottom
const int3 m1 = (int3){NGHOST, NGHOST, 2 * NGHOST};
const int3 m2 = m1 + (int3){node->subgrid.n.x, NGHOST, node->subgrid.n.z - 2 * NGHOST};
acDeviceIntegrateSubstep(node->devices[i], STREAM_2, isubstep, m1, m2, dt);
}
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) { // Top
const int3 m1 = (int3){NGHOST, node->subgrid.n.y, 2 * NGHOST};
const int3 m2 = m1 + (int3){node->subgrid.n.x, NGHOST, node->subgrid.n.z - 2 * NGHOST};
acDeviceIntegrateSubstep(node->devices[i], STREAM_3, isubstep, m1, m2, dt);
}
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) { // Left
const int3 m1 = (int3){NGHOST, 2 * NGHOST, 2 * NGHOST};
const int3 m2 = m1 + (int3){NGHOST, node->subgrid.n.y - 2 * NGHOST,
node->subgrid.n.z - 2 * NGHOST};
acDeviceIntegrateSubstep(node->devices[i], STREAM_4, isubstep, m1, m2, dt);
}
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) { // Right
const int3 m1 = (int3){node->subgrid.n.x, 2 * NGHOST, 2 * NGHOST};
const int3 m2 = m1 + (int3){NGHOST, node->subgrid.n.y - 2 * NGHOST,
node->subgrid.n.z - 2 * NGHOST};
acDeviceIntegrateSubstep(node->devices[i], STREAM_5, isubstep, m1, m2, dt);
}
acNodeSwapBuffers(node);
}
acNodeSynchronizeStream(node, STREAM_ALL);
return AC_SUCCESS;
}
AcResult
acNodePeriodicBoundcondStep(const Node node, const Stream stream,
const VertexBufferHandle vtxbuf_handle)
@@ -783,7 +886,20 @@ acNodePeriodicBoundcondStep(const Node node, const Stream stream,
}
AcResult
acNodePeriodicBoundconds(const Node node, const Stream stream)
acNodeGeneralBoundcondStep(const Node node, const Stream stream,
const VertexBufferHandle vtxbuf_handle, const AcMeshInfo config)
{
local_boundcondstep_GBC(node, stream, vtxbuf_handle, config);
acNodeSynchronizeVertexBuffer(node, stream, vtxbuf_handle);
global_boundcondstep(node, stream, vtxbuf_handle);
return AC_SUCCESS;
}
AcResult
acNodePeriodicBoundconds(const Node node, const Stream stream)
{
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
acNodePeriodicBoundcondStep(node, stream, (VertexBufferHandle)i);
@@ -791,6 +907,15 @@ acNodePeriodicBoundconds(const Node node, const Stream stream)
return AC_SUCCESS;
}
AcResult
acNodeGeneralBoundconds(const Node node, const Stream stream, const AcMeshInfo config)
{
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
acNodeGeneralBoundcondStep(node, stream, (VertexBufferHandle)i, config);
}
return AC_SUCCESS;
}
static AcReal
simple_final_reduce_scal(const Node node, const ReductionType& rtype, const AcReal* results,
const int& n)