@@ -20,9 +20,15 @@
 uniform int AC_max_steps;
 uniform int AC_save_steps;
 uniform int AC_bin_steps;
-uniform int AC_bc_type;
 uniform int AC_start_step;
+
+uniform int AC_bc_type_top_x;
+uniform int AC_bc_type_bot_x;
+uniform int AC_bc_type_top_y;
+uniform int AC_bc_type_bot_y;
+uniform int AC_bc_type_top_z;
+uniform int AC_bc_type_bot_z;
 
 // Real params
 uniform Scalar AC_dt;
 uniform Scalar AC_max_time;
@@ -13,6 +13,15 @@ AC_dsx = 0.04908738521
 AC_dsy = 0.04908738521
 AC_dsz = 0.04908738521
 
+// 0 = periodic bc, 1 = symmetric bc, 2 = antisymmetric bc
+AC_bc_type_top_x = 0
+AC_bc_type_top_y = 0
+AC_bc_type_top_z = 0
+AC_bc_type_bot_x = 0
+AC_bc_type_bot_y = 0
+AC_bc_type_bot_z = 0
+
+
 /*
 * =============================================================================
 * Run-time params
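The defaults above leave every face periodic. As a quick illustration of the encoding from the comment (hypothetical values, not part of the patch), symmetric walls in z with x and y kept periodic would read:

    // Illustrative only: periodic in x and y, symmetric walls in z
    AC_bc_type_top_z = 1
    AC_bc_type_bot_z = 1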
@@ -49,6 +49,11 @@ typedef struct {
 
 typedef enum { AC_SUCCESS = 0, AC_FAILURE = 1 } AcResult;
 
+// Named constants for the numeric boundary condition types
+typedef enum { AC_BOUNDCOND_PERIODIC = 0,
+               AC_BOUNDCOND_SYMMETRIC = 1,
+               AC_BOUNDCOND_ANTISYMMETRIC = 2 } AcBoundcond;
+
 #define AC_GEN_ID(X) X,
 typedef enum {
     AC_FOR_RTYPES(AC_GEN_ID) //
@@ -227,10 +232,19 @@ AcResult acStore(AcMesh* host_mesh);
  * substep and the user is responsible for calling acBoundcondStep before reading the data. */
 AcResult acIntegrate(const AcReal dt);
 
+/** Performs Runge-Kutta 3 integration. Note: Boundary conditions are not applied after the final
+ * substep and the user is responsible for calling acBoundcondStepGBC before reading the data.
+ * Supports customizable boundary conditions. */
+AcResult acIntegrateGBC(const AcMeshInfo config, const AcReal dt);
+
 /** Applies periodic boundary conditions for the Mesh distributed among the devices visible to
  * the caller*/
 AcResult acBoundcondStep(void);
 
+/** Applies general outer boundary conditions for the Mesh distributed among the devices visible
+ * to the caller. */
+AcResult acBoundcondStepGBC(const AcMeshInfo config);
+
 /** Does a scalar reduction with the data stored in some vertex buffer */
 AcReal acReduceScal(const ReductionType rtype, const VertexBufferHandle vtxbuf_handle);
 
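A minimal sketch of the intended call sequence for the new entry points (adapted from the run_simulation changes further below; assumes the library has been initialized and mesh/mesh_info are set up as usual):

    // One simulation step with configurable boundary conditions
    acIntegrateGBC(mesh_info, dt);  // RK3; ghost zones are stale afterwards
    acBoundcondStepGBC(mesh_info);  // apply the configured outer BCs
    acStore(mesh);                  // now safe to read the mesh on the host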
@@ -306,9 +320,21 @@ AcResult acGridStoreMesh(const Stream stream, AcMesh* host_mesh);
 /** */
 AcResult acGridIntegrate(const Stream stream, const AcReal dt);
 
+/** */
+/* MV: Commented out for now, but saved for the future when standalone_MPI
+   works with periodic boundary conditions.
+
+AcResult acGridIntegrateNonperiodic(const Stream stream, const AcReal dt);
+*/
+
 /** */
 AcResult acGridPeriodicBoundconds(const Stream stream);
 
+/** */
+AcResult acGridGeneralBoundconds(const Device device, const Stream stream);
+
 /** TODO */
 AcResult acGridReduceScal(const Stream stream, const ReductionType rtype,
                           const VertexBufferHandle vtxbuf_handle, AcReal* result);
@@ -430,6 +456,9 @@ AcResult acNodeIntegrateSubstep(const Node node, const Stream stream, const int
 /** */
 AcResult acNodeIntegrate(const Node node, const AcReal dt);
 
+/** */
+AcResult acNodeIntegrateGBC(const Node node, const AcMeshInfo config, const AcReal dt);
+
 /** */
 AcResult acNodePeriodicBoundcondStep(const Node node, const Stream stream,
                                      const VertexBufferHandle vtxbuf_handle);
@@ -437,6 +466,13 @@ AcResult acNodePeriodicBoundcondStep(const Node node, const Stream stream,
 /** */
 AcResult acNodePeriodicBoundconds(const Node node, const Stream stream);
 
+/** */
+AcResult acNodeGeneralBoundcondStep(const Node node, const Stream stream,
+                                    const VertexBufferHandle vtxbuf_handle,
+                                    const AcMeshInfo config);
+
+/** */
+AcResult acNodeGeneralBoundconds(const Node node, const Stream stream, const AcMeshInfo config);
+
 /** */
 AcResult acNodeReduceScal(const Node node, const Stream stream, const ReductionType rtype,
                           const VertexBufferHandle vtxbuf_handle, AcReal* result);
@@ -565,6 +601,16 @@ AcResult acDevicePeriodicBoundcondStep(const Device device, const Stream stream,
 AcResult acDevicePeriodicBoundconds(const Device device, const Stream stream, const int3 start,
                                     const int3 end);
 
+/** */
+AcResult acDeviceGeneralBoundcondStep(const Device device, const Stream stream,
+                                      const VertexBufferHandle vtxbuf_handle, const int3 start,
+                                      const int3 end, const AcMeshInfo config, const int3 bindex);
+
+/** */
+AcResult acDeviceGeneralBoundconds(const Device device, const Stream stream, const int3 start,
+                                   const int3 end, const AcMeshInfo config, const int3 bindex);
+
+
 /** */
 AcResult acDeviceReduceScal(const Device device, const Stream stream, const ReductionType rtype,
                             const VertexBufferHandle vtxbuf_handle, AcReal* result);
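For reference, the bindex argument encodes where the subdomain sits relative to the physical boundary: node.cc passes {-1, -1, -1} as an "apply everywhere" dummy, while the (commented-out) MPI path in grid.cc sets per-axis codes 0-3. A node-level call sketch (VTXBUF_LNRHO stands in for whichever vertex buffer handle is actually in use):

    const int3 bindex = (int3){-1, -1, -1}; // node level: no MPI decomposition
    acDeviceGeneralBoundcondStep(device, STREAM_DEFAULT, VTXBUF_LNRHO,
                                 start, end, config, bindex);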
@@ -374,7 +374,8 @@ run_simulation(const char* config_path)
 #endif
     }
 
-    acBoundcondStep();
+    //acBoundcondStep();
+    acBoundcondStepGBC(mesh_info);
     acStore(mesh);
     if (start_step == 0) {
         save_mesh(*mesh, 0, t_step);
@@ -440,7 +441,11 @@ run_simulation(const char* config_path)
         loadForcingParamsToDevice(forcing_params);
 #endif
 
-        acIntegrate(dt);
+        /* Now uses flexible boundary conditions */
+        //acIntegrate(dt);
+        acIntegrateGBC(mesh_info, dt);
+
 
         t_step += dt;
 
@@ -486,7 +491,8 @@ run_simulation(const char* config_path)
             acBoundcondStep();
             acStore(mesh);
             */
-            acBoundcondStep();
+            //acBoundcondStep();
+            acBoundcondStepGBC(mesh_info);
             acStore(mesh);
 
             save_mesh(*mesh, i, t_step);
@@ -507,7 +513,8 @@ run_simulation(const char* config_path)
         if (dt < dt_typical/AcReal(1e5)) {
             if (dtcounter > 10) {
                 printf("dt = %e TOO LOW! Ending run at t = %#e \n", double(dt), double(t_step));
-                acBoundcondStep();
+                //acBoundcondStep();
+                acBoundcondStepGBC(mesh_info);
                 acStore(mesh);
                 save_mesh(*mesh, i, t_step);
                 break;
@@ -521,7 +528,8 @@ run_simulation(const char* config_path)
         // End loop if nan is found
         if (found_nan > 0) {
             printf("Found nan at t = %e \n", double(t_step));
-            acBoundcondStep();
+            //acBoundcondStep();
+            acBoundcondStepGBC(mesh_info);
             acStore(mesh);
             save_mesh(*mesh, i, t_step);
             break;
@@ -536,7 +544,8 @@ run_simulation(const char* config_path)
 
         if (found_stop == 1) {
             printf("Found STOP file at t = %e \n", double(t_step));
-            acBoundcondStep();
+            //acBoundcondStep();
+            acBoundcondStepGBC(mesh_info);
             acStore(mesh);
             save_mesh(*mesh, i, t_step);
             break;
@@ -86,6 +86,13 @@ acIntegrate(const AcReal dt)
     return acNodeIntegrate(nodes[0], dt);
 }
 
+AcResult
+acIntegrateGBC(const AcMeshInfo config, const AcReal dt)
+{
+    return acNodeIntegrateGBC(nodes[0], config, dt);
+}
+
+
 AcResult
 acIntegrateStep(const int isubstep, const AcReal dt)
 {
@@ -109,6 +116,12 @@ acBoundcondStep(void)
     return acNodePeriodicBoundconds(nodes[0], STREAM_DEFAULT);
 }
 
+AcResult
+acBoundcondStepGBC(const AcMeshInfo config)
+{
+    return acNodeGeneralBoundconds(nodes[0], STREAM_DEFAULT, config);
+}
+
 AcReal
 acReduceScal(const ReductionType rtype, const VertexBufferHandle vtxbuf_handle)
 {
@@ -433,6 +433,27 @@ acDevicePeriodicBoundconds(const Device device, const Stream stream, const int3
     return AC_SUCCESS;
 }
 
+AcResult
+acDeviceGeneralBoundcondStep(const Device device, const Stream stream,
+                             const VertexBufferHandle vtxbuf_handle, const int3 start,
+                             const int3 end, const AcMeshInfo config, const int3 bindex)
+{
+    cudaSetDevice(device->id);
+    return acKernelGeneralBoundconds(device->streams[stream], start, end,
+                                     device->vba.in[vtxbuf_handle], vtxbuf_handle, config, bindex);
+}
+
+AcResult
+acDeviceGeneralBoundconds(const Device device, const Stream stream, const int3 start,
+                          const int3 end, const AcMeshInfo config, const int3 bindex)
+{
+    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
+        acDeviceGeneralBoundcondStep(device, stream, (VertexBufferHandle)i, start, end, config,
+                                     bindex);
+    }
+    return AC_SUCCESS;
+}
+
+
 AcResult
 acDeviceReduceScal(const Device device, const Stream stream, const ReductionType rtype,
                    const VertexBufferHandle vtxbuf_handle, AcReal* result)
@@ -1433,6 +1454,50 @@ acGridStoreMesh(const Stream stream, AcMesh* host_mesh)
     return AC_SUCCESS;
 }
 
+/* MV: Commented out for now, but saved for the future when standalone_MPI
+   works with periodic boundary conditions.
+
+AcResult
+acGridGeneralBoundconds(const Device device, const Stream stream)
+{
+    // Non-periodic boundary conditions
+    // Check the position in the MPI frame
+    int nprocs, pid;
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
+    const uint3_64 decomposition = decompose(nprocs);
+    const int3 pid3d = getPid3D(pid, decomposition);
+
+    // Set outer boundaries after substep computation.
+    const int3 m1 = (int3){0, 0, 0};
+    const int3 m2 = grid.nn;
+
+    // If we are a boundary element
+    int3 bindex = (int3){0, 0, 0};
+
+    // Check if there are active boundary condition edges.
+    // 0 is no boundary, 1 both edges, 2 is top edge, 3 is bottom edge
+    if ((pid3d.x == 0) && (pid3d.x == decomposition.x - 1)) { bindex.x = 1; }
+    else if (pid3d.x == 0) { bindex.x = 2; }
+    else if (pid3d.x == decomposition.x - 1) { bindex.x = 3; }
+
+    if ((pid3d.y == 0) && (pid3d.y == decomposition.y - 1)) { bindex.y = 1; }
+    else if (pid3d.y == 0) { bindex.y = 2; }
+    else if (pid3d.y == decomposition.y - 1) { bindex.y = 3; }
+
+    if ((pid3d.z == 0) && (pid3d.z == decomposition.z - 1)) { bindex.z = 1; }
+    else if (pid3d.z == 0) { bindex.z = 2; }
+    else if (pid3d.z == decomposition.z - 1) { bindex.z = 3; }
+
+    if ((bindex.x != 1) && (bindex.y != 1) && (bindex.z != 1)) {
+        acDeviceGeneralBoundconds(device, stream, m1, m2, bindex);
+    }
+    acGridSynchronizeStream(stream);
+
+    return AC_SUCCESS;
+}
+*/
+
 /*
 // Unused
 AcResult
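The three per-axis if/else ladders in the saved routine repeat one pattern; a compact equivalent (hypothetical helper, not in the patch, using the same 0-3 encoding):

    // 0 = no physical boundary, 1 = both edges, 2 = top edge, 3 = bottom edge
    static int edge_code(const int coord, const int nprocs_axis)
    {
        const bool lo = (coord == 0);
        const bool hi = (coord == nprocs_axis - 1);
        if (lo && hi) return 1; // undecomposed axis: rank owns both edges
        if (lo)       return 2;
        if (hi)       return 3;
        return 0;               // interior rank
    }
    // e.g. bindex.x = edge_code(pid3d.x, decomposition.x);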
@@ -1864,6 +1929,234 @@ acGridIntegrate(const Stream stream, const AcReal dt)
     return AC_SUCCESS;
 }
 
+/* MV: Commented out for now, but saved for the future when standalone_MPI
+   works with periodic boundary conditions.
+
+AcResult
+acGridIntegrateNonperiodic(const Stream stream, const AcReal dt)
+{
+    ERRCHK(grid.initialized);
+    acGridSynchronizeStream(stream);
+
+    const Device device = grid.device;
+    const int3 nn       = grid.nn;
+#if MPI_INCL_CORNERS
+    CommData corner_data = grid.corner_data; // Do not rm: required for corners
+#endif // MPI_INCL_CORNERS
+    CommData edgex_data  = grid.edgex_data;
+    CommData edgey_data  = grid.edgey_data;
+    CommData edgez_data  = grid.edgez_data;
+    CommData sidexy_data = grid.sidexy_data;
+    CommData sidexz_data = grid.sidexz_data;
+    CommData sideyz_data = grid.sideyz_data;
+
+    acGridLoadScalarUniform(stream, AC_dt, dt);
+    acDeviceSynchronizeStream(device, stream);
+
+    // Corners
+#if MPI_INCL_CORNERS
+    // Do not rm: required for corners
+    const int3 corner_b0s[] = {
+        (int3){0, 0, 0},
+        (int3){NGHOST + nn.x, 0, 0},
+        (int3){0, NGHOST + nn.y, 0},
+        (int3){0, 0, NGHOST + nn.z},
+
+        (int3){NGHOST + nn.x, NGHOST + nn.y, 0},
+        (int3){NGHOST + nn.x, 0, NGHOST + nn.z},
+        (int3){0, NGHOST + nn.y, NGHOST + nn.z},
+        (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST + nn.z},
+    };
+#endif // MPI_INCL_CORNERS
+
+    // Edges X
+    const int3 edgex_b0s[] = {
+        (int3){NGHOST, 0, 0},
+        (int3){NGHOST, NGHOST + nn.y, 0},
+
+        (int3){NGHOST, 0, NGHOST + nn.z},
+        (int3){NGHOST, NGHOST + nn.y, NGHOST + nn.z},
+    };
+
+    // Edges Y
+    const int3 edgey_b0s[] = {
+        (int3){0, NGHOST, 0},
+        (int3){NGHOST + nn.x, NGHOST, 0},
+
+        (int3){0, NGHOST, NGHOST + nn.z},
+        (int3){NGHOST + nn.x, NGHOST, NGHOST + nn.z},
+    };
+
+    // Edges Z
+    const int3 edgez_b0s[] = {
+        (int3){0, 0, NGHOST},
+        (int3){NGHOST + nn.x, 0, NGHOST},
+
+        (int3){0, NGHOST + nn.y, NGHOST},
+        (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST},
+    };
+
+    // Sides XY
+    const int3 sidexy_b0s[] = {
+        (int3){NGHOST, NGHOST, 0},             //
+        (int3){NGHOST, NGHOST, NGHOST + nn.z}, //
+    };
+
+    // Sides XZ
+    const int3 sidexz_b0s[] = {
+        (int3){NGHOST, 0, NGHOST},             //
+        (int3){NGHOST, NGHOST + nn.y, NGHOST}, //
+    };
+
+    // Sides YZ
+    const int3 sideyz_b0s[] = {
+        (int3){0, NGHOST, NGHOST},             //
+        (int3){NGHOST + nn.x, NGHOST, NGHOST}, //
+    };
+
+    for (int isubstep = 0; isubstep < 3; ++isubstep) {
+
+#if MPI_COMM_ENABLED
+#if MPI_INCL_CORNERS
+        acPackCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners
+#endif // MPI_INCL_CORNERS
+        acPackCommData(device, edgex_b0s, &edgex_data);
+        acPackCommData(device, edgey_b0s, &edgey_data);
+        acPackCommData(device, edgez_b0s, &edgez_data);
+        acPackCommData(device, sidexy_b0s, &sidexy_data);
+        acPackCommData(device, sidexz_b0s, &sidexz_data);
+        acPackCommData(device, sideyz_b0s, &sideyz_data);
+#endif
+
+#if MPI_COMM_ENABLED
+        MPI_Barrier(MPI_COMM_WORLD);
+
+#if MPI_GPUDIRECT_DISABLED
+#if MPI_INCL_CORNERS
+        acTransferCommDataToHost(device, &corner_data); // Do not rm: required for corners
+#endif // MPI_INCL_CORNERS
+        acTransferCommDataToHost(device, &edgex_data);
+        acTransferCommDataToHost(device, &edgey_data);
+        acTransferCommDataToHost(device, &edgez_data);
+        acTransferCommDataToHost(device, &sidexy_data);
+        acTransferCommDataToHost(device, &sidexz_data);
+        acTransferCommDataToHost(device, &sideyz_data);
+#endif
+#if MPI_INCL_CORNERS
+        acTransferCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners
+#endif // MPI_INCL_CORNERS
+        acTransferCommData(device, edgex_b0s, &edgex_data);
+        acTransferCommData(device, edgey_b0s, &edgey_data);
+        acTransferCommData(device, edgez_b0s, &edgez_data);
+        acTransferCommData(device, sidexy_b0s, &sidexy_data);
+        acTransferCommData(device, sidexz_b0s, &sidexz_data);
+        acTransferCommData(device, sideyz_b0s, &sideyz_data);
+#endif // MPI_COMM_ENABLED
+
+#if MPI_COMPUTE_ENABLED
+        //////////// INNER INTEGRATION //////////////
+        {
+            const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
+            const int3 m2 = nn;
+            acKernelIntegrateSubstep(device->streams[STREAM_16], isubstep, m1, m2, device->vba);
+        }
+        ////////////////////////////////////////////
+#endif // MPI_COMPUTE_ENABLED
+
+#if MPI_COMM_ENABLED
+#if MPI_INCL_CORNERS
+        acTransferCommDataWait(corner_data); // Do not rm: required for corners
+#endif // MPI_INCL_CORNERS
+        acTransferCommDataWait(edgex_data);
+        acTransferCommDataWait(edgey_data);
+        acTransferCommDataWait(edgez_data);
+        acTransferCommDataWait(sidexy_data);
+        acTransferCommDataWait(sidexz_data);
+        acTransferCommDataWait(sideyz_data);
+
+#if MPI_INCL_CORNERS
+        acUnpinCommData(device, &corner_data); // Do not rm: required for corners
+#endif // MPI_INCL_CORNERS
+        acUnpinCommData(device, &edgex_data);
+        acUnpinCommData(device, &edgey_data);
+        acUnpinCommData(device, &edgez_data);
+        acUnpinCommData(device, &sidexy_data);
+        acUnpinCommData(device, &sidexz_data);
+        acUnpinCommData(device, &sideyz_data);
+
+#if MPI_INCL_CORNERS
+        acUnpackCommData(device, corner_b0s, &corner_data);
+#endif // MPI_INCL_CORNERS
+        acUnpackCommData(device, edgex_b0s, &edgex_data);
+        acUnpackCommData(device, edgey_b0s, &edgey_data);
+        acUnpackCommData(device, edgez_b0s, &edgez_data);
+        acUnpackCommData(device, sidexy_b0s, &sidexy_data);
+        acUnpackCommData(device, sidexz_b0s, &sidexz_data);
+        acUnpackCommData(device, sideyz_b0s, &sideyz_data);
+        //////////// OUTER INTEGRATION //////////////
+
+        // Wait for unpacking
+#if MPI_INCL_CORNERS
+        acSyncCommData(corner_data); // Do not rm: required for corners
+#endif // MPI_INCL_CORNERS
+        acSyncCommData(edgex_data);
+        acSyncCommData(edgey_data);
+        acSyncCommData(edgez_data);
+        acSyncCommData(sidexy_data);
+        acSyncCommData(sidexz_data);
+        acSyncCommData(sideyz_data);
+#endif // MPI_COMM_ENABLED
+
+        // Invoke outer edge boundary conditions.
+        acGridGeneralBoundconds(device, stream);
+
+#if MPI_COMPUTE_ENABLED
+        { // Front
+            const int3 m1 = (int3){NGHOST, NGHOST, NGHOST};
+            const int3 m2 = m1 + (int3){nn.x, nn.y, NGHOST};
+            acKernelIntegrateSubstep(device->streams[STREAM_0], isubstep, m1, m2, device->vba);
+        }
+        { // Back
+            const int3 m1 = (int3){NGHOST, NGHOST, nn.z};
+            const int3 m2 = m1 + (int3){nn.x, nn.y, NGHOST};
+            acKernelIntegrateSubstep(device->streams[STREAM_1], isubstep, m1, m2, device->vba);
+        }
+        { // Bottom
+            const int3 m1 = (int3){NGHOST, NGHOST, 2 * NGHOST};
+            const int3 m2 = m1 + (int3){nn.x, NGHOST, nn.z - 2 * NGHOST};
+            acKernelIntegrateSubstep(device->streams[STREAM_2], isubstep, m1, m2, device->vba);
+        }
+        { // Top
+            const int3 m1 = (int3){NGHOST, nn.y, 2 * NGHOST};
+            const int3 m2 = m1 + (int3){nn.x, NGHOST, nn.z - 2 * NGHOST};
+            acKernelIntegrateSubstep(device->streams[STREAM_3], isubstep, m1, m2, device->vba);
+        }
+        { // Left
+            const int3 m1 = (int3){NGHOST, 2 * NGHOST, 2 * NGHOST};
+            const int3 m2 = m1 + (int3){NGHOST, nn.y - 2 * NGHOST, nn.z - 2 * NGHOST};
+            acKernelIntegrateSubstep(device->streams[STREAM_4], isubstep, m1, m2, device->vba);
+        }
+        { // Right
+            const int3 m1 = (int3){nn.x, 2 * NGHOST, 2 * NGHOST};
+            const int3 m2 = m1 + (int3){NGHOST, nn.y - 2 * NGHOST, nn.z - 2 * NGHOST};
+            acKernelIntegrateSubstep(device->streams[STREAM_5], isubstep, m1, m2, device->vba);
+        }
+#endif // MPI_COMPUTE_ENABLED
+        acDeviceSwapBuffers(device);
+        acDeviceSynchronizeStream(device, STREAM_ALL); // Wait until inner and outer done
+        ////////////////////////////////////////////
+    }
+
+    return AC_SUCCESS;
+}
+*/
+
 AcResult
 acGridPeriodicBoundconds(const Stream stream)
 {
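Like acGridIntegrate, the saved routine overlaps halo exchange with interior work. Stripped of the per-region boilerplate, one substep reduces to the following control flow (integrate_inner and integrate_outer_slabs are shorthand for the blocks above, not real functions):

    acPackCommData(device, side_b0s, &side_data);     // stage halo zones
    acTransferCommData(device, side_b0s, &side_data); // start async exchange
    integrate_inner(device, isubstep);                // needs no halo data
    acTransferCommDataWait(side_data);                // halos have arrived
    acUnpackCommData(device, side_b0s, &side_data);
    acGridGeneralBoundconds(device, stream);          // physical boundaries
    integrate_outer_slabs(device, isubstep);          // six halo-adjacent slabs
    acDeviceSwapBuffers(device);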
@@ -1,5 +1,133 @@
 #pragma once
 
+static __global__ void
+kernel_symmetric_boundconds(const int3 start, const int3 end, AcReal* vtxbuf, const int3 bindex,
+                            const int sign)
+{
+    const int i_dst = start.x + threadIdx.x + blockIdx.x * blockDim.x;
+    const int j_dst = start.y + threadIdx.y + blockIdx.y * blockDim.y;
+    const int k_dst = start.z + threadIdx.z + blockIdx.z * blockDim.z;
+
+    // If within the start-end range (this allows threadblock dims that are not
+    // divisible by end - start)
+    if (i_dst >= end.x || j_dst >= end.y || k_dst >= end.z)
+        return;
+
+    // If the destination index is inside the computational domain, return, since
+    // the boundary conditions are only applied to the ghost zones
+    if (i_dst >= DCONST(AC_nx_min) && i_dst < DCONST(AC_nx_max) && j_dst >= DCONST(AC_ny_min) &&
+        j_dst < DCONST(AC_ny_max) && k_dst >= DCONST(AC_nz_min) && k_dst < DCONST(AC_nz_max))
+        return;
+
+    // Find the source index
+    // Map to nx, ny, nz coordinates
+    int i_src, j_src, k_src, boundlocx0, boundlocx1, boundlocy0, boundlocy1, boundlocz0, boundlocz1;
+    const int bsize = STENCIL_ORDER / 2;
+
+    //if (bindex.x != 0)
+    //if (bindex.y != 0)
+    //if (bindex.z != 0)
+
+    // Location of the central border point.
+    boundlocx0 = bsize;
+    boundlocy0 = bsize;
+    boundlocz0 = bsize;
+    boundlocx1 = DCONST(AC_nx_max) - 1;
+    boundlocy1 = DCONST(AC_ny_max) - 1;
+    boundlocz1 = DCONST(AC_nz_max) - 1;
+
+    // Defaults
+    i_src = i_dst;
+    j_src = j_dst;
+    k_src = k_dst;
+
+    if (bindex.x < 0)
+    {
+        // Pick up the mirroring value.
+        if (i_dst < boundlocx0)
+        {
+            i_src = 2 * boundlocx0 - i_dst;
+        } else if (i_dst > boundlocx1)
+        {
+            i_src = 2 * boundlocx1 - i_dst;
+        }
+
+        // Pick up the mirroring value.
+        if (j_dst < boundlocy0)
+        {
+            j_src = 2 * boundlocy0 - j_dst;
+        } else if (j_dst > boundlocy1)
+        {
+            j_src = 2 * boundlocy1 - j_dst;
+        }
+
+        // Pick up the mirroring value.
+        if (k_dst < boundlocz0)
+        {
+            k_src = 2 * boundlocz0 - k_dst;
+        } else if (k_dst > boundlocz1)
+        {
+            k_src = 2 * boundlocz1 - k_dst;
+        }
+
+        // Edges
+        if ((i_dst < boundlocx0) && (j_dst < boundlocy0))
+        {
+            i_src = 2 * boundlocx0 - i_dst;
+            j_src = 2 * boundlocy0 - j_dst;
+        } else if ((i_dst < boundlocx0) && (k_dst < boundlocz0))
+        {
+            i_src = 2 * boundlocx0 - i_dst;
+            k_src = 2 * boundlocz0 - k_dst;
+        } else if ((j_dst < boundlocy0) && (k_dst < boundlocz0))
+        {
+            j_src = 2 * boundlocy0 - j_dst;
+            k_src = 2 * boundlocz0 - k_dst;
+
+        } else if ((i_dst > boundlocx1) && (j_dst > boundlocy1))
+        {
+            i_src = 2 * boundlocx1 - i_dst;
+            j_src = 2 * boundlocy1 - j_dst;
+        } else if ((i_dst > boundlocx1) && (k_dst > boundlocz1))
+        {
+            i_src = 2 * boundlocx1 - i_dst;
+            k_src = 2 * boundlocz1 - k_dst;
+        } else if ((j_dst > boundlocy1) && (k_dst > boundlocz1))
+        {
+            j_src = 2 * boundlocy1 - j_dst;
+            k_src = 2 * boundlocz1 - k_dst;
+        } else if ((i_dst > boundlocx1) && (k_dst < boundlocz0))
+        {
+            i_src = 2 * boundlocx1 - i_dst;
+            k_src = 2 * boundlocz0 - k_dst;
+        } else if ((i_dst > boundlocx1) && (j_dst < boundlocy0))
+        {
+            i_src = 2 * boundlocx1 - i_dst;
+            j_src = 2 * boundlocy0 - j_dst;
+        } else if ((i_dst < boundlocx0) && (k_dst > boundlocz1))
+        {
+            i_src = 2 * boundlocx0 - i_dst;
+            k_src = 2 * boundlocz1 - k_dst;
+        } else if ((i_dst < boundlocx0) && (j_dst > boundlocy1))
+        {
+            i_src = 2 * boundlocx0 - i_dst;
+            j_src = 2 * boundlocy1 - j_dst;
+        } else if ((j_dst > boundlocy1) && (k_dst < boundlocz0))
+        {
+            j_src = 2 * boundlocy1 - j_dst;
+            k_src = 2 * boundlocz0 - k_dst;
+        }
+    }
+
+    const int src_idx = DEVICE_VTXBUF_IDX(i_src, j_src, k_src);
+    const int dst_idx = DEVICE_VTXBUF_IDX(i_dst, j_dst, k_dst);
+    vtxbuf[dst_idx] = sign * vtxbuf[src_idx]; // sign = 1 symmetric, sign = -1 antisymmetric
+}
+
+
 static __global__ void
 kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vtxbuf)
 {
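The kernel's mirroring rule is plain reflection about the boundary cell, i_src = 2*boundloc - i_dst. A worked check for the bottom x face, assuming STENCIL_ORDER = 6 (three ghost cells, boundary cell at index 3):

    // i_dst = 2  ->  i_src = 2*3 - 2 = 4
    // i_dst = 1  ->  i_src = 2*3 - 1 = 5
    // i_dst = 0  ->  i_src = 2*3 - 0 = 6
    // sign = +1 copies the mirrored value (symmetric, zero normal gradient);
    // sign = -1 negates it (antisymmetric, zero value at the boundary cell).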
@@ -60,3 +188,55 @@ acKernelPeriodicBoundconds(const cudaStream_t stream, const int3 start, const in
     ERRCHK_CUDA_KERNEL();
     return AC_SUCCESS;
 }
+
+AcResult
+acKernelGeneralBoundconds(const cudaStream_t stream, const int3 start, const int3 end,
+                          AcReal* vtxbuf, const VertexBufferHandle vtxbuf_handle,
+                          const AcMeshInfo config, const int3 bindex)
+{
+    const dim3 tpb(8, 2, 8);
+    const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x),
+                   (unsigned int)ceil((end.y - start.y) / (float)tpb.y),
+                   (unsigned int)ceil((end.z - start.z) / (float)tpb.z));
+
+    int3 bc_top = {config.int_params[AC_bc_type_top_x], config.int_params[AC_bc_type_top_y],
+                   config.int_params[AC_bc_type_top_z]};
+    int3 bc_bot = {config.int_params[AC_bc_type_bot_x], config.int_params[AC_bc_type_bot_y],
+                   config.int_params[AC_bc_type_bot_z]};
+
+    //#if AC_MPI_ENABLED
+    //    printf("WARNING: NON-PERIODIC BOUNDARY CONDITIONS NOT SUPPORTED BY MPI! Only working at node level.\n");
+    //    return AC_FAILURE;
+    //#endif
+
+    if (vtxbuf_handle != -1) // Dummy check; makes switching boundary conditions per buffer possible later
+    {
+        if (bc_top.x == AC_BOUNDCOND_SYMMETRIC)
+        {
+            kernel_symmetric_boundconds<<<bpg, tpb, 0, stream>>>(start, end, vtxbuf, bindex, 1);
+            ERRCHK_CUDA_KERNEL();
+        }
+        else if (bc_top.x == AC_BOUNDCOND_ANTISYMMETRIC)
+        {
+            kernel_symmetric_boundconds<<<bpg, tpb, 0, stream>>>(start, end, vtxbuf, bindex, -1);
+            ERRCHK_CUDA_KERNEL();
+        }
+        else if (bc_top.x == AC_BOUNDCOND_PERIODIC)
+        {
+            kernel_periodic_boundconds<<<bpg, tpb, 0, stream>>>(start, end, vtxbuf);
+            ERRCHK_CUDA_KERNEL();
+        }
+        else
+        {
+            printf("ERROR: Boundary condition not recognized!\n");
+            printf("ERROR: bc_top = %i, %i, %i \n", bc_top.x, bc_top.y, bc_top.z);
+            printf("ERROR: bc_bot = %i, %i, %i \n", bc_bot.x, bc_bot.y, bc_bot.z);
+
+            return AC_FAILURE;
+        }
+    }
+
+    return AC_SUCCESS;
+}
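The rounded-up launch geometry covers extents that are not multiples of the block size; the out-of-range threads exit at the kernel's i_dst >= end.x guard. Illustrative numbers (not from the source):

    // end - start = (34, 6, 34) with tpb = (8, 2, 8) gives
    // bpg = (ceil(34/8), ceil(6/2), ceil(34/8)) = (5, 3, 5);
    // 5 blocks * 8 threads = 40 >= 34 in x, so the last 6 threads return early.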
@@ -45,6 +45,11 @@ extern "C" {
 /** */
 AcResult acKernelPeriodicBoundconds(const cudaStream_t stream, const int3 start, const int3 end,
                                     AcReal* vtxbuf);
+/** */
+AcResult acKernelGeneralBoundconds(const cudaStream_t stream, const int3 start, const int3 end,
+                                   AcReal* vtxbuf, const VertexBufferHandle vtxbuf_handle,
+                                   const AcMeshInfo config, const int3 bindex);
+
+
 /** */
 AcResult acKernelDummy(void);
src/core/node.cc
@@ -656,6 +656,30 @@ local_boundcondstep(const Node node, const Stream stream, const VertexBufferHand
     return AC_SUCCESS;
 }
 
+static AcResult
+local_boundcondstep_GBC(const Node node, const Stream stream, const VertexBufferHandle vtxbuf,
+                        const AcMeshInfo config)
+{
+    acNodeSynchronizeStream(node, stream);
+
+    int3 bindex = {-1, -1, -1}; // Dummy for the node level; relevant only for MPI.
+
+    if (node->num_devices > 1) {
+        // Local boundary conditions
+        // #pragma omp parallel for
+        for (int i = 0; i < node->num_devices; ++i) {
+            const int3 d0 = (int3){0, 0, NGHOST}; // DECOMPOSITION OFFSET HERE
+            const int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y, d0.z + node->subgrid.n.z};
+            acDeviceGeneralBoundcondStep(node->devices[i], stream, vtxbuf, d0, d1, config, bindex);
+        }
+    }
+    else {
+        acDeviceGeneralBoundcondStep(node->devices[0], stream, vtxbuf, (int3){0, 0, 0},
+                                     node->subgrid.m, config, bindex);
+    }
+    return AC_SUCCESS;
+}
+
+
 static AcResult
 global_boundcondstep(const Node node, const Stream stream, const VertexBufferHandle vtxbuf_handle)
 {
@@ -768,6 +792,85 @@ acNodeIntegrate(const Node node, const AcReal dt)
     return AC_SUCCESS;
 }
 
+AcResult
+acNodeIntegrateGBC(const Node node, const AcMeshInfo config, const AcReal dt)
+{
+    acNodeSynchronizeStream(node, STREAM_ALL);
+    // xxx|OOO OOOOOOOOO OOO|xxx
+    //    ^   ^         ^   ^
+    //    n0  n1        n2  n3
+    // const int3 n0 = (int3){NGHOST, NGHOST, NGHOST};
+    // const int3 n1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
+    // const int3 n2 = node->grid.n;
+    // const int3 n3 = n0 + node->grid.n;
+
+    for (int isubstep = 0; isubstep < 3; ++isubstep) {
+        acNodeSynchronizeStream(node, STREAM_ALL);
+        for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {
+            local_boundcondstep_GBC(node, (Stream)vtxbuf, (VertexBufferHandle)vtxbuf, config);
+        }
+        acNodeSynchronizeStream(node, STREAM_ALL);
+
+        // Inner inner
+        // #pragma omp parallel for
+        for (int i = 0; i < node->num_devices; ++i) {
+            const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
+            const int3 m2 = node->subgrid.n;
+            acDeviceIntegrateSubstep(node->devices[i], STREAM_16, isubstep, m1, m2, dt);
+        }
+
+        for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {
+            acNodeSynchronizeVertexBuffer(node, (Stream)vtxbuf, (VertexBufferHandle)vtxbuf);
+            global_boundcondstep(node, (Stream)vtxbuf, (VertexBufferHandle)vtxbuf);
+        }
+        for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {
+            acNodeSynchronizeStream(node, (Stream)vtxbuf);
+        }
+
+        // #pragma omp parallel for
+        for (int i = 0; i < node->num_devices; ++i) { // Front
+            const int3 m1 = (int3){NGHOST, NGHOST, NGHOST};
+            const int3 m2 = m1 + (int3){node->subgrid.n.x, node->subgrid.n.y, NGHOST};
+            acDeviceIntegrateSubstep(node->devices[i], STREAM_0, isubstep, m1, m2, dt);
+        }
+        // #pragma omp parallel for
+        for (int i = 0; i < node->num_devices; ++i) { // Back
+            const int3 m1 = (int3){NGHOST, NGHOST, node->subgrid.n.z};
+            const int3 m2 = m1 + (int3){node->subgrid.n.x, node->subgrid.n.y, NGHOST};
+            acDeviceIntegrateSubstep(node->devices[i], STREAM_1, isubstep, m1, m2, dt);
+        }
+        // #pragma omp parallel for
+        for (int i = 0; i < node->num_devices; ++i) { // Bottom
+            const int3 m1 = (int3){NGHOST, NGHOST, 2 * NGHOST};
+            const int3 m2 = m1 + (int3){node->subgrid.n.x, NGHOST, node->subgrid.n.z - 2 * NGHOST};
+            acDeviceIntegrateSubstep(node->devices[i], STREAM_2, isubstep, m1, m2, dt);
+        }
+        // #pragma omp parallel for
+        for (int i = 0; i < node->num_devices; ++i) { // Top
+            const int3 m1 = (int3){NGHOST, node->subgrid.n.y, 2 * NGHOST};
+            const int3 m2 = m1 + (int3){node->subgrid.n.x, NGHOST, node->subgrid.n.z - 2 * NGHOST};
+            acDeviceIntegrateSubstep(node->devices[i], STREAM_3, isubstep, m1, m2, dt);
+        }
+        // #pragma omp parallel for
+        for (int i = 0; i < node->num_devices; ++i) { // Left
+            const int3 m1 = (int3){NGHOST, 2 * NGHOST, 2 * NGHOST};
+            const int3 m2 = m1 + (int3){NGHOST, node->subgrid.n.y - 2 * NGHOST,
+                                        node->subgrid.n.z - 2 * NGHOST};
+            acDeviceIntegrateSubstep(node->devices[i], STREAM_4, isubstep, m1, m2, dt);
+        }
+        // #pragma omp parallel for
+        for (int i = 0; i < node->num_devices; ++i) { // Right
+            const int3 m1 = (int3){node->subgrid.n.x, 2 * NGHOST, 2 * NGHOST};
+            const int3 m2 = m1 + (int3){NGHOST, node->subgrid.n.y - 2 * NGHOST,
+                                        node->subgrid.n.z - 2 * NGHOST};
+            acDeviceIntegrateSubstep(node->devices[i], STREAM_5, isubstep, m1, m2, dt);
+        }
+        acNodeSwapBuffers(node);
+    }
+    acNodeSynchronizeStream(node, STREAM_ALL);
+    return AC_SUCCESS;
+}
+
 AcResult
 acNodePeriodicBoundcondStep(const Node node, const Stream stream,
                             const VertexBufferHandle vtxbuf_handle)
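The inner block plus the six outer loops tile the whole computational domain [NGHOST, NGHOST + nn) exactly once, where nn = node->subgrid.n and NG = NGHOST. A sanity check of the bounds used above (not from the source):

    // z: front [NG, 2NG) + back [nn.z, nn.z + NG) + middle [2NG, nn.z)
    // y (middle z): bottom [NG, 2NG) + top [nn.y, nn.y + NG) + middle [2NG, nn.y)
    // x (middle y,z): left [NG, 2NG) + right [nn.x, nn.x + NG) + inner [2NG, nn.x)
    // => every interior cell is integrated exactly once per substep.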
@@ -783,7 +886,20 @@ acNodePeriodicBoundcondStep(const Node node, const Stream stream,
 }
 
 AcResult
-acNodePeriodicBoundconds(const Node node, const Stream stream)
+acNodeGeneralBoundcondStep(const Node node, const Stream stream,
+                           const VertexBufferHandle vtxbuf_handle, const AcMeshInfo config)
+{
+    local_boundcondstep_GBC(node, stream, vtxbuf_handle, config);
+    acNodeSynchronizeVertexBuffer(node, stream, vtxbuf_handle);
+
+    global_boundcondstep(node, stream, vtxbuf_handle);
+
+
+    return AC_SUCCESS;
+}
+
+AcResult
+acNodePeriodicBoundconds(const Node node, const Stream stream)
 {
     for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
         acNodePeriodicBoundcondStep(node, stream, (VertexBufferHandle)i);
@@ -791,6 +907,15 @@ acNodePeriodicBoundconds(const Node node, const Stream stream)
     return AC_SUCCESS;
 }
 
+AcResult
+acNodeGeneralBoundconds(const Node node, const Stream stream, const AcMeshInfo config)
+{
+    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
+        acNodeGeneralBoundcondStep(node, stream, (VertexBufferHandle)i, config);
+    }
+    return AC_SUCCESS;
+}
+
 static AcReal
 simple_final_reduce_scal(const Node node, const ReductionType& rtype, const AcReal* results,
                          const int& n)