diff --git a/acc/mhd_solver/stencil_kernel.ac b/acc/mhd_solver/stencil_kernel.ac index db83a3f..4b0f9bc 100644 --- a/acc/mhd_solver/stencil_kernel.ac +++ b/acc/mhd_solver/stencil_kernel.ac @@ -20,9 +20,15 @@ uniform int AC_max_steps; uniform int AC_save_steps; uniform int AC_bin_steps; -uniform int AC_bc_type; uniform int AC_start_step; +uniform int AC_bc_type_top_x; +uniform int AC_bc_type_bot_x; +uniform int AC_bc_type_top_y; +uniform int AC_bc_type_bot_y; +uniform int AC_bc_type_top_z; +uniform int AC_bc_type_bot_z; + // Real params uniform Scalar AC_dt; uniform Scalar AC_max_time; diff --git a/config/astaroth.conf b/config/astaroth.conf index 190948b..480bcdc 100644 --- a/config/astaroth.conf +++ b/config/astaroth.conf @@ -13,6 +13,15 @@ AC_dsx = 0.04908738521 AC_dsy = 0.04908738521 AC_dsz = 0.04908738521 +// 0 = periodic bc, 1 = symmetric bc, 2 = antisymmetric bc +AC_bc_type_top_x = 0 +AC_bc_type_top_y = 0 +AC_bc_type_top_z = 0 +AC_bc_type_bot_x = 0 +AC_bc_type_bot_y = 0 +AC_bc_type_bot_z = 0 + + /* * ============================================================================= * Run-time params diff --git a/include/astaroth.h b/include/astaroth.h index 21e45b9..07b4d61 100644 --- a/include/astaroth.h +++ b/include/astaroth.h @@ -49,6 +49,11 @@ typedef struct { typedef enum { AC_SUCCESS = 0, AC_FAILURE = 1 } AcResult; +// Neming the associated number of the boundary condition types +typedef enum { AC_BOUNDCOND_PERIODIC = 0, + AC_BOUNDCOND_SYMMETRIC = 1, + AC_BOUNDCOND_ANTISYMMETRIC = 2 } AcBoundcond; + #define AC_GEN_ID(X) X, typedef enum { AC_FOR_RTYPES(AC_GEN_ID) // @@ -227,10 +232,19 @@ AcResult acStore(AcMesh* host_mesh); * substep and the user is responsible for calling acBoundcondStep before reading the data. */ AcResult acIntegrate(const AcReal dt); +/** Performs Runge-Kutta 3 integration. Note: Boundary conditions are not applied after the final + * substep and the user is responsible for calling acBoundcondStep before reading the data. + * Has customizable boundary conditions. */ +AcResult acIntegrateGBC(const AcMeshInfo config, const AcReal dt); + /** Applies periodic boundary conditions for the Mesh distributed among the devices visible to * the caller*/ AcResult acBoundcondStep(void); +/** Applies general outer boundary conditions for the Mesh distributed among the devices visible to + * the caller*/ +AcResult acBoundcondStepGBC(const AcMeshInfo config); + /** Does a scalar reduction with the data stored in some vertex buffer */ AcReal acReduceScal(const ReductionType rtype, const VertexBufferHandle vtxbuf_handle); @@ -306,9 +320,21 @@ AcResult acGridStoreMesh(const Stream stream, AcMesh* host_mesh); /** */ AcResult acGridIntegrate(const Stream stream, const AcReal dt); +/** */ +/* MV: Commented out for a while, but save for the future when standalone_MPI + works with periodic boundary conditions. 
+AcResult +acGridIntegrateNonperiodic(const Stream stream, const AcReal dt) + +AcResult acGridIntegrateNonperiodic(const Stream stream, const AcReal dt); +*/ + /** */ AcResult acGridPeriodicBoundconds(const Stream stream); +/** */ +AcResult acGridGeneralBoundconds(const Device device, const Stream stream); + /** TODO */ AcResult acGridReduceScal(const Stream stream, const ReductionType rtype, const VertexBufferHandle vtxbuf_handle, AcReal* result); @@ -430,6 +456,9 @@ AcResult acNodeIntegrateSubstep(const Node node, const Stream stream, const int /** */ AcResult acNodeIntegrate(const Node node, const AcReal dt); +/** */ +AcResult acNodeIntegrateGBC(const Node node, const AcMeshInfo config, const AcReal dt); + /** */ AcResult acNodePeriodicBoundcondStep(const Node node, const Stream stream, const VertexBufferHandle vtxbuf_handle); @@ -437,6 +466,13 @@ AcResult acNodePeriodicBoundcondStep(const Node node, const Stream stream, /** */ AcResult acNodePeriodicBoundconds(const Node node, const Stream stream); +/** */ +AcResult acNodeGeneralBoundcondStep(const Node node, const Stream stream, + const VertexBufferHandle vtxbuf_handle, const AcMeshInfo config); + +/** */ +AcResult acNodeGeneralBoundconds(const Node node, const Stream stream, const AcMeshInfo config); + /** */ AcResult acNodeReduceScal(const Node node, const Stream stream, const ReductionType rtype, const VertexBufferHandle vtxbuf_handle, AcReal* result); @@ -565,6 +601,16 @@ AcResult acDevicePeriodicBoundcondStep(const Device device, const Stream stream, AcResult acDevicePeriodicBoundconds(const Device device, const Stream stream, const int3 start, const int3 end); +/** */ +AcResult acDeviceGeneralBoundcondStep(const Device device, const Stream stream, + const VertexBufferHandle vtxbuf_handle, const int3 start, + const int3 end, const AcMeshInfo config, const int3 bindex); + +/** */ +AcResult acDeviceGeneralBoundconds(const Device device, const Stream stream, const int3 start, + const int3 end, const AcMeshInfo config, const int3 bindex); + + /** */ AcResult acDeviceReduceScal(const Device device, const Stream stream, const ReductionType rtype, const VertexBufferHandle vtxbuf_handle, AcReal* result); diff --git a/samples/standalone/simulation.cc b/samples/standalone/simulation.cc index 4e0e176..8fd0d36 100644 --- a/samples/standalone/simulation.cc +++ b/samples/standalone/simulation.cc @@ -374,7 +374,8 @@ run_simulation(const char* config_path) #endif } - acBoundcondStep(); + //acBoundcondStep(); + acBoundcondStepGBC(mesh_info); acStore(mesh); if (start_step == 0) { save_mesh(*mesh, 0, t_step); @@ -440,7 +441,11 @@ run_simulation(const char* config_path) loadForcingParamsToDevice(forcing_params); #endif - acIntegrate(dt); + /* Uses now flexible bokundary conditions */ + //acIntegrate(dt); + acIntegrateGBC(mesh_info, dt); + + t_step += dt; @@ -486,7 +491,8 @@ run_simulation(const char* config_path) acBoundcondStep(); acStore(mesh); */ - acBoundcondStep(); + //acBoundcondStep(); + acBoundcondStepGBC(mesh_info); acStore(mesh); save_mesh(*mesh, i, t_step); @@ -507,7 +513,8 @@ run_simulation(const char* config_path) if (dt < dt_typical/AcReal(1e5)) { if (dtcounter > 10) { printf("dt = %e TOO LOW! 
Ending run at t = %#e \n", double(dt), double(t_step)); - acBoundcondStep(); + //acBoundcondStep(); + acBoundcondStepGBC(mesh_info); acStore(mesh); save_mesh(*mesh, i, t_step); break; @@ -521,7 +528,8 @@ run_simulation(const char* config_path) // End loop if nan is found if (found_nan > 0) { printf("Found nan at t = %e \n", double(t_step)); - acBoundcondStep(); + //acBoundcondStep(); + acBoundcondStepGBC(mesh_info); acStore(mesh); save_mesh(*mesh, i, t_step); break; @@ -536,7 +544,8 @@ run_simulation(const char* config_path) if (found_stop == 1) { printf("Found STOP file at t = %e \n", double(t_step)); - acBoundcondStep(); + //acBoundcondStep(); + acBoundcondStepGBC(mesh_info); acStore(mesh); save_mesh(*mesh, i, t_step); break; diff --git a/src/core/astaroth.cc b/src/core/astaroth.cc index c6d9a19..2123224 100644 --- a/src/core/astaroth.cc +++ b/src/core/astaroth.cc @@ -86,6 +86,13 @@ acIntegrate(const AcReal dt) return acNodeIntegrate(nodes[0], dt); } +AcResult +acIntegrateGBC(const AcMeshInfo config, const AcReal dt) +{ + return acNodeIntegrateGBC(nodes[0], config, dt); +} + + AcResult acIntegrateStep(const int isubstep, const AcReal dt) { @@ -109,6 +116,12 @@ acBoundcondStep(void) return acNodePeriodicBoundconds(nodes[0], STREAM_DEFAULT); } +AcResult +acBoundcondStepGBC(const AcMeshInfo config) +{ + return acNodeGeneralBoundconds(nodes[0], STREAM_DEFAULT, config); +} + AcReal acReduceScal(const ReductionType rtype, const VertexBufferHandle vtxbuf_handle) { diff --git a/src/core/device.cc b/src/core/device.cc index 24d6e38..c1bb43f 100644 --- a/src/core/device.cc +++ b/src/core/device.cc @@ -433,6 +433,27 @@ acDevicePeriodicBoundconds(const Device device, const Stream stream, const int3 return AC_SUCCESS; } +AcResult +acDeviceGeneralBoundcondStep(const Device device, const Stream stream, + const VertexBufferHandle vtxbuf_handle, const int3 start, + const int3 end, const AcMeshInfo config, const int3 bindex) +{ + cudaSetDevice(device->id); + return acKernelGeneralBoundconds(device->streams[stream], start, end, + device->vba.in[vtxbuf_handle], vtxbuf_handle, config, bindex); +} + +AcResult +acDeviceGeneralBoundconds(const Device device, const Stream stream, const int3 start, + const int3 end, const AcMeshInfo config, const int3 bindex) +{ + for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { + acDeviceGeneralBoundcondStep(device, stream, (VertexBufferHandle)i, start, end, config, bindex); + } + return AC_SUCCESS; +} + + AcResult acDeviceReduceScal(const Device device, const Stream stream, const ReductionType rtype, const VertexBufferHandle vtxbuf_handle, AcReal* result) @@ -1433,6 +1454,50 @@ acGridStoreMesh(const Stream stream, AcMesh* host_mesh) return AC_SUCCESS; } +/* MV: Commented out for a while, but save for the future when standalone_MPI + works with periodic boundary conditions. +AcResult +acGridGeneralBoundconds(const Device device, const Stream stream) +{ + // Non-periodic Boundary conditions + // Check the position in MPI frame + int nprocs, pid; + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + MPI_Comm_rank(MPI_COMM_WORLD, &pid); + const uint3_64 decomposition = decompose(nprocs); + const int3 pid3d = getPid3D(pid, decomposition); + + // Set outer boudaries after substep computation. + const int3 m1 = (int3){0, 0, 0}; + const int3 m2 = grid.nn; + const int3 pid3d = getPid3D(pid, decomposition); + // If we are are a boundary element + int3 bindex = (int3){0, 0, 0}; + + // Check if there are active boundary condition edges. 
+ // 0 is no boundary, 1 both edges, 2 is top edge, 3 bottom edge + if ((pid3d.x == 0) && (pid3d.x == decomposition.x - 1)) { bindex.x = 1; } + else if (pid3d.x == 0) { bindex.x = 2; } + else if (pid3d.x == decomposition.x - 1) { bindex.x = 3; } + + if ((pid3d.y == 0) && (pid3d.y == decomposition.y - 1)) { bindex.y = 1; } + else if (pid3d.y == 0) { bindex.y = 2; } + else if (pid3d.y == decomposition.y - 1) { bindex.y = 3; } + + if ((pid3d.z == 0) && (pid3d.z == decomposition.z - 1)) { bindex.z = 1; } + else if (pid3d.z == 0) { bindex.z = 2; } + else if (pid3d.z == decomposition.z - 1) { bindex.z = 3; } + + + if (bindex.x != 1) && (bindex.y != 1) && (bindex.z != 1) { + acDeviceGeneralBoundconds(device, stream, m1, m2, bindex); + } + acGridSynchronizeStream(stream); + + return AC_SUCCESS; +} +*/ + /* // Unused AcResult @@ -1864,6 +1929,234 @@ acGridIntegrate(const Stream stream, const AcReal dt) return AC_SUCCESS; } +/* MV: Commented out for a while, but save for the future when standalone_MPI + works with periodic boundary conditions. +AcResult +acGridIntegrateNonperiodic(const Stream stream, const AcReal dt) +{ + ERRCHK(grid.initialized); + acGridSynchronizeStream(stream); + + const Device device = grid.device; + const int3 nn = grid.nn; +#if MPI_INCL_CORNERS + CommData corner_data = grid.corner_data; // Do not rm: required for corners +#endif // MPI_INCL_CORNERS + CommData edgex_data = grid.edgex_data; + CommData edgey_data = grid.edgey_data; + CommData edgez_data = grid.edgez_data; + CommData sidexy_data = grid.sidexy_data; + CommData sidexz_data = grid.sidexz_data; + CommData sideyz_data = grid.sideyz_data; + + acGridLoadScalarUniform(stream, AC_dt, dt); + acDeviceSynchronizeStream(device, stream); + + +// Corners +#if MPI_INCL_CORNERS + // Do not rm: required for corners + const int3 corner_b0s[] = { + (int3){0, 0, 0}, + (int3){NGHOST + nn.x, 0, 0}, + (int3){0, NGHOST + nn.y, 0}, + (int3){0, 0, NGHOST + nn.z}, + + (int3){NGHOST + nn.x, NGHOST + nn.y, 0}, + (int3){NGHOST + nn.x, 0, NGHOST + nn.z}, + (int3){0, NGHOST + nn.y, NGHOST + nn.z}, + (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST + nn.z}, + }; +#endif // MPI_INCL_CORNERS + + // Edges X + const int3 edgex_b0s[] = { + (int3){NGHOST, 0, 0}, + (int3){NGHOST, NGHOST + nn.y, 0}, + + (int3){NGHOST, 0, NGHOST + nn.z}, + (int3){NGHOST, NGHOST + nn.y, NGHOST + nn.z}, + }; + + // Edges Y + const int3 edgey_b0s[] = { + (int3){0, NGHOST, 0}, + (int3){NGHOST + nn.x, NGHOST, 0}, + + (int3){0, NGHOST, NGHOST + nn.z}, + (int3){NGHOST + nn.x, NGHOST, NGHOST + nn.z}, + }; + + // Edges Z + const int3 edgez_b0s[] = { + (int3){0, 0, NGHOST}, + (int3){NGHOST + nn.x, 0, NGHOST}, + + (int3){0, NGHOST + nn.y, NGHOST}, + (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST}, + }; + + // Sides XY + const int3 sidexy_b0s[] = { + (int3){NGHOST, NGHOST, 0}, // + (int3){NGHOST, NGHOST, NGHOST + nn.z}, // + }; + + // Sides XZ + const int3 sidexz_b0s[] = { + (int3){NGHOST, 0, NGHOST}, // + (int3){NGHOST, NGHOST + nn.y, NGHOST}, // + }; + + // Sides YZ + const int3 sideyz_b0s[] = { + (int3){0, NGHOST, NGHOST}, // + (int3){NGHOST + nn.x, NGHOST, NGHOST}, // + }; + + for (int isubstep = 0; isubstep < 3; ++isubstep) { + +#if MPI_COMM_ENABLED +#if MPI_INCL_CORNERS + acPackCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners +#endif // MPI_INCL_CORNERS + acPackCommData(device, edgex_b0s, &edgex_data); + acPackCommData(device, edgey_b0s, &edgey_data); + acPackCommData(device, edgez_b0s, &edgez_data); + acPackCommData(device, sidexy_b0s, &sidexy_data); + 
acPackCommData(device, sidexz_b0s, &sidexz_data); + acPackCommData(device, sideyz_b0s, &sideyz_data); +#endif + +#if MPI_COMM_ENABLED + MPI_Barrier(MPI_COMM_WORLD); + +#if MPI_GPUDIRECT_DISABLED +#if MPI_INCL_CORNERS + acTransferCommDataToHost(device, &corner_data); // Do not rm: required for corners +#endif // MPI_INCL_CORNERS + acTransferCommDataToHost(device, &edgex_data); + acTransferCommDataToHost(device, &edgey_data); + acTransferCommDataToHost(device, &edgez_data); + acTransferCommDataToHost(device, &sidexy_data); + acTransferCommDataToHost(device, &sidexz_data); + acTransferCommDataToHost(device, &sideyz_data); +#endif +#if MPI_INCL_CORNERS + acTransferCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners +#endif // MPI_INCL_CORNERS + acTransferCommData(device, edgex_b0s, &edgex_data); + acTransferCommData(device, edgey_b0s, &edgey_data); + acTransferCommData(device, edgez_b0s, &edgez_data); + acTransferCommData(device, sidexy_b0s, &sidexy_data); + acTransferCommData(device, sidexz_b0s, &sidexz_data); + acTransferCommData(device, sideyz_b0s, &sideyz_data); +#endif // MPI_COMM_ENABLED + +#if MPI_COMPUTE_ENABLED + //////////// INNER INTEGRATION ////////////// + { + const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST}; + const int3 m2 = nn; + acKernelIntegrateSubstep(device->streams[STREAM_16], isubstep, m1, m2, device->vba); + } +//////////////////////////////////////////// +#endif // MPI_COMPUTE_ENABLED + +#if MPI_COMM_ENABLED +#if MPI_INCL_CORNERS + acTransferCommDataWait(corner_data); // Do not rm: required for corners +#endif // MPI_INCL_CORNERS + acTransferCommDataWait(edgex_data); + acTransferCommDataWait(edgey_data); + acTransferCommDataWait(edgez_data); + acTransferCommDataWait(sidexy_data); + acTransferCommDataWait(sidexz_data); + acTransferCommDataWait(sideyz_data); + +#if MPI_INCL_CORNERS + acUnpinCommData(device, &corner_data); // Do not rm: required for corners +#endif // MPI_INCL_CORNERS + acUnpinCommData(device, &edgex_data); + acUnpinCommData(device, &edgey_data); + acUnpinCommData(device, &edgez_data); + acUnpinCommData(device, &sidexy_data); + acUnpinCommData(device, &sidexz_data); + acUnpinCommData(device, &sideyz_data); + +#if MPI_INCL_CORNERS + acUnpackCommData(device, corner_b0s, &corner_data); +#endif // MPI_INCL_CORNERS + acUnpackCommData(device, edgex_b0s, &edgex_data); + acUnpackCommData(device, edgey_b0s, &edgey_data); + acUnpackCommData(device, edgez_b0s, &edgez_data); + acUnpackCommData(device, sidexy_b0s, &sidexy_data); + acUnpackCommData(device, sidexz_b0s, &sidexz_data); + acUnpackCommData(device, sideyz_b0s, &sideyz_data); +//////////// OUTER INTEGRATION ////////////// + +// Wait for unpacking +#if MPI_INCL_CORNERS + acSyncCommData(corner_data); // Do not rm: required for corners +#endif // MPI_INCL_CORNERS + acSyncCommData(edgex_data); + acSyncCommData(edgey_data); + acSyncCommData(edgez_data); + acSyncCommData(sidexy_data); + acSyncCommData(sidexz_data); + acSyncCommData(sideyz_data); +#endif // MPI_COMM_ENABLED + + // Invoke outer edge boundary conditions. 
+ acGridGeneralBoundconds(device, stream) + +#if MPI_COMPUTE_ENABLED + { // Front + const int3 m1 = (int3){NGHOST, NGHOST, NGHOST}; + const int3 m2 = m1 + (int3){nn.x, nn.y, NGHOST}; + acKernelIntegrateSubstep(device->streams[STREAM_0], isubstep, m1, m2, device->vba); + } + { // Back + const int3 m1 = (int3){NGHOST, NGHOST, nn.z}; + const int3 m2 = m1 + (int3){nn.x, nn.y, NGHOST}; + acKernelIntegrateSubstep(device->streams[STREAM_1], isubstep, m1, m2, device->vba); + } + { // Bottom + const int3 m1 = (int3){NGHOST, NGHOST, 2 * NGHOST}; + const int3 m2 = m1 + (int3){nn.x, NGHOST, nn.z - 2 * NGHOST}; + acKernelIntegrateSubstep(device->streams[STREAM_2], isubstep, m1, m2, device->vba); + } + { // Top + const int3 m1 = (int3){NGHOST, nn.y, 2 * NGHOST}; + const int3 m2 = m1 + (int3){nn.x, NGHOST, nn.z - 2 * NGHOST}; + acKernelIntegrateSubstep(device->streams[STREAM_3], isubstep, m1, m2, device->vba); + } + { // Left + const int3 m1 = (int3){NGHOST, 2 * NGHOST, 2 * NGHOST}; + const int3 m2 = m1 + (int3){NGHOST, nn.y - 2 * NGHOST, nn.z - 2 * NGHOST}; + acKernelIntegrateSubstep(device->streams[STREAM_4], isubstep, m1, m2, device->vba); + } + { // Right + const int3 m1 = (int3){nn.x, 2 * NGHOST, 2 * NGHOST}; + const int3 m2 = m1 + (int3){NGHOST, nn.y - 2 * NGHOST, nn.z - 2 * NGHOST}; + acKernelIntegrateSubstep(device->streams[STREAM_5], isubstep, m1, m2, device->vba); + } +#endif // MPI_COMPUTE_ENABLED + acDeviceSwapBuffers(device); + acDeviceSynchronizeStream(device, STREAM_ALL); // Wait until inner and outer done + //////////////////////////////////////////// + + } + + + + + return AC_SUCCESS; +} + +*/ + + AcResult acGridPeriodicBoundconds(const Stream stream) { diff --git a/src/core/kernels/boundconds.cuh b/src/core/kernels/boundconds.cuh index 6a31d58..f0518c0 100644 --- a/src/core/kernels/boundconds.cuh +++ b/src/core/kernels/boundconds.cuh @@ -1,5 +1,133 @@ #pragma once +static __global__ void +kernel_symmetric_boundconds(const int3 start, const int3 end, AcReal* vtxbuf, const int3 bindex, const int sign) +{ + const int i_dst = start.x + threadIdx.x + blockIdx.x * blockDim.x; + const int j_dst = start.y + threadIdx.y + blockIdx.y * blockDim.y; + const int k_dst = start.z + threadIdx.z + blockIdx.z * blockDim.z; + + // If within the start-end range (this allows threadblock dims that are not + // divisible by end - start) + if (i_dst >= end.x || j_dst >= end.y || k_dst >= end.z) + return; + + // If destination index is inside the computational domain, return since + // the boundary conditions are only applied to the ghost zones + if (i_dst >= DCONST(AC_nx_min) && i_dst < DCONST(AC_nx_max) && j_dst >= DCONST(AC_ny_min) && + j_dst < DCONST(AC_ny_max) && k_dst >= DCONST(AC_nz_min) && k_dst < DCONST(AC_nz_max)) + return; + + // Find the source index + // Map to nx, ny, nz coordinates + int i_src, j_src, k_src, boundlocx0, boundlocx1, boundlocy0, boundlocy1, boundlocz0, boundlocz1; + int bsize = STENCIL_ORDER/(int) 2; + + //if (bindex.x != 0) + //if (bindex.y != 0) + //if (bindex.z != 0) + + //Location of central border point. + boundlocx0 = bsize; + boundlocy0 = bsize; + boundlocz0 = bsize; + boundlocx1 = DCONST(AC_nx_max) - 1; + boundlocy1 = DCONST(AC_ny_max) - 1; + boundlocz1 = DCONST(AC_nz_max) - 1; + + //Defaults + i_src = i_dst; + j_src = j_dst; + k_src = k_dst; + + if (bindex.x < 0) + { + + // Pick up the mirroring value. 
+        if ((i_dst < boundlocx0))
+        {
+            i_src = 2 * boundlocx0 - i_dst;
+
+        } else if ((i_dst > boundlocx1))
+        {
+            i_src = 2 * boundlocx1 - i_dst;
+        }
+
+        // Pick up the mirroring value.
+        if ((j_dst < boundlocy0))
+        {
+            j_src = 2 * boundlocy0 - j_dst;
+        } else if ((j_dst > boundlocy1))
+        {
+            j_src = 2 * boundlocy1 - j_dst;
+        }
+
+        // Pick up the mirroring value.
+        if ((k_dst < boundlocz0))
+        {
+            k_src = 2 * boundlocz0 - k_dst;
+        } else if ((k_dst > boundlocz1))
+        {
+            k_src = 2 * boundlocz1 - k_dst;
+        }
+
+        // Edges
+        if ((i_dst < boundlocx0) && (j_dst < boundlocy0))
+        {
+            i_src = 2 * boundlocx0 - i_dst;
+            j_src = 2 * boundlocy0 - j_dst;
+            //if ((k_dst == 50)) printf("i_dst %i j_dst %i k_dst %i i_src %i j_src %i k_src %i bsize %i \n", i_dst, j_dst, k_dst, i_src, j_src, k_src, bsize);
+        } else if ((i_dst < boundlocx0) && (k_dst < boundlocz0))
+        {
+            i_src = 2 * boundlocx0 - i_dst;
+            k_src = 2 * boundlocz0 - k_dst;
+        } else if ((j_dst < boundlocy0) && (k_dst < boundlocz0))
+        {
+            j_src = 2 * boundlocy0 - j_dst;
+            k_src = 2 * boundlocz0 - k_dst;
+
+        } else if ((i_dst > boundlocx1) && (j_dst > boundlocy1))
+        {
+            i_src = 2 * boundlocx1 - i_dst;
+            j_src = 2 * boundlocy1 - j_dst;
+        } else if ((i_dst > boundlocx1) && (k_dst > boundlocz1))
+        {
+            i_src = 2 * boundlocx1 - i_dst;
+            k_src = 2 * boundlocz1 - k_dst;
+        } else if ((j_dst > boundlocy1) && (k_dst > boundlocz1))
+        {
+            j_src = 2 * boundlocy1 - j_dst;
+            k_src = 2 * boundlocz1 - k_dst;
+        } else if ((i_dst > boundlocx1) && (k_dst < boundlocz0))
+        {
+            i_src = 2 * boundlocx1 - i_dst;
+            k_src = 2 * boundlocz0 - k_dst;
+        } else if ((i_dst > boundlocx1) && (j_dst < boundlocy0))
+        {
+            i_src = 2 * boundlocx1 - i_dst;
+            j_src = 2 * boundlocy0 - j_dst;
+        } else if ((i_dst < boundlocx0) && (k_dst > boundlocz1))
+        {
+            i_src = 2 * boundlocx0 - i_dst;
+            k_src = 2 * boundlocz1 - k_dst;
+        } else if ((i_dst < boundlocx0) && (j_dst > boundlocy1))
+        {
+            i_src = 2 * boundlocx0 - i_dst;
+            j_src = 2 * boundlocy1 - j_dst;
+        } else if ((j_dst > boundlocy1) && (k_dst < boundlocz0))
+        {
+            j_src = 2 * boundlocy1 - j_dst;
+            k_src = 2 * boundlocz0 - k_dst;
+        }
+
+    }
+
+    const int src_idx = DEVICE_VTXBUF_IDX(i_src, j_src, k_src);
+    const int dst_idx = DEVICE_VTXBUF_IDX(i_dst, j_dst, k_dst);
+    vtxbuf[dst_idx] = sign * vtxbuf[src_idx]; // sign = 1 symmetric, sign = -1 antisymmetric
+}
+
+
 static __global__ void
 kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vtxbuf)
 {
@@ -60,3 +188,55 @@ acKernelPeriodicBoundconds(const cudaStream_t stream, const int3 start, const in
     ERRCHK_CUDA_KERNEL();
     return AC_SUCCESS;
 }
+
+AcResult
+acKernelGeneralBoundconds(const cudaStream_t stream, const int3 start, const int3 end,
+                          AcReal* vtxbuf, const VertexBufferHandle vtxbuf_handle,
+                          const AcMeshInfo config, const int3 bindex)
+{
+    const dim3 tpb(8, 2, 8);
+    const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x),
+                   (unsigned int)ceil((end.y - start.y) / (float)tpb.y),
+                   (unsigned int)ceil((end.z - start.z) / (float)tpb.z));
+
+    int3 bc_top = {config.int_params[AC_bc_type_top_x], config.int_params[AC_bc_type_top_y],
+                   config.int_params[AC_bc_type_top_z]};
+    int3 bc_bot = {config.int_params[AC_bc_type_bot_x], config.int_params[AC_bc_type_bot_y],
+                   config.int_params[AC_bc_type_bot_z]};
+
+//#if AC_MPI_ENABLED
+//    printf("WARNING: NON-PERIODIC BOUNDARY CONDITIONS NOT SUPPORTED BY MPI! Only working at node level.\n");
+//    return AC_FAILURE;
+//#endif
+
+    if (vtxbuf_handle != -1) // Dummy check for now; leaves room for switching boundary conditions per vertex buffer later
+    {
+
+        if (bc_top.x == AC_BOUNDCOND_SYMMETRIC)
+        {
+            kernel_symmetric_boundconds<<<bpg, tpb, 0, stream>>>(start, end, vtxbuf, bindex, 1);
+            ERRCHK_CUDA_KERNEL();
+        }
+        else if (bc_top.x == AC_BOUNDCOND_ANTISYMMETRIC)
+        {
+            kernel_symmetric_boundconds<<<bpg, tpb, 0, stream>>>(start, end, vtxbuf, bindex, -1);
+            ERRCHK_CUDA_KERNEL();
+        }
+        else if (bc_top.x == AC_BOUNDCOND_PERIODIC)
+        {
+            kernel_periodic_boundconds<<<bpg, tpb, 0, stream>>>(start, end, vtxbuf);
+            ERRCHK_CUDA_KERNEL();
+        }
+        else
+        {
+            printf("ERROR: Boundary condition not recognized!\n");
+            printf("ERROR: bc_top = %i, %i, %i \n", bc_top.x, bc_top.y, bc_top.z);
+            printf("ERROR: bc_bot = %i, %i, %i \n", bc_bot.x, bc_bot.y, bc_bot.z);
+
+            return AC_FAILURE;
+        }
+
+    }
+
+    return AC_SUCCESS;
+}
diff --git a/src/core/kernels/kernels.h b/src/core/kernels/kernels.h
index 282f1c4..b04eaf5 100644
--- a/src/core/kernels/kernels.h
+++ b/src/core/kernels/kernels.h
@@ -45,6 +45,11 @@ extern "C" {
 /** */
 AcResult acKernelPeriodicBoundconds(const cudaStream_t stream, const int3 start, const int3 end,
                                     AcReal* vtxbuf);
+/** */
+AcResult acKernelGeneralBoundconds(const cudaStream_t stream, const int3 start, const int3 end,
+                                   AcReal* vtxbuf, const VertexBufferHandle vtxbuf_handle,
+                                   const AcMeshInfo config, const int3 bindex);
+
 /** */
 AcResult acKernelDummy(void);
 
diff --git a/src/core/node.cc b/src/core/node.cc
index 70c3115..fdc4f49 100644
--- a/src/core/node.cc
+++ b/src/core/node.cc
@@ -656,6 +656,30 @@ local_boundcondstep(const Node node, const Stream stream, const VertexBufferHand
     return AC_SUCCESS;
 }
 
+static AcResult
+local_boundcondstep_GBC(const Node node, const Stream stream, const VertexBufferHandle vtxbuf, const AcMeshInfo config)
+{
+    acNodeSynchronizeStream(node, stream);
+
+    int3 bindex = {-1, -1, -1}; // Dummy for the node level; relevant only for MPI.
+ + if (node->num_devices > 1) { + // Local boundary conditions + // #pragma omp parallel for + for (int i = 0; i < node->num_devices; ++i) { + const int3 d0 = (int3){0, 0, NGHOST}; // DECOMPOSITION OFFSET HERE + const int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y, d0.z + node->subgrid.n.z}; + acDeviceGeneralBoundcondStep(node->devices[i], stream, vtxbuf, d0, d1, config, bindex); + } + } + else { + acDeviceGeneralBoundcondStep(node->devices[0], stream, vtxbuf, (int3){0, 0, 0}, + node->subgrid.m, config, bindex); + } + return AC_SUCCESS; +} + + static AcResult global_boundcondstep(const Node node, const Stream stream, const VertexBufferHandle vtxbuf_handle) { @@ -768,6 +792,85 @@ acNodeIntegrate(const Node node, const AcReal dt) return AC_SUCCESS; } +AcResult +acNodeIntegrateGBC(const Node node, const AcMeshInfo config, const AcReal dt) +{ + acNodeSynchronizeStream(node, STREAM_ALL); + // xxx|OOO OOOOOOOOO OOO|xxx + // ^ ^ ^ ^ + // n0 n1 n2 n3 + // const int3 n0 = (int3){NGHOST, NGHOST, NGHOST}; + // const int3 n1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST}; + // const int3 n2 = node->grid.n; + // const int3 n3 = n0 + node->grid.n; + + for (int isubstep = 0; isubstep < 3; ++isubstep) { + acNodeSynchronizeStream(node, STREAM_ALL); + for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) { + local_boundcondstep_GBC(node, (Stream)vtxbuf, (VertexBufferHandle)vtxbuf, config); + } + acNodeSynchronizeStream(node, STREAM_ALL); + + // Inner inner + // #pragma omp parallel for + for (int i = 0; i < node->num_devices; ++i) { + const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST}; + const int3 m2 = node->subgrid.n; + acDeviceIntegrateSubstep(node->devices[i], STREAM_16, isubstep, m1, m2, dt); + } + + for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) { + acNodeSynchronizeVertexBuffer(node, (Stream)vtxbuf, (VertexBufferHandle)vtxbuf); + global_boundcondstep(node, (Stream)vtxbuf, (VertexBufferHandle)vtxbuf); + } + for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) { + acNodeSynchronizeStream(node, (Stream)vtxbuf); + } + + // #pragma omp parallel for + for (int i = 0; i < node->num_devices; ++i) { // Front + const int3 m1 = (int3){NGHOST, NGHOST, NGHOST}; + const int3 m2 = m1 + (int3){node->subgrid.n.x, node->subgrid.n.y, NGHOST}; + acDeviceIntegrateSubstep(node->devices[i], STREAM_0, isubstep, m1, m2, dt); + } + // #pragma omp parallel for + for (int i = 0; i < node->num_devices; ++i) { // Back + const int3 m1 = (int3){NGHOST, NGHOST, node->subgrid.n.z}; + const int3 m2 = m1 + (int3){node->subgrid.n.x, node->subgrid.n.y, NGHOST}; + acDeviceIntegrateSubstep(node->devices[i], STREAM_1, isubstep, m1, m2, dt); + } + // #pragma omp parallel for + for (int i = 0; i < node->num_devices; ++i) { // Bottom + const int3 m1 = (int3){NGHOST, NGHOST, 2 * NGHOST}; + const int3 m2 = m1 + (int3){node->subgrid.n.x, NGHOST, node->subgrid.n.z - 2 * NGHOST}; + acDeviceIntegrateSubstep(node->devices[i], STREAM_2, isubstep, m1, m2, dt); + } + // #pragma omp parallel for + for (int i = 0; i < node->num_devices; ++i) { // Top + const int3 m1 = (int3){NGHOST, node->subgrid.n.y, 2 * NGHOST}; + const int3 m2 = m1 + (int3){node->subgrid.n.x, NGHOST, node->subgrid.n.z - 2 * NGHOST}; + acDeviceIntegrateSubstep(node->devices[i], STREAM_3, isubstep, m1, m2, dt); + } + // #pragma omp parallel for + for (int i = 0; i < node->num_devices; ++i) { // Left + const int3 m1 = (int3){NGHOST, 2 * NGHOST, 2 * NGHOST}; + const int3 m2 = m1 + (int3){NGHOST, node->subgrid.n.y - 2 * NGHOST, + node->subgrid.n.z - 2 
* NGHOST}; + acDeviceIntegrateSubstep(node->devices[i], STREAM_4, isubstep, m1, m2, dt); + } + // #pragma omp parallel for + for (int i = 0; i < node->num_devices; ++i) { // Right + const int3 m1 = (int3){node->subgrid.n.x, 2 * NGHOST, 2 * NGHOST}; + const int3 m2 = m1 + (int3){NGHOST, node->subgrid.n.y - 2 * NGHOST, + node->subgrid.n.z - 2 * NGHOST}; + acDeviceIntegrateSubstep(node->devices[i], STREAM_5, isubstep, m1, m2, dt); + } + acNodeSwapBuffers(node); + } + acNodeSynchronizeStream(node, STREAM_ALL); + return AC_SUCCESS; +} + AcResult acNodePeriodicBoundcondStep(const Node node, const Stream stream, const VertexBufferHandle vtxbuf_handle) @@ -783,7 +886,20 @@ acNodePeriodicBoundcondStep(const Node node, const Stream stream, } AcResult -acNodePeriodicBoundconds(const Node node, const Stream stream) +acNodeGeneralBoundcondStep(const Node node, const Stream stream, + const VertexBufferHandle vtxbuf_handle, const AcMeshInfo config) +{ + local_boundcondstep_GBC(node, stream, vtxbuf_handle, config); + acNodeSynchronizeVertexBuffer(node, stream, vtxbuf_handle); + + global_boundcondstep(node, stream, vtxbuf_handle); + + + return AC_SUCCESS; +} + +AcResult +acNodePeriodicBoundconds(const Node node, const Stream stream) { for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { acNodePeriodicBoundcondStep(node, stream, (VertexBufferHandle)i); @@ -791,6 +907,15 @@ acNodePeriodicBoundconds(const Node node, const Stream stream) return AC_SUCCESS; } +AcResult +acNodeGeneralBoundconds(const Node node, const Stream stream, const AcMeshInfo config) +{ + for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { + acNodeGeneralBoundcondStep(node, stream, (VertexBufferHandle)i, config); + } + return AC_SUCCESS; +} + static AcReal simple_final_reduce_scal(const Node node, const ReductionType& rtype, const AcReal* results, const int& n)
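
For reference, a minimal host-side sketch (not part of this patch) of how the pieces above fit together: the boundary type per face is selected through the new AC_bc_type_* integer parameters (values from the AcBoundcond enum, normally set in config/astaroth.conf), and the *GBC entry points replace acIntegrate/acBoundcondStep. The helper function below is hypothetical; acInit, acLoad, acStore and acQuit are assumed to behave as in samples/standalone/simulation.cc. Note that acKernelGeneralBoundconds currently dispatches on bc_top.x only, so mixed per-face types are not yet differentiated.

// Hypothetical usage sketch -- not part of this commit.
#include "astaroth.h"

static void
run_with_general_boundconds(AcMeshInfo config, AcMesh* mesh, const AcReal dt, const int nsteps)
{
    // 0 = periodic, 1 = symmetric, 2 = antisymmetric (see config/astaroth.conf).
    // The current kernel dispatch keys off bc_top.x, so use one type for all faces.
    config.int_params[AC_bc_type_top_x] = AC_BOUNDCOND_SYMMETRIC;
    config.int_params[AC_bc_type_bot_x] = AC_BOUNDCOND_SYMMETRIC;
    config.int_params[AC_bc_type_top_y] = AC_BOUNDCOND_SYMMETRIC;
    config.int_params[AC_bc_type_bot_y] = AC_BOUNDCOND_SYMMETRIC;
    config.int_params[AC_bc_type_top_z] = AC_BOUNDCOND_SYMMETRIC;
    config.int_params[AC_bc_type_bot_z] = AC_BOUNDCOND_SYMMETRIC;

    acInit(config); // assumed: same init/load/store/quit flow as samples/standalone
    acLoad(*mesh);

    for (int i = 0; i < nsteps; ++i) {
        // Boundary conditions are not applied after the final substep ...
        acIntegrateGBC(config, dt);
    }

    // ... so apply them explicitly before reading the data back.
    acBoundcondStepGBC(config);
    acStore(mesh);
    acQuit();
}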