Merged in alt_bcond_2020_09 (pull request #16)

Alt bcond 2020 09
Miikka Väisälä
2020-11-23 10:24:01 +00:00
committed by jpekkila
9 changed files with 694 additions and 8 deletions

View File

@@ -20,9 +20,15 @@
uniform int AC_max_steps;
uniform int AC_save_steps;
uniform int AC_bin_steps;
uniform int AC_bc_type;
uniform int AC_start_step;
uniform int AC_bc_type_top_x;
uniform int AC_bc_type_bot_x;
uniform int AC_bc_type_top_y;
uniform int AC_bc_type_bot_y;
uniform int AC_bc_type_top_z;
uniform int AC_bc_type_bot_z;
// Real params
uniform Scalar AC_dt;
uniform Scalar AC_max_time;

View File

@@ -13,6 +13,15 @@ AC_dsx = 0.04908738521
AC_dsy = 0.04908738521
AC_dsz = 0.04908738521
// 0 = periodic bc, 1 = symmetric bc, 2 = antisymmetric bc
AC_bc_type_top_x = 0
AC_bc_type_top_y = 0
AC_bc_type_top_z = 0
AC_bc_type_bot_x = 0
AC_bc_type_bot_y = 0
AC_bc_type_bot_z = 0
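// Illustrative only (not part of this run setup): the type can be chosen per face,
// e.g. a symmetric top and an antisymmetric bottom in the z direction would read
//   AC_bc_type_top_z = 1
//   AC_bc_type_bot_z = 2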
/*
* =============================================================================
* Run-time params

View File

@@ -49,6 +49,11 @@ typedef struct {
typedef enum { AC_SUCCESS = 0, AC_FAILURE = 1 } AcResult;
// Naming the numbers associated with the boundary condition types
typedef enum { AC_BOUNDCOND_PERIODIC = 0,
AC_BOUNDCOND_SYMMETRIC = 1,
AC_BOUNDCOND_ANTISYMMETRIC = 2 } AcBoundcond;
#define AC_GEN_ID(X) X,
typedef enum {
AC_FOR_RTYPES(AC_GEN_ID) //
@@ -227,10 +232,19 @@ AcResult acStore(AcMesh* host_mesh);
* substep and the user is responsible for calling acBoundcondStep before reading the data. */
AcResult acIntegrate(const AcReal dt);
/** Performs Runge-Kutta 3 integration. Note: Boundary conditions are not applied after the final
* substep and the user is responsible for calling acBoundcondStep before reading the data.
* Has customizable boundary conditions. */
AcResult acIntegrateGBC(const AcMeshInfo config, const AcReal dt);
/** Applies periodic boundary conditions for the Mesh distributed among the devices visible to
* the caller*/
AcResult acBoundcondStep(void);
/** Applies general outer boundary conditions for the Mesh distributed among the devices visible to
* the caller*/
AcResult acBoundcondStepGBC(const AcMeshInfo config);
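/* A minimal usage sketch (illustrative; `mesh_info`, `dt` and `mesh` are assumed to come
 * from the caller, as in the standalone simulation loop): the GBC variants take the mesh
 * configuration so the AC_bc_type_{top,bot}_{x,y,z} settings are honored instead of the
 * default periodic boundaries.
 *
 *     acIntegrateGBC(mesh_info, dt);   // RK3 substeps with configurable boundconds
 *     acBoundcondStepGBC(mesh_info);   // refresh ghost zones before reading the data
 *     acStore(mesh);                   // copy the mesh back to the host
 */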
/** Does a scalar reduction with the data stored in some vertex buffer */
AcReal acReduceScal(const ReductionType rtype, const VertexBufferHandle vtxbuf_handle);
@@ -306,9 +320,21 @@ AcResult acGridStoreMesh(const Stream stream, AcMesh* host_mesh);
/** */
AcResult acGridIntegrate(const Stream stream, const AcReal dt);
/** */
/* MV: Commented out for a while, but saved for the future when standalone_MPI
works with periodic boundary conditions.
AcResult
acGridIntegrateNonperiodic(const Stream stream, const AcReal dt)
AcResult acGridIntegrateNonperiodic(const Stream stream, const AcReal dt);
*/
/** */
AcResult acGridPeriodicBoundconds(const Stream stream);
/** */
AcResult acGridGeneralBoundconds(const Device device, const Stream stream);
/** TODO */
AcResult acGridReduceScal(const Stream stream, const ReductionType rtype,
const VertexBufferHandle vtxbuf_handle, AcReal* result);
@@ -430,6 +456,9 @@ AcResult acNodeIntegrateSubstep(const Node node, const Stream stream, const int
/** */
AcResult acNodeIntegrate(const Node node, const AcReal dt);
/** */
AcResult acNodeIntegrateGBC(const Node node, const AcMeshInfo config, const AcReal dt);
/** */
AcResult acNodePeriodicBoundcondStep(const Node node, const Stream stream,
const VertexBufferHandle vtxbuf_handle);
@@ -437,6 +466,13 @@ AcResult acNodePeriodicBoundcondStep(const Node node, const Stream stream,
/** */
AcResult acNodePeriodicBoundconds(const Node node, const Stream stream);
/** */
AcResult acNodeGeneralBoundcondStep(const Node node, const Stream stream,
const VertexBufferHandle vtxbuf_handle, const AcMeshInfo config);
/** */
AcResult acNodeGeneralBoundconds(const Node node, const Stream stream, const AcMeshInfo config);
/** */
AcResult acNodeReduceScal(const Node node, const Stream stream, const ReductionType rtype,
const VertexBufferHandle vtxbuf_handle, AcReal* result);
@@ -565,6 +601,16 @@ AcResult acDevicePeriodicBoundcondStep(const Device device, const Stream stream,
AcResult acDevicePeriodicBoundconds(const Device device, const Stream stream, const int3 start,
const int3 end);
/** */
AcResult acDeviceGeneralBoundcondStep(const Device device, const Stream stream,
const VertexBufferHandle vtxbuf_handle, const int3 start,
const int3 end, const AcMeshInfo config, const int3 bindex);
/** */
AcResult acDeviceGeneralBoundconds(const Device device, const Stream stream, const int3 start,
const int3 end, const AcMeshInfo config, const int3 bindex);
/** */
AcResult acDeviceReduceScal(const Device device, const Stream stream, const ReductionType rtype,
const VertexBufferHandle vtxbuf_handle, AcReal* result);

View File

@@ -374,7 +374,8 @@ run_simulation(const char* config_path)
#endif
}
acBoundcondStep();
//acBoundcondStep();
acBoundcondStepGBC(mesh_info);
acStore(mesh);
if (start_step == 0) {
save_mesh(*mesh, 0, t_step);
@@ -440,7 +441,11 @@ run_simulation(const char* config_path)
loadForcingParamsToDevice(forcing_params);
#endif
acIntegrate(dt);
/* Now uses flexible boundary conditions */
//acIntegrate(dt);
acIntegrateGBC(mesh_info, dt);
t_step += dt;
@@ -486,7 +491,8 @@ run_simulation(const char* config_path)
acBoundcondStep();
acStore(mesh);
*/
acBoundcondStep();
//acBoundcondStep();
acBoundcondStepGBC(mesh_info);
acStore(mesh);
save_mesh(*mesh, i, t_step);
@@ -507,7 +513,8 @@ run_simulation(const char* config_path)
if (dt < dt_typical/AcReal(1e5)) {
if (dtcounter > 10) {
printf("dt = %e TOO LOW! Ending run at t = %#e \n", double(dt), double(t_step));
acBoundcondStep();
//acBoundcondStep();
acBoundcondStepGBC(mesh_info);
acStore(mesh);
save_mesh(*mesh, i, t_step);
break;
@@ -521,7 +528,8 @@ run_simulation(const char* config_path)
// End loop if nan is found
if (found_nan > 0) {
printf("Found nan at t = %e \n", double(t_step));
acBoundcondStep();
//acBoundcondStep();
acBoundcondStepGBC(mesh_info);
acStore(mesh);
save_mesh(*mesh, i, t_step);
break;
@@ -536,7 +544,8 @@ run_simulation(const char* config_path)
if (found_stop == 1) {
printf("Found STOP file at t = %e \n", double(t_step));
acBoundcondStep();
//acBoundcondStep();
acBoundcondStepGBC(mesh_info);
acStore(mesh);
save_mesh(*mesh, i, t_step);
break;

View File

@@ -86,6 +86,13 @@ acIntegrate(const AcReal dt)
return acNodeIntegrate(nodes[0], dt);
}
AcResult
acIntegrateGBC(const AcMeshInfo config, const AcReal dt)
{
return acNodeIntegrateGBC(nodes[0], config, dt);
}
AcResult
acIntegrateStep(const int isubstep, const AcReal dt)
{
@@ -109,6 +116,12 @@ acBoundcondStep(void)
return acNodePeriodicBoundconds(nodes[0], STREAM_DEFAULT);
}
AcResult
acBoundcondStepGBC(const AcMeshInfo config)
{
return acNodeGeneralBoundconds(nodes[0], STREAM_DEFAULT, config);
}
AcReal
acReduceScal(const ReductionType rtype, const VertexBufferHandle vtxbuf_handle)
{

View File

@@ -433,6 +433,27 @@ acDevicePeriodicBoundconds(const Device device, const Stream stream, const int3
return AC_SUCCESS;
}
AcResult
acDeviceGeneralBoundcondStep(const Device device, const Stream stream,
const VertexBufferHandle vtxbuf_handle, const int3 start,
const int3 end, const AcMeshInfo config, const int3 bindex)
{
cudaSetDevice(device->id);
return acKernelGeneralBoundconds(device->streams[stream], start, end,
device->vba.in[vtxbuf_handle], vtxbuf_handle, config, bindex);
}
AcResult
acDeviceGeneralBoundconds(const Device device, const Stream stream, const int3 start,
const int3 end, const AcMeshInfo config, const int3 bindex)
{
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
acDeviceGeneralBoundcondStep(device, stream, (VertexBufferHandle)i, start, end, config, bindex);
}
return AC_SUCCESS;
}
AcResult
acDeviceReduceScal(const Device device, const Stream stream, const ReductionType rtype,
const VertexBufferHandle vtxbuf_handle, AcReal* result)
@@ -1433,6 +1454,50 @@ acGridStoreMesh(const Stream stream, AcMesh* host_mesh)
return AC_SUCCESS;
}
/* MV: Commented out for a while, but saved for the future when standalone_MPI
works with periodic boundary conditions.
AcResult
acGridGeneralBoundconds(const Device device, const Stream stream)
{
// Non-periodic Boundary conditions
// Check the position in MPI frame
int nprocs, pid;
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
const uint3_64 decomposition = decompose(nprocs);
const int3 pid3d = getPid3D(pid, decomposition);
// Set outer boundaries after substep computation.
const int3 m1 = (int3){0, 0, 0};
const int3 m2 = grid.nn;
// If we are a boundary element
int3 bindex = (int3){0, 0, 0};
// Check if there are active boundary condition edges.
// 0 is no boundary, 1 both edges, 2 is top edge, 3 bottom edge
if ((pid3d.x == 0) && (pid3d.x == decomposition.x - 1)) { bindex.x = 1; }
else if (pid3d.x == 0) { bindex.x = 2; }
else if (pid3d.x == decomposition.x - 1) { bindex.x = 3; }
if ((pid3d.y == 0) && (pid3d.y == decomposition.y - 1)) { bindex.y = 1; }
else if (pid3d.y == 0) { bindex.y = 2; }
else if (pid3d.y == decomposition.y - 1) { bindex.y = 3; }
if ((pid3d.z == 0) && (pid3d.z == decomposition.z - 1)) { bindex.z = 1; }
else if (pid3d.z == 0) { bindex.z = 2; }
else if (pid3d.z == decomposition.z - 1) { bindex.z = 3; }
if ((bindex.x != 1) && (bindex.y != 1) && (bindex.z != 1)) {
acDeviceGeneralBoundconds(device, stream, m1, m2, bindex);
}
acGridSynchronizeStream(stream);
return AC_SUCCESS;
}
*/
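/* Illustrative helper sketch (hypothetical, not part of the build): the boundary-flag
 * logic above factored out. bindex encodes, per axis, which outer boundaries this rank
 * owns: 0 = no boundary, 1 = both edges, 2 = top edge, 3 = bottom edge.
 *
 * static int3
 * boundary_index(const int3 pid3d, const uint3_64 decomposition)
 * {
 *     int3 bindex = (int3){0, 0, 0};
 *
 *     if (pid3d.x == 0 && pid3d.x == decomposition.x - 1) { bindex.x = 1; }
 *     else if (pid3d.x == 0)                              { bindex.x = 2; }
 *     else if (pid3d.x == decomposition.x - 1)            { bindex.x = 3; }
 *
 *     if (pid3d.y == 0 && pid3d.y == decomposition.y - 1) { bindex.y = 1; }
 *     else if (pid3d.y == 0)                              { bindex.y = 2; }
 *     else if (pid3d.y == decomposition.y - 1)            { bindex.y = 3; }
 *
 *     if (pid3d.z == 0 && pid3d.z == decomposition.z - 1) { bindex.z = 1; }
 *     else if (pid3d.z == 0)                              { bindex.z = 2; }
 *     else if (pid3d.z == decomposition.z - 1)            { bindex.z = 3; }
 *
 *     return bindex;
 * }
 */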
/*
// Unused
AcResult
@@ -1864,6 +1929,234 @@ acGridIntegrate(const Stream stream, const AcReal dt)
return AC_SUCCESS;
}
/* MV: Commented out for a while, but saved for the future when standalone_MPI
works with periodic boundary conditions.
AcResult
acGridIntegrateNonperiodic(const Stream stream, const AcReal dt)
{
ERRCHK(grid.initialized);
acGridSynchronizeStream(stream);
const Device device = grid.device;
const int3 nn = grid.nn;
#if MPI_INCL_CORNERS
CommData corner_data = grid.corner_data; // Do not rm: required for corners
#endif // MPI_INCL_CORNERS
CommData edgex_data = grid.edgex_data;
CommData edgey_data = grid.edgey_data;
CommData edgez_data = grid.edgez_data;
CommData sidexy_data = grid.sidexy_data;
CommData sidexz_data = grid.sidexz_data;
CommData sideyz_data = grid.sideyz_data;
acGridLoadScalarUniform(stream, AC_dt, dt);
acDeviceSynchronizeStream(device, stream);
// Corners
#if MPI_INCL_CORNERS
// Do not rm: required for corners
const int3 corner_b0s[] = {
(int3){0, 0, 0},
(int3){NGHOST + nn.x, 0, 0},
(int3){0, NGHOST + nn.y, 0},
(int3){0, 0, NGHOST + nn.z},
(int3){NGHOST + nn.x, NGHOST + nn.y, 0},
(int3){NGHOST + nn.x, 0, NGHOST + nn.z},
(int3){0, NGHOST + nn.y, NGHOST + nn.z},
(int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST + nn.z},
};
#endif // MPI_INCL_CORNERS
// Edges X
const int3 edgex_b0s[] = {
(int3){NGHOST, 0, 0},
(int3){NGHOST, NGHOST + nn.y, 0},
(int3){NGHOST, 0, NGHOST + nn.z},
(int3){NGHOST, NGHOST + nn.y, NGHOST + nn.z},
};
// Edges Y
const int3 edgey_b0s[] = {
(int3){0, NGHOST, 0},
(int3){NGHOST + nn.x, NGHOST, 0},
(int3){0, NGHOST, NGHOST + nn.z},
(int3){NGHOST + nn.x, NGHOST, NGHOST + nn.z},
};
// Edges Z
const int3 edgez_b0s[] = {
(int3){0, 0, NGHOST},
(int3){NGHOST + nn.x, 0, NGHOST},
(int3){0, NGHOST + nn.y, NGHOST},
(int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST},
};
// Sides XY
const int3 sidexy_b0s[] = {
(int3){NGHOST, NGHOST, 0}, //
(int3){NGHOST, NGHOST, NGHOST + nn.z}, //
};
// Sides XZ
const int3 sidexz_b0s[] = {
(int3){NGHOST, 0, NGHOST}, //
(int3){NGHOST, NGHOST + nn.y, NGHOST}, //
};
// Sides YZ
const int3 sideyz_b0s[] = {
(int3){0, NGHOST, NGHOST}, //
(int3){NGHOST + nn.x, NGHOST, NGHOST}, //
};
for (int isubstep = 0; isubstep < 3; ++isubstep) {
#if MPI_COMM_ENABLED
#if MPI_INCL_CORNERS
acPackCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners
#endif // MPI_INCL_CORNERS
acPackCommData(device, edgex_b0s, &edgex_data);
acPackCommData(device, edgey_b0s, &edgey_data);
acPackCommData(device, edgez_b0s, &edgez_data);
acPackCommData(device, sidexy_b0s, &sidexy_data);
acPackCommData(device, sidexz_b0s, &sidexz_data);
acPackCommData(device, sideyz_b0s, &sideyz_data);
#endif
#if MPI_COMM_ENABLED
MPI_Barrier(MPI_COMM_WORLD);
#if MPI_GPUDIRECT_DISABLED
#if MPI_INCL_CORNERS
acTransferCommDataToHost(device, &corner_data); // Do not rm: required for corners
#endif // MPI_INCL_CORNERS
acTransferCommDataToHost(device, &edgex_data);
acTransferCommDataToHost(device, &edgey_data);
acTransferCommDataToHost(device, &edgez_data);
acTransferCommDataToHost(device, &sidexy_data);
acTransferCommDataToHost(device, &sidexz_data);
acTransferCommDataToHost(device, &sideyz_data);
#endif
#if MPI_INCL_CORNERS
acTransferCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners
#endif // MPI_INCL_CORNERS
acTransferCommData(device, edgex_b0s, &edgex_data);
acTransferCommData(device, edgey_b0s, &edgey_data);
acTransferCommData(device, edgez_b0s, &edgez_data);
acTransferCommData(device, sidexy_b0s, &sidexy_data);
acTransferCommData(device, sidexz_b0s, &sidexz_data);
acTransferCommData(device, sideyz_b0s, &sideyz_data);
#endif // MPI_COMM_ENABLED
#if MPI_COMPUTE_ENABLED
//////////// INNER INTEGRATION //////////////
{
const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
const int3 m2 = nn;
acKernelIntegrateSubstep(device->streams[STREAM_16], isubstep, m1, m2, device->vba);
}
////////////////////////////////////////////
#endif // MPI_COMPUTE_ENABLED
#if MPI_COMM_ENABLED
#if MPI_INCL_CORNERS
acTransferCommDataWait(corner_data); // Do not rm: required for corners
#endif // MPI_INCL_CORNERS
acTransferCommDataWait(edgex_data);
acTransferCommDataWait(edgey_data);
acTransferCommDataWait(edgez_data);
acTransferCommDataWait(sidexy_data);
acTransferCommDataWait(sidexz_data);
acTransferCommDataWait(sideyz_data);
#if MPI_INCL_CORNERS
acUnpinCommData(device, &corner_data); // Do not rm: required for corners
#endif // MPI_INCL_CORNERS
acUnpinCommData(device, &edgex_data);
acUnpinCommData(device, &edgey_data);
acUnpinCommData(device, &edgez_data);
acUnpinCommData(device, &sidexy_data);
acUnpinCommData(device, &sidexz_data);
acUnpinCommData(device, &sideyz_data);
#if MPI_INCL_CORNERS
acUnpackCommData(device, corner_b0s, &corner_data);
#endif // MPI_INCL_CORNERS
acUnpackCommData(device, edgex_b0s, &edgex_data);
acUnpackCommData(device, edgey_b0s, &edgey_data);
acUnpackCommData(device, edgez_b0s, &edgez_data);
acUnpackCommData(device, sidexy_b0s, &sidexy_data);
acUnpackCommData(device, sidexz_b0s, &sidexz_data);
acUnpackCommData(device, sideyz_b0s, &sideyz_data);
//////////// OUTER INTEGRATION //////////////
// Wait for unpacking
#if MPI_INCL_CORNERS
acSyncCommData(corner_data); // Do not rm: required for corners
#endif // MPI_INCL_CORNERS
acSyncCommData(edgex_data);
acSyncCommData(edgey_data);
acSyncCommData(edgez_data);
acSyncCommData(sidexy_data);
acSyncCommData(sidexz_data);
acSyncCommData(sideyz_data);
#endif // MPI_COMM_ENABLED
// Invoke outer edge boundary conditions.
acGridGeneralBoundconds(device, stream);
#if MPI_COMPUTE_ENABLED
{ // Front
const int3 m1 = (int3){NGHOST, NGHOST, NGHOST};
const int3 m2 = m1 + (int3){nn.x, nn.y, NGHOST};
acKernelIntegrateSubstep(device->streams[STREAM_0], isubstep, m1, m2, device->vba);
}
{ // Back
const int3 m1 = (int3){NGHOST, NGHOST, nn.z};
const int3 m2 = m1 + (int3){nn.x, nn.y, NGHOST};
acKernelIntegrateSubstep(device->streams[STREAM_1], isubstep, m1, m2, device->vba);
}
{ // Bottom
const int3 m1 = (int3){NGHOST, NGHOST, 2 * NGHOST};
const int3 m2 = m1 + (int3){nn.x, NGHOST, nn.z - 2 * NGHOST};
acKernelIntegrateSubstep(device->streams[STREAM_2], isubstep, m1, m2, device->vba);
}
{ // Top
const int3 m1 = (int3){NGHOST, nn.y, 2 * NGHOST};
const int3 m2 = m1 + (int3){nn.x, NGHOST, nn.z - 2 * NGHOST};
acKernelIntegrateSubstep(device->streams[STREAM_3], isubstep, m1, m2, device->vba);
}
{ // Left
const int3 m1 = (int3){NGHOST, 2 * NGHOST, 2 * NGHOST};
const int3 m2 = m1 + (int3){NGHOST, nn.y - 2 * NGHOST, nn.z - 2 * NGHOST};
acKernelIntegrateSubstep(device->streams[STREAM_4], isubstep, m1, m2, device->vba);
}
{ // Right
const int3 m1 = (int3){nn.x, 2 * NGHOST, 2 * NGHOST};
const int3 m2 = m1 + (int3){NGHOST, nn.y - 2 * NGHOST, nn.z - 2 * NGHOST};
acKernelIntegrateSubstep(device->streams[STREAM_5], isubstep, m1, m2, device->vba);
}
#endif // MPI_COMPUTE_ENABLED
acDeviceSwapBuffers(device);
acDeviceSynchronizeStream(device, STREAM_ALL); // Wait until inner and outer done
////////////////////////////////////////////
}
return AC_SUCCESS;
}
*/
AcResult
acGridPeriodicBoundconds(const Stream stream)
{

View File

@@ -1,5 +1,133 @@
#pragma once
static __global__ void
kernel_symmetric_boundconds(const int3 start, const int3 end, AcReal* vtxbuf, const int3 bindex, const int sign)
{
const int i_dst = start.x + threadIdx.x + blockIdx.x * blockDim.x;
const int j_dst = start.y + threadIdx.y + blockIdx.y * blockDim.y;
const int k_dst = start.z + threadIdx.z + blockIdx.z * blockDim.z;
// If within the start-end range (this allows threadblock dims that are not
// divisible by end - start)
if (i_dst >= end.x || j_dst >= end.y || k_dst >= end.z)
return;
// If destination index is inside the computational domain, return since
// the boundary conditions are only applied to the ghost zones
if (i_dst >= DCONST(AC_nx_min) && i_dst < DCONST(AC_nx_max) && j_dst >= DCONST(AC_ny_min) &&
j_dst < DCONST(AC_ny_max) && k_dst >= DCONST(AC_nz_min) && k_dst < DCONST(AC_nz_max))
return;
// Find the source index
// Map to nx, ny, nz coordinates
int i_src, j_src, k_src, boundlocx0, boundlocx1, boundlocy0, boundlocy1, boundlocz0, boundlocz1;
int bsize = STENCIL_ORDER / 2;
//if (bindex.x != 0)
//if (bindex.y != 0)
//if (bindex.z != 0)
//Location of central border point.
boundlocx0 = bsize;
boundlocy0 = bsize;
boundlocz0 = bsize;
boundlocx1 = DCONST(AC_nx_max) - 1;
boundlocy1 = DCONST(AC_ny_max) - 1;
boundlocz1 = DCONST(AC_nz_max) - 1;
//Defaults
i_src = i_dst;
j_src = j_dst;
k_src = k_dst;
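// Mirror mapping: a ghost cell at i_dst reflects across the boundary plane at boundloc,
// i.e. i_src = 2*boundloc - i_dst. For example, if STENCIL_ORDER were 6, then
// bsize = boundlocx0 = 3 and a bottom ghost cell at i_dst = 1 would read from i_src = 5.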
if (bindex.x < 0)
{
// Pick up the mirroring value.
if ((i_dst < boundlocx0))
{
i_src = 2.0f*boundlocx0 - i_dst;
} else if ((i_dst > boundlocx1))
{
i_src = 2.0f*boundlocx1 - i_dst;
}
// Pick up the mirroring value.
if ((j_dst < boundlocy0))
{
j_src = 2.0f*boundlocy0 - j_dst;
} else if ((j_dst > boundlocy1))
{
j_src = 2.0f*boundlocy1 - j_dst;
}
// Pick up the mirroring value.
if ((k_dst < boundlocz0))
{
k_src = 2.0f*boundlocz0 - k_dst;
} else if ((k_dst > boundlocz1))
{
k_src = 2.0f*boundlocz1 - k_dst;
}
//Edges
if ( (i_dst < boundlocx0) && (j_dst < boundlocy0) )
{
i_src = 2.0f*boundlocx0 - i_dst;
j_src = 2.0f*boundlocy0 - j_dst;
//if ((k_dst == 50)) printf("i_dst %i j_dst %i k_dst %i i_src %i j_src %i k_src %i bsize %i \n", i_dst, j_dst, k_dst, i_src, j_src, k_src, bsize);
} else if ((i_dst < boundlocx0) && (k_dst < boundlocz0) )
{
i_src = 2.0f*boundlocx0 - i_dst;
k_src = 2.0f*boundlocz0 - k_dst;
} else if ( (j_dst < boundlocy0) && (k_dst < boundlocz0) )
{
j_src = 2.0f*boundlocy0 - j_dst;
k_src = 2.0f*boundlocz0 - k_dst;
} else if ((i_dst > boundlocx1) && (j_dst > boundlocy1) )
{
i_src = 2.0f*boundlocx1 - i_dst;
j_src = 2.0f*boundlocy1 - j_dst;
} else if ( (i_dst > boundlocx1) && (k_dst > boundlocz1) )
{
i_src = 2.0f*boundlocx1 - i_dst;
k_src = 2.0f*boundlocz1 - k_dst;
} else if ( (j_dst > boundlocy1) && (k_dst > boundlocz1) )
{
j_src = 2.0f*boundlocy1 - j_dst;
k_src = 2.0f*boundlocz1 - k_dst;
} else if ( (i_dst > boundlocx1) && (k_dst < boundlocz0) )
{
i_src = 2.0f*boundlocx1 - i_dst;
k_src = 2.0f*boundlocz0 - k_dst;
} else if ( (i_dst > boundlocx1) && (j_dst < boundlocy0) )
{
i_src = 2.0f*boundlocx1 - i_dst;
j_src = 2.0f*boundlocy0 - j_dst;
} else if ( (i_dst < boundlocx0) && (k_dst > boundlocz1) )
{
i_src = 2.0f*boundlocx0 - i_dst;
k_src = 2.0f*boundlocz1 - k_dst;
} else if ( (i_dst < boundlocx0) && (j_dst > boundlocy1) )
{
i_src = 2.0f*boundlocx0 - i_dst;
j_src = 2.0f*boundlocy1 - j_dst;
} else if ( (j_dst > boundlocy1) && (k_dst < boundlocz0) )
{
j_src = 2.0f*boundlocy1 - j_dst;
k_src = 2.0f*boundlocz0 - k_dst;
}
}
const int src_idx = DEVICE_VTXBUF_IDX(i_src, j_src, k_src);
const int dst_idx = DEVICE_VTXBUF_IDX(i_dst, j_dst, k_dst);
vtxbuf[dst_idx] = sign*vtxbuf[src_idx]; // sign = 1 symmetric, sign = -1 antisymmetric
}
static __global__ void
kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vtxbuf)
{
@@ -60,3 +188,55 @@ acKernelPeriodicBoundconds(const cudaStream_t stream, const int3 start, const in
ERRCHK_CUDA_KERNEL();
return AC_SUCCESS;
}
AcResult
acKernelGeneralBoundconds(const cudaStream_t stream, const int3 start, const int3 end,
AcReal* vtxbuf, const VertexBufferHandle vtxbuf_handle,
const AcMeshInfo config, const int3 bindex)
{
const dim3 tpb(8, 2, 8);
const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x),
(unsigned int)ceil((end.y - start.y) / (float)tpb.y),
(unsigned int)ceil((end.z - start.z) / (float)tpb.z));
int3 bc_top = {config.int_params[AC_bc_type_top_x], config.int_params[AC_bc_type_top_y],
config.int_params[AC_bc_type_top_z]};
int3 bc_bot = {config.int_params[AC_bc_type_bot_x], config.int_params[AC_bc_type_bot_y],
config.int_params[AC_bc_type_bot_z]};
//#if AC_MPI_ENABLED
// printf( "WARNING : NON-PERIODIC BOUNDARY CONDITIONS NOT SUPPORTER BY MPI! Only working at node level.\n");
// return AC_FAILURE;
//#endif
if (vtxbuf_handle != -1) // Dummy check for now; makes it possible later to switch the boundary condition per vertex buffer
{
if (bc_top.x == AC_BOUNDCOND_SYMMETRIC)
{
kernel_symmetric_boundconds<<<bpg, tpb, 0, stream>>>(start, end, vtxbuf, bindex, 1);
ERRCHK_CUDA_KERNEL();
}
else if (bc_top.x == AC_BOUNDCOND_ANTISYMMETRIC)
{
kernel_symmetric_boundconds<<<bpg, tpb, 0, stream>>>(start, end, vtxbuf, bindex, -1);
ERRCHK_CUDA_KERNEL();
}
else if (bc_top.x == AC_BOUNDCOND_PERIODIC)
{
kernel_periodic_boundconds<<<bpg, tpb, 0, stream>>>(start, end, vtxbuf);
ERRCHK_CUDA_KERNEL();
}
else
{
printf("ERROR: Boundary condition not recognized!\n");
printf("ERROR: bc_top = %i, %i, %i \n", bc_top.x, bc_top.y, bc_top.z);
printf("ERROR: bc_bot = %i, %i, %i \n", bc_bot.x, bc_bot.y, bc_bot.z);
return AC_FAILURE;
}
}
return AC_SUCCESS;
}

View File

@@ -45,6 +45,11 @@ extern "C" {
/** */
AcResult acKernelPeriodicBoundconds(const cudaStream_t stream, const int3 start, const int3 end,
AcReal* vtxbuf);
/** */
AcResult acKernelGeneralBoundconds(const cudaStream_t stream, const int3 start, const int3 end,
AcReal* vtxbuf, const VertexBufferHandle vtxbuf_handle,
const AcMeshInfo config, const int3 bindex);
/** */
AcResult acKernelDummy(void);

View File

@@ -656,6 +656,30 @@ local_boundcondstep(const Node node, const Stream stream, const VertexBufferHand
return AC_SUCCESS;
}
static AcResult
local_boundcondstep_GBC(const Node node, const Stream stream, const VertexBufferHandle vtxbuf, const AcMeshInfo config)
{
acNodeSynchronizeStream(node, stream);
int3 bindex = {-1, -1, -1}; //Dummy for node level. Relevant only for MPI.
if (node->num_devices > 1) {
// Local boundary conditions
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) {
const int3 d0 = (int3){0, 0, NGHOST}; // DECOMPOSITION OFFSET HERE
const int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y, d0.z + node->subgrid.n.z};
acDeviceGeneralBoundcondStep(node->devices[i], stream, vtxbuf, d0, d1, config, bindex);
}
}
else {
acDeviceGeneralBoundcondStep(node->devices[0], stream, vtxbuf, (int3){0, 0, 0},
node->subgrid.m, config, bindex);
}
return AC_SUCCESS;
}
static AcResult
global_boundcondstep(const Node node, const Stream stream, const VertexBufferHandle vtxbuf_handle)
{
@@ -768,6 +792,85 @@ acNodeIntegrate(const Node node, const AcReal dt)
return AC_SUCCESS;
}
AcResult
acNodeIntegrateGBC(const Node node, const AcMeshInfo config, const AcReal dt)
{
acNodeSynchronizeStream(node, STREAM_ALL);
// xxx|OOO OOOOOOOOO OOO|xxx
// ^ ^ ^ ^
// n0 n1 n2 n3
// const int3 n0 = (int3){NGHOST, NGHOST, NGHOST};
// const int3 n1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
// const int3 n2 = node->grid.n;
// const int3 n3 = n0 + node->grid.n;
for (int isubstep = 0; isubstep < 3; ++isubstep) {
acNodeSynchronizeStream(node, STREAM_ALL);
for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {
local_boundcondstep_GBC(node, (Stream)vtxbuf, (VertexBufferHandle)vtxbuf, config);
}
acNodeSynchronizeStream(node, STREAM_ALL);
// Inner inner
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) {
const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
const int3 m2 = node->subgrid.n;
acDeviceIntegrateSubstep(node->devices[i], STREAM_16, isubstep, m1, m2, dt);
}
for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {
acNodeSynchronizeVertexBuffer(node, (Stream)vtxbuf, (VertexBufferHandle)vtxbuf);
global_boundcondstep(node, (Stream)vtxbuf, (VertexBufferHandle)vtxbuf);
}
for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {
acNodeSynchronizeStream(node, (Stream)vtxbuf);
}
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) { // Front
const int3 m1 = (int3){NGHOST, NGHOST, NGHOST};
const int3 m2 = m1 + (int3){node->subgrid.n.x, node->subgrid.n.y, NGHOST};
acDeviceIntegrateSubstep(node->devices[i], STREAM_0, isubstep, m1, m2, dt);
}
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) { // Back
const int3 m1 = (int3){NGHOST, NGHOST, node->subgrid.n.z};
const int3 m2 = m1 + (int3){node->subgrid.n.x, node->subgrid.n.y, NGHOST};
acDeviceIntegrateSubstep(node->devices[i], STREAM_1, isubstep, m1, m2, dt);
}
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) { // Bottom
const int3 m1 = (int3){NGHOST, NGHOST, 2 * NGHOST};
const int3 m2 = m1 + (int3){node->subgrid.n.x, NGHOST, node->subgrid.n.z - 2 * NGHOST};
acDeviceIntegrateSubstep(node->devices[i], STREAM_2, isubstep, m1, m2, dt);
}
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) { // Top
const int3 m1 = (int3){NGHOST, node->subgrid.n.y, 2 * NGHOST};
const int3 m2 = m1 + (int3){node->subgrid.n.x, NGHOST, node->subgrid.n.z - 2 * NGHOST};
acDeviceIntegrateSubstep(node->devices[i], STREAM_3, isubstep, m1, m2, dt);
}
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) { // Left
const int3 m1 = (int3){NGHOST, 2 * NGHOST, 2 * NGHOST};
const int3 m2 = m1 + (int3){NGHOST, node->subgrid.n.y - 2 * NGHOST,
node->subgrid.n.z - 2 * NGHOST};
acDeviceIntegrateSubstep(node->devices[i], STREAM_4, isubstep, m1, m2, dt);
}
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) { // Right
const int3 m1 = (int3){node->subgrid.n.x, 2 * NGHOST, 2 * NGHOST};
const int3 m2 = m1 + (int3){NGHOST, node->subgrid.n.y - 2 * NGHOST,
node->subgrid.n.z - 2 * NGHOST};
acDeviceIntegrateSubstep(node->devices[i], STREAM_5, isubstep, m1, m2, dt);
}
acNodeSwapBuffers(node);
}
acNodeSynchronizeStream(node, STREAM_ALL);
return AC_SUCCESS;
}
AcResult
acNodePeriodicBoundcondStep(const Node node, const Stream stream,
const VertexBufferHandle vtxbuf_handle)
@@ -783,7 +886,20 @@ acNodePeriodicBoundcondStep(const Node node, const Stream stream,
}
AcResult
acNodePeriodicBoundconds(const Node node, const Stream stream)
acNodeGeneralBoundcondStep(const Node node, const Stream stream,
const VertexBufferHandle vtxbuf_handle, const AcMeshInfo config)
{
local_boundcondstep_GBC(node, stream, vtxbuf_handle, config);
acNodeSynchronizeVertexBuffer(node, stream, vtxbuf_handle);
global_boundcondstep(node, stream, vtxbuf_handle);
return AC_SUCCESS;
}
AcResult
acNodePeriodicBoundconds(const Node node, const Stream stream)
{
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
acNodePeriodicBoundcondStep(node, stream, (VertexBufferHandle)i);
@@ -791,6 +907,15 @@ acNodePeriodicBoundconds(const Node node, const Stream stream)
return AC_SUCCESS;
}
AcResult
acNodeGeneralBoundconds(const Node node, const Stream stream, const AcMeshInfo config)
{
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
acNodeGeneralBoundcondStep(node, stream, (VertexBufferHandle)i, config);
}
return AC_SUCCESS;
}
static AcReal
simple_final_reduce_scal(const Node node, const ReductionType& rtype, const AcReal* results,
const int& n)