Intermediate changes to the revised node interface
This commit is contained in:
465
src/core/node.cu
465
src/core/node.cu
@@ -18,5 +18,470 @@
|
||||
*/
|
||||
#include "astaroth_node.h"
|
||||
|
||||
#include "astaroth_device.h"
|
||||
#include "errchk.h"
|
||||
#include "math_utils.h" // sum for reductions
|
||||
|
||||
#define AC_GEN_STR(X) #X
|
||||
const char* intparam_names[] = {AC_FOR_BUILTIN_INT_PARAM_TYPES(AC_GEN_STR) //
|
||||
AC_FOR_USER_INT_PARAM_TYPES(AC_GEN_STR)};
|
||||
const char* int3param_names[] = {AC_FOR_BUILTIN_INT3_PARAM_TYPES(AC_GEN_STR) //
|
||||
AC_FOR_USER_INT3_PARAM_TYPES(AC_GEN_STR)};
|
||||
const char* realparam_names[] = {AC_FOR_BUILTIN_REAL_PARAM_TYPES(AC_GEN_STR) //
|
||||
AC_FOR_USER_REAL_PARAM_TYPES(AC_GEN_STR)};
|
||||
const char* real3param_names[] = {AC_FOR_BUILTIN_REAL3_PARAM_TYPES(AC_GEN_STR) //
|
||||
AC_FOR_USER_REAL3_PARAM_TYPES(AC_GEN_STR)};
|
||||
const char* vtxbuf_names[] = {AC_FOR_VTXBUF_HANDLES(AC_GEN_STR)};
|
||||
#undef AC_GEN_STR
|
||||
|
||||
static const int MAX_NUM_DEVICES = 32;
|
||||
static Node node = NULL;
|
||||
|
||||
struct node_s {
|
||||
int id;
|
||||
|
||||
int num_devices;
|
||||
Device devices[MAX_NUM_DEVICES];
|
||||
|
||||
Grid grid;
|
||||
Grid subgrid;
|
||||
};
|
||||
|
||||
static int
|
||||
gridIdx(const Grid grid, const int3 idx)
|
||||
{
|
||||
return idx.x + idx.y * grid.m.x + idx.z * grid.m.x * grid.m.y;
|
||||
}
|
||||
|
||||
static int3
|
||||
gridIdx3d(const Grid grid, const int idx)
|
||||
{
|
||||
return (int3){idx % grid.m.x, (idx % (grid.m.x * grid.m.y)) / grid.m.x,
|
||||
idx / (grid.m.x * grid.m.y)};
|
||||
}
|
||||
|
||||
static void
|
||||
printInt3(const int3 vec)
|
||||
{
|
||||
printf("(%d, %d, %d)", vec.x, vec.y, vec.z);
|
||||
}
|
||||
|
||||
static inline void
|
||||
print(const AcMeshInfo config)
|
||||
{
|
||||
for (int i = 0; i < NUM_INT_PARAMS; ++i)
|
||||
printf("[%s]: %d\n", intparam_names[i], config.int_params[i]);
|
||||
for (int i = 0; i < NUM_REAL_PARAMS; ++i)
|
||||
printf("[%s]: %g\n", realparam_names[i], double(config.real_params[i]));
|
||||
}
|
||||
|
||||
static void
|
||||
update_builtin_params(AcMeshInfo* config)
|
||||
{
|
||||
config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER;
|
||||
///////////// PAD TEST
|
||||
// config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER + PAD_SIZE;
|
||||
///////////// PAD TEST
|
||||
config->int_params[AC_my] = config->int_params[AC_ny] + STENCIL_ORDER;
|
||||
config->int_params[AC_mz] = config->int_params[AC_nz] + STENCIL_ORDER;
|
||||
|
||||
// Bounds for the computational domain, i.e. nx_min <= i < nx_max
|
||||
config->int_params[AC_nx_min] = NGHOST;
|
||||
config->int_params[AC_nx_max] = config->int_params[AC_nx_min] + config->int_params[AC_nx];
|
||||
config->int_params[AC_ny_min] = NGHOST;
|
||||
config->int_params[AC_ny_max] = config->int_params[AC_ny] + NGHOST;
|
||||
config->int_params[AC_nz_min] = NGHOST;
|
||||
config->int_params[AC_nz_max] = config->int_params[AC_nz] + NGHOST;
|
||||
|
||||
/* Additional helper params */
|
||||
// Int helpers
|
||||
config->int_params[AC_mxy] = config->int_params[AC_mx] * config->int_params[AC_my];
|
||||
config->int_params[AC_nxy] = config->int_params[AC_nx] * config->int_params[AC_ny];
|
||||
config->int_params[AC_nxyz] = config->int_params[AC_nxy] * config->int_params[AC_nz];
|
||||
|
||||
#if VERBOSE_PRINTING // Defined in astaroth.h
|
||||
printf("###############################################################\n");
|
||||
printf("Config dimensions recalculated:\n");
|
||||
print(*config);
|
||||
printf("###############################################################\n");
|
||||
#endif
|
||||
}
|
||||
|
||||
static Grid
|
||||
createGrid(const AcMeshInfo config)
|
||||
{
|
||||
Grid grid;
|
||||
|
||||
grid.m = (int3){config.int_params[AC_mx], config.int_params[AC_my], config.int_params[AC_mz]};
|
||||
grid.n = (int3){config.int_params[AC_nx], config.int_params[AC_ny], config.int_params[AC_nz]};
|
||||
|
||||
return grid;
|
||||
}
|
||||
|
||||
AcResult
|
||||
acNodeCreate(const int id, const AcMeshInfo node_config, Node* node_handle)
|
||||
{
|
||||
struct node_s* node = (struct device_s*)malloc(sizeof(*node));
|
||||
|
||||
// Get num_devices
|
||||
ERRCHK_CUDA_ALWAYS(cudaGetDeviceCount(&num_devices));
|
||||
if (num_devices < 1) {
|
||||
ERROR("No CUDA devices found!");
|
||||
return AC_FAILURE;
|
||||
}
|
||||
if (num_devices > MAX_NUM_DEVICES) {
|
||||
WARNING("More devices found than MAX_NUM_DEVICES. Using only MAX_NUM_DEVICES");
|
||||
num_devices = MAX_NUM_DEVICES;
|
||||
}
|
||||
if (!AC_MULTIGPU_ENABLED) {
|
||||
WARNING("MULTIGPU_ENABLED was false. Using only one device");
|
||||
num_devices = 1; // Use only one device if multi-GPU is not enabled
|
||||
}
|
||||
// Check that num_devices is divisible with AC_nz. This makes decomposing the
|
||||
// problem domain to multiple GPUs much easier since we do not have to worry
|
||||
// about remainders
|
||||
ERRCHK_ALWAYS(config.int_params[AC_nz] % num_devices == 0);
|
||||
|
||||
// Decompose the problem domain
|
||||
// The main grid
|
||||
grid = createGrid(config);
|
||||
|
||||
// Subgrids
|
||||
AcMeshInfo subgrid_config = config;
|
||||
subgrid_config.int_params[AC_nz] /= num_devices;
|
||||
update_builtin_params(&subgrid_config);
|
||||
subgrid = createGrid(subgrid_config);
|
||||
|
||||
// Periodic boundary conditions become weird if the system can "fold unto itself".
|
||||
ERRCHK_ALWAYS(subgrid.n.x >= STENCIL_ORDER);
|
||||
ERRCHK_ALWAYS(subgrid.n.y >= STENCIL_ORDER);
|
||||
ERRCHK_ALWAYS(subgrid.n.z >= STENCIL_ORDER);
|
||||
|
||||
#if VERBOSE_PRINTING
|
||||
// clang-format off
|
||||
printf("Grid m "); printInt3(grid.m); printf("\n");
|
||||
printf("Grid n "); printInt3(grid.n); printf("\n");
|
||||
printf("Subrid m "); printInt3(subgrid.m); printf("\n");
|
||||
printf("Subrid n "); printInt3(subgrid.n); printf("\n");
|
||||
// clang-format on
|
||||
#endif
|
||||
|
||||
// Initialize the devices
|
||||
for (int i = 0; i < num_devices; ++i) {
|
||||
createDevice(i, subgrid_config, &devices[i]);
|
||||
printDeviceInfo(devices[i]);
|
||||
}
|
||||
|
||||
// Enable peer access
|
||||
for (int i = 0; i < num_devices; ++i) {
|
||||
const int front = (i + 1) % num_devices;
|
||||
const int back = (i - 1 + num_devices) % num_devices;
|
||||
|
||||
int can_access_front, can_access_back;
|
||||
cudaDeviceCanAccessPeer(&can_access_front, i, front);
|
||||
cudaDeviceCanAccessPeer(&can_access_back, i, back);
|
||||
#if VERBOSE_PRINTING
|
||||
printf(
|
||||
"Trying to enable peer access from %d to %d (can access: %d) and %d (can access: %d)\n",
|
||||
i, front, can_access_front, back, can_access_back);
|
||||
#endif
|
||||
|
||||
cudaSetDevice(i);
|
||||
if (can_access_front) {
|
||||
ERRCHK_CUDA_ALWAYS(cudaDeviceEnablePeerAccess(front, 0));
|
||||
}
|
||||
if (can_access_back) {
|
||||
ERRCHK_CUDA_ALWAYS(cudaDeviceEnablePeerAccess(back, 0));
|
||||
}
|
||||
}
|
||||
acNodeSynchronizeStream(STREAM_ALL);
|
||||
|
||||
*node_handle = node;
|
||||
return AC_SUCCESS;
|
||||
}
|
||||
|
||||
AcResult
|
||||
acNodeDestroy(Node node)
|
||||
{
|
||||
acSynchronizeStream(STREAM_ALL);
|
||||
|
||||
for (int i = 0; i < num_devices; ++i) {
|
||||
destroyDevice(devices[i]);
|
||||
}
|
||||
free(node);
|
||||
|
||||
return AC_SUCCESS;
|
||||
}
|
||||
|
||||
AcResult
|
||||
acNodePrintInfo(const Node node)
|
||||
{
|
||||
WARNING("Not implemented");
|
||||
return AC_FAILURE;
|
||||
}
|
||||
|
||||
AcResult
|
||||
acNodeQueryDeviceConfiguration(const Node node, DeviceConfiguration* config)
|
||||
{
|
||||
WARNING("Not implemented");
|
||||
return AC_FAILURE;
|
||||
}
|
||||
|
||||
AcResult
|
||||
acNodeAutoOptimize(const Node node)
|
||||
{
|
||||
WARNING("Not implemented");
|
||||
return AC_FAILURE;
|
||||
}
|
||||
|
||||
AcResult
|
||||
acNodeSynchronizeStream(const Node node, const Stream stream)
|
||||
{
|
||||
for (int i = 0; i < num_devices; ++i) {
|
||||
synchronize(devices[i], stream);
|
||||
}
|
||||
|
||||
return AC_SUCCESS;
|
||||
}
|
||||
|
||||
AcResult
|
||||
acNodeSynchronizeVertexBuffer(const Node node, const Stream stream,
|
||||
const VertexBufferHandle vtxbuf_handle)
|
||||
{
|
||||
// Exchanges the halos of subgrids
|
||||
// After this step, the data within the main grid ranging from
|
||||
// (0, 0, NGHOST) -> grid.m.x, grid.m.y, NGHOST + grid.n.z
|
||||
// has been synchronized and transferred to appropriate subgrids
|
||||
|
||||
// We loop only to num_devices - 1 since the front and back plate of the grid is not
|
||||
// transferred because their contents depend on the boundary conditions.
|
||||
|
||||
// IMPORTANT NOTE: the boundary conditions must be applied before calling this function!
|
||||
// I.e. the halos of subgrids must contain up-to-date data!
|
||||
|
||||
const size_t num_vertices = subgrid.m.x * subgrid.m.y * NGHOST;
|
||||
|
||||
for (int i = 0; i < num_devices - 1; ++i) {
|
||||
// ...|ooooxxx|... -> xxx|ooooooo|...
|
||||
const int3 src = (int3){0, 0, subgrid.n.z};
|
||||
const int3 dst = (int3){0, 0, 0};
|
||||
|
||||
const Device src_device = devices[i];
|
||||
Device dst_device = devices[i + 1];
|
||||
|
||||
acDeviceTransferVertexBufferWithOffset(src_device, stream, vtxbuf_handle, src, dst,
|
||||
num_vertices, dst_device);
|
||||
}
|
||||
for (int i = 1; i < num_devices; ++i) {
|
||||
// ...|ooooooo|xxx <- ...|xxxoooo|...
|
||||
const int3 src = (int3){0, 0, NGHOST};
|
||||
const int3 dst = (int3){0, 0, NGHOST + subgrid.n.z};
|
||||
|
||||
const Device src_device = devices[i];
|
||||
Device dst_device = devices[i - 1];
|
||||
|
||||
acDeviceTransferVertexBufferWithOffset(src_device, stream, vtxbuf_handle, src, dst,
|
||||
num_vertices, dst_device);
|
||||
}
|
||||
return AC_SUCCESS;
|
||||
}
|
||||
|
||||
AcResult
|
||||
acNodeSynchronizeMesh(const Node node, const Stream stream)
|
||||
{
|
||||
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
|
||||
acNodeSynchronizeVertexBuffer(node, stream, (VertexBufferHandle)i);
|
||||
}
|
||||
|
||||
return AC_SUCCESS;
|
||||
}
|
||||
|
||||
AcResult
|
||||
acNodeSwapBuffers(const Node node)
|
||||
{
|
||||
for (int i = 0; i < num_devices; ++i) {
|
||||
acDeviceSwapBuffers(devices[i]);
|
||||
}
|
||||
return AC_SUCCESS;
|
||||
}
|
||||
|
||||
AcResult
|
||||
acNodeLoadConstant(const Node node, const Stream stream, const AcRealParam param,
|
||||
const AcReal value)
|
||||
{
|
||||
for (int i = 0; i < num_devices; ++i) {
|
||||
acDeviceLoadConstant(devices[i], stream, param, value);
|
||||
}
|
||||
return AC_SUCCESS;
|
||||
}
|
||||
|
||||
AcResult
|
||||
acNodeLoadVertexBufferWithOffset(const Node node, const Stream stream, const AcMesh host_mesh,
|
||||
const VertexBufferHandle vtxbuf_handle, const int3 src,
|
||||
const int3 dst, const int num_vertices)
|
||||
{
|
||||
// See the beginning of the file for an explanation of the index mapping
|
||||
// #pragma omp parallel for
|
||||
for (int i = 0; i < num_devices; ++i) {
|
||||
const int3 d0 = (int3){0, 0, i * subgrid.n.z}; // DECOMPOSITION OFFSET HERE
|
||||
const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.m.z};
|
||||
|
||||
const int3 s0 = src;
|
||||
const int3 s1 = gridIdx3d(grid, gridIdx(grid, s0) + num_vertices);
|
||||
|
||||
const int3 da = max(s0, d0);
|
||||
const int3 db = min(s1, d1);
|
||||
/*
|
||||
printf("Device %d\n", i);
|
||||
printf("\ts0: "); printInt3(s0); printf("\n");
|
||||
printf("\td0: "); printInt3(d0); printf("\n");
|
||||
printf("\tda: "); printInt3(da); printf("\n");
|
||||
printf("\tdb: "); printInt3(db); printf("\n");
|
||||
printf("\td1: "); printInt3(d1); printf("\n");
|
||||
printf("\ts1: "); printInt3(s1); printf("\n");
|
||||
printf("\t-> %s to device %d\n", db.z >= da.z ? "Copy" : "Do not copy", i);
|
||||
*/
|
||||
if (db.z >= da.z) {
|
||||
const int copy_cells = gridIdx(subgrid, db) - gridIdx(subgrid, da);
|
||||
// DECOMPOSITION OFFSET HERE
|
||||
const int3 da_local = (int3){da.x, da.y, da.z - i * grid.n.z / num_devices};
|
||||
// printf("\t\tcopy %d cells to local index ", copy_cells); printInt3(da_local);
|
||||
// printf("\n");
|
||||
acDeviceLoadVertexBufferWithOffset(devices[i], stream, host_mesh, vtxbuf_handle, da,
|
||||
da_local, copy_cells);
|
||||
}
|
||||
// printf("\n");
|
||||
}
|
||||
return AC_SUCCESS;
|
||||
}
|
||||
|
||||
AcResult
|
||||
acNodeLoadMeshWithOffset(const Node node, const Stream stream, const AcMesh host_mesh,
|
||||
const int3 src, const int3 dst, const int num_vertices)
|
||||
{
|
||||
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
|
||||
acNodeLoadVertexBufferWithOffset(node, stream, host_mesh, (VertexBufferHandle)i, src, dst,
|
||||
num_vertices);
|
||||
}
|
||||
return AC_SUCCESS;
|
||||
}
|
||||
|
||||
AcResult
|
||||
acNodeLoadVertexBuffer(const Node node, const Stream stream, const AcMesh host_mesh,
|
||||
const VertexBufferHandle vtxbuf_handle)
|
||||
{
|
||||
WARNING("Not implemented");
|
||||
return AC_FAILURE;
|
||||
}
|
||||
|
||||
AcResult
|
||||
acNodeLoadMesh(const Node node, const Stream stream, const AcMesh host_mesh)
|
||||
{
|
||||
WARNING("Not implemented");
|
||||
return AC_FAILURE;
|
||||
}
|
||||
|
||||
AcResult
|
||||
acNodeStoreVertexBufferWithOffset(const Node node, const Stream stream,
|
||||
const VertexBufferHandle vtxbuf_handle, const int3 src,
|
||||
const int3 dst, const int num_vertices, AcMesh* host_mesh)
|
||||
{
|
||||
WARNING("Not implemented");
|
||||
return AC_FAILURE;
|
||||
}
|
||||
|
||||
AcResult
|
||||
acNodeStoreMeshWithOffset(const Node node, const Stream stream, const int3 src, const int3 dst,
|
||||
const int num_vertices, AcMesh* host_mesh)
|
||||
{
|
||||
WARNING("Not implemented");
|
||||
return AC_FAILURE;
|
||||
}
|
||||
|
||||
AcResult
|
||||
acNodeStoreVertexBuffer(const Node node, const Stream stream,
|
||||
const VertexBufferHandle vtxbuf_handle, AcMesh* host_mesh)
|
||||
{
|
||||
WARNING("Not implemented");
|
||||
return AC_FAILURE;
|
||||
}
|
||||
|
||||
AcResult
|
||||
acNodeStoreMesh(const Node node, const Stream stream, AcMesh* host_mesh)
|
||||
{
|
||||
WARNING("Not implemented");
|
||||
return AC_FAILURE;
|
||||
}
|
||||
|
||||
AcResult
|
||||
acNodeTransferVertexBufferWithOffset(const Node src_node, const Stream stream,
|
||||
const VertexBufferHandle vtxbuf_handle, const int3 src,
|
||||
const int3 dst, const int num_vertices, Node dst_node)
|
||||
{
|
||||
WARNING("Not implemented");
|
||||
return AC_FAILURE;
|
||||
}
|
||||
|
||||
AcResult
|
||||
acNodeTransferMeshWithOffset(const Node src_node, const Stream stream, const int3 src,
|
||||
const int3 dst, const int num_vertices, Node* dst_node)
|
||||
{
|
||||
WARNING("Not implemented");
|
||||
return AC_FAILURE;
|
||||
}
|
||||
|
||||
AcResult
|
||||
acNodeTransferVertexBuffer(const Node src_node, const Stream stream,
|
||||
const VertexBufferHandle vtxbuf_handle, Node* dst_node)
|
||||
{
|
||||
WARNING("Not implemented");
|
||||
return AC_FAILURE;
|
||||
}
|
||||
|
||||
AcResult
|
||||
acNodeTransferMesh(const Node src_node, const Stream stream, Node* dst_node)
|
||||
{
|
||||
WARNING("Not implemented");
|
||||
return AC_FAILURE;
|
||||
}
|
||||
|
||||
AcResult
|
||||
acNodeIntegrateSubstep(const Node node, const Stream stream, const int step_number,
|
||||
const int3 start, const int3 end, const AcReal dt)
|
||||
{
|
||||
WARNING("Not implemented");
|
||||
return AC_FAILURE;
|
||||
}
|
||||
|
||||
AcResult
|
||||
acNodePeriodicBoundcondStep(const Node node, const Stream stream,
|
||||
const VertexBufferHandle vtxbuf_handle, const int3 start,
|
||||
const int3 end)
|
||||
{
|
||||
WARNING("Not implemented");
|
||||
return AC_FAILURE;
|
||||
}
|
||||
|
||||
AcResult
|
||||
acNodePeriodicBoundconds(const Node node, const Stream stream, const int3 start, const int3 end)
|
||||
{
|
||||
WARNING("Not implemented");
|
||||
return AC_FAILURE;
|
||||
}
|
||||
|
||||
AcResult
|
||||
acNodeReduceScal(const Node node, const Stream stream, const ReductionType rtype,
|
||||
const VertexBufferHandle vtxbuf_handle, AcReal* result)
|
||||
{
|
||||
WARNING("Not implemented");
|
||||
return AC_FAILURE;
|
||||
}
|
||||
|
||||
AcResult
|
||||
acNodeReduceVec(const Node node, const Stream stream_type, const ReductionType rtype,
|
||||
const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1,
|
||||
const VertexBufferHandle vtxbuf2, AcReal* result)
|
||||
{
|
||||
WARNING("Not implemented");
|
||||
return AC_FAILURE;
|
||||
}
|
||||
|
Reference in New Issue
Block a user