Intermediate changes to the revised node interface

2019-07-31 20:04:39 +03:00
parent 0a5d025172
commit fb0610c1ba
4 changed files with 502 additions and 15 deletions
--- a/src/core/node.cu
+++ b/src/core/node.cu
@@ -18,5 +18,470 @@
 */
 #include "astaroth_node.h"

+#include "astaroth_device.h"
+#include "errchk.h"
+#include "math_utils.h" // sum for reductions
+
+#define AC_GEN_STR(X) #X
+const char* intparam_names[]   = {AC_FOR_BUILTIN_INT_PARAM_TYPES(AC_GEN_STR) //
+                                AC_FOR_USER_INT_PARAM_TYPES(AC_GEN_STR)};
+const char* int3param_names[]  = {AC_FOR_BUILTIN_INT3_PARAM_TYPES(AC_GEN_STR) //
+                                 AC_FOR_USER_INT3_PARAM_TYPES(AC_GEN_STR)};
+const char* realparam_names[]  = {AC_FOR_BUILTIN_REAL_PARAM_TYPES(AC_GEN_STR) //
+                                 AC_FOR_USER_REAL_PARAM_TYPES(AC_GEN_STR)};
+const char* real3param_names[] = {AC_FOR_BUILTIN_REAL3_PARAM_TYPES(AC_GEN_STR) //
+                                  AC_FOR_USER_REAL3_PARAM_TYPES(AC_GEN_STR)};
+const char* vtxbuf_names[]     = {AC_FOR_VTXBUF_HANDLES(AC_GEN_STR)};
+#undef AC_GEN_STR
+
+static const int MAX_NUM_DEVICES = 32;
+static Node node                 = NULL;
+
 struct node_s {
+    int id;
+
+    int num_devices;
+    Device devices[MAX_NUM_DEVICES];
+
+    Grid grid;
+    Grid subgrid;
 };
+
+static int
+gridIdx(const Grid grid, const int3 idx)
+{
+    return idx.x + idx.y * grid.m.x + idx.z * grid.m.x * grid.m.y;
+}
+
+static int3
+gridIdx3d(const Grid grid, const int idx)
+{
+    return (int3){idx % grid.m.x, (idx % (grid.m.x * grid.m.y)) / grid.m.x,
+                  idx / (grid.m.x * grid.m.y)};
+}
+
+static void
+printInt3(const int3 vec)
+{
+    printf("(%d, %d, %d)", vec.x, vec.y, vec.z);
+}
+
+static inline void
+print(const AcMeshInfo config)
+{
+    for (int i = 0; i < NUM_INT_PARAMS; ++i)
+        printf("[%s]: %d\n", intparam_names[i], config.int_params[i]);
+    for (int i = 0; i < NUM_REAL_PARAMS; ++i)
+        printf("[%s]: %g\n", realparam_names[i], double(config.real_params[i]));
+}
+
+static void
+update_builtin_params(AcMeshInfo* config)
+{
+    config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER;
+    ///////////// PAD TEST
+    // config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER + PAD_SIZE;
+    ///////////// PAD TEST
+    config->int_params[AC_my] = config->int_params[AC_ny] + STENCIL_ORDER;
+    config->int_params[AC_mz] = config->int_params[AC_nz] + STENCIL_ORDER;
+
+    // Bounds for the computational domain, i.e. nx_min <= i < nx_max
+    config->int_params[AC_nx_min] = NGHOST;
+    config->int_params[AC_nx_max] = config->int_params[AC_nx_min] + config->int_params[AC_nx];
+    config->int_params[AC_ny_min] = NGHOST;
+    config->int_params[AC_ny_max] = config->int_params[AC_ny] + NGHOST;
+    config->int_params[AC_nz_min] = NGHOST;
+    config->int_params[AC_nz_max] = config->int_params[AC_nz] + NGHOST;
+
+    /* Additional helper params */
+    // Int helpers
+    config->int_params[AC_mxy]  = config->int_params[AC_mx] * config->int_params[AC_my];
+    config->int_params[AC_nxy]  = config->int_params[AC_nx] * config->int_params[AC_ny];
+    config->int_params[AC_nxyz] = config->int_params[AC_nxy] * config->int_params[AC_nz];
+
+#if VERBOSE_PRINTING // Defined in astaroth.h
+    printf("###############################################################\n");
+    printf("Config dimensions recalculated:\n");
+    print(*config);
+    printf("###############################################################\n");
+#endif
+}
+
+static Grid
+createGrid(const AcMeshInfo config)
+{
+    Grid grid;
+
+    grid.m = (int3){config.int_params[AC_mx], config.int_params[AC_my], config.int_params[AC_mz]};
+    grid.n = (int3){config.int_params[AC_nx], config.int_params[AC_ny], config.int_params[AC_nz]};
+
+    return grid;
+}
+
+AcResult
+acNodeCreate(const int id, const AcMeshInfo node_config, Node* node_handle)
+{
+    struct node_s* node = (struct device_s*)malloc(sizeof(*node));
+
+    // Get num_devices
+    ERRCHK_CUDA_ALWAYS(cudaGetDeviceCount(&num_devices));
+    if (num_devices < 1) {
+        ERROR("No CUDA devices found!");
+        return AC_FAILURE;
+    }
+    if (num_devices > MAX_NUM_DEVICES) {
+        WARNING("More devices found than MAX_NUM_DEVICES. Using only MAX_NUM_DEVICES");
+        num_devices = MAX_NUM_DEVICES;
+    }
+    if (!AC_MULTIGPU_ENABLED) {
+        WARNING("MULTIGPU_ENABLED was false. Using only one device");
+        num_devices = 1; // Use only one device if multi-GPU is not enabled
+    }
+    // Check that num_devices is divisible with AC_nz. This makes decomposing the
+    // problem domain to multiple GPUs much easier since we do not have to worry
+    // about remainders
+    ERRCHK_ALWAYS(config.int_params[AC_nz] % num_devices == 0);
+
+    // Decompose the problem domain
+    // The main grid
+    grid = createGrid(config);
+
+    // Subgrids
+    AcMeshInfo subgrid_config = config;
+    subgrid_config.int_params[AC_nz] /= num_devices;
+    update_builtin_params(&subgrid_config);
+    subgrid = createGrid(subgrid_config);
+
+    // Periodic boundary conditions become weird if the system can "fold unto itself".
+    ERRCHK_ALWAYS(subgrid.n.x >= STENCIL_ORDER);
+    ERRCHK_ALWAYS(subgrid.n.y >= STENCIL_ORDER);
+    ERRCHK_ALWAYS(subgrid.n.z >= STENCIL_ORDER);
+
+#if VERBOSE_PRINTING
+    // clang-format off
+    printf("Grid m ");   printInt3(grid.m);    printf("\n");
+    printf("Grid n ");   printInt3(grid.n);    printf("\n");
+    printf("Subrid m "); printInt3(subgrid.m); printf("\n");
+    printf("Subrid n "); printInt3(subgrid.n); printf("\n");
+    // clang-format on
+#endif
+
+    // Initialize the devices
+    for (int i = 0; i < num_devices; ++i) {
+        createDevice(i, subgrid_config, &devices[i]);
+        printDeviceInfo(devices[i]);
+    }
+
+    // Enable peer access
+    for (int i = 0; i < num_devices; ++i) {
+        const int front = (i + 1) % num_devices;
+        const int back  = (i - 1 + num_devices) % num_devices;
+
+        int can_access_front, can_access_back;
+        cudaDeviceCanAccessPeer(&can_access_front, i, front);
+        cudaDeviceCanAccessPeer(&can_access_back, i, back);
+#if VERBOSE_PRINTING
+        printf(
+            "Trying to enable peer access from %d to %d (can access: %d) and %d (can access: %d)\n",
+            i, front, can_access_front, back, can_access_back);
+#endif
+
+        cudaSetDevice(i);
+        if (can_access_front) {
+            ERRCHK_CUDA_ALWAYS(cudaDeviceEnablePeerAccess(front, 0));
+        }
+        if (can_access_back) {
+            ERRCHK_CUDA_ALWAYS(cudaDeviceEnablePeerAccess(back, 0));
+        }
+    }
+    acNodeSynchronizeStream(STREAM_ALL);
+
+    *node_handle = node;
+    return AC_SUCCESS;
+}
+
+AcResult
+acNodeDestroy(Node node)
+{
+    acSynchronizeStream(STREAM_ALL);
+
+    for (int i = 0; i < num_devices; ++i) {
+        destroyDevice(devices[i]);
+    }
+    free(node);
+
+    return AC_SUCCESS;
+}
+
+AcResult
+acNodePrintInfo(const Node node)
+{
+    WARNING("Not implemented");
+    return AC_FAILURE;
+}
+
+AcResult
+acNodeQueryDeviceConfiguration(const Node node, DeviceConfiguration* config)
+{
+    WARNING("Not implemented");
+    return AC_FAILURE;
+}
+
+AcResult
+acNodeAutoOptimize(const Node node)
+{
+    WARNING("Not implemented");
+    return AC_FAILURE;
+}
+
+AcResult
+acNodeSynchronizeStream(const Node node, const Stream stream)
+{
+    for (int i = 0; i < num_devices; ++i) {
+        synchronize(devices[i], stream);
+    }
+
+    return AC_SUCCESS;
+}
+
+AcResult
+acNodeSynchronizeVertexBuffer(const Node node, const Stream stream,
+                              const VertexBufferHandle vtxbuf_handle)
+{
+    // Exchanges the halos of subgrids
+    // After this step, the data within the main grid ranging from
+    // (0, 0, NGHOST) -> grid.m.x, grid.m.y, NGHOST + grid.n.z
+    // has been synchronized and transferred to appropriate subgrids
+
+    // We loop only to num_devices - 1 since the front and back plate of the grid is not
+    // transferred because their contents depend on the boundary conditions.
+
+    // IMPORTANT NOTE: the boundary conditions must be applied before calling this function!
+    // I.e. the halos of subgrids must contain up-to-date data!
+
+    const size_t num_vertices = subgrid.m.x * subgrid.m.y * NGHOST;
+
+    for (int i = 0; i < num_devices - 1; ++i) {
+        // ...|ooooxxx|... -> xxx|ooooooo|...
+        const int3 src = (int3){0, 0, subgrid.n.z};
+        const int3 dst = (int3){0, 0, 0};
+
+        const Device src_device = devices[i];
+        Device dst_device       = devices[i + 1];
+
+        acDeviceTransferVertexBufferWithOffset(src_device, stream, vtxbuf_handle, src, dst,
+                                               num_vertices, dst_device);
+    }
+    for (int i = 1; i < num_devices; ++i) {
+        // ...|ooooooo|xxx <- ...|xxxoooo|...
+        const int3 src = (int3){0, 0, NGHOST};
+        const int3 dst = (int3){0, 0, NGHOST + subgrid.n.z};
+
+        const Device src_device = devices[i];
+        Device dst_device       = devices[i - 1];
+
+        acDeviceTransferVertexBufferWithOffset(src_device, stream, vtxbuf_handle, src, dst,
+                                               num_vertices, dst_device);
+    }
+    return AC_SUCCESS;
+}
+
+AcResult
+acNodeSynchronizeMesh(const Node node, const Stream stream)
+{
+    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
+        acNodeSynchronizeVertexBuffer(node, stream, (VertexBufferHandle)i);
+    }
+
+    return AC_SUCCESS;
+}
+
+AcResult
+acNodeSwapBuffers(const Node node)
+{
+    for (int i = 0; i < num_devices; ++i) {
+        acDeviceSwapBuffers(devices[i]);
+    }
+    return AC_SUCCESS;
+}
+
+AcResult
+acNodeLoadConstant(const Node node, const Stream stream, const AcRealParam param,
+                   const AcReal value)
+{
+    for (int i = 0; i < num_devices; ++i) {
+        acDeviceLoadConstant(devices[i], stream, param, value);
+    }
+    return AC_SUCCESS;
+}
+
+AcResult
+acNodeLoadVertexBufferWithOffset(const Node node, const Stream stream, const AcMesh host_mesh,
+                                 const VertexBufferHandle vtxbuf_handle, const int3 src,
+                                 const int3 dst, const int num_vertices)
+{
+    // See the beginning of the file for an explanation of the index mapping
+    // #pragma omp parallel for
+    for (int i = 0; i < num_devices; ++i) {
+        const int3 d0 = (int3){0, 0, i * subgrid.n.z}; // DECOMPOSITION OFFSET HERE
+        const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.m.z};
+
+        const int3 s0 = src;
+        const int3 s1 = gridIdx3d(grid, gridIdx(grid, s0) + num_vertices);
+
+        const int3 da = max(s0, d0);
+        const int3 db = min(s1, d1);
+        /*
+        printf("Device %d\n", i);
+        printf("\ts0: "); printInt3(s0); printf("\n");
+        printf("\td0: "); printInt3(d0); printf("\n");
+        printf("\tda: "); printInt3(da); printf("\n");
+        printf("\tdb: "); printInt3(db); printf("\n");
+        printf("\td1: "); printInt3(d1); printf("\n");
+        printf("\ts1: "); printInt3(s1); printf("\n");
+        printf("\t-> %s to device %d\n", db.z >= da.z ? "Copy" : "Do not copy", i);
+        */
+        if (db.z >= da.z) {
+            const int copy_cells = gridIdx(subgrid, db) - gridIdx(subgrid, da);
+            // DECOMPOSITION OFFSET HERE
+            const int3 da_local = (int3){da.x, da.y, da.z - i * grid.n.z / num_devices};
+            // printf("\t\tcopy %d cells to local index ", copy_cells); printInt3(da_local);
+            // printf("\n");
+            acDeviceLoadVertexBufferWithOffset(devices[i], stream, host_mesh, vtxbuf_handle, da,
+                                               da_local, copy_cells);
+        }
+        // printf("\n");
+    }
+    return AC_SUCCESS;
+}
+
+AcResult
+acNodeLoadMeshWithOffset(const Node node, const Stream stream, const AcMesh host_mesh,
+                         const int3 src, const int3 dst, const int num_vertices)
+{
+    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
+        acNodeLoadVertexBufferWithOffset(node, stream, host_mesh, (VertexBufferHandle)i, src, dst,
+                                         num_vertices);
+    }
+    return AC_SUCCESS;
+}
+
+AcResult
+acNodeLoadVertexBuffer(const Node node, const Stream stream, const AcMesh host_mesh,
+                       const VertexBufferHandle vtxbuf_handle)
+{
+    WARNING("Not implemented");
+    return AC_FAILURE;
+}
+
+AcResult
+acNodeLoadMesh(const Node node, const Stream stream, const AcMesh host_mesh)
+{
+    WARNING("Not implemented");
+    return AC_FAILURE;
+}
+
+AcResult
+acNodeStoreVertexBufferWithOffset(const Node node, const Stream stream,
+                                  const VertexBufferHandle vtxbuf_handle, const int3 src,
+                                  const int3 dst, const int num_vertices, AcMesh* host_mesh)
+{
+    WARNING("Not implemented");
+    return AC_FAILURE;
+}
+
+AcResult
+acNodeStoreMeshWithOffset(const Node node, const Stream stream, const int3 src, const int3 dst,
+                          const int num_vertices, AcMesh* host_mesh)
+{
+    WARNING("Not implemented");
+    return AC_FAILURE;
+}
+
+AcResult
+acNodeStoreVertexBuffer(const Node node, const Stream stream,
+                        const VertexBufferHandle vtxbuf_handle, AcMesh* host_mesh)
+{
+    WARNING("Not implemented");
+    return AC_FAILURE;
+}
+
+AcResult
+acNodeStoreMesh(const Node node, const Stream stream, AcMesh* host_mesh)
+{
+    WARNING("Not implemented");
+    return AC_FAILURE;
+}
+
+AcResult
+acNodeTransferVertexBufferWithOffset(const Node src_node, const Stream stream,
+                                     const VertexBufferHandle vtxbuf_handle, const int3 src,
+                                     const int3 dst, const int num_vertices, Node dst_node)
+{
+    WARNING("Not implemented");
+    return AC_FAILURE;
+}
+
+AcResult
+acNodeTransferMeshWithOffset(const Node src_node, const Stream stream, const int3 src,
+                             const int3 dst, const int num_vertices, Node* dst_node)
+{
+    WARNING("Not implemented");
+    return AC_FAILURE;
+}
+
+AcResult
+acNodeTransferVertexBuffer(const Node src_node, const Stream stream,
+                           const VertexBufferHandle vtxbuf_handle, Node* dst_node)
+{
+    WARNING("Not implemented");
+    return AC_FAILURE;
+}
+
+AcResult
+acNodeTransferMesh(const Node src_node, const Stream stream, Node* dst_node)
+{
+    WARNING("Not implemented");
+    return AC_FAILURE;
+}
+
+AcResult
+acNodeIntegrateSubstep(const Node node, const Stream stream, const int step_number,
+                       const int3 start, const int3 end, const AcReal dt)
+{
+    WARNING("Not implemented");
+    return AC_FAILURE;
+}
+
+AcResult
+acNodePeriodicBoundcondStep(const Node node, const Stream stream,
+                            const VertexBufferHandle vtxbuf_handle, const int3 start,
+                            const int3 end)
+{
+    WARNING("Not implemented");
+    return AC_FAILURE;
+}
+
+AcResult
+acNodePeriodicBoundconds(const Node node, const Stream stream, const int3 start, const int3 end)
+{
+    WARNING("Not implemented");
+    return AC_FAILURE;
+}
+
+AcResult
+acNodeReduceScal(const Node node, const Stream stream, const ReductionType rtype,
+                 const VertexBufferHandle vtxbuf_handle, AcReal* result)
+{
+    WARNING("Not implemented");
+    return AC_FAILURE;
+}
+
+AcResult
+acNodeReduceVec(const Node node, const Stream stream_type, const ReductionType rtype,
+                const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1,
+                const VertexBufferHandle vtxbuf2, AcReal* result)
+{
+    WARNING("Not implemented");
+    return AC_FAILURE;
+}