From ed7cf3f5401d24f2b848792730c63a86da3883fd Mon Sep 17 00:00:00 2001
From: jpekkila
Date: Thu, 26 Mar 2020 15:02:37 +0200
Subject: [PATCH] Added a production-ready interface for multi-node runs with
 Astaroth using MPI

---
 include/astaroth.h |  56 +++-
 src/core/device.cc | 773 ++++++++++++++++++++++++++++++++++++-----------
 2 files changed, 661 insertions(+), 168 deletions(-)

diff --git a/include/astaroth.h b/include/astaroth.h
index af03634..b170796 100644
--- a/include/astaroth.h
+++ b/include/astaroth.h
@@ -116,10 +116,10 @@ typedef enum {
 #define _UNUSED __attribute__((unused)) // Does not give a warning if unused
 #define AC_GEN_STR(X) #X,
-static const char* intparam_names[] _UNUSED = {AC_FOR_USER_INT_PARAM_TYPES(AC_GEN_STR) "-end-"};
-static const char* int3param_names[] _UNUSED = {AC_FOR_USER_INT3_PARAM_TYPES(AC_GEN_STR) "-end-"};
-static const char* realparam_names[] _UNUSED = {AC_FOR_USER_REAL_PARAM_TYPES(AC_GEN_STR) "-end-"};
-static const char* real3param_names[] _UNUSED = {AC_FOR_USER_REAL3_PARAM_TYPES(AC_GEN_STR) "-end-"};
+static const char* intparam_names[] _UNUSED    = {AC_FOR_USER_INT_PARAM_TYPES(AC_GEN_STR) "-end-"};
+static const char* int3param_names[] _UNUSED   = {AC_FOR_USER_INT3_PARAM_TYPES(AC_GEN_STR) "-end-"};
+static const char* realparam_names[] _UNUSED   = {AC_FOR_USER_REAL_PARAM_TYPES(AC_GEN_STR) "-end-"};
+static const char* real3param_names[] _UNUSED  = {AC_FOR_USER_REAL3_PARAM_TYPES(AC_GEN_STR) "-end-"};
 static const char* scalararray_names[] _UNUSED = {AC_FOR_SCALARARRAY_HANDLES(AC_GEN_STR) "-end-"};
 static const char* vtxbuf_names[] _UNUSED = {AC_FOR_VTXBUF_HANDLES(AC_GEN_STR) "-end-"};
 #undef AC_GEN_STR
@@ -144,17 +144,20 @@ typedef struct device_s* Device; // Opaque pointer to device_s. Analogous to dis
 // Node
 typedef struct node_s* Node; // Opaque pointer to node_s.
 
+// Grid
+// typedef struct grid_s* Grid; // Opaque pointer to grid_s
+
 typedef struct {
     int3 m;
     int3 n;
-} Grid; // WARNING: Grid structure may be deprecated in future versions (TODO)
+} GridDims;
 
 typedef struct {
     int num_devices;
     Device* devices;
 
-    Grid grid;
-    Grid subgrid;
+    GridDims grid;
+    GridDims subgrid;
 } DeviceConfiguration;
 
 #ifdef __cplusplus
@@ -280,6 +283,45 @@ int acGetNumDevicesPerNode(void);
 /** */
 Node acGetNode(void);
 
+/*
+ * =============================================================================
+ * Grid interface
+ * =============================================================================
+ */
+#if AC_MPI_ENABLED
+/**
+Initializes all available devices.
+
+The program must be compiled and run with MPI.
+
+Exactly one MPI process must be allocated per GPU, i.e. each node must run as
+many processes as it has GPUs.
+
+Devices in the grid are configured based on the contents of AcMeshInfo.
+ */
+AcResult acGridInit(const AcMeshInfo info);
+
+/**
+Resets all devices on the current grid.
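+
+A typical lifecycle of the grid interface is sketched below. This is an
+illustrative sketch only, not normative documentation: it assumes the caller
+initializes and finalizes MPI itself and has loaded a valid AcMeshInfo, e.g.
+with acLoadConfig().
+
+    MPI_Init(NULL, NULL); // MPI must be running before acGridInit is called
+
+    AcMeshInfo info;
+    acLoadConfig(AC_DEFAULT_CONFIG, &info);
+
+    acGridInit(info);                             // one MPI process per GPU
+    acGridIntegrate(STREAM_DEFAULT, FLT_EPSILON); // one step (3 RK substeps)
+    acGridPeriodicBoundconds(STREAM_DEFAULT);
+    acGridQuit();                                 // frees devices and the submesh
+
+    MPI_Finalize(); // acGridQuit does not finalize MPI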
+ */
+AcResult acGridQuit(void);
+
+/** */
+AcResult acGridSynchronizeStream(const Stream stream);
+
+/** */
+AcResult acGridLoadMesh(const AcMesh host_mesh, const Stream stream);
+
+/** */
+AcResult acGridStoreMesh(const Stream stream, AcMesh* host_mesh);
+
+/** */
+AcResult acGridIntegrate(const Stream stream, const AcReal dt);
+
+/** */
+AcResult acGridPeriodicBoundconds(const Stream stream);
+#endif // AC_MPI_ENABLED
+
 /*
  * =============================================================================
  * Node interface
diff --git a/src/core/device.cc b/src/core/device.cc
index 631e287..af27a6e 100644
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -833,6 +833,13 @@ acDestroyCommData(const Device device, CommData* data)
     data->dims = (int3){-1, -1, -1};
 }
 
+static void
+acSyncCommData(const CommData data)
+{
+    for (size_t i = 0; i < data.count; ++i)
+        cudaStreamSynchronize(data.streams[i]);
+}
+
 static void
 acPackCommData(const Device device, const int3* a0s, CommData* data)
 {
@@ -980,6 +987,226 @@ acTransferCommDataWait(const CommData data)
     }
 }
 
+typedef struct {
+    Device device;
+    AcMesh submesh;
+    int3 decomposition;
+    bool initialized;
+
+    int3 nn;
+    CommData corner_data;
+    CommData edgex_data;
+    CommData edgey_data;
+    CommData edgez_data;
+    CommData sidexy_data;
+    CommData sidexz_data;
+    CommData sideyz_data;
+} Grid;
+
+static Grid grid = {};
+
+AcResult
+acGridSynchronizeStream(const Stream stream)
+{
+    ERRCHK(grid.initialized);
+
+    acDeviceSynchronizeStream(grid.device, stream);
+    MPI_Barrier(MPI_COMM_WORLD);
+    return AC_SUCCESS;
+}
+
+AcResult
+acGridInit(const AcMeshInfo info)
+{
+    ERRCHK(!grid.initialized);
+
+    // Check that MPI is initialized
+    int nprocs, pid;
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
+
+    char processor_name[MPI_MAX_PROCESSOR_NAME];
+    int name_len;
+    MPI_Get_processor_name(processor_name, &name_len);
+    printf("Processor %s. Process %d of %d.\n", processor_name, pid, nprocs);
+
+    // Decompose
+    AcMeshInfo submesh_info  = info;
+    const int3 decomposition = decompose(nprocs);
+    const int3 pid3d         = getPid3D(pid, decomposition);
+
+    printf("Decomposition: %d, %d, %d\n", decomposition.x, decomposition.y, decomposition.z);
+    printf("Process %d: (%d, %d, %d)\n", pid, pid3d.x, pid3d.y, pid3d.z);
+    ERRCHK_ALWAYS(info.int_params[AC_nx] % decomposition.x == 0);
+    ERRCHK_ALWAYS(info.int_params[AC_ny] % decomposition.y == 0);
+    ERRCHK_ALWAYS(info.int_params[AC_nz] % decomposition.z == 0);
+
+    const int submesh_nx = info.int_params[AC_nx] / decomposition.x;
+    const int submesh_ny = info.int_params[AC_ny] / decomposition.y;
+    const int submesh_nz = info.int_params[AC_nz] / decomposition.z;
+    submesh_info.int_params[AC_nx] = submesh_nx;
+    submesh_info.int_params[AC_ny] = submesh_ny;
+    submesh_info.int_params[AC_nz] = submesh_nz;
+    submesh_info.int3_params[AC_global_grid_n] = (int3){
+        info.int_params[AC_nx],
+        info.int_params[AC_ny],
+        info.int_params[AC_nz],
+    };
+    submesh_info.int3_params[AC_multigpu_offset] = pid3d *
+                                                   (int3){submesh_nx, submesh_ny, submesh_nz};
+    acUpdateBuiltinParams(&submesh_info);
+
+    // GPU alloc
+    int devices_per_node = -1;
+    cudaGetDeviceCount(&devices_per_node);
+
+    Device device;
+    acDeviceCreate(pid % devices_per_node, submesh_info, &device);
+
+    // CPU alloc
+    AcMesh submesh;
+    acMeshCreate(submesh_info, &submesh);
+
+    // Setup the global grid structure
+    grid.device        = device;
+    grid.submesh       = submesh;
+    grid.decomposition = decomposition;
+    grid.initialized   = true;
+
+    // Configure
+    const int3 nn = (int3){
+        device->local_config.int_params[AC_nx],
+        device->local_config.int_params[AC_ny],
+        device->local_config.int_params[AC_nz],
+    };
+
+    // Corners
+    const int3 corner_a0s[] = {
+        (int3){NGHOST, NGHOST, NGHOST}, //
+        (int3){nn.x, NGHOST, NGHOST},   //
+        (int3){NGHOST, nn.y, NGHOST},   //
+        (int3){nn.x, nn.y, NGHOST},     //
+
+        (int3){NGHOST, NGHOST, nn.z}, //
+        (int3){nn.x, NGHOST, nn.z},   //
+        (int3){NGHOST, nn.y, nn.z},   //
+        (int3){nn.x, nn.y, nn.z},
+    };
+
+    // Edges X
+    const int3 edgex_a0s[] = {
+        (int3){NGHOST, NGHOST, NGHOST}, //
+        (int3){NGHOST, nn.y, NGHOST},   //
+
+        (int3){NGHOST, NGHOST, nn.z}, //
+        (int3){NGHOST, nn.y, nn.z},   //
+    };
+
+    // Edges Y
+    const int3 edgey_a0s[] = {
+        (int3){NGHOST, NGHOST, NGHOST}, //
+        (int3){nn.x, NGHOST, NGHOST},   //
+
+        (int3){NGHOST, NGHOST, nn.z}, //
+        (int3){nn.x, NGHOST, nn.z},   //
+    };
+
+    // Edges Z
+    const int3 edgez_a0s[] = {
+        (int3){NGHOST, NGHOST, NGHOST}, //
+        (int3){nn.x, NGHOST, NGHOST},   //
+
+        (int3){NGHOST, nn.y, NGHOST}, //
+        (int3){nn.x, nn.y, NGHOST},   //
+    };
+
+    // Sides XY
+    const int3 sidexy_a0s[] = {
+        (int3){NGHOST, NGHOST, NGHOST}, //
+        (int3){NGHOST, NGHOST, nn.z},   //
+    };
+
+    // Sides XZ
+    const int3 sidexz_a0s[] = {
+        (int3){NGHOST, NGHOST, NGHOST}, //
+        (int3){NGHOST, nn.y, NGHOST},   //
+    };
+
+    // Sides YZ
+    const int3 sideyz_a0s[] = {
+        (int3){NGHOST, NGHOST, NGHOST}, //
+        (int3){nn.x, NGHOST, NGHOST},   //
+    };
+
+    const int3 corner_dims = (int3){NGHOST, NGHOST, NGHOST};
+    const int3 edgex_dims  = (int3){nn.x, NGHOST, NGHOST};
+    const int3 edgey_dims  = (int3){NGHOST, nn.y, NGHOST};
+    const int3 edgez_dims  = (int3){NGHOST, NGHOST, nn.z};
+    const int3 sidexy_dims = (int3){nn.x, nn.y, NGHOST};
+    const int3 sidexz_dims = (int3){nn.x, NGHOST, nn.z};
+    const int3 sideyz_dims = (int3){NGHOST, nn.y, nn.z};
+    grid.nn          = nn;
+    grid.corner_data = acCreateCommData(device, corner_dims, ARRAY_SIZE(corner_a0s));
+    grid.edgex_data  = acCreateCommData(device, edgex_dims, ARRAY_SIZE(edgex_a0s));
+    grid.edgey_data  = acCreateCommData(device, edgey_dims, ARRAY_SIZE(edgey_a0s));
+    grid.edgez_data  = acCreateCommData(device, edgez_dims, ARRAY_SIZE(edgez_a0s));
+    grid.sidexy_data = acCreateCommData(device, sidexy_dims, ARRAY_SIZE(sidexy_a0s));
+    grid.sidexz_data = acCreateCommData(device, sidexz_dims, ARRAY_SIZE(sidexz_a0s));
+    grid.sideyz_data = acCreateCommData(device, sideyz_dims, ARRAY_SIZE(sideyz_a0s));
+
+    acGridSynchronizeStream(STREAM_ALL);
+    return AC_SUCCESS;
+}
+
+AcResult
+acGridQuit(void)
+{
+    ERRCHK(grid.initialized);
+    acGridSynchronizeStream(STREAM_ALL);
+
+    acDestroyCommData(grid.device, &grid.corner_data);
+    acDestroyCommData(grid.device, &grid.edgex_data);
+    acDestroyCommData(grid.device, &grid.edgey_data);
+    acDestroyCommData(grid.device, &grid.edgez_data);
+    acDestroyCommData(grid.device, &grid.sidexy_data);
+    acDestroyCommData(grid.device, &grid.sidexz_data);
+    acDestroyCommData(grid.device, &grid.sideyz_data);
+
+    grid.initialized   = false;
+    grid.decomposition = (int3){-1, -1, -1};
+    acMeshDestroy(&grid.submesh);
+    acDeviceDestroy(grid.device);
+
+    acGridSynchronizeStream(STREAM_ALL);
+    return AC_SUCCESS;
+}
+
+AcResult
+acGridLoadMesh(const AcMesh host_mesh, const Stream stream)
+{
+    ERRCHK(grid.initialized);
+    acGridSynchronizeStream(stream);
+
+    acDeviceDistributeMeshMPI(host_mesh, grid.decomposition, &grid.submesh);
+    acDeviceLoadMesh(grid.device, stream, grid.submesh);
+
+    return AC_SUCCESS;
+}
+
+AcResult
+acGridStoreMesh(const Stream stream, AcMesh* host_mesh)
+{
+    ERRCHK(grid.initialized);
+    acGridSynchronizeStream(stream);
+
+    acDeviceStoreMesh(grid.device, stream, &grid.submesh);
+    acGridSynchronizeStream(stream);
+
+    acDeviceGatherMeshMPI(grid.submesh, grid.decomposition, host_mesh);
+
+    return AC_SUCCESS;
+}
+
 static AcResult
 acDeviceIntegrateMPI(const Device device, const AcReal dt)
 {
@@ -1518,186 +1745,410 @@ acDeviceCommunicateHalosMPI(const Device device)
     return AC_SUCCESS;
 }
 
-/*
-static int3
-findOptimalDecomposition(const int3 nn)
-{
-    int3 decomposition = (int3){1, 1, 1};
-}*/
-
 AcResult
-acDeviceRunMPITest(void)
+acGridIntegrate(const Stream stream, const AcReal dt)
 {
-    MPI_Init(NULL, NULL);
+    ERRCHK(grid.initialized);
+    acGridSynchronizeStream(stream);
 
-    int nprocs, pid;
-    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
-    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
+    const Device device  = grid.device;
+    const int3 nn        = grid.nn;
+    CommData corner_data = grid.corner_data;
+    CommData edgex_data  = grid.edgex_data;
+    CommData edgey_data  = grid.edgey_data;
+    CommData edgez_data  = grid.edgez_data;
+    CommData sidexy_data = grid.sidexy_data;
+    CommData sidexz_data = grid.sidexz_data;
+    CommData sideyz_data = grid.sideyz_data;
 
-    char processor_name[MPI_MAX_PROCESSOR_NAME];
-    int name_len;
-    MPI_Get_processor_name(processor_name, &name_len);
-    printf("Processor %s. Process %d of %d.\n", processor_name, pid, nprocs);
 
+    // Corners
+    const int3 corner_a0s[] = {
+        (int3){NGHOST, NGHOST, NGHOST}, //
+        (int3){nn.x, NGHOST, NGHOST},   //
+        (int3){NGHOST, nn.y, NGHOST},   //
+        (int3){nn.x, nn.y, NGHOST},     //
 
-    // Create model and candidate meshes
-    AcMeshInfo info;
-    acLoadConfig(AC_DEFAULT_CONFIG, &info);
-    info.real_params[AC_inv_dsx] = AcReal(1.0) / info.real_params[AC_dsx];
-    info.real_params[AC_inv_dsy] = AcReal(1.0) / info.real_params[AC_dsy];
-    info.real_params[AC_inv_dsz] = AcReal(1.0) / info.real_params[AC_dsz];
-    info.real_params[AC_cs2_sound] = info.real_params[AC_cs_sound] * info.real_params[AC_cs_sound];
-
-    AcMesh model, candidate;
-
-    // Master CPU
-    if (pid == 0) {
-        acMeshCreate(info, &model);
-        acMeshCreate(info, &candidate);
-
-        acMeshRandomize(&model);
-        acMeshRandomize(&candidate);
-    }
-
-    /// DECOMPOSITION & SUBMESH ///////////////////////////////////
-    AcMeshInfo submesh_info = info;
-    const int3 decomposition = decompose(nprocs);
-    const int3 pid3d = getPid3D(pid, decomposition);
-
-    printf("Decomposition: %d, %d, %d\n", decomposition.x, decomposition.y, decomposition.z);
-    printf("Process %d: (%d, %d, %d)\n", pid, pid3d.x, pid3d.y, pid3d.z);
-    ERRCHK_ALWAYS(info.int_params[AC_nx] % decomposition.x == 0);
-    ERRCHK_ALWAYS(info.int_params[AC_ny] % decomposition.y == 0);
-    ERRCHK_ALWAYS(info.int_params[AC_nz] % decomposition.z == 0);
-
-    const int submesh_nx = info.int_params[AC_nx] / decomposition.x;
-    const int submesh_ny = info.int_params[AC_ny] / decomposition.y;
-    const int submesh_nz = info.int_params[AC_nz] / decomposition.z;
-    submesh_info.int_params[AC_nx] = submesh_nx;
-    submesh_info.int_params[AC_ny] = submesh_ny;
-    submesh_info.int_params[AC_nz] = submesh_nz;
-    submesh_info.int3_params[AC_global_grid_n] = (int3){
-        info.int_params[AC_nx],
-        info.int_params[AC_ny],
-        info.int_params[AC_nz],
+        (int3){NGHOST, NGHOST, nn.z}, //
+        (int3){nn.x, NGHOST, nn.z},   //
+        (int3){NGHOST, nn.y, nn.z},   //
+        (int3){nn.x, nn.y, nn.z},
     };
-    submesh_info.int3_params[AC_multigpu_offset] = pid3d *
-                                                   (int3){submesh_nx, submesh_ny, submesh_nz};
-    acUpdateBuiltinParams(&submesh_info);
+    const int3 corner_b0s[] = {
+        (int3){0, 0, 0},
+        (int3){NGHOST + nn.x, 0, 0},
+        (int3){0, NGHOST + nn.y, 0},
+        (int3){NGHOST + nn.x, NGHOST + nn.y, 0},
 
-    AcMesh submesh;
-    acMeshCreate(submesh_info, &submesh);
-    acMeshRandomize(&submesh);
-    ////////////////////////////////////////////////////////////////
+        (int3){0, 0, NGHOST + nn.z},
+        (int3){NGHOST + nn.x, 0, NGHOST + nn.z},
+        (int3){0, NGHOST + nn.y, NGHOST + nn.z},
+        (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST + nn.z},
+    };
 
-    // GPU INIT ////////////////////////////////////////////////////
-    int devices_per_node = -1;
-    cudaGetDeviceCount(&devices_per_node);
+    // Edges X
+    const int3 edgex_a0s[] = {
+        (int3){NGHOST, NGHOST, NGHOST}, //
+        (int3){NGHOST, nn.y, NGHOST},   //
 
-    Device device;
-    acDeviceCreate(pid % devices_per_node, submesh_info, &device);
-    // TODO enable peer access
-    ////////////////////////////////////////////////////////////////
+        (int3){NGHOST, NGHOST, nn.z}, //
+        (int3){NGHOST, nn.y, nn.z},   //
+    };
+    const int3 edgex_b0s[] = {
+        (int3){NGHOST, 0, 0},
+        (int3){NGHOST, NGHOST + nn.y, 0},
 
-    // DISTRIBUTE & LOAD //////////////////////////////////////////
-    acDeviceDistributeMeshMPI(model, decomposition, &submesh);
-    acDeviceLoadMesh(device, STREAM_DEFAULT, submesh);
-    ///////////////////////////////////////////////////////////////
+        (int3){NGHOST, 0, NGHOST + nn.z},
+        (int3){NGHOST, NGHOST + nn.y, NGHOST + nn.z},
+    };
 
-    // SYNC //////////////////////////////////////////////////////
-    acDeviceSynchronizeStream(device, STREAM_ALL);
-    MPI_Barrier(MPI_COMM_WORLD);
-    //////////////////////////////////////////////////////////////
+    // Edges Y
+    const int3 edgey_a0s[] = {
+        (int3){NGHOST, NGHOST, NGHOST}, //
+        (int3){nn.x, NGHOST, NGHOST},   //
 
-    // TIMING START //////////////////////////////////////////////
-    acDeviceSynchronizeStream(device, STREAM_ALL);
-    MPI_Barrier(MPI_COMM_WORLD);
-    Timer t;
-    timer_reset(&t);
-    //////////////////////////////////////////////////////////////
+        (int3){NGHOST, NGHOST, nn.z}, //
+        (int3){nn.x, NGHOST, nn.z},   //
+    };
+    const int3 edgey_b0s[] = {
+        (int3){0, NGHOST, 0},
+        (int3){NGHOST + nn.x, NGHOST, 0},
 
-    // INTEGRATION & BOUNDCONDS////////////////////////////////////
-    // acDeviceCommunicateHalosMPI(device);
-    acDeviceIntegrateMPI(device, FLT_EPSILON);
-    acDeviceCommunicateHalosMPI(device);
-    ///////////////////////////////////////////////////////////////
+        (int3){0, NGHOST, NGHOST + nn.z},
+        (int3){NGHOST + nn.x, NGHOST, NGHOST + nn.z},
+    };
 
-    // TIMING END //////////////////////////////////////////////
-    acDeviceSynchronizeStream(device, STREAM_ALL);
-    MPI_Barrier(MPI_COMM_WORLD);
-    if (!pid) {
-        timer_diff_print(t);
+    // Edges Z
+    const int3 edgez_a0s[] = {
+        (int3){NGHOST, NGHOST, NGHOST}, //
+        (int3){nn.x, NGHOST, NGHOST},   //
+
+        (int3){NGHOST, nn.y, NGHOST}, //
+        (int3){nn.x, nn.y, NGHOST},   //
+    };
+    const int3 edgez_b0s[] = {
+        (int3){0, 0, NGHOST},
+        (int3){NGHOST + nn.x, 0, NGHOST},
+
+        (int3){0, NGHOST + nn.y, NGHOST},
+        (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST},
+    };
+
+    // Sides XY
+    const int3 sidexy_a0s[] = {
+        (int3){NGHOST, NGHOST, NGHOST}, //
+        (int3){NGHOST, NGHOST, nn.z},   //
+    };
+    const int3 sidexy_b0s[] = {
+        (int3){NGHOST, NGHOST, 0},             //
+        (int3){NGHOST, NGHOST, NGHOST + nn.z}, //
+    };
+
+    // Sides XZ
+    const int3 sidexz_a0s[] = {
+        (int3){NGHOST, NGHOST, NGHOST}, //
+        (int3){NGHOST, nn.y, NGHOST},   //
+    };
+    const int3 sidexz_b0s[] = {
+        (int3){NGHOST, 0, NGHOST},             //
+        (int3){NGHOST, NGHOST + nn.y, NGHOST}, //
+    };
+
+    // Sides YZ
+    const int3 sideyz_a0s[] = {
+        (int3){NGHOST, NGHOST, NGHOST}, //
+        (int3){nn.x, NGHOST, NGHOST},   //
+    };
+    const int3 sideyz_b0s[] = {
+        (int3){0, NGHOST, NGHOST},             //
+        (int3){NGHOST + nn.x, NGHOST, NGHOST}, //
+    };
+
+    for (int isubstep = 0; isubstep < 3; ++isubstep) {
+        acPackCommData(device, corner_a0s, &corner_data);
+        acPackCommData(device, edgex_a0s, &edgex_data);
+        acPackCommData(device, edgey_a0s, &edgey_data);
+        acPackCommData(device, edgez_a0s, &edgez_data);
+        acPackCommData(device, sidexy_a0s, &sidexy_data);
+        acPackCommData(device, sidexz_a0s, &sidexz_data);
+        acPackCommData(device, sideyz_a0s, &sideyz_data);
+
+#if MPI_GPUDIRECT_DISABLED
+        acTransferCommDataToHost(device, &corner_data);
+        acTransferCommDataToHost(device, &edgex_data);
+        acTransferCommDataToHost(device, &edgey_data);
+        acTransferCommDataToHost(device, &edgez_data);
+        acTransferCommDataToHost(device, &sidexy_data);
+        acTransferCommDataToHost(device, &sidexz_data);
+        acTransferCommDataToHost(device, &sideyz_data);
+#endif
+
+        //////////// INNER INTEGRATION //////////////
+        {
+            const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
+            const int3 m2 = nn;
+            acDeviceIntegrateSubstep(device, STREAM_16, isubstep, m1, m2, dt);
+        }
+        ////////////////////////////////////////////
+
+        acTransferCommData(device, corner_a0s, corner_b0s, &corner_data);
+        acTransferCommData(device, edgex_a0s, edgex_b0s, &edgex_data);
+        acTransferCommData(device, edgey_a0s, edgey_b0s, &edgey_data);
+        acTransferCommData(device, edgez_a0s, edgez_b0s, &edgez_data);
+        acTransferCommData(device, sidexy_a0s, sidexy_b0s, &sidexy_data);
+        acTransferCommData(device, sidexz_a0s, sidexz_b0s, &sidexz_data);
+        acTransferCommData(device, sideyz_a0s, sideyz_b0s, &sideyz_data);
+
+        acTransferCommDataWait(corner_data);
+        acTransferCommDataWait(edgex_data);
+        acTransferCommDataWait(edgey_data);
+        acTransferCommDataWait(edgez_data);
+        acTransferCommDataWait(sidexy_data);
+        acTransferCommDataWait(sidexz_data);
+        acTransferCommDataWait(sideyz_data);
+
+#if MPI_GPUDIRECT_DISABLED
+        acTransferCommDataToDevice(device, &corner_data);
+        acTransferCommDataToDevice(device, &edgex_data);
+        acTransferCommDataToDevice(device, &edgey_data);
+        acTransferCommDataToDevice(device, &edgez_data);
+        acTransferCommDataToDevice(device, &sidexy_data);
+        acTransferCommDataToDevice(device, &sidexz_data);
+        acTransferCommDataToDevice(device, &sideyz_data);
+#endif
+
+        acUnpackCommData(device, corner_b0s, &corner_data);
+        acUnpackCommData(device, edgex_b0s, &edgex_data);
+        acUnpackCommData(device, edgey_b0s, &edgey_data);
+        acUnpackCommData(device, edgez_b0s, &edgez_data);
+        acUnpackCommData(device, sidexy_b0s, &sidexy_data);
+        acUnpackCommData(device, sidexz_b0s, &sidexz_data);
+        acUnpackCommData(device, sideyz_b0s, &sideyz_data);
+        //////////// OUTER INTEGRATION //////////////
+
+        // Wait for unpacking
+        acSyncCommData(corner_data);
+        acSyncCommData(edgex_data);
+        acSyncCommData(edgey_data);
+        acSyncCommData(edgez_data);
+        acSyncCommData(sidexy_data);
+        acSyncCommData(sidexz_data);
+        acSyncCommData(sideyz_data);
+        { // Front
+            const int3 m1 = (int3){NGHOST, NGHOST, NGHOST};
+            const int3 m2 = m1 + (int3){nn.x, nn.y, NGHOST};
+            acDeviceIntegrateSubstep(device, STREAM_0, isubstep, m1, m2, dt);
+        }
+        { // Back
+            const int3 m1 = (int3){NGHOST, NGHOST, nn.z};
+            const int3 m2 = m1 + (int3){nn.x, nn.y, NGHOST};
+            acDeviceIntegrateSubstep(device, STREAM_1, isubstep, m1, m2, dt);
+        }
+        { // Bottom
+            const int3 m1 = (int3){NGHOST, NGHOST, 2 * NGHOST};
+            const int3 m2 = m1 + (int3){nn.x, NGHOST, nn.z - 2 * NGHOST};
+            acDeviceIntegrateSubstep(device, STREAM_2, isubstep, m1, m2, dt);
+        }
+        { // Top
+            const int3 m1 = (int3){NGHOST, nn.y, 2 * NGHOST};
+            const int3 m2 = m1 + (int3){nn.x, NGHOST, nn.z - 2 * NGHOST};
+            acDeviceIntegrateSubstep(device, STREAM_3, isubstep, m1, m2, dt);
+        }
+        { // Left
+            const int3 m1 = (int3){NGHOST, 2 * NGHOST, 2 * NGHOST};
+            const int3 m2 = m1 + (int3){NGHOST, nn.y - 2 * NGHOST, nn.z - 2 * NGHOST};
+            acDeviceIntegrateSubstep(device, STREAM_4, isubstep, m1, m2, dt);
+        }
+        { // Right
+            const int3 m1 = (int3){nn.x, 2 * NGHOST, 2 * NGHOST};
+            const int3 m2 = m1 + (int3){NGHOST, nn.y - 2 * NGHOST, nn.z - 2 * NGHOST};
+            acDeviceIntegrateSubstep(device, STREAM_5, isubstep, m1, m2, dt);
+        }
+        acDeviceSwapBuffers(device);
+        acDeviceSynchronizeStream(device, STREAM_ALL); // Wait until inner and outer done
+        ////////////////////////////////////////////
     }
-    MPI_Barrier(MPI_COMM_WORLD);
-    //////////////////////////////////////////////////////////////
 
-    // STORE & GATHER /////////////////////////////////////////////
-    MPI_Barrier(MPI_COMM_WORLD);
-    acDeviceStoreMesh(device, STREAM_DEFAULT, &submesh);
-    acDeviceSynchronizeStream(device, STREAM_DEFAULT);
-    acDeviceGatherMeshMPI(submesh, decomposition, &candidate);
-    //////////////////////////////////////////////////////////////
-
-    // VERIFY ////////////////////////////////////////////////////
-    if (pid == 0) {
-        acModelIntegrateStep(model, FLT_EPSILON);
-        acMeshApplyPeriodicBounds(&model);
-
-        acVerifyMesh(model, candidate);
-        acMeshDestroy(&model);
-        acMeshDestroy(&candidate);
-    }
-    //////////////////////////////////////////////////////////////
-
-    // DESTROY ///////////////////////////////////////////////////
-    acDeviceDestroy(device);
-    acMeshDestroy(&submesh);
-    MPI_Finalize();
-    //////////////////////////////////////////////////////////////
     return AC_SUCCESS;
 }
-#else
+
 AcResult
-acDeviceRunMPITest(void)
+acGridPeriodicBoundconds(const Stream stream)
 {
-    WARNING("MPI was not enabled but acDeviceRunMPITest() was called");
-    return AC_FAILURE;
+    ERRCHK(grid.initialized);
+    acGridSynchronizeStream(stream);
+
+    const Device device  = grid.device;
+    const int3 nn        = grid.nn;
+    CommData corner_data = grid.corner_data;
+    CommData edgex_data  = grid.edgex_data;
+    CommData edgey_data  = grid.edgey_data;
+    CommData edgez_data  = grid.edgez_data;
+    CommData sidexy_data = grid.sidexy_data;
+    CommData sidexz_data = grid.sidexz_data;
+    CommData sideyz_data = grid.sideyz_data;
+
+    // Corners
+    const int3 corner_a0s[] = {
+        (int3){NGHOST, NGHOST, NGHOST}, //
+        (int3){nn.x, NGHOST, NGHOST},   //
+        (int3){NGHOST, nn.y, NGHOST},   //
+        (int3){nn.x, nn.y, NGHOST},     //
+
+        (int3){NGHOST, NGHOST, nn.z}, //
+        (int3){nn.x, NGHOST, nn.z},   //
+        (int3){NGHOST, nn.y, nn.z},   //
+        (int3){nn.x, nn.y, nn.z},
+    };
+    const int3 corner_b0s[] = {
+        (int3){0, 0, 0},
+        (int3){NGHOST + nn.x, 0, 0},
+        (int3){0, NGHOST + nn.y, 0},
+        (int3){NGHOST + nn.x, NGHOST + nn.y, 0},
+
+        (int3){0, 0, NGHOST + nn.z},
+        (int3){NGHOST + nn.x, 0, NGHOST + nn.z},
+        (int3){0, NGHOST + nn.y, NGHOST + nn.z},
+        (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST + nn.z},
+    };
+
+    // Edges X
+    const int3 edgex_a0s[] = {
+        (int3){NGHOST, NGHOST, NGHOST}, //
+        (int3){NGHOST, nn.y, NGHOST},   //
+
+        (int3){NGHOST, NGHOST, nn.z}, //
+        (int3){NGHOST, nn.y, nn.z},   //
+    };
+    const int3 edgex_b0s[] = {
+        (int3){NGHOST, 0, 0},
+        (int3){NGHOST, NGHOST + nn.y, 0},
+
+        (int3){NGHOST, 0, NGHOST + nn.z},
+        (int3){NGHOST, NGHOST + nn.y, NGHOST + nn.z},
+    };
+
+    // Edges Y
+    const int3 edgey_a0s[] = {
+        (int3){NGHOST, NGHOST, NGHOST}, //
+        (int3){nn.x, NGHOST, NGHOST},   //
+
+        (int3){NGHOST, NGHOST, nn.z}, //
+        (int3){nn.x, NGHOST, nn.z},   //
+    };
+    const int3 edgey_b0s[] = {
+        (int3){0, NGHOST, 0},
+        (int3){NGHOST + nn.x, NGHOST, 0},
+
+        (int3){0, NGHOST, NGHOST + nn.z},
+        (int3){NGHOST + nn.x, NGHOST, NGHOST + nn.z},
+    };
+
+    // Edges Z
+    const int3 edgez_a0s[] = {
+        (int3){NGHOST, NGHOST, NGHOST}, //
+        (int3){nn.x, NGHOST, NGHOST},   //
+
+        (int3){NGHOST, nn.y, NGHOST}, //
+        (int3){nn.x, nn.y, NGHOST},   //
+    };
+    const int3 edgez_b0s[] = {
+        (int3){0, 0, NGHOST},
+        (int3){NGHOST + nn.x, 0, NGHOST},
+
+        (int3){0, NGHOST + nn.y, NGHOST},
+        (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST},
+    };
+
+    // Sides XY
+    const int3 sidexy_a0s[] = {
+        (int3){NGHOST, NGHOST, NGHOST}, //
+        (int3){NGHOST, NGHOST, nn.z},   //
+    };
+    const int3 sidexy_b0s[] = {
+        (int3){NGHOST, NGHOST, 0},             //
+        (int3){NGHOST, NGHOST, NGHOST + nn.z}, //
+    };
+
+    // Sides XZ
+    const int3 sidexz_a0s[] = {
+        (int3){NGHOST, NGHOST, NGHOST}, //
+        (int3){NGHOST, nn.y, NGHOST},   //
+    };
+    const int3 sidexz_b0s[] = {
+        (int3){NGHOST, 0, NGHOST},             //
+        (int3){NGHOST, NGHOST + nn.y, NGHOST}, //
+    };
+
+    // Sides YZ
+    const int3 sideyz_a0s[] = {
+        (int3){NGHOST, NGHOST, NGHOST}, //
+        (int3){nn.x, NGHOST, NGHOST},   //
+    };
+    const int3 sideyz_b0s[] = {
+        (int3){0, NGHOST, NGHOST},             //
+        (int3){NGHOST + nn.x, NGHOST, NGHOST}, //
+    };
+
+    acPackCommData(device, corner_a0s, &corner_data);
+    acPackCommData(device, edgex_a0s, &edgex_data);
+    acPackCommData(device, edgey_a0s, &edgey_data);
+    acPackCommData(device, edgez_a0s, &edgez_data);
+    acPackCommData(device, sidexy_a0s, &sidexy_data);
+    acPackCommData(device, sidexz_a0s, &sidexz_data);
+    acPackCommData(device, sideyz_a0s, &sideyz_data);
+
+#if MPI_GPUDIRECT_DISABLED
+    acTransferCommDataToHost(device, &corner_data);
+    acTransferCommDataToHost(device, &edgex_data);
+    acTransferCommDataToHost(device, &edgey_data);
+    acTransferCommDataToHost(device, &edgez_data);
+    acTransferCommDataToHost(device, &sidexy_data);
+    acTransferCommDataToHost(device, &sidexz_data);
+    acTransferCommDataToHost(device, &sideyz_data);
+#endif
+
+    acTransferCommData(device, corner_a0s, corner_b0s, &corner_data);
+    acTransferCommData(device, edgex_a0s, edgex_b0s, &edgex_data);
+    acTransferCommData(device, edgey_a0s, edgey_b0s, &edgey_data);
+    acTransferCommData(device, edgez_a0s, edgez_b0s, &edgez_data);
+    acTransferCommData(device, sidexy_a0s, sidexy_b0s, &sidexy_data);
+    acTransferCommData(device, sidexz_a0s, sidexz_b0s, &sidexz_data);
+    acTransferCommData(device, sideyz_a0s, sideyz_b0s, &sideyz_data);
+
+    acTransferCommDataWait(corner_data);
+    acTransferCommDataWait(edgex_data);
+    acTransferCommDataWait(edgey_data);
+    acTransferCommDataWait(edgez_data);
+    acTransferCommDataWait(sidexy_data);
+    acTransferCommDataWait(sidexz_data);
+    acTransferCommDataWait(sideyz_data);
+
+#if MPI_GPUDIRECT_DISABLED
+    acTransferCommDataToDevice(device, &corner_data);
+    acTransferCommDataToDevice(device, &edgex_data);
+    acTransferCommDataToDevice(device, &edgey_data);
+    acTransferCommDataToDevice(device, &edgez_data);
+    acTransferCommDataToDevice(device, &sidexy_data);
+    acTransferCommDataToDevice(device, &sidexz_data);
+    acTransferCommDataToDevice(device, &sideyz_data);
+#endif
+
+    acUnpackCommData(device, corner_b0s, &corner_data);
+    acUnpackCommData(device, edgex_b0s, &edgex_data);
+    acUnpackCommData(device, edgey_b0s, &edgey_data);
+    acUnpackCommData(device, edgez_b0s, &edgez_data);
+    acUnpackCommData(device, sidexy_b0s, &sidexy_data);
+    acUnpackCommData(device, sidexz_b0s, &sidexz_data);
+    acUnpackCommData(device, sideyz_b0s, &sideyz_data);
+
+    // Wait for unpacking
+    acSyncCommData(corner_data);
+    acSyncCommData(edgex_data);
+    acSyncCommData(edgey_data);
+    acSyncCommData(edgez_data);
+    acSyncCommData(sidexy_data);
+    acSyncCommData(sidexz_data);
+    acSyncCommData(sideyz_data);
+    return AC_SUCCESS;
 }
 #endif // AC_MPI_ENABLED
-
-/*
-struct grid_s {
-    Device device;
-};
-
-typedef grid_s* Grid;
-
-AcResult
-acGridInit(void)
-{
-    MPI_Init(NULL, NULL);
-
-    int nprocs, pid;
-    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
-    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
-
-    char processor_name[MPI_MAX_PROCESSOR_NAME];
-    int name_len;
-    MPI_Get_processor_name(processor_name, &name_len);
-    printf("Processor %s. Process %d of %d.\n", processor_name, pid, nprocs);
-}
-
-AcResult
-acGridLoad(const AcMesh mesh, Grid* grid)
-{
-}
-
-AcResult
-acGridStore(const Grid grid, AcMesh* mesh)
-{
-}
-
-AcResult
-acGridQuit(AcGrid& grid)
-{
-}
-*/
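
Usage sketch (illustrative only, not part of the patch): the new acGrid
interface replaces acDeviceRunMPITest() as the entry point for multi-node
runs. A minimal host program in the spirit of the removed test might look as
follows. It assumes, as the removed test did, that the full host mesh is
allocated on process 0 only and that acGridLoadMesh/acGridStoreMesh read and
write its contents on the root process alone, and that the global nn is
divisible by the process decomposition (checked by ERRCHK_ALWAYS in
acGridInit).

    #include <float.h> // FLT_EPSILON
    #include <mpi.h>

    #include "astaroth.h"

    int
    main(void)
    {
        MPI_Init(NULL, NULL); // MPI must be running before acGridInit

        int pid;
        MPI_Comm_rank(MPI_COMM_WORLD, &pid);

        AcMeshInfo info;
        acLoadConfig(AC_DEFAULT_CONFIG, &info);

        // Host mesh on the root process only; acGridLoadMesh distributes
        // it to the per-process submeshes.
        AcMesh mesh;
        if (pid == 0) {
            acMeshCreate(info, &mesh);
            acMeshRandomize(&mesh);
        }

        acGridInit(info); // one MPI process per GPU

        acGridLoadMesh(mesh, STREAM_DEFAULT);
        acGridIntegrate(STREAM_DEFAULT, FLT_EPSILON); // one full step (3 substeps)
        acGridPeriodicBoundconds(STREAM_DEFAULT);
        acGridStoreMesh(STREAM_DEFAULT, &mesh); // gathers back to process 0

        acGridQuit();

        if (pid == 0)
            acMeshDestroy(&mesh);

        MPI_Finalize(); // acGridQuit does not finalize MPI
        return 0;
    }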