Various improvements to the MPI-GPU implementation, but linking the MPI libraries with both the host C project and the core library seems to be a major pain. Currently the communication is done via GPU->CPU->CPU->GPU.

Author: jpekkila
Date: 2019-10-15 19:32:16 +03:00
Parent: 113be456d6
Commit: 0865f0499b
6 changed files with 71 additions and 14 deletions
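The GPU->CPU->CPU->GPU path mentioned in the commit message stages every halo through host memory: the sender copies its halo plate from the device to a host buffer, the two processes exchange the staged buffers over MPI, and the receiver copies the result back to its device. The sketch below only illustrates that pattern; it is not part of this commit, uses plain double in place of AcReal, and the names (exchange_halo, d_field, the offsets and neighbour ranks) are hypothetical.

#include <stdlib.h>
#include <mpi.h>
#include <cuda_runtime.h>

/* Illustrative sketch of a GPU->CPU->CPU->GPU halo exchange for one field.
 * Each rank sends one halo plate "up" and receives one from "down". */
static void
exchange_halo(double* d_field, const size_t send_offset, const size_t recv_offset,
              const size_t halo_count, const int up_rank, const int down_rank)
{
    const size_t halo_bytes = halo_count * sizeof(double);
    double* h_send = (double*)malloc(halo_bytes);
    double* h_recv = (double*)malloc(halo_bytes);

    /* GPU -> CPU: copy the outgoing halo plate to a host staging buffer */
    cudaMemcpy(h_send, d_field + send_offset, halo_bytes, cudaMemcpyDeviceToHost);

    /* CPU -> CPU: exchange the staged halos with the neighbouring process */
    MPI_Sendrecv(h_send, (int)halo_count, MPI_DOUBLE, up_rank, 0,
                 h_recv, (int)halo_count, MPI_DOUBLE, down_rank, 0,
                 MPI_COMM_WORLD, MPI_STATUS_IGNORE);

    /* CPU -> GPU: copy the received halo plate back into the device mesh */
    cudaMemcpy(d_field + recv_offset, h_recv, halo_bytes, cudaMemcpyHostToDevice);

    free(h_send);
    free(h_recv);
}

A direct device-to-device exchange (for example with a CUDA-aware MPI build that accepts device pointers) would remove the two host copies; that is what the commented-out GPU-GPU block calling acDeviceCommunicateHalosMPI in the test program below is aiming at.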

View File

@@ -96,6 +96,9 @@ AcResult acIntegrateStepWithOffset(const int isubstep, const AcReal dt, const in
AcResult acSynchronize(void);
AcResult acLoadWithOffset(const AcMesh host_mesh, const int3 src, const int num_vertices);
/** */
int acGetNumDevicesPerNode(void);
#ifdef __cplusplus
} // extern "C"
#endif

View File

@@ -102,7 +102,7 @@ AcResult acDeviceStoreMeshWithOffset(const Device device, const Stream stream, c
AcResult acDeviceStoreVertexBuffer(const Device device, const Stream stream,
const VertexBufferHandle vtxbuf_handle, AcMesh* host_mesh);
/** Deprecated */
/** */
AcResult acDeviceStoreMesh(const Device device, const Stream stream, AcMesh* host_mesh);
/** */
@@ -120,7 +120,7 @@ AcResult acDeviceTransferMeshWithOffset(const Device src_device, const Stream st
AcResult acDeviceTransferVertexBuffer(const Device src_device, const Stream stream,
const VertexBufferHandle vtxbuf_handle, Device dst_device);
/** Deprecated */
/** */
AcResult acDeviceTransferMesh(const Device src_device, const Stream stream, Device dst_device);
/** */
@@ -143,9 +143,8 @@ AcResult acDeviceReduceVec(const Device device, const Stream stream_type, const
const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1,
const VertexBufferHandle vtxbuf2, AcReal* result);
#if AC_MPI_ENABLED == 1
/** */
AcResult acDeviceCommunicateHalosMPI(const Device device);
#endif
#ifdef __cplusplus
} // extern "C"

View File

@@ -27,6 +27,13 @@ set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${CUDA_ARCH_FLAGS} ${CUDA_WARNING_FLAGS})
set(CUDA_NVCC_FLAGS_RELEASE)
set(CUDA_NVCC_FLAGS_DEBUG --device-debug --generate-line-info --ptxas-options=-v)
if (MPI_ENABLED)
    find_package(MPI REQUIRED)
    add_definitions(-DAC_MPI_ENABLED=1)
    cuda_include_directories(${MPI_C_INCLUDE_PATH})
endif ()
## Create and link the library
cuda_add_library(astaroth_core STATIC astaroth.cu device.cu node.cu)
target_include_directories(astaroth_core PRIVATE .)
@@ -39,7 +46,5 @@ if (MULTIGPU_ENABLED)
endif ()
if (MPI_ENABLED)
    add_definitions(-DAC_MPI_ENABLED=1)
    find_package(MPI REQUIRED)
    target_link_libraries(astaroth_core ${MPI_CXX_INCLUDE_PATH})
    target_link_libraries(astaroth_core ${MPI_C_LIBRARIES})
endif ()

View File

@@ -168,3 +168,11 @@ acSynchronizeMesh(void)
{
return acNodeSynchronizeMesh(nodes[0], STREAM_DEFAULT);
}
int
acGetNumDevicesPerNode(void)
{
    int num_devices;
    ERRCHK_CUDA_ALWAYS(cudaGetDeviceCount(&num_devices));
    return num_devices;
}

View File

@@ -762,7 +762,6 @@ acDeviceReduceVec(const Device device, const Stream stream, const ReductionType
#if AC_MPI_ENABLED == 1
#include <mpi.h>
/** NOTE: Assumes 1 process per GPU */
AcResult
acDeviceCommunicateHalosMPI(const Device device)
@@ -820,6 +819,14 @@ acDeviceCommunicateHalosMPI(const Device device)
    }
    return AC_SUCCESS;
}
#else
AcResult
acDeviceCommunicateHalosMPI(const Device device)
{
    (void)device;
    WARNING("MPI was not enabled but acDeviceCommunicateHalosMPI() was called");
    return AC_FAILURE;
}
#endif
#if PACKED_DATA_TRANSFERS

View File

@@ -181,15 +181,15 @@ communicate_halos(AcMesh* submesh)
int
main(void)
{
    int num_processes, process_id;
    int num_processes, pid;
    MPI_Init(NULL, NULL);
    MPI_Comm_size(MPI_COMM_WORLD, &num_processes);
    MPI_Comm_rank(MPI_COMM_WORLD, &process_id);
    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
    char processor_name[MPI_MAX_PROCESSOR_NAME];
    int name_len;
    MPI_Get_processor_name(processor_name, &name_len);
    printf("Processor %s. Process %d of %d.\n", processor_name, process_id, num_processes);
    printf("Processor %s. Process %d of %d.\n", processor_name, pid, num_processes);
    AcMeshInfo info;
    acLoadConfig(AC_DEFAULT_CONFIG, &info);
@@ -197,7 +197,7 @@ main(void)
    AcMesh model, candidate, submesh;
    // Master CPU
    if (process_id == 0) {
    if (pid == 0) {
        acMeshCreate(info, &model);
        acMeshCreate(info, &candidate);
@@ -213,12 +213,47 @@ main(void)
    acMeshCreate(submesh_info, &submesh);
    distribute_mesh(model, &submesh);
    // GPU-GPU communication
    /*
    const int device_id = pid % acGetNumDevicesPerNode();
    Device device;
    acDeviceCreate(device_id, submesh_info, &device);
    acDeviceLoadMesh(device, STREAM_DEFAULT, submesh);
    acDeviceCommunicateHalosMPI(device);
    acDeviceStoreMesh(device, STREAM_DEFAULT, &submesh);
    acDeviceDestroy(device);
    */
    // GPU-CPU-CPU-GPU communication
    const int device_id = pid % acGetNumDevicesPerNode();
    Device device;
    acDeviceCreate(device_id, submesh_info, &device);
    acDeviceLoadMesh(device, STREAM_DEFAULT, submesh);
    acDevicePeriodicBoundconds(device, STREAM_DEFAULT, (int3){0, 0, 0},
                               (int3){submesh_info.int_params[AC_mx],
                                      submesh_info.int_params[AC_my],
                                      submesh_info.int_params[AC_mz]});
    acDeviceStoreMesh(device, STREAM_DEFAULT, &submesh);
    communicate_halos(&submesh);
    acDeviceDestroy(device);
    //
    //
    // CPU-CPU communication
    // communicate_halos(&submesh);
    //
    gather_mesh(submesh, &candidate);
    acMeshDestroy(&submesh);
    // Master CPU
    if (process_id == 0) {
    if (pid == 0) {
        acVerifyMesh(model, candidate);
        acMeshDestroy(&model);
        acMeshDestroy(&candidate);
@@ -227,7 +262,7 @@ main(void)
    // GPU
    /*
    Device device;
    acDeviceCreate(process_id, info, &device);
    acDeviceCreate(pid, info, &device);
    acDeviceLoadMesh(device, STREAM_DEFAULT, model);