Various improvements to the MPI-GPU implementation. Linking the MPI libraries into both the host C project and the core library is still a major pain; as a result, halo communication currently goes through the host (GPU -> CPU -> CPU -> GPU) instead of directly between GPUs.

This commit is contained in:
jpekkila
2019-10-15 19:32:16 +03:00
parent 113be456d6
commit 0865f0499b
6 changed files with 71 additions and 14 deletions

View File

@@ -27,6 +27,13 @@ set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${CUDA_ARCH_FLAGS} ${CUDA_WARNING_FLAGS})
# Release builds add no extra nvcc flags (the variable is cleared); debug builds
# enable device-side debugging, source-line info, and verbose ptxas output
# (register/shared-memory usage reports).
set(CUDA_NVCC_FLAGS_RELEASE)
set(CUDA_NVCC_FLAGS_DEBUG --device-debug --generate-line-info --ptxas-options=-v)
if (MPI_ENABLED)
    # Locate an MPI installation; fail the configure step if none is found.
    find_package(MPI REQUIRED)
    # Make the MPI headers visible to nvcc-compiled translation units.
    cuda_include_directories(${MPI_C_INCLUDE_PATH})
    # Compile-time switch checked by the sources (e.g. #if AC_MPI_ENABLED == 1).
    add_definitions(-DAC_MPI_ENABLED=1)
endif ()
## Create and link the library
# Build the static core library from the listed CUDA sources; headers are
# resolved from the current source directory (PRIVATE: not propagated to users).
cuda_add_library(astaroth_core STATIC astaroth.cu device.cu node.cu)
target_include_directories(astaroth_core PRIVATE .)
@@ -39,7 +46,5 @@ if (MULTIGPU_ENABLED)
endif ()
if (MPI_ENABLED)
    # Compile-time switch checked by the sources (e.g. #if AC_MPI_ENABLED == 1).
    add_definitions(-DAC_MPI_ENABLED=1)
    find_package(MPI REQUIRED)
    # BUG FIX: this previously passed ${MPI_CXX_INCLUDE_PATH} — an include
    # *directory* list, not libraries — to target_link_libraries, which either
    # breaks the link or silently does nothing. Link the MPI libraries instead.
    target_link_libraries(astaroth_core ${MPI_CXX_LIBRARIES})
    target_link_libraries(astaroth_core ${MPI_C_LIBRARIES})
endif ()

View File

@@ -168,3 +168,11 @@ acSynchronizeMesh(void)
{
return acNodeSynchronizeMesh(nodes[0], STREAM_DEFAULT);
}
/** Returns the number of CUDA-capable devices visible to this process.
    Aborts via ERRCHK_CUDA_ALWAYS if the CUDA runtime query fails. */
int
acGetNumDevicesPerNode(void)
{
    int count = 0;
    ERRCHK_CUDA_ALWAYS(cudaGetDeviceCount(&count));
    return count;
}

View File

@@ -762,7 +762,6 @@ acDeviceReduceVec(const Device device, const Stream stream, const ReductionType
#if AC_MPI_ENABLED == 1
#include <mpi.h>
/** NOTE: Assumes 1 process per GPU */
AcResult
acDeviceCommunicateHalosMPI(const Device device)
@@ -820,6 +819,14 @@ acDeviceCommunicateHalosMPI(const Device device)
}
return AC_SUCCESS;
}
#else
/** Stub used when the library is built without MPI support
    (AC_MPI_ENABLED != 1): emits a warning and reports failure instead of
    performing any halo communication. */
AcResult
acDeviceCommunicateHalosMPI(const Device device)
{
    (void)device; // Unused in the stub; silences unused-parameter warnings
    WARNING("MPI was not enabled but acDeviceCommunicateHalosMPI() was called");
    return AC_FAILURE;
}
#endif
#if PACKED_DATA_TRANSFERS