diff --git a/CMakeLists.txt b/CMakeLists.txt index abd9b63..b452bb2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -29,6 +29,7 @@ option(BUILD_C_API_TEST "Builds a C program to test whether the API is c option(BUILD_MPI_TEST "Builds a C program to test whether MPI works" OFF) option(DOUBLE_PRECISION "Generates double precision code" OFF) option(MULTIGPU_ENABLED "If enabled, uses all the available GPUs" ON) +option(MPI_ENABLED "Enables additional functions for MPI communication" OFF) ## Compile the Astaroth Code Compiler add_subdirectory(acc) diff --git a/include/astaroth_device.h b/include/astaroth_device.h index eebc1ca..fcbbddf 100644 --- a/include/astaroth_device.h +++ b/include/astaroth_device.h @@ -47,19 +47,19 @@ AcResult acDeviceSwapBuffers(const Device device); /** */ AcResult acDeviceLoadScalarUniform(const Device device, const Stream stream, - const AcRealParam param, const AcReal value); + const AcRealParam param, const AcReal value); /** */ AcResult acDeviceLoadVectorUniform(const Device device, const Stream stream, - const AcReal3Param param, const AcReal3 value); + const AcReal3Param param, const AcReal3 value); /** */ AcResult acDeviceLoadIntUniform(const Device device, const Stream stream, const AcIntParam param, - const int value); + const int value); /** */ AcResult acDeviceLoadInt3Uniform(const Device device, const Stream stream, const AcInt3Param param, - const int3 value); + const int3 value); /** */ AcResult acDeviceLoadScalarArray(const Device device, const Stream stream, @@ -143,6 +143,10 @@ AcResult acDeviceReduceVec(const Device device, const Stream stream_type, const const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1, const VertexBufferHandle vtxbuf2, AcReal* result); +#if AC_MPI_ENABLED == 1 +AcResult acDeviceCommunicateHalosMPI(const Device device); +#endif + #ifdef __cplusplus } // extern "C" #endif diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index ccef597..fb8b23b 100644 ---
a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -27,15 +27,19 @@ set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${CUDA_ARCH_FLAGS} ${CUDA_WARNING_FLAGS}) set(CUDA_NVCC_FLAGS_RELEASE) set(CUDA_NVCC_FLAGS_DEBUG --device-debug --generate-line-info --ptxas-options=-v) -## Definitions -if (MULTIGPU_ENABLED) - add_definitions(-DAC_MULTIGPU_ENABLED=1) -else () - add_definitions(-DAC_MULTIGPU_ENABLED=0) -endif () - ## Create and link the library cuda_add_library(astaroth_core STATIC astaroth.cu device.cu node.cu) target_include_directories(astaroth_core PRIVATE .) target_link_libraries(astaroth_core m) add_dependencies(astaroth_core dsl_headers) + +## Definitions +if (MULTIGPU_ENABLED) + add_definitions(-DAC_MULTIGPU_ENABLED=1) +endif () + +if (MPI_ENABLED) + add_definitions(-DAC_MPI_ENABLED=1) + find_package(MPI REQUIRED) + target_link_libraries(astaroth_core ${MPI_CXX_LIBRARIES}) +endif () diff --git a/src/core/device.cu b/src/core/device.cu index 5f26e5b..3979c5b 100644 --- a/src/core/device.cu +++ b/src/core/device.cu @@ -760,6 +760,68 @@ acDeviceReduceVec(const Device device, const Stream stream, const ReductionType return AC_SUCCESS; } +#if AC_MPI_ENABLED == 1 +#include <mpi.h> + +/** NOTE: Assumes 1 process per GPU */ +AcResult +acDeviceCommunicateHalosMPI(const Device device) +{ + MPI_Barrier(MPI_COMM_WORLD); + MPI_Datatype datatype = MPI_FLOAT; + if (sizeof(AcReal) == 8) + datatype = MPI_DOUBLE; + + int pid, num_processes; + MPI_Comm_rank(MPI_COMM_WORLD, &pid); + MPI_Comm_size(MPI_COMM_WORLD, &num_processes); + + const size_t count = device->local_config.int_params[AC_mx] * + device->local_config.int_params[AC_my] * NGHOST; + + for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { + { // Front + // ...|ooooxxx|... -> xxx|ooooooo|...
+ const size_t src_idx = acVertexBufferIdx(0, 0, device->local_config.int_params[AC_nz], + device->local_config); + const size_t dst_idx = acVertexBufferIdx(0, 0, 0, device->local_config); + const int send_pid = (pid + 1) % num_processes; + const int recv_pid = (pid + num_processes - 1) % num_processes; + + MPI_Request request; + MPI_Isend(&device->vba.in[i][src_idx], count, datatype, send_pid, i, MPI_COMM_WORLD, + &request); + fflush(stdout); + + MPI_Status status; + MPI_Recv(&device->vba.in[i][dst_idx], count, datatype, recv_pid, i, MPI_COMM_WORLD, + &status); + + MPI_Wait(&request, &status); + } + { // Back + // ...|ooooooo|xxx <- ...|xxxoooo|... + const size_t src_idx = acVertexBufferIdx(0, 0, NGHOST, device->local_config); + const size_t dst_idx = acVertexBufferIdx( + 0, 0, NGHOST + device->local_config.int_params[AC_nz], device->local_config); + const int send_pid = (pid + num_processes - 1) % num_processes; + const int recv_pid = (pid + 1) % num_processes; + + MPI_Request request; + MPI_Isend(&device->vba.in[i][src_idx], count, datatype, send_pid, + NUM_VTXBUF_HANDLES + i, MPI_COMM_WORLD, &request); + + MPI_Status status; + MPI_Recv(&device->vba.in[i][dst_idx], count, datatype, recv_pid, NUM_VTXBUF_HANDLES + i, + MPI_COMM_WORLD, &status); + + MPI_Wait(&request, &status); + } + } + return AC_SUCCESS; +} +#endif + #if PACKED_DATA_TRANSFERS // Functions for calling packed data transfers #endif diff --git a/src/core/node.cu b/src/core/node.cu index 1f14893..05695ad 100644 --- a/src/core/node.cu +++ b/src/core/node.cu @@ -224,10 +224,10 @@ acNodeCreate(const int id, const AcMeshInfo node_config, Node* node_handle) WARNING("More devices found than MAX_NUM_DEVICES. Using only MAX_NUM_DEVICES"); node->num_devices = MAX_NUM_DEVICES; } - if (!AC_MULTIGPU_ENABLED) { - WARNING("MULTIGPU_ENABLED was false. 
Using only one device"); - node->num_devices = 1; // Use only one device if multi-GPU is not enabled - } +#if AC_MULTIGPU_ENABLED != 1 + WARNING("MULTIGPU_ENABLED was false. Using only one device"); + node->num_devices = 1; // Use only one device if multi-GPU is not enabled +#endif // Check that node->num_devices is divisible with AC_nz. This makes decomposing the // problem domain to multiple GPUs much easier since we do not have to worry // about remainders