New cmake option: MPI_ENABLED. Enables MPI functions on the device layer
This commit is contained in:
@@ -29,6 +29,7 @@ option(BUILD_C_API_TEST "Builds a C program to test whether the API is c
|
|||||||
option(BUILD_MPI_TEST "Builds a C program to test whether MPI works" OFF)
|
option(BUILD_MPI_TEST "Builds a C program to test whether MPI works" OFF)
|
||||||
option(DOUBLE_PRECISION "Generates double precision code" OFF)
|
option(DOUBLE_PRECISION "Generates double precision code" OFF)
|
||||||
option(MULTIGPU_ENABLED "If enabled, uses all the available GPUs" ON)
|
option(MULTIGPU_ENABLED "If enabled, uses all the available GPUs" ON)
|
||||||
|
# Enables the MPI halo-exchange functions on the device layer.
# Fixed typo in the help string: "communciation" -> "communication".
option(MPI_ENABLED "Enables additional functions for MPI communication" OFF)
|
||||||
|
|
||||||
## Compile the Astaroth Code Compiler
|
## Compile the Astaroth Code Compiler
|
||||||
add_subdirectory(acc)
|
add_subdirectory(acc)
|
||||||
|
@@ -143,6 +143,10 @@ AcResult acDeviceReduceVec(const Device device, const Stream stream_type, const
|
|||||||
const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1,
|
const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1,
|
||||||
const VertexBufferHandle vtxbuf2, AcReal* result);
|
const VertexBufferHandle vtxbuf2, AcReal* result);
|
||||||
|
|
||||||
|
#if AC_MPI_ENABLED == 1
|
||||||
|
AcResult acDeviceCommunicateHalosMPI(const Device device);
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
} // extern "C"
|
} // extern "C"
|
||||||
#endif
|
#endif
|
||||||
|
@@ -27,15 +27,19 @@ set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${CUDA_ARCH_FLAGS} ${CUDA_WARNING_FLAGS})
|
|||||||
set(CUDA_NVCC_FLAGS_RELEASE)
|
set(CUDA_NVCC_FLAGS_RELEASE)
|
||||||
set(CUDA_NVCC_FLAGS_DEBUG --device-debug --generate-line-info --ptxas-options=-v)
|
set(CUDA_NVCC_FLAGS_DEBUG --device-debug --generate-line-info --ptxas-options=-v)
|
||||||
|
|
||||||
## Definitions
|
|
||||||
if (MULTIGPU_ENABLED)
|
|
||||||
add_definitions(-DAC_MULTIGPU_ENABLED=1)
|
|
||||||
else ()
|
|
||||||
add_definitions(-DAC_MULTIGPU_ENABLED=0)
|
|
||||||
endif ()
|
|
||||||
|
|
||||||
## Create and link the library
|
## Create and link the library
|
||||||
cuda_add_library(astaroth_core STATIC astaroth.cu device.cu node.cu)
|
cuda_add_library(astaroth_core STATIC astaroth.cu device.cu node.cu)
|
||||||
target_include_directories(astaroth_core PRIVATE .)
|
target_include_directories(astaroth_core PRIVATE .)
|
||||||
target_link_libraries(astaroth_core m)
|
target_link_libraries(astaroth_core m)
|
||||||
add_dependencies(astaroth_core dsl_headers)
|
add_dependencies(astaroth_core dsl_headers)
|
||||||
|
|
||||||
|
## Definitions
|
||||||
|
if (MULTIGPU_ENABLED)
|
||||||
|
add_definitions(-DAC_MULTIGPU_ENABLED=1)
|
||||||
|
endif ()
|
||||||
|
|
||||||
|
## MPI support for astaroth_core.
## Bug fix: the original passed ${MPI_CXX_INCLUDE_PATH} (header directories)
## to target_link_libraries, which hands include dirs to the linker and never
## links the MPI libraries. Include paths belong in target_include_directories;
## the libraries themselves are in ${MPI_CXX_LIBRARIES} (FindMPI module).
if (MPI_ENABLED)
    find_package(MPI REQUIRED)
    add_definitions(-DAC_MPI_ENABLED=1)
    target_include_directories(astaroth_core PRIVATE ${MPI_CXX_INCLUDE_PATH})
    target_link_libraries(astaroth_core ${MPI_CXX_LIBRARIES})
endif ()
|
||||||
|
@@ -760,6 +760,68 @@ acDeviceReduceVec(const Device device, const Stream stream, const ReductionType
|
|||||||
return AC_SUCCESS;
|
return AC_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if AC_MPI_ENABLED == 1
#include <mpi.h>

/**
 * Exchanges the z-axis halo slabs of every vertex buffer between neighboring
 * processes arranged in a 1D ring (rank i sends its front halo to rank i+1
 * and its back halo to rank i-1, wrapping around).
 *
 * NOTE: Assumes 1 process per GPU and a domain decomposed along the z axis.
 * NOTE(review): exchanges operate in place on device->vba.in — presumably
 * these buffers are host-accessible (unified/pinned memory) or the MPI
 * implementation is CUDA-aware; confirm against the allocation site.
 *
 * @param device  Device whose halo regions are communicated.
 * @return AC_SUCCESS on completion.
 */
AcResult
acDeviceCommunicateHalosMPI(const Device device)
{
    // Synchronize all ranks so nobody reads a halo a neighbor is still writing.
    MPI_Barrier(MPI_COMM_WORLD);

    // Pick the MPI datatype matching AcReal's precision.
    MPI_Datatype datatype = MPI_FLOAT;
    if (sizeof(AcReal) == 8)
        datatype = MPI_DOUBLE;

    int pid, num_processes;
    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
    MPI_Comm_size(MPI_COMM_WORLD, &num_processes);

    // One halo slab: a full xy-plane (AC_mx * AC_my) times NGHOST layers.
    // MPI count parameters are int; cast explicitly below (overflow is only
    // possible with meshes exceeding INT_MAX elements per slab).
    const size_t count = device->local_config.int_params[AC_mx] *
                         device->local_config.int_params[AC_my] * NGHOST;

    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
        { // Front
            // ...|ooooxxx|... -> xxx|ooooooo|...
            const size_t src_idx = acVertexBufferIdx(0, 0, device->local_config.int_params[AC_nz],
                                                     device->local_config);
            const size_t dst_idx = acVertexBufferIdx(0, 0, 0, device->local_config);
            const int send_pid   = (pid + 1) % num_processes;
            const int recv_pid   = (pid + num_processes - 1) % num_processes;

            // Non-blocking send + blocking receive: safe in a ring because
            // every rank posts its send before blocking on the receive.
            MPI_Request request;
            MPI_Isend(&device->vba.in[i][src_idx], (int)count, datatype, send_pid, i,
                      MPI_COMM_WORLD, &request);
            // Fixed: removed a stray debugging fflush(stdout) left here by
            // mistake (it had no counterpart in the symmetric Back branch).

            // Fixed: use MPI_STATUS_IGNORE — the original filled a status in
            // MPI_Recv and then clobbered it in MPI_Wait without reading it.
            MPI_Recv(&device->vba.in[i][dst_idx], (int)count, datatype, recv_pid, i,
                     MPI_COMM_WORLD, MPI_STATUS_IGNORE);

            MPI_Wait(&request, MPI_STATUS_IGNORE);
        }
        { // Back
            // ...|ooooooo|xxx <- ...|xxxoooo|...
            const size_t src_idx = acVertexBufferIdx(0, 0, NGHOST, device->local_config);
            const size_t dst_idx = acVertexBufferIdx(
                0, 0, NGHOST + device->local_config.int_params[AC_nz], device->local_config);
            const int send_pid = (pid + num_processes - 1) % num_processes;
            const int recv_pid = (pid + 1) % num_processes;

            // Tags are offset by NUM_VTXBUF_HANDLES so front and back
            // exchanges of the same buffer can never be confused.
            MPI_Request request;
            MPI_Isend(&device->vba.in[i][src_idx], (int)count, datatype, send_pid,
                      NUM_VTXBUF_HANDLES + i, MPI_COMM_WORLD, &request);

            MPI_Recv(&device->vba.in[i][dst_idx], (int)count, datatype, recv_pid,
                     NUM_VTXBUF_HANDLES + i, MPI_COMM_WORLD, MPI_STATUS_IGNORE);

            MPI_Wait(&request, MPI_STATUS_IGNORE);
        }
    }
    return AC_SUCCESS;
}
#endif
|
||||||
|
|
||||||
#if PACKED_DATA_TRANSFERS
|
#if PACKED_DATA_TRANSFERS
|
||||||
// Functions for calling packed data transfers
|
// Functions for calling packed data transfers
|
||||||
#endif
|
#endif
|
||||||
|
@@ -224,10 +224,10 @@ acNodeCreate(const int id, const AcMeshInfo node_config, Node* node_handle)
|
|||||||
WARNING("More devices found than MAX_NUM_DEVICES. Using only MAX_NUM_DEVICES");
|
WARNING("More devices found than MAX_NUM_DEVICES. Using only MAX_NUM_DEVICES");
|
||||||
node->num_devices = MAX_NUM_DEVICES;
|
node->num_devices = MAX_NUM_DEVICES;
|
||||||
}
|
}
|
||||||
if (!AC_MULTIGPU_ENABLED) {
|
#if AC_MULTIGPU_ENABLED != 1
|
||||||
WARNING("MULTIGPU_ENABLED was false. Using only one device");
|
WARNING("MULTIGPU_ENABLED was false. Using only one device");
|
||||||
node->num_devices = 1; // Use only one device if multi-GPU is not enabled
|
node->num_devices = 1; // Use only one device if multi-GPU is not enabled
|
||||||
}
|
#endif
|
||||||
// Check that node->num_devices is divisible with AC_nz. This makes decomposing the
|
// Check that node->num_devices is divisible with AC_nz. This makes decomposing the
|
||||||
// problem domain to multiple GPUs much easier since we do not have to worry
|
// problem domain to multiple GPUs much easier since we do not have to worry
|
||||||
// about remainders
|
// about remainders
|
||||||
|
Reference in New Issue
Block a user