Various improvements to the MPI-GPU implementation. However, linking the MPI libraries with both the host C project and the core library is proving to be a major pain. Currently the communication is done via GPU -> CPU -> CPU -> GPU.
This commit is contained in:
@@ -96,6 +96,9 @@ AcResult acIntegrateStepWithOffset(const int isubstep, const AcReal dt, const in
|
|||||||
AcResult acSynchronize(void);
|
AcResult acSynchronize(void);
|
||||||
AcResult acLoadWithOffset(const AcMesh host_mesh, const int3 src, const int num_vertices);
|
AcResult acLoadWithOffset(const AcMesh host_mesh, const int3 src, const int num_vertices);
|
||||||
|
|
||||||
|
/** */
|
||||||
|
int acGetNumDevicesPerNode(void);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
} // extern "C"
|
} // extern "C"
|
||||||
#endif
|
#endif
|
||||||
|
@@ -102,7 +102,7 @@ AcResult acDeviceStoreMeshWithOffset(const Device device, const Stream stream, c
|
|||||||
AcResult acDeviceStoreVertexBuffer(const Device device, const Stream stream,
|
AcResult acDeviceStoreVertexBuffer(const Device device, const Stream stream,
|
||||||
const VertexBufferHandle vtxbuf_handle, AcMesh* host_mesh);
|
const VertexBufferHandle vtxbuf_handle, AcMesh* host_mesh);
|
||||||
|
|
||||||
/** Deprecated */
|
/** */
|
||||||
AcResult acDeviceStoreMesh(const Device device, const Stream stream, AcMesh* host_mesh);
|
AcResult acDeviceStoreMesh(const Device device, const Stream stream, AcMesh* host_mesh);
|
||||||
|
|
||||||
/** */
|
/** */
|
||||||
@@ -120,7 +120,7 @@ AcResult acDeviceTransferMeshWithOffset(const Device src_device, const Stream st
|
|||||||
AcResult acDeviceTransferVertexBuffer(const Device src_device, const Stream stream,
|
AcResult acDeviceTransferVertexBuffer(const Device src_device, const Stream stream,
|
||||||
const VertexBufferHandle vtxbuf_handle, Device dst_device);
|
const VertexBufferHandle vtxbuf_handle, Device dst_device);
|
||||||
|
|
||||||
/** Deprecated */
|
/** */
|
||||||
AcResult acDeviceTransferMesh(const Device src_device, const Stream stream, Device dst_device);
|
AcResult acDeviceTransferMesh(const Device src_device, const Stream stream, Device dst_device);
|
||||||
|
|
||||||
/** */
|
/** */
|
||||||
@@ -143,9 +143,8 @@ AcResult acDeviceReduceVec(const Device device, const Stream stream_type, const
|
|||||||
const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1,
|
const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1,
|
||||||
const VertexBufferHandle vtxbuf2, AcReal* result);
|
const VertexBufferHandle vtxbuf2, AcReal* result);
|
||||||
|
|
||||||
#if AC_MPI_ENABLED == 1
|
/** */
|
||||||
AcResult acDeviceCommunicateHalosMPI(const Device device);
|
AcResult acDeviceCommunicateHalosMPI(const Device device);
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
} // extern "C"
|
} // extern "C"
|
||||||
|
@@ -27,6 +27,13 @@ set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${CUDA_ARCH_FLAGS} ${CUDA_WARNING_FLAGS})
|
|||||||
set(CUDA_NVCC_FLAGS_RELEASE)
|
set(CUDA_NVCC_FLAGS_RELEASE)
|
||||||
set(CUDA_NVCC_FLAGS_DEBUG --device-debug --generate-line-info --ptxas-options=-v)
|
set(CUDA_NVCC_FLAGS_DEBUG --device-debug --generate-line-info --ptxas-options=-v)
|
||||||
|
|
||||||
|
if (MPI_ENABLED)
|
||||||
|
find_package(MPI REQUIRED)
|
||||||
|
|
||||||
|
add_definitions(-DAC_MPI_ENABLED=1)
|
||||||
|
cuda_include_directories(${MPI_C_INCLUDE_PATH})
|
||||||
|
endif ()
|
||||||
|
|
||||||
## Create and link the library
|
## Create and link the library
|
||||||
cuda_add_library(astaroth_core STATIC astaroth.cu device.cu node.cu)
|
cuda_add_library(astaroth_core STATIC astaroth.cu device.cu node.cu)
|
||||||
target_include_directories(astaroth_core PRIVATE .)
|
target_include_directories(astaroth_core PRIVATE .)
|
||||||
@@ -39,7 +46,5 @@ if (MULTIGPU_ENABLED)
|
|||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
if (MPI_ENABLED)
|
if (MPI_ENABLED)
|
||||||
add_definitions(-DAC_MPI_ENABLED=1)
|
target_link_libraries(astaroth_core ${MPI_C_LIBRARIES})
|
||||||
find_package(MPI REQUIRED)
|
|
||||||
target_link_libraries(astaroth_core ${MPI_CXX_INCLUDE_PATH})
|
|
||||||
endif ()
|
endif ()
|
||||||
|
@@ -168,3 +168,11 @@ acSynchronizeMesh(void)
|
|||||||
{
|
{
|
||||||
return acNodeSynchronizeMesh(nodes[0], STREAM_DEFAULT);
|
return acNodeSynchronizeMesh(nodes[0], STREAM_DEFAULT);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int
|
||||||
|
acGetNumDevicesPerNode(void)
|
||||||
|
{
|
||||||
|
int num_devices;
|
||||||
|
ERRCHK_CUDA_ALWAYS(cudaGetDeviceCount(&num_devices));
|
||||||
|
return num_devices;
|
||||||
|
}
|
||||||
|
@@ -762,7 +762,6 @@ acDeviceReduceVec(const Device device, const Stream stream, const ReductionType
|
|||||||
|
|
||||||
#if AC_MPI_ENABLED == 1
|
#if AC_MPI_ENABLED == 1
|
||||||
#include <mpi.h>
|
#include <mpi.h>
|
||||||
|
|
||||||
/** NOTE: Assumes 1 process per GPU */
|
/** NOTE: Assumes 1 process per GPU */
|
||||||
AcResult
|
AcResult
|
||||||
acDeviceCommunicateHalosMPI(const Device device)
|
acDeviceCommunicateHalosMPI(const Device device)
|
||||||
@@ -820,6 +819,14 @@ acDeviceCommunicateHalosMPI(const Device device)
|
|||||||
}
|
}
|
||||||
return AC_SUCCESS;
|
return AC_SUCCESS;
|
||||||
}
|
}
|
||||||
|
#else
|
||||||
|
AcResult
|
||||||
|
acDeviceCommunicateHalosMPI(const Device device)
|
||||||
|
{
|
||||||
|
(void)device;
|
||||||
|
WARNING("MPI was not enabled but acDeviceCommunicateHalosMPI() was called");
|
||||||
|
return AC_FAILURE;
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if PACKED_DATA_TRANSFERS
|
#if PACKED_DATA_TRANSFERS
|
||||||
|
@@ -181,15 +181,15 @@ communicate_halos(AcMesh* submesh)
|
|||||||
int
|
int
|
||||||
main(void)
|
main(void)
|
||||||
{
|
{
|
||||||
int num_processes, process_id;
|
int num_processes, pid;
|
||||||
MPI_Init(NULL, NULL);
|
MPI_Init(NULL, NULL);
|
||||||
MPI_Comm_size(MPI_COMM_WORLD, &num_processes);
|
MPI_Comm_size(MPI_COMM_WORLD, &num_processes);
|
||||||
MPI_Comm_rank(MPI_COMM_WORLD, &process_id);
|
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
|
||||||
|
|
||||||
char processor_name[MPI_MAX_PROCESSOR_NAME];
|
char processor_name[MPI_MAX_PROCESSOR_NAME];
|
||||||
int name_len;
|
int name_len;
|
||||||
MPI_Get_processor_name(processor_name, &name_len);
|
MPI_Get_processor_name(processor_name, &name_len);
|
||||||
printf("Processor %s. Process %d of %d.\n", processor_name, process_id, num_processes);
|
printf("Processor %s. Process %d of %d.\n", processor_name, pid, num_processes);
|
||||||
|
|
||||||
AcMeshInfo info;
|
AcMeshInfo info;
|
||||||
acLoadConfig(AC_DEFAULT_CONFIG, &info);
|
acLoadConfig(AC_DEFAULT_CONFIG, &info);
|
||||||
@@ -197,7 +197,7 @@ main(void)
|
|||||||
AcMesh model, candidate, submesh;
|
AcMesh model, candidate, submesh;
|
||||||
|
|
||||||
// Master CPU
|
// Master CPU
|
||||||
if (process_id == 0) {
|
if (pid == 0) {
|
||||||
acMeshCreate(info, &model);
|
acMeshCreate(info, &model);
|
||||||
acMeshCreate(info, &candidate);
|
acMeshCreate(info, &candidate);
|
||||||
|
|
||||||
@@ -213,12 +213,47 @@ main(void)
|
|||||||
acMeshCreate(submesh_info, &submesh);
|
acMeshCreate(submesh_info, &submesh);
|
||||||
|
|
||||||
distribute_mesh(model, &submesh);
|
distribute_mesh(model, &submesh);
|
||||||
|
|
||||||
|
// GPU-GPU communication
|
||||||
|
/*
|
||||||
|
const int device_id = pid % acGetNumDevicesPerNode();
|
||||||
|
|
||||||
|
Device device;
|
||||||
|
acDeviceCreate(device_id, submesh_info, &device);
|
||||||
|
|
||||||
|
acDeviceLoadMesh(device, STREAM_DEFAULT, submesh);
|
||||||
|
acDeviceCommunicateHalosMPI(device);
|
||||||
|
acDeviceStoreMesh(device, STREAM_DEFAULT, &submesh);
|
||||||
|
|
||||||
|
acDeviceDestroy(device);
|
||||||
|
*/
|
||||||
|
|
||||||
|
// GPU-CPU-CPU-GPU communication
|
||||||
|
const int device_id = pid % acGetNumDevicesPerNode();
|
||||||
|
|
||||||
|
Device device;
|
||||||
|
acDeviceCreate(device_id, submesh_info, &device);
|
||||||
|
|
||||||
|
acDeviceLoadMesh(device, STREAM_DEFAULT, submesh);
|
||||||
|
acDevicePeriodicBoundconds(device, STREAM_DEFAULT, (int3){0, 0, 0},
|
||||||
|
(int3){submesh_info.int_params[AC_mx],
|
||||||
|
submesh_info.int_params[AC_my],
|
||||||
|
submesh_info.int_params[AC_mz]});
|
||||||
|
acDeviceStoreMesh(device, STREAM_DEFAULT, &submesh);
|
||||||
communicate_halos(&submesh);
|
communicate_halos(&submesh);
|
||||||
|
|
||||||
|
acDeviceDestroy(device);
|
||||||
|
//
|
||||||
|
|
||||||
|
//
|
||||||
|
// CPU-CPU communication
|
||||||
|
// communicate_halos(&submesh);
|
||||||
|
//
|
||||||
gather_mesh(submesh, &candidate);
|
gather_mesh(submesh, &candidate);
|
||||||
|
|
||||||
acMeshDestroy(&submesh);
|
acMeshDestroy(&submesh);
|
||||||
// Master CPU
|
// Master CPU
|
||||||
if (process_id == 0) {
|
if (pid == 0) {
|
||||||
acVerifyMesh(model, candidate);
|
acVerifyMesh(model, candidate);
|
||||||
acMeshDestroy(&model);
|
acMeshDestroy(&model);
|
||||||
acMeshDestroy(&candidate);
|
acMeshDestroy(&candidate);
|
||||||
@@ -227,7 +262,7 @@ main(void)
|
|||||||
// GPU
|
// GPU
|
||||||
/*
|
/*
|
||||||
Device device;
|
Device device;
|
||||||
acDeviceCreate(process_id, info, &device);
|
acDeviceCreate(pid, info, &device);
|
||||||
|
|
||||||
acDeviceLoadMesh(device, STREAM_DEFAULT, model);
|
acDeviceLoadMesh(device, STREAM_DEFAULT, model);
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user