diff --git a/include/astaroth.h b/include/astaroth.h index 748c057..b2b3fa2 100644 --- a/include/astaroth.h +++ b/include/astaroth.h @@ -96,6 +96,9 @@ AcResult acIntegrateStepWithOffset(const int isubstep, const AcReal dt, const in AcResult acSynchronize(void); AcResult acLoadWithOffset(const AcMesh host_mesh, const int3 src, const int num_vertices); +/** Returns the device count reported by cudaGetDeviceCount() */ +int acGetNumDevicesPerNode(void); + #ifdef __cplusplus } // extern "C" #endif diff --git a/include/astaroth_device.h b/include/astaroth_device.h index fcbbddf..7d383f6 100644 --- a/include/astaroth_device.h +++ b/include/astaroth_device.h @@ -102,7 +102,7 @@ AcResult acDeviceStoreMeshWithOffset(const Device device, const Stream stream, c AcResult acDeviceStoreVertexBuffer(const Device device, const Stream stream, const VertexBufferHandle vtxbuf_handle, AcMesh* host_mesh); -/** Deprecated */ +/** */ AcResult acDeviceStoreMesh(const Device device, const Stream stream, AcMesh* host_mesh); /** */ @@ -120,7 +120,7 @@ AcResult acDeviceTransferMeshWithOffset(const Device src_device, const Stream st AcResult acDeviceTransferVertexBuffer(const Device src_device, const Stream stream, const VertexBufferHandle vtxbuf_handle, Device dst_device); -/** Deprecated */ +/** */ AcResult acDeviceTransferMesh(const Device src_device, const Stream stream, Device dst_device); /** */ @@ -143,9 +143,8 @@ AcResult acDeviceReduceVec(const Device device, const Stream stream_type, const const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1, const VertexBufferHandle vtxbuf2, AcReal* result); -#if AC_MPI_ENABLED == 1 +/** Always declared. Warns and returns AC_FAILURE when built without AC_MPI_ENABLED; the MPI version assumes 1 process per GPU */ AcResult acDeviceCommunicateHalosMPI(const Device device); -#endif #ifdef __cplusplus } // extern "C" diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index fb8b23b..9ffcc43 100644 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -27,6 +27,13 @@ set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${CUDA_ARCH_FLAGS} ${CUDA_WARNING_FLAGS}) set(CUDA_NVCC_FLAGS_RELEASE) set(CUDA_NVCC_FLAGS_DEBUG 
--device-debug --generate-line-info --ptxas-options=-v) +if (MPI_ENABLED) + find_package(MPI REQUIRED) + + add_definitions(-DAC_MPI_ENABLED=1) + cuda_include_directories(${MPI_C_INCLUDE_PATH}) +endif () + ## Create and link the library cuda_add_library(astaroth_core STATIC astaroth.cu device.cu node.cu) target_include_directories(astaroth_core PRIVATE .) @@ -39,7 +46,5 @@ if (MULTIGPU_ENABLED) endif () if (MPI_ENABLED) - add_definitions(-DAC_MPI_ENABLED=1) - find_package(MPI REQUIRED) - target_link_libraries(astaroth_core ${MPI_CXX_INCLUDE_PATH}) + target_link_libraries(astaroth_core ${MPI_C_LIBRARIES}) endif () diff --git a/src/core/astaroth.cu b/src/core/astaroth.cu index 4ab34a9..75cd57d 100644 --- a/src/core/astaroth.cu +++ b/src/core/astaroth.cu @@ -168,3 +168,11 @@ acSynchronizeMesh(void) { return acNodeSynchronizeMesh(nodes[0], STREAM_DEFAULT); } + +int +acGetNumDevicesPerNode(void) +{ + int num_devices; /* written by cudaGetDeviceCount() on the next line */ + ERRCHK_CUDA_ALWAYS(cudaGetDeviceCount(&num_devices)); + return num_devices; +} diff --git a/src/core/device.cu b/src/core/device.cu index 2098cc7..6c7ed5e 100644 --- a/src/core/device.cu +++ b/src/core/device.cu @@ -762,7 +762,6 @@ acDeviceReduceVec(const Device device, const Stream stream, const ReductionType #if AC_MPI_ENABLED == 1 #include - /** NOTE: Assumes 1 process per GPU */ AcResult acDeviceCommunicateHalosMPI(const Device device) @@ -820,6 +819,14 @@ acDeviceCommunicateHalosMPI(const Device device) } return AC_SUCCESS; } +#else /* AC_MPI_ENABLED != 1: stub so the symbol exists in non-MPI builds */ +AcResult +acDeviceCommunicateHalosMPI(const Device device) +{ + (void)device; /* parameter is otherwise unused in this stub */ + WARNING("MPI was not enabled but acDeviceCommunicateHalosMPI() was called"); + return AC_FAILURE; +} #endif #if PACKED_DATA_TRANSFERS diff --git a/src/mpitest/main.c b/src/mpitest/main.c index 9c83f02..1c82247 100644 --- a/src/mpitest/main.c +++ b/src/mpitest/main.c @@ -181,15 +181,15 @@ communicate_halos(AcMesh* submesh) int main(void) { - int num_processes, process_id; + int num_processes, pid; MPI_Init(NULL, NULL); 
MPI_Comm_size(MPI_COMM_WORLD, &num_processes); - MPI_Comm_rank(MPI_COMM_WORLD, &process_id); + MPI_Comm_rank(MPI_COMM_WORLD, &pid); char processor_name[MPI_MAX_PROCESSOR_NAME]; int name_len; MPI_Get_processor_name(processor_name, &name_len); - printf("Processor %s. Process %d of %d.\n", processor_name, process_id, num_processes); + printf("Processor %s. Process %d of %d.\n", processor_name, pid, num_processes); AcMeshInfo info; acLoadConfig(AC_DEFAULT_CONFIG, &info); @@ -197,7 +197,7 @@ main(void) AcMesh model, candidate, submesh; // Master CPU - if (process_id == 0) { + if (pid == 0) { acMeshCreate(info, &model); acMeshCreate(info, &candidate); @@ -213,12 +213,47 @@ main(void) acMeshCreate(submesh_info, &submesh); distribute_mesh(model, &submesh); + + // GPU-GPU communication + /* + const int device_id = pid % acGetNumDevicesPerNode(); + + Device device; + acDeviceCreate(device_id, submesh_info, &device); + + acDeviceLoadMesh(device, STREAM_DEFAULT, submesh); + acDeviceCommunicateHalosMPI(device); + acDeviceStoreMesh(device, STREAM_DEFAULT, &submesh); + + acDeviceDestroy(device); + */ + + // GPU-CPU-CPU-GPU communication + const int device_id = pid % acGetNumDevicesPerNode(); + + Device device; + acDeviceCreate(device_id, submesh_info, &device); + + acDeviceLoadMesh(device, STREAM_DEFAULT, submesh); + acDevicePeriodicBoundconds(device, STREAM_DEFAULT, (int3){0, 0, 0}, + (int3){submesh_info.int_params[AC_mx], + submesh_info.int_params[AC_my], + submesh_info.int_params[AC_mz]}); + acDeviceStoreMesh(device, STREAM_DEFAULT, &submesh); communicate_halos(&submesh); + + acDeviceDestroy(device); + // + + // + // CPU-CPU communication + // communicate_halos(&submesh); + // gather_mesh(submesh, &candidate); acMeshDestroy(&submesh); // Master CPU - if (process_id == 0) { + if (pid == 0) { acVerifyMesh(model, candidate); acMeshDestroy(&model); acMeshDestroy(&candidate); @@ -227,7 +262,7 @@ main(void) // GPU /* Device device; - acDeviceCreate(process_id, info, &device); + 
acDeviceCreate(pid, info, &device); acDeviceLoadMesh(device, STREAM_DEFAULT, model);