From 8f7745cd062f9a1062192baa9e81dfa06eeb65bf Mon Sep 17 00:00:00 2001
From: Carl Pearson
Date: Tue, 4 May 2021 16:05:33 -0600
Subject: [PATCH] add device memory MPI_Put

---
 CMakeLists.txt    | 115 ++++++++++++++++++++++++++++++++++++++++++++++
 one_sided_gpu.cpp |  61 ++++++++++++++++++++++++
 2 files changed, 176 insertions(+)
 create mode 100644 CMakeLists.txt
 create mode 100644 one_sided_gpu.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..f5239c1
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,115 @@
+# 3.17+ for CMAKE_CUDA_KNOWN_FEATURES/cuda_std_11
+# 3.18+ for CUDA_ARCHITECTURES
+cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
+project(tempi LANGUAGES CXX VERSION 0.1.0.0)
+
+include(CheckLanguage)
+
+if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+message(STATUS "CMAKE_CUDA_ARCHITECTURES not defined, setting to OFF")
+set(CMAKE_CUDA_ARCHITECTURES OFF CACHE STRING "")
+endif()
+
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+
+check_language(CUDA)
+find_package(MPI REQUIRED)
+find_package(CUDAToolkit REQUIRED)
+
+if(CMAKE_CUDA_COMPILER)
+  enable_language(CUDA)
+else()
+  message(STATUS "No CUDA support")
+endif()
+
+if (MPI_FOUND)
+message(STATUS "MPI_CXX_COMPILER: ${MPI_CXX_COMPILER}")
+message(STATUS "MPI_CXX_INCLUDE_DIRS: ${MPI_CXX_INCLUDE_DIRS}")
+message(STATUS "MPIEXEC_EXECUTABLE: ${MPIEXEC_EXECUTABLE}")
+message(STATUS "MPIEXEC_NUMPROC_FLAG: ${MPIEXEC_NUMPROC_FLAG}")
+message(STATUS "MPIEXEC_MAX_NUMPROCS: ${MPIEXEC_MAX_NUMPROCS}")
+message(STATUS "MPIEXEC_PREFLAGS: ${MPIEXEC_PREFLAGS}")
+message(STATUS "MPIEXEC_POSTFLAGS: ${MPIEXEC_POSTFLAGS}")
+
+endif()
+
+function(set_cxx_options target)
+target_compile_options(${target} PRIVATE
+$<$<COMPILE_LANGUAGE:CXX>:
+ -Wall;
+ -Wextra;
+ -Wcast-align;
+ -Wdisabled-optimization;
+ -Wformat=2;
+ -Winit-self;
+ -Wlogical-op;
+ -Wmissing-include-dirs;
+ -Woverloaded-virtual;
+ -Wpointer-arith;
+ -Wshadow;
+ -Wstrict-aliasing;
+ -Wswitch-enum;
+ -Wvla;
+>
+)
+endfunction()
+
+function(set_cuda_options target)
+target_compile_options(${target} PRIVATE
+$<$<COMPILE_LANGUAGE:CUDA>:
+--Wno-deprecated-gpu-targets;
+--expt-extended-lambda;
+-Xcompiler=-Wall;
+-Xcompiler=-Wextra;
+-Xcompiler=-Wcast-align;
+-Xcompiler=-Wdisabled-optimization;
+-Xcompiler=-Wformat=2;
+-Xcompiler=-Winit-self;
+-Xcompiler=-Wlogical-op;
+-Xcompiler=-Wmissing-include-dirs;
+-Xcompiler=-Woverloaded-virtual;
+-Xcompiler=-Wpointer-arith;
+-Xcompiler=-Wshadow;
+-Xcompiler=-Wstrict-aliasing;
+-Xcompiler=-Wswitch-enum;
+-Xcompiler=-Wvla;
+-Xptxas=-v;
+>
+)
+endfunction()
+
+function(set_cxx_standard target)
+set_property(TARGET ${target} PROPERTY CXX_STANDARD 11)
+set_property(TARGET ${target} PROPERTY CXX_EXTENSIONS OFF)
+set_property(TARGET ${target} PROPERTY CXX_STANDARD_REQUIRED ON)
+set_property(TARGET ${target} PROPERTY CUDA_STANDARD 11)
+set_property(TARGET ${target} PROPERTY CUDA_STANDARD_REQUIRED ON)
+endfunction()
+
+# copy run-all.sh to build directory
+configure_file(${CMAKE_CURRENT_LIST_DIR}/run-all.sh ${CMAKE_CURRENT_BINARY_DIR}/run-all.sh COPYONLY)
+
+if (MPI_FOUND)
+  add_executable(main main.cpp)
+  target_link_libraries(main MPI::MPI_CXX)
+  # target_include_directories(main PRIVATE ${MPI_CXX_INCLUDE_PATH})
+  # target_compile_options(main PRIVATE ${MPI_CXX_COMPILE_FLAGS})
+  # target_link_libraries(main ${MPI_CXX_LIBRARIES} ${MPI_CXX_LINK_FLAGS})
+  set_cxx_options(main)
+  set_cxx_standard(main)
+endif()
+
+if (MPI_FOUND)
+  add_executable(one-sided one_sided.cpp)
+  target_link_libraries(one-sided MPI::MPI_CXX)
+  set_cxx_options(one-sided)
+  set_cxx_standard(one-sided)
+endif()
+
+if (MPI_FOUND AND CMAKE_CUDA_COMPILER)
+  add_executable(one-sided-gpu one_sided_gpu.cpp)
+  target_link_libraries(one-sided-gpu MPI::MPI_CXX)
+  target_link_libraries(one-sided-gpu CUDA::cudart)
+  set_cxx_options(one-sided-gpu)
+  set_cxx_standard(one-sided-gpu)
+endif()
\ No newline at end of file
diff --git a/one_sided_gpu.cpp b/one_sided_gpu.cpp
new file mode 100644
index 0000000..2c08dd5
--- /dev/null
+++ b/one_sided_gpu.cpp
@@ -0,0 +1,61 @@
+#include <cuda_runtime.h>
+#include <mpi.h>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+
+inline void checkCuda(cudaError_t result, const char *file, const int line) {
+  if (result != cudaSuccess) {
+    fprintf(stderr, "%s:%d: CUDA Runtime Error %d: %s\n", file, line, int(result), cudaGetErrorString(result));
+    exit(-1);
+  }
+}
+#define CUDA_RUNTIME(stmt) checkCuda(stmt, __FILE__, __LINE__);
+
+int main(int argc, char **argv) {
+  MPI_Init(&argc, &argv);
+
+  // create an MPI window over one int of CUDA device memory
+  int *a{};
+  MPI_Win win;
+  {
+    MPI_Aint size = sizeof(int);
+    int disp_unit = sizeof(int);
+    CUDA_RUNTIME(cudaMalloc(&a, size));
+    MPI_Win_create(a, size, disp_unit, MPI_INFO_NULL, MPI_COMM_WORLD, &win);
+  }
+
+  int rank, size;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+  // expect our a to be set by the left
+  int source;
+  if (0 == rank) {
+    source = size - 1;
+  } else {
+    source = rank - 1;
+  }
+
+  int target; // set the right
+  if (rank == size - 1) {
+    target = 0;
+  } else {
+    target = rank + 1;
+  }
+
+  // start exposure of window
+  MPI_Win_fence(0, win);
+
+  // send our rank to the target window
+  std::cout << "rank " << rank << " put to " << target << std::endl << std::flush;
+  MPI_Put(&rank, 1, MPI_INT, target, 0, 1, MPI_INT, win);
+
+  // end exposure of window
+  MPI_Win_fence(0, win);
+  int err = 0;
+  MPI_Win_free(&win);
+  CUDA_RUNTIME(cudaFree(a)); // release device memory only after the window over it is freed
+  MPI_Finalize();
+  std::cout << "rank " << rank << " completed" << std::endl << std::flush;
+  return err;
+}