From a7c755d8995a9f06e7d98a0baddbe97ace15daf7 Mon Sep 17 00:00:00 2001 From: Carl William Pearson Date: Fri, 11 Jun 2021 14:43:16 -0600 Subject: [PATCH] provisional distributed spmv --- CMakeLists.txt | 6 +-- README.md | 16 +++++++ at.hpp | 5 +++ csr_mat.hpp | 76 ++++++++++++++++++++++++++++++++ overlap.cu | 108 +++++----------------------------------------- row_part_spmv.cuh | 88 ++++++++++++++++++++++++++++++------- split_coo_mat.hpp | 1 + 7 files changed, 184 insertions(+), 116 deletions(-) create mode 100644 at.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 5f58d47..6a97724 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -87,7 +87,7 @@ endfunction() if (MPI_FOUND) add_executable(main main.cu) - target_include_directories(main PRIVATE SYSTEM ${MPI_CXX_INCLUDE_DIRS}) + target_include_directories(main SYSTEM PRIVATE ${MPI_CXX_INCLUDE_DIRS}) target_link_libraries(main ${MPI_CXX_LIBRARIES}) target_link_libraries(main CUDA::nvToolsExt) # target_include_directories(main PRIVATE ${MPI_CXX_INCLUDE_PATH}) @@ -99,10 +99,10 @@ endif() if (MPI_FOUND) add_executable(overlap overlap.cu) - target_include_directories(overlap PRIVATE SYSTEM ${MPI_CXX_INCLUDE_DIRS}) + target_include_directories(overlap SYSTEM PRIVATE ${MPI_CXX_INCLUDE_DIRS}) target_link_libraries(overlap ${MPI_CXX_LIBRARIES}) target_link_libraries(overlap CUDA::nvToolsExt) - set_cxx_options(overlap) + set_cuda_options(overlap) set_cxx_standard(overlap) endif() diff --git a/README.md b/README.md index 4061b2d..838106c 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,22 @@ module load cde/v2/cmake/3.19.2 mpirun -n 2 ~/software/nsight-systems-cli/2021.2.1/bin/nsys profile -c cudaProfilerApi -t cuda,mpi,nvtx -o dist-spmv_%q{OMPI_COMM_WORLD_RANK} -f true ./main ``` +### To build with OpenMPI 4.1.1 + +``` +module purge +module load sems-env +module load sems-cmake/3.19.1 +module load sems-gcc/7.2.0 +module load sems-cuda/10.1 +cmake .. 
-DCMAKE_PREFIX_PATH=~cwpears/software/openmpi-4.1.1-cuda10.1-gcc7.2 +``` + +``` +~cwpears/software/openmpi-4.1.1-cuda10.1-gcc7.2/bin/mpirun -n 2 ./overlap + +``` + ## Design Considerations Minimize CUDA runtime calls \ No newline at end of file diff --git a/at.hpp b/at.hpp new file mode 100644 index 0000000..f8afa66 --- /dev/null +++ b/at.hpp @@ -0,0 +1,5 @@ +#pragma once + +#define STRINGIFY(x) #x +#define TOSTRING(x) STRINGIFY(x) +#define AT __FILE__ ":" TOSTRING(__LINE__) \ No newline at end of file diff --git a/csr_mat.hpp b/csr_mat.hpp index 1b2ddf1..c642abe 100644 --- a/csr_mat.hpp +++ b/csr_mat.hpp @@ -6,6 +6,8 @@ #include "coo_mat.hpp" #include "algorithm.hpp" +#include + template class CsrMat { public: @@ -161,6 +163,16 @@ public: CsrMat(CsrMat &&other) = delete; CsrMat(const CsrMat &other) = delete; + CsrMat &operator=(CsrMat &&rhs) { + if (this != &rhs) { + rowPtr_ = std::move(rhs.rowPtr_); + colInd_ = std::move(rhs.colInd_); + val_ = std::move(rhs.val_); + numCols_ = std::move(rhs.numCols_); + } + return *this; + } + // create device matrix from host CsrMat(const CsrMat &m) : rowPtr_(m.rowPtr_), colInd_(m.colInd_), val_(m.val_), numCols_(m.numCols_) { @@ -193,4 +205,68 @@ public: return v; } +}; + + +// mxn random matrix with nnz +CsrMat random_matrix(const int64_t m, const int64_t n, const int64_t nnz) { + + if (m * n < nnz) { + throw std::logic_error(AT); + } + + CooMat coo(m,n); + while(coo.nnz() < nnz) { + + int64_t toPush = nnz - coo.nnz(); + std::cerr << "adding " << toPush << " non-zeros\n"; + for (int64_t _ = 0; _ < toPush; ++_) { + int r = rand() % m; + int c = rand() % n; + float e = 1.0; + coo.push_back(r, c, e); + } + std::cerr << "removing duplicate non-zeros\n"; + coo.remove_duplicates(); + } + coo.sort(); + std::cerr << "coo: " << coo.num_rows() << "x" << coo.num_cols() << "\n"; + CsrMat csr(coo); + std::cerr << "csr: " << csr.num_rows() << "x" << csr.num_cols() << " w/ " << csr.nnz() << "\n"; + return csr; +}; + +// nxn diagonal matrix with bandwidth b +CsrMat random_band_matrix(const int64_t n, const int64_t bw, const int64_t nnz) { + + CooMat coo(n,n); + while(coo.nnz() < nnz) { + + int64_t toPush = nnz - coo.nnz(); + std::cerr << "adding " << toPush << " non-zeros\n"; + for (int64_t _ = 0; _ < toPush; ++_) { + int r = rand() % n; // random row + + // column in the band + int lb = r - bw; + int ub = r + bw + 1; + int64_t c = rand() % (ub - lb) + lb; + if (c < 0 || c >= n) { + // retry, don't over-weight first or last column + continue; + } + float e = 1.0; + + assert(c < n); + assert(r < n); + coo.push_back(r, c, e); + } + std::cerr << "removing duplicate non-zeros\n"; + coo.remove_duplicates(); + } + coo.sort(); + std::cerr << "coo: " << coo.num_rows() << "x" << coo.num_cols() << "\n"; + CsrMat csr(coo); + std::cerr << "csr: " << csr.num_rows() << "x" << csr.num_cols() << " w/ " << csr.nnz() << "\n"; + return csr; }; \ No newline at end of file diff --git a/overlap.cu b/overlap.cu index 154f5af..4baad28 100644 --- a/overlap.cu +++ b/overlap.cu @@ -10,91 +10,13 @@ #include #include +//#define VIEW_CHECK_BOUNDS + +#include "at.hpp" #include "cuda_runtime.hpp" #include "csr_mat.hpp" #include "row_part_spmv.cuh" -#define STRINGIFY(x) #x -#define TOSTRING(x) STRINGIFY(x) -#define AT __FILE__ ":" TOSTRING(__LINE__) - -//#define VIEW_CHECK_BOUNDS - - -// mxn random matrix with nnz -CsrMat random_matrix(const int64_t m, const int64_t n, const int64_t nnz) { - - if (m * n < nnz) { - throw std::logic_error(AT); - } - - CooMat coo(m,n); - while(coo.nnz() < nnz) { - - 
int64_t toPush = nnz - coo.nnz(); - std::cerr << "adding " << toPush << " non-zeros\n"; - for (int64_t _ = 0; _ < toPush; ++_) { - int r = rand() % m; - int c = rand() % n; - float e = 1.0; - coo.push_back(r, c, e); - } - std::cerr << "removing duplicate non-zeros\n"; - coo.remove_duplicates(); - } - coo.sort(); - std::cerr << "coo: " << coo.num_rows() << "x" << coo.num_cols() << "\n"; - CsrMat csr(coo); - std::cerr << "csr: " << csr.num_rows() << "x" << csr.num_cols() << " w/ " << csr.nnz() << "\n"; - return csr; -}; - -// nxn diagonal matrix with bandwidth b -CsrMat random_band_matrix(const int64_t n, const int64_t bw, const int64_t nnz) { - - CooMat coo(n,n); - while(coo.nnz() < nnz) { - - int64_t toPush = nnz - coo.nnz(); - std::cerr << "adding " << toPush << " non-zeros\n"; - for (int64_t _ = 0; _ < toPush; ++_) { - int r = rand() % n; // random row - - // column in the band - int lb = r - bw; - int ub = r + bw + 1; - int64_t c = rand() % (ub - lb) + lb; - if (c < 0 || c > n) { - continue; // don't over-weight first or last column - } - - float e = 1.0; - coo.push_back(r, c, e); - } - std::cerr << "removing duplicate non-zeros\n"; - coo.remove_duplicates(); - } - coo.sort(); - std::cerr << "coo: " << coo.num_rows() << "x" << coo.num_cols() << "\n"; - CsrMat csr(coo); - std::cerr << "csr: " << csr.num_rows() << "x" << csr.num_cols() << " w/ " << csr.nnz() << "\n"; - return csr; -}; - -std::vector random_vector(const int64_t n) { - return std::vector(n, 1.0); -} - -Array random_array(const int64_t n) { - return Array(n, 1.0); -} - -#if 0 -int send_x(int dst, int src, std::vector &&v, MPI_Comm comm) { - MPI_Send(v.data(), v.size(), MPI_FLOAT, dst, Tag::x, comm); - return 0; -} -#endif /* recv some amount of data, and put it in the right place in a full x @@ -152,10 +74,10 @@ int main (int argc, char **argv) { // int64_t n = 150000; // int64_t nnz = 11000000; // or - int64_t m = 150000; + int64_t m = 15000; int64_t n = m; int64_t bw = m/size; // ~50% local vs remote non-zeros for most ranks - int64_t nnz = 11000000; + int64_t nnz = 1100000; CsrMat A; // "local A" @@ -168,29 +90,21 @@ int main (int argc, char **argv) { RowPartSpmv spmv(A, 0, MPI_COMM_WORLD); - - std::cerr << "A: " << A.num_rows() << "x" << A.num_cols() << " w/ " << A.nnz() << "\n"; + if (0 == rank) { + std::cerr << "A: " << A.num_rows() << "x" << A.num_cols() << " w/ " << A.nnz() << "\n"; + } std::cerr << "local A: " << spmv.lA().num_rows() << "x" << spmv.lA().num_cols() << " w/ " << spmv.lA().nnz() << "\n"; std::cerr << "remote A: " << spmv.rA().num_rows() << "x" << spmv.rA().num_cols() << " w/ " << spmv.rA().nnz() << "\n"; - - int loPrio, hiPrio; - CUDA_RUNTIME(cudaDeviceGetStreamPriorityRange (&loPrio, &hiPrio)); - - cudaStream_t loS, hiS; // "lo/hi prio" - CUDA_RUNTIME(cudaStreamCreateWithPriority(&loS, cudaStreamNonBlocking, hiPrio)); - CUDA_RUNTIME(cudaStreamCreateWithPriority(&hiS, cudaStreamNonBlocking, hiPrio)); - - cudaEvent_t event; - CUDA_RUNTIME(cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); - - const int nIters = 30; + const int nIters = 1; std::vector times(nIters); nvtxRangePush("overlap"); for (int i = 0; i < nIters; ++i) { MPI_Barrier(MPI_COMM_WORLD); double start = MPI_Wtime(); + spmv.pack_x_async(); + spmv.pack_x_wait(); spmv.send_x_async(); spmv.launch_local(); spmv.recv_x_async(); diff --git a/row_part_spmv.cuh b/row_part_spmv.cuh index 614a6fb..1875e8b 100644 --- a/row_part_spmv.cuh +++ b/row_part_spmv.cuh @@ -173,6 +173,7 @@ public: } void pack_x_async() { + assert(xSendBuf_.size() == 
xSendIdx_.size()); scatter<<<100,128, 0, packStream_>>>(xSendBuf_.view(), lx_.view(), xSendIdx_.view()); } @@ -182,9 +183,12 @@ public: void send_x_async() { + std::cerr << "send_x_async(): send to " << sendParams_.size() << " ranks\n"; + // send to neighbors who want it for (auto &p : sendParams_) { int tag = 0; + assert(xSendBuf_.size() >= p.displ + p.count); MPI_Isend(xSendBuf_.data() + p.displ, p.count, MPI_FLOAT, p.dst, tag, comm_, &p.req); } } @@ -209,8 +213,10 @@ public: CUDA_RUNTIME(cudaStreamSynchronize(kernelStream_)); } - void launch_local_spmv() {} - void launch_remote_spmv() {} + ~RowPartSpmv() { + CUDA_RUNTIME(cudaStreamDestroy(kernelStream_)); kernelStream_ = 0; + CUDA_RUNTIME(cudaStreamDestroy(packStream_)); packStream_ = 0; + } /* create from a matrix at root */ @@ -218,11 +224,14 @@ public: const CsrMat &wholeA, const int root, MPI_Comm comm - ) { + ) : comm_(comm) { + + CUDA_RUNTIME(cudaStreamCreate(&kernelStream_)); + CUDA_RUNTIME(cudaStreamCreate(&packStream_)); int rank, size; - MPI_Comm_rank(comm, &rank); - MPI_Comm_size(comm, &size); + MPI_Comm_rank(comm_, &rank); + MPI_Comm_size(comm_, &size); CsrMat a; if (root == rank) { @@ -231,27 +240,34 @@ public: for (size_t dst = 0; dst < size; ++dst) { if (root != dst) { std::cerr << "send A to " << dst << "\n"; - send_matrix(dst, 0, std::move(as[dst]), MPI_COMM_WORLD); + send_matrix(dst, 0, std::move(as[dst]), comm_); } } a = as[rank]; } else { std::cerr << "recv A at " << rank << "\n"; - a = receive_matrix(rank, 0, MPI_COMM_WORLD); + a = receive_matrix(rank, 0, comm_); } // split row part of a into local and global SplitCooMat scm = split_local_remote(a, comm); + la_ = std::move(scm.local); + ra_ = std::move(scm.remote); + assert(la_.nnz() + ra_.nnz() == a.nnz() && "lost a non-zero during split"); loff_ = scm.loff; // create local part of x array // undefined entries Range xrange = get_partition(a.num_cols(), rank, size); lx_ = Array(xrange.extent()); + ly_ = Array(la_.num_rows()); // create remote part of x array // one entry per remote column rx_ = Array(scm.globals.size()); + if (0 == rx_.size()) { + std::cerr << "WARN: not receiving anything\n"; + } // determine which columns needed from others std::map> recvCols; @@ -261,6 +277,24 @@ public: recvCols[src].push_back(c); } +#if 1 + for (int r = 0; r < size; ++r) { + MPI_Barrier(comm_); + if (r == rank) { + std::cerr << "rank " << rank << "recvCols:\n"; + for (auto it = recvCols.begin(); it != recvCols.end(); ++it) { + std::cerr << "from " << it->first << ": "; + for (auto &c : it->second) { + std::cerr << c << " "; + } + std::cerr << "\n"; + } + } + MPI_Barrier(comm_); + } +#endif + + // create receive parameters int offset = 0; for (auto it = recvCols.begin(); it != recvCols.end(); ++it) { @@ -272,15 +306,36 @@ public: recvParams_.push_back(param); } + +#if 1 + for (int r = 0; r < size; ++r) { + MPI_Barrier(comm_); + if (r == rank) { + std::cerr << "rank " << rank << " recvParams:\n"; + for (RecvParam &p : recvParams_) { + std::cerr + << "src=" << p.src + << " displ=" << p.displ + << " count=" << p.count + << "\n"; + } + } + MPI_Barrier(comm_); + } +#endif + + + // tell others which cols I need (send 0 if nothing) std::vector reqs(size); for (int dest = 0; dest < size; ++dest) { auto it = recvCols.find(dest); if (it != recvCols.end()) { - MPI_Isend(it->second.data(), it->second.size(), MPI_INT, dest, 0, comm, &reqs[dest]); + assert(it->second.data()); + MPI_Isend(it->second.data(), it->second.size(), MPI_INT, dest, 0, comm_, &reqs[dest]); } else { int _; - 
MPI_Isend(&_, 0, MPI_INT, dest, 0, comm, &reqs[dest]); + MPI_Isend(&_ /*junk*/, 0, MPI_INT, dest, 0, comm_, &reqs[dest]); } } @@ -293,10 +348,10 @@ public: MPI_Get_count(&status, MPI_INT, &count); if (count != 0) { sendCols[src].resize(count); - MPI_Recv(sendCols[src].data(), count, MPI_INT, src, 0, comm, MPI_STATUS_IGNORE); + MPI_Recv(sendCols[src].data(), count, MPI_INT, src, 0, comm_, MPI_STATUS_IGNORE); } else { int _; - MPI_Recv(&_, 0, MPI_INT, src, 0, comm, MPI_STATUS_IGNORE); + MPI_Recv(&_, 0, MPI_INT, src, 0, comm_, MPI_STATUS_IGNORE); } } @@ -310,19 +365,20 @@ public: param.dst = it->first; for (int gc : it->second) { int lc = gc - scm.loff; + assert(lc >= 0); + assert(lc < lx_.size()); offsets.push_back(lc); } param.count = offsets.size() - param.displ; sendParams_.push_back(param); } + // device version of offsets for packing + xSendIdx_ = offsets; + // buffer that x values will be placed into for sending + xSendBuf_.resize(xSendIdx_.size()); - assert(la_.size() > 0); - assert(ra_.size() > 0); // remote A assert(lx_.size() > 0); - assert(rx_.size() > 0); assert(ly_.size() > 0); - - } }; \ No newline at end of file diff --git a/split_coo_mat.hpp b/split_coo_mat.hpp index b5d97b1..20275ea 100644 --- a/split_coo_mat.hpp +++ b/split_coo_mat.hpp @@ -45,6 +45,7 @@ SplitCooMat split_local_remote(const CsrMat &m, MPI_Comm comm) { // which rows of x are local Range localRange = get_partition(m.num_cols(), rank, size); + std::cerr << "[" << localRange.lb <<","<< localRange.ub << ")\n"; int loff = localRange.lb; // build two matrices, local gets local non-zeros, remote gets remote non-zeros
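
---

Reference sketch (not part of the patch): `pack_x_async()` in `row_part_spmv.cuh` launches a `scatter` kernel on `packStream_` to gather the x entries listed in `xSendIdx_` into the contiguous `xSendBuf_`, so that `send_x_async()` can issue a single `MPI_Isend` per neighbor from one slice of that buffer. The kernel body is not shown in this diff; the standalone example below is a minimal sketch of that packing step under the assumption that it amounts to `buf[i] = x[idx[i]]`, using plain device pointers instead of the repo's `Array`/view types. The kernel name `pack_x` and the toy sizes are illustrative, not taken from the source.

```
#include <cstdio>
#include <cuda_runtime.h>

// Pack selected entries of the local x vector into a contiguous send buffer:
// buf[i] = x[idx[i]] for i in [0, n). A neighbor's values then occupy one
// contiguous slice of buf, matching the displ/count pairs in sendParams_.
__global__ void pack_x(float *buf, const float *x, const int *idx, int n) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += gridDim.x * blockDim.x) {
    buf[i] = x[idx[i]];
  }
}

int main() {
  const int nLocal = 8, nSend = 3;
  float hX[nLocal] = {0, 1, 2, 3, 4, 5, 6, 7};
  int hIdx[nSend] = {1, 4, 6};  // local x entries some neighbor rank needs
  float hBuf[nSend] = {};

  float *dX, *dBuf;
  int *dIdx;
  cudaMalloc(&dX, nLocal * sizeof(float));
  cudaMalloc(&dBuf, nSend * sizeof(float));
  cudaMalloc(&dIdx, nSend * sizeof(int));
  cudaMemcpy(dX, hX, nLocal * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(dIdx, hIdx, nSend * sizeof(int), cudaMemcpyHostToDevice);

  // the patch launches the equivalent kernel on packStream_
  pack_x<<<1, 128>>>(dBuf, dX, dIdx, nSend);
  cudaMemcpy(hBuf, dBuf, nSend * sizeof(float), cudaMemcpyDeviceToHost);

  for (int i = 0; i < nSend; ++i) {
    printf("buf[%d] = %f\n", i, hBuf[i]);  // expect 1, 4, 6
  }

  cudaFree(dX);
  cudaFree(dBuf);
  cudaFree(dIdx);
  return 0;
}
```

In the patch, `pack_x_wait()` synchronizes `packStream_` before `send_x_async()` reads `xSendBuf_`, which is why the main loop in `overlap.cu` calls `pack_x_async()`, `pack_x_wait()`, then `send_x_async()` in that order.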