From fb88da915d72ca9e43e798d423f9aa342f9733e7 Mon Sep 17 00:00:00 2001
From: Carl William Pearson
Date: Fri, 11 Jun 2021 13:09:50 -0600
Subject: [PATCH] initial local+remote spmv

---
 CMakeLists.txt    |   9 ++
 algorithm.hpp     |   9 ++
 array.hpp         | 178 +++++++++++++++++++++++++
 coo_mat.hpp       |  58 ++++++++
 csr_mat.hpp       | 196 +++++++++++++++++++++++++++
 overlap.cu        | 213 ++++++++++++++++++++++++++
 partition.hpp     |  59 +++++++++
 row_part_spmv.cuh | 328 ++++++++++++++++++++++++++++++++++++++++++++++
 split_coo_mat.hpp |  95 ++++++++++++++
 where.hpp         |   6 +
 10 files changed, 1151 insertions(+)
 create mode 100644 algorithm.hpp
 create mode 100644 array.hpp
 create mode 100644 coo_mat.hpp
 create mode 100644 csr_mat.hpp
 create mode 100644 overlap.cu
 create mode 100644 partition.hpp
 create mode 100644 row_part_spmv.cuh
 create mode 100644 split_coo_mat.hpp
 create mode 100644 where.hpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4eb928e..5f58d47 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -97,3 +97,12 @@ if (MPI_FOUND)
   set_cxx_standard(main)
 endif()
 
+if (MPI_FOUND)
+  add_executable(overlap overlap.cu)
+  target_include_directories(overlap SYSTEM PRIVATE ${MPI_CXX_INCLUDE_DIRS})
+  target_link_libraries(overlap ${MPI_CXX_LIBRARIES})
+  target_link_libraries(overlap CUDA::nvToolsExt)
+  set_cxx_options(overlap)
+  set_cxx_standard(overlap)
+endif()
+
diff --git a/algorithm.hpp b/algorithm.hpp
new file mode 100644
index 0000000..c8e6a0e
--- /dev/null
+++ b/algorithm.hpp
@@ -0,0 +1,9 @@
+#pragma once
+
+// shift [first, last) left by n positions (like C++20 std::shift_left).
+// requires random-access iterators, since it computes first - n
+template <typename ForwardIt>
+void shift_left(ForwardIt first, ForwardIt last, size_t n) {
+  while (first != last) {
+    *(first - n) = *first;
+    ++first;
+  }
+}
\ No newline at end of file
diff --git a/array.hpp b/array.hpp
new file mode 100644
index 0000000..20e00df
--- /dev/null
+++ b/array.hpp
@@ -0,0 +1,178 @@
+#pragma once
+
+#include <cstdint>
+#include <vector>
+
+#include "cuda_runtime.hpp"
+#include "where.hpp"
+
+template <Where where, typename T>
+class Array;
+
+// A non-owning view of data
+template <typename T>
+struct ArrayView
+{
+  T *data_;
+  int64_t size_;
+public:
+  ArrayView() : data_(nullptr), size_(0) {}
+  ArrayView(const ArrayView &other) = default;
+  ArrayView(ArrayView &&other) = default;
+  ArrayView &operator=(const ArrayView &rhs) = default;
+
+  __host__ __device__ int64_t size() const { return size_; }
+
+  __host__ __device__ const T &operator()(int64_t i) const {
+#ifdef VIEW_CHECK_BOUNDS
+    if (i < 0) {
+      printf("ERR: i < 0: %lld\n", (long long)i);
+    }
+    if (i >= size_) {
+      printf("ERR: i >= size_: %lld >= %lld\n", (long long)i, (long long)size_);
+    }
+#endif
+    return data_[i];
+  }
+  __host__ __device__ T &operator()(int64_t i) {
+    return data_[i];
+  }
+
+  const T *data() const {
+    return data_;
+  }
+  T *data() {
+    return data_;
+  }
+
+};
+
+/* device array
+*/
+template <typename T>
+class Array<Where::device, T>
+{
+public:
+  // array owns the data in this view
+  ArrayView<T> view_;
+public:
+  Array() = default;
+  Array(const size_t n) {
+    resize(n);
+  }
+  Array(const Array &other) = delete;
+  Array(Array &&other) : view_(other.view_) {
+    // view is non-owning, so have to clear other
+    other.view_.data_ = nullptr;
+    other.view_.size_ = 0;
+  }
+  Array &operator=(Array &&other) {
+    view_ = std::move(other.view_);
+    // view is non-owning, so have to clear other
+    other.view_.data_ = nullptr;
+    other.view_.size_ = 0;
+    return *this;
+  }
+
+  Array(const std::vector<T> &v) {
+    set_from(v);
+  }
+
+  ~Array() {
+    CUDA_RUNTIME(cudaFree(view_.data_));
+    view_.data_ = nullptr;
+    view_.size_ = 0;
+  }
+  int64_t size() const {
+    return view_.size();
+  }
+
+  ArrayView<T> view() const {
+    return view_; // copy of internal view
+  }
+
+  operator std::vector<T>() const {
+    std::vector<T> v(size());
+    CUDA_RUNTIME(cudaMemcpy(v.data(), view_.data_, size() * sizeof(T), cudaMemcpyDeviceToHost));
+    return v;
+  }
+
+  void set_from(const std::vector<T> &rhs, cudaStream_t stream = 0) {
+    resize(rhs.size());
+    CUDA_RUNTIME(cudaMemcpyAsync(view_.data_, rhs.data(), view_.size_ * sizeof(T), cudaMemcpyHostToDevice, stream));
+  }
+
+  void set_from(const Array<Where::host, T> &rhs, cudaStream_t stream = 0) {
+    resize(rhs.size());
+    CUDA_RUNTIME(cudaMemcpyAsync(view_.data_, rhs.data(), view_.size_ * sizeof(T), cudaMemcpyHostToDevice, stream));
+  }
+
+  // any change destroys all data
+  void resize(size_t n) {
+    if (size() != int64_t(n)) {
+      view_.size_ = n;
+      CUDA_RUNTIME(cudaFree(view_.data_));
+      CUDA_RUNTIME(cudaMalloc(&view_.data_, view_.size_ * sizeof(T)));
+    }
+  }
+
+  const T *data() const {
+    return view_.data();
+  }
+  T *data() {
+    return view_.data();
+  }
+
+};
+
+/* host array (pinned allocation)
+*/
+template <typename T>
+class Array<Where::host, T>
+{
+public:
+  // array owns the data in this view
+  ArrayView<T> view_;
+public:
+  Array() = default;
+  Array(const size_t n, const T &val) {
+    resize(n);
+    for (size_t i = 0; i < n; ++i) {
+      view_(i) = val;
+    }
+  }
+  Array(const Array &other) = delete;
+  Array(Array &&other) : view_(other.view_) {
+    // view is non-owning, so have to clear other
+    other.view_.data_ = nullptr;
+    other.view_.size_ = 0;
+  }
+
+  ~Array() {
+    CUDA_RUNTIME(cudaFreeHost(view_.data_));
+    view_.data_ = nullptr;
+    view_.size_ = 0;
+  }
+  int64_t size() const {
+    return view_.size();
+  }
+
+  ArrayView<T> view() const {
+    return view_; // copy of internal view
+  }
+
+  // any change destroys all data
+  void resize(size_t n) {
+    if (size() != int64_t(n)) {
+      view_.size_ = n;
+      CUDA_RUNTIME(cudaFreeHost(view_.data_));
+      CUDA_RUNTIME(cudaHostAlloc(&view_.data_, view_.size_ * sizeof(T), cudaHostAllocDefault));
+    }
+  }
+
+  const T *data() const {
+    return view_.data_;
+  }
+  T *data() {
+    return view_.data_;
+  }
+
+};
\ No newline at end of file
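A quick orientation sketch (not part of the patch) of how these containers compose; it assumes cuda_runtime.hpp supplies the CUDA_RUNTIME error-checking macro used throughout:

    #include "array.hpp"

    std::vector<float> h{1.0f, 2.0f, 3.0f};
    Array<Where::device, float> d(h);  // allocates on device, copies host-to-device
    std::vector<float> back = d;       // conversion operator copies device-to-host
    // back == h: a round trip through device memory

The view()/ArrayView split keeps kernels from owning memory: the kernels below take ArrayView by value, so the same allocation can be handed to many launches without any reference counting.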
diff --git a/coo_mat.hpp b/coo_mat.hpp
new file mode 100644
index 0000000..81c9780
--- /dev/null
+++ b/coo_mat.hpp
@@ -0,0 +1,58 @@
+#pragma once
+
+#include <algorithm>
+#include <cstdint>
+#include <vector>
+
+class CooMat {
+public:
+
+  struct Entry {
+    int i;
+    int j;
+    float e;
+
+    Entry(int _i, int _j, float _e) : i(_i), j(_j), e(_e) {}
+
+    static bool by_ij(const Entry &a, const Entry &b) {
+      if (a.i < b.i) {
+        return true;
+      } else if (a.i > b.i) {
+        return false;
+      } else {
+        return a.j < b.j;
+      }
+    }
+
+    static bool same_ij(const Entry &a, const Entry &b) {
+      return a.i == b.i && a.j == b.j;
+    }
+  };
+
+private:
+
+  // sorted during construction
+  std::vector<Entry> data_;
+  int64_t numRows_;
+  int64_t numCols_;
+
+public:
+  CooMat(int m, int n) : numRows_(m), numCols_(n) {}
+  const std::vector<Entry> &entries() const { return data_; }
+  void push_back(int i, int j, float e) {
+    data_.push_back(Entry(i, j, e));
+  }
+
+  void sort() {
+    std::sort(data_.begin(), data_.end(), Entry::by_ij);
+  }
+
+  void remove_duplicates() {
+    std::sort(data_.begin(), data_.end(), Entry::by_ij);
+    // std::unique only reorders; erase the leftover tail or nnz() never shrinks
+    data_.erase(std::unique(data_.begin(), data_.end(), Entry::same_ij), data_.end());
+  }
+
+  int64_t num_rows() const { return numRows_; }
+  int64_t num_cols() const { return numCols_; }
+  int64_t nnz() const { return data_.size(); }
+
+  std::vector<Entry>::iterator begin() { return data_.begin(); }
+  std::vector<Entry>::iterator end() { return data_.end(); }
+};
\ No newline at end of file
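The erase-after-unique fix matters to the generators in overlap.cu, which rely on nnz() actually shrinking; a small sketch of the invariant:

    CooMat coo(2, 2);
    coo.push_back(0, 0, 1.0f);
    coo.push_back(0, 0, 2.0f); // duplicate coordinate
    coo.push_back(1, 1, 1.0f);
    coo.remove_duplicates();
    // coo.nnz() == 2, so a while (coo.nnz() < target) refill loop
    // keeps adding entries until there really are `target` distinct ones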
diff --git a/csr_mat.hpp b/csr_mat.hpp
new file mode 100644
index 0000000..1b2ddf1
--- /dev/null
+++ b/csr_mat.hpp
@@ -0,0 +1,196 @@
+#pragma once
+
+#include <iostream>
+#include <stdexcept>
+#include <vector>
+
+#include "array.hpp"
+#include "coo_mat.hpp"
+#include "algorithm.hpp"
+
+template <Where where>
+class CsrMat {
+public:
+  CsrMat();
+  int64_t nnz() const;
+  int64_t num_rows() const;
+};
+template <> class CsrMat<Where::host>;
+template <> class CsrMat<Where::device>;
+
+/* host sparse matrix */
+template <> class CsrMat<Where::host>
+{
+  friend class CsrMat<Where::device>; // device can see inside
+  std::vector<int> rowPtr_;
+  std::vector<int> colInd_;
+  std::vector<float> val_;
+  int64_t numCols_;
+
+public:
+  CsrMat() = default;
+  CsrMat(int numRows, int numCols, int nnz) : rowPtr_(numRows+1), colInd_(nnz), val_(nnz), numCols_(numCols) {}
+
+  CsrMat(const CooMat &coo) : numCols_(coo.num_cols()) {
+    for (auto &e : coo.entries()) {
+      while (rowPtr_.size() <= size_t(e.i)) {
+        rowPtr_.push_back(colInd_.size());
+      }
+      colInd_.push_back(e.j);
+      val_.push_back(e.e);
+    }
+    while (rowPtr_.size() < size_t(coo.num_rows()) + 1) {
+      rowPtr_.push_back(colInd_.size());
+    }
+  }
+
+  int64_t num_rows() const {
+    if (rowPtr_.size() <= 1) {
+      return 0;
+    } else {
+      return rowPtr_.size() - 1;
+    }
+  }
+
+  int64_t num_cols() const {
+    return numCols_;
+  }
+
+  int64_t nnz() const {
+    if (colInd_.size() != val_.size()) {
+      throw std::logic_error("bad invariant");
+    }
+    return colInd_.size();
+  }
+
+  const int &row_ptr(int64_t i) const {
+    return rowPtr_[i];
+  }
+  const int &col_ind(int64_t i) const {
+    return colInd_[i];
+  }
+  const float &val(int64_t i) const {
+    return val_[i];
+  }
+
+  const int *row_ptr() const { return rowPtr_.data(); }
+  int *row_ptr() { return rowPtr_.data(); }
+  const int *col_ind() const { return colInd_.data(); }
+  int *col_ind() { return colInd_.data(); }
+  const float *val() const { return val_.data(); }
+  float *val() { return val_.data(); }
+
+  /* keep rows [rowStart, rowEnd)
+  */
+  void retain_rows(int rowStart, int rowEnd) {
+
+    if (0 == rowEnd) {
+      throw std::logic_error("unimplemented");
+    }
+    // erase rows after
+    // don't want to keep rowEnd, so rowEnd points to end of rowEnd-1
+    std::cerr << "rowPtr_ = rowPtr[:" << rowEnd+1 << "]\n";
+    rowPtr_.resize(rowEnd+1);
+    std::cerr << "resize entries to " << rowPtr_.back() << "\n";
+    colInd_.resize(rowPtr_.back());
+    val_.resize(rowPtr_.back());
+
+    // erase early row pointers
+    std::cerr << "rowPtr <<= " << rowStart << "\n";
+    shift_left(rowPtr_.begin()+rowStart, rowPtr_.end(), rowStart);
+    std::cerr << "resize rowPtr to " << rowEnd - rowStart + 1 << "\n";
+    rowPtr_.resize(rowEnd-rowStart+1);
+
+    const int off = rowPtr_[0];
+    // erase entries for first rows
+    std::cerr << "entries <<= " << off << "\n";
+    shift_left(colInd_.begin()+off, colInd_.end(), off);
+    shift_left(val_.begin()+off, val_.end(), off);
+
+    // adjust row pointer offset
+    std::cerr << "subtract rowPtrs by " << off << "\n";
+    for (auto &e : rowPtr_) {
+      e -= off;
+    }
+
+    // resize entries
+    std::cerr << "resize entries to " << rowPtr_.back() << "\n";
+    colInd_.resize(rowPtr_.back());
+    val_.resize(rowPtr_.back());
+  }
+
+};
+
+/* device sparse matrix
+*/
+template <> class CsrMat<Where::device>
+{
+  Array<Where::device, int> rowPtr_;
+  Array<Where::device, int> colInd_;
+  Array<Where::device, float> val_;
+  int64_t numCols_;
+
+public:
+
+  struct View {
+    ArrayView<int> rowPtr_;
+    ArrayView<int> colInd_;
+    ArrayView<float> val_;
+
+    __device__ int num_rows() const {
+      if (rowPtr_.size() > 0) {
+        return rowPtr_.size() - 1;
+      } else {
+        return 0;
+      }
+    }
+
+    __device__ const int &row_ptr(int64_t i) const {
+      return rowPtr_(i);
+    }
+
+    __device__ const int &col_ind(int64_t i) const {
+      return colInd_(i);
+    }
+
+    __device__ const float &val(int64_t i) const {
+      return val_(i);
+    }
+
+  };
+
+  CsrMat() = default;
+  // movable (members move), but not copyable; RowPartSpmv move-assigns these
+  CsrMat(CsrMat &&other) = default;
+  CsrMat &operator=(CsrMat &&other) = default;
+  CsrMat(const CsrMat &other) = delete;
+
+  // create device matrix from host
+  CsrMat(const CsrMat<Where::host> &m) :
+    rowPtr_(m.rowPtr_), colInd_(m.colInd_), val_(m.val_), numCols_(m.numCols_) {
+    if (colInd_.size() != val_.size()) {
+      throw std::logic_error("bad invariant");
+    }
+  }
+  ~CsrMat() {
+  }
+  int64_t num_rows() const {
+    if (rowPtr_.size() <= 1) {
+      return 0;
+    } else {
+      return rowPtr_.size() - 1;
+    }
+  }
+  int64_t num_cols() const {
+    return numCols_;
+  }
+
+  int64_t nnz() const {
+    return colInd_.size();
+  }
+
+  View view() const {
+    View v;
+    v.rowPtr_ = rowPtr_.view();
+    v.colInd_ = colInd_.view();
+    v.val_ = val_.view();
+    return v;
+  }
+
+};
\ No newline at end of file
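To make retain_rows concrete (a worked example, not in the patch): start from a 4x4 identity, rowPtr [0,1,2,3,4], colInd [0,1,2,3], and call retain_rows(1,3). Truncation gives rowPtr [0,1,2,3] and 3 entries; shifting rowPtr left by rowStart=1 and resizing gives [1,2,3]; then off = 1, so the entry arrays shift left by 1 and every row pointer drops by 1, leaving rowPtr [0,1,2] and colInd [1,2]: exactly rows 1-2 of the original, with global column indices untouched (columns are renumbered later, in split_local_remote).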
diff --git a/overlap.cu b/overlap.cu
new file mode 100644
index 0000000..154f5af
--- /dev/null
+++ b/overlap.cu
@@ -0,0 +1,213 @@
+#include <mpi.h>
+
+#include <nvToolsExt.h>
+// #include
+
+#include <algorithm>
+#include <cstdlib>
+#include <iostream>
+#include <map>
+#include <vector>
+
+#include "cuda_runtime.hpp"
+#include "csr_mat.hpp"
+#include "row_part_spmv.cuh"
+
+#define STRINGIFY(x) #x
+#define TOSTRING(x) STRINGIFY(x)
+#define AT __FILE__ ":" TOSTRING(__LINE__)
+
+//#define VIEW_CHECK_BOUNDS
+
+
+// mxn random matrix with nnz
+CsrMat<Where::host> random_matrix(const int64_t m, const int64_t n, const int64_t nnz) {
+
+  if (m * n < nnz) {
+    throw std::logic_error(AT);
+  }
+
+  CooMat coo(m, n);
+  while (coo.nnz() < nnz) {
+
+    int64_t toPush = nnz - coo.nnz();
+    std::cerr << "adding " << toPush << " non-zeros\n";
+    for (int64_t _ = 0; _ < toPush; ++_) {
+      int r = rand() % m;
+      int c = rand() % n;
+      float e = 1.0;
+      coo.push_back(r, c, e);
+    }
+    std::cerr << "removing duplicate non-zeros\n";
+    coo.remove_duplicates();
+  }
+  coo.sort();
+  std::cerr << "coo: " << coo.num_rows() << "x" << coo.num_cols() << "\n";
+  CsrMat<Where::host> csr(coo);
+  std::cerr << "csr: " << csr.num_rows() << "x" << csr.num_cols() << " w/ " << csr.nnz() << "\n";
+  return csr;
+};
+
+// nxn banded matrix with half-bandwidth bw and nnz non-zeros
+CsrMat<Where::host> random_band_matrix(const int64_t n, const int64_t bw, const int64_t nnz) {
+
+  CooMat coo(n, n);
+  while (coo.nnz() < nnz) {
+
+    int64_t toPush = nnz - coo.nnz();
+    std::cerr << "adding " << toPush << " non-zeros\n";
+    for (int64_t _ = 0; _ < toPush; ++_) {
+      int r = rand() % n; // random row
+
+      // column in the band
+      int lb = r - bw;
+      int ub = r + bw + 1;
+      int64_t c = rand() % (ub - lb) + lb;
+      if (c < 0 || c >= n) {
+        continue; // don't over-weight first or last column
+      }
+
+      float e = 1.0;
+      coo.push_back(r, c, e);
+    }
+    std::cerr << "removing duplicate non-zeros\n";
+    coo.remove_duplicates();
+  }
+  coo.sort();
+  std::cerr << "coo: " << coo.num_rows() << "x" << coo.num_cols() << "\n";
+  CsrMat<Where::host> csr(coo);
+  std::cerr << "csr: " << csr.num_rows() << "x" << csr.num_cols() << " w/ " << csr.nnz() << "\n";
+  return csr;
+};
+
+std::vector<float> random_vector(const int64_t n) {
+  return std::vector<float>(n, 1.0);
+}
+
+Array<Where::host, float> random_array(const int64_t n) {
+  return Array<Where::host, float>(n, 1.0);
+}
+
+#if 0
+int send_x(int dst, int src, std::vector<float> &&v, MPI_Comm comm) {
+  MPI_Send(v.data(), v.size(), MPI_FLOAT, dst, Tag::x, comm);
+  return 0;
+}
+#endif
+
+/* recv some amount of data, and put it in the right place
+   in a full x
+*/
+std::vector<float> receive_x(const int n, const int dst, int src, MPI_Comm comm) {
+  int rank = 0;
+  int size = 1;
+  MPI_Comm_rank(comm, &rank);
+  MPI_Comm_size(comm, &size);
+
+  // which rows of x are local
+  Range local = get_partition(n, rank, size);
+
+  // probe for size
+  MPI_Status stat;
+  MPI_Probe(0, Tag::x, comm, &stat);
+  int sz;
+  MPI_Get_count(&stat, MPI_FLOAT, &sz);
+  if (sz != local.ub - local.lb) {
+    throw std::logic_error(AT);
+  }
+
+  std::cerr << "recv " << sz << " x entries into offset " << local.lb << "\n";
+  std::vector<float> x(n);
+  MPI_Recv(x.data() + local.lb, sz, MPI_FLOAT, 0, Tag::x, comm, MPI_STATUS_IGNORE);
+
+  return x;
+}
+
+
+
+// z += a
+__global__ void vector_add(ArrayView<float> z, const ArrayView<float> a) {
+  for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < z.size(); i += blockDim.x * gridDim.x) {
+    z(i) += a(i);
+  }
+}
+
+int main(int argc, char **argv) {
+
+  MPI_Init(&argc, &argv);
+
+  int rank = 0;
+  int size = 1;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+  std::cerr << "get a gpu...\n";
+  int deviceCount = 1;
+  CUDA_RUNTIME(cudaGetDeviceCount(&deviceCount));
+  CUDA_RUNTIME(cudaSetDevice(rank % deviceCount));
+  CUDA_RUNTIME(cudaFree(0));
+  std::cerr << "barrier...\n";
+  MPI_Barrier(MPI_COMM_WORLD);
+
+  // int64_t m = 150000;
+  // int64_t n = 150000;
+  // int64_t nnz = 11000000;
+  // or
+  int64_t m = 150000;
+  int64_t n = m;
+  int64_t bw = m / size; // ~50% local vs remote non-zeros for most ranks
+  int64_t nnz = 11000000;
+
+  CsrMat<Where::host> A; // whole A at root, empty elsewhere
+
+  // generate and distribute A
+  if (0 == rank) {
+    std::cerr << "generate matrix\n";
+    A = random_band_matrix(m, bw, nnz);
+  }
+
+  RowPartSpmv spmv(A, 0, MPI_COMM_WORLD);
+
+  std::cerr << "A: " << A.num_rows() << "x" << A.num_cols() << " w/ " << A.nnz() << "\n";
+  std::cerr << "local A: " << spmv.lA().num_rows() << "x" << spmv.lA().num_cols() << " w/ " << spmv.lA().nnz() << "\n";
+  std::cerr << "remote A: " << spmv.rA().num_rows() << "x" << spmv.rA().num_cols() << " w/ " << spmv.rA().nnz() << "\n";
+
+  int loPrio, hiPrio;
+  CUDA_RUNTIME(cudaDeviceGetStreamPriorityRange(&loPrio, &hiPrio));
+
+  cudaStream_t loS, hiS; // "lo/hi prio"
+  CUDA_RUNTIME(cudaStreamCreateWithPriority(&loS, cudaStreamNonBlocking, loPrio));
+  CUDA_RUNTIME(cudaStreamCreateWithPriority(&hiS, cudaStreamNonBlocking, hiPrio));
+
+  cudaEvent_t event;
+  CUDA_RUNTIME(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
+
+  const int nIters = 30;
+  std::vector<double> times(nIters);
+
+  nvtxRangePush("overlap");
+  for (int i = 0; i < nIters; ++i) {
+    MPI_Barrier(MPI_COMM_WORLD);
+    double start = MPI_Wtime();
+    spmv.pack_x_async(); // stage the lx_ entries other ranks need
+    spmv.pack_x_wait();
+    spmv.send_x_async();
+    spmv.launch_local(); // needs no remote x; overlaps the transfers
+    spmv.recv_x_async();
+    spmv.send_x_wait();
+    spmv.recv_x_wait();
+    spmv.launch_remote();
+    spmv.finish();
+    times[i] = MPI_Wtime() - start;
+  }
+  nvtxRangePop(); // overlap
+  MPI_Allreduce(MPI_IN_PLACE, times.data(), times.size(), MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+  if (0 == rank) {
+    std::sort(times.begin(), times.end());
+    std::cerr << "median iteration time: " << times[times.size() / 2] << " s\n";
+  }
+
+  MPI_Finalize();

+  return 0;
+}
\ No newline at end of file
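A hedged usage note (the exact launch command depends on the MPI installation): the benchmark expects one rank per GPU and a CUDA-aware MPI, since MPI_Isend/MPI_Irecv are posted directly on device buffers (xSendBuf_, rx_):

    # illustrative only
    mpirun -n 4 ./overlap

Each rank logs matrix shapes to stderr; rank 0 prints the median per-iteration time over the 30 timed iterations. The MPI_Allreduce first takes the max across ranks, so the median is taken over the slowest-rank times.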
diff --git a/partition.hpp b/partition.hpp
new file mode 100644
index 0000000..eba8f11
--- /dev/null
+++ b/partition.hpp
@@ -0,0 +1,59 @@
+#pragma once
+
+#include <iostream>
+#include <vector>
+
+#include "csr_mat.hpp"
+
+struct Range {
+  int lb;
+  int ub;
+
+  int extent() const { return ub - lb; }
+};
+
+/* get the ith part of splitting domain into n pieces
+   if not divisible, remainder is distributed to the lower-numbered parts
+*/
+Range get_partition(const int domain, const int i, const int n) {
+  int div = domain / n;
+  int rem = domain % n;
+
+  int lb, ub;
+
+  if (i < rem) {
+    lb = i * (div + 1);
+    ub = lb + (div + 1);
+  } else {
+    lb = rem * (div + 1) + (i - rem) * div;
+    ub = lb + div;
+  }
+  return Range{.lb = lb, .ub = ub};
+}
+
+// who owns item `i` from `domain` split into `n`
+int get_owner(int domain, int i, const int n) {
+  int div = domain / n;
+  int rem = domain % n;
+
+  // i is in the first `rem` pieces, which have extent div+1
+  if (i < (div + 1) * rem) {
+    return i / (div + 1);
+  } else {
+    i -= (div + 1) * rem;
+    domain -= (div + 1) * rem;
+    return rem + i / div;
+  }
+}
+
+std::vector<CsrMat<Where::host>>
+part_by_rows(const CsrMat<Where::host> &m, const int parts) {
+
+  std::vector<CsrMat<Where::host>> mats;
+
+  for (int p = 0; p < parts; ++p) {
+    Range range = get_partition(m.num_rows(), p, parts);
+    std::cerr << "matrix part " << p << " has " << range.ub - range.lb << " rows\n";
+    CsrMat<Where::host> part(m);
+    part.retain_rows(range.lb, range.ub);
+    mats.push_back(std::move(part));
+  }
+
+  return mats;
+}
\ No newline at end of file
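A quick numeric check that the two functions agree (a sketch; a domain of 10 split 3 ways gives extents 4, 3, 3):

    // ranges: [0,4), [4,7), [7,10)
    assert(get_partition(10, 1, 3).lb == 4 && get_partition(10, 1, 3).ub == 7);
    assert(get_owner(10, 5, 3) == 1); // 5 lies in [4,7)
    assert(get_owner(10, 9, 3) == 2); // 9 lies in [7,10)

split_local_remote below leans on this agreement: it applies get_owner to column indices to decide which rank will supply each remote x entry.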
diff --git a/row_part_spmv.cuh b/row_part_spmv.cuh
new file mode 100644
index 0000000..614a6fb
--- /dev/null
+++ b/row_part_spmv.cuh
@@ -0,0 +1,328 @@
+#pragma once
+
+#include <cassert>
+#include <map>
+
+#include "csr_mat.hpp"
+#include "partition.hpp"
+#include "split_coo_mat.hpp"
+
+#include <mpi.h>
+
+
+enum class ProductConfig {
+  MODIFY, // b +=
+  SET     // b =
+};
+
+/* Ax=b
+*/
+__global__ void spmv(ArrayView<float> b,
+                     const CsrMat<Where::device>::View A,
+                     const ArrayView<float> x,
+                     const ProductConfig pc
+) {
+  // one thread per row
+  for (int r = blockDim.x * blockIdx.x + threadIdx.x; r < A.num_rows(); r += blockDim.x * gridDim.x) {
+    float acc = 0;
+    for (int ci = A.row_ptr(r); ci < A.row_ptr(r + 1); ++ci) {
+      int c = A.col_ind(ci);
+      acc += A.val(ci) * x(c);
+    }
+    if (ProductConfig::SET == pc) {
+      b(r) = acc;
+    } else {
+      b(r) += acc;
+    }
+  }
+}
+
+
+
+enum Tag : int {
+  row_ptr,
+  col_ind,
+  val,
+  x,
+  num_cols
+};
+
+
+
+
+int send_matrix(int dst, int src, CsrMat<Where::host> &&m, MPI_Comm comm) {
+
+  MPI_Request reqs[4];
+
+  int numCols = m.num_cols();
+  MPI_Isend(&numCols, 1, MPI_INT, dst, Tag::num_cols, comm, &reqs[0]);
+  MPI_Isend(m.row_ptr(), m.num_rows() + 1, MPI_INT, dst, Tag::row_ptr, comm, &reqs[1]);
+  MPI_Isend(m.col_ind(), m.nnz(), MPI_INT, dst, Tag::col_ind, comm, &reqs[2]);
+  MPI_Isend(m.val(), m.nnz(), MPI_FLOAT, dst, Tag::val, comm, &reqs[3]);
+  MPI_Waitall(4, reqs, MPI_STATUSES_IGNORE);
+
+  return 0;
+}
+
+CsrMat<Where::host> receive_matrix(int dst, int src, MPI_Comm comm) {
+
+  int numCols;
+  MPI_Recv(&numCols, 1, MPI_INT, src, Tag::num_cols, comm, MPI_STATUS_IGNORE);
+
+  // probe for number of rows
+  MPI_Status stat;
+  MPI_Probe(src, Tag::row_ptr, comm, &stat);
+  int numRows;
+  MPI_Get_count(&stat, MPI_INT, &numRows);
+  if (numRows > 0) {
+    --numRows;
+  }
+
+  // probe for nnz
+  MPI_Probe(src, Tag::col_ind, comm, &stat);
+  int nnz;
+  MPI_Get_count(&stat, MPI_INT, &nnz);
+
+  std::cerr << "recv " << numRows << "x" << numCols << " w/ " << nnz << "\n";
+  CsrMat<Where::host> csr(numRows, numCols, nnz);
+
+  // receive actual data into matrix
+  MPI_Recv(csr.row_ptr(), numRows + 1, MPI_INT, src, Tag::row_ptr, comm, MPI_STATUS_IGNORE);
+  MPI_Recv(csr.col_ind(), nnz, MPI_INT, src, Tag::col_ind, comm, MPI_STATUS_IGNORE);
+  MPI_Recv(csr.val(), nnz, MPI_FLOAT, src, Tag::val, comm, MPI_STATUS_IGNORE);
+
+  return csr;
+}
+
+// out(i) = in(idx(i)); a gather used to pack the x entries we must send
+__global__ void scatter(ArrayView<float> out,
+                        const ArrayView<float> in,
+                        const ArrayView<int> idx) {
+  for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < out.size(); i += blockDim.x * gridDim.x) {
+    out(i) = in(idx(i));
+  }
+}
+
+
+/* Ax=y, partitioned evenly by rows of A
+
+   always have to pack on the send side, since not all local x values will be needed;
+   so may as well pack in a way that the receiver doesn't have to unpack
+
+   serializing local and remote means you don't have to worry about
+   concurrent adds to the product vector.
+   if kernels are sufficiently large, no real opportunity for these to overlap anyway;
+   if they're small, communication time will be longer anyway
+*/
+class RowPartSpmv {
+private:
+  MPI_Comm comm_;
+
+  int loff_;                       // first row in global index
+  CsrMat<Where::device> la_;       // local A
+  CsrMat<Where::device> ra_;       // remote A
+  Array<Where::device, float> lx_; // local x
+  Array<Where::device, float> rx_; // remote x
+  Array<Where::device, float> ly_;
+
+  // info for sending x
+  struct SendParam {
+    int dst; // destination rank
+    int displ;
+    int count;
+    MPI_Request req;
+  };
+  std::vector<SendParam> sendParams_;     // parameters for each rank
+  Array<Where::device, int> xSendIdx_;    // which entry of lx_ will be in each xSendBuf_
+  Array<Where::device, float> xSendBuf_;  // send local x entries to other ranks
+
+  std::vector<int> gCols_; // global index from local
+
+  std::map<int, std::vector<int>> sendEntr; // which entries of x to send to each rank
+  std::map<int, MPI_Request> sendReq;
+
+  struct RecvParam {
+    int src;   // source rank
+    int displ; // displacement in rx_
+    int count; // number of entries
+    MPI_Request req;
+  };
+  std::vector<RecvParam> recvParams_;
+
+  cudaStream_t kernelStream_;
+  cudaStream_t packStream_;
+
+public:
+
+  const CsrMat<Where::device> &lA() const { return la_; }
+  const CsrMat<Where::device> &rA() const { return ra_; }
+
+  void launch_local() {
+    dim3 dimGrid(100);
+    dim3 dimBlock(128);
+    spmv<<<dimGrid, dimBlock, 0, kernelStream_>>>(ly_.view(), la_.view(), lx_.view(), ProductConfig::SET);
+    CUDA_RUNTIME(cudaGetLastError());
+  }
+
+  void launch_remote() {
+    dim3 dimGrid(100);
+    dim3 dimBlock(128);
+    spmv<<<dimGrid, dimBlock, 0, kernelStream_>>>(ly_.view(), ra_.view(), rx_.view(), ProductConfig::MODIFY);
+    CUDA_RUNTIME(cudaGetLastError());
+  }
+
+  void pack_x_async() {
+    scatter<<<100, 128, 0, packStream_>>>(xSendBuf_.view(), lx_.view(), xSendIdx_.view());
+    CUDA_RUNTIME(cudaGetLastError());
+  }
+
+  void pack_x_wait() {
+    CUDA_RUNTIME(cudaStreamSynchronize(packStream_));
+  }
+
+  void send_x_async() {
+    // send to neighbors who want it
+    for (auto &p : sendParams_) {
+      int tag = 0;
+      MPI_Isend(xSendBuf_.data() + p.displ, p.count, MPI_FLOAT, p.dst, tag, comm_, &p.req);
+    }
+  }
+  void send_x_wait() {
+    for (auto &p : sendParams_) {
+      MPI_Wait(&p.req, MPI_STATUS_IGNORE);
+    }
+  }
+  void recv_x_async() {
+    for (auto &p : recvParams_) {
+      int tag = 0;
+      MPI_Irecv(rx_.data() + p.displ, p.count, MPI_FLOAT, p.src, tag, comm_, &p.req);
+    }
+  }
+  void recv_x_wait() {
+    for (auto &p : recvParams_) {
+      MPI_Wait(&p.req, MPI_STATUS_IGNORE);
+    }
+  }
+
+  void finish() {
+    CUDA_RUNTIME(cudaStreamSynchronize(kernelStream_));
+  }
+
+  void launch_local_spmv() {}
+  void launch_remote_spmv() {}
+
+  ~RowPartSpmv() {
+    CUDA_RUNTIME(cudaStreamDestroy(kernelStream_));
+    CUDA_RUNTIME(cudaStreamDestroy(packStream_));
+  }
+
+  /* create from a matrix at root
+  */
+  RowPartSpmv(
+    const CsrMat<Where::host> &wholeA,
+    const int root,
+    MPI_Comm comm
+  ) : comm_(comm) {
+
+    int rank, size;
+    MPI_Comm_rank(comm, &rank);
+    MPI_Comm_size(comm, &size);
+
+    CUDA_RUNTIME(cudaStreamCreate(&kernelStream_));
+    CUDA_RUNTIME(cudaStreamCreate(&packStream_));
+
+    CsrMat<Where::host> a;
+    if (root == rank) {
+      std::cerr << "partition matrix\n";
+      std::vector<CsrMat<Where::host>> as = part_by_rows(wholeA, size);
+      for (int dst = 0; dst < size; ++dst) {
+        if (root != dst) {
+          std::cerr << "send A to " << dst << "\n";
+          send_matrix(dst, root, std::move(as[dst]), comm);
+        }
+      }
+      a = as[rank];
+    } else {
+      std::cerr << "recv A at " << rank << "\n";
+      a = receive_matrix(rank, root, comm);
+    }
+
+    // split row part of a into local and remote
+    SplitCooMat scm = split_local_remote(a, comm);
+    loff_ = scm.loff;
+
+    // upload the local and remote parts, and make room for the product
+    la_ = CsrMat<Where::device>(scm.local);
+    ra_ = CsrMat<Where::device>(scm.remote);
+    ly_ = Array<Where::device, float>(a.num_rows());
+
+    // create local part of x array
+    // undefined entries
+    Range xrange = get_partition(a.num_cols(), rank, size);
+    lx_ = Array<Where::device, float>(xrange.extent());
+
+    // create remote part of x array
+    // one entry per remote column
+    rx_ = Array<Where::device, float>(scm.globals.size());
+
+    // determine which columns needed from others
+    std::map<int, std::vector<int>> recvCols;
+    for (int c : scm.globals) {
+      auto src = get_owner(a.num_cols(), c, size);
+      assert(rank != src && "should not need my own columns in remote part");
+      recvCols[src].push_back(c);
+    }
+
+    // create receive parameters
+    int offset = 0;
+    for (auto it = recvCols.begin(); it != recvCols.end(); ++it) {
+      RecvParam param;
+      param.displ = offset;
+      param.src = it->first;
+      offset += it->second.size();
+      param.count = offset - param.displ;
+      recvParams_.push_back(param);
+    }
+
+    // tell others which cols I need (send 0 if nothing)
+    std::vector<MPI_Request> reqs(size);
+    int empty = 0; // outlives the Isends below
+    for (int dest = 0; dest < size; ++dest) {
+      auto it = recvCols.find(dest);
+      if (it != recvCols.end()) {
+        MPI_Isend(it->second.data(), it->second.size(), MPI_INT, dest, 0, comm, &reqs[dest]);
+      } else {
+        MPI_Isend(&empty, 0, MPI_INT, dest, 0, comm, &reqs[dest]);
+      }
+    }
+
+    // which global x rows other ranks need from me
+    std::map<int, std::vector<int>> sendCols;
+    for (int src = 0; src < size; ++src) {
+      MPI_Status status;
+      MPI_Probe(src, 0, comm, &status);
+      int count;
+      MPI_Get_count(&status, MPI_INT, &count);
+      if (count != 0) {
+        sendCols[src].resize(count);
+        MPI_Recv(sendCols[src].data(), count, MPI_INT, src, 0, comm, MPI_STATUS_IGNORE);
+      } else {
+        int _;
+        MPI_Recv(&_, 0, MPI_INT, src, 0, comm, MPI_STATUS_IGNORE);
+      }
+    }
+    MPI_Waitall(size, reqs.data(), MPI_STATUSES_IGNORE);
+
+    // create the offsets from lx that we will send out
+    // TODO: should be built directly as a device array
+    std::vector<int> offsets;
+    for (auto it = sendCols.begin(); it != sendCols.end(); ++it) {
+      // TODO - adjust for changed local array columns
+      SendParam param;
+      param.displ = offsets.size();
+      param.dst = it->first;
+      for (int gc : it->second) {
+        int lc = gc - scm.loff;
+        offsets.push_back(lc);
+      }
+      param.count = offsets.size() - param.displ;
+      sendParams_.push_back(param);
+    }
+    xSendIdx_.set_from(offsets); // upload gather indices
+    CUDA_RUNTIME(cudaStreamSynchronize(0)); // finish H2D copy before offsets goes out of scope
+    xSendBuf_.resize(xSendIdx_.size());
+
+    assert(la_.nnz() > 0);
+    assert(ra_.nnz() > 0); // remote A
+    assert(lx_.size() > 0);
+    assert(rx_.size() > 0);
+    assert(ly_.size() > 0);
+  }
+};
\ No newline at end of file
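Putting the pieces together, one overlapped y = A*x step looks like this (a sketch of the intended call order; it mirrors the loop in overlap.cu and assumes a CUDA-aware MPI, since the send/recv buffers live on the device):

    RowPartSpmv spmv(A, /*root*/ 0, MPI_COMM_WORLD);
    spmv.pack_x_async();  // gather needed lx_ entries into xSendBuf_
    spmv.pack_x_wait();
    spmv.send_x_async();  // post sends of packed x
    spmv.launch_local();  // local product; needs no communication
    spmv.recv_x_async();  // fill rx_ straight from the wire, no unpack
    spmv.send_x_wait();
    spmv.recv_x_wait();
    spmv.launch_remote(); // remote product consumes rx_
    spmv.finish();

Because launch_local uses ProductConfig::SET and launch_remote uses MODIFY on the same kernel stream, the two kernels are serialized and the += into ly_ cannot race with the =.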
diff --git a/split_coo_mat.hpp b/split_coo_mat.hpp
new file mode 100644
index 0000000..b5d97b1
--- /dev/null
+++ b/split_coo_mat.hpp
@@ -0,0 +1,95 @@
+#pragma once
+
+#include <algorithm>
+#include <map>
+#include <vector>
+
+#include <mpi.h>
+
+#include "coo_mat.hpp"
+#include "partition.hpp"
+
+
+/* local matrix has cols renumbered to be 0..N
+   into the local dense vector
+
+   remote matrix has cols renumbered for the remote dense vector
+*/
+struct SplitCooMat {
+  int loff;                   // global row for local matrix 0
+  CsrMat<Where::host> local;  // local matrix
+  CsrMat<Where::host> remote; // remote matrix (with local column indices)
+  std::map<int, int> locals;  // get local column from global column
+  std::vector<int> globals;   // get global column for local column
+};
+
+/* Row partition of a matrix into a "local" and "remote" part.
+   If locally there are rows i...j, then the local part also has columns i...j.
+   The remote part will have all other columns.
+
+   Each rank will also renumber the column indices in the remote part.
+   This rank will recv the corresponding remote x vector entries, but
+   doesn't want to materialize the whole distributed x vector in memory,
+   so the column indices must be packed into a contiguous range 0...N.
+
+   Furthermore, we want all entries from rank 0 to come first, then rank 1, etc.
+   This is so we can just get entries from rank 0 and recv them directly into the
+   remote x vector at the correct offset.
+
+   To do this, relabel the matrix in the following way:
+   Get a list of unique required global ids, and then sort them.
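A worked example of the relabeling (hypothetical numbers): suppose this rank's rows reference remote global columns {7, 2, 7, 9}:

    std::vector<int> globals{7, 2, 7, 9};
    std::sort(globals.begin(), globals.end());  // 2 7 7 9
    globals.erase(std::unique(globals.begin(), globals.end()), globals.end()); // 2 7 9
    // locals: 2 -> 0, 7 -> 1, 9 -> 2
    // remote entries with global column 7 are relabeled to column 1,
    // and rx_(1) is exactly where x[7] will be received

Since globals is sorted, all columns owned by the lowest-ranked owner occupy a contiguous prefix, which is what lets recv_x_async land each neighbor's contribution at a single displacement.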
+   The first will be local 0, then local 1, etc.
+*/
+SplitCooMat split_local_remote(const CsrMat<Where::host> &m, MPI_Comm comm) {
+  int rank = 0;
+  int size = 1;
+  MPI_Comm_rank(comm, &rank);
+  MPI_Comm_size(comm, &size);
+
+  // which rows of x are local
+  Range localRange = get_partition(m.num_cols(), rank, size);
+  int loff = localRange.lb;
+
+  // build two matrices: local gets local non-zeros, remote gets remote non-zeros
+  CooMat local(m.num_rows(), m.num_cols());
+  CooMat remote(m.num_rows(), m.num_cols());
+
+  std::vector<int> globals; // get global col for local col
+  for (int r = 0; r < m.num_rows(); ++r) {
+    for (int ci = m.row_ptr(r); ci < m.row_ptr(r + 1); ++ci) {
+      int c = m.col_ind(ci);
+      float v = m.val(ci);
+      if (c >= localRange.lb && c < localRange.ub) {
+        int lc = c - loff;
+        local.push_back(r, lc, v);
+      } else {
+        // keep the global column for now; it will be renumbered later
+        globals.push_back(c);
+        remote.push_back(r, c, v);
+      }
+    }
+  }
+
+  // sort required global columns.
+  // this will ensure the lowest owning rank comes first, and all are contiguous
+  std::sort(globals.begin(), globals.end());
+  auto it = std::unique(globals.begin(), globals.end());
+  globals.resize(it - globals.begin());
+
+  std::map<int, int> locals; // get local col for global column
+  for (size_t lc = 0; lc < globals.size(); ++lc) {
+    int gc = globals[lc];
+    locals[gc] = lc;
+  }
+
+  // relabel remote columns
+  for (CooMat::Entry &e : remote) {
+    e.j = locals[e.j];
+  }
+
+  return SplitCooMat{
+    .loff = loff,
+    .local = local,
+    .remote = remote,
+    .locals = locals,
+    .globals = globals
+  };
+}
\ No newline at end of file
diff --git a/where.hpp b/where.hpp
new file mode 100644
index 0000000..7d56885
--- /dev/null
+++ b/where.hpp
@@ -0,0 +1,6 @@
+#pragma once
+
+enum class Where {
+  host,
+  device
+};
\ No newline at end of file
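The Where tag is the compile-time switch that selects the container specializations above; a minimal sketch of the dispatch:

    std::vector<float> h(16, 1.0f);
    Array<Where::host, float> pinned(16, 1.0f); // cudaHostAlloc-backed
    Array<Where::device, float> dev(h);         // cudaMalloc-backed, with an H2D copy

Both expose the same ArrayView-based interface, which is why CsrMat<Where::device> can be built memberwise from CsrMat<Where::host> with three host-to-device copies.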