initial local+remote spmv

This commit is contained in:
Carl William Pearson
2021-06-11 13:09:50 -06:00
parent aae7176823
commit fb88da915d
10 changed files with 1151 additions and 0 deletions

CMakeLists.txt

@@ -97,3 +97,12 @@ if (MPI_FOUND)
set_cxx_standard(main)
endif()
if (MPI_FOUND)
  add_executable(overlap overlap.cu)
  target_include_directories(overlap SYSTEM PRIVATE ${MPI_CXX_INCLUDE_DIRS})
  target_link_libraries(overlap ${MPI_CXX_LIBRARIES})
  target_link_libraries(overlap CUDA::nvToolsExt)
  set_cxx_options(overlap)
  set_cxx_standard(overlap)
endif()

algorithm.hpp Normal file

@@ -0,0 +1,9 @@
#pragma once
#include <cstddef>
// shift the range [first, last) left by n positions:
// *(first - n) receives *first, and so on.
// The destination range [first - n, last - n) must be valid.
template<typename ForwardIt>
void shift_left(ForwardIt first, ForwardIt last, size_t n) {
    while (first != last) {
        *(first - n) = *first;
        ++first;
    }
}
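/* Hypothetical usage sketch (not part of this commit): erase the first two
   elements of a vector in place.
     std::vector<int> v{1, 2, 3, 4, 5};
     shift_left(v.begin() + 2, v.end(), 2); // v is now {3, 4, 5, 4, 5}
     v.resize(v.size() - 2);                // v == {3, 4, 5}
*/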

array.hpp Normal file

@@ -0,0 +1,178 @@
#pragma once
#include <vector>
#include "cuda_runtime.hpp"
#include "where.hpp"
template <Where where, typename T>
class Array;
// A non-owning view of data
template <typename T>
struct ArrayView
{
T *data_;
int64_t size_;
public:
ArrayView() : data_(nullptr), size_(0){}
ArrayView(const ArrayView &other) = default;
ArrayView(ArrayView &&other) = default;
ArrayView &operator=(const ArrayView &rhs) = default;
__host__ __device__ int64_t size() const { return size_; }
__host__ __device__ const T &operator()(int64_t i) const {
#ifdef VIEW_CHECK_BOUNDS
    if (i < 0) {
        printf("ERR: i < 0: %ld\n", (long)i);
    }
    if (i >= size_) {
        printf("ERR: i >= size_: %ld >= %ld\n", (long)i, (long)size_);
    }
#endif
    return data_[i];
}
__host__ __device__ T &operator()(int64_t i) {
return data_[i];
}
const T* data() const {
return data_;
}
T* data() {
return data_;
}
};
/* device array
*/
template<typename T> class Array<Where::device, T>
{
public:
// array owns the data in this view
ArrayView<T> view_;
public:
Array() = default;
Array(const size_t n) {
resize(n);
}
Array(const Array &other) = delete;
Array(Array &&other) : view_(other.view_) {
// view is non-owning, so have to clear other
other.view_.data_ = nullptr;
other.view_.size_ = 0;
}
Array &operator=(Array &&other) {
view_ = std::move(other.view_);
// view is non-owning, so have to clear other
other.view_.data_ = nullptr;
other.view_.size_ = 0;
return *this;
}
Array(const std::vector<T> &v) {
set_from(v);
}
~Array() {
CUDA_RUNTIME(cudaFree(view_.data_));
view_.data_ = nullptr;
view_.size_ = 0;
}
int64_t size() const {
return view_.size(); }
ArrayView<T> view() const {
return view_; // copy of internal view
}
operator std::vector<T>() const {
std::vector<T> v(size());
CUDA_RUNTIME(cudaMemcpy(v.data(), view_.data_, size() * sizeof(T), cudaMemcpyDeviceToHost));
return v;
}
void set_from(const std::vector<T> &rhs, cudaStream_t stream = 0) {
resize(rhs.size());
CUDA_RUNTIME(cudaMemcpyAsync(view_.data_, rhs.data(), view_.size_ * sizeof(T), cudaMemcpyHostToDevice, stream));
}
void set_from(const Array<Where::host, T> &rhs, cudaStream_t stream = 0) {
resize(rhs.size());
CUDA_RUNTIME(cudaMemcpyAsync(view_.data_, rhs.data(), view_.size_ * sizeof(T), cudaMemcpyHostToDevice, stream));
}
// any change destroys all data
void resize(size_t n) {
if (size() != n) {
view_.size_ = n;
CUDA_RUNTIME(cudaFree(view_.data_));
CUDA_RUNTIME(cudaMalloc(&view_.data_, view_.size_ * sizeof(T)));
}
}
const T* data() const {
return view_.data();
}
T* data() {
return view_.data();
}
};
/* host array
*/
template<typename T> class Array<Where::host, T>
{
public:
// array owns the data in this view
ArrayView<T> view_;
public:
Array() = default;
Array(const size_t n, const T &val) {
resize(n);
for (size_t i = 0; i < n; ++i) {
view_(i) = val;
}
}
Array(const Array &other) = delete;
Array(Array &&other) : view_(other.view_) {
// view is non-owning, so have to clear other
other.view_.data_ = nullptr;
other.view_.size_ = 0;
}
~Array() {
CUDA_RUNTIME(cudaFreeHost(view_.data_));
view_.data_ = nullptr;
view_.size_ = 0;
}
int64_t size() const {
return view_.size(); }
ArrayView<T> view() const {
return view_; // copy of internal view
}
// any change destroys all data
void resize(size_t n) {
if (size() != n) {
view_.size_ = n;
CUDA_RUNTIME(cudaFreeHost(view_.data_));
CUDA_RUNTIME(cudaHostAlloc(&view_.data_, view_.size_ * sizeof(T), cudaHostAllocDefault));
}
}
const T* data() const {
return view_.data_;
}
T* data() {
return view_.data_;
}
};
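/* Hypothetical round-trip sketch (not part of this commit):
     std::vector<float> h{1, 2, 3};
     Array<Where::device, float> d(h); // host -> device copy
     std::vector<float> back = d;      // device -> host copy
   back == h afterwards; both copies are ordered on the legacy default stream.
*/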

coo_mat.hpp Normal file

@@ -0,0 +1,58 @@
#pragma once
#include <algorithm>
#include <cstdint>
#include <vector>
class CooMat {
public:
struct Entry {
int i;
int j;
float e;
Entry(int _i, int _j, float _e) : i(_i), j(_j), e(_e) {}
static bool by_ij(const Entry &a, const Entry &b) {
if (a.i < b.i) {
return true;
} else if (a.i > b.i) {
return false;
} else {
return a.j < b.j;
}
}
static bool same_ij(const Entry &a, const Entry &b) {
return a.i == b.i && a.j == b.j;
}
};
private:
// sorted during construction
std::vector<Entry> data_;
int64_t numRows_;
int64_t numCols_;
public:
CooMat(int m, int n) : numRows_(m), numCols_(n) {}
const std::vector<Entry> &entries() const {return data_;}
void push_back(int i, int j, float e) {
data_.push_back(Entry(i,j,e));
}
void sort() {
std::sort(data_.begin(), data_.end(), Entry::by_ij);
}
void remove_duplicates() {
    std::sort(data_.begin(), data_.end(), Entry::by_ij);
    // std::unique only reorders; erase the leftover tail to actually drop duplicates
    data_.erase(std::unique(data_.begin(), data_.end(), Entry::same_ij), data_.end());
}
int64_t num_rows() const {return numRows_;}
int64_t num_cols() const {return numCols_;}
int64_t nnz() const {return data_.size();}
std::vector<Entry>::iterator begin() {return data_.begin();}
std::vector<Entry>::iterator end() {return data_.end();}
};
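/* Hypothetical sketch (not part of this commit):
     CooMat coo(2, 2);
     coo.push_back(0, 1, 1.0f);
     coo.push_back(0, 1, 2.0f); // duplicate coordinate (0,1)
     coo.push_back(1, 0, 3.0f);
     coo.remove_duplicates();   // keeps one of the (0,1) entries; nnz() == 2
*/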

csr_mat.hpp Normal file

@@ -0,0 +1,196 @@
#pragma once
#include <iostream>
#include <stdexcept>
#include <vector>
#include <cuda_runtime.h>
#include "algorithm.hpp"
#include "array.hpp"
#include "coo_mat.hpp"
template <Where where>
class CsrMat {
public:
CsrMat();
int64_t nnz() const;
int64_t num_rows() const;
};
template<> class CsrMat<Where::host>;
template<> class CsrMat<Where::device>;
/* host sparse matrix */
template<> class CsrMat<Where::host>
{
friend class CsrMat<Where::device>; // device can see inside
std::vector<int> rowPtr_;
std::vector<int> colInd_;
std::vector<float> val_;
int64_t numCols_;
public:
CsrMat() = default;
CsrMat(int numRows, int numCols, int nnz) : rowPtr_(numRows+1), colInd_(nnz), val_(nnz), numCols_(numCols) {}
CsrMat(const CooMat &coo) : numCols_(coo.num_cols()) {
for (auto &e : coo.entries()) {
while (rowPtr_.size() <= e.i) {
rowPtr_.push_back(colInd_.size());
}
colInd_.push_back(e.j);
val_.push_back(e.e);
}
while (rowPtr_.size() < coo.num_rows()+1){
rowPtr_.push_back(colInd_.size());
}
}
int64_t num_rows() const {
if (rowPtr_.size() <= 1) {
return 0;
} else {
return rowPtr_.size() - 1;
}
}
int64_t num_cols() const {
return numCols_;
}
int64_t nnz() const {
if (colInd_.size() != val_.size()) {
throw std::logic_error("bad invariant");
}
return colInd_.size();
}
const int &row_ptr(int64_t i) const {
return rowPtr_[i];
}
const int &col_ind(int64_t i) const {
return colInd_[i];
}
const float &val(int64_t i) const {
return val_[i];
}
const int *row_ptr() const {return rowPtr_.data(); }
int *row_ptr() {return rowPtr_.data(); }
const int *col_ind() const {return colInd_.data(); }
int *col_ind() {return colInd_.data(); }
const float *val() const {return val_.data(); }
float *val() {return val_.data(); }
/* keep rows [rowStart, rowEnd)
*/
void retain_rows(int rowStart, int rowEnd) {
if (0 == rowEnd) {
throw std::logic_error("unimplemented");
}
// erase rows after
// dont want to keep rowEnd, so rowEnd points to end of rowEnd-1
std::cerr << "rowPtr_ = rowPtr[:" << rowEnd+1 << "]\n";
rowPtr_.resize(rowEnd+1);
std::cerr << "resize entries to " << rowPtr_.back() << "\n";
colInd_.resize(rowPtr_.back());
val_.resize(rowPtr_.back());
// erase early row pointers
std::cerr << "rowPtr <<= " << rowStart << "\n";
shift_left(rowPtr_.begin()+rowStart, rowPtr_.end(), rowStart);
std::cerr << "resize rowPtr to " << rowEnd - rowStart+1 << "\n";
rowPtr_.resize(rowEnd-rowStart+1);
const int off = rowPtr_[0];
// erase entries for first rows
std::cerr << "entries <<= " << off << "\n";
shift_left(colInd_.begin()+off, colInd_.end(), off);
shift_left(val_.begin()+off, val_.end(), off);
// adjust row pointer offset
std::cerr << "subtract rowPtrs by " << off << "\n";
for (auto &e : rowPtr_) {
e -= off;
}
// resize entries
std::cerr << "resize entries to " << rowPtr_.back() << "\n";
colInd_.resize(rowPtr_.back());
val_.resize(rowPtr_.back());
}
};
/* device sparse matrix
*/
template<> class CsrMat<Where::device>
{
Array<Where::device, int> rowPtr_;
Array<Where::device, int> colInd_;
Array<Where::device, float> val_;
int64_t numCols_;
public:
struct View {
ArrayView<int> rowPtr_;
ArrayView<int> colInd_;
ArrayView<float> val_;
__device__ int num_rows() const {
if (rowPtr_.size() > 0) {
return rowPtr_.size() - 1;
} else {
return 0;
}
}
__device__ const int &row_ptr(int64_t i) const {
return rowPtr_(i);
}
__device__ const int &col_ind(int64_t i) const {
return colInd_(i);
}
__device__ const float &val(int64_t i) const {
return val_(i);
}
};
CsrMat() = default;
CsrMat(const CsrMat &other) = delete;
// movable (member Arrays null out the source), so a device matrix
// can be default-constructed and assigned later
CsrMat(CsrMat &&other) = default;
CsrMat &operator=(CsrMat &&rhs) = default;
// create device matrix from host
CsrMat(const CsrMat<Where::host> &m) :
rowPtr_(m.rowPtr_), colInd_(m.colInd_), val_(m.val_), numCols_(m.numCols_) {
if (colInd_.size() != val_.size()) {
throw std::logic_error("bad invariant");
}
}
~CsrMat() {
}
int64_t num_rows() const {
if (rowPtr_.size() <= 1) {
return 0;
} else {
return rowPtr_.size() - 1;
}
}
int64_t num_cols() const {
return numCols_;
}
int64_t nnz() const {
return colInd_.size();
}
View view() const {
View v;
v.rowPtr_ = rowPtr_.view();
v.colInd_ = colInd_.view();
v.val_ = val_.view();
return v;
}
};
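/* Hypothetical sketch (not part of this commit): CSR layout of the 2x3 matrix
     [1 0 2]
     [0 3 0]
   is rowPtr = {0, 2, 3}, colInd = {0, 2, 1}, val = {1, 2, 3};
   row r's entries occupy indices [rowPtr[r], rowPtr[r+1]).
*/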

overlap.cu Normal file

@@ -0,0 +1,213 @@
#include <mpi.h>
#include <nvToolsExt.h>
// #include <cuda_profiler_api.h>
#include <vector>
#include <string>
#include <stdexcept>
#include <algorithm>
#include <iostream>
#include <map>
#include "cuda_runtime.hpp"
#include "csr_mat.hpp"
#include "row_part_spmv.cuh"
#define STRINGIFY(x) #x
#define TOSTRING(x) STRINGIFY(x)
#define AT __FILE__ ":" TOSTRING(__LINE__)
//#define VIEW_CHECK_BOUNDS
// mxn random matrix with nnz
CsrMat<Where::host> random_matrix(const int64_t m, const int64_t n, const int64_t nnz) {
if (m * n < nnz) {
throw std::logic_error(AT);
}
CooMat coo(m,n);
while(coo.nnz() < nnz) {
int64_t toPush = nnz - coo.nnz();
std::cerr << "adding " << toPush << " non-zeros\n";
for (int64_t _ = 0; _ < toPush; ++_) {
int r = rand() % m;
int c = rand() % n;
float e = 1.0;
coo.push_back(r, c, e);
}
std::cerr << "removing duplicate non-zeros\n";
coo.remove_duplicates();
}
coo.sort();
std::cerr << "coo: " << coo.num_rows() << "x" << coo.num_cols() << "\n";
CsrMat<Where::host> csr(coo);
std::cerr << "csr: " << csr.num_rows() << "x" << csr.num_cols() << " w/ " << csr.nnz() << "\n";
return csr;
}
// nxn band matrix with bandwidth bw and nnz non-zeros
CsrMat<Where::host> random_band_matrix(const int64_t n, const int64_t bw, const int64_t nnz) {
CooMat coo(n,n);
while(coo.nnz() < nnz) {
int64_t toPush = nnz - coo.nnz();
std::cerr << "adding " << toPush << " non-zeros\n";
for (int64_t _ = 0; _ < toPush; ++_) {
int r = rand() % n; // random row
// column in the band
int lb = r - bw;
int ub = r + bw + 1;
int64_t c = rand() % (ub - lb) + lb;
if (c < 0 || c >= n) {
    continue; // don't over-weight the first or last columns
}
float e = 1.0;
coo.push_back(r, c, e);
}
std::cerr << "removing duplicate non-zeros\n";
coo.remove_duplicates();
}
coo.sort();
std::cerr << "coo: " << coo.num_rows() << "x" << coo.num_cols() << "\n";
CsrMat<Where::host> csr(coo);
std::cerr << "csr: " << csr.num_rows() << "x" << csr.num_cols() << " w/ " << csr.nnz() << "\n";
return csr;
}
std::vector<float> random_vector(const int64_t n) {
return std::vector<float>(n, 1.0);
}
Array<Where::host, float> random_array(const int64_t n) {
return Array<Where::host, float>(n, 1.0);
}
#if 0
int send_x(int dst, int src, std::vector<float> &&v, MPI_Comm comm) {
MPI_Send(v.data(), v.size(), MPI_FLOAT, dst, Tag::x, comm);
return 0;
}
#endif
/* recv some amount of data, and put it in the right place
in a full x
*/
std::vector<float> receive_x(const int n, const int dst, int src, MPI_Comm comm) {
int rank = 0;
int size = 1;
MPI_Comm_rank(comm, &rank);
MPI_Comm_size(comm, &size);
// which rows of x are local
Range local = get_partition(n, rank, size);
// probe for size
MPI_Status stat;
MPI_Probe(src, Tag::x, comm, &stat);
int sz;
MPI_Get_count(&stat, MPI_FLOAT, &sz);
if (sz != local.ub-local.lb) {
throw std::logic_error(AT);
}
std::cerr << "recv " << sz << " x entries into offset " << local.lb << "\n";
std::vector<float> x(n);
MPI_Recv(x.data() + local.lb, sz, MPI_FLOAT, src, Tag::x, comm, MPI_STATUS_IGNORE);
return x;
}
// z += a
__global__ void vector_add(ArrayView<float> z, const ArrayView<float> a) {
for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < z.size(); i += blockDim.x * gridDim.x) {
z(i) += a(i);
}
}
int main (int argc, char **argv) {
MPI_Init(&argc, &argv);
int rank = 0;
int size = 1;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
std::cerr << "get a gpu...\n";
CUDA_RUNTIME(cudaSetDevice(rank % 4));
CUDA_RUNTIME(cudaFree(0));
std::cerr << "barrier...\n";
MPI_Barrier(MPI_COMM_WORLD);
// int64_t m = 150000;
// int64_t n = 150000;
// int64_t nnz = 11000000;
// or
int64_t m = 150000;
int64_t n = m;
int64_t bw = m/size; // ~50% local vs remote non-zeros for most ranks
int64_t nnz = 11000000;
CsrMat<Where::host> A; // "local A"
// generate and distribute A
if (0 == rank) {
std::cerr << "generate matrix\n";
A = random_band_matrix(m, bw, nnz);
}
RowPartSpmv spmv(A, 0, MPI_COMM_WORLD);
std::cerr << "A: " << A.num_rows() << "x" << A.num_cols() << " w/ " << A.nnz() << "\n";
std::cerr << "local A: " << spmv.lA().num_rows() << "x" << spmv.lA().num_cols() << " w/ " << spmv.lA().nnz() << "\n";
std::cerr << "remote A: " << spmv.rA().num_rows() << "x" << spmv.rA().num_cols() << " w/ " << spmv.rA().nnz() << "\n";
int loPrio, hiPrio;
CUDA_RUNTIME(cudaDeviceGetStreamPriorityRange (&loPrio, &hiPrio));
cudaStream_t loS, hiS; // low- and high-priority streams
CUDA_RUNTIME(cudaStreamCreateWithPriority(&loS, cudaStreamNonBlocking, loPrio));
CUDA_RUNTIME(cudaStreamCreateWithPriority(&hiS, cudaStreamNonBlocking, hiPrio));
cudaEvent_t event;
CUDA_RUNTIME(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
const int nIters = 30;
std::vector<double> times(nIters);
nvtxRangePush("overlap");
for (int i = 0; i < nIters; ++i) {
MPI_Barrier(MPI_COMM_WORLD);
double start = MPI_Wtime();
spmv.pack_x_async(); // gather the needed local x entries into the send buffer
spmv.pack_x_wait();
spmv.send_x_async();
spmv.launch_local();
spmv.recv_x_async();
spmv.send_x_wait();
spmv.recv_x_wait();
spmv.launch_remote();
spmv.finish();
times[i] = MPI_Wtime() - start;
}
nvtxRangePop(); // overlap
MPI_Allreduce(MPI_IN_PLACE, times.data(), times.size(), MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
if (0 == rank) {
std::sort(times.begin(), times.end());
std::cerr << times[times.size() / 2] << "\n"; // median of per-iteration max-over-ranks times
}
MPI_Finalize();
return 0;
}

partition.hpp Normal file

@@ -0,0 +1,59 @@
#pragma once
#include <iostream>
#include <vector>
#include "csr_mat.hpp"
struct Range {
int lb;
int ub;
int extent() const { return ub-lb; }
};
/* get the ith part of splitting `domain` into `n` pieces;
   if not evenly divisible, the remainder is distributed one each to the lower-numbered pieces
*/
Range get_partition(const int domain, const int i, const int n) {
int div = domain / n;
int rem = domain % n;
int lb, ub;
if (i < rem) {
lb = i * (div+1);
ub = lb + (div+1);
} else {
lb = rem * (div+1) + (i-rem) * div;
ub = lb + div;
}
return Range{.lb=lb, .ub=ub};
}
// who owns item `i` from `domain` split into `n`
int get_owner(int domain, int i, const int n) {
int div = domain / n;
int rem = domain % n;
// i is in the first rem pieces, which have div+1 entries each
if (i < (div + 1) * rem) {
return i / (div + 1);
} else {
i -= (div+1) * rem;
domain -= (div+1) * rem;
return rem + i / div;
}
}
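/* Hypothetical worked example (not part of this commit): domain=10, n=3
   gives div=3, rem=1, so the pieces are [0,4), [4,7), [7,10):
     get_partition(10, 0, 3) -> {0, 4}
     get_partition(10, 2, 3) -> {7, 10}
     get_owner(10, 5, 3) -> 1   // 5 falls in [4,7)
*/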
std::vector<CsrMat<Where::host>> part_by_rows(const CsrMat<Where::host> &m, const int parts) {
std::vector<CsrMat<Where::host>> mats;
for (int p = 0; p < parts; ++p) {
Range range = get_partition(m.num_rows(), p, parts);
std::cerr << "matrix part " << p << " has " << range.ub-range.lb << " rows\n";
CsrMat<Where::host> part(m);
part.retain_rows(range.lb, range.ub);
mats.push_back(part);
}
return mats;
}

row_part_spmv.cuh Normal file

@@ -0,0 +1,328 @@
#pragma once
#include <cassert>
#include <map>
#include <vector>
#include <mpi.h>
#include "csr_mat.hpp"
#include "partition.hpp"
#include "split_coo_mat.hpp"
enum class ProductConfig {
MODIFY, // b +=
SET // b =
};
/* Ax=b
*/
__global__ void spmv(ArrayView<float> b,
const CsrMat<Where::device>::View A,
const ArrayView<float> x,
const ProductConfig pc
) {
// one thread per row
for (int r = blockDim.x * blockIdx.x + threadIdx.x; r < A.num_rows(); r += blockDim.x * gridDim.x) {
float acc = 0;
for (int ci = A.row_ptr(r); ci < A.row_ptr(r+1); ++ci) {
int c = A.col_ind(ci);
acc += A.val(ci) * x(c);
}
if (ProductConfig::SET == pc) {
b(r) = acc;
} else {
b(r) += acc;
}
}
}
enum Tag : int {
row_ptr,
col_ind,
val,
x,
num_cols
};
int send_matrix(int dst, int src, CsrMat<Where::host> &&m, MPI_Comm comm) {
MPI_Request reqs[4];
int numCols = m.num_cols();
MPI_Isend(&numCols, 1, MPI_INT, dst, Tag::num_cols, comm, &reqs[0]);
MPI_Isend(m.row_ptr(), m.num_rows()+1, MPI_INT, dst, Tag::row_ptr, comm, &reqs[1]);
MPI_Isend(m.col_ind(), m.nnz(), MPI_INT, dst, Tag::col_ind, comm, &reqs[2]);
MPI_Isend(m.val(), m.nnz(), MPI_FLOAT, dst, Tag::val, comm, &reqs[3]);
MPI_Waitall(4, reqs, MPI_STATUSES_IGNORE);
return 0;
}
CsrMat<Where::host> receive_matrix(int dst, int src, MPI_Comm comm) {
int numCols;
MPI_Recv(&numCols, 1, MPI_INT, src, Tag::num_cols, comm, MPI_STATUS_IGNORE);
// probe for number of rows
MPI_Status stat;
MPI_Probe(src, Tag::row_ptr, comm, &stat);
int numRows;
MPI_Get_count(&stat, MPI_INT, &numRows);
if (numRows > 0) {
--numRows;
}
// probe for nnz
MPI_Probe(src, Tag::col_ind, comm, &stat);
int nnz;
MPI_Get_count(&stat, MPI_INT, &nnz);
std::cerr << "recv " << numRows << "x" << numCols << " w/ " << nnz << "\n";
CsrMat<Where::host> csr(numRows, numCols, nnz);
// receive actual data into matrix
MPI_Recv(csr.row_ptr(), numRows+1, MPI_INT, src, Tag::row_ptr, comm, MPI_STATUS_IGNORE);
MPI_Recv(csr.col_ind(), nnz, MPI_INT, src, Tag::col_ind, comm, MPI_STATUS_IGNORE);
MPI_Recv(csr.val(), nnz, MPI_FLOAT, src, Tag::val, comm, MPI_STATUS_IGNORE);
return csr;
}
// out[i] = in[idx[i]]
__global__ void scatter(ArrayView<float> out,
                        const ArrayView<float> in,
                        const ArrayView<int> idx) {
    // grid-stride loop: one thread per packed entry
    for (int64_t i = blockDim.x * blockIdx.x + threadIdx.x; i < out.size(); i += blockDim.x * gridDim.x) {
        out(i) = in(idx(i));
    }
}
/* Ax=y , partitioned evenly by rows of A
always have to pack on the send side, since not all local x values will be needed
so may as well pack in a way that recver doesn't have to unpack
serializing local and remote means you don't have to worry about
concurrent adds to the product vector
if kernels are sufficiently large, no real opportunity for these to overlap anyway
if they're small, communication time will be longer anway
*/
class RowPartSpmv {
private:
MPI_Comm comm_;
int loff_; // first row in global index
CsrMat<Where::device> la_; // local A
CsrMat<Where::device> ra_; // remote A
Array<Where::device, float> lx_; // local x
Array<Where::device, float> rx_; // remote x
Array<Where::device, float> ly_;
// info for sending x
struct SendParam {
int dst; // destination rank
int displ;
int count;
MPI_Request req;
};
std::vector<SendParam> sendParams_; // parameters for each rank
Array<Where::device, int> xSendIdx_; // which entry of lx_ will be in each xSendBuf_;
Array<Where::device, float> xSendBuf_; // send local x entries to other ranks
std::vector<int> gCols_; // global index from local
struct RecvParam {
int src; // source rank
int displ; // displacement in
int count; // number of entries
MPI_Request req;
};
std::vector<RecvParam> recvParams_;
cudaStream_t kernelStream_;
cudaStream_t packStream_;
public:
const CsrMat<Where::device> &lA() const {return la_;}
const CsrMat<Where::device> &rA() const {return ra_;}
void launch_local() {
dim3 dimGrid(100);
dim3 dimBlock(128);
spmv<<<dimGrid, dimBlock, 0, kernelStream_>>>(ly_.view(), la_.view(), lx_.view(), ProductConfig::SET);
CUDA_RUNTIME(cudaGetLastError());
}
void launch_remote() {
dim3 dimGrid(100);
dim3 dimBlock(128);
spmv<<<dimGrid, dimBlock, 0, kernelStream_>>>(ly_.view(), ra_.view(), rx_.view(), ProductConfig::MODIFY);
CUDA_RUNTIME(cudaGetLastError());
}
void pack_x_async() {
scatter<<<100,128, 0, packStream_>>>(xSendBuf_.view(), lx_.view(), xSendIdx_.view());
}
void pack_x_wait() {
CUDA_RUNTIME(cudaStreamSynchronize(packStream_));
}
void send_x_async() {
// send to neighbors who want it
for (auto &p : sendParams_) {
int tag = 0;
MPI_Isend(xSendBuf_.data() + p.displ, p.count, MPI_FLOAT, p.dst, tag, comm_, &p.req);
}
}
void send_x_wait() {
for (auto &p : sendParams_) {
MPI_Wait(&p.req, MPI_STATUS_IGNORE);
}
}
void recv_x_async() {
for (auto &p : recvParams_) {
int tag = 0;
MPI_Irecv(rx_.data() + p.displ, p.count, MPI_FLOAT, p.src, tag, comm_, &p.req);
}
}
void recv_x_wait() {
for (auto &p : recvParams_) {
MPI_Wait(&p.req, MPI_STATUS_IGNORE);
}
}
void finish() {
CUDA_RUNTIME(cudaStreamSynchronize(kernelStream_));
}
void launch_local_spmv() {}
void launch_remote_spmv() {}
/* create from a matrix at root
*/
RowPartSpmv(
const CsrMat<Where::host> &wholeA,
const int root,
MPI_Comm comm
) {
int rank, size;
MPI_Comm_rank(comm, &rank);
MPI_Comm_size(comm, &size);
comm_ = comm;
CsrMat<Where::host> a;
if (root == rank) {
std::cerr << "partition matrix\n";
std::vector<CsrMat<Where::host>> as = part_by_rows(wholeA, size);
for (int dst = 0; dst < size; ++dst) {
    if (root != dst) {
        std::cerr << "send A to " << dst << "\n";
        send_matrix(dst, root, std::move(as[dst]), comm);
    }
}
a = as[rank];
} else {
std::cerr << "recv A at " << rank << "\n";
a = receive_matrix(rank, root, comm);
}
// split row part of a into local and global
SplitCooMat scm = split_local_remote(a, comm);
loff_ = scm.loff;
// create local part of x array
// undefined entries
Range xrange = get_partition(a.num_cols(), rank, size);
lx_ = Array<Where::device, float>(xrange.extent());
// create remote part of x array
// one entry per remote column
rx_ = Array<Where::device,float>(scm.globals.size());
// determine which columns needed from others
std::map<int, std::vector<int>> recvCols;
for (int c : scm.globals) {
auto src = get_owner(a.num_cols(), c, size);
assert(rank != src && "should not need my own columns in remote part");
recvCols[src].push_back(c);
}
// create receive parameters
int offset = 0;
for (auto it = recvCols.begin(); it != recvCols.end(); ++it) {
RecvParam param;
param.displ = offset;
param.src = it->first;
offset += it->second.size();
param.count = offset - param.displ;
recvParams_.push_back(param);
}
// tell others which cols I need (send 0 if nothing)
std::vector<MPI_Request> reqs(size);
for (int dest = 0; dest < size; ++dest) {
auto it = recvCols.find(dest);
if (it != recvCols.end()) {
MPI_Isend(it->second.data(), it->second.size(), MPI_INT, dest, 0, comm, &reqs[dest]);
} else {
int _;
MPI_Isend(&_, 0, MPI_INT, dest, 0, comm, &reqs[dest]);
}
}
// which global x rows other ranks need from me
std::map<int, std::vector<int>> sendCols;
for (int src = 0; src < size; ++src) {
MPI_Status status;
MPI_Probe(src, 0, comm, &status);
int count;
MPI_Get_count(&status, MPI_INT, &count);
if (count != 0) {
sendCols[src].resize(count);
MPI_Recv(sendCols[src].data(), count, MPI_INT, src, 0, comm, MPI_STATUS_IGNORE);
} else {
    int _;
    MPI_Recv(&_, 0, MPI_INT, src, 0, comm, MPI_STATUS_IGNORE);
}
}
// all column-request sends have now been matched; release their requests
MPI_Waitall(size, reqs.data(), MPI_STATUSES_IGNORE);
// create the offsets from lx that we will send out
// TODO: should be device array
std::vector<int> offsets;
for (auto it = sendCols.begin(); it != sendCols.end(); ++it) {
// TODO - adjust for changed local array columns
SendParam param;
param.displ = offsets.size();
param.dst = it->first;
for (int gc : it->second) {
int lc = gc - scm.loff;
offsets.push_back(lc);
}
param.count = offsets.size() - param.displ;
sendParams_.push_back(param);
}
// upload the gather indices and allocate the pack buffer
xSendIdx_.set_from(offsets);
xSendBuf_.resize(offsets.size());
// upload the local and remote matrices and allocate the local product vector
la_ = CsrMat<Where::device>(scm.local);
ra_ = CsrMat<Where::device>(scm.remote);
ly_ = Array<Where::device, float>(scm.local.num_rows());
CUDA_RUNTIME(cudaStreamCreate(&kernelStream_));
CUDA_RUNTIME(cudaStreamCreate(&packStream_));
assert(la_.nnz() > 0);
assert(ra_.nnz() > 0); // remote A
assert(lx_.size() > 0);
assert(rx_.size() > 0);
assert(ly_.size() > 0);
}
};

split_coo_mat.hpp Normal file

@@ -0,0 +1,95 @@
#pragma once
#include <algorithm>
#include <map>
#include <vector>
#include <mpi.h>
#include "coo_mat.hpp"
#include "partition.hpp"
/* the local matrix has columns renumbered to 0..N to index
   into the local dense x vector;
   the remote matrix has columns renumbered to index into the remote dense x vector
*/
struct SplitCooMat {
int loff; // global row index of the local matrix's row 0
CsrMat<Where::host> local; // local matrix
CsrMat<Where::host> remote; // remote matrix (with local column indices)
std::map<int, int> locals; // get local column from global column
std::vector<int> globals; // get global column for local column
};
/* Row partition of a matrix into a "local" and "remote" part
If locally there are rows i...j, then the local part also has columns i...j
The remote part will have all other columns
Each rank will also renumber the column indices in the remote part
This rank will recv the corresponding remote x vector entries, but
doesn't want to materialize the whole distributed x vector in memory
so the column indices must be packed into a contiguous range 0...N
Furthermore, we want all entries from rank 0 to come first, then rank 1, etc.
This is so we can just get entries from rank 0 and recv them directly into the
remote x vector at the correct offset
To do this, relabel the matrix in the following way:
Get a list of unique required global ids, and then sort them.
The first will be local 0, then local 1, etc
*/
SplitCooMat split_local_remote(const CsrMat<Where::host> &m, MPI_Comm comm) {
int rank = 0;
int size = 1;
MPI_Comm_rank(comm, &rank);
MPI_Comm_size(comm, &size);
// which entries of x are local (x is indexed by matrix column)
Range localRange = get_partition(m.num_cols(), rank, size);
int loff = localRange.lb;
// build two matrices, local gets local non-zeros, remote gets remote non-zeros
CooMat local(m.num_rows(), m.num_cols());
CooMat remote(m.num_rows(), m.num_cols());
std::vector<int> globals; // get global col for local col
for (int r = 0; r < m.num_rows(); ++r) {
for (int ci = m.row_ptr(r); ci < m.row_ptr(r+1); ++ci) {
int c = m.col_ind(ci);
float v = m.val(ci);
if (c >= localRange.lb && c < localRange.ub) {
int lc = c - loff;
local.push_back(r,lc,v);
} else {
// keep the global column for now, it will be renumbered later
globals.push_back(c);
remote.push_back(r, c, v);
}
}
}
// sort required global columns.
// this will ensure the lowest owning rank comes first, and all are contiguous
std::sort(globals.begin(), globals.end());
auto it = std::unique(globals.begin(), globals.end());
globals.resize(it - globals.begin());
std::map<int, int> locals; // get local col for global column
for (size_t lc = 0; lc < globals.size(); ++lc) {
int gc = globals[lc];
locals[gc] = lc;
}
// relabel remote columns
for (CooMat::Entry &e : remote) {
e.j = locals[e.j];
}
return SplitCooMat {
.loff=loff,
.local=local,
.remote=remote,
.locals=locals,
.globals=globals
};
}
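/* Hypothetical worked example (not part of this commit): if this rank's remote
   part references global columns {12, 7, 9, 7}, then after sort+unique
   globals == {7, 9, 12} and locals maps 7->0, 9->1, 12->2, so a remote entry
   with j == 9 is relabeled to j == 1.
*/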

where.hpp Normal file

@@ -0,0 +1,6 @@
#pragma once
enum class Where {
host,
device
};