initial local+remote spmv

2021-06-11 13:09:50 -06:00
parent aae7176823
commit fb88da915d
10 changed files with 1151 additions and 0 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -97,3 +97,12 @@ if (MPI_FOUND)
  set_cxx_standard(main)
 endif()
 # Build the `overlap` benchmark only when an MPI installation was found.
 if (MPI_FOUND)
  add_executable(overlap overlap.cu)
  # SYSTEM has to precede the scope keyword; written after PRIVATE it is
  # treated as an include directory literally named "SYSTEM".
  target_include_directories(overlap SYSTEM PRIVATE ${MPI_CXX_INCLUDE_DIRS})
  target_link_libraries(overlap ${MPI_CXX_LIBRARIES})
  target_link_libraries(overlap CUDA::nvToolsExt)
  set_cxx_options(overlap)
  set_cxx_standard(overlap)
 endif()
--- a/algorithm.hpp
+++ b/algorithm.hpp
@@ -0,0 +1,9 @@
 #pragma once
 // Copy each element of [first, last) to the slot `n` positions before it,
 // walking front to back. The destination range [first - n, last - n) must be
 // valid, and the iterator type has to support `first - n`.
 template<typename ForwardIt>
 void shift_left(ForwardIt first, ForwardIt last, size_t n) {
    if (first == last) {
        return; // empty source range: nothing to move
    }
    ForwardIt dst = first - n;
    for (ForwardIt src = first; src != last; ++src, ++dst) {
        *dst = *src;
    }
 }
--- a/array.hpp
+++ b/array.hpp
@@ -0,0 +1,178 @@
 #pragma once
 #include <vector>
 #include "cuda_runtime.hpp"
 #include "where.hpp"
 template <Where where, typename T>
 class Array;
 // A non-owning view of `size_` contiguous elements starting at `data_`.
 // Copying a view copies only the pointer and the length, never the elements.
 // Define VIEW_CHECK_BOUNDS to have both element accessors print a diagnostic
 // for an out-of-range index (the access is still performed afterwards).
 template <typename T>
 struct ArrayView
 {
    T *data_;      // first element, not owned by the view
    int64_t size_; // number of elements reachable through data_
    public:
    ArrayView() : data_(nullptr), size_(0){}
    ArrayView(const ArrayView &other) = default;
    ArrayView(ArrayView &&other) = default;
    ArrayView &operator=(const ArrayView &rhs) = default;
    __host__ __device__ int64_t size() const { return size_; }
    // read-only access to element i
    __host__ __device__ const T &operator()(int64_t i) const {
 #ifdef VIEW_CHECK_BOUNDS
        check_bounds(i);
 #endif
        return data_[i];
    }
    // mutable access to element i
    __host__ __device__ T &operator()(int64_t i) {
 #ifdef VIEW_CHECK_BOUNDS
        check_bounds(i);
 #endif
        return data_[i];
    }
    const T* data() const {
        return data_;
    }
    T* data() {
        return data_;
    }
 #ifdef VIEW_CHECK_BOUNDS
    private:
    // Report (but do not prevent) an out-of-range index. The values are cast
    // to long long so the format matches int64_t on every platform.
    __host__ __device__ void check_bounds(int64_t i) const {
        if (i < 0) {
            printf("ERR: i < 0: %lld\n", static_cast<long long>(i));
        }
        if (i >= size_) {
            printf("ERR: i >= size_: %lld >= %lld\n", static_cast<long long>(i), static_cast<long long>(size_));
        }
    }
 #endif
 };
 /* device array
    Owns a buffer obtained from cudaMalloc and exposes it through an ArrayView.
    Move-only: copying is deleted, moving hands the buffer over.
 */
 template<typename T> class Array<Where::device, T>
 {
 public:
    // array owns the data in this view
    ArrayView<T> view_;
 public:
    Array() = default;
    // allocate room for n elements; the contents are not initialized
    Array(const size_t n) {
        resize(n);
    }
    Array(const Array &other) = delete;
    Array(Array &&other) : view_(other.view_) {
        // view is non-owning, so have to clear other
        other.view_.data_ = nullptr;
        other.view_.size_ = 0;
    }
    // Take over other's buffer. Whatever this array held before is freed
    // first: the view is non-owning, so overwriting it would silently leak
    // the old allocation. Self-assignment is a no-op.
    Array &operator=(Array &&other) {
        if (this != &other) {
            release();
            view_ = other.view_;
            // view is non-owning, so have to clear other
            other.view_.data_ = nullptr;
            other.view_.size_ = 0;
        }
        return *this;
    }
    // allocate and fill from a host vector (see set_from)
    Array(const std::vector<T> &v) {
        set_from(v);
    }
    ~Array() {
        release();
    }
    int64_t size() const { 
        return view_.size(); }
    ArrayView<T> view() const {
        return view_; // copy of internal view
    }
    // copy the contents back to a host vector with a blocking cudaMemcpy
    operator std::vector<T>() const {
        std::vector<T> v(size());
        CUDA_RUNTIME(cudaMemcpy(v.data(), view_.data_, size() * sizeof(T), cudaMemcpyDeviceToHost));
        return v;
    }
    // Resize to rhs.size() and issue a host-to-device cudaMemcpyAsync on
    // `stream`.
    // NOTE(review): rhs presumably has to stay alive until that copy has
    // completed on `stream` - confirm at the call sites.
    void set_from(const std::vector<T> &rhs, cudaStream_t stream = 0) {
        resize(rhs.size());
        CUDA_RUNTIME(cudaMemcpyAsync(view_.data_, rhs.data(), view_.size_ * sizeof(T), cudaMemcpyHostToDevice, stream));
    }
    // same as above, but the source is a host Array
    void set_from(const Array<Where::host, T> &rhs, cudaStream_t stream = 0) {
        resize(rhs.size());
        CUDA_RUNTIME(cudaMemcpyAsync(view_.data_, rhs.data(), view_.size_ * sizeof(T), cudaMemcpyHostToDevice, stream));
    }
    // any change destroys all data
    void resize(size_t n) {
        if (static_cast<size_t>(size()) != n) {
            // forget the old buffer before allocating, so an error raised by
            // cudaMalloc cannot leave a dangling pointer for the destructor
            release();
            if (n > 0) {
                CUDA_RUNTIME(cudaMalloc(&view_.data_, n * sizeof(T)));
                view_.size_ = n;
            }
        }
    }
    const T* data() const {
        return view_.data();
    }
    T* data() {
        return view_.data();
    }
 private:
    // free the owned buffer (cudaFree accepts a null pointer) and reset the view
    void release() {
        CUDA_RUNTIME(cudaFree(view_.data_));
        view_.data_ = nullptr;
        view_.size_ = 0;
    }
 };
 /* host array
    Owns a buffer obtained from cudaHostAlloc and exposes it through an
    ArrayView. Move-only: copying is deleted, moving hands the buffer over.
 */
 template<typename T> class Array<Where::host, T>
 {
 public:
    // array owns the data in this view
    ArrayView<T> view_;
 public:
    Array() = default;
    // allocate n elements and set every one of them to val
    Array(const size_t n, const T &val) {
        resize(n);
        for (size_t i = 0; i < n; ++i) {
            view_(i) = val;
        }
    }
    Array(const Array &other) = delete;
    Array(Array &&other) : view_(other.view_) {
        // view is non-owning, so have to clear other
        other.view_.data_ = nullptr;
        other.view_.size_ = 0;
    }
    // Take over other's buffer, freeing the one held so far. Mirrors the
    // device array so both kinds can be re-assigned the same way.
    // Self-assignment is a no-op.
    Array &operator=(Array &&other) {
        if (this != &other) {
            release();
            view_ = other.view_;
            // view is non-owning, so have to clear other
            other.view_.data_ = nullptr;
            other.view_.size_ = 0;
        }
        return *this;
    }
    ~Array() {
        release();
    }
    int64_t size() const { 
        return view_.size(); }
    ArrayView<T> view() const {
        return view_; // copy of internal view
    }
    // any change destroys all data
    void resize(size_t n) {
        if (static_cast<size_t>(size()) != n) {
            // forget the old buffer before allocating, so an error raised by
            // cudaHostAlloc cannot leave a dangling pointer for the destructor
            release();
            if (n > 0) {
                CUDA_RUNTIME(cudaHostAlloc(&view_.data_, n * sizeof(T), cudaHostAllocDefault));
                view_.size_ = n;
            }
        }
    }
    const T* data() const {
        return view_.data_;
    }
    T* data() {
        return view_.data_;
    }
 private:
    // free the owned buffer, if any, and reset the view
    void release() {
        if (view_.data_ != nullptr) {
            CUDA_RUNTIME(cudaFreeHost(view_.data_));
        }
        view_.data_ = nullptr;
        view_.size_ = 0;
    }
 };
--- a/coo_mat.hpp
+++ b/coo_mat.hpp
@@ -0,0 +1,58 @@
 #pragma once
 // Sparse matrix in coordinate form: a list of (row, column, value) entries.
 // Entries are kept in insertion order until sort() or remove_duplicates()
 // is called.
 class CooMat {
 public:
    struct Entry {
        int i;   // row
        int j;   // column
        float e; // value
        // the value is taken as float so fractional values are not truncated
        Entry(int _i, int _j, float _e) : i(_i), j(_j), e(_e) {}
        // strict ordering by row, then by column
        static bool by_ij(const Entry &a, const Entry &b) {
            if (a.i != b.i) {
                return a.i < b.i;
            }
            return a.j < b.j;
        }
        // true when both entries address the same matrix position
        static bool same_ij(const Entry &a, const Entry &b) {
            return a.i == b.i && a.j == b.j;
        }
    };
 private:
    std::vector<Entry> data_; // ordered by (i, j) only after sort()/remove_duplicates()
    int64_t numRows_;
    int64_t numCols_;
 public:
    // empty m x n matrix
    CooMat(int m, int n) : numRows_(m), numCols_(n) {}
    const std::vector<Entry> &entries() const {return data_;}
    // append an entry; duplicates and out-of-range indices are not checked
    void push_back(int i, int j, float e) {
        data_.push_back(Entry(i,j,e));  
    }
    // order the entries by (row, column)
    void sort() {
        std::sort(data_.begin(), data_.end(), Entry::by_ij);
    }
    // Sort, then keep a single entry per (row, column) position.
    // std::unique only moves the survivors to the front, so the leftover
    // tail has to be erased for nnz() to reflect the removal.
    void remove_duplicates() {
        std::sort(data_.begin(), data_.end(), Entry::by_ij);
        data_.erase(std::unique(data_.begin(), data_.end(), Entry::same_ij), data_.end());
    }
    int64_t num_rows() const {return numRows_;}
    int64_t num_cols() const {return numCols_;}
    int64_t nnz() const {return data_.size();}
    std::vector<Entry>::iterator begin() {return data_.begin();}
    std::vector<Entry>::iterator end() {return data_.end();}
 };
--- a/csr_mat.hpp
+++ b/csr_mat.hpp
@@ -0,0 +1,196 @@
 #pragma once
 #include <cuda_runtime.h>
 #include "array.hpp"
 #include "coo_mat.hpp"
 #include "algorithm.hpp"
 // Compressed-sparse-row matrix, parameterized on where its storage lives.
 // This primary template only declares the members shared by both variants;
 // the usable classes are the explicit specializations for Where::host and
 // Where::device.
 template <Where where>
 class CsrMat {
 public:
    CsrMat();
    int64_t nnz() const;      // number of stored entries
    int64_t num_rows() const; // number of matrix rows
 };
 // declare both specializations up front so each can name the other
 template<> class CsrMat<Where::host>;
 template<> class CsrMat<Where::device>;
 /* host sparse matrix
    CSR storage held in std::vectors: rowPtr_ has one entry per row plus a
    final end marker, colInd_/val_ have one entry per non-zero.
 */
 template<> class CsrMat<Where::host>
 {
    friend class CsrMat<Where::device>; // device can see inside
    std::vector<int> rowPtr_;
    std::vector<int> colInd_;
    std::vector<float> val_;
    int64_t numCols_ = 0; // initialized so a default-constructed matrix reports 0 columns
 public:
    CsrMat() = default;
    // storage for a numRows x numCols matrix with nnz entries; the arrays are
    // value-initialized and meant to be filled through the pointer accessors
    CsrMat(int numRows, int numCols, int nnz) : rowPtr_(numRows+1), colInd_(nnz), val_(nnz), numCols_(numCols) {}
    // Convert from coordinate form. The entries must already be ordered by
    // row: every row pointer is emitted when the first entry of that row (or
    // of a later one) is seen.
    CsrMat(const CooMat &coo) : numCols_(coo.num_cols()) {
        for (auto &e : coo.entries()) {
            if (e.i < 0) {
                // a negative row would turn into a huge unsigned bound below
                throw std::logic_error("negative row index");
            }
            while (rowPtr_.size() <= static_cast<size_t>(e.i)) {
                rowPtr_.push_back(colInd_.size());
            }
            colInd_.push_back(e.j);
            val_.push_back(e.e);
        }
        // close the last non-empty row and add pointers for trailing empty rows
        while (static_cast<int64_t>(rowPtr_.size()) < coo.num_rows()+1){
            rowPtr_.push_back(colInd_.size());
        }
    }
    int64_t num_rows() const {
      if (rowPtr_.size() <= 1) {
        return 0; 
      } else { 
        return rowPtr_.size() - 1;
      }
    }
    int64_t num_cols() const {
        return numCols_;
      }
    // number of stored entries; throws std::logic_error if the column and
    // value arrays have drifted apart
    int64_t nnz() const {
        if (colInd_.size() != val_.size()) {
            throw std::logic_error("bad invariant");
        }
        return colInd_.size();
    }
    const int &row_ptr(int64_t i) const {
        return rowPtr_[i];
    }
    const int &col_ind(int64_t i) const {
        return colInd_[i];
    }
    const float &val(int64_t i) const {
        return val_[i];
    }
    const int *row_ptr() const {return rowPtr_.data(); }
    int *row_ptr() {return rowPtr_.data(); }
    const int *col_ind() const {return colInd_.data(); }
    int *col_ind() {return colInd_.data(); }
    const float *val() const {return val_.data(); }
    float *val() {return val_.data(); }
    /* keep rows [rowStart, rowEnd)
       Throws std::logic_error unless 0 <= rowStart <= rowEnd <= num_rows().
       An empty range (including rowEnd == 0) leaves a matrix with no rows.
    */
    void retain_rows(int rowStart, int rowEnd) {
        if (rowStart < 0 || rowEnd < rowStart || rowEnd > num_rows()) {
            // resizing rowPtr_ past its end would invent zero row pointers
            // and silently drop every entry
            throw std::logic_error("retain_rows: bad row range");
        }
        // erase rows after
        // dont want to keep rowEnd, so rowEnd points to end of rowEnd-1
        std::cerr << "rowPtr_ = rowPtr[:" << rowEnd+1 << "]\n";
        rowPtr_.resize(rowEnd+1);
        std::cerr << "resize entries to " << rowPtr_.back() << "\n";
        colInd_.resize(rowPtr_.back());
        val_.resize(rowPtr_.back());
        // erase early row pointers
        std::cerr << "rowPtr <<= " << rowStart << "\n";
        shift_left(rowPtr_.begin()+rowStart, rowPtr_.end(), rowStart);
        std::cerr << "resize rowPtr to " << rowEnd - rowStart+1 << "\n";
        rowPtr_.resize(rowEnd-rowStart+1);
        const int off = rowPtr_[0];
        // erase entries for first rows
        std::cerr << "entries <<= " << off << "\n";
        shift_left(colInd_.begin()+off, colInd_.end(), off);
        shift_left(val_.begin()+off, val_.end(), off);
        // adjust row pointer offset
        std::cerr << "subtract rowPtrs by " << off << "\n";
        for (auto &e : rowPtr_) {
            e -= off;
        }
        // resize entries
        std::cerr << "resize entries to " << rowPtr_.back() << "\n";
        colInd_.resize(rowPtr_.back());
        val_.resize(rowPtr_.back());
    }
 };
 /* device sparse matrix
    CSR storage held in device Arrays. Built from a host matrix; neither
    copyable nor movable. Kernels receive a View, which bundles non-owning
    views of the three arrays.
 */
 template<> class CsrMat<Where::device>
 {
    Array<Where::device, int> rowPtr_;
    Array<Where::device, int> colInd_;
    Array<Where::device, float> val_;
    int64_t numCols_ = 0; // initialized so a default-constructed matrix reports 0 columns
 public:
    // non-owning handle that can be passed to a kernel by value
    struct View {
        ArrayView<int> rowPtr_;
        ArrayView<int> colInd_;
        ArrayView<float> val_;
        // one row per row pointer, minus the end marker
        __device__ int num_rows() const {
            if (rowPtr_.size() > 0) {
                return rowPtr_.size() - 1;
            } else {
                return 0;
            }
        }
        __device__ const int &row_ptr(int64_t i) const {
            return rowPtr_(i);
        }
        __device__ const int &col_ind(int64_t i) const {
            return colInd_(i);
        }
        __device__ const float &val(int64_t i) const {
            return val_(i);
        }
    };
    CsrMat() = default;
    CsrMat(CsrMat &&other) = delete;
    CsrMat(const CsrMat &other) = delete;
    // create device matrix from host
    // throws std::logic_error if the column and value arrays differ in length
    CsrMat(const CsrMat<Where::host> &m) : 
        rowPtr_(m.rowPtr_), colInd_(m.colInd_), val_(m.val_), numCols_(m.numCols_) {
        if (colInd_.size() != val_.size()) {
            throw std::logic_error("bad invariant");
        }
    }
    ~CsrMat() {
    }
    int64_t num_rows() const {
        if (rowPtr_.size() <= 1) {
            return 0; 
          } else { 
            return rowPtr_.size() - 1;
          }
    }
    int64_t num_cols() const {
        return numCols_;
    }
    int64_t nnz() const {
        return colInd_.size();
    }
    // views of the three arrays; valid only while this matrix is alive
    View view() const {
        View v;
        v.rowPtr_ = rowPtr_.view();
        v.colInd_ = colInd_.view();
        v.val_ = val_.view();
        return v;
    }
 };
--- a/overlap.cu
+++ b/overlap.cu
@@ -0,0 +1,213 @@
 #include <mpi.h>
 #include <nvToolsExt.h>
 // #include <cuda_profiler_api.h>
 #include <vector>
 #include <string>
 #include <stdexcept>
 #include <algorithm>
 #include <iostream>
 #include <map>
 #include "cuda_runtime.hpp"
 #include "csr_mat.hpp"
 #include "row_part_spmv.cuh"
 #define STRINGIFY(x) #x
 #define TOSTRING(x) STRINGIFY(x)
 #define AT __FILE__ ":" TOSTRING(__LINE__) 
 //#define VIEW_CHECK_BOUNDS
 // Build an m x n host CSR matrix by drawing random positions until the
 // coordinate list reports `nnz` entries; every value is 1.
 // Throws std::logic_error when nnz exceeds m * n.
 CsrMat<Where::host> random_matrix(const int64_t m, const int64_t n, const int64_t nnz) {
    if (nnz > m * n) {
        throw std::logic_error(AT);
    }
    CooMat coo(m,n);
    for (int64_t have = coo.nnz(); have < nnz; have = coo.nnz()) {
        const int64_t missing = nnz - have;
        std::cerr << "adding " << missing << " non-zeros\n";
        int64_t left = missing;
        while (left > 0) {
            // draw the row before the column, one statement each, so the
            // order of the rand() calls is fixed
            const int row = rand() % m;
            const int col = rand() % n;
            const float one = 1.0;
            coo.push_back(row, col, one);
            --left;
        }
        std::cerr << "removing duplicate non-zeros\n";
        coo.remove_duplicates();
    }
    coo.sort();
    std::cerr << "coo: " << coo.num_rows() << "x" << coo.num_cols() << "\n";
    CsrMat<Where::host> result(coo);
    std::cerr << "csr: " << result.num_rows() << "x" << result.num_cols() << " w/ " << result.nnz() << "\n";
    return result;
 }
 // n x n random banded matrix: `nnz` entries, all of value 1, placed only
 // where |row - column| <= bw.
 // Throws std::logic_error when an argument is negative or when the band
 // holds fewer than nnz positions (the fill loop could never finish).
 CsrMat<Where::host> random_band_matrix(const int64_t n, const int64_t bw, const int64_t nnz) {
    if (n < 0 || bw < 0 || nnz < 0) {
        throw std::logic_error(AT);
    }
    // count the positions inside the band, clipping it at the matrix edges
    int64_t capacity = 0;
    for (int64_t r = 0; r < n; ++r) {
        const int64_t first = (r - bw < 0) ? 0 : r - bw;
        const int64_t last = (r + bw > n - 1) ? n - 1 : r + bw;
        capacity += last - first + 1;
    }
    if (capacity < nnz) {
        throw std::logic_error(AT);
    }
    CooMat coo(n,n);
    while(coo.nnz() < nnz) {
        int64_t toPush = nnz - coo.nnz();
        std::cerr << "adding " << toPush << " non-zeros\n";
        for (int64_t _ = 0; _ < toPush; ++_) {
            int r = rand() % n; // random row
            // column in the band
            const int64_t lb = r - bw;
            const int64_t ub = r + bw + 1;
            int64_t c = rand() % (ub - lb) + lb;
            // valid columns are 0..n-1; a draw outside is dropped rather than
            // clamped, so the first and last columns are not over-weighted
            if (c < 0 || c >= n) {
                continue;
            }
            float e = 1.0f;
            coo.push_back(r, c, e);
        }
        std::cerr << "removing duplicate non-zeros\n";
        coo.remove_duplicates();
    }
    coo.sort();
    std::cerr << "coo: " << coo.num_rows() << "x" << coo.num_cols() << "\n";
    CsrMat<Where::host> csr(coo);
    std::cerr << "csr: " << csr.num_rows() << "x" << csr.num_cols() << " w/ " << csr.nnz() << "\n";
    return csr;
 }
 // Host vector of n floats. Despite the name the contents are fixed: every
 // element is 1.
 std::vector<float> random_vector(const int64_t n) {
    std::vector<float> ones(n, 1.0);
    return ones;
 }
 // Host Array of n floats. Despite the name the contents are fixed: every
 // element is 1.
 Array<Where::host, float> random_array(const int64_t n) {
    Array<Where::host, float> ones(n, 1.0);
    return ones;
 }
 #if 0
 int send_x(int dst, int src, std::vector<float> &&v, MPI_Comm comm) {
    MPI_Send(v.data(), v.size(), MPI_FLOAT, dst, Tag::x, comm);
    return 0;
 }
 #endif
 /* Receive this rank's slice of x from rank `src` (message tag Tag::x) and
    put it in the right place in a full x: the result has n entries, the
    received values sit at this rank's partition offset and every other entry
    is zero.
    `dst` is unused; the receiver is always the calling rank.
    Throws std::logic_error when the incoming message does not hold exactly
    one value per local row.
 */
 std::vector<float> receive_x(const int n, const int dst, int src, MPI_Comm comm) {
    (void)dst;
    int rank = 0;
    int size = 1;
    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);
    // which rows of x are local
    Range local = get_partition(n, rank, size);
    // probe for size
    MPI_Status stat;
    MPI_Probe(src, Tag::x, comm, &stat);
    int sz;
    // count in the type that is actually transferred below
    MPI_Get_count(&stat, MPI_FLOAT, &sz);
    if (sz != local.ub-local.lb) {
        throw std::logic_error(AT);
    }
    std::cerr << "recv " << sz << " x entries into offset " << local.lb << "\n";
    std::vector<float> x(n);
    MPI_Recv(x.data() + local.lb, sz, MPI_FLOAT, src, Tag::x, comm, MPI_STATUS_IGNORE);
    return x;
 }
 // z += a, element by element, for every index of z.
 // `a` must hold at least z.size() elements. Each thread starts at its global
 // index and advances by the total number of launched threads, so any 1-D
 // launch configuration covers the whole vector.
 __global__ void vector_add(ArrayView<float> z, const ArrayView<float> a) {
    // 64-bit index and step: the view size is int64_t, and a 32-bit counter
    // could overflow before reaching it
    const int64_t step = static_cast<int64_t>(blockDim.x) * gridDim.x;
    for (int64_t i = static_cast<int64_t>(blockDim.x) * blockIdx.x + threadIdx.x; i < z.size(); i += step) {
        z(i) += a(i);
    }
 }
 int main (int argc, char **argv) {
    MPI_Init(&argc, &argv);
    int rank = 0;
    int size = 1;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    std::cerr << "get a gpu...\n";
    CUDA_RUNTIME(cudaSetDevice(rank % 4));
    CUDA_RUNTIME(cudaFree(0));
    std::cerr << "barrier...\n";
    MPI_Barrier(MPI_COMM_WORLD);
    // int64_t m = 150000;
    // int64_t n = 150000;
    // int64_t nnz = 11000000;
    // or
    int64_t m = 150000;
    int64_t n = m;
    int64_t bw = m/size; // ~50% local vs remote non-zeros for most ranks
    int64_t nnz = 11000000;
    CsrMat<Where::host> A; // "local A"
    // generate and distribute A
    if (0 == rank) {
        std::cerr << "generate matrix\n";
        A = random_band_matrix(m, bw, nnz);
    }
    RowPartSpmv spmv(A, 0, MPI_COMM_WORLD);
    std::cerr << "A:        " << A.num_rows()         << "x" << A.num_cols() << " w/ " << A.nnz() << "\n";
    std::cerr << "local A:  " << spmv.lA().num_rows() << "x" << spmv.lA().num_cols() << " w/ " << spmv.lA().nnz() << "\n";
    std::cerr << "remote A: " << spmv.rA().num_rows() << "x" << spmv.rA().num_cols() << " w/ " << spmv.rA().nnz() << "\n";
    int loPrio, hiPrio;
    CUDA_RUNTIME(cudaDeviceGetStreamPriorityRange (&loPrio, &hiPrio));
    cudaStream_t loS, hiS; // "lo/hi prio"
    CUDA_RUNTIME(cudaStreamCreateWithPriority(&loS, cudaStreamNonBlocking, hiPrio));
    CUDA_RUNTIME(cudaStreamCreateWithPriority(&hiS, cudaStreamNonBlocking, hiPrio));
    cudaEvent_t event;
    CUDA_RUNTIME(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
    const int nIters = 30;
    std::vector<double> times(nIters);
    nvtxRangePush("overlap");
    for (int i = 0; i < nIters; ++i) {
        MPI_Barrier(MPI_COMM_WORLD);
        double start = MPI_Wtime();
        spmv.send_x_async();
        spmv.launch_local();
        spmv.recv_x_async();
        spmv.send_x_wait();
        spmv.recv_x_wait();
        spmv.launch_remote();
        spmv.finish();
        times[i] = MPI_Wtime() - start;
    }
    nvtxRangePop(); // one-shot
    MPI_Allreduce(MPI_IN_PLACE, times.data(), times.size(), MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
    if (0 == rank) {
        std::sort(times.begin(), times.end());
        std::cerr << times[times.size() / 2] << "\n";
    }
    MPI_Finalize();
    return 0;
 }
--- a/partition.hpp
+++ b/partition.hpp
@@ -0,0 +1,59 @@
 #pragma once
 #include "csr_mat.hpp"
 // Half-open index interval [lb, ub).
 struct Range {
    int lb; // first index inside the interval
    int ub; // first index past the interval
    // number of indices covered
    int extent() const {
        const int width = ub - lb;
        return width;
    }
 };
 /* get the ith part of splitting domain in to n pieces
    if not divisible, remainder distributed to lower:
    the first (domain % n) parts hold one item more than the rest.
    n must not be 0.
    `inline` because the definition lives in a header.
 */
 inline Range get_partition(const int domain, const int i, const int n) {
    const int div = domain / n;
    const int rem = domain % n;
    // every part before i contributes div items, and each of the first
    // `rem` parts one more
    const int lb = i * div + (i < rem ? i : rem);
    const int ub = lb + div + (i < rem ? 1 : 0);
    // plain aggregate initialization; designated initializers need C++20
    Range part = {lb, ub};
    return part;
 }
 // who owns item `i` from `domain` split into `n`
 // Uses the same layout as the partitioning above: the first (domain % n)
 // parts hold one item more than the others. Expects 0 <= i < domain and
 // n != 0. `inline` because the definition lives in a header.
 inline int get_owner(int domain, int i, const int n) {
    const int div = domain / n;
    const int rem = domain % n;
    // number of items held by the larger, (div + 1)-sized parts
    const int inBigParts = (div + 1) * rem;
    if (i < inBigParts) {
        return i / (div + 1);
    }
    // past the larger parts every owner holds exactly div items
    return rem + (i - inBigParts) / div;
 }
 std::vector<CsrMat<Where::host>> part_by_rows(const CsrMat<Where::host> &m, const int parts) {
    std::vector<CsrMat<Where::host>> mats;
    for (int p = 0; p < parts; ++p) {
        Range range = get_partition(m.num_rows(), p, parts);
        std::cerr << "matrix part " << p << " has " << range.ub-range.lb << " rows\n";
        CsrMat<Where::host> part(m);
        part.retain_rows(range.lb, range.ub);
        mats.push_back(part);
    }
    return mats;
 }
--- a/row_part_spmv.cuh
+++ b/row_part_spmv.cuh
@@ -0,0 +1,328 @@
 #pragma once
 #include <mpi.h>
 #include "csr_mat.hpp"
 #include "partition.hpp"
 #include "split_coo_mat.hpp"
 #include <cassert>
 // Selects how the spmv kernel stores each row's result into b.
 enum class ProductConfig {
    MODIFY, // b += A*x : add to the value already in b
    SET     // b = A*x  : overwrite b
 };
 /* Ax=b
    Sparse matrix-vector product over a CSR view. Each thread handles whole
    rows: it starts at its global thread index and advances by the total
    number of launched threads until the rows run out.
    pc decides whether the row result overwrites b or is added to it.
 */
 __global__ void spmv(ArrayView<float> b,
     const CsrMat<Where::device>::View A,
      const ArrayView<float> x,
      const ProductConfig pc
    ) {
    // one thread per row
    int row = blockDim.x * blockIdx.x + threadIdx.x;
    while (row < A.num_rows()) {
        // dot product of this row's stored entries with x
        float sum = 0;
        int k = A.row_ptr(row);
        while (k < A.row_ptr(row + 1)) {
            const int col = A.col_ind(k);
            sum += A.val(k) * x(col);
            ++k;
        }
        if (ProductConfig::SET != pc) {
            b(row) += sum;
        } else {
            b(row) = sum;
        }
        row += blockDim.x * gridDim.x;
    }
 }
 // MPI message tags, one per kind of payload.
 // Unscoped with an int underlying type, so an enumerator can be passed
 // directly where MPI expects an int tag.
 enum Tag : int {
    row_ptr,  // CSR row pointers of a matrix part
    col_ind,  // CSR column indices of a matrix part
    val,      // CSR values of a matrix part
    x,        // entries of the dense vector x
    num_cols  // column count of a matrix part
 };
 int send_matrix(int dst, int src, CsrMat<Where::host> &&m, MPI_Comm comm) {
    MPI_Request reqs[4];
    int numCols = m.num_cols();
    MPI_Isend(&numCols, 1, MPI_INT, dst, Tag::num_cols, comm, &reqs[0]);
    MPI_Isend(m.row_ptr(), m.num_rows()+1, MPI_INT, dst, Tag::row_ptr, comm, &reqs[1]);
    MPI_Isend(m.col_ind(), m.nnz(), MPI_INT, dst, Tag::col_ind, comm, &reqs[2]);
    MPI_Isend(m.val(), m.nnz(), MPI_FLOAT, dst, Tag::val, comm, &reqs[3]);
    MPI_Waitall(4, reqs, MPI_STATUSES_IGNORE);
    return 0;
 }
 // Receive a host CSR matrix from rank `src`: the counterpart of send_matrix.
 // The row and entry counts are not transmitted; they are taken from the
 // sizes of the pending row-pointer and column-index messages.
 // `dst` is unused; the receiver is always the calling rank.
 CsrMat<Where::host> receive_matrix(int dst, int src, MPI_Comm comm) {
    (void)dst;
    int numCols;
    MPI_Recv(&numCols, 1, MPI_INT, src, Tag::num_cols, comm, MPI_STATUS_IGNORE);
    // probe for number of rows
    MPI_Status stat;
    MPI_Probe(src, Tag::row_ptr, comm, &stat);
    int numRows;
    MPI_Get_count(&stat, MPI_INT, &numRows);
    // the message holds one pointer per row plus the end marker
    if (numRows > 0) {
        --numRows;
    }
    // probe for nnz
    MPI_Probe(src, Tag::col_ind, comm, &stat);
    int nnz;
    MPI_Get_count(&stat, MPI_INT, &nnz);
    std::cerr << "recv " << numRows << "x" << numCols << " w/ " << nnz << "\n";
    CsrMat<Where::host> csr(numRows, numCols, nnz);
    // receive actual data into matrix
    MPI_Recv(csr.row_ptr(), numRows+1, MPI_INT, src, Tag::row_ptr, comm, MPI_STATUS_IGNORE);
    MPI_Recv(csr.col_ind(), nnz, MPI_INT, src, Tag::col_ind, comm, MPI_STATUS_IGNORE);
    MPI_Recv(csr.val(), nnz, MPI_FLOAT, src, Tag::val, comm, MPI_STATUS_IGNORE);
    return csr;
 }
 // out[i] = in[idx[i]]
 // Runs over the first min(out.size(), idx.size()) positions. Every idx value
 // must be a valid index into `in`. Each thread starts at its global index
 // and advances by the total number of launched threads, so any 1-D launch
 // configuration covers the whole range.
 __global__ void scatter(ArrayView<float> out, 
 ArrayView<float> in, 
 ArrayView<int> idx) {
    const int64_t count = out.size() < idx.size() ? out.size() : idx.size();
    const int64_t step = static_cast<int64_t>(blockDim.x) * gridDim.x;
    for (int64_t i = static_cast<int64_t>(blockDim.x) * blockIdx.x + threadIdx.x; i < count; i += step) {
        out(i) = in(idx(i));
    }
 }
 /* Ax=y , partitioned evenly by rows of A
   always have to pack on the send side, since not all local x values will be needed
   so may as well pack in a way that recver doesn't have to unpack
   serializing local and remote means you don't have to worry about
   concurrent adds to the product vector
   if kernels are sufficiently large, no real opportunity for these to overlap anyway
   if they're small, communication time will be longer anway
 */
 class RowPartSpmv {
 private:
    MPI_Comm comm_;
    int loff_; // first row in global index
    CsrMat<Where::device> la_; // local A
    CsrMat<Where::device> ra_; // remote A
    Array<Where::device, float> lx_; // local x
    Array<Where::device, float> rx_; // remote x
    Array<Where::device, float> ly_;
    // info for sending x
    struct SendParam {
        int dst; // destination rank
        int displ;
        int count;
        MPI_Request req;
    };
    std::vector<SendParam> sendParams_; // parameters for each rank
    Array<Where::device, int> xSendIdx_; // which entry of lx_ will be in each xSendBuf_;
    Array<Where::device, float> xSendBuf_; // send local x entries to other ranks
    std::vector<int> gCols_; // global index from local
    std::map<int, std::vector<int>> sendEntr; // which entries of x to send to each rank
    std::map<int, MPI_Request> sendReq;
    struct RecvParam {
        int src; // source rank
        int displ; // displacement in 
        int count; // number of entries
        MPI_Request req;
    };
    std::vector<RecvParam> recvParams_;
    cudaStream_t kernelStream_;
    cudaStream_t packStream_;
 public:
    const CsrMat<Where::device> &lA() const {return la_;}
    const CsrMat<Where::device> &rA() const {return ra_;}
    void launch_local() {
        dim3 dimGrid(100);
        dim3 dimBlock(128);
        spmv<<<dimGrid, dimBlock, 0, kernelStream_>>>(ly_.view(), la_.view(), lx_.view(), ProductConfig::SET);
        CUDA_RUNTIME(cudaGetLastError());
    }
    void launch_remote() {
        dim3 dimGrid(100);
        dim3 dimBlock(128);
        spmv<<<dimGrid, dimBlock, 0, kernelStream_>>>(ly_.view(), ra_.view(), rx_.view(), ProductConfig::MODIFY);
        CUDA_RUNTIME(cudaGetLastError());
    }
    void pack_x_async() {
        scatter<<<100,128, 0, packStream_>>>(xSendBuf_.view(), lx_.view(), xSendIdx_.view());
    }
    void pack_x_wait() {
        CUDA_RUNTIME(cudaStreamSynchronize(packStream_));
    }
    void send_x_async() {
        // send to neighbors who want it
        for (auto &p : sendParams_) {
            int tag = 0;
            MPI_Isend(xSendBuf_.data() + p.displ, p.count, MPI_FLOAT, p.dst, tag, comm_, &p.req);
        }
    }
    void send_x_wait() {
        for (auto &p : sendParams_) {
            MPI_Wait(&p.req, MPI_STATUS_IGNORE);
        }
    }
    void recv_x_async() {
        for (auto &p : recvParams_) {
            int tag = 0;
            MPI_Irecv(rx_.data() + p.displ, p.count, MPI_FLOAT, p.src, tag, comm_, &p.req);
        }
    }
    void recv_x_wait() {
        for (auto &p : recvParams_) {
            MPI_Wait(&p.req, MPI_STATUS_IGNORE);
        }
    }
    void finish() {
        CUDA_RUNTIME(cudaStreamSynchronize(kernelStream_));
    }
    void launch_local_spmv() {}
    void launch_remote_spmv() {}
    /* create from a matrix at root
    */
    RowPartSpmv(
        const CsrMat<Where::host> &wholeA,
        const int root,
        MPI_Comm comm
    ) {
    int rank, size;
    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);
    CsrMat<Where::host> a;
    if (root == rank) {
        std::cerr << "partition matrix\n";
        std::vector<CsrMat<Where::host>> as = part_by_rows(wholeA, size);
        for (size_t dst = 0; dst < size; ++dst) {
            if (root != dst) {
                std::cerr << "send A to " << dst << "\n";
                send_matrix(dst, 0, std::move(as[dst]), MPI_COMM_WORLD);
            }
        }
        a = as[rank];
    } else {
        std::cerr << "recv A at " << rank << "\n";
        a = receive_matrix(rank, 0, MPI_COMM_WORLD);
    }
    // split row part of a into local and global
    SplitCooMat scm = split_local_remote(a, comm);
    loff_ = scm.loff;
    // create local part of x array
    // undefined entries
    Range xrange = get_partition(a.num_cols(), rank, size);
    lx_ = Array<Where::device, float>(xrange.extent());
    // create remote part of x array
    // one entry per remote column
    rx_ = Array<Where::device,float>(scm.globals.size());
    // determine which columns needed from others
    std::map<int, std::vector<int>> recvCols;
    for (int c : scm.globals) {
        auto src = get_owner(a.num_cols(), c, size);
        assert(rank != src && "should not need my own columns in remote part");
        recvCols[src].push_back(c);
    }
    // create receive parameters
    int offset = 0;
    for (auto it = recvCols.begin(); it != recvCols.end(); ++it) {
        RecvParam param;
        param.displ = offset;
        param.src = it->first;
        offset += it->second.size();
        param.count = offset - param.displ;
        recvParams_.push_back(param);
    }
    // tell others which cols I need (send 0 if nothing)
    std::vector<MPI_Request> reqs(size);
    for (int dest = 0; dest < size; ++dest) {
        auto it = recvCols.find(dest);
        if (it != recvCols.end()) {
            MPI_Isend(it->second.data(), it->second.size(), MPI_INT, dest, 0, comm, &reqs[dest]);
        } else {
            int _;
            MPI_Isend(&_, 0, MPI_INT, dest, 0, comm, &reqs[dest]);
        }
    }
    // which global x rows other ranks need from me
    std::map<int, std::vector<int>> sendCols;
    for (int src = 0; src < size; ++src) {
        MPI_Status status;
        MPI_Probe(src, 0, comm, &status);
        int count;
        MPI_Get_count(&status, MPI_INT, &count);
        if (count != 0) {
            sendCols[src].resize(count);
            MPI_Recv(sendCols[src].data(), count, MPI_INT, src, 0, comm, MPI_STATUS_IGNORE);
        } else {
            int _;
            MPI_Recv(&_, 0, MPI_INT, src, 0, comm, MPI_STATUS_IGNORE);
        }
    }
    // create the offsets from lx that we will send out
    // TODO: should be device array
    std::vector<int> offsets;
    for (auto it = sendCols.begin(); it != sendCols.end(); ++it) {
        // TODO - adjust for changed local array columns
        SendParam param;
        param.displ = offsets.size();
        param.dst = it->first;
        for (int gc : it->second) {
            int lc = gc - scm.loff;
            offsets.push_back(lc);
        }
        param.count = offsets.size() - param.displ;
        sendParams_.push_back(param);
    }
    assert(la_.size() > 0);
    assert(ra_.size() > 0); // remote A
    assert(lx_.size() > 0);
    assert(rx_.size() > 0);
    assert(ly_.size() > 0);
    }
 };
--- a/split_coo_mat.hpp
+++ b/split_coo_mat.hpp
@@ -0,0 +1,95 @@
 #pragma once
 #include "coo_mat.hpp"
 #include "partition.hpp"
 /* Result of splitting one rank's rows into a local and a remote column part.
   local matrix has cols renumbered to be 0..N
   into the local dense vector
   remote matrix has cols renumbered for the remote dense vector
 */
 struct SplitCooMat {
    int loff; // lower bound of this rank's index range; a global column minus loff gives its local column
    CsrMat<Where::host> local; // local matrix
    CsrMat<Where::host> remote; // remote matrix (with local column indices)
    std::map<int, int> locals;  // get local column from global column
    std::vector<int> globals; // get global column for local column
 };
 /* Row partition of a matrix into a "local" and "remote" part
   If locally there are rows i...j, then the local part also has columns i...j
   The remote part will have all other columns
   Each rank will also renumber the column indices in the remote part
   This rank will recv the corresponding remote x vector entries, but
   doesn't want to materialize the whole distributed x vector in memory
   so the column indices must be packed into a contiguous range 0...N
   Furthermore, we want all entries from rank 0 to come first, then rank 1, etc.
   This is so we can just get entries from rank 0 and recv them directly into the
   remote x vector at the correct offset
   To do this, relabel the matrix in the following way:
   Get a list of unique required global ids, and then sort them.
   The first will be local 0, then local 1, etc

   `inline` because the definition lives in a header.
 */
 inline SplitCooMat split_local_remote(const CsrMat<Where::host> &m, MPI_Comm comm) {
    int rank = 0;
    int size = 1;
    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);
    // which rows of x are local
    Range localRange = get_partition(m.num_cols(), rank, size);
    int loff = localRange.lb;
    // build two matrices, local gets local non-zeros, remote gets remote non-zeros
    CooMat local(m.num_rows(), m.num_cols());
    CooMat remote(m.num_rows(), m.num_cols());
    std::vector<int> globals; // get global col for local col
    for (int r = 0; r < m.num_rows(); ++r) {
        for (int ci = m.row_ptr(r); ci < m.row_ptr(r+1); ++ci) {
            int c = m.col_ind(ci);
            float v = m.val(ci);
            if (c >= localRange.lb && c < localRange.ub) {
                int lc = c - loff;
                local.push_back(r,lc,v);
            } else {
                // keep the global column for now, it will be renumbered later
                globals.push_back(c);
                remote.push_back(r, c, v);
            }
        }
    }
    // sort required global columns.
    // this will ensure the lowest owning rank comes first, and all are contiguous
    std::sort(globals.begin(), globals.end());
    auto it = std::unique(globals.begin(), globals.end());
    globals.resize(it - globals.begin());
    std::map<int, int> locals; // get local col for global column
    for (size_t lc = 0; lc < globals.size(); ++lc) {
        int gc = globals[lc];
        locals[gc] = lc;
    }
    // relabel remote columns
    for (CooMat::Entry &e : remote) {
        e.j = locals[e.j];
    }
    // fill the result member by member (designated initializers need C++20);
    // the two coordinate matrices are converted to CSR on the way
    SplitCooMat result;
    result.loff = loff;
    result.local = CsrMat<Where::host>(local);
    result.remote = CsrMat<Where::host>(remote);
    result.locals = locals;
    result.globals = globals;
    return result;
 }
--- a/where.hpp
+++ b/where.hpp
@@ -0,0 +1,6 @@
 #pragma once
 // Template argument that selects the memory a container lives in.
 enum class Where {
    host,  // host memory
    device // device (GPU) memory
 };