diff --git a/CMakeLists.txt b/CMakeLists.txt
index f136eef..4eb928e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -89,6 +89,7 @@ if (MPI_FOUND)
   add_executable(main main.cu)
   target_include_directories(main PRIVATE SYSTEM ${MPI_CXX_INCLUDE_DIRS})
   target_link_libraries(main ${MPI_CXX_LIBRARIES})
+  target_link_libraries(main CUDA::nvToolsExt)
   # target_include_directories(main PRIVATE ${MPI_CXX_INCLUDE_PATH})
   # target_compile_options(main PRIVATE ${MPI_CXX_COMPILE_FLAGS})
   # target_link_libraries(main ${MPI_CXX_LIBRARIES} ${MPI_CXX_LINK_FLAGS})
diff --git a/README.md b/README.md
index 95363cc..4061b2d 100644
--- a/README.md
+++ b/README.md
@@ -12,8 +12,18 @@ module load cmake/3.18.0

 **ascicgpu030**

+To get Nsight Systems, I had to download the rpm and unpack it into my home directory with `cpio`.
+
 ```
 module purge
-sierra-devel/nvidia
+module load sierra-devel/nvidia
 module load cde/v2/cmake/3.19.2
-```
\ No newline at end of file
+```
+
+```
+mpirun -n 2 ~/software/nsight-systems-cli/2021.2.1/bin/nsys profile -c cudaProfilerApi -t cuda,mpi,nvtx -o dist-spmv_%q{OMPI_COMM_WORLD_RANK} -f true ./main
+```
+
+## Design Considerations
+
+Minimize CUDA runtime calls
\ No newline at end of file
diff --git a/main.cu b/main.cu
index 76557a8..64152f4 100644
--- a/main.cu
+++ b/main.cu
@@ -1,5 +1,8 @@
 #include <mpi.h>
+#include <nvToolsExt.h>
+// #include <cuda_profiler_api.h>
+
 #include <algorithm>
 #include <iostream>
 #include <vector>

@@ -39,43 +42,44 @@ template <Where where, typename T> class Array;

+// A non-owning view of data
+template <typename T>
+struct ArrayView
+{
+  T *data_;
+  int64_t size_;
+public:
+  ArrayView() : data_(nullptr), size_(0) {}
+  ArrayView(const ArrayView &other) = default;
+  ArrayView(ArrayView &&other) = default;
+  ArrayView &operator=(const ArrayView &rhs) = default;
+
+  __host__ __device__ int64_t size() const { return size_; }
+
+  __host__ __device__ const T &operator()(int64_t i) const {
+#ifdef VIEW_CHECK_BOUNDS
+    if (i < 0) {
+      printf("ERR: i < 0: %ld\n", i);
+    }
+    if (i >= size_) {
+      printf("ERR: i > size_: %ld > %ld\n", i, size_);
+    }
+#endif
+    return data_[i];
+  }
+  __host__ __device__ T &operator()(int64_t i) {
+    return data_[i];
+  }
+};
+
 /* device array
 */
 template <typename T> class Array<Where::device, T>
 {
 public:
-  // A non-owning view of data
-  struct View
-  {
-    T *data_;
-    int64_t size_;
-  public:
-    View() : data_(nullptr), size_(0) {}
-    View(const View &other) = default;
-    View(View &&other) = default;
-    View &operator=(const View &rhs) = default;
-
-    __host__ __device__ int64_t size() const { return size_; }
-
-    __device__ const T &operator()(int64_t i) const {
-#ifdef VIEW_CHECK_BOUNDS
-      if (i < 0) {
-        printf("ERR: i < 0: %d\n", i);
-      }
-      if (i >= size_) {
-        printf("ERR: i > size_: %d > %ld\n", i, size_);
-      }
-#endif
-      return data_[i];
-    }
-    __device__ T &operator()(int64_t i) {
-      return data_[i];
-    }
-  };
-
-  // array owns the data in this view
-  View view_;
+  ArrayView<T> view_;
 public:
   Array() = default;
   Array(const size_t n) {
@@ -91,6 +95,7 @@ public:
   Array(const std::vector<T> &v) {
     set_from(v);
   }
+
   ~Array() {
     CUDA_RUNTIME(cudaFree(view_.data_));
     view_.data_ = nullptr;
@@ -99,7 +104,7 @@ public:
   int64_t size() const {
     return view_.size();
   }
-  View view() const {
+  ArrayView<T> view() const {
     return view_; // copy of internal view
   }

@@ -114,6 +119,11 @@ public:
     CUDA_RUNTIME(cudaMemcpyAsync(view_.data_, rhs.data(), view_.size_ * sizeof(T), cudaMemcpyHostToDevice, stream));
   }

+  void set_from(const Array<Where::host, T> &rhs, cudaStream_t stream = 0) {
+    resize(rhs.size());
+    CUDA_RUNTIME(cudaMemcpyAsync(view_.data_, rhs.data(), view_.size_ * sizeof(T), cudaMemcpyHostToDevice, stream));
+  }
+
   // any change destroys all data
   void resize(size_t n) {
     if (size() != n) {
@@ -125,6 +135,61 @@ public:
 };

+
+/* host array
+*/
+template <typename T> class Array<Where::host, T>
+{
+public:
+
+  // array owns the data in this view
+  ArrayView<T> view_;
+public:
+  Array() = default;
+  Array(const size_t n, const T &val) {
+    resize(n);
+    for (size_t i = 0; i < n; ++i) {
+      view_(i) = val;
+    }
+  }
+  Array(const Array &other) = delete;
+  Array(Array &&other) : view_(other.view_) {
+    // view is non-owning, so have to clear other
+    other.view_.data_ = nullptr;
+    other.view_.size_ = 0;
+  }
+
+  ~Array() {
+    CUDA_RUNTIME(cudaFreeHost(view_.data_));
+    view_.data_ = nullptr;
+    view_.size_ = 0;
+  }
+  int64_t size() const {
+    return view_.size();
+  }
+
+  ArrayView<T> view() const {
+    return view_; // copy of internal view
+  }
+
+  // any change destroys all data
+  void resize(size_t n) {
+    if (size() != n) {
+      view_.size_ = n;
+      CUDA_RUNTIME(cudaFreeHost(view_.data_));
+      CUDA_RUNTIME(cudaHostAlloc(&view_.data_, view_.size_ * sizeof(T), cudaHostAllocDefault));
+    }
+  }
+
+  const T* data() const {
+    return view_.data_;
+  }
+  T* data() {
+    return view_.data_;
+  }
+
+};
+
+
 class CooMat {
 public:

@@ -145,6 +210,10 @@ public:
       return a.j < b.j;
     }
   }
+
+  static bool same_ij(const Entry &a, const Entry &b) {
+    return a.i == b.i && a.j == b.j;
+  }
 };

 private:
@@ -158,24 +227,18 @@ public:
   CooMat(int m, int n) : numRows_(m), numCols_(n) {}
   const std::vector<Entry> &entries() const {return data_;}
   void push_back(int i, int j, int e) {
-
-    Entry entry(i,j,e);
-
-    // first position not less than entry
-    auto lb = std::lower_bound(data_.begin(), data_.end(), entry, Entry::by_ij);
-
-    // overwrite if already exists
-    if (lb != data_.end() && lb->i == entry.i && lb->j == entry.j) {
-      *lb = entry;
-    } else {
-      data_.insert(lb, entry);
-    }
+    data_.push_back(Entry(i,j,e));
   }

   void sort() {
     std::sort(data_.begin(), data_.end(), Entry::by_ij);
   }

+  void remove_duplicates() {
+    std::sort(data_.begin(), data_.end(), Entry::by_ij);
+    data_.erase(std::unique(data_.begin(), data_.end(), Entry::same_ij), data_.end());
+  }
+
   int64_t num_rows() const {return numRows_;}
   int64_t num_cols() const {return numCols_;}
   int64_t nnz() const {return data_.size();}
@@ -236,6 +299,16 @@ public:
     return colInd_.size();
   }

+  const int &row_ptr(int64_t i) const {
+    return rowPtr_[i];
+  }
+  const int &col_ind(int64_t i) const {
+    return colInd_[i];
+  }
+  const float &val(int64_t i) const {
+    return val_[i];
+  }
+
   const int *row_ptr() const {return rowPtr_.data(); }
   int *row_ptr() {return rowPtr_.data(); }
   const int *col_ind() const {return colInd_.data(); }
@@ -295,9 +368,9 @@ template<> class CsrMat<Where::device>
 public:
   struct View {
-    Array<Where::device, int>::View rowPtr_;
-    Array<Where::device, int>::View colInd_;
-    Array<Where::device, float>::View val_;
+    ArrayView<int> rowPtr_;
+    ArrayView<int> colInd_;
+    ArrayView<float> val_;

     __device__ int num_rows() const {
       if (rowPtr_.size() > 0) {
@@ -368,10 +441,17 @@ CsrMat<Where::host> random_matrix(const int64_t m, const int64_t n, const int64_

   CooMat coo(m,n);
   while(coo.nnz() < nnz) {
-    int r = rand() % m;
-    int c = rand() % n;
-    float e = 1.0;
-    coo.push_back(r, c, e);
+
+    int toPush = nnz - coo.nnz();
+    std::cerr << "adding " << toPush << " non-zeros\n";
+    for (int _ = 0; _ < toPush; ++_) {
+      int r = rand() % m;
+      int c = rand() % n;
+      float e = 1.0;
+      coo.push_back(r, c, e);
+    }
+    std::cerr << "removing duplicate non-zeros\n";
+    coo.remove_duplicates();
   }
   coo.sort();
   std::cerr << "coo: " << coo.num_rows() << "x" << coo.num_cols() << "\n";
@@ -384,6 +464,10 @@ std::vector<float> random_vector(const int64_t n) {
   return std::vector<float>(n, 1.0);
 }

+Array<Where::host, float> random_array(const int64_t n) {
+  return Array<Where::host, float>(n, 1.0);
+}
+
 struct Range {
   int lb;
   int ub;
 };

@@ -422,6 +506,44 @@ std::vector<CsrMat<Where::host>> part_by_rows(const CsrMat<Where::host> &m, cons
   return mats;
 }

+struct DistMat {
+  CsrMat<Where::host> local;
+  CsrMat<Where::host> remote;
+};
+
+DistMat split_local_remote(const CsrMat<Where::host> &m, MPI_Comm comm) {
+  int rank = 0;
+  int size = 1;
+  MPI_Comm_rank(comm, &rank);
+  MPI_Comm_size(comm, &size);
+
+  // which rows of x are local
+  Range localRange = get_partition(m.num_cols(), rank, size);
+
+  // build two matrices, local gets local non-zeros, remote gets remote non-zeros
+  CooMat local(m.num_rows(), m.num_cols()), remote(m.num_rows(), m.num_cols());
+
+  for (int r = 0; r < m.num_rows(); ++r) {
+    for (int ci = m.row_ptr(r); ci < m.row_ptr(r+1); ++ci) {
+      int c = m.col_ind(ci);
+      float v = m.val(ci);
+
+      if (c >= localRange.lb && c < localRange.ub) {
+        local.push_back(r,c,v);
+      } else {
+        remote.push_back(r,c,v);
+      }
+
+    }
+  }
+
+  return DistMat {
+    .local=local,
+    .remote=remote
+  };
+
+}
+
 std::vector<std::vector<float>> part_by_rows(const std::vector<float> &x, const int parts) {

   std::vector<std::vector<float>> xs;

@@ -441,11 +563,14 @@ std::vector<std::vector<float>> part_by_rows(const std::vector<float> &x, const

 int send_matrix(int dst, int src, CsrMat<Where::host> &&m, MPI_Comm comm) {

+  MPI_Request reqs[4];
+
   int numCols = m.num_cols();

-  MPI_Send(&numCols, 1, MPI_INT, dst, Tag::num_cols, comm);
-  MPI_Send(m.row_ptr(), m.num_rows()+1, MPI_INT, dst, Tag::row_ptr, comm);
-  MPI_Send(m.col_ind(), m.nnz(), MPI_INT, dst, Tag::col_ind, comm);
-  MPI_Send(m.val(), m.nnz(), MPI_FLOAT, dst, Tag::val, comm);
+  MPI_Isend(&numCols, 1, MPI_INT, dst, Tag::num_cols, comm, &reqs[0]);
+  MPI_Isend(m.row_ptr(), m.num_rows()+1, MPI_INT, dst, Tag::row_ptr, comm, &reqs[1]);
+  MPI_Isend(m.col_ind(), m.nnz(), MPI_INT, dst, Tag::col_ind, comm, &reqs[2]);
+  MPI_Isend(m.val(), m.nnz(), MPI_FLOAT, dst, Tag::val, comm, &reqs[3]);
+  MPI_Waitall(4, reqs, MPI_STATUSES_IGNORE);

   return 0;
 }

@@ -474,40 +599,57 @@ CsrMat<Where::host> receive_matrix(int dst, int src, MPI_Comm comm) {

   // receive actual data into matrix
   MPI_Recv(csr.row_ptr(), numRows+1, MPI_INT, 0, Tag::row_ptr, comm, MPI_STATUS_IGNORE);
-  // receive actual data into matrix
   MPI_Recv(csr.col_ind(), nnz, MPI_INT, 0, Tag::col_ind, comm, MPI_STATUS_IGNORE);
-  // receive actual data into matrix
   MPI_Recv(csr.val(), nnz, MPI_FLOAT, 0, Tag::val, comm, MPI_STATUS_IGNORE);

   return csr;
 }

-int send_vector(int dst, int src, std::vector<float> &&v, MPI_Comm comm) {
+int send_x(int dst, int src, std::vector<float> &&v, MPI_Comm comm) {
   MPI_Send(v.data(), v.size(), MPI_FLOAT, dst, Tag::x, comm);
   return 0;
 }

-std::vector<float> receive_vector(int dst, int src, MPI_Comm comm) {
+/* recv some amount of data, and put it in the right place
+   in a full x
+*/
+std::vector<float> receive_x(const int n, const int dst, int src, MPI_Comm comm) {
+  int rank = 0;
+  int size = 1;
+  MPI_Comm_rank(comm, &rank);
+  MPI_Comm_size(comm, &size);
+
+  // which rows of x are local
+  Range local = get_partition(n, rank, size);

   // probe for size
   MPI_Status stat;
   MPI_Probe(0, Tag::x, comm, &stat);
   int sz;
   MPI_Get_count(&stat, MPI_INT, &sz);
-  std::vector<float> x(sz);
+  if (sz != local.ub-local.lb) {
+    throw std::logic_error(AT);
+  }

-  std::cerr << "recv " << sz << " x entries\n";
-
-  // receive actual data into matrix
-  MPI_Recv(x.data(), x.size(), MPI_FLOAT, 0, Tag::x, comm, MPI_STATUS_IGNORE);
+  std::cerr << "recv " << sz << " x entries into offset " << local.lb << "\n";
+  std::vector<float> x(n);
+  MPI_Recv(x.data() + local.lb, sz, MPI_FLOAT, 0, Tag::x, comm, MPI_STATUS_IGNORE);

   return x;
 }

+enum class ProductConfig {
+  MODIFY, // b +=
+  SET     // b =
+};
+
 /* Ax=b
 */
-__global__ void spmv(Array<Where::device, float>::View b, const CsrMat<Where::device>::View A, const Array<Where::device, float>::View x) {
-
+__global__ void spmv(ArrayView<float> b,
+                     const CsrMat<Where::device>::View A,
+                     const ArrayView<float> x,
+                     const ProductConfig pc
+                     ) {
   // one thread per row
   for (int r = blockDim.x * blockIdx.x + threadIdx.x; r < A.num_rows(); r += blockDim.x * gridDim.x) {
     float acc = 0;
@@ -515,68 +657,89 @@ __global__ void spmv(Array<Where::device, float>::View b, const CsrMat<Where::d
     }
-    b(r) = acc;
+    if (ProductConfig::SET == pc) {
+      b(r) = acc;
+    } else {
+      b(r) += acc;
+    }
   }
 }

+/* z += a
+*/
+__global__ void add(ArrayView<float> z, const ArrayView<float> a) {
+  for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < z.size(); i += blockDim.x * gridDim.x) {
+    z(i) += a(i);
+  }
+}
+
 int main (int argc, char **argv) {

   MPI_Init(&argc, &argv);

-  std::cerr << "get a gpu...\n";
-  CUDA_RUNTIME(cudaFree(0));
-  std::cerr << "barrier...\n";
-  MPI_Barrier(MPI_COMM_WORLD);
-
   int rank = 0;
   int size = 1;
   MPI_Comm_rank(MPI_COMM_WORLD, &rank);
   MPI_Comm_size(MPI_COMM_WORLD, &size);

-  int64_t m = 100;
-  int64_t n = 50;
-  int64_t nnz = 100;
+  std::cerr << "get a gpu...\n";
+  CUDA_RUNTIME(cudaSetDevice(rank));
+  CUDA_RUNTIME(cudaFree(0));
+  std::cerr << "barrier...\n";
+  MPI_Barrier(MPI_COMM_WORLD);

-  CsrMat<Where::host> A;
-  std::vector<float> x;
+  int64_t m = 150000;
+  int64_t n = 150000;
+  int64_t nnz = 11000000;

-  // generate and distribute A and x
+  CsrMat<Where::host> lA; // "local A"
+
+  // generate and distribute A
   if (0 == rank) {
-    A = random_matrix(m, n, nnz);
-    x = random_vector(n);
-    std::vector<CsrMat<Where::host>> As = part_by_rows(A, size);
-    std::vector<std::vector<float>> xs = part_by_rows(x, size);
+    std::cerr << "generate matrix\n";
+    lA = random_matrix(m, n, nnz);
+    std::cerr << "partition matrix\n";
+    std::vector<CsrMat<Where::host>> As = part_by_rows(lA, size);
     for (size_t dst = 1; dst < size; ++dst) {
       std::cerr << "send A to " << dst << "\n";
       send_matrix(dst, 0, std::move(As[dst]), MPI_COMM_WORLD);
-      std::cerr << "send x to " << dst << "\n";
-      send_vector(dst, 0, std::move(xs[dst]), MPI_COMM_WORLD);
     }
-    A = As[rank];
-    x = xs[rank];
+    lA = As[rank];
   } else {
     std::cerr << "recv A at " << rank << "\n";
-    A = receive_matrix(rank, 0, MPI_COMM_WORLD);
-    std::cerr << "recv x at " << rank << "\n";
-    x = receive_vector(rank, 0, MPI_COMM_WORLD);
+    lA = receive_matrix(rank, 0, MPI_COMM_WORLD);
   }

-  std::cerr << "A[" << A.num_rows() << "," << A.num_cols() << "] w/ " << A.nnz() << "\n";
+  // each rank has a dense x. each rank owns part of it,
+  // but it doesn't matter what the entries are
+  Array<Where::host, float> lx = random_array(n); // "local x"
+  std::cerr << "local X: " << lx.size() << "\n";
+  std::cerr << "copy x to device\n";
+  Array<Where::device, float> lxd(lx.size()), rxd(lx.size()); // "local/remote x device"
+
+  // get a local and remote split of A
+  std::cerr << "split local/remote A\n";
+  CsrMat<Where::host> rA, A(lA);
+  {
+    DistMat d = split_local_remote(lA, MPI_COMM_WORLD);
+    lA = d.local;
+    rA = d.remote;
+  }
+  std::cerr << "A: " << A.num_rows() << "x" << A.num_cols() << " w/ " << A.nnz() << "\n";
+  std::cerr << "local A: " << lA.num_rows() << "x" << lA.num_cols() << " w/ " << lA.nnz() << "\n";
+  std::cerr << "remote A: " << rA.num_rows() << "x" << rA.num_cols() << " w/ " << rA.nnz() << "\n";
+
   std::cerr << "Copy A to GPU\n";
-  CsrMat<Where::device> Ad(A);
+  CsrMat<Where::device> Ad(A), lAd(lA), rAd(rA);

   // Product vector size is same as local rows of A
-  std::vector<float> b(A.num_rows(), 0);
+  std::vector<float> b(lA.num_rows(), 0);
   std::cerr << "Copy b to GPU\n";
-  Array<Where::device, float> bd(b);
+  Array<Where::device, float> lbd(b), rbd(b); // "local b device, remote b device"

-  // plan allgather of complete x
+  // plan allgather of remote x data
   std::cerr << "plan allgather xs\n";
-  std::vector<float> lx(A.num_cols());
-  Array<Where::device, float> lxd(A.num_cols());
   std::vector<int> recvcounts;
   std::vector<int> displs;
   for (int i = 0; i < size; ++i) {
@@ -589,42 +752,96 @@ int main (int argc, char **argv) {
     }
   }

-
+  int loPrio, hiPrio;
+  CUDA_RUNTIME(cudaDeviceGetStreamPriorityRange(&loPrio, &hiPrio));
+
+  cudaStream_t loS, hiS; // "lo/hi prio"
+  CUDA_RUNTIME(cudaStreamCreateWithPriority(&loS, cudaStreamNonBlocking, loPrio));
+  CUDA_RUNTIME(cudaStreamCreateWithPriority(&hiS, cudaStreamNonBlocking, hiPrio));
+
+  cudaEvent_t event;
+  CUDA_RUNTIME(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
+
+  const int nIters = 30;
+  std::vector<double> times(nIters);
+
+  /* ===== multiply in one shot
+  */

-  cudaStream_t stream;
-  CUDA_RUNTIME(cudaStreamCreate(&stream));

   // do spmv
   dim3 dimBlock(256);
   dim3 dimGrid(100);
-  for (int i = 0; i < 10; ++i) {
+  nvtxRangePush("one-shot");
+  for (int i = 0; i < nIters; ++i) {
+    MPI_Barrier(MPI_COMM_WORLD);
+    double start = MPI_Wtime();

     // distribute x to ranks
-    std::cerr << "Allgather xs\n";
-    MPI_Allgatherv(x.data(), x.size(), MPI_FLOAT, lx.data(), recvcounts.data(), displs.data(), MPI_FLOAT, MPI_COMM_WORLD);
+    MPI_Allgatherv(lx.data() + displs[rank], recvcounts[rank], MPI_FLOAT, lx.data(), recvcounts.data(), displs.data(), MPI_FLOAT, MPI_COMM_WORLD);

-    std::cerr << "Copy x to GPU\n";
-    lxd.set_from(lx, stream);
+    // copy x to GPU
+    lxd.set_from(lx, hiS);

-    std::cerr << "launch spmv\n";
-    spmv<<<dimGrid, dimBlock, 0, stream>>>(bd.view(), Ad.view(), lxd.view());
+    spmv<<<dimGrid, dimBlock, 0, hiS>>>(lbd.view(), Ad.view(), lxd.view(), ProductConfig::SET);
     CUDA_RUNTIME(cudaGetLastError());
-    CUDA_RUNTIME(cudaStreamSynchronize(stream));
-    std::cerr << "done spmv\n";
-
-    // copy product back to host
-    // b = bd;
-
-    // for (int r = 0; r < size; ++r) {
-    //   MPI_Barrier(MPI_COMM_WORLD);
-    //   if (rank == r) {
-    //     for (auto &e : b) {
-    //       std::cout << e << "\n";
-    //     }
-    //   }
-    // }
-
+    CUDA_RUNTIME(cudaStreamSynchronize(hiS));
+    times[i] = MPI_Wtime() - start;
   }
+  nvtxRangePop(); // one-shot
+  MPI_Allreduce(MPI_IN_PLACE, times.data(), times.size(), MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+  if (0 == rank) {
+    std::sort(times.begin(), times.end());
+    std::cerr << times[times.size() / 2] << "\n";
+  }
+
+
+  /* ===== split local and remote
+     multiply local, gather & multiply remote
+     TODO: the separate add launch can be removed if it is ensured
+     that the remote happens strictly after the local.
+     It's a small false serialization, but if we're in the case
+     where that matters, the launch overhead dominates anyway.
+  */
+  nvtxRangePush("local/remote");
+  for (int i = 0; i < nIters; ++i) {
+
+    MPI_Barrier(MPI_COMM_WORLD);
+    double start = MPI_Wtime();
+
+    // overlap MPI with CUDA kernel launch
+    MPI_Request req;
+    MPI_Iallgatherv(lx.data() + displs[rank], recvcounts[rank], MPI_FLOAT, lx.data(), recvcounts.data(), displs.data(), MPI_FLOAT, MPI_COMM_WORLD, &req);
+
+    spmv<<<dimGrid, dimBlock, 0, hiS>>>(lbd.view(), lAd.view(), lxd.view(), ProductConfig::SET);
+    CUDA_RUNTIME(cudaGetLastError());
+
+    MPI_Wait(&req, MPI_STATUS_IGNORE);
+
+    rxd.set_from(lx, loS);
+
+    // hiS blocks until transfer is done
+    CUDA_RUNTIME(cudaEventRecord(event, loS));
+    CUDA_RUNTIME(cudaStreamWaitEvent(hiS, event, 0));
+
+    spmv<<<dimGrid, dimBlock, 0, hiS>>>(rbd.view(), rAd.view(), rxd.view(), ProductConfig::MODIFY);
+    CUDA_RUNTIME(cudaGetLastError());
+
+    // all is done when hiS is done
+    CUDA_RUNTIME(cudaStreamSynchronize(hiS));
+    times[i] = MPI_Wtime() - start;
+  }
+  nvtxRangePop(); // local/remote
+  MPI_Allreduce(MPI_IN_PLACE, times.data(), times.size(), MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+  if (0 == rank) {
+    std::sort(times.begin(), times.end());
+    std::cerr << times[times.size() / 2] << "\n";
+  }
+
+  // maybe better to atomic add into result than doing separate kernel launch?
+
+  CUDA_RUNTIME(cudaStreamDestroy(loS));
+  CUDA_RUNTIME(cudaStreamDestroy(hiS));

   MPI_Finalize();
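
The body of the allgather "plan" loop falls between hunks, so the diff never shows how `recvcounts` and `displs` are filled. A minimal sketch of what it presumably contains, derived from `get_partition` and the size check in `receive_x` (hypothetical, not part of the commit):

```
// each rank contributes its own partition of x, so the counts and
// offsets for MPI_Allgatherv come straight from get_partition
for (int i = 0; i < size; ++i) {
  Range r = get_partition(n, i, size); // entries of x owned by rank i
  recvcounts.push_back(r.ub - r.lb);   // how many entries rank i contributes
  displs.push_back(r.lb);              // where they land in the full x
}
```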
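
Note that the timed local/remote loop accumulates the remote product into `rbd`, so `lbd` and `rbd` are never combined in the shown hunks; the "separate add launch" the TODO refers to would presumably combine them with the `add` kernel, along these lines (hypothetical placement, after both spmv launches):

```
// lbd += rbd, on the same stream so it orders after both products
add<<<dimGrid, dimBlock, 0, hiS>>>(lbd.view(), rbd.view());
CUDA_RUNTIME(cudaGetLastError());
```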
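
On the closing "atomic add" comment: if both products accumulated into a single zeroed `b` with `atomicAdd`, the local and remote kernels could run concurrently on `loS`/`hiS` with no event ordering between them, at the cost of atomic traffic. A sketch under that assumption; `spmv_atomic` is hypothetical, and its inner loop is a guess at what the elided `spmv` body does with the `View` members:

```
// hypothetical: accumulate the product directly into b with atomics so
// concurrent local/remote launches cannot race on b(r)
__global__ void spmv_atomic(ArrayView<float> b,
                            const CsrMat<Where::device>::View A,
                            const ArrayView<float> x) {
  // one thread per row, same launch shape as spmv
  for (int r = blockDim.x * blockIdx.x + threadIdx.x; r < A.num_rows(); r += blockDim.x * gridDim.x) {
    float acc = 0;
    for (int ci = A.rowPtr_(r); ci < A.rowPtr_(r + 1); ++ci) {
      acc += A.val_(ci) * x(A.colInd_(ci));
    }
    atomicAdd(&b(r), acc); // requires b zeroed before the first product
  }
}
```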