From a7c755d8995a9f06e7d98a0baddbe97ace15daf7 Mon Sep 17 00:00:00 2001 From: Carl William Pearson Date: Fri, 11 Jun 2021 14:43:16 -0600 Subject: [PATCH] provisional distributed spmv --- CMakeLists.txt | 6 +-- README.md | 16 +++++++ at.hpp | 5 +++ csr_mat.hpp | 76 ++++++++++++++++++++++++++++++++ overlap.cu | 108 +++++----------------------------------------- row_part_spmv.cuh | 88 ++++++++++++++++++++++++++++++------- split_coo_mat.hpp | 1 + 7 files changed, 184 insertions(+), 116 deletions(-) create mode 100644 at.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 5f58d47..6a97724 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -87,7 +87,7 @@ endfunction() if (MPI_FOUND) add_executable(main main.cu) - target_include_directories(main PRIVATE SYSTEM ${MPI_CXX_INCLUDE_DIRS}) + target_include_directories(main SYSTEM PRIVATE ${MPI_CXX_INCLUDE_DIRS}) target_link_libraries(main ${MPI_CXX_LIBRARIES}) target_link_libraries(main CUDA::nvToolsExt) # target_include_directories(main PRIVATE ${MPI_CXX_INCLUDE_PATH}) @@ -99,10 +99,10 @@ endif() if (MPI_FOUND) add_executable(overlap overlap.cu) - target_include_directories(overlap PRIVATE SYSTEM ${MPI_CXX_INCLUDE_DIRS}) + target_include_directories(overlap SYSTEM PRIVATE ${MPI_CXX_INCLUDE_DIRS}) target_link_libraries(overlap ${MPI_CXX_LIBRARIES}) target_link_libraries(overlap CUDA::nvToolsExt) - set_cxx_options(overlap) + set_cuda_options(overlap) set_cxx_standard(overlap) endif() diff --git a/README.md b/README.md index 4061b2d..838106c 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,22 @@ module load cde/v2/cmake/3.19.2 mpirun -n 2 ~/software/nsight-systems-cli/2021.2.1/bin/nsys profile -c cudaProfilerApi -t cuda,mpi,nvtx -o dist-spmv_%q{OMPI_COMM_WORLD_RANK} -f true ./main ``` +### To build with OpenMPI 4.1.1 + +``` +module purge +module load sems-env +module load sems-cmake/3.19.1 +module load sems-gcc/7.2.0 +module load sems-cuda/10.1 +cmake .. 
-DCMAKE_PREFIX_PATH=~cwpears/software/openmpi-4.1.1-cuda10.1-gcc7.2 +``` + +``` +~cwpears/software/openmpi-4.1.1-cuda10.1-gcc7.2/bin/mpirun -n 2 ./overlap + +``` + ## Design Considerations Minimize CUDA runtime calls \ No newline at end of file diff --git a/at.hpp b/at.hpp new file mode 100644 index 0000000..f8afa66 --- /dev/null +++ b/at.hpp @@ -0,0 +1,5 @@ +#pragma once + +#define STRINGIFY(x) #x +#define TOSTRING(x) STRINGIFY(x) +#define AT __FILE__ ":" TOSTRING(__LINE__) \ No newline at end of file diff --git a/csr_mat.hpp b/csr_mat.hpp index 1b2ddf1..c642abe 100644 --- a/csr_mat.hpp +++ b/csr_mat.hpp @@ -6,6 +6,8 @@ #include "coo_mat.hpp" #include "algorithm.hpp" +#include + template class CsrMat { public: @@ -161,6 +163,16 @@ public: CsrMat(CsrMat &&other) = delete; CsrMat(const CsrMat &other) = delete; + CsrMat &operator=(CsrMat &&rhs) { + if (this != &rhs) { + rowPtr_ = std::move(rhs.rowPtr_); + colInd_ = std::move(rhs.colInd_); + val_ = std::move(rhs.val_); + numCols_ = std::move(rhs.numCols_); + } + return *this; + } + // create device matrix from host CsrMat(const CsrMat &m) : rowPtr_(m.rowPtr_), colInd_(m.colInd_), val_(m.val_), numCols_(m.numCols_) { @@ -193,4 +205,68 @@ public: return v; } +}; + + +// mxn random matrix with nnz +CsrMat random_matrix(const int64_t m, const int64_t n, const int64_t nnz) { + + if (m * n < nnz) { + throw std::logic_error(AT); + } + + CooMat coo(m,n); + while(coo.nnz() < nnz) { + + int64_t toPush = nnz - coo.nnz(); + std::cerr << "adding " << toPush << " non-zeros\n"; + for (int64_t _ = 0; _ < toPush; ++_) { + int r = rand() % m; + int c = rand() % n; + float e = 1.0; + coo.push_back(r, c, e); + } + std::cerr << "removing duplicate non-zeros\n"; + coo.remove_duplicates(); + } + coo.sort(); + std::cerr << "coo: " << coo.num_rows() << "x" << coo.num_cols() << "\n"; + CsrMat csr(coo); + std::cerr << "csr: " << csr.num_rows() << "x" << csr.num_cols() << " w/ " << csr.nnz() << "\n"; + return csr; +}; + +// nxn diagonal matrix with bandwidth b +CsrMat random_band_matrix(const int64_t n, const int64_t bw, const int64_t nnz) { + + CooMat coo(n,n); + while(coo.nnz() < nnz) { + + int64_t toPush = nnz - coo.nnz(); + std::cerr << "adding " << toPush << " non-zeros\n"; + for (int64_t _ = 0; _ < toPush; ++_) { + int r = rand() % n; // random row + + // column in the band + int lb = r - bw; + int ub = r + bw + 1; + int64_t c = rand() % (ub - lb) + lb; + if (c < 0 || c >= n) { + // retry, don't over-weight first or last column + continue; + } + float e = 1.0; + + assert(c < n); + assert(r < n); + coo.push_back(r, c, e); + } + std::cerr << "removing duplicate non-zeros\n"; + coo.remove_duplicates(); + } + coo.sort(); + std::cerr << "coo: " << coo.num_rows() << "x" << coo.num_cols() << "\n"; + CsrMat csr(coo); + std::cerr << "csr: " << csr.num_rows() << "x" << csr.num_cols() << " w/ " << csr.nnz() << "\n"; + return csr; }; \ No newline at end of file diff --git a/overlap.cu b/overlap.cu index 154f5af..4baad28 100644 --- a/overlap.cu +++ b/overlap.cu @@ -10,91 +10,13 @@ #include #include +//#define VIEW_CHECK_BOUNDS + +#include "at.hpp" #include "cuda_runtime.hpp" #include "csr_mat.hpp" #include "row_part_spmv.cuh" -#define STRINGIFY(x) #x -#define TOSTRING(x) STRINGIFY(x) -#define AT __FILE__ ":" TOSTRING(__LINE__) - -//#define VIEW_CHECK_BOUNDS - - -// mxn random matrix with nnz -CsrMat random_matrix(const int64_t m, const int64_t n, const int64_t nnz) { - - if (m * n < nnz) { - throw std::logic_error(AT); - } - - CooMat coo(m,n); - while(coo.nnz() < nnz) { - - 
int64_t toPush = nnz - coo.nnz(); - std::cerr << "adding " << toPush << " non-zeros\n"; - for (int64_t _ = 0; _ < toPush; ++_) { - int r = rand() % m; - int c = rand() % n; - float e = 1.0; - coo.push_back(r, c, e); - } - std::cerr << "removing duplicate non-zeros\n"; - coo.remove_duplicates(); - } - coo.sort(); - std::cerr << "coo: " << coo.num_rows() << "x" << coo.num_cols() << "\n"; - CsrMat csr(coo); - std::cerr << "csr: " << csr.num_rows() << "x" << csr.num_cols() << " w/ " << csr.nnz() << "\n"; - return csr; -}; - -// nxn diagonal matrix with bandwidth b -CsrMat random_band_matrix(const int64_t n, const int64_t bw, const int64_t nnz) { - - CooMat coo(n,n); - while(coo.nnz() < nnz) { - - int64_t toPush = nnz - coo.nnz(); - std::cerr << "adding " << toPush << " non-zeros\n"; - for (int64_t _ = 0; _ < toPush; ++_) { - int r = rand() % n; // random row - - // column in the band - int lb = r - bw; - int ub = r + bw + 1; - int64_t c = rand() % (ub - lb) + lb; - if (c < 0 || c > n) { - continue; // don't over-weight first or last column - } - - float e = 1.0; - coo.push_back(r, c, e); - } - std::cerr << "removing duplicate non-zeros\n"; - coo.remove_duplicates(); - } - coo.sort(); - std::cerr << "coo: " << coo.num_rows() << "x" << coo.num_cols() << "\n"; - CsrMat csr(coo); - std::cerr << "csr: " << csr.num_rows() << "x" << csr.num_cols() << " w/ " << csr.nnz() << "\n"; - return csr; -}; - -std::vector random_vector(const int64_t n) { - return std::vector(n, 1.0); -} - -Array random_array(const int64_t n) { - return Array(n, 1.0); -} - -#if 0 -int send_x(int dst, int src, std::vector &&v, MPI_Comm comm) { - MPI_Send(v.data(), v.size(), MPI_FLOAT, dst, Tag::x, comm); - return 0; -} -#endif /* recv some amount of data, and put it in the right place in a full x @@ -152,10 +74,10 @@ int main (int argc, char **argv) { // int64_t n = 150000; // int64_t nnz = 11000000; // or - int64_t m = 150000; + int64_t m = 15000; int64_t n = m; int64_t bw = m/size; // ~50% local vs remote non-zeros for most ranks - int64_t nnz = 11000000; + int64_t nnz = 1100000; CsrMat A; // "local A" @@ -168,29 +90,21 @@ int main (int argc, char **argv) { RowPartSpmv spmv(A, 0, MPI_COMM_WORLD); - - std::cerr << "A: " << A.num_rows() << "x" << A.num_cols() << " w/ " << A.nnz() << "\n"; + if (0 == rank) { + std::cerr << "A: " << A.num_rows() << "x" << A.num_cols() << " w/ " << A.nnz() << "\n"; + } std::cerr << "local A: " << spmv.lA().num_rows() << "x" << spmv.lA().num_cols() << " w/ " << spmv.lA().nnz() << "\n"; std::cerr << "remote A: " << spmv.rA().num_rows() << "x" << spmv.rA().num_cols() << " w/ " << spmv.rA().nnz() << "\n"; - - int loPrio, hiPrio; - CUDA_RUNTIME(cudaDeviceGetStreamPriorityRange (&loPrio, &hiPrio)); - - cudaStream_t loS, hiS; // "lo/hi prio" - CUDA_RUNTIME(cudaStreamCreateWithPriority(&loS, cudaStreamNonBlocking, hiPrio)); - CUDA_RUNTIME(cudaStreamCreateWithPriority(&hiS, cudaStreamNonBlocking, hiPrio)); - - cudaEvent_t event; - CUDA_RUNTIME(cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); - - const int nIters = 30; + const int nIters = 1; std::vector times(nIters); nvtxRangePush("overlap"); for (int i = 0; i < nIters; ++i) { MPI_Barrier(MPI_COMM_WORLD); double start = MPI_Wtime(); + spmv.pack_x_async(); + spmv.pack_x_wait(); spmv.send_x_async(); spmv.launch_local(); spmv.recv_x_async(); diff --git a/row_part_spmv.cuh b/row_part_spmv.cuh index 614a6fb..1875e8b 100644 --- a/row_part_spmv.cuh +++ b/row_part_spmv.cuh @@ -173,6 +173,7 @@ public: } void pack_x_async() { + assert(xSendBuf_.size() == 
xSendIdx_.size()); scatter<<<100,128, 0, packStream_>>>(xSendBuf_.view(), lx_.view(), xSendIdx_.view()); } @@ -182,9 +183,12 @@ public: void send_x_async() { + std::cerr << "send_x_async(): send to " << sendParams_.size() << " ranks\n"; + // send to neighbors who want it for (auto &p : sendParams_) { int tag = 0; + assert(xSendBuf_.size() >= p.displ + p.count); MPI_Isend(xSendBuf_.data() + p.displ, p.count, MPI_FLOAT, p.dst, tag, comm_, &p.req); } } @@ -209,8 +213,10 @@ public: CUDA_RUNTIME(cudaStreamSynchronize(kernelStream_)); } - void launch_local_spmv() {} - void launch_remote_spmv() {} + ~RowPartSpmv() { + CUDA_RUNTIME(cudaStreamDestroy(kernelStream_)); kernelStream_ = 0; + CUDA_RUNTIME(cudaStreamDestroy(packStream_)); packStream_ = 0; + } /* create from a matrix at root */ @@ -218,11 +224,14 @@ public: const CsrMat &wholeA, const int root, MPI_Comm comm - ) { + ) : comm_(comm) { + + CUDA_RUNTIME(cudaStreamCreate(&kernelStream_)); + CUDA_RUNTIME(cudaStreamCreate(&packStream_)); int rank, size; - MPI_Comm_rank(comm, &rank); - MPI_Comm_size(comm, &size); + MPI_Comm_rank(comm_, &rank); + MPI_Comm_size(comm_, &size); CsrMat a; if (root == rank) { @@ -231,27 +240,34 @@ public: for (size_t dst = 0; dst < size; ++dst) { if (root != dst) { std::cerr << "send A to " << dst << "\n"; - send_matrix(dst, 0, std::move(as[dst]), MPI_COMM_WORLD); + send_matrix(dst, 0, std::move(as[dst]), comm_); } } a = as[rank]; } else { std::cerr << "recv A at " << rank << "\n"; - a = receive_matrix(rank, 0, MPI_COMM_WORLD); + a = receive_matrix(rank, 0, comm_); } // split row part of a into local and global SplitCooMat scm = split_local_remote(a, comm); + la_ = std::move(scm.local); + ra_ = std::move(scm.remote); + assert(la_.nnz() + ra_.nnz() == a.nnz() && "lost a non-zero during split"); loff_ = scm.loff; // create local part of x array // undefined entries Range xrange = get_partition(a.num_cols(), rank, size); lx_ = Array(xrange.extent()); + ly_ = Array(la_.num_rows()); // create remote part of x array // one entry per remote column rx_ = Array(scm.globals.size()); + if (0 == rx_.size()) { + std::cerr << "WARN: not receiving anything\n"; + } // determine which columns needed from others std::map> recvCols; @@ -261,6 +277,24 @@ public: recvCols[src].push_back(c); } +#if 1 + for (int r = 0; r < size; ++r) { + MPI_Barrier(comm_); + if (r == rank) { + std::cerr << "rank " << rank << "recvCols:\n"; + for (auto it = recvCols.begin(); it != recvCols.end(); ++it) { + std::cerr << "from " << it->first << ": "; + for (auto &c : it->second) { + std::cerr << c << " "; + } + std::cerr << "\n"; + } + } + MPI_Barrier(comm_); + } +#endif + + // create receive parameters int offset = 0; for (auto it = recvCols.begin(); it != recvCols.end(); ++it) { @@ -272,15 +306,36 @@ public: recvParams_.push_back(param); } + +#if 1 + for (int r = 0; r < size; ++r) { + MPI_Barrier(comm_); + if (r == rank) { + std::cerr << "rank " << rank << " recvParams:\n"; + for (RecvParam &p : recvParams_) { + std::cerr + << "src=" << p.src + << " displ=" << p.displ + << " count=" << p.count + << "\n"; + } + } + MPI_Barrier(comm_); + } +#endif + + + // tell others which cols I need (send 0 if nothing) std::vector reqs(size); for (int dest = 0; dest < size; ++dest) { auto it = recvCols.find(dest); if (it != recvCols.end()) { - MPI_Isend(it->second.data(), it->second.size(), MPI_INT, dest, 0, comm, &reqs[dest]); + assert(it->second.data()); + MPI_Isend(it->second.data(), it->second.size(), MPI_INT, dest, 0, comm_, &reqs[dest]); } else { int _; - 
MPI_Isend(&_, 0, MPI_INT, dest, 0, comm, &reqs[dest]); + MPI_Isend(&_ /*junk*/, 0, MPI_INT, dest, 0, comm_, &reqs[dest]); } } @@ -293,10 +348,10 @@ public: MPI_Get_count(&status, MPI_INT, &count); if (count != 0) { sendCols[src].resize(count); - MPI_Recv(sendCols[src].data(), count, MPI_INT, src, 0, comm, MPI_STATUS_IGNORE); + MPI_Recv(sendCols[src].data(), count, MPI_INT, src, 0, comm_, MPI_STATUS_IGNORE); } else { int _; - MPI_Recv(&_, 0, MPI_INT, src, 0, comm, MPI_STATUS_IGNORE); + MPI_Recv(&_, 0, MPI_INT, src, 0, comm_, MPI_STATUS_IGNORE); } } @@ -310,19 +365,20 @@ public: param.dst = it->first; for (int gc : it->second) { int lc = gc - scm.loff; + assert(lc >= 0); + assert(lc < lx_.size()); offsets.push_back(lc); } param.count = offsets.size() - param.displ; sendParams_.push_back(param); } + // device version of offsets for packing + xSendIdx_ = offsets; + // buffer that x values will be placed into for sending + xSendBuf_.resize(xSendIdx_.size()); - assert(la_.size() > 0); - assert(ra_.size() > 0); // remote A assert(lx_.size() > 0); - assert(rx_.size() > 0); assert(ly_.size() > 0); - - } }; \ No newline at end of file diff --git a/split_coo_mat.hpp b/split_coo_mat.hpp index b5d97b1..20275ea 100644 --- a/split_coo_mat.hpp +++ b/split_coo_mat.hpp @@ -45,6 +45,7 @@ SplitCooMat split_local_remote(const CsrMat &m, MPI_Comm comm) { // which rows of x are local Range localRange = get_partition(m.num_cols(), rank, size); + std::cerr << "[" << localRange.lb <<","<< localRange.ub << ")\n"; int loff = localRange.lb; // build two matrices, local gets local non-zeros, remote gets remote non-zeros
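
---

Reference sketch (not part of the patch): `pack_x_async()` in `row_part_spmv.cuh` launches a `scatter` kernel on `packStream_` to gather the x entries listed in `xSendIdx_` into the contiguous `xSendBuf_`, so that `send_x_async()` can issue a single `MPI_Isend` per neighbor from one slice of that buffer. The kernel body is not shown in this diff; the standalone example below is a minimal sketch of that packing step under the assumption that it amounts to `buf[i] = x[idx[i]]`, using plain device pointers instead of the repo's `Array`/view types. The kernel name `pack_x` and the toy sizes are illustrative, not taken from the source.

```
#include <cstdio>
#include <cuda_runtime.h>

// Pack selected entries of the local x vector into a contiguous send buffer:
// buf[i] = x[idx[i]] for i in [0, n). A neighbor's values then occupy one
// contiguous slice of buf, matching the displ/count pairs in sendParams_.
__global__ void pack_x(float *buf, const float *x, const int *idx, int n) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += gridDim.x * blockDim.x) {
    buf[i] = x[idx[i]];
  }
}

int main() {
  const int nLocal = 8, nSend = 3;
  float hX[nLocal] = {0, 1, 2, 3, 4, 5, 6, 7};
  int hIdx[nSend] = {1, 4, 6};  // local x entries some neighbor rank needs
  float hBuf[nSend] = {};

  float *dX, *dBuf;
  int *dIdx;
  cudaMalloc(&dX, nLocal * sizeof(float));
  cudaMalloc(&dBuf, nSend * sizeof(float));
  cudaMalloc(&dIdx, nSend * sizeof(int));
  cudaMemcpy(dX, hX, nLocal * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(dIdx, hIdx, nSend * sizeof(int), cudaMemcpyHostToDevice);

  // the patch launches the equivalent kernel on packStream_
  pack_x<<<1, 128>>>(dBuf, dX, dIdx, nSend);
  cudaMemcpy(hBuf, dBuf, nSend * sizeof(float), cudaMemcpyDeviceToHost);

  for (int i = 0; i < nSend; ++i) {
    printf("buf[%d] = %f\n", i, hBuf[i]);  // expect 1, 4, 6
  }

  cudaFree(dX);
  cudaFree(dBuf);
  cudaFree(dIdx);
  return 0;
}
```

In the patch, `pack_x_wait()` synchronizes `packStream_` before `send_x_async()` reads `xSendBuf_`, which is why the main loop in `overlap.cu` calls `pack_x_async()`, `pack_x_wait()`, then `send_x_async()` in that order.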