dist-spmv/row_part_spmv.cuh
Carl William Pearson, fb88da915d "initial local+remote spmv", 2021-06-11

#pragma once

#include <mpi.h>

#include <cassert>
#include <iostream>
#include <map>
#include <vector>

#include "csr_mat.hpp"
#include "partition.hpp"
#include "split_coo_mat.hpp"

enum class ProductConfig {
  MODIFY, // b +=
  SET     // b =
};
/* Ax=b
 */
__global__ void spmv(ArrayView<float> b,
                     const CsrMat<Where::device>::View A,
                     const ArrayView<float> x,
                     const ProductConfig pc) {
  // one thread per row
  for (int r = blockDim.x * blockIdx.x + threadIdx.x; r < A.num_rows(); r += blockDim.x * gridDim.x) {
    float acc = 0;
    for (int ci = A.row_ptr(r); ci < A.row_ptr(r + 1); ++ci) {
      int c = A.col_ind(ci);
      acc += A.val(ci) * x(c);
    }
    if (ProductConfig::SET == pc) {
      b(r) = acc;
    } else {
      b(r) += acc;
    }
  }
}
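
/* Example launch (host side): a fixed, modest grid is sufficient because the
   kernel strides over rows. For a device matrix A and device arrays x and y
   (illustrative names; launch_local() below does the same thing on a stream):

     spmv<<<100, 128>>>(y.view(), A.view(), x.view(), ProductConfig::SET);
     CUDA_RUNTIME(cudaGetLastError());
*/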
enum Tag : int {
  row_ptr,
  col_ind,
  val,
  x,
  num_cols
};
int send_matrix(int dst, int src, CsrMat<Where::host> &&m, MPI_Comm comm) {
  MPI_Request reqs[4];
  int numCols = m.num_cols();
  MPI_Isend(&numCols, 1, MPI_INT, dst, Tag::num_cols, comm, &reqs[0]);
  MPI_Isend(m.row_ptr(), m.num_rows() + 1, MPI_INT, dst, Tag::row_ptr, comm, &reqs[1]);
  MPI_Isend(m.col_ind(), m.nnz(), MPI_INT, dst, Tag::col_ind, comm, &reqs[2]);
  MPI_Isend(m.val(), m.nnz(), MPI_FLOAT, dst, Tag::val, comm, &reqs[3]);
  MPI_Waitall(4, reqs, MPI_STATUSES_IGNORE);
  return 0;
}
CsrMat<Where::host> receive_matrix(int dst, int src, MPI_Comm comm) {
  int numCols;
  MPI_Recv(&numCols, 1, MPI_INT, src, Tag::num_cols, comm, MPI_STATUS_IGNORE);
  // probe the row_ptr message to learn the number of rows
  MPI_Status stat;
  MPI_Probe(src, Tag::row_ptr, comm, &stat);
  int numRows;
  MPI_Get_count(&stat, MPI_INT, &numRows);
  if (numRows > 0) {
    --numRows; // row_ptr has num_rows + 1 entries
  }
  // probe the col_ind message to learn nnz
  MPI_Probe(src, Tag::col_ind, comm, &stat);
  int nnz;
  MPI_Get_count(&stat, MPI_INT, &nnz);
  std::cerr << "recv " << numRows << "x" << numCols << " w/ " << nnz << " nnz\n";
  CsrMat<Where::host> csr(numRows, numCols, nnz);
  // receive the actual data into the matrix
  MPI_Recv(csr.row_ptr(), numRows + 1, MPI_INT, src, Tag::row_ptr, comm, MPI_STATUS_IGNORE);
  MPI_Recv(csr.col_ind(), nnz, MPI_INT, src, Tag::col_ind, comm, MPI_STATUS_IGNORE);
  MPI_Recv(csr.val(), nnz, MPI_FLOAT, src, Tag::val, comm, MPI_STATUS_IGNORE);
  return csr;
}
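
/* Sketch of how send_matrix/receive_matrix pair up when distributing row
   partitions from the root (the RowPartSpmv constructor below does exactly
   this; `parts` is an illustrative name for the per-rank partitions):

     if (rank == root) {
       for (int dst = 0; dst < size; ++dst) {
         if (dst != root) send_matrix(dst, root, std::move(parts[dst]), comm);
       }
     } else {
       CsrMat<Where::host> mine = receive_matrix(rank, root, comm);
     }
*/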
// out[i] = in[idx[i]] -- one thread per output entry (assumes ArrayView exposes size())
__global__ void scatter(ArrayView<float> out,
                        ArrayView<float> in,
                        ArrayView<int> idx) {
  for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < out.size(); i += blockDim.x * gridDim.x) {
    out(i) = in(idx(i));
  }
}
/* Ax=y, partitioned evenly by rows of A.

   We always have to pack on the send side, since a neighbor only needs some of
   the local x values, so we may as well pack them in an order the receiver can
   use directly without unpacking.

   Serializing the local and remote products means there is no concern about
   concurrent adds into the product vector. If the kernels are large there is no
   real opportunity for them to overlap anyway; if they are small, communication
   time dominates regardless. (See the usage sketch at the end of this file.)
*/
class RowPartSpmv {
private:
  MPI_Comm comm_;
  int loff_;                       // first local row, in global indices
  CsrMat<Where::device> la_;       // local A
  CsrMat<Where::device> ra_;       // remote A
  Array<Where::device, float> lx_; // local x
  Array<Where::device, float> rx_; // remote x
  Array<Where::device, float> ly_; // local y (the product)

  // info for sending x
  struct SendParam {
    int dst;   // destination rank
    int displ; // displacement in xSendBuf_
    int count; // number of entries
    MPI_Request req;
  };
  std::vector<SendParam> sendParams_;    // send parameters for each rank
  Array<Where::device, int> xSendIdx_;   // which entry of lx_ goes in each slot of xSendBuf_
  Array<Where::device, float> xSendBuf_; // packed local x entries to send to other ranks

  std::vector<int> gCols_;                  // global index from local
  std::map<int, std::vector<int>> sendEntr; // which entries of x to send to each rank
  std::map<int, MPI_Request> sendReq;

  struct RecvParam {
    int src;   // source rank
    int displ; // displacement in rx_
    int count; // number of entries
    MPI_Request req;
  };
  std::vector<RecvParam> recvParams_;

  cudaStream_t kernelStream_;
  cudaStream_t packStream_;

public:
  const CsrMat<Where::device> &lA() const { return la_; }
  const CsrMat<Where::device> &rA() const { return ra_; }
  void launch_local() {
    dim3 dimGrid(100);
    dim3 dimBlock(128);
    spmv<<<dimGrid, dimBlock, 0, kernelStream_>>>(ly_.view(), la_.view(), lx_.view(), ProductConfig::SET);
    CUDA_RUNTIME(cudaGetLastError());
  }
  void launch_remote() {
    dim3 dimGrid(100);
    dim3 dimBlock(128);
    spmv<<<dimGrid, dimBlock, 0, kernelStream_>>>(ly_.view(), ra_.view(), rx_.view(), ProductConfig::MODIFY);
    CUDA_RUNTIME(cudaGetLastError());
  }
  void pack_x_async() {
    scatter<<<100, 128, 0, packStream_>>>(xSendBuf_.view(), lx_.view(), xSendIdx_.view());
  }
  void pack_x_wait() {
    CUDA_RUNTIME(cudaStreamSynchronize(packStream_));
  }
  void send_x_async() {
    // send to neighbors who want it
    for (auto &p : sendParams_) {
      int tag = 0;
      MPI_Isend(xSendBuf_.data() + p.displ, p.count, MPI_FLOAT, p.dst, tag, comm_, &p.req);
    }
  }
  void send_x_wait() {
    for (auto &p : sendParams_) {
      MPI_Wait(&p.req, MPI_STATUS_IGNORE);
    }
  }
  void recv_x_async() {
    for (auto &p : recvParams_) {
      int tag = 0;
      MPI_Irecv(rx_.data() + p.displ, p.count, MPI_FLOAT, p.src, tag, comm_, &p.req);
    }
  }
  void recv_x_wait() {
    for (auto &p : recvParams_) {
      MPI_Wait(&p.req, MPI_STATUS_IGNORE);
    }
  }
  void finish() {
    CUDA_RUNTIME(cudaStreamSynchronize(kernelStream_));
  }
  void launch_local_spmv() {}
  void launch_remote_spmv() {}
  /* create from a matrix at root
   */
  RowPartSpmv(const CsrMat<Where::host> &wholeA, const int root, MPI_Comm comm) : comm_(comm) {
    int rank, size;
    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);

    CUDA_RUNTIME(cudaStreamCreate(&kernelStream_));
    CUDA_RUNTIME(cudaStreamCreate(&packStream_));

    CsrMat<Where::host> a;
    if (root == rank) {
      std::cerr << "partition matrix\n";
      std::vector<CsrMat<Where::host>> as = part_by_rows(wholeA, size);
      for (int dst = 0; dst < size; ++dst) {
        if (root != dst) {
          std::cerr << "send A to " << dst << "\n";
          send_matrix(dst, root, std::move(as[dst]), comm);
        }
      }
      a = std::move(as[rank]);
    } else {
      std::cerr << "recv A at " << rank << "\n";
      a = receive_matrix(rank, root, comm);
    }

    // split the local row block of A into local and remote parts
    SplitCooMat scm = split_local_remote(a, comm);
    loff_ = scm.loff;

    // create local part of the x array (entries undefined)
    Range xrange = get_partition(a.num_cols(), rank, size);
    lx_ = Array<Where::device, float>(xrange.extent());
    // create remote part of the x array: one entry per remote column
    rx_ = Array<Where::device, float>(scm.globals.size());

    // determine which columns are needed from which other rank
    std::map<int, std::vector<int>> recvCols;
    for (int c : scm.globals) {
      auto src = get_owner(a.num_cols(), c, size);
      assert(rank != src && "should not need my own columns in remote part");
      recvCols[src].push_back(c);
    }

    // create receive parameters: one contiguous region of rx_ per source rank
    int offset = 0;
    for (auto it = recvCols.begin(); it != recvCols.end(); ++it) {
      RecvParam param;
      param.displ = offset;
      param.src = it->first;
      offset += it->second.size();
      param.count = offset - param.displ;
      recvParams_.push_back(param);
    }
    // tell others which cols I need (send a zero-count message if nothing)
    std::vector<MPI_Request> reqs(size);
    int empty = 0; // dummy buffer for zero-count messages
    for (int dest = 0; dest < size; ++dest) {
      auto it = recvCols.find(dest);
      if (it != recvCols.end()) {
        MPI_Isend(it->second.data(), it->second.size(), MPI_INT, dest, 0, comm, &reqs[dest]);
      } else {
        MPI_Isend(&empty, 0, MPI_INT, dest, 0, comm, &reqs[dest]);
      }
    }

    // learn which global x entries the other ranks need from me
    std::map<int, std::vector<int>> sendCols;
    for (int src = 0; src < size; ++src) {
      MPI_Status status;
      MPI_Probe(src, 0, comm, &status);
      int count;
      MPI_Get_count(&status, MPI_INT, &count);
      if (count != 0) {
        sendCols[src].resize(count);
        MPI_Recv(sendCols[src].data(), count, MPI_INT, src, 0, comm, MPI_STATUS_IGNORE);
      } else {
        MPI_Recv(&empty, 0, MPI_INT, src, 0, comm, MPI_STATUS_IGNORE);
      }
    }
    MPI_Waitall(size, reqs.data(), MPI_STATUSES_IGNORE); // complete the Isends above
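    /* Example with size == 2 and 4 columns: rank 0 owns columns 0-1, rank 1 owns
       columns 2-3. If rank 0's remote part references global column 3, rank 0
       sends {3} to rank 1; rank 1 then has sendCols[0] == {3}, packs its local x
       value for column 3, and rank 0 receives it into rx_. (Illustrative numbers
       only.) */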
    // create the offsets from lx that we will send out
    // TODO: should be device array
    std::vector<int> offsets;
    for (auto it = sendCols.begin(); it != sendCols.end(); ++it) {
      // TODO - adjust for changed local array columns
      SendParam param;
      param.displ = offsets.size();
      param.dst = it->first;
      for (int gc : it->second) {
        int lc = gc - scm.loff;
        offsets.push_back(lc);
      }
      param.count = offsets.size() - param.displ;
      sendParams_.push_back(param);
    }

    assert(la_.size() > 0);
    assert(ra_.size() > 0); // remote A
    assert(lx_.size() > 0);
    assert(rx_.size() > 0);
    assert(ly_.size() > 0);
  }
};
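
/* Sketch of the intended call order for one distributed y = Ax with this class
   (illustrative only; the driver lives outside this header, and `rps` names a
   RowPartSpmv instance):

     rps.pack_x_async();  // gather needed lx_ entries into xSendBuf_ on packStream_
     rps.pack_x_wait();
     rps.send_x_async();  // ship packed x values to neighbors
     rps.recv_x_async();  // receive remote x values into rx_
     rps.launch_local();  // ly_  = la_ * lx_ (does not need remote x)
     rps.recv_x_wait();
     rps.launch_remote(); // ly_ += ra_ * rx_, serialized after the local product
     rps.send_x_wait();
     rps.finish();        // synchronize kernelStream_
*/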