initial local+remote spmv

This commit is contained in:
Carl William Pearson
2021-06-11 13:09:50 -06:00
parent aae7176823
commit fb88da915d
10 changed files with 1151 additions and 0 deletions

CMakeLists.txt

@@ -97,3 +97,12 @@ if (MPI_FOUND)
set_cxx_standard(main)
endif()
if (MPI_FOUND)
  add_executable(overlap overlap.cu)
  target_include_directories(overlap SYSTEM PRIVATE ${MPI_CXX_INCLUDE_DIRS})
  target_link_libraries(overlap ${MPI_CXX_LIBRARIES})
  target_link_libraries(overlap CUDA::nvToolsExt)
  set_cxx_options(overlap)
  set_cxx_standard(overlap)
endif()

algorithm.hpp Normal file

@@ -0,0 +1,9 @@
#pragma once
#include <cstddef>
// shift the range [first, last) left by n positions:
// *(first - n) receives *first, and so on.
// The destination range [first - n, last - n) must be valid.
template<typename ForwardIt>
void shift_left(ForwardIt first, ForwardIt last, size_t n) {
    while (first != last) {
        *(first - n) = *first;
        ++first;
    }
}
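/* Hypothetical usage sketch (not part of this commit): erase the first two
   elements of a vector in place.
     std::vector<int> v{1, 2, 3, 4, 5};
     shift_left(v.begin() + 2, v.end(), 2); // v is now {3, 4, 5, 4, 5}
     v.resize(v.size() - 2);                // v == {3, 4, 5}
*/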

array.hpp Normal file

@@ -0,0 +1,178 @@
#pragma once
#include <vector>
#include "cuda_runtime.hpp"
#include "where.hpp"
template <Where where, typename T>
class Array;
// A non-owning view of data
template <typename T>
struct ArrayView
{
T *data_;
int64_t size_;
public:
ArrayView() : data_(nullptr), size_(0){}
ArrayView(const ArrayView &other) = default;
ArrayView(ArrayView &&other) = default;
ArrayView &operator=(const ArrayView &rhs) = default;
__host__ __device__ int64_t size() const { return size_; }
__host__ __device__ const T &operator()(int64_t i) const {
#ifdef VIEW_CHECK_BOUNDS
    if (i < 0) {
        printf("ERR: i < 0: %ld\n", (long)i);
    }
    if (i >= size_) {
        printf("ERR: i >= size_: %ld >= %ld\n", (long)i, (long)size_);
    }
#endif
    return data_[i];
}
__host__ __device__ T &operator()(int64_t i) {
return data_[i];
}
const T* data() const {
return data_;
}
T* data() {
return data_;
}
};
/* device array
*/
template<typename T> class Array<Where::device, T>
{
public:
// array owns the data in this view
ArrayView<T> view_;
public:
Array() = default;
Array(const size_t n) {
resize(n);
}
Array(const Array &other) = delete;
Array(Array &&other) : view_(other.view_) {
// view is non-owning, so have to clear other
other.view_.data_ = nullptr;
other.view_.size_ = 0;
}
Array &operator=(Array &&other) {
view_ = std::move(other.view_);
// view is non-owning, so have to clear other
other.view_.data_ = nullptr;
other.view_.size_ = 0;
return *this;
}
Array(const std::vector<T> &v) {
set_from(v);
}
~Array() {
CUDA_RUNTIME(cudaFree(view_.data_));
view_.data_ = nullptr;
view_.size_ = 0;
}
int64_t size() const {
return view_.size(); }
ArrayView<T> view() const {
return view_; // copy of internal view
}
operator std::vector<T>() const {
std::vector<T> v(size());
CUDA_RUNTIME(cudaMemcpy(v.data(), view_.data_, size() * sizeof(T), cudaMemcpyDeviceToHost));
return v;
}
void set_from(const std::vector<T> &rhs, cudaStream_t stream = 0) {
resize(rhs.size());
CUDA_RUNTIME(cudaMemcpyAsync(view_.data_, rhs.data(), view_.size_ * sizeof(T), cudaMemcpyHostToDevice, stream));
}
void set_from(const Array<Where::host, T> &rhs, cudaStream_t stream = 0) {
resize(rhs.size());
CUDA_RUNTIME(cudaMemcpyAsync(view_.data_, rhs.data(), view_.size_ * sizeof(T), cudaMemcpyHostToDevice, stream));
}
// any change destroys all data
void resize(size_t n) {
if (size() != n) {
view_.size_ = n;
CUDA_RUNTIME(cudaFree(view_.data_));
CUDA_RUNTIME(cudaMalloc(&view_.data_, view_.size_ * sizeof(T)));
}
}
const T* data() const {
return view_.data();
}
T* data() {
return view_.data();
}
};
/* host array
*/
template<typename T> class Array<Where::host, T>
{
public:
// array owns the data in this view
ArrayView<T> view_;
public:
Array() = default;
Array(const size_t n, const T &val) {
resize(n);
for (size_t i = 0; i < n; ++i) {
view_(i) = val;
}
}
Array(const Array &other) = delete;
Array(Array &&other) : view_(other.view_) {
// view is non-owning, so have to clear other
other.view_.data_ = nullptr;
other.view_.size_ = 0;
}
~Array() {
CUDA_RUNTIME(cudaFreeHost(view_.data_));
view_.data_ = nullptr;
view_.size_ = 0;
}
int64_t size() const {
return view_.size(); }
ArrayView<T> view() const {
return view_; // copy of internal view
}
// any change destroys all data
void resize(size_t n) {
if (size() != n) {
view_.size_ = n;
CUDA_RUNTIME(cudaFreeHost(view_.data_));
CUDA_RUNTIME(cudaHostAlloc(&view_.data_, view_.size_ * sizeof(T), cudaHostAllocDefault));
}
}
const T* data() const {
return view_.data_;
}
T* data() {
return view_.data_;
}
};
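/* Hypothetical round-trip sketch (not part of this commit):
     std::vector<float> h{1, 2, 3};
     Array<Where::device, float> d(h); // host -> device copy
     std::vector<float> back = d;      // device -> host copy
   back == h afterwards; both copies are ordered on the legacy default stream.
*/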

coo_mat.hpp Normal file

@@ -0,0 +1,58 @@
#pragma once
#include <algorithm>
#include <cstdint>
#include <vector>
class CooMat {
public:
struct Entry {
int i;
int j;
float e;
Entry(int _i, int _j, float _e) : i(_i), j(_j), e(_e) {}
static bool by_ij(const Entry &a, const Entry &b) {
if (a.i < b.i) {
return true;
} else if (a.i > b.i) {
return false;
} else {
return a.j < b.j;
}
}
static bool same_ij(const Entry &a, const Entry &b) {
return a.i == b.i && a.j == b.j;
}
};
private:
// sorted during construction
std::vector<Entry> data_;
int64_t numRows_;
int64_t numCols_;
public:
CooMat(int m, int n) : numRows_(m), numCols_(n) {}
const std::vector<Entry> &entries() const {return data_;}
void push_back(int i, int j, float e) {
data_.push_back(Entry(i,j,e));
}
void sort() {
std::sort(data_.begin(), data_.end(), Entry::by_ij);
}
void remove_duplicates() {
    std::sort(data_.begin(), data_.end(), Entry::by_ij);
    // std::unique only reorders; erase the leftover tail to actually drop duplicates
    data_.erase(std::unique(data_.begin(), data_.end(), Entry::same_ij), data_.end());
}
int64_t num_rows() const {return numRows_;}
int64_t num_cols() const {return numCols_;}
int64_t nnz() const {return data_.size();}
std::vector<Entry>::iterator begin() {return data_.begin();}
std::vector<Entry>::iterator end() {return data_.end();}
};
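/* Hypothetical sketch (not part of this commit):
     CooMat coo(2, 2);
     coo.push_back(0, 1, 1.0f);
     coo.push_back(0, 1, 2.0f); // duplicate coordinate (0,1)
     coo.push_back(1, 0, 3.0f);
     coo.remove_duplicates();   // keeps one of the (0,1) entries; nnz() == 2
*/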

csr_mat.hpp Normal file

@@ -0,0 +1,196 @@
#pragma once
#include <iostream>
#include <stdexcept>
#include <vector>
#include <cuda_runtime.h>
#include "algorithm.hpp"
#include "array.hpp"
#include "coo_mat.hpp"
template <Where where>
class CsrMat {
public:
CsrMat();
int64_t nnz() const;
int64_t num_rows() const;
};
template<> class CsrMat<Where::host>;
template<> class CsrMat<Where::device>;
/* host sparse matrix */
template<> class CsrMat<Where::host>
{
friend class CsrMat<Where::device>; // device can see inside
std::vector<int> rowPtr_;
std::vector<int> colInd_;
std::vector<float> val_;
int64_t numCols_;
public:
CsrMat() = default;
CsrMat(int numRows, int numCols, int nnz) : rowPtr_(numRows+1), colInd_(nnz), val_(nnz), numCols_(numCols) {}
CsrMat(const CooMat &coo) : numCols_(coo.num_cols()) {
for (auto &e : coo.entries()) {
while (rowPtr_.size() <= e.i) {
rowPtr_.push_back(colInd_.size());
}
colInd_.push_back(e.j);
val_.push_back(e.e);
}
while (rowPtr_.size() < coo.num_rows()+1){
rowPtr_.push_back(colInd_.size());
}
}
int64_t num_rows() const {
if (rowPtr_.size() <= 1) {
return 0;
} else {
return rowPtr_.size() - 1;
}
}
int64_t num_cols() const {
return numCols_;
}
int64_t nnz() const {
if (colInd_.size() != val_.size()) {
throw std::logic_error("bad invariant");
}
return colInd_.size();
}
const int &row_ptr(int64_t i) const {
return rowPtr_[i];
}
const int &col_ind(int64_t i) const {
return colInd_[i];
}
const float &val(int64_t i) const {
return val_[i];
}
const int *row_ptr() const {return rowPtr_.data(); }
int *row_ptr() {return rowPtr_.data(); }
const int *col_ind() const {return colInd_.data(); }
int *col_ind() {return colInd_.data(); }
const float *val() const {return val_.data(); }
float *val() {return val_.data(); }
/* keep rows [rowStart, rowEnd)
*/
void retain_rows(int rowStart, int rowEnd) {
if (0 == rowEnd) {
throw std::logic_error("unimplemented");
}
// erase rows after
// dont want to keep rowEnd, so rowEnd points to end of rowEnd-1
std::cerr << "rowPtr_ = rowPtr[:" << rowEnd+1 << "]\n";
rowPtr_.resize(rowEnd+1);
std::cerr << "resize entries to " << rowPtr_.back() << "\n";
colInd_.resize(rowPtr_.back());
val_.resize(rowPtr_.back());
// erase early row pointers
std::cerr << "rowPtr <<= " << rowStart << "\n";
shift_left(rowPtr_.begin()+rowStart, rowPtr_.end(), rowStart);
std::cerr << "resize rowPtr to " << rowEnd - rowStart+1 << "\n";
rowPtr_.resize(rowEnd-rowStart+1);
const int off = rowPtr_[0];
// erase entries for first rows
std::cerr << "entries <<= " << off << "\n";
shift_left(colInd_.begin()+off, colInd_.end(), off);
shift_left(val_.begin()+off, val_.end(), off);
// adjust row pointer offset
std::cerr << "subtract rowPtrs by " << off << "\n";
for (auto &e : rowPtr_) {
e -= off;
}
// resize entries
std::cerr << "resize entries to " << rowPtr_.back() << "\n";
colInd_.resize(rowPtr_.back());
val_.resize(rowPtr_.back());
}
};
/* device sparse matrix
*/
template<> class CsrMat<Where::device>
{
Array<Where::device, int> rowPtr_;
Array<Where::device, int> colInd_;
Array<Where::device, float> val_;
int64_t numCols_;
public:
struct View {
ArrayView<int> rowPtr_;
ArrayView<int> colInd_;
ArrayView<float> val_;
__device__ int num_rows() const {
if (rowPtr_.size() > 0) {
return rowPtr_.size() - 1;
} else {
return 0;
}
}
__device__ const int &row_ptr(int64_t i) const {
return rowPtr_(i);
}
__device__ const int &col_ind(int64_t i) const {
return colInd_(i);
}
__device__ const float &val(int64_t i) const {
return val_(i);
}
};
CsrMat() = default;
CsrMat(const CsrMat &other) = delete;
// movable (member Arrays null out the source), so a device matrix
// can be default-constructed and assigned later
CsrMat(CsrMat &&other) = default;
CsrMat &operator=(CsrMat &&rhs) = default;
// create device matrix from host
CsrMat(const CsrMat<Where::host> &m) :
rowPtr_(m.rowPtr_), colInd_(m.colInd_), val_(m.val_), numCols_(m.numCols_) {
if (colInd_.size() != val_.size()) {
throw std::logic_error("bad invariant");
}
}
~CsrMat() {
}
int64_t num_rows() const {
if (rowPtr_.size() <= 1) {
return 0;
} else {
return rowPtr_.size() - 1;
}
}
int64_t num_cols() const {
return numCols_;
}
int64_t nnz() const {
return colInd_.size();
}
View view() const {
View v;
v.rowPtr_ = rowPtr_.view();
v.colInd_ = colInd_.view();
v.val_ = val_.view();
return v;
}
};
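/* Hypothetical sketch (not part of this commit): CSR layout of the 2x3 matrix
     [1 0 2]
     [0 3 0]
   is rowPtr = {0, 2, 3}, colInd = {0, 2, 1}, val = {1, 2, 3};
   row r's entries occupy indices [rowPtr[r], rowPtr[r+1]).
*/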

overlap.cu Normal file

@@ -0,0 +1,213 @@
#include <mpi.h>
#include <nvToolsExt.h>
// #include <cuda_profiler_api.h>
#include <vector>
#include <string>
#include <stdexcept>
#include <algorithm>
#include <iostream>
#include <map>
#include "cuda_runtime.hpp"
#include "csr_mat.hpp"
#include "row_part_spmv.cuh"
#define STRINGIFY(x) #x
#define TOSTRING(x) STRINGIFY(x)
#define AT __FILE__ ":" TOSTRING(__LINE__)
//#define VIEW_CHECK_BOUNDS
// mxn random matrix with nnz
CsrMat<Where::host> random_matrix(const int64_t m, const int64_t n, const int64_t nnz) {
if (m * n < nnz) {
throw std::logic_error(AT);
}
CooMat coo(m,n);
while(coo.nnz() < nnz) {
int64_t toPush = nnz - coo.nnz();
std::cerr << "adding " << toPush << " non-zeros\n";
for (int64_t _ = 0; _ < toPush; ++_) {
int r = rand() % m;
int c = rand() % n;
float e = 1.0;
coo.push_back(r, c, e);
}
std::cerr << "removing duplicate non-zeros\n";
coo.remove_duplicates();
}
coo.sort();
std::cerr << "coo: " << coo.num_rows() << "x" << coo.num_cols() << "\n";
CsrMat<Where::host> csr(coo);
std::cerr << "csr: " << csr.num_rows() << "x" << csr.num_cols() << " w/ " << csr.nnz() << "\n";
return csr;
}
// nxn band matrix with bandwidth bw and nnz non-zeros
CsrMat<Where::host> random_band_matrix(const int64_t n, const int64_t bw, const int64_t nnz) {
CooMat coo(n,n);
while(coo.nnz() < nnz) {
int64_t toPush = nnz - coo.nnz();
std::cerr << "adding " << toPush << " non-zeros\n";
for (int64_t _ = 0; _ < toPush; ++_) {
int r = rand() % n; // random row
// column in the band
int lb = r - bw;
int ub = r + bw + 1;
int64_t c = rand() % (ub - lb) + lb;
if (c < 0 || c >= n) {
    continue; // don't over-weight the first or last columns
}
float e = 1.0;
coo.push_back(r, c, e);
}
std::cerr << "removing duplicate non-zeros\n";
coo.remove_duplicates();
}
coo.sort();
std::cerr << "coo: " << coo.num_rows() << "x" << coo.num_cols() << "\n";
CsrMat<Where::host> csr(coo);
std::cerr << "csr: " << csr.num_rows() << "x" << csr.num_cols() << " w/ " << csr.nnz() << "\n";
return csr;
}
std::vector<float> random_vector(const int64_t n) {
return std::vector<float>(n, 1.0);
}
Array<Where::host, float> random_array(const int64_t n) {
return Array<Where::host, float>(n, 1.0);
}
#if 0
int send_x(int dst, int src, std::vector<float> &&v, MPI_Comm comm) {
MPI_Send(v.data(), v.size(), MPI_FLOAT, dst, Tag::x, comm);
return 0;
}
#endif
/* recv some amount of data, and put it in the right place
in a full x
*/
std::vector<float> receive_x(const int n, const int dst, int src, MPI_Comm comm) {
int rank = 0;
int size = 1;
MPI_Comm_rank(comm, &rank);
MPI_Comm_size(comm, &size);
// which rows of x are local
Range local = get_partition(n, rank, size);
// probe for size
MPI_Status stat;
MPI_Probe(src, Tag::x, comm, &stat);
int sz;
MPI_Get_count(&stat, MPI_FLOAT, &sz);
if (sz != local.ub-local.lb) {
throw std::logic_error(AT);
}
std::cerr << "recv " << sz << " x entries into offset " << local.lb << "\n";
std::vector<float> x(n);
MPI_Recv(x.data() + local.lb, sz, MPI_FLOAT, src, Tag::x, comm, MPI_STATUS_IGNORE);
return x;
}
// z += a
__global__ void vector_add(ArrayView<float> z, const ArrayView<float> a) {
for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < z.size(); i += blockDim.x * gridDim.x) {
z(i) += a(i);
}
}
int main (int argc, char **argv) {
MPI_Init(&argc, &argv);
int rank = 0;
int size = 1;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
std::cerr << "get a gpu...\n";
CUDA_RUNTIME(cudaSetDevice(rank % 4));
CUDA_RUNTIME(cudaFree(0));
std::cerr << "barrier...\n";
MPI_Barrier(MPI_COMM_WORLD);
// int64_t m = 150000;
// int64_t n = 150000;
// int64_t nnz = 11000000;
// or
int64_t m = 150000;
int64_t n = m;
int64_t bw = m/size; // ~50% local vs remote non-zeros for most ranks
int64_t nnz = 11000000;
CsrMat<Where::host> A; // "local A"
// generate and distribute A
if (0 == rank) {
std::cerr << "generate matrix\n";
A = random_band_matrix(m, bw, nnz);
}
RowPartSpmv spmv(A, 0, MPI_COMM_WORLD);
std::cerr << "A: " << A.num_rows() << "x" << A.num_cols() << " w/ " << A.nnz() << "\n";
std::cerr << "local A: " << spmv.lA().num_rows() << "x" << spmv.lA().num_cols() << " w/ " << spmv.lA().nnz() << "\n";
std::cerr << "remote A: " << spmv.rA().num_rows() << "x" << spmv.rA().num_cols() << " w/ " << spmv.rA().nnz() << "\n";
int loPrio, hiPrio;
CUDA_RUNTIME(cudaDeviceGetStreamPriorityRange (&loPrio, &hiPrio));
cudaStream_t loS, hiS; // low- and high-priority streams
CUDA_RUNTIME(cudaStreamCreateWithPriority(&loS, cudaStreamNonBlocking, loPrio));
CUDA_RUNTIME(cudaStreamCreateWithPriority(&hiS, cudaStreamNonBlocking, hiPrio));
cudaEvent_t event;
CUDA_RUNTIME(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
const int nIters = 30;
std::vector<double> times(nIters);
nvtxRangePush("overlap");
for (int i = 0; i < nIters; ++i) {
MPI_Barrier(MPI_COMM_WORLD);
double start = MPI_Wtime();
spmv.pack_x_async(); // gather the needed local x entries into the send buffer
spmv.pack_x_wait();
spmv.send_x_async();
spmv.launch_local();
spmv.recv_x_async();
spmv.send_x_wait();
spmv.recv_x_wait();
spmv.launch_remote();
spmv.finish();
times[i] = MPI_Wtime() - start;
}
nvtxRangePop(); // overlap
MPI_Allreduce(MPI_IN_PLACE, times.data(), times.size(), MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
if (0 == rank) {
std::sort(times.begin(), times.end());
std::cerr << times[times.size() / 2] << "\n"; // median of per-iteration max-over-ranks times
}
MPI_Finalize();
return 0;
}

partition.hpp Normal file

@@ -0,0 +1,59 @@
#pragma once
#include <iostream>
#include <vector>
#include "csr_mat.hpp"
struct Range {
int lb;
int ub;
int extent() const { return ub-lb; }
};
/* get the ith part of splitting `domain` into `n` pieces;
   if not evenly divisible, the remainder is distributed one each to the lower-numbered pieces
*/
Range get_partition(const int domain, const int i, const int n) {
int div = domain / n;
int rem = domain % n;
int lb, ub;
if (i < rem) {
lb = i * (div+1);
ub = lb + (div+1);
} else {
lb = rem * (div+1) + (i-rem) * div;
ub = lb + div;
}
return Range{.lb=lb, .ub=ub};
}
// who owns item `i` from `domain` split into `n`
int get_owner(int domain, int i, const int n) {
int div = domain / n;
int rem = domain % n;
// i is in the first rem pieces, which have div+1 entries each
if (i < (div + 1) * rem) {
return i / (div + 1);
} else {
i -= (div+1) * rem;
domain -= (div+1) * rem;
return rem + i / div;
}
}
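/* Hypothetical worked example (not part of this commit): domain=10, n=3
   gives div=3, rem=1, so the pieces are [0,4), [4,7), [7,10):
     get_partition(10, 0, 3) -> {0, 4}
     get_partition(10, 2, 3) -> {7, 10}
     get_owner(10, 5, 3) -> 1   // 5 falls in [4,7)
*/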
std::vector<CsrMat<Where::host>> part_by_rows(const CsrMat<Where::host> &m, const int parts) {
std::vector<CsrMat<Where::host>> mats;
for (int p = 0; p < parts; ++p) {
Range range = get_partition(m.num_rows(), p, parts);
std::cerr << "matrix part " << p << " has " << range.ub-range.lb << " rows\n";
CsrMat<Where::host> part(m);
part.retain_rows(range.lb, range.ub);
mats.push_back(part);
}
return mats;
}

row_part_spmv.cuh Normal file

@@ -0,0 +1,328 @@
#pragma once
#include <cassert>
#include <map>
#include <vector>
#include <mpi.h>
#include "csr_mat.hpp"
#include "partition.hpp"
#include "split_coo_mat.hpp"
enum class ProductConfig {
MODIFY, // b +=
SET // b =
};
/* Ax=b
*/
__global__ void spmv(ArrayView<float> b,
const CsrMat<Where::device>::View A,
const ArrayView<float> x,
const ProductConfig pc
) {
// one thread per row
for (int r = blockDim.x * blockIdx.x + threadIdx.x; r < A.num_rows(); r += blockDim.x * gridDim.x) {
float acc = 0;
for (int ci = A.row_ptr(r); ci < A.row_ptr(r+1); ++ci) {
int c = A.col_ind(ci);
acc += A.val(ci) * x(c);
}
if (ProductConfig::SET == pc) {
b(r) = acc;
} else {
b(r) += acc;
}
}
}
enum Tag : int {
row_ptr,
col_ind,
val,
x,
num_cols
};
int send_matrix(int dst, int src, CsrMat<Where::host> &&m, MPI_Comm comm) {
MPI_Request reqs[4];
int numCols = m.num_cols();
MPI_Isend(&numCols, 1, MPI_INT, dst, Tag::num_cols, comm, &reqs[0]);
MPI_Isend(m.row_ptr(), m.num_rows()+1, MPI_INT, dst, Tag::row_ptr, comm, &reqs[1]);
MPI_Isend(m.col_ind(), m.nnz(), MPI_INT, dst, Tag::col_ind, comm, &reqs[2]);
MPI_Isend(m.val(), m.nnz(), MPI_FLOAT, dst, Tag::val, comm, &reqs[3]);
MPI_Waitall(4, reqs, MPI_STATUSES_IGNORE);
return 0;
}
CsrMat<Where::host> receive_matrix(int dst, int src, MPI_Comm comm) {
int numCols;
MPI_Recv(&numCols, 1, MPI_INT, src, Tag::num_cols, comm, MPI_STATUS_IGNORE);
// probe for number of rows
MPI_Status stat;
MPI_Probe(src, Tag::row_ptr, comm, &stat);
int numRows;
MPI_Get_count(&stat, MPI_INT, &numRows);
if (numRows > 0) {
--numRows;
}
// probe for nnz
MPI_Probe(src, Tag::col_ind, comm, &stat);
int nnz;
MPI_Get_count(&stat, MPI_INT, &nnz);
std::cerr << "recv " << numRows << "x" << numCols << " w/ " << nnz << "\n";
CsrMat<Where::host> csr(numRows, numCols, nnz);
// receive actual data into matrix
MPI_Recv(csr.row_ptr(), numRows+1, MPI_INT, src, Tag::row_ptr, comm, MPI_STATUS_IGNORE);
MPI_Recv(csr.col_ind(), nnz, MPI_INT, src, Tag::col_ind, comm, MPI_STATUS_IGNORE);
MPI_Recv(csr.val(), nnz, MPI_FLOAT, src, Tag::val, comm, MPI_STATUS_IGNORE);
return csr;
}
// out[i] = in[idx[i]]
__global__ void scatter(ArrayView<float> out,
                        const ArrayView<float> in,
                        const ArrayView<int> idx) {
    // grid-stride loop: one thread per packed entry
    for (int64_t i = blockDim.x * blockIdx.x + threadIdx.x; i < out.size(); i += blockDim.x * gridDim.x) {
        out(i) = in(idx(i));
    }
}
/* Ax=y , partitioned evenly by rows of A
always have to pack on the send side, since not all local x values will be needed
so may as well pack in a way that recver doesn't have to unpack
serializing local and remote means you don't have to worry about
concurrent adds to the product vector
if kernels are sufficiently large, no real opportunity for these to overlap anyway
if they're small, communication time will be longer anway
*/
class RowPartSpmv {
private:
MPI_Comm comm_;
int loff_; // first row in global index
CsrMat<Where::device> la_; // local A
CsrMat<Where::device> ra_; // remote A
Array<Where::device, float> lx_; // local x
Array<Where::device, float> rx_; // remote x
Array<Where::device, float> ly_;
// info for sending x
struct SendParam {
int dst; // destination rank
int displ;
int count;
MPI_Request req;
};
std::vector<SendParam> sendParams_; // parameters for each rank
Array<Where::device, int> xSendIdx_; // which entry of lx_ will be in each xSendBuf_;
Array<Where::device, float> xSendBuf_; // send local x entries to other ranks
std::vector<int> gCols_; // global index from local
struct RecvParam {
int src; // source rank
int displ; // displacement in
int count; // number of entries
MPI_Request req;
};
std::vector<RecvParam> recvParams_;
cudaStream_t kernelStream_;
cudaStream_t packStream_;
public:
const CsrMat<Where::device> &lA() const {return la_;}
const CsrMat<Where::device> &rA() const {return ra_;}
void launch_local() {
dim3 dimGrid(100);
dim3 dimBlock(128);
spmv<<<dimGrid, dimBlock, 0, kernelStream_>>>(ly_.view(), la_.view(), lx_.view(), ProductConfig::SET);
CUDA_RUNTIME(cudaGetLastError());
}
void launch_remote() {
dim3 dimGrid(100);
dim3 dimBlock(128);
spmv<<<dimGrid, dimBlock, 0, kernelStream_>>>(ly_.view(), ra_.view(), rx_.view(), ProductConfig::MODIFY);
CUDA_RUNTIME(cudaGetLastError());
}
void pack_x_async() {
scatter<<<100,128, 0, packStream_>>>(xSendBuf_.view(), lx_.view(), xSendIdx_.view());
}
void pack_x_wait() {
CUDA_RUNTIME(cudaStreamSynchronize(packStream_));
}
void send_x_async() {
// send to neighbors who want it
for (auto &p : sendParams_) {
int tag = 0;
MPI_Isend(xSendBuf_.data() + p.displ, p.count, MPI_FLOAT, p.dst, tag, comm_, &p.req);
}
}
void send_x_wait() {
for (auto &p : sendParams_) {
MPI_Wait(&p.req, MPI_STATUS_IGNORE);
}
}
void recv_x_async() {
for (auto &p : recvParams_) {
int tag = 0;
MPI_Irecv(rx_.data() + p.displ, p.count, MPI_FLOAT, p.src, tag, comm_, &p.req);
}
}
void recv_x_wait() {
for (auto &p : recvParams_) {
MPI_Wait(&p.req, MPI_STATUS_IGNORE);
}
}
void finish() {
CUDA_RUNTIME(cudaStreamSynchronize(kernelStream_));
}
void launch_local_spmv() {}
void launch_remote_spmv() {}
/* create from a matrix at root
*/
RowPartSpmv(
const CsrMat<Where::host> &wholeA,
const int root,
MPI_Comm comm
) {
int rank, size;
MPI_Comm_rank(comm, &rank);
MPI_Comm_size(comm, &size);
comm_ = comm;
CsrMat<Where::host> a;
if (root == rank) {
std::cerr << "partition matrix\n";
std::vector<CsrMat<Where::host>> as = part_by_rows(wholeA, size);
for (int dst = 0; dst < size; ++dst) {
    if (root != dst) {
        std::cerr << "send A to " << dst << "\n";
        send_matrix(dst, root, std::move(as[dst]), comm);
    }
}
a = as[rank];
} else {
std::cerr << "recv A at " << rank << "\n";
a = receive_matrix(rank, root, comm);
}
// split row part of a into local and global
SplitCooMat scm = split_local_remote(a, comm);
loff_ = scm.loff;
// create local part of x array
// undefined entries
Range xrange = get_partition(a.num_cols(), rank, size);
lx_ = Array<Where::device, float>(xrange.extent());
// create remote part of x array
// one entry per remote column
rx_ = Array<Where::device,float>(scm.globals.size());
// determine which columns needed from others
std::map<int, std::vector<int>> recvCols;
for (int c : scm.globals) {
auto src = get_owner(a.num_cols(), c, size);
assert(rank != src && "should not need my own columns in remote part");
recvCols[src].push_back(c);
}
// create receive parameters
int offset = 0;
for (auto it = recvCols.begin(); it != recvCols.end(); ++it) {
RecvParam param;
param.displ = offset;
param.src = it->first;
offset += it->second.size();
param.count = offset - param.displ;
recvParams_.push_back(param);
}
// tell others which cols I need (send 0 if nothing)
std::vector<MPI_Request> reqs(size);
for (int dest = 0; dest < size; ++dest) {
auto it = recvCols.find(dest);
if (it != recvCols.end()) {
MPI_Isend(it->second.data(), it->second.size(), MPI_INT, dest, 0, comm, &reqs[dest]);
} else {
int _;
MPI_Isend(&_, 0, MPI_INT, dest, 0, comm, &reqs[dest]);
}
}
// which global x rows other ranks need from me
std::map<int, std::vector<int>> sendCols;
for (int src = 0; src < size; ++src) {
MPI_Status status;
MPI_Probe(src, 0, comm, &status);
int count;
MPI_Get_count(&status, MPI_INT, &count);
if (count != 0) {
sendCols[src].resize(count);
MPI_Recv(sendCols[src].data(), count, MPI_INT, src, 0, comm, MPI_STATUS_IGNORE);
} else {
    int _;
    MPI_Recv(&_, 0, MPI_INT, src, 0, comm, MPI_STATUS_IGNORE);
}
}
// all column-request sends have now been matched; release their requests
MPI_Waitall(size, reqs.data(), MPI_STATUSES_IGNORE);
// create the offsets from lx that we will send out
// TODO: should be device array
std::vector<int> offsets;
for (auto it = sendCols.begin(); it != sendCols.end(); ++it) {
// TODO - adjust for changed local array columns
SendParam param;
param.displ = offsets.size();
param.dst = it->first;
for (int gc : it->second) {
int lc = gc - scm.loff;
offsets.push_back(lc);
}
param.count = offsets.size() - param.displ;
sendParams_.push_back(param);
}
// upload the gather indices and allocate the pack buffer
xSendIdx_.set_from(offsets);
xSendBuf_.resize(offsets.size());
// upload the local and remote matrices and allocate the local product vector
la_ = CsrMat<Where::device>(scm.local);
ra_ = CsrMat<Where::device>(scm.remote);
ly_ = Array<Where::device, float>(scm.local.num_rows());
CUDA_RUNTIME(cudaStreamCreate(&kernelStream_));
CUDA_RUNTIME(cudaStreamCreate(&packStream_));
assert(la_.nnz() > 0);
assert(ra_.nnz() > 0); // remote A
assert(lx_.size() > 0);
assert(rx_.size() > 0);
assert(ly_.size() > 0);
}
};

split_coo_mat.hpp Normal file

@@ -0,0 +1,95 @@
#pragma once
#include <algorithm>
#include <map>
#include <vector>
#include <mpi.h>
#include "coo_mat.hpp"
#include "partition.hpp"
/* the local matrix has columns renumbered to 0..N to index
   into the local dense x vector;
   the remote matrix has columns renumbered to index into the remote dense x vector
*/
struct SplitCooMat {
int loff; // global row index of the local matrix's row 0
CsrMat<Where::host> local; // local matrix
CsrMat<Where::host> remote; // remote matrix (with local column indices)
std::map<int, int> locals; // get local column from global column
std::vector<int> globals; // get global column for local column
};
/* Row partition of a matrix into a "local" and "remote" part
If locally there are rows i...j, then the local part also has columns i...j
The remote part will have all other columns
Each rank will also renumber the column indices in the remote part
This rank will recv the corresponding remote x vector entries, but
doesn't want to materialize the whole distributed x vector in memory
so the column indices must be packed into a contiguous range 0...N
Furthermore, we want all entries from rank 0 to come first, then rank 1, etc.
This is so we can just get entries from rank 0 and recv them directly into the
remote x vector at the correct offset
To do this, relabel the matrix in the following way:
Get a list of unique required global ids, and then sort them.
The first will be local 0, then local 1, etc
*/
SplitCooMat split_local_remote(const CsrMat<Where::host> &m, MPI_Comm comm) {
int rank = 0;
int size = 1;
MPI_Comm_rank(comm, &rank);
MPI_Comm_size(comm, &size);
// which entries of x are local (x is indexed by matrix column)
Range localRange = get_partition(m.num_cols(), rank, size);
int loff = localRange.lb;
// build two matrices, local gets local non-zeros, remote gets remote non-zeros
CooMat local(m.num_rows(), m.num_cols());
CooMat remote(m.num_rows(), m.num_cols());
std::vector<int> globals; // get global col for local col
for (int r = 0; r < m.num_rows(); ++r) {
for (int ci = m.row_ptr(r); ci < m.row_ptr(r+1); ++ci) {
int c = m.col_ind(ci);
float v = m.val(ci);
if (c >= localRange.lb && c < localRange.ub) {
int lc = c - loff;
local.push_back(r,lc,v);
} else {
// keep the global column for now, it will be renumbered later
globals.push_back(c);
remote.push_back(r, c, v);
}
}
}
// sort required global columns.
// this will ensure the lowest owning rank comes first, and all are contiguous
std::sort(globals.begin(), globals.end());
auto it = std::unique(globals.begin(), globals.end());
globals.resize(it - globals.begin());
std::map<int, int> locals; // get local col for global column
for (size_t lc = 0; lc < globals.size(); ++lc) {
int gc = globals[lc];
locals[gc] = lc;
}
// relabel remote columns
for (CooMat::Entry &e : remote) {
e.j = locals[e.j];
}
return SplitCooMat {
.loff=loff,
.local=local,
.remote=remote,
.locals=locals,
.globals=globals
};
}
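/* Hypothetical worked example (not part of this commit): if this rank's remote
   part references global columns {12, 7, 9, 7}, then after sort+unique
   globals == {7, 9, 12} and locals maps 7->0, 9->1, 12->2, so a remote entry
   with j == 9 is relabeled to j == 1.
*/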

where.hpp Normal file

@@ -0,0 +1,6 @@
#pragma once
enum class Where {
host,
device
};