first spmv

This commit is contained in:
Carl William Pearson
2021-05-17 15:01:30 -06:00
parent a4b08da21d
commit 5033fc1dec
2 changed files with 226 additions and 62 deletions

View File

@@ -8,4 +8,12 @@ module load xl/2021.03.11
module load cuda/10.1.243
module load spectrum-mpi/rolling-release
module load cmake/3.18.0
```
**ascicgpu030**
```
module purge
sierra-devel/nvidia
module load cde/v2/cmake/3.19.2
```

280
main.cu
View File

@@ -8,10 +8,17 @@
#include "cuda_runtime.hpp"
#define STRINGIFY(x) #x
#define TOSTRING(x) STRINGIFY(x)
#define AT __FILE__ ":" TOSTRING(__LINE__)
//#define VIEW_CHECK_BOUNDS
template<typename ForwardIt>
void shift_left(ForwardIt first, ForwardIt last, size_t n) {
for (size_t i = 0; i < last-first; ++i) {
*(first-n+i) = *(first+i);
while(first != last) {
*(first-n) = *first;
++first;
}
}
@@ -29,11 +36,7 @@ enum class Where {
};
template <Where where, typename T>
class Array {
public:
Array();
int64_t size() const;
};
class Array;
/* device array
@@ -50,36 +53,76 @@ public:
public:
View() : data_(nullptr), size_(0){}
View(const View &other) = default;
View(View &&other) = default;
View &operator=(const View &rhs) = default;
// create view from array
View(const Array &a) {
size_ = a.size();
data_ = a.data_;
__host__ __device__ int64_t size() const { return size_; }
__device__ const T &operator()(int64_t i) const {
#ifdef VIEW_CHECK_BOUNDS
if (i < 0) {
printf("ERR: i < 0: %d\n", i);
}
if (i >= size_) {
printf("ERR: i > size_: %d > %ld\n", i, size_);
}
#endif
return data_[i];
}
__device__ T &operator()(int64_t i) {
return data_[i];
}
__device__ int64_t size() const { return size_; }
};
// array owns the data in this view
View view_;
public:
Array() = default;
Array(const size_t n) {
resize(n);
}
Array(const Array &other) = delete;
Array(Array &&other) : view_(other.view_) {
// view is non-owning, so have to clear other
other.view_.data_ = nullptr;
other.view_.size_ = 0;
}
Array(const std::vector<T> &v) {
view_.size_ = v.size();
CUDA_RUNTIME(cudaMalloc(&view_.data_, view_.size_ * sizeof(T)));
CUDA_RUNTIME(cudaMemcpy(view_.data_, v.data(), view_.size_ * sizeof(T), cudaMemcpyHostToDevice));
set_from(v);
}
~Array() {
CUDA_RUNTIME(cudaFree(view_.data_));
view_.data_ = nullptr;
view_.size_ = 0;
}
int64_t size() const { return view_.size(); }
int64_t size() const {
return view_.size(); }
View view() const {
return view_; // copy of internal view
}
operator std::vector<T>() const {
std::vector<T> v(size());
CUDA_RUNTIME(cudaMemcpy(v.data(), view_.data_, size() * sizeof(T), cudaMemcpyDeviceToHost));
return v;
}
void set_from(const std::vector<T> &rhs, cudaStream_t stream = 0) {
resize(rhs.size());
CUDA_RUNTIME(cudaMemcpyAsync(view_.data_, rhs.data(), view_.size_ * sizeof(T), cudaMemcpyHostToDevice, stream));
}
// any change destroys all data
void resize(size_t n) {
if (size() != n) {
view_.size_ = n;
CUDA_RUNTIME(cudaFree(view_.data_));
CUDA_RUNTIME(cudaMalloc(&view_.data_, view_.size_ * sizeof(T)));
}
}
};
class CooMat {
@@ -105,6 +148,8 @@ public:
};
private:
// sorted during construction
std::vector<Entry> data_;
int64_t numRows_;
int64_t numCols_;
@@ -113,7 +158,18 @@ public:
CooMat(int m, int n) : numRows_(m), numCols_(n) {}
const std::vector<Entry> &entries() const {return data_;}
void push_back(int i, int j, int e) {
data_.push_back(Entry(i, j, e));
Entry entry(i,j,e);
// first position not less than entry
auto lb = std::lower_bound(data_.begin(), data_.end(), entry, Entry::by_ij);
// overwrite if already exists
if (lb != data_.end() && lb->i == entry.i && lb->j == entry.j) {
*lb = entry;
} else {
data_.insert(lb, entry);
}
}
void sort() {
@@ -121,7 +177,7 @@ public:
}
int64_t num_rows() const {return numRows_;}
int64_t num_cols() const {return numRows_;}
int64_t num_cols() const {return numCols_;}
int64_t nnz() const {return data_.size();}
};
@@ -146,8 +202,8 @@ template<> class CsrMat<Where::host>
public:
CsrMat() = default;
CsrMat(int numRows, int numCols, int nnz) : rowPtr_(numRows+1), colInd_(nnz), val_(nnz) {}
CsrMat(int numRows, int numCols, int nnz) : rowPtr_(numRows+1), colInd_(nnz), val_(nnz), numCols_(numCols) {}
CsrMat(const CooMat &coo) : numCols_(coo.num_cols()) {
for (auto &e : coo.entries()) {
while (rowPtr_.size() <= e.i) {
@@ -196,21 +252,21 @@ public:
}
// erase rows after
// dont want to keep rowEnd, so rowEnd points to end of rowEnd-1
std::cerr << "resize rowPtr_ to " << rowEnd+1 << "\n";
std::cerr << "rowPtr_ = rowPtr[:" << rowEnd+1 << "]\n";
rowPtr_.resize(rowEnd+1);
std::cerr << "resize entries to " << rowPtr_.back() << "\n";
colInd_.resize(rowPtr_.back());
val_.resize(rowPtr_.back());
// erase early row pointers
std::cerr << "shl rowPtr by " << rowStart << "\n";
std::cerr << "rowPtr <<= " << rowStart << "\n";
shift_left(rowPtr_.begin()+rowStart, rowPtr_.end(), rowStart);
std::cerr << "resize rowPtr to " << rowEnd - rowStart+1 << "\n";
rowPtr_.resize(rowEnd-rowStart+1);
const int off = rowPtr_[0];
// erase entries for first rows
std::cerr << "shl entries by " << off << "\n";
std::cerr << "entries <<= " << off << "\n";
shift_left(colInd_.begin()+off, colInd_.end(), off);
shift_left(val_.begin()+off, val_.end(), off);
@@ -250,11 +306,28 @@ public:
return 0;
}
}
__device__ const int &row_ptr(int64_t i) const {
return rowPtr_(i);
}
__device__ const int &col_ind(int64_t i) const {
return colInd_(i);
}
__device__ const float &val(int64_t i) const {
return val_(i);
}
};
CsrMat() = delete;
CsrMat(CsrMat &&other) = delete;
CsrMat(const CsrMat &other) = delete;
// create device matrix from host
CsrMat(const CsrMat<Where::host> &m) :
rowPtr_(m.rowPtr_), colInd_(m.colInd_), val_(m.val_) {
CsrMat(const CsrMat<Where::host> &m) :
rowPtr_(m.rowPtr_), colInd_(m.colInd_), val_(m.val_) {
if (colInd_.size() != val_.size()) {
throw std::logic_error("bad invariant");
}
@@ -279,8 +352,6 @@ public:
v.colInd_ = colInd_.view();
v.val_ = val_.view();
return v;
}
};
@@ -290,8 +361,13 @@ public:
// mxn random matrix with nnz
CsrMat<Where::host> random_matrix(const int64_t m, const int64_t n, const int64_t nnz) {
if (m * n < nnz) {
throw std::logic_error(AT);
}
CooMat coo(m,n);
for (int i = 0; i < nnz; ++i) {
while(coo.nnz() < nnz) {
int r = rand() % m;
int c = rand() % n;
float e = 1.0;
@@ -308,42 +384,58 @@ std::vector<float> random_vector(const int64_t n) {
return std::vector<float>(n, 1.0);
}
struct Range {
int lb;
int ub;
};
/* get the ith part of splitting domain in to n pieces
*/
Range get_partition(const int domain, const int i, const int n) {
int div = domain / n;
int rem = domain % n;
int lb, ub;
if (i < rem) {
lb = i * (div+1);
ub = lb + (div+1);
} else {
lb = rem * (div+1) + (i-rem) * div;
ub = lb + div;
}
return Range{.lb=lb, .ub=ub};
}
std::vector<CsrMat<Where::host>> part_by_rows(const CsrMat<Where::host> &m, const int parts) {
std::vector<CsrMat<Where::host>> mats;
int rowStart = 0;
for (int p = 0; p < parts; ++p) {
int partSize = m.num_rows() / parts;
if (p < m.num_rows() % parts) {
++partSize;
}
std::cerr << "matrix part " << p << " has " << partSize << " rows\n";
const int rowEnd = rowStart + partSize;
Range range = get_partition(m.num_rows(), p, parts);
std::cerr << "matrix part " << p << " has " << range.ub-range.lb << " rows\n";
CsrMat<Where::host> part(m);
part.retain_rows(rowStart, rowEnd);
rowStart = rowEnd;
part.retain_rows(range.lb, range.ub);
mats.push_back(part);
}
return mats;
}
std::vector<std::vector<float>> part_by_rows(const std::vector<float> &x, const int parts) {
std::vector<std::vector<float>> xs;
int rowStart = 0;
for (int p = 0; p < parts; ++p) {
int partSize = x.size() / parts;
if (p < x.size() % parts) {
++partSize;
}
std::cerr << "vector part " << p << " has " << partSize << " rows\n";
const int rowEnd = rowStart + partSize;
std::vector<float> part(x.begin()+rowStart, x.begin()+rowEnd);
Range range = get_partition(x.size(), p, parts);
std::cerr << "vector part " << p << " will have " << range.ub-range.lb << " rows\n";
std::vector<float> part(x.begin()+range.lb, x.begin()+range.ub);
xs.push_back(part);
}
if (xs.size() != parts) {
throw std::logic_error("line " STRINGIFY(__LINE__));
}
return xs;
}
@@ -412,34 +504,47 @@ std::vector<float> receive_vector(int dst, int src, MPI_Comm comm) {
return x;
}
/* Ax=b
*/
__global__ void spmv(Array<Where::device, float>::View b, const CsrMat<Where::device>::View A, const Array<Where::device, float>::View x) {
// one block per row
for (int r = blockIdx.x; r < A.num_rows(); r += gridDim.x) {
// one thread per row
for (int r = blockDim.x * blockIdx.x + threadIdx.x; r < A.num_rows(); r += blockDim.x * gridDim.x) {
float acc = 0;
for (int ci = A.row_ptr(r); ci < A.row_ptr(r+1); ++ci) {
int c = A.col_ind(ci);
acc += A.val(ci) * x(c);
}
b(r) = acc;
}
}
int main (int argc, char **argv) {
MPI_Init(&argc, &argv);
MPI_Init(&argc, &argv);
int rank, size;
std::cerr << "get a gpu...\n";
CUDA_RUNTIME(cudaFree(0));
std::cerr << "barrier...\n";
MPI_Barrier(MPI_COMM_WORLD);
int rank = 0;
int size = 1;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
int64_t m = 100;
int64_t n = 50;
int64_t nnz = 5000;
int64_t nnz = 100;
CsrMat<Where::host> A;
std::vector<float> x;
// generate and send or recv A
// generate and distribute A and x
if (0 == rank) {
A = random_matrix(m, n, nnz);
std::vector<float> x = random_vector(n);
x = random_vector(n);
std::vector<CsrMat<Where::host>> As = part_by_rows(A, size);
std::vector<std::vector<float>> xs = part_by_rows(x, size);
for (size_t dst = 1; dst < size; ++dst) {
@@ -457,20 +562,71 @@ MPI_Init(&argc, &argv);
x = receive_vector(rank, 0, MPI_COMM_WORLD);
}
// Product vector size is same as local rows of A
std::vector<float> b(A.num_rows());
// get GPU versions
std::cerr << "A[" << A.num_rows() << "," << A.num_cols() << "] w/ " << A.nnz() << "\n";
std::cerr << "Copy A to GPU\n";
CsrMat<Where::device> Ad(A);
Array<Where::device, float> xd(x);
// Product vector size is same as local rows of A
std::vector<float> b(A.num_rows(), 0);
std::cerr << "Copy b to GPU\n";
Array<Where::device, float> bd(b);
// plan allgather of complete x
std::cerr << "plan allgather xs\n";
std::vector<float> lx(A.num_cols());
Array<Where::device, float> lxd(A.num_cols());
std::vector<int> recvcounts;
std::vector<int> displs;
for (int i = 0; i < size; ++i) {
Range r = get_partition(lx.size(), i, size);
recvcounts.push_back(r.ub-r.lb);
if (displs.empty()) {
displs.push_back(0);
} else {
displs.push_back(displs.back() + recvcounts.back());
}
}
cudaStream_t stream;
CUDA_RUNTIME(cudaStreamCreate(&stream));
// do spmv
dim3 dimBlock(32,8,1);
dim3 dimBlock(256);
dim3 dimGrid(100);
spmv<<<dimGrid, dimBlock>>>(bd.view(), Ad.view(), xd.view());
CUDA_RUNTIME(cudaDeviceSynchronize());
MPI_Finalize();
for (int i = 0; i < 10; ++i) {
// distribute x to ranks
std::cerr << "Allgather xs\n";
MPI_Allgatherv(x.data(), x.size(), MPI_FLOAT, lx.data(), recvcounts.data(), displs.data(), MPI_FLOAT, MPI_COMM_WORLD);
std::cerr << "Copy x to GPU\n";
lxd.set_from(lx, stream);
std::cerr << "launch spmv\n";
spmv<<<dimGrid, dimBlock, 0, stream>>>(bd.view(), Ad.view(), lxd.view());
CUDA_RUNTIME(cudaGetLastError());
CUDA_RUNTIME(cudaStreamSynchronize(stream));
std::cerr << "done spmv\n";
// copy product back to host
// b = bd;
// for (int r = 0; r < size; ++r) {
// MPI_Barrier(MPI_COMM_WORLD);
// if (rank == r) {
// for (auto &e : b) {
// std::cout << e << "\n";
// }
// }
// }
}
MPI_Finalize();
return 0;
}