From a4b08da21dc7f03afd6addd6095b235845e7937b Mon Sep 17 00:00:00 2001
From: Carl Pearson
Date: Fri, 14 May 2021 17:18:25 -0600
Subject: [PATCH] initial commit

---
 CMakeLists.txt   |  98 ++++++++++
 README.md        |  11 ++
 cuda_runtime.hpp |  15 ++
 main.cu          | 476 +++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 600 insertions(+)
 create mode 100644 CMakeLists.txt
 create mode 100644 README.md
 create mode 100644 cuda_runtime.hpp
 create mode 100644 main.cu

diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..f136eef
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,102 @@
+# 3.17+ for CMAKE_CUDA_KNOWN_FEATURES/cuda_std_11
+# 3.18+ for CUDA_ARCHITECTURES
+cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
+project(spmv LANGUAGES CXX CUDA VERSION 0.1.0.0)
+
+include(CheckLanguage)
+
+if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+  message(STATUS "CMAKE_CUDA_ARCHITECTURES not defined, setting to OFF")
+  set(CMAKE_CUDA_ARCHITECTURES OFF CACHE STRING "")
+endif()
+
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+
+find_package(MPI REQUIRED)
+find_package(CUDAToolkit REQUIRED)
+
+if (MPI_FOUND)
+  message(STATUS "MPI_CXX_COMPILER:     ${MPI_CXX_COMPILER}")
+  message(STATUS "MPI_CXX_INCLUDE_DIRS: ${MPI_CXX_INCLUDE_DIRS}")
+  message(STATUS "MPI_CXX_LIBRARIES:    ${MPI_CXX_LIBRARIES}")
+  message(STATUS "MPI_CUDA_INCLUDE_DIRS: ${MPI_CUDA_INCLUDE_DIRS}")
+  message(STATUS "MPI_CUDA_LIBRARIES:   ${MPI_CUDA_LIBRARIES}")
+  message(STATUS "MPIEXEC_EXECUTABLE:   ${MPIEXEC_EXECUTABLE}")
+  message(STATUS "MPIEXEC_NUMPROC_FLAG: ${MPIEXEC_NUMPROC_FLAG}")
+  message(STATUS "MPIEXEC_MAX_NUMPROCS: ${MPIEXEC_MAX_NUMPROCS}")
+  message(STATUS "MPIEXEC_PREFLAGS:     ${MPIEXEC_PREFLAGS}")
+  message(STATUS "MPIEXEC_POSTFLAGS:    ${MPIEXEC_POSTFLAGS}")
+endif()
+
+# warnings for host-compiled (C++) translation units
+function(set_cxx_options target)
+  target_compile_options(${target} PRIVATE
+    $<$<COMPILE_LANGUAGE:CXX>:
+      -Wall;
+      -Wextra;
+      -Wcast-align;
+      -Wdisabled-optimization;
+      -Wformat=2;
+      -Winit-self;
+      -Wlogical-op;
+      -Wmissing-include-dirs;
+      -Woverloaded-virtual;
+      -Wpointer-arith;
+      -Wshadow;
+      -Wstrict-aliasing;
+      -Wswitch-enum;
+      -Wvla;
+    >
+  )
+endfunction()
+
+# warnings for CUDA translation units (forwarded to the host compiler),
+# plus ptxas resource-usage reporting
+function(set_cuda_options target)
+  target_compile_options(${target} PRIVATE
+    $<$<COMPILE_LANGUAGE:CUDA>:
+      --Wno-deprecated-gpu-targets;
+      --expt-extended-lambda;
+      -Xcompiler=-Wall;
+      -Xcompiler=-Wextra;
+      -Xcompiler=-Wcast-align;
+      -Xcompiler=-Wdisabled-optimization;
+      -Xcompiler=-Wformat=2;
+      -Xcompiler=-Winit-self;
+      -Xcompiler=-Wlogical-op;
+      -Xcompiler=-Wmissing-include-dirs;
+      -Xcompiler=-Woverloaded-virtual;
+      -Xcompiler=-Wpointer-arith;
+      -Xcompiler=-Wshadow;
+      -Xcompiler=-Wstrict-aliasing;
+      -Xcompiler=-Wswitch-enum;
+      -Xcompiler=-Wvla;
+      -Xptxas=-v;
+    >
+  )
+endfunction()
+
+function(set_cxx_standard target)
+  set_property(TARGET ${target} PROPERTY CXX_STANDARD 11)
+  set_property(TARGET ${target} PROPERTY CXX_EXTENSIONS OFF)
+  set_property(TARGET ${target} PROPERTY CXX_STANDARD_REQUIRED ON)
+  set_property(TARGET ${target} PROPERTY CUDA_STANDARD 11)
+  set_property(TARGET ${target} PROPERTY CUDA_STANDARD_REQUIRED ON)
+endfunction()
+
+# copy run-all.sh to build directory
+#configure_file(${CMAKE_CURRENT_LIST_DIR}/run-all.sh ${CMAKE_CURRENT_BINARY_DIR}/run-all.sh COPYONLY)
+
+if (MPI_FOUND)
+  add_executable(main main.cu)
+  # SYSTEM must precede the scope keyword (was "PRIVATE SYSTEM")
+  target_include_directories(main SYSTEM PRIVATE ${MPI_CXX_INCLUDE_DIRS})
+  target_link_libraries(main ${MPI_CXX_LIBRARIES})
+  # target_include_directories(main PRIVATE ${MPI_CXX_INCLUDE_PATH})
+  # target_compile_options(main PRIVATE ${MPI_CXX_COMPILE_FLAGS})
+  # target_link_libraries(main ${MPI_CXX_LIBRARIES} ${MPI_CXX_LINK_FLAGS})
+  set_cxx_options(main)
+  set_cuda_options(main) # was defined but never applied
+  set_cxx_standard(main)
+endif()

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..10419a9
--- /dev/null
+++ b/README.md
@@ -0,0 +1,11 @@
+# dist-spmv
+
+**vortex**
+```
+module --force purge
+module load StdEnv
+module load xl/2021.03.11
+module load cuda/10.1.243
+module load spectrum-mpi/rolling-release
+module load cmake/3.18.0
+```
diff --git a/cuda_runtime.hpp b/cuda_runtime.hpp
new file mode
100644
index 0000000..91514ed
--- /dev/null
+++ b/cuda_runtime.hpp
@@ -0,0 +1,18 @@
+#pragma once
+
+#include <cstdio>
+#include <cstdlib>
+
+#include <cuda_runtime.h>
+
+// Print file:line and the CUDA error string, then terminate the process.
+inline void checkCuda(cudaError_t result, const char *file, const int line)
+{
+    if (result != cudaSuccess)
+    {
+        fprintf(stderr, "%s:%d: CUDA Runtime Error %d: %s\n", file, line, int(result), cudaGetErrorString(result));
+        exit(-1);
+    }
+}
+// Wrap every CUDA runtime call so failures report their call site.
+#define CUDA_RUNTIME(stmt) checkCuda(stmt, __FILE__, __LINE__);
diff --git a/main.cu b/main.cu
new file mode 100644
index 0000000..2e02886
--- /dev/null
+++ b/main.cu
@@ -0,0 +1,520 @@
+#include <mpi.h>
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <iostream>
+#include <stdexcept>
+#include <vector>
+
+#include "cuda_runtime.hpp"
+
+/* Shift the range [first, last) left by n positions (toward the front).
+   A forward pass is safe for a left shift because each source element is
+   read before it can be overwritten. Requires random-access iterators
+   (uses last - first). */
+template <typename ForwardIt>
+void shift_left(ForwardIt first, ForwardIt last, size_t n) {
+    for (ptrdiff_t i = 0; i < last - first; ++i) {
+        *(first - n + i) = *(first + i);
+    }
+}
+
+// MPI message tags, one per logical payload
+enum Tag : int {
+    row_ptr,
+    col_ind,
+    val,
+    x,
+    num_cols
+};
+
+enum class Where {
+    host,
+    device
+};
+
+// primary template: only specializations are defined
+template <Where where, typename T>
+class Array {
+public:
+    Array();
+    int64_t size() const;
+};
+
+
+/* device array
+*/
+template <typename T> class Array<Where::device, T>
+{
+public:
+
+    // A non-owning view of the data; trivially copyable so it can be
+    // passed by value to kernels
+    struct View
+    {
+        T *data_;
+        int64_t size_;
+    public:
+        View() : data_(nullptr), size_(0) {}
+        View(const View &other) = default;
+
+        // create view from array
+        // fixed: was `a.size()` (host call into a __device__ function)
+        // and `a.data_` (no such member)
+        View(const Array &a) {
+            size_ = a.view_.size_;
+            data_ = a.view_.data_;
+        }
+        __device__ int64_t size() const { return size_; }
+    };
+
+    // array owns the data in this view
+    View view_;
+public:
+    Array() = default;
+    Array(const Array &other) = delete; // owning: no implicit deep copy
+
+    // allocate on the device and copy the host vector's contents up
+    Array(const std::vector<T> &v) {
+        view_.size_ = v.size();
+        CUDA_RUNTIME(cudaMalloc(&view_.data_, view_.size_ * sizeof(T)));
+        CUDA_RUNTIME(cudaMemcpy(view_.data_, v.data(), view_.size_ * sizeof(T), cudaMemcpyHostToDevice));
+    }
+    ~Array() {
+        CUDA_RUNTIME(cudaFree(view_.data_));
+        view_.data_ = nullptr;
+        view_.size_ = 0;
+    }
+    // fixed: View::size() is __device__-only, so read the field directly
+    int64_t size() const { return view_.size_; }
+
+    View view() const {
+        return view_; // copy of internal view
+    }
+};
+
+/* coordinate-format sparse matrix builder (host only) */
+class CooMat {
+public:
+
+    struct Entry {
+        int i;
+        int j;
+        float e;
+
+        // fixed: value parameter was `int`, silently truncating floats
+        Entry(int _i, int _j, float _e) : i(_i), j(_j), e(_e) {}
+
+        // order by row, then column, for CSR conversion
+        static bool by_ij(const Entry &a, const Entry &b) {
+            if (a.i < b.i) {
+                return true;
+            } else if (a.i > b.i) {
+                return false;
+            } else {
+                return a.j < b.j;
+            }
+        }
+    };
+
+private:
+    std::vector<Entry> data_;
+    int64_t numRows_;
+    int64_t numCols_;
+
+public:
+    CooMat(int m, int n) : numRows_(m), numCols_(n) {}
+    const std::vector<Entry> &entries() const { return data_; }
+    // fixed: value parameter was `int`
+    void push_back(int i, int j, float e) {
+        data_.push_back(Entry(i, j, e));
+    }
+
+    void sort() {
+        std::sort(data_.begin(), data_.end(), Entry::by_ij);
+    }
+
+    int64_t num_rows() const { return numRows_; }
+    int64_t num_cols() const { return numCols_; } // fixed: returned numRows_
+    int64_t nnz() const { return data_.size(); }
+};
+
+// primary template: only specializations are defined
+template <Where where>
+class CsrMat {
+public:
+    CsrMat();
+    int64_t nnz() const;
+    int64_t num_rows() const;
+};
+template <> class CsrMat<Where::host>;
+template <> class CsrMat<Where::device>;
+
+/* host sparse matrix */
+template <> class CsrMat<Where::host>
+{
+    friend class CsrMat<Where::device>; // device can see inside
+    std::vector<int> rowPtr_;
+    std::vector<int> colInd_;
+    std::vector<float> val_;
+    int64_t numCols_;
+
+public:
+    CsrMat() = default;
+    // fixed: numCols_ was never initialized from the numCols argument
+    CsrMat(int numRows, int numCols, int nnz)
+        : rowPtr_(numRows + 1), colInd_(nnz), val_(nnz), numCols_(numCols) {}
+
+    // build from a COO matrix; entries must already be sorted by (i, j)
+    CsrMat(const CooMat &coo) : numCols_(coo.num_cols()) {
+        for (auto &e : coo.entries()) {
+            // open any empty rows between the previous entry's row and this one
+            while (rowPtr_.size() <= size_t(e.i)) {
+                rowPtr_.push_back(colInd_.size());
+            }
+            colInd_.push_back(e.j);
+            val_.push_back(e.e);
+        }
+        // close trailing empty rows and the final row
+        while (rowPtr_.size() < size_t(coo.num_rows()) + 1) {
+            rowPtr_.push_back(colInd_.size());
+        }
+    }
+
+    int64_t num_rows() const {
+        if (rowPtr_.size() <= 1) {
+            return 0;
+        } else {
+            return rowPtr_.size() - 1;
+        }
+    }
+
+    int64_t num_cols() const {
+        return numCols_;
+    }
+
+    int64_t nnz() const {
+        if (colInd_.size() != val_.size()) {
+            throw std::logic_error("bad invariant");
+        }
+        return colInd_.size();
+    }
+
+    const int *row_ptr() const { return rowPtr_.data(); }
+    int *row_ptr() { return rowPtr_.data(); }
+    const int *col_ind() const { return colInd_.data(); }
+    int *col_ind() { return colInd_.data(); }
+    const float *val() const { return val_.data(); }
+    float *val() { return val_.data(); }
+
+    /* keep rows [rowStart, rowEnd)
+    */
+    void retain_rows(int rowStart, int rowEnd) {
+
+        if (0 == rowEnd) {
+            throw std::logic_error("unimplemented");
+        }
+        // erase rows after
+        // dont want to keep rowEnd, so rowEnd points to end of rowEnd-1
+        std::cerr << "resize rowPtr_ to " << rowEnd + 1 << "\n";
+        rowPtr_.resize(rowEnd + 1);
+        std::cerr << "resize entries to " << rowPtr_.back() << "\n";
+        colInd_.resize(rowPtr_.back());
+        val_.resize(rowPtr_.back());
+
+        // erase early row pointers
+        std::cerr << "shl rowPtr by " << rowStart << "\n";
+        shift_left(rowPtr_.begin() + rowStart, rowPtr_.end(), rowStart);
+        std::cerr << "resize rowPtr to " << rowEnd - rowStart + 1 << "\n";
+        rowPtr_.resize(rowEnd - rowStart + 1);
+
+        const int off = rowPtr_[0];
+        // erase entries for first rows
+        std::cerr << "shl entries by " << off << "\n";
+        shift_left(colInd_.begin() + off, colInd_.end(), off);
+        shift_left(val_.begin() + off, val_.end(), off);
+
+        // adjust row pointer offset
+        std::cerr << "subtract rowPtrs by " << off << "\n";
+        for (auto &e : rowPtr_) {
+            e -= off;
+        }
+
+        // resize entries
+        std::cerr << "resize entries to " << rowPtr_.back() << "\n";
+        colInd_.resize(rowPtr_.back());
+        val_.resize(rowPtr_.back());
+    }
+
+};
+
+/* device sparse matrix
+*/
+template <> class CsrMat<Where::device>
+{
+    Array<Where::device, int> rowPtr_;
+    Array<Where::device, int> colInd_;
+    Array<Where::device, float> val_;
+
+public:
+
+    // trivially copyable bundle of views for kernel arguments
+    struct View {
+        Array<Where::device, int>::View rowPtr_;
+        Array<Where::device, int>::View colInd_;
+        Array<Where::device, float>::View val_;
+
+        __device__ int num_rows() const {
+            if (rowPtr_.size() > 0) {
+                return rowPtr_.size() - 1;
+            } else {
+                return 0;
+            }
+        }
+    };
+
+    // create device matrix from host (allocates and copies all three arrays)
+    CsrMat(const CsrMat<Where::host> &m) :
+        rowPtr_(m.rowPtr_), colInd_(m.colInd_), val_(m.val_) {
+        if (colInd_.size() != val_.size()) {
+            throw std::logic_error("bad invariant");
+        }
+    }
+    ~CsrMat() {
+    }
+    int64_t num_rows() const {
+        if (rowPtr_.size() <= 1) {
+            return 0;
+        } else {
+            return rowPtr_.size() - 1;
+        }
+    }
+
+    int64_t nnz() const {
+        return colInd_.size();
+    }
+
+    View view() const {
+        View v;
+        v.rowPtr_ = rowPtr_.view();
+        v.colInd_ = colInd_.view();
+        v.val_ = val_.view();
+        return v;
+    }
+
+};
+
+
+// mxn random matrix with nnz entries, each with value 1.0f.
+// NOTE(review): duplicate (r, c) draws are kept as separate CSR entries,
+// so the effective value at a position may exceed 1 — confirm intended.
+CsrMat<Where::host> random_matrix(const int64_t m, const int64_t n, const int64_t nnz) {
+    CooMat coo(m, n);
+    for (int i = 0; i < nnz; ++i) {
+        int r = rand() % m;
+        int c = rand() % n;
+        float e = 1.0f;
+        coo.push_back(r, c, e);
+    }
+    coo.sort();
+    std::cerr << "coo: " << coo.num_rows() << "x" << coo.num_cols() << "\n";
+    CsrMat<Where::host> csr(coo);
+    std::cerr << "csr: " << csr.num_rows() << "x" << csr.num_cols() << " w/ " << csr.nnz() << "\n";
+    return csr;
+};
+
+std::vector<float> random_vector(const int64_t n) {
+    return std::vector<float>(n, 1.0);
+}
+
+// split m into `parts` row blocks; earlier parts get the remainder rows
+std::vector<CsrMat<Where::host>> part_by_rows(const CsrMat<Where::host> &m, const int parts) {
+
+    std::vector<CsrMat<Where::host>> mats;
+
+    int rowStart = 0;
+    for (int p = 0; p < parts; ++p) {
+        int partSize = m.num_rows() / parts;
+        if (p < int(m.num_rows() % parts)) {
+            ++partSize;
+        }
+        std::cerr << "matrix part " << p << " has " << partSize << " rows\n";
+        const int rowEnd = rowStart + partSize;
+        CsrMat<Where::host> part(m);
+        part.retain_rows(rowStart, rowEnd);
+        rowStart = rowEnd;
+        mats.push_back(part);
+    }
+
+    return mats;
+}
+
+// split x into `parts` contiguous pieces; earlier parts get the remainder
+std::vector<std::vector<float>> part_by_rows(const std::vector<float> &x, const int parts) {
+    std::vector<std::vector<float>> xs;
+
+    int rowStart = 0;
+    for (int p = 0; p < parts; ++p) {
+        int partSize = x.size() / parts;
+        if (p < int(x.size() % parts)) {
+            ++partSize;
+        }
+        std::cerr << "vector part " << p << " has " << partSize << " rows\n";
+        const int rowEnd = rowStart + partSize;
+        std::vector<float> part(x.begin() + rowStart, x.begin() + rowEnd);
+        xs.push_back(part);
+    }
+
+    return xs;
+}
+
+// send a CSR matrix to rank dst as four tagged messages (src is unused;
+// kept for a symmetric signature with receive_matrix)
+int send_matrix(int dst, int src, CsrMat<Where::host> &&m, MPI_Comm comm) {
+    (void)src;
+
+    int numCols = m.num_cols();
+    MPI_Send(&numCols, 1, MPI_INT, dst, Tag::num_cols, comm);
+    MPI_Send(m.row_ptr(), m.num_rows() + 1, MPI_INT, dst, Tag::row_ptr, comm);
+    MPI_Send(m.col_ind(), m.nnz(), MPI_INT, dst, Tag::col_ind, comm);
+    MPI_Send(m.val(), m.nnz(), MPI_FLOAT, dst, Tag::val, comm);
+
+    return 0;
+}
+
+// receive a CSR matrix from rank src, probing for sizes first
+// (dst is unused; kept for a symmetric signature with send_matrix)
+CsrMat<Where::host> receive_matrix(int dst, int src, MPI_Comm comm) {
+    (void)dst;
+
+    int numCols;
+    // fixed: source was hard-coded to 0; use the src parameter
+    MPI_Recv(&numCols, 1, MPI_INT, src, Tag::num_cols, comm, MPI_STATUS_IGNORE);
+
+    // probe for number of rows
+    MPI_Status stat;
+    MPI_Probe(src, Tag::row_ptr, comm, &stat);
+    int numRows;
+    MPI_Get_count(&stat, MPI_INT, &numRows);
+    if (numRows > 0) {
+        --numRows; // rowPtr has num_rows + 1 entries
+    }
+
+    // probe for nnz
+    MPI_Probe(src, Tag::col_ind, comm, &stat);
+    int nnz;
+    MPI_Get_count(&stat, MPI_INT, &nnz);
+
+    std::cerr << "recv " << numRows << "x" << numCols << " w/ " << nnz << "\n";
+    CsrMat<Where::host> csr(numRows, numCols, nnz);
+
+    // receive actual data into matrix
+    MPI_Recv(csr.row_ptr(), numRows + 1, MPI_INT, src, Tag::row_ptr, comm, MPI_STATUS_IGNORE);
+    MPI_Recv(csr.col_ind(), nnz, MPI_INT, src, Tag::col_ind, comm, MPI_STATUS_IGNORE);
+    MPI_Recv(csr.val(), nnz, MPI_FLOAT, src, Tag::val, comm, MPI_STATUS_IGNORE);
+
+    return csr;
+}
+
+// send a dense vector to rank dst (src unused; symmetric signature)
+int send_vector(int dst, int src, std::vector<float> &&v, MPI_Comm comm) {
+    (void)src;
+    MPI_Send(v.data(), v.size(), MPI_FLOAT, dst, Tag::x, comm);
+    return 0;
+}
+
+// receive a dense vector from rank src, probing for its length first
+// (dst unused; symmetric signature)
+std::vector<float> receive_vector(int dst, int src, MPI_Comm comm) {
+    (void)dst;
+
+    // probe for size
+    MPI_Status stat;
+    MPI_Probe(src, Tag::x, comm, &stat);
+    int sz;
+    // fixed: payload is MPI_FLOAT; counting as MPI_INT only worked by
+    // accident of equal sizes
+    MPI_Get_count(&stat, MPI_FLOAT, &sz);
+    std::vector<float> x(sz);
+
+    std::cerr << "recv " << sz << " x entries\n";
+
+    // receive actual data
+    MPI_Recv(x.data(), x.size(), MPI_FLOAT, src, Tag::x, comm, MPI_STATUS_IGNORE);
+
+    return x;
+}
+
+/* b = A * x
+   One block per row (grid-stride over rows); the block's threads divide
+   the row's nonzeros, then combine partials with a shared-memory tree
+   reduction. Assumes blockDim.x * blockDim.y is a power of two <= 256
+   (the launch below uses 32x8 = 256). */
+__global__ void spmv(Array<Where::device, float>::View b,
+                     const CsrMat<Where::device>::View A,
+                     const Array<Where::device, float>::View x) {
+
+    const int tid = threadIdx.y * blockDim.x + threadIdx.x;
+    const int nt = blockDim.x * blockDim.y;
+    __shared__ float partials[256];
+
+    // one block per row
+    for (int r = blockIdx.x; r < A.num_rows(); r += gridDim.x) {
+        const int start = A.rowPtr_.data_[r];
+        const int end = A.rowPtr_.data_[r + 1];
+
+        float acc = 0.0f;
+        for (int ci = start + tid; ci < end; ci += nt) {
+            acc += A.val_.data_[ci] * x.data_[A.colInd_.data_[ci]];
+        }
+
+        partials[tid] = acc;
+        __syncthreads();
+        // power-of-two tree reduction over the block
+        for (int s = nt / 2; s > 0; s /= 2) {
+            if (tid < s) {
+                partials[tid] += partials[tid + s];
+            }
+            __syncthreads();
+        }
+        if (0 == tid) {
+            b.data_[r] = partials[0];
+        }
+        __syncthreads(); // partials is reused in the next row iteration
+    }
+}
+
+int main(int argc, char **argv) {
+
+    MPI_Init(&argc, &argv);
+
+    int rank, size;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+    int64_t m = 100;
+    int64_t n = 50;
+    int64_t nnz = 5000;
+
+    CsrMat<Where::host> A;
+    std::vector<float> x;
+
+    // generate and send or recv A
+    if (0 == rank) {
+        A = random_matrix(m, n, nnz);
+        // fixed: was a shadowing local `std::vector<float> x`, which left
+        // the outer x empty on rank 0
+        x = random_vector(n);
+        std::vector<CsrMat<Where::host>> As = part_by_rows(A, size);
+        std::vector<std::vector<float>> xs = part_by_rows(x, size);
+        for (int dst = 1; dst < size; ++dst) {
+            std::cerr << "send A to " << dst << "\n";
+            send_matrix(dst, 0, std::move(As[dst]), MPI_COMM_WORLD);
+            std::cerr << "send x to " << dst << "\n";
+            send_vector(dst, 0, std::move(xs[dst]), MPI_COMM_WORLD);
+        }
+        A = As[rank];
+        x = xs[rank];
+    } else {
+        std::cerr << "recv A at " << rank << "\n";
+        A = receive_matrix(rank, 0, MPI_COMM_WORLD);
+        std::cerr << "recv x at " << rank << "\n";
+        x = receive_vector(rank, 0, MPI_COMM_WORLD);
+    }
+
+    // Product vector size is same as local rows of A
+    std::vector<float> b(A.num_rows());
+
+    // get GPU versions
+    CsrMat<Where::device> Ad(A);
+    Array<Where::device, float> xd(x);
+    Array<Where::device, float> bd(b);
+
+    // do spmv
+    dim3 dimBlock(32, 8, 1);
+    dim3 dimGrid(100);
+    spmv<<<dimGrid, dimBlock>>>(bd.view(), Ad.view(), xd.view());
+    CUDA_RUNTIME(cudaGetLastError()); // catch launch-configuration errors
+    CUDA_RUNTIME(cudaDeviceSynchronize());
+    // TODO: copy bd back to the host and gather/verify the product
+
+    MPI_Finalize();
+
+}