diff --git a/CMakeLists.txt b/CMakeLists.txt
index f136eef..4eb928e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -89,6 +89,7 @@ if (MPI_FOUND)
   add_executable(main main.cu)
   target_include_directories(main PRIVATE SYSTEM ${MPI_CXX_INCLUDE_DIRS})
   target_link_libraries(main ${MPI_CXX_LIBRARIES})
+  target_link_libraries(main CUDA::nvToolsExt)
   # target_include_directories(main PRIVATE ${MPI_CXX_INCLUDE_PATH})
   # target_compile_options(main PRIVATE ${MPI_CXX_COMPILE_FLAGS})
   # target_link_libraries(main ${MPI_CXX_LIBRARIES} ${MPI_CXX_LINK_FLAGS})
diff --git a/README.md b/README.md
index 95363cc..4061b2d 100644
--- a/README.md
+++ b/README.md
@@ -12,8 +12,18 @@ module load cmake/3.18.0

 **ascicgpu030**

+To get Nsight Systems, I had to download the rpm and unpack it into my home directory with `cpio`.
+
 ```
 module purge
-sierra-devel/nvidia
+module load sierra-devel/nvidia
 module load cde/v2/cmake/3.19.2
-```
\ No newline at end of file
+```
+
+```
+mpirun -n 2 ~/software/nsight-systems-cli/2021.2.1/bin/nsys profile -c cudaProfilerApi -t cuda,mpi,nvtx -o dist-spmv_%q{OMPI_COMM_WORLD_RANK} -f true ./main
+```
+
+## Design Considerations
+
+Minimize CUDA runtime calls
\ No newline at end of file
diff --git a/main.cu b/main.cu
index 76557a8..64152f4 100644
--- a/main.cu
+++ b/main.cu
@@ -1,5 +1,8 @@
 #include <mpi.h>
+#include <nvToolsExt.h>
+// #include <cuda_profiler_api.h>
+
 #include <algorithm>
 #include <iostream>
 #include <vector>

@@ -39,43 +42,44 @@ template <Where where, typename T> class Array;

+// A non-owning view of data
+template <typename T>
+struct ArrayView
+{
+  T *data_;
+  int64_t size_;
+public:
+  ArrayView() : data_(nullptr), size_(0) {}
+  ArrayView(const ArrayView &other) = default;
+  ArrayView(ArrayView &&other) = default;
+  ArrayView &operator=(const ArrayView &rhs) = default;
+
+  __host__ __device__ int64_t size() const { return size_; }
+
+  __host__ __device__ const T &operator()(int64_t i) const {
+#ifdef VIEW_CHECK_BOUNDS
+    if (i < 0) {
+      printf("ERR: i < 0: %ld\n", i);
+    }
+    if (i >= size_) {
+      printf("ERR: i > size_: %ld > %ld\n", i, size_);
+    }
+#endif
+    return data_[i];
+  }
+  __host__ __device__ T &operator()(int64_t i) {
+    return data_[i];
+  }
+};
+
 /* device array
 */
 template <typename T> class Array<Where::device, T>
 {
 public:
-  // A non-owning view of data
-  struct View
-  {
-    T *data_;
-    int64_t size_;
-  public:
-    View() : data_(nullptr), size_(0) {}
-    View(const View &other) = default;
-    View(View &&other) = default;
-    View &operator=(const View &rhs) = default;
-
-    __host__ __device__ int64_t size() const { return size_; }
-
-    __device__ const T &operator()(int64_t i) const {
-#ifdef VIEW_CHECK_BOUNDS
-      if (i < 0) {
-        printf("ERR: i < 0: %d\n", i);
-      }
-      if (i >= size_) {
-        printf("ERR: i > size_: %d > %ld\n", i, size_);
-      }
-#endif
-      return data_[i];
-    }
-    __device__ T &operator()(int64_t i) {
-      return data_[i];
-    }
-  };
-
-  // array owns the data in this view
-  View view_;
+  ArrayView<T> view_;
 public:
   Array() = default;
   Array(const size_t n) {
@@ -91,6 +95,7 @@ public:
   Array(const std::vector<T> &v) {
     set_from(v);
   }
+
   ~Array() {
     CUDA_RUNTIME(cudaFree(view_.data_));
     view_.data_ = nullptr;
@@ -99,7 +104,7 @@ public:
   int64_t size() const {
     return view_.size();
   }
-  View view() const {
+  ArrayView<T> view() const {
     return view_; // copy of internal view
   }

@@ -114,6 +119,11 @@ public:
     CUDA_RUNTIME(cudaMemcpyAsync(view_.data_, rhs.data(), view_.size_ * sizeof(T), cudaMemcpyHostToDevice, stream));
   }

+  void set_from(const Array<Where::host, T> &rhs, cudaStream_t stream = 0) {
+    resize(rhs.size());
+    CUDA_RUNTIME(cudaMemcpyAsync(view_.data_, rhs.data(), view_.size_ * sizeof(T), cudaMemcpyHostToDevice, stream));
+  }
+
   // any change destroys all data
   void resize(size_t n) {
     if (size() != n) {
@@ -125,6 +135,61 @@ public:
 };

+
+/* host array
+*/
+template <typename T> class Array<Where::host, T>
+{
+public:
+
+  // array owns the data in this view
+  ArrayView<T> view_;
+public:
+  Array() = default;
+  Array(const size_t n, const T &val) {
+    resize(n);
+    for (size_t i = 0; i < n; ++i) {
+      view_(i) = val;
+    }
+  }
+  Array(const Array &other) = delete;
+  Array(Array &&other) : view_(other.view_) {
+    // view is non-owning, so have to clear other
+    other.view_.data_ = nullptr;
+    other.view_.size_ = 0;
+  }
+
+  ~Array() {
+    CUDA_RUNTIME(cudaFreeHost(view_.data_));
+    view_.data_ = nullptr;
+    view_.size_ = 0;
+  }
+  int64_t size() const {
+    return view_.size();
+  }
+
+  ArrayView<T> view() const {
+    return view_; // copy of internal view
+  }
+
+  // any change destroys all data
+  void resize(size_t n) {
+    if (size() != n) {
+      view_.size_ = n;
+      CUDA_RUNTIME(cudaFreeHost(view_.data_));
+      CUDA_RUNTIME(cudaHostAlloc(&view_.data_, view_.size_ * sizeof(T), cudaHostAllocDefault));
+    }
+  }
+
+  const T* data() const {
+    return view_.data_;
+  }
+  T* data() {
+    return view_.data_;
+  }
+
+};
+
+
 class CooMat {
 public:

@@ -145,6 +210,10 @@ public:
       return a.j < b.j;
     }
   }
+
+  static bool same_ij(const Entry &a, const Entry &b) {
+    return a.i == b.i && a.j == b.j;
+  }
 };

 private:
@@ -158,24 +227,18 @@ public:
   CooMat(int m, int n) : numRows_(m), numCols_(n) {}
   const std::vector<Entry> &entries() const {return data_;}
   void push_back(int i, int j, int e) {
-
-    Entry entry(i,j,e);
-
-    // first position not less than entry
-    auto lb = std::lower_bound(data_.begin(), data_.end(), entry, Entry::by_ij);
-
-    // overwrite if already exists
-    if (lb != data_.end() && lb->i == entry.i && lb->j == entry.j) {
-      *lb = entry;
-    } else {
-      data_.insert(lb, entry);
-    }
+    data_.push_back(Entry(i,j,e));
   }

   void sort() {
     std::sort(data_.begin(), data_.end(), Entry::by_ij);
   }

+  void remove_duplicates() {
+    std::sort(data_.begin(), data_.end(), Entry::by_ij);
+    data_.erase(std::unique(data_.begin(), data_.end(), Entry::same_ij), data_.end());
+  }
+
   int64_t num_rows() const {return numRows_;}
   int64_t num_cols() const {return numCols_;}
   int64_t nnz() const {return data_.size();}
@@ -236,6 +299,16 @@ public:
     return colInd_.size();
   }

+  const int &row_ptr(int64_t i) const {
+    return rowPtr_[i];
+  }
+  const int &col_ind(int64_t i) const {
+    return colInd_[i];
+  }
+  const float &val(int64_t i) const {
+    return val_[i];
+  }
+
   const int *row_ptr() const {return rowPtr_.data(); }
   int *row_ptr() {return rowPtr_.data(); }
   const int *col_ind() const {return colInd_.data(); }
@@ -295,9 +368,9 @@ template<> class CsrMat<Where::device>
 public:
   struct View {
-    Array<Where::device, int>::View rowPtr_;
-    Array<Where::device, int>::View colInd_;
-    Array<Where::device, float>::View val_;
+    ArrayView<int> rowPtr_;
+    ArrayView<int> colInd_;
+    ArrayView<float> val_;

     __device__ int num_rows() const {
       if (rowPtr_.size() > 0) {
@@ -368,10 +441,17 @@ CsrMat<Where::host> random_matrix(const int64_t m, const int64_t n, const int64_

   CooMat coo(m,n);
   while(coo.nnz() < nnz) {
-    int r = rand() % m;
-    int c = rand() % n;
-    float e = 1.0;
-    coo.push_back(r, c, e);
+
+    int toPush = nnz - coo.nnz();
+    std::cerr << "adding " << toPush << " non-zeros\n";
+    for (int _ = 0; _ < toPush; ++_) {
+      int r = rand() % m;
+      int c = rand() % n;
+      float e = 1.0;
+      coo.push_back(r, c, e);
+    }
+    std::cerr << "removing duplicate non-zeros\n";
+    coo.remove_duplicates();
   }
   coo.sort();
   std::cerr << "coo: " << coo.num_rows() << "x" << coo.num_cols() << "\n";
@@ -384,6 +464,10 @@ std::vector<float> random_vector(const int64_t n) {
   return std::vector<float>(n, 1.0);
 }

+Array<Where::host, float> random_array(const int64_t n) {
+  return Array<Where::host, float>(n, 1.0);
+}
+
 struct Range {
   int lb;
   int ub;
 };

@@ -422,6 +506,44 @@ std::vector<CsrMat<Where::host>> part_by_rows(const CsrMat<Where::host> &m, cons
   return mats;
 }

+struct DistMat {
+  CsrMat<Where::host> local;
+  CsrMat<Where::host> remote;
+};
+
+DistMat split_local_remote(const CsrMat<Where::host> &m, MPI_Comm comm) {
+  int rank = 0;
+  int size = 1;
+  MPI_Comm_rank(comm, &rank);
+  MPI_Comm_size(comm, &size);
+
+  // which rows of x are local
+  Range localRange = get_partition(m.num_cols(), rank, size);
+
+  // build two matrices, local gets local non-zeros, remote gets remote non-zeros
+  CooMat local(m.num_rows(), m.num_cols()), remote(m.num_rows(), m.num_cols());
+
+  for (int r = 0; r < m.num_rows(); ++r) {
+    for (int ci = m.row_ptr(r); ci < m.row_ptr(r+1); ++ci) {
+      int c = m.col_ind(ci);
+      float v = m.val(ci);
+
+      if (c >= localRange.lb && c < localRange.ub) {
+        local.push_back(r,c,v);
+      } else {
+        remote.push_back(r,c,v);
+      }
+
+    }
+  }
+
+  return DistMat {
+    .local=local,
+    .remote=remote
+  };
+
+}
+
 std::vector<std::vector<float>> part_by_rows(const std::vector<float> &x, const int parts) {

   std::vector<std::vector<float>> xs;

@@ -441,11 +563,14 @@ std::vector<std::vector<float>> part_by_rows(const std::vector<float> &x, const

 int send_matrix(int dst, int src, CsrMat<Where::host> &&m, MPI_Comm comm) {

+  MPI_Request reqs[4];
+
   int numCols = m.num_cols();

-  MPI_Send(&numCols, 1, MPI_INT, dst, Tag::num_cols, comm);
-  MPI_Send(m.row_ptr(), m.num_rows()+1, MPI_INT, dst, Tag::row_ptr, comm);
-  MPI_Send(m.col_ind(), m.nnz(), MPI_INT, dst, Tag::col_ind, comm);
-  MPI_Send(m.val(), m.nnz(), MPI_FLOAT, dst, Tag::val, comm);
+  MPI_Isend(&numCols, 1, MPI_INT, dst, Tag::num_cols, comm, &reqs[0]);
+  MPI_Isend(m.row_ptr(), m.num_rows()+1, MPI_INT, dst, Tag::row_ptr, comm, &reqs[1]);
+  MPI_Isend(m.col_ind(), m.nnz(), MPI_INT, dst, Tag::col_ind, comm, &reqs[2]);
+  MPI_Isend(m.val(), m.nnz(), MPI_FLOAT, dst, Tag::val, comm, &reqs[3]);
+  MPI_Waitall(4, reqs, MPI_STATUSES_IGNORE);

   return 0;
 }

@@ -474,40 +599,57 @@ CsrMat<Where::host> receive_matrix(int dst, int src, MPI_Comm comm) {

   // receive actual data into matrix
   MPI_Recv(csr.row_ptr(), numRows+1, MPI_INT, 0, Tag::row_ptr, comm, MPI_STATUS_IGNORE);
-  // receive actual data into matrix
   MPI_Recv(csr.col_ind(), nnz, MPI_INT, 0, Tag::col_ind, comm, MPI_STATUS_IGNORE);
-  // receive actual data into matrix
   MPI_Recv(csr.val(), nnz, MPI_FLOAT, 0, Tag::val, comm, MPI_STATUS_IGNORE);

   return csr;
 }

-int send_vector(int dst, int src, std::vector<float> &&v, MPI_Comm comm) {
+int send_x(int dst, int src, std::vector<float> &&v, MPI_Comm comm) {
   MPI_Send(v.data(), v.size(), MPI_FLOAT, dst, Tag::x, comm);
   return 0;
 }

-std::vector<float> receive_vector(int dst, int src, MPI_Comm comm) {
+/* recv some amount of data, and put it in the right place
+   in a full x
+*/
+std::vector<float> receive_x(const int n, const int dst, int src, MPI_Comm comm) {
+  int rank = 0;
+  int size = 1;
+  MPI_Comm_rank(comm, &rank);
+  MPI_Comm_size(comm, &size);
+
+  // which rows of x are local
+  Range local = get_partition(n, rank, size);

   // probe for size
   MPI_Status stat;
   MPI_Probe(0, Tag::x, comm, &stat);
   int sz;
   MPI_Get_count(&stat, MPI_INT, &sz);
-  std::vector<float> x(sz);
+  if (sz != local.ub-local.lb) {
+    throw std::logic_error(AT);
+  }

-  std::cerr << "recv " << sz << " x entries\n";
-
-  // receive actual data into matrix
-  MPI_Recv(x.data(), x.size(), MPI_FLOAT, 0, Tag::x, comm, MPI_STATUS_IGNORE);
+  std::cerr << "recv " << sz << " x entries into offset " << local.lb << "\n";
+  std::vector<float> x(n);
+  MPI_Recv(x.data() + local.lb, sz, MPI_FLOAT, 0, Tag::x, comm, MPI_STATUS_IGNORE);

   return x;
 }

+enum class ProductConfig {
+  MODIFY, // b +=
+  SET     // b =
+};
+
 /* Ax=b
 */
-__global__ void spmv(Array<Where::device, float>::View b, const CsrMat<Where::device>::View A, const Array<Where::device, float>::View x) {
-
+__global__ void spmv(ArrayView<float> b,
+                     const CsrMat<Where::device>::View A,
+                     const ArrayView<float> x,
+                     const ProductConfig pc
+                     ) {
   // one thread per row
   for (int r = blockDim.x * blockIdx.x + threadIdx.x; r < A.num_rows(); r += blockDim.x * gridDim.x) {
     float acc = 0;
@@ -515,68 +657,89 @@ __global__ void spmv(Array<Where::device, float>::View b, const CsrMat<Where::d
     }
-    b(r) = acc;
+    if (ProductConfig::SET == pc) {
+      b(r) = acc;
+    } else {
+      b(r) += acc;
+    }
   }
 }

+/* z += a
+*/
+__global__ void add(ArrayView<float> z, const ArrayView<float> a) {
+  for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < z.size(); i += blockDim.x * gridDim.x) {
+    z(i) += a(i);
+  }
+}
+
 int main (int argc, char **argv) {

   MPI_Init(&argc, &argv);

-  std::cerr << "get a gpu...\n";
-  CUDA_RUNTIME(cudaFree(0));
-  std::cerr << "barrier...\n";
-  MPI_Barrier(MPI_COMM_WORLD);
-
   int rank = 0;
   int size = 1;
   MPI_Comm_rank(MPI_COMM_WORLD, &rank);
   MPI_Comm_size(MPI_COMM_WORLD, &size);

-  int64_t m = 100;
-  int64_t n = 50;
-  int64_t nnz = 100;
+  std::cerr << "get a gpu...\n";
+  CUDA_RUNTIME(cudaSetDevice(rank));
+  CUDA_RUNTIME(cudaFree(0));
+  std::cerr << "barrier...\n";
+  MPI_Barrier(MPI_COMM_WORLD);

-  CsrMat<Where::host> A;
-  std::vector<float> x;
+  int64_t m = 150000;
+  int64_t n = 150000;
+  int64_t nnz = 11000000;

-  // generate and distribute A and x
+  CsrMat<Where::host> lA; // "local A"
+
+  // generate and distribute A
   if (0 == rank) {
-    A = random_matrix(m, n, nnz);
-    x = random_vector(n);
-    std::vector<CsrMat<Where::host>> As = part_by_rows(A, size);
-    std::vector<std::vector<float>> xs = part_by_rows(x, size);
+    std::cerr << "generate matrix\n";
+    lA = random_matrix(m, n, nnz);
+    std::cerr << "partition matrix\n";
+    std::vector<CsrMat<Where::host>> As = part_by_rows(lA, size);
     for (size_t dst = 1; dst < size; ++dst) {
       std::cerr << "send A to " << dst << "\n";
       send_matrix(dst, 0, std::move(As[dst]), MPI_COMM_WORLD);
-      std::cerr << "send x to " << dst << "\n";
-      send_vector(dst, 0, std::move(xs[dst]), MPI_COMM_WORLD);
     }
-    A = As[rank];
-    x = xs[rank];
+    lA = As[rank];
   } else {
     std::cerr << "recv A at " << rank << "\n";
-    A = receive_matrix(rank, 0, MPI_COMM_WORLD);
-    std::cerr << "recv x at " << rank << "\n";
-    x = receive_vector(rank, 0, MPI_COMM_WORLD);
+    lA = receive_matrix(rank, 0, MPI_COMM_WORLD);
   }

-  std::cerr << "A[" << A.num_rows() << "," << A.num_cols() << "] w/ " << A.nnz() << "\n";
+  // each rank has a dense x. each rank owns part of it,
+  // but it doesn't matter what the entries are
+  Array<Where::host, float> lx = random_array(n); // "local x"
+  std::cerr << "local X: " << lx.size() << "\n";
+  std::cerr << "copy x to device\n";
+  Array<Where::device, float> lxd(lx.size()), rxd(lx.size()); // "local/remote x device"
+
+  // get a local and remote split of A
+  std::cerr << "split local/remote A\n";
+  CsrMat<Where::host> rA, A(lA);
+  {
+    DistMat d = split_local_remote(lA, MPI_COMM_WORLD);
+    lA = d.local;
+    rA = d.remote;
+  }
+  std::cerr << "A: " << A.num_rows() << "x" << A.num_cols() << " w/ " << A.nnz() << "\n";
+  std::cerr << "local A: " << lA.num_rows() << "x" << lA.num_cols() << " w/ " << lA.nnz() << "\n";
+  std::cerr << "remote A: " << rA.num_rows() << "x" << rA.num_cols() << " w/ " << rA.nnz() << "\n";
+
   std::cerr << "Copy A to GPU\n";
-  CsrMat<Where::device> Ad(A);
+  CsrMat<Where::device> Ad(A), lAd(lA), rAd(rA);

   // Product vector size is same as local rows of A
-  std::vector<float> b(A.num_rows(), 0);
+  std::vector<float> b(lA.num_rows(), 0);
   std::cerr << "Copy b to GPU\n";
-  Array<Where::device, float> bd(b);
+  Array<Where::device, float> lbd(b), rbd(b); // "local b device, remote b device"

-  // plan allgather of complete x
+  // plan allgather of remote x data
   std::cerr << "plan allgather xs\n";
-  std::vector<float> lx(A.num_cols());
-  Array<Where::device, float> lxd(A.num_cols());
   std::vector<int> recvcounts;
   std::vector<int> displs;
   for (int i = 0; i < size; ++i) {
@@ -589,42 +752,96 @@ int main (int argc, char **argv) {
     }
   }

-
+  int loPrio, hiPrio;
+  CUDA_RUNTIME(cudaDeviceGetStreamPriorityRange(&loPrio, &hiPrio));
+
+  cudaStream_t loS, hiS; // "lo/hi prio"
+  CUDA_RUNTIME(cudaStreamCreateWithPriority(&loS, cudaStreamNonBlocking, loPrio));
+  CUDA_RUNTIME(cudaStreamCreateWithPriority(&hiS, cudaStreamNonBlocking, hiPrio));
+
+  cudaEvent_t event;
+  CUDA_RUNTIME(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
+
+  const int nIters = 30;
+  std::vector<double> times(nIters);
+
+  /* ===== multiply in one shot
+  */

-  cudaStream_t stream;
-  CUDA_RUNTIME(cudaStreamCreate(&stream));

   // do spmv
   dim3 dimBlock(256);
   dim3 dimGrid(100);
-  for (int i = 0; i < 10; ++i) {
+  nvtxRangePush("one-shot");
+  for (int i = 0; i < nIters; ++i) {
+    MPI_Barrier(MPI_COMM_WORLD);
+    double start = MPI_Wtime();

     // distribute x to ranks
-    std::cerr << "Allgather xs\n";
-    MPI_Allgatherv(x.data(), x.size(), MPI_FLOAT, lx.data(), recvcounts.data(), displs.data(), MPI_FLOAT, MPI_COMM_WORLD);
+    MPI_Allgatherv(lx.data() + displs[rank], recvcounts[rank], MPI_FLOAT, lx.data(), recvcounts.data(), displs.data(), MPI_FLOAT, MPI_COMM_WORLD);

-    std::cerr << "Copy x to GPU\n";
-    lxd.set_from(lx, stream);
+    // copy x to GPU
+    lxd.set_from(lx, hiS);

-    std::cerr << "launch spmv\n";
-    spmv<<<dimGrid, dimBlock, 0, stream>>>(bd.view(), Ad.view(), lxd.view());
+    spmv<<<dimGrid, dimBlock, 0, hiS>>>(lbd.view(), Ad.view(), lxd.view(), ProductConfig::SET);
     CUDA_RUNTIME(cudaGetLastError());
-    CUDA_RUNTIME(cudaStreamSynchronize(stream));
-    std::cerr << "done spmv\n";
-
-    // copy product back to host
-    // b = bd;
-
-    // for (int r = 0; r < size; ++r) {
-    //   MPI_Barrier(MPI_COMM_WORLD);
-    //   if (rank == r) {
-    //     for (auto &e : b) {
-    //       std::cout << e << "\n";
-    //     }
-    //   }
-    // }
-
+    CUDA_RUNTIME(cudaStreamSynchronize(hiS));
+    times[i] = MPI_Wtime() - start;
   }
+  nvtxRangePop(); // one-shot
+  MPI_Allreduce(MPI_IN_PLACE, times.data(), times.size(), MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+  if (0 == rank) {
+    std::sort(times.begin(), times.end());
+    std::cerr << times[times.size() / 2] << "\n";
+  }
+
+
+  /* ===== split local and remote
+     multiply local, gather & multiply remote
+     TODO: the separate add launch can be removed if it is ensured
+     that the remote happens strictly after the local.
+     It's a small false serialization, but if we're in the case
+     where that matters, the launch overhead dominates anyway.
+  */
+  nvtxRangePush("local/remote");
+  for (int i = 0; i < nIters; ++i) {
+
+    MPI_Barrier(MPI_COMM_WORLD);
+    double start = MPI_Wtime();
+
+    // overlap MPI with CUDA kernel launch
+    MPI_Request req;
+    MPI_Iallgatherv(lx.data() + displs[rank], recvcounts[rank], MPI_FLOAT, lx.data(), recvcounts.data(), displs.data(), MPI_FLOAT, MPI_COMM_WORLD, &req);
+
+    spmv<<<dimGrid, dimBlock, 0, hiS>>>(lbd.view(), lAd.view(), lxd.view(), ProductConfig::SET);
+    CUDA_RUNTIME(cudaGetLastError());
+
+    MPI_Wait(&req, MPI_STATUS_IGNORE);
+
+    rxd.set_from(lx, loS);
+
+    // hiS blocks until transfer is done
+    CUDA_RUNTIME(cudaEventRecord(event, loS));
+    CUDA_RUNTIME(cudaStreamWaitEvent(hiS, event, 0));
+
+    spmv<<<dimGrid, dimBlock, 0, hiS>>>(rbd.view(), rAd.view(), rxd.view(), ProductConfig::MODIFY);
+    CUDA_RUNTIME(cudaGetLastError());
+
+    // all is done when hiS is done
+    CUDA_RUNTIME(cudaStreamSynchronize(hiS));
+    times[i] = MPI_Wtime() - start;
+  }
+  nvtxRangePop(); // local/remote
+  MPI_Allreduce(MPI_IN_PLACE, times.data(), times.size(), MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+  if (0 == rank) {
+    std::sort(times.begin(), times.end());
+    std::cerr << times[times.size() / 2] << "\n";
+  }
+
+  // maybe better to atomic add into result than doing separate kernel launch?
+
+  CUDA_RUNTIME(cudaStreamDestroy(loS));
+  CUDA_RUNTIME(cudaStreamDestroy(hiS));

   MPI_Finalize();
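
The body of the allgather "plan" loop falls between hunks, so the diff never shows how `recvcounts` and `displs` are filled. A minimal sketch of what it presumably contains, derived from `get_partition` and the size check in `receive_x` (hypothetical, not part of the commit):

```
// each rank contributes its own partition of x, so the counts and
// offsets for MPI_Allgatherv come straight from get_partition
for (int i = 0; i < size; ++i) {
  Range r = get_partition(n, i, size); // entries of x owned by rank i
  recvcounts.push_back(r.ub - r.lb);   // how many entries rank i contributes
  displs.push_back(r.lb);              // where they land in the full x
}
```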
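
Note that the timed local/remote loop accumulates the remote product into `rbd`, so `lbd` and `rbd` are never combined in the shown hunks; the "separate add launch" the TODO refers to would presumably combine them with the `add` kernel, along these lines (hypothetical placement, after both spmv launches):

```
// lbd += rbd, on the same stream so it orders after both products
add<<<dimGrid, dimBlock, 0, hiS>>>(lbd.view(), rbd.view());
CUDA_RUNTIME(cudaGetLastError());
```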
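
On the closing "atomic add" comment: if both products accumulated into a single zeroed `b` with `atomicAdd`, the local and remote kernels could run concurrently on `loS`/`hiS` with no event ordering between them, at the cost of atomic traffic. A sketch under that assumption; `spmv_atomic` is hypothetical, and its inner loop is a guess at what the elided `spmv` body does with the `View` members:

```
// hypothetical: accumulate the product directly into b with atomics so
// concurrent local/remote launches cannot race on b(r)
__global__ void spmv_atomic(ArrayView<float> b,
                            const CsrMat<Where::device>::View A,
                            const ArrayView<float> x) {
  // one thread per row, same launch shape as spmv
  for (int r = blockDim.x * blockIdx.x + threadIdx.x; r < A.num_rows(); r += blockDim.x * gridDim.x) {
    float acc = 0;
    for (int ci = A.rowPtr_(r); ci < A.rowPtr_(r + 1); ++ci) {
      acc += A.val_(ci) * x(A.colInd_(ci));
    }
    atomicAdd(&b(r), acc); // requires b zeroed before the first product
  }
}
```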