provisional distributed spmv

@@ -87,7 +87,7 @@ endfunction()
 
 if (MPI_FOUND)
   add_executable(main main.cu)
-  target_include_directories(main PRIVATE SYSTEM ${MPI_CXX_INCLUDE_DIRS})
+  target_include_directories(main SYSTEM PRIVATE ${MPI_CXX_INCLUDE_DIRS})
   target_link_libraries(main ${MPI_CXX_LIBRARIES})
   target_link_libraries(main CUDA::nvToolsExt)
   # target_include_directories(main PRIVATE ${MPI_CXX_INCLUDE_PATH})
@@ -99,10 +99,10 @@ endif()
 
 if (MPI_FOUND)
   add_executable(overlap overlap.cu)
-  target_include_directories(overlap PRIVATE SYSTEM ${MPI_CXX_INCLUDE_DIRS})
+  target_include_directories(overlap SYSTEM PRIVATE ${MPI_CXX_INCLUDE_DIRS})
   target_link_libraries(overlap ${MPI_CXX_LIBRARIES})
   target_link_libraries(overlap CUDA::nvToolsExt)
-  set_cxx_options(overlap)
+  set_cuda_options(overlap)
   set_cxx_standard(overlap)
 endif()
 
README.md | 16
@@ -24,6 +24,22 @@ module load cde/v2/cmake/3.19.2
 mpirun -n 2 ~/software/nsight-systems-cli/2021.2.1/bin/nsys profile -c cudaProfilerApi -t cuda,mpi,nvtx -o dist-spmv_%q{OMPI_COMM_WORLD_RANK} -f true ./main
 ```
 
+### To build with OpenMPI 4.1.1
+
+```
+module purge
+module load sems-env
+module load sems-cmake/3.19.1
+module load sems-gcc/7.2.0
+module load sems-cuda/10.1
+cmake .. -DCMAKE_PREFIX_PATH=~cwpears/software/openmpi-4.1.1-cuda10.1-gcc7.2
+```
+
+```
+~cwpears/software/openmpi-4.1.1-cuda10.1-gcc7.2/bin/mpirun -n 2 ./overlap
+```
+
 ## Design Considerations
 
 Minimize CUDA runtime calls
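The `nsys` invocation above uses `-c cudaProfilerApi`, which only records the region the application delimits with the CUDA profiler API. A minimal sketch of that delimiting, assuming the timed loop is the region of interest (the loop body below is illustrative, not this repo's code):

```
#include <cuda_profiler_api.h>
#include <cuda_runtime.h>

int main() {
  // ... setup: build matrices, create streams, etc. ...
  cudaProfilerStart();              // nsys -c cudaProfilerApi begins capture here
  for (int i = 0; i < 30; ++i) {
    // timed work: kernels, MPI exchanges, ...
  }
  cudaDeviceSynchronize();
  cudaProfilerStop();               // capture ends here
  return 0;
}
```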
at.hpp | 5 (new file)
@@ -0,0 +1,5 @@
+#pragma once
+
+#define STRINGIFY(x) #x
+#define TOSTRING(x) STRINGIFY(x)
+#define AT __FILE__ ":" TOSTRING(__LINE__)
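`AT` expands to the call site as a string literal, e.g. "csr_mat.hpp:210", which is how the exceptions elsewhere in this commit (`throw std::logic_error(AT)`) report where they were thrown. A small stand-alone sketch; the checked function is made up for illustration:

```
#include <iostream>
#include <stdexcept>
#include "at.hpp"

void check_positive(int n) {
  if (n <= 0) {
    throw std::logic_error(AT);   // what() is "<file>:<line>" of this throw
  }
}

int main() {
  try {
    check_positive(-1);
  } catch (const std::logic_error &e) {
    std::cerr << "error at " << e.what() << "\n";
  }
  return 0;
}
```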
csr_mat.hpp | 76
@@ -6,6 +6,8 @@
 #include "coo_mat.hpp"
 #include "algorithm.hpp"
 
+#include <cassert>
+
 template <Where where>
 class CsrMat {
 public:
@@ -161,6 +163,16 @@ public:
   CsrMat(CsrMat &&other) = delete;
   CsrMat(const CsrMat &other) = delete;
 
+  CsrMat &operator=(CsrMat &&rhs) {
+    if (this != &rhs) {
+      rowPtr_ = std::move(rhs.rowPtr_);
+      colInd_ = std::move(rhs.colInd_);
+      val_ = std::move(rhs.val_);
+      numCols_ = std::move(rhs.numCols_);
+    }
+    return *this;
+  }
+
   // create device matrix from host
   CsrMat(const CsrMat<Where::host> &m) :
     rowPtr_(m.rowPtr_), colInd_(m.colInd_), val_(m.val_), numCols_(m.numCols_) {
@@ -193,4 +205,68 @@ public:
     return v;
   }
 
+};
+
+// mxn random matrix with nnz
+CsrMat<Where::host> random_matrix(const int64_t m, const int64_t n, const int64_t nnz) {
+
+  if (m * n < nnz) {
+    throw std::logic_error(AT);
+  }
+
+  CooMat coo(m,n);
+  while(coo.nnz() < nnz) {
+
+    int64_t toPush = nnz - coo.nnz();
+    std::cerr << "adding " << toPush << " non-zeros\n";
+    for (int64_t _ = 0; _ < toPush; ++_) {
+      int r = rand() % m;
+      int c = rand() % n;
+      float e = 1.0;
+      coo.push_back(r, c, e);
+    }
+    std::cerr << "removing duplicate non-zeros\n";
+    coo.remove_duplicates();
+  }
+  coo.sort();
+  std::cerr << "coo: " << coo.num_rows() << "x" << coo.num_cols() << "\n";
+  CsrMat<Where::host> csr(coo);
+  std::cerr << "csr: " << csr.num_rows() << "x" << csr.num_cols() << " w/ " << csr.nnz() << "\n";
+  return csr;
+};
+
+// nxn diagonal matrix with bandwidth b
+CsrMat<Where::host> random_band_matrix(const int64_t n, const int64_t bw, const int64_t nnz) {
+
+  CooMat coo(n,n);
+  while(coo.nnz() < nnz) {
+
+    int64_t toPush = nnz - coo.nnz();
+    std::cerr << "adding " << toPush << " non-zeros\n";
+    for (int64_t _ = 0; _ < toPush; ++_) {
+      int r = rand() % n; // random row
+
+      // column in the band
+      int lb = r - bw;
+      int ub = r + bw + 1;
+      int64_t c = rand() % (ub - lb) + lb;
+      if (c < 0 || c >= n) {
+        // retry, don't over-weight first or last column
+        continue;
+      }
+      float e = 1.0;
+
+      assert(c < n);
+      assert(r < n);
+      coo.push_back(r, c, e);
+    }
+    std::cerr << "removing duplicate non-zeros\n";
+    coo.remove_duplicates();
+  }
+  coo.sort();
+  std::cerr << "coo: " << coo.num_rows() << "x" << coo.num_cols() << "\n";
+  CsrMat<Where::host> csr(coo);
+  std::cerr << "csr: " << csr.num_rows() << "x" << csr.num_cols() << " w/ " << csr.nnz() << "\n";
+  return csr;
+};
 };
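Compared with the copy previously in overlap.cu (removed below), this version tightens the band check from `c > n` to `c >= n` and asserts the result: for n = 10, bw = 2, r = 9 the candidate range is [7, 12), so c = 10 or 11 must be rejected, and the old check let c == 10 through. A self-contained sketch of the corrected sampling (the function name is chosen here for illustration):

```
#include <cassert>
#include <cstdlib>

// Pick a column within the band around row r, retrying out-of-range candidates.
int sample_band_col(int r, int n, int bw) {
  for (;;) {
    int lb = r - bw;          // band lower bound (may be negative)
    int ub = r + bw + 1;      // band upper bound (may exceed n)
    int c = rand() % (ub - lb) + lb;
    if (c < 0 || c >= n) {
      continue;               // retry; don't over-weight the first or last column
    }
    assert(c >= 0 && c < n);
    return c;
  }
}
```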
overlap.cu | 108
@@ -10,91 +10,13 @@
 #include <iostream>
 #include <map>
 
+//#define VIEW_CHECK_BOUNDS
+
+#include "at.hpp"
 #include "cuda_runtime.hpp"
 #include "csr_mat.hpp"
 #include "row_part_spmv.cuh"
 
-#define STRINGIFY(x) #x
-#define TOSTRING(x) STRINGIFY(x)
-#define AT __FILE__ ":" TOSTRING(__LINE__)
-
-//#define VIEW_CHECK_BOUNDS
-
-
-// mxn random matrix with nnz
-CsrMat<Where::host> random_matrix(const int64_t m, const int64_t n, const int64_t nnz) {
-
-  if (m * n < nnz) {
-    throw std::logic_error(AT);
-  }
-
-  CooMat coo(m,n);
-  while(coo.nnz() < nnz) {
-
-    int64_t toPush = nnz - coo.nnz();
-    std::cerr << "adding " << toPush << " non-zeros\n";
-    for (int64_t _ = 0; _ < toPush; ++_) {
-      int r = rand() % m;
-      int c = rand() % n;
-      float e = 1.0;
-      coo.push_back(r, c, e);
-    }
-    std::cerr << "removing duplicate non-zeros\n";
-    coo.remove_duplicates();
-  }
-  coo.sort();
-  std::cerr << "coo: " << coo.num_rows() << "x" << coo.num_cols() << "\n";
-  CsrMat<Where::host> csr(coo);
-  std::cerr << "csr: " << csr.num_rows() << "x" << csr.num_cols() << " w/ " << csr.nnz() << "\n";
-  return csr;
-};
-
-// nxn diagonal matrix with bandwidth b
-CsrMat<Where::host> random_band_matrix(const int64_t n, const int64_t bw, const int64_t nnz) {
-
-  CooMat coo(n,n);
-  while(coo.nnz() < nnz) {
-
-    int64_t toPush = nnz - coo.nnz();
-    std::cerr << "adding " << toPush << " non-zeros\n";
-    for (int64_t _ = 0; _ < toPush; ++_) {
-      int r = rand() % n; // random row
-
-      // column in the band
-      int lb = r - bw;
-      int ub = r + bw + 1;
-      int64_t c = rand() % (ub - lb) + lb;
-      if (c < 0 || c > n) {
-        continue; // don't over-weight first or last column
-      }
-
-      float e = 1.0;
-      coo.push_back(r, c, e);
-    }
-    std::cerr << "removing duplicate non-zeros\n";
-    coo.remove_duplicates();
-  }
-  coo.sort();
-  std::cerr << "coo: " << coo.num_rows() << "x" << coo.num_cols() << "\n";
-  CsrMat<Where::host> csr(coo);
-  std::cerr << "csr: " << csr.num_rows() << "x" << csr.num_cols() << " w/ " << csr.nnz() << "\n";
-  return csr;
-};
-
-std::vector<float> random_vector(const int64_t n) {
-  return std::vector<float>(n, 1.0);
-}
-
-Array<Where::host, float> random_array(const int64_t n) {
-  return Array<Where::host, float>(n, 1.0);
-}
-
-#if 0
-int send_x(int dst, int src, std::vector<float> &&v, MPI_Comm comm) {
-  MPI_Send(v.data(), v.size(), MPI_FLOAT, dst, Tag::x, comm);
-  return 0;
-}
-#endif
-
 /* recv some amount of data, and put it in the right place
    in a full x
@@ -152,10 +74,10 @@ int main (int argc, char **argv) {
   // int64_t n = 150000;
   // int64_t nnz = 11000000;
   // or
-  int64_t m = 150000;
+  int64_t m = 15000;
   int64_t n = m;
   int64_t bw = m/size; // ~50% local vs remote non-zeros for most ranks
-  int64_t nnz = 11000000;
+  int64_t nnz = 1100000;
 
   CsrMat<Where::host> A; // "local A"
 
@@ -168,29 +90,21 @@ int main (int argc, char **argv) {
 
   RowPartSpmv spmv(A, 0, MPI_COMM_WORLD);
 
+  if (0 == rank) {
   std::cerr << "A: " << A.num_rows() << "x" << A.num_cols() << " w/ " << A.nnz() << "\n";
+  }
   std::cerr << "local A: " << spmv.lA().num_rows() << "x" << spmv.lA().num_cols() << " w/ " << spmv.lA().nnz() << "\n";
   std::cerr << "remote A: " << spmv.rA().num_rows() << "x" << spmv.rA().num_cols() << " w/ " << spmv.rA().nnz() << "\n";
 
-  int loPrio, hiPrio;
-  CUDA_RUNTIME(cudaDeviceGetStreamPriorityRange (&loPrio, &hiPrio));
-
-  cudaStream_t loS, hiS; // "lo/hi prio"
-  CUDA_RUNTIME(cudaStreamCreateWithPriority(&loS, cudaStreamNonBlocking, hiPrio));
-  CUDA_RUNTIME(cudaStreamCreateWithPriority(&hiS, cudaStreamNonBlocking, hiPrio));
-
-  cudaEvent_t event;
-  CUDA_RUNTIME(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
-
-  const int nIters = 30;
+  const int nIters = 1;
   std::vector<double> times(nIters);
 
   nvtxRangePush("overlap");
   for (int i = 0; i < nIters; ++i) {
     MPI_Barrier(MPI_COMM_WORLD);
     double start = MPI_Wtime();
+    spmv.pack_x_async();
+    spmv.pack_x_wait();
     spmv.send_x_async();
     spmv.launch_local();
     spmv.recv_x_async();
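The loop body above is the overlap schedule this commit is building toward: pack x for the neighbors, wait so MPI can safely read the buffer, post the sends, run the local SpMV while messages are in flight, then post the receives. A self-contained sketch of the same schedule with plain MPI + CUDA calls (a toy kernel and a single neighbor stand in for RowPartSpmv; nothing below is the repo's code):

```
#include <mpi.h>
#include <cuda_runtime.h>

__global__ void local_work(float *y, const float *x, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) y[i] = 2.0f * x[i];
}

int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);
  int rank, size;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  const int n = 1 << 20;
  float *x, *y, *sendBuf, *recvBuf;
  cudaMalloc(&x, n * sizeof(float));
  cudaMalloc(&y, n * sizeof(float));
  cudaMemset(x, 0, n * sizeof(float));
  cudaMallocHost(&sendBuf, n * sizeof(float)); // pinned staging buffers for MPI
  cudaMallocHost(&recvBuf, n * sizeof(float));

  cudaStream_t packStream, kernelStream;
  cudaStreamCreate(&packStream);
  cudaStreamCreate(&kernelStream);

  const int peer = (rank + 1) % size;
  MPI_Request sreq, rreq;

  // "pack": stage the values the neighbor needs (here, all of x) into the send buffer
  cudaMemcpyAsync(sendBuf, x, n * sizeof(float), cudaMemcpyDeviceToHost, packStream);
  cudaStreamSynchronize(packStream);   // like pack_x_wait(): MPI reads sendBuf next

  MPI_Isend(sendBuf, n, MPI_FLOAT, peer, 0, MPI_COMM_WORLD, &sreq);
  MPI_Irecv(recvBuf, n, MPI_FLOAT, peer, 0, MPI_COMM_WORLD, &rreq);

  // local compute overlaps the messages that are in flight
  local_work<<<(n + 127) / 128, 128, 0, kernelStream>>>(y, x, n);

  MPI_Wait(&rreq, MPI_STATUS_IGNORE);  // the remote contribution would be applied here
  MPI_Wait(&sreq, MPI_STATUS_IGNORE);
  cudaStreamSynchronize(kernelStream);

  cudaStreamDestroy(packStream);
  cudaStreamDestroy(kernelStream);
  cudaFree(x); cudaFree(y);
  cudaFreeHost(sendBuf); cudaFreeHost(recvBuf);
  MPI_Finalize();
  return 0;
}
```

With a CUDA-aware MPI (such as the openmpi-4.1.1-cuda10.1 build referenced in the README), the staging copies could instead hand device pointers directly to MPI_Isend/MPI_Irecv.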
@@ -173,6 +173,7 @@ public:
   }
 
   void pack_x_async() {
+    assert(xSendBuf_.size() == xSendIdx_.size());
     scatter<<<100,128, 0, packStream_>>>(xSendBuf_.view(), lx_.view(), xSendIdx_.view());
   }
 
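The new assert guards the packing kernel's precondition: one send-buffer slot per index. The repo's `scatter` kernel is not shown in this diff; a packing kernel of this shape plausibly looks like the sketch below, with raw pointers standing in for the repo's view types:

```
__global__ void pack(float *out, const float *x, const int *idx, int n) {
  // grid-stride loop: out[i] receives the x entry named by idx[i]
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += gridDim.x * blockDim.x) {
    out[i] = x[idx[i]];
  }
}
```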
@@ -182,9 +183,12 @@ public:
 
   void send_x_async() {
 
+    std::cerr << "send_x_async(): send to " << sendParams_.size() << " ranks\n";
+
     // send to neighbors who want it
     for (auto &p : sendParams_) {
       int tag = 0;
+      assert(xSendBuf_.size() >= p.displ + p.count);
       MPI_Isend(xSendBuf_.data() + p.displ, p.count, MPI_FLOAT, p.dst, tag, comm_, &p.req);
     }
   }
@@ -209,8 +213,10 @@ public:
     CUDA_RUNTIME(cudaStreamSynchronize(kernelStream_));
   }
 
-  void launch_local_spmv() {}
-  void launch_remote_spmv() {}
+  ~RowPartSpmv() {
+    CUDA_RUNTIME(cudaStreamDestroy(kernelStream_)); kernelStream_ = 0;
+    CUDA_RUNTIME(cudaStreamDestroy(packStream_)); packStream_ = 0;
+  }
 
   /* create from a matrix at root
   */
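With this destructor in place, the streams created in the constructor (shown in the next hunk) are released when the object goes out of scope. Reduced to a stand-alone class, the pairing looks like the sketch below (plain CUDA calls instead of the repo's CUDA_RUNTIME error-check macro):

```
#include <cuda_runtime.h>

class StreamPair {
  cudaStream_t kernelStream_{}, packStream_{};
public:
  StreamPair() {
    cudaStreamCreate(&kernelStream_);   // created once, reused every iteration
    cudaStreamCreate(&packStream_);
  }
  ~StreamPair() {
    cudaStreamDestroy(kernelStream_); kernelStream_ = 0;
    cudaStreamDestroy(packStream_);   packStream_ = 0;
  }
  StreamPair(const StreamPair &) = delete;             // avoid double-destroy
  StreamPair &operator=(const StreamPair &) = delete;
};
```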
@@ -218,11 +224,14 @@ public:
     const CsrMat<Where::host> &wholeA,
     const int root,
     MPI_Comm comm
-  ) {
+  ) : comm_(comm) {
+
+    CUDA_RUNTIME(cudaStreamCreate(&kernelStream_));
+    CUDA_RUNTIME(cudaStreamCreate(&packStream_));
 
     int rank, size;
-    MPI_Comm_rank(comm, &rank);
-    MPI_Comm_size(comm, &size);
+    MPI_Comm_rank(comm_, &rank);
+    MPI_Comm_size(comm_, &size);
 
     CsrMat<Where::host> a;
     if (root == rank) {
@@ -231,27 +240,34 @@ public:
       for (size_t dst = 0; dst < size; ++dst) {
         if (root != dst) {
           std::cerr << "send A to " << dst << "\n";
-          send_matrix(dst, 0, std::move(as[dst]), MPI_COMM_WORLD);
+          send_matrix(dst, 0, std::move(as[dst]), comm_);
         }
       }
       a = as[rank];
     } else {
       std::cerr << "recv A at " << rank << "\n";
-      a = receive_matrix(rank, 0, MPI_COMM_WORLD);
+      a = receive_matrix(rank, 0, comm_);
     }
 
     // split row part of a into local and global
     SplitCooMat scm = split_local_remote(a, comm);
+    la_ = std::move(scm.local);
+    ra_ = std::move(scm.remote);
+    assert(la_.nnz() + ra_.nnz() == a.nnz() && "lost a non-zero during split");
     loff_ = scm.loff;
 
     // create local part of x array
     // undefined entries
     Range xrange = get_partition(a.num_cols(), rank, size);
     lx_ = Array<Where::device, float>(xrange.extent());
+    ly_ = Array<Where::device, float>(la_.num_rows());
 
     // create remote part of x array
     // one entry per remote column
     rx_ = Array<Where::device,float>(scm.globals.size());
+    if (0 == rx_.size()) {
+      std::cerr << "WARN: not receiving anything\n";
+    }
 
     // determine which columns needed from others
     std::map<int, std::vector<int>> recvCols;
@@ -261,6 +277,24 @@ public:
       recvCols[src].push_back(c);
     }
 
+#if 1
+    for (int r = 0; r < size; ++r) {
+      MPI_Barrier(comm_);
+      if (r == rank) {
+        std::cerr << "rank " << rank << "recvCols:\n";
+        for (auto it = recvCols.begin(); it != recvCols.end(); ++it) {
+          std::cerr << "from " << it->first << ": ";
+          for (auto &c : it->second) {
+            std::cerr << c << " ";
+          }
+          std::cerr << "\n";
+        }
+      }
+      MPI_Barrier(comm_);
+    }
+#endif
+
+
     // create receive parameters
     int offset = 0;
     for (auto it = recvCols.begin(); it != recvCols.end(); ++it) {
@@ -272,15 +306,36 @@ public:
       recvParams_.push_back(param);
     }
 
+
+#if 1
+    for (int r = 0; r < size; ++r) {
+      MPI_Barrier(comm_);
+      if (r == rank) {
+        std::cerr << "rank " << rank << " recvParams:\n";
+        for (RecvParam &p : recvParams_) {
+          std::cerr
+          << "src=" << p.src
+          << " displ=" << p.displ
+          << " count=" << p.count
+          << "\n";
+        }
+      }
+      MPI_Barrier(comm_);
+    }
+#endif
+
+
+
     // tell others which cols I need (send 0 if nothing)
     std::vector<MPI_Request> reqs(size);
     for (int dest = 0; dest < size; ++dest) {
       auto it = recvCols.find(dest);
       if (it != recvCols.end()) {
-        MPI_Isend(it->second.data(), it->second.size(), MPI_INT, dest, 0, comm, &reqs[dest]);
+        assert(it->second.data());
+        MPI_Isend(it->second.data(), it->second.size(), MPI_INT, dest, 0, comm_, &reqs[dest]);
       } else {
         int _;
-        MPI_Isend(&_, 0, MPI_INT, dest, 0, comm, &reqs[dest]);
+        MPI_Isend(&_ /*junk*/, 0, MPI_INT, dest, 0, comm_, &reqs[dest]);
       }
     }
 
@@ -293,10 +348,10 @@ public:
       MPI_Get_count(&status, MPI_INT, &count);
       if (count != 0) {
         sendCols[src].resize(count);
-        MPI_Recv(sendCols[src].data(), count, MPI_INT, src, 0, comm, MPI_STATUS_IGNORE);
+        MPI_Recv(sendCols[src].data(), count, MPI_INT, src, 0, comm_, MPI_STATUS_IGNORE);
       } else {
         int _;
-        MPI_Recv(&_, 0, MPI_INT, src, 0, comm, MPI_STATUS_IGNORE);
+        MPI_Recv(&_, 0, MPI_INT, src, 0, comm_, MPI_STATUS_IGNORE);
       }
     }
 
@@ -310,19 +365,20 @@ public:
       param.dst = it->first;
       for (int gc : it->second) {
         int lc = gc - scm.loff;
+        assert(lc >= 0);
+        assert(lc < lx_.size());
        offsets.push_back(lc);
       }
       param.count = offsets.size() - param.displ;
       sendParams_.push_back(param);
     }
+    // device version of offsets for packing
+    xSendIdx_ = offsets;
+    // buffer that x values will be placed into for sending
+    xSendBuf_.resize(xSendIdx_.size());
 
-    assert(la_.size() > 0);
-    assert(ra_.size() > 0); // remote A
     assert(lx_.size() > 0);
-    assert(rx_.size() > 0);
     assert(ly_.size() > 0);
 
   }
 };
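The constructor above finishes with a handshake: every rank tells every other rank which x columns it needs (a zero-count message if it needs nothing), then learns which columns it must send in return. A stand-alone sketch of that exchange with plain MPI (the toy request lists are illustrative, not the repo's):

```
#include <mpi.h>
#include <cstdio>
#include <map>
#include <vector>

int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);
  int rank, size;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  // toy request: every rank wants one column from every other rank
  std::map<int, std::vector<int>> recvCols;
  for (int r = 0; r < size; ++r) {
    if (r != rank) recvCols[r] = {rank};   // the column index is arbitrary here
  }

  // tell others which cols I need (send 0 if nothing)
  std::vector<MPI_Request> reqs(size);
  int junk = 0;
  for (int dest = 0; dest < size; ++dest) {
    auto it = recvCols.find(dest);
    if (it != recvCols.end()) {
      MPI_Isend(it->second.data(), (int)it->second.size(), MPI_INT, dest, 0, MPI_COMM_WORLD, &reqs[dest]);
    } else {
      MPI_Isend(&junk, 0, MPI_INT, dest, 0, MPI_COMM_WORLD, &reqs[dest]);
    }
  }

  // learn which cols others need from me
  std::map<int, std::vector<int>> sendCols;
  for (int src = 0; src < size; ++src) {
    MPI_Status status;
    MPI_Probe(src, 0, MPI_COMM_WORLD, &status);
    int count;
    MPI_Get_count(&status, MPI_INT, &count);
    if (count != 0) {
      sendCols[src].resize(count);
      MPI_Recv(sendCols[src].data(), count, MPI_INT, src, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    } else {
      int dummy;
      MPI_Recv(&dummy, 0, MPI_INT, src, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    }
  }

  MPI_Waitall(size, reqs.data(), MPI_STATUSES_IGNORE);
  printf("rank %d: answering requests from %zu ranks\n", rank, sendCols.size());
  MPI_Finalize();
  return 0;
}
```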
@@ -45,6 +45,7 @@ SplitCooMat split_local_remote(const CsrMat<Where::host> &m, MPI_Comm comm) {
 
   // which rows of x are local
   Range localRange = get_partition(m.num_cols(), rank, size);
+  std::cerr << "[" << localRange.lb <<","<< localRange.ub << ")\n";
   int loff = localRange.lb;
 
   // build two matrices, local gets local non-zeros, remote gets remote non-zeros