add CUDA-aware persistent and send/recv

Carl Pearson
2021-06-02 16:50:54 -04:00
parent d29de778d2
commit 16de9c455b
4 changed files with 340 additions and 10 deletions

CMakeLists.txt

@@ -127,3 +127,19 @@ if (MPI_FOUND AND CMAKE_CUDA_COMPILER)
   set_cxx_options(one-sided-gpu)
   set_cxx_standard(one-sided-gpu)
 endif()
+
+if (MPI_FOUND AND CMAKE_CUDA_COMPILER)
+  add_executable(persistent-gpu persistent_gpu.cpp)
+  target_link_libraries(persistent-gpu MPI::MPI_CXX)
+  target_link_libraries(persistent-gpu CUDA::cudart)
+  set_cxx_options(persistent-gpu)
+  set_cxx_standard(persistent-gpu)
+endif()
+
+if (MPI_FOUND AND CMAKE_CUDA_COMPILER)
+  add_executable(send-recv-gpu send_recv_gpu.cpp)
+  target_link_libraries(send-recv-gpu MPI::MPI_CXX)
+  target_link_libraries(send-recv-gpu CUDA::cudart)
+  set_cxx_options(send-recv-gpu)
+  set_cxx_standard(send-recv-gpu)
+endif()

README.md

@@ -4,19 +4,23 @@ Various standalone C++ MPI tests/examples/benchmarks.
 If CUDA is detected, additional binaries can be built.

-| name               | Kind      | Reqs.    | Ranks | Description |
-|--------------------|-----------|----------|-------|-------------|
-|[hello-world][1]    | Test      | MPI      | 1+    | an MPI hello-world |
-|[one-sided][2]      | Test      | MPI      | 2     | one-sided communication |
-|[one-sided-gpu][3]  | Test      | MPI+CUDA | 2     | one-sided communication on GPU buffer |
-|[persistent][4]     | Benchmark | MPI      | 2     | ping-pong time for persistent Send/Recv |
-|[send-recv][5]      | Benchmark | MPI      | 2     | ping-pong time for Send/Recv |
+| name                | Kind      | Reqs.    | Ranks | Description |
+|---------------------|-----------|----------|-------|-------------|
+|[hello-world][1]     | Test      | MPI      | 1+    | an MPI hello-world |
+|[one-sided][2]       | Test      | MPI      | 2     | one-sided communication |
+|[one-sided-gpu][3]   | Test      | MPI+CUDA | 2     | one-sided communication on GPU buffer |
+|[persistent][4]      | Benchmark | MPI      | 2     | ping-pong time for persistent Send/Recv |
+|[persistent-gpu][5]  | Benchmark | MPI+CUDA | 2     | ping-pong time for persistent Send/Recv on GPU buffer |
+|[send-recv][6]       | Benchmark | MPI      | 2     | ping-pong time for Send/Recv |
+|[send-recv-gpu][7]   | Benchmark | MPI+CUDA | 2     | ping-pong time for Send/Recv on GPU buffer |

 [1]: https://github.com/cwpearson/mpi_test/blob/master/hello_world.cpp
 [2]: https://github.com/cwpearson/mpi_test/blob/master/one_sided.cpp
 [3]: https://github.com/cwpearson/mpi_test/blob/master/one_sided_gpu.cpp
 [4]: https://github.com/cwpearson/mpi_test/blob/master/persistent.cpp
-[5]: https://github.com/cwpearson/mpi_test/blob/master/send_recv.cpp
+[5]: https://github.com/cwpearson/mpi_test/blob/master/persistent_gpu.cpp
+[6]: https://github.com/cwpearson/mpi_test/blob/master/send_recv.cpp
+[7]: https://github.com/cwpearson/mpi_test/blob/master/send_recv_gpu.cpp

 ## Build

 ```
@@ -54,8 +58,8 @@ Execute any binary you want using `mpirun`, or whatever is appropriate for your
 * 1 node: `jsrun -n 1 ./persistent`
 * 2 nodes: `jsrun -n 2 -r 1 -a 1 ./persistent`
 * 2 nodes w/GPU: `jsrun --smpi="-gpu" -n 2 -r 1 -g 1 ./send-recv-gpu`

 ## Notes on specific platforms

-Some Open MPIs use `long long` for their datatypes, which means we can't support ANSI C++ (`-ansi`).
+Some Open MPIs use `long long` for their datatypes, which is not a part of ANSI C++ (`-ansi`).
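
The GPU-buffer benchmarks additionally assume a CUDA-aware MPI (hence `--smpi="-gpu"` in the jsrun example above). As a hedged, Open MPI-specific sketch that is not part of this commit: Open MPI's extension header `mpi-ext.h` exposes `MPIX_Query_cuda_support()` for probing this at runtime; other MPI implementations may not provide it, so the check is guarded.

```
#include <cstdio>
#include <mpi.h>
#if defined(OPEN_MPI) && OPEN_MPI
#include <mpi-ext.h> // Open MPI extension header; defines MPIX_CUDA_AWARE_SUPPORT
#endif

int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);
#if defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
  // Library was built with CUDA support; the query reports the runtime state.
  printf("CUDA-aware at runtime: %s\n", MPIX_Query_cuda_support() ? "yes" : "no");
#else
  printf("cannot determine CUDA awareness from this MPI library\n");
#endif
  MPI_Finalize();
  return 0;
}
```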

persistent_gpu.cpp (new file, 161 lines)

@@ -0,0 +1,161 @@
#include <cstdio>
#include <vector>
#include <cstdlib>

#include <mpi.h>
#include <cuda_runtime.h>

// Abort with a readable message if a CUDA runtime call fails.
inline void checkCuda(cudaError_t result, const char *file, const int line) {
  if (result != cudaSuccess) {
    fprintf(stderr, "%s:%d: CUDA Runtime Error %d: %s\n", file, line, int(result), cudaGetErrorString(result));
    exit(-1);
  }
}
#define CUDA_RUNTIME(stmt) checkCuda(stmt, __FILE__, __LINE__);

// target wall-clock time per sample (200 us)
const float sample_target = 200e-6;

// raw: time for `perSample` ping-pongs; norm: time per ping-pong
struct Sample {
  double raw;
  double norm;
};
// Time `perSample` ping-pong exchanges between ranks 0 and 1 using the
// already-initialized persistent requests. Rank 0 sends first; rank 1 echoes.
static Sample get_sample(int perSample, MPI_Request *sreq, MPI_Request *rreq, int rank, MPI_Comm comm) {
  Sample sample;

  MPI_Barrier(comm);
  double start = MPI_Wtime();
  for (int i = 0; i < perSample; ++i) {
    if (0 == rank) {
      MPI_Start(sreq);
      MPI_Wait(sreq, MPI_STATUS_IGNORE);
      MPI_Start(rreq);
      MPI_Wait(rreq, MPI_STATUS_IGNORE);
    } else if (1 == rank) {
      MPI_Start(rreq);
      MPI_Wait(rreq, MPI_STATUS_IGNORE);
      MPI_Start(sreq);
      MPI_Wait(sreq, MPI_STATUS_IGNORE);
    }
  }
  double stop = MPI_Wtime();

  sample.raw = stop - start;
  sample.norm = sample.raw / perSample;
  return sample;
}
int main(int argc, char **argv) {
  // Initialize the MPI environment
  MPI_Init(&argc, &argv);

  // Get the number of processes and this process's rank
  int size, rank;
  MPI_Comm_size(MPI_COMM_WORLD, &size);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  if (size < 2) {
    printf("need at least 2 ranks!\n");
    exit(1);
  }

  // ping-pong partner: ranks 0 and 1 exchange with each other
  int src = (rank + 1) % 2;
  int dst = (rank + 1) % 2;
  int tag = 0;
  MPI_Request sreq, rreq;

  int numIters = 100;

  // message sizes to sweep (bytes)
  std::vector<size_t> sweep{
      1,
      64,
      128,
      256,
      512,
      1 * 1024,
      2 * 1024,
      4 * 1024,
      8 * 1024,
      16 * 1024,
      32 * 1024,
      64 * 1024,
      128 * 1024,
      256 * 1024,
      512 * 1024,
      1 * 1024 * 1024,
      2 * 1024 * 1024,
      4 * 1024 * 1024,
      8 * 1024 * 1024,
      16 * 1024 * 1024,
      32 * 1024 * 1024,
      64 * 1024 * 1024,
      128 * 1024 * 1024,
      256 * 1024 * 1024,
  };

  if (0 == rank) {
    printf("bytes,min,max,avg,med\n");
  }
  for (size_t bytes : sweep) {
    std::vector<double> samples(numIters);

    // device buffer used by both the send and the receive request
    char *buf;
    CUDA_RUNTIME(cudaMalloc(&buf, bytes));
    MPI_Send_init(buf, bytes, MPI_BYTE, dst, tag, MPI_COMM_WORLD, &sreq);
    MPI_Recv_init(buf, bytes, MPI_BYTE, src, tag, MPI_COMM_WORLD, &rreq);

    // Calibrate how many ping-pongs make up one sample, aiming for roughly
    // sample_target (200us) of wall time per sample.
    int perSample = 1;
    for (int i = 0; i < 10; ++i) {
      double sample = get_sample(perSample, &sreq, &rreq, rank, MPI_COMM_WORLD).raw;
      // estimate number of measurements per sample
      int guess = sample_target / sample + /*rounding*/ 0.5;
      // close half the distance to this estimate
      perSample += (guess - perSample) * 0.5;
      if (perSample < 1) perSample = 1;
      // all ranks use rank 0's estimate
      MPI_Bcast(&perSample, 1, MPI_INT, 0, MPI_COMM_WORLD);
    }
    if (0 == rank) {
      fprintf(stderr, "sample averaged over %d iterations\n", perSample);
    }

    // take numIters samples, each normalized to seconds per ping-pong
    for (int i = 0; i < numIters; ++i) {
      samples[i] = get_sample(perSample, &sreq, &rreq, rank, MPI_COMM_WORLD).norm;
    }

    // each sample is the max time observed across ranks
    MPI_Allreduce(MPI_IN_PLACE, samples.data(), numIters, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);

    // bubble sort, so min / median / max can be read off by index
    bool changed = true;
    while (changed) {
      changed = false;
      for (int i = 0; i < numIters - 1; ++i) {
        if (samples[i] > samples[i + 1]) {
          double tmp = samples[i + 1];
          samples[i + 1] = samples[i];
          samples[i] = tmp;
          changed = true;
        }
      }
    }

    // average
    double avg = 0;
    for (int i = 0; i < numIters; ++i) {
      avg += samples[i];
    }
    avg /= numIters;

    if (0 == rank) {
      printf("%zu,%e,%e,%e,%e\n", bytes, samples[0], samples[numIters - 1], avg, samples[numIters / 2]);
    }

    MPI_Request_free(&sreq);
    MPI_Request_free(&rreq);
    CUDA_RUNTIME(cudaFree(buf));
  }

  MPI_Finalize();
  return 0;
}
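
Both GPU benchmarks emit one CSV row per message size on stdout in the `bytes,min,max,avg,med` format shown above: each of the 100 samples is the per-iteration ping-pong time, reduced with MPI_MAX across ranks, and min, median, and max are read from the sorted samples. The per-size calibration message (`sample averaged over N iterations`) goes to stderr, so redirecting stdout yields a clean CSV.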

send_recv_gpu.cpp (new file, 149 lines)

@@ -0,0 +1,149 @@
#include <cstdio>
#include <vector>
#include <cstdlib>

#include <mpi.h>
#include <cuda_runtime.h>

// Abort with a readable message if a CUDA runtime call fails.
inline void checkCuda(cudaError_t result, const char *file, const int line) {
  if (result != cudaSuccess) {
    fprintf(stderr, "%s:%d: CUDA Runtime Error %d: %s\n", file, line, int(result), cudaGetErrorString(result));
    exit(-1);
  }
}
#define CUDA_RUNTIME(stmt) checkCuda(stmt, __FILE__, __LINE__);

// target wall-clock time per sample (200 us)
const float sample_target = 200e-6;

// raw: time for `perSample` ping-pongs; norm: time per ping-pong
struct Sample {
  double raw;
  double norm;
};
// Time `perSample` ping-pong exchanges of `bytes` bytes between ranks 0 and 1.
// Rank 0 sends first; rank 1 echoes the message back.
static Sample get_sample(int perSample, void *buf, int bytes, int rank, int other, MPI_Comm comm) {
  Sample sample;
  int tag = 0;

  MPI_Barrier(comm);
  double start = MPI_Wtime();
  for (int i = 0; i < perSample; ++i) {
    if (0 == rank) {
      MPI_Send(buf, bytes, MPI_BYTE, other, tag, comm);
      MPI_Recv(buf, bytes, MPI_BYTE, other, tag, comm, MPI_STATUS_IGNORE);
    } else if (1 == rank) {
      MPI_Recv(buf, bytes, MPI_BYTE, other, tag, comm, MPI_STATUS_IGNORE);
      MPI_Send(buf, bytes, MPI_BYTE, other, tag, comm);
    }
  }
  double stop = MPI_Wtime();

  sample.raw = stop - start;
  sample.norm = sample.raw / perSample;
  return sample;
}
int main(int argc, char **argv) {
  // Initialize the MPI environment
  MPI_Init(&argc, &argv);

  // Get the number of processes and this process's rank
  int size, rank;
  MPI_Comm_size(MPI_COMM_WORLD, &size);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  if (size < 2) {
    printf("need at least 2 ranks!\n");
    exit(1);
  }

  // ping-pong partner: ranks 0 and 1 exchange with each other
  int other = (rank + 1) % 2;

  int numIters = 100;

  // message sizes to sweep (bytes)
  std::vector<size_t> sweep{
      1,
      64,
      128,
      256,
      512,
      1 * 1024,
      2 * 1024,
      4 * 1024,
      8 * 1024,
      16 * 1024,
      32 * 1024,
      64 * 1024,
      128 * 1024,
      256 * 1024,
      512 * 1024,
      1 * 1024 * 1024,
      2 * 1024 * 1024,
      4 * 1024 * 1024,
      8 * 1024 * 1024,
      16 * 1024 * 1024,
      32 * 1024 * 1024,
      64 * 1024 * 1024,
      128 * 1024 * 1024,
      256 * 1024 * 1024,
  };

  if (0 == rank) {
    printf("bytes,min,max,avg,med\n");
  }
  for (size_t bytes : sweep) {
    std::vector<double> samples(numIters);

    // device buffer exchanged between the two ranks
    char *buf;
    CUDA_RUNTIME(cudaMalloc(&buf, bytes));

    // Calibrate how many ping-pongs make up one sample, aiming for roughly
    // sample_target (200us) of wall time per sample.
    int perSample = 1;
    for (int i = 0; i < 10; ++i) {
      double sample = get_sample(perSample, buf, bytes, rank, other, MPI_COMM_WORLD).raw;
      // estimate number of measurements per sample
      int guess = sample_target / sample + /*rounding*/ 0.5;
      // close half the distance to this estimate
      perSample += (guess - perSample) * 0.5;
      if (perSample < 1) perSample = 1;
      // all ranks use rank 0's estimate
      MPI_Bcast(&perSample, 1, MPI_INT, 0, MPI_COMM_WORLD);
    }
    if (0 == rank) {
      fprintf(stderr, "sample averaged over %d iterations\n", perSample);
    }

    // take numIters samples, each normalized to seconds per ping-pong
    for (int i = 0; i < numIters; ++i) {
      samples[i] = get_sample(perSample, buf, bytes, rank, other, MPI_COMM_WORLD).norm;
    }

    // each sample is the max time observed across ranks
    MPI_Allreduce(MPI_IN_PLACE, samples.data(), numIters, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);

    // bubble sort, so min / median / max can be read off by index
    bool changed = true;
    while (changed) {
      changed = false;
      for (int i = 0; i < numIters - 1; ++i) {
        if (samples[i] > samples[i + 1]) {
          double tmp = samples[i + 1];
          samples[i + 1] = samples[i];
          samples[i] = tmp;
          changed = true;
        }
      }
    }

    // average
    double avg = 0;
    for (int i = 0; i < numIters; ++i) {
      avg += samples[i];
    }
    avg /= numIters;

    if (0 == rank) {
      printf("%zu,%e,%e,%e,%e\n", bytes, samples[0], samples[numIters - 1], avg, samples[numIters / 2]);
    }

    CUDA_RUNTIME(cudaFree(buf));
  }

  MPI_Finalize();
  return 0;
}
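
Both listings allocate the device buffer with `cudaMalloc` on whatever device is current, which is device 0 unless the launcher restricts GPU visibility (as `jsrun -g 1` does in the README example). On multi-GPU nodes with several ranks per node, a per-rank device selection would typically precede the allocation. A minimal sketch, not part of the commit, assuming one GPU per node-local rank; the helper name and the modulo mapping are illustrative:

```
#include <mpi.h>
#include <cuda_runtime.h>

// Pick a GPU for this rank: group the ranks that share a node, then map the
// node-local rank onto the node's visible devices (modulo the device count).
static void select_device_by_local_rank() {
  MPI_Comm local;
  MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &local);
  int localRank;
  MPI_Comm_rank(local, &localRank);
  MPI_Comm_free(&local);

  int nDev = 0;
  cudaGetDeviceCount(&nDev);
  if (nDev > 0) {
    cudaSetDevice(localRank % nDev);
  }
}

int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);
  select_device_by_local_rank(); // would run before the cudaMalloc calls above
  MPI_Finalize();
  return 0;
}
```

MPI_Comm_split_type with MPI_COMM_TYPE_SHARED groups the ranks that share a node, so the node-local rank can be used to index into that node's GPUs.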