diff --git a/CMakeLists.txt b/CMakeLists.txt
index 93cc614..235b25b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -127,3 +127,19 @@ if (MPI_FOUND AND CMAKE_CUDA_COMPILER)
   set_cxx_options(one-sided-gpu)
   set_cxx_standard(one-sided-gpu)
 endif()
+
+if (MPI_FOUND AND CMAKE_CUDA_COMPILER)
+  add_executable(persistent-gpu persistent_gpu.cpp)
+  target_link_libraries(persistent-gpu MPI::MPI_CXX)
+  target_link_libraries(persistent-gpu CUDA::cudart)
+  set_cxx_options(persistent-gpu)
+  set_cxx_standard(persistent-gpu)
+endif()
+
+if (MPI_FOUND AND CMAKE_CUDA_COMPILER)
+  add_executable(send-recv-gpu send_recv_gpu.cpp)
+  target_link_libraries(send-recv-gpu MPI::MPI_CXX)
+  target_link_libraries(send-recv-gpu CUDA::cudart)
+  set_cxx_options(send-recv-gpu)
+  set_cxx_standard(send-recv-gpu)
+endif()
diff --git a/README.md b/README.md
index 80a05ed..8bde800 100644
--- a/README.md
+++ b/README.md
@@ -4,19 +4,23 @@ Various standalone C++ MPI tests/examples/benchmarks.
 If CUDA is detected, additional binaries can be built.
 
-| name              | Kind      | Reqs.    | Ranks | Description |
-|-------------------|-----------|----------|-------|-------------|
-|[hello-world][1]   | Test      | MPI      | 1+    | an MPI hello-world |
-|[one-sided][2]     | Test      | MPI      | 2     | one-sided communication |
-|[one-sided-gpu][3] | Test      | MPI+CUDA | 2     | one-sided communication on GPU buffer |
-|[persistent][4]    | Benchmark | MPI      | 2     | ping-pong time for persistent Send/Recv |
-|[send-recv][5]     | Benchmark | MPI      | 2     | ping-pong time for Send/Recv |
+| name               | Kind      | Reqs.    | Ranks | Description |
+|--------------------|-----------|----------|-------|-------------|
+|[hello-world][1]    | Test      | MPI      | 1+    | an MPI hello-world |
+|[one-sided][2]      | Test      | MPI      | 2     | one-sided communication |
+|[one-sided-gpu][3]  | Test      | MPI+CUDA | 2     | one-sided communication on GPU buffer |
+|[persistent][4]     | Benchmark | MPI      | 2     | ping-pong time for persistent Send/Recv |
+|[persistent-gpu][5] | Benchmark | MPI+CUDA | 2     | ping-pong time for persistent Send/Recv on GPU buffer |
+|[send-recv][6]      | Benchmark | MPI      | 2     | ping-pong time for Send/Recv |
+|[send-recv-gpu][7]  | Benchmark | MPI+CUDA | 2     | ping-pong time for Send/Recv on GPU buffer |
 
 [1]: https://github.com/cwpearson/mpi_test/blob/master/hello_world.cpp
 [2]: https://github.com/cwpearson/mpi_test/blob/master/one_sided.cpp
 [3]: https://github.com/cwpearson/mpi_test/blob/master/one_sided_gpu.cpp
 [4]: https://github.com/cwpearson/mpi_test/blob/master/persistent.cpp
-[5]: https://github.com/cwpearson/mpi_test/blob/master/send_recv.cpp
+[5]: https://github.com/cwpearson/mpi_test/blob/master/persistent_gpu.cpp
+[6]: https://github.com/cwpearson/mpi_test/blob/master/send_recv.cpp
+[7]: https://github.com/cwpearson/mpi_test/blob/master/send_recv_gpu.cpp
 
 ## Build
 
 ```
@@ -54,8 +58,8 @@ Execute any binary you want using `mpirun`, or whatever is appropriate for your
 
 * 1 node: `jsrun -n 1 ./persistent`
 * 2 nodes: `jsrun -n 2 -r 1 -a 1 ./persistent`
-
+* 2 nodes w/GPU: `jsrun --smpi="-gpu" -n 2 -r 1 -g 1 ./send-recv-gpu`
 
 ## Notes on specific platforms
 
-Some Open MPIs use `long long` for their datatypes, which means we can't support ANSI C++ (`-ansi`).
+Some Open MPIs use `long long` for their datatypes, which is not a part of ANSI C++ (`-ansi`).
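The two new `*-gpu` benchmarks hand `cudaMalloc`-ed pointers directly to MPI calls, so they require a CUDA-aware MPI library (which is why the jsrun example above passes `--smpi="-gpu"`). As a minimal sketch, not part of this change and assuming Open MPI's `mpi-ext.h` extension (other MPI implementations may not provide it), CUDA-awareness can be checked at run time before launching the GPU binaries:

```cpp
// Sketch only: query CUDA-aware support, assuming Open MPI's mpi-ext.h extension.
#include <cstdio>
#include <mpi.h>
#if __has_include(<mpi-ext.h>)
#include <mpi-ext.h>
#endif

int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);
#if defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
  // library was built with CUDA support; also check the run-time answer
  printf("run-time CUDA-aware support: %d\n", MPIX_Query_cuda_support());
#else
  printf("cannot determine CUDA-aware support from this MPI\n");
#endif
  MPI_Finalize();
  return 0;
}
```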
diff --git a/persistent_gpu.cpp b/persistent_gpu.cpp
new file mode 100644
index 0000000..084b22e
--- /dev/null
+++ b/persistent_gpu.cpp
@@ -0,0 +1,161 @@
+#include <cstdio>
+#include <cstdlib>
+#include <vector>
+
+#include <cuda_runtime.h>
+#include <mpi.h>
+
+inline void checkCuda(cudaError_t result, const char *file, const int line) {
+  if (result != cudaSuccess) {
+    fprintf(stderr, "%s:%d: CUDA Runtime Error %d: %s\n", file, line, int(result), cudaGetErrorString(result));
+    exit(-1);
+  }
+}
+#define CUDA_RUNTIME(stmt) checkCuda(stmt, __FILE__, __LINE__);
+
+const float sample_target = 200e-6;
+
+struct Sample {
+  double raw;
+  double norm;
+};
+
+static Sample get_sample(int perSample, MPI_Request *sreq, MPI_Request *rreq, int rank, MPI_Comm comm) {
+  Sample sample;
+  MPI_Barrier(comm);
+  double start = MPI_Wtime();
+  for (int i = 0; i < perSample; ++i) {
+    if (0 == rank) {
+      MPI_Start(sreq);
+      MPI_Wait(sreq, MPI_STATUS_IGNORE);
+      MPI_Start(rreq);
+      MPI_Wait(rreq, MPI_STATUS_IGNORE);
+    } else if (1 == rank) {
+      MPI_Start(rreq);
+      MPI_Wait(rreq, MPI_STATUS_IGNORE);
+      MPI_Start(sreq);
+      MPI_Wait(sreq, MPI_STATUS_IGNORE);
+    }
+  }
+  double stop = MPI_Wtime();
+  sample.raw = stop - start;
+  sample.norm = sample.raw / perSample;
+  return sample;
+}
+
+int main(int argc, char **argv) {
+  // Initialize the MPI environment
+  MPI_Init(&argc, &argv);
+
+  // Get the number of processes
+  int size, rank;
+  MPI_Comm_size(MPI_COMM_WORLD, &size);
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+  if (size < 2) {
+    printf("need at least 2 ranks!\n");
+    exit(1);
+  }
+
+  int src = (rank + 1) % 2;
+  int dst = (rank + 1) % 2;
+  int tag = 0;
+  MPI_Request sreq, rreq;
+  int numIters = 100;
+
+
+  std::vector<size_t> sweep{
+      1,
+      64,
+      128,
+      256,
+      512,
+      1 * 1024,
+      2 * 1024,
+      4 * 1024,
+      8 * 1024,
+      16 * 1024,
+      32 * 1024,
+      64 * 1024,
+      128 * 1024,
+      256 * 1024,
+      512 * 1024,
+      1 * 1024 * 1024,
+      2 * 1024 * 1024,
+      4 * 1024 * 1024,
+      8 * 1024 * 1024,
+      16 * 1024 * 1024,
+      32 * 1024 * 1024,
+      64 * 1024 * 1024,
+      128 * 1024 * 1024,
+      256 * 1024 * 1024,
+  };
+
+  if (0 == rank) {
+    printf("bytes,min,max,avg,med\n");
+  }
+
+  for (size_t bytes : sweep) {
+    std::vector<double> samples(numIters);
+    char *buf;
+    CUDA_RUNTIME(cudaMalloc(&buf, bytes));
+
+    MPI_Send_init(buf, bytes, MPI_BYTE, dst, tag, MPI_COMM_WORLD, &sreq);
+    MPI_Recv_init(buf, bytes, MPI_BYTE, src, tag, MPI_COMM_WORLD, &rreq);
+
+    // try to reach 200us / sample
+    int perSample = 1;
+    for (int i = 0; i < 10; ++i) {
+      double sample = get_sample(perSample, &sreq, &rreq, rank, MPI_COMM_WORLD).raw;
+      // estimate number of measurements per sample
+      int guess = sample_target / sample + /*rounding*/0.5;
+      // close half the distance to this estimate
+      perSample += (guess - perSample) * 0.5;
+      if (perSample < 1) perSample = 1;
+      MPI_Bcast(&perSample, 1, MPI_INT, 0, MPI_COMM_WORLD);
+    }
+
+    if (0 == rank) {
+      fprintf(stderr, "sample averaged over %d iterations\n", perSample);
+    }
+
+    for (int i = 0; i < numIters; ++i) {
+      samples[i] = get_sample(perSample, &sreq, &rreq, rank, MPI_COMM_WORLD).norm;
+    }
+
+    // each sample is the max time observed
+    MPI_Allreduce(MPI_IN_PLACE, samples.data(), numIters, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+
+    // bubble sort
+    bool changed = true;
+    while (changed) {
+      changed = false;
+      for (int i = 0; i < numIters - 1; ++i) {
+        if (samples[i] > samples[i+1]) {
+          double tmp = samples[i+1];
+          samples[i+1] = samples[i];
+          samples[i] = tmp;
+          changed = true;
+        }
+      }
+    }
+
+    // average
+    double avg = 0;
+    for (int i = 0; i < numIters; ++i) {
+      avg += samples[i];
+    }
+    avg /= numIters;
+
+    if (0 == rank) {
+ printf("%lu,%e,%e,%e,%e\n", bytes, samples[0], samples[numIters-1], avg, samples[numIters/2]); + } + + MPI_Request_free(&sreq); + MPI_Request_free(&rreq); + CUDA_RUNTIME(cudaFree(buf)); + } + + MPI_Finalize(); + return 0; +} diff --git a/send_recv_gpu.cpp b/send_recv_gpu.cpp new file mode 100644 index 0000000..8fdef03 --- /dev/null +++ b/send_recv_gpu.cpp @@ -0,0 +1,149 @@ +#include +#include +#include + +#include +#include + +inline void checkCuda(cudaError_t result, const char *file, const int line) { + if (result != cudaSuccess) { + fprintf(stderr, "%s:%d: CUDA Runtime Error %d: %s\n", file, line, int(result), cudaGetErrorString(result)); + exit(-1); + } +} +#define CUDA_RUNTIME(stmt) checkCuda(stmt, __FILE__, __LINE__); + +const float sample_target = 200e-6; + +struct Sample { + double raw; + double norm; +}; + +static Sample get_sample(int perSample, void *buf, int bytes, int rank, int other, MPI_Comm comm) { + Sample sample; + int tag = 0; + MPI_Barrier(comm); + double start = MPI_Wtime(); + for (int i = 0; i < perSample; ++i) { + if (0 == rank) { + MPI_Send(buf, bytes, MPI_BYTE, other, tag, comm); + MPI_Recv(buf, bytes, MPI_BYTE, other, tag, comm, MPI_STATUS_IGNORE); + } else if (1 == rank) { + MPI_Recv(buf, bytes, MPI_BYTE, other, tag, comm, MPI_STATUS_IGNORE); + MPI_Send(buf, bytes, MPI_BYTE, other, tag, comm); + } + } + double stop = MPI_Wtime(); + sample.raw = stop-start; + sample.norm = sample.raw / perSample; + return sample; +} + +int main(int argc, char **argv) { + // Initialize the MPI environment + MPI_Init(&argc, &argv); + + // Get the number of processes + int size, rank; + MPI_Comm_size(MPI_COMM_WORLD, &size); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + if (size < 2) { + printf("need at least 2 ranks!\n"); + exit(1); + } + + int other = (rank + 1) % 2; + int numIters = 100; + + std::vector sweep{ + 1, + 64, + 128, + 256, + 512, + 1 * 1024, + 2 * 1024, + 4 * 1024, + 8 * 1024, + 16 * 1024, + 32 * 1024, + 64 * 1024, + 128 * 1024, + 256 * 1024, + 512 * 1024, + 1 * 1024 * 1024, + 2 * 1024 * 1024, + 4 * 1024 * 1024, + 8 * 1024 * 1024, + 16 * 1024 * 1024, + 32 * 1024 * 1024, + 64 * 1024 * 1024, + 128 * 1024 * 1024, + 256 * 1024 * 1024, + }; + + if (0 == rank) { + printf("bytes,min,max,avg,med\n"); + } + + for (size_t bytes : sweep) { + std::vector samples(numIters); + char *buf; + CUDA_RUNTIME(cudaMalloc(&buf, bytes)); + + // try to reach 200us / sample + int perSample = 1; + for (int i = 0; i < 10; ++i) { + double sample = get_sample(perSample, buf, bytes, rank, other, MPI_COMM_WORLD).raw; + // estimate number of measurements per sample + int guess = sample_target / sample + /*rounding*/0.5; + // close half the distance to this estimate + perSample += (guess - perSample) * 0.5; + if (perSample < 1) perSample = 1; + MPI_Bcast(&perSample, 1, MPI_INT, 0, MPI_COMM_WORLD); + } + + if (0 == rank) { + fprintf(stderr, "sample averaged over %d iterations\n", perSample); + } + + for (int i = 0; i < numIters; ++i) { + samples[i] = get_sample(perSample, buf, bytes, rank, other, MPI_COMM_WORLD).norm; + } + + // each sample is the max time observed + MPI_Allreduce(MPI_IN_PLACE, samples.data(), numIters, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); + + // bubble sort + bool changed = true; + while (changed) { + changed = false; + for (int i = 0; i < numIters - 1; ++i) { + if (samples[i] > samples[i+1]) { + double tmp = samples[i+1]; + samples[i+1] = samples[i]; + samples[i] = tmp; + changed = true; + } + } + } + + // average + double avg = 0; + for (int i = 0; i < numIters; ++i) { + avg += 
+    }
+    avg /= numIters;
+
+    if (0 == rank) {
+      printf("%lu,%e,%e,%e,%e\n", bytes, samples[0], samples[numIters-1], avg, samples[numIters/2]);
+    }
+
+    CUDA_RUNTIME(cudaFree(buf));
+  }
+
+  MPI_Finalize();
+  return 0;
+}
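Both new files compute the min/max/avg/med columns by bubble-sorting the per-iteration `samples`. A minimal sketch of the same statistics using the standard library (an optional simplification, not what the diff does) follows; the `Stats`/`summarize` names are hypothetical:

```cpp
// Sketch only: summarize a vector of per-iteration times as min/max/avg/median.
#include <algorithm>
#include <numeric>
#include <vector>

struct Stats {
  double min, max, avg, med;
};

static Stats summarize(std::vector<double> samples) {
  std::sort(samples.begin(), samples.end());  // ascending, like the bubble sort
  Stats s;
  s.min = samples.front();
  s.max = samples.back();
  s.avg = std::accumulate(samples.begin(), samples.end(), 0.0) / samples.size();
  s.med = samples[samples.size() / 2];  // same index the benchmarks print
  return s;
}
```

`std::sort` is O(n log n) versus the bubble sort's O(n²), though with 100 samples per message size either cost is negligible next to the communication time being measured.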