Compare commits: b0d2bd24f4...master (10 commits)

286f5c0190
16de9c455b
d29de778d2
5d35a3d1fa
f5a415ef4d
0c90c5c45f
5000db0508
06192614b0
fec7a16267
3350ad69e1
CMakeLists.txt

@@ -23,14 +23,18 @@ else()
 endif()
 
 if (MPI_FOUND)
-message(STATUS "MPI_CXX_COMPILER: ${MPI_CXX_COMPILER}")
-message(STATUS "MPI_CXX_INCLUDE_DIRS: ${MPI_CXX_INCLUDE_DIRS}")
-message(STATUS "MPIEXEC_EXECUTABLE: ${MPIEXEC_EXECUTABLE}")
-message(STATUS "MPIEXEC_NUMPROC_FLAG: ${MPIEXEC_NUMPROC_FLAG}")
-message(STATUS "MPIEXEC_MAX_NUMPROCS: ${MPIEXEC_MAX_NUMPROCS}")
-message(STATUS "MPIEXEC_PREFLAGS: ${MPIEXEC_PREFLAGS}")
-message(STATUS "MPIEXEC_POSTFLAGS: ${MPIEXEC_POSTFLAGS}")
+message(STATUS "MPI_VERSION: ${MPI_VERSION}")
+message(STATUS "MPI_CXX_COMPILER: ${MPI_CXX_COMPILER}")
+message(STATUS "MPI_CXX_COMPILE_OPTIONS: ${MPI_CXX_COMPILE_OPTIONS}")
+message(STATUS "MPI_CXX_COMPILE_DEFINITIONS: ${MPI_CXX_COMPILE_DEFINITIONS}")
+message(STATUS "MPI_CXX_INCLUDE_DIRS: ${MPI_CXX_INCLUDE_DIRS}")
+message(STATUS "MPI_CXX_LIBRARIES: ${MPI_CXX_LIBRARIES}")
+message(STATUS "MPI_CXX_LINK_FLAGS: ${MPI_CXX_LINK_FLAGS}")
+message(STATUS "MPIEXEC_EXECUTABLE: ${MPIEXEC_EXECUTABLE}")
+message(STATUS "MPIEXEC_NUMPROC_FLAG: ${MPIEXEC_NUMPROC_FLAG}")
+message(STATUS "MPIEXEC_MAX_NUMPROCS: ${MPIEXEC_MAX_NUMPROCS}")
+message(STATUS "MPIEXEC_PREFLAGS: ${MPIEXEC_PREFLAGS}")
+message(STATUS "MPIEXEC_POSTFLAGS: ${MPIEXEC_POSTFLAGS}")
 endif()
 
 function(set_cxx_options target)
@@ -113,10 +117,33 @@ if (MPI_FOUND)
 set_cxx_standard(persistent)
 endif()
 
+if (MPI_FOUND)
+add_executable(send-recv send_recv.cpp)
+target_link_libraries(send-recv MPI::MPI_CXX)
+set_cxx_options(send-recv)
+set_cxx_standard(send-recv)
+endif()
+
 if (MPI_FOUND AND CMAKE_CUDA_COMPILER)
 add_executable(one-sided-gpu one_sided_gpu.cpp)
 target_link_libraries(one-sided-gpu MPI::MPI_CXX)
 target_link_libraries(one-sided-gpu CUDA::cudart)
 set_cxx_options(one-sided-gpu)
 set_cxx_standard(one-sided-gpu)
 endif()
+
+if (MPI_FOUND AND CMAKE_CUDA_COMPILER)
+add_executable(persistent-gpu persistent_gpu.cpp)
+target_link_libraries(persistent-gpu MPI::MPI_CXX)
+target_link_libraries(persistent-gpu CUDA::cudart)
+set_cxx_options(persistent-gpu)
+set_cxx_standard(persistent-gpu)
+endif()
+
+if (MPI_FOUND AND CMAKE_CUDA_COMPILER)
+add_executable(send-recv-gpu send_recv_gpu.cpp)
+target_link_libraries(send-recv-gpu MPI::MPI_CXX)
+target_link_libraries(send-recv-gpu CUDA::cudart)
+set_cxx_options(send-recv-gpu)
+set_cxx_standard(send-recv-gpu)
+endif()
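The new `MPI_VERSION` line is reported at configure time by CMake's FindMPI module. For a run-time cross-check, the standard `MPI_Get_version`/`MPI_Get_library_version` calls return the same information from the library itself; a minimal sketch (an illustration, not part of this change set):

```
#include <cstdio>
#include <mpi.h>

// Illustration only: query the MPI standard version and the library's own
// version string at run time, to cross-check what FindMPI printed.
int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);
  int version, subversion;
  MPI_Get_version(&version, &subversion); // e.g. 3 and 1 for an MPI 3.1 library
  char lib[MPI_MAX_LIBRARY_VERSION_STRING];
  int len;
  MPI_Get_library_version(lib, &len); // implementation-specific description
  int rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  if (0 == rank) {
    printf("MPI standard %d.%d\n%s\n", version, subversion, lib);
  }
  MPI_Finalize();
  return 0;
}
```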
README.md (46 changed lines)
@@ -4,12 +4,23 @@ Various standalone C++ MPI tests/examples/benchmarks.
 
 If CUDA is detected, additional binaries can be built.
 
 | name | Kind | Reqs. | Ranks | Description |
-|---------------|-----------|----------|-------|-------------|
-|`hello-world` | Test | MPI | 1+ | an MPI hello-world |
-|`one-sided` | Test | MPI | 2 | one-sided communication |
-|`one-sided-gpu`| Test | MPI+CUDA | 2 | one-sided communication on GPU buffer |
-|`persistent` | Benchmark | MPI | 2 | ping-pong time for persistent Send/Recv |
+|--------------------|-----------|----------|-------|-------------|
+|[hello-world][1] | Test | MPI | 1+ | an MPI hello-world |
+|[one-sided][2] | Test | MPI | 2 | one-sided communication |
+|[one-sided-gpu][3] | Test | MPI+CUDA | 2 | one-sided communication on GPU buffer |
+|[persistent][4] | Benchmark | MPI | 2 | ping-pong time for persistent Send/Recv |
+|[persistent-gpu][5] | Benchmark | MPI+CUDA | 2 | ping-pong time for persistent Send/Recv on GPU buffer |
+|[send-recv][6] | Benchmark | MPI | 2 | ping-pong time for Send/Recv |
+|[send-recv-gpu][7] | Benchmark | MPI+CUDA | 2 | ping-pong time for Send/Recv on GPU buffer |
+
+[1]: https://github.com/cwpearson/mpi_test/blob/master/hello_world.cpp
+[2]: https://github.com/cwpearson/mpi_test/blob/master/one_sided.cpp
+[3]: https://github.com/cwpearson/mpi_test/blob/master/one_sided_gpu.cpp
+[4]: https://github.com/cwpearson/mpi_test/blob/master/persistent.cpp
+[5]: https://github.com/cwpearson/mpi_test/blob/master/persistent_gpu.cpp
+[6]: https://github.com/cwpearson/mpi_test/blob/master/send_recv.cpp
+[7]: https://github.com/cwpearson/mpi_test/blob/master/send_recv_gpu.cpp
 
 ## Build
 ```
@@ -18,6 +29,26 @@ cmake ..
 make
 ```
 
+CMake will print the detected MPI environment.
+Confirm it is what you expect.
+For example:
+```
+-- MPI_VERSION:
+-- MPI_CXX_COMPILER: [...]/bin/mpicxx
+-- MPI_CXX_COMPILE_OPTIONS: -pthread
+-- MPI_CXX_COMPILE_DEFINITIONS:
+-- MPI_CXX_INCLUDE_DIRS: [...]/include
+-- MPI_CXX_LIBRARIES: [...]/lib/libmpiprofilesupport.so;[...]/lib/libmpi_ibm.so
+-- MPI_CXX_LINK_FLAGS: -pthread
+```
+
+## Examples
+
+### Summit
+
+* 1 node: `jsrun -n 1 ./persistent`
+* 2 nodes: `jsrun -n 2 -r 1 -a 1 ./persistent`
+* 2 nodes w/GPU: `jsrun --smpi="-gpu" -n 2 -r 1 -g 1 ./send-recv-gpu`
+
 ## Run all tests
 
 `run-all.sh` attempts to discover certain environments automatically.
@@ -33,6 +64,7 @@ If any tests fails, you can re-run them individually.
 
 Execute any binary you want using `mpirun`, or whatever is appropriate for your platform.
 
+
 ## Notes on specific platforms
 
-Some Open MPIs use `long long` for their datatypes, which means we can't support ANSI C++ (`-ansi`).
+Some Open MPIs use `long long` for their datatypes, which is not a part of ANSI C++ (`-ansi`).
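On that last point, `long long` only entered the standard with C++11, so a strict C++98/ANSI build rejects it; a hypothetical one-file illustration (not from this repo):

```
// long_long_ansi.cpp -- hypothetical illustration, not part of this repo.
// Built as strict C++98, e.g. `g++ -ansi -pedantic-errors long_long_ansi.cpp`,
// the declaration below is rejected with a diagnostic along the lines of
// "ISO C++ 1998 does not support 'long long'". MPI headers that define
// datatypes in terms of `long long` hit the same wall under -ansi.
int main() {
  long long big = 1LL << 40; // ill-formed in strict C++98
  return static_cast<int>(big & 1);
}
```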
@@ -149,7 +149,12 @@ int main(int argc, char **argv) {
|
|||||||
if (munlock(buf, bytes)) {
|
if (munlock(buf, bytes)) {
|
||||||
perror("error unlocking memory");
|
perror("error unlocking memory");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
MPI_Request_free(&sreq);
|
||||||
|
MPI_Request_free(&rreq);
|
||||||
delete[] buf;
|
delete[] buf;
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
MPI_Finalize();
|
MPI_Finalize();
|
||||||
|
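The two `MPI_Request_free` calls close out the persistent-request lifecycle: requests created by `MPI_Send_init`/`MPI_Recv_init` are not released by `MPI_Wait` and would otherwise leak each time a buffer is torn down. A minimal sketch of that lifecycle, with a hypothetical fixed message size rather than the benchmark's size sweep:

```
#include <mpi.h>
#include <vector>

int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);
  int size, rank;
  MPI_Comm_size(MPI_COMM_WORLD, &size);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  if (size < 2) { MPI_Finalize(); return 1; } // needs two ranks
  int other = (rank + 1) % 2; // partner rank, as in the benchmarks

  std::vector<char> buf(1024); // hypothetical message size
  MPI_Request sreq, rreq;

  // Create the persistent requests once...
  MPI_Send_init(buf.data(), buf.size(), MPI_BYTE, other, 0, MPI_COMM_WORLD, &sreq);
  MPI_Recv_init(buf.data(), buf.size(), MPI_BYTE, other, 0, MPI_COMM_WORLD, &rreq);

  // ...restart them as often as needed...
  for (int i = 0; i < 10; ++i) {
    if (0 == rank) {
      MPI_Start(&sreq); MPI_Wait(&sreq, MPI_STATUS_IGNORE);
      MPI_Start(&rreq); MPI_Wait(&rreq, MPI_STATUS_IGNORE);
    } else if (1 == rank) {
      MPI_Start(&rreq); MPI_Wait(&rreq, MPI_STATUS_IGNORE);
      MPI_Start(&sreq); MPI_Wait(&sreq, MPI_STATUS_IGNORE);
    }
  }

  // ...and free them exactly once, before the buffer goes away.
  MPI_Request_free(&sreq);
  MPI_Request_free(&rreq);

  MPI_Finalize();
  return 0;
}
```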
persistent_gpu.cpp (new file, 161 lines)
@@ -0,0 +1,161 @@
#include <cstdio>
#include <vector>
#include <cstdlib>

#include <mpi.h>
#include <cuda_runtime.h>

inline void checkCuda(cudaError_t result, const char *file, const int line) {
  if (result != cudaSuccess) {
    fprintf(stderr, "%s:%d: CUDA Runtime Error %d: %s\n", file, line, int(result), cudaGetErrorString(result));
    exit(-1);
  }
}
#define CUDA_RUNTIME(stmt) checkCuda(stmt, __FILE__, __LINE__);

const float sample_target = 200e-6;

struct Sample {
  double raw;
  double norm;
};

static Sample get_sample(int perSample, MPI_Request *sreq, MPI_Request *rreq, int rank, MPI_Comm comm) {
  Sample sample;
  MPI_Barrier(comm);
  double start = MPI_Wtime();
  for (int i = 0; i < perSample; ++i) {
    if (0 == rank) {
      MPI_Start(sreq);
      MPI_Wait(sreq, MPI_STATUS_IGNORE);
      MPI_Start(rreq);
      MPI_Wait(rreq, MPI_STATUS_IGNORE);
    } else if (1 == rank) {
      MPI_Start(rreq);
      MPI_Wait(rreq, MPI_STATUS_IGNORE);
      MPI_Start(sreq);
      MPI_Wait(sreq, MPI_STATUS_IGNORE);
    }
  }
  double stop = MPI_Wtime();
  sample.raw = stop - start;
  sample.norm = sample.raw / perSample;
  return sample;
}

int main(int argc, char **argv) {
  // Initialize the MPI environment
  MPI_Init(&argc, &argv);

  // Get the number of processes
  int size, rank;
  MPI_Comm_size(MPI_COMM_WORLD, &size);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  if (size < 2) {
    printf("need at least 2 ranks!\n");
    exit(1);
  }

  int src = (rank + 1) % 2;
  int dst = (rank + 1) % 2;
  int tag = 0;
  MPI_Request sreq, rreq;
  int numIters = 100;

  std::vector<size_t> sweep{
      1, 64, 128, 256, 512,
      1 * 1024, 2 * 1024, 4 * 1024, 8 * 1024, 16 * 1024, 32 * 1024,
      64 * 1024, 128 * 1024, 256 * 1024, 512 * 1024,
      1 * 1024 * 1024, 2 * 1024 * 1024, 4 * 1024 * 1024, 8 * 1024 * 1024,
      16 * 1024 * 1024, 32 * 1024 * 1024, 64 * 1024 * 1024,
      128 * 1024 * 1024, 256 * 1024 * 1024,
  };

  if (0 == rank) {
    printf("bytes,min,max,avg,med\n");
  }

  for (size_t bytes : sweep) {
    std::vector<double> samples(numIters);
    char *buf;
    CUDA_RUNTIME(cudaMalloc(&buf, bytes));

    MPI_Send_init(buf, bytes, MPI_BYTE, dst, tag, MPI_COMM_WORLD, &sreq);
    MPI_Recv_init(buf, bytes, MPI_BYTE, src, tag, MPI_COMM_WORLD, &rreq);

    // try to reach 200us / sample
    int perSample = 1;
    for (int i = 0; i < 10; ++i) {
      double sample = get_sample(perSample, &sreq, &rreq, rank, MPI_COMM_WORLD).raw;
      // estimate number of measurements per sample
      int guess = sample_target / sample + /*rounding*/ 0.5;
      // close half the distance to this estimate
      perSample += (guess - perSample) * 0.5;
      if (perSample < 1) perSample = 1;
      MPI_Bcast(&perSample, 1, MPI_INT, 0, MPI_COMM_WORLD);
    }

    if (0 == rank) {
      fprintf(stderr, "sample averaged over %d iterations\n", perSample);
    }

    for (int i = 0; i < numIters; ++i) {
      samples[i] = get_sample(perSample, &sreq, &rreq, rank, MPI_COMM_WORLD).norm;
    }

    // each sample is the max time observed
    MPI_Allreduce(MPI_IN_PLACE, samples.data(), numIters, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);

    // bubble sort
    bool changed = true;
    while (changed) {
      changed = false;
      for (int i = 0; i < numIters - 1; ++i) {
        if (samples[i] > samples[i + 1]) {
          double tmp = samples[i + 1];
          samples[i + 1] = samples[i];
          samples[i] = tmp;
          changed = true;
        }
      }
    }

    // average
    double avg = 0;
    for (int i = 0; i < numIters; ++i) {
      avg += samples[i];
    }
    avg /= numIters;

    if (0 == rank) {
      printf("%lu,%e,%e,%e,%e\n", bytes, samples[0], samples[numIters - 1], avg, samples[numIters / 2]);
    }

    MPI_Request_free(&sreq);
    MPI_Request_free(&rreq);
    CUDA_RUNTIME(cudaFree(buf));
  }

  MPI_Finalize();
  return 0;
}
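The `// try to reach 200us / sample` loop above is a damped feedback calibration: take one measurement, estimate how many ping-pong iterations would fill the 200 µs sample target, move `perSample` only half the distance toward that estimate so a single noisy measurement cannot swing it wildly, and broadcast rank 0's value so both ranks stay in lockstep. The same idea in isolation, against a synthetic noise-free cost model (the 5 µs per-iteration cost is hypothetical, not a measurement from this benchmark; here the estimate divides the target by the per-iteration time):

```
#include <cstdio>

// Standalone sketch of the damped calibration loop, with a synthetic timer
// in place of MPI. The 5 us per-iteration cost is hypothetical.
int main() {
  const double sample_target = 200e-6; // 200 us per sample, as in the benchmark
  const double cost_per_iter = 5e-6;   // hypothetical cost of one ping-pong

  int perSample = 1;
  for (int i = 0; i < 10; ++i) {
    double raw = perSample * cost_per_iter; // synthetic "measurement"
    double norm = raw / perSample;          // per-iteration time
    int guess = sample_target / norm + 0.5; // iterations that would fill the target
    perSample += (guess - perSample) * 0.5; // close half the distance
    if (perSample < 1) perSample = 1;
    printf("pass %d: perSample = %d\n", i, perSample);
  }
  // perSample settles near sample_target / cost_per_iter = 40 iterations.
  return 0;
}
```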
send_recv.cpp (new file, 147 lines)
@@ -0,0 +1,147 @@
#include <cstdio>
#include <vector>
#include <cstdlib>

#include <mpi.h>
#include <sys/mman.h>
#include <errno.h>

const float sample_target = 200e-6;

struct Sample {
  double raw;
  double norm;
};

static Sample get_sample(int perSample, void *buf, int bytes, int rank, int other, MPI_Comm comm) {
  Sample sample;
  int tag = 0;
  MPI_Barrier(comm);
  double start = MPI_Wtime();
  for (int i = 0; i < perSample; ++i) {
    if (0 == rank) {
      MPI_Send(buf, bytes, MPI_BYTE, other, tag, comm);
      MPI_Recv(buf, bytes, MPI_BYTE, other, tag, comm, MPI_STATUS_IGNORE);
    } else if (1 == rank) {
      MPI_Recv(buf, bytes, MPI_BYTE, other, tag, comm, MPI_STATUS_IGNORE);
      MPI_Send(buf, bytes, MPI_BYTE, other, tag, comm);
    }
  }
  double stop = MPI_Wtime();
  sample.raw = stop - start;
  sample.norm = sample.raw / perSample;
  return sample;
}

int main(int argc, char **argv) {
  // Initialize the MPI environment
  MPI_Init(&argc, &argv);

  // Get the number of processes
  int size, rank;
  MPI_Comm_size(MPI_COMM_WORLD, &size);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  if (size < 2) {
    printf("need at least 2 ranks!\n");
    exit(1);
  }

  int other = (rank + 1) % 2;
  int numIters = 100;

  std::vector<size_t> sweep{
      1, 64, 128, 256, 512,
      1 * 1024, 2 * 1024, 4 * 1024, 8 * 1024, 16 * 1024, 32 * 1024,
      64 * 1024, 128 * 1024, 256 * 1024, 512 * 1024,
      1 * 1024 * 1024, 2 * 1024 * 1024, 4 * 1024 * 1024, 8 * 1024 * 1024,
      16 * 1024 * 1024, 32 * 1024 * 1024, 64 * 1024 * 1024,
      128 * 1024 * 1024, 256 * 1024 * 1024,
  };

  if (0 == rank) {
    printf("bytes,min,max,avg,med\n");
  }

  for (size_t bytes : sweep) {
    std::vector<double> samples(numIters);
    char *buf = new char[bytes];
    if (mlock(buf, bytes)) {
      perror("error locking memory");
    }

    // try to reach 200us / sample
    int perSample = 1;
    for (int i = 0; i < 10; ++i) {
      double sample = get_sample(perSample, buf, bytes, rank, other, MPI_COMM_WORLD).raw;
      // estimate number of measurements per sample
      int guess = sample_target / sample + /*rounding*/ 0.5;
      // close half the distance to this estimate
      perSample += (guess - perSample) * 0.5;
      if (perSample < 1) perSample = 1;
      MPI_Bcast(&perSample, 1, MPI_INT, 0, MPI_COMM_WORLD);
    }

    if (0 == rank) {
      fprintf(stderr, "sample averaged over %d iterations\n", perSample);
    }

    for (int i = 0; i < numIters; ++i) {
      samples[i] = get_sample(perSample, buf, bytes, rank, other, MPI_COMM_WORLD).norm;
    }

    // each sample is the max time observed
    MPI_Allreduce(MPI_IN_PLACE, samples.data(), numIters, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);

    // bubble sort
    bool changed = true;
    while (changed) {
      changed = false;
      for (int i = 0; i < numIters - 1; ++i) {
        if (samples[i] > samples[i + 1]) {
          double tmp = samples[i + 1];
          samples[i + 1] = samples[i];
          samples[i] = tmp;
          changed = true;
        }
      }
    }

    // average
    double avg = 0;
    for (int i = 0; i < numIters; ++i) {
      avg += samples[i];
    }
    avg /= numIters;

    if (0 == rank) {
      printf("%lu,%e,%e,%e,%e\n", bytes, samples[0], samples[numIters - 1], avg, samples[numIters / 2]);
    }

    if (munlock(buf, bytes)) {
      perror("error unlocking memory");
    }
    delete[] buf;
  }

  MPI_Finalize();
  return 0;
}
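send_recv.cpp `mlock`s each buffer so page faults do not pollute the timing, and only warns via `perror` when locking fails. `mlock` fails with `ENOMEM` once the buffer exceeds the process's locked-memory limit, which is often small on Linux (e.g. 64 KiB), so the warning is expected for the larger sweep sizes unless `ulimit -l` is raised. A sketch of inspecting that limit up front (an illustration, not something the benchmark does):

```
#include <cstdio>
#include <sys/resource.h>

// Illustration (not in the benchmark): print RLIMIT_MEMLOCK, the limit that
// decides whether mlock() on a large benchmark buffer can succeed.
int main() {
  struct rlimit rl;
  if (getrlimit(RLIMIT_MEMLOCK, &rl) != 0) {
    perror("getrlimit");
    return 1;
  }
  if (rl.rlim_cur == RLIM_INFINITY) {
    printf("locked memory: unlimited\n");
  } else {
    printf("locked memory limit: %llu bytes\n", (unsigned long long)rl.rlim_cur);
  }
  return 0;
}
```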
send_recv_gpu.cpp (new file, 149 lines)
@@ -0,0 +1,149 @@
#include <cstdio>
#include <vector>
#include <cstdlib>

#include <mpi.h>
#include <cuda_runtime.h>

inline void checkCuda(cudaError_t result, const char *file, const int line) {
  if (result != cudaSuccess) {
    fprintf(stderr, "%s:%d: CUDA Runtime Error %d: %s\n", file, line, int(result), cudaGetErrorString(result));
    exit(-1);
  }
}
#define CUDA_RUNTIME(stmt) checkCuda(stmt, __FILE__, __LINE__);

const float sample_target = 200e-6;

struct Sample {
  double raw;
  double norm;
};

static Sample get_sample(int perSample, void *buf, int bytes, int rank, int other, MPI_Comm comm) {
  Sample sample;
  int tag = 0;
  MPI_Barrier(comm);
  double start = MPI_Wtime();
  for (int i = 0; i < perSample; ++i) {
    if (0 == rank) {
      MPI_Send(buf, bytes, MPI_BYTE, other, tag, comm);
      MPI_Recv(buf, bytes, MPI_BYTE, other, tag, comm, MPI_STATUS_IGNORE);
    } else if (1 == rank) {
      MPI_Recv(buf, bytes, MPI_BYTE, other, tag, comm, MPI_STATUS_IGNORE);
      MPI_Send(buf, bytes, MPI_BYTE, other, tag, comm);
    }
  }
  double stop = MPI_Wtime();
  sample.raw = stop - start;
  sample.norm = sample.raw / perSample;
  return sample;
}

int main(int argc, char **argv) {
  // Initialize the MPI environment
  MPI_Init(&argc, &argv);

  // Get the number of processes
  int size, rank;
  MPI_Comm_size(MPI_COMM_WORLD, &size);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  if (size < 2) {
    printf("need at least 2 ranks!\n");
    exit(1);
  }

  int other = (rank + 1) % 2;
  int numIters = 100;

  std::vector<size_t> sweep{
      1, 64, 128, 256, 512,
      1 * 1024, 2 * 1024, 4 * 1024, 8 * 1024, 16 * 1024, 32 * 1024,
      64 * 1024, 128 * 1024, 256 * 1024, 512 * 1024,
      1 * 1024 * 1024, 2 * 1024 * 1024, 4 * 1024 * 1024, 8 * 1024 * 1024,
      16 * 1024 * 1024, 32 * 1024 * 1024, 64 * 1024 * 1024,
      128 * 1024 * 1024, 256 * 1024 * 1024,
  };

  if (0 == rank) {
    printf("bytes,min,max,avg,med\n");
  }

  for (size_t bytes : sweep) {
    std::vector<double> samples(numIters);
    char *buf;
    CUDA_RUNTIME(cudaMalloc(&buf, bytes));

    // try to reach 200us / sample
    int perSample = 1;
    for (int i = 0; i < 10; ++i) {
      double sample = get_sample(perSample, buf, bytes, rank, other, MPI_COMM_WORLD).raw;
      // estimate number of measurements per sample
      int guess = sample_target / sample + /*rounding*/ 0.5;
      // close half the distance to this estimate
      perSample += (guess - perSample) * 0.5;
      if (perSample < 1) perSample = 1;
      MPI_Bcast(&perSample, 1, MPI_INT, 0, MPI_COMM_WORLD);
    }

    if (0 == rank) {
      fprintf(stderr, "sample averaged over %d iterations\n", perSample);
    }

    for (int i = 0; i < numIters; ++i) {
      samples[i] = get_sample(perSample, buf, bytes, rank, other, MPI_COMM_WORLD).norm;
    }

    // each sample is the max time observed
    MPI_Allreduce(MPI_IN_PLACE, samples.data(), numIters, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);

    // bubble sort
    bool changed = true;
    while (changed) {
      changed = false;
      for (int i = 0; i < numIters - 1; ++i) {
        if (samples[i] > samples[i + 1]) {
          double tmp = samples[i + 1];
          samples[i + 1] = samples[i];
          samples[i] = tmp;
          changed = true;
        }
      }
    }

    // average
    double avg = 0;
    for (int i = 0; i < numIters; ++i) {
      avg += samples[i];
    }
    avg /= numIters;

    if (0 == rank) {
      printf("%lu,%e,%e,%e,%e\n", bytes, samples[0], samples[numIters - 1], avg, samples[numIters / 2]);
    }

    CUDA_RUNTIME(cudaFree(buf));
  }

  MPI_Finalize();
  return 0;
}
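send_recv_gpu.cpp hands `cudaMalloc`ed device pointers directly to `MPI_Send`/`MPI_Recv`, which only works when the MPI library is CUDA-aware, hence the `--smpi="-gpu"` flag in the Summit example above. Open MPI ships a compile-time and run-time probe for this; a sketch (Open MPI-specific extension, not used in this repo):

```
#include <cstdio>
#include <mpi.h>
#if defined(OPEN_MPI)
#include <mpi-ext.h> // Open MPI extensions; defines MPIX_CUDA_AWARE_SUPPORT
#endif

// Sketch (Open MPI-specific, not part of this repo): report whether device
// pointers can be passed straight to MPI_Send/MPI_Recv.
int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);
#if defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
  printf("compile time: CUDA-aware; run time says: %d\n", MPIX_Query_cuda_support());
#else
  printf("no CUDA-aware support detected at compile time\n");
#endif
  MPI_Finalize();
  return 0;
}
```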
summit-env.sh (new file, 4 lines)
@@ -0,0 +1,4 @@
module reset

module load cuda
module load cmake